-title: //div[@class='meta']/h2/a\r
-author: //div[@class='meta']/h2/following-sibling::p/a/text()\r
-date://div[@class='meta']/h2/strong\r
-body: //div[@id='article']\r
+title: //div[@class='meta']/h2/a
+author: //div[@class='meta']/h2/following-sibling::p/a/text()
+date://div[@class='meta']/h2/strong
+body: //div[@id='article']
strip: //div[@class='domore']
test_url: http://24ways.org/2011/composing-the-new-canon
\ No newline at end of file
--- /dev/null
+title: //h1[contains(@class, 'entry-title')]
+date: //meta[@name='weibo: article:create_at']/@content
+body: //div[contains(@class, 'mainContent')]
+strip_id_or_class: related_topics
+
+prune: no
+
+test_url: http://www.36kr.com/p/207879.html
\ No newline at end of file
-title: //div[@class='post_header']//h2/a\r
-author: //span[@class='author']\r
-date: //span[@class='date']\r
-body: //div[@id='Content']\r
+title: //div[@class='post_header']//h2/a
+author: //span[@class='author']
+date: //span[@class='date']
+body: //div[@id='Content']
test_url: http://37signals.com/svn/posts/2785-the-end-of-the-it-department
\ No newline at end of file
-body: //div[@class='content']\r
-date: //div[@class='content']/h2\r
-strip: //div[@class='content']/h2\r
-title: //div[@class='content']/h3\r
-\r
-strip: //div[@id='postmenu']\r
-strip: //div[@class='trackback']\r
-tidy: no\r
+body: //div[@class='content']
+date: //div[@class='content']/h2
+strip: //div[@class='content']/h2
+title: //div[@class='content']/h3
+
+strip: //div[@id='postmenu']
+strip: //div[@class='trackback']
+tidy: no
test_url: http://www.3quarksdaily.com/3quarksdaily/2012/01/martin-luther-king-i-have-a-dream.html
\ No newline at end of file
-body: //*[@class = 'content']\r
-author: //*[@class = 'submitted']/a\r
+body: //*[@class = 'content']
+author: //*[@class = 'submitted']/a
date: substring-after(//*[@class = 'submitted']/text(), '|')
test_url: http://www.43folders.com/2011/04/22/cranking
\ No newline at end of file
-# very loose setup for both 500px.com/photo/* and 500px.com/blog/*\r
-# photo page example: http://500px.com/photo/4181666\r
-# blog page example: http://500px.com/blog/110\r
-\r
-# avoid "no text" error\r
-tidy:no\r
-prune:no\r
-\r
-# reorganize photo page elements\r
-#body://div[contains(@class,'container')]\r
-move_into(body)://div[contains(@id,'thephoto')]\r
-move_into(body)://div[contains(@id,'description')]\r
-move_into(body)://div[contains(@id,'tags')]\r
-move_into(body)://div[contains(@id,'photo-info')]\r
-\r
-# clean photo page info\r
-strip://span[contains(@id,'copyright')]\r
-strip://*[contains(@id,'store')]\r
-strip://*[contains(@id,'user-info')]\r
-strip://*[contains(@id,'photo-stats')]\r
-strip://*[contains(@id,'voting_controls_container')]\r
-strip://*[contains(@id,'more-photos')]\r
-strip://*[contains(@id,'embed-photo')]\r
-\r
-# clean blog page side bar\r
+# very loose setup for both 500px.com/photo/* and 500px.com/blog/*
+# photo page example: http://500px.com/photo/4181666
+# blog page example: http://500px.com/blog/110
+
+# avoid "no text" error
+tidy:no
+prune:no
+
+# reorganize photo page elements
+#body://div[contains(@class,'container')]
+move_into(body)://div[contains(@id,'thephoto')]
+move_into(body)://div[contains(@id,'description')]
+move_into(body)://div[contains(@id,'tags')]
+move_into(body)://div[contains(@id,'photo-info')]
+
+# clean photo page info
+strip://span[contains(@id,'copyright')]
+strip://*[contains(@id,'store')]
+strip://*[contains(@id,'user-info')]
+strip://*[contains(@id,'photo-stats')]
+strip://*[contains(@id,'voting_controls_container')]
+strip://*[contains(@id,'more-photos')]
+strip://*[contains(@id,'embed-photo')]
+
+# clean blog page side bar
strip://*[contains(@class,'col d3 clearafter')]
test_url: http://500px.com/photo/3641041?from=editors
\ No newline at end of file
-body: //*[@id="episode"]\r
-prune: no\r
-tidy: no\r
-\r
-autodetect_next_page: no\r
-strip_id_or_class: player\r
-\r
+body: //*[@id="episode"]
+prune: no
+tidy: no
+
+autodetect_next_page: no
+strip_id_or_class: player
+
strip://*[@id="header"]
test_url: http://5by5.tv/buildanalyze/60
\ No newline at end of file
--- /dev/null
+title: //*[@id='sstitle']
+body: //div[@id='sstory']
+strip_id_or_class: newsoptions
+prune: no
+
+test_url: http://www.7newsbelize.com/sstory.php?nid=25654
+test_url: http://www.7newsbelize.com/7news.xml
\ No newline at end of file
-title: //h2[@class='border']\r
-body: //div[@class='padding']\r
-\r
-convert_double_br_tags: yes\r
-\r
-strip: //div[@id='social_sharing']\r
-strip: //div[@class='socialLinks']\r
+title: //h2[@class='border']
+body: //div[@class='padding']
+
+convert_double_br_tags: yes
+
+strip: //div[@id='social_sharing']
+strip: //div[@class='socialLinks']
test_url: http://www.944.com/articles/mild-obsessions-frock-la-get-to-know-victoria-tik-s-haute-sustainable-fashion-line/
\ No newline at end of file
--- /dev/null
+Full-Text RSS site config files
+================
+
+[Full-Text RSS](http://fivefilters.org/content-only/), our article extraction tool, makes use of site-specific extraction rules to improve results. Each time a URL is processed, Full-Text RSS checks whether there are extraction rules (site patterns) for that site. If there are none, it tries to detect the content block automatically.
+
+This repository contains the site config files we use in Full-Text RSS.
+
+### Contributing changes
+
+We chose GitHub for this set of files because they offer one feature which we hope will make contributing changes easier: [file editing](https://github.com/blog/844-forking-with-the-edit-button) through the web interface.
+
+You can now make changes to any of our site config files and request that your changes be pulled into the main set we maintain. This is what GitHub calls the Fork and Pull model:
+
+> The Fork & Pull Model lets anyone fork an existing repository and push changes to their personal fork without requiring access be granted to the source repository. The changes must then be pulled into the source repository by the project maintainer. This model reduces the amount of friction for new contributors and is popular with open source projects because it allows people to work independently without upfront coordination.
+
+When we receive a pull request we'll review the changes and if everything's okay we'll update our copy.
+
+If a site is not in our set, you can create a file for it in the same way. See [Creating files on GitHub](https://github.com/blog/1327-creating-files-on-github).
+
+### How to write a site config file
+
+The quickest and simplest way is to use our [point-and-click interface](http://siteconfig.fivefilters.org). It's a simple tool only intended to create a rule to extract the correct content block.
+
+For further refinements, e.g. selecting the title, stripping elements, dealing with multi-page articles, please see our [help page](http://help.fivefilters.org/customer/portal/articles/223153-site-patterns).
+
+### Instapaper
+
+When we introduced site patterns, we chose to adopt the [same format](http://blog.instapaper.com/post/730281947) used by Instapaper. This allows us to make use of the existing extraction rules contributed by Instapaper users.
+
+Marco, Instapaper's creator, graciously opened up the database of contributions to everyone:
+
+> And, recognizing that your efforts could be useful to a wide range of other tools and services, I'll make the list of all of these site-specific configurations available to the public, free, with no strings attached.
+
+Most of the extraction rules in our set are borrowed from Instapaper. You can see the list maintained by Instapaper at [instapaper.com/bodytext/](http://instapaper.com/bodytext/) (login required).
+
+### Testing site config files
+
+Currently you need your own copy of Full-Text RSS to test changes to the site config files. In the future we will try to make this process easier.
-title: //meta[@property='og:title']/@content\r
-body: //*[@class='fliesstext_detail' or @class='detail_fliesstext'] | //img[@itemprop="image" and starts-with(@src, "/sixcms/media.php/")]\r
-\r
-strip_id_or_class: socialshareprivacy1\r
-strip_id_or_class: zvaFacebookButton\r
-\r
-tidy: no\r
-prune: no\r
-\r
+title: //meta[@property='og:title']/@content
+body: //*[@class='fliesstext_detail' or @class='detail_fliesstext'] | //img[@itemprop="image" and starts-with(@src, "/sixcms/media.php/")]
+
+strip_id_or_class: socialshareprivacy1
+strip_id_or_class: zvaFacebookButton
+
+tidy: no
+prune: no
+
test_url: http://www.aachener-nachrichten.de/lokales/aachen-detail-an/2517757
\ No newline at end of file
-title: //meta[@property='og:title']/@content\r
-body: //*[@class='fliesstext_detail' or @class='detail_fliesstext'] | //img[@itemprop="image" and starts-with(@src, "/sixcms/media.php/")]\r
-\r
-strip_id_or_class: socialshareprivacy1\r
-strip_id_or_class: zvaFacebookButton\r
-\r
-tidy: no\r
-prune: no\r
-\r
+title: //meta[@property='og:title']/@content
+body: //*[@class='fliesstext_detail' or @class='detail_fliesstext'] | //img[@itemprop="image" and starts-with(@src, "/sixcms/media.php/")]
+
+strip_id_or_class: socialshareprivacy1
+strip_id_or_class: zvaFacebookButton
+
+tidy: no
+prune: no
+
test_url: http://www.aachener-zeitung.de/sixcms/detail.php?template=az_detail&id=2552718
\ No newline at end of file
-title: //meta[@property='og:title']/@content\r
-body: //div[@class='datosi' or @class='date' or @class='photo-alt1' or @class='text']\r
-strip_id_or_class: colB\r
-\r
-prune: no\r
+title: //meta[@property='og:title']/@content
+body: //div[@class='datosi' or @class='date' or @class='photo-alt1' or @class='text' or @itemprop='articleBody']
+strip_id_or_class: colB
+
+prune: no
test_url: http://www.abc.es/20120209/tv-series/abci-house-ultima-temporada-201202090936.html
\ No newline at end of file
-title: //h1\r
-author: //div[@class="byline"]/a\r
-date: //span[@class="timestamp"]\r
-\r
-strip: //p[@class="topics"]\r
-strip: //h1\r
-strip: //div[@class="byline"]\r
-strip: //p[@class="published"]\r
+title: //div[@class='article section']//h1
+author: //div[@class="byline"]/a
+date: //span[@class="timestamp"]
+body: //div[@class="page section"]
+
+strip: //a[@class="inline-caption"]
+strip: //p[@class="ticker section noprint"]
+strip: //p[@class="topics"]
+strip: //h1
+strip: //div[@class="byline"]
+strip: //p[@class="published"]
strip: //div[contains(@class,"featured-scroller")]
-test_url: http://www.abc.net.au/news/2011-11-08/crabb-carbon-legislation-abbott-demolition/3652544
\ No newline at end of file
+strip_id_or_class: footer
+
+tidy: no
+
+test_url: http://www.abc.net.au/news/2013-03-27/open-speed-highways-change-clp-giles/4597892
+test_url: http://www.abc.net.au/news/2013-04-30/credit-growth-remains-subdued/4660054?section=business
-title: //h1[@class='headline']\r
-body: //div[@id='storyText']\r
-# for video entries\r
-body: //img[@id='ff-img'] | //div[@id='meta']//div[contains(@class, 'overview')]\r
-author: //div[@class='byline']\r
-date: //div[@class='date']\r
-strip: //*[@id='date_partner']\r
-\r
-strip: //div[@class='breadcrumb']\r
-strip: //div[contains(@class,'show_tools')]\r
-strip: //div[@id='sponsoredByAd']\r
-strip: //div[contains(@class,'rel_container')]\r
-strip: //p[a[starts-with(@href, 'http://www.twitter.com')]]\r
-strip: //p[a[starts-with(@href, 'http://www.facebook.com')]]\r
-strip: //p[contains(., 'Click here to return to')]\r
-#strip_id_or_class: media\r
-strip_id_or_class: mediaplayer\r
-\r
-replace_string(<link rel="image_src" href="http): <img id="ff-img" src="http\r
-\r
-prune: no\r
-\r
-single_page_link: concat(//li[@class='pager']//a/@href, '&singlePage=true')\r
-\r
-test_url: http://abcnews.go.com/Politics/newt-gingrich-rocky-rollout-presidential-campaign-recover/story?id=13632744\r
-# multi-page\r
+title: //h1[@class='headline']
+body: //div[@id='storyText']
+# for video entries
+body: //img[@id='ff-img'] | //div[@id='meta']//div[contains(@class, 'overview')]
+author: //div[@class='byline']
+date: //div[@class='date']
+strip: //*[@id='date_partner']
+
+strip: //div[@class='breadcrumb']
+strip: //div[contains(@class,'show_tools')]
+strip: //div[@id='sponsoredByAd']
+strip: //div[contains(@class,'rel_container')]
+strip: //p[a[starts-with(@href, 'http://www.twitter.com')]]
+strip: //p[a[starts-with(@href, 'http://www.facebook.com')]]
+strip: //p[contains(., 'Click here to return to')]
+#strip_id_or_class: media
+strip_id_or_class: mediaplayer
+
+replace_string(<link rel="image_src" href="http): <img id="ff-img" src="http
+
+prune: no
+
+single_page_link: concat(//li[@class='pager']//a/@href, '&singlePage=true')
+
+test_url: http://abcnews.go.com/Politics/newt-gingrich-rocky-rollout-presidential-campaign-recover/story?id=13632744
+# multi-page
test_url: http://abcnews.go.com/Blotter/family-freed-american-hostage-somalia-seals-obama/story?id=15439544
\ No newline at end of file
-title: //div[@id='H_docTitle']\r
-\r
-body: //div[@id='H_meta' or @id='H_content' or @id='F_footer']\r
-\r
-strip_id_or_class: F_toenail\r
-\r
-prune: no\r
-\r
+title: //div[@id='H_docTitle']
+
+body: //div[@id='H_meta' or @id='H_content' or @id='F_footer']
+
+strip_id_or_class: F_toenail
+
+prune: no
+
test_url: http://www.accesstoinsight.org/lib/authors/nyanaponika/wheel026.html
\ No newline at end of file
-body: //div[starts-with(@id, 'news-id-')]\r
-\r
+body: //div[starts-with(@id, 'news-id-')]
+
test_url: http://acidcow.com/fun/20933-acid-picdump-83-pics.html
\ No newline at end of file
-title://h1[@class="title"]\r
-author://div[@class="submitted"]/span/a\r
-date://div[@class="submitted"]/span\r
-body://div[@class="content-wrapper"]\r
-\r
-strip://div[@id="skip-link"]\r
-strip://div[@id="region-content-3-3"]\r
+title://h1[@class="title"]
+author://div[@class="submitted"]/span/a
+date://div[@class="submitted"]/span
+body://div[@class="content-wrapper"]
+
+strip://div[@id="skip-link"]
+strip://div[@id="region-content-3-3"]
strip://div[@id="section-footer"]
test_url: https://www.acquia.com/blog/drupals-long-warmth-toward-third-party-code
\ No newline at end of file
-tidy:no\r
-date: //time[@class='updated']\r
-dissolve: //ul[@class='video-gallery']/li\r
+tidy:no
+date: //time[@class='updated']
+dissolve: //ul[@class='video-gallery']/li
dissolve: //ul[@class='video-gallery']
test_url: http://www.acroswing.fr/actualites/competition_rock/selectif_bellegarde_sur_valserine__2012-02-26.php
\ No newline at end of file
--- /dev/null
+title: //h1[@class='articleTitle ']
+body: //div[@class='bodyText widget storyContent']
+strip: //p/span[@class='quote']/..
+strip_id_or_class: 'pull1'
+test_url: https://www.aftenposten.no/meninger/spaltister/Portrett-av-scenekunstneren-som-ung-mann-7167959.html
\ No newline at end of file
--- /dev/null
+author: //article//address[contains(@class, 'author')]
+body: //article[.//div[contains(@class, 'abBodyText')]]//*[contains(@class, 'abLeadText') or contains(@class, 'abBodyText') or contains(@class, 'abImageBlock') or contains(@class, 'abIGSatellite')]
+
+strip: //address//img
+strip: //footer
+strip_id_or_class: abSticky
+
+prune: no
+
+test_url: http://www.aftonbladet.se/sportbladet/hockey/sverige/allsvenskan/article17498194.ab
+test_url: http://www.aftonbladet.se/debatt/article16207536.ab
+test_url: http://www.aftonbladet.se/debatt/debattamnen/politik/article17483377.ab
+test_url: http://www.aftonbladet.se/rss.xml
\ No newline at end of file
-body: //div[@id='content']\r
-\r
-# clean up recipe pages\r
-strip: //h2[@class='fn'] | //h2[@class='double-lined'] | //h3 | //div[@id='threeColumn2'] | //div[@id='threeColumn3']\r
-\r
-#recipe pages\r
-strip_id_or_class: "recipe-feedback"\r
-strip_id_or_class: "comments"\r
-strip_id_or_class: "procedure-number"\r
-strip_id_or_class: "more-with-author"\r
-\r
-#slice\r
-strip_id_or_class: "inner"\r
+body: //div[@id='content']
+
+# clean up recipe pages
+strip: //h2[@class='fn'] | //h2[@class='double-lined'] | //h3 | //div[@id='threeColumn2'] | //div[@id='threeColumn3']
+
+#recipe pages
+strip_id_or_class: "recipe-feedback"
+strip_id_or_class: "comments"
+strip_id_or_class: "procedure-number"
+strip_id_or_class: "more-with-author"
+
+#slice
+strip_id_or_class: "inner"
test_url: http://aht.seriouseats.com/archives/2009/12/the-burger-lab-salting-ground-beef.html
\ No newline at end of file
--- /dev/null
+body: //div[@id='main-column']//div[@class='content']
+
+prune: no
+
+test_url: http://www.albayan.ae/across-the-uae/education/2013-08-29-1.1949645
+test_url: http://www.albayan.ae/1.448?ot=ot.AjaxPageLayout
\ No newline at end of file
--- /dev/null
+body: //section[@class='content']
+date: //span[1]
+author: //h1[@id='sitetitle']
+test_url: https://alexduner.com/blog/2013/1/something-i-learned-today
\ No newline at end of file
--- /dev/null
+body: //section[@class='content']
+date: //span[1]
+author: //h1[@id='sitetitle']
+test_url: https://alexduner.squarespace.com/blog/2013/1/tech-culture-from-the-outside-looking-in
\ No newline at end of file
-title: //h1[@class='title']\r
-author: //h3[@class='byline']/a\r
-date: //div[@class='ishinfo']\r
-\r
-body: //*[@id='articletext']\r
-strip_id_or_class: 'ishinfo'\r
-strip_id_or_class: 'metastuff'\r
-strip_id_or_class: 'learnmore'\r
-strip_id_or_class: 'discuss'\r
-\r
+title: //h1[@class='title']
+author: //h3[@class='byline']/a
+date: //div[@class='ishinfo']
+
+body: //*[@id='articletext']
+strip_id_or_class: 'ishinfo'
+strip_id_or_class: 'metastuff'
+strip_id_or_class: 'learnmore'
+strip_id_or_class: 'discuss'
+
prune: no
test_url: http://www.alistapart.com/articles/organizing-mobile/
\ No newline at end of file
-title: //span[@id='DetailedTitle']\r
-body: //td[@id='tdTextContent']\r
-strip_id_or_class: Skyscrapper_Body\r
-date: //span[@id='ctl00_cphBody_lblDate']\r
-author: //div[@id="dvAuthorInfo"]//a/text()\r
-strip: //table[ tbody/tr/td/object ]\r
-prune: no\r
+title: //span[@id='DetailedTitle']
+body: //td[@id='tdTextContent']
+strip_id_or_class: Skyscrapper_Body
+date: //span[@id='ctl00_cphBody_lblDate']
+author: //div[@id="dvAuthorInfo"]//a/text()
+strip: //table[ tbody/tr/td/object ]
+prune: no
test_url: http://www.aljazeera.com/indepth/opinion/2012/01/2012114121925380575.html
\ No newline at end of file
-title: //h1[@id='itemTitle']\r
-body: //img[@id="ctl00_CenterColumnPlaceHolder_recipe_photoStuff_imgPhoto"] | //div[@id='ctl00_CenterColumnPlaceHolder_recipe_divSubmitter'] | //div[contains(@class, 'recipe-details-content')]\r
-strip: //div[@class='top-left' or @class='top-right' or @class='bot-left' or @class='bot-right']\r
-strip: //div[contains(@class, 'rightcoltoolsdiv')]\r
-strip: //div[contains(@class, 'servings-form')]\r
-strip: //p[@class='nutritional-information']\r
-strip: //a[contains(@class, 'nutritional-information') or contains(@class, 'nutritionanchor')]\r
-strip: //div[@id='nutri-info']/div[contains(@class, 'title')]\r
-strip: //img[@id='ctl00_CenterColumnPlaceHolder_recipe_imgSubmitter']\r
-strip_id_or_class: eshaAttribute\r
-strip_id_or_class: eshaParagraph\r
-prune: no\r
+title: //h1[@id='itemTitle']
+body: //img[@id="ctl00_CenterColumnPlaceHolder_recipe_photoStuff_imgPhoto"] | //div[@id='ctl00_CenterColumnPlaceHolder_recipe_divSubmitter'] | //div[contains(@class, 'recipe-details-content')]
+strip: //div[@class='top-left' or @class='top-right' or @class='bot-left' or @class='bot-right']
+strip: //div[contains(@class, 'rightcoltoolsdiv')]
+strip: //div[contains(@class, 'servings-form')]
+strip: //p[@class='nutritional-information']
+strip: //a[contains(@class, 'nutritional-information') or contains(@class, 'nutritionanchor')]
+strip: //div[@id='nutri-info']/div[contains(@class, 'title')]
+strip: //img[@id='ctl00_CenterColumnPlaceHolder_recipe_imgSubmitter']
+strip_id_or_class: eshaAttribute
+strip_id_or_class: eshaParagraph
+prune: no
test_url: http://allrecipes.com/Recipe/Taco-Pie/Detail.aspx?src=rotd
\ No newline at end of file
-title://div[@class="article-title"]/h1[@class="title"]\r
-date: //p[@class="article-date"]\r
-body://*[@class="article-body article-text"]\r
-# Trim out related posts at bottom of article\r
-strip://blockquote[@class="memo"]\r
-\r
-# Yup, no idea why author won't work...\r
-author://div[@class="page-header article-header clearfix"]/p[@class="title"]\r
+title://div[@class="article-title"]/h1[@class="title"]
+date: //p[@class="article-date"]
+body://div[contains(@class, "article-body")]
+# Trim out related posts at bottom of article
+strip://blockquote[@class="memo"]
+
+tidy: no
+
+# Yup, no idea why author won't work...
+author://div[@class="page-header article-header clearfix"]/p[@class="title"]
# [Marco:] Author won't work here because the page defines the "home" link under the author's name as rel="author", which always gets priority if the page has defined it.
-test_url: http://allthingsd.com/20120513/exclusive-yahoos-thompson-out-levinsohn-in-board-settlement-with-loeb-nears-completion/
\ No newline at end of file
+test_url: http://allthingsd.com/20120513/exclusive-yahoos-thompson-out-levinsohn-in-board-settlement-with-loeb-nears-completion/
+test_url: http://allthingsd.com/20131010/google-cio-ben-fried-on-how-google-works/
\ No newline at end of file
-title: //div[@id='pageHdr']//h1\r
-body: //div[@id='pageHdr']/*[@class='dek'] | //div[@id='printArticle' or @id='slideShowPrint']\r
-strip: //div[contains(@class, 'infoBox') or @id='infoBox']\r
-single_page_link: //li[@id='print']/a\r
-\r
+title: //div[@id='pageHdr']//h1
+body: //div[@id='pageHdr']/*[@class='dek'] | //div[@id='printArticle' or @id='slideShowPrint']
+strip: //div[contains(@class, 'infoBox') or @id='infoBox']
+single_page_link: //li[@id='print']/a
+
prune: no
-\r
+
test_url: http://www.allyou.com/budget-home/money-shopping/freebies-online-00400000066392/
\ No newline at end of file
-body: //div[@class = 'entry']\r
-date: substring-after(//p[@class="date"],'بتاريخ ')\r
-strip_id_or_class: date\r
-strip_id_or_class: follow-single\r
-strip_id_or_class: ratingblock\r
-strip_id_or_class: newRatingHolder\r
-strip_id_or_class: postmetadata\r
-strip_id_or_class: addthis_toolbox\r
-strip_id_or_class: addthis_default_style\r
+body: //div[@class = 'entry']
+date: substring-after(//p[@class="date"],'بتاريخ ')
+strip_id_or_class: date
+strip_id_or_class: follow-single
+strip_id_or_class: ratingblock
+strip_id_or_class: newRatingHolder
+strip_id_or_class: postmetadata
+strip_id_or_class: addthis_toolbox
+strip_id_or_class: addthis_default_style
strip_id_or_class: size-full
test_url: http://alphabeta.argaam.com/?p=35657
\ No newline at end of file
-body: //div[@id = "article-view"]\r
-body: //div[contains(@class, 'article')]//div[contains(@class, 'photo_bg')]\r
-author: //p[@class = "author"]\r
-strip: //h1\r
-strip: //h2\r
-strip_id_or_class: author\r
-prune: no\r
-test_url: http://www.alriyadh.com/2011/10/10/article674357.html\r
+body: //div[@id = "article-view"]
+body: //div[contains(@class, 'article')]//div[contains(@class, 'photo_bg')]
+author: //p[@class = "author"]
+strip: //h1
+strip: //h2
+strip_id_or_class: author
+prune: no
+test_url: http://www.alriyadh.com/2011/10/10/article674357.html
test_url: http://www.alriyadh.com/net/article/780935
\ No newline at end of file
--- /dev/null
+single_page_link: //div[contains(@class, 'story_tools')]//a[contains(@href, '/print/')]
+
+test_url: http://www.alternet.org/civil-liberties/noam-chomsky-surveillance-state-beyond-imagination-being-created-one-freest
+test_url: http://feeds.feedblitz.com/alternet
\ No newline at end of file
-title: //h1\r
-\r
-author: substring-after(//div[@class="enableBullets"]/preceding-sibling::p[1], "By ")\r
-\r
-date: //div/a[contains (@href, "issue")]\r
-\r
-move_into(//div[@class="enableBullets"]/p): (//div[@id="content"]//img)[1]\r
-\r
+title: //h1
+
+author: substring-after(//div[@class="enableBullets"]/preceding-sibling::p[1], "By ")
+
+date: //div/a[contains (@href, "issue")]
+
+move_into(//div[@class="enableBullets"]/p): (//div[@id="content"]//img)[1]
+
body: //div[@class="enableBullets"]
test_url: http://alumni.stanford.edu/get/page/magazine/article/?article_id=54819
\ No newline at end of file
--- /dev/null
+body: //div[@id='content']//div[contains(@class, 'content')]
+strip_id_or_class: widget
+strip: //a[contains(@href, 'upm_export=')]
+
+test_url: http://amandala.com.bz/news/feed/
+test_url: http://amandala.com.bz/news/poor-pse-results-30-raise/
\ No newline at end of file
-title: //span[@id = 'btAsinTitle']\r
-body: (//*[@id='prodImageCell']//a)[1] | //div[@id = 'ps-content'] | //span[@id='actualPriceValue'] | //h2[.='Product Details']/following-sibling::div | //div[@class='h2' and .='Product Description']/following-sibling::div\r
-#strip_id_or_class: quantityDropdownDiv\r
-#strip_id_or_class: addToCartSpan\r
-#strip_id_or_class: oneClickDiv\r
-strip_id_or_class: nocontent\r
-strip_id_or_class: masDynamicConten\r
-strip_id_or_class: dynamic-content\r
-prune: no\r
-\r
-find_string: <span id="actualPriceValue">\r
-replace_string: <span id="actualPriceValue"><br />Price: \r
-\r
-strip_id_or_class: collapsePS\r
-strip_id_or_class: expandPS\r
-strip_id_or_class: psPlaceHolde\r
-strip: //li[contains(., 'update product info') or contains(., 'give feedback on images')]\r
-\r
+title: //span[@id = 'btAsinTitle']
+body: (//*[@id='prodImageCell']//a)[1] | //div[@id = 'ps-content'] | //span[@id='actualPriceValue'] | //h2[.='Product Details']/following-sibling::div | //div[@class='h2' and .='Product Description']/following-sibling::div
+#strip_id_or_class: quantityDropdownDiv
+#strip_id_or_class: addToCartSpan
+#strip_id_or_class: oneClickDiv
+strip_id_or_class: nocontent
+strip_id_or_class: masDynamicConten
+strip_id_or_class: dynamic-content
+prune: no
+
+find_string: <span id="actualPriceValue">
+replace_string: <span id="actualPriceValue"><br />Price:
+
+strip_id_or_class: collapsePS
+strip_id_or_class: expandPS
+strip_id_or_class: psPlaceHolde
+strip: //li[contains(., 'update product info') or contains(., 'give feedback on images')]
+
test_url: http://www.amazon.com/Common-Sense-Forestry-Living-Mother/dp/1931498210/
\ No newline at end of file
-title: //div[@class='head']/h2/a\r
-author: //div[@class='head']/a\r
-date: //div[@class='head']/p[@class='date']/a\r
-body: //div[@class='copy']\r
+title: //div[@class='head']/h2/a
+author: //div[@class='head']/a
+date: //div[@class='head']/p[@class='date']/a
+body: //div[@class='copy']
strip: //p[@class='meta']
test_url: http://americandrink.net/post/10567188712/free-the-hooch
\ No newline at end of file
-title: //div[@class="editorial-content"]/h3\r
-body: //div[@class="hero-image" or @class="editorial-content"]\r
-\r
-strip: //ul[@class="hero-caption"]\r
-strip_id_or_class: footer\r
-\r
-prune: no\r
-tidy: no\r
-\r
+title: //div[@class="editorial-content"]/h3
+body: //div[@class="hero-image" or @class="editorial-content"]
+
+strip: //ul[@class="hero-caption"]
+strip_id_or_class: footer
+
+prune: no
+tidy: no
+
test_url: http://www.americascup.com/en/Latest/News/2012/3/Coutts-and-Peyron-tell-transformative-tale-at-Global-Sports-Forum/
\ No newline at end of file
-title: //h1[@class="post-title"]\r
-author: //span[@class="author"]/a\r
-date: //span[@class="date"]\r
+title: //h1[@class="post-title"]
+author: //span[@class="author"]/a
+date: //span[@class="date"]
body: //div[@class="post-content main"]
test_url: http://www.americastestkitchenfeed.com/gadgets-and-gear/2012/07/chill-out-with-tovolos-king-cube-silicone-ice-cube-tray/
\ No newline at end of file
--- /dev/null
+title: //title
+
+body: //div[@class="entry-content"]
+
+author: //span[@class="author vcard"]
+
+date: //span[@class="entry-date"]
+test_url: http://www.amptoons.com/blog/2013/03/14/open-thread-and-link-farm-i-hate-being-sick-edition/
\ No newline at end of file
-author: //a[@class='b'][1]\r
-date: substring-after(substring-before(//div, 'Posted in'), ' on ')\r
-strip_image_src: /content/images/globals/\r
-strip: //h2[. = 'Page 1']/preceding::p\r
-strip: //h2\r
-\r
-prune: no\r
-\r
-single_page_link: concat('http://www.anandtech.com/print/', substring-after(//meta[@property='og:url']/@content, '/show/'))\r
-\r
+author: //a[@class='b'][1]
+date: substring-after(substring-before(//div, 'Posted in'), ' on ')
+strip_image_src: /content/images/globals/
+strip: //h2[. = 'Page 1']/preceding::p
+strip: //h2
+
+prune: no
+
+single_page_link: concat('http://www.anandtech.com/print/', substring-after(//meta[@property='og:url']/@content, '/show/'))
+
test_url: http://www.anandtech.com/show/5812/eurocom-monster-10-clevos-little-monster/
\ No newline at end of file
--- /dev/null
+body: //div[@class='post_content']
+date: //div[@class='date_day'] | //div[@class='date_month']
+
+test_url: http://www.androidpolice.com/2014/03/30/music-boss-for-pebble-can-now-control-playback-and-volume-on-chromecast-content-from-your-smartwatch/
+
-title: //h2\r
-author: string('Andy Rutledge')\r
-date: //div[@class='articledate']\r
-body: //div[@class='copybody']\r
-\r
-strip: //*[@class='space']\r
-strip: //*[@class='articleFoot']\r
-\r
+title: //h2
+author: string('Andy Rutledge')
+date: //div[@class='articledate']
+body: //div[@class='copybody']
+
+strip: //*[@class='space']
+strip: //*[@class='articleFoot']
+
test_url: http://www.andyrutledge.com/hungry-for-a-better-menu.php
\ No newline at end of file
-title: //h1[@class="title"]\r
-\r
-author: ("Anna Manasova")\r
-# is ignored, unfortunately\r
-\r
-date: //p[@class="date"]\r
-\r
+title: //h1[@class="title"]
+
+author: ("Anna Manasova")
+# is ignored, unfortunately
+
+date: //p[@class="date"]
+
body: //div[@class="entry"]
test_url: http://annatravelling.wordpress.com/2011/11/07/a-day-of-cooking-thai/
\ No newline at end of file
-title: //h1[contains(@class, 'title')#\r
-body: //div[@id='mainContent']//div[contains(@class, 'section_content')] | //ul[@class='section_footer']\r
-date: //div[@class='date']\r
-\r
-strip_id_or_class: sharethis\r
-strip_id_or_class: stats\r
-strip_id_or_class: apply_form\r
-strip_id_or_class: job_map\r
-strip_id_or_class: respond\r
-strip: //h1//span[@class='type']\r
-strip: //li[@class='print' or @class='map']\r
-\r
-replace_string(<ul class="section_footer" style="display): <ul class="section_footer" style="display-bla\r
-\r
-prune: no\r
-tidy: no\r
-\r
+title: //h1[contains(@class, 'title')]
+body: //div[@id='mainContent']//div[contains(@class, 'section_content')] | //ul[@class='section_footer']
+date: //div[@class='date']
+
+strip_id_or_class: sharethis
+strip_id_or_class: stats
+strip_id_or_class: apply_form
+strip_id_or_class: job_map
+strip_id_or_class: respond
+strip: //h1//span[@class='type']
+strip: //li[@class='print' or @class='map']
+
+replace_string(<ul class="section_footer" style="display): <ul class="section_footer" style="display-bla
+
+prune: no
+tidy: no
+
test_url: http://applature.com/mining-jobs/jobs/nickel-west-leinster-analytical-laboratory-technician/
\ No newline at end of file
-strip: //p[@class='sosumi']\r
-# Aren't they witty?\r
-\r
-# I can't work out what causes the before the title. \r
-title: //h1[@class='title']\r
-strip: //h1[@class='title']\r
+strip: //p[@class='sosumi']
+# Aren't they witty?
+
+# I can't work out what causes the before the title.
+title: //h1[@class='title']
+strip: //h1[@class='title']
test_url: http://www.apple.com/pr/library/2011/02/15appstore.html
\ No newline at end of file
--- /dev/null
+body: //div[contains(@class, 'articulum')]
+
+test_url: http://www.appledaily.com.tw/realtimenews/article/new/20140120/330479
+test_url: http://www.appledaily.com.tw/rss/create/kind/rnews/type/new/
\ No newline at end of file
-title: //p[@class='title']\r
-\r
-author: //p[text() = 'By ']/a/text()\r
-strip: //p[text() = 'By ']\r
-\r
-body: //td[@class='bod']\r
-strip_id_or_class: title\r
-strip_id_or_class: minor\r
-\r
-strip_id_or_class: multipagefooter\r
-test_url: http://www.appleinsider.com/articles/12/02/29/inside_os_x_108_mountain_lion_safari_52_gets_a_simplified_user_interface_with_new_sharing_features.html
\ No newline at end of file
+title: //h1[@class="art-head"]
+
+author: //p[contains(@class, 'byline')]/a
+#author: //p[text() = 'By ']/a/text()
+#strip: //p[text() = 'By ']
+
+date: //p[contains(@class, 'date-header')]
+
+body: //div[@class="article"]
+strip_id_or_class: lazy
+#strip_id_or_class: minor
+strip_id_or_class: multipagefooter
+strip_id_or_class: date-header
+strip_id_or_class: byline
+
+find_string: <noscript>
+replace_string: <div>
+find_string: </noscript>
+replace_string: </div>
+
+test_url: http://www.appleinsider.com/articles/12/02/29/inside_os_x_108_mountain_lion_safari_52_gets_a_simplified_user_interface_with_new_sharing_features.html
+test_url: http://appleinsider.com/articles/13/10/03/goldee-companion-app-for-philips-hue-bulbs-offers-shifting-dynamic-light-scenes
+test_url: http://appleinsider.com/appleinsider.rss
\ No newline at end of file
-date: //div[@class='post_date']\r
-\r
-body: //div[@class='post_content']\r
+date: //div[@class='post_date']
+
+body: //div[@class='post_content']
test_url: http://www.archdaily.com/185325/p10-mixed-use-building-studio-up
\ No newline at end of file
-# Description: Fix XPaths to include ALL chapters on 'view_full_work' pages.\r
-# Include: work meta, summary, chapter information, and notes which Instapaper strips out on default.\r
-# Exclude: header, footer, navigation, comments.\r
-# Notes: User is a newbie with XPaths.\r
-\r
-title: //h2[@class='title']\r
-author: //h3[@class='byline']\r
-author: //a[@class='login author']\r
-\r
-strip_id_or_class:header\r
-strip_id_or_class:navigation\r
-strip_id_or_class:feedback\r
-strip_id_or_class:kudos\r
-strip_id_or_class:add_comment_placeholder\r
-strip_id_or_class:add_comment\r
-strip_id_or_class:globalize\r
+# Description: Fix XPaths to include ALL chapters on 'view_full_work' pages.
+# Include: work meta, summary, chapter information, and notes which Instapaper strips out on default.
+# Exclude: header, footer, navigation, comments.
+# Notes: User is a newbie with XPaths.
+
+title: //h2[@class='title']
+author: //h3[@class='byline']
+author: //a[@class='login author']
+
+strip_id_or_class:header
+strip_id_or_class:navigation
+strip_id_or_class:feedback
+strip_id_or_class:kudos
+strip_id_or_class:add_comment_placeholder
+strip_id_or_class:add_comment
+strip_id_or_class:globalize
strip_id_or_class:footer
-test_url: http://archiveofourown.org/works/229402?view_full_work=true
\ No newline at end of file
+
+single_page_link: //div[@id='main']//a[contains(@href, 'view_adult=true')]
+
+test_url: http://archiveofourown.org/works/229402?view_full_work=true
+test_url: http://archiveofourown.org/works/750111/chapters/1399929
\ No newline at end of file
-author: //p[@class='byline']/a\r
-body: //div[contains(@class,'article-content')]\r
-strip: //h2[@class='title']\r
-strip_id_or_class: byline\r
-prune: no\r
-\r
-date: //div[@class='byline']/span[@class='posted']//abbr/@original-title\r
-date: //div[@class='byline']/span[@class='posted']//abbr\r
-\r
-title: //div[@id='story']//h2[@class='title']\r
-\r
-strip: //div[@class='pager']\r
-next_page_link: //nav//a[span/@class='next']/@href\r
-\r
-test_url: http://arstechnica.com/tech-policy/news/2012/02/gigabit-internet-for-80-the-unlikely-success-of-californias-sonicnet.ars\r
-test_url: http://arstechnica.com/apple/2005/04/macosx-10-4/
\ No newline at end of file
+author: //p[@class='byline']/a
+body: //div[contains(@class,'article-content')]
+strip: //h2[@class='title']
+strip_id_or_class: byline
+strip_id_or_class: story-sidebar
+prune: no
+
+date: //div[@class='byline']/span[@class='posted']//abbr/@original-title
+date: //div[@class='byline']/span[@class='posted']//abbr
+
+title: //div[@id='story']//h2[@class='title']
+
+strip: //div[@class='pager']
+next_page_link: //nav//a[span/@class='next']/@href
+
+test_url: http://arstechnica.com/tech-policy/news/2012/02/gigabit-internet-for-80-the-unlikely-success-of-californias-sonicnet.ars
+test_url: http://arstechnica.com/apple/2005/04/macosx-10-4/
-title: //div[@class="mod-bostonarticleheader mod-articleheader"]/h1\r
-author: substring-after(//div[@class="mod-bostonarticlebyline mod-articlebyline"]/span[3],"By ")\r
-date: //div[@class="mod-bostonarticlebyline mod-articlebyline"]/span[@class="pubdate"]\r
-\r
+title: //div[@class="mod-bostonarticleheader mod-articleheader"]/h1
+author: substring-after(//div[@class="mod-bostonarticlebyline mod-articlebyline"]/span[3],"By ")
+date: //div[@class="mod-bostonarticlebyline mod-articlebyline"]/span[@class="pubdate"]
+
strip_id_or_class: mod-pagination
test_url: http://articles.boston.com/2011-10-23/news/30313691_1_bigfoot-free-speech-monadnock-state-park
\ No newline at end of file
-title: //div[@class="mod-courantarticleheader mod-articleheader"]/h1\r
-date: //div[@class="mod-courantarticlebyline mod-articlebyline"]/span[@class="pubdate"]\r
-author: //div[@class="mod-courantarticlebyline mod-articlebyline"]/span[3]\r
-\r
-strip_id_or_class: mod-article-byline\r
-strip_id_or_class: mod-article-header\r
-strip_id_or_class: mod-article-subtitle\r
-#This leaves some crud after the article, but it's better than nothing.\r
-#It would be ideal if we could set the body to every element matching //div[contains(@class, "mod-articletext")]/p, but it seems like body only takes the first matching element.\r
+title: //div[@class="mod-courantarticleheader mod-articleheader"]/h1
+date: //div[@class="mod-courantarticlebyline mod-articlebyline"]/span[@class="pubdate"]
+author: //div[@class="mod-courantarticlebyline mod-articlebyline"]/span[3]
+
+strip_id_or_class: mod-article-byline
+strip_id_or_class: mod-article-header
+strip_id_or_class: mod-article-subtitle
+#This leaves some crud after the article, but it's better than nothing.
+#It would be ideal if we could set the body to every element matching //div[contains(@class, "mod-articletext")]/p, but it seems like body only takes the first matching element.
test_url: http://articles.courant.com/2011-10-22/news/hc-green-drugsearch--1022-20111022_1_drugs-in-student-lockers-police-dogs-lockdown
\ No newline at end of file
--- /dev/null
+body: //div[contains(@class, "article_body")]
+# print view
+body: //div[@id='print_facet']//div[@id='body']
+
+tidy: no
+prune: no
+
+single_page_link: concat(substring-before(//div[@id="echo_container_a"]/@guid, '_story.html'), '_print.html')
+
+test_url: http://articles.washingtonpost.com/2011-10-22/world/35279694_1_germany-acts-german-leaders-chancellor-angela-merkel
+test_url: http://articles.washingtonpost.com/2013-05-31/opinions/39658000_1_chemical-weapons-mass-destruction-cartels
\ No newline at end of file
-body: //div[@id='HeadLine']\r
+body: //div[@id='HeadLine']
strip: //div[@id='utility_right']
test_url: http://www.asahi.com/culture/update/0520/TKY201105200321.html
\ No newline at end of file
-title: //h1[@class='article_title']\r
-author: //span[@class='author']\r
-date: //h2[@class='dateline']\r
+title: //h1[@class='article_title']
+author: //span[@class='author']
+date: //h2[@class='dateline']
body: //div[@class='article_body']
test_url: http://ascarter.net/2012/02/20/enough-is-enough.html
\ No newline at end of file
-title: //span[@class='titel']\r
-author: //span[@class='metadaten_C']/a//span[@class='metadaten_C']\r
-date: substring-after(//span[@class='metadaten_C'],'astronews.com')\r
-strip: //span[@class='bu']\r
-strip_image_src: '/_images/'\r
+title: //span[@class='titel']
+author: //span[@class='metadaten_C']/a//span[@class='metadaten_C']
+date: substring-after(//span[@class='metadaten_C'],'astronews.com')
+strip: //span[@class='bu']
+strip_image_src: '/_images/'
test_url: http://www.astronews.com/news/artikel/2011/10/1110-021.shtml
\ No newline at end of file
-# Johannes Stühler\r
-\r
-title://h2\r
-author://span[@class='meta-content']\r
-date://abbr[@class='date published']/@title\r
-body://div[@class='entry-content']\r
+# Johannes Stühler
+
+title://h2
+author://span[@class='meta-content']
+date://abbr[@class='date published']/@title
+body://div[@class='entry-content']
test_url: http://www.asymco.com/2011/01/14/is-android-more-efficient-than-ios-at-generating-search-revenue/
\ No newline at end of file
-prune: no\r
-body: //div[@class='post-body']\r
-author: //p[@class='byline']//a\r
-date: substring-after(//div[@class='about']/p[2], 'Posted')\r
+prune: no
+body: //div[@class='post-body']
+author: //p[@class='byline']//a
+date: substring-after(//div[@class='about']/p[2], 'Posted')
strip: //div[@class='body']/div[@class='meta']
test_url: http://www.autoblog.com/2012/01/17/next-gen-bmw-x5-caught-again/
\ No newline at end of file
-author: //*[@id="article_wrapper"]/div[1]/a[1]\r
-body: //*[@id="article_wrapper"]/div[2]\r
+author: //*[@id="article_wrapper"]/div[1]/a[1]
+body: //*[@id="article_wrapper"]/div[2]
date: //*[@id="article_wrapper"]/div[1]/text()[2]
test_url: http://www.avclub.com/articles/forgetmenot,70904
\ No newline at end of file
-single_page_link: //div[@class='toppaginate']//a[@rel='nofollow']\r
-convert_double_br_tags: yes\r
-\r
-title: //div[@class="story"]/h1\r
-body: //div[@id="story-body-text"]\r
-author: //span[@class="byline"]\r
-date: //p[@class="date"]\r
-\r
-strip: //*[@class='all']\r
-strip: //*[@class='articlerail']\r
+single_page_link: //div[@class='toppaginate']//a[@rel='nofollow']
+convert_double_br_tags: yes
+
+title: //div[@class="story"]/h1
+body: //div[@id="story-body-text"]
+author: //span[@class="byline"]
+date: //p[@class="date"]
+
+strip: //*[@class='all']
+strip: //*[@class='articlerail']
test_url: http://www.baltimoresun.com/news/maryland/bs-md-omalley-budget-2-20120116,0,5340585.story
\ No newline at end of file
--- /dev/null
+title: //h1[@class='title']
+author: //p[@class="author"]/a[1]
+body: //div[@class="article"]
+date: //p[@class="date"]
+
+# remove user tools
+strip: //div[@class='tools']
+strip: //h1
+strip: //h2[@class='subtitle']
+strip: //p[@class='author']
+strip: //p[@class='date']
+
+test_url: http://www.baseballprospectus.com/article.php?articleid=18463
\ No newline at end of file
-title: //h2\r
-date: //span[@class='date']\r
-body: //div[@class='entry']\r
-\r
-strip: //div[@class='zusatz']\r
+title: //h2
+date: //span[@class='date']
+body: //div[@class='entry']
+
+strip: //div[@class='zusatz']
test_url: http://www.basicthinking.de/blog/2011/12/13/sagt-social-networks-adieu-begrust-private-networks/
\ No newline at end of file
-author: substring(//h3[@class='headlines']/span[@class='dates'],0,string-length(//h3[@class='headlines']/span[@class='dates'])-20)\r
-\r
-\r
-date: substring((//h3[@class='headlines']/span[@class='dates']),string-length(//h3[@class='headlines']/span[@class='dates'])-18,12)\r
-\r
-\r
-body: //div[@class='first-article-big']\r
-strip: //table[@class='newsimagecontainer']\r
-strip: //h3[@class='headlines']\r
-strip: //iframe[@class='headlines']\r
-strip: //a[@class='newslink']\r
+author: substring(//h3[@class='headlines']/span[@class='dates'],0,string-length(//h3[@class='headlines']/span[@class='dates'])-20)
+
+
+date: substring((//h3[@class='headlines']/span[@class='dates']),string-length(//h3[@class='headlines']/span[@class='dates'])-18,12)
+
+
+body: //div[@class='first-article-big']
+strip: //table[@class='newsimagecontainer']
+strip: //h3[@class='headlines']
+strip: //iframe[@class='headlines']
+strip: //a[@class='newslink']
convert_double_br_tags: yes
test_url: http://bb.is/Pages/82?NewsID=174119
\ No newline at end of file
-body: //div[@class="story-body"]\r
-title: //h1[@class="story-header"]\r
-date: //span[@class="story-date"]/span[@class='date']\r
-\r
-# recipes, e.g. http://www.bbc.co.uk/food/recipes/mymincepies_71055\r
-body: //div[contains(@class, 'hrecipe')]//div[@id='subcolumn-1']\r
-\r
-#strip: //div[@class="story-feature narrow"]\r
-#strip: //div[@class="story-feature wide"]\r
-#strip: //div[@class="story-feature dslideshow-enclosure"]\r
-strip: //div[contains(@class, "story-feature")]\r
-strip: //span[@class="story-date"]\r
-#strip: //div[@class="caption body-narrow-width"]\r
-strip: //div[@class="warning"]//p\r
-strip: //div[@id='page-bookmark-links-head']\r
-strip: //object\r
-strip: //div[contains(@class, "bbccom_advert_placeholder")]\r
-strip: //div[contains(@class, "embedded-hyper")]\r
-strip: //div[contains(@class, 'market-data')]\r
-strip: //a[contains(@class, 'hidden')]\r
-strip: //div[contains(@class, 'hypertabs')]\r
-strip: //div[contains(@class, 'related')]\r
-strip: //form[@id='comment-form']\r
-strip: //div[contains(@class, 'comment-introduction')]\r
-\r
-replace_string(<noscript>): <div>\r
-replace_string(</noscript>): </div>\r
-\r
-prune: no\r
-\r
-dissolve: //h2\r
-test_url: http://www.bbc.co.uk/news/business-15060862
\ No newline at end of file
+body: //div[@class="story-body"]
+# for video entries
+body: //div[contains(@class, "videoInStory") or @id="meta-information"]
+title: //h1[@class="story-header"]
+date: //span[@class="story-date"]/span[@class='date']
+# for sport site
+date: //meta[@name='DCTERMS.created']/@content
+author: //div[@id='headline']//span[@class='byline-name']
+
+# recipes, e.g. http://www.bbc.co.uk/food/recipes/mymincepies_71055
+body: //div[contains(@class, 'hrecipe')]//div[@id='subcolumn-1']
+
+#strip: //div[@class="story-feature narrow"]
+#strip: //div[@class="story-feature wide"]
+#strip: //div[@class="story-feature dslideshow-enclosure"]
+strip: //div[contains(@class, "story-feature")]
+strip: //span[@class="story-date"]
+#strip: //div[@class="caption body-narrow-width"]
+strip: //div[@class="warning"]//p
+strip: //div[@id='page-bookmark-links-head']
+strip: //object
+strip: //div[contains(@class, "bbccom_advert_placeholder")]
+strip: //div[contains(@class, "embedded-hyper")]
+strip: //div[contains(@class, 'market-data')]
+strip: //a[contains(@class, 'hidden')]
+strip: //div[contains(@class, 'hypertabs')]
+strip: //div[contains(@class, 'related')]
+strip: //form[@id='comment-form']
+strip: //div[contains(@class, 'comment-introduction')]
+strip: //div[contains(@class, 'share-tools')]
+strip: //div[@id='also-related-links']
+
+replace_string(<noscript>): <div>
+replace_string(</noscript>): </div>
+
+prune: no
+
+dissolve: //h2
+test_url: http://www.bbc.co.uk/sport/0/football/23224017
+test_url: http://www.bbc.co.uk/news/business-15060862
+# video entry
+test_url: http://www.bbc.co.uk/news/world-asia-22056933
\ No newline at end of file
--- /dev/null
+title: //header//h1
+#body: //article[contains(@class, 'node-full')]
+body: //div[contains(@class, 'recipe-details') or contains(@class, 'tips-carousel')] | //section[@id='recipe-ingredients' or @id='recipe-method']
+
+strip_id_or_class: recipe-rating-wrapper
+strip_id_or_class: magazine-subcribe-header
+strip_id_or_class: hide
+strip_id_or_class: recipe-actions
+strip_id_or_class: buy-ingredients
+strip_id_or_class: related-content
+strip_id_or_class: recipe-magazine-ad
+strip_id_or_class: copy-right
+
+prune: no
+
+test_url: http://www.bbcgoodfood.com/recipes/1131634/minced-beef-wellington
\ No newline at end of file
-body: //div[@class="entry-content"]\r
-\r
-# Remove text ‘Tweet’\r
-strip: //div[@class="entry-content"]/div[last()]\r
-\r
-title: h1[@class="entry-title"]\r
-\r
-# If the Instapaper text parser worked with HTML5 tags, we would use:\r
-date: //time[@class="entry-date"]\r
-\r
-# But since it does not, use this more complicated rule:\r
-date: //div[@class="entry-meta"]/a[@rel="bookmark"]\r
-\r
-# Unfortunately, the following rule is overridden by the automatically found author.\r
+body: //div[@class="entry-content"]
+
+# Remove text ‘Tweet’
+strip: //div[@class="entry-content"]/div[last()]
+
+title: //h1[@class="entry-title"]
+
+# If the Instapaper text parser worked with HTML5 tags, we would use:
+date: //time[@class="entry-date"]
+
+# But since it does not, use this more complicated rule:
+date: //div[@class="entry-meta"]/a[@rel="bookmark"]
+
+# Unfortunately, the following rule is overridden by the automatically found author.
author: ("Benoit Maison")
test_url: http://www.benoitmaison.org/2011/12/06/why-siri-had-to-start-in-beta/
\ No newline at end of file
-title: //h1[@class='headline']\r
+title: //h1[@class='headline']
body: //div[contains(@class, 'article-wrapper')]
test_url: http://www.berlingske.dk/danmark/festen-er-flyttet-nordpaa
\ No newline at end of file
--- /dev/null
+body: //div[contains(@class, "NewsText")]
+prune: no
+
+test_url: http://www.bernama.com/bernama/v7/rss/english.php
+test_url: http://www.bernama.com/bernama/v7/newsindex.php?id=943513
\ No newline at end of file
-# some articles at this site like this one doesn't\r
-# seem to pick up the article body via normal \r
-# processing, other articles come through fine\r
-# http://www.betanews.com/joewilcox/article\r
-# /Google-is-a-marketing-sensation/1309708375\r
+# for some articles at this site, like this one, normal
+# processing doesn't seem to pick up the article body;
+# other articles come through fine
+# http://www.betanews.com/joewilcox/article
+# /Google-is-a-marketing-sensation/1309708375
body: //*[@id="article"]
test_url: http://www.betanews.com/joewilcox/article/Google-is-a-marketing-sensation/1309708375
\ No newline at end of file
-title: //div[contains(@class, 'main-content')]//h1\r
-body: //div[@class='summary-column'] | //div[contains(@class, 'main-content')]\r
-\r
-prune: no\r
-\r
-single_page_link: //div[@id='biography-action-links']//a[contains(@href, '/print/')]\r
+title: //div[contains(@class, 'main-content')]//h1
+body: //div[@class='summary-column'] | //div[contains(@class, 'main-content')]
+
+prune: no
+
+single_page_link: //div[@id='biography-action-links']//a[contains(@href, '/print/')]
test_url: http://www.biography.com/print/profile/martin-luther-9389283
\ No newline at end of file
--- /dev/null
+date: //meta[@name='publish-date']/@content
+body: //div[contains(@class, 'articleContentWrapper')]
+prune: no
+
+strip: //div[contains(@class, 'staff_info')]//dd[contains(., 'Twitter')]
+
+strip_id_or_class: related_content
+strip_id_or_class: enlarge
+strip_id_or_class: photoBy
+strip_id_or_class: older
+
+test_url: http://www.bizjournals.com/cincinnati/news/2013/10/03/harris-teeter-shareholders-vote-on.html
+test_url: http://feeds.bizjournals.com/industry_20?format=xml
\ No newline at end of file
-title: //h1[@class='articlehead']\r
-body: //div[@class='column']\r
-strip: //h1\r
-strip: //div[@class='help']\r
-\r
+title: //h1[@class='articlehead']
+body: //div[@class='column']
+strip: //h1
+strip: //div[@class='help']
+
#no author or date/time provided in current layout
test_url: http://bjango.com/articles/actions/
\ No newline at end of file
-tidy: no\r
-prune: no\r
-date: //article/header/h6/time\r
-title: //article/header/h3\r
-author: //meta[@name='author']/@content\r
-body: //article//post\r
+tidy: no
+prune: no
+date: //article/header/h6/time
+title: //article/header/h3
+author: //meta[@name='author']/@content
+body: //article//post
test_url: http://blog.arsln.org/aska-ayip-oluyor/
\ No newline at end of file
-title: //title\r
-author: //span[@class='author vcard']/a\r
-date: //p[@class='headline_meta']/abbr[@class='published']\r
-body: //div[@class='format_text entry-content']\r
-\r
+title: //title
+author: //span[@class='author vcard']/a
+date: //p[@class='headline_meta']/abbr[@class='published']
+body: //div[@class='format_text entry-content']
+
strip: //div[@id='dd_ajax_float']
test_url: http://blog.asmartbear.com/how-to-get-quality-freelance-graphics-design-work-on-a-budget.html
\ No newline at end of file
-# Instapaper gets this back to front and only gets the blog title instead of the article title.\r
-title: substring-before(//title, '-')\r
-\r
-author: //a[ contains(@href, '/people') ]\r
-\r
-body: //div[ @class='post' ]\r
-\r
+# Instapaper gets this back to front and only gets the blog title instead of the article title.
+title: substring-before(//title, '-')
+
+author: //a[ contains(@href, '/people') ]
+
+body: //div[ @class='post' ]
+
# Date is impossible to retrieve since they use those stupid "fuzzy" dates, inserted through javascript, at posterous.
test_url: http://blog.cloudflare.com/understanding-analytics-when-is-a-page-view-n
\ No newline at end of file
-title: //h2\r
-date: //h3\r
-body: //ul\r
+title: //h2
+date: //h3
+body: //ul
test_url: http://blog.fefe.de/?ts=b063bf55
\ No newline at end of file
-# clean Instagram blog a little bit\r
-\r
-tidy:no\r
-prune:no\r
-\r
-body://div[contains(@id,'content')]\r
-\r
-strip_id_or_class:meta\r
-strip_id_or_class:notes\r
+# clean Instagram blog a little bit
+
+tidy:no
+prune:no
+
+body://div[contains(@id,'content')]
+
+strip_id_or_class:meta
+strip_id_or_class:notes
strip_id_or_class:pagination
test_url: http://blog.instagram.com/post/8757832007/fromwhereistand
\ No newline at end of file
--- /dev/null
+author: //a[@href="http://www.marco.org/about"]
+date: //span[@class="date"]
+
+# Remove the date from article body.
+strip: //span[@class="date"]
+
+# Remove pagination links from article body.
+strip: //div[@id="pagination"]
+test_url: http://blog.instapaper.com/post/31303984531
\ No newline at end of file
-date: //span[contains(@class, 'date-links')]\r
-author: //span[contains(@class, 'author-links')]\r
+date: //span[contains(@class, 'date-links')]
+author: //span[contains(@class, 'author-links')]
body: //div[contains(@class, 'entry-content')]
test_url: http://blog.jaysalvat.com/article/celui-qui-avait-refait-son-site-web
\ No newline at end of file
-body: //*[contains(@class, 'post_content')]\r
-author: string('Kaelig Deloumeau-Prigent')\r
-title: //h1[@class='title']\r
+body: //*[contains(@class, 'post_content')]
+author: string('Kaelig Deloumeau-Prigent')
+title: //h1[@class='title']
date: //span[@class='date']
test_url: http://blog.kaelig.fr/post/24877648508/preprocesseurs-css-renoncer-par-choix-ou-par
\ No newline at end of file
-title: //span[@class='pcol1 itemSubjectBoldfont']\r
-body: //div[@id='postListBody']\r
-date: //p[@class='date fil5 pcol2']\r
-single_page_link: /html/frameset/frame[1]/attribute::src\r
+title: //span[@class='pcol1 itemSubjectBoldfont']
+body: //div[@id='postListBody']
+date: //p[@class='date fil5 pcol2']
+single_page_link: /html/frameset/frame[1]/attribute::src
strip: //div[@class='post-btn']
test_url: http://blog.naver.com/how2invest/110135068757
\ No newline at end of file
-# PCHOME blog, a popular Chinese blog host\r
-# Oct 15, 2011\r
-# \r
-\r
-title://*[contains(@class,'imp')]/h2\r
-\r
-date://*[contains(@class,'imp')]/span\r
-body://div[contains(@id,'blog_content')]\r
-\r
-\r
+# PCHOME blog, a popular Chinese blog host
+# Oct 15, 2011
+#
+
+title://*[contains(@class,'imp')]/h2
+
+date://*[contains(@class,'imp')]/span
+body://div[contains(@id,'blog_content')]
+
+
test_url: http://blog.pchome.net/article/462502.html
\ No newline at end of file
-title: //a[@class="blog_title"]\r
-date: //p[@class="when"]/a\r
-body: //div[@class="blog_entry"]\r
-strip_id_or_class:blog_title\r
+title: //a[@class="blog_title"]
+date: //p[@class="when"]/a
+body: //div[@class="blog_entry"]
+strip_id_or_class:blog_title
strip_id_or_class:when
test_url: http://blog.pinboard.in/2011/11/the_social_graph_is_neither/
\ No newline at end of file
--- /dev/null
+# This filter is tested on:
+# http://blog.renren.com/share/224959024/14260739544
+# http://blog.renren.com/share/231323504/14261768898
+# http://blog.renren.com/share/230305019/1502806705
+
+title://h1[contains(@class, 'title-article')]
+author://span[contains(@class, 'name')]
+body://div[contains(@class, 'content-body')]
+
+convert_double_br_tags:yes
+test_url: http://blog.renren.com/share/230305019/1502806705
\ No newline at end of file
-# Sina blog, the most popular blog host in China.\r
-# Its source code is horrible.\r
-# \r
-# Issue:\r
-# Only the first image in the article is displayed.\r
-# The rest images are replace by a 1x1 transparent gif by sina blog host.\r
-# \r
-\r
-title://*[contains(@class,'titName SG_txta')]\r
-author://*[contains(@id,'ownernick')]\r
-date://*[contains(@class,'time SG_txtc')]\r
-body://div[contains(@class,'articalContent')]\r
-\r
-# Remove redundant content which has span class start with "MASS"\r
-# Example <span class="MASSf21674ffeef7"></span>\r
-strip://span[contains(@class,'MASS')]\r
-\r
-# Remove comment\r
-strip://div[contains(@class,'allComm')]\r
-\r
-# Remove hiden text and link\r
-strip://ins\r
-\r
-tidy:no\r
-convert_double_br_tags:yes\r
+# Sina blog, the most popular blog host in China.
+# Its source code is horrible.
+#
+# Issue:
+# Only the first image in the article is displayed.
+# The remaining images are replaced with a 1x1 transparent gif by the Sina blog host.
+#
+
+title://*[contains(@class,'titName SG_txta')]
+author://*[contains(@id,'ownernick')]
+date://*[contains(@class,'time SG_txtc')]
+body://div[contains(@class,'articalContent')]
+
+# Remove redundant content which has span class start with "MASS"
+# Example <span class="MASSf21674ffeef7"></span>
+strip://span[contains(@class,'MASS')]
+
+# Remove comment
+strip://div[contains(@class,'allComm')]
+
+# Remove hidden text and links
+strip://ins
+
+tidy:no
+convert_double_br_tags:yes
test_url: http://blog.sina.com.cn/s/blog_5054769e0102dtja.html
\ No newline at end of file
-title: //h2/a[@class="no-link title"]\r
-author: //h2[@id="blog_owner"]\r
-date: //time\r
-strip: //h2/a[@class="no-link title"]\r
-test_url: http://blog.wells.ee/retina\r
+title: //h2/a[@class="no-link title"]
+author: //h2[@id="blog_owner"]
+date: //time
+strip: //h2/a[@class="no-link title"]
+test_url: http://blog.wells.ee/retina
test_url: http://blog.wells.ee/skeuomorphism
\ No newline at end of file
-# 2011-08-23 [carlo@...] Initial version.\r
-\r
-author: //div[@id="blogauthordatebox-node"]//a[@title="View user profile."]/text()\r
-\r
-# why yes, I do feel a bit dirty\r
-date: substring-before( substring-after( substring-after( //div[@id="blogauthordatebox-node"]//td[3], "on " ), ", "), " " )\r
+# 2011-08-23 [carlo@...] Initial version.
+
+author: //div[@id="blogauthordatebox-node"]//a[@title="View user profile."]/text()
+
+# why yes, I do feel a bit dirty
+date: substring-before( substring-after( substring-after( //div[@id="blogauthordatebox-node"]//td[3], "on " ), ", "), " " )
test_url: http://blogs.aljazeera.net/asia/2011/08/22/peoples-hero
\ No newline at end of file
-title: //div[@id='pageFeature']/h1\r
-body: //div[@id='articleBody']\r
-strip: //div[@class='module wide']\r
+title: //div[@id='pageFeature']/h1
+body: //div[@id='articleBody']
+strip: //div[@class='module wide']
test_url: http://blogs.hbr.org/bregman/2011/04/the-1-killer-of-meetings-and-w.html?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+harvardbusiness+%28HBR.org%29
\ No newline at end of file
-title: //h3[@class="post-name"]\r
-author: //span[@class="user-name"]\r
-date: //div[@class="post-date"]\r
-body: //div[@class="post-content user-defined-markup"]\r
+title: //h3[@class="post-name"]
+author: //span[@class="user-name"]
+date: //div[@class="post-date"]
+body: //div[@class="post-content user-defined-markup"]
footnotes: no
test_url: http://blogs.msdn.com/b/b8/archive/2011/10/04/designing-the-start-screen.aspx
\ No newline at end of file
-title: //div[@id='single']/h1\r
+title: //div[@id='single']/h1
body: //div[@id='postcontent']
test_url: http://blogs.reuters.com/felix-salmon/2010/07/16/the-value-of-a-strong-brand-apple-edition/
\ No newline at end of file
-# meta data\r
-title://h1[@class = 'postTitle']\r
-author:substring-before(substring-after(//span[@class = 'byline'],'By '),'|')\r
-date://span[@class = 'datestamp']\r
-\r
-#body content\r
-body://div[@id = 'singleBlogPost']\r
-\r
-#reclaim author info\r
-move_into(//div[@id = 'singleBlogPost'])://div[@id = 'aboutAuthorDiv']\r
-strip://p[@class = 'moreLink mobileHide']\r
-\r
-#cleanup comments, there might be some open <div> sections\r
-strip://div[@id = 'comments2']\r
+# meta data
+title://h1[@class = 'postTitle']
+author:substring-before(substring-after(//span[@class = 'byline'],'By '),'|')
+date://span[@class = 'datestamp']
+
+#body content
+body://div[@id = 'singleBlogPost']
+
+#reclaim author info
+move_into(//div[@id = 'singleBlogPost'])://div[@id = 'aboutAuthorDiv']
+strip://p[@class = 'moreLink mobileHide']
+
+#cleanup comments, there might be some open <div> sections
+strip://div[@id = 'comments2']
strip://h3[a[@href = '#add-comment']]
test_url: http://blogs.scientificamerican.com/a-blog-around-the-clock/2012/07/10/science-blogs-definition-and-a-history/
\ No newline at end of file
-# metadata\r
-author://div[@class = 'post']/div[@class='meta']/a[1]\r
-date://div[@id = 'rap']/h2[1]\r
-body://div[@class = 'post']\r
-\r
-# wrapping caption and image\r
-wrap_in(fieldset)://div[contains(@class, 'wp-caption')]\r
-\r
-\r
-# clean up\r
-strip://div[@class = 'post']/h3[@class = 'storytitle']\r
-strip://div[@class = 'post']/div[@class = 'social']\r
-strip://img[@style = 'display:none;']\r
+# metadata
+author://div[@class = 'post']/div[@class='meta']/a[1]
+date://div[@id = 'rap']/h2[1]
+body://div[@class = 'post']
+
+# wrapping caption and image
+wrap_in(fieldset)://div[contains(@class, 'wp-caption')]
+
+
+# clean up
+strip://div[@class = 'post']/h3[@class = 'storytitle']
+strip://div[@class = 'post']/div[@class = 'social']
+strip://img[@style = 'display:none;']
strip://img[@height='0' and @width='0']
test_url: http://blogs.smithsonianmag.com/adventure/2011/10/tips-for-women-traveling-in-turkey/
\ No newline at end of file
-title: //h3[@class="post-name"]\r
-author: //span[@class="user-name"]\r
-date: //div[@class="post-date"]\r
-body: //div[@class="post-content user-defined-markup"]\r
+title: //h3[@class="post-name"]
+author: //span[@class="user-name"]
+date: //div[@class="post-date"]
+body: //div[@class="post-content user-defined-markup"]
+strip_id_or_class: log-feedback-list
+tidy: no
footnotes: no
-test_url: http://blogs.technet.com/b/dlemson/archive/2004/03/03/83304.aspx
\ No newline at end of file
+test_url: http://blogs.technet.com/b/dlemson/archive/2004/03/03/83304.aspx
+test_url: http://blogs.technet.com/b/isablog/archive/2009/01/07/a-pptp-client-might-fail-to-connect-to-a-vpn-server-on-the-internet-through-an-isa-server-2006.aspx
\ No newline at end of file
-body://div[@class='entry']\r
-date://div[@class='meta']\r
+body://div[@class='entry']
+date://div[@class='meta']
strip://a[@class='FlattrButton']
test_url: http://bluetouff.com/2012/03/02/polemique-google-vie-privee/
\ No newline at end of file
-title: //h1[@class="entry-title"][2]\r
-author: string("Paul Boag")\r
-date: substring(//span[@class="meta"], 11)\r
-body: //article\r
-strip: //h2\r
-strip: //h1\r
+title: //h1[@class="entry-title"][2]
+author: string("Paul Boag")
+date: substring(//span[@class="meta"], 11)
+body: //article
+strip: //h2
+strip: //h1
strip: //div[@id="callsToAction"]
test_url: http://boagworld.com/working-in-web-design/dealing-with-the-dickheads/
\ No newline at end of file
-# This is far from perfect, but so is BoingBoing's markup\r
-title: //h2[@class="headline"]\r
-single_page_link: //h2[@class="headline"]/a\r
-#date: //p[@class="byline"]\r
-body: //div[@class="post"]\r
-\r
-strip_id_or_class: shareMe\r
-strip_id_or_class: authorbox\r
-strip_id_or_class: byline\r
+# This is far from perfect, but so is BoingBoing's markup
+title: //h2[@class="headline"]
+single_page_link: //h2[@class="headline"]/a
+#date: //p[@class="byline"]
+body: //div[@class="post"]
+
+strip_id_or_class: shareMe
+strip_id_or_class: authorbox
+strip_id_or_class: byline
test_url: http://boingboing.net/2011/10/23/understanding-the-hyperrich-through-the-lens-of-tomorrows-history.html
\ No newline at end of file
-title: //h2[@class='entry-title']\r
+title: //h2[@class='entry-title']
body: //div[@class='entry-content']
test_url: http://boldizsar.palotas.eu/blog/?p=1394
\ No newline at end of file
-body: //span[@property='v:description']\r
-date: //span[@property='v:dtreviewed']\r
-author: //span[@property='v:reviewer']\r
-prune: no\r
+body: //span[@property='v:description']
+date: //span[@property='v:dtreviewed']
+author: //span[@property='v:reviewer']
+prune: no
test_url: http://book.douban.com/review/2422662/
\ No newline at end of file
-#metadata\r
-title://div[@class = 'Topper']/h1\r
-author://div[@class = 'Topper']/h3\r
-date://div[@class = 'Topper']/h6\r
-body://div[@class = 'Core']\r
-\r
-\r
-\r
-# clean up\r
-strip://div[@class = 'Topper']/h1\r
-strip://div[@class = 'Topper']/h3\r
-strip://div[@class = 'Topper']/h4\r
-strip://div[@class = 'Topper']/h5\r
-strip://div[@class = 'Topper']/h6\r
-strip://br[@clear = 'all']\r
-strip://div[@class = 'adCore']\r
-strip://div[@class = 'BookR']\r
+#metadata
+title://div[@class = 'Topper']/h1
+author://div[@class = 'Topper']/h3
+date://div[@class = 'Topper']/h6
+body://div[@class = 'Core']
+
+
+
+# clean up
+strip://div[@class = 'Topper']/h1
+strip://div[@class = 'Topper']/h3
+strip://div[@class = 'Topper']/h4
+strip://div[@class = 'Topper']/h5
+strip://div[@class = 'Topper']/h6
+strip://br[@clear = 'all']
+strip://div[@class = 'adCore']
+strip://div[@class = 'BookR']
strip://div[@class = 'InfoBox']
test_url: http://bookforum.com/inprint/018_04/8595
\ No newline at end of file
-title://h1\r
-author://div[@class="meta"]/span/a\r
-date://div[@class="date"]\r
-body://div[@class="content article"]\r
-strip://div[@class="content article"]/h1\r
+title://h1
+author://div[@class="meta"]/span/a
+date://div[@class="date"]
+body://div[@class="content article"]
+strip://div[@class="content article"]/h1
test_url: http://borderhouseblog.com/?p=7832
\ No newline at end of file
-# NOTE: If testing this configuration yields bad results, including junk text like "Try BostonGlobe.com today" and "THIS STORY APPEARED IN", please replace the Test URL with a current-day headline link from bostonglobe.com.\r
-\r
-title: //div[@class="header"]/h1\r
-author: substring-after(//div[@class="byline"]/h2[@class="author"],"By ")\r
-date: //div[@class="byline"]/p[last()]\r
-body: //div[@class="article-body"]\r
-\r
-strip_id_or_class: aside\r
-strip_id_or_class: promo\r
-strip_id_or_class: skip-nav\r
-strip_id_or_class: article-more\r
-strip_id_or_class: article-bar\r
-\r
-# This removes image captions. If the parser starts saving images from bostonglobe.com (currently, it does not), then this directive should be removed.\r
+# NOTE: If testing this configuration yields bad results, including junk text like "Try BostonGlobe.com today" and "THIS STORY APPEARED IN", please replace the Test URL with a current-day headline link from bostonglobe.com.
+
+title: //div[@class="header"]/h1
+author: substring-after(//div[@class="byline"]/h2[@class="author"],"By ")
+date: //div[@class="byline"]/p[last()]
+body: //div[@class="article-body"]
+
+strip_id_or_class: aside
+strip_id_or_class: promo
+strip_id_or_class: skip-nav
+strip_id_or_class: article-more
+strip_id_or_class: article-bar
+
+# This removes image captions. If the parser starts saving images from bostonglobe.com (currently, it does not), then this directive should be removed.
strip_id_or_class: figure
test_url: http://bostonglobe.com/news/nation/2012/03/17/illinois-primary-could-pivotal/PsDzFZqvhEYyXbOcF9FOkO/story.html
\ No newline at end of file
-#basics\r
-title://h3[@class = 'article_title']\r
-date://span[@class = 'article_date']\r
-body://div[@id = 'center_column_article']\r
-#correct, but author not being picked up in preview\r
-author://span[@class = 'article_author']\r
-\r
-#strips basics from article\r
-strip_id_or_class:article_title\r
-strip_id_or_class:article_date\r
-strip_id_or_class:article_author\r
-\r
-#strips pull quotes\r
+#basics
+title://h3[@class = 'article_title']
+date://span[@class = 'article_date']
+body://div[@id = 'center_column_article']
+#correct, but author not being picked up in preview
+author://span[@class = 'article_author']
+
+#strips basics from article
+strip_id_or_class:article_title
+strip_id_or_class:article_date
+strip_id_or_class:article_author
+
+#strips pull quotes
strip_id_or_class:pull_quote
test_url: http://www.bostonreview.net/BR36.4/megan_pugh_agnes_de_mille_dance.php
\ No newline at end of file
-title: substring-before(//title, '|')\r
-body: //div[@class="entry"]\r
-# Remove the author's picture\r
+title: substring-before(//title, '|')
+body: //div[@class="entry"]
+# Remove the author's picture
strip: //div[@class="entry"]/a[1]
test_url: http://www.boundlessline.org/2011/06/the-nyts-on-gender-over-the-weekend.html
\ No newline at end of file
--- /dev/null
+title: //*[@class='articletitle']
+body: //*[(@id='articlebody')]
+date: //*[(@class='articledate')]
+author: //*[(@class='articleauthor')]
+autodetect_next_page: no
+test_url: http://bowdoinorient.com/article/8045
\ No newline at end of file
-title: //div[@class="standard"]/h1\r
-author: string("BrainFacts.org")\r
-date: //div[@class="meta"]/strong\r
-\r
-strip: //p[@class="skip"]\r
-strip: //div[@class="meta"]\r
-strip: //div[@class="standard"]/h1\r
-strip: //div[@class="modal"]\r
+title: //div[@class="standard"]/h1
+author: string("BrainFacts.org")
+date: //div[@class="meta"]/strong
+
+strip: //p[@class="skip"]
+strip: //div[@class="meta"]
+strip: //div[@class="standard"]/h1
+strip: //div[@class="modal"]
strip: //div[@class="columnRight"]
test_url: http://brainfacts.org/diseases-disorders/childhood-disorders/articles/2011/autism-the-pervasive-developmental-disorder/
\ No newline at end of file
-# set body\r
-body: //div[@id='theContent']\r
-\r
-# set title\r
-title: //div[@id='theContent']/h3\r
+# set body
+body: //div[@id='theContent']
+
+# set title
+title: //div[@id='theContent']/h3
strip: //div[@id='theContent']/h3
test_url: http://www.brandeins.de/archiv/magazin/gegessen-wird-immer/artikel/hunger.html
\ No newline at end of file
-date://h2[@class="date-header"]\r
+date://h2[@class="date-header"]
body://div[@class="entry-content"]
test_url: http://www.brandingstrategyinsider.com/2011/12/top-twelve-branding-keys-for-2012.html
\ No newline at end of file
--- /dev/null
+title: //meta[@name='DC.title']/@content
+title: //div[contains(@class, 'cabecera_noticia')]//h1
+date: //meta[@name='DC.date']/@content
+date: //meta[@name='date']/@content
+body: //div[@class='columna_texto']
+body: //div[@id='cuerpo_noticia']
+body: //div[@class='estructura_2col_1zq']//div[@class='margen_n']
+
+prune: no
+
+strip_id_or_class: disposicion_vertical
+strip_id_or_class: ampliar_foto
+strip_id_or_class: utilidades
+strip_id_or_class: info_relacionada
+strip_id_or_class: m-kiosko
+strip_id_or_class: info_complementa
+
+strip: //p[@class='nota_pie']
+strip: //div[starts-with(@id, 'sumario') and contains(., 'más información')]
+strip: //div[@id='coment' or @id='foros_not']
+
+test_url: http://elpais.com/elpais/2012/02/06/gente/1328526783_491687.html
+test_url: http://www.elpais.com/articulo/cultura/mano/retrato/materia/elpepicul/20120207elpepicul_2/Tes
-body: //div[@class='post full']\r
-title: //h1\r
-author: substring-after(//title, '- ')\r
+body: //div[@class='post full']
+title: //h1
+author: substring-after(//title, '- ')
date: //span[@class='date']
test_url: http://brettterpstra.com/byword-for-ios/
\ No newline at end of file
-title: //div[@id='contentheader']/h1\r
-author: //p[@class='attribution']/span[@class='author']/*\r
-# Is there a way to pull multiple authors? My XPath here is just grabbing the first\r
-\r
-date: /html/head/meta[@name="date"]/@content\r
-body: //div[@class='main-content']\r
-\r
-strip: //p[@class='byline']\r
-strip: //div[@class='img-gallery']\r
-strip: //div[@class='callout']\r
-strip: //div[@class='add-your-view']\r
+title: //div[@id='contentheader']/h1
+author: //p[@class='attribution']/span[@class='author']/*
+# Is there a way to pull multiple authors? My XPath here is just grabbing the first
+
+date: /html/head/meta[@name="date"]/@content
+body: //div[@class='main-content']
+
+strip: //p[@class='byline']
+strip: //div[@class='img-gallery']
+strip: //div[@class='callout']
+strip: //div[@class='add-your-view']
convert_double_br_tags: yes
test_url: http://www.brookings.edu/opinions/2011/1018_cyberattack_libya_goldsmith.aspx
\ No newline at end of file
-title: //h1\r
-body: //div[@class='article']\r
-body: //div[@class='post']\r
-date: //*[@id='single']/span\r
-prune: no\r
+title: //h1
+body: //div[@class='article']
+body: //div[@class='post']
+date: //*[@id='single']/span
+prune: no
test_url: http://brooksreview.net/2011/11/readability-agency/
\ No newline at end of file
--- /dev/null
+title: //h1[contains(@class,'articleTitle')]
+author: //span[@itemprop='name']
+date: //time[@class='published']
+body: //div[contains(@class,'bodyText')]
+
+strip_id_or_class: 'pull1'
+strip_id_or_class: 'relationArticle'
+strip: //span[@class='quote']
+
+# strip h2 if at end of article (typically a request for comments)
+strip: //div[contains(@class,'bodyText')]/node()[last()-1]/self::h2
+test_url: http://www.bt.no/meninger/debatt/Typisk-norsk-a-vare-god-nok-2884108.html
\ No newline at end of file
--- /dev/null
+date: //meta[@itemprop='datePublished']/@content
+body: //div[@class='intro' or contains(@class, 'article_text')]
+prune: no
+strip_id_or_class: embedcode
+strip_id_or_class: EmbedSwitch
+strip_id_or_class: EmbedText
+strip_id_or_class: bildergalerie
+strip_id_or_class: subline_seohour_image
+strip_id_or_class: ova-player
+strip_id_or_class: jcarouseloutput
+strip_id_or_class: cbox_embedded
+
+test_url: http://www.buffed.de/SWTOR-Star-Wars-The-Old-Republic-PC-218697/News/SWTOR-Ab-Patch-24-Lore-Klamotten-faerben-1090051/
+test_url: http://www.buffed.de/feed.cfm?menu_alias=home
\ No newline at end of file
-title: //h1\r
-author: //h2/a\r
-date: substring-after(//h2, '|')\r
-strip_id_or_class: 'attachment'\r
-strip: //h3\r
-\r
+title: //h1
+author: //h2/a
+date: substring-after(//h2, '|')
+strip_id_or_class: 'attachment'
+strip: //h3
+
body: //div[@class='entry']
test_url: http://buquad.com/2012/04/09/paul-ryan/
\ No newline at end of file
--- /dev/null
+date: substring-after(//p[@class='byline'],'Published')
+
+strip: //div[@class='article-meta']
+
+test_url: http://www.business2community.com/social-media/funky-ways-to-print-instagram-photos-0485340
-title://div[@class="sl-layout-post"]/h1\r
-body: //div[contains(@class, 'post-content') or contains(@class, 'KonaBody')]\r
-strip: //div[contains(@class, "post-sidebar")]\r
-strip: //div[@id='related-links']\r
-author://div[@class="byline"]/a\r
-date://div[@class="byline"]/span[@class="date"]\r
-prune: no\r
-\r
-strip://*[contains(@class,'sponsored-text')]\r
-strip: //div[@id='post_footer']\r
-\r
-test_url: http://www.businessinsider.com/microsoft-just-put-one-of-its-hardcore-technical-geniuses-on-xbox-2012-1
\ No newline at end of file
+title://div[@class="sl-layout-post"]/h1
+body: //div[contains(@class, 'post-content') or contains(@class, 'slide-module') or contains(@class, 'KonaBody')]
+strip: //div[contains(@class, "post-sidebar")]
+strip: //div[@id='related-links']
+strip: //div[@class='related-links-container']
+strip: //p[@class='source']
+author://div[@class="byline"]/a
+date://div[@class="byline"]/span[@class="date"]
+prune: no
+
+single_page_link: //a[contains(text(), 'View as one page')]
+
+strip://*[contains(@class,'sponsored-text')]
+strip: //div[@id='post_footer']
+
+test_url: http://www.businessinsider.com/microsoft-just-put-one-of-its-hardcore-technical-geniuses-on-xbox-2012-1
-body: //div[@id='article_detail']\r
-title: //meta[@property='og:title']/@content\r
-date: //div[@id='date_com_art']//a[@class='date']\r
-author: //div[@id='article_detail']//font[@class='auteur']\r
-\r
-strip_id_or_class: porte_titre_theme\r
-strip_id_or_class: cont_param\r
-strip_id_or_class: date_com_art\r
-\r
-prune: no\r
-\r
+body: //div[@id='article_detail']
+title: //meta[@property='og:title']/@content
+date: //div[@id='date_com_art']//a[@class='date']
+author: //div[@id='article_detail']//font[@class='auteur']
+
+strip_id_or_class: porte_titre_theme
+strip_id_or_class: cont_param
+strip_id_or_class: date_com_art
+
+prune: no
+
test_url: http://www.businessnews.com.tn/details_article.php?a=31073&t=522&lang=fr&temp=1
\ No newline at end of file
-# story has several pages, should be detected\r
-body: //div[@id='storyBody']\r
-body: //div[@id='article_body']\r
-body: //div[@id='story_body']\r
-\r
-title://h1[@id='article_headline']\r
-\r
-# article author\r
-author: //p[@class='author']/a\r
-# story author(s)\r
-author: substring-after(//p[@class='byline'], 'By ')\r
-\r
-# article date\r
-date: //span[@class='published_date']\r
-# story date\r
-date: //span[@class='date']\r
-\r
-date: substring-after(//div[contains(@class,'attributor')],'on')\r
-strip_id_or_class: inset\r
-strip: //p/span[@class='photoCredit']\r
-strip: //h1\r
-\r
-strip_id_or_class: page_count\r
-strip_id_or_class: tools\r
-strip_id_or_class: pagination\r
-\r
-single_page_link: //li[@id='stPrint']/a\r
-\r
-test_url: http://www.businessweek.com/magazine/buyback-insurance-a-good-deal-for-retailers-07282011.html\r
+# story has several pages, should be detected
+body: //div[@id='storyBody']
+body: //div[@id='article_body']
+body: //div[@id='story_body']
+
+title://h1[@id='article_headline']
+
+# article author
+author: //p[@class='author']/a
+# story author(s)
+author: substring-after(//p[@class='byline'], 'By ')
+
+# article date
+date: //span[@class='published_date']
+# story date
+date: //span[@class='date']
+
+date: substring-after(//div[contains(@class,'attributor')],'on')
+strip_id_or_class: inset
+strip: //p/span[@class='photoCredit']
+strip: //h1
+
+strip_id_or_class: page_count
+strip_id_or_class: tools
+strip_id_or_class: pagination
+
+single_page_link: //li[@id='stPrint']/a
+
+test_url: http://www.businessweek.com/magazine/buyback-insurance-a-good-deal-for-retailers-07282011.html
test_url: http://www.businessweek.com/articles/2012-06-06/american-pain-the-largest-u-dot-s-dot-pill-mills-rise-and-fall
\ No newline at end of file
-# Creator: Greg Leuch <greg@...>\r
-\r
-# It can be messy.\r
-tidy:no\r
-\r
-# The basic template.\r
-title: //h1[@data-print='title']\r
-author: //a[@data-print='author']\r
-date: //time[@data-print='date']\r
-body: //div[@data-print='body']\r
-body: //section[@data-print='body']\r
-\r
-# For various things...\r
+# Creator: Greg Leuch <greg@...>
+
+# It can be messy.
+tidy:no
+
+# The basic template.
+title: //h1[@data-print='title']
+author: //a[@data-print='author']
+date: //time[@data-print='date']
+body: //div[@data-print='body']
+body: //section[@data-print='body']
+
+# For various things...
strip: *[@data-print="ignore"]
test_url: http://www.buzzfeed.com/hgrant/35-reasons-why-dogs-hate-the-holidays
\ No newline at end of file
-title: //h1\r
-author: //a[contains(@href, '/author/')]\r
-date: //*[@class='post-date']\r
-strip: //*[@class='post-date']\r
+title: //h1
+author: //a[contains(@href, '/author/')]
+date: //*[@class='post-date']
+strip: //*[@class='post-date']
strip: //h1
test_url: http://bygonebureau.com/2011/06/20/an-existential-psychoanalysis/
\ No newline at end of file
--- /dev/null
+title: //div[@class='page-content']//h1
+body: //div[@class='page-content']
+strip_id_or_class: editorial-bar-top
+strip_id_or_class: social-bottom
+strip_id_or_class: comment-form
+strip_id_or_class: pc-why
+
+prune: no
+tidy: no
+
+test_url: http://www.cable.co.uk/news/bt-vision-unveils-interactive-guide-application-800734218/
\ No newline at end of file
-title: //h1[@class='producttabbed-title']\r
-body: //div[@class='postTabs_divs postTabs_curr_div']\r
-strip: //div[@class='ratingblock2']\r
-strip: //p[@id='breadcrumbs']\r
-strip: //div[@style='display: none']\r
-\r
+title: //h1[@class='producttabbed-title']
+body: //div[@class='postTabs_divs postTabs_curr_div']
+strip: //div[@class='ratingblock2']
+strip: //p[@id='breadcrumbs']
+strip: //div[@style='display: none']
+
test_url: http://www.cardboardconnection.com/2012-topps-archives-baseball-cards
\ No newline at end of file
-title: //h2\r
-body: //div[@class='entry']\r
-\r
-prune: no\r
+title: //h2
+body: //div[@class='entry']
+
+prune: no
# otherwise the footnotes are removed
test_url: http://carpeaqua.com/2011/03/27/the-intersection-of-power-and-portability/
\ No newline at end of file
--- /dev/null
+title: //div[contains(@class, 'basicInfo')]//h1
+
+body: //img[@id='chosenPhotoIMG'] | //div[@id='aboutThisVehicleBox']
+
+prune: no
+
+test_url: http://www.cars.com/go/search/detail.jsp?listingId=115364779
\ No newline at end of file
-body: //div[@class='article']\r
-strip: //div[@class='revhistory']\r
-strip: //div[@class='toc']\r
-tidy: no\r
-prune: no\r
+body: //div[@class='article']
+strip: //div[@class='revhistory']
+strip: //div[@class='toc']
+tidy: no
+prune: no
test_url: http://catb.org/~esr/faqs/smart-questions.html
\ No newline at end of file
-title: //div[contains(@class, 'headline')]/h1\r
-author: //h5[contains(@class, 'byline')]\r
-date: substring-after(//h4[contains(@class, 'posted')], 'Posted: ')\r
+title: //div[contains(@class, 'headline')]/h1
+author: //h5[contains(@class, 'byline')]
+date: substring-after(//h4[contains(@class, 'posted')], 'Posted: ')
body: //div[@id="storyboard"]
test_url: http://www.cbc.ca/news/world/story/2012/01/16/cruise-ship-monday.html
\ No newline at end of file
--- /dev/null
+body: //div[contains(@class, 'articleText')]
+date: //div[contains(@class, 'articleDate')]
+author: //a[contains(@id, 'articleDetails_lnkByLine')]
+prune: no
+
+test_url: http://www.cbn.com/cbnnews/world/2013/June/Chilly-G-8-Obama-Putin-Agree-to-Disagree-on-Syria/
+test_url: http://www.cbn.com/cbnnews/world/2013/June/UK-Agency-Accused-of-Hacking-Foreign-Diplomats/
+test_url: http://www.cbn.com/cbnnews/feed/
\ No newline at end of file
-date: //meta[@name="published"]/@content\r
-date: //div[@class="timeLine"]\r
-title: //div[@id='contentBody']//h1\r
-author: //dl[@class="storyBlogByline"]/dd/a\r
-body: //div[@id='storyMediaBox'] | //div[contains(@class, 'storyText')]\r
-\r
-# Content Pruning\r
-strip: //div[@class="scrollingArrows"]\r
-strip: //div[@class="timeLine"]\r
-strip: //dl[@class="storyBlogByline"]\r
-\r
-prune: no\r
-\r
-test_url: http://www.cbsnews.com/8301-201_162-57366361/rescued-americans-dad-proud-of-the-u.s/
\ No newline at end of file
+date: //meta[@name="published"]/@content
+date: //div[@class="timeLine"]
+title: //div[@id='contentBody']//h1
+author: //dl[@class="storyBlogByline"]/dd/a
+body: //div[@id='storyMediaBox'] | //div[contains(@class, 'storyText')]
+
+# Content Pruning
+strip: //div[@class="scrollingArrows"]
+strip: //div[@class="timeLine"]
+strip: //dl[@class="storyBlogByline"]
+strip: //span[@class='image-credit']
+
+prune: no
+
+test_url: http://www.cbsnews.com/8301-201_162-57366361/rescued-americans-dad-proud-of-the-u.s/
--- /dev/null
+body: //div[@class='frame']//img[@class='horizontal'] | //div[@class='content']
+test_url: http://cedarrepublican.com/online_features/gift_ideas/sending-mother-s-day-flowers-how-to-be-sure-they/article_b69af9b8-1f05-5352-8621-16ce007e5623.html
-title: //*[@id='Content']/span[1]\r
-author: substring-after(substring-before(//*[@id='Content']/span[2], ')'), '(')\r
-date: substring-before(substring-after(//*[@id='Content']/span[2], 'Updated: '), 'Counter')\r
-\r
-strip: //*[@id='Content']/span[1]\r
-strip: //*[@id='Content']/span[2]\r
-\r
-body: //*[@id='Content']\r
+title: //*[@id='Content']/span[1]
+author: substring-after(substring-before(//*[@id='Content']/span[2], ')'), '(')
+date: substring-before(substring-after(//*[@id='Content']/span[2], 'Updated: '), 'Counter')
+
+strip: //*[@id='Content']/span[1]
+strip: //*[@id='Content']/span[2]
+
+body: //*[@id='Content']
test_url: http://www.chinamining.org/News/2011-07-22/1311319069d48087.html
\ No newline at end of file
-title: //div[@class='title']\r
-author: //div[@class='author']\r
-prune: no\r
-\r
+title: //div[@class='title']
+author: //div[@class='author']
+prune: no
+
test_url: http://www.chomsky.info/onchomsky/2002----.htm
\ No newline at end of file
--- /dev/null
+title: //header/h1/b[contains(@class, 'title')]
+author: substring-after(//article/header/div, 'By ')
+date: //header/h1/span[contains(@class, 'date')]
+body: //div[@id='main']/article
+strip: //header
+test_url: http://chrisltd.com/blog/2012/03/fix-widows-indesign/
\ No newline at end of file
-title://div[@class='title']\r
-author://div[@class='byline']/b\r
-date:substring-after(//div[@class='byline'], 'posted')\r
-body://div[@id='body']\r
-wrap_in(h2)://span[@class='subhead']\r
-wrap_in(i)://p[@class='bio']\r
-wrap_in(i)://p[@class='copyright']\r
-strip://div[@class='title']\r
-strip://div[@class='deck']\r
-strip://div[@class='byline']\r
-strip://div[@class='copyright']\r
+title://div[@class='title']
+author://div[@class='byline']/b
+date:substring-after(//div[@class='byline'], 'posted')
+body://div[@id='body']
+wrap_in(h2)://span[@class='subhead']
+wrap_in(i)://p[@class='bio']
+wrap_in(i)://p[@class='copyright']
+strip://div[@class='title']
+strip://div[@class='deck']
+strip://div[@class='byline']
+strip://div[@class='copyright']
strip://br
test_url: http://www.christianitytoday.com/ct/2012/aprilweb-only/my-god-forsaken-me.html
\ No newline at end of file
-title: //h1[@class="entry-title"]\r
-author: //*[@class="author vcard fn"]\r
-date: //*[@class="published"]\r
+title: //h1[@class="entry-title"]
+author: //*[@class="author vcard fn"]
+date: //*[@class="published"]
body: //div[(@class = "dd_content_wrap")]
test_url: http://christianpf.com/do-ibuys-lead-to-more-buying/
\ No newline at end of file
-tidy: no\r
-prune: no\r
-date: //article//time[@pubdate]\r
-title: //article/header/h2\r
+tidy: no
+prune: no
+date: //article//time[@pubdate]
+title: //article/header/h2
body: //article
test_url: http://www.christies.com/LotFinder/custom/lot_details_MultiLanguage.aspx?from=salesummary&intObjectID=5556662&sid=e536ed1a-b763-41c4-afcf-c94815ec6eee&LID=3
\ No newline at end of file
-body: //pre[@id='cx-desc-text']\r
-body: //div[contains(@class, 'overview-tab-right-bar-info')]\r
-title: //h1[contains(@class, 'detail-dialog-title')]\r
-tidy: no\r
-prune: no\r
-replace_string(<noscript>): <div>\r
-replace_string(</noscript>): </div>\r
+body: //pre[@id='cx-desc-text']
+body: //div[contains(@class, 'overview-tab-right-bar-info')]
+title: //h1[contains(@class, 'detail-dialog-title')]
+tidy: no
+prune: no
+replace_string(<noscript>): <div>
+replace_string(</noscript>): </div>
test_url: https://chrome.google.com/webstore/detail/pnaiinchjaonopoejhknmgjingcnaloc
\ No newline at end of file
-title: //h1[contains(@class, "entry-title")]\r
-author: //p[contains(@class, "byline")]\r
-\r
-# blog articles (chronicle.com/blogs/*)\r
-body: //div[contains(@class, "abstract")]\r
-date: //p[contains(@class, "time")]\r
-\r
-# all (?) other articles\r
-body: //div[@id="article-body"]\r
-date: //p[contains(@class, "dateline")]\r
-\r
-# remove sidebars containing images (I assume this is desired for Instapaper)\r
-strip: //div[@id="related"]\r
-strip: //div[contains(@class, "image")]\r
-\r
+title: //h1[contains(@class, "entry-title")]
+author: //p[contains(@class, "byline")]
+
+# blog articles (chronicle.com/blogs/*)
+body: //div[contains(@class, "abstract")]
+date: //p[contains(@class, "time")]
+
+# all (?) other articles
+body: //div[@id="article-body"]
+date: //p[contains(@class, "dateline")]
+
+# remove sidebars containing images (I assume this is desired for Instapaper)
+strip: //div[@id="related"]
+strip: //div[contains(@class, "image")]
+
# note that if you're not a Chronicle subscriber (personally or institutionally), you'll only see the first couple of paragraphs of the article, and Instapaper will display that with some crap above and below. thank goodness for that bookmarklet
test_url: http://chronicle.com/article/In-a-Land-of-Second-Chances/128375/
\ No newline at end of file
--- /dev/null
+body://div[contains(@class, 'entry-content')]
+date://h2[contains(@class, 'date-header')]
+title://h3[contains(@class, 'post-title')]
+test_url: http://www.ciaosamin.com/2013/04/how-this-happened.html
\ No newline at end of file
-# fforst@...\r
-\r
-# Use link to print article for single page view\r
-single_page_link: //a[@class="print"]\r
-\r
-# set body\r
-tidy: no\r
-body: //div[@class='artikel-content']\r
-\r
-# strip title and subtitle since we got it already\r
-strip: //div[@class='issue']\r
-strip: //div[@class='artikel-content']/h2\r
-\r
-# some authors are known and have a link, others don't\r
-author: //a[contains(@href, 'autor?')]\r
-\r
-#date\r
-date: //span[@class='article-date']\r
-\r
-# Strip author since we got him\r
-strip_id_or_class: author\r
-\r
-#strip captions\r
-strip_id_or_class: field-name-field-image-credit\r
-strip_id_or_class: field-name-field-article-image-subtitle\r
-\r
-# remove community functions\r
-strip: //div[@class='meta']\r
-strip: //div[@id='comments']\r
-\r
-# remove "continue on the next page" text\r
+# fforst@...
+
+# Use link to print article for single page view
+single_page_link: //a[@class="print"]
+
+# set body
+tidy: no
+body: //div[@class='artikel-content']
+
+# strip title and subtitle since we got it already
+strip: //div[@class='issue']
+strip: //div[@class='artikel-content']/h2
+
+# some authors are known and have a link, others don't
+author: //a[contains(@href, 'autor?')]
+
+#date
+date: //span[@class='article-date']
+
+# Strip author since we got him
+strip_id_or_class: author
+
+#strip captions
+strip_id_or_class: field-name-field-image-credit
+strip_id_or_class: field-name-field-article-image-subtitle
+
+# remove community functions
+strip: //div[@class='meta']
+strip: //div[@id='comments']
+
+# remove "continue on the next page" text
strip: //p[text()="[SEITE]"]
test_url: http://www.cicero.de/weltbuehne/ihre-wut-ist-global-krise-jugend-revolten-aufstaende-zelte/43049
\ No newline at end of file
-body: //*[(@id = "articlebody")]\r
-strip_id_or_class: rotulo\r
+body: //*[(@id = "articlebody")]
+strip_id_or_class: rotulo
test_url: http://ciperchile.cl/2011/04/18/las-operaciones-secretas-que-ordenaba-karadima-para-aniquilar-a-su-competencia/
\ No newline at end of file
-body: //p[@class='subhead' or @class='attribution'] | //div[@class='article-body']\r
-prune: no\r
-\r
-single_page_link: //li[@class='print']/a\r
-\r
+body: //p[@class='subhead' or @class='attribution'] | //div[@class='article-body']
+prune: no
+
+single_page_link: //li[@class='print']/a
+
test_url: http://www.cjr.org/behind_the_news/from_breaking_news_to_baseless.php
\ No newline at end of file
--- /dev/null
+date: //div[@id='content']//p[contains(@class, 'date')]/span
+author: substring-after(//div[@id='content']//div[contains(@class, 'over-under-bars')]/p[last()]/text(), 'Posted by ')
+body: //div[@id='content']//div[@class='pane-content']
+strip_id_or_class: trackback-url
+strip_id_or_class: over-under-bars
+test_url: http://www.classyllama.com/content/layout-caching
\ No newline at end of file
-title://div[@class="entrytitle"]/a\r
-author:substring-after(substring-before(//div[@class="entrytime"], "|"), "By ")\r
-date:substring-before(substring-after(//div[@class="entrytime"], "|"), "- Posted")\r
-body://div[@class="entrybody"]\r
+title://div[@class="entrytitle"]/a
+author:substring-after(substring-before(//div[@class="entrytime"], "|"), "By ")
+date:substring-before(substring-after(//div[@class="entrytime"], "|"), "- Posted")
+body://div[@class="entrybody"]
strip://div[@class="entrybody"]//p[@class="singleinfo"]
test_url: http://clientk.com/2011/12/19/the-impact-of-more/
\ No newline at end of file
-title: //h1\r
-author: //a[@class='auteur']\r
-body: //div[@class='editorial']\r
-next_page_link: //a[contains(text(),'Page suivante')]\r
-strip: //a[contains(text(),'Page suivante')]\r
-strip: //a[contains(text(),'Page précédente')]\r
-strip_id_or_class: slideshow\r
-\r
-prune: no\r
-\r
+title: //h1
+author: //a[@class='auteur']
+body: //div[@class='editorial']
+next_page_link: //a[contains(text(),'Page suivante')]
+strip: //a[contains(text(),'Page suivante')]
+strip: //a[contains(text(),'Page précédente')]
+strip_id_or_class: slideshow
+
+prune: no
+
test_url: http://www.clubic.com/carte-graphique/carte-graphique-amd/radeon-hd-7770/article-478936-1-radeon-hd-7750-7770.html
\ No newline at end of file
-body: //div[contains(@id,'article-body')]\r
-strip://div[contains(@id,'disqus_count_block')]\r
-strip://div[contains(@id,'col-left')]\r
-strip://div[contains(@id,'col-right')]\r
+body: //div[contains(@id,'article-body')]
+strip://div[contains(@id,'disqus_count_block')]
+strip://div[contains(@id,'col-left')]
+strip://div[contains(@id,'col-right')]
test_url: http://www.cmswire.com/cms/customer-experience/for-apps-and-appstores-the-singularity-is-approaching-014888.php
\ No newline at end of file
--- /dev/null
+title: //h2[@class="posttitle"]
+body: //div[@class="postbody"]
+prune: no
+
+test_url: http://cn.engadget.com/2013/06/29/google-play-music-all-access/
--- /dev/null
+title: //div[@id='maincontent']//h1
+body: //div[@id='resizeableText']
+
+test_url: http://cn.reuters.com/article/CNAnalysesNews/idCNKBS0FF0NM20140710
+test_url: http://cn.reuters.feedsportal.com/CNAnalysesNews
\ No newline at end of file
-title: //meta[@property="og:title"]/@content\r
-body: //div[contains(@class, 'postBody')]\r
-date: //div[@id='nameAndTime']/time\r
-author: //div[@id='nameAndTime']/span[@class='author']\r
-\r
-strip_id_or_class: image-credit\r
-strip_id_or_class: noAutolink\r
-strip_id_or_class: related\r
-\r
-prune: no\r
-tidy: no\r
-\r
-# early end\r
-replace_string(Download today's podcast</a>): Download today's podcast</a></div></body></html>\r
-\r
+title: //meta[@property="og:title"]/@content
+body: //div[contains(@class, 'postBody')]
+date: //div[@id='nameAndTime']/time
+author: //div[@id='nameAndTime']/span[@class='author']
+
+strip_id_or_class: image-credit
+strip_id_or_class: noAutolink
+strip_id_or_class: related
+
+prune: no
+tidy: no
+
+# early end
+replace_string(Download today's podcast</a>): Download today's podcast</a></div></body></html>
+
test_url: http://www.cnet.com/8301-13952_1-57367607-81/the-404-981-where-the-world-is-a-vampire-podcast/
\ No newline at end of file
-title: //div[@class="cnn_storyarea"]/h1\r
-author: //div[@class="cnnByline"]/strong\r
-date: substring-after(//div[@class="cnn_strytmstmp"], 'Sun')\r
-date: substring-after(//div[@class="cnn_strytmstmp"], 'Mon')\r
-date: substring-after(//div[@class="cnn_strytmstmp"], 'Tue')\r
-date: substring-after(//div[@class="cnn_strytmstmp"], 'Wed')\r
-date: substring-after(//div[@class="cnn_strytmstmp"], 'Thu')\r
-date: substring-after(//div[@class="cnn_strytmstmp"], 'Fri')\r
-date: substring-after(//div[@class="cnn_strytmstmp"], 'Sat')\r
-strip: //div[@class="cnn_storyarea"]/h1\r
-strip_id_or_class: cnnByline\r
-strip_id_or_class: cnn_strytmstmp\r
-strip_id_or_class: cnn_strycaptiontxt\r
-strip_id_or_class: cnn_strybtntoolsbttm\r
-strip_id_or_class: cnn_strybtntools\r
-strip_id_or_class: cnn_strybtmcntnt\r
-strip_id_or_class: cnn_containerwht\r
-strip_id_or_class: cnn_stryathrtmp\r
-test_url: http://www.cnn.com/2012/05/13/us/new-york-police-policy/index.html?eref=rss_topstories
\ No newline at end of file
+body: //div[@id='cnnContentContainer']//div[contains(@class, 'cnn_strycntntlft')]
+title: //div[@class="cnn_storyarea"]/h1
+author: //div[@class="cnnByline"]/strong
+date: substring-after(//div[@class="cnn_strytmstmp"], 'Sun')
+date: substring-after(//div[@class="cnn_strytmstmp"], 'Mon')
+date: substring-after(//div[@class="cnn_strytmstmp"], 'Tue')
+date: substring-after(//div[@class="cnn_strytmstmp"], 'Wed')
+date: substring-after(//div[@class="cnn_strytmstmp"], 'Thu')
+date: substring-after(//div[@class="cnn_strytmstmp"], 'Fri')
+date: substring-after(//div[@class="cnn_strytmstmp"], 'Sat')
+strip: //div[@class="cnn_storyarea"]/h1
+strip_id_or_class: cnnByline
+strip_id_or_class: cnn_strytmstmp
+strip_id_or_class: cnn_strycaptiontxt
+strip_id_or_class: cnn_strybtntoolsbttm
+strip_id_or_class: cnn_strybtntools
+strip_id_or_class: cnn_strybtmcntnt
+strip_id_or_class: sharebar
+#strip_id_or_class: cnn_containerwht
+strip_id_or_class: cnn_stryathrtmp
+replace_string(<a name="em0"></a>): <!-- a name -->
+test_url: http://www.cnn.com/2012/05/13/us/new-york-police-policy/index.html?eref=rss_topstories
+test_url: http://rss.cnn.com/rss/edition.rss
\ No newline at end of file
-# main sportsillustrated.com articles\r
-\r
-body: //div[@id="cnnStoryContent"]\r
-title: //div[@id="cnnStoryHeadline"]//h1\r
-author: //div[@id="cnnSubBanner"]//strong\r
-date: substring-after(//div[@id="cnnTimeStamp"], "Updated: ")\r
-date: substring-after(//div[@id="cnnTimeStamp"], "Posted: ")\r
-\r
-# kill ugly font buttons\r
-strip: //div[@id="cnnSCFontButtons"]\r
-\r
-# kill misc filler videos & etc\r
-strip: //div[@class="cnnDivideContent"]\r
-strip: //*[@class="cnnTMbox"]\r
-\r
-# si vault articles\r
-# -------------\r
-body: //div[@class="siv_artPara"]\r
-title: //div[@class="siv_artHeader"]//h1\r
-author: //div[@class="byline"]\r
-date: //div[@class="date"]\r
-\r
-next_page_link: //div[@id='cnnStoryContinue']/a\r
-strip_id_or_class: cnnstorypagination\r
-\r
+# main sportsillustrated.com articles
+
+body: //div[@id="cnnStoryContent"]
+title: //div[@id="cnnStoryHeadline"]//h1
+author: //div[@id="cnnSubBanner"]//strong
+date: substring-after(//div[@id="cnnTimeStamp"], "Updated: ")
+date: substring-after(//div[@id="cnnTimeStamp"], "Posted: ")
+
+# kill ugly font buttons
+strip: //div[@id="cnnSCFontButtons"]
+
+# kill misc filler videos, etc.
+strip: //div[@class="cnnDivideContent"]
+strip: //*[@class="cnnTMbox"]
+
+# si vault articles
+# -------------
+body: //div[@class="siv_artPara"]
+title: //div[@class="siv_artHeader"]//h1
+author: //div[@class="byline"]
+date: //div[@class="date"]
+
+next_page_link: //div[@id='cnnStoryContinue']/a
+strip_id_or_class: cnnstorypagination
+
test_url: http://cnnsi.com/2012/writers/peter_king/01/08/wild.card.round/index.html
\ No newline at end of file
-body: //div[@id='content']\r
-title: //div[@id='page_header']/h1\r
-\r
-strip_id_or_class: 'lineno'\r
-strip_id_or_class: 'block-toolbar-button'\r
-strip_id_or_class: 'recipe_score'\r
-strip: //div[@id='recipe_tools']\r
-strip: //div[@id='addcomment']\r
-\r
+body: //div[@id='content']
+title: //div[@id='page_header']/h1
+
+strip_id_or_class: 'lineno'
+strip_id_or_class: 'block-toolbar-button'
+strip_id_or_class: 'recipe_score'
+strip: //div[@id='recipe_tools']
+strip: //div[@id='addcomment']
+
test_url: http://code.activestate.com/recipes/500261-named-tuples/
\ No newline at end of file
--- /dev/null
+body: //div[@id='content']
-body: //div[@id="gc-pagecontent"]\r
-strip: //a[@class="backtotop"]\r
-prune: no\r
-\r
+body: //div[@id="gc-pagecontent"]
+strip: //a[@class="backtotop"]
+prune: no
+
test_url: http://code.google.com/apis/analytics/docs/tracking/gaTrackingEcommerce.html
\ No newline at end of file
--- /dev/null
+body: //div[@id="contentdiv"]
+date: //span[@class="date"]
+test_url: http://www.codeproject.com/Articles/499902/Profiling-Entity-Framework-5-in-code
\ No newline at end of file
-body: //div[@class='blogbody']\r
-strip: //h3[@class='title']\r
-date: //h2[@class='date']\r
-#Should Atwood just be a literal?\r
-author: substring-before( substring-after(//div[@class='posted'], 'y'), 'V')\r
-\r
-# tim.kingman@... 2011-07-26\r
-# Prune:no to retain all-link ULs that are part of the body content like\r
-# http://www.codinghorror.com/blog/2011/07/building-a-pc-part-vii-rebooting.html\r
-# Then explicitly strip the "Posted By" and prev/next links that Prune:yes would have removed.\r
-\r
-prune: no\r
-strip: //div[@class='posted']/following-sibling::*\r
+body: //div[@class='blogbody']
+strip: //h3[@class='title']
+date: //h2[@class='date']
+#Should Atwood just be a literal?
+author: substring-before( substring-after(//div[@class='posted'], 'y'), 'V')
+
+# tim.kingman@... 2011-07-26
+# Prune:no to retain all-link ULs that are part of the body content like
+# http://www.codinghorror.com/blog/2011/07/building-a-pc-part-vii-rebooting.html
+# Then explicitly strip the "Posted By" and prev/next links that Prune:yes would have removed.
+
+prune: no
+strip: //div[@class='posted']/following-sibling::*
strip: //div[@class='posted']
test_url: http://www.codinghorror.com/blog/2011/07/building-a-pc-part-vii-rebooting.html
\ No newline at end of file
-title: //h1[@class='title']\r
-author: //p[@class='byline']/a[1]\r
-date: //*[@class='date']\r
-\r
-body: //div[@class='article_body']\r
-strip: //p[@class='ca_intro']\r
-strip: //div[@id='action_bar']\r
-strip: //div[@class='below_content']\r
-strip: //div[@id='announcement']\r
-strip: //div[@id='leftovers']\r
-strip: //div[@class='form']\r
-strip: //div[@id='email_overlay']\r
+title: //h1[@class='title']
+author: //p[@class='byline']/a[1]
+date: //*[@class='date']
+
+body: //div[@class='article_body']
+strip: //p[@class='ca_intro']
+strip: //div[@id='action_bar']
+strip: //div[@class='below_content']
+strip: //div[@id='announcement']
+strip: //div[@id='leftovers']
+strip: //div[@class='form']
+strip: //div[@id='email_overlay']
strip: //a[@class='close']
test_url: http://www.collegehumor.com/article/6599562/how-it-happened-the-necktie
\ No newline at end of file
-body: //div[@id="center"]//div[@class="node"]\r
-title: //div[@id="center"]//h2\r
-author: substring-after(//div[@id="center"]//div[@class="node"]//span[@class="submitted"], "—")\r
-date: substring-before(//div[@id="center"]//div[@class="node"]//span[@class="submitted"], "—")\r
-strip: //div[@id="center"]//h2[1]\r
-strip: //span[@class="submitted"][1]\r
+body: //div[@id="center"]//div[@class="node"]
+title: //div[@id="center"]//h2
+author: substring-after(//div[@id="center"]//div[@class="node"]//span[@class="submitted"], "—")
+date: substring-before(//div[@id="center"]//div[@class="node"]//span[@class="submitted"], "—")
+strip: //div[@id="center"]//h2[1]
+strip: //span[@class="submitted"][1]
move_into(//div[@class="node"])://div[@class="breadcrumb"]
test_url: http://community.service-now.com/blog/lawrenceeng/seasons-greetings-servicenow-team
\ No newline at end of file
-strip_id_or_class:column-3\r
-strip_id_or_class:portlet-boundary\r
-strip_id_or_class:banner\r
+strip_id_or_class:column-3
+strip_id_or_class:portlet-boundary
+strip_id_or_class:banner
test_url: http://www.computer.org/portal/web/buildyourcareer/careerwatch/jt19
\ No newline at end of file
-title://h1\r
-\r
-author://div[@id="news-meta"]/a\r
-\r
-body://*[@id="main"]/div[1]\r
-\r
-strip://*[@id="main"]/div[2]\r
-strip://*[@id="main"]/div[3]\r
-strip://*[@id="page"]//footer\r
-\r
-#date: didn't manage to parse it\r
-\r
-#Images have to be stripped because the page does it with overlay\r
-strip://img\r
-\r
-#figures are not displayed in instapaper...\r
-strip://figure | //figcaption\r
+title://h1
+
+author://div[@id="news-meta"]/a
+
+body://*[@id="main"]/div[1]
+
+strip://*[@id="main"]/div[2]
+strip://*[@id="main"]/div[3]
+strip://*[@id="page"]//footer
+
+#date: didn't manage to parse it
+
+#Images have to be stripped because the page does it with overlay
+strip://img
+
+#figures are not displayed in instapaper...
+strip://figure | //figcaption
test_url: http://www.computerbase.de/news/2012-06/verbraucherzentrale-mahnt-blizzard-fuer-diablo-3-ab/
\ No newline at end of file
-title: //meta[@name='headline']/@content\r
-date: //meta[@name='date']/@content\r
-author: //meta[@name='author']/@content\r
-body: //div[contains(@class, 'article')]\r
-body://div[@id="article_body"]\r
-\r
-strip_id_or_class: banner\r
-strip: //noscript\r
-strip: //div[@style='width:1px;height:130px;float:right;']\r
-strip: //div[@class='storyby']\r
-strip_image_src: twitter_icon\r
-strip_image_src: rss_bug\r
-\r
-tidy: no\r
-prune: no\r
-\r
-next_page_link://div[@id="next_page"]/a\r
-\r
-single_page_link: concat('http://www.computerworld.com/s/article/print/', substring-after(//link[@rel='canonical']/@href, '/s/article/'))\r
-\r
-test_url: http://www.computerworld.com/s/article/9224348/Apple_s_new_OS_X_tightens_screws_on_some_malware\r
+title: //meta[@name='headline']/@content
+date: //meta[@name='date']/@content
+author: //meta[@name='author']/@content
+body: //div[contains(@class, 'article')]
+body://div[@id="article_body"]
+
+strip_id_or_class: banner
+strip: //noscript
+strip: //div[@style='width:1px;height:130px;float:right;']
+strip: //div[@class='storyby']
+strip_image_src: twitter_icon
+strip_image_src: rss_bug
+
+tidy: no
+prune: no
+
+next_page_link://div[@id="next_page"]/a
+
+single_page_link: concat('http://www.computerworld.com/s/article/print/', substring-after(//link[@rel='canonical']/@href, '/s/article/'))
+
+test_url: http://www.computerworld.com/s/article/9224348/Apple_s_new_OS_X_tightens_screws_on_some_malware
test_url: http://www.computerworld.com/s/article/9227679/Windows_8_Release_Preview_Updated_but_still_uneasy
\ No newline at end of file
-strip: //div[contains(@class, 'articleAdtechAd')]\r
-title: //div[@id='article']/h1\r
-title: //div[contains(@class, 'article')]/h1\r
-body: //div[@id='articleText']\r
+strip: //div[contains(@class, 'articleAdtechAd')]
+title: //div[@id='article']/h1
+title: //div[contains(@class, 'article')]/h1
+body: //div[@id='articleText']
test_url: http://www.computerworld.dk/art/56748/test-din-viden-med-computerworlds-store-sommerquiz?a=fp_1&i=0
\ No newline at end of file
-# get author from string like "Posted by <author> on <date>"\r
-author: substring-before(substring-after(//div[@class='post']/p[@class='post-meta'], 'by'), 'on')\r
-\r
-# get date from string like "Posted by <author> on <date>"\r
-date: substring-after(//div[@class='post']/p[@class='post-meta'], 'on')\r
-\r
-# this keeps thumbnail images\r
+# get author from string like "Posted by <author> on <date>"
+author: substring-before(substring-after(//div[@class='post']/p[@class='post-meta'], 'by'), 'on')
+
+# get date from string like "Posted by <author> on <date>"
+date: substring-after(//div[@class='post']/p[@class='post-meta'], 'on')
+
+# this keeps thumbnail images
prune: no
test_url: http://www.contemporist.com/2011/11/02/landing-200-lamp-by-kim-hyunjoo
\ No newline at end of file
-title: //div[@class='article_header']/h1\r
-body: //div[@class='article_header']/p | //div[@class='article_body']\r
-strip_id_or_class: share_this\r
-strip_id_or_class: sociable\r
-prune: no\r
-\r
+title: //div[@class='article_header']/h1
+body: //div[@class='article_header']/p | //div[@class='article_body']
+strip_id_or_class: share_this
+strip_id_or_class: sociable
+prune: no
+
test_url: http://conversaciones.nokia.com/2011/10/07/cinco-atajos-en-el-nokia-n8/
\ No newline at end of file
--- /dev/null
+body: //*[contains(@class,'body')]
+date: //abbr[@class='published']
+
+test_url: http://www.cooper.com/journal/2012/08/2-weeks-left-to-win-your-way-to-the-woodstock-of-ux-coopers-ux-boot-camp.html/
\ No newline at end of file
-body: //div[@id="permalink"]/div[@class="post"]\r
-\r
-strip: //div[@id='backArrow']\r
-strip: //div[@id='fwdArrow']\r
-strip: //div[@class="post-title"]\r
+body: //div[@id="permalink"]/div[@class="post"]
+
+strip: //div[@id='backArrow']
+strip: //div[@id='fwdArrow']
+strip: //div[@class="post-title"]
strip: //div[@class="sharing"]
test_url: http://www.core77.com/blog/columns/why_design_education_must_change_17993.asp
\ No newline at end of file
-title: //div[@class='main']//h1[contains(@class, 'article-title')]\r
-author: //div[@class='mainauthorstyle']\r
-body: //div[@class='main']//div[@class='main-text']\r
-strip: //td[@width='140']\r
-\r
+title: //div[@class='main']//h1[contains(@class, 'article-title')]
+author: //div[@class='mainauthorstyle']
+body: //div[@class='main']//div[@class='main-text']
+strip: //td[@width='140']
+
test_url: http://www.counterpunch.org/johnstone05172011.html
\ No newline at end of file
-title://h2\r
+title://h2
body://div[contains(@class, 'entrytext')]
test_url: http://www.crazybutable.com/weblog/archives/2010/07/01/house-ideas-that-worked/
\ No newline at end of file
-body: //div[@class="readingtext"]\r
+body: //div[@class="readingtext"]
title: substring-after(substring-after(//title, ':'), ':')
test_url: http://www.crimethinc.com/texts/recentfeatures/nightmares.php
\ No newline at end of file
-author: //p[contains(@class,'author')]/a\r
+author: //p[contains(@class,'author')]/a
date: //div[contains(@class,'date')]
test_url: http://www.crn.de/netzwerke-tk/artikel-93103.html
\ No newline at end of file
-title: //h1[contains(@class, 'head')]\r
-\r
-# standard page\r
-body: //div[@id='mainColumn']//div[contains(@class, 'list-article-full')]\r
-# print page\r
-body: //div[@id='mainColumn']\r
-\r
-author: //a[contains(@class, 'ui-author')]\r
-\r
-single_page_link: //div[@class='storyToolbar']//a[contains(@href, '/print/')]\r
-\r
-strip_id_or_class: storyToolbar\r
-strip_id_or_class: promotion-tag\r
-\r
-tidy: no\r
-prune: no\r
+title: //h1[contains(@class, 'head')]
+
+# standard page
+body: //div[@id='mainColumn']//div[contains(@class, 'list-article-full')]
+# print page
+body: //div[@id='mainColumn']
+
+author: //a[contains(@class, 'ui-author')]
+
+single_page_link: //div[@class='storyToolbar']//a[contains(@href, '/print/')]
+
+strip_id_or_class: storyToolbar
+strip_id_or_class: promotion-tag
+
+tidy: no
+prune: no
test_url: www.csmonitor.com/World/Middle-East/2011/1108/Imminent-Iran-nuclear-threat-A-timeline-of-warnings-since-1979/Earliest-warnings-1979-84
\ No newline at end of file
-title: //div[@id='csn_blogST_headline']/h1\r
-\r
-body: //div[@id='csn_blogST_main']\r
-strip_id_or_class: ipfootnotes\r
-strip: //div[@id='csn_blogST_main']/p[1]/img\r
+title: //div[@id='csn_blogST_headline']/h1
+
+body: //div[@id='csn_blogST_main']
+strip_id_or_class: ipfootnotes
+strip: //div[@id='csn_blogST_main']/p[1]/img
strip: //div[@id='csn_blogST_sidebar']
test_url: http://www.csnbayarea.com/blog/giants-talk/post/-?blog%2Fgiants-talk%2Fpost%2F-=&blockID=578902&feedID=5987
\ No newline at end of file
-# author's name is not isolated as a tag.... ugh\r
-convert_double_br_tags: yes\r
-body: //csn_blogST_main\r
-\r
-#junk above and around the article\r
-strip: /html/body/div[4]/div[3]/div/div/div/section/div/div/div/div/div/div\r
-strip: /html/body/div[4]/header\r
-strip_id_or_class: article-right-sidebar\r
-strip_id_or_class: rsn-gigya-sharebar-container\r
-strip_id_or_class: article-bottom\r
-strip_id_or_class: hider\r
-strip_id_or_class: footer\r
-strip_id_or_class: masthead\r
-strip_id_or_class: block-menu-menu-rsn-login-or-register\r
-strip_id_or_class: block-menu-menu-header-links\r
-strip_id_or_class: block-rsn-follow-bar-follow-bar\r
-strip_id_or_class: block-rsn-weather-rsn-weather-scoreboard\r
-strip_id_or_class: logo\r
-strip_id_or_class: element-invisible\r
-strip_id_or_class: site-name\r
-strip: //div[contains(@style, 'none')]\r
+# author's name is not isolated as a tag.... ugh
+convert_double_br_tags: yes
+body: //csn_blogST_main
+
+#junk above and around the article
+strip: /html/body/div[4]/div[3]/div/div/div/section/div/div/div/div/div/div
+strip: /html/body/div[4]/header
+strip_id_or_class: article-right-sidebar
+strip_id_or_class: rsn-gigya-sharebar-container
+strip_id_or_class: article-bottom
+strip_id_or_class: hider
+strip_id_or_class: footer
+strip_id_or_class: masthead
+strip_id_or_class: block-menu-menu-rsn-login-or-register
+strip_id_or_class: block-menu-menu-header-links
+strip_id_or_class: block-rsn-follow-bar-follow-bar
+strip_id_or_class: block-rsn-weather-rsn-weather-scoreboard
+strip_id_or_class: logo
+strip_id_or_class: element-invisible
+strip_id_or_class: site-name
+strip: //div[contains(@style, 'none')]
test_url: http://www.csnphilly.com/eagles/can-stoutland-save-danny-watkins-career
\ No newline at end of file
--- /dev/null
+title://article[contains(@id, "post-")]/h1
+date://article[contains(@id, "post-")]/p[@class="time"]/time
+body://article[contains(@id, "post-")]
+strip://article[contains(@id, "post-")]/p[@class="time"]/time
+prune:yes
+test_url: http://css-tricks.com/off-canvas-menu-with-css-target/
\ No newline at end of file
--- /dev/null
+author://span[contains(@class,'reporter')]
+
+date://span[contains(@class,'date')]
+
+body://div[contains(@class,'mainContaner')]
+
+strip://div[contains(@class,'mainHeaer')]
+strip://div[contains(@class,'keyW')]
+strip://div[contains(@class,'wonderful')]
+strip://div[contains(@class,'pages')]
+strip://div[contains(@class,'Topics TopicsW3')]
+
+next_page_link://li[@class='pageNext']/a[contains(.,'下一頁')]
+test_url: http://www.cw.com.tw/article/article.action?id=5032848
\ No newline at end of file
-single_page_link: //a\r
-tidy: no\r
-prune: no\r
+single_page_link: //a
+tidy: no
+prune: no
test_url: da.feedsportal.com/c/585/f/413794/s/17037b5a/l/0L0Stelegraaf0Bnl0Cbinnenland0C10A2757860C0I0IKlacht0Itegen0Idr0B0IFrank0Iniet0I0Eontvankelijk0I0I0Bhtml0Dcid0Frss/ia1.htm
\ No newline at end of file
--- /dev/null
+title: //span[@class = 'overskriftEkstrastor']
+author: //em/a
+
+test_url: http://dagogtid.no/nyhet.cfm?nyhetid=2414
\ No newline at end of file
-tidy: no\r
-body: //article\r
+tidy: no
+body: //article
test_url: http://www.dailydot.com/entertainment/tumblr-christopher-price-topherchris/
\ No newline at end of file
-body: //div[@id='article-1']//div[contains(@class, 'article-body')]\r
-title: //div[@class='meta']//a[@id='titleHref']\r
-date: //div[@class='meta']//p[@class='date']\r
-\r
-strip_id_or_class: invisible\r
-strip_id_or_class: divider-doodle\r
-\r
-prune: no\r
-\r
-test_url: http://www.dailykos.com/story/2012/01/26/1058790/-Newt-Gingrichs-campaign-admits-he-lied-during-debate-about-ABC-News-interview-with-his ex-wife
\ No newline at end of file
+body: //div[@id='article-1']//div[contains(@class, 'article-body')]
+title: //div[@class='meta']//a[@id='titleHref']
+date: //div[@class='meta']//p[@class='date']
+
+strip_id_or_class: invisible
+strip_id_or_class: divider-doodle
+
+prune: no
+
+test_url: http://www.dailykos.com/story/2012/01/26/1058790/-Newt-Gingrich-s-campaign-admits-he-lied-during-debate-about-ABC-News-interview-with-his-ex-wife
-body: //div[@id='js-article-text']\r
-strip: //div[@class='explore-links']\r
-strip: //div[@id='js-article-text']/br[position()=1]\r
-strip_id_or_class: print-or-mail-links\r
-strip_id_or_class: shareArticles\r
-strip_id_or_class: googleAds\r
-strip_id_or_class: digg-button\r
-strip_id_or_class: article-icon-links-container\r
-strip_id_or_class: clickToEnlarge\r
-tidy: no\r
-\r
+body: //div[@id='js-article-text']
+strip: //div[@class='explore-links']
+strip: //div[@id='js-article-text']/br[position()=1]
+strip_id_or_class: print-or-mail-links
+strip_id_or_class: shareArticles
+strip_id_or_class: googleAds
+strip_id_or_class: digg-button
+strip_id_or_class: article-icon-links-container
+strip_id_or_class: clickToEnlarge
+tidy: no
+
test_url: http://www.dailymail.co.uk/news/article-1375423/Royal-wedding-Texan-billionaire-Joe-Albritton-invited-Prince-Charles.html
\ No newline at end of file
--- /dev/null
+title: //div[@class='ec-blog-headline']
+body: //*[@id="divDetails"]
+date: //*[@id="ctl00_ContentPlaceHolder1_tdDate"]
+author: //*[@id="ctl00_ContentPlaceHolder1_anchorAuthor"]/a
+autodetect_next_page: no
+test_url: http://dailystar.com.lb/Opinion/Columnist/2012/Oct-10/190803-americas-new-modesty-in-the-mideast.ashx#axzz2928JP5xE
\ No newline at end of file
--- /dev/null
+tidy: no
+prune: no
+date: //article//time[@pubdate]
+title: //article/h1//span[contains(@class, 'entry-title')]
+body: //article/div[contains(@class, 'entry-content')]
+test_url: http://danleech.com/post/36822126876/simple-icons
\ No newline at end of file
-autodetect_next_page: no\r
-tidy: no\r
-prune: no\r
+autodetect_next_page: no
+tidy: no
+prune: no
body: //div[@class='NoOverflow']
test_url: http://www.dansdata.com/gz129.htm
\ No newline at end of file
--- /dev/null
+title: //h1[contains(@class, 'fon31 mt2')]
+body: //h2[contains(@class, 'fon33 mt1')] | //div[contains(@class, 'fon34 mt3')]
+
+prune: no
+
+test_url: http://dantri.com.vn/su-kien/chang-trai-mot-minh-dap-xe-vuot-450km-de-vieng-mo-dai-tuong-869763.htm
+test_url: http://dantri.com.vn/trangchu.rss
\ No newline at end of file
-title: //div[@class="article"]/h1\r
-author: //div[@id="Sidebar"]/p/strong\r
-date: //h6[@class="dateline"]\r
-body: //div[@class="article"]\r
-strip: //h6[@class="dateline"]\r
-strip: //div[@class="article"]/h1\r
+title: //div[@class="article"]/h1
+author: //div[@id="Sidebar"]/p/strong
+date: //h6[@class="dateline"]
+body: //div[@class="article"]
+strip: //h6[@class="dateline"]
+strip: //div[@class="article"]/h1
test_url: http://daringfireball.net/2011/10/apps_are_the_new_channels
\ No newline at end of file
-body: //div[@id="article"]\r
-date: //p[@class="date"]\r
+body: //div[@id="article"]
+date: //p[@class="date"]
author: //p[@class="byline"]
test_url: http://www.datanami.com/datanami/2011-12-07/new_path_for_sap:_in_memory_computing,_predictive_analysis_converge.html?featured=top
\ No newline at end of file
-title: (//article//h2)[1]\r
-body: //article[contains(@class, 'post')]\r
-date: //time[@id='top_time']/@datetime\r
-\r
-prune: no\r
-tidy: no\r
-\r
+title: (//article//h2)[1]
+body: //article[contains(@class, 'post')]
+date: //time[@id='top_time']/@datetime
+
+prune: no
+tidy: no
+
test_url: http://dcurt.is/predictions-txt
\ No newline at end of file
--- /dev/null
+title: //article/h1
+author: //hgroup/h3/a
+date: //time
+body: //article
+strip: //aside
+footnotes: yes
+prune: no
+tidy: no
+test_url: https://defomicron.net/2012/09/ios-6/
\ No newline at end of file
-strip_id_or_class: banner\r
-strip_id_or_class: gamma\r
+strip_id_or_class: banner
+strip_id_or_class: gamma
strip_id_or_class: module-list
test_url: http://delong.typepad.com/sdj/2011/02/in-which-suresh-naidu-visits-the-new-jerusalem.html
\ No newline at end of file
--- /dev/null
+body: //div[contains(@class, 'blog_body')]
+
+prune: no
+
+test_url: http://www.democracynow.org/blog/2014/1/9/the_fbi_the_nsa_and_a
\ No newline at end of file
-title: //div[@id='artikelHeader']/h1\r
-author: //span[@class='author']\r
-date: //span[@class='date']\r
-body: //div[@class='copytext']\r
-strip: //ul[@class='lookupLinksArtikel']\r
-\r
-strip: //div[@id='pageTop']\r
-strip: //div[@id='toolbar']\r
-strip: //div[@id='articleTools']\r
-strip: //div[@id='weiterlesen']\r
-strip: //div[@id='communityCanvas']\r
+title: //div[@id='artikelHeader']/h1
+author: //span[@class='author']
+date: //span[@class='date']
+body: //div[@class='copytext']
+strip: //ul[@class='lookupLinksArtikel']
+
+strip: //div[@id='pageTop']
+strip: //div[@id='toolbar']
+strip: //div[@id='articleTools']
+strip: //div[@id='weiterlesen']
+strip: //div[@id='communityCanvas']
test_url: http://derstandard.at/1318726018343/Breitband-LTE-Was-bringt-die-neue-Mobilfunk-Generation
\ No newline at end of file
-tidy: no\r
-body: //div[@class='main']\r
-\r
-author: substring-before(substring-after(//div[@class='meta-single'], 'erstellt von '), ' am')\r
-date: substring-before(substring-after(//div[@class='meta-single'], ' am '), ' | ')\r
-\r
-strip_id_or_class: pagelink\r
-strip_id_or_class: wp-polls \r
-\r
+tidy: no
+body: //div[@class='main']
+
+author: substring-before(substring-after(//div[@class='meta-single'], 'erstellt von '), ' am')
+date: substring-before(substring-after(//div[@class='meta-single'], ' am '), ' | ')
+
+strip_id_or_class: pagelink
+strip_id_or_class: wp-polls
+
next_page_link: //div[@class='post-page-next']/a
test_url: http://www.designtagebuch.de/die-gefuehlte-lesbarkeit/
\ No newline at end of file
-body: (//blockquote[contains(@class, 'postcontent')])[1]\r
-body: (//div[starts-with(@id, 'post_message')])[1]\r
-\r
-prune: no\r
+body: (//blockquote[contains(@class, 'postcontent')])[1]
+body: (//div[starts-with(@id, 'post_message')])[1]
+
+prune: no
tidy: no
\ No newline at end of file
-title: //h1[@class="content-headline"]\r
-body: //div[@class="headers-container"] | //div[@class="content-container"]\r
-prune: no\r
-tidy: no\r
-\r
-single_page_link: //li[@class='utility-print']/a\r
-\r
+title: //h1[@class="content-headline"]
+body: //div[@class="headers-container"] | //div[@class="content-container"]
+prune: no
+tidy: no
+
+single_page_link: //li[@class='utility-print']/a
+
test_url: http://www.details.com/culture-trends/critical-eye/201108/best-new-designers-innovations
\ No newline at end of file
-title: //div[@class="bodyText"]/h1\r
+title: //div[@class="bodyText"]/h1
author: //div[@class="picture"]/a/img/@alt
test_url: https://developers.facebook.com/blog/post/2012/03/22/developer-spotlight--foodspotting/
\ No newline at end of file
-date: //h2[@class='date-header']\r
-body: //div[@class='post hentry']\r
-title: //h3\r
-strip: //div[@class='post-footer']\r
+date: //h2[@class='date-header']
+body: //div[@class='post hentry']
+title: //h3
+strip: //div[@class='post-footer']
test_url: http://devlinsangle.blogspot.co.at/2012/03/difference-between-teaching-and_01.html
\ No newline at end of file
-title: //h1[@id='query_h1']\r
-body: //div[contains(@class, 'lunatext results_content')]\r
-strip_id_or_class: spl_unshd\r
-#replace_string(<div class="dicTl">): <div class="dicTl">------------------<br />\r
-\r
-prune: no\r
+title: //h1[@id='query_h1']
+body: //div[contains(@class, 'lunatext results_content')]
+strip_id_or_class: spl_unshd
+#replace_string(<div class="dicTl">): <div class="dicTl">------------------<br />
+
+prune: no
test_url: http://www.wired.com/cloudline/2011/10/meet-arms-cortex-a15-the-future-of-the-ipad-and-possibly-the-macbook-air/
\ No newline at end of file
-title: //div[@class='article']/h1\r
-date: substring-before(//p[@class='articletime'],'|')\r
-body: //div[@id='articletext']\r
-strip: //div[@class='inlineDiashow']\r
+title: //div[@class='article']/h1
+date: substring-before(//p[@class='articletime'],'|')
+body: //div[@id='articletext']
+strip: //div[@class='inlineDiashow']
test_url: http://diepresse.com/home/politik/aussenpolitik/701905/TibeterProteste_Nonne-verbrennt-sich-selbst?_vl_backlink=/home/politik/index.do
\ No newline at end of file
-# default parser works great\r
-# only add "author" and "next page link" reference\r
-# 2012-04-13\r
-\r
-next_page_link: //div[@class = 'pagination']/a[@class = 'next_page']\r
-\r
+# default parser works great
+# only adds "author" and "next page link" rules
+# 2012-04-13
+
+next_page_link: //div[@class = 'pagination']/a[@class = 'next_page']
+
author: //*[@class = 'author metadata']/a
test_url: http://digiphoto.techbang.com/posts/2433--commercial-photography-communication-is-the-key-to-a-good-work
\ No newline at end of file
-title: //div[@class='post-title']/h1\r
-author: //a[@href='#author']\r
-body: //div[@class='post-content']\r
-strip: //div[@class='post-meta']\r
-\r
+title: //div[@class='post-title']/h1
+author: //a[@href='#author']
+body: //div[@class='post-content']
+strip: //div[@class='post-meta']
+
test_url: http://www.digital-photography-school.com/10-ways-to-develop-yourself-photographically
\ No newline at end of file
-title: //div[@class="article_header"]/h1\r
-date: //div[@class="article_pub"]/span[@class="time"]\r
-author: //div[@class="article_pub"]/span[@class="editors"]/a/text()\r
+title: //div[@class="article_header"]/h1
+date: //div[@class="article_pub"]/span[@class="time"]
+author: //div[@class="article_pub"]/span[@class="editors"]/a/text()
body: //div[@class="article_body clear_left"]
test_url: http://www.digitalspy.co.uk/movies/at-the-movies/a364066/top-5-super-bowl-movie-trailers-the-avengers-battleship-more.html
\ No newline at end of file
-convert_double_br_tags: yes\r
-\r
-title: substring(substring-after(//title, ':'), 1, string-length(substring-after(//title, ':')) - 10)\r
-body: //*[contains(@class, 'SB_Content')]\r
-author: string('Scott Adams')\r
-date: //*[contains(@class, 'SB_Detail')]/text()[1]\r
+#title: substring(substring-after(//title, ':'), 1, string-length(substring-after(//title, ':')) - 10)
+title: //div[contains(@class, 'SB_Title')]//a
+body: //div[contains(@class, 'STR_Image')]
+body: //*[contains(@class, 'SB_Content')]
+author: string('Scott Adams')
+date: //*[contains(@class, 'SB_Detail')]/text()[1]
-test_url: http://dilbert.com/blog/entry/death_by_hypnosis_or_not/
\ No newline at end of file
+
+test_url: http://dilbert.com/blog/entry/death_by_hypnosis_or_not/
+test_url: http://dilbert.com/strips/comic/2013-10-22
+test_url: http://feed.dilbert.com/dilbert/daily_strip
\ No newline at end of file
-title: //div[@class='newsdetbd']\r
-body: //div[@id='innerleft'] \r
-#//p[@class = 'plnht']\r
-strip_image_src: /albums/\r
-strip: //div[@class='mrrt']\r
-prune: yes\r
-strip_id_or_class: 'fdpd'\r
-strip_id_or_class: 'epapt' \r
-strip_id_or_class: 'newsrtwd'\r
-strip_id_or_class: 'padtp'\r
-strip_id_or_class: 'newdt'\r
-strip_id_or_class: 'newdlt'\r
-strip: //div[@id='selNotes']\r
-strip_id_or_class: 'clsNotes'\r
-strip_id_or_class: 'clear'\r
-strip_id_or_class: 'cmtwrap'\r
-strip_id_or_class: 'sess'\r
+title: //div[@class='newsdetbd']
+body: //div[@id='innerleft']
+#//p[@class = 'plnht']
+strip_image_src: /albums/
+strip: //div[@class='mrrt']
+prune: yes
+strip_id_or_class: 'fdpd'
+strip_id_or_class: 'epapt'
+strip_id_or_class: 'newsrtwd'
+strip_id_or_class: 'padtp'
+strip_id_or_class: 'newdt'
+strip_id_or_class: 'newdlt'
+strip: //div[@id='selNotes']
+strip_id_or_class: 'clsNotes'
+strip_id_or_class: 'clear'
+strip_id_or_class: 'cmtwrap'
+strip_id_or_class: 'sess'
strip_id_or_class: 'parents'
test_url: http://www.dinamalar.com/News_Detail.asp?Id=295725
\ No newline at end of file
-# Since this element has class="clear", the Instapaper stylesheets (at least this text parser preview), will render it unreadable, with a 1px font size and line height.\r
-\r
-body: //div[@id="article-content"]\r
-\r
-\r
-# Ads\r
-strip_id_or_class: advert-space\r
-\r
-# Read more, recommend, comments etc\r
-strip_id_or_class: fbc-recommend\r
-strip_id_or_class: recommend\r
-strip_id_or_class: article-readers\r
-strip_id_or_class: article-addons\r
-strip_id_or_class: hook\r
-strip_id_or_class: right\r
-strip_id_or_class: footer\r
-\r
-# Other news\r
-strip: //div[@id="mirrors"]\r
-\r
-# Author\r
-author: //div[@id="byline"]/div/p/strong\r
-\r
-# Date\r
-date: substring(substring-after(//p[@class="published"], 'Publicerad '), 0, 11)\r
-test_url: http://www.dn.se/nyheter/varlden/landade-flygplan-mitt-i-villaomrade
\ No newline at end of file
+# Since this element has class="clear", the Instapaper stylesheets (at least in this text parser preview) will render it unreadable, with a 1px font size and line height.
+
+body: //div[@id="article-content"]
+
+
+# Ads
+strip_id_or_class: advert-space
+
+# Read more, recommend, comments etc
+strip_id_or_class: fbc-recommend
+strip_id_or_class: recommend
+strip_id_or_class: article-readers
+strip_id_or_class: article-addons
+strip_id_or_class: hook
+strip_id_or_class: right
+strip_id_or_class: footer
+
+# Other news
+strip: //div[@id="mirrors"]
+
+# Author
+author: //div[@id="byline"]/div/p/strong
+
+# Date
+date: substring(substring-after(//p[@class="published"], 'Publicerad '), 0, 11)
+
+test_url: http://www.dn.se/nyheter/varlden/landade-flygplan-mitt-i-villaomrade
+test_url: http://www.dn.se/m/rss/senaste-nytt
\ No newline at end of file
--- /dev/null
+title: //*[@class="news"]//h1[@class="title"]
+author: //*[@class="news"]//*[@class="newsInfo"]/a
+date: substring-before(//*[@class="news"]//*[@class="newsInfo"]/text(), ',')
+body: //*[@class="news"]//*[@class="newsContent"]
+footnotes: no
+test_url: http://www.dobreprogramy.pl/Sony-konczy-z-Foldinghome-na-PS3,Aktualnosc,36899.html
\ No newline at end of file
-strip: //*[(@id = "featured")]\r
-\r
-author:substring-after( //div[@class='posttitle']/h2[@class='author'],'by ')\r
-\r
-date: concat(//div[@class='month'],' ',//div[@class='day'])\r
-\r
+strip: //*[(@id = "featured")]
+
+author:substring-after( //div[@class='posttitle']/h2[@class='author'],'by ')
+
+date: concat(//div[@class='month'],' ',//div[@class='day'])
+
#doctac doesn't provide a year, but month/day is better than nothing
test_url: http://www.doctac.com/mac/iphone/instapaper-update-app/
\ No newline at end of file
-# TODO: clean up the extra junk at the end of articles\r
-\r
-# general text formatting\r
-prune: no\r
-convert_double_br_tags:yes\r
-\r
-# where to find the basic metadata\r
-author://a[@class='articleauthor']\r
-date://a[starts-with(@href,'/en/search/published/')]\r
-title:substring-before(//h2[@class='title'],'—')\r
-body://div[@id='maincontainer']\r
-\r
-dissolve://div[starts-with(@id,'commentableblock')]\r
-\r
-# clean up the crap\r
-strip://div[contains(@class,'domusnetwork')]\r
-strip://div[contains(@class,'relative_wrapper')]\r
-\r
-strip://div[contains(@class,'captionsubimage')]/img[contains(@class,'arrow')]\r
+# TODO: clean up the extra junk at the end of articles
+
+# general text formatting
+prune: no
+convert_double_br_tags:yes
+
+# where to find the basic metadata
+author://a[@class='articleauthor']
+date://a[starts-with(@href,'/en/search/published/')]
+title:substring-before(//h2[@class='title'],'—')
+body://div[@id='maincontainer']
+
+dissolve://div[starts-with(@id,'commentableblock')]
+
+# clean up the crap
+strip://div[contains(@class,'domusnetwork')]
+strip://div[contains(@class,'relative_wrapper')]
+
+strip://div[contains(@class,'captionsubimage')]/img[contains(@class,'arrow')]
wrap_in(em): //div[contains(@class,'captionsubimage')]/span
test_url: http://www.domusweb.it/en/design/in-praise-of-lost-time/
\ No newline at end of file
-title: //h1[@itemprop="name"]\r
-\r
-author: //div[contains(@class, 'author')]//div[contains(@class, 'name')]/a\r
-\r
-date: //div[contains(@class, 'b-info')]//span[contains(@class, 'date')]\r
-\r
+title: //h1[@itemprop="name"]
+
+author: //div[contains(@class, 'author')]//div[contains(@class, 'name')]/a
+
+date: //div[contains(@class, 'b-info')]//span[contains(@class, 'date')]
+
body: //div[contains(@class, 'b-typo')]
test_url: http://dou.ua/lenta/interviews/andrej-havryuchenko/?from=sb_mostcomm
\ No newline at end of file
-# This filter is tested on:\r
-# http://www.douban.com/note/215003067/\r
-# http://www.douban.com/note/213540049/\r
-# http://www.douban.com/group/topic/31140104/\r
-\r
-title: //div[@class='note-header']/h1\r
-title: //div[@id='content']/h1\r
-\r
-author: //div[@class='info']/ul/li/a\r
-author: //h3/span/a\r
-\r
-date://div[@class='note-header']/div/span\r
-date://h3/span[contains(@class, 'color-green')]\r
-\r
-body://div[contains(@class, 'note')]\r
-body://div[contains(@class, 'topic-content')]\r
-\r
-strip://h3\r
-\r
-convert_double_br_tags: yes\r
+# This filter is tested on:
+# http://www.douban.com/note/215003067/
+# http://www.douban.com/note/213540049/
+# http://www.douban.com/group/topic/31140104/
+
+title: //div[@class='note-header']/h1
+title: //div[@id='content']/h1
+
+author: //div[@class='info']/ul/li/a
+author: //h3/span/a
+
+date://div[@class='note-header']/div/span
+date://h3/span[contains(@class, 'color-green')]
+
+body://div[contains(@class, 'note')]
+body://div[contains(@class, 'topic-content')]
+
+strip://h3
+
+convert_double_br_tags: yes
test_url: http://www.douban.com/group/topic/31140104/
\ No newline at end of file
-# next_page_link for product review\r
-# example: http://www.dpreview.com/reviews/lytro/\r
-next_page_link: //img[@alt = 'Next page']/../@href\r
-\r
-# next_page_link for other articles\r
-# example: http://www.dpreview.com/articles/6126592906/first-impressions-using-the-fujifilm-x-pro1\r
-next_page_link: //*[@class = 'pages']/*/td[@class = 'next enabled']/a\r
+# next_page_link for product review
+# example: http://www.dpreview.com/reviews/lytro/
+next_page_link: //img[@alt = 'Next page']/../@href
+
+# next_page_link for other articles
+# example: http://www.dpreview.com/articles/6126592906/first-impressions-using-the-fujifilm-x-pro1
+next_page_link: //*[@class = 'pages']/*/td[@class = 'next enabled']/a
single_page_link: //a[contains(.,'Print view')]
test_url: http://www.dpreview.com/articles/6126592906/first-impressions-using-the-fujifilm-x-pro1
\ No newline at end of file
-title: //meta[@property='og:title']/@content\r
-author: //div[@class='articleFunctions']//a\r
-date: //meta[@name='pubdate']/@content\r
-\r
-# Can you strip elements from the body only? It is required here (`//div[@class='articleContent']/p` breaks for some reason)\r
-body: //div[@class='articleContent']\r
-\r
+title: //meta[@property='og:title']/@content
+author: //div[@class='articleFunctions']//a
+date: //meta[@name='pubdate']/@content
+
+# Can you strip elements from the body only? It is required here (`//div[@class='articleContent']/p` breaks for some reason)
+body: //div[@class='articleContent']
+
tidy: no
test_url: http://www.dr.dk/Nyheder/Udland/2011/10/24/150115.htm
\ No newline at end of file
-body: //div[@class='postext']\r
-\r
-strip_id_or_class: ratingblock\r
-strip_id_or_class: hreview-aggregate\r
-strip: //div[contains(@style, 'display: none;')]\r
-\r
-tidy: no\r
-prune: no\r
-\r
+body: //div[@class='postext']
+
+strip_id_or_class: ratingblock
+strip_id_or_class: hreview-aggregate
+strip: //div[contains(@style, 'display: none;')]
+
+tidy: no
+prune: no
+
test_url: http://www.dramasonline.com/jago-pakistan-jago-7th-december-2012-ali-gul-pir/
\ No newline at end of file
-body: //div[@class = "description"]\r
-body: //div[@id = "post"]\r
-\r
-strip_id_or_class: vcard\r
-strip_id_or_class: journallist\r
-strip_id_or_class: infobox\r
-strip_id_or_class: terms\r
-strip_id_or_class: replieslist\r
-strip_id_or_class: communityside\r
-\r
+body: //div[@class = "description"]
+body: //div[@id = "post"]
+
+strip_id_or_class: vcard
+strip_id_or_class: journallist
+strip_id_or_class: infobox
+strip_id_or_class: terms
+strip_id_or_class: replieslist
+strip_id_or_class: communityside
+
test_url: http://www.drive2.ru/cars/audi/a6/a6_c5/elysey/journal/288230376151836654/
\ No newline at end of file
--- /dev/null
+single_page_link: //a[@id='download_button_link']
\ No newline at end of file
-title://h1\r
-author://div[@class="submitted"]/a\r
-date:substring-after(//div[@class="meta"],'modified: ')\r
-date:substring-after(//div[@class="submitted"],'on ')\r
-body://div[@class="node-content"]\r
-strip://div[@class="meta"]\r
+title://h1
+author://div[@class="submitted"]/a
+date:substring-after(//div[@class="meta"],'modified: ')
+date:substring-after(//div[@class="submitted"],'on ')
+body://div[@class="node-content"]
+strip://div[@class="meta"]
strip_id_or_class:book-navigation
test_url: http://drupal.org/node/1327354
\ No newline at end of file
-title: //h2/a\r
-author: substring-before(substring-after(//span[@class='byline'], 'by'), ',')\r
-date: substring-before(substring-after(//span[@class='byline'], ','), '|')\r
-body: //div[@class='entry']\r
-\r
-\r
-# strip out auction stuff at the end of posts\r
-# tidy kills the center tag, so disable it\r
-tidy: no\r
+title: //h2/a
+author: substring-before(substring-after(//span[@class='byline'], 'by'), ',')
+date: substring-before(substring-after(//span[@class='byline'], ','), '|')
+body: //div[@class='entry']
+
+
+# strip out auction stuff at the end of posts
+# tidy kills the center tag, so disable it
+tidy: no
strip: //center//table
test_url: http://www.dukebasketballreport.com/articles/?p=42660
\ No newline at end of file
--- /dev/null
+# This filter is tested on:
+# http://www.dushumashang.com/2389
+# http://www.dushumashang.com/2415
+# http://www.dushumashang.com/2355
+
+body://div[@class='main_content']
+#body://section[@class='entry_content fl']
+title://h2
+author://span[@class='article_author']/a
+date://span[@class='pub_date']/time
+
+strip://span[@class='article_author']
+strip://span[@class='pub_date']
+strip://div[@class='page_turn']
+strip://span[@class='source_link']/em
+wrap_in(strong)://span[@class='source_link']/a
+test_url: http://www.dushumashang.com/2355
\ No newline at end of file
-strip://*[@id = 'blog_top_stories']\r
-strip://*[@id = 'takeover_off']\r
-strip://*[@id = 'right_gray_box']\r
-strip://*[@class = 'blog_topics']\r
-strip://*[@class = 'section_titles']\r
-\r
-author://div[@class = 'post_author_info']/a\r
+strip://*[@id = 'blog_top_stories']
+strip://*[@id = 'takeover_off']
+strip://*[@id = 'right_gray_box']
+strip://*[@class = 'blog_topics']
+strip://*[@class = 'section_titles']
+
+author://div[@class = 'post_author_info']/a
date://div[@class = 'post_date_info']
test_url: http://dvice.com/archives/2012/05/is-nfc-and-smar.php
\ No newline at end of file
-title: //div [@class="post contain"]/h1\r
-strip: //div [@class="post contain"]/h1\r
-body: //div [@class="post contain"]\r
-author: substring-before(//title, ':')\r
-author: substring-before(//title, ' ')\r
-\r
+title: //div [@class="post contain"]/h1
+strip: //div [@class="post contain"]/h1
+body: //div [@class="post contain"]
+author: substring-before(//title, ':')
+author: substring-before(//title, ' ')
+
test_url: http://eamesinerudition.com/2012/03/hospital-numbers-are-bad-for-you
\ No newline at end of file
-title: //h1\r
-date: //div[@class="et_dateUnderTitle"]\r
-author: substring-after(//div[@class="et_authorUnderTitle"], 'By ')\r
-body: //div[@id="et_leftCol640split"]\r
-\r
-strip: //div[@id="et_leftCol640splitRight"]\r
+title: //h1
+date: //div[@class="et_dateUnderTitle"]
+author: substring-after(//div[@class="et_authorUnderTitle"], 'By ')
+body: //div[@id="et_leftCol640split"]
+
+strip: //div[@id="et_leftCol640splitRight"]
strip: //div[@class="et_light_greybgboxlower"]
test_url: http://eandt.theiet.org/magazine/2011/12/this-festive-waste.cfm
\ No newline at end of file
-title: //div[@class='title_text']\r
-\r
-author: //div[@class='author_text']\r
-\r
-body: //div[@class='story_text']/..\r
-\r
-strip: //b\r
-\r
-strip_id_or_class: back_to_top\r
-strip_id_or_class: author_text\r
-strip_id_or_class: title_text\r
-\r
-wrap_in(center): //a\r
-\r
-dissolve: //a\r
- \r
+title: //div[@class='title_text']
+
+author: //div[@class='author_text']
+
+body: //div[@class='story_text']/..
+
+strip: //b
+
+strip_id_or_class: back_to_top
+strip_id_or_class: author_text
+strip_id_or_class: title_text
+
+wrap_in(center): //a
+
+dissolve: //a
+
footnotes: no
test_url: http://www.eastoftheweb.com/short-stories/UBooks/Horl.shtml
\ No newline at end of file
-body: //h1[@class='it-ttl'] | //div[@id='mainImgHldr'] | //span[@id='prcIsum']\r
-\r
-strip_image_src: imgLoading_30x30.gif\r
-\r
+body: //h1[@class='it-ttl'] | //div[@id='mainImgHldr'] | //span[@id='prcIsum']
+
+strip_image_src: imgLoading_30x30.gif
+
test_url: http://www.ebay.com/itm/BRAND-NEW-FM-Transmitter-Ca-r-Charger-iPhone-4S-4-4G-3GS-3G-2G-iPod-Touch-/190657497204
\ No newline at end of file
-title: //h1[@class="title"]\r
-author: //div[@class="hosted"]/a\r
-date: substring-after(//div[@class="dateline"]/text(), '|')\r
-\r
+title: //h1[@class="title"]
+author: //div[@class="hosted"]/a
+date: substring-after(//div[@class="dateline"]/text(), '|')
+
strip: //a[@class="top" and @href="#"]
test_url: http://econlog.econlib.org/archives/2012/04/blinder_on_heal.html
\ No newline at end of file
-date: //div[@class="bb-md-noticia-fecha"]\r
-body: //div[@class="corpo"]\r
-dissolve: //div[@class="bb-md-noticia-extras"]\r
-strip: //strong\r
-strip_id_or_class: bb-md-noticia-foto-autor\r
+date: //div[@class="bb-md-noticia-fecha"]
+body: //div[@class="corpo"]
+dissolve: //div[@class="bb-md-noticia-extras"]
+strip: //strong
+strip_id_or_class: bb-md-noticia-foto-autor
strip_id_or_class: bb-md-noticia-foto-bajada
test_url: http://economia.estadao.com.br/noticias/economia,cmn-aprova-r-67-bi-em-credito-para-20-setores-da-economia,118501,0.htm
\ No newline at end of file
-title: //div[@class='ec-blog-headline']\r
-body: //div[@class='ec-blog-body']\r
-body: //div[@class='ec-article-content clear']\r
-strip: //div[@class='related-items']\r
-date: substring-before(//p[@class='ec-article-info'], '|')\r
-prune: no\r
-\r
-autodetect_next_page: no\r
-\r
+body: //div[@class='main-content']
+date: //time[@class='date-created']
+strip: //aside
+prune: no
+
+autodetect_next_page: no
+
test_url: http://www.economist.com/node/21528429
\ No newline at end of file
-title: //meta[@property="og:title"]/@content\r
-body: //h2[@class='strapline'] | //article[contains(@class, 'node-article')]\r
-date: //time[@pubdate]/@datetime\r
-author: //span[@class='author-name']\r
-prune: no\r
-tidy: no\r
-strip: //footer\r
-\r
-replace_string(<p>[ pagebreak ]</p>): <!-- pagebreak -->\r
-\r
-single_page_link: //a[contains(@href, '?page=show')]\r
-\r
+title: //meta[@property="og:title"]/@content
+body: //h2[@class='strapline'] | //article[contains(@class, 'node-article')]
+date: //time[@pubdate]/@datetime
+author: //span[@class='author-name']
+prune: no
+tidy: no
+strip: //footer
+
+replace_string(<p>[ pagebreak ]</p>): <!-- pagebreak -->
+
+single_page_link: //a[contains(@href, '?page=show')]
+
test_url: http://www.edge-online.com/features/telling-modern-warfares-story
\ No newline at end of file
-title: //div[@class='HomeLeftPannel IMGCTRL']/h2\r
-body: //div[@class='HomeLeftPannel IMGCTRL']//div[@class='Brownalink' or @id='shortdesc']\r
-tidy: no\r
-\r
+title: //div[@class='HomeLeftPannel IMGCTRL']/h2
+body: //div[@class='HomeLeftPannel IMGCTRL']//div[@class='Brownalink' or @id='shortdesc']
+tidy: no
+
test_url: http://edge.org/print/conversation.php?cid=the-argumentative-theory
\ No newline at end of file
--- /dev/null
+title: //div[@id='singlePage']//h2
+body: //div[@id='singlePage']//div[contains(@class, 'post')]
+strip: //a[@title='Email This Story']
+strip_id_or_class: sociable
+
+prune: no
+
+test_url: http://edition.channel5belize.com/archives/86016
+test_url: http://edition.channel5belize.com/feed
\ No newline at end of file
-body: //div[@id='cnnContentContainer']//div[contains(@class, 'cnn_strycntntlft')]\r
-strip: //div[@id='cnnCVP2']\r
-strip_id_or_class: cnn_strylftcexpbx\r
-strip_id_or_class: cnn_strylctcqrelt\r
-strip_id_or_class: cnn_strybtntoolsbttm\r
-strip_id_or_class: cnn_stryftsbttm\r
-strip_id_or_class: cnn_strybtmcntnt\r
+body: //div[@id='cnnContentContainer']//div[contains(@class, 'cnn_strycntntlft')]
+strip: //a[starts-with(@name, 'em')]
+strip: //div[@id='cnnCVP2']
+strip_id_or_class: cnn_strylftcexpbx
+strip_id_or_class: cnn_strylctcqrelt
+strip_id_or_class: cnn_strybtntoolsbttm
+strip_id_or_class: cnn_stryftsbttm
+strip_id_or_class: cnn_strybtmcntnt
+strip_id_or_class: cnn_stryshrwdgtbtm
+strip_id_or_class: cnnGalleryContainer
+strip_id_or_class: cnn_strycrcntr
+strip_id_or_class: cnn_html_slideshow
prune: no
-test_url: http://edition.cnn.com/2011/US/04/29/severe.weather/index.html
\ No newline at end of file
+
+test_url: http://edition.cnn.com/2011/US/04/29/severe.weather/index.html
+test_url: http://edition.cnn.com/2013/08/15/world/africa/nigeria-boko-haram-commander-killed/index.html?eref=edition
+test_url: http://rss.cnn.com/rss/edition.rss
+test_url: http://rss.cnn.com/rss/edition_technology.rss
\ No newline at end of file
--- /dev/null
+body: //div[contains(@class, 'grayshowlinks')]
+
+next_page_link: //div[@id='sitecontentcol']//a[.='Next >']
+# Doesn't work (site doesn't always load full content in print view)
+#single_page_link: //div[@id='sitecontentcol']//a[contains(@href, 'print=yes')]
+
+test_url: http://www.eetimes.com/document.asp?doc_id=1319966&
+test_url: http://www.eetimes.com/rss_simple.asp
\ No newline at end of file
-title: //h1[@class='style6 nevek']\r
-\r
-body: //div[@class='bal3']\r
-\r
-\r
-prune: yes\r
-\r
-tidy: yes\r
-convert_double_br_tags: yes\r
+title: //h1[@class='style6 nevek']
+
+body: //div[@class='bal3']
+
+
+prune: yes
+
+tidy: yes
+convert_double_br_tags: yes
test_url: http://ekultura.hu/olvasnivalo/egyeb/cikk/2010-12-15/interju-galvolgyi-judit-2010-december
\ No newline at end of file
-body: //div[@id='jobDesc-bd']/p\r
+body: //div[@id='jobDesc-bd']/p
test_url: http://www.elance.com/j/xml-technical-intergration/23687172/
\ No newline at end of file
--- /dev/null
+date: //time
+title: //h1[contains(@class, "alpha")]
+body: //article[contains(@class, "news-post")]
+
+# fix dates - the site's numeric format (e.g. 03.28.2013) is not understood by strtotime, so replace the leading month number with the month name
+replace_string(<time class="gamma">01.): <time class="gamma">January.
+replace_string(<time class="gamma">02.): <time class="gamma">February.
+replace_string(<time class="gamma">03.): <time class="gamma">March.
+replace_string(<time class="gamma">04.): <time class="gamma">April.
+replace_string(<time class="gamma">05.): <time class="gamma">May.
+replace_string(<time class="gamma">06.): <time class="gamma">June.
+replace_string(<time class="gamma">07.): <time class="gamma">July.
+replace_string(<time class="gamma">08.): <time class="gamma">August.
+replace_string(<time class="gamma">09.): <time class="gamma">September.
+replace_string(<time class="gamma">10.): <time class="gamma">October.
+replace_string(<time class="gamma">11.): <time class="gamma">November.
+replace_string(<time class="gamma">12.): <time class="gamma">December.
+
+prune: no
+
+test_url: http://elderscrollsonline.com/en/rss
+test_url: http://elderscrollsonline.com/en/news/post/2013/03/27/developer-question-of-the-week-17
\ No newline at end of file
-title: //h1\r
-date: //div[@class='datum']\r
-single_page_link: //a[contains(@href, '?type=99')]\r
-\r
-# this hack preserves the intro text, because it would be striped otherwise if the title is set to //h1\r
-dissolve: //div[@class='artikelMeldung']\r
-\r
-\r
-strip_id_or_class: anzeige\r
-strip_id_or_class: top_page_navigation\r
-strip_id_or_class: cr_image_container\r
-strip_id_or_class: cr_image_reference\r
-strip_id_or_class: cr_image_icon\r
-strip_id_or_class: _close_txt\r
-strip_id_or_class: _close_ico\r
-strip_id_or_class: clearer\r
-\r
-strip://h1\r
-strip://h6\r
-strip://div[contains(@id, 'plista')]\r
-strip://img[contains(@id,'tiny')]\r
-strip://img[@class='cr_image']\r
-\r
-# strip url at the top\r
-strip: //p[@style='font-size: 10px;']\r
+title: //h1
+date: //div[@class='datum']
+single_page_link: //a[contains(@href, '?type=99')]
+
+# this hack preserves the intro text, because it would otherwise be stripped if the title is set to //h1
+dissolve: //div[@class='artikelMeldung']
+
+
+strip_id_or_class: anzeige
+strip_id_or_class: top_page_navigation
+strip_id_or_class: cr_image_container
+strip_id_or_class: cr_image_reference
+strip_id_or_class: cr_image_icon
+strip_id_or_class: _close_txt
+strip_id_or_class: _close_ico
+strip_id_or_class: clearer
+
+strip://h1
+strip://h6
+strip://div[contains(@id, 'plista')]
+strip://img[contains(@id,'tiny')]
+strip://img[@class='cr_image']
+
+# strip url at the top
+strip: //p[@style='font-size: 10px;']
test_url: http://www.elektroniknet.de/automotive/technik-know-how/sicherheitselektronik/article/87717/0/Besser_als_die_Wirklichkeit/
\ No newline at end of file
-single_page_link: //a[contains(@href, 'print_contenido')]\r
-title: //h2\r
+single_page_link: //a[contains(@href, 'print_contenido')]
+title: //h2
author: //div[@class="autor"]
test_url: http://www.elmalpensante.com/index.php?doc=display_contenido&id=668
\ No newline at end of file
-title: //meta[@name='DC.title']/@content\r
-title: //div[contains(@class, 'cabecera_noticia')]//h1\r
-date: //meta[@name='DC.date']/@content\r
-date: //meta[@name='date']/@content\r
-body: //div[@class='columna_texto']\r
-body: //div[@id='cuerpo_noticia']\r
-body: //div[@class='estructura_2col_1zq']//div[@class='margen_n']\r
-\r
-prune: no\r
-\r
-strip_id_or_class: disposicion_vertical\r
-strip_id_or_class: ampliar_foto\r
-strip_id_or_class: utilidades\r
-strip_id_or_class: info_relacionada\r
-strip_id_or_class: m-kiosko\r
-strip_id_or_class: info_complementa\r
-\r
-strip: //div[starts-with(@id, 'sumario') and contains(., 'más información')]\r
-strip: //div[@id='coment' or @id='foros_not']\r
+title: //meta[@name='DC.title']/@content
+title: //div[contains(@class, 'cabecera_noticia')]//h1
+date: //meta[@name='DC.date']/@content
+date: //meta[@name='date']/@content
+body: //div[@class='columna_texto']
+body: //div[@id='cuerpo_noticia']
+body: //div[@class='estructura_2col_1zq']//div[@class='margen_n']
-test_url: http://elpais.com/elpais/2012/02/06/gente/1328526783_491687.html\r
+prune: no
+
+strip_id_or_class: disposicion_vertical
+strip_id_or_class: ampliar_foto
+strip_id_or_class: utilidades
+strip_id_or_class: info_relacionada
+strip_id_or_class: m-kiosko
+strip_id_or_class: info_complementa
+
+strip: //div[starts-with(@id, 'sumario') and contains(., 'más información')]
+strip: //div[@id='coment' or @id='foros_not']
+
+test_url: http://elpais.com/elpais/2012/02/06/gente/1328526783_491687.html
test_url: http://www.elpais.com/articulo/cultura/mano/retrato/materia/elpepicul/20120207elpepicul_2/Tes
\ No newline at end of file
--- /dev/null
+body: //div[@id='main-column']//div[@class='content']
+
+prune: no
+
+test_url: http://www.emaratalyoum.com/sports/arab-and-international/2013-08-29-1.601844
+test_url: http://www.emaratalyoum.com/sports/arab-and-international/2013-08-29-1.601842
+test_url: http://www.emaratalyoum.com/public-sports-1.533088?ot=ot.AjaxPageLayout
\ No newline at end of file
-body: //div[@id='content']\r
-strip: //div[@class='rl'] \r
-strip: //p[@class='authdesc']\r
-strip: //p[@class='strybtm']\r
-strip: //div[@id='stryFtrLft']\r
-strip: //div[@id='f1Conversation']\r
-strip: //div[@id='cmtSpncrRuler']\r
-strip: //div[@id='stryComments']\r
+body: //div[@id='content']
+strip: //div[@class='rl']
+strip: //p[@class='authdesc']
+strip: //p[@class='strybtm']
+strip: //div[@id='stryFtrLft']
+strip: //div[@id='f1Conversation']
+strip: //div[@id='cmtSpncrRuler']
+strip: //div[@id='stryComments']
strip: //div[@id='athrData']
test_url: http://en.espnf1.com/monaco/motorsport/story/50529.html
\ No newline at end of file
-title: //meta[@property="og:title"]/@content\r
-body: //div[@class='post_body']\r
-date: //*[@class='post_time']\r
-\r
-prune: no\r
-\r
+title: //meta[@property="og:title"]/@content
+body: //div[@class='post_body']
+date: //*[@class='post_time']
+
+prune: no
+
test_url: http://www.engadget.com/2011/05/20/screen-grabs-the-mentalist-takes-the-ipad-to-new-heights/
\ No newline at end of file
-title: //h2\r
-body: //div[@class="post_content"]\r
-author: //p[@class="author"]/a\r
-date: //p[@class="date"]\r
-strip: //h2\r
+title: //h2
+body: //div[@class="post_content"]
+author: //p[@class="author"]/a
+date: //p[@class="date"]
+strip: //h2
strip: //header
test_url: http://engineering.tumblr.com/post/21276808338/tumblr-firehose
\ No newline at end of file
-title: //span[@id='DetailedTitle']\r
-body: //div[@id='ctl00_cphBody_dvArticleInfoBlock'] | //td[@class='DetailedSummary']\r
-strip_id_or_class: sidebar\r
-strip_id_or_class: Skyscrapper_Body\r
-strip: //td[@class='DetailedSummary']/table[position() != 1]\r
-prune: no\r
+title: //span[@id='DetailedTitle']
+body: //div[@id='ctl00_cphBody_dvArticleInfoBlock'] | //td[@class='DetailedSummary']
+strip_id_or_class: sidebar
+strip_id_or_class: Skyscrapper_Body
+strip: //td[@class='DetailedSummary']/table[position() != 1]
+prune: no
test_url: http://english.aljazeera.net//news/middleeast/2011/04/20114681444376835.html
\ No newline at end of file
-body: //div[@id='article']//div[contains(@class, 'inside')]\r
-\r
-strip_id_or_class: tags\r
-strip_id_or_class: actions\r
-strip_id_or_class: google-ads\r
-\r
-prune: no\r
-\r
+body: //div[@id='article']//div[contains(@class, 'inside')]
+
+strip_id_or_class: tags
+strip_id_or_class: actions
+strip_id_or_class: google-ads
+
+prune: no
+
test_url: http://www.enikos.gr/politics/98606,To_oxi_toy_Agorastoy_stoys_Germanoys.html
\ No newline at end of file
-author://div[@class = 'article-author']/span[@class = 'byline']\r
-title://h1[@class = 'heading']\r
-body://div[@id = 'related-article-links']\r
-strip://div[@id = 'comment-sort-order']\r
-strip://div[@id = 'my-profile']\r
-strip://div[@class = 'article-author']\r
-strip://div[@class = 'bg-f8f1d8 width-385 text-left']\r
-strip://div[@id = 'login-status']\r
+author://div[@class = 'article-author']/span[@class = 'byline']
+title://h1[@class = 'heading']
+body://div[@id = 'related-article-links']
+strip://div[@id = 'comment-sort-order']
+strip://div[@id = 'my-profile']
+strip://div[@class = 'article-author']
+strip://div[@class = 'bg-f8f1d8 width-385 text-left']
+strip://div[@id = 'login-status']
strip://div[@class = 'puff-padding']
test_url: http://entertainment.timesonline.co.uk/tol/arts_and_entertainment/the_tls/article7177738.ece
\ No newline at end of file
--- /dev/null
+date: //h6[@class='datetime']/child::text()
+author: string("Eric J. Suh")
+footnotes: yes
+test_url: http://www.ericsuh.com/blog/posts/2012/8/strange-numbers.html
\ No newline at end of file
-title: concat(//div[@class='doc_author'], ' - ', upper-case(//div[@class='doc_title']))\r
-\r
-body: //div[@class='doc']\r
-\r
-prune: yes\r
-\r
-tidy: yes\r
-convert_double_br_tags: yes\r
-\r
+title: concat(//div[@class='doc_author'], ' - ', upper-case(//div[@class='doc_title']))
+
+body: //div[@class='doc']
+
+prune: yes
+
+tidy: yes
+convert_double_br_tags: yes
+
strip: //a[contains(@href, 'www.facebook.com/pages/Elet-es-Irodalom/')]
test_url: http://www.es.hu/2010-12-08_vissza-a-partpenzt
\ No newline at end of file
+title: //h1[@class='headline']/div[@class='name']
+
+strip_image_src: 'http://cdn.themis-media.com/media/global/images/library/deriv/115/115825.png'
+
+next_page_link: //a[@class='next_page']
+
strip_comments: no
-test_url: http://www.escapistmagazine.com/articles/view/columns/extraconsideration/8717-Extra-Consideration-The-Story
\ No newline at end of file
+test_url: http://www.escapistmagazine.com/articles/view/columns/criticalintel/10302-I-Hate-Magic
\ No newline at end of file
-title: //div[@class='headline'] | //div[@class='mod-header']/h3\r
-body: //div[contains(@class, 'article')]\r
-strip: //div[contains(@class, 'mod-inline')]\r
-strip: //*/span[@class='page-actions']\r
-strip: //div[@class='page-actions']/*\r
-strip: //div[@class='headline'] | //div[@class='mod-header']/h3\r
-strip: //div[@class='mod-blog-navigation']\r
-strip: //div[@class='monthday']\r
-strip: //div[@class='time']\r
-strip: //div[@class='timeofday']\r
+title: //div[@class='headline'] | //div[@class='mod-header']/h3
+body: //div[contains(@class, 'article')]
+strip: //div[contains(@class, 'mod-inline')]
+strip: //*/span[@class='page-actions']
+strip: //div[@class='page-actions']/*
+strip: //div[@class='headline'] | //div[@class='mod-header']/h3
+strip: //div[@class='mod-blog-navigation']
+strip: //div[@class='monthday']
+strip: //div[@class='time']
+strip: //div[@class='timeofday']
strip: //div[contains(@class, 'mod-conversations')]
test_url: http://espn.go.com/boston/mlb/story/_/id/7092528/terry-francona-victim-latest-red-sox-smear-campaign
\ No newline at end of file
-title: //h1\r
-author: //div[@id='byline']\r
-\r
-body: //div[@id='printBody']\r
-\r
-single_page_link: concat('http://www.esquire.com/print-this/', substring-after(//link[@rel='canonical']/@href, 'esquire.com/'))\r
-\r
-prune: no\r
-\r
-test_url: http://www.esquire.com/features/impossible/price-is-right-perfect-bid-0810
\ No newline at end of file
+title: //h1
+author: //div[@id='byline']
+
+body: //div[@id='printBody']
+
+single_page_link: concat('http://www.esquire.com/print-this/', substring-after(//link[@rel='canonical']/@href, 'esquire.com/'))
+
+prune: no
+
+test_url: http://www.esquire.com/features/impossible/price-is-right-perfect-bid-0810
+test_url: http://www.esquire.com/blogs/politics/police-getting-leftover-armoured-iraq-trucks-112513
\ No newline at end of file
-title: //*[@itemprop='headline']\r
-author: //*[@itemprop='author']\r
-date: //*[@itemprop='datePublished']\r
-body: //*[@itemprop='articleBody']\r
+title: //*[@itemprop='headline']
+author: //*[@itemprop='author']
+date: //*[@itemprop='datePublished']
+body: //*[@itemprop='articleBody']
strip: //*[contains(@class, 'instapaper_ignore')]
test_url: http://www.essentialpublicradio.org/story/2011-11-14/volunteers-sought-federal-tax-assistance-program-pennsylvania-9421
\ No newline at end of file
-strip_id_or_class: 'left'\r
-strip_id_or_class: 'right'\r
-strip_id_or_class: 'block-belowcontent'\r
-author: //span[@class = 'name']/a\r
-date: //div[@class= 'datum']\r
+strip_id_or_class: 'left'
+strip_id_or_class: 'right'
+strip_id_or_class: 'block-belowcontent'
+author: //span[@class = 'name']/a
+date: //div[@class= 'datum']
test_url: http://www.etc.se/intervju/lonsamt-att-radda-jorden
\ No newline at end of file
-body: //div[ @class='content' ] | //div[ @class='blog-entry' ]\r
-\r
-strip: //h2/abbr | //div[ @class='lowleader' ] | //*[ @class='discussion' ] | //img[ @class='play-button' ] | //div[ @class='boxout' ] | //h2/a | //h2 | //h2/div | //p[ @class='timestamp' ] | //a[ @class='eurogamer-author' ] | //p[ @class='aPager' ] | //h1 | //div[ @id='lowleader' ] | //a[ @class='next' ] | //div[contains(concat(' ', normalize-space(@class), ' '), ' pullquote ')]\r
-\r
-date://p[ @class='timestamp' ]\r
-\r
-author://a[ @class='eurogamer-author' ]\r
+body: //div[ @class='content' ] | //div[ @class='blog-entry' ]
+
+strip: //h2/abbr | //div[ @class='lowleader' ] | //*[ @class='discussion' ] | //img[ @class='play-button' ] | //div[ @class='boxout' ] | //h2/a | //h2 | //h2/div | //p[ @class='timestamp' ] | //a[ @class='eurogamer-author' ] | //p[ @class='aPager' ] | //h1 | //div[ @id='lowleader' ] | //a[ @class='next' ] | //div[contains(concat(' ', normalize-space(@class), ' '), ' pullquote ')]
+
+date://p[ @class='timestamp' ]
+
+author://a[ @class='eurogamer-author' ]
test_url: http://www.eurogamer.net/articles/digitalfoundry-vs-unreal-engine-4
\ No newline at end of file
-author: substring-after(//div[@class='articleauthor'],'By ')\r
-\r
-# Blog posts\r
-date: //div[@class='articledate']\r
-# News\r
-date: //div[@class='articledate_b']\r
-\r
-body: //div[@class='articletext']\r
-\r
+author: substring-after(//div[@class='articleauthor'],'By ')
+
+# Blog posts
+date: //div[@class='articledate']
+# News
+date: //div[@class='articledate_b']
+
+body: //div[@class='articletext']
+
convert_double_br_tags: yes
test_url: http://www.evo.co.uk/carreviews/evolongtermtests/280072/bmw_330d_sport_touring.html
\ No newline at end of file
-title: //div[@id='article']/div[contains(@class, 'content')]/h1\r
-body: //div[@id='article']/div[contains(@class, 'content')]\r
-date: //div[contains(@class, 'article-slot')]/descendant::div[contains(@id, 'articledates')]\r
-\r
-strip: //img[contains(@src, 'img/px.gif')]\r
-prune: no\r
-# remove Facebook banner and obtrusive ad\r
-strip: //div[@id='article']/div[contains(@class, 'content')]/div[contains(@class, 'art-right')]\r
-test_url: http://www.expressen.se/kultur/1.2683904/medan-natet-dras-at
\ No newline at end of file
+title: //h1[contains(@class, 'b-headline_article')]
+body: //div[contains(@class, 'b-article_print')]
+
+single_page_link: //div[contains(@class, 'b-page__footer__actions')]//a[contains(@href, 'print=true')]
+
+prune: no
+
+test_url: http://www.expressen.se/kultur/1.2683904/medan-natet-dras-at
+test_url: http://www.expressen.se/gt/polis-om-styckmordet-extremt-markligt-fall/
+test_url: http://www.expressen.se/Pages/OutboundFeedsPage.aspx?id=3642159&viewstyle=rss
\ No newline at end of file
--- /dev/null
+body: //div[@id='imagestage']
+prune: no
+tidy: no
+
+test_url: https://www.facebook.com/feeds/page.php?id=338077742912613&format=rss20
\ No newline at end of file
-bosdy: //div[@class='content']\r
+body: //div[@class='content']
test_url: http://facta.co.jp/blog/archives/20111026001026.html
\ No newline at end of file
-title: //h2[@class='related relatedTitle']\r
-author: //a[contains(@href, 'liste.php?author_id')]\r
-\r
-# can't think of a better way unfortunately, really bad markup on this site\r
-date: substring-after(//td[@style='width:85%;'], 'vom')\r
-\r
-# not sure why, but instapaper seems to suck up the teaser paragraph\r
-# not solved!\r
-body: //div[contains(@class, 'teaser')]\r
-body: //div[@id='content']\r
-\r
-# cleanup\r
-strip: //img[@src='http://www.falter.at/web/_pics/falterlogo_dblau.gif']\r
-strip: //div[@class='servicebox']\r
-strip: //h1\r
-strip: //br\r
-strip: //td[@id='adcol']
-test_url: http://www.falter.at/web/print/detail.php?id=1634
\ No newline at end of file
+title: //h1
+author: //a[contains(@href, '/kategorie/autoren')]
+date: //a[contains(@href, '/falter/ausgabe')]
+body: //article[@class='spanMain']
+
+# cleanup
+strip_id_or_class: 'respond'
+strip: //img[@src='http://www.falter.at/web/_pics/falterlogo_dblau.gif']
+strip_id_or_class: 'meta'
+strip_id_or_class: 'servicebox'
+strip_id_or_class: 'related'
+strip_id_or_class: 'twitter-share-button'
+strip: //br
+test_url: http://www.falter.at/falter/2013/03/26/der-dandy-auf-der-sinkenden-galeere/
\ No newline at end of file
-body: //*[@id = 'story text']\r
-author: //a[starts-with(@href, '/u/')]\r
-next_page_link: substring-after(//input[contains(@value, 'Next')]/@onclick, "self.location='")\r
-autodetect_next_page:yes\r
+body: //*[@id = 'storytext']
+author: //a[starts-with(@href, '/u/')]
+next_page_link: substring-after(//input[contains(@value, 'Next')]/@onclick, "self.location='")
+autodetect_next_page:yes
strip_id_or_class: 'a2a_kit'
test_url: http://www.fanfiction.net/s/6497403/1/Spartan_Love
\ No newline at end of file
-title: //h1\r
-author: //h5[@class='byline']//a\r
-date: //h5[@class='date']\r
-body: //figure[@class='node-poster'] | //div[contains(@class, "node-content")]\r
-strip_id_or_class: article-top-wrapper\r
-strip_id_or_class: footer-message\r
-strip_id_or_class: print-logo\r
-strip: //cite\r
-strip://*[@class='timestamp']\r
-strip://div[@id='page_right']\r
-strip://section[@id='header_region']\r
-strip://h1[@class='node-title']\r
-strip://div[@class='node-submitted']\r
-strip_id_or_class: skipnav\r
-test_url: http://www.fastcompany.com/3000226/link-between-quietness-and-productivity\r
+title: //h1
+author: //h5[@class='byline']//a
+date: //h5[@class='date']
+body: //figure[@class='node-poster'] | //div[contains(@class, "node-content")]
+strip_id_or_class: article-top-wrapper
+strip_id_or_class: footer-message
+strip_id_or_class: print-logo
+strip: //cite
+strip://*[@class='timestamp']
+strip://div[@id='page_right']
+strip://section[@id='header_region']
+strip://h1[@class='node-title']
+strip://div[@class='node-submitted']
+strip_id_or_class: skipnav
+test_url: http://www.fastcompany.com/3000226/link-between-quietness-and-productivity
test_url: http://www.fastcompany.com/3003586/6-simple-rituals-reach-your-potential-every-day
\ No newline at end of file
-# Title\r
-title: //p[@class='Content HeadlineShort']\r
-\r
-# Authors\r
-# some are known and have a link, others don't\r
-author: substring-after(//span[@class='Autor'], 'Von')\r
-\r
-# Date\r
-date: //span[@class='Datum']\r
-\r
-# Body\r
-body: //div[@class='Artikel']\r
-\r
-# Removements before body text\r
-strip: //div[@class='Breadcrumbs']\r
-strip: //div[@class='QuickSearchBox']\r
-strip: //div[@class='FAZArtikelEinleitung']\r
-strip: //div[@class='FAZArtikelReiter']\r
-strip: //div[@class='clear']\r
-\r
-# General removements\r
-strip: //span[@class='Bildnachweis']\r
-\r
-# Removements after body text\r
-strip: //div[@class='ArtikelAbbinder']\r
-strip: //div[@class='ArtikelKommentieren Artikelfuss GETS;tk;boxen.top-lesermeinungen;tp;content']\r
-strip: //div[@class='FAZArtikelKommentare FAZArtikelContent']\r
-strip: //div[@class='FAZArtikelFunktionen']\r
-strip: //div[@id='FAZContentRight']\r
-test_url: http://www.faz.net/aktuell/gesellschaft/ehe-haltbarkeitsformel-verliebe-dich-oft-verlobe-dich-selten-heirate-vielleicht-11685306.html
\ No newline at end of file
+# Title
+title: //p[@class='Content HeadlineShort']
+
+# Authors
+# some are known and have a link, others don't
+author: substring-after(//span[@class='Autor'], 'Von')
+
+# Date
+date: //span[@class='Datum']
+
+# Body
+body: //div[@class='Artikel']
+
+# Removals before body text
+strip: //div[@class='Breadcrumbs']
+strip: //div[@class='QuickSearchBox']
+strip: //div[@class='FAZArtikelEinleitung']
+strip: //div[@class='FAZArtikelReiter']
+strip: //div[@class='clear']
+
+# General removals
+strip: //span[@class='Bildnachweis']
+strip: //img[@class='MediaIcon']
+strip: //div[@class='ArtikelMediaLink']
+dissolve: //a[img]
+
+# Removals after body text
+strip: //div[@class='ArtikelAbbinder']
+strip: //div[@class='ArtikelKommentieren Artikelfuss GETS;tk;boxen.top-lesermeinungen;tp;content']
+strip: //div[@class='FAZArtikelKommentare FAZArtikelContent']
+strip: //div[@class='FAZArtikelFunktionen']
+strip: //div[@id='FAZContentRight']
+
+# Fix picture captions
+wrap_in(small): //span[@class='Bildunterschrift']/text()
+test_url: http://www.faz.net/aktuell/feuilleton/zum-tod-von-margaret-thatcher-die-reizfigur-12141919.html#Drucken
\ No newline at end of file
--- /dev/null
+title: //title
+
+body: //div[@id='content']
+
+strip: (//div[@id='content']/h2)[1]
+
+strip: //h2[contains(., 'mehr News')]/following::*
+strip: //h2[contains(., 'mehr News')]
+
+strip: //div[contains(@class, 'indizar')]/following::*
+strip: //div[contains(@class, 'indizar')]
+
+strip: //h1[contains(@class, 'single')]/preceding::*
+strip: //h1[contains(@class, 'single')]
+
+strip_id_or_class: plista_widget
+
+prune: no
+
+next_page_link: //a[contains(., 'Weiter')]
+
+test_url: http://www.fertigung.de/2013/04/igus-neuer-energiekettenkatalog/
+test_url: http://www.fertigung.de/2013/04/dynamisch-und-hochpraezise/
\ No newline at end of file
-body: id('storytext')\r
-author: //a[starts-with(@href, '/u/')]\r
-#next_page_link: substring-after(//input[contains(@value, 'Next')]/@onclick, "self.location='")\r
+body: id('storytext')
+author: //a[starts-with(@href, '/u/')]
+#next_page_link: substring-after(//input[contains(@value, 'Next')]/@onclick, "self.location='")
strip_id_or_class: 'a2a_kit'
test_url: http://www.fictionpress.com/s/2897964/1/All_We_Knew
\ No newline at end of file
-title: //h4\r
-author: //span[@class="author"]\r
-body: //div[@id="story"]\r
-strip_id_or_class: summary\r
-strip_id_or_class: meta\r
-strip_id_or_class: storyfoot\r
-convert_double_br_tags: yes\r
-prune: no\r
-\r
-# Note: this site still has trouble because single <br> tags are stripped, but I don't see a way to fix that with this interface.\r
+title: //h4
+author: //span[@class="author"]
+body: //div[@id="story"]
+strip_id_or_class: summary
+strip_id_or_class: meta
+strip_id_or_class: storyfoot
+convert_double_br_tags: yes
+prune: no
+
+# Note: this site still has trouble because single <br> tags are stripped, but I don't see a way to fix that with this interface.
test_url: http://www.ficwad.com/story/158977
\ No newline at end of file
-title: //meta[@property='og:title']/@content\r
-body: //div[@id='y-article-bd']\r
-body: //div[contains(@class, 'yom-art-content')]\r
-strip: //div[contains(@class, 'related-companies')]\r
-strip: //div[@id='y-article-related']\r
-strip: //div[@id='ypf-article-related']\r
-prune: no\r
-\r
-single_page_link: //div[@class='ft']//a[contains(@href, 'page=all')]\r
-\r
-test_url: http://sg.finance.yahoo.com/news/Motorola-takes-wraps-249-rsg-3508842732.html?x=0&.v=1\r
+title: //meta[@property='og:title']/@content
+body: //div[@id='y-article-bd']
+body: //div[contains(@class, 'yom-art-content')]
+strip: //div[contains(@class, 'related-companies')]
+strip: //div[@id='y-article-related']
+strip: //div[@id='ypf-article-related']
+prune: no
+
+single_page_link: //div[@class='ft']//a[contains(@href, 'page=all')]
+
+test_url: http://sg.finance.yahoo.com/news/Motorola-takes-wraps-249-rsg-3508842732.html?x=0&.v=1
test_url: http://finance.yahoo.com/news/super-young-retirement-savers.html
\ No newline at end of file
-date: //div[@class='notes']/a\r
-body: //div[@id='content']\r
-\r
-strip_id_or_class: tags\r
-strip_id_or_class: permalink\r
-strip_id_or_class: notes\r
-strip_id_or_class: post_nav\r
-strip: //div[@id='content']//h2\r
+date: //div[@class='notes']/a
+body: //div[@id='content']
+
+strip_id_or_class: tags
+strip_id_or_class: permalink
+strip_id_or_class: notes
+strip_id_or_class: post_nav
+strip: //div[@id='content']//h2
strip_id_or_class: right_column
test_url: http://findtheswagger.tumblr.com/post/11589145141/moe-resners-end-of-an-era-1957-giants-final
\ No newline at end of file
-title: //div[@class='articleTitle']\r
-author: //div[@class='articleAuthor']\r
-body: //div[@class='articleContent']\r
-prune: no\r
-convert_double_br_tags: yes\r
-\r
+title: //div[@class='articleTitle']
+author: //div[@class='articleAuthor']
+body: //div[@class='articleContent']
+prune: no
+convert_double_br_tags: yes
+
test_url: http://www.firstthings.com/article/2011/05/the-trouble-with-ayn-rand
\ No newline at end of file
-title: substring-after(//title, 'Right:')\r
-body: //div[@class = 'post-body']\r
-author: substring-after(//*[@class='post-author'], 'by')\r
-date: concat(//*[@class='date-header'], ' ', //*[@class='post-timestamp']/a)\r
-convert_double_br_tags: yes\r
+title: substring-after(//title, 'Right:')
+body: //div[@class = 'post-body']
+author: substring-after(//*[@class='post-author'], 'by')
+date: concat(//*[@class='date-header'], ' ', //*[@class='post-timestamp']/a)
+convert_double_br_tags: yes
test_url: http://www.fivethirtyeight.com/2010/07/does-rnc-have-structural-problems.html
\ No newline at end of file
--- /dev/null
+strip_id_or_class: linenos
+test_url: http://www.flyingmachinestudios.com/programming/whoops-dci-refactoring/
\ No newline at end of file
-author: //div[@class='authorDescription']/h2\r
-body: //div[@id='story']\r
-date: substring-before(substring-after(//p[@class='date'],'Erstellt am:'), '-')\r
-title: //h1[@class='detail']\r
-strip: //div[@class='fact']\r
+author: //div[@class='authorDescription']/h2
+body: //div[@id='story']
+date: substring-before(substring-after(//p[@class='date'],'Erstellt am:'), '-')
+title: //h1[@class='detail']
+strip: //div[@class='fact']
test_url: http://fm4.orf.at/stories/1689156/
\ No newline at end of file
-title: normalize(//h1)\r
-\r
-author: //td/p[position()=last()]/em\r
-\r
-# I swear, this is really the best way to do this\r
-date: normalize(//td[contains(@style, "color: #ffffff")])\r
-\r
-# my god, it's full of tables\r
-body: /table/tbody/tr[5]//table/tbody//table/tbody/tr/td\r
-strip: //h1\r
-\r
-# the following two lines strip the byline at the end of the article (the byline is a <p> that consists of an em dash and then some text in an <em>). I have no idea why I can't just strip //p[position()=last()], but trying to do so includes a bunch of other crap in the output.\r
-strip: //p[position()=last()]/em\r
+title: normalize-space(//h1)
+
+author: //td/p[position()=last()]/em
+
+# I swear, this is really the best way to do this
+date: normalize-space(//td[contains(@style, "color: #ffffff")])
+
+# my god, it's full of tables
+body: /table/tbody/tr[5]//table/tbody//table/tbody/tr/td
+strip: //h1
+
+# the following two lines strip the byline at the end of the article (the byline is a <p> that consists of an em dash and then some text in an <em>). I have no idea why I can't just strip //p[position()=last()], but trying to do so includes a bunch of other crap in the output.
+strip: //p[position()=last()]/em
strip: //p[position()=last()]/child::text()
test_url: http://www.fnal.gov/pub/today/archive_2011/today11-11-09_MuonDepartmentReadMore.html
\ No newline at end of file
-title: //h1\r
-\r
-author: //div[@class='articleContent small']/div[@class='textBlock']//span[@class='created']\r
-\r
-date: //div[@class='articleHead']/span[@class='created']\r
-\r
-body: //div[@id='article']\r
-\r
-strip: //span[@class='markerText']\r
-strip: //div[@class='articleContent small']/div[@class='textBlock']//span[@class='created']\r
-strip: //div[@class='sidebar']\r
-strip: //div[@class='starbar']\r
-strip: //div[@class='actions clearfix']\r
-strip: //div[@id='commentForm']\r
-strip: //div[@id='commentSent']\r
-strip: //div[@id='comments']\r
-strip: //div[@class='similarityBlock']\r
+title: //h1
+
+author: //div[@class='articleContent small']/div[@class='textBlock']//span[@class='created']
+
+date: //div[@class='articleHead']/span[@class='created']
+
+body: //div[@id='article']
+
+strip: //span[@class='markerText']
+strip: //div[@class='articleContent small']/div[@class='textBlock']//span[@class='created']
+strip: //div[@class='sidebar']
+strip: //div[@class='starbar']
+strip: //div[@class='actions clearfix']
+strip: //div[@id='commentForm']
+strip: //div[@id='commentSent']
+strip: //div[@id='comments']
+strip: //div[@class='similarityBlock']
test_url: http://www.focus.de/politik/ausland/ein-jahr-nach-bombenanschlag-u-bahn-attentaeter-von-minsk-hingerichtet_aid_724958.html
\ No newline at end of file
--- /dev/null
+author: /html/body/table[3]/tbody/tr/td[1]/table[2]/tbody/tr[1]/td[2]
+date: /html/body/table[3]/tbody/tr/td[1]/table[2]/tbody/tr[2]/td[2]
+body: //div[@class='main']
+test_url: http://www.folklore.org/StoryView.py?story=Calculator_Construction_Set.txt
\ No newline at end of file
--- /dev/null
+body: //div[@id='print-area']
+title: //h1[contains(@class, 'section-title')]
+single_page_link: //a[@id='prntrec']
+strip_image_src: food-logo-small
+strip_id_or_class: timer
+strip_id_or_class: photo-sm
+strip_id_or_class: page-header
+
+prune: no
+
+test_url: http://www.food.com/recipe/couldnt-be-easier-bbq-pork-tenderloin-crock-pot-317152
\ No newline at end of file
-body: //div[@class='entry-content']\r
-date: //meta[@name="date"]/@content\r
-author: //meta[@name="author"]/@content\r
-\r
-strip_id_or_class: ecapShell\r
-strip_id_or_class: noindent\r
-strip_id_or_class: targetedPromotion\r
-\r
-prune: no\r
-\r
+body: //div[@class='entry-content']
+date: //meta[@name="date"]/@content
+author: //meta[@name="author"]/@content
+
+strip_id_or_class: ecapShell
+strip_id_or_class: noindent
+strip_id_or_class: targetedPromotion
+
+prune: no
+
test_url: http://www.fool.com/investing/general/2012/01/27/dfc-global-beats-up-on-analysts-yet-again.aspx
\ No newline at end of file
-title: //hgroup//h1\r
-title: //span[@class='mainarttitle']\r
-\r
-body: //div[@id='leftRail']//div[contains(@class, 'body')]\r
-\r
-author: //meta[@name="author"]/@content\r
-author: //span[@class='mainartauthor']\r
-\r
-date: substring-before(//hgroup//h6, '@')\r
-date: //span[@class='mainartdate']\r
-\r
-prune: no\r
-\r
-single_page_link: //a[contains(@href, '/print/')]\r
-\r
-test_url: http://www.forbes.com/forbes/2011/0509/technology-frog-design-jan-chipchase-ethnographer-birth-cool_print.html
\ No newline at end of file
+title: //hgroup//h1
+title: //span[@class='mainarttitle']
+
+body: //div[@id='leftRail']//div[contains(@class, 'body')]
+
+author: //meta[@name="author"]/@content
+author: //span[@class='mainartauthor']
+
+date: substring-before(//hgroup//h6, '@')
+date: //span[@class='mainartdate']
+
+prune: no
+strip: //aside
+strip_id_or_class: sticky_sharing
+strip_id_or_class: pagination
+strip_id_or_class: controlsbox
+strip_id_or_class: storyboxes
+strip_id_or_class: sponsoredlinks
+strip_id_or_class: nextpage
+strip_id_or_class: contextuallinks
+strip_id_or_class: article_actions
+strip_id_or_class: engagement_block
+
+single_page_link: //a[contains(@href, '/print/')]
+
+test_url: http://www.forbes.com/forbes/2011/0509/technology-frog-design-jan-chipchase-ethnographer-birth-cool_print.html
+test_url: http://www.forbes.com/sites/bruceupbin/2012/09/11/the-iphone-5-winners-and-losers/
\ No newline at end of file
--- /dev/null
+# TIDY
+#tidy: no
+# PRUNE
+#prune: no
+
+# SINGLE PAGE
+single_page_link: //div[@class='showlinks']/a
+
+# TITLE
+title: //h1[@class="title"]
+
+# AUTHOR
+author: //div[contains(@class,"field-field-article-display-authors")]/div/div/a/text()
+
+# DATE
+date: //div[contains(@class,"field-field-article-issue")]/div/div/a/text() | //span[@class="date-display-single"]
+
+# BODY
+body: //div[contains(@class,"content-resize")]
+
+# Remove clutter
+strip: //div[@class="article-sidebar"]
+strip: //div[@class="showlinks"]
+strip: //div[contains(@class,"premium-box")]
+strip: //div[contains(@class,"premium-box")]
+strip: //table[contains(@border,"2")]
+
+# Fix picture captions
+wrap_in(small): //p/img/following-sibling::em
+wrap_in(small): //p[img]/text()
+
+# Fix sub-headlines
+wrap_in(h3): //div[contains(@class,"field-field-article-subtitle")]/div/div/text()
+test_url: http://www.foreignaffairs.com/articles/138810/pierre-n-leval/the-long-arm-of-international-law
\ No newline at end of file
-title: //div[@id='art-mast']//h1\r
-author: substring-after(//span[@id='by-line'], 'BY ')\r
-date: //span[@id='pub-date']\r
-body: //div[@id='art-mast']//h2 | //div[@id='art-mast']/h3 | //div[@id='art-body']//div[@class='translateBody']\r
-strip: //div[@id='share-box']\r
-prune: no\r
-\r
-single_page_link: //span[@id='controls']/a[contains(@href, 'print=yes')]\r
-\r
-test_url: http://www.foreignpolicy.com/articles/2011/08/01/a_murderers_manifesto_and_me\r
-test_url: test_url: http://www.foreignpolicy.com/articles/2012/02/29/five_years_in_damascus
\ No newline at end of file
+title: //div[@class='translateHead']//h1 | //div[@id='art-mast']//h1
+author: substring-after(//span[@id='by-line'], 'BY ')
+date: //span[@id='pub-date']
+body: //div[@id='art-mast']/h2 | //div[@class='translateBody'] | //div[@id='art-body']
+#Strip inside article content
+strip: //div[@id='share-box']
+strip: //div[@id='special-box']
+
+prune: no
+
+single_page_link: //span[@id='controls']/a[contains(@href, 'print=yes')]
+single_page_link: //a[text()='SINGLE PAGE']
+
+test_url: http://www.foreignpolicy.com/articles/2011/08/01/a_murderers_manifesto_and_me
+test_url: http://www.foreignpolicy.com/articles/2012/02/29/five_years_in_damascus
\ No newline at end of file
-title: //div[@class="articleHeader"]/h1\r
-author: //p[@class="byline"]\r
-date: //p[contains(@class,"publishedDate")]/span\r
-# remove the right menu\r
-strip: //div[contains(@class,"aside")]\r
-# remove some SharePoint webpart label junk\r
-strip: //div[@id="ctl00_PlaceHolderMain_ArticleLeadField_label"]\r
+title: //div[@class="articleHeader"]/h1
+author: //p[@class="byline"]
+date: //p[contains(@class,"publishedDate")]/span
+# remove the right menu
+strip: //div[contains(@class,"aside")]
+# remove some SharePoint webpart label junk
+strip: //div[@id="ctl00_PlaceHolderMain_ArticleLeadField_label"]
strip: //div[@id="ctl00_PlaceHolderMain_PublishingPageContentField_label"]
test_url: http://forsvaret.no/aktuelt/publisert/nyheter/Sider/F5-fly-til-Skedsmo.aspx
\ No newline at end of file
-prune: no\r
-\r
-author: //meta[@name="dc.publisher"]/@content\r
-date: //meta[@name="dc.date"]/@content\r
-strip: //p[contains(@class, 'contributor vcard')]\r
-replace_string(<ul><li><div class="photo">): <div class="photo">\r
-strip: //p[a[contains(., 'Click here to read more on this story ')]]\r
-\r
+prune: no
+
+author: //meta[@name="dc.publisher"]/@content
+date: //meta[@name="dc.date"]/@content
+strip: //p[contains(@class, 'contributor vcard')]
+replace_string(<ul><li><div class="photo">): <div class="photo">
+strip: //p[a[contains(., 'Click here to read more on this story ')]]
+
test_url: http://www.foxnews.com/entertainment/2011/05/04/dwayne-johnson-guys-grow-pair-driving-hybrid/
\ No newline at end of file
-body: //div[@id="projectDetailsContent"]//td\r
+body: //div[@id="projectDetailsContent"]//td
test_url: http://www.freelancer.com/projects/PHP-Website-Design/debug-Forum-website-code.html
\ No newline at end of file
-body: //div[@class = 'instapaperbody']\r
-convert_double_br_tags: no\r
-date: //div[@class='instadate']\r
+body: //div[@class = 'instapaperbody']
+convert_double_br_tags: no
+date: //div[@class='instadate']
title: //h2[@class = 'instatitle']
test_url: http://freytag-film.com/blog/artikel/shooting_a_feature_film_in_10_days
\ No newline at end of file
--- /dev/null
+body: //div[contains(@class, 'layout__inner')]//div[contains(@class, 'file-image') or contains(@class, 'node__content')]
+author: //article//div[contains(@class, 'field-byline')]
+strip_id_or_class: rekommenderade
+strip_id_or_class: disqus
+strip_id_or_class: annonser
+
+test_url: http://www.fria.nu/artikel/112079
+test_url: http://www.fria.nu/taxonomy/term/1928/all/feed
\ No newline at end of file
--- /dev/null
+body: //div[contains(@class, 'layout__inner')]//div[contains(@class, 'file-image') or contains(@class, 'node__content')]
+author: //article//div[contains(@class, 'field-byline')]
+strip_id_or_class: rekommenderade
+strip_id_or_class: disqus
+strip_id_or_class: annonser
+
+test_url: http://www.friatidningen.se/artikel/112074
\ No newline at end of file
-#body: (//div[@class='ftr-yt-vid'])[1]\r
-body: (//blockquote[contains(@class, 'postcontent')])[1]\r
-body: (//div[starts-with(@id, 'post_message')])[1]\r
-\r
-prune: no\r
-tidy: no\r
-\r
-#replace_string(<iframe title="YouTube video player"): <div class="ftr-yt-vid"><iframe title="YouTube video player"\r
-#replace_string(</iframe>): </iframe> </div>\r
-\r
+#body: (//div[@class='ftr-yt-vid'])[1]
+body: (//blockquote[contains(@class, 'postcontent')])[1]
+body: (//div[starts-with(@id, 'post_message')])[1]
+
+prune: no
+tidy: no
+
+#replace_string(<iframe title="YouTube video player"): <div class="ftr-yt-vid"><iframe title="YouTube video player"
+#replace_string(</iframe>): </iframe> </div>
+
test_url: http://www.friendskorner.com/forum/f137/debate-personal-lives-leaders-west-vs-pakistan-must-read-297989/
\ No newline at end of file
-body: //div[contains(@class, 'ft-story-body')]\r
-\r
-author: substring-after(//div[contains(@class, 'ft-story-header')]/p[1], 'By ')\r
+body: //div[contains(@class, 'ft-story-body')]
+
+author: substring-after(//div[contains(@class, 'ft-story-header')]/p[1], 'By ')
date: substring-before(substring-after(//div[contains(@class, 'ft-story-header')]/p[2], 'Published:'), '|')
test_url: http://www.ft.com/cms/s/2/e1be4b5a-620c-11e0-8ee4-00144feab49a.html
\ No newline at end of file
--- /dev/null
+# Modified to define the single_page_link
+# This filter is tested on:
+# http://www.ftchinese.com/story/001047373
+# http://www.ftchinese.com/story/001047631
+# http://www.ftchinese.com/story/001047622/?print=y
+# http://www.ftchinese.com/story/001049052
+# http://www.ftchinese.com/story/001049088
+
+title:substring-before(//title, '-')
+author: //div[@class='byline']/a
+date: //a[@class='storytime']
+#Set date in print view
+#date: //div[@class='byline']/a/following-sibling::a
+body: //div[@id="bodytext"]
+strip://div[@class='pagination']
+single_page_link://div[@class='pagination']/a[.='全文']
+#next_page_link: //div[@class='pagination']//a[.='下一页']
+test_url: http://www.ftchinese.com/story/001049088
\ No newline at end of file
-body: //div[@class='boxIntroHead']/span[@class='h3'] | //div[@class='section']/div[@class='paragraph' or @class='embObjLeft']\r
-single_page_link: //a[@class='icon print']\r
+body: //div[@class='boxIntroHead']/span[@class='h3'] | //div[@class='section']/div[@class='paragraph' or @class='embObjLeft']
+single_page_link: //a[@class='icon print']
-test_url: http://www.ftd.de/it-medien/it-telekommunikation/:mobilfunk-vivendi-und-vodafone-trennen-sich-in-frankreich/60034691.html\r
+test_url: http://www.ftd.de/it-medien/it-telekommunikation/:mobilfunk-vivendi-und-vodafone-trennen-sich-in-frankreich/60034691.html
test_url: http://www.ftd.de/it-medien/medien-internet/:verkauf-von-warner-music-musikbranche-auf-dem-sprung/60048185.html
\ No newline at end of file
-body: //div[@class = 'entry']\r
+body: //div[@class = 'entry']
test_url: http://www.fubiz.net/2011/05/31/world-press-photo-2011/
\ No newline at end of file
-date: //span[@class='date']\r
-strip: //div[@class='postsidebar']\r
-body: //div[@class='singlepost']\r
-title: //div[@class='singlepost']/h1\r
-move_into(//div[@class='singlepost']): //div[@class='info']\r
-strip: //div[@class='gallery']\r
-strip: //div[@class='biggallery']\r
-strip: //ul[@class='social']\r
-strip: //ul[@class='social_mail']\r
+date: //span[@class='date']
+strip: //div[@class='postsidebar']
+body: //div[@class='singlepost']
+title: //div[@class='singlepost']/h1
+move_into(//div[@class='singlepost']): //div[@class='info']
+strip: //div[@class='gallery']
+strip: //div[@class='biggallery']
+strip: //ul[@class='social']
+strip: //ul[@class='social_mail']
test_url: http://futurezone.at/future/5502-erste-galileo-satelliten-starten-ins-all.php
\ No newline at end of file
-# default view title\r
-title: //span[@class='newsTitle']\r
-# print view title\r
-title: //h3[@class='title']\r
-\r
-# default view author\r
-author: //span[@class='newsAuth']/a\r
-author: substring-after(//span[@class='newsAuth'], 'by ')\r
-\r
-# default view date\r
-date: //td[@class='newsDate']\r
-\r
-# default view body\r
-body: //td[@class='featureText']\r
-body: //td[@class='newsText']\r
-\r
-strip: //h3[@class='title']\r
-\r
+# default view title
+title: //span[@class='newsTitle']
+# print view title
+title: //h3[@class='title']
+
+# default view author
+author: //span[@class='newsAuth']/a
+author: substring-after(//span[@class='newsAuth'], 'by ')
+
+# default view date
+date: //td[@class='newsDate']
+
+# default view body
+body: //td[@class='featureText']
+body: //td[@class='newsText']
+
+strip: //h3[@class='title']
+
single_page_link: //a[contains(@href, '?print=1')]
test_url: http://www.gamasutra.com/view/feature/132559/staying_power_rethinking_feedback_.php
\ No newline at end of file
-title: //meta[@property="og:title"]/@content\r
-body: //div[@id='GBTVPlayer'] | //div[contains(@class, 'col490')]\r
-\r
-prune: no\r
-\r
-strip_id_or_class: noprint\r
-strip: //div[@id='gbNewsTextContent']/following-sibling::*\r
-\r
-test_url: http://www.gameblog.fr/news/26330-les-sims-3-showtime-s-annonce-en-video\r
+title: //meta[@property="og:title"]/@content
+body: //div[@id='GBTVPlayer'] | //div[contains(@class, 'col490')]
+
+prune: no
+
+strip_id_or_class: noprint
+strip: //div[@id='gbNewsTextContent']/following-sibling::*
+
+test_url: http://www.gameblog.fr/news/26330-les-sims-3-showtime-s-annonce-en-video
test_url: http://www.gameblog.fr/news/26306-mise-a-jour-du-dashboard-de-la-xbox-360-disponible
\ No newline at end of file
--- /dev/null
+title: //h1[@class='title']
+
+date: substring-before(substring-after(//div[@class='comment-bubble']/.., 'Posted'), 'by')
+
+body: //div[@class='the-content']
+
+strip: //div[@class='article-image responsive']
+
+strip_id_or_class: 'pullquote'
+test_url: http://gamechurch.com/virtual-gun-control-the-best-amendment/
\ No newline at end of file
--- /dev/null
+body: //div[@class='pageContent description']
+date: //div[@class='authorsAndDateTime']/span[@title]
+single_page_link: //div[@class='pages']/a[last()-1]
+
+# fix images and captions
+wrap_in(figure): //div[contains(concat(' ', @class, ' '), ' image')]
+wrap_in(figcaption): //div[contains(concat(' ', @class, ' '), ' image')]/div[@class='text']/text()
+
+# get rid of videos
+strip_id_or_class: 'video full'
+test_url: http://www.gamer.no/artikler/142455/slik-blei-ambisiose-dragons-dogma-skapt/
\ No newline at end of file
--- /dev/null
+title: //div[@id='content']/div/h1
+
+author: //a[@itemprop='reviewer']
+
+date: //time[@itemprop='dtreviewed']/@datetime
+
+body: //div[@id='breadtext']
+
+# fix for NOT magically removing anchors with text identical to title
+dissolve: //a[text()=//div[@id='content']/div/h1/text()]
+test_url: http://www.gamereactor.no/previews/177481/The+Evil+Within/?sid=38b5bd30f56f1b7214de4ff5bed4b76f
\ No newline at end of file
-tidy: no\r
-\r
+tidy: no
+
test_url: http://www.garythink.com/eft/testing.html
\ No newline at end of file
-# These should work, but don't. They were given by Firefox XPather extension\r
-title: //article//header//a//h1\r
+# These should work, but don't. They were given by Firefox XPather extension
+title: //article//header//a//h1
body: //article//section
test_url: http://gasteroprod.com/blog/faut-il-continuer-a-supporter-internet-explorer-6.html
\ No newline at end of file
-body: //div[@class='panel']\r
-strip: //div[@style='float:right']\r
-strip: //span[@class='titulosHomePublicidad']\r
-strip: //div[@id='TitTop5Der']\r
-strip: //img[@src='/ImagesGatoPardo/LogoGatopardo.png']\r
-\r
+body: //div[@class='panel']
+strip: //div[@style='float:right']
+strip: //span[@class='titulosHomePublicidad']
+strip: //div[@id='TitTop5Der']
+strip: //img[@src='/ImagesGatoPardo/LogoGatopardo.png']
+
prune: yes
test_url: http://www.gatopardo.com/ReportajesGP.php?R=95
\ No newline at end of file
-body: //div[@class="post-body"]\r
-\r
-# Remove 'content is restricted'\r
-strip: //div[@id='agegate_IDHERE']\r
-\r
+body: //div[@class="post-body"]
+
+# Remove 'content is restricted'
+strip: //div[@id='agegate_IDHERE']
+
test_url: http://gawker.com/#!5782070/russian-bomb-squad-successfully-defuses-sex-toy
\ No newline at end of file
-author: substring-after(//span[@class='storyauthor'],'Posted by')\r
+author: substring-after(//span[@class='storyauthor'],'Posted by')
date: //span[@class='storydate']
test_url: http://www.geeksofdoom.com/2012/03/14/robert-rodriguez-says-machete-kills-and-sin-city-2-will-film-this-year/
\ No newline at end of file
-body: //div[@id = 'article']\r
+body: //div[@id = 'article']
strip: //div[@id = 'klasbox']
test_url: http://www.geenstijl.nl/mt/archieven/2010/10/vrouw_lange_frans_wou_baas_b_d.html
\ No newline at end of file
-body: //div[@class='post']\r
+body: //div[@class='post']
strip: //ul[@id='bookmark_single']
test_url: http://getnews.jp/archives/117312
\ No newline at end of file
-# 2011-11-19 - carlo@... - Initial setup.\r
-\r
-strip_id_or_class: user-review-detail\r
-strip: //h1\r
-\r
-body: //div[@class="wiki-content"] | //div[@class="section-bd"] | //div[@class="news-story"]\r
-\r
-author: //span[@class="reviewer"] | //p[@class="byline"]/a/text()\r
-date: //span[@class="dtreviewed"]\r
+# 2011-11-19 - carlo@... - Initial setup.
+
+strip_id_or_class: user-review-detail
+strip: //h1
+
+body: //div[@class="wiki-content"] | //div[@class="section-bd"] | //div[@class="news-story"]
+
+author: //span[@class="reviewer"] | //p[@class="byline"]/a/text()
+date: //span[@class="dtreviewed"]
test_url: http://www.giantbomb.com/the-elder-scrolls-v-skyrim/61-33394/
\ No newline at end of file
-tidy:no\r
-title://h2[@class="title"]\r
-# author:"Ben Miller"\r
-date://div[@id="stats"]/span\r
-strip_id_or_class:stats\r
-strip_id_or_class:breadcrumbs\r
-strip_id_or_class:gn-why-content\r
-strip_id_or_class:single-social\r
-strip_id_or_class:sidebar-ads\r
-strip_id_or_class:sidebar-top\r
-strip_id_or_class:footer\r
-strip_id_or_class:post_meta\r
-# strip_id_or_class:\r
-# strip_id_or_class:\r
-# strip_id_or_class:\r
-# strip_id_or_class:\r
-# strip_id_or_class:\r
-# strip_id_or_class:\r
+tidy:no
+title://h2[@class="title"]
+# author:"Ben Miller"
+date://div[@id="stats"]/span
+strip_id_or_class:stats
+strip_id_or_class:breadcrumbs
+strip_id_or_class:gn-why-content
+strip_id_or_class:single-social
+strip_id_or_class:sidebar-ads
+strip_id_or_class:sidebar-top
+strip_id_or_class:footer
+strip_id_or_class:post_meta
+# strip_id_or_class:
+# strip_id_or_class:
+# strip_id_or_class:
+# strip_id_or_class:
+# strip_id_or_class:
+# strip_id_or_class:
test_url: http://www.giga.de/benm/2011/10/17/probleme-mit-ios-5-wenn-die-daten-weg-sind/#more-58033
\ No newline at end of file
-date: //meta[@name='DC.date.issued']/@content\r
-date: //span[@class='post-meta the-date']\r
-\r
-title: //meta[@property='og:title']/@content\r
-\r
-author: //meta[@name='DC.creator']/@content\r
-\r
-body: //div[contains(@class, 'post-sub-head') or starts-with(@id, 'post-content-')]\r
-\r
-find_string: id="content"\r
-replace_string: id="content-ignore"\r
-\r
-strip_id_or_class: sharedaddy\r
-\r
-prune: no\r
-\r
-test_url: http://gigaom.com/2011/10/24/groupon-google-lawsuit/
\ No newline at end of file
+date: //meta[@name='dcterms.created']/@content
+title: //meta[@property='og:title']/@content
+author: //section[@class="post-meta"]//a[@rel="author"]
+
+body: //div[starts-with(@id, 'post-content-')]
+
+strip_id_or_class: sharedaddy
+
+prune: no
+
+test_url: http://gigaom.com/2011/10/24/groupon-google-lawsuit/
+test_url: http://gigaom.com/2012/12/26/snapchat-rises-why-pokes-decline-shows-facebooks-inability-to-invent/
\ No newline at end of file
-single_page_link: //p[@id='skip']//a[contains(@href, 'skip')]\r
+single_page_link: //p[@id='skip']//a[contains(@href, 'skip')]
test_url: http://gihyo.jp/dev/serial/01/machine-learning/0010
\ No newline at end of file
-body: //div[@class="highlight"]/pre\r
-\r
-prune: no\r
-tidy: no\r
-\r
+body: //div[@class="highlight"]/pre
+
+prune: no
+tidy: no
+
test_url: https://gist.github.com/1258908
\ No newline at end of file
-single_page_link: //div[@id="content"]//h2/a\r
+single_page_link: //div[@id="content"]//h2/a
test_url: http://givemesomethingtoread.com/post/6285838917/the-baddest-lawyer-in-the-history-of-jersey
\ No newline at end of file
-body: //div[@id="leadimage" or @class="postcontent"]\r
-author: //div[@class="contentauthor"]\r
-date: //div[@class="timestamp"]\r
-\r
-prune: no\r
-\r
+body: //div[@id="leadimage" or @class="postcontent"]
+author: //div[@class="contentauthor"]
+date: //div[@class="timestamp"]
+
+prune: no
+
test_url: http://www.gizmodo.co.uk/2013/02/bbc-forcing-poor-old-sir-david-attenborough-to-go-on-twitter/
\ No newline at end of file
-body: //div[@class="post-body" or contains(@class, 'illustration top')]\r
-author: (//cite//span[@class="plus-icon"])[1]\r
-date: //span[@class="date"]\r
-\r
-prune: no\r
-\r
-test_url: http://gizmodo.com/5880147/kuhn-rikon-improves-their-spice-grinder-with-grade-school-science
\ No newline at end of file
+#body: //div[@class="post-body" or contains(@class, 'illustration top')]
+body: //div[contains(@class, 'image-annotation-box') or contains(@class, 'post-content')]
+#author: (//cite//span[@class="plus-icon"])[1]
+author: //span[contains(@class, 'display-name')]
+date: //span[@class="date"]
+
+prune: no
+
+test_url: http://gizmodo.com/5880147/kuhn-rikon-improves-their-spice-grinder-with-grade-school-science
+test_url: http://gizmodo.com/what-van-goghs-paintings-would-look-like-if-they-came-874035680
+test_url: http://gizmodo.com/vip.xml
\ No newline at end of file
--- /dev/null
+title: //h1
+
+body: //div[@id='destaques']//div[contains(@class, 'img')] | //div[@id='maincontent']//p
+
+test_url: http://gizmodo.uol.com.br/nvidia-gtx-titan-z/
+test_url: http://gizmodo.uol.com.br/perfil-mark-zuckerberg-hackeado/
-# Look for Open Graph data - http://ogp.me\r
-title: //meta[@property="og:title"]/@content\r
-date: //meta[@property="article:published_time"]/@content\r
-# article:author is someties URL, e.g. on guardian.co.uk
\ No newline at end of file
+# Look for Open Graph data - http://ogp.me
+title: //meta[@property="og:title"]/@content
+date: //meta[@property="article:published_time"]/@content
+# article:author is sometimes a URL, e.g. on guardian.co.uk
+
+# Remove Google Publisher Tags: https://support.google.com/dfp_sb/answer/1649768?hl=en
+#strip_id_or_class: div-gpt-ad
+
+# Strip doubleclick image ads
+strip_image_src: doubleclick.net
+
+# If you get chunks of Javascript code appearing in the extracted output, try uncommenting the lines below.
+# This tries to convert script tags to hidden div elements (which Full-Text RSS removes).
+# If you notice issues with this approach, please let us know.
+#find_string: <script
+#replace_string: <div style="display:none"
+#find_string: </script>
+#replace_string: </div>
\ No newline at end of file
-body: //div[@id='content']\r
-\r
-strip: //p[@class='top']\r
-strip: //h2[.='Where next?']\r
-strip_id_or_class: where-next\r
-strip_id_or_class: social-bookmarks\r
-strip_id_or_class: link-to-here\r
-strip_id_or_class: options-heading\r
-strip_id_or_class: page-options-content\r
-strip_id_or_class: page-info-bottom\r
-\r
-tidy: no\r
-prune: no\r
-\r
+body: //div[@id='content']
+
+strip: //p[@class='top']
+strip: //h2[.='Where next?']
+strip_id_or_class: where-next
+strip_id_or_class: social-bookmarks
+strip_id_or_class: link-to-here
+strip_id_or_class: options-heading
+strip_id_or_class: page-options-content
+strip_id_or_class: page-info-bottom
+
+tidy: no
+prune: no
+
test_url: http://www.globalissues.org/article/39/a-primer-on-neoliberalism
\ No newline at end of file
--- /dev/null
+title: //h1[@class="entry-title"]
+
+body: //div[@class='materia-titulo']/h2 | //*[@id="materia-letra"]
+
+date: //abbr[@class="published"]
+date: //abbr[@class="updated"]
+
+author: //*[@class="author"]/strong
+
+strip: //div[contains(@class,'foto')]/strong
+strip: //div[contains(@class,'frase-materia')]/div[@class='autor']
+strip: //div[contains(@class,'saibamais')]
+strip: //*[contains(text(),'Clique aqui e veja mais')]/ancestor::p
+strip: //ul[@class="toolbar"]
+
+# quotes
+wrap_in(blockquote): //div[@id='materia-letra']//div[contains(@class,'frase-materia')]/div[@class='frase']
+
+prune: no
+
+replace_string([Clique aqui e veja mais vídeos do Fluminense]): []
+
+test_url: http://globoesporte.globo.com/atletismo/noticia/2013/08/michael-johnson-diz-que-bolt-e-melhor-da-historia-nao-ha-duvidas.html
+test_url: http://globoesporte.globo.com/futebol/futebol-internacional/futebol-espanhol/noticia/2013/08/barca-atropela-levante-e-neymar-passa-em-branco-em-estreia-oficial.html
+test_url: http://globoesporte.globo.com/futebol/times/fluminense/noticia/2013/08/poupado-no-sabado-felipe-se-diz-pronto-para-ser-titular-contra-o-goias.html
--- /dev/null
+title: //article[@id='material']/header/h1
+author: //article[@id='material']/header/div[2]/p
+date: //article[@id='material']/header/p/time[1]
+body: //section[@id='tresc']
+next_page_link: .//section[@id='tresc']/div[@class='stronicowanie']/a[@rel='next']
+strip://div[@class='podobneSonda']
+
+test_url: http://www.gloswielkopolski.pl/artykul/803547,abc-telemarketingu-praca-ktora-zwalnia-z-myslenia,id,t.html
\ No newline at end of file
-title: //div[@id='article_headline']//h1\r
-date: //div[contains(@class, 'articleDate')]//h4\r
-body: //div[@id='article_headline']/h2 | //div[@id='large_article_image' or @id='article_content']\r
-\r
-strip_id_or_class: relatedLinksBox\r
-strip_id_or_class: betting-widget\r
-strip_image_src: install_flash.gif\r
-\r
-strip: //table[contains(@style, 'float: right; width: 285px;')]\r
-strip: //div[@class='caption']\r
-\r
-tidy: no\r
-prune: no\r
-\r
-test_url: http://www.goal.com/en-gb/news/3284/euro-2012/2012/05/31/3139032/video-profile-back-to-his-very-best-for-bayern-frances-flair-and-\r
+title: //div[@id='article_headline']//h1
+date: //div[contains(@class, 'articleDate')]//h4
+body: //div[@id='article_headline']/h2 | //div[@id='large_article_image' or @id='article_content']
+
+strip_id_or_class: relatedLinksBox
+strip_id_or_class: betting-widget
+strip_image_src: install_flash.gif
+
+strip: //table[contains(@style, 'float: right; width: 285px;')]
+strip: //div[@class='caption']
+
+tidy: no
+prune: no
+
+test_url: http://www.goal.com/en-gb/news/3284/euro-2012/2012/05/31/3139032/video-profile-back-to-his-very-best-for-bayern-frances-flair-and-
test_url: http://www.goal.com/en-gb/news/3284/euro-2012/2012/05/31/3139869/lampard-injury-a-bitter-blow-for-england-and-sorry-way-to#
\ No newline at end of file
-# Jens Kohl, jens.kohl@...\r
-# - Added publication date\r
-# - Striped pagination block\r
-# - Added single page link\r
-# - Added xpath-querys for the printer friendly version\r
-\r
-title: //h1\r
-body: //div[@class='formatted']\r
-prune: no\r
-\r
-date: substring-after(//li[2][@class="text1"], 'Datum:')\r
-strip: //ol[@class="list-chapters"]\r
-strip_comments: yes\r
-\r
-# next: commands for printer friendly pages\r
-single_page_link: //a[contains(@href, 'print.php?a=')]/@href\r
-title: //body/h3\r
-strip_image_src: staticrl/images/logo.jpg\r
-strip_image_src: http://cpx.golem.de/cpx.php?class=7\r
-strip: //body/h3\r
-strip: //body/b[1]\r
-strip: //body/b[2]\r
-strip: //body/b[3]\r
-strip: //div[1]\r
+# Jens Kohl, jens.kohl@...
+# - Added publication date
+# - Stripped pagination block
+# - Added single page link
+# - Added XPath queries for the printer-friendly version
+
+title: //h1
+body: //div[@class='formatted']
+prune: no
+
+date: substring-after(//li[2][@class="text1"], 'Datum:')
+strip: //ol[@class="list-chapters"]
+strip_comments: yes
+
+# next: commands for printer friendly pages
+single_page_link: //a[contains(@href, 'print.php?a=')]/@href
+title: //body/h3
+strip_image_src: staticrl/images/logo.jpg
+strip_image_src: http://cpx.golem.de/cpx.php?class=7
+strip: //body/h3
+strip: //body/b[1]
+strip: //body/b[2]
+strip: //body/b[3]
+strip: //div[1]
test_url: http://www.golem.de/1112/88696.html
\ No newline at end of file
-title: //div[@class="title"]/div/h1\r
-body: //div[@class="body"]\r
-date: //li[@class="date-time"]\r
+title: //div[@class="title"]/div/h1
+body: //div[@class="body"]
+date: //li[@class="date-time"]
test_url: http://www.good.is/post/why-amazon-is-the-next-top-tech-company/
\ No newline at end of file
--- /dev/null
+strip_id_or_class: gutter
+test_url: http://goodfil.ms/blog/posts/2012/08/13/angularjs-and-the-goodfilms-mobile-site-part-1/
\ No newline at end of file
-date: //meta[@name='og:article:published_time']/@value\r
-\r
-body: //div[@class='itemIntroText' or @class='itemImageBlock' or @class='itemFullText']\r
-\r
-strip_id_or_class: itemImageGallery\r
-\r
-# remove extras at end of post content\r
-find_string: <div style="margin:5px 0 10px;">\r
-replace_string: </div></body></html><!--\r
-\r
-prune: no\r
-\r
-test_url: http://www.gossip-tv.gr/story/158902/aggelike-daliane-semera-duskoleuontai-oloi-sta-epaggelmatika-tous\r
+date: //meta[@name='og:article:published_time']/@value
+
+body: //div[@class='itemIntroText' or @class='itemImageBlock' or @class='itemFullText']
+
+strip_id_or_class: itemImageGallery
+
+# remove extras at end of post content
+find_string: <div style="margin:5px 0 10px;">
+replace_string: </div></body></html><!--
+
+prune: no
+
+test_url: http://www.gossip-tv.gr/story/158902/aggelike-daliane-semera-duskoleuontai-oloi-sta-epaggelmatika-tous
test_url: http://www.gossip-tv.gr/lifestyle/Taste/story/230266/lahtaristo-kai-ygieino-tost-sokolatas
\ No newline at end of file
--- /dev/null
+body: //div[contains(@class, 'layout__inner')]//div[contains(@class, 'file-image') or contains(@class, 'node__content')]
+author: //article//div[contains(@class, 'field-byline')]
+strip_id_or_class: rekommenderade
+strip_id_or_class: disqus
+strip_id_or_class: annonser
+
+test_url: http://www.goteborgsfria.se/artikel/112079
\ No newline at end of file
-title: //div[@class='entry-header']\r
-author: //span[@class='vcard author']\r
-date: //abbr[@class='published']\r
-#move_into(//div[@class='entry-body']): //img[@id='photo_1']\r
-body: //div[@class='entry-body']\r
+title: //div[@class='entry-header']
+author: //span[@class='vcard author']
+date: //abbr[@class='published']
+#move_into(//div[@class='entry-body']): //img[@id='photo_1']
+body: //div[@class='entry-body']
strip: //div[@class='galleryEaseThumbs']
test_url: http://gothamist.com/2012/03/15/fancy_cocktail_lounge_the_randolph.php
\ No newline at end of file
-title: //span[@id="showTitle"]\r
-author: //span[@id="showAuthor"]\r
-date: //span[@id="showRefDate"]\r
-\r
-strip: //span[@class="black_bold"]\r
-strip: //div[@id="sectionName"]\r
-strip: //div[@id="storyHeader"]\r
-\r
-body: //div[@id="newsBodyText"]\r
-\r
-strip_image_src: "http://www.gotomanager.com/img/mgrm/space.gif"\r
-strip_image_src: "http://www.gotomanager.com/images/separator.gif"\r
-strip_image_src: "http://www.gotomanager.com/images/spaces.gif"\r
-\r
-convert_double_br_tags: yes\r
-tidy: yes\r
-\r
-strip: //div[@id="smallLeadImage"]\r
-strip: //div[@id="truehitsSurvey"]\r
+title: //span[@id="showTitle"]
+author: //span[@id="showAuthor"]
+date: //span[@id="showRefDate"]
+
+strip: //span[@class="black_bold"]
+strip: //div[@id="sectionName"]
+strip: //div[@id="storyHeader"]
+
+body: //div[@id="newsBodyText"]
+
+strip_image_src: "http://www.gotomanager.com/img/mgrm/space.gif"
+strip_image_src: "http://www.gotomanager.com/images/separator.gif"
+strip_image_src: "http://www.gotomanager.com/images/spaces.gif"
+
+convert_double_br_tags: yes
+tidy: yes
+
+strip: //div[@id="smallLeadImage"]
+strip: //div[@id="truehitsSurvey"]
strip: //table[@id="relatedInfoTable"]
test_url: http://www.gotomanager.com/news/details.aspx?id=86759
\ No newline at end of file
--- /dev/null
+strip: //body//title
+
+test_url: http://www.gov.ky/pls/portal/PORTAL.wwv_media.show?p_id=7593947&p_settingssetid=1&p_settingssiteid=0&p_siteid=2425&p_type=basetext&p_textid=7593948
+test_url: http://www.rcips.ky/pls/portal/wlacomp.wlafeed.show_cignewsfeed_agency?p_sitecode=POL&p_agency=Police
\ No newline at end of file
--- /dev/null
+body: //div[@id='articleContainer']
+author: //div[@id='articleContent']//div[contains(@class, 'byline')]//span[contains(@class, 'name fn')]
+strip_id_or_class: toolbar
+strip_id_or_class: ADad
+strip_id_or_class: articleSerieWrapper
+strip_id_or_class: articleFloatContainer
+strip: //div[contains(@class, 'byline')]//img
+prune: no
+
+test_url: http://www.gp.se/nyheter/bohuslan/1.2045564-styckade-mannen-hade-mordat-hustrun
+test_url: http://www.gp.se/1.16560
\ No newline at end of file
-next_page_link: //div[@class='pagination']//span[@class='paginationNext']/a\r
-strip_id_or_class: utility\r
-strip_id_or_class: keywords\r
-strip_id_or_class: pagination\r
-strip_id_or_class: position2_content\r
-body: //div[@class='article']\r
-title: //h1[@class='content-headline']\r
+next_page_link: //div[@class='pagination']//span[@class='paginationNext']/a
+strip_id_or_class: utility
+strip_id_or_class: keywords
+strip_id_or_class: pagination
+strip_id_or_class: position2_content
+body: //div[@class='article']
+title: //h1[@class='content-headline']
author: //span[@class='contributor']//a
test_url: http://www.gq.com/news-politics/newsmakers/201203/terry-thompson-ohio-zoo-massacre-chris-heath-gq-february-2012
\ No newline at end of file
-# this is fragile with footnotes -- leave it for now\r
-\r
-#tidy: no\r
-#prune: no\r
-#move_into(//article): //aside[@id='footnotes']\r
-author: //cite/a\r
-date: //time\r
-\r
-strip: //a[text()='Grantland']\r
-strip_id_or_class: ad-wrapper\r
-strip_id_or_class: fb-connect-link\r
-strip_id_or_class: fb-status\r
-strip: //li[@class='print']\r
-strip: //cite\r
-strip: //a[contains(text(), '[+]')]\r
-strip: //a[@id='jump-nav-link']\r
-strip: //h1[text()='Share This']\r
-strip: //h1[text()='Top Stories']\r
-strip: //div[@id="update-text-size"]\r
+# this is fragile with footnotes -- leave it for now
+
+#tidy: no
+#prune: no
+#move_into(//article): //aside[@id='footnotes']
+author: //cite/a
+date: //time
+
+strip: //a[text()='Grantland']
+strip_id_or_class: ad-wrapper
+strip_id_or_class: fb-connect-link
+strip_id_or_class: fb-status
+strip: //li[@class='print']
+strip: //cite
+strip: //a[contains(text(), '[+]')]
+strip: //a[@id='jump-nav-link']
+strip: //h1[text()='Share This']
+strip: //h1[text()='Top Stories']
+strip: //div[@id="update-text-size"]
test_url: http://www.grantland.com/story/_/id/8421241/examining-new-albums-rock-veterans-no-doubt-green-day
\ No newline at end of file
-title: //div[@class="blogpost"]/h2\r
-author: //div[@class="blogpost"]/p[@class="byline"]/a\r
-date: //div[@class="blogpost"]/p[@class="byline"]/span[@class="time_posted"]\r
-body: //div[@class="blogpost"]\r
-strip_id_or_class: flag\r
-strip_id_or_class: byline\r
-strip_id_or_class: post_footer\r
-strip_id_or_class: related_posts\r
-strip_id_or_class: post_author_bios\r
+title: //div[@class="blogpost"]/h2
+author: //div[@class="blogpost"]/p[@class="byline"]/a
+date: //div[@class="blogpost"]/p[@class="byline"]/span[@class="time_posted"]
+body: //div[@class="blogpost"]
+strip_id_or_class: flag
+strip_id_or_class: byline
+strip_id_or_class: post_footer
+strip_id_or_class: related_posts
+strip_id_or_class: post_author_bios
strip: //h2
test_url: http://greatergreaterwashington.org/post/12457/ask-ggw-what-will-happen-to-the-1000-series-railcars/
\ No newline at end of file
-title://h1\r
-author://span[@class="submitted"]/a\r
-date:substring-after(//span[@class="submitted"],'on ')\r
+title://h1
+author://span[@class="submitted"]/a
+date:substring-after(//span[@class="submitted"],'on ')
body://div[@class="content"]
test_url: http://groups.drupal.org/node/36816
\ No newline at end of file
-body: //div[@class='wrapper_half']//ul[@class='details'] | //div[@class='wrapper_half']//p[@class='synopsis'] | //div[@class='wrapper_half']//div[@class='image'] | //div[@class='wrapper_half']//div[@class='article']\r
-strip: //div[@class='wrapper_half']//ul[@class='details']/li[position()>1]\r
-prune: no\r
-tidy: no\r
+body: //div[@class='wrapper_half']//ul[@class='details'] | //div[@class='wrapper_half']//p[@class='synopsis'] | //div[@class='wrapper_half']//div[@class='image'] | //div[@class='wrapper_half']//div[@class='article']
+strip: //div[@class='wrapper_half']//ul[@class='details']/li[position()>1]
+prune: no
+tidy: no
test_url: http://gulfnews.com/news/gulf/uae/government/abu-dhabi-centre-offers-useful-information-1.811084
\ No newline at end of file
-# To administrator:\r
-# Please change the hostname to "www.guokr.com/article/*"\r
-# Not working for "www.guokr.com/post/" pages configured by carlosliu913@gmail.com\r
-\r
-# This filter is tested on:\r
-# http://www.guokr.com/article/274325/\r
-# http://www.guokr.com/article/275013/\r
-\r
-title://h1\r
-author://div[contains(@class, 'content-th-info')]/a\r
-date://div[contains(@class, 'content-th-info')]/span\r
-body://div[contains(@class, 'Content')]\r
-\r
-strip://div[contains(@class, 'bottom-i')]\r
-strip://div[contains(@class, 'copyright')]\r
-strip://div[contains(@class, 'fr')]\r
-strip://div[contains(@class, 'content-th-info')]\r
-strip://h1[contains(@id, 'articleTitle')]\r
-strip://div[contains(@class, 'side')]\r
-strip://div[contains(@class, 'top-wp')]\r
-test_url: http://www.guokr.com/article/275013/\r
+# To administrator:
+# Please change the hostname to "www.guokr.com/article/*"
+# Not working for "www.guokr.com/post/" pages configured by carlosliu913@gmail.com
+
+# This filter is tested on:
+# http://www.guokr.com/article/274325/
+# http://www.guokr.com/article/275013/
+
+title://h1
+author://div[contains(@class, 'content-th-info')]/a
+date://div[contains(@class, 'content-th-info')]/span
+body://div[contains(@class, 'Content')]
+
+strip://div[contains(@class, 'bottom-i')]
+strip://div[contains(@class, 'copyright')]
+strip://div[contains(@class, 'fr')]
+strip://div[contains(@class, 'content-th-info')]
+strip://h1[contains(@id, 'articleTitle')]
+strip://div[contains(@class, 'side')]
+strip://div[contains(@class, 'top-wp')]
+test_url: http://www.guokr.com/article/275013/
test_url: http://www.guokr.com/article/338387/
\ No newline at end of file
-title: //div[@id="habermetni"]/h1[@id="haber_baslik"]\r
-body: //div[@id="habermetni"]/p\r
-strip: //img[@class='newsDetailLeft']\r
+title: //div[@id="habermetni"]/h1[@id="haber_baslik"]
+body: //div[@id="habermetni"]/p
+strip: //img[@class='newsDetailLeft']
strip_image_src: /haber-resimleri/
test_url: http://www.haberler.com/emniyete-atacakti-elinde-patladi-3198733-haberi/
\ No newline at end of file
--- /dev/null
+date: //article//time[@pubdate]
+body: //article/div[@id="post-wide"]
+title: //article/header/h2
+strip: //div[@id="comment"]
+strip: //footer
+author: substring-after(//footer/p[@class='byline'] , 'By')
+test_url: http://hackmake.org/2012/12/21/mindfulness-of-concentration
\ No newline at end of file
-title:substring-before(id("maincontent")/table, 'Posted')\r
-body:id("maincontent")/p\r
-# eventually convert linebreaks better\r
+title:substring-before(id("maincontent")/table, 'Posted')
+body:id("maincontent")/p
+# eventually convert linebreaks better
test_url: http://halo.bungie.org/fanfic/?story=Delahunt0312112316071.html
\ No newline at end of file
-# Remove right column\r
-strip: //*[(@class = 'right_col')]\r
-\r
-# Remove comments etc.\r
-strip: //*[(@class = 'category')]\r
+# Remove right column
+strip: //*[(@class = 'right_col')]
+
+# Remove comments etc.
+strip: //*[(@class = 'category')]
strip: /html/body/div[1][@class='absolute_content_high']/div[1][@class='wrapper']/div[1][@class='main_col']/div[@class='main_content']/h3
test_url: http://hammers.theoffside.com/carling-cup/a-funny-thing-happened-on-the-way-to-4-nil.html
\ No newline at end of file
--- /dev/null
+#Single Page
+single_page_link: //li[contains(@class,"hcf-print")]/a
+
+# Title hcf-headline
+title: //span[@class='hcf-headline']
+
+# Authors
+author: //div[@class="hcf-author"]/a/text()
+author: substring-after(//div[@class='hcf-author'], 'von ')
+
+# Date
+date: //div[@class='hcf-article-date']
+
+# Body
+body: //div[@class='article']
+
+# General removals
+strip: //div[contains(@class,"hcf-smartbox")]
+strip: //div[contains(@class,"hcf-stopper")]
+strip: //div[contains(@class,"hcf-img-controls")]
+strip: //span[@class='hcf-location-mark']
+strip: //span[@class='hcf-copyright']
+strip: //div[@class='hcf-copyright']
+strip: //div[@class='hcf-origin']
+
+
+
+
+# Fix picture captions
+wrap_in(small): //div[@class="hcf-caption"]
+test_url: http://www.handelsblatt.com/meinung/gastbeitraege/gastkommentar-zum-emissionshandel-kurskorrekturen-fuehren-zum-kentern/8044326.html
\ No newline at end of file
-date: //span[@class="item-date"]\r
-body: //div[@class="item-content"]\r
+date: //span[@class="item-date"]
+body: //div[@class="item-content"]
strip_comments: no
test_url: http://www.hanselman.com/blog/BrainBytesBackBunsTheProgrammersPriorities.aspx
\ No newline at end of file
-title: //h1\r
-author: //a[@class='a_aut']\r
-body: //div[@class='content_dossier']\r
-strip: //div[@id='pagination']\r
+title: //h1
+author: //a[@class='a_aut']
+body: //div[@class='content_dossier']
+strip: //div[@id='pagination']
next_page_link: //div[@class='sommaire_colonne']//span[@class='page_actuelle']/following::span[@class='autres_page']//a/@href
test_url: http://www.hardware.fr/articles/850-1/pci-express-3-0-impact-performances.html
\ No newline at end of file
--- /dev/null
+title: //h1[@class='headline']
+title: //h2[@itemprop='alternativeHeadline']
+title: //h1[@itemprop='headline']
+author: //span[@itemprop='name']
+date: //time[@itemprop='datePublished']
+body: //div[@itemprop='reviewBody']
+
+wrap_in(blockquote): //div[@class='factBox']
+
+next_page_link: //a[@rel='next']
+
+strip_id_or_class: 'product-box'
+strip: //a[@rel='next']
+strip: //a[text()='Del på Facebook']
+strip: //a[text()='Del på Twitter']
+test_url: http://www.hardware.no/artikler/asus-vg248qe/132792
\ No newline at end of file
-title: //div[@id='article-title']\r
-author: //div[@id='articleAuthors']\r
-body: //div[@id='article']\r
-strip: //div[@class='module wide']\r
-next_page_link: //a[@title='Next Page']
-test_url: http://hbr.org/2012/04/the-real-leadership-lessons-of-steve-jobs/ar/
\ No newline at end of file
+title: //div[@id='article-title']
+author: //div[@id='articleAuthors']
+body: //div[@id='article']
+strip: //div[@class='module wide']
+#single_page_link: //a[@class='social-print']
+test_url: http://hbr.org/2012/04/the-real-leadership-lessons-of-steve-jobs/ar/
+test_url: http://hbr.org/2013/03/big-bang-disruption/ar/
\ No newline at end of file
--- /dev/null
+title://div[@class='content']/h3[1]
+body://div[@class='content']
+
+# Article nav
+strip://div[@class='content']/p[1]
+
+# Comments and trackbacks
+strip://h2/following-sibling::p
+strip://h2
+
+# Posted on
+strip://b/p
+strip://div[@class='content']/p[@class='posted']
+test_url: http://headrush.typepad.com/creating_passionate_users/2005/05/the_case_for_ea.html
\ No newline at end of file
-body: //div[@id='content']/div\r
+body: //div[@id='content']/div
date: //p[@class='author_date']/span[@class='date']
test_url: http://heise-online.mobi/newsticker/meldung/Amazons-Appstore-in-der-Kritik-Ein-Desaster-fuer-Kunden-und-Entwickler-1273936.html
\ No newline at end of file
-single_page_link: //p[@class='news_option']/a\r
-\r
-date: //p[@class='news_datum']\r
-title: //h1\r
-body: //div[@class='meldung_wrapper']\r
-\r
+single_page_link: //p[@class='news_option']/a
+
+date: //p[@class='news_datum']
+title: //h1
+body: //div[@class='meldung_wrapper']
+
test_url: http://www.heise.de/newsticker/meldung/Europa-soll-Grundrechteschutz-im-Netz-staerken-1392664.html
\ No newline at end of file
--- /dev/null
+title: //h2
+body: //div[@id='leftdetail']
+single_page_link: //a[contains(@href, 'printable=1')]
+strip: //a[contains(., 'Full Version')]
+
+prune: no
+
+test_url: http://www.hemmings.com/classifieds/dealer/ferrari/330gtc/1601235.html
+test_url: http://www.hemmings.com/rss/keyword.xml?adtype=carsforsale&make=ferrari
\ No newline at end of file
--- /dev/null
+title: //div[contains(@class, 'title')]//h1
+body: //div[contains(@class, 'story')]
+
+prune: no
+
+test_url: http://www.heroturko.me/5223034-ds-catia-p3-v5-6r2014-gasp0-x86x64-multilanguage-english-docs.html
\ No newline at end of file
-body: //div[@id='article_holder']//div[@class='image'] | //div[@id='article_body']\r
-\r
-prune: no\r
-tidy: no\r
-\r
-test_url: http://hespress.com/videos/73684.html\r
+body: //div[@id='article_holder']//div[@class='image'] | //div[@id='article_body']
+
+prune: no
+tidy: no
+
+test_url: http://hespress.com/videos/73684.html
test_url: http://hespress.com/permalink/73678.html
\ No newline at end of file
--- /dev/null
+body: (//div[contains(@class, 'gallery-slides')]//img)[1] | //div[contains(@class, 'node_body_inner')]
+
+test_url: http://www.hiamag.com/rss.xml
\ No newline at end of file
-body: //div[@class='journal-entry-text']\r
+body: //div[@class='journal-entry-text']
test_url: http://highscalability.com/blog/2011/3/14/6-lessons-from-dropbox-one-million-files-saved-every-15-minu.html
\ No newline at end of file
-body: //div[@class = 'pd']\r
-strip: //div[@id = 'overzicht-albumrecensies']\r
+body: //div[@class = 'pd']
+strip: //div[@id = 'overzicht-albumrecensies']
strip: //div[@id = 'jc']
test_url: http://hiphopleeft.nl/index.php?option=com_content&view=article&id=2767:mark-ronson-record-collection&catid=66:m&Itemid=142
\ No newline at end of file
-body://div[@id = 'content']\r
-author://span[@class = 'authors']\r
-author://span[@class = 'ht-vtag'][1]\r
-date:substring-before(//meta[@name = 'dc.date']/@content,'T')\r
-strip://div[contains(@class, 'region-ubercontent')]\r
-strip://h1\r
-strip://div[@id = 'ht-author']\r
-strip://ul[@class = 'links inline'] \r
-strip://div[@id = 'ht-tools']\r
+body://div[@id = 'content']
+author://span[@class = 'authors']
+author://span[@class = 'ht-vtag'][1]
+date:substring-before(//meta[@name = 'dc.date']/@content,'T')
+strip://div[contains(@class, 'region-ubercontent')]
+strip://h1
+strip://div[@id = 'ht-author']
+strip://ul[@class = 'links inline']
+strip://div[@id = 'ht-tools']
test_url: http://www.historytoday.com/carol-dyhouse/skin-deep-fall-fur
\ No newline at end of file
-title: //*[@class='ptitle']\r
-date: //span[@class='date']\r
-body: //div[@class='body']\r
+title: //*[@class='ptitle']
+date: //span[@class='date']
+body: //div[@class='body']
prune: no
test_url: http://hmercer.com/2011/07/why-i-switched-to-jekyll/
\ No newline at end of file
--- /dev/null
+date: //meta[@name='sailthru.date']/@content
+body: //article[contains(@class, 'entry-content')]
+
+strip_image_src: subscribe.png
+
+strip_id_or_class: wpcom-iframe-form
+strip_id_or_class: gallery-thumbs
+strip_id_or_class: twitter
+strip_id_or_class: fb-link
+strip_id_or_class: pinterest
+
+strip: //div[@class='data']
+strip: //iframe[contains(@name, 'wpcom')]
+
+find_string: <a href="http://www.youtube.com/subscription_center?add_user_id=2rJLq19N0dGrxfib80M
+replace_string: </p></div></body></html><!--
+
+find_string: <h3>More
+replace_string: </div></body></html><!--
+
+test_url: http://hollywoodlife.com/2013/10/04/miriam-carey-dead-capitol-hill-car-chase-shooting-postpartum-depression/
+test_url: http://hollywoodlife.com/feed/
\ No newline at end of file
-body: //div[@id='entry-body']\r
-strip_id_or_class: paginate\r
+body: //div[@id='entry-body']
+strip_id_or_class: paginate
strip: //p[contains(., 'Additional Resources')]
test_url: http://hometheaterreview.com/dreamvision-starlight-3-three-chip-d-ila-projector-reviewed/
\ No newline at end of file
-body: //table[@class='ap-smallphoto-table'] | //div[@class='body']//*[@class='entry-content']\r
-tidy: no\r
-strip_image_src: analytics.apnewsregistry\r
-\r
+body: //table[@class='ap-smallphoto-table'] | //div[@class='body']//*[@class='entry-content']
+tidy: no
+strip_image_src: analytics.apnewsregistry
+
test_url: http://hosted.ap.org/dynamic/stories/U/US_SPENDING_SHOWDOWN?SITE=FLPET&SECTION=HOME&TEMPLATE=DEFAULT&CTIME=2011-04-06-07-46-50
\ No newline at end of file
--- /dev/null
+body: //div[contains(@class, 'thecontent')]
+
+strip_image_src: loading.gif
+find_string:src="http://cdn.howtogeek.com/public/images/blank.gif"
+replace_string:-
+find_string:data-href=
+replace_string:src=
+
+strip_id_or_class: relatedside
+
+test_url: http://www.howtogeek.com/school/microsoft-excel-formulas-and-functions/lesson1/
\ No newline at end of file
-prune: yes\r
+prune: yes
tidy: yes
test_url: http://www.hs.fi/kotimaa/Teollisuushallin%20palo%20levitt%C3%A4%C3%A4%20vaarallista%20savua%20Tuusulassa/a1305571582405
\ No newline at end of file
-single_page_link: //iframe[@id='hootFrame']/@src\r
-\r
+single_page_link: //iframe[@id='hootFrame']/@src
+
test_url: http://ht.ly/bOiZV
\ No newline at end of file
-title: //meta[@property="og:title"]/@content\r
-body: //div[img[starts-with(@id, 'img_caption')]] | //div[@class="big_photo"] | //div[contains(@class, 'entry_body_text')]\r
-date: //meta[@name="publish_date"]/@content\r
-author: //a[@rel="author"]\r
-author: //meta[@name="author"]/@content\r
-prune: no\r
-tidy: no\r
-strip: //footer\r
-strip_id_or_class: ps-slideshow\r
-strip_id_or_class: fs-slideshow\r
-strip: //p[contains(., 'Related on HuffPost:')]\r
-# end early\r
-replace_string(<div class="sbm-main): </body></html><div class="not-interested \r
-\r
-test_url: http://www.huffingtonpost.com/mitch-moxley/tracking-beijings-boom-th_b_1209828.html\r
-test_url: http://www.huffingtonpost.com/2012/09/11/president-obama-iphone-throwdown_n_1873826.html
\ No newline at end of file
+title: //meta[@property="og:title"]/@content
+body: //div[img[starts-with(@id, 'img_caption')]] | //div[@class="big_photo"] | //div[contains(@class, 'entry_body_text')]
+date: //meta[@name="publish_date"]/@content
+author: //a[@rel="author"]
+author: //meta[@name="author"]/@content
+
+prune: no
+tidy: no
+
+strip: //footer
+strip_id_or_class: ps-slideshow
+strip_id_or_class: fs-slideshow
+strip: //p[contains(., 'Related on HuffPost:')]
+strip_id_or_class: contribute-story
+strip_id_or_class: promo_holder
+
+# end early
+replace_string(<div class="sbm-main): </body></html><div class="not-interested
+
+test_url: http://www.huffingtonpost.com/mitch-moxley/tracking-beijings-boom-th_b_1209828.html
+test_url: http://www.huffingtonpost.com/2012/09/11/president-obama-iphone-throwdown_n_1873826.html
-title: //h3[@class="entry-header"]\r
-date: //h2[@class="date-header"]\r
-body: //div[contains(@class, 'entry')]\r
+title: //h3[@class="entry-header"]
+date: //h2[@class="date-header"]
+body: //div[contains(@class, 'entry')]
test_url: http://www.humantransit.org/2012/06/can-network-primers-reduce-grief-about-network-design.html
\ No newline at end of file
-title: //div[@class='HaberDetayTitleHold Title']/h1\r
-body: //div[@id='YazarDetayText']\r
-author: //div[@class='HaberDetayTitleHold Title']/h1\r
-prune: no\r
-\r
-test_url: http://www.hurriyet.com.tr/ekonomi/19490260.asp\r
+title: //div[@class='HaberDetayTitleHold Title']/h1
+body: //div[@id='YazarDetayText']
+author: //div[@class='HaberDetayTitleHold Title']/h1
+prune: no
+
+test_url: http://www.hurriyet.com.tr/ekonomi/19490260.asp
test_url: http://www.hurriyet.com.tr/yazarlar/22078439.asp
\ No newline at end of file
-title: //div[@id='pg-content']//h1\r
-body: //div[@id='articleBody0']\r
-replace_string(</table>): </table><br /><br />\r
-\r
-single_page_link: //div[@class="up-header"]/a\r
-\r
-prune: no\r
+title: //div[@id='pg-content']//h1
+body: //div[@id='articleBody0']
+replace_string(</table>): </table><br /><br />
+
+single_page_link: //div[@class="up-header"]/a
+
+prune: no
test_url: http://hvg.hu/w/20111125_sparta
\ No newline at end of file
-body: //div[@id='content']//div[contains(@class, 'wp-image-') or contains(@class, 'entry')][1]\r
-author: //span[@class='author']/a\r
-\r
-strip_id_or_class: disqus\r
-strip_id_or_class: paginator\r
-strip_id_or_class: photo-number\r
-\r
-prune: no\r
-\r
+body: //div[@id='content']//div[contains(@class, 'wp-image-') or contains(@class, 'entry')][1]
+author: //span[@class='author']/a
+
+strip_id_or_class: disqus
+strip_id_or_class: paginator
+strip_id_or_class: photo-number
+
+prune: no
+
test_url: http://hypebeast.com/2012/11/stussy-2012-fall-winter-november-releases/
\ No newline at end of file
--- /dev/null
+tidy:no
+prune:no
+
+body://div[contains(@id,'content')]
+
+strip_id_or_class:meta
+strip_id_or_class:notes
+strip_id_or_class:pagination
+test_url: http://icannabis.tumblr.com/post/28660592471/reviewmswireless3000
\ No newline at end of file
--- /dev/null
+body: //div[@class='content']
+
+strip: //p[@class='dateline']
+strip: //hr
+strip_id_or_class: share
+strip_id_or_class: comments
+strip_id_or_class: tags
+
+title: substring-before(//title,' ::')
+author: substring-before(//p[@class='dateline'],',')
+date: //p[@class='dateline']/time
+test_url: http://www.idealog.co.nz/blog/2012/12/geeks-plane-help-kiwis-take-san-francisco
\ No newline at end of file
-title: //a[@class='post_title']\r
-body: //div[@class='entrybox']\r
-strip_id_or_class: post_title\r
-date: //div[@class='entrybox']/b[1]\r
-strip: //div[@class='entrybox']/b[1]\r
+title: //a[@class='post_title']
+body: //div[@class='entrybox']
+strip_id_or_class: post_title
+date: //div[@class='entrybox']/b[1]
+strip: //div[@class='entrybox']/b[1]
author: string('Maciej Cegłowski')
test_url: http://idlewords.com/2011/08/why_arabic_is_terrific.htm
\ No newline at end of file
-author: substring-after(substring-after(//span[@class='submitted'],'- '),'- ')\r
-date: substring-before(//span[@class='submitted'], concat('- ',substring-after(substring-after(//span[@class='submitted'],'- '),'- ')))\r
-body: //div[@class='content clear-block zoneApple']\r
+author: substring-after(substring-after(//span[@class='submitted'],'- '),'- ')
+date: substring-before(//span[@class='submitted'], concat('- ',substring-after(substring-after(//span[@class='submitted'],'- '),'- ')))
+body: //div[@class='content clear-block zoneApple']
test_url: http://www.igeneration.fr/iphone/l-iphone-et-l-ipad-chouchous-des-tpe-et-pme-55112
\ No newline at end of file
-title://h1[@class='page-title']\r
-body://*[@id='content']//div[contains(@class,'node-content')]\r
-\r
-author://*[@id='content']//div[contains(@class,'node-submitted')]/a\r
-\r
+title://h1[@class='page-title']
+body://*[@id='content']//div[contains(@class,'node-content')]
+
+author://*[@id='content']//div[contains(@class,'node-submitted')]/a
+
date:substring-after(//div[contains(@class,'node-submitted')],' on ')
test_url: http://ignoredbydinosaurs.com/2011/09/great-lie-lorem-ipsum
\ No newline at end of file
-# Get proper Title, Author and Date info\r
-title: substring-before(//title, '|')\r
-author: substring-after(//h4/a[@href='http://www.ilounge.com/index.php/ilounge/aboutus/'], 'By')\r
-date: //span[@class='instapaper_date']\r
-\r
-# For Reviews & First Looks, get the intro paragraph and put it in front of the main body.\r
-move_into(//div[@id='instapaper_para1']): //div[@id='instapaper_body']\r
-body: //div[@id='instapaper_para1']\r
-strip: //div[@class='reviewinfo']\r
-\r
-# We don't use footnotes, so why bother checking for them? \r
+# Get proper Title, Author and Date info
+title: substring-before(//title, '|')
+author: substring-after(//h4/a[@href='http://www.ilounge.com/index.php/ilounge/aboutus/'], 'By')
+date: //span[@class='instapaper_date']
+
+# For Reviews & First Looks, get the intro paragraph and put it in front of the main body.
+move_into(//div[@id='instapaper_para1']): //div[@id='instapaper_body']
+body: //div[@id='instapaper_para1']
+strip: //div[@class='reviewinfo']
+
+# We don't use footnotes, so why bother checking for them?
footnotes: no
test_url: http://www.ilounge.com/index.php/reviews/entry/luxa2-alum-x-for-iphone-4-4s/?utm_source=twitterfeed&utm_medium=twitter
\ No newline at end of file
-title: //div[@class='published visible e2-smart-title']//span\r
-author: //span[@id='e2-blog-title']\r
-date: //p[@class='super-h']\r
+title: //div[@class='published visible e2-smart-title']//span
+author: //span[@id='e2-blog-title']
+date: //p[@class='super-h']
body: //div[@class='text published visible']
test_url: http://ilyabirman.ru/meanwhile/2011/11/15/2/
\ No newline at end of file
-author: substring-after(substring-before(//div[@id='byline'],'|'),'By')\r
-author: //div[@class='byline']/a\r
-date: //span[@class='pubdate']\r
-# print friendly page\r
-body: //div[@id='text']\r
-# regular page\r
-body: //div[@id= 'articlecontent']\r
-\r
-strip: //div[@id= 'articlecontent']/h1\r
-strip: //div[@id='articlecontent']/p[@class='deck']\r
-strip: //div[@id='articlecontent']/div[@class='byline']\r
-strip: //div[@id='articlespacer']\r
-strip: //div[@id='incsharebox']\r
-strip: //div[@id='articlesidebar']\r
-\r
-prune: no\r
-\r
-single_page_link: //a[contains(@href, 'Printer_Friendly.html')]\r
-strip: //a[contains(., 'Dig Deeper')]\r
-test_url: http://www.inc.com/guides/2010/11/seven-tips-for-lobbying-politicians.html\r
+author: substring-after(substring-before(//div[@id='byline'],'|'),'By')
+author: //div[@class='byline']/a
+date: //span[@class='pubdate']
+# print friendly page
+body: //div[@id='text']
+# regular page
+body: //div[@id= 'articlecontent']
+
+strip: //div[@id= 'articlecontent']/h1
+strip: //div[@id='articlecontent']/p[@class='deck']
+strip: //div[@id='articlecontent']/div[@class='byline']
+strip: //div[@id='articlespacer']
+strip: //div[@id='incsharebox']
+strip: //div[@id='articlesidebar']
+
+prune: no
+
+single_page_link: //a[contains(@href, 'Printer_Friendly.html')]
+strip: //a[contains(., 'Dig Deeper')]
+test_url: http://www.inc.com/guides/2010/11/seven-tips-for-lobbying-politicians.html
test_url: http://www.inc.com/eric-schurenberg/startups-are-we-geting-irrationally-exuberant.html
\ No newline at end of file
-title: //meta[@property='og:title']/@content\r
-body: //div[contains(@class, 'articleContent')]\r
-date: //meta[@property='article:published_time']/@content\r
-author: //div[@id='main']//div[@class='byline']//span[@class='authorName']\r
-\r
-strip_id_or_class: RelatedArtTag\r
-\r
+title: //meta[@property='og:title']/@content
+body: //div[contains(@class, 'articleContent')]
+date: //meta[@property='article:published_time']/@content
+author: //div[@id='main']//div[@class='byline']//span[@class='authorName']
+
+strip_id_or_class: RelatedArtTag
+
tidy: no
test_url: http://www.independent.co.uk/news/world/middle-east/syria-could-face-human-rights-probe-2274326.html
\ No newline at end of file
-body: //figure[@class='mainVideo']\r
-strip: //figcaption\r
-\r
-prune: no\r
-\r
+body: //figure[@class='mainVideo']
+strip: //figcaption
+
+prune: no
+
test_url: http://www.indiatimes.com/bollywood/kareena-insecure-about-saif-working-with-bipasha-23386.html
\ No newline at end of file
-title: //div[@class='weblogPost']/h3[1]\r
-author: ("Brent Simmons")\r
-date: //span[@class="weblogPostDisplayDate"]\r
+title: //div[@class='weblogPost']/h3[1]
+author: ("Brent Simmons")
+date: //span[@class="weblogPostDisplayDate"]
body: //div[@class='weblogPostBody']
test_url: http://inessential.com/2011/10/25/why_just_store_the_app_data_on_dropbo
\ No newline at end of file
-title://h1\r
-body://div[@id='texto_link']\r
+title://h1
+body://div[@id='texto_link']
test_url: http://info.abril.com.br/noticias/internet/filme-do-youtube-vai-estrear-nos-cinemas-22042011-6.shl
\ No newline at end of file
-body: //div[@id="intTranscript"]\r
-body: //div[@class="box-content"]\r
-title: //div[@class="box-content"]//h1[1]\r
-author: //p[@class="info"]/strong \r
-date: substring-before(substring-after(//p[@class="info"], "on"), "Length")\r
-strip: //div[@class="box-content"]//h1[1]\r
-strip: //div[@class="box-content"]//p[@class="info"]\r
-strip_id_or_class: vendor-content-box\r
-strip_id_or_class: tags2\r
-strip_id_or_class: instructions\r
-strip_id_or_class: comments\r
-strip_id_or_class: forum-list-tree\r
+body: //div[@id="intTranscript"]
+body: //div[@class="box-content"]
+title: //div[@class="box-content"]//h1[1]
+author: //p[@class="info"]/strong
+date: substring-before(substring-after(//p[@class="info"], "on"), "Length")
+strip: //div[@class="box-content"]//h1[1]
+strip: //div[@class="box-content"]//p[@class="info"]
+strip_id_or_class: vendor-content-box
+strip_id_or_class: tags2
+strip_id_or_class: instructions
+strip_id_or_class: comments
+strip_id_or_class: forum-list-tree
strip: //div[@class="addthis_toolbox addthis_default_style"]
test_url: http://www.infoq.com/interviews/oleg-zhurakousky-javaone2011-interview
\ No newline at end of file
-title: //div[@class='tituloInt']\r
-body: //div[@class='notaPortada']\r
-strip: //img[@id='imgHorizontalInt imgDetalleImg imagenNota']\r
-date: //span[@class='publi']\r
-author: //span[@class='autor']\r
-tidy: no\r
-prune: no\r
+title: //div[@class='tituloInt']
+body: //div[@class='notaPortada']
+strip: //img[@id='imgHorizontalInt imgDetalleImg imagenNota']
+date: //span[@class='publi']
+author: //span[@class='autor']
+tidy: no
+prune: no
test_url: http://www.informador.com.mx/tecnologia/2011/337606/6/iran-desarrolla-antivirus-tras-afectaciones-por-duqu.htm
\ No newline at end of file
-title: //meta[@property='og:title']/@content\r
-author: //*[@property='dc:creator']\r
-date: //*[@property='dc:date']/@content\r
-body: //div[@id='page-content']//div[contains(@class, 'article-body')]\r
-\r
+title: //meta[@property='og:title']/@content
+author: //*[@property='dc:creator']
+date: //*[@property='dc:date']/@content
+body: //div[@id='page-content']//div[contains(@class, 'article-body')]
+
tidy: no
test_url: http://www.information.dk/282307
\ No newline at end of file
-title://h1[@class="post_title"]\r
-body://article[@class="post"]\r
-date://h1[@class="section_separator"]\r
-author://span[@class="post_author"]\r
-strip://nav[@class="arrow_nav"]\r
-strip://section[@id="contact"]\r
-strip_id_or_class:post_title\r
-strip_id_or_class:post_author\r
+title://h1[@class="post_title"]
+body://article[@class="post"]
+date://h1[@class="section_separator"]
+author://span[@class="post_author"]
+strip://nav[@class="arrow_nav"]
+strip://section[@id="contact"]
+strip_id_or_class:post_title
+strip_id_or_class:post_author
strip_id_or_class:section_separator
test_url: http://informationarchitects.net/blog/nzz-relaunch-a-quick-review/
\ No newline at end of file
-title: //head/title\r
-body: //table[@id='table3']//div[@class='postContent']\r
-prune: no\r
-tidy: no\r
-\r
+title: //head/title
+body: //table[@id='table3']//div[@class='postContent']
+prune: no
+tidy: no
+
test_url: http://www.informationclearinghouse.info/article28238.htm
\ No newline at end of file
-title: //div[@id='content']/h1\r
-body: //div[@id="content"]\r
-strip: //img[contains(@src, 'informit_printer.png')]\r
-single_page_link: //div[contains(@class, 'articleTools')]//a[contains(@href, '/printerfriendly.')]\r
-prune: no\r
-\r
+title: //div[@id='content']/h1
+body: //div[@id="content"]
+strip: //img[contains(@src, 'informit_printer.png')]
+single_page_link: //div[contains(@class, 'articleTools')]//a[contains(@href, '/printerfriendly.')]
+prune: no
+
test_url: http://www.informit.com/articles/article.aspx?p=1729268
\ No newline at end of file
-body: //div[@id='main_text']\r
-title: //div[@id='main_text']/h1\r
-strip: //div[@id='main_text']/h1\r
-strip: //div[@id='main_text']/h2\r
-strip_id_or_class: tools\r
-strip_id_or_class: articleTools\r
-strip_id_or_class: pagination\r
-strip_id_or_class: byline\r
-strip_id_or_class: tweet\r
-date: //div[@class='date']\r
+body: //div[@id='main_text']
+title: //div[@id='main_text']/h1
+strip: //div[@id='main_text']/h1
+strip: //div[@id='main_text']/h2
+strip_id_or_class: tools
+strip_id_or_class: articleTools
+strip_id_or_class: pagination
+strip_id_or_class: byline
+strip_id_or_class: tweet
+date: //div[@class='date']
strip: //div[@class='date']
test_url: http://www.infoworld.com/d/the-industry-standard/it-jobs-the-rise-both-offshore-and-in-us-187689
\ No newline at end of file
-# This filter is tested on:\r
-# http://www.infzm.com/content/71068\r
-# http://www.infzm.com/content/41577\r
-\r
-author://em[contains(@class, 'toAuthor')]\r
-date:substring(//em[contains(@class, 'pubTime')],1)\r
-body://section[contains(@id, 'articleContent')]\r
+# This filter is tested on:
+# http://www.infzm.com/content/71068
+# http://www.infzm.com/content/41577
+
+author://em[contains(@class, 'toAuthor')]
+date:substring(//em[contains(@class, 'pubTime')],1)
+body://section[contains(@id, 'articleContent')]
title://h1[contains(@class ,'articleHeadline clearfix')]
test_url: http://www.infzm.com/content/41577
\ No newline at end of file
-# set body\r
-body: //div[@class='post-listing']\r
-\r
-# remove clutter\r
-strip: //a/big\r
-strip: //a/em\r
+# set body
+body: //div[@class='post-listing']
+
+# remove clutter
+strip: //a/big
+strip: //a/em
strip: //p/em
test_url: http://inhabitat.com/2010/11/18/sliding-walls-transform-this-tokyo-house-into-an-office/
\ No newline at end of file
-title: //div[@class='caption']\r
-author: //p[@class='username']\r
-\r
-strip: //div[@class='contents']/h3\r
+title: //div[@class='caption']
+author: //p[@class='username']
+
+strip: //div[@class='contents']/h3
strip: //div[@class='location']
test_url: http://instagr.am/p/G-s_aciyDJ/
\ No newline at end of file
-body: //div[@id = 'post']\r
-strip: //div[@class = 'postinfo']\r
-strip: //div[@id = 'postmetanew']\r
-strip: //div[@class = 'paginator']\r
-strip: //div[@class = 'col-2']\r
+body: //div[@id = 'post']
+strip: //div[@class = 'postinfo']
+strip: //div[@id = 'postmetanew']
+strip: //div[@class = 'paginator']
+strip: //div[@class = 'col-2']
strip: //div[@id = 'adfactor-label']
test_url: http://www.ipadclub.nl/15808/text-writer-ipad-tekstverwerker-met-functieknoppen/
\ No newline at end of file
-body: //div[@id = 'post']\r
-strip: //div[@class = 'postinfo']\r
-strip: //div[@id = 'postmetanew']\r
-strip: //div[@class = 'paginator']\r
-strip: //div[@class = 'col-2']\r
+body: //div[@id = 'post']
+strip: //div[@class = 'postinfo']
+strip: //div[@id = 'postmetanew']
+strip: //div[@class = 'paginator']
+strip: //div[@class = 'col-2']
strip: //div[@id = 'adfactor-label']
test_url: http://www.ipadplanet.nl/11723/steve-jobs-bevestigt-verdwijnen-fysieke-rotatieschakelaar-in-ios-4-2/
\ No newline at end of file
-body: //div[@id = 'post']\r
-strip: //div[@class = 'postinfo']\r
-strip: //div[@id = 'postmetanew']\r
-strip: //div[@class = 'paginator']\r
-strip: //div[@class = 'col-2']\r
-strip: //div[@id = 'adfactor-label']\r
+body: //div[@id = 'post']
+strip: //div[@class = 'postinfo']
+strip: //div[@id = 'postmetanew']
+strip: //div[@class = 'paginator']
+strip: //div[@class = 'col-2']
+strip: //div[@id = 'adfactor-label']
test_url: http://www.iphoneclub.nl/105808/t-mobile-mobiel-internet-wordt-duurder-maar-blijft-onbeperkt/
\ No newline at end of file
-title: //meta[@name='og:title']/@content\r
-body: //small[@class='postmetadata'] | //div[contains(@class, 'entry-content')]\r
-\r
-strip: //span[@vanilla-identifier]\r
-\r
-prune: no\r
-tidy: no\r
-\r
+title: //meta[@name='og:title']/@content
+body: //small[@class='postmetadata'] | //div[contains(@class, 'entry-content')]
+
+strip: //span[@vanilla-identifier]
+
+prune: no
+tidy: no
+
test_url: http://www.iphonehacks.com/2012/07/app-review-process-behind-the-scenes.html
\ No newline at end of file
-# Remove social buttons\r
-strip: //div[@id='temp_Content_Right']\r
-\r
-# Remove duplicate article title\r
+# Remove social buttons
+strip: //div[@id='temp_Content_Right']
+
+# Remove duplicate article title
strip: //*[(@class='storytitle')]
test_url: http://isource.com/2010/10/24/swearch-a-cool-iphone-web-app/
\ No newline at end of file
-author: //p[@class = 'writer']\r
-\r
-date: //p[@class = 'published-time']\r
-\r
+author: //p[@class = 'writer']
+
+date: //p[@class = 'published-time']
+
body: //div[@class = 'text main']
test_url: http://www.itavisen.no/899786/old-republic-blir-gratis
\ No newline at end of file
--- /dev/null
+body: //div[@id='cmsBody']
+
+next_page_link: //span[@id='next']/a
+
+strip_id_or_class: cmsCopyright
+strip_id_or_class: masterSocialbuttonBtm
+
+test_url: http://www.itmedia.co.jp/enterprise/articles/0912/05/news002.html
\ No newline at end of file
-title: //h1[@class="entry-title"]\r
-body: //div[@class='format_text entry-content']\r
-author: //span[@class="author vcard"]/a\r
-date: //abbr[@class="published"]\r
-\r
-strip_id_or_class: related-posts\r
-strip_id_or_class: membershipbox\r
-strip_id_or_class: share_this_compact_bt\r
-\r
-\r
+title: //h1[@class="entry-title"]
+body: //div[@class='format_text entry-content']
+author: //span[@class="author vcard"]/a
+date: //abbr[@class="published"]
+
+strip_id_or_class: related-posts
+strip_id_or_class: membershipbox
+strip_id_or_class: share_this_compact_bt
+
+
footnotes: no
test_url: http://www.itstactical.com/warcom/knives/exclusive-triple-aught-design-production-dauntless-knife-video-walkthrough/
\ No newline at end of file
--- /dev/null
+author: //a[@rel="author"]
+date: //li[@class="itemDateCreated"]
+strip: //div[contains(@class, 'legend-rounded')]
+
+test_url: http://www.itwire.com/it-industry-news/market/59661-ibm-looks-to-high-value-solutions-to-meet-changing-demands
-title: //*[@id="article-title"]\r
-author: //*[@id="article-info"]/strong\r
-date: //*[@class="article-dateline"]/strong\r
+title: //*[@id="article-title"]
+author: //*[@id="article-info"]/strong
+date: //*[@class="article-dateline"]/strong
body: //*[@id="article-content"]
test_url: http://www.itworld.com/open-source/140916/android-sued-microsoft-not-linux
\ No newline at end of file
-body: //div[starts-with(@id, 'news-id-')]\r
-prune: no\r
-\r
+body: //div[starts-with(@id, 'news-id-')]
+prune: no
+
test_url: http://izismile.com/2011/06/13/uncanny_factoid_fashion_or_creepy_2_pics.html
\ No newline at end of file
-body: //div[@id='content']//div[@class = 'post f']\r
-strip_id_or_class: comment-big\r
-strip_id_or_class: avatar\r
-strip: //div[@class='time_s']\r
+body: //div[@id='content']//div[@class = 'post f']
+strip_id_or_class: comment-big
+strip_id_or_class: avatar
+strip: //div[@class='time_s']
test_url: http://jandan.net/2011/04/03/iphone-5-sony.html
\ No newline at end of file
-title: //h1\r
-author: //p[contains(@class, 'author')]/a\r
-date: //p[contains(@class, 'time')]\r
-body: //div[@class='content']/div[contains(@class, 'text')]\r
-\r
-# prevent "no text" errors on multi-page articles\r
-tidy: no\r
-\r
-# we use a custom next-link detector instead of the print view because\r
-# it's pretty hard to strip out the unwanted parts in the print view\r
-autodetect_next_page: no\r
-next_page_link: //div[contains(@class, 'text')]/div/div[contains(@class, 'paging')]/a[@class='more ']\r
-\r
-strip: //h1\r
-\r
-strip_id_or_class: meta\r
-strip_id_or_class: author\r
-strip_id_or_class: paging\r
-\r
-# prevent "Report an Error" from being recognized as footnote\r
+title: //h1
+author: //p[contains(@class, 'author')]/a
+date: //p[contains(@class, 'time')]
+body: //div[@class='content']/div[contains(@class, 'text')]
+
+# prevent "no text" errors on multi-page articles
+tidy: no
+
+# we use a custom next-link detector instead of the print view because
+# it's pretty hard to strip out the unwanted parts in the print view
+autodetect_next_page: no
+next_page_link: //div[contains(@class, 'text')]/div/div[contains(@class, 'paging')]/a[@class='more ']
+
+strip: //h1
+
+strip_id_or_class: meta
+strip_id_or_class: author
+strip_id_or_class: paging
+
+# prevent "Report an Error" from being recognized as footnote
footnotes: no
test_url: http://jetzt.sueddeutsche.de/texte/anzeigen/544308/Alles-flicken
\ No newline at end of file
-body: //div[@class='entry']\r
-prune: no\r
+body: //div[@class='entry']
+prune: no
test_url: http://www.jjahnke.net/rundbr87.html#2514
\ No newline at end of file
-body: //div[@id='formatCont_en']\r
-\r
-prune: no\r
-\r
+body: //div[@id='formatCont_en']
+
+prune: no
+
test_url: http://www.jobbank.gc.ca/detail-eng.aspx?Source=JobPosting&OrderNum=6397922
\ No newline at end of file
-# Works with old posts too, such as http://www.joelonsoftware.com/articles/fog0000000332.html\r
-\r
-author: substring-after(//div[@class="author"], 'by ')\r
-date: //div[@class="date"]\r
-\r
-## Clean stuff at top ##\r
-\r
-strip: //h1[1]\r
-strip: //h2[1]\r
-strip: //div[@class="date"]\r
-strip: //div[@class="author"]\r
-\r
-## Clean stuff at bottom ##\r
-\r
-strip: //blockquote[@class="textmessage"]\r
-strip: //div[@style="width:500px"]/p[last()]\r
-strip: //div[@style="width:500px"]/p[last()-1]\r
-strip: //div[@style="width:500px"]/h4[last()]\r
-strip: //div[@style="width:500px"]/h4[last()-1]\r
+# Works with old posts too, such as http://www.joelonsoftware.com/articles/fog0000000332.html
+
+author: substring-after(//div[@class="author"], 'by ')
+date: //div[@class="date"]
+
+## Clean stuff at top ##
+
+strip: //h1[1]
+strip: //h2[1]
+strip: //div[@class="date"]
+strip: //div[@class="author"]
+
+## Clean stuff at bottom ##
+
+strip: //blockquote[@class="textmessage"]
+strip: //div[@style="width:500px"]/p[last()]
+strip: //div[@style="width:500px"]/p[last()-1]
+strip: //div[@style="width:500px"]/h4[last()]
+strip: //div[@style="width:500px"]/h4[last()-1]
strip: //div[@style="width:500px"]/div[last()]
test_url: http://www.joelonsoftware.com/items/2011/09/15.html
\ No newline at end of file
-author: //h1\r
+author: //h1
date: //p[contains(@class,'date')]
test_url: http://jouire.com/2011/01/exquisite-whispers/
\ No newline at end of file
-author: //a[@class="byline-author"]\r
-title: //h1[@class="headline"]\r
-strip: //div[@id="info-card"]\r
-strip: //div[@id="breaking-news"]\r
-strip: //div[@class="rmod list-post-mod"]\r
-strip: //div[@id="footer"]\r
+author: //a[@class="byline-author"]
+title: //h1[@class="headline"]
+strip: //div[@id="info-card"]
+strip: //div[@id="breaking-news"]
+strip: //div[@class="rmod list-post-mod"]
+strip: //div[@id="footer"]
strip: //div[@id="GH_strip"]
test_url: http://www.joystiq.com/2012/06/20/magic-the-gathering-duels-of-the-planeswalkers-2013-review/
\ No newline at end of file
-body: //div[@id='article_container']\r
-author: //h4//a[@class='author']\r
-title: //h1\r
-\r
-replace_string(lang="en"): lang="de"\r
-replace_string(/>1</a>):/></a>\r
-\r
-strip_id_or_class: share_toolbox\r
-strip_id_or_class: article_header\r
-strip_id_or_class: phototext\r
-\r
-strip_image_src: icon_author.gif\r
-\r
-strip: //img[@src='']\r
-strip: //h4[@id='author']\r
-\r
-prune: no\r
-\r
+body: //div[@id='article_container']
+author: //h4//a[@class='author']
+title: //h1
+
+replace_string(lang="en"): lang="de"
+replace_string(/>1</a>):/></a>
+
+strip_id_or_class: share_toolbox
+strip_id_or_class: article_header
+strip_id_or_class: phototext
+
+strip_image_src: icon_author.gif
+
+strip: //img[@src='']
+strip: //h4[@id='author']
+
+prune: no
+
test_url: http://www.juedische-allgemeine.de/article/view/id/13366
\ No newline at end of file
-convert_double_br_tags: yes\r
-\r
-title: //div[@id="storycredits"]/p/span[@class="title"]\r
-author: //div[@id="storycredits"]/p/br[1]/following-sibling::text()\r
-\r
-strip: //div[@id="storycredits"]\r
+convert_double_br_tags: yes
+
+title: //div[@id="storycredits"]/p/span[@class="title"]
+author: //div[@id="storycredits"]/p/br[1]/following-sibling::text()
+
+strip: //div[@id="storycredits"]
test_url: http://www.juppy.org/santa/stories.php?ForAuthorID=35&Year=2005
\ No newline at end of file
-body: //div[contains(@class, 'inner_content')]\r
+body: //div[contains(@class, 'inner_content')]
test_url: http://kachestvo.ru/promtovar/odezhda/denim.html
\ No newline at end of file
--- /dev/null
+title: //h3[contains(@class, 'entry-title')]
+date: //abbr[@itemprop='datePublished']/@title
+body: //div[@itemprop='articleBody']
+tidy: no
+
+test_url: http://www.kachiblog.com/2013/05/samsung-galaxy-s4-vs-samsung-galaxy.html
+test_url: http://www.kachiblog.com/feeds/posts/default
\ No newline at end of file
--- /dev/null
+title: //td[contains(@class, 'articleTitlos')]
+body: //td[contains(@class, 'eelantext')]
+
+test_url: http://www.kathimerini.gr/4dcgi/_w_articles_kathremote_1_03/12/2013_530490
\ No newline at end of file
-# Ads\r
-strip: //table[@align="right"][@width="120"]\r
-\r
-# Affiliate link paragraphs\r
-strip: //a[.="Adorama"]/parent::p[contains(., "goodies")]\r
+# Ads
+strip: //table[@align="right"][@width="120"]
+
+# Affiliate link paragraphs
+strip: //a[.="Adorama"]/parent::p[contains(., "goodies")]
strip: //a[.="Adorama"]/parent::p[contains(., "This free website's biggest source of")]
test_url: http://www.kenrockwell.com/tech/composition.htm
\ No newline at end of file
-# set body\r
-body: //div[@id='ovArtikel']\r
-\r
-# set title\r
-title: //div[@id='ovArtikel']/h1\r
-# strip main title and leave sub title\r
-strip: //div[@id='ovArtikel']/h1\r
-\r
-date: //div[@class='publicdate']\r
-\r
-#remove captions\r
-strip: //*/div[@class='bu']\r
-strip: //*/div[@class='credit']\r
-\r
-#remove adds\r
-strip: //*/div[@class='ad-head']\r
-strip: //*/div[@class='linksebay']\r
-\r
-# remove video content\r
+# set body
+body: //div[@id='ovArtikel']
+
+# set title
+title: //div[@id='ovArtikel']/h1
+# strip main title and leave sub title
+strip: //div[@id='ovArtikel']/h1
+
+date: //div[@class='publicdate']
+
+#remove captions
+strip: //*/div[@class='bu']
+strip: //*/div[@class='credit']
+
+#remove ads
+strip: //*/div[@class='ad-head']
+strip: //*/div[@class='linksebay']
+
+# remove video content
strip: //*/div[@class='ovVideo']
test_url: http://www.kicker.de/news/fussball/frauen/wmfr/frauen-weltmeisterschaft/2011/3/1123662/spielbericht_frankreich-frauen_deutschland-frauen.html
\ No newline at end of file
-title: //h1[@id='name']\r
-body: //*[@id='leftcol']\r
-\r
-strip_id_or_class: 'share-box'\r
-strip_id_or_class: 'project-faqs'\r
+title: //h1[@id='name']
+body: //*[@id='leftcol']
+
+strip_id_or_class: 'share-box'
+strip_id_or_class: 'project-faqs'
strip_id_or_class: 'report-issue-wrap'
test_url: http://www.kickstarter.com/projects/hop/elevation-dock-the-best-dock-for-iphone
\ No newline at end of file
-title: //div[@class='post']/h2\r
-body: //div[@class='entry']\r
+title: //div[@class='post']/h2
+body: //div[@class='entry']
strip: //p[contains(.,'Tags:')]
test_url: http://www.kingarthurflour.com/blog/2011/01/28/a-big-sandwich-for-the-big-game/
\ No newline at end of file
-title: //h2\r
-author: //*[@id='main']/div/a[1]\r
-date: substring-before(substring-after(//div[@class='meta'],'•'),'•')\r
-body: //div[@id='main']\r
-strip: //div[@class='meta']\r
+title: //h2
+author: //*[@id='main']/div/a[1]
+date: substring-before(substring-after(//div[@class='meta'],'•'),'•')
+body: //div[@id='main']
+strip: //div[@class='meta']
test_url: http://kottke.org/08/02/king-of-kong-a-fistful-of-quarters
\ No newline at end of file
-body: //div[@class = "entry-full"]\r
+body: //div[@class = "entry-full"]
test_url: http://www.kumailplus.com/2011/12/02/24308
\ No newline at end of file
-title: //div[@id='centrediv']/h1\r
-\r
-author: substring-after(//div[@id='centrediv']/h3,'By: ')\r
-\r
-date: substring-after(substring-before(//div[@id='centrediv']/h3,'By: '),'Filed: ')\r
-\r
-body: //div[@class='KonaBody']\r
-\r
+title: //div[@id='centrediv']/h1
+
+author: substring-after(//div[@id='centrediv']/h3,'By: ')
+
+date: substring-after(substring-before(//div[@id='centrediv']/h3,'By: '),'Filed: ')
+
+body: //div[@class='KonaBody']
+
convert_double_br_tags: yes
test_url: http://www.kumb.com/story.php?id=126084
\ No newline at end of file
-date: //span[@class='datum']\r
-title: //div[@class='artikel']/h2\r
-body: //div[@class='entry']\r
-strip: //p[@class='tags']\r
-author: substring-after(//div[@class='authorinfo']/em,'Dies ist ein Artikel von ')\r
-strip: //div[@class='authorinfo']\r
-strip: //div[@class='authorpic']\r
+date: //span[@class='datum']
+title: //div[@class='artikel']/h2
+body: //div[@class='entry']
+strip: //p[@class='tags']
+author: substring-after(//div[@class='authorinfo']/em,'Dies ist ein Artikel von ')
+strip: //div[@class='authorinfo']
+strip: //div[@class='authorpic']
test_url: http://kwerfeldein.de/index.php/2011/10/17/doppelbelichtungen-mit-konzept/
\ No newline at end of file
--- /dev/null
+body: //div[contains(@class, 'layout__inner')]//div[contains(@class, 'file-image') or contains(@class, 'node__content')]
+author: //article//div[contains(@class, 'field-byline')]
+strip_id_or_class: rekommenderade
+strip_id_or_class: disqus
+strip_id_or_class: annonser
+
+test_url: http://www.landetsfria.se/artikel/112070
\ No newline at end of file
-title: //h1[@class='headline']\r
-body: //div[@class='article']\r
-strip: //div[@class='article']//h3[contains(@class, 'section')]\r
-strip: //div[@class='article']//ul[contains(@class, 'article-actions')]\r
-strip: //div[@id='syndication-upper']\r
-strip: //a[@id='syndication']\r
-strip: //dl[@id='article-tags']\r
-strip: //div[@id='article-like']\r
-prune: no\r
-\r
-single_page_link: //li[@class='single-page']/a\r
-\r
+title: //h1[@class='headline']
+body: //div[@class='article']
+strip: //div[@class='article']//h3[contains(@class, 'section')]
+strip: //div[@class='article']//ul[contains(@class, 'article-actions')]
+strip: //div[@id='syndication-upper']
+strip: //a[@id='syndication']
+strip: //dl[@id='article-tags']
+strip: //div[@id='article-like']
+prune: no
+
+single_page_link: //li[@class='single-page']/a
+
test_url: http://www.laphamsquarterly.org/essays/balanced-diets.php
\ No newline at end of file
-tidy: no\r
+tidy: no
test_url: http://www.laprensagrafica.com/opinion/editorial/229252-reflexiones-sobre-la-educacion-que-necesitamos.html
\ No newline at end of file
-body: //div[@id='content-content']//div[@class='content']\r
-title: //h1[@class='title']\r
-date: substring-after(//*[@class='submitted'],'Submitted on')\r
-tidy: no\r
-strip: //div[@class='terms terms-inline']\r
-strip: //div[@class='more']\r
-strip: //div[@class='share-links']\r
-strip: //table[@id='attachments']\r
-\r
+body: //div[@id='content-content']//div[@class='content']
+title: //h1[@class='title']
+date: substring-after(//*[@class='submitted'],'Submitted on')
+tidy: no
+strip: //div[@class='terms terms-inline']
+strip: //div[@class='more']
+strip: //div[@class='share-links']
+strip: //table[@id='attachments']
+
test_url: http://www.laquadrature.net/en/finalization-of-eu-parliaments-weak-net-neutrality-resolution
\ No newline at end of file
-#meta data\r
-title:substring-after(title,'|')\r
-\r
-author:substring-before( substring-after(//meta[@name = 'description']/@content, normalize-space(substring-after(//title,'|'))),' respond ')\r
-date://h5[@class = 'postDate']\r
-\r
-#text\r
-body://div[@class = 'articleBody']\r
-\r
-#clean up\r
-strip://center
-test_url: http://lareviewofbooks.org/post/14066007115/literary-transactions-and-their-vicissitudes
\ No newline at end of file
+#metadata
+title: substring-before(//title,' |')
+author: //a[contains(@class,'person') and starts-with(@href, '/contributor')]
+
+#text
+body: //div[contains(@class, 'article_body')]
+
+#clean up
+strip_id_or_class: recommended_section
+
+test_url: http://lareviewofbooks.org/review/american-politics-redeembale-robert-gates-hillary-clinton-two-memoirs-washington-dc
+test_url: http://lareviewofbooks.org/interview/souvenirs-future
-strip: //div[@id="tugs_story_display"]\r
-strip: //div[@id="search_overlay"]\r
-strip: //div[@id="adv_search"]\r
-body: //div[@class='story']\r
-tidy: no\r
-convert_double_br_tags: yes\r
-single_page_link: //a[contains(@href, ',print.')]\r
-strip: //p[starts-with(., 'latimes.com')]\r
-strip: //h1[starts-with(., 'latimes.com')]\r
+strip: //div[@id="tugs_story_display"]
+strip: //div[@id="search_overlay"]
+strip: //div[@id="adv_search"]
+body: //div[@class='story']
+tidy: no
+convert_double_br_tags: yes
+single_page_link: //a[contains(@href, ',print.')]
+strip: //p[starts-with(., 'latimes.com')]
+strip: //h1[starts-with(., 'latimes.com')]
strip_id_or_class: cubead
test_url: http://www.latimes.com/news/opinion/commentary/la-oe-gartonash-wilders-20110512,0,2876761.story
\ No newline at end of file
-title: //h1[@class='entry-title']\r
+title: //h1[@class='entry-title']
body: //div[@class='entry-content']
test_url: http://laughingsquid.com/mysterious-tiny-doors-appearing-around-san-francisco/
\ No newline at end of file
-title: //div[@id="content"]/h1[1]\r
-date: substring-before(//p[@class="postdate"], ' at ')\r
-author: ("Dr. Drang")\r
-\r
-strip: //div[@id="content"]/h1[1]\r
-strip: //p[@class="postdate"]\r
-strip: //h2[@id="respond"]\r
+title: //div[@id="content"]/h1[1]
+date: substring-before(//p[@class="postdate"], ' at ')
+author: ("Dr. Drang")
+
+strip: //div[@id="content"]/h1[1]
+strip: //p[@class="postdate"]
+strip: //h2[@id="respond"]
strip: //blockquote[@class="bbpTweet"]/p/span/a/img
test_url: http://www.leancrew.com/all-this/2011/12/more-shell-less-egg/
\ No newline at end of file
-title: //meta[@name='title']/@content\r
-author: //span[@class='sign']//a[@class='journaliste']\r
-author: //meta[@name='author']/@content\r
-body: //*[@id='article']/div[@class='photo'] | //*[@id='article']/h2 | //*[@id='article']/div[@class='texte']\r
-date: //time[@pubdate]/@datetime\r
-prune: no\r
-test_url: http://www.lefigaro.fr/environnement/2011/11/10/01029-20111110ARTFIG00801-la-chine-confrontee-a-un-immense-defi-ecologique.php\r
+title: //meta[@name='title']/@content
+author: //span[@class='sign']//a[@class='journaliste']
+author: //meta[@name='author']/@content
+body: //*[@id='article']/div[@class='photo'] | //*[@id='article']/h2 | //*[@id='article']/div[@class='texte']
+date: //time[@pubdate]/@datetime
+prune: no
+test_url: http://www.lefigaro.fr/environnement/2011/11/10/01029-20111110ARTFIG00801-la-chine-confrontee-a-un-immense-defi-ecologique.php
test_url: http://www.lefigaro.fr/conjoncture/2012/11/20/20002-20121120ARTFIG00609-l-usager-devrait-payer-plus-pour-financer-les-transports.php
\ No newline at end of file
-title: //h1\r
-\r
-# they have a single component containing both author and date\r
-#author: //p[@class='source']\r
-#date: //p[@class='source']\r
-\r
-body: //div[@class='contenu_article']\r
-#Shoot the insane "conjugaison.lemonde.fr" links :\r
-strip: //a[contains(@class, 'listLink')]\r
-\r
-prune: no\r
-\r
-test_url: http://www.lemonde.fr/economie/article/2011/07/05/moody-s-abaisse-la-note-du-portugal-de-quatre-crans_1545237_3234.html
\ No newline at end of file
+title: //h1
+
+# We can have multiple authors
+author: //a[@class='auteur']
+
+# Last modified date (if any)
+date: //time[@itemprop='dateModified']/@datetime
+# Publication date
+date: //time[@itemprop='datePublished']/@datetime
+
+
+body: //div[@id='articleBody']
+#Shoot the insane "conjugaison.lemonde.fr" links :
+#strip: //a[contains(@class, 'conjug')]
+
+prune: no
+
+test_url: http://www.lemonde.fr/economie/article/2011/07/05/moody-s-abaisse-la-note-du-portugal-de-quatre-crans_1545237_3234.html
-title: //h1/following::span[@class='fn']\r
-# Author: should stop parsing until <br> reached, but I don't know how to do this.\r
-author: //following::div[@class='PDate2']\r
-date: //following::div[@class='PDate2']/strong\r
-\r
-body: //div[@class='ArTexte']\r
-body: //div[@id='prod_txt_b']\r
-body: //div[@class='ArPhotoP']\r
+title: //h1/following::span[@class='fn']
+# Author: extraction should stop once the <br> is reached, but I don't know how to do this.
+author: //following::div[@class='PDate2']
+date: //following::div[@class='PDate2']/strong
+
+body: //div[@class='ArTexte']
+body: //div[@id='prod_txt_b']
+body: //div[@class='ArPhotoP']
test_url: http://www.lesnumeriques.com/disque-dur-multimedia/popcorn-hour-300-p12231/test.html
\ No newline at end of file
-title: //h2\r
+title: //h2
strip_image_src: logo.gif
test_url: http://www.letemps.ch/Facet/print/Uuid/7c9f912c-07c9-11e0-9b50-4d96c9eca37f
\ No newline at end of file
--- /dev/null
+date: //span[contains(@class, 'page-date')]
+body: //div[@id='node-page']
+strip_id_or_class: book-navigation
+prune: no
+
+test_url: http://libcom.org/library/what-was-the-ussr-aufheben-1
+test_url: http://libcom.org/library-latest/feed
\ No newline at end of file
-title: //h2[@class="entry-title"]\r
+title: //h2[@class="entry-title"]
body: //div[@class="entry-content"]
test_url: http://www.lifeandculture.fr/digital/facebook-and-the-epiphanator-an-end-to-endings/
\ No newline at end of file
-# Adds author text: Gawker sites commonly show as "Author: View Profile"\r
-author://a[@class="plus-icon modfont"]\r
-\r
-# Add date and time\r
-date: //span[@class="date"]\r
-\r
-# Remove date and time from article text\r
-strip: //span[@class="date"]\r
-\r
-# Remove login/comment text\r
-strip: //*[(@class="presence_control_external smalltype")]\r
-\r
-strip: //div[@class="nodebyline modfont"]\r
-\r
-# Remove right sidebar\r
-strip: //div[@id="rightwrapper"]\r
-\r
-# Remove print header\r
-strip: //div[@id='printhead']/h1\r
-\r
-# Remove 'content is restricted'\r
-strip: //div[@id='agegate_IDHERE']\r
-\r
-# Remove follow text\r
-strip: //*[(@class="permalink_ads")]\r
-\r
-# Remove view/comment count\r
-strip: //div[@id='wrapper']/div[2][@class='postmeta_permalink_wrapper']/div[1][@class='postmeta_permalink']/div[2][@class='pm_line']\r
-\r
-# Remove contact text\r
-strip: //div[@id='wrapper']/div[1][@class='content permalink']/p[6][@class='contactinfo']\r
-\r
-# Remove medium duplicates of the article image\r
-strip_image_src: medium.jpg\r
-\r
-# Remove "arrow" class at bottom of page\r
-strip: //p[@class="arrow"]\r
-\r
-# Remove "track" image from article body\r
-strip: //img[@alt="track"]\r
-test_url: http://lifehacker.com/5925801/how-can-i-turn-vague-goals-into-actionable-to+dos\r
-test_url: http://lifehacker.com/5941600/hack-an-old-computer-mouse-into-a-wireless-bluetooth-mouse
\ No newline at end of file
+# Adds author text: Gawker sites commonly show as "Author: View Profile"
+author://a[@class="plus-icon modfont"]
+
+# Add date and time
+date: //span[@class="date"]
+
+body: //div[contains(@class, 'marquee-asset-wrapper') or contains(@class, 'post-content')]
+
+# Remove date and time from article text
+strip: //span[@class="date"]
+
+# Remove login/comment text
+strip: //*[(@class="presence_control_external smalltype")]
+
+strip: //div[@class="nodebyline modfont"]
+
+# Remove right sidebar
+strip: //div[@id="rightwrapper"]
+
+# Remove print header
+strip: //div[@id='printhead']/h1
+
+# Remove 'content is restricted'
+strip: //div[@id='agegate_IDHERE']
+
+# Remove follow text
+strip: //*[(@class="permalink_ads")]
+
+strip_id_or_class: inset_groups
+
+# Remove view/comment count
+strip: //div[@id='wrapper']/div[2][@class='postmeta_permalink_wrapper']/div[1][@class='postmeta_permalink']/div[2][@class='pm_line']
+
+# Remove contact text
+strip: //div[@id='wrapper']/div[1][@class='content permalink']/p[6][@class='contactinfo']
+
+# Remove medium duplicates of the article image
+strip_image_src: medium.jpg
+
+# Remove "arrow" class at bottom of page
+strip: //p[@class="arrow"]
+
+# Remove "track" image from article body
+strip: //img[@alt="track"]
+test_url: http://lifehacker.com/5925801/how-can-i-turn-vague-goals-into-actionable-to+dos
+test_url: http://lifehacker.com/5941600/hack-an-old-computer-mouse-into-a-wireless-bluetooth-mouse
+test_url: http://lifehacker.com/what-happens-to-the-brain-when-you-meditate-and-how-it-1202533314
\ No newline at end of file
--- /dev/null
+title: //h1[@class='singlePageTitle']
+
+strip: //p[contains(text(), 'Follow Us')]
+strip: //p/strong[contains(text(), 'Recent Stories:')]
+strip: //div[@id="sharefeature"]
+
+test_url: http://lifestyle.inquirer.net/100223/dusting-your-ceiling-fan
--- /dev/null
+# This filter is tested on:
+# http://www.lifeweek.com.cn/2012/1211/39439.shtml
+# http://www.lifeweek.com.cn/2013/0308/40213.shtml
+
+title:substring-before(//h1, '(')
+title://h1
+date://ul[@class='authorbox']/li
+author: substring-after(//ul[@class='authorbox']/li/following-sibling::li, '作者:')
+
+next_page_link: //div[@class='pageturn_list']/a[@class='pagedown']
+body: //div[@class='original ']
+
+strip://h1
+strip://ul[@class='authorbox']
+strip://span[@class='app_p']
+strip://div[@style='text-align:right;']
+strip://div[@class='pageturn_list']
+strip://div[@class='lifespeaks']
+strip://div[@class='vright fr']
+strip://div[@class='copyrt mg20']
+strip://div[@class='keyabout mg20']
+strip://ul[@class='readabout mg20']
+test_url: http://www.lifeweek.com.cn/2013/0308/40213.shtml
\ No newline at end of file
--- /dev/null
+title: //div[@class="album_title"]//h1
+author: substring-before(//div[@class='by_line'], ',')
+date: substring-after(substring-before(//div[@class="album_time"], ' Time'), 'Date: ')
+body: //div[@class="about_text"]
+
+strip: //div[@class='large_popper']
+strip: //span[contains(@id, 'mag_glass')]
+strip: //span[contains(@class, 'img_overlay')]
+strip: //td//span
+strip: //div[@class="center_adsense"]
+strip: //div[@class="article_info"]//div[@class='asset_section']
+strip: //div[@class="article_additional"]
+strip: //div[contains(@style, 'overflow:hidden')]
+strip: //div[@class="aa_text"]
+strip: //div[@id='nointelliTXT']
+
+prune: no
+autodetect_on_failure: no
+
+test_url: http://www.livescience.com/34569-why-flowers-close-at-night-nyctinasty.html
-single_page_link: //div[@class="post"]/div[@class="title"]/a\r
+single_page_link: //div[@class="post"]/div[@class="title"]/a
test_url: http://longform.org/2011/05/06/disconcerting-new-answers-in-models-suicide/
\ No newline at end of file
-body: //div[@class='container_16']//div[@class='grid_11']\r
-strip: //h2[@class='mast']\r
-strip: //div[@class='container_16']//div[@class='grid_11']/h1\r
-strip: //div[@class='container_16']//div[@class='grid_11']/p[1]\r
-strip: //div[@class='container_16']//div[@class='grid_11']/div\r
-author: //a[starts-with(@title, 'Posts by')]\r
-date: substring-before(substring-after(//time, 'Posted on '), ' at')\r
-test_url: http://www.loopinsight.com/2012/09/13/forget-iphone-5-naysayers-this-thing-is-big/\r
+body: //div[@class='container_16']//div[@class='grid_11']
+strip: //h2[@class='mast']
+strip: //div[@class='container_16']//div[@class='grid_11']/h1
+strip: //div[@class='container_16']//div[@class='grid_11']/p[1]
+strip: //div[@class='container_16']//div[@class='grid_11']/div
+author: //a[starts-with(@title, 'Posts by')]
+date: substring-before(substring-after(//time, 'Posted on '), ' at')
+test_url: http://www.loopinsight.com/2012/09/13/forget-iphone-5-naysayers-this-thing-is-big/
test_url: http://www.loopinsight.com/2011/05/20/playbook-returns-high-misses-sales-targets-by-90/
\ No newline at end of file
-prune: no\r
+prune: no
convert_double_br_tags: yes
test_url: http://www.lostgarden.com/2012/04/loops-and-arcs.html
\ No newline at end of file
--- /dev/null
+title: //*[@id='title']
+date: //*[@id='date']
+body: //*[@id='desc']
+tidy: no
+
+test_url: http://www.lovefm.com/local_news.php?item=2176
\ No newline at end of file
--- /dev/null
+title: //div[contains(@class, 'post')]//h1
+body: //div[contains(@class, 'post')]
+strip: //hr
+strip_id_or_class: post-meta
+
+prune: no
+
+test_url: http://www.lovetv.com.bz/2013/06/28/recently-discovered-ancient-maya-wooden-canoe-paddle-to-be-handed-over-to-archaeology/
+test_url: http://www.lovetv.com.bz/feed/
\ No newline at end of file
-title: substring-before(//title, ' · LRB')\r
-\r
-body: //div[@class="article-body indent"]\r
-\r
-date: substring-after(//p[@class="meta-info"]/a, '· ')\r
-\r
-prune: no
-test_url: http://www.lrb.co.uk/v33/n18/james-meek/its-already-happened
\ No newline at end of file
+title: //div[contains(@class, "article-body")]/hgroup/h1
+body: //div[contains(@class, "article-body")]
+
+date: substring-after(//p[@class="meta-info"]/a, '· ')
+
+author: //div[contains(@class, "article-body")]/hgroup/h2
+
+strip_id_or_class: print-hide
+strip_id_or_class: books
+
+test_url: http://www.lrb.co.uk/v33/n18/james-meek/its-already-happened
+test_url: http://www.lrb.co.uk/v36/n13/benjamin-kunkel/paupers-and-richlings
-title: //h2\r
-\r
-body: // div[@id='content']\r
-\r
+title: //h2
+
+body: // div[@id='content']
+
strip: //div[@class='sidebar_wrapper']
test_url: http://www.luminous-landscape.com/tutorials/optimizing_exposure.shtml
\ No newline at end of file
--- /dev/null
+body: //div[@class='post-content']
+prune: no
+
+test_url: http://www.luxuo.com/watches/feed
\ No newline at end of file
-title: //div[@class="story-body"]/div[@class="story-inner"]/h1\r
-body: //div[@class="story-body"]\r
-date: //p[@class='date']/strong\r
-author: substring-after(//div[@class="story-inner"]/div[@class="byline"]//span[@class='name'], 'By')\r
-\r
-strip: //div[@class="story-inner"]/div[@class="byline"]\r
+title: //div[@class="story-body"]/div[@class="story-inner"]/h1
+body: //div[@class="story-body"]
+date: //p[@class='date']/strong
+author: substring-after(//div[@class="story-inner"]/div[@class="byline"]//span[@class='name'], 'By')
+
+strip: //div[@class="story-inner"]/div[@class="byline"]
test_url: http://m.bbc.co.uk/news/science-environment-19144464
\ No newline at end of file
--- /dev/null
+# This filter is tested on:
+# http://m.douban.com/note/240776310/?session=6ac86d1e
+# http://m.douban.com/note/208270705/?session=e00ec732_3433229
+
+title: //h2
+author: //a[@class='founder']
+date: substring-after(//span[@class='info'],' | ')
+body: //div[contains(@class,'entry item')]
+
+strip://span[contains(@class,'info')]
+
+convert_double_br_tags: yes
+test_url: http://m.douban.com/note/240776310/?session=6ac86d1e
\ No newline at end of file
--- /dev/null
+# Article Metadata
+title: //h1
+author: //span[@class="name"]/a
+date: //time
+
+# Content Pruning
+strip: //h5
+strip: //time
+strip: //div[@class="byline"]
+strip: //h2[@class="headline "]
+test_url: http://m.vanityfair.com/politics/2012/10/michael-lewis-profile-barack-obama
\ No newline at end of file
-author: substring-after(//div[@class='author'],'Par ')\r
-date: //div[@class='date']\r
-body: //div[@class='content']\r
+author: substring-after(//div[@class='author'],'Par ')
+date: //div[@class='date']
+body: //div[@class='content']
test_url: http://www.mac4ever.com/news/64182/icloud_les_prix_en_euros_et_en_chf/
\ No newline at end of file
-title: substring-before(//title,' « Macdrifter')
+title: substring-before(//title,' « Macdrifter')
test_url: http://www.macdrifter.com/2012/03/instacast-on-my-mac/
\ No newline at end of file
-# Remove news feed\r
-strip: //div[@id='news_feed_front']\r
-\r
-# Remove pull quote\r
-strip: //div[@class='field field-type-text field-field-pull-quote']\r
-\r
-# Remove login\r
+# Remove news feed
+strip: //div[@id='news_feed_front']
+
+# Remove pull quote
+strip: //div[@class='field field-type-text field-field-pull-quote']
+
+# Remove login
strip: //div[@class='right_bar_login']
test_url: http://macformat.techradar.com/blog/solid-state-storage-bringing-parity-back-mac-29-10-10&article=89189666
\ No newline at end of file
-author: substring-before(substring-after(//div[@class='dateNews'],'par '),' le')\r
-date: substring-after(//div[@class='dateNews'],' le ')\r
-body: //div[@class='singleNews zoneApple']\r
+author: substring-before(substring-after(//div[@class='dateNews'],'par '),' le')
+date: substring-after(//div[@class='dateNews'],' le ')
+body: //div[@class='singleNews zoneApple']
test_url: http://www.macgeneration.com/news/voir/211162/dropbox-encore-un-mac-et-deux-comptes-dropbox
\ No newline at end of file
-# Remove sliders\r
-strip: //*[(@class="slides_container")]\r
-strip: //div[(@id="slides_two")]\r
-\r
-# Remove tag cloud\r
-strip: //span[(@class="secao")]\r
-\r
-# Fix date article\r
-# TODO\r
-\r
-# Remove other stuff\r
-strip: //div[(@id="idc-container")]\r
-strip: //div[(@id="idc-noscript")]\r
-strip: //div[(@class="linkwithin_div")]\r
-strip: //div[(@class="navPosts")]\r
-strip: //div[(@id="lateral")]\r
-strip: //div[(@id="autor")]\r
-strip: //div[(@id="rodape")]\r
-strip: //div[(@id="post")]/h1\r
+# Remove sliders
+strip: //*[(@class="slides_container")]
+strip: //div[(@id="slides_two")]
+
+# Remove tag cloud
+strip: //span[(@class="secao")]
+
+# Fix article date
+# TODO
+
+# Remove other stuff
+strip: //div[(@id="idc-container")]
+strip: //div[(@id="idc-noscript")]
+strip: //div[(@class="linkwithin_div")]
+strip: //div[(@class="navPosts")]
+strip: //div[(@id="lateral")]
+strip: //div[(@id="autor")]
+strip: //div[(@id="rodape")]
+strip: //div[(@id="post")]/h1
strip: //div[(@id="post")]/div[(@id="boxInformacoes")]
test_url: http://macmagazine.com.br/2011/08/01/skype-para-ipad-esta-finalmente-chegando-a-app-store/
\ No newline at end of file
-author: substring-after(//div[@class='byline'], " by ")\r
-date: substring-before(//div[@class='byline'], " by ")\r
-\r
-# set body\r
-body: //div[@class='content']\r
-\r
-# set title\r
-title: //h3\r
+author: substring-after(//div[@class='byline'], " by ")
+date: substring-before(//div[@class='byline'], " by ")
+
+# set body
+body: //div[@class='content']
+strip_id_or_class: commentsContainer
+strip_id_or_class: linkback
+
+# set title
+title: //h3
#strip: //div[@class='content']/h3
-test_url: http://www.macrumors.com/2010/11/10/apple-debuts-new-apple-tv-and-itunes-movie-content-in-japan/
\ No newline at end of file
+test_url: http://www.macrumors.com/2010/11/10/apple-debuts-new-apple-tv-and-itunes-movie-content-in-japan/
-strip: //*[(@id = "featured")]\r
-\r
-author:substring-after( //div[@class='posttitle']/h2[@class='author'],'by ')\r
-\r
-date: concat(//div[@class='month'],' ',//div[@class='day'])\r
-\r
-#macstories doesn't provide a year, but month/day is better than nothing\r
+strip: //*[(@id = "featured")]
+
+author:substring-after( //div[@class='posttitle']/h2[@class='author'],'by ')
+
+date: concat(//div[@class='month'],' ',//div[@class='day'])
+
+#macstories doesn't provide a year, but month/day is better than nothing
test_url: http://www.macstories.net/news/instapaper-4-0-available-completely-redesigned-ipad-ui-new-features-search-subscription/
\ No newline at end of file
-author://div[@class="article_username_container_full"]\r
-date://div[@class="article_username_container"]\r
+author://div[@class="article_username_container_full"]
+date://div[@class="article_username_container"]
body://div[@class="article cms_clear restore postcontainer"]
test_url: http://www.mactalk.com.au/content/chat-basil-shkara-developer-taptax-2452/
\ No newline at end of file
-title: substring-after(substring-after(//title, '>'), '>')\r
+title: substring-after(substring-after(//title, '>'), '>')
body: //div[@class='NewsArticleContent']
test_url: http://www.mactechnews.de/news/index/Apple-Pressekonferenz-zum-iPhone-4-147316.html
\ No newline at end of file
-title: //article//h1\r
-date: //meta[@name="date"]/@content\r
-author: //div[@class="author-name" or @class="article-byline"]/a[1]\r
-\r
-body: //section[@class="page"]\r
-\r
-# remove 'From the Lab' and 'Recent posts' text\r
-strip: //div[@class='blogLabel']\r
-\r
-# remove byline and meta info\r
-strip: //div[@class="article-meta"]\r
-strip: //div[@class="author-info"]\r
-\r
-#strip tags and categories\r
-strip: //div[@class="department"]\r
-\r
-#strip product cap links\r
-strip: //div[@class="cap-main"]\r
-strip: //div[@id="compare-lede"]\r
-\r
-prune: no\r
-\r
-# copes less well with Review pages, seems fine for News\r
+title: //article//h1
+date: //meta[@name="date"]/@content
+author: //div[@class="author-name" or @class="article-byline"]/a[1]
+
+body: //section[@class="page"]
+
+# remove 'From the Lab' and 'Recent posts' text
+strip: //div[@class='blogLabel']
+
+# remove byline and meta info
+strip: //div[@class="article-meta"]
+strip: //div[@class="author-info"]
+
+#strip tags and categories
+strip: //div[@class="department"]
+
+#strip product cap links
+strip: //div[@class="cap-main"]
+strip: //div[@id="compare-lede"]
+
+prune: no
+
+# copes less well with Review pages, seems fine for News
test_url: http://www.macworld.com/article/163184/2011/10/the_ipod_as_an_iconic_cultural_force.html
\ No newline at end of file
-body: //div[@class='NewsArticle']\r
+body: //div[@class='NewsArticle']
test_url: http://mainichi.jp/select/weathernews/20110311/news/20110520k0000e040062000c.html
\ No newline at end of file
-title: substring-before(//title, '|')\r
-body: //*[@id='content-left']\r
-\r
-# Why is this not working here?\r
-# body: //*[@id='content-left']/div[@class='content-container'][2]/div[@class='content-body']/div[@class='inner-container']/div[@class='detail']\r
-\r
-\r
-#Header\r
-strip_id_or_class: 'subHead'\r
-strip_id_or_class: 'fl_right'\r
-strip_id_or_class: 'infolink'\r
-strip_id_or_class: 'content-head'\r
-strip_id_or_class: 'tab'\r
-strip_id_or_class: 'tab-active'\r
-strip: //*[contains(@class,'trenner')]\r
-\r
-# Headline\r
-strip: //h1/*\r
-strip_id_or_class: 'font16'\r
-\r
-#Images\r
-strip_id_or_class: 'leftimage'\r
-strip_id_or_class: 'rightimage'\r
-\r
-#Comments\r
-strip: //table\r
+title: substring-before(//title, '|')
+body: //*[@id='content-left']
+
+# Why is this not working here?
+# body: //*[@id='content-left']/div[@class='content-container'][2]/div[@class='content-body']/div[@class='inner-container']/div[@class='detail']
+
+
+#Header
+strip_id_or_class: 'subHead'
+strip_id_or_class: 'fl_right'
+strip_id_or_class: 'infolink'
+strip_id_or_class: 'content-head'
+strip_id_or_class: 'tab'
+strip_id_or_class: 'tab-active'
+strip: //*[contains(@class,'trenner')]
+
+# Headline
+strip: //h1/*
+strip_id_or_class: 'font16'
+
+#Images
+strip_id_or_class: 'leftimage'
+strip_id_or_class: 'rightimage'
+
+#Comments
+strip: //table
strip: //p/following-sibling::*[0]
test_url: http://www.mainpost.de/ueberregional/meinung/Dioxin-Skandal-bringt-Agrarministerin-in-Bedraengnis;art9517,5920211
\ No newline at end of file
-tidy: no
+title: //h1[@class='entry-title']
-test_url: http://www.makeuseof.com/dir/kindle-it-web-pages-kindle-friendly/
\ No newline at end of file
+body: //article//header//img | //article//section[@class='post']
+
+strip: //article//section[@class='post']/aside
+strip: //article//section[@class='post']/footer
+
+test_url: http://www.makeuseof.com/tag/cool-websites-and-tools-advanced-photo-editor-keep-your-kids-stuff-online-identify-60-languages/
+test_url: http://www.makeuseof.com/tag/what-do-you-think-of-our-new-look-makeuseof-poll/
--- /dev/null
+title: //td[@class="headline"]
+author: //font[@color="#003366"]
+date: //td[@class="date"]
+
+strip: //td[@class="headline"]
+strip: //font[@color="#003366"]
+strip: //td[@class="date"]
+
+strip: //img[@src="images/2009/logo_en.gif"]
+
+body: //tbody[@class="body"]
+convert_double_br_tags:yes
+
+strip: //img[@src="/images/TabOver.gif"]
+strip: //td[@width="160"]
+strip: //img[@src="/images/TabUnder.gif"]
+
+strip: //td[@class="small"]
+strip: //td[@height="47"]
+
+strip: //td[@valign="middle"]
+strip: //td[@background="/images/menu_bottombg.gif"]
+strip: //img[@src="/images/sc_footer_l.gif"]
+strip: //img[@src="/images/sc_footer_m.gif"]
+strip: //img[@src="/images/sc_footer_r.gif"]
+test_url: http://www.manager.co.th/Entertainment/ViewNews.aspx?NewsID=9550000101979
\ No newline at end of file
-tidy: no\r
-prune: no\r
-date: //article//time[@pubdate]\r
-title: //article/header/h2\r
-body: //article\r
-strip: //header\r
-test_url: http://www.marco.org/2012/09/08/businessweek-gruber\r
+tidy: no
+prune: no
+date: //article//time[@pubdate]
+title: //article/header/h2
+body: //article
+strip: //header
+test_url: http://www.marco.org/2012/09/08/businessweek-gruber
test_url: http://www.marco.org/2012/04/24/might-upgrade-someday
\ No newline at end of file
-date: //div[@id="main"]/p[@class="date"]\r
-author: string("Martin Fowler")\r
-body: //div[@id="main"]\r
-strip_id_or_class: date\r
-strip_id_or_class: tags\r
-strip_id_or_class: tagLabel\r
+date: //div[@id="main"]/p[@class="date"]
+author: string("Martin Fowler")
+body: //div[@id="main"]
+strip_id_or_class: date
+strip_id_or_class: tags
+strip_id_or_class: tagLabel
strip: //div[@id="main"]/h1[1]
test_url: http://martinfowler.com/bliki/DatabaseThaw.html
\ No newline at end of file
-title: //header[@class='entry-title']/h1
-body: //div[@class='description']
+title: //h1[@class='title']
+author: substring-after(//span[@class='author_name'], 'By ')
+date: //time
+
+body: //article
strip: //div[@class='ytm-gallery-box']
-test_url: http://mashable.com/2011/12/05/india-wants-google-and-facebook-to-censor-user-content/
\ No newline at end of file
+strip: //div[contains(@class, 'adsense')]
+strip: //aside[contains(@class, 'social')]
+strip_id_or_class: article-topics
+
+test_url: http://mashable.com/2013/05/24/myspace-architects-rebuilding-a-brand/
--- /dev/null
+title: //h1
+author: string("Matt Might")
+strip: //h1/following-sibling::div
+
+test_url: http://matt.might.net/articles/oo-cesk/
\ No newline at end of file
-strip: //div[contains(@class, 'article-tools')]\r
+strip_id_or_class: article-tools
+strip_id_or_class: pagenav
+prune: no
test_url: http://www.medialens.org/index.php/alerts/alert-archive/2012/713-the-illusion-of-democracy.html
\ No newline at end of file
--- /dev/null
+body: //div[contains(@class, 'post-content-inner')]
+strip_id_or_class: follow-ups
+strip_id_or_class: footer
+
+prune: no
+
+test_url: https://medium.com/p/6844c0d7893b
\ No newline at end of file
--- /dev/null
+title: //h3[@class='episode_title']
+body: //ul[contains(@class, 'episode_imgdesc')]/li/descendant::*
+prune: no
+strip://*[contains(@class, 'plugin')]
+strip://*[contains(@class, 'episode_keywords')]
+
+test_url: http://www.megamp3.eu/?p=episode&name=2013-04-19_la_filiere_progressive_431.mp3
+test_url: http://www.megamp3.eu/feed.xml
-# need to find a way to eliminate <span> content for "related content" without eliminating important content\r
-\r
-convert_double_br_tags: [yes]\r
-#body: //div[@id='leftside']\r
-title: //h1\r
-title: //h2\r
-Author: substring-after(//h4, 'By ')\r
-Author: substring-after(//h4, 'By: ')\r
-#Strip: //span\r
-strip_id_or_class: morefromcat\r
-strip_id_or_class: mostpopular\r
-strip_id_or_class: articlepagination\r
-strip_id_or_class: toolbar\r
-body: //div[@id='zmodcontent']\r
-single_page_link: //li[@class='onepage'] //a[contains (@href, 'printer.php')]\r
+# need to find a way to eliminate <span> content for "related content" without eliminating important content
+
+convert_double_br_tags: [yes]
+#body: //div[@id='leftside']
+title: //h1
+title: //h2
+Author: substring-after(//h4, 'By ')
+Author: substring-after(//h4, 'By: ')
+#Strip: //span
+strip_id_or_class: morefromcat
+strip_id_or_class: mostpopular
+strip_id_or_class: articlepagination
+strip_id_or_class: toolbar
+body: //div[@id='zmodcontent']
+single_page_link: //li[@class='onepage'] //a[contains (@href, 'printer.php')]
test_url: http://www.menshealth.com/mhlists/pursuit_of_happiness/index.php
\ No newline at end of file
--- /dev/null
+body: //div[contains(@class, 'copy') or contains(@class, 'comments')]
+strip_id_or_class: related
+strip: //a[. = 'Subscribe']
+strip: //h1/span[@class = 'smallcopy']
+strip: //a[@class = 'skip']
+strip: //div[@id = 'logo']
+strip: //div[contains(@class, 'comments') and contains(., 'You are not currently logged in')]
+test_url: http://www.metafilter.com/128101/Probably-more-secure-than-the-Drafts-folder-on-a-shared-Gmail-account
\ No newline at end of file
--- /dev/null
+body: (//td[starts-with(@id, 'postmessage_')])[1]
+
+prune: no
+
+test_url: http://mforum.cari.com.my/forum.php?mod=viewthread&tid=788033
+test_url: http://mforum.cari.com.my/forum.php?mod=rss&fid=265&auth=0
\ No newline at end of file
-title: //div[@class="blogtitle"]\r
-strip: //div[@class="blogtitle"]\r
-\r
+title: //div[@class="blogtitle"]
+strip: //div[@class="blogtitle"]
+
author: substring-after(//span[@class="blogheader"], 'Author: ')
test_url: http://www.mikeash.com/pyblog/friday-qa-2012-01-13-the-mac-toolbox.html
\ No newline at end of file
-title: //div[@class='post_content']/h2\r
-date: //div[@class='dateline']\r
-body: //div[@class='entry']\r
-\r
-strip: //div[@class='closer']\r
-strip: //div[@class='navigation']\r
-strip: //div[@class='aux_pane']\r
+title: //div[@class='post_content']/h2
+date: //div[@class='dateline']
+body: //div[@class='entry']
+
+strip: //div[@class='closer']
+strip: //div[@class='navigation']
+strip: //div[@class='aux_pane']
strip: //div[@class='aux_aux_pane']
test_url: http://www.mikeindustries.com/blog/archive/2011/10/never-be-another
\ No newline at end of file
-title: //*[@class="article"]/h1\r
-date: //*[@class="article"]/div[@class="date"]\r
-\r
-# strip the title and date from the article text\r
-strip: //*[@class="article"]/h1\r
-strip: //*[@class="article"]/div[@class="date"]\r
-\r
-# strip annoying <br> between metadata and article\r
+title: //*[@class="article"]/h1
+date: //*[@class="article"]/div[@class="date"]
+
+# strip the title and date from the article text
+strip: //*[@class="article"]/h1
+strip: //*[@class="article"]/div[@class="date"]
+
+# strip annoying <br> between metadata and article
strip: //*[@class="article"]/div[@class="date"]/following-sibling::br
test_url: http://minnesota.publicradio.org/display/web/2012/06/19/health/senators-want-health-care-ruling-on-tv/
\ No newline at end of file
-title: //*[@id="content-header"]/h1\r
-author: //*[contains(@class, 'byline')]/a/text()\r
-date: substring-after(//*[contains(@class, 'byline')]/text()[2], '|')\r
+title: //*[@id="content-header"]/h1
+author: //*[contains(@class, 'byline')]/a/text()
+date: substring-after(//*[contains(@class, 'byline')]/text()[2], '|')
body: //*[contains(@class, 'node-body')]
test_url: http://www.minnpost.com/eric-black-ink/2012/06/overturning-obamacare-would-be-game-changer-supreme-court
\ No newline at end of file
-# Remove extra links\r
+# Remove extra links
strip: //*[@class='appended_html']
test_url: http://www.mirrorfootball.co.uk/news/West-Ham-crisis-Carlton-Cole-slams-diabolical-performance-and-rips-into-Avram-Grant-lack-of-tactical-nous-following-Liverpool-mauling-article636151.html
\ No newline at end of file
-strip_id_or_class: 'book-ad'\r
-strip_id_or_class: 'bigger pullquote'\r
-strip_id_or_class: 'subscribe'\r
+strip_id_or_class: 'book-ad'
+strip_id_or_class: 'bigger pullquote'
+strip_id_or_class: 'subscribe'
strip_id_or_class: 'blog-link'
test_url: http://mises.org/daily/4804
\ No newline at end of file
-title: //h1[@class='article-headline']\r
-date: //span[@class='timeStamp']\r
-author: substring-before(//p[@class='article-byline'], '/')\r
-body: //div[@id='article']\r
-#strip: //div[@class='inner']\r
-strip: //div[@id='article_head']\r
-strip: //p[@class='tagLine']\r
-strip: //div[@id='article_related_links']\r
-strip: //div[@id='article_related_mlb']\r
-strip: //span[@class='more']\r
-strip: //div[@class='article_component']\r
-strip: //span[@class='screen_reader']\r
-strip: //ul[@class='columnists_blurb']\r
+title: //h1[@class='article-headline']
+date: //span[@class='timeStamp']
+author: substring-before(//p[@class='article-byline'], '/')
+body: //div[@id='article']
+#strip: //div[@class='inner']
+strip: //div[@id='article_head']
+strip: //p[@class='tagLine']
+strip: //div[@id='article_related_links']
+strip: //div[@id='article_related_mlb']
+strip: //span[@class='more']
+strip: //div[@class='article_component']
+strip: //span[@class='screen_reader']
+strip: //ul[@class='columnists_blurb']
test_url: http://mlb.mlb.com/news/article.jsp?ymd=20120403&content_id=27880830
\ No newline at end of file
-title: //h1[@id = 'stream_title']\r
-author: //p[@class = 'byline']/a\r
-date: //span[@class = 'datetime']\r
-\r
-body: //div[@id = 'stream_container']\r
-strip: //p[@class = 'byline']\r
-strip_id_or_class: stream_summary\r
-strip_id_or_class: social-spoken\r
-strip_id_or_class: datetime\r
-strip_id_or_class: author-mini-profile\r
-strip_id_or_class: social-tools\r
-strip_id_or_class: entry-tags\r
+title: //h1[@id = 'stream_title']
+author: //p[@class = 'byline']/a
+date: //span[@class = 'datetime']
+
+body: //div[@id = 'stream_container']
+strip: //p[@class = 'byline']
+strip_id_or_class: stream_summary
+strip_id_or_class: social-spoken
+strip_id_or_class: datetime
+strip_id_or_class: author-mini-profile
+strip_id_or_class: social-tools
+strip_id_or_class: entry-tags
strip_id_or_class: fb-like-box
test_url: http://mlb.sbnation.com/2011/10/17/2495845/2011-world-series-st-louis-cardinals-texas-rangers-home-field-advantage
\ No newline at end of file
-title: //*[@class="header_title"]/h1\r
-date: //*[@class="field-date"]\r
-author: //*[@class="field-author"]\r
-body: //div[contains(@class, 'content')]\r
+title: //*[@class="header_title"]/h1
+date: //*[@class="field-date"]
+author: //*[@class="field-author"]
+body: //div[contains(@class, 'content')]
test_url: http://www.mlssoccer.com/news/article/2012/06/19/lack-depth-front-forces-arena-alter-las-formation
\ No newline at end of file
-title: //h1\r
-body: //div[@id = 'article_content']/div[contains(@class,'article')]\r
-author: //sub[@class = 'article_promoted_text']/a[starts-with(@href, 'member')]\r
+title: //h1
+body: //div[@id = 'article_content']/div[contains(@class,'article')]
+author: //sub[@class = 'article_promoted_text']/a[starts-with(@href, 'member')]
date: //div[@class = 'article_username_container']
test_url: http://www.mmo-champion.com/content/2688-Other-Press-Tour-Interviews-A-Night-in-Mists-of-Pandaria-Blue-Posts-MoP-Screenshot
\ No newline at end of file
-tidy: no\r
-author: //div[@id="above-content"]//img/@alt | //div[@class="comment-auth"]/span[1]/a/text()\r
-date: //div[@class="comment-auth"]/div | //div[@class="comment-auth"]/span[2]\r
-body: //div[@class="node"]\r
-\r
-strip_id_or_class: vertical-social-bar\r
-strip_id_or_class: blogs_paginator\r
-strip_id_or_class: horizontal-social-links\r
-strip_id_or_class: servicelinksdiv\r
+tidy: no
+author: //div[@id="above-content"]//img/@alt | //div[@class="comment-auth"]/span[1]/a/text()
+date: //div[@class="comment-auth"]/div | //div[@class="comment-auth"]/span[2]
+body: //div[@class="node"]
+
+strip_id_or_class: vertical-social-bar
+strip_id_or_class: blogs_paginator
+strip_id_or_class: horizontal-social-links
+strip_id_or_class: servicelinksdiv
test_url: http://www.mnn.com/green-tech/research-innovations/blogs/5-breakthroughs-that-will-make-solar-power-cheaper-than-coal
\ No newline at end of file
-title: //title\r
-\r
-author: //div[@class="author"]\r
-\r
-strip_id_or_class: 'header'\r
-strip_id_or_class: 'cikk_ajanlo'\r
-strip_id_or_class: 'buttons'\r
-strip_id_or_class: 'related'\r
-strip_id_or_class: 'adbox ad_cikk_kozepre'\r
-strip_id_or_class: 'cikk-cimkek'\r
-strip_id_or_class: 'cikk_ertekeles'\r
-\r
+title: //title
+
+author: //div[@class="author"]
+
+strip_id_or_class: 'header'
+strip_id_or_class: 'cikk_ajanlo'
+strip_id_or_class: 'buttons'
+strip_id_or_class: 'related'
+strip_id_or_class: 'adbox ad_cikk_kozepre'
+strip_id_or_class: 'cikk-cimkek'
+strip_id_or_class: 'cikk_ertekeles'
+
strip_comments: yes
test_url: http://mno.hu/grund/a-gumibottal-hadonaszo-rendort-joval-konnyebb-utalni-1055351
\ No newline at end of file
--- /dev/null
+title: //h1[contains(@class, 'headline')]
+body: //article[contains(@class, 'full-art')]
+strip_id_or_class: image-credit
+test_url: http://mobile.nytimes.com/2014/06/19/opinion/gail-collins-romney-and-the-2016-contenders-huddle.html
\ No newline at end of file
-title: //h2[@class="article_title"]\r
-strip: //a[@class="houseAdLink"]\r
-strip: //h1\r
+title: //h2[@class="article_title"]
+strip: //a[@class="houseAdLink"]
+strip: //h1
strip: //div[@class="more_articles"]
test_url: http://mobile.slate.com/rss.jsp?rssid=411&item=http%3a%2f%2fwww.slate.com%2fdefault.aspx%3fdisplaymode%3d201%26id%3d2293749%26device%3drss
\ No newline at end of file
-body: //div[@class='post uncustomized-post-template']\r
-\r
-# remove duplicate of post title, which is a link\r
-strip: //h3[@class='post-title']\r
-\r
-# remove permalink and timestamp, which isn't useful as it's a time with no date\r
-strip: //span[@class='post-timestamp']\r
-\r
-# remove labels (tags)\r
+body: //div[@class='post uncustomized-post-template']
+
+# remove duplicate of post title, which is a link
+strip: //h3[@class='post-title']
+
+# remove permalink and timestamp, which isn't useful as it's a time with no date
+strip: //span[@class='post-timestamp']
+
+# remove labels (tags)
strip: //span[@class='post-labels']
test_url: http://mobileopportunity.blogspot.com/2010/12/rims-q3-financials-tale-of-two.html
\ No newline at end of file
-title: //meta[@property="og:title"]/@content\r
-author: //meta[@name="author"]/@content\r
-date: //span[@class='date1']\r
-body: //div[@id='newsimage'] | //div[@id='bodytext']\r
-tidy: no\r
-prune: no\r
-\r
+title: //meta[@property="og:title"]/@content
+author: //meta[@name="author"]/@content
+date: //span[@class='date1']
+body: //div[@id='newsimage'] | //div[@id='bodytext']
+tidy: no
+prune: no
+
test_url: http://www.modernghana.com/news/323765/1/039ghost039-teachers-removed-salaries-allowances-p.html
\ No newline at end of file
-title: //meta[@property="og:title"]/@content\r
-title: //h1[@class='storyheadline']\r
-author: //meta[@name="AUTHOR"]/@content\r
-date: //span[@class='cnnDateStamp']\r
-date: //meta[@name="DATE"]/@content\r
-body: //div[@id='storytext' or @class='storytext']\r
-\r
-strip_id_or_class: ie_column\r
-strip_id_or_class: sharewidgets\r
-strip_image_src: bug.gif\r
-\r
-strip: //div[@class="hed_side"]\r
-strip: //span[@class="byline"]\r
-strip: //a[@class="soc-twtname"]\r
-strip: //span[@class="cnnDateStamp"]\r
-strip: //div[@class="storytimestamp"]\r
-strip: //div[@class="cnnCol_side"]\r
-\r
-prune: no\r
-tidy: no\r
-\r
-test_url: http://money.cnn.com/2011/03/15/news/companies/steve_jobs_thought_process.fortune/index.htm?section=money_topstories&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fmoney_topstories+%28Top+Stories%29\r
-test_url: http://money.cnn.com/2012/01/27/markets/markets_newyork/index.htm\r
+title: //meta[@property="og:title"]/@content
+title: //h1[@class='storyheadline']
+author: //meta[@name="AUTHOR"]/@content
+date: //span[@class='cnnDateStamp']
+date: //meta[@name="DATE"]/@content
+body: //div[@id='storytext' or @class='storytext']
+
+strip_id_or_class: ie_column
+strip_id_or_class: sharewidgets
+strip_image_src: bug.gif
+
+strip: //div[@class="hed_side"]
+strip: //span[@class="byline"]
+strip: //a[@class="soc-twtname"]
+strip: //span[@class="cnnDateStamp"]
+strip: //div[@class="storytimestamp"]
+strip: //div[@class="cnnCol_side"]
+
+prune: no
+tidy: no
+
+test_url: http://money.cnn.com/2011/03/15/news/companies/steve_jobs_thought_process.fortune/index.htm?section=money_topstories&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fmoney_topstories+%28Top+Stories%29
+test_url: http://money.cnn.com/2012/01/27/markets/markets_newyork/index.htm
test_url: http://money.cnn.com/2012/05/13/technology/yahoo-ceo-out-rumor/index.htm
\ No newline at end of file
-strip_image_src: menu\r
-strip_image_src: templates\r
-strip: //div/a\r
-strip: //div/b\r
-strip: //div/strong\r
-strip: //td[@width='30%']\r
-strip: //br[1]\r
-strip: //br[2]\r
-strip: //br[3]\r
-strip: //br[4]\r
-strip: //a[@href='http://www.moonsault.de/newzboard/index.php?act=home']\r
+strip_image_src: menu
+strip_image_src: templates
+strip: //div/a
+strip: //div/b
+strip: //div/strong
+strip: //td[@width='30%']
+strip: //br[1]
+strip: //br[2]
+strip: //br[3]
+strip: //br[4]
+strip: //a[@href='http://www.moonsault.de/newzboard/index.php?act=home']
strip_id_or_class: cse-branding-right
test_url: http://www.moonsault.de/newzboard/index.php?news=22321&act=previous
\ No newline at end of file
-title: //h1[@class='print-title']\r
-body: //div[@class='print-submitted' or @class='print-created' or @class='print-content']\r
-prune: no\r
-\r
-single_page_link: //li[@class='print']/a\r
-\r
+title: //h1[@class='print-title']
+body: //div[@class='print-submitted' or @class='print-created' or @class='print-content']
+prune: no
+
+single_page_link: //li[@class='print']/a
+
test_url: http://moreintelligentlife.com/content/places/paul-markillie/they-trash-cars-dont-they
\ No newline at end of file
-author: //span[@class="author"]/a\r
-date: //span[@class="date"]\r
-body: //div[@class="story-content"]\r
-strip: //aside\r
+author: //span[@class="author"]/a
+date: //span[@class="date"]
+body: //div[@class="story-content"]
+strip: //aside
test_url: http://motherboard.vice.com/blog/you-can-carry-a-copy-of-the-pirate-bay-in-your-pocket
\ No newline at end of file
-title: //h2[contains(@class,'post_headline')]\r
-body: //div[@class='entry']\r
-convert_double_br_tags: yes\r
-strip_image_src: _selected.gif\r
-strip_id_or_class: addthis_\r
+title: //h2[contains(@class,'post_headline')]
+body: //div[@class='entry']
+convert_double_br_tags: yes
+strip_image_src: _selected.gif
+strip_id_or_class: addthis_
strip: //a[contains(@href,'feedburner.com')]
test_url: http://mothering.com/all-things-mothering/inspiration/motherhood-brings-me-down
\ No newline at end of file
-title: //h1\r
-body: //div[@id = 'content-area']\r
-next_page_link: //div[@class='node-pager']/a[contains(@class, 'next')]\r
-tidy: no\r
-author: //p[contains(@class, 'byline')]/a\r
-\r
-strip_id_or_class: node-header\r
-strip_id_or_class: hdr-tools\r
-strip_id_or_class: node-body-break\r
-strip_id_or_class: pullquote\r
-strip_id_or_class: node-pager\r
-strip_id_or_class: author-bio\r
-strip_id_or_class: node-footer\r
+title: //h1
+body: //div[@id = 'content-area']
+next_page_link: //div[@class='node-pager']/a[contains(@class, 'next')]
+tidy: no
+author: //p[contains(@class, 'byline')]/a
+
+strip_id_or_class: node-header
+strip_id_or_class: hdr-tools
+strip_id_or_class: node-body-break
+strip_id_or_class: pullquote
+strip_id_or_class: node-pager
+strip_id_or_class: author-bio
+strip_id_or_class: node-footer
test_url: http://motherjones.com/politics/2012/02/mac-mcclelland-free-online-shipping-warehouses-labor
\ No newline at end of file
--- /dev/null
+# This filter is tested on:
+# http://movie.douban.com/review/1062013/
+
+title: //span[contains(@property, 'v:summary')]
+author: //span[contains(@property, 'v:reviewer')]
+date://span[contains(@property, 'v:dtreviewed')]
+body://div[contains(@class, 'main-bd')]
+
+strip://img[contains(@class,'rating')]|//img[contains(@class,'review-stat')]
+convert_double_br_tags: yes
+test_url: http://movie.douban.com/review/1062013/
+test_url: http://movie.douban.com/review/1021870/
\ No newline at end of file
-body: //div[class="mainBody"]\r
+body: //div[@class="mainBody"]
footnotes: no
test_url: http://msdn.microsoft.com/en-us/library/hh542796(VS.103).aspx
\ No newline at end of file
-title: //title\r
-author: //div[@id='byline']\r
-\r
-date: //div[contains(@class,'timestamp')]/abbr/text()\r
-\r
-body: //div[@id='intellitTXT']\r
-\r
-strip: //div[@id='byline']\r
-strip: //div[contains(@class,'timestamp')]\r
-strip: //div[contains(@class, 'ad-label')]\r
-strip: //div[contains(@class, 'ad-break')]\r
-strip: //span[contains(@class, 'x-video')]\r
-strip: //span[contains(@class, 'inline')]\r
-strip: //div[contains(@class, 'video')]\r
-strip: //div[contains(@class, 'discuss')]\r
-strip: //div[@id='most-popular']\r
-strip: //div[contains(@class,'drawer')]\r
-strip: //*[contains(@class, 'hide')]\r
-\r
+title: //title
+author: //div[@id='byline']
+
+date: //div[contains(@class,'timestamp')]/abbr/text()
+
+body: //div[@id='intellitTXT']
+
+strip: //div[@id='byline']
+strip: //div[contains(@class,'timestamp')]
+strip: //div[contains(@class, 'ad-label')]
+strip: //div[contains(@class, 'ad-break')]
+strip: //span[contains(@class, 'x-video')]
+strip: //span[contains(@class, 'inline')]
+strip: //div[contains(@class, 'video')]
+strip: //div[contains(@class, 'discuss')]
+strip: //div[@id='most-popular']
+strip: //div[contains(@class,'drawer')]
+strip: //*[contains(@class, 'hide')]
+
footnotes: no
test_url: http://www.msnbc.msn.com/id/44748412/ns/business-world_business/#.TolUv-vfDbE
\ No newline at end of file
--- /dev/null
+body: //div[@id='WNStoryBody']
+author: //div[@id='WNStoryByline']
+prune: no
+
+test_url: http://www.myfoxatlanta.com/category/233685/local-news?clienttype=rss
\ No newline at end of file
-body: //div[@class="col1"]//div[@class="photo"] | //div[@class="detail"]/p[@class="fontStyle21"] | //div[@class="story last"]\r
-tidy: no\r
-\r
+body: //div[@class="col1"]//div[@class="photo"] | //div[@class="detail"]/p[@class="fontStyle21"] | //div[@class="story last"]
+tidy: no
+
test_url: http://www.myfoxboston.com/dpp/news/local/transit-police-say-woman-spat-on-mbta-bus-driver-2010611
\ No newline at end of file
-title: //h2[contains(@class, 'name')]\r
-body: //div[@class='printFullPageContentContainer']//div[contains(@class, 'recipe')]\r
-\r
-strip_id_or_class: photoBy\r
-strip_id_or_class: link\r
-\r
-single_page_link: //li[@class='print']/a[contains(@href, '/print/')]\r
-\r
-prune: no\r
-tidy: no\r
-\r
+title: //h2[contains(@class, 'name')]
+body: //div[@class='printFullPageContentContainer']//div[contains(@class, 'recipe')]
+
+strip_id_or_class: photoBy
+strip_id_or_class: link
+
+single_page_link: //li[@class='print']/a[contains(@href, '/print/')]
+
+prune: no
+tidy: no
+
test_url: http://www.myrecipes.com/recipe/hummingbird-cake-10000000387218/
\ No newline at end of file
-title: //div[@class='address']/span\r
-author: substring-before(//span[@class='credits'],',')\r
-date: //div[@class='promodatepress']/span\r
-body: //div[@class='default_style_wrap']\r
-strip: //div[@class='text_adjust']\r
-strip: //div[@class='skiplink']\r
+title: //div[@class='address']/span
+author: substring-before(//span[@class='credits'],',')
+date: //div[@class='promodatepress']/span
+body: //div[@class='default_style_wrap']
+strip: //div[@class='text_adjust']
+strip: //div[@class='skiplink']
strip: //h2
test_url: http://www.nasa.gov/mission_pages/kepler/news/kepler-21b.html
\ No newline at end of file
-date://span[contains(@class,'date')]\r
-\r
-body://div[contains(@class,'contWarp')]\r
-\r
-strip://div[contains(@class,'keyWord')]\r
-strip://div[contains(@class,'submitComt')]\r
-strip://div[contains(@class,'cmts')]\r
-strip://div[contains(@class,'notice')]\r
+date://span[contains(@class,'date')]
+
+body://div[contains(@class,'contWarp')]
+
+strip://div[contains(@class,'keyWord')]
+strip://div[contains(@class,'submitComt')]
+strip://div[contains(@class,'cmts')]
+strip://div[contains(@class,'notice')]
strip://div[contains(@class,'part pt-second')]
test_url: http://www.nbweekly.com/news/china/201203/29316.aspx
\ No newline at end of file
-#host configuration should be http://www.neh.gov/news/humanities/\r
-\r
-\r
-#meta data \r
-title:substring-after(substring-after(//title,':'),':')\r
-author:substring-after(//h2[@class = 'subHead'],'By')\r
-date:substring-before(substring-after(//title,':'),':')\r
-\r
-#img and caption handling\r
-wrap_in(small)://div[@id = 'mainContent']/table/descendant::p/descendant::text()\r
-wrap_in(fieldset)://div[@id = 'mainContent']/table\r
-\r
-# clean up\r
-strip: //table[@class = 'marginpaddingTop']\r
-strip: //h2[@class = 'subHead']\r
+#host configuration should be http://www.neh.gov/news/humanities/
+
+
+#meta data
+title:substring-after(substring-after(//title,':'),':')
+author:substring-after(//h2[@class = 'subHead'],'By')
+date:substring-before(substring-after(//title,':'),':')
+
+#img and caption handling
+wrap_in(small)://div[@id = 'mainContent']/table/descendant::p/descendant::text()
+wrap_in(fieldset)://div[@id = 'mainContent']/table
+
+# clean up
+strip: //table[@class = 'marginpaddingTop']
+strip: //h2[@class = 'subHead']
test_url: http://www.neh.gov/news/humanities/2011-11/IslamicScholar.html
\ No newline at end of file
-title: //*[@class="header_title"]/h1\r
+title: //*[@class="header_title"]/h1
body: //div[contains(@class, 'content')]
test_url: http://neomoney.co/personal/expatriate-and-migrant-loans/expatriate-loans/
\ No newline at end of file
-title: //div[@class='content-title']\r
-#date: substring-after(//div[@class='dernek-text-under'],'Posted on')\r
-body: //div[@class='content-item']\r
-next_page_link: //li[@class='next']/a\r
-convert_double_br_tags: yes\r
+title: //div[@class='content-title']
+#date: substring-after(//div[@class='dernek-text-under'],'Posted on')
+body: //div[@class='content-item']
+next_page_link: //li[@class='next']/a
+convert_double_br_tags: yes
test_url: http://www.net-security.org/article.php?id=1732
\ No newline at end of file
-title: //h1\r
-author: //div[@class="submitted"]/span\r
-\r
-# seems like this should work, but nothing is returned. Issue with xpath parser?\r
-date: //div[@class="submitted"]/time\r
-\r
-body: //div[@id="main-content"]\r
-\r
-strip_comments: no\r
-\r
-strip: //h1\r
-strip: //div[@class="submitted"]\r
-strip: //dd[@class="profile-avatar"]\r
-strip: //div[@class="author-profile"]/dl/dt[1]\r
+title: //h1
+author: //div[@class="submitted"]/span
+
+# seems like this should work, but nothing is returned. Issue with xpath parser?
+date: //div[@class="submitted"]/time
+
+body: //div[@id="main-content"]
+
+strip_comments: no
+
+strip: //h1
+strip: //div[@class="submitted"]
+strip: //dd[@class="profile-avatar"]
+strip: //div[@class="author-profile"]/dl/dt[1]
strip: //div[@id="right-col"]
test_url: http://www.netmagazine.com/opinions/nielsen-wrong-mobile
\ No newline at end of file
-title: //h1[@class='entry-title']\r
-author: //a[@ref='author']\r
-date: //span[@class='entry-date']\r
-body: //div[@class='entry-content']\r
+title: //h1[@class='entry-title']
+author: //a[@rel='author']
+date: //span[@class='entry-date']
+body: //div[@class='entry-content']
test_url: http://netzpolitik.org/2011/buch-generation-facebook/
\ No newline at end of file
--- /dev/null
+title: //div[contains(@class, 'article_header')]//h3
+
+test_url: http://www.newleftproject.org/index.php/site/article_comments/do_we_need_a_facebook_of_the_left
\ No newline at end of file
-title: //div[@id="maincontent"]/h1\r
-body: //div[@id="maincontent"]\r
-date: //div[@id="maincontent"]/p[2]\r
-author: //ul[@id="contributors"]/li/p/b\r
-\r
-strip: //p[@*]\r
-strip: //h1\r
+title: //div[@id="maincontent"]/h1
+body: //div[@id="maincontent"]
+date: //div[@id="maincontent"]/p[2]
+author: //ul[@id="contributors"]/li/p/b
+
+strip: //p[@*]
+strip: //h1
strip: //div[@id="maincontent"]/div
test_url: http://newmatilda.com/2011/07/22/turnbull-makes-sense-climate
\ No newline at end of file
--- /dev/null
+author: //span[@class="authors"]
+date: //span[@class="date"]
+body: //div[@class="primary"]
+
+strip: //div[@id="controls"]
+strip: //div[@id="read-next"]
+
+test_url: http://www.newrepublic.com/article/112731/moocs-will-online-education-ruin-university-experience
\ No newline at end of file
-title: //div[@id="main-content"]//h2\r
-\r
-author: //div[@id="main-content"]//span[@class="authors"]\r
-\r
-date: //div[@id="main-content"]//span[@class="timestamp"]\r
-\r
+title: //div[@id="main-content"]//h2
+
+author: //div[@id="main-content"]//span[@class="authors"]
+
+date: //div[@id="main-content"]//span[@class="timestamp"]
+
body: //div[@id="main-content"]//div[@class="content"]
test_url: http://www.news-gazette.com/news/business/economy/2011-08-08/ibm-drops-out-blue-waters-project.html
\ No newline at end of file
-#This should apply to *.cnet.com. Not just news.cnet.com.\r
-title: //h1\r
-author: //img[@class="mugshot"]/@alt\r
-strip: //h1\r
-strip_id_or_class: breadcrumb\r
-strip: //p[@id="introP"]\r
-strip: //div[@class="postByline"]\r
-strip: //div[@class="editorBio"]\r
-strip: //div[@class="inline-slideshow"]\r
-strip: //div[@class="related"]\r
+#This should apply to *.cnet.com. Not just news.cnet.com.
+title: //h1
+author: //img[@class="mugshot"]/@alt
+strip: //h1
+strip_id_or_class: breadcrumb
+strip: //p[@id="introP"]
+strip: //div[@class="postByline"]
+strip: //div[@class="editorBio"]
+strip: //div[@class="inline-slideshow"]
+strip: //div[@class="related"]
body: //div[@class="postBody txtWrap"]
test_url: http://news.cnet.com/8301-27076_3-57405303-248/apple-ipad-charging-fine-keep-it-plugged-in/?tag=mncol;posts
\ No newline at end of file
-title://div[@class="content_detail"]/h1\r
-\r
-author://div[@class="author"]/strong\r
-\r
-date:substring-before(substring-after(//div[@class="content_detail"]/span[@class="date"], ','), ' WIB')\r
-\r
+title://div[@class="content_detail"]/h1
+
+author://div[@class="author"]/strong
+
+date:substring-before(substring-after(//div[@class="content_detail"]/span[@class="date"], ','), ' WIB')
+
body://div[@class="text_detail"]
test_url: http://news.detik.com/read/2012/05/22/225531/1922307/10/menkeu-cek-soal-lolosnya-315-kg-sabu-dari-bea-cukai
\ No newline at end of file
-body: //div[@id='main']\r
-strip: //div[@id='sbs']\r
-strip: //div[@id='fsizeSwitch']\r
-strip: //div[@id='googleAd']\r
-strip: //div[@id='detailFoot']\r
-strip_image_src: counter?key\r
-convert_double_br_tags: yes\r
+body: //div[@id='main']
+strip: //div[@id='sbs']
+strip: //div[@id='fsizeSwitch']
+strip: //div[@id='googleAd']
+strip: //div[@id='detailFoot']
+strip_image_src: counter?key
+convert_double_br_tags: yes
test_url: http://news.kanaloco.jp/localnews/article/1105200018/
\ No newline at end of file
-title: //h2[@class="lyt-hdg-02-04"]\r
-\r
-author: //div[@class="lyt-namearea"]/a\r
-\r
-date: //div[@class="lyt-namearea"]/text()\r
-\r
-body: //div[@class="articleContent"]\r
-\r
-strip: //div[@id="tab-aside"]\r
+title: //h2[@class="lyt-hdg-02-04"]
+
+author: //div[@class="lyt-namearea"]/a
+
+date: //div[@class="lyt-namearea"]/text()
+
+body: //div[@class="articleContent"]
+
+strip: //div[@id="tab-aside"]
test_url: http://news.mynavi.jp/articles/2011/12/07/nico/index.html
\ No newline at end of file
-single_page_link: //div[@id='content']//p[@class='readMore']/a\r
-\r
-title: //div[@class='hidden offscreen']/h2\r
-body: //div[@id="storyText"]\r
-move_into(//div[@id='storyText']): //div[@class='fact']\r
-strip: //small[@class='credit']\r
-strip: //small[@class='caption']\r
-date: substring-after(//div[@id='storyMeta']//p[@class='date'],'Publiziert am')\r
-strip: //p[@class='toplink']\r
+single_page_link: //div[@id='content']//p[@class='readMore']/a
+
+title: //div[@class='hidden offscreen']/h2
+body: //div[@id="storyText"]
+move_into(//div[@id='storyText']): //div[@class='fact']
+strip: //small[@class='credit']
+strip: //small[@class='caption']
+date: substring-after(//div[@id='storyMeta']//p[@class='date'],'Publiziert am')
+strip: //p[@class='toplink']
test_url: http://news.orf.at/stories/2084731/
\ No newline at end of file
-body: //article\r
-title: //h1\r
-author: //span[@class='b-article-source-dropdown']\r
-strip: //span[@class='b-article-photo-incut__source']\r
-strip: //a[@class='b-read-more b-read-more_bottom']\r
-\r
-\r
+body: //article
+title: //h1
+author: //span[@class='b-article-source-dropdown']
+strip: //span[@class='b-article-photo-incut__source']
+strip: //a[@class='b-read-more b-read-more_bottom']
+
+
tidy:no
test_url: http://news.rambler.ru/12972208/
\ No newline at end of file
-body: //div[@class='main']/div[@class='item']\r
-strip: //div[@class='right']\r
-\r
+body: //div[@class='main']/div[@class='item']
+strip: //div[@class='right']
+
test_url: http://news.techmeme.com/110516/fh-rip
\ No newline at end of file
-title: //meta[@property='og:title']/@content\r
-title: //h1[@class='headline']\r
-author: //cite[contains(@class, 'byline') and contains(@class, 'vcard')]//span[@class='fn']\r
-date: //cite[contains(@class, 'byline') and contains(@class, 'vcard')]//abbr/@title\r
-body: //div[@id='mediaarticlelead']//a[@class='media'] | //div[contains(@class,'yom-art-content')]\r
-#strip: //cite/abbr\r
-strip_id_or_class: action\r
-strip_id_or_class: prefetch\r
-tidy: no\r
-prune: no\r
+title: //meta[@property='og:title']/@content
+title: //h1[@class='headline']
+author: //cite[contains(@class, 'byline') and contains(@class, 'vcard')]//span[@class='fn']
+date: //cite[contains(@class, 'byline') and contains(@class, 'vcard')]//abbr/@title
+body: //div[@id='mediaarticlelead']//a[@class='media'] | //div[contains(@class,'yom-art-content')]
+#strip: //cite/abbr
+strip_id_or_class: action
+strip_id_or_class: prefetch
+tidy: no
+prune: no
test_url: http://news.yahoo.com/cold-la-nina-winter-forecast-west-coast-183535067.html
\ No newline at end of file
-strip_comments: no\r
+strip_comments: no
strip: //a[. = 'reply']
test_url: http://news.ycombinator.com/item?id=1516461
\ No newline at end of file
--- /dev/null
+body://div[@class="newsdetail_wrapper"]
+strip://div[@class="more_news"]
+test_url: http://news.zing.vn/xa-hoi/s-phat-nang-xe-may-di-duong-tren-cao-ha-noi/a280838.html#home_noibat1
\ No newline at end of file
--- /dev/null
+title: //h1[@class='title']
+
+body: //img[@id='relPicsMainPic'] | //div[contains(@class, 'storyContent')]
+
+test_url: http://news247.gr/eidiseis/katatheseis_fwtia_htan_apofasismenoi_akomh_kai_na_afairesoyn_zwes_an_thewrousan_oti_to_thuma_htan_antipalos_toys.2433351.html
+test_url: http://news247.gr/?widget=rssfeed&view=feed&contentId=38291
\ No newline at end of file
-date: //meta[@name='og:article:published_time']/@value\r
-\r
-body: //div[@class='itemIntroText' or @class='itemImageBlock' or @class='itemFullText']\r
-\r
-strip_id_or_class: itemImageGallery\r
-\r
-prune: no\r
-\r
+date: //meta[@name='og:article:published_time']/@value
+
+body: //div[@class='itemIntroText' or @class='itemImageBlock' or @class='itemFullText']
+
+strip_id_or_class: itemImageGallery
+
+prune: no
+
test_url: http://www.newsbomb.gr/gossip/story/257234/i-proin-moy-protimoyse-na-serfarei-apo-to-na-kanoyme-sex
\ No newline at end of file
-title: //h1\r
-body: (//div[@class='articleImg']//img)[1] | //p[contains(@class, 'commentTextArticle') or contains(@class, 'articlePublished')] | //div[@id='articleLeftContent']\r
-author: //div[@class='byline']//a[contains(@href, '/user/')]\r
-\r
-strip_id_or_class: facts\r
-strip_id_or_class: articleBlogsHolder\r
-strip_id_or_class: byline\r
-\r
-prune: no\r
-tidy: no\r
-\r
+title: //h1
+body: (//div[@class='articleImg']//img)[1] | //p[contains(@class, 'commentTextArticle') or contains(@class, 'articlePublished')] | //div[@id='articleLeftContent']
+author: //div[@class='byline']//a[contains(@href, '/user/')]
+
+strip_id_or_class: facts
+strip_id_or_class: articleBlogsHolder
+strip_id_or_class: byline
+
+prune: no
+tidy: no
+
test_url: http://www.newsmill.se/artikel/2012/05/06/medielogiken-v-ger-tyngre-n-reportrarnas-sikter
\ No newline at end of file
-body: //div[@class='right']//div[@class='articles']\r
-author: //div[@id='artinfo']//a[contains(@href, '/author/')]\r
-strip: //div[@id='artinfo']\r
-strip: //table[//a[contains(@href, 'twitter.com')]]\r
-strip_id_or_class: twitter\r
-\r
-prune: no\r
-tidy: no\r
-\r
+body: //div[@class='right']//div[@class='articles']
+author: //div[@id='artinfo']//a[contains(@href, '/author/')]
+strip: //div[@id='artinfo']
+strip: //table[.//a[contains(@href, 'twitter.com')]]
+strip_id_or_class: twitter
+
+prune: no
+tidy: no
+
test_url: http://www.newsunspun.org/eotn/bbc-headline-change-iran-goes-from-not-building-to-undecided-on-nuclear-bomb
\ No newline at end of file
--- /dev/null
+body: //div[@class = 'article-body']
+title: //h1[@class = 'article-title']
+strip: //aside
+
+test_url: http://www.newsweek.com/day-steve-mcqueen-met-his-new-nazi-neighbor-keith-moon-229741
+test_url: http://www.newsweek.com/2014/06/13/how-greylock-partners-finds-next-facebook-253329.html
--- /dev/null
+prune: no
+tidy: no
+
+title: //h1/a[2]
+body: //div[@id="main"]
+author: //span[@id="articlesource"]
+date: //span[contains(@class, 'releasedate')]
+
+strip: //div[@class="inst-logo"]
+strip: //h1[1]
+
+strip_id_or_class: addthis
+strip_id_or_class: released
+strip_id_or_class: skiptranslate
+strip_id_or_class: flash
+
+test_url: http://www.newswise.com/articles/first-heat-wave-of-season-puts-elderly-at-risk
-title: //h1[@id='articlehed'] | //h2[@id="articleintro"]\r
-body: //div[@id='articletext']\r
-\r
-strip: //ul[@id="bc"] | //div[@id="yrail"] | //div[@class="entry-keywords"] | //div[@class="entry-categories"] | //div[@class="socialUtils"] | //div[@id="footer"]\r
-\r
-date: //h4[@id='articleauthor']/span[@class='dd dds']\r
-date: //div[@id="pagebody"]/div[@class='hentry entry']/div[@class='published']\r
-\r
-single_page_link: //div[@class='paginationViewSinglePage']/a\r
-test_url: http://www.newyorker.com/online/blogs/culture/2012/06/mug-shot-web-sites.html
\ No newline at end of file
+title: //h1[@id='articlehed'] | //h2[@id="articleintro"]
+body: //div[@id='articletext']
+
+strip: //ul[@id="bc"] | //div[@id="yrail"] | //div[@class="entry-keywords"] | //div[@class="entry-categories"] | //div[@class="socialUtils"] | //div[@id="footer"] | //div[@class="cartoon"]
+
+date: //h4[@id='articleauthor']/span[@class='dd dds']
+date: //div[@id="pagebody"]/div[@class='hentry entry']/div[@class='published']
+
+single_page_link: //div[@class='paginationViewSinglePage']/a
+test_url: http://www.newyorker.com/online/blogs/culture/2012/06/mug-shot-web-sites.html
+test_url: http://www.newyorker.com/reporting/2013/04/22/130422fa_fact_bilger?currentPage=all&mobify=0
\ No newline at end of file
-# 2011-08-22 [carlo@...] initial version\r
-# 2011-08-22 [carlo@...] removed comments & social links\r
-\r
-tidy: no\r
-\r
-single_page_link: //a[@class="single active"]\r
-\r
-body: //div[@id="main"]//div[@class="content-region"]/article\r
-author: //span[@class="author-name"]\r
-date: //time/text()\r
-\r
-strip_id_or_class: //aside[@id="related"]\r
-strip: //footer\r
-\r
+# 2011-08-22 [carlo@...] initial version
+# 2011-08-22 [carlo@...] removed comments & social links
+
+tidy: no
+
+single_page_link: //a[@class="single active"]
+
+body: //div[@id="main"]//div[@class="content-region"]/article
+author: //span[@class="author-name"]
+date: //time/text()
+
+strip: //aside[@id="related"]
+strip: //footer
+
title: //h1
test_url: http://www.next-gen.biz/reviews/deus-ex-human-revolution-review
\ No newline at end of file
-# doesn't look like selecting an attribute value works?\r
-# author: //meta[@id="authorName"]@value\r
-\r
-author: substring-after(//li[@id="article-hdr-meta-author"]/text(), "By ")\r
-date: //abbr[@id="article-time"]\r
-title: //div[@id="article-hdr"]/h1\r
-body: //div[@class="articleText"]\r
-\r
-# strip miscellaneous teasers & etc\r
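+# doesn't look like selecting an attribute value works? (the first attempt omitted the '/' before @value, which is not valid XPath; corrected form below is untested)
+# author: //meta[@id="authorName"]/@value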
+
+author: substring-after(//li[@id="article-hdr-meta-author"]/text(), "By ")
+date: //abbr[@id="article-time"]
+title: //div[@id="article-hdr"]/h1
+body: //div[@class="articleText"]
+
+# strip miscellaneous teasers & etc
strip: //div[@class="removeformobile"]
test_url: http://www.nfl.com/news/story/09000d5d82388707/article/close-shave-chiefs-haley-perseveres-through-rough-start?module=HP11_content_stream
\ No newline at end of file
-next_page_link: //div[@class='nextpage_continue']/a\r
-strip: //div[@class='nextpage_continue']\r
-strip_id_or_class: nextpage\r
-title: //div[@class='article_title']//h1\r
-body: //div[@class='article_title']/..\r
+next_page_link: //div[@class='nextpage_continue']/a
+strip: //div[@class='nextpage_continue']
+strip_id_or_class: nextpage
+title: //div[@class='article_title']//h1
+body: //div[@class='article_title']/..
body: //div[@class='content']
test_url: http://ngm.nationalgeographic.com/2012/02/tsunami/folger-text
\ No newline at end of file
-body: //div[@id="main"]\r
-title: //div[@id="main"]/h3\r
-\r
-# Remove ‘Review’ and ‘Wii’.\r
-strip: //div[@class="badge"]\r
-\r
-# Remove duplicate title and country flag.\r
-strip: //h3\r
-\r
-# Commented out below are attempts to extract the author and date, which did not work.\r
-# author: //p[@class="extra "]/a\r
+body: //div[@id="main"]
+title: //div[@id="main"]/h3
+
+# Remove ‘Review’ and ‘Wii’.
+strip: //div[@class="badge"]
+
+# Remove duplicate title and country flag.
+strip: //h3
+
+# Commented out below are attempts to extract the author and date, which did not work.
+# author: //p[@class="extra "]/a
# date: //p[@class="extra "]/span[@class="when"]
test_url: http://www.nintendoworldreport.com/review/28400
\ No newline at end of file
-author: //span[@class='meta']/span[@class='username']\r
-body: //div[@class='article-content']\r
-\r
+author: //span[@class='meta']/span[@class='username']
+body: //div[@class='article-content']
+
strip_id_or_class: 'article-actions'
test_url: http://nojesguiden.se/blogg/maja-bredberg/maja-laser-tidningen-en-helt-vanlig-lordag-i
\ No newline at end of file
-title: //h1\r
-body: //div[@id='pn-maincontent']\r
-strip_id_or_class: z-menu\r
-strip_id_or_class: news_category\r
-strip_id_or_class: news_title\r
-strip_id_or_class: news_modify\r
-strip_id_or_class: news_morearticlesincat\r
-strip_id_or_class: ezc_comments\r
-strip_comments: yes\r
-\r
+title: //h1
+body: //div[@id='pn-maincontent']
+strip_id_or_class: z-menu
+strip_id_or_class: news_category
+strip_id_or_class: news_title
+strip_id_or_class: news_modify
+strip_id_or_class: news_morearticlesincat
+strip_id_or_class: ezc_comments
+strip_comments: yes
+
test_url: http://www.northumberlandview.ca/index.php?module=news&func=display&sid=5972
\ No newline at end of file
--- /dev/null
+title: //div[@id='tab-recept']//h1
+body: //div[@id='tab-recept']//div[contains(@class, 'column-container')]
+strip_id_or_class: ajanlo-box
+prune: no
+
+test_url: http://www.nosalty.hu/recept/szupergyors-fank
\ No newline at end of file
-title: /html/body/div[3]/div/div/h1\r
-\r
-body: //*[@id="article-body"]\r
-\r
+title: /html/body/div[3]/div/div/h1
+
+body: //*[@id="article-body"]
+
test_url: http://nplusonemag.com/the-outskirts-of-progress
\ No newline at end of file
-title: //div[contains(@class, 'storytitle')]//h1\r
-author: //p[@class="byline"]/span\r
-body: //div[@id='storyspan02']//*[@class='duration' or @class='download' or contains(@class, 'photo')] | //div[@id='storytext'] | //div[@class='transcript']\r
-date: //meta[@name="date"]/@content\r
-\r
-strip: //div[@class='enlarge_measure']\r
-strip: //div[@class='enlarge_html']\r
-strip: //a[@class='enlargeicon']\r
-strip: //div[contains(@class, 'bookedition')]\r
-strip: //div[@class='textsize']\r
-strip: //ul[@class='genres']\r
-strip: //span[@class='bull']\r
-strip_id_or_class: secondary\r
-strip_id_or_class: con1col\r
-strip: //h3[@class='conheader']\r
-\r
-replace_string(<a name="more"> </a>): <!-- no more -->\r
-replace_string(<div class="transcript">): <div class="transcript"><h2>Transcript</h2>\r
-\r
-prune: no\r
-strip://div[@class="ecommercepop"]\r
-strip://span[@class="bull"]\r
-strip://span[@class="purchaseLink"]\r
-strip://div[@class="enlarge_html"]\r
-strip://div[@class="enlarge_measure"]\r
-strip://div[@class="container con1col small"]\r
-strip://a[contains(@class, "enlargebtn")]\r
-strip://div[contains(@class, "bucketwrap internallink")]\r
-\r
-test_url: http://www.npr.org/blogs/thetwo-way/2011/07/12/137799301/sports-loses-its-escapist-gleam-in-a-summer-of-court-dates\r
-test_url: http://www.npr.org/2012/07/04/156190948/feeling-under-siege-catholic-leadership-shifts-right\r
-test_url: http://www.npr.org/2012/12/13/166480907/the-years-best-sci-fi-crosses-galaxies-and-genres
\ No newline at end of file
+title: //div[contains(@class, 'storytitle')]//h1
+author: //p[@class="byline"]/span
+body: //div[@id='primaryaudio']//*[@class='duration' or @class='download' or contains(@class, 'photo')] | //div[@id='storytext' or @id='supplementarycontent' or contains(@class, 'transcript')]
+date: //meta[@name="date"]/@content
+
+strip_id_or_class: enlarge_measure
+strip_id_or_class: enlarge_html
+strip: //a[contains(@class, 'enlargeicon')]
+strip: //div[contains(@class, 'bookedition')]
+strip: //div[@class='textsize']
+strip: //ul[@class='genres']
+strip: //span[@class='bull']
+strip_id_or_class: secondary
+strip_id_or_class: con1col
+strip: //h3[@class='conheader']
+
+replace_string(<a name="more"> </a>): <!-- no more -->
+replace_string(<div class="transcript">): <div class="transcript"><h2>Transcript</h2>
+replace_string(<div class="transcript storytext">): <div class="transcript storytext"><h2>Transcript</h2>
+
+prune: no
+strip://div[@class="ecommercepop"]
+strip://span[@class="bull"]
+strip://span[@class="purchaseLink"]
+strip://div[@class="enlarge_html"]
+strip://div[@class="enlarge_measure"]
+strip://div[@class="container con1col small"]
+strip://a[contains(@class, "enlargebtn")]
+strip://div[contains(@class, "bucketwrap internallink")]
+
+test_url: http://www.npr.org/blogs/thetwo-way/2011/07/12/137799301/sports-loses-its-escapist-gleam-in-a-summer-of-court-dates
+test_url: http://www.npr.org/2012/07/04/156190948/feeling-under-siege-catholic-leadership-shifts-right
+test_url: http://www.npr.org/2012/12/13/166480907/the-years-best-sci-fi-crosses-galaxies-and-genres
+test_url: http://www.npr.org/templates/story/story.php?storyId=229103221
\ No newline at end of file
-strip_id_or_class: sIFR-alternate\r
-title: //div[@id='page-title-wrapper']/div[@id='page-title']/h2\r
-single_page_link: //a[contains(@href, 'pagination=false') and not(contains(@href, 'printpage=true'))]\r
-\r
-body: //div[@id = 'article-body']\r
-strip_id_or_class:article-tools\r
-strip_id_or_class:js_target\r
-strip_id_or_class:marker\r
-author://div[@id = 'page-title']/h3\r
-date://div[@id = 'page-title']/h5/a[starts-with(@href,'/issues/')]\r
-\r
-\r
+strip_id_or_class: sIFR-alternate
+title: //div[@id='page-title-wrapper']/div[@id='page-title']/h2
+single_page_link: //a[contains(@href, 'pagination=false') and not(contains(@href, 'printpage=true'))]
+
+body: //div[@id = 'article-body']
+strip_id_or_class:article-tools
+strip_id_or_class:js_target
+strip_id_or_class:marker
+author://div[@id = 'page-title']/h3
+date://div[@id = 'page-title']/h5/a[starts-with(@href,'/issues/')]
+
+
test_url: http://www.nybooks.com/articles/archives/2012/feb/23/were-more-unequal-you-think/
\ No newline at end of file
-title: //h2[contains(@class, 'primary')]\r
-body: //div[@id='story']\r
-author: //*[@class='by']/a\r
-date: substring-after(//*[@class='date'], 'Published')\r
-\r
-next_page_link: //div[@class='page-navigation']//li[@class='next']/a\r
-\r
+title: //h2[contains(@class, 'primary')]
+body: //div[@id='story']
+author: //*[@class='by']/a
+date: substring-after(//*[@class='date'], 'Published')
+
+next_page_link: //div[@class='page-navigation']//li[@class='next']/a
+
test_url: http://nymag.com/news/features/wall-street-2012-2/
\ No newline at end of file
-title: //div[@class="article default-article"]/h1\r
-author: //p[@class="author"]/a[2]\r
-\r
-# Article introduction:\r
-#move_into(//div[@class="article-bread"]): //p[@class="lead"]\r
-\r
+title: //div[@class="article default-article"]/h1
+author: //p[@class="author"]/a[2]
+
+# Article introduction:
+#move_into(//div[@class="article-bread"]): //p[@class="lead"]
+
body: //div[@class="article-bread"]
test_url: http://www.nyteknik.se/nyheter/energi_miljo/energi/article3391426.ece
\ No newline at end of file
-title://h1[@class="articleHeadline"]\r
-body://div[@id="article"]\r
-strip_id_or_class:articleTools\r
-strip_id_or_class:readerscomment\r
-#strip://div[contains(@class, "articleInline runaroundLeft")]\r
-strip: //div[contains(@class, "doubleRule")]\r
-# strip image credit - appears as a bold heading\r
-strip: //div[contains(@class, "articleInline")]//h6\r
-strip_id_or_class:enlargeThis\r
-strip_id_or_class:pageLinks\r
-strip_id_or_class:memberTools\r
-strip_id_or_class:articleExtras\r
-strip_id_or_class:singleAd\r
-strip_id_or_class:byline\r
-strip_id_or_class:dateline\r
-strip_id_or_class:articleheadline\r
-strip_id_or_class:articleBottomExtra\r
-strip://a[contains(@href, 'nytimes.com/adx/')]\r
-strip: //nyt_byline\r
-strip: //span[contains(@class, 'slideshow') or contains(@class, 'video')]\r
-strip: //p[@class='caption']//a[contains(., 'More Photos')]\r
-\r
-prune: no\r
-tidy: no\r
-\r
-date: substring-after(//*[contains(@class, 'dateline')], 'Published:')\r
-\r
-single_page_link: //link[contains(@href, 'pagewanted=all')]\r
-#single_page_link: //a[contains(@href, 'pagewanted=all') and not(contains(@href, 'login'))]\r
-\r
-strip://ul[@id = 'toolsList']\r
-strip://h6[@class = 'kicker']\r
-author:substring-after(//h6[@class='byline'],'By ')\r
-\r
-test_url: http://www.nytimes.com/2011/07/24/books/review/an-academic-authors-unintentional-masterpiece.html\r
-test_url: http://www.nytimes.com/2012/06/10/arts/television/the-newsroom-aaron-sorkins-return-to-tv.html
\ No newline at end of file
+title://h1[@class="articleHeadline"]
+body://div[@id="article"]
+body://*[@itemprop="articleBody"]
+strip_id_or_class:articleTools
+strip_id_or_class:readerscomment
+#strip://div[contains(@class, "articleInline runaroundLeft")]
+strip: //div[contains(@class, "doubleRule")]
+# strip image credit - appears as a bold heading
+strip: //div[contains(@class, "articleInline")]//h6
+strip_id_or_class:enlargeThis
+strip_id_or_class:pageLinks
+strip_id_or_class:memberTools
+strip_id_or_class:articleExtras
+strip_id_or_class:singleAd
+strip_id_or_class:byline
+strip_id_or_class:dateline
+strip_id_or_class:articleheadline
+strip_id_or_class:articleBottomExtra
+strip_id_or_class:shareTools
+strip://a[contains(@href, 'nytimes.com/adx/')]
+strip: //nyt_byline
+strip: //span[contains(@class, 'slideshow') or contains(@class, 'video')]
+strip: //p[@class='caption']//a[contains(., 'More Photos')]
+
+prune: no
+tidy: no
+
+find_string: <script
+replace_string: <div style="display:none"
+find_string: </script>
+replace_string: </div>
+
+date: substring-after(//*[contains(@class, 'dateline')], 'Published:')
+
+single_page_link: //link[contains(@href, 'pagewanted=all')]
+single_page_link: //link[@rel='alternate' and contains(@href, 'mobile.nytimes.com')]/@href
+single_page_link: concat(substring-before(//div[@id='pageLinks']//a[contains(@href, 'pagewanted=')]/@href, 'pagewanted='), 'pagewanted=all')
+#single_page_link: //a[contains(@href, 'pagewanted=all') and not(contains(@href, 'login'))]
+
+strip://ul[@id = 'toolsList']
+strip://h6[@class = 'kicker']
+author:substring-after(//h6[@class='byline'],'By ')
+
+test_url: http://www.nytimes.com/2011/07/24/books/review/an-academic-authors-unintentional-masterpiece.html
+test_url: http://www.nytimes.com/2012/06/10/arts/television/the-newsroom-aaron-sorkins-return-to-tv.html
+test_url: http://www.nytimes.com/2013/03/25/world/middleeast/israeli-military-responds-after-patrols-come-under-fire-from-syria.html
+test_url: http://www.nytimes.com/2013/08/15/nyregion/when-the-new-york-city-subway-ran-without-rails.html
+test_url: http://www.nytimes.com/2004/02/29/weekinreview/correspondence-class-consciousness-china-s-wealthy-live-creed-hobbes-darwin-meet.html
+test_url: http://www.nytimes.com/2014/06/19/opinion/gail-collins-romney-and-the-2016-contenders-huddle.html
\ No newline at end of file
-body: //*[@class='article-full']\r
-title: //h3\r
-strip: //header[@class='group']\r
-#body: //p[@class='lead']\r
-#move_into(//p[@class='lead']): //*[@class='article-full']/figure\r
-#move_into(//p[@class='lead']): //div[@id='articleBodyText']\r
-strip: //div[@id='social-media-floater']\r
-strip: //div[@class='advertisement']\r
-strip: //div[@class='infobox']\r
-strip: //div[@id='articleComments']\r
-\r
+body: //*[@class='article-full']
+title: //h3
+strip: //header[@class='group']
+#body: //p[@class='lead']
+#move_into(//p[@class='lead']): //*[@class='article-full']/figure
+#move_into(//p[@class='lead']): //div[@id='articleBodyText']
+strip: //div[@id='social-media-floater']
+strip: //div[@class='advertisement']
+strip: //div[@class='infobox']
+strip: //div[@id='articleComments']
+
test_url: http://www.nzz.ch/wissen/wissenschaft/sonnenschutz-fuer-die-erde-1.17282213
\ No newline at end of file
-body: //article[contains(@class, 'instapaper_body')]\r
-\r
-prune: no\r
-\r
-single_page_link: //a[@id='print-button']\r
-\r
+body: //article[contains(@class, 'instapaper_body')]
+
+prune: no
+
+single_page_link: //a[@id='print-button']
+
test_url: http://www.observer.com/2008/would-you-take-tumblr-man
\ No newline at end of file
-body: //div[(@id = "content")]\r
-strip: //div[(@class = "links-bar")]\r
-strip: //div[(@class = "povrzani")]\r
-strip: //div[(@class = "povrzani-dolu")]\r
-strip: //div[(@class = "tags")]\r
+body: //div[(@id = "content")]
+strip: //div[(@class = "links-bar")]
+strip: //div[(@class = "povrzani")]
+strip: //div[(@class = "povrzani-dolu")]
+strip: //div[(@class = "tags")]
strip: //h1[(@id = "page-title")]
test_url: http://off.net.mk/zhivot-i-zabava/gadzheti/dzhabe-raboti-dzhabe-ne-dishi
\ No newline at end of file
-title: //div[@id='squeeze']/h1\r
-strip: //div[@id='squeeze']/h1\r
-author: //div[@class='submitted']/a\r
-strip: //div[@class='submitted']/a\r
-convert_double_br_tags: yes\r
-\r
-\r
+title: //div[@id='squeeze']/h1
+strip: //div[@id='squeeze']/h1
+author: //div[@class='submitted']/a
+strip: //div[@class='submitted']/a
+convert_double_br_tags: yes
+
+
test_url: http://omiliya.org/content/predchuvstvie.html
\ No newline at end of file
-body: //div[(@class = "statija")]\r
-strip: //div[(@class = "relatedBlock")]\r
-strip: //div[(@class = "swftools")]\r
+body: //div[(@class = "statija")]
+strip: //div[(@class = "relatedBlock")]
+strip: //div[(@class = "swftools")]
strip: //table[(@class = "links")]
test_url: http://on.net.mk/video/na-trkala/lamborghini-aventador-avionot-shto-ne-leta
\ No newline at end of file
-title: //meta[@property="og:title"]/@content\r
-body: //div[@id='article_story_body']\r
-\r
-author: //h3[@class='byline']/a\r
-# for slid show content\r
-body: //ul[@id='imageSlide']//li[@class='firstSlide']//img | (//div[@class='txt_body']//p)[1]\r
-date: //li[@class='dateStamp']/small\r
-\r
-strip_id_or_class: insetFullBracket\r
-strip_id_or_class: insettipBox\r
-#strip_id_or_class: legacyInset\r
-strip_id_or_class: recipeACShopAndBuyText\r
-\r
-strip: //div[contains(@class, 'insetContent')]//cite\r
-strip: //*[contains(@style, 'visibility: hidden;')]\r
-strip: //div[contains(@class, 'insetContent') and not(contains(@class, 'image'))]\r
-\r
-prune: no\r
-tidy: no\r
-\r
-test_url: http://online.wsj.com/article/SB10001424052970203363504577185322849515102.html\r
-# slide show\r
-test_url: http://online.wsj.com/article/SB10001424052970204791104577110550376458164.html
\ No newline at end of file
+title: //meta[@property="og:title"]/@content
+body: //div[@id='article_story_body']
+
+author: //h3[@class='byline']/a
+# for slide show content
+body: //ul[@id='imageSlide']//li[@class='firstSlide']//img | (//div[@class='txt_body']//p)[1]
+date: //li[@class='dateStamp']/small
+
+strip_id_or_class: insetFullBracket
+strip_id_or_class: insettipBox
+#strip_id_or_class: legacyInset
+strip_id_or_class: recipeACShopAndBuyText
+
+strip: //div[contains(@class, 'insetContent')]//cite
+strip: //*[contains(@style, 'visibility: hidden;')]
+strip: //div[contains(@class, 'insetContent') and not(contains(@class, 'image'))]
+strip: //div[contains(@class, 'carousel')]
+
+prune: no
+tidy: no
+
+test_url: http://online.wsj.com/news/articles/SB10001424052702304626304579509100018004342
+test_url: http://online.wsj.com/article/SB10001424052970203363504577185322849515102.html
+# slide show
+test_url: http://online.wsj.com/article/SB10001424052970204791104577110550376458164.html
--- /dev/null
+title: //h1[@class='entry-title']
+
+author: //a[@rel='author']
+
+date: substring-before(//aside[@class='entry-meta'], '|')
+
+body: //div[@class='entry-content']
+test_url: http://ontologicalgeek.com/change-or-live-final-fantasy-x-as-catholic-dystopia/
\ No newline at end of file
-body: //div[@id = 'content-inner']\r
-strip: //div[@id = 'content-bottom']\r
+body: //div[@id = 'content-inner']
+strip: //div[@id = 'content-bottom']
strip_id_or_class: print_sharebutton
test_url: http://openthemagazine.com/article/nation/sania-vs-saina
\ No newline at end of file
-body: //div[@class="chapter"]\r
-prune: no\r
-tidy: no\r
+body: //div[@class="chapter"]
+prune: no
+tidy: no
test_url: http://openwebx.org/docs/springext.html
\ No newline at end of file
-single_page_link: //div[@id='content']//p[@class='readMore']/a\r
-\r
-title: //div[@class='hidden offscreen']/h2\r
-body: //div[@id="storyText"]\r
-move_into(//div[@id='storyText']): //div[@class='fact']\r
-strip: //small[@class='credit']\r
-strip: //small[@class='caption']\r
-date: substring-after(//div[@id='storyMeta']//p[@class='date'],'Publiziert am')\r
-strip: //p[@class='toplink']\r
+single_page_link: //div[@id='content']//p[@class='readMore']/a
+
+title: //div[@class='hidden offscreen']/h2
+body: //div[@id="storyText"]
+move_into(//div[@id='storyText']): //div[@class='fact']
+strip: //small[@class='credit']
+strip: //small[@class='caption']
+date: substring-after(//div[@id='storyMeta']//p[@class='date'],'Publiziert am')
+strip: //p[@class='toplink']
test_url: http://orf.at/stories/2084731/
\ No newline at end of file
-title: /html/body/div[5]/div[2]/h1\r
-body: /html/body/div[5]/div[2]/div[6]/div/div\r
-body: //*[@id="cikk"]\r
-strip: /html/body/div[5]/div[2]/h1\r
-strip: /html/body/div[5]/div[2]/div[4]\r
-strip: //*[@id="multidoboz"]\r
-strip: /html/body/div[5]/div[2]/div[6]/div[2]\r
-strip: //*[@id="comments"]\r
-strip: //*[@id="rating-doboz"]\r
-strip: /html/body/div[5]/div[2]/div[10]\r
-strip: /html/body/div[5]/div[2]/a\r
-strip: /html/body/div[5]/div[2]/span\r
-strip: /html/body/div[5]/div[2]/span[2]\r
-strip: /html/body/div[5]/div[2]/span[3]\r
-strip: /html/body/div[5]/div[2]/span[4]\r
-strip: /html/body/div[5]/div[2]/span[5]\r
+title: /html/body/div[5]/div[2]/h1
+body: /html/body/div[5]/div[2]/div[6]/div/div
+body: //*[@id="cikk"]
+strip: /html/body/div[5]/div[2]/h1
+strip: /html/body/div[5]/div[2]/div[4]
+strip: //*[@id="multidoboz"]
+strip: /html/body/div[5]/div[2]/div[6]/div[2]
+strip: //*[@id="comments"]
+strip: //*[@id="rating-doboz"]
+strip: /html/body/div[5]/div[2]/div[10]
+strip: /html/body/div[5]/div[2]/a
+strip: /html/body/div[5]/div[2]/span
+strip: /html/body/div[5]/div[2]/span[2]
+strip: /html/body/div[5]/div[2]/span[3]
+strip: /html/body/div[5]/div[2]/span[4]
+strip: /html/body/div[5]/div[2]/span[5]
strip: //*[@id="kommentszam"]
test_url: http://www.origo.hu/itthon/20110119-lemondott-a-kulturaert-felelos-helyettes-allamtitkar.html
\ No newline at end of file
--- /dev/null
+title: //h1
+strip_id_or_class: syntaxhighlighter
+test_url: http://www.oschina.net/translate/event-based-programming-what-async-has-over-sync?print
\ No newline at end of file
-#body: (//div[@class='ftr-yt-vid'])[1]\r
-body: (//blockquote[contains(@class, 'postcontent')])[1]\r
-body: (//div[starts-with(@id, 'post_message')])[1]\r
-\r
-prune: no\r
-tidy: no\r
-\r
-#replace_string(<iframe title="YouTube video player"): <div class="ftr-yt-vid"><iframe title="YouTube video player"\r
-#replace_string(</iframe>): </iframe> </div>\r
-\r
+#body: (//div[@class='ftr-yt-vid'])[1]
+body: (//blockquote[contains(@class, 'postcontent')])[1]
+body: (//div[starts-with(@id, 'post_message')])[1]
+
+prune: no
+tidy: no
+
+#replace_string(<iframe title="YouTube video player"): <div class="ftr-yt-vid"><iframe title="YouTube video player"
+#replace_string(</iframe>): </iframe> </div>
+
test_url: http://pakistantvdekho.com/showthread.php?647741-Sitam-Gar-by-HUM-TV-Episode-07&p=659080#post659080
\ No newline at end of file
--- /dev/null
+title: //h1[@class='entry-title']
+body: //article//div[@class='entry']
+strip_id_or_class: addthis
+strip_id_or_class: gdsrcacheloader
+strip_id_or_class: entry-meta
+strip_id_or_class: entry-tags
+strip_id_or_class: authorbox
+strip: //div[@class='entry']/p[1]
+strip: //img[@width='600' and @height='70']
+# related posts
+strip: //h3[contains(., 'Related posts')]
+strip: //div[contains(@style, 'border: 0pt none ; margin: 0pt; padding: 0pt;')]
+
+prune: no
+tidy: no
+
+test_url: http://pakmedia.tv/tv-one/feed
\ No newline at end of file
-title://h2\r
-author://div[@class="posted"]/a\r
-date://div[@class="date"]\r
+title://h2
+author://div[@class="posted"]/a
+date://div[@class="date"]
body://div[@class="entry"]
test_url: http://pandagon.net/index.php/site/its-okay-to-admit-that-mass-hysteria-is-real
\ No newline at end of file
-tidy: no\r
-body: //article\r
-date: //time/@datetime\r
+tidy: no
+body: //article
+date: //time/@datetime
strip_id_or_class: sharedaddy
test_url: http://pandodaily.com/2012/01/19/ibooks-author-is-not-going-to-hurt-publishers-it-might-even-help-them/
\ No newline at end of file
-body: //div[@class='entry']\r
+body: //div[@class='entry']
date: //h3[@class='postDate']
test_url: http://www.panic.com/blog/2011/07/panic-is-ready-for-lion/
\ No newline at end of file
--- /dev/null
+title: //h2[@class="page_title"]
+body: //div[@class="entry arquivo"]
+author: //span[@class="author"]
+footnotes: yes
+prune: yes
+test_url: http://papodehomem.com.br/um-relato-confessional-sobre-a-maioridade-penal/
\ No newline at end of file
-title: //h2[@class="post-title"]\r
-author: substring-after(//div[@class="description"],'Words by ')\r
-date: //li[@class="date"]\r
-strip: //h2[@class="post-title"]\r
+title: //h2[@class="post-title"]
+author: substring-after(//div[@class="description"],'Words by ')
+date: //li[@class="date"]
+strip: //h2[@class="post-title"]
body: //div[@class="copy"]
test_url: http://parislemon.com/post/13462682469/the-15-inch-air
\ No newline at end of file
-title: //h1\r
+title: //h1
body: //div[@id='news-article']
test_url: http://www.parliament.uk/business/committees/committees-a-z/commons-select/backbench-business-committee/news/guidance-for-e-petitioners/
\ No newline at end of file
-title://div[@class="paste_box_line1"]/h1\r
-author://div[@class="paste_box_line2"]/a\r
-body://div[@class="text"]\r
-date:substring-before(substring-after(//div[@class="paste_box_line2"],'|'),'|')\r
+title://div[@class="paste_box_line1"]/h1
+author://div[@class="paste_box_line2"]/a
+body://div[@class="text"]
+date:substring-before(substring-after(//div[@class="paste_box_line2"],'|'),'|')
dissolve://li
test_url: http://pastebin.com/LAykd1es
\ No newline at end of file
-title: //h1\r
-body: //div[@id='ff-pastepad-content']\r
-prune: no\r
+title: //h1
+body: //div[@id='ff-pastepad-content']
+prune: no
# todo: add test file
test_url: http://pastepad.fivefilters.org/test.html
\ No newline at end of file
-title://*[contains(@class,'post-title')]\r
-body://div[contains(@class,'post-body')]\r
-body://div[contains(@class,'entry-content')]\r
-strip_comments:no\r
-prune:no\r
-convert_double_br_tags:yes\r
+title://*[contains(@class,'post-title')]
+body://div[contains(@class,'post-body')]
+body://div[contains(@class,'entry-content')]
+strip_comments:no
+prune:no
+convert_double_br_tags:yes
tidy:yes
test_url: http://www.pathawks.com/2011/06/crazyawesomecoloradotrip.html
\ No newline at end of file
-prune:yes\r
-\r
-date://*[contains(@class,'date')]\r
-\r
-body://div[contains(@id,'content')]\r
-\r
-next_page_link://a[contains(.,'Next >')]\r
-\r
+prune:yes
+
+date://*[contains(@class,'date')]
+
+body://div[contains(@id,'content')]
+
+next_page_link://a[contains(.,'Next >')]
+
strip_id_or_class:sponsors
test_url: http://www.pcmag.com/article2/0,2817,2401676,00.asp
\ No newline at end of file
-title: //div[@class='articleHead']//h1\r
-author: //div[@class="author-name"]/a[1]\r
-body: //div[@class="main"]\r
-\r
-# remove 'From the Lab' and 'Recent posts' text\r
-strip: //div[@class='blogLabel']\r
-\r
-# remove byline and meta info\r
-strip: //h1\r
-strip: //div[@class="article-meta"]\r
-strip: //div[@class="author-info"]\r
-\r
-#strip tags and categories\r
-strip: //div[@class="department"]\r
-\r
-#strip product cap links\r
-strip: //div[@class="cap-main"]\r
-strip: //div[@id="compare-lede"]\r
+title: //div[@class='articleHead']//h1
+author: //div[@class="author-name"]/a[1]
+body: //div[@class="main"]
+
+# remove 'From the Lab' and 'Recent posts' text
+strip: //div[@class='blogLabel']
+
+# remove byline and meta info
+strip: //h1
+strip: //div[@class="article-meta"]
+strip: //div[@class="author-info"]
+
+#strip tags and categories
+strip: //div[@class="department"]
+
+#strip product cap links
+strip: //div[@class="cap-main"]
+strip: //div[@id="compare-lede"]
test_url: http://www.pcworld.com/article/262034/are-printer-companies-gouging-us-on-laser-toner-pricing.html
\ No newline at end of file
-# 2012-01-14 carlo@... - fixed title, body; added author, date\r
-\r
-title: //div[@class="title"]/h2/a\r
-# body: //div[@class="post"]\r
-# author: //p[@class="iconEmail"]/a\r
-# date: //p[@class="iconDate"]\r
-\r
-# 1/24/2013 yosoyju - fixed author, date, and body, added support for PA Report\r
-\r
-# Penny Arcade\r
-\r
-author: //li[@class="iconEmail"]/a\r
-date: //li[@class="iconDate"]\r
-body: //div[@class="body"]\r
-\r
-# PA Report\r
-\r
-author: //div[@class="meta"]/p/a\r
-date: substring-after(//div[@class="meta"]/p, '/ ')\r
-title: substring-after(//title, '- ')\r
-\r
-test_url: http://penny-arcade.com/2012/01/13/i-put-some-news-in-your-news\r
+# 2012-01-14 carlo@... - fixed title, body; added author, date
+
+title: //div[@class="title"]/h2/a
+# body: //div[@class="post"]
+# author: //p[@class="iconEmail"]/a
+# date: //p[@class="iconDate"]
+
+# 1/24/2013 yosoyju - fixed author, date, and body, added support for PA Report
+
+# Penny Arcade
+
+author: //li[@class="iconEmail"]/a
+date: //li[@class="iconDate"]
+body: //div[@class="body"]
+
+# PA Report
+
+author: //div[@class="meta"]/p/a
+date: substring-after(//div[@class="meta"]/p, '/ ')
+title: substring-after(//title, '- ')
+
+test_url: http://penny-arcade.com/2012/01/13/i-put-some-news-in-your-news
test_url: http://penny-arcade.com/report/editorial-article/the-dystopian-future-of-casual-games-personalized-targeted-pricing-and-mech
\ No newline at end of file
-prune: no\r
-tidy: no\r
-body: //div[@class='article-content']\r
-dissolve: //nobr/a\r
+prune: no
+tidy: no
+body: //div[@class='article-content']
+dissolve: //nobr/a
dissolve: //nobr
test_url: http://www.philadelphiaeagles.com/news/article-1/Jacksons-Light-Shined-On-Sunday-Night/51a862de-42b4-40f1-a5a8-ba0fb8a435b7
\ No newline at end of file
-title: //h1[@class='entry-title']\r
-author: //p[@class='byline']/span\r
+body: //*[@id='body-content']
-date: //div[@class='article_timestamp']/span\r
-\r
-strip: //@class=b-group\r
-strip: //*[contains(@style, 'none')]\r
-strip: //a[contains(@href, 'comments')]\r
+title: //h1[@class='entry-title']
+author: //p[@class='byline']/span
+body: //@id='body-content'
+date: //div[@class='article_timestamp']/span
+
+strip: //*[@class='b-group']
+strip: //*[contains(@style, 'none')]
+strip: //a[contains(@href, 'comments')]
strip: //*[contains(@class, 'comment')]
test_url: http://www.philly.com/philly/sports/eagles/20120127_Ohio_State_s_Posey_didn_t_waste_time_lost_to_suspension.html
\ No newline at end of file
-author: substring-before(//div[@class='post_meta'],' on')\r
-date: substring-after(substring-before(//div[@class='post_meta'],'with'),' on')\r
-title: //h1[class='post_title']\r
-body: //div[@class='article']\r
+author: substring-before(//div[@class='post_meta'],' on')
+date: substring-after(substring-before(//div[@class='post_meta'],'with'),' on')
+title: //h1[@class='post_title']
+body: //div[@class='article']
test_url: http://photo.tutsplus.com/articles/news/a-brilliant-beginners-guide-to-architectural-photography/
\ No newline at end of file
-body: //div[@id='content']\r
-strip_id_or_class: manualnavbar\r
-\r
-prune: no\r
+body: //div[@id='content']
+strip_id_or_class: manualnavbar
+
+prune: no
test_url: http://www.php.net/manual/en/migration5.incompatible.php
\ No newline at end of file
-title: //div[@class='abstitle']//h1\r
-author: //div[@class='authorList']\r
-body: //div[@id='fulltext_body']\r
-\r
-prune: no\r
+title: //div[@class='abstitle']//h1
+author: //div[@class='authorList']
+body: //div[@id='fulltext_body']
+
+prune: no
test_url: http://www.physicstoday.org/resource/1/phtoad/v64/i10/p48_s1?bypassSSO=1
\ No newline at end of file
--- /dev/null
+title: //title
+body: //div[contains(@class, 'imageContainer')]
+
+test_url: http://pinterest.com/pin/380906080954441188/
+test_url: http://pinterest.com/michaelsorm/architecture/rss
\ No newline at end of file
-title:concat(//h1,' - ',//h2,' - ',//h3)\r
-author://address\r
-date://span[@class='pub-date']\r
-body://div[@id='main']\r
-single_page_link://link[@rel='canonical']\r
-strip://div[@class='info']\r
-strip_id_or_class:'object-grid related-content'\r
-strip_id_or_class:'object-prevnext'\r
-strip_id_or_class:'object-header'\r
-strip_id_or_class:'source'\r
-strip_id_or_class:'label'\r
-strip_id_or_class:'title'\r
-dissolve://ul\r
-strip://li[@class='next']\r
+title:concat(//h1,' - ',//h2,' - ',//h3)
+author://address
+date://span[@class='pub-date']
+body://div[@id='main']
+single_page_link://link[@rel='canonical']
+strip://div[@class='info']
+strip_id_or_class:'object-grid related-content'
+strip_id_or_class:'object-prevnext'
+strip_id_or_class:'object-header'
+strip_id_or_class:'source'
+strip_id_or_class:'label'
+strip_id_or_class:'title'
+dissolve://ul
+strip://li[@class='next']
strip://li[@class='prev']
test_url: http://pitchfork.com/features/why-we-fight/8796-on-the-far-slope-of-the-uncanny-valley/
\ No newline at end of file
-title: //h2[@class='post-title']\r
-author: substring-before(substring-after(//h3[@class='post-byline'],'By:'),'/')\r
-date: substring-before(substring-after(//p[@class='post-details'],'Posted on '),'in')\r
-strip: //h2[@class='post-title']\r
-strip: //p[@class='post-details']\r
-strip: //h3[@class='post-byline']\r
+title: //h2[@class='post-title']
+author: substring-before(substring-after(//h3[@class='post-byline'],'By:'),'/')
+date: substring-before(substring-after(//p[@class='post-details'],'Posted on '),'in')
+strip: //h2[@class='post-title']
+strip: //p[@class='post-details']
+strip: //h3[@class='post-byline']
body: //div[@id='content']
test_url: http://pittnews.com/newsstory/mens-basketball-pitt-recruit-robinson-to-bring-leadership/
\ No newline at end of file
-title: substring-before(//title,'pirates.com')\r
-date: //span[@class='timeStamp']\r
-author: substring-before(substring-after(//div[@class='byLine'],'By'),'/')\r
-body: //div[@id='article']\r
-#strip: //div[@class='inner']\r
-strip: //div[@id='article_head']\r
-strip: //p[@class='tagLine']\r
-strip: //div[@id='article_related_links']\r
-strip: //div[@id='article_related_mlb']\r
-strip: //div[@id='article_related_club']\r
-strip: //span[@class='more']\r
-strip: //div[@class='article_component']\r
-strip: //span[@class='screen_reader']\r
+title: substring-before(//title,'pirates.com')
+date: //span[@class='timeStamp']
+author: substring-before(substring-after(//div[@class='byLine'],'By'),'/')
+body: //div[@id='article']
+#strip: //div[@class='inner']
+strip: //div[@id='article_head']
+strip: //p[@class='tagLine']
+strip: //div[@id='article_related_links']
+strip: //div[@id='article_related_mlb']
+strip: //div[@id='article_related_club']
+strip: //span[@class='more']
+strip: //div[@class='article_component']
+strip: //span[@class='screen_reader']
strip: //ul[@class='columnists_blurb']
test_url: http://pittsburgh.pirates.mlb.com/news/article.jsp?ymd=20120330&content_id=27759040&vkey=news_pit&c_id=pit
\ No newline at end of file
-title: substring-before(//title,'- Pittsburgh Tribune')\r
-author: substring-before(substring-after(//div[@class='byline'],'By '),',')\r
-date: substring-after(substring-after(//div[@class='byline'],','),',')\r
-body: //div[@id='storyBody']\r
-strip: //div[@class='morestories']\r
+title: substring-before(//title,'- Pittsburgh Tribune')
+author: substring-before(substring-after(//div[@class='byline'],'By '),',')
+date: substring-after(substring-after(//div[@class='byline'],','),',')
+body: //div[@id='storyBody']
+strip: //div[@class='morestories']
dissolve: //p[@class='subheader']
test_url: http://www.pittsburghlive.com/x/pittsburghtrib/sports/columnists/s_785654.html
\ No newline at end of file
-title: //title\r
-author: substring-after(//div[@class='by-line'],'BY')\r
-\r
-body: //div[@id='article-body']\r
-\r
-strip: //div[@class='by-line']\r
+title: //title
+author: substring-after(//div[@class='by-line'],'BY')
+
+body: //div[@id='article-body']
+
+strip: //div[@class='by-line']
strip: //div[@id='article-body']/h1
test_url: http://www.pittsburghmagazine.com/Pittsburgh-Magazine/May-2012/Verde-Lights-the-Night/
\ No newline at end of file
-title: //span[@class='StoryHeadline']\r
-strip: //div[@class='fivevert']\r
+title: //span[@class='StoryHeadline']
+strip: //div[@class='fivevert']
body: //div[@id='Content']
test_url: http://www.pittsburghpanthers.com/sports/m-baskbl/recaps/031412aaa.html
\ No newline at end of file
-title: //h1[@class='articletitle']\r
-author: substring-after(//span[@class='author'],'by')\r
-date: //span[@class='created']\r
-body: //div[@class='article']\r
-strip: //div[@class='headline']\r
-strip: //p[@class='articleinfo']\r
+title: //h1[@class='articletitle']
+author: substring-after(//span[@class='author'],'by')
+date: //span[@class='created']
+body: //div[@class='article']
+strip: //div[@class='headline']
+strip: //p[@class='articleinfo']
#dissolve: //p[@class='subheader']
test_url: http://www.pittscriptblog.com/2012-articles/march/2012-football-opponents-set-and-the-attendance-dilemma.html
\ No newline at end of file
--- /dev/null
+title: //div[@id='frnRahmen']/div/div[@id='content']/div[2]/h2
+author: //div[@id='content']/div[1]/div/a
+body: //div[@id='content']/div[2]/span
+strip: //div[@id='commenthead']
+test_url: http://www.planetvita.de/news/10389-psn-store-update-vom-03-april-neue-inhalte-fuer-psvita.html
\ No newline at end of file
-author: //article//*[@class="author"]\r
-date: //article//*[@class="publication-date"]\r
-body: //article\r
-strip: //article/header\r
+author: //article//*[@class="author"]
+date: //article//*[@class="publication-date"]
+body: //article
+strip: //article/header
strip: //article/section
test_url: http://www.playboy.com/playground/view/playboy-interview-jon-hamm
\ No newline at end of file
-body: //div[@id='contentPane']//div[@class='vg']\r
-body: //div[@id='contentPane']\r
-\r
-# Grab the author by finding the first profile pic, then backing up a node and getting the title of <a> tag which will be the author hopefully. Sorry can't test this due to parser errors, thanks google :(\r
-\r
-author: //div[@id='contentPane']//img[contains(@alt, 'profile photo')][1]/../@title\r
-\r
-\r
-strip: //*[@title="People who +1'd this"]/../..\r
-strip: //*[contains(@class, 'a-b-f-i-Hg-Uf')]\r
-strip: //*[@role='menu']\r
-strip: //img[contains(@alt, 'profile photo')]\r
-strip: //*[@class='a-f-i-Ad']\r
-\r
-tidy: no\r
-\r
+body: //div[@id='contentPane']//div[@class='vg']
+body: //div[@id='contentPane']
+
+# Grab the author: find the first profile photo, go up one node, and read the title attribute of the enclosing <a> tag, which will hopefully be the author. Sorry, this could not be tested due to parser errors, thanks google :(
+
+author: //div[@id='contentPane']//img[contains(@alt, 'profile photo')][1]/../@title
+
+
+strip: //*[@title="People who +1'd this"]/../..
+strip: //*[contains(@class, 'a-b-f-i-Hg-Uf')]
+strip: //*[@role='menu']
+strip: //img[contains(@alt, 'profile photo')]
+strip: //*[@class='a-f-i-Ad']
+
+tidy: no
+
test_url: http://plus.google.com/u/0/117840649766034848455/posts/FddaP6jeCqp
\ No newline at end of file
-title: //h2[@class='jcw-pagetitle'\r
-date: //p[@class='postinfo']\r
+title: //h2[@class='jcw-pagetitle']
+date: //p[@class='postinfo']
body: //div[@class='contenttext']
test_url: http://plzkthxbai.com/blog/2011/06/28/1password-and-internet-security/
\ No newline at end of file
-body: //div[@id="content"]/div[1]\r
-\r
+body: //div[@id="content"]/div[1]
+
title: //h1[@class="entry-title"]
test_url: http://pogue.blogs.nytimes.com/2011/05/12/the-future-of-skype/
\ No newline at end of file
-title://div[contains(@class, "article")]/h1\r
-body://div[contains(@class,"story-text")]\r
-\r
-# Why doesn't this work? next_page_link://ul[contains(@class,"pagination")]/li/a[@rel="next"]\r
-\r
-next_page_link://ul[contains(@class,"pagination")]/li[contains(@class, "current")]/following-sibling::node()/a\r
-next_page_link://div[contains(@class,"pagination")]/ol/li[contains(@class, "current")]/following-sibling::node()/a\r
-date://meta[@name="publish_date"]/@content\r
-\r
-strip://div[contains(@class, "breadcrumbs")]\r
-strip://a[contains(@class, "hidden")]\r
-strip://div[contains(@class, "story-embed")]\r
+title://div[contains(@class, "article")]/h1
+body://div[contains(@class,"story-text")]
+
+# Why doesn't this work? next_page_link://ul[contains(@class,"pagination")]/li/a[@rel="next"]
+
+next_page_link://ul[contains(@class,"pagination")]/li[contains(@class, "current")]/following-sibling::node()/a
+date://meta[@name="publish_date"]/@content
+
+strip://div[contains(@class, "breadcrumbs")]
+strip://a[contains(@class, "hidden")]
+strip://div[contains(@class, "story-embed")]
strip://div[contains(@class, "story-text")]//p/a[contains(text(), "Also on POLITICO:")]/..
-strip://div[contains(@class, "story-interrupt")]\r
-strip://footer[contains(@class, "author-bio")]\r
-\r
test_url: http://www.politico.com/news/stories/0712/78105.html
\ No newline at end of file
-body: //div[@id="content"]\r
-\r
+body: //div[@id="content"]
+
strip: //div[@class="pfcontentmid"]/div[position()>4]|//div[@class="pfad"]
test_url: http://www.politifact.com/truth-o-meter/statements/2011/may/30/barbara-boxer/barbara-boxer-says-medicare-overhead-far-lower-pri/
\ No newline at end of file
-# 21/10-2011:\r
-# Added Author+Date\r
-# Remove fakta-boks if found\r
-# Deleted 'Læs også...' filter \r
-# - Change in markup caused it to strip too much.\r
-\r
-author://span[@class='autor-name']\r
-date:substring-after(//div[@class='art-created'], ' ')\r
-title: //h1[contains(@class, 'stor-type')]\r
-body: //div[@id='art-body']\r
-strip: //div[@class='art-fakta article-box']\r
+# 21/10-2011:
+# Added Author+Date
+# Remove fakta-boks if found
+# Deleted 'Læs også...' filter
+# - Change in markup caused it to strip too much.
+
+author://span[@class='autor-name']
+date:substring-after(//div[@class='art-created'], ' ')
+title: //h1[contains(@class, 'stor-type')]
+body: //div[@id='art-body']
+strip: //div[@class='art-fakta article-box']
test_url: http://politiken.dk/kultur/boger/skonlitteratur_boger/ECE1426386/makabre-tegneserie-zombier-aeder-alt-levende/
\ No newline at end of file
--- /dev/null
+body: //div[@id='article-content']
+body: //article[@id='entry-top']/div[@class='float_wrapper']
+author: //header/p[@class='byline']/em/a
+date: //header/p[@class='byline']/span[@class='timestamp']
+
+strip: //div[@id='article-content']//header
+strip: //label
+
+#photos on left column (delete all)
+strip: //div[@class='big_photo']
+
+#photos on left column (remove extras used for scroll effect)
+#strip: //div[@class='big_photo']/div[./img]
+#strip: //div[@class='big_photo']/img[position()>1]
+
+strip_id_or_class: vox-lazy-load
+strip_id_or_class: social_buttons
+strip_id_or_class: feature_toc
+
+prune: no
+
+find_string: <noscript>
+replace_string: <div>
+find_string: </noscript>
+replace_string: </div>
+
+#find_string: <script
+#replace_string: <div style="display:none"
+#find_string: </script>
+#replace_string: </div>
+
+strip: //div[@class='float_wrapper']/header
+test_url: http://www.polygon.com/2013/4/5/4189028/donkey-kong-country-returns-3d-new-content
+test_url: http://www.polygon.com/features/2013/8/22/4602568/30-years-xbox-360-playstation-3-wii
\ No newline at end of file
-next_page_link: //div[@id='longPagination']/a[@class='next']\r
-\r
-title: //div[@id='contentHeader']//h1\r
-\r
-body: //div[@id='articleBody']\r
-# this is so sad\r
+next_page_link: //div[@id='longPagination']/a[@class='next']
+
+title: //div[@id='contentHeader']//h1
+
+body: //div[@id='articleBody']
+# this is so sad
body: //div[@id='intelliTXT']
test_url: http://www.popularmechanics.com/technology/aviation/crashes/what-really-happened-aboard-air-france-447-6611877
\ No newline at end of file
--- /dev/null
+author: //*[(@class = "author")]
+date: //*[(@class = "date")]
+test_url: http://portertech.ca/2012/12/10/iac-morning-market/
\ No newline at end of file
-title: //div[@id="newsDetailTitle"]\r
-author: //span[@id="showAuthor"]\r
-date: //span[@id="showRefDate"]\r
-\r
-strip: //div[@id="breadcrumbs"]\r
-strip: //span[@id="PageTitle"]\r
-strip: //div[@id="newsDetailAuthorPublish"]\r
-\r
-strip: //div[@class="leadPix"]\r
-\r
-strip: //span[@id="ctl00_PageTitle"]\r
-strip: //div[@id="newsDetailTitle"]\r
-convert_double_br_tags:yes\r
-\r
-strip: //div[@id="newsDetailCredential"]\r
-strip: //div[@id="sidebar2"]\r
-strip: //div[@id="footer"]\r
+title: //div[@id="newsDetailTitle"]
+author: //span[@id="showAuthor"]
+date: //span[@id="showRefDate"]
+
+strip: //div[@id="breadcrumbs"]
+strip: //span[@id="PageTitle"]
+strip: //div[@id="newsDetailAuthorPublish"]
+
+strip: //div[@class="leadPix"]
+
+strip: //span[@id="ctl00_PageTitle"]
+strip: //div[@id="newsDetailTitle"]
+convert_double_br_tags:yes
+
+strip: //div[@id="newsDetailCredential"]
+strip: //div[@id="sidebar2"]
+strip: //div[@id="footer"]
test_url: http://www.positioningmag.com/magazine/details.aspx?id=41083
\ No newline at end of file
-title: //div[@class='story_headline']\r
-author: substring-before(substring-after(//div[@class='story_byline'],'By'),'/')\r
-date: //div[@class='story_lastupdate'] \r
-body: //div[@id='story']\r
-strip: //div[@class='story_byline']\r
-strip: //div[@class='story_lastupdate']\r
-strip: //div[@class='story_headline']\r
-strip: //div[@id='abuse']\r
-strip: //h2\r
-strip: //div[@class='pagenumbers_wrap']\r
-strip: //ul[@class='pagenumbers']\r
-strip: //div[starts-with(., 'To report inappropriate comments')]\r
-\r
-strip_id_or_class: story_share\r
-strip_id_or_class: OUTBRAIN\r
-strip_id_or_class: story_box_right\r
-strip: //div[a[@href='http://www.post-gazette.com/pg/12062/1213990-42.stm']]\r
-strip: //ul[@id='pikame']/li[position()>1]\r
-\r
-prune: no\r
-tidy: no\r
-\r
-single_page_link: //a[contains(@href, '?p=0')]\r
-\r
-test_url: http://www.post-gazette.com/stories/sports/penguins/pens-crosby-expects-to-return-thursday-226648/\r
+title: //div[@class='story_headline']
+author: substring-before(substring-after(//div[@class='story_byline'],'By'),'/')
+date: //div[@class='story_lastupdate']
+body: //div[@id='story']
+strip: //div[@class='story_byline']
+strip: //div[@class='story_lastupdate']
+strip: //div[@class='story_headline']
+strip: //div[@id='abuse']
+strip: //h2
+strip: //div[@class='pagenumbers_wrap']
+strip: //ul[@class='pagenumbers']
+strip: //div[starts-with(., 'To report inappropriate comments')]
+
+strip_id_or_class: story_share
+strip_id_or_class: OUTBRAIN
+strip_id_or_class: story_box_right
+strip: //div[a[@href='http://www.post-gazette.com/pg/12062/1213990-42.stm']]
+strip: //ul[@id='pikame']/li[position()>1]
+
+prune: no
+tidy: no
+
+single_page_link: //a[contains(@href, '?p=0')]
+
+test_url: http://www.post-gazette.com/stories/sports/penguins/pens-crosby-expects-to-return-thursday-226648/
test_url: http://www.post-gazette.com/stories/sports/pirates/pirates-fork-over-changes-for-fans-at-pnc-park-629789
\ No newline at end of file
-title: //div[@id='divAdnetKeyword']/h1\r
-body: //div[@id='_middle_content_bottom']\r
-\r
-wrap_in(fieldset)://div[@id='_middle_content_bottom_child2']/img\r
-\r
-strip: //div[@id='_middle_content_bottom_child1']\r
-strip: //div[@id='_middle_content_bottom_child4']\r
-strip: //div[@class='cls']\r
-strip: //div[@class='iphoneBox']\r
-strip: //ul[@class='ilgiliHaber']\r
-strip: //div[@class='yorumlar']\r
-strip: //div[@class='kategoriler']\r
-strip: //div[@class='textSize']\r
+title: //div[@id='divAdnetKeyword']/h1
+body: //div[@id='_middle_content_bottom']
+
+wrap_in(fieldset)://div[@id='_middle_content_bottom_child2']/img
+
+strip: //div[@id='_middle_content_bottom_child1']
+strip: //div[@id='_middle_content_bottom_child4']
+strip: //div[@class='cls']
+strip: //div[@class='iphoneBox']
+strip: //ul[@class='ilgiliHaber']
+strip: //div[@class='yorumlar']
+strip: //div[@class='kategoriler']
+strip: //div[@class='textSize']
strip: //span[@class='tarih']
test_url: http://www.posta.com.tr/yasam/teknoloji/HaberDetay/Fedailer_Istanbul_da.htm?ArticleID=101044
\ No newline at end of file
-title: //h1\r
-date: /html/head/meta[@name="date"]/@content\r
-body: //div[@id="featuredlinksbox"]\r
-strip: //div[@class="relatedbox"]\r
-strip: //h1\r
-strip: //br\r
+title: //h1
+date: /html/head/meta[@name="date"]/@content
+body: //div[@id="featuredlinksbox"]
+strip: //div[@class="relatedbox"]
+strip: //h1
+strip: //br
strip_image_src: "/images"
test_url: http://www.prb.org/Journalists/Webcasts/2011/military-families.aspx
\ No newline at end of file
-title: //h1\r
-body: //div[@id='left']\r
-strip: //h1\r
-convert_double_br_tags: yes\r
-strip_id_or_class: entry-footer\r
-strip: //h1[. = 'Previously']/following::*\r
-author: string('James Hague')\r
+title: //h1
+body: //div[@id='left']
+strip: //h1
+convert_double_br_tags: yes
+strip_id_or_class: entry-footer
+strip: //h1[. = 'Previously']/following::*
+author: string('James Hague')
date: //div[@class = 'entry-footer']/text()
test_url: http://prog21.dadgum.com/105.html
\ No newline at end of file
-body: //div[@class='body']\r
-title: //h2[@class='title']\r
+body: //div[@class='body']
+title: //h2[@class='title']
date: //span[@class='posted-on']
test_url: http://prolost.com/blog/2011/10/13/real-men-comp-with-film.html
\ No newline at end of file
-title: //h1[@class="article-title"]\r
-author: //meta[@name="author"]/@content\r
-body: //div[@class="article-full"]\r
-strip_id_or_class: sidebar_inject\r
-strip_id_or_class: callout\r
-strip_id_or_class: content-inset\r
-strip_id_or_class: byline-block\r
-strip_id_or_class: photo-caption\r
-strip_id_or_class: foot-tools\r
+title: //h1[@class="article-title"]
+author: //meta[@name="author"]/@content
+body: //div[@class="article-full"]
+strip_id_or_class: sidebar_inject
+strip_id_or_class: callout
+strip_id_or_class: content-inset
+strip_id_or_class: byline-block
+strip_id_or_class: photo-caption
+strip_id_or_class: foot-tools
test_url: http://www.propublica.org/article/pardon-applicants-benefit-from-friends-in-high-places
\ No newline at end of file
-author: //p[@class='name']\r
-date: substring-before(//p[@class='date'], ' | ')\r
+author: //p[@class='name']
+date: substring-before(//p[@class='date'], ' | ')
body: //div[@class='news_single_item']
test_url: http://www.prosa.dk/aktuelt/nyhed/artikel/internetaktivisten-uden-maske/
\ No newline at end of file
-#basics\r
-author: (//div[contains(@class,'author')])[1]\r
-date: substring-before(//a[@class='issue'], '—')\r
-#body://div[@class = 'entry']\r
-# use this until move_into support is ready\r
-body: //div[@class = 'entry' or @class='standfirst' or @class='lead_image']\r
-\r
-#moves header image and tagline into body\r
-move_into(//div[@class='entry']/div)://div[@class = 'lead_image']\r
-move_into(//div[@class='entry']/div)://div[@class = 'standfirst']\r
-\r
-\r
-# moves author info to end of text\r
-move_into(//p[strong[string(.) = 'Follow Prospect on Twitter']])://div[@id='sidebar_content']/p/em\r
-\r
-prune: no\r
-\r
-# strips social links\r
-strip_id_or_class:login-status\r
-strip_id_or_class:shareinpost\r
-strip_id_or_class:content_subscribe\r
-strip_id_or_class:postinfo\r
-strip_id_or_class:postutils\r
-strip_id_or_class:comments\r
-strip://strong[string(.) = 'Follow Prospect on Twitter']\r
+#basics
+author: (//div[contains(@class,'author')])[1]
+date: substring-before(//a[@class='issue'], '—')
+#body://div[@class = 'entry']
+# use this until move_into support is ready
+body: //div[@class = 'entry' or @class='standfirst' or @class='lead_image']
+
+#moves header image and tagline into body
+move_into(//div[@class='entry']/div)://div[@class = 'lead_image']
+move_into(//div[@class='entry']/div)://div[@class = 'standfirst']
+
+
+# moves author info to end of text
+move_into(//p[strong[string(.) = 'Follow Prospect on Twitter']])://div[@id='sidebar_content']/p/em
+
+prune: no
+
+# strips social links
+strip_id_or_class:login-status
+strip_id_or_class:shareinpost
+strip_id_or_class:content_subscribe
+strip_id_or_class:postinfo
+strip_id_or_class:postutils
+strip_id_or_class:comments
+strip://strong[string(.) = 'Follow Prospect on Twitter']
test_url: http://www.prospectmagazine.co.uk/2011/07/postmodernism-is-dead-va-exhibition-age-of-authenticism/
\ No newline at end of file
--- /dev/null
+body: //a[contains(@rel, 'mainphotos')] | //div[contains(@class, 'article-content')]
+
+prune: no
+
+test_url: http://www.protothema.gr//politics/article/326464/diamadopoulou-floridis-kaminis-kai-boutaris-se-ekdilosi-ton-europaion-fileleutheron/
+test_url: http://www.protothema.gr/rss/news/politics/
\ No newline at end of file
-title: //div[@class="page-title"]/h1\r
-author: //a[@title="View Bio"]\r
-date: substring-before(substring-after(//span[@class="submitted"], 'Published on '), ' by')\r
-strip://div[@class="page-title"]/h1\r
-strip://div[@class="article-abstract"]\r
-strip://div[@class="article-meta"]\r
-strip://div[@id="rightColumn"]\r
+title: //div[@class="page-title"]/h1
+author: //a[@title="View Bio"]
+date: substring-before(substring-after(//span[@class="submitted"], 'Published on '), ' by')
+strip://div[@class="page-title"]/h1
+strip://div[@class="article-abstract"]
+strip://div[@class="article-meta"]
+strip://div[@id="rightColumn"]
strip://div[@id="inline-content-bottom-left"]
test_url: http://www.psychologytoday.com/blog/how-happiness/201205/my-quibble-facebook
\ No newline at end of file
-author: //meta[@name="Author"]\r
-date: //meta[@name="Date"]\r
+author: //meta[@name="Author"]/@content
+date: //meta[@name="Date"]/@content
strip: //h5
test_url: http://www.publications.parliament.uk/pa/ld201011/ldhansrd/text/111109-0003.htm
\ No newline at end of file
--- /dev/null
+title: //h1[@class="entry-title"]
+author: //span[@class="author"]
+body: //article[@itemtype="http://schema.org/Article"]
+date: //time[@itemprop="dateCreated"]
+
+strip: //header[@class="entry-header single-header"]
+strip: //aside[@class="entry-assets"]
+strip: //div[@class="entry-options entry-options-above group"]
+strip: //div[@class="entry-options entry-options-below group"]
+
+convert_double_br_tags: yes
+test_url: http://www.publico.pt/politica/noticia/passos-diz-que-se-limitacao-de-mandatos-fosse-para-todos-os-concelhos-estaria-claro-na-lei-1577691
\ No newline at end of file
-title: //div[@class='title']\r
-body: //div[@class='body']\r
+title: //div[@class='title']
+body: //div[@class='body']
next_page_link: //div[@class='source']/text()[contains(., 'page')]/following-sibling::a
test_url: http://purpleplanetmedia.com/eye/inte/ngaiman.php
\ No newline at end of file
--- /dev/null
+# this site seems to work OK in the web view, but only occasionally in the instapaper app itself.
+
+body: //div[@class='entry-content']
+author: //span[@class='byline']
+test_url: http://qctimes.com/news/local/woman-faces-perjury-charges-in-meth-case/article_83f4c470-956a-11e2-a921-001a4bcf887a.html
\ No newline at end of file
-title: //div[contains(@class, "hentry")]/h3\r
-\r
-author: //div[contains(@class, "hentry")]/h2[contains(@class, "author_bio")]\r
-\r
-date: substring-before(substring-after(normalize-space(//p[contains(@class, "postmetadata")]/small), "was posted on "), " and is filed under")\r
-\r
-body: //div[contains(@class, "entry")]\r
-\r
-strip_id_or_class: addtoany_share_save_container\r
-strip_id_or_class: postmetadata\r
-strip_id_or_class: author_bio\r
-strip_id_or_class: author_bio_2\r
+title: //div[contains(@class, "hentry")]/h3
+
+author: //div[contains(@class, "hentry")]/h2[contains(@class, "author_bio")]
+
+date: substring-before(substring-after(normalize-space(//p[contains(@class, "postmetadata")]/small), "was posted on "), " and is filed under")
+
+body: //div[contains(@class, "entry")]
+
+strip_id_or_class: addtoany_share_save_container
+strip_id_or_class: postmetadata
+strip_id_or_class: author_bio
+strip_id_or_class: author_bio_2
strip: //div[contains(@class, "hentry")]/h3
test_url: http://www.quantumdiaries.org/2011/10/25/piling-up/
\ No newline at end of file
-body: //div[@class='copy']\r
+body: //div[@class='copy']
title: //h1[@class='hed']
test_url: http://www.queerty.com/rawhide-radicals-meet-five-heroes-from-the-leather-community-20120302/
\ No newline at end of file
-title: //h1\r
-\r
-body: //div[@class="cuerpoArticulo"]\r
-\r
+title: //h1
+
+body: //div[@class="cuerpoArticulo"]
+
test_url: http://www.quepasa.cl/magazine/articulo/print.html?id=5299
\ No newline at end of file
-tidy: no\r
-prune: no\r
-body: //div[contains(@class, 'main_col')]\r
-title: //h1\r
-\r
-strip_id_or_class: hidden\r
-strip_id_or_class: item_action_bar\r
-strip_id_or_class: answer_voters\r
-strip_id_or_class: question_topics\r
-strip_id_or_class: answer_header_text\r
-strip_id_or_class: editor_link\r
-strip_id_or_class: view_tag\r
-strip_id_or_class: include_details\r
-strip_id_or_class: sig_edit\r
-strip_id_or_class: profile_photo_img\r
+tidy: no
+prune: no
+body: //div[contains(@class, 'main_col')]
+title: //h1
+
+strip_id_or_class: hidden
+strip_id_or_class: item_action_bar
+strip_id_or_class: answer_voters
+strip_id_or_class: question_topics
+strip_id_or_class: answer_header_text
+strip_id_or_class: editor_link
+strip_id_or_class: view_tag
+strip_id_or_class: include_details
+strip_id_or_class: sig_edit
+strip_id_or_class: profile_photo_img
test_url: http://www.quora.com/What-everyday-habit-do-you-wish-you-had-developed-earlier-in-life
\ No newline at end of file
--- /dev/null
+author: /html/body/center/b
+date: /html/body/table/tr[2]/td/i
+single_page_link: //*[@id='oTxt']/table[3]/tr[2]/td/a[1]
+
+test_url: http://www.racjonalista.pl/kk.php/s,7214/q,Geneza.szubrawstwa
\ No newline at end of file
-date://span[@class='date']\r
+date://span[@class='date']
body://div[@class='entry-body']
test_url: http://radar.oreilly.com/2012/01/genome-cloud-digital-humanities-hadoop-world-strata.html
\ No newline at end of file
-body: //div[@class='body']\r
+body: //div[@class='body']
title: //div[@class='newsstory']/h2
test_url: http://www.radionz.co.nz/news/stories/2010/07/18/12481029a86d
\ No newline at end of file
-title: //div[@id='center-col']/h4\r
-author: substring-before(//title,'In')\r
-date: substring-after(//div[@class='commenttext']/span,'#')\r
-body: //div[@id='center-col']\r
-strip: //div[@id='center-col']/h4\r
-strip: //div[@class='graytext']\r
-\r
-# Anthony Perez-Sanz 2012.3.14\r
-# Removed long gif from the end\r
-strip: //img[@src='http://www.randsinrepose.com/spreader.gif']\r
+title: //div[@id='center-col']/h4
+author: substring-before(//title,'In')
+date: substring-after(//div[@class='commenttext']/span,'#')
+body: //div[@id='center-col']
+strip: //div[@id='center-col']/h4
+strip: //div[@class='graytext']
+
+# Anthony Perez-Sanz 2012.3.14
+# Removed long gif from the end
+strip: //img[@src='http://www.randsinrepose.com/spreader.gif']
test_url: http://www.randsinrepose.com/archives/2012/03/13/hacking_is_important.html
\ No newline at end of file
-single_page_link: //link[@rel='canonical']/@href\r
+single_page_link: //link[@rel='canonical']/@href
test_url: http://www.readability.com/read?url=http://feeds.gawker.com/~r/lifehacker/full/~3/jaxAjSay_Rw/add-a-rain-gutter-to-a-picnic-table-for-a-built+in-drink-cooler
\ No newline at end of file
-title: //h1[@class="titlelink"]\r
-date: //span[@class="timestamp"]/@data-published\r
-body: //div[@class="asset-content"]\r
-strip_id_or_class: related-entries\r
-strip_id_or_class: like-and-retweet\r
-\r
-author: //div[@id="submeta"]/a[1]\r
+title: //h1[@class="titlelink"]
+date: //span[@class="timestamp"]/@data-published
+body: //div[@class="asset-content"]
+strip_id_or_class: related-entries
+strip_id_or_class: like-and-retweet
+
+author: //div[@id="submeta"]/a[1]
test_url: http://www.readwriteweb.com/archives/why_facebook_terrifies_google.php
\ No newline at end of file
-body: //div[@id='_ctl12__ctl0_Article']\r
-prune: no\r
+body: //div[@id='_ctl12__ctl0_Article']
+prune: no
autodetect_on_failure: no
\ No newline at end of file
-body: //div[@class='recipedetailsleft' or @id='recipePrepAndServe' or @id='recipeingredients']\r
-\r
-strip_id_or_class: location\r
-strip_id_or_class: savings\r
-strip_id_or_class: recipeDetailDescButton\r
-\r
-prune: no\r
-tidy: no\r
-\r
+body: //div[@class='recipedetailsleft' or @id='recipePrepAndServe' or @id='recipeingredients']
+
+strip_id_or_class: location
+strip_id_or_class: savings
+strip_id_or_class: recipeDetailDescButton
+
+prune: no
+tidy: no
+
test_url: http://www.recipe.com/avocado-basil-pasta/
\ No newline at end of file
-body: //div[@class='short-text' or starts-with(@id, 'news-id-')]\r
-prune: no\r
-tidy: no\r
-\r
+body: //div[@class='short-text' or starts-with(@id, 'news-id-')]
+prune: no
+tidy: no
+
test_url: http://red-hot-girls.com/2011/06/10/the_red_hot_natalia_maria_53_pics.html
\ No newline at end of file
-# This setup grabs the text from a Reddit self post. It ignores all comments etc.\r
-\r
-title: //p[@class="title"]/a/text()\r
-\r
-author: //p[@class="tagline"]/a\r
-\r
-# this doesn't work for some reason...?\r
-date: //p[@class="tagline"]//@datetime\r
-\r
-body: //div[@class="expando"]//div[@class="usertext-body"]\r
-\r
-strip_id_or_class: tagline\r
-strip_id_or_class: unvotable-message\r
-strip_id_or_class: buttons\r
-\r
-test_url: http://www.reddit.com/r/truegaming/comments/wfe7r/i_wrote_about_the_problems_i_honestly_feel_that/
\ No newline at end of file
+# This setup grabs the text from a Reddit self post. It ignores all comments etc.
+
+title: //p[@class="title"]/a/text()
+
+author: //p[@class="tagline"]/a
+
+# this doesn't work for some reason...?
+date: //p[@class="tagline"]//@datetime
+
+body: //div[@class="expando"]//div[@class="usertext-body"]
+
+strip_id_or_class: tagline
+strip_id_or_class: unvotable-message
+strip_id_or_class: buttons
+
+# follow the posted link (unless it's a self post - relative URL, no http://)
+single_page_link: //p[@class="title"]/a[contains(@href, 'http://')]
+
+test_url: http://www.reddit.com/r/truegaming/comments/wfe7r/i_wrote_about_the_problems_i_honestly_feel_that/
+test_url: http://www.reddit.com/r/worldnews/comments/1as37r/twelve_north_korean_soldiers_attempting_to_defect/
\ No newline at end of file
-title: //div[@class='posthead']//h2\r
-body: //div[contains(@class, 'postcontent') or @class='posthead']\r
-author: //div[@class='posthead']//a[@rel='author']\r
-\r
-strip: //div[@class='posthead']//h2\r
-replace_string(>Advertisements</div>): ></div>\r
-replace_string(<p>You can follow us on): <p style="display:none;">\r
-strip_id_or_class: likeThisPost\r
-\r
-prune: no\r
-tidy: no\r
-\r
+title: //div[@class='posthead']//h2
+body: //div[contains(@class, 'postcontent') or @class='posthead']
+author: //div[@class='posthead']//a[@rel='author']
+
+strip: //div[@class='posthead']//h2
+replace_string(>Advertisements</div>): ></div>
+replace_string(<p>You can follow us on): <p style="display:none;">
+strip_id_or_class: likeThisPost
+
+prune: no
+tidy: no
+
test_url: http://www.redmondpie.com/how-to-play-music-directly-from-home-screen-folders-on-iphone/
\ No newline at end of file
-# Think there might be something up with your parser that it strips out 'print' from the title :)\r
-\r
-title: //meta[@name='title']/@content\r
-author: //meta[@name='author']/@content\r
-date: //meta[@name='date']/@content\r
-\r
-body: //div[@class='articleText']\r
-\r
-strip: //div[contains(@class, 'day')]\r
-strip: //div[contains(@class, 'month')]\r
-strip: //div[contains(@class, 'year')]\r
-strip: //div[contains(@class, 'time')]\r
-strip: //h1[@class='gl_headline']\r
-strip: //div[@class='byline']\r
-strip: //div[@id='left_ear']\r
-strip: //div[@id='right_ear']\r
-strip: //div[contains(@class, 'PopularPosts')]\r
-strip ://div[@class='discuss_page_break']\r
+# Think there might be something up with your parser that it strips out 'print' from the title :)
+
+title: //meta[@name='title']/@content
+author: //meta[@name='author']/@content
+date: //meta[@name='date']/@content
+
+body: //div[@class='articleText']
+
+strip: //div[contains(@class, 'day')]
+strip: //div[contains(@class, 'month')]
+strip: //div[contains(@class, 'year')]
+strip: //div[contains(@class, 'time')]
+strip: //h1[@class='gl_headline']
+strip: //div[@class='byline']
+strip: //div[@id='left_ear']
+strip: //div[@id='right_ear']
+strip: //div[contains(@class, 'PopularPosts')]
+strip ://div[@class='discuss_page_break']
strip ://div[contains(@class, 'p-content_TagList')]
test_url: http://redtape.msnbc.msn.com/_news/2011/09/28/8020661-sprint-raises-fee-but-wont-free-users-from-two-year-contracts?preview=true
\ No newline at end of file
-body://div[@class='storycontent']\r
-date://div[@class='date']\r
-strip://li[@class='sharing_label']\r
+body://div[@class='storycontent']
+date://div[@class='date']
+strip://li[@class='sharing_label']
strip://a[@class='FlattrButton']
test_url: http://reflets.info/orange-nokia-siemens-deep-packet-inspection/
\ No newline at end of file
-title: //*[@class='entry-title']\r
+title: //*[@class='entry-title']
body: //div[@class='entry-content']
test_url: http://www.renenekuda.cz/recept-na-produktivitu/
\ No newline at end of file
--- /dev/null
+date: //meta[@name='bi3dPubDate']/@content
+body: //div[contains(@class, 'articleBody')]
+
+prune: no
+
+test_url: http://www.resume.se/nyheter/media/2013/09/18/kvallspress-och-tv-slass-om-playtittarna-men-youtube-ohotat-storst/
+test_url: http://www.resume.se/nyheter/media/2013/09/18/cecilia-blankens-lamnar-mama-for-konkurrent/
+test_url: http://www.resume.se/nyheter/reklam/2013/09/18/ravelli-trodde-jag-var-med-i-blasningen/
+test_url: http://www.resume.se/rss-nyheter
\ No newline at end of file
-single_page_link://a[contains(@href, 'print')]\r
-\r
-# Grab metadata from the "printer-friendly" page, after specifying single_page_link\r
-title://h2\r
+single_page_link://a[contains(@href, 'print')]
+
+# Grab metadata from the "printer-friendly" page, after specifying single_page_link
+title://h2
date://cite
test_url: http://www.retrieverweekly.com/?cmd=displaystory&story_id=7548&format=html
\ No newline at end of file
-title: //h1[@class='headline3']\r
-author: substring-after(//p[@class="byline"], 'By ')\r
-date: //meta[@name="REVISION_DATE"]/@content\r
-body: //div[@id='articleImage' or @id='frame_fd1fade'] | //span[@id='articleText'] | //div[@class='pageNavigation']\r
-strip: //li[@class='next']\r
-strip: //span[@class='articleLocation']\r
-prune: no\r
-tidy: no\r
-\r
+title: //h1[@class='headline3']
+author: substring-after(//p[@class="byline"], 'By ')
+date: //meta[@name="REVISION_DATE"]/@content
+body: //div[@id='articleImage' or @id='frame_fd1fade'] | //span[@id='articleText'] | //div[@class='pageNavigation']
+strip: //li[@class='next']
+strip: //span[@class='articleLocation']
+prune: no
+tidy: no
+
test_url: http://www.reuters.com/article/2011/04/08/us-ivorycoast-killings-idUSTRE73732A20110408
\ No newline at end of file
-title: //div[@class="article_header"]/h3\r
-author: //div[@class="autor"]/p/*\r
-date: substring-after(substring-after(//div[@class="flt-left"],"> "), "> ")\r
-\r
-move_into(//div[@class="new_article"]): //div[@class="img_article"]/img\r
-\r
-body: //div[@class="article_content"]\r
-convert_double_br_tags: yes\r
+title: //div[@class="article_header"]/h3
+author: //div[@class="autor"]/p/*
+date: substring-after(substring-after(//div[@class="flt-left"],"> "), "> ")
+
+move_into(//div[@class="new_article"]): //div[@class="img_article"]/img
+
+body: //div[@class="article_content"]
+convert_double_br_tags: yes
test_url: http://revistapiaui.estadao.com.br/edicao-68/questoes-latino-americanas/filhos-da-guerra-suja
\ No newline at end of file
--- /dev/null
+body: //div[@class='step-content'] | //div[@class='global-active ingredients-box']
+title: //div[@class='step-1-container']
+
+tidy: no
+test_url: http://www.rezeptwelt.de/backen-herzhaft-rezepte/w%C3%BCrstchen-schlangen/530372
\ No newline at end of file
-body: //div[@id="post"]\r
-strip: //div[@id="author-description"]\r
-date: //span[@class="entry-date"]\r
+body: //div[@id="post"]
+strip: //div[@id="author-description"]
+date: //span[@class="entry-date"]
author: //span[@class="author vcard"]
test_url: http://richardmuscat.wordpress.com/2011/06/20/the-price-of-free/?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+TheBrooksReview+%28The+Brooks+Review%29
\ No newline at end of file
-body: //div[@class='post-body entry-content']\r
-strip: //div[@id='lws_0']\r
-prune: no\r
-\r
+body: //div[@class='post-body entry-content']
+strip: //div[@id='lws_0']
+prune: no
+
test_url: http://ritemail.blogspot.com/2011/06/hayden-panettiere-candids-in-los.html
\ No newline at end of file
--- /dev/null
+title: //div[@class='post']/h2
+author: substring-before(substring-after(//div[@class='alignright']/small, 'By '),'-')
+date: substring-after(//div[@class='alignright']/small, '-')
+strip: //div[@class='alignleft']
+test_url: http://www.ritholtz.com/blog/2012/09/situational-awareness/
\ No newline at end of file
--- /dev/null
+strip_id_or_class: 'sharedaddy'
+strip_id_or_class: 'respond'
+strip_id_or_class: 'meta'
+test_url: http://www.robertsspaceindustries.com/news-update-ai-pilots/
\ No newline at end of file
--- /dev/null
+body: //section[@class='post text']
+title: //h1[@class='title']
+date: //p[@class='post-date']
+strip: //section[@class='meta-info']
+test_url: http://robots.thoughtbot.com/post/32455387133/four-phase-test
\ No newline at end of file
-title: //h2\r
-\r
-strip: //div[ contains(@class, 'respond') ] | //h2 | //h1\r
-\r
-date: substring-after(//p[@class='info'], ' on ')\r
-\r
+title: //h2
+
+strip: //div[ contains(@class, 'respond') ] | //h2 | //h1
+
+date: substring-after(//p[@class='info'], ' on ')
+
author: //p[@class='info']//a
test_url: http://www.rockpapershotgun.com/2010/07/29/rps-half-verdict-starcraft-2/
\ No newline at end of file
-author: //article/header/span[@class='author']\r
-title://article/header/h1\r
-body: //article\r
-strip: //article/header\r
-strip: //article/p[@class='metadata']\r
+author: //article/header/span[@class='author']
+title://article/header/h1
+body: //article
+strip: //article/header
+strip: //article/p[@class='metadata']
footnotes: yes
test_url: http://rodrigo.sharpcube.com/2010/06/20/using-and-sharing-a-vpn-connection-on-your-mac/
\ No newline at end of file
-title: substring-before(//title,':')\r
-author: substring-after(substring-before(//div[@class='text']/b,'/'),'BY')\r
-\r
-body: //div[@class='text']\r
-\r
-strip: //a[contains(@href,'printart')]\r
+title: substring-before(//title,':')
+author: substring-after(substring-before(//div[@class='text']/b,'/'),'BY')
+
+body: //div[@class='text']
+
+strip: //a[contains(@href,'printart')]
strip_id_or_class: enlarge_photo
test_url: http://rogerebert.com/apps/pbcs.dll/article?AID=/20120411/REVIEWS/120419998/1005/GLOSSARY
\ No newline at end of file
-body: //div[contains(@class, 'inhoud')]\r
-date: //span[@class ='published']\r
-author: //span[@class ='author']\r
-strip: //div[@class = 'grid_2']\r
-strip: //div[@class = 'block-citation-text']\r
+body: //div[contains(@class, 'inhoud')]
+date: //span[@class ='published']
+author: //span[@class ='author']
+strip: //div[@class = 'grid_2']
+strip: //div[@class = 'block-citation-text']
test_url: http://www.rolfinjapan.nl/2011/06/duizend-kraanvogels/
\ No newline at end of file
-body: //div[@class='movie_content_area']\r
-strip_id_or_class: tomatometer_bar_help\r
-strip_id_or_class: critic-links\r
-strip_id_or_class: top-critics-numbers\r
-strip_id_or_class: fan_side\r
-strip_id_or_class: fblike\r
-strip_id_or_class: rating_widget\r
-strip_id_or_class: friend_reviews\r
-prune: no\r
+body: //div[@class='movie_content_area']
+strip_id_or_class: tomatometer_bar_help
+strip_id_or_class: critic-links
+strip_id_or_class: top-critics-numbers
+strip_id_or_class: fan_side
+strip_id_or_class: fblike
+strip_id_or_class: rating_widget
+strip_id_or_class: friend_reviews
+prune: no
test_url: http://www.rottentomatoes.com/m/thor/
\ No newline at end of file
-body: //div[@class='content']\r
-strip: //p[@class='postmeta']/following::*\r
-strip: //p[@class='postmeta']\r
+body: //div[@class='content']
+strip: //p[@class='postmeta']/following::*
+strip: //p[@class='postmeta']
strip: //p[@align='left']
test_url: http://www.roughtype.com/archives/2012/01/power_to_the_da.php
\ No newline at end of file
-body: //div[@id='news-text']\r
-prune: no\r
-test_url: http://www.rpgsite.net/news/1964-tetsuya-nomura-says-hell-soon-show-the-future-of-final-fantasy\r
+body: //div[@id='news-text']
+prune: no
+test_url: http://www.rpgsite.net/news/1964-tetsuya-nomura-says-hell-soon-show-the-future-of-final-fantasy
test_url: http://www.rpgsite.net/news/1965-new-atelier-totori-plus-screens-and-artwork
\ No newline at end of file
-author: //div[contains(@class, 'author_text')]/h4/text()\r
-date: //li[@class='date']\r
-\r
-# stripping excessive tags\r
-strip: //div[contains(@class, 'entry_meta')]\r
-strip: //div[contains(@class, 'single_meta')]\r
-strip: //br[contains(@class, 'clear')]\r
+author: //div[contains(@class, 'author_text')]/h4/text()
+date: //li[@class='date']
+
+# stripping excessive tags
+strip: //div[contains(@class, 'entry_meta')]
+strip: //div[contains(@class, 'single_meta')]
+strip: //br[contains(@class, 'clear')]
strip: //h3[contains(., 'Komentarz')]
test_url: http://rubysfera.pl/2011/09/10-porad-o-rvm/
\ No newline at end of file
-title: //h1[@class='entry-title']\r
-author: ///span[@class='author vcard']\r
-date: //abbr[@class='published']\r
-body: //div[@class='entry-content']\r
+title: //h1[@class='entry-title']
+author: //span[@class='author vcard']
+date: //abbr[@class='published']
+body: //div[@class='entry-content']
test_url: http://ruhlman.com/2009/05/cookbooks-that-teach/
\ No newline at end of file
-author: //a[@class='author']\r
+author: //a[@class='author']
tidy: no
test_url: http://ruttloff.org/2012/06/13/intervention
\ No newline at end of file
-title: //meta[@property='og:title']/@content\r
-author: (//span[@class="byline"]/a)[1]\r
-date: //span[contains(@class, "toLocalTime")]\r
-body: (//div[contains(@class, "articleInner")]//img[contains(@src, 'media.salon.com') and contains(@src, '460x')])[1] | //div[contains(@class, "articleContent") or contains(@class, "writerMeta")]\r
-\r
-prune: no\r
-\r
-# deal with singleton links\r
-single_page_link: (//h1/a[contains(@href, '/singleton')])[1]\r
-\r
+title: //meta[@property='og:title']/@content
+author: (//span[@class="byline"]/a)[1]
+date: //span[contains(@class, "toLocalTime")]
+body: (//div[contains(@class, "articleInner")]//img[contains(@src, 'media.salon.com') and contains(@src, '460x')])[1] | //div[contains(@class, "articleContent") or contains(@class, "writerMeta")]
+
+prune: no
+
+# deal with singleton links
+single_page_link: (//h1/a[contains(@href, '/singleton')])[1]
+
test_url: http://www.salon.com/2011/10/25/occupying_the_rust_belt/singleton/
\ No newline at end of file
-body: //p[@class='teaser1 darkgrey myriad']\r
-move_into(//p[@class='teaser1 darkgrey myriad']): //div[@class='artikel clear']\r
-strip: //div[@class='hidden']\r
-strip: //div[@id='article_related_source']\r
-\r
+body: //p[@class='teaser1 darkgrey myriad']
+move_into(//p[@class='teaser1 darkgrey myriad']): //div[@class='artikel clear']
+strip: //div[@class='hidden']
+strip: //div[@id='article_related_source']
+
test_url: http://www.salzburg.com/nachrichten/oesterreich/politik/sn/artikel/deutliche-nachbesserungen-bei-lehrerdienstrecht-19469/
\ No newline at end of file
--- /dev/null
+title: //div[contains(@class, 'post')]//h1
+date: //div[contains(@class, 'post')]//h6
+body: //div[contains(@class, 'entry')]
+strip_id_or_class: post_stats
+strip_id_or_class: related-posts
+strip_id_or_class: after_story
+prune: no
+
+test_url: http://www.sanpedrosun.com/community-and-society/2013/06/05/little-angelspre-school-talent-show/
+test_url: http://www.sanpedrosun.com/feed/
\ No newline at end of file
-title://h1\r
-\r
-# my section divs seem to interfere with the Instapaper parser, so I ditch 'em\r
-dissolve://div[contains(@class, 'section')]\r
-\r
-#these don't seem to be necessary, but just in case\r
-strip_id_or_class:'masthead'\r
-strip_id_or_class:'footer'\r
-\r
-#again, Instapaper seems to understand where my content is, but just in case\r
-body://div[@id='content']\r
-\r
-# in general, I want the Instapaper view to look like my print CSS, so I remove things specified for the screen or non-printing\r
-strip_id_or_class:'screen-only'\r
-strip_id_or_class:'no-print'\r
-\r
-#other misc removals and simplifications\r
-strip_id_or_class:'popup'\r
-strip_id_or_class:'ZoomSpin'\r
-\r
-#I have a lot of content in sidebars and "meta" asides that can work inline just fine, but has to be distinguished somehow with some minimal formatting, so I put them in blockquotes\r
-wrap_in(blockquote)://div[contains(@class, 'sidebar')]\r
-wrap_in(blockquote)://div[contains(@class, 'meta')]\r
+title://h1
+
+# my section divs seem to interfere with the Instapaper parser, so I ditch 'em
+dissolve://div[contains(@class, 'section')]
+
+#these don't seem to be necessary, but just in case
+strip_id_or_class:'masthead'
+strip_id_or_class:'footer'
+
+#again, Instapaper seems to understand where my content is, but just in case
+body://div[@id='content']
+
+# in general, I want the Instapaper view to look like my print CSS, so I remove things specified for the screen or non-printing
+strip_id_or_class:'screen-only'
+strip_id_or_class:'no-print'
+
+#other misc removals and simplifications
+strip_id_or_class:'popup'
+strip_id_or_class:'ZoomSpin'
+
+#I have a lot of content in sidebars and "meta" asides that can work inline just fine, but has to be distinguished somehow with some minimal formatting, so I put them in blockquotes
+wrap_in(blockquote)://div[contains(@class, 'sidebar')]
+wrap_in(blockquote)://div[contains(@class, 'meta')]
wrap_in(blockquote)://p[contains(@class, 'meta')]
test_url: http://saveyourself.ca/tutorials/low-back-pain.php
\ No newline at end of file
--- /dev/null
+date: //meta[@property='article:published_time']/@content
+body: (//div[contains(@class, 'article-slider')]//img)[1] | //div[contains(@class, 'bottom-article-con')]
+
+test_url: http://www.sayidaty.net/taxonomy/term/10/all/feed
\ No newline at end of file
-title: //h1[@id='stream_title']\r
-\r
-# Author and date don't work\r
-author: //div[@class='byline']\r
-date: //div[@class='date-stamp']\r
-\r
-body: //div[@class='node-article']\r
-\r
-strip_id_or_class: fb-like-box\r
-strip_id_or_class: stream-fb-like\r
-strip_id_or_class: social-meta\r
-strip_id_or_class: social-spoken\r
-strip_id_or_class: twitter-share-button\r
-strip_id_or_class: twitter-follow-button\r
-strip_id_or_class: spinner_node_list\r
-strip_id_or_class: node-sort-link\r
-strip_id_or_class: stream_title\r
-strip_id_or_class: stream_summary\r
-strip_id_or_class: update-count-container\r
-strip_id_or_class: major-updates\r
-strip_id_or_class: newsletter-slide\r
-strip_id_or_class: author-mini-profile\r
-strip_id_or_class: byline\r
-strip_id_or_class: header\r
-strip_id_or_class: footer\r
-\r
+title: //h1[@id='stream_title']
+
+# Author and date don't work
+author: //div[@class='byline']
+date: //div[@class='date-stamp']
+
+body: //div[@class='node-article']
+
+strip_id_or_class: fb-like-box
+strip_id_or_class: stream-fb-like
+strip_id_or_class: social-meta
+strip_id_or_class: social-spoken
+strip_id_or_class: twitter-share-button
+strip_id_or_class: twitter-follow-button
+strip_id_or_class: spinner_node_list
+strip_id_or_class: node-sort-link
+strip_id_or_class: stream_title
+strip_id_or_class: stream_summary
+strip_id_or_class: update-count-container
+strip_id_or_class: major-updates
+strip_id_or_class: newsletter-slide
+strip_id_or_class: author-mini-profile
+strip_id_or_class: byline
+strip_id_or_class: header
+strip_id_or_class: footer
+
# Works, but "no text" errors on: http://www.sbnation.com/nba/2012/3/9/2856780/nba-scores-dwight-howard-bulls-magic-mavs-suns
test_url: http://www.sbnation.com/nba/2012/3/13/2867226/dwight-howard-trade-rumors-2012-faq-orlando-magic
\ No newline at end of file
-author: //p[@class='mastname']\r
-\r
-body: //div[@class='indivbody']\r
-date: //div[@class='indivbody']/h2[1]\r
-\r
-# Remove blog title. Specify first occurrence in case h1 is used in article\r
-strip: //div[@class='indivbody']/h1[1]\r
-\r
-# Remove blog description (the first p element)\r
-strip: //div[@class='indivbody']/p[1]\r
-\r
-# Remove navigation (second p element)\r
-strip: //div[@class='indivbody']/p[2]\r
-\r
-# Remove duplicate of article title. Specify first occurrence in case h3 is used in article\r
-strip: //div[@class='indivbody']/h3[1]\r
-\r
-# Remove publishing date, it's extracted by rule above\r
-strip: //div[@class='indivbody']/h2[1]\r
-\r
-# Remove duplicate of date at end, and newsletter signup\r
-strip: //p[@class='posted']\r
-\r
-# Leave date at top\r
+author: //p[@class='mastname']
+
+body: //div[@class='indivbody']
+date: //div[@class='indivbody']/h2[1]
+
+# Remove blog title. Specify first occurrence in case h1 is used in article
+strip: //div[@class='indivbody']/h1[1]
+
+# Remove blog description (the first p element)
+strip: //div[@class='indivbody']/p[1]
+
+# Remove navigation (second p element)
+strip: //div[@class='indivbody']/p[2]
+
+# Remove duplicate of article title. Specify first occurrence in case h3 is used in article
+strip: //div[@class='indivbody']/h3[1]
+
+# Remove publishing date, it's extracted by rule above
+strip: //div[@class='indivbody']/h2[1]
+
+# Remove duplicate of date at end, and newsletter signup
+strip: //p[@class='posted']
+
+# Leave date at top
test_url: http://www.schneier.com/blog/archives/2010/12/security_in_202.html
\ No newline at end of file
-body: //div[@class="storybox"]\r
-title: //div[@class="storybox"]//h1\r
-strip: //p[@class='metaline']\r
-date: substring-after(//*[@class='time'],'Erstellt am')\r
-strip: //div[@class='fact']\r
-strip: //p[@class='backlink']\r
-strip: //div[@class='mailto']\r
-strip: //div[@id='forumDisclaimer']\r
-strip: //div[@class='forum']\r
+body: //div[@class="storybox"]
+title: //div[@class="storybox"]//h1
+strip: //p[@class='metaline']
+date: substring-after(//*[@class='time'],'Erstellt am')
+strip: //div[@class='fact']
+strip: //p[@class='backlink']
+strip: //div[@class='mailto']
+strip: //div[@id='forumDisclaimer']
+strip: //div[@class='forum']
test_url: http://science.orf.at/stories/1700900/
\ No newline at end of file
-single_page_link: //div[@class='c2c1']/div[@class='toptheme further line']//ul//li/a\r
-\r
-author: //div[@class='details clear']//a[@class='hi']\r
-body: //div[@class='title']\r
-strip: //p[@class='entrypagination']\r
-strip: //p[@class='details_top']\r
-date: //p[@class='details_top']\r
-title: //div[@class='title']/h1\r
-strip: //p[@class='details']\r
-strip: //p[@class='details_bottom']\r
+single_page_link: //div[@class='c2c1']/div[@class='toptheme further line']//ul//li/a
+
+author: //div[@class='details clear']//a[@class='hi']
+body: //div[@class='title']
+strip: //p[@class='entrypagination']
+strip: //p[@class='details_top']
+date: //p[@class='details_top']
+title: //div[@class='title']/h1
+strip: //p[@class='details']
+strip: //p[@class='details_bottom']
test_url: http://www.scienceblogs.de/astrodicticum-simplex/2011/10/weltuntergang-reloaded-das-jungste-gericht-findet-am-21-oktober-statt.php
\ No newline at end of file
-body: //div[@class='post']\r
-title: //h1[@id='singlePageTitle']\r
-date: substring-before(//small,'• Rubrik')\r
-\r
-strip: //div[@class='post-ratings']\r
-strip: //div[@class='post-ratings-loading']\r
-strip: //a[@title='Empfehlen Sie den Text weiter!']\r
-strip: //a[@title='Drucken']\r
-strip: //div[@class='share']\r
+body: //div[@class='post']
+title: //h1[@id='singlePageTitle']
+date: substring-before(//small,'• Rubrik')
+
+strip: //div[@class='post-ratings']
+strip: //div[@class='post-ratings-loading']
+strip: //a[@title='Empfehlen Sie den Text weiter!']
+strip: //a[@title='Drucken']
+strip: //div[@class='share']
test_url: http://www.scienceticker.info/2011/11/24/forscher-finden-gedachtnismolekul/
\ No newline at end of file
-#\r
-# After site revisions at SciAm, this configuration does\r
-# not work, especially for multi-page articles. For\r
-# every article there is now a "Print" link which\r
-# is far more reliable. So this configuration should be\r
-# removed or disabled.\r
-# 2/3/13\r
-#\r
-\r
-# meta data\r
-title://h1[@class = 'articleTitle']\r
-author:substring-after(//span[@class = 'byline'],'By ')\r
-date:substring-before(//span[@class = 'datestamp'],'|')\r
-\r
-#body content\r
-body://div[@id = 'articleContent']\r
-#next_page_link://li[@id = 'flairPagination']/a[last()]\r
-\r
-single_page_link: //a[contains(@href, 'print=true')]\r
-\r
-#cleanup\r
-strip://div[@class = 'fsgBooks']\r
-\r
-test_url: http://www.scientificamerican.com/article.cfm?id=do-brain-scans-comatose-patients-reveal-conscious-state\r
+#
+# After site revisions at SciAm, this configuration does
+# not work, especially for multi-page articles. For
+# every article there is now a "Print" link which
+# is far more reliable. So this configuration should be
+# removed or disabled.
+# 2/3/13
+#
+
+# meta data
+title://h1[@class = 'articleTitle']
+author:substring-after(//span[@class = 'byline'],'By ')
+date:substring-before(//span[@class = 'datestamp'],'|')
+
+#body content
+body://div[@id = 'articleContent']
+#next_page_link://li[@id = 'flairPagination']/a[last()]
+
+single_page_link: //a[contains(@href, 'print=true')]
+
+#cleanup
+strip://div[@class = 'fsgBooks']
+
+test_url: http://www.scientificamerican.com/article.cfm?id=do-brain-scans-comatose-patients-reveal-conscious-state
test_url: http://www.scientificamerican.com/article.cfm?id=solar-wind-transforms-venus-into-shape-of-comet
\ No newline at end of file
--- /dev/null
+title: //h1
+author: //div[@class='date']/a
+date: substring-after(//div[@class='date'], ',')
+body: //div[@class='entrybody']
+
+strip_id_or_class: socialshareprivacy
+strip: //div[@class='entrybody']/br[1]
+
+# Strip related articles
+# 'p'-Tag strips 'Ähnliche Artikel: ' (<br> tags become <p>)
+strip: //div[@class='entrybody']/p[last()]
+strip: //div[@class='entrybody']/ul[last()]
+
+convert_double_br_tags: yes
+test_url: http://www.scilogs.de/wblogs/blog/formbar/fusion/2012-10-08/rundgang-durch-deutschlands-gr-tes-fusionsexperiment
\ No newline at end of file
-title: //title\r
-author: //p[@id='author-name-role']/a\r
-date: substring-after(//p[@class='time'],'Posted')\r
-body: //div[@id='main']\r
-strip: //div[@id='author-info']\r
-strip: //div[@id='author-links']\r
+title: //title
+author: //p[@id='author-name-role']/a
+date: substring-after(//p[@class='time'],'Posted')
+body: //div[@id='main']
+strip: //div[@id='author-info']
+strip: //div[@id='author-links']
strip: //h1
test_url: http://www.scotusblog.com/2012/04/shaken-baby-case-an-update/
\ No newline at end of file
-title: //h2\r
+title: //h2
body: //div[@class='body']
test_url: http://scraplab.net/2010/10/26/please-keep-your-belongings-with-you-at-all-times/
\ No newline at end of file
-strip: //a[starts-with(@href, '#')]\r
-strip: //*[@class='storyByline']\r
-body: //*[@class='storyPageText']/..\r
-author: string('Dave Winer')\r
-date: substring-before(substring-after(//*[@class='storyByline'], 'on'), 'at')\r
-title: //h1\r
+strip: //a[starts-with(@href, '#')]
+strip: //*[@class='storyByline']
+body: //*[@class='storyPageText']/..
+author: string('Dave Winer')
+date: substring-before(substring-after(//*[@class='storyByline'], 'on'), 'at')
+title: //h1
footnotes: no
test_url: http://scripting.com/stories/2011/07/08/yeahImStillYawning.html
\ No newline at end of file
-body: //*[@class="entry-content"]\r
-title: //h1[@class="entry-title"]\r
-date: //*[@class="entry-date"]\r
+body: //*[@class="entry-content"]
+title: //h1[@class="entry-title"]
+date: //*[@class="entry-date"]
author: //*[@class="author vcard"]
test_url: http://sct.temple.edu/blogs/news-events/2011/05/congratulations-sct-class-of-2011/
\ No newline at end of file
--- /dev/null
+strip: //ul[contains(@id, "social")]
+strip: //div[contains(@class, "ts-fab-wrapper")]
+strip: //div[contains(@id, 'gpt-ad')]
+
+test_url: http://www.searchenginejournal.com/web-design-vs-seo-it-doesnt-make-much-sense/62294/
-body: //div[@class="storyBox"]\r
-title: //div[@class="storyBox"]/h1\r
-author: //a[@rel="author"]\r
-date: substring-before(//span[@class="dateline"], 'by')\r
-\r
-#Removes related content but cleans up article text\r
-strip: //h1\r
-strip: //p[@class="homeStory tdmSideInfo"]\r
-strip: //div[@id="bylineShare"]\r
-strip: //script\r
-strip: //hr\r
-\r
-strip_id_or_class: homeStory\r
-strip_id_or_class: authorpic\r
-strip_id_or_class: insideComments\r
-strip_id_or_class: authorbio\r
-strip_id_or_class: gpt-ad-sel-cube\r
-strip_id_or_class: smxTextAd\r
+body: //div[@class="storyBox"]
+title: //div[@class="storyBox"]/h1
+author: //a[@rel="author"]
+date: substring-before(//span[@class="dateline"], 'by')
+
+#Removes related content but cleans up article text
+strip: //h1
+strip: //p[@class="homeStory tdmSideInfo"]
+strip: //div[@id="bylineShare"]
+strip: //script
+strip: //hr
+
+strip_id_or_class: homeStory
+strip_id_or_class: authorpic
+strip_id_or_class: insideComments
+strip_id_or_class: authorbio
+strip_id_or_class: gpt-ad-sel-cube
+strip_id_or_class: smxTextAd
test_url: http://searchengineland.com/googles-jaw-dropping-sponsored-post-campaign-for-chrome-106348
\ No newline at end of file
-title: substring-before(//title, '«')\r
-body: //div[@class = 'entry']\r
+title: substring-before(//title, '«')
+body: //div[@class = 'entry']
strip_id_or_class: 'postmetabox'
test_url: http://sebbo.net/2010/12/akkus/
\ No newline at end of file
--- /dev/null
+# This filter is tested on:
+# http://select.yeeyan.org/view/18312/332365
+# http://select.yeeyan.org/view/365295/333788
+# http://select.yeeyan.org/view/174464/332336
+
+tidy:no
+prune:no
+title://h1
+author: //div[@class='sa_author']/span/a
+date: substring-after(//div[@class='sa_author']/span/following-sibling::span, ':')
+body: //div[@class='sa_left closetag']
+wrap_in(b)://div[@class='sa_abstract']
+
+strip://ul[@class='sa_next clearfix']
+strip: //div[@class='sa_author']
+strip: //div[@class='sa_title_box']
+
+test_url: http://select.yeeyan.org/view/258033/333481
\ No newline at end of file
-body: //div[@id='content']\r
-\r
-# clean up recipe pages\r
-strip: //h2[@class='fn'] | //h2[@class='double-lined'] | //h3 | //div[@id='threeColumn2'] | //div[@id='threeColumn3']\r
-\r
-#recipe pages\r
-strip_id_or_class: "recipe-feedback"\r
-strip_id_or_class: "comments"\r
-strip_id_or_class: "procedure-number"\r
-strip_id_or_class: "more-with-author"\r
-\r
-#slice\r
-strip_id_or_class: "inner"\r
+body: //div[@id='content']
+
+# clean up recipe pages
+strip: //h2[@class='fn'] | //h2[@class='double-lined'] | //h3 | //div[@id='threeColumn2'] | //div[@id='threeColumn3']
+
+#recipe pages
+strip_id_or_class: "recipe-feedback"
+strip_id_or_class: "comments"
+strip_id_or_class: "procedure-number"
+strip_id_or_class: "more-with-author"
+
+#slice
+strip_id_or_class: "inner"
test_url: http://www.seriouseats.com/recipes/2010/09/peking-duck-mandarin-pancakes-plum-sauce-recipe.html
\ No newline at end of file
-title: //h1[@class='post-title']\r
-author: //div[@class='post-byline']/a\r
-date: substring-before(//div[@class='post-byline'], ', by')\r
-\r
-body: //div[@class='post-body']\r
+title: //h1[@class='post-title']
+author: //div[@class='post-byline']/a
+date: substring-before(//div[@class='post-byline'], ', by')
+
+body: //div[@class='post-body']
dissolve: //noscript
test_url: http://sf.curbed.com/archives/2011/10/17/lower_haight_loft_would_really_really_really_like_a_buyer.php
\ No newline at end of file
-title: //h1[@class="post-title"]\r
-author: //div[@class="post-byline"]/a\r
-date: substring-before(//div[@class='post-byline'], ', by')\r
-\r
-body: //div[@class='post-body']\r
+title: //h1[@class="post-title"]
+author: //div[@class="post-byline"]/a
+date: substring-before(//div[@class='post-byline'], ', by')
+
+body: //div[@class='post-body']
strip_id_or_class: post-kicker
test_url: http://sf.eater.com/archives/2012/05/22/nate_pollack_talks_about_the_american_grilled_cheese_kitchen_moving_into_the_mission.php
\ No newline at end of file
-title: /html/head/title\r
-\r
-body: //div[@id = 'articlecontent']/div[contains(@class, 'bodytext')]\r
-author: //div[@class = 'articleheadings']/p[contains(@class,'author')]/span[@class = 'fn']\r
-date: //div[@class = 'articleheadings']/span[@class = 'updated']\r
-strip: //div[div[contains(@class, 'imgbox')]]\r
-\r
-body: //div[@class = 'blogitem']\r
-author: //p[@class="credit"]/span[@class="author"]/a[position() = 1]\r
-date: //span[@class = 'pubdate']\r
+title: /html/head/title
+
+body: //div[@id = 'articlecontent']/div[contains(@class, 'bodytext')]
+author: //div[@class = 'articleheadings']/p[contains(@class,'author')]/span[@class = 'fn']
+date: //div[@class = 'articleheadings']/span[@class = 'updated']
+strip: //div[div[contains(@class, 'imgbox')]]
+
+body: //div[@class = 'blogitem']
+author: //p[@class="credit"]/span[@class="author"]/a[position() = 1]
+date: //span[@class = 'pubdate']
test_url: http://www.sfgate.com/columnists/garchik/
\ No newline at end of file
-body: //div[contains(@class, 'content_body')]\r
+body: //div[contains(@class, 'content_body')]
strip_id_or_class: det_rel
test_url: http://www.sfweekly.com/2012-03-14/news/cia-lsd-wayne-ritchie-george-h-white-mk-ultra/
\ No newline at end of file
-date: //span[@class='date']\r
+date: //span[@class='date']
body: //div[@class='post_content']
test_url: http://www.shabayek.com/blog/2011/10/16/%D8%AF%D8%B1%D9%88%D8%B3-%D9%85%D9%86-%D9%82%D8%B5%D8%A9-%D8%AA%D8%A3%D8%B3%D9%8A%D8%B3-%D8%AA%D9%88%D9%8A%D8%AA%D8%B1-%E2%80%93%D8%AC3/
\ No newline at end of file
-title://*[@class='primary']/h1\r
-date: //*[@class='articledate']\r
-author: substring-before(substring-after(//*[@class='block first']/p,'2012 '),'.')\r
-body: //div[@class='primary']\r
-footnotes: yes\r
-strip: //*[@class='primary']/h1\r
-strip: //*[@class='articledate']\r
-strip: //*[@class='detailsarticle']\r
-strip: //*[@class='endnav']\r
-strip: //*[@class='endmeta']\r
+title://*[@class='primary']/h1
+date: //*[@class='articledate']
+author: substring-before(substring-after(//*[@class='block first']/p,'2012 '),'.')
+body: //div[@class='primary']
+footnotes: yes
+strip: //*[@class='primary']/h1
+strip: //*[@class='articledate']
+strip: //*[@class='detailsarticle']
+strip: //*[@class='endnav']
+strip: //*[@class='endmeta']
test_url: http://shawnblanc.net/2011/11/kindle-touch-review/
\ No newline at end of file
-body: //div[ @class='entry-content' ]\r
-\r
-strip: //div[ contains(@class, 'sharing') ]\r
-\r
+body: //div[ @class='entry-content' ]
+
+strip: //div[ contains(@class, 'sharing') ]
+
date: //div[ @class='entry-meta' ]/a
test_url: http://shifteleven.com/articles/2008/05/10/issue-tracking-git-ticgit
\ No newline at end of file
-#body: (//div[@class='ftr-yt-vid'])[1]\r
-body: (//blockquote[contains(@class, 'postcontent')])[1]\r
-body: (//div[starts-with(@id, 'post_message')])[1]\r
-\r
-prune: no\r
-tidy: no\r
-\r
-#replace_string(<iframe title="YouTube video player"): <div class="ftr-yt-vid"><iframe title="YouTube video player"\r
-#replace_string(</iframe>): </iframe> </div>\r
-\r
+#body: (//div[@class='ftr-yt-vid'])[1]
+body: (//blockquote[contains(@class, 'postcontent')])[1]
+body: (//div[starts-with(@id, 'post_message')])[1]
+
+prune: no
+tidy: no
+
+#replace_string(<iframe title="YouTube video player"): <div class="ftr-yt-vid"><iframe title="YouTube video player"
+#replace_string(</iframe>): </iframe> </div>
+
test_url: http://www.siasat.pk/forum/showthread.php?107668-Policy-Matters-17th-March-2012-Dr-Shahid-Masood-Gen-Hameed-gul-amp-Fawad-Chudhary-Pak-US-Relationship&p=787733
\ No newline at end of file
--- /dev/null
+author: //span[contains(@class, 'byline_1')]
+date: //span[@class='posted_date']
+body: //*[contains(@class, 'bigimage_container') or contains(@class, 'overlay_text') or contains(@id, 'articlebody')]
+
+strip_id_or_class: leftWrapper
+
+prune: no
+
+test_url: http://www.signalscv.com/section/46/article/102948/
+test_url: http://www.signalscv.com/syndication/feeds/rss/
\ No newline at end of file
-body: //div[contains(@class, "entry")]\r
-\r
-date: //div[contains(@class, "entryFooter")]/a\r
+body: //div[contains(@class, "entry")]
+
+date: //div[contains(@class, "entryFooter")]/a
test_url: http://simonwillison.net/2009/Oct/22/redis/
\ No newline at end of file
-body: //div[@class='post-body']\r
-strip: //div[@id='lws_0']\r
-prune: no\r
+body: //div[@class='post-body']
+strip: //div[@id='lws_0']
+prune: no
test_url: http://singaporeanstocksinvestor.blogspot.com/2011/04/aims-amp-capital-industrial-reit.html
\ No newline at end of file
-title: //div[@class='headline']//h2\r
-body: //div[contains(@class, 'storycontent')]\r
-\r
-prune: no\r
-\r
+title: //div[@class='headline']//h2
+body: //div[contains(@class, 'storycontent')]
+
+prune: no
+
test_url: http://sintagoulis.gr/sokolatenia/sokolatenia-mpompa-me-amaretti-
\ No newline at end of file
--- /dev/null
+title: //article[@class='post']/header[@class='wrapper']//h1/a
+author: //header[@id='masthead']//h1/a
+date: //article[@class='post']/header[@class='wrapper']//p[@class='postdate']
+body: //div[@id='body-content']
+
+test_url: http://sivers.org/delegate/
\ No newline at end of file
--- /dev/null
+body: //div[contains(@class, 'layout__inner')]//div[contains(@class, 'file-image') or contains(@class, 'node__content')]
+author: //article//div[contains(@class, 'field-byline')]
+strip_id_or_class: rekommenderade
+strip_id_or_class: disqus
+strip_id_or_class: annonser
+
+test_url: http://www.skanesfria.se/artikel/112045
\ No newline at end of file
-title: substring-before(//title,'| /Film')\r
-date: substring-before(substring-after(//p[@class='post-info'],'Posted on '),'by')\r
-strip: //div[@class='pm-left']\r
-strip: //div[@class='pm-right']\r
-strip: //h2/span\r
-next_page_link: //h2/strong/a\r
-strip: //h2/strong/a\r
-strip: //p[contains(text(),'we have to split this post over')]\r
-strip: //p[@class='post-info']\r
-strip: //h1/a\r
-strip: //img[contains(@src,'siteimages/authors')]\r
-strip: //div[@id='header']\r
-strip: //div[@class='topad-right']\r
-strip: //strong[contains(text(),'Cool Posts From Around the Web:')]\r
+title: substring-before(//title,'| /Film')
+date: substring-before(substring-after(//p[@class='post-info'],'Posted on '),'by')
+strip: //div[@class='pm-left']
+strip: //div[@class='pm-right']
+strip: //h2/span
+next_page_link: //h2/strong/a
+strip: //h2/strong/a
+strip: //p[contains(text(),'we have to split this post over')]
+strip: //p[@class='post-info']
+strip: //h1/a
+strip: //img[contains(@src,'siteimages/authors')]
+strip: //div[@id='header']
+strip: //div[@class='topad-right']
+strip: //strong[contains(text(),'Cool Posts From Around the Web:')]
test_url: http://www.slashfilm.com/superhero-bits-206/
\ No newline at end of file
-title: //h1[@class="sl-art-head-dek"]\r
-body: //article//div[@class='sl-art-body']/div[contains(@class, 'body')]\r
-strip: //div[@class="department_kicker"]\r
-strip: //div[@id="insider_ad_wrapper" or @id="insider_ad_inner"]\r
-strip: //div[@id="bottom_sponsored_links"]\r
-strip: //div[@class="sl-art-ad-midflex"]\r
-#strip: //dl\r
-#strip: //p[em/a[contains(@href, 'facebook.com')]]\r
-prune: no\r
-\r
-author: //div[@id='author_bio']//a[contains(@href, '/author/')]\r
-author: //a[contains(@href, '/authors.')]\r
-\r
-date: substring-before(substring-after(//span[@class='sl-art-byline'], 'Posted '), ', at ')\r
-\r
-single_page_link: //a[@class='sl-art-sinpage']\r
-\r
-test_url: http://www.slate.com/id/2274583/pagenum/all/\r
+title: //h1[@class="sl-art-head-dek"]
+body: //article//div[@class='sl-art-body']/div[contains(@class, 'body')]
+strip: //div[@class="department_kicker"]
+strip: //div[@id="insider_ad_wrapper" or @id="insider_ad_inner"]
+strip: //div[@id="bottom_sponsored_links"]
+strip: //div[@class="sl-art-ad-midflex"]
+#strip: //dl
+#strip: //p[em/a[contains(@href, 'facebook.com')]]
+prune: no
+
+author: //div[@id='author_bio']//a[contains(@href, '/author/')]
+author: //a[contains(@href, '/authors.')]
+
+date: substring-before(substring-after(//span[@class='sl-art-byline'], 'Posted '), ', at ')
+
+single_page_link: //a[@class='sl-art-sinpage']
+
+test_url: http://www.slate.com/id/2274583/pagenum/all/
test_url: http://www.slate.com/id/2293116/
\ No newline at end of file
-body: //div[@id='content']\r
-\r
-# clean up recipe pages\r
-strip: //h2[@class='fn'] | //h2[@class='double-lined'] | //h3 | //div[@id='threeColumn2'] | //div[@id='threeColumn3']\r
-\r
-#recipe pages\r
-strip_id_or_class: "recipe-feedback"\r
-strip_id_or_class: "comments"\r
-strip_id_or_class: "procedure-number"\r
-strip_id_or_class: "more-with-author"\r
-\r
-#slice\r
-strip_id_or_class: "inner"\r
+body: //div[@id='content']
+
+# clean up recipe pages
+strip: //h2[@class='fn'] | //h2[@class='double-lined'] | //h3 | //div[@id='threeColumn2'] | //div[@id='threeColumn3']
+
+#recipe pages
+strip_id_or_class: "recipe-feedback"
+strip_id_or_class: "comments"
+strip_id_or_class: "procedure-number"
+strip_id_or_class: "more-with-author"
+
+#slice
+strip_id_or_class: "inner"
test_url: http://slice.seriouseats.com/archives/2010/10/the-pizza-lab-how-to-make-great-new-york-style-pizza.html
\ No newline at end of file
-strip_id_or_class: postCategory\r
-title: //h3[@class='postTitle']\r
+strip_id_or_class: postCategory
+title: //h3[@class='postTitle']
body: //div[@class='postBody']
test_url: http://slog.thestranger.com/slog/archives/2010/10/12/sl-letter-of-the-day-leave-it-alone
\ No newline at end of file
-title: //td[@class='hweissblau2']\r
-body: //p[@class='copy'] | //div[@class='Section1']\r
-prune: no\r
+title: //td[@class='hweissblau2']
+body: //p[@class='copy'] | //div[@class='Section1']
+prune: no
test_url: http://www.smartinvestor.de/news/smartinvestor/detail.hbs?itemid=item949496655&recnr=14593
\ No newline at end of file
-title: //meta[@property='og:title']/@content\r
+title: //meta[@property='og:title']/@content
date: //p[@class='autor_line']/b/text()
test_url: http://www.sme.sk/c/6268206/lipsic-vidi-malcharkove-uplatky.html
\ No newline at end of file
-# meta data\r
-title://h1[@id = 'articleTitle']\r
-author:substring-after(//ul[@id = 'byLine']/li[1],'By ')\r
-date:substring-before(substring-after(//ul[@id = 'byLine']/li[last()],','),',')\r
-body://div[@id = 'article-body']\r
-\r
-# full content\r
-single_page_link://td/li[@class = 'article-singlepage']/a\r
-\r
-# caption clean up\r
-wrap_in(i)://span[@class='articleImageCaptionwide']\r
-move_into (//span[@class='articleImageCaptionwide'])://div[@id = 'articleImage']/p\r
-\r
-\r
-# clean up\r
-strip://p[@id = 'articlePaginationWrapper']\r
-strip://ul[contains(@class, 'cat-breadcrumb')]\r
-strip://div [@class= 'viewMorePhotos']\r
+# meta data
+title://h1[@id = 'articleTitle']
+author:substring-after(//ul[@id = 'byLine']/li[1],'By ')
+date:substring-before(substring-after(//ul[@id = 'byLine']/li[last()],','),',')
+body://div[@id = 'article-body']
+
+# full content
+single_page_link://td/li[@class = 'article-singlepage']/a
+
+# caption clean up
+wrap_in(i)://span[@class='articleImageCaptionwide']
+move_into (//span[@class='articleImageCaptionwide'])://div[@id = 'articleImage']/p
+
+
+# clean up
+strip://p[@id = 'articlePaginationWrapper']
+strip://ul[contains(@class, 'cat-breadcrumb')]
+strip://div [@class= 'viewMorePhotos']
test_url: http://www.smithsonianmag.com/history-archaeology/The-Goddess-Goes-Home.html
\ No newline at end of file
-title: //h2[@class='custom-entry-title']\r
-author: substring-after(//span[@class='author vcard'],'by ')\r
-date: substring-after(//span[@class='publ'],'Published on ')\r
-body: //div[@class='postentry-content']\r
+title: //h2[@class='custom-entry-title']
+author: substring-after(//span[@class='author vcard'],'by ')
+date: substring-after(//span[@class='publ'],'Published on ')
+body: //div[@class='postentry-content']
test_url: http://smokingapples.com/software/popclip-for-mac/
\ No newline at end of file
--- /dev/null
+title: //h1
+body: //div[@id = 'content-area']
+author: //p[contains(@class, 'byline')]/a
+autodetect_next_page: yes
+tidy: no
+
+strip_id_or_class: articleid
+strip_id_or_class: logo
+strip_id_or_class: pagebar
+strip_id_or_class: featurenavlinks
+strip_id_or_class: featured_frontpage
+strip_id_or_class: sidebar
+strip_id_or_class: footer
+strip_id_or_class: byline
+strip_id_or_class: logo
+strip_id_or_class: nav_network
+test_url: http://www.somethingawful.com/d/dungeons-and-dragons/wtf-monster-manual.php
\ No newline at end of file
--- /dev/null
+# This filter is tested on:
+# http://songshuhui.net/archives/65522
+# http://songshuhui.net/archives/75760
+title://h2/span/a
+date:substring-before(substring-after(//div[@class='atrctitle']/div, '发表于'),' |')
+body://div[@class='entry']
+test_url: http://songshuhui.net/archives/74819
\ No newline at end of file
-#grab the actual content div\r
-body: //div[@class='rt-article']\r
-\r
+#grab the actual content div
+body: //div[@class='rt-article']
+
test_url: http://www.sourcebooks.com/next/sourcebooks-next-our-blog/1601-another-piece-of-the-e-puzzle-or-when-good-ebook-promotions-go-bad.html
\ No newline at end of file
-author: /html/body/div[@id='wrapper']/div[@id='main-content']/div[@class='article_body']/a[@class='author-link']/child::text()\r
-\r
-body: /html/body/div[@id='wrapper']/div[@id='main-content']/div[@class='article_body']\r
-\r
-# Not very helpfull, the title and author are container by the same element that contains the body\r
+author: /html/body/div[@id='wrapper']/div[@id='main-content']/div[@class='article_body']/a[@class='author-link']/child::text()
+
+body: /html/body/div[@id='wrapper']/div[@id='main-content']/div[@class='article_body']
+
+# Not very helpful: the title and author are contained by the same element that contains the body
strip: /html/body/div[@id='wrapper']/div[@id='main-content']/div[@class='article_body']/h2 | /html/body/div[@id='wrapper']/div[@id='main-content']/div[@class='article_body']/a[@class='author-link']
test_url: http://www.spectator.co.uk/arts-and-culture/night-and-day/7449683/spotify-sunday-my-personal-soundtrack.thtml
\ No newline at end of file
-body://div[@class="articleBody"]\r
+body://div[@class="articleBody"]
author://p[@class="articleBodyTtl"]
test_url: http://spectrum.ieee.org/semiconductors/processors/behind-intels-new-randomnumber-generator/
\ No newline at end of file
-# A. Niepel, narya.de@...\r
-# - added single_page_link\r
-# - added author for default and single page view\r
-# - added date for single page view\r
-# fforst@...\r
-# - Fixed it\r
-# bode2104@...\r
-# - Fixed single_page_link\r
-# - Included intro text in single page view\r
-# - Added body in default view\r
-\r
-# set body\r
-tidy: no\r
-# body in single page view\r
-body: //div[@id="spArticleContent"]\r
-# body in default view\r
-body: //div[@id="spArticleSection"]\r
-# body in "Fotostrecke"\r
-body: //div[@id="spBigaContent"]\r
-\r
-# set date in single page view\r
-date: //div[@id="spArticleContent"]/h3\r
-# strip date\r
-strip: //div[@id="spArticleContent"]/h3\r
-# set date in "Fotostrecke"\r
-date: //div[@id="spBigaDatum"]\r
-\r
-#set title in single page view\r
-title: //div[@id='spArticleContent']/h2\r
-# strip title\r
-strip: //div[@id='spArticleContent']/h1\r
-strip: //div[@id='spArticleContent']/h2\r
-#set title in "Fotostrecke"\r
-title: //div[@class='spBigaHeadline']\r
-\r
-# set author\r
-author: //p[@class="spAuthor"]/a\r
-author: substring-after(//p[@class="spAuthor"], 'Von ')\r
-# strip author\r
-strip: //p[@class='spAuthor']\r
-\r
-# remove captions\r
-strip: //*/span[@class='spPicLayerText']\r
-strip: //*/div[@class='spPanoPlayerPaneControl']\r
-strip: //*/div[@class='spCredit']\r
-strip: //*/div[@class='spCredit']/following-sibling::p\r
-\r
-# remove ads\r
-strip: //div[@class='spMInline']\r
-\r
-# remove photogalleries and extras\r
-strip: //div[@class='spPhotoGallery']\r
-strip: //div[@class='spPhotoGallery']/following-sibling::br\r
-strip: //div[@class='spAssetAlignleft']\r
-strip: //div[contains(@class,'spAsset')]\r
-strip: //br[@clear='all']\r
-\r
-# remove community functions\r
-strip: //div[@id='spSocialBookmark']\r
-strip: //div[contains(@class, 'spCommunityBox')]\r
-strip: //div[contains(@class, 'spArticleNewsfeedBox')]\r
-strip: //div[@class='spArticleCredit']\r
-\r
-# remove clutter in "Fotostrecke"\r
-strip: //div[@id='spBreadcrumb']\r
-strip: //div[@id='spBigaLatestEntries']\r
-strip: //div[contains(@class, 'spBigaNavi')]\r
-strip: //div[@class='spDottedLine']\r
-\r
-# Use link to print article for single page view\r
-single_page_link: //a[contains(@href, '-druck')]\r
-\r
-# use next link in "Fotostrecke"\r
-next_page_link: //a[@class='spBigaControlForw']\r
+# A. Niepel, narya.de@...
+# - added single_page_link
+# - added author for default and single page view
+# - added date for single page view
+# fforst@...
+# - Fixed it
+# bode2104@...
+# - Fixed single_page_link
+# - Included intro text in single page view
+# - Added body in default view
+
+# set body
+tidy: no
+# body in single page view
+body: //div[@id="spArticleContent"]
+# body in default view
+body: //div[@id="spArticleSection"]
+# body in "Fotostrecke"
+body: //div[@id="spBigaContent"]
+
+# set date in single page view
+date: //div[@id="spArticleContent"]/h3
+# strip date
+strip: //div[@id="spArticleContent"]/h3
+# set date in "Fotostrecke"
+date: //div[@id="spBigaDatum"]
+
+#set title in single page view
+title: //div[@id='spArticleContent']/h2
+# strip title
+strip: //div[@id='spArticleContent']/h1
+strip: //div[@id='spArticleContent']/h2
+#set title in "Fotostrecke"
+title: //div[@class='spBigaHeadline']
+
+# set author
+author: //p[@class="spAuthor"]/a
+author: substring-after(//p[@class="spAuthor"], 'Von ')
+# strip author
+strip: //p[@class='spAuthor']
+
+# remove captions
+strip: //*/span[@class='spPicLayerText']
+strip: //*/div[@class='spPanoPlayerPaneControl']
+strip: //*/div[@class='spCredit']
+strip: //*/div[@class='spCredit']/following-sibling::p
+
+# remove ads
+strip: //div[@class='spMInline']
+
+# remove photogalleries and extras
+strip: //div[@class='spPhotoGallery']
+strip: //div[@class='spPhotoGallery']/following-sibling::br
+strip: //div[@class='spAssetAlignleft']
+strip: //div[contains(@class,'spAsset')]
+strip: //br[@clear='all']
+
+# remove community functions
+strip: //div[@id='spSocialBookmark']
+strip: //div[contains(@class, 'spCommunityBox')]
+strip: //div[contains(@class, 'spArticleNewsfeedBox')]
+strip: //div[@class='spArticleCredit']
+
+# remove clutter in "Fotostrecke"
+strip: //div[@id='spBreadcrumb']
+strip: //div[@id='spBigaLatestEntries']
+strip: //div[contains(@class, 'spBigaNavi')]
+strip: //div[@class='spDottedLine']
+
+# Use link to print article for single page view
+single_page_link: //a[contains(@href, '-druck')]
+
+# use next link in "Fotostrecke"
+next_page_link: //a[@class='spBigaControlForw']
test_url: http://www.spiegel.de/politik/deutschland/0,1518,787602,00.html
\ No newline at end of file
--- /dev/null
+title: //div[@id='articleTitleWrapper' or @id='mainFeature']//h1
+author: //*[@id='authorNameJob']//a
+date: //div[@id='articleMeta']/p
+body: //div[@id='mainFeature']//img | //div[contains(@class, 'fullText')]
+
+test_url: http://www.spiked-online.com/newsite/article/standing_up_to_the_white-coated_gods_of_fortune/13785
+test_url: http://www.spiked-online.com/newsite/article/sex_box_and_the_crisis_of_intimacy/14168
\ No newline at end of file
-tidy: no\r
-body: //section[contains(@class, 'main')]\r
-strip: //footer\r
+tidy: no
+body: //section[contains(@class, 'main')]
+strip: //footer
strip: //a[@class='paginated']
test_url: http://www.spin.com/articles/bathlands-deep-heart-americas-new-drug-nightmare
\ No newline at end of file
-author:string('Dan Frommer/SplatF')\r
-date://div[@class='postdate']\r
-body://div[@class='entry']\r
+author:string('Dan Frommer/SplatF')
+date://div[@class='postdate']
+body://div[@class='entry']
title://div[@class='post']/h1
test_url: http://www.splatf.com/2012/02/month-six/
\ No newline at end of file
-author: //div[@class='byline']/a\r
-date: //div[@id='date']\r
+author: //div[@class='byline']/a
+date: //div[@id='date']
body: //div[@class='entry']
test_url: http://splitsider.com/2011/10/saturday-nights-children-rob-riggle-2004-2005/
\ No newline at end of file
-title://div[@class="content_detail"]/h1\r
-\r
-author://div[@class="author"]/strong\r
-\r
-date:substring-before(substring-after(//div[@class="content_detail"]/*[@class="date"], ','), ' WIB')\r
-\r
+title://div[@class="content_detail"]/h1
+
+author://div[@class="author"]/strong
+
+date:substring-before(substring-after(//div[@class="content_detail"]/*[@class="date"], ','), ' WIB')
+
body://div[@class='text_detail']
test_url: http://sport.detik.com/sepakbola/read/2012/05/23/065011/1922350/71/agen-silva-ingin-bertahan-di-milan?b99220270
\ No newline at end of file
-single_page_link: //div[@id='content']//p[@class='readMore']/a\r
-\r
-title: //div[@class='hidden offscreen']/h2\r
-body: //div[@id="storyText"]\r
-move_into(//div[@id='storyText']): //div[@class='fact']\r
-strip: //small[@class='credit']\r
-strip: //small[@class='caption']\r
-date: substring-after(//div[@id='storyMeta']//p[@class='date'],'Publiziert am')\r
-strip: //p[@class='toplink']\r
+single_page_link: //div[@id='content']//p[@class='readMore']/a
+
+title: //div[@class='hidden offscreen']/h2
+body: //div[@id="storyText"]
+move_into(//div[@id='storyText']): //div[@class='fact']
+strip: //small[@class='credit']
+strip: //small[@class='caption']
+date: substring-after(//div[@id='storyMeta']//p[@class='date'],'Publiziert am')
+strip: //p[@class='toplink']
test_url: http://sport.orf.at/stories/2084851/
\ No newline at end of file
--- /dev/null
+body: //h2[contains(@class, 'body_head')] | //div[@id='img_article' or contains(@class, 'body_content')]
+body: //div[contains(@class, 'cpanel')]//div[contains(@class, 'thumbnails')]
+prune: no
+strip: //div[starts-with(@class, 'actu_')]
+strip: //div[contains(@class, 'data')]
+
+test_url: http://www.sport365.fr/basketball/nba/new-york-accord-avec-toronto-pour-bargnani-1038773.shtml
+test_url: http://www.sport365.fr/rss.xml
\ No newline at end of file
-title: //div[@class='headline'] | //div[@class='mod-header']/h3\r
-body: //div[contains(@class, 'article')]\r
-strip: //div[contains(@class, 'mod-inline')]\r
-strip: //*/span[@class='page-actions']/a\r
-strip: //*/span[@class='page-actions']/a\r
-strip: //div[@class='page-actions']/*\r
-strip: //div[@class='headline'] | //div[@class='mod-header']/h3\r
-strip: //div[@class='mod-blog-navigation']\r
-strip: //div[@class='monthday']\r
-strip: //div[@class='time']\r
-strip: //div[@class='timeofday']\r
+title: //div[@class='headline'] | //div[@class='mod-header']/h3
+body: //div[contains(@class, 'article')]
+strip: //div[contains(@class, 'mod-inline')]
+strip: //*/span[@class='page-actions']/a
+strip: //*/span[@class='page-actions']/a
+strip: //div[@class='page-actions']/*
+strip: //div[@class='headline'] | //div[@class='mod-header']/h3
+strip: //div[@class='mod-blog-navigation']
+strip: //div[@class='monthday']
+strip: //div[@class='time']
+strip: //div[@class='timeofday']
test_url: http://sports.espn.go.com/espn/page2/story?page=simmonsnfl2010/lebron_james_return_clevelend&sportCat=nba
\ No newline at end of file
-title: //div[@id='article']/div[@class='hd']/h1\r
-body: //p[@id='byline'] | //div[@id='article']//div[@class='body_copy 0']\r
-strip: //div[@class='foot']\r
-strip: //div[@id='sidebar']//div[@class='ft']\r
-strip: //p[@id='byline']//em\r
-tidy: no\r
-prune: no\r
+title: //div[@id='article']/div[@class='hd']/h1
+body: //p[@id='byline'] | //div[@id='article']//div[@class='body_copy 0']
+strip: //div[@class='foot']
+strip: //div[@id='sidebar']//div[@class='ft']
+strip: //p[@id='byline']//em
+tidy: no
+prune: no
test_url: http://sports.yahoo.com/nba/news?slug=ap-nbafinals
\ No newline at end of file
-title://div[@id='ardContent']/h1\r
-\r
-author://p[@id='ardAutor']\r
-author://span[@id='ardQuelle']\r
-author:string('sportschau.de')\r
-\r
-date:substring-after(//span[@id='ardStand'], 'Stand: ')\r
-\r
-body://div[@id='ardContent']\r
-\r
-strip://div[@id='ardContent']/h1\r
-strip://p[@id='ardAutor']\r
-strip: //div[@class='embeddedPlayer_clipinfo']\r
-strip: //div[@class='ardMehrZumThemaRechts']\r
-strip: //*[contains(@class, 'inv')]\r
-\r
-strip: //p[@id='ardAbbinder']\r
-strip: //div[@class='socialBookmarks']\r
-strip: //div[@id='ardContentEnd']\r
-strip: //div[@id='ardDisclaimer']\r
+title://div[@id='ardContent']/h1
+
+author://p[@id='ardAutor']
+author://span[@id='ardQuelle']
+author:string('sportschau.de')
+
+date:substring-after(//span[@id='ardStand'], 'Stand: ')
+
+body://div[@id='ardContent']
+
+strip://div[@id='ardContent']/h1
+strip://p[@id='ardAutor']
+strip: //div[@class='embeddedPlayer_clipinfo']
+strip: //div[@class='ardMehrZumThemaRechts']
+strip: //*[contains(@class, 'inv')]
+
+strip: //p[@id='ardAbbinder']
+strip: //div[@class='socialBookmarks']
+strip: //div[@id='ardContentEnd']
+strip: //div[@id='ardDisclaimer']
strip: //div[@id='ardRechteSpalte']
test_url: http://www.sportschau.de/sp/fussball/news201203/17/analyse_leverkusen_gladbach.jsp
\ No newline at end of file
-# main sportsillustrated.com articles\r
-#\r
-body: //div[@id="cnnStoryContent"]\r
-title: //div[@id="cnnStoryHeadline"]//h1\r
-author: //div[@id="cnnSubBanner"]//strong\r
-date: substring-after(//div[@id="cnnTimeStamp"], "Updated: ")\r
-date: substring-after(//div[@id="cnnTimeStamp"], "Posted: ")\r
-\r
-# kill ugly font buttons\r
-strip: //div[@id="cnnSCFontButtons"]\r
-\r
-# kill misc filler videos & etc\r
-strip: //div[@class="cnnDivideContent"]\r
-strip: //*[@class="cnnTMbox"]\r
-\r
-# si vault articles\r
-# -------------\r
-body: //div[@class="siv_artPara"]\r
-title: //div[@class="siv_artHeader"]//h1\r
-author: //div[@class="byline"]\r
-date: //div[@class="date"]\r
-\r
-next_page_link: //div[@id='cnnStoryContinue']/a\r
-strip_id_or_class: cnnstorypagination\r
-\r
+# main sportsillustrated.com articles
+#
+body: //div[@id="cnnStoryContent"]
+title: //div[@id="cnnStoryHeadline"]//h1
+author: //div[@id="cnnSubBanner"]//strong
+date: substring-after(//div[@id="cnnTimeStamp"], "Updated: ")
+date: substring-after(//div[@id="cnnTimeStamp"], "Posted: ")
+
+# kill ugly font buttons
+strip: //div[@id="cnnSCFontButtons"]
+
+# kill misc filler videos & etc
+strip: //div[@class="cnnDivideContent"]
+strip: //*[@class="cnnTMbox"]
+
+# si vault articles
+# -------------
+body: //div[@class="siv_artPara"]
+title: //div[@class="siv_artHeader"]//h1
+author: //div[@class="byline"]
+date: //div[@class="date"]
+
+next_page_link: //div[@id='cnnStoryContinue']/a
+strip_id_or_class: cnnstorypagination
+
test_url: http://sportsillustrated.cnn.com/2012/writers/peter_king/02/27/combine/index.html
\ No newline at end of file
-title: //h2\r
-author: string('Michael Spreng')\r
-date: //div[@class='date']\r
+title: //h2
+author: string('Michael Spreng')
+date: //div[@class='date']
body: //div[@class='entry']
test_url: http://www.sprengsatz.de/?p=3691
\ No newline at end of file
-body: //div[@id='ff-body']\r
-\r
-replace_string(<h1 align=center>): <div id="ff-body"><h1 align=center>\r
-\r
-prune: no\r
-\r
+body: //div[@id='ff-body']
+
+replace_string(<h1 align=center>): <div id="ff-body"><h1 align=center>
+
+prune: no
+
test_url: http://www.sqlite.org/fileformat2.html
\ No newline at end of file
-body: //div[@class='content']\r
-date: substring-before( //div[@class='unit dateAndNotes'], 'with')\r
+body: //div[@class='content']
+date: substring-before( //div[@class='unit dateAndNotes'], 'with')
title: //h3
test_url: http://squashed.tumblr.com/post/17613522228/lets-stop-blaming-the-victims-of-predatory-lending
\ No newline at end of file
-body: //div[@class='post-text' or @class='user-action-time' or @class='user-details' or @class='vote'] | //div[@id='answers-header']//h2\r
-\r
-replace_string(<div class="user-details"><br></div>): <!-- nothing -->\r
-replace_string(<div class="vote">): <div class="vote"><h3>Vote count: \r
-\r
-strip_id_or_class: vote-up\r
-strip_id_or_class: vote-down\r
-strip_id_or_class: star-off\r
-strip_id_or_class: favoritecount\r
-strip_id_or_class: -share\r
-strip_id_or_class: badgecount\r
-\r
+body: //div[@class='post-text' or @class='user-action-time' or @class='user-details' or @class='vote'] | //div[@id='answers-header']//h2
+
+replace_string(<div class="user-details"><br></div>): <!-- nothing -->
+replace_string(<div class="vote">): <div class="vote"><h3>Vote count:
+
+strip_id_or_class: vote-up
+strip_id_or_class: vote-down
+strip_id_or_class: star-off
+strip_id_or_class: favoritecount
+strip_id_or_class: -share
+strip_id_or_class: badgecount
+
test_url: http://stackoverflow.com/questions/4484289/id-like-to-understand-the-jquery-plugin-syntax
\ No newline at end of file
-title: //div[@class='articleLeft']/h3\r
-\r
-author: substring-after(//span[@class='articleAuthor']/a,'By ')\r
-\r
-date: substring-before(//span[@class='articleDateTime'],'in ')\r
-\r
-body: //div[@class='articleLeft']\r
-strip: //div[@class='articleMoreNews']\r
-strip: //div[@class='articleLeft']/h3\r
-strip: //div[@class='articleLeft']/p[@class='articleInfo clearfix']\r
-\r
-# Remove duplicate title from text\r
-strip: //div[@id='site']/div[5][@class='holder']/div[1][@class='hBlock']/div[1][@class='sglCol article']/h3\r
+title: //div[@class='articleLeft']/h3
+
+author: substring-after(//span[@class='articleAuthor']/a,'By ')
+
+date: substring-before(//span[@class='articleDateTime'],'in ')
+
+body: //div[@class='articleLeft']
+strip: //div[@class='articleMoreNews']
+strip: //div[@class='articleLeft']/h3
+strip: //div[@class='articleLeft']/p[@class='articleInfo clearfix']
+
+# Remove duplicate title from text
+strip: //div[@id='site']/div[5][@class='holder']/div[1][@class='hBlock']/div[1][@class='sglCol article']/h3
test_url: http://www.stalbansreview.co.uk/news/9581446.New_roundabout_in_King_Harry_Lane/r/?ref=rss
\ No newline at end of file
-autodetect_next_page: no\r
-footnotes: no\r
-dissolve: //div[@class="column-2"]//div[@class="widget"]\r
-dissolve: //div[@class="column-2"]//div\r
-\r
-author: //div[@class="innerbyline"]/a\r
-strip: //div[@class="innerbyline"]/a\r
-\r
-strip: //p[@class="dateline"]\r
-date: //p[@class="dateline"]\r
-\r
-title: //h1[@class="title"]\r
-author: //div[@class="innerbyline"]/a\r
-date: //p[@class="dateline"]\r
+autodetect_next_page: no
+footnotes: no
+dissolve: //div[@class="column-2"]//div[@class="widget"]
+dissolve: //div[@class="column-2"]//div
+
+author: //div[@class="innerbyline"]/a
+strip: //div[@class="innerbyline"]/a
+
+strip: //p[@class="dateline"]
+date: //p[@class="dateline"]
+
+title: //h1[@class="title"]
+author: //div[@class="innerbyline"]/a
+date: //p[@class="dateline"]
body: //div[@class="column-2"]
test_url: http://www.standard.co.uk/lifestyle/esmagazine/grace-and-flavour-pizarro-7938350.html
\ No newline at end of file
-title: //h1[@id='storyTitle']\r
-author: substring-after(//span[@class='hsa_postCredit'], 'By ') \r
-date://span[@class='hsa_dateStamp']\r
-body: //div[@class='storytext']\r
-strip_id_or_class: insideStoryAd \r
-strip_id_or_class: printDesc\r
-strip_id_or_class: sb_2010_story_tools\r
-strip_id_or_class: FBConnectButton_Text\r
-strip_id_or_class: breadcrumbs\r
-prune: no\r
+title: //h1[@id='storyTitle']
+author: substring-after(//span[@class='hsa_postCredit'], 'By ')
+date://span[@class='hsa_dateStamp']
+body: //div[@class='storytext']
+strip_id_or_class: insideStoryAd
+strip_id_or_class: printDesc
+strip_id_or_class: sb_2010_story_tools
+strip_id_or_class: FBConnectButton_Text
+strip_id_or_class: breadcrumbs
+prune: no
test_url: http://www.staradvertiser.com/news/20111112_World_leaders_step_onto_isle_stage.html
\ No newline at end of file
-title: /html/head/meta[@name='title']/@content\r
-author: //span[contains(concat(' ',normalize-space(@class),' '),' article_author ')]/a\r
-date: //span[contains(concat(' ',normalize-space(@class),' '),' article_date ')]\r
-\r
-body: //div[@class='entry-content']\r
-\r
+title: /html/head/meta[@name='title']/@content
+author: //span[contains(concat(' ',normalize-space(@class),' '),' article_author ')]/a
+date: //span[contains(concat(' ',normalize-space(@class),' '),' article_date ')]
+
+body: //div[@class='entry-content']
+
single_page_link: //p[@class='pagination']/a
test_url: http://www.stephenfry.com/2011/10/06/steve-jobs/
\ No newline at end of file
-title: article/h1\r
-author: //p[@class='byline']\r
-date: //p[@class='date']\r
+title: article/h1
+author: //p[@class='byline']
+date: //p[@class='date']
body: //div[@class='body']
test_url: https://www.stlbeacon.org/#!/content/23404/mogop_caucus_031712
\ No newline at end of file
-strip_id_or_class: 'left'\r
-strip_id_or_class: 'right'\r
-strip_id_or_class: 'block-belowcontent'\r
+strip_id_or_class: 'left'
+strip_id_or_class: 'right'
+strip_id_or_class: 'block-belowcontent'
test_url: http://stockholm.etc.se/reportage/bakom-stangda-dorrar-pa-fas-3-massa
\ No newline at end of file
--- /dev/null
+body: //div[contains(@class, 'layout__inner')]//div[contains(@class, 'file-image') or contains(@class, 'node__content')]
+author: //article//div[contains(@class, 'field-byline')]
+strip_id_or_class: rekommenderade
+strip_id_or_class: disqus
+strip_id_or_class: annonser
+
+test_url: http://www.stockholmsfria.nu/artikel/112068
\ No newline at end of file
--- /dev/null
+body: //div[@id='article' or @id='current_illustration']
+title: //div[@id='article']//h1
+date: //div[@id='article']//div[@class='date']
+prune: no
+
+test_url: http://www.straightdope.com/columns/read/947/whatever-happened-to-adoption-of-the-metric-system-in-the-u-s
\ No newline at end of file
-title: //h2[@class="post-title"]\r
-date: //span[@class="post-date"]\r
-body: //div[@class="post-entry"]\r
-\r
-#This is also good for *.streetsblog.org, for example:\r
+title: //h2[@class="post-title"]
+date: //span[@class="post-date"]
+body: //div[@class="post-entry"]
+
+#This is also good for *.streetsblog.org, for example:
#http://dc.streetsblog.org/2011/10/21/friday-job-market/
test_url: http://streetsblog.net/2011/10/20/look-out-below-one-in-nine-bridges-structurally-deficient-reports-t4a/
\ No newline at end of file
-title://div[@id='left_col']/h1\r
-author:substring-after(//span[contains(@class,'storycredit')],'BY ')\r
-author://span[contains(@class,'storycredit')]\r
-date:substring-after(//div[contains(@class,'toolbox_date')],'Last updated ')\r
-date://div[contains(@class,'toolbox_date')]\r
-body://div[@id='left_col']\r
-\r
-strip_id_or_class: toolbox\r
-strip_id_or_class: story_features\r
-strip_id_or_class: sharebox_new\r
-strip_id_or_class: related_box\r
-strip_id_or_class: sponsored_links\r
-strip_id_or_class: hidden_ad\r
-strip_id_or_class: story_content_top\r
-strip_id_or_class: total_number\r
-strip_id_or_class: sort_order\r
-strip_id_or_class: subscribe_order\r
-\r
-strip://div[contains(@class,'ad_story')]\r
-\r
-test_url: http://www.stuff.co.nz/national/politics/3930344/PM-issues-challenge\r
+title://div[@id='left_col']/h1
+author:substring-after(//span[contains(@class,'storycredit')],'BY ')
+author://span[contains(@class,'storycredit')]
+date:substring-after(//div[contains(@class,'toolbox_date')],'Last updated ')
+date://div[contains(@class,'toolbox_date')]
+body://div[@id='left_col']
+
+strip_id_or_class: toolbox
+strip_id_or_class: story_features
+strip_id_or_class: sharebox_new
+strip_id_or_class: related_box
+strip_id_or_class: sponsored_links
+strip_id_or_class: hidden_ad
+strip_id_or_class: story_content_top
+strip_id_or_class: total_number
+strip_id_or_class: sort_order
+strip_id_or_class: subscribe_order
+
+strip://div[contains(@class,'ad_story')]
+
+test_url: http://www.stuff.co.nz/national/politics/3930344/PM-issues-challenge
test_url: http://www.stuff.co.nz/entertainment/7045944/International-praise-for-Ladyhawke
\ No newline at end of file
-single_page_link: //iframe[@id='stumbleFrame']/@src\r
-\r
-test_url: www.stumbleupon.com/su/35V0wB/zouchmagazine.com/poetry-violet/
\ No newline at end of file
+single_page_link: //iframe[@id='tb-stumble-frame']/@src
+
+test_url: http://www.stumbleupon.com/su/35V0wB/zouchmagazine.com/poetry-violet/
\ No newline at end of file
-title: //*[@id='posts']/div[1]/h2\r
-author: //*[@id='posts']/div[1]/div[2]/span[2]/a\r
-date: //*[@class='date']\r
-body: //div[@class='body-lead']\r
-\r
-# take out the bit saying 'body'\r
-strip: //div[@class='body-lead']/div[@class='info-label']\r
-\r
-\r
-\r
-\r
-\r
-\r
-\r
-\r
-\r
+title: //*[@id='posts']/div[1]/h2
+author: //*[@id='posts']/div[1]/div[2]/span[2]/a
+date: //*[@class='date']
+body: //div[@class='body-lead']
+
+# take out the bit saying 'body'
+strip: //div[@class='body-lead']/div[@class='info-label']
+
+
+
+
+
+
+
+
+
test_url: http://www.subtraction.com/2011/02/01/unnecessary-explanations
\ No newline at end of file
-# 2012-12-04: complete rewrite after Süddeutsche.de relaunch - carlo@...\r
-\r
-single_page_link: //a[ contains( @href, "/2.220/" ) ]\r
-\r
-body: //article[@id="sitecontent"]/section[@class="body"]\r
-author: //address[@class="author"]\r
-date: //div[@class="header"]//h1//span[@class="updated"]\r
-wrap_in(small): //div[@class="footer"]\r
-wrap_in(i): //figcaption/h3\r
-dissolve: //figcaption//h3\r
-dissolve: //figure/div[@class="body"]\r
-dissolve: //figure/a\r
-\r
-strip: //figure[ not( contains(@class, "zoomimage" ) ) ]\r
-strip: //div[@data-onlineonly="true"]\r
-strip: //address[@class="author"]\r
-\r
+# 2012-12-04: complete rewrite after Süddeutsche.de relaunch - carlo@...
+
+single_page_link: //a[ contains( @href, "/2.220/" ) ]
+
+body: //article[@id="sitecontent"]/section[@class="body"]
+author: //address[@class="author"]
+date: //div[@class="header"]//h1//span[@class="updated"]
+wrap_in(small): //div[@class="footer"]
+wrap_in(i): //figcaption/h3
+dissolve: //figcaption//h3
+dissolve: //figure/div[@class="body"]
+dissolve: //figure/a
+
+strip: //figure[ not( contains(@class, "zoomimage" ) ) ]
+strip: //div[@data-onlineonly="true"]
+strip: //address[@class="author"]
+
test_url: http://www.sueddeutsche.de/muenchen/mietshaus-am-gaertnerplatz-alles-muss-raus-1.1556693
\ No newline at end of file
-title: //div[@class='story-details']/h1\r
-date: //span[@class='date-time']\r
-Author: substring-after(//p[@class='by-line'], 'By ')\r
-\r
-strip: //div[@class='videoThumbnails']\r
-strip: //div[@class='ad-square2-container']\r
-strip: //div[@class='homeDeliveryContainer5']\r
-\r
-strip: //div[@class='image-description']\r
-strip: //div[@id='internal-side-bar']\r
-\r
-strip: //span[@class='hide']\r
+title: //div[@class='story-details']/h1
+date: //span[@class='date-time']
+author: substring-after(//p[@class='by-line'], 'By ')
+
+strip: //div[@class='videoThumbnails']
+strip: //div[@class='ad-square2-container']
+strip: //div[@class='homeDeliveryContainer5']
+
+strip: //div[@class='image-description']
+strip: //div[@id='internal-side-bar']
+
+strip: //span[@class='hide']
strip: //div[@class='date']
test_url: http://www.suntimes.com/technology/ihnatko/8816567-452/review-kindle-fire-is-no-ipad-killer-but-it-is-a-killer-device.html
\ No newline at end of file
-# Ads\r
-strip_id_or_class: articlead\r
+body: //div[@id='article-content']
+author: //div[@id='article']//div[@class='byline']/p
-test_url: http://www.svd.se/nyheter/inrikes/manga-huggormsbitna-golfare_5004031.svd
\ No newline at end of file
+# Ads
+strip_id_or_class: articlead
+
+# Sharing
+strip_id_or_class: share
+
+prune: no
+
+test_url: http://www.svd.se/nyheter/inrikes/oppositionen-stoppar-skattesankning_8531228.svd
+test_url: http://www.svd.se/nyheter/inrikes/manga-huggormsbitna-golfare_5004031.svd
+test_url: http://www.svd.se/?service=rss&type=senastenytt
\ No newline at end of file
--- /dev/null
+title: //article[@role='main']//h1
+body: //article[@role='main']
+strip: //aside
+replace_string(<noscript>): <div>
+replace_string(</noscript>): </div>
+strip_id_or_class: svtHide-No-Js
+strip_id_or_class: aside
+strip_id_or_class: Aside
+strip_id_or_class: hidden
+strip_id_or_class: Share
+tidy: no
+prune: no
+
+test_url: http://www.svt.se/ug/framtidsdrommar-om-jobb-blev-lackande-gifthal
+test_url: http://www.svt.se/nyheter/het-debatt-mellan-borg-och-andersson
+test_url: http://www.svt.se/nyheter/regionalt/svtsormland/sj-tag-evakuerades-efter-rokdrama
\ No newline at end of file
-title: //h1\r
-\r
-author: //a[contains(@href, '/sok/?')]/text()\r
-\r
-date: substring-after(//span[@class='date'], 'Publicerad ')\r
-\r
-body: //div[@class='two_column_left']\r
-strip_id_or_class: story\r
-strip: //div[@class='leadText saplo:lead']/h5\r
-
-test_url: http://www.sydsvenskan.se/kultur-och-nojen/-jag-vill-garna--stanna--
\ No newline at end of file
+title: //h1
+
+author: //a[contains(@href, '/sok/?')]/text()
+
+date: //meta[@name='bi3dPubDate']/@content
+
+body: (//div[contains(@class, 'slider_wrapper')])[1] | //div[@id='article_image' or @class='two_column_left']
+strip_id_or_class: story
+strip_id_or_class: article_body_ad
+strip: //div[@class='leadText saplo:lead']/h5
+
+replace_string(<br />): <br /><br />
+
+prune: no
+
+test_url: http://www.sydsvenskan.se/malmo/allt-jag-ager-ligger-pa-botten/
+test_url: http://www.sydsvenskan.se/kultur-och-nojen/-jag-vill-garna--stanna--
+test_url: http://www.sydsvenskan.se/rss.xml
\ No newline at end of file
-title: //div[contains(@class, "post")]/h2\r
-\r
-author: //div[contains(@class, "post")]/p[position()=last()]/text()[1]\r
-\r
-date: //div[contains(@class, "post")]/p[1]\r
-\r
-body: //div[contains(@class, "post")]\r
-\r
-strip: //div[contains(@class, "post")]/h2[1]\r
-strip: //div[contains(@class, "post")]/p[1]\r
+title: //div[contains(@class, "post")]/h2
+
+author: //div[contains(@class, "post")]/p[position()=last()]/text()[1]
+
+date: //div[contains(@class, "post")]/p[1]
+
+body: //div[contains(@class, "post")]
+
+strip: //div[contains(@class, "post")]/h2[1]
+strip: //div[contains(@class, "post")]/p[1]
strip: //div[contains(@class, "post")]/p[position()=last()]
test_url: http://www.symmetrymagazine.org/breaking/?p=12784
\ No newline at end of file
-title: //h1\r
-body://div[@class='drucken']\r
-author: substring-after(//span[@class='autor'], 'Von ')\r
-author: //span[@class='autor']\r
-\r
-single_page_link://a[contains(@href, '/drucken/')]\r
-convert_double_br_tags:yes\r
-\r
-dissolve://div[@class='vorspann']\r
-\r
-strip://h1\r
-strip_id_or_class: klassifizierung\r
-strip_id_or_class: source\r
+title: //h1
+body://div[@class='drucken']
+author: substring-after(//span[@class='autor'], 'Von ')
+author: //span[@class='autor']
+
+single_page_link://a[contains(@href, '/drucken/')]
+convert_double_br_tags:yes
+
+dissolve://div[@class='vorspann']
+
+strip://h1
+strip_id_or_class: klassifizierung
+strip_id_or_class: source
strip_id_or_class: autor
test_url: http://sz-magazin.sueddeutsche.de/texte/anzeigen/37567
\ No newline at end of file
--- /dev/null
+# 2012-12-04: complete rewrite after Süddeutsche.de relaunch - carlo@...
+
+single_page_link: //a[ contains( @href, "/2.220/" ) ]
+
+body: //article[@id="sitecontent"]/section[@class="body"]
+author: //address[@class="author"]
+date: //div[@class="header"]//h1//span[@class="updated"]
+wrap_in(small): //div[@class="footer"]
+wrap_in(i): //figcaption/h3
+dissolve: //figcaption//h3
+dissolve: //figure/div[@class="body"]
+dissolve: //figure/a
+
+strip: //figure[ not( contains(@class, "zoomimage" ) ) ]
+strip: //div[@data-onlineonly="true"]
+strip: //address[@class="author"]
+
+test_url: http://sz.de/1.1556693
\ No newline at end of file
-title://h1[1]\r
-\r
-author: substring-after(//em, 'Von ')\r
-author:string('tagesschau.de')\r
-\r
-date:substring-after(//div[@class='standDatum'], 'Stand: ')\r
-\r
-body://div[contains(@class, 'article')] | //div[contains(@class, 'centerCol')]\r
-\r
-strip://h1[1]\r
-strip: //div[contains(@class, 'directLinks')]\r
-strip: //div[contains(@class, 'zitatBox')]\r
-strip: //div[contains(@class, 'teaserBox metaBlock')]\r
-strip: //*[contains(@class, 'inv')]\r
-strip: //span[@class='imgSubline']\r
-strip: //*[contains(@class, 'topline')][1]\r
-strip: //div[@id='rightCol'][1]\r
-strip: //div[@id="footer"][1]\r
-strip: //div[@class="fPlayer"] \r
-strip: //div[@id='seitenanfang']\r
-strip: //div[@class='standDatum']\r
+title://h1[1]
+
+author: substring-after(//em, 'Von ')
+author:string('tagesschau.de')
+
+date:substring-after(//div[@class='standDatum'], 'Stand: ')
+
+body://div[contains(@class, 'article')] | //div[contains(@class, 'centerCol')]
+
+strip://h1[1]
+strip: //div[contains(@class, 'directLinks')]
+strip: //div[contains(@class, 'zitatBox')]
+strip: //div[contains(@class, 'teaserBox metaBlock')]
+strip: //*[contains(@class, 'inv')]
+strip: //span[@class='imgSubline']
+strip: //*[contains(@class, 'topline')][1]
+strip: //div[@id='rightCol'][1]
+strip: //div[@id="footer"][1]
+strip: //div[@class="fPlayer"]
+strip: //div[@id='seitenanfang']
+strip: //div[@class='standDatum']
strip: //em
test_url: http://www.tagesschau.de/ausland/wahlkampffrankreich102.html
\ No newline at end of file
-title: //span[@class="entry-title"]\r
-author: //*[contains(@class, 'item')]/p/a/text()\r
-date: substring-after(//*[contains(@class, 'item')]/p/text()[3], 'Posted:')\r
+title: //span[@class="entry-title"]
+author: //*[contains(@class, 'item')]/p/a/text()
+date: substring-after(//*[contains(@class, 'item')]/p/text()[3], 'Posted:')
body: //div[@class="entry-content"]
test_url: http://www.tampabay.com/news/salvador-dali-leaders-want-st-petersburg-city-council-to-put-brakes-on/1236349
\ No newline at end of file
-title: //h3[@class="storytitle"]\r
-body: //div[@class="post"]\r
+title: //h3[@class="storytitle"]
+body: //div[@class="post"]
strip: //div[@class="blurbBox"]
test_url: http://taptaptap.com/blog/apples-precedents-vs-apples-guidelines/
\ No newline at end of file
-title: //span[@id='ctl00_ctl00_MainContent_MainContent_RecipeImage1_lblRecipeTitle']\r
-body: //div[@id='RDNEW']//*[@class='Recipe-imgCon' or @class='Recipe-Intro' or @class='recipeDetails']\r
-strip_id_or_class: rec-ExRightPanel\r
-strip_id_or_class: divCarousel\r
-strip_id_or_class: preptimeOuter\r
-strip_id_or_class: cooktimeOuter\r
-strip_id_or_class: durationOuter\r
-strip_id_or_class: divImageFooter\r
-strip_id_or_class: microFormatFnIngred\r
-strip: //span[@class='Recipe-Intro']//*[@class='link' or @class='rating']\r
-\r
-prune: no\r
-tidy: no\r
-
-test_url: http://www.tasteofhome.com/recipes/Grinch-Punch
\ No newline at end of file
+title: //div[@id='ctl00_MainContent_ctl00_Div1']//h2
+body: //div[@id='ctl00_MainContent_ctl00_Div1']
+
+single_page_link: //div[contains(@class, 'recipeHeader')]//a[contains(@href, '/print')]
+
+strip_image_src: tohPrintL.png
+
+prune: no
+
+test_url: http://www.tasteofhome.com/recipes/Grinch-Punch
+test_url: http://www.tasteofhome.com/recipes/lactose-free-chocolate-chip-cookies
\ No newline at end of file
-date: //div[@class='secthead']\r
-body: //div[@class='sectbody']\r
-title: concat(//div[@class='sectbody']/h4,': ',//div[@class='sectbody']/h1)\r
-author: //span[@class='author']\r
-strip: //p[@class='caption']\r
-strip_id_or_class: rack\r
+date: //div[@class='secthead']
+body: //div[@class='sectbody']
+title: concat(//div[@class='sectbody']/h4,': ',//div[@class='sectbody']/h1)
+author: //span[@class='author']
+strip: //p[@class='caption']
+strip_id_or_class: rack
test_url: http://www.taz.de/Protestbewegung-Occupy/!80188/
\ No newline at end of file
-body: //div[@id='centercontent']\r
-strip: //div[@id='rightcontent']\r
-date: substring-before( //div[@id='cats'], '·')\r
+body: //div[@id='centercontent']
+strip: //div[@id='rightcontent']
+date: substring-before( //div[@id='cats'], '·')
title: //h1
test_url: http://www.tbray.org/ongoing/When/201x/2012/03/04/Mobile-Money
\ No newline at end of file
--- /dev/null
+title: //h2
+body: //div[@class="post_content"]
+author: //span[@class="fn"]
+date: //time[@class="updated"]
+strip_comments: //yes
+footnotes: //yes
+test_url: http://tcmanila.tk/post/29189064358/my-2012-roadmap-is-almost-complete-look-at-the
\ No newline at end of file
-title: //div[@id='main-content']/h1\r
-body: //div[@id='main-content']\r
+title: //div[@id='main-content']/h1
+body: //div[@id='main-content']
strip: //div[@id='main-content']/h1
test_url: http://www.tcng.org/index.php/blog/view/teaching-basic-health-cutting-down-costs
\ No newline at end of file
-title: //h1[@class='storyheadline']\r
-body: //div[@class='storytext']\r
+title: //h1[@class='storyheadline']
+body: //div[@class='storytext']
strip: //strong
test_url: http://tech.fortune.cnn.com/2011/03/17/why-startups-dont-go-public-anymore/?section=money_topstories&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fmoney_topstories+%28Top+Stories%29
\ No newline at end of file
--- /dev/null
+title: //div[@class="title"]/h1
+title: //div[@class="caption"]/h1
+author: substring-after(//div[@class="metadata"]/div[@class="date"]/a[2], 'by ')
+date: //div[@class="metadata"]/div[@class="date"]/a
+test_url: http://tech.gilt.com/post/46359463184/26-3-13-todays-noon-outage-and-what-were-doing-to
\ No newline at end of file
-title://h1[contains(@id,'artibodyTitle')]\r
-\r
-date://span[contains(@id,'pub_date')]\r
-\r
-body://div[contains(@id,'artibody')]\r
-\r
-strip://div[contains(@class,'otherContent')]\r
-\r
-next_page_link://p[@class='page']/a[contains(.,'下一页')]\r
+title://h1[contains(@id,'artibodyTitle')]
+
+date://span[contains(@id,'pub_date')]
+
+body://div[contains(@id,'artibody')]
+
+strip://div[contains(@class,'otherContent')]
+
+next_page_link://p[@class='page']/a[contains(.,'下一页')]
test_url: http://tech.sina.com.cn/mobile/n/2012-03-22/07476863046.shtml
\ No newline at end of file
-body: //div[contains(@class, 'media-container') or contains(@class, 'body-copy')]\r
-\r
-author: //a[@class="name"]\r
-\r
-date: //div[@class="post-time"]\r
-\r
-title: //h1[@class="headline"]\r
-strip_id_or_class: module-crunchbase\r
-\r
-# The following is for the mobile site\r
-body: //div[@id="singlentry"]\r
-author: substring-after(//span[@class="single-post-meta-top"],'rsaquo; ')\r
-date: substring-before(//div[@class="single-post-meta-top"],' @')\r
-title: //a[@class="sh2"]\r
-\r
-prune: no\r
-\r
+body: //div[contains(@class, 'media-container') or contains(@class, 'body-copy')]
+
+author: //a[@class="name"]
+
+date: //div[@class="post-time"]
+
+title: //h1[@class="headline"]
+strip_id_or_class: module-crunchbase
+
+# The following is for the mobile site
+body: //div[@id="singlentry"]
+author: substring-after(//span[@class="single-post-meta-top"],'rsaquo; ')
+date: substring-before(//div[@class="single-post-meta-top"],' @')
+title: //a[@class="sh2"]
+
+prune: no
+
test_url: http://techcrunch.com/2011/10/18/apples-insanely-great-q1-2012/
\ No newline at end of file
-body: //div[@class='story']\r
-title: //div[@class='story']/h1\r
-strip: //div[@class='story']/h1\r
-\r
-author: //div[@class='details']/p[contains(., 'by ')]/a\r
-date: //p[@class='storydate']\r
-\r
-strip: //p[a[contains(., 'Leave a Comment')]]\r
-strip_id_or_class: share\r
-strip_id_or_class: maincolumn_head\r
+body: //div[@class='story']
+title: //div[@class='story']/h1
+strip: //div[@class='story']/h1
+
+author: //div[@class='details']/p[contains(., 'by ')]/a
+date: //p[@class='storydate']
+
+strip: //p[a[contains(., 'Leave a Comment')]]
+strip_id_or_class: share
+strip_id_or_class: maincolumn_head
strip_id_or_class: maincolmod
test_url: http://www.techdirt.com/articles/20120112/17455117394/sega-gets-it-right-about-sopa-its-time-hard-reset-copyright-law-congress.shtml
\ No newline at end of file
--- /dev/null
+title: //div[@class='articleHead']//h1
+author: //div[@class="author-name"]/a[1]
+body: //div[@class="main"]
+
+# remove 'From the Lab' and 'Recent posts' text
+strip: //div[@class='blogLabel']
+
+# remove byline and meta info
+strip: //div[@class="article-meta"]
+strip: //div[@class="author-info"]
+
+#strip tags and categories
+strip: //div[@class="department"]
+
+#strip product cap links
+strip: //div[@class="cap-main"]
+strip: //div[@id="compare-lede"]
+test_url: http://www.techhive.com/article/2010549/up-close-with-blackberry-10.html
\ No newline at end of file
-single_page_link_in_feed: //b/a\r
-\r
+single_page_link_in_feed: //b/a
+
test_url_feed: http://www.techmeme.com/feed.xml
\ No newline at end of file
-title: //h2\r
-author: //meta[@name="author"]/@content\r
-date: //h3\r
-body: //div[@class="postBody"]\r
-strip: //h1\r
-strip: //h2\r
-strip: //h3\r
+title: //h2
+author: //meta[@name="author"]/@content
+date: //h3
+body: //div[@class="postBody"]
+strip: //h1
+strip: //h2
+strip: //h3
test_url: http://technicallyjordan.tumblr.com/post/22914659822/facebook-to-launch-app-store-knock-off
\ No newline at end of file
--- /dev/null
+next_page_link: //a[contains(., 'NEXT PAGE')]
+# following::node() selects text nodes too whereas following::* selects only elements.
+strip: //span[@class='pageo']/following::node()
+strip: //span[@class='pageo']
+test_url: http://technologizer.com/2010/03/08/the-secret-origin-of-windows/
\ No newline at end of file
-title: //header[@class='article-meta']/h1\r
-title: substring-before(//title, '|')\r
-\r
-body: //section[contains(@class, 'body')]\r
-\r
-# Author & Date for News and Featured Stories\r
-author: //ul[@class='byline']/li/a\r
-author: substring-before(substring-after(//ul[@class='byline']/li, 'By '), ' on')\r
-date: substring-after(//ul[@class='byline']/li, 'on ')\r
-\r
-# Author & Date for "Views"\r
-author: //div[@class='view-byline']/div[@class='meta']/h2[1]\r
-date: //div[@class='view-byline']/div[@class='meta']/h2[2]\r
-\r
-next_page_link: //section[@class='pagination']/a[contains(@class, 'continue')]\r
+title: //header[@class='article-meta']/h1
+title: substring-before(//title, '|')
+
+body: //section[contains(@class, 'body')]
+
+# Author & Date for News and Featured Stories
+author: //ul[@class='byline']/li/a
+author: substring-before(substring-after(//ul[@class='byline']/li, 'By '), ' on')
+date: substring-after(//ul[@class='byline']/li, 'on ')
+
+# Author & Date for "Views"
+author: //div[@class='view-byline']/div[@class='meta']/h2[1]
+date: //div[@class='view-byline']/div[@class='meta']/h2[2]
+
+next_page_link: //section[@class='pagination']/a[contains(@class, 'continue')]
test_url: http://www.technologyreview.com/news/427567/facebooks-telescope-on-human-behavior/
\ No newline at end of file
-body: //div[@class="post"]\r
-\r
-strip: //div[@class="post-meta"]\r
-strip: //div[@id="socialicons"]\r
-strip: //div[@id="authorbox"]\r
+body: //div[@class="post"]
+
+strip: //div[@class="post-meta"]
+strip: //div[@id="socialicons"]
+strip: //div[@id="authorbox"]
test_url: http://techpinions.com/why-google-and-microsoft-hate-siri/3572
\ No newline at end of file
-# Title without news/reviews etc. appended\r
-title: //div[@id='subColumn1Pad']/div[1][@class='article']/div[1][@class='articleHead']/h1\r
-\r
-# Remove home link\r
-strip: //div[@id='page_logo']/a\r
-\r
-# Remove utilities\r
-strip: //*[(@id = "utilities")]\r
-\r
-# Remove comments link\r
+# Title without news/reviews etc. appended
+title: //div[@id='subColumn1Pad']/div[1][@class='article']/div[1][@class='articleHead']/h1
+
+# Remove home link
+strip: //div[@id='page_logo']/a
+
+# Remove utilities
+strip: //*[(@id = "utilities")]
+
+# Remove comments link
strip: //div[@id='subColumn1Pad']/div[1][@class='article']/div[1][@class='articleHead']/p[@class='tiny']
test_url: http://www.techradar.com/news/television/sky-to-rebrand-living-as-sky-living-903105
\ No newline at end of file
-body: //div[@id='artikelKolom']\r
-strip: //div[@class='broodMediaBox']/div[@class='docbox' or @class='artBannerWrapper']\r
-strip: //div[@id='artikeltoolbar']\r
-strip: //div[@class='reactiebalk artspacer' or @class='bannercenter clearfix artspacer']\r
-strip: //div[@id='artikelKolomRechts' or @id='TMGTweetWidget']\r
-tidy: no\r
-prune: no\r
+body: //div[@id='artikelKolom']
+strip: //div[@class='broodMediaBox']/div[@class='docbox' or @class='artBannerWrapper']
+strip: //div[@id='artikeltoolbar']
+strip: //div[@class='reactiebalk artspacer' or @class='bannercenter clearfix artspacer']
+strip: //div[@id='artikelKolomRechts' or @id='TMGTweetWidget']
+tidy: no
+prune: no
test_url: http://www.telegraaf.nl/binnenland/10275097/__Identiteit_man_in_sloot_onbekend__.html?cid=rss
\ No newline at end of file
-body: //div[@class='byline' or @id='storyEmbSlide' or @id='mainBodyArea']\r
-strip: //p[@class='comments']\r
-strip: //div[@id='storyEmbSlide']//div[contains(@class, "hide")]\r
-strip: //div[@id='tmg-related-links' or @id='outbrain-related-links' or @id='onespot-related-links']\r
-strip: //p[@class='bbpTweet']/span[@class='timestamp']\r
-strip: //p[@class='bbpTweet']/span[@class='metadata']//img\r
-tidy: no\r
-prune: no\r
+body: //div[@class='byline' or @id='storyEmbSlide' or @id='mainBodyArea']
+strip: //p[@class='comments']
+strip: //div[@id='storyEmbSlide']//div[contains(@class, "hide")]
+strip: //div[@id='tmg-related-links' or @id='outbrain-related-links' or @id='onespot-related-links']
+strip: //p[@class='bbpTweet']/span[@class='timestamp']
+strip: //p[@class='bbpTweet']/span[@class='metadata']//img
+tidy: no
+prune: no
test_url: http://www.telegraph.co.uk/news/worldnews/europe/ireland/8663451/Is-Ireland-divorcing-from-the-Catholic-Church.html
\ No newline at end of file
--- /dev/null
+body://div[@id="print-news"]
+strip://a
+strip://span[@class="date-line"]
+test_url: http://www.thanhnien.com.vn/pages/20121006/hon-90-trieu-usd-nang-cap-do-thi-can-tho.aspx
\ No newline at end of file
--- /dev/null
+tidy: no
+
+test_url: http://the-magazine.org/1/alone-together-again
\ No newline at end of file
--- /dev/null
+author: //h3[@class='authorName']
+date: //time
+body: //div[@class='articleBody']
+strip_id_or_class: adspot
+test_url: http://www.theage.com.au/victoria/top-cops-warns-outlaw-bikies-we-have-a-gang-too-20130331-2h1l8.html
\ No newline at end of file
--- /dev/null
+# Article Metadata
+title: //meta[@property="og:title"]/@content
+author: substring-after(//h3, 'By ')
+date: //h4/a[2]
+
+# Content Pruning
+strip: //h4
+strip: //a[@id="print_button"]
+strip: //p[@class="excerpt"]
+strip: //h3
+strip: //div[@class="caption"]
+strip: //center/a/img
+test_url: http://theamericanscholar.org/too-big-to-fail-and-too-risky-to-exist/
\ No newline at end of file
-# Remove home link\r
+# Remove home link
strip: //div[@id='blog-title']/a
test_url: http://theappleblog.com/2010/10/21/the-new-macbook-air-is-underwhelming/
\ No newline at end of file
-title: //div[@id='article']/h1\r
-title: //h1\r
-\r
-body: //div[@class='articleText']\r
-body: //div[@class='articleContent']\r
-body: //div[@id='article']\r
-date: //*[contains(@class, 'date')]\r
-author: //div[@id='profile']//*[@class='authors']//a[1]\r
-author: //*[@class='author']/span\r
-prune: no\r
-\r
-strip: //div[@class='moreOnBoxWithImages']\r
-\r
-single_page_link: //a[@class='print']\r
-\r
-test_url: http://www.theatlantic.com/technology/archive/2011/04/want-to-see-how-crazy-a-bot-run-market-can-be/237773/\r
-test_url: http://www.theatlantic.com/magazine/archive/2007/11/the-autumn-of-the-multitaskers/6342/\r
+title: //div[contains(@class, 'articleHead')]//h1
+
+body: //div[@class='articleText']
+body: //div[@class='articleContent']
+body: //div[@id='article']
+date: //*[contains(@class, 'date')]
+author: //div[@id='profile']//*[@class='authors']//a[1]
+author: //*[@class='author']/span
+prune: no
+
+strip: //div[@class='moreOnBoxWithImages']
+strip: //p[contains(., 'This article available online at:')]
+strip: //p[contains(., 'This article available online at:')]/following::*
+strip: //div[@class='earthbox']
+
+single_page_link: //article//a[contains(@class, 'print')]
+
+test_url: http://www.theatlantic.com/technology/archive/2011/04/want-to-see-how-crazy-a-bot-run-market-can-be/237773/
+test_url: http://www.theatlantic.com/magazine/archive/2007/11/the-autumn-of-the-multitaskers/6342/
test_url: http://www.theatlantic.com/entertainment/archive/2012/04/30-rock-live-a-funny-reminder-of-why-sitcoms-arent-shot-live-anymore/256447/
\ No newline at end of file
--- /dev/null
+# To administrator:
+# Please replace the hostname with "*.theatlanticcities.com"
+
+# This filter is tested on:
+# http://m.theatlanticcities.com/arts-and-lifestyle/2012/04/invisible-borders-define-american-culture/1839/
+# http://www.theatlanticcities.com/housing/2012/11/chinas-holdouts/3981/
+# http://www.theatlanticcities.com/arts-and-lifestyle/2012/12/christmas-time-here/4133/
+
+title://h1
+author: //ul[@class='meta']/li/a
+date: //ul[@class='meta']/li/following-sibling::li
+body://article[@class='post']
+
+strip://h1
+strip://ul[@class='meta']
+strip://div[@class='newsletter-slug']
+test_url: http://www.theatlanticcities.com/arts-and-lifestyle/2012/12/christmas-time-here/4133/
\ No newline at end of file
-title: //meta[@name='og:title']/@content\r
-date: //meta[@name='created']/@content\r
-body: //div[@class="StoryBody" or @class="storyTeaser"]\r
-\r
-replace_string(<p></p>): <br /><br />\r
-\r
+title: //meta[@name='og:title']/@content
+date: //meta[@name='created']/@content
+body: //div[@class="StoryBody" or @class="storyTeaser"]
+
+replace_string(<p></p>): <br /><br />
+
test_url: http://www.thebostonchannel.com/slideshow/news/28210648/detail.html
\ No newline at end of file
-title: //h2[contains(@class, 'page-title')]\r
-body: //div[@id='content']/div[contains(@id, 'node-')]/div[@class='content']\r
-\r
-prune: no\r
-\r
-strip: //div[contains(@class, 'node-book')]//a[@class='button']\r
-\r
-single_page_link: //a[@class='tool-print']\r
+title: //h2[contains(@class, 'page-title')]
+body: //div[@id='content']/div[contains(@id, 'node-')]/div[@class='content']
+
+prune: no
+
+strip: //div[contains(@class, 'node-book')]//a[@class='button']
+
+single_page_link: //a[@class='tool-print']
test_url: http://thebrowser.com/interviews/yotam-ottolenghi-on-his-favourite-cookery-books
\ No newline at end of file
-title: substring-before(//title, ' – ') \r
-author:string('Shawn')\r
-date: //*/time/@pubdate\r
-\r
-\r
-strip: //header\r
-strip: //div[@id='prev_next']\r
-strip: //div[@id='masthead']\r
-\r
+title: substring-before(//title, ' – ')
+author:string('Shawn')
+date: //*/time/@pubdate
+
+
+strip: //header
+strip: //div[@id='prev_next']
+strip: //div[@id='masthead']
+
test_url: http://thecarton.net/2012/12/20/imdb
\ No newline at end of file
-#keep all body text\r
-prune: no\r
-\r
-#title, body, metadata\r
-title: //div[@class='story_header']/h1\r
-body: //div[@id='content']\r
-author: substring-after(//span[@class='byline'], "by ")\r
-author: substring-after(//span[@class='byline'], "By ")\r
-author: //span[@class='byline']\r
-date: //span[@class='date']\r
-\r
-#formatting\r
-convert_double_br_tags: yes\r
-dissolve: //div[@class='slides_full']/ul/li\r
-\r
-# cleanup\r
-strip: //a[@id='story_note']\r
-strip: //br\r
-strip: //div[@class='intro']\r
-strip: //div[@class='share-block']\r
-strip: //div[@class='sidebar-social']\r
-strip: //div[@class='top-stories']\r
-strip: //div[@class='prevnext']\r
+#keep all body text
+prune: no
+
+#title, body, metadata
+title: //div[@class='story_header']/h1
+body: //div[@id='content']
+author: substring-after(//span[@class='byline'], "by ")
+author: substring-after(//span[@class='byline'], "By ")
+author: //span[@class='byline']
+date: //span[@class='date']
+
+#formatting
+convert_double_br_tags: yes
+dissolve: //div[@class='slides_full']/ul/li
+
+# cleanup
+strip: //a[@id='story_note']
+strip: //br
+strip: //div[@class='intro']
+strip: //div[@class='share-block']
+strip: //div[@class='sidebar-social']
+strip: //div[@class='top-stories']
+strip: //div[@class='prevnext']
test_url: http://www.thedaily.com/page/2012/01/09/010912-news-college-costs-1-5/
\ No newline at end of file
-title: //h1\r
-body: //article/div[contains(@class, 'article-body')]\r
-#strip: //header/hgroup/h1\r
-strip: //footer[@class='storyFooter']\r
-single_page_link: //li[@class='print']/a\r
-prune: no\r
+title: //h1
+body: //article/div[contains(@class, 'article-body')]
+#strip: //header/hgroup/h1
+strip: //footer[@class='storyFooter']
+single_page_link: //li[@class='print']/a
+prune: no
test_url: http://www.thedailybeast.com/articles/2010/04/06/how-mastercard-predicts-divorce.html
\ No newline at end of file
-# Remove duplicated title\r
-strip: //div[@id='content']/div[1][@class='full_intro']/h2\r
-\r
-# Remove links, ads etc.\r
-strip: //*[(@class= "aside")]\r
-\r
-# Remove the date and add it to the date published field in Instapaper\r
-strip: //div[@class="date"]\r
-date: //div[@class="date"]\r
-\r
-# There is no byline on The Daily Mash.\r
-\r
-convert_double_br_tags: yes\r
+# Remove duplicated title
+strip: //div[@id='content']/div[1][@class='full_intro']/h2
+
+# Remove links, ads etc.
+strip: //*[(@class= "aside")]
+
+# Remove the date and add it to the date published field in Instapaper
+strip: //div[@class="date"]
+date: //div[@class="date"]
+
+# There is no byline on The Daily Mash.
+
+convert_double_br_tags: yes
test_url: http://www.thedailymash.co.uk/index.php?option=com_content&task=view&id=4994&Itemid=81&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+thedailymash+%28The+Daily+Mash.+It%27s+news+to+us.%29
\ No newline at end of file
--- /dev/null
+title: //h1[contains(@class, 'entry-title')]
+author: //span[contains(@class, 'author vcard')]
+date: //span[@class = 'entry-date']
+body: //div[@class='entry-content']
+strip_id_or_class: bottomcontainerBox
+strip_id_or_class: lightsocial_container
+test_url: http://thedisneyblog.com/2012/11/17/videopolis-one-woman-disney-musical-beauty-and-the-beast/
\ No newline at end of file
--- /dev/null
+# Tested on:
+# http://theeuropean-magazine.com/352-dyson-george/353-evolution-and-innovation
+# http://theeuropean-magazine.com/522-casertano-stefano/919-morsi-and-the-future-of-egypt
+
+title://h2[@class='article-title']
+author:substring-before(substring-after(//p[@class='article-meta'], 'by'), '—')
+date:substring-after(//p[@class='article-meta'], '—')
+body://div[@class='article']
+
+wrap_in(strong)://p[@class='article-teaser']
+move_into(//div[@class='article-head'])://li/img
+
+strip://h2[@class='article-title']
+strip://p[@class='article-meta']
+strip://div[@class='copyright']
+strip://div[@class='opinions-of-readers']
+test_url: http://theeuropean-magazine.com/522-casertano-stefano/919-morsi-and-the-future-of-egypt
\ No newline at end of file
--- /dev/null
+## ERROR: This config removes all images. The cause is unknown (bad HTML?) - please fix.
+
+title: //h1[@class='featuretitle']
+body: //div[@id='nobordercontentarea']
+
+# remove Twitter badge
+strip: //img[@alt='Follow tgdfweb on Twitter']
+
+# fix for headers not showing for some reason
+wrap_in(h2): //h2[@class='sectionheader']
+dissolve: //h2[@class='sectionheader']
+
+tidy: yes
+test_url: http://thegamedesignforum.com/features/acceleration_flow_1.html
\ No newline at end of file
-title: //h1[@id="headline"]\r
-author: //div[contains(@class, "editorial-byline-author")]/a\r
-date: substring-after(//div[contains(@class, "editorial-byline-meta")], " | ")\r
-\r
-# The article body contains a mix or article and non-article elements, so lot of manual tweaks are needed\r
-body: //div[@id="template"]\r
-strip_id_or_class: editorial-byline-pic\r
-strip_id_or_class: editorial-byline\r
-strip_id_or_class: headline\r
-\r
-# Include the leadin paragraph in the body text, but remove quotes because they're out of context\r
-dissolve: //div[contains(@id, "leadin")]\r
-strip_id_or_class: pullquote\r
-\r
-# Image captions removed because they're confusing in body text\r
-strip_id_or_class: image-caption-content\r
-\r
-# Remove header and footer\r
-strip_id_or_class: header\r
-strip_id_or_class: footer\r
-\r
-# Remove the hidden logo that seems to be used to cause Facebook to show the logo instead of a random article image\r
-strip: /html/body/span[contains(@style, "display: none")]\r
-\r
-# Remove search box\r
-strip_id_or_class: searchContainer\r
-strip: //div[contains(@class, "searchInstruction")]\r
-strip: //div[contains(@class, "searchResults")]/h4\r
-\r
-# Remove the 'Letters to the Editor' section\r
-strip_id_or_class: letter-text\r
-strip_id_or_class: letter-from\r
-strip_id_or_class: letter-date\r
-\r
-# Remove Like/Tweet links \r
-strip_id_or_class: social-tab\r
-\r
-# Remove 'divider' which causes an inexplicable slash to appear in the article body\r
-strip_id_or_class: divider\r
+title: //h1[@id="headline"]
+author: //div[contains(@class, "editorial-byline-author")]/a
+date: substring-after(//div[contains(@class, "editorial-byline-meta")], " | ")
+
+# The article body contains a mix of article and non-article elements, so a lot of manual tweaks are needed
+body: //div[@id="template"]
+strip_id_or_class: editorial-byline-pic
+strip_id_or_class: editorial-byline
+strip_id_or_class: headline
+
+# Include the leadin paragraph in the body text, but remove quotes because they're out of context
+dissolve: //div[contains(@id, "leadin")]
+strip_id_or_class: pullquote
+
+# Image captions removed because they're confusing in body text
+strip_id_or_class: image-caption-content
+
+# Remove header and footer
+strip_id_or_class: header
+strip_id_or_class: footer
+
+# Remove the hidden logo that seems to be used to cause Facebook to show the logo instead of a random article image
+strip: /html/body/span[contains(@style, "display: none")]
+
+# Remove search box
+strip_id_or_class: searchContainer
+strip: //div[contains(@class, "searchInstruction")]
+strip: //div[contains(@class, "searchResults")]/h4
+
+# Remove the 'Letters to the Editor' section
+strip_id_or_class: letter-text
+strip_id_or_class: letter-from
+strip_id_or_class: letter-date
+
+# Remove Like/Tweet links
+strip_id_or_class: social-tab
+
+# Remove 'divider' which causes an inexplicable slash to appear in the article body
+strip_id_or_class: divider
test_url: http://www.theglobalmail.org/feature/tiramisu-time-in-pyongyang/88/
\ No newline at end of file
-single_page_link: //div[contains(@class, 'pagination')]//a[contains(@title, 'ingle page')]\r
-tidy: no\r
-prune: no\r
+single_page_link: //div[contains(@class, 'pagination')]//a[contains(@title, 'ingle page')]
+tidy: no
+prune: no
test_url: http://www.theglobeandmail.com/report-on-business/rob-magazine/how-a-novice-miner-survived-a-summer-in-the-klondike/article2345350/
\ No newline at end of file
--- /dev/null
+title: //h1[@id='headline']
+author: substring-after(//section[@class="credits"]/ul/li[1],"Interview by ")
+date: //time[@pubdate]
+body: //article[@class='interview']
+strip: //article[@class='interview']/footer
+test_url: http://thegreatdiscontent.com/jeffrey-zeldman
\ No newline at end of file
--- /dev/null
+title: //div[@id='main-article-info']//h1
+body: //div[@id='article-wrapper']
+date: //li[@class='publication']//time[@pubdate] | //li[@class='publication']//data[@pubdate]
+strip: //div[contains(@class, 'email-subscription')]
+strip: //div[contains(@class, 'kindleWidget')]
+#strip: //a[not(text())]
+strip_id_or_class: pocket-btn
+author: //li[@class='byline']
+prune: no
+tidy: no
+test_url: http://www.theguardian.com/world/2013/oct/04/nsa-gchq-attack-tor-network-encryption
+test_url: http://www.theguardian.com/world/2013/oct/03/edward-snowden-files-john-lanchester
+test_url: http://www.theguardian.com/commentisfree/2014/jun/15/britishness-search-identity-my-part-in-camerons-odyssey
\ No newline at end of file
-title: //h1[@class="Headline"]\r
-date: substring-after(//div[@class="posted"], 'EDT ')\r
-body: //div[@class="storyBody"]\r
-\r
-strip: //td[@class="AssocContentTD"]\r
-strip: //div[@id="pageTitle"]\r
-strip: //div[@class="posted"]\r
-strip: //div[@class="updated"]\r
-strip: //div[@class="js-kit-disclaimer"]\r
-strip: //table[@class="row3table"]\r
-strip: //div[@class="container2"]\r
+title: //h1[@class="Headline"]
+date: substring-after(//div[@class="posted"], 'EDT ')
+body: //div[@class="storyBody"]
+
+strip: //td[@class="AssocContentTD"]
+strip: //div[@id="pageTitle"]
+strip: //div[@class="posted"]
+strip: //div[@class="updated"]
+strip: //div[@class="js-kit-disclaimer"]
+strip: //table[@class="row3table"]
+strip: //div[@class="container2"]
strip: //div[@id="delta"]
test_url: http://www.theindychannel.com/news/31050840/detail.html
\ No newline at end of file
--- /dev/null
+title: //h1[contains(@class, 'mainTitle')]
+author: //ul[@class='author']//a[@rel='author']
+body: //div[@id='article-box']
+prune: no
+tidy: no
+strip_id_or_class: head
+strip_id_or_class: social-nav
+strip_id_or_class: rate
+strip_id_or_class: video
+
+test_url: http://www.themarker.com/markerweek/1.2093167
\ No newline at end of file
-title: /html/body/div/div[2]/div/div/div/h3\r
-\r
-body: /html/body/div/div[2]/div/div/div/div[2]\r
-\r
-strip: /html/body/div/div[2]/div/div/div/div[6]/div[3]/div/div/div\r
-\r
-tidy: no\r
-\r
+title: /html/body/div/div[2]/div/div/div/h3
+
+body: /html/body/div/div[2]/div/div/div/div[2]
+
+strip: /html/body/div/div[2]/div/div/div/div[6]/div[3]/div/div/div
+
+tidy: no
+
# any way to get rid of this word character garbage?
test_url: http://www.themillions.com/2010/07/at-the-movies-with-david-mitchell-the-thousand-autumns-of-jacob-de-zoet.html
\ No newline at end of file
-body: single-review\r
-strip_id_or_class: featured-review\r
-strip_id_or_class: resources\r
-strip_id_or_class: rate-the-book\r
-strip_id_or_class: write-review\r
+body: single-review
+strip_id_or_class: featured-review
+strip_id_or_class: resources
+strip_id_or_class: rate-the-book
+strip_id_or_class: write-review
test_url: http://themuseumofinnocence.com/review.php?id=1179
\ No newline at end of file
-title: //h1[@class='print-title']\r
-body: //div[@class='print-content']\r
-author: //a[contains(@href, '/authors')]\r
-author: substring-before(//div[@class='print-created'], '|')\r
-date: //span[@class='article-date']\r
-date: substring-after(//div[@class='print-created'], '|')\r
-prune: no\r
-\r
-single_page_link: //ul[contains(@class, 'article-actions-bar')]//a[contains(@href, '/print/article/')]\r
-\r
+title: //h2[@property='dc:title']
+#body: //div[@class='print-content']
+body: //div[@id='wysiwyg']
+author: //a[contains(@href, '/authors')]
+author: substring-before(//div[@class='print-created'], '|')
+date: //span[@class='article-date']
+date: substring-after(//div[@class='print-created'], '|')
+prune: no
+
+#single_page_link: //ul[contains(@class, 'article-actions-bar')]//a[contains(@href, '/print/article/')]
+single_page_link: //ul[contains(@class, 'article-actions-bar')]//a[contains(@href, '?page=full')]
+
test_url: http://www.thenation.com/article/162331/hard-against-time-roy-fisher
\ No newline at end of file
-body: //div[@id="beta-inner"]\r
-title: //h3[@class="entry-header"]\r
+body: //div[@id="beta-inner"]
+title: //h3[@class="entry-header"]
test_url: http://thenetworkgarden.blogs.com/weblog/2011/09/microsoft-metro-and-the-next-wave-in-computing.html
\ No newline at end of file
--- /dev/null
+title: //h1[@class='interior-page-title']
+author: //span[@class='author']/a
+date: //div[@class='byline']/time
+body: //div[@class='rich-text-body']
+
+strip: //div[@class='byline']
+strip: //div[@class='offscreen-menu']
+test_url: http://thenextgeneration.org/blog/post/rebrand-announce/
\ No newline at end of file
-body: //div[@class= 'article-body']\r
-author: //div[@class='featured mb-1']//a[starts-with(@href,'/author')]\r
-\r
-strip: //div[@class = 'bargo']\r
-strip: //div[@class = 'tf']\r
-strip: //div[@class = 'article']/div[@class = 'blue-box']\r
-strip_id_or_class: respond\r
-\r
-tidy: no\r
-next_page_link: //div[@class='pages-wrapper']//span/following-sibling::a/@href\r
-\r
+body: //div[@class= 'article-body']
+author: //div[@class='featured mb-1']//a[starts-with(@href,'/author')]
+
+strip: //div[@class = 'bargo']
+strip: //div[@class = 'tf']
+strip: //div[@class = 'article']/div[@class = 'blue-box']
+strip_id_or_class: respond
+
+tidy: no
+next_page_link: //div[@class='pages-wrapper']//span/following-sibling::a/@href
+
test_url: http://thenextweb.com/apple/2011/10/12/tnw-review-a-complete-guide-to-apples-ios-5-with-icloud-an-os-14-years-in-the-making/
\ No newline at end of file
-body: //div[@id='fullstory']\r
+body: //div[@id='fullstory']
strip: //div[@id='page_leftbar']
test_url: http://theoaklandpress.com/articles/2011/04/25/news/doc4db5330e0bce9220005852.txt
\ No newline at end of file
-title: //h2[@class='title']\r
-date: substring-before(//p[@class='meta'], '|')\r
-body: //div[@class='story']\r
-#body: //div[@class='article_body']\r
-\r
-strip: //h2[@class='title']\r
-strip: //p[@class='meta']\r
-strip: //div[@class='ga_section']\r
-strip: //div[@id='recent_slider']\r
+title: //h2[@class='title']
+date: substring-before(//p[@class='meta'], '|')
+body: //div[@class='story']
+#body: //div[@class='article_body']
+
+strip: //h2[@class='title']
+strip: //p[@class='meta']
+strip: //div[@class='ga_section']
+strip: //div[@id='recent_slider']
test_url: http://www.theonion.com/articles/pathetic-bobcats-owner-again-regaling-players-with,27572/
\ No newline at end of file
-title: //h1[@class='post-title']\r
-body: //div[@class='post']\r
-author: //p[@class='posted-by']\r
-date: //div[@class='sprite post-date']\r
-\r
-# The body of the post doesn't have it's own div so we have to strip out the metadata\r
-strip: //div[@class='author_avatar']\r
-strip: //div[@class='sprite post-date']\r
-strip: //h1[@class='post-title']\r
+title: //h1[@class='post-title']
+body: //div[@class='post']
+author: //p[@class='posted-by']
+date: //div[@class='sprite post-date']
+
+# The body of the post doesn't have its own div so we have to strip out the metadata
+strip: //div[@class='author_avatar']
+strip: //div[@class='sprite post-date']
+strip: //h1[@class='post-title']
strip: //p[@class='posted-by']
test_url: http://thepioneerwoman.com/cooking/2011/08/pie-fats-a-comparison/
\ No newline at end of file
-title: //div[@id="article"]/h2\r
-author: //div[@id="article"]/p[@class="byline"]/a[1]\r
-date: //div[@id="article"]/p[@class="dateline"]/a[2]\r
-body: //div[@id="article"]/div[@id="body"]
-test_url: http://www.theregister.co.uk/2011/10/06/gas_bill_shocker/
\ No newline at end of file
+# Updated 25-Jan-2014
+single_page_link: //a[contains(@href, '/Print/')]
+
+title: //div[@id="article"]/h2
+author: //p[@class="byline"]/a
+date: //p[@class="dateline"]/a[last()]
+
+test_url: http://www.theregister.co.uk/2014/01/24/thirty_years_of_the_apple_macintosh_part_2/
-body: //div[@id='node-content']\r
+body: //div[@id='node-content']
strip_id_or_class: pager
test_url: http://www.theroot.com/views/why-i-am-male-feminist
\ No newline at end of file
-title: /html/body/div/div[2]/div/div/h1\r
-\r
+title: /html/body/div/div[2]/div/div/h1
+
body: /html/body/div/div[2]/div/div/div[2]
test_url: http://therumpus.net/2010/07/the-rumpus-interview-with-david-means/?full=yes
\ No newline at end of file
-#body: (//div[@class='ftr-yt-vid'])[1]\r
-body: (//blockquote[contains(@class, 'postcontent')])[1]\r
-body: (//div[starts-with(@id, 'post_message')])[1]\r
-\r
-prune: no\r
-tidy: no\r
-\r
-#replace_string(<iframe title="YouTube video player"): <div class="ftr-yt-vid"><iframe title="YouTube video player"\r
-#replace_string(</iframe>): </iframe> </div>\r
-\r
+#body: (//div[@class='ftr-yt-vid'])[1]
+body: (//blockquote[contains(@class, 'postcontent')])[1]
+body: (//div[starts-with(@id, 'post_message')])[1]
+
+prune: no
+tidy: no
+
+#replace_string(<iframe title="YouTube video player"): <div class="ftr-yt-vid"><iframe title="YouTube video player"
+#replace_string(</iframe>): </iframe> </div>
+
test_url: http://www.thesiasat.com/showthread.php?19220-Dunya-News-HASB-E-HAAL-16-06-2012-Part-1-5
\ No newline at end of file
-title: //h3[@class='post-title']/a[@class='post-title-link']\r
-body: //div[@class='post-content']\r
+title: //h3[@class='post-title']/a[@class='post-title-link']
+body: //div[@class='post-content']
author: //div[@class='post-meta-under-title']/a
test_url: http://www.thesimpledollar.com/2011/09/13/determining-the-size-of-your-emergency-fund/
\ No newline at end of file
-strip: //*[(@id = "content")]/h2\r
+strip: //*[(@id = "content")]/h2
strip: //*[(@class = "wp-notable-line")]
test_url: http://www.thespoiler.co.uk/index.php/2010/10/21/wayne-rooney-tells-man-utd-its-not-me-its-you
\ No newline at end of file
-title: //h1[contains(@class, 'cTitle')]\r
-body: //div[contains(@class, 'KonaBody') or @id='articleimageright']\r
-author: //meta[@name='Author']/@content\r
-date: //meta[@name='OriginalPublicationDate']/@content\r
-\r
-prune: no\r
-tidy: no\r
-\r
+title: //h1[contains(@class, 'cTitle')]
+body: //div[contains(@class, 'KonaBody') or @id='articleimageright']
+author: //meta[@name='Author']/@content
+date: //meta[@name='OriginalPublicationDate']/@content
+
+prune: no
+tidy: no
+
test_url: http://www.thespoof.com/news/spoof.cfm?headline=s8i108389
\ No newline at end of file
-# savage* filtering is for Savage Love, such as: http://www.thestranger.com/seattle/SavageLove?oid=5135029\r
-\r
-#other filtering are plain articles, such as: http://www.thestranger.com/seattle/the-stranger-election-control-board/Content?oid=5142885\r
-\r
-title: //div[@id='savageColumn_head']/h1\r
-title: //h1[@class="headlineLarge"]\r
-\r
-strip: //div[@id='savage_right'] | //div[@id='savageColumn_head'] | //div[@id='savageArticleRight'] | //div[@id='articleRight'] | //div[@class='savAppBanner']\r
-\r
-body: //div[@id='savageColumn']\r
+# savage* filtering is for Savage Love, such as: http://www.thestranger.com/seattle/SavageLove?oid=5135029
+
+# The other rules are for plain articles, such as: http://www.thestranger.com/seattle/the-stranger-election-control-board/Content?oid=5142885
+
+title: //div[@id='savageColumn_head']/h1
+title: //h1[@class="headlineLarge"]
+
+strip: //div[@id='savage_right'] | //div[@id='savageColumn_head'] | //div[@id='savageArticleRight'] | //div[@id='articleRight'] | //div[@class='savAppBanner']
+
+body: //div[@id='savageColumn']
body: //div[@id='story_text']
test_url: http://www.thestranger.com/seattle/SavageLove?oid=5135029
\ No newline at end of file
-title: //div[@id='storyHdr']/h1\r
-title: //div[@id='print']//h2\r
-body: //div[@class="virtualpage"]\r
-body: //div[@id='print']//div[@id='bd']\r
-author: //meta[@name="AUTHOR"]/@content\r
-author: (//div[@id='print']//div[@id='bd']/h4)[1]\r
-date: //meta[@name="DATE"]/@content\r
-date: //div[@id='print']//div[@id='dte']\r
-\r
-strip_id_or_class: articleFooter\r
-strip_id_or_class: sidebar\r
-strip_id_or_class: ie6PrintSubhead\r
-strip_id_or_class: subHdr\r
-\r
-\r
-replace_string(<P/>): </p><p>\r
-\r
-prune: no\r
-\r
-#TODO: redirects back - perhaps needs referer to work\r
-single_page_link: //div[@id='storyDetail']//a[contains(@href, '/print/')]\r
-\r
-test_url: http://www.thestreet.com/story/11386556/1/which-of-these-10-dividend-stocks-is-worth-the-risk.html\r
-# multi page\r
+title: //div[@id='storyHdr']/h1
+title: //div[@id='print']//h2
+body: //div[@class="virtualpage"]
+body: //div[@id='print']//div[@id='bd']
+author: //meta[@name="AUTHOR"]/@content
+author: (//div[@id='print']//div[@id='bd']/h4)[1]
+date: //meta[@name="DATE"]/@content
+date: //div[@id='print']//div[@id='dte']
+
+strip_id_or_class: articleFooter
+strip_id_or_class: sidebar
+strip_id_or_class: ie6PrintSubhead
+strip_id_or_class: subHdr
+
+
+replace_string(<P/>): </p><p>
+
+prune: no
+
+#TODO: redirects back - perhaps needs referer to work
+single_page_link: //div[@id='storyDetail']//a[contains(@href, '/print/')]
+
+test_url: http://www.thestreet.com/story/11386556/1/which-of-these-10-dividend-stocks-is-worth-the-risk.html
+# multi page
test_url: http://www.thestreet.com/story/11387090/1/7-ubs-stock-picks-for-2012.html
\ No newline at end of file
-title: //h1[contains(@class, "headline")]\r
-\r
-author: //p[contains(@class, "byline")]/a[contains(@class, "author")]\r
-\r
-date: substring-after(normalize-space(//p[contains(@class, "byline")]/span[contains(@class, "publish-date")]), "on ")\r
-\r
-body: //article[contains(@class, 'feature-entry')]\r
-body: //article\r
-prune: no\r
-tidy: no\r
-\r
-strip: //article/header\r
-strip: //*[@id='sticky-menu']\r
-strip: //aside\r
-strip: //nav\r
-\r
-strip_id_or_class: gallery\r
-strip_id_or_class: article-meta\r
-strip_id_or_class: story-navigation\r
-strip_id_or_class: slegend\r
-strip_id_or_class: related-product-meta\r
-strip_id_or_class: comments\r
-strip_id_or_class: ui-jump-list\r
-strip_id_or_class: pullquote\r
-\r
-strip: //q\r
-\r
-strip: //a[contains(@class, 'entry-section-title')]\r
-\r
-test_url: http://www.theverge.com/2012/2/29/2821763/lytro-review\r
-test_url: http://www.theverge.com/2011/11/3/2534861/nokia-lumia-800-review
\ No newline at end of file
+author: //p[contains(@class, "byline")]/a[contains(@class, "author")]
+
+date: //span[contains(@class, "publish-date")]/time[@pubdate]/@datetime
+
+body: //div[contains(@class, 'entry-content')]
+# for vergecasts, e.g. http://www.theverge.com/2013/8/22/4648566/the-vergecast-090-august-22th-2013-video
+body: //article
+prune: no
+#tidy: no
+
+strip: //article/header
+strip: //*[@id='sticky-menu']
+strip: //aside
+strip: //nav
+strip: //img[contains(@class, 'vox-lazy-load')]
+# deal with bad parsing
+strip: //div[contains(@class, 'story-image')]//div[contains(., 'function(')]
+
+strip_id_or_class: gallery
+strip_id_or_class: article-meta
+strip_id_or_class: story-navigation
+strip_id_or_class: slegend
+strip_id_or_class: related-product-meta
+strip_id_or_class: comments
+strip_id_or_class: ui-jump-list
+strip_id_or_class: pullquote
+strip_id_or_class: m-ad
+strip_id_or_class: social-sharing
+strip_id_or_class: m-video-entry__excerpt
+strip_id_or_class: hidden
+
+replace_string(<noscript>): <div>
+replace_string(</noscript>): </div>
+
+find_string: <script
+replace_string: <div style="display:none"
+find_string: </script>
+replace_string: </div>
+
+strip: //q
+
+strip: //a[contains(@class, 'entry-section-title')]
+
+test_url: http://www.theverge.com/2012/2/29/2821763/lytro-review
+test_url: http://www.theverge.com/2011/11/3/2534861/nokia-lumia-800-review
+test_url: http://www.theverge.com/2013/2/24/4026114/barnes-noble-shifting-focus-away-from-nook-hardware
+test_url: http://www.theverge.com/2014/6/19/5824072/top-shelf-living-the-dream
+test_url: http://www.theverge.com/rss/frontpage
\ No newline at end of file
-body: //div[@class="briefingEntry"]\r
-prune: no\r
+body: //div[@class="briefingEntry"]
+prune: no
test_url: http://theweek.com/article/index/215763/insider-trading-on-capitol-hill
\ No newline at end of file
-author: //p[@class="byline"]/a\r
-body: //div[@class="post"]\r
+author: //p[@class="byline"]/a
+body: //div[@class="post"]
test_url: http://thinkprogress.org/special/2011/11/12/367040/harvard-law-professor-criticizes-homeland-security-feel-of-overreaction-to-occupy-harvard/
\ No newline at end of file
-body: //div[@class='main-content-panel']/div[@class='img'] | //div[@id='page_content_Content9_oModuleContent_2_div_Body']\r
+body: //div[@class='main-content-panel']/div[@class='img'] | //div[@id='page_content_Content9_oModuleContent_2_div_Body']
test_url: http://www.thisdaylive.com/articles/australia-pm-talks-human-rights-with-chinas-wen/90394/
\ No newline at end of file
-author: //div[@class='meta clearfix']/a\r
-body: //div[@class='post']\r
-\r
-strip: //div[@class='metaCat']\r
-strip: //div[@class='post']/h1\r
-strip: //div[@class='post']/div[@class='meta clearfix']\r
+author: //div[@class='meta clearfix']/a
+body: //div[@class='post']
+
+strip: //div[@class='metaCat']
+strip: //div[@class='post']/h1
+strip: //div[@class='post']/div[@class='meta clearfix']
strip: //div[@class='post']/div[@class='social-bar clearfix']
test_url: http://thisismynext.com/2011/10/18/galaxy-nexus-android-ice-cream-sandwich-pictures-video-hands-on/
\ No newline at end of file
-author: //span[@class='fn']\r
-date: substring-before(substring-after(//*[@id='center_ajax_sub']/div/div[3],'|'),'|')\r
+author: //span[@class='fn']
+date: substring-before(substring-after(//*[@id='center_ajax_sub']/div/div[3],'|'),'|')
test_url: http://tidbits.com/article/12651
\ No newline at end of file
-# 2011-10-25 - carlo@... - Initial setup.\r
-\r
-single_page_link: //li[@class='print']/a/@href\r
-\r
-title: //h1\r
-author: //meta[@name="byline"]/@content\r
-date: //meta[@name="date"]/@content\r
-\r
-strip: //span[@class="see"]\r
-strip: //div[@class="byline"]\r
-strip: //div[@id="date2"]\r
-strip: //h1\r
-\r
-test_url: http://www.time.com/time/specials/packages/article/0,28804,2094921_2094923_2094924,00.html
\ No newline at end of file
+title: //h1[contains(@class, 'article-title')]
+author: //article//span[contains(@class, 'byline')]
+date: //time[@pubdate]/@datetime
+body: //section[contains(@class, 'article-body')]
+prune: no
+tidy: no
+
+strip: //figcaption
+strip: //p[contains(., 'MORE:') and ./a]
+strip: //aside
+
+test_url: http://time.com/14478/emotions-may-not-be-so-universal-after-all/
\ No newline at end of file
-title: //h1\r
-body: //div[@class="storytext"]\r
-strip: //div[@id="thelogin"]\r
-strip: //*[@class="hide"]\r
+title: //h1
+body: //div[@class="storytext"]
+strip: //div[@id="thelogin"]
+strip: //*[@class="hide"]
strip: //div[@id="anchored"]
test_url: http://www.timeshighereducation.co.uk/story.asp?sectioncode=26&storycode=416124&c=1
\ No newline at end of file
-body: //div[@id='content']\r
-\r
-strip_id_or_class: featured-box\r
-strip_id_or_class: postmeta\r
-strip_id_or_class: respond\r
-\r
-author: //a[contains(@href, '/author/') and contains(@title, 'Posts by')]\r
-date: substring-before(//a[contains(@href, '/author/') and contains(@title, 'Posts by')]/.., ' by ')\r
+body: //div[@id='content']
+
+strip_id_or_class: featured-box
+strip_id_or_class: postmeta
+strip_id_or_class: respond
+
+author: //a[contains(@href, '/author/') and contains(@title, 'Posts by')]
+date: substring-before(//a[contains(@href, '/author/') and contains(@title, 'Posts by')]/.., ' by ')
test_url: http://www.tipb.com/2011/10/17/iphone-4s-review/
\ No newline at end of file
-title: //div[contains(@class, 'article_detail')]/div[@class='entry_header']/h1\r
-title: //div[contains(@class, 'article_detail')]//h1\r
-title: //h1\r
-\r
-body: //div[contains(@class, 'article_detail')]\r
-\r
-author: //div[@class='article_detail']/div[@class='entry_header']/li/div[@class='author']//h3\r
-author: div[@class='author']//h3\r
-strip: //div[contains(@class, 'field-field-book-cover')]\r
-\r
-date: translate(//*[@class='post_date' and contains(., ' 20')], '|', '')\r
-\r
-prune: no\r
-\r
-single_page_link: //a[@class='print-page']\r
-\r
+title: //div[contains(@class, 'article_detail')]/div[@class='entry_header']/h1
+title: //div[contains(@class, 'article_detail')]//h1
+title: //h1
+
+body: //div[contains(@class, 'article_detail')]
+
+author: //div[@class='article_detail']/div[@class='entry_header']/li/div[@class='author']//h3
+author: div[@class='author']//h3
+strip: //div[contains(@class, 'field-field-book-cover')]
+
+date: translate(//*[@class='post_date' and contains(., ' 20')], '|', '')
+
+prune: no
+
+single_page_link: //a[@class='print-page']
+
test_url: http://www.tnr.com/blog/jonathan-chait/92991/did-obama-get-rolled
\ No newline at end of file
-title: //div[@id='maincontent']//div[@class='title']\r
-body: //div[@id='maincontent']//div[@class='byline'] | //div[@id='maincontent']//div[@class='meat']\r
-\r
-tidy: no\r
+title: //div[@id='maincontent']//div[@class='title']
+body: //div[@id='maincontent']//div[@class='byline'] | //div[@id='maincontent']//div[@class='meat']
+
+tidy: no
test_url: http://www.tomdispatch.com/post/175436/tomgram:_noam_chomsky%2C_the_imperial_mentality_and_9_11/
\ No newline at end of file
-tidy: no\r
-title: //title\r
-author: //a[@itemprop = 'author']\r
-date: //time[@itemprop = 'datePublished']\r
-body: //div[@id = 'intelliTXT']\r
-\r
+tidy: no
+title: //title
+author: //a[@itemprop = 'author']
+date: //time[@itemprop = 'datePublished']
+body: //div[@id = 'intelliTXT']
+
next_page_link: //li[@class="pagin next"]/a
test_url: http://www.tomshardware.com/reviews/gaming-graphics-card-review,3107.html
\ No newline at end of file
-body://div[@id="news-content"]/div[@id="intelliTXT"][1]\r
-\r
-author://div[@id="header-news-infos"]/a[1]\r
-\r
-date: //div[@id="header-news-infos"]/span[1]\r
-\r
-title://h1[@id="header-news-title" and @class="hardwareTitle"][1]\r
-\r
-strip://div[@id="news-content"]/div[@id="intelliTXT"]/table \r
-\r
+body://div[@id="news-content"]/div[@id="intelliTXT"][1]
+
+author://div[@id="header-news-infos"]/a[1]
+
+date: //div[@id="header-news-infos"]/span[1]
+
+title://h1[@id="header-news-title" and @class="hardwareTitle"][1]
+
+strip://div[@id="news-content"]/div[@id="intelliTXT"]/table
+
footnotes: no
test_url: http://www.tomshardware.de/DDR4-DDR3-ISSCC-Samsung-Hynix,news-247133.html
\ No newline at end of file
-body: //div[@class='post']\r
-\r
-strip: //div[@class='social']\r
-strip: //span[@class='next']\r
+body: //div[@class='post']
+
+strip: //div[@class='social']
+strip: //span[@class='next']
strip: //span[@class='previous']
test_url: http://toolsandtoys.net/noble-tonic-02/
\ No newline at end of file
--- /dev/null
+# Metadata
+title: substring-after(//title, 'Coyote Tracks - ')
+author: //meta[@name="author"]/@content
+date: //div[@class="post_header"]/a
+
+# Content Pruning
+strip: //div[@class="column left"]
+strip: //div[@class="pages"]
+strip: //a[@class="text_title"]
+strip: //ol[@class="notes"]
+
+dissolve: //div[@class='column right']/ul
+dissolve: //li[@class='post']
+test_url: http://tracks.ranea.org/post/31431060205/the-next-big-uh-slightly-taller-thing
\ No newline at end of file
--- /dev/null
+body: //div[@id='video' or @id='main']
+
+strip_id_or_class: socialshareprivacy2
+strip_id_or_class: wp_rp_first
+
+find_string: Genre</strong>
+replace_string: </strong></p><p><strong>Genre</strong>
+
+test_url: http://www.trailerzone.de/g-i-joe-2-die-abrechnung/
\ No newline at end of file
-title: //div[@class="Post-body"]//span[@class="PostHeader"]\r
-author: //div[@class="PostHeaderIcons metadata"]/a[@title="Author"]\r
-date: substring-before(//div[@class="PostHeaderIcons metadata"], '|')\r
-body: //div[@class="Post-body"]\r
-strip_id_or_class: print1\r
-strip_id_or_class: metadata\r
+title: //div[@class="Post-body"]//span[@class="PostHeader"]
+author: //div[@class="PostHeaderIcons metadata"]/a[@title="Author"]
+date: substring-before(//div[@class="PostHeaderIcons metadata"], '|')
+body: //div[@class="Post-body"]
+strip_id_or_class: print1
+strip_id_or_class: metadata
strip_id_or_class: authorbox
test_url: http://traningslara.se/skoinlagg-och-skador-finns-det-nagot-samband/
\ No newline at end of file
-title: //title\r
-author: //span/a\r
-date: substring-after(//small,'Published:')\r
-\r
-strip: //h1[@class='vert_class']\r
-strip: //h1[@class='headline']\r
-strip: //img[contains(@src,'logo_triblive.gif')]\r
-\r
-#strip: //h6\r
-#strip_img_src: logo_triblive.gif\r
-\r
-single_page_link: //a[@class='stprint']\r
+title: //title
+author: //span/a
+date: substring-after(//small,'Published:')
+
+strip: //h1[@class='vert_class']
+strip: //h1[@class='headline']
+strip: //img[contains(@src,'logo_triblive.gif')]
+
+#strip: //h6
+#strip_img_src: logo_triblive.gif
+
+single_page_link: //a[@class='stprint']
test_url: http://triblive.com/sports/2819913-85/lemieux-deal-penguins-burkle-nhl-owners-team-mario-bettman-case
\ No newline at end of file
-title: //div[@class='printbody']/h1\r
-body: //div[@class='printbody']\r
-prune: no\r
-\r
-strip: //div[@class='printbody']/a[@href='http://www.truthdig.com/']\r
-strip: //table[@class='footer']\r
-\r
-single_page_link: //div[@class='article_tools']//a[contains(@href, '/print/')]\r
-\r
-test_url: http://www.truthdig.com/report/item/the_election_march_of_the_trolls_20110829/
\ No newline at end of file
+title: //div[@class='printbody']/h1
+body: //div[@class='printbody']
+prune: no
+
+strip: //div[@class='printbody']/a[@href='http://www.truthdig.com/']
+strip: //table[@class='footer']
+strip: //h6[contains(., 'http://')]
+
+single_page_link: //a[contains(@href, '/print/')]
+
+test_url: http://www.truthdig.com/report/item/the_election_march_of_the_trolls_20110829/
+test_url: http://www.truthdig.com/dig/item/the_death_of_truth_20130505/
\ No newline at end of file
-title: //h2\r
-author: //a[starts-with(@href, '/AuthorStories')]\r
+title: //h2
+author: //a[starts-with(@href, '/AuthorStories')]
body: //div[@id='storyinnerbody']
test_url: http://www.tthfanfic.org/Story-6512/Kudra+Journeys.htm
\ No newline at end of file
-title: //h1[@class='posttitle']\r
-author: //span[@class='author']/a\r
-date: //span[@class='timestamp']\r
-body: //div[@class='body']\r
+title: //h1[@class='posttitle']
+author: //span[@class='author']/a
+date: //span[@class='timestamp']
+body: //div[@class='body']
test_url: http://www.tuaw.com/2011/10/19/apple-posts-fans-memories-of-steve-jobs/
\ No newline at end of file
-title: //h1[@class='post-title']\r
-author: //div[@class='display-name']\r
-date: //div[@class='date']\r
-body: //div[@class='body']\r
-footnotes: no\r
+title: //h1[@class='post-title']
+author: //div[@class='display-name']
+date: //div[@class='date']
+body: //div[@class='body']
+footnotes: no
test_url: http://tuckreview.com/2012/8/14/migrating-to-v6
\ No newline at end of file
-# Google Custom Search\r
-strip_id_or_class: google_branding_style\r
-\r
-# Avoid double title\r
-strip_id_or_class: pagetitle\r
-\r
-# external links are labelled\r
-strip_image_src: http://static.mediatropes.info/pmwiki/pub/external_link.gif\r
-\r
-title: //div[@class="pagetitle"]\r
-body: //div[@id="wikitext"]\r
-\r
-# don't get clever.\r
-strip_comments: no\r
-prune: no\r
-\r
-# navigation in footer lives inside the wikitext div, annoyingly.\r
-strip_id_or_class: pathholder\r
+# Google Custom Search
+strip_id_or_class: google_branding_style
+
+# Avoid double title
+strip_id_or_class: pagetitle
+
+# external links are labelled
+strip_image_src: http://static.mediatropes.info/pmwiki/pub/external_link.gif
+
+title: //div[@class="pagetitle"]
+body: //div[@id="wikitext"]
+
+# don't get clever.
+strip_comments: no
+prune: no
+
+# navigation in footer lives inside the wikitext div, annoyingly.
+strip_id_or_class: pathholder
test_url: http://tvtropes.org/pmwiki/pmwiki.php/Main/WithinParameters
\ No newline at end of file
-title: //title\r
-body: (//p[contains(@class, 'js-tweet-text')])[1]\r
-author: (//strong[contains(@class, 'fullname')])[1]\r
-date: //span[contains(@class, 'js-short-timestamp')]/@data-time\r
-\r
-prune: no\r
-tidy: no\r
-\r
+title: //title
+body: (//p[contains(@class, 'js-tweet-text')])[1]
+author: (//strong[contains(@class, 'fullname')])[1]
+date: //span[contains(@class, 'js-short-timestamp')]/@data-time
+
+prune: no
+tidy: no
+
test_url: https://twitter.com/medialens/status/216883678582804480
\ No newline at end of file
-body: //div[@class='d3cmsCBody']//div[@class='pubText pubDate' or @class='newsComment' or contains(@class, 'newsPhoto') or @class='newsText']\r
-strip: //div[contains(@class, 'mpindex')]\r
-prune: no\r
-tidy: no\r
-\r
+body: //div[@class='d3cmsCBody']//div[@class='pubText pubDate' or @class='newsComment' or contains(@class, 'newsPhoto') or @class='newsText']
+strip: //div[contains(@class, 'mpindex')]
+prune: no
+tidy: no
+
test_url: http://www.uefa.com/uefaeuropaleague/news/newsid=1617320.html
\ No newline at end of file
-# applies to uk.ds.ign.com, uk.wii.ign.com etc.\r
-# possibly to non-UK versions, but I can’t test that\r
-\r
-title: //h1[@class="headline"]\r
-author: //div[@class="hdr-sub byline"]/a\r
-date: //h2[@class="publish-date"]/span\r
-body: //div[@id="main-article-content"]\r
-\r
-strip: //ul[@class="lnks-readmore"]\r
-\r
-strip: //div[@class="inlineImageCaption"]\r
-# can’t make the images appear, so remove the captions\r
-\r
-strip: //div[@style="width:468px"]\r
-# video caption links\r
-\r
-convert_double_br_tags: yes\r
-\r
-strip_comments: no\r
-# otherwise the ‘Closing Comments’ are removed\r
-\r
+# applies to uk.ds.ign.com, uk.wii.ign.com etc.
+# possibly to non-UK versions, but I can’t test that
+
+title: //h1[@class="headline"]
+author: //div[@class="hdr-sub byline"]/a
+date: //h2[@class="publish-date"]/span
+body: //div[@id="main-article-content"]
+
+strip: //ul[@class="lnks-readmore"]
+
+strip: //div[@class="inlineImageCaption"]
+# can’t make the images appear, so remove the captions
+
+strip: //div[@style="width:468px"]
+# video caption links
+
+convert_double_br_tags: yes
+
+strip_comments: no
+# otherwise the ‘Closing Comments’ are removed
+
# Ratings box could do with some rearranging, but it’s tricky
test_url: http://uk.xbox360.ign.com/articles/121/1210717p1.html
\ No newline at end of file
-author: substring-before(substring-after(//div[@class='post-byline'], 'By '), ', on')\r
-date: substring-after(//div[@class='post-byline'], ', on')\r
-\r
-# for some reason, the following is producing a "no text [48]" error\r
-#title: //div[@class='post-headline']\r
-\r
-# for some reason, the following doesn't appear to isolate just the body copy\r
-body: //div[@class='post-bodycopy']\r
-\r
-# we solve the above issue by stripping out everything else we don't want\r
-# these can probably all be removed if the body: command above worked\r
-strip_id_or_class: reply\r
-strip_id_or_class: left\r
-strip_id_or_class: post-headline\r
-strip_id_or_class: post-byline\r
+author: substring-before(substring-after(//div[@class='post-byline'], 'By '), ', on')
+date: substring-after(//div[@class='post-byline'], ', on')
+
+# for some reason, the following is producing a "no text [48]" error
+#title: //div[@class='post-headline']
+
+# for some reason, the following doesn't appear to isolate just the body copy
+body: //div[@class='post-bodycopy']
+
+# we solve the above issue by stripping out everything else we don't want
+# these can probably all be removed if the body: command above worked
+strip_id_or_class: reply
+strip_id_or_class: left
+strip_id_or_class: post-headline
+strip_id_or_class: post-byline
strip_id_or_class: footer
test_url: http://www.uni-watch.com/2011/10/18/the-curious-case-of-steve-debergs-microphone-and-speaker/
\ No newline at end of file
--- /dev/null
+title: //h1[@class='postTitle']
+author: //a[@rel='author']
+date: substring-before(//h4[@class='postAuthor'], '|')
+body: //div[@class='postContent']
+
+strip: //div[@class='simplePullQuote']
+
+wrap_in(figure): //img
+test_url: http://www.unwinnable.com/2013/04/23/gratifying-play/
\ No newline at end of file
--- /dev/null
+body: //div[contains(@class, 'layout__inner')]//div[contains(@class, 'file-image') or contains(@class, 'node__content')]
+author: //article//div[contains(@class, 'field-byline')]
+strip_id_or_class: rekommenderade
+strip_id_or_class: disqus
+strip_id_or_class: annonser
+
+test_url: http://www.uppsalafria.se/artikel/97167
\ No newline at end of file
-title: //title\r
-body: //td[@id='content']
-test_url: http://www.urbandictionary.com/define.php?term=Grown-Ass
\ No newline at end of file
+title: //title
+body: //table[@id='entries']
+test_url: http://www.urbandictionary.com/define.php?term=Grown-Ass
--- /dev/null
+date: //meta[@itemprop="datePublished"]/@content
+author: //div[@itemprop="author"]
+body: //div[@itemprop='articleBody']
+
+strip_id_or_class: share-tools
+
+test_url: http://www.usatoday.com/story/news/world/2014/03/18/malaysia-plane-search/6552429/
+test_url: http://rssfeeds.usatoday.com/usatoday-NewsTopStories
\ No newline at end of file
-body: //div[@id='CS_Element_maincontent']\r
-\r
-tidy: no\r
-prune: no\r
+body: //div[@id='CS_Element_maincontent']
+
+tidy: no
+prune: no
test_url: http://www.usccb.org/bible/readings/072412.cfm
\ No newline at end of file
-title: //h1\r
-\r
-date: substring-after(//p[@class='overline']/strong, ',')\r
-body: //div[@class="maintext"]\r
-strip: //p[@class='overline']\r
-strip: //h1\r
+title: //h1
+
+date: substring-after(//p[@class='overline']/strong, ',')
+body: //div[@class="maintext"]
+strip: //p[@class='overline']
+strip: //h1
tidy: no
test_url: http://www.useit.com/alertbox/mobile-startup-screen.html
\ No newline at end of file
--- /dev/null
+title: //meta[@property='dc:title']/@content
+date: //div[@class='content']//span[@property='dc:date']/@content
+body: //div[@property='content:encoded']
+prune: no
+
+test_url: http://www.usfirst.org/roboticsprograms/frc/Photo-From-Kickoff-Filming
\ No newline at end of file
--- /dev/null
+title: //h1
+author: //*[@class='byline']
+date: substring-after(//*[@class='pubdatetime'], 'Published: ')
+body: //*[@class='body-block']
+test_url: http://utdailybeacon.com/news/2012/oct/8/energy-forum-continues/
\ No newline at end of file
-author: ("Arturo Toledo")\r
-title: //div[@class="post"]/h2\r
-body: //div[@class="entry"]\r
-\r
-# Remove Twitter button\r
+author: ("Arturo Toledo")
+title: //div[@class="post"]/h2
+body: //div[@class="entry"]
+
+# Remove Twitter button
strip: //div[@class="entry"]/p[2]/a/img
test_url: http://ux.artu.tv/?p=192
\ No newline at end of file
-title: //meta[@property="og:title"]/@content\r
-author: //div[contains(@class, 'byline')]//span[contains(@class, 'name')]\r
-date: //div[contains(@class, 'cn_date_time')]\r
-body: //div[contains(@class, 'pageContainers')]\r
-body: //article[@id='items-container']\r
-#body: //h2[@class='sub-header'] | //div[contains(@class, 'contributor-type') or @class='display-date' or @class='content-container']\r
-\r
-strip_id_or_class: bc\r
-strip_id_or_class: utilities\r
-strip_id_or_class: list-supporting\r
-strip_id_or_class: yrail\r
-strip_id_or_class: urail\r
-\r
-prune: no\r
-#tidy: no\r
-\r
-strip_id_or_class: super-rubric-section\r
-strip_id_or_class: cn_date_time\r
-strip_id_or_class: cn_contributors\r
-strip_id_or_class: cn_pagination_controls\r
-strip_id_or_class: cn_features_container\r
-strip_id_or_class: global-footer\r
-strip_id_or_class: cn_ecom_placement\r
-strip: //li[@class='blogNavPrev']\r
-\r
-single_page_link: //a[@title='Print this page']\r
-\r
-test_url: http://www.vanityfair.com/politics/features/2011/05/egypt-revolutionaries-201105\r
-test_url: http://www.vanityfair.com/politics/features/2008/08/hitchens200808\r
+title: //meta[@property="og:title"]/@content
+author: //div[contains(@class, 'byline')]//span[contains(@class, 'name')]
+date: //div[contains(@class, 'cn_date_time')]
+body: //div[contains(@class, 'pageContainers')]
+body: //article[@id='items-container']
+#body: //h2[@class='sub-header'] | //div[contains(@class, 'contributor-type') or @class='display-date' or @class='content-container']
+
+strip_id_or_class: bc
+strip_id_or_class: utilities
+strip_id_or_class: list-supporting
+strip_id_or_class: yrail
+strip_id_or_class: urail
+
+prune: no
+#tidy: no
+
+strip_id_or_class: super-rubric-section
+strip_id_or_class: cn_date_time
+strip_id_or_class: cn_contributors
+strip_id_or_class: cn_pagination_controls
+strip_id_or_class: cn_features_container
+strip_id_or_class: global-footer
+strip_id_or_class: cn_ecom_placement
+strip: //li[@class='blogNavPrev']
+
+single_page_link: //a[@title='Print this page']
+
+test_url: http://www.vanityfair.com/politics/features/2011/05/egypt-revolutionaries-201105
+test_url: http://www.vanityfair.com/politics/features/2008/08/hitchens200808
test_url: http://www.vanityfair.com/style/2012/01/prisoners-of-style-201201
\ No newline at end of file
-title: //div[@class='ArticleHeadlineDetailedView']\r
-date: //span[@class='ArticlePublicationDateTimeDetailedView']\r
-author://span[@class='ArticleBylineDetailedView']\r
+title: //div[@class='ArticleHeadlineDetailedView']
+date: //span[@class='ArticlePublicationDateTimeDetailedView']
+author://span[@class='ArticleBylineDetailedView']
body: //div[@class='ArticleTextDetailedView']
test_url: http://www.varingen.no/Nyheter/tabid/392/Default.aspx?ModuleId=56651&articleView=true
\ No newline at end of file
-# FB comments are inside an h2. Weird. Without this, the line 'Comments' is preserved by the text parser\r
-\r
+# FB comments are inside an h2. Weird. Without this, the line 'Comments' is preserved by the text parser
+
strip: //h2
test_url: http://www.varsity.co.uk/reviews/2662
\ No newline at end of file
--- /dev/null
+title://div[@class="detail-new-title"]
+body://div[@class="innerpad"]
+strip://div[@class="ArticleUtility"]
+strip://div[@class="commentPost"]
+strip://div[@class="comment-box"]
+strip://div[@id="TinLienQuan"]
+test_url: http://vea.gov.vn/vn/tintuc/tintuchangngay/Pages/T%C4%83ng-c%C6%B0%E1%BB%9Dng-b%E1%BA%A3o-t%E1%BB%93n-%C4%91%E1%BB%99ng-v%E1%BA%ADt-hoang-d%C3%A3-%E1%BB%9F-Vi%E1%BB%87t-Nam.aspx
\ No newline at end of file
-title: //td[@class='second_content']/h1\r
+title: //td[@class='second_content']/h1
body: //td[@class='second_content']/div[@class='article_text']
test_url: http://www.vedomosti.ru/newspaper/article/259377/rasprodazha_mailru
\ No newline at end of file
-author: //div[@class="blogginnleggForfatter"]\r
-date: concat(//div[@class='blogginnleggDatoDag'],' ',//div[@class='blogginnleggDatoMnd'])\r
-strip: //div[contains(@id,"bloggDelingslenker")]\r
+author: //div[@class="blogginnleggForfatter"]
+date: concat(//div[@class='blogginnleggDatoDag'],' ',//div[@class='blogginnleggDatoMnd'])
+strip: //div[contains(@id,"bloggDelingslenker")]
strip: //div[contains(@id,"bloggDelingslenker")]
test_url: http://veggbilder.no/blogginnlegg/fristelser
\ No newline at end of file
-title: //h2\r
-date: substring-before(//small," • Permalink")\r
-author:string('Martin Hering')\r
-\r
+title: //h2
+date: substring-before(//small," • Permalink")
+author:string('Martin Hering')
+
Strip: //p/small
test_url: http://vemedio.com/blog/posts/state-of-support-and-icloud
\ No newline at end of file
-title: //h1[@class="entry-title"]\r
-author: //div[@class="author-name"]\r
-date: //span[@class="the-time"]\r
-body: //div[@class="entry-content"]\r
+title: //h1[@class="entry-title"]
+author: //div[@class="author-name"]
+date: //span[@class="the-time"]
+body: //div[@class="entry-content"]
strip: //div[@class="vb-gallery"]
test_url: http://venturebeat.com/2012/07/17/marissa-mayer-yahoo/#s:mayer-1
\ No newline at end of file
-title: //article/header/h1\r
-\r
-author: //article/header/section[@class='byline']/span[contains(@class, 'author')]/a\r
-date: //article/header/section[@class='byline']/span[@class='published']/span\r
-\r
-body: //article/section[@class='body']\r
-\r
-convert_double_br_tags: yes\r
-\r
-# This is required, because Tidy chokes on the HTML5 tags...\r
+title: //article/header/h1
+
+author: //article/header/section[@class='byline']/span[contains(@class, 'author')]/a
+date: //article/header/section[@class='byline']/span[@class='published']/span
+
+body: //article/section[@class='body']
+
+convert_double_br_tags: yes
+
+# This is required, because Tidy chokes on the HTML5 tags...
tidy: no
test_url: http://www.version2.dk/artikel/17069-amerikansk-hit-investor-er-vild-med-danske-net-ivaerksaettere
\ No newline at end of file
-title: //title\r
-body: //div[contains(@class, 'printRecipe')]\r
-strip: //div[@class='recipeHeader']\r
-prune: no\r
-tidy: no\r
+title: //title
+body: //div[contains(@class, 'printRecipe')]
+strip: //div[@class='recipeHeader']
+prune: no
+tidy: no
single_page_link: //ul[@class='printOptions']//a[contains(@href, 'detail.aspx?p=1&showphoto=true')]
test_url: http://www.verybestbaking.com/recipes/143190/Penne-Pasta-with-Sun-dried-Tomato-Cream-Sauce/detail.aspx
\ No newline at end of file
-body: //div[@id='artikkelspalte']\r
+body: //div[@id='artikkelspalte']
strip_id_or_class: 'breadcrumb'
test_url: http://www.vg.no/spill/artikkel.php?artid=10003628
\ No newline at end of file
-title: concat("Video: ", //div[@id='currentVideoTitleDivId'])\r
-body: //div[@id='currentVideoDescriptionId']\r
-author: //meta[@name='author']/@content\r
-\r
-replace_string(<div id="currentVideoDescriptionId" style="display): <div id="currentVideoDescriptionId" style="displayitplease\r
-\r
-replace_string(<div id="currentVideoTitleDivId" style="display): <div id="currentVideoTitleDivId" style="displayitplease\r
-\r
+title: concat("Video: ", //div[@id='currentVideoTitleDivId'])
+body: //div[@id='currentVideoDescriptionId']
+author: //meta[@name='author']/@content
+
+replace_string(<div id="currentVideoDescriptionId" style="display): <div id="currentVideoDescriptionId" style="displayitplease
+
+replace_string(<div id="currentVideoTitleDivId" style="display): <div id="currentVideoTitleDivId" style="displayitplease
+
test_url: http://video.forbes.com/fvn/business/wells-fargo-inside-the-bank-that-works
\ No newline at end of file
-title: //h2[@class='posttitle']\r
-date: substring-before(substring-after(//span[@class='postdate'], 'on '), ' by')\r
-date: //span[@class='postdate']\r
-author: //span[@class='postdate']/a\r
+title: //h2[@class='posttitle']
+date: substring-before(substring-after(//span[@class='postdate'], 'on '), ' by')
+date: //span[@class='postdate']
+author: //span[@class='postdate']/a
body: //div[@class='entry line_top']
test_url: http://videogum.com/395042/here-are-some-afternoon-links-92/list/
\ No newline at end of file
-title: //h2[@class='headline']\r
-\r
+title: //h2[@class='headline']
+
body: //div[@class='ContentPrint']
-\r
-prune: no\r
-\r
-single_page_link: //a[contains(@href, '/printVersion/')]\r
-\r
+
+prune: no
+
+single_page_link: //a[contains(@href, '/printVersion/')]
+
test_url: http://www.villagevoice.com/2010-03-16/news/new-york-s-ten-worst-landlords/
\ No newline at end of file
-title: //title\r
-body: //iframe\r
-\r
-find_string: <html><iframe \r
-replace_string: <iframe id="video" \r
-\r
-find_string: ></iframe></html>\r
-replace_string: ></iframe>\r
-\r
-replace_string("): "\r
-\r
-single_page_link: //link[@type='text/xml+oembed']\r
-\r
-prune: no\r
-tidy: no\r
-\r
+title: //title
+body: //iframe
+
+find_string: <html><iframe
+replace_string: <iframe id="video"
+
+find_string: ></iframe></html>
+replace_string: ></iframe>
+
+replace_string("): "
+
+single_page_link: //link[@type='text/xml+oembed']
+
+prune: no
+tidy: no
+
test_url: http://vimeo.com/35941909
\ No newline at end of file
--- /dev/null
+title: //div[@id='singletext']//h1
+body: //div[contains(@class, 'mypictureborder')] | //div[@id='singletext']
+prune: no
+
+strip_id_or_class: singletostart
+strip_id_or_class: navigation
+strip_id_or_class: social
+strip_id_or_class: single_topwrapper
+strip: //a[contains(., 'Nächster Artikel')]
+
+test_url: http://www.viply.de/?p=87973
+test_url: http://www.viply.de/?feed=rss2
\ No newline at end of file
-# Author's name, when present, has 'skrifar:' ('writes:') appended to it.\r
-# In case of multiple authors, this would be 'skrifa:', hence only 7 characters\r
-# are stripped off.\r
-author: substring(//div[@class='paragraph']/div[@class='meta'], 0, string-length(//div[@class='paragraph']/div[@class='meta']) - 7)\r
-\r
-date: //span[@class='date']\r
-title: //h1\r
-body: //div[@class='paragraph']\r
-\r
-# Strip out author string when present\r
-strip: //div[@class='paragraph']/div[@class='meta']\r
-\r
+# Author's name, when present, has 'skrifar:' ('writes:') appended to it.
+# In case of multiple authors, this would be 'skrifa:', hence only 7 characters
+# are stripped off.
+author: substring(//div[@class='paragraph']/div[@class='meta'], 0, string-length(//div[@class='paragraph']/div[@class='meta']) - 7)
+
+date: //span[@class='date']
+title: //h1
+body: //div[@class='paragraph']
+
+# Strip out author string when present
+strip: //div[@class='paragraph']/div[@class='meta']
+
convert_double_br_tags: yes
test_url: http://visir.is/esb,-ipa,-bhm-og-bsrb/article/2012701319997
\ No newline at end of file
-strip: //*[(@id = "ja-search")]\r
-body: //*[(@id = "ja-mainbody")]\r
-body: //*[(@id = "content-mass-bottom")]\r
-strip://h3[contains(span,'Related Posts')]\r
+strip: //*[(@id = "ja-search")]
+body: //*[(@id = "ja-mainbody")]
+body: //*[(@id = "content-mass-bottom")]
+strip://h3[contains(span,'Related Posts')]
strip://img
test_url: http://vitispr.com/blog/coventry-is-a-technology-hotspot
\ No newline at end of file
-body: //div[@cpms_content]//h2[@class='Lead'] | //div[@cpms_content]//p[@class='Normal'] | //div[@cpms_content]//table\r
-strip://div[@class="box-item"]\r
-strip://div[@id="ARTICLE_BANNER"]\r
-strip://a\r
-strip://div[@class="tag-parent"]\r
-strip://div[@class="email-print txtr"]\r
-\r
+body: //div[@cpms_content]//h2[@class='Lead'] | //div[@cpms_content]//p[@class='Normal'] | //div[@cpms_content]//table
+strip://div[@class="box-item"]
+strip://div[@id="ARTICLE_BANNER"]
+strip://a
+strip://div[@class="tag-parent"]
+strip://div[@class="email-print txtr"]
+
test_url: http://vnexpress.net/gl/xa-hoi/2011/04/tim-thay-nan-nhan-cuoi-cung-vu-sap-mo-da-o-len-co/
\ No newline at end of file
-title: //h1\r
+title: //h1
body: //div[@class='entrytext']
test_url: http://voices.washingtonpost.com/ezra-klein/2010/10/why_isnt_monetary_policy_discr.html
\ No newline at end of file
-body: //div[contains(@class, 'KonaBody')]\r
+body: //div[contains(@class, 'KonaBody')]
test_url: http://www.vworker.com/RentACoder/misc/BidRequests/ShowBidRequest.asp?lngBidRequestId=1634186
\ No newline at end of file
-title: //h2[@class="title"]\r
-body: //div[@class="post"]\r
+title: //h2[@class="title"]
+body: //div[@class="post"]
test_url: http://waffle.wootest.net/2011/06/22/on-reading-news/
\ No newline at end of file
-title: //div[@id='pr']/h3\r
-author: //div[@class='dateline']//a[contains(@href, '/author/')]\r
-\r
-# print page\r
-body: //div[@id='prbody']\r
-# standard page\r
-body: //div[@id='pgbody']\r
-\r
-# for multi-page articles\r
-single_page_link: //div[@class='tipjar']//a[contains(@href, '/printerFriendly.php?')]\r
-\r
-prune: no\r
+title: //div[@id='pr']/h3
+author: //div[@class='dateline']//a[contains(@href, '/author/')]
+
+# print page
+body: //div[@id='prbody']
+# standard page
+body: //div[@id='pgbody']
+
+# for multi-page articles
+single_page_link: //div[@class='tipjar']//a[contains(@href, '/printerFriendly.php?')]
+
+prune: no
test_url: http://www.walrusmagazine.com/articles/2011.12-memoir-kidnapped
\ No newline at end of file
-title: //h3\r
+title: //h3
body: //div[@class="content_wysiwyg"]
test_url: http://www.warnerbros.fr/game-of-thrones-un-junket-vu-de-l-interieur-268.html
\ No newline at end of file
--- /dev/null
+body: //div[@class='main']//article
+
+prune: no
+
+test_url: http://www.washingtoninstitute.org/policy-analysis/view/striking-syria-lessons-from-the-israeli-experience?goback=.gde_3822158_member_273623672
+test_url: http://www.washingtoninstitute.org/rss/11/10
\ No newline at end of file
-title://a[@class = 'headline-article']\r
-\r
-author: substring-after(//div[@class = 'article']/p[@class = 'author'], 'By ')\r
-date://div[@class = 'article']/span[@class = 'date']\r
-body://div[@class = 'article']\r
-single_page_link://a[@class = 'print']\r
-strip://p[@class = 'author']\r
-strip://a[@class = 'headline-article']\r
+title://a[@class = 'headline-article']
+
+author: substring-after(//div[@class = 'article']/p[@class = 'author'], 'By ')
+date://div[@class = 'article']/span[@class = 'date']
+body://div[@class = 'article']
+single_page_link://a[@class = 'print']
+strip://p[@class = 'author']
+strip://a[@class = 'headline-article']
strip://span[@class = 'date']
test_url: http://www.washingtonmonthly.com/magazine/julyaugust_2011/features/the_trinity_sisters030380.php
\ No newline at end of file
-body: //div[@class="article_body"]\r
-author://meta[@name='DC.creator']/@content\r
-title://meta[@name='title']/@content\r
-date://div[contains(@class,'byline')]//span[contains(@class,'published')]/@title\r
-date://meta[@name="DC.date.issued"]/@content\r
-strip://div[@class="relative primary-slot padding-top img-border gallery-container photo-wrapper"]\r
-strip://div[@id="wp-column six end"]\r
-strip://div[contains(@class,'hidden')]\r
-strip://div[@id='article-side-rail']\r
-strip://div[@class="module component todays-paper-module curved"]\r
-strip://div[@class="module component live-qa curved img-border"]\r
-strip://div[@class="module component newsletter-signup curved"]\r
-strip://div[@class="module featured-stories component curved img-border"]\r
-\r
-strip_id_or_class: carousel\r
-strip_id_or_class: toolbar\r
-strip_id_or_class: module\r
-\r
-test_url: http://www.washingtonpost.com/world/europe/in-europe-new-fears-of-german-might/2011/10/19/gIQA3baZ7L_story.html?hpid=z1\r
-test_url: http://www.washingtonpost.com/national/health-science/radical-theory-of-first-americans-places-stone-age-europeans-in-delmarva-20000-years-ago/2012/02/28/gIQA4mriiR_story.html\r
+# Seems to be redirecting to articles.washingtonpost.com for many users
+
+body: //div[contains(@class, "article_body")]
+# print view
+body: //div[@id='print_facet']//div[@id='body']
+
+author://meta[@name='DC.creator']/@content
+title://meta[@name='title']/@content
+date://div[contains(@class,'byline')]//span[contains(@class,'published')]/@title
+date://meta[@name="DC.date.issued"]/@content
+strip://div[@class="relative primary-slot padding-top img-border gallery-container photo-wrapper"]
+strip://div[@id="wp-column six end"]
+strip://div[contains(@class,'hidden')]
+strip://div[@id='article-side-rail']
+strip://div[@class="module component todays-paper-module curved"]
+strip://div[@class="module component live-qa curved img-border"]
+strip://div[@class="module component newsletter-signup curved"]
+strip://div[@class="module featured-stories component curved img-border"]
+
+strip_id_or_class: carousel
+strip_id_or_class: toolbar
+strip_id_or_class: module
+
+# Change gJQAwdJG4U_story.html to gJQAwdJG4U_print.html
+single_page_link: concat(substring-before(//link[@rel="canonical"]/@href, "_story.html"), "_print.html")
+
+# [OLD] Change gJQAwdJG4U_story.html to gJQAwdJG4U_story_print.html
+#single_page_link: concat(substring-before(//link[@rel="canonical"]/@href, "_story.html"), "_story_print.html")
+
+test_url: http://www.washingtonpost.com/world/europe/in-europe-new-fears-of-german-might/2011/10/19/gIQA3baZ7L_story.html?hpid=z1
+test_url: http://www.washingtonpost.com/national/health-science/radical-theory-of-first-americans-places-stone-age-europeans-in-delmarva-20000-years-ago/2012/02/28/gIQA4mriiR_story.html
test_url: http://www.washingtonpost.com/lifestyle/magazine/the-sorry-fate-of-a-tech-pioneer-halsey-minor-and-historic-virginia-estate-carters-grove/2012/05/30/gJQAwdJG4U_story.html
\ No newline at end of file
-body: //div[@id='template_article']\r
-\r
-strip_id_or_class: article_more\r
-strip: //hr\r
+body: //div[@id='template_article']
+
+strip_id_or_class: article_more
+strip: //hr
test_url: http://www.web-libre.org/dossiers/jacuzzi-gonflable,8493.html
\ No newline at end of file
-title://div[@class="post"]/h2\r
-author://p[@class="postinfo"]/a\r
-date:substring-before(substring-after(//p[@class="postinfo"],' on '),' under ')\r
+title://div[@class="post"]/h2
+author://p[@class="postinfo"]/a
+date:substring-before(substring-after(//p[@class="postinfo"],' on '),' under ')
body://div[@class="contenttext"]
test_url: http://weblog.bignerdranch.com/?p=304
\ No newline at end of file
-title: //h2[@class="pageTitle"]\r
-strip: //div[@class="postfoot"]\r
-strip: //h2[@class="pageTitle"]\r
-strip: //h3[@class="pageTitle"]\r
-body: //div[@class="post"]\r
-author: substring-before(substring-after(//div[@class="postfoot"], 'by'), 'Filed')\r
-date: substring-before(substring-after(//div[@class="postfoot"], 'Published'), 'by')\r
+title: //h2[@class="pageTitle"]
+strip: //div[@class="postfoot"]
+strip: //h2[@class="pageTitle"]
+strip: //h3[@class="pageTitle"]
+body: //div[@class="post"]
+author: substring-before(substring-after(//div[@class="postfoot"], 'by'), 'Filed')
+date: substring-before(substring-after(//div[@class="postfoot"], 'Published'), 'by')
test_url: http://weblogs.asp.net/scottgu/archive/2011/08/31/html-editor-smart-tasks-and-event-handler-generation-asp-net-vnext-series.aspx
\ No newline at end of file
-tidy: no\r
-dissolve: //div[@id="content"]/div/article/header\r
-body: //div[@id="content"]/div/article \r
-title: //div[@id="content"]/div/article/h1\r
-date: //div[@id="content"]/div/article/header/div[@id="issueSelectTrigger"]\r
-strip: //div[@id="content"]/div/article/h1\r
+tidy: no
+dissolve: //div[@id="content"]/div/article/header
+body: //div[@id="content"]/div/article
+title: //div[@id="content"]/div/article/h1
+date: //div[@id="content"]/div/article/header/div[@id="issueSelectTrigger"]
+strip: //div[@id="content"]/div/article/h1
test_url: http://webpaper.nzz.ch/2012/06/23/front/JJKMS/aphrodite-und-die-kommunisten?guest_pass=24a3ca5b6d%3AJJKMS%3Ad30e1be8628c099669671d4da56cdce4187790ba
\ No newline at end of file
--- /dev/null
+strip: //*[@class="paginator"]
+body: //*[@id="articleText"]
+next_page_link: //a[@class="next"]
+
+# No author detection
+# No publishing date detection
+# No author and intro deduplication over multiple pages
+test_url: http://webwereld.nl/analyse/111452/de-code-van-dorifel-nader-bekeken.html
\ No newline at end of file
-# set body\r
-tidy: no\r
-body: //div[contains(@class, 'articleContent')]\r
-\r
-# remove clutter\r
-strip: //div[@class='advertising']\r
-strip: //div[@class='themenalarm']\r
-strip: //div[contains(@class, 'inTextTeaser')]\r
-\r
-# remove captions\r
-strip: //span[@class='copyRight']\r
-\r
-# remove photo galleries and extras\r
-strip: //div[contains(@class, 'textGallery')]\r
-strip: //div[contains(@class, 'videoGallery')]\r
-strip: //div[contains(@class, 'imageGallery')]\r
-strip: //div[contains(@class, 'openContent')]\r
-\r
-# remove comments\r
-strip: //div[@id = 'writeComment']\r
-\r
+# set body
+tidy: no
+body: //div[contains(@class, 'articleContent')]
+
+# remove clutter
+strip: //div[@class='advertising']
+strip: //div[@class='themenalarm']
+strip: //div[contains(@class, 'inTextTeaser')]
+
+# remove captions
+strip: //span[@class='copyRight']
+
+# remove photo galleries and extras
+strip: //div[contains(@class, 'textGallery')]
+strip: //div[contains(@class, 'videoGallery')]
+strip: //div[contains(@class, 'imageGallery')]
+strip: //div[contains(@class, 'openContent')]
+
+# remove comments
+strip: //div[@id = 'writeComment']
+
test_url: http://www.welt.de/vermischtes/weltgeschehen/article11050589/27-Bergleute-in-neuseelaendischer-Mine-vermisst.html
\ No newline at end of file
-title: substring-before(//title, '«')\r
-\r
-body: //div[@class='entry']\r
-strip: //div[@class='sharing_label']\r
+title: substring-before(//title, '«')
+
+body: //div[@class='entry']
+strip: //div[@class='sharing_label']
strip: //div[@class='snap_nopreview sharing robots-nocontent']
test_url: http://www.westhamtillidie.com/2012/03/11/twelve-things-we-learned-from-the-doncaster-game/
\ No newline at end of file
-strip: //div[@class="navigation"]\r
-strip: //div[@id="sidebar"]\r
-strip: //div[@id="post-extra-content"]\r
-strip: //div[@id="footer"]\r
-strip: //div[contains(@class, "sharing")]\r
+strip: //div[@class="navigation"]
+strip: //div[@id="sidebar"]
+strip: //div[@id="post-extra-content"]
+strip: //div[@id="footer"]
+strip: //div[contains(@class, "sharing")]
test_url: http://whatever.scalzi.com/2011/01/09/quick-giffords-follow-up/
\ No newline at end of file
-body://div[contains(@class,'oAndtLyrics')]\r
-strip://div[contains(@class,'info')]\r
-strip://div[contains(@id,'romanization')]\r
-strip://div[contains(@id,'youtube')]\r
-strip://div[contains(@id,'romanizationSelector')]\r
-strip://div[contains(@id,'langSelectWrap')]\r
-strip://div[contains(@id,'requestTranslationWrap')]\r
-strip://div[contains(@id,'viewMore')]\r
-strip://div[contains(@class,'lyricsListInMainContent')]\r
+body://div[contains(@class,'oAndtLyrics')]
+strip://div[contains(@class,'info')]
+strip://div[contains(@id,'romanization')]
+strip://div[contains(@id,'youtube')]
+strip://div[contains(@id,'romanizationSelector')]
+strip://div[contains(@id,'langSelectWrap')]
+strip://div[contains(@id,'requestTranslationWrap')]
+strip://div[contains(@id,'viewMore')]
+strip://div[contains(@class,'lyricsListInMainContent')]
strip://div[contains(@class,'descIpNoti')]
test_url: http://wheelyric.com/lyrics/121#2
\ No newline at end of file
-title: //h1\r
-body: //div[@id='content']\r
-strip_id_or_class: editsection\r
-strip_id_or_class: toc\r
-strip: //div[@id='siteNotice']\r
-strip: //div[@id='content']//table[last()]\r
+title: //h1
+body: //div[@id='content']
+strip_id_or_class: editsection
+strip_id_or_class: toc
+strip: //div[@id='siteNotice']
+strip: //div[@id='content']//table[last()]
prune: no
test_url: http://wiki.guildwars.com/wiki/Monk
\ No newline at end of file
-title: //h1\r
-body: //div[@id='content']\r
-strip_id_or_class: editsection\r
-strip_id_or_class: toc\r
-strip: //div[@id='siteNotice']\r
-strip: //div[@id='content']//table[last()]\r
+title: //h1
+body: //div[@id='content']
+strip_id_or_class: editsection
+strip_id_or_class: toc
+strip: //div[@id='siteNotice']
+strip: //div[@id='content']//table[last()]
prune: no
test_url: http://wiki.guildwars2.com/wiki/Guardian
\ No newline at end of file
--- /dev/null
+# ...&printable=yes
+body: //div[@id='bodycontents']
+prune: no
+tidy: no
+strip_id_or_class: gatEditSection
+strip_id_or_class: relatedwikihows
+#strip: //div[contains(@class, 'step_num')]
+
+replace_string(<script ): <div style="display: none"
+replace_string(</script>): </div>
+
+single_page_link: //a[@id='gatPrintView']
+single_page_link: concat(//link[@rel='canonical']/@href, '?printable=yes')
+
+test_url: http://www.wikihow.com/Start-Your-Own-Country
\ No newline at end of file
-# copied from .wikipedia.org.txt\r
-title: //h1[@id='firstHeading' or @class='firstHeading']\r
-body: //div[@id = 'bodyContent']\r
-strip_id_or_class: editsection\r
-#strip_id_or_class: toc\r
-strip_id_or_class: vertical-navbox\r
-strip: //table[@id='toc'] | //div[@id='p-toc']\r
-strip: //div[@id='catlinks' or @id='contentSub']\r
-strip: //div[@id='jump-to-nav']\r
-strip: //div[@class='thumbcaption']//div[@class='magnify']\r
-strip: //table[@class='navbox']\r
-prune: no\r
+# copied from .wikipedia.org.txt
+title: //h1[@id='firstHeading' or @class='firstHeading']
+body: //div[@id = 'bodyContent']
+strip_id_or_class: editsection
+#strip_id_or_class: toc
+strip_id_or_class: vertical-navbox
+strip: //table[@id='toc'] | //div[@id='p-toc']
+strip: //div[@id='catlinks' or @id='contentSub']
+strip: //div[@id='jump-to-nav']
+strip: //div[@class='thumbcaption']//div[@class='magnify']
+strip: //table[@class='navbox']
+prune: no
tidy: no
test_url: http://wikitravel.org/wiki/en/index.php?title=Bangkok&printable=yes
\ No newline at end of file
-strip: //div[@class="widget-area"]\r
-title: //*[@class="entry-title"]\r
+strip: //div[@class="widget-area"]
+title: //*[@class="entry-title"]
date: //time[@class="entry-date"]
test_url: http://will-self.com/2012/02/01/real-meals-dominos-pizza/
\ No newline at end of file
-title: substring-after(//span[@class='itemTitle'], ':') \r
+title: substring-after(//span[@class='itemTitle'], ':')
body: //div[@id='content']
test_url: http://www.williampfaff.com/modules/news/article.php?storyid=491
\ No newline at end of file
-title: //h1/span\r
-\r
-body: //div[@id="news_content"]\r
-\r
-author: //div[@class="bookmarks_btm"]/p[1]/a[1]/text()\r
-\r
-date: //span[@class='date']\r
-\r
-# Rubrikenbild entfernen\r
-strip: //div[@id="news_content"]/a[1]\r
+title: //h1/span
+
+body: //div[@id="news_content"]
+
+author: //div[@class="bookmarks_btm"]/p[1]/a[1]/text()
+
+date: //span[@class='date']
+
+# Rubrikenbild entfernen
+strip: //div[@id="news_content"]/a[1]
test_url: http://winfuture.de/news,69672.html
\ No newline at end of file
-title: //h1[@class='page-heading']\r
-author: //small/strong/a\r
-#their date string is relative, so if you save the page 2 hours after it is posted it may say 'two hours ago, instead of providing a useful date/time'\r
-date: substring-before(substring-after(//small,'on'),'with')\r
-body: //div[@class='entry']\r
+title: //h1[@class='page-heading']
+author: //small/strong/a
+#their date string is relative, so if you save the page 2 hours after it is posted it may say 'two hours ago' instead of providing a useful date/time
+date: substring-before(substring-after(//small,'on'),'with')
+body: //div[@class='entry']
test_url: http://www.winrumors.com/chinese-windows-phone-launch-still-on-track-for-early-2012/
\ No newline at end of file
-date: //*[@class='kicker']\r
-body: //*[@class='KonaBody']\r
+date: //*[@class='kicker']
+body: //*[@class='KonaBody']
test_url: http://www.winsupersite.com/article/paul-thurrotts-wininfo/android-malware-surges-separate-studies-141364
\ No newline at end of file
-title: //meta[@property="og:title"]/@content \r
-title: //h1\r
-title: //*[@class='posttitle']\r
-author: //*[@class='entryAuthor']/a[1]\r
-author://*[@class='member-title']\r
-author://li[@class='author']/a[contains(@href, '/author/')]\r
-date: substring-after(//div[@class='entryAuthor'], '·')\r
-date: substring-before(//*[@class='entryDate'], '|')\r
-body: //div[@class='entry']\r
-strip: //span[contains(@class, 'nextprev')]\r
-#strip_id_or_class: ngg-galleryoverview \r
-# ngg-galleryoverview is the whole content sometimes, e.g. http://www.wired.com/underwire/2011/12/best-mixtapes-of-2011/?pid=5736&viewall=true\r
-\r
-strip: //p[span[contains(@class, 'contentjump')]]\r
-strip: //text()[contains(., 'nextpage')]\r
-\r
-prune: no\r
-\r
-single_page_link: //a[contains(@href, '/all/1') and contains(@class, 'contentjumpall')]\r
-\r
-test_url: http://www.wired.com/cloudline/2011/10/meet-arms-cortex-a15-the-future-of-the-ipad-and-possibly-the-macbook-air/\r
-test_url: http://www.wired.com/threatlevel/2012/05/ff_counterfeiter/all/1
\ No newline at end of file
+title: //meta[@name='Title']/@content
+author: //meta[@name='Author']/@content
+date: //meta[@name='DisplayDate']/@content
+body: //div[@class='entry']
+strip: //p[contains(., 'Pages:') and contains(., 'View All')]
+strip: //p[@class='caption']
+strip: //div[@class='desc' or @class='slide' or @id='slide-info']
+
+strip_id_or_class: pullquote
+strip_id_or_class: left_rail
+strip_id_or_class: related-container
+strip_id_or_class: radvert-caption-wrap
+
+# Remove gallery?
+strip_id_or_class: wpgallery
+
+#strip: //text()[contains(., 'nextpage')]
+
+prune: no
+
+single_page_link: //a[.='View All' and contains(@href, '/all/')]
+
+test_url: http://www.wired.com/cloudline/2011/10/meet-arms-cortex-a15-the-future-of-the-ipad-and-possibly-the-macbook-air/
+test_url: http://www.wired.com/wiredenterprise/2013/09/docker/
+test_url: http://www.wired.com/threatlevel/2012/05/ff_counterfeiter/all/
-title: //div[@class="bodyText"]/h1/text()\r
-body: //div[@class="bodyText"]\r
-\r
-# author and date are separated by only a newline\r
-# can't figure out how to tokenize that yet\r
-author: //div[@class="bodyText"]/span[@class="info"]/text()\r
-date: //div[@class="bodyText"]/span[@class="info"]/text()\r
-\r
-# strip metdata from body text\r
-strip: //div[@class="bodyText"]/h1/text()\r
-strip: //div[@class="bodyText"]/span[@class="info"]\r
+title: //div[@class="bodyText"]/h1/text()
+body: //div[@class="bodyText"]
+
+# author and date are separated by only a newline
+# can't figure out how to tokenize that yet
+author: //div[@class="bodyText"]/span[@class="info"]/text()
+date: //div[@class="bodyText"]/span[@class="info"]/text()
+
+# strip metadata from body text
+strip: //div[@class="bodyText"]/h1/text()
+strip: //div[@class="bodyText"]/span[@class="info"]
strip: //div[@class="bodyText"]/span[@class="info"]
test_url: http://www.wmnf.org/news_stories/light-rail-advocates-join-forces-to-combat-opposition-in-pinellas
\ No newline at end of file
-date://*[@class="entry-date"]\r
-author://*[@class="author vcard"]\r
+date://*[@class="entry-date"]
+author://*[@class="author vcard"]
strip://*[@style="position:relative;left:72px;top:2px;"]|//*[@id="authorbox"]
test_url: http://wmpoweruser.com/breaking-nokia-announces-nfc-support-in-lumia-610-windows-phone-device/
\ No newline at end of file
-title: //div[@class="content article"]/h1\r
-date: substring-after(//*[@class='date'], '//')\r
-body: //*[@class='article-content']\r
+title: //div[@class="content article"]/h1
+date: substring-after(//*[@class='date'], '//')
+body: //*[@class='article-content']
strip: //*[@id='nomodal']
test_url: http://www.worldpoultry.net/news/kyrgyzstan-restricts-poultry-imports-from-russia-and-kazakhstan-9332.html
\ No newline at end of file
-title: //p[@id='content']\r
-\r
+title: //p[@id='content']
+
body: //div[@class='contentblock']
test_url: http://www.worldwidewords.org/weirdwords/ww-gro1.htm
\ No newline at end of file
-title: //h2[@class="posttitle"]\r
-body: //div[@class="post"]\r
-strip: //h2[@class="posttitle"]\r
-strip: //p[@class="filed-under"]\r
+title: //h2[@class="posttitle"]
+body: //div[@class="post"]
+strip: //h2[@class="posttitle"]
+strip: //p[@class="filed-under"]
convert_double_br_tags: yes
test_url: http://wow.joystiq.com/2011/06/20/the-overachiever-guide-to-midsummer-festival-2011-achievements/
\ No newline at end of file
--- /dev/null
+body: //div[@id='nrelate_flyout_placeholder']
+
+strip_id_or_class: share
+
+prune: no
+
+test_url: http://www.wpmayor.com/themes/wordpress-portfolio-resume-themes/
+test_url: http://www.wpmayor.com/feed/
\ No newline at end of file
--- /dev/null
+title: //h1[contains(@class, 'header-2')]
+body: //article//*[contains(@class, 'teaserText') or contains(@class, 'lastUpdated') or contains(@class, 'image') or contains(@class, 'body')]
+strip_id_or_class: articleIndex
+prune: no
+
+test_url: http://www.wtatennis.com/news/article/3190914
+test_url: http://www.wtatennis.com/news/article/3190244
\ No newline at end of file
-body://div[@id='articleNew']\r
-strip://div[@id='articleBy']\r
-strip://div[@id='articleDate']\r
-strip://td[@class='articleGraphicCredit']\r
-strip://h1\r
-strip://div[@id='articleEnd']\r
-strip://p[@class='tagline']\r
-strip://div[@class='openBox adslibraryArticle']\r
-strip_id_or_class:ad-180x150-1\r
-\r
-\r
-title: //div[@id="articleNew"]/h1\r
-author: //div[@id="articleBy"]/p/b\r
-date: substring-before(//div[@id="articleDate"], "-")\r
+body://div[@id='articleNew']
+strip://div[@id='articleBy']
+strip://div[@id='articleDate']
+strip://td[@class='articleGraphicCredit']
+strip://h1
+strip://div[@id='articleEnd']
+strip://p[@class='tagline']
+strip://div[@class='openBox adslibraryArticle']
+strip_id_or_class:ad-180x150-1
+
+
+title: //div[@id="articleNew"]/h1
+author: //div[@id="articleBy"]/p/b
+date: substring-before(//div[@id="articleDate"], "-")
test_url: http://www1.folha.uol.com.br/mundo/1115805-ex-ditador-argentino-videla-e-condenado-a-50-anos-de-prisao.shtml
\ No newline at end of file
-title:h1
-author: //*[@class = 'author']
-date: //*[@class = 'date']
-body: //*[@id = 'art']
-next_page_link: //*[@id='Str']/a[contains(text(), 'nastepne')]
-strip: //*[@class = 'rel_zdjTOP']
-strip: //*[@id = 'rel']
-strip: //*[@class = 'txt_upl']
-strip: //*[@id='Str']
-strip: //*[@id='source']
-test_url: http://wyborcza.pl/1,123455,11536088,Gdy_peknie_fejs__obryzga_wszystko.html?as=1&startsz=x
\ No newline at end of file
+body: //div[@id='article']
+strip: //div[@class='head']
+
+strip_id_or_class: txt_upl
+
+single_page_link: //div[@id='gazeta_article_tools']//a[contains(@class, 'print')]
+
+test_url: http://wyborcza.pl/1,123455,11536088,Gdy_peknie_fejs__obryzga_wszystko.html?as=1&startsz=x
+test_url: http://wyborcza.pl/1,75478,14880255,Biskup_Dydycz_o_pedofilii_i_tajemnicy_spowiedzi__Zamiast.html
\ No newline at end of file
-body: //div[@class='article-body']\r
+body: //div[@class='article-body']
title: //h1
test_url: http://wyctim.com/icloud-sync-regebbi-rendszereken/
\ No newline at end of file
-title://h1\r
-\r
-date://p[@class='articleDate']\r
+title://h1
+
+date://p[@class='articleDate']
body://div[@class='articleBody wzStandardArticle']
test_url: http://www.wz-newsline.de/home/sport/tennis/federer-zum-vierten-mal-sieger-in-indian-wells-1.938050
\ No newline at end of file
--- /dev/null
+# This filter is tested on:
+# http://www.xfgjls.com/magazine/html/?131.html
+# http://www.xfgjls.com/magazine/html/?170.html
+
+body://h3/following-sibling::div
+title: //h3
+date: substring-before(//h3/following-sibling::div/p, ' ')
+author: substring-before(substring-after(//h3/following-sibling::div/p, '作者:'), '来源')
+wrap_in(strong)://span[contains(@style, "FONT-WEIGHT: bold")]
+dissolve://span[@style="FONT-FAMILY: '宋体'; FONT-SIZE: 10.5pt; FONT-WEIGHT: bold; mso-spacerun: 'yes'"]
+test_url: http://www.xfgjls.com/magazine/html/?170.html
\ No newline at end of file
-title: //h1[@class="entry-title"]\r
-author: //span[@class="fn"]\r
+title: //h1[@class="entry-title"]
+author: //span[@class="fn"]
date: //p[@class="meta"]
test_url: http://xoeb.us/blog/2012/03/16/my-mistakes-with-our-first-release/
\ No newline at end of file
--- /dev/null
+body: //span[@id='article_content' or @class='text16g']
+
+# ads
+strip: //div[.//div[contains(@id, 'ads.')]]
+# related content heading
+strip: //p[contains(., 'עוד בערוץ החדשות של ynet:')]
+strip: //p[contains(., 'כותרות אחרונות מהעולם בחדשות ynet:')]
+strip: //div[contains(., 'אינציקלופדיית ynet:')]
+# related content links
+strip: //a[@class='bluelink']
+# strip image bullets
+strip_image_src: ynet_manual_bullet.png
+
+prune: no
+tidy: no
+
+# prevent JS issues
+find_string: <script type='text/javascript'>
+replace_string: <div style="display:none;">
+find_string: </script>
+replace_string: </div>
+
+test_url: http://www.ynet.co.il/articles/0,7340,L-4354266,00.html
+test_url: http://www.ynet.co.il/articles/0,7340,L-4354268,00.html
+#feed
+test_url: http://www.ynet.co.il/Integration/StoryRss2.xml
\ No newline at end of file
-title://div[@class='entry-title']\r
-body://div[@class='entry-content']\r
-strip_comments:yes\r
+title://div[@class='entry-title']
+body://div[@class='entry-content']
+strip_comments:yes
convert_double_br_tags:yes
test_url: http://www.yostivanich.com/2010/07/11/wired-com-with-world-watching-wikileaks-falls-into-disrepair/
\ No newline at end of file
--- /dev/null
+body: //div[@class="nxFullTextData"]
+test_url: http://yourerie.com/fulltext?nxd_id=306552
-title: //title\r
-body: //iframe\r
-\r
-find_string: <html><iframe \r
-replace_string: <iframe id="video" \r
-\r
-find_string: ></iframe></html>\r
-replace_string: ></iframe>\r
-\r
-single_page_link: //link[@type='text/xml+oembed']\r
-\r
-prune: no\r
-tidy: no\r
-\r
+title: //title
+body: //iframe
+
+find_string: <html><iframe
+replace_string: <iframe id="video"
+
+find_string: ></iframe></html>
+replace_string: ></iframe>
+
+single_page_link: //link[@type='text/xml+oembed']
+
+prune: no
+tidy: no
+
test_url: http://www.youtube.com/watch?v=F6gLH0r3iVU
\ No newline at end of file
--- /dev/null
+title: //h1[@id='view_title']
+author: //div[contains(@class, 'content_authors')]//a
+body: //div[@id='view_body']
+
+prune: no
+
+test_url: http://www.zcommunications.org/orwellian-language-update-by-edward-s-herman.html
\ No newline at end of file
-title: //h1[@class="h s-1"]\r
-author: substring-before(substring-after(//p[@class="meta s-10"], 'By'), '|')\r
-author: substring-after(//div[@class="bio"]//h3, 'About ')\r
-date: substring-after(//p[@class="meta s-10"], '|')\r
-date: substring-after(//p[@class="meta"], '|')\r
-body: //div[@class="content-1 entry space-1 clear"]\r
-body: //div[@class="storyBody"]\r
-\r
-test_url: http://www.zdnet.com/blog/microsoft/the-bing-back-end-more-on-cosmos-tiger-and-scope/10920\r
+title: //h1[@class="h s-1"]
+author: substring-before(substring-after(//p[@class="meta s-10"], 'By'), '|')
+author: substring-after(//div[@class="bio"]//h3, 'About ')
+date: substring-after(//p[@class="meta s-10"], '|')
+date: substring-after(//p[@class="meta"], '|')
+body: //div[@class="content-1 entry space-1 clear"]
+body: //div[@class="storyBody"]
+
+test_url: http://www.zdnet.com/blog/microsoft/the-bing-back-end-more-on-cosmos-tiger-and-scope/10920
test_url: http://www.zdnet.com/researchers-find-web-tracking-up-privacy-down-7000000358/
\ No newline at end of file
-# 2012-12-23 [carlo@...] fixed half-assed headlines in articles, removed inline author profiles, adjusted picture captions\r
-# 2012-03-17 [dkless@...] Cut metadata parts in the beginning and the ends of the content block; copyright entries for pictures removed; Author fixed, not sure if old entries still valid (I left them); Weird problems with some pages addressed (see last section for removing hidden section)\r
-# 2011-12-09 [carlo@...] Removed "related articles" block\r
-# 2011-08-23 [carlo@...] changed single page link to use print version: page works better, less ambiguity. Related cleanups and simplifications.\r
-# 2011-08-20 [carlo@...] added author, fixed date\r
-\r
-\r
-single_page_link: //a[@title='Druckversion']\r
-tidy: no\r
-\r
-title: //title\r
-date: substring-before( //li[@class="date"], " " )\r
-author: //li[@class="author"]/a/text() | //li[@class="author first"]/a/text()\r
-author: substring-after(//li[@class='source first '], 'Quelle: ')\r
-\r
-strip_id_or_class: articleheader\r
-strip: //div[@id="comments"] | //div[@class="pagination block"] | //p[@class="ressortbacklink"] | //div[@id="relatedArticles"] | // div[@class="inline portrait"]\r
-\r
-#Removes author and date from the start\r
-strip: //ul[@class="tools"]\r
-#Removes copyright statement - often disturb as first line of the news\r
-strip: //p[@class="copyright"]\r
-strip: //div[@class="copyright"]\r
-#Removes pagination links at the end\r
-strip: //div[@class="pagination"]\r
-\r
-# Fix picture captions\r
-wrap_in(small): //p[@class="caption"]/text()\r
-\r
-# Fix sub-headlines\r
-wrap_in(h2): //p/strong\r
-dissolve: //h2/strong\r
-\r
-#Sometimes things are embedded in the print version that are not displayed on the web, but will be displayed in the mobilized versions and lead even to problems. These sections are removed here.\r
-strip_id_or_class:"informatives"\r
-strip_id_or_class:"bottom"\r
-strip_id_or_class:"teasermosaic"\r
-strip_id_or_class:"comments"\r
-strip_id_or_class:"articlefooter af"\r
-strip_id_or_class:"relateds"\r
-strip_id_or_class:"pagination"\r
-\r
-footnotes: no\r
-test_url: http://www.zeit.de/kultur/film/2012-12/Kurzfilmtag
\ No newline at end of file
+# 2013.10.30 [rezor92] fixed single_page_link
+# 2012-12-23 [carlo@...] fixed half-assed headlines in articles, removed inline author profiles, adjusted picture captions
+# 2012-03-17 [dkless@...] Cut metadata parts in the beginning and the ends of the content block; copyright entries for pictures removed; Author fixed, not sure if old entries still valid (I left them); Weird problems with some pages addressed (see last section for removing hidden section)
+# 2011-12-09 [carlo@...] Removed "related articles" block
+# 2011-08-23 [carlo@...] changed single page link to use print version: page works better, less ambiguity. Related cleanups and simplifications.
+# 2011-08-20 [carlo@...] added author, fixed date
+
+
+single_page_link: //a[@title='Auf einer Seite']
+tidy: no
+
+title: //title
+date: substring-before( //li[@class="date"], " " )
+author: //li[@class="author"]/a/text() | //li[@class="author first"]/a/text()
+author: substring-after(//li[@class='source first '], 'Quelle: ')
+
+strip_id_or_class: articleheader
+strip: //div[@id="comments"] | //div[@class="pagination block"] | //p[@class="ressortbacklink"] | //div[@id="relatedArticles"] | // div[@class="inline portrait"]
+
+#Removes author and date from the start
+strip: //ul[@class="tools"]
+#Removes the copyright statement - it is often distracting as the first line of the news
+strip: //p[@class="copyright"]
+strip: //div[@class="copyright"]
+#Removes pagination links at the end
+strip: //div[@class="pagination"]
+
+# Fix picture captions
+wrap_in(small): //p[@class="caption"]/text()
+
+# Fix sub-headlines
+wrap_in(h2): //p/strong
+dissolve: //h2/strong
+
+#Sometimes things are embedded in the print version that are not displayed on the web, but will be displayed in the mobilized versions and can even lead to problems. These sections are removed here.
+strip_id_or_class:"informatives"
+strip_id_or_class:"bottom"
+strip_id_or_class:"teasermosaic"
+strip_id_or_class:"comments"
+strip_id_or_class:"articlefooter af"
+strip_id_or_class:"relateds"
+strip_id_or_class:"pagination"
+
+footnotes: no
+test_url: http://www.zeit.de/kultur/film/2012-12/Kurzfilmtag
--- /dev/null
+author: //span[@class='submitted']/a
+strip: //div[@class='clear-block clr']
+strip: //div[@class='picture']
+strip: //span[@class='submitted']
+strip: //div[@class='breadcrumb']
+strip: //div[@class='fivestar-static-form-item']
+strip: //div[@class='js-links']
+strip: //div[@class='links clear-block clear']
+strip: //div[@class='block block-block']
+test_url: http://www.zerohedge.com/news/bernankes-columbus-voyage-end-monetary-policy-world
\ No newline at end of file
-title: //h1\r
+title: //h1
body: //div[@id="primarycontent"]
test_url: http://zerokspot.com/weblog/2011/06/26/europython2011/
\ No newline at end of file
--- /dev/null
+# This filter is tested on:
+# http://www.zhihu.com/question/19587406
+# http://www.zhihu.com/question/20649035
+# http://www.zhihu.com/question/20637942
+
+author: //h3[@class='zm-item-answer-author-wrap']
+title://h2[@class='zm-item-title']
+date://a[@class='answer-date-link meta-item']
+convert_double_br_tags: yes
+
+wrap_in(blockquote)://div[@class='zm-editable-content']
+wrap_in(blockquote)://sup/text()
+dissolve://sup
+
+strip://div[@class='zh-answers-title']
+strip://div[@class='zm-item-vote-info ']
+strip://div[@class='zm-item-answer-author-info']
+strip://div[@class='zu-blue-info-board zg-r3px']
+test_url: http://www.zhihu.com/question/20637942
\ No newline at end of file
-title: substring-after(id, 'post')/h2\r
+title: substring-after(id, 'post')/h2
body://div[@class = 'entry']
test_url: http://www.zingtrain.com/category/ontrack/january-2007/
\ No newline at end of file
class Database {
var $handle;
- private $order = array(
- 'ia' => 'ORDER BY entries.id',
- 'id' => 'ORDER BY entries.id DESC',
- 'ta' => 'ORDER BY lower(entries.title)',
- 'td' => 'ORDER BY lower(entries.title) DESC',
- 'default' => 'ORDER BY entries.id'
+ private $order = array (
+ 'ia' => 'ORDER BY entries.id',
+ 'id' => 'ORDER BY entries.id DESC',
+ 'ta' => 'ORDER BY lower(entries.title)',
+ 'td' => 'ORDER BY lower(entries.title) DESC',
+ 'default' => 'ORDER BY entries.id'
);
function __construct()
public function login($username, $password, $isauthenticated = FALSE)
{
if ($isauthenticated) {
- $sql = "SELECT * FROM users WHERE username=?";
- $query = $this->executeQuery($sql, array($username));
+ $sql = "SELECT * FROM users WHERE username=?";
+ $query = $this->executeQuery($sql, array($username));
} else {
- $sql = "SELECT * FROM users WHERE username=? AND password=?";
- $query = $this->executeQuery($sql, array($username, $password));
+ $sql = "SELECT * FROM users WHERE username=? AND password=?";
+ $query = $this->executeQuery($sql, array($username, $password));
}
$login = $query->fetchAll();