--- /dev/null
+title: //div[@class='post_header']//h2/a\r
+author: //span[@class='author']\r
+date: //span[@class='date']\r
+body: //div[@id='Content']\r
+
+test_url: http://37signals.com/svn/posts/2785-the-end-of-the-it-department
\ No newline at end of file
--- /dev/null
+body: //div[@class='content']\r
+date: //div[@class='content']/h2\r
+strip: //div[@class='content']/h2\r
+title: //div[@class='content']/h3\r
+\r
+strip: //div[@id='postmenu']\r
+strip: //div[@class='trackback']\r
+tidy: no\r
+test_url: http://www.3quarksdaily.com/3quarksdaily/2012/01/martin-luther-king-i-have-a-dream.html
\ No newline at end of file
--- /dev/null
+body: //div[@id='main']
+title: //div[@class='intro']/h1
+author: //ul[@class='text-data']/li[@class='author']
+date: //ul[@class='text-data']/li[@class='date']
+convert_double_br_tags: yes
+tidy: no
+
+strip: //div[@class='share']
+strip: //*[@class='zoom']
+strip: //div[@id='disqus_thread']
+test_url: http://3voor12.vpro.nl/nieuws/2012/januari/Ook-website-GroenLinks-woensdag-op-zwart-i-v-m--SOPA.html
\ No newline at end of file
--- /dev/null
+body: //*[@class = 'content']\r
+author: //*[@class = 'submitted']/a\r
+date: substring-after(//*[@class = 'submitted']/text(), '|')
+test_url: http://www.43folders.com/2011/04/22/cranking
\ No newline at end of file
--- /dev/null
+# very loose setup for both 500px.com/photo/* and 500px.com/blog/*\r
+# photo page example: http://500px.com/photo/4181666\r
+# blog page example: http://500px.com/blog/110\r
+\r
+# avoid "no text" error\r
+tidy:no\r
+prune:no\r
+\r
+# reorganize photo page elements\r
+#body://div[contains(@class,'container')]\r
+move_into(body)://div[contains(@id,'thephoto')]\r
+move_into(body)://div[contains(@id,'description')]\r
+move_into(body)://div[contains(@id,'tags')]\r
+move_into(body)://div[contains(@id,'photo-info')]\r
+\r
+# clean photo page info\r
+strip://span[contains(@id,'copyright')]\r
+strip://*[contains(@id,'store')]\r
+strip://*[contains(@id,'user-info')]\r
+strip://*[contains(@id,'photo-stats')]\r
+strip://*[contains(@id,'voting_controls_container')]\r
+strip://*[contains(@id,'more-photos')]\r
+strip://*[contains(@id,'embed-photo')]\r
+\r
+# clean blog page side bar\r
+strip://*[contains(@class,'col d3 clearafter')]
+test_url: http://500px.com/photo/3641041?from=editors
\ No newline at end of file
--- /dev/null
+title: substring-before(//title, '—')
+test_url: http://512pixels.net/more-on-linked-lists/
\ No newline at end of file
--- /dev/null
+body: //*[@id="episode"]\r
+prune: no\r
+tidy: no\r
+\r
+autodetect_next_page: no\r
+strip_id_or_class: player\r
+\r
+strip://*[@id="header"]
+test_url: http://5by5.tv/buildanalyze/60
\ No newline at end of file
--- /dev/null
+title: //h2[@class='border']\r
+body: //div[@class='padding']\r
+\r
+convert_double_br_tags: yes\r
+\r
+strip: //div[@id='social_sharing']\r
+strip: //div[@class='socialLinks']\r
+
+test_url: http://www.944.com/articles/mild-obsessions-frock-la-get-to-know-victoria-tik-s-haute-sustainable-fashion-line/
\ No newline at end of file
--- /dev/null
+title: //meta[@property='og:title']/@content\r
+body: //*[@class='fliesstext_detail' or @class='detail_fliesstext'] | //img[@itemprop="image" and starts-with(@src, "/sixcms/media.php/")]\r
+\r
+strip_id_or_class: socialshareprivacy1\r
+strip_id_or_class: zvaFacebookButton\r
+\r
+tidy: no\r
+prune: no\r
+\r
+test_url: http://www.aachener-nachrichten.de/lokales/aachen-detail-an/2517757
\ No newline at end of file
--- /dev/null
+title: //meta[@property='og:title']/@content\r
+body: //*[@class='fliesstext_detail' or @class='detail_fliesstext'] | //img[@itemprop="image" and starts-with(@src, "/sixcms/media.php/")]\r
+\r
+strip_id_or_class: socialshareprivacy1\r
+strip_id_or_class: zvaFacebookButton\r
+\r
+tidy: no\r
+prune: no\r
+\r
+test_url: http://www.aachener-zeitung.de/sixcms/detail.php?template=az_detail&id=2552718
\ No newline at end of file
--- /dev/null
+title: //meta[@property='og:title']/@content\r
+body: //div[@class='datosi' or @class='date' or @class='photo-alt1' or @class='text']\r
+strip_id_or_class: colB\r
+\r
+prune: no\r
+
+test_url: http://www.abc.es/20120209/tv-series/abci-house-ultima-temporada-201202090936.html
\ No newline at end of file
--- /dev/null
+title: //h1\r
+author: //div[@class="byline"]/a\r
+date: //span[@class="timestamp"]\r
+\r
+strip: //p[@class="topics"]\r
+strip: //h1\r
+strip: //div[@class="byline"]\r
+strip: //p[@class="published"]\r
+strip: //div[contains(@class,"featured-scroller")]
+test_url: http://www.abc.net.au/news/2011-11-08/crabb-carbon-legislation-abbott-demolition/3652544
\ No newline at end of file
--- /dev/null
+title: //h1[@class='headline']\r
+body: //div[@id='storyText']\r
+# for video entries\r
+body: //img[@id='ff-img'] | //div[@id='meta']//div[contains(@class, 'overview')]\r
+author: //div[@class='byline']\r
+date: //div[@class='date']\r
+strip: //*[@id='date_partner']\r
+\r
+strip: //div[@class='breadcrumb']\r
+strip: //div[contains(@class,'show_tools')]\r
+strip: //div[@id='sponsoredByAd']\r
+strip: //div[contains(@class,'rel_container')]\r
+strip: //p[a[starts-with(@href, 'http://www.twitter.com')]]\r
+strip: //p[a[starts-with(@href, 'http://www.facebook.com')]]\r
+strip: //p[contains(., 'Click here to return to')]\r
+#strip_id_or_class: media\r
+strip_id_or_class: mediaplayer\r
+\r
+# Rewrites the page's <link rel="image_src" href="http..."> tag into an <img id="ff-img" src="http...">\r
+# so that the video-entry body rule (//img[@id='ff-img']) has an element to select.\r
+replace_string(<link rel="image_src" href="http): <img id="ff-img" src="http\r
+\r
+prune: no\r
+\r
+single_page_link: concat(//li[@class='pager']//a/@href, '&singlePage=true')\r
+\r
+test_url: http://abcnews.go.com/Politics/newt-gingrich-rocky-rollout-presidential-campaign-recover/story?id=13632744\r
+# multi-page\r
+test_url: http://abcnews.go.com/Blotter/family-freed-american-hostage-somalia-seals-obama/story?id=15439544
\ No newline at end of file
--- /dev/null
+title: //div[@id='H_docTitle']\r
+\r
+body: //div[@id='H_meta' or @id='H_content' or @id='F_footer']\r
+\r
+strip_id_or_class: F_toenail\r
+\r
+prune: no\r
+\r
+test_url: http://www.accesstoinsight.org/lib/authors/nyanaponika/wheel026.html
\ No newline at end of file
--- /dev/null
+body: //div[starts-with(@id, 'news-id-')]\r
+\r
+test_url: http://acidcow.com/fun/20933-acid-picdump-83-pics.html
\ No newline at end of file
--- /dev/null
+title://h1[@class="title"]\r
+author://div[@class="submitted"]/span/a\r
+date://div[@class="submitted"]/span\r
+body://div[@class="content-wrapper"]\r
+\r
+strip://div[@id="skip-link"]\r
+strip://div[@id="region-content-3-3"]\r
+strip://div[@id="section-footer"]
+test_url: https://www.acquia.com/blog/drupals-long-warmth-toward-third-party-code
\ No newline at end of file
--- /dev/null
+tidy:no\r
+date: //time[@class='updated']\r
+dissolve: //ul[@class='video-gallery']/li\r
+dissolve: //ul[@class='video-gallery']
+test_url: http://www.acroswing.fr/actualites/competition_rock/selectif_bellegarde_sur_valserine__2012-02-26.php
\ No newline at end of file
--- /dev/null
+body: //div[@id='content']\r
+\r
+# clean up recipe pages\r
+strip: //h2[@class='fn'] | //h2[@class='double-lined'] | //h3 | //div[@id='threeColumn2'] | //div[@id='threeColumn3']\r
+\r
+#recipe pages\r
+strip_id_or_class: "recipe-feedback"\r
+strip_id_or_class: "comments"\r
+strip_id_or_class: "procedure-number"\r
+strip_id_or_class: "more-with-author"\r
+\r
+#slice\r
+strip_id_or_class: "inner"\r
+
+test_url: http://aht.seriouseats.com/archives/2009/12/the-burger-lab-salting-ground-beef.html
\ No newline at end of file
--- /dev/null
+body: //div[@class="entry"]
+test_url: http://alex.mullr.net/blog/2011/05/on-spotify/
\ No newline at end of file
--- /dev/null
+title: //h1[@class='title']\r
+author: //h3[@class='byline']/a\r
+date: //div[@class='ishinfo']\r
+\r
+body: //*[@id='articletext']\r
+strip_id_or_class: 'ishinfo'\r
+strip_id_or_class: 'metastuff'\r
+strip_id_or_class: 'learnmore'\r
+strip_id_or_class: 'discuss'\r
+\r
+prune: no
+test_url: http://www.alistapart.com/articles/organizing-mobile/
\ No newline at end of file
--- /dev/null
+title: //span[@id='DetailedTitle']\r
+body: //td[@id='tdTextContent']\r
+strip_id_or_class: Skyscrapper_Body\r
+date: //span[@id='ctl00_cphBody_lblDate']\r
+author: //div[@id="dvAuthorInfo"]//a/text()\r
+strip: //table[ tbody/tr/td/object ]\r
+prune: no\r
+test_url: http://www.aljazeera.com/indepth/opinion/2012/01/2012114121925380575.html
\ No newline at end of file
--- /dev/null
+title: //h1[@id='itemTitle']\r
+body: //img[@id="ctl00_CenterColumnPlaceHolder_recipe_photoStuff_imgPhoto"] | //div[@id='ctl00_CenterColumnPlaceHolder_recipe_divSubmitter'] | //div[contains(@class, 'recipe-details-content')]\r
+strip: //div[@class='top-left' or @class='top-right' or @class='bot-left' or @class='bot-right']\r
+strip: //div[contains(@class, 'rightcoltoolsdiv')]\r
+strip: //div[contains(@class, 'servings-form')]\r
+strip: //p[@class='nutritional-information']\r
+strip: //a[contains(@class, 'nutritional-information') or contains(@class, 'nutritionanchor')]\r
+strip: //div[@id='nutri-info']/div[contains(@class, 'title')]\r
+strip: //img[@id='ctl00_CenterColumnPlaceHolder_recipe_imgSubmitter']\r
+strip_id_or_class: eshaAttribute\r
+strip_id_or_class: eshaParagraph\r
+prune: no\r
+
+test_url: http://allrecipes.com/Recipe/Taco-Pie/Detail.aspx?src=rotd
\ No newline at end of file
--- /dev/null
+title://div[@class="article-title"]/h1[@class="title"]\r
+date: //p[@class="article-date"]\r
+body://*[@class="article-body article-text"]\r
+# Trim out related posts at bottom of article\r
+strip://blockquote[@class="memo"]\r
+\r
+# Yup, no idea why author won't work...\r
+author://div[@class="page-header article-header clearfix"]/p[@class="title"]\r
+# [Marco:] Author won't work here because the page defines the "home" link under the author's name as rel="author", which always gets priority if the page has defined it.
+test_url: http://allthingsd.com/20120513/exclusive-yahoos-thompson-out-levinsohn-in-board-settlement-with-loeb-nears-completion/
\ No newline at end of file
--- /dev/null
+title: //div[@id='pageHdr']//h1\r
+body: //div[@id='pageHdr']/*[@class='dek'] | //div[@id='printArticle' or @id='slideShowPrint']\r
+strip: //div[contains(@class, 'infoBox') or @id='infoBox']\r
+single_page_link: //li[@id='print']/a\r
+\r
+prune: no
+\r
+test_url: http://www.allyou.com/budget-home/money-shopping/freebies-online-00400000066392/
\ No newline at end of file
--- /dev/null
+body: //div[@class = 'entry']\r
+date: substring-after(//p[@class="date"],'بتاريخ ')\r
+strip_id_or_class: date\r
+strip_id_or_class: follow-single\r
+strip_id_or_class: ratingblock\r
+strip_id_or_class: newRatingHolder\r
+strip_id_or_class: postmetadata\r
+strip_id_or_class: addthis_toolbox\r
+strip_id_or_class: addthis_default_style\r
+strip_id_or_class: size-full
+test_url: http://alphabeta.argaam.com/?p=35657
\ No newline at end of file
--- /dev/null
+body: //div[@id = "article-view"]\r
+body: //div[contains(@class, 'article')]//div[contains(@class, 'photo_bg')]\r
+author: //p[@class = "author"]\r
+strip: //h1\r
+strip: //h2\r
+strip_id_or_class: author\r
+prune: no\r
+test_url: http://www.alriyadh.com/2011/10/10/article674357.html\r
+test_url: http://www.alriyadh.com/net/article/780935
\ No newline at end of file
--- /dev/null
+title: //*[@id='normalfontyellow']
+test_url: http://www.alseraj.net/cgi-bin/pros/av/LeqaTextDisplay.cgi?display&2
\ No newline at end of file
--- /dev/null
+body: //*[(@class = "historia")]
+test_url: http://alt1040.com/2011/09/banda-ancha-en-america-latina-insignificante
\ No newline at end of file
--- /dev/null
+body: //*[(@class = "historia")]
+test_url: http://altfoto.com/2011/09/nikon-presenta-su-nuevo-sistema-nikon-1-y-dos-nuevas-camaras
\ No newline at end of file
--- /dev/null
+title: //h1\r
+\r
+author: substring-after(//div[@class="enableBullets"]/preceding-sibling::p[1], "By ")\r
+\r
+date: //div/a[contains (@href, "issue")]\r
+\r
+move_into(//div[@class="enableBullets"]/p): (//div[@id="content"]//img)[1]\r
+\r
+body: //div[@class="enableBullets"]
+test_url: http://alumni.stanford.edu/get/page/magazine/article/?article_id=54819
\ No newline at end of file
--- /dev/null
+title: //span[@id = 'btAsinTitle']\r
+body: (//*[@id='prodImageCell']//a)[1] | //div[@id = 'ps-content'] | //span[@id='actualPriceValue'] | //h2[.='Product Details']/following-sibling::div | //div[@class='h2' and .='Product Description']/following-sibling::div\r
+#strip_id_or_class: quantityDropdownDiv\r
+#strip_id_or_class: addToCartSpan\r
+#strip_id_or_class: oneClickDiv\r
+strip_id_or_class: nocontent\r
+strip_id_or_class: masDynamicConten\r
+strip_id_or_class: dynamic-content\r
+prune: no\r
+\r
+find_string: <span id="actualPriceValue">\r
+replace_string: <span id="actualPriceValue"><br />Price: \r
+\r
+strip_id_or_class: collapsePS\r
+strip_id_or_class: expandPS\r
+strip_id_or_class: psPlaceHolde\r
+strip: //li[contains(., 'update product info') or contains(., 'give feedback on images')]\r
+\r
+test_url: http://www.amazon.com/Common-Sense-Forestry-Living-Mother/dp/1931498210/
\ No newline at end of file
--- /dev/null
+title: //div[@class='head']/h2/a\r
+author: //div[@class='head']/a\r
+date: //div[@class='head']/p[@class='date']/a\r
+body: //div[@class='copy']\r
+strip: //p[@class='meta']
+test_url: http://americandrink.net/post/10567188712/free-the-hooch
\ No newline at end of file
--- /dev/null
+title: //div[@class="editorial-content"]/h3\r
+body: //div[@class="hero-image" or @class="editorial-content"]\r
+\r
+strip: //ul[@class="hero-caption"]\r
+strip_id_or_class: footer\r
+\r
+prune: no\r
+tidy: no\r
+\r
+test_url: http://www.americascup.com/en/Latest/News/2012/3/Coutts-and-Peyron-tell-transformative-tale-at-Global-Sports-Forum/
\ No newline at end of file
--- /dev/null
+title: //h1[@class="post-title"]\r
+author: //span[@class="author"]/a\r
+date: //span[@class="date"]\r
+body: //div[@class="post-content main"]
+test_url: http://www.americastestkitchenfeed.com/gadgets-and-gear/2012/07/chill-out-with-tovolos-king-cube-silicone-ice-cube-tray/
\ No newline at end of file
--- /dev/null
+author: //a[@class='b'][1]\r
+date: substring-after(substring-before(//div, 'Posted in'), ' on ')\r
+strip_image_src: /content/images/globals/\r
+strip: //h2[. = 'Page 1']/preceding::p\r
+strip: //h2\r
+\r
+prune: no\r
+\r
+single_page_link: concat('http://www.anandtech.com/print/', substring-after(//meta[@property='og:url']/@content, '/show/'))\r
+\r
+test_url: http://www.anandtech.com/show/5812/eurocom-monster-10-clevos-little-monster/
\ No newline at end of file
--- /dev/null
+title: //h2\r
+author: string('Andy Rutledge')\r
+date: //div[@class='articledate']\r
+body: //div[@class='copybody']\r
+\r
+strip: //*[@class='space']\r
+strip: //*[@class='articleFoot']\r
+\r
+test_url: http://www.andyrutledge.com/hungry-for-a-better-menu.php
\ No newline at end of file
--- /dev/null
+title: //h1[@class="title"]\r
+\r
+author: ("Anna Manasova")\r
+# is ignored, unfortunately\r
+\r
+date: //p[@class="date"]\r
+\r
+body: //div[@class="entry"]
+test_url: http://annatravelling.wordpress.com/2011/11/07/a-day-of-cooking-thai/
\ No newline at end of file
--- /dev/null
+# Title is the <h1> whose class contains 'title'; its inner span.type label is removed by a strip rule below.\r
+title: //h1[contains(@class, 'title')]\r
+body: //div[@id='mainContent']//div[contains(@class, 'section_content')] | //ul[@class='section_footer']\r
+date: //div[@class='date']\r
+\r
+strip_id_or_class: sharethis\r
+strip_id_or_class: stats\r
+strip_id_or_class: apply_form\r
+strip_id_or_class: job_map\r
+strip_id_or_class: respond\r
+strip: //h1//span[@class='type']\r
+strip: //li[@class='print' or @class='map']\r
+\r
+# Renames the inline "display" style property on the section_footer <ul> to "display-bla";\r
+# presumably this neutralises an inline display rule that hides the footer - TODO confirm.\r
+replace_string(<ul class="section_footer" style="display): <ul class="section_footer" style="display-bla\r
+\r
+prune: no\r
+tidy: no\r
+\r
+test_url: http://applature.com/mining-jobs/jobs/nickel-west-leinster-analytical-laboratory-technician/
\ No newline at end of file
--- /dev/null
+strip: //p[@class='sosumi']\r
+# Aren't they witty?\r
+\r
+# I can't work out what causes the stray character(s) before the title. \r
+title: //h1[@class='title']\r
+strip: //h1[@class='title']\r
+test_url: http://www.apple.com/pr/library/2011/02/15appstore.html
\ No newline at end of file
--- /dev/null
+title: //p[@class='title']\r
+\r
+author: //p[text() = 'By ']/a/text()\r
+strip: //p[text() = 'By ']\r
+\r
+body: //td[@class='bod']\r
+strip_id_or_class: title\r
+strip_id_or_class: minor\r
+\r
+strip_id_or_class: multipagefooter\r
+test_url: http://www.appleinsider.com/articles/12/02/29/inside_os_x_108_mountain_lion_safari_52_gets_a_simplified_user_interface_with_new_sharing_features.html
\ No newline at end of file
--- /dev/null
+body: //*[(@class = "historia")]
+test_url: http://appleweblog.com/2011/09/encontrada-vulnerabilidad-grave-en-skype-para-ios
\ No newline at end of file
--- /dev/null
+date: //div[@class='post_date']\r
+\r
+body: //div[@class='post_content']\r
+
+test_url: http://www.archdaily.com/185325/p10-mixed-use-building-studio-up
\ No newline at end of file
--- /dev/null
+# Description: Fix XPaths to include ALL chapters on 'view_full_work' pages.\r
+# Include: work meta, summary, chapter information, and notes which Instapaper strips out on default.\r
+# Exclude: header, footer, navigation, comments.\r
+# Notes: User is a newbie with XPaths.\r
+\r
+title: //h2[@class='title']\r
+author: //h3[@class='byline']\r
+author: //a[@class='login author']\r
+\r
+strip_id_or_class:header\r
+strip_id_or_class:navigation\r
+strip_id_or_class:feedback\r
+strip_id_or_class:kudos\r
+strip_id_or_class:add_comment_placeholder\r
+strip_id_or_class:add_comment\r
+strip_id_or_class:globalize\r
+strip_id_or_class:footer
+test_url: http://archiveofourown.org/works/229402?view_full_work=true
\ No newline at end of file
--- /dev/null
+author: //p[@class='byline']/a\r
+body: //div[contains(@class,'article-content')]\r
+strip: //h2[@class='title']\r
+strip_id_or_class: byline\r
+prune: no\r
+\r
+date: //div[@class='byline']/span[@class='posted']//abbr/@original-title\r
+date: //div[@class='byline']/span[@class='posted']//abbr\r
+\r
+title: //div[@id='story']//h2[@class='title']\r
+\r
+strip: //div[@class='pager']\r
+next_page_link: //nav//a[span/@class='next']/@href\r
+\r
+test_url: http://arstechnica.com/tech-policy/news/2012/02/gigabit-internet-for-80-the-unlikely-success-of-californias-sonicnet.ars\r
+test_url: http://arstechnica.com/apple/2005/04/macosx-10-4/
\ No newline at end of file
--- /dev/null
+title: //div[@class="mod-bostonarticleheader mod-articleheader"]/h1\r
+author: substring-after(//div[@class="mod-bostonarticlebyline mod-articlebyline"]/span[3],"By ")\r
+date: //div[@class="mod-bostonarticlebyline mod-articlebyline"]/span[@class="pubdate"]\r
+\r
+strip_id_or_class: mod-pagination
+test_url: http://articles.boston.com/2011-10-23/news/30313691_1_bigfoot-free-speech-monadnock-state-park
\ No newline at end of file
--- /dev/null
+title: //div[@class="mod-courantarticleheader mod-articleheader"]/h1\r
+date: //div[@class="mod-courantarticlebyline mod-articlebyline"]/span[@class="pubdate"]\r
+author: //div[@class="mod-courantarticlebyline mod-articlebyline"]/span[3]\r
+\r
+strip_id_or_class: mod-article-byline\r
+strip_id_or_class: mod-article-header\r
+strip_id_or_class: mod-article-subtitle\r
+#This leaves some crud after the article, but it's better than nothing.\r
+#It would be ideal if we could set the body to every element matching //div[contains(@class, "mod-articletext")]/p, but it seems like body only takes the first matching element.\r
+
+test_url: http://articles.courant.com/2011-10-22/news/hc-green-drugsearch--1022-20111022_1_drugs-in-student-lockers-police-dogs-lockdown
\ No newline at end of file
--- /dev/null
+body: //div[@id='HeadLine']\r
+strip: //div[@id='utility_right']
+test_url: http://www.asahi.com/culture/update/0520/TKY201105200321.html
\ No newline at end of file
--- /dev/null
+title: //h1[@class='article_title']\r
+author: //span[@class='author']\r
+date: //h2[@class='dateline']\r
+body: //div[@class='article_body']
+test_url: http://ascarter.net/2012/02/20/enough-is-enough.html
\ No newline at end of file
--- /dev/null
+title: //span[@class='titel']\r
+author: //span[@class='metadaten_C']/a//span[@class='metadaten_C']\r
+date: substring-after(//span[@class='metadaten_C'],'astronews.com')\r
+strip: //span[@class='bu']\r
+strip_image_src: '/_images/'\r
+
+test_url: http://www.astronews.com/news/artikel/2011/10/1110-021.shtml
\ No newline at end of file
--- /dev/null
+# Johannes Stühler\r
+\r
+title://h2\r
+author://span[@class='meta-content']\r
+date://abbr[@class='date published']/@title\r
+body://div[@class='entry-content']\r
+
+test_url: http://www.asymco.com/2011/01/14/is-android-more-efficient-than-ios-at-generating-search-revenue/
\ No newline at end of file
--- /dev/null
+prune: no\r
+body: //div[@class='post-body']\r
+author: //p[@class='byline']//a\r
+date: substring-after(//div[@class='about']/p[2], 'Posted')\r
+strip: //div[@class='body']/div[@class='meta']
+test_url: http://www.autoblog.com/2012/01/17/next-gen-bmw-x5-caught-again/
\ No newline at end of file
--- /dev/null
+author: //*[@id="article_wrapper"]/div[1]/a[1]\r
+body: //*[@id="article_wrapper"]/div[2]\r
+date: //*[@id="article_wrapper"]/div[1]/text()[2]
+test_url: http://www.avclub.com/articles/forgetmenot,70904
\ No newline at end of file
--- /dev/null
+single_page_link: //div[@class='toppaginate']//a[@rel='nofollow']\r
+convert_double_br_tags: yes\r
+\r
+title: //div[@class="story"]/h1\r
+body: //div[@id="story-body-text"]\r
+author: //span[@class="byline"]\r
+date: //p[@class="date"]\r
+\r
+strip: //*[@class='all']\r
+strip: //*[@class='articlerail']\r
+
+test_url: http://www.baltimoresun.com/news/maryland/bs-md-omalley-budget-2-20120116,0,5340585.story
\ No newline at end of file
--- /dev/null
+title: //h2\r
+date: //span[@class='date']\r
+body: //div[@class='entry']\r
+\r
+strip: //div[@class='zusatz']\r
+
+test_url: http://www.basicthinking.de/blog/2011/12/13/sagt-social-networks-adieu-begrust-private-networks/
\ No newline at end of file
--- /dev/null
+# XPath substring() positions are 1-based, so a start of 0 with length (len - 20)\r
+# returns the first (len - 21) characters of the 'dates' span.\r
+author: substring(//h3[@class='headlines']/span[@class='dates'],0,string-length(//h3[@class='headlines']/span[@class='dates'])-20)\r
+\r
+\r
+# Takes 12 characters of the same span, starting at position (len - 18).\r
+date: substring((//h3[@class='headlines']/span[@class='dates']),string-length(//h3[@class='headlines']/span[@class='dates'])-18,12)\r
+\r
+\r
+body: //div[@class='first-article-big']\r
+strip: //table[@class='newsimagecontainer']\r
+strip: //h3[@class='headlines']\r
+strip: //iframe[@class='headlines']\r
+strip: //a[@class='newslink']\r
+convert_double_br_tags: yes
+test_url: http://bb.is/Pages/82?NewsID=174119
\ No newline at end of file
--- /dev/null
+body: //div[@class="story-body"]\r
+title: //h1[@class="story-header"]\r
+date: //span[@class="story-date"]/span[@class='date']\r
+\r
+# recipes, e.g. http://www.bbc.co.uk/food/recipes/mymincepies_71055\r
+body: //div[contains(@class, 'hrecipe')]//div[@id='subcolumn-1']\r
+\r
+#strip: //div[@class="story-feature narrow"]\r
+#strip: //div[@class="story-feature wide"]\r
+#strip: //div[@class="story-feature dslideshow-enclosure"]\r
+strip: //div[contains(@class, "story-feature")]\r
+strip: //span[@class="story-date"]\r
+#strip: //div[@class="caption body-narrow-width"]\r
+strip: //div[@class="warning"]//p\r
+strip: //div[@id='page-bookmark-links-head']\r
+strip: //object\r
+strip: //div[contains(@class, "bbccom_advert_placeholder")]\r
+strip: //div[contains(@class, "embedded-hyper")]\r
+strip: //div[contains(@class, 'market-data')]\r
+strip: //a[contains(@class, 'hidden')]\r
+strip: //div[contains(@class, 'hypertabs')]\r
+strip: //div[contains(@class, 'related')]\r
+strip: //form[@id='comment-form']\r
+strip: //div[contains(@class, 'comment-introduction')]\r
+\r
+replace_string(<noscript>): <div>\r
+replace_string(</noscript>): </div>\r
+\r
+prune: no\r
+\r
+dissolve: //h2\r
+test_url: http://www.bbc.co.uk/news/business-15060862
\ No newline at end of file
--- /dev/null
+body: //div[@class="entry-content"]\r
+\r
+# Remove text ‘Tweet’\r
+strip: //div[@class="entry-content"]/div[last()]\r
+\r
+# Uses the descendant axis (//) so the entry-title heading is matched anywhere in the document.\r
+title: //h1[@class="entry-title"]\r
+\r
+# If the Instapaper text parser worked with HTML5 tags, we would use:\r
+date: //time[@class="entry-date"]\r
+\r
+# But since it does not, use this more complicated rule:\r
+date: //div[@class="entry-meta"]/a[@rel="bookmark"]\r
+\r
+# Unfortunately, the following rule is overridden by the automatically found author.\r
+author: ("Benoit Maison")
+test_url: http://www.benoitmaison.org/2011/12/06/why-siri-had-to-start-in-beta/
\ No newline at end of file
--- /dev/null
+title: //h1[@class='headline']\r
+body: //div[contains(@class, 'article-wrapper')]
+test_url: http://www.berlingske.dk/danmark/festen-er-flyttet-nordpaa
\ No newline at end of file
--- /dev/null
+body: //div[@class="entry-content"]
+test_url: http://www.betabeat.com/2011/07/04/sheryl-sandberg-breaks-through-silicon-valleys-boys-club-sort-of/
\ No newline at end of file
--- /dev/null
+# some articles at this site like this one don't\r
+# seem to pick up the article body via normal \r
+# processing, other articles come through fine\r
+# http://www.betanews.com/joewilcox/article\r
+# /Google-is-a-marketing-sensation/1309708375\r
+body: //*[@id="article"]
+test_url: http://www.betanews.com/joewilcox/article/Google-is-a-marketing-sensation/1309708375
\ No newline at end of file
--- /dev/null
+title: //div[contains(@class, 'main-content')]//h1\r
+body: //div[@class='summary-column'] | //div[contains(@class, 'main-content')]\r
+\r
+prune: no\r
+\r
+single_page_link: //div[@id='biography-action-links']//a[contains(@href, '/print/')]\r
+
+test_url: http://www.biography.com/print/profile/martin-luther-9389283
\ No newline at end of file
--- /dev/null
+body: //*[(@class = "historia")]
+test_url: http://bitelia.com/2011/09/klout-midiendo-influencia
\ No newline at end of file
--- /dev/null
+title: //h1[@class='articlehead']\r
+body: //div[@class='column']\r
+strip: //h1\r
+strip: //div[@class='help']\r
+\r
+#no author or date/time provided in current layout
+test_url: http://bjango.com/articles/actions/
\ No newline at end of file
--- /dev/null
+tidy: no\r
+prune: no\r
+date: //article/header/h6/time\r
+title: //article/header/h3\r
+author: //meta[@name='author']/@content\r
+body: //article//post\r
+
+test_url: http://blog.arsln.org/aska-ayip-oluyor/
\ No newline at end of file
--- /dev/null
+title: //title\r
+author: //span[@class='author vcard']/a\r
+date: //p[@class='headline_meta']/abbr[@class='published']\r
+body: //div[@class='format_text entry-content']\r
+\r
+strip: //div[@id='dd_ajax_float']
+test_url: http://blog.asmartbear.com/how-to-get-quality-freelance-graphics-design-work-on-a-budget.html
\ No newline at end of file
--- /dev/null
+# Instapaper gets this back to front and only gets the blog title instead of the article title.\r
+title: substring-before(//title, '-')\r
+\r
+author: //a[ contains(@href, '/people') ]\r
+\r
+body: //div[ @class='post' ]\r
+\r
+# Date is impossible to retrieve since they use those stupid "fuzzy" dates, inserted through javascript, at posterous.
+test_url: http://blog.cloudflare.com/understanding-analytics-when-is-a-page-view-n
\ No newline at end of file
--- /dev/null
+title: //h2\r
+date: //h3\r
+body: //ul\r
+
+test_url: http://blog.fefe.de/?ts=b063bf55
\ No newline at end of file
--- /dev/null
+# clean Instagram blog a little bit\r
+\r
+tidy:no\r
+prune:no\r
+\r
+body://div[contains(@id,'content')]\r
+\r
+strip_id_or_class:meta\r
+strip_id_or_class:notes\r
+strip_id_or_class:pagination
+test_url: http://blog.instagram.com/post/8757832007/fromwhereistand
\ No newline at end of file
--- /dev/null
+date: //span[contains(@class, 'date-links')]\r
+author: //span[contains(@class, 'author-links')]\r
+body: //div[contains(@class, 'entry-content')]
+test_url: http://blog.jaysalvat.com/article/celui-qui-avait-refait-son-site-web
\ No newline at end of file
--- /dev/null
+body: //*[contains(@class, 'post_content')]\r
+author: string('Kaelig Deloumeau-Prigent')\r
+title: //h1[@class='title']\r
+date: //span[@class='date']
+test_url: http://blog.kaelig.fr/post/24877648508/preprocesseurs-css-renoncer-par-choix-ou-par
\ No newline at end of file
--- /dev/null
+title: //span[@class='pcol1 itemSubjectBoldfont']\r
+body: //div[@id='postListBody']\r
+date: //p[@class='date fil5 pcol2']\r
+single_page_link: /html/frameset/frame[1]/attribute::src\r
+strip: //div[@class='post-btn']
+test_url: http://blog.naver.com/how2invest/110135068757
\ No newline at end of file
--- /dev/null
+# PCHOME blog, a popular Chinese blog host\r
+# Oct 15, 2011\r
+# \r
+\r
+title://*[contains(@class,'imp')]/h2\r
+\r
+date://*[contains(@class,'imp')]/span\r
+body://div[contains(@id,'blog_content')]\r
+\r
+\r
+
+test_url: http://blog.pchome.net/article/462502.html
\ No newline at end of file
--- /dev/null
+title: //a[@class="blog_title"]\r
+date: //p[@class="when"]/a\r
+body: //div[@class="blog_entry"]\r
+strip_id_or_class:blog_title\r
+strip_id_or_class:when
+test_url: http://blog.pinboard.in/2011/11/the_social_graph_is_neither/
\ No newline at end of file
--- /dev/null
+# Sina blog, the most popular blog host in China.\r
+# Its source code is horrible.\r
+# \r
+# Issue:\r
+# Only the first image in the article is displayed.\r
+# The remaining images are replaced with a 1x1 transparent gif by the sina blog host.\r
+# \r
+\r
+title://*[contains(@class,'titName SG_txta')]\r
+author://*[contains(@id,'ownernick')]\r
+date://*[contains(@class,'time SG_txtc')]\r
+body://div[contains(@class,'articalContent')]\r
+\r
+# Remove redundant content which has span class start with "MASS"\r
+# Example <span class="MASSf21674ffeef7"></span>\r
+strip://span[contains(@class,'MASS')]\r
+\r
+# Remove comment\r
+strip://div[contains(@class,'allComm')]\r
+\r
+# Remove hidden text and links\r
+strip://ins\r
+\r
+tidy:no\r
+convert_double_br_tags:yes\r
+test_url: http://blog.sina.com.cn/s/blog_5054769e0102dtja.html
\ No newline at end of file
--- /dev/null
+body://div[@class='post']
+test_url: http://blog.spu.edu/lectio/from-the-frying-pan-into-the-fire/
\ No newline at end of file
--- /dev/null
+title: //h2/a[@class="no-link title"]\r
+author: //h2[@id="blog_owner"]\r
+date: //time\r
+strip: //h2/a[@class="no-link title"]\r
+test_url: http://blog.wells.ee/retina\r
+test_url: http://blog.wells.ee/skeuomorphism
\ No newline at end of file
--- /dev/null
+# 2011-08-23 [carlo@...] Initial version.\r
+\r
+author: //div[@id="blogauthordatebox-node"]//a[@title="View user profile."]/text()\r
+\r
+# why yes, I do feel a bit dirty\r
+date: substring-before( substring-after( substring-after( //div[@id="blogauthordatebox-node"]//td[3], "on " ), ", "), " " )\r
+
+test_url: http://blogs.aljazeera.net/asia/2011/08/22/peoples-hero
\ No newline at end of file
--- /dev/null
+body: //div[@class='entry']
+test_url: http://blogs.forbes.com/adamhartung/2011/04/08/apple-is-better-managed-than-microsoft/
\ No newline at end of file
--- /dev/null
+title: //div[@id='pageFeature']/h1\r
+body: //div[@id='articleBody']\r
+strip: //div[@class='module wide']\r
+test_url: http://blogs.hbr.org/bregman/2011/04/the-1-killer-of-meetings-and-w.html?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+harvardbusiness+%28HBR.org%29
\ No newline at end of file
--- /dev/null
+title: //h3[@class="post-name"]\r
+author: //span[@class="user-name"]\r
+date: //div[@class="post-date"]\r
+body: //div[@class="post-content user-defined-markup"]\r
+footnotes: no
+test_url: http://blogs.msdn.com/b/b8/archive/2011/10/04/designing-the-start-screen.aspx
\ No newline at end of file
--- /dev/null
+title: //div[@id='single']/h1\r
+body: //div[@id='postcontent']
+test_url: http://blogs.reuters.com/felix-salmon/2010/07/16/the-value-of-a-strong-brand-apple-edition/
\ No newline at end of file
--- /dev/null
+# meta data\r
+title://h1[@class = 'postTitle']\r
+author:substring-before(substring-after(//span[@class = 'byline'],'By '),'|')\r
+date://span[@class = 'datestamp']\r
+\r
+#body content\r
+body://div[@id = 'singleBlogPost']\r
+\r
+#reclaim author info\r
+move_into(//div[@id = 'singleBlogPost'])://div[@id = 'aboutAuthorDiv']\r
+strip://p[@class = 'moreLink mobileHide']\r
+\r
+#cleanup comments, there might be some open <div> sections\r
+strip://div[@id = 'comments2']\r
+strip://h3[a[@href = '#add-comment']]
+test_url: http://blogs.scientificamerican.com/a-blog-around-the-clock/2012/07/10/science-blogs-definition-and-a-history/
\ No newline at end of file
--- /dev/null
+# metadata\r
+author://div[@class = 'post']/div[@class='meta']/a[1]\r
+date://div[@id = 'rap']/h2[1]\r
+body://div[@class = 'post']\r
+\r
+# wrapping caption and image\r
+wrap_in(fieldset)://div[contains(@class, 'wp-caption')]\r
+\r
+\r
+# clean up\r
+strip://div[@class = 'post']/h3[@class = 'storytitle']\r
+strip://div[@class = 'post']/div[@class = 'social']\r
+strip://img[@style = 'display:none;']\r
+strip://img[@height='0' and @width='0']
+test_url: http://blogs.smithsonianmag.com/adventure/2011/10/tips-for-women-traveling-in-turkey/
\ No newline at end of file
--- /dev/null
+title: //h3[@class="post-name"]\r
+author: //span[@class="user-name"]\r
+date: //div[@class="post-date"]\r
+body: //div[@class="post-content user-defined-markup"]\r
+footnotes: no
+test_url: http://blogs.technet.com/b/dlemson/archive/2004/03/03/83304.aspx
\ No newline at end of file
--- /dev/null
+body://div[@class='entry']\r
+date://div[@class='meta']\r
+strip://a[@class='FlattrButton']
+test_url: http://bluetouff.com/2012/03/02/polemique-google-vie-privee/
\ No newline at end of file
--- /dev/null
+title: //h1[@class="entry-title"][2]\r
+author: string("Paul Boag")\r
+date: substring(//span[@class="meta"], 11)\r
+body: //article\r
+strip: //h2\r
+strip: //h1\r
+strip: //div[@id="callsToAction"]
+test_url: http://boagworld.com/working-in-web-design/dealing-with-the-dickheads/
\ No newline at end of file
--- /dev/null
+# This is far from perfect, but so is BoingBoing's markup\r
+title: //h2[@class="headline"]\r
+single_page_link: //h2[@class="headline"]/a\r
+#date: //p[@class="byline"]\r
+body: //div[@class="post"]\r
+\r
+strip_id_or_class: shareMe\r
+strip_id_or_class: authorbox\r
+strip_id_or_class: byline\r
+
+test_url: http://boingboing.net/2011/10/23/understanding-the-hyperrich-through-the-lens-of-tomorrows-history.html
\ No newline at end of file
--- /dev/null
+title: //h2[@class='entry-title']\r
+body: //div[@class='entry-content']
+test_url: http://boldizsar.palotas.eu/blog/?p=1394
\ No newline at end of file
--- /dev/null
+body: //span[@property='v:description']\r
+date: //span[@property='v:dtreviewed']\r
+author: //span[@property='v:reviewer']\r
+prune: no\r
+
+test_url: http://book.douban.com/review/2422662/
\ No newline at end of file
--- /dev/null
+#metadata\r
+title://div[@class = 'Topper']/h1\r
+author://div[@class = 'Topper']/h3\r
+date://div[@class = 'Topper']/h6\r
+body://div[@class = 'Core']\r
+\r
+\r
+\r
+# clean up\r
+strip://div[@class = 'Topper']/h1\r
+strip://div[@class = 'Topper']/h3\r
+strip://div[@class = 'Topper']/h4\r
+strip://div[@class = 'Topper']/h5\r
+strip://div[@class = 'Topper']/h6\r
+strip://br[@clear = 'all']\r
+strip://div[@class = 'adCore']\r
+strip://div[@class = 'BookR']\r
+strip://div[@class = 'InfoBox']
+test_url: http://bookforum.com/inprint/018_04/8595
\ No newline at end of file
--- /dev/null
+title://h1\r
+author://div[@class="meta"]/span/a\r
+date://div[@class="date"]\r
+body://div[@class="content article"]\r
+strip://div[@class="content article"]/h1\r
+
+test_url: http://borderhouseblog.com/?p=7832
\ No newline at end of file
--- /dev/null
+# NOTE: If testing this configuration yields bad results, including junk text like "Try BostonGlobe.com today" and "THIS STORY APPEARED IN", please replace the Test URL with a current-day headline link from bostonglobe.com.\r
+\r
+title: //div[@class="header"]/h1\r
+author: substring-after(//div[@class="byline"]/h2[@class="author"],"By ")\r
+date: //div[@class="byline"]/p[last()]\r
+body: //div[@class="article-body"]\r
+\r
+strip_id_or_class: aside\r
+strip_id_or_class: promo\r
+strip_id_or_class: skip-nav\r
+strip_id_or_class: article-more\r
+strip_id_or_class: article-bar\r
+\r
+# This removes image captions. If the parser starts saving images from bostonglobe.com (currently, it does not), then this directive should be removed.\r
+strip_id_or_class: figure
+test_url: http://bostonglobe.com/news/nation/2012/03/17/illinois-primary-could-pivotal/PsDzFZqvhEYyXbOcF9FOkO/story.html
\ No newline at end of file
--- /dev/null
+#basics\r
+title://h3[@class = 'article_title']\r
+date://span[@class = 'article_date']\r
+body://div[@id = 'center_column_article']\r
+#correct, but author not being picked up in preview\r
+author://span[@class = 'article_author']\r
+\r
+#strips basics from article\r
+strip_id_or_class:article_title\r
+strip_id_or_class:article_date\r
+strip_id_or_class:article_author\r
+\r
+#strips pull quotes\r
+strip_id_or_class:pull_quote
+test_url: http://www.bostonreview.net/BR36.4/megan_pugh_agnes_de_mille_dance.php
\ No newline at end of file
--- /dev/null
+title: substring-before(//title, '|')\r
+body: //div[@class="entry"]\r
+# Remove the author's picture\r
+strip: //div[@class="entry"]/a[1]
+test_url: http://www.boundlessline.org/2011/06/the-nyts-on-gender-over-the-weekend.html
\ No newline at end of file
--- /dev/null
+title: //div[@class="standard"]/h1\r
+author: string("BrainFacts.org")\r
+date: //div[@class="meta"]/strong\r
+\r
+strip: //p[@class="skip"]\r
+strip: //div[@class="meta"]\r
+strip: //div[@class="standard"]/h1\r
+strip: //div[@class="modal"]\r
+strip: //div[@class="columnRight"]
+test_url: http://brainfacts.org/diseases-disorders/childhood-disorders/articles/2011/autism-the-pervasive-developmental-disorder/
\ No newline at end of file
--- /dev/null
+# set body\r
+body: //div[@id='theContent']\r
+\r
+# set title\r
+title: //div[@id='theContent']/h3\r
+strip: //div[@id='theContent']/h3
+test_url: http://www.brandeins.de/archiv/magazin/gegessen-wird-immer/artikel/hunger.html
\ No newline at end of file
--- /dev/null
+date://h2[@class="date-header"]\r
+body://div[@class="entry-content"]
+test_url: http://www.brandingstrategyinsider.com/2011/12/top-twelve-branding-keys-for-2012.html
\ No newline at end of file
--- /dev/null
+body: //div[@class='post full']\r
+title: //h1\r
+author: substring-after(//title, '- ')\r
+date: //span[@class='date']
+test_url: http://brettterpstra.com/byword-for-ios/
\ No newline at end of file
--- /dev/null
+body: //div[@class='articleBody']
+test_url: http://www.brisbanetimes.com.au/opinion/blogs/blunt-instrument/losing-our-minds--for-24-hours-20120118-1q682.html
\ No newline at end of file
--- /dev/null
+title: //div[@id='contentheader']/h1\r
+author: //p[@class='attribution']/span[@class='author']/*\r
+# Is there a way to pull multiple authors? My XPath here is just grabbing the first\r
+\r
+date: /html/head/meta[@name="date"]/@content\r
+body: //div[@class='main-content']\r
+\r
+strip: //p[@class='byline']\r
+strip: //div[@class='img-gallery']\r
+strip: //div[@class='callout']\r
+strip: //div[@class='add-your-view']\r
+convert_double_br_tags: yes
+test_url: http://www.brookings.edu/opinions/2011/1018_cyberattack_libya_goldsmith.aspx
\ No newline at end of file
--- /dev/null
+title: //h1\r
+body: //div[@class='article']\r
+body: //div[@class='post']\r
+date: //*[@id='single']/span\r
+prune: no\r
+test_url: http://brooksreview.net/2011/11/readability-agency/
\ No newline at end of file
--- /dev/null
+title: //h1\r
+author: //h2/a\r
+date: substring-after(//h2, '|')\r
+strip_id_or_class: 'attachment'\r
+strip: //h3\r
+\r
+body: //div[@class='entry']
+test_url: http://buquad.com/2012/04/09/paul-ryan/
\ No newline at end of file
--- /dev/null
+title://div[@class="sl-layout-post"]/h1\r
+body: //div[contains(@class, 'post-content') or contains(@class, 'KonaBody')]\r
+strip: //div[contains(@class, "post-sidebar")]\r
+strip: //div[@id='related-links']\r
+author://div[@class="byline"]/a\r
+date://div[@class="byline"]/span[@class="date"]\r
+prune: no\r
+\r
+strip://*[contains(@class,'sponsored-text')]\r
+strip: //div[@id='post_footer']\r
+\r
+test_url: http://www.businessinsider.com/microsoft-just-put-one-of-its-hardcore-technical-geniuses-on-xbox-2012-1
\ No newline at end of file
--- /dev/null
+body: //div[@id='article_detail']\r
+title: //meta[@property='og:title']/@content\r
+date: //div[@id='date_com_art']//a[@class='date']\r
+author: //div[@id='article_detail']//font[@class='auteur']\r
+\r
+strip_id_or_class: porte_titre_theme\r
+strip_id_or_class: cont_param\r
+strip_id_or_class: date_com_art\r
+\r
+prune: no\r
+\r
+test_url: http://www.businessnews.com.tn/details_article.php?a=31073&t=522&lang=fr&temp=1
\ No newline at end of file
--- /dev/null
+# story has several pages, should be detected\r
+body: //div[@id='storyBody']\r
+body: //div[@id='article_body']\r
+body: //div[@id='story_body']\r
+\r
+title://h1[@id='article_headline']\r
+\r
+# article author\r
+author: //p[@class='author']/a\r
+# story author(s)\r
+author: substring-after(//p[@class='byline'], 'By ')\r
+\r
+# article date\r
+date: //span[@class='published_date']\r
+# story date\r
+date: //span[@class='date']\r
+\r
+date: substring-after(//div[contains(@class,'attributor')],'on')\r
+strip_id_or_class: inset\r
+strip: //p/span[@class='photoCredit']\r
+strip: //h1\r
+\r
+strip_id_or_class: page_count\r
+strip_id_or_class: tools\r
+strip_id_or_class: pagination\r
+\r
+single_page_link: //li[@id='stPrint']/a\r
+\r
+test_url: http://www.businessweek.com/magazine/buyback-insurance-a-good-deal-for-retailers-07282011.html\r
+test_url: http://www.businessweek.com/articles/2012-06-06/american-pain-the-largest-u-dot-s-dot-pill-mills-rise-and-fall
\ No newline at end of file
--- /dev/null
+# Creator: Greg Leuch <greg@...>\r
+\r
+# It can be messy.\r
+tidy:no\r
+\r
+# The basic template: every field is located through its data-print attribute.\r
+title: //h1[@data-print='title']\r
+author: //a[@data-print='author']\r
+date: //time[@data-print='date']\r
+# The body container is matched as either a <div> or a <section>.\r
+body: //div[@data-print='body']\r
+body: //section[@data-print='body']\r
+\r
+# Remove every element flagged data-print="ignore", at any depth.\r
+# (The leading // is required: a bare *[...] is a relative path and\r
+# would only ever test the document's root element.)\r
+strip: //*[@data-print="ignore"]
+test_url: http://www.buzzfeed.com/hgrant/35-reasons-why-dogs-hate-the-holidays
\ No newline at end of file
--- /dev/null
+title: //h1\r
+author: //a[contains(@href, '/author/')]\r
+date: //*[@class='post-date']\r
+strip: //*[@class='post-date']\r
+strip: //h1
+test_url: http://bygonebureau.com/2011/06/20/an-existential-psychoanalysis/
\ No newline at end of file
--- /dev/null
+title: //h1[@class='producttabbed-title']\r
+body: //div[@class='postTabs_divs postTabs_curr_div']\r
+strip: //div[@class='ratingblock2']\r
+strip: //p[@id='breadcrumbs']\r
+strip: //div[@style='display: none']\r
+\r
+
+test_url: http://www.cardboardconnection.com/2012-topps-archives-baseball-cards
\ No newline at end of file
--- /dev/null
+title: //h2\r
+body: //div[@class='entry']\r
+\r
+prune: no\r
+# otherwise the footnotes are removed
+test_url: http://carpeaqua.com/2011/03/27/the-intersection-of-power-and-portability/
\ No newline at end of file
--- /dev/null
+body: //div[@class='article']\r
+strip: //div[@class='revhistory']\r
+strip: //div[@class='toc']\r
+tidy: no\r
+prune: no\r
+
+test_url: http://catb.org/~esr/faqs/smart-questions.html
\ No newline at end of file
--- /dev/null
+title: //div[contains(@class, 'headline')]/h1\r
+author: //h5[contains(@class, 'byline')]\r
+date: substring-after(//h4[contains(@class, 'posted')], 'Posted: ')\r
+body: //div[@id="storyboard"]
+test_url: http://www.cbc.ca/news/world/story/2012/01/16/cruise-ship-monday.html
\ No newline at end of file
--- /dev/null
+date: //meta[@name="published"]/@content\r
+date: //div[@class="timeLine"]\r
+title: //div[@id='contentBody']//h1\r
+author: //dl[@class="storyBlogByline"]/dd/a\r
+body: //div[@id='storyMediaBox'] | //div[contains(@class, 'storyText')]\r
+\r
+# Content Pruning\r
+strip: //div[@class="scrollingArrows"]\r
+strip: //div[@class="timeLine"]\r
+strip: //dl[@class="storyBlogByline"]\r
+\r
+prune: no\r
+\r
+test_url: http://www.cbsnews.com/8301-201_162-57366361/rescued-americans-dad-proud-of-the-u.s/
\ No newline at end of file
--- /dev/null
+title: //h1
+test_url: http://www.chareidi.org/archives5772/tetzaveh/TZV72adraft.htm
\ No newline at end of file
--- /dev/null
+# The first two <span> children of #Content carry the metadata.\r
+# span[1] is used as the title.\r
+title: //*[@id='Content']/span[1]\r
+# span[2]: author is the text between the first "(" and the first ")".\r
+author: substring-after(substring-before(//*[@id='Content']/span[2], ')'), '(')\r
+# span[2]: date is the text between "Updated: " and "Counter".\r
+date: substring-before(substring-after(//*[@id='Content']/span[2], 'Updated: '), 'Counter')\r
+\r
+# Both metadata spans sit inside the body container, so remove them from it.\r
+strip: //*[@id='Content']/span[1]\r
+strip: //*[@id='Content']/span[2]\r
+\r
+body: //*[@id='Content']\r
+
+test_url: http://www.chinamining.org/News/2011-07-22/1311319069d48087.html
\ No newline at end of file
--- /dev/null
+title: //div[@class='title']\r
+author: //div[@class='author']\r
+prune: no\r
+\r
+test_url: http://www.chomsky.info/onchomsky/2002----.htm
\ No newline at end of file
--- /dev/null
+title://div[@class='title']\r
+author://div[@class='byline']/b\r
+date:substring-after(//div[@class='byline'], 'posted')\r
+body://div[@id='body']\r
+wrap_in(h2)://span[@class='subhead']\r
+wrap_in(i)://p[@class='bio']\r
+wrap_in(i)://p[@class='copyright']\r
+strip://div[@class='title']\r
+strip://div[@class='deck']\r
+strip://div[@class='byline']\r
+strip://div[@class='copyright']\r
+strip://br
+test_url: http://www.christianitytoday.com/ct/2012/aprilweb-only/my-god-forsaken-me.html
\ No newline at end of file
--- /dev/null
+title: //h1[@class="entry-title"]\r
+author: //*[@class="author vcard fn"]\r
+date: //*[@class="published"]\r
+body: //div[(@class = "dd_content_wrap")]
+test_url: http://christianpf.com/do-ibuys-lead-to-more-buying/
\ No newline at end of file
--- /dev/null
+tidy: no\r
+prune: no\r
+date: //article//time[@pubdate]\r
+title: //article/header/h2\r
+body: //article
+test_url: http://www.christies.com/LotFinder/custom/lot_details_MultiLanguage.aspx?from=salesummary&intObjectID=5556662&sid=e536ed1a-b763-41c4-afcf-c94815ec6eee&LID=3
\ No newline at end of file
--- /dev/null
+body: //pre[@id='cx-desc-text']\r
+body: //div[contains(@class, 'overview-tab-right-bar-info')]\r
+title: //h1[contains(@class, 'detail-dialog-title')]\r
+tidy: no\r
+prune: no\r
+replace_string(<noscript>): <div>\r
+replace_string(</noscript>): </div>\r
+
+test_url: https://chrome.google.com/webstore/detail/pnaiinchjaonopoejhknmgjingcnaloc
\ No newline at end of file
--- /dev/null
+title: //h1[contains(@class, "entry-title")]\r
+author: //p[contains(@class, "byline")]\r
+\r
+# blog articles (chronicle.com/blogs/*)\r
+body: //div[contains(@class, "abstract")]\r
+date: //p[contains(@class, "time")]\r
+\r
+# all (?) other articles\r
+body: //div[@id="article-body"]\r
+date: //p[contains(@class, "dateline")]\r
+\r
+# remove sidebars containing images (I assume this is desired for Instapaper)\r
+strip: //div[@id="related"]\r
+strip: //div[contains(@class, "image")]\r
+\r
+# Note: if you're not a Chronicle subscriber (personally or institutionally), you'll only see the first couple of paragraphs of the article, and Instapaper will display that excerpt with some crap above and below it. Thank goodness for that bookmarklet.
+test_url: http://chronicle.com/article/In-a-Land-of-Second-Chances/128375/
\ No newline at end of file
--- /dev/null
+# fforst@...\r
+\r
+# Use link to print article for single page view\r
+single_page_link: //a[@class="print"]\r
+\r
+# set body\r
+tidy: no\r
+body: //div[@class='artikel-content']\r
+\r
+# strip title and subtitle since we got it already\r
+strip: //div[@class='issue']\r
+strip: //div[@class='artikel-content']/h2\r
+\r
+# some authors are known and have a link, others don't\r
+author: //a[contains(@href, 'autor?')]\r
+\r
+#date\r
+date: //span[@class='article-date']\r
+\r
+# Strip author since we got him\r
+strip_id_or_class: author\r
+\r
+#strip captions\r
+strip_id_or_class: field-name-field-image-credit\r
+strip_id_or_class: field-name-field-article-image-subtitle\r
+\r
+# remove community functions\r
+strip: //div[@class='meta']\r
+strip: //div[@id='comments']\r
+\r
+# remove "continue on the next page" text\r
+strip: //p[text()="[SEITE]"]
+test_url: http://www.cicero.de/weltbuehne/ihre-wut-ist-global-krise-jugend-revolten-aufstaende-zelte/43049
\ No newline at end of file
--- /dev/null
+body: //*[(@id = "articlebody")]\r
+strip_id_or_class: rotulo\r
+
+test_url: http://ciperchile.cl/2011/04/18/las-operaciones-secretas-que-ordenaba-karadima-para-aniquilar-a-su-competencia/
\ No newline at end of file
--- /dev/null
+body: //p[@class='subhead' or @class='attribution'] | //div[@class='article-body']\r
+prune: no\r
+\r
+single_page_link: //li[@class='print']/a\r
+\r
+test_url: http://www.cjr.org/behind_the_news/from_breaking_news_to_baseless.php
\ No newline at end of file
--- /dev/null
+title://div[@class="entrytitle"]/a\r
+# The entrytime div is split on "|":\r
+#   author = the part before the bar, taken after the "By " prefix\r
+#   date   = the part after the bar, up to "- Posted"\r
+author:substring-after(substring-before(//div[@class="entrytime"], "|"), "By ")\r
+date:substring-before(substring-after(//div[@class="entrytime"], "|"), "- Posted")\r
+body://div[@class="entrybody"]\r
+# Drop the "singleinfo" paragraph(s) nested anywhere inside the body.\r
+strip://div[@class="entrybody"]//p[@class="singleinfo"]
+test_url: http://clientk.com/2011/12/19/the-impact-of-more/
\ No newline at end of file
--- /dev/null
+title: //h1\r
+author: //a[@class='auteur']\r
+body: //div[@class='editorial']\r
+next_page_link: //a[contains(text(),'Page suivante')]\r
+strip: //a[contains(text(),'Page suivante')]\r
+strip: //a[contains(text(),'Page précédente')]\r
+strip_id_or_class: slideshow\r
+\r
+prune: no\r
+\r
+test_url: http://www.clubic.com/carte-graphique/carte-graphique-amd/radeon-hd-7770/article-478936-1-radeon-hd-7750-7770.html
\ No newline at end of file
--- /dev/null
+body: //div[contains(@id,'article-body')]\r
+strip://div[contains(@id,'disqus_count_block')]\r
+strip://div[contains(@id,'col-left')]\r
+strip://div[contains(@id,'col-right')]\r
+
+test_url: http://www.cmswire.com/cms/customer-experience/for-apps-and-appstores-the-singularity-is-approaching-014888.php
\ No newline at end of file
--- /dev/null
+title: //meta[@property="og:title"]/@content\r
+body: //div[contains(@class, 'postBody')]\r
+date: //div[@id='nameAndTime']/time\r
+author: //div[@id='nameAndTime']/span[@class='author']\r
+\r
+strip_id_or_class: image-credit\r
+strip_id_or_class: noAutolink\r
+strip_id_or_class: related\r
+\r
+prune: no\r
+tidy: no\r
+\r
+# early end\r
+replace_string(Download today's podcast</a>): Download today's podcast</a></div></body></html>\r
+\r
+test_url: http://www.cnet.com/8301-13952_1-57367607-81/the-404-981-where-the-world-is-a-vampire-podcast/
\ No newline at end of file
--- /dev/null
+# Headline and byline.\r
+title: //div[@class="cnn_storyarea"]/h1\r
+author: //div[@class="cnnByline"]/strong\r
+# Date: the text of the timestamp div that follows a three-letter weekday\r
+# abbreviation. There is one pattern per weekday (Sun..Sat); presumably only\r
+# one of them matches on any given page.\r
+date: substring-after(//div[@class="cnn_strytmstmp"], 'Sun')\r
+date: substring-after(//div[@class="cnn_strytmstmp"], 'Mon')\r
+date: substring-after(//div[@class="cnn_strytmstmp"], 'Tue')\r
+date: substring-after(//div[@class="cnn_strytmstmp"], 'Wed')\r
+date: substring-after(//div[@class="cnn_strytmstmp"], 'Thu')\r
+date: substring-after(//div[@class="cnn_strytmstmp"], 'Fri')\r
+date: substring-after(//div[@class="cnn_strytmstmp"], 'Sat')\r
+# Remove the headline, byline and timestamp elements that were read above,\r
+# then the remaining cnn_* blocks listed by id/class.\r
+strip: //div[@class="cnn_storyarea"]/h1\r
+strip_id_or_class: cnnByline\r
+strip_id_or_class: cnn_strytmstmp\r
+strip_id_or_class: cnn_strycaptiontxt\r
+strip_id_or_class: cnn_strybtntoolsbttm\r
+strip_id_or_class: cnn_strybtntools\r
+strip_id_or_class: cnn_strybtmcntnt\r
+strip_id_or_class: cnn_containerwht\r
+strip_id_or_class: cnn_stryathrtmp\r
+test_url: http://www.cnn.com/2012/05/13/us/new-york-police-policy/index.html?eref=rss_topstories
\ No newline at end of file
--- /dev/null
+# main sportsillustrated.com articles\r
+\r
+body: //div[@id="cnnStoryContent"]\r
+title: //div[@id="cnnStoryHeadline"]//h1\r
+author: //div[@id="cnnSubBanner"]//strong\r
+date: substring-after(//div[@id="cnnTimeStamp"], "Updated: ")\r
+date: substring-after(//div[@id="cnnTimeStamp"], "Posted: ")\r
+\r
+# kill ugly font buttons\r
+strip: //div[@id="cnnSCFontButtons"]\r
+\r
+# kill misc filler videos & etc\r
+strip: //div[@class="cnnDivideContent"]\r
+strip: //*[@class="cnnTMbox"]\r
+\r
+# si vault articles\r
+# -------------\r
+body: //div[@class="siv_artPara"]\r
+title: //div[@class="siv_artHeader"]//h1\r
+author: //div[@class="byline"]\r
+date: //div[@class="date"]\r
+\r
+next_page_link: //div[@id='cnnStoryContinue']/a\r
+strip_id_or_class: cnnstorypagination\r
+\r
+test_url: http://cnnsi.com/2012/writers/peter_king/01/08/wild.card.round/index.html
\ No newline at end of file
--- /dev/null
+body: //div[@id='content']\r
+title: //div[@id='page_header']/h1\r
+\r
+strip_id_or_class: 'lineno'\r
+strip_id_or_class: 'block-toolbar-button'\r
+strip_id_or_class: 'recipe_score'\r
+strip: //div[@id='recipe_tools']\r
+strip: //div[@id='addcomment']\r
+\r
+test_url: http://code.activestate.com/recipes/500261-named-tuples/
\ No newline at end of file
--- /dev/null
+body: //div[@id="gc-pagecontent"]\r
+strip: //a[@class="backtotop"]\r
+prune: no\r
+\r
+test_url: http://code.google.com/apis/analytics/docs/tracking/gaTrackingEcommerce.html
\ No newline at end of file
--- /dev/null
+body: //div[@class='blogbody']\r
+strip: //h3[@class='title']\r
+date: //h2[@class='date']\r
+#Should Atwood just be a literal?\r
+author: substring-before( substring-after(//div[@class='posted'], 'y'), 'V')\r
+\r
+# tim.kingman@... 2011-07-26\r
+# Prune:no to retain all-link ULs that are part of the body content like\r
+# http://www.codinghorror.com/blog/2011/07/building-a-pc-part-vii-rebooting.html\r
+# Then explicitly strip the "Posted By" and prev/next links that Prune:yes would have removed.\r
+\r
+prune: no\r
+strip: //div[@class='posted']/following-sibling::*\r
+strip: //div[@class='posted']
+test_url: http://www.codinghorror.com/blog/2011/07/building-a-pc-part-vii-rebooting.html
\ No newline at end of file
--- /dev/null
+title: //h1[@class='title']\r
+author: //p[@class='byline']/a[1]\r
+date: //*[@class='date']\r
+\r
+body: //div[@class='article_body']\r
+strip: //p[@class='ca_intro']\r
+strip: //div[@id='action_bar']\r
+strip: //div[@class='below_content']\r
+strip: //div[@id='announcement']\r
+strip: //div[@id='leftovers']\r
+strip: //div[@class='form']\r
+strip: //div[@id='email_overlay']\r
+strip: //a[@class='close']
+test_url: http://www.collegehumor.com/article/6599562/how-it-happened-the-necktie
\ No newline at end of file
--- /dev/null
+body: //div[@class="entry-body"]
+test_url: http://communities-dominate.blogs.com/brands/2012/03/brutal-truth-about-lumia-cannot-sustain-even-1-to-1-replacement-of-symbian-windows-phone-strategy-do.html
\ No newline at end of file
--- /dev/null
+body: //div[@id="center"]//div[@class="node"]\r
+title: //div[@id="center"]//h2\r
+author: substring-after(//div[@id="center"]//div[@class="node"]//span[@class="submitted"], "—")\r
+date: substring-before(//div[@id="center"]//div[@class="node"]//span[@class="submitted"], "—")\r
+strip: //div[@id="center"]//h2[1]\r
+strip: //span[@class="submitted"][1]\r
+move_into(//div[@class="node"])://div[@class="breadcrumb"]
+test_url: http://community.service-now.com/blog/lawrenceeng/seasons-greetings-servicenow-team
\ No newline at end of file
--- /dev/null
+strip_id_or_class:column-3\r
+strip_id_or_class:portlet-boundary\r
+strip_id_or_class:banner\r
+
+test_url: http://www.computer.org/portal/web/buildyourcareer/careerwatch/jt19
\ No newline at end of file
--- /dev/null
+title://h1\r
+\r
+author://div[@id="news-meta"]/a\r
+\r
+body://*[@id="main"]/div[1]\r
+\r
+strip://*[@id="main"]/div[2]\r
+strip://*[@id="main"]/div[3]\r
+strip://*[@id="page"]//footer\r
+\r
+#date: didn't manage to parse it\r
+\r
+#Images have to be stripped because the page displays them with an overlay\r
+strip://img\r
+\r
+#figures are not displayed in instapaper...\r
+strip://figure | //figcaption\r
+test_url: http://www.computerbase.de/news/2012-06/verbraucherzentrale-mahnt-blizzard-fuer-diablo-3-ab/
\ No newline at end of file
--- /dev/null
+title: //meta[@name='headline']/@content\r
+date: //meta[@name='date']/@content\r
+author: //meta[@name='author']/@content\r
+body: //div[contains(@class, 'article')]\r
+body://div[@id="article_body"]\r
+\r
+strip_id_or_class: banner\r
+strip: //noscript\r
+strip: //div[@style='width:1px;height:130px;float:right;']\r
+strip: //div[@class='storyby']\r
+strip_image_src: twitter_icon\r
+strip_image_src: rss_bug\r
+\r
+tidy: no\r
+prune: no\r
+\r
+next_page_link://div[@id="next_page"]/a\r
+\r
+single_page_link: concat('http://www.computerworld.com/s/article/print/', substring-after(//link[@rel='canonical']/@href, '/s/article/'))\r
+\r
+test_url: http://www.computerworld.com/s/article/9224348/Apple_s_new_OS_X_tightens_screws_on_some_malware\r
+test_url: http://www.computerworld.com/s/article/9227679/Windows_8_Release_Preview_Updated_but_still_uneasy
\ No newline at end of file
--- /dev/null
+strip: //div[contains(@class, 'articleAdtechAd')]\r
+title: //div[@id='article']/h1\r
+title: //div[contains(@class, 'article')]/h1\r
+body: //div[@id='articleText']\r
+test_url: http://www.computerworld.dk/art/56748/test-din-viden-med-computerworlds-store-sommerquiz?a=fp_1&i=0
\ No newline at end of file
--- /dev/null
+# get author from string like "Posted by <author> on <date>"\r
+author: substring-before(substring-after(//div[@class='post']/p[@class='post-meta'], 'by'), 'on')\r
+\r
+# get date from string like "Posted by <author> on <date>"\r
+date: substring-after(//div[@class='post']/p[@class='post-meta'], 'on')\r
+\r
+# this keeps thumbnail images\r
+prune: no
+test_url: http://www.contemporist.com/2011/11/02/landing-200-lamp-by-kim-hyunjoo
\ No newline at end of file
--- /dev/null
+title: //div[@class='article_header']/h1\r
+body: //div[@class='article_header']/p | //div[@class='article_body']\r
+strip_id_or_class: share_this\r
+strip_id_or_class: sociable\r
+prune: no\r
+\r
+test_url: http://conversaciones.nokia.com/2011/10/07/cinco-atajos-en-el-nokia-n8/
\ No newline at end of file
--- /dev/null
+body: //div[@id="permalink"]/div[@class="post"]\r
+\r
+strip: //div[@id='backArrow']\r
+strip: //div[@id='fwdArrow']\r
+strip: //div[@class="post-title"]\r
+strip: //div[@class="sharing"]
+test_url: http://www.core77.com/blog/columns/why_design_education_must_change_17993.asp
\ No newline at end of file
--- /dev/null
+title: //div[@class='main']//h1[contains(@class, 'article-title')]\r
+author: //div[@class='mainauthorstyle']\r
+body: //div[@class='main']//div[@class='main-text']\r
+strip: //td[@width='140']\r
+\r
+test_url: http://www.counterpunch.org/johnstone05172011.html
\ No newline at end of file
--- /dev/null
+title://h2\r
+body://div[contains(@class, 'entrytext')]
+test_url: http://www.crazybutable.com/weblog/archives/2010/07/01/house-ideas-that-worked/
\ No newline at end of file
--- /dev/null
+autodetect_next_page: no
+test_url: http://www.crimemagazine.com/son-sam
\ No newline at end of file
--- /dev/null
+body: //div[@class="readingtext"]\r
+title: substring-after(substring-after(//title, ':'), ':')
+test_url: http://www.crimethinc.com/texts/recentfeatures/nightmares.php
\ No newline at end of file
--- /dev/null
+author: //p[contains(@class,'author')]/a\r
+date: //div[contains(@class,'date')]
+test_url: http://www.crn.de/netzwerke-tk/artikel-93103.html
\ No newline at end of file
--- /dev/null
+title: //h1[contains(@class, 'head')]\r
+\r
+# Two body patterns, most specific first.\r
+# standard page\r
+body: //div[@id='mainColumn']//div[contains(@class, 'list-article-full')]\r
+# print page\r
+body: //div[@id='mainColumn']\r
+\r
+author: //a[contains(@class, 'ui-author')]\r
+\r
+# Use the toolbar's print link (href contains "/print/") as the single-page version.\r
+single_page_link: //div[@class='storyToolbar']//a[contains(@href, '/print/')]\r
+\r
+strip_id_or_class: storyToolbar\r
+strip_id_or_class: promotion-tag\r
+\r
+tidy: no\r
+prune: no\r
+
+test_url: http://www.csmonitor.com/World/Middle-East/2011/1108/Imminent-Iran-nuclear-threat-A-timeline-of-warnings-since-1979/Earliest-warnings-1979-84
\ No newline at end of file
--- /dev/null
+title: //div[@id='csn_blogST_headline']/h1\r
+\r
+body: //div[@id='csn_blogST_main']\r
+strip_id_or_class: ipfootnotes\r
+strip: //div[@id='csn_blogST_main']/p[1]/img\r
+strip: //div[@id='csn_blogST_sidebar']
+test_url: http://www.csnbayarea.com/blog/giants-talk/post/-?blog%2Fgiants-talk%2Fpost%2F-=&blockID=578902&feedID=5987
\ No newline at end of file
--- /dev/null
+# author's name is not isolated as a tag.... ugh\r
+convert_double_br_tags: yes\r
+# Select the article container by id. The earlier pattern, //csn_blogST_main,\r
+# looked for an element *named* csn_blogST_main and so could never match an\r
+# HTML page.\r
+# NOTE(review): the id is the one used by the csnbayarea.com config; confirm it\r
+# against current csnphilly.com markup.\r
+body: //div[@id='csn_blogST_main']\r
+\r
+#junk above and around the article\r
+strip: /html/body/div[4]/div[3]/div/div/div/section/div/div/div/div/div/div\r
+strip: /html/body/div[4]/header\r
+strip_id_or_class: article-right-sidebar\r
+strip_id_or_class: rsn-gigya-sharebar-container\r
+strip_id_or_class: article-bottom\r
+strip_id_or_class: hider\r
+strip_id_or_class: footer\r
+strip_id_or_class: masthead\r
+strip_id_or_class: block-menu-menu-rsn-login-or-register\r
+strip_id_or_class: block-menu-menu-header-links\r
+strip_id_or_class: block-rsn-follow-bar-follow-bar\r
+strip_id_or_class: block-rsn-weather-rsn-weather-scoreboard\r
+strip_id_or_class: logo\r
+strip_id_or_class: element-invisible\r
+strip_id_or_class: site-name\r
+# Any div whose inline style attribute contains "none".\r
+strip: //div[contains(@style, 'none')]\r
+test_url: http://www.csnphilly.com/eagles/can-stoutland-save-danny-watkins-career
\ No newline at end of file
--- /dev/null
+body: //*[(@class = "historia")]
+test_url: http://cucharasonica.com/2011/09/queen-busca-candidatos-para-su-propia-banda-tributo
\ No newline at end of file
--- /dev/null
+# The single-page pattern matches any <a> element on the page.\r
+single_page_link: //a\r
+tidy: no\r
+prune: no\r
+
+test_url: http://da.feedsportal.com/c/585/f/413794/s/17037b5a/l/0L0Stelegraaf0Bnl0Cbinnenland0C10A2757860C0I0IKlacht0Itegen0Idr0B0IFrank0Iniet0I0Eontvankelijk0I0I0Bhtml0Dcid0Frss/ia1.htm
\ No newline at end of file
--- /dev/null
+tidy: no\r
+body: //article\r
+
+test_url: http://www.dailydot.com/entertainment/tumblr-christopher-price-topherchris/
\ No newline at end of file
--- /dev/null
+body: //div[@id='article-1']//div[contains(@class, 'article-body')]\r
+# Title and date are both read from the "meta" div.\r
+title: //div[@class='meta']//a[@id='titleHref']\r
+date: //div[@class='meta']//p[@class='date']\r
+\r
+strip_id_or_class: invisible\r
+strip_id_or_class: divider-doodle\r
+\r
+prune: no\r
+\r
+test_url: http://www.dailykos.com/story/2012/01/26/1058790/-Newt-Gingrichs-campaign-admits-he-lied-during-debate-about-ABC-News-interview-with-his-ex-wife
\ No newline at end of file
--- /dev/null
+body: //div[@id='js-article-text']\r
+strip: //div[@class='explore-links']\r
+strip: //div[@id='js-article-text']/br[position()=1]\r
+strip_id_or_class: print-or-mail-links\r
+strip_id_or_class: shareArticles\r
+strip_id_or_class: googleAds\r
+strip_id_or_class: digg-button\r
+strip_id_or_class: article-icon-links-container\r
+strip_id_or_class: clickToEnlarge\r
+tidy: no\r
+\r
+test_url: http://www.dailymail.co.uk/news/article-1375423/Royal-wedding-Texan-billionaire-Joe-Albritton-invited-Prince-Charles.html
\ No newline at end of file
--- /dev/null
+autodetect_next_page: no\r
+tidy: no\r
+prune: no\r
+body: //div[@class='NoOverflow']
+test_url: http://www.dansdata.com/gz129.htm
\ No newline at end of file
--- /dev/null
+title: //div[@class="article"]/h1\r
+author: //div[@id="Sidebar"]/p/strong\r
+date: //h6[@class="dateline"]\r
+body: //div[@class="article"]\r
+strip: //h6[@class="dateline"]\r
+strip: //div[@class="article"]/h1\r
+test_url: http://daringfireball.net/2011/10/apps_are_the_new_channels
\ No newline at end of file
--- /dev/null
+body: //div[@id="article"]\r
+date: //p[@class="date"]\r
+author: //p[@class="byline"]
+test_url: http://www.datanami.com/datanami/2011-12-07/new_path_for_sap:_in_memory_computing,_predictive_analysis_converge.html?featured=top
\ No newline at end of file
--- /dev/null
+title: (//article//h2)[1]\r
+body: //article[contains(@class, 'post')]\r
+date: //time[@id='top_time']/@datetime\r
+\r
+prune: no\r
+tidy: no\r
+\r
+test_url: http://dcurt.is/predictions-txt
\ No newline at end of file
--- /dev/null
+strip_id_or_class: banner\r
+strip_id_or_class: gamma\r
+strip_id_or_class: module-list
+test_url: http://delong.typepad.com/sdj/2011/02/in-which-suresh-naidu-visits-the-new-jerusalem.html
\ No newline at end of file
--- /dev/null
+title: //div[@id='artikelHeader']/h1\r
+author: //span[@class='author']\r
+date: //span[@class='date']\r
+body: //div[@class='copytext']\r
+strip: //ul[@class='lookupLinksArtikel']\r
+\r
+strip: //div[@id='pageTop']\r
+strip: //div[@id='toolbar']\r
+strip: //div[@id='articleTools']\r
+strip: //div[@id='weiterlesen']\r
+strip: //div[@id='communityCanvas']\r
+
+test_url: http://derstandard.at/1318726018343/Breitband-LTE-Was-bringt-die-neue-Mobilfunk-Generation
\ No newline at end of file
--- /dev/null
+tidy: no\r
+body: //div[@class='main']\r
+\r
+author: substring-before(substring-after(//div[@class='meta-single'], 'erstellt von '), ' am')\r
+date: substring-before(substring-after(//div[@class='meta-single'], ' am '), ' | ')\r
+\r
+strip_id_or_class: pagelink\r
+strip_id_or_class: wp-polls \r
+\r
+next_page_link: //div[@class='post-page-next']/a
+test_url: http://www.designtagebuch.de/die-gefuehlte-lesbarkeit/
\ No newline at end of file
--- /dev/null
+body: (//blockquote[contains(@class, 'postcontent')])[1]\r
+body: (//div[starts-with(@id, 'post_message')])[1]\r
+\r
+prune: no\r
+tidy: no
\ No newline at end of file
--- /dev/null
+title: //h1[@class="content-headline"]\r
+body: //div[@class="headers-container"] | //div[@class="content-container"]\r
+prune: no\r
+tidy: no\r
+\r
+single_page_link: //li[@class='utility-print']/a\r
+\r
+test_url: http://www.details.com/culture-trends/critical-eye/201108/best-new-designers-innovations
\ No newline at end of file
--- /dev/null
+title: //div[@class="bodyText"]/h1\r
+author: //div[@class="picture"]/a/img/@alt
+test_url: https://developers.facebook.com/blog/post/2012/03/22/developer-spotlight--foodspotting/
\ No newline at end of file
--- /dev/null
+date: //h2[@class='date-header']\r
+body: //div[@class='post hentry']\r
+title: //h3\r
+strip: //div[@class='post-footer']\r
+
+test_url: http://devlinsangle.blogspot.co.at/2012/03/difference-between-teaching-and_01.html
\ No newline at end of file
--- /dev/null
+title: //h1[@id='query_h1']\r
+body: //div[contains(@class, 'lunatext results_content')]\r
+strip_id_or_class: spl_unshd\r
+#replace_string(<div class="dicTl">): <div class="dicTl">------------------<br />\r
+\r
+prune: no\r
+
+test_url: http://www.wired.com/cloudline/2011/10/meet-arms-cortex-a15-the-future-of-the-ipad-and-possibly-the-macbook-air/
\ No newline at end of file
--- /dev/null
+title: //div[@class='article']/h1\r
+date: substring-before(//p[@class='articletime'],'|')\r
+body: //div[@id='articletext']\r
+strip: //div[@class='inlineDiashow']\r
+
+test_url: http://diepresse.com/home/politik/aussenpolitik/701905/TibeterProteste_Nonne-verbrennt-sich-selbst?_vl_backlink=/home/politik/index.do
\ No newline at end of file
--- /dev/null
+# default parser works great\r
+# only add "author" and "next page link" reference\r
+# 2012-04-13\r
+\r
+next_page_link: //div[@class = 'pagination']/a[@class = 'next_page']\r
+\r
+author: //*[@class = 'author metadata']/a
+test_url: http://digiphoto.techbang.com/posts/2433--commercial-photography-communication-is-the-key-to-a-good-work
\ No newline at end of file
--- /dev/null
+title: //div[@class='post-title']/h1\r
+author: //a[@href='#author']\r
+body: //div[@class='post-content']\r
+strip: //div[@class='post-meta']\r
+\r
+test_url: http://www.digital-photography-school.com/10-ways-to-develop-yourself-photographically
\ No newline at end of file
--- /dev/null
+title: //div[@class="article_header"]/h1\r
+date: //div[@class="article_pub"]/span[@class="time"]\r
+author: //div[@class="article_pub"]/span[@class="editors"]/a/text()\r
+body: //div[@class="article_body clear_left"]
+test_url: http://www.digitalspy.co.uk/movies/at-the-movies/a364066/top-5-super-bowl-movie-trailers-the-avengers-battleship-more.html
\ No newline at end of file
--- /dev/null
+convert_double_br_tags: yes\r
+\r
+title: substring(substring-after(//title, ':'), 1, string-length(substring-after(//title, ':')) - 10)\r
+body: //*[contains(@class, 'SB_Content')]\r
+author: string('Scott Adams')\r
+date: //*[contains(@class, 'SB_Detail')]/text()[1]\r
+
+test_url: http://dilbert.com/blog/entry/death_by_hypnosis_or_not/
\ No newline at end of file
--- /dev/null
+title: //div[@class='newsdetbd']\r
+body: //div[@id='innerleft'] \r
+#//p[@class = 'plnht']\r
+strip_image_src: /albums/\r
+strip: //div[@class='mrrt']\r
+prune: yes\r
+strip_id_or_class: 'fdpd'\r
+strip_id_or_class: 'epapt' \r
+strip_id_or_class: 'newsrtwd'\r
+strip_id_or_class: 'padtp'\r
+strip_id_or_class: 'newdt'\r
+strip_id_or_class: 'newdlt'\r
+strip: //div[@id='selNotes']\r
+strip_id_or_class: 'clsNotes'\r
+strip_id_or_class: 'clear'\r
+strip_id_or_class: 'cmtwrap'\r
+strip_id_or_class: 'sess'\r
+strip_id_or_class: 'parents'
+test_url: http://www.dinamalar.com/News_Detail.asp?Id=295725
\ No newline at end of file
--- /dev/null
+# Since this element has class="clear", the Instapaper stylesheets (at least in this text parser preview) will render it unreadable, with a 1px font size and line height.\r
+\r
+body: //div[@id="article-content"]\r
+\r
+\r
+# Ads\r
+strip_id_or_class: advert-space\r
+\r
+# Read more, recommend, comments etc\r
+strip_id_or_class: fbc-recommend\r
+strip_id_or_class: recommend\r
+strip_id_or_class: article-readers\r
+strip_id_or_class: article-addons\r
+strip_id_or_class: hook\r
+strip_id_or_class: right\r
+strip_id_or_class: footer\r
+\r
+# Other news\r
+strip: //div[@id="mirrors"]\r
+\r
+# Author\r
+author: //div[@id="byline"]/div/p/strong\r
+\r
+# Date\r
+date: substring(substring-after(//p[@class="published"], 'Publicerad '), 0, 11)\r
+test_url: http://www.dn.se/nyheter/varlden/landade-flygplan-mitt-i-villaomrade
\ No newline at end of file
--- /dev/null
+strip: //*[(@id = "featured")]\r
+\r
+author:substring-after( //div[@class='posttitle']/h2[@class='author'],'by ')\r
+\r
+date: concat(//div[@class='month'],' ',//div[@class='day'])\r
+\r
+#doctac doesn't provide a year, but month/day is better than nothing
+test_url: http://www.doctac.com/mac/iphone/instapaper-update-app/
\ No newline at end of file
--- /dev/null
+# TODO: clean up the extra junk at the end of articles\r
+\r
+# general text formatting\r
+prune: no\r
+convert_double_br_tags:yes\r
+\r
+# where to find the basic metadata\r
+author://a[@class='articleauthor']\r
+date://a[starts-with(@href,'/en/search/published/')]\r
+title:substring-before(//h2[@class='title'],'—')\r
+body://div[@id='maincontainer']\r
+\r
+dissolve://div[starts-with(@id,'commentableblock')]\r
+\r
+# clean up the crap\r
+strip://div[contains(@class,'domusnetwork')]\r
+strip://div[contains(@class,'relative_wrapper')]\r
+\r
+strip://div[contains(@class,'captionsubimage')]/img[contains(@class,'arrow')]\r
+wrap_in(em): //div[contains(@class,'captionsubimage')]/span
+test_url: http://www.domusweb.it/en/design/in-praise-of-lost-time/
\ No newline at end of file
--- /dev/null
+title: //h1[@itemprop="name"]\r
+\r
+author: //div[contains(@class, 'author')]//div[contains(@class, 'name')]/a\r
+\r
+date: //div[contains(@class, 'b-info')]//span[contains(@class, 'date')]\r
+\r
+body: //div[contains(@class, 'b-typo')]
+test_url: http://dou.ua/lenta/interviews/andrej-havryuchenko/?from=sb_mostcomm
\ No newline at end of file
--- /dev/null
+# This filter is tested on:\r
+# http://www.douban.com/note/215003067/\r
+# http://www.douban.com/note/213540049/\r
+# http://www.douban.com/group/topic/31140104/\r
+\r
+title: //div[@class='note-header']/h1\r
+title: //div[@id='content']/h1\r
+\r
+author: //div[@class='info']/ul/li/a\r
+author: //h3/span/a\r
+\r
+date://div[@class='note-header']/div/span\r
+date://h3/span[contains(@class, 'color-green')]\r
+\r
+body://div[contains(@class, 'note')]\r
+body://div[contains(@class, 'topic-content')]\r
+\r
+strip://h3\r
+\r
+convert_double_br_tags: yes\r
+test_url: http://www.douban.com/group/topic/31140104/
\ No newline at end of file
--- /dev/null
+# next_page_link for product review\r
+# example: http://www.dpreview.com/reviews/lytro/\r
+next_page_link: //img[@alt = 'Next page']/../@href\r
+\r
+# next_page_link for other articles\r
+# example: http://www.dpreview.com/articles/6126592906/first-impressions-using-the-fujifilm-x-pro1\r
+next_page_link: //*[@class = 'pages']/*/td[@class = 'next enabled']/a\r
+single_page_link: //a[contains(.,'Print view')]
+test_url: http://www.dpreview.com/articles/6126592906/first-impressions-using-the-fujifilm-x-pro1
\ No newline at end of file
--- /dev/null
+title: //meta[@property='og:title']/@content\r
+author: //div[@class='articleFunctions']//a\r
+date: //meta[@name='pubdate']/@content\r
+\r
+# Can you strip elements from the body only? It is required here (`//div[@class='articleContent']/p` breaks for some reason)\r
+body: //div[@class='articleContent']\r
+\r
+tidy: no
+test_url: http://www.dr.dk/Nyheder/Udland/2011/10/24/150115.htm
\ No newline at end of file
--- /dev/null
+body: //div[@class='postext']\r
+\r
+strip_id_or_class: ratingblock\r
+strip_id_or_class: hreview-aggregate\r
+strip: //div[contains(@style, 'display: none;')]\r
+\r
+tidy: no\r
+prune: no\r
+\r
+test_url: http://www.dramasonline.com/jago-pakistan-jago-7th-december-2012-ali-gul-pir/
\ No newline at end of file
--- /dev/null
+single_page_link: //a[contains(@href, '/article/print')]
+test_url: http://www.drdobbs.com/architecture-and-design/240001128
\ No newline at end of file
--- /dev/null
+body: //div[@class = "description"]\r
+body: //div[@id = "post"]\r
+\r
+strip_id_or_class: vcard\r
+strip_id_or_class: journallist\r
+strip_id_or_class: infobox\r
+strip_id_or_class: terms\r
+strip_id_or_class: replieslist\r
+strip_id_or_class: communityside\r
+\r
+
+test_url: http://www.drive2.ru/cars/audi/a6/a6_c5/elysey/journal/288230376151836654/
\ No newline at end of file
--- /dev/null
+title://h1\r
+author://div[@class="submitted"]/a\r
+date:substring-after(//div[@class="meta"],'modified: ')\r
+date:substring-after(//div[@class="submitted"],'on ')\r
+body://div[@class="node-content"]\r
+strip://div[@class="meta"]\r
+strip_id_or_class:book-navigation
+test_url: http://drupal.org/node/1327354
\ No newline at end of file
--- /dev/null
+title: //h2/a\r
+author: substring-before(substring-after(//span[@class='byline'], 'by'), ',')\r
+date: substring-before(substring-after(//span[@class='byline'], ','), '|')\r
+body: //div[@class='entry']\r
+\r
+\r
+# strip out auction stuff at the end of posts\r
+# tidy kills the center tag, so disable it\r
+tidy: no\r
+strip: //center//table
+test_url: http://www.dukebasketballreport.com/articles/?p=42660
\ No newline at end of file
--- /dev/null
+strip://*[@id = 'blog_top_stories']\r
+strip://*[@id = 'takeover_off']\r
+strip://*[@id = 'right_gray_box']\r
+strip://*[@class = 'blog_topics']\r
+strip://*[@class = 'section_titles']\r
+\r
+author://div[@class = 'post_author_info']/a\r
+date://div[@class = 'post_date_info']
+test_url: http://dvice.com/archives/2012/05/is-nfc-and-smar.php
\ No newline at end of file
--- /dev/null
+title: //div [@class="post contain"]/h1\r
+strip: //div [@class="post contain"]/h1\r
+body: //div [@class="post contain"]\r
+author: substring-before(//title, ':')\r
+author: substring-before(//title, ' ')\r
+\r
+
+test_url: http://eamesinerudition.com/2012/03/hospital-numbers-are-bad-for-you
\ No newline at end of file
--- /dev/null
+title: //h1\r
+date: //div[@class="et_dateUnderTitle"]\r
+author: substring-after(//div[@class="et_authorUnderTitle"], 'By ')\r
+body: //div[@id="et_leftCol640split"]\r
+\r
+strip: //div[@id="et_leftCol640splitRight"]\r
+strip: //div[@class="et_light_greybgboxlower"]
+test_url: http://eandt.theiet.org/magazine/2011/12/this-festive-waste.cfm
\ No newline at end of file
--- /dev/null
+title: //div[@class='title_text']\r
+\r
+author: //div[@class='author_text']\r
+\r
+body: //div[@class='story_text']/..\r
+\r
+strip: //b\r
+\r
+strip_id_or_class: back_to_top\r
+strip_id_or_class: author_text\r
+strip_id_or_class: title_text\r
+\r
+wrap_in(center): //a\r
+\r
+dissolve: //a\r
+ \r
+footnotes: no
+test_url: http://www.eastoftheweb.com/short-stories/UBooks/Horl.shtml
\ No newline at end of file
--- /dev/null
+body: //h1[@class='it-ttl'] | //div[@id='mainImgHldr'] | //span[@id='prcIsum']\r
+\r
+strip_image_src: imgLoading_30x30.gif\r
+\r
+test_url: http://www.ebay.com/itm/BRAND-NEW-FM-Transmitter-Ca-r-Charger-iPhone-4S-4-4G-3GS-3G-2G-iPod-Touch-/190657497204
\ No newline at end of file
--- /dev/null
+body: //*[(@class = "historia")]
+test_url: http://ecetia.com/2011/09/vida-de-jugon-vii-las-tres-es
\ No newline at end of file
--- /dev/null
+title: //h1[@class="title"]\r
+author: //div[@class="hosted"]/a\r
+date: substring-after(//div[@class="dateline"]/text(), '|')\r
+\r
+strip: //a[@class="top" and @href="#"]
+test_url: http://econlog.econlib.org/archives/2012/04/blinder_on_heal.html
\ No newline at end of file
--- /dev/null
+date: //div[@class="bb-md-noticia-fecha"]\r
+body: //div[@class="corpo"]\r
+dissolve: //div[@class="bb-md-noticia-extras"]\r
+strip: //strong\r
+strip_id_or_class: bb-md-noticia-foto-autor\r
+strip_id_or_class: bb-md-noticia-foto-bajada
+test_url: http://economia.estadao.com.br/noticias/economia,cmn-aprova-r-67-bi-em-credito-para-20-setores-da-economia,118501,0.htm
\ No newline at end of file
--- /dev/null
+title: //div[@class='ec-blog-headline']\r
+body: //div[@class='ec-blog-body']\r
+body: //div[@class='ec-article-content clear']\r
+strip: //div[@class='related-items']\r
+date: substring-before(//p[@class='ec-article-info'], '|')\r
+prune: no\r
+\r
+autodetect_next_page: no\r
+\r
+test_url: http://www.economist.com/node/21528429
\ No newline at end of file
--- /dev/null
+title: //meta[@property="og:title"]/@content\r
+body: //h2[@class='strapline'] | //article[contains(@class, 'node-article')]\r
+date: //time[@pubdate]/@datetime\r
+author: //span[@class='author-name']\r
+prune: no\r
+tidy: no\r
+strip: //footer\r
+\r
+replace_string(<p>[ pagebreak ]</p>): <!-- pagebreak -->\r
+\r
+single_page_link: //a[contains(@href, '?page=show')]\r
+\r
+test_url: http://www.edge-online.com/features/telling-modern-warfares-story
\ No newline at end of file
--- /dev/null
+title: //div[@class='HomeLeftPannel IMGCTRL']/h2\r
+body: //div[@class='HomeLeftPannel IMGCTRL']//div[@class='Brownalink' or @id='shortdesc']\r
+tidy: no\r
+\r
+test_url: http://edge.org/print/conversation.php?cid=the-argumentative-theory
\ No newline at end of file
--- /dev/null
+body: //div[@id='cnnContentContainer']//div[contains(@class, 'cnn_strycntntlft')]\r
+strip: //div[@id='cnnCVP2']\r
+strip_id_or_class: cnn_strylftcexpbx\r
+strip_id_or_class: cnn_strylctcqrelt\r
+strip_id_or_class: cnn_strybtntoolsbttm\r
+strip_id_or_class: cnn_stryftsbttm\r
+strip_id_or_class: cnn_strybtmcntnt\r
+prune: no
+test_url: http://edition.cnn.com/2011/US/04/29/severe.weather/index.html
\ No newline at end of file
--- /dev/null
+title: //h1[@class='style6 nevek']\r
+\r
+body: //div[@class='bal3']\r
+\r
+\r
+prune: yes\r
+\r
+tidy: yes\r
+convert_double_br_tags: yes\r
+
+test_url: http://ekultura.hu/olvasnivalo/egyeb/cikk/2010-12-15/interju-galvolgyi-judit-2010-december
\ No newline at end of file
--- /dev/null
+body: //div[@id='jobDesc-bd']/p\r
+
+test_url: http://www.elance.com/j/xml-technical-intergration/23687172/
\ No newline at end of file
--- /dev/null
+title: //h1\r
+date: //div[@class='datum']\r
+single_page_link: //a[contains(@href, '?type=99')]\r
+\r
+# this hack preserves the intro text, because it would otherwise be stripped if the title is set to //h1\r
+dissolve: //div[@class='artikelMeldung']\r
+\r
+\r
+strip_id_or_class: anzeige\r
+strip_id_or_class: top_page_navigation\r
+strip_id_or_class: cr_image_container\r
+strip_id_or_class: cr_image_reference\r
+strip_id_or_class: cr_image_icon\r
+strip_id_or_class: _close_txt\r
+strip_id_or_class: _close_ico\r
+strip_id_or_class: clearer\r
+\r
+strip://h1\r
+strip://h6\r
+strip://div[contains(@id, 'plista')]\r
+strip://img[contains(@id,'tiny')]\r
+strip://img[@class='cr_image']\r
+\r
+# strip url at the top\r
+strip: //p[@style='font-size: 10px;']\r
+
+test_url: http://www.elektroniknet.de/automotive/technik-know-how/sicherheitselektronik/article/87717/0/Besser_als_die_Wirklichkeit/
\ No newline at end of file
--- /dev/null
+single_page_link: //a[contains(@href, 'print_contenido')]\r
+title: //h2\r
+author: //div[@class="autor"]
+test_url: http://www.elmalpensante.com/index.php?doc=display_contenido&id=668
\ No newline at end of file
--- /dev/null
+title: //meta[@name='DC.title']/@content\r
+title: //div[contains(@class, 'cabecera_noticia')]//h1\r
+date: //meta[@name='DC.date']/@content\r
+date: //meta[@name='date']/@content\r
+body: //div[@class='columna_texto']\r
+body: //div[@id='cuerpo_noticia']\r
+body: //div[@class='estructura_2col_1zq']//div[@class='margen_n']\r
+\r
+prune: no\r
+\r
+strip_id_or_class: disposicion_vertical\r
+strip_id_or_class: ampliar_foto\r
+strip_id_or_class: utilidades\r
+strip_id_or_class: info_relacionada\r
+strip_id_or_class: m-kiosko\r
+strip_id_or_class: info_complementa\r
+\r
+strip: //div[starts-with(@id, 'sumario') and contains(., 'más información')]\r
+strip: //div[@id='coment' or @id='foros_not']\r
+
+test_url: http://elpais.com/elpais/2012/02/06/gente/1328526783_491687.html\r
+test_url: http://www.elpais.com/articulo/cultura/mano/retrato/materia/elpepicul/20120207elpepicul_2/Tes
\ No newline at end of file
--- /dev/null
+body: //div[@id='content']\r
+strip: //div[@class='rl'] \r
+strip: //p[@class='authdesc']\r
+strip: //p[@class='strybtm']\r
+strip: //div[@id='stryFtrLft']\r
+strip: //div[@id='f1Conversation']\r
+strip: //div[@id='cmtSpncrRuler']\r
+strip: //div[@id='stryComments']\r
+strip: //div[@id='athrData']
+test_url: http://en.espnf1.com/monaco/motorsport/story/50529.html
\ No newline at end of file
--- /dev/null
+title: //meta[@property="og:title"]/@content\r
+body: //div[@class='post_body']\r
+date: //*[@class='post_time']\r
+\r
+prune: no\r
+\r
+test_url: http://www.engadget.com/2011/05/20/screen-grabs-the-mentalist-takes-the-ipad-to-new-heights/
\ No newline at end of file
--- /dev/null
+title: //h2\r
+body: //div[@class="post_content"]\r
+author: //p[@class="author"]/a\r
+date: //p[@class="date"]\r
+strip: //h2\r
+strip: //header
+test_url: http://engineering.tumblr.com/post/21276808338/tumblr-firehose
\ No newline at end of file
--- /dev/null
+title: //span[@id='DetailedTitle']\r
+body: //div[@id='ctl00_cphBody_dvArticleInfoBlock'] | //td[@class='DetailedSummary']\r
+strip_id_or_class: sidebar\r
+strip_id_or_class: Skyscrapper_Body\r
+strip: //td[@class='DetailedSummary']/table[position() != 1]\r
+prune: no\r
+test_url: http://english.aljazeera.net//news/middleeast/2011/04/20114681444376835.html
\ No newline at end of file
--- /dev/null
+body: //div[@id='article']//div[contains(@class, 'inside')]\r
+\r
+strip_id_or_class: tags\r
+strip_id_or_class: actions\r
+strip_id_or_class: google-ads\r
+\r
+prune: no\r
+\r
+test_url: http://www.enikos.gr/politics/98606,To_oxi_toy_Agorastoy_stoys_Germanoys.html
\ No newline at end of file
--- /dev/null
+author://div[@class = 'article-author']/span[@class = 'byline']\r
+title://h1[@class = 'heading']\r
+body://div[@id = 'related-article-links']\r
+strip://div[@id = 'comment-sort-order']\r
+strip://div[@id = 'my-profile']\r
+strip://div[@class = 'article-author']\r
+strip://div[@class = 'bg-f8f1d8 width-385 text-left']\r
+strip://div[@id = 'login-status']\r
+strip://div[@class = 'puff-padding']
+test_url: http://entertainment.timesonline.co.uk/tol/arts_and_entertainment/the_tls/article7177738.ece
\ No newline at end of file
--- /dev/null
+title: concat(//div[@class='doc_author'], ' - ', translate(//div[@class='doc_title'], 'abcdefghijklmnopqrstuvwxyzáéíóöőúüű', 'ABCDEFGHIJKLMNOPQRSTUVWXYZÁÉÍÓÖŐÚÜŰ'))\r
+\r
+body: //div[@class='doc']\r
+\r
+prune: yes\r
+\r
+tidy: yes\r
+convert_double_br_tags: yes\r
+\r
+strip: //a[contains(@href, 'www.facebook.com/pages/Elet-es-Irodalom/')]
+test_url: http://www.es.hu/2010-12-08_vissza-a-partpenzt
\ No newline at end of file
--- /dev/null
+strip_comments: no
+test_url: http://www.escapistmagazine.com/articles/view/columns/extraconsideration/8717-Extra-Consideration-The-Story
\ No newline at end of file
--- /dev/null
+title: //div[@class='headline'] | //div[@class='mod-header']/h3\r
+body: //div[contains(@class, 'article')]\r
+strip: //div[contains(@class, 'mod-inline')]\r
+strip: //*/span[@class='page-actions']\r
+strip: //div[@class='page-actions']/*\r
+strip: //div[@class='headline'] | //div[@class='mod-header']/h3\r
+strip: //div[@class='mod-blog-navigation']\r
+strip: //div[@class='monthday']\r
+strip: //div[@class='time']\r
+strip: //div[@class='timeofday']\r
+strip: //div[contains(@class, 'mod-conversations')]
+test_url: http://espn.go.com/boston/mlb/story/_/id/7092528/terry-francona-victim-latest-red-sox-smear-campaign
\ No newline at end of file
--- /dev/null
+title: //h1\r
+author: //div[@id='byline']\r
+\r
+body: //div[@id='printBody']\r
+\r
+single_page_link: concat('http://www.esquire.com/print-this/', substring-after(//link[@rel='canonical']/@href, 'esquire.com/'))\r
+\r
+prune: no\r
+\r
+test_url: http://www.esquire.com/features/impossible/price-is-right-perfect-bid-0810
\ No newline at end of file
--- /dev/null
+title: //*[@itemprop='headline']\r
+author: //*[@itemprop='author']\r
+date: //*[@itemprop='datePublished']\r
+body: //*[@itemprop='articleBody']\r
+strip: //*[contains(@class, 'instapaper_ignore')]
+test_url: http://www.essentialpublicradio.org/story/2011-11-14/volunteers-sought-federal-tax-assistance-program-pennsylvania-9421
\ No newline at end of file
--- /dev/null
+strip_id_or_class: 'left'\r
+strip_id_or_class: 'right'\r
+strip_id_or_class: 'block-belowcontent'\r
+author: //span[@class = 'name']/a\r
+date: //div[@class= 'datum']\r
+test_url: http://www.etc.se/intervju/lonsamt-att-radda-jorden
\ No newline at end of file
--- /dev/null
+body: //*[(@class = "historia")]
+test_url: http://eternabuenosaires.com/2011/09/calle-adolfo-bioy-casares
\ No newline at end of file
--- /dev/null
+body: //div[ @class='content' ] | //div[ @class='blog-entry' ]\r
+\r
+strip: //h2/abbr | //div[ @class='lowleader' ] | //*[ @class='discussion' ] | //img[ @class='play-button' ] | //div[ @class='boxout' ] | //h2/a | //h2 | //h2/div | //p[ @class='timestamp' ] | //a[ @class='eurogamer-author' ] | //p[ @class='aPager' ] | //h1 | //div[ @id='lowleader' ] | //a[ @class='next' ] | //div[contains(concat(' ', normalize-space(@class), ' '), ' pullquote ')]\r
+\r
+date://p[ @class='timestamp' ]\r
+\r
+author://a[ @class='eurogamer-author' ]\r
+test_url: http://www.eurogamer.net/articles/digitalfoundry-vs-unreal-engine-4
\ No newline at end of file
--- /dev/null
+author: substring-after(//div[@class='articleauthor'],'By ')\r
+\r
+# Blog posts\r
+date: //div[@class='articledate']\r
+# News\r
+date: //div[@class='articledate_b']\r
+\r
+body: //div[@class='articletext']\r
+\r
+convert_double_br_tags: yes
+test_url: http://www.evo.co.uk/carreviews/evolongtermtests/280072/bmw_330d_sport_touring.html
\ No newline at end of file
--- /dev/null
+title: //div[@id='article']/div[contains(@class, 'content')]/h1\r
+body: //div[@id='article']/div[contains(@class, 'content')]\r
+date: //div[contains(@class, 'article-slot')]/descendant::div[contains(@id, 'articledates')]\r
+\r
+strip: //img[contains(@src, 'img/px.gif')]\r
+prune: no\r
+# remove Facebook banner and obtrusive ad\r
+strip: //div[@id='article']/div[contains(@class, 'content')]/div[contains(@class, 'art-right')]\r
+test_url: http://www.expressen.se/kultur/1.2683904/medan-natet-dras-at
\ No newline at end of file
--- /dev/null
+body: //*[(@class = "historia")]
+test_url: http://extracine.com/2011/09/straw-dogs-la-original
\ No newline at end of file
--- /dev/null
+body: //*[(@class = "historia")]
+test_url: http://f1actual.com/2011/09/previo-gran-premio-de-singapur
\ No newline at end of file
--- /dev/null
+body: //div[@class='content']\r
+
+test_url: http://facta.co.jp/blog/archives/20111026001026.html
\ No newline at end of file
--- /dev/null
+title: //h2[@class='related relatedTitle']\r
+author: //a[contains(@href, 'liste.php?author_id')]\r
+\r
+# can't think of a better way unfortunately, really bad markup on this site\r
+date: substring-after(//td[@style='width:85%;'], 'vom')\r
+\r
+# not sure why, but instapaper seems to suck up the teaser paragraph\r
+# not solved!\r
+body: //div[contains(@class, 'teaser')]\r
+body: //div[@id='content']\r
+\r
+# cleanup\r
+strip: //img[@src='http://www.falter.at/web/_pics/falterlogo_dblau.gif']\r
+strip: //div[@class='servicebox']\r
+strip: //h1\r
+strip: //br\r
+strip: //td[@id='adcol']
+test_url: http://www.falter.at/web/print/detail.php?id=1634
\ No newline at end of file
--- /dev/null
+body: //*[@id = 'storytext']\r
+author: //a[starts-with(@href, '/u/')]\r
+next_page_link: substring-after(//input[contains(@value, 'Next')]/@onclick, "self.location='")\r
+autodetect_next_page:yes\r
+strip_id_or_class: 'a2a_kit'
+test_url: http://www.fanfiction.net/s/6497403/1/Spartan_Love
\ No newline at end of file
--- /dev/null
+title: //h1\r
+author: //h5[@class='byline']//a\r
+date: //h5[@class='date']\r
+body: //figure[@class='node-poster'] | //div[contains(@class, "node-content")]\r
+strip_id_or_class: article-top-wrapper\r
+strip_id_or_class: footer-message\r
+strip_id_or_class: print-logo\r
+strip: //cite\r
+strip://*[@class='timestamp']\r
+strip://div[@id='page_right']\r
+strip://section[@id='header_region']\r
+strip://h1[@class='node-title']\r
+strip://div[@class='node-submitted']\r
+strip_id_or_class: skipnav\r
+test_url: http://www.fastcompany.com/3000226/link-between-quietness-and-productivity\r
+test_url: http://www.fastcompany.com/3003586/6-simple-rituals-reach-your-potential-every-day
\ No newline at end of file
--- /dev/null
+# Title\r
+title: //p[@class='Content HeadlineShort']\r
+\r
+# Authors\r
+# some are known and have a link, others don't\r
+author: substring-after(//span[@class='Autor'], 'Von')\r
+\r
+# Date\r
+date: //span[@class='Datum']\r
+\r
+# Body\r
+body: //div[@class='Artikel']\r
+\r
+# Removals before body text\r
+strip: //div[@class='Breadcrumbs']\r
+strip: //div[@class='QuickSearchBox']\r
+strip: //div[@class='FAZArtikelEinleitung']\r
+strip: //div[@class='FAZArtikelReiter']\r
+strip: //div[@class='clear']\r
+\r
+# General removals\r
+strip: //span[@class='Bildnachweis']\r
+\r
+# Removals after body text\r
+strip: //div[@class='ArtikelAbbinder']\r
+strip: //div[@class='ArtikelKommentieren Artikelfuss GETS;tk;boxen.top-lesermeinungen;tp;content']\r
+strip: //div[@class='FAZArtikelKommentare FAZArtikelContent']\r
+strip: //div[@class='FAZArtikelFunktionen']\r
+strip: //div[@id='FAZContentRight']\r
+test_url: http://www.faz.net/aktuell/gesellschaft/ehe-haltbarkeitsformel-verliebe-dich-oft-verlobe-dich-selten-heirate-vielleicht-11685306.html
\ No newline at end of file
--- /dev/null
+body: id('storytext')\r
+author: //a[starts-with(@href, '/u/')]\r
+#next_page_link: substring-after(//input[contains(@value, 'Next')]/@onclick, "self.location='")\r
+strip_id_or_class: 'a2a_kit'
+test_url: http://www.fictionpress.com/s/2897964/1/All_We_Knew
\ No newline at end of file
--- /dev/null
+title: //h4\r
+author: //span[@class="author"]\r
+body: //div[@id="story"]\r
+strip_id_or_class: summary\r
+strip_id_or_class: meta\r
+strip_id_or_class: storyfoot\r
+convert_double_br_tags: yes\r
+prune: no\r
+\r
+# Note: this site still has trouble because single <br> tags are stripped, but I don't see a way to fix that with this interface.\r
+
+test_url: http://www.ficwad.com/story/158977
\ No newline at end of file
--- /dev/null
+title: //meta[@property='og:title']/@content\r
+body: //div[@id='y-article-bd']\r
+body: //div[contains(@class, 'yom-art-content')]\r
+strip: //div[contains(@class, 'related-companies')]\r
+strip: //div[@id='y-article-related']\r
+strip: //div[@id='ypf-article-related']\r
+prune: no\r
+\r
+single_page_link: //div[@class='ft']//a[contains(@href, 'page=all')]\r
+\r
+test_url: http://sg.finance.yahoo.com/news/Motorola-takes-wraps-249-rsg-3508842732.html?x=0&.v=1\r
+test_url: http://finance.yahoo.com/news/super-young-retirement-savers.html
\ No newline at end of file
--- /dev/null
+date: //div[@class='notes']/a\r
+body: //div[@id='content']\r
+\r
+strip_id_or_class: tags\r
+strip_id_or_class: permalink\r
+strip_id_or_class: notes\r
+strip_id_or_class: post_nav\r
+strip: //div[@id='content']//h2\r
+strip_id_or_class: right_column
+test_url: http://findtheswagger.tumblr.com/post/11589145141/moe-resners-end-of-an-era-1957-giants-final
\ No newline at end of file
--- /dev/null
+title: //div[@class='articleTitle']\r
+author: //div[@class='articleAuthor']\r
+body: //div[@class='articleContent']\r
+prune: no\r
+convert_double_br_tags: yes\r
+\r
+test_url: http://www.firstthings.com/article/2011/05/the-trouble-with-ayn-rand
\ No newline at end of file
--- /dev/null
+body: //div[@class='entry']
+test_url: http://www.fivechapters.com/2010/paris-part-one/
\ No newline at end of file
--- /dev/null
+prune: no
\ No newline at end of file
--- /dev/null
+title: substring-after(//title, 'Right:')\r
+body: //div[@class = 'post-body']\r
+author: substring-after(//*[@class='post-author'], 'by')\r
+date: concat(//*[@class='date-header'], ' ', //*[@class='post-timestamp']/a)\r
+convert_double_br_tags: yes\r
+
+test_url: http://www.fivethirtyeight.com/2010/07/does-rnc-have-structural-problems.html
\ No newline at end of file
--- /dev/null
+author: //div[@class='authorDescription']/h2\r
+body: //div[@id='story']\r
+date: substring-before(substring-after(//p[@class='date'],'Erstellt am:'), '-')\r
+title: //h1[@class='detail']\r
+strip: //div[@class='fact']\r
+
+test_url: http://fm4.orf.at/stories/1689156/
\ No newline at end of file
--- /dev/null
+title: normalize-space(//h1)\r
+\r
+author: //td/p[position()=last()]/em\r
+\r
+# I swear, this is really the best way to do this\r
+date: normalize-space(//td[contains(@style, "color: #ffffff")])\r
+\r
+# my god, it's full of tables\r
+body: /table/tbody/tr[5]//table/tbody//table/tbody/tr/td\r
+strip: //h1\r
+\r
+# the following two lines strip the byline at the end of the article (the byline is a <p> that consists of an em dash and then some text in an <em>). I have no idea why I can't just strip //p[position()=last()], but trying to do so includes a bunch of other crap in the output.\r
+strip: //p[position()=last()]/em\r
+strip: //p[position()=last()]/child::text()
+test_url: http://www.fnal.gov/pub/today/archive_2011/today11-11-09_MuonDepartmentReadMore.html
\ No newline at end of file
--- /dev/null
+title: //h1\r
+\r
+author: //div[@class='articleContent small']/div[@class='textBlock']//span[@class='created']\r
+\r
+date: //div[@class='articleHead']/span[@class='created']\r
+\r
+body: //div[@id='article']\r
+\r
+strip: //span[@class='markerText']\r
+strip: //div[@class='articleContent small']/div[@class='textBlock']//span[@class='created']\r
+strip: //div[@class='sidebar']\r
+strip: //div[@class='starbar']\r
+strip: //div[@class='actions clearfix']\r
+strip: //div[@id='commentForm']\r
+strip: //div[@id='commentSent']\r
+strip: //div[@id='comments']\r
+strip: //div[@class='similarityBlock']\r
+
+test_url: http://www.focus.de/politik/ausland/ein-jahr-nach-bombenanschlag-u-bahn-attentaeter-von-minsk-hingerichtet_aid_724958.html
\ No newline at end of file
--- /dev/null
+body: //div[@class='entry-content']\r
+date: //meta[@name="date"]/@content\r
+author: //meta[@name="author"]/@content\r
+\r
+strip_id_or_class: ecapShell\r
+strip_id_or_class: noindent\r
+strip_id_or_class: targetedPromotion\r
+\r
+prune: no\r
+\r
+test_url: http://www.fool.com/investing/general/2012/01/27/dfc-global-beats-up-on-analysts-yet-again.aspx
\ No newline at end of file
--- /dev/null
+title: //hgroup//h1\r
+title: //span[@class='mainarttitle']\r
+\r
+body: //div[@id='leftRail']//div[contains(@class, 'body')]\r
+\r
+author: //meta[@name="author"]/@content\r
+author: //span[@class='mainartauthor']\r
+\r
+date: substring-before(//hgroup//h6, '@')\r
+date: //span[@class='mainartdate']\r
+\r
+prune: no\r
+\r
+single_page_link: //a[contains(@href, '/print/')]\r
+\r
+test_url: http://www.forbes.com/forbes/2011/0509/technology-frog-design-jan-chipchase-ethnographer-birth-cool_print.html
\ No newline at end of file
--- /dev/null
+title: //div[@id='art-mast']//h1\r
+author: substring-after(//span[@id='by-line'], 'BY ')\r
+date: //span[@id='pub-date']\r
+body: //div[@id='art-mast']//h2 | //div[@id='art-mast']/h3 | //div[@id='art-body']//div[@class='translateBody']\r
+strip: //div[@id='share-box']\r
+prune: no\r
+\r
+single_page_link: //span[@id='controls']/a[contains(@href, 'print=yes')]\r
+\r
+test_url: http://www.foreignpolicy.com/articles/2011/08/01/a_murderers_manifesto_and_me\r
+test_url: http://www.foreignpolicy.com/articles/2012/02/29/five_years_in_damascus
\ No newline at end of file
--- /dev/null
+title: //div[@class="articleHeader"]/h1\r
+author: //p[@class="byline"]\r
+date: //p[contains(@class,"publishedDate")]/span\r
+# remove the right menu\r
+strip: //div[contains(@class,"aside")]\r
+# remove some SharePoint webpart label junk\r
+strip: //div[@id="ctl00_PlaceHolderMain_ArticleLeadField_label"]\r
+strip: //div[@id="ctl00_PlaceHolderMain_PublishingPageContentField_label"]
+test_url: http://forsvaret.no/aktuelt/publisert/nyheter/Sider/F5-fly-til-Skedsmo.aspx
\ No newline at end of file
--- /dev/null
+prune: no\r
+\r
+author: //meta[@name="dc.publisher"]/@content\r
+date: //meta[@name="dc.date"]/@content\r
+strip: //p[contains(@class, 'contributor vcard')]\r
+replace_string(<ul><li><div class="photo">): <div class="photo">\r
+strip: //p[a[contains(., 'Click here to read more on this story ')]]\r
+\r
+test_url: http://www.foxnews.com/entertainment/2011/05/04/dwayne-johnson-guys-grow-pair-driving-hybrid/
\ No newline at end of file
--- /dev/null
+body: //div[@id="projectDetailsContent"]//td\r
+
+test_url: http://www.freelancer.com/projects/PHP-Website-Design/debug-Forum-website-code.html
\ No newline at end of file
--- /dev/null
+body: //div[@class = 'instapaperbody']\r
+convert_double_br_tags: no\r
+date: //div[@class='instadate']\r
+title: //h2[@class = 'instatitle']
+test_url: http://freytag-film.com/blog/artikel/shooting_a_feature_film_in_10_days
\ No newline at end of file
--- /dev/null
+#body: (//div[@class='ftr-yt-vid'])[1]\r
+body: (//blockquote[contains(@class, 'postcontent')])[1]\r
+body: (//div[starts-with(@id, 'post_message')])[1]\r
+\r
+prune: no\r
+tidy: no\r
+\r
+#replace_string(<iframe title="YouTube video player"): <div class="ftr-yt-vid"><iframe title="YouTube video player"\r
+#replace_string(</iframe>): </iframe> </div>\r
+\r
+test_url: http://www.friendskorner.com/forum/f137/debate-personal-lives-leaders-west-vs-pakistan-must-read-297989/
\ No newline at end of file
--- /dev/null
+body: //div[contains(@class, 'ft-story-body')]\r
+\r
+author: substring-after(//div[contains(@class, 'ft-story-header')]/p[1], 'By ')\r
+date: substring-before(substring-after(//div[contains(@class, 'ft-story-header')]/p[2], 'Published:'), '|')
+test_url: http://www.ft.com/cms/s/2/e1be4b5a-620c-11e0-8ee4-00144feab49a.html
\ No newline at end of file
--- /dev/null
+body: //div[@class='boxIntroHead']/span[@class='h3'] | //div[@class='section']/div[@class='paragraph' or @class='embObjLeft']\r
+single_page_link: //a[@class='icon print']\r
+
+test_url: http://www.ftd.de/it-medien/it-telekommunikation/:mobilfunk-vivendi-und-vodafone-trennen-sich-in-frankreich/60034691.html\r
+test_url: http://www.ftd.de/it-medien/medien-internet/:verkauf-von-warner-music-musikbranche-auf-dem-sprung/60048185.html
\ No newline at end of file
--- /dev/null
+body: //div[@class = 'entry']\r
+
+test_url: http://www.fubiz.net/2011/05/31/world-press-photo-2011/
\ No newline at end of file
--- /dev/null
+date: //span[@class='date']\r
+strip: //div[@class='postsidebar']\r
+body: //div[@class='singlepost']\r
+title: //div[@class='singlepost']/h1\r
+move_into(//div[@class='singlepost']): //div[@class='info']\r
+strip: //div[@class='gallery']\r
+strip: //div[@class='biggallery']\r
+strip: //ul[@class='social']\r
+strip: //ul[@class='social_mail']\r
+
+test_url: http://futurezone.at/future/5502-erste-galileo-satelliten-starten-ins-all.php
\ No newline at end of file
--- /dev/null
+# default view title\r
+title: //span[@class='newsTitle']\r
+# print view title\r
+title: //h3[@class='title']\r
+\r
+# default view author\r
+author: //span[@class='newsAuth']/a\r
+author: substring-after(//span[@class='newsAuth'], 'by ')\r
+\r
+# default view date\r
+date: //td[@class='newsDate']\r
+\r
+# default view body\r
+body: //td[@class='featureText']\r
+body: //td[@class='newsText']\r
+\r
+strip: //h3[@class='title']\r
+\r
+single_page_link: //a[contains(@href, '?print=1')]
+test_url: http://www.gamasutra.com/view/feature/132559/staying_power_rethinking_feedback_.php
\ No newline at end of file
--- /dev/null
+title: //meta[@property="og:title"]/@content\r
+body: //div[@id='GBTVPlayer'] | //div[contains(@class, 'col490')]\r
+\r
+prune: no\r
+\r
+strip_id_or_class: noprint\r
+strip: //div[@id='gbNewsTextContent']/following-sibling::*\r
+\r
+test_url: http://www.gameblog.fr/news/26330-les-sims-3-showtime-s-annonce-en-video\r
+test_url: http://www.gameblog.fr/news/26306-mise-a-jour-du-dashboard-de-la-xbox-360-disponible
\ No newline at end of file
--- /dev/null
+tidy: no\r
+\r
+test_url: http://www.garythink.com/eft/testing.html
\ No newline at end of file
--- /dev/null
+# These should work, but don't. They were given by Firefox XPather extension\r
+title: //article//header//a//h1\r
+body: //article//section
+test_url: http://gasteroprod.com/blog/faut-il-continuer-a-supporter-internet-explorer-6.html
\ No newline at end of file
--- /dev/null
+body: //div[@class='panel']\r
+strip: //div[@style='float:right']\r
+strip: //span[@class='titulosHomePublicidad']\r
+strip: //div[@id='TitTop5Der']\r
+strip: //img[@src='/ImagesGatoPardo/LogoGatopardo.png']\r
+\r
+prune: yes
+test_url: http://www.gatopardo.com/ReportajesGP.php?R=95
\ No newline at end of file
--- /dev/null
+body: //div[@class="post-body"]\r
+\r
+# Remove 'content is restricted'\r
+strip: //div[@id='agegate_IDHERE']\r
+\r
+test_url: http://gawker.com/#!5782070/russian-bomb-squad-successfully-defuses-sex-toy
\ No newline at end of file
--- /dev/null
+author: substring-after(//span[@class='storyauthor'],'Posted by')\r
+date: //span[@class='storydate']
+test_url: http://www.geeksofdoom.com/2012/03/14/robert-rodriguez-says-machete-kills-and-sin-city-2-will-film-this-year/
\ No newline at end of file
--- /dev/null
+body: //div[@id = 'article']\r
+strip: //div[@id = 'klasbox']
+test_url: http://www.geenstijl.nl/mt/archieven/2010/10/vrouw_lange_frans_wou_baas_b_d.html
\ No newline at end of file
--- /dev/null
+body: //div[@class='post']\r
+strip: //ul[@id='bookmark_single']
+test_url: http://getnews.jp/archives/117312
\ No newline at end of file
--- /dev/null
+# 2011-11-19 - carlo@... - Initial setup.\r
+\r
+strip_id_or_class: user-review-detail\r
+strip: //h1\r
+\r
+body: //div[@class="wiki-content"] | //div[@class="section-bd"] | //div[@class="news-story"]\r
+\r
+author: //span[@class="reviewer"] | //p[@class="byline"]/a/text()\r
+date: //span[@class="dtreviewed"]\r
+
+test_url: http://www.giantbomb.com/the-elder-scrolls-v-skyrim/61-33394/
\ No newline at end of file
--- /dev/null
+tidy:no\r
+title://h2[@class="title"]\r
+# author:"Ben Miller"\r
+date://div[@id="stats"]/span\r
+strip_id_or_class:stats\r
+strip_id_or_class:breadcrumbs\r
+strip_id_or_class:gn-why-content\r
+strip_id_or_class:single-social\r
+strip_id_or_class:sidebar-ads\r
+strip_id_or_class:sidebar-top\r
+strip_id_or_class:footer\r
+strip_id_or_class:post_meta\r
+# strip_id_or_class:\r
+# strip_id_or_class:\r
+# strip_id_or_class:\r
+# strip_id_or_class:\r
+# strip_id_or_class:\r
+# strip_id_or_class:\r
+
+test_url: http://www.giga.de/benm/2011/10/17/probleme-mit-ios-5-wenn-die-daten-weg-sind/#more-58033
\ No newline at end of file
--- /dev/null
+date: //meta[@name='DC.date.issued']/@content\r
+date: //span[@class='post-meta the-date']\r
+\r
+title: //meta[@property='og:title']/@content\r
+\r
+author: //meta[@name='DC.creator']/@content\r
+\r
+body: //div[contains(@class, 'post-sub-head') or starts-with(@id, 'post-content-')]\r
+\r
+find_string: id="content"\r
+replace_string: id="content-ignore"\r
+\r
+strip_id_or_class: sharedaddy\r
+\r
+prune: no\r
+\r
+test_url: http://gigaom.com/2011/10/24/groupon-google-lawsuit/
\ No newline at end of file
--- /dev/null
+single_page_link: //p[@id='skip']//a[contains(@href, 'skip')]\r
+
+test_url: http://gihyo.jp/dev/serial/01/machine-learning/0010
\ No newline at end of file
--- /dev/null
+body: //div[@class="highlight"]/pre\r
+\r
+prune: no\r
+tidy: no\r
+\r
+test_url: https://gist.github.com/1258908
\ No newline at end of file
--- /dev/null
+single_page_link: //div[@id="content"]//h2/a\r
+
+test_url: http://givemesomethingtoread.com/post/6285838917/the-baddest-lawyer-in-the-history-of-jersey
\ No newline at end of file
--- /dev/null
+body: //div[@id="leadimage" or @class="postcontent"]\r
+author: //div[@class="contentauthor"]\r
+date: //div[@class="timestamp"]\r
+\r
+prune: no\r
+\r
+test_url: http://www.gizmodo.co.uk/2013/02/bbc-forcing-poor-old-sir-david-attenborough-to-go-on-twitter/
\ No newline at end of file
--- /dev/null
+body: //div[@class="post-body" or contains(@class, 'illustration top')]\r
+author: (//cite//span[@class="plus-icon"])[1]\r
+date: //span[@class="date"]\r
+\r
+prune: no\r
+\r
+test_url: http://gizmodo.com/5880147/kuhn-rikon-improves-their-spice-grinder-with-grade-school-science
\ No newline at end of file
--- /dev/null
+body: //*[(@class = "historia")]
+test_url: http://gizmologia.com/2011/09/amd-trinity-el-sucesor-de-llano-en-una-demostracion-muy-interesante
\ No newline at end of file
--- /dev/null
+body: //*[(@class = "historia")]
+test_url: http://gizmovil.com/2011/09/hipertextual-labs-receptor-bluetooth-nokia-bh-214
\ No newline at end of file
--- /dev/null
+# Look for Open Graph data - http://ogp.me\r
+title: //meta[@property="og:title"]/@content\r
+date: //meta[@property="article:published_time"]/@content\r
+# article:author is sometimes a URL, e.g. on guardian.co.uk
\ No newline at end of file
--- /dev/null
+body: //div[@id='content']\r
+\r
+strip: //p[@class='top']\r
+strip: //h2[.='Where next?']\r
+strip_id_or_class: where-next\r
+strip_id_or_class: social-bookmarks\r
+strip_id_or_class: link-to-here\r
+strip_id_or_class: options-heading\r
+strip_id_or_class: page-options-content\r
+strip_id_or_class: page-info-bottom\r
+\r
+tidy: no\r
+prune: no\r
+\r
+test_url: http://www.globalissues.org/article/39/a-primer-on-neoliberalism
\ No newline at end of file
--- /dev/null
+title: //div[@id='article_headline']//h1\r
+date: //div[contains(@class, 'articleDate')]//h4\r
+body: //div[@id='article_headline']/h2 | //div[@id='large_article_image' or @id='article_content']\r
+\r
+strip_id_or_class: relatedLinksBox\r
+strip_id_or_class: betting-widget\r
+strip_image_src: install_flash.gif\r
+\r
+strip: //table[contains(@style, 'float: right; width: 285px;')]\r
+strip: //div[@class='caption']\r
+\r
+tidy: no\r
+prune: no\r
+\r
+test_url: http://www.goal.com/en-gb/news/3284/euro-2012/2012/05/31/3139032/video-profile-back-to-his-very-best-for-bayern-frances-flair-and-\r
+test_url: http://www.goal.com/en-gb/news/3284/euro-2012/2012/05/31/3139869/lampard-injury-a-bitter-blow-for-england-and-sorry-way-to#
\ No newline at end of file
--- /dev/null
+# Jens Kohl, jens.kohl@...\r
+# - Added publication date\r
+# - Stripped pagination block\r
+# - Added single page link\r
+# - Added XPath queries for the printer-friendly version\r
+\r
+title: //h1\r
+body: //div[@class='formatted']\r
+prune: no\r
+\r
+date: substring-after(//li[2][@class="text1"], 'Datum:')\r
+strip: //ol[@class="list-chapters"]\r
+strip_comments: yes\r
+\r
+# next: commands for printer friendly pages\r
+single_page_link: //a[contains(@href, 'print.php?a=')]/@href\r
+title: //body/h3\r
+strip_image_src: staticrl/images/logo.jpg\r
+strip_image_src: http://cpx.golem.de/cpx.php?class=7\r
+strip: //body/h3\r
+strip: //body/b[1]\r
+strip: //body/b[2]\r
+strip: //body/b[3]\r
+strip: //div[1]\r
+test_url: http://www.golem.de/1112/88696.html
\ No newline at end of file
--- /dev/null
+title: //div[@class="title"]/div/h1\r
+body: //div[@class="body"]\r
+date: //li[@class="date-time"]\r
+test_url: http://www.good.is/post/why-amazon-is-the-next-top-tech-company/
\ No newline at end of file
--- /dev/null
+date: //meta[@name='og:article:published_time']/@value\r
+\r
+body: //div[@class='itemIntroText' or @class='itemImageBlock' or @class='itemFullText']\r
+\r
+strip_id_or_class: itemImageGallery\r
+\r
+# remove extras at end of post content\r
+find_string: <div style="margin:5px 0 10px;">\r
+replace_string: </div></body></html><!--\r
+\r
+prune: no\r
+\r
+test_url: http://www.gossip-tv.gr/story/158902/aggelike-daliane-semera-duskoleuontai-oloi-sta-epaggelmatika-tous\r
+test_url: http://www.gossip-tv.gr/lifestyle/Taste/story/230266/lahtaristo-kai-ygieino-tost-sokolatas
\ No newline at end of file
--- /dev/null
+title: //div[@class='entry-header']\r
+author: //span[@class='vcard author']\r
+date: //abbr[@class='published']\r
+#move_into(//div[@class='entry-body']): //img[@id='photo_1']\r
+body: //div[@class='entry-body']\r
+strip: //div[@class='galleryEaseThumbs']
+test_url: http://gothamist.com/2012/03/15/fancy_cocktail_lounge_the_randolph.php
\ No newline at end of file
--- /dev/null
+title: //span[@id="showTitle"]\r
+author: //span[@id="showAuthor"]\r
+date: //span[@id="showRefDate"]\r
+\r
+strip: //span[@class="black_bold"]\r
+strip: //div[@id="sectionName"]\r
+strip: //div[@id="storyHeader"]\r
+\r
+body: //div[@id="newsBodyText"]\r
+\r
+strip_image_src: "http://www.gotomanager.com/img/mgrm/space.gif"\r
+strip_image_src: "http://www.gotomanager.com/images/separator.gif"\r
+strip_image_src: "http://www.gotomanager.com/images/spaces.gif"\r
+\r
+convert_double_br_tags: yes\r
+tidy: yes\r
+\r
+strip: //div[@id="smallLeadImage"]\r
+strip: //div[@id="truehitsSurvey"]\r
+strip: //table[@id="relatedInfoTable"]
+test_url: http://www.gotomanager.com/news/details.aspx?id=86759
\ No newline at end of file
--- /dev/null
+next_page_link: //div[@class='pagination']//span[@class='paginationNext']/a\r
+strip_id_or_class: utility\r
+strip_id_or_class: keywords\r
+strip_id_or_class: pagination\r
+strip_id_or_class: position2_content\r
+body: //div[@class='article']\r
+title: //h1[@class='content-headline']\r
+author: //span[@class='contributor']//a
+test_url: http://www.gq.com/news-politics/newsmakers/201203/terry-thompson-ohio-zoo-massacre-chris-heath-gq-february-2012
\ No newline at end of file
--- /dev/null
+# this is fragile with footnotes -- leave it for now\r
+\r
+#tidy: no\r
+#prune: no\r
+#move_into(//article): //aside[@id='footnotes']\r
+author: //cite/a\r
+date: //time\r
+\r
+strip: //a[text()='Grantland']\r
+strip_id_or_class: ad-wrapper\r
+strip_id_or_class: fb-connect-link\r
+strip_id_or_class: fb-status\r
+strip: //li[@class='print']\r
+strip: //cite\r
+strip: //a[contains(text(), '[+]')]\r
+strip: //a[@id='jump-nav-link']\r
+strip: //h1[text()='Share This']\r
+strip: //h1[text()='Top Stories']\r
+strip: //div[@id="update-text-size"]\r
+test_url: http://www.grantland.com/story/_/id/8421241/examining-new-albums-rock-veterans-no-doubt-green-day
\ No newline at end of file
--- /dev/null
+title: //div[@class="blogpost"]/h2\r
+author: //div[@class="blogpost"]/p[@class="byline"]/a\r
+date: //div[@class="blogpost"]/p[@class="byline"]/span[@class="time_posted"]\r
+body: //div[@class="blogpost"]\r
+strip_id_or_class: flag\r
+strip_id_or_class: byline\r
+strip_id_or_class: post_footer\r
+strip_id_or_class: related_posts\r
+strip_id_or_class: post_author_bios\r
+strip: //h2
+test_url: http://greatergreaterwashington.org/post/12457/ask-ggw-what-will-happen-to-the-1000-series-railcars/
\ No newline at end of file
--- /dev/null
+title://h1\r
+author://span[@class="submitted"]/a\r
+date:substring-after(//span[@class="submitted"],'on ')\r
+body://div[@class="content"]
+test_url: http://groups.drupal.org/node/36816
\ No newline at end of file
--- /dev/null
+title: //div[@id='main-article-info']//h1\r
+body: //div[@id='article-wrapper']\r
+date: //li[@class='publication']//time[@pubdate] | //li[@class='publication']//data[@pubdate]\r
+author: //li[@class='byline']\r
+prune: no\r
+tidy: no\r
+test_url: http://www.guardian.co.uk/business/2011/oct/06/quantitative-easing-75bn-bank-of-england
\ No newline at end of file
--- /dev/null
+body: //div[@class='wrapper_half']//ul[@class='details'] | //div[@class='wrapper_half']//p[@class='synopsis'] | //div[@class='wrapper_half']//div[@class='image'] | //div[@class='wrapper_half']//div[@class='article']\r
+strip: //div[@class='wrapper_half']//ul[@class='details']/li[position()>1]\r
+prune: no\r
+tidy: no\r
+test_url: http://gulfnews.com/news/gulf/uae/government/abu-dhabi-centre-offers-useful-information-1.811084
\ No newline at end of file
--- /dev/null
+# To administrator:\r
+# Please change the hostname to "www.guokr.com/article/*"\r
+# Not working for "www.guokr.com/post/" pages configured by carlosliu913@gmail.com\r
+\r
+# This filter is tested on:\r
+# http://www.guokr.com/article/274325/\r
+# http://www.guokr.com/article/275013/\r
+\r
+title://h1\r
+author://div[contains(@class, 'content-th-info')]/a\r
+date://div[contains(@class, 'content-th-info')]/span\r
+body://div[contains(@class, 'Content')]\r
+\r
+strip://div[contains(@class, 'bottom-i')]\r
+strip://div[contains(@class, 'copyright')]\r
+strip://div[contains(@class, 'fr')]\r
+strip://div[contains(@class, 'content-th-info')]\r
+strip://h1[contains(@id, 'articleTitle')]\r
+strip://div[contains(@class, 'side')]\r
+strip://div[contains(@class, 'top-wp')]\r
+test_url: http://www.guokr.com/article/275013/\r
+test_url: http://www.guokr.com/article/338387/
\ No newline at end of file
--- /dev/null
+title: //div[@id="habermetni"]/h1[@id="haber_baslik"]\r
+body: //div[@id="habermetni"]/p\r
+strip: //img[@class='newsDetailLeft']\r
+strip_image_src: /haber-resimleri/
+test_url: http://www.haberler.com/emniyete-atacakti-elinde-patladi-3198733-haberi/
\ No newline at end of file
--- /dev/null
+title:substring-before(id("maincontent")/table, 'Posted')\r
+body:id("maincontent")/p\r
+# eventually convert linebreaks better\r
+
+test_url: http://halo.bungie.org/fanfic/?story=Delahunt0312112316071.html
\ No newline at end of file
--- /dev/null
+# Remove right column\r
+strip: //*[(@class = 'right_col')]\r
+\r
+# Remove comments etc.\r
+strip: //*[(@class = 'category')]\r
+strip: /html/body/div[1][@class='absolute_content_high']/div[1][@class='wrapper']/div[1][@class='main_col']/div[@class='main_content']/h3
+test_url: http://hammers.theoffside.com/carling-cup/a-funny-thing-happened-on-the-way-to-4-nil.html
\ No newline at end of file
--- /dev/null
+date: //span[@class="item-date"]\r
+body: //div[@class="item-content"]\r
+strip_comments: no
+test_url: http://www.hanselman.com/blog/BrainBytesBackBunsTheProgrammersPriorities.aspx
\ No newline at end of file
--- /dev/null
+title: //h1\r
+author: //a[@class='a_aut']\r
+body: //div[@class='content_dossier']\r
+strip: //div[@id='pagination']\r
+next_page_link: //div[@class='sommaire_colonne']//span[@class='page_actuelle']/following::span[@class='autres_page']//a/@href
+test_url: http://www.hardware.fr/articles/850-1/pci-express-3-0-impact-performances.html
\ No newline at end of file
--- /dev/null
+title: //div[@id='article-title']\r
+author: //div[@id='articleAuthors']\r
+body: //div[@id='article']\r
+strip: //div[@class='module wide']\r
+next_page_link: //a[@title='Next Page']
+test_url: http://hbr.org/2012/04/the-real-leadership-lessons-of-steve-jobs/ar/
\ No newline at end of file
--- /dev/null
+date: //span[@class = 'date']\r
+body: //div[@class = 'entry-content']\r
+strip://div[@class='more-ways']\r
+strip://div[@id = 'stayConnected']\r
+strip://p[child::a[@rel = 'bookmark']]\r
+strip://p[starts-with(string(.),'(MORE:')]\r
+strip://p[starts-with(string(.),'(PHOTOS:')]\r
+move_into(//p[../@class = 'entry-content'][position() = last()])://div[@id = 'featbox']\r
+
+test_url: http://healthland.time.com/2011/07/24/amy-winehouse-and-the-pain-of-addiction/?preview=true&preview_id=39210&preview_nonce=0777d4e408
\ No newline at end of file
--- /dev/null
+body: //div[@id='content']/div\r
+date: //p[@class='author_date']/span[@class='date']
+test_url: http://heise-online.mobi/newsticker/meldung/Amazons-Appstore-in-der-Kritik-Ein-Desaster-fuer-Kunden-und-Entwickler-1273936.html
\ No newline at end of file
--- /dev/null
+single_page_link: //p[@class='news_option']/a\r
+\r
+date: //p[@class='news_datum']\r
+title: //h1\r
+body: //div[@class='meldung_wrapper']\r
+\r
+test_url: http://www.heise.de/newsticker/meldung/Europa-soll-Grundrechteschutz-im-Netz-staerken-1392664.html
\ No newline at end of file
--- /dev/null
+body: //div[@id='article_holder']//div[@class='image'] | //div[@id='article_body']\r
+\r
+prune: no\r
+tidy: no\r
+\r
+test_url: http://hespress.com/videos/73684.html\r
+test_url: http://hespress.com/permalink/73678.html
\ No newline at end of file
--- /dev/null
+body: //div[@class='journal-entry-text']\r
+
+test_url: http://highscalability.com/blog/2011/3/14/6-lessons-from-dropbox-one-million-files-saved-every-15-minu.html
\ No newline at end of file
--- /dev/null
+body: //*[(@class = "historia")]
+test_url: http://hiperpop.com/2011/09/marc-anthony-celebra-su-cumpleanos-con-jennifer-lopez
\ No newline at end of file
--- /dev/null
+body: //div[@class = 'pd']\r
+strip: //div[@id = 'overzicht-albumrecensies']\r
+strip: //div[@id = 'jc']
+test_url: http://hiphopleeft.nl/index.php?option=com_content&view=article&id=2767:mark-ronson-record-collection&catid=66:m&Itemid=142
\ No newline at end of file
--- /dev/null
+body://div[@id = 'content']\r
+author://span[@class = 'authors']\r
+author://span[@class = 'ht-vtag'][1]\r
+date:substring-before(//meta[@name = 'dc.date']/@content,'T')\r
+strip://div[contains(@class, 'region-ubercontent')]\r
+strip://h1\r
+strip://div[@id = 'ht-author']\r
+strip://ul[@class = 'links inline'] \r
+strip://div[@id = 'ht-tools']\r
+test_url: http://www.historytoday.com/carol-dyhouse/skin-deep-fall-fur
\ No newline at end of file
--- /dev/null
+title: //*[@class='ptitle']\r
+date: //span[@class='date']\r
+body: //div[@class='body']\r
+prune: no
+test_url: http://hmercer.com/2011/07/why-i-switched-to-jekyll/
\ No newline at end of file
--- /dev/null
+body: //div[@id='entry-body']\r
+strip_id_or_class: paginate\r
+strip: //p[contains(., 'Additional Resources')]
+test_url: http://hometheaterreview.com/dreamvision-starlight-3-three-chip-d-ila-projector-reviewed/
\ No newline at end of file
--- /dev/null
+body: //table[@class='ap-smallphoto-table'] | //div[@class='body']//*[@class='entry-content']\r
+tidy: no\r
+strip_image_src: analytics.apnewsregistry\r
+\r
+test_url: http://hosted.ap.org/dynamic/stories/U/US_SPENDING_SHOWDOWN?SITE=FLPET&SECTION=HOME&TEMPLATE=DEFAULT&CTIME=2011-04-06-07-46-50
\ No newline at end of file
--- /dev/null
+prune: yes\r
+tidy: yes
+test_url: http://www.hs.fi/kotimaa/Teollisuushallin%20palo%20levitt%C3%A4%C3%A4%20vaarallista%20savua%20Tuusulassa/a1305571582405
\ No newline at end of file
--- /dev/null
+single_page_link: //iframe[@id='hootFrame']/@src\r
+\r
+test_url: http://ht.ly/bOiZV
\ No newline at end of file
--- /dev/null
+title: //meta[@property="og:title"]/@content\r
+body: //div[img[starts-with(@id, 'img_caption')]] | //div[@class="big_photo"] | //div[contains(@class, 'entry_body_text')]\r
+date: //meta[@name="publish_date"]/@content\r
+author: //a[@rel="author"]\r
+author: //meta[@name="author"]/@content\r
+prune: no\r
+tidy: no\r
+strip: //footer\r
+strip_id_or_class: ps-slideshow\r
+strip_id_or_class: fs-slideshow\r
+strip: //p[contains(., 'Related on HuffPost:')]\r
+# end early\r
+replace_string(<div class="sbm-main): </body></html><div class="not-interested \r
+\r
+test_url: http://www.huffingtonpost.com/mitch-moxley/tracking-beijings-boom-th_b_1209828.html\r
+test_url: http://www.huffingtonpost.com/2012/09/11/president-obama-iphone-throwdown_n_1873826.html
\ No newline at end of file
--- /dev/null
+title: //h3[@class="entry-header"]\r
+date: //h2[@class="date-header"]\r
+body: //div[contains(@class, 'entry')]\r
+
+test_url: http://www.humantransit.org/2012/06/can-network-primers-reduce-grief-about-network-design.html
\ No newline at end of file
--- /dev/null
+title: //div[@class='HaberDetayTitleHold Title']/h1\r
+body: //div[@id='YazarDetayText']\r
+author: //div[@class='HaberDetayTitleHold Title']/h1\r
+prune: no\r
+\r
+test_url: http://www.hurriyet.com.tr/ekonomi/19490260.asp\r
+test_url: http://www.hurriyet.com.tr/yazarlar/22078439.asp
\ No newline at end of file
--- /dev/null
+title: //div[@id='pg-content']//h1\r
+body: //div[@id='articleBody0']\r
+replace_string(</table>): </table><br /><br />\r
+\r
+single_page_link: //div[@class="up-header"]/a\r
+\r
+prune: no\r
+
+test_url: http://hvg.hu/w/20111125_sparta
\ No newline at end of file
--- /dev/null
+body: //div[@id='content']//div[contains(@class, 'wp-image-') or contains(@class, 'entry')][1]\r
+author: //span[@class='author']/a\r
+\r
+strip_id_or_class: disqus\r
+strip_id_or_class: paginator\r
+strip_id_or_class: photo-number\r
+\r
+prune: no\r
+\r
+test_url: http://hypebeast.com/2012/11/stussy-2012-fall-winter-november-releases/
\ No newline at end of file
--- /dev/null
+title: //a[@class='post_title']\r
+body: //div[@class='entrybox']\r
+strip_id_or_class: post_title\r
+date: //div[@class='entrybox']/b[1]\r
+strip: //div[@class='entrybox']/b[1]\r
+author: string('Maciej Cegłowski')
+test_url: http://idlewords.com/2011/08/why_arabic_is_terrific.htm
\ No newline at end of file
--- /dev/null
+author: substring-after(substring-after(//span[@class='submitted'],'- '),'- ')\r
+date: substring-before(//span[@class='submitted'], concat('- ',substring-after(substring-after(//span[@class='submitted'],'- '),'- ')))\r
+body: //div[@class='content clear-block zoneApple']\r
+
+test_url: http://www.igeneration.fr/iphone/l-iphone-et-l-ipad-chouchous-des-tpe-et-pme-55112
\ No newline at end of file
--- /dev/null
+title://h1[@class='page-title']\r
+body://*[@id='content']//div[contains(@class,'node-content')]\r
+\r
+author://*[@id='content']//div[contains(@class,'node-submitted')]/a\r
+\r
+date:substring-after(//div[contains(@class,'node-submitted')],' on ')
+test_url: http://ignoredbydinosaurs.com/2011/09/great-lie-lorem-ipsum
\ No newline at end of file
--- /dev/null
+# Get proper Title, Author and Date info\r
+title: substring-before(//title, '|')\r
+author: substring-after(//h4/a[@href='http://www.ilounge.com/index.php/ilounge/aboutus/'], 'By')\r
+date: //span[@class='instapaper_date']\r
+\r
+# For Reviews & First Looks, get the intro paragraph and put it in front of the main body.\r
+move_into(//div[@id='instapaper_para1']): //div[@id='instapaper_body']\r
+body: //div[@id='instapaper_para1']\r
+strip: //div[@class='reviewinfo']\r
+\r
+# We don't use footnotes, so why bother checking for them? \r
+footnotes: no
+test_url: http://www.ilounge.com/index.php/reviews/entry/luxa2-alum-x-for-iphone-4-4s/?utm_source=twitterfeed&utm_medium=twitter
\ No newline at end of file
--- /dev/null
+title: //div[@class='published visible e2-smart-title']//span\r
+author: //span[@id='e2-blog-title']\r
+date: //p[@class='super-h']\r
+body: //div[@class='text published visible']
+test_url: http://ilyabirman.ru/meanwhile/2011/11/15/2/
\ No newline at end of file
--- /dev/null
+author: substring-after(substring-before(//div[@id='byline'],'|'),'By')\r
+author: //div[@class='byline']/a\r
+date: //span[@class='pubdate']\r
+# print friendly page\r
+body: //div[@id='text']\r
+# regular page\r
+body: //div[@id= 'articlecontent']\r
+\r
+strip: //div[@id= 'articlecontent']/h1\r
+strip: //div[@id='articlecontent']/p[@class='deck']\r
+strip: //div[@id='articlecontent']/div[@class='byline']\r
+strip: //div[@id='articlespacer']\r
+strip: //div[@id='incsharebox']\r
+strip: //div[@id='articlesidebar']\r
+\r
+prune: no\r
+\r
+single_page_link: //a[contains(@href, 'Printer_Friendly.html')]\r
+strip: //a[contains(., 'Dig Deeper')]\r
+test_url: http://www.inc.com/guides/2010/11/seven-tips-for-lobbying-politicians.html\r
+test_url: http://www.inc.com/eric-schurenberg/startups-are-we-geting-irrationally-exuberant.html
\ No newline at end of file
--- /dev/null
+title: //meta[@property='og:title']/@content\r
+body: //div[contains(@class, 'articleContent')]\r
+date: //meta[@property='article:published_time']/@content\r
+author: //div[@id='main']//div[@class='byline']//span[@class='authorName']\r
+\r
+strip_id_or_class: RelatedArtTag\r
+\r
+tidy: no
+test_url: http://www.independent.co.uk/news/world/middle-east/syria-could-face-human-rights-probe-2274326.html
\ No newline at end of file
--- /dev/null
+body: //figure[@class='mainVideo']\r
+strip: //figcaption\r
+\r
+prune: no\r
+\r
+test_url: http://www.indiatimes.com/bollywood/kareena-insecure-about-saif-working-with-bipasha-23386.html
\ No newline at end of file
--- /dev/null
+title: //div[@class='weblogPost']/h3[1]\r
+author: ("Brent Simmons")\r
+date: //span[@class="weblogPostDisplayDate"]\r
+body: //div[@class='weblogPostBody']
+test_url: http://inessential.com/2011/10/25/why_just_store_the_app_data_on_dropbo
\ No newline at end of file
--- /dev/null
+title://h1\r
+body://div[@id='texto_link']\r
+
+test_url: http://info.abril.com.br/noticias/internet/filme-do-youtube-vai-estrear-nos-cinemas-22042011-6.shl
\ No newline at end of file
--- /dev/null
+body: //div[@id="intTranscript"]\r
+body: //div[@class="box-content"]\r
+title: //div[@class="box-content"]//h1[1]\r
+author: //p[@class="info"]/strong \r
+date: substring-before(substring-after(//p[@class="info"], "on"), "Length")\r
+strip: //div[@class="box-content"]//h1[1]\r
+strip: //div[@class="box-content"]//p[@class="info"]\r
+strip_id_or_class: vendor-content-box\r
+strip_id_or_class: tags2\r
+strip_id_or_class: instructions\r
+strip_id_or_class: comments\r
+strip_id_or_class: forum-list-tree\r
+strip: //div[@class="addthis_toolbox addthis_default_style"]
+test_url: http://www.infoq.com/interviews/oleg-zhurakousky-javaone2011-interview
\ No newline at end of file
--- /dev/null
+title: //div[@class='tituloInt']\r
+body: //div[@class='notaPortada']\r
+strip: //img[@id='imgHorizontalInt imgDetalleImg imagenNota']\r
+date: //span[@class='publi']\r
+author: //span[@class='autor']\r
+tidy: no\r
+prune: no\r
+
+test_url: http://www.informador.com.mx/tecnologia/2011/337606/6/iran-desarrolla-antivirus-tras-afectaciones-por-duqu.htm
\ No newline at end of file
--- /dev/null
+title: //meta[@property='og:title']/@content\r
+author: //*[@property='dc:creator']\r
+date: //*[@property='dc:date']/@content\r
+body: //div[@id='page-content']//div[contains(@class, 'article-body')]\r
+\r
+tidy: no
+test_url: http://www.information.dk/282307
\ No newline at end of file
--- /dev/null
+title://h1[@class="post_title"]\r
+body://article[@class="post"]\r
+date://h1[@class="section_separator"]\r
+author://span[@class="post_author"]\r
+strip://nav[@class="arrow_nav"]\r
+strip://section[@id="contact"]\r
+strip_id_or_class:post_title\r
+strip_id_or_class:post_author\r
+strip_id_or_class:section_separator
+test_url: http://informationarchitects.net/blog/nzz-relaunch-a-quick-review/
\ No newline at end of file
--- /dev/null
+title: //head/title\r
+body: //table[@id='table3']//div[@class='postContent']\r
+prune: no\r
+tidy: no\r
+\r
+test_url: http://www.informationclearinghouse.info/article28238.htm
\ No newline at end of file
--- /dev/null
+title: //div[@id='content']/h1\r
+body: //div[@id="content"]\r
+strip: //img[contains(@src, 'informit_printer.png')]\r
+single_page_link: //div[contains(@class, 'articleTools')]//a[contains(@href, '/printerfriendly.')]\r
+prune: no\r
+\r
+test_url: http://www.informit.com/articles/article.aspx?p=1729268
\ No newline at end of file
--- /dev/null
+body: //div[@id='main_text']\r
+title: //div[@id='main_text']/h1\r
+strip: //div[@id='main_text']/h1\r
+strip: //div[@id='main_text']/h2\r
+strip_id_or_class: tools\r
+strip_id_or_class: articleTools\r
+strip_id_or_class: pagination\r
+strip_id_or_class: byline\r
+strip_id_or_class: tweet\r
+date: //div[@class='date']\r
+strip: //div[@class='date']
+test_url: http://www.infoworld.com/d/the-industry-standard/it-jobs-the-rise-both-offshore-and-in-us-187689
\ No newline at end of file
--- /dev/null
+# This filter is tested on:\r
+# http://www.infzm.com/content/71068\r
+# http://www.infzm.com/content/41577\r
+\r
+author://em[contains(@class, 'toAuthor')]\r
+date:substring(//em[contains(@class, 'pubTime')],1)\r
+body://section[contains(@id, 'articleContent')]\r
+title://h1[contains(@class ,'articleHeadline clearfix')]
+test_url: http://www.infzm.com/content/41577
\ No newline at end of file
--- /dev/null
+# set body\r
+body: //div[@class='post-listing']\r
+\r
+# remove clutter\r
+strip: //a/big\r
+strip: //a/em\r
+strip: //p/em
+test_url: http://inhabitat.com/2010/11/18/sliding-walls-transform-this-tokyo-house-into-an-office/
\ No newline at end of file
--- /dev/null
+title: //div[@class='caption']\r
+author: //p[@class='username']\r
+\r
+strip: //div[@class='contents']/h3\r
+strip: //div[@class='location']
+test_url: http://instagr.am/p/G-s_aciyDJ/
\ No newline at end of file
--- /dev/null
+body: //div[@id='content']
+test_url: http://www.interest.co.nz/opinion/opinion-when-our-fear-corporate-way-and-our-love-small-business-man-dangerous-thing
\ No newline at end of file
--- /dev/null
+body: //center/table
+test_url: http://www.iolanguage.com/scm/io/docs/IoGuide.html
\ No newline at end of file
--- /dev/null
+body: //div[@id = 'post']\r
+strip: //div[@class = 'postinfo']\r
+strip: //div[@id = 'postmetanew']\r
+strip: //div[@class = 'paginator']\r
+strip: //div[@class = 'col-2']\r
+strip: //div[@id = 'adfactor-label']
+test_url: http://www.ipadclub.nl/15808/text-writer-ipad-tekstverwerker-met-functieknoppen/
\ No newline at end of file
--- /dev/null
+body: //div[@id = 'post']\r
+strip: //div[@class = 'postinfo']\r
+strip: //div[@id = 'postmetanew']\r
+strip: //div[@class = 'paginator']\r
+strip: //div[@class = 'col-2']\r
+strip: //div[@id = 'adfactor-label']
+test_url: http://www.ipadplanet.nl/11723/steve-jobs-bevestigt-verdwijnen-fysieke-rotatieschakelaar-in-ios-4-2/
\ No newline at end of file
--- /dev/null
+body: //div[@id = 'post']\r
+strip: //div[@class = 'postinfo']\r
+strip: //div[@id = 'postmetanew']\r
+strip: //div[@class = 'paginator']\r
+strip: //div[@class = 'col-2']\r
+strip: //div[@id = 'adfactor-label']\r
+test_url: http://www.iphoneclub.nl/105808/t-mobile-mobiel-internet-wordt-duurder-maar-blijft-onbeperkt/
\ No newline at end of file
--- /dev/null
+# Open Graph tags are normally declared with property="og:title"; the name=\r
+# form is kept as well so pages that matched the old rule still match.\r
+title: //meta[@property='og:title' or @name='og:title']/@content\r
+body: //small[@class='postmetadata'] | //div[contains(@class, 'entry-content')]\r
+\r
+strip: //span[@vanilla-identifier]\r
+\r
+prune: no\r
+tidy: no\r
+\r
+test_url: http://www.iphonehacks.com/2012/07/app-review-process-behind-the-scenes.html
\ No newline at end of file
--- /dev/null
+body: //div[@id='content']//div[@class='entry-banner' or @class='entry-content']
+test_url: http://www.iplaysoft.com/webbrowserpassview.html
\ No newline at end of file
--- /dev/null
+# Remove social buttons\r
+strip: //div[@id='temp_Content_Right']\r
+\r
+# Remove duplicate article title\r
+strip: //*[(@class='storytitle')]
+test_url: http://isource.com/2010/10/24/swearch-a-cool-iphone-web-app/
\ No newline at end of file
--- /dev/null
+author: //p[@class = 'writer']\r
+\r
+date: //p[@class = 'published-time']\r
+\r
+body: //div[@class = 'text main']
+test_url: http://www.itavisen.no/899786/old-republic-blir-gratis
\ No newline at end of file
--- /dev/null
+title: //h1[@class="entry-title"]\r
+body: //div[@class='format_text entry-content']\r
+author: //span[@class="author vcard"]/a\r
+date: //abbr[@class="published"]\r
+\r
+strip_id_or_class: related-posts\r
+strip_id_or_class: membershipbox\r
+strip_id_or_class: share_this_compact_bt\r
+\r
+\r
+footnotes: no
+test_url: http://www.itstactical.com/warcom/knives/exclusive-triple-aught-design-production-dauntless-knife-video-walkthrough/
\ No newline at end of file
--- /dev/null
+title: //*[@id="article-title"]\r
+author: //*[@id="article-info"]/strong\r
+date: //*[@class="article-dateline"]/strong\r
+body: //*[@id="article-content"]
+test_url: http://www.itworld.com/open-source/140916/android-sued-microsoft-not-linux
\ No newline at end of file
--- /dev/null
+body: //div[starts-with(@id, 'news-id-')]\r
+prune: no\r
+\r
+test_url: http://izismile.com/2011/06/13/uncanny_factoid_fashion_or_creepy_2_pics.html
\ No newline at end of file
--- /dev/null
+author: //span[@class='plus-icon']
+test_url: http://jalopnik.com/5892124/1955-porsche-550-spyder-sells-for-record-3685-million/
\ No newline at end of file
--- /dev/null
+body: //div[@id='content']//div[@class = 'post f']\r
+strip_id_or_class: comment-big\r
+strip_id_or_class: avatar\r
+strip: //div[@class='time_s']\r
+
+test_url: http://jandan.net/2011/04/03/iphone-5-sony.html
\ No newline at end of file
--- /dev/null
+title: //h1\r
+author: //p[contains(@class, 'author')]/a\r
+date: //p[contains(@class, 'time')]\r
+body: //div[@class='content']/div[contains(@class, 'text')]\r
+\r
+# prevent "no text" errors on multi-page articles\r
+tidy: no\r
+\r
+# we use a custom next-link detector instead of the print view because\r
+# it's pretty hard to strip out the unwanted parts in the print view\r
+autodetect_next_page: no\r
+next_page_link: //div[contains(@class, 'text')]/div/div[contains(@class, 'paging')]/a[@class='more ']\r
+\r
+strip: //h1\r
+\r
+strip_id_or_class: meta\r
+strip_id_or_class: author\r
+strip_id_or_class: paging\r
+\r
+# prevent "Report an Error" from being recognized as footnote\r
+footnotes: no
+test_url: http://jetzt.sueddeutsche.de/texte/anzeigen/544308/Alles-flicken
\ No newline at end of file
--- /dev/null
+body: //div[@class='entry']\r
+prune: no\r
+
+test_url: http://www.jjahnke.net/rundbr87.html#2514
\ No newline at end of file
--- /dev/null
+body: //div[@id='formatCont_en']\r
+\r
+prune: no\r
+\r
+test_url: http://www.jobbank.gc.ca/detail-eng.aspx?Source=JobPosting&OrderNum=6397922
\ No newline at end of file
--- /dev/null
+# Works with old posts too, such as http://www.joelonsoftware.com/articles/fog0000000332.html\r
+\r
+author: substring-after(//div[@class="author"], 'by ')\r
+date: //div[@class="date"]\r
+\r
+## Clean stuff at top ##\r
+\r
+strip: //h1[1]\r
+strip: //h2[1]\r
+strip: //div[@class="date"]\r
+strip: //div[@class="author"]\r
+\r
+## Clean stuff at bottom ##\r
+\r
+strip: //blockquote[@class="textmessage"]\r
+strip: //div[@style="width:500px"]/p[last()]\r
+strip: //div[@style="width:500px"]/p[last()-1]\r
+strip: //div[@style="width:500px"]/h4[last()]\r
+strip: //div[@style="width:500px"]/h4[last()-1]\r
+strip: //div[@style="width:500px"]/div[last()]
+test_url: http://www.joelonsoftware.com/items/2011/09/15.html
\ No newline at end of file
--- /dev/null
+author: //h1\r
+date: //p[contains(@class,'date')]
+test_url: http://jouire.com/2011/01/exquisite-whispers/
\ No newline at end of file
--- /dev/null
+author: //a[@class="byline-author"]\r
+title: //h1[@class="headline"]\r
+strip: //div[@id="info-card"]\r
+strip: //div[@id="breaking-news"]\r
+strip: //div[@class="rmod list-post-mod"]\r
+strip: //div[@id="footer"]\r
+strip: //div[@id="GH_strip"]
+test_url: http://www.joystiq.com/2012/06/20/magic-the-gathering-duels-of-the-planeswalkers-2013-review/
\ No newline at end of file
--- /dev/null
+body: //div[@id='article_container']\r
+author: //h4//a[@class='author']\r
+title: //h1\r
+\r
+replace_string(lang="en"): lang="de"\r
+replace_string(/>1</a>):/></a>\r
+\r
+strip_id_or_class: share_toolbox\r
+strip_id_or_class: article_header\r
+strip_id_or_class: phototext\r
+\r
+strip_image_src: icon_author.gif\r
+\r
+strip: //img[@src='']\r
+strip: //h4[@id='author']\r
+\r
+prune: no\r
+\r
+test_url: http://www.juedische-allgemeine.de/article/view/id/13366
\ No newline at end of file
--- /dev/null
+convert_double_br_tags: yes\r
+\r
+title: //div[@id="storycredits"]/p/span[@class="title"]\r
+author: //div[@id="storycredits"]/p/br[1]/following-sibling::text()\r
+\r
+strip: //div[@id="storycredits"]\r
+
+test_url: http://www.juppy.org/santa/stories.php?ForAuthorID=35&Year=2005
\ No newline at end of file
--- /dev/null
+body: //div[contains(@class, 'inner_content')]\r
+
+test_url: http://kachestvo.ru/promtovar/odezhda/denim.html
\ No newline at end of file
--- /dev/null
+# Ads\r
+strip: //table[@align="right"][@width="120"]\r
+\r
+# Affiliate link paragraphs\r
+strip: //a[.="Adorama"]/parent::p[contains(., "goodies")]\r
+strip: //a[.="Adorama"]/parent::p[contains(., "This free website's biggest source of")]
+test_url: http://www.kenrockwell.com/tech/composition.htm
\ No newline at end of file
--- /dev/null
+# set body\r
+body: //div[@id='ovArtikel']\r
+\r
+# set title\r
+title: //div[@id='ovArtikel']/h1\r
+# strip main title and leave sub title\r
+strip: //div[@id='ovArtikel']/h1\r
+\r
+date: //div[@class='publicdate']\r
+\r
+#remove captions\r
+strip: //*/div[@class='bu']\r
+strip: //*/div[@class='credit']\r
+\r
+#remove ads\r
+strip: //*/div[@class='ad-head']\r
+strip: //*/div[@class='linksebay']\r
+\r
+# remove video content\r
+strip: //*/div[@class='ovVideo']
+test_url: http://www.kicker.de/news/fussball/frauen/wmfr/frauen-weltmeisterschaft/2011/3/1123662/spielbericht_frankreich-frauen_deutschland-frauen.html
\ No newline at end of file
--- /dev/null
+title: //h1[@id='name']\r
+body: //*[@id='leftcol']\r
+\r
+strip_id_or_class: 'share-box'\r
+strip_id_or_class: 'project-faqs'\r
+strip_id_or_class: 'report-issue-wrap'
+test_url: http://www.kickstarter.com/projects/hop/elevation-dock-the-best-dock-for-iphone
\ No newline at end of file
--- /dev/null
+title: //div[@class='post']/h2\r
+body: //div[@class='entry']\r
+strip: //p[contains(.,'Tags:')]
+test_url: http://www.kingarthurflour.com/blog/2011/01/28/a-big-sandwich-for-the-big-game/
\ No newline at end of file
--- /dev/null
+author: //span[@class="plus-icon"]
+test_url: http://kotaku.com/5920211/save-the-furries-on-your-wii-in-this-weeks-nintendo-download
\ No newline at end of file
--- /dev/null
+title: //h2\r
+author: //*[@id='main']/div/a[1]\r
+date: substring-before(substring-after(//div[@class='meta'],'•'),'•')\r
+body: //div[@id='main']\r
+strip: //div[@class='meta']\r
+test_url: http://kottke.org/08/02/king-of-kong-a-fistful-of-quarters
\ No newline at end of file
--- /dev/null
+body: //div[@class = "entry-full"]\r
+
+test_url: http://www.kumailplus.com/2011/12/02/24308
\ No newline at end of file
--- /dev/null
+title: //div[@id='centrediv']/h1\r
+\r
+author: substring-after(//div[@id='centrediv']/h3,'By: ')\r
+\r
+date: substring-after(substring-before(//div[@id='centrediv']/h3,'By: '),'Filed: ')\r
+\r
+body: //div[@class='KonaBody']\r
+\r
+convert_double_br_tags: yes
+test_url: http://www.kumb.com/story.php?id=126084
\ No newline at end of file
--- /dev/null
+date: //span[@class='datum']\r
+title: //div[@class='artikel']/h2\r
+body: //div[@class='entry']\r
+strip: //p[@class='tags']\r
+author: substring-after(//div[@class='authorinfo']/em,'Dies ist ein Artikel von ')\r
+strip: //div[@class='authorinfo']\r
+strip: //div[@class='authorpic']\r
+
+test_url: http://kwerfeldein.de/index.php/2011/10/17/doppelbelichtungen-mit-konzept/
\ No newline at end of file
--- /dev/null
+title: //h1[@class='headline']\r
+body: //div[@class='article']\r
+strip: //div[@class='article']//h3[contains(@class, 'section')]\r
+strip: //div[@class='article']//ul[contains(@class, 'article-actions')]\r
+strip: //div[@id='syndication-upper']\r
+strip: //a[@id='syndication']\r
+strip: //dl[@id='article-tags']\r
+strip: //div[@id='article-like']\r
+prune: no\r
+\r
+single_page_link: //li[@class='single-page']/a\r
+\r
+test_url: http://www.laphamsquarterly.org/essays/balanced-diets.php
\ No newline at end of file
--- /dev/null
+tidy: no\r
+
+test_url: http://www.laprensagrafica.com/opinion/editorial/229252-reflexiones-sobre-la-educacion-que-necesitamos.html
\ No newline at end of file
--- /dev/null
+body: //div[@id='content-content']//div[@class='content']\r
+title: //h1[@class='title']\r
+date: substring-after(//*[@class='submitted'],'Submitted on')\r
+tidy: no\r
+strip: //div[@class='terms terms-inline']\r
+strip: //div[@class='more']\r
+strip: //div[@class='share-links']\r
+strip: //table[@id='attachments']\r
+\r
+test_url: http://www.laquadrature.net/en/finalization-of-eu-parliaments-weak-net-neutrality-resolution
\ No newline at end of file
--- /dev/null
+#meta data\r
+# Title is the part of <title> after the '|'. Uses //title (as the author rule\r
+# below does) so the lookup does not depend on the evaluation context node.\r
+title:substring-after(//title,'|')\r
+\r
+# Author is taken from the meta description: the text between the page title\r
+# and the word ' respond '.\r
+author:substring-before( substring-after(//meta[@name = 'description']/@content, normalize-space(substring-after(//title,'|'))),' respond ')\r
+date://h5[@class = 'postDate']\r
+\r
+#text\r
+body://div[@class = 'articleBody']\r
+\r
+#clean up\r
+strip://center
+test_url: http://lareviewofbooks.org/post/14066007115/literary-transactions-and-their-vicissitudes
\ No newline at end of file
--- /dev/null
+strip: //div[@id="tugs_story_display"]\r
+strip: //div[@id="search_overlay"]\r
+strip: //div[@id="adv_search"]\r
+body: //div[@class='story']\r
+tidy: no\r
+convert_double_br_tags: yes\r
+single_page_link: //a[contains(@href, ',print.')]\r
+strip: //p[starts-with(., 'latimes.com')]\r
+strip: //h1[starts-with(., 'latimes.com')]\r
+strip_id_or_class: cubead
+test_url: http://www.latimes.com/news/opinion/commentary/la-oe-gartonash-wilders-20110512,0,2876761.story
\ No newline at end of file
--- /dev/null
+title: //h1[@class='entry-title']\r
+body: //div[@class='entry-content']
+test_url: http://laughingsquid.com/mysterious-tiny-doors-appearing-around-san-francisco/
\ No newline at end of file
--- /dev/null
+title: //div[@id="content"]/h1[1]\r
+date: substring-before(//p[@class="postdate"], ' at ')\r
+author: ("Dr. Drang")\r
+\r
+strip: //div[@id="content"]/h1[1]\r
+strip: //p[@class="postdate"]\r
+strip: //h2[@id="respond"]\r
+strip: //blockquote[@class="bbpTweet"]/p/span/a/img
+test_url: http://www.leancrew.com/all-this/2011/12/more-shell-less-egg/
\ No newline at end of file
--- /dev/null
+title: //meta[@name='title']/@content\r
+author: //span[@class='sign']//a[@class='journaliste']\r
+author: //meta[@name='author']/@content\r
+body: //*[@id='article']/div[@class='photo'] | //*[@id='article']/h2 | //*[@id='article']/div[@class='texte']\r
+date: //time[@pubdate]/@datetime\r
+prune: no\r
+test_url: http://www.lefigaro.fr/environnement/2011/11/10/01029-20111110ARTFIG00801-la-chine-confrontee-a-un-immense-defi-ecologique.php\r
+test_url: http://www.lefigaro.fr/conjoncture/2012/11/20/20002-20121120ARTFIG00609-l-usager-devrait-payer-plus-pour-financer-les-transports.php
\ No newline at end of file
--- /dev/null
+title: //h1\r
+\r
+# they have a single component containing both author and date\r
+#author: //p[@class='source']\r
+#date: //p[@class='source']\r
+\r
+body: //div[@class='contenu_article']\r
+#Shoot the insane "conjugaison.lemonde.fr" links :\r
+strip: //a[contains(@class, 'listLink')]\r
+\r
+prune: no\r
+\r
+test_url: http://www.lemonde.fr/economie/article/2011/07/05/moody-s-abaisse-la-note-du-portugal-de-quatre-crans_1545237_3234.html
\ No newline at end of file
--- /dev/null
+title: //h1/following::span[@class='fn']\r
+# Author: extraction should stop at the first <br>, but I don't know how to express this.\r
+author: //following::div[@class='PDate2']\r
+date: //following::div[@class='PDate2']/strong\r
+\r
+body: //div[@class='ArTexte']\r
+body: //div[@id='prod_txt_b']\r
+body: //div[@class='ArPhotoP']\r
+test_url: http://www.lesnumeriques.com/disque-dur-multimedia/popcorn-hour-300-p12231/test.html
\ No newline at end of file
--- /dev/null
+title: //h2\r
+strip_image_src: logo.gif
+test_url: http://www.letemps.ch/Facet/print/Uuid/7c9f912c-07c9-11e0-9b50-4d96c9eca37f
\ No newline at end of file
--- /dev/null
+title: //h2[@class="entry-title"]\r
+body: //div[@class="entry-content"]
+test_url: http://www.lifeandculture.fr/digital/facebook-and-the-epiphanator-an-end-to-endings/
\ No newline at end of file
--- /dev/null
+# Adds author text: Gawker sites commonly show as "Author: View Profile"\r
+author://a[@class="plus-icon modfont"]\r
+\r
+# Add date and time\r
+date: //span[@class="date"]\r
+\r
+# Remove date and time from article text\r
+strip: //span[@class="date"]\r
+\r
+# Remove login/comment text\r
+strip: //*[(@class="presence_control_external smalltype")]\r
+\r
+strip: //div[@class="nodebyline modfont"]\r
+\r
+# Remove right sidebar\r
+strip: //div[@id="rightwrapper"]\r
+\r
+# Remove print header\r
+strip: //div[@id='printhead']/h1\r
+\r
+# Remove 'content is restricted'\r
+strip: //div[@id='agegate_IDHERE']\r
+\r
+# Remove follow text\r
+strip: //*[(@class="permalink_ads")]\r
+\r
+# Remove view/comment count\r
+strip: //div[@id='wrapper']/div[2][@class='postmeta_permalink_wrapper']/div[1][@class='postmeta_permalink']/div[2][@class='pm_line']\r
+\r
+# Remove contact text\r
+strip: //div[@id='wrapper']/div[1][@class='content permalink']/p[6][@class='contactinfo']\r
+\r
+# Remove medium duplicates of the article image\r
+strip_image_src: medium.jpg\r
+\r
+# Remove "arrow" class at bottom of page\r
+strip: //p[@class="arrow"]\r
+\r
+# Remove "track" image from article body\r
+strip: //img[@alt="track"]\r
+test_url: http://lifehacker.com/5925801/how-can-i-turn-vague-goals-into-actionable-to+dos\r
+test_url: http://lifehacker.com/5941600/hack-an-old-computer-mouse-into-a-wireless-bluetooth-mouse
\ No newline at end of file
--- /dev/null
+single_page_link: //ul[@class='util-nav']//a[@class='close']
+test_url: http://www.linkedin.com/news?actionBar=&articleID=894735221&ids=0Rdj4Qe3wQejwIczAOc3sRdzwUb3wScPoPdzkVe2MNcz8RcPsQejwIcPASdjwTcjwU&aag=true&freq=weekly
\ No newline at end of file
--- /dev/null
+single_page_link: //div[@class="post"]/div[@class="title"]/a\r
+
+test_url: http://longform.org/2011/05/06/disconcerting-new-answers-in-models-suicide/
\ No newline at end of file
--- /dev/null
+body: //div[@class='container_16']//div[@class='grid_11']\r
+strip: //h2[@class='mast']\r
+strip: //div[@class='container_16']//div[@class='grid_11']/h1\r
+strip: //div[@class='container_16']//div[@class='grid_11']/p[1]\r
+strip: //div[@class='container_16']//div[@class='grid_11']/div\r
+author: //a[starts-with(@title, 'Posts by')]\r
+date: substring-before(substring-after(//time, 'Posted on '), ' at')\r
+test_url: http://www.loopinsight.com/2012/09/13/forget-iphone-5-naysayers-this-thing-is-big/\r
+test_url: http://www.loopinsight.com/2011/05/20/playbook-returns-high-misses-sales-targets-by-90/
\ No newline at end of file
--- /dev/null
+prune: no\r
+convert_double_br_tags: yes
+test_url: http://www.lostgarden.com/2012/04/loops-and-arcs.html
\ No newline at end of file
--- /dev/null
+title: substring-before(//title, ' · LRB')\r
+\r
+body: //div[@class="article-body indent"]\r
+\r
+date: substring-after(//p[@class="meta-info"]/a, '· ')\r
+\r
+prune: no
+test_url: http://www.lrb.co.uk/v33/n18/james-meek/its-already-happened
\ No newline at end of file
--- /dev/null
+title: //h2\r
+\r
+body: // div[@id='content']\r
+\r
+strip: //div[@class='sidebar_wrapper']
+test_url: http://www.luminous-landscape.com/tutorials/optimizing_exposure.shtml
\ No newline at end of file
--- /dev/null
+title: //div[@class="story-body"]/div[@class="story-inner"]/h1\r
+body: //div[@class="story-body"]\r
+date: //p[@class='date']/strong\r
+author: substring-after(//div[@class="story-inner"]/div[@class="byline"]//span[@class='name'], 'By')\r
+\r
+strip: //div[@class="story-inner"]/div[@class="byline"]\r
+
+test_url: http://m.bbc.co.uk/news/science-environment-19144464
\ No newline at end of file
--- /dev/null
+title: //p[@class="txhead"]\r
+author: //div[@class='txb']\r
+wrap_in(p): //div[@class='para']\r
+# NOTE(review): the [substring(., 14)] predicate only keeps text nodes longer\r
+# than 13 characters; it does not trim the first 13 characters from the\r
+# result. If trimming a prefix was the intent, this needs confirming.\r
+date: //div[@class='txb']/following-sibling::p/text()[substring(., 14)]\r
+strip: //table[@class="tlogo"]\r
+strip: //div[@class="cookieText"]\r
+strip: //*[@class="sltb"]\r
+strip: //*[@class="ijobs-x-link"]\r
+strip: //*[@class="sponscolour"]\r
+strip: //*[@class="sponsouter"]\r
+# Removes everything that follows the bottom navigation block.\r
+strip: //div[@id="bottom-nav-block"]/following::*\r
+test_url: http://m.guardian.co.uk/ms/p/gnm/op/s3OOwgO3yIhGuj41C1_S3Xg/view.m?id=15&gid=world/2012/jul/26/arctic-climate-change&cat=top-stories
\ No newline at end of file
--- /dev/null
+author: substring-after(//div[@class='author'],'Par ')\r
+date: //div[@class='date']\r
+body: //div[@class='content']\r
+
+test_url: http://www.mac4ever.com/news/64182/icloud_les_prix_en_euros_et_en_chf/
\ No newline at end of file
--- /dev/null
+title: substring-before(//title,' « Macdrifter')
+test_url: http://www.macdrifter.com/2012/03/instacast-on-my-mac/
\ No newline at end of file
--- /dev/null
+# Remove news feed\r
+strip: //div[@id='news_feed_front']\r
+\r
+# Remove pull quote\r
+strip: //div[@class='field field-type-text field-field-pull-quote']\r
+\r
+# Remove login\r
+strip: //div[@class='right_bar_login']
+test_url: http://macformat.techradar.com/blog/solid-state-storage-bringing-parity-back-mac-29-10-10&article=89189666
\ No newline at end of file
--- /dev/null
+author: substring-before(substring-after(//div[@class='dateNews'],'par '),' le')\r
+date: substring-after(//div[@class='dateNews'],' le ')\r
+body: //div[@class='singleNews zoneApple']\r
+
+test_url: http://www.macgeneration.com/news/voir/211162/dropbox-encore-un-mac-et-deux-comptes-dropbox
\ No newline at end of file
--- /dev/null
+# Remove sliders\r
+strip: //*[(@class="slides_container")]\r
+strip: //div[(@id="slides_two")]\r
+\r
+# Remove tag cloud\r
+strip: //span[(@class="secao")]\r
+\r
+# Fix date article\r
+# TODO\r
+\r
+# Remove other stuff\r
+strip: //div[(@id="idc-container")]\r
+strip: //div[(@id="idc-noscript")]\r
+strip: //div[(@class="linkwithin_div")]\r
+strip: //div[(@class="navPosts")]\r
+strip: //div[(@id="lateral")]\r
+strip: //div[(@id="autor")]\r
+strip: //div[(@id="rodape")]\r
+strip: //div[(@id="post")]/h1\r
+strip: //div[(@id="post")]/div[(@id="boxInformacoes")]
+test_url: http://macmagazine.com.br/2011/08/01/skype-para-ipad-esta-finalmente-chegando-a-app-store/
\ No newline at end of file
--- /dev/null
+author: substring-after(//div[@class='byline'], " by ")\r
+date: substring-before(//div[@class='byline'], " by ")\r
+\r
+# set body\r
+body: //div[@class='content']\r
+\r
+# set title\r
+title: //h3\r
+#strip: //div[@class='content']/h3
+test_url: http://www.macrumors.com/2010/11/10/apple-debuts-new-apple-tv-and-itunes-movie-content-in-japan/
\ No newline at end of file
--- /dev/null
+strip: //*[(@id = "featured")]\r
+\r
+author:substring-after( //div[@class='posttitle']/h2[@class='author'],'by ')\r
+\r
+date: concat(//div[@class='month'],' ',//div[@class='day'])\r
+\r
+#macstories doesn't provide a year, but month/day is better than nothing\r
+test_url: http://www.macstories.net/news/instapaper-4-0-available-completely-redesigned-ipad-ui-new-features-search-subscription/
\ No newline at end of file
--- /dev/null
+author://div[@class="article_username_container_full"]\r
+date://div[@class="article_username_container"]\r
+body://div[@class="article cms_clear restore postcontainer"]
+test_url: http://www.mactalk.com.au/content/chat-basil-shkara-developer-taptax-2452/
\ No newline at end of file
--- /dev/null
+title: substring-after(substring-after(//title, '>'), '>')\r
+body: //div[@class='NewsArticleContent']
+test_url: http://www.mactechnews.de/news/index/Apple-Pressekonferenz-zum-iPhone-4-147316.html
\ No newline at end of file
--- /dev/null
+title: //article//h1\r
+date: //meta[@name="date"]/@content\r
+author: //div[@class="author-name" or @class="article-byline"]/a[1]\r
+\r
+body: //section[@class="page"]\r
+\r
+# remove 'From the Lab' and 'Recent posts' text\r
+strip: //div[@class='blogLabel']\r
+\r
+# remove byline and meta info\r
+strip: //div[@class="article-meta"]\r
+strip: //div[@class="author-info"]\r
+\r
+#strip tags and categories\r
+strip: //div[@class="department"]\r
+\r
+#strip product cap links\r
+strip: //div[@class="cap-main"]\r
+strip: //div[@id="compare-lede"]\r
+\r
+prune: no\r
+\r
+# copes less well with Review pages, seems fine for News\r
+test_url: http://www.macworld.com/article/163184/2011/10/the_ipod_as_an_iconic_cultural_force.html
\ No newline at end of file
--- /dev/null
+body: //div[@class='NewsArticle']\r
+
+test_url: http://mainichi.jp/select/weathernews/20110311/news/20110520k0000e040062000c.html
\ No newline at end of file
--- /dev/null
+title: substring-before(//title, '|')\r
+body: //*[@id='content-left']\r
+\r
+# Why is this not working here?\r
+# body: //*[@id='content-left']/div[@class='content-container'][2]/div[@class='content-body']/div[@class='inner-container']/div[@class='detail']\r
+\r
+\r
+#Header\r
+strip_id_or_class: 'subHead'\r
+strip_id_or_class: 'fl_right'\r
+strip_id_or_class: 'infolink'\r
+strip_id_or_class: 'content-head'\r
+strip_id_or_class: 'tab'\r
+strip_id_or_class: 'tab-active'\r
+strip: //*[contains(@class,'trenner')]\r
+\r
+# Headline\r
+strip: //h1/*\r
+strip_id_or_class: 'font16'\r
+\r
+#Images\r
+strip_id_or_class: 'leftimage'\r
+strip_id_or_class: 'rightimage'\r
+\r
+#Comments\r
+strip: //table\r
+# NOTE(review): XPath positions are 1-based, so the [0] predicate below never\r
+# matches and this rule currently strips nothing. Confirm the intended\r
+# position before changing it; [1] would remove the element after every <p>.\r
+strip: //p/following-sibling::*[0]
+test_url: http://www.mainpost.de/ueberregional/meinung/Dioxin-Skandal-bringt-Agrarministerin-in-Bedraengnis;art9517,5920211
\ No newline at end of file
--- /dev/null
+tidy: no
+
+test_url: http://www.makeuseof.com/dir/kindle-it-web-pages-kindle-friendly/
\ No newline at end of file
--- /dev/null
+tidy: no\r
+prune: no\r
+date: //article//time[@pubdate]\r
+title: //article/header/h2\r
+body: //article\r
+strip: //header\r
+test_url: http://www.marco.org/2012/09/08/businessweek-gruber\r
+test_url: http://www.marco.org/2012/04/24/might-upgrade-someday
\ No newline at end of file
--- /dev/null
+strip_id_or_class: wwsgd
+test_url: http://www.marksdailyapple.com/are-detoxes-and-cleanses-safe-and-effective/
\ No newline at end of file
--- /dev/null
+date: //div[@id="main"]/p[@class="date"]\r
+author: string("Martin Fowler")\r
+body: //div[@id="main"]\r
+strip_id_or_class: date\r
+strip_id_or_class: tags\r
+strip_id_or_class: tagLabel\r
+strip: //div[@id="main"]/h1[1]
+test_url: http://martinfowler.com/bliki/DatabaseThaw.html
\ No newline at end of file
--- /dev/null
+title: //header[@class='entry-title']/h1
+body: //div[@class='description']
+strip: //div[@class='ytm-gallery-box']
+test_url: http://mashable.com/2011/12/05/india-wants-google-and-facebook-to-censor-user-content/
\ No newline at end of file
--- /dev/null
+date: //*[@class = 'published']
+test_url: http://www.mattcutts.com/blog/internet-censorship-sopa/
\ No newline at end of file
--- /dev/null
+# Article body container. The class test must use the attribute axis (@class);
+# a bare `class` step would look for a child element named <class>.
+body: //div[@class="frett-main"]
+test_url: http://mbl.is/frettir/innlent/2012/02/21/litill_munur_a_fargjaldaverdi/
\ No newline at end of file
--- /dev/null
+strip: //div[contains(@class, 'article-tools')]\r
+test_url: http://www.medialens.org/index.php/alerts/alert-archive/2012/713-the-illusion-of-democracy.html
\ No newline at end of file
--- /dev/null
+# need to find a way to eliminate <span> content for "related content" without eliminating important content\r
+\r
+# Boolean directives take a bare yes/no, as in the other site configs.\r
+convert_double_br_tags: yes\r
+#body: //div[@id='leftside']\r
+title: //h1\r
+title: //h2\r
+# Directive names are lowercase, as in the other site configs.\r
+# Two patterns: one for an <h4> containing 'By ', one for 'By: '.\r
+author: substring-after(//h4, 'By ')\r
+author: substring-after(//h4, 'By: ')\r
+#Strip: //span\r
+strip_id_or_class: morefromcat\r
+strip_id_or_class: mostpopular\r
+strip_id_or_class: articlepagination\r
+strip_id_or_class: toolbar\r
+body: //div[@id='zmodcontent']\r
+single_page_link: //li[@class='onepage'] //a[contains (@href, 'printer.php')]\r
+test_url: http://www.menshealth.com/mhlists/pursuit_of_happiness/index.php
\ No newline at end of file
--- /dev/null
+title: //div[@class="blogtitle"]\r
+strip: //div[@class="blogtitle"]\r
+\r
+author: substring-after(//span[@class="blogheader"], 'Author: ')
+test_url: http://www.mikeash.com/pyblog/friday-qa-2012-01-13-the-mac-toolbox.html
\ No newline at end of file
--- /dev/null
+title: //div[@class='post_content']/h2\r
+date: //div[@class='dateline']\r
+body: //div[@class='entry']\r
+\r
+strip: //div[@class='closer']\r
+strip: //div[@class='navigation']\r
+strip: //div[@class='aux_pane']\r
+strip: //div[@class='aux_aux_pane']
+test_url: http://www.mikeindustries.com/blog/archive/2011/10/never-be-another
\ No newline at end of file
--- /dev/null
+title: //*[@class="article"]/h1\r
+date: //*[@class="article"]/div[@class="date"]\r
+\r
+# strip the title and date from the article text\r
+strip: //*[@class="article"]/h1\r
+strip: //*[@class="article"]/div[@class="date"]\r
+\r
+# strip annoying <br> between metadata and article\r
+strip: //*[@class="article"]/div[@class="date"]/following-sibling::br
+test_url: http://minnesota.publicradio.org/display/web/2012/06/19/health/senators-want-health-care-ruling-on-tv/
\ No newline at end of file
--- /dev/null
+title: //*[@id="content-header"]/h1\r
+author: //*[contains(@class, 'byline')]/a/text()\r
+date: substring-after(//*[contains(@class, 'byline')]/text()[2], '|')\r
+body: //*[contains(@class, 'node-body')]
+test_url: http://www.minnpost.com/eric-black-ink/2012/06/overturning-obamacare-would-be-game-changer-supreme-court
\ No newline at end of file
--- /dev/null
+# Remove extra links\r
+strip: //*[@class='appended_html']
+test_url: http://www.mirrorfootball.co.uk/news/West-Ham-crisis-Carlton-Cole-slams-diabolical-performance-and-rips-into-Avram-Grant-lack-of-tactical-nous-following-Liverpool-mauling-article636151.html
\ No newline at end of file
--- /dev/null
+strip_id_or_class: 'book-ad'\r
+strip_id_or_class: 'bigger pullquote'\r
+strip_id_or_class: 'subscribe'\r
+strip_id_or_class: 'blog-link'
+test_url: http://mises.org/daily/4804
\ No newline at end of file
--- /dev/null
+title: //h1[@class='article-headline']\r
+date: //span[@class='timeStamp']\r
+author: substring-before(//p[@class='article-byline'], '/')\r
+body: //div[@id='article']\r
+#strip: //div[@class='inner']\r
+strip: //div[@id='article_head']\r
+strip: //p[@class='tagLine']\r
+strip: //div[@id='article_related_links']\r
+strip: //div[@id='article_related_mlb']\r
+strip: //span[@class='more']\r
+strip: //div[@class='article_component']\r
+strip: //span[@class='screen_reader']\r
+strip: //ul[@class='columnists_blurb']\r
+test_url: http://mlb.mlb.com/news/article.jsp?ymd=20120403&content_id=27880830
\ No newline at end of file
--- /dev/null
+title: //h1[@id = 'stream_title']\r
+author: //p[@class = 'byline']/a\r
+date: //span[@class = 'datetime']\r
+\r
+body: //div[@id = 'stream_container']\r
+strip: //p[@class = 'byline']\r
+strip_id_or_class: stream_summary\r
+strip_id_or_class: social-spoken\r
+strip_id_or_class: datetime\r
+strip_id_or_class: author-mini-profile\r
+strip_id_or_class: social-tools\r
+strip_id_or_class: entry-tags\r
+strip_id_or_class: fb-like-box
+test_url: http://mlb.sbnation.com/2011/10/17/2495845/2011-world-series-st-louis-cardinals-texas-rangers-home-field-advantage
\ No newline at end of file
--- /dev/null
+title: //*[@class="header_title"]/h1\r
+date: //*[@class="field-date"]\r
+author: //*[@class="field-author"]\r
+body: //div[contains(@class, 'content')]\r
+
+test_url: http://www.mlssoccer.com/news/article/2012/06/19/lack-depth-front-forces-arena-alter-las-formation
\ No newline at end of file
--- /dev/null
+title: //h1\r
+body: //div[@id = 'article_content']/div[contains(@class,'article')]\r
+author: //sub[@class = 'article_promoted_text']/a[starts-with(@href, 'member')]\r
+date: //div[@class = 'article_username_container']
+test_url: http://www.mmo-champion.com/content/2688-Other-Press-Tour-Interviews-A-Night-in-Mists-of-Pandaria-Blue-Posts-MoP-Screenshot
\ No newline at end of file
--- /dev/null
+tidy: no\r
+author: //div[@id="above-content"]//img/@alt | //div[@class="comment-auth"]/span[1]/a/text()\r
+date: //div[@class="comment-auth"]/div | //div[@class="comment-auth"]/span[2]\r
+body: //div[@class="node"]\r
+\r
+strip_id_or_class: vertical-social-bar\r
+strip_id_or_class: blogs_paginator\r
+strip_id_or_class: horizontal-social-links\r
+strip_id_or_class: servicelinksdiv\r
+
+test_url: http://www.mnn.com/green-tech/research-innovations/blogs/5-breakthroughs-that-will-make-solar-power-cheaper-than-coal
\ No newline at end of file
--- /dev/null
+title: //title\r
+\r
+author: //div[@class="author"]\r
+\r
+strip_id_or_class: 'header'\r
+strip_id_or_class: 'cikk_ajanlo'\r
+strip_id_or_class: 'buttons'\r
+strip_id_or_class: 'related'\r
+strip_id_or_class: 'adbox ad_cikk_kozepre'\r
+strip_id_or_class: 'cikk-cimkek'\r
+strip_id_or_class: 'cikk_ertekeles'\r
+\r
+strip_comments: yes
+test_url: http://mno.hu/grund/a-gumibottal-hadonaszo-rendort-joval-konnyebb-utalni-1055351
\ No newline at end of file
--- /dev/null
+title: //h2[@class="article_title"]\r
+strip: //a[@class="houseAdLink"]\r
+strip: //h1\r
+strip: //div[@class="more_articles"]
+test_url: http://mobile.slate.com/rss.jsp?rssid=411&item=http%3a%2f%2fwww.slate.com%2fdefault.aspx%3fdisplaymode%3d201%26id%3d2293749%26device%3drss
\ No newline at end of file
--- /dev/null
+body: //div[@class='post uncustomized-post-template']\r
+\r
+# remove duplicate of post title, which is a link\r
+strip: //h3[@class='post-title']\r
+\r
+# remove permalink and timestamp, which isn't useful as it's a time with no date\r
+strip: //span[@class='post-timestamp']\r
+\r
+# remove labels (tags)\r
+strip: //span[@class='post-labels']
+test_url: http://mobileopportunity.blogspot.com/2010/12/rims-q3-financials-tale-of-two.html
\ No newline at end of file
--- /dev/null
+title: //meta[@property="og:title"]/@content\r
+author: //meta[@name="author"]/@content\r
+date: //span[@class='date1']\r
+body: //div[@id='newsimage'] | //div[@id='bodytext']\r
+tidy: no\r
+prune: no\r
+\r
+test_url: http://www.modernghana.com/news/323765/1/039ghost039-teachers-removed-salaries-allowances-p.html
\ No newline at end of file
--- /dev/null
+title: //meta[@property="og:title"]/@content\r
+title: //h1[@class='storyheadline']\r
+author: //meta[@name="AUTHOR"]/@content\r
+date: //span[@class='cnnDateStamp']\r
+date: //meta[@name="DATE"]/@content\r
+body: //div[@id='storytext' or @class='storytext']\r
+\r
+strip_id_or_class: ie_column\r
+strip_id_or_class: sharewidgets\r
+strip_image_src: bug.gif\r
+\r
+strip: //div[@class="hed_side"]\r
+strip: //span[@class="byline"]\r
+strip: //a[@class="soc-twtname"]\r
+strip: //span[@class="cnnDateStamp"]\r
+strip: //div[@class="storytimestamp"]\r
+strip: //div[@class="cnnCol_side"]\r
+\r
+prune: no\r
+tidy: no\r
+\r
+test_url: http://money.cnn.com/2011/03/15/news/companies/steve_jobs_thought_process.fortune/index.htm?section=money_topstories&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fmoney_topstories+%28Top+Stories%29\r
+test_url: http://money.cnn.com/2012/01/27/markets/markets_newyork/index.htm\r
+test_url: http://money.cnn.com/2012/05/13/technology/yahoo-ceo-out-rumor/index.htm
\ No newline at end of file
--- /dev/null
+body: //*[(@class = "historia")]
+test_url: http://monkeyzen.com/2011/09/siluetas-de-clasicos-a-modo-de-vinilos
\ No newline at end of file
--- /dev/null
+strip_image_src: menu\r
+strip_image_src: templates\r
+strip: //div/a\r
+strip: //div/b\r
+strip: //div/strong\r
+strip: //td[@width='30%']\r
+strip: //br[1]\r
+strip: //br[2]\r
+strip: //br[3]\r
+strip: //br[4]\r
+strip: //a[@href='http://www.moonsault.de/newzboard/index.php?act=home']\r
+strip_id_or_class: cse-branding-right
+test_url: http://www.moonsault.de/newzboard/index.php?news=22321&act=previous
\ No newline at end of file
--- /dev/null
+title: //h1[@class='print-title']\r
+body: //div[@class='print-submitted' or @class='print-created' or @class='print-content']\r
+prune: no\r
+\r
+single_page_link: //li[@class='print']/a\r
+\r
+test_url: http://moreintelligentlife.com/content/places/paul-markillie/they-trash-cars-dont-they
\ No newline at end of file
--- /dev/null
+author: //span[@class="author"]/a\r
+date: //span[@class="date"]\r
+body: //div[@class="story-content"]\r
+strip: //aside\r
+test_url: http://motherboard.vice.com/blog/you-can-carry-a-copy-of-the-pirate-bay-in-your-pocket
\ No newline at end of file
--- /dev/null
+title: //h2[contains(@class,'post_headline')]\r
+body: //div[@class='entry']\r
+convert_double_br_tags: yes\r
+strip_image_src: _selected.gif\r
+strip_id_or_class: addthis_\r
+strip: //a[contains(@href,'feedburner.com')]
+test_url: http://mothering.com/all-things-mothering/inspiration/motherhood-brings-me-down
\ No newline at end of file
--- /dev/null
+title: //h1\r
+body: //div[@id = 'content-area']\r
+next_page_link: //div[@class='node-pager']/a[contains(@class, 'next')]\r
+tidy: no\r
+author: //p[contains(@class, 'byline')]/a\r
+\r
+strip_id_or_class: node-header\r
+strip_id_or_class: hdr-tools\r
+strip_id_or_class: node-body-break\r
+strip_id_or_class: pullquote\r
+strip_id_or_class: node-pager\r
+strip_id_or_class: author-bio\r
+strip_id_or_class: node-footer\r
+
+test_url: http://motherjones.com/politics/2012/02/mac-mcclelland-free-online-shipping-warehouses-labor
\ No newline at end of file
--- /dev/null
+body: //*[(@class = "historia")]
+test_url: http://motorfull.com/2011/09/aparca-valeo-park4u-remote
\ No newline at end of file
--- /dev/null
+# Article body container. The '@' is required to test the class attribute;\r
+# without it XPath looks for a child element named "class" and matches nothing.\r
+body: //div[@class="mainBody"]\r
+footnotes: no
+test_url: http://msdn.microsoft.com/en-us/library/hh542796(VS.103).aspx
\ No newline at end of file
--- /dev/null
+title: //title\r
+author: //div[@id='byline']\r
+\r
+date: //div[contains(@class,'timestamp')]/abbr/text()\r
+\r
+body: //div[@id='intellitTXT']\r
+\r
+strip: //div[@id='byline']\r
+strip: //div[contains(@class,'timestamp')]\r
+strip: //div[contains(@class, 'ad-label')]\r
+strip: //div[contains(@class, 'ad-break')]\r
+strip: //span[contains(@class, 'x-video')]\r
+strip: //span[contains(@class, 'inline')]\r
+strip: //div[contains(@class, 'video')]\r
+strip: //div[contains(@class, 'discuss')]\r
+strip: //div[@id='most-popular']\r
+strip: //div[contains(@class,'drawer')]\r
+strip: //*[contains(@class, 'hide')]\r
+\r
+footnotes: no
+test_url: http://www.msnbc.msn.com/id/44748412/ns/business-world_business/#.TolUv-vfDbE
\ No newline at end of file
--- /dev/null
+body: //div[@class="col1"]//div[@class="photo"] | //div[@class="detail"]/p[@class="fontStyle21"] | //div[@class="story last"]\r
+tidy: no\r
+\r
+test_url: http://www.myfoxboston.com/dpp/news/local/transit-police-say-woman-spat-on-mbta-bus-driver-2010611
\ No newline at end of file
--- /dev/null
+title: //h2[contains(@class, 'name')]\r
+body: //div[@class='printFullPageContentContainer']//div[contains(@class, 'recipe')]\r
+\r
+strip_id_or_class: photoBy\r
+strip_id_or_class: link\r
+\r
+single_page_link: //li[@class='print']/a[contains(@href, '/print/')]\r
+\r
+prune: no\r
+tidy: no\r
+\r
+test_url: http://www.myrecipes.com/recipe/hummingbird-cake-10000000387218/
\ No newline at end of file
--- /dev/null
+body: //div[@class='node']
+test_url: http://www.narenji.ir/2806
\ No newline at end of file
--- /dev/null
+title: //div[@class='address']/span\r
+author: substring-before(//span[@class='credits'],',')\r
+date: //div[@class='promodatepress']/span\r
+body: //div[@class='default_style_wrap']\r
+strip: //div[@class='text_adjust']\r
+strip: //div[@class='skiplink']\r
+strip: //h2
+test_url: http://www.nasa.gov/mission_pages/kepler/news/kepler-21b.html
\ No newline at end of file
--- /dev/null
+date://span[contains(@class,'date')]\r
+\r
+body://div[contains(@class,'contWarp')]\r
+\r
+strip://div[contains(@class,'keyWord')]\r
+strip://div[contains(@class,'submitComt')]\r
+strip://div[contains(@class,'cmts')]\r
+strip://div[contains(@class,'notice')]\r
+strip://div[contains(@class,'part pt-second')]
+test_url: http://www.nbweekly.com/news/china/201203/29316.aspx
\ No newline at end of file
--- /dev/null
+#host configuration should be http://www.neh.gov/news/humanities/\r
+\r
+\r
+#meta data \r
+title:substring-after(substring-after(//title,':'),':')\r
+author:substring-after(//h2[@class = 'subHead'],'By')\r
+date:substring-before(substring-after(//title,':'),':')\r
+\r
+#img and caption handling\r
+wrap_in(small)://div[@id = 'mainContent']/table/descendant::p/descendant::text()\r
+wrap_in(fieldset)://div[@id = 'mainContent']/table\r
+\r
+# clean up\r
+strip: //table[@class = 'marginpaddingTop']\r
+strip: //h2[@class = 'subHead']\r
+
+test_url: http://www.neh.gov/news/humanities/2011-11/IslamicScholar.html
\ No newline at end of file
--- /dev/null
+title: //*[@class="header_title"]/h1\r
+body: //div[contains(@class, 'content')]
+test_url: http://neomoney.co/personal/expatriate-and-migrant-loans/expatriate-loans/
\ No newline at end of file
--- /dev/null
+title: //div[@class='content-title']\r
+#date: substring-after(//div[@class='dernek-text-under'],'Posted on')\r
+body: //div[@class='content-item']\r
+next_page_link: //li[@class='next']/a\r
+convert_double_br_tags: yes\r
+
+test_url: http://www.net-security.org/article.php?id=1732
\ No newline at end of file
--- /dev/null
+title: //h1\r
+author: //div[@class="submitted"]/span\r
+\r
+# seems like this should work, but nothing is returned. Issue with xpath parser?\r
+date: //div[@class="submitted"]/time\r
+\r
+body: //div[@id="main-content"]\r
+\r
+strip_comments: no\r
+\r
+strip: //h1\r
+strip: //div[@class="submitted"]\r
+strip: //dd[@class="profile-avatar"]\r
+strip: //div[@class="author-profile"]/dl/dt[1]\r
+strip: //div[@id="right-col"]
+test_url: http://www.netmagazine.com/opinions/nielsen-wrong-mobile
\ No newline at end of file
--- /dev/null
+title: //h1[@class='entry-title']\r
+# Author link is matched via the HTML rel="author" link relation (HTML anchors\r
+# have no "ref" attribute). NOTE(review): confirm against the live page markup.\r
+author: //a[@rel='author']\r
+date: //span[@class='entry-date']\r
+body: //div[@class='entry-content']\r
+
+test_url: http://netzpolitik.org/2011/buch-generation-facebook/
\ No newline at end of file
--- /dev/null
+title: //div[@id="maincontent"]/h1\r
+body: //div[@id="maincontent"]\r
+date: //div[@id="maincontent"]/p[2]\r
+author: //ul[@id="contributors"]/li/p/b\r
+\r
+strip: //p[@*]\r
+strip: //h1\r
+strip: //div[@id="maincontent"]/div
+test_url: http://newmatilda.com/2011/07/22/turnbull-makes-sense-climate
\ No newline at end of file
--- /dev/null
+title: //div[@id="main-content"]//h2\r
+\r
+author: //div[@id="main-content"]//span[@class="authors"]\r
+\r
+date: //div[@id="main-content"]//span[@class="timestamp"]\r
+\r
+body: //div[@id="main-content"]//div[@class="content"]
+test_url: http://www.news-gazette.com/news/business/economy/2011-08-08/ibm-drops-out-blue-waters-project.html
\ No newline at end of file
--- /dev/null
+#This should apply to *.cnet.com. Not just news.cnet.com.\r
+title: //h1\r
+author: //img[@class="mugshot"]/@alt\r
+strip: //h1\r
+strip_id_or_class: breadcrumb\r
+strip: //p[@id="introP"]\r
+strip: //div[@class="postByline"]\r
+strip: //div[@class="editorBio"]\r
+strip: //div[@class="inline-slideshow"]\r
+strip: //div[@class="related"]\r
+body: //div[@class="postBody txtWrap"]
+test_url: http://news.cnet.com/8301-27076_3-57405303-248/apple-ipad-charging-fine-keep-it-plugged-in/?tag=mncol;posts
\ No newline at end of file
--- /dev/null
+title://div[@class="content_detail"]/h1\r
+\r
+author://div[@class="author"]/strong\r
+\r
+date:substring-before(substring-after(//div[@class="content_detail"]/span[@class="date"], ','), ' WIB')\r
+\r
+body://div[@class="text_detail"]
+test_url: http://news.detik.com/read/2012/05/22/225531/1922307/10/menkeu-cek-soal-lolosnya-315-kg-sabu-dari-bea-cukai
\ No newline at end of file
--- /dev/null
+body: //div[@id='main']\r
+strip: //div[@id='sbs']\r
+strip: //div[@id='fsizeSwitch']\r
+strip: //div[@id='googleAd']\r
+strip: //div[@id='detailFoot']\r
+strip_image_src: counter?key\r
+convert_double_br_tags: yes\r
+
+test_url: http://news.kanaloco.jp/localnews/article/1105200018/
\ No newline at end of file
--- /dev/null
+title: //h2[@class="lyt-hdg-02-04"]\r
+\r
+author: //div[@class="lyt-namearea"]/a\r
+\r
+date: //div[@class="lyt-namearea"]/text()\r
+\r
+body: //div[@class="articleContent"]\r
+\r
+strip: //div[@id="tab-aside"]\r
+
+test_url: http://news.mynavi.jp/articles/2011/12/07/nico/index.html
\ No newline at end of file
--- /dev/null
+single_page_link: //div[@id='content']//p[@class='readMore']/a\r
+\r
+title: //div[@class='hidden offscreen']/h2\r
+body: //div[@id="storyText"]\r
+move_into(//div[@id='storyText']): //div[@class='fact']\r
+strip: //small[@class='credit']\r
+strip: //small[@class='caption']\r
+date: substring-after(//div[@id='storyMeta']//p[@class='date'],'Publiziert am')\r
+strip: //p[@class='toplink']\r
+
+test_url: http://news.orf.at/stories/2084731/
\ No newline at end of file
--- /dev/null
+body: //article\r
+title: //h1\r
+author: //span[@class='b-article-source-dropdown']\r
+strip: //span[@class='b-article-photo-incut__source']\r
+strip: //a[@class='b-read-more b-read-more_bottom']\r
+\r
+\r
+tidy:no
+test_url: http://news.rambler.ru/12972208/
\ No newline at end of file
--- /dev/null
+body: //div[@class='main']/div[@class='item']\r
+strip: //div[@class='right']\r
+\r
+test_url: http://news.techmeme.com/110516/fh-rip
\ No newline at end of file
--- /dev/null
+title: //meta[@property='og:title']/@content\r
+title: //h1[@class='headline']\r
+author: //cite[contains(@class, 'byline') and contains(@class, 'vcard')]//span[@class='fn']\r
+date: //cite[contains(@class, 'byline') and contains(@class, 'vcard')]//abbr/@title\r
+body: //div[@id='mediaarticlelead']//a[@class='media'] | //div[contains(@class,'yom-art-content')]\r
+#strip: //cite/abbr\r
+strip_id_or_class: action\r
+strip_id_or_class: prefetch\r
+tidy: no\r
+prune: no\r
+
+test_url: http://news.yahoo.com/cold-la-nina-winter-forecast-west-coast-183535067.html
\ No newline at end of file
--- /dev/null
+strip_comments: no\r
+strip: //a[. = 'reply']
+test_url: http://news.ycombinator.com/item?id=1516461
\ No newline at end of file
--- /dev/null
+date: //meta[@name='og:article:published_time']/@value\r
+\r
+body: //div[@class='itemIntroText' or @class='itemImageBlock' or @class='itemFullText']\r
+\r
+strip_id_or_class: itemImageGallery\r
+\r
+prune: no\r
+\r
+test_url: http://www.newsbomb.gr/gossip/story/257234/i-proin-moy-protimoyse-na-serfarei-apo-to-na-kanoyme-sex
\ No newline at end of file
--- /dev/null
+single_page_link: //iframe/@src
+test_url: http://newsle.com/article/0/15831103/
\ No newline at end of file
--- /dev/null
+title: //h1\r
+body: (//div[@class='articleImg']//img)[1] | //p[contains(@class, 'commentTextArticle') or contains(@class, 'articlePublished')] | //div[@id='articleLeftContent']\r
+author: //div[@class='byline']//a[contains(@href, '/user/')]\r
+\r
+strip_id_or_class: facts\r
+strip_id_or_class: articleBlogsHolder\r
+strip_id_or_class: byline\r
+\r
+prune: no\r
+tidy: no\r
+\r
+test_url: http://www.newsmill.se/artikel/2012/05/06/medielogiken-v-ger-tyngre-n-reportrarnas-sikter
\ No newline at end of file
--- /dev/null
+body: //div[@class='right']//div[@class='articles']\r
+author: //div[@id='artinfo']//a[contains(@href, '/author/')]\r
+strip: //div[@id='artinfo']\r
+# Strip only tables that themselves contain a Twitter link. The inner path must\r
+# be relative (.//a); an absolute //a is evaluated against the whole document\r
+# and would remove every table whenever any Twitter link exists on the page.\r
+strip: //table[.//a[contains(@href, 'twitter.com')]]\r
+strip_id_or_class: twitter\r
+\r
+prune: no\r
+tidy: no\r
+\r
+test_url: http://www.newsunspun.org/eotn/bbc-headline-change-iran-goes-from-not-building-to-undecided-on-nuclear-bomb
\ No newline at end of file
--- /dev/null
+title: //h1[@id='articlehed'] | //h2[@id="articleintro"]\r
+body: //div[@id='articletext']\r
+\r
+strip: //ul[@id="bc"] | //div[@id="yrail"] | //div[@class="entry-keywords"] | //div[@class="entry-categories"] | //div[@class="socialUtils"] | //div[@id="footer"]\r
+\r
+date: //h4[@id='articleauthor']/span[@class='dd dds']\r
+date: //div[@id="pagebody"]/div[@class='hentry entry']/div[@class='published']\r
+\r
+single_page_link: //div[@class='paginationViewSinglePage']/a\r
+test_url: http://www.newyorker.com/online/blogs/culture/2012/06/mug-shot-web-sites.html
\ No newline at end of file
--- /dev/null
+# 2011-08-22 [carlo@...] initial version\r
+# 2011-08-22 [carlo@...] removed comments & social links\r
+\r
+tidy: no\r
+\r
+single_page_link: //a[@class="single active"]\r
+\r
+body: //div[@id="main"]//div[@class="content-region"]/article\r
+author: //span[@class="author-name"]\r
+date: //time/text()\r
+\r
+# Remove the <aside id="related"> element. This is an XPath expression, so it\r
+# belongs under 'strip'; 'strip_id_or_class' expects a plain id/class substring.\r
+strip: //aside[@id="related"]\r
+strip: //footer\r
+\r
+title: //h1
+test_url: http://www.next-gen.biz/reviews/deus-ex-human-revolution-review
\ No newline at end of file
--- /dev/null
+# Selecting the attribute value did not work with the expression below; note\r
+# that it is missing the '/' step separator before @value.\r
+# author: //meta[@id="authorName"]@value\r
+\r
+author: substring-after(//li[@id="article-hdr-meta-author"]/text(), "By ")\r
+date: //abbr[@id="article-time"]\r
+title: //div[@id="article-hdr"]/h1\r
+body: //div[@class="articleText"]\r
+\r
+# strip miscellaneous teasers & etc\r
+strip: //div[@class="removeformobile"]
+test_url: http://www.nfl.com/news/story/09000d5d82388707/article/close-shave-chiefs-haley-perseveres-through-rough-start?module=HP11_content_stream
\ No newline at end of file
--- /dev/null
+next_page_link: //div[@class='nextpage_continue']/a\r
+strip: //div[@class='nextpage_continue']\r
+strip_id_or_class: nextpage\r
+title: //div[@class='article_title']//h1\r
+body: //div[@class='article_title']/..\r
+body: //div[@class='content']
+test_url: http://ngm.nationalgeographic.com/2012/02/tsunami/folger-text
\ No newline at end of file
--- /dev/null
+body: //div[@id = 'news_right']
+test_url: http://www.nhk.or.jp/news/html/20110309/t10014559982000.html
\ No newline at end of file
--- /dev/null
+body: //div[@id="main"]\r
+title: //div[@id="main"]/h3\r
+\r
+# Remove ‘Review’ and ‘Wii’.\r
+strip: //div[@class="badge"]\r
+\r
+# Remove duplicate title and country flag.\r
+strip: //h3\r
+\r
+# Commented out below are attempts to extract the author and date, which did not work.\r
+# author: //p[@class="extra "]/a\r
+# date: //p[@class="extra "]/span[@class="when"]
+test_url: http://www.nintendoworldreport.com/review/28400
\ No newline at end of file
--- /dev/null
+author: //span[@class='meta']/span[@class='username']\r
+body: //div[@class='article-content']\r
+\r
+strip_id_or_class: 'article-actions'
+test_url: http://nojesguiden.se/blogg/maja-bredberg/maja-laser-tidningen-en-helt-vanlig-lordag-i
\ No newline at end of file
--- /dev/null
+title: //h1\r
+body: //div[@id='pn-maincontent']\r
+strip_id_or_class: z-menu\r
+strip_id_or_class: news_category\r
+strip_id_or_class: news_title\r
+strip_id_or_class: news_modify\r
+strip_id_or_class: news_morearticlesincat\r
+strip_id_or_class: ezc_comments\r
+strip_comments: yes\r
+\r
+test_url: http://www.northumberlandview.ca/index.php?module=news&func=display&sid=5972
\ No newline at end of file
--- /dev/null
+title: /html/body/div[3]/div/div/h1\r
+\r
+body: //*[@id="article-body"]\r
+\r
+
+test_url: http://nplusonemag.com/the-outskirts-of-progress
\ No newline at end of file
--- /dev/null
+title: //div[contains(@class, 'storytitle')]//h1\r
+author: //p[@class="byline"]/span\r
+body: //div[@id='storyspan02']//*[@class='duration' or @class='download' or contains(@class, 'photo')] | //div[@id='storytext'] | //div[@class='transcript']\r
+date: //meta[@name="date"]/@content\r
+\r
+strip: //div[@class='enlarge_measure']\r
+strip: //div[@class='enlarge_html']\r
+strip: //a[@class='enlargeicon']\r
+strip: //div[contains(@class, 'bookedition')]\r
+strip: //div[@class='textsize']\r
+strip: //ul[@class='genres']\r
+strip: //span[@class='bull']\r
+strip_id_or_class: secondary\r
+strip_id_or_class: con1col\r
+strip: //h3[@class='conheader']\r
+\r
+replace_string(<a name="more"> </a>): <!-- no more -->\r
+replace_string(<div class="transcript">): <div class="transcript"><h2>Transcript</h2>\r
+\r
+prune: no\r
+strip://div[@class="ecommercepop"]\r
+strip://span[@class="bull"]\r
+strip://span[@class="purchaseLink"]\r
+strip://div[@class="enlarge_html"]\r
+strip://div[@class="enlarge_measure"]\r
+strip://div[@class="container con1col small"]\r
+strip://a[contains(@class, "enlargebtn")]\r
+strip://div[contains(@class, "bucketwrap internallink")]\r
+\r
+test_url: http://www.npr.org/blogs/thetwo-way/2011/07/12/137799301/sports-loses-its-escapist-gleam-in-a-summer-of-court-dates\r
+test_url: http://www.npr.org/2012/07/04/156190948/feeling-under-siege-catholic-leadership-shifts-right\r
+test_url: http://www.npr.org/2012/12/13/166480907/the-years-best-sci-fi-crosses-galaxies-and-genres
\ No newline at end of file
--- /dev/null
+strip_id_or_class: sIFR-alternate\r
+title: //div[@id='page-title-wrapper']/div[@id='page-title']/h2\r
+single_page_link: //a[contains(@href, 'pagination=false') and not(contains(@href, 'printpage=true'))]\r
+\r
+body: //div[@id = 'article-body']\r
+strip_id_or_class:article-tools\r
+strip_id_or_class:js_target\r
+strip_id_or_class:marker\r
+author://div[@id = 'page-title']/h3\r
+date://div[@id = 'page-title']/h5/a[starts-with(@href,'/issues/')]\r
+\r
+\r
+test_url: http://www.nybooks.com/articles/archives/2012/feb/23/were-more-unequal-you-think/
\ No newline at end of file
--- /dev/null
+title: //h2[contains(@class, 'primary')]\r
+body: //div[@id='story']\r
+author: //*[@class='by']/a\r
+date: substring-after(//*[@class='date'], 'Published')\r
+\r
+next_page_link: //div[@class='page-navigation']//li[@class='next']/a\r
+\r
+test_url: http://nymag.com/news/features/wall-street-2012-2/
\ No newline at end of file
--- /dev/null
+title: //div[@class="article default-article"]/h1\r
+author: //p[@class="author"]/a[2]\r
+\r
+# Article introduction:\r
+#move_into(//div[@class="article-bread"]): //p[@class="lead"]\r
+\r
+body: //div[@class="article-bread"]
+test_url: http://www.nyteknik.se/nyheter/energi_miljo/energi/article3391426.ece
\ No newline at end of file
--- /dev/null
+title://h1[@class="articleHeadline"]\r
+body://div[@id="article"]\r
+strip_id_or_class:articleTools\r
+strip_id_or_class:readerscomment\r
+#strip://div[contains(@class, "articleInline runaroundLeft")]\r
+strip: //div[contains(@class, "doubleRule")]\r
+# strip image credit - appears as a bold heading\r
+strip: //div[contains(@class, "articleInline")]//h6\r
+strip_id_or_class:enlargeThis\r
+strip_id_or_class:pageLinks\r
+strip_id_or_class:memberTools\r
+strip_id_or_class:articleExtras\r
+strip_id_or_class:singleAd\r
+strip_id_or_class:byline\r
+strip_id_or_class:dateline\r
+strip_id_or_class:articleheadline\r
+strip_id_or_class:articleBottomExtra\r
+strip://a[contains(@href, 'nytimes.com/adx/')]\r
+strip: //nyt_byline\r
+strip: //span[contains(@class, 'slideshow') or contains(@class, 'video')]\r
+strip: //p[@class='caption']//a[contains(., 'More Photos')]\r
+\r
+prune: no\r
+tidy: no\r
+\r
+date: substring-after(//*[contains(@class, 'dateline')], 'Published:')\r
+\r
+single_page_link: //link[contains(@href, 'pagewanted=all')]\r
+#single_page_link: //a[contains(@href, 'pagewanted=all') and not(contains(@href, 'login'))]\r
+\r
+strip://ul[@id = 'toolsList']\r
+strip://h6[@class = 'kicker']\r
+author:substring-after(//h6[@class='byline'],'By ')\r
+\r
+test_url: http://www.nytimes.com/2011/07/24/books/review/an-academic-authors-unintentional-masterpiece.html\r
+test_url: http://www.nytimes.com/2012/06/10/arts/television/the-newsroom-aaron-sorkins-return-to-tv.html
\ No newline at end of file
--- /dev/null
+body: //*[@class='article-full']\r
+title: //h3\r
+strip: //header[@class='group']\r
+#body: //p[@class='lead']\r
+#move_into(//p[@class='lead']): //*[@class='article-full']/figure\r
+#move_into(//p[@class='lead']): //div[@id='articleBodyText']\r
+strip: //div[@id='social-media-floater']\r
+strip: //div[@class='advertisement']\r
+strip: //div[@class='infobox']\r
+strip: //div[@id='articleComments']\r
+\r
+test_url: http://www.nzz.ch/wissen/wissenschaft/sonnenschutz-fuer-die-erde-1.17282213
\ No newline at end of file
--- /dev/null
+body: //article[contains(@class, 'instapaper_body')]\r
+\r
+prune: no\r
+\r
+single_page_link: //a[@id='print-button']\r
+\r
+test_url: http://www.observer.com/2008/would-you-take-tumblr-man
\ No newline at end of file
--- /dev/null
+body: //div[(@id = "content")]\r
+strip: //div[(@class = "links-bar")]\r
+strip: //div[(@class = "povrzani")]\r
+strip: //div[(@class = "povrzani-dolu")]\r
+strip: //div[(@class = "tags")]\r
+strip: //h1[(@id = "page-title")]
+test_url: http://off.net.mk/zhivot-i-zabava/gadzheti/dzhabe-raboti-dzhabe-ne-dishi
\ No newline at end of file
--- /dev/null
+body: //div[@class='story']
+test_url: http://www.omaha.com/article/20111031/BIGRED/111039984#pelini-tremendous-challenge-ahead-for-huskers
\ No newline at end of file
--- /dev/null
+title: //div[@id='squeeze']/h1\r
+strip: //div[@id='squeeze']/h1\r
+author: //div[@class='submitted']/a\r
+strip: //div[@class='submitted']/a\r
+convert_double_br_tags: yes\r
+\r
+\r
+
+test_url: http://omiliya.org/content/predchuvstvie.html
\ No newline at end of file
--- /dev/null
+body: //div[(@class = "statija")]\r
+strip: //div[(@class = "relatedBlock")]\r
+strip: //div[(@class = "swftools")]\r
+strip: //table[(@class = "links")]
+test_url: http://on.net.mk/video/na-trkala/lamborghini-aventador-avionot-shto-ne-leta
\ No newline at end of file
--- /dev/null
+title: //meta[@property="og:title"]/@content\r
+body: //div[@id='article_story_body']\r
+\r
+author: //h3[@class='byline']/a\r
+# for slide show content\r
+body: //ul[@id='imageSlide']//li[@class='firstSlide']//img | (//div[@class='txt_body']//p)[1]\r
+date: //li[@class='dateStamp']/small\r
+\r
+strip_id_or_class: insetFullBracket\r
+strip_id_or_class: insettipBox\r
+#strip_id_or_class: legacyInset\r
+strip_id_or_class: recipeACShopAndBuyText\r
+\r
+strip: //div[contains(@class, 'insetContent')]//cite\r
+strip: //*[contains(@style, 'visibility: hidden;')]\r
+strip: //div[contains(@class, 'insetContent') and not(contains(@class, 'image'))]\r
+\r
+prune: no\r
+tidy: no\r
+\r
+test_url: http://online.wsj.com/article/SB10001424052970203363504577185322849515102.html\r
+# slide show\r
+test_url: http://online.wsj.com/article/SB10001424052970204791104577110550376458164.html
\ No newline at end of file
--- /dev/null
+body: //div[@id='news_detail']//div[@class='contents clearfix']
+test_url: http://www.onlinewelten.com/games/aliens-colonial-marines/news/offizielle-spiel-ankuendigung-nintendos-wii-u-103690/
\ No newline at end of file
--- /dev/null
+strip: //div[@id="dnn_LeftPane"] | //div[@id="dnn_ContentPane"]//h1 | //div[@id="dnn_ContentPane"]//p[@class="Normal"] | //div[@class="Submissions"] | //div[@id="listing"]//h3 | //div[@id="listing"][2] | //div[@id="emart-fail"] | //div[@id="emart-success"] | //div[@id="emart-form"]
+test_url: http://onstartups.com/tabid/3339/bid/37737/Secrets-Of-Freemium-Pricing-Make-The-Cheapskates-Pay.aspx
\ No newline at end of file
--- /dev/null
+body: //div[@class='content clear-block']
+test_url: http://opensource.org/node/537
\ No newline at end of file
--- /dev/null
+body: //div[@id = 'content-inner']\r
+strip: //div[@id = 'content-bottom']\r
+strip_id_or_class: print_sharebutton
+test_url: http://openthemagazine.com/article/nation/sania-vs-saina
\ No newline at end of file
--- /dev/null
+body: //div[@class="chapter"]\r
+prune: no\r
+tidy: no\r
+test_url: http://openwebx.org/docs/springext.html
\ No newline at end of file
--- /dev/null
+single_page_link: //div[@id='content']//p[@class='readMore']/a\r
+\r
+title: //div[@class='hidden offscreen']/h2\r
+body: //div[@id="storyText"]\r
+move_into(//div[@id='storyText']): //div[@class='fact']\r
+strip: //small[@class='credit']\r
+strip: //small[@class='caption']\r
+date: substring-after(//div[@id='storyMeta']//p[@class='date'],'Publiziert am')\r
+strip: //p[@class='toplink']\r
+
+test_url: http://orf.at/stories/2084731/
\ No newline at end of file
--- /dev/null
+title: /html/body/div[5]/div[2]/h1\r
+body: /html/body/div[5]/div[2]/div[6]/div/div\r
+body: //*[@id="cikk"]\r
+strip: /html/body/div[5]/div[2]/h1\r
+strip: /html/body/div[5]/div[2]/div[4]\r
+strip: //*[@id="multidoboz"]\r
+strip: /html/body/div[5]/div[2]/div[6]/div[2]\r
+strip: //*[@id="comments"]\r
+strip: //*[@id="rating-doboz"]\r
+strip: /html/body/div[5]/div[2]/div[10]\r
+strip: /html/body/div[5]/div[2]/a\r
+strip: /html/body/div[5]/div[2]/span\r
+strip: /html/body/div[5]/div[2]/span[2]\r
+strip: /html/body/div[5]/div[2]/span[3]\r
+strip: /html/body/div[5]/div[2]/span[4]\r
+strip: /html/body/div[5]/div[2]/span[5]\r
+strip: //*[@id="kommentszam"]
+test_url: http://www.origo.hu/itthon/20110119-lemondott-a-kulturaert-felelos-helyettes-allamtitkar.html
\ No newline at end of file
--- /dev/null
+#body: (//div[@class='ftr-yt-vid'])[1]\r
+body: (//blockquote[contains(@class, 'postcontent')])[1]\r
+body: (//div[starts-with(@id, 'post_message')])[1]\r
+\r
+prune: no\r
+tidy: no\r
+\r
+#replace_string(<iframe title="YouTube video player"): <div class="ftr-yt-vid"><iframe title="YouTube video player"\r
+#replace_string(</iframe>): </iframe> </div>\r
+\r
+test_url: http://pakistantvdekho.com/showthread.php?647741-Sitam-Gar-by-HUM-TV-Episode-07&p=659080#post659080
\ No newline at end of file
--- /dev/null
+title://h2\r
+author://div[@class="posted"]/a\r
+date://div[@class="date"]\r
+body://div[@class="entry"]
+test_url: http://pandagon.net/index.php/site/its-okay-to-admit-that-mass-hysteria-is-real
\ No newline at end of file
--- /dev/null
+tidy: no\r
+body: //article\r
+date: //time/@datetime\r
+strip_id_or_class: sharedaddy
+test_url: http://pandodaily.com/2012/01/19/ibooks-author-is-not-going-to-hurt-publishers-it-might-even-help-them/
\ No newline at end of file
--- /dev/null
+body: //div[@class='entry']\r
+date: //h3[@class='postDate']
+test_url: http://www.panic.com/blog/2011/07/panic-is-ready-for-lion/
\ No newline at end of file
--- /dev/null
+title: //h2[@class="post-title"]\r
+author: substring-after(//div[@class="description"],'Words by ')\r
+date: //li[@class="date"]\r
+strip: //h2[@class="post-title"]\r
+body: //div[@class="copy"]
+test_url: http://parislemon.com/post/13462682469/the-15-inch-air
\ No newline at end of file
--- /dev/null
+title: //h1\r
+body: //div[@id='news-article']
+test_url: http://www.parliament.uk/business/committees/committees-a-z/commons-select/backbench-business-committee/news/guidance-for-e-petitioners/
\ No newline at end of file
--- /dev/null
+title://div[@class="paste_box_line1"]/h1\r
+author://div[@class="paste_box_line2"]/a\r
+body://div[@class="text"]\r
+date:substring-before(substring-after(//div[@class="paste_box_line2"],'|'),'|')\r
+dissolve://li
+test_url: http://pastebin.com/LAykd1es
\ No newline at end of file
--- /dev/null
+title: //h1\r
+body: //div[@id='ff-pastepad-content']\r
+prune: no\r
+# todo: add test file
+test_url: http://pastepad.fivefilters.org/test.html
\ No newline at end of file
--- /dev/null
+title://*[contains(@class,'post-title')]\r
+body://div[contains(@class,'post-body')]\r
+body://div[contains(@class,'entry-content')]\r
+strip_comments:no\r
+prune:no\r
+convert_double_br_tags:yes\r
+tidy:yes
+test_url: http://www.pathawks.com/2011/06/crazyawesomecoloradotrip.html
\ No newline at end of file
--- /dev/null
+prune: no
+test_url: http://pcast.me/shownotes/get/16t
\ No newline at end of file
--- /dev/null
+prune:yes\r
+\r
+date://*[contains(@class,'date')]\r
+\r
+body://div[contains(@id,'content')]\r
+\r
+next_page_link://a[contains(.,'Next >')]\r
+\r
+strip_id_or_class:sponsors
+test_url: http://www.pcmag.com/article2/0,2817,2401676,00.asp
\ No newline at end of file
--- /dev/null
+title: //div[@class='articleHead']//h1\r
+author: //div[@class="author-name"]/a[1]\r
+body: //div[@class="main"]\r
+\r
+# remove 'From the Lab' and 'Recent posts' text\r
+strip: //div[@class='blogLabel']\r
+\r
+# remove byline and meta info\r
+strip: //h1\r
+strip: //div[@class="article-meta"]\r
+strip: //div[@class="author-info"]\r
+\r
+#strip tags and categories\r
+strip: //div[@class="department"]\r
+\r
+#strip product cap links\r
+strip: //div[@class="cap-main"]\r
+strip: //div[@id="compare-lede"]\r
+test_url: http://www.pcworld.com/article/262034/are-printer-companies-gouging-us-on-laser-toner-pricing.html
\ No newline at end of file
--- /dev/null
+# 2012-01-14 carlo@... - fixed title, body; added author, date\r
+\r
+title: //div[@class="title"]/h2/a\r
+# body: //div[@class="post"]\r
+# author: //p[@class="iconEmail"]/a\r
+# date: //p[@class="iconDate"]\r
+\r
+# 1/24/2013 yosoyju - fixed author, date, and body, added support for PA Report\r
+\r
+# Penny Arcade\r
+\r
+author: //li[@class="iconEmail"]/a\r
+date: //li[@class="iconDate"]\r
+body: //div[@class="body"]\r
+\r
+# PA Report\r
+\r
+author: //div[@class="meta"]/p/a\r
+date: substring-after(//div[@class="meta"]/p, '/ ')\r
+title: substring-after(//title, '- ')\r
+\r
+test_url: http://penny-arcade.com/2012/01/13/i-put-some-news-in-your-news\r
+test_url: http://penny-arcade.com/report/editorial-article/the-dystopian-future-of-casual-games-personalized-targeted-pricing-and-mech
\ No newline at end of file
--- /dev/null
+next_page_link: //a[contains(., 'Next:')]
+test_url: http://www.pentaxforums.com/reviews/long-exposure-handhelds/introduction.html
\ No newline at end of file
--- /dev/null
+prune: no\r
+tidy: no\r
+body: //div[@class='article-content']\r
+dissolve: //nobr/a\r
+dissolve: //nobr
+test_url: http://www.philadelphiaeagles.com/news/article-1/Jacksons-Light-Shined-On-Sunday-Night/51a862de-42b4-40f1-a5a8-ba0fb8a435b7
\ No newline at end of file
--- /dev/null
+title: //h1[@class='entry-title']\r
+author: //p[@class='byline']/span\r
+# Select the element whose id is 'body-content'. A bare //@id='...' is a\r
+# boolean comparison, not a node selection, so it can never yield a body.\r
+body: //*[@id='body-content']\r
+date: //div[@class='article_timestamp']/span\r
+\r
+# The class value has to be a quoted literal inside a predicate.\r
+strip: //*[@class='b-group']\r
+strip: //*[contains(@style, 'none')]\r
+strip: //a[contains(@href, 'comments')]\r
+strip: //*[contains(@class, 'comment')]
+test_url: http://www.philly.com/philly/sports/eagles/20120127_Ohio_State_s_Posey_didn_t_waste_time_lost_to_suspension.html
\ No newline at end of file
--- /dev/null
+author: substring-before(//div[@class='post_meta'],' on')\r
+date: substring-after(substring-before(//div[@class='post_meta'],'with'),' on')\r
+# Match on the class attribute (@class), not on a child element named "class".\r
+title: //h1[@class='post_title']\r
+body: //div[@class='article']\r
+
+test_url: http://photo.tutsplus.com/articles/news/a-brilliant-beginners-guide-to-architectural-photography/
\ No newline at end of file
--- /dev/null
+body: //div[@id='content']\r
+strip_id_or_class: manualnavbar\r
+\r
+prune: no\r
+
+test_url: http://www.php.net/manual/en/migration5.incompatible.php
\ No newline at end of file
--- /dev/null
+title: //div[@class='abstitle']//h1\r
+author: //div[@class='authorList']\r
+body: //div[@id='fulltext_body']\r
+\r
+prune: no\r
+
+test_url: http://www.physicstoday.org/resource/1/phtoad/v64/i10/p48_s1?bypassSSO=1
\ No newline at end of file
--- /dev/null
+title:concat(//h1,' - ',//h2,' - ',//h3)\r
+author://address\r
+date://span[@class='pub-date']\r
+body://div[@id='main']\r
+single_page_link://link[@rel='canonical']\r
+strip://div[@class='info']\r
+strip_id_or_class:'object-grid related-content'\r
+strip_id_or_class:'object-prevnext'\r
+strip_id_or_class:'object-header'\r
+strip_id_or_class:'source'\r
+strip_id_or_class:'label'\r
+strip_id_or_class:'title'\r
+dissolve://ul\r
+strip://li[@class='next']\r
+strip://li[@class='prev']
+test_url: http://pitchfork.com/features/why-we-fight/8796-on-the-far-slope-of-the-uncanny-valley/
\ No newline at end of file
--- /dev/null
+title: //h2[@class='post-title']\r
+author: substring-before(substring-after(//h3[@class='post-byline'],'By:'),'/')\r
+date: substring-before(substring-after(//p[@class='post-details'],'Posted on '),'in')\r
+strip: //h2[@class='post-title']\r
+strip: //p[@class='post-details']\r
+strip: //h3[@class='post-byline']\r
+body: //div[@id='content']
+test_url: http://pittnews.com/newsstory/mens-basketball-pitt-recruit-robinson-to-bring-leadership/
\ No newline at end of file
--- /dev/null
+title: substring-before(//title,'pirates.com')\r
+date: //span[@class='timeStamp']\r
+author: substring-before(substring-after(//div[@class='byLine'],'By'),'/')\r
+body: //div[@id='article']\r
+#strip: //div[@class='inner']\r
+strip: //div[@id='article_head']\r
+strip: //p[@class='tagLine']\r
+strip: //div[@id='article_related_links']\r
+strip: //div[@id='article_related_mlb']\r
+strip: //div[@id='article_related_club']\r
+strip: //span[@class='more']\r
+strip: //div[@class='article_component']\r
+strip: //span[@class='screen_reader']\r
+strip: //ul[@class='columnists_blurb']
+test_url: http://pittsburgh.pirates.mlb.com/news/article.jsp?ymd=20120330&content_id=27759040&vkey=news_pit&c_id=pit
\ No newline at end of file
--- /dev/null
+title: substring-before(//title,'- Pittsburgh Tribune')\r
+author: substring-before(substring-after(//div[@class='byline'],'By '),',')\r
+date: substring-after(substring-after(//div[@class='byline'],','),',')\r
+body: //div[@id='storyBody']\r
+strip: //div[@class='morestories']\r
+dissolve: //p[@class='subheader']
+test_url: http://www.pittsburghlive.com/x/pittsburghtrib/sports/columnists/s_785654.html
\ No newline at end of file
--- /dev/null
+title: //title\r
+author: substring-after(//div[@class='by-line'],'BY')\r
+\r
+body: //div[@id='article-body']\r
+\r
+strip: //div[@class='by-line']\r
+strip: //div[@id='article-body']/h1
+test_url: http://www.pittsburghmagazine.com/Pittsburgh-Magazine/May-2012/Verde-Lights-the-Night/
\ No newline at end of file
--- /dev/null
+title: //span[@class='StoryHeadline']\r
+strip: //div[@class='fivevert']\r
+body: //div[@id='Content']
+test_url: http://www.pittsburghpanthers.com/sports/m-baskbl/recaps/031412aaa.html
\ No newline at end of file
--- /dev/null
+title: //h1[@class='articletitle']\r
+author: substring-after(//span[@class='author'],'by')\r
+date: //span[@class='created']\r
+body: //div[@class='article']\r
+strip: //div[@class='headline']\r
+strip: //p[@class='articleinfo']\r
+#dissolve: //p[@class='subheader']
+test_url: http://www.pittscriptblog.com/2012-articles/march/2012-football-opponents-set-and-the-attendance-dilemma.html
\ No newline at end of file
--- /dev/null
+author: //article//*[@class="author"]\r
+date: //article//*[@class="publication-date"]\r
+body: //article\r
+strip: //article/header\r
+strip: //article/section
+test_url: http://www.playboy.com/playground/view/playboy-interview-jon-hamm
\ No newline at end of file
--- /dev/null
+body: //div[@id='contentPane']//div[@class='vg']\r
+body: //div[@id='contentPane']\r
+\r
+# Grab the author by finding the first profile pic, then backing up a node and getting the title of <a> tag which will be the author hopefully. Sorry can't test this due to parser errors, thanks google :(\r
+\r
+author: //div[@id='contentPane']//img[contains(@alt, 'profile photo')][1]/../@title\r
+\r
+\r
+strip: //*[@title="People who +1'd this"]/../..\r
+strip: //*[contains(@class, 'a-b-f-i-Hg-Uf')]\r
+strip: //*[@role='menu']\r
+strip: //img[contains(@alt, 'profile photo')]\r
+strip: //*[@class='a-f-i-Ad']\r
+\r
+tidy: no\r
+\r
+test_url: http://plus.google.com/u/0/117840649766034848455/posts/FddaP6jeCqp
\ No newline at end of file
--- /dev/null
+# Page title heading (predicate closed with ']' so the expression is valid XPath).\r
+title: //h2[@class='jcw-pagetitle']\r
+date: //p[@class='postinfo']\r
+body: //div[@class='contenttext']
+test_url: http://plzkthxbai.com/blog/2011/06/28/1password-and-internet-security/
\ No newline at end of file
--- /dev/null
+body: //div[@id="content"]/div[1]\r
+\r
+title: //h1[@class="entry-title"]
+test_url: http://pogue.blogs.nytimes.com/2011/05/12/the-future-of-skype/
\ No newline at end of file
--- /dev/null
+title://div[contains(@class, "article")]/h1\r
+body://div[contains(@class,"story-text")]\r
+\r
+# Why doesn't this work? next_page_link://ul[contains(@class,"pagination")]/li/a[@rel="next"]\r
+\r
+next_page_link://ul[contains(@class,"pagination")]/li[contains(@class, "current")]/following-sibling::node()/a\r
+date://meta[@name="publish_date"]/@content\r
+\r
+strip://div[contains(@class, "breadcrumbs")]\r
+strip://a[contains(@class, "hidden")]\r
+strip://div[contains(@class, "story-embed")]\r
+strip://div[contains(@class, "story-text")]//p/a[contains(text(), "Also on POLITICO:")]/..
+test_url: http://www.politico.com/news/stories/0712/78105.html
\ No newline at end of file
--- /dev/null
+body: //div[@id="content"]\r
+\r
+strip: //div[@class="pfcontentmid"]/div[position()>4]|//div[@class="pfad"]
+test_url: http://www.politifact.com/truth-o-meter/statements/2011/may/30/barbara-boxer/barbara-boxer-says-medicare-overhead-far-lower-pri/
\ No newline at end of file
--- /dev/null
+# 21/10-2011:\r
+# Added Author+Date\r
+# Remove fakta-boks if found\r
+# Deleted 'Læs også...' filter \r
+# - Change in markup caused it to strip too much.\r
+\r
+author://span[@class='autor-name']\r
+date:substring-after(//div[@class='art-created'], ' ')\r
+title: //h1[contains(@class, 'stor-type')]\r
+body: //div[@id='art-body']\r
+strip: //div[@class='art-fakta article-box']\r
+
+test_url: http://politiken.dk/kultur/boger/skonlitteratur_boger/ECE1426386/makabre-tegneserie-zombier-aeder-alt-levende/
\ No newline at end of file
--- /dev/null
+next_page_link: //div[@id='longPagination']/a[@class='next']\r
+\r
+title: //div[@id='contentHeader']//h1\r
+\r
+body: //div[@id='articleBody']\r
+# this is so sad\r
+body: //div[@id='intelliTXT']
+test_url: http://www.popularmechanics.com/technology/aviation/crashes/what-really-happened-aboard-air-france-447-6611877
\ No newline at end of file
--- /dev/null
+title: //div[@id="newsDetailTitle"]\r
+author: //span[@id="showAuthor"]\r
+date: //span[@id="showRefDate"]\r
+\r
+strip: //div[@id="breadcrumbs"]\r
+strip: //span[@id="PageTitle"]\r
+strip: //div[@id="newsDetailAuthorPublish"]\r
+\r
+strip: //div[@class="leadPix"]\r
+\r
+strip: //span[@id="ctl00_PageTitle"]\r
+strip: //div[@id="newsDetailTitle"]\r
+convert_double_br_tags:yes\r
+\r
+strip: //div[@id="newsDetailCredential"]\r
+strip: //div[@id="sidebar2"]\r
+strip: //div[@id="footer"]\r
+
+test_url: http://www.positioningmag.com/magazine/details.aspx?id=41083
\ No newline at end of file
--- /dev/null
+title: //div[@class='story_headline']\r
+author: substring-before(substring-after(//div[@class='story_byline'],'By'),'/')\r
+date: //div[@class='story_lastupdate'] \r
+body: //div[@id='story']\r
+strip: //div[@class='story_byline']\r
+strip: //div[@class='story_lastupdate']\r
+strip: //div[@class='story_headline']\r
+strip: //div[@id='abuse']\r
+strip: //h2\r
+strip: //div[@class='pagenumbers_wrap']\r
+strip: //ul[@class='pagenumbers']\r
+strip: //div[starts-with(., 'To report inappropriate comments')]\r
+\r
+strip_id_or_class: story_share\r
+strip_id_or_class: OUTBRAIN\r
+strip_id_or_class: story_box_right\r
+strip: //div[a[@href='http://www.post-gazette.com/pg/12062/1213990-42.stm']]\r
+strip: //ul[@id='pikame']/li[position()>1]\r
+\r
+prune: no\r
+tidy: no\r
+\r
+single_page_link: //a[contains(@href, '?p=0')]\r
+\r
+test_url: http://www.post-gazette.com/stories/sports/penguins/pens-crosby-expects-to-return-thursday-226648/\r
+test_url: http://www.post-gazette.com/stories/sports/pirates/pirates-fork-over-changes-for-fans-at-pnc-park-629789
\ No newline at end of file
--- /dev/null
+title: //div[@id='divAdnetKeyword']/h1\r
+body: //div[@id='_middle_content_bottom']\r
+\r
+wrap_in(fieldset)://div[@id='_middle_content_bottom_child2']/img\r
+\r
+strip: //div[@id='_middle_content_bottom_child1']\r
+strip: //div[@id='_middle_content_bottom_child4']\r
+strip: //div[@class='cls']\r
+strip: //div[@class='iphoneBox']\r
+strip: //ul[@class='ilgiliHaber']\r
+strip: //div[@class='yorumlar']\r
+strip: //div[@class='kategoriler']\r
+strip: //div[@class='textSize']\r
+strip: //span[@class='tarih']
+test_url: http://www.posta.com.tr/yasam/teknoloji/HaberDetay/Fedailer_Istanbul_da.htm?ArticleID=101044
\ No newline at end of file
--- /dev/null
+title: //h1\r
+date: /html/head/meta[@name="date"]/@content\r
+body: //div[@id="featuredlinksbox"]\r
+strip: //div[@class="relatedbox"]\r
+strip: //h1\r
+strip: //br\r
+strip_image_src: "/images"
+test_url: http://www.prb.org/Journalists/Webcasts/2011/military-families.aspx
\ No newline at end of file
--- /dev/null
+title: //h1\r
+body: //div[@id='left']\r
+# Drop everything after the 'Previously' heading first. This rule needs that\r
+# h1 to still be in the document, so it must come before the blanket h1 strip\r
+# below (assuming strip rules are applied in file order - TODO confirm).\r
+strip: //h1[. = 'Previously']/following::*\r
+strip: //h1\r
+convert_double_br_tags: yes\r
+strip_id_or_class: entry-footer\r
+author: string('James Hague')\r
+date: //div[@class = 'entry-footer']/text()
+test_url: http://prog21.dadgum.com/105.html
\ No newline at end of file
--- /dev/null
+body: //div[@class='body']\r
+title: //h2[@class='title']\r
+date: //span[@class='posted-on']
+test_url: http://prolost.com/blog/2011/10/13/real-men-comp-with-film.html
\ No newline at end of file
--- /dev/null
+title: //h1[@class="article-title"]\r
+author: //meta[@name="author"]/@content\r
+body: //div[@class="article-full"]\r
+strip_id_or_class: sidebar_inject\r
+strip_id_or_class: callout\r
+strip_id_or_class: content-inset\r
+strip_id_or_class: byline-block\r
+strip_id_or_class: photo-caption\r
+strip_id_or_class: foot-tools\r
+
+test_url: http://www.propublica.org/article/pardon-applicants-benefit-from-friends-in-high-places
\ No newline at end of file
--- /dev/null
+author: //p[@class='name']\r
+date: substring-before(//p[@class='date'], ' | ')\r
+body: //div[@class='news_single_item']
+test_url: http://www.prosa.dk/aktuelt/nyhed/artikel/internetaktivisten-uden-maske/
\ No newline at end of file
--- /dev/null
+#basics\r
+author: (//div[contains(@class,'author')])[1]\r
+date: substring-before(//a[@class='issue'], '—')\r
+#body://div[@class = 'entry']\r
+# use this until move_into support is ready\r
+body: //div[@class = 'entry' or @class='standfirst' or @class='lead_image']\r
+\r
+#moves header image and tagline into body\r
+move_into(//div[@class='entry']/div)://div[@class = 'lead_image']\r
+move_into(//div[@class='entry']/div)://div[@class = 'standfirst']\r
+\r
+\r
+# moves author info to end of text\r
+move_into(//p[strong[string(.) = 'Follow Prospect on Twitter']])://div[@id='sidebar_content']/p/em\r
+\r
+prune: no\r
+\r
+# strips social links\r
+strip_id_or_class:login-status\r
+strip_id_or_class:shareinpost\r
+strip_id_or_class:content_subscribe\r
+strip_id_or_class:postinfo\r
+strip_id_or_class:postutils\r
+strip_id_or_class:comments\r
+strip://strong[string(.) = 'Follow Prospect on Twitter']\r
+test_url: http://www.prospectmagazine.co.uk/2011/07/postmodernism-is-dead-va-exhibition-age-of-authenticism/
\ No newline at end of file
--- /dev/null
+title: //div[@class="page-title"]/h1\r
+author: //a[@title="View Bio"]\r
+date: substring-before(substring-after(//span[@class="submitted"], 'Published on '), ' by')\r
+strip://div[@class="page-title"]/h1\r
+strip://div[@class="article-abstract"]\r
+strip://div[@class="article-meta"]\r
+strip://div[@id="rightColumn"]\r
+strip://div[@id="inline-content-bottom-left"]
+test_url: http://www.psychologytoday.com/blog/how-happiness/201205/my-quibble-facebook
\ No newline at end of file
--- /dev/null
+# <meta> elements have no text content; the value lives in the content\r
+# attribute, as in the other meta-based rules in this collection.\r
+author: //meta[@name="Author"]/@content\r
+date: //meta[@name="Date"]/@content\r
+strip: //h5
+test_url: http://www.publications.parliament.uk/pa/ld201011/ldhansrd/text/111109-0003.htm
\ No newline at end of file
--- /dev/null
+title: //div[@class='title']\r
+body: //div[@class='body']\r
+next_page_link: //div[@class='source']/text()[contains(., 'page')]/following-sibling::a
+test_url: http://purpleplanetmedia.com/eye/inte/ngaiman.php
\ No newline at end of file
--- /dev/null
+title: //div[contains(@class, "hentry")]/h3\r
+\r
+author: //div[contains(@class, "hentry")]/h2[contains(@class, "author_bio")]\r
+\r
+date: substring-before(substring-after(normalize-space(//p[contains(@class, "postmetadata")]/small), "was posted on "), " and is filed under")\r
+\r
+body: //div[contains(@class, "entry")]\r
+\r
+strip_id_or_class: addtoany_share_save_container\r
+strip_id_or_class: postmetadata\r
+strip_id_or_class: author_bio\r
+strip_id_or_class: author_bio_2\r
+strip: //div[contains(@class, "hentry")]/h3
+test_url: http://www.quantumdiaries.org/2011/10/25/piling-up/
\ No newline at end of file
--- /dev/null
+body: //div[@class='copy']\r
+title: //h1[@class='hed']
+test_url: http://www.queerty.com/rawhide-radicals-meet-five-heroes-from-the-leather-community-20120302/
\ No newline at end of file
--- /dev/null
+title: //h1\r
+\r
+body: //div[@class="cuerpoArticulo"]\r
+\r
+
+test_url: http://www.quepasa.cl/magazine/articulo/print.html?id=5299
\ No newline at end of file
--- /dev/null
+tidy: no\r
+prune: no\r
+body: //div[contains(@class, 'main_col')]\r
+title: //h1\r
+\r
+strip_id_or_class: hidden\r
+strip_id_or_class: item_action_bar\r
+strip_id_or_class: answer_voters\r
+strip_id_or_class: question_topics\r
+strip_id_or_class: answer_header_text\r
+strip_id_or_class: editor_link\r
+strip_id_or_class: view_tag\r
+strip_id_or_class: include_details\r
+strip_id_or_class: sig_edit\r
+strip_id_or_class: profile_photo_img\r
+
+test_url: http://www.quora.com/What-everyday-habit-do-you-wish-you-had-developed-earlier-in-life
\ No newline at end of file
--- /dev/null
+date://span[@class='date']\r
+body://div[@class='entry-body']
+test_url: http://radar.oreilly.com/2012/01/genome-cloud-digital-humanities-hadoop-world-strata.html
\ No newline at end of file
--- /dev/null
+body: //div[@class='body']\r
+title: //div[@class='newsstory']/h2
+test_url: http://www.radionz.co.nz/news/stories/2010/07/18/12481029a86d
\ No newline at end of file
--- /dev/null
+title: //div[@id='center-col']/h4\r
+author: substring-before(//title,'In')\r
+date: substring-after(//div[@class='commenttext']/span,'#')\r
+body: //div[@id='center-col']\r
+strip: //div[@id='center-col']/h4\r
+strip: //div[@class='graytext']\r
+\r
+# Anthony Perez-Sanz 2012.3.14\r
+# Removed long gif from the end\r
+strip: //img[@src='http://www.randsinrepose.com/spreader.gif']\r
+test_url: http://www.randsinrepose.com/archives/2012/03/13/hacking_is_important.html
\ No newline at end of file
--- /dev/null
+single_page_link: //link[@rel='canonical']/@href\r
+
+test_url: http://www.readability.com/read?url=http://feeds.gawker.com/~r/lifehacker/full/~3/jaxAjSay_Rw/add-a-rain-gutter-to-a-picnic-table-for-a-built+in-drink-cooler
\ No newline at end of file
--- /dev/null
+title: //h1[@class="titlelink"]\r
+date: //span[@class="timestamp"]/@data-published\r
+body: //div[@class="asset-content"]\r
+strip_id_or_class: related-entries\r
+strip_id_or_class: like-and-retweet\r
+\r
+author: //div[@id="submeta"]/a[1]\r
+test_url: http://www.readwriteweb.com/archives/why_facebook_terrifies_google.php
\ No newline at end of file
--- /dev/null
+body: //div[@id='_ctl12__ctl0_Article']\r
+prune: no\r
+autodetect_on_failure: no
\ No newline at end of file
--- /dev/null
+body: //div[@class='recipedetailsleft' or @id='recipePrepAndServe' or @id='recipeingredients']\r
+\r
+strip_id_or_class: location\r
+strip_id_or_class: savings\r
+strip_id_or_class: recipeDetailDescButton\r
+\r
+prune: no\r
+tidy: no\r
+\r
+test_url: http://www.recipe.com/avocado-basil-pasta/
\ No newline at end of file
--- /dev/null
+body: //div[@class='short-text' or starts-with(@id, 'news-id-')]\r
+prune: no\r
+tidy: no\r
+\r
+test_url: http://red-hot-girls.com/2011/06/10/the_red_hot_natalia_maria_53_pics.html
\ No newline at end of file
--- /dev/null
+# This setup grabs the text from a Reddit self post. It ignores all comments etc.\r
+\r
+title: //p[@class="title"]/a/text()\r
+\r
+author: //p[@class="tagline"]/a\r
+\r
+# this doesn't work for some reason...?\r
+date: //p[@class="tagline"]//@datetime\r
+\r
+body: //div[@class="expando"]//div[@class="usertext-body"]\r
+\r
+strip_id_or_class: tagline\r
+strip_id_or_class: unvotable-message\r
+strip_id_or_class: buttons\r
+\r
+test_url: http://www.reddit.com/r/truegaming/comments/wfe7r/i_wrote_about_the_problems_i_honestly_feel_that/
\ No newline at end of file
--- /dev/null
+title: //div[@class='posthead']//h2\r
+body: //div[contains(@class, 'postcontent') or @class='posthead']\r
+author: //div[@class='posthead']//a[@rel='author']\r
+\r
+strip: //div[@class='posthead']//h2\r
+replace_string(>Advertisements</div>): ></div>\r
+replace_string(<p>You can follow us on): <p style="display:none;">\r
+strip_id_or_class: likeThisPost\r
+\r
+prune: no\r
+tidy: no\r
+\r
+test_url: http://www.redmondpie.com/how-to-play-music-directly-from-home-screen-folders-on-iphone/
\ No newline at end of file
--- /dev/null
+# Think there might be something up with your parser that it strips out 'print' from the title :)\r
+\r
+# Metadata is read from the page's <meta> tags.\r
+title: //meta[@name='title']/@content\r
+author: //meta[@name='author']/@content\r
+date: //meta[@name='date']/@content\r
+\r
+body: //div[@class='articleText']\r
+\r
+# Remove date widgets, duplicate headline/byline, side columns and related-post boxes.\r
+strip: //div[contains(@class, 'day')]\r
+strip: //div[contains(@class, 'month')]\r
+strip: //div[contains(@class, 'year')]\r
+strip: //div[contains(@class, 'time')]\r
+strip: //h1[@class='gl_headline']\r
+strip: //div[@class='byline']\r
+strip: //div[@id='left_ear']\r
+strip: //div[@id='right_ear']\r
+strip: //div[contains(@class, 'PopularPosts')]\r
+strip: //div[@class='discuss_page_break']\r
+strip: //div[contains(@class, 'p-content_TagList')]
+test_url: http://redtape.msnbc.msn.com/_news/2011/09/28/8020661-sprint-raises-fee-but-wont-free-users-from-two-year-contracts?preview=true
\ No newline at end of file
--- /dev/null
+body://div[@class='storycontent']\r
+date://div[@class='date']\r
+strip://li[@class='sharing_label']\r
+strip://a[@class='FlattrButton']
+test_url: http://reflets.info/orange-nokia-siemens-deep-packet-inspection/
\ No newline at end of file
--- /dev/null
+title: //*[@class='entry-title']\r
+body: //div[@class='entry-content']
+test_url: http://www.renenekuda.cz/recept-na-produktivitu/
\ No newline at end of file
--- /dev/null
+single_page_link://a[contains(@href, 'print')]\r
+\r
+# Grab metadata from the "printer-friendly" page, after specifying single_page_link\r
+title://h2\r
+date://cite
+test_url: http://www.retrieverweekly.com/?cmd=displaystory&story_id=7548&format=html
\ No newline at end of file
--- /dev/null
+title: //h1[@class='headline3']\r
+author: substring-after(//p[@class="byline"], 'By ')\r
+date: //meta[@name="REVISION_DATE"]/@content\r
+body: //div[@id='articleImage' or @id='frame_fd1fade'] | //span[@id='articleText'] | //div[@class='pageNavigation']\r
+strip: //li[@class='next']\r
+strip: //span[@class='articleLocation']\r
+prune: no\r
+tidy: no\r
+\r
+test_url: http://www.reuters.com/article/2011/04/08/us-ivorycoast-killings-idUSTRE73732A20110408
\ No newline at end of file
--- /dev/null
+title: //div[@class="article_header"]/h3\r
+author: //div[@class="autor"]/p/*\r
+date: substring-after(substring-after(//div[@class="flt-left"],"> "), "> ")\r
+\r
+move_into(//div[@class="new_article"]): //div[@class="img_article"]/img\r
+\r
+body: //div[@class="article_content"]\r
+convert_double_br_tags: yes\r
+
+test_url: http://revistapiaui.estadao.com.br/edicao-68/questoes-latino-americanas/filhos-da-guerra-suja
\ No newline at end of file
--- /dev/null
+body: //div[@id="post"]\r
+strip: //div[@id="author-description"]\r
+date: //span[@class="entry-date"]\r
+author: //span[@class="author vcard"]
+test_url: http://richardmuscat.wordpress.com/2011/06/20/the-price-of-free/?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+TheBrooksReview+%28The+Brooks+Review%29
\ No newline at end of file
--- /dev/null
+body: //div[@class='post-body entry-content']\r
+strip: //div[@id='lws_0']\r
+prune: no\r
+\r
+test_url: http://ritemail.blogspot.com/2011/06/hayden-panettiere-candids-in-los.html
\ No newline at end of file
--- /dev/null
+title: //h2\r
+\r
+strip: //div[ contains(@class, 'respond') ] | //h2 | //h1\r
+\r
+date: substring-after(//p[@class='info'], ' on ')\r
+\r
+author: //p[@class='info']//a
+test_url: http://www.rockpapershotgun.com/2010/07/29/rps-half-verdict-starcraft-2/
\ No newline at end of file
--- /dev/null
+author: //article/header/span[@class='author']\r
+title://article/header/h1\r
+body: //article\r
+strip: //article/header\r
+strip: //article/p[@class='metadata']\r
+footnotes: yes
+test_url: http://rodrigo.sharpcube.com/2010/06/20/using-and-sharing-a-vpn-connection-on-your-mac/
\ No newline at end of file
--- /dev/null
+title: substring-before(//title,':')\r
+author: substring-after(substring-before(//div[@class='text']/b,'/'),'BY')\r
+\r
+body: //div[@class='text']\r
+\r
+strip: //a[contains(@href,'printart')]\r
+strip_id_or_class: enlarge_photo
+test_url: http://rogerebert.com/apps/pbcs.dll/article?AID=/20120411/REVIEWS/120419998/1005/GLOSSARY
\ No newline at end of file
--- /dev/null
+body: //div[contains(@class, 'inhoud')]\r
+date: //span[@class ='published']\r
+author: //span[@class ='author']\r
+strip: //div[@class = 'grid_2']\r
+strip: //div[@class = 'block-citation-text']\r
+test_url: http://www.rolfinjapan.nl/2011/06/duizend-kraanvogels/
\ No newline at end of file
--- /dev/null
+title: //h1
+author: //h3[@class="byline"]/strong
+
+body: //div[@id='main']/h2 | //div[@id='main']//div[@class='body']
+
+prune: no
+
+single_page_link: //a[@class='print-page']
+
+test_url: http://www.rollingstone.com/politics/news/the-plastic-bag-wars-20110725
\ No newline at end of file
--- /dev/null
+body: //div[@class='movie_content_area']\r
+strip_id_or_class: tomatometer_bar_help\r
+strip_id_or_class: critic-links\r
+strip_id_or_class: top-critics-numbers\r
+strip_id_or_class: fan_side\r
+strip_id_or_class: fblike\r
+strip_id_or_class: rating_widget\r
+strip_id_or_class: friend_reviews\r
+prune: no\r
+
+test_url: http://www.rottentomatoes.com/m/thor/
\ No newline at end of file
--- /dev/null
+body: //div[@class='content']\r
+strip: //p[@class='postmeta']/following::*\r
+strip: //p[@class='postmeta']\r
+strip: //p[@align='left']
+test_url: http://www.roughtype.com/archives/2012/01/power_to_the_da.php
\ No newline at end of file
--- /dev/null
+strip_comments: no
+test_url: http://roy.gbiv.com/untangled/2008/rest-apis-must-be-hypertext-driven
\ No newline at end of file
--- /dev/null
+body: //div[@id='news-text']\r
+prune: no\r
+test_url: http://www.rpgsite.net/news/1964-tetsuya-nomura-says-hell-soon-show-the-future-of-final-fantasy\r
+test_url: http://www.rpgsite.net/news/1965-new-atelier-totori-plus-screens-and-artwork
\ No newline at end of file
--- /dev/null
+author: //div[contains(@class, 'author_text')]/h4/text()\r
+date: //li[@class='date']\r
+\r
+# stripping excessive tags\r
+strip: //div[contains(@class, 'entry_meta')]\r
+strip: //div[contains(@class, 'single_meta')]\r
+strip: //br[contains(@class, 'clear')]\r
+strip: //h3[contains(., 'Komentarz')]
+test_url: http://rubysfera.pl/2011/09/10-porad-o-rvm/
\ No newline at end of file
--- /dev/null
+title: //h1[@class='entry-title']\r
+# '//' is the descendant shorthand; a third slash makes the path invalid.\r
+author: //span[@class='author vcard']\r
+date: //abbr[@class='published']\r
+body: //div[@class='entry-content']\r
+
+test_url: http://ruhlman.com/2009/05/cookbooks-that-teach/
\ No newline at end of file
--- /dev/null
+author: //a[@class='author']\r
+tidy: no
+test_url: http://ruttloff.org/2012/06/13/intervention
\ No newline at end of file
--- /dev/null
+title: //meta[@property='og:title']/@content\r
+author: (//span[@class="byline"]/a)[1]\r
+date: //span[contains(@class, "toLocalTime")]\r
+body: (//div[contains(@class, "articleInner")]//img[contains(@src, 'media.salon.com') and contains(@src, '460x')])[1] | //div[contains(@class, "articleContent") or contains(@class, "writerMeta")]\r
+\r
+prune: no\r
+\r
+# deal with singleton links\r
+single_page_link: (//h1/a[contains(@href, '/singleton')])[1]\r
+\r
+test_url: http://www.salon.com/2011/10/25/occupying_the_rust_belt/singleton/
\ No newline at end of file
--- /dev/null
+body: //p[@class='teaser1 darkgrey myriad']\r
+move_into(//p[@class='teaser1 darkgrey myriad']): //div[@class='artikel clear']\r
+strip: //div[@class='hidden']\r
+strip: //div[@id='article_related_source']\r
+\r
+test_url: http://www.salzburg.com/nachrichten/oesterreich/politik/sn/artikel/deutliche-nachbesserungen-bei-lehrerdienstrecht-19469/
\ No newline at end of file
--- /dev/null
+title://h1\r
+\r
+# my section divs seem to interfere with the Instapaper parser, so I ditch 'em\r
+dissolve://div[contains(@class, 'section')]\r
+\r
+#these don't seem to be necessary, but just in case\r
+strip_id_or_class:'masthead'\r
+strip_id_or_class:'footer'\r
+\r
+#again, Instapaper seems to understand where my content is, but just in case\r
+body://div[@id='content']\r
+\r
+# in general, I want the Instapaper view to look like my print CSS, so I remove things specified for the screen or non-printing\r
+strip_id_or_class:'screen-only'\r
+strip_id_or_class:'no-print'\r
+\r
+#other misc removals and simplifications\r
+strip_id_or_class:'popup'\r
+strip_id_or_class:'ZoomSpin'\r
+\r
+#I have a lot of content in sidebars and "meta" asides that can work inline just fine, but has to be distinguished somehow with some minimal formatting, so I put them in blockquotes\r
+wrap_in(blockquote)://div[contains(@class, 'sidebar')]\r
+wrap_in(blockquote)://div[contains(@class, 'meta')]\r
+wrap_in(blockquote)://p[contains(@class, 'meta')]
+test_url: http://saveyourself.ca/tutorials/low-back-pain.php
\ No newline at end of file
--- /dev/null
+title: //h1[@id='stream_title']\r
+\r
+# Author and date don't work\r
+author: //div[@class='byline']\r
+date: //div[@class='date-stamp']\r
+\r
+body: //div[@class='node-article']\r
+\r
+strip_id_or_class: fb-like-box\r
+strip_id_or_class: stream-fb-like\r
+strip_id_or_class: social-meta\r
+strip_id_or_class: social-spoken\r
+strip_id_or_class: twitter-share-button\r
+strip_id_or_class: twitter-follow-button\r
+strip_id_or_class: spinner_node_list\r
+strip_id_or_class: node-sort-link\r
+strip_id_or_class: stream_title\r
+strip_id_or_class: stream_summary\r
+strip_id_or_class: update-count-container\r
+strip_id_or_class: major-updates\r
+strip_id_or_class: newsletter-slide\r
+strip_id_or_class: author-mini-profile\r
+strip_id_or_class: byline\r
+strip_id_or_class: header\r
+strip_id_or_class: footer\r
+\r
+# Works, but "no text" errors on: http://www.sbnation.com/nba/2012/3/9/2856780/nba-scores-dwight-howard-bulls-magic-mavs-suns
+test_url: http://www.sbnation.com/nba/2012/3/13/2867226/dwight-howard-trade-rumors-2012-faq-orlando-magic
\ No newline at end of file
--- /dev/null
+author: //p[@class='mastname']\r
+\r
+body: //div[@class='indivbody']\r
+date: //div[@class='indivbody']/h2[1]\r
+\r
+# Remove blog title. Specify first occurrence in case h1 is used in article\r
+strip: //div[@class='indivbody']/h1[1]\r
+\r
+# Remove navigation (second p element). Listed before the first-p rule on\r
+# purpose: if strip rules are applied one after another, removing p[1] first\r
+# would renumber the paragraphs and p[2] would then hit article text.\r
+strip: //div[@class='indivbody']/p[2]\r
+\r
+# Remove blog description (the first p element)\r
+strip: //div[@class='indivbody']/p[1]\r
+\r
+# Remove duplicate of article title. Specify first occurrence in case h3 is used in article\r
+strip: //div[@class='indivbody']/h3[1]\r
+\r
+# Remove publishing date, it's extracted by rule above\r
+strip: //div[@class='indivbody']/h2[1]\r
+\r
+# Remove duplicate of date at end, and newsletter signup\r
+strip: //p[@class='posted']\r
+\r
+# Leave date at top\r
+test_url: http://www.schneier.com/blog/archives/2010/12/security_in_202.html
\ No newline at end of file
--- /dev/null
+body: //div[@class="storybox"]\r
+title: //div[@class="storybox"]//h1\r
+strip: //p[@class='metaline']\r
+date: substring-after(//*[@class='time'],'Erstellt am')\r
+strip: //div[@class='fact']\r
+strip: //p[@class='backlink']\r
+strip: //div[@class='mailto']\r
+strip: //div[@id='forumDisclaimer']\r
+strip: //div[@class='forum']\r
+
+test_url: http://science.orf.at/stories/1700900/
\ No newline at end of file
--- /dev/null
+single_page_link: //div[@class='c2c1']/div[@class='toptheme further line']//ul//li/a\r
+\r
+author: //div[@class='details clear']//a[@class='hi']\r
+body: //div[@class='title']\r
+strip: //p[@class='entrypagination']\r
+strip: //p[@class='details_top']\r
+date: //p[@class='details_top']\r
+title: //div[@class='title']/h1\r
+strip: //p[@class='details']\r
+strip: //p[@class='details_bottom']\r
+
+test_url: http://www.scienceblogs.de/astrodicticum-simplex/2011/10/weltuntergang-reloaded-das-jungste-gericht-findet-am-21-oktober-statt.php
\ No newline at end of file
--- /dev/null
+body: //div[@class='post']\r
+title: //h1[@id='singlePageTitle']\r
+date: substring-before(//small,'• Rubrik')\r
+\r
+strip: //div[@class='post-ratings']\r
+strip: //div[@class='post-ratings-loading']\r
+strip: //a[@title='Empfehlen Sie den Text weiter!']\r
+strip: //a[@title='Drucken']\r
+strip: //div[@class='share']\r
+
+test_url: http://www.scienceticker.info/2011/11/24/forscher-finden-gedachtnismolekul/
\ No newline at end of file
--- /dev/null
+#\r
+# After site revisions at SciAm, this configuration does\r
+# not work, especially for multi-page articles. For\r
+# every article there is now a "Print" link which\r
+# is far more reliable. So this configuration should be\r
+# removed or disabled.\r
+# 2/3/13\r
+#\r
+\r
+# meta data\r
+title://h1[@class = 'articleTitle']\r
+author:substring-after(//span[@class = 'byline'],'By ')\r
+date:substring-before(//span[@class = 'datestamp'],'|')\r
+\r
+#body content\r
+body://div[@id = 'articleContent']\r
+#next_page_link://li[@id = 'flairPagination']/a[last()]\r
+\r
+single_page_link: //a[contains(@href, 'print=true')]\r
+\r
+#cleanup\r
+strip://div[@class = 'fsgBooks']\r
+\r
+test_url: http://www.scientificamerican.com/article.cfm?id=do-brain-scans-comatose-patients-reveal-conscious-state\r
+test_url: http://www.scientificamerican.com/article.cfm?id=solar-wind-transforms-venus-into-shape-of-comet
\ No newline at end of file
--- /dev/null
+title: //title\r
+author: //p[@id='author-name-role']/a\r
+date: substring-after(//p[@class='time'],'Posted')\r
+body: //div[@id='main']\r
+strip: //div[@id='author-info']\r
+strip: //div[@id='author-links']\r
+strip: //h1
+test_url: http://www.scotusblog.com/2012/04/shaken-baby-case-an-update/
\ No newline at end of file
--- /dev/null
+title: //h2\r
+body: //div[@class='body']
+test_url: http://scraplab.net/2010/10/26/please-keep-your-belongings-with-you-at-all-times/
\ No newline at end of file
--- /dev/null
+strip: //a[starts-with(@href, '#')]\r
+strip: //*[@class='storyByline']\r
+body: //*[@class='storyPageText']/..\r
+author: string('Dave Winer')\r
+date: substring-before(substring-after(//*[@class='storyByline'], 'on'), 'at')\r
+title: //h1\r
+footnotes: no
+test_url: http://scripting.com/stories/2011/07/08/yeahImStillYawning.html
\ No newline at end of file
--- /dev/null
+body: //*[@class="entry-content"]\r
+title: //h1[@class="entry-title"]\r
+date: //*[@class="entry-date"]\r
+author: //*[@class="author vcard"]
+test_url: http://sct.temple.edu/blogs/news-events/2011/05/congratulations-sct-class-of-2011/
\ No newline at end of file
--- /dev/null
+body: //div[@class="storyBox"]\r
+title: //div[@class="storyBox"]/h1\r
+author: //a[@rel="author"]\r
+date: substring-before(//span[@class="dateline"], 'by')\r
+\r
+#Removes related content but cleans up article text\r
+strip: //h1\r
+strip: //p[@class="homeStory tdmSideInfo"]\r
+strip: //div[@id="bylineShare"]\r
+strip: //script\r
+strip: //hr\r
+\r
+strip_id_or_class: homeStory\r
+strip_id_or_class: authorpic\r
+strip_id_or_class: insideComments\r
+strip_id_or_class: authorbio\r
+strip_id_or_class: gpt-ad-sel-cube\r
+strip_id_or_class: smxTextAd\r
+
+test_url: http://searchengineland.com/googles-jaw-dropping-sponsored-post-campaign-for-chrome-106348
\ No newline at end of file
--- /dev/null
+title: //h3[@class="storytitle"]
+date: //div[@class='meta']
+body: //div[@class='storycontent']
+
+test_url: http://seattletransitblog.com/2012/06/19/times-st-louis-interested-in-buying-waterfront-streetcars/
\ No newline at end of file
--- /dev/null
+title: substring-before(//title, '«')\r
+body: //div[@class = 'entry']\r
+strip_id_or_class: 'postmetabox'
+test_url: http://sebbo.net/2010/12/akkus/
\ No newline at end of file
--- /dev/null
+body: //div[@id='content']\r
+\r
+# clean up recipe pages\r
+strip: //h2[@class='fn'] | //h2[@class='double-lined'] | //h3 | //div[@id='threeColumn2'] | //div[@id='threeColumn3']\r
+\r
+#recipe pages\r
+strip_id_or_class: "recipe-feedback"\r
+strip_id_or_class: "comments"\r
+strip_id_or_class: "procedure-number"\r
+strip_id_or_class: "more-with-author"\r
+\r
+#slice\r
+strip_id_or_class: "inner"\r
+
+test_url: http://www.seriouseats.com/recipes/2010/09/peking-duck-mandarin-pancakes-plum-sauce-recipe.html
\ No newline at end of file
--- /dev/null
+title: //h1[@class='post-title']\r
+author: //div[@class='post-byline']/a\r
+date: substring-before(//div[@class='post-byline'], ', by')\r
+\r
+body: //div[@class='post-body']\r
+dissolve: //noscript
+test_url: http://sf.curbed.com/archives/2011/10/17/lower_haight_loft_would_really_really_really_like_a_buyer.php
\ No newline at end of file
--- /dev/null
+title: //h1[@class="post-title"]\r
+author: //div[@class="post-byline"]/a\r
+date: substring-before(//div[@class='post-byline'], ', by')\r
+\r
+body: //div[@class='post-body']\r
+strip_id_or_class: post-kicker
+test_url: http://sf.eater.com/archives/2012/05/22/nate_pollack_talks_about_the_american_grilled_cheese_kitchen_moving_into_the_mission.php
\ No newline at end of file
--- /dev/null
+title: /html/head/title\r
+\r
+body: //div[@id = 'articlecontent']/div[contains(@class, 'bodytext')]\r
+author: //div[@class = 'articleheadings']/p[contains(@class,'author')]/span[@class = 'fn']\r
+date: //div[@class = 'articleheadings']/span[@class = 'updated']\r
+strip: //div[div[contains(@class, 'imgbox')]]\r
+\r
+body: //div[@class = 'blogitem']\r
+author: //p[@class="credit"]/span[@class="author"]/a[position() = 1]\r
+date: //span[@class = 'pubdate']\r
+
+test_url: http://www.sfgate.com/columnists/garchik/
\ No newline at end of file
--- /dev/null
+body: //div[contains(@class, 'content_body')]\r
+strip_id_or_class: det_rel
+test_url: http://www.sfweekly.com/2012-03-14/news/cia-lsd-wayne-ritchie-george-h-white-mk-ultra/
\ No newline at end of file
--- /dev/null
+date: //span[@class='date']\r
+body: //div[@class='post_content']
+test_url: http://www.shabayek.com/blog/2011/10/16/%D8%AF%D8%B1%D9%88%D8%B3-%D9%85%D9%86-%D9%82%D8%B5%D8%A9-%D8%AA%D8%A3%D8%B3%D9%8A%D8%B3-%D8%AA%D9%88%D9%8A%D8%AA%D8%B1-%E2%80%93%D8%AC3/
\ No newline at end of file
--- /dev/null
+title://*[@class='primary']/h1\r
+date: //*[@class='articledate']\r
+author: substring-before(substring-after(//*[@class='block first']/p,'2012 '),'.')\r
+body: //div[@class='primary']\r
+footnotes: yes\r
+strip: //*[@class='primary']/h1\r
+strip: //*[@class='articledate']\r
+strip: //*[@class='detailsarticle']\r
+strip: //*[@class='endnav']\r
+strip: //*[@class='endmeta']\r
+test_url: http://shawnblanc.net/2011/11/kindle-touch-review/
\ No newline at end of file
--- /dev/null
+body: //div[ @class='entry-content' ]\r
+\r
+strip: //div[ contains(@class, 'sharing') ]\r
+\r
+date: //div[ @class='entry-meta' ]/a
+test_url: http://shifteleven.com/articles/2008/05/10/issue-tracking-git-ticgit
\ No newline at end of file
--- /dev/null
+#body: (//div[@class='ftr-yt-vid'])[1]\r
+body: (//blockquote[contains(@class, 'postcontent')])[1]\r
+body: (//div[starts-with(@id, 'post_message')])[1]\r
+\r
+prune: no\r
+tidy: no\r
+\r
+#replace_string(<iframe title="YouTube video player"): <div class="ftr-yt-vid"><iframe title="YouTube video player"\r
+#replace_string(</iframe>): </iframe> </div>\r
+\r
+test_url: http://www.siasat.pk/forum/showthread.php?107668-Policy-Matters-17th-March-2012-Dr-Shahid-Masood-Gen-Hameed-gul-amp-Fawad-Chudhary-Pak-US-Relationship&p=787733
\ No newline at end of file
--- /dev/null
+body: //div[contains(@class, "entry")]\r
+\r
+date: //div[contains(@class, "entryFooter")]/a\r
+
+test_url: http://simonwillison.net/2009/Oct/22/redis/
\ No newline at end of file
--- /dev/null
+body: //div[@class='post-body']\r
+strip: //div[@id='lws_0']\r
+prune: no\r
+
+test_url: http://singaporeanstocksinvestor.blogspot.com/2011/04/aims-amp-capital-industrial-reit.html
\ No newline at end of file
--- /dev/null
+body://div[contains(@class,"entry-content")]
+test_url: http://singularityhub.com/2011/05/21/google-invades-your-home-android-phones-control-your-appliances-and-accessories-video/
\ No newline at end of file
--- /dev/null
+title: //div[@class='headline']//h2\r
+body: //div[contains(@class, 'storycontent')]\r
+\r
+prune: no\r
+\r
+test_url: http://sintagoulis.gr/sokolatenia/sokolatenia-mpompa-me-amaretti-
\ No newline at end of file
--- /dev/null
+title: substring-before(//title,'| /Film')\r
+date: substring-before(substring-after(//p[@class='post-info'],'Posted on '),'by')\r
+strip: //div[@class='pm-left']\r
+strip: //div[@class='pm-right']\r
+strip: //h2/span\r
+next_page_link: //h2/strong/a\r
+strip: //h2/strong/a\r
+strip: //p[contains(text(),'we have to split this post over')]\r
+strip: //p[@class='post-info']\r
+strip: //h1/a\r
+strip: //img[contains(@src,'siteimages/authors')]\r
+strip: //div[@id='header']\r
+strip: //div[@class='topad-right']\r
+strip: //strong[contains(text(),'Cool Posts From Around the Web:')]\r
+test_url: http://www.slashfilm.com/superhero-bits-206/
\ No newline at end of file
--- /dev/null
+title: //h1[@class="sl-art-head-dek"]\r
+body: //article//div[@class='sl-art-body']/div[contains(@class, 'body')]\r
+strip: //div[@class="department_kicker"]\r
+strip: //div[@id="insider_ad_wrapper" or @id="insider_ad_inner"]\r
+strip: //div[@id="bottom_sponsored_links"]\r
+strip: //div[@class="sl-art-ad-midflex"]\r
+#strip: //dl\r
+#strip: //p[em/a[contains(@href, 'facebook.com')]]\r
+prune: no\r
+\r
+author: //div[@id='author_bio']//a[contains(@href, '/author/')]\r
+author: //a[contains(@href, '/authors.')]\r
+\r
+date: substring-before(substring-after(//span[@class='sl-art-byline'], 'Posted '), ', at ')\r
+\r
+single_page_link: //a[@class='sl-art-sinpage']\r
+\r
+test_url: http://www.slate.com/id/2274583/pagenum/all/\r
+test_url: http://www.slate.com/id/2293116/
\ No newline at end of file
--- /dev/null
+body: //div[@id='content']\r
+\r
+# clean up recipe pages\r
+strip: //h2[@class='fn'] | //h2[@class='double-lined'] | //h3 | //div[@id='threeColumn2'] | //div[@id='threeColumn3']\r
+\r
+#recipe pages\r
+strip_id_or_class: "recipe-feedback"\r
+strip_id_or_class: "comments"\r
+strip_id_or_class: "procedure-number"\r
+strip_id_or_class: "more-with-author"\r
+\r
+#slice\r
+strip_id_or_class: "inner"\r
+
+test_url: http://slice.seriouseats.com/archives/2010/10/the-pizza-lab-how-to-make-great-new-york-style-pizza.html
\ No newline at end of file
--- /dev/null
+strip_id_or_class: postCategory\r
+title: //h3[@class='postTitle']\r
+body: //div[@class='postBody']
+test_url: http://slog.thestranger.com/slog/archives/2010/10/12/sl-letter-of-the-day-leave-it-alone
\ No newline at end of file
--- /dev/null
+title: //td[@class='hweissblau2']\r
+body: //p[@class='copy'] | //div[@class='Section1']\r
+prune: no\r
+
+test_url: http://www.smartinvestor.de/news/smartinvestor/detail.hbs?itemid=item949496655&recnr=14593
\ No newline at end of file
--- /dev/null
+title: //meta[@property='og:title']/@content\r
+date: //p[@class='autor_line']/b/text()
+test_url: http://www.sme.sk/c/6268206/lipsic-vidi-malcharkove-uplatky.html
\ No newline at end of file
--- /dev/null
+# meta data\r
+title://h1[@id = 'articleTitle']\r
+author:substring-after(//ul[@id = 'byLine']/li[1],'By ')\r
+date:substring-before(substring-after(//ul[@id = 'byLine']/li[last()],','),',')\r
+body://div[@id = 'article-body']\r
+\r
+# full content\r
+single_page_link://td/li[@class = 'article-singlepage']/a\r
+\r
+# caption clean up\r
+wrap_in(i)://span[@class='articleImageCaptionwide']\r
+move_into(//span[@class='articleImageCaptionwide'])://div[@id = 'articleImage']/p\r
+\r
+\r
+# clean up\r
+strip://p[@id = 'articlePaginationWrapper']\r
+strip://ul[contains(@class, 'cat-breadcrumb')]\r
+strip://div [@class= 'viewMorePhotos']\r
+
+test_url: http://www.smithsonianmag.com/history-archaeology/The-Goddess-Goes-Home.html
\ No newline at end of file
--- /dev/null
+title: //h2[@class='custom-entry-title']\r
+author: substring-after(//span[@class='author vcard'],'by ')\r
+date: substring-after(//span[@class='publ'],'Published on ')\r
+body: //div[@class='postentry-content']\r
+test_url: http://smokingapples.com/software/popclip-for-mac/
\ No newline at end of file
--- /dev/null
+#grab the actual content div\r
+body: //div[@class='rt-article']\r
+\r
+test_url: http://www.sourcebooks.com/next/sourcebooks-next-our-blog/1601-another-piece-of-the-e-puzzle-or-when-good-ebook-promotions-go-bad.html
\ No newline at end of file
--- /dev/null
+author: /html/body/div[@id='wrapper']/div[@id='main-content']/div[@class='article_body']/a[@class='author-link']/child::text()\r
+\r
+body: /html/body/div[@id='wrapper']/div[@id='main-content']/div[@class='article_body']\r
+\r
+# Not very helpful: the title and author are contained in the same element that contains the body, so strip them from it\r
+strip: /html/body/div[@id='wrapper']/div[@id='main-content']/div[@class='article_body']/h2 | /html/body/div[@id='wrapper']/div[@id='main-content']/div[@class='article_body']/a[@class='author-link']
+test_url: http://www.spectator.co.uk/arts-and-culture/night-and-day/7449683/spotify-sunday-my-personal-soundtrack.thtml
\ No newline at end of file
--- /dev/null
+body://div[@class="articleBody"]\r
+author://p[@class="articleBodyTtl"]
+test_url: http://spectrum.ieee.org/semiconductors/processors/behind-intels-new-randomnumber-generator/
\ No newline at end of file
--- /dev/null
+body://div[@class="body"]
+test_url: http://speirs.org/blog/2011/5/5/ipad-trials-at-oklahoma-state.html
\ No newline at end of file
--- /dev/null
+# A. Niepel, narya.de@...\r
+# - added single_page_link\r
+# - added author for default and single page view\r
+# - added date for single page view\r
+# fforst@...\r
+# - Fixed it\r
+# bode2104@...\r
+# - Fixed single_page_link\r
+# - Included intro text in single page view\r
+# - Added body in default view\r
+\r
+# set body\r
+tidy: no\r
+# body in single page view\r
+body: //div[@id="spArticleContent"]\r
+# body in default view\r
+body: //div[@id="spArticleSection"]\r
+# body in "Fotostrecke"\r
+body: //div[@id="spBigaContent"]\r
+\r
+# set date in single page view\r
+date: //div[@id="spArticleContent"]/h3\r
+# strip date\r
+strip: //div[@id="spArticleContent"]/h3\r
+# set date in "Fotostrecke"\r
+date: //div[@id="spBigaDatum"]\r
+\r
+#set title in single page view\r
+title: //div[@id='spArticleContent']/h2\r
+# strip title\r
+strip: //div[@id='spArticleContent']/h1\r
+strip: //div[@id='spArticleContent']/h2\r
+#set title in "Fotostrecke"\r
+title: //div[@class='spBigaHeadline']\r
+\r
+# set author\r
+author: //p[@class="spAuthor"]/a\r
+author: substring-after(//p[@class="spAuthor"], 'Von ')\r
+# strip author\r
+strip: //p[@class='spAuthor']\r
+\r
+# remove captions\r
+strip: //*/span[@class='spPicLayerText']\r
+strip: //*/div[@class='spPanoPlayerPaneControl']\r
+strip: //*/div[@class='spCredit']\r
+strip: //*/div[@class='spCredit']/following-sibling::p\r
+\r
+# remove ads\r
+strip: //div[@class='spMInline']\r
+\r
+# remove photogalleries and extras\r
+strip: //div[@class='spPhotoGallery']\r
+strip: //div[@class='spPhotoGallery']/following-sibling::br\r
+strip: //div[@class='spAssetAlignleft']\r
+strip: //div[contains(@class,'spAsset')]\r
+strip: //br[@clear='all']\r
+\r
+# remove community functions\r
+strip: //div[@id='spSocialBookmark']\r
+strip: //div[contains(@class, 'spCommunityBox')]\r
+strip: //div[contains(@class, 'spArticleNewsfeedBox')]\r
+strip: //div[@class='spArticleCredit']\r
+\r
+# remove clutter in "Fotostrecke"\r
+strip: //div[@id='spBreadcrumb']\r
+strip: //div[@id='spBigaLatestEntries']\r
+strip: //div[contains(@class, 'spBigaNavi')]\r
+strip: //div[@class='spDottedLine']\r
+\r
+# Use link to print article for single page view\r
+single_page_link: //a[contains(@href, '-druck')]\r
+\r
+# use next link in "Fotostrecke"\r
+next_page_link: //a[@class='spBigaControlForw']\r
+test_url: http://www.spiegel.de/politik/deutschland/0,1518,787602,00.html
\ No newline at end of file
--- /dev/null
+tidy: no\r
+body: //section[contains(@class, 'main')]\r
+strip: //footer\r
+strip: //a[@class='paginated']
+test_url: http://www.spin.com/articles/bathlands-deep-heart-americas-new-drug-nightmare
\ No newline at end of file
--- /dev/null
+author:string('Dan Frommer/SplatF')\r
+date://div[@class='postdate']\r
+body://div[@class='entry']\r
+title://div[@class='post']/h1
+test_url: http://www.splatf.com/2012/02/month-six/
\ No newline at end of file
--- /dev/null
+author: //div[@class='byline']/a\r
+date: //div[@id='date']\r
+body: //div[@class='entry']
+test_url: http://splitsider.com/2011/10/saturday-nights-children-rob-riggle-2004-2005/
\ No newline at end of file
--- /dev/null
+title://div[@class="content_detail"]/h1\r
+\r
+author://div[@class="author"]/strong\r
+\r
+date:substring-before(substring-after(//div[@class="content_detail"]/*[@class="date"], ','), ' WIB')\r
+\r
+body://div[@class='text_detail']
+test_url: http://sport.detik.com/sepakbola/read/2012/05/23/065011/1922350/71/agen-silva-ingin-bertahan-di-milan?b99220270
\ No newline at end of file
--- /dev/null
+single_page_link: //div[@id='content']//p[@class='readMore']/a\r
+\r
+title: //div[@class='hidden offscreen']/h2\r
+body: //div[@id="storyText"]\r
+move_into(//div[@id='storyText']): //div[@class='fact']\r
+strip: //small[@class='credit']\r
+strip: //small[@class='caption']\r
+date: substring-after(//div[@id='storyMeta']//p[@class='date'],'Publiziert am')\r
+strip: //p[@class='toplink']\r
+
+test_url: http://sport.orf.at/stories/2084851/
\ No newline at end of file
--- /dev/null
+title: //div[@class='headline'] | //div[@class='mod-header']/h3\r
+body: //div[contains(@class, 'article')]\r
+strip: //div[contains(@class, 'mod-inline')]\r
+strip: //*/span[@class='page-actions']/a\r
+strip: //*/span[@class='page-actions']/a\r
+strip: //div[@class='page-actions']/*\r
+strip: //div[@class='headline'] | //div[@class='mod-header']/h3\r
+strip: //div[@class='mod-blog-navigation']\r
+strip: //div[@class='monthday']\r
+strip: //div[@class='time']\r
+strip: //div[@class='timeofday']\r
+test_url: http://sports.espn.go.com/espn/page2/story?page=simmonsnfl2010/lebron_james_return_clevelend&sportCat=nba
\ No newline at end of file
--- /dev/null
+title: //div[@id='article']/div[@class='hd']/h1\r
+body: //p[@id='byline'] | //div[@id='article']//div[@class='body_copy 0']\r
+strip: //div[@class='foot']\r
+strip: //div[@id='sidebar']//div[@class='ft']\r
+strip: //p[@id='byline']//em\r
+tidy: no\r
+prune: no\r
+
+test_url: http://sports.yahoo.com/nba/news?slug=ap-nbafinals
\ No newline at end of file
--- /dev/null
+title://div[@id='ardContent']/h1\r
+\r
+author://p[@id='ardAutor']\r
+author://span[@id='ardQuelle']\r
+author:string('sportschau.de')\r
+\r
+date:substring-after(//span[@id='ardStand'], 'Stand: ')\r
+\r
+body://div[@id='ardContent']\r
+\r
+strip://div[@id='ardContent']/h1\r
+strip://p[@id='ardAutor']\r
+strip: //div[@class='embeddedPlayer_clipinfo']\r
+strip: //div[@class='ardMehrZumThemaRechts']\r
+strip: //*[contains(@class, 'inv')]\r
+\r
+strip: //p[@id='ardAbbinder']\r
+strip: //div[@class='socialBookmarks']\r
+strip: //div[@id='ardContentEnd']\r
+strip: //div[@id='ardDisclaimer']\r
+strip: //div[@id='ardRechteSpalte']
+test_url: http://www.sportschau.de/sp/fussball/news201203/17/analyse_leverkusen_gladbach.jsp
\ No newline at end of file
--- /dev/null
+# main sportsillustrated.com articles\r
+#\r
+body: //div[@id="cnnStoryContent"]\r
+title: //div[@id="cnnStoryHeadline"]//h1\r
+author: //div[@id="cnnSubBanner"]//strong\r
+date: substring-after(//div[@id="cnnTimeStamp"], "Updated: ")\r
+date: substring-after(//div[@id="cnnTimeStamp"], "Posted: ")\r
+\r
+# kill ugly font buttons\r
+strip: //div[@id="cnnSCFontButtons"]\r
+\r
+# kill misc filler videos & etc\r
+strip: //div[@class="cnnDivideContent"]\r
+strip: //*[@class="cnnTMbox"]\r
+\r
+# si vault articles\r
+# -------------\r
+body: //div[@class="siv_artPara"]\r
+title: //div[@class="siv_artHeader"]//h1\r
+author: //div[@class="byline"]\r
+date: //div[@class="date"]\r
+\r
+next_page_link: //div[@id='cnnStoryContinue']/a\r
+strip_id_or_class: cnnstorypagination\r
+\r
+test_url: http://sportsillustrated.cnn.com/2012/writers/peter_king/02/27/combine/index.html
\ No newline at end of file
--- /dev/null
+title: //h2\r
+author: string('Michael Spreng')\r
+date: //div[@class='date']\r
+body: //div[@class='entry']
+test_url: http://www.sprengsatz.de/?p=3691
\ No newline at end of file
--- /dev/null
+body: //div[@id='ff-body']\r
+\r
+replace_string(<h1 align=center>): <div id="ff-body"><h1 align=center>\r
+\r
+prune: no\r
+\r
+test_url: http://www.sqlite.org/fileformat2.html
\ No newline at end of file
--- /dev/null
+body: //div[@class='content']\r
+date: substring-before( //div[@class='unit dateAndNotes'], 'with')\r
+title: //h3
+test_url: http://squashed.tumblr.com/post/17613522228/lets-stop-blaming-the-victims-of-predatory-lending
\ No newline at end of file
--- /dev/null
+body: //div[@class='post-text' or @class='user-action-time' or @class='user-details' or @class='vote'] | //div[@id='answers-header']//h2\r
+\r
+replace_string(<div class="user-details"><br></div>): <!-- nothing -->\r
+replace_string(<div class="vote">): <div class="vote"><h3>Vote count: \r
+\r
+strip_id_or_class: vote-up\r
+strip_id_or_class: vote-down\r
+strip_id_or_class: star-off\r
+strip_id_or_class: favoritecount\r
+strip_id_or_class: -share\r
+strip_id_or_class: badgecount\r
+\r
+
+test_url: http://stackoverflow.com/questions/4484289/id-like-to-understand-the-jquery-plugin-syntax
\ No newline at end of file
--- /dev/null
+title: //div[@class='articleLeft']/h3\r
+\r
+author: substring-after(//span[@class='articleAuthor']/a,'By ')\r
+\r
+date: substring-before(//span[@class='articleDateTime'],'in ')\r
+\r
+body: //div[@class='articleLeft']\r
+strip: //div[@class='articleMoreNews']\r
+strip: //div[@class='articleLeft']/h3\r
+strip: //div[@class='articleLeft']/p[@class='articleInfo clearfix']\r
+\r
+# Remove duplicate title from text\r
+strip: //div[@id='site']/div[5][@class='holder']/div[1][@class='hBlock']/div[1][@class='sglCol article']/h3\r
+test_url: http://www.stalbansreview.co.uk/news/9581446.New_roundabout_in_King_Harry_Lane/r/?ref=rss
\ No newline at end of file
--- /dev/null
+autodetect_next_page: no\r
+footnotes: no\r
+dissolve: //div[@class="column-2"]//div[@class="widget"]\r
+dissolve: //div[@class="column-2"]//div\r
+\r
+author: //div[@class="innerbyline"]/a\r
+strip: //div[@class="innerbyline"]/a\r
+\r
+strip: //p[@class="dateline"]\r
+date: //p[@class="dateline"]\r
+\r
+title: //h1[@class="title"]\r
+author: //div[@class="innerbyline"]/a\r
+date: //p[@class="dateline"]\r
+body: //div[@class="column-2"]
+test_url: http://www.standard.co.uk/lifestyle/esmagazine/grace-and-flavour-pizarro-7938350.html
\ No newline at end of file
--- /dev/null
+title: //h1[@id='storyTitle']\r
+author: substring-after(//span[@class='hsa_postCredit'], 'By ') \r
+date://span[@class='hsa_dateStamp']\r
+body: //div[@class='storytext']\r
+strip_id_or_class: insideStoryAd \r
+strip_id_or_class: printDesc\r
+strip_id_or_class: sb_2010_story_tools\r
+strip_id_or_class: FBConnectButton_Text\r
+strip_id_or_class: breadcrumbs\r
+prune: no\r
+test_url: http://www.staradvertiser.com/news/20111112_World_leaders_step_onto_isle_stage.html
\ No newline at end of file
--- /dev/null
+title: /html/head/meta[@name='title']/@content\r
+author: //span[contains(concat(' ',normalize-space(@class),' '),' article_author ')]/a\r
+date: //span[contains(concat(' ',normalize-space(@class),' '),' article_date ')]\r
+\r
+body: //div[@class='entry-content']\r
+\r
+single_page_link: //p[@class='pagination']/a
+test_url: http://www.stephenfry.com/2011/10/06/steve-jobs/
\ No newline at end of file
--- /dev/null
+title: //article/h1\r
+author: //p[@class='byline']\r
+date: //p[@class='date']\r
+body: //div[@class='body']
+test_url: https://www.stlbeacon.org/#!/content/23404/mogop_caucus_031712
\ No newline at end of file
--- /dev/null
+strip_id_or_class: 'left'\r
+strip_id_or_class: 'right'\r
+strip_id_or_class: 'block-belowcontent'\r
+
+test_url: http://stockholm.etc.se/reportage/bakom-stangda-dorrar-pa-fas-3-massa
\ No newline at end of file
--- /dev/null
+title: //h2[@class="post-title"]\r
+date: //span[@class="post-date"]\r
+body: //div[@class="post-entry"]\r
+\r
+#This is also good for *.streetsblog.org, for example:\r
+#http://dc.streetsblog.org/2011/10/21/friday-job-market/
+test_url: http://streetsblog.net/2011/10/20/look-out-below-one-in-nine-bridges-structurally-deficient-reports-t4a/
\ No newline at end of file
--- /dev/null
+title://div[@id='left_col']/h1\r
+author:substring-after(//span[contains(@class,'storycredit')],'BY ')\r
+author://span[contains(@class,'storycredit')]\r
+date:substring-after(//div[contains(@class,'toolbox_date')],'Last updated ')\r
+date://div[contains(@class,'toolbox_date')]\r
+body://div[@id='left_col']\r
+\r
+strip_id_or_class: toolbox\r
+strip_id_or_class: story_features\r
+strip_id_or_class: sharebox_new\r
+strip_id_or_class: related_box\r
+strip_id_or_class: sponsored_links\r
+strip_id_or_class: hidden_ad\r
+strip_id_or_class: story_content_top\r
+strip_id_or_class: total_number\r
+strip_id_or_class: sort_order\r
+strip_id_or_class: subscribe_order\r
+\r
+strip://div[contains(@class,'ad_story')]\r
+\r
+test_url: http://www.stuff.co.nz/national/politics/3930344/PM-issues-challenge\r
+test_url: http://www.stuff.co.nz/entertainment/7045944/International-praise-for-Ladyhawke
\ No newline at end of file
--- /dev/null
+single_page_link: //iframe[@id='stumbleFrame']/@src\r
+\r
+test_url: http://www.stumbleupon.com/su/35V0wB/zouchmagazine.com/poetry-violet/
\ No newline at end of file
--- /dev/null
+title: //*[@id='posts']/div[1]/h2\r
+author: //*[@id='posts']/div[1]/div[2]/span[2]/a\r
+date: //*[@class='date']\r
+body: //div[@class='body-lead']\r
+\r
+# take out the bit saying 'body'\r
+strip: //div[@class='body-lead']/div[@class='info-label']\r
+\r
+\r
+\r
+\r
+\r
+\r
+\r
+\r
+\r
+test_url: http://www.subtraction.com/2011/02/01/unnecessary-explanations
\ No newline at end of file
--- /dev/null
+# 2012-12-04: complete rewrite after Süddeutsche.de relaunch - carlo@...\r
+\r
+single_page_link: //a[ contains( @href, "/2.220/" ) ]\r
+\r
+body: //article[@id="sitecontent"]/section[@class="body"]\r
+author: //address[@class="author"]\r
+date: //div[@class="header"]//h1//span[@class="updated"]\r
+wrap_in(small): //div[@class="footer"]\r
+wrap_in(i): //figcaption/h3\r
+dissolve: //figcaption//h3\r
+dissolve: //figure/div[@class="body"]\r
+dissolve: //figure/a\r
+\r
+strip: //figure[ not( contains(@class, "zoomimage" ) ) ]\r
+strip: //div[@data-onlineonly="true"]\r
+strip: //address[@class="author"]\r
+\r
+test_url: http://www.sueddeutsche.de/muenchen/mietshaus-am-gaertnerplatz-alles-muss-raus-1.1556693
\ No newline at end of file
--- /dev/null
+strip_id_or_class: toolbar
+test_url: http://summify.com/story/Tmt1YQ0JBgKTAHwK/www.nybooks.com/articles/archives/2003/jan/16/fixed-opinions-or-the-hinge-of-history/?pagination=false
\ No newline at end of file
--- /dev/null
+title: //div[@class='story-details']/h1\r
+date: //span[@class='date-time']\r
+author: substring-after(//p[@class='by-line'], 'By ')\r
+\r
+strip: //div[@class='videoThumbnails']\r
+strip: //div[@class='ad-square2-container']\r
+strip: //div[@class='homeDeliveryContainer5']\r
+\r
+strip: //div[@class='image-description']\r
+strip: //div[@id='internal-side-bar']\r
+\r
+strip: //span[@class='hide']\r
+strip: //div[@class='date']
+test_url: http://www.suntimes.com/technology/ihnatko/8816567-452/review-kindle-fire-is-no-ipad-killer-but-it-is-a-killer-device.html
\ No newline at end of file
--- /dev/null
+# Ads\r
+strip_id_or_class: articlead\r
+
+test_url: http://www.svd.se/nyheter/inrikes/manga-huggormsbitna-golfare_5004031.svd
\ No newline at end of file
--- /dev/null
+title: //h1\r
+\r
+author: //a[contains(@href, '/sok/?')]/text()\r
+\r
+date: substring-after(//span[@class='date'], 'Publicerad ')\r
+\r
+body: //div[@class='two_column_left']\r
+strip_id_or_class: story\r
+strip: //div[@class='leadText saplo:lead']/h5\r
+
+test_url: http://www.sydsvenskan.se/kultur-och-nojen/-jag-vill-garna--stanna--
\ No newline at end of file
--- /dev/null
+title: //div[contains(@class, "post")]/h2\r
+\r
+author: //div[contains(@class, "post")]/p[position()=last()]/text()[1]\r
+\r
+date: //div[contains(@class, "post")]/p[1]\r
+\r
+body: //div[contains(@class, "post")]\r
+\r
+strip: //div[contains(@class, "post")]/h2[1]\r
+strip: //div[contains(@class, "post")]/p[1]\r
+strip: //div[contains(@class, "post")]/p[position()=last()]
+test_url: http://www.symmetrymagazine.org/breaking/?p=12784
\ No newline at end of file
--- /dev/null
+title: //h1\r
+body://div[@class='drucken']\r
+author: substring-after(//span[@class='autor'], 'Von ')\r
+author: //span[@class='autor']\r
+\r
+single_page_link://a[contains(@href, '/drucken/')]\r
+convert_double_br_tags:yes\r
+\r
+dissolve://div[@class='vorspann']\r
+\r
+strip://h1\r
+strip_id_or_class: klassifizierung\r
+strip_id_or_class: source\r
+strip_id_or_class: autor
+test_url: http://sz-magazin.sueddeutsche.de/texte/anzeigen/37567
\ No newline at end of file
--- /dev/null
+title://h1[1]\r
+\r
+author: substring-after(//em, 'Von ')\r
+author:string('tagesschau.de')\r
+\r
+date:substring-after(//div[@class='standDatum'], 'Stand: ')\r
+\r
+body://div[contains(@class, 'article')] | //div[contains(@class, 'centerCol')]\r
+\r
+strip://h1[1]\r
+strip: //div[contains(@class, 'directLinks')]\r
+strip: //div[contains(@class, 'zitatBox')]\r
+strip: //div[contains(@class, 'teaserBox metaBlock')]\r
+strip: //*[contains(@class, 'inv')]\r
+strip: //span[@class='imgSubline']\r
+strip: //*[contains(@class, 'topline')][1]\r
+strip: //div[@id='rightCol'][1]\r
+strip: //div[@id="footer"][1]\r
+strip: //div[@class="fPlayer"] \r
+strip: //div[@id='seitenanfang']\r
+strip: //div[@class='standDatum']\r
+strip: //em
+test_url: http://www.tagesschau.de/ausland/wahlkampffrankreich102.html
\ No newline at end of file
--- /dev/null
+title: //span[@class="entry-title"]\r
+author: //*[contains(@class, 'item')]/p/a/text()\r
+date: substring-after(//*[contains(@class, 'item')]/p/text()[3], 'Posted:')\r
+body: //div[@class="entry-content"]
+test_url: http://www.tampabay.com/news/salvador-dali-leaders-want-st-petersburg-city-council-to-put-brakes-on/1236349
\ No newline at end of file
--- /dev/null
+title: //h3[@class="storytitle"]\r
+body: //div[@class="post"]\r
+strip: //div[@class="blurbBox"]
+test_url: http://taptaptap.com/blog/apples-precedents-vs-apples-guidelines/
\ No newline at end of file
--- /dev/null
+title: //span[@id='ctl00_ctl00_MainContent_MainContent_RecipeImage1_lblRecipeTitle']\r
+body: //div[@id='RDNEW']//*[@class='Recipe-imgCon' or @class='Recipe-Intro' or @class='recipeDetails']\r
+strip_id_or_class: rec-ExRightPanel\r
+strip_id_or_class: divCarousel\r
+strip_id_or_class: preptimeOuter\r
+strip_id_or_class: cooktimeOuter\r
+strip_id_or_class: durationOuter\r
+strip_id_or_class: divImageFooter\r
+strip_id_or_class: microFormatFnIngred\r
+strip: //span[@class='Recipe-Intro']//*[@class='link' or @class='rating']\r
+\r
+prune: no\r
+tidy: no\r
+
+test_url: http://www.tasteofhome.com/recipes/Grinch-Punch
\ No newline at end of file
--- /dev/null
+date: //div[@class='secthead']\r
+body: //div[@class='sectbody']\r
+title: concat(//div[@class='sectbody']/h4,': ',//div[@class='sectbody']/h1)\r
+author: //span[@class='author']\r
+strip: //p[@class='caption']\r
+strip_id_or_class: rack\r
+
+test_url: http://www.taz.de/Protestbewegung-Occupy/!80188/
\ No newline at end of file
--- /dev/null
+body: //div[@id='centercontent']\r
+strip: //div[@id='rightcontent']\r
+date: substring-before( //div[@id='cats'], '·')\r
+title: //h1
+test_url: http://www.tbray.org/ongoing/When/201x/2012/03/04/Mobile-Money
\ No newline at end of file
--- /dev/null
+title: //div[@id='main-content']/h1\r
+body: //div[@id='main-content']\r
+strip: //div[@id='main-content']/h1
+test_url: http://www.tcng.org/index.php/blog/view/teaching-basic-health-cutting-down-costs
\ No newline at end of file
--- /dev/null
+title: //h1[@class='storyheadline']\r
+body: //div[@class='storytext']\r
+strip: //strong
+test_url: http://tech.fortune.cnn.com/2011/03/17/why-startups-dont-go-public-anymore/?section=money_topstories&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fmoney_topstories+%28Top+Stories%29
\ No newline at end of file
--- /dev/null
+title://h1[contains(@id,'artibodyTitle')]\r
+\r
+date://span[contains(@id,'pub_date')]\r
+\r
+body://div[contains(@id,'artibody')]\r
+\r
+strip://div[contains(@class,'otherContent')]\r
+\r
+next_page_link://p[@class='page']/a[contains(.,'下一页')]\r
+
+test_url: http://tech.sina.com.cn/mobile/n/2012-03-22/07476863046.shtml
\ No newline at end of file
--- /dev/null
+body: //div[contains(@class, 'media-container') or contains(@class, 'body-copy')]\r
+\r
+author: //a[@class="name"]\r
+\r
+date: //div[@class="post-time"]\r
+\r
+title: //h1[@class="headline"]\r
+strip_id_or_class: module-crunchbase\r
+\r
+# The following is for the mobile site\r
+body: //div[@id="singlentry"]\r
+author: substring-after(//span[@class="single-post-meta-top"],'rsaquo; ')\r
+date: substring-before(//div[@class="single-post-meta-top"],' @')\r
+title: //a[@class="sh2"]\r
+\r
+prune: no\r
+\r
+test_url: http://techcrunch.com/2011/10/18/apples-insanely-great-q1-2012/
\ No newline at end of file
--- /dev/null
+body: //div[@class='story']\r
+title: //div[@class='story']/h1\r
+strip: //div[@class='story']/h1\r
+\r
+author: //div[@class='details']/p[contains(., 'by ')]/a\r
+date: //p[@class='storydate']\r
+\r
+strip: //p[a[contains(., 'Leave a Comment')]]\r
+strip_id_or_class: share\r
+strip_id_or_class: maincolumn_head\r
+strip_id_or_class: maincolmod
+test_url: http://www.techdirt.com/articles/20120112/17455117394/sega-gets-it-right-about-sopa-its-time-hard-reset-copyright-law-congress.shtml
\ No newline at end of file
--- /dev/null
+single_page_link_in_feed: //b/a\r
+\r
+test_url_feed: http://www.techmeme.com/feed.xml
\ No newline at end of file
--- /dev/null
+title: //h2\r
+author: //meta[@name="author"]/@content\r
+date: //h3\r
+body: //div[@class="postBody"]\r
+strip: //h1\r
+strip: //h2\r
+strip: //h3\r
+test_url: http://technicallyjordan.tumblr.com/post/22914659822/facebook-to-launch-app-store-knock-off
\ No newline at end of file
--- /dev/null
+title: //header[@class='article-meta']/h1\r
+title: substring-before(//title, '|')\r
+\r
+body: //section[contains(@class, 'body')]\r
+\r
+# Author & Date for News and Featured Stories\r
+author: //ul[@class='byline']/li/a\r
+author: substring-before(substring-after(//ul[@class='byline']/li, 'By '), ' on')\r
+date: substring-after(//ul[@class='byline']/li, 'on ')\r
+\r
+# Author & Date for "Views"\r
+author: //div[@class='view-byline']/div[@class='meta']/h2[1]\r
+date: //div[@class='view-byline']/div[@class='meta']/h2[2]\r
+\r
+next_page_link: //section[@class='pagination']/a[contains(@class, 'continue')]\r
+test_url: http://www.technologyreview.com/news/427567/facebooks-telescope-on-human-behavior/
\ No newline at end of file
--- /dev/null
+body: //div[@class="post"]\r
+\r
+strip: //div[@class="post-meta"]\r
+strip: //div[@id="socialicons"]\r
+strip: //div[@id="authorbox"]\r
+
+test_url: http://techpinions.com/why-google-and-microsoft-hate-siri/3572
\ No newline at end of file
--- /dev/null
+# Title without news/reviews etc. appended\r
+title: //div[@id='subColumn1Pad']/div[1][@class='article']/div[1][@class='articleHead']/h1\r
+\r
+# Remove home link\r
+strip: //div[@id='page_logo']/a\r
+\r
+# Remove utilities\r
+strip: //*[(@id = "utilities")]\r
+\r
+# Remove comments link\r
+strip: //div[@id='subColumn1Pad']/div[1][@class='article']/div[1][@class='articleHead']/p[@class='tiny']
+test_url: http://www.techradar.com/news/television/sky-to-rebrand-living-as-sky-living-903105
\ No newline at end of file
--- /dev/null
+body: //div[@id='artikelKolom']\r
+strip: //div[@class='broodMediaBox']/div[@class='docbox' or @class='artBannerWrapper']\r
+strip: //div[@id='artikeltoolbar']\r
+strip: //div[@class='reactiebalk artspacer' or @class='bannercenter clearfix artspacer']\r
+strip: //div[@id='artikelKolomRechts' or @id='TMGTweetWidget']\r
+tidy: no\r
+prune: no\r
+
+test_url: http://www.telegraaf.nl/binnenland/10275097/__Identiteit_man_in_sloot_onbekend__.html?cid=rss
\ No newline at end of file
--- /dev/null
+body: //div[@class='byline' or @id='storyEmbSlide' or @id='mainBodyArea']\r
+strip: //p[@class='comments']\r
+strip: //div[@id='storyEmbSlide']//div[contains(@class, "hide")]\r
+strip: //div[@id='tmg-related-links' or @id='outbrain-related-links' or @id='onespot-related-links']\r
+strip: //p[@class='bbpTweet']/span[@class='timestamp']\r
+strip: //p[@class='bbpTweet']/span[@class='metadata']//img\r
+tidy: no\r
+prune: no\r
+
+test_url: http://www.telegraph.co.uk/news/worldnews/europe/ireland/8663451/Is-Ireland-divorcing-from-the-Catholic-Church.html
\ No newline at end of file
--- /dev/null
+# Remove home link\r
+strip: //div[@id='blog-title']/a
+test_url: http://theappleblog.com/2010/10/21/the-new-macbook-air-is-underwhelming/
\ No newline at end of file
--- /dev/null
+title: //div[@id='article']/h1\r
+title: //h1\r
+\r
+body: //div[@class='articleText']\r
+body: //div[@class='articleContent']\r
+body: //div[@id='article']\r
+date: //*[contains(@class, 'date')]\r
+author: //div[@id='profile']//*[@class='authors']//a[1]\r
+author: //*[@class='author']/span\r
+prune: no\r
+\r
+strip: //div[@class='moreOnBoxWithImages']\r
+\r
+single_page_link: //a[@class='print']\r
+\r
+test_url: http://www.theatlantic.com/technology/archive/2011/04/want-to-see-how-crazy-a-bot-run-market-can-be/237773/\r
+test_url: http://www.theatlantic.com/magazine/archive/2007/11/the-autumn-of-the-multitaskers/6342/\r
+test_url: http://www.theatlantic.com/entertainment/archive/2012/04/30-rock-live-a-funny-reminder-of-why-sitcoms-arent-shot-live-anymore/256447/
\ No newline at end of file
--- /dev/null
+title: //meta[@name='og:title']/@content\r
+date: //meta[@name='created']/@content\r
+body: //div[@class="StoryBody" or @class="storyTeaser"]\r
+\r
+replace_string(<p></p>): <br /><br />\r
+\r
+test_url: http://www.thebostonchannel.com/slideshow/news/28210648/detail.html
\ No newline at end of file
--- /dev/null
+title: //h2[contains(@class, 'page-title')]\r
+body: //div[@id='content']/div[contains(@id, 'node-')]/div[@class='content']\r
+\r
+prune: no\r
+\r
+strip: //div[contains(@class, 'node-book')]//a[@class='button']\r
+\r
+single_page_link: //a[@class='tool-print']\r
+
+test_url: http://thebrowser.com/interviews/yotam-ottolenghi-on-his-favourite-cookery-books
\ No newline at end of file
--- /dev/null
+title: substring-before(//title, ' – ') \r
+author:string('Shawn')\r
+date: //*/time/@pubdate\r
+\r
+\r
+strip: //header\r
+strip: //div[@id='prev_next']\r
+strip: //div[@id='masthead']\r
+\r
+test_url: http://thecarton.net/2012/12/20/imdb
\ No newline at end of file
--- /dev/null
+#keep all body text\r
+prune: no\r
+\r
+#title, body, metadata\r
+title: //div[@class='story_header']/h1\r
+body: //div[@id='content']\r
+author: substring-after(//span[@class='byline'], "by ")\r
+author: substring-after(//span[@class='byline'], "By ")\r
+author: //span[@class='byline']\r
+date: //span[@class='date']\r
+\r
+#formatting\r
+convert_double_br_tags: yes\r
+dissolve: //div[@class='slides_full']/ul/li\r
+\r
+# cleanup\r
+strip: //a[@id='story_note']\r
+strip: //br\r
+strip: //div[@class='intro']\r
+strip: //div[@class='share-block']\r
+strip: //div[@class='sidebar-social']\r
+strip: //div[@class='top-stories']\r
+strip: //div[@class='prevnext']\r
+test_url: http://www.thedaily.com/page/2012/01/09/010912-news-college-costs-1-5/
\ No newline at end of file
--- /dev/null
+title: //h1\r
+body: //article/div[contains(@class, 'article-body')]\r
+#strip: //header/hgroup/h1\r
+strip: //footer[@class='storyFooter']\r
+single_page_link: //li[@class='print']/a\r
+prune: no\r
+test_url: http://www.thedailybeast.com/articles/2010/04/06/how-mastercard-predicts-divorce.html
\ No newline at end of file
--- /dev/null
+# Remove duplicated title\r
+strip: //div[@id='content']/div[1][@class='full_intro']/h2\r
+\r
+# Remove links, ads etc.\r
+strip: //*[(@class= "aside")]\r
+\r
+# Remove the date and add it to the date published field in Instapaper\r
+strip: //div[@class="date"]\r
+date: //div[@class="date"]\r
+\r
+# There is no byline on The Daily Mash.\r
+\r
+convert_double_br_tags: yes\r
+test_url: http://www.thedailymash.co.uk/index.php?option=com_content&task=view&id=4994&Itemid=81&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+thedailymash+%28The+Daily+Mash.+It%27s+news+to+us.%29
\ No newline at end of file
--- /dev/null
+body: //div[@class='body']
+test_url: http://thefilmexperience.net/blog/2011/12/30/distant-relatives-2001-a-space-odyssey-and-the-tree-of-life.html
\ No newline at end of file
--- /dev/null
+title: //h1[@id="headline"]\r
+author: //div[contains(@class, "editorial-byline-author")]/a\r
+date: substring-after(//div[contains(@class, "editorial-byline-meta")], " | ")\r
+\r
+# The article body contains a mix of article and non-article elements, so a lot of manual tweaks are needed\r
+body: //div[@id="template"]\r
+strip_id_or_class: editorial-byline-pic\r
+strip_id_or_class: editorial-byline\r
+strip_id_or_class: headline\r
+\r
+# Include the leadin paragraph in the body text, but remove quotes because they're out of context\r
+dissolve: //div[contains(@id, "leadin")]\r
+strip_id_or_class: pullquote\r
+\r
+# Image captions removed because they're confusing in body text\r
+strip_id_or_class: image-caption-content\r
+\r
+# Remove header and footer\r
+strip_id_or_class: header\r
+strip_id_or_class: footer\r
+\r
+# Remove the hidden logo that seems to be used to cause Facebook to show the logo instead of a random article image\r
+strip: /html/body/span[contains(@style, "display: none")]\r
+\r
+# Remove search box\r
+strip_id_or_class: searchContainer\r
+strip: //div[contains(@class, "searchInstruction")]\r
+strip: //div[contains(@class, "searchResults")]/h4\r
+\r
+# Remove the 'Letters to the Editor' section\r
+strip_id_or_class: letter-text\r
+strip_id_or_class: letter-from\r
+strip_id_or_class: letter-date\r
+\r
+# Remove Like/Tweet links \r
+strip_id_or_class: social-tab\r
+\r
+# Remove 'divider' which causes an inexplicable slash to appear in the article body\r
+strip_id_or_class: divider\r
+
+test_url: http://www.theglobalmail.org/feature/tiramisu-time-in-pyongyang/88/
\ No newline at end of file
--- /dev/null
+single_page_link: //div[contains(@class, 'pagination')]//a[contains(@title, 'ingle page')]\r
+tidy: no\r
+prune: no\r
+
+test_url: http://www.theglobeandmail.com/report-on-business/rob-magazine/how-a-novice-miner-survived-a-summer-in-the-klondike/article2345350/
\ No newline at end of file
--- /dev/null
+title: //h1[@class="Headline"]\r
+date: substring-after(//div[@class="posted"], 'EDT ')\r
+body: //div[@class="storyBody"]\r
+\r
+strip: //td[@class="AssocContentTD"]\r
+strip: //div[@id="pageTitle"]\r
+strip: //div[@class="posted"]\r
+strip: //div[@class="updated"]\r
+strip: //div[@class="js-kit-disclaimer"]\r
+strip: //table[@class="row3table"]\r
+strip: //div[@class="container2"]\r
+strip: //div[@id="delta"]
+test_url: http://www.theindychannel.com/news/31050840/detail.html
\ No newline at end of file
--- /dev/null
+title: /html/body/div/div[2]/div/div/div/h3\r
+\r
+body: /html/body/div/div[2]/div/div/div/div[2]\r
+\r
+strip: /html/body/div/div[2]/div/div/div/div[6]/div[3]/div/div/div\r
+\r
+tidy: no\r
+\r
+# any way to get rid of this word character garbage?
+test_url: http://www.themillions.com/2010/07/at-the-movies-with-david-mitchell-the-thousand-autumns-of-jacob-de-zoet.html
\ No newline at end of file
--- /dev/null
+body: //*[@id='single-review' or @class='single-review']\r
+strip_id_or_class: featured-review\r
+strip_id_or_class: resources\r
+strip_id_or_class: rate-the-book\r
+strip_id_or_class: write-review\r
+
+test_url: http://themuseumofinnocence.com/review.php?id=1179
\ No newline at end of file
--- /dev/null
+title: //h1[@class='print-title']\r
+body: //div[@class='print-content']\r
+author: //a[contains(@href, '/authors')]\r
+author: substring-before(//div[@class='print-created'], '|')\r
+date: //span[@class='article-date']\r
+date: substring-after(//div[@class='print-created'], '|')\r
+prune: no\r
+\r
+single_page_link: //ul[contains(@class, 'article-actions-bar')]//a[contains(@href, '/print/article/')]\r
+\r
+test_url: http://www.thenation.com/article/162331/hard-against-time-roy-fisher
\ No newline at end of file
--- /dev/null
+body: //div[@id="beta-inner"]\r
+title: //h3[@class="entry-header"]\r
+
+test_url: http://thenetworkgarden.blogs.com/weblog/2011/09/microsoft-metro-and-the-next-wave-in-computing.html
\ No newline at end of file
--- /dev/null
+body: //div[@class= 'article-body']\r
+author: //div[@class='featured mb-1']//a[starts-with(@href,'/author')]\r
+\r
+strip: //div[@class = 'bargo']\r
+strip: //div[@class = 'tf']\r
+strip: //div[@class = 'article']/div[@class = 'blue-box']\r
+strip_id_or_class: respond\r
+\r
+tidy: no\r
+next_page_link: //div[@class='pages-wrapper']//span/following-sibling::a/@href\r
+\r
+test_url: http://thenextweb.com/apple/2011/10/12/tnw-review-a-complete-guide-to-apples-ios-5-with-icloud-an-os-14-years-in-the-making/
\ No newline at end of file
--- /dev/null
+body: //div[@id='fullstory']\r
+strip: //div[@id='page_leftbar']
+test_url: http://theoaklandpress.com/articles/2011/04/25/news/doc4db5330e0bce9220005852.txt
\ No newline at end of file
--- /dev/null
+title: //h2[@class='title']\r
+date: substring-before(//p[@class='meta'], '|')\r
+body: //div[@class='story']\r
+#body: //div[@class='article_body']\r
+\r
+strip: //h2[@class='title']\r
+strip: //p[@class='meta']\r
+strip: //div[@class='ga_section']\r
+strip: //div[@id='recent_slider']\r
+
+test_url: http://www.theonion.com/articles/pathetic-bobcats-owner-again-regaling-players-with,27572/
\ No newline at end of file
--- /dev/null
+title: //h1[@class='post-title']\r
+body: //div[@class='post']\r
+author: //p[@class='posted-by']\r
+date: //div[@class='sprite post-date']\r
+\r
+# The body of the post doesn't have its own div, so we have to strip out the metadata\r
+strip: //div[@class='author_avatar']\r
+strip: //div[@class='sprite post-date']\r
+strip: //h1[@class='post-title']\r
+strip: //p[@class='posted-by']
+test_url: http://thepioneerwoman.com/cooking/2011/08/pie-fats-a-comparison/
\ No newline at end of file
--- /dev/null
+title: //div[@id="article"]/h2\r
+author: //div[@id="article"]/p[@class="byline"]/a[1]\r
+date: //div[@id="article"]/p[@class="dateline"]/a[2]\r
+body: //div[@id="article"]/div[@id="body"]
+test_url: http://www.theregister.co.uk/2011/10/06/gas_bill_shocker/
\ No newline at end of file
--- /dev/null
+body: //div[@id='node-content']\r
+strip_id_or_class: pager
+test_url: http://www.theroot.com/views/why-i-am-male-feminist
\ No newline at end of file
--- /dev/null
+title: /html/body/div/div[2]/div/div/h1\r
+\r
+body: /html/body/div/div[2]/div/div/div[2]
+test_url: http://therumpus.net/2010/07/the-rumpus-interview-with-david-means/?full=yes
\ No newline at end of file
--- /dev/null
+#body: (//div[@class='ftr-yt-vid'])[1]\r
+body: (//blockquote[contains(@class, 'postcontent')])[1]\r
+body: (//div[starts-with(@id, 'post_message')])[1]\r
+\r
+prune: no\r
+tidy: no\r
+\r
+#replace_string(<iframe title="YouTube video player"): <div class="ftr-yt-vid"><iframe title="YouTube video player"\r
+#replace_string(</iframe>): </iframe> </div>\r
+\r
+test_url: http://www.thesiasat.com/showthread.php?19220-Dunya-News-HASB-E-HAAL-16-06-2012-Part-1-5
\ No newline at end of file
--- /dev/null
+title: //h3[@class='post-title']/a[@class='post-title-link']\r
+body: //div[@class='post-content']\r
+author: //div[@class='post-meta-under-title']/a
+test_url: http://www.thesimpledollar.com/2011/09/13/determining-the-size-of-your-emergency-fund/
\ No newline at end of file
--- /dev/null
+strip: //*[(@id = "content")]/h2\r
+strip: //*[(@class = "wp-notable-line")]
+test_url: http://www.thespoiler.co.uk/index.php/2010/10/21/wayne-rooney-tells-man-utd-its-not-me-its-you
\ No newline at end of file
--- /dev/null
+title: //h1[contains(@class, 'cTitle')]\r
+body: //div[contains(@class, 'KonaBody') or @id='articleimageright']\r
+author: //meta[@name='Author']/@content\r
+date: //meta[@name='OriginalPublicationDate']/@content\r
+\r
+prune: no\r
+tidy: no\r
+\r
+test_url: http://www.thespoof.com/news/spoof.cfm?headline=s8i108389
\ No newline at end of file
--- /dev/null
+# savage* filtering is for Savage Love, such as: http://www.thestranger.com/seattle/SavageLove?oid=5135029\r
+\r
+# other filtering is for plain articles, such as: http://www.thestranger.com/seattle/the-stranger-election-control-board/Content?oid=5142885\r
+\r
+title: //div[@id='savageColumn_head']/h1\r
+title: //h1[@class="headlineLarge"]\r
+\r
+strip: //div[@id='savage_right'] | //div[@id='savageColumn_head'] | //div[@id='savageArticleRight'] | //div[@id='articleRight'] | //div[@class='savAppBanner']\r
+\r
+body: //div[@id='savageColumn']\r
+body: //div[@id='story_text']
+test_url: http://www.thestranger.com/seattle/SavageLove?oid=5135029
\ No newline at end of file
--- /dev/null
+title: //div[@id='storyHdr']/h1\r
+title: //div[@id='print']//h2\r
+body: //div[@class="virtualpage"]\r
+body: //div[@id='print']//div[@id='bd']\r
+author: //meta[@name="AUTHOR"]/@content\r
+author: (//div[@id='print']//div[@id='bd']/h4)[1]\r
+date: //meta[@name="DATE"]/@content\r
+date: //div[@id='print']//div[@id='dte']\r
+\r
+strip_id_or_class: articleFooter\r
+strip_id_or_class: sidebar\r
+strip_id_or_class: ie6PrintSubhead\r
+strip_id_or_class: subHdr\r
+\r
+\r
+replace_string(<P/>): </p><p>\r
+\r
+prune: no\r
+\r
+#TODO: redirects back - perhaps needs referer to work\r
+single_page_link: //div[@id='storyDetail']//a[contains(@href, '/print/')]\r
+\r
+test_url: http://www.thestreet.com/story/11386556/1/which-of-these-10-dividend-stocks-is-worth-the-risk.html\r
+# multi page\r
+test_url: http://www.thestreet.com/story/11387090/1/7-ubs-stock-picks-for-2012.html
\ No newline at end of file
--- /dev/null
+strip:/html/body/form/div[2]/div[3]/div/div/div/div/div/div/div/div/div/div[2]/div[3]/div[2]/div/p[2]
+test_url: http://thethaovanhoa.vn/151N20110519085606745T129/levante-quyet-giu-caicedo.htm
\ No newline at end of file
--- /dev/null
+title: //h1[contains(@class, "headline")]\r
+\r
+author: //p[contains(@class, "byline")]/a[contains(@class, "author")]\r
+\r
+date: substring-after(normalize-space(//p[contains(@class, "byline")]/span[contains(@class, "publish-date")]), "on ")\r
+\r
+body: //article[contains(@class, 'feature-entry')]\r
+body: //article\r
+prune: no\r
+tidy: no\r
+\r
+strip: //article/header\r
+strip: //*[@id='sticky-menu']\r
+strip: //aside\r
+strip: //nav\r
+\r
+strip_id_or_class: gallery\r
+strip_id_or_class: article-meta\r
+strip_id_or_class: story-navigation\r
+strip_id_or_class: slegend\r
+strip_id_or_class: related-product-meta\r
+strip_id_or_class: comments\r
+strip_id_or_class: ui-jump-list\r
+strip_id_or_class: pullquote\r
+\r
+strip: //q\r
+\r
+strip: //a[contains(@class, 'entry-section-title')]\r
+\r
+test_url: http://www.theverge.com/2012/2/29/2821763/lytro-review\r
+test_url: http://www.theverge.com/2011/11/3/2534861/nokia-lumia-800-review
\ No newline at end of file
--- /dev/null
+body: //div[@class="briefingEntry"]\r
+prune: no\r
+
+test_url: http://theweek.com/article/index/215763/insider-trading-on-capitol-hill
\ No newline at end of file
--- /dev/null
+author: //p[@class="byline"]/a\r
+body: //div[@class="post"]\r
+
+test_url: http://thinkprogress.org/special/2011/11/12/367040/harvard-law-professor-criticizes-homeland-security-feel-of-overreaction-to-occupy-harvard/
\ No newline at end of file
--- /dev/null
+body: //div[@class='main-content-panel']/div[@class='img'] | //div[@id='page_content_Content9_oModuleContent_2_div_Body']\r
+test_url: http://www.thisdaylive.com/articles/australia-pm-talks-human-rights-with-chinas-wen/90394/
\ No newline at end of file
--- /dev/null
+author: //div[@class='meta clearfix']/a\r
+body: //div[@class='post']\r
+\r
+strip: //div[@class='metaCat']\r
+strip: //div[@class='post']/h1\r
+strip: //div[@class='post']/div[@class='meta clearfix']\r
+strip: //div[@class='post']/div[@class='social-bar clearfix']
+test_url: http://thisismynext.com/2011/10/18/galaxy-nexus-android-ice-cream-sandwich-pictures-video-hands-on/
\ No newline at end of file
--- /dev/null
+author: //span[@class='fn']\r
+date: substring-before(substring-after(//*[@id='center_ajax_sub']/div/div[3],'|'),'|')\r
+test_url: http://tidbits.com/article/12651
\ No newline at end of file
--- /dev/null
+# 2011-10-25 - carlo@... - Initial setup.\r
+\r
+single_page_link: //li[@class='print']/a/@href\r
+\r
+title: //h1\r
+author: //meta[@name="byline"]/@content\r
+date: //meta[@name="date"]/@content\r
+\r
+strip: //span[@class="see"]\r
+strip: //div[@class="byline"]\r
+strip: //div[@id="date2"]\r
+strip: //h1\r
+\r
+test_url: http://www.time.com/time/specials/packages/article/0,28804,2094921_2094923_2094924,00.html
\ No newline at end of file
--- /dev/null
+title: //h1\r
+body: //div[@class="storytext"]\r
+strip: //div[@id="thelogin"]\r
+strip: //*[@class="hide"]\r
+strip: //div[@id="anchored"]
+test_url: http://www.timeshighereducation.co.uk/story.asp?sectioncode=26&storycode=416124&c=1
\ No newline at end of file
--- /dev/null
+body: //div[@id='content']\r
+\r
+strip_id_or_class: featured-box\r
+strip_id_or_class: postmeta\r
+strip_id_or_class: respond\r
+\r
+author: //a[contains(@href, '/author/') and contains(@title, 'Posts by')]\r
+date: substring-before(//a[contains(@href, '/author/') and contains(@title, 'Posts by')]/.., ' by ')\r
+test_url: http://www.tipb.com/2011/10/17/iphone-4s-review/
\ No newline at end of file
--- /dev/null
+title: //div[contains(@class, 'article_detail')]/div[@class='entry_header']/h1\r
+title: //div[contains(@class, 'article_detail')]//h1\r
+title: //h1\r
+\r
+body: //div[contains(@class, 'article_detail')]\r
+\r
+author: //div[@class='article_detail']/div[@class='entry_header']/li/div[@class='author']//h3\r
+author: //div[@class='author']//h3\r
+strip: //div[contains(@class, 'field-field-book-cover')]\r
+\r
+date: translate(//*[@class='post_date' and contains(., ' 20')], '|', '')\r
+\r
+prune: no\r
+\r
+single_page_link: //a[@class='print-page']\r
+\r
+test_url: http://www.tnr.com/blog/jonathan-chait/92991/did-obama-get-rolled
\ No newline at end of file
--- /dev/null
+title: //div[@id='maincontent']//div[@class='title']\r
+body: //div[@id='maincontent']//div[@class='byline'] | //div[@id='maincontent']//div[@class='meat']\r
+\r
+tidy: no\r
+
+test_url: http://www.tomdispatch.com/post/175436/tomgram:_noam_chomsky%2C_the_imperial_mentality_and_9_11/
\ No newline at end of file
--- /dev/null
+tidy: no\r
+title: //title\r
+author: //a[@itemprop = 'author']\r
+date: //time[@itemprop = 'datePublished']\r
+body: //div[@id = 'intelliTXT']\r
+\r
+next_page_link: //li[@class="pagin next"]/a
+test_url: http://www.tomshardware.com/reviews/gaming-graphics-card-review,3107.html
\ No newline at end of file
--- /dev/null
+body://div[@id="news-content"]/div[@id="intelliTXT"][1]\r
+\r
+author://div[@id="header-news-infos"]/a[1]\r
+\r
+date: //div[@id="header-news-infos"]/span[1]\r
+\r
+title://h1[@id="header-news-title" and @class="hardwareTitle"][1]\r
+\r
+strip://div[@id="news-content"]/div[@id="intelliTXT"]/table \r
+\r
+footnotes: no
+test_url: http://www.tomshardware.de/DDR4-DDR3-ISSCC-Samsung-Hynix,news-247133.html
\ No newline at end of file
--- /dev/null
+body: //div[@class='post']\r
+\r
+strip: //div[@class='social']\r
+strip: //span[@class='next']\r
+strip: //span[@class='previous']
+test_url: http://toolsandtoys.net/noble-tonic-02/
\ No newline at end of file
--- /dev/null
+title: concat(substring-before(//title,':'),': ',//div[@class='Date2'])
+test_url: http://trailer.web-view.net/Show/0XC4EFE5D648B716BA2E134BC7CE61B9CC001E04F11E9434438186735DBD637488.htm
\ No newline at end of file
--- /dev/null
+title: //div[@class="Post-body"]//span[@class="PostHeader"]\r
+author: //div[@class="PostHeaderIcons metadata"]/a[@title="Author"]\r
+date: substring-before(//div[@class="PostHeaderIcons metadata"], '|')\r
+body: //div[@class="Post-body"]\r
+strip_id_or_class: print1\r
+strip_id_or_class: metadata\r
+strip_id_or_class: authorbox
+test_url: http://traningslara.se/skoinlagg-och-skador-finns-det-nagot-samband/
\ No newline at end of file
--- /dev/null
+title: //title\r
+author: //span/a\r
+date: substring-after(//small,'Published:')\r
+\r
+strip: //h1[@class='vert_class']\r
+strip: //h1[@class='headline']\r
+strip: //img[contains(@src,'logo_triblive.gif')]\r
+\r
+#strip: //h6\r
+#strip_img_src: logo_triblive.gif\r
+\r
+single_page_link: //a[@class='stprint']\r
+test_url: http://triblive.com/sports/2819913-85/lemieux-deal-penguins-burkle-nhl-owners-team-mario-bettman-case
\ No newline at end of file
--- /dev/null
+title: //div[@class='printbody']/h1\r
+body: //div[@class='printbody']\r
+prune: no\r
+\r
+strip: //div[@class='printbody']/a[@href='http://www.truthdig.com/']\r
+strip: //table[@class='footer']\r
+\r
+single_page_link: //div[@class='article_tools']//a[contains(@href, '/print/')]\r
+\r
+test_url: http://www.truthdig.com/report/item/the_election_march_of_the_trolls_20110829/
\ No newline at end of file
--- /dev/null
+title: //h2\r
+author: //a[starts-with(@href, '/AuthorStories')]\r
+body: //div[@id='storyinnerbody']
+test_url: http://www.tthfanfic.org/Story-6512/Kudra+Journeys.htm
\ No newline at end of file
--- /dev/null
+prune: no
+test_url: http://www.tthor.com/06/
\ No newline at end of file
--- /dev/null
+title: //h1[@class='posttitle']\r
+author: //span[@class='author']/a\r
+date: //span[@class='timestamp']\r
+body: //div[@class='body']\r
+
+test_url: http://www.tuaw.com/2011/10/19/apple-posts-fans-memories-of-steve-jobs/
\ No newline at end of file
--- /dev/null
+title: //h1[@class='post-title']\r
+author: //div[@class='display-name']\r
+date: //div[@class='date']\r
+body: //div[@class='body']\r
+footnotes: no\r
+test_url: http://tuckreview.com/2012/8/14/migrating-to-v6
\ No newline at end of file
--- /dev/null
+# Google Custom Search\r
+strip_id_or_class: google_branding_style\r
+\r
+# Avoid double title\r
+strip_id_or_class: pagetitle\r
+\r
+# external links are labelled\r
+strip_image_src: http://static.mediatropes.info/pmwiki/pub/external_link.gif\r
+\r
+title: //div[@class="pagetitle"]\r
+body: //div[@id="wikitext"]\r
+\r
+# don't get clever.\r
+strip_comments: no\r
+prune: no\r
+\r
+# navigation in footer lives inside the wikitext div, annoyingly.\r
+strip_id_or_class: pathholder\r
+
+test_url: http://tvtropes.org/pmwiki/pmwiki.php/Main/WithinParameters
\ No newline at end of file
--- /dev/null
+title: //title\r
+body: (//p[contains(@class, 'js-tweet-text')])[1]\r
+author: (//strong[contains(@class, 'fullname')])[1]\r
+date: //span[contains(@class, 'js-short-timestamp')]/@data-time\r
+\r
+prune: no\r
+tidy: no\r
+\r
+test_url: https://twitter.com/medialens/status/216883678582804480
\ No newline at end of file
--- /dev/null
+body: //div[@class='d3cmsCBody']//div[@class='pubText pubDate' or @class='newsComment' or contains(@class, 'newsPhoto') or @class='newsText']\r
+strip: //div[contains(@class, 'mpindex')]\r
+prune: no\r
+tidy: no\r
+\r
+test_url: http://www.uefa.com/uefaeuropaleague/news/newsid=1617320.html
\ No newline at end of file
--- /dev/null
+# applies to uk.ds.ign.com, uk.wii.ign.com etc.\r
+# possibly to non-UK versions, but I can’t test that\r
+\r
+title: //h1[@class="headline"]\r
+author: //div[@class="hdr-sub byline"]/a\r
+date: //h2[@class="publish-date"]/span\r
+body: //div[@id="main-article-content"]\r
+\r
+strip: //ul[@class="lnks-readmore"]\r
+\r
+strip: //div[@class="inlineImageCaption"]\r
+# can’t make the images appear, so remove the captions\r
+\r
+strip: //div[@style="width:468px"]\r
+# video caption links\r
+\r
+convert_double_br_tags: yes\r
+\r
+strip_comments: no\r
+# otherwise the ‘Closing Comments’ are removed\r
+\r
+# Ratings box could do with some rearranging, but it’s tricky
+test_url: http://uk.xbox360.ign.com/articles/121/1210717p1.html
\ No newline at end of file
--- /dev/null
+author: substring-before(substring-after(//div[@class='post-byline'], 'By '), ', on')\r
+date: substring-after(//div[@class='post-byline'], ', on')\r
+\r
+# for some reason, the following is producing a "no text [48]" error\r
+#title: //div[@class='post-headline']\r
+\r
+# for some reason, the following doesn't appear to isolate just the body copy\r
+body: //div[@class='post-bodycopy']\r
+\r
+# we solve the above issue by stripping out everything else we don't want\r
+# these can probably all be removed if the body: command above worked\r
+strip_id_or_class: reply\r
+strip_id_or_class: left\r
+strip_id_or_class: post-headline\r
+strip_id_or_class: post-byline\r
+strip_id_or_class: footer
+test_url: http://www.uni-watch.com/2011/10/18/the-curious-case-of-steve-debergs-microphone-and-speaker/
\ No newline at end of file
--- /dev/null
+title: //title\r
+body: //td[@id='content']
+test_url: http://www.urbandictionary.com/define.php?term=Grown-Ass
\ No newline at end of file
--- /dev/null
+body: //div[@id='CS_Element_maincontent']\r
+\r
+tidy: no\r
+prune: no\r
+
+test_url: http://www.usccb.org/bible/readings/072412.cfm
\ No newline at end of file
--- /dev/null
+title: //h1\r
+\r
+date: substring-after(//p[@class='overline']/strong, ',')\r
+body: //div[@class="maintext"]\r
+strip: //p[@class='overline']\r
+strip: //h1\r
+tidy: no
+test_url: http://www.useit.com/alertbox/mobile-startup-screen.html
\ No newline at end of file
--- /dev/null
+author: ("Arturo Toledo")\r
+title: //div[@class="post"]/h2\r
+body: //div[@class="entry"]\r
+\r
+# Remove Twitter button\r
+strip: //div[@class="entry"]/p[2]/a/img
+test_url: http://ux.artu.tv/?p=192
\ No newline at end of file
--- /dev/null
+title: //h1
+test_url: http://www.uzivatelsketestovani.cz/wiki/doku.php/skoleni-axure-rp
\ No newline at end of file
--- /dev/null
+title: //meta[@property="og:title"]/@content\r
+author: //div[contains(@class, 'byline')]//span[contains(@class, 'name')]\r
+date: //div[contains(@class, 'cn_date_time')]\r
+body: //div[contains(@class, 'pageContainers')]\r
+body: //article[@id='items-container']\r
+#body: //h2[@class='sub-header'] | //div[contains(@class, 'contributor-type') or @class='display-date' or @class='content-container']\r
+\r
+strip_id_or_class: bc\r
+strip_id_or_class: utilities\r
+strip_id_or_class: list-supporting\r
+strip_id_or_class: yrail\r
+strip_id_or_class: urail\r
+\r
+prune: no\r
+#tidy: no\r
+\r
+strip_id_or_class: super-rubric-section\r
+strip_id_or_class: cn_date_time\r
+strip_id_or_class: cn_contributors\r
+strip_id_or_class: cn_pagination_controls\r
+strip_id_or_class: cn_features_container\r
+strip_id_or_class: global-footer\r
+strip_id_or_class: cn_ecom_placement\r
+strip: //li[@class='blogNavPrev']\r
+\r
+single_page_link: //a[@title='Print this page']\r
+\r
+test_url: http://www.vanityfair.com/politics/features/2011/05/egypt-revolutionaries-201105\r
+test_url: http://www.vanityfair.com/politics/features/2008/08/hitchens200808\r
+test_url: http://www.vanityfair.com/style/2012/01/prisoners-of-style-201201
\ No newline at end of file
--- /dev/null
+title: //div[@class='ArticleHeadlineDetailedView']\r
+date: //span[@class='ArticlePublicationDateTimeDetailedView']\r
+author://span[@class='ArticleBylineDetailedView']\r
+body: //div[@class='ArticleTextDetailedView']
+test_url: http://www.varingen.no/Nyheter/tabid/392/Default.aspx?ModuleId=56651&articleView=true
\ No newline at end of file
--- /dev/null
+# FB comments are inside an h2. Weird. Without this, the line 'Comments' is preserved by the text parser\r
+\r
+strip: //h2
+test_url: http://www.varsity.co.uk/reviews/2662
\ No newline at end of file
--- /dev/null
+title: //td[@class='second_content']/h1\r
+body: //td[@class='second_content']/div[@class='article_text']
+test_url: http://www.vedomosti.ru/newspaper/article/259377/rasprodazha_mailru
\ No newline at end of file
--- /dev/null
+author: //div[@class="blogginnleggForfatter"]\r
+# Date is assembled from two separate elements (day and month).\r
+date: concat(//div[@class='blogginnleggDatoDag'],' ',//div[@class='blogginnleggDatoMnd'])\r
+# Remove every div whose id contains "bloggDelingslenker"; one rule is enough.\r
+strip: //div[contains(@id,"bloggDelingslenker")]
+test_url: http://veggbilder.no/blogginnlegg/fristelser
\ No newline at end of file
--- /dev/null
+title: //h2\r
+# The <small> element holds the date followed by " • Permalink"; keep only the date part.\r
+date: substring-before(//small," • Permalink")\r
+# Author is a fixed string for this site.\r
+author:string('Martin Hering')\r
+\r
+# Directive names are lowercase everywhere else in these configs; a capitalized\r
+# "Strip" is not a recognized directive and the rule would be ignored.\r
+strip: //p/small
+test_url: http://vemedio.com/blog/posts/state-of-support-and-icloud
\ No newline at end of file
--- /dev/null
+title: //h1[@class="entry-title"]\r
+author: //div[@class="author-name"]\r
+date: //span[@class="the-time"]\r
+body: //div[@class="entry-content"]\r
+strip: //div[@class="vb-gallery"]
+test_url: http://venturebeat.com/2012/07/17/marissa-mayer-yahoo/#s:mayer-1
\ No newline at end of file
--- /dev/null
+4
\ No newline at end of file
--- /dev/null
+title: //article/header/h1\r
+\r
+author: //article/header/section[@class='byline']/span[contains(@class, 'author')]/a\r
+date: //article/header/section[@class='byline']/span[@class='published']/span\r
+\r
+body: //article/section[@class='body']\r
+\r
+convert_double_br_tags: yes\r
+\r
+# This is required, because Tidy chokes on the HTML5 tags...\r
+tidy: no
+test_url: http://www.version2.dk/artikel/17069-amerikansk-hit-investor-er-vild-med-danske-net-ivaerksaettere
\ No newline at end of file
--- /dev/null
+title: //title\r
+body: //div[contains(@class, 'printRecipe')]\r
+strip: //div[@class='recipeHeader']\r
+prune: no\r
+tidy: no\r
+single_page_link: //ul[@class='printOptions']//a[contains(@href, 'detail.aspx?p=1&showphoto=true')]
+test_url: http://www.verybestbaking.com/recipes/143190/Penne-Pasta-with-Sun-dried-Tomato-Cream-Sauce/detail.aspx
\ No newline at end of file
--- /dev/null
+body: //div[@id='artikkelspalte']\r
+strip_id_or_class: 'breadcrumb'
+test_url: http://www.vg.no/spill/artikkel.php?artid=10003628
\ No newline at end of file
--- /dev/null
+title: concat("Video: ", //div[@id='currentVideoTitleDivId'])\r
+body: //div[@id='currentVideoDescriptionId']\r
+author: //meta[@name='author']/@content\r
+\r
+# The two rules below rewrite the raw HTML before parsing: the inline style on\r
+# the description and title divs is changed from style="display... to\r
+# style="displayitplease..., so the attribute no longer starts with "display".\r
+# NOTE(review): presumably this stops hidden (display:none) elements from being\r
+# discarded before the title/body patterns run - confirm against the page source.\r
+replace_string(<div id="currentVideoDescriptionId" style="display): <div id="currentVideoDescriptionId" style="displayitplease\r
+\r
+replace_string(<div id="currentVideoTitleDivId" style="display): <div id="currentVideoTitleDivId" style="displayitplease\r
+\r
+test_url: http://video.forbes.com/fvn/business/wells-fargo-inside-the-bank-that-works
\ No newline at end of file
--- /dev/null
+title: //h2[@class='posttitle']\r
+date: substring-before(substring-after(//span[@class='postdate'], 'on '), ' by')\r
+date: //span[@class='postdate']\r
+author: //span[@class='postdate']/a\r
+body: //div[@class='entry line_top']
+test_url: http://videogum.com/395042/here-are-some-afternoon-links-92/list/
\ No newline at end of file
--- /dev/null
+title: //h2[@class='headline']\r
+\r
+body: //div[@class='ContentPrint']
+\r
+prune: no\r
+\r
+single_page_link: //a[contains(@href, '/printVersion/')]\r
+\r
+test_url: http://www.villagevoice.com/2010-03-16/news/new-york-s-ten-worst-landlords/
\ No newline at end of file
--- /dev/null
+title: //title\r
+body: //iframe\r
+\r
+find_string: <html><iframe \r
+replace_string: <iframe id="video" \r
+\r
+find_string: ></iframe></html>\r
+replace_string: ></iframe>\r
+\r
+replace_string("): "\r
+\r
+single_page_link: //link[@type='text/xml+oembed']\r
+\r
+prune: no\r
+tidy: no\r
+\r
+test_url: http://vimeo.com/35941909
\ No newline at end of file
--- /dev/null
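+# Author's name, when present, has 'skrifar:' ('writes:') appended to it.\r
+# In case of multiple authors, this would be 'skrifa:'. The expression below\r
+# passes (string-length - 7) as the length, but because XPath's substring()\r
+# is 1-based and the start position given is 0, it returns one character\r
+# fewer than that, i.e. the last 8 characters are stripped off.\r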
+author: substring(//div[@class='paragraph']/div[@class='meta'], 0, string-length(//div[@class='paragraph']/div[@class='meta']) - 7)\r
+\r
+date: //span[@class='date']\r
+title: //h1\r
+body: //div[@class='paragraph']\r
+\r
+# Strip out author string when present\r
+strip: //div[@class='paragraph']/div[@class='meta']\r
+\r
+convert_double_br_tags: yes
+test_url: http://visir.is/esb,-ipa,-bhm-og-bsrb/article/2012701319997
\ No newline at end of file
--- /dev/null
+strip: //*[(@id = "ja-search")]\r
+body: //*[(@id = "ja-mainbody")]\r
+body: //*[(@id = "content-mass-bottom")]\r
+strip://h3[contains(span,'Related Posts')]\r
+strip://img
+test_url: http://vitispr.com/blog/coventry-is-a-technology-hotspot
\ No newline at end of file
--- /dev/null
+body: //*[(@class = "historia")]
+test_url: http://vivirmexico.com/2011/09/en-veracruz-arrojan-35-cuerpos-a-plena-luz-del-dia-esta-si-es-una-alarma-social
\ No newline at end of file
--- /dev/null
+body: //div[@cpms_content]//h2[@class='Lead'] | //div[@cpms_content]//p[@class='Normal'] | //div[@cpms_content]//table\r
+strip://div[@class="box-item"]\r
+strip://div[@id="ARTICLE_BANNER"]\r
+strip://a\r
+strip://div[@class="tag-parent"]\r
+strip://div[@class="email-print txtr"]\r
+\r
+test_url: http://vnexpress.net/gl/xa-hoi/2011/04/tim-thay-nan-nhan-cuoi-cung-vu-sap-mo-da-o-len-co/
\ No newline at end of file
--- /dev/null
+title: //h1\r
+body: //div[@class='entrytext']
+test_url: http://voices.washingtonpost.com/ezra-klein/2010/10/why_isnt_monetary_policy_discr.html
\ No newline at end of file
--- /dev/null
+body: //div[contains(@class, 'KonaBody')]\r
+
+test_url: http://www.vworker.com/RentACoder/misc/BidRequests/ShowBidRequest.asp?lngBidRequestId=1634186
\ No newline at end of file
--- /dev/null
+title: //h2[@class="title"]\r
+body: //div[@class="post"]\r
+
+test_url: http://waffle.wootest.net/2011/06/22/on-reading-news/
\ No newline at end of file
--- /dev/null
+title: //div[@id='pr']/h3\r
+author: //div[@class='dateline']//a[contains(@href, '/author/')]\r
+\r
+# print page\r
+body: //div[@id='prbody']\r
+# standard page\r
+body: //div[@id='pgbody']\r
+\r
+# for multi-page articles\r
+single_page_link: //div[@class='tipjar']//a[contains(@href, '/printerFriendly.php?')]\r
+\r
+prune: no\r
+
+test_url: http://www.walrusmagazine.com/articles/2011.12-memoir-kidnapped
\ No newline at end of file
--- /dev/null
+title: //h3\r
+body: //div[@class="content_wysiwyg"]
+test_url: http://www.warnerbros.fr/game-of-thrones-un-junket-vu-de-l-interieur-268.html
\ No newline at end of file
--- /dev/null
+title://a[@class = 'headline-article']\r
+\r
+author: substring-after(//div[@class = 'article']/p[@class = 'author'], 'By ')\r
+date://div[@class = 'article']/span[@class = 'date']\r
+body://div[@class = 'article']\r
+single_page_link://a[@class = 'print']\r
+strip://p[@class = 'author']\r
+strip://a[@class = 'headline-article']\r
+strip://span[@class = 'date']
+test_url: http://www.washingtonmonthly.com/magazine/julyaugust_2011/features/the_trinity_sisters030380.php
\ No newline at end of file
--- /dev/null
+body: //div[@class="article_body"]\r
+author://meta[@name='DC.creator']/@content\r
+title://meta[@name='title']/@content\r
+date://div[contains(@class,'byline')]//span[contains(@class,'published')]/@title\r
+date://meta[@name="DC.date.issued"]/@content\r
+strip://div[@class="relative primary-slot padding-top img-border gallery-container photo-wrapper"]\r
+strip://div[@id="wp-column six end"]\r
+strip://div[contains(@class,'hidden')]\r
+strip://div[@id='article-side-rail']\r
+strip://div[@class="module component todays-paper-module curved"]\r
+strip://div[@class="module component live-qa curved img-border"]\r
+strip://div[@class="module component newsletter-signup curved"]\r
+strip://div[@class="module featured-stories component curved img-border"]\r
+\r
+strip_id_or_class: carousel\r
+strip_id_or_class: toolbar\r
+strip_id_or_class: module\r
+\r
+test_url: http://www.washingtonpost.com/world/europe/in-europe-new-fears-of-german-might/2011/10/19/gIQA3baZ7L_story.html?hpid=z1\r
+test_url: http://www.washingtonpost.com/national/health-science/radical-theory-of-first-americans-places-stone-age-europeans-in-delmarva-20000-years-ago/2012/02/28/gIQA4mriiR_story.html\r
+test_url: http://www.washingtonpost.com/lifestyle/magazine/the-sorry-fate-of-a-tech-pioneer-halsey-minor-and-historic-virginia-estate-carters-grove/2012/05/30/gJQAwdJG4U_story.html
\ No newline at end of file
--- /dev/null
+body: //div[@id='template_article']\r
+\r
+strip_id_or_class: article_more\r
+strip: //hr\r
+
+test_url: http://www.web-libre.org/dossiers/jacuzzi-gonflable,8493.html
\ No newline at end of file
--- /dev/null
+title://div[@class="post"]/h2\r
+author://p[@class="postinfo"]/a\r
+date:substring-before(substring-after(//p[@class="postinfo"],' on '),' under ')\r
+body://div[@class="contenttext"]
+test_url: http://weblog.bignerdranch.com/?p=304
\ No newline at end of file
--- /dev/null
+title: //h2[@class="pageTitle"]\r
+strip: //div[@class="postfoot"]\r
+strip: //h2[@class="pageTitle"]\r
+strip: //h3[@class="pageTitle"]\r
+body: //div[@class="post"]\r
+author: substring-before(substring-after(//div[@class="postfoot"], 'by'), 'Filed')\r
+date: substring-before(substring-after(//div[@class="postfoot"], 'Published'), 'by')\r
+
+test_url: http://weblogs.asp.net/scottgu/archive/2011/08/31/html-editor-smart-tasks-and-event-handler-generation-asp-net-vnext-series.aspx
\ No newline at end of file
--- /dev/null
+tidy: no\r
+dissolve: //div[@id="content"]/div/article/header\r
+body: //div[@id="content"]/div/article \r
+title: //div[@id="content"]/div/article/h1\r
+date: //div[@id="content"]/div/article/header/div[@id="issueSelectTrigger"]\r
+strip: //div[@id="content"]/div/article/h1\r
+
+test_url: http://webpaper.nzz.ch/2012/06/23/front/JJKMS/aphrodite-und-die-kommunisten?guest_pass=24a3ca5b6d%3AJJKMS%3Ad30e1be8628c099669671d4da56cdce4187790ba
\ No newline at end of file
--- /dev/null
+# set body\r
+tidy: no\r
+body: //div[contains(@class, 'articleContent')]\r
+\r
+# remove clutter\r
+strip: //div[@class='advertising']\r
+strip: //div[@class='themenalarm']\r
+strip: //div[contains(@class, 'inTextTeaser')]\r
+\r
+# remove captions\r
+strip: //span[@class='copyRight']\r
+\r
+# remove photo galleries and extras\r
+strip: //div[contains(@class, 'textGallery')]\r
+strip: //div[contains(@class, 'videoGallery')]\r
+strip: //div[contains(@class, 'imageGallery')]\r
+strip: //div[contains(@class, 'openContent')]\r
+\r
+# remove comments\r
+strip: //div[@id = 'writeComment']\r
+\r
+test_url: http://www.welt.de/vermischtes/weltgeschehen/article11050589/27-Bergleute-in-neuseelaendischer-Mine-vermisst.html
\ No newline at end of file
--- /dev/null
+title: substring-before(//title, '«')\r
+\r
+body: //div[@class='entry']\r
+strip: //div[@class='sharing_label']\r
+strip: //div[@class='snap_nopreview sharing robots-nocontent']
+test_url: http://www.westhamtillidie.com/2012/03/11/twelve-things-we-learned-from-the-doncaster-game/
\ No newline at end of file
--- /dev/null
+autodetect_next_page: no
+test_url: http://what-if.xkcd.com/1/
\ No newline at end of file
--- /dev/null
+strip: //div[@class="navigation"]\r
+strip: //div[@id="sidebar"]\r
+strip: //div[@id="post-extra-content"]\r
+strip: //div[@id="footer"]\r
+strip: //div[contains(@class, "sharing")]\r
+
+test_url: http://whatever.scalzi.com/2011/01/09/quick-giffords-follow-up/
\ No newline at end of file
--- /dev/null
+body://div[contains(@class,'oAndtLyrics')]\r
+strip://div[contains(@class,'info')]\r
+strip://div[contains(@id,'romanization')]\r
+strip://div[contains(@id,'youtube')]\r
+strip://div[contains(@id,'romanizationSelector')]\r
+strip://div[contains(@id,'langSelectWrap')]\r
+strip://div[contains(@id,'requestTranslationWrap')]\r
+strip://div[contains(@id,'viewMore')]\r
+strip://div[contains(@class,'lyricsListInMainContent')]\r
+strip://div[contains(@class,'descIpNoti')]
+test_url: http://wheelyric.com/lyrics/121#2
\ No newline at end of file
--- /dev/null
+title: //h1\r
+body: //div[@id='content']\r
+strip_id_or_class: editsection\r
+strip_id_or_class: toc\r
+strip: //div[@id='siteNotice']\r
+strip: //div[@id='content']//table[last()]\r
+prune: no
+test_url: http://wiki.guildwars.com/wiki/Monk
\ No newline at end of file
--- /dev/null
+title: //h1\r
+body: //div[@id='content']\r
+strip_id_or_class: editsection\r
+strip_id_or_class: toc\r
+strip: //div[@id='siteNotice']\r
+strip: //div[@id='content']//table[last()]\r
+prune: no
+test_url: http://wiki.guildwars2.com/wiki/Guardian
\ No newline at end of file
--- /dev/null
+# copied from .wikipedia.org.txt\r
+title: //h1[@id='firstHeading' or @class='firstHeading']\r
+body: //div[@id = 'bodyContent']\r
+strip_id_or_class: editsection\r
+#strip_id_or_class: toc\r
+strip_id_or_class: vertical-navbox\r
+strip: //table[@id='toc'] | //div[@id='p-toc']\r
+strip: //div[@id='catlinks' or @id='contentSub']\r
+strip: //div[@id='jump-to-nav']\r
+strip: //div[@class='thumbcaption']//div[@class='magnify']\r
+strip: //table[@class='navbox']\r
+prune: no\r
+tidy: no
+test_url: http://wikitravel.org/wiki/en/index.php?title=Bangkok&printable=yes
\ No newline at end of file
--- /dev/null
+strip: //div[@class="widget-area"]\r
+title: //*[@class="entry-title"]\r
+date: //time[@class="entry-date"]
+test_url: http://will-self.com/2012/02/01/real-meals-dominos-pizza/
\ No newline at end of file
--- /dev/null
+title: substring-after(//span[@class='itemTitle'], ':') \r
+body: //div[@id='content']
+test_url: http://www.williampfaff.com/modules/news/article.php?storyid=491
\ No newline at end of file
--- /dev/null
+title: //h1/span\r
+\r
+body: //div[@id="news_content"]\r
+\r
+author: //div[@class="bookmarks_btm"]/p[1]/a[1]/text()\r
+\r
+date: //span[@class='date']\r
+\r
+# Remove the category image\r
+strip: //div[@id="news_content"]/a[1]\r
+
+test_url: http://winfuture.de/news,69672.html
\ No newline at end of file
--- /dev/null
+title: //h1[@class='page-heading']\r
+author: //small/strong/a\r
+# Their date string is relative, so if you save the page 2 hours after it is posted it may say 'two hours ago' instead of providing a useful date/time.\r
+date: substring-before(substring-after(//small,'on'),'with')\r
+body: //div[@class='entry']\r
+test_url: http://www.winrumors.com/chinese-windows-phone-launch-still-on-track-for-early-2012/
\ No newline at end of file
--- /dev/null
+date: //*[@class='kicker']\r
+body: //*[@class='KonaBody']\r
+test_url: http://www.winsupersite.com/article/paul-thurrotts-wininfo/android-malware-surges-separate-studies-141364
\ No newline at end of file
--- /dev/null
+title: //meta[@property="og:title"]/@content \r
+title: //h1\r
+title: //*[@class='posttitle']\r
+author: //*[@class='entryAuthor']/a[1]\r
+author://*[@class='member-title']\r
+author://li[@class='author']/a[contains(@href, '/author/')]\r
+date: substring-after(//div[@class='entryAuthor'], '·')\r
+date: substring-before(//*[@class='entryDate'], '|')\r
+body: //div[@class='entry']\r
+strip: //span[contains(@class, 'nextprev')]\r
+#strip_id_or_class: ngg-galleryoverview \r
+# ngg-galleryoverview is the whole content sometimes, e.g. http://www.wired.com/underwire/2011/12/best-mixtapes-of-2011/?pid=5736&viewall=true\r
+\r
+strip: //p[span[contains(@class, 'contentjump')]]\r
+strip: //text()[contains(., 'nextpage')]\r
+\r
+prune: no\r
+\r
+single_page_link: //a[contains(@href, '/all/1') and contains(@class, 'contentjumpall')]\r
+\r
+test_url: http://www.wired.com/cloudline/2011/10/meet-arms-cortex-a15-the-future-of-the-ipad-and-possibly-the-macbook-air/\r
+test_url: http://www.wired.com/threatlevel/2012/05/ff_counterfeiter/all/1
\ No newline at end of file
--- /dev/null
+# Title, body, author and date all come from the div with class "bodyText".\r
+title: //div[@class="bodyText"]/h1/text()\r
+body: //div[@class="bodyText"]\r
+\r
+# author and date are separated by only a newline\r
+# can't figure out how to tokenize that yet\r
+author: //div[@class="bodyText"]/span[@class="info"]/text()\r
+date: //div[@class="bodyText"]/span[@class="info"]/text()\r
+\r
+# strip metadata from body text\r
+strip: //div[@class="bodyText"]/h1/text()\r
+strip: //div[@class="bodyText"]/span[@class="info"]
+test_url: http://www.wmnf.org/news_stories/light-rail-advocates-join-forces-to-combat-opposition-in-pinellas
\ No newline at end of file
--- /dev/null
+date://*[@class="entry-date"]\r
+author://*[@class="author vcard"]\r
+strip://*[@style="position:relative;left:72px;top:2px;"]|//*[@id="authorbox"]
+test_url: http://wmpoweruser.com/breaking-nokia-announces-nfc-support-in-lumia-610-windows-phone-device/
\ No newline at end of file
--- /dev/null
+title: //div[@class="content article"]/h1\r
+date: substring-after(//*[@class='date'], '//')\r
+body: //*[@class='article-content']\r
+strip: //*[@id='nomodal']
+test_url: http://www.worldpoultry.net/news/kyrgyzstan-restricts-poultry-imports-from-russia-and-kazakhstan-9332.html
\ No newline at end of file
--- /dev/null
+title: //p[@id='content']\r
+\r
+body: //div[@class='contentblock']
+test_url: http://www.worldwidewords.org/weirdwords/ww-gro1.htm
\ No newline at end of file
--- /dev/null
+title: //h2[@class="posttitle"]\r
+body: //div[@class="post"]\r
+strip: //h2[@class="posttitle"]\r
+strip: //p[@class="filed-under"]\r
+convert_double_br_tags: yes
+test_url: http://wow.joystiq.com/2011/06/20/the-overachiever-guide-to-midsummer-festival-2011-achievements/
\ No newline at end of file
--- /dev/null
+body://div[@id='articleNew']\r
+strip://div[@id='articleBy']\r
+strip://div[@id='articleDate']\r
+strip://td[@class='articleGraphicCredit']\r
+strip://h1\r
+strip://div[@id='articleEnd']\r
+strip://p[@class='tagline']\r
+strip://div[@class='openBox adslibraryArticle']\r
+strip_id_or_class:ad-180x150-1\r
+\r
+\r
+title: //div[@id="articleNew"]/h1\r
+author: //div[@id="articleBy"]/p/b\r
+date: substring-before(//div[@id="articleDate"], "-")\r
+test_url: http://www1.folha.uol.com.br/mundo/1115805-ex-ditador-argentino-videla-e-condenado-a-50-anos-de-prisao.shtml
\ No newline at end of file
--- /dev/null
+strip_id_or_class: hidelabel
+test_url: http://www3.imperial.ac.uk/newsandeventspggrp/imperialcollege/newssummary/news_14-7-2010-15-53-18
\ No newline at end of file
--- /dev/null
+# Title is the page's first-level heading. The pattern must be absolute
+# (//h1): a bare "h1" is evaluated relative to the document node and would
+# never match.
+title: //h1
+author: //*[@class = 'author']
+date: //*[@class = 'date']
+body: //*[@id = 'art']
+# Multi-page articles: follow the pager link whose text contains 'nastepne'.
+next_page_link: //*[@id='Str']/a[contains(text(), 'nastepne')]
+strip: //*[@class = 'rel_zdjTOP']
+strip: //*[@id = 'rel']
+strip: //*[@class = 'txt_upl']
+# The pager itself is removed from the extracted body.
+strip: //*[@id='Str']
+strip: //*[@id='source']
+test_url: http://wyborcza.pl/1,123455,11536088,Gdy_peknie_fejs__obryzga_wszystko.html?as=1&startsz=x
\ No newline at end of file
--- /dev/null
+body: //div[@class='article-body']\r
+title: //h1
+test_url: http://wyctim.com/icloud-sync-regebbi-rendszereken/
\ No newline at end of file
--- /dev/null
+title://h1\r
+\r
+date://p[@class='articleDate']\r
+body://div[@class='articleBody wzStandardArticle']
+test_url: http://www.wz-newsline.de/home/sport/tennis/federer-zum-vierten-mal-sieger-in-indian-wells-1.938050
\ No newline at end of file
--- /dev/null
+title: //h1[@class="entry-title"]\r
+author: //span[@class="fn"]\r
+date: //p[@class="meta"]
+test_url: http://xoeb.us/blog/2012/03/16/my-mistakes-with-our-first-release/
\ No newline at end of file
--- /dev/null
+title: //div[@class='pagetitle']
+test_url: http://www.yated.com/content.asp?categoryid=7&contentid=582
\ No newline at end of file
--- /dev/null
+title://div[@class='entry-title']\r
+body://div[@class='entry-content']\r
+strip_comments:yes\r
+convert_double_br_tags:yes
+test_url: http://www.yostivanich.com/2010/07/11/wired-com-with-world-watching-wikileaks-falls-into-disrepair/
\ No newline at end of file
--- /dev/null
+title: //title\r
+body: //iframe\r
+\r
+find_string: <html><iframe \r
+replace_string: <iframe id="video" \r
+\r
+find_string: ></iframe></html>\r
+replace_string: ></iframe>\r
+\r
+single_page_link: //link[@type='text/xml+oembed']\r
+\r
+prune: no\r
+tidy: no\r
+\r
+test_url: http://www.youtube.com/watch?v=F6gLH0r3iVU
\ No newline at end of file
--- /dev/null
+title: //h1[@class="h s-1"]\r
+author: substring-before(substring-after(//p[@class="meta s-10"], 'By'), '|')\r
+author: substring-after(//div[@class="bio"]//h3, 'About ')\r
+date: substring-after(//p[@class="meta s-10"], '|')\r
+date: substring-after(//p[@class="meta"], '|')\r
+body: //div[@class="content-1 entry space-1 clear"]\r
+body: //div[@class="storyBody"]\r
+\r
+test_url: http://www.zdnet.com/blog/microsoft/the-bing-back-end-more-on-cosmos-tiger-and-scope/10920\r
+test_url: http://www.zdnet.com/researchers-find-web-tracking-up-privacy-down-7000000358/
\ No newline at end of file
--- /dev/null
+# 2012-12-23 [carlo@...] fixed half-assed headlines in articles, removed inline author profiles, adjusted picture captions\r
+# 2012-03-17 [dkless@...] Cut metadata parts in the beginning and the ends of the content block; copyright entries for pictures removed; Author fixed, not sure if old entries still valid (I left them); Weird problems with some pages addressed (see last section for removing hidden section)\r
+# 2011-12-09 [carlo@...] Removed "related articles" block\r
+# 2011-08-23 [carlo@...] changed single page link to use print version: page works better, less ambiguity. Related cleanups and simplifications.\r
+# 2011-08-20 [carlo@...] added author, fixed date\r
+\r
+\r
+single_page_link: //a[@title='Druckversion']\r
+tidy: no\r
+\r
+title: //title\r
+date: substring-before( //li[@class="date"], " " )\r
+author: //li[@class="author"]/a/text() | //li[@class="author first"]/a/text()\r
+author: substring-after(//li[@class='source first '], 'Quelle: ')\r
+\r
+strip_id_or_class: articleheader\r
+strip: //div[@id="comments"] | //div[@class="pagination block"] | //p[@class="ressortbacklink"] | //div[@id="relatedArticles"] | // div[@class="inline portrait"]\r
+\r
+#Removes author and date from the start\r
+strip: //ul[@class="tools"]\r
+#Removes copyright statement - it often gets in the way as the first line of the news\r
+strip: //p[@class="copyright"]\r
+strip: //div[@class="copyright"]\r
+#Removes pagination links at the end\r
+strip: //div[@class="pagination"]\r
+\r
+# Fix picture captions\r
+wrap_in(small): //p[@class="caption"]/text()\r
+\r
+# Fix sub-headlines\r
+wrap_in(h2): //p/strong\r
+dissolve: //h2/strong\r
+\r
+#Sometimes things are embedded in the print version that are not displayed on the web, but will be displayed in the mobilized versions and lead even to problems. These sections are removed here.\r
+strip_id_or_class:"informatives"\r
+strip_id_or_class:"bottom"\r
+strip_id_or_class:"teasermosaic"\r
+strip_id_or_class:"comments"\r
+strip_id_or_class:"articlefooter af"\r
+strip_id_or_class:"relateds"\r
+strip_id_or_class:"pagination"\r
+\r
+footnotes: no\r
+test_url: http://www.zeit.de/kultur/film/2012-12/Kurzfilmtag
\ No newline at end of file
--- /dev/null
+author: //span[@class='author']//a\r
+date: //span[@class='date']\r
+test_url: http://zerodistraction.com/blog/2012/3/11/retina-ipad-that-means-i-am-going-digital-only-for-comic-boo.html\r
+test_url: http://zerodistraction.com/notes/unreasonably-grumpy
\ No newline at end of file
--- /dev/null
+title: //h1\r
+body: //div[@id="primarycontent"]
+test_url: http://zerokspot.com/weblog/2011/06/26/europython2011/
\ No newline at end of file
--- /dev/null
+# Title is the <h2> inside the element whose id starts with 'post'.\r
+# A string function such as substring-after() cannot be followed by a path\r
+# step, so the id test is written as a predicate instead.\r
+# NOTE(review): assumes post containers carry ids like "post-123" - confirm\r
+# against the page markup.\r
+title: //*[starts-with(@id, 'post')]/h2\r
+body://div[@class = 'entry']
+test_url: http://www.zingtrain.com/category/ontrack/january-2007/
\ No newline at end of file