# Site configuration for Scientific American (SciAm) articles.
#
# NOTE (2/3/13): After site revisions at SciAm, this configuration does
# not work, especially for multi-page articles. For every article there
# is now a "Print" link, which is far more reliable. So this
# configuration should be removed or disabled.

# Metadata
title://h1[@class = 'articleTitle']
# Author: the byline text that follows the leading "By ".
author:substring-after(//span[@class = 'byline'],'By ')
# Date: the datestamp text that precedes the '|' separator.
date:substring-before(//span[@class = 'datestamp'],'|')

# Body content
body://div[@id = 'articleContent']

# Pagination: the next-page rule is commented out; the single-page rule
# below matches the link whose href contains 'print=true'.
#next_page_link://li[@id = 'flairPagination']/a[last()]
single_page_link: //a[contains(@href, 'print=true')]

# Cleanup: remove the 'fsgBooks' block from the extracted content.
strip://div[@class = 'fsgBooks']

# Sample article URLs for testing this configuration.
test_url: http://www.scientificamerican.com/article.cfm?id=do-brain-scans-comatose-patients-reveal-conscious-state
test_url: http://www.scientificamerican.com/article.cfm?id=solar-wind-transforms-venus-into-shape-of-comet