]>
Commit | Line | Data |
---|---|---|
ac4d1142 NL |
1 | # metadata: XPath for title, author (first byline item, text after 'By '), date (last byline item, text between its first two commas) and body\r |
2 | title://h1[@id = 'articleTitle']\r | |
3 | author:substring-after(//ul[@id = 'byLine']/li[1],'By ')\r | |
4 | date:substring-before(substring-after(//ul[@id = 'byLine']/li[last()],','),',')\r | |
5 | body://div[@id = 'article-body']\r | |
6 | \r | |
7 | # full content: anchor inside the 'article-singlepage' list item, used as the single-page link\r | |
8 | single_page_link://td/li[@class = 'article-singlepage']/a\r | |
9 | \r | |
10 | # caption cleanup for 'articleImageCaptionwide' spans and the <p> under div#articleImage (NOTE(review): wrap_in/move_into semantics are defined by the consuming tool - confirm there)\r | |
11 | wrap_in(i)://span[@class='articleImageCaptionwide']\r | |
12 | move_into (//span[@class='articleImageCaptionwide'])://div[@id = 'articleImage']/p\r | |
13 | \r | |
14 | \r | |
15 | # cleanup: strip the pagination wrapper, the breadcrumb list and the 'viewMorePhotos' block\r | |
16 | strip://p[@id = 'articlePaginationWrapper']\r | |
17 | strip://ul[contains(@class, 'cat-breadcrumb')]\r | |
18 | strip://div [@class= 'viewMorePhotos']\r | |
19 | ||
20 | test_url: http://www.smithsonianmag.com/history-archaeology/The-Goddess-Goes-Home.html |