]>
Commit | Line | Data |
---|---|---|
4e067cea NL |
1 | # metadata selectors (author = text after 'By ' in the first byLine item; date = text between the first two commas of the last byLine item) |
2 | title://h1[@id = 'articleTitle'] | |
3 | author:substring-after(//ul[@id = 'byLine']/li[1],'By ') | |
4 | date:substring-before(substring-after(//ul[@id = 'byLine']/li[last()],','),',') | |
5 | body://div[@id = 'article-body'] | |
6 | ||
7 | # full content: single-page link taken from the anchor inside the 'article-singlepage' list item | |
8 | single_page_link://td/li[@class = 'article-singlepage']/a | |
9 | ||
3bb6a8ed NL |
10 | # continue link: the anchor with id 'continue-btn' is also registered as a single-page link |
11 | single_page_link: //a[@id='continue-btn'] | |
12 | ||
4e067cea NL |
13 | # caption clean-up (presumably wraps the wide image caption in <i> and moves the articleImage paragraph into it; TODO confirm the parser supports wrap_in/move_into) |
14 | wrap_in(i)://span[@class='articleImageCaptionwide'] | |
15 | move_into (//span[@class='articleImageCaptionwide'])://div[@id = 'articleImage']/p | |
16 | ||
17 | ||
18 | # clean-up: strip the pagination wrapper, the breadcrumb list and the 'viewMorePhotos' block | |
19 | strip://p[@id = 'articlePaginationWrapper'] | |
20 | strip://ul[contains(@class, 'cat-breadcrumb')] | |
21 | strip://div [@class= 'viewMorePhotos'] | |
ac4d1142 | 22 | |
3bb6a8ed | 23 | test_url: http://www.smithsonianmag.com/history-archaeology/The-Goddess-Goes-Home.html |