]>
Commit | Line | Data |
---|---|---|
4e067cea NL |
1 | # Stories may span several pages; multi-page articles should be detected. |
2 | body: //div[@id='storyBody'] | |
3 | body: //div[@id='article_body'] | |
4 | body: //div[@id='story_body'] | |
5 | ||
6 | title://h1[@id='article_headline'] | |
7 | ||
8 | # article author (text of the link inside the author paragraph) | |
9 | author: //p[@class='author']/a | |
10 | # story author(s): the byline text that follows the 'By ' prefix | |
11 | author: substring-after(//p[@class='byline'], 'By ') | |
12 | ||
13 | # article date | |
14 | date: //span[@class='published_date'] | |
15 | # story date | |
16 | date: //span[@class='date'] | |
17 | ||
18 | date: substring-after(//div[contains(@class,'attributor')],'on') | |
19 | strip_id_or_class: inset | |
20 | strip: //p/span[@class='photoCredit'] | |
21 | strip: //h1 | |
22 | ||
23 | strip_id_or_class: page_count | |
24 | strip_id_or_class: tools | |
25 | strip_id_or_class: pagination | |
26 | ||
27 | single_page_link: //li[@id='stPrint']/a | |
28 | ||
29 | test_url: http://www.businessweek.com/magazine/buyback-insurance-a-good-deal-for-retailers-07282011.html | |
ac4d1142 | 30 | test_url: http://www.businessweek.com/articles/2012-06-06/american-pain-the-largest-u-dot-s-dot-pill-mills-rise-and-fall |