]>
Commit | Line | Data |
---|---|---|
ac4d1142 NL |
1 | # Stories may span several pages; multi-page articles should be detected\r |
2 | body: //div[@id='storyBody']\r | |
3 | body: //div[@id='article_body']\r | |
4 | body: //div[@id='story_body']\r | |
5 | \r | |
6 | title://h1[@id='article_headline']\r | |
7 | \r | |
8 | # article author\r | |
9 | author: //p[@class='author']/a\r | |
10 | # story author(s)\r | |
11 | author: substring-after(//p[@class='byline'], 'By ')\r | |
12 | \r | |
13 | # article date\r | |
14 | date: //span[@class='published_date']\r | |
15 | # story date\r | |
16 | date: //span[@class='date']\r | |
17 | \r | |
18 | date: substring-after(//div[contains(@class,'attributor')],'on')\r | |
19 | strip_id_or_class: inset\r | |
20 | strip: //p/span[@class='photoCredit']\r | |
21 | strip: //h1\r | |
22 | \r | |
23 | strip_id_or_class: page_count\r | |
24 | strip_id_or_class: tools\r | |
25 | strip_id_or_class: pagination\r | |
26 | \r | |
27 | single_page_link: //li[@id='stPrint']/a\r | |
28 | \r | |
29 | test_url: http://www.businessweek.com/magazine/buyback-insurance-a-good-deal-for-retailers-07282011.html\r | |
30 | test_url: http://www.businessweek.com/articles/2012-06-06/american-pain-the-largest-u-dot-s-dot-pill-mills-rise-and-fall |