]>
Commit | Line | Data |
---|---|---|
4e067cea NL |
1 | # keep all body text (pruning disabled) |
2 | prune: no | |
3 | ||
4 | # title, body, metadata (author, date) | |
5 | title: //div[@class='story_header']/h1 | |
6 | body: //div[@id='content'] | |
7 | author: substring-after(//span[@class='byline'], "by ") | |
8 | author: substring-after(//span[@class='byline'], "By ") | |
9 | author: //span[@class='byline'] | |
10 | date: //span[@class='date'] | |
11 | ||
12 | # formatting | |
13 | convert_double_br_tags: yes | |
14 | dissolve: //div[@class='slides_full']/ul/li | |
15 | ||
16 | # cleanup | |
17 | strip: //a[@id='story_note'] | |
18 | strip: //br | |
19 | strip: //div[@class='intro'] | |
20 | strip: //div[@class='share-block'] | |
21 | strip: //div[@class='sidebar-social'] | |
22 | strip: //div[@class='top-stories'] | |
23 | strip: //div[@class='prevnext'] | |
ac4d1142 | 24 | test_url: http://www.thedaily.com/page/2012/01/09/010912-news-college-costs-1-5/ |