]>
Commit | Line | Data |
---|---|---|
ac4d1142 NL |
1 | title://h1[@class="articleHeadline"]\r |
2 | body://div[@id="article"]\r | |
3 | strip_id_or_class:articleTools\r | |
4 | strip_id_or_class:readerscomment\r | |
5 | #strip://div[contains(@class, "articleInline runaroundLeft")]\r | |
6 | strip: //div[contains(@class, "doubleRule")]\r | |
7 | # strip the image credit - it is an h6 inside the articleInline block and would otherwise appear as a bold heading\r | |
8 | strip: //div[contains(@class, "articleInline")]//h6\r | |
9 | strip_id_or_class:enlargeThis\r | |
10 | strip_id_or_class:pageLinks\r | |
11 | strip_id_or_class:memberTools\r | |
12 | strip_id_or_class:articleExtras\r | |
13 | strip_id_or_class:singleAd\r | |
14 | strip_id_or_class:byline\r | |
15 | strip_id_or_class:dateline\r | |
16 | strip_id_or_class:articleheadline\r | |
17 | strip_id_or_class:articleBottomExtra\r | |
18 | strip://a[contains(@href, 'nytimes.com/adx/')]\r | |
19 | strip: //nyt_byline\r | |
20 | strip: //span[contains(@class, 'slideshow') or contains(@class, 'video')]\r | |
21 | strip: //p[@class='caption']//a[contains(., 'More Photos')]\r | |
22 | \r | |
23 | prune: no\r | |
24 | tidy: no\r | |
25 | \r | |
26 | date: substring-after(//*[contains(@class, 'dateline')], 'Published:')\r | |
27 | \r | |
28 | single_page_link: //link[contains(@href, 'pagewanted=all')]\r | |
29 | #single_page_link: //a[contains(@href, 'pagewanted=all') and not(contains(@href, 'login'))]\r | |
30 | \r | |
31 | strip://ul[@id = 'toolsList']\r | |
32 | strip://h6[@class = 'kicker']\r | |
33 | author:substring-after(//h6[@class='byline'],'By ')\r | |
34 | \r | |
35 | test_url: http://www.nytimes.com/2011/07/24/books/review/an-academic-authors-unintentional-masterpiece.html\r | |
36 | test_url: http://www.nytimes.com/2012/06/10/arts/television/the-newsroom-aaron-sorkins-return-to-tv.html |