# Content selectors: the headline is the text of the <h1> inside the
# "bodyText" container, and that same container is taken as the article body.
1 title: //div[@class="bodyText"]/h1/text()
2 body: //div[@class="bodyText"]
4 # Author and date are separated by only a newline.
5 # No way has been found yet to tokenize them, so both directives below use
# the same XPath and each captures the combined author/date text.
6 author: //div[@class="bodyText"]/span[@class="info"]/text()
7 date: //div[@class="bodyText"]/span[@class="info"]/text()
9 # Strip metadata from the body text so it is not repeated in the article.
# The whole <h1> element is removed (not just its text node) so that no empty
# heading is left behind; the author/date span needs only one rule because
# both fields come from the same element.
10 strip: //div[@class="bodyText"]/h1
11 strip: //div[@class="bodyText"]/span[@class="info"]
# Sample article URL for testing the rules in this file.
13 test_url: http://www.wmnf.org/news_stories/light-rail-advocates-join-forces-to-combat-opposition-in-pinellas