git.immae.eu Git - github/wallabag/wallabag.git/blob - inc/3rdparty/site_config/standard/bostonglobe.com.txt
merge fix 776
[github/wallabag/wallabag.git] / inc / 3rdparty / site_config / standard / bostonglobe.com.txt
1 # NOTE: If testing this configuration yields bad results, including junk text like "Try BostonGlobe.com today" and "THIS STORY APPEARED IN", please replace the test_url at the bottom of this file with a link to a current-day headline from bostonglobe.com.
2
3 title: //div[@class="header"]/h1
4 author: substring-after(//div[@class="byline"]/h2[@class="author"],"By ")
5 date: //div[@class="byline"]/p[last()]
6 body: //div[@class="article-body"]
7
8 strip_id_or_class: aside
9 strip_id_or_class: promo
10 strip_id_or_class: skip-nav
11 strip_id_or_class: article-more
12 strip_id_or_class: article-bar
13
14 # The "strip_id_or_class: figure" directive below removes image captions. If the parser starts saving images from bostonglobe.com (currently, it does not), then that directive should be removed.
15 strip_id_or_class: figure
16 test_url: http://bostonglobe.com/news/nation/2012/03/17/illinois-primary-could-pivotal/PsDzFZqvhEYyXbOcF9FOkO/story.html