aboutsummaryrefslogblamecommitdiffhomepage
path: root/inc/3rdparty/site_config/standard/neh.gov.txt
blob: e7cc43134211bfd3954d3ed26b8698be83a4f306 (plain) (tree)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15














                                                                                

                                                                       
# Site extraction rules for neh.gov article pages.
# Per the original author's note, the host configuration should be
# http://www.neh.gov/news/humanities/ (the Humanities magazine section).


# Metadata
# Title and date are both carved out of the page's <title> text by splitting
# on ':'. NOTE(review): this presumably relies on a <title> of the form
# "<prefix>: <date>: <headline>" - confirm against the test_url below.
# title = everything after the second ':' in <title>.
title:substring-after(substring-after(//title,':'),':')
# author = the text following the first "By" in the h2.subHead byline
# (any whitespace right after "By" is part of the XPath result).
author:substring-after(//h2[@class = 'subHead'],'By')
# date = the text between the first and second ':' in <title>.
date:substring-before(substring-after(//title,':'),':')

# Image and caption handling
# Wrap every text node found under a <p> inside the tables of #mainContent in
# <small>. NOTE(review): presumably these tables hold images and their
# captions - verify on the test page.
wrap_in(small)://div[@id = 'mainContent']/table/descendant::p/descendant::text()
# Wrap each of those tables (direct children of #mainContent) in a <fieldset>.
wrap_in(fieldset)://div[@id = 'mainContent']/table

# Clean up: remove elements that should not appear in the extracted article.
strip: //table[@class = 'marginpaddingTop']
# Same h2.subHead element that the author rule above reads from.
strip: //h2[@class = 'subHead']

# Sample article URL for testing this configuration.
test_url: http://www.neh.gov/news/humanities/2011-11/IslamicScholar.html