From ac4d114214d820b20e18518a2dbc809337e39043 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolas=20L=C5=93uillet?= Date: Fri, 6 Dec 2013 10:13:03 +0100 Subject: [add] new specific configuration files --- inc/3rdparty/site_config/standard/fnal.gov.txt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 inc/3rdparty/site_config/standard/fnal.gov.txt (limited to 'inc/3rdparty/site_config/standard/fnal.gov.txt') diff --git a/inc/3rdparty/site_config/standard/fnal.gov.txt b/inc/3rdparty/site_config/standard/fnal.gov.txt new file mode 100644 index 00000000..7faa6bfc --- /dev/null +++ b/inc/3rdparty/site_config/standard/fnal.gov.txt @@ -0,0 +1,15 @@ +title: normalize(//h1) + +author: //td/p[position()=last()]/em + +# I swear, this is really the best way to do this +date: normalize(//td[contains(@style, "color: #ffffff")]) + +# my god, it's full of tables +body: /table/tbody/tr[5]//table/tbody//table/tbody/tr/td +strip: //h1 + +# the following two lines strip the byline at the end of the article (the byline is a

<p> that consists of an em dash and then some text in an <em>). I have no idea why I can't just strip //p[position()=last()], but trying to do so includes a bunch of other crap in the output. +strip: //p[position()=last()]/em +strip: //p[position()=last()]/child::text() +test_url: http://www.fnal.gov/pub/today/archive_2011/today11-11-09_MuonDepartmentReadMore.html \ No newline at end of file -- cgit v1.2.3