From da87848cee77d1d1c491924d30b7c92667bfc44c Mon Sep 17 00:00:00 2001 From: tcitworld Date: Tue, 1 Jul 2014 10:18:44 +0200 Subject: new config file, fix for #740 --- inc/3rdparty/site_config/standard/moo.nac.uci.edu.txt | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 inc/3rdparty/site_config/standard/moo.nac.uci.edu.txt (limited to 'inc/3rdparty/site_config/standard') diff --git a/inc/3rdparty/site_config/standard/moo.nac.uci.edu.txt b/inc/3rdparty/site_config/standard/moo.nac.uci.edu.txt new file mode 100644 index 00000000..24c949e9 --- /dev/null +++ b/inc/3rdparty/site_config/standard/moo.nac.uci.edu.txt @@ -0,0 +1,9 @@ +title: //div[@id='header']//h1[1] + +body: //div[@id='content'] + +strip_id_or_class: toc + +prune: no + +test_url: http://moo.nac.uci.edu/~hjm/HOWTO_move_data.html -- cgit v1.2.3 From 82980a148b1e437012744c6755cdc38560132f91 Mon Sep 17 00:00:00 2001 From: Maryana Rozhankivska Date: Mon, 7 Jul 2014 19:17:55 +0300 Subject: quick fix of issue #750: multipage content for politico.com/magazine articles --- inc/3rdparty/site_config/standard/politico.com.txt | 4 ++++ 1 file changed, 4 insertions(+) mode change 100644 => 100755 inc/3rdparty/site_config/standard/politico.com.txt (limited to 'inc/3rdparty/site_config/standard') diff --git a/inc/3rdparty/site_config/standard/politico.com.txt b/inc/3rdparty/site_config/standard/politico.com.txt old mode 100644 new mode 100755 index 121fd5b9..c5302d1b --- a/inc/3rdparty/site_config/standard/politico.com.txt +++ b/inc/3rdparty/site_config/standard/politico.com.txt @@ -4,10 +4,14 @@ body://div[contains(@class,"story-text")] # Why doesn't this work? 
next_page_link://ul[contains(@class,"pagination")]/li/a[@rel="next"] next_page_link://ul[contains(@class,"pagination")]/li[contains(@class, "current")]/following-sibling::node()/a +next_page_link://div[contains(@class,"pagination")]/ol/li[contains(@class, "current")]/following-sibling::node()/a date://meta[@name="publish_date"]/@content strip://div[contains(@class, "breadcrumbs")] strip://a[contains(@class, "hidden")] strip://div[contains(@class, "story-embed")] strip://div[contains(@class, "story-text")]//p/a[contains(text(), "Also on POLITICO:")]/.. +strip://div[contains(@class, "story-interrupt")] +strip://footer[contains(@class, "author-bio")] + test_url: http://www.politico.com/news/stories/0712/78105.html \ No newline at end of file -- cgit v1.2.3 From d59536deea443f4bdac2c5cf1bfeea690810a817 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Si=C3=B4n=20Le=20Roux?= Date: Thu, 10 Jul 2014 18:30:44 +0200 Subject: Add support for *.about.com Includes next_page_link for multi-page articles and strips pesky in-line 'next' links from the article body. Also includes an Xpath for author but I can't see where this is used in the wallabag UI. The 'tidy' option is turned off because it messed up bulleted lists. Tested with psychology.about.com and food.about.com. --- inc/3rdparty/site_config/standard/.about.com.txt | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 inc/3rdparty/site_config/standard/.about.com.txt (limited to 'inc/3rdparty/site_config/standard') diff --git a/inc/3rdparty/site_config/standard/.about.com.txt b/inc/3rdparty/site_config/standard/.about.com.txt new file mode 100644 index 00000000..e1ebaee3 --- /dev/null +++ b/inc/3rdparty/site_config/standard/.about.com.txt @@ -0,0 +1,14 @@ +body: //div[@id='articlebody'] +title: //h1 +author: //p[@id='by']//a + +next_page_link: //span[@class='next']/a +# Not the same as below! 
+ +prune: yes +tidy: no + +# Annoying 'next' links plainly inside the article body +strip: //*[text()[contains(.,'Next: ')]] + +test_url: http://psychology.about.com/od/theoriesofpersonality/ss/defensemech.htm -- cgit v1.2.3