From d59536deea443f4bdac2c5cf1bfeea690810a817 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Si=C3=B4n=20Le=20Roux?= Date: Thu, 10 Jul 2014 18:30:44 +0200 Subject: Add support for *.about.com Includes next_page_link for multi-page articles and strips pesky in-line 'next' links from the article body. Also includes an Xpath for author but I can't see where this is used in the wallabag UI. The 'tidy' option is turned off because it messed up bulleted lists. Tested with psychology.about.com and food.about.com. --- inc/3rdparty/site_config/standard/.about.com.txt | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 inc/3rdparty/site_config/standard/.about.com.txt (limited to 'inc/3rdparty/site_config') diff --git a/inc/3rdparty/site_config/standard/.about.com.txt b/inc/3rdparty/site_config/standard/.about.com.txt new file mode 100644 index 00000000..e1ebaee3 --- /dev/null +++ b/inc/3rdparty/site_config/standard/.about.com.txt @@ -0,0 +1,14 @@ +body: //div[@id='articlebody'] +title: //h1 +author: //p[@id='by']//a + +next_page_link: //span[@class='next']/a +# Not the same as below! 
+ +prune: yes +tidy: no + +# Annoying 'next' links plainly inside the article body +strip: //*[text()[contains(.,'Next: ')]] + +test_url: http://psychology.about.com/od/theoriesofpersonality/ss/defensemech.htm -- cgit v1.2.3 From 5594d7d05469bcff2a046a99d49990bd63a6fd4f Mon Sep 17 00:00:00 2001 From: Maryana Rozhankivska Date: Mon, 21 Jul 2014 19:34:59 +0300 Subject: issue #750 - config for dn.pt site added --- inc/3rdparty/site_config/standard/dn.pt.txt | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100755 inc/3rdparty/site_config/standard/dn.pt.txt (limited to 'inc/3rdparty/site_config') diff --git a/inc/3rdparty/site_config/standard/dn.pt.txt b/inc/3rdparty/site_config/standard/dn.pt.txt new file mode 100755 index 00000000..051b8cb9 --- /dev/null +++ b/inc/3rdparty/site_config/standard/dn.pt.txt @@ -0,0 +1,9 @@ +single_page_link: concat('http://www.dn.pt/Common/print.aspx?content_id=', //input[@type='hidden' and @name='link-comments']/@value) +# + +title: //h1 +author: //div[@class="Author"] + +strip: //div[@class="Patrocinio"] + +test_url: http://www.dn.pt/inicio/opiniao/interior.aspx?content_id=3972244&seccao=Alberto%20Gon%E7alves&tag=Opini%E3o%20-%20Em%20Foco&page=1 \ No newline at end of file -- cgit v1.2.3 From 0ce85e0a7fa873c69f1ec159bc188c6a58a2ad21 Mon Sep 17 00:00:00 2001 From: Maryana Rozhankivska Date: Wed, 23 Jul 2014 14:27:57 +0300 Subject: config for habrahabr.ru to grep articles with comments --- inc/3rdparty/site_config/standard/habrahabr.ru.txt | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100755 inc/3rdparty/site_config/standard/habrahabr.ru.txt (limited to 'inc/3rdparty/site_config') diff --git a/inc/3rdparty/site_config/standard/habrahabr.ru.txt b/inc/3rdparty/site_config/standard/habrahabr.ru.txt new file mode 100755 index 00000000..67538359 --- /dev/null +++ b/inc/3rdparty/site_config/standard/habrahabr.ru.txt @@ -0,0 +1,21 @@ +title: //span[@class="post_title"] +author: //div[@class="author"] +date: 
//div[@class="published"] + +body: //div[@class='content html_format'] | //div[@id='comments'] + +strip: //a[@class="link_to_comment"] +strip: //div[@class="show_tree"] +strip: //a[@class="to_parent"] + + +replace_string(class="reply_comments"): style="padding-left: 20px" +replace_string(class="voting "): style="float: right" +replace_string(src="//habrastorage.org/getpro/habr/avatars/): style="width:24px; height:24px;" class="123" src="//habrastorage.org/getpro/habr/avatars/ +replace_string(class="info "): style="padding-top:5px;font-size:0.85em;line-height:24px;" + + +prune: no +tidy: no + +test_url: http://habrahabr.ru/post/229883/ \ No newline at end of file -- cgit v1.2.3 From ecb8c1389c353fa1dead7e70b35d6140257c8830 Mon Sep 17 00:00:00 2001 From: zinnober Date: Sat, 23 Aug 2014 16:47:29 +0200 Subject: Complete rework of faz.net-template adding multipage support and major article cleanup --- inc/3rdparty/site_config/custom/blogs.faz.net.txt | 45 +++++++++ inc/3rdparty/site_config/standard/faz.net.txt | 117 +++++++++++++++++----- 2 files changed, 136 insertions(+), 26 deletions(-) create mode 100644 inc/3rdparty/site_config/custom/blogs.faz.net.txt mode change 100755 => 100644 inc/3rdparty/site_config/standard/faz.net.txt (limited to 'inc/3rdparty/site_config') diff --git a/inc/3rdparty/site_config/custom/blogs.faz.net.txt b/inc/3rdparty/site_config/custom/blogs.faz.net.txt new file mode 100644 index 00000000..4f2626f1 --- /dev/null +++ b/inc/3rdparty/site_config/custom/blogs.faz.net.txt @@ -0,0 +1,45 @@ +# Author: zinnober + +tidy: no +prune: no + +# Set author +author: //a[@rel='author'] + +# Set date +date: //span[@class='Datum'] + +# Content is here +body: //div[@class='Artikel'] + +# Tidy up before article +strip: //div[@id='FAZHeaderNeu'] +strip: //h2[@itemprop='headline'] +strip: //span[@class='Datum'] +strip: //span[@class='Autor'] +strip_id_or_class: ArticlePagerTop +strip: //div[@class='FAZArtikelEinleitung']/h2 + +# General cleanup +strip: 
//div[@class='clear'] +strip: //span[@class='Bildnachweis'] +strip: //iframe +strip_id_or_class: Community +strip: ' · ' + +# Remove tracking and ads +strip_image_src: /l.gif? +strip: //img[@width='1'] +strip_id_or_class: invisible +strip_id_or_class: Anzeige +strip_id_or_class: billboard + +# Remove clutter after article +strip_id_or_class: Tagline +strip_id_or_class: ArtikelAbbinder +strip_id_or_class: FAZArtikelKommentare +strip_id_or_class: ArtikelKommentieren +strip_id_or_class: FAZContentRight + +# Try it yourself +test_url: http://blogs.faz.net/wost/2014/08/17/viel-fuck-und-wenig-guter-sex-1239/ diff --git a/inc/3rdparty/site_config/standard/faz.net.txt b/inc/3rdparty/site_config/standard/faz.net.txt old mode 100755 new mode 100644 index d087d2aa..47048a1b --- a/inc/3rdparty/site_config/standard/faz.net.txt +++ b/inc/3rdparty/site_config/standard/faz.net.txt @@ -1,36 +1,101 @@ +# Author: zinnober +# Complete rewrite of the faz.net template as the standard one is broken +# I tried to consider as many page variants as possible, which was some serious work + +tidy: no +prune: no + # Title title: //p[@class='Content HeadlineShort'] -# Authors -# some are known and have a link, others don't -author: substring-after(//span[@class='Autor'], 'Von') +# Set author +author: substring-after(//span[@class='Autor'], 'von ') +author: //span[@class='caps last']/span[@class='caps last'] +author: //a[@rel='author'] -# Date +# Set date date: //span[@class='Datum'] +date: //span[@class='Datum']/span + +# Fetch full multipage articles +next_page_link: //a[@title='Nächste Seite'] -# Body +# Content is here body: //div[@class='Artikel'] -# Removements before body text -strip: //div[@class='Breadcrumbs'] -strip: //div[@class='QuickSearchBox'] -strip: //div[@class='FAZArtikelEinleitung'] -strip: //div[@class='FAZArtikelReiter'] +# Tidy up before article +strip: //div[@id='FAZHeaderNeu'] +strip: //h2[@itemprop='headline'] +strip: //span[@class='Datum'] +strip: //span[@class='Autor'] 
+strip_id_or_class: ArticlePagerTop + +# General cleanup strip: //div[@class='clear'] +strip: //a[@title='Zur Homepage FAZ.NET'] +strip: //iframe +replace_string( · ): + +# Remove tracking and ads +strip_image_src: /l.gif? +strip: //div[contains(@style, 'background-image')] +strip: //img[@width='1'] +strip_id_or_class: invisible +strip_id_or_class: Anzeige +strip_id_or_class: billboard + +# Remove various text boxes and social media foo +strip_id_or_class: WeitereBeitraege +strip_id_or_class: WBListe +strip_id_or_class: AutorenModul +strip_id_or_class: Community +strip_id_or_class: SocialMediaStatus +strip_id_or_class: RelatedLinkBox +strip_id_or_class: MultimediaNavigation +strip_id_or_class: IndexTitel + +# Fix picture caps and pictures (use better resolution and remove clutter) +strip_id_or_class: LightBoxOverlay +strip_id_or_class: exitLarge +strip_id_or_class: PagerBox +strip_id_or_class: Bildnachweis +strip_id_or_class: Bildueberschrift +strip_id_or_class: Bildbeschreibung +strip_id_or_class: ArtikelBild610 +strip_id_or_class: MediaLink +strip_id_or_class: FotoBoxInnerLeft +strip_id_or_class: BilderRelatedLinks + +# Remove clutter after article +strip_id_or_class: ArticlePagerBottom +strip_id_or_class: backToHome +strip_id_or_class: ArtikelAbbinder +strip_id_or_class: lesermeinungscontainer +strip_id_or_class: ThemenLinks +strip_id_or_class: rechtehinweis +strip_id_or_class: FAZArtikelMap +strip_id_or_class: FAZArtikelKommentare +strip_id_or_class: ArtikelKommentieren +strip_id_or_class: FAZArtikelFunktionen +strip_id_or_class: mailLB +strip_id_or_class: FAZContentRight +strip_id_or_class: stageModule +strip_id_or_class: ContentFooter +strip_id_or_class: ServicesFooter +strip_id_or_class: FAZFooter + +# Clean up stuff present just in some articles +strip_id_or_class: Teaser620 +strip_id_or_class: TeaserMultimedia +strip_id_or_class: VideoBox + +# Remove as soon as Wallabag might be able to embed flash video +strip_id_or_class: mmoObjectAsTeaserInArticle 
+strip_id_or_class: additionalStylesAudioVideo +strip_id_or_class: hideMMElements + +# Try it yourself +test_url: http://www.faz.net/aktuell/feuilleton/zum-tod-von-margaret-thatcher-die-reizfigur-12141919.html#Drucken +test_url: http://www.faz.net/aktuell/politik/inland/allensbach-analyse-im-namen-des-volkes-13106492.html +test_url: http://www.faz.net/aktuell/feuilleton/kino/video-filmkritiken/video-filmkritik-when-animals-dream-zerrissene-jugend-13105772.html -# General removements -strip: //span[@class='Bildnachweis'] -strip: //img[@class='MediaIcon'] -strip: //div[@class='ArtikelMediaLink'] -dissolve: //a[img] - -# Removements after body text -strip: //div[@class='ArtikelAbbinder'] -strip: //div[@class='ArtikelKommentieren Artikelfuss GETS;tk;boxen.top-lesermeinungen;tp;content'] -strip: //div[@class='FAZArtikelKommentare FAZArtikelContent'] -strip: //div[@class='FAZArtikelFunktionen'] -strip: //div[@id='FAZContentRight'] - -# Fix picture captions -wrap_in(small): //span[@class='Bildunterschrift']/text() -test_url: http://www.faz.net/aktuell/feuilleton/zum-tod-von-margaret-thatcher-die-reizfigur-12141919.html#Drucken \ No newline at end of file -- cgit v1.2.3