calibre

Overview
Code
Bugs
Blueprints
Translations
Answers

Merge lp:~tomek3d/calibre/kalibrator into lp:calibre

kalibrator
Merge into trunk

Proposed by Tomasz Długosz on 2012-11-18

Status:	Merged
Merged at revision:	13703
Proposed branch:	lp:~tomek3d/calibre/kalibrator
Merge into:	lp:calibre
Diff against target:	288 lines (+252/-0) 5 files modified recipes/antyweb.recipe (+49/-0) recipes/bankier_pl.recipe (+51/-0) recipes/f1_ultra.recipe (+35/-0) recipes/myapple_pl.recipe (+50/-0) recipes/telepolis_pl.recipe (+67/-0)
To merge this branch:	bzr merge lp:~tomek3d/calibre/kalibrator
Related bugs:	Link a bug report

Reviewer	Review Type	Date Requested	Status
Kovid Goyal		2012-11-18	Pending
Review via email: mp+134812@code.launchpad.net

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk

Download diff
Side-by-side diff

Subscribers

People subscribed via source and target branches

to all changes:

Ali Baba

Kovid Goyal

Pankaj

Timothy Legge

Tomasz Długosz

gstoychev

calibre

Merge lp:~tomek3d/calibre/kalibrator into lp:calibre

Commit message

Description of the change

Preview Diff

Subscribers

 === added file 'recipes/antyweb.recipe'
 --- recipes/antyweb.recipe	1970-01-01 00:00:00 +0000
 +++ recipes/antyweb.recipe	2012-11-18 17:55:22 +0000
@@ -0,0 +1,49 @@
++import re
++
++from calibre.web.feeds.news import BasicNewsRecipe
++
++class AntywebRecipe(BasicNewsRecipe):
++    """Calibre news recipe for Antyweb, a Polish blog about the internet and new technologies."""
++    encoding = 'utf-8'
++    __license__ = 'GPL v3'
++    __author__ = u'Artur Stachecki <artur.stachecki@gmail.com>'
++    language = 'pl'
++    version = 1
++    title = u'Antyweb'
++    category = u'News'
++    description = u'Blog o internecie i nowych technologiach'
++    cover_url=''
++    remove_empty_feeds= True
++    auto_cleanup = False
++    no_stylesheets=True
++    use_embedded_content = False
++    oldest_article = 1
++    max_articles_per_feed = 100
++    remove_javascript = True
++    simultaneous_downloads = 3
++
++    # Only the article title (h1) and the article content (div) are kept.
++    keep_only_tags =[]
++    keep_only_tags.append(dict(name = 'h1', attrs = { 'class' : 'mm-article-title'}))
++    keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'mm-article-content'}))
++
++    remove_tags =[]
++    remove_tags.append(dict(name = 'h2', attrs = {'class' : 'widgettitle'}))
++    remove_tags.append(dict(name = 'img', attrs = {'class' : 'alignleft'}))
++    # NOTE(review): the value below is inline CSS, so 'style' was presumably the intended attribute -- confirm against the site.
++    remove_tags.append(dict(name = 'div', attrs = {'class' : 'float: right;margin-left:1em;margin-bottom: 0.5em;padding-bottom: 3px; width: 72px;'}))
++    remove_tags.append(dict(name = 'img', attrs = {'src' : 'http://antyweb.pl/wp-content/uploads/2011/09/HOSTERSI_testy_pasek600x30.gif'}))
++    remove_tags.append(dict(name = 'div', attrs = {'class' : 'podwpisowe'}))
++
++    extra_css = '''
++                    body {font-family: verdana, arial, helvetica, geneva, sans-serif ;}
++                       '''
++
++    feeds = [(u'Artykuly', u'feed://feeds.feedburner.com/Antyweb?format=xml')]
++
++    def preprocess_html(self, soup):
++        """Replace each <a> whose ``string`` is not None with that string, then return ``soup``."""
++        for alink in soup.findAll('a'):
++            if alink.string is not None:
++                alink.replaceWith(alink.string)
++        return soup
 === added file 'recipes/bankier_pl.recipe'
 --- recipes/bankier_pl.recipe	1970-01-01 00:00:00 +0000
 +++ recipes/bankier_pl.recipe	2012-11-18 17:55:22 +0000
@@ -0,0 +1,51 @@
++#!/usr/bin/env  python
++
++__license__ = 'GPL v3'
++__author__ = 'teepel <teepel44@gmail.com>'
++
++'''
++bankier.pl
++'''
++
++from calibre.web.feeds.news import BasicNewsRecipe
++import re
++
++class bankier(BasicNewsRecipe):
++    """Calibre news recipe for Bankier.pl, a Polish financial portal."""
++    title          = u'Bankier.pl'
++    __author__ = 'teepel <teepel44@gmail.com>'
++    language       = 'pl'
++    description ='Polski portal finansowy. Informacje o: gospodarka, inwestowanie, finanse osobiste, prowadzenie firmy, kursy walut, notowania akcji, fundusze.'
++    masthead_url='http://www.bankier.pl/gfx/hd-mid-02.gif'
++    INDEX='http://bankier.pl/'
++    remove_empty_feeds= True
++    oldest_article = 1
++    max_articles_per_feed = 100
++    remove_javascript=True
++    no_stylesheets=True
++    simultaneous_downloads = 5
++
++    keep_only_tags =[]
++    keep_only_tags.append(dict(name = 'div', attrs = {'align' : 'left'}))
++
++    remove_tags =[]
++    remove_tags.append(dict(name = 'table', attrs = {'cellspacing' : '2'}))
++    remove_tags.append(dict(name = 'div', attrs = {'align' : 'center'}))
++    remove_tags.append(dict(name = 'img', attrs = {'src' : '/gfx/hd-mid-02.gif'}))
++
++    feeds          = [
++                (u'Wiadomości dnia', u'http://feeds.feedburner.com/bankier-wiadomosci-dnia'),
++                (u'Finanse osobiste', u'http://feeds.feedburner.com/bankier-finanse-osobiste'),
++                (u'Firma', u'http://feeds.feedburner.com/bankier-firma'),
++                (u'Giełda', u'http://feeds.feedburner.com/bankier-gielda'),
++                (u'Rynek walutowy', u'http://feeds.feedburner.com/bankier-rynek-walutowy'),
++                (u'Komunikaty ze spółek', u'http://feeds.feedburner.com/bankier-espi'),
++             ]
++
++    def print_version(self, url):
++        """Return the print-view URL for ``url``; the article id is the text after the last '-' in the page name."""
++        # Use the last path component (query, fragment and extension dropped) instead of a fixed '.'-split position.
++        page = url.split('?', 1)[0].split('#', 1)[0].rsplit('/', 1)[-1].rsplit('.', 1)[0]
++        article_id = page.rsplit('-', 1)[-1]
++        return 'http://www.bankier.pl/wiadomosci/print.html?article_id=' + article_id
++
 \ No newline at end of file
 === added file 'recipes/f1_ultra.recipe'
 --- recipes/f1_ultra.recipe	1970-01-01 00:00:00 +0000
 +++ recipes/f1_ultra.recipe	2012-11-18 17:55:22 +0000
@@ -0,0 +1,35 @@
++from calibre.web.feeds.news import BasicNewsRecipe
++import re
++
++class f1ultra(BasicNewsRecipe):
++    """Calibre news recipe for F1 Ultra, a Polish site covering Formula 1 and other racing series."""
++    title = u'Formuła 1 - F1 ultra'
++    __license__ = 'GPL v3'
++    __author__ = 'MrStefan <mrstefaan@gmail.com>, Artur Stachecki <artur.stachecki@gmail.com>'
++    language = 'pl'
++    description =u'Formuła 1, Robert Kubica, F3, GP2 oraz inne serie wyścigowe.'
++    masthead_url='http://www.f1ultra.pl/templates/f1ultra/images/logo.gif'
++    remove_empty_feeds= True
++    oldest_article = 1
++    max_articles_per_feed = 100
++    remove_javascript=True
++    no_stylesheets=True
++
++    keep_only_tags =[(dict(name = 'div', attrs = {'id' : 'main'}))]
++    remove_tags_after =[dict(attrs = {'style' : 'margin-top:5px;margin-bottom:5px;display: inline;'})]
++    remove_tags =[(dict(attrs = {'class' : ['buttonheading', 'avPlayerContainer', 'createdate']}))]
++    remove_tags.append(dict(attrs = {'title' : ['PDF', 'Drukuj', 'Email']}))
++    remove_tags.append(dict(name = 'form', attrs = {'method' : 'post'}))
++    remove_tags.append(dict(name = 'hr', attrs = {'size' : '2'}))
++
++    preprocess_regexps = [(re.compile(r'align="left"'), lambda match: ''),
++                          (re.compile(r'align="right"'), lambda match: ''),
++                          (re.compile(r'width="[^"]*"'), lambda match: ''),  # was r'width=\"*\"', which stripped only 'width="'
++                          (re.compile(r'\<table .*?\>'), lambda match: '')]
++
++    extra_css = '''.contentheading { font-size: 1.4em; font-weight: bold; }
++	           img { display: block; clear: both;}
++	        '''
++    remove_attributes = ['width','height','position','float','padding-left','padding-right','padding','text-align']
++
++    feeds = [(u'F1 Ultra', u'http://www.f1ultra.pl/index.php?option=com_rd_rss&id=1&Itemid=245')]
 === added file 'recipes/icons/antyweb.png'
 Binary files recipes/icons/antyweb.png	1970-01-01 00:00:00 +0000 and recipes/icons/antyweb.png	2012-11-18 17:55:22 +0000 differ
 === added file 'recipes/icons/bankier_pl.png'
 Binary files recipes/icons/bankier_pl.png	1970-01-01 00:00:00 +0000 and recipes/icons/bankier_pl.png	2012-11-18 17:55:22 +0000 differ
 === added file 'recipes/icons/f1_ultra.png'
 Binary files recipes/icons/f1_ultra.png	1970-01-01 00:00:00 +0000 and recipes/icons/f1_ultra.png	2012-11-18 17:55:22 +0000 differ
 === added file 'recipes/icons/myapple_pl.png'
 Binary files recipes/icons/myapple_pl.png	1970-01-01 00:00:00 +0000 and recipes/icons/myapple_pl.png	2012-11-18 17:55:22 +0000 differ
 === added file 'recipes/icons/telepolis_pl.png'
 Binary files recipes/icons/telepolis_pl.png	1970-01-01 00:00:00 +0000 and recipes/icons/telepolis_pl.png	2012-11-18 17:55:22 +0000 differ
 === added file 'recipes/myapple_pl.recipe'
 --- recipes/myapple_pl.recipe	1970-01-01 00:00:00 +0000
 +++ recipes/myapple_pl.recipe	2012-11-18 17:55:22 +0000
@@ -0,0 +1,50 @@
++import re
++
++from calibre.web.feeds.news import BasicNewsRecipe
++
++class MyAppleRecipe(BasicNewsRecipe):
++    """Calibre news recipe for MyApple.pl, a Polish site about Apple and its products."""
++    __license__ = 'GPL v3'
++    __author__ = u'Artur Stachecki <artur.stachecki@gmail.com>'
++    language = 'pl'
++    version = 1
++
++    title = u'MyApple.pl'
++    category = u'News'
++    description = u' Największy w Polsce serwis zajmujący się tematyką związaną z Apple i wszelkimi produktami tej firmy.'
++    cover_url = ''
++    remove_empty_feeds = True
++    no_stylesheets = True
++    remove_javascript = True
++    oldest_article = 7
++    max_articles_per_feed = 100000
++    recursions = 0
++    simultaneous_downloads = 3
++
++    # The article body is the only part of the page that is kept.
++    keep_only_tags = [{'name': 'div', 'attrs': {'id': 'article_content'}}]
++
++    # Divs stripped from the kept content, selected by class or id.
++    remove_tags = [
++        {'name': 'div', 'attrs': {'class': 'article_author_date_comment_container'}},
++        {'name': 'div', 'attrs': {'class': 'fullwidth'}},
++        {'name': 'div', 'attrs': {'class': 'cmslinks'}},
++        {'name': 'div', 'attrs': {'class': 'googleads-468'}},
++        {'name': 'div', 'attrs': {'id': 'comments'}},
++    ]
++
++    extra_css = '''
++                    body {font-family: verdana, arial, helvetica, geneva, sans-serif ;}
++                    td.contentheading{font-size: large; font-weight: bold;}
++                    '''
++
++    feeds = [('News', 'feed://myapple.pl/external.php?do=rss&type=newcontent&sectionid=1&days=120&count=10')]
++
++    def preprocess_html(self, soup):
++        """Swap every <a> whose ``string`` is not None for that string and hand back ``soup``."""
++        for anchor in soup.findAll('a'):
++            anchor_text = anchor.string
++            if anchor_text is None:
++                continue
++            anchor.replaceWith(anchor_text)
++        return soup
 \ No newline at end of file
 === added file 'recipes/telepolis_pl.recipe'
 --- recipes/telepolis_pl.recipe	1970-01-01 00:00:00 +0000
 +++ recipes/telepolis_pl.recipe	2012-11-18 17:55:22 +0000
@@ -0,0 +1,67 @@
++#!/usr/bin/env  python
++
++__license__ = 'GPL v3'
++
++from calibre.web.feeds.news import BasicNewsRecipe
++import re
++
++
++class telepolis(BasicNewsRecipe):
++    """Calibre news recipe for Telepolis.pl, a Polish telecommunications news service."""
++    title = u'Telepolis.pl'
++    __author__ = 'Artur Stachecki <artur.stachecki@gmail.com>'
++    language = 'pl'
++    description = (u'Twój telekomunikacyjny serwis informacyjny. '
++                   u'Codzienne informacje, testy i artykuły, '
++                   u'promocje, baza telefonów oraz centrum rozrywki')
++    oldest_article = 7
++    masthead_url = 'http://telepolis.pl/i/telepolis-logo2.gif'
++    max_articles_per_feed = 100
++    simultaneous_downloads = 5
++    remove_javascript = True
++    no_stylesheets = True
++    use_embedded_content = False
++
++    remove_tags = []
++    remove_tags.append(dict(attrs={'alt': 'TELEPOLIS.pl'}))
++
++    # (pattern, replacement-callable) pairs; every callable here returns ''.
++    preprocess_regexps = [
++        (re.compile(r'<: .*? :>'), lambda match: ''),
++        (re.compile(r'<b>Zobacz:</b>.*?</a>', re.DOTALL), lambda match: ''),
++        (re.compile(r'<-ankieta.*?>'), lambda match: ''),
++        (re.compile(r'\(Q\!\)'), lambda match: ''),
++        (re.compile(r'\(plik.*?\)'), lambda match: ''),
++        (re.compile(r'<br.*?><br.*?>', re.DOTALL), lambda match: ''),
++    ]
++
++    extra_css = '''.tb { font-weight: bold; font-size: 20px;}'''
++
++    feeds = [
++        (u'Wiadomości', u'http://www.telepolis.pl/rss/news.php'),
++        (u'Artykuły', u'http://www.telepolis.pl/rss/artykuly.php')
++    ]
++
++    def print_version(self, url):
++        """Return ``url`` with news.php (or otherwise artykuly.php) swapped for its *_print.php counterpart."""
++        if 'news.php' in url:
++            print_url = url.replace('news.php', 'news_print.php')
++        else:
++            print_url = url.replace('artykuly.php', 'art_print.php')
++        return print_url
++
++    def preprocess_html(self, soup):
++        """Rewrite 'm.jpg' image URLs to '.jpg', drop the first <tr> and footer rows, then adeify the images."""
++        for image in soup.findAll('img'):
++            src = image.get('src')  # an <img> without src used to raise KeyError here
++            if src and 'm.jpg' in src:
++                image['src'] = src.replace('m.jpg', '.jpg')
++        logo = soup.find('tr')
++        if logo is not None:  # find() yields None when the page has no <tr>
++            logo.extract()
++        footers = (u'Wiadomość wydrukowana', u'copyright')
++        for tag in soup.findAll('tr'):
++            text = self.tag_to_string(tag)
++            if any(marker in text for marker in footers):
++                tag.extract()  # extracted once, even when both markers match
++        return self.adeify_images(soup)