Merge lp:~tomek3d/calibre/kalibrator into lp:calibre

Proposed by Tomasz Długosz
Status: Merged
Merged at revision: 13675
Proposed branch: lp:~tomek3d/calibre/kalibrator
Merge into: lp:calibre
Diff against target: 296 lines (+244/-2)
5 files modified
.bzrignore (+42/-0)
recipes/autosport.recipe (+31/-0)
recipes/blognexto.recipe (+29/-0)
recipes/brewiarz.recipe (+141/-0)
recipes/dobreprogamy.recipe (+1/-2)
To merge this branch: bzr merge lp:~tomek3d/calibre/kalibrator
Reviewer        Review Type    Date Requested    Status
Kovid Goyal                                      Pending
Review via email: mp+133789@code.launchpad.net

Description of the change

Some more fixes and new recipes from the kalibrator project, plus .bzrignore entries for our generated TV-listing recipes.
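
For anyone who wants to try the new recipes before the merge lands, a recipe file can be fed straight to ebook-convert (a sketch, assuming a calibre checkout or install; pick any of the added recipe paths):

    ebook-convert recipes/autosport.recipe .epub --test -vv

The --test flag fetches only a couple of articles per feed, so it is quick enough for review.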

Preview Diff

=== modified file '.bzrignore'
--- .bzrignore 2012-11-07 11:57:11 +0000
+++ .bzrignore 2012-11-10 10:56:21 +0000
@@ -39,3 +39,45 @@
 recipes/.gitignore
 recipes/README
 recipes/katalog_egazeciarz.recipe
+recipes/tv_axnscifi.recipe
+recipes/tv_comedycentral.recipe
+recipes/tv_discoveryscience.recipe
+recipes/tv_foxlife.recipe
+recipes/tv_fox.recipe
+recipes/tv_hbo.recipe
+recipes/tv_kinopolska.recipe
+recipes/tv_nationalgeographic.recipe
+recipes/tv_polsat2.recipe
+recipes/tv_polsat.recipe
+recipes/tv_tv4.recipe
+recipes/tv_tvn7.recipe
+recipes/tv_tvn.recipe
+recipes/tv_tvp1.recipe
+recipes/tv_tvp2.recipe
+recipes/tv_tvphd.recipe
+recipes/tv_tvphistoria.recipe
+recipes/tv_tvpkultura.recipe
+recipes/tv_tvppolonia.recipe
+recipes/tv_tvpuls.recipe
+recipes/tv_viasathistory.recipe
+recipes/icons/tv_axnscifi.png
+recipes/icons/tv_comedycentral.png
+recipes/icons/tv_discoveryscience.png
+recipes/icons/tv_foxlife.png
+recipes/icons/tv_fox.png
+recipes/icons/tv_hbo.png
+recipes/icons/tv_kinopolska.png
+recipes/icons/tv_nationalgeographic.png
+recipes/icons/tv_polsat2.png
+recipes/icons/tv_polsat.png
+recipes/icons/tv_tv4.png
+recipes/icons/tv_tvn7.png
+recipes/icons/tv_tvn.png
+recipes/icons/tv_tvp1.png
+recipes/icons/tv_tvp2.png
+recipes/icons/tv_tvphd.png
+recipes/icons/tv_tvphistoria.png
+recipes/icons/tv_tvpkultura.png
+recipes/icons/tv_tvppolonia.png
+recipes/icons/tv_tvpuls.png
+recipes/icons/tv_viasathistory.png

=== added file 'recipes/autosport.recipe'
--- recipes/autosport.recipe 1970-01-01 00:00:00 +0000
+++ recipes/autosport.recipe 2012-11-10 10:56:21 +0000
@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__author__ = 'MrStefan <mrstefaan@gmail.com>'
+
+'''
+www.autosport.com
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+import re
+
+class autosport(BasicNewsRecipe):
+    title = u'Autosport'
+    __author__ = 'MrStefan <mrstefaan@gmail.com>'
+    language = 'en_GB'
+    description = u'Daily Formula 1 and motorsport news from the leading weekly motor racing magazine. The authority on Formula 1, F1, MotoGP, GP2, Champ Car, Le Mans...'
+    masthead_url = 'http://cdn.images.autosport.com/asdotcom.gif'
+    remove_empty_feeds = True
+    oldest_article = 1
+    max_articles_per_feed = 100
+    remove_javascript = True
+    no_stylesheets = True
+
+    keep_only_tags = []
+    keep_only_tags.append(dict(name='h1', attrs={'class': 'news_headline'}))
+    keep_only_tags.append(dict(name='td', attrs={'class': 'news_article_author'}))
+    keep_only_tags.append(dict(name='td', attrs={'class': 'news_article_date'}))
+    keep_only_tags.append(dict(name='p'))
+
+    feeds = [(u'ALL NEWS', u'http://www.autosport.com/rss/allnews.xml')]
\ No newline at end of file

=== added file 'recipes/blognexto.recipe'
--- recipes/blognexto.recipe 1970-01-01 00:00:00 +0000
+++ recipes/blognexto.recipe 2012-11-10 10:56:21 +0000
@@ -0,0 +1,29 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+import re
+
+class blognexto(BasicNewsRecipe):
+    title = 'BLOG.NEXTO.pl'
+    __author__ = 'MrStefan <mrstefaan@gmail.com>'
+    language = 'pl'
+    description = 'o e-publikacjach prawie wszystko'
+    masthead_url = 'http://blog.nexto.pl/wp-content/uploads/2012/04/logo-blog-nexto.pl_.jpg'
+    remove_empty_feeds = True
+    oldest_article = 7
+    max_articles_per_feed = 100
+    remove_javascript = True
+    no_stylesheets = True
+
+
+    keep_only_tags = []
+    keep_only_tags.append(dict(name='div', attrs={'id': 'content'}))
+
+    remove_tags = []
+    remove_tags.append(dict(name='div', attrs={'class': 'comment-cloud'}))
+    remove_tags.append(dict(name='p', attrs={'class': 'post-date1'}))
+    remove_tags.append(dict(name='div', attrs={'class': 'fb-like'}))
+    remove_tags.append(dict(name='div', attrs={'class': 'tags'}))
+    remove_tags.append(dict(name='div', attrs={'class': 'postnavi'}))
+    remove_tags.append(dict(name='div', attrs={'class': 'commments-box'}))
+    remove_tags.append(dict(name='div', attrs={'id': 'respond'}))
+
+    feeds = [('Artykuly', 'http://feeds.feedburner.com/blognexto')]

=== added file 'recipes/brewiarz.recipe'
--- recipes/brewiarz.recipe 1970-01-01 00:00:00 +0000
+++ recipes/brewiarz.recipe 2012-11-10 10:56:21 +0000
@@ -0,0 +1,141 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+import datetime
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, Tag
+
+
+class brewiarz(BasicNewsRecipe):
+    title = u'Brewiarz'
+    __author__ = 'Artur Stachecki <artur.stachecki@gmail.com>'
+    language = 'pl'
+    description = u'Serwis poświęcony Liturgii Godzin (brewiarzowi) - formie codziennej modlitwy Kościoła katolickiego.'
+    masthead_url = 'http://brewiarz.pl/images/logo2.gif'
+    max_articles_per_feed = 100
+    remove_javascript = True
+    no_stylesheets = True
+    publication_type = 'newspaper'
+    next_days = 1
+
+    def parse_index(self):
+        dec2rom_dict = {"01": "i", "02": "ii", "03": "iii", "04": "iv",
+                        "05": "v", "06": "vi", "07": "vii", "08": "viii",
+                        "09": "ix", "10": "x", "11": "xi", "12": "xii"}
+
+        weekday_dict = {"Sunday": "Niedziela", "Monday": "Poniedziałek", "Tuesday": "Wtorek",
+                        "Wednesday": "Środa", "Thursday": "Czwartek", "Friday": "Piątek", "Saturday": "Sobota"}
+
+        now = datetime.datetime.now()
+
+        feeds = []
+        for i in range(0, self.next_days):
+            url_date = now + datetime.timedelta(days=i)
+            url_date_month = url_date.strftime("%m")
+            url_date_month_roman = dec2rom_dict[url_date_month]
+            url_date_day = url_date.strftime("%d")
+            url_date_year = url_date.strftime("%Y")[2:]
+            url_date_weekday = url_date.strftime("%A")
+            url_date_weekday_pl = weekday_dict[url_date_weekday]
+
+            url = "http://brewiarz.pl/" + url_date_month_roman + "_" + url_date_year + "/" + url_date_day + url_date_month + "/index.php3"
+            articles = self.parse_pages(url)
+            if articles:
+                title = url_date_weekday_pl + " " + url_date_day + "." + url_date_month + "." + url_date_year
+                feeds.append((title, articles))
+            else:
+                sectors = self.get_sectors(url)
+                for subpage in sectors:
+                    title = url_date_weekday_pl + " " + url_date_day + "." + url_date_month + "." + url_date_year + " - " + subpage.string
+                    url = "http://brewiarz.pl/" + url_date_month_roman + "_" + url_date_year + "/" + url_date_day + url_date_month + "/" + subpage['href']
+                    print(url)
+                    articles = self.parse_pages(url)
+                    if articles:
+                        feeds.append((title, articles))
+        return feeds
+
+    def get_sectors(self, url):
+        sectors = []
+        soup = self.index_to_soup(url)
+        sectors_table = soup.find(name='table', attrs={'width': '490'})
+        sector_links = sectors_table.findAll(name='a')
+        for sector_links_modified in sector_links:
+            link_parent_text = sector_links_modified.findParent(name='div').text
+            if link_parent_text:
+                sector_links_modified.text = link_parent_text
+            sectors.append(sector_links_modified)
+        return sectors
+
+    def parse_pages(self, url):
+        current_articles = []
+        soup = self.index_to_soup(url)
+        www = soup.find(attrs={'class': 'www'})
+        if www:
+            box_title = www.find(text='Teksty LG')
+            article_box_parent = box_title.findParent('ul')
+            article_box_sibling = article_box_parent.findNextSibling('ul')
+            for li in article_box_sibling.findAll('li'):
+                link = li.find(name='a')
+                ol = link.findNextSibling(name='ol')
+                if ol:
+                    sublinks = ol.findAll(name='a')
+                    for sublink in sublinks:
+                        link_title = self.tag_to_string(link) + " - " + self.tag_to_string(sublink)
+                        link_url_print = sublink['href'].replace('php3', 'php3?kr=_druk&wr=lg&')
+                        link_url = url[:-10] + link_url_print
+                        current_articles.append({'title': link_title,
+                                                 'url': link_url, 'description': '', 'date': ''})
+                else:
+                    if link.findParent(name='ol'):
+                        continue
+                    else:
+                        link_title = self.tag_to_string(link)
+                        link_url_print = link['href'].replace('php3', 'php3?kr=_druk&wr=lg&')
+                        link_url = url[:-10] + link_url_print
+                        current_articles.append({'title': link_title,
+                                                 'url': link_url, 'description': '', 'date': ''})
+            return current_articles
+        else:
+            return None
+
+    def preprocess_html(self, soup):
+        footer = soup.find(name='a', attrs={'href': 'http://brewiarz.pl'})
+        footer_parent = footer.findParent('div')
+        footer_parent.extract()
+
+        header = soup.find(text='http://brewiarz.pl')
+        header_parent = header.findParent('div')
+        header_parent.extract()
+
+        subheader = soup.find(text='Kolor szat:').findParent('div')
+        subheader.extract()
+
+        color = soup.find('b')
+        color.extract()
+
+        cleaned = self.strip_tags(soup)
+
+        div = cleaned.findAll(name='div')
+        div[1].extract()
+        div[2].extract()
+        div[3].extract()
+
+        return cleaned
+
+    def strip_tags(self, soup_dirty):
+        VALID_TAGS = ['p', 'div', 'br', 'b', 'a', 'title', 'head', 'html', 'body']
+
+        for tag in soup_dirty.findAll(True):
+            if tag.name not in VALID_TAGS:
+                for i, x in enumerate(tag.parent.contents):
+                    if x == tag:
+                        break
+                else:
+                    print "Can't find", tag, "in", tag.parent
+                    continue
+                for r in reversed(tag.contents):
+                    tag.parent.insert(i, r)
+                tag.extract()
+
+        return soup_dirty

=== modified file 'recipes/dobreprogamy.recipe'
--- recipes/dobreprogamy.recipe 2012-10-17 14:12:08 +0000
+++ recipes/dobreprogamy.recipe 2012-11-10 10:56:21 +0000
@@ -6,7 +6,6 @@
     __author__ = 'fenuks'
     __licence__ ='GPL v3'
     category = 'IT'
-    language = 'pl'
     masthead_url='http://static.dpcdn.pl/css/Black/Images/header_logo_napis_fullVersion.png'
     cover_url = 'http://userlogos.org/files/logos/Karmody/dobreprogramy_01.png'
     description = u'Aktualności i blogi z dobreprogramy.pl'
@@ -29,4 +28,4 @@
         for a in soup('a'):
             if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
                 a['href']=self.index + a['href']
-        return soup
\ No newline at end of file
+        return soup

=== added file 'recipes/icons/autosport.png'
Binary files recipes/icons/autosport.png 1970-01-01 00:00:00 +0000 and recipes/icons/autosport.png 2012-11-10 10:56:21 +0000 differ
=== added file 'recipes/icons/blognexto.png'
Binary files recipes/icons/blognexto.png 1970-01-01 00:00:00 +0000 and recipes/icons/blognexto.png 2012-11-10 10:56:21 +0000 differ
=== added file 'recipes/icons/brewiarz.png'
Binary files recipes/icons/brewiarz.png 1970-01-01 00:00:00 +0000 and recipes/icons/brewiarz.png 2012-11-10 10:56:21 +0000 differ
=== added file 'recipes/icons/naszdziennik.png'
Binary files recipes/icons/naszdziennik.png 1970-01-01 00:00:00 +0000 and recipes/icons/naszdziennik.png 2012-11-10 10:56:21 +0000 differ
=== added file 'recipes/icons/wprost.png'
Binary files recipes/icons/wprost.png 1970-01-01 00:00:00 +0000 and recipes/icons/wprost.png 2012-11-10 10:56:21 +0000 differ
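
A note on the brewiarz recipe above: parse_index derives one index URL per day, mapping the month to a lowercase Roman numeral and concatenating the two-digit day and month. A worked example, traced from the code above (illustration only, not part of the diff):

    # 10 November 2012: month "11" -> "xi", year "2012" -> "12",
    # day + month -> "1011", so the recipe requests:
    url = "http://brewiarz.pl/xi_12/1011/index.php3"

When that page does not yield the 'Teksty LG' listing, the recipe falls back to the per-sector subpages returned by get_sectors().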
