Merge lp:~tomek3d/calibre/kalibrator into lp:calibre

Proposed by Tomasz Długosz
Status: Merged
Merged at revision: 13678
Proposed branch: lp:~tomek3d/calibre/kalibrator
Merge into: lp:calibre
Diff against target: 234 lines (+204/-1)
3 files modified
recipes/gazeta_pl_krakow.recipe (+103/-0)
recipes/gazeta_pl_warszawa.recipe (+100/-0)
recipes/gazeta_wyborcza.recipe (+1/-1)
To merge this branch: bzr merge lp:~tomek3d/calibre/kalibrator
Reviewer    | Review Type | Date Requested | Status
Kovid Goyal |             |                | Pending
Review via email: mp+133821@code.launchpad.net
To post a comment you must log in.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== added file 'recipes/gazeta_pl_krakow.recipe'
2--- recipes/gazeta_pl_krakow.recipe 1970-01-01 00:00:00 +0000
3+++ recipes/gazeta_pl_krakow.recipe 2012-11-11 12:13:22 +0000
4@@ -0,0 +1,103 @@
5+#!/usr/bin/env python
6+
7+__license__ = 'GPL v3'
8+__copyright__ = 'teepel <teepel44@gmail.com> based on GW from fenuks'
9+
10+'''
11+krakow.gazeta.pl
12+'''
13+
14+from calibre.web.feeds.news import BasicNewsRecipe
15+import re
16+
17+class gw_krakow(BasicNewsRecipe):
18+ title = u'Gazeta.pl Kraków'
19+ __author__ = 'teepel <teepel44@gmail.com> based on GW from fenuks'
20+ language = 'pl'
21+ description =u'Wiadomości z Krakowa na portalu Gazeta.pl.'
22+ category='newspaper'
23+ publication_type = 'newspaper'
24+ masthead_url='http://bi.gazeta.pl/im/5/8528/m8528105.gif'
25+ INDEX='http://krakow.gazeta.pl/'
26+ remove_empty_feeds= True
27+ oldest_article = 1
28+ max_articles_per_feed = 100
29+ remove_javascript=True
30+ no_stylesheets=True
31+
32+ keep_only_tags =[]
33+ keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article'}))
34+
35+ remove_tags =[]
36+ remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_likes'}))
37+ remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_tools'}))
38+ remove_tags.append(dict(name = 'div', attrs = {'id' : 'rel'}))
39+ remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_share'}))
40+ remove_tags.append(dict(name = 'u1', attrs = {'id' : 'articleToolbar'}))
41+ remove_tags.append(dict(name = 'li', attrs = {'class' : 'atComments'}))
42+ remove_tags.append(dict(name = 'li', attrs = {'class' : 'atLicense'}))
43+ remove_tags.append(dict(name = 'div', attrs = {'id' : 'banP4'}))
44+ remove_tags.append(dict(name = 'div', attrs = {'id' : 'article_toolbar'}))
45+ remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_tags'}))
46+ remove_tags.append(dict(name = 'p', attrs = {'class' : 'txt_upl'}))
47+ remove_tags.append(dict(name = 'div', attrs = {'class' : 'gazeta_article_related_new'}))
48+ remove_tags.append(dict(name = 'div', attrs = {'class' : 'gazetaVideoPlayer'}))
49+ remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_miniatures'}))
50+ remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_buttons'}))
51+
52+ remove_tags_after = [dict(name = 'div', attrs = {'id' : 'gazeta_article_share'})]
53+
54+ feeds = [(u'Wiadomości', u'http://rss.gazeta.pl/pub/rss/krakow.xml')]
55+
56+ def skip_ad_pages(self, soup):
57+ tag=soup.find(name='a', attrs={'class':'btn'})
58+ if tag:
59+ new_soup=self.index_to_soup(tag['href'], raw=True)
60+ return new_soup
61+
62+
63+ def append_page(self, soup, appendtag):
64+ loop=False
65+ tag = soup.find('div', attrs={'id':'Str'})
66+ if appendtag.find('div', attrs={'id':'Str'}):
67+ nexturl=tag.findAll('a')
68+ appendtag.find('div', attrs={'id':'Str'}).extract()
69+ loop=True
70+ if appendtag.find(id='source'):
71+ appendtag.find(id='source').extract()
72+ while loop:
73+ loop=False
74+ for link in nexturl:
75+ if u'następne' in link.string:
76+ url= self.INDEX + link['href']
77+ soup2 = self.index_to_soup(url)
78+ pagetext = soup2.find(id='artykul')
79+ pos = len(appendtag.contents)
80+ appendtag.insert(pos, pagetext)
81+ tag = soup2.find('div', attrs={'id':'Str'})
82+ nexturl=tag.findAll('a')
83+ loop=True
84+
85+ def gallery_article(self, appendtag):
86+ tag=appendtag.find(id='container_gal')
87+ if tag:
88+ nexturl=appendtag.find(id='gal_btn_next').a['href']
89+ appendtag.find(id='gal_navi').extract()
90+ while nexturl:
91+ soup2=self.index_to_soup(nexturl)
92+ pagetext=soup2.find(id='container_gal')
93+ nexturl=pagetext.find(id='gal_btn_next')
94+ if nexturl:
95+ nexturl=nexturl.a['href']
96+ pos = len(appendtag.contents)
97+ appendtag.insert(pos, pagetext)
98+ rem=appendtag.find(id='gal_navi')
99+ if rem:
100+ rem.extract()
101+
102+ def preprocess_html(self, soup):
103+ self.append_page(soup, soup.body)
104+ if soup.find(id='container_gal'):
105+ self.gallery_article(soup.body)
106+ return soup
107+
108
109=== added file 'recipes/gazeta_pl_warszawa.recipe'
110--- recipes/gazeta_pl_warszawa.recipe 1970-01-01 00:00:00 +0000
111+++ recipes/gazeta_pl_warszawa.recipe 2012-11-11 12:13:22 +0000
112@@ -0,0 +1,100 @@
113+#!/usr/bin/env python
114+
115+__license__ = 'GPL v3'
116+__author__ = 'teepel <teepel44@gmail.com> based on GW from fenuks'
117+
118+'''
119+warszawa.gazeta.pl
120+'''
121+
122+from calibre.web.feeds.news import BasicNewsRecipe
123+import re
124+
125+class gw_wawa(BasicNewsRecipe):
126+ title = u'Gazeta.pl Warszawa'
127+ __author__ = 'teepel <teepel44@gmail.com> based on GW from fenuks'
128+ language = 'pl'
129+ description ='Wiadomości z Warszawy na portalu Gazeta.pl.'
130+ category='newspaper'
131+ publication_type = 'newspaper'
132+ masthead_url='http://bi.gazeta.pl/im/3/4089/m4089863.gif'
133+ INDEX='http://warszawa.gazeta.pl/'
134+ remove_empty_feeds= True
135+ oldest_article = 1
136+ max_articles_per_feed = 100
137+ remove_javascript=True
138+ no_stylesheets=True
139+
140+ keep_only_tags =[]
141+ keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article'}))
142+
143+ remove_tags =[]
144+ remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_likes'}))
145+ remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_tools'}))
146+ remove_tags.append(dict(name = 'div', attrs = {'id' : 'rel'}))
147+ remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_share'}))
148+ remove_tags.append(dict(name = 'u1', attrs = {'id' : 'articleToolbar'}))
149+ remove_tags.append(dict(name = 'li', attrs = {'class' : 'atComments'}))
150+ remove_tags.append(dict(name = 'li', attrs = {'class' : 'atLicense'}))
151+ remove_tags.append(dict(name = 'div', attrs = {'id' : 'banP4'}))
152+ remove_tags.append(dict(name = 'div', attrs = {'id' : 'article_toolbar'}))
153+ remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_tags'}))
154+ remove_tags.append(dict(name = 'p', attrs = {'class' : 'txt_upl'}))
155+ remove_tags.append(dict(name = 'div', attrs = {'class' : 'gazeta_article_related_new'}))
156+ remove_tags.append(dict(name = 'div', attrs = {'class' : 'gazetaVideoPlayer'}))
157+ remove_tags.append(dict(name = 'div', attrs = {'id' : 'gazeta_article_miniatures'}))
158+
159+ feeds = [(u'Wiadomości', u'http://rss.gazeta.pl/pub/rss/warszawa.xml')]
160+
161+ def skip_ad_pages(self, soup):
162+ tag=soup.find(name='a', attrs={'class':'btn'})
163+ if tag:
164+ new_soup=self.index_to_soup(tag['href'], raw=True)
165+ return new_soup
166+
167+
168+ def append_page(self, soup, appendtag):
169+ loop=False
170+ tag = soup.find('div', attrs={'id':'Str'})
171+ if appendtag.find('div', attrs={'id':'Str'}):
172+ nexturl=tag.findAll('a')
173+ appendtag.find('div', attrs={'id':'Str'}).extract()
174+ loop=True
175+ if appendtag.find(id='source'):
176+ appendtag.find(id='source').extract()
177+ while loop:
178+ loop=False
179+ for link in nexturl:
180+ if u'następne' in link.string:
181+ url= self.INDEX + link['href']
182+ soup2 = self.index_to_soup(url)
183+ pagetext = soup2.find(id='artykul')
184+ pos = len(appendtag.contents)
185+ appendtag.insert(pos, pagetext)
186+ tag = soup2.find('div', attrs={'id':'Str'})
187+ nexturl=tag.findAll('a')
188+ loop=True
189+
190+ def gallery_article(self, appendtag):
191+ tag=appendtag.find(id='container_gal')
192+ if tag:
193+ nexturl=appendtag.find(id='gal_btn_next').a['href']
194+ appendtag.find(id='gal_navi').extract()
195+ while nexturl:
196+ soup2=self.index_to_soup(nexturl)
197+ pagetext=soup2.find(id='container_gal')
198+ nexturl=pagetext.find(id='gal_btn_next')
199+ if nexturl:
200+ nexturl=nexturl.a['href']
201+ pos = len(appendtag.contents)
202+ appendtag.insert(pos, pagetext)
203+ rem=appendtag.find(id='gal_navi')
204+ if rem:
205+ rem.extract()
206+
207+ def preprocess_html(self, soup):
208+ self.append_page(soup, soup.body)
209+ if soup.find(id='container_gal'):
210+ self.gallery_article(soup.body)
211+ return soup
212+
213
214=== modified file 'recipes/gazeta_wyborcza.recipe'
215--- recipes/gazeta_wyborcza.recipe 2012-10-26 19:29:22 +0000
216+++ recipes/gazeta_wyborcza.recipe 2012-11-11 12:13:22 +0000
217@@ -3,7 +3,7 @@
218
219
220 class Gazeta_Wyborcza(BasicNewsRecipe):
221- title = u'Gazeta Wyborcza'
222+ title = u'Gazeta.pl'
223 __author__ = 'fenuks, Artur Stachecki'
224 language = 'pl'
225 description = 'news from gazeta.pl'
226
227=== added file 'recipes/icons/gazeta_pl_krakow.png'
228Binary files recipes/icons/gazeta_pl_krakow.png 1970-01-01 00:00:00 +0000 and recipes/icons/gazeta_pl_krakow.png 2012-11-11 12:13:22 +0000 differ
229=== added file 'recipes/icons/gazeta_pl_szczecin.png'
230Binary files recipes/icons/gazeta_pl_szczecin.png 1970-01-01 00:00:00 +0000 and recipes/icons/gazeta_pl_szczecin.png 2012-11-11 12:13:22 +0000 differ
231=== added file 'recipes/icons/gazeta_pl_warszawa.png'
232Binary files recipes/icons/gazeta_pl_warszawa.png 1970-01-01 00:00:00 +0000 and recipes/icons/gazeta_pl_warszawa.png 2012-11-11 12:13:22 +0000 differ
233=== modified file 'recipes/icons/gazeta_wyborcza.png'
234Binary files recipes/icons/gazeta_wyborcza.png 2011-10-02 14:50:47 +0000 and recipes/icons/gazeta_wyborcza.png 2012-11-11 12:13:22 +0000 differ

Subscribers

People subscribed via source and target branches