Merge lp:~tomek3d/calibre/kalibrator into lp:calibre
- kalibrator
- Merge into trunk
Proposed by
Tomasz Długosz
Status: | Merged |
---|---|
Merged at revision: | 13675 |
Proposed branch: | lp:~tomek3d/calibre/kalibrator |
Merge into: | lp:calibre |
Diff against target: |
296 lines (+244/-2) 5 files modified
.bzrignore (+42/-0) recipes/autosport.recipe (+31/-0) recipes/blognexto.recipe (+29/-0) recipes/brewiarz.recipe (+141/-0) recipes/dobreprogamy.recipe (+1/-2) |
To merge this branch: | bzr merge lp:~tomek3d/calibre/kalibrator |
Related bugs: |
Reviewer | Review Type | Date Requested | Status |
---|---|---|---|
Kovid Goyal | Pending | ||
Review via email: mp+133789@code.launchpad.net |
Commit message
Description of the change
Some more fixes and recipes from the kalibrator project, plus a .bzrignore update covering our TV-listing recipes.
To post a comment you must log in.
Preview Diff
[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1 | === modified file '.bzrignore' |
2 | --- .bzrignore 2012-11-07 11:57:11 +0000 |
3 | +++ .bzrignore 2012-11-10 10:56:21 +0000 |
4 | @@ -39,3 +39,45 @@ |
5 | recipes/.gitignore |
6 | recipes/README |
7 | recipes/katalog_egazeciarz.recipe |
8 | +recipes/tv_axnscifi.recipe |
9 | +recipes/tv_comedycentral.recipe |
10 | +recipes/tv_discoveryscience.recipe |
11 | +recipes/tv_foxlife.recipe |
12 | +recipes/tv_fox.recipe |
13 | +recipes/tv_hbo.recipe |
14 | +recipes/tv_kinopolska.recipe |
15 | +recipes/tv_nationalgeographic.recipe |
16 | +recipes/tv_polsat2.recipe |
17 | +recipes/tv_polsat.recipe |
18 | +recipes/tv_tv4.recipe |
19 | +recipes/tv_tvn7.recipe |
20 | +recipes/tv_tvn.recipe |
21 | +recipes/tv_tvp1.recipe |
22 | +recipes/tv_tvp2.recipe |
23 | +recipes/tv_tvphd.recipe |
24 | +recipes/tv_tvphistoria.recipe |
25 | +recipes/tv_tvpkultura.recipe |
26 | +recipes/tv_tvppolonia.recipe |
27 | +recipes/tv_tvpuls.recipe |
28 | +recipes/tv_viasathistory.recipe |
29 | +recipes/icons/tv_axnscifi.png |
30 | +recipes/icons/tv_comedycentral.png |
31 | +recipes/icons/tv_discoveryscience.png |
32 | +recipes/icons/tv_foxlife.png |
33 | +recipes/icons/tv_fox.png |
34 | +recipes/icons/tv_hbo.png |
35 | +recipes/icons/tv_kinopolska.png |
36 | +recipes/icons/tv_nationalgeographic.png |
37 | +recipes/icons/tv_polsat2.png |
38 | +recipes/icons/tv_polsat.png |
39 | +recipes/icons/tv_tv4.png |
40 | +recipes/icons/tv_tvn7.png |
41 | +recipes/icons/tv_tvn.png |
42 | +recipes/icons/tv_tvp1.png |
43 | +recipes/icons/tv_tvp2.png |
44 | +recipes/icons/tv_tvphd.png |
45 | +recipes/icons/tv_tvphistoria.png |
46 | +recipes/icons/tv_tvpkultura.png |
47 | +recipes/icons/tv_tvppolonia.png |
48 | +recipes/icons/tv_tvpuls.png |
49 | +recipes/icons/tv_viasathistory.png |
50 | |
51 | === added file 'recipes/autosport.recipe' |
52 | --- recipes/autosport.recipe 1970-01-01 00:00:00 +0000 |
53 | +++ recipes/autosport.recipe 2012-11-10 10:56:21 +0000 |
54 | @@ -0,0 +1,31 @@ |
55 | +#!/usr/bin/env python |
56 | + |
57 | +__license__ = 'GPL v3' |
58 | +__author__ = 'MrStefan <mrstefaan@gmail.com>' |
59 | + |
60 | +''' |
61 | +www.autosport.com |
62 | +''' |
63 | + |
64 | +from calibre.web.feeds.news import BasicNewsRecipe |
65 | +import re |
66 | + |
67 | +class autosport(BasicNewsRecipe): |
68 | + title = u'Autosport' |
69 | + __author__ = 'MrStefan <mrstefaan@gmail.com>' |
70 | + language = 'en_GB' |
71 | + description =u'Daily Formula 1 and motorsport news from the leading weekly motor racing magazine. The authority on Formula 1, F1, MotoGP, GP2, Champ Car, Le Mans...' |
72 | + masthead_url='http://cdn.images.autosport.com/asdotcom.gif' |
73 | + remove_empty_feeds= True |
74 | + oldest_article = 1 |
75 | + max_articles_per_feed = 100 |
76 | + remove_javascript=True |
77 | + no_stylesheets=True |
78 | + |
79 | + keep_only_tags =[] |
80 | + keep_only_tags.append(dict(name = 'h1', attrs = {'class' : 'news_headline'})) |
81 | + keep_only_tags.append(dict(name = 'td', attrs = {'class' : 'news_article_author'})) |
82 | + keep_only_tags.append(dict(name = 'td', attrs = {'class' : 'news_article_date'})) |
83 | + keep_only_tags.append(dict(name = 'p')) |
84 | + |
85 | + feeds = [(u'ALL NEWS', u'http://www.autosport.com/rss/allnews.xml')] |
86 | \ No newline at end of file |
87 | |
88 | === added file 'recipes/blognexto.recipe' |
89 | --- recipes/blognexto.recipe 1970-01-01 00:00:00 +0000 |
90 | +++ recipes/blognexto.recipe 2012-11-10 10:56:21 +0000 |
91 | @@ -0,0 +1,29 @@ |
92 | +from calibre.web.feeds.news import BasicNewsRecipe |
93 | +import re |
94 | + |
95 | +class blognexto(BasicNewsRecipe): |
96 | + title = 'BLOG.NEXTO.pl' |
97 | + __author__ = 'MrStefan <mrstefaan@gmail.com>' |
98 | + language = 'pl' |
99 | + description ='o e-publikacjach prawie wszystko' |
100 | + masthead_url='http://blog.nexto.pl/wp-content/uploads/2012/04/logo-blog-nexto.pl_.jpg' |
101 | + remove_empty_feeds= True |
102 | + oldest_article = 7 |
103 | + max_articles_per_feed = 100 |
104 | + remove_javascript=True |
105 | + no_stylesheets=True |
106 | + |
107 | + |
108 | + keep_only_tags =[] |
109 | + keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'content'})) |
110 | + |
111 | + remove_tags =[] |
112 | + remove_tags.append(dict(name = 'div', attrs = {'class' : 'comment-cloud'})) |
113 | + remove_tags.append(dict(name = 'p', attrs = {'class' : 'post-date1'})) |
114 | + remove_tags.append(dict(name = 'div', attrs = {'class' : 'fb-like'})) |
115 | + remove_tags.append(dict(name = 'div', attrs = {'class' : 'tags'})) |
116 | + remove_tags.append(dict(name = 'div', attrs = {'class' : 'postnavi'})) |
117 | + remove_tags.append(dict(name = 'div', attrs = {'class' : 'commments-box'})) |
118 | + remove_tags.append(dict(name = 'div', attrs = {'id' : 'respond'})) |
119 | + |
120 | + feeds = [('Artykuly', 'http://feeds.feedburner.com/blognexto')] |
121 | |
122 | === added file 'recipes/brewiarz.recipe' |
123 | --- recipes/brewiarz.recipe 1970-01-01 00:00:00 +0000 |
124 | +++ recipes/brewiarz.recipe 2012-11-10 10:56:21 +0000 |
125 | @@ -0,0 +1,141 @@ |
126 | +#!/usr/bin/env python |
127 | + |
128 | +__license__ = 'GPL v3' |
129 | + |
130 | +from calibre.web.feeds.news import BasicNewsRecipe |
131 | +import datetime |
132 | +from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, Tag |
133 | + |
134 | + |
135 | +class brewiarz(BasicNewsRecipe): |
136 | + title = u'Brewiarz' |
137 | + __author__ = 'Artur Stachecki <artur.stachecki@gmail.com>' |
138 | + language = 'pl' |
139 | + description = u'Serwis poświęcony Liturgii Godzin (brewiarzowi) - formie codziennej modlitwy Kościoła katolickiego.' |
140 | + masthead_url = 'http://brewiarz.pl/images/logo2.gif' |
141 | + max_articles_per_feed = 100 |
142 | + remove_javascript = True |
143 | + no_stylesheets = True |
144 | + publication_type = 'newspaper' |
145 | + next_days = 1 |
146 | + |
147 | + def parse_index(self): |
148 | + dec2rom_dict = {"01": "i", "02": "ii", "03": "iii", "04": "iv", |
149 | + "05": "v", "06": "vi", "07": "vii", "08": "viii", |
150 | + "09": "ix", "10": "x", "11": "xi", "12": "xii"} |
151 | + |
152 | + weekday_dict = {"Sunday": "Niedziela", "Monday": "Poniedziałek", "Tuesday": "Wtorek", |
153 | + "Wednesday": "Środa", "Thursday": "Czwartek", "Friday": "Piątek", "Saturday": "Sobota"} |
154 | + |
155 | + now = datetime.datetime.now() |
156 | + |
157 | + feeds = [] |
158 | + for i in range(0, self.next_days): |
159 | + url_date = now + datetime.timedelta(days=i) |
160 | + url_date_month = url_date.strftime("%m") |
161 | + url_date_month_roman = dec2rom_dict[url_date_month] |
162 | + url_date_day = url_date.strftime("%d") |
163 | + url_date_year = url_date.strftime("%Y")[2:] |
164 | + url_date_weekday = url_date.strftime("%A") |
165 | + url_date_weekday_pl = weekday_dict[url_date_weekday] |
166 | + |
167 | + url = "http://brewiarz.pl/" + url_date_month_roman + "_" + url_date_year + "/" + url_date_day + url_date_month + "/index.php3" |
168 | + articles = self.parse_pages(url) |
169 | + if articles: |
170 | + title = url_date_weekday_pl + " " + url_date_day + "." + url_date_month + "." + url_date_year |
171 | + feeds.append((title, articles)) |
172 | + else: |
173 | + sectors = self.get_sectors(url) |
174 | + for subpage in sectors: |
175 | + title = url_date_weekday_pl + " " + url_date_day + "." + url_date_month + "." + url_date_year + " - " + subpage.string |
176 | + url = "http://brewiarz.pl/" + url_date_month_roman + "_" + url_date_year + "/" + url_date_day + url_date_month + "/" + subpage['href'] |
177 | + print(url) |
178 | + articles = self.parse_pages(url) |
179 | + if articles: |
180 | + feeds.append((title, articles)) |
181 | + return feeds |
182 | + |
183 | + def get_sectors(self, url): |
184 | + sectors = [] |
185 | + soup = self.index_to_soup(url) |
186 | + sectors_table = soup.find(name='table', attrs={'width': '490'}) |
187 | + sector_links = sectors_table.findAll(name='a') |
188 | + for sector_links_modified in sector_links: |
189 | + link_parent_text = sector_links_modified.findParent(name='div').text |
190 | + if link_parent_text: |
191 | + sector_links_modified.text = link_parent_text.text |
192 | + sectors.append(sector_links_modified) |
193 | + return sectors |
194 | + |
195 | + def parse_pages(self, url): |
196 | + current_articles = [] |
197 | + soup = self.index_to_soup(url) |
198 | + www = soup.find(attrs={'class': 'www'}) |
199 | + if www: |
200 | + box_title = www.find(text='Teksty LG') |
201 | + article_box_parent = box_title.findParent('ul') |
202 | + article_box_sibling = article_box_parent.findNextSibling('ul') |
203 | + for li in article_box_sibling.findAll('li'): |
204 | + link = li.find(name='a') |
205 | + ol = link.findNextSibling(name='ol') |
206 | + if ol: |
207 | + sublinks = ol.findAll(name='a') |
208 | + for sublink in sublinks: |
209 | + link_title = self.tag_to_string(link) + " - " + self.tag_to_string(sublink) |
210 | + link_url_print = re.sub('php3', 'php3?kr=_druk&wr=lg&', sublink['href']) |
211 | + link_url = url[:-10] + link_url_print |
212 | + current_articles.append({'title': link_title, |
213 | + 'url': link_url, 'description': '', 'date': ''}) |
214 | + else: |
215 | + if link.findParent(name = 'ol'): |
216 | + continue |
217 | + else: |
218 | + link_title = self.tag_to_string(link) |
219 | + link_url_print = re.sub('php3', 'php3?kr=_druk&wr=lg&', link['href']) |
220 | + link_url = url[:-10] + link_url_print |
221 | + current_articles.append({'title': link_title, |
222 | + 'url': link_url, 'description': '', 'date': ''}) |
223 | + return current_articles |
224 | + else: |
225 | + return None |
226 | + |
227 | + def preprocess_html(self, soup): |
228 | + footer = soup.find(name='a', attrs={'href': 'http://brewiarz.pl'}) |
229 | + footer_parent = footer.findParent('div') |
230 | + footer_parent.extract() |
231 | + |
232 | + header = soup.find(text='http://brewiarz.pl') |
233 | + header_parent = header.findParent('div') |
234 | + header_parent.extract() |
235 | + |
236 | + subheader = soup.find(text='Kolor szat:').findParent('div') |
237 | + subheader.extract() |
238 | + |
239 | + color = soup.find('b') |
240 | + color.extract() |
241 | + |
242 | + cleaned = self.strip_tags(soup) |
243 | + |
244 | + div = cleaned.findAll(name='div') |
245 | + div[1].extract() |
246 | + div[2].extract() |
247 | + div[3].extract() |
248 | + |
249 | + return cleaned |
250 | + |
251 | + def strip_tags(self, soup_dirty): |
252 | + VALID_TAGS = ['p', 'div', 'br', 'b', 'a', 'title', 'head', 'html', 'body'] |
253 | + |
254 | + for tag in soup_dirty.findAll(True): |
255 | + if tag.name not in VALID_TAGS: |
256 | + for i, x in enumerate(tag.parent.contents): |
257 | + if x == tag: |
258 | + break |
259 | + else: |
260 | + print "Can't find", tag, "in", tag.parent |
261 | + continue |
262 | + for r in reversed(tag.contents): |
263 | + tag.parent.insert(i, r) |
264 | + tag.extract() |
265 | + |
266 | + return soup_dirty |
267 | |
268 | === modified file 'recipes/dobreprogamy.recipe' |
269 | --- recipes/dobreprogamy.recipe 2012-10-17 14:12:08 +0000 |
270 | +++ recipes/dobreprogamy.recipe 2012-11-10 10:56:21 +0000 |
271 | @@ -6,7 +6,6 @@ |
272 | __author__ = 'fenuks' |
273 | __licence__ ='GPL v3' |
274 | category = 'IT' |
275 | - language = 'pl' |
276 | masthead_url='http://static.dpcdn.pl/css/Black/Images/header_logo_napis_fullVersion.png' |
277 | cover_url = 'http://userlogos.org/files/logos/Karmody/dobreprogramy_01.png' |
278 | description = u'Aktualności i blogi z dobreprogramy.pl' |
279 | @@ -29,4 +28,4 @@ |
280 | for a in soup('a'): |
281 | if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']: |
282 | a['href']=self.index + a['href'] |
283 | - return soup |
284 | \ No newline at end of file |
285 | + return soup |
286 | |
287 | === added file 'recipes/icons/autosport.png' |
288 | Binary files recipes/icons/autosport.png 1970-01-01 00:00:00 +0000 and recipes/icons/autosport.png 2012-11-10 10:56:21 +0000 differ |
289 | === added file 'recipes/icons/blognexto.png' |
290 | Binary files recipes/icons/blognexto.png 1970-01-01 00:00:00 +0000 and recipes/icons/blognexto.png 2012-11-10 10:56:21 +0000 differ |
291 | === added file 'recipes/icons/brewiarz.png' |
292 | Binary files recipes/icons/brewiarz.png 1970-01-01 00:00:00 +0000 and recipes/icons/brewiarz.png 2012-11-10 10:56:21 +0000 differ |
293 | === added file 'recipes/icons/naszdziennik.png' |
294 | Binary files recipes/icons/naszdziennik.png 1970-01-01 00:00:00 +0000 and recipes/icons/naszdziennik.png 2012-11-10 10:56:21 +0000 differ |
295 | === added file 'recipes/icons/wprost.png' |
296 | Binary files recipes/icons/wprost.png 1970-01-01 00:00:00 +0000 and recipes/icons/wprost.png 2012-11-10 10:56:21 +0000 differ |