Merge lp:~lifanxi/calibre/douban-metadata-plugin into lp:calibre
- douban-metadata-plugin
- Merge into trunk
Proposed by
Li Fanxi
Status: | Merged |
---|---|
Merged at revision: | 9207 |
Proposed branch: | lp:~lifanxi/calibre/douban-metadata-plugin |
Merge into: | lp:calibre |
Diff against target: |
368 lines (+351/-2) 2 files modified
src/calibre/customize/builtins.py (+2/-2) src/calibre/ebooks/metadata/sources/douban.py (+349/-0) |
To merge this branch: | bzr merge lp:~lifanxi/calibre/douban-metadata-plugin |
Related bugs: |
Reviewer | Review Type | Date Requested | Status |
---|---|---|---|
Kovid Goyal | Pending | ||
Review via email: mp+60315@code.launchpad.net |
Commit message
Description of the change
Updated the Douban.com metadata source plugin to work with the new metadata source framework in calibre 0.8.
You may want to add logic to disable this plugin by default when the interface language is not set to zh (Chinese).
Thanks.
To post a comment you must log in.
Preview Diff
[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1 | === modified file 'src/calibre/customize/builtins.py' |
2 | --- src/calibre/customize/builtins.py 2011-05-03 06:24:21 +0000 |
3 | +++ src/calibre/customize/builtins.py 2011-05-08 14:53:23 +0000 |
4 | @@ -628,8 +628,8 @@ |
5 | from calibre.ebooks.metadata.sources.openlibrary import OpenLibrary |
6 | from calibre.ebooks.metadata.sources.isbndb import ISBNDB |
7 | from calibre.ebooks.metadata.sources.overdrive import OverDrive |
8 | - |
9 | -plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive] |
10 | +from calibre.ebooks.metadata.sources.douban import Douban |
11 | +plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive, Douban] |
12 | |
13 | # }}} |
14 | |
15 | |
16 | === added file 'src/calibre/ebooks/metadata/sources/douban.py' |
17 | --- src/calibre/ebooks/metadata/sources/douban.py 1970-01-01 00:00:00 +0000 |
18 | +++ src/calibre/ebooks/metadata/sources/douban.py 2011-05-08 14:53:23 +0000 |
19 | @@ -0,0 +1,349 @@ |
20 | +#!/usr/bin/env python |
21 | +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai |
22 | +from __future__ import (unicode_literals, division, absolute_import, |
23 | + print_function) |
24 | + |
25 | +__license__ = 'GPL v3' |
26 | +__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>; 2011, Li Fanxi <lifanxi@freemindworld.com>' |
27 | +__docformat__ = 'restructuredtext en' |
28 | + |
29 | +import time, hashlib |
30 | +from urllib import urlencode |
31 | +from functools import partial |
32 | +from Queue import Queue, Empty |
33 | + |
34 | +from lxml import etree |
35 | + |
36 | +from calibre.ebooks.metadata import check_isbn |
37 | +from calibre.ebooks.metadata.sources.base import Source |
38 | +from calibre.ebooks.metadata.book.base import Metadata |
39 | +from calibre.ebooks.chardet import xml_to_unicode |
40 | +from calibre.utils.date import parse_date, utcnow |
41 | +from calibre.utils.cleantext import clean_ascii_chars |
42 | +from calibre import as_unicode |
43 | + |
# XML namespaces used by the Douban Atom book feeds.
NAMESPACES = {
              'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
              'atom' : 'http://www.w3.org/2005/Atom',
              'db': 'http://www.douban.com/xmlns/',
              'gd': 'http://schemas.google.com/g/2005'
            }
# Pre-compiled XPath expressions, all bound to the namespaces above.
XPath = partial(etree.XPath, namespaces=NAMESPACES)
total_results  = XPath('//openSearch:totalResults')
start_index    = XPath('//openSearch:startIndex')
items_per_page = XPath('//openSearch:itemsPerPage')
entry          = XPath('//atom:entry')
entry_id       = XPath('descendant::atom:id')
title          = XPath('descendant::atom:title')
description    = XPath('descendant::atom:summary')
publisher      = XPath("descendant::db:attribute[@name='publisher']")
isbn           = XPath("descendant::db:attribute[@name='isbn13']")
date           = XPath("descendant::db:attribute[@name='pubdate']")
creator        = XPath("descendant::db:attribute[@name='author']")
booktag        = XPath("descendant::db:tag/attribute::name")
rating         = XPath("descendant::gd:rating/attribute::average")
cover_url      = XPath("descendant::atom:link[@rel='image']/attribute::href")
65 | + |
def get_details(browser, url, timeout): # {{{
    '''
    Fetch the raw body of *url*.

    Douban answers 403 when it is throttling the client; in that case wait
    briefly and retry exactly once. Any other error propagates unchanged.
    '''
    try:
        return browser.open_novisit(url, timeout=timeout).read()
    except Exception as e:
        code = getattr(e, 'getcode', lambda : -1)()
        if code != 403:
            raise
    # Douban is throttling us, wait a little before the single retry
    time.sleep(2)
    return browser.open_novisit(url, timeout=timeout).read()
# }}}
79 | + |
def to_metadata(browser, log, entry_, timeout): # {{{
    '''
    Convert one ``<atom:entry>`` from a Douban search feed into a calibre
    Metadata object, fetching the per-book detail feed for the extra fields.

    Returns None when the entry has no usable id URL or title; returns a
    bare title/authors Metadata if the detail fetch fails (best effort).
    '''
    def get_text(extra, x):
        # Evaluate XPath *x* on *extra*; return the stripped text of the
        # first match, or None if there is no non-blank match.
        try:
            ans = x(extra)
            if ans:
                ans = ans[0].text
                if ans and ans.strip():
                    return ans.strip()
        except:
            log.exception('Programming error:')
        return None

    id_url = entry_id(entry_)[0].text
    douban_id = id_url.split('/')[-1]
    title_ = ': '.join([x.text for x in title(entry_)]).strip()
    authors = [x.text.strip() for x in creator(entry_) if x.text]
    if not authors:
        authors = [_('Unknown')]
    # BUG FIX: the original tested the module-level XPath object ``title``
    # (always truthy) instead of the extracted text ``title_``, so entries
    # with an empty title were never discarded.
    if not id_url or not title_:
        # Silently discard this entry
        return None

    mi = Metadata(title_, authors)
    mi.identifiers = {'douban':douban_id}
    try:
        raw = get_details(browser, id_url, timeout)
        feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
            strip_encoding_pats=True)[0])
        extra = entry(feed)[0]
    except:
        # Best effort: keep the basic metadata if the detail fetch fails
        log.exception('Failed to get additional details for', mi.title)
        return mi
    mi.comments = get_text(extra, description)
    mi.publisher = get_text(extra, publisher)

    # ISBN: keep all valid ones, prefer the longest (ISBN-13 over ISBN-10)
    isbns = []
    for x in [t.text for t in isbn(extra)]:
        if check_isbn(x):
            isbns.append(x)
    if isbns:
        mi.isbn = sorted(isbns, key=len)[-1]
        mi.all_isbns = isbns

    # Tags: douban tags may contain '/'-separated sub-tags; flatten and dedupe
    try:
        btags = [x for x in booktag(extra) if x]
        tags = []
        for t in btags:
            atags = [y.strip() for y in t.split('/')]
            for tag in atags:
                if tag not in tags:
                    tags.append(tag)
    except:
        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        # ',' is calibre's tag separator, so replace it inside tag names
        mi.tags = [x.replace(',', ';') for x in tags]

    # pubdate: mid-month default avoids timezone day drift
    pubdate = get_text(extra, date)
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
            log.error('Failed to parse pubdate %r'%pubdate)

    # Ratings: douban rates out of 10, calibre out of 5
    if rating(extra):
        try:
            mi.rating = float(rating(extra)[0]) / 2.0
        except:
            log.exception('Failed to parse rating')
            mi.rating = 0

    # Cover: ask for the large image; a 'book-default' URL means no real cover
    mi.has_douban_cover = None
    u = cover_url(extra)
    if u:
        u = u[0].replace('/spic/', '/lpic/')
        if u.find('book-default') == -1:
            mi.has_douban_cover = u
    return mi
# }}}
166 | + |
class Douban(Source):
    '''
    Metadata source plugin that queries the Douban.com books API
    (http://api.douban.com) for identification and covers.
    '''

    name = 'Douban Books'
    author = _('Li Fanxi')
    version = (2, 0, 0)

    description = _('Downloads metadata from Douban.com')

    capabilities = frozenset(['identify', 'cover'])
    touched_fields = frozenset(['title', 'authors', 'tags',
        'pubdate', 'comments', 'publisher', 'identifier:isbn', 'rating',
        'identifier:douban']) # language currently disabled
    supports_gzip_transfer_encoding = True
    cached_cover_url_is_reliable = True

    DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d'
    DOUBAN_BOOK_URL = 'http://book.douban.com/subject/%s/'

    def get_book_url(self, identifiers): # {{{
        '''Return ('douban', id, url) for a known douban id, else None.'''
        db = identifiers.get('douban', None)
        if db is not None:
            return ('douban', db, self.DOUBAN_BOOK_URL%db)
        else:
            return None
    # }}}

    def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
        '''
        Build the API URL for an identify query.

        Preference order: ISBN lookup, then douban subject id, then a
        free-text search on title/author tokens. Returns None when there
        is nothing usable to search for.
        '''
        SEARCH_URL = 'http://api.douban.com/book/subjects?'
        ISBN_URL = 'http://api.douban.com/book/subject/isbn/'
        SUBJECT_URL = 'http://api.douban.com/book/subject/'

        q = ''
        t = None
        isbn = check_isbn(identifiers.get('isbn', None))
        subject = identifiers.get('douban', None)
        if isbn is not None:
            q = isbn
            t = 'isbn'
        elif subject is not None:
            q = subject
            t = 'subject'
        elif title or authors:
            title_tokens = list(self.get_title_tokens(title))
            if title_tokens:
                q += ' '.join(title_tokens)
            # BUG FIX: get_author_tokens returns a generator, which is always
            # truthy; materialize it so the emptiness test is meaningful.
            author_tokens = list(self.get_author_tokens(authors,
                    only_first_author=True))
            if author_tokens:
                q += ((' ' if q != '' else '') + ' '.join(author_tokens))
            t = 'search'
        q = q.strip()
        if isinstance(q, unicode):
            q = q.encode('utf-8')
        if not q:
            return None
        if t == 'isbn':
            url = ISBN_URL + q
        elif t == 'subject':
            url = SUBJECT_URL + q
        else:
            url = SEARCH_URL + urlencode({
                'q': q,
                })
        if self.DOUBAN_API_KEY:
            # BUG FIX: the search URL already contains a query string, so the
            # API key must be joined with '&' there; appending a second '?'
            # produced a malformed URL like ...subjects?q=x?apikey=...
            url += ('&' if '?' in url else '?') + 'apikey=' + self.DOUBAN_API_KEY
        return url
    # }}}

    def download_cover(self, log, result_queue, abort, # {{{
            title=None, authors=None, identifiers={}, timeout=30):
        '''
        Download the book cover, running identify first if no cover URL
        has been cached yet. Puts (self, data) on result_queue on success.
        '''
        cached_url = self.get_cached_cover_url(identifiers)
        if cached_url is None:
            log.info('No cached cover found, running identify')
            rq = Queue()
            self.identify(log, rq, abort, title=title, authors=authors,
                    identifiers=identifiers)
            if abort.is_set():
                return
            # Drain the queue and try results in relevance order
            results = []
            while True:
                try:
                    results.append(rq.get_nowait())
                except Empty:
                    break
            results.sort(key=self.identify_results_keygen(
                title=title, authors=authors, identifiers=identifiers))
            for mi in results:
                cached_url = self.get_cached_cover_url(mi.identifiers)
                if cached_url is not None:
                    break
        if cached_url is None:
            log.info('No cover found')
            return

        if abort.is_set():
            return
        br = self.browser
        log('Downloading cover from:', cached_url)
        try:
            cdata = br.open_novisit(cached_url, timeout=timeout).read()
            if cdata:
                result_queue.put((self, cdata))
        except:
            log.exception('Failed to download cover from:', cached_url)

    # }}}

    def get_cached_cover_url(self, identifiers): # {{{
        '''Return the cached cover URL for these identifiers, or None.'''
        url = None
        db = identifiers.get('douban', None)
        if db is None:
            # Fall back to mapping the ISBN to a douban id first
            isbn = identifiers.get('isbn', None)
            if isbn is not None:
                db = self.cached_isbn_to_identifier(isbn)
        if db is not None:
            url = self.cached_identifier_to_cover_url(db)

        return url
    # }}}

    def get_all_details(self, br, log, entries, abort, # {{{
            result_queue, timeout):
        '''
        Fetch full metadata for each feed entry sequentially (douban
        throttles parallel requests), caching ISBN->id and id->cover
        mappings, and put each Metadata on result_queue.
        '''
        for relevance, i in enumerate(entries):
            try:
                ans = to_metadata(br, log, i, timeout)
                if isinstance(ans, Metadata):
                    ans.source_relevance = relevance
                    db = ans.identifiers['douban']
                    for isbn in getattr(ans, 'all_isbns', []):
                        self.cache_isbn_to_identifier(isbn, db)
                    if ans.has_douban_cover:
                        self.cache_identifier_to_cover_url(db,
                                ans.has_douban_cover)
                    self.clean_downloaded_metadata(ans)
                    result_queue.put(ans)
            except:
                log.exception(
                    'Failed to get metadata for identify entry:',
                    etree.tostring(i))
            if abort.is_set():
                break
    # }}}

    def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
            identifiers={}, timeout=30):
        '''
        Run an identify query against douban and put Metadata results on
        result_queue. Returns an error string on failure, None on success.
        '''
        query = self.create_query(log, title=title, authors=authors,
                identifiers=identifiers)
        if not query:
            log.error('Insufficient metadata to construct query')
            return
        br = self.browser
        try:
            raw = br.open_novisit(query, timeout=timeout).read()
        except Exception as e:
            log.exception('Failed to make identify query: %r'%query)
            return as_unicode(e)
        try:
            parser = etree.XMLParser(recover=True, no_network=True)
            feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
                strip_encoding_pats=True)[0], parser=parser)
            entries = entry(feed)
        except Exception as e:
            log.exception('Failed to parse identify results')
            return as_unicode(e)
        if not entries and identifiers and title and authors and \
                not abort.is_set():
            # Identifier lookup found nothing; retry as a title/author search
            return self.identify(log, result_queue, abort, title=title,
                    authors=authors, timeout=timeout)

        # There is no point running these queries in threads as douban
        # throttles requests returning 403 Forbidden errors
        self.get_all_details(br, log, entries, abort, result_queue, timeout)

        return None
    # }}}
346 | + |
if __name__ == '__main__': # tests {{{
    # To run these tests use:
    #   calibre-debug -e src/calibre/ebooks/metadata/sources/douban.py
    from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
            title_test, authors_test)

    test_cases = [
        # ISBN lookup must return this exact title and author
        (
            {'identifiers':{'isbn': '9787536692930'}, 'title':'三体',
                'authors':['刘慈欣']},
            [title_test('三体', exact=True),
                authors_test(['刘慈欣'])]
        ),

        # Free-text search by title/author
        (
            {'title': 'Linux内核修炼之道', 'authors':['任桥伟']},
            [title_test('Linux内核修炼之道', exact=False)]
        ),
    ]
    test_identify_plugin(Douban.name, test_cases)
# }}}
368 | + |