Merge lp:~lifanxi/calibre/douban-metadata-plugin into lp:calibre

Proposed by Li Fanxi
Status: Merged
Merged at revision: 9207
Proposed branch: lp:~lifanxi/calibre/douban-metadata-plugin
Merge into: lp:calibre
Diff against target: 368 lines (+351/-2)
2 files modified
src/calibre/customize/builtins.py (+2/-2)
src/calibre/ebooks/metadata/sources/douban.py (+349/-0)
To merge this branch: bzr merge lp:~lifanxi/calibre/douban-metadata-plugin
Reviewer Review Type Date Requested Status
Kovid Goyal Pending
Review via email: mp+60315@code.launchpad.net

Description of the change

Updated the Douban.com metadata source plugin to work with the new metadata source framework in calibre 0.8.
You may add the logic to disable this plugin by default if the language is not set to zh.
Thanks.

To post a comment you must log in.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
=== modified file 'src/calibre/customize/builtins.py'
--- src/calibre/customize/builtins.py 2011-05-03 06:24:21 +0000
+++ src/calibre/customize/builtins.py 2011-05-08 14:53:23 +0000
@@ -628,8 +628,8 @@
628from calibre.ebooks.metadata.sources.openlibrary import OpenLibrary628from calibre.ebooks.metadata.sources.openlibrary import OpenLibrary
629from calibre.ebooks.metadata.sources.isbndb import ISBNDB629from calibre.ebooks.metadata.sources.isbndb import ISBNDB
630from calibre.ebooks.metadata.sources.overdrive import OverDrive630from calibre.ebooks.metadata.sources.overdrive import OverDrive
631631from calibre.ebooks.metadata.sources.douban import Douban
632plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive]632plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive, Douban]
633633
634# }}}634# }}}
635635
636636
=== added file 'src/calibre/ebooks/metadata/sources/douban.py'
--- src/calibre/ebooks/metadata/sources/douban.py 1970-01-01 00:00:00 +0000
+++ src/calibre/ebooks/metadata/sources/douban.py 2011-05-08 14:53:23 +0000
@@ -0,0 +1,349 @@
1#!/usr/bin/env python
2# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
3from __future__ import (unicode_literals, division, absolute_import,
4 print_function)
5
6__license__ = 'GPL v3'
7__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>; 2011, Li Fanxi <lifanxi@freemindworld.com>'
8__docformat__ = 'restructuredtext en'
9
10import time, hashlib
11from urllib import urlencode
12from functools import partial
13from Queue import Queue, Empty
14
15from lxml import etree
16
17from calibre.ebooks.metadata import check_isbn
18from calibre.ebooks.metadata.sources.base import Source
19from calibre.ebooks.metadata.book.base import Metadata
20from calibre.ebooks.chardet import xml_to_unicode
21from calibre.utils.date import parse_date, utcnow
22from calibre.utils.cleantext import clean_ascii_chars
23from calibre import as_unicode
24
25NAMESPACES = {
26 'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
27 'atom' : 'http://www.w3.org/2005/Atom',
28 'db': 'http://www.douban.com/xmlns/',
29 'gd': 'http://schemas.google.com/g/2005'
30 }
31XPath = partial(etree.XPath, namespaces=NAMESPACES)
32total_results = XPath('//openSearch:totalResults')
33start_index = XPath('//openSearch:startIndex')
34items_per_page = XPath('//openSearch:itemsPerPage')
35entry = XPath('//atom:entry')
36entry_id = XPath('descendant::atom:id')
37title = XPath('descendant::atom:title')
38description = XPath('descendant::atom:summary')
39publisher = XPath("descendant::db:attribute[@name='publisher']")
40isbn = XPath("descendant::db:attribute[@name='isbn13']")
41date = XPath("descendant::db:attribute[@name='pubdate']")
42creator = XPath("descendant::db:attribute[@name='author']")
43booktag = XPath("descendant::db:tag/attribute::name")
44rating = XPath("descendant::gd:rating/attribute::average")
45cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")
46
47def get_details(browser, url, timeout): # {{{
48 try:
49 raw = browser.open_novisit(url, timeout=timeout).read()
50 except Exception as e:
51 gc = getattr(e, 'getcode', lambda : -1)
52 if gc() != 403:
53 raise
54 # Douban is throttling us, wait a little
55 time.sleep(2)
56 raw = browser.open_novisit(url, timeout=timeout).read()
57
58 return raw
59# }}}
60
61def to_metadata(browser, log, entry_, timeout): # {{{
62 def get_text(extra, x):
63 try:
64 ans = x(extra)
65 if ans:
66 ans = ans[0].text
67 if ans and ans.strip():
68 return ans.strip()
69 except:
70 log.exception('Programming error:')
71 return None
72
73 id_url = entry_id(entry_)[0].text
74 douban_id = id_url.split('/')[-1]
75 title_ = ': '.join([x.text for x in title(entry_)]).strip()
76 authors = [x.text.strip() for x in creator(entry_) if x.text]
77 if not authors:
78 authors = [_('Unknown')]
79 if not id_url or not title:
80 # Silently discard this entry
81 return None
82
83 mi = Metadata(title_, authors)
84 mi.identifiers = {'douban':douban_id}
85 try:
86 raw = get_details(browser, id_url, timeout)
87 feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
88 strip_encoding_pats=True)[0])
89 extra = entry(feed)[0]
90 except:
91 log.exception('Failed to get additional details for', mi.title)
92 return mi
93 mi.comments = get_text(extra, description)
94 mi.publisher = get_text(extra, publisher)
95
96 # ISBN
97 isbns = []
98 for x in [t.text for t in isbn(extra)]:
99 if check_isbn(x):
100 isbns.append(x)
101 if isbns:
102 mi.isbn = sorted(isbns, key=len)[-1]
103 mi.all_isbns = isbns
104
105 # Tags
106 try:
107 btags = [x for x in booktag(extra) if x]
108 tags = []
109 for t in btags:
110 atags = [y.strip() for y in t.split('/')]
111 for tag in atags:
112 if tag not in tags:
113 tags.append(tag)
114 except:
115 log.exception('Failed to parse tags:')
116 tags = []
117 if tags:
118 mi.tags = [x.replace(',', ';') for x in tags]
119
120 # pubdate
121 pubdate = get_text(extra, date)
122 if pubdate:
123 try:
124 default = utcnow().replace(day=15)
125 mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
126 except:
127 log.error('Failed to parse pubdate %r'%pubdate)
128
129 # Ratings
130 if rating(extra):
131 try:
132 mi.rating = float(rating(extra)[0]) / 2.0
133 except:
134 log.exception('Failed to parse rating')
135 mi.rating = 0
136
137 # Cover
138 mi.has_douban_cover = None
139 u = cover_url(extra)
140 if u:
141 u = u[0].replace('/spic/', '/lpic/');
142 # If URL contains "book-default", the book doesn't have a cover
143 if u.find('book-default') == -1:
144 mi.has_douban_cover = u
145 return mi
146# }}}
147
148class Douban(Source):
149
150 name = 'Douban Books'
151 author = _('Li Fanxi')
152 version = (2, 0, 0)
153
154 description = _('Downloads metadata from Douban.com')
155
156 capabilities = frozenset(['identify', 'cover'])
157 touched_fields = frozenset(['title', 'authors', 'tags',
158 'pubdate', 'comments', 'publisher', 'identifier:isbn', 'rating',
159 'identifier:douban']) # language currently disabled
160 supports_gzip_transfer_encoding = True
161 cached_cover_url_is_reliable = True
162
163 DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d'
164 DOUBAN_BOOK_URL = 'http://book.douban.com/subject/%s/'
165
166 def get_book_url(self, identifiers): # {{{
167 db = identifiers.get('douban', None)
168 if db is not None:
169 return ('douban', db, self.DOUBAN_BOOK_URL%db)
170 else:
171 return None
172 # }}}
173
174 def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
175 SEARCH_URL = 'http://api.douban.com/book/subjects?'
176 ISBN_URL = 'http://api.douban.com/book/subject/isbn/'
177 SUBJECT_URL = 'http://api.douban.com/book/subject/'
178
179 q = ''
180 t = None
181 isbn = check_isbn(identifiers.get('isbn', None))
182 subject = identifiers.get('douban', None)
183 if isbn is not None:
184 q = isbn
185 t = 'isbn'
186 elif subject is not None:
187 q = subject
188 t = 'subject'
189 elif title or authors:
190 def build_term(prefix, parts):
191 return ' '.join(x for x in parts)
192 title_tokens = list(self.get_title_tokens(title))
193 if title_tokens:
194 q += build_term('title', title_tokens)
195 author_tokens = self.get_author_tokens(authors,
196 only_first_author=True)
197 if author_tokens:
198 q += ((' ' if q != '' else '') +
199 build_term('author', author_tokens))
200 t = 'search'
201 q = q.strip()
202 if isinstance(q, unicode):
203 q = q.encode('utf-8')
204 if not q:
205 return None
206 url = None
207 if t == "isbn":
208 url = ISBN_URL + q
209 elif t == 'subject':
210 url = SUBJECT_URL + q
211 else:
212 url = SEARCH_URL + urlencode({
213 'q': q,
214 })
215 if self.DOUBAN_API_KEY and self.DOUBAN_API_KEY != '':
216 url = url + "?apikey=" + self.DOUBAN_API_KEY
217 return url
218 # }}}
219
220 def download_cover(self, log, result_queue, abort, # {{{
221 title=None, authors=None, identifiers={}, timeout=30):
222 cached_url = self.get_cached_cover_url(identifiers)
223 if cached_url is None:
224 log.info('No cached cover found, running identify')
225 rq = Queue()
226 self.identify(log, rq, abort, title=title, authors=authors,
227 identifiers=identifiers)
228 if abort.is_set():
229 return
230 results = []
231 while True:
232 try:
233 results.append(rq.get_nowait())
234 except Empty:
235 break
236 results.sort(key=self.identify_results_keygen(
237 title=title, authors=authors, identifiers=identifiers))
238 for mi in results:
239 cached_url = self.get_cached_cover_url(mi.identifiers)
240 if cached_url is not None:
241 break
242 if cached_url is None:
243 log.info('No cover found')
244 return
245
246 if abort.is_set():
247 return
248 br = self.browser
249 log('Downloading cover from:', cached_url)
250 try:
251 cdata = br.open_novisit(cached_url, timeout=timeout).read()
252 if cdata:
253 result_queue.put((self, cdata))
254 except:
255 log.exception('Failed to download cover from:', cached_url)
256
257 # }}}
258
259 def get_cached_cover_url(self, identifiers): # {{{
260 url = None
261 db = identifiers.get('douban', None)
262 if db is None:
263 isbn = identifiers.get('isbn', None)
264 if isbn is not None:
265 db = self.cached_isbn_to_identifier(isbn)
266 if db is not None:
267 url = self.cached_identifier_to_cover_url(db)
268
269 return url
270 # }}}
271
272 def get_all_details(self, br, log, entries, abort, # {{{
273 result_queue, timeout):
274 for relevance, i in enumerate(entries):
275 try:
276 ans = to_metadata(br, log, i, timeout)
277 if isinstance(ans, Metadata):
278 ans.source_relevance = relevance
279 db = ans.identifiers['douban']
280 for isbn in getattr(ans, 'all_isbns', []):
281 self.cache_isbn_to_identifier(isbn, db)
282 if ans.has_douban_cover:
283 self.cache_identifier_to_cover_url(db,
284 ans.has_douban_cover)
285 self.clean_downloaded_metadata(ans)
286 result_queue.put(ans)
287 except:
288 log.exception(
289 'Failed to get metadata for identify entry:',
290 etree.tostring(i))
291 if abort.is_set():
292 break
293 # }}}
294
295 def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
296 identifiers={}, timeout=30):
297 query = self.create_query(log, title=title, authors=authors,
298 identifiers=identifiers)
299 if not query:
300 log.error('Insufficient metadata to construct query')
301 return
302 br = self.browser
303 try:
304 raw = br.open_novisit(query, timeout=timeout).read()
305 except Exception as e:
306 log.exception('Failed to make identify query: %r'%query)
307 return as_unicode(e)
308 try:
309 parser = etree.XMLParser(recover=True, no_network=True)
310 feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
311 strip_encoding_pats=True)[0], parser=parser)
312 entries = entry(feed)
313 except Exception as e:
314 log.exception('Failed to parse identify results')
315 return as_unicode(e)
316 if not entries and identifiers and title and authors and \
317 not abort.is_set():
318 return self.identify(log, result_queue, abort, title=title,
319 authors=authors, timeout=timeout)
320
321 # There is no point running these queries in threads as douban
322 # throttles requests returning 403 Forbidden errors
323 self.get_all_details(br, log, entries, abort, result_queue, timeout)
324
325 return None
326 # }}}
327
328if __name__ == '__main__': # tests {{{
329 # To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/douban.py
330 from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
331 title_test, authors_test)
332 test_identify_plugin(Douban.name,
333 [
334
335
336 (
337 {'identifiers':{'isbn': '9787536692930'}, 'title':'三体',
338 'authors':['刘慈欣']},
339 [title_test('三体', exact=True),
340 authors_test(['刘慈欣'])]
341 ),
342
343 (
344 {'title': 'Linux内核修炼之道', 'authors':['任桥伟']},
345 [title_test('Linux内核修炼之道', exact=False)]
346 ),
347 ])
348# }}}
349

Subscribers

People subscribed via source and target branches