Merge lp:~lifanxi/calibre/douban-metadata-plugin into lp:calibre

Proposed by Li Fanxi
Status: Merged
Merged at revision: 9207
Proposed branch: lp:~lifanxi/calibre/douban-metadata-plugin
Merge into: lp:calibre
Diff against target: 368 lines (+351/-2)
2 files modified
src/calibre/customize/builtins.py (+2/-2)
src/calibre/ebooks/metadata/sources/douban.py (+349/-0)
To merge this branch: bzr merge lp:~lifanxi/calibre/douban-metadata-plugin
Reviewer Review Type Date Requested Status
Kovid Goyal Pending
Review via email: mp+60315@code.launchpad.net

Description of the change

Updated the Douban.com metadata source plugin to work with the new metadata source framework in calibre 0.8.
You may add the logic to disable this plugin by default if the language is not set to zh.
Thanks.

To post a comment you must log in.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'src/calibre/customize/builtins.py'
2--- src/calibre/customize/builtins.py 2011-05-03 06:24:21 +0000
3+++ src/calibre/customize/builtins.py 2011-05-08 14:53:23 +0000
4@@ -628,8 +628,8 @@
5 from calibre.ebooks.metadata.sources.openlibrary import OpenLibrary
6 from calibre.ebooks.metadata.sources.isbndb import ISBNDB
7 from calibre.ebooks.metadata.sources.overdrive import OverDrive
8-
9-plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive]
10+from calibre.ebooks.metadata.sources.douban import Douban
11+plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive, Douban]
12
13 # }}}
14
15
16=== added file 'src/calibre/ebooks/metadata/sources/douban.py'
17--- src/calibre/ebooks/metadata/sources/douban.py 1970-01-01 00:00:00 +0000
18+++ src/calibre/ebooks/metadata/sources/douban.py 2011-05-08 14:53:23 +0000
19@@ -0,0 +1,349 @@
20+#!/usr/bin/env python
21+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
22+from __future__ import (unicode_literals, division, absolute_import,
23+ print_function)
24+
25+__license__ = 'GPL v3'
26+__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>; 2011, Li Fanxi <lifanxi@freemindworld.com>'
27+__docformat__ = 'restructuredtext en'
28+
29+import time, hashlib
30+from urllib import urlencode
31+from functools import partial
32+from Queue import Queue, Empty
33+
34+from lxml import etree
35+
36+from calibre.ebooks.metadata import check_isbn
37+from calibre.ebooks.metadata.sources.base import Source
38+from calibre.ebooks.metadata.book.base import Metadata
39+from calibre.ebooks.chardet import xml_to_unicode
40+from calibre.utils.date import parse_date, utcnow
41+from calibre.utils.cleantext import clean_ascii_chars
42+from calibre import as_unicode
43+
44+NAMESPACES = {
45+ 'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
46+ 'atom' : 'http://www.w3.org/2005/Atom',
47+ 'db': 'http://www.douban.com/xmlns/',
48+ 'gd': 'http://schemas.google.com/g/2005'
49+ }
50+XPath = partial(etree.XPath, namespaces=NAMESPACES)
51+total_results = XPath('//openSearch:totalResults')
52+start_index = XPath('//openSearch:startIndex')
53+items_per_page = XPath('//openSearch:itemsPerPage')
54+entry = XPath('//atom:entry')
55+entry_id = XPath('descendant::atom:id')
56+title = XPath('descendant::atom:title')
57+description = XPath('descendant::atom:summary')
58+publisher = XPath("descendant::db:attribute[@name='publisher']")
59+isbn = XPath("descendant::db:attribute[@name='isbn13']")
60+date = XPath("descendant::db:attribute[@name='pubdate']")
61+creator = XPath("descendant::db:attribute[@name='author']")
62+booktag = XPath("descendant::db:tag/attribute::name")
63+rating = XPath("descendant::gd:rating/attribute::average")
64+cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")
65+
66+def get_details(browser, url, timeout): # {{{
67+ try:
68+ raw = browser.open_novisit(url, timeout=timeout).read()
69+ except Exception as e:
70+ gc = getattr(e, 'getcode', lambda : -1)
71+ if gc() != 403:
72+ raise
73+ # Douban is throttling us, wait a little
74+ time.sleep(2)
75+ raw = browser.open_novisit(url, timeout=timeout).read()
76+
77+ return raw
78+# }}}
79+
80+def to_metadata(browser, log, entry_, timeout): # {{{
81+ def get_text(extra, x):
82+ try:
83+ ans = x(extra)
84+ if ans:
85+ ans = ans[0].text
86+ if ans and ans.strip():
87+ return ans.strip()
88+ except:
89+ log.exception('Programming error:')
90+ return None
91+
92+ id_url = entry_id(entry_)[0].text
93+ douban_id = id_url.split('/')[-1]
94+ title_ = ': '.join([x.text for x in title(entry_)]).strip()
95+ authors = [x.text.strip() for x in creator(entry_) if x.text]
96+ if not authors:
97+ authors = [_('Unknown')]
98+ if not id_url or not title:
99+ # Silently discard this entry
100+ return None
101+
102+ mi = Metadata(title_, authors)
103+ mi.identifiers = {'douban':douban_id}
104+ try:
105+ raw = get_details(browser, id_url, timeout)
106+ feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
107+ strip_encoding_pats=True)[0])
108+ extra = entry(feed)[0]
109+ except:
110+ log.exception('Failed to get additional details for', mi.title)
111+ return mi
112+ mi.comments = get_text(extra, description)
113+ mi.publisher = get_text(extra, publisher)
114+
115+ # ISBN
116+ isbns = []
117+ for x in [t.text for t in isbn(extra)]:
118+ if check_isbn(x):
119+ isbns.append(x)
120+ if isbns:
121+ mi.isbn = sorted(isbns, key=len)[-1]
122+ mi.all_isbns = isbns
123+
124+ # Tags
125+ try:
126+ btags = [x for x in booktag(extra) if x]
127+ tags = []
128+ for t in btags:
129+ atags = [y.strip() for y in t.split('/')]
130+ for tag in atags:
131+ if tag not in tags:
132+ tags.append(tag)
133+ except:
134+ log.exception('Failed to parse tags:')
135+ tags = []
136+ if tags:
137+ mi.tags = [x.replace(',', ';') for x in tags]
138+
139+ # pubdate
140+ pubdate = get_text(extra, date)
141+ if pubdate:
142+ try:
143+ default = utcnow().replace(day=15)
144+ mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
145+ except:
146+ log.error('Failed to parse pubdate %r'%pubdate)
147+
148+ # Ratings
149+ if rating(extra):
150+ try:
151+ mi.rating = float(rating(extra)[0]) / 2.0
152+ except:
153+ log.exception('Failed to parse rating')
154+ mi.rating = 0
155+
156+ # Cover
157+ mi.has_douban_cover = None
158+ u = cover_url(extra)
159+ if u:
160+ u = u[0].replace('/spic/', '/lpic/');
161+ # If URL contains "book-default", the book doesn't have a cover
162+ if u.find('book-default') == -1:
163+ mi.has_douban_cover = u
164+ return mi
165+# }}}
166+
167+class Douban(Source):
168+
169+ name = 'Douban Books'
170+ author = _('Li Fanxi')
171+ version = (2, 0, 0)
172+
173+ description = _('Downloads metadata from Douban.com')
174+
175+ capabilities = frozenset(['identify', 'cover'])
176+ touched_fields = frozenset(['title', 'authors', 'tags',
177+ 'pubdate', 'comments', 'publisher', 'identifier:isbn', 'rating',
178+ 'identifier:douban']) # language currently disabled
179+ supports_gzip_transfer_encoding = True
180+ cached_cover_url_is_reliable = True
181+
182+ DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d'
183+ DOUBAN_BOOK_URL = 'http://book.douban.com/subject/%s/'
184+
185+ def get_book_url(self, identifiers): # {{{
186+ db = identifiers.get('douban', None)
187+ if db is not None:
188+ return ('douban', db, self.DOUBAN_BOOK_URL%db)
189+ else:
190+ return None
191+ # }}}
192+
193+ def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
194+ SEARCH_URL = 'http://api.douban.com/book/subjects?'
195+ ISBN_URL = 'http://api.douban.com/book/subject/isbn/'
196+ SUBJECT_URL = 'http://api.douban.com/book/subject/'
197+
198+ q = ''
199+ t = None
200+ isbn = check_isbn(identifiers.get('isbn', None))
201+ subject = identifiers.get('douban', None)
202+ if isbn is not None:
203+ q = isbn
204+ t = 'isbn'
205+ elif subject is not None:
206+ q = subject
207+ t = 'subject'
208+ elif title or authors:
209+ def build_term(prefix, parts):
210+ return ' '.join(x for x in parts)
211+ title_tokens = list(self.get_title_tokens(title))
212+ if title_tokens:
213+ q += build_term('title', title_tokens)
214+ author_tokens = self.get_author_tokens(authors,
215+ only_first_author=True)
216+ if author_tokens:
217+ q += ((' ' if q != '' else '') +
218+ build_term('author', author_tokens))
219+ t = 'search'
220+ q = q.strip()
221+ if isinstance(q, unicode):
222+ q = q.encode('utf-8')
223+ if not q:
224+ return None
225+ url = None
226+ if t == "isbn":
227+ url = ISBN_URL + q
228+ elif t == 'subject':
229+ url = SUBJECT_URL + q
230+ else:
231+ url = SEARCH_URL + urlencode({
232+ 'q': q,
233+ })
234+ if self.DOUBAN_API_KEY and self.DOUBAN_API_KEY != '':
235+ url = url + "?apikey=" + self.DOUBAN_API_KEY
236+ return url
237+ # }}}
238+
239+ def download_cover(self, log, result_queue, abort, # {{{
240+ title=None, authors=None, identifiers={}, timeout=30):
241+ cached_url = self.get_cached_cover_url(identifiers)
242+ if cached_url is None:
243+ log.info('No cached cover found, running identify')
244+ rq = Queue()
245+ self.identify(log, rq, abort, title=title, authors=authors,
246+ identifiers=identifiers)
247+ if abort.is_set():
248+ return
249+ results = []
250+ while True:
251+ try:
252+ results.append(rq.get_nowait())
253+ except Empty:
254+ break
255+ results.sort(key=self.identify_results_keygen(
256+ title=title, authors=authors, identifiers=identifiers))
257+ for mi in results:
258+ cached_url = self.get_cached_cover_url(mi.identifiers)
259+ if cached_url is not None:
260+ break
261+ if cached_url is None:
262+ log.info('No cover found')
263+ return
264+
265+ if abort.is_set():
266+ return
267+ br = self.browser
268+ log('Downloading cover from:', cached_url)
269+ try:
270+ cdata = br.open_novisit(cached_url, timeout=timeout).read()
271+ if cdata:
272+ result_queue.put((self, cdata))
273+ except:
274+ log.exception('Failed to download cover from:', cached_url)
275+
276+ # }}}
277+
278+ def get_cached_cover_url(self, identifiers): # {{{
279+ url = None
280+ db = identifiers.get('douban', None)
281+ if db is None:
282+ isbn = identifiers.get('isbn', None)
283+ if isbn is not None:
284+ db = self.cached_isbn_to_identifier(isbn)
285+ if db is not None:
286+ url = self.cached_identifier_to_cover_url(db)
287+
288+ return url
289+ # }}}
290+
291+ def get_all_details(self, br, log, entries, abort, # {{{
292+ result_queue, timeout):
293+ for relevance, i in enumerate(entries):
294+ try:
295+ ans = to_metadata(br, log, i, timeout)
296+ if isinstance(ans, Metadata):
297+ ans.source_relevance = relevance
298+ db = ans.identifiers['douban']
299+ for isbn in getattr(ans, 'all_isbns', []):
300+ self.cache_isbn_to_identifier(isbn, db)
301+ if ans.has_douban_cover:
302+ self.cache_identifier_to_cover_url(db,
303+ ans.has_douban_cover)
304+ self.clean_downloaded_metadata(ans)
305+ result_queue.put(ans)
306+ except:
307+ log.exception(
308+ 'Failed to get metadata for identify entry:',
309+ etree.tostring(i))
310+ if abort.is_set():
311+ break
312+ # }}}
313+
314+ def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
315+ identifiers={}, timeout=30):
316+ query = self.create_query(log, title=title, authors=authors,
317+ identifiers=identifiers)
318+ if not query:
319+ log.error('Insufficient metadata to construct query')
320+ return
321+ br = self.browser
322+ try:
323+ raw = br.open_novisit(query, timeout=timeout).read()
324+ except Exception as e:
325+ log.exception('Failed to make identify query: %r'%query)
326+ return as_unicode(e)
327+ try:
328+ parser = etree.XMLParser(recover=True, no_network=True)
329+ feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
330+ strip_encoding_pats=True)[0], parser=parser)
331+ entries = entry(feed)
332+ except Exception as e:
333+ log.exception('Failed to parse identify results')
334+ return as_unicode(e)
335+ if not entries and identifiers and title and authors and \
336+ not abort.is_set():
337+ return self.identify(log, result_queue, abort, title=title,
338+ authors=authors, timeout=timeout)
339+
340+ # There is no point running these queries in threads as douban
341+ # throttles requests returning 403 Forbidden errors
342+ self.get_all_details(br, log, entries, abort, result_queue, timeout)
343+
344+ return None
345+ # }}}
346+
347+if __name__ == '__main__': # tests {{{
348+ # To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/douban.py
349+ from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
350+ title_test, authors_test)
351+ test_identify_plugin(Douban.name,
352+ [
353+
354+
355+ (
356+ {'identifiers':{'isbn': '9787536692930'}, 'title':'三体',
357+ 'authors':['刘慈欣']},
358+ [title_test('三体', exact=True),
359+ authors_test(['刘慈欣'])]
360+ ),
361+
362+ (
363+ {'title': 'Linux内核修炼之道', 'authors':['任桥伟']},
364+ [title_test('Linux内核修炼之道', exact=False)]
365+ ),
366+ ])
367+# }}}
368+

Subscribers

People subscribed via source and target branches