Merge lp:~lifanxi/calibre/douban-metadata-plugin into lp:calibre

Proposed by Li Fanxi
Status: Merged
Merged at revision: 9207
Proposed branch: lp:~lifanxi/calibre/douban-metadata-plugin
Merge into: lp:calibre
Diff against target: 368 lines (+351/-2)
2 files modified
src/calibre/customize/builtins.py (+2/-2)
src/calibre/ebooks/metadata/sources/douban.py (+349/-0)
To merge this branch: bzr merge lp:~lifanxi/calibre/douban-metadata-plugin
Reviewer Review Type Date Requested Status
Kovid Goyal Pending
Review via email: mp+60315@code.launchpad.net

Description of the change

Updated the Douban.com metadata source plugin to work with the new metadata source framework in calibre 0.8.
You may add the logic to disable this plugin by default if the language is not set to zh.
Thanks.

To post a comment you must log in.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
=== modified file 'src/calibre/customize/builtins.py'
--- src/calibre/customize/builtins.py 2011-05-03 06:24:21 +0000
+++ src/calibre/customize/builtins.py 2011-05-08 14:53:23 +0000
@@ -628,8 +628,8 @@
628from calibre.ebooks.metadata.sources.openlibrary import OpenLibrary628from calibre.ebooks.metadata.sources.openlibrary import OpenLibrary
629from calibre.ebooks.metadata.sources.isbndb import ISBNDB629from calibre.ebooks.metadata.sources.isbndb import ISBNDB
630from calibre.ebooks.metadata.sources.overdrive import OverDrive630from calibre.ebooks.metadata.sources.overdrive import OverDrive
631631from calibre.ebooks.metadata.sources.douban import Douban
632plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive]632plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive, Douban]
633633
634# }}}634# }}}
635635
636636
=== added file 'src/calibre/ebooks/metadata/sources/douban.py'
--- src/calibre/ebooks/metadata/sources/douban.py 1970-01-01 00:00:00 +0000
+++ src/calibre/ebooks/metadata/sources/douban.py 2011-05-08 14:53:23 +0000
@@ -0,0 +1,349 @@
1#!/usr/bin/env python
2# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
3from __future__ import (unicode_literals, division, absolute_import,
4 print_function)
5
6__license__ = 'GPL v3'
7__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>; 2011, Li Fanxi <lifanxi@freemindworld.com>'
8__docformat__ = 'restructuredtext en'
9
10import time, hashlib
11from urllib import urlencode
12from functools import partial
13from Queue import Queue, Empty
14
15from lxml import etree
16
17from calibre.ebooks.metadata import check_isbn
18from calibre.ebooks.metadata.sources.base import Source
19from calibre.ebooks.metadata.book.base import Metadata
20from calibre.ebooks.chardet import xml_to_unicode
21from calibre.utils.date import parse_date, utcnow
22from calibre.utils.cleantext import clean_ascii_chars
23from calibre import as_unicode
24
25NAMESPACES = {
26 'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
27 'atom' : 'http://www.w3.org/2005/Atom',
28 'db': 'http://www.douban.com/xmlns/',
29 'gd': 'http://schemas.google.com/g/2005'
30 }
31XPath = partial(etree.XPath, namespaces=NAMESPACES)
32total_results = XPath('//openSearch:totalResults')
33start_index = XPath('//openSearch:startIndex')
34items_per_page = XPath('//openSearch:itemsPerPage')
35entry = XPath('//atom:entry')
36entry_id = XPath('descendant::atom:id')
37title = XPath('descendant::atom:title')
38description = XPath('descendant::atom:summary')
39publisher = XPath("descendant::db:attribute[@name='publisher']")
40isbn = XPath("descendant::db:attribute[@name='isbn13']")
41date = XPath("descendant::db:attribute[@name='pubdate']")
42creator = XPath("descendant::db:attribute[@name='author']")
43booktag = XPath("descendant::db:tag/attribute::name")
44rating = XPath("descendant::gd:rating/attribute::average")
45cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")
46
47def get_details(browser, url, timeout): # {{{
48 try:
49 raw = browser.open_novisit(url, timeout=timeout).read()
50 except Exception as e:
51 gc = getattr(e, 'getcode', lambda : -1)
52 if gc() != 403:
53 raise
54 # Douban is throttling us, wait a little
55 time.sleep(2)
56 raw = browser.open_novisit(url, timeout=timeout).read()
57
58 return raw
59# }}}
60
61def to_metadata(browser, log, entry_, timeout): # {{{
62 def get_text(extra, x):
63 try:
64 ans = x(extra)
65 if ans:
66 ans = ans[0].text
67 if ans and ans.strip():
68 return ans.strip()
69 except:
70 log.exception('Programming error:')
71 return None
72
73 id_url = entry_id(entry_)[0].text
74 douban_id = id_url.split('/')[-1]
75 title_ = ': '.join([x.text for x in title(entry_)]).strip()
76 authors = [x.text.strip() for x in creator(entry_) if x.text]
77 if not authors:
78 authors = [_('Unknown')]
79 if not id_url or not title:
80 # Silently discard this entry
81 return None
82
83 mi = Metadata(title_, authors)
84 mi.identifiers = {'douban':douban_id}
85 try:
86 raw = get_details(browser, id_url, timeout)
87 feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
88 strip_encoding_pats=True)[0])
89 extra = entry(feed)[0]
90 except:
91 log.exception('Failed to get additional details for', mi.title)
92 return mi
93 mi.comments = get_text(extra, description)
94 mi.publisher = get_text(extra, publisher)
95
96 # ISBN
97 isbns = []
98 for x in [t.text for t in isbn(extra)]:
99 if check_isbn(x):
100 isbns.append(x)
101 if isbns:
102 mi.isbn = sorted(isbns, key=len)[-1]
103 mi.all_isbns = isbns
104
105 # Tags
106 try:
107 btags = [x for x in booktag(extra) if x]
108 tags = []
109 for t in btags:
110 atags = [y.strip() for y in t.split('/')]
111 for tag in atags:
112 if tag not in tags:
113 tags.append(tag)
114 except:
115 log.exception('Failed to parse tags:')
116 tags = []
117 if tags:
118 mi.tags = [x.replace(',', ';') for x in tags]
119
120 # pubdate
121 pubdate = get_text(extra, date)
122 if pubdate:
123 try:
124 default = utcnow().replace(day=15)
125 mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
126 except:
127 log.error('Failed to parse pubdate %r'%pubdate)
128
129 # Ratings
130 if rating(extra):
131 try:
132 mi.rating = float(rating(extra)[0]) / 2.0
133 except:
134 log.exception('Failed to parse rating')
135 mi.rating = 0
136
137 # Cover
138 mi.has_douban_cover = None
139 u = cover_url(extra)
140 if u:
141 u = u[0].replace('/spic/', '/lpic/');
142 # If URL contains "book-default", the book doesn't have a cover
143 if u.find('book-default') == -1:
144 mi.has_douban_cover = u
145 return mi
146# }}}
147
148class Douban(Source):
149
150 name = 'Douban Books'
151 author = _('Li Fanxi')
152 version = (2, 0, 0)
153
154 description = _('Downloads metadata from Douban.com')
155
156 capabilities = frozenset(['identify', 'cover'])
157 touched_fields = frozenset(['title', 'authors', 'tags',
158 'pubdate', 'comments', 'publisher', 'identifier:isbn', 'rating',
159 'identifier:douban']) # language currently disabled
160 supports_gzip_transfer_encoding = True
161 cached_cover_url_is_reliable = True
162
163 DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d'
164 DOUBAN_BOOK_URL = 'http://book.douban.com/subject/%s/'
165
166 def get_book_url(self, identifiers): # {{{
167 db = identifiers.get('douban', None)
168 if db is not None:
169 return ('douban', db, self.DOUBAN_BOOK_URL%db)
170 else:
171 return None
172 # }}}
173
174 def create_query(self, log, title=None, authors=None, identifiers={}): # {{{
175 SEARCH_URL = 'http://api.douban.com/book/subjects?'
176 ISBN_URL = 'http://api.douban.com/book/subject/isbn/'
177 SUBJECT_URL = 'http://api.douban.com/book/subject/'
178
179 q = ''
180 t = None
181 isbn = check_isbn(identifiers.get('isbn', None))
182 subject = identifiers.get('douban', None)
183 if isbn is not None:
184 q = isbn
185 t = 'isbn'
186 elif subject is not None:
187 q = subject
188 t = 'subject'
189 elif title or authors:
190 def build_term(prefix, parts):
191 return ' '.join(x for x in parts)
192 title_tokens = list(self.get_title_tokens(title))
193 if title_tokens:
194 q += build_term('title', title_tokens)
195 author_tokens = self.get_author_tokens(authors,
196 only_first_author=True)
197 if author_tokens:
198 q += ((' ' if q != '' else '') +
199 build_term('author', author_tokens))
200 t = 'search'
201 q = q.strip()
202 if isinstance(q, unicode):
203 q = q.encode('utf-8')
204 if not q:
205 return None
206 url = None
207 if t == "isbn":
208 url = ISBN_URL + q
209 elif t == 'subject':
210 url = SUBJECT_URL + q
211 else:
212 url = SEARCH_URL + urlencode({
213 'q': q,
214 })
215 if self.DOUBAN_API_KEY and self.DOUBAN_API_KEY != '':
216 url = url + "?apikey=" + self.DOUBAN_API_KEY
217 return url
218 # }}}
219
220 def download_cover(self, log, result_queue, abort, # {{{
221 title=None, authors=None, identifiers={}, timeout=30):
222 cached_url = self.get_cached_cover_url(identifiers)
223 if cached_url is None:
224 log.info('No cached cover found, running identify')
225 rq = Queue()
226 self.identify(log, rq, abort, title=title, authors=authors,
227 identifiers=identifiers)
228 if abort.is_set():
229 return
230 results = []
231 while True:
232 try:
233 results.append(rq.get_nowait())
234 except Empty:
235 break
236 results.sort(key=self.identify_results_keygen(
237 title=title, authors=authors, identifiers=identifiers))
238 for mi in results:
239 cached_url = self.get_cached_cover_url(mi.identifiers)
240 if cached_url is not None:
241 break
242 if cached_url is None:
243 log.info('No cover found')
244 return
245
246 if abort.is_set():
247 return
248 br = self.browser
249 log('Downloading cover from:', cached_url)
250 try:
251 cdata = br.open_novisit(cached_url, timeout=timeout).read()
252 if cdata:
253 result_queue.put((self, cdata))
254 except:
255 log.exception('Failed to download cover from:', cached_url)
256
257 # }}}
258
259 def get_cached_cover_url(self, identifiers): # {{{
260 url = None
261 db = identifiers.get('douban', None)
262 if db is None:
263 isbn = identifiers.get('isbn', None)
264 if isbn is not None:
265 db = self.cached_isbn_to_identifier(isbn)
266 if db is not None:
267 url = self.cached_identifier_to_cover_url(db)
268
269 return url
270 # }}}
271
272 def get_all_details(self, br, log, entries, abort, # {{{
273 result_queue, timeout):
274 for relevance, i in enumerate(entries):
275 try:
276 ans = to_metadata(br, log, i, timeout)
277 if isinstance(ans, Metadata):
278 ans.source_relevance = relevance
279 db = ans.identifiers['douban']
280 for isbn in getattr(ans, 'all_isbns', []):
281 self.cache_isbn_to_identifier(isbn, db)
282 if ans.has_douban_cover:
283 self.cache_identifier_to_cover_url(db,
284 ans.has_douban_cover)
285 self.clean_downloaded_metadata(ans)
286 result_queue.put(ans)
287 except:
288 log.exception(
289 'Failed to get metadata for identify entry:',
290 etree.tostring(i))
291 if abort.is_set():
292 break
293 # }}}
294
295 def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
296 identifiers={}, timeout=30):
297 query = self.create_query(log, title=title, authors=authors,
298 identifiers=identifiers)
299 if not query:
300 log.error('Insufficient metadata to construct query')
301 return
302 br = self.browser
303 try:
304 raw = br.open_novisit(query, timeout=timeout).read()
305 except Exception as e:
306 log.exception('Failed to make identify query: %r'%query)
307 return as_unicode(e)
308 try:
309 parser = etree.XMLParser(recover=True, no_network=True)
310 feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
311 strip_encoding_pats=True)[0], parser=parser)
312 entries = entry(feed)
313 except Exception as e:
314 log.exception('Failed to parse identify results')
315 return as_unicode(e)
316 if not entries and identifiers and title and authors and \
317 not abort.is_set():
318 return self.identify(log, result_queue, abort, title=title,
319 authors=authors, timeout=timeout)
320
321 # There is no point running these queries in threads as douban
322 # throttles requests returning 403 Forbidden errors
323 self.get_all_details(br, log, entries, abort, result_queue, timeout)
324
325 return None
326 # }}}
327
328if __name__ == '__main__': # tests {{{
329 # To run these test use: calibre-debug -e src/calibre/ebooks/metadata/sources/douban.py
330 from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
331 title_test, authors_test)
332 test_identify_plugin(Douban.name,
333 [
334
335
336 (
337 {'identifiers':{'isbn': '9787536692930'}, 'title':'三体',
338 'authors':['刘慈欣']},
339 [title_test('三体', exact=True),
340 authors_test(['刘慈欣'])]
341 ),
342
343 (
344 {'title': 'Linux内核修炼之道', 'authors':['任桥伟']},
345 [title_test('Linux内核修炼之道', exact=False)]
346 ),
347 ])
348# }}}
349

Subscribers

People subscribed via source and target branches