Merge lp:~jderose/microfiber/db-dump into lp:microfiber

Proposed by Jason Gerard DeRose
Status: Merged
Merged at revision: 133
Proposed branch: lp:~jderose/microfiber/db-dump
Merge into: lp:microfiber
Diff against target: 368 lines (+260/-26)
3 files modified
doc/microfiber.rst (+29/-1)
microfiber.py (+107/-25)
test_microfiber.py (+124/-0)
To merge this branch: bzr merge lp:~jderose/microfiber/db-dump
Reviewer: microfiber dev
Status: Pending
Review via email: mp+119829@code.launchpad.net

Description of the change

Use the revised Database.dump() method like this:

>>> db.dump('foo.json')

Or gzip-compress the dump:

>>> db.dump('foo.json.gz')

As before, doc['_rev'] is deleted before each doc is written to the file. However, the attachments kwarg has been removed, and docs are now always dumped *without* their attachments; even the doc['_attachments'] stub is deleted when present. We'll probably add more flexibility here later, but for now this suits the needs of Novacut and Dmedia.
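
(Not part of this branch, just for illustration.) Loading such a dump back is straightforward because the file is a top-level JSON array of docs with '_rev' (and any '_attachments' stub) already removed. A minimal sketch, assuming a hypothetical helper named load_dump and a fresh target database:

    import json
    from gzip import GzipFile
    from io import TextIOWrapper

    from microfiber import Database

    def load_dump(filename, db):
        # Open as text, transparently handling the gzip-compressed variant:
        if filename.lower().endswith('.json.gz'):
            fp = TextIOWrapper(GzipFile(filename, 'rb'), encoding='utf-8')
        else:
            fp = open(filename, 'r', encoding='utf-8')
        docs = json.load(fp)
        # The dumped docs carry no '_rev', so save them into a fresh database:
        db.save_many(docs)

    db = Database('bar')
    db.ensure()
    load_dump('foo.json.gz', db)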

Also, two big performance improvements were made:

1) Docs are now requested 50 at a time via Database.get_many() (roughly a 4x improvement)

2) The CouchDB requests are made in a separate thread, so json.dump() can keep encoding while we wait on CouchDB (roughly a further 2x improvement on top of the above); a simplified sketch of this pattern follows below
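
Conceptually, improvement (2) is a small bounded producer/consumer pipeline. The sketch below shows the pattern only; the actual implementation in this branch is FakeList, SmartQueue and _fakelist_worker in microfiber.py (see the diff further down), and the names _fetch_worker/iter_docs here are purely illustrative:

    from queue import Queue
    from threading import Thread

    def _fetch_worker(db, rows, queue):
        # Producer: fetch docs from CouchDB in batches of 50 while the
        # consumer (json.dump) is still encoding the previous batch.
        for i in range(0, len(rows), 50):
            queue.put(db.get_many([r['id'] for r in rows[i:i+50]]))
        queue.put(None)  # sentinel: no more batches

    def iter_docs(db):
        rows = db.get('_all_docs')['rows']
        queue = Queue(2)  # bounded, so the worker stays at most 2 batches ahead
        thread = Thread(target=_fetch_worker, args=(db, rows, queue), daemon=True)
        thread.start()
        while True:
            docs = queue.get()
            if docs is None:
                break
            for doc in docs:
                yield doc
        thread.join()

Unlike this sketch, the real worker also forwards exceptions through the queue (that is what SmartQueue is for), so a failed request surfaces in the thread running json.dump() instead of being silently lost.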

Revision history for this message
David Jordan (dmj726) wrote:

Approved.

Preview Diff

=== modified file 'doc/microfiber.rst'
--- doc/microfiber.rst 2012-08-15 22:11:15 +0000
+++ doc/microfiber.rst 2012-08-16 04:26:20 +0000
@@ -308,7 +308,7 @@
 
 
 .. class:: Database(name, env='http://localhost:5984/')
 
     Makes requests relative to a CouchDB database URL.
 
     Create a :class:`Database` like this:
@@ -424,6 +424,34 @@
         *Note:* for subtle reasons that take a while to explain, you probably
         don't want to use this method. Instead use
         :meth:`Database.save_many()`.
+
+    .. method:: dump(filename)
+
+        Dump this database to regular JSON file *filename*.
+
+        For example:
+
+        >>> db = Database('foo') #doctest: +SKIP
+        >>> db.dump('foo.json') #doctest: +SKIP
+
+        Or if *filename* ends with ``'.json.gz'``, the file will be
+        gzip-compressed as it is written:
+
+        >>> db.dump('foo.json.gz') #doctest: +SKIP
+
+        CouchDB is a bit awkward in that its API doesn't offer a nice way to
+        make a request whose response is suitable for writing directly to a
+        file, without decoding/encoding. It would be nice if that dump could
+        be loaded directly from the file as well. One of the biggest issues is
+        that a dump really needs to have doc['_rev'] removed.
+
+        This method is a compromise on many fronts, but it was made with these
+        priorities:
+
+            1. Readability of the dumped JSON file
+
+            2. High performance and low memory usage, despite the fact that
+               we must encode and decode each doc
 
 
 
 
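
As a side note on priority 1 above: the readability of the dumped file comes down to the json.dump() options this branch uses (shown in microfiber.py below). A quick illustration with a made-up doc, not data from the branch:

    import json

    doc = {'_id': 'example', 'hello': 'мир', 'welcome': 'все'}  # made-up doc
    print(json.dumps(doc,
        ensure_ascii=False,       # keep non-ASCII text readable, not \uXXXX escapes
        sort_keys=True,           # stable, diff-friendly key order
        indent=4,
        separators=(',', ': '),   # no trailing space after commas when indenting
    ))

which prints:

    {
        "_id": "example",
        "hello": "мир",
        "welcome": "все"
    }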
=== modified file 'microfiber.py'
--- microfiber.py 2012-08-15 22:11:15 +0000
+++ microfiber.py 2012-08-16 04:26:20 +0000
@@ -40,15 +40,17 @@
 """
 
 from os import urandom
-from io import BufferedReader
+from io import BufferedReader, TextIOWrapper
 from base64 import b32encode, b64encode
 import json
+from gzip import GzipFile
 import time
 from hashlib import sha1
 import hmac
 from urllib.parse import urlparse, urlencode, quote_plus
 from http.client import HTTPConnection, HTTPSConnection, BadStatusLine
 import threading
+from queue import Queue
 import math
 
 
@@ -413,27 +415,74 @@
         super().__init__(msg.format(count))
 
 
+def _start_thread(target, *args):
+    thread = threading.Thread(target=target, args=args)
+    thread.daemon = True
+    thread.start()
+    return thread
+
+
+class SmartQueue(Queue):
+    """
+    Queue with custom get() that raises exception instances from the queue.
+    """
+
+    def get(self, block=True, timeout=None):
+        item = super().get(block, timeout)
+        if isinstance(item, Exception):
+            raise item
+        return item
+
+
+def _fakelist_worker(rows, db, queue):
+    try:
+        for doc_ids in id_slice_iter(rows, 50):
+            queue.put(db.get_many(doc_ids))
+        queue.put(None)
+    except Exception as e:
+        queue.put(e)
+
+
 class FakeList(list):
-    __slots__ = ('_count', '_iterable')
-
-    def __init__(self, count, iterable):
+    """
+    Trick ``json.dump()`` into doing memory-efficient incremental encoding.
+
+    This class is a hack to allow `Database.dump()` to dump a large database
+    while keeping the memory usage constant.
+
+    It also provides two hacks to improve the performance of `Database.dump()`:
+
+        1. Documents are retrieved 50 at a time using `Database.get_many()`
+
+        2. The CouchDB requests are made in a separate thread so `json.dump()`
+           can be busy doing work while we're waiting for a response
+    """
+
+    __slots__ = ('_rows', '_db')
+
+    def __init__(self, rows, db):
         super().__init__()
-        self._count = count
-        self._iterable = iterable
+        self._rows = rows
+        self._db = db
 
     def __len__(self):
-        return self._count
+        return len(self._rows)
 
     def __iter__(self):
-        for doc in self._iterable:
-            yield doc
-
-
-def iter_all_docs(rows, db, attachments=True):
-    for r in rows:
-        doc = db.get(r['id'], rev=r['value']['rev'], attachments=attachments)
-        del doc['_rev']
-        yield doc
+        queue = SmartQueue(2)
+        thread = _start_thread(_fakelist_worker, self._rows, self._db, queue)
+        while True:
+            docs = queue.get()
+            if docs is None:
+                break
+            for doc in docs:
+                del doc['_rev']
+                try:
+                    del doc['_attachments']
+                except KeyError:
+                    pass
+                yield doc
+        thread.join()  # Make sure reader() terminates
 
 
 class CouchBase(object):
@@ -876,12 +925,45 @@
             options['reduce'] = False
         return self.get('_design', design, '_view', view, **options)
 
-    def dump(self, fp, attachments=True):
-        rows = self.get('_all_docs')['rows']
-        iterable = iter_all_docs(rows, self, attachments)
-        docs = FakeList(len(rows), iterable)
-        json.dump({'docs': docs}, fp, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ': '))
-
-    def load(self, fp):
-        return self.post(fp, '_bulk_docs')
-
+    def dump(self, filename):
+        """
+        Dump this database to regular JSON file *filename*.
+
+        For example:
+
+        >>> db = Database('foo') #doctest: +SKIP
+        >>> db.dump('foo.json') #doctest: +SKIP
+
+        Or if *filename* ends with ``'.json.gz'``, the file will be
+        gzip-compressed as it is written:
+
+        >>> db.dump('foo.json.gz') #doctest: +SKIP
+
+        CouchDB is a bit awkward in that its API doesn't offer a nice way to
+        make a request whose response is suitable for writing directly to a
+        file, without decoding/encoding. It would be nice if that dump could
+        be loaded directly from the file as well. One of the biggest issues is
+        that a dump really needs to have doc['_rev'] removed.
+
+        This method is a compromise on many fronts, but it was made with these
+        priorities:
+
+            1. Readability of the dumped JSON file
+
+            2. High performance and low memory usage, despite the fact that
+               we must encode and decode each doc
+        """
+        if filename.lower().endswith('.json.gz'):
+            _fp = open(filename, 'wb')
+            fp = TextIOWrapper(GzipFile('docs.json', fileobj=_fp, mtime=1))
+        else:
+            fp = open(filename, 'w')
+        rows = self.get('_all_docs', endkey='_')['rows']
+        docs = FakeList(rows, self)
+        json.dump(docs, fp,
+            ensure_ascii=False,
+            sort_keys=True,
+            indent=4,
+            separators=(',', ': '),
+        )
+
 
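
One non-obvious detail in dump() above: a gzip member header normally records the current timestamp and the original filename, so two otherwise identical dumps would produce different .json.gz bytes. Writing through GzipFile('docs.json', fileobj=_fp, mtime=1) pins both fields, which is what the checksum assertions in test_dump() below rely on (neither the timestamp nor the output filename changes gz_checksum). A standalone sketch of the same trick; the helper name is illustrative:

    from gzip import GzipFile
    from io import TextIOWrapper

    def open_deterministic_gz(filename):
        # The fixed internal name and mtime=1 keep the gzip header constant
        # across runs, so equal JSON input yields byte-identical .gz output.
        _fp = open(filename, 'wb')
        return TextIOWrapper(
            GzipFile('docs.json', fileobj=_fp, mode='wb', mtime=1),
            encoding='utf-8',
        )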
=== modified file 'test_microfiber.py'
--- test_microfiber.py 2012-08-15 22:11:15 +0000
+++ test_microfiber.py 2012-08-16 04:26:20 +0000
@@ -30,6 +30,7 @@
 from base64 import b64encode, b64decode, b32encode, b32decode
 from copy import deepcopy
 import json
+import gzip
 import time
 import io
 import tempfile
@@ -58,6 +59,26 @@
 B32ALPHABET = frozenset('234567ABCDEFGHIJKLMNOPQRSTUVWXYZ')
 
 
+# A sample view from Dmedia:
+doc_type = """
+function(doc) {
+    emit(doc.type, null);
+}
+"""
+doc_time = """
+function(doc) {
+    emit(doc.time, null);
+}
+"""
+doc_design = {
+    '_id': '_design/doc',
+    'views': {
+        'type': {'map': doc_type, 'reduce': '_count'},
+        'time': {'map': doc_time},
+    },
+}
+
+
 def is_microfiber_id(_id):
     assert isinstance(_id, str)
     return (
@@ -1014,6 +1035,52 @@
         self.env = None
 
 
+class TestFakeList(LiveTestCase):
+    def test_init(self):
+        db = microfiber.Database('foo', self.env)
+        self.assertTrue(db.ensure())
+
+        # Test when DB is empty
+        rows = []
+        fake = microfiber.FakeList(rows, db)
+        self.assertIsInstance(fake, list)
+        self.assertIs(fake._rows, rows)
+        self.assertIs(fake._db, db)
+        self.assertEqual(len(fake), 0)
+        self.assertEqual(list(fake), [])
+
+        # Test when there are some docs
+        ids = sorted(test_id() for i in range(201))
+        orig = [
+            {'_id': _id, 'hello': 'мир', 'welcome': 'все'}
+            for _id in ids
+        ]
+        docs = deepcopy(orig)
+        db.save_many(docs)
+        rows = db.get('_all_docs')['rows']
+        fake = microfiber.FakeList(rows, db)
+        self.assertIsInstance(fake, list)
+        self.assertIs(fake._rows, rows)
+        self.assertIs(fake._db, db)
+        self.assertEqual(len(fake), 201)
+        self.assertEqual(list(fake), orig)
+
+        # Verify that _attachments get deleted
+        for doc in docs:
+            db.put_att('application/octet-stream', b'foobar', doc['_id'], 'baz',
+                rev=doc['_rev']
+            )
+        for _id in ids:
+            self.assertIn('_attachments', db.get(_id))
+        rows = db.get('_all_docs')['rows']
+        fake = microfiber.FakeList(rows, db)
+        self.assertIsInstance(fake, list)
+        self.assertIs(fake._rows, rows)
+        self.assertIs(fake._db, db)
+        self.assertEqual(len(fake), 201)
+        self.assertEqual(list(fake), orig)
+
+
 class TestCouchBaseLive(LiveTestCase):
     klass = microfiber.CouchBase
 
@@ -1676,3 +1743,60 @@
             db.get_many([ids[17], nope, ids[18]]),
             [docs[17], None, docs[18]]
         )
+
+    def test_dump(self):
+        db = microfiber.Database('foo', self.env)
+        self.assertTrue(db.ensure())
+        docs = [
+            {'_id': test_id(), 'hello': 'мир', 'welcome': 'все'}
+            for i in range(200)
+        ]
+        docs_s = microfiber.dumps(
+            sorted(docs, key=lambda d: d['_id']),
+            pretty=True
+        )
+        docs.append(deepcopy(doc_design))
+        checksum = md5(docs_s.encode('utf-8')).hexdigest()
+        db.save_many(docs)
+
+        # Test with .json
+        dst = path.join(self.tmpcouch.paths.bzr, 'foo.json')
+        db.dump(dst)
+        self.assertEqual(open(dst, 'r').read(), docs_s)
+        self.assertEqual(
+            md5(open(dst, 'rb').read()).hexdigest(),
+            checksum
+        )
+
+        # Test with .json.gz
+        dst = path.join(self.tmpcouch.paths.bzr, 'foo.json.gz')
+        db.dump(dst)
+        gz_checksum = md5(open(dst, 'rb').read()).hexdigest()
+        self.assertEqual(
+            md5(gzip.GzipFile(dst, 'rb').read()).hexdigest(),
+            checksum
+        )
+
+        # Test that timestamp doesn't change gz_checksum
+        time.sleep(2)
+        db.dump(dst)
+        self.assertEqual(
+            md5(open(dst, 'rb').read()).hexdigest(),
+            gz_checksum
+        )
+
+        # Test that filename doesn't change gz_checksum
+        dst = path.join(self.tmpcouch.paths.bzr, 'bar.json.gz')
+        db.dump(dst)
+        self.assertEqual(
+            md5(open(dst, 'rb').read()).hexdigest(),
+            gz_checksum
+        )
+
+        # Make sure .JSON.GZ also works, that case is ignored
+        dst = path.join(self.tmpcouch.paths.bzr, 'FOO.JSON.GZ')
+        db.dump(dst)
+        self.assertEqual(
+            md5(open(dst, 'rb').read()).hexdigest(),
+            gz_checksum
+        )
