Merge lp:~jderose/microfiber/db-dump into lp:microfiber

Proposed by Jason Gerard DeRose
Status: Merged
Merged at revision: 133
Proposed branch: lp:~jderose/microfiber/db-dump
Merge into: lp:microfiber
Diff against target: 368 lines (+260/-26)
3 files modified
doc/microfiber.rst (+29/-1)
microfiber.py (+107/-25)
test_microfiber.py (+124/-0)
To merge this branch: bzr merge lp:~jderose/microfiber/db-dump
Reviewer: microfiber dev
Status: Pending
Review via email: mp+119829@code.launchpad.net

Description of the change

Use the revised Database.dump() method like this:

>>> db.dump('foo.json')

Or gzip-compress the dump:

>>> db.dump('foo.json.gz')

As before, doc['_rev'] is deleted before each doc is written to the file. However, the attachments kwarg has been removed, and docs are now always dumped *without* their attachments; even the doc['_attachments'] stub is deleted when present. We'll probably add more flexibility here later, but for now this suits the needs of Novacut and Dmedia.
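
(Not part of this branch, just for illustration.) Loading such a dump back is straightforward because the file is a top-level JSON array of docs with '_rev' (and any '_attachments' stub) already removed. A minimal sketch, assuming a hypothetical helper named load_dump and a fresh target database:

    import json
    from gzip import GzipFile
    from io import TextIOWrapper

    from microfiber import Database

    def load_dump(filename, db):
        # Open as text, transparently handling the gzip-compressed variant:
        if filename.lower().endswith('.json.gz'):
            fp = TextIOWrapper(GzipFile(filename, 'rb'), encoding='utf-8')
        else:
            fp = open(filename, 'r', encoding='utf-8')
        docs = json.load(fp)
        # The dumped docs carry no '_rev', so save them into a fresh database:
        db.save_many(docs)

    db = Database('bar')
    db.ensure()
    load_dump('foo.json.gz', db)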

Also, two big performance improvements were made:

1) Docs are now requested 50 at a time via Database.get_many() (roughly a 4x improvement)

2) The CouchDB requests are made in a separate thread, so json.dump() can keep encoding while we wait on CouchDB (roughly a further 2x improvement on top of the above); a simplified sketch of this pattern follows below
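
Conceptually, improvement (2) is a small bounded producer/consumer pipeline. The sketch below shows the pattern only; the actual implementation in this branch is FakeList, SmartQueue and _fakelist_worker in microfiber.py (see the diff further down), and the names _fetch_worker/iter_docs here are purely illustrative:

    from queue import Queue
    from threading import Thread

    def _fetch_worker(db, rows, queue):
        # Producer: fetch docs from CouchDB in batches of 50 while the
        # consumer (json.dump) is still encoding the previous batch.
        for i in range(0, len(rows), 50):
            queue.put(db.get_many([r['id'] for r in rows[i:i+50]]))
        queue.put(None)  # sentinel: no more batches

    def iter_docs(db):
        rows = db.get('_all_docs')['rows']
        queue = Queue(2)  # bounded, so the worker stays at most 2 batches ahead
        thread = Thread(target=_fetch_worker, args=(db, rows, queue), daemon=True)
        thread.start()
        while True:
            docs = queue.get()
            if docs is None:
                break
            for doc in docs:
                yield doc
        thread.join()

Unlike this sketch, the real worker also forwards exceptions through the queue (that is what SmartQueue is for), so a failed request surfaces in the thread running json.dump() instead of being silently lost.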

Revision history for this message
David Jordan (dmj726) wrote:

Approved.

Preview Diff

=== modified file 'doc/microfiber.rst'
--- doc/microfiber.rst 2012-08-15 22:11:15 +0000
+++ doc/microfiber.rst 2012-08-16 04:26:20 +0000
@@ -308,7 +308,7 @@
 
 
 .. class:: Database(name, env='http://localhost:5984/')
 
     Makes requests relative to a CouchDB database URL.
 
     Create a :class:`Database` like this:
@@ -424,6 +424,34 @@
         *Note:* for subtle reasons that take a while to explain, you probably
         don't want to use this method. Instead use
         :meth:`Database.save_many()`.
+
+    .. method:: dump(filename)
+
+        Dump this database to regular JSON file *filename*.
+
+        For example:
+
+        >>> db = Database('foo') #doctest: +SKIP
+        >>> db.dump('foo.json') #doctest: +SKIP
+
+        Or if *filename* ends with ``'.json.gz'``, the file will be
+        gzip-compressed as it is written:
+
+        >>> db.dump('foo.json.gz') #doctest: +SKIP
+
+        CouchDB is a bit awkward in that its API doesn't offer a nice way to
+        make a request whose response is suitable for writing directly to a
+        file, without decoding/encoding. It would be nice if that dump could
+        be loaded directly from the file as well. One of the biggest issues is
+        that a dump really needs to have doc['_rev'] removed.
+
+        This method is a compromise on many fronts, but it was made with these
+        priorities:
+
+            1. Readability of the dumped JSON file
+
+            2. High performance and low memory usage, despite the fact that
+               we must encode and decode each doc
 
 
 
 
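
As a side note on priority 1 above: the readability of the dumped file comes down to the json.dump() options this branch uses (shown in microfiber.py below). A quick illustration with a made-up doc, not data from the branch:

    import json

    doc = {'_id': 'example', 'hello': 'мир', 'welcome': 'все'}  # made-up doc
    print(json.dumps(doc,
        ensure_ascii=False,       # keep non-ASCII text readable, not \uXXXX escapes
        sort_keys=True,           # stable, diff-friendly key order
        indent=4,
        separators=(',', ': '),   # no trailing space after commas when indenting
    ))

which prints:

    {
        "_id": "example",
        "hello": "мир",
        "welcome": "все"
    }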
=== modified file 'microfiber.py'
--- microfiber.py 2012-08-15 22:11:15 +0000
+++ microfiber.py 2012-08-16 04:26:20 +0000
@@ -40,15 +40,17 @@
 """
 
 from os import urandom
-from io import BufferedReader
+from io import BufferedReader, TextIOWrapper
 from base64 import b32encode, b64encode
 import json
+from gzip import GzipFile
 import time
 from hashlib import sha1
 import hmac
 from urllib.parse import urlparse, urlencode, quote_plus
 from http.client import HTTPConnection, HTTPSConnection, BadStatusLine
 import threading
+from queue import Queue
 import math
 
 
@@ -413,27 +415,74 @@
         super().__init__(msg.format(count))
 
 
+def _start_thread(target, *args):
+    thread = threading.Thread(target=target, args=args)
+    thread.daemon = True
+    thread.start()
+    return thread
+
+
+class SmartQueue(Queue):
+    """
+    Queue with custom get() that raises exception instances from the queue.
+    """
+
+    def get(self, block=True, timeout=None):
+        item = super().get(block, timeout)
+        if isinstance(item, Exception):
+            raise item
+        return item
+
+
+def _fakelist_worker(rows, db, queue):
+    try:
+        for doc_ids in id_slice_iter(rows, 50):
+            queue.put(db.get_many(doc_ids))
+        queue.put(None)
+    except Exception as e:
+        queue.put(e)
+
+
 class FakeList(list):
-    __slots__ = ('_count', '_iterable')
-
-    def __init__(self, count, iterable):
+    """
+    Trick ``json.dump()`` into doing memory-efficient incremental encoding.
+
+    This class is a hack to allow `Database.dump()` to dump a large database
+    while keeping the memory usage constant.
+
+    It also provides two hacks to improve the performance of `Database.dump()`:
+
+        1. Documents are retrieved 50 at a time using `Database.get_many()`
+
+        2. The CouchDB requests are made in a separate thread so `json.dump()`
+           can be busy doing work while we're waiting for a response
+    """
+
+    __slots__ = ('_rows', '_db')
+
+    def __init__(self, rows, db):
         super().__init__()
-        self._count = count
-        self._iterable = iterable
+        self._rows = rows
+        self._db = db
 
     def __len__(self):
-        return self._count
+        return len(self._rows)
 
     def __iter__(self):
-        for doc in self._iterable:
-            yield doc
-
-
-def iter_all_docs(rows, db, attachments=True):
-    for r in rows:
-        doc = db.get(r['id'], rev=r['value']['rev'], attachments=attachments)
-        del doc['_rev']
-        yield doc
+        queue = SmartQueue(2)
+        thread = _start_thread(_fakelist_worker, self._rows, self._db, queue)
+        while True:
+            docs = queue.get()
+            if docs is None:
+                break
+            for doc in docs:
+                del doc['_rev']
+                try:
+                    del doc['_attachments']
+                except KeyError:
+                    pass
+                yield doc
+        thread.join()  # Make sure reader() terminates
 
 
 class CouchBase(object):
@@ -876,12 +925,45 @@
             options['reduce'] = False
         return self.get('_design', design, '_view', view, **options)
 
-    def dump(self, fp, attachments=True):
-        rows = self.get('_all_docs')['rows']
-        iterable = iter_all_docs(rows, self, attachments)
-        docs = FakeList(len(rows), iterable)
-        json.dump({'docs': docs}, fp, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ': '))
-
-    def load(self, fp):
-        return self.post(fp, '_bulk_docs')
-
+    def dump(self, filename):
+        """
+        Dump this database to regular JSON file *filename*.
+
+        For example:
+
+        >>> db = Database('foo') #doctest: +SKIP
+        >>> db.dump('foo.json') #doctest: +SKIP
+
+        Or if *filename* ends with ``'.json.gz'``, the file will be
+        gzip-compressed as it is written:
+
+        >>> db.dump('foo.json.gz') #doctest: +SKIP
+
+        CouchDB is a bit awkward in that its API doesn't offer a nice way to
+        make a request whose response is suitable for writing directly to a
+        file, without decoding/encoding. It would be nice if that dump could
+        be loaded directly from the file as well. One of the biggest issues is
+        that a dump really needs to have doc['_rev'] removed.
+
+        This method is a compromise on many fronts, but it was made with these
+        priorities:
+
+            1. Readability of the dumped JSON file
+
+            2. High performance and low memory usage, despite the fact that
+               we must encode and decode each doc
+        """
+        if filename.lower().endswith('.json.gz'):
+            _fp = open(filename, 'wb')
+            fp = TextIOWrapper(GzipFile('docs.json', fileobj=_fp, mtime=1))
+        else:
+            fp = open(filename, 'w')
+        rows = self.get('_all_docs', endkey='_')['rows']
+        docs = FakeList(rows, self)
+        json.dump(docs, fp,
+            ensure_ascii=False,
+            sort_keys=True,
+            indent=4,
+            separators=(',', ': '),
+        )
+
 
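
One non-obvious detail in dump() above: a gzip member header normally records the current timestamp and the original filename, so two otherwise identical dumps would produce different .json.gz bytes. Writing through GzipFile('docs.json', fileobj=_fp, mtime=1) pins both fields, which is what the checksum assertions in test_dump() below rely on (neither the timestamp nor the output filename changes gz_checksum). A standalone sketch of the same trick; the helper name is illustrative:

    from gzip import GzipFile
    from io import TextIOWrapper

    def open_deterministic_gz(filename):
        # The fixed internal name and mtime=1 keep the gzip header constant
        # across runs, so equal JSON input yields byte-identical .gz output.
        _fp = open(filename, 'wb')
        return TextIOWrapper(
            GzipFile('docs.json', fileobj=_fp, mode='wb', mtime=1),
            encoding='utf-8',
        )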
=== modified file 'test_microfiber.py'
--- test_microfiber.py 2012-08-15 22:11:15 +0000
+++ test_microfiber.py 2012-08-16 04:26:20 +0000
@@ -30,6 +30,7 @@
 from base64 import b64encode, b64decode, b32encode, b32decode
 from copy import deepcopy
 import json
+import gzip
 import time
 import io
 import tempfile
@@ -58,6 +59,26 @@
 B32ALPHABET = frozenset('234567ABCDEFGHIJKLMNOPQRSTUVWXYZ')
 
 
+# A sample view from Dmedia:
+doc_type = """
+function(doc) {
+    emit(doc.type, null);
+}
+"""
+doc_time = """
+function(doc) {
+    emit(doc.time, null);
+}
+"""
+doc_design = {
+    '_id': '_design/doc',
+    'views': {
+        'type': {'map': doc_type, 'reduce': '_count'},
+        'time': {'map': doc_time},
+    },
+}
+
+
 def is_microfiber_id(_id):
     assert isinstance(_id, str)
     return (
@@ -1014,6 +1035,52 @@
         self.env = None
 
 
+class TestFakeList(LiveTestCase):
+    def test_init(self):
+        db = microfiber.Database('foo', self.env)
+        self.assertTrue(db.ensure())
+
+        # Test when DB is empty
+        rows = []
+        fake = microfiber.FakeList(rows, db)
+        self.assertIsInstance(fake, list)
+        self.assertIs(fake._rows, rows)
+        self.assertIs(fake._db, db)
+        self.assertEqual(len(fake), 0)
+        self.assertEqual(list(fake), [])
+
+        # Test when there are some docs
+        ids = sorted(test_id() for i in range(201))
+        orig = [
+            {'_id': _id, 'hello': 'мир', 'welcome': 'все'}
+            for _id in ids
+        ]
+        docs = deepcopy(orig)
+        db.save_many(docs)
+        rows = db.get('_all_docs')['rows']
+        fake = microfiber.FakeList(rows, db)
+        self.assertIsInstance(fake, list)
+        self.assertIs(fake._rows, rows)
+        self.assertIs(fake._db, db)
+        self.assertEqual(len(fake), 201)
+        self.assertEqual(list(fake), orig)
+
+        # Verify that _attachments get deleted
+        for doc in docs:
+            db.put_att('application/octet-stream', b'foobar', doc['_id'], 'baz',
+                rev=doc['_rev']
+            )
+        for _id in ids:
+            self.assertIn('_attachments', db.get(_id))
+        rows = db.get('_all_docs')['rows']
+        fake = microfiber.FakeList(rows, db)
+        self.assertIsInstance(fake, list)
+        self.assertIs(fake._rows, rows)
+        self.assertIs(fake._db, db)
+        self.assertEqual(len(fake), 201)
+        self.assertEqual(list(fake), orig)
+
+
 class TestCouchBaseLive(LiveTestCase):
     klass = microfiber.CouchBase
 
@@ -1676,3 +1743,60 @@
             db.get_many([ids[17], nope, ids[18]]),
             [docs[17], None, docs[18]]
         )
+
+    def test_dump(self):
+        db = microfiber.Database('foo', self.env)
+        self.assertTrue(db.ensure())
+        docs = [
+            {'_id': test_id(), 'hello': 'мир', 'welcome': 'все'}
+            for i in range(200)
+        ]
+        docs_s = microfiber.dumps(
+            sorted(docs, key=lambda d: d['_id']),
+            pretty=True
+        )
+        docs.append(deepcopy(doc_design))
+        checksum = md5(docs_s.encode('utf-8')).hexdigest()
+        db.save_many(docs)
+
+        # Test with .json
+        dst = path.join(self.tmpcouch.paths.bzr, 'foo.json')
+        db.dump(dst)
+        self.assertEqual(open(dst, 'r').read(), docs_s)
+        self.assertEqual(
+            md5(open(dst, 'rb').read()).hexdigest(),
+            checksum
+        )
+
+        # Test with .json.gz
+        dst = path.join(self.tmpcouch.paths.bzr, 'foo.json.gz')
+        db.dump(dst)
+        gz_checksum = md5(open(dst, 'rb').read()).hexdigest()
+        self.assertEqual(
+            md5(gzip.GzipFile(dst, 'rb').read()).hexdigest(),
+            checksum
+        )
+
+        # Test that timestamp doesn't change gz_checksum
+        time.sleep(2)
+        db.dump(dst)
+        self.assertEqual(
+            md5(open(dst, 'rb').read()).hexdigest(),
+            gz_checksum
+        )
+
+        # Test that filename doesn't change gz_checksum
+        dst = path.join(self.tmpcouch.paths.bzr, 'bar.json.gz')
+        db.dump(dst)
+        self.assertEqual(
+            md5(open(dst, 'rb').read()).hexdigest(),
+            gz_checksum
+        )
+
+        # Make sure .JSON.GZ also works, that case is ignored
+        dst = path.join(self.tmpcouch.paths.bzr, 'FOO.JSON.GZ')
+        db.dump(dst)
+        self.assertEqual(
+            md5(open(dst, 'rb').read()).hexdigest(),
+            gz_checksum
+        )
