Merge lp:~jelmer/brz/big-file-vf into lp:brz

Proposed by Jelmer Vernooij
Status: Merged
Approved by: Jelmer Vernooij
Approved revision: no longer in the source branch.
Merge reported by: The Breezy Bot
Merged at revision: not available
Proposed branch: lp:~jelmer/brz/big-file-vf
Merge into: lp:brz
Diff against target: 255 lines (+133/-15)
6 files modified
breezy/bzr/groupcompress.py (+45/-10)
breezy/bzr/knit.py (+15/-0)
breezy/bzr/versionedfile.py (+42/-0)
breezy/bzr/vf_repository.py (+3/-2)
breezy/tests/blackbox/test_big_file.py (+8/-3)
breezy/tests/per_versionedfile.py (+20/-0)
To merge this branch: bzr merge lp:~jelmer/brz/big-file-vf
Reviewer Review Type Date Requested Status
Martin Packman Approve
Review via email: mp+368882@code.launchpad.net

Commit message

Add a VersionedFiles.add_chunks method that takes a chunk_iter.

Description of the change

Add a VersionedFiles.add_chunks method that takes a chunk_iter.

Improve big file tests:
* Use RLIMIT_AS rather than RLIMIT_DATA
* Skip the tests if there is not enough disk space.

This is in preparation for improved support for large files.

To post a comment you must log in.
Revision history for this message
Martin Packman (gz) wrote :

Looks good, one teeny test nit inline.

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
=== modified file 'breezy/bzr/groupcompress.py'
--- breezy/bzr/groupcompress.py 2018-11-17 16:53:10 +0000
+++ breezy/bzr/groupcompress.py 2019-06-16 15:55:45 +0000
@@ -1295,18 +1295,56 @@
1295 back to future add_lines calls in the parent_texts dictionary.1295 back to future add_lines calls in the parent_texts dictionary.
1296 """1296 """
1297 self._index._check_write_ok()1297 self._index._check_write_ok()
1298 self._check_add(key, lines, random_id, check_content)1298 if check_content:
1299 self._check_lines_not_unicode(lines)
1300 self._check_lines_are_lines(lines)
1301 return self.add_chunks(
1302 key, parents, iter(lines), parent_texts, left_matching_blocks,
1303 nostore_sha, random_id)
1304
1305 def add_chunks(self, key, parents, chunk_iter, parent_texts=None,
1306 left_matching_blocks=None, nostore_sha=None, random_id=False):
1307 """Add a text to the store.
1308
1309 :param key: The key tuple of the text to add.
1310 :param parents: The parents key tuples of the text to add.
1311 :param chunk_iter: An iterator over chunks. Chunks
1312 don't need to be file lines; the only requirement is that they
1313 are bytes.
1314 :param parent_texts: An optional dictionary containing the opaque
1315 representations of some or all of the parents of version_id to
1316 allow delta optimisations. VERY IMPORTANT: the texts must be those
1317 returned by add_lines or data corruption can be caused.
1318 :param left_matching_blocks: a hint about which areas are common
1319 between the text and its left-hand-parent. The format is
1320 the SequenceMatcher.get_matching_blocks format.
1321 :param nostore_sha: Raise ExistingContent and do not add the lines to
1322 the versioned file if the digest of the lines matches this.
1323 :param random_id: If True a random id has been selected rather than
1324 an id determined by some deterministic process such as a converter
1325 from a foreign VCS. When True the backend may choose not to check
1326 for uniqueness of the resulting key within the versioned file, so
1327 this should only be done when the result is expected to be unique
1328 anyway.
1329 :return: The text sha1, the number of bytes in the text, and an opaque
1330 representation of the inserted version which can be provided
1331 back to future add_lines calls in the parent_texts dictionary.
1332 """
1333 self._index._check_write_ok()
1334 self._check_add(key, random_id)
1299 if parents is None:1335 if parents is None:
1300 # The caller might pass None if there is no graph data, but kndx1336 # The caller might pass None if there is no graph data, but kndx
1301 # indexes can't directly store that, so we give them1337 # indexes can't directly store that, so we give them
1302 # an empty tuple instead.1338 # an empty tuple instead.
1303 parents = ()1339 parents = ()
1304 # double handling for now. Make it work until then.1340 # double handling for now. Make it work until then.
1305 length = sum(map(len, lines))1341 # TODO(jelmer): problematic for big files: let's not keep the list of
1306 record = ChunkedContentFactory(key, parents, None, lines)1342 # chunks in memory.
1307 sha1 = list(self._insert_record_stream([record], random_id=random_id,1343 chunks = list(chunk_iter)
1308 nostore_sha=nostore_sha))[0]1344 record = ChunkedContentFactory(key, parents, None, chunks)
1309 return sha1, length, None1345 sha1 = list(self._insert_record_stream(
1346 [record], random_id=random_id, nostore_sha=nostore_sha))[0]
1347 return sha1, sum(map(len, chunks)), None
13101348
1311 def add_fallback_versioned_files(self, a_versioned_files):1349 def add_fallback_versioned_files(self, a_versioned_files):
1312 """Add a source of texts for texts not present in this knit.1350 """Add a source of texts for texts not present in this knit.
@@ -1338,7 +1376,7 @@
1338 self._index._graph_index.clear_cache()1376 self._index._graph_index.clear_cache()
1339 self._index._int_cache.clear()1377 self._index._int_cache.clear()
13401378
1341 def _check_add(self, key, lines, random_id, check_content):1379 def _check_add(self, key, random_id):
1342 """check that version_id and lines are safe to add."""1380 """check that version_id and lines are safe to add."""
1343 version_id = key[-1]1381 version_id = key[-1]
1344 if version_id is not None:1382 if version_id is not None:
@@ -1349,9 +1387,6 @@
1349 # probably check that the existing content is identical to what is1387 # probably check that the existing content is identical to what is
1350 # being inserted, and otherwise raise an exception. This would make1388 # being inserted, and otherwise raise an exception. This would make
1351 # the bundle code simpler.1389 # the bundle code simpler.
1352 if check_content:
1353 self._check_lines_not_unicode(lines)
1354 self._check_lines_are_lines(lines)
13551390
1356 def get_parent_map(self, keys):1391 def get_parent_map(self, keys):
1357 """Get a map of the graph parents of keys.1392 """Get a map of the graph parents of keys.
13581393
=== modified file 'breezy/bzr/knit.py'
--- breezy/bzr/knit.py 2019-05-28 21:46:09 +0000
+++ breezy/bzr/knit.py 2019-06-16 15:55:45 +0000
@@ -996,6 +996,21 @@
996 parent_texts, left_matching_blocks, nostore_sha, random_id,996 parent_texts, left_matching_blocks, nostore_sha, random_id,
997 line_bytes=line_bytes)997 line_bytes=line_bytes)
998998
999 def add_chunks(self, key, parents, chunk_iter, parent_texts=None,
1000 left_matching_blocks=None, nostore_sha=None, random_id=False):
1001 """See VersionedFiles.add_chunks()."""
1002 self._index._check_write_ok()
1003 self._check_add(key, None, random_id, check_content=False)
1004 if parents is None:
1005 # The caller might pass None if there is no graph data, but kndx
1006 # indexes can't directly store that, so we give them
1007 # an empty tuple instead.
1008 parents = ()
1009 line_bytes = b''.join(chunk_iter)
1010 return self._add(key, None, parents,
1011 parent_texts, left_matching_blocks, nostore_sha, random_id,
1012 line_bytes=line_bytes)
1013
999 def _add(self, key, lines, parents, parent_texts,1014 def _add(self, key, lines, parents, parent_texts,
1000 left_matching_blocks, nostore_sha, random_id,1015 left_matching_blocks, nostore_sha, random_id,
1001 line_bytes):1016 line_bytes):
10021017
=== modified file 'breezy/bzr/versionedfile.py'
--- breezy/bzr/versionedfile.py 2018-11-11 04:08:32 +0000
+++ breezy/bzr/versionedfile.py 2019-06-16 15:55:45 +0000
@@ -978,6 +978,38 @@
978 """978 """
979 raise NotImplementedError(self.add_lines)979 raise NotImplementedError(self.add_lines)
980980
981 def add_chunks(self, key, parents, chunk_iter, parent_texts=None,
982 left_matching_blocks=None, nostore_sha=None, random_id=False,
983 check_content=True):
984 """Add a text to the store from a chunk iterable.
985
986 :param key: The key tuple of the text to add. If the last element is
987 None, a CHK string will be generated during the addition.
988 :param parents: The parents key tuples of the text to add.
989 :param chunk_iter: An iterable over bytestrings.
990 :param parent_texts: An optional dictionary containing the opaque
991 representations of some or all of the parents of version_id to
992 allow delta optimisations. VERY IMPORTANT: the texts must be those
993 returned by add_lines or data corruption can be caused.
994 :param left_matching_blocks: a hint about which areas are common
995 between the text and its left-hand-parent. The format is
996 the SequenceMatcher.get_matching_blocks format.
997 :param nostore_sha: Raise ExistingContent and do not add the lines to
998 the versioned file if the digest of the lines matches this.
999 :param random_id: If True a random id has been selected rather than
1000 an id determined by some deterministic process such as a converter
1001 from a foreign VCS. When True the backend may choose not to check
1002 for uniqueness of the resulting key within the versioned file, so
1003 this should only be done when the result is expected to be unique
1004 anyway.
1005 :param check_content: If True, the lines supplied are verified to be
1006 bytestrings that are correctly formed lines.
1007 :return: The text sha1, the number of bytes in the text, and an opaque
1008 representation of the inserted version which can be provided
1009 back to future add_lines calls in the parent_texts dictionary.
1010 """
1011 raise NotImplementedError(self.add_chunks)
1012
981 def add_mpdiffs(self, records):1013 def add_mpdiffs(self, records):
982 """Add mpdiffs to this VersionedFile.1014 """Add mpdiffs to this VersionedFile.
9831015
@@ -1201,6 +1233,16 @@
1201 self._mapper = mapper1233 self._mapper = mapper
1202 self._is_locked = is_locked1234 self._is_locked = is_locked
12031235
1236 def add_chunks(self, key, parents, chunk_iter, parent_texts=None,
1237 left_matching_blocks=None, nostore_sha=None,
1238 random_id=False):
1239 # For now, just fallback to add_lines.
1240 lines = osutils.chunks_to_lines(list(chunk_iter))
1241 return self.add_lines(
1242 key, parents, lines, parent_texts,
1243 left_matching_blocks, nostore_sha, random_id,
1244 check_content=True)
1245
1204 def add_lines(self, key, parents, lines, parent_texts=None,1246 def add_lines(self, key, parents, lines, parent_texts=None,
1205 left_matching_blocks=None, nostore_sha=None, random_id=False,1247 left_matching_blocks=None, nostore_sha=None, random_id=False,
1206 check_content=True):1248 check_content=True):
12071249
=== modified file 'breezy/bzr/vf_repository.py'
--- breezy/bzr/vf_repository.py 2019-06-15 12:04:34 +0000
+++ breezy/bzr/vf_repository.py 2019-06-16 15:55:45 +0000
@@ -569,8 +569,9 @@
569569
570 def _add_file_to_weave(self, file_id, fileobj, parents, nostore_sha):570 def _add_file_to_weave(self, file_id, fileobj, parents, nostore_sha):
571 parent_keys = tuple([(file_id, parent) for parent in parents])571 parent_keys = tuple([(file_id, parent) for parent in parents])
572 return self.repository.texts.add_lines(572 return self.repository.texts.add_chunks(
573 (file_id, self._new_revision_id), parent_keys, fileobj.readlines(),573 (file_id, self._new_revision_id), parent_keys,
574 osutils.file_iterator(fileobj),
574 nostore_sha=nostore_sha, random_id=self.random_revid)[0:2]575 nostore_sha=nostore_sha, random_id=self.random_revid)[0:2]
575576
576577
577578
=== modified file 'breezy/tests/blackbox/test_big_file.py'
--- breezy/tests/blackbox/test_big_file.py 2019-06-15 21:45:04 +0000
+++ breezy/tests/blackbox/test_big_file.py 2019-06-16 15:55:45 +0000
@@ -21,6 +21,7 @@
21memory.21memory.
22"""22"""
2323
24import errno
24import os25import os
25import resource26import resource
2627
@@ -36,8 +37,8 @@
36BIG_FILE_SIZE = 1024 * 1024 * 50037BIG_FILE_SIZE = 1024 * 1024 * 500
37BIG_FILE_CHUNK_SIZE = 1024 * 102438BIG_FILE_CHUNK_SIZE = 1024 * 1024
3839
39RESOURCE = resource.RLIMIT_DATA40RESOURCE = resource.RLIMIT_AS
40LIMIT = 1024 * 1024 * 20041LIMIT = 1024 * 1024 * 100
4142
4243
43def make_big_file(path):44def make_big_file(path):
@@ -50,8 +51,12 @@
50class TestAdd(tests.TestCaseWithTransport):51class TestAdd(tests.TestCaseWithTransport):
5152
52 def writeBigFile(self, path):53 def writeBigFile(self, path):
53 make_big_file(path)
54 self.addCleanup(os.unlink, path)54 self.addCleanup(os.unlink, path)
55 try:
56 make_big_file(path)
57 except EnvironmentError as e:
58 if e.errno == errno.ENOSPC:
59 self.skipTest('not enough disk space for big file')
5560
56 def setUp(self):61 def setUp(self):
57 super(TestAdd, self).setUp()62 super(TestAdd, self).setUp()
5863
=== modified file 'breezy/tests/per_versionedfile.py'
--- breezy/tests/per_versionedfile.py 2018-11-11 04:08:32 +0000
+++ breezy/tests/per_versionedfile.py 2019-06-16 15:55:45 +0000
@@ -1538,6 +1538,26 @@
1538 records.sort()1538 records.sort()
1539 self.assertEqual([(key0, b'a\nb\n'), (key1, b'b\nc\n')], records)1539 self.assertEqual([(key0, b'a\nb\n'), (key1, b'b\nc\n')], records)
15401540
1541 def test_add_chunks(self):
1542 f = self.get_versionedfiles()
1543 key0 = self.get_simple_key(b'r0')
1544 key1 = self.get_simple_key(b'r1')
1545 key2 = self.get_simple_key(b'r2')
1546 keyf = self.get_simple_key(b'foo')
1547 f.add_chunks(key0, [], [b'a', b'\nb\n'])
1548 if self.graph:
1549 f.add_chunks(key1, [key0], [b'b', b'\n', b'c\n'])
1550 else:
1551 f.add_chunks(key1, [], [b'b\n', b'c\n'])
1552 keys = f.keys()
1553 self.assertIn(key0, keys)
1554 self.assertIn(key1, keys)
1555 records = []
1556 for record in f.get_record_stream([key0, key1], 'unordered', True):
1557 records.append((record.key, record.get_bytes_as('fulltext')))
1558 records.sort()
1559 self.assertEqual([(key0, b'a\nb\n'), (key1, b'b\nc\n')], records)
1560
1541 def test_annotate(self):1561 def test_annotate(self):
1542 files = self.get_versionedfiles()1562 files = self.get_versionedfiles()
1543 self.get_diamond_files(files)1563 self.get_diamond_files(files)

Subscribers

People subscribed via source and target branches