Merge lp:~jelmer/brz/big-file-vf into lp:brz

Proposed by Jelmer Vernooij
Status: Merged
Approved by: Jelmer Vernooij
Approved revision: no longer in the source branch.
Merge reported by: The Breezy Bot
Merged at revision: not available
Proposed branch: lp:~jelmer/brz/big-file-vf
Merge into: lp:brz
Diff against target: 255 lines (+133/-15)
6 files modified
breezy/bzr/groupcompress.py (+45/-10)
breezy/bzr/knit.py (+15/-0)
breezy/bzr/versionedfile.py (+42/-0)
breezy/bzr/vf_repository.py (+3/-2)
breezy/tests/blackbox/test_big_file.py (+8/-3)
breezy/tests/per_versionedfile.py (+20/-0)
To merge this branch: bzr merge lp:~jelmer/brz/big-file-vf
Reviewer: Martin Packman
Review status: Approve
Review via email: mp+368882@code.launchpad.net

Commit message

Add a VersionedFiles.add_chunks method that takes a chunk_iter.

Description of the change

Add a VersionedFiles.add_chunks method that takes a chunk_iter.
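
For illustration, a minimal usage sketch (hypothetical caller code: "repo" is assumed to be a write-locked repository whose texts attribute is a VersionedFiles, and the file and key names here are made up; the add_chunks signature is the one added in this diff):

    from breezy import osutils

    # Stream a large file into the texts store without materialising
    # every line in memory; file_iterator yields fixed-size byte chunks.
    parent_keys = ()  # no graph parents in this example
    with open('big-file', 'rb') as fileobj:
        sha1, num_bytes, _ = repo.texts.add_chunks(
            (b'file-id', b'rev-id'), parent_keys,
            osutils.file_iterator(fileobj))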

Improve big file tests:
* Use RLIMIT_AS rather than RLIMIT_DATA (see the sketch below).
* Skip the tests if there is not enough disk space.

This is groundwork for improved support for large files.
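
A minimal sketch of the resource-limit change (the constant mirrors the updated test; setting the limit process-wide like this is illustrative only):

    import resource

    # RLIMIT_AS caps total address space; on many systems RLIMIT_DATA
    # only covers the data segment, so mmap-backed allocations could
    # slip past the old limit unnoticed.
    LIMIT = 1024 * 1024 * 100  # 100 MiB, as in test_big_file.py

    soft, hard = resource.getrlimit(resource.RLIMIT_AS)
    resource.setrlimit(resource.RLIMIT_AS, (LIMIT, hard))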

Revision history for this message
Martin Packman (gz) wrote:

Looks good, one teeny test nit inline.

review: Approve

Preview Diff

=== modified file 'breezy/bzr/groupcompress.py'
--- breezy/bzr/groupcompress.py 2018-11-17 16:53:10 +0000
+++ breezy/bzr/groupcompress.py 2019-06-16 15:55:45 +0000
@@ -1295,18 +1295,56 @@
             back to future add_lines calls in the parent_texts dictionary.
         """
         self._index._check_write_ok()
-        self._check_add(key, lines, random_id, check_content)
+        if check_content:
+            self._check_lines_not_unicode(lines)
+            self._check_lines_are_lines(lines)
+        return self.add_chunks(
+            key, parents, iter(lines), parent_texts, left_matching_blocks,
+            nostore_sha, random_id)
+
+    def add_chunks(self, key, parents, chunk_iter, parent_texts=None,
+                   left_matching_blocks=None, nostore_sha=None, random_id=False):
+        """Add a text to the store.
+
+        :param key: The key tuple of the text to add.
+        :param parents: The parents key tuples of the text to add.
+        :param chunk_iter: An iterator over chunks. Chunks
+            don't need to be file lines; the only requirement is that they
+            are bytes.
+        :param parent_texts: An optional dictionary containing the opaque
+            representations of some or all of the parents of version_id to
+            allow delta optimisations. VERY IMPORTANT: the texts must be those
+            returned by add_lines or data corruption can be caused.
+        :param left_matching_blocks: a hint about which areas are common
+            between the text and its left-hand-parent. The format is
+            the SequenceMatcher.get_matching_blocks format.
+        :param nostore_sha: Raise ExistingContent and do not add the lines to
+            the versioned file if the digest of the lines matches this.
+        :param random_id: If True a random id has been selected rather than
+            an id determined by some deterministic process such as a converter
+            from a foreign VCS. When True the backend may choose not to check
+            for uniqueness of the resulting key within the versioned file, so
+            this should only be done when the result is expected to be unique
+            anyway.
+        :return: The text sha1, the number of bytes in the text, and an opaque
+            representation of the inserted version which can be provided
+            back to future add_lines calls in the parent_texts dictionary.
+        """
+        self._index._check_write_ok()
+        self._check_add(key, random_id)
         if parents is None:
             # The caller might pass None if there is no graph data, but kndx
             # indexes can't directly store that, so we give them
             # an empty tuple instead.
             parents = ()
         # double handling for now. Make it work until then.
-        length = sum(map(len, lines))
-        record = ChunkedContentFactory(key, parents, None, lines)
-        sha1 = list(self._insert_record_stream([record], random_id=random_id,
-                                               nostore_sha=nostore_sha))[0]
-        return sha1, length, None
+        # TODO(jelmer): problematic for big files: let's not keep the list of
+        # chunks in memory.
+        chunks = list(chunk_iter)
+        record = ChunkedContentFactory(key, parents, None, chunks)
+        sha1 = list(self._insert_record_stream(
+            [record], random_id=random_id, nostore_sha=nostore_sha))[0]
+        return sha1, sum(map(len, chunks)), None
 
     def add_fallback_versioned_files(self, a_versioned_files):
         """Add a source of texts for texts not present in this knit.
@@ -1338,7 +1376,7 @@
         self._index._graph_index.clear_cache()
         self._index._int_cache.clear()
 
-    def _check_add(self, key, lines, random_id, check_content):
+    def _check_add(self, key, random_id):
         """check that version_id and lines are safe to add."""
         version_id = key[-1]
         if version_id is not None:
@@ -1349,9 +1387,6 @@
         # probably check that the existing content is identical to what is
         # being inserted, and otherwise raise an exception. This would make
         # the bundle code simpler.
-        if check_content:
-            self._check_lines_not_unicode(lines)
-            self._check_lines_are_lines(lines)
 
     def get_parent_map(self, keys):
         """Get a map of the graph parents of keys.

=== modified file 'breezy/bzr/knit.py'
--- breezy/bzr/knit.py 2019-05-28 21:46:09 +0000
+++ breezy/bzr/knit.py 2019-06-16 15:55:45 +0000
@@ -996,6 +996,21 @@
                          parent_texts, left_matching_blocks, nostore_sha, random_id,
                          line_bytes=line_bytes)
 
+    def add_chunks(self, key, parents, chunk_iter, parent_texts=None,
+                   left_matching_blocks=None, nostore_sha=None, random_id=False):
+        """See VersionedFiles.add_chunks()."""
+        self._index._check_write_ok()
+        self._check_add(key, None, random_id, check_content=False)
+        if parents is None:
+            # The caller might pass None if there is no graph data, but kndx
+            # indexes can't directly store that, so we give them
+            # an empty tuple instead.
+            parents = ()
+        line_bytes = b''.join(chunk_iter)
+        return self._add(key, None, parents,
+                         parent_texts, left_matching_blocks, nostore_sha, random_id,
+                         line_bytes=line_bytes)
+
     def _add(self, key, lines, parents, parent_texts,
              left_matching_blocks, nostore_sha, random_id,
              line_bytes):

=== modified file 'breezy/bzr/versionedfile.py'
--- breezy/bzr/versionedfile.py 2018-11-11 04:08:32 +0000
+++ breezy/bzr/versionedfile.py 2019-06-16 15:55:45 +0000
@@ -978,6 +978,38 @@
         """
         raise NotImplementedError(self.add_lines)
 
+    def add_chunks(self, key, parents, chunk_iter, parent_texts=None,
+                   left_matching_blocks=None, nostore_sha=None, random_id=False,
+                   check_content=True):
+        """Add a text to the store from a chunk iterable.
+
+        :param key: The key tuple of the text to add. If the last element is
+            None, a CHK string will be generated during the addition.
+        :param parents: The parents key tuples of the text to add.
+        :param chunk_iter: An iterable over bytestrings.
+        :param parent_texts: An optional dictionary containing the opaque
+            representations of some or all of the parents of version_id to
+            allow delta optimisations. VERY IMPORTANT: the texts must be those
+            returned by add_lines or data corruption can be caused.
+        :param left_matching_blocks: a hint about which areas are common
+            between the text and its left-hand-parent. The format is
+            the SequenceMatcher.get_matching_blocks format.
+        :param nostore_sha: Raise ExistingContent and do not add the lines to
+            the versioned file if the digest of the lines matches this.
+        :param random_id: If True a random id has been selected rather than
+            an id determined by some deterministic process such as a converter
+            from a foreign VCS. When True the backend may choose not to check
+            for uniqueness of the resulting key within the versioned file, so
+            this should only be done when the result is expected to be unique
+            anyway.
+        :param check_content: If True, the lines supplied are verified to be
+            bytestrings that are correctly formed lines.
+        :return: The text sha1, the number of bytes in the text, and an opaque
+            representation of the inserted version which can be provided
+            back to future add_lines calls in the parent_texts dictionary.
+        """
+        raise NotImplementedError(self.add_chunks)
+
     def add_mpdiffs(self, records):
         """Add mpdiffs to this VersionedFile.
 
@@ -1201,6 +1233,16 @@
         self._mapper = mapper
         self._is_locked = is_locked
 
+    def add_chunks(self, key, parents, chunk_iter, parent_texts=None,
+                   left_matching_blocks=None, nostore_sha=None,
+                   random_id=False):
+        # For now, just fallback to add_lines.
+        lines = osutils.chunks_to_lines(list(chunk_iter))
+        return self.add_lines(
+            key, parents, lines, parent_texts,
+            left_matching_blocks, nostore_sha, random_id,
+            check_content=True)
+
     def add_lines(self, key, parents, lines, parent_texts=None,
                   left_matching_blocks=None, nostore_sha=None, random_id=False,
                   check_content=True):

=== modified file 'breezy/bzr/vf_repository.py'
--- breezy/bzr/vf_repository.py 2019-06-15 12:04:34 +0000
+++ breezy/bzr/vf_repository.py 2019-06-16 15:55:45 +0000
@@ -569,8 +569,9 @@
 
     def _add_file_to_weave(self, file_id, fileobj, parents, nostore_sha):
         parent_keys = tuple([(file_id, parent) for parent in parents])
-        return self.repository.texts.add_lines(
-            (file_id, self._new_revision_id), parent_keys, fileobj.readlines(),
+        return self.repository.texts.add_chunks(
+            (file_id, self._new_revision_id), parent_keys,
+            osutils.file_iterator(fileobj),
             nostore_sha=nostore_sha, random_id=self.random_revid)[0:2]
 

=== modified file 'breezy/tests/blackbox/test_big_file.py'
--- breezy/tests/blackbox/test_big_file.py 2019-06-15 21:45:04 +0000
+++ breezy/tests/blackbox/test_big_file.py 2019-06-16 15:55:45 +0000
@@ -21,6 +21,7 @@
 memory.
 """
 
+import errno
 import os
 import resource
 
@@ -36,8 +37,8 @@
 BIG_FILE_SIZE = 1024 * 1024 * 500
 BIG_FILE_CHUNK_SIZE = 1024 * 1024
 
-RESOURCE = resource.RLIMIT_DATA
-LIMIT = 1024 * 1024 * 200
+RESOURCE = resource.RLIMIT_AS
+LIMIT = 1024 * 1024 * 100
 
 
 def make_big_file(path):
@@ -50,8 +51,12 @@
 class TestAdd(tests.TestCaseWithTransport):
 
     def writeBigFile(self, path):
-        make_big_file(path)
         self.addCleanup(os.unlink, path)
+        try:
+            make_big_file(path)
+        except EnvironmentError as e:
+            if e.errno == errno.ENOSPC:
+                self.skipTest('not enough disk space for big file')
 
     def setUp(self):
         super(TestAdd, self).setUp()

=== modified file 'breezy/tests/per_versionedfile.py'
--- breezy/tests/per_versionedfile.py 2018-11-11 04:08:32 +0000
+++ breezy/tests/per_versionedfile.py 2019-06-16 15:55:45 +0000
@@ -1538,6 +1538,26 @@
         records.sort()
         self.assertEqual([(key0, b'a\nb\n'), (key1, b'b\nc\n')], records)
 
+    def test_add_chunks(self):
+        f = self.get_versionedfiles()
+        key0 = self.get_simple_key(b'r0')
+        key1 = self.get_simple_key(b'r1')
+        key2 = self.get_simple_key(b'r2')
+        keyf = self.get_simple_key(b'foo')
+        f.add_chunks(key0, [], [b'a', b'\nb\n'])
+        if self.graph:
+            f.add_chunks(key1, [key0], [b'b', b'\n', b'c\n'])
+        else:
+            f.add_chunks(key1, [], [b'b\n', b'c\n'])
+        keys = f.keys()
+        self.assertIn(key0, keys)
+        self.assertIn(key1, keys)
+        records = []
+        for record in f.get_record_stream([key0, key1], 'unordered', True):
+            records.append((record.key, record.get_bytes_as('fulltext')))
+        records.sort()
+        self.assertEqual([(key0, b'a\nb\n'), (key1, b'b\nc\n')], records)
+
     def test_annotate(self):
         files = self.get_versionedfiles()
         self.get_diamond_files(files)
