Merge lp:~jameinel/bzr/1.16-no-first-delta-index into lp:~bzr/bzr/trunk-old

Proposed by John A Meinel
Status: Merged
Merged at revision: not available
Proposed branch: lp:~jameinel/bzr/1.16-no-first-delta-index
Merge into: lp:~bzr/bzr/trunk-old
Diff against target: 130 lines
To merge this branch: bzr merge lp:~jameinel/bzr/1.16-no-first-delta-index
Reviewer Review Type Date Requested Status
Andrew Bennetts Approve
Review via email: mp+6991@code.launchpad.net
To post a comment you must log in.
Revision history for this message
John A Meinel (jameinel) wrote :

This makes a minor modification to the internal DeltaIndex structure for --dev6 repositories.

Namely, it makes generating the first _index "lazy". The idea is that until we do 'make_delta' or a similar function, we don't actually need the index. This has a fairly big impact on 'bzr commit', especially when combined with:
https://code.edge.launchpad.net/~jameinel/bzr/1.16-commit-fulltext/+merge/6988

With both patches, the peak memory consumption is right around 2x a single file content. I know the last bits that we could poke at to drop it down more, but it requires a bit more complexity.

As an added bonus, with both patches applied the time to commit of a single 90MB file in dev6 is down from 7.3s to 4.2s. I would imagine that the 'initial commit' benchmarks for large projects will also be down significantly.

Revision history for this message
Andrew Bennetts (spiv) wrote :

I haven't very carefully read the pyrex changes, but the diff is small and the description makes sense to me.

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
=== modified file 'NEWS'
--- NEWS 2009-06-04 21:05:44 +0000
+++ NEWS 2009-06-05 02:35:36 +0000
@@ -31,6 +31,12 @@
31 bugs with stacking and non default formats.)31 bugs with stacking and non default formats.)
32 (John Arbash Meinel, #373455)32 (John Arbash Meinel, #373455)
3333
34* ``--development6-rich-root`` delays generating a delta index for the
35 first object inserted into a group. This has a beneficial impact on
36 ``bzr commit`` since each committed texts goes to its own group. For
37 committing a 90MB file, it drops peak memory by about 200MB, and speeds
38 up commit from 7s => 4s. (John Arbash Meinel)
39
34* Numerous operations are now faster for huge projects, i.e. those40* Numerous operations are now faster for huge projects, i.e. those
35 with a large number of files and/or a large number of revisions,41 with a large number of files and/or a large number of revisions,
36 particularly when the latest development format is used. These42 particularly when the latest development format is used. These
3743
=== modified file 'bzrlib/_groupcompress_pyx.pyx'
--- bzrlib/_groupcompress_pyx.pyx 2009-04-09 20:23:07 +0000
+++ bzrlib/_groupcompress_pyx.pyx 2009-06-05 02:35:36 +0000
@@ -118,6 +118,9 @@
118 self._index = NULL118 self._index = NULL
119 safe_free(<void **>&self._source_infos)119 safe_free(<void **>&self._source_infos)
120120
121 def _has_index(self):
122 return (self._index != NULL)
123
121 def add_delta_source(self, delta, unadded_bytes):124 def add_delta_source(self, delta, unadded_bytes):
122 """Add a new delta to the source texts.125 """Add a new delta to the source texts.
123126
@@ -171,6 +174,9 @@
171 source_location = len(self._sources)174 source_location = len(self._sources)
172 if source_location >= self._max_num_sources:175 if source_location >= self._max_num_sources:
173 self._expand_sources()176 self._expand_sources()
177 if source_location != 0 and self._index == NULL:
178 # We were lazy about populating the index, create it now
179 self._populate_first_index()
174 self._sources.append(source)180 self._sources.append(source)
175 c_source = PyString_AS_STRING(source)181 c_source = PyString_AS_STRING(source)
176 c_source_size = PyString_GET_SIZE(source)182 c_source_size = PyString_GET_SIZE(source)
@@ -179,11 +185,24 @@
179 src.size = c_source_size185 src.size = c_source_size
180186
181 src.agg_offset = self._source_offset + unadded_bytes187 src.agg_offset = self._source_offset + unadded_bytes
182 index = create_delta_index(src, self._index)
183 self._source_offset = src.agg_offset + src.size188 self._source_offset = src.agg_offset + src.size
184 if index != NULL:189 # We delay creating the index on the first insert
185 free_delta_index(self._index)190 if source_location != 0:
186 self._index = index191 index = create_delta_index(src, self._index)
192 if index != NULL:
193 free_delta_index(self._index)
194 self._index = index
195
196 cdef _populate_first_index(self):
197 cdef delta_index *index
198 if len(self._sources) != 1 or self._index != NULL:
199 raise AssertionError('_populate_first_index should only be'
200 ' called when we have a single source and no index yet')
201
202 # We know that self._index is already NULL, so whatever
203 # create_delta_index returns is fine
204 self._index = create_delta_index(&self._source_infos[0], NULL)
205 assert self._index != NULL
187206
188 cdef _expand_sources(self):207 cdef _expand_sources(self):
189 raise RuntimeError('if we move self._source_infos, then we need to'208 raise RuntimeError('if we move self._source_infos, then we need to'
@@ -201,7 +220,10 @@
201 cdef unsigned long delta_size220 cdef unsigned long delta_size
202221
203 if self._index == NULL:222 if self._index == NULL:
204 return None223 if len(self._sources) == 0:
224 return None
225 # We were just lazy about generating the index
226 self._populate_first_index()
205227
206 if not PyString_CheckExact(target_bytes):228 if not PyString_CheckExact(target_bytes):
207 raise TypeError('target is not a str')229 raise TypeError('target is not a str')
208230
=== modified file 'bzrlib/groupcompress.py'
--- bzrlib/groupcompress.py 2009-05-29 10:25:37 +0000
+++ bzrlib/groupcompress.py 2009-06-05 02:35:36 +0000
@@ -746,6 +746,14 @@
746746
747 After calling this, the compressor should no longer be used747 After calling this, the compressor should no longer be used
748 """748 """
749 # TODO: this causes us to 'bloat' to 2x the size of content in the
750 # group. This has an impact for 'commit' of large objects.
751 # One possibility is to use self._content_chunks, and be lazy and
752 # only fill out self._content as a full string when we actually
753 # need it. That would at least drop the peak memory consumption
754 # for 'commit' down to ~1x the size of the largest file, at a
755 # cost of increased complexity within this code. 2x is still <<
756 # 3x the size of the largest file, so we are doing ok.
749 content = ''.join(self.chunks)757 content = ''.join(self.chunks)
750 self.chunks = None758 self.chunks = None
751 self._delta_index = None759 self._delta_index = None
752760
=== modified file 'bzrlib/tests/test__groupcompress.py'
--- bzrlib/tests/test__groupcompress.py 2009-04-29 05:53:21 +0000
+++ bzrlib/tests/test__groupcompress.py 2009-06-05 02:35:36 +0000
@@ -272,6 +272,25 @@
272 di = self._gc_module.DeltaIndex('test text\n')272 di = self._gc_module.DeltaIndex('test text\n')
273 self.assertEqual('DeltaIndex(1, 10)', repr(di))273 self.assertEqual('DeltaIndex(1, 10)', repr(di))
274274
275 def test_first_add_source_doesnt_index_until_make_delta(self):
276 di = self._gc_module.DeltaIndex()
277 self.assertFalse(di._has_index())
278 di.add_source(_text1, 0)
279 self.assertFalse(di._has_index())
280 # However, asking to make a delta will trigger the index to be
281 # generated, and will generate a proper delta
282 delta = di.make_delta(_text2)
283 self.assertTrue(di._has_index())
284 self.assertEqual('N\x90/\x1fdiffer from\nagainst other text\n', delta)
285
286 def test_second_add_source_triggers_make_index(self):
287 di = self._gc_module.DeltaIndex()
288 self.assertFalse(di._has_index())
289 di.add_source(_text1, 0)
290 self.assertFalse(di._has_index())
291 di.add_source(_text2, 0)
292 self.assertTrue(di._has_index())
293
275 def test_make_delta(self):294 def test_make_delta(self):
276 di = self._gc_module.DeltaIndex(_text1)295 di = self._gc_module.DeltaIndex(_text1)
277 delta = di.make_delta(_text2)296 delta = di.make_delta(_text2)