Merge lp:~spiv/bzr/all-referenced-texts-check into lp:~bzr/bzr/trunk-old

Proposed by Andrew Bennetts
Status: Merged
Merged at revision: not available
Proposed branch: lp:~spiv/bzr/all-referenced-texts-check
Merge into: lp:~bzr/bzr/trunk-old
Diff against target: 368 lines
To merge this branch: bzr merge lp:~spiv/bzr/all-referenced-texts-check
Reviewer Review Type Date Requested Status
Robert Collins (community) Approve
Review via email: mp+6445@code.launchpad.net
To post a comment you must log in.
Revision history for this message
Andrew Bennetts (spiv) wrote :

This branch fixes bug 368418.

It improves the check for missing parent inventories to pass so long as either:

 - all parent inventories are present, or
 - all texts referenced by the new inventories are present.

Either way, a usable revision delta for those revisions can be calculated, which
is what matters.

Revision history for this message
Robert Collins (lifeless) wrote :

 review approve

Looks like a TODO:
11:28 < lifeless> + # Also, perf tests:
11:28 < lifeless> + # - if all invs present, then no texts are
checked

Please either do it, or remove the comment.

review: Approve
Revision history for this message
Andrew Bennetts (spiv) wrote :

> Please either do it, or remove the comment.

Oops, removed. Thanks!

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'bzrlib/knit.py'
2--- bzrlib/knit.py 2009-04-27 23:14:00 +0000
3+++ bzrlib/knit.py 2009-05-12 02:35:10 +0000
4@@ -2688,6 +2688,44 @@
5 return key[:-1], key[-1]
6
7
8+class _KeyRefs(object):
9+
10+ def __init__(self):
11+ # dict mapping 'key' to 'set of keys referring to that key'
12+ self.refs = {}
13+
14+ def add_references(self, key, refs):
15+ # Record the new references
16+ for referenced in refs:
17+ try:
18+ needed_by = self.refs[referenced]
19+ except KeyError:
20+ needed_by = self.refs[referenced] = set()
21+ needed_by.add(key)
22+ # Discard references satisfied by the new key
23+ self.add_key(key)
24+
25+ def get_unsatisfied_refs(self):
26+ return self.refs.iterkeys()
27+
28+ def add_key(self, key):
29+ try:
30+ del self.refs[key]
31+ except KeyError:
32+ # No keys depended on this key. That's ok.
33+ pass
34+
35+ def add_keys(self, keys):
36+ for key in keys:
37+ self.add_key(key)
38+
39+ def get_referrers(self):
40+ result = set()
41+ for referrers in self.refs.itervalues():
42+ result.update(referrers)
43+ return result
44+
45+
46 class _KnitGraphIndex(object):
47 """A KnitVersionedFiles index layered on GraphIndex."""
48
49@@ -2723,9 +2761,9 @@
50 self._is_locked = is_locked
51 self._missing_compression_parents = set()
52 if track_external_parent_refs:
53- self._external_parent_refs = set()
54+ self._key_dependencies = _KeyRefs()
55 else:
56- self._external_parent_refs = None
57+ self._key_dependencies = None
58
59 def __repr__(self):
60 return "%s(%r)" % (self.__class__.__name__, self._graph_index)
61@@ -2755,13 +2793,12 @@
62
63 keys = {}
64 compression_parents = set()
65- parent_refs = self._external_parent_refs
66+ key_dependencies = self._key_dependencies
67 for (key, options, access_memo, parents) in records:
68 if self._parents:
69 parents = tuple(parents)
70- if parent_refs is not None:
71- parent_refs.update(parents)
72- parent_refs.discard(key)
73+ if key_dependencies is not None:
74+ key_dependencies.add_references(key, parents)
75 index, pos, size = access_memo
76 if 'no-eol' in options:
77 value = 'N'
78@@ -2829,12 +2866,11 @@
79 new_missing = graph_index.external_references(ref_list_num=1)
80 new_missing.difference_update(self.get_parent_map(new_missing))
81 self._missing_compression_parents.update(new_missing)
82- if self._external_parent_refs is not None:
83+ if self._key_dependencies is not None:
84 # Add parent refs from graph_index (and discard parent refs that
85 # the graph_index has).
86 for node in graph_index.iter_all_entries():
87- self._external_parent_refs.update(node[3][0])
88- self._external_parent_refs.discard(node[1])
89+ self._key_dependencies.add_references(node[1], node[3][0])
90
91 def get_missing_compression_parents(self):
92 """Return the keys of missing compression parents.
93@@ -2847,9 +2883,9 @@
94 def get_missing_parents(self):
95 """Return the keys of missing parents."""
96 # We may have false positives, so filter those out.
97- self._external_parent_refs.difference_update(
98- self.get_parent_map(self._external_parent_refs))
99- return frozenset(self._external_parent_refs)
100+ self._key_dependencies.add_keys(
101+ self.get_parent_map(self._key_dependencies.get_unsatisfied_refs()))
102+ return frozenset(self._key_dependencies.get_unsatisfied_refs())
103
104 def _check_read(self):
105 """raise if reads are not permitted."""
106
107=== modified file 'bzrlib/repository.py'
108--- bzrlib/repository.py 2009-05-07 05:08:46 +0000
109+++ bzrlib/repository.py 2009-05-12 02:35:10 +0000
110@@ -1448,7 +1448,31 @@
111 unstacked_inventories = self.inventories._index
112 present_inventories = unstacked_inventories.get_parent_map(
113 key[-1:] for key in parents)
114- parents.difference_update(present_inventories)
115+ if len(parents.difference(present_inventories)) == 0:
116+ # No missing parent inventories.
117+ return set()
118+ # Ok, now we have a list of missing inventories. But these only matter
119+ # if the inventories that reference them are missing some texts they
120+ # appear to introduce.
121+ # XXX: Texts referenced by all added inventories need to be present,
122+ # but at the moment we're only checking for texts referenced by
123+ # inventories at the graph's edge.
124+ key_deps = self.revisions._index._key_dependencies
125+ key_deps.add_keys(present_inventories)
126+ referrers = frozenset(r[0] for r in key_deps.get_referrers())
127+ file_ids = self.fileids_altered_by_revision_ids(referrers)
128+ missing_texts = set()
129+ for file_id, version_ids in file_ids.iteritems():
130+ missing_texts.update(
131+ (file_id, version_id) for version_id in version_ids)
132+ present_texts = self.texts.get_parent_map(missing_texts)
133+ missing_texts.difference_update(present_texts)
134+ if not missing_texts:
135+ # No texts are missing, so all revisions and their deltas are
136+ # reconstructable.
137+ return set()
138+ # Alternatively the text versions could be returned as the missing
139+ # keys, but this is likely to be less data.
140 missing_keys = set(('inventories', rev_id) for (rev_id,) in parents)
141 return missing_keys
142
143@@ -3993,6 +4017,7 @@
144 def _locked_insert_stream(self, stream, src_format):
145 to_serializer = self.target_repo._format._serializer
146 src_serializer = src_format._serializer
147+ new_pack = None
148 if to_serializer == src_serializer:
149 # If serializers match and the target is a pack repository, set the
150 # write cache size on the new pack. This avoids poor performance
151@@ -4039,6 +4064,11 @@
152 self.target_repo.signatures.insert_record_stream(substream)
153 else:
154 raise AssertionError('kaboom! %s' % (substream_type,))
155+ # Done inserting data, and the missing_keys calculations will try to
156+ # read back from the inserted data, so flush the writes to the new pack
157+ # (if this is pack format).
158+ if new_pack is not None:
159+ new_pack._write_data('', flush=True)
160 # Find all the new revisions (including ones from resume_tokens)
161 missing_keys = self.target_repo.get_missing_parent_inventories()
162 try:
163
164=== modified file 'bzrlib/tests/per_repository/test_write_group.py'
165--- bzrlib/tests/per_repository/test_write_group.py 2009-04-21 07:22:04 +0000
166+++ bzrlib/tests/per_repository/test_write_group.py 2009-05-12 02:35:10 +0000
167@@ -120,6 +120,8 @@
168 if token is not None:
169 repo.leave_lock_in_place()
170
171+class TestGetMissingParentInventories(TestCaseWithRepository):
172+
173 def test_empty_get_missing_parent_inventories(self):
174 """A new write group has no missing parent inventories."""
175 repo = self.make_repository('.')
176@@ -131,63 +133,50 @@
177 repo.commit_write_group()
178 repo.unlock()
179
180- def test_get_missing_parent_inventories(self):
181- # Make a trunk with one commit.
182- if isinstance(self.repository_format, remote.RemoteRepositoryFormat):
183- # RemoteRepository by default builds a default format real
184- # repository, but the default format is unstackble. So explicitly
185- # make a stackable real repository and use that.
186- repo = self.make_repository('trunk', format='1.9')
187- repo = bzrdir.BzrDir.open(self.get_url('trunk')).open_repository()
188- else:
189- repo = self.make_repository('trunk')
190- if not repo._format.supports_external_lookups:
191- raise TestNotApplicable('format not stackable')
192- repo.bzrdir._format.set_branch_format(BzrBranchFormat7())
193+ def branch_trunk_and_make_tree(self, trunk_repo, relpath):
194+ tree = self.make_branch_and_memory_tree('branch')
195+ trunk_repo.lock_read()
196+ self.addCleanup(trunk_repo.unlock)
197+ tree.branch.repository.fetch(trunk_repo, revision_id='rev-1')
198+ tree.set_parent_ids(['rev-1'])
199+ return tree
200+
201+ def make_first_commit(self, repo):
202 trunk = repo.bzrdir.create_branch()
203- trunk_repo = repo
204 tree = memorytree.MemoryTree.create_on_branch(trunk)
205 tree.lock_write()
206- if repo._format.rich_root_data:
207- # The tree needs a root
208- tree._inventory.add(InventoryDirectory('the-root-id', '', None))
209+ tree.add([''], ['TREE_ROOT'], ['directory'])
210+ tree.add(['dir'], ['dir-id'], ['directory'])
211+ tree.add(['filename'], ['file-id'], ['file'])
212+ tree.put_file_bytes_non_atomic('file-id', 'content\n')
213+ tree.commit('Trunk commit', rev_id='rev-0')
214 tree.commit('Trunk commit', rev_id='rev-1')
215 tree.unlock()
216- # Branch the trunk, add a new commit.
217- tree = self.make_branch_and_tree('branch')
218- trunk_repo.lock_read()
219- self.addCleanup(trunk_repo.unlock)
220- tree.branch.repository.fetch(trunk_repo, revision_id='rev-1')
221- tree.set_parent_ids(['rev-1'])
222+
223+ def make_new_commit_in_new_repo(self, trunk_repo, parents=None):
224+ tree = self.branch_trunk_and_make_tree(trunk_repo, 'branch')
225+ tree.set_parent_ids(parents)
226 tree.commit('Branch commit', rev_id='rev-2')
227 branch_repo = tree.branch.repository
228- # Make a new repo stacked on trunk, and copy the new commit's revision
229- # and inventory records to it.
230- if isinstance(self.repository_format, remote.RemoteRepositoryFormat):
231- # RemoteRepository by default builds a default format real
232- # repository, but the default format is unstackble. So explicitly
233- # make a stackable real repository and use that.
234- repo = self.make_repository('stacked', format='1.9')
235- repo = bzrdir.BzrDir.open(self.get_url('stacked')).open_repository()
236- else:
237- repo = self.make_repository('stacked')
238 branch_repo.lock_read()
239 self.addCleanup(branch_repo.unlock)
240- repo.add_fallback_repository(trunk.repository)
241- repo.lock_write()
242- repo.start_write_group()
243- trunk_repo.lock_read()
244- repo.inventories.insert_record_stream(
245- branch_repo.inventories.get_record_stream(
246- [('rev-2',)], 'unordered', False))
247- repo.revisions.insert_record_stream(
248- branch_repo.revisions.get_record_stream(
249- [('rev-2',)], 'unordered', False))
250- self.assertEqual(
251- set([('inventories', 'rev-1')]),
252- repo.get_missing_parent_inventories())
253- # Revisions from resumed write groups can also cause missing parent
254- # inventories.
255+ return branch_repo
256+
257+ def make_stackable_repo(self, relpath='trunk'):
258+ if isinstance(self.repository_format, remote.RemoteRepositoryFormat):
259+ # RemoteRepository by default builds a default format real
260+ # repository, but the default format is unstackble. So explicitly
261+ # make a stackable real repository and use that.
262+ repo = self.make_repository(relpath, format='1.9')
263+ repo = bzrdir.BzrDir.open(self.get_url(relpath)).open_repository()
264+ else:
265+ repo = self.make_repository(relpath)
266+ if not repo._format.supports_external_lookups:
267+ raise TestNotApplicable('format not stackable')
268+ repo.bzrdir._format.set_branch_format(BzrBranchFormat7())
269+ return repo
270+
271+ def reopen_repo_and_resume_write_group(self, repo):
272 try:
273 resume_tokens = repo.suspend_write_group()
274 except errors.UnsuspendableWriteGroup:
275@@ -201,9 +190,93 @@
276 reopened_repo.lock_write()
277 self.addCleanup(reopened_repo.unlock)
278 reopened_repo.resume_write_group(resume_tokens)
279+ return reopened_repo
280+
281+ def test_ghost_revision(self):
282+ """A parent inventory may be absent if all the needed texts are present.
283+ i.e., a ghost revision isn't (necessarily) considered to be a missing
284+ parent inventory.
285+ """
286+ # Make a trunk with one commit.
287+ trunk_repo = self.make_stackable_repo()
288+ self.make_first_commit(trunk_repo)
289+ trunk_repo.lock_read()
290+ self.addCleanup(trunk_repo.unlock)
291+ # Branch the trunk, add a new commit.
292+ branch_repo = self.make_new_commit_in_new_repo(
293+ trunk_repo, parents=['rev-1', 'ghost-rev'])
294+ inv = branch_repo.get_inventory('rev-2')
295+ # Make a new repo stacked on trunk, and then copy into it:
296+ # - all texts in rev-2
297+ # - the new inventory (rev-2)
298+ # - the new revision (rev-2)
299+ repo = self.make_stackable_repo('stacked')
300+ repo.lock_write()
301+ repo.start_write_group()
302+ # Add all texts from in rev-2 inventory. Note that this has to exclude
303+ # the root if the repo format does not support rich roots.
304+ rich_root = branch_repo._format.rich_root_data
305+ all_texts = [
306+ (ie.file_id, ie.revision) for ie in inv.iter_just_entries()
307+ if rich_root or inv.id2path(ie.file_id) != '']
308+ repo.texts.insert_record_stream(
309+ branch_repo.texts.get_record_stream(all_texts, 'unordered', False))
310+ # Add inventory and revision for rev-2.
311+ repo.add_inventory('rev-2', inv, ['rev-1', 'ghost-rev'])
312+ repo.revisions.insert_record_stream(
313+ branch_repo.revisions.get_record_stream(
314+ [('rev-2',)], 'unordered', False))
315+ # Now, no inventories are reported as missing, even though there is a
316+ # ghost.
317+ self.assertEqual(set(), repo.get_missing_parent_inventories())
318+ # Resuming the write group does not affect
319+ # get_missing_parent_inventories.
320+ reopened_repo = self.reopen_repo_and_resume_write_group(repo)
321+ self.assertEqual(set(), reopened_repo.get_missing_parent_inventories())
322+ reopened_repo.abort_write_group()
323+
324+ def test_get_missing_parent_inventories(self):
325+ """A stacked repo with a single revision and inventory (no parent
326+ inventory) in it must have all the texts in its inventory (even if not
327+ changed w.r.t. to the absent parent), otherwise it will report missing
328+ texts/parent inventory.
329+
330+ The core of this test is that a file was changed in rev-1, but in a
331+ stacked repo that only has rev-2
332+ """
333+ # Make a trunk with one commit.
334+ trunk_repo = self.make_stackable_repo()
335+ self.make_first_commit(trunk_repo)
336+ trunk_repo.lock_read()
337+ self.addCleanup(trunk_repo.unlock)
338+ # Branch the trunk, add a new commit.
339+ branch_repo = self.make_new_commit_in_new_repo(
340+ trunk_repo, parents=['rev-1'])
341+ inv = branch_repo.get_inventory('rev-2')
342+ # Make a new repo stacked on trunk, and copy the new commit's revision
343+ # and inventory records to it.
344+ repo = self.make_stackable_repo('stacked')
345+ repo.lock_write()
346+ repo.start_write_group()
347+ # Insert a single fulltext inv (using add_inventory because it's
348+ # simpler than insert_record_stream)
349+ repo.add_inventory('rev-2', inv, ['rev-1'])
350+ repo.revisions.insert_record_stream(
351+ branch_repo.revisions.get_record_stream(
352+ [('rev-2',)], 'unordered', False))
353+ # There should be no missing compression parents
354+ self.assertEqual(set(),
355+ repo.inventories.get_missing_compression_parent_keys())
356+ self.assertEqual(
357+ set([('inventories', 'rev-1')]),
358+ repo.get_missing_parent_inventories())
359+ # Resuming the write group does not affect
360+ # get_missing_parent_inventories.
361+ reopened_repo = self.reopen_repo_and_resume_write_group(repo)
362 self.assertEqual(
363 set([('inventories', 'rev-1')]),
364 reopened_repo.get_missing_parent_inventories())
365+ # Adding the parent inventory satisfies get_missing_parent_inventories.
366 reopened_repo.inventories.insert_record_stream(
367 branch_repo.inventories.get_record_stream(
368 [('rev-1',)], 'unordered', False))