Merge lp:~spiv/bzr/all-referenced-texts-check into lp:~bzr/bzr/trunk-old

Proposed by Andrew Bennetts
Status: Merged
Merged at revision: not available
Proposed branch: lp:~spiv/bzr/all-referenced-texts-check
Merge into: lp:~bzr/bzr/trunk-old
Diff against target: 368 lines
To merge this branch: bzr merge lp:~spiv/bzr/all-referenced-texts-check
Reviewer: Robert Collins (community), status: Approve
Review via email: mp+6445@code.launchpad.net
Andrew Bennetts (spiv) wrote:

This branch fixes bug 368418.

It improves the check for missing parent inventories so that it passes as long as either:

 - all parent inventories are present, or
 - all texts referenced by the new inventories are present.

Either way, a usable revision delta for those revisions can be calculated, which
is what matters.
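
Concretely, the new check amounts to something like the minimal sketch below (plain Python 2 in bzrlib's style; this is an illustration of the logic only, and the function name and parameters are invented for the example rather than taken from the patch):

    # Simplified sketch, not the exact bzrlib code.  'referenced_texts' stands
    # in for the (file_id, revision_id) keys that
    # Repository.fileids_altered_by_revision_ids() reports for the newly added
    # inventories, and 'present_texts' for what repo.texts.get_parent_map()
    # found of them.
    def missing_parent_inventories(parents, present_inventories,
                                   referenced_texts, present_texts):
        if not parents.difference(present_inventories):
            # All parent inventories are present: nothing to report.
            return set()
        # Some parent inventories are absent; that is still acceptable as long
        # as every text referenced by the new inventories is present.
        if not referenced_texts.difference(present_texts):
            # A usable revision delta can still be calculated.
            return set()
        return set(('inventories', rev_id) for (rev_id,) in parents)

    # For example, a missing parent inventory whose referenced texts are all
    # present is not reported:
    print missing_parent_inventories(
        parents=set([('rev-1',)]), present_inventories={},
        referenced_texts=set([('file-id', 'rev-1')]),
        present_texts=set([('file-id', 'rev-1')]))
    # -> set([])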

Robert Collins (lifeless) wrote:

 review approve

Looks like a TODO:
11:28 < lifeless> + # Also, perf tests:
11:28 < lifeless> + # - if all invs present, then no texts are checked

Please either do it, or remove the comment.

review: Approve
Andrew Bennetts (spiv) wrote:

> Please either do it, or remove the comment.

Oops, removed. Thanks!

Preview Diff

=== modified file 'bzrlib/knit.py'
--- bzrlib/knit.py 2009-04-27 23:14:00 +0000
+++ bzrlib/knit.py 2009-05-12 02:35:10 +0000
@@ -2688,6 +2688,44 @@
         return key[:-1], key[-1]
 
 
+class _KeyRefs(object):
+
+    def __init__(self):
+        # dict mapping 'key' to 'set of keys referring to that key'
+        self.refs = {}
+
+    def add_references(self, key, refs):
+        # Record the new references
+        for referenced in refs:
+            try:
+                needed_by = self.refs[referenced]
+            except KeyError:
+                needed_by = self.refs[referenced] = set()
+            needed_by.add(key)
+        # Discard references satisfied by the new key
+        self.add_key(key)
+
+    def get_unsatisfied_refs(self):
+        return self.refs.iterkeys()
+
+    def add_key(self, key):
+        try:
+            del self.refs[key]
+        except KeyError:
+            # No keys depended on this key.  That's ok.
+            pass
+
+    def add_keys(self, keys):
+        for key in keys:
+            self.add_key(key)
+
+    def get_referrers(self):
+        result = set()
+        for referrers in self.refs.itervalues():
+            result.update(referrers)
+        return result
+
+
 class _KnitGraphIndex(object):
     """A KnitVersionedFiles index layered on GraphIndex."""
 
@@ -2723,9 +2761,9 @@
         self._is_locked = is_locked
         self._missing_compression_parents = set()
         if track_external_parent_refs:
-            self._external_parent_refs = set()
+            self._key_dependencies = _KeyRefs()
         else:
-            self._external_parent_refs = None
+            self._key_dependencies = None
 
     def __repr__(self):
         return "%s(%r)" % (self.__class__.__name__, self._graph_index)
@@ -2755,13 +2793,12 @@
 
         keys = {}
         compression_parents = set()
-        parent_refs = self._external_parent_refs
+        key_dependencies = self._key_dependencies
         for (key, options, access_memo, parents) in records:
             if self._parents:
                 parents = tuple(parents)
-                if parent_refs is not None:
-                    parent_refs.update(parents)
-                    parent_refs.discard(key)
+                if key_dependencies is not None:
+                    key_dependencies.add_references(key, parents)
             index, pos, size = access_memo
             if 'no-eol' in options:
                 value = 'N'
@@ -2829,12 +2866,11 @@
         new_missing = graph_index.external_references(ref_list_num=1)
         new_missing.difference_update(self.get_parent_map(new_missing))
         self._missing_compression_parents.update(new_missing)
-        if self._external_parent_refs is not None:
+        if self._key_dependencies is not None:
             # Add parent refs from graph_index (and discard parent refs that
             # the graph_index has).
             for node in graph_index.iter_all_entries():
-                self._external_parent_refs.update(node[3][0])
-                self._external_parent_refs.discard(node[1])
+                self._key_dependencies.add_references(node[1], node[3][0])
 
     def get_missing_compression_parents(self):
         """Return the keys of missing compression parents.
@@ -2847,9 +2883,9 @@
     def get_missing_parents(self):
         """Return the keys of missing parents."""
         # We may have false positives, so filter those out.
-        self._external_parent_refs.difference_update(
-            self.get_parent_map(self._external_parent_refs))
-        return frozenset(self._external_parent_refs)
+        self._key_dependencies.add_keys(
+            self.get_parent_map(self._key_dependencies.get_unsatisfied_refs()))
+        return frozenset(self._key_dependencies.get_unsatisfied_refs())
 
     def _check_read(self):
         """raise if reads are not permitted."""
 
=== modified file 'bzrlib/repository.py'
--- bzrlib/repository.py 2009-05-07 05:08:46 +0000
+++ bzrlib/repository.py 2009-05-12 02:35:10 +0000
@@ -1448,7 +1448,31 @@
         unstacked_inventories = self.inventories._index
         present_inventories = unstacked_inventories.get_parent_map(
             key[-1:] for key in parents)
-        parents.difference_update(present_inventories)
+        if len(parents.difference(present_inventories)) == 0:
+            # No missing parent inventories.
+            return set()
+        # Ok, now we have a list of missing inventories.  But these only matter
+        # if the inventories that reference them are missing some texts they
+        # appear to introduce.
+        # XXX: Texts referenced by all added inventories need to be present,
+        # but at the moment we're only checking for texts referenced by
+        # inventories at the graph's edge.
+        key_deps = self.revisions._index._key_dependencies
+        key_deps.add_keys(present_inventories)
+        referrers = frozenset(r[0] for r in key_deps.get_referrers())
+        file_ids = self.fileids_altered_by_revision_ids(referrers)
+        missing_texts = set()
+        for file_id, version_ids in file_ids.iteritems():
+            missing_texts.update(
+                (file_id, version_id) for version_id in version_ids)
+        present_texts = self.texts.get_parent_map(missing_texts)
+        missing_texts.difference_update(present_texts)
+        if not missing_texts:
+            # No texts are missing, so all revisions and their deltas are
+            # reconstructable.
+            return set()
+        # Alternatively the text versions could be returned as the missing
+        # keys, but this is likely to be less data.
         missing_keys = set(('inventories', rev_id) for (rev_id,) in parents)
         return missing_keys
 
@@ -3993,6 +4017,7 @@
     def _locked_insert_stream(self, stream, src_format):
         to_serializer = self.target_repo._format._serializer
         src_serializer = src_format._serializer
+        new_pack = None
         if to_serializer == src_serializer:
             # If serializers match and the target is a pack repository, set the
             # write cache size on the new pack. This avoids poor performance
@@ -4039,6 +4064,11 @@
                 self.target_repo.signatures.insert_record_stream(substream)
             else:
                 raise AssertionError('kaboom! %s' % (substream_type,))
+        # Done inserting data, and the missing_keys calculations will try to
+        # read back from the inserted data, so flush the writes to the new pack
+        # (if this is pack format).
+        if new_pack is not None:
+            new_pack._write_data('', flush=True)
         # Find all the new revisions (including ones from resume_tokens)
         missing_keys = self.target_repo.get_missing_parent_inventories()
         try:
 
=== modified file 'bzrlib/tests/per_repository/test_write_group.py'
--- bzrlib/tests/per_repository/test_write_group.py 2009-04-21 07:22:04 +0000
+++ bzrlib/tests/per_repository/test_write_group.py 2009-05-12 02:35:10 +0000
@@ -120,6 +120,8 @@
         if token is not None:
             repo.leave_lock_in_place()
 
+class TestGetMissingParentInventories(TestCaseWithRepository):
+
     def test_empty_get_missing_parent_inventories(self):
         """A new write group has no missing parent inventories."""
         repo = self.make_repository('.')
@@ -131,63 +133,50 @@
         repo.commit_write_group()
         repo.unlock()
 
-    def test_get_missing_parent_inventories(self):
-        # Make a trunk with one commit.
-        if isinstance(self.repository_format, remote.RemoteRepositoryFormat):
-            # RemoteRepository by default builds a default format real
-            # repository, but the default format is unstackble. So explicitly
-            # make a stackable real repository and use that.
-            repo = self.make_repository('trunk', format='1.9')
-            repo = bzrdir.BzrDir.open(self.get_url('trunk')).open_repository()
-        else:
-            repo = self.make_repository('trunk')
-        if not repo._format.supports_external_lookups:
-            raise TestNotApplicable('format not stackable')
-        repo.bzrdir._format.set_branch_format(BzrBranchFormat7())
+    def branch_trunk_and_make_tree(self, trunk_repo, relpath):
+        tree = self.make_branch_and_memory_tree('branch')
+        trunk_repo.lock_read()
+        self.addCleanup(trunk_repo.unlock)
+        tree.branch.repository.fetch(trunk_repo, revision_id='rev-1')
+        tree.set_parent_ids(['rev-1'])
+        return tree
+
+    def make_first_commit(self, repo):
         trunk = repo.bzrdir.create_branch()
-        trunk_repo = repo
         tree = memorytree.MemoryTree.create_on_branch(trunk)
         tree.lock_write()
-        if repo._format.rich_root_data:
-            # The tree needs a root
-            tree._inventory.add(InventoryDirectory('the-root-id', '', None))
+        tree.add([''], ['TREE_ROOT'], ['directory'])
+        tree.add(['dir'], ['dir-id'], ['directory'])
+        tree.add(['filename'], ['file-id'], ['file'])
+        tree.put_file_bytes_non_atomic('file-id', 'content\n')
+        tree.commit('Trunk commit', rev_id='rev-0')
         tree.commit('Trunk commit', rev_id='rev-1')
         tree.unlock()
-        # Branch the trunk, add a new commit.
-        tree = self.make_branch_and_tree('branch')
-        trunk_repo.lock_read()
-        self.addCleanup(trunk_repo.unlock)
-        tree.branch.repository.fetch(trunk_repo, revision_id='rev-1')
-        tree.set_parent_ids(['rev-1'])
+
+    def make_new_commit_in_new_repo(self, trunk_repo, parents=None):
+        tree = self.branch_trunk_and_make_tree(trunk_repo, 'branch')
+        tree.set_parent_ids(parents)
         tree.commit('Branch commit', rev_id='rev-2')
         branch_repo = tree.branch.repository
-        # Make a new repo stacked on trunk, and copy the new commit's revision
-        # and inventory records to it.
-        if isinstance(self.repository_format, remote.RemoteRepositoryFormat):
-            # RemoteRepository by default builds a default format real
-            # repository, but the default format is unstackble. So explicitly
-            # make a stackable real repository and use that.
-            repo = self.make_repository('stacked', format='1.9')
-            repo = bzrdir.BzrDir.open(self.get_url('stacked')).open_repository()
-        else:
-            repo = self.make_repository('stacked')
         branch_repo.lock_read()
         self.addCleanup(branch_repo.unlock)
-        repo.add_fallback_repository(trunk.repository)
-        repo.lock_write()
-        repo.start_write_group()
-        trunk_repo.lock_read()
-        repo.inventories.insert_record_stream(
-            branch_repo.inventories.get_record_stream(
-                [('rev-2',)], 'unordered', False))
-        repo.revisions.insert_record_stream(
-            branch_repo.revisions.get_record_stream(
-                [('rev-2',)], 'unordered', False))
-        self.assertEqual(
-            set([('inventories', 'rev-1')]),
-            repo.get_missing_parent_inventories())
-        # Revisions from resumed write groups can also cause missing parent
-        # inventories.
+        return branch_repo
+
+    def make_stackable_repo(self, relpath='trunk'):
+        if isinstance(self.repository_format, remote.RemoteRepositoryFormat):
+            # RemoteRepository by default builds a default format real
+            # repository, but the default format is unstackble. So explicitly
+            # make a stackable real repository and use that.
+            repo = self.make_repository(relpath, format='1.9')
+            repo = bzrdir.BzrDir.open(self.get_url(relpath)).open_repository()
+        else:
+            repo = self.make_repository(relpath)
+        if not repo._format.supports_external_lookups:
+            raise TestNotApplicable('format not stackable')
+        repo.bzrdir._format.set_branch_format(BzrBranchFormat7())
+        return repo
+
+    def reopen_repo_and_resume_write_group(self, repo):
         try:
             resume_tokens = repo.suspend_write_group()
         except errors.UnsuspendableWriteGroup:
@@ -201,9 +190,93 @@
         reopened_repo.lock_write()
         self.addCleanup(reopened_repo.unlock)
         reopened_repo.resume_write_group(resume_tokens)
+        return reopened_repo
+
+    def test_ghost_revision(self):
+        """A parent inventory may be absent if all the needed texts are present.
+        i.e., a ghost revision isn't (necessarily) considered to be a missing
+        parent inventory.
+        """
+        # Make a trunk with one commit.
+        trunk_repo = self.make_stackable_repo()
+        self.make_first_commit(trunk_repo)
+        trunk_repo.lock_read()
+        self.addCleanup(trunk_repo.unlock)
+        # Branch the trunk, add a new commit.
+        branch_repo = self.make_new_commit_in_new_repo(
+            trunk_repo, parents=['rev-1', 'ghost-rev'])
+        inv = branch_repo.get_inventory('rev-2')
+        # Make a new repo stacked on trunk, and then copy into it:
+        #  - all texts in rev-2
+        #  - the new inventory (rev-2)
+        #  - the new revision (rev-2)
+        repo = self.make_stackable_repo('stacked')
+        repo.lock_write()
+        repo.start_write_group()
+        # Add all texts from in rev-2 inventory.  Note that this has to exclude
+        # the root if the repo format does not support rich roots.
+        rich_root = branch_repo._format.rich_root_data
+        all_texts = [
+            (ie.file_id, ie.revision) for ie in inv.iter_just_entries()
+            if rich_root or inv.id2path(ie.file_id) != '']
+        repo.texts.insert_record_stream(
+            branch_repo.texts.get_record_stream(all_texts, 'unordered', False))
+        # Add inventory and revision for rev-2.
+        repo.add_inventory('rev-2', inv, ['rev-1', 'ghost-rev'])
+        repo.revisions.insert_record_stream(
+            branch_repo.revisions.get_record_stream(
+                [('rev-2',)], 'unordered', False))
+        # Now, no inventories are reported as missing, even though there is a
+        # ghost.
+        self.assertEqual(set(), repo.get_missing_parent_inventories())
+        # Resuming the write group does not affect
+        # get_missing_parent_inventories.
+        reopened_repo = self.reopen_repo_and_resume_write_group(repo)
+        self.assertEqual(set(), reopened_repo.get_missing_parent_inventories())
+        reopened_repo.abort_write_group()
+
+    def test_get_missing_parent_inventories(self):
+        """A stacked repo with a single revision and inventory (no parent
+        inventory) in it must have all the texts in its inventory (even if not
+        changed w.r.t. to the absent parent), otherwise it will report missing
+        texts/parent inventory.
+
+        The core of this test is that a file was changed in rev-1, but in a
+        stacked repo that only has rev-2
+        """
+        # Make a trunk with one commit.
+        trunk_repo = self.make_stackable_repo()
+        self.make_first_commit(trunk_repo)
+        trunk_repo.lock_read()
+        self.addCleanup(trunk_repo.unlock)
+        # Branch the trunk, add a new commit.
+        branch_repo = self.make_new_commit_in_new_repo(
+            trunk_repo, parents=['rev-1'])
+        inv = branch_repo.get_inventory('rev-2')
+        # Make a new repo stacked on trunk, and copy the new commit's revision
+        # and inventory records to it.
+        repo = self.make_stackable_repo('stacked')
+        repo.lock_write()
+        repo.start_write_group()
+        # Insert a single fulltext inv (using add_inventory because it's
+        # simpler than insert_record_stream)
+        repo.add_inventory('rev-2', inv, ['rev-1'])
+        repo.revisions.insert_record_stream(
+            branch_repo.revisions.get_record_stream(
+                [('rev-2',)], 'unordered', False))
+        # There should be no missing compression parents
+        self.assertEqual(set(),
+            repo.inventories.get_missing_compression_parent_keys())
+        self.assertEqual(
+            set([('inventories', 'rev-1')]),
+            repo.get_missing_parent_inventories())
+        # Resuming the write group does not affect
+        # get_missing_parent_inventories.
+        reopened_repo = self.reopen_repo_and_resume_write_group(repo)
         self.assertEqual(
             set([('inventories', 'rev-1')]),
             reopened_repo.get_missing_parent_inventories())
+        # Adding the parent inventory satisfies get_missing_parent_inventories.
         reopened_repo.inventories.insert_record_stream(
             branch_repo.inventories.get_record_stream(
                 [('rev-1',)], 'unordered', False))