On Wed, 2009-08-05 at 02:45 +0000, Andrew Bennetts wrote:
> === modified file 'NEWS'
>
> +* InterDifferingSerializer has been removed. The transformations it
> +  provided are now done automatically by StreamSource. (Andrew Bennetts)

^ has it?

> === modified file 'bzrlib/fetch.py'
> --- bzrlib/fetch.py	2009-07-09 08:59:51 +0000
> +++ bzrlib/fetch.py	2009-07-29 07:08:54 +0000
> @@ -249,20 +251,77 @@
>          # yet, and are unlikely to in non-rich-root environments anyway.
>          root_id_order.sort(key=operator.itemgetter(0))
>          # Create a record stream containing the roots to create.
> -        def yield_roots():
> -            for key in root_id_order:
> -                root_id, rev_id = key
> -                rev_parents = parent_map[rev_id]
> -                # We drop revision parents with different file-ids, because
> -                # that represents a rename of the root to a different location
> -                # - its not actually a parent for us. (We could look for that
> -                # file id in the revision tree at considerably more expense,
> -                # but for now this is sufficient (and reconcile will catch and
> -                # correct this anyway).
> -                # When a parent revision is a ghost, we guess that its root id
> -                # was unchanged (rather than trimming it from the parent list).
> -                parent_keys = tuple((root_id, parent) for parent in rev_parents
> -                    if parent != NULL_REVISION and
> -                        rev_id_to_root_id.get(parent, root_id) == root_id)
> -                yield FulltextContentFactory(key, parent_keys, None, '')
> -        return [('texts', yield_roots())]
> +        from bzrlib.graph import FrozenHeadsCache
> +        graph = FrozenHeadsCache(graph)
> +        new_roots_stream = _new_root_data_stream(
> +            root_id_order, rev_id_to_root_id, parent_map, self.source, graph)
> +        return [('texts', new_roots_stream)]
> +

These functions have a lot of parameters. Perhaps that would work better as
state on the Source? They need docs/more docs respectively.

> +def _new_root_data_stream(
> +    root_keys_to_create, rev_id_to_root_id_map, parent_map, repo, graph=None):
> ...
> +def _parent_keys_for_root_version(
> +    root_id, rev_id, rev_id_to_root_id_map, parent_map, repo, graph=None):

> === modified file 'bzrlib/help_topics/en/debug-flags.txt'
> --- bzrlib/help_topics/en/debug-flags.txt	2009-07-24 03:15:56 +0000
> +++ bzrlib/help_topics/en/debug-flags.txt	2009-08-05 02:05:43 +0000
> @@ -12,6 +12,7 @@
>                     operations.
>  -Dfetch            Trace history copying between repositories.
>  -Dfilters          Emit information for debugging content filtering.
> +-Dforceinvdeltas   Force use of inventory deltas during generic streaming fetch.
>  -Dgraph            Trace graph traversal.
>  -Dhashcache        Log every time a working file is read to determine its hash.
>  -Dhooks            Trace hook execution.
> @@ -26,3 +27,7 @@
>  -Dunlock           Some errors during unlock are treated as warnings.
>  -Dpack             Emit information about pack operations.
>  -Dsftp             Trace SFTP internals.
> +-Dstream           Trace fetch streams.
> +-DIDS:never        Never use InterDifferingSerializer when fetching.
> +-DIDS:always       Always use InterDifferingSerializer to fetch if appropriate
> +                   for the format, even for non-local fetches.

I'm a bit uncomfortable with all these debug flags. Are they really needed?
Do we anticipate asking people to use them? If not perhaps we shouldn't list
them, or should use environment variables.

> === modified file 'bzrlib/inventory_delta.py'
> --- bzrlib/inventory_delta.py	2009-04-02 05:53:12 +0000
> +++ bzrlib/inventory_delta.py	2009-08-05 02:30:11 +0000
>
> -    def __init__(self, versioned_root, tree_references):
> -        """Create an InventoryDeltaSerializer.
> +    def __init__(self):
> +        """Create an InventoryDeltaSerializer."""
> +        self._versioned_root = None
> +        self._tree_references = None
> +        self._entry_to_content = {
> +            'directory': _directory_content,
> +            'file': _file_content,
> +            'symlink': _link_content,
> +            }
> +
> +    def require_flags(self, versioned_root=None, tree_references=None):
> +        """Set the versioned_root and/or tree_references flags for this
> +        (de)serializer.

^ why is this not in the constructor? You make the fields settable only once,
which seems identical to being set from __init__, but harder to use (there's a
sketch of what I mean below, after the remote.py hunks). As it's required to
be called, it should be documented in the class or __init__ docstring or
something like that.

> @@ -221,10 +255,18 @@
>      def parse_text_bytes(self, bytes):
>          """Parse the text bytes of a serialized inventory delta.
>
> +        If versioned_root and/or tree_references flags were set via
> +        require_flags, then the parsed flags must match or a BzrError will be
> +        raised.
> +

... the changes to this function look like they do what they are intended to
do, but I don't understand why that's desirable or necessary: we always know
whether we want rich/nonrich etc.

> === modified file 'bzrlib/remote.py'
> --- bzrlib/remote.py	2009-07-30 04:27:05 +0000
> +++ bzrlib/remote.py	2009-08-05 00:07:14 +0000
> @@ -426,6 +426,7 @@
>          self._custom_format = None
>          self._network_name = None
>          self._creating_bzrdir = None
> +        self._supports_chks = None
>          self._supports_external_lookups = None
>          self._supports_tree_reference = None
>          self._rich_root_data = None
> @@ -443,6 +444,13 @@
>          return self._rich_root_data
>
>      @property
> +    def supports_chks(self):
> +        if self._supports_chks is None:
> +            self._ensure_real()
> +            self._supports_chks = self._custom_format.supports_chks
> +        return self._supports_chks
> +
> +    @property
>      def supports_external_lookups(self):
>          if self._supports_external_lookups is None:
>              self._ensure_real()
> @@ -579,6 +587,11 @@
>          self._ensure_real()
>          return self._custom_format._serializer
>
> +    @property
> +    def repository_class(self):
> +        self._ensure_real()
> +        return self._custom_format.repository_class
> +

^ this property sets off alarm bells for me. What's it for?

>  class RemoteStreamSink(repository.StreamSink):
>
> +    def __init__(self, target_repo):
> +        repository.StreamSink.__init__(self, target_repo)
> +

^ this isn't needed, as it's doing nothing more than an upcall.

>          path = target.bzrdir._path_for_remote_call(client)
> -        if not resume_tokens:
> -            # XXX: Ugly but important for correctness, *will* be fixed during
> -            # 1.13 cycle. Pushing a stream that is interrupted results in a
> -            # fallback to the _real_repositories sink *with a partial stream*.
> -            # Thats bad because we insert less data than bzr expected. To avoid
> -            # this we do a trial push to make sure the verb is accessible, and
> -            # do not fallback when actually pushing the stream. A cleanup patch
> -            # is going to look at rewinding/restarting the stream/partial
> -            # buffering etc.

^-- this isn't fixed, is it?

>  class RemoteStreamSource(repository.StreamSource):
>      """Stream data from a remote server."""
> @@ -1745,6 +1815,12 @@
>              sources.append(repo)
>          return self.missing_parents_chain(search, sources)
>
> +    def get_stream_for_missing_keys(self, missing_keys):
> +        self.from_repository._ensure_real()
> +        real_repo = self.from_repository._real_repository
> +        real_source = real_repo._get_source(self.to_format)
> +        return real_source.get_stream_for_missing_keys(missing_keys)

^ this fixes bug 406686. I don't think you mention that though in NEWS etc.
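To illustrate the constructor point above: the old __init__ already took these
flags, and keeping that shape looks simpler to me than require_flags. This is
only a rough sketch (it reuses the module-level helpers quoted in the patch,
and I haven't run it):

    class InventoryDeltaSerializer(object):

        def __init__(self, versioned_root, tree_references):
            """Create an InventoryDeltaSerializer.

            :param versioned_root: True if root entries are versioned.
            :param tree_references: True if tree-reference entries may occur.
            """
            self._versioned_root = versioned_root
            self._tree_references = tree_references
            self._entry_to_content = {
                'directory': _directory_content,
                'file': _file_content,
                'symlink': _link_content,
                }

Callers then just construct e.g. InventoryDeltaSerializer(True, True) up
front, and a deserializer that genuinely needs to discover the flags from the
bytes can be a separate class rather than a mutate-once object.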
> === modified file 'bzrlib/repofmt/groupcompress_repo.py'
> --- bzrlib/repofmt/groupcompress_repo.py	2009-07-14 17:33:13 +0000
> +++ bzrlib/repofmt/groupcompress_repo.py	2009-07-22 03:38:51 +0000
> @@ -154,6 +154,8 @@
>          self._writer.begin()
>          # what state is the pack in? (open, finished, aborted)
>          self._state = 'open'
> +        # no name until we finish writing the content
> +        self.name = None
>
>      def _check_references(self):
>          """Make sure our external references are present.
> @@ -466,6 +468,13 @@
>          if not self._use_pack(self.new_pack):
>              self.new_pack.abort()
>              return None
> +        self.new_pack.finish_content()
> +        if len(self.packs) == 1:
> +            old_pack = self.packs[0]
> +            if old_pack.name == self.new_pack._hash.hexdigest():
> +                # The single old pack was already optimally packed.
> +                self.new_pack.abort()
> +                return None

^ This isn't quite right. It solves the case of 'bzr pack; bzr pack' (and
there is a bug open on that too). However, it won't fix the case where a
single new pack is streamed, cross format, and also happens to be optimally
packed in the outcome. So, please file a bug on this additional case.

> @@ -879,14 +888,13 @@
>
>      def _get_source(self, to_format):
>          """Return a source for streaming from this repository."""
> -        if to_format.__class__ is self._format.__class__:
> +        if (to_format.supports_chks and
> +            self._format.repository_class is to_format.repository_class and
> +            self._format._serializer == to_format._serializer):

^- this change is wrong: repo format classes have constant serializers, so
that part of the check is overkill; and it requires a format to have a
constant repository class, which, if we'd needed that, we would have just
passed the repo class in instead. Or to say it differently: either pass the
repo class in, or don't test this.

>              # We must be exactly the same format, otherwise stuff like the chk
> -            # page layout might be different
> +            # page layout might be different.
> +            # Actually, this test is just slightly looser than exact so that
> +            # CHK2 <-> 2a transfers will work.
>              return GroupCHKStreamSource(self, to_format)

CHK2 <-> 2a have the same serializer. The test should just look at the
serializer, I suspect (see the sketch below).

> === modified file 'bzrlib/repofmt/pack_repo.py'
> --- bzrlib/repofmt/pack_repo.py	2009-07-01 10:42:14 +0000
> +++ bzrlib/repofmt/pack_repo.py	2009-07-16 02:10:20 +0000
> @@ -1567,7 +1574,7 @@
>          # determine which packs need changing
>          pack_operations = [[0, []]]
>          for pack in self.all_packs():
> -            if not hint or pack.name in hint:
> +            if hint is None or pack.name in hint:

^- the original form is actually faster, AFAIK, because it skips the 'in' test
for a hint of []. I'd rather we didn't change it, for all that it's not in a
common code path.

> @@ -2093,6 +2100,7 @@
>              # when autopack takes no steps, the names list is still
>              # unsaved.
>              return self._save_pack_names()
> +        return []

^- is this correctness or changing the signature? If it's the latter, update
the docstring perhaps?

> === modified file 'bzrlib/repository.py'
> --- bzrlib/repository.py	2009-08-04 16:20:05 +0000
> +++ bzrlib/repository.py	2009-08-05 02:37:11 +0000
> @@ -924,6 +925,11 @@
>          """
>          if self._write_group is not self.get_transaction():
>              # has an unlock or relock occured ?
> +            if suppress_errors:
> +                mutter(
> +                '(suppressed) mismatched lock context and write group. %r, %r',
> +                self._write_group, self.get_transaction())
> +                return

^- is suppress_errors defined globally?
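To make the _get_source point concrete, I'd expect the check to collapse to
something like this - a sketch only, written from memory of the surrounding
code rather than tested:

    def _get_source(self, to_format):
        """Return a source for streaming from this repository."""
        if self._format._serializer == to_format._serializer:
            # Formats sharing a serializer (e.g. CHK2 and 2a) lay out chk
            # pages the same way, so the optimised stream source is safe.
            return GroupCHKStreamSource(self, to_format)
        # Anything else goes through the generic source.
        return super(CHKInventoryRepository, self)._get_source(to_format)

That keeps the CHK2 <-> 2a case working without dragging repository_class
into the format API.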
> +    def _get_convertable_inventory_stream(self, revision_ids,
> +                                          delta_versus_null=False):
> +        # The source is using CHKs, but the target either doesn't or is has a
> +        # different serializer. The StreamSink code expects to be able to
> +        # convert on the target, so we need to put bytes-on-the-wire that can
> +        # be converted. That means inventory deltas (if the remote is <1.18,
> +        # RemoteStreamSink will fallback to VFS to insert the deltas).

^- broken English: 'or is has'. In fact, reread the entire comment ;).

> === modified file 'bzrlib/smart/repository.py'
> --- bzrlib/smart/repository.py	2009-06-16 06:46:32 +0000
> +++ bzrlib/smart/repository.py	2009-08-04 00:51:24 +0000
> @@ -414,8 +418,39 @@
>          repository.
>          """
>          self._to_format = network_format_registry.get(to_network_name)
> +        if self._should_fake_unknown():
> +            return FailedSmartServerResponse(
> +                ('UnknownMethod', 'Repository.get_stream'))
>          return None # Signal that we want a body.
>
> +    def _should_fake_unknown(self):

^- the long comment here could well be helped by a docstring. I think the
method name could be improved too - perhaps self._support_method(), or
self._permit_method(). Returning UnknownMethod is a special way to support it,
after all.

>
>
> +class SmartServerRepositoryInsertStream_1_18(SmartServerRepositoryInsertStreamLocked):
> +    """Insert a record stream from a RemoteSink into a repository.
> +
> +    Same as SmartServerRepositoryInsertStreamLocked, except:
> +     - the lock token argument is optional
> +     - servers that implement this verb accept 'inventory-delta' records in the
> +       stream.
> +
> +    New in 1.18.
> +    """
> +
> +    def do_repository_request(self, repository, resume_tokens, lock_token=None):
> +        """StreamSink.insert_stream for a remote repository."""
> +        SmartServerRepositoryInsertStreamLocked.do_repository_request(
> +            self, repository, resume_tokens, lock_token)

^ this is really odd -- you upcall, unconditionally, and do nothing else. Is
the reason just to make lock_token optional? If so, why not just change the
signature on SmartServerRepositoryInsertStreamLocked.do_repository_request?
Secondly, the base class will accept deltas too, AFAICT. So really, it's that
calling the new registered verb is a way to be sure that deltas will be
accepted. I'd be happier with just: register SSRISL twice in the registry (see
the sketch below), and document the improvements/easements. Note that we
will/should be locking always (at the api layer, though not necessarily the
network) for compatibility with knits - it's why we have SSRISLocked anyway.
So the change to make lock_token optional seems unneeded.

> === modified file 'bzrlib/tests/__init__.py'
> --- bzrlib/tests/__init__.py	2009-08-04 11:40:59 +0000
> +++ bzrlib/tests/__init__.py	2009-08-05 02:37:11 +0000
> @@ -1938,6 +1938,16 @@
>          sio.encoding = output_encoding
>          return sio
>
> +    def disable_verb(self, verb):
> +        """Disable a smart server verb for one test."""
> +        from bzrlib.smart import request
> +        request_handlers = request.request_handlers
> +        orig_method = request_handlers.get(verb)
> +        request_handlers.remove(verb)
> +        def restoreVerb():
> +            request_handlers.register(verb, orig_method)
> +        self.addCleanup(restoreVerb)

[aside] It would be nice to find a better management pattern for these
helpers.

The blackbox and unit test_push changes are identical; may be ripe for
refactoring to reduce duplication there.
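On the 'register SSRISL twice' suggestion, I mean something along these lines
at the bottom of bzrlib/smart/request.py - sketch only, and the verb names
here are illustrative rather than what the patch actually registers:

    request_handlers.register_lazy(
        'Repository.insert_stream_locked', 'bzrlib.smart.repository',
        'SmartServerRepositoryInsertStreamLocked')
    # The same handler class again, under the new verb name. A server that
    # answers this verb is thereby promising it accepts 'inventory-delta'
    # records; no subclass is needed to make that promise.
    request_handlers.register_lazy(
        'Repository.insert_stream_1.18', 'bzrlib.smart.repository',
        'SmartServerRepositoryInsertStreamLocked')

The client-visible difference is purely which verb name the server answers,
so the registry entry plus a docstring note seems enough.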
> === modified file 'bzrlib/tests/per_interrepository/__init__.py'
> --- bzrlib/tests/per_interrepository/__init__.py	2009-07-10 06:46:10 +0000
> +++ bzrlib/tests/per_interrepository/__init__.py	2009-07-16 02:13:41 +0000
> @@ -32,8 +32,6 @@
>      )
>
>  from bzrlib.repository import (
> -    InterDifferingSerializer,
> -    InterKnitRepo,
>      InterRepository,
>      )
>  from bzrlib.tests import (
> @@ -51,15 +49,13 @@
>          (interrepo_class, repository_format, repository_format_to).
>      """
>      result = []
> -    for interrepo_class, repository_format, repository_format_to in formats:
> -        id = '%s,%s,%s' % (interrepo_class.__name__,
> -                           repository_format.__class__.__name__,
> -                           repository_format_to.__class__.__name__)
> +    for repository_format, repository_format_to in formats:
> +        id = '%s,%s' % (repository_format.__class__.__name__,
> +                        repository_format_to.__class__.__name__)

^- This means that we can't specify a specific interrepo type in test
selection. I'm strongly against losing that facility.

>          scenario = (id,
>              {"transport_server": transport_server,
>               "transport_readonly_server": transport_readonly_server,
>               "repository_format": repository_format,
> -             "interrepo_class": interrepo_class,
>               "repository_format_to": repository_format_to,
>               })
>          result.append(scenario)
> @@ -68,8 +64,12 @@
>
>  def default_test_list():
>      """Generate the default list of interrepo permutations to test."""
> -    from bzrlib.repofmt import knitrepo, pack_repo, weaverepo
> +    from bzrlib.repofmt import (
> +        knitrepo, pack_repo, weaverepo, groupcompress_repo,
> +        )
>      result = []
> +    def add_combo(from_format, to_format):
> +        result.append((from_format, to_format))
>      # test the default InterRepository between format 6 and the current
>      # default format.
>      # XXX: robertc 20060220 reinstate this when there are two supported
> @@ -80,37 +80,33 @@
>      for optimiser_class in InterRepository._optimisers:
>          format_to_test = optimiser_class._get_repo_format_to_test()
>          if format_to_test is not None:
> -            result.append((optimiser_class,
> -                           format_to_test, format_to_test))
> +            add_combo(format_to_test, format_to_test)
>      # if there are specific combinations we want to use, we can add them
>      # here. We want to test rich root upgrading.
> -    result.append((InterRepository,
> -                   weaverepo.RepositoryFormat5(),
> -                   knitrepo.RepositoryFormatKnit3()))
> -    result.append((InterRepository,
> -                   knitrepo.RepositoryFormatKnit1(),
> -                   knitrepo.RepositoryFormatKnit3()))
> -    result.append((InterRepository,
> -                   knitrepo.RepositoryFormatKnit1(),
> -                   knitrepo.RepositoryFormatKnit3()))
> -    result.append((InterKnitRepo,
> -                   knitrepo.RepositoryFormatKnit1(),
> -                   pack_repo.RepositoryFormatKnitPack1()))
> -    result.append((InterKnitRepo,
> -                   pack_repo.RepositoryFormatKnitPack1(),
> -                   knitrepo.RepositoryFormatKnit1()))
> -    result.append((InterKnitRepo,
> -                   knitrepo.RepositoryFormatKnit3(),
> -                   pack_repo.RepositoryFormatKnitPack3()))
> -    result.append((InterKnitRepo,
> -                   pack_repo.RepositoryFormatKnitPack3(),
> -                   knitrepo.RepositoryFormatKnit3()))
> -    result.append((InterKnitRepo,
> -                   pack_repo.RepositoryFormatKnitPack3(),
> -                   pack_repo.RepositoryFormatKnitPack4()))
> -    result.append((InterDifferingSerializer,
> -                   pack_repo.RepositoryFormatKnitPack1(),
> -                   pack_repo.RepositoryFormatKnitPack6RichRoot()))
> +    add_combo(weaverepo.RepositoryFormat5(),
> +              knitrepo.RepositoryFormatKnit3())
> +    add_combo(knitrepo.RepositoryFormatKnit1(),
> +              knitrepo.RepositoryFormatKnit3())
> +    add_combo(knitrepo.RepositoryFormatKnit1(),
> +              pack_repo.RepositoryFormatKnitPack1())
> +    add_combo(pack_repo.RepositoryFormatKnitPack1(),
> +              knitrepo.RepositoryFormatKnit1())
> +    add_combo(knitrepo.RepositoryFormatKnit3(),
> +              pack_repo.RepositoryFormatKnitPack3())
> +    add_combo(pack_repo.RepositoryFormatKnitPack3(),
> +              knitrepo.RepositoryFormatKnit3())
> +    add_combo(pack_repo.RepositoryFormatKnitPack3(),
> +              pack_repo.RepositoryFormatKnitPack4())
> +    add_combo(pack_repo.RepositoryFormatKnitPack1(),
> +              pack_repo.RepositoryFormatKnitPack6RichRoot())
> +    add_combo(pack_repo.RepositoryFormatKnitPack6RichRoot(),
> +              groupcompress_repo.RepositoryFormat2a())
> +    add_combo(groupcompress_repo.RepositoryFormat2a(),
> +              pack_repo.RepositoryFormatKnitPack6RichRoot())
> +    add_combo(groupcompress_repo.RepositoryFormatCHK2(),
> +              groupcompress_repo.RepositoryFormat2a())
> +    add_combo(groupcompress_repo.RepositoryFormatCHK1(),
> +              groupcompress_repo.RepositoryFormat2a())

And I share John's concern about losing coverage with the above. We were
testing the same helper with different types before, and we're not able to do
that now. This doesn't seem coupled to your actual intent; perhaps it would be
better to just back out this change.

>
>
> +        if not unstacked_repo._format.supports_chks:
> +            # these assertions aren't valid for groupcompress repos, which may
> +            # transfer data than strictly necessary to avoid breaking up an
> +            # already-compressed block of data.
> +            self.assertFalse(unstacked_repo.has_revision('left'))
> +            self.assertFalse(unstacked_repo.has_revision('right'))

^ please check the comment, it's not quite clear.

> +    def assertCanStreamRevision(self, repo, revision_id):
> +        exclude_keys = set(repo.all_revision_ids()) - set([revision_id])
> +        search = SearchResult([revision_id], exclude_keys, 1, [revision_id])
> +        source = repo._get_source(repo._format)
> +        for substream_kind, substream in source.get_stream(search):
> +            # Consume the substream
> +            list(substream)

^ - this assertion, while nicely named, isn't strong enough for me. I'd like
it to assert that either a delta matching the local changes or a full
inventory is streamed - and that the newly referenced file texts are present.
There's a rough sketch of what I mean below.
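Roughly what I have in mind for a stronger check - sketch only: the record key
shapes and the inventory walk are from memory, and the 'new texts' computation
is just one way of getting at the texts introduced by revision_id:

    def assertCanStreamRevision(self, repo, revision_id):
        exclude_keys = set(repo.all_revision_ids()) - set([revision_id])
        search = SearchResult([revision_id], exclude_keys, 1, [revision_id])
        source = repo._get_source(repo._format)
        inv_keys = set()
        text_keys = set()
        for substream_kind, substream in source.get_stream(search):
            for record in substream:
                # Still consumes every substream, but remembers what went by.
                if substream_kind in ('inventories', 'inventory-deltas'):
                    inv_keys.add(record.key)
                elif substream_kind == 'texts':
                    text_keys.add(record.key)
        # Either a full inventory or a delta reconstructing it must be sent.
        self.assertTrue((revision_id,) in inv_keys)
        # ... and the file texts newly introduced by revision_id must be too.
        rev_tree = repo.revision_tree(revision_id)
        new_text_keys = set(
            (entry.file_id, entry.revision)
            for path, entry in rev_tree.inventory.iter_entries()
            if entry.kind == 'file' and entry.revision == revision_id)
        self.assertTrue(new_text_keys.issubset(text_keys))

Checking that a delta actually matches the local changes is harder; the above
at least pins down that the inventory and the new texts made it into the
stream.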
> === modified file 'bzrlib/tests/test_inventory_delta.py'
> --- bzrlib/tests/test_inventory_delta.py	2009-04-02 05:53:12 +0000
> +++ bzrlib/tests/test_inventory_delta.py	2009-08-05 02:30:11 +0000
> ...

The changes in this test script match the inventory_delta change - but as I'm
not convinced by that change they feel somewhat awkward - and incomplete, as
there are more dimensions for the object to vary in now.

> === modified file 'bzrlib/tests/test_remote.py'
> --- bzrlib/tests/test_remote.py	2009-07-30 04:27:05 +0000
> +++ bzrlib/tests/test_remote.py	2009-08-05 01:41:03 +0000
>
> +    """Tests for using Repository.insert_stream verb when the _1.18 variant is
> +    not available.
> +
> +    This test case is very similar to TestRepositoryInsertStream_1_18.
> +    """

Perhaps a base class would make this smaller - it is nearly identical.

>
> +    def test_stream_with_inventory_deltas(self):
> +        """'inventory-deltas' substreams can't be sent to the
> +        Repository.insert_stream verb. So when one is encountered the
> +        RemoteSink immediately stops using that verb and falls back to VFS
> +        insert_stream.
> +        """

^ this comment isn't strictly true. We *can* send them, to servers running
1.18. But not all servers will *accept* them, and that's why we want to be
sure we *won't* send them. I appreciate this is nitpicking, but we're encoding
knowledge about the system for later developers, and this is worth being clear
on, I think.

I'm not sure how to say 'needs fixing' in review mail, but that's what I'd
say. It's close though.