Merge lp:~jameinel/bzr/1.16-chk-direct into lp:~bzr/bzr/trunk-old

Proposed by John A Meinel
Status: Merged
Merged at revision: not available
Proposed branch: lp:~jameinel/bzr/1.16-chk-direct
Merge into: lp:~bzr/bzr/trunk-old
Diff against target: 200 lines
To merge this branch: bzr merge lp:~jameinel/bzr/1.16-chk-direct
Reviewer: Martin Pool
Review status: Approve
Review via email: mp+7203@code.launchpad.net
John A Meinel (jameinel) wrote:

This is a bit more of an [RFC] than a strict [MERGE], but I figured it would be good to get some feedback on the route I'm taking.

With the attached patch (and most of my other proposed updates for commit with dev6-format repositories) I've managed to get the time for the initial commit of a MySQL repository down to roughly the same as the XML (1.9) time.

For comparison:

  dev6      15.5s   (unpatched dev6 format)
  1.9       11.6s   (XML-based 1.9 format)
  dev6+     13.5s   (dev6 with this patch)
  dev6-all  11.6s   (dev6 with this patch plus the other proposed updates)

So this patch by itself saves approximately 2s (15.5s down to 13.5s), or about
13% of the time for initial commit.

The main change is to stop calling CHKMap.map() for every object in the
commit, and instead to build a single LeafNode and then _split() it.
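For context, the public entry point is CHKMap.from_dict(), whose interface is unchanged; only the internal strategy is swapped. A minimal sketch of the call (paraphrased from the diff below; 'store' and 'initial_value' are placeholders for a real VersionedFiles store and a dict of bytestring keys/values):

    from bzrlib import chk_map

    # Previously from_dict() routed every key through apply_delta()/map(),
    # re-checking node sizes and possible splits on each insertion (kept as
    # _create_via_map() in the patch for reference). Now _create_directly()
    # fills one LeafNode in bulk and calls _split() once at the end.
    root_key = chk_map.CHKMap.from_dict(
        store, initial_value,
        maximum_size=4096,  # example page-size limit, not a required value
        key_width=1,
        search_key_func=chk_map.search_key_registry.get('hash-16-way'))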

This also includes my other change to improve InternalNode.map() performance
(_iter_nodes change: https://code.edge.launchpad.net/~jameinel/bzr/1.16-chkmap-updates/+merge/7113)

However, InternalNode.map() still has a lot of "did the LeafNode shrink"
overhead that we really don't care about for an initial commit.
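Conceptually, the per-key cost being criticized looks like this (an illustrative sketch with invented helper names, not the actual InternalNode.map() code):

    # Illustrative only: pick_child, child_overflowed, child_shrank and
    # fold_children are invented names for the bookkeeping map() performs.
    def map_one_key(internal_node, store, key, value):
        child = pick_child(internal_node, key)
        child.map(store, key, value)
        if child_overflowed(child):
            # Needed: an insert can push a child past maximum_size.
            split_child(internal_node, child)
        elif child_shrank(child):
            # Wasted during an initial commit: keys are only ever added, so
            # a child can never shrink, yet the check runs for every key.
            fold_children(internal_node, child)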

Hopefully someone will approve 7113 for me, and then I'll resubmit this, so we can see that it isn't a very big change.

Martin Pool (mbp) wrote:

I'm not very familiar with the chkmap code you're changing here, but I don't see any problems with this patch. Were you wanting to talk about any particular issue here, or is there a list thread about it?

review: Approve

Preview Diff

=== modified file 'bzrlib/chk_map.py'
--- bzrlib/chk_map.py 2009-06-15 14:49:27 +0000
+++ bzrlib/chk_map.py 2009-06-16 02:36:08 +0000
@@ -203,13 +203,57 @@
             multiple pages.
         :return: The root chk of the resulting CHKMap.
         """
-        result = CHKMap(store, None, search_key_func=search_key_func)
+        # root_key = klass._create_via_map(store, initial_value,
+        #     maximum_size=maximum_size, key_width=key_width,
+        #     search_key_func=search_key_func)
+        root_key = klass._create_directly(store, initial_value,
+            maximum_size=maximum_size, key_width=key_width,
+            search_key_func=search_key_func)
+        # if root_key != alt_root_key:
+        #     result1 = klass(store, root_key, search_key_func=search_key_func)
+        #     result2 = klass(store, alt_root_key,
+        #                     search_key_func=search_key_func)
+        #     import pdb; pdb.set_trace()
+        #     raise ValueError('Failed to serialize via leaf splitting.')
+        return root_key
+
+    @classmethod
+    def _create_via_map(klass, store, initial_value, maximum_size=0,
+                        key_width=1, search_key_func=None):
+        result = klass(store, None, search_key_func=search_key_func)
         result._root_node.set_maximum_size(maximum_size)
         result._root_node._key_width = key_width
         delta = []
         for key, value in initial_value.items():
             delta.append((None, key, value))
-        return result.apply_delta(delta)
+        root_key = result.apply_delta(delta)
+        return root_key
+
+    @classmethod
+    def _create_directly(klass, store, initial_value, maximum_size=0,
+                         key_width=1, search_key_func=None):
+        node = LeafNode(search_key_func=search_key_func)
+        node.set_maximum_size(maximum_size)
+        node._key_width = key_width
+        node._items = dict(initial_value)
+        node._raw_size = sum([node._key_value_len(key, value)
+                              for key,value in initial_value.iteritems()])
+        node._len = len(node._items)
+        node._compute_search_prefix()
+        node._compute_serialised_prefix()
+        if (node._len > 1
+            and maximum_size
+            and node._current_size() > maximum_size):
+            prefix, node_details = node._split(store)
+            if len(node_details) == 1:
+                raise AssertionError('Failed to split using node._split')
+            node = InternalNode(prefix, search_key_func=search_key_func)
+            node.set_maximum_size(maximum_size)
+            node._key_width = key_width
+            for split, subnode in node_details:
+                node.add_node(split, subnode)
+        keys = list(node.serialise(store))
+        return keys[-1]
 
     def iter_changes(self, basis):
         """Iterate over the changes between basis and self.
@@ -764,7 +808,19 @@
                 result[prefix] = node
             else:
                 node = result[prefix]
-            node.map(store, key, value)
+            sub_prefix, node_details = node.map(store, key, value)
+            if len(node_details) > 1:
+                if prefix != sub_prefix:
+                    # This node has been split and is now found via a different
+                    # path
+                    result.pop(prefix)
+                new_node = InternalNode(sub_prefix,
+                    search_key_func=self._search_key_func)
+                new_node.set_maximum_size(self._maximum_size)
+                new_node._key_width = self._key_width
+                for split, node in node_details:
+                    new_node.add_node(split, node)
+                result[prefix] = new_node
         return common_prefix, result.items()
 
     def map(self, store, key, value):
=== modified file 'bzrlib/repofmt/groupcompress_repo.py'
--- bzrlib/repofmt/groupcompress_repo.py 2009-06-12 01:11:00 +0000
+++ bzrlib/repofmt/groupcompress_repo.py 2009-06-16 02:36:08 +0000
@@ -674,6 +674,73 @@
         return self._inventory_add_lines(revision_id, parents,
             inv_lines, check_content=False)
 
+    def _get_null_inventory(self):
+        serializer = self._format._serializer
+        null_inv = inventory.CHKInventory(serializer.search_key_name)
+        search_key_func = chk_map.search_key_registry.get(
+            serializer.search_key_name)
+        null_inv.id_to_entry = chk_map.CHKMap(self.chk_bytes,
+            None, search_key_func)
+        null_inv.id_to_entry._root_node.set_maximum_size(
+            serializer.maximum_size)
+        null_inv.parent_id_basename_to_file_id = chk_map.CHKMap(
+            self.chk_bytes, None, search_key_func)
+        null_inv.parent_id_basename_to_file_id._root_node.set_maximum_size(
+            serializer.maximum_size)
+        null_inv.parent_id_basename_to_file_id._root_node._key_width = 2
+        null_inv.root_id = None
+        return null_inv
+
+    def _create_inv_from_null(self, delta, new_revision_id):
+        """This will mutate new_inv directly.
+
+        This is a simplified form of create_by_apply_delta which knows that all
+        the old values must be None, so everything is a create.
+        """
+        serializer = self._format._serializer
+        new_inv = inventory.CHKInventory(serializer.search_key_name)
+        new_inv.revision_id = new_revision_id
+
+        entry_to_bytes = new_inv._entry_to_bytes
+        id_to_entry_dict = {}
+        parent_id_basename_dict = {}
+        for old_path, new_path, file_id, entry in delta:
+            if old_path is not None:
+                raise ValueError('Invalid delta, somebody tried to delete %r'
+                                 ' from the NULL_REVISION'
+                                 % ((old_path, file_id),))
+            if new_path is None:
+                raise ValueError('Invalid delta, delta from NULL_REVISION has'
+                                 ' no new_path %r' % (file_id,))
+            # file id changes
+            if new_path == '':
+                new_inv.root_id = file_id
+                parent_id_basename_key = '', ''
+            else:
+                utf8_entry_name = entry.name.encode('utf-8')
+                parent_id_basename_key = (entry.parent_id, utf8_entry_name)
+            new_value = entry_to_bytes(entry)
+            # Create Caches?
+            ## new_inv._path_to_fileid_cache[new_path] = file_id
+            id_to_entry_dict[(file_id,)] = new_value
+            parent_id_basename_dict[parent_id_basename_key] = file_id
+
+        search_key_func = chk_map.search_key_registry.get(
+            serializer.search_key_name)
+        maximum_size = serializer.maximum_size
+        root_key = chk_map.CHKMap.from_dict(self.chk_bytes, id_to_entry_dict,
+            maximum_size=maximum_size, key_width=1,
+            search_key_func=search_key_func)
+        new_inv.id_to_entry = chk_map.CHKMap(self.chk_bytes, root_key,
+            search_key_func)
+        root_key = chk_map.CHKMap.from_dict(self.chk_bytes,
+            parent_id_basename_dict,
+            maximum_size=maximum_size, key_width=1,
+            search_key_func=search_key_func)
+        new_inv.parent_id_basename_to_file_id = chk_map.CHKMap(self.chk_bytes,
+            root_key, search_key_func)
+        return new_inv
+
     def add_inventory_by_delta(self, basis_revision_id, delta, new_revision_id,
                                parents, basis_inv=None, propagate_caches=False):
         """Add a new inventory expressed as a delta against another revision.
@@ -699,24 +766,29 @@
             repository format specific) of the serialized inventory, and the
             resulting inventory.
         """
-        if basis_revision_id == _mod_revision.NULL_REVISION:
-            return KnitPackRepository.add_inventory_by_delta(self,
-                basis_revision_id, delta, new_revision_id, parents)
         if not self.is_in_write_group():
             raise AssertionError("%r not in write group" % (self,))
         _mod_revision.check_not_reserved_id(new_revision_id)
-        basis_tree = self.revision_tree(basis_revision_id)
-        basis_tree.lock_read()
-        try:
-            if basis_inv is None:
+        basis_tree = None
+        if basis_inv is None:
+            if basis_revision_id == _mod_revision.NULL_REVISION:
+                new_inv = self._create_inv_from_null(delta, new_revision_id)
+                inv_lines = new_inv.to_lines()
+                return self._inventory_add_lines(new_revision_id, parents,
+                    inv_lines, check_content=False), new_inv
+            else:
+                basis_tree = self.revision_tree(basis_revision_id)
+                basis_tree.lock_read()
                 basis_inv = basis_tree.inventory
+        try:
             result = basis_inv.create_by_apply_delta(delta, new_revision_id,
                 propagate_caches=propagate_caches)
             inv_lines = result.to_lines()
             return self._inventory_add_lines(new_revision_id, parents,
                 inv_lines, check_content=False), result
         finally:
-            basis_tree.unlock()
+            if basis_tree is not None:
+                basis_tree.unlock()
 
     def _iter_inventories(self, revision_ids):
         """Iterate over many inventory objects."""