Merge lp:~jameinel/bzr/1.16-chk-direct into lp:~bzr/bzr/trunk-old

Proposed by John A Meinel
Status: Merged
Merged at revision: not available
Proposed branch: lp:~jameinel/bzr/1.16-chk-direct
Merge into: lp:~bzr/bzr/trunk-old
Diff against target: 200 lines
To merge this branch: bzr merge lp:~jameinel/bzr/1.16-chk-direct
Reviewer        Review Type    Date Requested    Status
Martin Pool                                      Approve
Review via email: mp+7203@code.launchpad.net
John A Meinel (jameinel) wrote:

This is a bit more of an [RFC] than a strict [MERGE], but I figured it would be good to get some feedback on the route I'm taking.

With the attached patch (and most of my other proposed updates for commit with dev6-format repositories) I've managed to get the time for an initial commit of a MySQL repository down to roughly the same as the XML time.

For comparison (initial commit of a MySQL repository):

dev6 (unpatched)             15.5s
1.9 (xml)                    11.6s
dev6 + this patch            13.5s
dev6 + all proposed patches  11.6s

So this patch by itself saves approximately 2s, or 10-20% of the time for
initial commit.

The main change is to get away from calling CHKMap.map() for every object in
the commit, and instead to create a LeafNode and then _split() it.
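In outline, simplified from the chk_map.py hunk below (names are the patch's own; the real code also special-cases single-entry maps):

    # Old path (kept as _create_via_map): apply_delta() ends up calling
    # map() once per key, rebalancing the tree as it goes.
    delta = [(None, key, value) for key, value in initial_value.items()]
    root_key = result.apply_delta(delta)

    # New path (_create_directly): bulk-load a single LeafNode, then
    # _split() it once at the end if it outgrew maximum_size.
    node = LeafNode(search_key_func=search_key_func)
    node.set_maximum_size(maximum_size)
    node._items = dict(initial_value)
    if maximum_size and node._current_size() > maximum_size:
        prefix, node_details = node._split(store)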

This also includes my other change to improve InternalNode.map() performance
(_iter_nodes change: https://code.edge.launchpad.net/~jameinel/bzr/1.16-chkmap-updates/+merge/7113)

However, InternalNode.map() still has a lot of "did the LeafNode shrink"
overhead that we really don't care about for initial commit.

Hopefully someone will approve 7113 for me, and then I'll resubmit this, so we can see that it isn't a very big change.

Martin Pool (mbp) wrote:

I'm not very familiar with the chkmap code you're changing here, but I don't see any problems with this patch. Were you wanting to talk about any particular issue here, or is there a list thread about it?

review: Approve

Preview Diff

=== modified file 'bzrlib/chk_map.py'
--- bzrlib/chk_map.py 2009-06-15 14:49:27 +0000
+++ bzrlib/chk_map.py 2009-06-16 02:36:08 +0000
@@ -203,13 +203,57 @@
             multiple pages.
         :return: The root chk of the resulting CHKMap.
         """
-        result = CHKMap(store, None, search_key_func=search_key_func)
+        # root_key = klass._create_via_map(store, initial_value,
+        #     maximum_size=maximum_size, key_width=key_width,
+        #     search_key_func=search_key_func)
+        root_key = klass._create_directly(store, initial_value,
+            maximum_size=maximum_size, key_width=key_width,
+            search_key_func=search_key_func)
+        # if root_key != alt_root_key:
+        #     result1 = klass(store, root_key, search_key_func=search_key_func)
+        #     result2 = klass(store, alt_root_key,
+        #                     search_key_func=search_key_func)
+        #     import pdb; pdb.set_trace()
+        #     raise ValueError('Failed to serialize via leaf splitting.')
+        return root_key
+
+    @classmethod
+    def _create_via_map(klass, store, initial_value, maximum_size=0,
+                        key_width=1, search_key_func=None):
+        result = klass(store, None, search_key_func=search_key_func)
         result._root_node.set_maximum_size(maximum_size)
         result._root_node._key_width = key_width
         delta = []
         for key, value in initial_value.items():
             delta.append((None, key, value))
-        return result.apply_delta(delta)
+        root_key = result.apply_delta(delta)
+        return root_key
+
+    @classmethod
+    def _create_directly(klass, store, initial_value, maximum_size=0,
+                         key_width=1, search_key_func=None):
+        node = LeafNode(search_key_func=search_key_func)
+        node.set_maximum_size(maximum_size)
+        node._key_width = key_width
+        node._items = dict(initial_value)
+        node._raw_size = sum([node._key_value_len(key, value)
+                              for key, value in initial_value.iteritems()])
+        node._len = len(node._items)
+        node._compute_search_prefix()
+        node._compute_serialised_prefix()
+        if (node._len > 1
+            and maximum_size
+            and node._current_size() > maximum_size):
+            prefix, node_details = node._split(store)
+            if len(node_details) == 1:
+                raise AssertionError('Failed to split using node._split')
+            node = InternalNode(prefix, search_key_func=search_key_func)
+            node.set_maximum_size(maximum_size)
+            node._key_width = key_width
+            for split, subnode in node_details:
+                node.add_node(split, subnode)
+        keys = list(node.serialise(store))
+        return keys[-1]
 
     def iter_changes(self, basis):
         """Iterate over the changes between basis and self.
@@ -764,7 +808,19 @@
                 result[prefix] = node
             else:
                 node = result[prefix]
-            node.map(store, key, value)
+            sub_prefix, node_details = node.map(store, key, value)
+            if len(node_details) > 1:
+                if prefix != sub_prefix:
+                    # This node has been split and is now found via a different
+                    # path
+                    result.pop(prefix)
+                new_node = InternalNode(sub_prefix,
+                    search_key_func=self._search_key_func)
+                new_node.set_maximum_size(self._maximum_size)
+                new_node._key_width = self._key_width
+                for split, node in node_details:
+                    new_node.add_node(split, node)
+                result[prefix] = new_node
         return common_prefix, result.items()
 
     def map(self, store, key, value):

=== modified file 'bzrlib/repofmt/groupcompress_repo.py'
--- bzrlib/repofmt/groupcompress_repo.py 2009-06-12 01:11:00 +0000
+++ bzrlib/repofmt/groupcompress_repo.py 2009-06-16 02:36:08 +0000
@@ -674,6 +674,73 @@
         return self._inventory_add_lines(revision_id, parents,
             inv_lines, check_content=False)
 
+    def _get_null_inventory(self):
+        serializer = self._format._serializer
+        null_inv = inventory.CHKInventory(serializer.search_key_name)
+        search_key_func = chk_map.search_key_registry.get(
+            serializer.search_key_name)
+        null_inv.id_to_entry = chk_map.CHKMap(self.chk_bytes,
+            None, search_key_func)
+        null_inv.id_to_entry._root_node.set_maximum_size(
+            serializer.maximum_size)
+        null_inv.parent_id_basename_to_file_id = chk_map.CHKMap(
+            self.chk_bytes, None, search_key_func)
+        null_inv.parent_id_basename_to_file_id._root_node.set_maximum_size(
+            serializer.maximum_size)
+        null_inv.parent_id_basename_to_file_id._root_node._key_width = 2
+        null_inv.root_id = None
+        return null_inv
+
+    def _create_inv_from_null(self, delta, new_revision_id):
+        """This will mutate new_inv directly.
+
+        This is a simplified form of create_by_apply_delta which knows that all
+        the old values must be None, so everything is a create.
+        """
+        serializer = self._format._serializer
+        new_inv = inventory.CHKInventory(serializer.search_key_name)
+        new_inv.revision_id = new_revision_id
+
+        entry_to_bytes = new_inv._entry_to_bytes
+        id_to_entry_dict = {}
+        parent_id_basename_dict = {}
+        for old_path, new_path, file_id, entry in delta:
+            if old_path is not None:
+                raise ValueError('Invalid delta, somebody tried to delete %r'
+                                 ' from the NULL_REVISION'
+                                 % ((old_path, file_id),))
+            if new_path is None:
+                raise ValueError('Invalid delta, delta from NULL_REVISION has'
+                                 ' no new_path %r' % (file_id,))
+            # file id changes
+            if new_path == '':
+                new_inv.root_id = file_id
+                parent_id_basename_key = '', ''
+            else:
+                utf8_entry_name = entry.name.encode('utf-8')
+                parent_id_basename_key = (entry.parent_id, utf8_entry_name)
+            new_value = entry_to_bytes(entry)
+            # Create Caches?
+            ## new_inv._path_to_fileid_cache[new_path] = file_id
+            id_to_entry_dict[(file_id,)] = new_value
+            parent_id_basename_dict[parent_id_basename_key] = file_id
+
+        search_key_func = chk_map.search_key_registry.get(
+            serializer.search_key_name)
+        maximum_size = serializer.maximum_size
+        root_key = chk_map.CHKMap.from_dict(self.chk_bytes, id_to_entry_dict,
+            maximum_size=maximum_size, key_width=1,
+            search_key_func=search_key_func)
+        new_inv.id_to_entry = chk_map.CHKMap(self.chk_bytes, root_key,
+            search_key_func)
+        root_key = chk_map.CHKMap.from_dict(self.chk_bytes,
+            parent_id_basename_dict,
+            maximum_size=maximum_size, key_width=1,
+            search_key_func=search_key_func)
+        new_inv.parent_id_basename_to_file_id = chk_map.CHKMap(self.chk_bytes,
+            root_key, search_key_func)
+        return new_inv
+
     def add_inventory_by_delta(self, basis_revision_id, delta, new_revision_id,
                                parents, basis_inv=None, propagate_caches=False):
         """Add a new inventory expressed as a delta against another revision.
@@ -699,24 +766,29 @@
             repository format specific) of the serialized inventory, and the
             resulting inventory.
         """
-        if basis_revision_id == _mod_revision.NULL_REVISION:
-            return KnitPackRepository.add_inventory_by_delta(self,
-                basis_revision_id, delta, new_revision_id, parents)
         if not self.is_in_write_group():
             raise AssertionError("%r not in write group" % (self,))
         _mod_revision.check_not_reserved_id(new_revision_id)
-        basis_tree = self.revision_tree(basis_revision_id)
-        basis_tree.lock_read()
-        try:
-            if basis_inv is None:
+        basis_tree = None
+        if basis_inv is None:
+            if basis_revision_id == _mod_revision.NULL_REVISION:
+                new_inv = self._create_inv_from_null(delta, new_revision_id)
+                inv_lines = new_inv.to_lines()
+                return self._inventory_add_lines(new_revision_id, parents,
+                    inv_lines, check_content=False), new_inv
+            else:
+                basis_tree = self.revision_tree(basis_revision_id)
+                basis_tree.lock_read()
                 basis_inv = basis_tree.inventory
+        try:
             result = basis_inv.create_by_apply_delta(delta, new_revision_id,
                 propagate_caches=propagate_caches)
             inv_lines = result.to_lines()
             return self._inventory_add_lines(new_revision_id, parents,
                 inv_lines, check_content=False), result
         finally:
-            basis_tree.unlock()
+            if basis_tree is not None:
+                basis_tree.unlock()
 
     def _iter_inventories(self, revision_ids):
         """Iterate over many inventory objects."""