Merge lp:~jameinel/bzr/2.0b1-stable-groupcompress-order into lp:bzr/2.0
- 2.0b1-stable-groupcompress-order
- Merge into 2.0
Proposed by
Vincent Ladeuil
Status: | Merged |
---|---|
Merged at revision: | not available |
Proposed branch: | lp:~jameinel/bzr/2.0b1-stable-groupcompress-order |
Merge into: | lp:bzr/2.0 |
Diff against target: |
383 lines (has conflicts)
Text conflict in NEWS |
To merge this branch: | bzr merge lp:~jameinel/bzr/2.0b1-stable-groupcompress-order |
Related bugs: |
Reviewer | Review Type | Date Requested | Status |
---|---|---|---|
bzr-core | Pending | ||
Review via email: mp+11039@code.launchpad.net |
Commit message
Description of the change
To post a comment you must log in.
Revision history for this message
Vincent Ladeuil (vila) wrote : | # |
Revision history for this message
Vincent Ladeuil (vila) wrote : | # |
Since I made the merge proposal on behalf of John, I'm unclear about whether I should vote myself or if the proposal is enough to indicate that.
Anyway, if nobody objects, John, please merge.
Preview Diff
[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1 | === modified file 'NEWS' |
2 | --- NEWS 2009-08-31 00:25:33 +0000 |
3 | +++ NEWS 2009-09-02 07:35:10 +0000 |
4 | @@ -112,6 +112,7 @@ |
5 | * ``bzr push`` locally on windows will no longer give a locking error with |
6 | dirstate based formats. (Robert Collins) |
7 | |
8 | +<<<<<<< TREE |
9 | * ``bzr shelve`` and ``bzr unshelve`` now work on windows. |
10 | (Robert Collins, #305006) |
11 | |
12 | @@ -126,6 +127,12 @@ |
13 | fixes a number of causes for ``InconsistentDelta`` errors, and permits |
14 | faster commit of specific paths. (Robert Collins, #347649) |
15 | |
16 | +======= |
17 | +* ``groupcompress`` sort order is now more stable, rather than relying on |
18 | + ``topo_sort`` ordering. The implementation is now |
19 | + ``KnownGraph.gc_sort``. (John Arbash Meinel) |
20 | + |
21 | +>>>>>>> MERGE-SOURCE |
22 | Documentation |
23 | ************* |
24 | |
25 | |
26 | === modified file 'bzrlib/_known_graph_py.py' |
27 | --- bzrlib/_known_graph_py.py 2009-08-17 20:41:26 +0000 |
28 | +++ bzrlib/_known_graph_py.py 2009-09-02 07:35:10 +0000 |
29 | @@ -97,6 +97,10 @@ |
30 | return [node for node in self._nodes.itervalues() |
31 | if not node.parent_keys] |
32 | |
33 | + def _find_tips(self): |
34 | + return [node for node in self._nodes.itervalues() |
35 | + if not node.child_keys] |
36 | + |
37 | def _find_gdfo(self): |
38 | nodes = self._nodes |
39 | known_parent_gdfos = {} |
40 | @@ -218,6 +222,51 @@ |
41 | # We started from the parents, so we don't need to do anymore work |
42 | return topo_order |
43 | |
44 | + def gc_sort(self): |
45 | + """Return a reverse topological ordering which is 'stable'. |
46 | + |
47 | + There are a few constraints: |
48 | + 1) Reverse topological (all children before all parents) |
49 | + 2) Grouped by prefix |
50 | + 3) 'stable' sorting, so that we get the same result, independent of |
51 | + machine, or extra data. |
52 | + To do this, we use the same basic algorithm as topo_sort, but when we |
53 | + aren't sure what node to access next, we sort them lexicographically. |
54 | + """ |
55 | + tips = self._find_tips() |
56 | + # Split the tips based on prefix |
57 | + prefix_tips = {} |
58 | + for node in tips: |
59 | + if node.key.__class__ is str or len(node.key) == 1: |
60 | + prefix = '' |
61 | + else: |
62 | + prefix = node.key[0] |
63 | + prefix_tips.setdefault(prefix, []).append(node) |
64 | + |
65 | + num_seen_children = dict.fromkeys(self._nodes, 0) |
66 | + |
67 | + result = [] |
68 | + for prefix in sorted(prefix_tips): |
69 | + pending = sorted(prefix_tips[prefix], key=lambda n:n.key, |
70 | + reverse=True) |
71 | + while pending: |
72 | + node = pending.pop() |
73 | + if node.parent_keys is None: |
74 | + # Ghost node, skip it |
75 | + continue |
76 | + result.append(node.key) |
77 | + for parent_key in sorted(node.parent_keys, reverse=True): |
78 | + parent_node = self._nodes[parent_key] |
79 | + seen_children = num_seen_children[parent_key] + 1 |
80 | + if seen_children == len(parent_node.child_keys): |
81 | + # All children have been processed, enqueue this parent |
82 | + pending.append(parent_node) |
83 | + # This has been queued up, stop tracking it |
84 | + del num_seen_children[parent_key] |
85 | + else: |
86 | + num_seen_children[parent_key] = seen_children |
87 | + return result |
88 | + |
89 | def merge_sort(self, tip_key): |
90 | """Compute the merge sorted graph output.""" |
91 | from bzrlib import tsort |
92 | |
93 | === modified file 'bzrlib/_known_graph_pyx.pyx' |
94 | --- bzrlib/_known_graph_pyx.pyx 2009-08-26 16:03:59 +0000 |
95 | +++ bzrlib/_known_graph_pyx.pyx 2009-09-02 07:35:10 +0000 |
96 | @@ -25,11 +25,18 @@ |
97 | ctypedef struct PyObject: |
98 | pass |
99 | |
100 | + int PyString_CheckExact(object) |
101 | + |
102 | + int PyObject_RichCompareBool(object, object, int) |
103 | + int Py_LT |
104 | + |
105 | + int PyTuple_CheckExact(object) |
106 | object PyTuple_New(Py_ssize_t n) |
107 | Py_ssize_t PyTuple_GET_SIZE(object t) |
108 | PyObject * PyTuple_GET_ITEM(object t, Py_ssize_t o) |
109 | void PyTuple_SET_ITEM(object t, Py_ssize_t o, object v) |
110 | |
111 | + int PyList_CheckExact(object) |
112 | Py_ssize_t PyList_GET_SIZE(object l) |
113 | PyObject * PyList_GET_ITEM(object l, Py_ssize_t o) |
114 | int PyList_SetItem(object l, Py_ssize_t o, object l) except -1 |
115 | @@ -108,14 +115,65 @@ |
116 | return <_KnownGraphNode>temp_node |
117 | |
118 | |
119 | -cdef _KnownGraphNode _get_parent(parents, Py_ssize_t pos): |
120 | +cdef _KnownGraphNode _get_tuple_node(tpl, Py_ssize_t pos): |
121 | cdef PyObject *temp_node |
122 | - cdef _KnownGraphNode node |
123 | |
124 | - temp_node = PyTuple_GET_ITEM(parents, pos) |
125 | + temp_node = PyTuple_GET_ITEM(tpl, pos) |
126 | return <_KnownGraphNode>temp_node |
127 | |
128 | |
129 | +def get_key(node): |
130 | + cdef _KnownGraphNode real_node |
131 | + real_node = node |
132 | + return real_node.key |
133 | + |
134 | + |
135 | +cdef object _sort_list_nodes(object lst_or_tpl, int reverse): |
136 | + """Sort a list of _KnownGraphNode objects. |
137 | + |
138 | + If lst_or_tpl is a list, it is allowed to mutate in place. It may also |
139 | + just return the input list if everything is already sorted. |
140 | + """ |
141 | + cdef _KnownGraphNode node1, node2 |
142 | + cdef int do_swap, is_tuple |
143 | + cdef Py_ssize_t length |
144 | + |
145 | + is_tuple = PyTuple_CheckExact(lst_or_tpl) |
146 | + if not (is_tuple or PyList_CheckExact(lst_or_tpl)): |
147 | + raise TypeError('lst_or_tpl must be a list or tuple.') |
148 | + length = len(lst_or_tpl) |
149 | + if length == 0 or length == 1: |
150 | + return lst_or_tpl |
151 | + if length == 2: |
152 | + if is_tuple: |
153 | + node1 = _get_tuple_node(lst_or_tpl, 0) |
154 | + node2 = _get_tuple_node(lst_or_tpl, 1) |
155 | + else: |
156 | + node1 = _get_list_node(lst_or_tpl, 0) |
157 | + node2 = _get_list_node(lst_or_tpl, 1) |
158 | + if reverse: |
159 | + do_swap = PyObject_RichCompareBool(node1.key, node2.key, Py_LT) |
160 | + else: |
161 | + do_swap = PyObject_RichCompareBool(node2.key, node1.key, Py_LT) |
162 | + if not do_swap: |
163 | + return lst_or_tpl |
164 | + if is_tuple: |
165 | + return (node2, node1) |
166 | + else: |
167 | + # Swap 'in-place', since lists are mutable |
168 | + Py_INCREF(node1) |
169 | + PyList_SetItem(lst_or_tpl, 1, node1) |
170 | + Py_INCREF(node2) |
171 | + PyList_SetItem(lst_or_tpl, 0, node2) |
172 | + return lst_or_tpl |
173 | + # For all other sizes, we just use 'sorted()' |
174 | + if is_tuple: |
175 | + # Note that sorted() is just list(iterable).sort() |
176 | + lst_or_tpl = list(lst_or_tpl) |
177 | + lst_or_tpl.sort(key=get_key, reverse=reverse) |
178 | + return lst_or_tpl |
179 | + |
180 | + |
181 | cdef class _MergeSorter |
182 | |
183 | cdef class KnownGraph: |
184 | @@ -216,6 +274,19 @@ |
185 | PyList_Append(tails, node) |
186 | return tails |
187 | |
188 | + def _find_tips(self): |
189 | + cdef PyObject *temp_node |
190 | + cdef _KnownGraphNode node |
191 | + cdef Py_ssize_t pos |
192 | + |
193 | + tips = [] |
194 | + pos = 0 |
195 | + while PyDict_Next(self._nodes, &pos, NULL, &temp_node): |
196 | + node = <_KnownGraphNode>temp_node |
197 | + if PyList_GET_SIZE(node.children) == 0: |
198 | + PyList_Append(tips, node) |
199 | + return tips |
200 | + |
201 | def _find_gdfo(self): |
202 | cdef _KnownGraphNode node |
203 | cdef _KnownGraphNode child |
204 | @@ -322,7 +393,7 @@ |
205 | continue |
206 | if node.parents is not None and PyTuple_GET_SIZE(node.parents) > 0: |
207 | for pos from 0 <= pos < PyTuple_GET_SIZE(node.parents): |
208 | - parent_node = _get_parent(node.parents, pos) |
209 | + parent_node = _get_tuple_node(node.parents, pos) |
210 | last_item = last_item + 1 |
211 | if last_item < PyList_GET_SIZE(pending): |
212 | Py_INCREF(parent_node) # SetItem steals a ref |
213 | @@ -397,6 +468,77 @@ |
214 | # We started from the parents, so we don't need to do anymore work |
215 | return topo_order |
216 | |
217 | + def gc_sort(self): |
218 | + """Return a reverse topological ordering which is 'stable'. |
219 | + |
220 | + There are a few constraints: |
221 | + 1) Reverse topological (all children before all parents) |
222 | + 2) Grouped by prefix |
223 | + 3) 'stable' sorting, so that we get the same result, independent of |
224 | + machine, or extra data. |
225 | + To do this, we use the same basic algorithm as topo_sort, but when we |
226 | + aren't sure what node to access next, we sort them lexicographically. |
227 | + """ |
228 | + cdef PyObject *temp |
229 | + cdef Py_ssize_t pos, last_item |
230 | + cdef _KnownGraphNode node, node2, parent_node |
231 | + |
232 | + tips = self._find_tips() |
233 | + # Split the tips based on prefix |
234 | + prefix_tips = {} |
235 | + for pos from 0 <= pos < PyList_GET_SIZE(tips): |
236 | + node = _get_list_node(tips, pos) |
237 | + if PyString_CheckExact(node.key) or len(node.key) == 1: |
238 | + prefix = '' |
239 | + else: |
240 | + prefix = node.key[0] |
241 | + temp = PyDict_GetItem(prefix_tips, prefix) |
242 | + if temp == NULL: |
243 | + prefix_tips[prefix] = [node] |
244 | + else: |
245 | + tip_nodes = <object>temp |
246 | + PyList_Append(tip_nodes, node) |
247 | + |
248 | + result = [] |
249 | + for prefix in sorted(prefix_tips): |
250 | + temp = PyDict_GetItem(prefix_tips, prefix) |
251 | + assert temp != NULL |
252 | + tip_nodes = <object>temp |
253 | + pending = _sort_list_nodes(tip_nodes, 1) |
254 | + last_item = PyList_GET_SIZE(pending) - 1 |
255 | + while last_item >= 0: |
256 | + node = _get_list_node(pending, last_item) |
257 | + last_item = last_item - 1 |
258 | + if node.parents is None: |
259 | + # Ghost |
260 | + continue |
261 | + PyList_Append(result, node.key) |
262 | + # Sorting the parent keys isn't strictly necessary for stable |
263 | + # sorting of a given graph. But it does help minimize the |
264 | + # differences between graphs |
265 | + # For bzr.dev ancestry: |
266 | + # 4.73ms no sort |
267 | + # 7.73ms RichCompareBool sort |
268 | + parents = _sort_list_nodes(node.parents, 1) |
269 | + for pos from 0 <= pos < len(parents): |
270 | + if PyTuple_CheckExact(parents): |
271 | + parent_node = _get_tuple_node(parents, pos) |
272 | + else: |
273 | + parent_node = _get_list_node(parents, pos) |
274 | + # TODO: GraphCycle detection |
275 | + parent_node.seen = parent_node.seen + 1 |
276 | + if (parent_node.seen |
277 | + == PyList_GET_SIZE(parent_node.children)): |
278 | + # All children have been processed, queue up this |
279 | + # parent |
280 | + last_item = last_item + 1 |
281 | + if last_item < PyList_GET_SIZE(pending): |
282 | + Py_INCREF(parent_node) # SetItem steals a ref |
283 | + PyList_SetItem(pending, last_item, parent_node) |
284 | + else: |
285 | + PyList_Append(pending, parent_node) |
286 | + parent_node.seen = 0 |
287 | + return result |
288 | |
289 | def merge_sort(self, tip_key): |
290 | """Compute the merge sorted graph output.""" |
291 | @@ -522,7 +664,7 @@ |
292 | raise RuntimeError('ghost nodes should not be pushed' |
293 | ' onto the stack: %s' % (node,)) |
294 | if PyTuple_GET_SIZE(node.parents) > 0: |
295 | - parent_node = _get_parent(node.parents, 0) |
296 | + parent_node = _get_tuple_node(node.parents, 0) |
297 | ms_node.left_parent = parent_node |
298 | if parent_node.parents is None: # left-hand ghost |
299 | ms_node.left_pending_parent = None |
300 | @@ -532,7 +674,7 @@ |
301 | if PyTuple_GET_SIZE(node.parents) > 1: |
302 | ms_node.pending_parents = [] |
303 | for pos from 1 <= pos < PyTuple_GET_SIZE(node.parents): |
304 | - parent_node = _get_parent(node.parents, pos) |
305 | + parent_node = _get_tuple_node(node.parents, pos) |
306 | if parent_node.parents is None: # ghost |
307 | continue |
308 | PyList_Append(ms_node.pending_parents, parent_node) |
309 | |
310 | === modified file 'bzrlib/tests/test__known_graph.py' |
311 | --- bzrlib/tests/test__known_graph.py 2009-08-26 16:03:59 +0000 |
312 | +++ bzrlib/tests/test__known_graph.py 2009-09-02 07:35:10 +0000 |
313 | @@ -768,3 +768,70 @@ |
314 | }, |
315 | 'E', |
316 | []) |
317 | + |
318 | + |
319 | +class TestKnownGraphStableReverseTopoSort(TestCaseWithKnownGraph): |
320 | + """Test the sort order returned by gc_sort.""" |
321 | + |
322 | + def assertSorted(self, expected, parent_map): |
323 | + graph = self.make_known_graph(parent_map) |
324 | + value = graph.gc_sort() |
325 | + if expected != value: |
326 | + self.assertEqualDiff(pprint.pformat(expected), |
327 | + pprint.pformat(value)) |
328 | + |
329 | + def test_empty(self): |
330 | + self.assertSorted([], {}) |
331 | + |
332 | + def test_single(self): |
333 | + self.assertSorted(['a'], {'a':()}) |
334 | + self.assertSorted([('a',)], {('a',):()}) |
335 | + self.assertSorted([('F', 'a')], {('F', 'a'):()}) |
336 | + |
337 | + def test_linear(self): |
338 | + self.assertSorted(['c', 'b', 'a'], {'a':(), 'b':('a',), 'c':('b',)}) |
339 | + self.assertSorted([('c',), ('b',), ('a',)], |
340 | + {('a',):(), ('b',): (('a',),), ('c',): (('b',),)}) |
341 | + self.assertSorted([('F', 'c'), ('F', 'b'), ('F', 'a')], |
342 | + {('F', 'a'):(), ('F', 'b'): (('F', 'a'),), |
343 | + ('F', 'c'): (('F', 'b'),)}) |
344 | + |
345 | + def test_mixed_ancestries(self): |
346 | + # Each prefix should be sorted separately |
347 | + self.assertSorted([('F', 'c'), ('F', 'b'), ('F', 'a'), |
348 | + ('G', 'c'), ('G', 'b'), ('G', 'a'), |
349 | + ('Q', 'c'), ('Q', 'b'), ('Q', 'a'), |
350 | + ], |
351 | + {('F', 'a'):(), ('F', 'b'): (('F', 'a'),), |
352 | + ('F', 'c'): (('F', 'b'),), |
353 | + ('G', 'a'):(), ('G', 'b'): (('G', 'a'),), |
354 | + ('G', 'c'): (('G', 'b'),), |
355 | + ('Q', 'a'):(), ('Q', 'b'): (('Q', 'a'),), |
356 | + ('Q', 'c'): (('Q', 'b'),), |
357 | + }) |
358 | + |
359 | + def test_stable_sorting(self): |
360 | + # the sort order should be stable even when extra nodes are added |
361 | + self.assertSorted(['b', 'c', 'a'], |
362 | + {'a':(), 'b':('a',), 'c':('a',)}) |
363 | + self.assertSorted(['b', 'c', 'd', 'a'], |
364 | + {'a':(), 'b':('a',), 'c':('a',), 'd':('a',)}) |
365 | + self.assertSorted(['b', 'c', 'd', 'a'], |
366 | + {'a':(), 'b':('a',), 'c':('a',), 'd':('a',)}) |
367 | + self.assertSorted(['Z', 'b', 'c', 'd', 'a'], |
368 | + {'a':(), 'b':('a',), 'c':('a',), 'd':('a',), |
369 | + 'Z':('a',)}) |
370 | + self.assertSorted(['e', 'b', 'c', 'f', 'Z', 'd', 'a'], |
371 | + {'a':(), 'b':('a',), 'c':('a',), 'd':('a',), |
372 | + 'Z':('a',), |
373 | + 'e':('b', 'c', 'd'), |
374 | + 'f':('d', 'Z'), |
375 | + }) |
376 | + |
377 | + def test_skip_ghost(self): |
378 | + self.assertSorted(['b', 'c', 'a'], |
379 | + {'a':(), 'b':('a', 'ghost'), 'c':('a',)}) |
380 | + |
381 | + def test_skip_mainline_ghost(self): |
382 | + self.assertSorted(['b', 'c', 'a'], |
383 | + {'a':(), 'b':('ghost', 'a'), 'c':('a',)}) |
I think it's important to have that patch merged in the 2.0 branch.
It allows tweaking the gc order.