Merge lp:~jameinel/bzr/2.0b1-stable-groupcompress-order into lp:bzr/2.0

Proposed by Vincent Ladeuil
Status: Merged
Merged at revision: not available
Proposed branch: lp:~jameinel/bzr/2.0b1-stable-groupcompress-order
Merge into: lp:bzr/2.0
Diff against target: 383 lines (has conflicts)
Text conflict in NEWS
To merge this branch: bzr merge lp:~jameinel/bzr/2.0b1-stable-groupcompress-order
Reviewer Review Type Date Requested Status
bzr-core Pending
Review via email: mp+11039@code.launchpad.net
To post a comment you must log in.
Revision history for this message
Vincent Ladeuil (vila) wrote :

I think it's important to have that patch merged in the 2.0 branch.

It allows tweaking the gc order.

Revision history for this message
Vincent Ladeuil (vila) wrote :

Since I made the merge proposal on behalf of John, I'm unclear about whether I should vote myself or if the proposal is enough to indicate that.

Anyway, if nobody objects, John, please merge.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'NEWS'
2--- NEWS 2009-08-31 00:25:33 +0000
3+++ NEWS 2009-09-02 07:35:10 +0000
4@@ -112,6 +112,7 @@
5 * ``bzr push`` locally on windows will no longer give a locking error with
6 dirstate based formats. (Robert Collins)
7
8+<<<<<<< TREE
9 * ``bzr shelve`` and ``bzr unshelve`` now work on windows.
10 (Robert Collins, #305006)
11
12@@ -126,6 +127,12 @@
13 fixes a number of causes for ``InconsistentDelta`` errors, and permits
14 faster commit of specific paths. (Robert Collins, #347649)
15
16+=======
17+* ``groupcompress`` sort order is now more stable, rather than relying on
18+ ``topo_sort`` ordering. The implementation is now
19+ ``KnownGraph.gc_sort``. (John Arbash Meinel)
20+
21+>>>>>>> MERGE-SOURCE
22 Documentation
23 *************
24
25
26=== modified file 'bzrlib/_known_graph_py.py'
27--- bzrlib/_known_graph_py.py 2009-08-17 20:41:26 +0000
28+++ bzrlib/_known_graph_py.py 2009-09-02 07:35:10 +0000
29@@ -97,6 +97,10 @@
30 return [node for node in self._nodes.itervalues()
31 if not node.parent_keys]
32
33+ def _find_tips(self):
34+ return [node for node in self._nodes.itervalues()
35+ if not node.child_keys]
36+
37 def _find_gdfo(self):
38 nodes = self._nodes
39 known_parent_gdfos = {}
40@@ -218,6 +222,51 @@
41 # We started from the parents, so we don't need to do anymore work
42 return topo_order
43
44+ def gc_sort(self):
45+ """Return a reverse topological ordering which is 'stable'.
46+
47+ There are a few constraints:
48+ 1) Reverse topological (all children before all parents)
49+ 2) Grouped by prefix
50+ 3) 'stable' sorting, so that we get the same result, independent of
51+ machine, or extra data.
52+ To do this, we use the same basic algorithm as topo_sort, but when we
53+ aren't sure what node to access next, we sort them lexicographically.
54+ """
55+ tips = self._find_tips()
56+ # Split the tips based on prefix
57+ prefix_tips = {}
58+ for node in tips:
59+ if node.key.__class__ is str or len(node.key) == 1:
60+ prefix = ''
61+ else:
62+ prefix = node.key[0]
63+ prefix_tips.setdefault(prefix, []).append(node)
64+
65+ num_seen_children = dict.fromkeys(self._nodes, 0)
66+
67+ result = []
68+ for prefix in sorted(prefix_tips):
69+ pending = sorted(prefix_tips[prefix], key=lambda n:n.key,
70+ reverse=True)
71+ while pending:
72+ node = pending.pop()
73+ if node.parent_keys is None:
74+ # Ghost node, skip it
75+ continue
76+ result.append(node.key)
77+ for parent_key in sorted(node.parent_keys, reverse=True):
78+ parent_node = self._nodes[parent_key]
79+ seen_children = num_seen_children[parent_key] + 1
80+ if seen_children == len(parent_node.child_keys):
81+ # All children have been processed, enqueue this parent
82+ pending.append(parent_node)
83+ # This has been queued up, stop tracking it
84+ del num_seen_children[parent_key]
85+ else:
86+ num_seen_children[parent_key] = seen_children
87+ return result
88+
89 def merge_sort(self, tip_key):
90 """Compute the merge sorted graph output."""
91 from bzrlib import tsort
92
93=== modified file 'bzrlib/_known_graph_pyx.pyx'
94--- bzrlib/_known_graph_pyx.pyx 2009-08-26 16:03:59 +0000
95+++ bzrlib/_known_graph_pyx.pyx 2009-09-02 07:35:10 +0000
96@@ -25,11 +25,18 @@
97 ctypedef struct PyObject:
98 pass
99
100+ int PyString_CheckExact(object)
101+
102+ int PyObject_RichCompareBool(object, object, int)
103+ int Py_LT
104+
105+ int PyTuple_CheckExact(object)
106 object PyTuple_New(Py_ssize_t n)
107 Py_ssize_t PyTuple_GET_SIZE(object t)
108 PyObject * PyTuple_GET_ITEM(object t, Py_ssize_t o)
109 void PyTuple_SET_ITEM(object t, Py_ssize_t o, object v)
110
111+ int PyList_CheckExact(object)
112 Py_ssize_t PyList_GET_SIZE(object l)
113 PyObject * PyList_GET_ITEM(object l, Py_ssize_t o)
114 int PyList_SetItem(object l, Py_ssize_t o, object l) except -1
115@@ -108,14 +115,65 @@
116 return <_KnownGraphNode>temp_node
117
118
119-cdef _KnownGraphNode _get_parent(parents, Py_ssize_t pos):
120+cdef _KnownGraphNode _get_tuple_node(tpl, Py_ssize_t pos):
121 cdef PyObject *temp_node
122- cdef _KnownGraphNode node
123
124- temp_node = PyTuple_GET_ITEM(parents, pos)
125+ temp_node = PyTuple_GET_ITEM(tpl, pos)
126 return <_KnownGraphNode>temp_node
127
128
129+def get_key(node):
130+ cdef _KnownGraphNode real_node
131+ real_node = node
132+ return real_node.key
133+
134+
135+cdef object _sort_list_nodes(object lst_or_tpl, int reverse):
136+ """Sort a list of _KnownGraphNode objects.
137+
138+ If lst_or_tpl is a list, it is allowed to mutate in place. It may also
139+ just return the input list if everything is already sorted.
140+ """
141+ cdef _KnownGraphNode node1, node2
142+ cdef int do_swap, is_tuple
143+ cdef Py_ssize_t length
144+
145+ is_tuple = PyTuple_CheckExact(lst_or_tpl)
146+ if not (is_tuple or PyList_CheckExact(lst_or_tpl)):
147+ raise TypeError('lst_or_tpl must be a list or tuple.')
148+ length = len(lst_or_tpl)
149+ if length == 0 or length == 1:
150+ return lst_or_tpl
151+ if length == 2:
152+ if is_tuple:
153+ node1 = _get_tuple_node(lst_or_tpl, 0)
154+ node2 = _get_tuple_node(lst_or_tpl, 1)
155+ else:
156+ node1 = _get_list_node(lst_or_tpl, 0)
157+ node2 = _get_list_node(lst_or_tpl, 1)
158+ if reverse:
159+ do_swap = PyObject_RichCompareBool(node1.key, node2.key, Py_LT)
160+ else:
161+ do_swap = PyObject_RichCompareBool(node2.key, node1.key, Py_LT)
162+ if not do_swap:
163+ return lst_or_tpl
164+ if is_tuple:
165+ return (node2, node1)
166+ else:
167+ # Swap 'in-place', since lists are mutable
168+ Py_INCREF(node1)
169+ PyList_SetItem(lst_or_tpl, 1, node1)
170+ Py_INCREF(node2)
171+ PyList_SetItem(lst_or_tpl, 0, node2)
172+ return lst_or_tpl
173+ # For all other sizes, we just use 'sorted()'
174+ if is_tuple:
175+ # Note that sorted() is just list(iterable).sort()
176+ lst_or_tpl = list(lst_or_tpl)
177+ lst_or_tpl.sort(key=get_key, reverse=reverse)
178+ return lst_or_tpl
179+
180+
181 cdef class _MergeSorter
182
183 cdef class KnownGraph:
184@@ -216,6 +274,19 @@
185 PyList_Append(tails, node)
186 return tails
187
188+ def _find_tips(self):
189+ cdef PyObject *temp_node
190+ cdef _KnownGraphNode node
191+ cdef Py_ssize_t pos
192+
193+ tips = []
194+ pos = 0
195+ while PyDict_Next(self._nodes, &pos, NULL, &temp_node):
196+ node = <_KnownGraphNode>temp_node
197+ if PyList_GET_SIZE(node.children) == 0:
198+ PyList_Append(tips, node)
199+ return tips
200+
201 def _find_gdfo(self):
202 cdef _KnownGraphNode node
203 cdef _KnownGraphNode child
204@@ -322,7 +393,7 @@
205 continue
206 if node.parents is not None and PyTuple_GET_SIZE(node.parents) > 0:
207 for pos from 0 <= pos < PyTuple_GET_SIZE(node.parents):
208- parent_node = _get_parent(node.parents, pos)
209+ parent_node = _get_tuple_node(node.parents, pos)
210 last_item = last_item + 1
211 if last_item < PyList_GET_SIZE(pending):
212 Py_INCREF(parent_node) # SetItem steals a ref
213@@ -397,6 +468,77 @@
214 # We started from the parents, so we don't need to do anymore work
215 return topo_order
216
217+ def gc_sort(self):
218+ """Return a reverse topological ordering which is 'stable'.
219+
220+ There are a few constraints:
221+ 1) Reverse topological (all children before all parents)
222+ 2) Grouped by prefix
223+ 3) 'stable' sorting, so that we get the same result, independent of
224+ machine, or extra data.
225+ To do this, we use the same basic algorithm as topo_sort, but when we
226+ aren't sure what node to access next, we sort them lexicographically.
227+ """
228+ cdef PyObject *temp
229+ cdef Py_ssize_t pos, last_item
230+ cdef _KnownGraphNode node, node2, parent_node
231+
232+ tips = self._find_tips()
233+ # Split the tips based on prefix
234+ prefix_tips = {}
235+ for pos from 0 <= pos < PyList_GET_SIZE(tips):
236+ node = _get_list_node(tips, pos)
237+ if PyString_CheckExact(node.key) or len(node.key) == 1:
238+ prefix = ''
239+ else:
240+ prefix = node.key[0]
241+ temp = PyDict_GetItem(prefix_tips, prefix)
242+ if temp == NULL:
243+ prefix_tips[prefix] = [node]
244+ else:
245+ tip_nodes = <object>temp
246+ PyList_Append(tip_nodes, node)
247+
248+ result = []
249+ for prefix in sorted(prefix_tips):
250+ temp = PyDict_GetItem(prefix_tips, prefix)
251+ assert temp != NULL
252+ tip_nodes = <object>temp
253+ pending = _sort_list_nodes(tip_nodes, 1)
254+ last_item = PyList_GET_SIZE(pending) - 1
255+ while last_item >= 0:
256+ node = _get_list_node(pending, last_item)
257+ last_item = last_item - 1
258+ if node.parents is None:
259+ # Ghost
260+ continue
261+ PyList_Append(result, node.key)
262+ # Sorting the parent keys isn't strictly necessary for stable
263+ # sorting of a given graph. But it does help minimize the
264+ # differences between graphs
265+ # For bzr.dev ancestry:
266+ # 4.73ms no sort
267+ # 7.73ms RichCompareBool sort
268+ parents = _sort_list_nodes(node.parents, 1)
269+ for pos from 0 <= pos < len(parents):
270+ if PyTuple_CheckExact(parents):
271+ parent_node = _get_tuple_node(parents, pos)
272+ else:
273+ parent_node = _get_list_node(parents, pos)
274+ # TODO: GraphCycle detection
275+ parent_node.seen = parent_node.seen + 1
276+ if (parent_node.seen
277+ == PyList_GET_SIZE(parent_node.children)):
278+ # All children have been processed, queue up this
279+ # parent
280+ last_item = last_item + 1
281+ if last_item < PyList_GET_SIZE(pending):
282+ Py_INCREF(parent_node) # SetItem steals a ref
283+ PyList_SetItem(pending, last_item, parent_node)
284+ else:
285+ PyList_Append(pending, parent_node)
286+ parent_node.seen = 0
287+ return result
288
289 def merge_sort(self, tip_key):
290 """Compute the merge sorted graph output."""
291@@ -522,7 +664,7 @@
292 raise RuntimeError('ghost nodes should not be pushed'
293 ' onto the stack: %s' % (node,))
294 if PyTuple_GET_SIZE(node.parents) > 0:
295- parent_node = _get_parent(node.parents, 0)
296+ parent_node = _get_tuple_node(node.parents, 0)
297 ms_node.left_parent = parent_node
298 if parent_node.parents is None: # left-hand ghost
299 ms_node.left_pending_parent = None
300@@ -532,7 +674,7 @@
301 if PyTuple_GET_SIZE(node.parents) > 1:
302 ms_node.pending_parents = []
303 for pos from 1 <= pos < PyTuple_GET_SIZE(node.parents):
304- parent_node = _get_parent(node.parents, pos)
305+ parent_node = _get_tuple_node(node.parents, pos)
306 if parent_node.parents is None: # ghost
307 continue
308 PyList_Append(ms_node.pending_parents, parent_node)
309
310=== modified file 'bzrlib/tests/test__known_graph.py'
311--- bzrlib/tests/test__known_graph.py 2009-08-26 16:03:59 +0000
312+++ bzrlib/tests/test__known_graph.py 2009-09-02 07:35:10 +0000
313@@ -768,3 +768,70 @@
314 },
315 'E',
316 [])
317+
318+
319+class TestKnownGraphStableReverseTopoSort(TestCaseWithKnownGraph):
320+ """Test the sort order returned by gc_sort."""
321+
322+ def assertSorted(self, expected, parent_map):
323+ graph = self.make_known_graph(parent_map)
324+ value = graph.gc_sort()
325+ if expected != value:
326+ self.assertEqualDiff(pprint.pformat(expected),
327+ pprint.pformat(value))
328+
329+ def test_empty(self):
330+ self.assertSorted([], {})
331+
332+ def test_single(self):
333+ self.assertSorted(['a'], {'a':()})
334+ self.assertSorted([('a',)], {('a',):()})
335+ self.assertSorted([('F', 'a')], {('F', 'a'):()})
336+
337+ def test_linear(self):
338+ self.assertSorted(['c', 'b', 'a'], {'a':(), 'b':('a',), 'c':('b',)})
339+ self.assertSorted([('c',), ('b',), ('a',)],
340+ {('a',):(), ('b',): (('a',),), ('c',): (('b',),)})
341+ self.assertSorted([('F', 'c'), ('F', 'b'), ('F', 'a')],
342+ {('F', 'a'):(), ('F', 'b'): (('F', 'a'),),
343+ ('F', 'c'): (('F', 'b'),)})
344+
345+ def test_mixed_ancestries(self):
346+ # Each prefix should be sorted separately
347+ self.assertSorted([('F', 'c'), ('F', 'b'), ('F', 'a'),
348+ ('G', 'c'), ('G', 'b'), ('G', 'a'),
349+ ('Q', 'c'), ('Q', 'b'), ('Q', 'a'),
350+ ],
351+ {('F', 'a'):(), ('F', 'b'): (('F', 'a'),),
352+ ('F', 'c'): (('F', 'b'),),
353+ ('G', 'a'):(), ('G', 'b'): (('G', 'a'),),
354+ ('G', 'c'): (('G', 'b'),),
355+ ('Q', 'a'):(), ('Q', 'b'): (('Q', 'a'),),
356+ ('Q', 'c'): (('Q', 'b'),),
357+ })
358+
359+ def test_stable_sorting(self):
360+ # the sort order should be stable even when extra nodes are added
361+ self.assertSorted(['b', 'c', 'a'],
362+ {'a':(), 'b':('a',), 'c':('a',)})
363+ self.assertSorted(['b', 'c', 'd', 'a'],
364+ {'a':(), 'b':('a',), 'c':('a',), 'd':('a',)})
365+ self.assertSorted(['b', 'c', 'd', 'a'],
366+ {'a':(), 'b':('a',), 'c':('a',), 'd':('a',)})
367+ self.assertSorted(['Z', 'b', 'c', 'd', 'a'],
368+ {'a':(), 'b':('a',), 'c':('a',), 'd':('a',),
369+ 'Z':('a',)})
370+ self.assertSorted(['e', 'b', 'c', 'f', 'Z', 'd', 'a'],
371+ {'a':(), 'b':('a',), 'c':('a',), 'd':('a',),
372+ 'Z':('a',),
373+ 'e':('b', 'c', 'd'),
374+ 'f':('d', 'Z'),
375+ })
376+
377+ def test_skip_ghost(self):
378+ self.assertSorted(['b', 'c', 'a'],
379+ {'a':(), 'b':('a', 'ghost'), 'c':('a',)})
380+
381+ def test_skip_mainline_ghost(self):
382+ self.assertSorted(['b', 'c', 'a'],
383+ {'a':(), 'b':('ghost', 'a'), 'c':('a',)})

Subscribers

People subscribed via source and target branches