Merge lp:~jelmer/brz/fastexport-perf into lp:brz

Proposed by Jelmer Vernooij
Status: Merged
Approved by: Jelmer Vernooij
Approved revision: no longer in the source branch.
Merge reported by: The Breezy Bot
Merged at revision: not available
Proposed branch: lp:~jelmer/brz/fastexport-perf
Merge into: lp:brz
Diff against target: 304 lines (+91/-64)
3 files modified
breezy/git/git_remote_helper.py (+1/-0)
breezy/git/tests/test_git_remote_helper.py (+1/-1)
breezy/plugins/fastimport/exporter.py (+89/-63)
To merge this branch: bzr merge lp:~jelmer/brz/fastexport-perf
Reviewer Review Type Date Requested Status
Jelmer Vernooij Approve
Review via email: mp+378079@code.launchpad.net

Commit message

Attempt to batch operations when fastexporting a remote bzr repository.

Description of the change

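Attempt to batch operations when fast-exporting a remote bzr repository. Instead of fetching each revision and its trees one at a time, the exporter now walks the interesting history in chunks of REVISIONS_CHUNK_SIZE (1000) revisions: it loads each chunk's revision objects with a single Repository.iter_revisions call, requests the revision trees that chunk needs together via Repository.revision_trees, and keeps recently used trees in a small LRU cache (20 entries). The git remote helper additionally advertises the "refspec *:*" capability whenever it advertises "import".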

To post a comment you must log in.
Revision history for this message
Jelmer Vernooij (jelmer) :
review: Approve
Revision history for this message
The Breezy Bot (the-breezy-bot) wrote :

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'breezy/git/git_remote_helper.py'
2--- breezy/git/git_remote_helper.py 2019-03-04 07:14:58 +0000
3+++ breezy/git/git_remote_helper.py 2020-01-26 12:49:16 +0000
4@@ -59,6 +59,7 @@
5 pass
6 else:
7 CAPABILITIES.append("import")
8+ CAPABILITIES.append("refspec *:*")
9
10
11 def open_remote_dir(url):
12
13=== modified file 'breezy/git/tests/test_git_remote_helper.py'
14--- breezy/git/tests/test_git_remote_helper.py 2019-03-04 05:21:47 +0000
15+++ breezy/git/tests/test_git_remote_helper.py 2020-01-26 12:49:16 +0000
16@@ -136,7 +136,7 @@
17 self.helper.cmd_capabilities(f, [])
18 capabs = f.getvalue()
19 base = b"fetch\noption\npush\n"
20- self.assertTrue(capabs in (base + b"\n", base + b"import\n\n"), capabs)
21+ self.assertTrue(capabs in (base + b"\n", base + b"import\nrefspec *:*\n\n"), capabs)
22
23 def test_option(self):
24 f = BytesIO()
25
26=== modified file 'breezy/plugins/fastimport/exporter.py'
27--- breezy/plugins/fastimport/exporter.py 2019-12-26 13:20:45 +0000
28+++ breezy/plugins/fastimport/exporter.py 2020-01-26 12:49:16 +0000
29@@ -59,8 +59,9 @@
30 import breezy.revision
31 from ... import (
32 builtins,
33- errors as bazErrors,
34+ errors,
35 lazy_import,
36+ lru_cache,
37 osutils,
38 progress,
39 trace,
40@@ -81,6 +82,8 @@
41 from fastimport import commands
42 """)
43
44+REVISIONS_CHUNK_SIZE = 1000
45+
46
47 def _get_output_stream(destination):
48 if destination is None or destination == '-':
49@@ -188,6 +191,7 @@
50 self.rewrite_tags = rewrite_tags
51 self.no_tags = no_tags
52 self.baseline = baseline
53+ self.tree_cache = lru_cache.LRUCache(max_cache=20)
54 self._multi_author_api_available = hasattr(breezy.revision.Revision,
55 'get_apparent_authors')
56 self.properties_to_exclude = ['authors', 'author']
57@@ -214,8 +218,8 @@
58
59 def interesting_history(self):
60 if self.revision:
61- rev1, rev2 = builtins._get_revision_range(self.revision,
62- self.branch, "fast-export")
63+ rev1, rev2 = builtins._get_revision_range(
64+ self.revision, self.branch, "fast-export")
65 start_rev_id = rev1.rev_id
66 end_rev_id = rev2.rev_id
67 else:
68@@ -230,14 +234,37 @@
69 # revisions to exclude now ...
70 if start_rev_id is not None:
71 self.note("Calculating the revisions to exclude ...")
72- self.excluded_revisions = set([rev_id for rev_id, _, _, _ in
73- self.branch.iter_merge_sorted_revisions(start_rev_id)])
74+ self.excluded_revisions = set(
75+ [rev_id for rev_id, _, _, _ in self.branch.iter_merge_sorted_revisions(start_rev_id)])
76 if self.baseline:
77 # needed so the first relative commit knows its parent
78 self.excluded_revisions.remove(start_rev_id)
79 view_revisions.insert(0, start_rev_id)
80 return list(view_revisions)
81
82+ def emit_commits(self, interesting):
83+ if self.baseline:
84+ revobj = self.branch.repository.get_revision(interesting.pop(0))
85+ self.emit_baseline(revobj, self.ref)
86+ for i in range(0, len(interesting), REVISIONS_CHUNK_SIZE):
87+ chunk = interesting[i:i + REVISIONS_CHUNK_SIZE]
88+ history = dict(self.branch.repository.iter_revisions(chunk))
89+ trees_needed = set()
90+ trees = {}
91+ for revid in chunk:
92+ trees_needed.update(self.preprocess_commit(revid, history[revid], self.ref))
93+
94+ for tree in self._get_revision_trees(trees_needed):
95+ trees[tree.get_revision_id()] = tree
96+
97+ for revid in chunk:
98+ revobj = history[revid]
99+ if len(revobj.parent_ids) == 0:
100+ parent = breezy.revision.NULL_REVISION
101+ else:
102+ parent = revobj.parent_ids[0]
103+ self.emit_commit(revobj, self.ref, trees[parent], trees[revid])
104+
105 def run(self):
106 # Export the data
107 with self.branch.repository.lock_read():
108@@ -247,10 +274,7 @@
109 self._commit_total)
110 if not self.plain_format:
111 self.emit_features()
112- if self.baseline:
113- self.emit_baseline(interesting.pop(0), self.ref)
114- for revid in interesting:
115- self.emit_commit(revid, self.ref)
116+ self.emit_commits(interesting)
117 if self.branch.supports_tags() and not self.no_tags:
118 self.emit_tags()
119
120@@ -310,7 +334,7 @@
121 try:
122 if tree.kind(path) != 'directory':
123 return False
124- except bazErrors.NoSuchFile:
125+ except errors.NoSuchFile:
126 self.warning("Skipping empty_dir detection - no file_id for %s" %
127 (path,))
128 return False
129@@ -326,52 +350,52 @@
130 for feature in sorted(commands.FEATURE_NAMES):
131 self.print_cmd(commands.FeatureCommand(feature))
132
133- def emit_baseline(self, revid, ref):
134+ def emit_baseline(self, revobj, ref):
135 # Emit a full source tree of the first commit's parent
136- revobj = self.branch.repository.get_revision(revid)
137 mark = 1
138- self.revid_to_mark[revid] = mark
139- file_cmds = self._get_filecommands(
140- breezy.revision.NULL_REVISION, revid)
141+ self.revid_to_mark[revobj.revision_id] = mark
142+ tree_old = self.branch.repository.revision_tree(
143+ breezy.revision.NULL_REVISION)
144+ [tree_new] = list(self._get_revision_trees([revobj.revision_id]))
145+ file_cmds = self._get_filecommands(tree_old, tree_new)
146 self.print_cmd(commands.ResetCommand(ref, None))
147 self.print_cmd(self._get_commit_command(ref, mark, revobj, file_cmds))
148
149- def emit_commit(self, revid, ref):
150+ def preprocess_commit(self, revid, revobj, ref):
151 if revid in self.revid_to_mark or revid in self.excluded_revisions:
152 return
153-
154- # Get the Revision object
155- try:
156- revobj = self.branch.repository.get_revision(revid)
157- except bazErrors.NoSuchRevision:
158+ if revobj is None:
159 # This is a ghost revision. Mark it as not found and next!
160 self.revid_to_mark[revid] = -1
161 return
162-
163 # Get the primary parent
164 # TODO: Consider the excluded revisions when deciding the parents.
165 # Currently, a commit with parents that are excluded ought to be
166 # triggering the ref calculation below (and it is not).
167 # IGC 20090824
168- ncommits = len(self.revid_to_mark)
169- nparents = len(revobj.parent_ids)
170- if nparents == 0:
171+ if len(revobj.parent_ids) == 0:
172 parent = breezy.revision.NULL_REVISION
173 else:
174 parent = revobj.parent_ids[0]
175
176+ # Print the commit
177+ mark = len(self.revid_to_mark) + 1
178+ self.revid_to_mark[revobj.revision_id] = mark
179+
180+ return [parent, revobj.revision_id]
181+
182+ def emit_commit(self, revobj, ref, tree_old, tree_new):
183 # For parentless commits we need to issue reset command first, otherwise
184 # git-fast-import will assume previous commit was this one's parent
185- if nparents == 0:
186+ if tree_old.get_revision_id() == breezy.revision.NULL_REVISION:
187 self.print_cmd(commands.ResetCommand(ref, None))
188
189- # Print the commit
190- mark = ncommits + 1
191- self.revid_to_mark[revid] = mark
192- file_cmds = self._get_filecommands(parent, revid)
193+ file_cmds = self._get_filecommands(tree_old, tree_new)
194+ mark = self.revid_to_mark[revobj.revision_id]
195 self.print_cmd(self._get_commit_command(ref, mark, revobj, file_cmds))
196
197 # Report progress and checkpoint if it's time for that
198+ ncommits = len(self.revid_to_mark)
199 self.report_progress(ncommits)
200 if (self.checkpoint is not None and self.checkpoint > 0 and ncommits and
201 ncommits % self.checkpoint == 0):
202@@ -450,35 +474,34 @@
203 pass
204
205 # Build and return the result
206- return commands.CommitCommand(git_ref, mark, author_info,
207- committer_info, revobj.message.encode(
208- "utf-8"), from_, merges, file_cmds,
209- more_authors=more_author_info, properties=properties)
210-
211- def _get_revision_trees(self, parent, revision_id):
212- try:
213- tree_old = self.branch.repository.revision_tree(parent)
214- except bazErrors.UnexpectedInventoryFormat:
215- self.warning(
216- "Parent is malformed - diffing against previous parent")
217- # We can't find the old parent. Let's diff against his parent
218- pp = self.branch.repository.get_revision(parent)
219- tree_old = self.branch.repository.revision_tree(pp.parent_ids[0])
220- tree_new = None
221- try:
222- tree_new = self.branch.repository.revision_tree(revision_id)
223- except bazErrors.UnexpectedInventoryFormat:
224- # We can't really do anything anymore
225- self.warning("Revision %s is malformed - skipping" % revision_id)
226- return tree_old, tree_new
227-
228- def _get_filecommands(self, parent, revision_id):
229+ return commands.CommitCommand(
230+ git_ref, mark, author_info, committer_info,
231+ revobj.message.encode("utf-8"), from_, merges, file_cmds,
232+ more_authors=more_author_info, properties=properties)
233+
234+ def _get_revision_trees(self, revids):
235+ missing = []
236+ by_revid = {}
237+ for revid in revids:
238+ if revid == breezy.revision.NULL_REVISION:
239+ by_revid[revid] = self.branch.repository.revision_tree(revid)
240+ elif revid not in self.tree_cache:
241+ missing.append(revid)
242+
243+ for tree in self.branch.repository.revision_trees(missing):
244+ by_revid[tree.get_revision_id()] = tree
245+
246+ for revid in revids:
247+ try:
248+ yield self.tree_cache[revid]
249+ except KeyError:
250+ yield by_revid[revid]
251+
252+ for revid, tree in by_revid.items():
253+ self.tree_cache[revid] = tree
254+
255+ def _get_filecommands(self, tree_old, tree_new):
256 """Get the list of FileCommands for the changes between two revisions."""
257- tree_old, tree_new = self._get_revision_trees(parent, revision_id)
258- if not(tree_old and tree_new):
259- # Something is wrong with this revision - ignore the filecommands
260- return
261-
262 changes = tree_new.changes_from(tree_old)
263
264 my_modified = list(changes.modified)
265@@ -486,14 +509,15 @@
266 # The potential interaction between renames and deletes is messy.
267 # Handle it here ...
268 file_cmds, rd_modifies, renamed = self._process_renames_and_deletes(
269- changes.renamed, changes.removed, revision_id, tree_old)
270+ changes.renamed, changes.removed, tree_new.get_revision_id(), tree_old)
271
272 for cmd in file_cmds:
273 yield cmd
274
275 # Map kind changes to a delete followed by an add
276 for change in changes.kind_changed:
277- path = self._adjust_path_for_renames(path, renamed, revision_id)
278+ path = self._adjust_path_for_renames(
279+ path, renamed, tree_new.get_revision_id())
280 # IGC: I don't understand why a delete is needed here.
281 # In fact, it seems harmful? If you uncomment this line,
282 # please file a bug explaining why you needed to.
283@@ -523,8 +547,10 @@
284 else:
285 self.warning("cannot export '%s' of kind %s yet - ignoring" %
286 (change.path[1], change.kind[1]))
287- for (path, mode), chunks in tree_new.iter_files_bytes(
288- files_to_get):
289+
290+ # TODO(jelmer): Improve performance on remote repositories
291+ # by using Repository.iter_files_bytes for bzr repositories here.
292+ for (path, mode), chunks in tree_new.iter_files_bytes(files_to_get):
293 yield commands.FileModifyCommand(
294 path.encode("utf-8"), mode, None, b''.join(chunks))
295
296@@ -580,7 +606,7 @@
297
298 # Renaming a directory implies all children must be renamed.
299 # Note: changes_from() doesn't handle this
300- if kind == 'directory' and tree_old.kind(change.path[0]) == 'directory':
301+ if change.kind == ('directory', 'directory'):
302 for p, e in tree_old.iter_entries_by_dir(specific_files=[change.path[0]]):
303 if e.kind == 'directory' and self.plain_format:
304 continue

Subscribers

People subscribed via source and target branches