Merge ~racb/usd-importer:changelog-parsing into usd-importer:master

Proposed by Robie Basak on 2017-05-23
Status: Rejected
Rejected by: Nish Aravamudan on 2017-08-04
Proposed branch: ~racb/usd-importer:changelog-parsing
Merge into: usd-importer:master
Diff against target: 425 lines (+192/-155)
1 file modified
gitubuntu/git_repository.py (+192/-155)
Reviewer Review Type Date Requested Status
Nish Aravamudan 2017-05-23 Disapprove on 2017-08-04
Review via email: mp+324476@code.launchpad.net

Description of the Change

Native changelog parsing, though still with shell fallback if required. Sharing this early; not fully tested. Needs discussion in relation to regression risk.

I have not yet tested "git ubuntu merge" at all. Imports appear to be working, but I have also not tested this fully yet.

To post a comment you must log in.
Nish Aravamudan (nacc) wrote :

Superseded/rebased in new merge.

review: Disapprove

Unmerged commits

87b2e97... by Robie Basak on 2017-05-23

Refactor changelog parsing

Replace the shell-based symlink workaround with a proper implementation
that uses pygit2 directly. This results in a significant speedup.

get_changelog_versions_from_treeish() previously returned (None, None)
if 'debian/changelog' was not in the supplied tree-ish. This was not
documented and happened implicitly due to the shell fallback behaviour
in parsing debian/changelog. get_heads_and_versions() appears to rely on
this behaviour, for example when asking for versions for the
'debian/pristine-tar' branch. This behaviour is made explicit in the
replacement function that uses the new changelog parsing.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1diff --git a/gitubuntu/git_repository.py b/gitubuntu/git_repository.py
2index 345ff81..7ed57aa 100644
3--- a/gitubuntu/git_repository.py
4+++ b/gitubuntu/git_repository.py
5@@ -2,10 +2,10 @@
6 ### XXX: is any of this data in lp already?
7
8 from copy import copy
9-from enum import Enum, unique
10-import functools
11+from functools import lru_cache
12 import logging
13 import os
14+import posixpath
15 import re
16 import shutil
17 from subprocess import CalledProcessError
18@@ -16,6 +16,7 @@ from gitubuntu.run import run, runq, decode_binary
19 try:
20 pkg = 'python3-debian'
21 import debian
22+ import debian.changelog
23 pkg = 'python3-pygit2'
24 import pygit2
25 except ImportError:
26@@ -23,33 +24,158 @@ except ImportError:
27 sys.exit(1)
28
29
30-def memoize(obj):
31- cache = obj.cache = {}
32+def _follow_symlinks_to_blob(repo, top_tree, search_path, _rel_tree=None, _rel_path=''):
33+ '''Recursively follow a path down a tree, following symlinks, to find blob
34+
35+ repo: pygit2.Repository object
36+ top_tree: pygit2.Tree object of the top of the tree structure
37+ search_path: '/'-separated path string of blob to find
38+ _rel_tree: (internal) which tree to look further into
39+ _rel_path: (internal) the path we are in so far
40+ '''
41+
42+ NORMAL_BLOB_MODES = set([
43+ pygit2.GIT_FILEMODE_BLOB,
44+ pygit2.GIT_FILEMODE_BLOB_EXECUTABLE,
45+ ])
46+
47+ _rel_tree = _rel_tree or top_tree
48+ head, tail = posixpath.split(search_path)
49+
50+ # A traditional functional split would put a single entry in head with tail
51+ # empty, but posixpath.split doesn't necessarily do this. Jiggle it round
52+ # to make it appear to have traditional semantics.
53+ if not head:
54+ head = tail
55+ tail = None
56+
57+ entry = _rel_tree[head]
58+ if entry.type == 'tree':
59+ return _follow_symlinks_to_blob(
60+ repo=repo,
61+ top_tree=top_tree,
62+ search_path=tail,
63+ _rel_tree=repo.get(entry.id),
64+ _rel_path=posixpath.join(_rel_path, head),
65+ )
66+ elif entry.type == 'blob' and entry.filemode == pygit2.GIT_FILEMODE_LINK:
67+ # Found a symlink. Start again from the top with adjustment for symlink
68+ # following
69+ search_path = posixpath.normpath(
70+ posixpath.join(_rel_path, repo.get(entry.id).data, tail)
71+ )
72+ return _follow_symlinks_to_blob(
73+ repo=repo,
74+ top_tree=top_tree,
75+ search_path=search_path,
76+ )
77+ elif entry.type == 'blob' and entry.filemode in NORMAL_BLOB_MODES:
78+ return repo.get(entry.id)
79+ else:
80+ # Found some special entry such as a "gitlink" (submodule entry)
81+ raise ValueError(
82+ "Found %r filemode %r looking for %r" %
83+ (entry, entry.filemode, posixpath.join(_rel_path, search_path))
84+ )
85+
86+
87+def follow_symlinks_to_blob(repo, treeish, path):
88+ return _follow_symlinks_to_blob(
89+ repo=repo,
90+ top_tree=treeish.peel(pygit2.Tree),
91+ search_path=posixpath.normpath(path),
92+ )
93+
94+
95+class Changelog:
96+ '''Representation of a debian/changelog file found inside a git tree-ish
97+
98+ Uses the debian.changelog module from python3-debian for parsing, but when
99+ this fails we fall back to grep/sed-based pattern matching automatically.
100+ '''
101+ def __init__(self, repo, treeish):
102+ '''
103+ repo: pygit2.Repository instance
104+ treeish: pygit2.Object subclass instance (must peel to pygit2.Tree)
105+ '''
106
107- @functools.wraps(obj)
108- def memoizer(*args, **kwargs):
109- key = str(args) + str(kwargs)
110- if key not in cache:
111- cache[key] = obj(*args, **kwargs)
112+ blob = follow_symlinks_to_blob(
113+ repo=repo,
114+ treeish=treeish,
115+ path='debian/changelog'
116+ )
117+ self._contents = blob.data
118+ self._changelog = debian.changelog.Changelog(self._contents)
119+ if not len(self._changelog.versions):
120+ # assume bad read, so fall back to shell later
121+ self._changelog = None
122+
123+ @lru_cache()
124+ def _shell(self, cmd):
125+ cp = run(
126+ cmd,
127+ input=self._contents,
128+ shell=True,
129+ env=self._env,
130+ quiet=True,
131+ )
132+ return decode_binary(cp.stdout).strip()
133+
134+ @property
135+ def version(self):
136+ if self._changelog:
137+ return str(self._changelog.versions[0])
138 else:
139- logging.debug("Cache hit on %s %s = %s", str(args),
140- str(kwargs),
141- cache[key][:30].replace("\n", "\\n") + "..")
142- return cache[key]
143- return memoizer
144+ shell_cmd = 'grep -m1 \'^\S\' | sed \'s/.*(\(.*\)).*/\\1/\''
145+ return self._shell(shell_cmd)
146
147+ @property
148+ def previous_version(self):
149+ if self._changelog:
150+ return str(self._changelog.versions[1])
151+ else:
152+ shell_cmd = 'grep -m1 \'^\S\' | tail -1 | sed \'s/.*(\(.*\)).*/\\1/\''
153+ return self._shell(shell_cmd)
154+
155+ @property
156+ def maintainer(self):
157+ if self._changelog:
158+ return self._changelog.author
159+ else:
160+ shell_cmd = 'grep -m1 \'^ --\' | sed \'s/ -- \(.*\) \(.*\)/\\1/\''
161+ return self._shell(shell_cmd)
162+
163+ @property
164+ def date(self):
165+ if self._changelog:
166+ return self._changelog.date
167+ else:
168+ shell_cmd = 'grep -m1 \'^ --\' | sed \'s/ -- \(.*\) \(.*\)/\\2/\''
169+ return self._shell(shell_cmd)
170+
171+ @property
172+ def all_versions(self):
173+ if self._changelog:
174+ return [str(v) for v in self._changelog.versions]
175+ else:
176+ shell_cmd = 'grep \'^\S\' | sed \'s/.*(\(.*\)).*/\\1/\''
177+ version_lines = self._shell(shell_cmd)
178+ return [
179+ v_stripped
180+ for v_stripped in (
181+ v.strip() for v in version_lines.splitlines()
182+ )
183+ if v_stripped
184+ ]
185+
186+ @property
187+ def distribution(self):
188+ if self._changelog:
189+ return self._changelog.distributions
190+ else:
191+ shell_cmd = 'grep -m1 \'^\S\' | sed \'s/.*\ .*\ \(.*\);.*/\\1/\''
192+ return self._shell(shell_cmd)
193
194-@unique
195-class ChangelogField(Enum):
196- """Trivial enum for specifying fields to extract from
197- debian/changelog
198- """
199- version = 1
200- previous_version = 2
201- maintainer = 3
202- date = 4
203- all_versions = 5
204- distribution = 6
205
206 def git_dep14_tag(version):
207 """Munge a version string according to taken from
208@@ -449,143 +575,64 @@ class GitUbuntuRepository:
209 def garbage_collect(self):
210 self.git_run(['gc'])
211
212- def extract_file_from_treeish(self, treeish, filename, outfile=None):
213+ def extract_file_from_treeish(self, treeish, filename):
214 """extract a file from @treeish to a local file
215
216 Arguments:
217 treeish - SHA1 of treeish
218 filename - file to extract from @treeish
219- outfile - name of file to copy @filename contents to, will be overwritten.
220- If None, will be a NamedTemporaryFile.
221- """
222- if outfile is None:
223- outfile = tempfile.NamedTemporaryFile()
224- else:
225- outfile = open(outfile, mode='w+')
226
227- cat_changelog_cmd = (
228- "echo %s:debian/changelog | "
229- "git cat-file --batch --follow-symlinks | "
230- "sed -n '1{/^[^ ]* blob/!{p;q1}};2,$p'"
231- % treeish
232+ Returns a NamedTemporaryFile that is flushed but not rewound.
233+ """
234+ blob = follow_symlinks_to_blob(
235+ self._local_repo,
236+ treeish=self._local_repo.get(treeish),
237+ path=filename,
238 )
239-
240- try:
241- cp = run(cat_changelog_cmd, shell=True, env=self._env,
242- stdout=outfile)
243- except CalledProcessError:
244- raise Exception('Unable to extract file')
245-
246+ outfile = tempfile.NamedTemporaryFile()
247+ outfile.write(blob.data)
248 outfile.flush()
249 return outfile
250
251- @memoize
252- def parse_changelog_field_in_treeish(self, treeish, field):
253- """Parse debian/changelog for specified field, using
254- dpkg-parsechangelog with fallback to shell munging
255-
256- dpkg-parsechangelog is preferentially used, however there have
257- been multiple cases where historical changelogs have not been
258- parseable. Fallback to a relatively dumb shell-based parsing for
259- relevant fields in that case. If that also fails, as a last
260- resort, see if any source patches are applicable.
261-
262- Arguments:
263- treeish -- SHA1 of treeish
264- field -- ChangelogField value corresponding to field to extract
265-
266- Returns:
267- A subprocess.CompletedProcess instance as returned by
268- subprocess.run
269- """
270- # in each case:
271- # parse_params is what is passed on to dpkg-parsechangelog
272- # shell_cmd is a suitable shell sequence to pipe the changelog
273- # through to achieve the same if dpkg-parsechangelog fails
274- if field is ChangelogField.version:
275- parse_params = '-n1 -SVersion'
276- shell_cmd = 'grep -m1 \'^\S\' | sed \'s/.*(\(.*\)).*/\\1/\''
277- elif field is ChangelogField.previous_version:
278- parse_params = '-n1 -o1 -SVersion'
279- shell_cmd = 'grep -m1 \'^\S\' | tail -1 | sed \'s/.*(\(.*\)).*/\\1/\''
280- elif field is ChangelogField.maintainer:
281- parse_params = '-SMaintainer'
282- shell_cmd = 'grep -m1 \'^ --\' | sed \'s/ -- \(.*\) \(.*\)/\\1/\''
283- elif field is ChangelogField.date:
284- parse_params = '-SDate'
285- shell_cmd = 'grep -m1 \'^ --\' | sed \'s/ -- \(.*\) \(.*\)/\\2/\''
286- elif field is ChangelogField.all_versions:
287- parse_params = '--format rfc822 -SVersion --all'
288- shell_cmd = 'grep \'^\S\' | sed \'s/.*(\(.*\)).*/\\1/\''
289- elif field is ChangelogField.distribution:
290- parse_params = '-SDistribution'
291- shell_cmd = 'grep -m1 \'^\S\' | sed \'s/.*\ .*\ \(.*\);.*/\\1/\''
292- else:
293- raise Exception('Unknown changelog field %s' % field)
294-
295- # We cannot use "git show" since it does not follow symlinks (LP:
296- # #1661092). Instead, use "git cat-file --batch --follow-symlinks" and
297- # parse the batch output (first line) to ensure that we get a blob
298- # rather than a symlink following failure. If we don't get a blob, then
299- # print what we got and exit 1.
300- cat_changelog_cmd = (
301- "echo %s:debian/changelog | "
302- "git cat-file --batch --follow-symlinks | "
303- "sed -n '1{/^[^ ]* blob/!{p;q1}};2,$p'"
304- % treeish
305- )
306-
307- try:
308- cp = run(
309- '%s | dpkg-parsechangelog -l- %s' %
310- (cat_changelog_cmd, parse_params),
311- shell=True, env=self._env, quiet=True)
312- except CalledProcessError:
313- try:
314- cp = run(
315- '%s | %s' % (cat_changelog_cmd, shell_cmd),
316- shell=True, env=self._env, quiet=True)
317- except CalledProcessError:
318- raise Exception('Unable to parse changelog')
319-
320- return decode_binary(cp.stdout).strip()
321+ @lru_cache()
322+ def get_changelog_from_treeish(self, treeish):
323+ return Changelog(self._local_repo, self._local_repo.get(treeish))
324
325 def get_changelog_versions_from_treeish(self, treeish):
326 """Extract current and prior versions from debian/changelog in a
327 given treeish
328- """
329- current_version = None
330- last_version = None
331- if treeish is not None:
332- try:
333- current_version = self.parse_changelog_field_in_treeish(
334- treeish, ChangelogField.version
335- )
336- last_version = self.parse_changelog_field_in_treeish(
337- treeish, ChangelogField.previous_version
338- )
339
340- except:
341- logging.exception('Cannot get changelog versions')
342- sys.exit(1)
343+ Returns (None, None) if the treeish supplied is None or if
344+ 'debian/changelog' does not exist in the treeish.
345+ """
346+ if treeish is None:
347+ return None, None
348
349- return (current_version, last_version)
350+ try:
351+ changelog = self.get_changelog_from_treeish(treeish)
352+ except KeyError:
353+ # If 'debian/changelog' does
354+ # not exist, then (None, None) is returned. KeyError propagates up
355+ # from Changelog's __init__.
356+ return None, None
357+ try:
358+ return changelog.version, changelog.previous_version
359+ except:
360+ logging.exception('Cannot get changelog versions')
361+ sys.exit(1)
362
363 def get_changelog_distribution_from_treeish(self, treeish):
364 """Extract targetted distribution from debian/changelog in a
365 given treeish
366 """
367- distribution = None
368- if treeish is not None:
369- try:
370- distribution = self.parse_changelog_field_in_treeish(
371- treeish, ChangelogField.distribution
372- )
373- except:
374- logging.exception('Cannot get changelog distribution')
375- sys.exit(1)
376+ if treeish is None:
377+ return None
378
379- return distribution
380+ try:
381+ return self.get_changelog_from_treeish(treeish).distribution
382+ except:
383+ logging.exception('Cannot get changelog distribution')
384+ sys.exit(1)
385
386 def get_heads_and_versions(self, head_prefix, namespace):
387 """Extract the last version in debian/changelog of all
388@@ -680,12 +727,9 @@ class GitUbuntuRepository:
389 def get_commit_authorship(self, ref, spi):
390 """Extract last debian/changelog entry's maintainer and date"""
391 try:
392- author = self.parse_changelog_field_in_treeish(
393- ref, ChangelogField.maintainer
394- )
395- date = self.parse_changelog_field_in_treeish(
396- ref, ChangelogField.date
397- )
398+ changelog = self.get_changelog_from_treeish(ref)
399+ author = changelog.maintainer
400+ date = changelog.date
401 except:
402 logging.exception('Cannot get commit authorship for %s' % ref)
403 sys.exit(1)
404@@ -833,19 +877,12 @@ class GitUbuntuRepository:
405 return debian.debian_support.version_compare(a, b)
406
407 def get_all_changelog_versions_from_treeish(self, treeish):
408+ changelog = self.get_changelog_from_treeish(treeish)
409 try:
410- lines = self.parse_changelog_field_in_treeish(
411- treeish, ChangelogField.all_versions
412- )
413+ return changelog.all_versions
414 except:
415 logging.exception('Cannot get all versions from changelog')
416 sys.exit(1)
417- versions = list()
418- for version in lines.splitlines():
419- version = version.strip()
420- if len(version) > 0:
421- versions.append(version)
422- return versions
423
424 def annotated_tag(self, tag_name, commitish, force, msg=None):
425 try:

Subscribers

People subscribed via source and target branches