Merge ~racb/usd-importer:changelog-parsing into usd-importer:master
- Git
- lp:~racb/usd-importer
- changelog-parsing
- Merge into master
| Status: | Rejected |
|---|---|
| Rejected by: | Nish Aravamudan on 2017-08-04 |
| Proposed branch: | ~racb/usd-importer:changelog-parsing |
| Merge into: | usd-importer:master |
| Diff against target: |
425 lines (+192/-155) 1 file modified
gitubuntu/git_repository.py (+192/-155) |
| Related bugs: |
| Reviewer | Review Type | Date Requested | Status |
|---|---|---|---|
| Nish Aravamudan | 2017-05-23 | Disapprove on 2017-08-04 | |
|
Review via email:
|
|||
Commit Message
Description of the Change
Native changelog parsing, though still with shell fallback if required. Sharing this early; not fully tested. Needs discussion in relation to regression risk.
I have not yet tested "git ubuntu merge" at all. Imports appear to be working, but I have also not tested this fully yet.
Unmerged commits
- 87b2e97... by Robie Basak on 2017-05-23
-
Refactor changelog parsing
Replace the shell-based symlink workaround with a proper implementation
that uses pygit2 directly. This results in a significant speedup.get_changelog_
versions_ from_treeish( ) previously returned (None, None)
if 'debian/changelog' was not in the supplied tree-ish. This was not
documented and happened implicitly due to the shell fallback behaviour
in parsing debian/changelog. get_heads_and_versions( ) appears to rely on
this behaviour, for example when asking for versions for the
'debian/pristine- tar' branch. This behaviour is made explicit in the
replacement function that uses the new changelog parsing.
Preview Diff
| 1 | diff --git a/gitubuntu/git_repository.py b/gitubuntu/git_repository.py |
| 2 | index 345ff81..7ed57aa 100644 |
| 3 | --- a/gitubuntu/git_repository.py |
| 4 | +++ b/gitubuntu/git_repository.py |
| 5 | @@ -2,10 +2,10 @@ |
| 6 | ### XXX: is any of this data in lp already? |
| 7 | |
| 8 | from copy import copy |
| 9 | -from enum import Enum, unique |
| 10 | -import functools |
| 11 | +from functools import lru_cache |
| 12 | import logging |
| 13 | import os |
| 14 | +import posixpath |
| 15 | import re |
| 16 | import shutil |
| 17 | from subprocess import CalledProcessError |
| 18 | @@ -16,6 +16,7 @@ from gitubuntu.run import run, runq, decode_binary |
| 19 | try: |
| 20 | pkg = 'python3-debian' |
| 21 | import debian |
| 22 | + import debian.changelog |
| 23 | pkg = 'python3-pygit2' |
| 24 | import pygit2 |
| 25 | except ImportError: |
| 26 | @@ -23,33 +24,158 @@ except ImportError: |
| 27 | sys.exit(1) |
| 28 | |
| 29 | |
| 30 | -def memoize(obj): |
| 31 | - cache = obj.cache = {} |
| 32 | +def _follow_symlinks_to_blob(repo, top_tree, search_path, _rel_tree=None, _rel_path=''): |
| 33 | + '''Recursively follow a path down a tree, following symlinks, to find blob |
| 34 | + |
| 35 | + repo: pygit2.Repository object |
| 36 | + top_tree: pygit2.Tree object of the top of the tree structure |
| 37 | + search_path: '/'-separated path string of blob to find |
| 38 | + _rel_tree: (internal) which tree to look further into |
| 39 | + _rel_path: (internal) the path we are in so far |
| 40 | + ''' |
| 41 | + |
| 42 | + NORMAL_BLOB_MODES = set([ |
| 43 | + pygit2.GIT_FILEMODE_BLOB, |
| 44 | + pygit2.GIT_FILEMODE_BLOB_EXECUTABLE, |
| 45 | + ]) |
| 46 | + |
| 47 | + _rel_tree = _rel_tree or top_tree |
| 48 | + head, tail = posixpath.split(search_path) |
| 49 | + |
| 50 | + # A traditional functional split would put a single entry in head with tail |
| 51 | + # empty, but posixpath.split doesn't necessarily do this. Jiggle it round |
| 52 | + # to make it appear to have traditional semantics. |
| 53 | + if not head: |
| 54 | + head = tail |
| 55 | + tail = None |
| 56 | + |
| 57 | + entry = _rel_tree[head] |
| 58 | + if entry.type == 'tree': |
| 59 | + return _follow_symlinks_to_blob( |
| 60 | + repo=repo, |
| 61 | + top_tree=top_tree, |
| 62 | + search_path=tail, |
| 63 | + _rel_tree=repo.get(entry.id), |
| 64 | + _rel_path=posixpath.join(_rel_path, head), |
| 65 | + ) |
| 66 | + elif entry.type == 'blob' and entry.filemode == pygit2.GIT_FILEMODE_LINK: |
| 67 | + # Found a symlink. Start again from the top with adjustment for symlink |
| 68 | + # following |
| 69 | + search_path = posixpath.normpath( |
| 70 | + posixpath.join(_rel_path, repo.get(entry.id).data, tail) |
| 71 | + ) |
| 72 | + return _follow_symlinks_to_blob( |
| 73 | + repo=repo, |
| 74 | + top_tree=top_tree, |
| 75 | + search_path=search_path, |
| 76 | + ) |
| 77 | + elif entry.type == 'blob' and entry.filemode in NORMAL_BLOB_MODES: |
| 78 | + return repo.get(entry.id) |
| 79 | + else: |
| 80 | + # Found some special entry such as a "gitlink" (submodule entry) |
| 81 | + raise ValueError( |
| 82 | + "Found %r filemode %r looking for %r" % |
| 83 | + (entry, entry.filemode, posixpath.join(_rel_path, search_path)) |
| 84 | + ) |
| 85 | + |
| 86 | + |
| 87 | +def follow_symlinks_to_blob(repo, treeish, path): |
| 88 | + return _follow_symlinks_to_blob( |
| 89 | + repo=repo, |
| 90 | + top_tree=treeish.peel(pygit2.Tree), |
| 91 | + search_path=posixpath.normpath(path), |
| 92 | + ) |
| 93 | + |
| 94 | + |
| 95 | +class Changelog: |
| 96 | + '''Representation of a debian/changelog file found inside a git tree-ish |
| 97 | + |
| 98 | + Uses the debian.changelog module from python3-debian for parsing, but when |
| 99 | + this fails we fall back to grep/sed-based pattern matching automatically. |
| 100 | + ''' |
| 101 | + def __init__(self, repo, treeish): |
| 102 | + ''' |
| 103 | + repo: pygit2.Repository instance |
| 104 | + treeish: pygit2.Object subclass instance (must peel to pygit2.Tree) |
| 105 | + ''' |
| 106 | |
| 107 | - @functools.wraps(obj) |
| 108 | - def memoizer(*args, **kwargs): |
| 109 | - key = str(args) + str(kwargs) |
| 110 | - if key not in cache: |
| 111 | - cache[key] = obj(*args, **kwargs) |
| 112 | + blob = follow_symlinks_to_blob( |
| 113 | + repo=repo, |
| 114 | + treeish=treeish, |
| 115 | + path='debian/changelog' |
| 116 | + ) |
| 117 | + self._contents = blob.data |
| 118 | + self._changelog = debian.changelog.Changelog(self._contents) |
| 119 | + if not len(self._changelog.versions): |
| 120 | + # assume bad read, so fall back to shell later |
| 121 | + self._changelog = None |
| 122 | + |
| 123 | + @lru_cache() |
| 124 | + def _shell(self, cmd): |
| 125 | + cp = run( |
| 126 | + cmd, |
| 127 | + input=self._contents, |
| 128 | + shell=True, |
| 129 | + env=self._env, |
| 130 | + quiet=True, |
| 131 | + ) |
| 132 | + return decode_binary(cp.stdout).strip() |
| 133 | + |
| 134 | + @property |
| 135 | + def version(self): |
| 136 | + if self._changelog: |
| 137 | + return str(self._changelog.versions[0]) |
| 138 | else: |
| 139 | - logging.debug("Cache hit on %s %s = %s", str(args), |
| 140 | - str(kwargs), |
| 141 | - cache[key][:30].replace("\n", "\\n") + "..") |
| 142 | - return cache[key] |
| 143 | - return memoizer |
| 144 | + shell_cmd = 'grep -m1 \'^\S\' | sed \'s/.*(\(.*\)).*/\\1/\'' |
| 145 | + return self._shell(shell_cmd) |
| 146 | |
| 147 | + @property |
| 148 | + def previous_version(self): |
| 149 | + if self._changelog: |
| 150 | + return str(self._changelog.versions[1]) |
| 151 | + else: |
| 152 | + shell_cmd = 'grep -m1 \'^\S\' | tail -1 | sed \'s/.*(\(.*\)).*/\\1/\'' |
| 153 | + return self._shell(shell_cmd) |
| 154 | + |
| 155 | + @property |
| 156 | + def maintainer(self): |
| 157 | + if self._changelog: |
| 158 | + return self._changelog.author |
| 159 | + else: |
| 160 | + shell_cmd = 'grep -m1 \'^ --\' | sed \'s/ -- \(.*\) \(.*\)/\\1/\'' |
| 161 | + return self._shell(shell_cmd) |
| 162 | + |
| 163 | + @property |
| 164 | + def date(self): |
| 165 | + if self._changelog: |
| 166 | + return self._changelog.date |
| 167 | + else: |
| 168 | + shell_cmd = 'grep -m1 \'^ --\' | sed \'s/ -- \(.*\) \(.*\)/\\2/\'' |
| 169 | + return self._shell(shell_cmd) |
| 170 | + |
| 171 | + @property |
| 172 | + def all_versions(self): |
| 173 | + if self._changelog: |
| 174 | + return [str(v) for v in self._changelog.versions] |
| 175 | + else: |
| 176 | + shell_cmd = 'grep \'^\S\' | sed \'s/.*(\(.*\)).*/\\1/\'' |
| 177 | + version_lines = self._shell(shell_cmd) |
| 178 | + return [ |
| 179 | + v_stripped |
| 180 | + for v_stripped in ( |
| 181 | + v.strip() for v in version_lines.splitlines() |
| 182 | + ) |
| 183 | + if v_stripped |
| 184 | + ] |
| 185 | + |
| 186 | + @property |
| 187 | + def distribution(self): |
| 188 | + if self._changelog: |
| 189 | + return self._changelog.distributions |
| 190 | + else: |
| 191 | + shell_cmd = 'grep -m1 \'^\S\' | sed \'s/.*\ .*\ \(.*\);.*/\\1/\'' |
| 192 | + return self._shell(shell_cmd) |
| 193 | |
| 194 | -@unique |
| 195 | -class ChangelogField(Enum): |
| 196 | - """Trivial enum for specifying fields to extract from |
| 197 | - debian/changelog |
| 198 | - """ |
| 199 | - version = 1 |
| 200 | - previous_version = 2 |
| 201 | - maintainer = 3 |
| 202 | - date = 4 |
| 203 | - all_versions = 5 |
| 204 | - distribution = 6 |
| 205 | |
| 206 | def git_dep14_tag(version): |
| 207 | """Munge a version string according to taken from |
| 208 | @@ -449,143 +575,64 @@ class GitUbuntuRepository: |
| 209 | def garbage_collect(self): |
| 210 | self.git_run(['gc']) |
| 211 | |
| 212 | - def extract_file_from_treeish(self, treeish, filename, outfile=None): |
| 213 | + def extract_file_from_treeish(self, treeish, filename): |
| 214 | """extract a file from @treeish to a local file |
| 215 | |
| 216 | Arguments: |
| 217 | treeish - SHA1 of treeish |
| 218 | filename - file to extract from @treeish |
| 219 | - outfile - name of file to copy @filename contents to, will be overwritten. |
| 220 | - If None, will be a NamedTemporaryFile. |
| 221 | - """ |
| 222 | - if outfile is None: |
| 223 | - outfile = tempfile.NamedTemporaryFile() |
| 224 | - else: |
| 225 | - outfile = open(outfile, mode='w+') |
| 226 | |
| 227 | - cat_changelog_cmd = ( |
| 228 | - "echo %s:debian/changelog | " |
| 229 | - "git cat-file --batch --follow-symlinks | " |
| 230 | - "sed -n '1{/^[^ ]* blob/!{p;q1}};2,$p'" |
| 231 | - % treeish |
| 232 | + Returns a NamedTemporaryFile that is flushed but not rewound. |
| 233 | + """ |
| 234 | + blob = follow_symlinks_to_blob( |
| 235 | + self._local_repo, |
| 236 | + treeish=self._local_repo.get(treeish), |
| 237 | + path=filename, |
| 238 | ) |
| 239 | - |
| 240 | - try: |
| 241 | - cp = run(cat_changelog_cmd, shell=True, env=self._env, |
| 242 | - stdout=outfile) |
| 243 | - except CalledProcessError: |
| 244 | - raise Exception('Unable to extract file') |
| 245 | - |
| 246 | + outfile = tempfile.NamedTemporaryFile() |
| 247 | + outfile.write(blob.data) |
| 248 | outfile.flush() |
| 249 | return outfile |
| 250 | |
| 251 | - @memoize |
| 252 | - def parse_changelog_field_in_treeish(self, treeish, field): |
| 253 | - """Parse debian/changelog for specified field, using |
| 254 | - dpkg-parsechangelog with fallback to shell munging |
| 255 | - |
| 256 | - dpkg-parsechangelog is preferentially used, however there have |
| 257 | - been multiple cases where historical changelogs have not been |
| 258 | - parseable. Fallback to a relatively dumb shell-based parsing for |
| 259 | - relevant fields in that case. If that also fails, as a last |
| 260 | - resort, see if any source patches are applicable. |
| 261 | - |
| 262 | - Arguments: |
| 263 | - treeish -- SHA1 of treeish |
| 264 | - field -- ChangelogField value corresponding to field to extract |
| 265 | - |
| 266 | - Returns: |
| 267 | - A subprocess.CompletedProcess instance as returned by |
| 268 | - subprocess.run |
| 269 | - """ |
| 270 | - # in each case: |
| 271 | - # parse_params is what is passed on to dpkg-parsechangelog |
| 272 | - # shell_cmd is a suitable shell sequence to pipe the changelog |
| 273 | - # through to achieve the same if dpkg-parsechangelog fails |
| 274 | - if field is ChangelogField.version: |
| 275 | - parse_params = '-n1 -SVersion' |
| 276 | - shell_cmd = 'grep -m1 \'^\S\' | sed \'s/.*(\(.*\)).*/\\1/\'' |
| 277 | - elif field is ChangelogField.previous_version: |
| 278 | - parse_params = '-n1 -o1 -SVersion' |
| 279 | - shell_cmd = 'grep -m1 \'^\S\' | tail -1 | sed \'s/.*(\(.*\)).*/\\1/\'' |
| 280 | - elif field is ChangelogField.maintainer: |
| 281 | - parse_params = '-SMaintainer' |
| 282 | - shell_cmd = 'grep -m1 \'^ --\' | sed \'s/ -- \(.*\) \(.*\)/\\1/\'' |
| 283 | - elif field is ChangelogField.date: |
| 284 | - parse_params = '-SDate' |
| 285 | - shell_cmd = 'grep -m1 \'^ --\' | sed \'s/ -- \(.*\) \(.*\)/\\2/\'' |
| 286 | - elif field is ChangelogField.all_versions: |
| 287 | - parse_params = '--format rfc822 -SVersion --all' |
| 288 | - shell_cmd = 'grep \'^\S\' | sed \'s/.*(\(.*\)).*/\\1/\'' |
| 289 | - elif field is ChangelogField.distribution: |
| 290 | - parse_params = '-SDistribution' |
| 291 | - shell_cmd = 'grep -m1 \'^\S\' | sed \'s/.*\ .*\ \(.*\);.*/\\1/\'' |
| 292 | - else: |
| 293 | - raise Exception('Unknown changelog field %s' % field) |
| 294 | - |
| 295 | - # We cannot use "git show" since it does not follow symlinks (LP: |
| 296 | - # #1661092). Instead, use "git cat-file --batch --follow-symlinks" and |
| 297 | - # parse the batch output (first line) to ensure that we get a blob |
| 298 | - # rather than a symlink following failure. If we don't get a blob, then |
| 299 | - # print what we got and exit 1. |
| 300 | - cat_changelog_cmd = ( |
| 301 | - "echo %s:debian/changelog | " |
| 302 | - "git cat-file --batch --follow-symlinks | " |
| 303 | - "sed -n '1{/^[^ ]* blob/!{p;q1}};2,$p'" |
| 304 | - % treeish |
| 305 | - ) |
| 306 | - |
| 307 | - try: |
| 308 | - cp = run( |
| 309 | - '%s | dpkg-parsechangelog -l- %s' % |
| 310 | - (cat_changelog_cmd, parse_params), |
| 311 | - shell=True, env=self._env, quiet=True) |
| 312 | - except CalledProcessError: |
| 313 | - try: |
| 314 | - cp = run( |
| 315 | - '%s | %s' % (cat_changelog_cmd, shell_cmd), |
| 316 | - shell=True, env=self._env, quiet=True) |
| 317 | - except CalledProcessError: |
| 318 | - raise Exception('Unable to parse changelog') |
| 319 | - |
| 320 | - return decode_binary(cp.stdout).strip() |
| 321 | + @lru_cache() |
| 322 | + def get_changelog_from_treeish(self, treeish): |
| 323 | + return Changelog(self._local_repo, self._local_repo.get(treeish)) |
| 324 | |
| 325 | def get_changelog_versions_from_treeish(self, treeish): |
| 326 | """Extract current and prior versions from debian/changelog in a |
| 327 | given treeish |
| 328 | - """ |
| 329 | - current_version = None |
| 330 | - last_version = None |
| 331 | - if treeish is not None: |
| 332 | - try: |
| 333 | - current_version = self.parse_changelog_field_in_treeish( |
| 334 | - treeish, ChangelogField.version |
| 335 | - ) |
| 336 | - last_version = self.parse_changelog_field_in_treeish( |
| 337 | - treeish, ChangelogField.previous_version |
| 338 | - ) |
| 339 | |
| 340 | - except: |
| 341 | - logging.exception('Cannot get changelog versions') |
| 342 | - sys.exit(1) |
| 343 | + Returns (None, None) if the treeish supplied is None or if |
| 344 | + 'debian/changelog' does not exist in the treeish. |
| 345 | + """ |
| 346 | + if treeish is None: |
| 347 | + return None, None |
| 348 | |
| 349 | - return (current_version, last_version) |
| 350 | + try: |
| 351 | + changelog = self.get_changelog_from_treeish(treeish) |
| 352 | + except KeyError: |
| 353 | + # If 'debian/changelog' does |
| 354 | + # not exist, then (None, None) is returned. KeyError propagates up |
| 355 | + # from Changelog's __init__. |
| 356 | + return None, None |
| 357 | + try: |
| 358 | + return changelog.version, changelog.previous_version |
| 359 | + except: |
| 360 | + logging.exception('Cannot get changelog versions') |
| 361 | + sys.exit(1) |
| 362 | |
| 363 | def get_changelog_distribution_from_treeish(self, treeish): |
| 364 | """Extract targetted distribution from debian/changelog in a |
| 365 | given treeish |
| 366 | """ |
| 367 | - distribution = None |
| 368 | - if treeish is not None: |
| 369 | - try: |
| 370 | - distribution = self.parse_changelog_field_in_treeish( |
| 371 | - treeish, ChangelogField.distribution |
| 372 | - ) |
| 373 | - except: |
| 374 | - logging.exception('Cannot get changelog distribution') |
| 375 | - sys.exit(1) |
| 376 | + if treeish is None: |
| 377 | + return None |
| 378 | |
| 379 | - return distribution |
| 380 | + try: |
| 381 | + return self.get_changelog_from_treeish(treeish).distribution |
| 382 | + except: |
| 383 | + logging.exception('Cannot get changelog distribution') |
| 384 | + sys.exit(1) |
| 385 | |
| 386 | def get_heads_and_versions(self, head_prefix, namespace): |
| 387 | """Extract the last version in debian/changelog of all |
| 388 | @@ -680,12 +727,9 @@ class GitUbuntuRepository: |
| 389 | def get_commit_authorship(self, ref, spi): |
| 390 | """Extract last debian/changelog entry's maintainer and date""" |
| 391 | try: |
| 392 | - author = self.parse_changelog_field_in_treeish( |
| 393 | - ref, ChangelogField.maintainer |
| 394 | - ) |
| 395 | - date = self.parse_changelog_field_in_treeish( |
| 396 | - ref, ChangelogField.date |
| 397 | - ) |
| 398 | + changelog = self.get_changelog_from_treeish(ref) |
| 399 | + author = changelog.maintainer |
| 400 | + date = changelog.date |
| 401 | except: |
| 402 | logging.exception('Cannot get commit authorship for %s' % ref) |
| 403 | sys.exit(1) |
| 404 | @@ -833,19 +877,12 @@ class GitUbuntuRepository: |
| 405 | return debian.debian_support.version_compare(a, b) |
| 406 | |
| 407 | def get_all_changelog_versions_from_treeish(self, treeish): |
| 408 | + changelog = self.get_changelog_from_treeish(treeish) |
| 409 | try: |
| 410 | - lines = self.parse_changelog_field_in_treeish( |
| 411 | - treeish, ChangelogField.all_versions |
| 412 | - ) |
| 413 | + return changelog.all_versions |
| 414 | except: |
| 415 | logging.exception('Cannot get all versions from changelog') |
| 416 | sys.exit(1) |
| 417 | - versions = list() |
| 418 | - for version in lines.splitlines(): |
| 419 | - version = version.strip() |
| 420 | - if len(version) > 0: |
| 421 | - versions.append(version) |
| 422 | - return versions |
| 423 | |
| 424 | def annotated_tag(self, tag_name, commitish, force, msg=None): |
| 425 | try: |

Superseded/rebased in new merge.