Comments and a question inline; modulo the answer to the question,
I'm okay with this patch set.

Thanks.

On Tue, Sep 21, 2021 at 05:09:27AM -0000, Alex Murray wrote:
> index 384fb7e..b3224c0 100755
> --- a/scripts/check-syntax
> +++ b/scripts/check-syntax
> @@ -812,23 +812,36 @@ for cve in args:
>                  )
>                  cve_okay = False
>  
> -    # Check to make sure all patch references match the type:reference
> -    # pattern
> -    for key in data.keys():
> -        if "Patches_" in key and len(data[key]) > 0:
> -            for line in re.split("\n", data[key]):
> -                patch_type = re.split(":", line)[0]
> -                if re.search("http", patch_type):
> -                    print(
> -                        "%s: %d: patch reference %s doesn't contain a type modifier (e.g. upstream:)"
> -                        % (cvepath, srcmap[key] if key in srcmap else 1, key),
> -                        file=sys.stderr,
> -                    )
> -                    cve_okay = False
> -                if re.search("patch", patch_type):
> +    for pkg in data["patches"]:
> +        for index, value in enumerate(data["patches"][pkg]):
> +            patch_type, patch = data["patches"][pkg][index]
> +            # validate break-fix entries as 'I?hash|-|local-|URL' and
> +            # others should be a URL - but don't bother with retired
> +            # CVEs as these have a lot of old cruft
> +            if patch_type == "break-fix":
> +                try:
> +                    bfre = "^(-|I?[a-f0-9]{1,40}|local-[A-Za-z0-9-]+|https?://.*)$"
> +                    breaks, fixes = patch.split(' ', 1)
> +                    # breaks and fixes can contain multiple entries separated by |
> +                    for brk in breaks.split('|'):
> +                        if re.match(bfre, brk) is None:
> +                            raise ValueError("invalid break entry '%s':" % brk)
> +                    # fixes can contain multiple entries separated by |
> +                    for fix in fixes.split('|'):
> +                        if re.match(bfre, fix) is None:
> +                            raise ValueError("invalid fix entry '%s':" % fix)
> +                except Exception as e:
> +                        print(
> +                            "%s: %d: invalid break-fix entry: '%s': %s"
> +                            % (cvepath, srcmap["patches"][pkg][index], patch, e),
> +                            file=sys.stderr,
> +                        )
> +                        cve_okay = False
> +            elif not "retired/" in cvepath:

It would possibly be okay to check retired CVEs when --strict is passed.

> +                if "://" not in patch:
>                      print(
> -                        "%s: %d: invalid type modifier in %s, please use upstream:, vendor:, debdiff:, other:, etc."
> -                        % (cvepath, srcmap[key] if key in srcmap else 1, key),
> +                        "%s: %d: invalid patch URL '%s'"
> +                        % (cvepath, srcmap["patches"][pkg][index], patch),
>                          file=sys.stderr,
>                      )
>                      cve_okay = False
> diff --git a/scripts/cve_lib.py b/scripts/cve_lib.py
> index c706a61..88dd326 100755
> --- a/scripts/cve_lib.py
> +++ b/scripts/cve_lib.py
> @@ -1195,6 +1195,8 @@ def load_cve(cve, strict=False, subprojects=list(), srcmap=None):
>      srcmap.setdefault('pkgs', dict())
>      srcmap.setdefault('tags', dict())
>      data.setdefault('tags', dict())
> +    srcmap.setdefault('patches', dict())
> +    data.setdefault('patches', dict())
>      affected = dict()
>      non_ubuntu_affected = dict()
>      lastfield = None
> @@ -1220,6 +1222,17 @@ def load_cve(cve, strict=False, subprojects=list(), srcmap=None):
>                      code, newmsg = notes_parser.parse_line(cve, line, linenum, code)
>                      if code != EXIT_OKAY:
>                          msg += newmsg
> +                elif 'Patches_' in lastfield:
> +                    try:
> +                        _, pkg = lastfield.split('_', 1)
> +                        patch_type, entry = line.split(':', 1)
> +                        patch_type = patch_type.strip()
> +                        entry = entry.strip()
> +                        data['patches'][pkg].append((patch_type, entry))
> +                        srcmap['patches'][pkg].append(linenum)
> +                    except Exception as e:
> +                        msg += "%s: %d: Failed to parse '%s' entry %s: %s\n" % (cve, linenum, lastfield, line, e)
> +                        code = EXIT_FAIL
>                  elif lastfield == 'CVSS':
>                      source, vector = line.split(':', 1)
>                      try:
> @@ -1280,15 +1293,19 @@ def load_cve(cve, strict=False, subprojects=list(), srcmap=None):
>                  msg += "%s: %d: unknown Priority '%s'\n" % (cve, linenum, value)
>                  code = EXIT_FAIL
>          elif 'Patches_' in field:
> -            '''These are raw fields'''
>              try:
>                  foo, pkg = field.split('_', 1)
>              except ValueError:
>                  msg += "%s: %d: bad field with 'Patches_': '%s'\n" % (cve, linenum, field)
>                  code = EXIT_FAIL
>                  continue
> -            data.setdefault(field, value)
> -            srcmap.setdefault(field, linenum)
> +            # value should be empty
> +            if len(value) > 0:
> +                msg += "%s: %d: '%s' field should have no value\n" % (cve, linenum, field)
> +                code = EXIT_FAIL
> +                continue
> +            data['patches'].setdefault(pkg, list())
> +            srcmap['patches'].setdefault(pkg, list())

So one consequence here is that the patches gets populated with empty
lists for every package in the CVE, whether it has a reference to a
patch or not, unlike how tags are handled. For example, using Albert's
example CVE in the motivating bug report, we get:

  In [1]: import cve_lib

  In [2]: cve_lib.load_cve('active/CVE-2012-4542')
  Out[2]: 
  {'tags': {'linux-armadaxp': {'not-ue'},
    'linux-lts-quantal': {'not-ue'},
    'linux-lts-saucy': {'not-ue'}},
   'patches': {'linux': [('vendor',
      'https://oss.oracle.com/git/gitweb.cgi?p=redpatch.git;a=commitdiff;h=76a274e17114abf1a77de6b651424648ce9e10c8'),
     ('vendor', 'http://rhn.redhat.com/errata/RHSA-2013-0496.html'),
     ('vendor', 'http://rhn.redhat.com/errata/RHSA-2013-0579.html'),
     ('vendor', 'http://rhn.redhat.com/errata/RHSA-2013-0882.html'),
     ('vendor', 'http://rhn.redhat.com/errata/RHSA-2013-0928.html')],
    'linux-ec2': [],
    'linux-mvl-dove': [],
    'linux-ti-omap4': [],
    'linux-lts-backport-maverick': [],
    'linux-fsl-imx51': [],
    'linux-lts-backport-oneiric': [],
    'linux-linaro-omap': [],
    'linux-linaro-shared': [],
    'linux-linaro-vexpress': [],
    'linux-qcm-msm': [],
    'linux-armadaxp': [],
  [ rest elided ]

Is that what we want? It makes programmatic manipulation later on
easier (because the entry list will always exist to append to),
but it also means your exporters have to check the length for each
to see if there's anything to export.

>          elif 'Tags_' in field:
>              '''These are processed into the "tags" hash'''
>              try:
> diff --git a/scripts/html_export.py b/scripts/html_export.py
> index 066ffea..0253eaa 100755
> --- a/scripts/html_export.py
> +++ b/scripts/html_export.py
> @@ -272,16 +272,10 @@ def htmlize_cve(cvefile, outfd, commit=None):
>              print('</td></tr>', file=outfd)
>          print('</table>', file=outfd)
>  
> -        patches = 'Patches_%s' % (pkg)
> -        if patches in data:
> -            entries = data[patches]
> -            if entries != "":
> -                print('<div class="patches">Patches:</div>', file=outfd)
> -                print('<table class="table table-responsive patches">', file=outfd)
> -            for patch in entries.split('\n'):
> -                if not ':' in patch:
> -                    continue
> -                source, url = patch.split(':', 1)
> +        if len(data['patches'][pkg]) > 0:
> +            print('<div class="patches">Patches:</div>', file=outfd)
> +            print('<table class="table table-responsive patches">', file=outfd)
> +            for source, url in data['patches'][pkg]:
>                  # We need to handle the line info first, since it may have
>                  # additional info as (master) or something else after the patch
>                  # url.
> @@ -304,8 +298,7 @@ def htmlize_cve(cvefile, outfd, commit=None):
>                          print('<tr><td>%s:</td><td><a href="%s">%s %s</a></td></tr>' % (escape(source).capitalize(), url.replace('"', '%22'), escape(url), url_additional_info), file=outfd)
>                      else:
>                          print('<tr><td>%s:</td><td>%s</td></tr>' % (escape(source).capitalize(), escape(url)), file=outfd)
> -            if entries != "":
> -                print('</table>', file=outfd)
> +            print('</table>', file=outfd)
>  
>          # tags
>          urlregex = re.compile(r"(http[^\s]+)")
> diff --git a/scripts/publish-cves-to-website-api.py b/scripts/publish-cves-to-website-api.py
> index b5dddab..65c2a4f 100755
> --- a/scripts/publish-cves-to-website-api.py
> +++ b/scripts/publish-cves-to-website-api.py
> @@ -40,9 +40,7 @@ def get_tags(cve_data, pkg):
>      return list(cve_data['tags'].get(pkg, list()))
>  
>  def get_patches(cve_data, pkg):
> -    patches_str = cve_data.get(f'Patches_{pkg}', "")
> -    return [line for line in patches_str.split('\n') if line]
> -
> +    return [ patch_type + ": " + entry for patch_type, entry in cve_data['patches'].get(pkg, list())]

Not needed for this patchset, but it would be good if we could be
smarter about our local break-fix entries where there's an alternation
(e.g. $UPSTREAM_HASH|local-CVE-fix ), where maybe we only report the
upstream hash to the web api.

Ideally I'd like to come up with some way to surface the local commits
to point reverse engineer them back to pointers to the commits in
the ubuntu kernel git repos, for transparency.

(See the recent merge request from Kees updating some of the kernel
break-fix entries.)

>  def get_devel_codename(cve_releases):
>      for skip_release in ['upstream', 'devel', 'product', 'snap']:
> diff --git a/scripts/report-pending-fixes b/scripts/report-pending-fixes
> index ca7b7f4..6800f10 100755
> --- a/scripts/report-pending-fixes
> +++ b/scripts/report-pending-fixes
> @@ -67,12 +67,10 @@ for name in cves:
>              report += " %s %s %s %s" % (opt.pkg, rel, state, version)
>  
>          if opt.fixes:
> -            patch_section = 'Patches_%s' % ('linux' if opt.pkg in cve_lib.kernel_srcs else opt.pkg)
> -            if patch_section in cve:
> -                for line in cve[patch_section].split('\n'):
> -                    line.strip()
> -                    if line.lstrip().startswith('break-fix:'):
> -                        report += '\n %s' % line.split()[2]
> +            pkg = 'linux' if opt.pkg in cve_lib.kernel_srcs else opt.pkg
> +            for (patch_type, patch) in cve['patches'][pkg]:
> +                if patch_type == 'break-fix':
> +                        report += '\n %s' % patch.split()[1]

It probably would have been helpful if the author of this code had
left a comment explaining that because we only record break-fix
entries for the primary 'linux' source package, when we're looking
what fixes are pending for a different (derived) kernel, we need to
pull the fixing hashes out of the primary kernel.

>          if opt.descriptions:
>              desc = cve['Ubuntu-Description'].strip()
> diff --git a/scripts/sync-bugs-kernel.py b/scripts/sync-bugs-kernel.py
> index a379f8b..67b1bf9 100755
> --- a/scripts/sync-bugs-kernel.py
> +++ b/scripts/sync-bugs-kernel.py

This script is not used anymore, we used to need to have a corresponding
launchpad bug for each kernel cve and keep the states in sync in both
directions.

Hopefully we won't ever have to bring this back.

> @@ -283,8 +283,8 @@ def add_uct_sha(data, src, sha_pair_to_add):
>  
>      raise ValueError("TODO")
>  
> -    if value.strip() != data.get(patchfield, ''):
> -        data[patchfield] = cve_lib.update_multiline_field('%s/%s' % (cve_lib.active_dir, data['Candidate']), patchfield, value)
> +    # if value.strip() != data.get(patchfield, ''):
> +    #     data[patchfield] = cve_lib.update_multiline_field('%s/%s' % (cve_lib.active_dir, data['Candidate']), patchfield, value)
>  
>  
>  def del_uct_sha(data, src, sha_pair_to_remove):
> @@ -298,8 +298,8 @@ def del_uct_sha(data, src, sha_pair_to_remove):
>  
>      raise ValueError("TODO")
>  
> -    if value.strip() != data.get(patchfield, ''):
> -        data[patchfield] = cve_lib.update_multiline_field('%s/%s' % (cve_lib.active_dir, data['Candidate']), patchfield, value)
> +    # if value.strip() != data.get(patchfield, ''):
> +    #     data[patchfield] = cve_lib.update_multiline_field('%s/%s' % (cve_lib.active_dir, data['Candidate']), patchfield, value)
>  
>  # FIXME: there is a lot of copy/paste loop code here to walk the bug
>  # task vs uct status maps. This should probably be generalized into a
> @@ -454,24 +454,18 @@ def _update_description_from_uct(bug, tasks, data):
>              # a full abort is not needed.
>              continue
>  
> -        if 'Patches_%s' % (src) in data:
> -            for line in data['Patches_%s' % (src)].splitlines():
> -                if ':' not in line:
> -                    continue
> -                field, value = line.strip().split(':', 1)
> -                field = field.strip()
> -                if field not in ['upstream', 'break-fix']:
> -                    continue
> -                value = value.strip()
> -                broken = None
> -                sha = None
> -                if field == 'upstream':
> -                    broken = '-'
> -                    sha = _extract_sha(value)
> -                if field == 'break-fix' and ' ' in value:
> -                    broken, sha = [_extract_sha(x) for x in value.split(' ', 1)]
> -                if sha:
> -                    shas.append((broken, sha))
> +        for (field, value) in data['patches'][src]:
> +            if field not in ['upstream', 'break-fix']:
> +                continue
> +            broken = None
> +            sha = None
> +            if field == 'upstream':
> +                broken = '-'
> +                sha = _extract_sha(value)
> +            if field == 'break-fix' and ' ' in value:
> +                broken, sha = [_extract_sha(x) for x in value.split(' ', 1)]
> +            if sha:
> +                shas.append((broken, sha))
>  
>      description = data['Description'].strip().replace('\n', ' ').strip()
>      if description == "":
> @@ -525,7 +519,6 @@ def sync_new_bug(bug, tasks, data):
>      touched = False
>      if opt.debug:
>          print("\tsync new bug", file=sys.stderr)
> -    shas = []
>      for src in data['pkgs']:
>          if src not in cve_lib.kernel_srcs:
>              continue


-- 
Steve Beattie
<email address hidden>