Hey Louis, thanks for working on this! The general approach looks fine to me, I just have some small things to fix. Louis Bouchard [2015-01-26 14:42 -0000]: > Changes to apport-unpack are only meaningful for Ubuntu Precise, so > they could be brought in as a debian patch, hence dropped from that > MP, while keeping the enhancement to problem_report. I think it's fine to keep that in trunk. We might have other large attachments which are better handled that way. > === modified file 'bin/apport-unpack' > --- bin/apport-unpack 2012-05-04 06:54:56 +0000 > +++ bin/apport-unpack 2015-01-26 14:41:51 +0000 > @@ -49,18 +49,28 @@ > except OSError as e: > fatal(str(e)) > > +binaries = () Please make that a list or a set; using a tuple in this way is confusing. It would also be nice if you could name this "bin_keys"? > for k in pr: > + if pr[k] == u'': No u'' please, just ''; u'' isn't compatible with earlier Python 3.x. > + binaries += (k,) Following the above, bin_keys.append(k) (list) or .add(k) (set). > + with open(report, 'rb') as f: > + for element in binaries: "key" instead of element, please? Let's consistently call the report contents "key" and "value". > + def extract(self, file, item=None, directory='/tmp'): > + '''Extract only one element from the problem_report > + directly without loading the report beforehand > + This is required for Kernel Crash Dumps that can be > + very big and saturate the RAM > + ''' Short descriptions need to be one line. Please rename "item" to "key", and drop the None default for it, as None will turn the whole thing into a no-op. Also, don't specify a default for directory; such defaults are for CLI programs, and libraries should *never* default to /tmp unless they very carefully avoid overwriting existing files in a race free manner (i. e. O_CREAT|O_EXCL). Finally, to further point out that this is for a single key instead of the whole report, could this be named extract_key()? > + self._assert_bin_mode(file) > + self.data.clear() > + key = None > + value = None > + b64_block = False > + bd = None > + # Make sure the report is at the beginning > + file.seek(0) I'd like to avoid this. First, it's unexpected (and undocumented), you might deliberately call this on an open file at an offset, and second it won't work for non-seekable fds such as reading from stdin. > + for line in file: > + # continuation line > + if line.startswith(b' '): > + if not b64_block: > + continue > + assert (key is not None and value is not None) > + if b64_block: > + l = base64.b64decode(line) > + if bd: > + out.write(bd.decompress(l)) > + else: > + # lazy initialization of bd > + # skip gzip header, if present > + if l.startswith(b'\037\213\010'): > + bd = zlib.decompressobj(-zlib.MAX_WBITS) > + out.write(bd.decompress(self._strip_gzip_header(l))) > + else: > + # legacy zlib-only format used default block > + # size > + bd = zlib.decompressobj() > + out.write(bd.decompress(l)) > + else: > + if len(value) > 0: > + out.write('{}'.format(b'\n')) > + if line.endswith(b'\n'): > + out.write('{}'.format(line[1:-1])) > + else: > + out.write('{}'.format(line[1:])) > + else: > + if b64_block: > + if bd: > + value += bd.flush() > + b64_block = False > + bd = None > + if key: > + assert value is not None > + self.data[key] = self._try_unicode(value) > + (key, value) = line.split(b':', 1) > + if not _python2: > + key = key.decode('ASCII') > + if key != item: > + continue > + value = value.strip() > + if value == b'base64': > + value = b'' > + b64_block = True > + try: > + out=open(os.path.join(directory, item), 'wb') > + except IOError as e: > + fatal(str(e)) This logic can become a simpler as you know precisely what to scan for at first: a line equal to key + b':\n'. If it's not found, this should throw a KeyError, otherwise you can start uncompressing/writing from there and quit once you arrive at the next key without having to read the remainder of the file. > + > + if key is not None: > + self.data[key] = self._try_unicode(value) > + > + self.old_keys = set(self.data.keys()) These two shouldn't happen as you don't modify the report. > --- test/test_problem_report.py 2012-10-11 05:56:35 +0000 > +++ test/test_problem_report.py 2015-01-26 14:41:51 +0000 > @@ -258,6 +266,52 @@ > pr.load(BytesIO(b'ProblemType: Crash')) > self.assertEqual(list(pr.keys()), ['ProblemType']) > > + > + def test_extract(self): > + '''extract() with various binary elements.''' > + > + # create a test report with binary elements > + large_val = b'A' * 5000000 > + > + pr = problem_report.ProblemReport() > + pr['Txt'] = 'some text' > + pr['MoreTxt'] = 'some more text' > + pr['Foo'] = problem_report.CompressedValue(b'FooFoo!') > + pr['Uncompressed'] = bin_data > + pr['Bin'] = problem_report.CompressedValue() > + pr['Bin'].set_value(bin_data) > + pr['Large'] = problem_report.CompressedValue(large_val) > + pr['Multiline'] = problem_report.CompressedValue(b'\1\1\1\n\2\2\n\3\3\3') This is really binary, so I would just drop this bit. > + report = BytesIO() > + pr.write(report) > + report.seek(0) > + > + #Extracts nothing if non binary > + pr.extract(report, 'Txt', self.workdir ) > + self.assertEqual(os.path.exists(os.path.join(self.workdir, 'Txt')), False) This should raise a ValueError instead. Silently not doing anything on wrong input (i. e. an ASCII key) is very un-Pythonic. Also, don't use assertEqual(..., True|False) -- use assertTrue() and assertFalse(). Verifying that it does not write a file is still a good thing, of course (after the assertRaises). > + #Check inexistant element > + pr.extract(report, 'Bar', self.workdir ) > + self.assertEqual(os.path.exists(os.path.join(self.workdir, 'Bar')), False) This should raise a KeyError. > + #Check valid elements > + pr.extract(report, 'Foo', self.workdir ) > + element = open(os.path.join(self.workdir, 'Foo')) > + self.assertEqual(element.read(), b'FooFoo!' ) Please don't leave fd leaks. Use "with open(...) as f:". Same for the tests below. Finally, there are a lot of PEP-8 errors in this. Can you please install "pep8" and fix the issues in pep8 -r --ignore=E401,E501,W291,W293 test/test_problem_report.py pep8 -r --ignore=E401,E501,E124 problem_report.py That's what test/run does, BTW. You can just start that, and as soon as it starts running the module tests you are good. It will fail very quickly on pep8/pyflakes errors. Thanks! Martin -- Martin Pitt | http://www.piware.de Ubuntu Developer (www.ubuntu.com) | Debian Developer (www.debian.org)