Merge lp:~jelmer/brz/big-file-vf into lp:brz

Proposed by Jelmer Vernooij
Status: Merged
Approved by: Jelmer Vernooij
Approved revision: no longer in the source branch.
Merge reported by: The Breezy Bot
Merged at revision: not available
Proposed branch: lp:~jelmer/brz/big-file-vf
Merge into: lp:brz
Diff against target: 255 lines (+133/-15)
6 files modified
breezy/bzr/groupcompress.py (+45/-10)
breezy/bzr/knit.py (+15/-0)
breezy/bzr/versionedfile.py (+42/-0)
breezy/bzr/vf_repository.py (+3/-2)
breezy/tests/blackbox/test_big_file.py (+8/-3)
breezy/tests/per_versionedfile.py (+20/-0)
To merge this branch: bzr merge lp:~jelmer/brz/big-file-vf
Reviewer: Martin Packman
Review status: Approve
Review via email: mp+368882@code.launchpad.net

Commit message

Add a VersionedFiles.add_chunks method that takes a chunk_iter.

Description of the change

Add a VersionedFiles.add_chunks method that takes a chunk_iter.
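
For illustration, a minimal usage sketch (hypothetical caller code: "repo" is assumed to be a write-locked repository whose texts attribute is a VersionedFiles, and the file and key names here are made up; the add_chunks signature is the one added in this diff):

    from breezy import osutils

    # Stream a large file into the texts store without materialising
    # every line in memory; file_iterator yields fixed-size byte chunks.
    parent_keys = ()  # no graph parents in this example
    with open('big-file', 'rb') as fileobj:
        sha1, num_bytes, _ = repo.texts.add_chunks(
            (b'file-id', b'rev-id'), parent_keys,
            osutils.file_iterator(fileobj))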

Improve big file tests:
* Use RLIMIT_AS rather than RLIMIT_DATA (see the sketch below).
* Skip the tests if there is not enough disk space.

This is groundwork for improved support for large files.
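
A minimal sketch of the resource-limit change (the constant mirrors the updated test; setting the limit process-wide like this is illustrative only):

    import resource

    # RLIMIT_AS caps total address space; on many systems RLIMIT_DATA
    # only covers the data segment, so mmap-backed allocations could
    # slip past the old limit unnoticed.
    LIMIT = 1024 * 1024 * 100  # 100 MiB, as in test_big_file.py

    soft, hard = resource.getrlimit(resource.RLIMIT_AS)
    resource.setrlimit(resource.RLIMIT_AS, (LIMIT, hard))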

Revision history for this message
Martin Packman (gz) wrote:

Looks good, one teeny test nit inline.

review: Approve

Preview Diff

=== modified file 'breezy/bzr/groupcompress.py'
--- breezy/bzr/groupcompress.py 2018-11-17 16:53:10 +0000
+++ breezy/bzr/groupcompress.py 2019-06-16 15:55:45 +0000
@@ -1295,18 +1295,56 @@
             back to future add_lines calls in the parent_texts dictionary.
         """
         self._index._check_write_ok()
-        self._check_add(key, lines, random_id, check_content)
+        if check_content:
+            self._check_lines_not_unicode(lines)
+            self._check_lines_are_lines(lines)
+        return self.add_chunks(
+            key, parents, iter(lines), parent_texts, left_matching_blocks,
+            nostore_sha, random_id)
+
+    def add_chunks(self, key, parents, chunk_iter, parent_texts=None,
+                   left_matching_blocks=None, nostore_sha=None, random_id=False):
+        """Add a text to the store.
+
+        :param key: The key tuple of the text to add.
+        :param parents: The parents key tuples of the text to add.
+        :param chunk_iter: An iterator over chunks. Chunks
+            don't need to be file lines; the only requirement is that they
+            are bytes.
+        :param parent_texts: An optional dictionary containing the opaque
+            representations of some or all of the parents of version_id to
+            allow delta optimisations. VERY IMPORTANT: the texts must be those
+            returned by add_lines or data corruption can be caused.
+        :param left_matching_blocks: a hint about which areas are common
+            between the text and its left-hand-parent. The format is
+            the SequenceMatcher.get_matching_blocks format.
+        :param nostore_sha: Raise ExistingContent and do not add the lines to
+            the versioned file if the digest of the lines matches this.
+        :param random_id: If True a random id has been selected rather than
+            an id determined by some deterministic process such as a converter
+            from a foreign VCS. When True the backend may choose not to check
+            for uniqueness of the resulting key within the versioned file, so
+            this should only be done when the result is expected to be unique
+            anyway.
+        :return: The text sha1, the number of bytes in the text, and an opaque
+            representation of the inserted version which can be provided
+            back to future add_lines calls in the parent_texts dictionary.
+        """
+        self._index._check_write_ok()
+        self._check_add(key, random_id)
         if parents is None:
             # The caller might pass None if there is no graph data, but kndx
             # indexes can't directly store that, so we give them
             # an empty tuple instead.
             parents = ()
         # double handling for now. Make it work until then.
-        length = sum(map(len, lines))
-        record = ChunkedContentFactory(key, parents, None, lines)
-        sha1 = list(self._insert_record_stream([record], random_id=random_id,
-                                               nostore_sha=nostore_sha))[0]
-        return sha1, length, None
+        # TODO(jelmer): problematic for big files: let's not keep the list of
+        # chunks in memory.
+        chunks = list(chunk_iter)
+        record = ChunkedContentFactory(key, parents, None, chunks)
+        sha1 = list(self._insert_record_stream(
+            [record], random_id=random_id, nostore_sha=nostore_sha))[0]
+        return sha1, sum(map(len, chunks)), None
 
     def add_fallback_versioned_files(self, a_versioned_files):
         """Add a source of texts for texts not present in this knit.
@@ -1338,7 +1376,7 @@
         self._index._graph_index.clear_cache()
         self._index._int_cache.clear()
 
-    def _check_add(self, key, lines, random_id, check_content):
+    def _check_add(self, key, random_id):
         """check that version_id and lines are safe to add."""
         version_id = key[-1]
         if version_id is not None:
@@ -1349,9 +1387,6 @@
         # probably check that the existing content is identical to what is
         # being inserted, and otherwise raise an exception. This would make
         # the bundle code simpler.
-        if check_content:
-            self._check_lines_not_unicode(lines)
-            self._check_lines_are_lines(lines)
 
     def get_parent_map(self, keys):
         """Get a map of the graph parents of keys.

=== modified file 'breezy/bzr/knit.py'
--- breezy/bzr/knit.py 2019-05-28 21:46:09 +0000
+++ breezy/bzr/knit.py 2019-06-16 15:55:45 +0000
@@ -996,6 +996,21 @@
                          parent_texts, left_matching_blocks, nostore_sha, random_id,
                          line_bytes=line_bytes)
 
+    def add_chunks(self, key, parents, chunk_iter, parent_texts=None,
+                   left_matching_blocks=None, nostore_sha=None, random_id=False):
+        """See VersionedFiles.add_chunks()."""
+        self._index._check_write_ok()
+        self._check_add(key, None, random_id, check_content=False)
+        if parents is None:
+            # The caller might pass None if there is no graph data, but kndx
+            # indexes can't directly store that, so we give them
+            # an empty tuple instead.
+            parents = ()
+        line_bytes = b''.join(chunk_iter)
+        return self._add(key, None, parents,
+                         parent_texts, left_matching_blocks, nostore_sha, random_id,
+                         line_bytes=line_bytes)
+
     def _add(self, key, lines, parents, parent_texts,
              left_matching_blocks, nostore_sha, random_id,
              line_bytes):

=== modified file 'breezy/bzr/versionedfile.py'
--- breezy/bzr/versionedfile.py 2018-11-11 04:08:32 +0000
+++ breezy/bzr/versionedfile.py 2019-06-16 15:55:45 +0000
@@ -978,6 +978,38 @@
         """
         raise NotImplementedError(self.add_lines)
 
+    def add_chunks(self, key, parents, chunk_iter, parent_texts=None,
+                   left_matching_blocks=None, nostore_sha=None, random_id=False,
+                   check_content=True):
+        """Add a text to the store from a chunk iterable.
+
+        :param key: The key tuple of the text to add. If the last element is
+            None, a CHK string will be generated during the addition.
+        :param parents: The parents key tuples of the text to add.
+        :param chunk_iter: An iterable over bytestrings.
+        :param parent_texts: An optional dictionary containing the opaque
+            representations of some or all of the parents of version_id to
+            allow delta optimisations. VERY IMPORTANT: the texts must be those
+            returned by add_lines or data corruption can be caused.
+        :param left_matching_blocks: a hint about which areas are common
+            between the text and its left-hand-parent. The format is
+            the SequenceMatcher.get_matching_blocks format.
+        :param nostore_sha: Raise ExistingContent and do not add the lines to
+            the versioned file if the digest of the lines matches this.
+        :param random_id: If True a random id has been selected rather than
+            an id determined by some deterministic process such as a converter
+            from a foreign VCS. When True the backend may choose not to check
+            for uniqueness of the resulting key within the versioned file, so
+            this should only be done when the result is expected to be unique
+            anyway.
+        :param check_content: If True, the lines supplied are verified to be
+            bytestrings that are correctly formed lines.
+        :return: The text sha1, the number of bytes in the text, and an opaque
+            representation of the inserted version which can be provided
+            back to future add_lines calls in the parent_texts dictionary.
+        """
+        raise NotImplementedError(self.add_chunks)
+
     def add_mpdiffs(self, records):
         """Add mpdiffs to this VersionedFile.
 
@@ -1201,6 +1233,16 @@
         self._mapper = mapper
         self._is_locked = is_locked
 
+    def add_chunks(self, key, parents, chunk_iter, parent_texts=None,
+                   left_matching_blocks=None, nostore_sha=None,
+                   random_id=False):
+        # For now, just fallback to add_lines.
+        lines = osutils.chunks_to_lines(list(chunk_iter))
+        return self.add_lines(
+            key, parents, lines, parent_texts,
+            left_matching_blocks, nostore_sha, random_id,
+            check_content=True)
+
     def add_lines(self, key, parents, lines, parent_texts=None,
                   left_matching_blocks=None, nostore_sha=None, random_id=False,
                   check_content=True):

=== modified file 'breezy/bzr/vf_repository.py'
--- breezy/bzr/vf_repository.py 2019-06-15 12:04:34 +0000
+++ breezy/bzr/vf_repository.py 2019-06-16 15:55:45 +0000
@@ -569,8 +569,9 @@
 
     def _add_file_to_weave(self, file_id, fileobj, parents, nostore_sha):
         parent_keys = tuple([(file_id, parent) for parent in parents])
-        return self.repository.texts.add_lines(
-            (file_id, self._new_revision_id), parent_keys, fileobj.readlines(),
+        return self.repository.texts.add_chunks(
+            (file_id, self._new_revision_id), parent_keys,
+            osutils.file_iterator(fileobj),
             nostore_sha=nostore_sha, random_id=self.random_revid)[0:2]
 

=== modified file 'breezy/tests/blackbox/test_big_file.py'
--- breezy/tests/blackbox/test_big_file.py 2019-06-15 21:45:04 +0000
+++ breezy/tests/blackbox/test_big_file.py 2019-06-16 15:55:45 +0000
@@ -21,6 +21,7 @@
 memory.
 """
 
+import errno
 import os
 import resource
 
@@ -36,8 +37,8 @@
 BIG_FILE_SIZE = 1024 * 1024 * 500
 BIG_FILE_CHUNK_SIZE = 1024 * 1024
 
-RESOURCE = resource.RLIMIT_DATA
-LIMIT = 1024 * 1024 * 200
+RESOURCE = resource.RLIMIT_AS
+LIMIT = 1024 * 1024 * 100
 
 
 def make_big_file(path):
@@ -50,8 +51,12 @@
 class TestAdd(tests.TestCaseWithTransport):
 
     def writeBigFile(self, path):
-        make_big_file(path)
         self.addCleanup(os.unlink, path)
+        try:
+            make_big_file(path)
+        except EnvironmentError as e:
+            if e.errno == errno.ENOSPC:
+                self.skipTest('not enough disk space for big file')
 
     def setUp(self):
         super(TestAdd, self).setUp()

=== modified file 'breezy/tests/per_versionedfile.py'
--- breezy/tests/per_versionedfile.py 2018-11-11 04:08:32 +0000
+++ breezy/tests/per_versionedfile.py 2019-06-16 15:55:45 +0000
@@ -1538,6 +1538,26 @@
         records.sort()
         self.assertEqual([(key0, b'a\nb\n'), (key1, b'b\nc\n')], records)
 
+    def test_add_chunks(self):
+        f = self.get_versionedfiles()
+        key0 = self.get_simple_key(b'r0')
+        key1 = self.get_simple_key(b'r1')
+        key2 = self.get_simple_key(b'r2')
+        keyf = self.get_simple_key(b'foo')
+        f.add_chunks(key0, [], [b'a', b'\nb\n'])
+        if self.graph:
+            f.add_chunks(key1, [key0], [b'b', b'\n', b'c\n'])
+        else:
+            f.add_chunks(key1, [], [b'b\n', b'c\n'])
+        keys = f.keys()
+        self.assertIn(key0, keys)
+        self.assertIn(key1, keys)
+        records = []
+        for record in f.get_record_stream([key0, key1], 'unordered', True):
+            records.append((record.key, record.get_bytes_as('fulltext')))
+        records.sort()
+        self.assertEqual([(key0, b'a\nb\n'), (key1, b'b\nc\n')], records)
+
     def test_annotate(self):
         files = self.get_versionedfiles()
         self.get_diamond_files(files)
