Merge lp:~jelmer/brz/compress into lp:brz

Proposed by Jelmer Vernooij
Status: Merged
Approved by: Jelmer Vernooij
Approved revision: no longer in the source branch.
Merge reported by: The Breezy Bot
Merged at revision: not available
Proposed branch: lp:~jelmer/brz/compress
Merge into: lp:brz
Diff against target: 374 lines (+97/-90)
2 files modified
breezy/bzr/groupcompress.py (+22/-19)
breezy/tests/test_groupcompress.py (+75/-71)
To merge this branch: bzr merge lp:~jelmer/brz/compress
Reviewer: Jelmer Vernooij
Status: Approve
Review via email: mp+378071@code.launchpad.net

Commit message

Pass chunks rather than full texts to compress().

Description of the change

Pass chunks rather than full texts to compress().
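
In rough terms, this changes GroupCompressor.compress() (and the _compress() implementations behind it) to take a list of byte chunks instead of one joined bytes string; the sha1 and input length are then computed chunk by chunk (osutils.sha_strings(chunks) and sum(map(len, chunks))). As a minimal sketch of the new calling convention, using the GroupCompressor class and test-style keys that appear in the diff below:

    from breezy.bzr import groupcompress

    compressor = groupcompress.GroupCompressor()

    # Old convention: one joined bytes string.
    #   compressor.compress(('key',), b'some\nbytes\n', None)
    # New convention: a list of byte chunks. The return value is
    # unchanged: (sha1, start_point, end_point, kind).
    sha1, start, end, kind = compressor.compress(
        ('key',), [b'some\n', b'bytes\n'], None)

Existing callers that still hold a single string simply wrap it in a one-element list, as the diff does with [bytes].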

Revision history for this message
Jelmer Vernooij (jelmer):
review: Approve
Preview Diff

=== modified file 'breezy/bzr/groupcompress.py'
--- breezy/bzr/groupcompress.py 2020-01-25 04:20:44 +0000
+++ breezy/bzr/groupcompress.py 2020-01-25 14:24:06 +0000
@@ -568,7 +568,7 @@
         for factory in self._factories:
             bytes = factory.get_bytes_as('fulltext')
             (found_sha1, start_point, end_point,
-             type) = compressor.compress(factory.key, bytes, factory.sha1)
+             type) = compressor.compress(factory.key, [bytes], factory.sha1)
             # Now update this factory with the new offsets, etc
             factory.sha1 = found_sha1
             factory._start = start_point
@@ -826,14 +826,14 @@
         else:
             self._settings = settings

-    def compress(self, key, bytes, expected_sha, nostore_sha=None, soft=False):
+    def compress(self, key, chunks, expected_sha, nostore_sha=None, soft=False):
         """Compress lines with label key.

         :param key: A key tuple. It is stored in the output
             for identification of the text during decompression. If the last
             element is b'None' it is replaced with the sha1 of the text -
             e.g. sha1:xxxxxxx.
-        :param bytes: The bytes to be compressed
+        :param chunks: Chunks of bytes to be compressed
         :param expected_sha: If non-None, the sha the lines are believed to
             have. During compression the sha is calculated; a mismatch will
             cause an error.
@@ -847,7 +847,7 @@

         :seealso VersionedFiles.add_lines:
         """
-        if not bytes: # empty, like a dir entry, etc
+        if not chunks: # empty, like a dir entry, etc
             if nostore_sha == _null_sha1:
                 raise errors.ExistingContent()
             return _null_sha1, 0, 0, 'fulltext'
@@ -855,23 +855,25 @@
         if expected_sha is not None:
             sha1 = expected_sha
         else:
-            sha1 = osutils.sha_string(bytes)
+            sha1 = osutils.sha_strings(chunks)
         if nostore_sha is not None:
             if sha1 == nostore_sha:
                 raise errors.ExistingContent()
         if key[-1] is None:
             key = key[:-1] + (b'sha1:' + sha1,)

-        start, end, type = self._compress(key, bytes, len(bytes) / 2, soft)
+        length = sum(map(len, chunks))
+
+        start, end, type = self._compress(key, chunks, length / 2, soft)
         return sha1, start, end, type

-    def _compress(self, key, bytes, max_delta_size, soft=False):
+    def _compress(self, key, chunks, max_delta_size, soft=False):
         """Compress lines with label key.

         :param key: A key tuple. It is stored in the output for identification
             of the text during decompression.

-        :param bytes: The bytes to be compressed
+        :param chunks: The chunks of bytes to be compressed

         :param max_delta_size: The size above which we issue a fulltext instead
             of a delta.
@@ -956,10 +958,10 @@
         # The actual content is managed by LinesDeltaIndex
         self.chunks = self._delta_index.lines

-    def _compress(self, key, bytes, max_delta_size, soft=False):
+    def _compress(self, key, chunks, max_delta_size, soft=False):
         """see _CommonGroupCompressor._compress"""
-        input_len = len(bytes)
-        new_lines = osutils.split_lines(bytes)
+        input_len = sum(map(len, chunks))
+        new_lines = osutils.chunks_to_lines(chunks)
         out_lines, index_lines = self._delta_index.make_delta(
             new_lines, bytes_length=input_len, soft=soft)
         delta_length = sum(map(len, out_lines))
@@ -1011,9 +1013,9 @@
         max_bytes_to_index = self._settings.get('max_bytes_to_index', 0)
         self._delta_index = DeltaIndex(max_bytes_to_index=max_bytes_to_index)

-    def _compress(self, key, bytes, max_delta_size, soft=False):
+    def _compress(self, key, chunks, max_delta_size, soft=False):
         """see _CommonGroupCompressor._compress"""
-        input_len = len(bytes)
+        input_len = sum(map(len, chunks))
         # By having action/label/sha1/len, we can parse the group if the index
         # was ever destroyed, we have the key in 'label', we know the final
         # bytes are valid from sha1, and we know where to find the end of this
@@ -1027,13 +1029,14 @@
             raise AssertionError('_source_offset != endpoint'
                                  ' somehow the DeltaIndex got out of sync with'
                                  ' the output lines')
+        bytes = b''.join(chunks)
         delta = self._delta_index.make_delta(bytes, max_delta_size)
-        if (delta is None):
+        if delta is None:
             type = 'fulltext'
-            enc_length = encode_base128_int(len(bytes))
+            enc_length = encode_base128_int(input_len)
             len_mini_header = 1 + len(enc_length)
             self._delta_index.add_source(bytes, len_mini_header)
-            new_chunks = [b'f', enc_length, bytes]
+            new_chunks = [b'f', enc_length] + chunks
         else:
             type = 'delta'
             enc_length = encode_base128_int(len(delta))
@@ -1836,7 +1839,7 @@
                     max_fulltext_prefix = prefix
                 (found_sha1, start_point, end_point,
                  type) = self._compressor.compress(record.key,
-                                                   bytes, record.sha1, soft=soft,
+                                                   [bytes], record.sha1, soft=soft,
                                                    nostore_sha=nostore_sha)
                 # delta_ratio = float(len(bytes)) / (end_point - start_point)
                 # Check if we want to continue to include that text
@@ -1858,8 +1861,8 @@
                     flush()
                     max_fulltext_len = len(bytes)
                 (found_sha1, start_point, end_point,
-                 type) = self._compressor.compress(record.key, bytes,
-                                                   record.sha1)
+                 type) = self._compressor.compress(
+                     record.key, [bytes], record.sha1)
                 if record.key[-1] is None:
                     key = record.key[:-1] + (b'sha1:' + found_sha1,)
                 else:

=== modified file 'breezy/tests/test_groupcompress.py'
--- breezy/tests/test_groupcompress.py 2018-11-11 04:08:32 +0000
+++ breezy/tests/test_groupcompress.py 2020-01-25 14:24:06 +0000
@@ -81,8 +81,8 @@
     def test_one_nosha_delta(self):
         # diff against NUKK
         compressor = self.compressor()
-        sha1, start_point, end_point, _ = compressor.compress(('label',),
-            b'strange\ncommon\n', None)
+        sha1, start_point, end_point, _ = compressor.compress(
+            ('label',), [b'strange\ncommon\n'], None)
         self.assertEqual(sha_string(b'strange\ncommon\n'), sha1)
         expected_lines = b'f\x0fstrange\ncommon\n'
         self.assertEqual(expected_lines, b''.join(compressor.chunks))
@@ -92,8 +92,8 @@
     def test_empty_content(self):
         compressor = self.compressor()
         # Adding empty bytes should return the 'null' record
-        sha1, start_point, end_point, kind = compressor.compress(('empty',),
-            b'', None)
+        sha1, start_point, end_point, kind = compressor.compress(
+            ('empty',), [], None)
         self.assertEqual(0, start_point)
         self.assertEqual(0, end_point)
         self.assertEqual('fulltext', kind)
@@ -101,10 +101,11 @@
         self.assertEqual(0, compressor.endpoint)
         self.assertEqual([], compressor.chunks)
         # Even after adding some content
-        compressor.compress(('content',), b'some\nbytes\n', None)
+        compressor.compress(
+            ('content',), [b'some\nbytes\n'], None)
         self.assertTrue(compressor.endpoint > 0)
-        sha1, start_point, end_point, kind = compressor.compress(('empty2',),
-            b'', None)
+        sha1, start_point, end_point, kind = compressor.compress(
+            ('empty2',), [], None)
         self.assertEqual(0, start_point)
         self.assertEqual(0, end_point)
         self.assertEqual('fulltext', kind)
@@ -114,11 +115,11 @@
         # Knit fetching will try to reconstruct texts locally which results in
         # reading something that is in the compressor stream already.
         compressor = self.compressor()
-        sha1_1, _, _, _ = compressor.compress(('label',),
-            b'strange\ncommon long line\nthat needs a 16 byte match\n', None)
+        sha1_1, _, _, _ = compressor.compress(
+            ('label',), [b'strange\ncommon long line\nthat needs a 16 byte match\n'], None)
         expected_lines = list(compressor.chunks)
-        sha1_2, _, end_point, _ = compressor.compress(('newlabel',),
-            b'common long line\nthat needs a 16 byte match\ndifferent\n', None)
+        sha1_2, _, end_point, _ = compressor.compress(
+            ('newlabel',), [b'common long line\nthat needs a 16 byte match\ndifferent\n'], None)
         # get the first out
         self.assertEqual((b'strange\ncommon long line\n'
                           b'that needs a 16 byte match\n', sha1_1),
@@ -130,11 +131,11 @@

     def test_pop_last(self):
         compressor = self.compressor()
-        _, _, _, _ = compressor.compress(('key1',),
-            b'some text\nfor the first entry\n', None)
+        _, _, _, _ = compressor.compress(
+            ('key1',), [b'some text\nfor the first entry\n'], None)
         expected_lines = list(compressor.chunks)
-        _, _, _, _ = compressor.compress(('key2',),
-            b'some text\nfor the second entry\n', None)
+        _, _, _, _ = compressor.compress(
+            ('key2',), [b'some text\nfor the second entry\n'], None)
         compressor.pop_last()
         self.assertEqual(expected_lines, compressor.chunks)

@@ -146,30 +147,33 @@

     def test_stats(self):
         compressor = self.compressor()
-        compressor.compress(('label',),
-            b'strange\n'
-            b'common very very long line\n'
-            b'plus more text\n', None)
-        compressor.compress(('newlabel',),
-            b'common very very long line\n'
-            b'plus more text\n'
-            b'different\n'
-            b'moredifferent\n', None)
-        compressor.compress(('label3',),
-            b'new\n'
-            b'common very very long line\n'
-            b'plus more text\n'
-            b'different\n'
-            b'moredifferent\n', None)
+        compressor.compress(
+            ('label',), [b'strange\n',
+                         b'common very very long line\n',
+                         b'plus more text\n'], None)
+        compressor.compress(
+            ('newlabel',),
+            [b'common very very long line\n',
+             b'plus more text\n',
+             b'different\n',
+             b'moredifferent\n'], None)
+        compressor.compress(
+            ('label3',),
+            [b'new\n',
+             b'common very very long line\n',
+             b'plus more text\n',
+             b'different\n',
+             b'moredifferent\n'], None)
         self.assertAlmostEqual(1.9, compressor.ratio(), 1)

     def test_two_nosha_delta(self):
         compressor = self.compressor()
-        sha1_1, _, _, _ = compressor.compress(('label',),
-            b'strange\ncommon long line\nthat needs a 16 byte match\n', None)
+        sha1_1, _, _, _ = compressor.compress(
+            ('label',),
+            [b'strange\ncommon long line\nthat needs a 16 byte match\n'], None)
         expected_lines = list(compressor.chunks)
-        sha1_2, start_point, end_point, _ = compressor.compress(('newlabel',),
-            b'common long line\nthat needs a 16 byte match\ndifferent\n', None)
+        sha1_2, start_point, end_point, _ = compressor.compress(
+            ('newlabel',), [b'common long line\nthat needs a 16 byte match\ndifferent\n'], None)
         self.assertEqual(sha_string(b'common long line\n'
                                     b'that needs a 16 byte match\n'
                                     b'different\n'), sha1_2)
@@ -190,15 +194,14 @@
         # The first interesting test: make a change that should use lines from
         # both parents.
         compressor = self.compressor()
-        sha1_1, _, _, _ = compressor.compress(('label',),
-            b'strange\ncommon very very long line\nwith some extra text\n', None)
-        sha1_2, _, _, _ = compressor.compress(('newlabel',),
-            b'different\nmoredifferent\nand then some more\n', None)
+        sha1_1, _, _, _ = compressor.compress(
+            ('label',), [b'strange\ncommon very very long line\nwith some extra text\n'], None)
+        sha1_2, _, _, _ = compressor.compress(
+            ('newlabel',), [b'different\nmoredifferent\nand then some more\n'], None)
         expected_lines = list(compressor.chunks)
-        sha1_3, start_point, end_point, _ = compressor.compress(('label3',),
-            b'new\ncommon very very long line\nwith some extra text\n'
-            b'different\nmoredifferent\nand then some more\n',
-            None)
+        sha1_3, start_point, end_point, _ = compressor.compress(
+            ('label3',), [b'new\ncommon very very long line\nwith some extra text\n',
+                          b'different\nmoredifferent\nand then some more\n'], None)
         self.assertEqual(
             sha_string(b'new\ncommon very very long line\nwith some extra text\n'
                        b'different\nmoredifferent\nand then some more\n'),
@@ -225,30 +228,32 @@

     def test_stats(self):
         compressor = self.compressor()
-        compressor.compress(('label',),
-            b'strange\n'
-            b'common very very long line\n'
-            b'plus more text\n', None)
-        compressor.compress(('newlabel',),
-            b'common very very long line\n'
-            b'plus more text\n'
-            b'different\n'
-            b'moredifferent\n', None)
-        compressor.compress(('label3',),
-            b'new\n'
-            b'common very very long line\n'
-            b'plus more text\n'
-            b'different\n'
-            b'moredifferent\n', None)
+        compressor.compress(
+            ('label',), [b'strange\n',
+                         b'common very very long line\n',
+                         b'plus more text\n'], None)
+        compressor.compress(
+            ('newlabel',), [
+                b'common very very long line\n',
+                b'plus more text\n',
+                b'different\n',
+                b'moredifferent\n'], None)
+        compressor.compress(
+            ('label3',),
+            [b'new\n',
+             b'common very very long line\n',
+             b'plus more text\n',
+             b'different\n',
+             b'moredifferent\n'], None)
         self.assertAlmostEqual(1.9, compressor.ratio(), 1)

     def test_two_nosha_delta(self):
         compressor = self.compressor()
-        sha1_1, _, _, _ = compressor.compress(('label',),
-            b'strange\ncommon long line\nthat needs a 16 byte match\n', None)
+        sha1_1, _, _, _ = compressor.compress(
+            ('label',), [b'strange\ncommon long line\nthat needs a 16 byte match\n'], None)
         expected_lines = list(compressor.chunks)
-        sha1_2, start_point, end_point, _ = compressor.compress(('newlabel',),
-            b'common long line\nthat needs a 16 byte match\ndifferent\n', None)
+        sha1_2, start_point, end_point, _ = compressor.compress(
+            ('newlabel',), [b'common long line\nthat needs a 16 byte match\ndifferent\n'], None)
         self.assertEqual(sha_string(b'common long line\n'
                                     b'that needs a 16 byte match\n'
                                     b'different\n'), sha1_2)
@@ -269,15 +274,14 @@
         # The first interesting test: make a change that should use lines from
         # both parents.
         compressor = self.compressor()
-        sha1_1, _, _, _ = compressor.compress(('label',),
-            b'strange\ncommon very very long line\nwith some extra text\n', None)
-        sha1_2, _, _, _ = compressor.compress(('newlabel',),
-            b'different\nmoredifferent\nand then some more\n', None)
+        sha1_1, _, _, _ = compressor.compress(
+            ('label',), [b'strange\ncommon very very long line\nwith some extra text\n'], None)
+        sha1_2, _, _, _ = compressor.compress(
+            ('newlabel',), [b'different\nmoredifferent\nand then some more\n'], None)
         expected_lines = list(compressor.chunks)
-        sha1_3, start_point, end_point, _ = compressor.compress(('label3',),
-            b'new\ncommon very very long line\nwith some extra text\n'
-            b'different\nmoredifferent\nand then some more\n',
-            None)
+        sha1_3, start_point, end_point, _ = compressor.compress(
+            ('label3',), [b'new\ncommon very very long line\nwith some extra text\n',
+                          b'different\nmoredifferent\nand then some more\n'], None)
         self.assertEqual(
             sha_string(b'new\ncommon very very long line\nwith some extra text\n'
                        b'different\nmoredifferent\nand then some more\n'),
@@ -305,7 +309,7 @@
         compressor = groupcompress.GroupCompressor()
         start = 0
         for key in sorted(key_to_text):
-            compressor.compress(key, key_to_text[key], None)
+            compressor.compress(key, [key_to_text[key]], None)
         locs = dict((key, (start, end)) for key, (start, _, end, _)
                     in compressor.labels_deltas.items())
         block = compressor.flush()
@@ -946,7 +950,7 @@
         compressor = groupcompress.GroupCompressor()
         start = 0
         for key in sorted(key_to_text):
-            compressor.compress(key, key_to_text[key], None)
+            compressor.compress(key, [key_to_text[key]], None)
         locs = dict((key, (start, end)) for key, (start, _, end, _)
                     in compressor.labels_deltas.items())
         block = compressor.flush()
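
For reference, a standalone sketch of the chunk-based accounting the diff switches to; this is an illustrative equivalent of osutils.sha_strings(chunks) plus sum(map(len, chunks)), written against hashlib only, not the library code itself:

    import hashlib

    def sha_and_length(chunks):
        # Hash and measure the chunks incrementally, without ever
        # materialising the joined text; the streamed digest equals
        # hashing the concatenation in one go.
        s = hashlib.sha1()
        length = 0
        for chunk in chunks:
            s.update(chunk)
            length += len(chunk)
        return s.hexdigest().encode('ascii'), length

    sha1, input_len = sha_and_length([b'strange\n', b'common\n'])
    assert input_len == 15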
