Merge lp:~jderose/filestore/helpers into lp:filestore

Proposed by Jason Gerard DeRose
Status: Merged
Merged at revision: 190
Proposed branch: lp:~jderose/filestore/helpers
Merge into: lp:filestore
Diff against target: 453 lines (+251/-50)
2 files modified
filestore.py (+95/-42)
test_filestore.py (+156/-8)
To merge this branch: bzr merge lp:~jderose/filestore/helpers
Reviewer: David Jordan
Review status: Approve
Review via email: mp+74932@code.launchpad.net

Description of the change

This merge:

* Adds 3 new fairly high-level FileStore methods (see the usage sketch after this list):

  FileStore.content_md5(_id) - compute a known-correct MD5 hash of the file with *_id* (for uploading to S3 using boto)

  FileStore.verify_and_move(tmp_fp, _id) - verify the content hash of a file in .dmedia/partial/ and, if correct, move the file into its canonical location. This replaces an equivalent method in the legacy FileStore that is used by the S3 and BitTorrent backends.

  FileStore.hash_and_move(tmp_fp) - compute the content hash of a file in .dmedia/tmp/, then move the file into its canonical location. This replaces an equivalent method in the legacy FileStore that is used by the transcoder.

* Makes the use of the `Leaf` namedtuple consistent (see the hashing sketch after this list). Experience has shown the code is simpler and more robust when you keep track of which position in the file (leaf_index) a given chunk of leaf_data corresponds to. As of several revisions ago, when a leaf is read by reader() or batch_reader(), the index and data are bundled together right there in a Leaf(index, data) namedtuple. But this merge fixes some inconsistencies higher up:

  - Consumers of `Leaf` no longer unpack it into (i, data)

  - Consumers of `Leaf` yield/return the exact `Leaf` rather than (i, data)

  - Hasher.update() has been renamed to the clearer Hasher.hash_leaf()

  - Hasher.hash_leaf() now takes a `Leaf` instead of `bytes`

  - Hasher is now more robust, as it checks that the Leaf.index coming from the readers matches its expected leaf_index
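
For illustration, here is a minimal sketch of how the three new methods might be used (the parent directory and file ID are hypothetical, and error handling is omitted):

    from os import path
    from filestore import FileStore

    fs = FileStore('/home/jderose')  # hypothetical parent directory
    _id = 'YDDL5ROVABZP4NBSJPC3HUQDVDAGAP5L26YFXD3UR6N5OLVN'  # hypothetical ID

    # Download backends (S3, BitTorrent): the ID is known up front, so verify
    # the file in .dmedia/partial/, then move it into its canonical location:
    tmp_fp = open(fs.partial_path(_id), 'rb')
    ch = fs.verify_and_move(tmp_fp, _id)

    # Known-correct MD5 of the (now canonical) file, e.g. for the Content-MD5
    # header when uploading to S3 with boto:
    (hex_md5, b64_md5) = fs.content_md5(_id)

    # Transcoder: the ID isn't known until the content exists, so hash the
    # file in .dmedia/tmp/, then move it into its canonical location:
    tmp_fp = open(path.join(fs.tmp, 'foo.mov'), 'rb')
    ch = fs.hash_and_move(tmp_fp)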
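
Likewise, a sketch of the consistent `Leaf` flow through `Hasher` (mirroring the updated reader_iter() doctest in the diff below; the file path is hypothetical):

    from filestore import Hasher, reader_iter

    h = Hasher()
    src_fp = open('/my/file', 'rb')
    for leaf in reader_iter(src_fp):  # each item is a Leaf(index, data) namedtuple
        h.hash_leaf(leaf)  # raises if leaf.index doesn't match the expected index
    ch = h.content_hash()  # also sets Hasher.closed = True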

David Jordan (dmj726) wrote:

Looks a lot simpler. Approved.

review: Approve

Preview Diff

=== modified file 'filestore.py'
--- filestore.py 2011-09-11 17:13:58 +0000
+++ filestore.py 2011-09-12 03:19:24 +0000
@@ -298,7 +298,8 @@
 import tempfile
 import stat
 from subprocess import check_call, CalledProcessError
-from base64 import b32encode, b32decode
+from base64 import b32encode, b32decode, b64encode
+import hashlib
 from threading import Thread
 from queue import Queue
 from collections import namedtuple
@@ -724,25 +725,26 @@
         self.array = bytearray()
         self.closed = False
 
-    def update(self, leaf):
-        assert isinstance(leaf, bytes)
-        assert 1 <= len(leaf) <= LEAF_SIZE
-        assert not self.closed
-        if len(leaf) < LEAF_SIZE:
+    def hash_leaf(self, leaf):
+        if not isinstance(leaf, Leaf):
+            raise TypeError(TYPE_ERROR.format('leaf', Leaf, type(leaf), leaf))
+        if self.closed:
+            raise Exception('Cannot call Hasher.hash_leaf() when Hasher.closed')
+        if leaf.index != self.leaf_index:
+            raise Exception('Expected leaf.index {}, got {}'.format(
+                self.leaf_index, leaf.index)
+            )
+
+        if len(leaf.data) < LEAF_SIZE:
             self.closed = True
-
-        leaf_hash = hash_leaf(self.leaf_index, leaf)
+        leaf_hash = hash_leaf(leaf.index, leaf.data)
         self.array.extend(leaf_hash)
-        self.file_size += len(leaf)
-
-        low = self.leaf_index * LEAF_SIZE + 1
-        high = (self.leaf_index + 1) * LEAF_SIZE
-        assert low <= self.file_size <= high
-
+        self.file_size += len(leaf.data)
         self.leaf_index += 1
         return leaf_hash
 
     def content_hash(self):
+        self.closed = True
         leaf_hashes = bytes(self.array)
         return ContentHash(
             hash_root(self.file_size, leaf_hashes),
@@ -824,14 +826,14 @@
     assert isinstance(src_fp, (io.BufferedReader, io.BufferedRandom))
     assert src_fp.mode in ('rb', 'rb+')
     src_fp.seek(0)
-    i = 0
+    index = 0
     while True:
         data = src_fp.read(LEAF_SIZE)
         if not data:
             queue.put(None)
             break
-        queue.put(Leaf(i, data))
-        i += 1
+        queue.put(Leaf(index, data))
+        index += 1
 
 
 def batch_reader(batch, queue):
@@ -890,8 +892,7 @@
     """
     Iterate through leaves in *src_fp*, reading in a separate thread.
 
-    The function yields a ``(leaf_index, leaf_data)`` tuple for each leaf in
-    *src_fp*.
+    The function yields a `Leaf` namedtuple for each leaf in *src_fp*.
 
     Many operations in the `FileStore` involve a loop where an 8 MiB leaf is
     read, then hashed, then possibly written to disk. As the hashing
@@ -904,8 +905,8 @@
     >>> def example():
    ...     h = Hasher()
    ...     src_fp = open('/my/file', 'rb')
-    ...     for (leaf_index, leaf) in reader_iter(src_fp):
-    ...         h.update(leaf)
+    ...     for leaf in reader_iter(src_fp):
+    ...         h.hash_leaf(leaf)
     ...
 
     The thread target is the `reader()` function.
@@ -987,7 +988,7 @@
         leaf = q.get()
         if leaf is EndFile:
             break
-        h.update(leaf.data)
+        h.hash_leaf(leaf)
         for tmp_fp in temps:
             tmp_fp.write(leaf.data)
     ch = h.content_hash()
@@ -1013,10 +1014,10 @@
     :param dst_fp: optional file opened in ``'wb'`` mode
     """
     hasher = Hasher()
-    for (i, leaf) in reader_iter(src_fp):
-        hasher.update(leaf)
+    for leaf in reader_iter(src_fp):
+        hasher.hash_leaf(leaf)
         if dst_fp:
-            dst_fp.write(leaf)
+            dst_fp.write(leaf.data)
     return hasher.content_hash()
 
 
@@ -1296,37 +1297,61 @@
         if size != file_size:
             corrupt = self.move_to_corrupt(src_fp, _id)
             raise SizeIntegrityError(self.parent, _id, file_size, size)
-        for (i, leaf) in reader_iter(src_fp):
-            got = hash_leaf(i, leaf)
-            expected = get_leaf_hash(leaf_hashes, i)
+        for leaf in reader_iter(src_fp):
+            got = hash_leaf(leaf.index, leaf.data)
+            expected = get_leaf_hash(leaf_hashes, leaf.index)
             if got != expected:
                 self.move_to_corrupt(src_fp, _id)
-                raise LeafIntegrityError(self.parent, _id, i, expected, got)
-            yield (i, leaf)
-        assert get_leaf_hash(leaf_hashes, i + 1) == b''
+                raise LeafIntegrityError(
+                    self.parent, _id, leaf.index, expected, got
+                )
+            yield leaf
+        assert get_leaf_hash(leaf_hashes, leaf.index + 1) == b''
 
     def verify_iter2(self, _id):
         """
         Yield each leaf as it's read, verifying file integrity after final leaf.
 
-        This method will yield ``(leaf_index, leaf_data)`` tuple for each leaf
-        in the file identified by *_id*. Use this method with care as the
-        integrity of the leaves is not known till after the last leaf has been
-        yielded.
+        This method will yield a `Leaf` namedtuple for each leaf in the file
+        identified by *_id*. Use this method with care as the integrity of the
+        leaves is not known till after the last leaf has been yielded.
 
         This method is similar to `FileStore.verify_iter()` except only the
         *_id* is needed.
         """
         src_fp = self.open(_id)
         h = Hasher()
-        for (i, leaf) in reader_iter(src_fp):
-            h.update(leaf)
-            yield (i, leaf)
+        for leaf in reader_iter(src_fp):
+            h.hash_leaf(leaf)
+            yield leaf
         c = h.content_hash()
         if c.id != _id:
             self.move_to_corrupt(src_fp, _id)
             raise FileIntegrityError(self.parent, _id, c.id)
 
+    def content_md5(self, _id):
+        """
+        Compute md5 hash of the file with *_id* for use in Content-MD5 header.
+
+        For example:
+
+        >>> fs = FileStore('/home/jderose') #doctest: +SKIP
+        >>> fs.content_md5('YDDL5ROVABZP4NBSJPC3HUQDVDAGAP5L26YFXD3UR6N5OLVN') #doctest: +SKIP
+        ('99ca2a74521ad7825768bbfe7fe0dc49', 'mcoqdFIa14JXaLv+f+DcSQ==')
+
+        This method guarantees the correct md5 hash is computed for the file
+        with *_id* because it verifies the file as it computes the md5 hash.
+        Note that you do not have this guarantee if you simply opened the file
+        with `FileStore.open()` and computed the md5 hash that way.
+
+        If you need the md5 hash for use with boto when uploading to S3, use
+        this method rather than letting boto compute the md5 hash itself.
+        """
+        md5 = hashlib.md5()
+        for leaf in self.verify_iter2(_id):
+            md5.update(leaf.data)
+        return (md5.hexdigest(), b64encode(md5.digest()).decode('utf-8'))
+
     def remove(self, _id):
         """
         Delete file with *_id* from underlying filesystem.
@@ -1445,7 +1470,7 @@
         """
         Move a file from its canonical location to its corrupt location.
 
-        While a file is found to be corrupt (meaning the computed content hash
+        When a file is found to be corrupt (meaning the computed content hash
         doesn't match the expected content hash), it is moved from its canonical
         location to a special corrupt location.
 
@@ -1476,6 +1501,34 @@
         src_fp.close()
         return dst
 
+    def verify_and_move(self, tmp_fp, _id):
+        allowed = (io.BufferedReader, io.BufferedRandom)
+        if not isinstance(tmp_fp, allowed):
+            raise TypeError(
+                TYPE_ERROR.format('tmp_fp', allowed, type(tmp_fp), tmp_fp)
+            )
+        if tmp_fp.name != self.partial_path(_id):
+            raise ValueError(
+                'bad partial_path() for {!r}: {!r}'.format(_id, tmp_fp.name)
+            )
+        ch = hash_fp(tmp_fp)
+        if ch.id != _id:
+            raise ValueError(
+                'expected {!r}, computed {!r}'.format(_id, ch.id)
+            )
+        self.move_to_canonical(tmp_fp, _id)
+        return ch
+
+    def hash_and_move(self, tmp_fp):
+        allowed = (io.BufferedReader, io.BufferedRandom)
+        if not isinstance(tmp_fp, allowed):
+            raise TypeError(
+                TYPE_ERROR.format('tmp_fp', allowed, type(tmp_fp), tmp_fp)
+            )
+        ch = hash_fp(tmp_fp)
+        self.move_to_canonical(tmp_fp, ch.id)
+        return ch
+
     def import_file(self, src_fp):
         """
         Atomically copy open file *src_fp* into this filestore.
@@ -1524,10 +1577,10 @@
         size = os.fstat(src_fp.fileno()).st_size
         temps = [fs.allocate_tmp(size) for fs in filestores]
         h = Hasher()
-        for (i, leaf) in reader_iter(src_fp):
-            h.update(leaf)
+        for leaf in reader_iter(src_fp):
+            h.hash_leaf(leaf)
             for tmp_fp in temps:
-                tmp_fp.write(leaf)
+                tmp_fp.write(leaf.data)
         c = h.content_hash()
         if c.id != _id:
             self.move_to_corrupt(src_fp, _id)

=== modified file 'test_filestore.py'
--- test_filestore.py 2011-09-11 09:13:31 +0000
+++ test_filestore.py 2011-09-12 03:19:24 +0000
@@ -1052,30 +1052,76 @@
         self.assertEqual(h.array, b'')
         self.assertFalse(h.closed)
 
-    def test_update(self):
-        h = filestore.Hasher()
-
-        self.assertEqual(h.update(LEAVES[0]), LEAF_HASHES[0:30])
+    def test_hash_leaf(self):
+        # Test with bad leaf type
+        h = filestore.Hasher()
+        with self.assertRaises(TypeError) as cm:
+            h.hash_leaf(b'nope')
+        self.assertEqual(
+            str(cm.exception),
+            TYPE_ERROR.format('leaf', filestore.Leaf, bytes, b'nope')
+        )
+
+        # We'll use these below
+        leaf0 = filestore.Leaf(0, LEAVES[0])
+        leaf1 = filestore.Leaf(1, LEAVES[1])
+        leaf2 = filestore.Leaf(2, LEAVES[2])
+
+        # Test when closed
+        h = filestore.Hasher()
+        h.closed = True
+        with self.assertRaises(Exception) as cm:
+            h.hash_leaf(leaf0)
+        self.assertEqual(
+            str(cm.exception),
+            'Cannot call Hasher.hash_leaf() when Hasher.closed'
+        )
+
+        # Test when leaf_index is wrong
+        h = filestore.Hasher()
+        h.leaf_index = 1
+        with self.assertRaises(Exception) as cm:
+            h.hash_leaf(leaf0)
+        self.assertEqual(
+            str(cm.exception),
+            'Expected leaf.index 1, got 0'
+        )
+
+        # Test when it's all good
+        h = filestore.Hasher()
+
+        self.assertEqual(h.hash_leaf(leaf0), LEAF_HASHES[0:30])
         self.assertEqual(h.leaf_index, 1)
         self.assertEqual(h.file_size, filestore.LEAF_SIZE)
         self.assertFalse(h.closed)
 
-        self.assertEqual(h.update(LEAVES[1]), LEAF_HASHES[30:60])
+        self.assertEqual(h.hash_leaf(leaf1), LEAF_HASHES[30:60])
         self.assertEqual(h.leaf_index, 2)
         self.assertEqual(h.file_size, filestore.LEAF_SIZE * 2)
         self.assertFalse(h.closed)
 
-        self.assertEqual(h.update(LEAVES[2]), LEAF_HASHES[60:90])
+        self.assertEqual(h.hash_leaf(leaf2), LEAF_HASHES[60:90])
         self.assertEqual(h.leaf_index, 3)
         self.assertEqual(h.file_size, sum(len(l) for l in LEAVES))
         self.assertTrue(h.closed)
 
     def test_content_hash(self):
+        leaf0 = filestore.Leaf(0, LEAVES[0])
+        leaf1 = filestore.Leaf(1, LEAVES[1])
+        leaf2 = filestore.Leaf(2, LEAVES[2])
+
         h = filestore.Hasher()
-        for l in LEAVES:
-            h.update(l)
+        for leaf in (leaf0, leaf1, leaf2):
+            h.hash_leaf(leaf)
         self.assertEqual(h.content_hash(), CH)
 
+        # Test that Hasher.content_hash() sets closed = True
+        h = filestore.Hasher()
+        h.hash_leaf(leaf0)
+        self.assertFalse(h.closed)
+        h.content_hash()
+        self.assertTrue(h.closed)
+
 
 class TestFileStore(TestCase):
     def test_init(self):
@@ -1671,6 +1717,50 @@
         self.assertFalse(path.exists(canonical))
         self.assertTrue(path.isfile(corrupt))
 
+    def test_content_md5(self):
+        tmp = TempDir()
+        fs = filestore.FileStore(tmp.dir)
+
+        canonical = fs.path(ID)
+        corrupt = fs.corrupt_path(ID)
+
+        # File doesn't exist
+        with self.assertRaises(IOError) as cm:
+            md5 = fs.content_md5(ID)
+        self.assertEqual(cm.exception.errno, 2)
+
+        # File exists:
+        fp = open(canonical, 'wb')
+        for leaf in LEAVES:
+            fp.write(leaf)
+        fp.close()
+
+        self.assertEqual(
+            fs.content_md5(ID),
+            ('99ca2a74521ad7825768bbfe7fe0dc49', 'mcoqdFIa14JXaLv+f+DcSQ==')
+        )
+
+        # File exists and is corrupted
+        fp = open(canonical, 'wb')
+        for leaf in LEAVES:
+            fp.write(leaf)
+        fp.write(b'F')
+        fp.close()
+        c = filestore.hash_fp(open(canonical, 'rb'))
+        self.assertNotEqual(c.id, ID)
+
+        self.assertTrue(path.isfile(canonical))
+        self.assertFalse(path.exists(corrupt))
+
+        with self.assertRaises(filestore.FileIntegrityError) as cm:
+            md5 = fs.content_md5(ID)
+        self.assertEqual(cm.exception.id, ID)
+        self.assertEqual(cm.exception.parent, tmp.dir)
+        self.assertEqual(cm.exception.bad_id, c.id)
+
+        self.assertFalse(path.exists(canonical))
+        self.assertTrue(path.isfile(corrupt))
+
     def test_remove(self):
         tmp = TempDir()
         fs = filestore.FileStore(tmp.dir)
@@ -1890,6 +1980,64 @@
         self.assertTrue(path.isfile(corrupt))
         self.assertEqual(open(corrupt, 'rb').read(), b'yup')
 
+    def test_verify_and_move(self):
+        # Test when it's all good
+        tmp = TempDir()
+        fs = filestore.FileStore(tmp.dir)
+        src = fs.partial_path(ID)
+        write_sample_file(src)
+        src_fp = open(src, 'rb')
+        self.assertFalse(fs.exists(ID))
+        self.assertEqual(fs.verify_and_move(src_fp, ID), CH)
+        self.assertTrue(fs.exists(ID))
+        self.assertEqual(fs.verify(ID), CH)
+
+        # Test when tmp_fp.name != partial_path(_id)
+        tmp = TempDir()
+        fs = filestore.FileStore(tmp.dir)
+        src = path.join(fs.tmp, 'foo.mov')
+        write_sample_file(src)
+        src_fp = open(src, 'rb')
+        with self.assertRaises(ValueError) as cm:
+            fs.verify_and_move(src_fp, ID)
+        self.assertEqual(
+            str(cm.exception),
+            'bad partial_path() for {!r}: {!r}'.format(ID, src)
+        )
+        self.assertFalse(fs.exists(ID))
+
+        # Test when partial has wrong content
+        tmp = TempDir()
+        fs = filestore.FileStore(tmp.dir)
+        src = fs.partial_path(ID)
+        src_fp = open(src, 'wb')
+        for leaf in LEAVES:
+            src_fp.write(leaf)
+        src_fp.write(b'F')
+        src_fp.close()
+        ch = filestore.hash_fp(open(src, 'rb'))
+        self.assertNotEqual(ch.id, ID)
+        src_fp = open(src, 'rb')
+        with self.assertRaises(ValueError) as cm:
+            fs.verify_and_move(src_fp, ID)
+        self.assertEqual(
+            str(cm.exception),
+            'expected {!r}, computed {!r}'.format(ID, ch.id)
+        )
+        self.assertFalse(fs.exists(ID))
+
+    def test_hash_and_move(self):
+        tmp = TempDir()
+        fs = filestore.FileStore(tmp.dir)
+
+        src = path.join(fs.tmp, 'foo.mov')
+        write_sample_file(src)
+        src_fp = open(src, 'rb')
+        self.assertFalse(fs.exists(ID))
+        self.assertEqual(fs.hash_and_move(src_fp), CH)
+        self.assertTrue(fs.exists(ID))
+        self.assertEqual(fs.verify(ID), CH)
+
     def test_import_file(self):
         tmp = TempDir()
         src = tmp.join('movie.mov')
