Merge lp:~jderose/dmedia/tie-size-to-id into lp:dmedia

Proposed by Jason Gerard DeRose
Status: Merged
Approved by: Jason Gerard DeRose
Approved revision: 170
Merged at revision: 167
Proposed branch: lp:~jderose/dmedia/tie-size-to-id
Merge into: lp:dmedia
Diff against target: 309 lines (+133/-18)
6 files modified
dmedia/downloader.py (+3/-2)
dmedia/filestore.py (+83/-9)
dmedia/schema.py (+1/-1)
dmedia/tests/helpers.py (+2/-2)
dmedia/tests/test_downloader.py (+1/-1)
dmedia/tests/test_filestore.py (+43/-3)
To merge this branch: bzr merge lp:~jderose/dmedia/tie-size-to-id
Reviewer Review Type Date Requested Status
Jason Gerard DeRose Approve
David Jordan Approve
Review via email: mp+50765@code.launchpad.net

Description of the change

Cryptographically ties doc['bytes'] to doc['_id'].

This allows us to ensure that for a given doc['_id'] (the top-hash), the doc['bytes'] and doc['_attachments']['leaves']['data'] are actually correct.

Validating doc['bytes'] is important because we want to be fairly confident about preallocating that much space before starting a file download. Otherwise a malicious peer need do nothing more than start circulating a bogus doc with the wrong doc['bytes'] to wreak all kinds of havoc.

Even though we're still using sha1, I'm doing this in a way that makes sense for when we migrate to Skein. The top-hash of a file whose size is 31415 bytes gets "personalized" like this:

tophash = sha1(b'dmedia/tophash 31415' + leaves).digest()

With Skein it would work like this:

tophash = skein512(leaves, pers=b'dmedia/tophash 31415').digest()

To post a comment you must log in.
Revision history for this message
Jason Gerard DeRose (jderose) wrote :

Ah, also note that leafhash_personalization() and leafhash() aren't currently being used... just some daydreaming about how I'll do this when we switch to Skein.

lp:~jderose/dmedia/tie-size-to-id updated
170. By Jason Gerard DeRose

Replaced somewhat silly HASH alias with sha1 so code is easier to read

Revision history for this message
David Jordan (dmj726) wrote :

Without signing this as a crypto expert, I'd say this code looks fine.

review: Approve
Revision history for this message
Jason Gerard DeRose (jderose) :
review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'dmedia/downloader.py'
2--- dmedia/downloader.py 2011-02-13 07:24:32 +0000
3+++ dmedia/downloader.py 2011-02-22 15:22:38 +0000
4@@ -24,6 +24,7 @@
5 """
6
7 from os import path
8+from hashlib import sha1
9 from base64 import b32encode
10 from urlparse import urlparse
11 from httplib import HTTPConnection, HTTPSConnection
12@@ -38,7 +39,7 @@
13 from . import __version__
14 from .constants import CHUNK_SIZE, TYPE_ERROR
15 from .errors import DownloadFailure
16-from .filestore import FileStore, HashList, HASH
17+from .filestore import FileStore, HashList
18
19
20 USER_AGENT = 'dmedia %s' % __version__
21@@ -154,7 +155,7 @@
22 def process_leaf(self, i, expected):
23 for r in xrange(3):
24 chunk = self.download_leaf(i)
25- got = b32encode(HASH(chunk).digest())
26+ got = b32encode(sha1(chunk).digest())
27 if got == expected:
28 self.dst_fp.write(chunk)
29 return chunk
30
31=== modified file 'dmedia/filestore.py'
32--- dmedia/filestore.py 2011-02-19 02:37:59 +0000
33+++ dmedia/filestore.py 2011-02-22 15:22:38 +0000
34@@ -155,7 +155,7 @@
35 from os import path
36 import stat
37 import tempfile
38-from hashlib import sha1 as HASH
39+from hashlib import sha1
40 from base64 import b32encode, b32decode
41 import json
42 import re
43@@ -291,6 +291,80 @@
44 return b32
45
46
47+def tophash_personalization(file_size):
48+ """
49+ Personalize the top-hash with *file_size*.
50+
51+ For example:
52+
53+ >>> tophash_personalization(3141)
54+ 'dmedia/tophash 3141'
55+
56+ This is used to cryptographically tie ``doc['bytes']`` to ``doc['_id']``.
57+ You can't change the leaves or the file size without affecting the top-hash.
58+
59+ The personalization is designed to be easy to implement in JavaScript. For
60+ example, this is the equivalent JavaScript function:
61+
62+ ::
63+
64+ function tophash_personalization(file_size) {
65+ return ['dmedia/tophash', file_size].join(' ');
66+ }
67+
68+ When hashing with Skein, this value would be used for the Skein
69+ personalization parameter. See PySkein and the Skein specification for
70+ details:
71+
72+ http://packages.python.org/pyskein/
73+
74+ http://www.skein-hash.info/
75+
76+ When hashing with sha1, the top-hash is calculated like this:
77+
78+ >>> from hashlib import sha1
79+ >>> from base64 import b32encode
80+ >>> pers = tophash_personalization(3141)
81+ >>> leaves = b'pretend this is the concatenated leaves'
82+ >>> b32encode(sha1(pers + leaves).digest()) # The top-hash
83+ 'M55ORBTYICEDQ2WUREDYIYYO6VUJ3R6S'
84+
85+ :param file_size: the file size in bytes (an ``int``)
86+ """
87+ return ' '.join(['dmedia/tophash', str(file_size)]).encode('utf-8')
88+
89+
90+def tophash(file_size):
91+ """
92+ Initialize hash for a file that is *file_size* bytes.
93+ """
94+ return sha1(tophash_personalization(file_size))
95+
96+
97+def leafhash_personalization(file_size, leaf_index):
98+ """
99+ Personalize the leaf-hash with *file_size* and *leaf_index*.
100+
101+ For example:
102+
103+ >>> leafhash_personalization(3141, 0)
104+ 'dmedia/leafhash 3141 0'
105+
106+ :param file_size: the file size in bytes (an ``int``)
107+ :param leaf_index: the index of this leaf (an ``int``, starting at zero)
108+ """
109+ return ' '.join(
110+ ['dmedia/leafhash', str(file_size), str(leaf_index)]
111+ ).encode('utf-8')
112+
113+
114+def leafhash(file_size, leaf_index):
115+ """
116+ Initialize hash for the *leaf_index* leaf in a file of *file_size* bytes.
117+ """
118+ return sha1(leafhash_personalization(file_size, leaf_index))
119+
120+
121 class HashList(object):
122 """
123 Simple hash-list (a 1-deep tree-hash).
124@@ -347,7 +421,7 @@
125 self.dst_fp = dst_fp
126 self.leaf_size = leaf_size
127 self.file_size = os.fstat(src_fp.fileno()).st_size
128- self.h = HASH()
129+ self.h = tophash(self.file_size)
130 self.leaves = []
131 self.q = Queue(4)
132 self.thread = Thread(target=self.hashing_thread)
133@@ -368,7 +442,7 @@
134 queue. This functionality is in its own method simply to make testing
135 easier.
136 """
137- digest = HASH(chunk).digest()
138+ digest = sha1(chunk).digest()
139 self.h.update(digest)
140 self.leaves.append(digest)
141 if self.dst_fp is not None:
142@@ -443,7 +517,7 @@
143 if fp.mode != 'rb':
144 raise ValueError("fp: must be opened in mode 'rb'; got %r" % fp.mode)
145 fp.seek(0) # Make sure we are at beginning of file
146- h = HASH()
147+ h = sha1()
148 size = os.fstat(fp.fileno()).st_size
149 h.update(str(size).encode('utf-8'))
150 h.update(fp.read(QUICK_ID_CHUNK))
151@@ -499,7 +573,7 @@
152 >>> from dmedia.tests import sample_mov # Sample .MOV file
153 >>> src_fp = open(sample_mov, 'rb')
154 >>> fs.import_file(src_fp, 'mov') #doctest: +ELLIPSIS
155- ('ZR765XWSF6S7JQHLUI4GCG5BHGPE252O', [...])
156+ ('TGX33XXWU3EVHEEY5J7NBOJGKBFXLEBK', [...])
157
158 And when you have the content-hash and extension, you can retrieve the full
159 path of the file using `FileStore.path()`:
160@@ -970,9 +1044,9 @@
161 temporary file name, like this:
162
163 >>> fs = FileStore()
164- >>> tmp = fs.tmp('ZR765XWSF6S7JQHLUI4GCG5BHGPE252O', 'mov', create=True)
165+ >>> tmp = fs.tmp('TGX33XXWU3EVHEEY5J7NBOJGKBFXLEBK', 'mov', create=True)
166 >>> tmp #doctest: +ELLIPSIS
167- '/tmp/store.../transfers/ZR765XWSF6S7JQHLUI4GCG5BHGPE252O.mov'
168+ '/tmp/store.../transfers/TGX33XXWU3EVHEEY5J7NBOJGKBFXLEBK.mov'
169
170 Then the downloader will write to the temporary file as it's being
171 downloaded:
172@@ -991,9 +1065,9 @@
173 Finally, the downloader will move the temporary file into its canonical
174 location:
175
176- >>> dst = fs.tmp_verify_move('ZR765XWSF6S7JQHLUI4GCG5BHGPE252O', 'mov')
177+ >>> dst = fs.tmp_verify_move('TGX33XXWU3EVHEEY5J7NBOJGKBFXLEBK', 'mov')
178 >>> dst #doctest: +ELLIPSIS
179- '/tmp/store.../ZR/765XWSF6S7JQHLUI4GCG5BHGPE252O.mov'
180+ '/tmp/store.../TG/X33XXWU3EVHEEY5J7NBOJGKBFXLEBK.mov'
181
182 The return value is the absolute path of the canonical file.
183
184
185=== modified file 'dmedia/schema.py'
186--- dmedia/schema.py 2011-02-15 07:53:55 +0000
187+++ dmedia/schema.py 2011-02-22 15:22:38 +0000
188@@ -110,7 +110,7 @@
189 >>> src_fp = open(sample_mov, 'rb')
190 >>> hashlist = HashList(src_fp)
191 >>> hashlist.run()
192-'ZR765XWSF6S7JQHLUI4GCG5BHGPE252O'
193+'TGX33XXWU3EVHEEY5J7NBOJGKBFXLEBK'
194
195
196 After calling `HashList.run()`, the binary digests of the leaf content-hashes
197
198=== modified file 'dmedia/tests/helpers.py'
199--- dmedia/tests/helpers.py 2011-02-20 13:01:32 +0000
200+++ dmedia/tests/helpers.py 2011-02-22 15:22:38 +0000
201@@ -33,7 +33,7 @@
202
203 from . import sample_mov, sample_thm
204
205-mov_hash = 'ZR765XWSF6S7JQHLUI4GCG5BHGPE252O'
206+mov_hash = 'TGX33XXWU3EVHEEY5J7NBOJGKBFXLEBK'
207 mov_leaves = [
208 b32decode('IXJTSUCYYFECGSG6JIB2R77CAJVJK4W3'),
209 b32decode('MA3IAHUOKXR4TRG7CWAPOO7U4WCV5WJ4'),
210@@ -46,7 +46,7 @@
211
212 mov_qid = 'GJ4AQP3BK3DMTXYOLKDK6CW4QIJJGVMN'
213
214-thm_hash = 'TA3676LFHP2SHNUHAVRYXP7YWGLMUQ4U'
215+thm_hash = 'GKZMOPVZILR43MZCXLVYP7T62XGBT7BQ'
216 thm_leaves = [b32decode('F6ATTKI6YVWVRBQQESAZ4DSUXQ4G457A')]
217 thm_qid = 'EYCDXXCNDB6OIIX5DN74J7KEXLNCQD5M'
218
219
220=== modified file 'dmedia/tests/test_downloader.py'
221--- dmedia/tests/test_downloader.py 2011-02-13 04:34:33 +0000
222+++ dmedia/tests/test_downloader.py 2011-02-22 15:22:38 +0000
223@@ -245,7 +245,7 @@
224 self.assertEqual(path.getsize(sample_mov), path.getsize(src))
225
226 e = raises(IntegrityError, inst.finalize)
227- self.assertEqual(e.got, 'UECTT7A7EIHZ2SGGBMMO5WTTSVU4SUWM')
228+ self.assertEqual(e.got, 'AYDIKK2IYAYTP7H5QCDK5FQ55F7QH4EN')
229 self.assertEqual(e.expected, mov_hash)
230 self.assertEqual(e.filename, src)
231 self.assertFalse(path.exists(dst_d))
232
233=== modified file 'dmedia/tests/test_filestore.py'
234--- dmedia/tests/test_filestore.py 2011-02-18 13:19:08 +0000
235+++ dmedia/tests/test_filestore.py 2011-02-22 15:22:38 +0000
236@@ -153,6 +153,46 @@
237 good = 'NWBNVXVK5DQGIOW7MYR4K3KA5K22W7NW'
238 assert f(good) is good
239
240+ def test_tophash(self):
241+ f = filestore.tophash
242+ h = f(31415)
243+ self.assertEqual(
244+ h.digest(),
245+ sha1(b'dmedia/tophash 31415').digest()
246+ )
247+ l = ''.join(mov_leaves)
248+ h.update(l)
249+ self.assertEqual(
250+ h.digest(),
251+ sha1(b'dmedia/tophash 31415' + l).digest()
252+ )
253+
254+ def test_leafhash(self):
255+ f = filestore.leafhash
256+ l = ''.join(mov_leaves)
257+
258+ h = f(1079991, 0)
259+ self.assertEqual(
260+ h.digest(),
261+ sha1(b'dmedia/leafhash 1079991 0').digest()
262+ )
263+ h.update(l)
264+ self.assertEqual(
265+ h.digest(),
266+ sha1(b'dmedia/leafhash 1079991 0' + l).digest()
267+ )
268+
269+ h = f(1079991, 1)
270+ self.assertEqual(
271+ h.digest(),
272+ sha1(b'dmedia/leafhash 1079991 1').digest()
273+ )
274+ h.update(l)
275+ self.assertEqual(
276+ h.digest(),
277+ sha1(b'dmedia/leafhash 1079991 1' + l).digest()
278+ )
279+
280 def test_pack_leaves(self):
281 f = filestore.pack_leaves
282
283@@ -412,7 +452,7 @@
284 src_fp.read(1024) # Make sure seek(0) is called
285 dst_fp = open(tmp.join('dst1.mov'), 'wb')
286 inst = self.klass(src_fp, dst_fp, 32 * 2**20)
287- self.assertEqual(inst.run(), 'R3QI4WFID6VDVK2NBB6WXE5ALMNLZAHQ')
288+ self.assertEqual(inst.run(), '6PTK7CX54TFB6HMHI62FJZL7XAEWT72J')
289 self.assertFalse(src_fp.closed) # Should not close file
290 self.assertFalse(dst_fp.closed) # Should not close file
291 dst_fp.close()
292@@ -430,7 +470,7 @@
293 src_fp.read(1024) # Make sure seek(0) is called
294 dst_fp = open(tmp.join('dst2.mov'), 'wb')
295 inst = self.klass(src_fp, dst_fp, 16 * 2**20)
296- self.assertEqual(inst.run(), 'B4IBNJ674EPXZZKNJYXFBDQQTFXIBSSC')
297+ self.assertEqual(inst.run(), 'HNNTABOJXN4ZBJMD665IA7QAZRJKDA3B')
298 self.assertFalse(src_fp.closed) # Should not close file
299 self.assertFalse(dst_fp.closed) # Should not close file
300 dst_fp.close()
301@@ -1133,7 +1173,7 @@
302 self.assertEqual(path.getsize(sample_mov), path.getsize(src))
303
304 e = raises(IntegrityError, inst.tmp_verify_move, mov_hash, 'mov')
305- self.assertEqual(e.got, 'UECTT7A7EIHZ2SGGBMMO5WTTSVU4SUWM')
306+ self.assertEqual(e.got, 'AYDIKK2IYAYTP7H5QCDK5FQ55F7QH4EN')
307 self.assertEqual(e.expected, mov_hash)
308 self.assertEqual(e.filename, src)
309 self.assertFalse(path.exists(dst_d))

Subscribers

People subscribed via source and target branches