Merge lp:~jderose/dmedia/2-core-bringup into lp:dmedia

Proposed by Jason Gerard DeRose
Status: Merged
Merged at revision: 223
Proposed branch: lp:~jderose/dmedia/2-core-bringup
Merge into: lp:dmedia
Diff against target: 9294 lines (+2511/-5127)
26 files modified
debian/changelog (+2/-2)
debian/control (+6/-4)
debian/rules (+8/-1)
dmedia/client.py (+382/-0)
dmedia/core.py (+8/-10)
dmedia/errors.py (+0/-74)
dmedia/extractor.py (+15/-18)
dmedia/filestore.py (+0/-1163)
dmedia/importer.py (+146/-388)
dmedia/schema.py (+225/-368)
dmedia/tests/base.py (+164/-0)
dmedia/tests/couch.py (+6/-9)
dmedia/tests/helpers.py (+10/-13)
dmedia/tests/test_client.py (+259/-0)
dmedia/tests/test_core.py (+31/-24)
dmedia/tests/test_extractor.py (+249/-265)
dmedia/tests/test_filestore.py (+28/-1316)
dmedia/tests/test_importer.py (+474/-878)
dmedia/tests/test_schema.py (+190/-225)
dmedia/tests/test_transfers.py (+118/-113)
dmedia/transfers.py (+26/-40)
dmedia/workers.py (+1/-1)
misc/hashbench.py (+24/-51)
misc/hashbench2.py (+0/-119)
misc/udisks-test.py (+138/-44)
setup.py (+1/-1)
To merge this branch: bzr merge lp:~jderose/dmedia/2-core-bringup
Reviewer: dmedia Dev
Status: Pending
Review via email: mp+76565@code.launchpad.net

Description of the change

Boy, if you thought the last merge proposal was big, just wait. This merge:

1) Ports the dmedia core to Python3, hooray!

2) Guts the old internal filestore and ports to the new Skein-based filestore, hooray! (There's a short usage sketch of the new API after this list.)

3) Revamps ImportWorker to take advantage of batch_import_iter() from the new filestore

4) Some schema tweaks, the most important of which is changing 'time' in the stored dict to 'mtime', and having this be the mtime from the file-system containing that store... important for quick reality checks so we can decide whether full verification of any files is urgently called for (see the sketch after this list)

5) Starts work on an improved dmedia HTTP client, which will talk to the native dmedia HTTP server (native as opposed to a remote service like UbuntuOne or S3); there's a usage sketch after this list

6) Is really really really big!
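
For anyone reviewing, here's a minimal sketch of the new filestore API as it's used in this branch. The calls are the ones visible in the diff below (FileStore(parentdir), fs.path(file_id)); the store doc fragment and file ID are hypothetical placeholders, and batch_import_iter() itself lives in python3-filestore so it isn't shown:

    from os import path
    from filestore import FileStore

    store = {'parentdir': '/media/MyDrive'}  # hypothetical store doc fragment
    file_id = 'ROHNRBKS...'                  # placeholder (a real base32 ID in practice)

    # One FileStore per parentdir; note the doc key is now 'parentdir',
    # not 'path' (see the core.py hunks below):
    fs = FileStore(store['parentdir'])

    # The canonical filename is derived from the content-hash ID alone;
    # the old per-file 'ext' argument is gone:
    filename = fs.path(file_id)
    if path.isfile(filename):
        print('file is present in this store')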
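
And to make the 'mtime' tweak in point 4 concrete, this is the kind of quick reality check it enables. A sketch only, not code from the branch, and the {'mtime': ...} shape of the per-store entry in doc['stored'] is an assumption here:

    import os

    def needs_urgent_verification(fs, file_id, stored_entry):
        # stored_entry is the per-store entry from doc['stored'];
        # assumed shape: {'mtime': ...}. If the mtime recorded at
        # import/verify time no longer matches the file-system's current
        # mtime, the file may have changed behind our back, so a full
        # content-hash verification is urgently called for.
        current = os.stat(fs.path(file_id)).st_mtime
        return current != stored_entry['mtime']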
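
Finally, a usage sketch of the new HTTP client for point 5, pieced together from dmedia/client.py below. It assumes you already have a filestore ContentHash `ch` (with leaf_hashes unpacked) and a FileStore `store`; the server URL is made up:

    from dmedia.client import HTTPClient, DownloadWriter, threaded_response_iter

    client = HTTPClient('http://192.168.1.42:8000/')  # hypothetical native dmedia server
    writer = DownloadWriter(ch, store)  # allocates (or resumes) the partial file

    while writer.missing:
        (start, stop) = writer.next_slice()     # next contiguous run of missing leaves
        response = client.get(ch, start, stop)  # leaf-wise HTTP Range request
        for leaf in threaded_response_iter(response, start):
            writer.write_leaf(leaf)             # checks each leaf hash as it lands

    writer.finish()  # full verify_and_move() into the canonical location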


Preview Diff

1=== modified file 'debian/changelog'
2--- debian/changelog 2011-08-24 01:23:18 +0000
3+++ debian/changelog 2011-09-22 11:43:29 +0000
4@@ -1,5 +1,5 @@
5-dmedia (11.09.0-0~natty) natty; urgency=low
6+dmedia (11.09.0-0~oneiric) oneiric; urgency=low
7
8 * Upstream 11.09.0 pre-release
9
10- -- Jason Gerard DeRose <jderose@novacut.com> Wed, 24 Aug 2011 01:17:38 +0000
11+ -- Jason Gerard DeRose <jderose@novacut.com> Thu, 22 Sep 2011 10:49:07 +0000
12
13=== modified file 'debian/control'
14--- debian/control 2011-09-16 05:32:07 +0000
15+++ debian/control 2011-09-22 11:43:29 +0000
16@@ -2,16 +2,18 @@
17 Section: python
18 Priority: optional
19 Maintainer: Jason Gerard DeRose <jderose@novacut.com>
20-Build-Depends: debhelper (>= 8.9), python (>= 2.7)
21+Build-Depends: debhelper (>= 8.9), python3 (>= 3.2)
22 Standards-Version: 3.9.2
23-X-Python-Version: 2.7
24+X-Python-Version: 3.2
25+X-Python3-Version: 3.2
26 Homepage: https://launchpad.net/dmedia
27
28 Package: dmedia
29 Architecture: all
30-Depends: ${misc:Depends}, python (>= 2.7),
31+Depends: ${misc:Depends}, python3 (>= 3.2),
32+ python3-filestore (>= 11.09),
33+ python3-microfiber (>= 11.09),
34 dc3 (>= 11.09),
35- python-microfiber (>= 11.09),
36 Description: distributed media library
37 A user-experienced-focused technology aimed at making file management go away
38 for both content-creation and content-consumption.
39
40=== modified file 'debian/rules'
41--- debian/rules 2011-08-27 14:14:43 +0000
42+++ debian/rules 2011-09-22 11:43:29 +0000
43@@ -1,3 +1,10 @@
44 #!/usr/bin/make -f
45 %:
46- dh $@ --with=python2
47+ dh $@ --with=python3
48+
49+override_dh_auto_install:
50+ for pyvers in $(shell py3versions -vr); do \
51+ python$$pyvers setup.py install \
52+ --install-layout=deb \
53+ --root $(CURDIR)/debian/dmedia; \
54+ done
55
56=== added file 'dmedia/client.py'
57--- dmedia/client.py 1970-01-01 00:00:00 +0000
58+++ dmedia/client.py 2011-09-22 11:43:29 +0000
59@@ -0,0 +1,382 @@
60+# dmedia: dmedia hashing protocol and file layout
61+# Copyright (C) 2011 Novacut Inc
62+#
63+# This file is part of `dmedia`.
64+#
65+# `dmedia` is free software: you can redistribute it and/or modify it under
66+# the terms of the GNU Affero General Public License as published by the Free
67+# Software Foundation, either version 3 of the License, or (at your option) any
68+# later version.
69+#
70+# `dmedia` is distributed in the hope that it will be useful, but WITHOUT ANY
71+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
72+# A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
73+# details.
74+#
75+# You should have received a copy of the GNU Affero General Public License along
76+# with `dmedia`. If not, see <http://www.gnu.org/licenses/>.
77+#
78+# Authors:
79+# Jason Gerard DeRose <jderose@novacut.com>
80+
81+"""
82+dmedia HTTP client.
83+"""
84+
85+import os
86+from urllib.parse import urlparse
87+from http.client import HTTPConnection, HTTPSConnection
88+from collections import OrderedDict
89+
90+from filestore import LEAF_SIZE, TYPE_ERROR, hash_leaf, reader_iter
91+from filestore import Leaf, ContentHash, SmartQueue, _start_thread
92+
93+from dmedia import __version__
94+
95+
96+USER_AGENT = 'dmedia {}'.format(__version__)
97+
98+
99+class HTTPError(Exception):
100+ """
101+ Base class for custom HTTP client exceptions.
102+ """
103+
104+ def __init__(self, response, method, path):
105+ self.response = response
106+ self.method = method
107+ self.path = path
108+ self.data = response.read()
109+ super().__init__(
110+ '{} {}: {} {}'.format(response.status, response.reason, method, path)
111+ )
112+
113+
114+class ClientError(HTTPError):
115+ """
116+ Base class for all 4xx Client Error exceptions.
117+ """
118+
119+
120+class BadRequest(ClientError):
121+ """
122+ 400 Bad Request.
123+ """
124+
125+
126+class Unauthorized(ClientError):
127+ """
128+ 401 Unauthorized.
129+ """
130+
131+
132+class Forbidden(ClientError):
133+ """
134+ 403 Forbidden.
135+ """
136+
137+
138+class NotFound(ClientError):
139+ """
140+ 404 Not Found.
141+ """
142+
143+
144+class MethodNotAllowed(ClientError):
145+ """
146+ 405 Method Not Allowed.
147+ """
148+
149+
150+class NotAcceptable(ClientError):
151+ """
152+ 406 Not Acceptable.
153+ """
154+
155+
156+class Conflict(ClientError):
157+ """
158+ 409 Conflict.
159+
160+ Raised when the request resulted in an update conflict.
161+ """
162+
163+
164+class PreconditionFailed(ClientError):
165+ """
166+ 412 Precondition Failed.
167+ """
168+
169+
170+class BadContentType(ClientError):
171+ """
172+ 415 Unsupported Media Type.
173+ """
174+
175+
176+class BadRangeRequest(ClientError):
177+ """
178+ 416 Requested Range Not Satisfiable.
179+ """
180+
181+
182+class ExpectationFailed(ClientError):
183+ """
184+ 417 Expectation Failed.
185+
186+ Raised when a bulk operation failed.
187+ """
188+
189+
190+class ServerError(HTTPError):
191+ """
192+ Used to raise exceptions for any 5xx Server Errors.
193+ """
194+
195+
196+errors = {
197+ 400: BadRequest,
198+ 401: Unauthorized,
199+ 403: Forbidden,
200+ 404: NotFound,
201+ 405: MethodNotAllowed,
202+ 406: NotAcceptable,
203+ 409: Conflict,
204+ 412: PreconditionFailed,
205+ 415: BadContentType,
206+ 416: BadRangeRequest,
207+ 417: ExpectationFailed,
208+}
209+
210+
211+def http_conn(url, **options):
212+ """
213+ Return (connection, parsed) tuple.
214+
215+ For example:
216+
217+ >>> (conn, parsed) = http_conn('http://foo.s3.amazonaws.com/')
218+
219+ The returned connection will be either an ``HTTPConnection`` or
220+ ``HTTPSConnection`` instance based on the *url* scheme.
221+
222+ The 2nd item in the returned tuple will be *url* parsed with ``urlparse()``.
223+ """
224+ u = urlparse(url)
225+ if u.scheme not in ('http', 'https'):
226+ raise ValueError('url scheme must be http or https: {!r}'.format(url))
227+ if not u.netloc:
228+ raise ValueError('bad url: {!r}'.format(url))
229+ klass = (HTTPConnection if u.scheme == 'http' else HTTPSConnection)
230+ conn = klass(u.netloc, **options)
231+ return (conn, u)
232+
233+
234+def bytes_range(start, stop=None):
235+ """
236+ Convert from Python slice semantics to an HTTP Range request.
237+
238+ Python slice semantics are quite natural to deal with, whereas the HTTP
239+ Range semantics are a touch wacky, so this function will help prevent silly
240+ errors.
241+
242+ For example, say we're requesting parts of a 10,000 byte long file. This
243+ requests the first 500 bytes:
244+
245+ >>> bytes_range(0, 500)
246+ 'bytes=0-499'
247+
248+ This requests the second 500 bytes:
249+
250+ >>> bytes_range(500, 1000)
251+ 'bytes=500-999'
252+
253+ All three of these request the final 500 bytes:
254+
255+ >>> bytes_range(9500, 10000)
256+ 'bytes=9500-9999'
257+ >>> bytes_range(-500)
258+ 'bytes=-500'
259+ >>> bytes_range(9500)
260+ 'bytes=9500-'
261+
262+ For details on HTTP Range header, see:
263+
264+ http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35
265+ """
266+ if start < 0:
267+ assert stop is None
268+ return 'bytes={}'.format(start)
269+ end = ('' if stop is None else stop - 1)
270+ return 'bytes={}-{}'.format(start, end)
271+
272+
273+def check_slice(ch, start, stop):
274+ """
275+ Validate the crap out of a leaf-wise slice of a file.
276+ """
277+ if not isinstance(ch, ContentHash):
278+ raise TypeError(
279+ TYPE_ERROR.format('ch', ContentHash, type(ch), ch)
280+ )
281+ if not isinstance(ch.leaf_hashes, tuple):
282+ raise TypeError(
283+ 'ch.leaf_hashes not unpacked for ch.id={}'.format(ch.id)
284+ )
285+ if not ch.leaf_hashes:
286+ raise ValueError('got empty ch.leaf_hashes for ch.id={}'.format(ch.id))
287+ if not isinstance(start, int):
288+ raise TypeError(
289+ TYPE_ERROR.format('start', int, type(start), start)
290+ )
291+ if not (stop is None or isinstance(stop, int)):
292+ raise TypeError(
293+ TYPE_ERROR.format('stop', int, type(stop), stop)
294+ )
295+ if not (0 <= start < len(ch.leaf_hashes)):
296+ raise ValueError('Need 0 <= start < {}; got start={}'.format(
297+ len(ch.leaf_hashes), start)
298+ )
299+ if not (stop is None or 1 <= stop <= len(ch.leaf_hashes)):
300+ raise ValueError('Need 1 <= stop <= {}; got stop={}'.format(
301+ len(ch.leaf_hashes), stop)
302+ )
303+ if not (stop is None or start < stop):
304+ raise ValueError(
305+ 'Need start < stop; got start={}, stop={}'.format(start, stop)
306+ )
307+
308+
309+def range_header(ch, start=0, stop=None):
310+ check_slice(ch, start, stop)
311+ if start == 0 and (stop is None or stop == len(ch.leaf_hashes)):
312+ return {}
313+ _start = start * LEAF_SIZE
314+ if stop is None or stop == len(ch.leaf_hashes):
315+ _stop = None
316+ else:
317+ _stop = stop * LEAF_SIZE
318+ return {'Range': bytes_range(_start, _stop)}
319+
320+
321+def response_reader(response, queue, start=0):
322+ try:
323+ index = start
324+ while True:
325+ data = response.read(LEAF_SIZE)
326+ if not data:
327+ queue.put(None)
328+ break
329+ queue.put(Leaf(index, data))
330+ index += 1
331+ except Exception as e:
332+ queue.put(e)
333+
334+
335+def threaded_response_iter(response, start=0):
336+ q = SmartQueue(4)
337+ thread = _start_thread(response_reader, response, q, start)
338+ while True:
339+ leaf = q.get()
340+ if leaf is None:
341+ break
342+ yield leaf
343+ thread.join() # Make sure reader() terminates
344+
345+
346+def response_iter(response, start=0):
347+ index = start
348+ while True:
349+ data = response.read(LEAF_SIZE)
350+ if not data:
351+ break
352+ yield Leaf(index, data)
353+ index += 1
354+
355+
356+def missing_leaves(ch, tmp_fp):
357+ assert isinstance(ch.leaf_hashes, tuple)
358+ assert os.fstat(tmp_fp.fileno()).st_size == ch.file_size
359+ assert tmp_fp.mode in ('rb+', 'r+b')
360+ tmp_fp.seek(0)
361+ for leaf in reader_iter(tmp_fp):
362+ leaf_hash = ch.leaf_hashes[leaf.index]
363+ if hash_leaf(leaf.index, leaf.data) != leaf_hash:
364+ yield (leaf.index, leaf_hash)
365+ assert leaf.index == len(ch.leaf_hashes) - 1
366+
367+
368+class DownloadWriter:
369+ def __init__(self, ch, store):
370+ self.ch = ch
371+ self.store = store
372+ self.tmp_fp = store.allocate_partial(ch.file_size, ch.id)
373+ self.resumed = (self.tmp_fp.mode != 'wb')
374+ if self.resumed:
375+ gen = missing_leaves(ch, self.tmp_fp)
376+ else:
377+ gen = enumerate(ch.leaf_hashes)
378+ self.missing = OrderedDict(gen)
379+
380+ def write_leaf(self, leaf):
381+ if hash_leaf(leaf.index, leaf.data) != self.ch.leaf_hashes[leaf.index]:
382+ return False
383+ self.tmp_fp.seek(leaf.index * LEAF_SIZE)
384+ self.tmp_fp.write(leaf.data)
385+ lh = self.missing.pop(leaf.index)
386+ assert lh == self.ch.leaf_hashes[leaf.index]
387+ return True
388+
389+ def next_slice(self):
390+ if not self.missing:
391+ raise Exception('done!')
392+ first = None
393+ for i in self.missing:
394+ if first is None:
395+ first = i
396+ last = i
397+ elif i != last + 1:
398+ return (first, last + 1)
399+ else:
400+ last = i
401+ return (first, last + 1)
402+
403+ def finish(self):
404+ assert not self.missing
405+ self.tmp_fp.close()
406+ tmp_fp = open(self.tmp_fp.name, 'rb')
407+ return self.store.verify_and_move(tmp_fp, self.ch.id)
408+
409+
410+class HTTPClient:
411+ def __init__(self, url, debug=False):
412+ (self.conn, u) = http_conn(url)
413+ self.basepath = (u.path if u.path.endswith('/') else u.path + '/')
414+ self.url = ''.join([u.scheme, '://', u.netloc, self.basepath])
415+ self.u = u
416+ if debug:
417+ self.conn.set_debuglevel(1)
418+
419+ def request(self, method, relpath, body=None, headers=None):
420+ assert not relpath.startswith('/')
421+ path = self.basepath + relpath
422+ h = {'User-Agent': USER_AGENT}
423+ if headers:
424+ h.update(headers)
425+ try:
426+ self.conn.request(method, path, body, h)
427+ response = self.conn.getresponse()
428+ except Exception as e:
429+ self.conn.close()
430+ raise e
431+ if response.status >= 500:
432+ raise ServerError(response, method, path)
433+ if response.status >= 400:
434+ E = errors.get(response.status, ClientError)
435+ raise E(response, method, path)
436+ return response
437+
438+ def get(self, ch, start=0, stop=None):
439+ headers = range_header(ch, start, stop)
440+ return self.request('GET', ch.id, headers=headers)
441+
442
443=== modified file 'dmedia/core.py'
444--- dmedia/core.py 2011-09-15 11:41:48 +0000
445+++ dmedia/core.py 2011-09-22 11:43:29 +0000
446@@ -56,8 +56,8 @@
447 import json
448
449 from microfiber import Database, NotFound, Conflict
450+from filestore import FileStore
451
452-from .filestore import FileStore
453 from .constants import DBNAME
454 from .transfers import TransferManager
455 from .schema import random_id, create_machine, create_store
456@@ -126,8 +126,8 @@
457 if not self.local['filestores']:
458 self.add_filestore(self.home)
459 else:
460- for (parentdir, store) in self.local['filestores'].iteritems():
461- assert store['path'] == parentdir
462+ for (parentdir, store) in self.local['filestores'].items():
463+ assert store['parentdir'] == parentdir
464 try:
465 self.init_filestore(store)
466 except Exception:
467@@ -137,11 +137,11 @@
468 except Conflict:
469 pass
470 if self.local.get('default_filestore') not in self.local['filestores']:
471- self.local['default_filestore'] = store['path']
472+ self.local['default_filestore'] = store['parentdir']
473 return self.local['filestores'][self.local['default_filestore']]
474
475 def init_filestore(self, store):
476- parentdir = store['path']
477+ parentdir = store['parentdir']
478 self._filestores[parentdir] = FileStore(parentdir)
479
480 def add_filestore(self, parentdir):
481@@ -156,16 +156,14 @@
482 store = create_store(parentdir, self.machine_id)
483 self.init_filestore(store)
484 self.local['filestores'][parentdir] = deepcopy(store)
485- self.local['default_filestore'] = store['path']
486+ self.local['default_filestore'] = store['parentdir']
487 self.db.save(self.local)
488 self.db.save(store)
489 return store
490
491 def get_file(self, file_id):
492- doc = self.db.get(file_id)
493- ext = doc.get('ext')
494- for fs in self._filestores.itervalues():
495- filename = fs.path(file_id, ext)
496+ for fs in self._filestores.values():
497+ filename = fs.path(file_id)
498 if path.isfile(filename):
499 return filename
500
501
502=== removed file 'dmedia/errors.py'
503--- dmedia/errors.py 2011-04-20 08:13:48 +0000
504+++ dmedia/errors.py 1970-01-01 00:00:00 +0000
505@@ -1,74 +0,0 @@
506-# Authors:
507-# Jason Gerard DeRose <jderose@novacut.com>
508-#
509-# dmedia: distributed media library
510-# Copyright (C) 2010 Jason Gerard DeRose <jderose@novacut.com>
511-#
512-# This file is part of `dmedia`.
513-#
514-# `dmedia` is free software: you can redistribute it and/or modify it under the
515-# terms of the GNU Affero General Public License as published by the Free
516-# Software Foundation, either version 3 of the License, or (at your option) any
517-# later version.
518-#
519-# `dmedia` is distributed in the hope that it will be useful, but WITHOUT ANY
520-# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
521-# A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
522-# details.
523-#
524-# You should have received a copy of the GNU Affero General Public License along
525-# with `dmedia`. If not, see <http://www.gnu.org/licenses/>.
526-
527-"""
528-Custom exceptions
529-"""
530-
531-class DmediaError(StandardError):
532- """
533- Base class for all custom dmedia exceptions.
534- """
535-
536- _format = ''
537-
538- def __init__(self, **kw):
539- self._kw = kw
540- for (key, value) in kw.iteritems():
541- assert not hasattr(self, key), 'conflicting kwarg %s.%s = %r' % (
542- self.__class__.__name__, key, value,
543- )
544- setattr(self, key, value)
545- super(DmediaError, self).__init__(self._format % kw)
546-
547-
548-class AmbiguousPath(DmediaError):
549- _format = '%(pathname)r resolves to %(abspath)r'
550-
551-
552-class FileStoreTraversal(DmediaError):
553- """
554- Raised when what should be internal path traverses out of FileStore base.
555-
556- For example:
557-
558- >>> raise FileStoreTraversal(abspath='/foo/barNone/baz', base='/foo/bar')
559- Traceback (most recent call last):
560- ...
561- FileStoreTraversal: '/foo/barNone/baz' outside base '/foo/bar'
562- """
563- _format = '%(abspath)r outside base %(base)r'
564-
565-
566-class DuplicateFile(DmediaError):
567- _format = 'chash=%(chash)r, src=%(src)r, dst=%(dst)r'
568-
569-
570-class DownloadFailure(DmediaError):
571- _format = 'leaf %(leaf)d expected %(expected)r; got %(got)r'
572-
573-
574-class IntegrityError(DmediaError):
575- _format = 'got chash %(got)r; expected %(expected)r for %(filename)r'
576-
577-
578-class TopHashError(DmediaError):
579- _format = 'got tophash %(got)r; expected %(expected)r (size: %(size)r bytes)'
580
581=== modified file 'dmedia/extractor.py'
582--- dmedia/extractor.py 2011-07-12 03:12:55 +0000
583+++ dmedia/extractor.py 2011-09-22 11:43:29 +0000
584@@ -25,7 +25,7 @@
585 """
586
587 from os import path
588-from subprocess import check_call, Popen, PIPE
589+from subprocess import check_call, check_output, CalledProcessError
590 import json
591 import tempfile
592 import shutil
593@@ -95,16 +95,16 @@
594 """
595 Attempt to extract EXIF metadata from file at *filename*.
596 """
597+ cmd = ['exiftool', '-j', filename]
598 try:
599- args = ['exiftool', '-j', filename]
600- (stdout, stderr) = Popen(args, stdout=PIPE).communicate()
601- exif = json.loads(stdout)[0]
602- assert isinstance(exif, dict)
603- for key in EXIFTOOL_IGNORE:
604- exif.pop(key, None)
605- return exif
606- except Exception as e:
607- return {u'Error': u'%s: %s' % (e.__class__.__name__, e)}
608+ output = check_output(cmd)
609+ except CalledProcessError:
610+ return {}
611+ exif = json.loads(output.decode('utf-8'))[0]
612+ assert isinstance(exif, dict)
613+ for key in EXIFTOOL_IGNORE:
614+ exif.pop(key, None)
615+ return exif
616
617
618 def parse_subsec_datetime(string):
619@@ -119,7 +119,7 @@
620 >>> parse_subsec_datetime('2010:10:21 01:44:37')
621 1287625477.0
622 """
623- if not isinstance(string, basestring):
624+ if not isinstance(string, str):
625 return
626 parts = string.split('.')
627 if len(parts) == 1:
628@@ -165,13 +165,10 @@
629 Attempt to extract video metadata from video at *filename*.
630 """
631 try:
632- args = ['totem-video-indexer', filename]
633- popen = Popen(args, stdout=PIPE)
634- (stdout, stderr) = popen.communicate()
635- if popen.returncode != 0:
636- return {}
637+ cmd = ['totem-video-indexer', filename]
638+ output = check_output(cmd).decode('utf-8')
639 info = {}
640- for line in stdout.splitlines():
641+ for line in output.splitlines():
642 pair = line.split('=', 1)
643 if len(pair) != 2:
644 continue
645@@ -240,7 +237,7 @@
646
647 def merge_exif(src, attachments):
648 exif = extract_exif(src)
649- for (key, values) in EXIF_REMAP.iteritems():
650+ for (key, values) in EXIF_REMAP.items():
651 for v in values:
652 if v in exif:
653 yield (key, exif[v])
654
655=== removed file 'dmedia/filestore.py'
656--- dmedia/filestore.py 2011-06-15 04:46:55 +0000
657+++ dmedia/filestore.py 1970-01-01 00:00:00 +0000
658@@ -1,1163 +0,0 @@
659-# Authors:
660-# Jason Gerard DeRose <jderose@novacut.com>
661-# Akshat Jain <ssj6akshat1234@gmail.com)
662-#
663-# dmedia: distributed media library
664-# Copyright (C) 2010, 2011 Jason Gerard DeRose <jderose@novacut.com>
665-#
666-# This file is part of `dmedia`.
667-#
668-# `dmedia` is free software: you can redistribute it and/or modify it under the
669-# terms of the GNU Affero General Public License as published by the Free
670-# Software Foundation, either version 3 of the License, or (at your option) any
671-# later version.
672-#
673-# `dmedia` is distributed in the hope that it will be useful, but WITHOUT ANY
674-# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
675-# A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
676-# details.
677-#
678-# You should have received a copy of the GNU Affero General Public License along
679-# with `dmedia`. If not, see <http://www.gnu.org/licenses/>.
680-
681-"""
682-Store files in a special layout according to their content-hash.
683-
684-The `FileStore` is the heart of dmedia. Files are assigned a canonical name
685-based on the file's content-hash, and are placed in a special layout within the
686-`FileStore` base directory.
687-
688-The files in a `FileStore` are read-only... they must be as modifying a file
689-will change its content-hash. The only way to modify a file is to copy the
690-original to a temporary file, modify it, and then place the new file into the
691-`FileStore`. This might seem like an unreasonable restriction, but it perfectly
692-captures the use case dmedia is concerned with... a distributed library of media
693-files.
694-
695-On the content-creation side, non-destructive editing is certainly the best
696-practice, especially in professional use cases. On the content consumption
697-side, modifying a file is rather rare. And the somewhat common use case --
698-modifying a file for the sake of updating metadata (say, EXIF) -- can instead be
699-accomplished by updating metadata in the corresponding CouchDB document.
700-
701-Importantly, without the read-only restriction, it would be impossible to make a
702-distributed file system whose file operations remain robust and atomic in the
703-face of arbitrary and prolonged network outages. True to its CouchDB
704-foundations, dmedia is designing with the assumption that network connectivity
705-is the exception rather than the rule.
706-
707-Please read on for the rationale of some key `FileStore` design decisions...
708-
709-
710-Design Decision: base32-encoded content-hash
711-============================================
712-
713-The `FileStore` layout was designed to allow the canonical filename to be
714-constructed from the content-hash in the simplest way possible, without
715-requiring any special decoding or encoding. For this reason, the content-hash
716-(as stored in CouchDB) is base32-encoded.
717-
718-Base32-encoding was chosen because:
719-
720- 1. It's more compact than base16/hex
721-
722- 2. It can be used to name files on case *insensitive* filesystems (whereas
723- base64-encoding cannot)
724-
725-Inside the `FileStore`, the first 2 characters of the content-hash are used for
726-the subdirectory name, and the remaining characters for the filename within that
727-subdirectory. For example:
728-
729->>> from os import path
730->>> chash = 'ZR765XWSF6S7JQHLUI4GCG5BHGPE252O'
731->>> path.join('/foo', chash[:2], chash[2:])
732-'/foo/ZR/765XWSF6S7JQHLUI4GCG5BHGPE252O'
733-
734-
735-Design Decision: canonical filenames have file extensions
736-=========================================================
737-
738-Strictly speaking, there is no technical reason to include a file extension on
739-the canonical filenames. However, there are some practical reasons that make
740-including the file extension worthwhile, despite additional complexity it adds
741-to the `FileStore` API.
742-
743-Most importantly, it allows files in a `FileStore` layout to be served with the
744-correct Content-Type by a vanilla web-server. A key design goal was to be able
745-to point, say, Apache at a dmedia `FileStore` directory have a useful dmedia
746-file server without requiring special Apache plugins for dmedia integration.
747-
748-It also provides broader software compatibility as many applications and
749-libraries do rely on the file extension for type determination. And the file
750-extension is helpful for developers, as a bit of intelligible information in
751-canonical filename will make the layout easier to explore, aid debugging.
752-
753-The current `FileStore` always includes the file extension on the canonical name
754-when the extension is provided by the calling code. However, the API is
755-designed to accommodate `FileStore` implementations that do not include the
756-file extension. The API is also designed so that the calling code isn't
757-required to provide the file extension... say, if the extension was ever removed
758-from the CouchDB schema.
759-
760-To accomplish this, files are identified by the content-hash and extension
761-together, and the extension is optional, defaulting to ``None``. This is the
762-typical calling signature:
763-
764->>> def canonical(chash, ext=None):
765-... pass
766-
767-For example:
768-
769->>> FileStore.relpath('ZR765XWSF6S7JQHLUI4GCG5BHGPE252O')
770-('ZR', '765XWSF6S7JQHLUI4GCG5BHGPE252O')
771->>> FileStore.relpath('ZR765XWSF6S7JQHLUI4GCG5BHGPE252O', 'mov')
772-('ZR', '765XWSF6S7JQHLUI4GCG5BHGPE252O.mov')
773-
774-
775-Design Decision: security good, path traversals bad
776-===================================================
777-
778-The `FileStore` is probably the most security sensitive part of dmedia in that
779-untrusted data (content-hash, file extension) is used to construct paths on the
780-filesystem. This means that the `FileStore` must be carefully designed to
781-prevent path traversal attacks (aka directory traversal attacks).
782-
783-Two lines of defense are used. First, the content-hash and file extension are
784-validated with the following functions:
785-
786- * `safe_b32()` - validates the content-hash
787-
788- * `safe_ext()` - validates the file extension
789-
790-Second, there are methods that ensure that paths constructed relative to the
791-`FileStore` base directory cannot be outside of the base directory:
792-
793- * `FileStore.check_path()` - ensures that a path is inside the base
794- directory
795-
796- * `FileStore.join()` - creates a path relative to the base directory,
797- ensures resulting path is inside the base directory
798-
799- * `FileStore.create_parent()` - creates a file's parent directory only if
800- that parent directory is inside the base directory
801-
802-Each line of defense is designed to fully prevent path traversals, assumes the
803-other defense doesn't exist or will fail. Together, they should provide a
804-strong defense against path traversal attacks.
805-
806-If you discover any security vulnerability in dmedia, please immediately file a
807-bug:
808-
809- https://bugs.launchpad.net/dmedia/+filebug
810-"""
811-
812-import os
813-from os import path
814-import stat
815-import tempfile
816-from hashlib import sha1
817-from base64 import b32encode, b32decode
818-import json
819-import re
820-import logging
821-from subprocess import check_call, CalledProcessError
822-from threading import Thread
823-from Queue import Queue
824-
825-from .errors import AmbiguousPath, FileStoreTraversal
826-from .errors import DuplicateFile, IntegrityError
827-from .constants import LEAF_SIZE, TYPE_ERROR, EXT_PAT
828-from .constants import TRANSFERS_DIR, IMPORTS_DIR, WRITES_DIR
829-
830-B32LENGTH = 32 # Length of base32-encoded hash
831-QUICK_ID_CHUNK = 2 ** 20 # Amount to read for quick_id()
832-FALLOCATE = '/usr/bin/fallocate'
833-EXT_RE = re.compile(EXT_PAT)
834-log = logging.getLogger()
835-
836-
837-def safe_path(pathname):
838- """
839- Ensure that *pathname* is a normalized absolute path.
840-
841- This is to help protect against path-traversal attacks and to prevent use of
842- ambiguous relative paths.
843-
844- For example, if *pathname* is not a normalized absolute path,
845- `AmbiguousPath` is raised:
846-
847- >>> safe_path('/foo/../root')
848- Traceback (most recent call last):
849- ...
850- AmbiguousPath: '/foo/../root' resolves to '/root'
851-
852- Otherwise *pathname* is returned unchanged:
853-
854- >>> safe_path('/foo/bar')
855- '/foo/bar'
856-
857- Also see `safe_open()`.
858- """
859- if path.abspath(pathname) != pathname:
860- raise AmbiguousPath(pathname=pathname, abspath=path.abspath(pathname))
861- return pathname
862-
863-
864-def safe_open(filename, mode):
865- """
866- Only open file if *filename* is a normalized absolute path.
867-
868- This is to help protect against path-traversal attacks and to prevent use of
869- ambiguous relative paths.
870-
871- Prior to opening the file, *filename* is checked with `safe_path()`. If
872- it's not an absolute normalized path, `AmbiguousPath` is raised:
873-
874- >>> safe_open('/foo/../root', 'rb')
875- Traceback (most recent call last):
876- ...
877- AmbiguousPath: '/foo/../root' resolves to '/root'
878-
879- Otherwise returns a ``file`` instance created with ``open()``.
880- """
881- return open(safe_path(filename), mode)
882-
883-
884-def safe_ext(ext):
885- r"""
886- Verify that extension *ext* contains only lowercase ascii letters, digits.
887-
888- A malicious *ext* could cause path traversal or other security gotchas,
889- thus this sanity check. When *wav* is valid, it is returned unchanged:
890-
891- >>> safe_ext('ogv')
892- 'ogv'
893- >>> safe_ext('tar.gz')
894- 'tar.gz'
895-
896- However, when *ext* does not conform, a ``TypeError`` or ``ValueError`` is
897- raised:
898-
899- >>> safe_ext('/../.ssh')
900- Traceback (most recent call last):
901- ...
902- ValueError: ext '/../.ssh' does not match pattern '^[a-z0-9]+(\\.[a-z0-9]+)?$'
903-
904- Also see `safe_b32()`.
905- """
906- if not isinstance(ext, basestring):
907- raise TypeError(
908- TYPE_ERROR % ('ext', basestring, type(ext), ext)
909- )
910- if not EXT_RE.match(ext):
911- raise ValueError(
912- 'ext %r does not match pattern %r' % (ext, EXT_PAT)
913- )
914- return ext
915-
916-
917-def safe_b32(b32):
918- """
919- Verify that *b32* is valid base32-encoding and correct length.
920-
921- A malicious *b32* could cause path traversal or other security gotchas,
922- thus this sanity check. When *b2* is valid, it is returned unchanged:
923-
924- >>> safe_b32('NWBNVXVK5DQGIOW7MYR4K3KA5K22W7NW')
925- 'NWBNVXVK5DQGIOW7MYR4K3KA5K22W7NW'
926-
927- However, when *b32* does not conform, a ``TypeError`` or ``ValueError`` is
928- raised:
929-
930- >>> safe_b32('NWBNVXVK5DQGIOW7MYR4K3KA')
931- Traceback (most recent call last):
932- ...
933- ValueError: len(b32) must be 32; got 24: 'NWBNVXVK5DQGIOW7MYR4K3KA'
934-
935- Also see `safe_ext()`.
936- """
937- if not isinstance(b32, basestring):
938- raise TypeError(
939- TYPE_ERROR % ('b32', basestring, type(b32), b32)
940- )
941- try:
942- b32decode(b32)
943- except TypeError as e:
944- raise ValueError('b32: cannot b32decode %r: %s' % (b32, e))
945- if len(b32) != B32LENGTH:
946- raise ValueError('len(b32) must be %d; got %d: %r' %
947- (B32LENGTH, len(b32), b32)
948- )
949- return b32
950-
951-
952-def tophash_personalization(file_size):
953- """
954- Personalize the top-hash with *file_size*.
955-
956- For example:
957-
958- >>> tophash_personalization(3141)
959- 'dmedia/tophash 3141'
960-
961- This is used to cryptographically tie ``doc['bytes']`` to ``doc['_id']``.
962- You can't change the leaves or the file size without affecting the top-hash.
963-
964- The personalization is designed to be easy to implement in JavaScript. For
965- example, this is the equivalent JavaScript function:
966-
967- ::
968-
969- function tophash_personalization(file_size) {
970- return ['dmedia/tophash', file_size].join(' ');
971- }
972-
973- When hashing with Skein, this value would be used for the Skein
974- personalization parameter. See PySkein and the Skein specification for
975- details:
976-
977- http://packages.python.org/pyskein/
978-
979- http://www.skein-hash.info/
980-
981- When hashing with sha1, the top-hash is calculated like this:
982-
983- >>> from hashlib import sha1
984- >>> from base64 import b32encode
985- >>> pers = tophash_personalization(3141)
986- >>> leaves = b'pretend this is the concatenated leaves'
987- >>> b32encode(sha1(pers + leaves).digest()) # The top-hash
988- 'M55ORBTYICEDQ2WUREDYIYYO6VUJ3R6S'
989-
990- :param file_size: the file size in bytes (an ``int``)
991- """
992- return ' '.join(['dmedia/tophash', str(file_size)]).encode('utf-8')
993-
994-
995-def tophash(file_size):
996- """
997- Initialize hash for a file that is *file_size* bytes.
998- """
999- return sha1(tophash_personalization(file_size))
1000-
1001-
1002-def leafhash_personalization(file_size, leaf_index):
1003- """
1004- Personalize the leaf-hash with *file_size* and *leaf_index*.
1005-
1006- For example:
1007-
1008- >>> leafhash_personalization(3141, 0)
1009- 'dmedia/leafhash 3141 0'
1010-
1011- :param file_size: the file size in bytes (an ``int``)
1012- :param leaf_index: the index of this leaf (an ``int``, starting at zero)
1013- """
1014- return ' '.join(
1015- ['dmedia/leafhash', str(file_size), str(leaf_index)]
1016- ).encode('utf-8')
1017-
1018-
1019-def leafhash(file_size, leaf_index):
1020- """
1021- Initialize hash for the *leaf_index* leaf in a file of *file_size* bytes.
1022- """
1023- return sha1(leafhash_personalization(file_size, leaf_index))
1024-
1025-
1026-class HashList(object):
1027- """
1028- Simple hash-list (a 1-deep tree-hash).
1029-
1030- For swarm upload/download, we need to keep the content hashes of the
1031- individual leaves, a list of which is available via the ``HashList.leaves``
1032- attribute after `HashList.run()` has been called.
1033-
1034- The effective content-hash for the entire file is a hash of the leaf hashes
1035- concatenated together. This is handy because it gives us a
1036- cryptographically strong way to associate individual leaves with the file
1037- "_id". This is important because otherwise malicious peers could pollute
1038- the network with invalid leaves, but victims wouldn't know anything was
1039- wrong till the entire file was downloaded. The whole file would fail to
1040- verify, and worse, the victim would have no way of knowing which leaves were
1041- invalid.
1042-
1043- In order to maximize IO utilization, the hash is computed in two threads.
1044- The main thread reads chunks from *src_fp* and puts them into a queue. The
1045- 2nd thread gets chunks from the queue, updates the hash, and then optionally
1046- writes the chunk to *dst_fp* if one was provided when the `HashList` was
1047- created.
1048-
1049- For some background, see:
1050-
1051- https://bugs.launchpad.net/dmedia/+bug/704272
1052-
1053- For more information about hash-lists and tree-hashes, see:
1054-
1055- http://en.wikipedia.org/wiki/Hash_list
1056-
1057- http://en.wikipedia.org/wiki/Tree_hash
1058- """
1059-
1060- def __init__(self, src_fp, dst_fp=None, leaf_size=LEAF_SIZE):
1061- if not isinstance(src_fp, file):
1062- raise TypeError(
1063- TYPE_ERROR % ('src_fp', file, type(src_fp), src_fp)
1064- )
1065- if src_fp.mode != 'rb':
1066- raise ValueError(
1067- "src_fp: mode must be 'rb'; got %r" % src_fp.mode
1068- )
1069- if dst_fp is not None:
1070- if not isinstance(dst_fp, file):
1071- raise TypeError(
1072- TYPE_ERROR % ('dst_fp', file, type(dst_fp), dst_fp)
1073- )
1074- if dst_fp.mode not in ('wb', 'r+b'):
1075- raise ValueError(
1076- "dst_fp: mode must be 'wb' or 'r+b'; got %r" % dst_fp.mode
1077- )
1078- self.src_fp = src_fp
1079- self.dst_fp = dst_fp
1080- self.leaf_size = leaf_size
1081- self.file_size = os.fstat(src_fp.fileno()).st_size
1082- self.h = tophash(self.file_size)
1083- self.leaves = []
1084- self.q = Queue(4)
1085- self.thread = Thread(target=self.hashing_thread)
1086- self.thread.daemon = True
1087- self.__ran = False
1088-
1089- def update(self, chunk):
1090- """
1091- Update hash with *chunk*, optionally write to dst_fp.
1092-
1093- This will append the content-hash of *chunk* to ``HashList.leaves`` and
1094- update the top-hash.
1095-
1096- If the `HashList` was created with a *dst_fp*, *chunk* will be will be
1097- written to *dst_fp*.
1098-
1099- `HashList.hashing_thread()` calls this method once for each chunk in the
1100- queue. This functionality is in its own method simply to make testing
1101- easier.
1102- """
1103- digest = sha1(chunk).digest()
1104- self.h.update(digest)
1105- self.leaves.append(digest)
1106- if self.dst_fp is not None:
1107- self.dst_fp.write(chunk)
1108-
1109- def hashing_thread(self):
1110- while True:
1111- chunk = self.q.get()
1112- if not chunk:
1113- break
1114- self.update(chunk)
1115-
1116- def run(self):
1117- assert self.__ran is False
1118- self.__ran = True
1119- self.src_fp.seek(0) # Make sure we are at beginning of file
1120- self.thread.start()
1121- while True:
1122- chunk = self.src_fp.read(self.leaf_size)
1123- self.q.put(chunk)
1124- if not chunk:
1125- break
1126- self.thread.join()
1127- return b32encode(self.h.digest())
1128-
1129-
1130-def pack_leaves(leaves, digest_bytes=20):
1131- """
1132- Pack leaves together into a ``bytes`` instance for CouchDB attachment.
1133-
1134- :param leaves: a ``list`` containing content-hash of each leaf in the file
1135- (content-hash is binary digest, not base32-encoded)
1136- :param digest_bytes: digest size in bytes; default is 20 (160 bits)
1137- """
1138- for (i, leaf) in enumerate(leaves):
1139- if len(leaf) != digest_bytes:
1140- raise ValueError('digest_bytes=%d, but len(leaves[%d]) is %d' % (
1141- digest_bytes, i, len(leaf)
1142- )
1143- )
1144- return ''.join(leaves)
1145-
1146-
1147-def unpack_leaves(data, digest_bytes=20):
1148- """
1149- Unpack binary *data* into a list of leaf digests.
1150-
1151- :param data: a ``bytes`` instance containing the packed leaf digests
1152- :param digest_bytes: digest size in bytes; default is 20 (160 bits)
1153- """
1154- assert isinstance(data, bytes)
1155- if len(data) % digest_bytes != 0:
1156- raise ValueError(
1157- 'len(data)=%d, not multiple of digest_bytes=%d' % (
1158- len(data), digest_bytes
1159- )
1160- )
1161- return [
1162- data[i*digest_bytes : (i+1)*digest_bytes]
1163- for i in xrange(len(data) / digest_bytes)
1164- ]
1165-
1166-
1167-def quick_id(fp):
1168- """
1169- Compute a quick reasonably unique ID for the open file *fp*.
1170- """
1171- if not isinstance(fp, file):
1172- raise TypeError(
1173- TYPE_ERROR % ('fp', file, type(fp), fp)
1174- )
1175- if fp.mode != 'rb':
1176- raise ValueError("fp: must be opened in mode 'rb'; got %r" % fp.mode)
1177- fp.seek(0) # Make sure we are at beginning of file
1178- h = sha1()
1179- size = os.fstat(fp.fileno()).st_size
1180- h.update(str(size).encode('utf-8'))
1181- h.update(fp.read(QUICK_ID_CHUNK))
1182- return b32encode(h.digest())
1183-
1184-
1185-def fallocate(size, filename):
1186- """
1187- Attempt to efficiently preallocate file *filename* to *size* bytes.
1188-
1189- If the fallocate command is available, it will always at least create an
1190- empty file (the equivalent of ``touch filename``), even the file-system
1191- doesn't support pre-allocation.
1192- """
1193- if not isinstance(size, (int, long)):
1194- raise TypeError(
1195- TYPE_ERROR % ('size', (int, long), type(size), size)
1196- )
1197- if size <= 0:
1198- raise ValueError('size must be >0; got %r' % size)
1199- filename = safe_path(filename)
1200- if not path.isfile(FALLOCATE):
1201- return None
1202- try:
1203- check_call([FALLOCATE, '-l', str(size), filename])
1204- return True
1205- except CalledProcessError:
1206- return False
1207-
1208-
1209-class FileStore(object):
1210- """
1211- Arranges files in a special layout according to their content-hash.
1212-
1213- Security note: this class must be carefully designed to prevent path
1214- traversal!
1215-
1216- To create a `FileStore`, you give it the directory that will be its base on
1217- the filesystem:
1218-
1219- >>> fs = FileStore('/home/jderose') #doctest: +SKIP
1220- >>> fs.base #doctest: +SKIP
1221- '/home/jderose/.dmedia'
1222-
1223- If you don't supply *base*, a temporary directory will be created for you:
1224-
1225- >>> fs = FileStore()
1226- >>> fs.base #doctest: +ELLIPSIS
1227- '/tmp/.../.dmedia'
1228-
1229- You can add files to the store using `FileStore.import_file()`:
1230-
1231- >>> from dmedia.tests import sample_mov # Sample .MOV file
1232- >>> src_fp = open(sample_mov, 'rb')
1233- >>> fs.import_file(src_fp, 'mov') #doctest: +ELLIPSIS
1234- ('TGX33XXWU3EVHEEY5J7NBOJGKBFXLEBK', [...])
1235-
1236- And when you have the content-hash and extension, you can retrieve the full
1237- path of the file using `FileStore.path()`:
1238-
1239- >>> fs.path('HIGJPQWY4PI7G7IFOB2G4TKY6PMTJSI7', 'mov') #doctest: +ELLIPSIS
1240- '/tmp/.../.dmedia/HI/GJPQWY4PI7G7IFOB2G4TKY6PMTJSI7.mov'
1241-
1242- As the files are assumed to be read-only and unchanging, moving a file into
1243- its canonical location must be atomic. There are 2 scenarios that must be
1244- considered:
1245-
1246- 1. Imports - we compute the content-hash as we copy the file into the
1247- `FileStore`, so this requires a randomly-named temporary file. When
1248- the copy completes, file is renamed to its canonical name.
1249-
1250- 2. Transfers - as uploads/downloads might stop or fail and then be
1251- resumed, this requires a canonically-named temporary file. As the
1252- file content-hash is already known (we have its meta-data in
1253- CouchDB), the temporary file is named by the content-hash. Once
1254- download completes, file is renamed to its canonical name.
1255-
1256- In both scenarios, the file size will be known when the temporary file is
1257- created, so an attempt is made to preallocate the entire file using the
1258- `fallocate()` function, which calls the Linux ``fallocate`` command.
1259- """
1260-
1261- def __init__(self, parent=None, dotdir='.dmedia'):
1262- if parent is None:
1263- parent = tempfile.mkdtemp(prefix='store.')
1264- self.parent = safe_path(parent)
1265- if not path.isdir(self.parent):
1266- raise ValueError('%s.parent not a directory: %r' %
1267- (self.__class__.__name__, parent)
1268- )
1269- self.base = path.join(self.parent, dotdir)
1270- try:
1271- os.mkdir(self.base)
1272- except OSError:
1273- pass
1274- if not path.isdir(self.base):
1275- raise ValueError('%s.base not a directory: %r' %
1276- (self.__class__.__name__, self.base)
1277- )
1278- if path.islink(self.base):
1279- raise ValueError('{!r} is symlink to {!r}'.format(
1280- self.base, os.readlink(self.base)
1281- )
1282- )
1283-
1284- def __repr__(self):
1285- return '%s(%r)' % (self.__class__.__name__, self.parent)
1286-
1287- ############################################
1288- # Methods to prevent path traversals attacks
1289- def check_path(self, pathname):
1290- """
1291- Verify that *pathname* in inside this filestore base directory.
1292- """
1293- abspath = path.abspath(pathname)
1294- if abspath.startswith(self.base + os.sep):
1295- return abspath
1296- raise FileStoreTraversal(
1297- pathname=pathname, base=self.base, abspath=abspath
1298- )
1299-
1300- def join(self, *parts):
1301- """
1302- Safely join *parts* with base directory to prevent path traversal.
1303-
1304- For security reasons, it's very important that you use this method
1305- rather than ``path.join()`` directly. This method will prevent path
1306- traversal attacks, ``path.join()`` will not.
1307-
1308- For example:
1309-
1310- >>> fs = FileStore()
1311- >>> fs.join('NW', 'BNVXVK5DQGIOW7MYR4K3KA5K22W7NW') #doctest: +ELLIPSIS
1312- '/tmp/.../.dmedia/NW/BNVXVK5DQGIOW7MYR4K3KA5K22W7NW'
1313-
1314- However, a `FileStoreTraversal` is raised if *parts* cause a path
1315- traversal outside of the `FileStore` base directory:
1316-
1317- >>> fs.join('../ssh') #doctest: +ELLIPSIS
1318- Traceback (most recent call last):
1319- ...
1320- FileStoreTraversal: '/tmp/.../ssh' outside base '/tmp/.../.dmedia'
1321-
1322- Or Likewise if an absolute path is included in *parts*:
1323-
1324- >>> fs.join('NW', '/etc', 'ssh') #doctest: +ELLIPSIS
1325- Traceback (most recent call last):
1326- ...
1327- FileStoreTraversal: '/etc/ssh' outside base '/tmp/.../.dmedia'
1328-
1329- Also see `FileStore.create_parent()`.
1330- """
1331- fullpath = path.join(self.base, *parts)
1332- return self.check_path(fullpath)
1333-
1334- def create_parent(self, filename):
1335- """
1336- Safely create the directory containing *filename*.
1337-
1338- To prevent path traversal attacks, this method will only create
1339- directories within the `FileStore` base directory. For example:
1340-
1341- >>> fs = FileStore()
1342- >>> fs.create_parent('/foo/my.ogv') #doctest: +ELLIPSIS
1343- Traceback (most recent call last):
1344- ...
1345- FileStoreTraversal: '/foo/my.ogv' outside base '/tmp/.../.dmedia'
1346-
1347- It also protects against malicious filenames like this:
1348-
1349- >>> fs.create_parent('/foo/../bar/my.ogv') #doctest: +ELLIPSIS
1350- Traceback (most recent call last):
1351- ...
1352- FileStoreTraversal: '/bar/my.ogv' outside base '/tmp/.../.dmedia'
1353-
1354- If doesn't already exists, the directory containing *filename* is
1355- created. Returns the directory containing *filename*.
1356-
1357- Also see `FileStore.join()`.
1358- """
1359- filename = self.check_path(filename)
1360- containing = path.dirname(filename)
1361- if not path.exists(containing):
1362- os.makedirs(containing)
1363- return containing
1364-
1365-
1366- #################################################
1367- # Methods for working with files in the FileStore
1368- @staticmethod
1369- def relpath(chash, ext=None):
1370- """
1371- Relative path of file with *chash*, ending with *ext*.
1372-
1373- For example:
1374-
1375- >>> FileStore.relpath('NWBNVXVK5DQGIOW7MYR4K3KA5K22W7NW')
1376- ('NW', 'BNVXVK5DQGIOW7MYR4K3KA5K22W7NW')
1377-
1378- Or with the file extension *ext*:
1379-
1380- >>> FileStore.relpath('NWBNVXVK5DQGIOW7MYR4K3KA5K22W7NW', ext='mov')
1381- ('NW', 'BNVXVK5DQGIOW7MYR4K3KA5K22W7NW.mov')
1382-
1383- Also see `FileStore.reltmp()`.
1384-
1385- :param chash: base32-encoded content-hash
1386- :param ext: normalized lowercase file extension, eg ``'mov'``
1387- """
1388- chash = safe_b32(chash)
1389- dname = chash[:2]
1390- fname = chash[2:]
1391- if ext:
1392- return (dname, '.'.join((fname, safe_ext(ext))))
1393- return (dname, fname)
1394-
1395- def path(self, chash, ext=None, create=False):
1396- """
1397- Returns path of file with content-hash *chash* and extension *ext*.
1398-
1399- For example:
1400-
1401- >>> fs = FileStore()
1402- >>> fs.path('NWBNVXVK5DQGIOW7MYR4K3KA5K22W7NW') #doctest: +ELLIPSIS
1403- '/tmp/.../.dmedia/NW/BNVXVK5DQGIOW7MYR4K3KA5K22W7NW'
1404-
1405- Or with a file extension:
1406-
1407- >>> fs.path('NWBNVXVK5DQGIOW7MYR4K3KA5K22W7NW', 'txt') #doctest: +ELLIPSIS
1408- '/tmp/.../.dmedia/NW/BNVXVK5DQGIOW7MYR4K3KA5K22W7NW.txt'
1409-
1410- If called with ``create=True``, the parent directory is created with
1411- `FileStore.create_parent()`.
1412-
1413- :param chash: base32-encoded content-hash
1414- :param ext: normalized lowercase file extension, eg ``'mov'``
1415- :param create: if ``True``, create parent directory if it does not
1416- already exist; default is ``False``
1417- """
1418- filename = self.join(*self.relpath(chash, ext))
1419- if create:
1420- self.create_parent(filename)
1421- return filename
1422-
1423- def exists(self, chash, ext=None):
1424- """
1425- Return ``True`` if a file with *chash* and *ext* exists.
1426-
1427- :param chash: base32-encoded content-hash
1428- :param ext: normalized lowercase file extension, eg ``'mov'``
1429- """
1430- return path.isfile(self.path(chash, ext))
1431-
1432- def open(self, chash, ext=None):
1433- """
1434- Open the file with *chash* and *ext* in ``'rb'`` mode.
1435-
1436- :param chash: base32-encoded content-hash
1437- :param ext: normalized lowercase file extension, eg ``'mov'``
1438- """
1439- return open(self.path(chash, ext), 'rb')
1440-
1441- def verify(self, chash, ext=None):
1442- """
1443- Verify integrity of file with *chash* and *ext*.
1444-
1445- If the file's content-hash does not equal *chash*, an `IntegrityError`
1446- is raised.
1447-
1448- Otherwise, the open ``file`` is returned after calling ``file.seek(0)``
1449- to put read position back at the start of the file.
1450-
1451- :param chash: base32-encoded content-hash
1452- :param ext: normalized lowercase file extension, eg ``'mov'``
1453- """
1454- src_fp = self.open(chash, ext)
1455- h = HashList(src_fp)
1456- got = h.run()
1457- if got != chash:
1458- corrupted = self.join(*self.reltmp2('corrupted', chash, ext))
1459- self.create_parent(corrupted)
1460- os.rename(src_fp.name, corrupted)
1461- src_fp.close()
1462- raise IntegrityError(got=got, expected=chash, filename=corrupted)
1463- src_fp.seek(0)
1464- return src_fp
1465-
1466- def remove(self, chash, ext=None):
1467- """
1468- Delete file with *chash* and *ext* from underlying filesystem.
1469-
1470- :param chash: base32-encoded content-hash
1471- :param ext: normalized lowercase file extension, eg ``'mov'``
1472- """
1473- filename = self.path(chash, ext)
1474- log.info('Deleting file %r from %r', filename, self)
1475- os.remove(filename)
1476-
1477-
1478- ###########################################################
1479- # Methods for working with temporary files in the FileStore
1480- @staticmethod
1481- def reltmp(chash, ext=None):
1482- """
1483- Relative path of temporary file with *chash*, ending with *ext*.
1484-
1485- For example:
1486-
1487- >>> FileStore.reltmp('NWBNVXVK5DQGIOW7MYR4K3KA5K22W7NW')
1488- ('transfers', 'NWBNVXVK5DQGIOW7MYR4K3KA5K22W7NW')
1489-
1490- Or with the file extension *ext*:
1491-
1492- >>> FileStore.reltmp('NWBNVXVK5DQGIOW7MYR4K3KA5K22W7NW', ext='mov')
1493- ('transfers', 'NWBNVXVK5DQGIOW7MYR4K3KA5K22W7NW.mov')
1494-
1495- Also see `FileStore.relpath()`.
1496-
1497- :param chash: base32-encoded content-hash
1498- :param ext: normalized lowercase file extension, eg ``'mov'``
1499- """
1500- chash = safe_b32(chash)
1501- if ext:
1502- return (TRANSFERS_DIR, '.'.join([chash, safe_ext(ext)]))
1503- return (TRANSFERS_DIR, chash)
1504-
1505- @staticmethod
1506- def reltmp2(state, chash, ext=None):
1507- assert state in ('transfers', 'corrupted')
1508- chash = safe_b32(chash)
1509- if ext:
1510- return (state, '.'.join([chash, safe_ext(ext)]))
1511- return (state, chash)
1512-
1513- def tmp(self, chash, ext=None, create=False):
1514- """
1515- Returns path of temporary file with *chash*, ending with *ext*.
1516-
1517- These temporary files are used for file transfers between dmedia peers,
1518- in which case the content-hash is already known. For example:
1519-
1520- >>> fs = FileStore()
1521- >>> fs.tmp('NWBNVXVK5DQGIOW7MYR4K3KA5K22W7NW') #doctest: +ELLIPSIS
1522- '/tmp/.../.dmedia/transfers/NWBNVXVK5DQGIOW7MYR4K3KA5K22W7NW'
1523-
1524- Or with a file extension:
1525-
1526- >>> fs.tmp('NWBNVXVK5DQGIOW7MYR4K3KA5K22W7NW', 'txt') #doctest: +ELLIPSIS
1527- '/tmp/.../.dmedia/transfers/NWBNVXVK5DQGIOW7MYR4K3KA5K22W7NW.txt'
1528-
1529- If called with ``create=True``, the parent directory is created with
1530- `FileStore.create_parent()`.
1531-
1532- :param chash: base32-encoded content-hash
1533- :param ext: normalized lowercase file extension, eg ``'mov'``
1534- :param create: if ``True``, create parent directory if it does not
1535- already exist; default is ``False``
1536- """
1537- filename = self.join(*self.reltmp(chash, ext))
1538- if create:
1539- self.create_parent(filename)
1540- return filename
1541-
1542- def allocate_for_transfer(self, size, chash, ext=None):
1543- """
1544- Open the canonical temporary file for a transfer (download or upload).
1545-
1546- When transferring files from other dmedia peers, the content-hash is
1547- already known. As we must be able to easily resume a download or
1548- upload, transfers use a stable, canonical temporary filename derived
1549- from the content-hash and file extension.
1550-
1551- The file *size* is also known, so an attempt is made to efficiently
1552- pre-allocate the temporary file using `fallocate()`.
1553-
1554- If the temporary file already exists, it means we're resuming a
1555- transfer. The file is opened in ``'r+b'`` mode, leaving data in the
1556- temporary file intact. It is the responsibility of higher-level code
1557- to verify the file leaf by leaf in order to determine what portions of
1558- the file have been transfered, what portions of the file still need to
1559- be transferred.
1560-
1561- Note that as the temporary file will likely be pre-allocated, higher-
1562- level code cannot use the size of the temporary file as a means of
1563- determining how much of the file has been transfered.
1564-
1565- If the temporary does not exist, and cannot be pre-allocated, a new
1566- empty file is opened in ``'wb'`` mode. Higher-level code must check
1567- the mode of the ``file`` instance and act accordingly.
1568-
1569- :param size: file size in bytes (an ``int``)
1570- :param chash: base32-encoded content-hash
1571- :param ext: normalized lowercase file extension, eg ``'mov'``
1572- """
1573- filename = self.tmp(chash, ext, create=True)
1574- fallocate(size, filename)
1575- try:
1576- fp = open(filename, 'r+b')
1577- if os.fstat(fp.fileno()).st_size > size:
1578- fp.truncate(size)
1579- return fp
1580- except IOError:
1581- return open(filename, 'wb')
1582-
1583- def allocate_for_import(self, size, ext=None):
1584- """
1585- Open a random temporary file for an import operation.
1586-
1587- When importing a file, the content-hash is computed as the file is
1588- copied into the `FileStore`. As the content-hash isn't known when
1589- allocating the temporary file, a randomly named temporary file is used.
1590-
1591- However, the file *size* is known, so an attempt is made to efficiently
1592- pre-allocate the temporary file using `fallocate()`.
1593-
1594- The file extension *ext* is optional and serves no other purpose than to
1595- aid in debugging. The value of *ext* used here has no effect on the
1596- ultimate canonical file name.
1597-
1598- :param size: file size in bytes (an ``int``)
1599- :param ext: normalized lowercase file extension, eg ``'mov'``
1600- """
1601- imports = self.join(IMPORTS_DIR)
1602- if not path.exists(imports):
1603- os.makedirs(imports)
1604- suffix = ('' if ext is None else '.' + ext)
1605- (fileno, filename) = tempfile.mkstemp(suffix=suffix, dir=imports)
1606- fallocate(size, filename)
1607- # FIXME: This probably isn't the best approach, but for now it works:
1608- tmp_fp = open(filename, 'r+b')
1609- os.close(fileno)
1610- return tmp_fp
1611-
1612- def allocate_for_write(self, ext=None):
1613- """
1614- Open a random temporary file for a write operation.
1615-
1616- Use this method to allocated a temporary file for cases when the file
1617- size is not known in advance, eg when transcoding or rendering.
1618-
1619- The file extension *ext* is optional and serves no other purpose than to
1620- aid in debugging. The value of *ext* used here has no effect on the
1621- ultimate canonical file name.
1622-
1623- :param ext: normalized lowercase file extension, eg ``'mov'``
1624- """
1625- writes = self.join(WRITES_DIR)
1626- if not path.exists(writes):
1627- os.makedirs(writes)
1628- suffix = ('' if ext is None else '.' + ext)
1629- (fileno, filename) = tempfile.mkstemp(suffix=suffix, dir=writes)
1630- tmp_fp = open(filename, 'r+b')
1631- os.close(fileno)
1632- return tmp_fp
1633-
1634- def tmp_move(self, tmp_fp, chash, ext=None):
1635- """
1636- Move temporary file into its canonical location.
1637-
1638- This method will securely and atomically move a temporary file into its
1639- canonical location.
1640-
1641- For example:
1642-
1643- >>> fs = FileStore()
1644- >>> tmp_fp = open(fs.join('foo.mov'), 'wb')
1645- >>> chash = 'ZR765XWSF6S7JQHLUI4GCG5BHGPE252O'
1646- >>> fs.tmp_move(tmp_fp, chash, 'mov') #doctest: +ELLIPSIS
1647- '/tmp/.../.dmedia/ZR/765XWSF6S7JQHLUI4GCG5BHGPE252O.mov'
1648-
1649- Note, however, that this method does *not* verify the content hash of
1650- the temporary file! This is by design as many operations will compute
1651- the content hash as they write to the temporary file. Other operations
1652- should use `FileStore.tmp_verify_move()` to verify and move in one step.
1653-
1654- Regardless, the full content hash should have been verified prior to
1655- calling this method. To ensure the content is not modified, operations
1656- must take these steps:
1657-
1658- 1. Open *tmp_fp* and keep it open, thereby retaining a lock on the
1659- file
1660-
1661- 2. Compute the full content hash, which can be done as content is
1662- written to *tmp_fp* (open in mode ``'r+b'`` to resume a transfer,
1663- but hashes of previously transferred leaves must still be verified)
1664-
1665- 3. With *tmp_fp* still open, move the temporary file into its
1666- canonical location using this method.
1667-
1668- As a simple locking mechanism, this method takes an open ``file`` rather
1669- than a filename, thereby preventing the file from being modified during
1670- the move. A ``ValueError`` is raised if *tmp_fp* is already closed.
1671-
1672- For portability reasons, this method requires that *tmp_fp* be opened in
1673- a binary mode: ``'rb'``, ``'wb'``, or ``'r+b'``. A ``ValueError`` is
1674- raised if opened in any other mode.
1675-
1676- For security reasons, this method will only move a temporary file
1677- located within the ``FileStore.base`` directory or a subdirectory
1678- thereof. If an attempt is made to move a file from outside the store,
1679- `FileStoreTraversal` is raised. See `FileStore.check_path()`.
1680-
1681- Just prior to moving the file, a call to ``os.fchmod()`` is made to set
1682- read-only permissions (0444). After the move, *tmp_fp* is closed.
1683-
1684- If the canonical file already exists, `DuplicateFile` is raised.
1685-
1686- The return value is the absolute path of the canonical file.
1687-
1688- :param tmp_fp: a ``file`` instance created with ``open()``
1689- :param chash: base32-encoded content-hash
1690- :param ext: normalized lowercase file extension, eg ``'mov'``
1691- """
1692- # Validate tmp_fp:
1693- if not isinstance(tmp_fp, file):
1694- raise TypeError(
1695- TYPE_ERROR % ('tmp_fp', file, type(tmp_fp), tmp_fp)
1696- )
1697- if tmp_fp.mode not in ('rb', 'wb', 'r+b'):
1698- raise ValueError(
1699- "tmp_fp: mode must be 'rb', 'wb', or 'r+b'; got %r" % tmp_fp.mode
1700- )
1701- if tmp_fp.closed:
1702- raise ValueError('tmp_fp is closed, must be open: %r' % tmp_fp.name)
1703- self.check_path(tmp_fp.name)
1704-
1705- # Get canonical name, check for duplicate:
1706- dst = self.path(chash, ext, create=True)
1707- if path.exists(dst):
1708- raise DuplicateFile(chash=chash, src=tmp_fp.name, dst=dst)
1709-
1710- # Set file to read-only (0444) and move into canonical location
1711- log.info('Moving file %r to %r', tmp_fp.name, dst)
1712- fileno = tmp_fp.fileno()
1713- tmp_fp.flush()
1714- os.fsync(fileno)
1715- os.fchmod(fileno, stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
1716- os.rename(tmp_fp.name, dst)
1717- tmp_fp.close()
1718-
1719- # Return canonical filename:
1720- return dst
1721-
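In code, the three steps above look roughly like this (a sketch using names from this module):

    tmp_fp = open(fs.tmp(chash, 'mov'), 'r+b')  # 1. open fp acts as the lock
    assert HashList(tmp_fp).run() == chash      # 2. verify the full content hash
    dst = fs.tmp_move(tmp_fp, chash, 'mov')     # 3. move while tmp_fp is still open
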
1722- def tmp_hash_move(self, tmp_fp, ext=None):
1723- """
1724- Hash temporary file, then move into its canonical location.
1725- """
1726- h = HashList(tmp_fp)
1727- chash = h.run()
1728- self.tmp_move(tmp_fp, chash, ext)
1729- return (chash, h.leaves)
1730-
1731- def tmp_verify_move(self, chash, ext=None):
1732- """
1733- Verify temporary file, then move into its canonical location.
1734-
1735- This method will check the content hash of the canonically-named
1736- temporary file with content hash *chash* and extension *ext*. If the
1737- content hash is correct, this method will then move the temporary file
1738- into its canonical location using `FileStore.tmp_move()`.
1739-
1740- If the content hash is incorrect, `IntegrityError` is raised. If the
1741- canonical file already exists, `DuplicateFile` is raised. Lastly, if
1742- the temporary file does not exist, ``IOError`` is raised.
1743-
1744- This method will typically be used with the BitTorrent downloader or
1745- similar, in which case the content hash will be known prior to
1746- downloading. The downloader will first determine the canonical
1747- temporary file name, like this:
1748-
1749- >>> fs = FileStore()
1750- >>> tmp = fs.tmp('TGX33XXWU3EVHEEY5J7NBOJGKBFXLEBK', 'mov', create=True)
1751- >>> tmp #doctest: +ELLIPSIS
1752- '/tmp/.../.dmedia/transfers/TGX33XXWU3EVHEEY5J7NBOJGKBFXLEBK.mov'
1753-
1754- Then the downloader will write to the temporary file as it's being
1755- downloaded:
1756-
1757- >>> from dmedia.tests import sample_mov # Sample .MOV file
1758- >>> src_fp = open(sample_mov, 'rb')
1759- >>> tmp_fp = open(tmp, 'wb')
1760- >>> while True:
1761- ... chunk = src_fp.read(2**20) # Read in 1MiB chunks
1762- ... if not chunk:
1763- ... break
1764- ... tmp_fp.write(chunk)
1765- ...
1766- >>> tmp_fp.close()
1767-
1768- Finally, the downloader will move the temporary file into its canonical
1769- location:
1770-
1771- >>> dst = fs.tmp_verify_move('TGX33XXWU3EVHEEY5J7NBOJGKBFXLEBK', 'mov')
1772- >>> dst #doctest: +ELLIPSIS
1773- '/tmp/.../.dmedia/TG/X33XXWU3EVHEEY5J7NBOJGKBFXLEBK.mov'
1774-
1775- The return value is the absolute path of the canonical file.
1776-
1777- :param chash: base32-encoded content-hash
1778- :param ext: normalized lowercase file extension, eg ``'mov'``
1779- """
1780- tmp = self.tmp(chash, ext)
1781- tmp_fp = open(tmp, 'rb')
1782- h = HashList(tmp_fp)
1783- got = h.run()
1784- if got != chash:
1785- raise IntegrityError(got=got, expected=chash, filename=tmp_fp.name)
1786- return self.tmp_move(tmp_fp, chash, ext)
1787-
1788- def import_file(self, src_fp, ext=None):
1789- """
1790- Atomically copy open file *src_fp* into this file store.
1791-
1792- The method will compute the content-hash of *src_fp* as it copies it to
1793- a temporary file within this store. Once the copying is complete, the
1794- temporary file will be moved to its canonical location using
1795- `FileStore.tmp_move()`.
1796-
1797- A `DuplicateFile` exception will be raised if the file already exists
1798- in this store.
1799-
1800- This method returns a ``(chash, leaves)`` tuple with the content hash
1801- (top-hash) and a list of the content hashes of the leaves. See
1802- `HashList` for details.
1803-
1804- Note that *src_fp* must have been opened in ``'rb'`` mode.
1805-
1806- :param src_fp: a ``file`` instance created with ``open()``
1807- :param ext: normalized lowercase file extension, eg ``'mov'``
1808- """
1809- size = os.fstat(src_fp.fileno()).st_size
1810- tmp_fp = self.allocate_for_import(size, ext)
1811- h = HashList(src_fp, tmp_fp)
1812- log.info('Importing file %r into %r', src_fp.name, self)
1813- chash = h.run()
1814- try:
1815- self.tmp_move(tmp_fp, chash, ext)
1816- except DuplicateFile as e:
1817- log.warning('File %r is duplicate of %r', src_fp.name, e.dst)
1818- raise DuplicateFile(src=src_fp.name, dst=e.dst, tmp=e.src,
1819- chash=chash, leaves=h.leaves
1820- )
1821- return (chash, h.leaves)
1822
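Typical use of `import_file()` looks like this (a sketch; the path is illustrative):

    src_fp = open('/media/EOS_DIGITAL/DCIM/100EOS5D2/MVI_5751.MOV', 'rb')
    (chash, leaves) = fs.import_file(src_fp, 'mov')
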
1823=== modified file 'dmedia/importer.py'
1824--- dmedia/importer.py 2011-09-16 04:50:48 +0000
1825+++ dmedia/importer.py 2011-09-22 11:43:29 +0000
1826@@ -25,352 +25,20 @@
1827 Store media files based on content-hash.
1828 """
1829
1830-import os
1831-from os import path
1832-import mimetypes
1833 import time
1834-from base64 import b64encode
1835+from copy import deepcopy
1836+from subprocess import check_call
1837 import logging
1838
1839 import microfiber
1840-
1841-from .schema import (
1842- random_id, create_file, create_batch, create_import, create_drive,
1843- create_partition
1844-)
1845-from .errors import DuplicateFile
1846-from .workers import (
1847- CouchWorker, CouchManager, register, isregistered, exception_name
1848-)
1849-from .filestore import FileStore, quick_id, safe_open, safe_ext, pack_leaves
1850-from .extractor import merge_metadata
1851-
1852-mimetypes.init()
1853-DOTDIR = '.dmedia'
1854+from filestore import FileStore, scandir, batch_import_iter
1855+
1856+from dmedia import workers, schema
1857+
1858+
1859 log = logging.getLogger()
1860
1861
1862-# FIXME: This needs to be done with some real inspection of the file contents,
1863-# but this is just a stopgap for the sake of getting the schema stable:
1864-MEDIA_MAP = {
1865- 'ogv': 'video',
1866- 'mov': 'video',
1867- 'avi': 'video',
1868- 'mts': 'video',
1869-
1870- 'oga': 'audio',
1871- 'flac': 'audio',
1872- 'wav': 'audio',
1873- 'mp3': 'audio',
1874-
1875- 'jpg': 'image',
1876- 'cr2': 'image',
1877- 'png': 'image',
1878-}
1879-
1880-
1881-def normalize_ext(name):
1882- """
1883- Return (root, ext) from *name* where extension is normalized to lower-case.
1884-
1885- If *name* has no extension, ``None`` is returned as 2nd item in (root, ext)
1886- tuple:
1887-
1888- >>> normalize_ext('IMG_2140.CR2')
1889- ('IMG_2140', 'cr2')
1890- >>> normalize_ext('test.jpg')
1891- ('test', 'jpg')
1892- >>> normalize_ext('hello_world')
1893- ('hello_world', None)
1894- """
1895- parts = name.rsplit('.', 1)
1896- if len(parts) == 2:
1897- (root, ext) = parts
1898- if root and ext:
1899- try:
1900- return (root, safe_ext(ext.lower()))
1901- except (ValueError, TypeError):
1902- pass
1903- return (name, None)
1904-
1905-
1906-def scanfiles(base, extensions=None):
1907- """
1908- Recursively iterate through files in directory *base*.
1909- """
1910- try:
1911- names = sorted(os.listdir(base))
1912- except StandardError:
1913- return
1914- dirs = []
1915- for name in names:
1916- if name.startswith('.') or name.endswith('~'):
1917- continue
1918- fullname = path.join(base, name)
1919- if path.islink(fullname):
1920- continue
1921- if path.isfile(fullname):
1922- (root, ext) = normalize_ext(name)
1923- if extensions is None or ext in extensions:
1924- yield {
1925- 'src': fullname,
1926- 'base': base,
1927- 'root': root,
1928- 'doc': {
1929- 'name': name,
1930- 'ext': ext,
1931- },
1932- }
1933- elif path.isdir(fullname):
1934- dirs.append(fullname)
1935- for fullname in dirs:
1936- for d in scanfiles(fullname, extensions):
1937- yield d
1938-
1939-
1940-def files_iter(base):
1941- """
1942- Recursively iterate through files in directory *base*.
1943-
1944- This is used for importing files from a card, after which the card will be
1945- automatically formatted, so we always import all files to be on the safe
1946- side.
1947-
1948- On the other hand, `scanfiles()` is used for migrating an existing library
1949- to dmedia... in which case we want to be more selective about which files to
1950- consider.
1951-
1952- Note that `files_iter()` does not catch errors like ``OSError``. We
1953- specifically want these errors to propagate up! We don't want a permission
1954- error to be interpreted as there being no files on the card!
1955- """
1956- if path.isfile(base):
1957- s = os.stat(base)
1958- yield (base, s.st_size, s.st_mtime)
1959- return
1960- names = sorted(os.listdir(base))
1961- dirs = []
1962- for name in names:
1963- fullname = path.join(base, name)
1964- if path.islink(fullname):
1965- continue
1966- if path.isfile(fullname):
1967- s = os.stat(fullname)
1968- yield (fullname, s.st_size, s.st_mtime)
1969- elif path.isdir(fullname):
1970- dirs.append(fullname)
1971- for fullname in dirs:
1972- for tup in files_iter(fullname):
1973- yield tup
1974-
1975-
1976-class ImportWorker(CouchWorker):
1977- def __init__(self, env, q, key, args):
1978- super(ImportWorker, self).__init__(env, q, key, args)
1979- (self.base, self.extract) = args
1980- self.filestore = FileStore(self.env['filestore']['path'])
1981- self.filestore_id = self.env['filestore']['_id']
1982-
1983- self.filetuples = None
1984- self._processed = []
1985- self.doc = None
1986- self._id = None
1987-
1988- def execute(self, base, extract=False):
1989- import_id = self.start()
1990- self.emit('started', import_id)
1991-
1992- files = self.scanfiles()
1993- total = len(files)
1994- self.emit('count', import_id, total)
1995-
1996- c = 1
1997- for (src, action) in self.import_all_iter():
1998- self.emit('progress', import_id, c, total,
1999- dict(
2000- action=action,
2001- src=src,
2002- )
2003- )
2004- c += 1
2005-
2006- stats = self.finalize()
2007- self.emit('finished', import_id, stats)
2008-
2009- def save(self):
2010- """
2011- Save current 'dmedia/import' record to CouchDB.
2012- """
2013- self.db.save(self.doc)
2014-
2015- def start(self):
2016- """
2017- Create the initial 'dmedia/import' record, return that record's ID.
2018- """
2019- assert self._id is None
2020- #drive = create_drive(self.base)
2021- #partition = create_partition(self.base)
2022- self.doc = create_import(self.base,
2023- None, #partition['_id'],
2024- batch_id=self.env.get('batch_id'),
2025- machine_id=self.env.get('machine_id'),
2026- )
2027- self._id = self.doc['_id']
2028- self.save()
2029- #try:
2030- # self.db.save(drive)
2031- #except microfiber.Conflict:
2032- # pass
2033- #try:
2034- # self.db.save(partition)
2035- #except microfiber.Conflict:
2036- # pass
2037- return self._id
2038-
2039- def scanfiles(self):
2040- """
2041- Build list of files that will be considered for import.
2042-
2043- After this method has been called, the ``ImportWorker.filetuples``
2044- attribute will contain ``(filename, size, mtime)`` tuples for all files
2045- being considered. This information is saved into the dmedia/import
2046- record to provide a rich audit trail and aid in debugging.
2047- """
2048- assert self.filetuples is None
2049- self.filetuples = tuple(files_iter(self.base))
2050- self.doc['log']['considered'] = [
2051- {'src': src, 'bytes': size, 'mtime': mtime}
2052- for (src, size, mtime) in self.filetuples
2053- ]
2054- total_bytes = sum(size for (src, size, mtime) in self.filetuples)
2055- self.doc['stats']['considered'] = {
2056- 'count': len(self.filetuples), 'bytes': total_bytes
2057- }
2058- self.save()
2059- return self.filetuples
2060-
2061- def _import_file(self, src):
2062- """
2063- Attempt to import *src* into dmedia library.
2064- """
2065- fp = safe_open(src, 'rb')
2066- stat = os.fstat(fp.fileno())
2067- if stat.st_size == 0:
2068- log.warning('File size is zero: %r', src)
2069- return ('empty', None)
2070-
2071- name = path.basename(src)
2072- (root, ext) = normalize_ext(name)
2073- try:
2074- (chash, leaves) = self.filestore.import_file(fp, ext)
2075- action = 'imported'
2076- except DuplicateFile as e:
2077- chash = e.chash
2078- leaves = e.leaves
2079- action = 'skipped'
2080- assert e.tmp.startswith(self.filestore.join('imports'))
2081- # FIXME: We should really probably move this into duplicates/ or
2082- # something and not delete till we verify integrity of what is
2083- # already in the filestore.
2084- os.remove(e.tmp)
2085-
2086- try:
2087- doc = self.db.get(chash)
2088- if self.filestore_id not in doc['stored']:
2089- doc['stored'][self.filestore_id] = {
2090- 'copies': 1,
2091- 'time': time.time(),
2092- }
2093- self.db.save(doc)
2094- return (action, doc)
2095- except microfiber.NotFound as e:
2096- pass
2097-
2098- leaf_hashes = b''.join(leaves)
2099- stored = {
2100- self.filestore_id: {
2101- 'copies': 1,
2102- }
2103- }
2104- doc = create_file(chash, stat.st_size, leaf_hashes, stored, ext=ext)
2105- assert doc['_id'] == chash
2106- doc.update(
2107- import_id=self._id,
2108- mtime=stat.st_mtime,
2109- name=name,
2110- dir=path.relpath(path.dirname(src), self.base),
2111- )
2112- if ext:
2113- doc['content_type'] = mimetypes.types_map.get('.' + ext)
2114- doc['media'] = MEDIA_MAP.get(ext)
2115- if self.extract:
2116- merge_metadata(src, doc)
2117- r = self.db.save(doc)
2118- assert r['id'] == chash
2119- return (action, doc)
2120-
2121- def import_file(self, src, size):
2122- """
2123- Wraps `ImportWorker._import_file()` with error handling and logging.
2124- """
2125- self._processed.append(src)
2126- try:
2127- (action, doc) = self._import_file(src)
2128- if action == 'empty':
2129- entry = src
2130- else:
2131- entry = {
2132- 'src': src,
2133- 'id': doc['_id'],
2134- }
2135- except Exception as e:
2136- log.exception('Error importing %r', src)
2137- action = 'error'
2138- entry = {
2139- 'src': src,
2140- 'name': exception_name(e),
2141- 'msg': str(e),
2142- }
2143- self.doc['log'][action].append(entry)
2144- self.doc['stats'][action]['count'] += 1
2145- self.doc['stats'][action]['bytes'] += size
2146- if action == 'error':
2147- self.save()
2148- return (action, entry)
2149-
2150- def import_all_iter(self):
2151- for (src, size, mtime) in self.filetuples:
2152- (action, entry) = self.import_file(src, size)
2153- yield (src, action)
2154-
2155- def finalize(self):
2156- """
2157- Finalize import and save final import record to CouchDB.
2158-
2159- The method will add the ``"time_end"`` key into the import record and
2160- save it to CouchDB. There will likely also be changes in the
2161- ``"log"`` and ``"stats"`` keys, which will likewise be saved to CouchDB.
2162- """
2163- assert len(self.filetuples) == len(self._processed)
2164- assert list(t[0] for t in self.filetuples) == self._processed
2165- self.doc['time_end'] = time.time()
2166- self.save()
2167- dt = self.doc['time_end'] - self.doc['time']
2168- log.info('Completed import of %r in %d:%02d',
2169- self.base, dt / 60, dt % 60
2170- )
2171- return self.doc['stats']
2172-
2173-
2174-def to_dbus_stats(stats):
2175- return dict(
2176- imported=stats['imported']['count'],
2177- imported_bytes=stats['imported']['bytes'],
2178- skipped=stats['skipped']['count'],
2179- skipped_bytes=stats['skipped']['bytes'],
2180- )
2181-
2182-
2183 def accumulate_stats(accum, stats):
2184 for (key, d) in stats.items():
2185 if key not in accum:
2186@@ -379,45 +47,131 @@
2187 accum[key][k] += v
2188
2189
2190-class ImportManager(CouchManager):
2191+class ImportWorker(workers.CouchWorker):
2192+ def __init__(self, env, q, key, args):
2193+ super().__init__(env, q, key, args)
2194+ self.basedir = args[0]
2195+ self.id = None
2196+ self.doc = None
2197+
2198+ def execute(self, basedir):
2199+ self.start()
2200+ self.scan()
2201+ self.import_all()
2202+
2203+ def start(self):
2204+ self.doc = schema.create_import(self.basedir,
2205+ machine_id=self.env.get('machine_id'),
2206+ batch_id=self.env.get('batch_id'),
2207+ )
2208+ self.id = self.doc['_id']
2209+ self.db.save(self.doc)
2210+ self.emit('started', self.id)
2211+
2212+ def scan(self):
2213+ self.batch = scandir(self.basedir)
2214+ self.doc['stats']['total'] = {
2215+ 'bytes': self.batch.size,
2216+ 'count': self.batch.count,
2217+ }
2218+ self.doc['import_order'] = [file.name for file in self.batch.files]
2219+ self.doc['files'] = dict(
2220+ (file.name, {'bytes': file.size, 'mtime': file.mtime})
2221+ for file in self.batch.files
2222+ )
2223+ self.db.save(self.doc)
2224+ self.emit('scanned', self.batch.count, self.batch.size)
2225+
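After `scan()`, the import doc records each candidate file by name, something like this (path and numbers illustrative):

    doc['files'] = {
        '/media/EOS_DIGITAL/DCIM/100EOS5D2/MVI_5751.MOV': {
            'bytes': 20202333,
            'mtime': 1316689318,
        },
    }
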
2226+ def get_filestores(self):
2227+ stores = []
2228+ for doc in self.env['filestores']:
2229+ fs = FileStore(doc['parentdir'])
2230+ fs.id = doc['_id']
2231+ fs.copies = doc['copies']
2232+ stores.append(fs)
2233+ return stores
2234+
2235+ def import_all(self):
2236+ stores = self.get_filestores()
2237+ try:
2238+ for (status, file, doc) in self.import_iter(*stores):
2239+ self.doc['stats'][status]['count'] += 1
2240+ self.doc['stats'][status]['bytes'] += file.size
2241+ self.doc['files'][file.name]['status'] = status
2242+ if doc is not None:
2243+ self.db.save(doc)
2244+ self.doc['files'][file.name]['id'] = doc['_id']
2245+ self.emit('progress', file.size)
2246+ self.doc['time_end'] = time.time()
2247+ finally:
2248+ self.db.save(self.doc)
2249+ self.emit('finished', self.doc['stats'])
2250+
2251+ def import_iter(self, *filestores):
2252+ common = {
2253+ 'import_id': self.id,
2254+ 'machine_id': self.env.get('machine_id'),
2255+ 'batch_id': self.env.get('batch_id'),
2256+ }
2257+ for (file, ch) in batch_import_iter(self.batch, *filestores):
2258+ if ch is None:
2259+ assert file.size == 0
2260+ yield ('empty', file, None)
2261+ continue
2262+ stored = dict(
2263+ (fs.id, {'copies': fs.copies, 'mtime': fs.stat(ch.id).mtime})
2264+ for fs in filestores
2265+ )
2266+ try:
2267+ doc = self.db.get(ch.id)
2268+ doc['stored'].update(stored)
2269+ yield ('duplicate', file, doc)
2270+ except microfiber.NotFound:
2271+ doc = schema.create_file(
2272+ ch.id, ch.file_size, ch.leaf_hashes, stored
2273+ )
2274+ doc['import'] = {
2275+ 'src': file.name,
2276+ 'mtime': file.mtime,
2277+ }
2278+ doc['import'].update(common)
2279+ doc['ctime'] = file.mtime
2280+ yield ('new', file, doc)
2281+
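Each FileStore passed to `import_iter()` contributes an entry like this to the file doc's 'stored' dict (store ID and mtime illustrative):

    'stored': {
        'MZZG2ZDSOQVSW2TEMVZG643F': {
            'copies': 1,
            'mtime': 1316689318,
        },
    }
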
2282+
2283+class ImportManager(workers.CouchManager):
2284 def __init__(self, env, callback=None):
2285- super(ImportManager, self).__init__(env, callback)
2286+ super().__init__(env, callback)
2287 self.doc = None
2288- self._total = 0
2289- self._completed = 0
2290- if not isregistered(ImportWorker):
2291- register(ImportWorker)
2292-
2293- def save(self):
2294- """
2295- Save current 'dmedia/batch' record to CouchDB.
2296- """
2297- self.db.save(self.doc)
2298+ self._reset_counters()
2299+ if not workers.isregistered(ImportWorker):
2300+ workers.register(ImportWorker)
2301+
2302+ def _reset_counters(self):
2303+ self._count = 0
2304+ self._total_count = 0
2305+ self._bytes = 0
2306+ self._total_bytes = 0
2307+
2308+ def get_worker_env(self, worker, key, args):
2309+ env = deepcopy(self.env)
2310+ env['batch_id'] = self.doc['_id']
2311+ return env
2312
2313 def first_worker_starting(self):
2314 assert self.doc is None
2315 assert self._workers == {}
2316- self._total = 0
2317- self._completed = 0
2318- self.doc = create_batch(self.env.get('machine_id'))
2319- self.save()
2320- self.emit('BatchStarted', self.doc['_id'])
2321+ self._reset_counters()
2322+ self.doc = schema.create_batch(self.env.get('machine_id'))
2323+ self.db.save(self.doc)
2324+ self.emit('batch_started', self.doc['_id'])
2325
2326 def last_worker_finished(self):
2327 assert self._workers == {}
2328 self.doc['time_end'] = time.time()
2329- self.save()
2330- self.emit('BatchFinished', self.doc['_id'],
2331- to_dbus_stats(self.doc['stats'])
2332- )
2333+ self.db.save(self.doc)
2334+ self.emit('batch_finished', self.doc['_id'], self.doc['stats'])
2335 self.doc = None
2336- log.info('Batch complete, compacting database...')
2337- self.db.post(None, '_compact')
2338-
2339- def get_worker_env(self, worker, key, args):
2340- env = dict(self.env)
2341- env['batch_id'] = self.doc['_id']
2342- return env
2343
2344 def on_error(self, key, exception, message):
2345 super(ImportManager, self).on_error(key, exception, message)
2346@@ -426,32 +180,36 @@
2347 self.doc['errors'].append(
2348 {'key': key, 'name': exception, 'msg': message}
2349 )
2350- self.save()
2351+ self.db.save(self.doc)
2352
2353 def on_started(self, key, import_id):
2354 self.doc['imports'].append(import_id)
2355- self.save()
2356- self.emit('ImportStarted', key, import_id)
2357-
2358- def on_count(self, key, import_id, total):
2359- self._total += total
2360- self.emit('ImportCount', key, import_id, total)
2361-
2362- def on_progress(self, key, import_id, completed, total, info):
2363- self._completed += 1
2364- self.emit('ImportProgress', key, import_id, completed, total, info)
2365-
2366- def on_finished(self, key, import_id, stats):
2367+ self.db.save(self.doc)
2368+ self.emit('import_started', key, import_id)
2369+
2370+ def on_scanned(self, key, total_count, total_bytes):
2371+ self._total_count += total_count
2372+ self._total_bytes += total_bytes
2373+ self.emit('batch_progress',
2374+ self._count, self._total_count,
2375+ self._bytes, self._total_bytes,
2376+ )
2377+
2378+ def on_progress(self, key, file_size):
2379+ self._count += 1
2380+ self._bytes += file_size
2381+ self.emit('batch_progress',
2382+ self._count, self._total_count,
2383+ self._bytes, self._total_bytes,
2384+ )
2385+
2386+ def on_finished(self, key, stats):
2387 accumulate_stats(self.doc['stats'], stats)
2388- self.save()
2389- self.emit('ImportFinished', key, import_id, to_dbus_stats(stats))
2390+ self.db.save(self.doc)
2391
2392 def get_batch_progress(self):
2393 with self._lock:
2394- return dict(
2395- completed=self._completed,
2396- total=self._total,
2397- )
2398+ return (self._count, self._total_count, self._bytes, self._total_bytes)
2399
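Callers unpack the batch progress as a 4-tuple, for example (manager being an ImportManager):

    (count, total_count, size, total_size) = manager.get_batch_progress()
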
2400- def start_import(self, base, extract=True):
2401- return self.start_job('ImportWorker', base, base, extract)
2402+ def start_import(self, base):
2403+ return self.start_job('ImportWorker', base, base)
2404
2405=== modified file 'dmedia/schema.py'
2406--- dmedia/schema.py 2011-09-16 04:19:18 +0000
2407+++ dmedia/schema.py 2011-09-22 11:43:29 +0000
2408@@ -54,8 +54,7 @@
2409
2410
2411 These test functions are used in the dmedia test suite, and 3rd-party apps would
2412-be well served by doing the same. Please read on for the rationale of some key
2413-dmedia schema design decisions...
2414+be well served by doing the same.
2415
2416
2417
2418@@ -77,69 +76,6 @@
2419 filename directly from a document ID is an important design consideration.
2420
2421
2422-Random IDs
2423-----------
2424-
2425-Random IDs are 120-bit random numbers, base32-encoded. They're much like a
2426-Version 4 (random) UUID, except dmedia random IDs have no reserved bits. For
2427-example:
2428-
2429->>> random_id() #doctest: +SKIP
2430-'NZXXMYLDOV2F6ZTUO5PWM5DX'
2431-
2432-
2433-Intrinsic IDs
2434--------------
2435-
2436-Files in the dmedia library are uniquely identified by their content-hash.
2437-dmedia *is* a distributed filesystem, but a quite simple one in that it only
2438-stores intrinsically-named, read-only files.
2439-
2440-The content-hash is computed as a hash-list (a 1 deep tree-hash). Currently
2441-this is done using the sha1 hash function with an 8 MiB leaf size, but dmedia
2442-is moving to Skein for the final hashing protocol.
2443-
2444-The content-hashes of the individual leaves are stored in the "leaves"
2445-attachment in the CouchDB document. This allows for file integrity checks with
2446-8 MiB granularity, and provides the basis for cryptographically robust swarm
2447-upload and download.
2448-
2449-The base32-encoded sha1 hash is 32-characters long. For example:
2450-
2451->>> from dmedia.filestore import HashList
2452->>> from dmedia.tests import sample_mov # Sample .MOV file
2453->>> src_fp = open(sample_mov, 'rb')
2454->>> hashlist = HashList(src_fp)
2455->>> hashlist.run()
2456-'TGX33XXWU3EVHEEY5J7NBOJGKBFXLEBK'
2457-
2458-
2459-After calling `HashList.run()`, the binary digests of the leaf content-hashes
2460-are available via the ``leaves`` attribute (which is a ``list``):
2461-
2462->>> from base64 import b32encode
2463->>> for d in hashlist.leaves:
2464-... print(repr(b32encode(d)))
2465-...
2466-'IXJTSUCYYFECGSG6JIB2R77CAJVJK4W3'
2467-'MA3IAHUOKXR4TRG7CWAPOO7U4WCV5WJ4'
2468-'FHF7KDMAGNYOVNYSYT6ZYWQLUOCTUADI'
2469-
2470-
2471-The overall file content-hash (aka the top-hash) is a hash of the leaf hashes.
2472-Note that this matches what was returned by `HashList.run()`:
2473-
2474->>> from hashlib import sha1
2475->>> b32encode(sha1(''.join(hashlist.leaves)).digest())
2476-'ZR765XWSF6S7JQHLUI4GCG5BHGPE252O'
2477-
2478-
2479-In the near future dmedia will very likely migrate to using a 200-bit Skein-512
2480-hash. See:
2481-
2482- http://packages.python.org/pyskein/
2483-
2484-
2485
2486 Design Decision: mime-like record types
2487 =======================================
2488@@ -310,29 +246,16 @@
2489
2490 from __future__ import print_function
2491
2492-from os import urandom
2493-from base64 import b32encode, b32decode, b64encode
2494+from base64 import b32encode, b64encode
2495 from hashlib import sha1
2496 import re
2497 import time
2498 import socket
2499-import platform
2500-
2501-from .constants import TYPE_ERROR, EXT_PAT
2502-#from .udisks import Device
2503-
2504-
2505-def random_id():
2506- """
2507- Returns a 120-bit base32-encoded random ID.
2508-
2509- The ID will be 24-characters long, URL and filesystem safe. For example:
2510-
2511- >>> random_id() #doctest: +SKIP
2512- 'OVRHK3TUOUQCWIDMNFXGC4TP'
2513-
2514- """
2515- return b32encode(urandom(15))
2516+
2517+from filestore import DIGEST_B32LEN, B32ALPHABET, TYPE_ERROR
2518+from microfiber import random_id, RANDOM_B32LEN
2519+
2520+from .constants import EXT_PAT
2521
2522
2523 # Some private helper functions that don't directly define any schema.
2524@@ -465,7 +388,7 @@
2525 >>> _isinstance('18', "doc['bytes']", int)
2526 Traceback (most recent call last):
2527 ...
2528- TypeError: doc['bytes']: need a <type 'int'>; got a <type 'str'>: '18'
2529+ TypeError: doc['bytes']: need a <class 'int'>; got a <class 'str'>: '18'
2530
2531 """
2532 for a in allowed:
2533@@ -520,7 +443,8 @@
2534 """
2535 value = _value(doc, path)
2536 label = _label(path)
2537- _isinstance(value, label, allowed)
2538+ if not (allowed is None or isinstance(value, allowed)):
2539+ raise TypeError(TYPE_ERROR.format(label, allowed, type(value), value))
2540 if value is None:
2541 return
2542 for c in checks:
2543@@ -542,7 +466,7 @@
2544 >>> _check_if_exists(doc, ['name'], str)
2545 Traceback (most recent call last):
2546 ...
2547- TypeError: doc['name']: need a <type 'str'>; got a <type 'int'>: 17
2548+ TypeError: doc['name']: need a <class 'str'>; got a <class 'int'>: 17
2549
2550
2551 See also `_check()` and `_exists()`.
2552@@ -658,79 +582,81 @@
2553 )
2554
2555
2556-def _base32(value, label):
2557- """
2558- Verify that *value* is a valid base32 encoded document ID.
2559-
2560- Document IDs must:
2561-
2562- 1. be valid base32 encoding
2563-
2564- 2. decode to data that is a multiple of 5-bytes (40-bits ) in length
2565-
2566- For example, invalid encoding:
2567-
2568- >>> _base32('MZZG2ZDS0QVSW2TEMVZG643F', "doc['_id']")
2569- Traceback (most recent call last):
2570- ...
2571- ValueError: doc['_id']: Non-base32 digit found: 'MZZG2ZDS0QVSW2TEMVZG643F'
2572-
2573- And an invalid value:
2574-
2575- >>> _base32('MFQWCYLBMFQWCYI=', "doc['_id']")
2576- Traceback (most recent call last):
2577- ...
2578- ValueError: len(b32decode(doc['_id'])) not multiple of 5: 'MFQWCYLBMFQWCYI='
2579-
2580- """
2581- try:
2582- decoded = b32decode(value)
2583- except TypeError as e:
2584- raise ValueError(
2585- '{}: {}: {!r}'.format(label, e, value)
2586- )
2587- if len(decoded) % 5 != 0:
2588- raise ValueError(
2589- 'len(b32decode({})) not multiple of 5: {!r}'.format(label, value)
2590+def _any_id(value, label):
2591+ """
2592+ Verify that *value* is a base32-encoded ID.
2593+ """
2594+ if not isinstance(value, str):
2595+ raise TypeError(
2596+ TYPE_ERROR.format(label, str, type(value), value)
2597+ )
2598+ if len(value) % 8 != 0:
2599+ raise ValueError(
2600+ '{}: length of ID ({}) not multiple of 8: {!r}'.format(
2601+ label, len(value), value)
2602+ )
2603+ if not set(value).issubset(B32ALPHABET):
2604+ raise ValueError(
2605+ '{}: ID not subset of B32ALPHABET: {!r}'.format(
2606+ label, value)
2607 )
2608
2609
2610 def _random_id(value, label):
2611 """
2612- Verify that *value* is a 120-bit base32 encoded random ID.
2613-
2614- For example:
2615-
2616- >>> _random_id('EIJ5EVPOJSO5ZBDY', "doc['_id']")
2617- Traceback (most recent call last):
2618- ...
2619- ValueError: doc['_id']: random ID must be 24 characters; got 'EIJ5EVPOJSO5ZBDY'
2620-
2621- """
2622- _base32(value, label)
2623- if len(value) != 24:
2624- raise ValueError(
2625- '{}: random ID must be 24 characters; got {!r}'.format(label, value)
2626- )
2627-
2628-
2629-def _content_id(value, label):
2630- """
2631- Verify that *value* is a 160-bit base32 encoded content hash.
2632-
2633- For example:
2634-
2635- >>> _content_id('EIJ5EVPOJSO5ZBDY', "doc['_id']")
2636- Traceback (most recent call last):
2637- ...
2638- ValueError: doc['_id']: content ID must be 32 characters; got 'EIJ5EVPOJSO5ZBDY'
2639-
2640- """
2641- _base32(value, label)
2642- if len(value) != 32:
2643- raise ValueError(
2644- '{}: content ID must be 32 characters; got {!r}'.format(label, value)
2645- )
2646+ Verify that *value* is a 120-bit base32-encoded random ID.
2647+
2648+ For example, the number ``'1'`` is not a valid base32 character:
2649+
2650+ >>> _random_id('1OTXJHVEXTKNXZHCMHDVF276', "doc['_id']")
2651+ Traceback (most recent call last):
2652+ ...
2653+ ValueError: doc['_id']: random ID not subset of B32ALPHABET: '1OTXJHVEXTKNXZHCMHDVF276'
2654+
2655+ """
2656+ if not isinstance(value, str):
2657+ raise TypeError(
2658+ TYPE_ERROR.format(label, str, type(value), value)
2659+ )
2660+ if len(value) != RANDOM_B32LEN:
2661+ raise ValueError(
2662+ '{}: random ID must be {} characters, got {}: {!r}'.format(
2663+ label, RANDOM_B32LEN, len(value), value)
2664+ )
2665+ if not set(value).issubset(B32ALPHABET):
2666+ raise ValueError(
2667+ '{}: random ID not subset of B32ALPHABET: {!r}'.format(
2668+ label, value)
2669+ )
2670+
2671+
2672+def _intrinsic_id(value, label):
2673+ """
2674+ Verify that *value* is a 240-bit base32-encoded intrinsic ID.
2675+
2676+ For example:
2677+
2678+ >>> _intrinsic_id('QE7POGENSF67FGKN2TD3FH4E', "doc['_id']")
2679+ Traceback (most recent call last):
2680+ ...
2681+ ValueError: doc['_id']: intrinsic ID must be 48 characters, got 24: 'QE7POGENSF67FGKN2TD3FH4E'
2682+
2683+ """
2684+ if not isinstance(value, str):
2685+ raise TypeError(
2686+ TYPE_ERROR.format(label, str, type(value), value)
2687+ )
2688+ if len(value) != DIGEST_B32LEN:
2689+ raise ValueError(
2690+ '{}: intrinsic ID must be {} characters, got {}: {!r}'.format(
2691+ label, DIGEST_B32LEN, len(value), value)
2692+ )
2693+ if not set(value).issubset(B32ALPHABET):
2694+ raise ValueError(
2695+ '{}: intrinsic ID not subset of B32ALPHABET: {!r}'.format(
2696+ label, value)
2697+ )
2698+
2699
2700 def _drive_id(drive):
2701 """
2702@@ -754,60 +680,28 @@
2703 """
2704 Verify that *doc* is a valid dmedia document.
2705
2706- This verifies that *doc* has the common schema requirements that all dmedia
2707- documents should have. The *doc* must:
2708-
2709- 1. Have "_id" that is base32-encoded and when decoded is a multiple
2710- of 40-bits (5 bytes)
2711-
2712- 2. Have "ver" equal to ``0``
2713-
2714- 3. Have "type" that matches ``'dmedia/[a-z]+$'``
2715-
2716- 4. Have "time" that is a ``float`` or ``int`` greater than or equal to
2717- zero
2718-
2719 For example, a conforming value:
2720
2721 >>> doc = {
2722 ... '_id': 'NZXXMYLDOV2F6ZTUO5PWM5DX',
2723 ... 'ver': 0,
2724- ... 'type': 'dmedia/file',
2725- ... 'time': 1234567890,
2726- ... }
2727- ...
2728- >>> check_dmedia(doc)
2729-
2730-
2731- And an invalid value:
2732-
2733- >>> doc = {
2734- ... '_id': 'NZXXMYLDOV2F6ZTUO5PWM5DX',
2735- ... 'ver': 0,
2736- ... 'kind': 'dmedia/file', # Changed!
2737- ... 'time': 1234567890,
2738- ... }
2739- ...
2740- >>> check_dmedia(doc)
2741- Traceback (most recent call last):
2742- ...
2743- ValueError: doc['type'] does not exist
2744+ ... 'type': 'dmedia/foo',
2745+ ... 'time': 1234567890,
2746+ ... }
2747+ ...
2748+ >>> check_dmedia(doc)
2749
2750 """
2751 _check(doc, [], dict)
2752-
2753- _check(doc, ['_id'], basestring,
2754- _base32,
2755+ _check(doc, ['_id'], None,
2756+ _any_id,
2757 )
2758-
2759 _check(doc, ['ver'], int,
2760 (_equals, 0),
2761 )
2762-
2763- _check(doc, ['type'], basestring,
2764+ _check(doc, ['type'], str,
2765 (_matches, 'dmedia/[a-z]+$'),
2766 )
2767-
2768 _check(doc, ['time'], (int, float),
2769 (_at_least, 0),
2770 )
2771@@ -820,72 +714,58 @@
2772 For example, a conforming value:
2773
2774 >>> doc = {
2775- ... '_id': 'ZR765XWSF6S7JQHLUI4GCG5BHGPE252O',
2776+ ... '_id': 'ROHNRBKS6T4YETP5JHEGQ3OLSBDBWRCKR2BKILJOA3CP7QZW',
2777+ ... '_attachments': {
2778+ ... 'leaf_hashes': {
2779+ ... 'data': 'v7t381LIyKsBCUYhkGreXx2qKTyyMfMD2eHWWp/L',
2780+ ... 'content_type': 'application/octet-stream',
2781+ ... },
2782+ ... },
2783 ... 'ver': 0,
2784 ... 'type': 'dmedia/file',
2785 ... 'time': 1234567890,
2786 ... 'bytes': 20202333,
2787- ... 'ext': 'mov',
2788 ... 'origin': 'user',
2789 ... 'stored': {
2790 ... 'MZZG2ZDSOQVSW2TEMVZG643F': {
2791 ... 'copies': 2,
2792- ... 'time': 1234567890,
2793- ... },
2794- ... },
2795- ... }
2796- ...
2797- >>> check_file(doc)
2798-
2799-
2800- And an invalid value:
2801-
2802- >>> doc = {
2803- ... '_id': 'ZR765XWSF6S7JQHLUI4GCG5BHGPE252O',
2804- ... 'ver': 0,
2805- ... 'type': 'dmedia/file',
2806- ... 'time': 1234567890,
2807- ... 'bytes': 20202333,
2808- ... 'ext': 'mov',
2809- ... 'origin': 'user',
2810- ... 'stored': {
2811- ... 'MZZG2ZDSOQVSW2TEMVZG643F': {
2812- ... 'number': 2, # Changed!
2813- ... 'time': 1234567890,
2814- ... },
2815- ... },
2816- ... }
2817- ...
2818- >>> check_file(doc)
2819- Traceback (most recent call last):
2820- ...
2821- ValueError: doc['stored']['MZZG2ZDSOQVSW2TEMVZG643F']['copies'] does not exist
2822+ ... 'mtime': 1234567890,
2823+ ... },
2824+ ... },
2825+ ... }
2826+ ...
2827+ >>> check_file(doc)
2828
2829 """
2830- check_dmedia(doc)
2831-
2832- _check(doc, ['type'], basestring,
2833+ # Common schema:
2834+ _check(doc, [], dict)
2835+ _check(doc, ['_id'], None,
2836+ _intrinsic_id,
2837+ )
2838+ _check(doc, ['ver'], int,
2839+ (_equals, 0),
2840+ )
2841+ _check(doc, ['type'], str,
2842 (_equals, 'dmedia/file'),
2843 )
2844-
2845- try:
2846- _check(doc, ['bytes'], int,
2847- (_at_least, 1),
2848- )
2849- except TypeError:
2850- _check(doc, ['bytes'], long,
2851- (_at_least, 1),
2852- )
2853-
2854- _check(doc, ['ext'], (type(None), basestring),
2855- (_matches, EXT_PAT),
2856- )
2857-
2858- _check(doc, ['origin'], basestring,
2859+ _check(doc, ['time'], (int, float),
2860+ (_at_least, 0),
2861+ )
2862+
2863+ # dmedia/file specific:
2864+ _check(doc, ['_attachments', 'leaf_hashes'], dict,
2865+ _nonempty,
2866+ )
2867+ _check(doc, ['_attachments', 'leaf_hashes', 'content_type'], str,
2868+ (_equals, 'application/octet-stream'),
2869+ )
2870+ _check(doc, ['bytes'], int,
2871+ (_at_least, 1),
2872+ )
2873+ _check(doc, ['origin'], str,
2874 _lowercase,
2875- (_is_in, 'user', 'download', 'paid', 'proxy', 'cache', 'render'),
2876+ (_is_in, 'user', 'paid', 'download', 'proxy', 'render', 'cache'),
2877 )
2878-
2879 _check(doc, ['stored'], dict,
2880 _nonempty,
2881 )
2882@@ -894,13 +774,13 @@
2883 _check(doc, ['stored', store, 'copies'], int,
2884 (_at_least, 0),
2885 )
2886- _check(doc, ['stored', store, 'time'], (int, float),
2887+ _check(doc, ['stored', store, 'mtime'], (int, float),
2888 (_at_least, 0),
2889 )
2890 _check_if_exists(doc, ['stored', store, 'verified'], (int, float),
2891 (_at_least, 0),
2892 )
2893- _check_if_exists(doc, ['stored', store, 'status'], basestring,
2894+ _check_if_exists(doc, ['stored', store, 'status'], str,
2895 (_is_in, 'partial', 'corrupt'),
2896 )
2897 _check_if_exists(doc, ['stored', store, 'corrupted'], (int, float),
2898@@ -911,22 +791,26 @@
2899
2900
2901 def check_file_optional(doc):
2902+ # 'ext' like 'mov'
2903+ _check_if_exists(doc, ['ext'], str,
2904+ (_matches, EXT_PAT),
2905+ )
2906
2907 # 'content_type' like 'video/quicktime'
2908- _check_if_exists(doc, ['content_type'], basestring)
2909+ _check_if_exists(doc, ['content_type'], str)
2910
2911 # 'content_encoding' like 'gzip'
2912- _check_if_exists(doc, ['content_encoding'], basestring,
2913+ _check_if_exists(doc, ['content_encoding'], str,
2914 (_is_in, 'gzip', 'deflate'),
2915 )
2916
2917 # 'media' like 'video'
2918- _check_if_exists(doc, ['media'], basestring,
2919+ _check_if_exists(doc, ['media'], str,
2920 (_is_in, 'video', 'audio', 'image'),
2921 )
2922
2923- # 'mtime' like 1234567890
2924- _check_if_exists(doc, ['mtime'], (int, float),
2925+ # 'ctime' like 1234567890
2926+ _check_if_exists(doc, ['ctime'], (int, float),
2927 (_at_least, 0),
2928 )
2929
2930@@ -936,11 +820,11 @@
2931 )
2932
2933 # name like 'MVI_5899.MOV'
2934- _check_if_exists(doc, ['name'], basestring)
2935+ _check_if_exists(doc, ['name'], str)
2936
2937 # dir like 'DCIM/100EOS5D2'
2938 # FIXME: Should save this as a list so path is portable
2939- _check_if_exists(doc, ['dir'], basestring)
2940+ _check_if_exists(doc, ['dir'], str)
2941
2942 # 'meta' like {'iso': 800}
2943 _check_if_exists(doc, ['meta'], dict)
2944@@ -956,52 +840,39 @@
2945 """
2946 Verify that *doc* is a valid "dmedia/store" document.
2947
2948- To be a valid 'dmedia/store' record, *doc* must:
2949-
2950- 1. conform with `check_dmedia()`
2951-
2952- 2. have 'plugin' that equal to 'filestore', 'removable_filestore',
2953- 'ubuntuone', or 's3'
2954-
2955- 3. have 'copies' that is an ``int`` >= 1
2956-
2957 For example, a conforming value:
2958
2959 >>> doc = {
2960 ... '_id': 'NZXXMYLDOV2F6ZTUO5PWM5DX',
2961 ... 'ver': 0,
2962- ... 'type': 'dmedia/file',
2963- ... 'time': 1234567890,
2964- ... 'plugin': 'filestore',
2965- ... 'copies': 2,
2966- ... }
2967- ...
2968- >>> check_store(doc)
2969-
2970-
2971- And an invalid value:
2972-
2973- >>> doc = {
2974- ... '_id': 'NZXXMYLDOV2F6ZTUO5PWM5DX',
2975- ... 'ver': 0,
2976- ... 'type': 'dmedia/file',
2977- ... 'time': 1234567890,
2978- ... 'dispatch': 'filestore',
2979- ... 'copies': 2,
2980- ... }
2981- ...
2982- >>> check_store(doc)
2983- Traceback (most recent call last):
2984- ...
2985- ValueError: doc['plugin'] does not exist
2986+ ... 'type': 'dmedia/store',
2987+ ... 'time': 1234567890,
2988+ ... 'plugin': 'filestore.local',
2989+ ... 'copies': 1,
2990+ ... }
2991+ ...
2992+ >>> check_store(doc)
2993
2994 """
2995- check_dmedia(doc)
2996-
2997- _check(doc, ['plugin'], basestring,
2998- (_is_in, 'filestore', 'removable_filestore', 'ubuntuone', 's3'),
2999- )
3000-
3001+ # Common schema:
3002+ _check(doc, [], dict)
3003+ _check(doc, ['_id'], None,
3004+ _random_id,
3005+ )
3006+ _check(doc, ['ver'], int,
3007+ (_equals, 0),
3008+ )
3009+ _check(doc, ['type'], str,
3010+ (_equals, 'dmedia/store'),
3011+ )
3012+ _check(doc, ['time'], (int, float),
3013+ (_at_least, 0),
3014+ )
3015+
3016+ # Specific to dmedia/store
3017+ _check(doc, ['plugin'], str,
3018+ (_is_in, 'filestore.local', 'filestore.removable', 'ubuntuone', 's3'),
3019+ )
3020 _check(doc, ['copies'], int,
3021 (_at_least, 0),
3022 )
3023@@ -1020,7 +891,7 @@
3024
3025 3. have 'uuid', 'fs', 'drive_id' as ``str`` strings.
3026
3027- 4. have 'label', 'partition_label' as ``unicode`` strings.
3028+ 4. have 'label', 'partition_label' as ``str`` strings.
3029
 3030 5. have 'size' as an ``int``.
3031
3032@@ -1036,8 +907,8 @@
3033 ... 'size': 1073741824,
3034 ... 'uuid': '45e8f250-b56a-11e0-aff2-0800200c9a66',
3035 ... 'fs': 'ext4',
3036- ... 'label': u'Data',
3037- ... 'partition_label': u'',
3038+ ... 'label': 'Data',
3039+ ... 'partition_label': '',
3040 ... 'drive_id': 'XBBXAIVUK4LPXJMAKCT4TEM2RDGK7HNG'
3041 ... }
3042 ...
3043@@ -1053,8 +924,8 @@
3044 ... 'time': 1234567890,
3045 ... 'size': 1073741824,
3046 ... 'uuid': '45e8f250-b56a-11e0-aff2-0800200c9a66',
3047- ... 'label': u'Data',
3048- ... 'partition_label': u'',
3049+ ... 'label': 'Data',
3050+ ... 'partition_label': '',
3051 ... 'drive_id': 'XBBXAIVUK4LPXJMAKCT4TEM2RDGK7HNG'
3052 ... }
3053 ...
3054@@ -1069,14 +940,14 @@
3055 _check_types(
3056 doc,
3057 (['uuid'], str),
3058- (['size'], int, long),
3059- (['label'], unicode),
3060- (['partition_label'], unicode),
3061+ (['size'], int),
3062+ (['label'], str),
3063+ (['partition_label'], str),
3064 (['fs'], str),
3065 (['drive_id'], str)
3066 )
3067
3068- _base32(doc['drive_id'], _label('drive_id'))
3069+ _any_id(doc['drive_id'], _label('drive_id'))
3070
3071
3072 def check_drive(doc):
3073@@ -1092,7 +963,7 @@
3074
3075 3. have 'serial', 'wwn', 'revision' as ``str`` strings.
3076
3077- 4. have 'vendor', 'model' as ``unicode`` strings.
3078+ 4. have 'vendor', 'model' as ``str`` strings.
3079
3080 For example, a conforming value:
3081
3082@@ -1104,8 +975,8 @@
3083 ... 'serial': 'A0000001B900',
3084 ... 'wwn': '50014ee0016eb572',
3085 ... 'revision': '1.95',
3086- ... 'vendor': u'Canon',
3087- ... 'model': u'EOS 7D'
3088+ ... 'vendor': 'Canon',
3089+ ... 'model': 'EOS 7D'
3090 ... }
3091 ...
3092 >>> check_drive(doc)
3093@@ -1121,7 +992,7 @@
3094 ... 'serial': 'A0000001B900',
3095 ... 'wwn': '50014ee0016eb572',
3096 ... 'revision': '1.95',
3097- ... 'vendor': u'Canon'
3098+ ... 'vendor': 'Canon'
3099 ... }
3100 ...
3101 >>> check_drive(doc)
3102@@ -1136,8 +1007,8 @@
3103 doc,
3104 (['serial'], str),
3105 (['wwn'], str),
3106- (['vendor'], unicode),
3107- (['model'], unicode),
3108+ (['vendor'], str),
3109+ (['model'], str),
3110 (['revision'], str)
3111 )
3112
3113@@ -1145,36 +1016,22 @@
3114 #######################################################
3115 # Functions for creating specific types of dmedia docs:
3116
3117-def create_file(_id, file_size, leaf_hashes, stored, ext=None, origin='user'):
3118+def create_file(_id, file_size, leaf_hashes, stored, origin='user'):
3119 """
3120 Create a minimal 'dmedia/file' document.
3121-
3122- :param _id: the content hash, eg ``'JK47OD6N5JYFGEIFB53LX7XPUSYCWDUM'``
3123- :param file_size: an ``int``, the file size in bytes, eg ``20202333``
3124- :param leaf_hashes: a ``bytes`` instance containing the concatenated content
3125- hashes of the leaves
3126- :param stored: a ``dict`` containing locations this file is stored
3127- ``'Y4J3WQCMKV5GHATOCZZBHF4Y'``
3128- :param ext: the file extension, eg ``'mov'``; default is ``None``
3129- :param origin: the file's origin (for durability/reclamation purposes);
3130- default is ``'user'``
3131 """
3132- ts = time.time()
3133- for value in stored.values():
3134- value['time'] = ts
3135 return {
3136 '_id': _id,
3137 '_attachments': {
3138- 'leaves': {
3139- 'data': b64encode(leaf_hashes),
3140+ 'leaf_hashes': {
3141+ 'data': b64encode(leaf_hashes).decode('utf-8'),
3142 'content_type': 'application/octet-stream',
3143 }
3144 },
3145 'ver': 0,
3146 'type': 'dmedia/file',
3147- 'time': ts,
3148+ 'time': time.time(),
3149 'bytes': file_size,
3150- 'ext': ext,
3151 'origin': origin,
3152 'stored': stored,
3153 }
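For example, pairing `create_file()` with a `filestore.ContentHash` as `import_iter()` does (store ID and mtime illustrative):

    stored = {'MZZG2ZDSOQVSW2TEMVZG643F': {'copies': 1, 'mtime': 1316689318}}
    doc = create_file(ch.id, ch.file_size, ch.leaf_hashes, stored)
    assert doc['_id'] == ch.id
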
3154@@ -1190,34 +1047,39 @@
3155 'type': 'dmedia/machine',
3156 'time': time.time(),
3157 'hostname': socket.gethostname(),
3158- 'distribution': list(platform.linux_distribution()),
3159 }
3160
3161
3162 def create_store(parentdir, machine_id, copies=1):
3163 """
3164- Create a 'dmedia/store' document.
3165+ Create a 'dmedia/store' doc for a FileStore on a non-removable drive.
3166 """
3167- # FIXME: We're going to have have the drive and partition information passed
3168- # to schema.py "from the outside" as to abstract whether the info comes from
3169- # udisks or the equivalent on other platforms.
3170- #try:
3171- # makedirs(parentdir)
3172- #except:
3173- # pass
3174- #p = Device(path=parentdir)
3175- #uuid = str(p['IdUuid'])
3176 return {
3177 '_id': random_id(),
3178 'ver': 0,
3179 'type': 'dmedia/store',
3180 'time': time.time(),
3181- 'plugin': 'filestore',
3182- 'copies': copies,
3183- 'path': parentdir,
3184+ 'plugin': 'filestore.local',
3185+ 'parentdir': parentdir,
3186 'machine_id': machine_id,
3187- #'partition_id': b32encode(sha1(uuid).digest())
3188- }
3189+ 'copies': copies,
3190+ }
3191+
3192+
3193+def create_removable_store(copies=1, **kw):
3194+ """
3195+ Create a 'dmedia/store' document.
3196+ """
3197+ doc = {
3198+ '_id': random_id(),
3199+ 'ver': 0,
3200+ 'type': 'dmedia/store',
3201+ 'time': time.time(),
3202+ 'plugin': 'filestore.removable',
3203+ 'copies': copies,
3204+ }
3205+ doc.update(kw)
3206+ return doc
3207
3208
3209 def create_s3_store(bucket, copies=2, use_ext=True):
3210@@ -1249,41 +1111,36 @@
3211 'imports': [],
3212 'errors': [],
3213 'stats': {
3214- 'considered': {'count': 0, 'bytes': 0},
3215- 'imported': {'count': 0, 'bytes': 0},
3216- 'skipped': {'count': 0, 'bytes': 0},
3217+ 'total': {'count': 0, 'bytes': 0},
3218+ 'new': {'count': 0, 'bytes': 0},
3219+ 'duplicate': {'count': 0, 'bytes': 0},
3220 'empty': {'count': 0, 'bytes': 0},
3221- 'error': {'count': 0, 'bytes': 0},
3222- }
3223+ },
3224 }
3225
3226
3227-def create_import(base, partition_id, batch_id=None, machine_id=None):
3228+def create_import(basedir, machine_id, **kw):
3229 """
3230 Create initial 'dmedia/import' accounting document.
3231 """
3232- return {
3233+ doc = {
3234 '_id': random_id(),
3235 'ver': 0,
3236 'type': 'dmedia/import',
3237 'time': time.time(),
3238- 'batch_id': batch_id,
3239+ 'basedir': basedir,
3240 'machine_id': machine_id,
3241- 'partition_id': partition_id,
3242- 'base': base,
3243- 'log': {
3244- 'imported': [],
3245- 'skipped': [],
3246- 'empty': [],
3247- 'error': [],
3248- },
3249+ 'files': {},
3250+ 'import_order': [],
3251 'stats': {
3252- 'imported': {'count': 0, 'bytes': 0},
3253- 'skipped': {'count': 0, 'bytes': 0},
3254+ 'total': {'count': 0, 'bytes': 0},
3255+ 'new': {'count': 0, 'bytes': 0},
3256+ 'duplicate': {'count': 0, 'bytes': 0},
3257 'empty': {'count': 0, 'bytes': 0},
3258- 'error': {'count': 0, 'bytes': 0},
3259- }
3260+ },
3261 }
3262+ doc.update(kw)
3263+ return doc
3264
3265
3266 def create_partition(base):
3267@@ -1300,8 +1157,8 @@
3268 'time': time.time(),
3269 'uuid': uuid,
3270 'size': int(p['DeviceSize']),
3271- 'label': unicode(p['IdLabel']),
3272- 'partition_label': unicode(p['PartitionLabel']),
3273+ 'label': str(p['IdLabel']),
3274+ 'partition_label': str(p['PartitionLabel']),
3275 'fs': str(p['IdType']),
3276 'drive_id': _drive_id(d)
3277 }
3278@@ -1320,7 +1177,7 @@
3279 'time': time.time(),
3280 'serial': str(d['DriveSerial']),
3281 'wwn': str(d['DriveWwn']),
3282- 'vendor': unicode(d['DriveVendor']),
3283- 'model': unicode(d['DriveModel']),
3284+ 'vendor': str(d['DriveVendor']),
3285+ 'model': str(d['DriveModel']),
3286 'revision': str(d['DriveRevision'])
3287 }
3288
3289=== added file 'dmedia/tests/base.py'
3290--- dmedia/tests/base.py 1970-01-01 00:00:00 +0000
3291+++ dmedia/tests/base.py 2011-09-22 11:43:29 +0000
3292@@ -0,0 +1,164 @@
3293+# Authors:
3294+# Jason Gerard DeRose <jderose@novacut.com>
3295+#
3296+# dmedia: distributed media library
3297+# Copyright (C) 2011 Jason Gerard DeRose <jderose@novacut.com>
3298+#
3299+# This file is part of `dmedia`.
3300+#
3301+# `dmedia` is free software: you can redistribute it and/or modify it under the
3302+# terms of the GNU Affero General Public License as published by the Free
3303+# Software Foundation, either version 3 of the License, or (at your option) any
3304+# later version.
3305+#
3306+# `dmedia` is distributed in the hope that it will be useful, but WITHOUT ANY
3307+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
3308+# A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
3309+# details.
3310+#
3311+# You should have received a copy of the GNU Affero General Public License along
3312+# with `dmedia`. If not, see <http://www.gnu.org/licenses/>.
3313+
3314+"""
3315+Useful TestCase subclasses.
3316+"""
3317+
3318+from unittest import TestCase
3319+import os
3320+from os import path
3321+from base64 import b64decode
3323+import tempfile
3324+import shutil
3325+from random import SystemRandom
3326+
3327+from filestore import File, Leaf, ContentHash, Batch, Hasher, LEAF_SIZE
3328+from microfiber import random_id
3329+
3330+
3331+datadir = path.join(path.dirname(path.abspath(__file__)), 'data')
3332+random = SystemRandom()
3333+
3334+
3335+class SampleFilesTestCase(TestCase):
3336+ """
3337+ Base class for tests that use the files in dmedia/tests/data.
3338+
3339+ If the MVI_5751.MOV or MVI_5751.THM file isn't present, self.skipTest() is
3340+ called. This will allow us to stop shipping the 20MB video file in the
3341+ dmedia release tarballs.
3342+ """
3343+
3344+ mov = path.join(datadir, 'MVI_5751.MOV')
3345+ thm = path.join(datadir, 'MVI_5751.THM')
3346+ mov_ch = ContentHash(
3347+ 'SM3GS4DUDVXOEU2DTTTWU5HKNRK777IWNSI5UQ4ZWNQGRXAN',
3348+ 20202333,
3349+ b64decode(b''.join([
3350+ b'Ps9ZlZ5RALOGrqUbXJYJDFJaLClkGKkv4gYu2cWn',
3351+ b'dse+QPUQFn9Q6FBkhhX0hjDGHOyMnFtGdAgRY1Gc',
3352+ b'XzjjVS002vjsMkVKb4/+E7qmeGfHsBFFbYV127ux'
3353+ ]))
3354+ )
3355+ thm_ch = ContentHash(
3356+ 'MXPCFNUNPDAWHQWC5QNTPP2U5OF2J267QQVALXX6B5TRJKJB',
3357+ 27328,
3358+ b64decode(b'RwtCvXTjDrah3O23qNkobCGNF6hq7HYIB4TRx2Dh'),
3359+ )
3360+
3361+ def setUp(self):
3362+ for filename in (self.mov, self.thm):
3363+ if not path.isfile(filename):
3364+ self.skipTest('Missing file {!r}'.format(filename))
3365+
3366+
3367+def random_leaves(file_size):
3368+ index = 0
3369+ for full in range(file_size // LEAF_SIZE):
3370+ data = os.urandom(16) * (LEAF_SIZE // 16)
3371+ yield Leaf(index, data)
3372+ index += 1
3373+ partial = file_size % LEAF_SIZE
3374+ if partial:
3375+ data = os.urandom(1) * partial
3376+ yield Leaf(index, data)
3377+
3378+
3379+def random_file(tmpdir, max_size):
3380+ filename = path.join(tmpdir, random_id())
3381+ file_size = random.randint(1, max_size)
3382+ dst_fp = open(filename, 'wb')
3383+ h = Hasher()
3384+ for leaf in random_leaves(file_size):
3385+ h.hash_leaf(leaf)
3386+ dst_fp.write(leaf.data)
3387+ dst_fp.close()
3388+ st = os.stat(filename)
3389+ file = File(filename, st.st_size, st.st_mtime)
3390+ assert file.size == file_size
3391+ return (file, h.content_hash())
3392+
3393+
3394+def random_empty(tmpdir):
3395+ filename = path.join(tmpdir, random_id())
3396+ open(filename, 'wb').close()
3397+ st = os.stat(filename)
3398+ file = File(filename, st.st_size, st.st_mtime)
3399+ assert file.size == 0
3400+ return (file, None)
3401+
3402+
3403+class TempDir(object):
3404+ def __init__(self):
3405+ self.dir = tempfile.mkdtemp(prefix='unittest.')
3406+
3407+ def __del__(self):
3408+ self.rmtree()
3409+
3410+ def rmtree(self):
3411+ if self.dir is not None:
3412+ shutil.rmtree(self.dir)
3413+ self.dir = None
3414+
3415+ def join(self, *parts):
3416+ return path.join(self.dir, *parts)
3417+
3418+ def makedirs(self, *parts):
3419+ d = self.join(*parts)
3420+ if not path.exists(d):
3421+ os.makedirs(d)
3422+ assert path.isdir(d), d
3423+ return d
3424+
3425+ def touch(self, *parts):
3426+ self.makedirs(*parts[:-1])
3427+ f = self.join(*parts)
3428+ open(f, 'wb').close()
3429+ return f
3430+
3431+ def write(self, data, *parts):
3432+ self.makedirs(*parts[:-1])
3433+ f = self.join(*parts)
3434+ open(f, 'wb').write(data)
3435+ return f
3436+
3437+ def copy(self, src, *parts):
3438+ self.makedirs(*parts[:-1])
3439+ dst = self.join(*parts)
3440+ shutil.copy(src, dst)
3441+ return dst
3442+
3443+ def random_batch(self, count, empties=0, max_size=LEAF_SIZE*4):
3444+ result = list(self.random_file(max_size) for i in range(count))
3445+ result.extend(self.random_empty() for i in range(empties))
3446+ result.sort(key=lambda tup: tup[0].name)
3447+ files = tuple(file for (file, ch) in result)
3448+ batch = Batch(files, sum(f.size for f in files), len(files))
3449+ return (batch, result)
3450+
3451+ def random_file(self, max_size=LEAF_SIZE*4):
3452+ return random_file(self.dir, max_size)
3453+
3454+ def random_empty(self):
3455+ return random_empty(self.dir)
3456+
3457
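Typical use in a test looks like this (a sketch):

    tmp = TempDir()
    (batch, result) = tmp.random_batch(10, empties=2)
    assert batch.count == 12  # empties are included in the Batch
    for (file, ch) in result:
        pass  # ch is a ContentHash, or None for an empty file
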
3458=== modified file 'dmedia/tests/couch.py'
3459--- dmedia/tests/couch.py 2011-09-15 11:41:48 +0000
3460+++ dmedia/tests/couch.py 2011-09-22 11:43:29 +0000
3461@@ -30,7 +30,7 @@
3462 from subprocess import Popen
3463 import time
3464 import socket
3465-from hashlib import sha1
3466+from hashlib import sha1, md5
3467 from base64 import b32encode
3468 import shutil
3469 from copy import deepcopy
3470@@ -38,8 +38,6 @@
3471 import microfiber
3472 from microfiber import random_id
3473
3474-from .helpers import TempHome
3475-
3476
3477 SOCKET_OPTIONS = '[{recbuf, 262144}, {sndbuf, 262144}, {nodelay, true}]'
3478
3479@@ -93,7 +91,7 @@
3480
3481
3482 def random_key():
3483- return b32encode(os.urandom(10))
3484+ return b32encode(os.urandom(10)).decode('utf-8')
3485
3486
3487 def random_oauth():
3488@@ -122,11 +120,12 @@
3489
3490
3491 def random_salt():
3492- return os.urandom(16).encode('hex')
3493+ return md5(os.urandom(16)).hexdigest()
3494
3495
3496 def couch_hashed(password, salt):
3497- hexdigest = sha1(password + salt).hexdigest()
3498+ data = (password + salt).encode('utf-8')
3499+ hexdigest = sha1(data).hexdigest()
3500 return '-hashed-{},{}'.format(hexdigest, salt)
3501
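This yields the hashed form CouchDB expects, for example:

    salt = random_salt()
    hashed = couch_hashed('secret', salt)
    # hashed == '-hashed-' + sha1(('secret' + salt).encode('utf-8')).hexdigest() + ',' + salt
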
3502
3503@@ -240,13 +239,11 @@
3504 def setUp(self):
3505 self.tmpcouch = TempCouch()
3506 self.env = self.tmpcouch.bootstrap()
3507- self.home = TempHome()
3508 self.machine_id = random_id()
3509 self.env['machine_id'] = self.machine_id
3510- self.env['filestore'] = {'_id': random_id(), 'path': self.home.path}
3511
3512 def tearDown(self):
3513 self.tmpcouch.kill()
3514 self.tmpcouch = None
3515- self.home = None
3516 self.env = None
3517+ self.machine_id = None
3518
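For anyone unfamiliar with the CouchDB side: couch_hashed() above follows the CouchDB 1.x '-hashed-' password scheme, sha1 over the UTF-8 bytes of password+salt. A worked example with a made-up salt:

    from hashlib import sha1

    salt = 'aa' * 16  # 32 hex chars, same shape as random_salt() returns
    data = ('secret' + salt).encode('utf-8')
    print('-hashed-{},{}'.format(sha1(data).hexdigest(), salt))
    # -> '-hashed-<40 hex chars>,aaaaaaaa...'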
3519=== modified file 'dmedia/tests/helpers.py'
3520--- dmedia/tests/helpers.py 2011-04-06 20:56:54 +0000
3521+++ dmedia/tests/helpers.py 2011-09-22 11:43:29 +0000
3522@@ -36,19 +36,19 @@
3523 mov_hash = 'TGX33XXWU3EVHEEY5J7NBOJGKBFXLEBK'
3524 mov_size = 20202333
3525 mov_leaves = [
3526- b32decode('IXJTSUCYYFECGSG6JIB2R77CAJVJK4W3'),
3527- b32decode('MA3IAHUOKXR4TRG7CWAPOO7U4WCV5WJ4'),
3528- b32decode('FHF7KDMAGNYOVNYSYT6ZYWQLUOCTUADI'),
3529+ b32decode(b'IXJTSUCYYFECGSG6JIB2R77CAJVJK4W3'),
3530+ b32decode(b'MA3IAHUOKXR4TRG7CWAPOO7U4WCV5WJ4'),
3531+ b32decode(b'FHF7KDMAGNYOVNYSYT6ZYWQLUOCTUADI'),
3532 ]
3533 mov_att = {
3534- 'data': b64encode(''.join(mov_leaves)),
3535+ 'data': b64encode(b''.join(mov_leaves)),
3536 'content_type': 'application/octet-stream',
3537 }
3538
3539 mov_qid = 'GJ4AQP3BK3DMTXYOLKDK6CW4QIJJGVMN'
3540
3541 thm_hash = 'GKZMOPVZILR43MZCXLVYP7T62XGBT7BQ'
3542-thm_leaves = [b32decode('F6ATTKI6YVWVRBQQESAZ4DSUXQ4G457A')]
3543+thm_leaves = [b32decode(b'F6ATTKI6YVWVRBQQESAZ4DSUXQ4G457A')]
3544 thm_qid = 'EYCDXXCNDB6OIIX5DN74J7KEXLNCQD5M'
3545
3546
3547@@ -64,28 +64,25 @@
3548 return (src1, src2, dup1)
3549
3550
3551-class ExceptionNotRaised(StandardError):
3552+class ExceptionNotRaised(Exception):
3553 """
3554 Raised when an expected exception is not raised.
3555 """
3556
3557 def __init__(self, expected):
3558 self.expected = expected
3559- StandardError.__init__(self, 'expected %s' % expected.__name__)
3560+ Exception.__init__(self, 'expected %s' % expected.__name__)
3561
3562
3563 def raises(exception, callback, *args, **kw):
3564 """
3565 Test that ``exception`` is raised when ``callback`` is called.
3566 """
3567- raised = False
3568 try:
3569 callback(*args, **kw)
3570- except exception, e:
3571- raised = True
3572- if not raised:
3573- raise ExceptionNotRaised(exception)
3574- return e
3575+ except exception as e:
3576+ return e
3577+ raise ExceptionNotRaised(exception)
3578
3579
3580 class TempDir(object):
3581
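The ported raises() now returns the exception instance directly, so typical use looks like this (illustrative only; new tests mostly use assertRaises as a context manager instead):

    def divide(a, b):
        return a / b

    e = raises(ZeroDivisionError, divide, 1, 0)
    print(e)  # division by zero

    # and it fails loudly when the expected exception never fires:
    raises(ValueError, divide, 4, 2)  # raises ExceptionNotRaised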
3582=== added file 'dmedia/tests/test_client.py'
3583--- dmedia/tests/test_client.py 1970-01-01 00:00:00 +0000
3584+++ dmedia/tests/test_client.py 2011-09-22 11:43:29 +0000
3585@@ -0,0 +1,259 @@
3585+# dmedia: distributed media library
3587+# Copyright (C) 2011 Novacut Inc
3588+#
3589+# This file is part of `dmedia`.
3590+#
3591+# `dmedia` is free software: you can redistribute it and/or modify it under
3592+# the terms of the GNU Affero General Public License as published by the Free
3593+# Software Foundation, either version 3 of the License, or (at your option) any
3594+# later version.
3595+#
3596+# `dmedia` is distributed in the hope that it will be useful, but WITHOUT ANY
3597+# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
3598+# A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
3599+# details.
3600+#
3601+# You should have received a copy of the GNU Affero General Public License along
3602+# with `dmedia`. If not, see <http://www.gnu.org/licenses/>.
3603+#
3604+# Authors:
3605+# Jason Gerard DeRose <jderose@novacut.com>
3606+
3607+"""
3608+Unit tests for `dmedia.client`.
3609+"""
3610+
3611+from unittest import TestCase
3612+import os
3613+from http.client import HTTPConnection, HTTPSConnection
3614+
3615+from microfiber import random_id
3616+from filestore import ContentHash, TYPE_ERROR, DIGEST_BYTES
3617+
3618+from dmedia import client
3619+
3620+
3621+class FakeResponse:
3622+ def __init__(self, status, reason):
3623+ self.status = status
3624+ self.reason = reason
3625+ self._data = os.urandom(16)
3626+
3627+ def read(self):
3628+ return self._data
3629+
3630+
3631+class TestErrors(TestCase):
3632+ def test_errors(self):
3633+ self.assertEqual(
3634+ client.errors,
3635+ {
3636+ 400: client.BadRequest,
3637+ 401: client.Unauthorized,
3638+ 403: client.Forbidden,
3639+ 404: client.NotFound,
3640+ 405: client.MethodNotAllowed,
3641+ 406: client.NotAcceptable,
3642+ 409: client.Conflict,
3643+ 412: client.PreconditionFailed,
3644+ 415: client.BadContentType,
3645+ 416: client.BadRangeRequest,
3646+ 417: client.ExpectationFailed,
3647+ }
3648+ )
3649+ method = 'MOST'
3650+ path = '/restful?and=awesome'
3651+ for (status, klass) in client.errors.items():
3652+ reason = random_id()
3653+ r = FakeResponse(status, reason)
3654+ inst = klass(r, method, path)
3655+ self.assertIs(inst.response, r)
3656+ self.assertEqual(inst.method, method)
3657+ self.assertEqual(inst.path, path)
3658+ self.assertEqual(inst.data, r._data)
3659+ self.assertEqual(
3660+ str(inst),
3661+ '{} {}: {} {}'.format(status, reason, method, path)
3662+ )
3663+
3664+
3665+class TestFunctions(TestCase):
3666+ def test_http_conn(self):
3667+ f = client.http_conn
3668+
3669+ # Test with bad scheme
3670+ with self.assertRaises(ValueError) as cm:
3671+ (conn, t) = f('ftp://foo.s3.amazonaws.com/')
3672+ self.assertEqual(
3673+ str(cm.exception),
3674+ "url scheme must be http or https: 'ftp://foo.s3.amazonaws.com/'"
3675+ )
3676+
3677+ # Test with bad url
3678+ with self.assertRaises(ValueError) as cm:
3679+ (inst, t) = f('http:foo.s3.amazonaws.com/')
3680+ self.assertEqual(
3681+ str(cm.exception),
3682+ "bad url: 'http:foo.s3.amazonaws.com/'"
3683+ )
3684+
3685+ # Test with HTTP
3686+ (conn, t) = f('http://foo.s3.amazonaws.com/')
3687+ self.assertIsInstance(conn, HTTPConnection)
3688+ self.assertNotIsInstance(conn, HTTPSConnection)
3689+ self.assertEqual(t, ('http', 'foo.s3.amazonaws.com', '/', '', '', ''))
3690+
3691+ # Test with HTTPS
3692+ (conn, t) = f('https://foo.s3.amazonaws.com/')
3693+ self.assertIsInstance(conn, HTTPSConnection)
3694+ self.assertEqual(t, ('https', 'foo.s3.amazonaws.com', '/', '', '', ''))
3695+
3696+ def test_bytes_range(self):
3697+ f = client.bytes_range
3698+ self.assertEqual(f(0, 500), 'bytes=0-499')
3699+ self.assertEqual(f(500, 1000), 'bytes=500-999')
3700+ self.assertEqual(f(-500), 'bytes=-500')
3701+ self.assertEqual(f(9500), 'bytes=9500-')
3702+
3703+ def test_check_slice(self):
3704+ ch = ContentHash('foo', None, (1, 2, 3))
3705+
3706+ # Test all valid slices
3707+ client.check_slice(ch, 0, None)
3708+ client.check_slice(ch, 1, None)
3709+ client.check_slice(ch, 2, None)
3710+ client.check_slice(ch, 0, 1)
3711+ client.check_slice(ch, 0, 2)
3712+ client.check_slice(ch, 1, 2)
3713+ client.check_slice(ch, 0, 3)
3714+ client.check_slice(ch, 1, 3)
3715+ client.check_slice(ch, 2, 3)
3716+
3717+ # ch type
3718+ with self.assertRaises(TypeError) as cm:
3719+ bad = ('foo', None, (1, 2, 3))
3720+ client.check_slice(bad, 1, None)
3721+ self.assertEqual(
3722+ str(cm.exception),
3723+ TYPE_ERROR.format('ch', ContentHash, tuple, bad)
3724+ )
3725+
3726+ # ch.leaf_hashes type
3727+ with self.assertRaises(TypeError) as cm:
3728+ bad = ContentHash('foo', None, os.urandom(DIGEST_BYTES))
3729+ client.check_slice(bad, 1, None)
3730+ self.assertEqual(
3731+ str(cm.exception),
3732+ 'ch.leaf_hashes not unpacked for ch.id=foo'
3733+ )
3734+
3735+ # empty ch.leaf_hashes
3736+ with self.assertRaises(ValueError) as cm:
3737+ bad = ContentHash('foo', None, tuple())
3738+ client.check_slice(bad, 1, None)
3739+ self.assertEqual(
3740+ str(cm.exception),
3741+ 'got empty ch.leaf_hashes for ch.id=foo'
3742+ )
3743+
3744+ # start type
3745+ with self.assertRaises(TypeError) as cm:
3746+ client.check_slice(ch, 0.0, None)
3747+ self.assertEqual(
3748+ str(cm.exception),
3749+ TYPE_ERROR.format('start', int, float, 0.0)
3750+ )
3751+
3752+ # stop type
3753+ with self.assertRaises(TypeError) as cm:
3754+ client.check_slice(ch, 0, 1.0)
3755+ self.assertEqual(
3756+ str(cm.exception),
3757+ TYPE_ERROR.format('stop', int, float, 1.0)
3758+ )
3759+
3760+ # start value
3761+ with self.assertRaises(ValueError) as cm:
3762+ client.check_slice(ch, -1, None)
3763+ self.assertEqual(
3764+ str(cm.exception),
3765+ 'Need 0 <= start < 3; got start=-1'
3766+ )
3767+ with self.assertRaises(ValueError) as cm:
3768+ client.check_slice(ch, 3, None)
3769+ self.assertEqual(
3770+ str(cm.exception),
3771+ 'Need 0 <= start < 3; got start=3'
3772+ )
3773+
3774+ # stop value
3775+ with self.assertRaises(ValueError) as cm:
3776+ client.check_slice(ch, 0, 0)
3777+ self.assertEqual(
3778+ str(cm.exception),
3779+ 'Need 1 <= stop <= 3; got stop=0'
3780+ )
3781+ with self.assertRaises(ValueError) as cm:
3782+ client.check_slice(ch, 0, 4)
3783+ self.assertEqual(
3784+ str(cm.exception),
3785+ 'Need 1 <= stop <= 3; got stop=4'
3786+ )
3787+
3788+ # start < stop
3789+ with self.assertRaises(ValueError) as cm:
3790+ client.check_slice(ch, 2, 1)
3791+ self.assertEqual(
3792+ str(cm.exception),
3793+ 'Need start < stop; got start=2, stop=1'
3794+ )
3795+ with self.assertRaises(ValueError) as cm:
3796+ client.check_slice(ch, 1, 1)
3797+ self.assertEqual(
3798+ str(cm.exception),
3799+ 'Need start < stop; got start=1, stop=1'
3800+ )
3801+
3802+
3803+class TestHTTPClient(TestCase):
3804+ def test_init(self):
3805+ bad = 'sftp://localhost:5984/'
3806+ with self.assertRaises(ValueError) as cm:
3807+ inst = client.HTTPClient(bad)
3808+ self.assertEqual(
3809+ str(cm.exception),
3810+ 'url scheme must be http or https: {!r}'.format(bad)
3811+ )
3812+ bad = 'http:localhost:5984/foo/bar'
3813+ with self.assertRaises(ValueError) as cm:
3814+ inst = client.HTTPClient(bad)
3815+ self.assertEqual(
3816+ str(cm.exception),
3817+ 'bad url: {!r}'.format(bad)
3818+ )
3819+
3820+ inst = client.HTTPClient('https://localhost:5984/couch?foo=bar/')
3821+ self.assertEqual(inst.url, 'https://localhost:5984/couch/')
3822+ self.assertEqual(inst.basepath, '/couch/')
3823+ self.assertIsInstance(inst.conn, HTTPSConnection)
3824+
3825+ inst = client.HTTPClient('http://localhost:5984?/')
3826+ self.assertEqual(inst.url, 'http://localhost:5984/')
3827+ self.assertEqual(inst.basepath, '/')
3828+ self.assertIsInstance(inst.conn, HTTPConnection)
3829+
3830+ inst = client.HTTPClient('http://localhost:5001/')
3831+ self.assertEqual(inst.url, 'http://localhost:5001/')
3832+ self.assertIsInstance(inst.conn, HTTPConnection)
3833+
3834+ inst = client.HTTPClient('http://localhost:5002')
3835+ self.assertEqual(inst.url, 'http://localhost:5002/')
3836+ self.assertIsInstance(inst.conn, HTTPConnection)
3837+
3838+ inst = client.HTTPClient('https://localhost:5003/')
3839+ self.assertEqual(inst.url, 'https://localhost:5003/')
3840+ self.assertIsInstance(inst.conn, HTTPSConnection)
3841+
3842+ inst = client.HTTPClient('https://localhost:5004')
3843+ self.assertEqual(inst.url, 'https://localhost:5004/')
3844+ self.assertIsInstance(inst.conn, HTTPSConnection)
3845
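The check_slice() tests above pin down the validation rules completely; pieced together from the expected error messages, the function has to behave roughly like this (a reconstruction for readers, not the actual dmedia/client.py source):

    from filestore import ContentHash, TYPE_ERROR

    def check_slice(ch, start, stop):
        if not isinstance(ch, ContentHash):
            raise TypeError(TYPE_ERROR.format('ch', ContentHash, type(ch), ch))
        if not isinstance(ch.leaf_hashes, tuple):
            raise TypeError(
                'ch.leaf_hashes not unpacked for ch.id={}'.format(ch.id)
            )
        if not ch.leaf_hashes:
            raise ValueError(
                'got empty ch.leaf_hashes for ch.id={}'.format(ch.id)
            )
        if not isinstance(start, int):
            raise TypeError(TYPE_ERROR.format('start', int, type(start), start))
        if not (stop is None or isinstance(stop, int)):
            raise TypeError(TYPE_ERROR.format('stop', int, type(stop), stop))
        count = len(ch.leaf_hashes)
        if not (0 <= start < count):
            raise ValueError(
                'Need 0 <= start < {}; got start={}'.format(count, start)
            )
        if stop is not None:
            if not (1 <= stop <= count):
                raise ValueError(
                    'Need 1 <= stop <= {}; got stop={}'.format(count, stop)
                )
            if start >= stop:
                raise ValueError(
                    'Need start < stop; got start={}, stop={}'.format(start, stop)
                )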
3846=== modified file 'dmedia/tests/test_core.py'
3847--- dmedia/tests/test_core.py 2011-09-16 05:03:51 +0000
3848+++ dmedia/tests/test_core.py 2011-09-22 11:43:29 +0000
3849@@ -30,18 +30,27 @@
3850 from os import path
3851
3852 import microfiber
3853+from filestore import FileStore, DIGEST_BYTES
3854
3855 from dmedia.schema import random_id, check_store
3856-from dmedia.filestore import FileStore
3857 from dmedia import core
3858
3859-from .helpers import TempDir, mov_hash, sample_mov
3860+from .helpers import TempHome
3861 from .couch import CouchCase
3862+from .base import TempDir
3863
3864
3865 class TestCore(CouchCase):
3866 klass = core.Core
3867
3868+ def setUp(self):
3869+ super().setUp()
3870+ self.home = TempHome()
3871+
3872+ def tearDown(self):
3873+ super().tearDown()
3874+ self.home = None
3875+
3876 def test_init(self):
3877 inst = self.klass(self.env)
3878 self.assertIs(inst.env, self.env)
3879@@ -84,7 +93,6 @@
3880 'type',
3881 'time',
3882 'hostname',
3883- 'distribution',
3884 ])
3885 )
3886 self.assertEqual(machine, inst.db.get(local['machine']['_id']))
3887@@ -142,16 +150,16 @@
3888 'time',
3889 'plugin',
3890 'copies',
3891- 'path',
3892+ 'parentdir',
3893 'machine_id',
3894 #'partition_id',
3895 ])
3896 )
3897 self.assertEqual(lstore['ver'], 0)
3898 self.assertEqual(lstore['type'], 'dmedia/store')
3899- self.assertEqual(lstore['plugin'], 'filestore')
3900+ self.assertEqual(lstore['plugin'], 'filestore.local')
3901 self.assertEqual(lstore['copies'], 1)
3902- self.assertEqual(lstore['path'], self.home.path)
3903+ self.assertEqual(lstore['parentdir'], self.home.path)
3904 self.assertEqual(lstore['machine_id'], inst.machine_id)
3905
3906 store = inst.db.get(_id)
3907@@ -198,12 +206,12 @@
3908 self.assertEqual(set(inst._filestores), set([okay]))
3909 fs = inst._filestores[okay]
3910 self.assertIsInstance(fs, FileStore)
3911- self.assertEqual(fs.parent, okay)
3912+ self.assertEqual(fs.parentdir, okay)
3913
3914 # Test the doc
3915 check_store(store)
3916 self.assertEqual(inst.db.get(store['_id']), store)
3917- self.assertEqual(store['path'], okay)
3918+ self.assertEqual(store['parentdir'], okay)
3919 self.assertTrue(store.pop('_rev').startswith('1-'))
3920 self.assertEqual(list(inst.local['filestores']), [okay])
3921 self.assertEqual(inst.local['filestores'][okay], store)
3922@@ -217,26 +225,25 @@
3923 self.assertEqual(inst.db.get('_local/dmedia')['_rev'], '0-1')
3924
3925 def test_get_file(self):
3926+ src = TempDir()
3927+ (file, ch) = src.random_file()
3928+
3929 inst = self.klass(self.env)
3930- doc = {
3931- '_id': mov_hash,
3932- 'ext': 'mov',
3933- }
3934- inst.db.save(doc)
3935- self.assertIsNone(inst.get_file(mov_hash))
3936+ self.assertIsNone(inst.get_file(ch.id))
3937
3938 tmp1 = TempDir()
3939 tmp2 = TempDir()
3940- fs1 = FileStore(tmp1.path)
3941- fs2 = FileStore(tmp2.path)
3942- inst._filestores[tmp1.path] = fs1
3943- inst._filestores[tmp2.path] = fs2
3944- self.assertIsNone(inst.get_file(mov_hash))
3945+ fs1 = FileStore(tmp1.dir)
3946+ fs2 = FileStore(tmp2.dir)
3947+ inst._filestores[tmp1.dir] = fs1
3948+ inst._filestores[tmp2.dir] = fs2
3949+ self.assertIsNone(inst.get_file(ch.id))
3950
3951- src_fp = open(sample_mov, 'rb')
3952+ src_fp = open(file.name, 'rb')
3953 fs1.import_file(src_fp)
3954- self.assertIsNone(inst.get_file(mov_hash))
3955+ self.assertEqual(inst.get_file(ch.id), fs1.path(ch.id))
3956+ fs1.remove(ch.id)
3957
3958- src_fp = open(sample_mov, 'rb')
3959- fs2.import_file(src_fp, 'mov')
3960- self.assertEqual(inst.get_file(mov_hash), fs2.path(mov_hash, 'mov'))
3961+ src_fp = open(file.name, 'rb')
3962+ fs2.import_file(src_fp)
3963+ self.assertEqual(inst.get_file(ch.id), fs2.path(ch.id))
3964
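The reworked test_get_file() nails down what Core.get_file() must do: return the full path from whichever FileStore actually holds the file, or None. Something along these lines (a sketch of the expected behavior, not the actual core.py):

    from os import path

    def get_file(self, _id):
        # ask each registered FileStore; fs.path() only builds the
        # filename, so check that the file really exists on disk
        for fs in self._filestores.values():
            filename = fs.path(_id)
            if path.isfile(filename):
                return filename
        return None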
3965=== modified file 'dmedia/tests/test_extractor.py'
3966--- dmedia/tests/test_extractor.py 2011-08-29 02:00:26 +0000
3967+++ dmedia/tests/test_extractor.py 2011-09-22 11:43:29 +0000
3968@@ -23,231 +23,231 @@
3969 Unit tests for `dmedia.extractor` module.
3970 """
3971
3972-from unittest import TestCase
3973 import base64
3974 from os import path
3975-import Image
3976-from .helpers import sample_mov, sample_thm, TempDir
3977+
3978+from .base import TempDir, SampleFilesTestCase
3979+
3980 from dmedia import extractor
3981
3982 # Known EXIF data as returned by exiftool:
3983 sample_thm_exif = {
3984- u'AddOriginalDecisionData': u'Off',
3985- u'AEBAutoCancel': u'On',
3986- u'AEBBracketValue': 0,
3987- u'AEBSequence': u'0,-,+',
3988- u'AFAssistBeam': u'Emits',
3989- u'AFMicroAdjActive': u'No',
3990- u'AFMicroadjustment': u'Disable; 0; 0; 0; 0',
3991- u'AFMicroAdjValue': 0,
3992- u'AFOnAELockButtonSwitch': u'Disable',
3993- u'AFPointAreaExpansion': u'Disable',
3994- u'AFPointSelectionMethod': u'Normal',
3995- u'Aperture': 11.0,
3996- u'ApertureValue': 11.300000000000001,
3997- u'Artist': u'',
3998- u'AssignFuncButton': u'LCD brightness',
3999- u'AutoExposureBracketing': u'Off',
4000- u'AutoISO': 100,
4001- u'AutoLightingOptimizer': u'Disable',
4002- u'BaseISO': 100,
4003- u'BlackMaskBottomBorder': 0,
4004- u'BlackMaskLeftBorder': 0,
4005- u'BlackMaskRightBorder': 0,
4006- u'BlackMaskTopBorder': 0,
4007- u'BracketMode': u'Off',
4008- u'BracketShotNumber': 0,
4009- u'BracketValue': 0,
4010- u'BulbDuration': 0,
4011- u'CameraType': u'EOS High-end',
4012- u'CanonExposureMode': u'Manual',
4013- u'CanonFirmwareVersion': u'Firmware Version 2.0.7',
4014- u'CanonFlashMode': u'Off',
4015- u'CanonImageSize': u'1920x1080 Movie',
4016- u'CanonImageType': u'MVI:Canon EOS 5D Mark II',
4017- u'CanonModelID': u'EOS 5D Mark II',
4018- u'CircleOfConfusion': u'0.031 mm',
4019- u'ColorComponents': 3,
4020- u'ColorSpace': u'sRGB',
4021- u'ColorTemperature': 3600,
4022- u'ColorTone': u'Normal',
4023- u'ComponentsConfiguration': u'Y, Cb, Cr, -',
4024- u'ContinuousDrive': u'Movie',
4025- u'Contrast': -4,
4026- u'ControlMode': u'Camera Local Control',
4027- u'Copyright': u'',
4028- u'CreateDate': u'2010:10:19 20:43:14',
4029- u'CustomRendered': u'Normal',
4030- u'DateTimeOriginal': u'2010:10:19 20:43:14',
4031- u'DialDirectionTvAv': u'Normal',
4032- u'DigitalGain': 0,
4033- u'DigitalZoom': u'None',
4034- #u'Directory': u'dmedia/tests/data',
4035- u'DriveMode': u'Continuous Shooting',
4036- u'EasyMode': u'Manual',
4037- u'EncodingProcess': u'Baseline DCT, Huffman coding',
4038- #u'ExifByteOrder': u'Little-endian (Intel, II)',
4039- u'ExifImageHeight': 120,
4040- u'ExifImageWidth': 160,
4041- #u'ExifToolVersion': 8.1500000000000004,
4042- u'ExifVersion': u'0221',
4043- u'ExposureCompensation': 0,
4044- u'ExposureLevelIncrements': u'1/3 Stop',
4045- u'ExposureMode': u'Auto',
4046- u'ExposureProgram': u'Manual',
4047- u'ExposureTime': u'1/100',
4048- #u'FileModifyDate': u'2010:10:19 20:43:18-06:00',
4049- #u'FileName': u'MVI_5751.THM',
4050- #u'FilePermissions': u'rw-r--r--',
4051- #u'FileSize': u'27 kB',
4052- #u'FileType': u'JPEG',
4053- u'FlashActivity': 0,
4054- u'FlashBits': u'(none)',
4055- u'FlashExposureComp': 0, u'SequenceNumber': 0,
4056- u'FlashExposureLock': u'Off',
4057- u'FlashGuideNumber': 0,
4058- u'FlashpixVersion': u'0100',
4059- u'FlashSyncSpeedAv': u'Auto',
4060- u'Flash': u'Off, Did not fire',
4061- u'FNumber': 11.0,
4062- u'FocalLength35efl': u'138.0 mm (35 mm equivalent: 134.7 mm)',
4063- u'FocalLength': u'138.0 mm',
4064- u'FocalPlaneResolutionUnit': u'inches',
4065- u'FocalPlaneXResolution': 109.6641535,
4066- u'FocalPlaneYResolution': 125.26096029999999,
4067- u'FocalUnits': u'1/mm',
4068- u'FocusingScreen': u'Eg-D',
4069- u'FocusMode': u'Manual Focus (3)',
4070- u'FocusRange': u'Not Known',
4071- u'FOV': u'15.2 deg',
4072- u'GPSVersionID': u'2.2.0.0',
4073- u'HighISONoiseReduction': u'Standard',
4074- u'HighlightTonePriority': u'Disable',
4075- u'HyperfocalDistance': u'56.23 m',
4076- u'ImageHeight': 120,
4077- u'ImageSize': u'160x120',
4078- u'ImageWidth': 160,
4079- u'InternalSerialNumber': u'',
4080- u'InteropIndex': u'THM - DCF thumbnail file',
4081- u'InteropVersion': u'0100',
4082- u'ISO': 100,
4083- u'ISOExpansion': u'Off',
4084- u'ISOSpeedIncrements': u'1/3 Stop',
4085- u'Lens35efl': u'70.0 - 200.0 mm (35 mm equivalent: 68.3 - 195.2 mm)',
4086- u'LensAFStopButton': u'AF stop',
4087- u'LensDriveNoAF': u'Focus search on',
4088- u'LensID': u'Canon EF 70-200mm f/4L IS',
4089- u'LensModel': u'EF70-200mm f/4L IS USM',
4090- u'LensType': u'Canon EF 70-200mm f/4L IS',
4091- u'Lens': u'70.0 - 200.0 mm',
4092- u'LightValue': 13.6,
4093- u'LiveViewShooting': u'On',
4094- u'LongExposureNoiseReduction2': u'Off',
4095- u'LongExposureNoiseReduction': u'Off',
4096- u'LongFocal': u'200 mm',
4097- u'MacroMode': u'Normal',
4098- u'Make': u'Canon',
4099- u'ManualFlashOutput': u'n/a',
4100- u'MaxAperture': 4,
4101- u'MeasuredEV': 12.5,
4102- u'MeasuredEV2': 13,
4103- u'MeteringMode': u'Center-weighted average',
4104- #u'MIMEType': u'image/jpeg',
4105- u'MinAperture': 32,
4106- u'MirrorLockup': u'Disable',
4107- u'Model': u'Canon EOS 5D Mark II',
4108- u'ModifyDate': u'2010:10:19 20:43:14',
4109- u'NDFilter': u'n/a',
4110- u'OpticalZoomCode': u'n/a',
4111- u'Orientation': u'Horizontal (normal)',
4112- u'OwnerName': u'',
4113- u'PictureStyle': u'User Def. 1',
4114- u'Quality': u'Unknown (-1)',
4115- u'RawJpgSize': u'Large',
4116- u'RecordMode': u'Video',
4117- u'RelatedImageHeight': 1080,
4118- u'RelatedImageWidth': 1920,
4119- u'ResolutionUnit': u'inches',
4120- u'SafetyShift': u'Disable',
4121- u'Saturation': u'Normal',
4122- u'ScaleFactor35efl': 1.0,
4123- u'SceneCaptureType': u'Standard',
4124- u'SelfTimer': u'Off',
4125- u'SensorBlueLevel': 0,
4126- u'SensorBottomBorder': 3799,
4127- u'SensorHeight': 3804,
4128- u'SensorLeftBorder': 168,
4129- u'SensorRedLevel': 0,
4130- u'SensorRightBorder': 5783,
4131- u'SensorTopBorder': 56,
4132- u'SensorWidth': 5792,
4133- u'SerialNumberFormat': u'Format 2',
4134- u'SerialNumber': u'0820500998',
4135- u'SetButtonWhenShooting': u'Normal (disabled)',
4136- u'Sharpness': 3,
4137- u'SharpnessFrequency': u'n/a',
4138- u'ShootingMode': u'Manual',
4139- u'ShortFocal': u'70 mm',
4140- u'ShutterButtonAFOnButton': u'Metering + AF start',
4141- u'ShutterSpeed': u'1/100',
4142- u'ShutterSpeedValue': u'1/99',
4143- u'SlowShutter': u'None',
4144- #u'SourceFile': u'dmedia/tests/data/MVI_5751.THM',
4145- u'SubSecCreateDate': u'2010:10:19 20:43:14.68',
4146- u'SubSecDateTimeOriginal': u'2010:10:19 20:43:14.68',
4147- u'SubSecModifyDate': u'2010:10:19 20:43:14.68',
4148- u'SubSecTime': 68,
4149- u'SubSecTimeDigitized': 68,
4150- u'SubSecTimeOriginal': 68,
4151- u'SuperimposedDisplay': u'On',
4152- u'TargetAperture': 11,
4153- u'TargetExposureTime': u'1/102',
4154- u'ThumbnailImageValidArea': u'0 159 15 104',
4155- u'ToneCurve': u'Standard',
4156- u'UserComment': u'',
4157- u'VRDOffset': 0,
4158- #u'Warning': u'Invalid CanonAFInfo2 data', Not present under Oneiric
4159- u'WBBracketMode': u'Off',
4160- u'WBBracketValueAB': 0,
4161- u'WBBracketValueGM': 0,
4162- u'WBShiftAB': 0,
4163- u'WBShiftGM': 0,
4164- u'WhiteBalanceBlue': 0,
4165- u'WhiteBalanceRed': 0,
4166- u'WhiteBalance': u'Daylight',
4167- u'XResolution': 72,
4168- u'YCbCrPositioning': u'Co-sited',
4169- u'YCbCrSubSampling': u'YCbCr4:2:2 (2 1)',
4170- u'YResolution': 72,
4171- u'ZoomSourceWidth': 0,
4172- u'ZoomTargetWidth': 0,
4173- u'BitsPerSample': 8,
4174+ 'AddOriginalDecisionData': 'Off',
4175+ 'AEBAutoCancel': 'On',
4176+ 'AEBBracketValue': 0,
4177+ 'AEBSequence': '0,-,+',
4178+ 'AFAssistBeam': 'Emits',
4179+ 'AFMicroAdjActive': 'No',
4180+ 'AFMicroadjustment': 'Disable; 0; 0; 0; 0',
4181+ 'AFMicroAdjValue': 0,
4182+ 'AFOnAELockButtonSwitch': 'Disable',
4183+ 'AFPointAreaExpansion': 'Disable',
4184+ 'AFPointSelectionMethod': 'Normal',
4185+ 'Aperture': 11.0,
4186+ 'ApertureValue': 11.300000000000001,
4187+ 'Artist': '',
4188+ 'AssignFuncButton': 'LCD brightness',
4189+ 'AutoExposureBracketing': 'Off',
4190+ 'AutoISO': 100,
4191+ 'AutoLightingOptimizer': 'Disable',
4192+ 'BaseISO': 100,
4193+ 'BlackMaskBottomBorder': 0,
4194+ 'BlackMaskLeftBorder': 0,
4195+ 'BlackMaskRightBorder': 0,
4196+ 'BlackMaskTopBorder': 0,
4197+ 'BracketMode': 'Off',
4198+ 'BracketShotNumber': 0,
4199+ 'BracketValue': 0,
4200+ 'BulbDuration': 0,
4201+ 'CameraType': 'EOS High-end',
4202+ 'CanonExposureMode': 'Manual',
4203+ 'CanonFirmwareVersion': 'Firmware Version 2.0.7',
4204+ 'CanonFlashMode': 'Off',
4205+ 'CanonImageSize': '1920x1080 Movie',
4206+ 'CanonImageType': 'MVI:Canon EOS 5D Mark II',
4207+ 'CanonModelID': 'EOS 5D Mark II',
4208+ 'CircleOfConfusion': '0.031 mm',
4209+ 'ColorComponents': 3,
4210+ 'ColorSpace': 'sRGB',
4211+ 'ColorTemperature': 3600,
4212+ 'ColorTone': 'Normal',
4213+ 'ComponentsConfiguration': 'Y, Cb, Cr, -',
4214+ 'ContinuousDrive': 'Movie',
4215+ 'Contrast': -4,
4216+ 'ControlMode': 'Camera Local Control',
4217+ 'Copyright': '',
4218+ 'CreateDate': '2010:10:19 20:43:14',
4219+ 'CustomRendered': 'Normal',
4220+ 'DateTimeOriginal': '2010:10:19 20:43:14',
4221+ 'DialDirectionTvAv': 'Normal',
4222+ 'DigitalGain': 0,
4223+ 'DigitalZoom': 'None',
4224+ #'Directory': 'dmedia/tests/data',
4225+ 'DriveMode': 'Continuous Shooting',
4226+ 'EasyMode': 'Manual',
4227+ 'EncodingProcess': 'Baseline DCT, Huffman coding',
4228+ #'ExifByteOrder': 'Little-endian (Intel, II)',
4229+ 'ExifImageHeight': 120,
4230+ 'ExifImageWidth': 160,
4231+ #'ExifToolVersion': 8.1500000000000004,
4232+ 'ExifVersion': '0221',
4233+ 'ExposureCompensation': 0,
4234+ 'ExposureLevelIncrements': '1/3 Stop',
4235+ 'ExposureMode': 'Auto',
4236+ 'ExposureProgram': 'Manual',
4237+ 'ExposureTime': '1/100',
4238+ #'FileModifyDate': '2010:10:19 20:43:18-06:00',
4239+ #'FileName': 'MVI_5751.THM',
4240+ #'FilePermissions': 'rw-r--r--',
4241+ #'FileSize': '27 kB',
4242+ #'FileType': 'JPEG',
4243+ 'FlashActivity': 0,
4244+ 'FlashBits': '(none)',
4245+ 'FlashExposureComp': 0, 'SequenceNumber': 0,
4246+ 'FlashExposureLock': 'Off',
4247+ 'FlashGuideNumber': 0,
4248+ 'FlashpixVersion': '0100',
4249+ 'FlashSyncSpeedAv': 'Auto',
4250+ 'Flash': 'Off, Did not fire',
4251+ 'FNumber': 11.0,
4252+ 'FocalLength35efl': '138.0 mm (35 mm equivalent: 134.7 mm)',
4253+ 'FocalLength': '138.0 mm',
4254+ 'FocalPlaneResolutionUnit': 'inches',
4255+ 'FocalPlaneXResolution': 109.6641535,
4256+ 'FocalPlaneYResolution': 125.26096029999999,
4257+ 'FocalUnits': '1/mm',
4258+ 'FocusingScreen': 'Eg-D',
4259+ 'FocusMode': 'Manual Focus (3)',
4260+ 'FocusRange': 'Not Known',
4261+ 'FOV': '15.2 deg',
4262+ 'GPSVersionID': '2.2.0.0',
4263+ 'HighISONoiseReduction': 'Standard',
4264+ 'HighlightTonePriority': 'Disable',
4265+ 'HyperfocalDistance': '56.23 m',
4266+ 'ImageHeight': 120,
4267+ 'ImageSize': '160x120',
4268+ 'ImageWidth': 160,
4269+ 'InternalSerialNumber': '',
4270+ 'InteropIndex': 'THM - DCF thumbnail file',
4271+ 'InteropVersion': '0100',
4272+ 'ISO': 100,
4273+ 'ISOExpansion': 'Off',
4274+ 'ISOSpeedIncrements': '1/3 Stop',
4275+ 'Lens35efl': '70.0 - 200.0 mm (35 mm equivalent: 68.3 - 195.2 mm)',
4276+ 'LensAFStopButton': 'AF stop',
4277+ 'LensDriveNoAF': 'Focus search on',
4278+ 'LensID': 'Canon EF 70-200mm f/4L IS',
4279+ 'LensModel': 'EF70-200mm f/4L IS USM',
4280+ 'LensType': 'Canon EF 70-200mm f/4L IS',
4281+ 'Lens': '70.0 - 200.0 mm',
4282+ 'LightValue': 13.6,
4283+ 'LiveViewShooting': 'On',
4284+ 'LongExposureNoiseReduction2': 'Off',
4285+ 'LongExposureNoiseReduction': 'Off',
4286+ 'LongFocal': '200 mm',
4287+ 'MacroMode': 'Normal',
4288+ 'Make': 'Canon',
4289+ 'ManualFlashOutput': 'n/a',
4290+ 'MaxAperture': 4,
4291+ 'MeasuredEV': 12.5,
4292+ 'MeasuredEV2': 13,
4293+ 'MeteringMode': 'Center-weighted average',
4294+ #'MIMEType': 'image/jpeg',
4295+ 'MinAperture': 32,
4296+ 'MirrorLockup': 'Disable',
4297+ 'Model': 'Canon EOS 5D Mark II',
4298+ 'ModifyDate': '2010:10:19 20:43:14',
4299+ 'NDFilter': 'n/a',
4300+ 'OpticalZoomCode': 'n/a',
4301+ 'Orientation': 'Horizontal (normal)',
4302+ 'OwnerName': '',
4303+ 'PictureStyle': 'User Def. 1',
4304+ 'Quality': 'Unknown (-1)',
4305+ 'RawJpgSize': 'Large',
4306+ 'RecordMode': 'Video',
4307+ 'RelatedImageHeight': 1080,
4308+ 'RelatedImageWidth': 1920,
4309+ 'ResolutionUnit': 'inches',
4310+ 'SafetyShift': 'Disable',
4311+ 'Saturation': 'Normal',
4312+ 'ScaleFactor35efl': 1.0,
4313+ 'SceneCaptureType': 'Standard',
4314+ 'SelfTimer': 'Off',
4315+ 'SensorBlueLevel': 0,
4316+ 'SensorBottomBorder': 3799,
4317+ 'SensorHeight': 3804,
4318+ 'SensorLeftBorder': 168,
4319+ 'SensorRedLevel': 0,
4320+ 'SensorRightBorder': 5783,
4321+ 'SensorTopBorder': 56,
4322+ 'SensorWidth': 5792,
4323+ 'SerialNumberFormat': 'Format 2',
4324+ 'SerialNumber': '0820500998',
4325+ 'SetButtonWhenShooting': 'Normal (disabled)',
4326+ 'Sharpness': 3,
4327+ 'SharpnessFrequency': 'n/a',
4328+ 'ShootingMode': 'Manual',
4329+ 'ShortFocal': '70 mm',
4330+ 'ShutterButtonAFOnButton': 'Metering + AF start',
4331+ 'ShutterSpeed': '1/100',
4332+ 'ShutterSpeedValue': '1/99',
4333+ 'SlowShutter': 'None',
4334+ #'SourceFile': 'dmedia/tests/data/MVI_5751.THM',
4335+ 'SubSecCreateDate': '2010:10:19 20:43:14.68',
4336+ 'SubSecDateTimeOriginal': '2010:10:19 20:43:14.68',
4337+ 'SubSecModifyDate': '2010:10:19 20:43:14.68',
4338+ 'SubSecTime': 68,
4339+ 'SubSecTimeDigitized': 68,
4340+ 'SubSecTimeOriginal': 68,
4341+ 'SuperimposedDisplay': 'On',
4342+ 'TargetAperture': 11,
4343+ 'TargetExposureTime': '1/102',
4344+ 'ThumbnailImageValidArea': '0 159 15 104',
4345+ 'ToneCurve': 'Standard',
4346+ 'UserComment': '',
4347+ 'VRDOffset': 0,
4348+ #'Warning': 'Invalid CanonAFInfo2 data', Not present under Oneiric
4349+ 'WBBracketMode': 'Off',
4350+ 'WBBracketValueAB': 0,
4351+ 'WBBracketValueGM': 0,
4352+ 'WBShiftAB': 0,
4353+ 'WBShiftGM': 0,
4354+ 'WhiteBalanceBlue': 0,
4355+ 'WhiteBalanceRed': 0,
4356+ 'WhiteBalance': 'Daylight',
4357+ 'XResolution': 72,
4358+ 'YCbCrPositioning': 'Co-sited',
4359+ 'YCbCrSubSampling': 'YCbCr4:2:2 (2 1)',
4360+ 'YResolution': 72,
4361+ 'ZoomSourceWidth': 0,
4362+ 'ZoomTargetWidth': 0,
4363+ 'BitsPerSample': 8,
4364 }
4365
4366 # These values are new running on Oneiric
4367 sample_thm_exif2 = {
4368- u'CropLeftMargin': 24,
4369- u'CropRightMargin': 24,
4370- u'CropTopMargin': 16,
4371- u'CropBottomMargin': 16,
4372-
4373- u'CroppedImageWidth': 2784,
4374- u'CroppedImageHeight': 1856,
4375-
4376- u'VideoCodec': u'avc1',
4377-
4378- u'AudioBitrate': u'1.54 Mbps',
4379- u'CustomPictureStyleFileName': u'superflat01',
4380- u'Duration': u'3.00 s',
4381- u'FrameRate': 29.97,
4382-
4383- u'AudioChannels': 2,
4384- u'AudioSampleRate': 48000,
4385- u'CameraTemperature': u'30 C',
4386-
4387- u'AspectRatio': u'3:2',
4388-
4389- u'FrameCount': 107,
4390+ 'CropLeftMargin': 24,
4391+ 'CropRightMargin': 24,
4392+ 'CropTopMargin': 16,
4393+ 'CropBottomMargin': 16,
4394+
4395+ 'CroppedImageWidth': 2784,
4396+ 'CroppedImageHeight': 1856,
4397+
4398+ 'VideoCodec': 'avc1',
4399+
4400+ 'AudioBitrate': '1.54 Mbps',
4401+ 'CustomPictureStyleFileName': 'superflat01',
4402+ 'Duration': '3.00 s',
4403+ 'FrameRate': 29.97,
4404+
4405+ 'AudioChannels': 2,
4406+ 'AudioSampleRate': 48000,
4407+ 'CameraTemperature': '30 C',
4408+
4409+ 'AspectRatio': '3:2',
4410+
4411+ 'FrameCount': 107,
4412 }
4413
4414 sample_thm_exif.update(sample_thm_exif2)
4415@@ -268,21 +268,21 @@
4416 }
4417
4418
4419-class test_functions(TestCase):
4420+class TestFunctions(SampleFilesTestCase):
4421
4422 def test_file_2_base64(self):
4423 f = extractor.file_2_base64
4424 tmp = TempDir()
4425- src = tmp.write('Hello naughty nurse!', 'sample.txt')
4426+ src = tmp.write(b'Hello naughty nurse!', 'sample.txt')
4427 self.assertEqual(
4428 base64.b64decode(f(src)),
4429- 'Hello naughty nurse!'
4430+ b'Hello naughty nurse!'
4431 )
4432
4433
4434 def test_extract_exif(self):
4435 f = extractor.extract_exif
4436- exif = f(sample_thm)
4437+ exif = f(self.thm)
4438 self.assertEqual(set(sample_thm_exif), set(exif))
4439 for key in sample_thm_exif:
4440 v1 = sample_thm_exif[key]
4441@@ -292,20 +292,16 @@
4442
4443 # Test that error is returned for invalid file:
4444 tmp = TempDir()
4445- data = 'Foo Bar\n' * 1000
4446+ data = b'Foo Bar\n' * 1000
4447 jpg = tmp.write(data, 'sample.jpg')
4448 self.assertEqual(
4449 f(jpg),
4450- {u'Error': u'File format error'}
4451+ {'Error': 'File format error'}
4452 )
4453
4454 # Test with non-existent file:
4455 nope = tmp.join('nope.jpg')
4456- self.assertEqual(
4457- f(nope),
4458- {u'Error': u'ValueError: No JSON object could be decoded'}
4459- )
4460-
4461+ self.assertEqual(f(nope), {})
4462
4463 def test_parse_subsec_datetime(self):
4464 f = extractor.parse_subsec_datetime
4465@@ -365,11 +361,11 @@
4466 tmp = TempDir()
4467
4468 # Test with sample_mov from 5D Mark II:
4469- info = f(sample_mov)
4470+ info = f(self.mov)
4471 self.assertEqual(sample_mov_info, info)
4472
4473 # Test invalid file:
4474- invalid = tmp.write('Wont work!', 'invalid.mov')
4475+ invalid = tmp.write(b'Wont work!', 'invalid.mov')
4476 self.assertEqual(
4477 f(invalid),
4478 {
4479@@ -394,18 +390,14 @@
4480 tmp = TempDir()
4481
4482 # Test with sample_mov from 5D Mark II:
4483- d = f(sample_mov)
4484+ d = f(self.mov)
4485 self.assertTrue(isinstance(d, dict))
4486 self.assertEqual(sorted(d), ['content_type', 'data'])
4487 self.assertEqual(d['content_type'], 'image/jpeg')
4488 data = base64.b64decode(d['data'])
4489- jpg = tmp.write(data, 'thumbnail.jpg')
4490- img = Image.open(jpg)
4491- self.assertEqual(img.size, (192, 108))
4492- self.assertEqual(img.format, 'JPEG')
4493
4494 # Test invalid file:
4495- invalid = tmp.write('Wont work!', 'invalid.mov')
4496+ invalid = tmp.write(b'Wont work!', 'invalid.mov')
4497 self.assertEqual(f(invalid), None)
4498
4499 # Test with non-existent file:
4500@@ -418,7 +410,7 @@
4501 tmp = TempDir()
4502
4503 doc = dict(ext='mov')
4504- f(sample_mov, doc)
4505+ f(self.mov, doc)
4506
4507 # Check canon.thm attachment
4508 att = doc.pop('_attachments')
4509@@ -427,7 +419,7 @@
4510 self.assertEqual(att['canon.thm']['content_type'], 'image/jpeg')
4511 self.assertEqual(
4512 base64.b64decode(att['canon.thm']['data']),
4513- open(sample_thm, 'r').read()
4514+ open(self.thm, 'rb').read()
4515 )
4516
4517 # Check thumbnail
4518@@ -436,10 +428,6 @@
4519 self.assertEqual(sorted(thm), ['content_type', 'data'])
4520 self.assertEqual(thm['content_type'], 'image/jpeg')
4521 data = base64.b64decode(thm['data'])
4522- jpg = tmp.write(data, 'thumbnail.jpg')
4523- img = Image.open(jpg)
4524- self.assertEqual(img.size, (192, 108))
4525- self.assertEqual(img.format, 'JPEG')
4526
4527 self.assertEqual(
4528 doc,
4529@@ -456,32 +444,32 @@
4530 fps=30,
4531 channels='Stereo',
4532 iso=100,
4533- shutter=u'1/100',
4534+ shutter='1/100',
4535 aperture=11.0,
4536- lens=u'Canon EF 70-200mm f/4L IS',
4537- camera=u'Canon EOS 5D Mark II',
4538- camera_serial=u'0820500998',
4539- focal_length=u'138.0 mm',
4540+ lens='Canon EF 70-200mm f/4L IS',
4541+ camera='Canon EOS 5D Mark II',
4542+ camera_serial='0820500998',
4543+ focal_length='138.0 mm',
4544 ),
4545 )
4546 )
4547
4548 def test_merge_exif(self):
4549 f = extractor.merge_exif
4550- self.assertTrue(sample_thm.endswith('.THM'))
4551+ self.assertTrue(self.thm.endswith('.THM'))
4552 attachments = {}
4553 self.assertEqual(
4554- dict(f(sample_thm, attachments)),
4555+ dict(f(self.thm, attachments)),
4556 dict(
4557 width=160,
4558 height=120,
4559 iso=100,
4560- shutter=u'1/100',
4561+ shutter='1/100',
4562 aperture=11.0,
4563- lens=u'Canon EF 70-200mm f/4L IS',
4564- camera=u'Canon EOS 5D Mark II',
4565- camera_serial=u'0820500998',
4566- focal_length=u'138.0 mm',
4567+ lens='Canon EF 70-200mm f/4L IS',
4568+ camera='Canon EOS 5D Mark II',
4569+ camera_serial='0820500998',
4570+ focal_length='138.0 mm',
4571 mtime=1287520994 + 68 / 100.0,
4572 ),
4573 )
4574@@ -493,7 +481,7 @@
4575 tmp = TempDir()
4576
4577 att = {}
4578- merged = dict(f(sample_mov, att))
4579+ merged = dict(f(self.mov, att))
4580
4581 # Check canon.thm attachment
4582 self.assertEqual(set(att), set(['thumbnail', 'canon.thm']))
4583@@ -501,7 +489,7 @@
4584 self.assertEqual(att['canon.thm']['content_type'], 'image/jpeg')
4585 self.assertEqual(
4586 base64.b64decode(att['canon.thm']['data']),
4587- open(sample_thm, 'r').read()
4588+ open(self.thm, 'rb').read()
4589 )
4590
4591 # Check thumbnail
4592@@ -510,10 +498,6 @@
4593 self.assertEqual(sorted(thm), ['content_type', 'data'])
4594 self.assertEqual(thm['content_type'], 'image/jpeg')
4595 data = base64.b64decode(thm['data'])
4596- jpg = tmp.write(data, 'thumbnail.jpg')
4597- img = Image.open(jpg)
4598- self.assertEqual(img.size, (192, 108))
4599- self.assertEqual(img.format, 'JPEG')
4600
4601 self.assertEqual(
4602 merged,
4603@@ -527,19 +511,19 @@
4604 fps=30,
4605 channels='Stereo',
4606 iso=100,
4607- shutter=u'1/100',
4608+ shutter='1/100',
4609 aperture=11.0,
4610- lens=u'Canon EF 70-200mm f/4L IS',
4611- camera=u'Canon EOS 5D Mark II',
4612- camera_serial=u'0820500998',
4613- focal_length=u'138.0 mm',
4614+ lens='Canon EF 70-200mm f/4L IS',
4615+ camera='Canon EOS 5D Mark II',
4616+ camera_serial='0820500998',
4617+ focal_length='138.0 mm',
4618 mtime=1287520994 + 68 / 100.0,
4619 )
4620 )
4621
4622 # Test invalid file:
4623- invalid_mov = tmp.write('Wont work!', 'invalid.mov')
4624- invalid_thm = tmp.write('Wont work either!', 'invalid.thm')
4625+ invalid_mov = tmp.write(b'Wont work!', 'invalid.mov')
4626+ invalid_thm = tmp.write(b'Wont work either!', 'invalid.thm')
4627 att = {}
4628 merged = dict(f(invalid_mov, att))
4629 self.assertEqual(merged, {})
4630
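Most of the churn in this file is the Python 3 bytes/str split: TempDir.write() opens its target in 'wb', so every sample payload must now be bytes, and files read back for comparison must be opened in 'rb'. For example:

    tmp = TempDir()
    tmp.write(b'Foo Bar\n' * 1000, 'sample.jpg')      # bytes in
    data = open(tmp.join('sample.jpg'), 'rb').read()  # bytes out
    assert isinstance(data, bytes)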
4631=== modified file 'dmedia/tests/test_filestore.py'
4632--- dmedia/tests/test_filestore.py 2011-06-15 04:46:55 +0000
4633+++ dmedia/tests/test_filestore.py 2011-09-22 11:43:29 +0000
4634@@ -21,1323 +21,35 @@
4635 # with `dmedia`. If not, see <http://www.gnu.org/licenses/>.
4636
4637 """
4638-Unit tests for `dmedia.filestore` module.
4639+Unit tests for external filestore, dmedia style.
4640 """
4641
4642-import os
4643-from os import path
4644-import stat
4645-from hashlib import sha1
4646-from base64 import b32encode, b32decode
4647-import shutil
4648-import json
4649-from unittest import TestCase
4650-from .helpers import TempDir, TempHome, raises
4651-from .helpers import sample_mov, sample_thm
4652-from .helpers import mov_hash, mov_leaves, mov_qid
4653-from .helpers import thm_hash, thm_leaves, thm_qid
4654-from dmedia.errors import AmbiguousPath, FileStoreTraversal
4655-from dmedia.errors import DuplicateFile, IntegrityError
4656-from dmedia.filestore import HashList
4657-from dmedia import filestore, constants
4658-from dmedia.constants import TYPE_ERROR, EXT_PAT, LEAF_SIZE
4659-
4660-
4661-class test_functions(TestCase):
4662- def test_safe_path(self):
4663- f = filestore.safe_path
4664-
4665- # Test with relative path:
4666- e = raises(AmbiguousPath, f, 'foo/bar')
4667- self.assertEqual(e.pathname, 'foo/bar')
4668- self.assertEqual(e.abspath, path.abspath('foo/bar'))
4669-
4670- # Test with path traversal:
4671- e = raises(AmbiguousPath, f, '/foo/bar/../../root')
4672- self.assertEqual(e.pathname, '/foo/bar/../../root')
4673- self.assertEqual(e.abspath, '/root')
4674-
4675- # Test with normalized absolute path:
4676- self.assertEqual(f('/home/jderose/.dmedia'), '/home/jderose/.dmedia')
4677-
4678- def test_safe_open(self):
4679- f = filestore.safe_open
4680- tmp = TempDir()
4681- filename = tmp.touch('example.mov')
4682-
4683- # Test that AmbiguousPath is raised:
4684- e = raises(AmbiguousPath, f, 'foo/bar', 'rb')
4685- self.assertEqual(e.pathname, 'foo/bar')
4686- self.assertEqual(e.abspath, path.abspath('foo/bar'))
4687-
4688- e = raises(AmbiguousPath, f, '/foo/bar/../../root', 'rb')
4689- self.assertEqual(e.pathname, '/foo/bar/../../root')
4690- self.assertEqual(e.abspath, '/root')
4691-
4692- # Test with absolute normalized path:
4693- fp = f(filename, 'rb')
4694- self.assertTrue(isinstance(fp, file))
4695- self.assertEqual(fp.name, filename)
4696- self.assertEqual(fp.mode, 'rb')
4697-
4698- def test_safe_ext(self):
4699- f = filestore.safe_ext
4700-
4701- # Test with wrong type
4702- e = raises(TypeError, f, 42)
4703- self.assertEqual(
4704- str(e),
4705- TYPE_ERROR % ('ext', basestring, int, 42)
4706- )
4707-
4708- # Test with invalid case:
4709- bad = 'ogV'
4710- e = raises(ValueError, f, bad)
4711- self.assertEqual(
4712- str(e),
4713- 'ext %r does not match pattern %r' % (bad, EXT_PAT)
4714- )
4715-
4716- # Test with invalid charaters:
4717- bad = '$home'
4718- e = raises(ValueError, f, bad)
4719- self.assertEqual(
4720- str(e),
4721- 'ext %r does not match pattern %r' % (bad, EXT_PAT)
4722- )
4723-
4724- # Test with path traversal:
4725- bad = '/../../../.ssh/id_pub'
4726- e = raises(ValueError, f, bad)
4727- self.assertEqual(
4728- str(e),
4729- 'ext %r does not match pattern %r' % (bad, EXT_PAT)
4730- )
4731-
4732- # Test with a good ext:
4733- good = 'wav'
4734- assert f(good) is good
4735- good = 'cr2'
4736- assert f(good) is good
4737- good = 'tar.gz'
4738- assert f(good) is good
4739-
4740- def test_safe_b32(self):
4741- f = filestore.safe_b32
4742-
4743- # Test with wrong type
4744- e = raises(TypeError, f, 42)
4745- self.assertEqual(
4746- str(e),
4747- TYPE_ERROR % ('b32', basestring, int, 42)
4748- )
4749-
4750- # Test with invalid base32 encoding:
4751- bad = 'NWBNVXVK5DQGIOW7MYR4K3KA5K22W7N'
4752- e = raises(ValueError, f, bad)
4753- self.assertEqual(
4754- str(e),
4755- 'b32: cannot b32decode %r: Incorrect padding' % bad
4756- )
4757-
4758- # Test with wrong length:
4759- bad = 'NWBNVXVK5DQGIOW7MYR4K3KA'
4760- e = raises(ValueError, f, bad)
4761- self.assertEqual(
4762- str(e),
4763- 'len(b32) must be 32; got 24: %r' % bad
4764- )
4765-
4766- # Test with a good chash:
4767- good = 'NWBNVXVK5DQGIOW7MYR4K3KA5K22W7NW'
4768- assert f(good) is good
4769-
4770- def test_tophash(self):
4771- f = filestore.tophash
4772- h = f(31415)
4773- self.assertEqual(
4774- h.digest(),
4775- sha1(b'dmedia/tophash 31415').digest()
4776- )
4777- l = ''.join(mov_leaves)
4778- h.update(l)
4779- self.assertEqual(
4780- h.digest(),
4781- sha1(b'dmedia/tophash 31415' + l).digest()
4782- )
4783-
4784- def test_leafhash(self):
4785- f = filestore.leafhash
4786- l = ''.join(mov_leaves)
4787-
4788- h = f(1079991, 0)
4789- self.assertEqual(
4790- h.digest(),
4791- sha1(b'dmedia/leafhash 1079991 0').digest()
4792- )
4793- h.update(l)
4794- self.assertEqual(
4795- h.digest(),
4796- sha1(b'dmedia/leafhash 1079991 0' + l).digest()
4797- )
4798-
4799- h = f(1079991, 1)
4800- self.assertEqual(
4801- h.digest(),
4802- sha1(b'dmedia/leafhash 1079991 1').digest()
4803- )
4804- h.update(l)
4805- self.assertEqual(
4806- h.digest(),
4807- sha1(b'dmedia/leafhash 1079991 1' + l).digest()
4808- )
4809-
4810- def test_pack_leaves(self):
4811- f = filestore.pack_leaves
4812-
4813- a = 'a' * 20
4814- b = 'b' * 20
4815- c = 'c' * 20
4816- d = 'd' * 20
4817- self.assertEqual(f([a, b, c]), a + b + c)
4818- self.assertEqual(f([a, b, c, d]), a + b + c + d)
4819-
4820- e = raises(ValueError, f, [a, b, c], digest_bytes=25)
4821- self.assertEqual(
4822- str(e),
4823- 'digest_bytes=25, but len(leaves[0]) is 20'
4824- )
4825- e = raises(ValueError, f, [a, 'b' * 15, c])
4826- self.assertEqual(
4827- str(e),
4828- 'digest_bytes=20, but len(leaves[1]) is 15'
4829- )
4830-
4831- def test_unpack_leaves(self):
4832- f = filestore.unpack_leaves
4833-
4834- a = 'a' * 20
4835- b = 'b' * 20
4836- c = 'c' * 20
4837- d = 'd' * 20
4838- data = a + b + c + d
4839- self.assertEqual(f(data), [a, b, c, d])
4840-
4841- a = 'a' * 32
4842- b = 'b' * 32
4843- c = 'c' * 32
4844- d = 'd' * 32
4845- e = 'e' * 32
4846- data = a + b + c + d + e
4847- self.assertEqual(f(data, digest_bytes=32), [a, b, c, d, e])
4848-
4849- e = raises(ValueError, f, 'a' * 201)
4850- self.assertEqual(
4851- str(e),
4852- 'len(data)=201, not multiple of digest_bytes=20'
4853- )
4854- e = raises(ValueError, f, 'a' * 200, digest_bytes=16)
4855- self.assertEqual(
4856- str(e),
4857- 'len(data)=200, not multiple of digest_bytes=16'
4858- )
4859-
4860- def test_quick_id(self):
4861- f = filestore.quick_id
4862-
4863- # Test with fp of wrong type
4864- e = raises(TypeError, f, 'hello')
4865- self.assertEqual(
4866- str(e),
4867- TYPE_ERROR % ('fp', file, str, 'hello')
4868- )
4869-
4870- # Test with fp opened in wrong mode
4871- fp = open(sample_mov, 'r')
4872- e = raises(ValueError, f, fp)
4873- self.assertEqual(
4874- str(e),
4875- "fp: must be opened in mode 'rb'; got 'r'"
4876- )
4877-
4878- # Test with some known files/values:
4879- fp = open(sample_mov, 'rb')
4880- self.assertEqual(f(fp), 'GJ4AQP3BK3DMTXYOLKDK6CW4QIJJGVMN')
4881- self.assertFalse(fp.closed) # Should not close file
4882-
4883- fp = open(sample_thm, 'rb')
4884- self.assertEqual(f(fp), 'EYCDXXCNDB6OIIX5DN74J7KEXLNCQD5M')
4885- self.assertFalse(fp.closed) # Should not close file
4886-
4887- # Make user seek(0) is being called:
4888- fp = open(sample_mov, 'rb')
4889- fp.seek(1024)
4890- self.assertEqual(f(fp), 'GJ4AQP3BK3DMTXYOLKDK6CW4QIJJGVMN')
4891- self.assertFalse(fp.closed) # Should not close file
4892-
4893- def test_fallocate(self):
4894- f = filestore.fallocate
4895- tmp = TempDir()
4896- filename = tmp.join('example.mov')
4897-
4898- # Test when size is wrong type:
4899- e = raises(TypeError, f, '2311', filename)
4900- self.assertEqual(
4901- str(e),
4902- TYPE_ERROR % ('size', (int, long), str, '2311')
4903- )
4904-
4905- # Test when size <= 0
4906- e = raises(ValueError, f, 0, filename)
4907- self.assertEqual(str(e), 'size must be >0; got 0')
4908- e = raises(ValueError, f, -2311, filename)
4909- self.assertEqual(str(e), 'size must be >0; got -2311')
4910-
4911- # Test with relative path:
4912- e = raises(AmbiguousPath, f, 2311, 'foo/bar')
4913- self.assertEqual(e.pathname, 'foo/bar')
4914- self.assertEqual(e.abspath, path.abspath('foo/bar'))
4915-
4916- # Test with path traversal:
4917- e = raises(AmbiguousPath, f, 2311, '/foo/bar/../../root')
4918- self.assertEqual(e.pathname, '/foo/bar/../../root')
4919- self.assertEqual(e.abspath, '/root')
4920-
4921- # Test with correct args:
4922- self.assertFalse(path.exists(filename))
4923- ret = f(2311, filename)
4924- self.assertTrue(ret in [None, True, False])
4925-
4926- if ret is None:
4927- self.assertFalse(path.exists(filename))
4928-
4929- if ret is True:
4930- self.assertTrue(path.exists(filename))
4931- self.assertEqual(path.getsize(filename), 2311)
4932-
4933- if ret is False:
4934- self.assertTrue(path.exists(filename))
4935- self.assertEqual(path.getsize(filename), 0)
4936-
4937-
4938-class test_HashList(TestCase):
4939- klass = filestore.HashList
4940-
4941- def test_init(self):
4942- tmp = TempDir()
4943- src_fp = open(sample_mov, 'rb')
4944- dst_fp = open(tmp.join('test.mov'), 'wb')
4945-
4946- # Test with src_fp of wrong type
4947- e = raises(TypeError, self.klass, 'hello', dst_fp)
4948- self.assertEqual(
4949- str(e),
4950- TYPE_ERROR % ('src_fp', file, str, 'hello')
4951- )
4952-
4953- # Test with src_fp opened in wrong mode
4954- e = raises(ValueError, self.klass, open(sample_mov, 'r'), dst_fp)
4955- self.assertEqual(
4956- str(e),
4957- "src_fp: mode must be 'rb'; got 'r'"
4958- )
4959-
4960- # Test with dst_fp of wrong type
4961- e = raises(TypeError, self.klass, src_fp, 17)
4962- self.assertEqual(
4963- str(e),
4964- TYPE_ERROR % ('dst_fp', file, int, 17)
4965- )
4966-
4967- # Test with dst_fp opened in wrong mode
4968- e = raises(ValueError, self.klass, src_fp,
4969- open(tmp.join('wrong.mov'), 'w')
4970- )
4971- self.assertEqual(
4972- str(e),
4973- "dst_fp: mode must be 'wb' or 'r+b'; got 'w'"
4974- )
4975-
4976- # Test with correct values
4977- inst = self.klass(src_fp)
4978- self.assertTrue(inst.src_fp is src_fp)
4979- self.assertEqual(inst.file_size, os.fstat(src_fp.fileno()).st_size)
4980- self.assertEqual(inst.leaves, [])
4981- self.assertTrue(inst.dst_fp is None)
4982- self.assertEqual(inst.leaf_size, constants.LEAF_SIZE)
4983-
4984- inst = self.klass(src_fp, dst_fp)
4985- self.assertTrue(inst.src_fp is src_fp)
4986- self.assertTrue(inst.dst_fp is dst_fp)
4987- self.assertEqual(inst.leaf_size, constants.LEAF_SIZE)
4988-
4989- inst = self.klass(src_fp, dst_fp, 2 * constants.LEAF_SIZE)
4990- self.assertTrue(inst.src_fp is src_fp)
4991- self.assertTrue(inst.dst_fp is dst_fp)
4992- self.assertEqual(inst.leaf_size, 2 * constants.LEAF_SIZE)
4993-
4994- def test_update(self):
4995- tmp = TempDir()
4996-
4997- class Example(self.klass):
4998- def __init__(self, dst_fp=None):
4999- self.dst_fp = dst_fp
5000-
The diff has been truncated for viewing.
