Merge lp:~stub/launchpad/large-swift-files into lp:launchpad

Proposed by Stuart Bishop
Status: Merged
Merged at revision: 16896
Proposed branch: lp:~stub/launchpad/large-swift-files
Merge into: lp:launchpad
Prerequisite: lp:~stub/launchpad/swift-librarian-config
Diff against target: 255 lines (+128/-27)
3 files modified
lib/lp/services/librarianserver/librariangc.py (+6/-1)
lib/lp/services/librarianserver/swift.py (+67/-21)
lib/lp/services/librarianserver/tests/test_swift.py (+55/-5)
To merge this branch: bzr merge lp:~stub/launchpad/large-swift-files
Reviewer | Review Type | Date Requested | Status
William Grant | code | | Approve
Review via email: mp+198222@code.launchpad.net

Commit message

Swift large file support for the Librarian

Description of the change

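Swift requires some gymnastics from the client to store objects larger than 5GB: the file has to be uploaded as a series of segment objects (named `<object>/0000`, `<object>/0001`, ...), each no larger than the 5GB limit, followed by a zero-length manifest object whose X-Object-Manifest header points at the segments.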

To post a comment you must log in.
Revision history for this message
William Grant (wgrant) :
review: Approve (code)

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'lib/lp/services/librarianserver/librariangc.py'
2--- lib/lp/services/librarianserver/librariangc.py 2013-11-07 14:28:46 +0000
3+++ lib/lp/services/librarianserver/librariangc.py 2014-01-07 10:13:54 +0000
4@@ -769,7 +769,12 @@
5
6 for container, obj in swift_files(max_lfc_id):
7 name = obj['name']
8- content_id = int(name)
9+
10+ # We may have a segment of a large file.
11+ if '/' in name:
12+ content_id = int(name.split('/', 1)[0])
13+ else:
14+ content_id = int(name)
15
16 while (next_wanted_content_id is not None
17 and content_id > next_wanted_content_id):
18
19=== modified file 'lib/lp/services/librarianserver/swift.py'
20--- lib/lp/services/librarianserver/swift.py 2013-12-12 08:17:53 +0000
21+++ lib/lp/services/librarianserver/swift.py 2014-01-07 10:13:54 +0000
22@@ -15,6 +15,7 @@
23 import re
24 import sys
25 import time
26+import urllib
27
28 from swiftclient import client as swiftclient
29
30@@ -24,6 +25,7 @@
31
32
33 SWIFT_CONTAINER_PREFIX = 'librarian_'
34+MAX_SWIFT_OBJECT_SIZE = 5 * 1024 ** 3 # 5GB Swift limit.
35
36 ONE_DAY = 24 * 60 * 60
37
38@@ -122,7 +124,7 @@
39 log.debug(
40 "{0} already exists in Swift({1}, {2})".format(
41 lfc, container, obj_name))
42- if ('x-object-manifest' not in headers and
43+ if ('X-Object-Manifest' not in headers and
44 int(headers['content-length'])
45 != os.path.getsize(fs_path)):
46 raise AssertionError(
47@@ -130,29 +132,68 @@
48 except swiftclient.ClientException as x:
49 if x.http_status != 404:
50 raise
51- log.info(
52- 'Putting {0} into Swift ({1}, {2})'.format(
53- lfc, container, obj_name))
54- md5_stream = HashStream(open(fs_path, 'rb'))
55- db_md5_hash = ISlaveStore(LibraryFileContent).get(
56- LibraryFileContent, lfc).md5
57- swift_md5_hash = swift_connection.put_object(
58- container, obj_name, md5_stream, os.path.getsize(fs_path))
59- disk_md5_hash = md5_stream.hash.hexdigest()
60- if not (disk_md5_hash == db_md5_hash == swift_md5_hash):
61- log.error(
62- "LibraryFileContent({0}) corrupt. "
63- "disk md5={1}, db md5={2}, swift md5={3}".format(
64- lfc, disk_md5_hash, db_md5_hash, swift_md5_hash))
65- try:
66- swift_connection.delete_object(container, obj_name)
67- except Exception as x:
68- log.error('Failed to delete corrupt file from Swift')
69- raise AssertionError('md5 mismatch')
70+ log.info('Putting {0} into Swift ({1}, {2})'.format(
71+ lfc, container, obj_name))
72+ _put(log, swift_connection, lfc, container, obj_name, fs_path)
73 if remove:
74 os.unlink(fs_path)
75
76
77+def _put(log, swift_connection, lfc_id, container, obj_name, fs_path):
78+ fs_size = os.path.getsize(fs_path)
79+ fs_file = HashStream(open(fs_path, 'rb'))
80+ db_md5_hash = ISlaveStore(LibraryFileContent).get(
81+ LibraryFileContent, lfc_id).md5
82+
83+ if fs_size <= MAX_SWIFT_OBJECT_SIZE:
84+ swift_md5_hash = swift_connection.put_object(
85+ container, obj_name, fs_file, fs_size)
86+ disk_md5_hash = fs_file.hash.hexdigest()
87+ if not (disk_md5_hash == db_md5_hash == swift_md5_hash):
88+ log.error(
89+ "LibraryFileContent({0}) corrupt. "
90+ "disk md5={1}, db md5={2}, swift md5={3}".format(
91+ lfc_id, disk_md5_hash, db_md5_hash, swift_md5_hash))
92+ try:
93+ swift_connection.delete_object(container, obj_name)
94+ except Exception:
95+ log.exception('Failed to delete corrupt file from Swift')
96+ raise AssertionError('md5 mismatch')
97+ else:
98+ # Large file upload. Create the segments first, then the
99+ # manifest. This order prevents partial downloads, and lets us
100+ # detect interrupted uploads and clean up.
101+ segment = 0
102+ while fs_file.tell() < fs_size:
103+ assert segment <= 9999, 'Insane number of segments'
104+ seg_name = '%s/%04d' % (obj_name, segment)
105+ seg_size = min(fs_size - fs_file.tell(), MAX_SWIFT_OBJECT_SIZE)
106+ md5_stream = HashStream(fs_file)
107+ swift_md5_hash = swift_connection.put_object(
108+ container, seg_name, md5_stream, seg_size)
109+ segment_md5_hash = md5_stream.hash.hexdigest()
110+ assert swift_md5_hash == segment_md5_hash, (
111+ "LibraryFileContent({0}) segment {1} upload corrupted".format(
112+ lfc_id, segment))
113+ segment = segment + 1
114+
115+ disk_md5_hash = fs_file.hash.hexdigest()
116+ if disk_md5_hash != db_md5_hash:
117+ # We don't have to delete the uploaded segments, as Librarian
118+ # Garbage Collection handles this for us.
119+ log.error(
120+ "Large LibraryFileContent({0}) corrupt. "
121+ "disk md5={1}, db_md5={2}".format(
122+ lfc_id, disk_md5_hash, db_md5_hash))
123+ raise AssertionError('md5 mismatch')
124+
125+ manifest = '{0}/{1}/'.format(
126+ urllib.quote(container), urllib.quote(obj_name))
127+ manifest_headers = {'X-Object-Manifest': manifest}
128+ swift_connection.put_object(
129+ container, obj_name, '', 0, headers=manifest_headers)
130+
131+
132 def swift_location(lfc_id):
133 '''Return the (container, obj_name) used to store a file.
134
135@@ -165,7 +206,9 @@
136 # storage, as objects will no longer be found in the expected
137 # container. This value and the container prefix are deliberately
138 # hard coded to avoid cockups with values specified in config files.
139- max_objects_per_container = 1000000
140+ # While the suggested number is 'under a million', the rare large files
141+ # will take up multiple slots so we choose a more conservative number.
142+ max_objects_per_container = 500000
143
144 container_num = lfc_id // max_objects_per_container
145
146@@ -250,6 +293,9 @@
147 self.hash.update(chunk)
148 return chunk
149
150+ def tell(self):
151+ return self._stream.tell()
152+
153
154 class ConnectionPool:
155 MAX_POOL_SIZE = 10
156
157=== modified file 'lib/lp/services/librarianserver/tests/test_swift.py'
158--- lib/lp/services/librarianserver/tests/test_swift.py 2013-11-12 15:03:38 +0000
159+++ lib/lp/services/librarianserver/tests/test_swift.py 2014-01-07 10:13:54 +0000
160@@ -10,6 +10,7 @@
161 import time
162
163 from mock import patch
164+from swiftclient import client as swiftclient
165 import transaction
166
167 from lp.services.database import write_transaction
168@@ -147,8 +148,11 @@
169 data = self.librarian_client.getFileByAlias(lfa_id).read()
170 self.assertEqual(content, data)
171
172- def test_large_binary_files_from_disk(self):
173- # Generate a large blob, including null bytes for kicks.
174+ def test_largish_binary_files_from_disk(self):
175+ # Generate a largish blob, including null bytes for kicks.
176+ # A largish file is large enough that the HTTP upload needs
177+ # to be done in multiple chunks, but small enough that it is
178+ # stored in Swift as a single object.
179 size = 512 * 1024 # 512KB
180 expected_content = ''.join(chr(i % 256) for i in range(0, size))
181 lfa_id = self.add_file('hello_bigboy.xls', expected_content)
182@@ -157,9 +161,12 @@
183 lfa = self.librarian_client.getFileByAlias(lfa_id)
184 self.assertEqual(expected_content, lfa.read())
185
186- def test_large_binary_files_from_swift(self):
187+ def test_largish_binary_files_from_swift(self):
188 # Generate large blob, multiple of the chunk size.
189 # Including null bytes for kicks.
190+ # A largish file is large enough that the HTTP upload needs
191+ # to be done in multiple chunks, but small enough that it is
192+ # stored in Swift as a single object.
193 size = LibrarianStorage.CHUNK_SIZE * 50
194 self.assert_(size > 1024 * 1024)
195 expected_content = ''.join(chr(i % 256) for i in range(0, size))
196@@ -176,10 +183,12 @@
197 lfa = self.librarian_client.getFileByAlias(lfa_id)
198 self.assertEqual(expected_content, lfa.read())
199
200-
201- def test_large_binary_files_from_swift_offset(self):
202+ def test_largish_binary_files_from_swift_offset(self):
203 # Generate large blob, but NOT a multiple of the chunk size.
204 # Including null bytes for kicks.
205+ # A largish file is large enough that the HTTP upload needs
206+ # to be done in multiple chunks, but small enough that it is
207+ # stored in Swift as a single object.
208 size = LibrarianStorage.CHUNK_SIZE * 50 + 1
209 self.assert_(size > 1024 * 1024)
210 expected_content = ''.join(chr(i % 256) for i in range(0, size))
211@@ -195,3 +204,44 @@
212 lfa = self.librarian_client.getFileByAlias(lfa_id)
213 self.failIf(os.path.exists(swift.filesystem_path(lfc.id)))
214 self.assertEqual(expected_content, lfa.read())
215+
216+ def test_large_file_to_swift(self):
217+ # Generate a blob large enough that Swift requires us to store
218+ # it as multiple objects plus a manifest.
219+ size = LibrarianStorage.CHUNK_SIZE * 50
220+ self.assert_(size > 1024 * 1024)
221+ expected_content = ''.join(chr(i % 256) for i in range(0, size))
222+ lfa_id = self.add_file('hello_bigboy.xls', expected_content)
223+ lfa = IStore(LibraryFileAlias).get(LibraryFileAlias, lfa_id)
224+ lfc = lfa.content
225+
226+ # We don't really want to upload a file >5GB to our mock Swift,
227+ # so change the constant instead. Set it so we need 3 segments.
228+ def _reset_max(val):
229+ swift.MAX_SWIFT_OBJECT_SIZE = val
230+ self.addCleanup(_reset_max, swift.MAX_SWIFT_OBJECT_SIZE)
231+ swift.MAX_SWIFT_OBJECT_SIZE = int(size / 2) - 1
232+
233+ # Shove the file requiring multiple segments into Swift.
234+ swift.to_swift(BufferLogger(), remove=False)
235+
236+ # As our mock Swift does not support multi-segment files,
237+ # instead we examine it directly in Swift as best we can.
238+ swift_client = self.swift_fixture.connect()
239+
240+ # The manifest exists. Unfortunately, we can't test that the
241+ # magic manifest header is set correctly.
242+ container, name = swift.swift_location(lfc.id)
243+ headers, obj = swift_client.get_object(container, name)
244+ self.assertEqual(obj, '')
245+
246+ # The segments we expect are all in their expected locations.
247+ _, obj1 = swift_client.get_object(container, '{0}/0000'.format(name))
248+ _, obj2 = swift_client.get_object(container, '{0}/0001'.format(name))
249+ _, obj3 = swift_client.get_object(container, '{0}/0002'.format(name))
250+ self.assertRaises(
251+ swiftclient.ClientException, swift_client.get_object,
252+ container, '{0}/0003'.format(name))
253+
254+ # Our object round tripped
255+ self.assertEqual(obj1 + obj2 + obj3, expected_content)