Merge lp:~jderose/filestore/fadvise into lp:filestore

Proposed by Jason Gerard DeRose
Status: Merged
Merged at revision: 268
Proposed branch: lp:~jderose/filestore/fadvise
Merge into: lp:filestore
Diff against target: 208 lines (+68/-24)
2 files modified
benchmark.py (+25/-22)
filestore.py (+43/-2)
To merge this branch: bzr merge lp:~jderose/filestore/fadvise
Reviewer Review Type Date Requested Status
James Raymond Approve
Review via email: mp+132004@code.launchpad.net

Description of the change

In the future we should experiment with POSIX_FADV_WILLNEED, but for now, this simple change provides an impressive performance improvement (almost 10%).

Changes:

 * When needed, monkey patch `os` module with dummy `os.posix_fadvise()`

 * reader() now calls os.posix_fadvise() with os.POSIX_FADV_SEQUENTIAL before starting read

 * batch_reader() now calls os.posix_fadvise() with os.POSIX_FADV_SEQUENTIAL before starting each read

 * batch_import_iter() now uses a queue size of 8 instead of 16 (meaning now at most 64 MiB will be in-flight in the queue, instead of 128 MiB)

 * reader() and batch_reader() now log.exception() when an exception is caught to make debugging easier

 * benchmark.py script now uses batch_import_iter(), other misc changes

To post a comment you must log in.
lp:~jderose/filestore/fadvise updated
275. By Jason Gerard DeRose

Wow, with os.POSIX_FADV_WILLNEED, now I'm up to almost an 18% improvement, 31.2 MB/s

276. By Jason Gerard DeRose

Changed reader() to use os.POSIX_FADV_WILLNEED also

Revision history for this message
Jason Gerard DeRose (jderose) wrote :

Okay, I decided to undergo said experiment right now... and holy crap, fast.

Using os.POSIX_FADV_WILLNEED, I'm up to almost an 18% performance improvement.

31.2 MB/s over USB2, up from 26.5 MB/s.

There are a few things that need further tuning on the Dmedia side of things, but the filestore read performance is now pretty stellar (under Python3.3 anyway).

Revision history for this message
James Raymond (jamesmr) :
review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'benchmark.py'
2--- benchmark.py 2011-09-07 03:46:52 +0000
3+++ benchmark.py 2012-10-30 02:40:25 +0000
4@@ -23,6 +23,10 @@
5
6 """
7 Benchmarking utilities for in-tree tests (not installed).
8+
9+Clear page cache like this:
10+
11+ sync ; echo 3 | sudo tee /proc/sys/vm/drop_caches
12 """
13
14 import sys
15@@ -30,30 +34,32 @@
16 import time
17 import math
18
19-
20 import filestore
21
22
23-UNITS_BASE10 = (
24+BYTES10 = (
25 'bytes',
26 'kB',
27 'MB',
28 'GB',
29 'TB',
30 'PB',
31+ 'EB',
32+ 'ZB',
33+ 'YB',
34 )
35
36-def units_base10(size):
37+def bytes10(size):
38 """
39 Return *size* bytes to 3 significant digits in SI base-10 units.
40
41 For example:
42
43- >>> units_base10(1000)
44+ >>> bytes10(1000)
45 '1 kB'
46- >>> units_base10(29481537)
47+ >>> bytes10(29481537)
48 '29.5 MB'
49- >>> units_base10(392012353)
50+ >>> bytes10(392012353)
51 '392 MB'
52
53 For additional details, see:
54@@ -63,17 +69,15 @@
55 if size is None:
56 return None
57 if size < 0:
58- raise ValueError('size must be greater than zero; got %r' % size)
59- if size >= 10 ** 18:
60- raise ValueError('size must be smaller than 10**18; got %r' % size)
61+ raise ValueError('size must be >= 0; got {!r}'.format(size))
62 if size == 0:
63 return '0 bytes'
64 if size == 1:
65 return '1 byte'
66- i = int(math.floor(math.log(size, 1000)))
67- s = (size / (1000.0 ** i) if i > 0 else size)
68+ i = min(int(math.floor(math.log(size, 1000))), len(BYTES10) - 1)
69+ s = (size / (1000 ** i) if i > 0 else size)
70 return (
71- '%.*g %s' % (3, s, UNITS_BASE10[i])
72+ '{:.3g} {}'.format(s, BYTES10[i])
73 )
74
75
76@@ -88,7 +92,7 @@
77
78 def rate(self, total_size):
79 per_sec = total_size / self.elapsed
80- return '{} per second'.format(units_base10(per_sec))
81+ return '{} per second'.format(bytes10(per_sec))
82
83
84 if __name__ == '__main__':
85@@ -100,18 +104,17 @@
86 raise SystemExit(
87 'ERROR: source is not a directory: {!r}'.format(d)
88 )
89+ if len(args) > 1:
90+ dst = filestore.FileStore(path.abspath(args[1]))
91+ else:
92+ dst = filestore.TempFileStore()
93
94 print('Benchmarking {!r}'.format(d))
95 batch = filestore.scandir(d)
96 print(' {} files'.format(batch.count))
97- print(' total size: {}'.format(units_base10(batch.size)))
98-
99+ print(' total size: {}'.format(bytes10(batch.size)))
100 t = Timer()
101- for f in batch.files:
102- src_fp = open(f.name, 'rb')
103- while True:
104- data = src_fp.read(filestore.LEAF_SIZE)
105- if not data:
106- break
107- print('Finished in {}'.format(t.done()))
108+ for (file, ch) in filestore.batch_import_iter(batch, dst):
109+ pass
110+ print('Finished in {:.3f}'.format(t.done()))
111 print('Read at {}'.format(t.rate(batch.size)))
112
113=== modified file 'filestore.py'
114--- filestore.py 2012-10-19 20:01:09 +0000
115+++ filestore.py 2012-10-30 02:40:25 +0000
116@@ -140,6 +140,16 @@
117 https://launchpad.net/dmedia
118 """
119
120+# Monkey patch python3.2 with dummy os.posix_fadvise():
121+if not hasattr(os, 'posix_fadvise'):
122+ def _posix_fadvise(fd, offset, length, advice):
123+ pass
124+ os.posix_fadvise = _posix_fadvise
125+ os.POSIX_FADV_SEQUENTIAL = 2
126+ os.POSIX_FADV_WILLNEED = 3
127+ os.POSIX_FADV_DONTNEED = 4
128+
129+
130
131 ##############################
132 # The Dmedia Hashing Protocol:
133@@ -679,15 +689,31 @@
134 TYPE_ERROR.format('src_fp', IO_READABLE, type(src_fp), src_fp)
135 )
136 src_fp.seek(0)
137+ fd = src_fp.fileno()
138+ st = os.fstat(fd)
139+ if st.st_size > 0:
140+ os.posix_fadvise(fd, 0, st.st_size, os.POSIX_FADV_SEQUENTIAL)
141 index = 0
142 while True:
143 data = src_fp.read(LEAF_SIZE)
144 if not data:
145 queue.put(None)
146 break
147+ if len(data) == LEAF_SIZE:
148+ os.posix_fadvise(fd,
149+ (index + 1) * LEAF_SIZE,
150+ LEAF_SIZE,
151+ os.POSIX_FADV_WILLNEED
152+ )
153+ os.posix_fadvise(fd,
154+ index * LEAF_SIZE,
155+ len(data),
156+ os.POSIX_FADV_DONTNEED
157+ )
158 queue.put(Leaf(index, data))
159 index += 1
160 except Exception as e:
161+ log.exception('Error reading %r', src_fp.name)
162 queue.put(e)
163
164
165@@ -740,18 +766,33 @@
166 try:
167 for file in batch.files:
168 queue.put(file)
169- log.info('Opening file %r', file.name)
170+ log.info('Opening %r', file)
171 src_fp = open(file.name, 'rb')
172+ fd = src_fp.fileno()
173+ if file.size > 0:
174+ os.posix_fadvise(fd, 0, file.size, os.POSIX_FADV_SEQUENTIAL)
175 index = 0
176 while True:
177 data = src_fp.read(LEAF_SIZE)
178 if not data:
179 queue.put(EndFile)
180 break
181+ if len(data) == LEAF_SIZE:
182+ os.posix_fadvise(fd,
183+ (index + 1) * LEAF_SIZE,
184+ LEAF_SIZE,
185+ os.POSIX_FADV_WILLNEED
186+ )
187+ os.posix_fadvise(fd,
188+ index * LEAF_SIZE,
189+ len(data),
190+ os.POSIX_FADV_DONTNEED
191+ )
192 queue.put(Leaf(index, data))
193 index += 1
194 queue.put(EndBatch)
195 except Exception as e:
196+ log.exception('batch.count=%r, batch.size=%r', batch.count, batch.size)
197 queue.put(e)
198
199
200@@ -802,7 +843,7 @@
201 if callback is not None:
202 count = 0
203 size = 0
204- q = SmartQueue(16)
205+ q = SmartQueue(8)
206 thread = _start_thread(batch_reader, batch, q)
207 while True:
208 file = q.get()

Subscribers

People subscribed via source and target branches