Merge lp:~verterok/ubuntuone-client/file_shelf-is-fast into lp:ubuntuone-client

Proposed by Guillermo Gonzalez
Status: Merged
Approved by: dobey
Approved revision: 91
Merged at revision: not available
Proposed branch: lp:~verterok/ubuntuone-client/file_shelf-is-fast
Merge into: lp:ubuntuone-client
Diff against target: None lines
To merge this branch: bzr merge lp:~verterok/ubuntuone-client/file_shelf-is-fast
Reviewer        Review Type    Date Requested    Status
dobey           community                        Approve
Lucio Torre     community                        Approve
Review via email: mp+8963@code.launchpad.net

Commit message

[r=lucio.torre, r=dobey] Minimize the number of times FileShelf touches the disk to do a key lookup. Also adds a new LRUCache for the FileShelf.

Revision history for this message
Guillermo Gonzalez (verterok) wrote :

This branch minimizes the number of times FileShelf touches the disk to do a key lookup. It also includes a brand new LRUCache for the FileShelf, which should further reduce how often we hit the disk, since e.g. sync issues a lot of consecutive requests to the file shelf.
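
The idea can be sketched roughly like this (a hypothetical, simplified illustration, not the branch's actual code, which pairs a deque with refcounts; TinyLRUShelf and backing_shelf are invented names, and it assumes Python 2.7+ for collections.OrderedDict):

    # Simplified sketch: keep recently used key/value pairs in memory so
    # repeated lookups don't unpickle the same file from disk again.
    from collections import OrderedDict

    class TinyLRUShelf(object):
        """Hypothetical wrapper: a small in-memory LRU cache over a shelf."""

        def __init__(self, backing_shelf, maxsize=1500):
            self._shelf = backing_shelf   # e.g. a FileShelf-like object
            self._cache = OrderedDict()
            self._maxsize = maxsize
            self.hits = 0
            self.misses = 0

        def __getitem__(self, key):
            if key in self._cache:
                self.hits += 1
                value = self._cache.pop(key)
                self._cache[key] = value          # re-insert: most recently used
                return value
            self.misses += 1
            value = self._shelf[key]              # the only disk hit
            self._cache[key] = value
            if len(self._cache) > self._maxsize:
                self._cache.popitem(last=False)   # evict the least recently used
            return value

The branch's real CachedFileShelf/LRUCache (see the diff below) follows the same principle, but also tracks hit/miss counters and periodically compacts its access queue.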

Revision history for this message
Lucio Torre (lucio.torre) wrote :

        try:
            self.shelf['missingkey']
        except KeyError:
            self.assertEquals(self.shelf.cache_misses, 1)

Does this part of test_hit_miss_properties depend on the state of the cache somewhere else? Don't you need to add an 'else' block that raises an error?

Anyhow, big +1. I like that we now show more stats on log marks.

review: Approve
91. By Guillermo Gonzalez

fix lint warning

Revision history for this message
dobey (dobey) :
review: Approve
Revision history for this message
Guillermo Gonzalez (verterok) wrote :

>         try:
>             self.shelf['missingkey']
>         except KeyError:
>             self.assertEquals(self.shelf.cache_misses, 1)
>
> Does this part of test_hit_miss_properties depend on the state of the cache
> somewhere else? Don't you need to add an 'else' block that raises an error?

Good point!
The cache is clean, since the shelf was initialized right before the test started, but I'll add an else clause to make the test more complete.

>
> anyhow, big +1. i like that we now show more stats on log marks.

Thanks!
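
For reference, the else clause added in revision 92 would presumably look something like this (a sketch based on the discussion, not the revision's exact text):

        try:
            # the statement has a side effect: it must raise KeyError
            self.shelf['missingkey']
        except KeyError:
            self.assertEquals(self.shelf.cache_misses, 1)
        else:
            self.fail("expected KeyError for a missing key")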

92. By Guillermo Gonzalez

 add else clause to cache.misses test

Preview Diff

=== modified file 'tests/syncdaemon/test_fileshelf.py'
--- tests/syncdaemon/test_fileshelf.py 2009-07-02 20:22:51 +0000
+++ tests/syncdaemon/test_fileshelf.py 2009-07-17 20:31:26 +0000
@@ -15,57 +15,59 @@
 # You should have received a copy of the GNU General Public License along
 # with this program. If not, see <http://www.gnu.org/licenses/>.
 
-""" Test disk cache persistent shelf """
+""" Test file based persistent shelf """
 
 import os
 import hashlib
 import unittest
 import shutil
 
-from ubuntuone.syncdaemon.file_shelf import FileShelf
-
-TESTS_DIR = os.path.join(os.getcwd(), "tmp")
-
-class TestFileShelf(unittest.TestCase):
+from ubuntuone.syncdaemon.file_shelf import (
+    FileShelf,
+    CachedFileShelf,
+    LRUCache,
+    CacheInconsistencyError,
+)
+from contrib.testing import testcase
+
+
+class TestFileShelf(testcase.BaseTwistedTestCase):
     """ Test the FileShelf """
+    fileshelf_class = FileShelf
 
     def setUp(self):
         """ Sets up a test. """
-        try:
-            os.mkdir(TESTS_DIR)
-        except OSError:
-            # already there, remove it to clean and create again
-            shutil.rmtree(TESTS_DIR)
-            os.mkdir(TESTS_DIR)
-        self.path = os.path.join(TESTS_DIR, 'shelf')
-        self.shelf = FileShelf(self.path)
+        testcase.BaseTwistedTestCase.setUp(self)
+        self.path = self.mktemp('shelf')
+        self.shelf = self.fileshelf_class(self.path)
 
     def tearDown(self):
         '''Clean up the tests.'''
-        shutil.rmtree(TESTS_DIR)
+        shutil.rmtree(self.path)
+        testcase.BaseTwistedTestCase.tearDown(self)
 
     def test_bad_depth(self):
         """ test that the shelf reject invalid depth at creation time """
-        self.assertRaises(ValueError, FileShelf, TESTS_DIR, depth=-1)
+        self.assertRaises(ValueError, self.fileshelf_class, self.path, depth=-1)
 
     def test_bad_path(self):
         """ test that the shelf removes the previous shelve file and create a
         directory for the new file based shelf at creation time.
         """
-        path = os.path.join(TESTS_DIR, 'shelf_file')
+        path = os.path.join(self.path, 'shelf_file')
         open(path, 'w').close()
-        FileShelf(path)
+        self.fileshelf_class(path)
         self.assertTrue(os.path.isdir(path))
 
     def test_different_depth_sizes(self):
         """ test the basic operations (delitem, getitem, setitem) with
         depths between 0 and len(hashlib.sha1().hexdigest())
         """
-        base_path = os.path.join(TESTS_DIR, 'shelf_depth-')
+        base_path = os.path.join(self.path, 'shelf_depth-')
         sha1 = hashlib.sha1()
         for idx in xrange(0, len(sha1.hexdigest())):
             path = base_path + str(idx)
-            shelf = FileShelf(path, depth=idx)
+            shelf = self.fileshelf_class(path, depth=idx)
             key = sha1.hexdigest()
             # test __setitem__
             shelf[key] = 'foo'
@@ -85,8 +87,8 @@
 
     def test_contains(self):
         """ test that it behaves with the 'in' """
-        path = os.path.join(TESTS_DIR, 'shelf_depth')
-        shelf = FileShelf(path)
+        path = os.path.join(self.path, 'shelf_depth')
+        shelf = self.fileshelf_class(path)
         shelf["foo"] = "bar"
         self.assertTrue("foo" in shelf)
         self.assertFalse("baz" in shelf)
@@ -95,8 +97,8 @@
 
     def test_pop(self):
         """ test that it behaves with the .pop() """
-        path = os.path.join(TESTS_DIR, 'shelf_depth')
-        shelf = FileShelf(path)
+        path = os.path.join(self.path, 'shelf_depth')
+        shelf = self.fileshelf_class(path)
         shelf["foo"] = "bar"
         self.assertEqual("bar", shelf.pop("foo"))
         self.assertFalse("foo" in shelf)
@@ -106,8 +108,8 @@
 
     def test_get(self):
         """ test that it behaves with the .get(key, default) """
-        path = os.path.join(TESTS_DIR, 'shelf_get')
-        shelf = FileShelf(path)
+        path = os.path.join(self.path, 'shelf_get')
+        shelf = self.fileshelf_class(path)
         shelf["foo"] = "bar"
         self.assertEquals('bar', shelf.get('foo'))
         self.assertEquals('bar', shelf.get('foo', None))
@@ -116,8 +118,8 @@
 
     def test_items(self):
         """ test that it behaves with the .items() """
-        path = os.path.join(TESTS_DIR, 'shelf_get')
-        shelf = FileShelf(path)
+        path = os.path.join(self.path, 'shelf_get')
+        shelf = self.fileshelf_class(path)
         shelf["foo"] = "bar"
         # k, v are temp variables, pylint: disable-msg=W0631
         self.assertEquals([('foo', 'bar')],
@@ -127,6 +129,131 @@
                           [(k, v) for k, v in shelf.items()])
 
 
+class CachedFileShelfTests(TestFileShelf):
+    """TestFileShelf tests but using CachedFileShelf"""
+    fileshelf_class = CachedFileShelf
+
+    def test_hit_miss_properties(self):
+        """test the cache hits/misses properties"""
+        # yes, the statement has some effect, pylint: disable-msg=W0104
+        try:
+            self.shelf['missingkey']
+        except KeyError:
+            self.assertEquals(self.shelf.cache_misses, 1)
+        self.shelf['realkey'] = 'realvalue'
+        self.shelf['realkey']
+        self.shelf['realkey']
+        self.assertEquals(self.shelf.cache_hits, 1)
+
+
+class LRUCacheTests(unittest.TestCase):
+    '''Test the LRUCache class'''
+    # I'm a unittest of a class, I need to access protected members
+    # pylint: disable-msg=W0212
+
+    def test_setitem(self):
+        """test __delitem__ method"""
+        cache = LRUCache(100, 4)
+        # set some data in the cache
+        values = [('key'+str(i), i) for i in range(100)]
+        for i, j in values:
+            cache[i] = j
+        self.assertEquals(len(cache._queue), len(values))
+        self.assertEquals(len(cache._cache), len(values))
+
+    def test_getitem(self):
+        """test __delitem__ method"""
+        cache = LRUCache(100, 4)
+        # set some data in the cache
+        values = [('key'+str(i), i) for i in range(100)]
+        for i, j in values:
+            cache[i] = j
+        self.assertEquals(len(cache._queue), len(values))
+        self.assertEquals(len(cache._cache), len(values))
+        # compare all the items with the values
+        for i, j in values:
+            self.assertEquals(cache[i], j)
+
+    def test_delitem(self):
+        """test __delitem__ method"""
+        cache = LRUCache(100, 4)
+        values = [('key'+str(i), i) for i in range(100)]
+        for i, j in values:
+            cache[i] = j
+        self.assertEquals(len(cache._queue), len(values))
+        self.assertEquals(len(cache._cache), len(values))
+        for item in cache._cache.copy():
+            del cache[item]
+        self.assertEquals(len(cache._queue), 0)
+        self.assertEquals(len(cache._cache), 0)
+
+    def test_update(self):
+        """test the update method, chacking the refcount and queue."""
+        cache = LRUCache(100, 4)
+        cache.update('key1')
+        self.assertEquals(len(cache._queue), 1)
+        self.assertEquals(len(cache._refcount), 1)
+        self.assertEquals(cache._refcount['key1'], 1)
+        cache.update('key1')
+        self.assertEquals(len(cache._queue), 2)
+        self.assertEquals(len(cache._refcount), 1)
+        self.assertEquals(cache._refcount['key1'], 2)
+
+    def test_purge(self):
+        """Test the queue compact and cache purge"""
+        cache = LRUCache(100, 4)
+        values = [('key'+str(i), j) for i in range(50) for j in range(8)]
+        for i, j in values:
+            cache[i] = j
+        # we hit the limit
+        self.assertEquals(len(cache._queue), 400)
+        self.assertEquals(len(cache._cache), 50)
+        # insert key1 item to compact the queue
+        cache[values[0][0]] = values[0][1]
+        self.assertEquals(len(cache._queue), 50)
+        self.assertEquals(len(cache._cache), 50)
+
+        # now with a key not present in the cache
+        cache = LRUCache(100, 4)
+        for i, j in values:
+            cache[i] = j
+        # we hit the limit
+        self.assertEquals(len(cache._queue), 400)
+        self.assertEquals(len(cache._cache), 50)
+        # insert key1 item to compact the queue
+        cache['non-present-key'] = 'some value'
+        self.assertEquals(len(cache._queue), 51)
+        self.assertEquals(len(cache._cache), 51)
+
+    def test_statistics(self):
+        """Tests if the cache correclty keeps track of misses and hits."""
+        cache = LRUCache(100, 4)
+        # set some data in the cache
+        values = [('key'+str(i), i) for i in range(10)]
+        for i, j in values:
+            cache[i] = j
+        self.assertEquals(len(cache._queue), len(values))
+        self.assertEquals(len(cache._cache), len(values))
+        # compare 4 items with the values
+        for i, j in values[:4]:
+            self.assertEquals(cache[i], j)
+        # check the hits value
+        self.assertEquals(cache.hits, 4)
+        # try to get items not present in the cache
+        for i, j in values[5:10]:
+            self.assertRaises(KeyError, cache.__getitem__, i*10)
+        self.assertEquals(cache.misses, 5)
+
+    def test_inconsistency(self):
+        """Test that the consistency checking works as expected"""
+        cache = LRUCache(2, 1)
+        cache['foo'] = 'bar'
+        # add it again to the _queue to force a inconsistency
+        cache._queue.append('foo')
+        self.assertRaises(CacheInconsistencyError,
+                          cache.__setitem__, 'bar', 'foo')
+
+
 def test_suite():
     # pylint: disable-msg=C0111
     return unittest.TestLoader().loadTestsFromName(__name__)
 
=== modified file 'ubuntuone/syncdaemon/file_shelf.py'
--- ubuntuone/syncdaemon/file_shelf.py 2009-07-15 07:55:08 +0000
+++ ubuntuone/syncdaemon/file_shelf.py 2009-07-17 20:37:43 +0000
@@ -22,6 +22,10 @@
 from UserDict import DictMixin
 import cPickle
 import os
+import stat
+import errno
+from collections import deque
+
 
 class FileShelf(object, DictMixin):
     """ File based shelf.
@@ -46,10 +50,19 @@
46 """ check if the path isn't a file and in case it don't exists,50 """ check if the path isn't a file and in case it don't exists,
47 creates it51 creates it
48 """52 """
49 if os.path.exists(path) and os.path.isfile(path):53 try:
50 os.unlink(path)54 stat_result = os.stat(path)
51 if not os.path.exists(path):55 # is a regular file?
52 os.makedirs(path)56 if stat.S_ISREG(stat_result.st_mode):
57 os.unlink(path)
58 os.makedirs(path)
59 # else, the dir is already there
60 except OSError, e:
61 if e.errno == errno.ENOENT:
62 # the file or dir don't exist
63 os.makedirs(path)
64 else:
65 raise
5366
54 def key_file(self, key):67 def key_file(self, key):
55 """ get the real key used by the storage from a key """68 """ get the real key used by the storage from a key """
@@ -62,9 +75,7 @@
             raise ValueError("The key (%r) needs to be longer!" % key)
 
         letters = [key[i] for i in xrange(0, self._depth)]
-        path = os.path.join(self._path, *letters)
-        self._check_and_create_dirs(path)
-        return os.path.join(path, key)
+        return os.path.join(os.path.join(self._path, *letters), key)
 
     def has_key(self, key):
         """ True if the the shelf has the key """
@@ -105,7 +116,9 @@
 
     def __setitem__(self, key, value):
         """ setitem backed by the file storage """
-        with open(self.key_file(key), "wb") as fh:
+        path = self.key_file(key)
+        self._check_and_create_dirs(os.path.dirname(path))
+        with open(path, "wb") as fh:
             cPickle.dump(value, fh, protocol=2)
             fh.flush()
             os.fsync(fh.fileno())
@@ -124,3 +137,143 @@
             counter += 1
         return counter
 
+
+class CachedFileShelf(FileShelf):
+    """A extension of FileShelf that uses a cache of 1500 items"""
+
+    def __init__(self, *args, **kwargs):
+        """Create the instance"""
+        super(CachedFileShelf, self).__init__(*args, **kwargs)
+        # XXX: the size of the cache and the compact threshold needs to be
+        # tweaked once we get more statistics from real usage
+        self._cache = LRUCache(1500, 4)
+
+    @property
+    def cache_misses(self):
+        """proterty to access the internal cache misses"""
+        return self._cache.misses
+
+    @property
+    def cache_hits(self):
+        """proterty to access the internal cache hits"""
+        return self._cache.hits
+
+    def __getitem__(self, key):
+        """ getitem backed by the file storage """
+        try:
+            return self._cache[key]
+        except KeyError:
+            value = super(CachedFileShelf, self).__getitem__(key)
+            # add it to the cache
+            self._cache[key] = value
+            return value
+
+    def __setitem__(self, key, value):
+        """ setitem backed by the file storage """
+        super(CachedFileShelf, self).__setitem__(key, value)
+        if key in self._cache:
+            self._cache[key] = value
+
+    def __delitem__(self, key):
+        """ delitem backed by the file storage """
+        super(CachedFileShelf, self).__delitem__(key)
+        if key in self._cache:
+            del self._cache[key]
+
+
+class LRUCache(object):
+    """A least-recently-used|updated cache with maximum size.
+
+    The object(s) added to the cache must be hashable.
+    Cache performance statistics stored in self.hits and self.misses.
+
+    Based on recipe #252524 by Raymond Hettinger
+    """
+    def __init__(self, maxsize, compact_threshold=4):
+        """Create the instance.
+        @param maxsize:
+        @param compact_threshold:
+        """
+        self._maxsize = maxsize
+        self._compact_threshold = compact_threshold
+        self._cache = {} # mapping of args to results
+        self._queue = deque() # order that keys have been accessed
+        self._refcount = {} # number of times each key is in the access queue
+        self.hits = 0
+        self.misses = 0
+
+    def __getitem__(self, key):
+        """return the item from the cache or raise KeyError."""
+        try:
+            result = self._cache[key]
+            self.hits += 1
+        except KeyError:
+            # get the value and increase misses
+            self.misses += 1
+            raise
+        else:
+            self.update(key)
+            self.purge()
+            return result
+
+    def __setitem__(self, key, value):
+        """set the key, value in the cache"""
+        self._cache[key] = value
+        self.update(key)
+        self.purge()
+
+    def __delitem__(self, key):
+        """removes the key, value from the cache"""
+        del self._cache[key]
+        self._refcount[key] -= 1
+        self._queue.remove(key)
+        self.purge()
+
+    def __contains__(self, key):
+        """returns True if key is in the cache"""
+        return key in self._cache
+
+    def update(self, key):
+        """Update the least recently used|updated and refcount"""
+        self._queue.append(key)
+        self._refcount[key] = self._refcount.get(key, 0) + 1
+
+    def purge(self):
+        """Purge least recently accessed cache contents and periodically
+        compact the queue by duplicate keys.
+        """
+        while len(self._cache) > self._maxsize:
+            k = self._queue.popleft()
+            self._refcount[k] -= 1
+            if not self._refcount[k]:
+                if k in self._cache:
+                    del self._cache[k]
+                if k in self._refcount:
+                    del self._refcount[k]
+
+        # Periodically compact the queue by duplicate keys
+        queue_len = len(self._queue)
+        if queue_len > self._maxsize * self._compact_threshold:
+            for _ in xrange(queue_len):
+                k = self._queue.popleft()
+                if self._refcount[k] == 1:
+                    self._queue.append(k)
+                else:
+                    self._refcount[k] -= 1
+            if not (len(self._queue) == len(self._cache) \
+                    == len(self._refcount) \
+                    == sum(self._refcount.itervalues())):
+                # create a custom exception for this error
+                raise CacheInconsistencyError(len(self._queue),
+                                              len(self._cache),
+                                              len(self._refcount),
+                                              sum(self._refcount.itervalues()))
+
+
+class CacheInconsistencyError(Exception):
+    """Exception representing a inconsistency in the cache"""
+
+    def __str__(self):
+        return "Inconsistency in the cache: queue: %d cache: %d refcount: %d" \
+               " sum(refcount.values): %d" % self.args
+
 
=== modified file 'ubuntuone/syncdaemon/filesystem_manager.py'
--- ubuntuone/syncdaemon/filesystem_manager.py 2009-07-14 13:37:22 +0000
+++ ubuntuone/syncdaemon/filesystem_manager.py 2009-07-16 22:52:12 +0000
@@ -187,7 +187,7 @@
             raise TypeError("data_dir should be a string instead of %s" % \
                             type(data_dir))
         fsmdir = os.path.join(data_dir, 'fsm')
-        self.fs = file_shelf.FileShelf(fsmdir)
+        self.fs = file_shelf.CachedFileShelf(fsmdir)
         self.shares = {}
         self.vm = vm
 
 
=== modified file 'ubuntuone/syncdaemon/main.py'
--- ubuntuone/syncdaemon/main.py 2009-07-14 05:49:49 +0000
+++ ubuntuone/syncdaemon/main.py 2009-07-17 20:31:26 +0000
@@ -94,10 +94,12 @@
     def log_mark(self):
         """ log a "mark" that includes the current AQ state and queue size"""
         self.logger.info("%s %s (state: %s; queues: metadata: %d; content: %d;"
-                         " hash: %d) %s" % ('-'*4, 'MARK', self.state.name,
+                         " hash: %d, fsm-cache: hit=%d miss=%d) %s" % ('-'*4, 'MARK', self.state.name,
                          len(self.action_q.meta_queue),
                          len(self.action_q.content_queue),
-                         len(self.hash_q), '-'*4))
+                         len(self.hash_q),
+                         self.fs.fs.cache_hits,
+                         self.fs.fs.cache_misses, '-'*4))
 
     def wait_for_nirvana(self, last_event_interval=0.5):
         """Get a deferred that will fire when there are no more
