Merge lp:~mwhudson/loggerhead/persist-_rev_info into lp:~mwhudson/loggerhead/less-stupid-whole-history-redundancy

Proposed by Michael Hudson-Doyle
Status: Merged
Merge reported by: Michael Hudson-Doyle
Merged at revision: not available
Proposed branch: lp:~mwhudson/loggerhead/persist-_rev_info
Merge into: lp:~mwhudson/loggerhead/less-stupid-whole-history-redundancy
Diff against target: None lines
To merge this branch: bzr merge lp:~mwhudson/loggerhead/persist-_rev_info
Reviewer Review Type Date Requested Status
Paul Hummer (community) Approve
Review via email: mp+5854@code.launchpad.net
To post a comment you must log in.
Revision history for this message
Paul Hummer (rockstar) wrote :

Thanks for documenting the whole history data structures.

review: Approve
Revision history for this message
Matt Nordhoff (mnordhoff) wrote :

It's not a big deal, but could somebody make this merge request... go
away? This branch wasn't merged into
less-stupid-whole-history-redundancy, but it *was* merged into
lp:loggerhead, so for all intents and purposes it's been dealt with. But
this request is still open, so it's still listed on the merge request
summary page.
--

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'loggerhead/apps/branch.py'
2--- loggerhead/apps/branch.py 2009-04-07 18:29:10 +0000
3+++ loggerhead/apps/branch.py 2009-04-24 10:23:44 +0000
4@@ -38,27 +38,31 @@
5 self.branch_link = branch_link # Currently only used in Launchpad
6 self.log = logging.getLogger('loggerhead.%s' % friendly_name)
7 if graph_cache is None:
8- graph_cache = bzrlib.lru_cache.LRUCache()
9+ graph_cache = bzrlib.lru_cache.LRUCache(10)
10 self.graph_cache = graph_cache
11 self.is_root = is_root
12 self.served_url = served_url
13 self.use_cdn = use_cdn
14
15 def get_history(self):
16- _history = History(self.branch, self.graph_cache)
17+ file_cache = None
18+ revinfo_disk_cache = None
19 cache_path = self._config.get('cachepath', None)
20 if cache_path is not None:
21 # Only import the cache if we're going to use it.
22 # This makes sqlite optional
23 try:
24- from loggerhead.changecache import FileChangeCache
25+ from loggerhead.changecache import (
26+ FileChangeCache, RevInfoDiskCache)
27 except ImportError:
28 self.log.debug("Couldn't load python-sqlite,"
29 " continuing without using a cache")
30 else:
31- _history.use_file_cache(
32- FileChangeCache(_history, cache_path))
33- return _history
34+ file_cache = FileChangeCache(cache_path)
35+ revinfo_disk_cache = RevInfoDiskCache(cache_path)
36+ return History(
37+ self.branch, self.graph_cache, file_cache=file_cache,
38+ revinfo_disk_cache=revinfo_disk_cache, cache_key=self.friendly_name)
39
40 def url(self, *args, **kw):
41 if isinstance(args[0], list):
42
43=== modified file 'loggerhead/apps/filesystem.py'
44--- loggerhead/apps/filesystem.py 2009-04-23 23:39:18 +0000
45+++ loggerhead/apps/filesystem.py 2009-04-24 10:23:44 +0000
46@@ -71,7 +71,7 @@
47 class BranchesFromFileSystemRoot(object):
48
49 def __init__(self, folder, config):
50- self.graph_cache = lru_cache.LRUCache()
51+ self.graph_cache = lru_cache.LRUCache(10)
52 self.folder = folder
53 self._config = config
54
55@@ -94,7 +94,7 @@
56 class UserBranchesFromFileSystemRoot(object):
57
58 def __init__(self, folder, config):
59- self.graph_cache = lru_cache.LRUCache()
60+ self.graph_cache = lru_cache.LRUCache(10)
61 self.folder = folder
62 self._config = config
63 self.trunk_dir = config.get_option('trunk_dir')
64
65=== modified file 'loggerhead/changecache.py'
66--- loggerhead/changecache.py 2009-03-19 21:01:11 +0000
67+++ loggerhead/changecache.py 2009-04-24 10:30:15 +0000
68@@ -27,8 +27,10 @@
69 """
70
71 import cPickle
72+import marshal
73 import os
74 import tempfile
75+import zlib
76
77 try:
78 from sqlite3 import dbapi2
79@@ -38,16 +40,25 @@
80 # We take an optimistic approach to concurrency here: we might do work twice
81 # in the case of races, but not crash or corrupt data.
82
83+def safe_init_db(filename, init_sql):
84+ # To avoid races around creating the database, we create the db in
85+ # a temporary file and rename it into the ultimate location.
86+ fd, temp_path = tempfile.mkstemp(dir=os.path.dirname(filename))
87+ con = dbapi2.connect(temp_path)
88+ cur = con.cursor()
89+ cur.execute(init_sql)
90+ con.commit()
91+ con.close()
92+ os.rename(temp_path, filename)
93+
94 class FakeShelf(object):
95
96 def __init__(self, filename):
97 create_table = not os.path.exists(filename)
98 if create_table:
99- # To avoid races around creating the database, we create the db in
100- # a temporary file and rename it into the ultimate location.
101- fd, path = tempfile.mkstemp(dir=os.path.dirname(filename))
102- self._create_table(path)
103- os.rename(path, filename)
104+ safe_init_db(
105+ filename, "create table RevisionData "
106+ "(revid binary primary key, data binary)")
107 self.connection = dbapi2.connect(filename)
108 self.cursor = self.connection.cursor()
109
110@@ -89,8 +100,7 @@
111
112 class FileChangeCache(object):
113
114- def __init__(self, history, cache_path):
115- self.history = history
116+ def __init__(self, cache_path):
117
118 if not os.path.exists(cache_path):
119 os.mkdir(cache_path)
120@@ -104,3 +114,44 @@
121 changes = self.history.get_file_changes_uncached(entry)
122 cache.add(entry.revid, changes)
123 return changes
124+
125+
126+class RevInfoDiskCache(object):
127+    """Like `RevInfoMemoryCache` but backed by a sqlite DB."""
128+
129+ def __init__(self, cache_path):
130+ if not os.path.exists(cache_path):
131+ os.mkdir(cache_path)
132+ filename = os.path.join(cache_path, 'revinfo.sql')
133+ create_table = not os.path.exists(filename)
134+ if create_table:
135+ safe_init_db(
136+ filename, "create table Data "
137+ "(key binary primary key, revid binary, data binary)")
138+ self.connection = dbapi2.connect(filename)
139+ self.cursor = self.connection.cursor()
140+
141+ def get(self, key, revid):
142+ self.cursor.execute(
143+ "select revid, data from data where key = ?", (dbapi2.Binary(key),))
144+ row = self.cursor.fetchone()
145+ if row is None:
146+ return None
147+ elif str(row[0]) != revid:
148+ return None
149+ else:
150+ return marshal.loads(zlib.decompress(row[1]))
151+
152+ def set(self, key, revid, data):
153+ try:
154+ self.cursor.execute(
155+ 'delete from data where key = ?', (dbapi2.Binary(key), ))
156+ blob = zlib.compress(marshal.dumps(data))
157+ self.cursor.execute(
158+ "insert into data (key, revid, data) values (?, ?, ?)",
159+ map(dbapi2.Binary, [key, revid, blob]))
160+ self.connection.commit()
161+ except dbapi2.IntegrityError:
162+ # If another thread or process attempted to set the same key, we
163+ # don't care too much -- it's only a cache after all!
164+ pass
165
166=== modified file 'loggerhead/history.py'
167--- loggerhead/history.py 2009-04-24 03:44:30 +0000
168+++ loggerhead/history.py 2009-04-24 10:30:15 +0000
169@@ -31,6 +31,7 @@
170 import bisect
171 import datetime
172 import logging
173+import marshal
174 import re
175 import textwrap
176 import threading
177@@ -47,6 +48,7 @@
178 import bzrlib.delta
179 import bzrlib.diff
180 import bzrlib.errors
181+import bzrlib.lru_cache
182 import bzrlib.progress
183 import bzrlib.revision
184 import bzrlib.textfile
185@@ -177,6 +179,43 @@
186 file_id=file_id))
187
188
189+class RevInfoMemoryCache(object):
190+ """A store that validates values against the revids they were stored with.
191+
192+ We use a unique key for each branch.
193+
194+ The reason for not just using the revid as the key is so that when a new
195+ value is provided for a branch, we replace the old value used for the
196+ branch.
197+
198+ There is another implementation of the same interface in
199+ loggerhead.changecache.RevInfoDiskCache.
200+ """
201+
202+ def __init__(self, cache):
203+ self._cache = cache
204+
205+ def get(self, key, revid):
206+ """Return the data associated with `key`, subject to a revid check.
207+
208+ If a value was stored under `key`, with the same revid, return it.
209+ Otherwise return None.
210+ """
211+ cached = self._cache.get(key)
212+ if cached is None:
213+ return None
214+ stored_revid, data = cached
215+ if revid == stored_revid:
216+ return data
217+ else:
218+ return None
219+
220+ def set(self, key, revid, data):
221+ """Store `data` under `key`, to be checked against `revid` on get().
222+ """
223+ self._cache[key] = (revid, data)
224+
225+
226 class History (object):
227 """Decorate a branch to provide information for rendering.
228
229@@ -185,13 +224,76 @@
230 around it, serve the request, throw the History object away, unlock the
231 branch and throw it away.
232
233- :ivar _file_change_cache: xx
234+ :ivar _file_change_cache: An object that caches information about the
235+ files that changed between two revisions.
236+ :ivar _rev_info: A list of information about revisions. This is by far
237+ the most cryptic data structure in loggerhead. At the top level, it
238+ is a list of 3-tuples [(merge-info, where-merged, parents)].
239+ `merge-info` is (seq, revid, merge_depth, revno_str, end_of_merge) --
240+ like a merged sorted list, but the revno is stringified.
241+ `where-merged` is a tuple of revisions that have this revision as a
242+ non-lefthand parent. Finally, `parents` is just the usual list of
243+ parents of this revision.
244+ :ivar _rev_indices: A dictionary mapping each revision id to the index of
245+ the information about it in _rev_info.
246+ :ivar _full_history: A list of all revision ids in the ancestry of the
247+ branch, in merge-sorted order. This is a bit silly, and shouldn't
248+ really be stored on the instance...
249+ :ivar _revno_revid: A dictionary mapping stringified revnos to revision
250+ ids.
251 """
252
253- def __init__(self, branch, whole_history_data_cache):
254+ def _load_whole_history_data(self, caches, cache_key):
255+ """Set the attributes relating to the whole history of the branch.
256+
257+        :param caches: a list of caches with interfaces like
258+            `RevInfoMemoryCache`, ordered from fastest to slowest.
259+ :param cache_key: the key to use with the caches.
260+ """
261+ self._rev_indices = None
262+ self._rev_info = None
263+
264+ missed_caches = []
265+ def update_missed_caches():
266+ for cache in missed_caches:
267+ cache.set(cache_key, self.last_revid, self._rev_info)
268+ for cache in caches:
269+ data = cache.get(cache_key, self.last_revid)
270+ if data is not None:
271+ self._rev_info = data
272+ update_missed_caches()
273+ break
274+ else:
275+ missed_caches.append(cache)
276+ else:
277+ whole_history_data = compute_whole_history_data(self._branch)
278+ self._rev_info, self._rev_indices = whole_history_data
279+ update_missed_caches()
280+
281+ if self._rev_indices is not None:
282+ self._full_history = []
283+ self._revno_revid = {}
284+ for ((_, revid, _, revno_str, _), _, _) in self._rev_info:
285+ self._revno_revid[revno_str] = revid
286+ self._full_history.append(revid)
287+ else:
288+ self._full_history = []
289+ self._revno_revid = {}
290+ self._rev_indices = {}
291+ for ((seq, revid, _, revno_str, _), _, _) in self._rev_info:
292+ self._rev_indices[revid] = seq
293+ self._revno_revid[revno_str] = revid
294+ self._full_history.append(revid)
295+
296+ def __init__(self, branch, whole_history_data_cache, file_cache=None,
297+ revinfo_disk_cache=None, cache_key=None):
298 assert branch.is_locked(), (
299 "Can only construct a History object with a read-locked branch.")
300- self._file_change_cache = None
301+ if file_cache is not None:
302+ self._file_change_cache = file_cache
303+ file_cache.history = self
304+ else:
305+ self._file_change_cache = None
306 self._branch = branch
307 self._inventory_cache = {}
308 self._branch_nick = self._branch.get_config().get_nickname()
309@@ -199,28 +301,10 @@
310
311 self.last_revid = branch.last_revision()
312
313- cached_whole_history_data = whole_history_data_cache.get(self.last_revid)
314- if cached_whole_history_data is None:
315- whole_history_data = compute_whole_history_data(branch)
316- (self._rev_info, self._rev_indices) = whole_history_data
317- whole_history_data_cache[self.last_revid] = whole_history_data[0]
318- self._full_history = []
319- self._revno_revid = {}
320- for ((_, revid, _, revno_str, _), _, _) in self._rev_info:
321- self._revno_revid[revno_str] = revid
322- self._full_history.append(revid)
323- else:
324- self._rev_info = cached_whole_history_data
325- self._full_history = []
326- self._revno_revid = {}
327- self._rev_indices = {}
328- for ((seq, revid, _, revno_str, _), _, _) in self._rev_info:
329- self._rev_indices[revid] = seq
330- self._revno_revid[revno_str] = revid
331- self._full_history.append(revid)
332-
333- def use_file_cache(self, cache):
334- self._file_change_cache = cache
335+ caches = [RevInfoMemoryCache(whole_history_data_cache)]
336+ if revinfo_disk_cache:
337+ caches.append(revinfo_disk_cache)
338+ self._load_whole_history_data(caches, cache_key)
339
340 @property
341 def has_revisions(self):
342
343=== modified file 'loggerhead/wholehistory.py'
344--- loggerhead/wholehistory.py 2009-04-24 03:12:28 +0000
345+++ loggerhead/wholehistory.py 2009-04-24 10:30:15 +0000
346@@ -37,6 +37,10 @@
347
348
349 def compute_whole_history_data(branch):
350+ """Compute _rev_info and _rev_indices for a branch.
351+
352+ See History.__doc__ for what these data structures mean.
353+ """
354 z = time.time()
355
356 last_revid = branch.last_revision()

Subscribers

People subscribed via source and target branches

to all changes: