Merge ~wgrant/launchpad:archive-file-history-refactor into launchpad:master

Proposed by William Grant
Status: Rejected
Rejected by: William Grant
Proposed branch: ~wgrant/launchpad:archive-file-history-refactor
Merge into: launchpad:master
Prerequisite: ~cjwatson/launchpad:archive-file-history
Diff against target: 269 lines (+98/-98)
2 files modified
lib/lp/archivepublisher/publishing.py (+95/-98)
lib/lp/archivepublisher/tests/test_publisher.py (+3/-0)
Reviewer: Launchpad code reviewers (status: Pending)
Review via email: mp+390811@code.launchpad.net

Unmerged commits

1bdfbaa... by William Grant

Experimental _updateByHash refactoring.

afbab5f... by William Grant

Factor _getCurrentFiles out of _updateByHash.
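
For reference, the shape of the helper's return value as implemented in the diff below: a mapping from paths relative to the suite directory to (size, sha256, real_path) tuples. This is an illustrative snippet with invented example data, not Launchpad code.

    # Illustration only; values are made up.
    current_files = {
        "main/binary-amd64/Packages.gz": (
            1234,
            "9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08",
            "focal/main/binary-amd64/Packages.gz"),
    }
    for path, (size, sha256, real_path) in current_files.items():
        print("%s: %d bytes, sha256 %s, at %s" % (path, size, sha256, real_path))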

178ada8... by Colin Watson

Turn ArchiveFile into a history table

This adds date_created and date_superseded columns. Adjust the
publisher to match.

LP: #1765933
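
For context, a sketch of the history-table lifecycle this commit describes: a row starts current, and superseding it stamps date_superseded and schedules deletion instead of deleting the row. This is a hypothetical plain-Python stand-in, not the real ArchiveFile model, and the stay-of-execution value is assumed for illustration.

    from datetime import timedelta

    STAY_OF_EXECUTION = timedelta(days=1)  # value assumed for illustration

    class ArchiveFileRow:
        def __init__(self, path, sha256, now):
            self.path = path
            self.sha256 = sha256
            self.date_created = now
            self.date_superseded = None          # None while still current
            self.scheduled_deletion_date = None  # set once superseded

        def supersede(self, now):
            # A history table never resurrects a row: superseding stamps
            # date_superseded and starts the stay of execution.
            assert self.date_superseded is None
            self.date_superseded = now
            self.scheduled_deletion_date = now + STAY_OF_EXECUTION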

Preview Diff

diff --git a/lib/lp/archivepublisher/publishing.py b/lib/lp/archivepublisher/publishing.py
index 4735b89..56ef070 100644
--- a/lib/lp/archivepublisher/publishing.py
+++ b/lib/lp/archivepublisher/publishing.py
@@ -75,6 +75,7 @@ from lp.registry.model.distroseries import DistroSeries
 from lp.services.compat import lzma
 from lp.services.database.constants import UTC_NOW
 from lp.services.database.interfaces import IStore
+from lp.services.database.sqlbase import get_transaction_timestamp
 from lp.services.features import getFeatureFlag
 from lp.services.helpers import filenameToContentType
 from lp.services.librarian.client import LibrarianClient
@@ -95,6 +96,7 @@ from lp.soyuz.interfaces.publishing import (
     active_publishing_status,
     IPublishingSet,
     )
+from lp.soyuz.model.archivefile import ArchiveFile
 from lp.soyuz.model.distroarchseries import DistroArchSeries
 from lp.soyuz.model.publishing import (
     BinaryPackagePublishingHistory,
@@ -1040,15 +1042,13 @@ class Publisher(object):
             return self.distro.displayname
         return "LP-PPA-%s" % get_ppa_reference(self.archive)
 
-    def _updateByHash(self, suite, release_file_name, extra_files):
-        """Update by-hash files for a suite.
+    def _getCurrentFiles(self, suite, release_file_name, extra_files):
+        # Gather information on entries in the current Release file.
+        release_path = os.path.join(
+            self._config.distsroot, suite, release_file_name)
+        with open(release_path) as release_file:
+            release_data = Release(release_file)
 
-        This takes Release file data which references a set of on-disk
-        files, injects any newly-modified files from that set into the
-        librarian and the ArchiveFile table, and updates the on-disk by-hash
-        directories to be in sync with ArchiveFile. Any on-disk by-hash
-        entries that ceased to be current sufficiently long ago are removed.
-        """
         extra_data = {}
         for filename, real_filename in extra_files.items():
             hashes = self._readIndexFileHashes(
@@ -1059,22 +1059,9 @@ class Publisher(object):
                 extra_data.setdefault(archive_hash.apt_name, []).append(
                     hashes[archive_hash.deb822_name])
 
-        release_path = os.path.join(
-            self._config.distsroot, suite, release_file_name)
-        with open(release_path) as release_file:
-            release_data = Release(release_file)
-        archive_file_set = getUtility(IArchiveFileSet)
-        by_hashes = ByHashes(self._config.distsroot, self.log)
         suite_dir = os.path.relpath(
             os.path.join(self._config.distsroot, suite),
             self._config.distsroot)
-        container = "release:%s" % suite
-
-        def strip_dists(path):
-            assert path.startswith("dists/")
-            return path[len("dists/"):]
-
-        # Gather information on entries in the current Release file.
         current_files = {}
         for current_entry in (
                 release_data["SHA256"] + extra_data.get("SHA256", [])):
@@ -1083,102 +1070,112 @@ class Publisher(object):
             real_path = os.path.join(suite_dir, real_name)
             current_files[path] = (
                 int(current_entry["size"]), current_entry["sha256"], real_path)
+        return current_files
+
+    def _updateByHash(self, suite, release_file_name, extra_files):
+        """Update by-hash files for a suite.
 
-        # Gather information on entries currently in the database. Ensure
-        # that we know about all the relevant by-hash directory trees before
-        # doing any removals so that we can prune them properly later, and
-        # work out which condemned files should be reprieved due to the
-        # paths in question having their previous content again.
-        reprieved_files = defaultdict(list)
-        uncondemned_files = set()
+        This takes Release file data which references a set of on-disk
+        files, injects any newly-modified files from that set into the
+        librarian and the ArchiveFile table, and updates the on-disk by-hash
+        directories to be in sync with ArchiveFile. Any on-disk by-hash
+        entries that ceased to be current sufficiently long ago are removed.
+        """
+        archive_file_set = getUtility(IArchiveFileSet)
+        container = "release:%s" % suite
+
+        by_hashes = ByHashes(self._config.distsroot, self.log)
+        existing_lfa_cache = {}
+        existing_live_files = {}
+        reapable_archivefiles = set()
+
+        def strip_dists(path):
+            assert path.startswith("dists/")
+            return path[len("dists/"):]
+
+        # Record all files from the database.
+        db_now = get_transaction_timestamp(IStore(ArchiveFile))
         for db_file in archive_file_set.getByArchive(
                 self.archive, container=container, eager_load=True):
+            file_key = (
+                strip_dists(db_file.path), db_file.library_file.content.sha256)
+            existing_lfa_cache[file_key] = db_file.library_file
+            # Ensure any subdirectories are registered early on, in case we're
+            # about to delete the only file and need to know to prune it.
             by_hashes.registerChild(os.path.dirname(strip_dists(db_file.path)))
-            file_key = (db_file.path, db_file.library_file.content.sha256)
-            if db_file.scheduled_deletion_date is None:
-                uncondemned_files.add(file_key)
-            else:
-                stripped_path = strip_dists(db_file.path)
-                if stripped_path in current_files:
-                    current_sha256 = current_files[stripped_path][1]
-                    if db_file.library_file.content.sha256 == current_sha256:
-                        reprieved_files[file_key].append(db_file)
-
-        # We may already have uncondemned entries with the same path and
-        # content as condemned entries that we were about to reprieve; if
-        # so, there's no need to reprieve them.
-        for file_key in uncondemned_files:
-            reprieved_files.pop(file_key, None)
-
-        # Make sure nothing in the current Release file is condemned.
-        if reprieved_files:
-            reprieved_files_flat = set(
-                chain.from_iterable(reprieved_files.values()))
-            archive_file_set.unscheduleDeletion(reprieved_files_flat)
-            for db_file in reprieved_files_flat:
-                self.log.debug(
-                    "by-hash: Unscheduled %s for %s in %s for deletion" % (
-                        db_file.library_file.content.sha256, db_file.path,
-                        db_file.container))
 
-        # Remove any condemned files from the database whose stay of
-        # execution has elapsed. We ensure that we know about all the
-        # relevant by-hash directory trees before doing any removals so that
-        # we can prune them properly later.
-        for container, path, sha256 in archive_file_set.reap(
-                self.archive, container=container):
-            if (path, sha256) not in uncondemned_files:
-                self.log.debug(
-                    "by-hash: Deleted %s for %s in %s" %
-                    (sha256, path, container))
-
-        # Ensure that all files recorded in the database are in by-hash.
-        db_files = archive_file_set.getByArchive(
-            self.archive, container=container, eager_load=True)
-        for db_file in db_files:
-            by_hashes.add(strip_dists(db_file.path), db_file.library_file)
-
-        # Condemn any database records that do not correspond to current
-        # index files.
-        condemned_files = set()
-        for db_file in db_files:
-            if db_file.scheduled_deletion_date is None:
-                stripped_path = strip_dists(db_file.path)
-                if stripped_path in current_files:
-                    current_sha256 = current_files[stripped_path][1]
-                else:
-                    current_sha256 = None
-                if db_file.library_file.content.sha256 != current_sha256:
-                    condemned_files.add(db_file)
-        if condemned_files:
-            archive_file_set.scheduleDeletion(
-                condemned_files, timedelta(days=BY_HASH_STAY_OF_EXECUTION))
-            for db_file in condemned_files:
-                self.log.debug(
-                    "by-hash: Scheduled %s for %s in %s for deletion" % (
-                        db_file.library_file.content.sha256, db_file.path,
-                        db_file.container))
+            if db_file.date_superseded is None:
+                # XXX: Should be a DB constraint.
+                assert file_key not in existing_live_files
+                existing_live_files[file_key] = db_file
+            if (db_file.scheduled_deletion_date is not None
+                    and db_file.scheduled_deletion_date < db_now):
+                # File has expired. Mark it for reaping.
+                reapable_archivefiles.add(db_file)
+            else:
+                # File should still be on disk.
+                by_hashes.add(strip_dists(db_file.path), db_file.library_file)
 
         # Ensure that all the current index files are in by-hash and have
         # corresponding database entries.
         # XXX cjwatson 2016-03-15: This should possibly use bulk creation,
         # although we can only avoid about a third of the queries since the
         # librarian client has no bulk upload methods.
+        new_live_files = set()
+        current_files = self._getCurrentFiles(
+            suite, release_file_name, extra_files)
         for path, (size, sha256, real_path) in current_files.items():
+            file_key = (path, sha256)
+            new_live_files.add(file_key)
             full_path = os.path.join(self._config.distsroot, real_path)
-            if (os.path.exists(full_path) and
-                    not by_hashes.known(path, "SHA256", sha256)):
-                with open(full_path, "rb") as fileobj:
-                    db_file = archive_file_set.newFromFile(
+            if not os.path.exists(full_path):
+                # XXX: Should warn?
+                continue
+            # Ensure there's a current ArchiveFile row.
+            if file_key not in existing_live_files:
+                existing_lfa = existing_lfa_cache.get(file_key)
+                if existing_lfa:
+                    # XXX: Untested?
+                    db_file = archive_file_set.new(
                         self.archive, container, os.path.join("dists", path),
-                        fileobj, size, filenameToContentType(path))
+                        existing_lfa)
+                else:
+                    with open(full_path, "rb") as fileobj:
+                        db_file = archive_file_set.newFromFile(
+                            self.archive, container,
+                            os.path.join("dists", path), fileobj, size,
+                            filenameToContentType(path))
+            # And ensure the by-hash links exist on disk.
+            # XXX: Not sure known is pulling its weight, just avoiding a stat.
+            if not by_hashes.known(path, "SHA256", sha256):
                 by_hashes.add(
                     path, db_file.library_file, copy_from_path=real_path)
 
-        # Finally, remove any files from disk that aren't recorded in the
-        # database and aren't active.
+        # Schedule the deletion of any ArchiveFiles which are current in the DB
+        # but weren't current in the archive this round.
+        # XXX: This order would violate a unique constraint.
+        condemned_files = [
+            af for key, af in existing_live_files.items()
+            if key not in new_live_files]
+        if condemned_files:
+            archive_file_set.scheduleDeletion(
+                condemned_files, timedelta(days=BY_HASH_STAY_OF_EXECUTION))
+            for db_file in condemned_files:
+                self.log.debug(
+                    "by-hash: Scheduled %s for %s in %s for deletion" % (
+                        db_file.library_file.content.sha256, db_file.path,
+                        db_file.container))
+
+        # Remove any files from disk that aren't recorded in the database.
         by_hashes.prune()
 
+        # And remove expired DB rows now that we've pruned them and their dirs
+        # from disk.
+        # XXX: Probably should have a model method. reap doesn't quite work,
+        # since I'd prefer to know it was the same set we just considered.
+        for db_file in reapable_archivefiles:
+            IStore(ArchiveFile).remove(db_file)
+
     def _writeReleaseFile(self, suite, release_data):
         """Write a Release file to the archive (as Release.new).
 
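
The core of the refactoring above can be distilled into a standalone sketch: _updateByHash now keys database rows by (path, sha256), condemns live rows that are no longer current, and reaps rows whose stay of execution has elapsed, judged against the database transaction timestamp. The names below mirror the diff, but the Launchpad-specific calls are elided, so this is illustrative rather than drop-in code.

    def reconcile(existing_live_files, all_db_files, new_live_files, db_now):
        # Rows current in the DB but absent from this round's Release file
        # are condemned (scheduled for deletion) rather than removed at once.
        condemned = [
            af for key, af in existing_live_files.items()
            if key not in new_live_files]
        # Rows whose stay of execution has elapsed, judged against the DB
        # transaction timestamp, are reaped in the same pass.
        reapable = {
            af for af in all_db_files
            if af.scheduled_deletion_date is not None
            and af.scheduled_deletion_date < db_now}
        return condemned, reapable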
diff --git a/lib/lp/archivepublisher/tests/test_publisher.py b/lib/lp/archivepublisher/tests/test_publisher.py
index 5308e12..c1b6daf 100644
--- a/lib/lp/archivepublisher/tests/test_publisher.py
+++ b/lib/lp/archivepublisher/tests/test_publisher.py
@@ -2626,6 +2626,9 @@ class TestUpdateByHash(TestPublisherBase):
         mocked_datetime.utcnow = lambda: self.times[-1].replace(tzinfo=None)
         self.useFixture(MonkeyPatch(
             'lp.soyuz.model.archivefile._now', lambda: self.times[-1]))
+        self.useFixture(MonkeyPatch(
+            'lp.archivepublisher.publishing.get_transaction_timestamp',
+            lambda _: self.times[-1]))
 
     def advanceTime(self, delta=None, absolute=None):
         if delta is not None:
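
The test change above extends the fixture's frozen clock: because _updateByHash now reads the database transaction timestamp, get_transaction_timestamp is patched to follow the same fake time as the other patched clocks. A minimal sketch of that pattern, assuming the fixtures.MonkeyPatch API that test_publisher.py already imports (freeze_publisher_clock is a hypothetical helper):

    from fixtures import MonkeyPatch

    def freeze_publisher_clock(test, frozen_time):
        # Mirror the patch added in the diff above: make the publisher's
        # notion of the transaction timestamp follow the test's frozen clock.
        test.useFixture(MonkeyPatch(
            'lp.archivepublisher.publishing.get_transaction_timestamp',
            lambda _store: frozen_time))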
