Merge lp:~cjwatson/launchpad/archive-index-by-hash into lp:launchpad

Proposed by Colin Watson
Status: Merged
Merged at revision: 17975
Proposed branch: lp:~cjwatson/launchpad/archive-index-by-hash
Merge into: lp:launchpad
Prerequisite: lp:~cjwatson/launchpad/ds-publish-by-hash
Diff against target: 1502 lines (+1079/-53)
10 files modified
lib/lp/archivepublisher/model/ftparchive.py (+6/-2)
lib/lp/archivepublisher/publishing.py (+282/-19)
lib/lp/archivepublisher/tests/test_publisher.py (+599/-1)
lib/lp/registry/model/distribution.py (+14/-2)
lib/lp/services/helpers.py (+31/-12)
lib/lp/services/librarian/interfaces/__init__.py (+1/-1)
lib/lp/services/librarian/model.py (+4/-2)
lib/lp/soyuz/interfaces/archivefile.py (+25/-1)
lib/lp/soyuz/model/archivefile.py (+63/-11)
lib/lp/soyuz/tests/test_archivefile.py (+54/-2)
To merge this branch: bzr merge lp:~cjwatson/launchpad/archive-index-by-hash
Reviewer: William Grant
Review type: code
Status: Approve
Review via email: mp+289379@code.launchpad.net

Commit message

Add files indexed by Release to the librarian and to ArchiveFile. Publish them in by-hash directories, keeping old versions for a day.

Description of the change

Add files indexed by Release to the librarian and to ArchiveFile. Publish them in by-hash directories, keeping old versions for a day.
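
For illustration only (this code is not part of the branch), here is a minimal sketch of the naming scheme that by-hash publication uses: each index file listed in Release is also exposed under its content digests, so a client that already has the checksums from Release can fetch indexes by hash and never see a mismatch while the archive is being updated. The paths and content below are hypothetical; the real logic is in ByHash.add() in the diff.

    import hashlib
    import os

    def by_hash_entry(index_dir, index_content):
        # apt with Acquire-By-Hash fetches indexes from
        # <index_dir>/by-hash/<HashName>/<digest> rather than by file name.
        digest = hashlib.sha256(index_content).hexdigest()
        return os.path.join(index_dir, "by-hash", "SHA256", digest)

    # e.g. dists/breezy-autotest/main/source/by-hash/SHA256/<digest of Sources.gz>
    print(by_hash_entry("dists/breezy-autotest/main/source", b"...index data..."))

Keeping superseded entries for a day (BY_HASH_STAY_OF_EXECUTION in the diff) means that clients working from a slightly stale Release file can still resolve every hash it references.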

DistroSeries.publish_by_hash is useful because it lets us enable this only for series whose version of apt can make use of it; it also serves as a circuit breaker in case something goes wrong.
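
To make this concrete, here is a standalone sketch of how publish_by_hash and its companion switch advertise_by_hash (both used in the diff below) gate the new behaviour. The helper name and the update_by_hash callable are hypothetical; the attribute names and the Release field match the real code.

    def maybe_publish_by_hash(distroseries, suite, release_file, update_by_hash):
        # Circuit breaker: no by-hash trees are written or maintained unless
        # the series opts in.
        if distroseries.publish_by_hash:
            update_by_hash(suite, release_file)
        # Advertising is a separate switch, so a series can publish by-hash
        # directories for testing before apt clients are told to use them.
        if distroseries.advertise_by_hash:
            release_file["Acquire-By-Hash"] = "yes"

Turning publish_by_hash off again simply skips this step for the series, which is what makes it usable as a circuit breaker.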

Revision history for this message
William Grant (wgrant) :
review: Needs Fixing (code)
Revision history for this message
William Grant (wgrant) :
review: Needs Fixing (code)
Revision history for this message
Colin Watson (cjwatson) wrote :

Should be worth another look now.

Revision history for this message
William Grant (wgrant) :
review: Approve (code)

Preview Diff

1=== modified file 'lib/lp/archivepublisher/model/ftparchive.py'
2--- lib/lp/archivepublisher/model/ftparchive.py 2016-02-09 15:51:19 +0000
3+++ lib/lp/archivepublisher/model/ftparchive.py 2016-04-02 00:45:52 +0000
4@@ -54,10 +54,14 @@
5 """Ensure that the path exists and is an empty directory."""
6 if os.path.isdir(path):
7 for name in os.listdir(path):
8+ if name == "by-hash":
9+ # Ignore existing by-hash directories; they will be cleaned
10+ # up to match the rest of the directory tree later.
11+ continue
12 child_path = os.path.join(path, name)
13 # Directories containing index files should never have
14- # subdirectories. Guard against expensive mistakes by not
15- # recursing here.
16+ # subdirectories other than by-hash. Guard against expensive
17+ # mistakes by not recursing here.
18 os.unlink(child_path)
19 else:
20 os.makedirs(path, 0o755)
21
22=== modified file 'lib/lp/archivepublisher/publishing.py'
23--- lib/lp/archivepublisher/publishing.py 2016-03-30 09:17:31 +0000
24+++ lib/lp/archivepublisher/publishing.py 2016-04-02 00:45:52 +0000
25@@ -12,7 +12,11 @@
26 __metaclass__ = type
27
28 import bz2
29-from datetime import datetime
30+from collections import defaultdict
31+from datetime import (
32+ datetime,
33+ timedelta,
34+ )
35 import errno
36 import gzip
37 import hashlib
38@@ -31,6 +35,11 @@
39 )
40 from storm.expr import Desc
41 from zope.component import getUtility
42+from zope.interface import (
43+ Attribute,
44+ implementer,
45+ Interface,
46+ )
47
48 from lp.app.interfaces.launchpad import ILaunchpadCelebrities
49 from lp.archivepublisher import HARDCODED_COMPONENT_ORDER
50@@ -64,8 +73,12 @@
51 from lp.services.database.constants import UTC_NOW
52 from lp.services.database.interfaces import IStore
53 from lp.services.features import getFeatureFlag
54+from lp.services.helpers import filenameToContentType
55 from lp.services.librarian.client import LibrarianClient
56-from lp.services.osutils import open_for_writing
57+from lp.services.osutils import (
58+ ensure_directory_exists,
59+ open_for_writing,
60+ )
61 from lp.services.utils import file_exists
62 from lp.soyuz.enums import (
63 ArchivePurpose,
64@@ -74,6 +87,7 @@
65 PackagePublishingStatus,
66 )
67 from lp.soyuz.interfaces.archive import NoSuchPPA
68+from lp.soyuz.interfaces.archivefile import IArchiveFileSet
69 from lp.soyuz.interfaces.publishing import (
70 active_publishing_status,
71 IPublishingSet,
72@@ -96,6 +110,10 @@
73 }
74
75
76+# Number of days before unreferenced files are removed from by-hash.
77+BY_HASH_STAY_OF_EXECUTION = 1
78+
79+
80 def reorder_components(components):
81 """Return a list of the components provided.
82
83@@ -232,6 +250,152 @@
84 return max(len(str(item['size'])) for item in self[key])
85
86
87+class IArchiveHash(Interface):
88+ """Represents a hash algorithm used for index files."""
89+
90+ hash_factory = Attribute("A hashlib class suitable for this algorithm.")
91+ deb822_name = Attribute(
92+ "Algorithm name expected by debian.deb822.Release.")
93+ apt_name = Attribute(
94+ "Algorithm name used by apt in Release files and by-hash "
95+ "subdirectories.")
96+ lfc_name = Attribute(
97+ "LibraryFileContent attribute name corresponding to this algorithm.")
98+
99+
100+@implementer(IArchiveHash)
101+class MD5ArchiveHash:
102+ hash_factory = hashlib.md5
103+ deb822_name = "md5sum"
104+ apt_name = "MD5Sum"
105+ lfc_name = "md5"
106+
107+
108+@implementer(IArchiveHash)
109+class SHA1ArchiveHash:
110+ hash_factory = hashlib.sha1
111+ deb822_name = "sha1"
112+ apt_name = "SHA1"
113+ lfc_name = "sha1"
114+
115+
116+@implementer(IArchiveHash)
117+class SHA256ArchiveHash:
118+ hash_factory = hashlib.sha256
119+ deb822_name = "sha256"
120+ apt_name = "SHA256"
121+ lfc_name = "sha256"
122+
123+
124+archive_hashes = [
125+ MD5ArchiveHash(),
126+ SHA1ArchiveHash(),
127+ SHA256ArchiveHash(),
128+ ]
129+
130+
131+class ByHash:
132+ """Represents a single by-hash directory tree."""
133+
134+ def __init__(self, root, key, log):
135+ self.root = root
136+ self.path = os.path.join(root, key, "by-hash")
137+ self.log = log
138+ self.known_digests = defaultdict(lambda: defaultdict(set))
139+
140+ def add(self, name, lfa, copy_from_path=None):
141+ """Ensure that by-hash entries for a single file exist.
142+
143+ :param name: The name of the file under this directory tree.
144+ :param lfa: The `ILibraryFileAlias` to add.
145+ :param copy_from_path: If not None, copy file content from here
146+ rather than fetching it from the librarian. This can be used
147+ for newly-added files to avoid needing to commit the transaction
148+ before calling this method.
149+ """
150+ for archive_hash in archive_hashes:
151+ digest = getattr(lfa.content, archive_hash.lfc_name)
152+ digest_path = os.path.join(
153+ self.path, archive_hash.apt_name, digest)
154+ self.known_digests[archive_hash.apt_name][digest].add(name)
155+ if not os.path.exists(digest_path):
156+ self.log.debug(
157+ "by-hash: Creating %s for %s" % (digest_path, name))
158+ ensure_directory_exists(os.path.dirname(digest_path))
159+ if copy_from_path is not None:
160+ os.link(
161+ os.path.join(self.root, copy_from_path), digest_path)
162+ else:
163+ with open(digest_path, "wb") as outfile:
164+ lfa.open()
165+ try:
166+ shutil.copyfileobj(lfa, outfile, 4 * 1024 * 1024)
167+ finally:
168+ lfa.close()
169+
170+ def known(self, name, hashname, digest):
171+ """Do we know about a file with this name and digest?"""
172+ names = self.known_digests[hashname].get(digest)
173+ return names is not None and name in names
174+
175+ def prune(self):
176+ """Remove all by-hash entries that we have not been told to add.
177+
178+ This also removes the by-hash directory itself if no entries remain.
179+ """
180+ prune_directory = True
181+ for archive_hash in archive_hashes:
182+ hash_path = os.path.join(self.path, archive_hash.apt_name)
183+ if os.path.exists(hash_path):
184+ prune_hash_directory = True
185+ for digest in list(os.listdir(hash_path)):
186+ if digest not in self.known_digests[archive_hash.apt_name]:
187+ digest_path = os.path.join(hash_path, digest)
188+ self.log.debug(
189+ "by-hash: Deleting unreferenced %s" % digest_path)
190+ os.unlink(digest_path)
191+ else:
192+ prune_hash_directory = False
193+ if prune_hash_directory:
194+ os.rmdir(hash_path)
195+ else:
196+ prune_directory = False
197+ if prune_directory and os.path.exists(self.path):
198+ os.rmdir(self.path)
199+
200+
201+class ByHashes:
202+ """Represents all by-hash directory trees in an archive."""
203+
204+ def __init__(self, root, log):
205+ self.root = root
206+ self.log = log
207+ self.children = {}
208+
209+ def registerChild(self, dirpath):
210+ """Register a single by-hash directory.
211+
212+ Only directories that have been registered here will be pruned by
213+ the `prune` method.
214+ """
215+ if dirpath not in self.children:
216+ self.children[dirpath] = ByHash(self.root, dirpath, self.log)
217+ return self.children[dirpath]
218+
219+ def add(self, path, lfa, copy_from_path=None):
220+ dirpath, name = os.path.split(path)
221+ self.registerChild(dirpath).add(
222+ name, lfa, copy_from_path=copy_from_path)
223+
224+ def known(self, path, hashname, digest):
225+ dirpath, name = os.path.split(path)
226+ return self.registerChild(dirpath).known(name, hashname, digest)
227+
228+ def prune(self):
229+ for child in self.children.values():
230+ child.prune()
231+
232+
233 class Publisher(object):
234 """Publisher is the class used to provide the facility to publish
235 files in the pool of a Distribution. The publisher objects will be
236@@ -567,10 +731,20 @@
237 Otherwise we include only pockets flagged as true in dirty_pockets.
238 """
239 self.log.debug("* Step D: Generating Release files.")
240+
241+ archive_file_suites = set()
242+ for container in getUtility(IArchiveFileSet).getContainersToReap(
243+ self.archive, container_prefix=u"release:"):
244+ distroseries, pocket = self.distro.getDistroSeriesAndPocket(
245+ container[len(u"release:"):])
246+ archive_file_suites.add((distroseries, pocket))
247+ self.release_files_needed.update(archive_file_suites)
248+
249 for distroseries in self.distro:
250 for pocket in self.archive.getPockets():
251 if not is_careful:
252- if not self.isDirty(distroseries, pocket):
253+ if (not self.isDirty(distroseries, pocket) and
254+ (distroseries, pocket) not in archive_file_suites):
255 self.log.debug("Skipping release files for %s/%s" %
256 (distroseries.name, pocket.name))
257 continue
258@@ -811,6 +985,95 @@
259 return self.distro.displayname
260 return "LP-PPA-%s" % get_ppa_reference(self.archive)
261
262+ def _updateByHash(self, suite, release_data):
263+ """Update by-hash files for a suite.
264+
265+ This takes Release file data which references a set of on-disk
266+ files, injects any newly-modified files from that set into the
267+ librarian and the ArchiveFile table, and updates the on-disk by-hash
268+ directories to be in sync with ArchiveFile. Any on-disk by-hash
269+ entries that ceased to be current sufficiently long ago are removed.
270+ """
271+ archive_file_set = getUtility(IArchiveFileSet)
272+ by_hashes = ByHashes(self._config.archiveroot, self.log)
273+ suite_dir = os.path.relpath(
274+ os.path.join(self._config.distsroot, suite),
275+ self._config.archiveroot)
276+ container = "release:%s" % suite
277+
278+ # Gather information on entries in the current Release file, and
279+ # make sure nothing there is condemned.
280+ current_files = {}
281+ current_sha256_checksums = set()
282+ for current_entry in release_data["SHA256"]:
283+ path = os.path.join(suite_dir, current_entry["name"])
284+ current_files[path] = (
285+ current_entry["size"], current_entry["sha256"])
286+ current_sha256_checksums.add(current_entry["sha256"])
287+ for container, path, sha256 in archive_file_set.unscheduleDeletion(
288+ self.archive, container=container,
289+ sha256_checksums=current_sha256_checksums):
290+ self.log.debug(
291+ "by-hash: Unscheduled %s for %s in %s for deletion" % (
292+ sha256, path, container))
293+
294+ # Remove any condemned files from the database whose stay of
295+ # execution has elapsed. We ensure that we know about all the
296+ # relevant by-hash directory trees before doing any removals so that
297+ # we can prune them properly later.
298+ for db_file in archive_file_set.getByArchive(
299+ self.archive, container=container):
300+ by_hashes.registerChild(os.path.dirname(db_file.path))
301+ for container, path, sha256 in archive_file_set.reap(
302+ self.archive, container=container):
303+ self.log.debug(
304+ "by-hash: Deleted %s for %s in %s" % (sha256, path, container))
305+
306+ # Ensure that all files recorded in the database are in by-hash.
307+ db_files = archive_file_set.getByArchive(
308+ self.archive, container=container, eager_load=True)
309+ for db_file in db_files:
310+ by_hashes.add(db_file.path, db_file.library_file)
311+
312+ # Condemn any database records that do not correspond to current
313+ # index files.
314+ condemned_files = set()
315+ for db_file in db_files:
316+ if db_file.scheduled_deletion_date is None:
317+ path = db_file.path
318+ if path in current_files:
319+ current_sha256 = current_files[path][1]
320+ else:
321+ current_sha256 = None
322+ if db_file.library_file.content.sha256 != current_sha256:
323+ condemned_files.add(db_file)
324+ if condemned_files:
325+ for container, path, sha256 in archive_file_set.scheduleDeletion(
326+ condemned_files,
327+ timedelta(days=BY_HASH_STAY_OF_EXECUTION)):
328+ self.log.debug(
329+ "by-hash: Scheduled %s for %s in %s for deletion" % (
330+ sha256, path, container))
331+
332+ # Ensure that all the current index files are in by-hash and have
333+ # corresponding database entries.
334+ # XXX cjwatson 2016-03-15: This should possibly use bulk creation,
335+ # although we can only avoid about a third of the queries since the
336+ # librarian client has no bulk upload methods.
337+ for path, (size, sha256) in current_files.items():
338+ full_path = os.path.join(self._config.archiveroot, path)
339+ if (os.path.exists(full_path) and
340+ not by_hashes.known(path, "SHA256", sha256)):
341+ with open(full_path, "rb") as fileobj:
342+ db_file = archive_file_set.newFromFile(
343+ self.archive, container, path, fileobj,
344+ size, filenameToContentType(path))
345+ by_hashes.add(path, db_file.library_file, copy_from_path=path)
346+
347+ # Finally, remove any files from disk that aren't recorded in the
348+ # database and aren't active.
349+ by_hashes.prune()
350+
351 def _writeReleaseFile(self, suite, release_data):
352 """Write a Release file to the archive.
353
354@@ -919,9 +1182,14 @@
355 hashes = self._readIndexFileHashes(suite, filename)
356 if hashes is None:
357 continue
358- release_file.setdefault("MD5Sum", []).append(hashes["md5sum"])
359- release_file.setdefault("SHA1", []).append(hashes["sha1"])
360- release_file.setdefault("SHA256", []).append(hashes["sha256"])
361+ for archive_hash in archive_hashes:
362+ release_file.setdefault(archive_hash.apt_name, []).append(
363+ hashes[archive_hash.deb822_name])
364+
365+ if distroseries.publish_by_hash:
366+ self._updateByHash(suite, release_file)
367+ if distroseries.advertise_by_hash:
368+ release_file["Acquire-By-Hash"] = "yes"
369
370 self._writeReleaseFile(suite, release_file)
371 core_files.add("Release")
372@@ -1041,16 +1309,14 @@
373 # Schedule this for inclusion in the Release file.
374 all_series_files.add(os.path.join(component, "i18n", "Index"))
375
376- def _readIndexFileHashes(self, distroseries_name, file_name,
377- subpath=None):
378+ def _readIndexFileHashes(self, suite, file_name, subpath=None):
379 """Read an index file and return its hashes.
380
381- :param distroseries_name: Distro series name
382+ :param suite: Suite name.
383 :param file_name: Filename relative to the parent container directory.
384- :param subpath: Optional subpath within the distroseries root.
385- Generated indexes will not include this path. If omitted,
386- filenames are assumed to be relative to the distroseries
387- root.
388+ :param subpath: Optional subpath within the suite root. Generated
389+ indexes will not include this path. If omitted, filenames are
390+ assumed to be relative to the suite root.
391 :return: A dictionary mapping hash field names to dictionaries of
392 their components as defined by debian.deb822.Release (e.g.
393 {"md5sum": {"md5sum": ..., "size": ..., "name": ...}}), or None
394@@ -1058,8 +1324,7 @@
395 """
396 open_func = open
397 full_name = os.path.join(
398- self._config.distsroot, distroseries_name, subpath or '.',
399- file_name)
400+ self._config.distsroot, suite, subpath or '.', file_name)
401 if not os.path.exists(full_name):
402 if os.path.exists(full_name + '.gz'):
403 open_func = gzip.open
404@@ -1075,10 +1340,8 @@
405 return None
406
407 hashes = {
408- "md5sum": hashlib.md5(),
409- "sha1": hashlib.sha1(),
410- "sha256": hashlib.sha256(),
411- }
412+ archive_hash.deb822_name: archive_hash.hash_factory()
413+ for archive_hash in archive_hashes}
414 size = 0
415 with open_func(full_name) as in_file:
416 for chunk in iter(lambda: in_file.read(256 * 1024), ""):
417
418=== modified file 'lib/lp/archivepublisher/tests/test_publisher.py'
419--- lib/lp/archivepublisher/tests/test_publisher.py 2016-03-30 09:17:31 +0000
420+++ lib/lp/archivepublisher/tests/test_publisher.py 2016-04-02 00:45:52 +0000
421@@ -7,9 +7,14 @@
422
423 import bz2
424 import crypt
425+from datetime import (
426+ datetime,
427+ timedelta,
428+ )
429 from functools import partial
430 import gzip
431 import hashlib
432+from operator import attrgetter
433 import os
434 import shutil
435 import stat
436@@ -22,9 +27,20 @@
437 import lzma
438 except ImportError:
439 from backports import lzma
440+import pytz
441 from testtools.matchers import (
442 ContainsAll,
443+ DirContains,
444+ Equals,
445+ FileContains,
446+ Is,
447 LessThan,
448+ Matcher,
449+ MatchesListwise,
450+ MatchesSetwise,
451+ MatchesStructure,
452+ Not,
453+ PathExists,
454 )
455 import transaction
456 from zope.component import getUtility
457@@ -36,6 +52,8 @@
458 IArchiveSigningKey,
459 )
460 from lp.archivepublisher.publishing import (
461+ ByHash,
462+ ByHashes,
463 getPublisher,
464 I18nIndex,
465 Publisher,
466@@ -51,6 +69,7 @@
467 from lp.registry.interfaces.series import SeriesStatus
468 from lp.services.config import config
469 from lp.services.database.constants import UTC_NOW
470+from lp.services.database.sqlbase import flush_database_caches
471 from lp.services.features import getFeatureFlag
472 from lp.services.features.testing import FeatureFixture
473 from lp.services.gpg.interfaces import IGPGHandler
474@@ -69,12 +88,16 @@
475 PackageUploadStatus,
476 )
477 from lp.soyuz.interfaces.archive import IArchiveSet
478+from lp.soyuz.interfaces.archivefile import IArchiveFileSet
479 from lp.soyuz.tests.test_publishing import TestNativePublishingBase
480 from lp.testing import TestCaseWithFactory
481 from lp.testing.fakemethod import FakeMethod
482 from lp.testing.gpgkeys import gpgkeysdir
483 from lp.testing.keyserver import KeyServerTac
484-from lp.testing.layers import ZopelessDatabaseLayer
485+from lp.testing.layers import (
486+ LaunchpadZopelessLayer,
487+ ZopelessDatabaseLayer,
488+ )
489
490
491 RELEASE = PackagePublishingPocket.RELEASE
492@@ -424,6 +447,226 @@
493 'i386', publications[0].distroarchseries.architecturetag)
494
495
496+class ByHashHasContents(Matcher):
497+ """Matches if a by-hash directory has exactly the specified contents."""
498+
499+ def __init__(self, contents):
500+ self.contents = contents
501+
502+ def match(self, by_hash_path):
503+ mismatch = DirContains(["MD5Sum", "SHA1", "SHA256"]).match(
504+ by_hash_path)
505+ if mismatch is not None:
506+ return mismatch
507+ for hashname, hashattr in (
508+ ("MD5Sum", "md5"), ("SHA1", "sha1"), ("SHA256", "sha256")):
509+ digests = {
510+ getattr(hashlib, hashattr)(content).hexdigest(): content
511+ for content in self.contents}
512+ path = os.path.join(by_hash_path, hashname)
513+ mismatch = DirContains(digests.keys()).match(path)
514+ if mismatch is not None:
515+ return mismatch
516+ for digest, content in digests.items():
517+ mismatch = FileContains(content).match(
518+ os.path.join(path, digest))
519+ if mismatch is not None:
520+ return mismatch
521+
522+
523+class ByHashesHaveContents(Matcher):
524+ """Matches if only these by-hash directories exist with proper contents."""
525+
526+ def __init__(self, path_contents):
527+ self.path_contents = path_contents
528+
529+ def match(self, root):
530+ children = set()
531+ for dirpath, dirnames, _ in os.walk(root):
532+ if "by-hash" in dirnames:
533+ children.add(os.path.relpath(dirpath, root))
534+ mismatch = MatchesSetwise(
535+ *(Equals(path) for path in self.path_contents)).match(children)
536+ if mismatch is not None:
537+ return mismatch
538+ for path, contents in self.path_contents.items():
539+ by_hash_path = os.path.join(root, path, "by-hash")
540+ mismatch = ByHashHasContents(contents).match(by_hash_path)
541+ if mismatch is not None:
542+ return mismatch
543+
544+
545+class TestByHash(TestCaseWithFactory):
546+ """Unit tests for details of handling a single by-hash directory tree."""
547+
548+ layer = LaunchpadZopelessLayer
549+
550+ def test_add(self):
551+ root = self.makeTemporaryDirectory()
552+ contents = ["abc\n", "def\n"]
553+ lfas = [
554+ self.factory.makeLibraryFileAlias(content=content)
555+ for content in contents]
556+ transaction.commit()
557+ by_hash = ByHash(root, "dists/foo/main/source", DevNullLogger())
558+ for lfa in lfas:
559+ by_hash.add("Sources", lfa)
560+ by_hash_path = os.path.join(root, "dists/foo/main/source/by-hash")
561+ self.assertThat(by_hash_path, ByHashHasContents(contents))
562+
563+ def test_add_copy_from_path(self):
564+ root = self.makeTemporaryDirectory()
565+ content = "abc\n"
566+ sources_path = "dists/foo/main/source/Sources"
567+ with open_for_writing(
568+ os.path.join(root, sources_path), "w") as sources:
569+ sources.write(content)
570+ lfa = self.factory.makeLibraryFileAlias(content=content, db_only=True)
571+ by_hash = ByHash(root, "dists/foo/main/source", DevNullLogger())
572+ by_hash.add("Sources", lfa, copy_from_path=sources_path)
573+ by_hash_path = os.path.join(root, "dists/foo/main/source/by-hash")
574+ self.assertThat(by_hash_path, ByHashHasContents([content]))
575+
576+ def test_add_existing(self):
577+ root = self.makeTemporaryDirectory()
578+ content = "abc\n"
579+ lfa = self.factory.makeLibraryFileAlias(content=content)
580+ by_hash_path = os.path.join(root, "dists/foo/main/source/by-hash")
581+ for hashname, hashattr in (
582+ ("MD5Sum", "md5"), ("SHA1", "sha1"), ("SHA256", "sha256")):
583+ digest = getattr(hashlib, hashattr)(content).hexdigest()
584+ with open_for_writing(
585+ os.path.join(by_hash_path, hashname, digest), "w") as f:
586+ f.write(content)
587+ by_hash = ByHash(root, "dists/foo/main/source", DevNullLogger())
588+ self.assertThat(by_hash_path, ByHashHasContents([content]))
589+ by_hash.add("Sources", lfa)
590+ self.assertThat(by_hash_path, ByHashHasContents([content]))
591+
592+ def test_known(self):
593+ root = self.makeTemporaryDirectory()
594+ content = "abc\n"
595+ with open_for_writing(os.path.join(root, "abc"), "w") as f:
596+ f.write(content)
597+ lfa = self.factory.makeLibraryFileAlias(content=content, db_only=True)
598+ by_hash = ByHash(root, "", DevNullLogger())
599+ md5 = hashlib.md5(content).hexdigest()
600+ sha1 = hashlib.sha1(content).hexdigest()
601+ sha256 = hashlib.sha256(content).hexdigest()
602+ self.assertFalse(by_hash.known("abc", "MD5Sum", md5))
603+ self.assertFalse(by_hash.known("abc", "SHA1", sha1))
604+ self.assertFalse(by_hash.known("abc", "SHA256", sha256))
605+ by_hash.add("abc", lfa, copy_from_path="abc")
606+ self.assertTrue(by_hash.known("abc", "MD5Sum", md5))
607+ self.assertTrue(by_hash.known("abc", "SHA1", sha1))
608+ self.assertTrue(by_hash.known("abc", "SHA256", sha256))
609+ self.assertFalse(by_hash.known("def", "SHA256", sha256))
610+ by_hash.add("def", lfa, copy_from_path="abc")
611+ self.assertTrue(by_hash.known("def", "SHA256", sha256))
612+
613+ def test_prune(self):
614+ root = self.makeTemporaryDirectory()
615+ content = "abc\n"
616+ sources_path = "dists/foo/main/source/Sources"
617+ with open_for_writing(os.path.join(root, sources_path), "w") as f:
618+ f.write(content)
619+ lfa = self.factory.makeLibraryFileAlias(content=content, db_only=True)
620+ by_hash = ByHash(root, "dists/foo/main/source", DevNullLogger())
621+ by_hash.add("Sources", lfa, copy_from_path=sources_path)
622+ by_hash_path = os.path.join(root, "dists/foo/main/source/by-hash")
623+ with open_for_writing(os.path.join(by_hash_path, "MD5Sum/0"), "w"):
624+ pass
625+ self.assertThat(by_hash_path, Not(ByHashHasContents([content])))
626+ by_hash.prune()
627+ self.assertThat(by_hash_path, ByHashHasContents([content]))
628+
629+ def test_prune_empty(self):
630+ root = self.makeTemporaryDirectory()
631+ by_hash = ByHash(root, "dists/foo/main/source", DevNullLogger())
632+ by_hash_path = os.path.join(root, "dists/foo/main/source/by-hash")
633+ with open_for_writing(os.path.join(by_hash_path, "MD5Sum/0"), "w"):
634+ pass
635+ self.assertThat(by_hash_path, PathExists())
636+ by_hash.prune()
637+ self.assertThat(by_hash_path, Not(PathExists()))
638+
639+
640+class TestByHashes(TestCaseWithFactory):
641+ """Unit tests for details of handling a set of by-hash directory trees."""
642+
643+ layer = LaunchpadZopelessLayer
644+
645+ def test_add(self):
646+ root = self.makeTemporaryDirectory()
647+ self.assertThat(root, ByHashesHaveContents({}))
648+ path_contents = {
649+ "dists/foo/main/source": {"Sources": "abc\n"},
650+ "dists/foo/main/binary-amd64": {
651+ "Packages.gz": "def\n", "Packages.xz": "ghi\n"},
652+ }
653+ by_hashes = ByHashes(root, DevNullLogger())
654+ for dirpath, contents in path_contents.items():
655+ for name, content in contents.items():
656+ path = os.path.join(dirpath, name)
657+ with open_for_writing(os.path.join(root, path), "w") as f:
658+ f.write(content)
659+ lfa = self.factory.makeLibraryFileAlias(
660+ content=content, db_only=True)
661+ by_hashes.add(path, lfa, copy_from_path=path)
662+ self.assertThat(root, ByHashesHaveContents({
663+ path: contents.values()
664+ for path, contents in path_contents.items()}))
665+
666+ def test_known(self):
667+ root = self.makeTemporaryDirectory()
668+ content = "abc\n"
669+ sources_path = "dists/foo/main/source/Sources"
670+ with open_for_writing(os.path.join(root, sources_path), "w") as f:
671+ f.write(content)
672+ lfa = self.factory.makeLibraryFileAlias(content=content, db_only=True)
673+ by_hashes = ByHashes(root, DevNullLogger())
674+ md5 = hashlib.md5(content).hexdigest()
675+ sha1 = hashlib.sha1(content).hexdigest()
676+ sha256 = hashlib.sha256(content).hexdigest()
677+ self.assertFalse(by_hashes.known(sources_path, "MD5Sum", md5))
678+ self.assertFalse(by_hashes.known(sources_path, "SHA1", sha1))
679+ self.assertFalse(by_hashes.known(sources_path, "SHA256", sha256))
680+ by_hashes.add(sources_path, lfa, copy_from_path=sources_path)
681+ self.assertTrue(by_hashes.known(sources_path, "MD5Sum", md5))
682+ self.assertTrue(by_hashes.known(sources_path, "SHA1", sha1))
683+ self.assertTrue(by_hashes.known(sources_path, "SHA256", sha256))
684+
685+ def test_prune(self):
686+ root = self.makeTemporaryDirectory()
687+ path_contents = {
688+ "dists/foo/main/source": {"Sources": "abc\n"},
689+ "dists/foo/main/binary-amd64": {
690+ "Packages.gz": "def\n", "Packages.xz": "ghi\n"},
691+ }
692+ by_hashes = ByHashes(root, DevNullLogger())
693+ for dirpath, contents in path_contents.items():
694+ for name, content in contents.items():
695+ path = os.path.join(dirpath, name)
696+ with open_for_writing(os.path.join(root, path), "w") as f:
697+ f.write(content)
698+ lfa = self.factory.makeLibraryFileAlias(
699+ content=content, db_only=True)
700+ by_hashes.add(path, lfa, copy_from_path=path)
701+ strays = [
702+ "dists/foo/main/source/by-hash/MD5Sum/0",
703+ "dists/foo/main/binary-amd64/by-hash/MD5Sum/0",
704+ ]
705+ for stray in strays:
706+ with open_for_writing(os.path.join(root, stray), "w"):
707+ pass
708+ matcher = ByHashesHaveContents({
709+ path: contents.values()
710+ for path, contents in path_contents.items()})
711+ self.assertThat(root, Not(matcher))
712+ by_hashes.prune()
713+ self.assertThat(root, matcher)
714+
715+
716 class TestPublisher(TestPublisherBase):
717 """Testing `Publisher` behaviour."""
718
719@@ -1018,6 +1261,22 @@
720 self.assertEqual(
721 1 + old_num_pending_archives, new_num_pending_archives)
722
723+ def testPendingArchiveWithReapableFiles(self):
724+ # getPendingPublicationPPAs returns archives that have reapable
725+ # ArchiveFiles.
726+ ubuntu = getUtility(IDistributionSet)['ubuntu']
727+ archive = self.factory.makeArchive()
728+ self.assertNotIn(archive, ubuntu.getPendingPublicationPPAs())
729+ archive_file = self.factory.makeArchiveFile(archive=archive)
730+ self.assertNotIn(archive, ubuntu.getPendingPublicationPPAs())
731+ now = datetime.now(pytz.UTC)
732+ removeSecurityProxy(archive_file).scheduled_deletion_date = (
733+ now + timedelta(hours=12))
734+ self.assertNotIn(archive, ubuntu.getPendingPublicationPPAs())
735+ removeSecurityProxy(archive_file).scheduled_deletion_date = (
736+ now - timedelta(hours=12))
737+ self.assertIn(archive, ubuntu.getPendingPublicationPPAs())
738+
739 def _checkCompressedFiles(self, archive_publisher, base_file_path,
740 suffixes):
741 """Assert that the various compressed versions of a file are equal.
742@@ -1930,6 +2189,345 @@
743 'Release')
744 self.assertTrue(file_exists(source_release))
745
746+ def testUpdateByHashDisabled(self):
747+ # The publisher does not create by-hash directories if it is
748+ # disabled in the series configuration.
749+ self.assertFalse(self.breezy_autotest.publish_by_hash)
750+ self.assertFalse(self.breezy_autotest.advertise_by_hash)
751+ publisher = Publisher(
752+ self.logger, self.config, self.disk_pool,
753+ self.ubuntutest.main_archive)
754+
755+ self.getPubSource(filecontent='Source: foo\n')
756+
757+ publisher.A_publish(False)
758+ publisher.C_doFTPArchive(False)
759+ publisher.D_writeReleaseFiles(False)
760+
761+ suite_path = partial(
762+ os.path.join, self.config.distsroot, 'breezy-autotest')
763+ self.assertThat(
764+ suite_path('main', 'source', 'by-hash'), Not(PathExists()))
765+ release = self.parseRelease(suite_path('Release'))
766+ self.assertNotIn('Acquire-By-Hash', release)
767+
768+ def testUpdateByHashUnadvertised(self):
769+ # If the series configuration sets publish_by_hash but not
770+ # advertise_by_hash, then by-hash directories are created but not
771+ # advertised in Release. This is useful for testing.
772+ self.breezy_autotest.publish_by_hash = True
773+ self.assertFalse(self.breezy_autotest.advertise_by_hash)
774+ publisher = Publisher(
775+ self.logger, self.config, self.disk_pool,
776+ self.ubuntutest.main_archive)
777+
778+ self.getPubSource(filecontent='Source: foo\n')
779+
780+ publisher.A_publish(False)
781+ publisher.C_doFTPArchive(False)
782+ publisher.D_writeReleaseFiles(False)
783+
784+ suite_path = partial(
785+ os.path.join, self.config.distsroot, 'breezy-autotest')
786+ self.assertThat(suite_path('main', 'source', 'by-hash'), PathExists())
787+ release = self.parseRelease(suite_path('Release'))
788+ self.assertNotIn('Acquire-By-Hash', release)
789+
790+ def testUpdateByHashInitial(self):
791+ # An initial publisher run populates by-hash directories and leaves
792+ # no archive files scheduled for deletion.
793+ self.breezy_autotest.publish_by_hash = True
794+ self.breezy_autotest.advertise_by_hash = True
795+ publisher = Publisher(
796+ self.logger, self.config, self.disk_pool,
797+ self.ubuntutest.main_archive)
798+
799+ self.getPubSource(filecontent='Source: foo\n')
800+
801+ publisher.A_publish(False)
802+ publisher.C_doFTPArchive(False)
803+ publisher.D_writeReleaseFiles(False)
804+ flush_database_caches()
805+
806+ suite_path = partial(
807+ os.path.join, self.config.distsroot, 'breezy-autotest')
808+ contents = set()
809+ for name in ('Release', 'Sources.gz', 'Sources.bz2'):
810+ with open(suite_path('main', 'source', name), 'rb') as f:
811+ contents.add(f.read())
812+
813+ self.assertThat(
814+ suite_path('main', 'source', 'by-hash'),
815+ ByHashHasContents(contents))
816+
817+ archive_files = getUtility(IArchiveFileSet).getByArchive(
818+ self.ubuntutest.main_archive)
819+ self.assertNotEqual([], archive_files)
820+ self.assertEqual([], [
821+ archive_file for archive_file in archive_files
822+ if archive_file.scheduled_deletion_date is not None])
823+
824+ def testUpdateByHashSubsequent(self):
825+ # A subsequent publisher run updates by-hash directories where
826+ # necessary, and marks inactive index files for later deletion.
827+ self.breezy_autotest.publish_by_hash = True
828+ self.breezy_autotest.advertise_by_hash = True
829+ publisher = Publisher(
830+ self.logger, self.config, self.disk_pool,
831+ self.ubuntutest.main_archive)
832+
833+ self.getPubSource(filecontent='Source: foo\n')
834+
835+ publisher.A_publish(False)
836+ publisher.C_doFTPArchive(False)
837+ publisher.D_writeReleaseFiles(False)
838+
839+ suite_path = partial(
840+ os.path.join, self.config.distsroot, 'breezy-autotest')
841+ main_contents = set()
842+ universe_contents = set()
843+ for name in ('Release', 'Sources.gz', 'Sources.bz2'):
844+ with open(suite_path('main', 'source', name), 'rb') as f:
845+ main_contents.add(f.read())
846+ with open(suite_path('universe', 'source', name), 'rb') as f:
847+ universe_contents.add(f.read())
848+
849+ self.getPubSource(sourcename='baz', filecontent='Source: baz\n')
850+
851+ publisher.A_publish(False)
852+ publisher.C_doFTPArchive(False)
853+ publisher.D_writeReleaseFiles(False)
854+ flush_database_caches()
855+
856+ for name in ('Release', 'Sources.gz', 'Sources.bz2'):
857+ with open(suite_path('main', 'source', name), 'rb') as f:
858+ main_contents.add(f.read())
859+
860+ self.assertThat(
861+ suite_path('main', 'source', 'by-hash'),
862+ ByHashHasContents(main_contents))
863+ self.assertThat(
864+ suite_path('universe', 'source', 'by-hash'),
865+ ByHashHasContents(universe_contents))
866+
867+ archive_files = getUtility(IArchiveFileSet).getByArchive(
868+ self.ubuntutest.main_archive)
869+ self.assertContentEqual(
870+ ['dists/breezy-autotest/main/source/Sources.bz2',
871+ 'dists/breezy-autotest/main/source/Sources.gz'],
872+ [archive_file.path for archive_file in archive_files
873+ if archive_file.scheduled_deletion_date is not None])
874+
875+ def testUpdateByHashIdenticalFiles(self):
876+ # Multiple identical files in the same directory receive multiple
877+ # ArchiveFile rows, even though they share a by-hash entry.
878+ self.breezy_autotest.publish_by_hash = True
879+ publisher = Publisher(
880+ self.logger, self.config, self.disk_pool,
881+ self.ubuntutest.main_archive)
882+ suite_path = partial(
883+ os.path.join, self.config.distsroot, 'breezy-autotest')
884+ get_contents_files = lambda: [
885+ archive_file
886+ for archive_file in getUtility(IArchiveFileSet).getByArchive(
887+ self.ubuntutest.main_archive)
888+ if archive_file.path.startswith('dists/breezy-autotest/Contents-')]
889+
890+ # Create the first file.
891+ with open_for_writing(suite_path('Contents-i386'), 'w') as f:
892+ f.write('A Contents file\n')
893+ publisher.markPocketDirty(
894+ self.breezy_autotest, PackagePublishingPocket.RELEASE)
895+ publisher.A_publish(False)
896+ publisher.C_doFTPArchive(False)
897+ publisher.D_writeReleaseFiles(False)
898+ flush_database_caches()
899+ matchers = [
900+ MatchesStructure(
901+ path=Equals('dists/breezy-autotest/Contents-i386'),
902+ scheduled_deletion_date=Is(None))]
903+ self.assertThat(get_contents_files(), MatchesSetwise(*matchers))
904+ self.assertThat(
905+ suite_path('by-hash'), ByHashHasContents(['A Contents file\n']))
906+
907+ # Add a second identical file.
908+ with open_for_writing(suite_path('Contents-hppa'), 'w') as f:
909+ f.write('A Contents file\n')
910+ publisher.D_writeReleaseFiles(False)
911+ flush_database_caches()
912+ matchers.append(
913+ MatchesStructure(
914+ path=Equals('dists/breezy-autotest/Contents-hppa'),
915+ scheduled_deletion_date=Is(None)))
916+ self.assertThat(get_contents_files(), MatchesSetwise(*matchers))
917+ self.assertThat(
918+ suite_path('by-hash'), ByHashHasContents(['A Contents file\n']))
919+
920+ # Delete the first file, but allow it its stay of execution.
921+ os.unlink(suite_path('Contents-i386'))
922+ publisher.D_writeReleaseFiles(False)
923+ flush_database_caches()
924+ matchers[0] = matchers[0].update(scheduled_deletion_date=Not(Is(None)))
925+ self.assertThat(get_contents_files(), MatchesSetwise(*matchers))
926+ self.assertThat(
927+ suite_path('by-hash'), ByHashHasContents(['A Contents file\n']))
928+
929+ # Arrange for the first file to be pruned, and delete the second
930+ # file.
931+ now = datetime.now(pytz.UTC)
932+ i386_file = getUtility(IArchiveFileSet).getByArchive(
933+ self.ubuntutest.main_archive,
934+ path=u'dists/breezy-autotest/Contents-i386').one()
935+ removeSecurityProxy(i386_file).scheduled_deletion_date = (
936+ now - timedelta(hours=1))
937+ os.unlink(suite_path('Contents-hppa'))
938+ publisher.D_writeReleaseFiles(False)
939+ flush_database_caches()
940+ matchers = [matchers[1].update(scheduled_deletion_date=Not(Is(None)))]
941+ self.assertThat(get_contents_files(), MatchesSetwise(*matchers))
942+ self.assertThat(
943+ suite_path('by-hash'), ByHashHasContents(['A Contents file\n']))
944+
945+ # Arrange for the second file to be pruned.
946+ hppa_file = getUtility(IArchiveFileSet).getByArchive(
947+ self.ubuntutest.main_archive,
948+ path=u'dists/breezy-autotest/Contents-hppa').one()
949+ removeSecurityProxy(hppa_file).scheduled_deletion_date = (
950+ now - timedelta(hours=1))
951+ publisher.D_writeReleaseFiles(False)
952+ flush_database_caches()
953+ self.assertContentEqual([], get_contents_files())
954+ self.assertThat(suite_path('by-hash'), Not(PathExists()))
955+
956+ def testUpdateByHashReprieve(self):
957+ # If a newly-modified index file is identical to a
958+ # previously-condemned one, then it is reprieved and not pruned.
959+ self.breezy_autotest.publish_by_hash = True
960+ # Enable uncompressed index files to avoid relying on stable output
961+ # from compressors in this test.
962+ self.breezy_autotest.index_compressors = [
963+ IndexCompressionType.UNCOMPRESSED]
964+ publisher = Publisher(
965+ self.logger, self.config, self.disk_pool,
966+ self.ubuntutest.main_archive)
967+
968+ # Publish empty index files.
969+ publisher.markPocketDirty(
970+ self.breezy_autotest, PackagePublishingPocket.RELEASE)
971+ publisher.A_publish(False)
972+ publisher.C_doFTPArchive(False)
973+ publisher.D_writeReleaseFiles(False)
974+ suite_path = partial(
975+ os.path.join, self.config.distsroot, 'breezy-autotest')
976+ main_contents = set()
977+ for name in ('Release', 'Sources'):
978+ with open(suite_path('main', 'source', name), 'rb') as f:
979+ main_contents.add(f.read())
980+
981+ # Add a source package so that Sources is non-empty.
982+ pub_source = self.getPubSource(filecontent='Source: foo\n')
983+ publisher.A_publish(False)
984+ publisher.C_doFTPArchive(False)
985+ publisher.D_writeReleaseFiles(False)
986+ transaction.commit()
987+ with open(suite_path('main', 'source', 'Sources'), 'rb') as f:
988+ main_contents.add(f.read())
989+ self.assertEqual(3, len(main_contents))
990+ self.assertThat(
991+ suite_path('main', 'source', 'by-hash'),
992+ ByHashHasContents(main_contents))
993+
994+ # Make the empty Sources file ready to prune.
995+ old_archive_files = []
996+ for archive_file in getUtility(IArchiveFileSet).getByArchive(
997+ self.ubuntutest.main_archive):
998+ if ('main/source' in archive_file.path and
999+ archive_file.scheduled_deletion_date is not None):
1000+ old_archive_files.append(archive_file)
1001+ self.assertEqual(1, len(old_archive_files))
1002+ removeSecurityProxy(old_archive_files[0]).scheduled_deletion_date = (
1003+ datetime.now(pytz.UTC) - timedelta(hours=1))
1004+
1005+ # Delete the source package so that Sources is empty again. The
1006+ # empty file is reprieved and the non-empty one is condemned.
1007+ pub_source.requestDeletion(self.ubuntutest.owner)
1008+ publisher.A_publish(False)
1009+ publisher.C_doFTPArchive(False)
1010+ publisher.D_writeReleaseFiles(False)
1011+ transaction.commit()
1012+ self.assertThat(
1013+ suite_path('main', 'source', 'by-hash'),
1014+ ByHashHasContents(main_contents))
1015+ archive_files = getUtility(IArchiveFileSet).getByArchive(
1016+ self.ubuntutest.main_archive,
1017+ path=u'dists/breezy-autotest/main/source/Sources')
1018+ self.assertThat(
1019+ sorted(archive_files, key=attrgetter('id')),
1020+ MatchesListwise([
1021+ MatchesStructure(scheduled_deletion_date=Is(None)),
1022+ MatchesStructure(scheduled_deletion_date=Not(Is(None))),
1023+ ]))
1024+
1025+ def testUpdateByHashPrune(self):
1026+ # The publisher prunes files from by-hash that were condemned more
1027+ # than a day ago.
1028+ self.breezy_autotest.publish_by_hash = True
1029+ self.breezy_autotest.advertise_by_hash = True
1030+ publisher = Publisher(
1031+ self.logger, self.config, self.disk_pool,
1032+ self.ubuntutest.main_archive)
1033+
1034+ suite_path = partial(
1035+ os.path.join, self.config.distsroot, 'breezy-autotest')
1036+ main_contents = set()
1037+ for sourcename in ('foo', 'bar'):
1038+ self.getPubSource(
1039+ sourcename=sourcename, filecontent='Source: %s\n' % sourcename)
1040+ publisher.A_publish(False)
1041+ publisher.C_doFTPArchive(False)
1042+ publisher.D_writeReleaseFiles(False)
1043+ for name in ('Release', 'Sources.gz', 'Sources.bz2'):
1044+ with open(suite_path('main', 'source', name), 'rb') as f:
1045+ main_contents.add(f.read())
1046+ transaction.commit()
1047+ # Undo any previous determination that breezy-autotest is dirty, so
1048+ # that we can use that to check that future runs don't force index
1049+ # regeneration.
1050+ publisher.dirty_pockets = set()
1051+
1052+ self.assertThat(
1053+ suite_path('main', 'source', 'by-hash'),
1054+ ByHashHasContents(main_contents))
1055+ old_archive_files = []
1056+ for archive_file in getUtility(IArchiveFileSet).getByArchive(
1057+ self.ubuntutest.main_archive):
1058+ if ('main/source' in archive_file.path and
1059+ archive_file.scheduled_deletion_date is not None):
1060+ old_archive_files.append(archive_file)
1061+ self.assertEqual(2, len(old_archive_files))
1062+
1063+ now = datetime.now(pytz.UTC)
1064+ removeSecurityProxy(old_archive_files[0]).scheduled_deletion_date = (
1065+ now + timedelta(hours=12))
1066+ removeSecurityProxy(old_archive_files[1]).scheduled_deletion_date = (
1067+ now - timedelta(hours=12))
1068+ old_archive_files[1].library_file.open()
1069+ try:
1070+ main_contents.remove(old_archive_files[1].library_file.read())
1071+ finally:
1072+ old_archive_files[1].library_file.close()
1073+ self.assertThat(
1074+ suite_path('main', 'source', 'by-hash'),
1075+ Not(ByHashHasContents(main_contents)))
1076+
1077+ publisher.A2_markPocketsWithDeletionsDirty()
1078+ publisher.C_doFTPArchive(False)
1079+ publisher.D_writeReleaseFiles(False)
1080+ self.assertEqual(set(), publisher.dirty_pockets)
1081+ self.assertThat(
1082+ suite_path('main', 'source', 'by-hash'),
1083+ ByHashHasContents(main_contents))
1084+
1085 def testCreateSeriesAliasesNoAlias(self):
1086 """createSeriesAliases has nothing to do by default."""
1087 publisher = Publisher(
1088
1089=== modified file 'lib/lp/registry/model/distribution.py'
1090--- lib/lp/registry/model/distribution.py 2015-10-13 13:22:08 +0000
1091+++ lib/lp/registry/model/distribution.py 2016-04-02 00:45:52 +0000
1092@@ -1,4 +1,4 @@
1093-# Copyright 2009-2015 Canonical Ltd. This software is licensed under the
1094+# Copyright 2009-2016 Canonical Ltd. This software is licensed under the
1095 # GNU Affero General Public License version 3 (see the file LICENSE).
1096
1097 """Database classes for implementing distribution items."""
1098@@ -1283,10 +1283,22 @@
1099 bin_query, clauseTables=['BinaryPackagePublishingHistory'],
1100 orderBy=['archive.id'], distinct=True)
1101
1102+ reapable_af_query = """
1103+ Archive.purpose = %s AND
1104+ Archive.distribution = %s AND
1105+ ArchiveFile.archive = archive.id AND
1106+ ArchiveFile.scheduled_deletion_date < %s
1107+ """ % sqlvalues(ArchivePurpose.PPA, self, UTC_NOW)
1108+
1109+ reapable_af_archives = Archive.select(
1110+ reapable_af_query, clauseTables=['ArchiveFile'],
1111+ orderBy=['archive.id'], distinct=True)
1112+
1113 deleting_archives = Archive.selectBy(
1114 status=ArchiveStatus.DELETING).orderBy(['archive.id'])
1115
1116- return src_archives.union(bin_archives).union(deleting_archives)
1117+ return src_archives.union(bin_archives).union(
1118+ reapable_af_archives).union(deleting_archives)
1119
1120 def getArchiveByComponent(self, component_name):
1121 """See `IDistribution`."""
1122
1123=== modified file 'lib/lp/services/helpers.py'
1124--- lib/lp/services/helpers.py 2014-05-07 15:28:50 +0000
1125+++ lib/lp/services/helpers.py 2016-04-02 00:45:52 +0000
1126@@ -10,6 +10,7 @@
1127
1128 __metaclass__ = type
1129
1130+from collections import OrderedDict
1131 from difflib import unified_diff
1132 import re
1133 from StringIO import StringIO
1134@@ -224,19 +225,37 @@
1135
1136 >>> filenameToContentType('test.tgz')
1137 'application/octet-stream'
1138+
1139+ Build logs
1140+ >>> filenameToContentType('buildlog.txt.gz')
1141+ 'text/plain'
1142+
1143+ Various compressed files
1144+
1145+ >>> filenameToContentType('Packages.gz')
1146+ 'application/x-gzip'
1147+ >>> filenameToContentType('Packages.bz2')
1148+ 'application/x-bzip2'
1149+ >>> filenameToContentType('Packages.xz')
1150+ 'application/x-xz'
1151 """
1152- ftmap = {".dsc": "text/plain",
1153- ".changes": "text/plain",
1154- ".deb": "application/x-debian-package",
1155- ".udeb": "application/x-debian-package",
1156- ".txt": "text/plain",
1157- # For the build master logs
1158- ".txt.gz": "text/plain",
1159- # For live filesystem builds
1160- ".manifest": "text/plain",
1161- ".manifest-remove": "text/plain",
1162- ".size": "text/plain",
1163- }
1164+ ftmap = OrderedDict([
1165+ (".dsc", "text/plain"),
1166+ (".changes", "text/plain"),
1167+ (".deb", "application/x-debian-package"),
1168+ (".udeb", "application/x-debian-package"),
1169+ (".txt", "text/plain"),
1170+ # For the build master logs
1171+ (".txt.gz", "text/plain"),
1172+ # For live filesystem builds
1173+ (".manifest", "text/plain"),
1174+ (".manifest-remove", "text/plain"),
1175+ (".size", "text/plain"),
1176+ # Compressed files
1177+ (".gz", "application/x-gzip"),
1178+ (".bz2", "application/x-bzip2"),
1179+ (".xz", "application/x-xz"),
1180+ ])
1181 for ending in ftmap:
1182 if fname.endswith(ending):
1183 return ftmap[ending]
1184
1185=== modified file 'lib/lp/services/librarian/interfaces/__init__.py'
1186--- lib/lp/services/librarian/interfaces/__init__.py 2016-03-14 16:28:19 +0000
1187+++ lib/lp/services/librarian/interfaces/__init__.py 2016-04-02 00:45:52 +0000
1188@@ -155,7 +155,7 @@
1189 class ILibraryFileAliasSet(Interface):
1190
1191 def create(name, size, file, contentType, expires=None, debugID=None,
1192- restricted=False):
1193+ restricted=False, allow_zero_length=False):
1194 """Create a file in the Librarian, returning the new alias.
1195
1196 An expiry time of None means the file will never expire until it
1197
1198=== modified file 'lib/lp/services/librarian/model.py'
1199--- lib/lp/services/librarian/model.py 2016-03-14 16:28:19 +0000
1200+++ lib/lp/services/librarian/model.py 2016-04-02 00:45:52 +0000
1201@@ -244,7 +244,7 @@
1202 """Create and find LibraryFileAliases."""
1203
1204 def create(self, name, size, file, contentType, expires=None,
1205- debugID=None, restricted=False):
1206+ debugID=None, restricted=False, allow_zero_length=False):
1207 """See `ILibraryFileAliasSet`"""
1208 if restricted:
1209 client = getUtility(IRestrictedLibrarianClient)
1210@@ -252,7 +252,9 @@
1211 client = getUtility(ILibrarianClient)
1212 if '/' in name:
1213 raise InvalidFilename("Filename cannot contain slashes.")
1214- fid = client.addFile(name, size, file, contentType, expires, debugID)
1215+ fid = client.addFile(
1216+ name, size, file, contentType, expires=expires, debugID=debugID,
1217+ allow_zero_length=allow_zero_length)
1218 lfa = IMasterStore(LibraryFileAlias).find(
1219 LibraryFileAlias, LibraryFileAlias.id == fid).one()
1220 assert lfa is not None, "client.addFile didn't!"
1221
1222=== modified file 'lib/lp/soyuz/interfaces/archivefile.py'
1223--- lib/lp/soyuz/interfaces/archivefile.py 2016-03-18 15:09:37 +0000
1224+++ lib/lp/soyuz/interfaces/archivefile.py 2016-04-02 00:45:52 +0000
1225@@ -79,13 +79,15 @@
1226 :param content_type: The MIME type of the file.
1227 """
1228
1229- def getByArchive(archive, container=None, eager_load=False):
1230+ def getByArchive(archive, container=None, path=None, eager_load=False):
1231 """Get files in an archive.
1232
1233 :param archive: Return files in this `IArchive`.
1234 :param container: Return only files with this container.
1235+ :param path: Return only files with this path.
1236 :param eager_load: If True, preload related `LibraryFileAlias` and
1237 `LibraryFileContent` rows.
1238+ :return: An iterable of matched files.
1239 """
1240
1241 def scheduleDeletion(archive_files, stay_of_execution):
1242@@ -94,6 +96,25 @@
1243 :param archive_files: The `IArchiveFile`s to schedule for deletion.
1244 :param stay_of_execution: A `timedelta`; schedule files for deletion
1245 this amount of time in the future.
1246+ :return: An iterable of (container, path, sha256) for files that
1247+ were scheduled for deletion.
1248+ """
1249+
1250+ def unscheduleDeletion(archive, container=None, sha256_checksums=set()):
1251+ """Unschedule these archive files for deletion.
1252+
1253+ This is useful in the case when the new content of a file is
1254+ identical to a version that was previously condemned. This method's
1255+ signature does not match that of `scheduleDeletion`; this is more
1256+ convenient because in such cases we normally do not yet have
1257+ `ArchiveFile` rows in hand.
1258+
1259+ :param archive: Operate on files in this `IArchive`.
1260+ :param container: Operate only on files with this container.
1261+ :param sha256_checksums: Operate only on files with any of these
1262+ checksums.
1263+ :return: An iterable of (container, path, sha256) for files that
1264+ were unscheduled for deletion.
1265 """
1266
1267 def getContainersToReap(archive, container_prefix=None):
1268@@ -102,6 +123,7 @@
1269 :param archive: Return containers in this `IArchive`.
1270 :param container_prefix: Return only containers that start with this
1271 prefix.
1272+ :return: An iterable of matched container names.
1273 """
1274
1275 def reap(archive, container=None):
1276@@ -109,4 +131,6 @@
1277
1278 :param archive: Delete files from this `IArchive`.
1279 :param container: Delete only files with this container.
1280+ :return: An iterable of (container, path, sha256) for files that
1281+ were deleted.
1282 """
1283
1284=== modified file 'lib/lp/soyuz/model/archivefile.py'
1285--- lib/lp/soyuz/model/archivefile.py 2016-03-18 15:09:37 +0000
1286+++ lib/lp/soyuz/model/archivefile.py 2016-04-02 00:45:52 +0000
1287@@ -14,7 +14,9 @@
1288 import os.path
1289
1290 import pytz
1291+from storm.databases.postgres import Returning
1292 from storm.locals import (
1293+ And,
1294 DateTime,
1295 Int,
1296 Reference,
1297@@ -31,6 +33,7 @@
1298 IMasterStore,
1299 IStore,
1300 )
1301+from lp.services.database.stormexpr import BulkUpdate
1302 from lp.services.librarian.interfaces import ILibraryFileAliasSet
1303 from lp.services.librarian.model import (
1304 LibraryFileAlias,
1305@@ -89,17 +92,19 @@
1306 content_type):
1307 library_file = getUtility(ILibraryFileAliasSet).create(
1308 os.path.basename(path), size, fileobj, content_type,
1309- restricted=archive.private)
1310+ restricted=archive.private, allow_zero_length=True)
1311 return cls.new(archive, container, path, library_file)
1312
1313 @staticmethod
1314- def getByArchive(archive, container=None, eager_load=False):
1315+ def getByArchive(archive, container=None, path=None, eager_load=False):
1316 """See `IArchiveFileSet`."""
1317 clauses = [ArchiveFile.archive == archive]
1318 # XXX cjwatson 2016-03-15: We'll need some more sophisticated way to
1319 # match containers once we're using them for custom uploads.
1320 if container is not None:
1321 clauses.append(ArchiveFile.container == container)
1322+ if path is not None:
1323+ clauses.append(ArchiveFile.path == path)
1324 archive_files = IStore(ArchiveFile).find(ArchiveFile, *clauses)
1325
1326 def eager_load(rows):
1327@@ -114,11 +119,43 @@
1328 @staticmethod
1329 def scheduleDeletion(archive_files, stay_of_execution):
1330 """See `IArchiveFileSet`."""
1331- archive_file_ids = set(
1332- archive_file.id for archive_file in archive_files)
1333- rows = IMasterStore(ArchiveFile).find(
1334- ArchiveFile, ArchiveFile.id.is_in(archive_file_ids))
1335- rows.set(scheduled_deletion_date=UTC_NOW + stay_of_execution)
1336+ clauses = [
1337+ ArchiveFile.id.is_in(
1338+ set(archive_file.id for archive_file in archive_files)),
1339+ ArchiveFile.library_file == LibraryFileAlias.id,
1340+ LibraryFileAlias.content == LibraryFileContent.id,
1341+ ]
1342+ new_date = UTC_NOW + stay_of_execution
1343+ return_columns = [
1344+ ArchiveFile.container, ArchiveFile.path, LibraryFileContent.sha256]
1345+ return list(IMasterStore(ArchiveFile).execute(Returning(
1346+ BulkUpdate(
1347+ {ArchiveFile.scheduled_deletion_date: new_date},
1348+ table=ArchiveFile,
1349+ values=[LibraryFileAlias, LibraryFileContent],
1350+ where=And(*clauses)),
1351+ columns=return_columns)))
1352+
1353+ @staticmethod
1354+ def unscheduleDeletion(archive, container=None, sha256_checksums=set()):
1355+ """See `IArchiveFileSet`."""
1356+ clauses = [
1357+ ArchiveFile.archive == archive,
1358+ ArchiveFile.library_file == LibraryFileAlias.id,
1359+ LibraryFileAlias.content == LibraryFileContent.id,
1360+ LibraryFileContent.sha256.is_in(sha256_checksums),
1361+ ]
1362+ if container is not None:
1363+ clauses.append(ArchiveFile.container == container)
1364+ return_columns = [
1365+ ArchiveFile.container, ArchiveFile.path, LibraryFileContent.sha256]
1366+ return list(IMasterStore(ArchiveFile).execute(Returning(
1367+ BulkUpdate(
1368+ {ArchiveFile.scheduled_deletion_date: None},
1369+ table=ArchiveFile,
1370+ values=[LibraryFileAlias, LibraryFileContent],
1371+ where=And(*clauses)),
1372+ columns=return_columns)))
1373
1374 @staticmethod
1375 def getContainersToReap(archive, container_prefix=None):
1376@@ -134,10 +171,25 @@
1377 @staticmethod
1378 def reap(archive, container=None):
1379 """See `IArchiveFileSet`."""
1380+ # XXX cjwatson 2016-03-30 bug=322972: Requires manual SQL due to
1381+ # lack of support for DELETE FROM ... USING ... in Storm.
1382 clauses = [
1383- ArchiveFile.archive == archive,
1384- ArchiveFile.scheduled_deletion_date < UTC_NOW,
1385+ "ArchiveFile.archive = ?",
1386+ "ArchiveFile.scheduled_deletion_date < "
1387+ "CURRENT_TIMESTAMP AT TIME ZONE 'UTC'",
1388+ "ArchiveFile.library_file = LibraryFileAlias.id",
1389+ "LibraryFileAlias.content = LibraryFileContent.id",
1390 ]
1391+ values = [archive.id]
1392 if container is not None:
1393- clauses.append(ArchiveFile.container == container)
1394- IMasterStore(ArchiveFile).find(ArchiveFile, *clauses).remove()
1395+ clauses.append("ArchiveFile.container = ?")
1396+ values.append(container)
1397+ return list(IMasterStore(ArchiveFile).execute("""
1398+ DELETE FROM ArchiveFile
1399+ USING LibraryFileAlias, LibraryFileContent
1400+ WHERE """ + " AND ".join(clauses) + """
1401+ RETURNING
1402+ ArchiveFile.container,
1403+ ArchiveFile.path,
1404+ LibraryFileContent.sha256
1405+ """, values))
1406
1407=== modified file 'lib/lp/soyuz/tests/test_archivefile.py'
1408--- lib/lp/soyuz/tests/test_archivefile.py 2016-03-18 15:09:37 +0000
1409+++ lib/lp/soyuz/tests/test_archivefile.py 2016-04-02 00:45:52 +0000
1410@@ -19,6 +19,7 @@
1411 from zope.component import getUtility
1412 from zope.security.proxy import removeSecurityProxy
1413
1414+from lp.services.database.sqlbase import flush_database_caches
1415 from lp.services.osutils import open_for_writing
1416 from lp.soyuz.interfaces.archivefile import IArchiveFileSet
1417 from lp.testing import TestCaseWithFactory
1418@@ -75,17 +76,35 @@
1419 self.assertContentEqual(
1420 [], archive_file_set.getByArchive(archives[0], container="bar"))
1421 self.assertContentEqual(
1422+ [archive_files[1]],
1423+ archive_file_set.getByArchive(
1424+ archives[0], path=archive_files[1].path))
1425+ self.assertContentEqual(
1426+ [], archive_file_set.getByArchive(archives[0], path="other"))
1427+ self.assertContentEqual(
1428 archive_files[2:], archive_file_set.getByArchive(archives[1]))
1429 self.assertContentEqual(
1430 [archive_files[3]],
1431 archive_file_set.getByArchive(archives[1], container="foo"))
1432 self.assertContentEqual(
1433 [], archive_file_set.getByArchive(archives[1], container="bar"))
1434+ self.assertContentEqual(
1435+ [archive_files[3]],
1436+ archive_file_set.getByArchive(
1437+ archives[1], path=archive_files[3].path))
1438+ self.assertContentEqual(
1439+ [], archive_file_set.getByArchive(archives[1], path="other"))
1440
1441 def test_scheduleDeletion(self):
1442 archive_files = [self.factory.makeArchiveFile() for _ in range(3)]
1443- getUtility(IArchiveFileSet).scheduleDeletion(
1444+ expected_rows = [
1445+ (archive_file.container, archive_file.path,
1446+ archive_file.library_file.content.sha256)
1447+ for archive_file in archive_files[:2]]
1448+ rows = getUtility(IArchiveFileSet).scheduleDeletion(
1449 archive_files[:2], timedelta(days=1))
1450+ self.assertContentEqual(expected_rows, rows)
1451+ flush_database_caches()
1452 tomorrow = datetime.now(pytz.UTC) + timedelta(days=1)
1453 # Allow a bit of timing slack for slow tests.
1454 self.assertThat(
1455@@ -96,6 +115,34 @@
1456 LessThan(timedelta(minutes=5)))
1457 self.assertIsNone(archive_files[2].scheduled_deletion_date)
1458
1459+ def test_unscheduleDeletion(self):
1460+ archives = [self.factory.makeArchive() for _ in range(2)]
1461+ lfas = [
1462+ self.factory.makeLibraryFileAlias(db_only=True) for _ in range(3)]
1463+ archive_files = []
1464+ for archive in archives:
1465+ for container in ("foo", "bar"):
1466+ archive_files.extend([
1467+ self.factory.makeArchiveFile(
1468+ archive=archive, container=container, library_file=lfa)
1469+ for lfa in lfas])
1470+ now = datetime.now(pytz.UTC)
1471+ for archive_file in archive_files:
1472+ removeSecurityProxy(archive_file).scheduled_deletion_date = now
1473+ expected_rows = [
1474+ ("foo", archive_files[0].path, lfas[0].content.sha256),
1475+ ("foo", archive_files[1].path, lfas[1].content.sha256),
1476+ ]
1477+ rows = getUtility(IArchiveFileSet).unscheduleDeletion(
1478+ archive=archives[0], container="foo",
1479+ sha256_checksums=[lfas[0].content.sha256, lfas[1].content.sha256])
1480+ self.assertContentEqual(expected_rows, rows)
1481+ flush_database_caches()
1482+ self.assertContentEqual(
1483+ [archive_files[0], archive_files[1]],
1484+ [archive_file for archive_file in archive_files
1485+ if archive_file.scheduled_deletion_date is None])
1486+
1487 def test_getContainersToReap(self):
1488 archive = self.factory.makeArchive()
1489 archive_files = []
1490@@ -149,6 +196,11 @@
1491 removeSecurityProxy(archive_files[4]).scheduled_deletion_date = (
1492 now - timedelta(days=1))
1493 archive_file_set = getUtility(IArchiveFileSet)
1494- archive_file_set.reap(archive, container="foo")
1495+ expected_rows = [
1496+ ("foo", archive_files[0].path,
1497+ archive_files[0].library_file.content.sha256),
1498+ ]
1499+ rows = archive_file_set.reap(archive, container="foo")
1500+ self.assertContentEqual(expected_rows, rows)
1501 self.assertContentEqual(
1502 archive_files[1:4], archive_file_set.getByArchive(archive))