Great work Celso - you make complex things easy to understand :) r=me > = Summary = > > This branch fixes https://bugs.edge.launchpad.net/bugs/430552. > > The is a issue currently blocking `process-death-row` (p-d-r) in > production and increasing its backlog of removal-candidates, since > it's not able to remove files from pool/ > > After an investigation we discovered that p-d-r is failing due > because it trying to remove files that do not exist anymore. > > The missing files (symlinks, in fact) could be gone by the following > reasons: > > * Someone butter-fingered the ubuntu archive disk trying to fix > something else (only remotely possible, there are many broken > files) > > * The last effective run of p-d-r was aborted during the DB update > (quite possible, disk operations and updates are not atomic) > Thanks for the detailed explanation - I'd been wondering about this issue while seeing you and Julian chatting about it. It's not possible that this could have happened while converting the privacy of ppas? I guess you would already know that - just a thought. > > == Proposed fix == > > The fix for this problem is to deal with 'missing symlinks' as we deal > with 'missing files'. > > It will be an *attempt* to remove what was already removed, somehow, > so should carry on and update the DB records as if we have > successfully removed the files from disk. Great. > > == Tests == > > ./bin/test -vv -t TestDeathRow -t deathrow.txt > > == Demo and Q/A == > > The best! Code is cowboyed in production and dealing with 200k records > backlog (ETA 5h). ouch. It can't even be tested on df? Can't we mess with df and delete some symlinks that p-d-r expects? > > = Launchpad lint = > > Checking for conflicts. and issues in doctests and templates. > Running jslint, xmllint, pyflakes, and pylint. > Using normal rules. > > Linting changed files: > lib/lp/archivepublisher/deathrow.py > lib/lp/soyuz/interfaces/publishing.py > lib/lp/archivepublisher/diskpool.py > lib/lp/archivepublisher/tests/test_deathrow.py > === modified file 'lib/lp/archivepublisher/deathrow.py' > --- lib/lp/archivepublisher/deathrow.py 2009-06-24 23:28:16 +0000 > +++ lib/lp/archivepublisher/deathrow.py 2009-09-24 12:54:03 +0000 > @@ -23,7 +23,8 @@ > from lp.soyuz.interfaces.publishing import ( > ISecureBinaryPackagePublishingHistory, > ISecureSourcePackagePublishingHistory) > -from canonical.launchpad.interfaces import NotInPool > +from lp.soyuz.interfaces.publishing import ( > + MissingSymlinkInPool, NotInPool) > > > def getDeathRow(archive, log, pool_root_override): > @@ -292,12 +293,16 @@ > try: > bytes += self._removeFile( > component_name, source_name, file_name) > - except NotInPool: > + except NotInPool, info: > # It's safe for us to let this slide because it means that > # the file is already gone. > - self.logger.debug( > - "File for removing %s %s/%s is not in pool, skipping" % > - (component_name, source_name, file_name)) > + self.logger.debug(str(info)) > + except MissingSymlinkInPool, info: > + # This one is a little more worrying, becayse an expected > + # symlink has vanished from the pool/ (could be a code > + # mistake) but there is nothing we can do about it at this > + # point. > + self.logger.warn(str(info)) I'll miss the clarity of your code contributions cprov :/. > > self.logger.info("Total bytes freed: %s" % bytes) > > > === modified file 'lib/lp/archivepublisher/diskpool.py' > --- lib/lp/archivepublisher/diskpool.py 2009-06-24 23:28:16 +0000 > +++ lib/lp/archivepublisher/diskpool.py 2009-09-24 12:54:03 +0000 > @@ -8,8 +8,8 @@ > > from lp.archivepublisher import HARDCODED_COMPONENT_ORDER > from canonical.cachedproperty import cachedproperty > -from canonical.launchpad.interfaces import ( > - NotInPool, PoolFileOverwriteError) > +from lp.soyuz.interfaces.publishing import ( > + MissingSymlinkInPool, NotInPool, PoolFileOverwriteError) > from canonical.librarian.utils import copy_and_close, sha1_from_path > > > @@ -229,7 +229,10 @@ > 3) Remove the main file and there are symlinks left. > """ > if not self.file_component: > - raise NotInPool() > + raise NotInPool( > + "File for removing %s %s/%s is not in pool, skipping." % > + (component, self.source, self.filename)) > + > > # Okay, it's there, if it's a symlink then we need to remove > # it simply. > @@ -242,7 +245,10 @@ > assert os.path.islink(link_path) > return self._reallyRemove(component) > > - assert component == self.file_component > + if component != self.file_component: > + raise MissingSymlinkInPool( > + "Symlink for %s/%s in %s is missing, skipping." % > + (self.source, self.filename, component)) > > # It's not a symlink, this means we need to check whether we > # have symlinks or not. > > === added file 'lib/lp/archivepublisher/tests/test_deathrow.py' > --- lib/lp/archivepublisher/tests/test_deathrow.py 1970-01-01 00:00:00 +0000 > +++ lib/lp/archivepublisher/tests/test_deathrow.py 2009-09-24 12:54:03 +0000 > @@ -0,0 +1,155 @@ > +# Copyright 2009 Canonical Ltd. This software is licensed under the > +# GNU Affero General Public License version 3 (see the file LICENSE). > + > +"""Tests for deathrow class.""" > + > +__metaclass__ = type > + > + > +import os > +import shutil > +import tempfile > +import unittest > + > +from zope.component import getUtility > + > +from canonical.testing import LaunchpadZopelessLayer > +from canonical.launchpad.scripts.logger import BufferLogger > + > +from lp.archivepublisher.deathrow import DeathRow > +from lp.archivepublisher.diskpool import DiskPool > +from lp.registry.interfaces.distribution import IDistributionSet > +from lp.soyuz.interfaces.component import IComponentSet > +from lp.soyuz.model.publishing import SourcePackagePublishingHistory > +from lp.soyuz.tests.test_publishing import SoyuzTestPublisher > +from lp.testing import TestCase > + > + > +class TestDeathRow(TestCase): > + > + layer = LaunchpadZopelessLayer > + > + def getTestPublisher(self, distroseries): > + """Return an `SoyuzTestPublisher`instance.""" > + stp = SoyuzTestPublisher() > + stp.addFakeChroots(distroseries) > + stp.setUpDefaultDistroSeries(distroseries) > + return stp > + > + def getDeathRow(self, archive): > + """Return an `DeathRow` for the given archive. > + > + Created the temporary 'pool' and 'temp' directories and register > + a 'cleanup' to purge them after the test runs. > + """ > + pool_path = tempfile.mkdtemp('-pool') > + temp_path = tempfile.mkdtemp('-pool-tmp') > + def clean_pool(pool_path, temp_path): > + shutil.rmtree(pool_path) > + shutil.rmtree(temp_path) > + self.addCleanup(clean_pool, pool_path, temp_path) Nice - I didn't know about that. > + > + logger = BufferLogger() > + diskpool = DiskPool(pool_path, temp_path, logger) > + return DeathRow(archive, diskpool, logger) > + > + def getDiskPoolPath(self, pub_file, diskpool): > + """Return the absolute path to a published file in the disk pool/.""" > + return diskpool.pathFor( > + pub_file.componentname.encode('utf-8'), > + pub_file.sourcepackagename.encode('utf8'), > + pub_file.libraryfilealiasfilename.encode('utf-8') > + ) > + > + def assertIsFile(self, path): > + """Assert the path exists and is a regular file.""" > + self.assertTrue( > + os.path.exists(path), > + "File %s does not exist" % os.path.basename(path)) > + self.assertFalse( > + os.path.islink(path), > + "File %s is a symbolic link" % os.path.basename(path)) > + > + def assertIsLink(self, path): > + """Assert the path exists and is a symbolic link.""" > + self.assertTrue( > + os.path.exists(path), > + "File %s does not exist" % os.path.basename(path)) > + self.assertTrue( > + os.path.islink(path), > + "File %s is a not symbolic link" % os.path.basename(path)) > + > + def assertDoesNotExist(self, path): > + """Assert the path does not exit.""" > + self.assertFalse( > + os.path.exists(path), > + "File %s exists" % os.path.basename(path)) > + > + def test_MissingSymLinkInPool(self): > + # When a publication is promoted from 'universe' to 'main' and > + # the symbolic links expected in 'universe' are not present, > + # a `MissingSymlinkInPool` error is generated and immediately > + # ignored by the `DeathRow` processor. Even in this adverse > + # circumstances the database record is updated (removal candidates) > + # are materialized to match the disk status. And again :) (that is, I'll miss these excellent clear descriptions of what's happening!) > + > + # Setup an `SoyuzTestPublisher` and a `DeathRow` instance. > + ubuntu = getUtility(IDistributionSet).getByName('ubuntu') > + hoary = ubuntu.getSeries('hoary') > + stp = self.getTestPublisher(hoary) > + deathrow = self.getDeathRow(hoary.main_archive) > + > + # Create a source publication with a since file (DSC) in > + # 'universe' and promote it to 'main'. > + source_universe = stp.getPubSource(component='universe') > + secure_record = source_universe.changeOverride( > + new_component=getUtility(IComponentSet)['main']) > + source_main = SourcePackagePublishingHistory.get( > + secure_record.id) > + test_publications = (source_universe, source_main) > + > + # Commit for exposing the just-created librarian files. > + self.layer.commit() > + > + # Publish the testing publication on disk, the file for the > + # 'universe' component will be a symbolic link to the one > + # in 'main'. > + for pub in test_publications: > + pub.publish(deathrow.diskpool, deathrow.logger) > + [main_dsc_path] = [ > + self.getDiskPoolPath(pub_file, deathrow.diskpool) > + for pub_file in source_main.files] > + [universe_dsc_path] = [ > + self.getDiskPoolPath(pub_file, deathrow.diskpool) > + for pub_file in source_universe.files] > + self.assertIsFile(main_dsc_path) > + self.assertIsLink(universe_dsc_path) *sigh* and again. > + > + # Remove the symbolic link to emulate MissingSymlinkInPool scenario. > + os.remove(universe_dsc_path) > + > + # Remove the testing publications. > + for pub in test_publications: > + pub.requestObsolescence() > + > + # Commit for exposing the just-created removal candidates. > + self.layer.commit() > + > + # Due to the MissingSymlinkInPool scenario, it takes 2 iteration to > + # remove both references to the shared file in pool/. > + deathrow.reap() > + deathrow.reap() > + > + for pub in test_publications: > + self.assertTrue( > + pub.dateremoved is not None, > + '%s (%s) is not marked as removed.' > + % (pub.displayname, pub.component.name)) > + > + self.assertDoesNotExist(main_dsc_path) > + self.assertDoesNotExist(universe_dsc_path) > + > + > +def test_suite(): > + return unittest.TestLoader().loadTestsFromName(__name__) > + > > === modified file 'lib/lp/soyuz/interfaces/publishing.py' > --- lib/lp/soyuz/interfaces/publishing.py 2009-09-15 09:24:15 +0000 > +++ lib/lp/soyuz/interfaces/publishing.py 2009-09-24 12:54:03 +0000 > @@ -18,6 +18,7 @@ > 'ISecureSourcePackagePublishingHistory', > 'ISourcePackageFilePublishing', > 'ISourcePackagePublishingHistory', > + 'MissingSymlinkInPool', > 'NotInPool', > 'PackagePublishingPriority', > 'PackagePublishingStatus', > @@ -60,6 +61,16 @@ > requires manual intervention in the archive. > """ > > +class MissingSymlinkInPool(Exception): > + """Raised when there is a missing symlink in pool. > + > + This condition is ignored, similarly to what we do for `NotInPool`, > + since the pool entry requested to be removed is not there anymore. > + > + The corresponding record is marked as removed and the process > + continues. > + """ > + > > class PackagePublishingStatus(DBEnumeratedType): > """Package Publishing Status Beautiful. Thanks Celso. -- Michael