Merge lp:~wgrant/launchpad/ppa-reset-2.0-basics into lp:launchpad

Proposed by William Grant
Status: Merged
Merged at revision: 17050
Proposed branch: lp:~wgrant/launchpad/ppa-reset-2.0-basics
Merge into: lp:launchpad
Prerequisite: lp:~wgrant/launchpad/ppa-reset-2.0-model
Diff against target: 985 lines (+361/-297)
6 files modified
lib/lp/buildmaster/interactor.py (+56/-54)
lib/lp/buildmaster/manager.py (+76/-54)
lib/lp/buildmaster/tests/mock_slaves.py (+6/-1)
lib/lp/buildmaster/tests/test_interactor.py (+96/-100)
lib/lp/buildmaster/tests/test_manager.py (+118/-72)
lib/lp/soyuz/tests/test_binarypackagebuildbehaviour.py (+9/-16)
To merge this branch: bzr merge lp:~wgrant/launchpad/ppa-reset-2.0-basics
Reviewer Review Type Date Requested Status
Colin Watson (community) Approve
Review via email: mp+223397@code.launchpad.net

Commit message

Reset virtual builders a cycle before any dispatch attempt, and persist the clean status in the DB. Preparation for the new Scalingstack reset protocol.

To post a comment you must log in.
Revision history for this message
Colin Watson (cjwatson) wrote :

Looks good to me; just one minor suggestion.

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'lib/lp/buildmaster/interactor.py'
2--- lib/lp/buildmaster/interactor.py 2014-05-10 18:40:36 +0000
3+++ lib/lp/buildmaster/interactor.py 2014-06-18 08:49:42 +0000
4@@ -21,6 +21,7 @@
5 removeSecurityProxy,
6 )
7
8+from lp.buildmaster.enums import BuilderCleanStatus
9 from lp.buildmaster.interfaces.builder import (
10 BuildDaemonError,
11 CannotFetchFile,
12@@ -216,7 +217,7 @@
13 BuilderVitals = namedtuple(
14 'BuilderVitals',
15 ('name', 'url', 'virtualized', 'vm_host', 'builderok', 'manual',
16- 'build_queue', 'version'))
17+ 'build_queue', 'version', 'clean_status'))
18
19 _BQ_UNSPECIFIED = object()
20
21@@ -226,7 +227,8 @@
22 build_queue = builder.currentjob
23 return BuilderVitals(
24 builder.name, builder.url, builder.virtualized, builder.vm_host,
25- builder.builderok, builder.manual, build_queue, builder.version)
26+ builder.builderok, builder.manual, build_queue, builder.version,
27+ builder.clean_status)
28
29
30 class BuilderInteractor(object):
31@@ -249,44 +251,6 @@
32 return behaviour
33
34 @classmethod
35- @defer.inlineCallbacks
36- def rescueIfLost(cls, vitals, slave, slave_status, expected_cookie,
37- logger=None):
38- """Reset the slave if its job information doesn't match the DB.
39-
40- This checks the build ID reported in the slave status against
41- the given cookie. If it isn't building what we think it should
42- be, the current build will be aborted and the slave cleaned in
43- preparation for a new task.
44-
45- :return: A Deferred that fires when the dialog with the slave is
46- finished. Its return value is True if the slave is lost,
47- False otherwise.
48- """
49- # Determine the slave's current build cookie.
50- status = slave_status['builder_status']
51- slave_cookie = slave_status.get('build_id')
52-
53- if slave_cookie == expected_cookie:
54- # The master and slave agree about the current job. Continue.
55- defer.returnValue(False)
56- else:
57- # The master and slave disagree. The master is our master,
58- # so try to rescue the slave.
59- # An IDLE slave doesn't need rescuing (SlaveScanner.scan
60- # will rescue the DB side instead), and we just have to wait
61- # out an ABORTING one.
62- if status == 'BuilderStatus.WAITING':
63- yield slave.clean()
64- elif status == 'BuilderStatus.BUILDING':
65- yield slave.abort()
66- if logger:
67- logger.info(
68- "Builder slave '%s' rescued from %r (expected %r)." %
69- (vitals.name, slave_cookie, expected_cookie))
70- defer.returnValue(True)
71-
72- @classmethod
73 def resumeSlaveHost(cls, vitals, slave):
74 """Resume the slave host to a known good condition.
75
76@@ -323,6 +287,49 @@
77
78 @classmethod
79 @defer.inlineCallbacks
80+ def cleanSlave(cls, vitals, slave):
81+ """Prepare a slave for a new build.
82+
83+ :return: A Deferred that fires when this stage of the resume
84+ operations finishes. If the value is True, the slave is now clean.
85+ If it's False, the clean is still in progress and this must be
86+ called again later.
87+ """
88+ if vitals.virtualized:
89+ # If we are building a virtual build, resume the virtual
90+ # machine. Before we try and contact the resumed slave,
91+ # we're going to send it a message. This is to ensure
92+ # it's accepting packets from the outside world, because
93+ # testing has shown that the first packet will randomly
94+ # fail for no apparent reason. This could be a quirk of
95+ # the Xen guest, we're not sure. We also don't care
96+ # about the result from this message, just that it's
97+ # sent, hence the "addBoth". See bug 586359.
98+ yield cls.resumeSlaveHost(vitals, slave)
99+ yield slave.echo("ping")
100+ defer.returnValue(True)
101+ else:
102+ slave_status = yield slave.status()
103+ status = slave_status.get('builder_status', None)
104+ if status == 'BuilderStatus.IDLE':
105+ # This is as clean as we can get it.
106+ defer.returnValue(True)
107+ elif status == 'BuilderStatus.BUILDING':
108+ # Asynchronously abort() the slave and wait until WAITING.
109+ yield slave.abort()
110+ defer.returnValue(False)
111+ elif status == 'BuilderStatus.ABORTING':
112+ # Wait it out until WAITING.
113+ defer.returnValue(False)
114+ elif status == 'BuilderStatus.WAITING':
115+ # Just a synchronous clean() call and we'll be idle.
116+ yield slave.clean()
117+ defer.returnValue(True)
118+ raise BuildDaemonError(
119+ "Invalid status during clean: %r" % status)
120+
121+ @classmethod
122+ @defer.inlineCallbacks
123 def _startBuild(cls, build_queue_item, vitals, builder, slave, behaviour,
124 logger):
125 """Start a build on this builder.
126@@ -342,17 +349,12 @@
127 raise BuildDaemonError(
128 "Attempted to start a build on a known-bad builder.")
129
130- # If we are building a virtual build, resume the virtual
131- # machine. Before we try and contact the resumed slave, we're
132- # going to send it a message. This is to ensure it's accepting
133- # packets from the outside world, because testing has shown that
134- # the first packet will randomly fail for no apparent reason.
135- # This could be a quirk of the Xen guest, we're not sure. We
136- # also don't care about the result from this message, just that
137- # it's sent, hence the "addBoth". See bug 586359.
138- if builder.virtualized:
139- yield cls.resumeSlaveHost(vitals, slave)
140- yield slave.echo("ping")
141+ if builder.clean_status != BuilderCleanStatus.CLEAN:
142+ raise BuildDaemonError(
143+ "Attempted to start build on a dirty slave.")
144+
145+ builder.setCleanStatus(BuilderCleanStatus.DIRTY)
146+ transaction.commit()
147
148 yield behaviour.dispatchBuildToSlave(build_queue_item.id, logger)
149
150@@ -448,9 +450,9 @@
151 :return: A Deferred that fires when the slave dialog is finished.
152 """
153 # IDLE is deliberately not handled here, because it should be
154- # impossible to get past rescueIfLost unless the slave matches
155- # the DB, and this method isn't called unless the DB says
156- # there's a job.
157+ # impossible to get past the cookie check unless the slave
158+ # matches the DB, and this method isn't called unless the DB
159+ # says there's a job.
160 builder_status = slave_status['builder_status']
161 if builder_status == 'BuilderStatus.BUILDING':
162 # Build still building, collect the logtail.
163
164=== modified file 'lib/lp/buildmaster/manager.py'
165--- lib/lp/buildmaster/manager.py 2014-02-05 23:22:30 +0000
166+++ lib/lp/buildmaster/manager.py 2014-06-18 08:49:42 +0000
167@@ -26,6 +26,7 @@
168 from zope.component import getUtility
169
170 from lp.buildmaster.enums import (
171+ BuilderCleanStatus,
172 BuildQueueStatus,
173 BuildStatus,
174 )
175@@ -413,70 +414,91 @@
176 vitals = self.builder_factory.getVitals(self.builder_name)
177 interactor = self.interactor_factory()
178 slave = self.slave_factory(vitals)
179- if vitals.builderok:
180- self.logger.debug("Scanning %s." % self.builder_name)
181- slave_status = yield slave.status()
182- else:
183- slave_status = None
184-
185- # Confirm that the DB and slave sides are in a valid, mutually
186- # agreeable state.
187- lost_reason = None
188- if not vitals.builderok:
189- lost_reason = '%s is disabled' % vitals.name
190- else:
191- self.updateVersion(vitals, slave_status)
192- cancelled = yield self.checkCancellation(vitals, slave, interactor)
193- if cancelled:
194- return
195- assert slave_status is not None
196- lost = yield interactor.rescueIfLost(
197- vitals, slave, slave_status, self.getExpectedCookie(vitals),
198- self.logger)
199- if lost:
200- lost_reason = '%s is lost' % vitals.name
201-
202- # The slave is lost or the builder is disabled. We can't
203- # continue to update the job status or dispatch a new job, so
204- # just rescue the assigned job, if any, so it can be dispatched
205- # to another slave.
206- if lost_reason is not None:
207- if vitals.build_queue is not None:
208+
209+ if vitals.build_queue is not None:
210+ if vitals.clean_status != BuilderCleanStatus.DIRTY:
211+ # This is probably a grave bug with security implications,
212+ # as a slave that has a job must be cleaned afterwards.
213+ raise BuildDaemonError("Non-dirty builder allegedly building.")
214+
215+ lost_reason = None
216+ if not vitals.builderok:
217+ lost_reason = '%s is disabled' % vitals.name
218+ else:
219+ slave_status = yield slave.status()
220+ # Ensure that the slave has the job that we think it
221+ # should.
222+ slave_cookie = slave_status.get('build_id')
223+ expected_cookie = self.getExpectedCookie(vitals)
224+ if slave_cookie != expected_cookie:
225+ lost_reason = (
226+ '%s is lost (expected %r, got %r)' % (
227+ vitals.name, expected_cookie, slave_cookie))
228+
229+ if lost_reason is not None:
230+ # The slave is either confused or disabled, so reset and
231+ # requeue the job. The next scan cycle will clean up the
232+ # slave if appropriate.
233 self.logger.warn(
234 "%s. Resetting BuildQueue %d.", lost_reason,
235 vitals.build_queue.id)
236 vitals.build_queue.reset()
237 transaction.commit()
238- return
239-
240- # We've confirmed that the slave state matches the DB. Continue
241- # with updating the job status, or dispatching a new job if the
242- # builder is idle.
243- if vitals.build_queue is not None:
244- # Scan the slave and get the logtail, or collect the build
245- # if it's ready. Yes, "updateBuild" is a bad name.
246+ return
247+
248+ cancelled = yield self.checkCancellation(
249+ vitals, slave, interactor)
250+ if cancelled:
251+ return
252+
253+ # The slave and DB agree on the builder's state. Scan the
254+ # slave and get the logtail, or collect the build if it's
255+ # ready. Yes, "updateBuild" is a bad name.
256 assert slave_status is not None
257 yield interactor.updateBuild(
258 vitals, slave, slave_status, self.builder_factory,
259 self.behaviour_factory)
260- elif vitals.manual:
261- # If the builder is in manual mode, don't dispatch anything.
262- self.logger.debug(
263- '%s is in manual mode, not dispatching.' % vitals.name)
264 else:
265- # See if there is a job we can dispatch to the builder slave.
266- builder = self.builder_factory[self.builder_name]
267- # Try to dispatch the job. If it fails, don't attempt to
268- # just retry the scan; we need to reset the job so the
269- # dispatch will be reattempted.
270- d = interactor.findAndStartJob(vitals, builder, slave)
271- d.addErrback(functools.partial(self._scanFailed, False))
272- yield d
273- if builder.currentjob is not None:
274- # After a successful dispatch we can reset the
275- # failure_count.
276- builder.resetFailureCount()
277- transaction.commit()
278+ if not vitals.builderok:
279+ return
280+ # We think the builder is idle. If it's clean, dispatch. If
281+ # it's dirty, clean.
282+ if vitals.clean_status == BuilderCleanStatus.CLEAN:
283+ slave_status = yield slave.status()
284+ if slave_status.get('builder_status') != 'BuilderStatus.IDLE':
285+ raise BuildDaemonError(
286+ 'Allegedly clean slave not idle (%r instead)'
287+ % slave_status.get('builder_status'))
288+ self.updateVersion(vitals, slave_status)
289+ if vitals.manual:
290+ # If the builder is in manual mode, don't dispatch
291+ # anything.
292+ self.logger.debug(
293+ '%s is in manual mode, not dispatching.', vitals.name)
294+ return
295+ # Try to find and dispatch a job. If it fails, don't
296+ # attempt to just retry the scan; we need to reset
297+ # the job so the dispatch will be reattempted.
298+ builder = self.builder_factory[self.builder_name]
299+ d = interactor.findAndStartJob(vitals, builder, slave)
300+ d.addErrback(functools.partial(self._scanFailed, False))
301+ yield d
302+ if builder.currentjob is not None:
303+ # After a successful dispatch we can reset the
304+ # failure_count.
305+ builder.resetFailureCount()
306+ transaction.commit()
307+ else:
308+ # Ask the BuilderInteractor to clean the slave. It might
309+ # be immediately cleaned on return, in which case we go
310+ # straight back to CLEAN, or we might have to spin
311+ # through another few cycles.
312+ done = yield interactor.cleanSlave(vitals, slave)
313+ if done:
314+ builder = self.builder_factory[self.builder_name]
315+ builder.setCleanStatus(BuilderCleanStatus.CLEAN)
316+ self.logger.debug('%s has been cleaned.', vitals.name)
317+ transaction.commit()
318
319
320 class NewBuildersScanner:
321
322=== modified file 'lib/lp/buildmaster/tests/mock_slaves.py'
323--- lib/lp/buildmaster/tests/mock_slaves.py 2014-06-06 10:32:55 +0000
324+++ lib/lp/buildmaster/tests/mock_slaves.py 2014-06-18 08:49:42 +0000
325@@ -30,6 +30,7 @@
326 from twisted.internet import defer
327 from twisted.web import xmlrpc
328
329+from lp.buildmaster.enums import BuilderCleanStatus
330 from lp.buildmaster.interactor import BuilderSlave
331 from lp.buildmaster.interfaces.builder import CannotFetchFile
332 from lp.services.config import config
333@@ -48,7 +49,7 @@
334
335 def __init__(self, name='mock-builder', builderok=True, manual=False,
336 virtualized=True, vm_host=None, url='http://fake:0000',
337- version=None):
338+ version=None, clean_status=BuilderCleanStatus.DIRTY):
339 self.currentjob = None
340 self.builderok = builderok
341 self.manual = manual
342@@ -58,6 +59,10 @@
343 self.vm_host = vm_host
344 self.failnotes = None
345 self.version = version
346+ self.clean_status = clean_status
347+
348+ def setCleanStatus(self, clean_status):
349+ self.clean_status = clean_status
350
351 def failBuilder(self, reason):
352 self.builderok = False
353
354=== modified file 'lib/lp/buildmaster/tests/test_interactor.py'
355--- lib/lp/buildmaster/tests/test_interactor.py 2014-06-06 11:15:29 +0000
356+++ lib/lp/buildmaster/tests/test_interactor.py 2014-06-18 08:49:42 +0000
357@@ -16,19 +16,24 @@
358 SynchronousDeferredRunTest,
359 )
360 from testtools.matchers import ContainsAll
361+from testtools.testcase import ExpectedException
362 from twisted.internet import defer
363 from twisted.internet.task import Clock
364 from twisted.python.failure import Failure
365 from twisted.web.client import getPage
366 from zope.security.proxy import removeSecurityProxy
367
368-from lp.buildmaster.enums import BuildStatus
369+from lp.buildmaster.enums import (
370+ BuilderCleanStatus,
371+ BuildStatus,
372+ )
373 from lp.buildmaster.interactor import (
374 BuilderInteractor,
375 BuilderSlave,
376 extract_vitals_from_db,
377 )
378 from lp.buildmaster.interfaces.builder import (
379+ BuildDaemonError,
380 CannotFetchFile,
381 CannotResumeHost,
382 )
383@@ -150,94 +155,75 @@
384 slave = BuilderInteractor.makeSlaveFromVitals(vitals)
385 self.assertEqual(5, slave.timeout)
386
387- @defer.inlineCallbacks
388- def test_rescueIfLost_aborts_lost_and_broken_slave(self):
389- # A slave that's 'lost' should be aborted; when the slave is
390- # broken then abort() should also throw a fault.
391+
392+class TestBuilderInteractorCleanSlave(TestCase):
393+
394+ run_tests_with = AsynchronousDeferredRunTest
395+
396+ @defer.inlineCallbacks
397+ def assertCleanCalls(self, builder, slave, calls, done):
398+ actually_done = yield BuilderInteractor.cleanSlave(
399+ extract_vitals_from_db(builder), slave)
400+ self.assertEqual(done, actually_done)
401+ self.assertEqual(calls, slave.method_log)
402+
403+ @defer.inlineCallbacks
404+ def test_virtual(self):
405+ # We don't care what the status of a virtual builder is; we just
406+ # reset its VM and check that it's back.
407+ yield self.assertCleanCalls(
408+ MockBuilder(
409+ virtualized=True, vm_host='lol',
410+ clean_status=BuilderCleanStatus.DIRTY),
411+ OkSlave(), ['resume', 'echo'], True)
412+
413+ @defer.inlineCallbacks
414+ def test_nonvirtual_idle(self):
415+ # An IDLE non-virtual slave is already as clean as we can get it.
416+ yield self.assertCleanCalls(
417+ MockBuilder(
418+ virtualized=False, clean_status=BuilderCleanStatus.DIRTY),
419+ OkSlave(), ['status'], True)
420+
421+ @defer.inlineCallbacks
422+ def test_nonvirtual_building(self):
423+ # A BUILDING non-virtual slave needs to be aborted. It'll go
424+ # through ABORTING and eventually be picked up from WAITING.
425+ yield self.assertCleanCalls(
426+ MockBuilder(
427+ virtualized=False, clean_status=BuilderCleanStatus.DIRTY),
428+ BuildingSlave(), ['status', 'abort'], False)
429+
430+ @defer.inlineCallbacks
431+ def test_nonvirtual_aborting(self):
432+ # An ABORTING non-virtual slave must be waited out. It should
433+ # hit WAITING eventually.
434+ yield self.assertCleanCalls(
435+ MockBuilder(
436+ virtualized=False, clean_status=BuilderCleanStatus.DIRTY),
437+ AbortingSlave(), ['status'], False)
438+
439+ @defer.inlineCallbacks
440+ def test_nonvirtual_waiting(self):
441+ # A WAITING non-virtual slave just needs clean() called.
442+ yield self.assertCleanCalls(
443+ MockBuilder(
444+ virtualized=False, clean_status=BuilderCleanStatus.DIRTY),
445+ WaitingSlave(), ['status', 'clean'], True)
446+
447+ @defer.inlineCallbacks
448+ def test_nonvirtual_broken(self):
449+ # A broken non-virtual builder is probably unrecoverable, so the
450+ # method just crashes.
451+ vitals = extract_vitals_from_db(MockBuilder(
452+ virtualized=False, clean_status=BuilderCleanStatus.DIRTY))
453 slave = LostBuildingBrokenSlave()
454- slave_status = yield slave.status()
455 try:
456- yield BuilderInteractor.rescueIfLost(
457- extract_vitals_from_db(MockBuilder()), slave, slave_status,
458- 'trivial')
459+ yield BuilderInteractor.cleanSlave(vitals, slave)
460 except xmlrpclib.Fault:
461- self.assertIn('abort', slave.call_log)
462+ self.assertEqual(['status', 'abort'], slave.call_log)
463 else:
464- self.fail("xmlrpclib.Fault not raised")
465-
466- @defer.inlineCallbacks
467- def test_recover_idle_slave(self):
468- # An idle slave is not rescued, even if it's not meant to be
469- # idle. SlaveScanner.scan() will clean up the DB side, because
470- # we still report that it's lost.
471- slave = OkSlave()
472- slave_status = yield slave.status()
473- lost = yield BuilderInteractor.rescueIfLost(
474- extract_vitals_from_db(MockBuilder()), slave, slave_status,
475- 'trivial')
476- self.assertTrue(lost)
477- self.assertEqual(['status'], slave.call_log)
478-
479- @defer.inlineCallbacks
480- def test_recover_ok_slave(self):
481- # An idle slave that's meant to be idle is not rescued.
482- slave = OkSlave()
483- slave_status = yield slave.status()
484- lost = yield BuilderInteractor.rescueIfLost(
485- extract_vitals_from_db(MockBuilder()), slave, slave_status, None)
486- self.assertFalse(lost)
487- self.assertEqual(['status'], slave.call_log)
488-
489- @defer.inlineCallbacks
490- def test_recover_waiting_slave_with_good_id(self):
491- # rescueIfLost does not attempt to abort or clean a builder that is
492- # WAITING.
493- waiting_slave = WaitingSlave(build_id='trivial')
494- slave_status = yield waiting_slave.status()
495- lost = yield BuilderInteractor.rescueIfLost(
496- extract_vitals_from_db(MockBuilder()), waiting_slave, slave_status,
497- 'trivial')
498- self.assertFalse(lost)
499- self.assertEqual(['status'], waiting_slave.call_log)
500-
501- @defer.inlineCallbacks
502- def test_recover_waiting_slave_with_bad_id(self):
503- # If a slave is WAITING with a build for us to get, and the build
504- # cookie cannot be verified, which means we don't recognize the build,
505- # then rescueBuilderIfLost should attempt to abort it, so that the
506- # builder is reset for a new build, and the corrupt build is
507- # discarded.
508- waiting_slave = WaitingSlave(build_id='non-trivial')
509- slave_status = yield waiting_slave.status()
510- lost = yield BuilderInteractor.rescueIfLost(
511- extract_vitals_from_db(MockBuilder()), waiting_slave, slave_status,
512- 'trivial')
513- self.assertTrue(lost)
514- self.assertEqual(['status', 'clean'], waiting_slave.call_log)
515-
516- @defer.inlineCallbacks
517- def test_recover_building_slave_with_good_id(self):
518- # rescueIfLost does not attempt to abort or clean a builder that is
519- # BUILDING.
520- building_slave = BuildingSlave(build_id='trivial')
521- slave_status = yield building_slave.status()
522- lost = yield BuilderInteractor.rescueIfLost(
523- extract_vitals_from_db(MockBuilder()), building_slave,
524- slave_status, 'trivial')
525- self.assertFalse(lost)
526- self.assertEqual(['status'], building_slave.call_log)
527-
528- @defer.inlineCallbacks
529- def test_recover_building_slave_with_bad_id(self):
530- # If a slave is BUILDING with a build id we don't recognize, then we
531- # abort the build, thus stopping it in its tracks.
532- building_slave = BuildingSlave(build_id='non-trivial')
533- slave_status = yield building_slave.status()
534- lost = yield BuilderInteractor.rescueIfLost(
535- extract_vitals_from_db(MockBuilder()), building_slave,
536- slave_status, 'trivial')
537- self.assertTrue(lost)
538- self.assertEqual(['status', 'abort'], building_slave.call_log)
539+ self.fail("abort() should crash.")
540
541
542 class TestBuilderSlaveStatus(TestCase):
543@@ -321,6 +307,7 @@
544 processor = self.factory.makeProcessor(name="i386")
545 builder = self.factory.makeBuilder(
546 processors=[processor], virtualized=True, vm_host="bladh")
547+ builder.setCleanStatus(BuilderCleanStatus.CLEAN)
548 self.patch(BuilderSlave, 'makeBuilderSlave', FakeMethod(OkSlave()))
549 distroseries = self.factory.makeDistroSeries()
550 das = self.factory.makeDistroArchSeries(
551@@ -377,21 +364,30 @@
552
553 return d.addCallback(check_build_started)
554
555- def test_virtual_job_dispatch_pings_before_building(self):
556- # We need to send a ping to the builder to work around a bug
557- # where sometimes the first network packet sent is dropped.
558- builder, build = self._setupBinaryBuildAndBuilder()
559- candidate = build.queueBuild()
560- removeSecurityProxy(builder)._findBuildCandidate = FakeMethod(
561- result=candidate)
562- vitals = extract_vitals_from_db(builder)
563- slave = OkSlave()
564- d = BuilderInteractor.findAndStartJob(vitals, builder, slave)
565-
566- def check_build_started(candidate):
567- self.assertIn(('echo', 'ping'), slave.call_log)
568-
569- return d.addCallback(check_build_started)
570+ @defer.inlineCallbacks
571+ def test_findAndStartJob_requires_clean_slave(self):
572+ # findAndStartJob ensures that its slave starts CLEAN.
573+ builder, build = self._setupBinaryBuildAndBuilder()
574+ builder.setCleanStatus(BuilderCleanStatus.DIRTY)
575+ candidate = build.queueBuild()
576+ removeSecurityProxy(builder)._findBuildCandidate = FakeMethod(
577+ result=candidate)
578+ vitals = extract_vitals_from_db(builder)
579+ with ExpectedException(
580+ BuildDaemonError,
581+ "Attempted to start build on a dirty slave."):
582+ yield BuilderInteractor.findAndStartJob(vitals, builder, OkSlave())
583+
584+ @defer.inlineCallbacks
585+ def test_findAndStartJob_dirties_slave(self):
586+ # findAndStartJob marks its builder DIRTY before dispatching.
587+ builder, build = self._setupBinaryBuildAndBuilder()
588+ candidate = build.queueBuild()
589+ removeSecurityProxy(builder)._findBuildCandidate = FakeMethod(
590+ result=candidate)
591+ vitals = extract_vitals_from_db(builder)
592+ yield BuilderInteractor.findAndStartJob(vitals, builder, OkSlave())
593+ self.assertEqual(BuilderCleanStatus.DIRTY, builder.clean_status)
594
595
596 class TestSlave(TestCase):
597
598=== modified file 'lib/lp/buildmaster/tests/test_manager.py'
599--- lib/lp/buildmaster/tests/test_manager.py 2014-06-06 10:32:55 +0000
600+++ lib/lp/buildmaster/tests/test_manager.py 2014-06-18 08:49:42 +0000
601@@ -13,6 +13,7 @@
602 AsynchronousDeferredRunTest,
603 )
604 from testtools.matchers import Equals
605+from testtools.testcase import ExpectedException
606 import transaction
607 from twisted.internet import (
608 defer,
609@@ -25,6 +26,7 @@
610 from zope.security.proxy import removeSecurityProxy
611
612 from lp.buildmaster.enums import (
613+ BuilderCleanStatus,
614 BuildQueueStatus,
615 BuildStatus,
616 )
617@@ -33,7 +35,10 @@
618 BuilderSlave,
619 extract_vitals_from_db,
620 )
621-from lp.buildmaster.interfaces.builder import IBuilderSet
622+from lp.buildmaster.interfaces.builder import (
623+ BuildDaemonError,
624+ IBuilderSet,
625+ )
626 from lp.buildmaster.interfaces.buildfarmjobbehaviour import (
627 IBuildFarmJobBehaviour,
628 )
629@@ -113,6 +118,7 @@
630 job = builder.currentjob
631 if job is not None:
632 job.reset()
633+ builder.setCleanStatus(BuilderCleanStatus.CLEAN)
634
635 transaction.commit()
636
637@@ -155,6 +161,7 @@
638 # Set this to 1 here so that _checkDispatch can make sure it's
639 # reset to 0 after a successful dispatch.
640 builder.failure_count = 1
641+ builder.setCleanStatus(BuilderCleanStatus.CLEAN)
642
643 # Run 'scan' and check its result.
644 switch_dbuser(config.builddmaster.dbuser)
645@@ -258,6 +265,7 @@
646 def test_scan_with_nothing_to_dispatch(self):
647 factory = LaunchpadObjectFactory()
648 builder = factory.makeBuilder()
649+ builder.setCleanStatus(BuilderCleanStatus.CLEAN)
650 self.patch(BuilderSlave, 'makeBuilderSlave', FakeMethod(OkSlave()))
651 transaction.commit()
652 scanner = self._getScanner(builder_name=builder.name)
653@@ -422,9 +430,9 @@
654 self.assertFalse(builder.builderok)
655
656 @defer.inlineCallbacks
657- def test_fail_to_resume_slave_resets_job(self):
658- # If an attempt to resume and dispatch a slave fails, it should
659- # reset the job via job.reset()
660+ def test_fail_to_resume_leaves_it_dirty(self):
661+ # If an attempt to resume a slave fails, its failure count is
662+ # incremented and it is left DIRTY.
663
664 # Make a slave with a failing resume() method.
665 slave = OkSlave()
666@@ -435,26 +443,21 @@
667 builder = removeSecurityProxy(
668 getUtility(IBuilderSet)[BOB_THE_BUILDER_NAME])
669 self._resetBuilder(builder)
670+ builder.setCleanStatus(BuilderCleanStatus.DIRTY)
671+ builder.virtualized = True
672 self.assertEqual(0, builder.failure_count)
673 self.patch(BuilderSlave, 'makeBuilderSlave', FakeMethod(slave))
674 builder.vm_host = "fake_vm_host"
675-
676- scanner = self._getScanner()
677-
678- # Get the next job that will be dispatched.
679- job = removeSecurityProxy(builder._findBuildCandidate())
680- job.virtualized = True
681- builder.virtualized = True
682 transaction.commit()
683- yield scanner.singleCycle()
684-
685- # The failure_count will have been incremented on the builder, we
686- # can check that to see that a dispatch attempt did indeed occur.
687+
688+ # A spin of the scanner will see the DIRTY builder and reset it.
689+ # Our patched reset will fail.
690+ yield self._getScanner().singleCycle()
691+
692+ # The failure_count will have been incremented on the builder,
693+ # and it will be left DIRTY.
694 self.assertEqual(1, builder.failure_count)
695- # There should also be no builder set on the job.
696- self.assertIsNone(job.builder)
697- build = getUtility(IBinaryPackageBuildSet).getByQueueEntry(job)
698- self.assertEqual(build.status, BuildStatus.NEEDSBUILD)
699+ self.assertEqual(BuilderCleanStatus.DIRTY, builder.clean_status)
700
701 @defer.inlineCallbacks
702 def test_update_slave_version(self):
703@@ -569,15 +572,25 @@
704 builder.name, BuilderFactory(), BufferLogger(),
705 slave_factory=get_slave, clock=clock)
706
707- # The slave is idle and there's a build candidate, so the first
708- # scan will reset the builder and dispatch the build.
709+ # The slave is idle and dirty, so the first scan will clean it
710+ # with a reset.
711+ self.assertEqual(BuilderCleanStatus.DIRTY, builder.clean_status)
712+ yield scanner.scan()
713+ self.assertEqual(['resume', 'echo'], get_slave.result.method_log)
714+ self.assertEqual(BuilderCleanStatus.CLEAN, builder.clean_status)
715+ self.assertIs(None, builder.currentjob)
716+
717+ # The slave is idle and clean, and there's a build candidate, so
718+ # the next scan will dispatch the build.
719+ get_slave.result = OkSlave()
720 yield scanner.scan()
721 self.assertEqual(
722- ['status', 'resume', 'echo', 'ensurepresent', 'build'],
723+ ['status', 'ensurepresent', 'build'],
724 get_slave.result.method_log)
725 self.assertEqual(bq, builder.currentjob)
726 self.assertEqual(BuildQueueStatus.RUNNING, bq.status)
727 self.assertEqual(BuildStatus.BUILDING, build.status)
728+ self.assertEqual(BuilderCleanStatus.DIRTY, builder.clean_status)
729
730 # build() has been called, so switch in a BUILDING slave.
731 # Scans will now just do a status() each, as the logtail is
732@@ -596,7 +609,8 @@
733 # When the build finishes, the scanner will notice, call
734 # handleStatus(), and then clean the builder. Our fake
735 # _handleStatus_OK doesn't do anything special, but there'd
736- # usually be file retrievals in the middle.
737+ # usually be file retrievals in the middle. The builder remains
738+ # dirty afterward.
739 get_slave.result = WaitingSlave(
740 build_id=IBuildFarmJobBehaviour(build).getBuildCookie())
741 yield scanner.scan()
742@@ -604,13 +618,15 @@
743 self.assertIs(None, builder.currentjob)
744 self.assertEqual(BuildStatus.UPLOADING, build.status)
745 self.assertEqual(builder, build.builder)
746+ self.assertEqual(BuilderCleanStatus.DIRTY, builder.clean_status)
747
748- # We're clean, so let's flip back to an idle slave and
749- # confirm that a scan does nothing special.
750+ # We're idle and dirty, so let's flip back to an idle slave and
751+ # confirm that the slave gets cleaned.
752 get_slave.result = OkSlave()
753 yield scanner.scan()
754- self.assertEqual(['status'], get_slave.result.method_log)
755+ self.assertEqual(['resume', 'echo'], get_slave.result.method_log)
756 self.assertIs(None, builder.currentjob)
757+ self.assertEqual(BuilderCleanStatus.CLEAN, builder.clean_status)
758
759
760 class TestPrefetchedBuilderFactory(TestCaseWithFactory):
761@@ -740,74 +756,104 @@
762
763 run_tests_with = AsynchronousDeferredRunTest
764
765+ def getScanner(self, builder_factory=None, interactor=None, slave=None,
766+ behaviour=None):
767+ if builder_factory is None:
768+ builder_factory = MockBuilderFactory(
769+ MockBuilder(virtualized=False), None)
770+ if interactor is None:
771+ interactor = BuilderInteractor()
772+ interactor.updateBuild = FakeMethod()
773+ if slave is None:
774+ slave = OkSlave()
775+ if behaviour is None:
776+ behaviour = TrivialBehaviour()
777+ return SlaveScanner(
778+ 'mock', builder_factory, BufferLogger(),
779+ interactor_factory=FakeMethod(interactor),
780+ slave_factory=FakeMethod(slave),
781+ behaviour_factory=FakeMethod(behaviour))
782+
783 @defer.inlineCallbacks
784 def test_scan_with_job(self):
785 # SlaveScanner.scan calls updateBuild() when a job is building.
786 slave = BuildingSlave('trivial')
787 bq = FakeBuildQueue()
788-
789- # Instrument updateBuild.
790- interactor = BuilderInteractor()
791- interactor.updateBuild = FakeMethod()
792-
793- scanner = SlaveScanner(
794- 'mock', MockBuilderFactory(MockBuilder(), bq), BufferLogger(),
795- interactor_factory=FakeMethod(interactor),
796- slave_factory=FakeMethod(slave),
797- behaviour_factory=FakeMethod(TrivialBehaviour()))
798+ scanner = self.getScanner(
799+ builder_factory=MockBuilderFactory(MockBuilder(), bq),
800+ slave=slave)
801
802 yield scanner.scan()
803 self.assertEqual(['status'], slave.call_log)
804- self.assertEqual(1, interactor.updateBuild.call_count)
805+ self.assertEqual(
806+ 1, scanner.interactor_factory.result.updateBuild.call_count)
807 self.assertEqual(0, bq.reset.call_count)
808
809 @defer.inlineCallbacks
810- def test_scan_aborts_lost_slave_with_job(self):
811- # SlaveScanner.scan uses BuilderInteractor.rescueIfLost to abort
812- # slaves that don't have the expected job.
813+ def test_scan_recovers_lost_slave_with_job(self):
814+ # SlaveScanner.scan identifies slaves that aren't building what
815+ # they should be, resets the jobs, and then aborts the slaves.
816 slave = BuildingSlave('nontrivial')
817 bq = FakeBuildQueue()
818-
819- # Instrument updateBuild.
820- interactor = BuilderInteractor()
821- interactor.updateBuild = FakeMethod()
822-
823- scanner = SlaveScanner(
824- 'mock', MockBuilderFactory(MockBuilder(), bq), BufferLogger(),
825- interactor_factory=FakeMethod(interactor),
826- slave_factory=FakeMethod(slave),
827- behaviour_factory=FakeMethod(TrivialBehaviour()))
828+ builder = MockBuilder(virtualized=False)
829+ scanner = self.getScanner(
830+ builder_factory=MockBuilderFactory(builder, bq),
831+ slave=slave)
832
833 # A single scan will call status(), notice that the slave is lost,
834- # abort() the slave, then reset() the job without calling
835- # updateBuild().
836+ # and reset() the job without calling updateBuild().
837 yield scanner.scan()
838- self.assertEqual(['status', 'abort'], slave.call_log)
839- self.assertEqual(0, interactor.updateBuild.call_count)
840+ self.assertEqual(['status'], slave.call_log)
841+ self.assertEqual(
842+ 0, scanner.interactor_factory.result.updateBuild.call_count)
843 self.assertEqual(1, bq.reset.call_count)
844+ # The reset would normally have unset build_queue.
845+ scanner.builder_factory.updateTestData(builder, None)
846+
847+ # The next scan will see a dirty idle builder with a BUILDING
848+ # slave, and abort() it.
849+ yield scanner.scan()
850+ self.assertEqual(['status', 'status', 'abort'], slave.call_log)
851
852 @defer.inlineCallbacks
853- def test_scan_aborts_lost_slave_when_idle(self):
854- # SlaveScanner.scan uses BuilderInteractor.rescueIfLost to abort
855- # slaves that aren't meant to have a job.
856+ def test_scan_recovers_lost_slave_when_idle(self):
857+ # SlaveScanner.scan identifies slaves that are building when
858+ # they shouldn't be and aborts them.
859 slave = BuildingSlave()
860-
861- # Instrument updateBuild.
862- interactor = BuilderInteractor()
863- interactor.updateBuild = FakeMethod()
864-
865- scanner = SlaveScanner(
866- 'mock', MockBuilderFactory(MockBuilder(), None), BufferLogger(),
867- interactor_factory=FakeMethod(interactor),
868- slave_factory=FakeMethod(slave),
869- behaviour_factory=FakeMethod(None))
870-
871- # A single scan will call status(), notice that the slave is lost,
872- # abort() the slave, then reset() the job without calling
873- # updateBuild().
874+ scanner = self.getScanner(slave=slave)
875 yield scanner.scan()
876 self.assertEqual(['status', 'abort'], slave.call_log)
877- self.assertEqual(0, interactor.updateBuild.call_count)
878+
879+ @defer.inlineCallbacks
880+ def test_scan_building_but_not_dirty_builder_explodes(self):
881+ # Builders with a build assigned must be dirty for safety
882+ # reasons. If we run into one that's clean, we blow up.
883+ slave = BuildingSlave()
884+ builder = MockBuilder(clean_status=BuilderCleanStatus.CLEAN)
885+ bq = FakeBuildQueue()
886+ scanner = self.getScanner(
887+ slave=slave, builder_factory=MockBuilderFactory(builder, bq))
888+
889+ with ExpectedException(
890+ BuildDaemonError, "Non-dirty builder allegedly building."):
891+ yield scanner.scan()
892+ self.assertEqual([], slave.call_log)
893+
894+ @defer.inlineCallbacks
895+ def test_scan_clean_but_not_idle_slave_explodes(self):
896+ # Clean builders by definition have slaves that are idle. If
897+ # an ostensibly clean slave isn't idle, blow up.
898+ slave = BuildingSlave()
899+ builder = MockBuilder(clean_status=BuilderCleanStatus.CLEAN)
900+ scanner = self.getScanner(
901+ slave=slave, builder_factory=MockBuilderFactory(builder, None))
902+
903+ with ExpectedException(
904+ BuildDaemonError,
905+ "Allegedly clean slave not idle "
906+ "\('BuilderStatus.BUILDING' instead\)"):
907+ yield scanner.scan()
908+ self.assertEqual(['status'], slave.call_log)
909
910 def test_getExpectedCookie_caches(self):
911 bf = MockBuilderFactory(MockBuilder(), FakeBuildQueue())
912
913=== modified file 'lib/lp/soyuz/tests/test_binarypackagebuildbehaviour.py'
914--- lib/lp/soyuz/tests/test_binarypackagebuildbehaviour.py 2014-06-06 10:32:55 +0000
915+++ lib/lp/soyuz/tests/test_binarypackagebuildbehaviour.py 2014-06-18 08:49:42 +0000
916@@ -19,6 +19,7 @@
917 from zope.security.proxy import removeSecurityProxy
918
919 from lp.buildmaster.enums import (
920+ BuilderCleanStatus,
921 BuildQueueStatus,
922 BuildStatus,
923 )
924@@ -131,10 +132,7 @@
925 build_log = [
926 ('build', cookie, 'binarypackage', chroot.content.sha1,
927 filemap_names, extra_args)]
928- if builder.virtualized:
929- result = [('echo', 'ping')] + upload_logs + build_log
930- else:
931- result = upload_logs + build_log
932+ result = upload_logs + build_log
933 return result
934
935 def test_non_virtual_ppa_dispatch(self):
936@@ -146,6 +144,7 @@
937 archive = self.factory.makeArchive(virtualized=False)
938 slave = OkSlave()
939 builder = self.factory.makeBuilder(virtualized=False)
940+ builder.setCleanStatus(BuilderCleanStatus.CLEAN)
941 vitals = extract_vitals_from_db(builder)
942 build = self.factory.makeBinaryPackageBuild(
943 builder=builder, archive=archive)
944@@ -164,12 +163,11 @@
945 return d
946
947 def test_virtual_ppa_dispatch(self):
948- # Make sure the builder slave gets reset before a build is
949- # dispatched to it.
950 archive = self.factory.makeArchive(virtualized=True)
951 slave = OkSlave()
952 builder = self.factory.makeBuilder(
953 virtualized=True, vm_host="foohost")
954+ builder.setCleanStatus(BuilderCleanStatus.CLEAN)
955 vitals = extract_vitals_from_db(builder)
956 build = self.factory.makeBinaryPackageBuild(
957 builder=builder, archive=archive)
958@@ -182,22 +180,17 @@
959 d = interactor._startBuild(
960 bq, vitals, builder, slave,
961 interactor.getBuildBehaviour(bq, builder, slave), BufferLogger())
962-
963- def check_build(ignored):
964- # We expect the first call to the slave to be a resume call,
965- # followed by the rest of the usual calls we expect.
966- expected_resume_call = slave.call_log.pop(0)
967- self.assertEqual('resume', expected_resume_call)
968- self.assertExpectedInteraction(
969- ignored, slave.call_log, builder, build, lf, archive,
970- ArchivePurpose.PPA)
971- return d.addCallback(check_build)
972+ d.addCallback(
973+ self.assertExpectedInteraction, slave.call_log, builder, build,
974+ lf, archive, ArchivePurpose.PPA)
975+ return d
976
977 def test_partner_dispatch_no_publishing_history(self):
978 archive = self.factory.makeArchive(
979 virtualized=False, purpose=ArchivePurpose.PARTNER)
980 slave = OkSlave()
981 builder = self.factory.makeBuilder(virtualized=False)
982+ builder.setCleanStatus(BuilderCleanStatus.CLEAN)
983 vitals = extract_vitals_from_db(builder)
984 build = self.factory.makeBinaryPackageBuild(
985 builder=builder, archive=archive)