Merge lp:~cjwatson/launchpad/buildmaster-cancel-properly into lp:launchpad

Proposed by Colin Watson
Status: Merged
Approved by: Colin Watson
Approved revision: no longer in the source branch.
Merged at revision: 16748
Proposed branch: lp:~cjwatson/launchpad/buildmaster-cancel-properly
Merge into: lp:launchpad
Prerequisite: lp:~cjwatson/launchpad/buildstatus-aborted
Diff against target: 424 lines (+144/-98)
6 files modified
lib/lp/buildmaster/interactor.py (+0/-10)
lib/lp/buildmaster/manager.py (+50/-17)
lib/lp/buildmaster/tests/mock_slaves.py (+5/-1)
lib/lp/buildmaster/tests/test_builder.py (+9/-13)
lib/lp/buildmaster/tests/test_manager.py (+79/-54)
lib/lp/soyuz/model/binarypackagebuild.py (+1/-3)
To merge this branch: bzr merge lp:~cjwatson/launchpad/buildmaster-cancel-properly
Reviewer Review Type Date Requested Status
William Grant code Approve
Review via email: mp+177580@code.launchpad.net

Commit message

Cancel builds by sending an "abort" command to the build slave, with a three-minute timeout after which we give up and rescue the builder. This should now work even on non-virtualised builders.

Description of the change

This is the second half of the master side of build cancellation, following on from the first half of the master side in https://code.launchpad.net/~cjwatson/launchpad/buildstatus-aborted/+merge/176990, and the slave side in https://code.launchpad.net/~cjwatson/launchpad-buildd/fix-abort/+merge/177003 (both those branches must be landed first). With a working slave side we mainly just need to remove previous restrictions and arrange for a timeout in the event that the builder entirely fails to abort.

The bug that had previously led test_abort to be disabled should be fixed by using scan-for-processes on the slave rather than killing a specific process.

We should be able to get rid of most of BuilderStatus.ABORTED once this is all deployed and working.

For QA, we should test on dogfood with both virtualised and non-virtualised builders, using builds that (a) hang "normally" but are reasonably killable and (b) attempt to spawn processes faster than scan-for-processes can kill them. It might also be a good idea to attempt some kind of fake slave that can be told to enter the ABORTED state without the master asking it to do so.

To post a comment you must log in.
Revision history for this message
William Grant (wgrant) wrote :

59 + self.logger.info("Build '%s' failed to cancel" % build.title)

I'd mention the builder here to make logs searchable.

80 + builder.requestAbort()
81 + self.date_cancel = self._clock.seconds() + self.CANCEL_TIMEOUT
82 + return defer.succeed(False)

requestAbort returns a Deferred that you might want to handle.

Additionally, what will happen if I restart buildd-manager while the builder is ABORTING? It'll attempt to call requestAbort again, which will probably fail, but it'll be ignored because you don't handle the Deferred.

review: Approve (code)

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'lib/lp/buildmaster/interactor.py'
2--- lib/lp/buildmaster/interactor.py 2013-08-29 04:58:10 +0000
3+++ lib/lp/buildmaster/interactor.py 2013-08-29 12:10:56 +0000
4@@ -364,16 +364,6 @@
5 # the job.
6 if (status == 'BuilderStatus.ABORTED'
7 and self.builder.currentjob is None):
8- if not self.builder.virtualized:
9- # We can't reset non-virtual builders reliably as the
10- # abort() function doesn't kill the actual build job,
11- # only the sbuild process! All we can do here is fail
12- # the builder with a message indicating the problem and
13- # wait for an admin to reboot it.
14- self.builder.failBuilder(
15- "Non-virtual builder in ABORTED state, requires admin "
16- "to restart")
17- return "dummy status"
18 if logger is not None:
19 logger.info(
20 "Builder '%s' being cleaned up from ABORTED" %
21
22=== modified file 'lib/lp/buildmaster/manager.py'
23--- lib/lp/buildmaster/manager.py 2013-08-29 04:28:50 +0000
24+++ lib/lp/buildmaster/manager.py 2013-08-29 12:10:56 +0000
25@@ -1,4 +1,4 @@
26-# Copyright 2009-2012 Canonical Ltd. This software is licensed under the
27+# Copyright 2009-2013 Canonical Ltd. This software is licensed under the
28 # GNU Affero General Public License version 3 (see the file LICENSE).
29
30 """Soyuz buildd slave manager logic."""
31@@ -114,13 +114,25 @@
32 # algorithm for polling.
33 SCAN_INTERVAL = 15
34
35- def __init__(self, builder_name, logger):
36+ # The time before deciding that a cancelling builder has failed, in
37+ # seconds. This should normally be a multiple of SCAN_INTERVAL, and
38+ # greater than abort_timeout in launchpad-buildd's slave BuildManager.
39+ CANCEL_TIMEOUT = 180
40+
41+ def __init__(self, builder_name, logger, clock=None):
42 self.builder_name = builder_name
43 self.logger = logger
44+ # Use the clock if provided, so that tests can advance it. Use the
45+ # reactor by default.
46+ if clock is None:
47+ clock = reactor
48+ self._clock = clock
49+ self.date_cancel = None
50
51 def startCycle(self):
52 """Scan the builder and dispatch to it or deal with failures."""
53 self.loop = LoopingCall(self.singleCycle)
54+ self.loop.clock = self._clock
55 self.stopping_deferred = self.loop.start(self.SCAN_INTERVAL)
56 return self.stopping_deferred
57
58@@ -172,32 +184,53 @@
59 exc_info=True)
60 transaction.abort()
61
62+ @defer.inlineCallbacks
63 def checkCancellation(self, builder):
64 """See if there is a pending cancellation request.
65
66 If the current build is in status CANCELLING then terminate it
67 immediately.
68
69- :return: A deferred whose value is True if we cancelled the build.
70+ :return: A deferred whose value is True if we recovered the builder
71+ by resuming a slave host, so that there is no need to update its
72+ status.
73 """
74- if not builder.virtualized:
75- return defer.succeed(False)
76 buildqueue = self.builder.getBuildQueue()
77 if not buildqueue:
78- return defer.succeed(False)
79+ self.date_cancel = None
80+ defer.returnValue(False)
81 build = buildqueue.specific_job.build
82 if build.status != BuildStatus.CANCELLING:
83- return defer.succeed(False)
84-
85- def resume_done(ignored):
86- return defer.succeed(True)
87-
88- self.logger.info("Cancelling build '%s'" % build.title)
89- buildqueue.cancel()
90- transaction.commit()
91- d = self.interactor.resumeSlaveHost()
92- d.addCallback(resume_done)
93- return d
94+ self.date_cancel = None
95+ defer.returnValue(False)
96+
97+ try:
98+ if self.date_cancel is None:
99+ self.logger.info("Cancelling build '%s'" % build.title)
100+ yield self.interactor.requestAbort()
101+ self.date_cancel = self._clock.seconds() + self.CANCEL_TIMEOUT
102+ defer.returnValue(False)
103+ else:
104+ # The BuildFarmJob will normally set the build's status to
105+ # something other than CANCELLING once the builder responds to
106+ # the cancel request. This timeout is in case it doesn't.
107+ if self._clock.seconds() < self.date_cancel:
108+ self.logger.info(
109+ "Waiting for build '%s' to cancel" % build.title)
110+ defer.returnValue(False)
111+ else:
112+ raise BuildSlaveFailure(
113+ "Build '%s' cancellation timed out" % build.title)
114+ except Exception as e:
115+ self.logger.info(
116+ "Build '%s' on %s failed to cancel" %
117+ (build.title, self.builder.name))
118+ self.date_cancel = None
119+ buildqueue.cancel()
120+ transaction.commit()
121+ value = yield self.interactor.resetOrFail(self.logger, e)
122+ # value is not None if we resumed a slave host.
123+ defer.returnValue(value is not None)
124
125 def scan(self):
126 """Probe the builder and update/dispatch/collect as appropriate.
127
128=== modified file 'lib/lp/buildmaster/tests/mock_slaves.py'
129--- lib/lp/buildmaster/tests/mock_slaves.py 2013-08-29 04:27:12 +0000
130+++ lib/lp/buildmaster/tests/mock_slaves.py 2013-08-29 12:10:56 +0000
131@@ -1,4 +1,4 @@
132-# Copyright 2009-2011 Canonical Ltd. This software is licensed under the
133+# Copyright 2009-2013 Canonical Ltd. This software is licensed under the
134 # GNU Affero General Public License version 3 (see the file LICENSE).
135
136 """Mock Build objects for tests soyuz buildd-system."""
137@@ -222,6 +222,10 @@
138 self.call_log.append('abort')
139 return defer.fail(xmlrpclib.Fault(8002, "Could not abort"))
140
141+ def resume(self):
142+ self.call_log.append('resume')
143+ return defer.succeed(("", "", 0))
144+
145
146 class BrokenSlave:
147 """A mock slave that reports that it is broken."""
148
149=== modified file 'lib/lp/buildmaster/tests/test_builder.py'
150--- lib/lp/buildmaster/tests/test_builder.py 2013-08-29 04:12:56 +0000
151+++ lib/lp/buildmaster/tests/test_builder.py 2013-08-29 12:10:56 +0000
152@@ -1,4 +1,4 @@
153-# Copyright 2009-2010 Canonical Ltd. This software is licensed under the
154+# Copyright 2009-2013 Canonical Ltd. This software is licensed under the
155 # GNU Affero General Public License version 3 (see the file LICENSE).
156
157 """Test Builder features."""
158@@ -359,18 +359,17 @@
159 return d.addCallback(check_slave_calls)
160
161 def test_recovery_of_aborted_nonvirtual_slave(self):
162- # Nonvirtual slaves in the ABORTED state cannot be reliably
163- # cleaned since the sbuild process doesn't properly kill the
164- # build job. We test that the builder is marked failed.
165+ # If a nonvirtual slave is in the ABORTED state,
166+ # rescueBuilderIfLost should clean it if we don't think it's
167+ # currently building anything.
168 aborted_slave = AbortedSlave()
169- builder = MockBuilder(virtualized=False, builderok=True)
170+ builder = MockBuilder(virtualized=False, builderok=False)
171 d = BuilderInteractor(builder, aborted_slave).rescueIfLost()
172
173- def check_failed(ignored):
174- self.assertFalse(builder.builderok)
175- self.assertNotIn('clean', aborted_slave.call_log)
176+ def check_slave_calls(ignored):
177+ self.assertIn('clean', aborted_slave.call_log)
178
179- return d.addCallback(check_failed)
180+ return d.addCallback(check_slave_calls)
181
182 def test_recover_ok_slave(self):
183 # An idle slave is not rescued.
184@@ -885,10 +884,7 @@
185 super(TestSlave, self).setUp()
186 self.slave_helper = self.useFixture(SlaveTestHelpers())
187
188- # XXX 2010-10-06 Julian bug=655559
189- # This is failing on buildbot but not locally; it's trying to abort
190- # before the build has started.
191- def disabled_test_abort(self):
192+ def test_abort(self):
193 slave = self.slave_helper.getClientSlave()
194 # We need to be in a BUILDING state before we can abort.
195 d = self.slave_helper.triggerGoodBuild(slave)
196
197=== modified file 'lib/lp/buildmaster/tests/test_manager.py'
198--- lib/lp/buildmaster/tests/test_manager.py 2013-08-27 09:17:07 +0000
199+++ lib/lp/buildmaster/tests/test_manager.py 2013-08-29 12:10:56 +0000
200@@ -1,4 +1,4 @@
201-# Copyright 2009-2010 Canonical Ltd. This software is licensed under the
202+# Copyright 2009-2013 Canonical Ltd. This software is licensed under the
203 # GNU Affero General Public License version 3 (see the file LICENSE).
204
205 """Tests for the renovated slave scanner aka BuilddManager."""
206@@ -42,6 +42,7 @@
207 from lp.buildmaster.tests.mock_slaves import (
208 BrokenSlave,
209 BuildingSlave,
210+ LostBuildingBrokenSlave,
211 make_publisher,
212 OkSlave,
213 )
214@@ -112,14 +113,14 @@
215 self.assertEqual(build.status, BuildStatus.BUILDING)
216 self.assertEqual(job.logtail, logtail)
217
218- def _getScanner(self, builder_name=None):
219+ def _getScanner(self, builder_name=None, clock=None):
220 """Instantiate a SlaveScanner object.
221
222 Replace its default logging handler by a testing version.
223 """
224 if builder_name is None:
225 builder_name = BOB_THE_BUILDER_NAME
226- scanner = SlaveScanner(builder_name, BufferLogger())
227+ scanner = SlaveScanner(builder_name, BufferLogger(), clock=clock)
228 scanner.logger.name = 'slave-scanner'
229
230 return scanner
231@@ -395,19 +396,14 @@
232
233 return d.addCallback(check)
234
235+ @defer.inlineCallbacks
236 def test_cancelling_a_build(self):
237 # When scanning an in-progress build, if its state is CANCELLING
238- # then the build should be stopped and moved to the CANCELLED state.
239-
240- # Set up a building slave with a fake resume method so we can see
241- # if it got called later.
242- slave = BuildingSlave(build_id="8-1")
243- call_counter = FakeMethod()
244-
245- def fake_resume():
246- call_counter()
247- return defer.succeed((None, None, 0))
248- slave.resume = fake_resume
249+ # then the build should be aborted, and eventually stopped and moved
250+ # to the CANCELLED state if it does not abort by itself.
251+
252+ # Set up a mock building slave.
253+ slave = BuildingSlave()
254
255 # Set the sample data builder building with the slave from above.
256 builder = getUtility(IBuilderSet)[BOB_THE_BUILDER_NAME]
257@@ -420,6 +416,7 @@
258 transaction.commit()
259 login(ANONYMOUS)
260 buildqueue = builder.currentjob
261+ slave.build_id = buildqueue.specific_job.generateSlaveBuildCookie()
262 self.assertBuildingJob(buildqueue, builder)
263
264 # Now set the build to CANCELLING.
265@@ -428,18 +425,28 @@
266
267 # Run 'scan' and check its results.
268 switch_dbuser(config.builddmaster.dbuser)
269- scanner = self._getScanner()
270- d = scanner.scan()
271-
272- # The build state should be cancelled and we should have also
273- # called the resume() method on the slave that resets the virtual
274- # machine.
275- def check_cancelled(ignore, builder, buildqueue):
276- self.assertEqual(1, call_counter.call_count)
277- self.assertEqual(BuildStatus.CANCELLED, build.status)
278-
279- d.addCallback(check_cancelled, builder, buildqueue)
280- return d
281+ clock = task.Clock()
282+ scanner = self._getScanner(clock=clock)
283+ yield scanner.scan()
284+
285+ # An abort request should be sent.
286+ self.assertEqual(1, slave.call_log.count("abort"))
287+ self.assertEqual(BuildStatus.CANCELLING, build.status)
288+
289+ # Advance time a little. Nothing much should happen.
290+ clock.advance(1)
291+ yield scanner.scan()
292+ self.assertEqual(1, slave.call_log.count("abort"))
293+ self.assertEqual(BuildStatus.CANCELLING, build.status)
294+
295+ # Advance past the timeout. The build state should be cancelled and
296+ # we should have also called the resume() method on the slave that
297+ # resets the virtual machine.
298+ clock.advance(SlaveScanner.CANCEL_TIMEOUT)
299+ yield scanner.scan()
300+ self.assertEqual(1, slave.call_log.count("abort"))
301+ self.assertEqual(1, slave.call_log.count("resume"))
302+ self.assertEqual(BuildStatus.CANCELLED, build.status)
303
304
305 class TestCancellationChecking(TestCaseWithFactory):
306@@ -453,15 +460,18 @@
307 builder_name = BOB_THE_BUILDER_NAME
308 self.builder = getUtility(IBuilderSet)[builder_name]
309 self.builder.virtualized = True
310- self.scanner = SlaveScanner(builder_name, BufferLogger())
311- self.scanner.builder = self.builder
312- self.scanner.interactor = BuilderInteractor(self.builder)
313- self.scanner.logger.name = 'slave-scanner'
314+
315+ def _getScanner(self, clock=None):
316+ scanner = SlaveScanner(None, BufferLogger(), clock=clock)
317+ scanner.builder = self.builder
318+ scanner.interactor = BuilderInteractor(self.builder)
319+ scanner.logger.name = 'slave-scanner'
320+ return scanner
321
322 def test_ignores_nonvirtual(self):
323 # If the builder is nonvirtual make sure we return False.
324 self.builder.virtualized = False
325- d = self.scanner.checkCancellation(self.builder)
326+ d = self._getScanner().checkCancellation(self.builder)
327 return d.addCallback(self.assertFalse)
328
329 def test_ignores_no_buildqueue(self):
330@@ -469,37 +479,52 @@
331 # make sure we return False.
332 buildqueue = self.builder.currentjob
333 buildqueue.reset()
334- d = self.scanner.checkCancellation(self.builder)
335+ d = self._getScanner().checkCancellation(self.builder)
336 return d.addCallback(self.assertFalse)
337
338 def test_ignores_build_not_cancelling(self):
339 # If the active build is not in a CANCELLING state, ignore it.
340- d = self.scanner.checkCancellation(self.builder)
341+ d = self._getScanner().checkCancellation(self.builder)
342 return d.addCallback(self.assertFalse)
343
344+ @defer.inlineCallbacks
345 def test_cancelling_build_is_cancelled(self):
346- # If a build is CANCELLING, make sure True is returned and the
347- # slave was resumed.
348- call_counter = FakeMethod()
349-
350- def fake_resume():
351- call_counter()
352- return defer.succeed((None, None, 0))
353+ # If a build is CANCELLING and the cancel timeout expires, make sure
354+ # True is returned and the slave was resumed.
355 slave = OkSlave()
356- slave.resume = fake_resume
357- self.builder.vm_host = "fake_vm_host"
358- self.patch(BuilderSlave, 'makeBuilderSlave', FakeMethod(slave))
359- buildqueue = self.builder.currentjob
360- build = getUtility(IBinaryPackageBuildSet).getByQueueEntry(buildqueue)
361- build.updateStatus(BuildStatus.CANCELLING)
362-
363- def check(result):
364- self.assertEqual(1, call_counter.call_count)
365- self.assertTrue(result)
366- self.assertEqual(BuildStatus.CANCELLED, build.status)
367-
368- d = self.scanner.checkCancellation(self.builder)
369- return d.addCallback(check)
370+ self.builder.vm_host = "fake_vm_host"
371+ self.patch(BuilderSlave, 'makeBuilderSlave', FakeMethod(slave))
372+ buildqueue = self.builder.currentjob
373+ build = getUtility(IBinaryPackageBuildSet).getByQueueEntry(buildqueue)
374+ build.updateStatus(BuildStatus.CANCELLING)
375+ clock = task.Clock()
376+ scanner = self._getScanner(clock=clock)
377+
378+ result = yield scanner.checkCancellation(self.builder)
379+ self.assertNotIn("resume", slave.call_log)
380+ self.assertFalse(result)
381+ self.assertEqual(BuildStatus.CANCELLING, build.status)
382+
383+ clock.advance(SlaveScanner.CANCEL_TIMEOUT)
384+ result = yield scanner.checkCancellation(self.builder)
385+ self.assertEqual(1, slave.call_log.count("resume"))
386+ self.assertTrue(result)
387+ self.assertEqual(BuildStatus.CANCELLED, build.status)
388+
389+ @defer.inlineCallbacks
390+ def test_lost_build_is_cancelled(self):
391+ # If the builder reports a fault while attempting to abort it, then
392+ # immediately resume the slave as if the cancel timeout had expired.
393+ slave = LostBuildingBrokenSlave()
394+ self.builder.vm_host = "fake_vm_host"
395+ self.patch(BuilderSlave, 'makeBuilderSlave', FakeMethod(slave))
396+ buildqueue = self.builder.currentjob
397+ build = getUtility(IBinaryPackageBuildSet).getByQueueEntry(buildqueue)
398+ build.updateStatus(BuildStatus.CANCELLING)
399+ result = yield self._getScanner().checkCancellation(self.builder)
400+ self.assertEqual(1, slave.call_log.count("resume"))
401+ self.assertTrue(result)
402+ self.assertEqual(BuildStatus.CANCELLED, build.status)
403
404
405 class TestBuilddManager(TestCase):
406
407=== modified file 'lib/lp/soyuz/model/binarypackagebuild.py'
408--- lib/lp/soyuz/model/binarypackagebuild.py 2013-08-28 04:40:32 +0000
409+++ lib/lp/soyuz/model/binarypackagebuild.py 2013-08-29 12:10:56 +0000
410@@ -1,4 +1,4 @@
411-# Copyright 2009-2012 Canonical Ltd. This software is licensed under the
412+# Copyright 2009-2013 Canonical Ltd. This software is licensed under the
413 # GNU Affero General Public License version 3 (see the file LICENSE).
414
415 __metaclass__ = type
416@@ -381,8 +381,6 @@
417 """See `IBuild`."""
418 if not self.buildqueue_record:
419 return False
420- if self.buildqueue_record.virtualized is False:
421- return False
422
423 cancellable_statuses = [
424 BuildStatus.BUILDING,