Merge lp:~cjwatson/launchpad/buildstatus-aborted into lp:launchpad

Proposed by Colin Watson
Status: Merged
Approved by: William Grant
Approved revision: no longer in the source branch.
Merged at revision: 16727
Proposed branch: lp:~cjwatson/launchpad/buildstatus-aborted
Merge into: lp:launchpad
Diff against target: 325 lines (+124/-66)
8 files modified
lib/lp/buildmaster/interfaces/builder.py (+15/-6)
lib/lp/buildmaster/manager.py (+5/-20)
lib/lp/buildmaster/model/builder.py (+27/-9)
lib/lp/buildmaster/model/buildfarmjobbehavior.py (+26/-1)
lib/lp/buildmaster/tests/test_builder.py (+8/-2)
lib/lp/buildmaster/tests/test_buildfarmjobbehavior.py (+29/-3)
lib/lp/buildmaster/tests/test_manager.py (+0/-25)
lib/lp/soyuz/tests/test_binarypackagebuildbehavior.py (+14/-0)
To merge this branch: bzr merge lp:~cjwatson/launchpad/buildstatus-aborted
Reviewer Review Type Date Requested Status
William Grant code Approve
Review via email: mp+176990@code.launchpad.net

Commit message

Add support for ABORTED build slave status, in preparation for using it on slaves.

Description of the change

As discussed at the releng sprint, this branch adds support for an "ABORTED" build slave status, allowing the combination of BuilderStatus.WAITING/BuildStatus.ABORTED on the slave; this makes more sense for cancelling builds than the only other current possibility of BuilderStatus.ABORTED.

After some feedback from William Grant, I also refactored how builder failures are handled, and thereby arranged for an unsolicited ABORTED slave status (i.e. not in response to CANCELLING) to cause the manager to attempt to recover the slave.

I'm not sure we can QA this until the new launchpad-buildd is ready that uses this, at which point we can QA it all at once. However, we should at least check that slave recovery still works in other situations.

To post a comment you must log in.
Revision history for this message
William Grant (wgrant) wrote :

This is too happy to kill non-virtual builders. handleFailure (which is called on eg. a connection failure due to a network glitch) will immediately disable a non-virtual builder, whereas this was previously only done after a guilty judgement by assessFailureCounts. It'll probably also end up double-resetting virtual builders, though that's by no means a critical issue.

review: Needs Fixing (code)
Revision history for this message
Colin Watson (cjwatson) wrote :

How's this? I think I misunderstood your earlier comment about consolidating things into handleFailure.

Revision history for this message
William Grant (wgrant) :
review: Approve (code)

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'lib/lp/buildmaster/interfaces/builder.py'
2--- lib/lp/buildmaster/interfaces/builder.py 2013-04-22 06:26:34 +0000
3+++ lib/lp/buildmaster/interfaces/builder.py 2013-08-06 15:33:57 +0000
4@@ -309,18 +309,27 @@
5 explaining what went wrong.
6 """
7
8- def handleTimeout(logger, error_message):
9- """Handle buildd slave communication timeout situations.
10+ def handleFailure(logger):
11+ """Handle buildd slave failures.
12+
13+ Increment builder and (if possible) job failure counts.
14+ """
15+
16+ def resetOrFail(logger, exception):
17+ """Handle "confirmed" build slave failures.
18+
19+ Call this when there have been multiple failures that are not just
20+ the fault of failing jobs, or when the builder has entered an
21+ ABORTED state without having been asked to do so.
22
23 In case of a virtualized/PPA buildd slave an attempt will be made
24- to reset it first (using `resumeSlaveHost`). Only if that fails
25- will it be (marked as) failed (using `failBuilder`).
26+ to reset it (using `resumeSlaveHost`).
27
28 Conversely, a non-virtualized buildd slave will be (marked as)
29- failed straightaway.
30+ failed straightaway (using `failBuilder`).
31
32 :param logger: The logger object to be used for logging.
33- :param error_message: The error message to be used for logging.
34+ :param exception: An exception to be used for logging.
35 :return: A Deferred that fires after the virtual slave was resumed
36 or immediately if it's a non-virtual slave.
37 """
38
39=== modified file 'lib/lp/buildmaster/manager.py'
40--- lib/lp/buildmaster/manager.py 2013-01-22 00:33:13 +0000
41+++ lib/lp/buildmaster/manager.py 2013-08-06 15:33:57 +0000
42@@ -161,31 +161,16 @@
43 self.builder_name, failure.getErrorMessage(),
44 failure.getTraceback()))
45
46- # Decide if we need to terminate the job or fail the
47- # builder.
48+ # Decide if we need to terminate the job or fail the builder.
49+ builder = get_builder(self.builder_name)
50 try:
51- builder = get_builder(self.builder_name)
52- builder.gotFailure()
53- if builder.currentjob is not None:
54- build_farm_job = builder.getCurrentBuildFarmJob()
55- build_farm_job.gotFailure()
56- self.logger.info(
57- "builder %s failure count: %s, "
58- "job '%s' failure count: %s" % (
59- self.builder_name,
60- builder.failure_count,
61- build_farm_job.title,
62- build_farm_job.failure_count))
63- else:
64- self.logger.info(
65- "Builder %s failed a probe, count: %s" % (
66- self.builder_name, builder.failure_count))
67+ builder.handleFailure(self.logger)
68 assessFailureCounts(builder, failure.getErrorMessage())
69 transaction.commit()
70- except:
71+ except Exception:
72 # Catastrophic code failure! Not much we can do.
73 self.logger.error(
74- "Miserable failure when trying to examine failure counts:\n",
75+ "Miserable failure when trying to handle failure:\n",
76 exc_info=True)
77 transaction.abort()
78
79
80=== modified file 'lib/lp/buildmaster/model/builder.py'
81--- lib/lp/buildmaster/model/builder.py 2013-06-20 05:50:00 +0000
82+++ lib/lp/buildmaster/model/builder.py 2013-08-06 15:33:57 +0000
83@@ -803,15 +803,33 @@
84 d = defer.maybeDeferred(self.startBuild, candidate, logger)
85 return d
86
87- def handleTimeout(self, logger, error_message):
88- """See IBuilder."""
89+ def handleFailure(self, logger):
90+ """See IBuilder."""
91+ self.gotFailure()
92+ if self.currentjob is not None:
93+ build_farm_job = self.getCurrentBuildFarmJob()
94+ build_farm_job.gotFailure()
95+ logger.info(
96+ "Builder %s failure count: %s, job '%s' failure count: %s" % (
97+ self.name, self.failure_count,
98+ build_farm_job.title, build_farm_job.failure_count))
99+ else:
100+ logger.info(
101+ "Builder %s failure count: %s" % (
102+ self.name, self.failure_count))
103+
104+ def resetOrFail(self, logger, exception):
105+ """See IBuilder."""
106+ error_message = str(exception)
107 if self.virtualized:
108- # Virtualized/PPA builder: attempt a reset.
109- logger.warn(
110- "Resetting builder: %s -- %s" % (self.url, error_message),
111- exc_info=True)
112- d = self.resumeSlaveHost()
113- return d
114+ # Virtualized/PPA builder: attempt a reset, unless the failure
115+ # was itself a failure to reset. (In that case, the slave
116+ # scanner will try again until we reach the failure threshold.)
117+ if not isinstance(exception, CannotResumeHost):
118+ logger.warn(
119+ "Resetting builder: %s -- %s" % (self.url, error_message),
120+ exc_info=True)
121+ return self.resumeSlaveHost()
122 else:
123 # XXX: This should really let the failure bubble up to the
124 # scan() method that does the failure counting.
125@@ -819,7 +837,7 @@
126 logger.warn(
127 "Disabling builder: %s -- %s" % (self.url, error_message))
128 self.failBuilder(error_message)
129- return defer.succeed(None)
130+ return defer.succeed(None)
131
132 def findAndStartJob(self):
133 """See IBuilder."""
134
135=== modified file 'lib/lp/buildmaster/model/buildfarmjobbehavior.py'
136--- lib/lp/buildmaster/model/buildfarmjobbehavior.py 2013-01-23 06:41:24 +0000
137+++ lib/lp/buildmaster/model/buildfarmjobbehavior.py 2013-08-06 15:33:57 +0000
138@@ -219,7 +219,8 @@
139 Buildslave can be WAITING in five situations:
140
141 * Build has failed, no filemap is received (PACKAGEFAIL, DEPFAIL,
142- CHROOTFAIL, BUILDERFAIL)
143+ CHROOTFAIL, BUILDERFAIL,
144+ ABORTED)
145
146 * Build has been built successfully (BuildStatus.OK), in this case
147 we have a 'filemap', so we can retrieve those files and store in
148@@ -415,6 +416,30 @@
149 transaction.commit()
150
151 @defer.inlineCallbacks
152+ def _handleStatus_ABORTED(self, librarian, slave_status, logger,
153+ send_notification):
154+ """Handle aborted builds.
155+
156+ If the build was explicitly cancelled, then mark it as such.
157+ Otherwise, the build has failed in some unexpected way.
158+ """
159+ builder = self.build.buildqueue_record.builder
160+ if self.build.status == BuildStatus.CANCELLING:
161+ self.build.buildqueue_record.cancel()
162+ transaction.commit()
163+ yield builder.cleanSlave()
164+ else:
165+ self.build.buildqueue_record.reset()
166+ try:
167+ builder.handleFailure(logger)
168+ yield builder.resetOrFail(logger, BuildSlaveFailure(
169+ "Builder unexpectedly returned ABORTED"))
170+ except Exception as e:
171+ # We've already done our best to mark the builder as failed.
172+ logger.error("Failed to rescue from ABORTED: %s" % e)
173+ transaction.commit()
174+
175+ @defer.inlineCallbacks
176 def _handleStatus_GIVENBACK(self, librarian, slave_status, logger,
177 send_notification):
178 """Handle automatic retry requested by builder.
179
180=== modified file 'lib/lp/buildmaster/tests/test_builder.py'
181--- lib/lp/buildmaster/tests/test_builder.py 2013-06-20 05:50:00 +0000
182+++ lib/lp/buildmaster/tests/test_builder.py 2013-08-06 15:33:57 +0000
183@@ -191,7 +191,13 @@
184 d = builder.resumeSlaveHost()
185 return assert_fails_with(d, CannotResumeHost)
186
187- def test_handleTimeout_resume_failure(self):
188+ def test_handleFailure_increments_failure_count(self):
189+ builder = self.factory.makeBuilder(virtualized=False)
190+ builder.builderok = True
191+ builder.handleFailure(BufferLogger())
192+ self.assertEqual(1, builder.failure_count)
193+
194+ def test_resetOrFail_resume_failure(self):
195 reset_fail_config = """
196 [builddmaster]
197 vm_resume_command: /bin/false"""
198@@ -199,7 +205,7 @@
199 self.addCleanup(config.pop, 'reset fail')
200 builder = self.factory.makeBuilder(virtualized=True, vm_host="pop")
201 builder.builderok = True
202- d = builder.handleTimeout(BufferLogger(), 'blah')
203+ d = builder.resetOrFail(BufferLogger(), Exception())
204 return assert_fails_with(d, CannotResumeHost)
205
206 def _setupBuilder(self):
207
208=== modified file 'lib/lp/buildmaster/tests/test_buildfarmjobbehavior.py'
209--- lib/lp/buildmaster/tests/test_buildfarmjobbehavior.py 2013-01-30 07:30:10 +0000
210+++ lib/lp/buildmaster/tests/test_buildfarmjobbehavior.py 2013-08-06 15:33:57 +0000
211@@ -214,10 +214,11 @@
212 self.build = self.makeBuild()
213 # For the moment, we require a builder for the build so that
214 # handleStatus_OK can get a reference to the slave.
215- builder = self.factory.makeBuilder()
216- self.build.buildqueue_record.builder = builder
217+ self.builder = self.factory.makeBuilder()
218+ self.build.buildqueue_record.builder = self.builder
219 self.build.buildqueue_record.setDateStarted(UTC_NOW)
220- self.behavior = removeSecurityProxy(builder.current_build_behavior)
221+ self.behavior = removeSecurityProxy(
222+ self.builder.current_build_behavior)
223 self.slave = WaitingSlave('BuildStatus.OK')
224 self.slave.valid_file_hashes.append('test_file_hash')
225 self.patch(BuilderSlave, 'makeBuilderSlave', FakeMethod(self.slave))
226@@ -321,6 +322,31 @@
227 def test_handleStatus_PACKAGEFAIL_notifies(self):
228 return self._test_handleStatus_notifies("PACKAGEFAIL")
229
230+ def test_handleStatus_ABORTED_cancels_cancelling(self):
231+ self.build.updateStatus(BuildStatus.CANCELLING)
232+
233+ def got_status(ignored):
234+ self.assertEqual(
235+ 0, len(pop_notifications()), "Notifications received")
236+ self.assertEqual(BuildStatus.CANCELLED, self.build.status)
237+
238+ d = self.behavior.handleStatus("ABORTED", None, {})
239+ return d.addCallback(got_status)
240+
241+ def test_handleStatus_ABORTED_recovers_building(self):
242+ self.builder.vm_host = "fake_vm_host"
243+ self.build.updateStatus(BuildStatus.BUILDING)
244+
245+ def got_status(ignored):
246+ self.assertEqual(
247+ 0, len(pop_notifications()), "Notifications received")
248+ self.assertEqual(BuildStatus.NEEDSBUILD, self.build.status)
249+ self.assertEqual(1, self.builder.failure_count)
250+ self.assertIn("resume", self.slave.call_log)
251+
252+ d = self.behavior.handleStatus("ABORTED", None, {})
253+ return d.addCallback(got_status)
254+
255 def test_date_finished_set(self):
256 # The date finished is updated during handleStatus_OK.
257 self.assertEqual(None, self.build.date_finished)
258
259=== modified file 'lib/lp/buildmaster/tests/test_manager.py'
260--- lib/lp/buildmaster/tests/test_manager.py 2013-02-04 06:14:32 +0000
261+++ lib/lp/buildmaster/tests/test_manager.py 2013-08-06 15:33:57 +0000
262@@ -162,31 +162,6 @@
263 self.assertTrue(builder.builderok)
264 self.assertTrue(builder.currentjob is None)
265
266- def testNoDispatchForMissingChroots(self):
267- # When a required chroot is not present the `scan` method
268- # should not return any `RecordingSlaves` to be processed
269- # and the builder used should remain active and IDLE.
270-
271- # Reset sampledata builder.
272- builder = getUtility(IBuilderSet)[BOB_THE_BUILDER_NAME]
273- self._resetBuilder(builder)
274-
275- # Remove hoary/i386 chroot.
276- login('foo.bar@canonical.com')
277- ubuntu = getUtility(IDistributionSet).getByName('ubuntu')
278- hoary = ubuntu.getSeries('hoary')
279- pocket_chroot = hoary.getDistroArchSeries('i386').getPocketChroot()
280- removeSecurityProxy(pocket_chroot).chroot = None
281- transaction.commit()
282- login(ANONYMOUS)
283-
284- # Run 'scan' and check its result.
285- switch_dbuser(config.builddmaster.dbuser)
286- scanner = self._getScanner()
287- d = defer.maybeDeferred(scanner.singleCycle)
288- d.addCallback(self._checkNoDispatch, builder)
289- return d
290-
291 def _checkJobRescued(self, slave, builder, job):
292 """`SlaveScanner.scan` rescued the job.
293
294
295=== modified file 'lib/lp/soyuz/tests/test_binarypackagebuildbehavior.py'
296--- lib/lp/soyuz/tests/test_binarypackagebuildbehavior.py 2013-01-24 05:50:23 +0000
297+++ lib/lp/soyuz/tests/test_binarypackagebuildbehavior.py 2013-08-06 15:33:57 +0000
298@@ -19,6 +19,7 @@
299 from zope.security.proxy import removeSecurityProxy
300
301 from lp.buildmaster.enums import BuildStatus
302+from lp.buildmaster.interfaces.builder import CannotBuild
303 from lp.buildmaster.model.builder import BuilderSlave
304 from lp.buildmaster.tests.mock_slaves import (
305 AbortedSlave,
306@@ -264,6 +265,19 @@
307 'Attempt to build virtual item on a non-virtual builder.',
308 str(e))
309
310+ def test_verifyBuildRequest_no_chroot(self):
311+ # Don't dispatch a build when the DAS has no chroot.
312+ archive = self.factory.makeArchive(purpose=ArchivePurpose.PRIMARY)
313+ builder = self.factory.makeBuilder()
314+ build = self.factory.makeBinaryPackageBuild(
315+ builder=builder, archive=archive)
316+ candidate = build.queueBuild()
317+ behavior = candidate.required_build_behavior
318+ behavior.setBuilder(builder)
319+ e = self.assertRaises(
320+ CannotBuild, behavior.verifyBuildRequest, BufferLogger())
321+ self.assertIn("Missing CHROOT", str(e))
322+
323 def test_getBuildCookie(self):
324 # A build cookie is made up of the job type and record id.
325 # The uploadprocessor relies on this format.