Merge lp:~cjwatson/launchpad/failure-count-reset into lp:launchpad

Proposed by Colin Watson
Status: Merged
Approved by: Colin Watson
Approved revision: no longer in the source branch.
Merged at revision: 16752
Proposed branch: lp:~cjwatson/launchpad/failure-count-reset
Merge into: lp:launchpad
Diff against target: 319 lines (+114/-60)
3 files modified
lib/lp/buildmaster/manager.py (+24/-7)
lib/lp/buildmaster/model/builder.py (+8/-3)
lib/lp/buildmaster/tests/test_manager.py (+82/-50)
To merge this branch: bzr merge lp:~cjwatson/launchpad/failure-count-reset
Reviewer Review Type Date Requested Status
William Grant code Approve
Review via email: mp+183320@code.launchpad.net

Commit message

Try to reset virtual builders a couple of times before giving up and failing them.

Description of the change

buildd-manager often determines that a virtual builder has failed without trying to reset it first. As discussed with William, this branch arranges to try to ppa-reset it twice before giving up, going through the usual failure counting in between resets. It will take three times as long to fail a builder completely if it's really unrecoverable, but hopefully many of them can be brought back by way of these automatic ppa-resets.

To post a comment you must log in.
Revision history for this message
William Grant (wgrant) wrote :

The resume won't actually occur, because you don't pass back the Deferred returned by resetOrFail. Otherwise this looks fine, as long as the failnotes still have reasonable content.

review: Needs Fixing (code)
Revision history for this message
Colin Watson (cjwatson) wrote :

This should be better now.

Revision history for this message
William Grant (wgrant) :
review: Approve (code)

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'lib/lp/buildmaster/manager.py'
2--- lib/lp/buildmaster/manager.py 2013-08-29 12:09:40 +0000
3+++ lib/lp/buildmaster/manager.py 2013-08-31 16:18:34 +0000
4@@ -45,11 +45,17 @@
5 return getUtility(IBuilderSet)[name]
6
7
8-def assessFailureCounts(builder, fail_notes):
9- """View builder/job failure_count and work out which needs to die. """
10+@defer.inlineCallbacks
11+def assessFailureCounts(logger, interactor, exception):
12+ """View builder/job failure_count and work out which needs to die.
13+
14+ :return: A Deferred that fires either immediately or after a virtual
15+ slave has been reset.
16+ """
17 # builder.currentjob hides a complicated query, don't run it twice.
18 # See bug 623281 (Note that currentjob is a cachedproperty).
19
20+ builder = interactor.builder
21 del get_property_cache(builder).currentjob
22 current_job = builder.currentjob
23 if current_job is None:
24@@ -79,9 +85,15 @@
25 # failing jobs because sometimes they get unresponsive due to
26 # human error, flaky networks etc. We expect the builder to get
27 # better, whereas jobs are very unlikely to get better.
28- if builder.failure_count >= Builder.FAILURE_THRESHOLD:
29- # It's also gone over the threshold so let's disable it.
30- builder.failBuilder(fail_notes)
31+ if builder.failure_count >= (
32+ Builder.RESET_THRESHOLD * Builder.RESET_FAILURE_THRESHOLD):
33+ # We've already tried resetting it enough times, so we have
34+ # little choice but to give up.
35+ builder.failBuilder(str(exception))
36+ elif builder.failure_count % Builder.RESET_THRESHOLD == 0:
37+ # The builder is dead, but in the virtual case it might be worth
38+ # resetting it.
39+ yield interactor.resetOrFail(logger, exception)
40 else:
41 # The job is the culprit! Override its status to 'failed'
42 # to make sure it won't get automatically dispatched again,
43@@ -147,11 +159,15 @@
44 d.addErrback(self._scanFailed)
45 return d
46
47+ @defer.inlineCallbacks
48 def _scanFailed(self, failure):
49 """Deal with failures encountered during the scan cycle.
50
51 1. Print the error in the log
52 2. Increment and assess failure counts on the builder and job.
53+
54+ :return: A Deferred that fires either immediately or after a virtual
55+ slave has been reset.
56 """
57 # Make sure that pending database updates are removed as it
58 # could leave the database in an inconsistent state (e.g. The
59@@ -171,11 +187,12 @@
60 self.builder_name, failure.getErrorMessage(),
61 failure.getTraceback()))
62
63- # Decide if we need to terminate the job or fail the builder.
64+ # Decide if we need to terminate the job or reset/fail the builder.
65 builder = get_builder(self.builder_name)
66 try:
67 builder.handleFailure(self.logger)
68- assessFailureCounts(builder, failure.getErrorMessage())
69+ yield assessFailureCounts(
70+ self.logger, BuilderInteractor(builder), failure.value)
71 transaction.commit()
72 except Exception:
73 # Catastrophic code failure! Not much we can do.
74
75=== modified file 'lib/lp/buildmaster/model/builder.py'
76--- lib/lp/buildmaster/model/builder.py 2013-08-28 12:03:57 +0000
77+++ lib/lp/buildmaster/model/builder.py 2013-08-31 16:18:34 +0000
78@@ -84,9 +84,14 @@
79 active = BoolCol(dbName='active', notNull=True, default=True)
80 failure_count = IntCol(dbName='failure_count', default=0, notNull=True)
81
82- # The number of times a builder can consecutively fail before we
83- # give up and mark it builderok=False.
84- FAILURE_THRESHOLD = 5
85+ # The number of times a builder can consecutively fail before we try
86+ # resetting it (if virtual) or marking it builderok=False (if not).
87+ RESET_THRESHOLD = 5
88+
89+ # The number of times a virtual builder can reach its reset threshold
90+ # due to consecutive failures before we give up and mark it
91+ # builderok=False.
92+ RESET_FAILURE_THRESHOLD = 3
93
94 def _getBuilderok(self):
95 return self._builderok
96
97=== modified file 'lib/lp/buildmaster/tests/test_manager.py'
98--- lib/lp/buildmaster/tests/test_manager.py 2013-08-28 12:03:57 +0000
99+++ lib/lp/buildmaster/tests/test_manager.py 2013-08-31 16:18:34 +0000
100@@ -260,6 +260,7 @@
101 d.addCallback(self._checkNoDispatch, builder)
102 return d
103
104+ @defer.inlineCallbacks
105 def test_scan_with_not_ok_builder(self):
106 # Reset sampledata builder.
107 builder = getUtility(IBuilderSet)[BOB_THE_BUILDER_NAME]
108@@ -267,11 +268,9 @@
109 self.patch(BuilderSlave, 'makeBuilderSlave', FakeMethod(OkSlave()))
110 builder.builderok = False
111 scanner = self._getScanner()
112- d = scanner.scan()
113+ yield scanner.scan()
114 # Because the builder is not ok, we can't use _checkNoDispatch.
115- d.addCallback(
116- lambda ignored: self.assertIs(None, builder.currentjob))
117- return d
118+ self.assertIsNone(builder.currentjob)
119
120 def test_scan_of_broken_slave(self):
121 builder = getUtility(IBuilderSet)[BOB_THE_BUILDER_NAME]
122@@ -283,6 +282,7 @@
123 d = scanner.scan()
124 return assert_fails_with(d, xmlrpclib.Fault)
125
126+ @defer.inlineCallbacks
127 def _assertFailureCounting(self, builder_count, job_count,
128 expected_builder_count, expected_job_count):
129 # If scan() fails with an exception, failure_counts should be
130@@ -306,20 +306,16 @@
131
132 # singleCycle() calls scan() which is our fake one that throws an
133 # exception.
134- d = scanner.singleCycle()
135+ yield scanner.singleCycle()
136
137 # Failure counts should be updated, and the assessment method
138 # should have been called. The actual behaviour is tested below
139 # in TestFailureAssessments.
140- def got_scan(ignored):
141- self.assertEqual(expected_builder_count, builder.failure_count)
142- self.assertEqual(
143- expected_job_count,
144- builder.currentjob.specific_job.build.failure_count)
145- self.assertEqual(
146- 1, manager_module.assessFailureCounts.call_count)
147-
148- return d.addCallback(got_scan)
149+ self.assertEqual(expected_builder_count, builder.failure_count)
150+ self.assertEqual(
151+ expected_job_count,
152+ builder.currentjob.specific_job.build.failure_count)
153+ self.assertEqual(1, manager_module.assessFailureCounts.call_count)
154
155 def test_scan_first_fail(self):
156 # The first failure of a job should result in the failure_count
157@@ -342,23 +338,22 @@
158 builder_count=0, job_count=1, expected_builder_count=1,
159 expected_job_count=2)
160
161+ @defer.inlineCallbacks
162 def test_scanFailed_handles_lack_of_a_job_on_the_builder(self):
163 def failing_scan():
164 return defer.fail(Exception("fake exception"))
165 scanner = self._getScanner()
166 scanner.scan = failing_scan
167 builder = getUtility(IBuilderSet)[scanner.builder_name]
168- builder.failure_count = Builder.FAILURE_THRESHOLD
169+ builder.failure_count = (
170+ Builder.RESET_THRESHOLD * Builder.RESET_FAILURE_THRESHOLD)
171 builder.currentjob.reset()
172 self.layer.txn.commit()
173
174- d = scanner.singleCycle()
175-
176- def scan_finished(ignored):
177- self.assertFalse(builder.builderok)
178-
179- return d.addCallback(scan_finished)
180-
181+ yield scanner.singleCycle()
182+ self.assertFalse(builder.builderok)
183+
184+ @defer.inlineCallbacks
185 def test_fail_to_resume_slave_resets_job(self):
186 # If an attempt to resume and dispatch a slave fails, it should
187 # reset the job via job.reset()
188@@ -382,19 +377,15 @@
189 job = removeSecurityProxy(builder._findBuildCandidate())
190 job.virtualized = True
191 builder.virtualized = True
192- d = scanner.singleCycle()
193-
194- def check(ignored):
195- # The failure_count will have been incremented on the
196- # builder, we can check that to see that a dispatch attempt
197- # did indeed occur.
198- self.assertEqual(1, builder.failure_count)
199- # There should also be no builder set on the job.
200- self.assertTrue(job.builder is None)
201- build = getUtility(IBinaryPackageBuildSet).getByQueueEntry(job)
202- self.assertEqual(build.status, BuildStatus.NEEDSBUILD)
203-
204- return d.addCallback(check)
205+ yield scanner.singleCycle()
206+
207+ # The failure_count will have been incremented on the builder, we
208+ # can check that to see that a dispatch attempt did indeed occur.
209+ self.assertEqual(1, builder.failure_count)
210+ # There should also be no builder set on the job.
211+ self.assertIsNone(job.builder)
212+ build = getUtility(IBinaryPackageBuildSet).getByQueueEntry(job)
213+ self.assertEqual(build.status, BuildStatus.NEEDSBUILD)
214
215 @defer.inlineCallbacks
216 def test_cancelling_a_build(self):
217@@ -573,47 +564,88 @@
218 self.build = self.factory.makeSourcePackageRecipeBuild()
219 self.buildqueue = self.build.queueBuild()
220 self.buildqueue.markAsBuilding(self.builder)
221-
222+ self.interactor = BuilderInteractor(self.builder)
223+
224+ def _assessFailureCounts(self, fail_notes):
225+ # Helper for assessFailureCounts boilerplate.
226+ return assessFailureCounts(
227+ BufferLogger(), self.interactor, Exception(fail_notes))
228+
229+ @defer.inlineCallbacks
230 def test_equal_failures_reset_job(self):
231 self.builder.gotFailure()
232 self.builder.getCurrentBuildFarmJob().gotFailure()
233
234- assessFailureCounts(self.builder, "failnotes")
235+ yield self._assessFailureCounts("failnotes")
236 self.assertIs(None, self.builder.currentjob)
237 self.assertEqual(self.build.status, BuildStatus.NEEDSBUILD)
238
239+ @defer.inlineCallbacks
240 def test_job_failing_more_than_builder_fails_job(self):
241 self.builder.getCurrentBuildFarmJob().gotFailure()
242 self.builder.getCurrentBuildFarmJob().gotFailure()
243 self.builder.gotFailure()
244
245- assessFailureCounts(self.builder, "failnotes")
246+ yield self._assessFailureCounts("failnotes")
247 self.assertIs(None, self.builder.currentjob)
248 self.assertEqual(self.build.status, BuildStatus.FAILEDTOBUILD)
249 self.assertEqual(0, self.builder.failure_count)
250
251- def test_builder_failing_more_than_job_but_under_fail_threshold(self):
252- self.builder.failure_count = Builder.FAILURE_THRESHOLD - 1
253-
254- assessFailureCounts(self.builder, "failnotes")
255- self.assertIs(None, self.builder.currentjob)
256- self.assertEqual(self.build.status, BuildStatus.NEEDSBUILD)
257+ @defer.inlineCallbacks
258+ def test_virtual_builder_reset_thresholds(self):
259+ self.builder.virtualized = True
260+ self.patch(self.interactor, "resumeSlaveHost", FakeMethod())
261+
262+ for failure_count in range(
263+ Builder.RESET_THRESHOLD - 1,
264+ Builder.RESET_THRESHOLD * Builder.RESET_FAILURE_THRESHOLD):
265+ self.builder.failure_count = failure_count
266+ yield self._assessFailureCounts("failnotes")
267+ self.assertIs(None, self.builder.currentjob)
268+ self.assertEqual(self.build.status, BuildStatus.NEEDSBUILD)
269+ self.assertEqual(
270+ failure_count // Builder.RESET_THRESHOLD,
271+ self.interactor.resumeSlaveHost.call_count)
272+ self.assertTrue(self.builder.builderok)
273+
274+ self.builder.failure_count = (
275+ Builder.RESET_THRESHOLD * Builder.RESET_FAILURE_THRESHOLD)
276+ yield self._assessFailureCounts("failnotes")
277+ self.assertIs(None, self.builder.currentjob)
278+ self.assertEqual(self.build.status, BuildStatus.NEEDSBUILD)
279+ self.assertEqual(
280+ Builder.RESET_FAILURE_THRESHOLD - 1,
281+ self.interactor.resumeSlaveHost.call_count)
282+ self.assertFalse(self.builder.builderok)
283+ self.assertEqual("failnotes", self.builder.failnotes)
284+
285+ @defer.inlineCallbacks
286+ def test_non_virtual_builder_reset_thresholds(self):
287+ self.builder.virtualized = False
288+ self.patch(self.interactor, "resumeSlaveHost", FakeMethod())
289+
290+ self.builder.failure_count = Builder.RESET_THRESHOLD - 1
291+ yield self._assessFailureCounts("failnotes")
292+ self.assertIs(None, self.builder.currentjob)
293+ self.assertEqual(self.build.status, BuildStatus.NEEDSBUILD)
294+ self.assertEqual(0, self.interactor.resumeSlaveHost.call_count)
295 self.assertTrue(self.builder.builderok)
296
297- def test_builder_failing_more_than_job_but_over_fail_threshold(self):
298- self.builder.failure_count = Builder.FAILURE_THRESHOLD
299-
300- assessFailureCounts(self.builder, "failnotes")
301+ self.builder.failure_count = Builder.RESET_THRESHOLD
302+ yield self._assessFailureCounts("failnotes")
303 self.assertIs(None, self.builder.currentjob)
304 self.assertEqual(self.build.status, BuildStatus.NEEDSBUILD)
305+ self.assertEqual(0, self.interactor.resumeSlaveHost.call_count)
306 self.assertFalse(self.builder.builderok)
307 self.assertEqual("failnotes", self.builder.failnotes)
308
309+ @defer.inlineCallbacks
310 def test_builder_failing_with_no_attached_job(self):
311 self.buildqueue.reset()
312- self.builder.failure_count = Builder.FAILURE_THRESHOLD
313+ self.builder.failure_count = (
314+ Builder.RESET_THRESHOLD * Builder.RESET_FAILURE_THRESHOLD)
315
316- assessFailureCounts(self.builder, "failnotes")
317+ yield self._assessFailureCounts("failnotes")
318 self.assertFalse(self.builder.builderok)
319 self.assertEqual("failnotes", self.builder.failnotes)
320