Merge ~wgrant/launchpad:buildd-manager-failure-metrics into launchpad:master

Proposed by William Grant
Status: Merged
Approved by: William Grant
Approved revision: 812872093d67ad2597cd96713d2152e3410180c7
Merge reported by: Otto Co-Pilot
Merged at revision: not available
Proposed branch: ~wgrant/launchpad:buildd-manager-failure-metrics
Merge into: launchpad:master
Prerequisite: ~wgrant/launchpad:buildd-manager-failure-refactor
Diff against target: 293 lines (+124/-37)
2 files modified
lib/lp/buildmaster/manager.py (+31/-24)
lib/lp/buildmaster/tests/test_manager.py (+93/-13)
Reviewer Review Type Date Requested Status
Colin Watson (community) Approve
Review via email: mp+454693@code.launchpad.net

Commit message

Tweak buildd-manager failure handling metrics

Description of the change

They all now start with "builders.failure", labels are more consistent,
and builder failure/reset are counted.

To post a comment you must log in.
Revision history for this message
Colin Watson (cjwatson) wrote :

I guess we don't currently have any graphs monitoring the existing metric names, so the slightly gratuitous (although neater) renaming is probably OK.

review: Approve
8128720... by William Grant

Rename base failure count metric to not clash with numbercruncher

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1diff --git a/lib/lp/buildmaster/manager.py b/lib/lp/buildmaster/manager.py
2index 0c6d335..cf29548 100644
3--- a/lib/lp/buildmaster/manager.py
4+++ b/lib/lp/buildmaster/manager.py
5@@ -312,6 +312,23 @@ class PrefetchedBuilderFactory(BaseBuilderFactory):
6 return self.candidates.pop(vitals)
7
8
9+def get_statsd_labels(builder, build):
10+ labels = {
11+ "builder_name": builder.name,
12+ "region": builder.region,
13+ "virtualized": str(builder.virtualized),
14+ }
15+ if build is not None:
16+ labels.update(
17+ {
18+ "build": True,
19+ "arch": build.processor.name,
20+ "job_type": build.job_type.name,
21+ }
22+ )
23+ return labels
24+
25+
26 def judge_failure(builder_count, job_count, exc, retry=True):
27 """Judge how to recover from a scan failure.
28
29@@ -394,22 +411,15 @@ def recover_failure(logger, vitals, builder, retry, exception):
30 job_action,
31 )
32
33- if job is not None and job_action is not None:
34- statsd_client = getUtility(IStatsdClient)
35- labels = {
36- "build": True,
37- "arch": job.specific_build.processor.name,
38- "region": builder.region,
39- "builder_name": builder.name,
40- "virtualized": str(builder.virtualized),
41- "job_type": job.specific_build.job_type.name,
42- }
43+ statsd_client = getUtility(IStatsdClient)
44+ labels = get_statsd_labels(builder, job.specific_build if job else None)
45
46+ if job is not None and job_action is not None:
47 if cancelling:
48 # We've previously been asked to cancel the job, so just set
49 # it to cancelled rather than retrying or failing.
50 logger.info("Cancelling job %s.", job.build_cookie)
51- statsd_client.incr("builders.job_cancelled", labels=labels)
52+ statsd_client.incr("builders.failure.job_cancelled", labels=labels)
53 job.markAsCancelled()
54 elif job_action == False:
55 # Fail and dequeue the job.
56@@ -430,12 +440,12 @@ def recover_failure(logger, vitals, builder, retry, exception):
57 job.specific_build.updateStatus(
58 BuildStatus.FAILEDTOBUILD, force_invalid_transition=True
59 )
60- statsd_client.incr("builders.job_failed", labels=labels)
61+ statsd_client.incr("builders.failure.job_failed", labels=labels)
62 job.destroySelf()
63 elif job_action == True:
64 # Reset the job so it will be retried elsewhere.
65 logger.info("Requeueing job %s.", job.build_cookie)
66- statsd_client.incr("builders.job_reset", labels=labels)
67+ statsd_client.incr("builders.failure.job_reset", labels=labels)
68 job.reset()
69
70 if job_action == False:
71@@ -447,12 +457,14 @@ def recover_failure(logger, vitals, builder, retry, exception):
72 # We've already tried resetting it enough times, so we have
73 # little choice but to give up.
74 logger.info("Failing builder %s.", builder.name)
75+ statsd_client.incr("builders.failure.builder_failed", labels=labels)
76 builder.failBuilder(str(exception))
77 elif builder_action == True:
78 # Dirty the builder to attempt recovery. In the virtual case,
79 # the dirty idleness will cause a reset, giving us a good chance
80 # of recovery.
81 logger.info("Dirtying builder %s to attempt recovery.", builder.name)
82+ statsd_client.incr("builders.failure.builder_reset", labels=labels)
83 builder.setCleanStatus(BuilderCleanStatus.DIRTY)
84
85
86@@ -583,20 +595,15 @@ class WorkerScanner:
87 vitals = self.builder_factory.getVitals(self.builder_name)
88 builder = self.builder_factory[self.builder_name]
89 try:
90+ labels = get_statsd_labels(builder, builder.current_build)
91+ self.statsd_client.incr(
92+ "builders.failure.scan_failed", labels=labels
93+ )
94+
95 builder.gotFailure()
96- labels = {}
97 if builder.current_build is not None:
98 builder.current_build.gotFailure()
99- labels.update(
100- {
101- "build": True,
102- "arch": builder.current_build.processor.name,
103- "region": builder.region,
104- }
105- )
106- else:
107- labels["build"] = False
108- self.statsd_client.incr("builders.judged_failed", labels=labels)
109+
110 recover_failure(self.logger, vitals, builder, retry, exc)
111 transaction.commit()
112 except Exception:
113diff --git a/lib/lp/buildmaster/tests/test_manager.py b/lib/lp/buildmaster/tests/test_manager.py
114index b87e95c..2d3b6d5 100644
115--- a/lib/lp/buildmaster/tests/test_manager.py
116+++ b/lib/lp/buildmaster/tests/test_manager.py
117@@ -555,13 +555,20 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
118 transaction.commit()
119
120 yield scanner.singleCycle()
121- self.assertEqual(2, self.stats_client.incr.call_count)
122+ self.assertEqual(3, self.stats_client.incr.call_count)
123 self.stats_client.incr.assert_has_calls(
124 [
125 mock.call(
126 "build.reset,arch=386,env=test,job_type=PACKAGEBUILD"
127 ),
128- mock.call("builders.judged_failed,build=False,env=test"),
129+ mock.call(
130+ "builders.failure.scan_failed,builder_name=bob,env=test,"
131+ "region=,virtualized=False"
132+ ),
133+ mock.call(
134+ "builders.failure.builder_failed,builder_name=bob,"
135+ "env=test,region=,virtualized=False"
136+ ),
137 ]
138 )
139
140@@ -1411,9 +1418,9 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
141 self.stats_client.incr.assert_has_calls(
142 [
143 mock.call(
144- "builders.job_reset,arch={},build=True,builder_name={},"
145- "env=test,job_type=RECIPEBRANCHBUILD,region={},"
146- "virtualized=True".format(
147+ "builders.failure.job_reset,arch={},build=True,"
148+ "builder_name={},env=test,job_type=RECIPEBRANCHBUILD,"
149+ "region={},virtualized=True".format(
150 build.processor.name,
151 self.builder.name,
152 self.builder.region,
153@@ -1477,7 +1484,7 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
154 self.stats_client.incr.assert_has_calls(
155 [
156 mock.call(
157- "builders.job_cancelled,arch={},build=True,"
158+ "builders.failure.job_cancelled,arch={},build=True,"
159 "builder_name={},env=test,job_type=RECIPEBRANCHBUILD,"
160 "region=builder-name,virtualized=True".format(
161 naked_build.processor.name,
162@@ -1519,9 +1526,9 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
163 )
164 ),
165 mock.call(
166- "builders.job_failed,arch={},build=True,builder_name={},"
167- "env=test,job_type=RECIPEBRANCHBUILD,region=builder-name,"
168- "virtualized=True".format(
169+ "builders.failure.job_failed,arch={},build=True,"
170+ "builder_name={},env=test,job_type=RECIPEBRANCHBUILD,"
171+ "region=builder-name,virtualized=True".format(
172 naked_build.processor.name,
173 self.builder.name,
174 )
175@@ -1561,9 +1568,9 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
176 )
177 ),
178 mock.call(
179- "builders.job_failed,arch={},build=True,builder_name={},"
180- "env=test,job_type=RECIPEBRANCHBUILD,region=builder-name,"
181- "virtualized=True".format(
182+ "builders.failure.job_failed,arch={},build=True,"
183+ "builder_name={},env=test,job_type=RECIPEBRANCHBUILD,"
184+ "region=builder-name,virtualized=True".format(
185 naked_build.processor.name,
186 self.builder.name,
187 )
188@@ -1588,7 +1595,7 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
189 self.stats_client.incr.assert_has_calls(
190 [
191 mock.call(
192- "builders.job_cancelled,arch={},build=True,"
193+ "builders.failure.job_cancelled,arch={},build=True,"
194 "builder_name={},env=test,job_type=RECIPEBRANCHBUILD,"
195 "region=builder-name,virtualized=True".format(
196 naked_build.processor.name,
197@@ -1621,6 +1628,36 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
198 self.assertIs(None, self.build.builder)
199 self.assertEqual(BuilderCleanStatus.DIRTY, self.builder.clean_status)
200 self.assertTrue(self.builder.builderok)
201+ self.assertEqual(3, self.stats_client.incr.call_count)
202+ self.stats_client.incr.assert_has_calls(
203+ [
204+ mock.call(
205+ "builders.failure.job_reset,arch={},build=True,"
206+ "builder_name={},env=test,job_type=RECIPEBRANCHBUILD,"
207+ "region=builder-name,virtualized=True".format(
208+ self.build.processor.name,
209+ self.builder.name,
210+ )
211+ ),
212+ mock.call(
213+ "build.reset,arch={},builder_name={},env=test,"
214+ "job_type=RECIPEBRANCHBUILD,region=builder-name,"
215+ "virtualized=True".format(
216+ self.build.processor.name,
217+ self.builder.name,
218+ )
219+ ),
220+ mock.call(
221+ "builders.failure.builder_reset,arch={},build=True,"
222+ "builder_name={},env=test,job_type=RECIPEBRANCHBUILD,"
223+ "region=builder-name,virtualized=True".format(
224+ self.build.processor.name,
225+ self.builder.name,
226+ )
227+ ),
228+ ]
229+ )
230+ self.stats_client.incr.reset_mock()
231
232 # But if the builder continues to cause trouble, it will be
233 # disabled.
234@@ -1634,16 +1671,59 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
235 self.assertEqual(BuilderCleanStatus.DIRTY, self.builder.clean_status)
236 self.assertFalse(self.builder.builderok)
237 self.assertEqual("failnotes", self.builder.failnotes)
238+ self.assertEqual(3, self.stats_client.incr.call_count)
239+ self.stats_client.incr.assert_has_calls(
240+ [
241+ mock.call(
242+ "builders.failure.job_reset,arch={},build=True,"
243+ "builder_name={},env=test,job_type=RECIPEBRANCHBUILD,"
244+ "region=builder-name,virtualized=True".format(
245+ self.build.processor.name,
246+ self.builder.name,
247+ )
248+ ),
249+ mock.call(
250+ "build.reset,arch={},builder_name={},env=test,"
251+ "job_type=RECIPEBRANCHBUILD,region=builder-name,"
252+ "virtualized=True".format(
253+ self.build.processor.name,
254+ self.builder.name,
255+ )
256+ ),
257+ mock.call(
258+ "builders.failure.builder_failed,arch={},build=True,"
259+ "builder_name={},env=test,job_type=RECIPEBRANCHBUILD,"
260+ "region=builder-name,virtualized=True".format(
261+ self.build.processor.name,
262+ self.builder.name,
263+ )
264+ ),
265+ ]
266+ )
267+ self.stats_client.incr.reset_mock()
268
269 def test_builder_failing_with_no_attached_job(self):
270 self.buildqueue.reset()
271 self.builder.failure_count = BUILDER_FAILURE_THRESHOLD
272+ self.stats_client.incr.reset_mock()
273
274 log = self._recover_failure("failnotes")
275 self.assertIn("with no job", log)
276 self.assertIn("Failing builder", log)
277 self.assertFalse(self.builder.builderok)
278 self.assertEqual("failnotes", self.builder.failnotes)
279+ self.assertEqual(1, self.stats_client.incr.call_count)
280+ self.stats_client.incr.assert_has_calls(
281+ [
282+ mock.call(
283+ "builders.failure.builder_failed,builder_name={},"
284+ "env=test,region=builder-name,virtualized=True".format(
285+ self.builder.name,
286+ )
287+ ),
288+ ]
289+ )
290+ self.stats_client.incr.reset_mock()
291
292
293 class TestNewBuilders(TestCase):

Subscribers

People subscribed via source and target branches

to status/vote changes: