Merge ~lgp171188/launchpad:emit-statsd-metrics-build-failures into launchpad:master

Proposed by Guruprasad
Status: Merged
Approved by: Guruprasad
Approved revision: a768650cc12a092421b4e477a3745bf3fc832eee
Merge reported by: Otto Co-Pilot
Merged at revision: not available
Proposed branch: ~lgp171188/launchpad:emit-statsd-metrics-build-failures
Merge into: launchpad:master
Diff against target: 237 lines (+126/-1)
2 files modified
lib/lp/buildmaster/manager.py (+12/-0)
lib/lp/buildmaster/tests/test_manager.py (+114/-1)
Reviewer | Review Type | Date Requested | Status
Colin Watson (community) | | | Approve
Review via email: mp+440721@code.launchpad.net

Commit message

Emit statsd metrics when recovering from build job failure

To post a comment you must log in.
Revision history for this message
Colin Watson (cjwatson) :
review: Approve
Revision history for this message
Guruprasad (lgp171188) :

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1diff --git a/lib/lp/buildmaster/manager.py b/lib/lp/buildmaster/manager.py
2index ae46630..67d564a 100644
3--- a/lib/lp/buildmaster/manager.py
4+++ b/lib/lp/buildmaster/manager.py
5@@ -397,10 +397,20 @@ def recover_failure(logger, vitals, builder, retry, exception):
6 )
7
8 if job is not None and job_action is not None:
9+ statsd_client = getUtility(IStatsdClient)
10+ labels = {
11+ "build": True,
12+ "arch": job.specific_build.processor.name,
13+ "builder_name": builder.name,
14+ "virtualized": str(builder.virtualized),
15+ "job_type": job.specific_build.job_type.name,
16+ }
17+
18 if cancelling:
19 # We've previously been asked to cancel the job, so just set
20 # it to cancelled rather than retrying or failing.
21 logger.info("Cancelling job %s.", job.build_cookie)
22+ statsd_client.incr("builders.job_cancelled", labels=labels)
23 job.markAsCancelled()
24 elif job_action == False:
25 # Fail and dequeue the job.
26@@ -421,10 +431,12 @@ def recover_failure(logger, vitals, builder, retry, exception):
27 job.specific_build.updateStatus(
28 BuildStatus.FAILEDTOBUILD, force_invalid_transition=True
29 )
30+ statsd_client.incr("builders.job_failed", labels=labels)
31 job.destroySelf()
32 elif job_action == True:
33 # Reset the job so it will be retried elsewhere.
34 logger.info("Requeueing job %s.", job.build_cookie)
35+ statsd_client.incr("builders.job_reset", labels=labels)
36 job.reset()
37
38 if job_action == False:
39diff --git a/lib/lp/buildmaster/tests/test_manager.py b/lib/lp/buildmaster/tests/test_manager.py
40index a568d64..dfd7530 100644
41--- a/lib/lp/buildmaster/tests/test_manager.py
42+++ b/lib/lp/buildmaster/tests/test_manager.py
43@@ -1385,7 +1385,7 @@ class TestBuilddManager(TestCase):
44 self.assertNotEqual(0, manager.flushLogTails.call_count)
45
46
47-class TestFailureAssessments(TestCaseWithFactory):
48+class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
49
50 layer = ZopelessDatabaseLayer
51
52@@ -1396,6 +1396,7 @@ class TestFailureAssessments(TestCaseWithFactory):
53 self.buildqueue = self.build.queueBuild()
54 self.buildqueue.markAsBuilding(self.builder)
55 self.worker = OkWorker()
56+ self.setUpStats()
57
58 def _recover_failure(self, fail_notes, retry=True):
59 # Helper for recover_failure boilerplate.
60@@ -1409,6 +1410,29 @@ class TestFailureAssessments(TestCaseWithFactory):
61 )
62 return logger.getLogBuffer()
63
64+ def assert_statsd_metrics_requeue(self):
65+ build = removeSecurityProxy(self.build)
66+ self.assertEqual(2, self.stats_client.incr.call_count)
67+ self.stats_client.incr.assert_has_calls(
68+ [
69+ mock.call(
70+ "builders.job_reset,arch={},build=True,builder_name={},"
71+ "env=test,job_type=RECIPEBRANCHBUILD,"
72+ "virtualized=True".format(
73+ build.processor.name,
74+ self.builder.name,
75+ )
76+ ),
77+ mock.call(
78+ "build.reset,arch={},builder_name={},env=test,"
79+ "job_type=RECIPEBRANCHBUILD,virtualized=True".format(
80+ build.processor.name,
81+ self.builder.name,
82+ )
83+ ),
84+ ]
85+ )
86+
87 def test_job_reset_threshold_with_retry(self):
88 naked_build = removeSecurityProxy(self.build)
89 self.builder.failure_count = JOB_RESET_THRESHOLD - 1
90@@ -1426,6 +1450,7 @@ class TestFailureAssessments(TestCaseWithFactory):
91 self.assertIn("Requeueing job", log)
92 self.assertIs(None, self.builder.currentjob)
93 self.assertEqual(self.build.status, BuildStatus.NEEDSBUILD)
94+ self.assert_statsd_metrics_requeue()
95
96 def test_job_reset_threshold_no_retry(self):
97 naked_build = removeSecurityProxy(self.build)
98@@ -1436,6 +1461,7 @@ class TestFailureAssessments(TestCaseWithFactory):
99 self.assertIn("Requeueing job", log)
100 self.assertIs(None, self.builder.currentjob)
101 self.assertEqual(self.build.status, BuildStatus.NEEDSBUILD)
102+ self.assert_statsd_metrics_requeue()
103
104 def test_reset_during_cancellation_cancels(self):
105 self.buildqueue.cancel()
106@@ -1449,11 +1475,33 @@ class TestFailureAssessments(TestCaseWithFactory):
107 self.assertIn("Cancelling job", log)
108 self.assertIs(None, self.builder.currentjob)
109 self.assertEqual(BuildStatus.CANCELLED, self.build.status)
110+ self.assertEqual(2, self.stats_client.incr.call_count)
111+ self.stats_client.incr.assert_has_calls(
112+ [
113+ mock.call(
114+ "builders.job_cancelled,arch={},build=True,"
115+ "builder_name={},env=test,job_type=RECIPEBRANCHBUILD,"
116+ "virtualized=True".format(
117+ naked_build.processor.name,
118+ self.builder.name,
119+ )
120+ ),
121+ mock.call(
122+ "build.finished,arch={},builder_name={},env=test,"
123+ "job_type=RECIPEBRANCHBUILD,status=CANCELLED,"
124+ "virtualized=True".format(
125+ naked_build.processor.name,
126+ self.builder.name,
127+ )
128+ ),
129+ ]
130+ )
131
132 def test_job_failing_more_than_builder_fails_job(self):
133 self.build.gotFailure()
134 self.build.gotFailure()
135 self.builder.gotFailure()
136+ naked_build = removeSecurityProxy(self.build)
137
138 log = self._recover_failure("failnotes")
139 self.assertIn("Failing job", log)
140@@ -1461,6 +1509,27 @@ class TestFailureAssessments(TestCaseWithFactory):
141 self.assertIs(None, self.builder.currentjob)
142 self.assertEqual(self.build.status, BuildStatus.FAILEDTOBUILD)
143 self.assertEqual(0, self.builder.failure_count)
144+ self.assertEqual(2, self.stats_client.incr.call_count)
145+ self.stats_client.incr.assert_has_calls(
146+ [
147+ mock.call(
148+ "build.finished,arch={},builder_name={},env=test,"
149+ "job_type=RECIPEBRANCHBUILD,status=FAILEDTOBUILD,"
150+ "virtualized=True".format(
151+ naked_build.processor.name,
152+ self.builder.name,
153+ )
154+ ),
155+ mock.call(
156+ "builders.job_failed,arch={},build=True,builder_name={},"
157+ "env=test,job_type=RECIPEBRANCHBUILD,"
158+ "virtualized=True".format(
159+ naked_build.processor.name,
160+ self.builder.name,
161+ )
162+ ),
163+ ]
164+ )
165
166 def test_bad_job_does_not_unsucceed(self):
167 # If a FULLYBUILT build somehow ends up back in buildd-manager,
168@@ -1473,6 +1542,7 @@ class TestFailureAssessments(TestCaseWithFactory):
169 self.build.gotFailure()
170 self.build.gotFailure()
171 self.builder.gotFailure()
172+ naked_build = removeSecurityProxy(self.build)
173
174 log = self._recover_failure("failnotes")
175 self.assertIn("Failing job", log)
176@@ -1481,6 +1551,27 @@ class TestFailureAssessments(TestCaseWithFactory):
177 self.assertIs(None, self.builder.currentjob)
178 self.assertEqual(self.build.status, BuildStatus.FULLYBUILT)
179 self.assertEqual(0, self.builder.failure_count)
180+ self.assertEqual(2, self.stats_client.incr.call_count)
181+ self.stats_client.incr.assert_has_calls(
182+ [
183+ mock.call(
184+ "build.finished,arch={},builder_name={},env=test,"
185+ "job_type=RECIPEBRANCHBUILD,status=FULLYBUILT,"
186+ "virtualized=True".format(
187+ naked_build.processor.name,
188+ self.builder.name,
189+ )
190+ ),
191+ mock.call(
192+ "builders.job_failed,arch={},build=True,builder_name={},"
193+ "env=test,job_type=RECIPEBRANCHBUILD,"
194+ "virtualized=True".format(
195+ naked_build.processor.name,
196+ self.builder.name,
197+ )
198+ ),
199+ ]
200+ )
201
202 def test_failure_during_cancellation_cancels(self):
203 self.buildqueue.cancel()
204@@ -1489,11 +1580,33 @@ class TestFailureAssessments(TestCaseWithFactory):
205 self.build.gotFailure()
206 self.build.gotFailure()
207 self.builder.gotFailure()
208+ naked_build = removeSecurityProxy(self.build)
209 log = self._recover_failure("failnotes")
210 self.assertIn("Cancelling job", log)
211 self.assertIn("Resetting failure count of builder", log)
212 self.assertIs(None, self.builder.currentjob)
213 self.assertEqual(BuildStatus.CANCELLED, self.build.status)
214+ self.assertEqual(2, self.stats_client.incr.call_count)
215+ self.stats_client.incr.assert_has_calls(
216+ [
217+ mock.call(
218+ "builders.job_cancelled,arch={},build=True,"
219+ "builder_name={},env=test,job_type=RECIPEBRANCHBUILD,"
220+ "virtualized=True".format(
221+ naked_build.processor.name,
222+ self.builder.name,
223+ )
224+ ),
225+ mock.call(
226+ "build.finished,arch={},builder_name={},env=test,"
227+ "job_type=RECIPEBRANCHBUILD,status=CANCELLED,"
228+ "virtualized=True".format(
229+ naked_build.processor.name,
230+ self.builder.name,
231+ )
232+ ),
233+ ]
234+ )
235
236 def test_bad_builder(self):
237 self.builder.setCleanStatus(BuilderCleanStatus.CLEAN)

Subscribers

People subscribed via source and target branches

to status/vote changes: