Launchpad itself

Merge ~twom/launchpad:stats-better-builder-stats-stacking into launchpad:master

Proposed by Tom Wardill on 2020-10-27

Status:	Merged
Approved by:	Tom Wardill on 2020-10-28
Approved revision:	e8163c3a596f6d86126bac3b4820625c391a894f
Merge reported by:	Otto Co-Pilot
Merged at revision:	not available
Proposed branch:	~twom/launchpad:stats-better-builder-stats-stacking
Merge into:	launchpad:master
Diff against target:	233 lines (+62/-24) 10 files modified lib/lp/buildmaster/manager.py (+6/-3) lib/lp/buildmaster/model/buildfarmjobbehaviour.py (+5/-3) lib/lp/buildmaster/tests/test_buildfarmjobbehaviour.py (+2/-1) lib/lp/services/config/schema-lazr.conf (+1/-0) lib/lp/services/statsd/model/statsd_client.py (+6/-0) lib/lp/services/statsd/numbercruncher.py (+5/-5) lib/lp/services/statsd/tests/__init__.py (+1/-0) lib/lp/services/statsd/tests/test_numbercruncher.py (+14/-6) lib/lp/services/statsd/tests/test_statsd_client.py (+14/-2) lib/lp/services/webapp/publication.py (+8/-4)
Related bugs:	Link a bug report

Reviewer	Date Requested	Status
Colin Watson (community)		Approve on 2020-10-28
Ioana Lasc (community)	2020-10-27	Approve on 2020-10-28
Review via email: mp+392892@code.launchpad.net

Commit message

Move to use statsd environment config variable

Description of the change

The prefix setting in statsd doesn't let us do the final queries in quite the way we wanted, so add another setting for it and apply it as a label in all the metrics.

Also increase number cruncher logging level to DEBUG to see if we can work out why it appears to stop from the logs.

Revision history for this message

Ioana Lasc (ilasc) wrote on 2020-10-28:

LGTM

review: Approve

Revision history for this message

Colin Watson (cjwatson) wrote on 2020-10-28:

Ideally I'd like the LP client to fill in the extra label automatically rather than having to repeat it everywhere, but this will do fine for now.

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk

Subscribers

People subscribed via source and target branches

to all changes:

Canonical Launchpad Engineering

Christina A Reitbauer

Edson_g2020_ubuntuone Gomes

Jordan

Kevin bush

Maximiliano Bertacchini

Tom Wardill

noômen

to status/vote changes:

asimbol83

1	diff --git a/lib/lp/buildmaster/manager.py b/lib/lp/buildmaster/manager.py
2	index 7bbe4aa..e9732ea 100644
3	--- a/lib/lp/buildmaster/manager.py
4	+++ b/lib/lp/buildmaster/manager.py
5	@@ -532,10 +532,13 @@ class SlaveScanner:
6	if builder.current_build is not None:
7	builder.current_build.gotFailure()
8	self.statsd_client.incr(
9	- 'builders.judged_failed,build=True,arch={}'.format(
10	- builder.current_build.processor.name))
11	+ 'builders.judged_failed,build=True,arch={},env={}'.format(
12	+ builder.current_build.processor.name,
13	+ self.statsd_client.lp_environment))
14	else:
15	- self.statsd_client.incr('builders.judged_failed,build=False')
16	+ self.statsd_client.incr(
17	+ 'builders.judged_failed,build=False,env={}'.format(
18	+ self.statsd_client.lp_environment))
19	recover_failure(self.logger, vitals, builder, retry, failure.value)
20	transaction.commit()
21	except Exception:
22	diff --git a/lib/lp/buildmaster/model/buildfarmjobbehaviour.py b/lib/lp/buildmaster/model/buildfarmjobbehaviour.py
23	index edff53b..88ede6a 100644
24	--- a/lib/lp/buildmaster/model/buildfarmjobbehaviour.py
25	+++ b/lib/lp/buildmaster/model/buildfarmjobbehaviour.py
26	@@ -156,10 +156,12 @@ class BuildFarmJobBehaviourBase:
27	# Update stats
28	job_type = getattr(self.build, 'job_type', None)
29	job_type_name = job_type.name if job_type else 'UNKNOWN'
30	- getUtility(IStatsdClient).incr(
31	- 'build.count,job_type={},builder_name={}'.format(
32	+ statsd_client = getUtility(IStatsdClient)
33	+ statsd_client.incr(
34	+ 'build.count,job_type={},builder_name={},env={}'.format(
35	job_type_name,
36	- self._builder.name))
37	+ self._builder.name,
38	+ statsd_client.lp_environment))
39
40	logger.info(
41	"Job %s (%s) started on %s: %s %s"
42	diff --git a/lib/lp/buildmaster/tests/test_buildfarmjobbehaviour.py b/lib/lp/buildmaster/tests/test_buildfarmjobbehaviour.py
43	index 3bdf53d..06b0e3f 100644
44	--- a/lib/lp/buildmaster/tests/test_buildfarmjobbehaviour.py
45	+++ b/lib/lp/buildmaster/tests/test_buildfarmjobbehaviour.py
46	@@ -275,7 +275,8 @@ class TestDispatchBuildToSlave(StatsMixin, TestCase):
47	self.assertEqual(1, self.stats_client.incr.call_count)
48	self.assertEqual(
49	self.stats_client.incr.call_args_list[0][0],
50	- ('build.count,job_type=UNKNOWN,builder_name=mock-builder',))
51	+ ('build.count,job_type=UNKNOWN,'
52	+ 'builder_name=mock-builder,env=test',))
53
54
55	class TestGetUploadMethodsMixin:
56	diff --git a/lib/lp/services/config/schema-lazr.conf b/lib/lp/services/config/schema-lazr.conf
57	index c16270a..aaa9d30 100644
58	--- a/lib/lp/services/config/schema-lazr.conf
59	+++ b/lib/lp/services/config/schema-lazr.conf
60	@@ -2028,3 +2028,4 @@ concurrency: 1
61	host: none
62	port: none
63	prefix: none
64	+environment: none
65	diff --git a/lib/lp/services/statsd/model/statsd_client.py b/lib/lp/services/statsd/model/statsd_client.py
66	index 868af30..6ccf027 100644
67	--- a/lib/lp/services/statsd/model/statsd_client.py
68	+++ b/lib/lp/services/statsd/model/statsd_client.py
69	@@ -43,6 +43,12 @@ class StatsdClient:
70	self._make_client()
71
72	def __getattr__(self, name):
73	+ # This is a convenience to keep all statsd client related
74	+ # items on the client. Having this separate from prefix
75	+ # allows us to use it as a label, rather than as part of the
76	+ # gauge name
77	+ if name == 'lp_environment':
78	+ return config.statsd.environment
79	if self._client is not None:
80	return getattr(self._client, name)
81	else:
82	diff --git a/lib/lp/services/statsd/numbercruncher.py b/lib/lp/services/statsd/numbercruncher.py
83	index d2301d0..0a0552c 100644
84	--- a/lib/lp/services/statsd/numbercruncher.py
85	+++ b/lib/lp/services/statsd/numbercruncher.py
86	@@ -47,7 +47,7 @@ class NumberCruncher(service.Service):
87	def _setupLogger(self):
88	"""Set up a 'number-cruncher' logger that redirects to twisted.
89	"""
90	- level = logging.INFO
91	+ level = logging.DEBUG
92	logger = logging.getLogger(NUMBER_CRUNCHER_LOG_NAME)
93	logger.propagate = False
94
95	@@ -74,8 +74,8 @@ class NumberCruncher(service.Service):
96	for queue_type, contents in queue_details.items():
97	virt = queue_type == 'virt'
98	for arch, value in contents.items():
99	- gauge_name = "buildqueue,virtualized={},arch={}".format(
100	- virt, arch)
101	+ gauge_name = "buildqueue,virtualized={},arch={},env={}".format(
102	+ virt, arch, self.statsd_client.lp_environment)
103	self.logger.debug("{}: {}".format(gauge_name, value[0]))
104	self.statsd_client.gauge(gauge_name, value[0])
105	self.logger.debug("Build queue stats update complete.")
106	@@ -107,8 +107,8 @@ class NumberCruncher(service.Service):
107	counts['idle'] += 1
108	for processor, counts in counts_by_processor.items():
109	for count_name, count_value in counts.items():
110	- gauge_name = "builders.{},arch={}".format(
111	- count_name, processor)
112	+ gauge_name = "builders,status={},arch={},env={}".format(
113	+ count_name, processor, self.statsd_client.lp_environment)
114	self.logger.debug("{}: {}".format(gauge_name, count_value))
115	self.statsd_client.gauge(gauge_name, count_value)
116	self.logger.debug("Builder stats update complete.")
117	diff --git a/lib/lp/services/statsd/tests/__init__.py b/lib/lp/services/statsd/tests/__init__.py
118	index 20885a4..23fb067 100644
119	--- a/lib/lp/services/statsd/tests/__init__.py
120	+++ b/lib/lp/services/statsd/tests/__init__.py
121	@@ -19,5 +19,6 @@ class StatsMixin:
122	# Install a mock statsd client so we can assert against the call
123	# counts and args.
124	self.stats_client = mock.Mock()
125	+ self.stats_client.lp_environment = "test"
126	self.useFixture(
127	ZopeUtilityFixture(self.stats_client, IStatsdClient))
128	diff --git a/lib/lp/services/statsd/tests/test_numbercruncher.py b/lib/lp/services/statsd/tests/test_numbercruncher.py
129	index 47c633f..c5a4ff6 100644
130	--- a/lib/lp/services/statsd/tests/test_numbercruncher.py
131	+++ b/lib/lp/services/statsd/tests/test_numbercruncher.py
132	@@ -85,10 +85,18 @@ class TestNumberCruncher(StatsMixin, TestCaseWithFactory):
133	if 'amd64' in c[0][0]]
134	self.assertThat(
135	calls, MatchesListwise(
136	- [Equals(('builders.disabled,arch=amd64,virtualized=True', 0)),
137	- Equals(('builders.building,arch=amd64,virtualized=True', 0)),
138	- Equals(('builders.idle,arch=amd64,virtualized=True', 0)),
139	- Equals(('builders.cleaning,arch=amd64,virtualized=True', 1))
140	+ [Equals((
141	+ 'builders,status=disabled,arch=amd64,'
142	+ 'virtualized=True,env=test', 0)),
143	+ Equals((
144	+ 'builders,status=building,arch=amd64,'
145	+ 'virtualized=True,env=test', 0)),
146	+ Equals((
147	+ 'builders,status=idle,arch=amd64,'
148	+ 'virtualized=True,env=test', 0)),
149	+ Equals((
150	+ 'builders,status=cleaning,arch=amd64,'
151	+ 'virtualized=True,env=test', 1))
152	]))
153
154	def test_updateBuilderQueues(self):
155	@@ -109,9 +117,9 @@ class TestNumberCruncher(StatsMixin, TestCaseWithFactory):
156	self.assertThat(
157	[x[0] for x in self.stats_client.gauge.call_args_list],
158	MatchesListwise(
159	- [Equals(('buildqueue,virtualized=True,arch={}'.format(
160	+ [Equals(('buildqueue,virtualized=True,arch={},env=test'.format(
161	build.processor.name), 1)),
162	- Equals(('buildqueue,virtualized=False,arch=386', 1))
163	+ Equals(('buildqueue,virtualized=False,arch=386,env=test', 1))
164	]))
165
166	def test_startService_starts_update_queues_loop(self):
167	diff --git a/lib/lp/services/statsd/tests/test_statsd_client.py b/lib/lp/services/statsd/tests/test_statsd_client.py
168	index 6852af7..2e1a8e1 100644
169	--- a/lib/lp/services/statsd/tests/test_statsd_client.py
170	+++ b/lib/lp/services/statsd/tests/test_statsd_client.py
171	@@ -26,7 +26,8 @@ class TestClientConfiguration(TestCase):
172	self.addCleanup(client.reload)
173	config.push(
174	'statsd_test',
175	- "[statsd]\nhost: 127.0.01\nport: 9999\nprefix: test\n")
176	+ "[statsd]\nhost: 127.0.01\n"
177	+ "port: 9999\nprefix: test\nenvironment: test\n")
178	client.reload()
179	self.assertIsInstance(client._client, StatsClient)
180
181	@@ -45,6 +46,17 @@ class TestClientConfiguration(TestCase):
182	self.addCleanup(client.reload)
183	config.push(
184	'statsd_test',
185	- "[statsd]\nhost: 127.0.01\nport: 9999\nprefix: test\n")
186	+ "[statsd]\nhost: 127.0.01\n"
187	+ "port: 9999\nprefix: test\nenvironment: test\n")
188	client.reload()
189	self.assertIsInstance(client._client, StatsClient)
190	+
191	+ def test_lp_environment_is_available(self):
192	+ client = getUtility(IStatsdClient)
193	+ self.addCleanup(client.reload)
194	+ config.push(
195	+ 'statsd_test',
196	+ "[statsd]\nhost: 127.0.01\n"
197	+ "port: 9999\nprefix: test\nenvironment: test\n")
198	+ client.reload()
199	+ self.assertEqual(client.lp_environment, "test")
200	diff --git a/lib/lp/services/webapp/publication.py b/lib/lp/services/webapp/publication.py
201	index 7325d9f..9a9a0e3 100644
202	--- a/lib/lp/services/webapp/publication.py
203	+++ b/lib/lp/services/webapp/publication.py
204	@@ -795,13 +795,16 @@ class LaunchpadBrowserPublication(
205	status = request.response.getStatus()
206	if status == 404: # Not Found
207	OpStats.stats['404s'] += 1
208	- statsd_client.incr('errors.404')
209	+ statsd_client.incr('errors.404,env={}'.format(
210	+ statsd_client.lp_environment))
211	elif status == 500: # Unhandled exceptions
212	OpStats.stats['500s'] += 1
213	- statsd_client.incr('errors.500')
214	+ statsd_client.incr('errors.500,env={}'.format(
215	+ statsd_client.lp_environment))
216	elif status == 503: # Timeouts
217	OpStats.stats['503s'] += 1
218	- statsd_client.incr('errors.503')
219	+ statsd_client.incr('errors.503,env={}'.format(
220	+ statsd_client.lp_environment))
221
222	# Increment counters for status code groups.
223	status_group = str(status)[0] + 'XXs'
224	@@ -810,7 +813,8 @@ class LaunchpadBrowserPublication(
225	# Increment counter for 5XXs_b.
226	if is_browser(request) and status_group == '5XXs':
227	OpStats.stats['5XXs_b'] += 1
228	- statsd_client.incr('errors.5XX')
229	+ statsd_client.incr('errors.5XX,env={}'.format(
230	+ statsd_client.lp_environment))
231
232	# Make sure our databases are in a sane state for the next request.
233	thread_name = threading.current_thread().name