Merge ~blake-rouse/maas:bind9-restart-2.6 into maas:2.6
- Git
- lp:~blake-rouse/maas
- bind9-restart-2.6
- Merge into 2.6
Proposed by
Blake Rouse
Status: | Merged | ||||
---|---|---|---|---|---|
Approved by: | Blake Rouse | ||||
Approved revision: | 3f79dca9529df99d08cc67ab729690b017cf31a1 | ||||
Merge reported by: | MAAS Lander | ||||
Merged at revision: | not available | ||||
Proposed branch: | ~blake-rouse/maas:bind9-restart-2.6 | ||||
Merge into: | maas:2.6 | ||||
Diff against target: |
738 lines (+260/-43) 15 files modified
debian/extras/99-maas-common-sudoers (+3/-2) src/maasserver/dns/config.py (+4/-4) src/maasserver/dns/tests/test_config.py (+5/-2) src/maasserver/region_controller.py (+25/-4) src/maasserver/service_monitor.py (+3/-0) src/maasserver/tests/test_region_controller.py (+48/-8) src/provisioningserver/dns/actions.py (+11/-5) src/provisioningserver/dns/config.py (+2/-2) src/provisioningserver/dns/tests/test_actions.py (+4/-4) src/provisioningserver/dns/tests/test_config.py (+6/-2) src/provisioningserver/service_monitor.py (+3/-0) src/provisioningserver/utils/service_monitor.py (+56/-3) src/provisioningserver/utils/shell.py (+2/-1) src/provisioningserver/utils/tests/test_service_monitor.py (+79/-6) src/provisioningserver/utils/tests/test_shell.py (+9/-0) |
||||
Related bugs: |
|
Reviewer | Review Type | Date Requested | Status |
---|---|---|---|
Blake Rouse (community) | Approve | ||
Review via email: mp+371228@code.launchpad.net |
Commit message
Fixes LP: #1710278 - Notice when bind9 is hung, force kill the service, ensure it starts again, then perform a reload.
Backport of d0a4ec6ce2af49d
Description of the change
To post a comment you must log in.
Preview Diff
[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1 | diff --git a/debian/extras/99-maas-common-sudoers b/debian/extras/99-maas-common-sudoers |
2 | index 47387d2..4c2bfd3 100644 |
3 | --- a/debian/extras/99-maas-common-sudoers |
4 | +++ b/debian/extras/99-maas-common-sudoers |
5 | @@ -18,10 +18,11 @@ maas ALL= NOPASSWD: /bin/systemctl restart chrony |
6 | maas ALL= NOPASSWD: /bin/systemctl start chrony |
7 | maas ALL= NOPASSWD: /bin/systemctl status chrony |
8 | |
9 | -# Control of the DNS server: MAAS needs to track status and start it |
10 | -# again if stopped manually. |
11 | +# Control of the DNS server: MAAS needs to track status, start it |
12 | +# again if stopped manually, or if MAAS must brutally kill it! |
13 | maas ALL= NOPASSWD: /bin/systemctl status bind9 |
14 | maas ALL= NOPASSWD: /bin/systemctl start bind9 |
15 | +maas ALL= NOPASSWD: /bin/systemctl kill -s SIGKILL bind9 |
16 | |
17 | # Control of the Proxy server: MAAS needs to reconfigure it after editing |
18 | # its configuration file, start it and stop it. |
19 | diff --git a/src/maasserver/dns/config.py b/src/maasserver/dns/config.py |
20 | index 702b194..36aaacf 100644 |
21 | --- a/src/maasserver/dns/config.py |
22 | +++ b/src/maasserver/dns/config.py |
23 | @@ -54,7 +54,7 @@ def dns_force_reload(): |
24 | DNSPublication(source="Force reload").save() |
25 | |
26 | |
27 | -def dns_update_all_zones(reload_retry=False): |
28 | +def dns_update_all_zones(reload_retry=False, reload_timeout=2): |
29 | """Update all zone files for all domains. |
30 | |
31 | Serving these zone files means updating BIND's configuration to include |
32 | @@ -96,12 +96,12 @@ def dns_update_all_zones(reload_retry=False): |
33 | # actually needed but it seems safer to maintain this behaviour until we |
34 | # have a better understanding. |
35 | if reload_retry: |
36 | - bind_reload_with_retries() |
37 | + reloaded = bind_reload_with_retries(timeout=reload_timeout) |
38 | else: |
39 | - bind_reload() |
40 | + reloaded = bind_reload(timeout=reload_timeout) |
41 | |
42 | # Return the current serial and list of domain names. |
43 | - return serial, [ |
44 | + return serial, reloaded, [ |
45 | domain.name |
46 | for domain in domains |
47 | ] |
48 | diff --git a/src/maasserver/dns/tests/test_config.py b/src/maasserver/dns/tests/test_config.py |
49 | index 6bb5bfb..6d0bdfb 100644 |
50 | --- a/src/maasserver/dns/tests/test_config.py |
51 | +++ b/src/maasserver/dns/tests/test_config.py |
52 | @@ -62,6 +62,7 @@ from testtools.matchers import ( |
53 | Equals, |
54 | FileContains, |
55 | HasLength, |
56 | + Is, |
57 | MatchesSetwise, |
58 | MatchesStructure, |
59 | ) |
60 | @@ -307,7 +308,8 @@ class TestDNSConfigModifications(TestDNSServer): |
61 | bind_reload_with_retries = self.patch_autospec( |
62 | dns_config_module, "bind_reload_with_retries") |
63 | dns_update_all_zones(reload_retry=True) |
64 | - self.assertThat(bind_reload_with_retries, MockCalledOnceWith()) |
65 | + self.assertThat( |
66 | + bind_reload_with_retries, MockCalledOnceWith(timeout=2)) |
67 | |
68 | def test_dns_update_all_zones_passes_upstream_dns_parameter(self): |
69 | self.patch(settings, 'DNS_CONNECT', True) |
70 | @@ -377,8 +379,9 @@ class TestDNSConfigModifications(TestDNSServer): |
71 | self.patch( |
72 | dns_config_module, |
73 | "current_zone_serial").return_value = fake_serial |
74 | - serial, domains = dns_update_all_zones() |
75 | + serial, reloaded, domains = dns_update_all_zones() |
76 | self.assertThat(serial, Equals(fake_serial)) |
77 | + self.assertThat(reloaded, Is(True)) |
78 | self.assertThat(domains, MatchesSetwise(*[ |
79 | Equals(domain.name) |
80 | for domain in Domain.objects.filter(authoritative=True) |
81 | diff --git a/src/maasserver/region_controller.py b/src/maasserver/region_controller.py |
82 | index 9a41c72..2989b6e 100644 |
83 | --- a/src/maasserver/region_controller.py |
84 | +++ b/src/maasserver/region_controller.py |
85 | @@ -48,6 +48,7 @@ from maasserver.rbac import ( |
86 | Resource, |
87 | SyncConflictError, |
88 | ) |
89 | +from maasserver.service_monitor import service_monitor |
90 | from maasserver.utils import synchronised |
91 | from maasserver.utils.orm import ( |
92 | transactional, |
93 | @@ -191,7 +192,11 @@ class RegionControllerService(Service): |
94 | d = deferToDatabase(transactional(dns_update_all_zones)) |
95 | d.addCallback(self._checkSerial) |
96 | d.addCallback(self._logDNSReload) |
97 | + # Order here matters, first needsDNSUpdate is set then pass the |
98 | + # failure onto `_onDNSReloadFailure` to do the correct thing |
99 | + # with the DNS server. |
100 | d.addErrback(_onFailureRetry, 'needsDNSUpdate') |
101 | + d.addErrback(self._onDNSReloadFailure) |
102 | d.addErrback( |
103 | log.err, |
104 | "Failed configuring DNS.") |
105 | @@ -229,7 +234,10 @@ class RegionControllerService(Service): |
106 | """Check that the serial of the domain is updated.""" |
107 | if result is None: |
108 | return None |
109 | - serial, domain_names = result |
110 | + serial, reloaded, domain_names = result |
111 | + if not reloaded: |
112 | + raise DNSReloadError( |
113 | + "Failed to reload DNS; timeout or rdnc command failed.") |
114 | not_matching_domains = set(domain_names) |
115 | loop = 0 |
116 | while len(not_matching_domains) > 0 and loop != 30: |
117 | @@ -252,13 +260,13 @@ class RegionControllerService(Service): |
118 | raise DNSReloadError( |
119 | "Failed to reload DNS; serial mismatch " |
120 | "on domains %s" % ', '.join(not_matching_domains)) |
121 | - return serial, domain_names |
122 | + return result |
123 | |
124 | def _logDNSReload(self, result): |
125 | """Log the reason DNS was reloaded.""" |
126 | if result is None: |
127 | return None |
128 | - serial, domain_names = result |
129 | + serial, _, domain_names = result |
130 | if self.previousSerial is None: |
131 | # This was the first load for starting the service. |
132 | self.previousSerial = serial |
133 | @@ -269,7 +277,10 @@ class RegionControllerService(Service): |
134 | # reason for the reload. |
135 | |
136 | def _logReason(reasons): |
137 | - if len(reasons) == 1: |
138 | + if len(reasons) == 0: |
139 | + msg = ( |
140 | + "Reloaded DNS configuration; previous failure (retry)") |
141 | + elif len(reasons) == 1: |
142 | msg = "Reloaded DNS configuration; %s" % reasons[0] |
143 | else: |
144 | msg = 'Reloaded DNS configuration: \n' + '\n'.join( |
145 | @@ -286,6 +297,16 @@ class RegionControllerService(Service): |
146 | self.previousSerial = serial |
147 | return d |
148 | |
149 | + def _onDNSReloadFailure(self, failure): |
150 | + """Force kill and restart bind9.""" |
151 | + failure.trap(DNSReloadError) |
152 | + if not self.retryOnFailure: |
153 | + return failure |
154 | + log.err(failure, "Failed configuring DNS; killing and restarting") |
155 | + d = service_monitor.killService('bind9') |
156 | + d.addErrback(log.err, "Failed to kill and restart DNS.") |
157 | + return d |
158 | + |
159 | @transactional |
160 | def _getReloadReasons(self, previousSerial, currentSerial): |
161 | return [ |
162 | diff --git a/src/maasserver/service_monitor.py b/src/maasserver/service_monitor.py |
163 | index 92baa43..c902f86 100644 |
164 | --- a/src/maasserver/service_monitor.py |
165 | +++ b/src/maasserver/service_monitor.py |
166 | @@ -30,6 +30,9 @@ class BIND9Service(AlwaysOnService): |
167 | service_name = "bind9" |
168 | snap_service_name = "bind9" |
169 | |
170 | + # Pass SIGKILL directly to parent. |
171 | + kill_extra_opts = ('-s', 'SIGKILL') |
172 | + |
173 | |
174 | class NTPServiceOnRegion(AlwaysOnService): |
175 | """Monitored NTP service on a region controller host.""" |
176 | diff --git a/src/maasserver/tests/test_region_controller.py b/src/maasserver/tests/test_region_controller.py |
177 | index 2c109db..b00fb2d 100644 |
178 | --- a/src/maasserver/tests/test_region_controller.py |
179 | +++ b/src/maasserver/tests/test_region_controller.py |
180 | @@ -212,7 +212,7 @@ class TestRegionControllerService(MAASServerTestCase): |
181 | service = self.make_service(sentinel.listener) |
182 | service.needsDNSUpdate = True |
183 | dns_result = ( |
184 | - random.randint(1, 1000), [ |
185 | + random.randint(1, 1000), True, [ |
186 | factory.make_name('domain') |
187 | for _ in range(3) |
188 | ]) |
189 | @@ -234,6 +234,46 @@ class TestRegionControllerService(MAASServerTestCase): |
190 | |
191 | @wait_for_reactor |
192 | @inlineCallbacks |
193 | + def test_process_zones_kills_bind_on_failed_reload(self): |
194 | + service = self.make_service(sentinel.listener) |
195 | + service.needsDNSUpdate = True |
196 | + service.retryOnFailure = True |
197 | + dns_result_0 = ( |
198 | + random.randint(1, 1000), False, [ |
199 | + factory.make_name('domain') |
200 | + for _ in range(3) |
201 | + ]) |
202 | + dns_result_1 = (dns_result_0[0], True, dns_result_0[2]) |
203 | + mock_dns_update_all_zones = self.patch( |
204 | + region_controller, "dns_update_all_zones") |
205 | + mock_dns_update_all_zones.side_effect = [dns_result_0, dns_result_1] |
206 | + |
207 | + service._checkSerialCalled = False |
208 | + orig_checkSerial = service._checkSerial |
209 | + |
210 | + def _checkSerial(result): |
211 | + if service._checkSerialCalled: |
212 | + return dns_result_1 |
213 | + service._checkSerialCalled = True |
214 | + return orig_checkSerial(result) |
215 | + |
216 | + mock_check_serial = self.patch(service, "_checkSerial") |
217 | + mock_check_serial.side_effect = _checkSerial |
218 | + mock_killService = self.patch( |
219 | + region_controller.service_monitor, 'killService') |
220 | + mock_killService.return_value = succeed(None) |
221 | + service.startProcessing() |
222 | + yield service.processingDefer |
223 | + self.assertThat( |
224 | + mock_dns_update_all_zones, MockCallsMatch(call(), call())) |
225 | + self.assertThat( |
226 | + mock_check_serial, |
227 | + MockCallsMatch(call(dns_result_0), call(dns_result_1))) |
228 | + self.assertThat( |
229 | + mock_killService, MockCalledOnceWith("bind9")) |
230 | + |
231 | + @wait_for_reactor |
232 | + @inlineCallbacks |
233 | def test_process_updates_proxy(self): |
234 | service = self.make_service(sentinel.listener) |
235 | service.needsProxyUpdate = True |
236 | @@ -348,7 +388,7 @@ class TestRegionControllerService(MAASServerTestCase): |
237 | service.needsProxyUpdate = True |
238 | service.needsRBACUpdate = True |
239 | dns_result = ( |
240 | - random.randint(1, 1000), [ |
241 | + random.randint(1, 1000), True, [ |
242 | factory.make_name('domain') |
243 | for _ in range(3) |
244 | ]) |
245 | @@ -407,7 +447,7 @@ class TestRegionControllerService(MAASServerTestCase): |
246 | succeed(([self.make_soa_result(result_serial)], [], [])), |
247 | ] |
248 | # Error should not be raised. |
249 | - return service._checkSerial((formatted_serial, dns_names)) |
250 | + return service._checkSerial((formatted_serial, True, dns_names)) |
251 | |
252 | @wait_for_reactor |
253 | @inlineCallbacks |
254 | @@ -425,7 +465,7 @@ class TestRegionControllerService(MAASServerTestCase): |
255 | mock_lookup.side_effect = lambda *args: succeed(([], [], [])) |
256 | # Error should not be raised. |
257 | with ExpectedException(DNSReloadError): |
258 | - yield service._checkSerial((formatted_serial, dns_names)) |
259 | + yield service._checkSerial((formatted_serial, True, dns_names)) |
260 | |
261 | @wait_for_reactor |
262 | @inlineCallbacks |
263 | @@ -443,7 +483,7 @@ class TestRegionControllerService(MAASServerTestCase): |
264 | mock_lookup.side_effect = ValueError() |
265 | # Error should not be raised. |
266 | with ExpectedException(DNSReloadError): |
267 | - yield service._checkSerial((formatted_serial, dns_names)) |
268 | + yield service._checkSerial((formatted_serial, True, dns_names)) |
269 | |
270 | @wait_for_reactor |
271 | @inlineCallbacks |
272 | @@ -461,7 +501,7 @@ class TestRegionControllerService(MAASServerTestCase): |
273 | mock_lookup.side_effect = TimeoutError() |
274 | # Error should not be raised. |
275 | with ExpectedException(DNSReloadError): |
276 | - yield service._checkSerial((formatted_serial, dns_names)) |
277 | + yield service._checkSerial((formatted_serial, True, dns_names)) |
278 | |
279 | def test__getRBACClient_returns_None_when_no_url(self): |
280 | service = self.make_service(sentinel.listener) |
281 | @@ -565,7 +605,7 @@ class TestRegionControllerServiceTransactional(MAASTransactionServerTestCase): |
282 | service.needsDNSUpdate = True |
283 | service.previousSerial = publications[0].serial |
284 | dns_result = ( |
285 | - publications[-1].serial, [ |
286 | + publications[-1].serial, True, [ |
287 | factory.make_name('domain') |
288 | for _ in range(3) |
289 | ]) |
290 | @@ -602,7 +642,7 @@ class TestRegionControllerServiceTransactional(MAASTransactionServerTestCase): |
291 | service.needsDNSUpdate = True |
292 | service.previousSerial = publications[0].serial |
293 | dns_result = ( |
294 | - publications[-1].serial, [ |
295 | + publications[-1].serial, True, [ |
296 | factory.make_name('domain') |
297 | for _ in range(3) |
298 | ]) |
299 | diff --git a/src/provisioningserver/dns/actions.py b/src/provisioningserver/dns/actions.py |
300 | index a955023..d8439f8 100644 |
301 | --- a/src/provisioningserver/dns/actions.py |
302 | +++ b/src/provisioningserver/dns/actions.py |
303 | @@ -13,7 +13,10 @@ __all__ = [ |
304 | ] |
305 | |
306 | import collections |
307 | -from subprocess import CalledProcessError |
308 | +from subprocess import ( |
309 | + CalledProcessError, |
310 | + TimeoutExpired, |
311 | +) |
312 | from time import sleep |
313 | |
314 | from provisioningserver.dns.config import ( |
315 | @@ -48,7 +51,7 @@ def bind_reconfigure(): |
316 | raise |
317 | |
318 | |
319 | -def bind_reload(): |
320 | +def bind_reload(timeout=2): |
321 | """Ask BIND to reload its configuration and all zone files. This operation |
322 | is 'best effort' (with logging) as the server may not be running, and there |
323 | is often no context for reporting. |
324 | @@ -56,21 +59,24 @@ def bind_reload(): |
325 | :return: True if success, False otherwise. |
326 | """ |
327 | try: |
328 | - execute_rndc_command(("reload",)) |
329 | + execute_rndc_command(("reload",), timeout=timeout) |
330 | return True |
331 | except CalledProcessError as exc: |
332 | maaslog.error("Reloading BIND failed (is it running?): %s", exc) |
333 | return False |
334 | + except TimeoutExpired as exc: |
335 | + maaslog.error("Reloading BIND timed out (is it locked?): %s", exc) |
336 | + return False |
337 | |
338 | |
339 | -def bind_reload_with_retries(attempts=10, interval=2): |
340 | +def bind_reload_with_retries(attempts=10, interval=2, timeout=2): |
341 | """Ask BIND to reload its configuration and all zone files. |
342 | |
343 | :param attempts: The number of attempts. |
344 | :param interval: The time in seconds to sleep between each attempt. |
345 | """ |
346 | for countdown in range(attempts - 1, -1, -1): |
347 | - if bind_reload(): |
348 | + if bind_reload(timeout=timeout): |
349 | break |
350 | if countdown == 0: |
351 | break |
352 | diff --git a/src/provisioningserver/dns/config.py b/src/provisioningserver/dns/config.py |
353 | index 715b36e..f196902 100644 |
354 | --- a/src/provisioningserver/dns/config.py |
355 | +++ b/src/provisioningserver/dns/config.py |
356 | @@ -177,12 +177,12 @@ def set_up_rndc(): |
357 | f.write(named_content) |
358 | |
359 | |
360 | -def execute_rndc_command(arguments): |
361 | +def execute_rndc_command(arguments, timeout=None): |
362 | """Execute a rndc command.""" |
363 | rndc_conf = get_rndc_conf_path() |
364 | rndc_cmd = ['rndc', '-c', rndc_conf] |
365 | rndc_cmd.extend(arguments) |
366 | - call_and_check(rndc_cmd) |
367 | + call_and_check(rndc_cmd, timeout=timeout) |
368 | |
369 | |
370 | def set_up_options_conf(overwrite=True, **kwargs): |
371 | diff --git a/src/provisioningserver/dns/tests/test_actions.py b/src/provisioningserver/dns/tests/test_actions.py |
372 | index 472bb56..58d216e 100644 |
373 | --- a/src/provisioningserver/dns/tests/test_actions.py |
374 | +++ b/src/provisioningserver/dns/tests/test_actions.py |
375 | @@ -78,7 +78,7 @@ class TestReload(MAASTestCase): |
376 | actions.bind_reload() |
377 | self.assertThat( |
378 | actions.execute_rndc_command, |
379 | - MockCalledOnceWith(("reload",))) |
380 | + MockCalledOnceWith(("reload",), timeout=2)) |
381 | |
382 | def test__logs_subprocess_error(self): |
383 | erc = self.patch_autospec(actions, "execute_rndc_command") |
384 | @@ -105,7 +105,7 @@ class TestReloadWithRetries(MAASTestCase): |
385 | bind_reload.return_value = False |
386 | attempts = randint(3, 13) |
387 | actions.bind_reload_with_retries(attempts=attempts) |
388 | - expected_calls = [call()] * attempts |
389 | + expected_calls = [call(timeout=2)] * attempts |
390 | self.assertThat( |
391 | actions.bind_reload, |
392 | MockCallsMatch(*expected_calls)) |
393 | @@ -114,11 +114,11 @@ class TestReloadWithRetries(MAASTestCase): |
394 | self.patch_autospec(actions, "sleep") # Disable. |
395 | bind_reload = self.patch(actions, "bind_reload") |
396 | bind_reload_return_values = [False, False, True, ] |
397 | - bind_reload.side_effect = lambda: ( |
398 | + bind_reload.side_effect = lambda *args, **kwargs: ( |
399 | bind_reload_return_values.pop(0)) |
400 | |
401 | actions.bind_reload_with_retries(attempts=5) |
402 | - expected_calls = [call(), call(), call()] |
403 | + expected_calls = [call(timeout=2), call(timeout=2), call(timeout=2)] |
404 | self.assertThat( |
405 | actions.bind_reload, |
406 | MockCallsMatch(*expected_calls)) |
407 | diff --git a/src/provisioningserver/dns/tests/test_config.py b/src/provisioningserver/dns/tests/test_config.py |
408 | index f569394..902ab12 100644 |
409 | --- a/src/provisioningserver/dns/tests/test_config.py |
410 | +++ b/src/provisioningserver/dns/tests/test_config.py |
411 | @@ -9,7 +9,10 @@ import errno |
412 | import os.path |
413 | import random |
414 | from textwrap import dedent |
415 | -from unittest.mock import Mock |
416 | +from unittest.mock import ( |
417 | + Mock, |
418 | + sentinel, |
419 | +) |
420 | |
421 | from fixtures import EnvironmentVariable |
422 | from maastesting.factory import factory |
423 | @@ -278,10 +281,11 @@ class TestRNDCUtilities(MAASTestCase): |
424 | fake_dir = patch_dns_config_path(self) |
425 | self.patch(config, 'call_and_check', recorder) |
426 | command = factory.make_string() |
427 | - execute_rndc_command([command]) |
428 | + execute_rndc_command([command], timeout=sentinel.timeout) |
429 | rndc_conf_path = os.path.join(fake_dir, MAAS_RNDC_CONF_NAME) |
430 | expected_command = ['rndc', '-c', rndc_conf_path, command] |
431 | self.assertEqual((expected_command,), recorder.calls[0][0]) |
432 | + self.assertEqual({'timeout': sentinel.timeout}, recorder.calls[0][1]) |
433 | |
434 | def test_extract_suggested_named_conf_extracts_section(self): |
435 | named_part = factory.make_string() |
436 | diff --git a/src/provisioningserver/service_monitor.py b/src/provisioningserver/service_monitor.py |
437 | index 75caab2..701fe85 100644 |
438 | --- a/src/provisioningserver/service_monitor.py |
439 | +++ b/src/provisioningserver/service_monitor.py |
440 | @@ -50,6 +50,9 @@ class DNSServiceOnRack(ToggleableService): |
441 | service_name = "bind9" |
442 | snap_service_name = "bind9" |
443 | |
444 | + # Pass SIGKILL directly to parent. |
445 | + kill_extra_opts = ('-s', 'SIGKILL') |
446 | + |
447 | |
448 | class ProxyServiceOnRack(ToggleableService): |
449 | """Monitored proxy service on a rack controller host.""" |
450 | diff --git a/src/provisioningserver/utils/service_monitor.py b/src/provisioningserver/utils/service_monitor.py |
451 | index 3b20273..047ec8b 100644 |
452 | --- a/src/provisioningserver/utils/service_monitor.py |
453 | +++ b/src/provisioningserver/utils/service_monitor.py |
454 | @@ -413,6 +413,26 @@ class ServiceMonitor: |
455 | raise ServiceActionError(error_msg) |
456 | yield self._performServiceAction(service, "reload") |
457 | |
458 | + @asynchronous |
459 | + @inlineCallbacks |
460 | + def killService(self, name): |
461 | + """Kill service. |
462 | + |
463 | + Service will be killed then its state will be ensured to be in its |
464 | + expected state. This is a way of getting a broken related service to be |
465 | + responsive. |
466 | + """ |
467 | + service = self.getServiceByName(name) |
468 | + try: |
469 | + yield self._performServiceAction(service, "kill") |
470 | + except ServiceActionError: |
471 | + # Kill action is allowed to fail, as its possible the service is |
472 | + # already dead or not responding to correct signals to check |
473 | + # the current status. |
474 | + pass |
475 | + state = yield self.ensureService(name) |
476 | + return state |
477 | + |
478 | @inlineCallbacks |
479 | def _execCmd(self, cmd, env, timeout=8, retries=3): |
480 | """Execute the `cmd` with the `env`.""" |
481 | @@ -468,14 +488,46 @@ class ServiceMonitor: |
482 | return self._execCmd(cmd, env) |
483 | |
484 | @asynchronous |
485 | - def _execSupervisorServiceAction(self, service_name, action): |
486 | + def _execSupervisorServiceAction( |
487 | + self, service_name, action, extra_opts=None): |
488 | """Perform the action with the run-supervisorctl command. |
489 | |
490 | :return: tuple (exit code, std-output, std-error) |
491 | """ |
492 | env = get_env_with_bytes_locale() |
493 | cmd = os.path.join(snappy.get_snap_path(), "bin", "run-supervisorctl") |
494 | - cmd = (cmd, action, service_name) |
495 | + |
496 | + # supervisord doesn't support native kill like systemd. Emulate this |
497 | + # behaviour by getting the PID of the process and then killing the PID. |
498 | + if action == 'kill': |
499 | + |
500 | + def _kill_pid(result): |
501 | + exit_code, stdout, _ = result |
502 | + if exit_code != 0: |
503 | + return result |
504 | + try: |
505 | + pid = int(stdout.strip()) |
506 | + except ValueError: |
507 | + pid = 0 |
508 | + if pid == 0: |
509 | + # supervisorctl returns 0 when the process is already dead |
510 | + # or we where not able to get the actual pid. Nothing to |
511 | + # do, as its already dead. |
512 | + return 0, "", "" |
513 | + cmd = ('kill',) |
514 | + if extra_opts: |
515 | + cmd += extra_opts |
516 | + cmd += ('%s' % pid,) |
517 | + return self._execCmd(cmd, env) |
518 | + |
519 | + d = self._execCmd((cmd, 'pid', service_name), env) |
520 | + d.addCallback(_kill_pid) |
521 | + return d |
522 | + |
523 | + cmd = (cmd, action) |
524 | + if extra_opts is not None: |
525 | + cmd += extra_opts |
526 | + cmd += (service_name,) |
527 | return self._execCmd(cmd, env) |
528 | |
529 | @inlineCallbacks |
530 | @@ -488,8 +540,9 @@ class ServiceMonitor: |
531 | else: |
532 | exec_action = self._execSystemDServiceAction |
533 | service_name = service.service_name |
534 | + extra_opts = getattr(service, '%s_extra_opts' % action, None) |
535 | exit_code, output, error = yield lock.run( |
536 | - exec_action, service_name, action) |
537 | + exec_action, service_name, action, extra_opts=extra_opts) |
538 | if exit_code != 0: |
539 | error_msg = ( |
540 | "Service '%s' failed to %s: %s" % ( |
541 | diff --git a/src/provisioningserver/utils/shell.py b/src/provisioningserver/utils/shell.py |
542 | index 0ec7c39..488afb6 100644 |
543 | --- a/src/provisioningserver/utils/shell.py |
544 | +++ b/src/provisioningserver/utils/shell.py |
545 | @@ -105,8 +105,9 @@ def call_and_check(command, *args, **kwargs): |
546 | :return: The command's output from standard output. |
547 | :raise ExternalProcessError: If the command returns nonzero. |
548 | """ |
549 | + timeout = kwargs.pop('timeout', None) |
550 | process = Popen(command, *args, stdout=PIPE, stderr=PIPE, **kwargs) |
551 | - stdout, stderr = process.communicate() |
552 | + stdout, stderr = process.communicate(timeout=timeout) |
553 | stderr = stderr.strip() |
554 | if process.returncode != 0: |
555 | raise ExternalProcessError(process.returncode, command, output=stderr) |
556 | diff --git a/src/provisioningserver/utils/tests/test_service_monitor.py b/src/provisioningserver/utils/tests/test_service_monitor.py |
557 | index eb984ed..11e71da 100644 |
558 | --- a/src/provisioningserver/utils/tests/test_service_monitor.py |
559 | +++ b/src/provisioningserver/utils/tests/test_service_monitor.py |
560 | @@ -10,6 +10,7 @@ import os |
561 | import random |
562 | from textwrap import dedent |
563 | from unittest.mock import ( |
564 | + call, |
565 | Mock, |
566 | sentinel, |
567 | ) |
568 | @@ -18,6 +19,7 @@ from fixtures import FakeLogger |
569 | from maastesting.factory import factory |
570 | from maastesting.matchers import ( |
571 | MockCalledOnceWith, |
572 | + MockCallsMatch, |
573 | MockNotCalled, |
574 | ) |
575 | from maastesting.runtest import MAASTwistedRunTest |
576 | @@ -50,6 +52,7 @@ from twisted.internet import reactor |
577 | from twisted.internet.defer import ( |
578 | CancelledError, |
579 | DeferredLock, |
580 | + fail, |
581 | inlineCallbacks, |
582 | succeed, |
583 | ) |
584 | @@ -452,6 +455,42 @@ class TestServiceMonitor(MAASTestCase): |
585 | yield service_monitor.reloadService(fake_service.name, if_on=True) |
586 | |
587 | @inlineCallbacks |
588 | + def test__killService_performs_kill_then_ensureService(self): |
589 | + fake_service = make_fake_service(SERVICE_STATE.ON) |
590 | + service_monitor = self.make_service_monitor([fake_service]) |
591 | + mock_performServiceAction = self.patch( |
592 | + service_monitor, "_performServiceAction") |
593 | + mock_performServiceAction.return_value = succeed(None) |
594 | + mock_ensureService = self.patch(service_monitor, "ensureService") |
595 | + mock_ensureService.return_value = succeed( |
596 | + ServiceState(SERVICE_STATE.ON, "running")) |
597 | + yield service_monitor.killService(fake_service.name) |
598 | + self.assertThat( |
599 | + mock_performServiceAction, |
600 | + MockCalledOnceWith(fake_service, "kill")) |
601 | + self.assertThat( |
602 | + mock_ensureService, |
603 | + MockCalledOnceWith(fake_service.name)) |
604 | + |
605 | + @inlineCallbacks |
606 | + def test__killService_doesnt_fail_on_ServiceActionError(self): |
607 | + fake_service = make_fake_service(SERVICE_STATE.ON) |
608 | + service_monitor = self.make_service_monitor([fake_service]) |
609 | + mock_performServiceAction = self.patch( |
610 | + service_monitor, "_performServiceAction") |
611 | + mock_performServiceAction.return_value = fail(ServiceActionError()) |
612 | + mock_ensureService = self.patch(service_monitor, "ensureService") |
613 | + mock_ensureService.return_value = succeed( |
614 | + ServiceState(SERVICE_STATE.ON, "running")) |
615 | + yield service_monitor.killService(fake_service.name) |
616 | + self.assertThat( |
617 | + mock_performServiceAction, |
618 | + MockCalledOnceWith(fake_service, "kill")) |
619 | + self.assertThat( |
620 | + mock_ensureService, |
621 | + MockCalledOnceWith(fake_service.name)) |
622 | + |
623 | + @inlineCallbacks |
624 | def test___execCmd_times_out(self): |
625 | monitor = ServiceMonitor(make_fake_service()) |
626 | with ExpectedException(ServiceActionError): |
627 | @@ -535,16 +574,44 @@ class TestServiceMonitor(MAASTestCase): |
628 | mock_getProcessOutputAndValue = self.patch( |
629 | service_monitor_module, "getProcessOutputAndValue") |
630 | mock_getProcessOutputAndValue.return_value = succeed((b"", b"", 0)) |
631 | + extra_opts = ('--extra', factory.make_name('extra')) |
632 | yield service_monitor._execSupervisorServiceAction( |
633 | - service_name, action) |
634 | + service_name, action, extra_opts=extra_opts) |
635 | cmd = os.path.join(snap_path, 'bin', 'run-supervisorctl') |
636 | - cmd = cmd, action, service_name |
637 | + cmd = (cmd, action) + extra_opts + (service_name,) |
638 | self.assertThat( |
639 | mock_getProcessOutputAndValue, MockCalledOnceWith( |
640 | # The environment contains LC_ALL and LANG too. |
641 | cmd[0], cmd[1:], env=get_env_with_bytes_locale())) |
642 | |
643 | @inlineCallbacks |
644 | + def test___execSupervisorServiceAction_emulates_kill(self): |
645 | + snap_path = factory.make_name("path") |
646 | + self.patch(snappy, "get_snap_path").return_value = snap_path |
647 | + service_monitor = self.make_service_monitor() |
648 | + service_name = factory.make_name("service") |
649 | + fake_pid = random.randint(1, 100) |
650 | + mock_getProcessOutputAndValue = self.patch( |
651 | + service_monitor_module, "getProcessOutputAndValue") |
652 | + mock_getProcessOutputAndValue.side_effect = [ |
653 | + succeed((("%s" % fake_pid).encode('utf-8'), b"", 0)), |
654 | + succeed((b"", b"", 0)), |
655 | + ] |
656 | + extra_opts = ('-s', factory.make_name('SIGKILL')) |
657 | + yield service_monitor._execSupervisorServiceAction( |
658 | + service_name, 'kill', extra_opts=extra_opts) |
659 | + cmd = os.path.join(snap_path, 'bin', 'run-supervisorctl') |
660 | + self.assertThat( |
661 | + mock_getProcessOutputAndValue, MockCallsMatch( |
662 | + call( |
663 | + cmd, ("pid", service_name), |
664 | + env=get_env_with_bytes_locale()), |
665 | + call( |
666 | + "kill", extra_opts + ("%s" % fake_pid,), |
667 | + env=get_env_with_bytes_locale()), |
668 | + )) |
669 | + |
670 | + @inlineCallbacks |
671 | def test___execSupervisorServiceAction_decodes_stdout_and_stderr(self): |
672 | # From https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt. |
673 | example_text = ( |
674 | @@ -579,13 +646,16 @@ class TestServiceMonitor(MAASTestCase): |
675 | service_monitor, "_execSystemDServiceAction") |
676 | mock_execSystemDServiceAction.return_value = (0, "", "") |
677 | action = factory.make_name("action") |
678 | + extra_opts = ('--option', factory.make_name("option")) |
679 | + setattr(service, '%s_extra_opts' % action, extra_opts) |
680 | yield service_monitor._performServiceAction(service, action) |
681 | self.assertThat(service_lock.run, MockCalledOnceWith( |
682 | service_monitor._execSystemDServiceAction, |
683 | - service.service_name, action)) |
684 | + service.service_name, action, extra_opts=extra_opts)) |
685 | self.assertThat( |
686 | mock_execSystemDServiceAction, |
687 | - MockCalledOnceWith(service.service_name, action)) |
688 | + MockCalledOnceWith( |
689 | + service.service_name, action, extra_opts=extra_opts)) |
690 | |
691 | @inlineCallbacks |
692 | def test___performServiceAction_holds_lock_perform_supervisor_action(self): |
693 | @@ -599,13 +669,16 @@ class TestServiceMonitor(MAASTestCase): |
694 | service_monitor, "_execSupervisorServiceAction") |
695 | mock_execSupervisorServiceAction.return_value = (0, "", "") |
696 | action = factory.make_name("action") |
697 | + extra_opts = ('--option', factory.make_name("option")) |
698 | + setattr(service, '%s_extra_opts' % action, extra_opts) |
699 | yield service_monitor._performServiceAction(service, action) |
700 | self.assertThat(service_lock.run, MockCalledOnceWith( |
701 | service_monitor._execSupervisorServiceAction, |
702 | - service.service_name, action)) |
703 | + service.service_name, action, extra_opts=extra_opts)) |
704 | self.assertThat( |
705 | mock_execSupervisorServiceAction, |
706 | - MockCalledOnceWith(service.service_name, action)) |
707 | + MockCalledOnceWith( |
708 | + service.service_name, action, extra_opts=extra_opts)) |
709 | |
710 | @inlineCallbacks |
711 | def test___performServiceAction_raises_ServiceActionError_if_fails(self): |
712 | diff --git a/src/provisioningserver/utils/tests/test_shell.py b/src/provisioningserver/utils/tests/test_shell.py |
713 | index 4492b3a..03f67fd 100644 |
714 | --- a/src/provisioningserver/utils/tests/test_shell.py |
715 | +++ b/src/provisioningserver/utils/tests/test_shell.py |
716 | @@ -6,6 +6,7 @@ |
717 | __all__ = [] |
718 | |
719 | import os |
720 | +import random |
721 | from subprocess import CalledProcessError |
722 | |
723 | from fixtures import EnvironmentVariable |
724 | @@ -54,6 +55,14 @@ class TestCallAndCheck(MAASTestCase): |
725 | self.assertEqual(command, error.cmd) |
726 | self.assertEqual(message, error.output) |
727 | |
728 | + def test__passes_timeout_to_communicate(self): |
729 | + command = factory.make_name('command') |
730 | + process = self.patch_popen() |
731 | + timeout = random.randint(1, 10) |
732 | + call_and_check(command, timeout=timeout) |
733 | + self.assertThat( |
734 | + process.communicate, MockCalledOnceWith(timeout=timeout)) |
735 | + |
736 | def test__reports_stderr_on_failure(self): |
737 | nonfile = os.path.join(self.make_dir(), factory.make_name('nonesuch')) |
738 | error = self.assertRaises( |
Self-approving backport.