Merge ~blake-rouse/maas:fix-1710278-2.4 into maas:2.4
- Git
- lp:~blake-rouse/maas
- fix-1710278-2.4
- Merge into 2.4
Status: | Merged | ||||
---|---|---|---|---|---|
Approved by: | Björn Tillenius | ||||
Approved revision: | c64105dbfb38863b5c7b38616864d2c379ac356c | ||||
Merge reported by: | MAAS Lander | ||||
Merged at revision: | not available | ||||
Proposed branch: | ~blake-rouse/maas:fix-1710278-2.4 | ||||
Merge into: | maas:2.4 | ||||
Diff against target: |
876 lines (+344/-48) 15 files modified
debian/extras/99-maas-common-sudoers (+6/-0) src/maasserver/dns/config.py (+4/-4) src/maasserver/dns/tests/test_config.py (+5/-2) src/maasserver/region_controller.py (+38/-5) src/maasserver/service_monitor.py (+3/-0) src/maasserver/tests/test_region_controller.py (+49/-8) src/provisioningserver/dns/actions.py (+11/-5) src/provisioningserver/dns/config.py (+2/-2) src/provisioningserver/dns/tests/test_actions.py (+4/-4) src/provisioningserver/dns/tests/test_config.py (+6/-2) src/provisioningserver/utils/service_monitor.py (+101/-9) src/provisioningserver/utils/shell.py (+2/-1) src/provisioningserver/utils/tests/test_service_monitor.py (+102/-6) src/provisioningserver/utils/tests/test_shell.py (+9/-0) versions.cfg (+2/-0) |
||||
Related bugs: |
|
Reviewer | Review Type | Date Requested | Status |
---|---|---|---|
MAAS Lander | Needs Fixing | ||
Adam Collard (community) | Approve | ||
Review via email: mp+372692@code.launchpad.net |
Commit message
Fixes LP: #1710278 - Notice when bind9 is hung, force kill the service, ensure it starts again, then perform a reload.
Backport of d0a4ec6ce2af49d
Description of the change
MAAS Lander (maas-lander) wrote : | # |
UNIT TESTS
-b fix-1710278-2.4 lp:~blake-rouse/maas/+git/maas into -b 2.4 lp:~maas-committers/maas
STATUS: FAILED
LOG: http://
COMMIT: 50a45ad9cd72dbc
Adam Collard (adam-collard) : | # |
MAAS Lander (maas-lander) wrote : | # |
UNIT TESTS
-b fix-1710278-2.4 lp:~blake-rouse/maas/+git/maas into -b 2.4 lp:~maas-committers/maas
STATUS: FAILED
LOG: http://
COMMIT: c64105dbfb38863
Blake Rouse (blake-rouse) wrote : | # |
The failures from the test running are javascript failures, which have not been touched in this branch.
MAAS Lander (maas-lander) wrote : | # |
LANDING
-b fix-1710278-2.4 lp:~blake-rouse/maas/+git/maas into -b 2.4 lp:~maas-committers/maas
STATUS: FAILED BUILD
LOG: http://
Preview Diff
1 | diff --git a/debian/extras/99-maas-common-sudoers b/debian/extras/99-maas-common-sudoers |
2 | index 1475b44..d333d65 100644 |
3 | --- a/debian/extras/99-maas-common-sudoers |
4 | +++ b/debian/extras/99-maas-common-sudoers |
5 | @@ -11,6 +11,12 @@ maas ALL= NOPASSWD: /bin/systemctl restart chrony |
6 | maas ALL= NOPASSWD: /bin/systemctl start chrony |
7 | maas ALL= NOPASSWD: /bin/systemctl status chrony |
8 | |
9 | +# Control of the DNS server: MAAS needs to track status, start it |
10 | +# again if stopped manually, or if MAAS must brutally kill it! |
11 | +maas ALL= NOPASSWD: /bin/systemctl status bind9 |
12 | +maas ALL= NOPASSWD: /bin/systemctl start bind9 |
13 | +maas ALL= NOPASSWD: /bin/systemctl kill -s SIGKILL bind9 |
14 | + |
15 | # Writing and deleting files as root. |
16 | maas ALL= NOPASSWD: /usr/lib/maas/maas-delete-file |
17 | maas ALL= NOPASSWD: /usr/lib/maas/maas-write-file |
18 | diff --git a/src/maasserver/dns/config.py b/src/maasserver/dns/config.py |
19 | index e27773a..5312b42 100644 |
20 | --- a/src/maasserver/dns/config.py |
21 | +++ b/src/maasserver/dns/config.py |
22 | @@ -42,7 +42,7 @@ def dns_force_reload(): |
23 | DNSPublication(source="Force reload").save() |
24 | |
25 | |
26 | -def dns_update_all_zones(reload_retry=False): |
27 | +def dns_update_all_zones(reload_retry=False, reload_timeout=2): |
28 | """Update all zone files for all domains. |
29 | |
30 | Serving these zone files means updating BIND's configuration to include |
31 | @@ -84,12 +84,12 @@ def dns_update_all_zones(reload_retry=False): |
32 | # actually needed but it seems safer to maintain this behaviour until we |
33 | # have a better understanding. |
34 | if reload_retry: |
35 | - bind_reload_with_retries() |
36 | + reloaded = bind_reload_with_retries(timeout=reload_timeout) |
37 | else: |
38 | - bind_reload() |
39 | + reloaded = bind_reload(timeout=reload_timeout) |
40 | |
41 | # Return the current serial and list of domain names. |
42 | - return serial, [ |
43 | + return serial, reloaded, [ |
44 | domain.name |
45 | for domain in domains |
46 | ] |
47 | diff --git a/src/maasserver/dns/tests/test_config.py b/src/maasserver/dns/tests/test_config.py |
48 | index c163420..e8303d4 100644 |
49 | --- a/src/maasserver/dns/tests/test_config.py |
50 | +++ b/src/maasserver/dns/tests/test_config.py |
51 | @@ -53,6 +53,7 @@ from testtools.matchers import ( |
52 | Contains, |
53 | Equals, |
54 | FileContains, |
55 | + Is, |
56 | MatchesSetwise, |
57 | MatchesStructure, |
58 | ) |
59 | @@ -255,7 +256,8 @@ class TestDNSConfigModifications(TestDNSServer): |
60 | bind_reload_with_retries = self.patch_autospec( |
61 | dns_config_module, "bind_reload_with_retries") |
62 | dns_update_all_zones(reload_retry=True) |
63 | - self.assertThat(bind_reload_with_retries, MockCalledOnceWith()) |
64 | + self.assertThat( |
65 | + bind_reload_with_retries, MockCalledOnceWith(timeout=2)) |
66 | |
67 | def test_dns_update_all_zones_passes_upstream_dns_parameter(self): |
68 | self.patch(settings, 'DNS_CONNECT', True) |
69 | @@ -314,8 +316,9 @@ class TestDNSConfigModifications(TestDNSServer): |
70 | self.patch( |
71 | dns_config_module, |
72 | "current_zone_serial").return_value = fake_serial |
73 | - serial, domains = dns_update_all_zones() |
74 | + serial, reloaded, domains = dns_update_all_zones() |
75 | self.assertThat(serial, Equals(fake_serial)) |
76 | + self.assertThat(reloaded, Is(True)) |
77 | self.assertThat(domains, MatchesSetwise(*[ |
78 | Equals(domain.name) |
79 | for domain in Domain.objects.filter(authoritative=True) |
80 | diff --git a/src/maasserver/region_controller.py b/src/maasserver/region_controller.py |
81 | index e54b0a9..cd62138 100644 |
82 | --- a/src/maasserver/region_controller.py |
83 | +++ b/src/maasserver/region_controller.py |
84 | @@ -26,6 +26,7 @@ __all__ = [ |
85 | from maasserver.dns.config import dns_update_all_zones |
86 | from maasserver.models.dnspublication import DNSPublication |
87 | from maasserver.proxyconfig import proxy_update_config |
88 | +from maasserver.service_monitor import service_monitor |
89 | from maasserver.utils.orm import transactional |
90 | from maasserver.utils.threads import deferToDatabase |
91 | from provisioningserver.logger import LegacyLogger |
92 | @@ -60,7 +61,7 @@ class RegionControllerService(Service): |
93 | See module documentation for more details. |
94 | """ |
95 | |
96 | - def __init__(self, postgresListener, clock=reactor): |
97 | + def __init__(self, postgresListener, clock=reactor, retryOnFailure=True): |
98 | """Initialise a new `RegionControllerService`. |
99 | |
100 | :param postgresListener: The `PostgresListenerService` that is running |
101 | @@ -68,6 +69,7 @@ class RegionControllerService(Service): |
102 | """ |
103 | super(RegionControllerService, self).__init__() |
104 | self.clock = clock |
105 | + self.retryOnFailure = retryOnFailure |
106 | self.processing = LoopingCall(self.process) |
107 | self.processing.clock = self.clock |
108 | self.processingDefer = None |
109 | @@ -118,12 +120,27 @@ class RegionControllerService(Service): |
110 | |
111 | def process(self): |
112 | """Process the DNS and/or proxy update.""" |
113 | + |
114 | + def _onFailureRetry(failure, attr): |
115 | + """Retry update on failure. |
116 | + |
117 | + Doesn't mask the failure, the failure is still raised. |
118 | + """ |
119 | + if self.retryOnFailure: |
120 | + setattr(self, attr, True) |
121 | + return failure |
122 | + |
123 | defers = [] |
124 | if self.needsDNSUpdate: |
125 | self.needsDNSUpdate = False |
126 | d = deferToDatabase(transactional(dns_update_all_zones)) |
127 | d.addCallback(self._checkSerial) |
128 | d.addCallback(self._logDNSReload) |
129 | + # Order here matters, first needsDNSUpdate is set then pass the |
130 | + # failure onto `_onDNSReloadFailure` to do the correct thing |
131 | + # with the DNS server. |
132 | + d.addErrback(_onFailureRetry, 'needsDNSUpdate') |
133 | + d.addErrback(self._onDNSReloadFailure) |
134 | d.addErrback( |
135 | log.err, |
136 | "Failed configuring DNS.") |
137 | @@ -150,7 +167,10 @@ class RegionControllerService(Service): |
138 | """Check that the serial of the domain is updated.""" |
139 | if result is None: |
140 | return None |
141 | - serial, domain_names = result |
142 | + serial, reloaded, domain_names = result |
143 | + if not reloaded: |
144 | + raise DNSReloadError( |
145 | + "Failed to reload DNS; timeout or rdnc command failed.") |
146 | not_matching_domains = set(domain_names) |
147 | loop = 0 |
148 | while len(not_matching_domains) > 0 and loop != 30: |
149 | @@ -173,13 +193,13 @@ class RegionControllerService(Service): |
150 | raise DNSReloadError( |
151 | "Failed to reload DNS; serial mismatch " |
152 | "on domains %s" % ', '.join(not_matching_domains)) |
153 | - return serial, domain_names |
154 | + return result |
155 | |
156 | def _logDNSReload(self, result): |
157 | """Log the reason DNS was reloaded.""" |
158 | if result is None: |
159 | return None |
160 | - serial, domain_names = result |
161 | + serial, _, domain_names = result |
162 | if self.previousSerial is None: |
163 | # This was the first load for starting the service. |
164 | self.previousSerial = serial |
165 | @@ -190,7 +210,10 @@ class RegionControllerService(Service): |
166 | # reason for the reload. |
167 | |
168 | def _logReason(reasons): |
169 | - if len(reasons) == 1: |
170 | + if len(reasons) == 0: |
171 | + msg = ( |
172 | + "Reloaded DNS configuration; previous failure (retry)") |
173 | + elif len(reasons) == 1: |
174 | msg = "Reloaded DNS configuration; %s" % reasons[0] |
175 | else: |
176 | msg = 'Reloaded DNS configuration: \n' + '\n'.join( |
177 | @@ -207,6 +230,16 @@ class RegionControllerService(Service): |
178 | self.previousSerial = serial |
179 | return d |
180 | |
181 | + def _onDNSReloadFailure(self, failure): |
182 | + """Force kill and restart bind9.""" |
183 | + failure.trap(DNSReloadError) |
184 | + if not self.retryOnFailure: |
185 | + return failure |
186 | + log.err(failure, "Failed configuring DNS; killing and restarting") |
187 | + d = service_monitor.killService('bind9') |
188 | + d.addErrback(log.err, "Failed to kill and restart DNS.") |
189 | + return d |
190 | + |
191 | @transactional |
192 | def _getReloadReasons(self, previousSerial, currentSerial): |
193 | return [ |
194 | diff --git a/src/maasserver/service_monitor.py b/src/maasserver/service_monitor.py |
195 | index 6efd9b7..05d235b 100644 |
196 | --- a/src/maasserver/service_monitor.py |
197 | +++ b/src/maasserver/service_monitor.py |
198 | @@ -29,6 +29,9 @@ class BIND9Service(AlwaysOnService): |
199 | service_name = "bind9" |
200 | snap_service_name = "bind9" |
201 | |
202 | + # Pass SIGKILL directly to parent. |
203 | + kill_extra_opts = ('-s', 'SIGKILL') |
204 | + |
205 | |
206 | class NTPServiceOnRegion(AlwaysOnService): |
207 | """Monitored NTP service on a region controller host.""" |
208 | diff --git a/src/maasserver/tests/test_region_controller.py b/src/maasserver/tests/test_region_controller.py |
209 | index 090e590..6926783 100644 |
210 | --- a/src/maasserver/tests/test_region_controller.py |
211 | +++ b/src/maasserver/tests/test_region_controller.py |
212 | @@ -183,7 +183,7 @@ class TestRegionControllerService(MAASServerTestCase): |
213 | service = RegionControllerService(sentinel.listener) |
214 | service.needsDNSUpdate = True |
215 | dns_result = ( |
216 | - random.randint(1, 1000), [ |
217 | + random.randint(1, 1000), True, [ |
218 | factory.make_name('domain') |
219 | for _ in range(3) |
220 | ]) |
221 | @@ -205,6 +205,46 @@ class TestRegionControllerService(MAASServerTestCase): |
222 | |
223 | @wait_for_reactor |
224 | @inlineCallbacks |
225 | + def test_process_zones_kills_bind_on_failed_reload(self): |
226 | + service = RegionControllerService(sentinel.listener) |
227 | + service.needsDNSUpdate = True |
228 | + service.retryOnFailure = True |
229 | + dns_result_0 = ( |
230 | + random.randint(1, 1000), False, [ |
231 | + factory.make_name('domain') |
232 | + for _ in range(3) |
233 | + ]) |
234 | + dns_result_1 = (dns_result_0[0], True, dns_result_0[2]) |
235 | + mock_dns_update_all_zones = self.patch( |
236 | + region_controller, "dns_update_all_zones") |
237 | + mock_dns_update_all_zones.side_effect = [dns_result_0, dns_result_1] |
238 | + |
239 | + service._checkSerialCalled = False |
240 | + orig_checkSerial = service._checkSerial |
241 | + |
242 | + def _checkSerial(result): |
243 | + if service._checkSerialCalled: |
244 | + return dns_result_1 |
245 | + service._checkSerialCalled = True |
246 | + return orig_checkSerial(result) |
247 | + |
248 | + mock_check_serial = self.patch(service, "_checkSerial") |
249 | + mock_check_serial.side_effect = _checkSerial |
250 | + mock_killService = self.patch( |
251 | + region_controller.service_monitor, 'killService') |
252 | + mock_killService.return_value = succeed(None) |
253 | + service.startProcessing() |
254 | + yield service.processingDefer |
255 | + self.assertThat( |
256 | + mock_dns_update_all_zones, MockCallsMatch(call(), call())) |
257 | + self.assertThat( |
258 | + mock_check_serial, |
259 | + MockCallsMatch(call(dns_result_0), call(dns_result_1))) |
260 | + self.assertThat( |
261 | + mock_killService, MockCalledOnceWith("bind9")) |
262 | + |
263 | + @wait_for_reactor |
264 | + @inlineCallbacks |
265 | def test_process_updates_proxy(self): |
266 | service = RegionControllerService(sentinel.listener) |
267 | service.needsProxyUpdate = True |
268 | @@ -226,6 +266,7 @@ class TestRegionControllerService(MAASServerTestCase): |
269 | def test_process_updates_zones_logs_failure(self): |
270 | service = RegionControllerService(sentinel.listener) |
271 | service.needsDNSUpdate = True |
272 | + service.retryOnFailure = False |
273 | mock_dns_update_all_zones = self.patch( |
274 | region_controller, "dns_update_all_zones") |
275 | mock_dns_update_all_zones.side_effect = factory.make_exception() |
276 | @@ -263,7 +304,7 @@ class TestRegionControllerService(MAASServerTestCase): |
277 | service.needsDNSUpdate = True |
278 | service.needsProxyUpdate = True |
279 | dns_result = ( |
280 | - random.randint(1, 1000), [ |
281 | + random.randint(1, 1000), True, [ |
282 | factory.make_name('domain') |
283 | for _ in range(3) |
284 | ]) |
285 | @@ -318,7 +359,7 @@ class TestRegionControllerService(MAASServerTestCase): |
286 | succeed(([self.make_soa_result(result_serial)], [], [])), |
287 | ] |
288 | # Error should not be raised. |
289 | - return service._checkSerial((formatted_serial, dns_names)) |
290 | + return service._checkSerial((formatted_serial, True, dns_names)) |
291 | |
292 | @wait_for_reactor |
293 | @inlineCallbacks |
294 | @@ -336,7 +377,7 @@ class TestRegionControllerService(MAASServerTestCase): |
295 | mock_lookup.side_effect = lambda *args: succeed(([], [], [])) |
296 | # Error should not be raised. |
297 | with ExpectedException(DNSReloadError): |
298 | - yield service._checkSerial((formatted_serial, dns_names)) |
299 | + yield service._checkSerial((formatted_serial, True, dns_names)) |
300 | |
301 | @wait_for_reactor |
302 | @inlineCallbacks |
303 | @@ -354,7 +395,7 @@ class TestRegionControllerService(MAASServerTestCase): |
304 | mock_lookup.side_effect = ValueError() |
305 | # Error should not be raised. |
306 | with ExpectedException(DNSReloadError): |
307 | - yield service._checkSerial((formatted_serial, dns_names)) |
308 | + yield service._checkSerial((formatted_serial, True, dns_names)) |
309 | |
310 | @wait_for_reactor |
311 | @inlineCallbacks |
312 | @@ -372,7 +413,7 @@ class TestRegionControllerService(MAASServerTestCase): |
313 | mock_lookup.side_effect = TimeoutError() |
314 | # Error should not be raised. |
315 | with ExpectedException(DNSReloadError): |
316 | - yield service._checkSerial((formatted_serial, dns_names)) |
317 | + yield service._checkSerial((formatted_serial, True, dns_names)) |
318 | |
319 | |
320 | class TestRegionControllerServiceTransactional(MAASTransactionServerTestCase): |
321 | @@ -393,7 +434,7 @@ class TestRegionControllerServiceTransactional(MAASTransactionServerTestCase): |
322 | service.needsDNSUpdate = True |
323 | service.previousSerial = publications[0].serial |
324 | dns_result = ( |
325 | - publications[-1].serial, [ |
326 | + publications[-1].serial, True, [ |
327 | factory.make_name('domain') |
328 | for _ in range(3) |
329 | ]) |
330 | @@ -430,7 +471,7 @@ class TestRegionControllerServiceTransactional(MAASTransactionServerTestCase): |
331 | service.needsDNSUpdate = True |
332 | service.previousSerial = publications[0].serial |
333 | dns_result = ( |
334 | - publications[-1].serial, [ |
335 | + publications[-1].serial, True, [ |
336 | factory.make_name('domain') |
337 | for _ in range(3) |
338 | ]) |
339 | diff --git a/src/provisioningserver/dns/actions.py b/src/provisioningserver/dns/actions.py |
340 | index 97ad042..5415184 100644 |
341 | --- a/src/provisioningserver/dns/actions.py |
342 | +++ b/src/provisioningserver/dns/actions.py |
343 | @@ -13,7 +13,10 @@ __all__ = [ |
344 | ] |
345 | |
346 | import collections |
347 | -from subprocess import CalledProcessError |
348 | +from subprocess import ( |
349 | + CalledProcessError, |
350 | + TimeoutExpired, |
351 | +) |
352 | from time import sleep |
353 | |
354 | from provisioningserver.dns.config import ( |
355 | @@ -48,7 +51,7 @@ def bind_reconfigure(): |
356 | raise |
357 | |
358 | |
359 | -def bind_reload(): |
360 | +def bind_reload(timeout=2): |
361 | """Ask BIND to reload its configuration and all zone files. This operation |
362 | is 'best effort' (with logging) as the server may not be running, and there |
363 | is often no context for reporting. |
364 | @@ -56,21 +59,24 @@ def bind_reload(): |
365 | :return: True if success, False otherwise. |
366 | """ |
367 | try: |
368 | - execute_rndc_command(("reload",)) |
369 | + execute_rndc_command(("reload",), timeout=timeout) |
370 | return True |
371 | except CalledProcessError as exc: |
372 | maaslog.error("Reloading BIND failed (is it running?): %s", exc) |
373 | return False |
374 | + except TimeoutExpired as exc: |
375 | + maaslog.error("Reloading BIND timed out (is it locked?): %s", exc) |
376 | + return False |
377 | |
378 | |
379 | -def bind_reload_with_retries(attempts=10, interval=2): |
380 | +def bind_reload_with_retries(attempts=10, interval=2, timeout=2): |
381 | """Ask BIND to reload its configuration and all zone files. |
382 | |
383 | :param attempts: The number of attempts. |
384 | :param interval: The time in seconds to sleep between each attempt. |
385 | """ |
386 | for countdown in range(attempts - 1, -1, -1): |
387 | - if bind_reload(): |
388 | + if bind_reload(timeout=timeout): |
389 | break |
390 | if countdown == 0: |
391 | break |
392 | diff --git a/src/provisioningserver/dns/config.py b/src/provisioningserver/dns/config.py |
393 | index b03b0fb..eb780d0 100644 |
394 | --- a/src/provisioningserver/dns/config.py |
395 | +++ b/src/provisioningserver/dns/config.py |
396 | @@ -172,12 +172,12 @@ def set_up_rndc(): |
397 | f.write(named_content) |
398 | |
399 | |
400 | -def execute_rndc_command(arguments): |
401 | +def execute_rndc_command(arguments, timeout=None): |
402 | """Execute a rndc command.""" |
403 | rndc_conf = get_rndc_conf_path() |
404 | rndc_cmd = ['rndc', '-c', rndc_conf] |
405 | rndc_cmd.extend(arguments) |
406 | - call_and_check(rndc_cmd) |
407 | + call_and_check(rndc_cmd, timeout=timeout) |
408 | |
409 | |
410 | def set_up_options_conf(overwrite=True, **kwargs): |
411 | diff --git a/src/provisioningserver/dns/tests/test_actions.py b/src/provisioningserver/dns/tests/test_actions.py |
412 | index 472bb56..58d216e 100644 |
413 | --- a/src/provisioningserver/dns/tests/test_actions.py |
414 | +++ b/src/provisioningserver/dns/tests/test_actions.py |
415 | @@ -78,7 +78,7 @@ class TestReload(MAASTestCase): |
416 | actions.bind_reload() |
417 | self.assertThat( |
418 | actions.execute_rndc_command, |
419 | - MockCalledOnceWith(("reload",))) |
420 | + MockCalledOnceWith(("reload",), timeout=2)) |
421 | |
422 | def test__logs_subprocess_error(self): |
423 | erc = self.patch_autospec(actions, "execute_rndc_command") |
424 | @@ -105,7 +105,7 @@ class TestReloadWithRetries(MAASTestCase): |
425 | bind_reload.return_value = False |
426 | attempts = randint(3, 13) |
427 | actions.bind_reload_with_retries(attempts=attempts) |
428 | - expected_calls = [call()] * attempts |
429 | + expected_calls = [call(timeout=2)] * attempts |
430 | self.assertThat( |
431 | actions.bind_reload, |
432 | MockCallsMatch(*expected_calls)) |
433 | @@ -114,11 +114,11 @@ class TestReloadWithRetries(MAASTestCase): |
434 | self.patch_autospec(actions, "sleep") # Disable. |
435 | bind_reload = self.patch(actions, "bind_reload") |
436 | bind_reload_return_values = [False, False, True, ] |
437 | - bind_reload.side_effect = lambda: ( |
438 | + bind_reload.side_effect = lambda *args, **kwargs: ( |
439 | bind_reload_return_values.pop(0)) |
440 | |
441 | actions.bind_reload_with_retries(attempts=5) |
442 | - expected_calls = [call(), call(), call()] |
443 | + expected_calls = [call(timeout=2), call(timeout=2), call(timeout=2)] |
444 | self.assertThat( |
445 | actions.bind_reload, |
446 | MockCallsMatch(*expected_calls)) |
447 | diff --git a/src/provisioningserver/dns/tests/test_config.py b/src/provisioningserver/dns/tests/test_config.py |
448 | index a5ecf1b..adfa0e5 100644 |
449 | --- a/src/provisioningserver/dns/tests/test_config.py |
450 | +++ b/src/provisioningserver/dns/tests/test_config.py |
451 | @@ -9,7 +9,10 @@ import errno |
452 | import os.path |
453 | import random |
454 | from textwrap import dedent |
455 | -from unittest.mock import Mock |
456 | +from unittest.mock import ( |
457 | + Mock, |
458 | + sentinel, |
459 | +) |
460 | |
461 | from fixtures import EnvironmentVariable |
462 | from maastesting.factory import factory |
463 | @@ -273,10 +276,11 @@ class TestRNDCUtilities(MAASTestCase): |
464 | fake_dir = patch_dns_config_path(self) |
465 | self.patch(config, 'call_and_check', recorder) |
466 | command = factory.make_string() |
467 | - execute_rndc_command([command]) |
468 | + execute_rndc_command([command], timeout=sentinel.timeout) |
469 | rndc_conf_path = os.path.join(fake_dir, MAAS_RNDC_CONF_NAME) |
470 | expected_command = ['rndc', '-c', rndc_conf_path, command] |
471 | self.assertEqual((expected_command,), recorder.calls[0][0]) |
472 | + self.assertEqual({'timeout': sentinel.timeout}, recorder.calls[0][1]) |
473 | |
474 | def test_extract_suggested_named_conf_extracts_section(self): |
475 | named_part = factory.make_string() |
476 | diff --git a/src/provisioningserver/utils/service_monitor.py b/src/provisioningserver/utils/service_monitor.py |
477 | index 43c2f83..ac4f3c6 100644 |
478 | --- a/src/provisioningserver/utils/service_monitor.py |
479 | +++ b/src/provisioningserver/utils/service_monitor.py |
480 | @@ -35,8 +35,12 @@ from provisioningserver.utils import ( |
481 | typed, |
482 | ) |
483 | from provisioningserver.utils.shell import get_env_with_bytes_locale |
484 | -from provisioningserver.utils.twisted import asynchronous |
485 | +from provisioningserver.utils.twisted import ( |
486 | + asynchronous, |
487 | + deferWithTimeout, |
488 | +) |
489 | from twisted.internet.defer import ( |
490 | + CancelledError, |
491 | DeferredList, |
492 | DeferredLock, |
493 | inlineCallbacks, |
494 | @@ -371,6 +375,67 @@ class ServiceMonitor: |
495 | yield self._performServiceAction(service, "reload") |
496 | |
497 | @asynchronous |
498 | + @inlineCallbacks |
499 | + def killService(self, name): |
500 | + """Kill service. |
501 | + |
502 | + Service will be killed then its state will be ensured to be in its |
503 | + expected state. This is a way of getting a broken related service to be |
504 | + responsive. |
505 | + """ |
506 | + service = self.getServiceByName(name) |
507 | + try: |
508 | + yield self._performServiceAction(service, "kill") |
509 | + except ServiceActionError: |
510 | + # Kill action is allowed to fail, as its possible the service is |
511 | + # already dead or not responding to correct signals to check |
512 | + # the current status. |
513 | + pass |
514 | + state = yield self.ensureService(name) |
515 | + return state |
516 | + |
517 | + @inlineCallbacks |
518 | + def _execCmd(self, cmd, env, timeout=8, retries=3): |
519 | + """Execute the `cmd` with the `env`.""" |
520 | + |
521 | + def decode(result): |
522 | + out, err, code = result |
523 | + return code, out.decode("utf-8"), err.decode("utf-8") |
524 | + |
525 | + def log_code(result, cmd, try_num): |
526 | + _, _, code = result |
527 | + log.debug( |
528 | + "[try:{try_num}] Service monitor got exit " |
529 | + "code '{code}' from cmd: {cmd()}", |
530 | + try_num=try_num, code=code, cmd=lambda: ' '.join(cmd)) |
531 | + return result |
532 | + |
533 | + def call_proc(cmd, env, try_num, timeout): |
534 | + log.debug( |
535 | + "[try:{try_num}] Service monitor executing cmd: {cmd()}", |
536 | + try_num=try_num, cmd=lambda: ' '.join(cmd)) |
537 | + |
538 | + d = deferWithTimeout( |
539 | + timeout, getProcessOutputAndValue, cmd[0], cmd[1:], env=env) |
540 | + d.addCallback(log_code, cmd, try_num) |
541 | + return d.addCallback(decode) |
542 | + |
543 | + for try_num in range(retries): |
544 | + try: |
545 | + result = yield call_proc(cmd, env, try_num + 1, timeout) |
546 | + except CancelledError: |
547 | + if try_num == retries - 1: |
548 | + # Failed on final retry. |
549 | + raise ServiceActionError( |
550 | + "Service monitor timed out after '%d' " |
551 | + "seconds and '%s' retries running cmd: %s" % ( |
552 | + timeout, retries, ' '.join(cmd))) |
553 | + else: |
554 | + # Try again. |
555 | + continue |
556 | + return result |
557 | + |
558 | + @asynchronous |
559 | def _execSystemDServiceAction(self, service_name, action, extra_opts=None): |
560 | """Perform the action with the systemctl command. |
561 | |
562 | @@ -390,21 +455,47 @@ class ServiceMonitor: |
563 | return d.addCallback(decode) |
564 | |
565 | @asynchronous |
566 | - def _execSupervisorServiceAction(self, service_name, action): |
567 | + def _execSupervisorServiceAction( |
568 | + self, service_name, action, extra_opts=None): |
569 | """Perform the action with the run-supervisorctl command. |
570 | |
571 | :return: tuple (exit code, std-output, std-error) |
572 | """ |
573 | env = get_env_with_bytes_locale() |
574 | cmd = os.path.join(snappy.get_snap_path(), "bin", "run-supervisorctl") |
575 | - cmd = (cmd, action, service_name) |
576 | |
577 | - def decode(result): |
578 | - out, err, code = result |
579 | - return code, out.decode("utf-8"), err.decode("utf-8") |
580 | + # supervisord doesn't support native kill like systemd. Emulate this |
581 | + # behaviour by getting the PID of the process and then killing the PID. |
582 | + if action == 'kill': |
583 | + |
584 | + def _kill_pid(result): |
585 | + exit_code, stdout, _ = result |
586 | + if exit_code != 0: |
587 | + return result |
588 | + try: |
589 | + pid = int(stdout.strip()) |
590 | + except ValueError: |
591 | + pid = 0 |
592 | + if pid == 0: |
593 | + # supervisorctl returns 0 when the process is already dead |
594 | + # or we where not able to get the actual pid. Nothing to |
595 | + # do, as its already dead. |
596 | + return 0, "", "" |
597 | + cmd = ('kill',) |
598 | + if extra_opts: |
599 | + cmd += extra_opts |
600 | + cmd += ('%s' % pid,) |
601 | + return self._execCmd(cmd, env) |
602 | + |
603 | + d = self._execCmd((cmd, 'pid', service_name), env) |
604 | + d.addCallback(_kill_pid) |
605 | + return d |
606 | |
607 | - d = getProcessOutputAndValue(cmd[0], cmd[1:], env=env) |
608 | - return d.addCallback(decode) |
609 | + cmd = (cmd, action) |
610 | + if extra_opts is not None: |
611 | + cmd += extra_opts |
612 | + cmd += (service_name,) |
613 | + return self._execCmd(cmd, env) |
614 | |
615 | @inlineCallbacks |
616 | def _performServiceAction(self, service, action): |
617 | @@ -416,8 +507,9 @@ class ServiceMonitor: |
618 | else: |
619 | exec_action = self._execSystemDServiceAction |
620 | service_name = service.service_name |
621 | + extra_opts = getattr(service, '%s_extra_opts' % action, None) |
622 | exit_code, output, error = yield lock.run( |
623 | - exec_action, service_name, action) |
624 | + exec_action, service_name, action, extra_opts=extra_opts) |
625 | if exit_code != 0: |
626 | error_msg = ( |
627 | "Service '%s' failed to %s: %s" % ( |
628 | diff --git a/src/provisioningserver/utils/shell.py b/src/provisioningserver/utils/shell.py |
629 | index 0ec7c39..488afb6 100644 |
630 | --- a/src/provisioningserver/utils/shell.py |
631 | +++ b/src/provisioningserver/utils/shell.py |
632 | @@ -105,8 +105,9 @@ def call_and_check(command, *args, **kwargs): |
633 | :return: The command's output from standard output. |
634 | :raise ExternalProcessError: If the command returns nonzero. |
635 | """ |
636 | + timeout = kwargs.pop('timeout', None) |
637 | process = Popen(command, *args, stdout=PIPE, stderr=PIPE, **kwargs) |
638 | - stdout, stderr = process.communicate() |
639 | + stdout, stderr = process.communicate(timeout=timeout) |
640 | stderr = stderr.strip() |
641 | if process.returncode != 0: |
642 | raise ExternalProcessError(process.returncode, command, output=stderr) |
643 | diff --git a/src/provisioningserver/utils/tests/test_service_monitor.py b/src/provisioningserver/utils/tests/test_service_monitor.py |
644 | index 51d39c3..8cb507c 100644 |
645 | --- a/src/provisioningserver/utils/tests/test_service_monitor.py |
646 | +++ b/src/provisioningserver/utils/tests/test_service_monitor.py |
647 | @@ -10,6 +10,7 @@ import os |
648 | import random |
649 | from textwrap import dedent |
650 | from unittest.mock import ( |
651 | + call, |
652 | Mock, |
653 | sentinel, |
654 | ) |
655 | @@ -18,10 +19,12 @@ from fixtures import FakeLogger |
656 | from maastesting.factory import factory |
657 | from maastesting.matchers import ( |
658 | MockCalledOnceWith, |
659 | + MockCallsMatch, |
660 | MockNotCalled, |
661 | ) |
662 | from maastesting.runtest import MAASTwistedRunTest |
663 | from maastesting.testcase import MAASTestCase |
664 | +from maastesting.twisted import always_fail_with |
665 | from provisioningserver.utils import ( |
666 | service_monitor as service_monitor_module, |
667 | snappy, |
668 | @@ -37,6 +40,7 @@ from provisioningserver.utils.service_monitor import ( |
669 | ServiceUnknownError, |
670 | ) |
671 | from provisioningserver.utils.shell import get_env_with_bytes_locale |
672 | +from provisioningserver.utils.twisted import pause |
673 | from testscenarios import multiply_scenarios |
674 | from testtools import ExpectedException |
675 | from testtools.matchers import ( |
676 | @@ -45,7 +49,9 @@ from testtools.matchers import ( |
677 | ) |
678 | from twisted.internet import reactor |
679 | from twisted.internet.defer import ( |
680 | + CancelledError, |
681 | DeferredLock, |
682 | + fail, |
683 | inlineCallbacks, |
684 | succeed, |
685 | ) |
686 | @@ -448,6 +454,62 @@ class TestServiceMonitor(MAASTestCase): |
687 | yield service_monitor.reloadService(fake_service.name, if_on=True) |
688 | |
689 | @inlineCallbacks |
690 | + def test__killService_performs_kill_then_ensureService(self): |
691 | + fake_service = make_fake_service(SERVICE_STATE.ON) |
692 | + service_monitor = self.make_service_monitor([fake_service]) |
693 | + mock_performServiceAction = self.patch( |
694 | + service_monitor, "_performServiceAction") |
695 | + mock_performServiceAction.return_value = succeed(None) |
696 | + mock_ensureService = self.patch(service_monitor, "ensureService") |
697 | + mock_ensureService.return_value = succeed( |
698 | + ServiceState(SERVICE_STATE.ON, "running")) |
699 | + yield service_monitor.killService(fake_service.name) |
700 | + self.assertThat( |
701 | + mock_performServiceAction, |
702 | + MockCalledOnceWith(fake_service, "kill")) |
703 | + self.assertThat( |
704 | + mock_ensureService, |
705 | + MockCalledOnceWith(fake_service.name)) |
706 | + |
707 | + @inlineCallbacks |
708 | + def test__killService_doesnt_fail_on_ServiceActionError(self): |
709 | + fake_service = make_fake_service(SERVICE_STATE.ON) |
710 | + service_monitor = self.make_service_monitor([fake_service]) |
711 | + mock_performServiceAction = self.patch( |
712 | + service_monitor, "_performServiceAction") |
713 | + mock_performServiceAction.return_value = fail(ServiceActionError()) |
714 | + mock_ensureService = self.patch(service_monitor, "ensureService") |
715 | + mock_ensureService.return_value = succeed( |
716 | + ServiceState(SERVICE_STATE.ON, "running")) |
717 | + yield service_monitor.killService(fake_service.name) |
718 | + self.assertThat( |
719 | + mock_performServiceAction, |
720 | + MockCalledOnceWith(fake_service, "kill")) |
721 | + self.assertThat( |
722 | + mock_ensureService, |
723 | + MockCalledOnceWith(fake_service.name)) |
724 | + |
725 | + @inlineCallbacks |
726 | + def test___execCmd_times_out(self): |
727 | + monitor = ServiceMonitor(make_fake_service()) |
728 | + with ExpectedException(ServiceActionError): |
729 | + yield monitor._execCmd( |
730 | + ["sleep", "0.3"], {}, timeout=0.1, retries=1) |
731 | + # Pause long enough for the reactor to cleanup the process. |
732 | + yield pause(0.5) |
733 | + |
734 | + @inlineCallbacks |
735 | + def test___execCmd_retries(self): |
736 | + monitor = ServiceMonitor(make_fake_service()) |
737 | + mock_deferWithTimeout = self.patch( |
738 | + service_monitor_module, "deferWithTimeout") |
739 | + mock_deferWithTimeout.side_effect = always_fail_with(CancelledError()) |
740 | + with ExpectedException(ServiceActionError): |
741 | + yield monitor._execCmd( |
742 | + ["echo", "Hello"], {}, retries=3) |
743 | + self.assertEqual(3, mock_deferWithTimeout.call_count) |
744 | + |
745 | + @inlineCallbacks |
746 | def test___execSystemDServiceAction_calls_systemctl(self): |
747 | service_monitor = self.make_service_monitor() |
748 | service_name = factory.make_name("service") |
749 | @@ -511,16 +573,44 @@ class TestServiceMonitor(MAASTestCase): |
750 | mock_getProcessOutputAndValue = self.patch( |
751 | service_monitor_module, "getProcessOutputAndValue") |
752 | mock_getProcessOutputAndValue.return_value = succeed((b"", b"", 0)) |
753 | + extra_opts = ('--extra', factory.make_name('extra')) |
754 | yield service_monitor._execSupervisorServiceAction( |
755 | - service_name, action) |
756 | + service_name, action, extra_opts=extra_opts) |
757 | cmd = os.path.join(snap_path, 'bin', 'run-supervisorctl') |
758 | - cmd = cmd, action, service_name |
759 | + cmd = (cmd, action) + extra_opts + (service_name,) |
760 | self.assertThat( |
761 | mock_getProcessOutputAndValue, MockCalledOnceWith( |
762 | # The environment contains LC_ALL and LANG too. |
763 | cmd[0], cmd[1:], env=get_env_with_bytes_locale())) |
764 | |
765 | @inlineCallbacks |
766 | + def test___execSupervisorServiceAction_emulates_kill(self): |
767 | + snap_path = factory.make_name("path") |
768 | + self.patch(snappy, "get_snap_path").return_value = snap_path |
769 | + service_monitor = self.make_service_monitor() |
770 | + service_name = factory.make_name("service") |
771 | + fake_pid = random.randint(1, 100) |
772 | + mock_getProcessOutputAndValue = self.patch( |
773 | + service_monitor_module, "getProcessOutputAndValue") |
774 | + mock_getProcessOutputAndValue.side_effect = [ |
775 | + succeed((("%s" % fake_pid).encode('utf-8'), b"", 0)), |
776 | + succeed((b"", b"", 0)), |
777 | + ] |
778 | + extra_opts = ('-s', factory.make_name('SIGKILL')) |
779 | + yield service_monitor._execSupervisorServiceAction( |
780 | + service_name, 'kill', extra_opts=extra_opts) |
781 | + cmd = os.path.join(snap_path, 'bin', 'run-supervisorctl') |
782 | + self.assertThat( |
783 | + mock_getProcessOutputAndValue, MockCallsMatch( |
784 | + call( |
785 | + cmd, ("pid", service_name), |
786 | + env=get_env_with_bytes_locale()), |
787 | + call( |
788 | + "kill", extra_opts + ("%s" % fake_pid,), |
789 | + env=get_env_with_bytes_locale()), |
790 | + )) |
791 | + |
792 | + @inlineCallbacks |
793 | def test___execSupervisorServiceAction_decodes_stdout_and_stderr(self): |
794 | # From https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt. |
795 | example_text = ( |
796 | @@ -555,13 +645,16 @@ class TestServiceMonitor(MAASTestCase): |
797 | service_monitor, "_execSystemDServiceAction") |
798 | mock_execSystemDServiceAction.return_value = (0, "", "") |
799 | action = factory.make_name("action") |
800 | + extra_opts = ('--option', factory.make_name("option")) |
801 | + setattr(service, '%s_extra_opts' % action, extra_opts) |
802 | yield service_monitor._performServiceAction(service, action) |
803 | self.assertThat(service_lock.run, MockCalledOnceWith( |
804 | service_monitor._execSystemDServiceAction, |
805 | - service.service_name, action)) |
806 | + service.service_name, action, extra_opts=extra_opts)) |
807 | self.assertThat( |
808 | mock_execSystemDServiceAction, |
809 | - MockCalledOnceWith(service.service_name, action)) |
810 | + MockCalledOnceWith( |
811 | + service.service_name, action, extra_opts=extra_opts)) |
812 | |
813 | @inlineCallbacks |
814 | def test___performServiceAction_holds_lock_perform_supervisor_action(self): |
815 | @@ -575,13 +668,16 @@ class TestServiceMonitor(MAASTestCase): |
816 | service_monitor, "_execSupervisorServiceAction") |
817 | mock_execSupervisorServiceAction.return_value = (0, "", "") |
818 | action = factory.make_name("action") |
819 | + extra_opts = ('--option', factory.make_name("option")) |
820 | + setattr(service, '%s_extra_opts' % action, extra_opts) |
821 | yield service_monitor._performServiceAction(service, action) |
822 | self.assertThat(service_lock.run, MockCalledOnceWith( |
823 | service_monitor._execSupervisorServiceAction, |
824 | - service.service_name, action)) |
825 | + service.service_name, action, extra_opts=extra_opts)) |
826 | self.assertThat( |
827 | mock_execSupervisorServiceAction, |
828 | - MockCalledOnceWith(service.service_name, action)) |
829 | + MockCalledOnceWith( |
830 | + service.service_name, action, extra_opts=extra_opts)) |
831 | |
832 | @inlineCallbacks |
833 | def test___performServiceAction_raises_ServiceActionError_if_fails(self): |
834 | diff --git a/src/provisioningserver/utils/tests/test_shell.py b/src/provisioningserver/utils/tests/test_shell.py |
835 | index 4492b3a..03f67fd 100644 |
836 | --- a/src/provisioningserver/utils/tests/test_shell.py |
837 | +++ b/src/provisioningserver/utils/tests/test_shell.py |
838 | @@ -6,6 +6,7 @@ |
839 | __all__ = [] |
840 | |
841 | import os |
842 | +import random |
843 | from subprocess import CalledProcessError |
844 | |
845 | from fixtures import EnvironmentVariable |
846 | @@ -54,6 +55,14 @@ class TestCallAndCheck(MAASTestCase): |
847 | self.assertEqual(command, error.cmd) |
848 | self.assertEqual(message, error.output) |
849 | |
850 | + def test__passes_timeout_to_communicate(self): |
851 | + command = factory.make_name('command') |
852 | + process = self.patch_popen() |
853 | + timeout = random.randint(1, 10) |
854 | + call_and_check(command, timeout=timeout) |
855 | + self.assertThat( |
856 | + process.communicate, MockCalledOnceWith(timeout=timeout)) |
857 | + |
858 | def test__reports_stderr_on_failure(self): |
859 | nonfile = os.path.join(self.make_dir(), factory.make_name('nonesuch')) |
860 | error = self.assertRaises( |
861 | diff --git a/versions.cfg b/versions.cfg |
862 | index 2d8eb7e..f214d65 100644 |
863 | --- a/versions.cfg |
864 | +++ b/versions.cfg |
865 | @@ -25,9 +25,11 @@ linecache2 = 1.0.0 |
866 | nose = 1.3.7 |
867 | nose-timer = 0.6.0 |
868 | pbr = 1.10.0 |
869 | +pexpect = 4.6.0 |
870 | pickleshare = 0.7.4 |
871 | postgresfixture = 0.4.1 |
872 | prompt-toolkit = 1.0.9 |
873 | +ptyprocess = 0.6.0 |
874 | python-mimeparse = 1.6.0 |
875 | python-subunit = 1.2.0 |
876 | selenium = 2.45 |
UNIT TESTS
-b fix-1710278-2.4 lp:~blake-rouse/maas/+git/maas into -b 2.4 lp:~maas-committers/maas
STATUS: FAILED maas-ci- jenkins. internal: 8080/job/ maas/job/ branch- tester/ 6400/console a456cbca5b7e577 e86b7e892e
LOG: http://
COMMIT: 06640035f714bad