Merge ~blake-rouse/maas:fix-1710278-2.4 into maas:2.4

Proposed by Blake Rouse
Status: Merged
Approved by: Björn Tillenius
Approved revision: c64105dbfb38863b5c7b38616864d2c379ac356c
Merge reported by: MAAS Lander
Merged at revision: not available
Proposed branch: ~blake-rouse/maas:fix-1710278-2.4
Merge into: maas:2.4
Diff against target: 876 lines (+344/-48)
15 files modified
debian/extras/99-maas-common-sudoers (+6/-0)
src/maasserver/dns/config.py (+4/-4)
src/maasserver/dns/tests/test_config.py (+5/-2)
src/maasserver/region_controller.py (+38/-5)
src/maasserver/service_monitor.py (+3/-0)
src/maasserver/tests/test_region_controller.py (+49/-8)
src/provisioningserver/dns/actions.py (+11/-5)
src/provisioningserver/dns/config.py (+2/-2)
src/provisioningserver/dns/tests/test_actions.py (+4/-4)
src/provisioningserver/dns/tests/test_config.py (+6/-2)
src/provisioningserver/utils/service_monitor.py (+101/-9)
src/provisioningserver/utils/shell.py (+2/-1)
src/provisioningserver/utils/tests/test_service_monitor.py (+102/-6)
src/provisioningserver/utils/tests/test_shell.py (+9/-0)
versions.cfg (+2/-0)
Reviewer Review Type Date Requested Status
MAAS Lander Needs Fixing
Adam Collard (community) Approve
Review via email: mp+372692@code.launchpad.net

Commit message

Fixes LP: #1710278 - Notice when bind9 is hung, force kill the service, ensure it starts again, then perform a reload.

Backport of d0a4ec6ce2af49d00431c0d7ad4c5164d3951e1a.

To post a comment you must log in.
Revision history for this message
MAAS Lander (maas-lander) wrote :

UNIT TESTS
-b fix-1710278-2.4 lp:~blake-rouse/maas/+git/maas into -b 2.4 lp:~maas-committers/maas

STATUS: FAILED
LOG: http://maas-ci-jenkins.internal:8080/job/maas/job/branch-tester/6400/console
COMMIT: 06640035f714bada456cbca5b7e577e86b7e892e

review: Needs Fixing
Revision history for this message
MAAS Lander (maas-lander) wrote :

UNIT TESTS
-b fix-1710278-2.4 lp:~blake-rouse/maas/+git/maas into -b 2.4 lp:~maas-committers/maas

STATUS: FAILED
LOG: http://maas-ci-jenkins.internal:8080/job/maas/job/branch-tester/6402/console
COMMIT: 50a45ad9cd72dbcea4a5d099a864c62249ff26f5

review: Needs Fixing
Revision history for this message
Adam Collard (adam-collard) :
review: Approve
Revision history for this message
MAAS Lander (maas-lander) wrote :

UNIT TESTS
-b fix-1710278-2.4 lp:~blake-rouse/maas/+git/maas into -b 2.4 lp:~maas-committers/maas

STATUS: FAILED
LOG: http://maas-ci-jenkins.internal:8080/job/maas/job/branch-tester/6729/console
COMMIT: c64105dbfb38863b5c7b38616864d2c379ac356c

review: Needs Fixing
Revision history for this message
Blake Rouse (blake-rouse) wrote :

The failures from the test running are javascript failures, which have not been touched in this branch.

Revision history for this message
MAAS Lander (maas-lander) wrote :

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1diff --git a/debian/extras/99-maas-common-sudoers b/debian/extras/99-maas-common-sudoers
2index 1475b44..d333d65 100644
3--- a/debian/extras/99-maas-common-sudoers
4+++ b/debian/extras/99-maas-common-sudoers
5@@ -11,6 +11,12 @@ maas ALL= NOPASSWD: /bin/systemctl restart chrony
6 maas ALL= NOPASSWD: /bin/systemctl start chrony
7 maas ALL= NOPASSWD: /bin/systemctl status chrony
8
9+# Control of the DNS server: MAAS needs to track status, start it
10+# again if stopped manually, or if MAAS must brutally kill it!
11+maas ALL= NOPASSWD: /bin/systemctl status bind9
12+maas ALL= NOPASSWD: /bin/systemctl start bind9
13+maas ALL= NOPASSWD: /bin/systemctl kill -s SIGKILL bind9
14+
15 # Writing and deleting files as root.
16 maas ALL= NOPASSWD: /usr/lib/maas/maas-delete-file
17 maas ALL= NOPASSWD: /usr/lib/maas/maas-write-file
18diff --git a/src/maasserver/dns/config.py b/src/maasserver/dns/config.py
19index e27773a..5312b42 100644
20--- a/src/maasserver/dns/config.py
21+++ b/src/maasserver/dns/config.py
22@@ -42,7 +42,7 @@ def dns_force_reload():
23 DNSPublication(source="Force reload").save()
24
25
26-def dns_update_all_zones(reload_retry=False):
27+def dns_update_all_zones(reload_retry=False, reload_timeout=2):
28 """Update all zone files for all domains.
29
30 Serving these zone files means updating BIND's configuration to include
31@@ -84,12 +84,12 @@ def dns_update_all_zones(reload_retry=False):
32 # actually needed but it seems safer to maintain this behaviour until we
33 # have a better understanding.
34 if reload_retry:
35- bind_reload_with_retries()
36+ reloaded = bind_reload_with_retries(timeout=reload_timeout)
37 else:
38- bind_reload()
39+ reloaded = bind_reload(timeout=reload_timeout)
40
41 # Return the current serial and list of domain names.
42- return serial, [
43+ return serial, reloaded, [
44 domain.name
45 for domain in domains
46 ]
47diff --git a/src/maasserver/dns/tests/test_config.py b/src/maasserver/dns/tests/test_config.py
48index c163420..e8303d4 100644
49--- a/src/maasserver/dns/tests/test_config.py
50+++ b/src/maasserver/dns/tests/test_config.py
51@@ -53,6 +53,7 @@ from testtools.matchers import (
52 Contains,
53 Equals,
54 FileContains,
55+ Is,
56 MatchesSetwise,
57 MatchesStructure,
58 )
59@@ -255,7 +256,8 @@ class TestDNSConfigModifications(TestDNSServer):
60 bind_reload_with_retries = self.patch_autospec(
61 dns_config_module, "bind_reload_with_retries")
62 dns_update_all_zones(reload_retry=True)
63- self.assertThat(bind_reload_with_retries, MockCalledOnceWith())
64+ self.assertThat(
65+ bind_reload_with_retries, MockCalledOnceWith(timeout=2))
66
67 def test_dns_update_all_zones_passes_upstream_dns_parameter(self):
68 self.patch(settings, 'DNS_CONNECT', True)
69@@ -314,8 +316,9 @@ class TestDNSConfigModifications(TestDNSServer):
70 self.patch(
71 dns_config_module,
72 "current_zone_serial").return_value = fake_serial
73- serial, domains = dns_update_all_zones()
74+ serial, reloaded, domains = dns_update_all_zones()
75 self.assertThat(serial, Equals(fake_serial))
76+ self.assertThat(reloaded, Is(True))
77 self.assertThat(domains, MatchesSetwise(*[
78 Equals(domain.name)
79 for domain in Domain.objects.filter(authoritative=True)
80diff --git a/src/maasserver/region_controller.py b/src/maasserver/region_controller.py
81index e54b0a9..cd62138 100644
82--- a/src/maasserver/region_controller.py
83+++ b/src/maasserver/region_controller.py
84@@ -26,6 +26,7 @@ __all__ = [
85 from maasserver.dns.config import dns_update_all_zones
86 from maasserver.models.dnspublication import DNSPublication
87 from maasserver.proxyconfig import proxy_update_config
88+from maasserver.service_monitor import service_monitor
89 from maasserver.utils.orm import transactional
90 from maasserver.utils.threads import deferToDatabase
91 from provisioningserver.logger import LegacyLogger
92@@ -60,7 +61,7 @@ class RegionControllerService(Service):
93 See module documentation for more details.
94 """
95
96- def __init__(self, postgresListener, clock=reactor):
97+ def __init__(self, postgresListener, clock=reactor, retryOnFailure=True):
98 """Initialise a new `RegionControllerService`.
99
100 :param postgresListener: The `PostgresListenerService` that is running
101@@ -68,6 +69,7 @@ class RegionControllerService(Service):
102 """
103 super(RegionControllerService, self).__init__()
104 self.clock = clock
105+ self.retryOnFailure = retryOnFailure
106 self.processing = LoopingCall(self.process)
107 self.processing.clock = self.clock
108 self.processingDefer = None
109@@ -118,12 +120,27 @@ class RegionControllerService(Service):
110
111 def process(self):
112 """Process the DNS and/or proxy update."""
113+
114+ def _onFailureRetry(failure, attr):
115+ """Retry update on failure.
116+
117+ Doesn't mask the failure, the failure is still raised.
118+ """
119+ if self.retryOnFailure:
120+ setattr(self, attr, True)
121+ return failure
122+
123 defers = []
124 if self.needsDNSUpdate:
125 self.needsDNSUpdate = False
126 d = deferToDatabase(transactional(dns_update_all_zones))
127 d.addCallback(self._checkSerial)
128 d.addCallback(self._logDNSReload)
129+ # Order here matters, first needsDNSUpdate is set then pass the
130+ # failure onto `_onDNSReloadFailure` to do the correct thing
131+ # with the DNS server.
132+ d.addErrback(_onFailureRetry, 'needsDNSUpdate')
133+ d.addErrback(self._onDNSReloadFailure)
134 d.addErrback(
135 log.err,
136 "Failed configuring DNS.")
137@@ -150,7 +167,10 @@ class RegionControllerService(Service):
138 """Check that the serial of the domain is updated."""
139 if result is None:
140 return None
141- serial, domain_names = result
142+ serial, reloaded, domain_names = result
143+ if not reloaded:
144+ raise DNSReloadError(
145+ "Failed to reload DNS; timeout or rdnc command failed.")
146 not_matching_domains = set(domain_names)
147 loop = 0
148 while len(not_matching_domains) > 0 and loop != 30:
149@@ -173,13 +193,13 @@ class RegionControllerService(Service):
150 raise DNSReloadError(
151 "Failed to reload DNS; serial mismatch "
152 "on domains %s" % ', '.join(not_matching_domains))
153- return serial, domain_names
154+ return result
155
156 def _logDNSReload(self, result):
157 """Log the reason DNS was reloaded."""
158 if result is None:
159 return None
160- serial, domain_names = result
161+ serial, _, domain_names = result
162 if self.previousSerial is None:
163 # This was the first load for starting the service.
164 self.previousSerial = serial
165@@ -190,7 +210,10 @@ class RegionControllerService(Service):
166 # reason for the reload.
167
168 def _logReason(reasons):
169- if len(reasons) == 1:
170+ if len(reasons) == 0:
171+ msg = (
172+ "Reloaded DNS configuration; previous failure (retry)")
173+ elif len(reasons) == 1:
174 msg = "Reloaded DNS configuration; %s" % reasons[0]
175 else:
176 msg = 'Reloaded DNS configuration: \n' + '\n'.join(
177@@ -207,6 +230,16 @@ class RegionControllerService(Service):
178 self.previousSerial = serial
179 return d
180
181+ def _onDNSReloadFailure(self, failure):
182+ """Force kill and restart bind9."""
183+ failure.trap(DNSReloadError)
184+ if not self.retryOnFailure:
185+ return failure
186+ log.err(failure, "Failed configuring DNS; killing and restarting")
187+ d = service_monitor.killService('bind9')
188+ d.addErrback(log.err, "Failed to kill and restart DNS.")
189+ return d
190+
191 @transactional
192 def _getReloadReasons(self, previousSerial, currentSerial):
193 return [
194diff --git a/src/maasserver/service_monitor.py b/src/maasserver/service_monitor.py
195index 6efd9b7..05d235b 100644
196--- a/src/maasserver/service_monitor.py
197+++ b/src/maasserver/service_monitor.py
198@@ -29,6 +29,9 @@ class BIND9Service(AlwaysOnService):
199 service_name = "bind9"
200 snap_service_name = "bind9"
201
202+ # Pass SIGKILL directly to parent.
203+ kill_extra_opts = ('-s', 'SIGKILL')
204+
205
206 class NTPServiceOnRegion(AlwaysOnService):
207 """Monitored NTP service on a region controller host."""
208diff --git a/src/maasserver/tests/test_region_controller.py b/src/maasserver/tests/test_region_controller.py
209index 090e590..6926783 100644
210--- a/src/maasserver/tests/test_region_controller.py
211+++ b/src/maasserver/tests/test_region_controller.py
212@@ -183,7 +183,7 @@ class TestRegionControllerService(MAASServerTestCase):
213 service = RegionControllerService(sentinel.listener)
214 service.needsDNSUpdate = True
215 dns_result = (
216- random.randint(1, 1000), [
217+ random.randint(1, 1000), True, [
218 factory.make_name('domain')
219 for _ in range(3)
220 ])
221@@ -205,6 +205,46 @@ class TestRegionControllerService(MAASServerTestCase):
222
223 @wait_for_reactor
224 @inlineCallbacks
225+ def test_process_zones_kills_bind_on_failed_reload(self):
226+ service = RegionControllerService(sentinel.listener)
227+ service.needsDNSUpdate = True
228+ service.retryOnFailure = True
229+ dns_result_0 = (
230+ random.randint(1, 1000), False, [
231+ factory.make_name('domain')
232+ for _ in range(3)
233+ ])
234+ dns_result_1 = (dns_result_0[0], True, dns_result_0[2])
235+ mock_dns_update_all_zones = self.patch(
236+ region_controller, "dns_update_all_zones")
237+ mock_dns_update_all_zones.side_effect = [dns_result_0, dns_result_1]
238+
239+ service._checkSerialCalled = False
240+ orig_checkSerial = service._checkSerial
241+
242+ def _checkSerial(result):
243+ if service._checkSerialCalled:
244+ return dns_result_1
245+ service._checkSerialCalled = True
246+ return orig_checkSerial(result)
247+
248+ mock_check_serial = self.patch(service, "_checkSerial")
249+ mock_check_serial.side_effect = _checkSerial
250+ mock_killService = self.patch(
251+ region_controller.service_monitor, 'killService')
252+ mock_killService.return_value = succeed(None)
253+ service.startProcessing()
254+ yield service.processingDefer
255+ self.assertThat(
256+ mock_dns_update_all_zones, MockCallsMatch(call(), call()))
257+ self.assertThat(
258+ mock_check_serial,
259+ MockCallsMatch(call(dns_result_0), call(dns_result_1)))
260+ self.assertThat(
261+ mock_killService, MockCalledOnceWith("bind9"))
262+
263+ @wait_for_reactor
264+ @inlineCallbacks
265 def test_process_updates_proxy(self):
266 service = RegionControllerService(sentinel.listener)
267 service.needsProxyUpdate = True
268@@ -226,6 +266,7 @@ class TestRegionControllerService(MAASServerTestCase):
269 def test_process_updates_zones_logs_failure(self):
270 service = RegionControllerService(sentinel.listener)
271 service.needsDNSUpdate = True
272+ service.retryOnFailure = False
273 mock_dns_update_all_zones = self.patch(
274 region_controller, "dns_update_all_zones")
275 mock_dns_update_all_zones.side_effect = factory.make_exception()
276@@ -263,7 +304,7 @@ class TestRegionControllerService(MAASServerTestCase):
277 service.needsDNSUpdate = True
278 service.needsProxyUpdate = True
279 dns_result = (
280- random.randint(1, 1000), [
281+ random.randint(1, 1000), True, [
282 factory.make_name('domain')
283 for _ in range(3)
284 ])
285@@ -318,7 +359,7 @@ class TestRegionControllerService(MAASServerTestCase):
286 succeed(([self.make_soa_result(result_serial)], [], [])),
287 ]
288 # Error should not be raised.
289- return service._checkSerial((formatted_serial, dns_names))
290+ return service._checkSerial((formatted_serial, True, dns_names))
291
292 @wait_for_reactor
293 @inlineCallbacks
294@@ -336,7 +377,7 @@ class TestRegionControllerService(MAASServerTestCase):
295 mock_lookup.side_effect = lambda *args: succeed(([], [], []))
296 # Error should not be raised.
297 with ExpectedException(DNSReloadError):
298- yield service._checkSerial((formatted_serial, dns_names))
299+ yield service._checkSerial((formatted_serial, True, dns_names))
300
301 @wait_for_reactor
302 @inlineCallbacks
303@@ -354,7 +395,7 @@ class TestRegionControllerService(MAASServerTestCase):
304 mock_lookup.side_effect = ValueError()
305 # Error should not be raised.
306 with ExpectedException(DNSReloadError):
307- yield service._checkSerial((formatted_serial, dns_names))
308+ yield service._checkSerial((formatted_serial, True, dns_names))
309
310 @wait_for_reactor
311 @inlineCallbacks
312@@ -372,7 +413,7 @@ class TestRegionControllerService(MAASServerTestCase):
313 mock_lookup.side_effect = TimeoutError()
314 # Error should not be raised.
315 with ExpectedException(DNSReloadError):
316- yield service._checkSerial((formatted_serial, dns_names))
317+ yield service._checkSerial((formatted_serial, True, dns_names))
318
319
320 class TestRegionControllerServiceTransactional(MAASTransactionServerTestCase):
321@@ -393,7 +434,7 @@ class TestRegionControllerServiceTransactional(MAASTransactionServerTestCase):
322 service.needsDNSUpdate = True
323 service.previousSerial = publications[0].serial
324 dns_result = (
325- publications[-1].serial, [
326+ publications[-1].serial, True, [
327 factory.make_name('domain')
328 for _ in range(3)
329 ])
330@@ -430,7 +471,7 @@ class TestRegionControllerServiceTransactional(MAASTransactionServerTestCase):
331 service.needsDNSUpdate = True
332 service.previousSerial = publications[0].serial
333 dns_result = (
334- publications[-1].serial, [
335+ publications[-1].serial, True, [
336 factory.make_name('domain')
337 for _ in range(3)
338 ])
339diff --git a/src/provisioningserver/dns/actions.py b/src/provisioningserver/dns/actions.py
340index 97ad042..5415184 100644
341--- a/src/provisioningserver/dns/actions.py
342+++ b/src/provisioningserver/dns/actions.py
343@@ -13,7 +13,10 @@ __all__ = [
344 ]
345
346 import collections
347-from subprocess import CalledProcessError
348+from subprocess import (
349+ CalledProcessError,
350+ TimeoutExpired,
351+)
352 from time import sleep
353
354 from provisioningserver.dns.config import (
355@@ -48,7 +51,7 @@ def bind_reconfigure():
356 raise
357
358
359-def bind_reload():
360+def bind_reload(timeout=2):
361 """Ask BIND to reload its configuration and all zone files. This operation
362 is 'best effort' (with logging) as the server may not be running, and there
363 is often no context for reporting.
364@@ -56,21 +59,24 @@ def bind_reload():
365 :return: True if success, False otherwise.
366 """
367 try:
368- execute_rndc_command(("reload",))
369+ execute_rndc_command(("reload",), timeout=timeout)
370 return True
371 except CalledProcessError as exc:
372 maaslog.error("Reloading BIND failed (is it running?): %s", exc)
373 return False
374+ except TimeoutExpired as exc:
375+ maaslog.error("Reloading BIND timed out (is it locked?): %s", exc)
376+ return False
377
378
379-def bind_reload_with_retries(attempts=10, interval=2):
380+def bind_reload_with_retries(attempts=10, interval=2, timeout=2):
381 """Ask BIND to reload its configuration and all zone files.
382
383 :param attempts: The number of attempts.
384 :param interval: The time in seconds to sleep between each attempt.
385 """
386 for countdown in range(attempts - 1, -1, -1):
387- if bind_reload():
388+ if bind_reload(timeout=timeout):
389 break
390 if countdown == 0:
391 break
392diff --git a/src/provisioningserver/dns/config.py b/src/provisioningserver/dns/config.py
393index b03b0fb..eb780d0 100644
394--- a/src/provisioningserver/dns/config.py
395+++ b/src/provisioningserver/dns/config.py
396@@ -172,12 +172,12 @@ def set_up_rndc():
397 f.write(named_content)
398
399
400-def execute_rndc_command(arguments):
401+def execute_rndc_command(arguments, timeout=None):
402 """Execute a rndc command."""
403 rndc_conf = get_rndc_conf_path()
404 rndc_cmd = ['rndc', '-c', rndc_conf]
405 rndc_cmd.extend(arguments)
406- call_and_check(rndc_cmd)
407+ call_and_check(rndc_cmd, timeout=timeout)
408
409
410 def set_up_options_conf(overwrite=True, **kwargs):
411diff --git a/src/provisioningserver/dns/tests/test_actions.py b/src/provisioningserver/dns/tests/test_actions.py
412index 472bb56..58d216e 100644
413--- a/src/provisioningserver/dns/tests/test_actions.py
414+++ b/src/provisioningserver/dns/tests/test_actions.py
415@@ -78,7 +78,7 @@ class TestReload(MAASTestCase):
416 actions.bind_reload()
417 self.assertThat(
418 actions.execute_rndc_command,
419- MockCalledOnceWith(("reload",)))
420+ MockCalledOnceWith(("reload",), timeout=2))
421
422 def test__logs_subprocess_error(self):
423 erc = self.patch_autospec(actions, "execute_rndc_command")
424@@ -105,7 +105,7 @@ class TestReloadWithRetries(MAASTestCase):
425 bind_reload.return_value = False
426 attempts = randint(3, 13)
427 actions.bind_reload_with_retries(attempts=attempts)
428- expected_calls = [call()] * attempts
429+ expected_calls = [call(timeout=2)] * attempts
430 self.assertThat(
431 actions.bind_reload,
432 MockCallsMatch(*expected_calls))
433@@ -114,11 +114,11 @@ class TestReloadWithRetries(MAASTestCase):
434 self.patch_autospec(actions, "sleep") # Disable.
435 bind_reload = self.patch(actions, "bind_reload")
436 bind_reload_return_values = [False, False, True, ]
437- bind_reload.side_effect = lambda: (
438+ bind_reload.side_effect = lambda *args, **kwargs: (
439 bind_reload_return_values.pop(0))
440
441 actions.bind_reload_with_retries(attempts=5)
442- expected_calls = [call(), call(), call()]
443+ expected_calls = [call(timeout=2), call(timeout=2), call(timeout=2)]
444 self.assertThat(
445 actions.bind_reload,
446 MockCallsMatch(*expected_calls))
447diff --git a/src/provisioningserver/dns/tests/test_config.py b/src/provisioningserver/dns/tests/test_config.py
448index a5ecf1b..adfa0e5 100644
449--- a/src/provisioningserver/dns/tests/test_config.py
450+++ b/src/provisioningserver/dns/tests/test_config.py
451@@ -9,7 +9,10 @@ import errno
452 import os.path
453 import random
454 from textwrap import dedent
455-from unittest.mock import Mock
456+from unittest.mock import (
457+ Mock,
458+ sentinel,
459+)
460
461 from fixtures import EnvironmentVariable
462 from maastesting.factory import factory
463@@ -273,10 +276,11 @@ class TestRNDCUtilities(MAASTestCase):
464 fake_dir = patch_dns_config_path(self)
465 self.patch(config, 'call_and_check', recorder)
466 command = factory.make_string()
467- execute_rndc_command([command])
468+ execute_rndc_command([command], timeout=sentinel.timeout)
469 rndc_conf_path = os.path.join(fake_dir, MAAS_RNDC_CONF_NAME)
470 expected_command = ['rndc', '-c', rndc_conf_path, command]
471 self.assertEqual((expected_command,), recorder.calls[0][0])
472+ self.assertEqual({'timeout': sentinel.timeout}, recorder.calls[0][1])
473
474 def test_extract_suggested_named_conf_extracts_section(self):
475 named_part = factory.make_string()
476diff --git a/src/provisioningserver/utils/service_monitor.py b/src/provisioningserver/utils/service_monitor.py
477index 43c2f83..ac4f3c6 100644
478--- a/src/provisioningserver/utils/service_monitor.py
479+++ b/src/provisioningserver/utils/service_monitor.py
480@@ -35,8 +35,12 @@ from provisioningserver.utils import (
481 typed,
482 )
483 from provisioningserver.utils.shell import get_env_with_bytes_locale
484-from provisioningserver.utils.twisted import asynchronous
485+from provisioningserver.utils.twisted import (
486+ asynchronous,
487+ deferWithTimeout,
488+)
489 from twisted.internet.defer import (
490+ CancelledError,
491 DeferredList,
492 DeferredLock,
493 inlineCallbacks,
494@@ -371,6 +375,67 @@ class ServiceMonitor:
495 yield self._performServiceAction(service, "reload")
496
497 @asynchronous
498+ @inlineCallbacks
499+ def killService(self, name):
500+ """Kill service.
501+
502+ Service will be killed then its state will be ensured to be in its
503+ expected state. This is a way of getting a broken related service to be
504+ responsive.
505+ """
506+ service = self.getServiceByName(name)
507+ try:
508+ yield self._performServiceAction(service, "kill")
509+ except ServiceActionError:
510+ # Kill action is allowed to fail, as its possible the service is
511+ # already dead or not responding to correct signals to check
512+ # the current status.
513+ pass
514+ state = yield self.ensureService(name)
515+ return state
516+
517+ @inlineCallbacks
518+ def _execCmd(self, cmd, env, timeout=8, retries=3):
519+ """Execute the `cmd` with the `env`."""
520+
521+ def decode(result):
522+ out, err, code = result
523+ return code, out.decode("utf-8"), err.decode("utf-8")
524+
525+ def log_code(result, cmd, try_num):
526+ _, _, code = result
527+ log.debug(
528+ "[try:{try_num}] Service monitor got exit "
529+ "code '{code}' from cmd: {cmd()}",
530+ try_num=try_num, code=code, cmd=lambda: ' '.join(cmd))
531+ return result
532+
533+ def call_proc(cmd, env, try_num, timeout):
534+ log.debug(
535+ "[try:{try_num}] Service monitor executing cmd: {cmd()}",
536+ try_num=try_num, cmd=lambda: ' '.join(cmd))
537+
538+ d = deferWithTimeout(
539+ timeout, getProcessOutputAndValue, cmd[0], cmd[1:], env=env)
540+ d.addCallback(log_code, cmd, try_num)
541+ return d.addCallback(decode)
542+
543+ for try_num in range(retries):
544+ try:
545+ result = yield call_proc(cmd, env, try_num + 1, timeout)
546+ except CancelledError:
547+ if try_num == retries - 1:
548+ # Failed on final retry.
549+ raise ServiceActionError(
550+ "Service monitor timed out after '%d' "
551+ "seconds and '%s' retries running cmd: %s" % (
552+ timeout, retries, ' '.join(cmd)))
553+ else:
554+ # Try again.
555+ continue
556+ return result
557+
558+ @asynchronous
559 def _execSystemDServiceAction(self, service_name, action, extra_opts=None):
560 """Perform the action with the systemctl command.
561
562@@ -390,21 +455,47 @@ class ServiceMonitor:
563 return d.addCallback(decode)
564
565 @asynchronous
566- def _execSupervisorServiceAction(self, service_name, action):
567+ def _execSupervisorServiceAction(
568+ self, service_name, action, extra_opts=None):
569 """Perform the action with the run-supervisorctl command.
570
571 :return: tuple (exit code, std-output, std-error)
572 """
573 env = get_env_with_bytes_locale()
574 cmd = os.path.join(snappy.get_snap_path(), "bin", "run-supervisorctl")
575- cmd = (cmd, action, service_name)
576
577- def decode(result):
578- out, err, code = result
579- return code, out.decode("utf-8"), err.decode("utf-8")
580+ # supervisord doesn't support native kill like systemd. Emulate this
581+ # behaviour by getting the PID of the process and then killing the PID.
582+ if action == 'kill':
583+
584+ def _kill_pid(result):
585+ exit_code, stdout, _ = result
586+ if exit_code != 0:
587+ return result
588+ try:
589+ pid = int(stdout.strip())
590+ except ValueError:
591+ pid = 0
592+ if pid == 0:
593+ # supervisorctl returns 0 when the process is already dead
594+ # or we where not able to get the actual pid. Nothing to
595+ # do, as its already dead.
596+ return 0, "", ""
597+ cmd = ('kill',)
598+ if extra_opts:
599+ cmd += extra_opts
600+ cmd += ('%s' % pid,)
601+ return self._execCmd(cmd, env)
602+
603+ d = self._execCmd((cmd, 'pid', service_name), env)
604+ d.addCallback(_kill_pid)
605+ return d
606
607- d = getProcessOutputAndValue(cmd[0], cmd[1:], env=env)
608- return d.addCallback(decode)
609+ cmd = (cmd, action)
610+ if extra_opts is not None:
611+ cmd += extra_opts
612+ cmd += (service_name,)
613+ return self._execCmd(cmd, env)
614
615 @inlineCallbacks
616 def _performServiceAction(self, service, action):
617@@ -416,8 +507,9 @@ class ServiceMonitor:
618 else:
619 exec_action = self._execSystemDServiceAction
620 service_name = service.service_name
621+ extra_opts = getattr(service, '%s_extra_opts' % action, None)
622 exit_code, output, error = yield lock.run(
623- exec_action, service_name, action)
624+ exec_action, service_name, action, extra_opts=extra_opts)
625 if exit_code != 0:
626 error_msg = (
627 "Service '%s' failed to %s: %s" % (
628diff --git a/src/provisioningserver/utils/shell.py b/src/provisioningserver/utils/shell.py
629index 0ec7c39..488afb6 100644
630--- a/src/provisioningserver/utils/shell.py
631+++ b/src/provisioningserver/utils/shell.py
632@@ -105,8 +105,9 @@ def call_and_check(command, *args, **kwargs):
633 :return: The command's output from standard output.
634 :raise ExternalProcessError: If the command returns nonzero.
635 """
636+ timeout = kwargs.pop('timeout', None)
637 process = Popen(command, *args, stdout=PIPE, stderr=PIPE, **kwargs)
638- stdout, stderr = process.communicate()
639+ stdout, stderr = process.communicate(timeout=timeout)
640 stderr = stderr.strip()
641 if process.returncode != 0:
642 raise ExternalProcessError(process.returncode, command, output=stderr)
643diff --git a/src/provisioningserver/utils/tests/test_service_monitor.py b/src/provisioningserver/utils/tests/test_service_monitor.py
644index 51d39c3..8cb507c 100644
645--- a/src/provisioningserver/utils/tests/test_service_monitor.py
646+++ b/src/provisioningserver/utils/tests/test_service_monitor.py
647@@ -10,6 +10,7 @@ import os
648 import random
649 from textwrap import dedent
650 from unittest.mock import (
651+ call,
652 Mock,
653 sentinel,
654 )
655@@ -18,10 +19,12 @@ from fixtures import FakeLogger
656 from maastesting.factory import factory
657 from maastesting.matchers import (
658 MockCalledOnceWith,
659+ MockCallsMatch,
660 MockNotCalled,
661 )
662 from maastesting.runtest import MAASTwistedRunTest
663 from maastesting.testcase import MAASTestCase
664+from maastesting.twisted import always_fail_with
665 from provisioningserver.utils import (
666 service_monitor as service_monitor_module,
667 snappy,
668@@ -37,6 +40,7 @@ from provisioningserver.utils.service_monitor import (
669 ServiceUnknownError,
670 )
671 from provisioningserver.utils.shell import get_env_with_bytes_locale
672+from provisioningserver.utils.twisted import pause
673 from testscenarios import multiply_scenarios
674 from testtools import ExpectedException
675 from testtools.matchers import (
676@@ -45,7 +49,9 @@ from testtools.matchers import (
677 )
678 from twisted.internet import reactor
679 from twisted.internet.defer import (
680+ CancelledError,
681 DeferredLock,
682+ fail,
683 inlineCallbacks,
684 succeed,
685 )
686@@ -448,6 +454,62 @@ class TestServiceMonitor(MAASTestCase):
687 yield service_monitor.reloadService(fake_service.name, if_on=True)
688
689 @inlineCallbacks
690+ def test__killService_performs_kill_then_ensureService(self):
691+ fake_service = make_fake_service(SERVICE_STATE.ON)
692+ service_monitor = self.make_service_monitor([fake_service])
693+ mock_performServiceAction = self.patch(
694+ service_monitor, "_performServiceAction")
695+ mock_performServiceAction.return_value = succeed(None)
696+ mock_ensureService = self.patch(service_monitor, "ensureService")
697+ mock_ensureService.return_value = succeed(
698+ ServiceState(SERVICE_STATE.ON, "running"))
699+ yield service_monitor.killService(fake_service.name)
700+ self.assertThat(
701+ mock_performServiceAction,
702+ MockCalledOnceWith(fake_service, "kill"))
703+ self.assertThat(
704+ mock_ensureService,
705+ MockCalledOnceWith(fake_service.name))
706+
707+ @inlineCallbacks
708+ def test__killService_doesnt_fail_on_ServiceActionError(self):
709+ fake_service = make_fake_service(SERVICE_STATE.ON)
710+ service_monitor = self.make_service_monitor([fake_service])
711+ mock_performServiceAction = self.patch(
712+ service_monitor, "_performServiceAction")
713+ mock_performServiceAction.return_value = fail(ServiceActionError())
714+ mock_ensureService = self.patch(service_monitor, "ensureService")
715+ mock_ensureService.return_value = succeed(
716+ ServiceState(SERVICE_STATE.ON, "running"))
717+ yield service_monitor.killService(fake_service.name)
718+ self.assertThat(
719+ mock_performServiceAction,
720+ MockCalledOnceWith(fake_service, "kill"))
721+ self.assertThat(
722+ mock_ensureService,
723+ MockCalledOnceWith(fake_service.name))
724+
725+ @inlineCallbacks
726+ def test___execCmd_times_out(self):
727+ monitor = ServiceMonitor(make_fake_service())
728+ with ExpectedException(ServiceActionError):
729+ yield monitor._execCmd(
730+ ["sleep", "0.3"], {}, timeout=0.1, retries=1)
731+ # Pause long enough for the reactor to cleanup the process.
732+ yield pause(0.5)
733+
734+ @inlineCallbacks
735+ def test___execCmd_retries(self):
736+ monitor = ServiceMonitor(make_fake_service())
737+ mock_deferWithTimeout = self.patch(
738+ service_monitor_module, "deferWithTimeout")
739+ mock_deferWithTimeout.side_effect = always_fail_with(CancelledError())
740+ with ExpectedException(ServiceActionError):
741+ yield monitor._execCmd(
742+ ["echo", "Hello"], {}, retries=3)
743+ self.assertEqual(3, mock_deferWithTimeout.call_count)
744+
745+ @inlineCallbacks
746 def test___execSystemDServiceAction_calls_systemctl(self):
747 service_monitor = self.make_service_monitor()
748 service_name = factory.make_name("service")
749@@ -511,16 +573,44 @@ class TestServiceMonitor(MAASTestCase):
750 mock_getProcessOutputAndValue = self.patch(
751 service_monitor_module, "getProcessOutputAndValue")
752 mock_getProcessOutputAndValue.return_value = succeed((b"", b"", 0))
753+ extra_opts = ('--extra', factory.make_name('extra'))
754 yield service_monitor._execSupervisorServiceAction(
755- service_name, action)
756+ service_name, action, extra_opts=extra_opts)
757 cmd = os.path.join(snap_path, 'bin', 'run-supervisorctl')
758- cmd = cmd, action, service_name
759+ cmd = (cmd, action) + extra_opts + (service_name,)
760 self.assertThat(
761 mock_getProcessOutputAndValue, MockCalledOnceWith(
762 # The environment contains LC_ALL and LANG too.
763 cmd[0], cmd[1:], env=get_env_with_bytes_locale()))
764
765 @inlineCallbacks
766+ def test___execSupervisorServiceAction_emulates_kill(self):
767+ snap_path = factory.make_name("path")
768+ self.patch(snappy, "get_snap_path").return_value = snap_path
769+ service_monitor = self.make_service_monitor()
770+ service_name = factory.make_name("service")
771+ fake_pid = random.randint(1, 100)
772+ mock_getProcessOutputAndValue = self.patch(
773+ service_monitor_module, "getProcessOutputAndValue")
774+ mock_getProcessOutputAndValue.side_effect = [
775+ succeed((("%s" % fake_pid).encode('utf-8'), b"", 0)),
776+ succeed((b"", b"", 0)),
777+ ]
778+ extra_opts = ('-s', factory.make_name('SIGKILL'))
779+ yield service_monitor._execSupervisorServiceAction(
780+ service_name, 'kill', extra_opts=extra_opts)
781+ cmd = os.path.join(snap_path, 'bin', 'run-supervisorctl')
782+ self.assertThat(
783+ mock_getProcessOutputAndValue, MockCallsMatch(
784+ call(
785+ cmd, ("pid", service_name),
786+ env=get_env_with_bytes_locale()),
787+ call(
788+ "kill", extra_opts + ("%s" % fake_pid,),
789+ env=get_env_with_bytes_locale()),
790+ ))
791+
792+ @inlineCallbacks
793 def test___execSupervisorServiceAction_decodes_stdout_and_stderr(self):
794 # From https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt.
795 example_text = (
796@@ -555,13 +645,16 @@ class TestServiceMonitor(MAASTestCase):
797 service_monitor, "_execSystemDServiceAction")
798 mock_execSystemDServiceAction.return_value = (0, "", "")
799 action = factory.make_name("action")
800+ extra_opts = ('--option', factory.make_name("option"))
801+ setattr(service, '%s_extra_opts' % action, extra_opts)
802 yield service_monitor._performServiceAction(service, action)
803 self.assertThat(service_lock.run, MockCalledOnceWith(
804 service_monitor._execSystemDServiceAction,
805- service.service_name, action))
806+ service.service_name, action, extra_opts=extra_opts))
807 self.assertThat(
808 mock_execSystemDServiceAction,
809- MockCalledOnceWith(service.service_name, action))
810+ MockCalledOnceWith(
811+ service.service_name, action, extra_opts=extra_opts))
812
813 @inlineCallbacks
814 def test___performServiceAction_holds_lock_perform_supervisor_action(self):
815@@ -575,13 +668,16 @@ class TestServiceMonitor(MAASTestCase):
816 service_monitor, "_execSupervisorServiceAction")
817 mock_execSupervisorServiceAction.return_value = (0, "", "")
818 action = factory.make_name("action")
819+ extra_opts = ('--option', factory.make_name("option"))
820+ setattr(service, '%s_extra_opts' % action, extra_opts)
821 yield service_monitor._performServiceAction(service, action)
822 self.assertThat(service_lock.run, MockCalledOnceWith(
823 service_monitor._execSupervisorServiceAction,
824- service.service_name, action))
825+ service.service_name, action, extra_opts=extra_opts))
826 self.assertThat(
827 mock_execSupervisorServiceAction,
828- MockCalledOnceWith(service.service_name, action))
829+ MockCalledOnceWith(
830+ service.service_name, action, extra_opts=extra_opts))
831
832 @inlineCallbacks
833 def test___performServiceAction_raises_ServiceActionError_if_fails(self):
834diff --git a/src/provisioningserver/utils/tests/test_shell.py b/src/provisioningserver/utils/tests/test_shell.py
835index 4492b3a..03f67fd 100644
836--- a/src/provisioningserver/utils/tests/test_shell.py
837+++ b/src/provisioningserver/utils/tests/test_shell.py
838@@ -6,6 +6,7 @@
839 __all__ = []
840
841 import os
842+import random
843 from subprocess import CalledProcessError
844
845 from fixtures import EnvironmentVariable
846@@ -54,6 +55,14 @@ class TestCallAndCheck(MAASTestCase):
847 self.assertEqual(command, error.cmd)
848 self.assertEqual(message, error.output)
849
850+ def test__passes_timeout_to_communicate(self):
851+ command = factory.make_name('command')
852+ process = self.patch_popen()
853+ timeout = random.randint(1, 10)
854+ call_and_check(command, timeout=timeout)
855+ self.assertThat(
856+ process.communicate, MockCalledOnceWith(timeout=timeout))
857+
858 def test__reports_stderr_on_failure(self):
859 nonfile = os.path.join(self.make_dir(), factory.make_name('nonesuch'))
860 error = self.assertRaises(
861diff --git a/versions.cfg b/versions.cfg
862index 2d8eb7e..f214d65 100644
863--- a/versions.cfg
864+++ b/versions.cfg
865@@ -25,9 +25,11 @@ linecache2 = 1.0.0
866 nose = 1.3.7
867 nose-timer = 0.6.0
868 pbr = 1.10.0
869+pexpect = 4.6.0
870 pickleshare = 0.7.4
871 postgresfixture = 0.4.1
872 prompt-toolkit = 1.0.9
873+ptyprocess = 0.6.0
874 python-mimeparse = 1.6.0
875 python-subunit = 1.2.0
876 selenium = 2.45

Subscribers

People subscribed via source and target branches