Merge ~blake-rouse/maas:cmd-timeout-service-monitor into maas:master

Proposed by Blake Rouse
Status: Merged
Approved by: Blake Rouse
Approved revision: 89cf2222d968826355d4580c576b1df4a11486cb
Merge reported by: MAAS Lander
Merged at revision: not available
Proposed branch: ~blake-rouse/maas:cmd-timeout-service-monitor
Merge into: maas:master
Diff against target: 111 lines (+45/-15)
2 files modified
src/provisioningserver/utils/service_monitor.py (+36/-15)
src/provisioningserver/utils/tests/test_service_monitor.py (+9/-0)
Reviewer Review Type Date Requested Status
Mike Pontillo (community) Approve
Review via email: mp+355195@code.launchpad.net

Commit message

Add a 5 second timeout to execution of system monitor.

Also adds more debug logging inside the system monitor.

To post a comment you must log in.
Revision history for this message
Mike Pontillo (mpontillo) wrote :

Looks good! I tested this branch and it seems to prevent the DHCP errors I've been seeing this cycle.

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1diff --git a/src/provisioningserver/utils/service_monitor.py b/src/provisioningserver/utils/service_monitor.py
2index b43d49e..3f5e76f 100644
3--- a/src/provisioningserver/utils/service_monitor.py
4+++ b/src/provisioningserver/utils/service_monitor.py
5@@ -35,8 +35,12 @@ from provisioningserver.utils import (
6 typed,
7 )
8 from provisioningserver.utils.shell import get_env_with_bytes_locale
9-from provisioningserver.utils.twisted import asynchronous
10+from provisioningserver.utils.twisted import (
11+ asynchronous,
12+ deferWithTimeout,
13+)
14 from twisted.internet.defer import (
15+ CancelledError,
16 DeferredList,
17 DeferredLock,
18 inlineCallbacks,
19@@ -406,6 +410,35 @@ class ServiceMonitor:
20 raise ServiceActionError(error_msg)
21 yield self._performServiceAction(service, "reload")
22
23+ def _execCmd(self, cmd, env, timeout=5):
24+
25+ def decode(result):
26+ out, err, code = result
27+ return code, out.decode("utf-8"), err.decode("utf-8")
28+
29+ def log_code(result, cmd):
30+ _, _, code = result
31+ log.debug(
32+ "Service monitor got exit code '{code}' from cmd: {cmd()}",
33+ code=code, cmd=lambda: ' '.join(cmd))
34+ return result
35+
36+ def handle_timeout(failure, timeout, cmd):
37+ failure.trap(CancelledError)
38+ raise ServiceActionError(
39+ "Service monitor timed out after '%d' "
40+ "seconds running cmd: %s" % (timeout, ' '.join(cmd)))
41+
42+ log.debug(
43+ "Service monitor executing cmd: {cmd()}",
44+ cmd=lambda: ' '.join(cmd))
45+
46+ d = deferWithTimeout(
47+ timeout, getProcessOutputAndValue, cmd[0], cmd[1:], env=env)
48+ d.addErrback(handle_timeout, timeout, cmd)
49+ d.addCallback(log_code, cmd)
50+ return d.addCallback(decode)
51+
52 @asynchronous
53 def _execSystemDServiceAction(self, service_name, action, extra_opts=None):
54 """Perform the action with the systemctl command.
55@@ -417,13 +450,7 @@ class ServiceMonitor:
56 if extra_opts is not None:
57 cmd.extend(extra_opts)
58 cmd.append(service_name)
59-
60- def decode(result):
61- out, err, code = result
62- return code, out.decode("utf-8"), err.decode("utf-8")
63-
64- d = getProcessOutputAndValue(cmd[0], cmd[1:], env=env)
65- return d.addCallback(decode)
66+ return self._execCmd(cmd, env)
67
68 @asynchronous
69 def _execSupervisorServiceAction(self, service_name, action):
70@@ -434,13 +461,7 @@ class ServiceMonitor:
71 env = get_env_with_bytes_locale()
72 cmd = os.path.join(snappy.get_snap_path(), "bin", "run-supervisorctl")
73 cmd = (cmd, action, service_name)
74-
75- def decode(result):
76- out, err, code = result
77- return code, out.decode("utf-8"), err.decode("utf-8")
78-
79- d = getProcessOutputAndValue(cmd[0], cmd[1:], env=env)
80- return d.addCallback(decode)
81+ return self._execCmd(cmd, env)
82
83 @inlineCallbacks
84 def _performServiceAction(self, service, action):
85diff --git a/src/provisioningserver/utils/tests/test_service_monitor.py b/src/provisioningserver/utils/tests/test_service_monitor.py
86index f40619b..969fb00 100644
87--- a/src/provisioningserver/utils/tests/test_service_monitor.py
88+++ b/src/provisioningserver/utils/tests/test_service_monitor.py
89@@ -38,6 +38,7 @@ from provisioningserver.utils.service_monitor import (
90 ToggleableService,
91 )
92 from provisioningserver.utils.shell import get_env_with_bytes_locale
93+from provisioningserver.utils.twisted import pause
94 from testscenarios import multiply_scenarios
95 from testtools import ExpectedException
96 from testtools.matchers import (
97@@ -449,6 +450,14 @@ class TestServiceMonitor(MAASTestCase):
98 yield service_monitor.reloadService(fake_service.name, if_on=True)
99
100 @inlineCallbacks
101+ def test___execCmd_times_out(self):
102+ monitor = ServiceMonitor(make_fake_service())
103+ with ExpectedException(ServiceActionError):
104+ yield monitor._execCmd(["sleep", "0.3"], {}, timeout=0.1)
105+ # Pause long enough for the reactor to cleanup the process.
106+ yield pause(0.5)
107+
108+ @inlineCallbacks
109 def test___execSystemDServiceAction_calls_systemctl(self):
110 service_monitor = self.make_service_monitor()
111 service_name = factory.make_name("service")

Subscribers

People subscribed via source and target branches