Merge ~mpontillo/maas:remote-script-runner-reliability into maas:master

Proposed by Mike Pontillo
Status: Merged
Merge reported by: Lee Trager
Merged at revision: 7cea436db728c3424612069709d49c7ef9e9ce3e
Proposed branch: ~mpontillo/maas:remote-script-runner-reliability
Merge into: maas:master
Diff against target: 81 lines (+8/-6)
3 files modified
src/metadataserver/user_data/templates/snippets/maas_run_remote_scripts.py (+2/-1)
src/provisioningserver/refresh/__init__.py (+2/-1)
src/provisioningserver/refresh/maas_api_helper.py (+4/-4)
Reviewer Review Type Date Requested Status
MAAS Maintainers Pending
Review via email: mp+331030@code.launchpad.net

Commit message

Make remote script runner more reliable

 * Use a monotonic clock instead of relative time.
 * Ensure stdin is redirected from /dev/null.

To post a comment you must log in.
Revision history for this message
Lee Trager (ltrager) wrote :

I didn't think about NTP resetting the clock when implementing this. Using time.monotic() should fix any time outs due to NTP changing the system time. I don't see any issue with closing STDIN either.

Revision history for this message
Mike Pontillo (mpontillo) wrote :

After looking at `lsof` output, I don't think it's necessary to redirect stdin to DEVNULL; this already seems to happen by default due to the way cloud-init is running and/or how it runs the scripts. But I think we should do it anyway for good measure.

The other change I think we should make to the runner related to bug #1718517 - we should surface any HTTP errors that occur on submit, and we should retry in the case of HTTP 408 or 5xx errors (such as service unavailable, like when Apache is up and running but MAAS is restarting) or timeouts.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1diff --git a/src/metadataserver/user_data/templates/snippets/maas_run_remote_scripts.py b/src/metadataserver/user_data/templates/snippets/maas_run_remote_scripts.py
2index 9dc5c24..5ff7ced 100644
3--- a/src/metadataserver/user_data/templates/snippets/maas_run_remote_scripts.py
4+++ b/src/metadataserver/user_data/templates/snippets/maas_run_remote_scripts.py
5@@ -29,6 +29,7 @@ import re
6 import shlex
7 from subprocess import (
8 check_output,
9+ DEVNULL,
10 PIPE,
11 Popen,
12 TimeoutExpired,
13@@ -91,7 +92,7 @@ def download_and_extract_tar(url, creds, scripts_dir):
14 def run_and_check(
15 cmd, combined_path, stdout_path, stderr_path, script_name, args,
16 ignore_error=False):
17- proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
18+ proc = Popen(cmd, stdin=DEVNULL, stdout=PIPE, stderr=PIPE)
19 capture_script_output(proc, combined_path, stdout_path, stderr_path)
20 if proc.returncode != 0 and not ignore_error:
21 args['exit_status'] = proc.returncode
22diff --git a/src/provisioningserver/refresh/__init__.py b/src/provisioningserver/refresh/__init__.py
23index bf05f10..cd52816 100644
24--- a/src/provisioningserver/refresh/__init__.py
25+++ b/src/provisioningserver/refresh/__init__.py
26@@ -7,6 +7,7 @@ import os
27 import socket
28 import stat
29 from subprocess import (
30+ DEVNULL,
31 PIPE,
32 Popen,
33 TimeoutExpired,
34@@ -164,7 +165,7 @@ def runscripts(scripts, url, creds, tmpdir):
35 timeout_seconds = timeout.seconds
36
37 try:
38- proc = Popen(script_path, stdout=PIPE, stderr=PIPE)
39+ proc = Popen(script_path, stdin=DEVNULL, stdout=PIPE, stderr=PIPE)
40 capture_script_output(
41 proc, combined_path, stdout_path, stderr_path, timeout_seconds)
42 except OSError as e:
43diff --git a/src/provisioningserver/refresh/maas_api_helper.py b/src/provisioningserver/refresh/maas_api_helper.py
44index d4dd138..eac7d49 100644
45--- a/src/provisioningserver/refresh/maas_api_helper.py
46+++ b/src/provisioningserver/refresh/maas_api_helper.py
47@@ -295,7 +295,7 @@ def capture_script_output(
48 timeout = None
49 else:
50 # Pad the timeout by 60 seconds to allow for cleanup.
51- timeout = datetime.now() + timedelta(seconds=(timeout_seconds + 60))
52+ timeout = time.monotonic() + timeout_seconds + 60
53
54 # Create the file and then open it in read write mode for terminal
55 # emulation.
56@@ -309,14 +309,14 @@ def capture_script_output(
57 while selector.get_map() and proc.poll() is None:
58 # Select with a short timeout so that we don't tight loop.
59 _select_script_output(selector, combined, 0.1)
60- if timeout is not None and datetime.now() > timeout:
61+ if timeout is not None and time.monotonic() > timeout:
62 break
63 else:
64 # Process has finished or has closed stdout and stderr.
65 # Process anything still sitting in the latter's buffers.
66 _select_script_output(selector, combined, 0.0)
67
68- now = datetime.now()
69+ now = time.monotonic()
70 # Wait for the process to finish.
71 if timeout is None:
72 # No timeout just wait until the process finishes.
73@@ -329,7 +329,7 @@ def capture_script_output(
74 # stdout and stderr have been closed but the timeout has not been
75 # exceeded. Run with the remaining amount of time.
76 try:
77- return proc.wait(timeout=(timeout - now).seconds)
78+ return proc.wait(timeout=(timeout - now))
79 except TimeoutExpired:
80 # Make sure the process was killed
81 proc.kill()

Subscribers

People subscribed via source and target branches