Merge ~asbalderson/juju-wait:machine-wait into juju-wait:master

Proposed by Alexander Balderson
Status: Merged
Merged at revision: dc9a4e78881e900cacc047621a4f633c201c3057
Proposed branch: ~asbalderson/juju-wait:machine-wait
Merge into: juju-wait:master
Diff against target: 154 lines (+86/-3)
1 file modified
juju_wait/__init__.py (+86/-3)
Reviewer Review Type Date Requested Status
Stuart Bishop (community) Needs Fixing
Jason Hobbs (community) Approve
Review via email: mp+377464@code.launchpad.net

Commit message

Start checking if machines are down during deploys

There are cases when machines never come up or enter a bad state
during juju deploys. This checks if those machines are in bad
states, and exits.

This should bubble up errors and failures much faster.

To post a comment you must log in.
Revision history for this message
Jason Hobbs (jason-hobbs) :
review: Needs Fixing
Revision history for this message
Jason Hobbs (jason-hobbs) wrote :

lgtm, thanks!

review: Approve
Revision history for this message
Stuart Bishop (stub) wrote :

The argument order in the wait function needs to be adjusted, so it is backwards compatible with callsites using positional parameters (of which there are probably none, as the only user of this as a library I'm aware of is using keyword parameters, but might as well avoid the possibility).

It would be better if the code used datetime instances for timestamps rather than floats, to match what is already being done elsewhere in the code. I'm not going to require this to be fixed though.

review: Needs Fixing
Revision history for this message
Alexander Balderson (asbalderson) wrote :

> The argument order in the wait function needs to be adjusted, so it is
> backwards compatible with callsites using positional parameters (of which
> there are probably none, as the only user of this as a library I'm aware of is
> using keyword parameters, but might as well avoid the possibility).

Good call on this; I had overlooked that option.

> It would be better if the code used datetime instances for timestamps rather
> than floats, to match what is already being done elsewhere in the code. I'm
> not going to require this to be fixed though.

This is good feedback, The adjustments have been made to help keep things consistent.

Revision history for this message
Stuart Bishop (stub) wrote :

Looks good, except for what I think is some debugging code that leaked in on line 90 of the diff.

Revision history for this message
Alexander Balderson (asbalderson) wrote :

> Looks good, except for what I think is some debugging code that leaked in on
> line 90 of the diff.

Whoops; good catch!

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1diff --git a/juju_wait/__init__.py b/juju_wait/__init__.py
2index ffd0417..02449ee 100755
3--- a/juju_wait/__init__.py
4+++ b/juju_wait/__init__.py
5@@ -47,6 +47,17 @@ class EnvironmentAction(argparse.Action):
6 os.environ['JUJU_MODEL'] = values[0]
7
8
9+class MachineState(object):
10+ def __init__(self, state, state_time):
11+ self.state = state
12+ self.time = state_time
13+
14+ def __eq__(self, other):
15+ if self.state == other.state and self.time == other.time:
16+ return True
17+ return False
18+
19+
20 def parse_ts(ts):
21 '''Parse the Juju provided timestamp, which must be UTC.'''
22 return datetime.strptime(ts, '%d %b %Y %H:%M:%SZ')
23@@ -192,6 +203,11 @@ WORKLOAD_OK_STATES = ['active', 'unknown']
24 # values, consider that fatal and raise.
25 WORKLOAD_ERROR_STATES = ['error']
26
27+# If a machine is in one of these states it may be broken.
28+MACHINE_ERROR_STATES = ['down', 'error', 'stopped']
29+
30+MACHINE_PENDING_STATES = ['pending']
31+
32 StatusFields = namedtuple('StatusFields', ['timestamp',
33 'status_type',
34 'state',
35@@ -229,6 +245,14 @@ def wait_cmd(args=sys.argv[1:]):
36 action='append', default=list(), type=str,
37 help='Exclude the application from "active" status'
38 ' checks. Can be used multiple times.')
39+ parser.add_argument('--machine-exclude', dest='machine_exclude', action='append',
40+ default=list(), type=str,
41+ help='exclude these machines from checking for downed states.'
42+ ' Can be used multiple times.')
43+ parser.add_argument('--machine-error-timeout', dest='machine_error_timeout', type=int, action='store', default=300,
44+ help='Time for a machine to be in a bad state before exiting (seconds).')
45+ parser.add_argument('--machine-pending-timeout', dest='machine_pending_timeout', type=int, action='store', default=3600,
46+ help='Time for a machine to be in a pending state before exiting (seconds).')
47 parser.add_argument('--version', default=False, action='store_true')
48 args = parser.parse_args(args)
49
50@@ -269,7 +293,10 @@ def wait_cmd(args=sys.argv[1:]):
51 args.wait_for_workload,
52 max_wait,
53 args.retry_errors,
54- args.exclude
55+ args.exclude,
56+ args.machine_error_timeout,
57+ args.machine_pending_timeout,
58+ args.machine_exclude
59 )
60 return 0
61 except JujuWaitException as x:
62@@ -309,7 +336,8 @@ def get_error_repeat_count(uname):
63 return len(status_lines)
64
65
66-def wait(log=None, wait_for_workload=False, max_wait=None, retry_errors=0, exclude=None):
67+def wait(log=None, wait_for_workload=False, max_wait=None, retry_errors=0, exclude=None,
68+ machine_error_timeout=300, machine_pending_timeout=3600, machine_exclude=None):
69 # Note that wait_for_workload is only useful as a workaround for
70 # broken setups. It is impossible for a charm to report that it has
71 # completed setup, because a charm has no information about units
72@@ -325,6 +353,10 @@ def wait(log=None, wait_for_workload=False, max_wait=None, retry_errors=0, exclu
73 exclude = []
74 exclude = set(exclude)
75
76+ if machine_exclude is None:
77+ machine_exclude = []
78+ machine_exclude = set(machine_exclude)
79+
80 # pre-juju 1.24, we can only detect idleless by looking for changes
81 # in the logs.
82 prev_logs = {}
83@@ -333,12 +365,15 @@ def wait(log=None, wait_for_workload=False, max_wait=None, retry_errors=0, exclu
84 ready_since = None
85 logging_reset = False
86
87+ machine_status = {}
88+
89 while True:
90 status = get_status()
91 ready = True
92
93 # If defined, fail if max_wait is exceeded
94- epoch_elapsed = time.time() - epoch_started
95+ loop_starttime = time.time()
96+ epoch_elapsed = loop_starttime - epoch_started
97 if max_wait and epoch_elapsed > max_wait:
98 ready = False
99 logging.error('Not ready in {}s (max_wait)'.format(max_wait))
100@@ -501,6 +536,54 @@ def wait(log=None, wait_for_workload=False, max_wait=None, retry_errors=0, exclu
101 ''.format(uname, logs[uname].strip()))
102 ready = False
103
104+ for machine in status.get('machines', {}).keys():
105+ machine_data = status['machines'][machine]
106+ machine_state = MachineState(
107+ machine_data['juju-status']['current'],
108+ parse_ts(
109+ machine_data['juju-status']['since']
110+ )
111+ )
112+
113+ if not machine_status.get(machine):
114+ machine_status[machine] = machine_state
115+
116+ if machine_status[machine] != machine_state:
117+ machine_status[machine] = machine_state
118+
119+ for container in machine_data.get('containers', {}).keys():
120+ container_data = machine_data['containers'][container]
121+ container_state = MachineState(
122+ container_data['juju-status']['current'],
123+ parse_ts(
124+ container_data['juju-status']['since']
125+ )
126+ )
127+
128+ if not machine_status.get(container):
129+ machine_status[container] = container_state
130+
131+ if machine_status[container] != container_state:
132+ machine_status[container] = container_state
133+
134+ for machine, state in machine_status.items():
135+ if machine in machine_exclude:
136+ logging.debug('Ignoring machine {} because it is excluded'.format(machine))
137+ continue
138+ if state.state in MACHINE_ERROR_STATES:
139+ ready = False
140+ logging.debug('Machine {} is in {}'.format(machine, state.state))
141+ if loop_starttime - state.time.timestamp() > machine_error_timeout:
142+ minutes = round((loop_starttime - state.time.timestamp())/60)
143+ logging.error('Machine {} is in {} for {} minutes.'.format(machine, state.state, minutes))
144+ raise JujuWaitException(1)
145+ if state.state in MACHINE_PENDING_STATES:
146+ ready = False
147+ if loop_starttime - state.time.timestamp() > machine_pending_timeout:
148+ minutes = round((loop_starttime - state.time.timestamp())/60)
149+ logging.error('Machine {} has been pending for {} minutes'.format(machine, minutes))
150+ raise JujuWaitException(1)
151+
152 if ready:
153 # We are never ready until this check has been running until
154 # IDLE_CONFIRMATION time has passed. This ensures that if we

Subscribers

People subscribed via source and target branches

to all changes: