Merge lp:~mskalka/juju-ci-tools/better-container-reboot into lp:juju-ci-tools

Proposed by Michael Skalka
Status: Merged
Merged at revision: 1923
Proposed branch: lp:~mskalka/juju-ci-tools/better-container-reboot
Merge into: lp:juju-ci-tools
Diff against target: 133 lines (+25/-38)
1 file modified
assess_network_health.py (+25/-38)
To merge this branch: bzr merge lp:~mskalka/juju-ci-tools/better-container-reboot
Reviewer Review Type Date Requested Status
Curtis Hovey (community) code Approve
Review via email: mp+318959@code.launchpad.net

Description of the change

Tweaks the reboot logic for containers. This bypasses the problem of a container not being directly addressable by juju in certain cases.

To post a comment you must log in.
Revision history for this message
Curtis Hovey (sinzui) wrote :

Thank you.

review: Approve (code)

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'assess_network_health.py'
2--- assess_network_health.py 2017-03-02 16:05:06 +0000
3+++ assess_network_health.py 2017-03-03 20:45:37 +0000
4@@ -35,12 +35,6 @@
5 NO_EXPOSED_UNITS = 'No exposed units'
6
7
8-class ConnectionError(Exception):
9- """Connection failed in some way"""
10- def __init__(self, message):
11- self.message = message
12-
13-
14 class AssessNetworkHealth:
15
16 def __init__(self, args):
17@@ -67,7 +61,7 @@
18 if not reboot:
19 if results_pre:
20 error_string.extend(results_pre)
21- raise ConnectionError('\n'.join(error_string))
22+ raise Exception('\n'.join(error_string))
23 log.info('SUCESS')
24 return
25 log.info('Units completed pre-reboot tests, rebooting machines.')
26@@ -78,7 +72,7 @@
27 error_string.extend(results_pre or 'No pre-reboot failures.')
28 error_string.extend(['Post-reboot test failures:'])
29 error_string.extend(results_post or 'No post-reboot failures.')
30- raise ConnectionError('\n'.join(error_string))
31+ raise Exception('\n'.join(error_string))
32 log.info('SUCESS')
33 return
34
35@@ -236,7 +230,12 @@
36 log.info("Assessing internet connection for "
37 "machine: {}".format(unit[0]))
38 results[unit[0]] = False
39- routes = self.ssh(client, unit[0], 'ip route show')
40+ try:
41+ routes = self.ssh(client, unit[0], 'ip route show')
42+ except subprocess.CalledProcessError as e:
43+ log.error('Could not connect to address for unit: {0}, '
44+ 'unable to find default route.'.format(unit[0]))
45+ continue
46 default_route = re.search(r'^default\s+via\s+([\d\.]+)\s+', routes,
47 re.MULTILINE)
48 if default_route:
49@@ -265,7 +264,9 @@
50 nh_units = []
51 for service in apps.values():
52 for unit in service.get('units', {}).values():
53- nh_units.extend(unit.get('subordinates').keys())
54+ nh_subs = [u for u in unit.get('subordinates').keys()
55+ if 'network-health' in u]
56+ nh_units.extend(nh_subs)
57 for nh_unit in nh_units:
58 service_results = {}
59 for service, units in targets.items():
60@@ -371,23 +372,22 @@
61
62 def reboot_machines(self, client):
63 log.info("Starting reboot of all containers.")
64-
65- def reboot(unit):
66- log.info("Restarting unit: {}".format(unit))
67- client.juju(
68- 'run', ('--machine', unit, 'sudo shutdown -r now'))
69-
70 try:
71 for machine, m_info in client.get_status().iter_machines():
72+ cont_ids = []
73 try:
74- for cont, c_info in m_info['containers'].items():
75- reboot(cont)
76- # TODO: Find another way to ensure LXD container has
77- # properly rebooted
78+ cont_ids.extend([c['instance-id'] for c in
79+ m_info.get('containers').values()])
80 except KeyError:
81- log.info('No containers to reboot for '
82- 'machine: {}'.format(machine))
83- reboot(machine)
84+ log.info('No containers for machine: {}'.format(machine))
85+ if cont_ids:
86+ log.info('Restarting containers: {0} on '
87+ 'machine: {1}'.format(cont_ids, machine))
88+ self.ssh(client, machine,
89+ 'sudo lxc restart {}'.format(' '.join(cont_ids)))
90+ log.info("Restarting machine: {}".format(machine))
91+ client.juju('run', ('--machine', machine,
92+ 'sudo shutdown -r now'))
93 hostname = client.get_status().get_machine_dns_name(machine)
94 wait_for_port(hostname, 22, timeout=240)
95
96@@ -395,17 +395,7 @@
97 logging.info(
98 "Error running shutdown:\nstdout: %s\nstderr: %s",
99 e.output, getattr(e, 'stderr', None))
100- raise
101-
102- def get_uptime(self, client, host):
103- uptime_pattern = re.compile(r'.*(\d+)')
104- uptime_output = self.ssh(client, host, 'uptime -p')
105- log.info('uptime -p: {}'.format(uptime_output))
106- match = uptime_pattern.match(uptime_output)
107- if match:
108- return int(match.group(1))
109- else:
110- return 0
111+ client.wait_for_started()
112
113 def ssh(self, client, machine, cmd):
114 """Convenience function: run a juju ssh command and get back the output
115@@ -418,7 +408,7 @@
116 attempts = 4
117 for attempt in range(attempts):
118 try:
119- return client.get_juju_output('ssh', '--proxy', machine[0],
120+ return client.get_juju_output('ssh', '--proxy', machine,
121 cmd)
122 except subprocess.CalledProcessError as e:
123 # If the connection to the host failed, try again in a couple
124@@ -511,9 +501,6 @@
125 target_model=args.model,
126 series=args.series,
127 reboot=args.reboot)
128- except Exception as e:
129- log.error(e)
130- raise
131 finally:
132 test.cleanup(client)
133 return 0

Subscribers

People subscribed via source and target branches