Merge lp:~tribaal/charms/trusty/rabbitmq-server/backport-lp1500204 into lp:charms/trusty/rabbitmq-server
Status: Merged
Merged at revision: 105
Proposed branch: lp:~tribaal/charms/trusty/rabbitmq-server/backport-lp1500204
Merge into: lp:charms/trusty/rabbitmq-server
Diff against target: 325 lines (+163/-63), 4 files modified
  hooks/rabbit_utils.py (+72/-50)
  hooks/rabbitmq_server_relations.py (+19/-12)
  metadata.yaml (+1/-1)
  unit_tests/test_rabbit_utils.py (+71/-0)
To merge this branch: bzr merge lp:~tribaal/charms/trusty/rabbitmq-server/backport-lp1500204
Related bugs:
Reviewer | Review Type | Date Requested | Status
---|---|---|---
Liam Young (community) | | | Approve
Adam Collard (community) | | | Needs Information

Review via email: mp+273958@code.launchpad.net
Commit message
Description of the change
This branch is a backport to -stable of the patch fixing LP: #1500204, here: https:/
It makes the charm ignore the min-cluster-size option when Juju leader election is available; in that case, the nodes are told to cluster with the leader instead.
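For reference, the resulting peer check behaves roughly as follows. This is a condensed sketch of the is_sufficient_peers() hunk from the preview diff below, not new API (the charmhelpers.core.hookenv imports match the charm's own):

    from charmhelpers.core.hookenv import (
        config, is_leader, log, relation_ids, related_units, INFO,
    )

    def is_sufficient_peers():
        min_size = config('min-cluster-size')
        if min_size:
            try:
                # On Juju versions with leader election this call works,
                # and leader election supersedes min-cluster-size.
                is_leader()
                log("Ignoring min-cluster-size in favour of Juju leader "
                    "election")
                return True
            except NotImplementedError:
                # Older Juju: fall back to counting peers on the cluster
                # relation, including this unit itself.
                log("Leader election is not available, using "
                    "min-cluster-size")
                size = 0
                for rid in relation_ids('cluster'):
                    size = len(related_units(rid))
                size += 1
                if min_size > size:
                    log("Insufficient number of peer units to form cluster "
                        "(expected=%s, got=%s)" % (min_size, size),
                        level=INFO)
                    return False
        return True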
Adam Collard (adam-collard):
uosci-testing-bot (uosci-testing-bot) wrote:
charm_lint_check #11615 rabbitmq-server for tribaal mp273958
LINT OK: passed
Build: http://
uosci-testing-bot (uosci-testing-bot) wrote:
charm_amulet_test #7265 rabbitmq-server for tribaal mp273958
AMULET OK: passed
Build: http://
Chris Glass (tribaal) wrote:
Putting this back into WIP until we address the comments in -next first. Will re-backport to this branch.
Chris Glass (tribaal) wrote:
As discussed with members of the openstack-charmers team, we'll backport the -next version of the code rather than try to address some of the comments here (although they are valid).
The comments will be taken care of when we hopefully get rid of both min-cluster-size and max-cluster-tries altogether.
uosci-testing-bot (uosci-testing-bot) wrote:
charm_unit_test #10960 rabbitmq-server for tribaal mp273958
UNIT OK: passed
uosci-testing-bot (uosci-testing-bot) wrote:
charm_lint_check #11782 rabbitmq-server for tribaal mp273958
LINT OK: passed
Build: http://
uosci-testing-bot (uosci-testing-bot) wrote:
charm_amulet_test #7299 rabbitmq-server for tribaal mp273958
AMULET FAIL: amulet-test failed
AMULET Results (max last 2 lines):
make: *** [functional_test] Error 1
ERROR:root:Make target returned non-zero.
Full amulet test output: http://
Build: http://
Ryan Beisner (1chb1n) wrote:
FYI, the stable rmq charm still has the old (pre-refactor) tests.
Revision 110, by Chris Glass:
Empty commit to try and kick OSCI again.
uosci-testing-bot (uosci-testing-bot) wrote:
charm_unit_test #10961 rabbitmq-server for tribaal mp273958
UNIT OK: passed
uosci-testing-bot (uosci-testing-bot) wrote:
charm_lint_check #11784 rabbitmq-server for tribaal mp273958
LINT OK: passed
Build: http://
Ryan Beisner (1chb1n) wrote:
No need to re-kick; rmq stable has known-problematic tests.
For a validated backport, the -next amulet tests should also be ported.
Preview Diff
1 | === modified file 'hooks/rabbit_utils.py' |
2 | --- hooks/rabbit_utils.py 2015-09-23 19:36:14 +0000 |
3 | +++ hooks/rabbit_utils.py 2015-10-13 17:03:15 +0000 |
4 | @@ -6,6 +6,8 @@ |
5 | import subprocess |
6 | import glob |
7 | import tempfile |
8 | +import random |
9 | +import time |
10 | |
11 | from charmhelpers.contrib.openstack.utils import ( |
12 | get_hostname, |
13 | @@ -13,12 +15,10 @@ |
14 | |
15 | from charmhelpers.core.hookenv import ( |
16 | config, |
17 | - relation_ids, |
18 | - relation_get, |
19 | - related_units, |
20 | log, ERROR, |
21 | INFO, |
22 | - service_name |
23 | + service_name, |
24 | + cached, |
25 | ) |
26 | |
27 | from charmhelpers.core.host import ( |
28 | @@ -242,68 +242,42 @@ |
29 | cluster_cmd = 'join_cluster' |
30 | else: |
31 | cluster_cmd = 'cluster' |
32 | - out = subprocess.check_output([RABBITMQ_CTL, 'cluster_status']) |
33 | - log('cluster status is %s' % str(out)) |
34 | - |
35 | - # check if node is already clustered |
36 | - total_nodes = 1 |
37 | - running_nodes = [] |
38 | - m = re.search("\{running_nodes,\[(.*?)\]\}", out.strip(), re.DOTALL) |
39 | - if m is not None: |
40 | - running_nodes = m.group(1).split(',') |
41 | - running_nodes = [x.replace("'", '') for x in running_nodes] |
42 | - total_nodes = len(running_nodes) |
43 | - |
44 | - if total_nodes > 1: |
45 | + |
46 | + if clustered(): |
47 | log('Node is already clustered, skipping') |
48 | return False |
49 | |
50 | - # check all peers and try to cluster with them |
51 | - available_nodes = [] |
52 | - for r_id in relation_ids('cluster'): |
53 | - for unit in related_units(r_id): |
54 | - if config('prefer-ipv6'): |
55 | - address = relation_get('hostname', |
56 | - rid=r_id, unit=unit) |
57 | - else: |
58 | - address = relation_get('private-address', |
59 | - rid=r_id, unit=unit) |
60 | - if address is not None: |
61 | - node = get_hostname(address, fqdn=False) |
62 | - if node: |
63 | - available_nodes.append(node) |
64 | - else: |
65 | - log('Cannot resolve hostname for {} ' |
66 | - 'using DNS servers'.format(address), level='WARNING') |
67 | - |
68 | - if len(available_nodes) == 0: |
69 | + # Check the leader and try to cluster with it |
70 | + if len(leader_node()) == 0: |
71 | log('No nodes available to cluster with') |
72 | return False |
73 | |
74 | - # iterate over all the nodes, join to the first available |
75 | num_tries = 0 |
76 | - for node in available_nodes: |
77 | - log('Clustering with remote rabbit host (%s).' % node) |
78 | - if node in running_nodes: |
79 | + for node in leader_node(): |
80 | + if node in running_nodes(): |
81 | log('Host already clustered with %s.' % node) |
82 | return False |
83 | - |
84 | + # NOTE: The primary problem rabbitmq has clustering is when |
85 | + # more than one node attempts to cluster at the same time. |
86 | + # The asynchronous nature of hook firing nearly guarantees |
87 | + # this. Using random time wait is a hack until we can |
88 | + # implement charmhelpers.coordinator. |
89 | + time.sleep(random.random()*100) |
90 | + log('Clustering with remote rabbit host (%s).' % node) |
91 | try: |
92 | cmd = [RABBITMQ_CTL, 'stop_app'] |
93 | subprocess.check_call(cmd) |
94 | - cmd = [RABBITMQ_CTL, cluster_cmd, 'rabbit@%s' % node] |
95 | - try: |
96 | - subprocess.check_output(cmd, stderr=subprocess.STDOUT) |
97 | - except subprocess.CalledProcessError as e: |
98 | - if not e.returncode == 2 or \ |
99 | - "{ok,already_member}" not in e.output: |
100 | - raise e |
101 | + cmd = [RABBITMQ_CTL, cluster_cmd, node] |
102 | + subprocess.check_output(cmd, stderr=subprocess.STDOUT) |
103 | cmd = [RABBITMQ_CTL, 'start_app'] |
104 | subprocess.check_call(cmd) |
105 | log('Host clustered with %s.' % node) |
106 | return True |
107 | - except: |
108 | - log('Failed to cluster with %s.' % node) |
109 | + except subprocess.CalledProcessError as e: |
110 | + log('Failed to cluster with %s. Exception: %s' |
111 | + % (node, e)) |
112 | + cmd = [RABBITMQ_CTL, 'start_app'] |
113 | + subprocess.check_call(cmd) |
114 | # continue to the next node |
115 | num_tries += 1 |
116 | if num_tries > config('max-cluster-tries'): |
117 | @@ -591,3 +565,51 @@ |
118 | for v in restart_map().values(): |
119 | _services = _services + v |
120 | return list(set(_services)) |
121 | + |
122 | + |
123 | +def running_nodes(): |
124 | + ''' Determine the current set of running rabbitmq-units in the cluster ''' |
125 | + out = subprocess.check_output([RABBITMQ_CTL, 'cluster_status']) |
126 | + |
127 | + running_nodes = [] |
128 | + m = re.search("\{running_nodes,\[(.*?)\]\}", out.strip(), re.DOTALL) |
129 | + if m is not None: |
130 | + running_nodes = m.group(1).split(',') |
131 | + running_nodes = [x.replace("'", '').strip() for x in running_nodes] |
132 | + |
133 | + return running_nodes |
134 | + |
135 | + |
136 | +@cached |
137 | +def leader_node(): |
138 | + ''' Provide the leader node for clustering ''' |
139 | + # Each rabbitmq node should join_cluster with the leader |
140 | + # to avoid split-brain clusters. |
141 | + leader_node_ip = peer_retrieve('leader_node_ip') |
142 | + if leader_node_ip: |
143 | + return ["rabbit@" + get_node_hostname(leader_node_ip)] |
144 | + else: |
145 | + return [] |
146 | + |
147 | + |
148 | +def get_node_hostname(address): |
149 | + ''' Resolve IP address to hostname for nodes ''' |
150 | + node = get_hostname(address, fqdn=False) |
151 | + if node: |
152 | + return node |
153 | + else: |
154 | + log('Cannot resolve hostname for {} using DNS servers'.format(address), |
155 | + level='WARNING') |
156 | + return None |
157 | + |
158 | + |
159 | +@cached |
160 | +def clustered(): |
161 | + ''' Determine whether local rabbitmq-server is clustered ''' |
162 | + # NOTE: A rabbitmq node can only join a cluster once. |
163 | + # Simply checking for more than one running node tells us |
164 | + # if this unit is in a cluster. |
165 | + if len(running_nodes()) > 1: |
166 | + return True |
167 | + else: |
168 | + return False |
169 | |
170 | === modified file 'hooks/rabbitmq_server_relations.py' |
171 | --- hooks/rabbitmq_server_relations.py 2015-09-23 19:36:14 +0000 |
172 | +++ hooks/rabbitmq_server_relations.py 2015-10-13 17:03:15 +0000 |
173 | @@ -63,6 +63,7 @@ |
174 | UnregisteredHookError, |
175 | is_leader, |
176 | charm_dir, |
177 | + unit_private_ip, |
178 | ) |
179 | from charmhelpers.core.host import ( |
180 | cmp_pkgrevno, |
181 | @@ -257,16 +258,23 @@ |
182 | """ |
183 | min_size = config('min-cluster-size') |
184 | if min_size: |
185 | - size = 0 |
186 | - for rid in relation_ids('cluster'): |
187 | - size = len(related_units(rid)) |
188 | + # Ignore min-cluster-size if juju has leadership election |
189 | + try: |
190 | + is_leader() |
191 | + log("Ignoring min-cluster-size in favour of Juju leader election") |
192 | + return True |
193 | + except NotImplementedError: |
194 | + log("Leader election is not available, using min-cluster-size") |
195 | + size = 0 |
196 | + for rid in relation_ids('cluster'): |
197 | + size = len(related_units(rid)) |
198 | |
199 | - # Include this unit |
200 | - size += 1 |
201 | - if min_size > size: |
202 | - log("Insufficient number of peer units to form cluster " |
203 | - "(expected=%s, got=%s)" % (min_size, size), level=INFO) |
204 | - return False |
205 | + # Include this unit |
206 | + size += 1 |
207 | + if min_size > size: |
208 | + log("Insufficient number of peer units to form cluster " |
209 | + "(expected=%s, got=%s)" % (min_size, size), level=INFO) |
210 | + return False |
211 | |
212 | return True |
213 | |
214 | @@ -315,6 +323,7 @@ |
215 | log('Leader peer_storing cookie', level=INFO) |
216 | cookie = open(rabbit.COOKIE_PATH, 'r').read().strip() |
217 | peer_store('cookie', cookie) |
218 | + peer_store('leader_node_ip', unit_private_ip()) |
219 | |
220 | |
221 | @hooks.hook('cluster-relation-changed') |
222 | @@ -341,10 +350,8 @@ |
223 | rabbit.update_hosts_file({private_address: hostname}) |
224 | |
225 | if not is_sufficient_peers(): |
226 | - # Stop rabbit until leader has finished configuring |
227 | - log('Not enough peers, stopping until leader is configured', |
228 | + log('Not enough peers, waiting until leader is configured', |
229 | level=INFO) |
230 | - service_stop('rabbitmq-server') |
231 | return |
232 | |
233 | # sync the cookie with peers if necessary |
234 | |
235 | === modified file 'metadata.yaml' |
236 | --- metadata.yaml 2013-11-15 19:15:16 +0000 |
237 | +++ metadata.yaml 2015-10-13 17:03:15 +0000 |
238 | @@ -5,7 +5,7 @@ |
239 | RabbitMQ is an implementation of AMQP, the emerging standard for high |
240 | performance enterprise messaging. The RabbitMQ server is a robust and |
241 | scalable implementation of an AMQP broker. |
242 | -categories: ["misc"] |
243 | +tags: ["misc"] |
244 | provides: |
245 | amqp: |
246 | interface: rabbitmq |
247 | |
248 | === modified file 'unit_tests/test_rabbit_utils.py' |
249 | --- unit_tests/test_rabbit_utils.py 2015-04-13 15:42:57 +0000 |
250 | +++ unit_tests/test_rabbit_utils.py 2015-10-13 17:03:15 +0000 |
251 | @@ -69,3 +69,74 @@ |
252 | self.assertEqual(lines[0], "#somedata\n") |
253 | self.assertEqual(lines[1], "%s %s\n" % (map.items()[0])) |
254 | self.assertEqual(lines[4], "%s %s\n" % (map.items()[3])) |
255 | + |
256 | + @mock.patch('rabbit_utils.get_hostname') |
257 | + def test_get_node_hostname(self, mock_get_hostname): |
258 | + mock_get_hostname.return_value = 'juju-devel3-machine-13' |
259 | + self.assertEqual(rabbit_utils.get_node_hostname('192.168.20.50'), |
260 | + 'juju-devel3-machine-13') |
261 | + mock_get_hostname.assert_called_with('192.168.20.50', fqdn=False) |
262 | + |
263 | + @mock.patch('rabbit_utils.get_node_hostname') |
264 | + @mock.patch('rabbit_utils.peer_retrieve') |
265 | + def test_leader_node(self, mock_peer_retrieve, mock_get_node_hostname): |
266 | + mock_peer_retrieve.return_value = '192.168.20.50' |
267 | + mock_get_node_hostname.return_value = 'juju-devel3-machine-15' |
268 | + self.assertEqual(rabbit_utils.leader_node(), |
269 | + ['rabbit@juju-devel3-machine-15']) |
270 | + |
271 | + @mock.patch('rabbit_utils.subprocess.check_call') |
272 | + @mock.patch('rabbit_utils.subprocess.check_output') |
273 | + @mock.patch('rabbit_utils.time') |
274 | + @mock.patch('rabbit_utils.running_nodes') |
275 | + @mock.patch('rabbit_utils.leader_node') |
276 | + @mock.patch('rabbit_utils.clustered') |
277 | + @mock.patch('rabbit_utils.cmp_pkgrevno') |
278 | + def test_cluster_with_not_clustered(self, mock_cmp_pkgrevno, |
279 | + mock_clustered, mock_leader_node, |
280 | + mock_running_nodes, mock_time, |
281 | + mock_check_output, mock_check_call): |
282 | + mock_cmp_pkgrevno.return_value = True |
283 | + mock_clustered.return_value = False |
284 | + mock_leader_node.return_value = ['rabbit@juju-devel7-machine-11'] |
285 | + mock_running_nodes.return_value = ['rabbit@juju-devel5-machine-19'] |
286 | + rabbit_utils.cluster_with() |
287 | + mock_check_output.assert_called_with([rabbit_utils.RABBITMQ_CTL, |
288 | + 'join_cluster', |
289 | + 'rabbit@juju-devel7-machine-11'], |
290 | + stderr=-2) |
291 | + |
292 | + @mock.patch('rabbit_utils.subprocess.check_call') |
293 | + @mock.patch('rabbit_utils.subprocess.check_output') |
294 | + @mock.patch('rabbit_utils.time') |
295 | + @mock.patch('rabbit_utils.running_nodes') |
296 | + @mock.patch('rabbit_utils.leader_node') |
297 | + @mock.patch('rabbit_utils.clustered') |
298 | + @mock.patch('rabbit_utils.cmp_pkgrevno') |
299 | + def test_cluster_with_clustered(self, mock_cmp_pkgrevno, mock_clustered, |
300 | + mock_leader_node, mock_running_nodes, |
301 | + mock_time, mock_check_output, |
302 | + mock_check_call): |
303 | + mock_clustered.return_value = True |
304 | + mock_leader_node.return_value = ['rabbit@juju-devel7-machine-11'] |
305 | + mock_running_nodes.return_value = ['rabbit@juju-devel5-machine-19', |
306 | + 'rabbit@juju-devel7-machine-11'] |
307 | + rabbit_utils.cluster_with() |
308 | + assert not mock_check_output.called |
309 | + |
310 | + @mock.patch('rabbit_utils.subprocess.check_call') |
311 | + @mock.patch('rabbit_utils.subprocess.check_output') |
312 | + @mock.patch('rabbit_utils.time') |
313 | + @mock.patch('rabbit_utils.running_nodes') |
314 | + @mock.patch('rabbit_utils.leader_node') |
315 | + @mock.patch('rabbit_utils.clustered') |
316 | + @mock.patch('rabbit_utils.cmp_pkgrevno') |
317 | + def test_cluster_with_no_leader(self, mock_cmp_pkgrevno, mock_clustered, |
318 | + mock_leader_node, mock_running_nodes, |
319 | + mock_time, mock_check_output, |
320 | + mock_check_call): |
321 | + mock_clustered.return_value = False |
322 | + mock_leader_node.return_value = [] |
323 | + mock_running_nodes.return_value = ['rabbit@juju-devel5-machine-19'] |
324 | + rabbit_utils.cluster_with() |
325 | + assert not mock_check_output.called |
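A side note on the jittered join in cluster_with() above: hook execution is asynchronous across units, so two nodes can attempt join_cluster at nearly the same time, which is the main way RabbitMQ clustering wedges. The in-diff NOTE calls the random sleep a stopgap until charmhelpers.coordinator exists. Here is a minimal standalone sketch of the pattern, with hypothetical join() and is_clustered() callables standing in for the rabbitmqctl plumbing:

    import random
    import time

    def join_with_jitter(join, is_clustered):
        # Sleep a random interval (up to ~100s, mirroring the charm's
        # time.sleep(random.random() * 100)) so that concurrent units
        # tend to attempt the join one at a time.
        time.sleep(random.random() * 100)
        if is_clustered():
            # Another unit (or an earlier hook run) already formed the
            # cluster while we slept; nothing left to do.
            return False
        join()
        return True

Note that the charm checks running_nodes() before sleeping; re-checking after the sleep, as above, narrows the race window further but does not close it, hence the plan to move to a proper coordination layer.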
charm_unit_test #10804 rabbitmq-server for tribaal mp273958
UNIT OK: passed
Build: http://10.245.162.77:8080/job/charm_unit_test/10804/