Merge lp:~allenap/maas/power-change-merge-and-log into lp:~maas-committers/maas/trunk
- power-change-merge-and-log
- Merge into trunk
Proposed by
Gavin Panella
Status: | Merged | ||||
---|---|---|---|---|---|
Approved by: | Gavin Panella | ||||
Approved revision: | no longer in the source branch. | ||||
Merged at revision: | 3735 | ||||
Proposed branch: | lp:~allenap/maas/power-change-merge-and-log | ||||
Merge into: | lp:~maas-committers/maas/trunk | ||||
Diff against target: |
306 lines (+115/-41) 2 files modified
src/provisioningserver/rpc/power.py (+46/-22) src/provisioningserver/rpc/tests/test_power.py (+69/-19) |
||||
To merge this branch: | bzr merge lp:~allenap/maas/power-change-merge-and-log | ||||
Related bugs: |
|
Reviewer | Review Type | Date Requested | Status |
---|---|---|---|
Raphaël Badin (community) | Approve | ||
Review via email: mp+254332@code.launchpad.net |
Commit message
Improve logging for power changes in the cluster, and allow non-conflicting power changes.
Description of the change
To post a comment you must log in.
Revision history for this message
Gavin Panella (allenap) wrote : | # |
> Looks good… will this improve the situation for one of the bugs I
> filed yesterday about the mysterious unhandled twisted errors?
> (1436885 & co)
I hope so. I wasn't sure so I haven't linked them to the bug/bugs, but I
think I will, but not assign the bug to myself unless they do end up
fixed.
Preview Diff
[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1 | === modified file 'src/provisioningserver/rpc/power.py' | |||
2 | --- src/provisioningserver/rpc/power.py 2015-03-25 15:33:23 +0000 | |||
3 | +++ src/provisioningserver/rpc/power.py 2015-03-27 00:50:35 +0000 | |||
4 | @@ -40,6 +40,7 @@ | |||
5 | 40 | ) | 40 | ) |
6 | 41 | from provisioningserver.utils.twisted import ( | 41 | from provisioningserver.utils.twisted import ( |
7 | 42 | asynchronous, | 42 | asynchronous, |
8 | 43 | callOut, | ||
9 | 43 | deferred, | 44 | deferred, |
10 | 44 | deferWithTimeout, | 45 | deferWithTimeout, |
11 | 45 | pause, | 46 | pause, |
12 | @@ -47,6 +48,7 @@ | |||
13 | 47 | ) | 48 | ) |
14 | 48 | from twisted.internet import reactor | 49 | from twisted.internet import reactor |
15 | 49 | from twisted.internet.defer import ( | 50 | from twisted.internet.defer import ( |
16 | 51 | CancelledError, | ||
17 | 50 | DeferredList, | 52 | DeferredList, |
18 | 51 | DeferredSemaphore, | 53 | DeferredSemaphore, |
19 | 52 | inlineCallbacks, | 54 | inlineCallbacks, |
20 | @@ -183,33 +185,55 @@ | |||
21 | 183 | """ | 185 | """ |
22 | 184 | assert power_change in ('on', 'off'), ( | 186 | assert power_change in ('on', 'off'), ( |
23 | 185 | "Unknown power change: %s" % power_change) | 187 | "Unknown power change: %s" % power_change) |
24 | 188 | |||
25 | 186 | # There should be one and only one power change for each system ID. | 189 | # There should be one and only one power change for each system ID. |
29 | 187 | # If there's one already, raise an error. | 190 | if system_id in power_action_registry: |
30 | 188 | registered_power_action = power_action_registry.get(system_id, None) | 191 | current_power_change, d = power_action_registry[system_id] |
31 | 189 | if registered_power_action is not None: | 192 | else: |
32 | 193 | current_power_change, d = None, None | ||
33 | 194 | |||
34 | 195 | if current_power_change is None: | ||
35 | 196 | # Arrange for the power change to happen later; do not make the caller | ||
36 | 197 | # wait, because it might take a long time. We set a timeout of two | ||
37 | 198 | # minutes so that if the power action doesn't return in a timely | ||
38 | 199 | # fashion (or fails silently or some such) it doesn't block other | ||
39 | 200 | # actions on the node. | ||
40 | 201 | d = deferLater( | ||
41 | 202 | clock, 0, deferWithTimeout, CHANGE_POWER_STATE_TIMEOUT, | ||
42 | 203 | change_power_state, system_id, hostname, power_type, power_change, | ||
43 | 204 | context, clock) | ||
44 | 205 | |||
45 | 206 | power_action_registry[system_id] = power_change, d | ||
46 | 207 | |||
47 | 208 | # Whether we succeed or fail, we need to remove the action from the | ||
48 | 209 | # registry of actions, otherwise subsequent actions will fail. | ||
49 | 210 | d.addBoth(callOut, power_action_registry.pop, system_id, None) | ||
50 | 211 | |||
51 | 212 | # Log cancellations distinctly from other errors. | ||
52 | 213 | def eb_cancelled(failure): | ||
53 | 214 | failure.trap(CancelledError) | ||
54 | 215 | log.msg( | ||
55 | 216 | "%s: Power could not be turned %s; timed out." | ||
56 | 217 | % (hostname, power_change)) | ||
57 | 218 | d.addErrback(eb_cancelled) | ||
58 | 219 | |||
59 | 220 | # Catch-all log. | ||
60 | 221 | d.addErrback( | ||
61 | 222 | log.err, "%s: Power could not be turned %s." % ( | ||
62 | 223 | hostname, power_change)) | ||
63 | 224 | |||
64 | 225 | elif current_power_change == power_change: | ||
65 | 226 | # What we want is already happening; let it continue. | ||
66 | 227 | pass | ||
67 | 228 | |||
68 | 229 | else: | ||
69 | 230 | # Right now we reject conflicting power changes. However, we have the | ||
70 | 231 | # Deferred (in `d`) along which the current power change is occurring, | ||
71 | 232 | # so the option to cancel is available if we want it. | ||
72 | 190 | raise PowerActionAlreadyInProgress( | 233 | raise PowerActionAlreadyInProgress( |
73 | 191 | "Unable to change power state to '%s' for node %s: another " | 234 | "Unable to change power state to '%s' for node %s: another " |
74 | 192 | "action is already in progress for that node." % | 235 | "action is already in progress for that node." % |
75 | 193 | (power_change, hostname)) | 236 | (power_change, hostname)) |
76 | 194 | power_action_registry[system_id] = power_change | ||
77 | 195 | |||
78 | 196 | def clean_up(*args, **kwargs): | ||
79 | 197 | power_action_registry.pop(system_id, None) | ||
80 | 198 | |||
81 | 199 | # Arrange for the power change to happen later; do not make the caller | ||
82 | 200 | # wait, because it might take a long time. | ||
83 | 201 | # We set a timeout of two minutes so that if the power action | ||
84 | 202 | # doesn't return in a timely fashion (or fails silently or | ||
85 | 203 | # some such) it doesn't block other actions on the node. | ||
86 | 204 | d = deferLater( | ||
87 | 205 | clock, 0, deferWithTimeout, CHANGE_POWER_STATE_TIMEOUT, | ||
88 | 206 | change_power_state, system_id, hostname, power_type, | ||
89 | 207 | power_change, context, clock) | ||
90 | 208 | d.addErrback(log.err) | ||
91 | 209 | # Whether we succeeded or failed, we need to remove the action | ||
92 | 210 | # from the registry of actions, otherwise every subsequent | ||
93 | 211 | # action will fail. | ||
94 | 212 | d.addBoth(clean_up) | ||
95 | 213 | 237 | ||
96 | 214 | 238 | ||
97 | 215 | @asynchronous | 239 | @asynchronous |
98 | 216 | 240 | ||
99 | === modified file 'src/provisioningserver/rpc/tests/test_power.py' | |||
100 | --- src/provisioningserver/rpc/tests/test_power.py 2015-03-25 15:33:23 +0000 | |||
101 | +++ src/provisioningserver/rpc/tests/test_power.py 2015-03-27 00:50:35 +0000 | |||
102 | @@ -243,7 +243,7 @@ | |||
103 | 243 | factory.make_name('context-key'): factory.make_name('context-val') | 243 | factory.make_name('context-key'): factory.make_name('context-val') |
104 | 244 | } | 244 | } |
105 | 245 | self.patch(power, 'pause') | 245 | self.patch(power, 'pause') |
107 | 246 | power.power_action_registry[system_id] = power_change | 246 | power.power_action_registry[system_id] = power_change, sentinel.d |
108 | 247 | # Patch the power action utility so that it says the node is | 247 | # Patch the power action utility so that it says the node is |
109 | 248 | # in the required power state. | 248 | # in the required power state. |
110 | 249 | power_action, execute = patch_power_action( | 249 | power_action, execute = patch_power_action( |
111 | @@ -274,7 +274,7 @@ | |||
112 | 274 | context = { | 274 | context = { |
113 | 275 | factory.make_name('context-key'): factory.make_name('context-val') | 275 | factory.make_name('context-key'): factory.make_name('context-val') |
114 | 276 | } | 276 | } |
116 | 277 | power.power_action_registry[system_id] = power_change | 277 | power.power_action_registry[system_id] = power_change, sentinel.d |
117 | 278 | self.patch(power, 'pause') | 278 | self.patch(power, 'pause') |
118 | 279 | power_action, execute = patch_power_action( | 279 | power_action, execute = patch_power_action( |
119 | 280 | self, return_value=random.choice(['on', 'off'])) | 280 | self, return_value=random.choice(['on', 'off'])) |
120 | @@ -302,7 +302,7 @@ | |||
121 | 302 | factory.make_name('context-key'): factory.make_name('context-val') | 302 | factory.make_name('context-key'): factory.make_name('context-val') |
122 | 303 | } | 303 | } |
123 | 304 | self.patch(power, 'pause') | 304 | self.patch(power, 'pause') |
125 | 305 | power.power_action_registry[system_id] = power_change | 305 | power.power_action_registry[system_id] = power_change, sentinel.d |
126 | 306 | # Simulate a failure to power up the node, then a success. | 306 | # Simulate a failure to power up the node, then a success. |
127 | 307 | power_action, execute = patch_power_action( | 307 | power_action, execute = patch_power_action( |
128 | 308 | self, side_effect=[None, 'off', None, 'on']) | 308 | self, side_effect=[None, 'off', None, 'on']) |
129 | @@ -332,7 +332,7 @@ | |||
130 | 332 | factory.make_name('context-key'): factory.make_name('context-val') | 332 | factory.make_name('context-key'): factory.make_name('context-val') |
131 | 333 | } | 333 | } |
132 | 334 | self.patch(power, 'pause') | 334 | self.patch(power, 'pause') |
134 | 335 | power.power_action_registry[system_id] = power_change | 335 | power.power_action_registry[system_id] = power_change, sentinel.d |
135 | 336 | # Patch the power action utility so that it says the node is | 336 | # Patch the power action utility so that it says the node is |
136 | 337 | # in the required power state. | 337 | # in the required power state. |
137 | 338 | power_action, execute = patch_power_action( | 338 | power_action, execute = patch_power_action( |
138 | @@ -363,7 +363,7 @@ | |||
139 | 363 | factory.make_name('context-key'): factory.make_name('context-val') | 363 | factory.make_name('context-key'): factory.make_name('context-val') |
140 | 364 | } | 364 | } |
141 | 365 | self.patch(power, 'pause') | 365 | self.patch(power, 'pause') |
143 | 366 | power.power_action_registry[system_id] = power_change | 366 | power.power_action_registry[system_id] = power_change, sentinel.d |
144 | 367 | # Simulate a persistent failure. | 367 | # Simulate a persistent failure. |
145 | 368 | power_action, execute = patch_power_action( | 368 | power_action, execute = patch_power_action( |
146 | 369 | self, return_value='off') | 369 | self, return_value='off') |
147 | @@ -393,7 +393,7 @@ | |||
148 | 393 | factory.make_name('context-key'): factory.make_name('context-val') | 393 | factory.make_name('context-key'): factory.make_name('context-val') |
149 | 394 | } | 394 | } |
150 | 395 | self.patch(power, 'pause') | 395 | self.patch(power, 'pause') |
152 | 396 | power.power_action_registry[system_id] = power_change | 396 | power.power_action_registry[system_id] = power_change, sentinel.d |
153 | 397 | # Simulate an exception. | 397 | # Simulate an exception. |
154 | 398 | exception_message = factory.make_name('exception') | 398 | exception_message = factory.make_name('exception') |
155 | 399 | power_action, execute = patch_power_action( | 399 | power_action, execute = patch_power_action( |
156 | @@ -418,7 +418,7 @@ | |||
157 | 418 | context = { | 418 | context = { |
158 | 419 | factory.make_name('context-key'): factory.make_name('context-val') | 419 | factory.make_name('context-key'): factory.make_name('context-val') |
159 | 420 | } | 420 | } |
161 | 421 | power.power_action_registry[system_id] = power_change | 421 | power.power_action_registry[system_id] = power_change, sentinel.d |
162 | 422 | # Simulate two failures to power up the node, then a success. | 422 | # Simulate two failures to power up the node, then a success. |
163 | 423 | power_action, execute = patch_power_action( | 423 | power_action, execute = patch_power_action( |
164 | 424 | self, side_effect=[None, 'off', None, 'off', None, 'on']) | 424 | self, side_effect=[None, 'off', None, 'off', None, 'on']) |
165 | @@ -789,7 +789,8 @@ | |||
166 | 789 | 789 | ||
167 | 790 | run_tests_with = MAASTwistedRunTest.make_factory(timeout=5) | 790 | run_tests_with = MAASTwistedRunTest.make_factory(timeout=5) |
168 | 791 | 791 | ||
170 | 792 | def patch_power_action_registry(self): | 792 | def setUp(self): |
171 | 793 | super(TestMaybeChangePowerState, self).setUp() | ||
172 | 793 | self.patch(power, 'power_action_registry', {}) | 794 | self.patch(power, 'power_action_registry', {}) |
173 | 794 | 795 | ||
174 | 795 | def patch_methods_using_rpc(self): | 796 | def patch_methods_using_rpc(self): |
175 | @@ -809,7 +810,6 @@ | |||
176 | 809 | @inlineCallbacks | 810 | @inlineCallbacks |
177 | 810 | def test_adds_action_to_registry(self): | 811 | def test_adds_action_to_registry(self): |
178 | 811 | self.patch_methods_using_rpc() | 812 | self.patch_methods_using_rpc() |
179 | 812 | self.patch_power_action_registry() | ||
180 | 813 | 813 | ||
181 | 814 | system_id = factory.make_name('system_id') | 814 | system_id = factory.make_name('system_id') |
182 | 815 | hostname = factory.make_name('hostname') | 815 | hostname = factory.make_name('hostname') |
183 | @@ -822,27 +822,45 @@ | |||
184 | 822 | yield power.maybe_change_power_state( | 822 | yield power.maybe_change_power_state( |
185 | 823 | system_id, hostname, power_type, power_change, context) | 823 | system_id, hostname, power_type, power_change, context) |
186 | 824 | self.assertEqual( | 824 | self.assertEqual( |
188 | 825 | {system_id: power_change}, | 825 | {system_id: (power_change, ANY)}, |
189 | 826 | power.power_action_registry) | 826 | power.power_action_registry) |
190 | 827 | reactor.runUntilCurrent() # Run all delayed calls. | 827 | reactor.runUntilCurrent() # Run all delayed calls. |
191 | 828 | self.assertEqual({}, power.power_action_registry) | 828 | self.assertEqual({}, power.power_action_registry) |
192 | 829 | 829 | ||
193 | 830 | @inlineCallbacks | 830 | @inlineCallbacks |
195 | 831 | def test_errors_when_change_already_registered(self): | 831 | def test_errors_when_change_conflicts_with_in_progress_change(self): |
196 | 832 | system_id = factory.make_name('system_id') | 832 | system_id = factory.make_name('system_id') |
197 | 833 | hostname = factory.make_name('hostname') | 833 | hostname = factory.make_name('hostname') |
198 | 834 | power_type = random.choice(power.QUERY_POWER_TYPES) | 834 | power_type = random.choice(power.QUERY_POWER_TYPES) |
200 | 835 | power_change = random.choice(['on', 'off']) | 835 | power_changes = ['on', 'off'] |
201 | 836 | random.shuffle(power_changes) | ||
202 | 837 | current_power_change, power_change = power_changes | ||
203 | 836 | context = { | 838 | context = { |
204 | 837 | factory.make_name('context-key'): factory.make_name('context-val') | 839 | factory.make_name('context-key'): factory.make_name('context-val') |
205 | 838 | } | 840 | } |
208 | 839 | 841 | power.power_action_registry[system_id] = ( | |
209 | 840 | power.power_action_registry[system_id] = power_change | 842 | current_power_change, sentinel.d) |
210 | 841 | with ExpectedException(exceptions.PowerActionAlreadyInProgress): | 843 | with ExpectedException(exceptions.PowerActionAlreadyInProgress): |
211 | 842 | yield power.maybe_change_power_state( | 844 | yield power.maybe_change_power_state( |
212 | 843 | system_id, hostname, power_type, power_change, context) | 845 | system_id, hostname, power_type, power_change, context) |
213 | 844 | 846 | ||
214 | 845 | @inlineCallbacks | 847 | @inlineCallbacks |
215 | 848 | def test_does_nothing_when_change_matches_in_progress_change(self): | ||
216 | 849 | system_id = factory.make_name('system_id') | ||
217 | 850 | hostname = factory.make_name('hostname') | ||
218 | 851 | power_type = random.choice(power.QUERY_POWER_TYPES) | ||
219 | 852 | current_power_change = power_change = random.choice(['on', 'off']) | ||
220 | 853 | context = { | ||
221 | 854 | factory.make_name('context-key'): factory.make_name('context-val') | ||
222 | 855 | } | ||
223 | 856 | power.power_action_registry[system_id] = ( | ||
224 | 857 | current_power_change, sentinel.d) | ||
225 | 858 | yield power.maybe_change_power_state( | ||
226 | 859 | system_id, hostname, power_type, power_change, context) | ||
227 | 860 | self.assertThat(power.power_action_registry, Equals( | ||
228 | 861 | {system_id: (power_change, sentinel.d)})) | ||
229 | 862 | |||
230 | 863 | @inlineCallbacks | ||
231 | 846 | def test_calls_change_power_state_later(self): | 864 | def test_calls_change_power_state_later(self): |
232 | 847 | self.patch_methods_using_rpc() | 865 | self.patch_methods_using_rpc() |
233 | 848 | 866 | ||
234 | @@ -865,7 +883,6 @@ | |||
235 | 865 | 883 | ||
236 | 866 | @inlineCallbacks | 884 | @inlineCallbacks |
237 | 867 | def test_clears_lock_if_change_power_state_success(self): | 885 | def test_clears_lock_if_change_power_state_success(self): |
238 | 868 | self.patch_power_action_registry() | ||
239 | 869 | self.patch_methods_using_rpc() | 886 | self.patch_methods_using_rpc() |
240 | 870 | 887 | ||
241 | 871 | system_id = factory.make_name('system_id') | 888 | system_id = factory.make_name('system_id') |
242 | @@ -883,7 +900,6 @@ | |||
243 | 883 | 900 | ||
244 | 884 | @inlineCallbacks | 901 | @inlineCallbacks |
245 | 885 | def test_clears_lock_if_change_power_state_fails(self): | 902 | def test_clears_lock_if_change_power_state_fails(self): |
246 | 886 | self.patch_power_action_registry() | ||
247 | 887 | 903 | ||
248 | 888 | class TestException(Exception): | 904 | class TestException(Exception): |
249 | 889 | pass | 905 | pass |
250 | @@ -892,7 +908,7 @@ | |||
251 | 892 | power.power_change_starting.side_effect = TestException('boom') | 908 | power.power_change_starting.side_effect = TestException('boom') |
252 | 893 | 909 | ||
253 | 894 | system_id = factory.make_name('system_id') | 910 | system_id = factory.make_name('system_id') |
255 | 895 | hostname = sentinel.hostname | 911 | hostname = factory.make_hostname() |
256 | 896 | power_type = sentinel.power_type | 912 | power_type = sentinel.power_type |
257 | 897 | power_change = random.choice(['on', 'off']) | 913 | power_change = random.choice(['on', 'off']) |
258 | 898 | context = sentinel.context | 914 | context = sentinel.context |
259 | @@ -904,12 +920,46 @@ | |||
260 | 904 | reactor.runUntilCurrent() # Run all delayed calls. | 920 | reactor.runUntilCurrent() # Run all delayed calls. |
261 | 905 | self.assertNotIn(system_id, power.power_action_registry) | 921 | self.assertNotIn(system_id, power.power_action_registry) |
262 | 906 | self.assertDocTestMatches( | 922 | self.assertDocTestMatches( |
264 | 907 | "Unhandled Error...TestException: boom", | 923 | """\ |
265 | 924 | %s: Power could not be turned %s. | ||
266 | 925 | Traceback (most recent call last): | ||
267 | 926 | ... | ||
268 | 927 | %s.TestException: boom | ||
269 | 928 | """ % (hostname, power_change, __name__), | ||
270 | 929 | logger.dump()) | ||
271 | 930 | |||
272 | 931 | @inlineCallbacks | ||
273 | 932 | def test_clears_lock_if_change_power_state_is_cancelled(self): | ||
274 | 933 | # Patch in an unfired Deferred here. This will pause the call so that | ||
275 | 934 | # we can grab the delayed call from the registry in time to cancel it. | ||
276 | 935 | self.patch_autospec(power, 'change_power_state') | ||
277 | 936 | power.change_power_state.return_value = Deferred() | ||
278 | 937 | |||
279 | 938 | system_id = factory.make_name('system_id') | ||
280 | 939 | hostname = factory.make_hostname() | ||
281 | 940 | power_type = sentinel.power_type | ||
282 | 941 | power_change = random.choice(['on', 'off']) | ||
283 | 942 | context = sentinel.context | ||
284 | 943 | |||
285 | 944 | logger = self.useFixture(TwistedLoggerFixture()) | ||
286 | 945 | |||
287 | 946 | yield power.maybe_change_power_state( | ||
288 | 947 | system_id, hostname, power_type, power_change, context) | ||
289 | 948 | |||
290 | 949 | # Get the Deferred from the registry and cancel it. | ||
291 | 950 | _, d = power.power_action_registry[system_id] | ||
292 | 951 | d.cancel() | ||
293 | 952 | yield d | ||
294 | 953 | |||
295 | 954 | self.assertNotIn(system_id, power.power_action_registry) | ||
296 | 955 | self.assertDocTestMatches( | ||
297 | 956 | """\ | ||
298 | 957 | %s: Power could not be turned %s; timed out. | ||
299 | 958 | """ % (hostname, power_change), | ||
300 | 908 | logger.dump()) | 959 | logger.dump()) |
301 | 909 | 960 | ||
302 | 910 | @inlineCallbacks | 961 | @inlineCallbacks |
303 | 911 | def test__calls_change_power_state_with_timeout(self): | 962 | def test__calls_change_power_state_with_timeout(self): |
304 | 912 | self.patch_power_action_registry() | ||
305 | 913 | self.patch_methods_using_rpc() | 963 | self.patch_methods_using_rpc() |
306 | 914 | defer_with_timeout = self.patch(power, 'deferWithTimeout') | 964 | defer_with_timeout = self.patch(power, 'deferWithTimeout') |
307 | 915 | 965 |
Looks good… will this improve the situation for one of the bugs I filed yesterday about the mysterious unhandled twisted errors? (1436885 & co)