Merge lp:~allenap/maas/power-change-merge-and-log into lp:~maas-committers/maas/trunk

Proposed by Gavin Panella
Status: Merged
Approved by: Gavin Panella
Approved revision: no longer in the source branch.
Merged at revision: 3735
Proposed branch: lp:~allenap/maas/power-change-merge-and-log
Merge into: lp:~maas-committers/maas/trunk
Diff against target: 306 lines (+115/-41)
2 files modified
src/provisioningserver/rpc/power.py (+46/-22)
src/provisioningserver/rpc/tests/test_power.py (+69/-19)
To merge this branch: bzr merge lp:~allenap/maas/power-change-merge-and-log
Reviewer Review Type Date Requested Status
Raphaël Badin (community) Approve
Review via email: mp+254332@code.launchpad.net

Commit message

Improve logging for power changes in the cluster, and allow non-conflicting power changes.

To post a comment you must log in.
Revision history for this message
Raphaël Badin (rvb) wrote :

Looks good… will this improve the situation for one of the bugs I filed yesterday about the mysterious unhandled twisted errors? (1436885 & co)

review: Approve
Revision history for this message
Gavin Panella (allenap) wrote :

> Looks good… will this improve the situation for one of the bugs I
> filed yesterday about the mysterious unhandled twisted errors?
> (1436885 & co)

I hope so. I wasn't sure so I haven't linked them to the bug/bugs, but I
think I will, but not assign the bug to myself unless they do end up
fixed.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
=== modified file 'src/provisioningserver/rpc/power.py'
--- src/provisioningserver/rpc/power.py 2015-03-25 15:33:23 +0000
+++ src/provisioningserver/rpc/power.py 2015-03-27 00:50:35 +0000
@@ -40,6 +40,7 @@
40)40)
41from provisioningserver.utils.twisted import (41from provisioningserver.utils.twisted import (
42 asynchronous,42 asynchronous,
43 callOut,
43 deferred,44 deferred,
44 deferWithTimeout,45 deferWithTimeout,
45 pause,46 pause,
@@ -47,6 +48,7 @@
47)48)
48from twisted.internet import reactor49from twisted.internet import reactor
49from twisted.internet.defer import (50from twisted.internet.defer import (
51 CancelledError,
50 DeferredList,52 DeferredList,
51 DeferredSemaphore,53 DeferredSemaphore,
52 inlineCallbacks,54 inlineCallbacks,
@@ -183,33 +185,55 @@
183 """185 """
184 assert power_change in ('on', 'off'), (186 assert power_change in ('on', 'off'), (
185 "Unknown power change: %s" % power_change)187 "Unknown power change: %s" % power_change)
188
186 # There should be one and only one power change for each system ID.189 # There should be one and only one power change for each system ID.
187 # If there's one already, raise an error.190 if system_id in power_action_registry:
188 registered_power_action = power_action_registry.get(system_id, None)191 current_power_change, d = power_action_registry[system_id]
189 if registered_power_action is not None:192 else:
193 current_power_change, d = None, None
194
195 if current_power_change is None:
196 # Arrange for the power change to happen later; do not make the caller
197 # wait, because it might take a long time. We set a timeout of two
198 # minutes so that if the power action doesn't return in a timely
199 # fashion (or fails silently or some such) it doesn't block other
200 # actions on the node.
201 d = deferLater(
202 clock, 0, deferWithTimeout, CHANGE_POWER_STATE_TIMEOUT,
203 change_power_state, system_id, hostname, power_type, power_change,
204 context, clock)
205
206 power_action_registry[system_id] = power_change, d
207
208 # Whether we succeed or fail, we need to remove the action from the
209 # registry of actions, otherwise subsequent actions will fail.
210 d.addBoth(callOut, power_action_registry.pop, system_id, None)
211
212 # Log cancellations distinctly from other errors.
213 def eb_cancelled(failure):
214 failure.trap(CancelledError)
215 log.msg(
216 "%s: Power could not be turned %s; timed out."
217 % (hostname, power_change))
218 d.addErrback(eb_cancelled)
219
220 # Catch-all log.
221 d.addErrback(
222 log.err, "%s: Power could not be turned %s." % (
223 hostname, power_change))
224
225 elif current_power_change == power_change:
226 # What we want is already happening; let it continue.
227 pass
228
229 else:
230 # Right now we reject conflicting power changes. However, we have the
231 # Deferred (in `d`) along which the current power change is occurring,
232 # so the option to cancel is available if we want it.
190 raise PowerActionAlreadyInProgress(233 raise PowerActionAlreadyInProgress(
191 "Unable to change power state to '%s' for node %s: another "234 "Unable to change power state to '%s' for node %s: another "
192 "action is already in progress for that node." %235 "action is already in progress for that node." %
193 (power_change, hostname))236 (power_change, hostname))
194 power_action_registry[system_id] = power_change
195
196 def clean_up(*args, **kwargs):
197 power_action_registry.pop(system_id, None)
198
199 # Arrange for the power change to happen later; do not make the caller
200 # wait, because it might take a long time.
201 # We set a timeout of two minutes so that if the power action
202 # doesn't return in a timely fashion (or fails silently or
203 # some such) it doesn't block other actions on the node.
204 d = deferLater(
205 clock, 0, deferWithTimeout, CHANGE_POWER_STATE_TIMEOUT,
206 change_power_state, system_id, hostname, power_type,
207 power_change, context, clock)
208 d.addErrback(log.err)
209 # Whether we succeeded or failed, we need to remove the action
210 # from the registry of actions, otherwise every subsequent
211 # action will fail.
212 d.addBoth(clean_up)
213237
214238
215@asynchronous239@asynchronous
216240
=== modified file 'src/provisioningserver/rpc/tests/test_power.py'
--- src/provisioningserver/rpc/tests/test_power.py 2015-03-25 15:33:23 +0000
+++ src/provisioningserver/rpc/tests/test_power.py 2015-03-27 00:50:35 +0000
@@ -243,7 +243,7 @@
243 factory.make_name('context-key'): factory.make_name('context-val')243 factory.make_name('context-key'): factory.make_name('context-val')
244 }244 }
245 self.patch(power, 'pause')245 self.patch(power, 'pause')
246 power.power_action_registry[system_id] = power_change246 power.power_action_registry[system_id] = power_change, sentinel.d
247 # Patch the power action utility so that it says the node is247 # Patch the power action utility so that it says the node is
248 # in the required power state.248 # in the required power state.
249 power_action, execute = patch_power_action(249 power_action, execute = patch_power_action(
@@ -274,7 +274,7 @@
274 context = {274 context = {
275 factory.make_name('context-key'): factory.make_name('context-val')275 factory.make_name('context-key'): factory.make_name('context-val')
276 }276 }
277 power.power_action_registry[system_id] = power_change277 power.power_action_registry[system_id] = power_change, sentinel.d
278 self.patch(power, 'pause')278 self.patch(power, 'pause')
279 power_action, execute = patch_power_action(279 power_action, execute = patch_power_action(
280 self, return_value=random.choice(['on', 'off']))280 self, return_value=random.choice(['on', 'off']))
@@ -302,7 +302,7 @@
302 factory.make_name('context-key'): factory.make_name('context-val')302 factory.make_name('context-key'): factory.make_name('context-val')
303 }303 }
304 self.patch(power, 'pause')304 self.patch(power, 'pause')
305 power.power_action_registry[system_id] = power_change305 power.power_action_registry[system_id] = power_change, sentinel.d
306 # Simulate a failure to power up the node, then a success.306 # Simulate a failure to power up the node, then a success.
307 power_action, execute = patch_power_action(307 power_action, execute = patch_power_action(
308 self, side_effect=[None, 'off', None, 'on'])308 self, side_effect=[None, 'off', None, 'on'])
@@ -332,7 +332,7 @@
332 factory.make_name('context-key'): factory.make_name('context-val')332 factory.make_name('context-key'): factory.make_name('context-val')
333 }333 }
334 self.patch(power, 'pause')334 self.patch(power, 'pause')
335 power.power_action_registry[system_id] = power_change335 power.power_action_registry[system_id] = power_change, sentinel.d
336 # Patch the power action utility so that it says the node is336 # Patch the power action utility so that it says the node is
337 # in the required power state.337 # in the required power state.
338 power_action, execute = patch_power_action(338 power_action, execute = patch_power_action(
@@ -363,7 +363,7 @@
363 factory.make_name('context-key'): factory.make_name('context-val')363 factory.make_name('context-key'): factory.make_name('context-val')
364 }364 }
365 self.patch(power, 'pause')365 self.patch(power, 'pause')
366 power.power_action_registry[system_id] = power_change366 power.power_action_registry[system_id] = power_change, sentinel.d
367 # Simulate a persistent failure.367 # Simulate a persistent failure.
368 power_action, execute = patch_power_action(368 power_action, execute = patch_power_action(
369 self, return_value='off')369 self, return_value='off')
@@ -393,7 +393,7 @@
393 factory.make_name('context-key'): factory.make_name('context-val')393 factory.make_name('context-key'): factory.make_name('context-val')
394 }394 }
395 self.patch(power, 'pause')395 self.patch(power, 'pause')
396 power.power_action_registry[system_id] = power_change396 power.power_action_registry[system_id] = power_change, sentinel.d
397 # Simulate an exception.397 # Simulate an exception.
398 exception_message = factory.make_name('exception')398 exception_message = factory.make_name('exception')
399 power_action, execute = patch_power_action(399 power_action, execute = patch_power_action(
@@ -418,7 +418,7 @@
418 context = {418 context = {
419 factory.make_name('context-key'): factory.make_name('context-val')419 factory.make_name('context-key'): factory.make_name('context-val')
420 }420 }
421 power.power_action_registry[system_id] = power_change421 power.power_action_registry[system_id] = power_change, sentinel.d
422 # Simulate two failures to power up the node, then a success.422 # Simulate two failures to power up the node, then a success.
423 power_action, execute = patch_power_action(423 power_action, execute = patch_power_action(
424 self, side_effect=[None, 'off', None, 'off', None, 'on'])424 self, side_effect=[None, 'off', None, 'off', None, 'on'])
@@ -789,7 +789,8 @@
789789
790 run_tests_with = MAASTwistedRunTest.make_factory(timeout=5)790 run_tests_with = MAASTwistedRunTest.make_factory(timeout=5)
791791
792 def patch_power_action_registry(self):792 def setUp(self):
793 super(TestMaybeChangePowerState, self).setUp()
793 self.patch(power, 'power_action_registry', {})794 self.patch(power, 'power_action_registry', {})
794795
795 def patch_methods_using_rpc(self):796 def patch_methods_using_rpc(self):
@@ -809,7 +810,6 @@
809 @inlineCallbacks810 @inlineCallbacks
810 def test_adds_action_to_registry(self):811 def test_adds_action_to_registry(self):
811 self.patch_methods_using_rpc()812 self.patch_methods_using_rpc()
812 self.patch_power_action_registry()
813813
814 system_id = factory.make_name('system_id')814 system_id = factory.make_name('system_id')
815 hostname = factory.make_name('hostname')815 hostname = factory.make_name('hostname')
@@ -822,27 +822,45 @@
822 yield power.maybe_change_power_state(822 yield power.maybe_change_power_state(
823 system_id, hostname, power_type, power_change, context)823 system_id, hostname, power_type, power_change, context)
824 self.assertEqual(824 self.assertEqual(
825 {system_id: power_change},825 {system_id: (power_change, ANY)},
826 power.power_action_registry)826 power.power_action_registry)
827 reactor.runUntilCurrent() # Run all delayed calls.827 reactor.runUntilCurrent() # Run all delayed calls.
828 self.assertEqual({}, power.power_action_registry)828 self.assertEqual({}, power.power_action_registry)
829829
830 @inlineCallbacks830 @inlineCallbacks
831 def test_errors_when_change_already_registered(self):831 def test_errors_when_change_conflicts_with_in_progress_change(self):
832 system_id = factory.make_name('system_id')832 system_id = factory.make_name('system_id')
833 hostname = factory.make_name('hostname')833 hostname = factory.make_name('hostname')
834 power_type = random.choice(power.QUERY_POWER_TYPES)834 power_type = random.choice(power.QUERY_POWER_TYPES)
835 power_change = random.choice(['on', 'off'])835 power_changes = ['on', 'off']
836 random.shuffle(power_changes)
837 current_power_change, power_change = power_changes
836 context = {838 context = {
837 factory.make_name('context-key'): factory.make_name('context-val')839 factory.make_name('context-key'): factory.make_name('context-val')
838 }840 }
839841 power.power_action_registry[system_id] = (
840 power.power_action_registry[system_id] = power_change842 current_power_change, sentinel.d)
841 with ExpectedException(exceptions.PowerActionAlreadyInProgress):843 with ExpectedException(exceptions.PowerActionAlreadyInProgress):
842 yield power.maybe_change_power_state(844 yield power.maybe_change_power_state(
843 system_id, hostname, power_type, power_change, context)845 system_id, hostname, power_type, power_change, context)
844846
845 @inlineCallbacks847 @inlineCallbacks
848 def test_does_nothing_when_change_matches_in_progress_change(self):
849 system_id = factory.make_name('system_id')
850 hostname = factory.make_name('hostname')
851 power_type = random.choice(power.QUERY_POWER_TYPES)
852 current_power_change = power_change = random.choice(['on', 'off'])
853 context = {
854 factory.make_name('context-key'): factory.make_name('context-val')
855 }
856 power.power_action_registry[system_id] = (
857 current_power_change, sentinel.d)
858 yield power.maybe_change_power_state(
859 system_id, hostname, power_type, power_change, context)
860 self.assertThat(power.power_action_registry, Equals(
861 {system_id: (power_change, sentinel.d)}))
862
863 @inlineCallbacks
846 def test_calls_change_power_state_later(self):864 def test_calls_change_power_state_later(self):
847 self.patch_methods_using_rpc()865 self.patch_methods_using_rpc()
848866
@@ -865,7 +883,6 @@
865883
866 @inlineCallbacks884 @inlineCallbacks
867 def test_clears_lock_if_change_power_state_success(self):885 def test_clears_lock_if_change_power_state_success(self):
868 self.patch_power_action_registry()
869 self.patch_methods_using_rpc()886 self.patch_methods_using_rpc()
870887
871 system_id = factory.make_name('system_id')888 system_id = factory.make_name('system_id')
@@ -883,7 +900,6 @@
883900
884 @inlineCallbacks901 @inlineCallbacks
885 def test_clears_lock_if_change_power_state_fails(self):902 def test_clears_lock_if_change_power_state_fails(self):
886 self.patch_power_action_registry()
887903
888 class TestException(Exception):904 class TestException(Exception):
889 pass905 pass
@@ -892,7 +908,7 @@
892 power.power_change_starting.side_effect = TestException('boom')908 power.power_change_starting.side_effect = TestException('boom')
893909
894 system_id = factory.make_name('system_id')910 system_id = factory.make_name('system_id')
895 hostname = sentinel.hostname911 hostname = factory.make_hostname()
896 power_type = sentinel.power_type912 power_type = sentinel.power_type
897 power_change = random.choice(['on', 'off'])913 power_change = random.choice(['on', 'off'])
898 context = sentinel.context914 context = sentinel.context
@@ -904,12 +920,46 @@
904 reactor.runUntilCurrent() # Run all delayed calls.920 reactor.runUntilCurrent() # Run all delayed calls.
905 self.assertNotIn(system_id, power.power_action_registry)921 self.assertNotIn(system_id, power.power_action_registry)
906 self.assertDocTestMatches(922 self.assertDocTestMatches(
907 "Unhandled Error...TestException: boom",923 """\
924 %s: Power could not be turned %s.
925 Traceback (most recent call last):
926 ...
927 %s.TestException: boom
928 """ % (hostname, power_change, __name__),
929 logger.dump())
930
931 @inlineCallbacks
932 def test_clears_lock_if_change_power_state_is_cancelled(self):
933 # Patch in an unfired Deferred here. This will pause the call so that
934 # we can grab the delayed call from the registry in time to cancel it.
935 self.patch_autospec(power, 'change_power_state')
936 power.change_power_state.return_value = Deferred()
937
938 system_id = factory.make_name('system_id')
939 hostname = factory.make_hostname()
940 power_type = sentinel.power_type
941 power_change = random.choice(['on', 'off'])
942 context = sentinel.context
943
944 logger = self.useFixture(TwistedLoggerFixture())
945
946 yield power.maybe_change_power_state(
947 system_id, hostname, power_type, power_change, context)
948
949 # Get the Deferred from the registry and cancel it.
950 _, d = power.power_action_registry[system_id]
951 d.cancel()
952 yield d
953
954 self.assertNotIn(system_id, power.power_action_registry)
955 self.assertDocTestMatches(
956 """\
957 %s: Power could not be turned %s; timed out.
958 """ % (hostname, power_change),
908 logger.dump())959 logger.dump())
909960
910 @inlineCallbacks961 @inlineCallbacks
911 def test__calls_change_power_state_with_timeout(self):962 def test__calls_change_power_state_with_timeout(self):
912 self.patch_power_action_registry()
913 self.patch_methods_using_rpc()963 self.patch_methods_using_rpc()
914 defer_with_timeout = self.patch(power, 'deferWithTimeout')964 defer_with_timeout = self.patch(power, 'deferWithTimeout')
915965