Merge lp:~andreserl/maas/lp1598149_lp1605252_2.0 into lp:maas/2.0

Proposed by Andres Rodriguez
Status: Merged
Approved by: Andres Rodriguez
Approved revision: no longer in the source branch.
Merged at revision: 5171
Proposed branch: lp:~andreserl/maas/lp1598149_lp1605252_2.0
Merge into: lp:maas/2.0
Diff against target: 199 lines (+54/-13)
5 files modified
src/maasserver/models/node.py (+10/-3)
src/maasserver/node_status.py (+16/-0)
src/maasserver/status_monitor.py (+18/-3)
src/maasserver/tests/test_status_monitor.py (+6/-5)
src/metadataserver/api.py (+4/-2)
To merge this branch: bzr merge lp:~andreserl/maas/lp1598149_lp1605252_2.0
Reviewer Review Type Date Requested Status
Andres Rodriguez (community) Approve
Review via email: mp+300923@code.launchpad.net

Commit message

Backport trunk rev 5200: Ensure that a message is logged when a deploying, releasing, or commissioning operation times out (this was supported in older releases but was dropped in 2.0).

 Also, as a drive-by fix, repair the commissioning monitor service. This ensures that status_expires is not cleared when it shouldn't be, and allows the monitor to work while commissioning.

 Another drive-by improvement is to ensure that the monitor only checks MACHINES and nothing else.

To post a comment you must log in.
Revision history for this message
Andres Rodriguez (andreserl) wrote :

self approve, it has landed in trunk.

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
=== modified file 'src/maasserver/models/node.py'
--- src/maasserver/models/node.py 2016-07-15 02:18:08 +0000
+++ src/maasserver/models/node.py 2016-07-22 16:13:17 +0000
@@ -122,6 +122,7 @@
122 COMMISSIONING_LIKE_STATUSES,122 COMMISSIONING_LIKE_STATUSES,
123 get_failed_status,123 get_failed_status,
124 is_failed_status,124 is_failed_status,
125 NODE_FAILURE_MONITORED_STATUS_TIMEOUTS,
125 NODE_TRANSITIONS,126 NODE_TRANSITIONS,
126)127)
127from maasserver.rpc import (128from maasserver.rpc import (
@@ -983,7 +984,9 @@
983 """984 """
984 # Return a *very* conservative estimate for now.985 # Return a *very* conservative estimate for now.
985 # Something that shouldn't conflict with any deployment.986 # Something that shouldn't conflict with any deployment.
986 return timedelta(minutes=40).total_seconds()987 return timedelta(
988 minutes=NODE_FAILURE_MONITORED_STATUS_TIMEOUTS[
989 NODE_STATUS.DEPLOYING]).total_seconds()
987990
988 def get_commissioning_time(self):991 def get_commissioning_time(self):
989 """Return the commissioning time of this node (in seconds).992 """Return the commissioning time of this node (in seconds).
@@ -991,14 +994,18 @@
991 This is the maximum time the commissioning is allowed to take.994 This is the maximum time the commissioning is allowed to take.
992 """995 """
993 # Return a *very* conservative estimate for now.996 # Return a *very* conservative estimate for now.
994 return timedelta(minutes=20).total_seconds()997 return timedelta(
998 minutes=NODE_FAILURE_MONITORED_STATUS_TIMEOUTS[
999 NODE_STATUS.COMMISSIONING]).total_seconds()
9951000
996 def get_releasing_time(self):1001 def get_releasing_time(self):
997 """Return the releasing time of this node (in seconds).1002 """Return the releasing time of this node (in seconds).
9981003
999 This is the maximum time that releasing is allowed to take.1004 This is the maximum time that releasing is allowed to take.
1000 """1005 """
1001 return timedelta(minutes=5).total_seconds()1006 return timedelta(
1007 minutes=NODE_FAILURE_MONITORED_STATUS_TIMEOUTS[
1008 NODE_STATUS.RELEASING]).total_seconds()
10021009
1003 def _register_request_event(1010 def _register_request_event(
1004 self, user, type_name, action='', comment=None):1011 self, user, type_name, action='', comment=None):
10051012
=== modified file 'src/maasserver/node_status.py'
--- src/maasserver/node_status.py 2015-12-01 18:12:59 +0000
+++ src/maasserver/node_status.py 2016-07-22 16:13:17 +0000
@@ -162,6 +162,22 @@
162 NODE_STATUS.DISK_ERASING: NODE_STATUS.FAILED_DISK_ERASING,162 NODE_STATUS.DISK_ERASING: NODE_STATUS.FAILED_DISK_ERASING,
163}163}
164164
165# State transitions that are monitored for timeouts for when a node
166# fails:
167# Mapping between in-progress statuses and the corresponding failed
168# statuses.
169NODE_FAILURE_MONITORED_STATUS_TRANSITIONS = {
170 NODE_STATUS.COMMISSIONING: NODE_STATUS.FAILED_COMMISSIONING,
171 NODE_STATUS.DEPLOYING: NODE_STATUS.FAILED_DEPLOYMENT,
172 NODE_STATUS.RELEASING: NODE_STATUS.FAILED_RELEASING,
173}
174
175NODE_FAILURE_MONITORED_STATUS_TIMEOUTS = {
176 NODE_STATUS.COMMISSIONING: 20,
177 NODE_STATUS.DEPLOYING: 40,
178 NODE_STATUS.RELEASING: 5,
179}
180
165# Statuses that correspond to managed steps for which MAAS actively181# Statuses that correspond to managed steps for which MAAS actively
166# monitors that the status changes after a fixed period of time.182# monitors that the status changes after a fixed period of time.
167MONITORED_STATUSES = list(NODE_FAILURE_STATUS_TRANSITIONS.keys())183MONITORED_STATUSES = list(NODE_FAILURE_STATUS_TRANSITIONS.keys())
168184
=== modified file 'src/maasserver/status_monitor.py'
--- src/maasserver/status_monitor.py 2016-02-01 10:28:01 +0000
+++ src/maasserver/status_monitor.py 2016-07-22 16:13:17 +0000
@@ -8,9 +8,16 @@
8 'StatusMonitorService',8 'StatusMonitorService',
9 ]9 ]
1010
11from maasserver.enum import (
12 NODE_STATUS_CHOICES_DICT,
13 NODE_TYPE,
14)
11from maasserver.models.node import Node15from maasserver.models.node import Node
12from maasserver.models.timestampedmodel import now16from maasserver.models.timestampedmodel import now
13from maasserver.node_status import NODE_FAILURE_STATUS_TRANSITIONS17from maasserver.node_status import (
18 NODE_FAILURE_MONITORED_STATUS_TIMEOUTS,
19 NODE_FAILURE_MONITORED_STATUS_TRANSITIONS,
20)
14from maasserver.utils.orm import transactional21from maasserver.utils.orm import transactional
15from maasserver.utils.threads import deferToDatabase22from maasserver.utils.threads import deferToDatabase
16from provisioningserver.utils.twisted import synchronous23from provisioningserver.utils.twisted import synchronous
@@ -21,14 +28,22 @@
21 """Mark all nodes in that database as failed where the status did not28 """Mark all nodes in that database as failed where the status did not
22 transition in time. `status_expires` is checked on the node to see if the29 transition in time. `status_expires` is checked on the node to see if the
23 current time is newer than the expired time.30 current time is newer than the expired time.
31
32 Status monitors are only available for Machines that are Commissioning,
33 Deploying or Releasing.
24 """34 """
25 current_db_time = now()35 current_db_time = now()
26 expired_nodes = Node.objects.filter(36 expired_nodes = Node.objects.filter(
27 status__in=NODE_FAILURE_STATUS_TRANSITIONS.keys(),37 node_type=NODE_TYPE.MACHINE,
38 status__in=NODE_FAILURE_MONITORED_STATUS_TRANSITIONS.keys(),
28 status_expires__isnull=False,39 status_expires__isnull=False,
29 status_expires__lte=current_db_time)40 status_expires__lte=current_db_time)
30 for node in expired_nodes:41 for node in expired_nodes:
31 node._mark_failed(None, commit=False)42 comment = "Machine operation '%s' timed out after %s minutes." % (
43 NODE_STATUS_CHOICES_DICT[node.status],
44 NODE_FAILURE_MONITORED_STATUS_TIMEOUTS[node.status],
45 )
46 node._mark_failed(None, commit=False, comment=comment)
32 node.status_expires = None47 node.status_expires = None
33 node.save()48 node.save()
3449
3550
=== modified file 'src/maasserver/tests/test_status_monitor.py'
--- src/maasserver/tests/test_status_monitor.py 2016-05-16 09:21:53 +0000
+++ src/maasserver/tests/test_status_monitor.py 2016-07-22 16:13:17 +0000
@@ -14,7 +14,7 @@
1414
15from maasserver import status_monitor15from maasserver import status_monitor
16from maasserver.models.signals.testing import SignalsDisabled16from maasserver.models.signals.testing import SignalsDisabled
17from maasserver.node_status import NODE_FAILURE_STATUS_TRANSITIONS17from maasserver.node_status import NODE_FAILURE_MONITORED_STATUS_TRANSITIONS
18from maasserver.status_monitor import (18from maasserver.status_monitor import (
19 mark_nodes_failed_after_expiring,19 mark_nodes_failed_after_expiring,
20 StatusMonitorService,20 StatusMonitorService,
@@ -40,7 +40,7 @@
40 expired_time = current_time - timedelta(minutes=1)40 expired_time = current_time - timedelta(minutes=1)
41 nodes = [41 nodes = [
42 factory.make_Node(status=status, status_expires=expired_time)42 factory.make_Node(status=status, status_expires=expired_time)
43 for status in NODE_FAILURE_STATUS_TRANSITIONS.keys()43 for status in NODE_FAILURE_MONITORED_STATUS_TRANSITIONS.keys()
44 ]44 ]
45 mark_nodes_failed_after_expiring()45 mark_nodes_failed_after_expiring()
46 failed_statuses = [46 failed_statuses = [
@@ -48,7 +48,8 @@
48 for node in nodes48 for node in nodes
49 ]49 ]
50 self.assertItemsEqual(50 self.assertItemsEqual(
51 NODE_FAILURE_STATUS_TRANSITIONS.values(), failed_statuses)51 NODE_FAILURE_MONITORED_STATUS_TRANSITIONS.values(),
52 failed_statuses)
5253
53 def test__skips_those_that_have_not_expired(self):54 def test__skips_those_that_have_not_expired(self):
54 self.useFixture(SignalsDisabled("power"))55 self.useFixture(SignalsDisabled("power"))
@@ -57,7 +58,7 @@
57 expired_time = current_time + timedelta(minutes=1)58 expired_time = current_time + timedelta(minutes=1)
58 nodes = [59 nodes = [
59 factory.make_Node(status=status, status_expires=expired_time)60 factory.make_Node(status=status, status_expires=expired_time)
60 for status in NODE_FAILURE_STATUS_TRANSITIONS.keys()61 for status in NODE_FAILURE_MONITORED_STATUS_TRANSITIONS.keys()
61 ]62 ]
62 mark_nodes_failed_after_expiring()63 mark_nodes_failed_after_expiring()
63 failed_statuses = [64 failed_statuses = [
@@ -65,7 +66,7 @@
65 for node in nodes66 for node in nodes
66 ]67 ]
67 self.assertItemsEqual(68 self.assertItemsEqual(
68 NODE_FAILURE_STATUS_TRANSITIONS.keys(), failed_statuses)69 NODE_FAILURE_MONITORED_STATUS_TRANSITIONS.keys(), failed_statuses)
6970
7071
71class TestStatusMonitorService(MAASServerTestCase):72class TestStatusMonitorService(MAASServerTestCase):
7273
=== modified file 'src/metadataserver/api.py'
--- src/metadataserver/api.py 2016-06-09 08:10:02 +0000
+++ src/metadataserver/api.py 2016-07-22 16:13:17 +0000
@@ -352,7 +352,6 @@
352 # At the end of a top-level event, we change the node status.352 # At the end of a top-level event, we change the node status.
353 if _is_top_level(activity_name) and event_type == 'finish':353 if _is_top_level(activity_name) and event_type == 'finish':
354 if node.status == NODE_STATUS.COMMISSIONING:354 if node.status == NODE_STATUS.COMMISSIONING:
355 node.status_expires = None
356 if result in ['FAIL', 'FAILURE']:355 if result in ['FAIL', 'FAILURE']:
357 node.status = NODE_STATUS.FAILED_COMMISSIONING356 node.status = NODE_STATUS.FAILED_COMMISSIONING
358357
@@ -370,6 +369,7 @@
370 if node.node_type == NODE_TYPE.MACHINE and node.status in [369 if node.node_type == NODE_TYPE.MACHINE and node.status in [
371 NODE_STATUS.READY,370 NODE_STATUS.READY,
372 NODE_STATUS.FAILED_COMMISSIONING]:371 NODE_STATUS.FAILED_COMMISSIONING]:
372 node.status_expires = None
373 node.owner = None373 node.owner = None
374 node.error = 'failed: %s' % description374 node.error = 'failed: %s' % description
375375
@@ -518,8 +518,10 @@
518 if node.power_type != "mscm":518 if node.power_type != "mscm":
519 store_node_power_parameters(node, request)519 store_node_power_parameters(node, request)
520520
521 node.status_expires = None
522 target_status = self.signaling_statuses.get(status)521 target_status = self.signaling_statuses.get(status)
522 if target_status in [NODE_STATUS.FAILED_COMMISSIONING,
523 NODE_STATUS.READY]:
524 node.status_expires = None
523 # Recalculate tags when commissioning ends.525 # Recalculate tags when commissioning ends.
524 if target_status == NODE_STATUS.READY:526 if target_status == NODE_STATUS.READY:
525 populate_tags_for_single_node(Tag.objects.all(), node)527 populate_tags_for_single_node(Tag.objects.all(), node)

Subscribers

People subscribed via source and target branches