Merge ~mpontillo/maas:surface-post-commissioning-errors--bug-1718517 into maas:master

Proposed by Mike Pontillo
Status: Merged
Approved by: Mike Pontillo
Approved revision: c64f6175f6bf9167e95c76f19611e11b9eb846bd
Merge reported by: MAAS Lander
Merged at revision: not available
Proposed branch: ~mpontillo/maas:surface-post-commissioning-errors--bug-1718517
Merge into: maas:master
Diff against target: 273 lines (+162/-12)
4 files modified
src/metadataserver/api.py (+69/-9)
src/metadataserver/models/scriptresult.py (+9/-2)
src/metadataserver/models/tests/test_scriptresult.py (+25/-1)
src/metadataserver/tests/test_api.py (+59/-0)
Reviewer Review Type Date Requested Status
Lee Trager (community) Approve
Review via email: mp+332881@code.launchpad.net

Commit message

LP: #1718517 - Handle and properly surface post-processing errors during commissioning.

Description of the change

E2E tested throughout development - working well for me.

To post a comment you must log in.
Revision history for this message
Lee Trager (ltrager) wrote :

LGTM thanks for fixing this!

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1diff --git a/src/metadataserver/api.py b/src/metadataserver/api.py
2index af4f4af..18153ea 100644
3--- a/src/metadataserver/api.py
4+++ b/src/metadataserver/api.py
5@@ -71,7 +71,10 @@ from maasserver.preseed import (
6 get_preseed,
7 )
8 from maasserver.utils import find_rack_controller
9-from maasserver.utils.orm import get_one
10+from maasserver.utils.orm import (
11+ get_one,
12+ is_retryable_failure,
13+)
14 from metadataserver import logger
15 from metadataserver.builtin_scripts.hooks import NODE_INFO_SCRIPTS
16 from metadataserver.enum import (
17@@ -94,9 +97,13 @@ from provisioningserver.events import (
18 EVENT_DETAILS,
19 EVENT_TYPES,
20 )
21+from provisioningserver.logger import LegacyLogger
22 import yaml
23
24
25+log = LegacyLogger()
26+
27+
28 class UnknownMetadataVersion(MAASAPINotFound):
29 """Not a known metadata version."""
30
31@@ -364,6 +371,44 @@ class StatusHandler(MetadataViewHandler):
32 'be used instead.')
33
34
35+def try_or_log_event(
36+ machine, signal_status, error_message, func, *args, **kwargs):
37+ """
38+ Attempts to run the specified function, related to the specified node and
39+ signal status. Will log the specified error, and create a node event,
40+ if the function fails.
41+
42+ If the function raises a retryable failure, will re-raise the exception so
43+ that a retry can be attempted.
44+
45+ :param machine: The machine related to the attempted action. Will be used
46+ in order to log an event, if the function raises an exception.
47+ :param signal_status: The initial SIGNAL_STATUS, which will be returned
48+ as-is if no exception occurs. If an exception occurs,
49+ SIGNAL_STATUS.FAILED will be returned.
50+ :param error_message: The error message for the log (and node event log) if
51+ an exception occurs.
52+ :param func: The function which will be attempted
53+ :param args: Arguments to pass to the function to be attempted.
54+ :param kwargs: Keyword arguments to pass to the function to be attempted.
55+ :return:
56+ """
57+ try:
58+ func(*args, **kwargs)
59+ except BaseException as e:
60+ if is_retryable_failure(e):
61+ # Not the fault of the post-processing function, so
62+ # re-raise so that the retry mechanism does its job.
63+ raise
64+ log.err(None, error_message)
65+ Event.objects.create_node_event(
66+ system_id=machine.system_id,
67+ event_type=EVENT_TYPES.SCRIPT_RESULT_ERROR,
68+ event_description=error_message)
69+ signal_status = SIGNAL_STATUS.FAILED
70+ return signal_status
71+
72+
73 class VersionIndexHandler(MetadataViewHandler):
74 """Listing for a given metadata version."""
75 create = update = delete = None
76@@ -488,10 +533,14 @@ class VersionIndexHandler(MetadataViewHandler):
77 # Commissioning was successful, setup the default storage layout
78 # and the initial networking configuration for the node.
79 if status in (SIGNAL_STATUS.TESTING, SIGNAL_STATUS.OK):
80- # XXX 2016-05-10 ltrager, LP:1580405 - Exceptions raised
81- # here are not logged or shown to the user.
82- node.set_default_storage_layout()
83- node.set_initial_networking_configuration()
84+ status = try_or_log_event(
85+ node, status,
86+ "Failed to set default storage layout.",
87+ node.set_default_storage_layout)
88+ status = try_or_log_event(
89+ node, status,
90+ "Failed to set default networking configuration.",
91+ node.set_initial_networking_configuration)
92
93 # XXX 2014-10-21 newell, bug=1382075
94 # Auto detection for IPMI tries to save power parameters
95@@ -516,11 +565,22 @@ class VersionIndexHandler(MetadataViewHandler):
96 node.current_testing_script_set.regenerate()
97
98 if target_status in [NODE_STATUS.READY, NODE_STATUS.TESTING]:
99- # Recalculate tags when commissioning ends.
100- populate_tags_for_single_node(Tag.objects.all(), node)
101- elif (target_status == NODE_STATUS.FAILED_COMMISSIONING and
102+ # Commissioning has ended. Check if any scripts failed during
103+ # post-processing; if so, the commissioning counts as a failure.
104+ qs = node.current_commissioning_script_set.scriptresult_set.filter(
105+ status=SCRIPT_STATUS.FAILED)
106+ if qs.count() > 0:
107+ target_status = NODE_STATUS.FAILED_COMMISSIONING
108+ else:
109+ # Recalculate tags when commissioning ends.
110+ try_or_log_event(
111+ node, status, "Failed to update tags.",
112+ populate_tags_for_single_node,
113+ Tag.objects.all(), node)
114+
115+ if (target_status == NODE_STATUS.FAILED_COMMISSIONING and
116 node.current_testing_script_set is not None):
117- # If commissioning failed testing doesn't run, mark any pending
118+ # If commissioning failed, testing doesn't run; mark any pending
119 # scripts as aborted.
120 qs = node.current_testing_script_set.scriptresult_set.filter(
121 status=SCRIPT_STATUS.PENDING)
122diff --git a/src/metadataserver/models/scriptresult.py b/src/metadataserver/models/scriptresult.py
123index 9b9195e..7620ddd 100644
124--- a/src/metadataserver/models/scriptresult.py
125+++ b/src/metadataserver/models/scriptresult.py
126@@ -255,10 +255,17 @@ class ScriptResult(CleanSave, TimestampedModel):
127 if (self.script_set.result_type == RESULT_TYPE.COMMISSIONING and
128 self.name in NODE_INFO_SCRIPTS):
129 post_process_hook = NODE_INFO_SCRIPTS[self.name]['hook']
130- post_process_hook(
131+ err = (
132+ "%s(%s): commissioning script '%s' failed during "
133+ "post-processing." % (
134+ self.script_set.node.fqdn,
135+ self.script_set.node.system_id, self.name))
136+ # Circular imports.
137+ from metadataserver.api import try_or_log_event
138+ try_or_log_event(
139+ self.script_set.node, None, err, post_process_hook,
140 node=self.script_set.node, output=self.stdout,
141 exit_status=self.exit_status)
142-
143 self.save()
144
145 @property
146diff --git a/src/metadataserver/models/tests/test_scriptresult.py b/src/metadataserver/models/tests/test_scriptresult.py
147index baa628f..ed8f41e 100644
148--- a/src/metadataserver/models/tests/test_scriptresult.py
149+++ b/src/metadataserver/models/tests/test_scriptresult.py
150@@ -19,7 +19,10 @@ from maasserver.models import (
151 from maasserver.testing.factory import factory
152 from maasserver.testing.testcase import MAASServerTestCase
153 from maasserver.utils.orm import reload_object
154-from maastesting.matchers import MockCalledOnceWith
155+from maastesting.matchers import (
156+ DocTestMatches,
157+ MockCalledOnceWith,
158+)
159 from metadataserver.enum import (
160 RESULT_TYPE,
161 SCRIPT_STATUS,
162@@ -336,6 +339,27 @@ class TestScriptResult(MAASServerTestCase):
163 MockCalledOnceWith(
164 node=script_set.node, output=b'', exit_status=exit_status))
165
166+ def test_store_result_logs_event_upon_hook_failure(self):
167+ script_set = factory.make_ScriptSet(
168+ result_type=RESULT_TYPE.COMMISSIONING)
169+ script_result = factory.make_ScriptResult(
170+ script_set=script_set, status=SCRIPT_STATUS.RUNNING)
171+ exit_status = random.randint(0, 255)
172+
173+ def _raise():
174+ raise Exception()
175+
176+ scriptresult_module.NODE_INFO_SCRIPTS[script_result.name] = {
177+ 'hook': _raise,
178+ }
179+ self.addCleanup(
180+ scriptresult_module.NODE_INFO_SCRIPTS.pop, script_result.name)
181+ script_result.store_result(exit_status)
182+ expected_event = Event.objects.last()
183+ self.assertThat(
184+ expected_event.description,
185+ DocTestMatches("...failed during post-processing."))
186+
187 def test_save_stores_start_time(self):
188 script_result = factory.make_ScriptResult(status=SCRIPT_STATUS.PENDING)
189 script_result.status = SCRIPT_STATUS.RUNNING
190diff --git a/src/metadataserver/tests/test_api.py b/src/metadataserver/tests/test_api.py
191index 4bb1a83..bc6c8c7 100644
192--- a/src/metadataserver/tests/test_api.py
193+++ b/src/metadataserver/tests/test_api.py
194@@ -64,6 +64,7 @@ from maasserver.testing.testcase import (
195 from maasserver.testing.testclient import MAASSensibleOAuthClient
196 from maasserver.utils.orm import reload_object
197 from maastesting.matchers import (
198+ DocTestMatches,
199 MockCalledOnceWith,
200 MockNotCalled,
201 )
202@@ -990,6 +991,23 @@ class TestInstallingAPI(MAASServerTestCase):
203 self.assertEqual(NODE_STATUS.DEPLOYING, reload_object(node).status)
204 self.assertThat(populate_tags_for_single_node, MockNotCalled())
205
206+ def test_tag_population_failure_logs_event(self):
207+ populate_tags_for_single_node = self.patch(
208+ api, "populate_tags_for_single_node")
209+ populate_tags_for_single_node.side_effect = Exception
210+ node = factory.make_Node(
211+ interface=True, status=NODE_STATUS.COMMISSIONING,
212+ with_empty_script_sets=True)
213+ client = make_node_client(node=node)
214+ response = call_signal(client, status=SIGNAL_STATUS.OK)
215+ self.assertEqual(http.client.OK, response.status_code)
216+ self.assertEqual(
217+ NODE_STATUS.READY, reload_object(node).status)
218+ expected_event = Event.objects.last()
219+ self.assertThat(
220+ expected_event.description,
221+ DocTestMatches("Failed to update tags."))
222+
223 def test_signaling_installation_success_is_idempotent(self):
224 node = factory.make_Node(
225 status=NODE_STATUS.DEPLOYING, with_empty_script_sets=True)
226@@ -1914,6 +1932,47 @@ class TestCommissioningAPI(MAASServerTestCase):
227 Node.set_default_storage_layout,
228 MockCalledOnceWith(node))
229
230+ def test_signal_fails_commissioning_and_logs_event_if_storage_config_fails(
231+ self):
232+ mock = self.patch_autospec(Node, "set_default_storage_layout")
233+ mock.side_effect = Exception
234+ node = factory.make_Node(
235+ status=NODE_STATUS.COMMISSIONING, with_empty_script_sets=True)
236+ client = make_node_client(node)
237+ response = call_signal(
238+ client, status=SIGNAL_STATUS.OK, script_result=0)
239+ self.assertEqual(http.client.OK, response.status_code)
240+ self.assertEqual(
241+ NODE_STATUS.FAILED_COMMISSIONING, reload_object(node).status)
242+ self.assertThat(
243+ Node.set_default_storage_layout,
244+ MockCalledOnceWith(node))
245+ expected_event = Event.objects.last()
246+ self.assertThat(
247+ expected_event.description,
248+ DocTestMatches("Failed to set default storage layout."))
249+
250+ def test_signal_fails_commissioning_and_logs_event_if_network_config_fails(
251+ self):
252+ mock = self.patch_autospec(
253+ Node, "set_initial_networking_configuration")
254+ mock.side_effect = Exception
255+ node = factory.make_Node(
256+ status=NODE_STATUS.COMMISSIONING, with_empty_script_sets=True)
257+ client = make_node_client(node)
258+ response = call_signal(
259+ client, status=SIGNAL_STATUS.OK, script_result=0)
260+ self.assertEqual(http.client.OK, response.status_code)
261+ self.assertEqual(
262+ NODE_STATUS.FAILED_COMMISSIONING, reload_object(node).status)
263+ self.assertThat(
264+ Node.set_initial_networking_configuration,
265+ MockCalledOnceWith(node))
266+ expected_event = Event.objects.last()
267+ self.assertThat(
268+ expected_event.description,
269+ DocTestMatches("Failed to set default networking configuration."))
270+
271 def test_signal_sets_default_storage_layout_if_TESTING(self):
272 self.patch_autospec(Node, "set_default_storage_layout")
273 node = factory.make_Node(

Subscribers

People subscribed via source and target branches