Merge lp:~jtv/maas/1.5-bug-1354511 into lp:maas/1.5

Proposed by Jeroen T. Vermeulen on 2014-08-08
Status: Merged
Approved by: Jeroen T. Vermeulen on 2014-08-08
Approved revision: 2296
Merged at revision: 2296
Proposed branch: lp:~jtv/maas/1.5-bug-1354511
Merge into: lp:maas/1.5
Diff against target: 162 lines (+76/-21)
2 files modified
src/maasserver/api.py (+28/-9)
src/maasserver/tests/test_api_commissioning.py (+48/-12)
To merge this branch: bzr merge lp:~jtv/maas/1.5-bug-1354511
Reviewer Review Type Date Requested Status
Jeroen T. Vermeulen (community) Approve on 2014-08-08
Review via email: mp+230177@code.launchpad.net

Commit message

Backport trunk r2668: When timing out nodes that haven't finished commissioning, compare the last-update timestamp to the database clock (from which it was also generated), not to the Python-side clock. Timezone skew between the two could cause nodes to time out immediately, or hours late.

Description of the change

Straight backport. For self-approval.

Jeroen

To post a comment you must log in.
Jeroen T. Vermeulen (jtv) wrote :

Self-approving.

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'src/maasserver/api.py'
2--- src/maasserver/api.py 2014-07-24 00:09:02 +0000
3+++ src/maasserver/api.py 2014-08-08 20:17:35 +0000
4@@ -97,10 +97,6 @@
5 b64encode,
6 )
7 from cStringIO import StringIO
8-from datetime import (
9- datetime,
10- timedelta,
11- )
12 from functools import partial
13 import httplib
14 from inspect import getdoc
15@@ -704,12 +700,35 @@
16 Anything that has been commissioning for longer than
17 settings.COMMISSIONING_TIMEOUT is moved into the FAILED_TESTS status.
18 """
19- interval = timedelta(minutes=settings.COMMISSIONING_TIMEOUT)
20- cutoff = datetime.now() - interval
21- query = Node.objects.filter(
22- status=NODE_STATUS.COMMISSIONING, updated__lte=cutoff)
23+ # Compute the cutoff time on the database, using the database's
24+ # clock to compare to the "updated" timestamp, also set from the
25+ # database's clock. Otherwise, a sufficient difference between the
26+ # two clocks (including timezone offset!) might cause commissioning to
27+ # "time out" immediately, or hours late.
28+ #
29+ # This timeout relies on nothing else updating the commissioning node
30+ # within the hour. Otherwise, the timestamp will be refreshed as a
31+ # side effect and timeout will be postponed.
32+ #
33+ # This query both identifies and updates the failed nodes. It
34+ # refreshes the "updated" timestamp, but does not run any Django-side
35+ # code associated with saving the nodes.
36+ params = {
37+ 'commissioning': NODE_STATUS.COMMISSIONING,
38+ 'failed_tests': NODE_STATUS.FAILED_TESTS,
39+ 'minutes': settings.COMMISSIONING_TIMEOUT
40+ }
41+ query = Node.objects.raw("""
42+ UPDATE maasserver_node
43+ SET
44+ status = %(failed_tests)s,
45+ updated = now()
46+ WHERE
47+ status = %(commissioning)s AND
48+ updated <= (now() - interval '%(minutes)f minutes')
49+ RETURNING *
50+ """ % params)
51 results = list(query)
52- query.update(status=NODE_STATUS.FAILED_TESTS)
53 # Note that Django doesn't call save() on updated nodes here,
54 # but I don't think anything requires its effects anyway.
55 return results
56
57=== modified file 'src/maasserver/tests/test_api_commissioning.py'
58--- src/maasserver/tests/test_api_commissioning.py 2014-02-01 04:04:37 +0000
59+++ src/maasserver/tests/test_api_commissioning.py 2014-08-08 20:17:35 +0000
60@@ -15,15 +15,12 @@
61 __all__ = []
62
63 from base64 import b64encode
64-from datetime import (
65- datetime,
66- timedelta,
67- )
68 import httplib
69 import json
70
71 from django.conf import settings
72 from django.core.urlresolvers import reverse
73+from django.db import connection
74 from maasserver.enum import NODE_STATUS
75 from maasserver.testing import reload_object
76 from maasserver.testing.api import APITestCase
77@@ -36,9 +33,22 @@
78 class TestCommissioningTimeout(MAASServerTestCase):
79 """Testing of commissioning timeout API."""
80
81+ def age_node(self, node, minutes, cursor=None):
82+ """Set back the update/creation timestamps on `node` by `minutes`."""
83+ if cursor is None:
84+ cursor = connection.cursor()
85+ cursor.execute("""
86+ UPDATE maasserver_node
87+ SET
88+ created = created - interval '%f minutes',
89+ updated = updated - interval '%f minutes'
90+ WHERE id = %d
91+ """ % (minutes, minutes, node.id))
92+
93 def test_check_with_no_action(self):
94 self.client_log_in()
95 node = factory.make_node(status=NODE_STATUS.READY)
96+ self.age_node(node, settings.COMMISSIONING_TIMEOUT + 100)
97 response = self.client.post(
98 reverse('nodes_handler'), {'op': 'check_commissioning'})
99 # Anything that's not commissioning should be ignored.
100@@ -49,8 +59,8 @@
101
102 def test_check_with_commissioning_but_not_expired_node(self):
103 self.client_log_in()
104- node = factory.make_node(
105- status=NODE_STATUS.COMMISSIONING)
106+ node = factory.make_node(status=NODE_STATUS.COMMISSIONING)
107+ self.age_node(node, settings.COMMISSIONING_TIMEOUT - 1)
108 response = self.client.post(
109 reverse('nodes_handler'), {'op': 'check_commissioning'})
110 node = reload_object(node)
111@@ -60,12 +70,8 @@
112
113 def test_check_with_commissioning_and_expired_node(self):
114 self.client_log_in()
115- # Have an interval 1 second longer than the timeout.
116- interval = timedelta(seconds=1, minutes=settings.COMMISSIONING_TIMEOUT)
117- updated_at = datetime.now() - interval
118- node = factory.make_node(
119- status=NODE_STATUS.COMMISSIONING, created=datetime.now(),
120- updated=updated_at)
121+ node = factory.make_node(status=NODE_STATUS.COMMISSIONING)
122+ self.age_node(node, settings.COMMISSIONING_TIMEOUT + 1)
123
124 response = self.client.post(
125 reverse('nodes_handler'), {'op': 'check_commissioning'})
126@@ -82,6 +88,36 @@
127 for response_node in json.loads(response.content)],
128 ))
129
130+ def test_check_ignores_timezone_skew_between_python_and_database(self):
131+ cursor = connection.cursor()
132+ # Set time zone, for the duration of the ongoing transaction.
133+ cursor.execute("SET LOCAL TIME ZONE +13")
134+ self.client_log_in()
135+ late_node = factory.make_node(status=NODE_STATUS.COMMISSIONING)
136+ self.age_node(
137+ late_node, settings.COMMISSIONING_TIMEOUT + 1, cursor=cursor)
138+ timely_node = factory.make_node(status=NODE_STATUS.COMMISSIONING)
139+ self.age_node(
140+ timely_node, settings.COMMISSIONING_TIMEOUT - 1, cursor=cursor)
141+
142+ response = self.client.post(
143+ reverse('nodes_handler'), {'op': 'check_commissioning'})
144+ self.assertEqual(
145+ (httplib.OK, [late_node.system_id]),
146+ (
147+ response.status_code,
148+ [
149+ response_node['system_id']
150+ for response_node in json.loads(response.content)
151+ ],
152+ ))
153+ self.assertEqual(
154+ NODE_STATUS.FAILED_TESTS,
155+ reload_object(late_node).status)
156+ self.assertEqual(
157+ NODE_STATUS.COMMISSIONING,
158+ reload_object(timely_node).status)
159+
160
161 class AdminCommissioningScriptsAPITest(MAASServerTestCase):
162 """Tests for `CommissioningScriptsHandler`."""

Subscribers

People subscribed via source and target branches

to all changes: