Merge lp:~jtv/maas/bug-1354511 into lp:~maas-committers/maas/trunk

Proposed by Jeroen T. Vermeulen
Status: Merged
Approved by: Jeroen T. Vermeulen
Approved revision: no longer in the source branch.
Merged at revision: 2668
Proposed branch: lp:~jtv/maas/bug-1354511
Merge into: lp:~maas-committers/maas/trunk
Diff against target: 162 lines (+76/-21)
2 files modified
src/maasserver/api.py (+28/-9)
src/maasserver/tests/test_api_commissioning.py (+48/-12)
To merge this branch: bzr merge lp:~jtv/maas/bug-1354511
Reviewer Review Type Date Requested Status
Blake Rouse (community) Approve
Review via email: mp+230160@code.launchpad.net

Commit message

When timing out nodes that haven't finished commissioning, compare the last-update timestamp to the database clock (from which it was also generated), not to the Python-side clock. Timezone skew between the two could cause nodes to time out immediately, or hours late.

Description of the change

The timezone skew was causing test failures for me (and, I gather, for Julian as well) as a result of recent travel. A database cluster set up while in one time zone might break tests run in another. The new test simulates this situation, and reproduces the problem, as long as your timezone offset from UTC is less than 12 hours.

I intend to backport this fix to the currently maintained versions; I believe those are at least 1.2 (in 12.04 LTS), 1.5 (in 14.04 LTS), and 1.6 (for 14.10).

Jeroen

To post a comment you must log in.
Revision history for this message
Blake Rouse (blake-rouse) wrote :

Looks good. No blockers, just two comments. Comments inline.

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'src/maasserver/api.py'
2--- src/maasserver/api.py 2014-08-05 15:48:54 +0000
3+++ src/maasserver/api.py 2014-08-08 19:55:26 +0000
4@@ -102,10 +102,6 @@
5 b64encode,
6 )
7 from cStringIO import StringIO
8-from datetime import (
9- datetime,
10- timedelta,
11- )
12 from functools import partial
13 import httplib
14 from inspect import getdoc
15@@ -858,12 +854,35 @@
16 Anything that has been commissioning for longer than
17 settings.COMMISSIONING_TIMEOUT is moved into the FAILED_TESTS status.
18 """
19- interval = timedelta(minutes=settings.COMMISSIONING_TIMEOUT)
20- cutoff = datetime.now() - interval
21- query = Node.objects.filter(
22- status=NODE_STATUS.COMMISSIONING, updated__lte=cutoff)
23+ # Compute the cutoff time on the database, using the database's
24+ # clock to compare to the "updated" timestamp, also set from the
25+ # database's clock. Otherwise, a sufficient difference between the
26+ # two clocks (including timezone offset!) might cause commissioning to
27+ # "time out" immediately, or hours late.
28+ #
29+ # This timeout relies on nothing else updating the commissioning node
30+ # within the hour. Otherwise, the timestamp will be refreshed as a
31+ # side effect and timeout will be postponed.
32+ #
33+ # This query both identifies and updates the failed nodes. It
34+ # refreshes the "updated" timestamp, but does not run any Django-side
35+ # code associated with saving the nodes.
36+ params = {
37+ 'commissioning': NODE_STATUS.COMMISSIONING,
38+ 'failed_tests': NODE_STATUS.FAILED_TESTS,
39+ 'minutes': settings.COMMISSIONING_TIMEOUT
40+ }
41+ query = Node.objects.raw("""
42+ UPDATE maasserver_node
43+ SET
44+ status = %(failed_tests)s,
45+ updated = now()
46+ WHERE
47+ status = %(commissioning)s AND
48+ updated <= (now() - interval '%(minutes)f minutes')
49+ RETURNING *
50+ """ % params)
51 results = list(query)
52- query.update(status=NODE_STATUS.FAILED_TESTS)
53 # Note that Django doesn't call save() on updated nodes here,
54 # but I don't think anything requires its effects anyway.
55 return results
56
57=== modified file 'src/maasserver/tests/test_api_commissioning.py'
58--- src/maasserver/tests/test_api_commissioning.py 2014-08-08 19:04:18 +0000
59+++ src/maasserver/tests/test_api_commissioning.py 2014-08-08 19:55:26 +0000
60@@ -15,15 +15,12 @@
61 __all__ = []
62
63 from base64 import b64encode
64-from datetime import (
65- datetime,
66- timedelta,
67- )
68 import httplib
69 import json
70
71 from django.conf import settings
72 from django.core.urlresolvers import reverse
73+from django.db import connection
74 from maasserver.enum import NODE_STATUS
75 from maasserver.testing.api import APITestCase
76 from maasserver.testing.factory import factory
77@@ -36,9 +33,22 @@
78 class TestCommissioningTimeout(MAASServerTestCase):
79 """Testing of commissioning timeout API."""
80
81+ def age_node(self, node, minutes, cursor=None):
82+ """Set back the update/creation timestamps on `node` by `minutes`."""
83+ if cursor is None:
84+ cursor = connection.cursor()
85+ cursor.execute("""
86+ UPDATE maasserver_node
87+ SET
88+ created = created - interval '%f minutes',
89+ updated = updated - interval '%f minutes'
90+ WHERE id = %d
91+ """ % (minutes, minutes, node.id))
92+
93 def test_check_with_no_action(self):
94 self.client_log_in()
95 node = factory.make_node(status=NODE_STATUS.READY)
96+ self.age_node(node, settings.COMMISSIONING_TIMEOUT + 100)
97 response = self.client.post(
98 reverse('nodes_handler'), {'op': 'check_commissioning'})
99 # Anything that's not commissioning should be ignored.
100@@ -49,8 +59,8 @@
101
102 def test_check_with_commissioning_but_not_expired_node(self):
103 self.client_log_in()
104- node = factory.make_node(
105- status=NODE_STATUS.COMMISSIONING)
106+ node = factory.make_node(status=NODE_STATUS.COMMISSIONING)
107+ self.age_node(node, settings.COMMISSIONING_TIMEOUT - 1)
108 response = self.client.post(
109 reverse('nodes_handler'), {'op': 'check_commissioning'})
110 node = reload_object(node)
111@@ -60,12 +70,8 @@
112
113 def test_check_with_commissioning_and_expired_node(self):
114 self.client_log_in()
115- # Have an interval 1 second longer than the timeout.
116- interval = timedelta(seconds=1, minutes=settings.COMMISSIONING_TIMEOUT)
117- updated_at = datetime.now() - interval
118- node = factory.make_node(
119- status=NODE_STATUS.COMMISSIONING, created=datetime.now(),
120- updated=updated_at)
121+ node = factory.make_node(status=NODE_STATUS.COMMISSIONING)
122+ self.age_node(node, settings.COMMISSIONING_TIMEOUT + 1)
123
124 response = self.client.post(
125 reverse('nodes_handler'), {'op': 'check_commissioning'})
126@@ -82,6 +88,36 @@
127 for response_node in json.loads(response.content)],
128 ))
129
130+ def test_check_ignores_timezone_skew_between_python_and_database(self):
131+ cursor = connection.cursor()
132+ # Set time zone, for the duration of the ongoing transaction.
133+ cursor.execute("SET LOCAL TIME ZONE +13")
134+ self.client_log_in()
135+ late_node = factory.make_node(status=NODE_STATUS.COMMISSIONING)
136+ self.age_node(
137+ late_node, settings.COMMISSIONING_TIMEOUT + 1, cursor=cursor)
138+ timely_node = factory.make_node(status=NODE_STATUS.COMMISSIONING)
139+ self.age_node(
140+ timely_node, settings.COMMISSIONING_TIMEOUT - 1, cursor=cursor)
141+
142+ response = self.client.post(
143+ reverse('nodes_handler'), {'op': 'check_commissioning'})
144+ self.assertEqual(
145+ (httplib.OK, [late_node.system_id]),
146+ (
147+ response.status_code,
148+ [
149+ response_node['system_id']
150+ for response_node in json.loads(response.content)
151+ ],
152+ ))
153+ self.assertEqual(
154+ NODE_STATUS.FAILED_TESTS,
155+ reload_object(late_node).status)
156+ self.assertEqual(
157+ NODE_STATUS.COMMISSIONING,
158+ reload_object(timely_node).status)
159+
160
161 class AdminCommissioningScriptsAPITest(MAASServerTestCase):
162 """Tests for `CommissioningScriptsHandler`."""