Merge lp:~blake-rouse/maas/reap-old-regiond-workers-2.1 into lp:maas/2.1

Proposed by Blake Rouse
Status: Merged
Approved by: Blake Rouse
Approved revision: no longer in the source branch.
Merged at revision: 5587
Proposed branch: lp:~blake-rouse/maas/reap-old-regiond-workers-2.1
Merge into: lp:maas/2.1
Diff against target: 196 lines (+94/-10)
6 files modified
src/maasserver/models/node.py (+5/-7)
src/maasserver/models/tests/test_node.py (+5/-3)
src/maasserver/rpc/regionservice.py (+13/-0)
src/maasserver/rpc/tests/test_regionservice.py (+29/-0)
src/provisioningserver/utils/ps.py (+14/-0)
src/provisioningserver/utils/tests/test_ps.py (+28/-0)
To merge this branch: bzr merge lp:~blake-rouse/maas/reap-old-regiond-workers-2.1
Reviewer Review Type Date Requested Status
Blake Rouse (community) Approve
Review via email: mp+317524@code.launchpad.net

Commit message

Backport r5728 + r5737: Reap dead sibling regiond workers. Import message of degraded rackd.

This fixes users seeing bad reports of bad connections when dead regiond workers are stuck in the database for a longer period of time. As long as one regiond process is running on that region controller it will ensure that its other siblings are not in the database unless they are actually running.

To post a comment you must log in.
Revision history for this message
Blake Rouse (blake-rouse) wrote :

Self-approving backport.

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'src/maasserver/models/node.py'
2--- src/maasserver/models/node.py 2016-12-07 11:26:49 +0000
3+++ src/maasserver/models/node.py 2017-02-16 16:52:53 +0000
4@@ -4356,15 +4356,13 @@
5 Service.objects.update_service_for(
6 self, "rackd", SERVICE_STATUS.RUNNING)
7 else:
8- # Not connected to all regions.
9- missing_regions = set(
10- process.region
11- for process in missing_processes
12- )
13+ # Calculate precentage of connection.
14+ percentage = ((dead_regions * 4) + len(missing_processes)) / (
15+ RegionController.objects.count() * 4)
16 Service.objects.update_service_for(
17 self, "rackd", SERVICE_STATUS.DEGRADED,
18- "Missing connections to %d region controller(s)." % (
19- dead_regions + len(missing_regions)))
20+ "{:.0%} connected to region controllers.".format(
21+ percentage))
22
23 def get_image_sync_status(self, boot_images=None):
24 """Return the status of the boot image import process."""
25
26=== modified file 'src/maasserver/models/tests/test_node.py'
27--- src/maasserver/models/tests/test_node.py 2016-12-07 11:26:49 +0000
28+++ src/maasserver/models/tests/test_node.py 2017-02-16 16:52:53 +0000
29@@ -8419,13 +8419,15 @@
30 RegionRackRPCConnection.objects.create(
31 endpoint=connected_endpoint, rack_controller=rack_controller)
32 rack_controller.update_rackd_status()
33+ percentage = (
34+ (len(regions_without_processes) * 4) + 1) / (
35+ (len(regions_without_processes) + len(regions_with_processes)) * 4)
36 self.assertThat(
37 Service.objects.get(node=rack_controller, name="rackd"),
38 MatchesStructure.byEquality(
39 status=SERVICE_STATUS.DEGRADED, status_info=(
40- "Missing connections to %d region controller(s)." % (
41- len(regions_with_processes) +
42- len(regions_without_processes)))))
43+ "{:.0%} connected to region controllers.".format(
44+ percentage))))
45
46 fake_images = [
47 {
48
49=== modified file 'src/maasserver/rpc/regionservice.py'
50--- src/maasserver/rpc/regionservice.py 2016-10-28 15:58:32 +0000
51+++ src/maasserver/rpc/regionservice.py 2017-02-16 16:52:53 +0000
52@@ -83,6 +83,7 @@
53 get_all_interface_addresses,
54 resolves_to_loopback_address,
55 )
56+from provisioningserver.utils.ps import is_pid_running
57 from provisioningserver.utils.twisted import (
58 asynchronous,
59 call,
60@@ -1326,6 +1327,18 @@
61 RegionControllerProcess.objects.filter(
62 updated__lte=remove_before_time).delete()
63
64+ # Remove any processes that cannot be identified as still running
65+ # on this region controller. This helps remove the need to wait
66+ # 90 seconds to reap the old processes (only on region controllers
67+ # where another regiond process is still running).
68+ sibling_processes = (
69+ RegionControllerProcess.objects
70+ .filter(region=region_obj)
71+ .exclude(id=self.process_id))
72+ for sibling_process in sibling_processes:
73+ if not is_pid_running(sibling_process.pid):
74+ sibling_process.delete()
75+
76 # Update all endpoints for this process.
77 previous_endpoint_ids = set(
78 RegionControllerProcessEndpoint.objects.filter(
79
80=== modified file 'src/maasserver/rpc/tests/test_regionservice.py'
81--- src/maasserver/rpc/tests/test_regionservice.py 2016-10-28 15:58:32 +0000
82+++ src/maasserver/rpc/tests/test_regionservice.py 2017-02-16 16:52:53 +0000
83@@ -1424,6 +1424,34 @@
84 self.assertIsNone(reload_object(old_region_process))
85 self.assertIsNone(reload_object(old_other_region_process))
86
87+ def test_update_removes_dead_sibling_processes(self):
88+ advertising = RegionAdvertising.promote()
89+
90+ region = RegionController.objects.get(system_id=advertising.region_id)
91+ [my_process] = region.processes.all()
92+
93+ running_pid = randint(1, 10)
94+ running_sibling = RegionControllerProcess.objects.create(
95+ region=region, pid=running_pid)
96+ dead_pid = randint(11, 20)
97+ dead_sibling = RegionControllerProcess.objects.create(
98+ region=region, pid=dead_pid)
99+
100+ def is_running(pid):
101+ if pid == running_pid:
102+ return True
103+ elif pid == dead_pid:
104+ return False
105+ else:
106+ raise ValueError("Unknown PID.")
107+
108+ self.patch(regionservice, "is_pid_running").side_effect = is_running
109+
110+ advertising.update(self.make_addresses())
111+
112+ self.assertIsNotNone(reload_object(running_sibling))
113+ self.assertIsNone(reload_object(dead_sibling))
114+
115 def test_update_updates_updated_time_on_region_and_process(self):
116 current_time = now()
117 self.patch(timestampedmodel, "now").return_value = current_time
118@@ -1494,6 +1522,7 @@
119
120 region = RegionController.objects.get(system_id=advertising.region_id)
121 [process] = region.processes.all()
122+ self.patch(regionservice, "is_pid_running").return_value = True
123
124 # Make 3 more processes.
125 for _ in range(3):
126
127=== modified file 'src/provisioningserver/utils/ps.py'
128--- src/provisioningserver/utils/ps.py 2016-03-28 13:54:47 +0000
129+++ src/provisioningserver/utils/ps.py 2017-02-16 16:52:53 +0000
130@@ -17,6 +17,20 @@
131 )
132
133
134+def is_pid_running(pid):
135+ """Return True if the `pid` is running."""
136+ try:
137+ os.kill(pid, 0)
138+ except ProcessLookupError:
139+ return False
140+ except PermissionError:
141+ return True
142+ except OSError:
143+ return False
144+ else:
145+ return True
146+
147+
148 def is_pid_in_container(pid, proc_path="/proc"):
149 """Return True if the `pid` is running in a container.
150
151
152=== modified file 'src/provisioningserver/utils/tests/test_ps.py'
153--- src/provisioningserver/utils/tests/test_ps.py 2016-03-28 13:54:47 +0000
154+++ src/provisioningserver/utils/tests/test_ps.py 2017-02-16 16:52:53 +0000
155@@ -16,6 +16,7 @@
156 from provisioningserver.utils.ps import (
157 get_running_pids_with_command,
158 is_pid_in_container,
159+ is_pid_running,
160 running_in_container,
161 )
162 from provisioningserver.utils.shell import ExternalProcessError
163@@ -63,6 +64,33 @@
164 """)
165
166
167+class TestIsPIDRunning(MAASTestCase):
168+
169+ scenarios = (
170+ ("running", {
171+ "result": True,
172+ "exception": None,
173+ }),
174+ ("lookup-error", {
175+ "result": False,
176+ "exception": ProcessLookupError(),
177+ }),
178+ ("permission-error", {
179+ "result": True,
180+ "exception": PermissionError(),
181+ }),
182+ ("os-error", {
183+ "result": False,
184+ "exception": OSError(),
185+ })
186+ )
187+
188+ def test__result(self):
189+ self.patch(ps_module.os, "kill").side_effect = self.exception
190+ self.assertEquals(
191+ self.result, is_pid_running(random.randint(100, 200)))
192+
193+
194 class TestIsPIDInContainer(MAASTestCase):
195
196 scenarios = (

Subscribers

People subscribed via source and target branches

to all changes: