Merge lp:~raharper/curtin/trunk.md-retry-stop-resync into lp:~curtin-dev/curtin/trunk

Proposed by Ryan Harper
Status: Merged
Merged at revision: 497
Proposed branch: lp:~raharper/curtin/trunk.md-retry-stop-resync
Merge into: lp:~curtin-dev/curtin/trunk
Diff against target: 315 lines (+214/-23)
3 files modified
curtin/block/__init__.py (+0/-3)
curtin/block/mdadm.py (+63/-10)
tests/unittests/test_block_mdadm.py (+151/-10)
To merge this branch: bzr merge lp:~raharper/curtin/trunk.md-retry-stop-resync
Reviewer Review Type Date Requested Status
Server Team CI bot continuous-integration Approve
Scott Moser (community) Approve
Review via email: mp+323765@code.launchpad.net

Description of the change

mdadm_stop: Add retry and additional steps to halt a resync

In our vmtest (as well as seen in real deployments) after a reboot of a system with mdadm arrays already configured on disk, md may start a 'resync' operation to bring an array online. The resync operation sometimes prevents mdadm from stopping the array. This patch applies a few known methods to halt the resync, namely writing 'idle' to md/sync_action and changing the sync_max and sync_min values to 0.

Add additional unittests to exercise the retry path now taken to help ensure we can reliably shutdown a mdadm array even while in 'resync' state.

To post a comment you must log in.
Revision history for this message
Ryan Harper (raharper) wrote :
Revision history for this message
Server Team CI bot (server-team-bot) wrote :
review: Approve (continuous-integration)
Revision history for this message
Scott Moser (smoser) wrote :

one small comment. other than that, i approve. the proof is really in the racey mdadm pudding.

Revision history for this message
Scott Moser (smoser) :
review: Approve
502. By Ryan Harper

Add docstring as requested

Revision history for this message
Server Team CI bot (server-team-bot) wrote :
review: Approve (continuous-integration)
503. By Ryan Harper

merge from trunk

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'curtin/block/__init__.py'
2--- curtin/block/__init__.py 2017-03-21 18:05:24 +0000
3+++ curtin/block/__init__.py 2017-05-15 15:47:54 +0000
4@@ -94,7 +94,6 @@
5 # cciss devices need to have 'cciss!' prepended
6 if path.startswith('/dev/cciss'):
7 dev_kname = 'cciss!' + dev_kname
8- LOG.debug("path_to_kname input: '{}' output: '{}'".format(path, dev_kname))
9 return dev_kname
10
11
12@@ -106,7 +105,6 @@
13 # if given something that is already a dev path, return it
14 if os.path.exists(kname) and is_valid_device(kname):
15 path = kname
16- LOG.debug("kname_to_path input: '{}' output: '{}'".format(kname, path))
17 return os.path.realpath(path)
18 # adding '/dev' to path is not sufficient to handle cciss devices and
19 # possibly other special devices which have not been encountered yet
20@@ -114,7 +112,6 @@
21 # make sure path we get is correct
22 if not (os.path.exists(path) and is_valid_device(path)):
23 raise OSError('could not get path to dev from kname: {}'.format(kname))
24- LOG.debug("kname_to_path input: '{}' output: '{}'".format(kname, path))
25 return path
26
27
28
29=== modified file 'curtin/block/mdadm.py'
30--- curtin/block/mdadm.py 2017-05-03 02:05:22 +0000
31+++ curtin/block/mdadm.py 2017-05-15 15:47:54 +0000
32@@ -26,6 +26,7 @@
33 import re
34 import shlex
35 from subprocess import CalledProcessError
36+import time
37
38 from curtin.block import (dev_short, dev_path, is_valid_device, sys_block_path)
39 from curtin.block import get_holders
40@@ -253,13 +254,59 @@
41 return data
42
43
44-def mdadm_stop(devpath):
45+def mdadm_stop(devpath, retries=None):
46 assert_valid_devpath(devpath)
47+ if not retries:
48+ retries = [0.2] * 60
49+
50+ sync_action = md_sysfs_attr_path(devpath, 'sync_action')
51+ sync_max = md_sysfs_attr_path(devpath, 'sync_max')
52+ sync_min = md_sysfs_attr_path(devpath, 'sync_min')
53
54 LOG.info("mdadm stopping: %s" % devpath)
55- out, err = util.subp(["mdadm", "--manage", "--stop", devpath],
56- capture=True)
57- LOG.debug("mdadm stop:\n%s\n%s", out, err)
58+ for (attempt, wait) in enumerate(retries):
59+ try:
60+ LOG.debug('mdadm: stop on %s attempt %s', devpath, attempt)
61+ # An array in 'resync' state may not be stoppable, attempt to
62+ # cancel an ongoing resync
63+ val = md_sysfs_attr(devpath, 'sync_action')
64+ LOG.debug('%s/sync_max = %s', sync_action, val)
65+ if val != "idle":
66+ LOG.debug("mdadm: setting array sync_action=idle")
67+ util.write_file(sync_action, content="idle")
68+
69+ # Setting the sync_{max,min} may can help prevent the array from
70+ # changing back to 'resync' which may prevent the array from being
71+ # stopped
72+ val = md_sysfs_attr(devpath, 'sync_max')
73+ LOG.debug('%s/sync_max = %s', sync_max, val)
74+ if val != "0":
75+ LOG.debug("mdadm: setting array sync_{min,max}=0")
76+ try:
77+ util.write_file(sync_max, content="0")
78+ util.write_file(sync_min, content="0")
79+ except IOError:
80+ LOG.warning('mdadm: failed to set sync_{max,min} values')
81+ pass
82+
83+ # one wonders why this command doesn't do any of the above itself?
84+ out, err = util.subp(["mdadm", "--manage", "--stop", devpath],
85+ capture=True)
86+ LOG.debug("mdadm stop command output:\n%s\n%s", out, err)
87+ LOG.info("mdadm: successfully stopped %s after %s attempt(s)",
88+ devpath, attempt+1)
89+ return
90+
91+ except util.ProcessExecutionError:
92+ LOG.warning("mdadm stop failed, retrying ")
93+ if os.path.isfile('/proc/mdstat'):
94+ LOG.critical("/proc/mdstat:\n%s",
95+ util.load_file('/proc/mdstat'))
96+ LOG.debug("mdadm: stop failed, retrying in %s seconds", wait)
97+ time.sleep(wait)
98+ pass
99+
100+ raise OSError('Failed to stop mdadm device %s', devpath)
101
102
103 def mdadm_remove(devpath):
104@@ -341,16 +388,22 @@
105 raise ValueError("Invalid devpath: '%s'" % devpath)
106
107
108+def md_sysfs_attr_path(md_devname, attrname):
109+ """ Return the path to a md device attribute under the 'md' dir """
110+ # build /sys/class/block/<md_short>/md
111+ sysmd = sys_block_path(md_devname, "md")
112+
113+ # append attrname
114+ return os.path.join(sysmd, attrname)
115+
116+
117 def md_sysfs_attr(md_devname, attrname):
118+ """ Return the attribute str of an md device found under the 'md' dir """
119+ attrdata = ''
120 if not valid_mdname(md_devname):
121 raise ValueError('Invalid md devicename: [{}]'.format(md_devname))
122
123- attrdata = ''
124- # /sys/class/block/<md_short>/md
125- sysmd = sys_block_path(md_devname, "md")
126-
127- # /sys/class/block/<md_short>/md/attrname
128- sysfs_attr_path = os.path.join(sysmd, attrname)
129+ sysfs_attr_path = md_sysfs_attr_path(md_devname, attrname)
130 if os.path.isfile(sysfs_attr_path):
131 attrdata = util.load_file(sysfs_attr_path).strip()
132
133
134=== modified file 'tests/unittests/test_block_mdadm.py'
135--- tests/unittests/test_block_mdadm.py 2017-05-03 02:07:45 +0000
136+++ tests/unittests/test_block_mdadm.py 2017-05-15 15:47:54 +0000
137@@ -331,27 +331,168 @@
138 class TestBlockMdadmStop(MdadmTestBase):
139 def setUp(self):
140 super(TestBlockMdadmStop, self).setUp()
141- self.add_patch('curtin.block.mdadm.util', 'mock_util')
142+ self.add_patch('curtin.block.mdadm.util.lsb_release', 'mock_util_lsb')
143+ self.add_patch('curtin.block.mdadm.util.subp', 'mock_util_subp')
144+ self.add_patch('curtin.block.mdadm.util.write_file',
145+ 'mock_util_write_file')
146+ self.add_patch('curtin.block.mdadm.util.load_file',
147+ 'mock_util_load_file')
148 self.add_patch('curtin.block.mdadm.is_valid_device', 'mock_valid')
149+ self.add_patch('curtin.block.mdadm.sys_block_path',
150+ 'mock_sys_block_path')
151+ self.add_patch('curtin.block.mdadm.os.path.isfile', 'mock_path_isfile')
152
153 # Common mock settings
154 self.mock_valid.return_value = True
155- self.mock_util.lsb_release.return_value = {'codename': 'xenial'}
156- self.mock_util.subp.side_effect = [
157+ self.mock_util_lsb.return_value = {'codename': 'xenial'}
158+ self.mock_util_subp.side_effect = iter([
159 ("", ""), # mdadm stop device
160- ]
161+ ])
162+ self.mock_path_isfile.return_value = True
163+ self.mock_util_load_file.side_effect = iter([
164+ "idle", "max",
165+ ])
166+
167+ def _set_sys_path(self, md_device):
168+ self.sys_path = '/sys/class/block/%s/md' % md_device.split("/")[-1]
169+ self.mock_sys_block_path.return_value = self.sys_path
170
171 def test_mdadm_stop_no_devpath(self):
172 with self.assertRaises(ValueError):
173 mdadm.mdadm_stop(None)
174
175 def test_mdadm_stop(self):
176- device = "/dev/vdc"
177- mdadm.mdadm_stop(device)
178- expected_calls = [
179- call(["mdadm", "--manage", "--stop", device], capture=True)
180- ]
181- self.mock_util.subp.assert_has_calls(expected_calls)
182+ device = "/dev/md0"
183+ self._set_sys_path(device)
184+
185+ mdadm.mdadm_stop(device)
186+
187+ expected_calls = [
188+ call(["mdadm", "--manage", "--stop", device], capture=True)
189+ ]
190+ self.mock_util_subp.assert_has_calls(expected_calls)
191+
192+ expected_reads = [
193+ call(self.sys_path + '/sync_action'),
194+ call(self.sys_path + '/sync_max'),
195+ ]
196+ self.mock_util_load_file.assert_has_calls(expected_reads)
197+
198+ @patch('curtin.block.mdadm.time.sleep')
199+ def test_mdadm_stop_retry(self, mock_sleep):
200+ device = "/dev/md10"
201+ self._set_sys_path(device)
202+ self.mock_util_load_file.side_effect = iter([
203+ "resync", "max",
204+ "proc/mdstat output",
205+ "idle", "0",
206+ ])
207+ self.mock_util_subp.side_effect = iter([
208+ util.ProcessExecutionError(),
209+ ("mdadm stopped %s" % device, ''),
210+ ])
211+
212+ mdadm.mdadm_stop(device)
213+
214+ expected_calls = [
215+ call(["mdadm", "--manage", "--stop", device], capture=True),
216+ call(["mdadm", "--manage", "--stop", device], capture=True)
217+ ]
218+ self.mock_util_subp.assert_has_calls(expected_calls)
219+
220+ expected_reads = [
221+ call(self.sys_path + '/sync_action'),
222+ call(self.sys_path + '/sync_max'),
223+ call('/proc/mdstat'),
224+ call(self.sys_path + '/sync_action'),
225+ call(self.sys_path + '/sync_max'),
226+ ]
227+ self.mock_util_load_file.assert_has_calls(expected_reads)
228+
229+ expected_writes = [
230+ call(self.sys_path + '/sync_action', content='idle'),
231+ call(self.sys_path + '/sync_max', content='0'),
232+ call(self.sys_path + '/sync_min', content='0'),
233+ ]
234+ self.mock_util_write_file.assert_has_calls(expected_writes)
235+
236+ @patch('curtin.block.mdadm.time.sleep')
237+ def test_mdadm_stop_retry_sysfs_write_fail(self, mock_sleep):
238+ device = "/dev/md126"
239+ self._set_sys_path(device)
240+ self.mock_util_load_file.side_effect = iter([
241+ "resync", "max",
242+ "proc/mdstat output",
243+ "idle", "0",
244+ ])
245+ self.mock_util_subp.side_effect = iter([
246+ util.ProcessExecutionError(),
247+ ("mdadm stopped %s" % device, ''),
248+ ])
249+ # sometimes we fail to modify sysfs attrs
250+ self.mock_util_write_file.side_effect = iter([
251+ "", # write to sync_action OK
252+ IOError(), # write to sync_max FAIL
253+ ])
254+
255+ mdadm.mdadm_stop(device)
256+
257+ expected_calls = [
258+ call(["mdadm", "--manage", "--stop", device], capture=True),
259+ call(["mdadm", "--manage", "--stop", device], capture=True)
260+ ]
261+ self.mock_util_subp.assert_has_calls(expected_calls)
262+
263+ expected_reads = [
264+ call(self.sys_path + '/sync_action'),
265+ call(self.sys_path + '/sync_max'),
266+ call('/proc/mdstat'),
267+ call(self.sys_path + '/sync_action'),
268+ call(self.sys_path + '/sync_max'),
269+ ]
270+ self.mock_util_load_file.assert_has_calls(expected_reads)
271+
272+ expected_writes = [
273+ call(self.sys_path + '/sync_action', content='idle'),
274+ ]
275+ self.mock_util_write_file.assert_has_calls(expected_writes)
276+
277+ @patch('curtin.block.mdadm.time.sleep')
278+ def test_mdadm_stop_retry_exhausted(self, mock_sleep):
279+ device = "/dev/md/37"
280+ retries = 60
281+ self._set_sys_path(device)
282+ self.mock_util_load_file.side_effect = iter([
283+ "resync", "max",
284+ "proc/mdstat output",
285+ ] * retries)
286+ self.mock_util_subp.side_effect = iter([
287+ util.ProcessExecutionError(),
288+ ] * retries)
289+ # sometimes we fail to modify sysfs attrs
290+ self.mock_util_write_file.side_effect = iter([
291+ "", IOError()] * retries)
292+
293+ with self.assertRaises(OSError):
294+ mdadm.mdadm_stop(device)
295+
296+ expected_calls = [
297+ call(["mdadm", "--manage", "--stop", device], capture=True),
298+ ] * retries
299+ self.mock_util_subp.assert_has_calls(expected_calls)
300+
301+ expected_reads = [
302+ call(self.sys_path + '/sync_action'),
303+ call(self.sys_path + '/sync_max'),
304+ call('/proc/mdstat'),
305+ ] * retries
306+ self.mock_util_load_file.assert_has_calls(expected_reads)
307+
308+ expected_writes = [
309+ call(self.sys_path + '/sync_action', content='idle'),
310+ call(self.sys_path + '/sync_max', content='0'),
311+ ] * retries
312+ self.mock_util_write_file.assert_has_calls(expected_writes)
313
314
315 class TestBlockMdadmRemove(MdadmTestBase):

Subscribers

People subscribed via source and target branches