Merge lp:~raharper/curtin/trunk.md-retry-stop-resync into lp:~curtin-dev/curtin/trunk

Proposed by Ryan Harper
Status: Merged
Merged at revision: 497
Proposed branch: lp:~raharper/curtin/trunk.md-retry-stop-resync
Merge into: lp:~curtin-dev/curtin/trunk
Diff against target: 315 lines (+214/-23)
3 files modified
curtin/block/__init__.py (+0/-3)
curtin/block/mdadm.py (+63/-10)
tests/unittests/test_block_mdadm.py (+151/-10)
To merge this branch: bzr merge lp:~raharper/curtin/trunk.md-retry-stop-resync
Reviewer Review Type Date Requested Status
Server Team CI bot continuous-integration Approve
Scott Moser (community) Approve
Review via email: mp+323765@code.launchpad.net

Description of the change

mdadm_stop: Add retry and additional steps to halt a resync

In our vmtest (as well as seen in real deployments) after a reboot of a system with mdadm arrays already configured on disk, md may start a 'resync' operation to bring an array online. The resync operation sometimes prevents mdadm from stopping the array. This patch applies a few known methods to halt the resync, namely writing 'idle' to md/sync_action and changing the sync_max and sync_min values to 0.

Add additional unittests to exercise the retry path now taken to help ensure we can reliably shutdown a mdadm array even while in 'resync' state.

To post a comment you must log in.
Revision history for this message
Ryan Harper (raharper) wrote :
Revision history for this message
Server Team CI bot (server-team-bot) wrote :
review: Approve (continuous-integration)
Revision history for this message
Scott Moser (smoser) wrote :

one small comment. other than that, i approve. the proof is really in the racey mdadm pudding.

Revision history for this message
Scott Moser (smoser) :
review: Approve
502. By Ryan Harper

Add docstring as requested

Revision history for this message
Server Team CI bot (server-team-bot) wrote :
review: Approve (continuous-integration)
503. By Ryan Harper

merge from trunk

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
=== modified file 'curtin/block/__init__.py'
--- curtin/block/__init__.py 2017-03-21 18:05:24 +0000
+++ curtin/block/__init__.py 2017-05-15 15:47:54 +0000
@@ -94,7 +94,6 @@
94 # cciss devices need to have 'cciss!' prepended94 # cciss devices need to have 'cciss!' prepended
95 if path.startswith('/dev/cciss'):95 if path.startswith('/dev/cciss'):
96 dev_kname = 'cciss!' + dev_kname96 dev_kname = 'cciss!' + dev_kname
97 LOG.debug("path_to_kname input: '{}' output: '{}'".format(path, dev_kname))
98 return dev_kname97 return dev_kname
9998
10099
@@ -106,7 +105,6 @@
106 # if given something that is already a dev path, return it105 # if given something that is already a dev path, return it
107 if os.path.exists(kname) and is_valid_device(kname):106 if os.path.exists(kname) and is_valid_device(kname):
108 path = kname107 path = kname
109 LOG.debug("kname_to_path input: '{}' output: '{}'".format(kname, path))
110 return os.path.realpath(path)108 return os.path.realpath(path)
111 # adding '/dev' to path is not sufficient to handle cciss devices and109 # adding '/dev' to path is not sufficient to handle cciss devices and
112 # possibly other special devices which have not been encountered yet110 # possibly other special devices which have not been encountered yet
@@ -114,7 +112,6 @@
114 # make sure path we get is correct112 # make sure path we get is correct
115 if not (os.path.exists(path) and is_valid_device(path)):113 if not (os.path.exists(path) and is_valid_device(path)):
116 raise OSError('could not get path to dev from kname: {}'.format(kname))114 raise OSError('could not get path to dev from kname: {}'.format(kname))
117 LOG.debug("kname_to_path input: '{}' output: '{}'".format(kname, path))
118 return path115 return path
119116
120117
121118
=== modified file 'curtin/block/mdadm.py'
--- curtin/block/mdadm.py 2017-05-03 02:05:22 +0000
+++ curtin/block/mdadm.py 2017-05-15 15:47:54 +0000
@@ -26,6 +26,7 @@
26import re26import re
27import shlex27import shlex
28from subprocess import CalledProcessError28from subprocess import CalledProcessError
29import time
2930
30from curtin.block import (dev_short, dev_path, is_valid_device, sys_block_path)31from curtin.block import (dev_short, dev_path, is_valid_device, sys_block_path)
31from curtin.block import get_holders32from curtin.block import get_holders
@@ -253,13 +254,59 @@
253 return data254 return data
254255
255256
256def mdadm_stop(devpath):257def mdadm_stop(devpath, retries=None):
257 assert_valid_devpath(devpath)258 assert_valid_devpath(devpath)
259 if not retries:
260 retries = [0.2] * 60
261
262 sync_action = md_sysfs_attr_path(devpath, 'sync_action')
263 sync_max = md_sysfs_attr_path(devpath, 'sync_max')
264 sync_min = md_sysfs_attr_path(devpath, 'sync_min')
258265
259 LOG.info("mdadm stopping: %s" % devpath)266 LOG.info("mdadm stopping: %s" % devpath)
260 out, err = util.subp(["mdadm", "--manage", "--stop", devpath],267 for (attempt, wait) in enumerate(retries):
261 capture=True)268 try:
262 LOG.debug("mdadm stop:\n%s\n%s", out, err)269 LOG.debug('mdadm: stop on %s attempt %s', devpath, attempt)
270 # An array in 'resync' state may not be stoppable, attempt to
271 # cancel an ongoing resync
272 val = md_sysfs_attr(devpath, 'sync_action')
273 LOG.debug('%s/sync_max = %s', sync_action, val)
274 if val != "idle":
275 LOG.debug("mdadm: setting array sync_action=idle")
276 util.write_file(sync_action, content="idle")
277
278 # Setting the sync_{max,min} may can help prevent the array from
279 # changing back to 'resync' which may prevent the array from being
280 # stopped
281 val = md_sysfs_attr(devpath, 'sync_max')
282 LOG.debug('%s/sync_max = %s', sync_max, val)
283 if val != "0":
284 LOG.debug("mdadm: setting array sync_{min,max}=0")
285 try:
286 util.write_file(sync_max, content="0")
287 util.write_file(sync_min, content="0")
288 except IOError:
289 LOG.warning('mdadm: failed to set sync_{max,min} values')
290 pass
291
292 # one wonders why this command doesn't do any of the above itself?
293 out, err = util.subp(["mdadm", "--manage", "--stop", devpath],
294 capture=True)
295 LOG.debug("mdadm stop command output:\n%s\n%s", out, err)
296 LOG.info("mdadm: successfully stopped %s after %s attempt(s)",
297 devpath, attempt+1)
298 return
299
300 except util.ProcessExecutionError:
301 LOG.warning("mdadm stop failed, retrying ")
302 if os.path.isfile('/proc/mdstat'):
303 LOG.critical("/proc/mdstat:\n%s",
304 util.load_file('/proc/mdstat'))
305 LOG.debug("mdadm: stop failed, retrying in %s seconds", wait)
306 time.sleep(wait)
307 pass
308
309 raise OSError('Failed to stop mdadm device %s', devpath)
263310
264311
265def mdadm_remove(devpath):312def mdadm_remove(devpath):
@@ -341,16 +388,22 @@
341 raise ValueError("Invalid devpath: '%s'" % devpath)388 raise ValueError("Invalid devpath: '%s'" % devpath)
342389
343390
391def md_sysfs_attr_path(md_devname, attrname):
392 """ Return the path to a md device attribute under the 'md' dir """
393 # build /sys/class/block/<md_short>/md
394 sysmd = sys_block_path(md_devname, "md")
395
396 # append attrname
397 return os.path.join(sysmd, attrname)
398
399
344def md_sysfs_attr(md_devname, attrname):400def md_sysfs_attr(md_devname, attrname):
401 """ Return the attribute str of an md device found under the 'md' dir """
402 attrdata = ''
345 if not valid_mdname(md_devname):403 if not valid_mdname(md_devname):
346 raise ValueError('Invalid md devicename: [{}]'.format(md_devname))404 raise ValueError('Invalid md devicename: [{}]'.format(md_devname))
347405
348 attrdata = ''406 sysfs_attr_path = md_sysfs_attr_path(md_devname, attrname)
349 # /sys/class/block/<md_short>/md
350 sysmd = sys_block_path(md_devname, "md")
351
352 # /sys/class/block/<md_short>/md/attrname
353 sysfs_attr_path = os.path.join(sysmd, attrname)
354 if os.path.isfile(sysfs_attr_path):407 if os.path.isfile(sysfs_attr_path):
355 attrdata = util.load_file(sysfs_attr_path).strip()408 attrdata = util.load_file(sysfs_attr_path).strip()
356409
357410
=== modified file 'tests/unittests/test_block_mdadm.py'
--- tests/unittests/test_block_mdadm.py 2017-05-03 02:07:45 +0000
+++ tests/unittests/test_block_mdadm.py 2017-05-15 15:47:54 +0000
@@ -331,27 +331,168 @@
331class TestBlockMdadmStop(MdadmTestBase):331class TestBlockMdadmStop(MdadmTestBase):
332 def setUp(self):332 def setUp(self):
333 super(TestBlockMdadmStop, self).setUp()333 super(TestBlockMdadmStop, self).setUp()
334 self.add_patch('curtin.block.mdadm.util', 'mock_util')334 self.add_patch('curtin.block.mdadm.util.lsb_release', 'mock_util_lsb')
335 self.add_patch('curtin.block.mdadm.util.subp', 'mock_util_subp')
336 self.add_patch('curtin.block.mdadm.util.write_file',
337 'mock_util_write_file')
338 self.add_patch('curtin.block.mdadm.util.load_file',
339 'mock_util_load_file')
335 self.add_patch('curtin.block.mdadm.is_valid_device', 'mock_valid')340 self.add_patch('curtin.block.mdadm.is_valid_device', 'mock_valid')
341 self.add_patch('curtin.block.mdadm.sys_block_path',
342 'mock_sys_block_path')
343 self.add_patch('curtin.block.mdadm.os.path.isfile', 'mock_path_isfile')
336344
337 # Common mock settings345 # Common mock settings
338 self.mock_valid.return_value = True346 self.mock_valid.return_value = True
339 self.mock_util.lsb_release.return_value = {'codename': 'xenial'}347 self.mock_util_lsb.return_value = {'codename': 'xenial'}
340 self.mock_util.subp.side_effect = [348 self.mock_util_subp.side_effect = iter([
341 ("", ""), # mdadm stop device349 ("", ""), # mdadm stop device
342 ]350 ])
351 self.mock_path_isfile.return_value = True
352 self.mock_util_load_file.side_effect = iter([
353 "idle", "max",
354 ])
355
356 def _set_sys_path(self, md_device):
357 self.sys_path = '/sys/class/block/%s/md' % md_device.split("/")[-1]
358 self.mock_sys_block_path.return_value = self.sys_path
343359
344 def test_mdadm_stop_no_devpath(self):360 def test_mdadm_stop_no_devpath(self):
345 with self.assertRaises(ValueError):361 with self.assertRaises(ValueError):
346 mdadm.mdadm_stop(None)362 mdadm.mdadm_stop(None)
347363
348 def test_mdadm_stop(self):364 def test_mdadm_stop(self):
349 device = "/dev/vdc"365 device = "/dev/md0"
350 mdadm.mdadm_stop(device)366 self._set_sys_path(device)
351 expected_calls = [367
352 call(["mdadm", "--manage", "--stop", device], capture=True)368 mdadm.mdadm_stop(device)
353 ]369
354 self.mock_util.subp.assert_has_calls(expected_calls)370 expected_calls = [
371 call(["mdadm", "--manage", "--stop", device], capture=True)
372 ]
373 self.mock_util_subp.assert_has_calls(expected_calls)
374
375 expected_reads = [
376 call(self.sys_path + '/sync_action'),
377 call(self.sys_path + '/sync_max'),
378 ]
379 self.mock_util_load_file.assert_has_calls(expected_reads)
380
381 @patch('curtin.block.mdadm.time.sleep')
382 def test_mdadm_stop_retry(self, mock_sleep):
383 device = "/dev/md10"
384 self._set_sys_path(device)
385 self.mock_util_load_file.side_effect = iter([
386 "resync", "max",
387 "proc/mdstat output",
388 "idle", "0",
389 ])
390 self.mock_util_subp.side_effect = iter([
391 util.ProcessExecutionError(),
392 ("mdadm stopped %s" % device, ''),
393 ])
394
395 mdadm.mdadm_stop(device)
396
397 expected_calls = [
398 call(["mdadm", "--manage", "--stop", device], capture=True),
399 call(["mdadm", "--manage", "--stop", device], capture=True)
400 ]
401 self.mock_util_subp.assert_has_calls(expected_calls)
402
403 expected_reads = [
404 call(self.sys_path + '/sync_action'),
405 call(self.sys_path + '/sync_max'),
406 call('/proc/mdstat'),
407 call(self.sys_path + '/sync_action'),
408 call(self.sys_path + '/sync_max'),
409 ]
410 self.mock_util_load_file.assert_has_calls(expected_reads)
411
412 expected_writes = [
413 call(self.sys_path + '/sync_action', content='idle'),
414 call(self.sys_path + '/sync_max', content='0'),
415 call(self.sys_path + '/sync_min', content='0'),
416 ]
417 self.mock_util_write_file.assert_has_calls(expected_writes)
418
419 @patch('curtin.block.mdadm.time.sleep')
420 def test_mdadm_stop_retry_sysfs_write_fail(self, mock_sleep):
421 device = "/dev/md126"
422 self._set_sys_path(device)
423 self.mock_util_load_file.side_effect = iter([
424 "resync", "max",
425 "proc/mdstat output",
426 "idle", "0",
427 ])
428 self.mock_util_subp.side_effect = iter([
429 util.ProcessExecutionError(),
430 ("mdadm stopped %s" % device, ''),
431 ])
432 # sometimes we fail to modify sysfs attrs
433 self.mock_util_write_file.side_effect = iter([
434 "", # write to sync_action OK
435 IOError(), # write to sync_max FAIL
436 ])
437
438 mdadm.mdadm_stop(device)
439
440 expected_calls = [
441 call(["mdadm", "--manage", "--stop", device], capture=True),
442 call(["mdadm", "--manage", "--stop", device], capture=True)
443 ]
444 self.mock_util_subp.assert_has_calls(expected_calls)
445
446 expected_reads = [
447 call(self.sys_path + '/sync_action'),
448 call(self.sys_path + '/sync_max'),
449 call('/proc/mdstat'),
450 call(self.sys_path + '/sync_action'),
451 call(self.sys_path + '/sync_max'),
452 ]
453 self.mock_util_load_file.assert_has_calls(expected_reads)
454
455 expected_writes = [
456 call(self.sys_path + '/sync_action', content='idle'),
457 ]
458 self.mock_util_write_file.assert_has_calls(expected_writes)
459
460 @patch('curtin.block.mdadm.time.sleep')
461 def test_mdadm_stop_retry_exhausted(self, mock_sleep):
462 device = "/dev/md/37"
463 retries = 60
464 self._set_sys_path(device)
465 self.mock_util_load_file.side_effect = iter([
466 "resync", "max",
467 "proc/mdstat output",
468 ] * retries)
469 self.mock_util_subp.side_effect = iter([
470 util.ProcessExecutionError(),
471 ] * retries)
472 # sometimes we fail to modify sysfs attrs
473 self.mock_util_write_file.side_effect = iter([
474 "", IOError()] * retries)
475
476 with self.assertRaises(OSError):
477 mdadm.mdadm_stop(device)
478
479 expected_calls = [
480 call(["mdadm", "--manage", "--stop", device], capture=True),
481 ] * retries
482 self.mock_util_subp.assert_has_calls(expected_calls)
483
484 expected_reads = [
485 call(self.sys_path + '/sync_action'),
486 call(self.sys_path + '/sync_max'),
487 call('/proc/mdstat'),
488 ] * retries
489 self.mock_util_load_file.assert_has_calls(expected_reads)
490
491 expected_writes = [
492 call(self.sys_path + '/sync_action', content='idle'),
493 call(self.sys_path + '/sync_max', content='0'),
494 ] * retries
495 self.mock_util_write_file.assert_has_calls(expected_writes)
355496
356497
357class TestBlockMdadmRemove(MdadmTestBase):498class TestBlockMdadmRemove(MdadmTestBase):

Subscribers

People subscribed via source and target branches