Merge lp:~raharper/curtin/trunk.md-retry-stop-resync into lp:~curtin-dev/curtin/trunk
- trunk.md-retry-stop-resync
- Merge into trunk
Status: | Merged |
---|---|
Merged at revision: | 497 |
Proposed branch: | lp:~raharper/curtin/trunk.md-retry-stop-resync |
Merge into: | lp:~curtin-dev/curtin/trunk |
Diff against target: |
315 lines (+214/-23) 3 files modified
curtin/block/__init__.py (+0/-3) curtin/block/mdadm.py (+63/-10) tests/unittests/test_block_mdadm.py (+151/-10) |
To merge this branch: | bzr merge lp:~raharper/curtin/trunk.md-retry-stop-resync |
Related bugs: |
Reviewer | Review Type | Date Requested | Status |
---|---|---|---|
Server Team CI bot | continuous-integration | Approve | |
Scott Moser (community) | Approve | ||
Review via email: mp+323765@code.launchpad.net |
Commit message
Description of the change
mdadm_stop: Add retry and additional steps to halt a resync
In our vmtest (as well as seen in real deployments) after a reboot of a system with mdadm arrays already configured on disk, md may start a 'resync' operation to bring an array online. The resync operation sometimes prevents mdadm from stopping the array. This patch applies a few known methods to halt the resync, namely writing 'idle' to md/sync_action and changing the sync_max and sync_min values to 0.
Add additional unittests to exercise the retry path now taken to help ensure we can reliably shutdown a mdadm array even while in 'resync' state.
Ryan Harper (raharper) wrote : | # |
Server Team CI bot (server-team-bot) wrote : | # |
PASSED: Continuous integration, rev:501
https:/
Executed test runs:
SUCCESS: https:/
SUCCESS: https:/
SUCCESS: https:/
SUCCESS: https:/
Click here to trigger a rebuild:
https:/
Scott Moser (smoser) wrote : | # |
one small comment. other than that, i approve. the proof is really in the racey mdadm pudding.
Scott Moser (smoser) : | # |
- 502. By Ryan Harper
-
Add docstring as requested
Server Team CI bot (server-team-bot) wrote : | # |
PASSED: Continuous integration, rev:502
https:/
Executed test runs:
SUCCESS: https:/
SUCCESS: https:/
SUCCESS: https:/
SUCCESS: https:/
Click here to trigger a rebuild:
https:/
- 503. By Ryan Harper
-
merge from trunk
Preview Diff
1 | === modified file 'curtin/block/__init__.py' | |||
2 | --- curtin/block/__init__.py 2017-03-21 18:05:24 +0000 | |||
3 | +++ curtin/block/__init__.py 2017-05-15 15:47:54 +0000 | |||
4 | @@ -94,7 +94,6 @@ | |||
5 | 94 | # cciss devices need to have 'cciss!' prepended | 94 | # cciss devices need to have 'cciss!' prepended |
6 | 95 | if path.startswith('/dev/cciss'): | 95 | if path.startswith('/dev/cciss'): |
7 | 96 | dev_kname = 'cciss!' + dev_kname | 96 | dev_kname = 'cciss!' + dev_kname |
8 | 97 | LOG.debug("path_to_kname input: '{}' output: '{}'".format(path, dev_kname)) | ||
9 | 98 | return dev_kname | 97 | return dev_kname |
10 | 99 | 98 | ||
11 | 100 | 99 | ||
12 | @@ -106,7 +105,6 @@ | |||
13 | 106 | # if given something that is already a dev path, return it | 105 | # if given something that is already a dev path, return it |
14 | 107 | if os.path.exists(kname) and is_valid_device(kname): | 106 | if os.path.exists(kname) and is_valid_device(kname): |
15 | 108 | path = kname | 107 | path = kname |
16 | 109 | LOG.debug("kname_to_path input: '{}' output: '{}'".format(kname, path)) | ||
17 | 110 | return os.path.realpath(path) | 108 | return os.path.realpath(path) |
18 | 111 | # adding '/dev' to path is not sufficient to handle cciss devices and | 109 | # adding '/dev' to path is not sufficient to handle cciss devices and |
19 | 112 | # possibly other special devices which have not been encountered yet | 110 | # possibly other special devices which have not been encountered yet |
20 | @@ -114,7 +112,6 @@ | |||
21 | 114 | # make sure path we get is correct | 112 | # make sure path we get is correct |
22 | 115 | if not (os.path.exists(path) and is_valid_device(path)): | 113 | if not (os.path.exists(path) and is_valid_device(path)): |
23 | 116 | raise OSError('could not get path to dev from kname: {}'.format(kname)) | 114 | raise OSError('could not get path to dev from kname: {}'.format(kname)) |
24 | 117 | LOG.debug("kname_to_path input: '{}' output: '{}'".format(kname, path)) | ||
25 | 118 | return path | 115 | return path |
26 | 119 | 116 | ||
27 | 120 | 117 | ||
28 | 121 | 118 | ||
29 | === modified file 'curtin/block/mdadm.py' | |||
30 | --- curtin/block/mdadm.py 2017-05-03 02:05:22 +0000 | |||
31 | +++ curtin/block/mdadm.py 2017-05-15 15:47:54 +0000 | |||
32 | @@ -26,6 +26,7 @@ | |||
33 | 26 | import re | 26 | import re |
34 | 27 | import shlex | 27 | import shlex |
35 | 28 | from subprocess import CalledProcessError | 28 | from subprocess import CalledProcessError |
36 | 29 | import time | ||
37 | 29 | 30 | ||
38 | 30 | from curtin.block import (dev_short, dev_path, is_valid_device, sys_block_path) | 31 | from curtin.block import (dev_short, dev_path, is_valid_device, sys_block_path) |
39 | 31 | from curtin.block import get_holders | 32 | from curtin.block import get_holders |
40 | @@ -253,13 +254,59 @@ | |||
41 | 253 | return data | 254 | return data |
42 | 254 | 255 | ||
43 | 255 | 256 | ||
45 | 256 | def mdadm_stop(devpath): | 257 | def mdadm_stop(devpath, retries=None): |
46 | 257 | assert_valid_devpath(devpath) | 258 | assert_valid_devpath(devpath) |
47 | 259 | if not retries: | ||
48 | 260 | retries = [0.2] * 60 | ||
49 | 261 | |||
50 | 262 | sync_action = md_sysfs_attr_path(devpath, 'sync_action') | ||
51 | 263 | sync_max = md_sysfs_attr_path(devpath, 'sync_max') | ||
52 | 264 | sync_min = md_sysfs_attr_path(devpath, 'sync_min') | ||
53 | 258 | 265 | ||
54 | 259 | LOG.info("mdadm stopping: %s" % devpath) | 266 | LOG.info("mdadm stopping: %s" % devpath) |
58 | 260 | out, err = util.subp(["mdadm", "--manage", "--stop", devpath], | 267 | for (attempt, wait) in enumerate(retries): |
59 | 261 | capture=True) | 268 | try: |
60 | 262 | LOG.debug("mdadm stop:\n%s\n%s", out, err) | 269 | LOG.debug('mdadm: stop on %s attempt %s', devpath, attempt) |
61 | 270 | # An array in 'resync' state may not be stoppable, attempt to | ||
62 | 271 | # cancel an ongoing resync | ||
63 | 272 | val = md_sysfs_attr(devpath, 'sync_action') | ||
64 | 273 | LOG.debug('%s/sync_max = %s', sync_action, val) | ||
65 | 274 | if val != "idle": | ||
66 | 275 | LOG.debug("mdadm: setting array sync_action=idle") | ||
67 | 276 | util.write_file(sync_action, content="idle") | ||
68 | 277 | |||
69 | 278 | # Setting the sync_{max,min} may can help prevent the array from | ||
70 | 279 | # changing back to 'resync' which may prevent the array from being | ||
71 | 280 | # stopped | ||
72 | 281 | val = md_sysfs_attr(devpath, 'sync_max') | ||
73 | 282 | LOG.debug('%s/sync_max = %s', sync_max, val) | ||
74 | 283 | if val != "0": | ||
75 | 284 | LOG.debug("mdadm: setting array sync_{min,max}=0") | ||
76 | 285 | try: | ||
77 | 286 | util.write_file(sync_max, content="0") | ||
78 | 287 | util.write_file(sync_min, content="0") | ||
79 | 288 | except IOError: | ||
80 | 289 | LOG.warning('mdadm: failed to set sync_{max,min} values') | ||
81 | 290 | pass | ||
82 | 291 | |||
83 | 292 | # one wonders why this command doesn't do any of the above itself? | ||
84 | 293 | out, err = util.subp(["mdadm", "--manage", "--stop", devpath], | ||
85 | 294 | capture=True) | ||
86 | 295 | LOG.debug("mdadm stop command output:\n%s\n%s", out, err) | ||
87 | 296 | LOG.info("mdadm: successfully stopped %s after %s attempt(s)", | ||
88 | 297 | devpath, attempt+1) | ||
89 | 298 | return | ||
90 | 299 | |||
91 | 300 | except util.ProcessExecutionError: | ||
92 | 301 | LOG.warning("mdadm stop failed, retrying ") | ||
93 | 302 | if os.path.isfile('/proc/mdstat'): | ||
94 | 303 | LOG.critical("/proc/mdstat:\n%s", | ||
95 | 304 | util.load_file('/proc/mdstat')) | ||
96 | 305 | LOG.debug("mdadm: stop failed, retrying in %s seconds", wait) | ||
97 | 306 | time.sleep(wait) | ||
98 | 307 | pass | ||
99 | 308 | |||
100 | 309 | raise OSError('Failed to stop mdadm device %s', devpath) | ||
101 | 263 | 310 | ||
102 | 264 | 311 | ||
103 | 265 | def mdadm_remove(devpath): | 312 | def mdadm_remove(devpath): |
104 | @@ -341,16 +388,22 @@ | |||
105 | 341 | raise ValueError("Invalid devpath: '%s'" % devpath) | 388 | raise ValueError("Invalid devpath: '%s'" % devpath) |
106 | 342 | 389 | ||
107 | 343 | 390 | ||
108 | 391 | def md_sysfs_attr_path(md_devname, attrname): | ||
109 | 392 | """ Return the path to a md device attribute under the 'md' dir """ | ||
110 | 393 | # build /sys/class/block/<md_short>/md | ||
111 | 394 | sysmd = sys_block_path(md_devname, "md") | ||
112 | 395 | |||
113 | 396 | # append attrname | ||
114 | 397 | return os.path.join(sysmd, attrname) | ||
115 | 398 | |||
116 | 399 | |||
117 | 344 | def md_sysfs_attr(md_devname, attrname): | 400 | def md_sysfs_attr(md_devname, attrname): |
118 | 401 | """ Return the attribute str of an md device found under the 'md' dir """ | ||
119 | 402 | attrdata = '' | ||
120 | 345 | if not valid_mdname(md_devname): | 403 | if not valid_mdname(md_devname): |
121 | 346 | raise ValueError('Invalid md devicename: [{}]'.format(md_devname)) | 404 | raise ValueError('Invalid md devicename: [{}]'.format(md_devname)) |
122 | 347 | 405 | ||
129 | 348 | attrdata = '' | 406 | sysfs_attr_path = md_sysfs_attr_path(md_devname, attrname) |
124 | 349 | # /sys/class/block/<md_short>/md | ||
125 | 350 | sysmd = sys_block_path(md_devname, "md") | ||
126 | 351 | |||
127 | 352 | # /sys/class/block/<md_short>/md/attrname | ||
128 | 353 | sysfs_attr_path = os.path.join(sysmd, attrname) | ||
130 | 354 | if os.path.isfile(sysfs_attr_path): | 407 | if os.path.isfile(sysfs_attr_path): |
131 | 355 | attrdata = util.load_file(sysfs_attr_path).strip() | 408 | attrdata = util.load_file(sysfs_attr_path).strip() |
132 | 356 | 409 | ||
133 | 357 | 410 | ||
134 | === modified file 'tests/unittests/test_block_mdadm.py' | |||
135 | --- tests/unittests/test_block_mdadm.py 2017-05-03 02:07:45 +0000 | |||
136 | +++ tests/unittests/test_block_mdadm.py 2017-05-15 15:47:54 +0000 | |||
137 | @@ -331,27 +331,168 @@ | |||
138 | 331 | class TestBlockMdadmStop(MdadmTestBase): | 331 | class TestBlockMdadmStop(MdadmTestBase): |
139 | 332 | def setUp(self): | 332 | def setUp(self): |
140 | 333 | super(TestBlockMdadmStop, self).setUp() | 333 | super(TestBlockMdadmStop, self).setUp() |
142 | 334 | self.add_patch('curtin.block.mdadm.util', 'mock_util') | 334 | self.add_patch('curtin.block.mdadm.util.lsb_release', 'mock_util_lsb') |
143 | 335 | self.add_patch('curtin.block.mdadm.util.subp', 'mock_util_subp') | ||
144 | 336 | self.add_patch('curtin.block.mdadm.util.write_file', | ||
145 | 337 | 'mock_util_write_file') | ||
146 | 338 | self.add_patch('curtin.block.mdadm.util.load_file', | ||
147 | 339 | 'mock_util_load_file') | ||
148 | 335 | self.add_patch('curtin.block.mdadm.is_valid_device', 'mock_valid') | 340 | self.add_patch('curtin.block.mdadm.is_valid_device', 'mock_valid') |
149 | 341 | self.add_patch('curtin.block.mdadm.sys_block_path', | ||
150 | 342 | 'mock_sys_block_path') | ||
151 | 343 | self.add_patch('curtin.block.mdadm.os.path.isfile', 'mock_path_isfile') | ||
152 | 336 | 344 | ||
153 | 337 | # Common mock settings | 345 | # Common mock settings |
154 | 338 | self.mock_valid.return_value = True | 346 | self.mock_valid.return_value = True |
157 | 339 | self.mock_util.lsb_release.return_value = {'codename': 'xenial'} | 347 | self.mock_util_lsb.return_value = {'codename': 'xenial'} |
158 | 340 | self.mock_util.subp.side_effect = [ | 348 | self.mock_util_subp.side_effect = iter([ |
159 | 341 | ("", ""), # mdadm stop device | 349 | ("", ""), # mdadm stop device |
161 | 342 | ] | 350 | ]) |
162 | 351 | self.mock_path_isfile.return_value = True | ||
163 | 352 | self.mock_util_load_file.side_effect = iter([ | ||
164 | 353 | "idle", "max", | ||
165 | 354 | ]) | ||
166 | 355 | |||
167 | 356 | def _set_sys_path(self, md_device): | ||
168 | 357 | self.sys_path = '/sys/class/block/%s/md' % md_device.split("/")[-1] | ||
169 | 358 | self.mock_sys_block_path.return_value = self.sys_path | ||
170 | 343 | 359 | ||
171 | 344 | def test_mdadm_stop_no_devpath(self): | 360 | def test_mdadm_stop_no_devpath(self): |
172 | 345 | with self.assertRaises(ValueError): | 361 | with self.assertRaises(ValueError): |
173 | 346 | mdadm.mdadm_stop(None) | 362 | mdadm.mdadm_stop(None) |
174 | 347 | 363 | ||
175 | 348 | def test_mdadm_stop(self): | 364 | def test_mdadm_stop(self): |
182 | 349 | device = "/dev/vdc" | 365 | device = "/dev/md0" |
183 | 350 | mdadm.mdadm_stop(device) | 366 | self._set_sys_path(device) |
184 | 351 | expected_calls = [ | 367 | |
185 | 352 | call(["mdadm", "--manage", "--stop", device], capture=True) | 368 | mdadm.mdadm_stop(device) |
186 | 353 | ] | 369 | |
187 | 354 | self.mock_util.subp.assert_has_calls(expected_calls) | 370 | expected_calls = [ |
188 | 371 | call(["mdadm", "--manage", "--stop", device], capture=True) | ||
189 | 372 | ] | ||
190 | 373 | self.mock_util_subp.assert_has_calls(expected_calls) | ||
191 | 374 | |||
192 | 375 | expected_reads = [ | ||
193 | 376 | call(self.sys_path + '/sync_action'), | ||
194 | 377 | call(self.sys_path + '/sync_max'), | ||
195 | 378 | ] | ||
196 | 379 | self.mock_util_load_file.assert_has_calls(expected_reads) | ||
197 | 380 | |||
198 | 381 | @patch('curtin.block.mdadm.time.sleep') | ||
199 | 382 | def test_mdadm_stop_retry(self, mock_sleep): | ||
200 | 383 | device = "/dev/md10" | ||
201 | 384 | self._set_sys_path(device) | ||
202 | 385 | self.mock_util_load_file.side_effect = iter([ | ||
203 | 386 | "resync", "max", | ||
204 | 387 | "proc/mdstat output", | ||
205 | 388 | "idle", "0", | ||
206 | 389 | ]) | ||
207 | 390 | self.mock_util_subp.side_effect = iter([ | ||
208 | 391 | util.ProcessExecutionError(), | ||
209 | 392 | ("mdadm stopped %s" % device, ''), | ||
210 | 393 | ]) | ||
211 | 394 | |||
212 | 395 | mdadm.mdadm_stop(device) | ||
213 | 396 | |||
214 | 397 | expected_calls = [ | ||
215 | 398 | call(["mdadm", "--manage", "--stop", device], capture=True), | ||
216 | 399 | call(["mdadm", "--manage", "--stop", device], capture=True) | ||
217 | 400 | ] | ||
218 | 401 | self.mock_util_subp.assert_has_calls(expected_calls) | ||
219 | 402 | |||
220 | 403 | expected_reads = [ | ||
221 | 404 | call(self.sys_path + '/sync_action'), | ||
222 | 405 | call(self.sys_path + '/sync_max'), | ||
223 | 406 | call('/proc/mdstat'), | ||
224 | 407 | call(self.sys_path + '/sync_action'), | ||
225 | 408 | call(self.sys_path + '/sync_max'), | ||
226 | 409 | ] | ||
227 | 410 | self.mock_util_load_file.assert_has_calls(expected_reads) | ||
228 | 411 | |||
229 | 412 | expected_writes = [ | ||
230 | 413 | call(self.sys_path + '/sync_action', content='idle'), | ||
231 | 414 | call(self.sys_path + '/sync_max', content='0'), | ||
232 | 415 | call(self.sys_path + '/sync_min', content='0'), | ||
233 | 416 | ] | ||
234 | 417 | self.mock_util_write_file.assert_has_calls(expected_writes) | ||
235 | 418 | |||
236 | 419 | @patch('curtin.block.mdadm.time.sleep') | ||
237 | 420 | def test_mdadm_stop_retry_sysfs_write_fail(self, mock_sleep): | ||
238 | 421 | device = "/dev/md126" | ||
239 | 422 | self._set_sys_path(device) | ||
240 | 423 | self.mock_util_load_file.side_effect = iter([ | ||
241 | 424 | "resync", "max", | ||
242 | 425 | "proc/mdstat output", | ||
243 | 426 | "idle", "0", | ||
244 | 427 | ]) | ||
245 | 428 | self.mock_util_subp.side_effect = iter([ | ||
246 | 429 | util.ProcessExecutionError(), | ||
247 | 430 | ("mdadm stopped %s" % device, ''), | ||
248 | 431 | ]) | ||
249 | 432 | # sometimes we fail to modify sysfs attrs | ||
250 | 433 | self.mock_util_write_file.side_effect = iter([ | ||
251 | 434 | "", # write to sync_action OK | ||
252 | 435 | IOError(), # write to sync_max FAIL | ||
253 | 436 | ]) | ||
254 | 437 | |||
255 | 438 | mdadm.mdadm_stop(device) | ||
256 | 439 | |||
257 | 440 | expected_calls = [ | ||
258 | 441 | call(["mdadm", "--manage", "--stop", device], capture=True), | ||
259 | 442 | call(["mdadm", "--manage", "--stop", device], capture=True) | ||
260 | 443 | ] | ||
261 | 444 | self.mock_util_subp.assert_has_calls(expected_calls) | ||
262 | 445 | |||
263 | 446 | expected_reads = [ | ||
264 | 447 | call(self.sys_path + '/sync_action'), | ||
265 | 448 | call(self.sys_path + '/sync_max'), | ||
266 | 449 | call('/proc/mdstat'), | ||
267 | 450 | call(self.sys_path + '/sync_action'), | ||
268 | 451 | call(self.sys_path + '/sync_max'), | ||
269 | 452 | ] | ||
270 | 453 | self.mock_util_load_file.assert_has_calls(expected_reads) | ||
271 | 454 | |||
272 | 455 | expected_writes = [ | ||
273 | 456 | call(self.sys_path + '/sync_action', content='idle'), | ||
274 | 457 | ] | ||
275 | 458 | self.mock_util_write_file.assert_has_calls(expected_writes) | ||
276 | 459 | |||
277 | 460 | @patch('curtin.block.mdadm.time.sleep') | ||
278 | 461 | def test_mdadm_stop_retry_exhausted(self, mock_sleep): | ||
279 | 462 | device = "/dev/md/37" | ||
280 | 463 | retries = 60 | ||
281 | 464 | self._set_sys_path(device) | ||
282 | 465 | self.mock_util_load_file.side_effect = iter([ | ||
283 | 466 | "resync", "max", | ||
284 | 467 | "proc/mdstat output", | ||
285 | 468 | ] * retries) | ||
286 | 469 | self.mock_util_subp.side_effect = iter([ | ||
287 | 470 | util.ProcessExecutionError(), | ||
288 | 471 | ] * retries) | ||
289 | 472 | # sometimes we fail to modify sysfs attrs | ||
290 | 473 | self.mock_util_write_file.side_effect = iter([ | ||
291 | 474 | "", IOError()] * retries) | ||
292 | 475 | |||
293 | 476 | with self.assertRaises(OSError): | ||
294 | 477 | mdadm.mdadm_stop(device) | ||
295 | 478 | |||
296 | 479 | expected_calls = [ | ||
297 | 480 | call(["mdadm", "--manage", "--stop", device], capture=True), | ||
298 | 481 | ] * retries | ||
299 | 482 | self.mock_util_subp.assert_has_calls(expected_calls) | ||
300 | 483 | |||
301 | 484 | expected_reads = [ | ||
302 | 485 | call(self.sys_path + '/sync_action'), | ||
303 | 486 | call(self.sys_path + '/sync_max'), | ||
304 | 487 | call('/proc/mdstat'), | ||
305 | 488 | ] * retries | ||
306 | 489 | self.mock_util_load_file.assert_has_calls(expected_reads) | ||
307 | 490 | |||
308 | 491 | expected_writes = [ | ||
309 | 492 | call(self.sys_path + '/sync_action', content='idle'), | ||
310 | 493 | call(self.sys_path + '/sync_max', content='0'), | ||
311 | 494 | ] * retries | ||
312 | 495 | self.mock_util_write_file.assert_has_calls(expected_writes) | ||
313 | 355 | 496 | ||
314 | 356 | 497 | ||
315 | 357 | class TestBlockMdadmRemove(MdadmTestBase): | 498 | class TestBlockMdadmRemove(MdadmTestBase): |
Started a vmtest run here: /jenkins. ubuntu. com/server/ job/curtin- vmtest- devel-debug/ 43/
https:/