Merge lp:~sseman/juju-chaos-monkey/replay-commands into lp:juju-chaos-monkey

Proposed by Seman
Status: Merged
Merged at revision: 22
Proposed branch: lp:~sseman/juju-chaos-monkey/replay-commands
Merge into: lp:juju-chaos-monkey
Diff against target: 324 lines (+164/-29)
3 files modified
chaos_monkey.py (+3/-8)
runner.py (+60/-9)
tests/test_runner.py (+101/-12)
To merge this branch: bzr merge lp:~sseman/juju-chaos-monkey/replay-commands
Reviewer Review Type Date Requested Status
John George (community) Approve
Review via email: mp+261583@code.launchpad.net

Description of the change

This branch adds support for replaying Chaos Monkey commands from a structured (YAML) data file. It handles a reboot request as follows:
  - Write the remaining Chaos Monkey commands to a temporary file.
  - After a reboot, read the temporary file and continue the run.
  - Delete the temporary file.

Here is output from using the --replay option:

$ cat /home/ubuntu/cmd_list.log
- [deny-incoming, 2]
- [restart-unit, 2]
- [deny-all, 2]

$ runner.py --replay /home/ubuntu/cmd_list.log /home/ubuntu/cm

$ cat results.log
2015-06-10 01:02:41 INFO Chaos Monkey started in /home/ubuntu/cm
2015-06-10 01:02:41 INFO Replaying commands from /home/ubuntu/cm/cmd_list.log
2015-06-10 01:02:41 INFO Deny all incoming network traffic except ssh.
2015-06-10 01:02:45 INFO Restart the unit.
2015-06-10 01:02:45 INFO Init script generated:
 cmd: --replay /home/ubuntu/cm/cmd_list.log /home/ubuntu/cm
 expire_time: 1433898167.64
 runner_path: /home/ubuntu/cm/runner.py
2015-06-10 01:02:45 INFO Chaos Monkey stopped.

2015-06-10 01:03:08 INFO Chaos Monkey restarted after a reboot in /home/ubuntu/cm
2015-06-10 01:03:08 INFO Init script removed from /etc/init/chaos-monkey-restart.conf
2015-06-10 01:03:08 INFO Replaying commands from /home/ubuntu/cm/cmd_list.log
2015-06-10 01:03:08 INFO Deny all incoming and outgoing network traffic except ssh.
2015-06-10 01:03:15 INFO Chaos Monkey stopped.

To post a comment you must log in.
Revision history for this message
John George (jog) wrote :

This looks good. Thanks for the example output in the description; it helps a lot while reviewing.

If you passed the args returned from parse_args() to Runner.factory and set up the constructor to accept each value, with some minimal default, then all the instance data could be stored in one place. The function definitions could then be smaller: rather than passing args in to each function, they could reference self. Changing the data storage location is not required to land this branch, but we've talked about a later branch that will change argument passing for the random_chaos() function. Please strongly consider this proposed change for that branch.

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
=== modified file 'chaos_monkey.py'
--- chaos_monkey.py 2015-06-04 17:49:00 +0000
+++ chaos_monkey.py 2015-06-10 01:20:20 +0000
@@ -18,14 +18,6 @@
18 all_chaos, factory_obj = ChaosMonkey.get_all_chaos()18 all_chaos, factory_obj = ChaosMonkey.get_all_chaos()
19 return cls([], factory_obj)19 return cls([], factory_obj)
2020
21 @property
22 def command_tag(self):
23 return ":CHAOS_CMD:"
24
25 @property
26 def description_tag(self):
27 return ":CHAOS_DSCR:"
28
29 @staticmethod21 @staticmethod
30 def get_all_chaos():22 def get_all_chaos():
31 all_chaos = []23 all_chaos = []
@@ -85,3 +77,6 @@
85 if item.command_str == command_str:77 if item.command_str == command_str:
86 return item78 return item
87 return None79 return None
80
81 def reset_command_selection(self):
82 self.chaos = []
8883
=== modified file 'runner.py'
--- runner.py 2015-06-08 23:51:53 +0000
+++ runner.py 2015-06-10 01:20:20 +0000
@@ -10,6 +10,8 @@
10 sleep10 sleep
11)11)
1212
13import yaml
14
13from chaos.kill import Kill15from chaos.kill import Kill
14from chaos_monkey import ChaosMonkey16from chaos_monkey import ChaosMonkey
15from utility import (17from utility import (
@@ -35,6 +37,7 @@
35 self.chaos_monkey = chaos_monkey37 self.chaos_monkey = chaos_monkey
36 self.expire_time = None38 self.expire_time = None
37 self.cmd_log_name = cmd_log_name39 self.cmd_log_name = cmd_log_name
40 self.replay_filename_ext = '.part'
3841
39 @classmethod42 @classmethod
40 def factory(cls, workspace, log_count=1, dry_run=False):43 def factory(cls, workspace, log_count=1, dry_run=False):
@@ -131,6 +134,7 @@
131 include_command=None, exclude_command=None):134 include_command=None, exclude_command=None):
132 all_groups = ChaosMonkey.get_all_groups()135 all_groups = ChaosMonkey.get_all_groups()
133 all_commands = ChaosMonkey.get_all_commands()136 all_commands = ChaosMonkey.get_all_commands()
137 self.chaos_monkey.reset_command_selection()
134138
135 # If any groups and any commands are not included, assume the intent139 # If any groups and any commands are not included, assume the intent
136 # is to include all groups and all commands.140 # is to include all groups and all commands.
@@ -151,6 +155,42 @@
151 exclude_command, all_commands)155 exclude_command, all_commands)
152 self.chaos_monkey.exclude_command(exclude_command)156 self.chaos_monkey.exclude_command(exclude_command)
153157
158 def replay_commands(self, args):
159 """Replay Chaos Monkey commands from a file."""
160 commands = self.get_command_list(args)
161 while commands:
162 command = commands.pop()
163 command_str = command[0]
164 enablement_timeout = command[1]
165 if command_str == Kill.restart_cmd and commands:
166 # Save the commands to a temporary file before a reboot.
167 self.save_command_list(commands, args)
168 self.random_chaos(
169 run_timeout=enablement_timeout,
170 enablement_timeout=enablement_timeout,
171 include_command=command_str)
172 if command_str == Kill.restart_cmd:
173 break
174
175 def get_command_list(self, args):
176 """Get the command list from a file."""
177 file_path = (args.replay + self.replay_filename_ext
178 if args.restart else args.replay)
179 with open(file_path) as f:
180 commands = yaml.load(f.read())
181 commands.reverse()
182 # If it is a restart, remove the temporary file.
183 if args.restart:
184 os.remove(file_path)
185 return commands
186
187 def save_command_list(self, commands, args):
188 """Before a shutdown and restart request, this method is called
189 to save the command list to a temporary file."""
190 file_path = args.replay + self.replay_filename_ext
191 with open(file_path, 'w') as f:
192 f.write(yaml.dump(commands))
193
154 @staticmethod194 @staticmethod
155 def _validate(sub_string, all_list):195 def _validate(sub_string, all_list):
156 sub_list = split_arg_string(sub_string)196 sub_list = split_arg_string(sub_string)
@@ -245,6 +285,9 @@
245 parser.add_argument(285 parser.add_argument(
246 '-ep', '--expire-time', type=float,286 '-ep', '--expire-time', type=float,
247 help='Chaos Monkey expire time (UNIX timestamp).', default=None)287 help='Chaos Monkey expire time (UNIX timestamp).', default=None)
288 parser.add_argument(
289 '-rp', '--replay', metavar='FULL-FILE-PATH',
290 help='Replay Chaos Monkey commands from a file.', default=None)
248 args = parser.parse_args(argv)291 args = parser.parse_args(argv)
249292
250 if args.run_once and args.total_timeout:293 if args.run_once and args.total_timeout:
@@ -262,8 +305,12 @@
262 if args.enablement_timeout < 0:305 if args.enablement_timeout < 0:
263 parser.error("Invalid enablement-timeout value: timeout must be "306 parser.error("Invalid enablement-timeout value: timeout must be "
264 "zero or greater.")307 "zero or greater.")
308 if args.replay and not os.path.isabs(args.replay):
309 parser.error("Please provide an absolute file path to the replay "
310 "argument: {}".format(args.replay))
265 return args311 return args
266312
313
267if __name__ == '__main__':314if __name__ == '__main__':
268 args = parse_args()315 args = parse_args()
269 runner = Runner.factory(workspace=args.path, log_count=args.log_count,316 runner = Runner.factory(workspace=args.path, log_count=args.log_count,
@@ -278,15 +325,19 @@
278325
279 runner.acquire_lock(restart=args.restart)326 runner.acquire_lock(restart=args.restart)
280 try:327 try:
281 runner.random_chaos(328 if args.replay:
282 run_timeout=args.total_timeout,329 logging.info('Replaying commands from {}'.format(args.replay))
283 enablement_timeout=args.enablement_timeout,330 runner.replay_commands(args=args)
284 include_group=args.include_group,331 else:
285 exclude_group=args.exclude_group,332 runner.random_chaos(
286 include_command=args.include_command,333 run_timeout=args.total_timeout,
287 exclude_command=args.exclude_command,334 enablement_timeout=args.enablement_timeout,
288 run_once=args.run_once,335 include_group=args.include_group,
289 expire_time=args.expire_time)336 exclude_group=args.exclude_group,
337 include_command=args.include_command,
338 exclude_command=args.exclude_command,
339 run_once=args.run_once,
340 expire_time=args.expire_time)
290 except Exception as e:341 except Exception as e:
291 logging.error('{} ({})'.format(e, type(e).__name__))342 logging.error('{} ({})'.format(e, type(e).__name__))
292 sys.exit(1)343 sys.exit(1)
293344
=== modified file 'tests/test_runner.py'
--- tests/test_runner.py 2015-06-08 23:51:53 +0000
+++ tests/test_runner.py 2015-06-10 01:20:20 +0000
@@ -4,9 +4,11 @@
4import signal4import signal
5import subprocess5import subprocess
6from StringIO import StringIO6from StringIO import StringIO
7from tempfile import NamedTemporaryFile
7from time import time8from time import time
89
9from mock import patch, call10from mock import patch, call
11import yaml
1012
11from chaos.kill import Kill13from chaos.kill import Kill
12from chaos_monkey import ChaosMonkey14from chaos_monkey import ChaosMonkey
@@ -530,7 +532,8 @@
530 total_timeout=10, log_count=2, include_group=None,532 total_timeout=10, log_count=2, include_group=None,
531 exclude_group=None, include_command=None,533 exclude_group=None, include_command=None,
532 exclude_command=None, dry_run=False,534 exclude_command=None, dry_run=False,
533 run_once=False, restart=False, expire_time=None))535 run_once=False, restart=False, expire_time=None,
536 replay=None))
534537
535 def test_parse_args_non_default_values(self):538 def test_parse_args_non_default_values(self):
536 args = parse_args(['path',539 args = parse_args(['path',
@@ -543,14 +546,16 @@
543 '--exclude-command', 'deny-incoming',546 '--exclude-command', 'deny-incoming',
544 '--dry-run',547 '--dry-run',
545 '--restart',548 '--restart',
546 '--expire-time', '111.11'])549 '--expire-time', '111.11',
550 '--replay', '/path/to/foo'])
547 self.assertEqual(551 self.assertEqual(
548 args, Namespace(path='path', enablement_timeout=30,552 args, Namespace(path='path', enablement_timeout=30,
549 total_timeout=600, log_count=4,553 total_timeout=600, log_count=4,
550 include_group='net', exclude_group=Kill.group,554 include_group='net', exclude_group=Kill.group,
551 include_command='deny-all',555 include_command='deny-all',
552 exclude_command='deny-incoming', dry_run=True,556 exclude_command='deny-incoming', dry_run=True,
553 run_once=False, restart=True, expire_time=111.11))557 run_once=False, restart=True, expire_time=111.11,
558 replay='/path/to/foo'))
554559
555 def test_parse_args_non_default_values_set_run_once(self):560 def test_parse_args_non_default_values_set_run_once(self):
556 args = parse_args(['path',561 args = parse_args(['path',
@@ -568,7 +573,8 @@
568 include_group='net', exclude_group=Kill.group,573 include_group='net', exclude_group=Kill.group,
569 include_command='deny-all',574 include_command='deny-all',
570 exclude_command='deny-incoming', dry_run=True,575 exclude_command='deny-incoming', dry_run=True,
571 run_once=True, restart=False, expire_time=None))576 run_once=True, restart=False, expire_time=None,
577 replay=None))
572578
573 def test_parse_args_error_enablement_greater_than_total_timeout(self):579 def test_parse_args_error_enablement_greater_than_total_timeout(self):
574 with parse_error(self) as stderr:580 with parse_error(self) as stderr:
@@ -636,14 +642,7 @@
636 with temp_dir() as directory:642 with temp_dir() as directory:
637 runner = Runner(directory, ChaosMonkey.factory())643 runner = Runner(directory, ChaosMonkey.factory())
638 runner._run_command(enablement_timeout=0)644 runner._run_command(enablement_timeout=0)
639 self.assertEqual(mock.mock_calls, [645 self.assertEqual(mock.mock_calls, self._deny_port_call_list())
640 call(['ufw', 'deny', '37017']),
641 call(['ufw', 'allow', 'in', 'to', 'any']),
642 call(['ufw', '--force', 'enable']),
643 call(['ufw', 'disable']),
644 call(['ufw', 'delete', 'allow', 'in', 'to', 'any']),
645 call(['ufw', 'delete', 'deny', '37017']),
646 ])
647646
648 def test_run_command_select_restart_unit(self):647 def test_run_command_select_restart_unit(self):
649 chaos = self._get_chaos_object(Kill(), Kill.restart_cmd)648 chaos = self._get_chaos_object(Kill(), Kill.restart_cmd)
@@ -657,6 +656,80 @@
657 self.assertEqual(mock.mock_calls, [call(['shutdown', '-r', 'now'])])656 self.assertEqual(mock.mock_calls, [call(['shutdown', '-r', 'now'])])
658 ri_mock.upstart.assert_called_once_with()657 ri_mock.upstart.assert_called_once_with()
659658
659 def test_replay_commands(self):
660 with patch('utility.check_output', autospec=True) as mock:
661 with temp_dir() as directory:
662 runner = Runner(directory, ChaosMonkey.factory())
663 with NamedTemporaryFile() as temp_file:
664 self._write_command_list_to_file(temp_file)
665 args = Namespace(replay=temp_file.name, restart=False)
666 runner.replay_commands(args)
667 expected = self._deny_port_call_list()
668 expected.extend(self._deny_port_call_list('17017'))
669 self.assertEqual(mock.mock_calls, expected)
670
671 def test_replay_commands_with_restart_command(self):
672 commands = "- [restart-unit, 1]\n- [deny-api-server, 1]\n"
673 with patch('utility.check_output', autospec=True) as mock:
674 with patch('runner.Init.install'):
675 with temp_dir() as directory:
676 runner = Runner(directory, ChaosMonkey.factory())
677 with NamedTemporaryFile() as temp_file:
678 self._write_command_list_to_file(
679 temp_file, data=commands)
680 args = Namespace(replay=temp_file.name, restart=False)
681 runner.replay_commands(args)
682 # Verify a temporary file is created because there
683 # is a restart-unit command in the list.
684 self.assertIs(os.path.isfile(
685 temp_file.name + runner.replay_filename_ext), True)
686 args = Namespace(replay=temp_file.name, restart=True)
687 file_content = runner.get_command_list(args)
688 # Verify the temporary files is deleted.
689 self.assertIsNot(os.path.isfile(
690 temp_file.name + runner.replay_filename_ext), True)
691 self.assertEqual(mock.mock_calls, [call(['shutdown', '-r', 'now'])])
692 self.assertEqual(file_content, [yaml.load(commands)[1]])
693
694 def test_replay_commands_after_reboot(self):
695 with patch('utility.check_output', autospec=True) as mock:
696 with temp_dir() as directory:
697 runner = Runner(directory, ChaosMonkey.factory())
698 temp_file = NamedTemporaryFile(
699 suffix=runner.replay_filename_ext, delete=False)
700 self._write_command_list_to_file(temp_file)
701 args = Namespace(
702 replay=temp_file.name.split('.')[0], restart=True)
703 runner.replay_commands(args)
704 # Verify replay_commands() has deleted the temp file.
705 self.assertIsNot(os.path.isfile(temp_file.name), True)
706 expected = self._deny_port_call_list()
707 expected.extend(self._deny_port_call_list('17017'))
708 self.assertEqual(mock.mock_calls, expected)
709
710 def test_get_command_list(self):
711 with temp_dir() as directory:
712 runner = Runner(directory, ChaosMonkey.factory())
713 with NamedTemporaryFile() as temp_file:
714 self._write_command_list_to_file(temp_file)
715 args = Namespace(replay=temp_file.name, restart=False)
716 commands = runner.get_command_list(args)
717 expected = [['deny-state-server', 1], ['deny-api-server', 1]]
718 self.assertItemsEqual(commands, expected)
719
720 def test_save_replay_command_list(self):
721 commands = [['deny-state-server', 1], ['deny-api-server', 1]]
722 with temp_dir() as directory:
723 runner = Runner(directory, ChaosMonkey.factory())
724 with NamedTemporaryFile(
725 suffix=runner.replay_filename_ext) as temp_file:
726 args = Namespace(replay=temp_file.name.split('.')[0],
727 restart=False)
728 runner.save_command_list(commands, args)
729 file_content = temp_file.read()
730 expected = yaml.dump(commands)
731 self.assertItemsEqual(file_content, expected)
732
660 def _get_chaos_object(self, obj, command_str):733 def _get_chaos_object(self, obj, command_str):
661 for chaos in obj.get_chaos():734 for chaos in obj.get_chaos():
662 if chaos.command_str == command_str:735 if chaos.command_str == command_str:
@@ -665,6 +738,22 @@
665 self.fail("'{}' chaos not found".format(command_str))738 self.fail("'{}' chaos not found".format(command_str))
666 return chaos739 return chaos
667740
741 def _write_command_list_to_file(self, fd, data=None):
742 data = ("- [deny-state-server, 1]\n- [deny-api-server, 1]\n"
743 if not data else data)
744 fd.write(data)
745 fd.flush()
746 return data
747
748 def _deny_port_call_list(self, port='37017'):
749 return [
750 call(['ufw', 'deny', port]),
751 call(['ufw', 'allow', 'in', 'to', 'any']),
752 call(['ufw', '--force', 'enable']),
753 call(['ufw', 'disable']),
754 call(['ufw', 'delete', 'allow', 'in', 'to', 'any']),
755 call(['ufw', 'delete', 'deny', port])]
756
668757
669def add_fake_group(chaos_monkey):758def add_fake_group(chaos_monkey):
670 chaos = Chaos(None, None, 'fake_group', 'fake_command_str', 'description')759 chaos = Chaos(None, None, 'fake_group', 'fake_command_str', 'description')

Subscribers

People subscribed via source and target branches