Merge lp:~justin-fathomdb/nova/where-has-my-instance-gone into lp:~hudson-openstack/nova/trunk

Proposed by justinsb
Status: Merged
Approved by: Devin Carlen
Approved revision: 917
Merged at revision: 960
Proposed branch: lp:~justin-fathomdb/nova/where-has-my-instance-gone
Merge into: lp:~hudson-openstack/nova/trunk
Diff against target: 143 lines (+48/-19)
3 files modified
nova/compute/manager.py (+2/-6)
nova/tests/test_compute.py (+2/-1)
nova/virt/libvirt_conn.py (+44/-12)
To merge this branch: bzr merge lp:~justin-fathomdb/nova/where-has-my-instance-gone
Reviewer (Review Type): Status
Devin Carlen (community): Approve
Vish Ishaya (community): Approve
Review via email: mp+55450@code.launchpad.net

Description of the change

Keep guest instances when libvirt host restarts
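
The gist of the change, as a minimal sketch against the libvirt Python bindings (conn and the domain XML are assumed to come from the existing libvirt_conn plumbing; the helper name here is illustrative): rather than creating a transient domain with createXML, which libvirt forgets when the guest shuts down or the host reboots, the driver defines a persistent domain, launches it, and optionally marks it for autostart.

    import libvirt

    def create_persistent_domain(conn, xml, autostart=False):
        # Define the domain so libvirt remembers it across guest
        # shutdowns and host reboots, then boot it.
        domain = conn.defineXML(xml)
        domain.createWithFlags(0)
        if autostart:
            # Ask libvirtd to start the guest again when the host boots
            # (this is what the new start_guests_on_host_boot flag enables).
            domain.setAutostart(1)
        return domain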

Revision history for this message
justinsb (justin-fathomdb) wrote :

I haven't been able to test this in production as much as I'd like, but it fixes a fairly critical bug, so I'm proposing it for merge.

Revision history for this message
Vish Ishaya (vishvananda) wrote :

I'd really like to see this go in soon.

review: Approve
Revision history for this message
Devin Carlen (devcamcar) wrote :

It's generally our policy not to leave commented-out code blocks in. We have the history in bzr, so we can always go back and find it if we need to.

14 + # NOTE(justinsb): We no longer auto-remove SHUTOFF instances
15 + # It's quite hard to get them back when we do.
16 + #if vm_state == power_state.SHUTOFF:
17 + # # TODO(soren): This is what the compute manager does when you
18 + # # terminate an instance. At some point I figure we'll have a
19 + # # "terminated" state and some sort of cleanup job that runs
20 + # # occasionally, cleaning them out.
21 + # self.db.instance_destroy(context, db_instance['id'])

59 + # NOTE(justinsb): We no longer delete these instances,
60 + # the user may want to power them back on
61 + #if state == power_state.SHUTOFF:
62 + # # TODO(soren): This is what the compute manager does when you
63 + # # terminate # an instance. At some point I figure we'll have a
64 + # # "terminated" state and some sort of cleanup job that runs
65 + # # occasionally, cleaning them out.
66 + # db.instance_destroy(ctxt, instance['id'])

This looks good, and I'll approve once this is cleaned up.

Revision history for this message
justinsb (justin-fathomdb) wrote :

Removed the commented-out code and fixed the lookup to check the correct libvirt error code.
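
For reference, the reworked lookup distinguishes "no such domain" from other libvirt failures roughly as follows (a condensed sketch of the hunk in the preview diff below; exception.NotFound and _ are nova's usual imports):

    import libvirt

    def lookup_domain(conn, instance_name):
        try:
            return conn.lookupByName(instance_name)
        except libvirt.libvirtError as e:
            errcode = e.get_error_code()
            if errcode == libvirt.VIR_ERR_NO_DOMAIN:
                # The guest genuinely does not exist.
                raise exception.NotFound(_("Instance %s not found")
                                         % instance_name)
            # Anything else (e.g. libvirtd not running) is re-raised
            # rather than being masked as NotFound.
            raise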

Revision history for this message
Devin Carlen (devcamcar) wrote :

lgtm

review: Approve

Preview Diff

1=== modified file 'nova/compute/manager.py'
2--- nova/compute/manager.py 2011-03-30 05:59:13 +0000
3+++ nova/compute/manager.py 2011-04-07 22:54:36 +0000
4@@ -1097,12 +1097,8 @@
5 db_instance['id'],
6 vm_state)
7
8- if vm_state == power_state.SHUTOFF:
9- # TODO(soren): This is what the compute manager does when you
10- # terminate an instance. At some point I figure we'll have a
11- # "terminated" state and some sort of cleanup job that runs
12- # occasionally, cleaning them out.
13- self.db.instance_destroy(context, db_instance['id'])
14+ # NOTE(justinsb): We no longer auto-remove SHUTOFF instances
15+ # It's quite hard to get them back when we do.
16
17 # Are there VMs not in the DB?
18 for vm_not_found_in_db in vms_not_found_in_db:
19
20=== modified file 'nova/tests/test_compute.py'
21--- nova/tests/test_compute.py 2011-04-04 20:17:04 +0000
22+++ nova/tests/test_compute.py 2011-04-07 22:54:36 +0000
23@@ -666,4 +666,5 @@
24
25 instances = db.instance_get_all(context.get_admin_context())
26 LOG.info(_("After force-killing instances: %s"), instances)
27- self.assertEqual(len(instances), 0)
28+ self.assertEqual(len(instances), 1)
29+ self.assertEqual(power_state.SHUTOFF, instances[0]['state'])
30
31=== modified file 'nova/virt/libvirt_conn.py'
32--- nova/virt/libvirt_conn.py 2011-04-07 21:48:29 +0000
33+++ nova/virt/libvirt_conn.py 2011-04-07 22:54:36 +0000
34@@ -117,6 +117,8 @@
35 'Define live migration behavior')
36 flags.DEFINE_string('qemu_img', 'qemu-img',
37 'binary to use for qemu-img commands')
38+flags.DEFINE_bool('start_guests_on_host_boot', False,
39+ 'Whether to restart guests when the host reboots')
40
41
42 def get_connection(read_only):
43@@ -231,12 +233,8 @@
44 {'name': instance['name'], 'state': state})
45 db.instance_set_state(ctxt, instance['id'], state)
46
47- if state == power_state.SHUTOFF:
48- # TODO(soren): This is what the compute manager does when you
49- # terminate # an instance. At some point I figure we'll have a
50- # "terminated" state and some sort of cleanup job that runs
51- # occasionally, cleaning them out.
52- db.instance_destroy(ctxt, instance['id'])
53+ # NOTE(justinsb): We no longer delete SHUTOFF instances,
54+ # the user may want to power them back on
55
56 if state != power_state.RUNNING:
57 continue
58@@ -475,7 +473,7 @@
59 xml = self.to_xml(instance)
60 self.firewall_driver.setup_basic_filtering(instance)
61 self.firewall_driver.prepare_instance_filter(instance)
62- self._conn.createXML(xml, 0)
63+ self._create_new_domain(xml)
64 self.firewall_driver.apply_instance_filter(instance)
65
66 timer = utils.LoopingCall(f=None)
67@@ -523,7 +521,7 @@
68 'kernel_id': FLAGS.rescue_kernel_id,
69 'ramdisk_id': FLAGS.rescue_ramdisk_id}
70 self._create_image(instance, xml, '.rescue', rescue_images)
71- self._conn.createXML(xml, 0)
72+ self._create_new_domain(xml)
73
74 timer = utils.LoopingCall(f=None)
75
76@@ -566,10 +564,15 @@
77 self.firewall_driver.setup_basic_filtering(instance, network_info)
78 self.firewall_driver.prepare_instance_filter(instance, network_info)
79 self._create_image(instance, xml, network_info)
80- self._conn.createXML(xml, 0)
81+ domain = self._create_new_domain(xml)
82 LOG.debug(_("instance %s: is running"), instance['name'])
83 self.firewall_driver.apply_instance_filter(instance)
84
85+ if FLAGS.start_guests_on_host_boot:
86+ LOG.debug(_("instance %s: setting autostart ON") %
87+ instance['name'])
88+ domain.setAutostart(1)
89+
90 timer = utils.LoopingCall(f=None)
91
92 def _wait_for_boot():
93@@ -989,11 +992,22 @@
94 return xml
95
96 def get_info(self, instance_name):
97+ # NOTE(justinsb): When libvirt isn't running / can't connect, we get:
98+ # libvir: Remote error : unable to connect to
99+ # '/var/run/libvirt/libvirt-sock', libvirtd may need to be started:
100+ # No such file or directory
101 try:
102 virt_dom = self._conn.lookupByName(instance_name)
103- except:
104- raise exception.NotFound(_("Instance %s not found")
105- % instance_name)
106+ except libvirt.libvirtError as e:
107+ errcode = e.get_error_code()
108+ if errcode == libvirt.VIR_ERR_NO_DOMAIN:
109+ raise exception.NotFound(_("Instance %s not found")
110+ % instance_name)
111+ LOG.warning(_("Error from libvirt during lookup. "
112+ "Code=%(errcode)s Error=%(e)s") %
113+ locals())
114+ raise
115+
116 (state, max_mem, mem, num_cpu, cpu_time) = virt_dom.info()
117 return {'state': state,
118 'max_mem': max_mem,
119@@ -1001,6 +1015,24 @@
120 'num_cpu': num_cpu,
121 'cpu_time': cpu_time}
122
123+ def _create_new_domain(self, xml, persistent=True, launch_flags=0):
124+ # NOTE(justinsb): libvirt has two types of domain:
125+ # * a transient domain disappears when the guest is shutdown
126+ # or the host is rebooted.
127+ # * a permanent domain is not automatically deleted
128+ # NOTE(justinsb): Even for ephemeral instances, transient seems risky
129+
130+ if persistent:
131+ # To create a persistent domain, first define it, then launch it.
132+ domain = self._conn.defineXML(xml)
133+
134+ domain.createWithFlags(launch_flags)
135+ else:
136+ # createXML call creates a transient domain
137+ domain = self._conn.createXML(xml, launch_flags)
138+
139+ return domain
140+
141 def get_diagnostics(self, instance_name):
142 raise exception.ApiError(_("diagnostics are not supported "
143 "for libvirt"))