Merge ~chad.smith/cloud-init:fix-dhclient-absent-pidfile into cloud-init:master

Proposed by Chad Smith on 2017-11-30
Status: Merged
Merged at revision: 16ad7eacc3890d219a337ca5d956f696e172854d
Proposed branch: ~chad.smith/cloud-init:fix-dhclient-absent-pidfile
Merge into: cloud-init:master
Diff against target: 320 lines (+118/-48)
5 files modified
cloudinit/net/dhcp.py (+29/-15)
cloudinit/net/tests/test_dhcp.py (+61/-5)
cloudinit/sources/DataSourceAzure.py (+3/-26)
cloudinit/util.py (+22/-0)
tests/unittests/test_datasource/test_azure.py (+3/-2)
Reviewer Review Type Date Requested Status
Server Team CI bot continuous-integration Approve on 2017-11-30
Scott Moser 2017-11-30 Approve on 2017-11-30
Review via email: mp+334543@code.launchpad.net

Description of the Change

ec2: Fix sandboxed dhclient background process cleanup.

There is a race condition where our sandboxed dhclient properly writes a lease file but has not yet written a pid file. If the sandbox temporary directory is torn down before the dhclient subprocess writes a pidfile DataSourceEc2Local gets a traceback and the instance will fallback to DataSourceEc2 in the init-network stage. This wastes boot cycles we'd rather not spend.

Fix handling of sandboxed dhclient to wait for both pidfile and leasefile before proceding. If either file doesn't show in 5 seconds, log a warning and return empty lease results {}.

LP: #1735331

To post a comment you must log in.
Chad Smith (chad.smith) wrote :

Validated on ec2 with a deb built from this branch

looping on
sudo python3 -c 'from cloudinit.net.dhcp import maybe_perform_dhcp_discovery; maybe_perform_dhcp_discovery()'
produces no errors and leaks no sandboxed dhclient background processes

PASSED: Continuous integration, rev:c4348d7228dd50fc270422e956939bd48514ae57
https://jenkins.ubuntu.com/server/job/cloud-init-ci/563/
Executed test runs:
    SUCCESS: Checkout
    SUCCESS: Unit & Style Tests
    SUCCESS: Ubuntu LTS: Build
    SUCCESS: Ubuntu LTS: Integration
    SUCCESS: MAAS Compatability Testing
    IN_PROGRESS: Declarative: Post Actions

Click here to trigger a rebuild:
https://jenkins.ubuntu.com/server/job/cloud-init-ci/563/rebuild

review: Approve (continuous-integration)
Scott Moser (smoser) wrote :

some things inline.

review: Needs Fixing
4638684... by Chad Smith on 2017-11-30

fix incorrect return values from dhcp_discovery and maybe_perform_dhcp_discovery to be [] instead of {}

16ad7ea... by Chad Smith on 2017-11-30

debug instead of info message

Chad Smith (chad.smith) wrote :

Pushed changes per review comments, all good suggestions thx

Scott Moser (smoser) wrote :

thanks

review: Approve

PASSED: Continuous integration, rev:16ad7eacc3890d219a337ca5d956f696e172854d
https://jenkins.ubuntu.com/server/job/cloud-init-ci/565/
Executed test runs:
    SUCCESS: Checkout
    SUCCESS: Unit & Style Tests
    SUCCESS: Ubuntu LTS: Build
    SUCCESS: Ubuntu LTS: Integration
    SUCCESS: MAAS Compatability Testing
    IN_PROGRESS: Declarative: Post Actions

Click here to trigger a rebuild:
https://jenkins.ubuntu.com/server/job/cloud-init-ci/565/rebuild

review: Approve (continuous-integration)

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1diff --git a/cloudinit/net/dhcp.py b/cloudinit/net/dhcp.py
2index d8624d8..875a460 100644
3--- a/cloudinit/net/dhcp.py
4+++ b/cloudinit/net/dhcp.py
5@@ -36,22 +36,23 @@ def maybe_perform_dhcp_discovery(nic=None):
6 skip dhcp_discovery and return an empty dict.
7
8 @param nic: Name of the network interface we want to run dhclient on.
9- @return: A dict of dhcp options from the dhclient discovery if run,
10- otherwise an empty dict is returned.
11+ @return: A list of dicts representing dhcp options for each lease obtained
12+ from the dhclient discovery if run, otherwise an empty list is
13+ returned.
14 """
15 if nic is None:
16 nic = find_fallback_nic()
17 if nic is None:
18 LOG.debug('Skip dhcp_discovery: Unable to find fallback nic.')
19- return {}
20+ return []
21 elif nic not in get_devicelist():
22 LOG.debug(
23 'Skip dhcp_discovery: nic %s not found in get_devicelist.', nic)
24- return {}
25+ return []
26 dhclient_path = util.which('dhclient')
27 if not dhclient_path:
28 LOG.debug('Skip dhclient configuration: No dhclient command found.')
29- return {}
30+ return []
31 with temp_utils.tempdir(prefix='cloud-init-dhcp-', needs_exe=True) as tdir:
32 # Use /var/tmp because /run/cloud-init/tmp is mounted noexec
33 return dhcp_discovery(dhclient_path, nic, tdir)
34@@ -60,8 +61,8 @@ def maybe_perform_dhcp_discovery(nic=None):
35 def parse_dhcp_lease_file(lease_file):
36 """Parse the given dhcp lease file for the most recent lease.
37
38- Return a dict of dhcp options as key value pairs for the most recent lease
39- block.
40+ Return a list of dicts of dhcp options. Each dict contains key value pairs
41+ a specific lease in order from oldest to newest.
42
43 @raises: InvalidDHCPLeaseFileError on empty of unparseable leasefile
44 content.
45@@ -96,8 +97,8 @@ def dhcp_discovery(dhclient_cmd_path, interface, cleandir):
46 @param cleandir: The directory from which to run dhclient as well as store
47 dhcp leases.
48
49- @return: A dict of dhcp options parsed from the dhcp.leases file or empty
50- dict.
51+ @return: A list of dicts of representing the dhcp leases parsed from the
52+ dhcp.leases file or empty list.
53 """
54 LOG.debug('Performing a dhcp discovery on %s', interface)
55
56@@ -119,13 +120,26 @@ def dhcp_discovery(dhclient_cmd_path, interface, cleandir):
57 cmd = [sandbox_dhclient_cmd, '-1', '-v', '-lf', lease_file,
58 '-pf', pid_file, interface, '-sf', '/bin/true']
59 util.subp(cmd, capture=True)
60- pid = None
61+
62+ # dhclient doesn't write a pid file until after it forks when it gets a
63+ # proper lease response. Since cleandir is a temp directory that gets
64+ # removed, we need to wait for that pidfile creation before the
65+ # cleandir is removed, otherwise we get FileNotFound errors.
66+ missing = util.wait_for_files(
67+ [pid_file, lease_file], maxwait=5, naplen=0.01)
68+ if missing:
69+ LOG.warning("dhclient did not produce expected files: %s",
70+ ', '.join(os.path.basename(f) for f in missing))
71+ return []
72+ pid_content = util.load_file(pid_file).strip()
73 try:
74- pid = int(util.load_file(pid_file).strip())
75- return parse_dhcp_lease_file(lease_file)
76- finally:
77- if pid:
78- os.kill(pid, signal.SIGKILL)
79+ pid = int(pid_content)
80+ except ValueError:
81+ LOG.debug(
82+ "pid file contains non-integer content '%s'", pid_content)
83+ else:
84+ os.kill(pid, signal.SIGKILL)
85+ return parse_dhcp_lease_file(lease_file)
86
87
88 def networkd_parse_lease(content):
89diff --git a/cloudinit/net/tests/test_dhcp.py b/cloudinit/net/tests/test_dhcp.py
90index 3d8e15c..db25b6f 100644
91--- a/cloudinit/net/tests/test_dhcp.py
92+++ b/cloudinit/net/tests/test_dhcp.py
93@@ -1,6 +1,5 @@
94 # This file is part of cloud-init. See LICENSE file for license information.
95
96-import mock
97 import os
98 import signal
99 from textwrap import dedent
100@@ -9,7 +8,8 @@ from cloudinit.net.dhcp import (
101 InvalidDHCPLeaseFileError, maybe_perform_dhcp_discovery,
102 parse_dhcp_lease_file, dhcp_discovery, networkd_load_leases)
103 from cloudinit.util import ensure_file, write_file
104-from cloudinit.tests.helpers import CiTestCase, wrap_and_call, populate_dir
105+from cloudinit.tests.helpers import (
106+ CiTestCase, mock, populate_dir, wrap_and_call)
107
108
109 class TestParseDHCPLeasesFile(CiTestCase):
110@@ -69,14 +69,14 @@ class TestDHCPDiscoveryClean(CiTestCase):
111 def test_no_fallback_nic_found(self, m_fallback_nic):
112 """Log and do nothing when nic is absent and no fallback is found."""
113 m_fallback_nic.return_value = None # No fallback nic found
114- self.assertEqual({}, maybe_perform_dhcp_discovery())
115+ self.assertEqual([], maybe_perform_dhcp_discovery())
116 self.assertIn(
117 'Skip dhcp_discovery: Unable to find fallback nic.',
118 self.logs.getvalue())
119
120 def test_provided_nic_does_not_exist(self):
121 """When the provided nic doesn't exist, log a message and no-op."""
122- self.assertEqual({}, maybe_perform_dhcp_discovery('idontexist'))
123+ self.assertEqual([], maybe_perform_dhcp_discovery('idontexist'))
124 self.assertIn(
125 'Skip dhcp_discovery: nic idontexist not found in get_devicelist.',
126 self.logs.getvalue())
127@@ -87,7 +87,7 @@ class TestDHCPDiscoveryClean(CiTestCase):
128 """When dhclient doesn't exist in the OS, log the issue and no-op."""
129 m_fallback.return_value = 'eth9'
130 m_which.return_value = None # dhclient isn't found
131- self.assertEqual({}, maybe_perform_dhcp_discovery())
132+ self.assertEqual([], maybe_perform_dhcp_discovery())
133 self.assertIn(
134 'Skip dhclient configuration: No dhclient command found.',
135 self.logs.getvalue())
136@@ -117,6 +117,62 @@ class TestDHCPDiscoveryClean(CiTestCase):
137
138 @mock.patch('cloudinit.net.dhcp.os.kill')
139 @mock.patch('cloudinit.net.dhcp.util.subp')
140+ def test_dhcp_discovery_run_in_sandbox_warns_invalid_pid(self, m_subp,
141+ m_kill):
142+ """dhcp_discovery logs a warning when pidfile contains invalid content.
143+
144+ Lease processing still occurs and no proc kill is attempted.
145+ """
146+ tmpdir = self.tmp_dir()
147+ dhclient_script = os.path.join(tmpdir, 'dhclient.orig')
148+ script_content = '#!/bin/bash\necho fake-dhclient'
149+ write_file(dhclient_script, script_content, mode=0o755)
150+ write_file(self.tmp_path('dhclient.pid', tmpdir), '') # Empty pid ''
151+ lease_content = dedent("""
152+ lease {
153+ interface "eth9";
154+ fixed-address 192.168.2.74;
155+ option subnet-mask 255.255.255.0;
156+ option routers 192.168.2.1;
157+ }
158+ """)
159+ write_file(self.tmp_path('dhcp.leases', tmpdir), lease_content)
160+
161+ self.assertItemsEqual(
162+ [{'interface': 'eth9', 'fixed-address': '192.168.2.74',
163+ 'subnet-mask': '255.255.255.0', 'routers': '192.168.2.1'}],
164+ dhcp_discovery(dhclient_script, 'eth9', tmpdir))
165+ self.assertIn(
166+ "pid file contains non-integer content ''", self.logs.getvalue())
167+ m_kill.assert_not_called()
168+
169+ @mock.patch('cloudinit.net.dhcp.os.kill')
170+ @mock.patch('cloudinit.net.dhcp.util.wait_for_files')
171+ @mock.patch('cloudinit.net.dhcp.util.subp')
172+ def test_dhcp_discovery_run_in_sandbox_waits_on_lease_and_pid(self,
173+ m_subp,
174+ m_wait,
175+ m_kill):
176+ """dhcp_discovery waits for the presence of pidfile and dhcp.leases."""
177+ tmpdir = self.tmp_dir()
178+ dhclient_script = os.path.join(tmpdir, 'dhclient.orig')
179+ script_content = '#!/bin/bash\necho fake-dhclient'
180+ write_file(dhclient_script, script_content, mode=0o755)
181+ # Don't create pid or leases file
182+ pidfile = self.tmp_path('dhclient.pid', tmpdir)
183+ leasefile = self.tmp_path('dhcp.leases', tmpdir)
184+ m_wait.return_value = [pidfile] # Return the missing pidfile wait for
185+ self.assertEqual([], dhcp_discovery(dhclient_script, 'eth9', tmpdir))
186+ self.assertEqual(
187+ mock.call([pidfile, leasefile], maxwait=5, naplen=0.01),
188+ m_wait.call_args_list[0])
189+ self.assertIn(
190+ 'WARNING: dhclient did not produce expected files: dhclient.pid',
191+ self.logs.getvalue())
192+ m_kill.assert_not_called()
193+
194+ @mock.patch('cloudinit.net.dhcp.os.kill')
195+ @mock.patch('cloudinit.net.dhcp.util.subp')
196 def test_dhcp_discovery_run_in_sandbox(self, m_subp, m_kill):
197 """dhcp_discovery brings up the interface and runs dhclient.
198
199diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py
200index 8c3492d..14367e9 100644
201--- a/cloudinit/sources/DataSourceAzure.py
202+++ b/cloudinit/sources/DataSourceAzure.py
203@@ -11,7 +11,6 @@ from functools import partial
204 import os
205 import os.path
206 import re
207-import time
208 from xml.dom import minidom
209 import xml.etree.ElementTree as ET
210
211@@ -321,7 +320,7 @@ class DataSourceAzure(sources.DataSource):
212 # https://bugs.launchpad.net/cloud-init/+bug/1717611
213 missing = util.log_time(logfunc=LOG.debug,
214 msg="waiting for SSH public key files",
215- func=wait_for_files,
216+ func=util.wait_for_files,
217 args=(fp_files, 900))
218
219 if len(missing):
220@@ -556,8 +555,8 @@ def address_ephemeral_resize(devpath=RESOURCE_DISK_PATH, maxwait=120,
221 is_new_instance=False):
222 # wait for ephemeral disk to come up
223 naplen = .2
224- missing = wait_for_files([devpath], maxwait=maxwait, naplen=naplen,
225- log_pre="Azure ephemeral disk: ")
226+ missing = util.wait_for_files([devpath], maxwait=maxwait, naplen=naplen,
227+ log_pre="Azure ephemeral disk: ")
228
229 if missing:
230 LOG.warning("ephemeral device '%s' did not appear after %d seconds.",
231@@ -639,28 +638,6 @@ def pubkeys_from_crt_files(flist):
232 return pubkeys
233
234
235-def wait_for_files(flist, maxwait, naplen=.5, log_pre=""):
236- need = set(flist)
237- waited = 0
238- while True:
239- need -= set([f for f in need if os.path.exists(f)])
240- if len(need) == 0:
241- LOG.debug("%sAll files appeared after %s seconds: %s",
242- log_pre, waited, flist)
243- return []
244- if waited == 0:
245- LOG.info("%sWaiting up to %s seconds for the following files: %s",
246- log_pre, maxwait, flist)
247- if waited + naplen > maxwait:
248- break
249- time.sleep(naplen)
250- waited += naplen
251-
252- LOG.warning("%sStill missing files after %s seconds: %s",
253- log_pre, maxwait, need)
254- return need
255-
256-
257 def write_files(datadir, files, dirmode=None):
258
259 def _redact_password(cnt, fname):
260diff --git a/cloudinit/util.py b/cloudinit/util.py
261index e1290aa..6c014ba 100644
262--- a/cloudinit/util.py
263+++ b/cloudinit/util.py
264@@ -2541,4 +2541,26 @@ def load_shell_content(content, add_empty=False, empty_val=None):
265 return data
266
267
268+def wait_for_files(flist, maxwait, naplen=.5, log_pre=""):
269+ need = set(flist)
270+ waited = 0
271+ while True:
272+ need -= set([f for f in need if os.path.exists(f)])
273+ if len(need) == 0:
274+ LOG.debug("%sAll files appeared after %s seconds: %s",
275+ log_pre, waited, flist)
276+ return []
277+ if waited == 0:
278+ LOG.debug("%sWaiting up to %s seconds for the following files: %s",
279+ log_pre, maxwait, flist)
280+ if waited + naplen > maxwait:
281+ break
282+ time.sleep(naplen)
283+ waited += naplen
284+
285+ LOG.debug("%sStill missing files after %s seconds: %s",
286+ log_pre, maxwait, need)
287+ return need
288+
289+
290 # vi: ts=4 expandtab
291diff --git a/tests/unittests/test_datasource/test_azure.py b/tests/unittests/test_datasource/test_azure.py
292index 0a11777..7cb1812 100644
293--- a/tests/unittests/test_datasource/test_azure.py
294+++ b/tests/unittests/test_datasource/test_azure.py
295@@ -171,7 +171,6 @@ scbus-1 on xpt0 bus 0
296 self.apply_patches([
297 (dsaz, 'list_possible_azure_ds_devs', dsdevs),
298 (dsaz, 'invoke_agent', _invoke_agent),
299- (dsaz, 'wait_for_files', _wait_for_files),
300 (dsaz, 'pubkeys_from_crt_files', _pubkeys_from_crt_files),
301 (dsaz, 'perform_hostname_bounce', mock.MagicMock()),
302 (dsaz, 'get_hostname', mock.MagicMock()),
303@@ -179,6 +178,8 @@ scbus-1 on xpt0 bus 0
304 (dsaz, 'get_metadata_from_fabric', self.get_metadata_from_fabric),
305 (dsaz.util, 'read_dmi_data', mock.MagicMock(
306 side_effect=_dmi_mocks)),
307+ (dsaz.util, 'wait_for_files', mock.MagicMock(
308+ side_effect=_wait_for_files)),
309 ])
310
311 dsrc = dsaz.DataSourceAzure(
312@@ -647,7 +648,7 @@ class TestAzureBounce(TestCase):
313 self.patches.enter_context(
314 mock.patch.object(dsaz, 'invoke_agent'))
315 self.patches.enter_context(
316- mock.patch.object(dsaz, 'wait_for_files'))
317+ mock.patch.object(dsaz.util, 'wait_for_files'))
318 self.patches.enter_context(
319 mock.patch.object(dsaz, 'list_possible_azure_ds_devs',
320 mock.MagicMock(return_value=[])))

Subscribers

People subscribed via source and target branches