Merge lp:~julian-edwards/launchpad/disable-failures-bm-bug-676479 into lp:launchpad

Proposed by Julian Edwards
Status: Merged
Approved by: Jelmer Vernooij
Approved revision: no longer in the source branch.
Merged at revision: 11938
Proposed branch: lp:~julian-edwards/launchpad/disable-failures-bm-bug-676479
Merge into: lp:launchpad
Diff against target: 194 lines (+101/-7)
4 files modified
lib/lp/buildmaster/manager.py (+5/-3)
lib/lp/buildmaster/model/builder.py (+39/-3)
lib/lp/buildmaster/tests/test_builder.py (+54/-0)
lib/lp/code/model/recipebuilder.py (+3/-1)
To merge this branch: bzr merge lp:~julian-edwards/launchpad/disable-failures-bm-bug-676479
Reviewer Review Type Date Requested Status
Jelmer Vernooij (community) code Approve
Review via email: mp+41068@code.launchpad.net

Commit message

Make XMLRPC connections to builder slaves respect the config's timeout value.

Description of the change

This branch tries to fix an issue with the buildd-manager where it's disabling a lot of builders en-masse.

It appears to be a problem when dispatching a lot of recipe builds at once. These are known to take the builder into swap as the bzr checkout process is quite heavyweight. This makes the builder quite unresponsive.

The default Twisted TCP connection timeout is 30 seconds, which appears to not be enough to allow a heavily loaded builder slave to respond to an xmlrpc request. Twisted also does not allow you to override this timeout, so to fix this I've basically copied some chunks from the Proxy class and altered parts to supply the timeout value to reactor.connectTCP().

The new test relies on the fact that connectTCP() is never actually called, so we don't need to worry about getting other types of connection errors.

To post a comment you must log in.
Revision history for this message
Jelmer Vernooij (jelmer) wrote :

It's a pity that we need this, both because xmlrpc.Proxy should have its own timeout argument and because the slave really shouldn't take that long to respond. This seems like a reasonable way to deal with it for the moment though. Also, yay for more logging.

Two minor stylistic issues: the "from twisted.internet import ssl" on line 39 could be in the head of the file, and the 'd' variable should be longer according to our style guide.

review: Approve (code)

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'lib/lp/buildmaster/manager.py'
2--- lib/lp/buildmaster/manager.py 2010-10-28 15:04:15 +0000
3+++ lib/lp/buildmaster/manager.py 2010-11-17 21:52:53 +0000
4@@ -151,10 +151,12 @@
5 if failure.check(
6 BuildSlaveFailure, CannotBuild, BuildBehaviorMismatch,
7 CannotResumeHost, BuildDaemonError, CannotFetchFile):
8- self.logger.info("Scanning failed with: %s" % error_message)
9+ self.logger.info("Scanning %s failed with: %s" % (
10+ self.builder_name, error_message))
11 else:
12- self.logger.info("Scanning failed with: %s\n%s" %
13- (failure.getErrorMessage(), failure.getTraceback()))
14+ self.logger.info("Scanning %s failed with: %s\n%s" % (
15+ self.builder_name, failure.getErrorMessage(),
16+ failure.getTraceback()))
17
18 # Decide if we need to terminate the job or fail the
19 # builder.
20
21=== modified file 'lib/lp/buildmaster/model/builder.py'
22--- lib/lp/buildmaster/model/builder.py 2010-11-10 13:06:05 +0000
23+++ lib/lp/buildmaster/model/builder.py 2010-11-17 21:52:53 +0000
24@@ -8,6 +8,7 @@
25 __all__ = [
26 'Builder',
27 'BuilderSet',
28+ 'ProxyWithConnectionTimeout',
29 'rescueBuilderIfLost',
30 'updateBuilderStatus',
31 ]
32@@ -99,6 +100,41 @@
33 noisy = False
34
35
36+class ProxyWithConnectionTimeout(xmlrpc.Proxy):
37+ """Extend Twisted's Proxy to provide a configurable connection timeout."""
38+
39+ def __init__(self, url, user=None, password=None, allowNone=False,
40+ useDateTime=False, timeout=None):
41+ xmlrpc.Proxy.__init__(
42+ self, url, user, password, allowNone, useDateTime)
43+ if timeout is None:
44+ self.timeout = config.builddmaster.socket_timeout
45+ else:
46+ self.timeout = timeout
47+
48+ def callRemote(self, method, *args):
49+ """Basically a carbon copy of the parent but passes the timeout
50+ to connectTCP."""
51+
52+ def cancel(d):
53+ factory.deferred = None
54+ connector.disconnect()
55+ factory = self.queryFactory(
56+ self.path, self.host, method, self.user,
57+ self.password, self.allowNone, args, cancel, self.useDateTime)
58+ if self.secure:
59+ from twisted.internet import ssl
60+ connector = default_reactor.connectSSL(
61+ self.host, self.port or 443, factory,
62+ ssl.ClientContextFactory(),
63+ timeout=self.timeout)
64+ else:
65+ connector = default_reactor.connectTCP(
66+ self.host, self.port or 80, factory,
67+ timeout=self.timeout)
68+ return factory.deferred
69+
70+
71 class BuilderSlave(object):
72 """Add in a few useful methods for the XMLRPC slave.
73
74@@ -141,7 +177,7 @@
75 """
76 rpc_url = urlappend(builder_url.encode('utf-8'), 'rpc')
77 if proxy is None:
78- server_proxy = xmlrpc.Proxy(rpc_url, allowNone=True)
79+ server_proxy = ProxyWithConnectionTimeout(rpc_url, allowNone=True)
80 server_proxy.queryFactory = QuietQueryFactory
81 else:
82 server_proxy = proxy
83@@ -213,7 +249,7 @@
84 :param libraryfilealias: An `ILibraryFileAlias`.
85 """
86 url = libraryfilealias.http_url
87- logger.debug(
88+ logger.info(
89 "Asking builder on %s to ensure it has file %s (%s, %s)" % (
90 self._file_cache_url, libraryfilealias.filename, url,
91 libraryfilealias.content.sha1))
92@@ -432,7 +468,7 @@
93 return defer.fail(CannotResumeHost('Undefined vm_host.'))
94
95 logger = self._getSlaveScannerLogger()
96- logger.debug("Resuming %s (%s)" % (self.name, self.url))
97+ logger.info("Resuming %s (%s)" % (self.name, self.url))
98
99 d = self.slave.resume()
100 def got_resume_ok((stdout, stderr, returncode)):
101
102=== modified file 'lib/lp/buildmaster/tests/test_builder.py'
103--- lib/lp/buildmaster/tests/test_builder.py 2010-11-10 22:40:05 +0000
104+++ lib/lp/buildmaster/tests/test_builder.py 2010-11-17 21:52:53 +0000
105@@ -43,6 +43,10 @@
106 )
107 from lp.buildmaster.interfaces.buildqueue import IBuildQueueSet
108 from lp.buildmaster.interfaces.builder import CannotResumeHost
109+from lp.buildmaster.model.builder import (
110+ BuilderSlave,
111+ ProxyWithConnectionTimeout,
112+ )
113 from lp.buildmaster.model.buildfarmjobbehavior import IdleBuildBehavior
114 from lp.buildmaster.model.buildqueue import BuildQueue
115 from lp.buildmaster.tests.mock_slaves import (
116@@ -1059,6 +1063,56 @@
117 self.slave.build(None, None, None, None, None))
118
119
120+class TestSlaveConnectionTimeouts(TrialTestCase):
121+ # Testing that we can override the default 30 second connection
122+ # timeout.
123+
124+ layer = TwistedLayer
125+
126+ def setUp(self):
127+ super(TestSlaveConnectionTimeouts, self).setUp()
128+ self.slave_helper = SlaveTestHelpers()
129+ self.slave_helper.setUp()
130+ self.addCleanup(self.slave_helper.cleanUp)
131+ self.clock = Clock()
132+ self.proxy = ProxyWithConnectionTimeout("fake_url")
133+ self.slave = self.slave_helper.getClientSlave(
134+ reactor=self.clock, proxy=self.proxy)
135+
136+ def test_connection_timeout(self):
137+ # The default timeout of 30 seconds should not cause a timeout,
138+ # only the config value should.
139+ timeout_config = """
140+ [builddmaster]
141+ socket_timeout: 180
142+ """
143+ config.push('timeout', timeout_config)
144+ self.addCleanup(config.pop, 'timeout')
145+
146+ d = self.slave.echo()
147+ # Advance past the 30 second timeout. The real reactor will
148+ # never call connectTCP() since we're not spinning it up. This
149+ # avoids "connection refused" errors and simulates an
150+ # environment where the endpoint doesn't respond.
151+ self.clock.advance(31)
152+ self.assertFalse(d.called)
153+
154+ # Now advance past the real socket timeout and expect a
155+ # Failure.
156+
157+ def got_timeout(failure):
158+ self.assertIsInstance(failure.value, CancelledError)
159+
160+ d.addBoth(got_timeout)
161+ self.clock.advance(config.builddmaster.socket_timeout + 1)
162+ self.assertTrue(d.called)
163+
164+ def test_BuilderSlave_uses_ProxyWithConnectionTimeout(self):
165+ # Make sure that BuilderSlaves use the custom proxy class.
166+ slave = BuilderSlave.makeBuilderSlave("url", "host")
167+ self.assertIsInstance(slave._server, ProxyWithConnectionTimeout)
168+
169+
170 class TestSlaveWithLibrarian(TrialTestCase):
171 """Tests that need more of Launchpad to run."""
172
173
174=== modified file 'lib/lp/code/model/recipebuilder.py'
175--- lib/lp/code/model/recipebuilder.py 2010-10-27 14:25:19 +0000
176+++ lib/lp/code/model/recipebuilder.py 2010-11-17 21:52:53 +0000
177@@ -122,6 +122,8 @@
178 if chroot is None:
179 raise CannotBuild("Unable to find a chroot for %s" %
180 distroarchseries.displayname)
181+ logger.info(
182+ "Sending chroot file for recipe build to %s" % self._builder.name)
183 d = self._builder.slave.cacheFile(logger, chroot)
184
185 def got_cache_file(ignored):
186@@ -131,7 +133,7 @@
187 buildid = "%s-%s" % (self.build.id, build_queue_id)
188 cookie = self.buildfarmjob.generateSlaveBuildCookie()
189 chroot_sha1 = chroot.content.sha1
190- logger.debug(
191+ logger.info(
192 "Initiating build %s on %s" % (buildid, self._builder.url))
193
194 return self._builder.slave.build(