Merge ~cjwatson/launchpad:librarian-timeouts into launchpad:master

Proposed by Colin Watson
Status: Merged
Approved by: Colin Watson
Approved revision: f3e4e01c00921b293a27c588640a68e799a8525f
Merge reported by: Otto Co-Pilot
Merged at revision: not available
Proposed branch: ~cjwatson/launchpad:librarian-timeouts
Merge into: launchpad:master
Diff against target: 146 lines (+69/-8)
3 files modified
lib/lp/services/config/schema-lazr.conf (+4/-0)
lib/lp/services/librarian/client.py (+16/-8)
lib/lp/services/librarian/tests/test_client.py (+49/-0)
Reviewer Review Type Date Requested Status
Jürgen Gmach Approve
Review via email: mp+433219@code.launchpad.net

Commit message

Add a socket timeout for librarian client connections

Description of the change

The internal librarian client creates a socket with no timeout, so if anything goes wrong with the connection then it only recovers by being killed manually. Add a socket timeout so that it gives up eventually. (Note that this timeout applies to individual operations on the socket, not to uploading or downloading entire files.)

The default timeout is deliberately quite generous to minimize the chance of causing failures due to connections being merely slow rather than stuck, although we may still find we need to tune it. This isn't intended to ensure that librarian requests fit within timeout budgets set for web requests or similar, but just as a backstop to ensure that we don't get stuck waiting forever.

To post a comment you must log in.
Revision history for this message
Jürgen Gmach (jugmac00) wrote :

I do trust you know what you are doing, but I would appreciate some clarifying answers/comments.

review: Approve
Revision history for this message
Colin Watson (cjwatson) :

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
diff --git a/lib/lp/services/config/schema-lazr.conf b/lib/lp/services/config/schema-lazr.conf
index 640865d..ca906a6 100644
--- a/lib/lp/services/config/schema-lazr.conf
+++ b/lib/lp/services/config/schema-lazr.conf
@@ -1269,6 +1269,10 @@ authentication_endpoint: none
1269# authserver.1269# authserver.
1270authentication_timeout: 51270authentication_timeout: 5
12711271
1272# The timeout in seconds for librarian client socket operations.
1273# datatype: integer
1274client_socket_timeout: 60
1275
12721276
1273[librarian_gc]1277[librarian_gc]
1274# The database user which will be used by this process.1278# The database user which will be used by this process.
diff --git a/lib/lp/services/librarian/client.py b/lib/lp/services/librarian/client.py
index 7be460f..80ce5c6 100644
--- a/lib/lp/services/librarian/client.py
+++ b/lib/lp/services/librarian/client.py
@@ -83,6 +83,7 @@ class FileUploadClient:
83 """83 """
84 try:84 try:
85 self.state.s = socket.socket(AF_INET, SOCK_STREAM)85 self.state.s = socket.socket(AF_INET, SOCK_STREAM)
86 self.state.s.settimeout(config.librarian.client_socket_timeout)
86 self.state.s.connect((self.upload_host, self.upload_port))87 self.state.s.connect((self.upload_host, self.upload_port))
87 self.state.f = self.state.s.makefile("rwb", 0)88 self.state.f = self.state.s.makefile("rwb", 0)
8889
@@ -96,13 +97,16 @@ class FileUploadClient:
9697
97 def _close(self):98 def _close(self):
98 """Close connection"""99 """Close connection"""
99 self.state.s_poll.unregister(self.state.s.fileno())100 if hasattr(self.state, "s_poll"):
100 self.state.s_poll.close()101 self.state.s_poll.unregister(self.state.s.fileno())
101 del self.state.s_poll102 self.state.s_poll.close()
102 self.state.s.close()103 del self.state.s_poll
103 del self.state.s104 if hasattr(self.state, "s"):
104 self.state.f.close()105 self.state.s.close()
105 del self.state.f106 del self.state.s
107 if hasattr(self.state, "f"):
108 self.state.f.close()
109 del self.state.f
106110
107 def _checkError(self):111 def _checkError(self):
108 poll_result = self.state.s_poll.poll(self.s_poll_timeout)112 poll_result = self.state.s_poll.poll(self.s_poll_timeout)
@@ -171,8 +175,9 @@ class FileUploadClient:
171 LibraryFileContent,175 LibraryFileContent,
172 )176 )
173177
174 self._connect()
175 try:178 try:
179 self._connect()
180
176 # Get the name of the database the client is using, so that181 # Get the name of the database the client is using, so that
177 # the server can check that the client is using the same182 # the server can check that the client is using the same
178 # database as the server.183 # database as the server.
@@ -259,6 +264,9 @@ class FileUploadClient:
259 aliasID,264 aliasID,
260 )265 )
261 return aliasID266 return aliasID
267 except socket.timeout:
268 timeout = config.librarian.client_socket_timeout
269 raise UploadFailed("Server timed out after %s second(s)" % timeout)
262 finally:270 finally:
263 self._close()271 self._close()
264272
diff --git a/lib/lp/services/librarian/tests/test_client.py b/lib/lp/services/librarian/tests/test_client.py
index aec3fb9..1d06673 100644
--- a/lib/lp/services/librarian/tests/test_client.py
+++ b/lib/lp/services/librarian/tests/test_client.py
@@ -32,6 +32,7 @@ from lp.services.librarian.client import (
32from lp.services.librarian.interfaces.client import UploadFailed32from lp.services.librarian.interfaces.client import UploadFailed
33from lp.services.librarian.model import LibraryFileAlias33from lp.services.librarian.model import LibraryFileAlias
34from lp.services.propertycache import cachedproperty34from lp.services.propertycache import cachedproperty
35from lp.services.timeout import override_timeout, with_timeout
35from lp.testing import TestCase36from lp.testing import TestCase
36from lp.testing.layers import (37from lp.testing.layers import (
37 DatabaseLayer,38 DatabaseLayer,
@@ -231,6 +232,22 @@ class EchoServer(threading.Thread):
231 self.socket.close()232 self.socket.close()
232233
233234
235class TimingOutServer(EchoServer):
236 """Fake TCP server that never sends a reply."""
237
238 def run(self):
239 while not self.should_stop:
240 try:
241 conn, addr = self.socket.accept()
242 self.connections.append(conn)
243 except socket.timeout:
244 pass
245 for conn in self.connections:
246 conn.close()
247 self.connections = []
248 self.socket.close()
249
250
234class LibrarianClientTestCase(TestCase):251class LibrarianClientTestCase(TestCase):
235 layer = LaunchpadFunctionalLayer252 layer = LaunchpadFunctionalLayer
236253
@@ -307,6 +324,38 @@ class LibrarianClientTestCase(TestCase):
307 "text/plain",324 "text/plain",
308 )325 )
309326
327 def test_addFile_fails_when_server_times_out(self):
328 server = TimingOutServer()
329 server.start()
330 self.addCleanup(server.join)
331
332 upload_host, upload_port = server.socket.getsockname()
333 self.pushConfig(
334 "librarian",
335 upload_host=upload_host,
336 upload_port=upload_port,
337 client_socket_timeout=1,
338 )
339
340 client = LibrarianClient()
341
342 @with_timeout()
343 def add_file():
344 self.assertRaisesWithContent(
345 UploadFailed,
346 "Server timed out after 1 second(s)",
347 client.addFile,
348 "sample.txt",
349 6,
350 io.BytesIO(b"sample"),
351 "text/plain",
352 )
353
354 # Time out this operation after a minute, to make sure that the test
355 # doesn't hang forever if the socket timeout isn't set properly.
356 with override_timeout(60):
357 add_file()
358
310 def test_addFile_uses_primary(self):359 def test_addFile_uses_primary(self):
311 # addFile is a write operation, so it should always use the360 # addFile is a write operation, so it should always use the
312 # primary store, even if the standby is the default. Close the361 # primary store, even if the standby is the default. Close the

Subscribers

People subscribed via source and target branches

to status/vote changes: