Merge ~cjwatson/launchpad:librarian-timeouts into launchpad:master

Proposed by Colin Watson
Status: Merged
Approved by: Colin Watson
Approved revision: f3e4e01c00921b293a27c588640a68e799a8525f
Merge reported by: Otto Co-Pilot
Merged at revision: not available
Proposed branch: ~cjwatson/launchpad:librarian-timeouts
Merge into: launchpad:master
Diff against target: 146 lines (+69/-8)
3 files modified
lib/lp/services/config/schema-lazr.conf (+4/-0)
lib/lp/services/librarian/client.py (+16/-8)
lib/lp/services/librarian/tests/test_client.py (+49/-0)
Reviewer Review Type Date Requested Status
Jürgen Gmach Approve
Review via email: mp+433219@code.launchpad.net

Commit message

Add a socket timeout for librarian client connections

Description of the change

The internal librarian client creates a socket with no timeout, so if anything goes wrong with the connection then it only recovers by being killed manually. Add a socket timeout so that it gives up eventually. (Note that this timeout applies to individual operations on the socket, not to uploading or downloading entire files.)

The default timeout is deliberately quite generous to minimize the chance of causing failures due to connections being merely slow rather than stuck, although we may still find we need to tune it. This isn't intended to ensure that librarian requests fit within timeout budgets set for web requests or similar, but just as a backstop to ensure that we don't get stuck waiting forever.

To post a comment you must log in.
Revision history for this message
Jürgen Gmach (jugmac00) wrote :

I do trust you know what you are doing, but I would appreciate some clarifying answers/comments.

review: Approve
Revision history for this message
Colin Watson (cjwatson) :

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1diff --git a/lib/lp/services/config/schema-lazr.conf b/lib/lp/services/config/schema-lazr.conf
2index 640865d..ca906a6 100644
3--- a/lib/lp/services/config/schema-lazr.conf
4+++ b/lib/lp/services/config/schema-lazr.conf
5@@ -1269,6 +1269,10 @@ authentication_endpoint: none
6 # authserver.
7 authentication_timeout: 5
8
9+# The timeout in seconds for librarian client socket operations.
10+# datatype: integer
11+client_socket_timeout: 60
12+
13
14 [librarian_gc]
15 # The database user which will be used by this process.
16diff --git a/lib/lp/services/librarian/client.py b/lib/lp/services/librarian/client.py
17index 7be460f..80ce5c6 100644
18--- a/lib/lp/services/librarian/client.py
19+++ b/lib/lp/services/librarian/client.py
20@@ -83,6 +83,7 @@ class FileUploadClient:
21 """
22 try:
23 self.state.s = socket.socket(AF_INET, SOCK_STREAM)
24+ self.state.s.settimeout(config.librarian.client_socket_timeout)
25 self.state.s.connect((self.upload_host, self.upload_port))
26 self.state.f = self.state.s.makefile("rwb", 0)
27
28@@ -96,13 +97,16 @@ class FileUploadClient:
29
30 def _close(self):
31 """Close connection"""
32- self.state.s_poll.unregister(self.state.s.fileno())
33- self.state.s_poll.close()
34- del self.state.s_poll
35- self.state.s.close()
36- del self.state.s
37- self.state.f.close()
38- del self.state.f
39+ if hasattr(self.state, "s_poll"):
40+ self.state.s_poll.unregister(self.state.s.fileno())
41+ self.state.s_poll.close()
42+ del self.state.s_poll
43+ if hasattr(self.state, "s"):
44+ self.state.s.close()
45+ del self.state.s
46+ if hasattr(self.state, "f"):
47+ self.state.f.close()
48+ del self.state.f
49
50 def _checkError(self):
51 poll_result = self.state.s_poll.poll(self.s_poll_timeout)
52@@ -171,8 +175,9 @@ class FileUploadClient:
53 LibraryFileContent,
54 )
55
56- self._connect()
57 try:
58+ self._connect()
59+
60 # Get the name of the database the client is using, so that
61 # the server can check that the client is using the same
62 # database as the server.
63@@ -259,6 +264,9 @@ class FileUploadClient:
64 aliasID,
65 )
66 return aliasID
67+ except socket.timeout:
68+ timeout = config.librarian.client_socket_timeout
69+ raise UploadFailed("Server timed out after %s second(s)" % timeout)
70 finally:
71 self._close()
72
73diff --git a/lib/lp/services/librarian/tests/test_client.py b/lib/lp/services/librarian/tests/test_client.py
74index aec3fb9..1d06673 100644
75--- a/lib/lp/services/librarian/tests/test_client.py
76+++ b/lib/lp/services/librarian/tests/test_client.py
77@@ -32,6 +32,7 @@ from lp.services.librarian.client import (
78 from lp.services.librarian.interfaces.client import UploadFailed
79 from lp.services.librarian.model import LibraryFileAlias
80 from lp.services.propertycache import cachedproperty
81+from lp.services.timeout import override_timeout, with_timeout
82 from lp.testing import TestCase
83 from lp.testing.layers import (
84 DatabaseLayer,
85@@ -231,6 +232,22 @@ class EchoServer(threading.Thread):
86 self.socket.close()
87
88
89+class TimingOutServer(EchoServer):
90+ """Fake TCP server that never sends a reply."""
91+
92+ def run(self):
93+ while not self.should_stop:
94+ try:
95+ conn, addr = self.socket.accept()
96+ self.connections.append(conn)
97+ except socket.timeout:
98+ pass
99+ for conn in self.connections:
100+ conn.close()
101+ self.connections = []
102+ self.socket.close()
103+
104+
105 class LibrarianClientTestCase(TestCase):
106 layer = LaunchpadFunctionalLayer
107
108@@ -307,6 +324,38 @@ class LibrarianClientTestCase(TestCase):
109 "text/plain",
110 )
111
112+ def test_addFile_fails_when_server_times_out(self):
113+ server = TimingOutServer()
114+ server.start()
115+ self.addCleanup(server.join)
116+
117+ upload_host, upload_port = server.socket.getsockname()
118+ self.pushConfig(
119+ "librarian",
120+ upload_host=upload_host,
121+ upload_port=upload_port,
122+ client_socket_timeout=1,
123+ )
124+
125+ client = LibrarianClient()
126+
127+ @with_timeout()
128+ def add_file():
129+ self.assertRaisesWithContent(
130+ UploadFailed,
131+ "Server timed out after 1 second(s)",
132+ client.addFile,
133+ "sample.txt",
134+ 6,
135+ io.BytesIO(b"sample"),
136+ "text/plain",
137+ )
138+
139+ # Time out this operation after a minute, to make sure that the test
140+ # doesn't hang forever if the socket timeout isn't set properly.
141+ with override_timeout(60):
142+ add_file()
143+
144 def test_addFile_uses_primary(self):
145 # addFile is a write operation, so it should always use the
146 # primary store, even if the standby is the default. Close the

Subscribers

People subscribed via source and target branches

to status/vote changes: