Merge ~kissiel/checkbox-ng:multi-master into checkbox-ng:master

Proposed by Maciej Kisielewski
Status: Merged
Approved by: Sylvain Pineau
Approved revision: 7ac269310a27d0f49981ceb3c40d524dc055b7b0
Merged at revision: 40a6cb8d780f35627f9525fe4ea5e0b8073eb262
Proposed branch: ~kissiel/checkbox-ng:multi-master
Merge into: checkbox-ng:master
Diff against target: 116 lines (+66/-3)
2 files modified
checkbox_ng/launcher/master.py (+31/-3)
checkbox_ng/launcher/slave.py (+35/-0)
Reviewer Review Type Date Requested Status
Sylvain Pineau (community) Approve
Maciej Kisielewski (community) Needs Resubmitting
Review via email: mp+378497@code.launchpad.net

Description of the change

Fix a problem where two masters could be connected to the same slave, causing lots of race conditions.

To post a comment you must log in.
Revision history for this message
Sylvain Pineau (sylvain-pineau) wrote :
Download full text (3.1 KiB)

Got the following tracebacks when the second master tries to connect:

$ checkbox-cli master 127.0.0.1 ./mylauncher
Connection lost!
connection closed by peer
Connection lost!
connection closed by peer

and this one on slave:

$ sudo checkbox-cli slave
client connection terminated abruptly
Traceback (most recent call last):
  File "/tmp/checkbox-ng/plainbox/vendor/rpyc/utils/server.py", line 181, in _authenticate_and_serve_client
    self._serve_client(sock2, credentials)
  File "/tmp/checkbox-ng/plainbox/vendor/rpyc/utils/server.py", line 202, in _serve_client
    conn = self.service._connect(Channel(SocketStream(sock)), config)
  File "/tmp/checkbox-ng/plainbox/vendor/rpyc/core/service.py", line 100, in _connect
    self.on_connect(conn)
  File "/tmp/checkbox-ng/checkbox_ng/launcher/slave.py", line 60, in on_connect
    if SessionAssistantSlave.master_blaster:
  File "/tmp/checkbox-ng/plainbox/vendor/rpyc/core/netref.py", line 220, in method
    return syncreq(_self, consts.HANDLE_CALLATTR, name, args, kwargs)
  File "/tmp/checkbox-ng/plainbox/vendor/rpyc/core/netref.py", line 75, in syncreq
    return conn.sync_request(handler, proxy, *args)
  File "/tmp/checkbox-ng/plainbox/vendor/rpyc/core/protocol.py", line 471, in sync_request
    return self.async_request(handler, *args, timeout=timeout).value
  File "/tmp/checkbox-ng/plainbox/vendor/rpyc/core/async_.py", line 95, in value
    self.wait()
  File "/tmp/checkbox-ng/plainbox/vendor/rpyc/core/async_.py", line 47, in wait
    raise AsyncResultTimeout("result expired")
TimeoutError: result expired
Exception in thread Thread-2:
Traceback (most recent call last):
  File "/usr/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.6/threading.py", line 864, in run
    self._target(*self._args, **self._kwargs)
  File "/tmp/checkbox-ng/plainbox/vendor/rpyc/utils/server.py", line 181, in _authenticate_and_serve_client
    self._serve_client(sock2, credentials)
  File "/tmp/checkbox-ng/plainbox/vendor/rpyc/utils/server.py", line 202, in _serve_client
    conn = self.service._connect(Channel(SocketStream(sock)), config)
  File "/tmp/checkbox-ng/plainbox/vendor/rpyc/core/service.py", line 100, in _connect
    self.on_connect(conn)
  File "/tmp/checkbox-ng/checkbox_ng/launcher/slave.py", line 60, in on_connect
    if SessionAssistantSlave.master_blaster:
  File "/tmp/checkbox-ng/plainbox/vendor/rpyc/core/netref.py", line 220, in method
    return syncreq(_self, consts.HANDLE_CALLATTR, name, args, kwargs)
  File "/tmp/checkbox-ng/plainbox/vendor/rpyc/core/netref.py", line 75, in syncreq
    return conn.sync_request(handler, proxy, *args)
  File "/tmp/checkbox-ng/plainbox/vendor/rpyc/core/protocol.py", line 471, in sync_request
    return self.async_request(handler, *args, timeout=timeout).value
  File "/tmp/checkbox-ng/plainbox/vendor/rpyc/core/async_.py", line 95, in value
    self.wait()
  File "/tmp/checkbox-ng/plainbox/vendor/rpyc/core/async_.py", line 47, in wait
    raise AsyncResultTimeout("result expired")
TimeoutError: result expired

To be precise it happens when I try to connect the second master while still being at the test selection urwid screen...

Read more...

review: Needs Fixing
Revision history for this message
Maciej Kisielewski (kissiel) wrote :

TYVM for catching that. The crash(es) were caused by urwid blocking the master thread, thereby blocking the way for the slave to run the remote kill.

I added code that catches that timeout and just disconnects the master in a less informative way (no information about the new master), but at least it works :D

I feel we cannot do a better thing without rewriting a lot of things.

review: Needs Resubmitting
Revision history for this message
Sylvain Pineau (sylvain-pineau) wrote :

Thanks, +1

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1diff --git a/checkbox_ng/launcher/master.py b/checkbox_ng/launcher/master.py
2index 1565b4a..f110860 100644
3--- a/checkbox_ng/launcher/master.py
4+++ b/checkbox_ng/launcher/master.py
5@@ -19,6 +19,7 @@
6 This module contains implementation of the master end of the remote execution
7 functionality.
8 """
9+import contextlib
10 import getpass
11 import gettext
12 import ipaddress
13@@ -166,6 +167,7 @@ class RemoteMaster(ReportsStage, MainLoopStage):
14 config['allow_all_attrs'] = True
15 config['sync_request_timeout'] = 120
16 keep_running = False
17+ server_msg = None
18 self._prepare_transports()
19 interrupted = False
20 while True:
21@@ -181,6 +183,18 @@ class RemoteMaster(ReportsStage, MainLoopStage):
22 break
23 conn = rpyc.connect(host, port, config=config)
24 keep_running = True
25+ def quitter(msg):
26+ # this will be called when the slave decides to disconnect
27+ # this master
28+ nonlocal server_msg
29+ nonlocal keep_running
30+ keep_running = False
31+ server_msg = msg
32+ with contextlib.suppress(AttributeError):
33+ # TODO: REMOTE_API
34+ # when bumping the remote api make this bit obligatory
35+ # i.e. remove the suppressing
36+ conn.root.register_master_blaster(quitter)
37 self._sa = conn.root.get_sa()
38 self.sa.conn = conn
39 if not self._sudo_provider:
40@@ -213,9 +227,23 @@ class RemoteMaster(ReportsStage, MainLoopStage):
41 self.resume_interacting, interaction=payload),
42 }[state]()
43 except EOFError as exc:
44- print("Connection lost!")
45- _logger.info("master: Connection lost due to: %s", exc)
46- time.sleep(1)
47+ if keep_running:
48+ print("Connection lost!")
49+ # this is yucky but it works, in case of explicit
50+ # connection closing by the slave we get this msg
51+ _logger.info("master: Connection lost due to: %s", exc)
52+ if str(exc) == 'stream has been closed':
53+ print('Slave explicitly disconnected you. Possible '
54+ 'reason: new master connected to the slave')
55+ break
56+ print(exc)
57+ time.sleep(1)
58+ else:
59+ # if keep_running got set to False it means that the
60+ # network interruption was planned, AKA slave disconnected
61+ # this master
62+ print(server_msg)
63+ break
64 except (ConnectionRefusedError, socket.timeout, OSError) as exc:
65 _logger.info("master: Connection lost due to: %s", exc)
66 if not keep_running:
67diff --git a/checkbox_ng/launcher/slave.py b/checkbox_ng/launcher/slave.py
68index b568620..3eed172 100644
69--- a/checkbox_ng/launcher/slave.py
70+++ b/checkbox_ng/launcher/slave.py
71@@ -39,10 +39,45 @@ _logger = logging.getLogger("slave")
72 class SessionAssistantSlave(rpyc.Service):
73
74 session_assistant = None
75+ controlling_master_conn = None
76+ master_blaster = None
77
78 def exposed_get_sa(*args):
79 return SessionAssistantSlave.session_assistant
80
81+ def exposed_register_master_blaster(self, callable):
82+ """
83+ Register a callable that will be called when the slave decides to
84+ disconnect the master. This should be used to prepare the master for
85+ the disconnection, so it can differentiate between network failures
86+ and a planned disconnect.
87+ The callable will be called with one param - a string with a reason
88+ for the disconnect.
89+ """
90+ SessionAssistantSlave.master_blaster = callable
91+
92+ def on_connect(self, conn):
93+ try:
94+ if SessionAssistantSlave.master_blaster:
95+ msg = 'Forcefully disconnected by new master from {}:{}'.format(
96+ conn._config['endpoints'][1][0], conn._config['endpoints'][1][1])
97+ SessionAssistantSlave.master_blaster(msg)
98+ old_master = SessionAssistantSlave.controlling_master_conn
99+ if old_master is not None:
100+ old_master.close()
101+ SessionAssistantSlave.master_blaster = None
102+
103+ SessionAssistantSlave.controlling_master_conn = conn
104+ except TimeoutError as exc:
105+ # this happens when the reference to .master_blaster times out,
106+ # meaning the master is blocked on an urwid screen or some other
107+ # thread blocking operation. In any case it means there was a
108+ # previous master, so we need to kill it
109+ old_master = SessionAssistantSlave.controlling_master_conn
110+ SessionAssistantSlave.master_blaster = None
111+ old_master.close()
112+ SessionAssistantSlave.controlling_master_conn = conn
113+
114
115 class RemoteSlave():
116 """

Subscribers

People subscribed via source and target branches