Merge lp:~cjwatson/launchpad/loggerhead-shutdown-race into lp:launchpad

Proposed by Colin Watson
Status: Merged
Merged at revision: 18749
Proposed branch: lp:~cjwatson/launchpad/loggerhead-shutdown-race
Merge into: lp:launchpad
Diff against target: 83 lines (+24/-12)
2 files modified
lib/lp/services/osutils.py (+13/-6)
scripts/stop-loggerhead.py (+11/-6)
To merge this branch: bzr merge lp:~cjwatson/launchpad/loggerhead-shutdown-race
Reviewer | Review Type | Date Requested | Status
William Grant | code | | Approve
Review via email: mp+352884@code.launchpad.net

Commit message

Fix stop-loggerhead to do a two-stage kill.

Description of the change

This avoids problems during deployments where stop-loggerhead exits before the old process has actually stopped.

To post a comment you must log in.
Revision history for this message
William Grant (wgrant) :
review: Approve (code)

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
=== modified file 'lib/lp/services/osutils.py'
--- lib/lp/services/osutils.py 2018-06-06 12:46:56 +0000
+++ lib/lp/services/osutils.py 2018-08-10 10:18:30 +0000
@@ -110,13 +110,15 @@
110 raise110 raise
111111
112112
113def two_stage_kill(pid, poll_interval=0.1, num_polls=50):113def two_stage_kill(pid, poll_interval=0.1, num_polls=50, get_status=True):
114 """Kill process 'pid' with SIGTERM. If it doesn't die, SIGKILL it.114 """Kill process 'pid' with SIGTERM. If it doesn't die, SIGKILL it.
115115
116 :param pid: The pid of the process to kill.116 :param pid: The pid of the process to kill.
117 :param poll_interval: The polling interval used to check if the117 :param poll_interval: The polling interval used to check if the
118 process is still around.118 process is still around.
119 :param num_polls: The number of polls to do before doing a SIGKILL.119 :param num_polls: The number of polls to do before doing a SIGKILL.
120 :param get_status: If True, collect the process' exit status (which
121 requires it to be a child of the process running this function).
120 """122 """
121 # Kill the process.123 # Kill the process.
122 _kill_may_race(pid, SIGTERM)124 _kill_may_race(pid, SIGTERM)
@@ -124,11 +126,16 @@
124 # Poll until the process has ended.126 # Poll until the process has ended.
125 for i in range(num_polls):127 for i in range(num_polls):
126 try:128 try:
127 # Reap the child process and get its return value. If it's not129 if get_status:
128 # gone yet, continue.130 # Reap the child process and get its return value. If it's
129 new_pid, result = os.waitpid(pid, os.WNOHANG)131 # not gone yet, continue.
130 if new_pid:132 new_pid, result = os.waitpid(pid, os.WNOHANG)
131 return result133 if new_pid:
134 return result
135 else:
136 # If the process isn't gone yet, continue.
137 if not process_exists(pid):
138 return
132 time.sleep(poll_interval)139 time.sleep(poll_interval)
133 except OSError as e:140 except OSError as e:
134 if e.errno in (errno.ESRCH, errno.ECHILD):141 if e.errno in (errno.ESRCH, errno.ECHILD):
135142
=== modified file 'scripts/stop-loggerhead.py'
--- scripts/stop-loggerhead.py 2018-06-06 12:46:56 +0000
+++ scripts/stop-loggerhead.py 2018-08-10 10:18:30 +0000
@@ -8,10 +8,12 @@
8import _pythonpath8import _pythonpath
99
10from optparse import OptionParser10from optparse import OptionParser
11import os
12import signal
13import sys11import sys
1412
13from lp.services.osutils import (
14 process_exists,
15 two_stage_kill,
16 )
15from lp.services.pidfile import get_pid17from lp.services.pidfile import get_pid
1618
1719
@@ -20,9 +22,11 @@
2022
21pid = get_pid("codebrowse")23pid = get_pid("codebrowse")
2224
23try:25if pid is None:
24 os.kill(pid, 0)26 # Already stopped.
25except OSError as e:27 sys.exit(0)
28
29if not process_exists(pid):
26 print('Stale pid file; server is not running.')30 print('Stale pid file; server is not running.')
27 sys.exit(1)31 sys.exit(1)
2832
@@ -30,4 +34,5 @@
30print('Shutting down previous server @ pid %d.' % (pid,))34print('Shutting down previous server @ pid %d.' % (pid,))
31print()35print()
3236
33os.kill(pid, signal.SIGTERM)37# A busy gunicorn can take a while to shut down.
38two_stage_kill(pid, poll_interval=0.5, num_polls=120, get_status=False)