Merge lp://staging/~stub/launchpad/kill-harder into lp://staging/launchpad

Proposed by Stuart Bishop
Status: Merged
Merged at revision: not available
Proposed branch: lp://staging/~stub/launchpad/kill-harder
Merge into: lp://staging/launchpad
Diff against target: None lines
To merge this branch: bzr merge lp://staging/~stub/launchpad/kill-harder
Reviewer Review Type Date Requested Status
Eleanor Berger (community) Approve
Review via email: mp+11517@code.staging.launchpad.net

This proposal supersedes a proposal from 2009-09-10.

To post a comment you must log in.
Revision history for this message
Stuart Bishop (stub) wrote : Posted in a previous version of this proposal

Addresses Bug #307447

If bin/killservice fails to kill a service with SIGTERM, retry with SIGKILL.

To test, run 'make run' in a terminal. Then, in a different terminal, run 'bin/killservice librarian'. This will test the entire code path due to the way the librarian is spawned. Then, run 'bin/killservice launchpad' to demonstrate the normal code path where the process shuts down normally.

$ bin/killservice librarian
2009-09-10 11:13:58 INFO Killing librarian (31210)
2009-09-10 11:14:18 WARNING SIGTERM failed to kill librarian (31210). Trying SIGKILL
2009-09-10 11:14:38 ERROR SIGKILL didn't terminate librarian (31210)

$ bin/killservice launchpad
2009-09-10 11:14:47 INFO Killing launchpad (31200)

Revision history for this message
Stuart Bishop (stub) wrote :

Addresses Bug #307447

If bin/killservice fails to kill a service with SIGTERM, retry with SIGKILL.

To test, run 'make run' in a terminal. Then, in a different terminal, run 'bin/killservice librarian'. This will test the entire code path due to the way the librarian is spawned. Then, run 'bin/killservice launchpad' to demonstrate the normal code path where the process shuts down normally.

$ bin/killservice librarian
2009-09-10 11:13:58 INFO Killing librarian (31210)
2009-09-10 11:14:18 WARNING SIGTERM failed to kill librarian (31210). Trying SIGKILL
2009-09-10 11:14:38 ERROR SIGKILL didn't terminate librarian (31210)

$ bin/killservice launchpad
2009-09-10 11:14:47 INFO Killing launchpad (31200)

Revision history for this message
Eleanor Berger (intellectronica) wrote :

r=me

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'lib/lp/scripts/utilities/killservice.py'
2--- lib/lp/scripts/utilities/killservice.py 2009-07-24 12:32:28 +0000
3+++ lib/lp/scripts/utilities/killservice.py 2009-09-10 11:20:16 +0000
4@@ -8,7 +8,8 @@
5 __metaclass__ = type
6
7 import os, logging
8-from signal import SIGTERM
9+from signal import SIGKILL, SIGTERM
10+import time
11 from optparse import OptionParser
12 from canonical.config import config
13 from canonical.lazr.pidfile import get_pid, pidfile_path, remove_pidfile
14@@ -18,16 +19,26 @@
15
16 def main():
17 parser = OptionParser('Usage: %prog [options] [SERVICE ...]')
18+ parser.add_option("-w", "--wait", metavar="SECS",
19+ default=20, type="int",
20+ help="Wait up to SECS seconds for processes "
21+ "to die before retrying with SIGKILL")
22 logger_options(parser, logging.INFO)
23 (options, args) = parser.parse_args()
24 log = logger(options)
25 if len(args) < 1:
26 parser.error('No service name provided')
27- for service in args:
28- # Mailman is special, but only stop it if it was launched.
29- if service == 'mailman' and config.mailman.launch:
30+
31+ pids = [] # List of pids we tried to kill.
32+ services = args[:]
33+
34+ # Mailman is special, but only stop it if it was launched.
35+ if 'mailman' in services:
36+ if config.mailman.launch:
37 stop_mailman()
38- continue
39+ services.remove('mailman')
40+
41+ for service in services:
42 log.debug("PID file is %s", pidfile_path(service))
43 try:
44 pid = get_pid(service)
45@@ -38,12 +49,72 @@
46 log.info("Killing %s (%d)", service, pid)
47 try:
48 os.kill(pid, SIGTERM)
49+ pids.append((service, pid))
50 except OSError, x:
51- log.error("Unable to kill %s (%d) - %s",
52- service, pid, x.strerror)
53+ log.error(
54+ "Unable to SIGTERM %s (%d) - %s",
55+ service, pid, x.strerror)
56+ else:
57+ log.debug("No PID file for %s", service)
58+
59+ wait_for_pids(pids, options.wait, log)
60+
61+ # Anything that didn't die, kill harder with SIGKILL.
62+ for service, pid in pids:
63+ if not process_exists(pid):
64+ continue
65+ log.warn(
66+ "SIGTERM failed to kill %s (%d). Trying SIGKILL", service, pid)
67+ try:
68+ os.kill(pid, SIGKILL)
69+ except OSError, x:
70+ log.error(
71+ "Unable to SIGKILL %s (%d) - %s", service, pid, x.strerror)
72+
73+ wait_for_pids(pids, options.wait, log)
74+
75+ # Report anything still left running after a SIGKILL.
76+ for service, pid in pids:
77+ if process_exists(pid):
78+ log.error("SIGKILL didn't terminate %s (%d)", service, pid)
79+
80+ # Remove any pidfiles that didn't get cleaned up if there is no
81+ # corresponding process (from an unkillable process, or maybe some
82+ # other job has relaunched it while we were not looking).
83+ for service in services:
84+ pid = get_pid(service)
85+ if pid is not None and not process_exists(pid):
86 try:
87 remove_pidfile(service)
88 except OSError:
89 pass
90- else:
91- log.debug("No PID file for %s", service)
92+
93+
94+def process_exists(pid):
95+ """True if the given process exists."""
96+ try:
97+ pgid = os.getpgid(pid)
98+ except OSError, x:
99+ if x.errno == 3:
100+ return False
101+ log.error("Unknown exception from getpgid - %s", str(x))
102+ return True
103+
104+
105+def wait_for_pids(pids, wait, log):
106+ """
107+ Wait until all signalled processes are dead, or until we hit the
108+ timeout.
109+
110+ Processes discovered to be dead are removed from the list.
111+
112+ :param pids: A list of (service, pid).
113+
114+ :param wait: How many seconds to wait.
115+ """
116+ wait_start = time.time()
117+ while pids and time.time() < wait_start + wait:
118+ for service, pid in pids[:]:
119+ if not process_exists(pid):
120+ pids.remove((service, pid))
121+ time.sleep(0.1)