Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reap workers in the main loop #2314

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 33 additions & 22 deletions gunicorn/arbiter.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,10 @@ class Arbiter(object):
# I love dynamic languages
SIG_QUEUE = []
SIGNALS = [getattr(signal, "SIG%s" % x)
for x in "HUP QUIT INT TERM TTIN TTOU USR1 USR2 WINCH".split()]
for x in "CHLD HUP QUIT INT TERM TTIN TTOU USR1 USR2 WINCH".split()]
SIG_NAMES = dict(
(getattr(signal, name), name[3:].lower()) for name in dir(signal)
if name[:3] == "SIG" and name[3] != "_"
if name[:3] == "SIG" and name[3] != "_" and name[3:] != "CLD"
Copy link

@piskvorky piskvorky Feb 2, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This condition gave me a pause, not obvious.

Suggested change
if name[:3] == "SIG" and name[3] != "_" and name[3:] != "CLD"
if name[:3] == "SIG" and name[3] != "_" and name[3:] != "CLD" # SIGCLD is an obsolete name for SIGCHLD

At least according to https://man7.org/linux/man-pages/man7/signal.7.html

What was the motivation for singling it out here?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought I encountered an error without this (on Linux) but I'll double check.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@sylt's solution in #3148 seems a bit cleaner.

)

def __init__(self, app):
Expand Down Expand Up @@ -186,10 +186,10 @@ def init_signals(self):
# initialize all signals
for s in self.SIGNALS:
signal.signal(s, self.signal)
signal.signal(signal.SIGCHLD, self.handle_chld)
tilgovi marked this conversation as resolved.
Show resolved Hide resolved
tilgovi marked this conversation as resolved.
Show resolved Hide resolved

def signal(self, sig, frame):
if len(self.SIG_QUEUE) < 5:
# limit overload, but do not miss unique signals
if len(self.SIG_QUEUE) < 5 or sig not in self.SIG_QUEUE:
self.SIG_QUEUE.append(sig)
self.wakeup()

Expand Down Expand Up @@ -237,10 +237,25 @@ def run(self):
self.pidfile.unlink()
sys.exit(-1)

def handle_chld(self, sig, frame):
"SIGCHLD handling"
def handle_chld(self):
"""\
SIGCHLD handling.

Reap workers and re-install the signal handler, in case it is not reset
by the OS, as described in the signal module documentation:

A handler for a particular signal, once set, remains installed
until it is explicitly reset (Python emulates the BSD style
interface regardless of the underlying implementation), with the
exception of the handler for SIGCHLD, which follows the underlying
implementation.

Python makes this exception to avoid infinite recursion on platforms
that invoke the handler immediately when installing it from a process
with dead children.
"""
self.reap_workers()
self.wakeup()
signal.signal(signal.SIGCHLD, self.signal)

def handle_hup(self):
"""\
Expand Down Expand Up @@ -391,9 +406,11 @@ def stop(self, graceful=True):
limit = time.time() + self.cfg.graceful_timeout
# instruct the workers to exit
self.kill_workers(sig)
self.reap_workers()
# wait until the graceful timeout
while self.WORKERS and time.time() < limit:
time.sleep(0.1)
self.reap_workers()

self.kill_workers(signal.SIGKILL)

Expand Down Expand Up @@ -492,8 +509,7 @@ def murder_workers(self):
"""
if not self.timeout:
return
workers = list(self.WORKERS.items())
for (pid, worker) in workers:
for (pid, worker) in self.WORKERS.items():
try:
if time.time() - worker.tmp.last_update() <= self.timeout:
continue
Expand All @@ -519,6 +535,12 @@ def reap_workers(self):
if self.reexec_pid == wpid:
self.reexec_pid = 0
else:
worker = self.WORKERS.pop(wpid, None)
if not worker:
continue

worker.tmp.close()
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought it was best to ensure that we actually only do anything below here if this is actually our worker. But I can make this a separate PR if that's desired.


# A worker was terminated. If the termination reason was
# that it could not boot, we'll shut it down to avoid
# infinite start/stop cycles.
Expand Down Expand Up @@ -553,10 +575,6 @@ def reap_workers(self):
msg += " Perhaps out of memory?"
self.log.error(msg)

worker = self.WORKERS.pop(wpid, None)
if not worker:
continue
worker.tmp.close()
self.cfg.child_exit(self, worker)
except OSError as e:
if e.errno != errno.ECHILD:
Expand Down Expand Up @@ -647,8 +665,7 @@ def kill_workers(self, sig):
Kill all workers with the signal `sig`
:attr sig: `signal.SIG*` value
"""
worker_pids = list(self.WORKERS.keys())
for pid in worker_pids:
for pid in self.WORKERS:
self.kill_worker(pid, sig)

def kill_worker(self, pid, sig):
Expand All @@ -662,11 +679,5 @@ def kill_worker(self, pid, sig):
os.kill(pid, sig)
except OSError as e:
if e.errno == errno.ESRCH:
try:
benoitc marked this conversation as resolved.
Show resolved Hide resolved
worker = self.WORKERS.pop(pid)
worker.tmp.close()
self.cfg.worker_exit(self, worker)
return
except (KeyError, OSError):
return
return
raise