Download raw body.
gotd: wait asynchronously for child termination
This should help with the current "child PID 0 terminated" due to
races with waitpid().
The idea is to manage the subprocess separately to the client, in
their own queue, and instead of the current kill() + waitpid(WNOHANG)
which doesn't reap the process unless we're very lucky, schedule its
termination (still via kill(2)) and set a timer. Then, if the timer
fires before the corresponding SIGCHLD arrives, the process is killed
abruptly.
There's a small catch in this: since now we listen for SIGCHLD and
tear down the process there, we don't get the imsg EOF. So if a
subprocess dies unexpectedly, we need to disconnect() the matching
client too.
proc_done() now is only called from the SIGCHLD handler, and
kill_proc() only sends the signal. There's a bit of redundancy
between these two because in both codepath (i.e. crash -> SIGCHLD ->
proc_done() or kill_proc()) there are resources to be released, and
closing the pipe on kill_proc() seems sensible.
diff /home/op/w/got
commit - 9b9bad55e7a411af3e01d9e4a86c127876184726
path + /home/op/w/got
blob - 39b1d2a54632934de2bfaf127e20f0b0f725f4c8
file + gotd/gotd.c
--- gotd/gotd.c
+++ gotd/gotd.c
@@ -80,7 +80,11 @@ struct gotd_child_proc {
char repo_path[PATH_MAX];
int pipe[2];
struct gotd_imsgev iev;
+ struct event tmo;
+
+ TAILQ_ENTRY(gotd_child_proc) procs;
};
+TAILQ_HEAD(gotd_procs, gotd_child_proc) procs;
struct gotd_client {
STAILQ_ENTRY(gotd_client) entry;
@@ -113,6 +117,7 @@ static void kill_proc(struct gotd_child_proc *, int);
static const struct got_error *start_auth_child(struct gotd_client *, int,
struct gotd_repo *, char *, const char *, int, int);
static void kill_proc(struct gotd_child_proc *, int);
+static void disconnect_on_error(struct gotd_client *, const struct got_error *);
__dead static void
usage(void)
@@ -276,77 +281,62 @@ wait_for_child(pid_t child_pid)
}
static void
-wait_for_child(pid_t child_pid)
-{
- pid_t pid;
- int status;
-
- log_debug("waiting for child PID %ld to terminate",
- (long)child_pid);
-
- do {
- pid = waitpid(child_pid, &status, WNOHANG);
- if (pid == -1) {
- if (errno != EINTR && errno != ECHILD)
- fatal("wait");
- } else if (WIFSIGNALED(status)) {
- log_warnx("child PID %ld terminated; signal %d",
- (long)pid, WTERMSIG(status));
- }
- } while (pid != -1 || (pid == -1 && errno == EINTR));
-}
-
-static void
proc_done(struct gotd_child_proc *proc)
{
- event_del(&proc->iev.ev);
- msgbuf_clear(&proc->iev.ibuf.w);
- close(proc->iev.ibuf.fd);
- kill_proc(proc, 0);
- wait_for_child(proc->pid);
+ struct gotd_client *clt;
+
+ TAILQ_REMOVE(&procs, proc, procs);
+
+ clt = find_client_by_proc_fd(proc->iev.ibuf.fd);
+ if (clt != NULL) {
+ if (proc == clt->repo)
+ clt->repo = NULL;
+ if (proc == clt->auth)
+ clt->auth = NULL;
+ if (proc == clt->session)
+ clt->session = NULL;
+ disconnect_on_error(clt, got_error(GOT_ERR_PRIVSEP_DIED));
+ }
+
+ evtimer_del(&proc->tmo);
+
+ if (proc->iev.ibuf.fd != -1) {
+ event_del(&proc->iev.ev);
+ msgbuf_clear(&proc->iev.ibuf.w);
+ close(proc->iev.ibuf.fd);
+ }
+
free(proc);
}
static void
kill_repo_proc(struct gotd_client *client)
{
- struct gotd_child_proc *proc;
-
if (client->repo == NULL)
return;
- proc = client->repo;
+ kill_proc(client->repo, 0);
client->repo = NULL;
-
- proc_done(proc);
}
static void
kill_auth_proc(struct gotd_client *client)
{
- struct gotd_child_proc *proc;
-
if (client->auth == NULL)
return;
- proc = client->auth;
+ kill_proc(client->auth, 0);
client->auth = NULL;
-
- proc_done(proc);
}
static void
kill_session_proc(struct gotd_client *client)
{
- struct gotd_child_proc *proc;
-
if (client->session == NULL)
return;
- proc = client->session;
+ kill_proc(client->session, 0);
client->session = NULL;
-
- proc_done(proc);
}
static void
@@ -745,6 +735,20 @@ kill_proc(struct gotd_child_proc *proc, int fatal)
static void
kill_proc(struct gotd_child_proc *proc, int fatal)
{
+ struct timeval tv = { 5, 0 };
+
+ log_debug("kill -%d %d", fatal ? SIGKILL : SIGTERM, proc->pid);
+
+ if (proc->iev.ibuf.fd != -1) {
+ event_del(&proc->iev.ev);
+ msgbuf_clear(&proc->iev.ibuf.w);
+ close(proc->iev.ibuf.fd);
+ proc->iev.ibuf.fd = -1;
+ }
+
+ if (!evtimer_pending(&proc->tmo, NULL))
+ evtimer_add(&proc->tmo, &tv);
+
if (fatal) {
log_warnx("sending SIGKILL to PID %d", proc->pid);
kill(proc->pid, SIGKILL);
@@ -753,9 +757,17 @@ gotd_shutdown(void)
}
static void
+kill_proc_timeout(int fd, short ev, void *d)
+{
+ struct gotd_child_proc *proc = d;
+
+ log_warnx("wait timeout for PID %d terminated", proc->pid);
+ kill_proc(proc, 1);
+}
+
+static void
gotd_shutdown(void)
{
- struct gotd_child_proc *proc;
uint64_t slot;
log_debug("shutting down");
@@ -766,20 +778,31 @@ gotd_shutdown(void)
disconnect(c);
}
- proc = gotd.listen_proc;
- msgbuf_clear(&proc->iev.ibuf.w);
- close(proc->iev.ibuf.fd);
- kill_proc(proc, 0);
- wait_for_child(proc->pid);
- free(proc);
+ kill_proc(gotd.listen_proc, 0);
log_info("terminating");
exit(0);
}
+static struct gotd_child_proc *
+find_proc_by_pid(pid_t pid)
+{
+ struct gotd_child_proc *proc;
+
+ TAILQ_FOREACH(proc, &procs, procs)
+ if (proc->pid == pid)
+ break;
+
+ return proc;
+}
+
void
gotd_sighdlr(int sig, short event, void *arg)
{
+ struct gotd_child_proc *proc;
+ pid_t pid;
+ int status;
+
/*
* Normal signal handler rules don't apply because libevent
* decouples for us.
@@ -796,6 +819,35 @@ gotd_sighdlr(int sig, short event, void *arg)
case SIGINT:
gotd_shutdown();
break;
+ case SIGCHLD:
+ for (;;) {
+ pid = waitpid(WAIT_ANY, &status, WNOHANG);
+ if (pid == -1) {
+ if (errno == EINTR)
+ continue;
+ if (errno == ECHILD)
+ break;
+ fatal("waitpid");
+ }
+ if (pid == 0)
+ break;
+
+ log_debug("reaped pid %d", pid);
+ proc = find_proc_by_pid(pid);
+ if (proc == NULL) {
+ log_info("caught exit of unknown child %d",
+ pid);
+ continue;
+ }
+
+ if (WIFSIGNALED(status)) {
+ log_warnx("child PID %d terminated with"
+ " signal %d", pid, WTERMSIG(status));
+ }
+
+ proc_done(proc);
+ }
+ break;
default:
fatalx("unexpected signal");
}
@@ -1508,6 +1560,13 @@ start_listener(char *argv0, const char *confpath, int
if (proc == NULL)
fatal("calloc");
+ TAILQ_INSERT_HEAD(&procs, proc, procs);
+
+ /*
+ * XXX start_listener is called before event_init() so can't
+ * initialize proc->tmo here.
+ */
+
proc->type = PROC_LISTEN;
if (socketpair(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK,
@@ -1534,6 +1593,9 @@ start_session_child(struct gotd_client *client, struct
if (proc == NULL)
return got_error_from_errno("calloc");
+ TAILQ_INSERT_HEAD(&procs, proc, procs);
+ evtimer_set(&proc->tmo, kill_proc_timeout, proc);
+
if (client_is_reading(client))
proc->type = PROC_SESSION_READ;
else
@@ -1580,6 +1642,9 @@ start_repo_child(struct gotd_client *client, enum gotd
if (proc == NULL)
return got_error_from_errno("calloc");
+ TAILQ_INSERT_HEAD(&procs, proc, procs);
+ evtimer_set(&proc->tmo, kill_proc_timeout, proc);
+
proc->type = proc_type;
if (strlcpy(proc->repo_name, repo->name,
sizeof(proc->repo_name)) >= sizeof(proc->repo_name))
@@ -1632,6 +1697,9 @@ start_auth_child(struct gotd_client *client, int requi
return err;
}
+ TAILQ_INSERT_HEAD(&procs, proc, procs);
+ evtimer_set(&proc->tmo, kill_proc_timeout, proc);
+
proc->type = PROC_AUTH;
if (strlcpy(proc->repo_name, repo->name,
sizeof(proc->repo_name)) >= sizeof(proc->repo_name))
@@ -1732,10 +1800,12 @@ main(int argc, char **argv)
struct passwd *pw = NULL;
char *repo_path = NULL;
enum gotd_procid proc_id = PROC_GOTD;
- struct event evsigint, evsigterm, evsighup, evsigusr1;
+ struct event evsigint, evsigterm, evsighup, evsigusr1, evsigchld;
int *pack_fds = NULL, *temp_fds = NULL;
struct gotd_repo *repo = NULL;
+ TAILQ_INIT(&procs);
+
log_init(1, LOG_DAEMON); /* Log to stderr until daemonized. */
while ((ch = getopt(argc, argv, "Adf:LnP:RsSvW")) != -1) {
@@ -1955,18 +2025,23 @@ main(int argc, char **argv)
if (proc_id != PROC_GOTD)
fatal("invalid process id %d", proc_id);
+ evtimer_set(&gotd.listen_proc->tmo, kill_proc_timeout,
+ gotd.listen_proc);
+
apply_unveil_selfexec();
signal_set(&evsigint, SIGINT, gotd_sighdlr, NULL);
signal_set(&evsigterm, SIGTERM, gotd_sighdlr, NULL);
signal_set(&evsighup, SIGHUP, gotd_sighdlr, NULL);
signal_set(&evsigusr1, SIGUSR1, gotd_sighdlr, NULL);
+ signal_set(&evsigchld, SIGCHLD, gotd_sighdlr, NULL);
signal(SIGPIPE, SIG_IGN);
signal_add(&evsigint, NULL);
signal_add(&evsigterm, NULL);
signal_add(&evsighup, NULL);
signal_add(&evsigusr1, NULL);
+ signal_add(&evsigchld, NULL);
gotd_imsg_event_add(&gotd.listen_proc->iev);
gotd: wait asynchronously for child termination