Download raw body.
gotd: wait asynchronously for child termination
This should help with the current "child PID 0 terminated" due to races with waitpid(). The idea is to manage the subprocess separately to the client, in their own queue, and instead of the current kill() + waitpid(WNOHANG) which doesn't reap the process unless we're very lucky, schedule its termination (still via kill(2)) and set a timer. Then, if the timer fires before the corresponding SIGCHLD arrives, the process is killed abruptly. There's a small catch in this: since now we listen for SIGCHLD and tear down the process there, we don't get the imsg EOF. So if a subprocess dies unexpectedly, we need to disconnect() the matching client too. proc_done() now is only called from the SIGCHLD handler, and kill_proc() only sends the signal. There's a bit of redundancy between these two because in both codepath (i.e. crash -> SIGCHLD -> proc_done() or kill_proc()) there are resources to be released, and closing the pipe on kill_proc() seems sensible. diff /home/op/w/got commit - 9b9bad55e7a411af3e01d9e4a86c127876184726 path + /home/op/w/got blob - 39b1d2a54632934de2bfaf127e20f0b0f725f4c8 file + gotd/gotd.c --- gotd/gotd.c +++ gotd/gotd.c @@ -80,7 +80,11 @@ struct gotd_child_proc { char repo_path[PATH_MAX]; int pipe[2]; struct gotd_imsgev iev; + struct event tmo; + + TAILQ_ENTRY(gotd_child_proc) procs; }; +TAILQ_HEAD(gotd_procs, gotd_child_proc) procs; struct gotd_client { STAILQ_ENTRY(gotd_client) entry; @@ -113,6 +117,7 @@ static void kill_proc(struct gotd_child_proc *, int); static const struct got_error *start_auth_child(struct gotd_client *, int, struct gotd_repo *, char *, const char *, int, int); static void kill_proc(struct gotd_child_proc *, int); +static void disconnect_on_error(struct gotd_client *, const struct got_error *); __dead static void usage(void) @@ -276,77 +281,62 @@ wait_for_child(pid_t child_pid) } static void -wait_for_child(pid_t child_pid) -{ - pid_t pid; - int status; - - log_debug("waiting for child PID %ld to terminate", - (long)child_pid); - - do { - pid = waitpid(child_pid, &status, WNOHANG); - if (pid == -1) { - if (errno != EINTR && errno != ECHILD) - fatal("wait"); - } else if (WIFSIGNALED(status)) { - log_warnx("child PID %ld terminated; signal %d", - (long)pid, WTERMSIG(status)); - } - } while (pid != -1 || (pid == -1 && errno == EINTR)); -} - -static void proc_done(struct gotd_child_proc *proc) { - event_del(&proc->iev.ev); - msgbuf_clear(&proc->iev.ibuf.w); - close(proc->iev.ibuf.fd); - kill_proc(proc, 0); - wait_for_child(proc->pid); + struct gotd_client *clt; + + TAILQ_REMOVE(&procs, proc, procs); + + clt = find_client_by_proc_fd(proc->iev.ibuf.fd); + if (clt != NULL) { + if (proc == clt->repo) + clt->repo = NULL; + if (proc == clt->auth) + clt->auth = NULL; + if (proc == clt->session) + clt->session = NULL; + disconnect_on_error(clt, got_error(GOT_ERR_PRIVSEP_DIED)); + } + + evtimer_del(&proc->tmo); + + if (proc->iev.ibuf.fd != -1) { + event_del(&proc->iev.ev); + msgbuf_clear(&proc->iev.ibuf.w); + close(proc->iev.ibuf.fd); + } + free(proc); } static void kill_repo_proc(struct gotd_client *client) { - struct gotd_child_proc *proc; - if (client->repo == NULL) return; - proc = client->repo; + kill_proc(client->repo, 0); client->repo = NULL; - - proc_done(proc); } static void kill_auth_proc(struct gotd_client *client) { - struct gotd_child_proc *proc; - if (client->auth == NULL) return; - proc = client->auth; + kill_proc(client->auth, 0); client->auth = NULL; - - proc_done(proc); } static void kill_session_proc(struct gotd_client *client) { - struct gotd_child_proc *proc; - if (client->session == NULL) return; - proc = client->session; + kill_proc(client->session, 0); client->session = NULL; - - proc_done(proc); } static void @@ -745,6 +735,20 @@ kill_proc(struct gotd_child_proc *proc, int fatal) static void kill_proc(struct gotd_child_proc *proc, int fatal) { + struct timeval tv = { 5, 0 }; + + log_debug("kill -%d %d", fatal ? SIGKILL : SIGTERM, proc->pid); + + if (proc->iev.ibuf.fd != -1) { + event_del(&proc->iev.ev); + msgbuf_clear(&proc->iev.ibuf.w); + close(proc->iev.ibuf.fd); + proc->iev.ibuf.fd = -1; + } + + if (!evtimer_pending(&proc->tmo, NULL)) + evtimer_add(&proc->tmo, &tv); + if (fatal) { log_warnx("sending SIGKILL to PID %d", proc->pid); kill(proc->pid, SIGKILL); @@ -753,9 +757,17 @@ gotd_shutdown(void) } static void +kill_proc_timeout(int fd, short ev, void *d) +{ + struct gotd_child_proc *proc = d; + + log_warnx("wait timeout for PID %d terminated", proc->pid); + kill_proc(proc, 1); +} + +static void gotd_shutdown(void) { - struct gotd_child_proc *proc; uint64_t slot; log_debug("shutting down"); @@ -766,20 +778,31 @@ gotd_shutdown(void) disconnect(c); } - proc = gotd.listen_proc; - msgbuf_clear(&proc->iev.ibuf.w); - close(proc->iev.ibuf.fd); - kill_proc(proc, 0); - wait_for_child(proc->pid); - free(proc); + kill_proc(gotd.listen_proc, 0); log_info("terminating"); exit(0); } +static struct gotd_child_proc * +find_proc_by_pid(pid_t pid) +{ + struct gotd_child_proc *proc; + + TAILQ_FOREACH(proc, &procs, procs) + if (proc->pid == pid) + break; + + return proc; +} + void gotd_sighdlr(int sig, short event, void *arg) { + struct gotd_child_proc *proc; + pid_t pid; + int status; + /* * Normal signal handler rules don't apply because libevent * decouples for us. @@ -796,6 +819,35 @@ gotd_sighdlr(int sig, short event, void *arg) case SIGINT: gotd_shutdown(); break; + case SIGCHLD: + for (;;) { + pid = waitpid(WAIT_ANY, &status, WNOHANG); + if (pid == -1) { + if (errno == EINTR) + continue; + if (errno == ECHILD) + break; + fatal("waitpid"); + } + if (pid == 0) + break; + + log_debug("reaped pid %d", pid); + proc = find_proc_by_pid(pid); + if (proc == NULL) { + log_info("caught exit of unknown child %d", + pid); + continue; + } + + if (WIFSIGNALED(status)) { + log_warnx("child PID %d terminated with" + " signal %d", pid, WTERMSIG(status)); + } + + proc_done(proc); + } + break; default: fatalx("unexpected signal"); } @@ -1508,6 +1560,13 @@ start_listener(char *argv0, const char *confpath, int if (proc == NULL) fatal("calloc"); + TAILQ_INSERT_HEAD(&procs, proc, procs); + + /* + * XXX start_listener is called before event_init() so can't + * initialize proc->tmo here. + */ + proc->type = PROC_LISTEN; if (socketpair(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, @@ -1534,6 +1593,9 @@ start_session_child(struct gotd_client *client, struct if (proc == NULL) return got_error_from_errno("calloc"); + TAILQ_INSERT_HEAD(&procs, proc, procs); + evtimer_set(&proc->tmo, kill_proc_timeout, proc); + if (client_is_reading(client)) proc->type = PROC_SESSION_READ; else @@ -1580,6 +1642,9 @@ start_repo_child(struct gotd_client *client, enum gotd if (proc == NULL) return got_error_from_errno("calloc"); + TAILQ_INSERT_HEAD(&procs, proc, procs); + evtimer_set(&proc->tmo, kill_proc_timeout, proc); + proc->type = proc_type; if (strlcpy(proc->repo_name, repo->name, sizeof(proc->repo_name)) >= sizeof(proc->repo_name)) @@ -1632,6 +1697,9 @@ start_auth_child(struct gotd_client *client, int requi return err; } + TAILQ_INSERT_HEAD(&procs, proc, procs); + evtimer_set(&proc->tmo, kill_proc_timeout, proc); + proc->type = PROC_AUTH; if (strlcpy(proc->repo_name, repo->name, sizeof(proc->repo_name)) >= sizeof(proc->repo_name)) @@ -1732,10 +1800,12 @@ main(int argc, char **argv) struct passwd *pw = NULL; char *repo_path = NULL; enum gotd_procid proc_id = PROC_GOTD; - struct event evsigint, evsigterm, evsighup, evsigusr1; + struct event evsigint, evsigterm, evsighup, evsigusr1, evsigchld; int *pack_fds = NULL, *temp_fds = NULL; struct gotd_repo *repo = NULL; + TAILQ_INIT(&procs); + log_init(1, LOG_DAEMON); /* Log to stderr until daemonized. */ while ((ch = getopt(argc, argv, "Adf:LnP:RsSvW")) != -1) { @@ -1955,18 +2025,23 @@ main(int argc, char **argv) if (proc_id != PROC_GOTD) fatal("invalid process id %d", proc_id); + evtimer_set(&gotd.listen_proc->tmo, kill_proc_timeout, + gotd.listen_proc); + apply_unveil_selfexec(); signal_set(&evsigint, SIGINT, gotd_sighdlr, NULL); signal_set(&evsigterm, SIGTERM, gotd_sighdlr, NULL); signal_set(&evsighup, SIGHUP, gotd_sighdlr, NULL); signal_set(&evsigusr1, SIGUSR1, gotd_sighdlr, NULL); + signal_set(&evsigchld, SIGCHLD, gotd_sighdlr, NULL); signal(SIGPIPE, SIG_IGN); signal_add(&evsigint, NULL); signal_add(&evsigterm, NULL); signal_add(&evsighup, NULL); signal_add(&evsigusr1, NULL); + signal_add(&evsigchld, NULL); gotd_imsg_event_add(&gotd.listen_proc->iev);
gotd: wait asynchronously for child termination