Download raw body.
gotd: Don't use WNOHANG when waiting for child processes
On 2023/06/19 23:39:04 +0200, Stefan Sperling <stsp@stsp.name> wrote: > On Sat, Mar 04, 2023 at 11:58:46AM +0100, Omar Polo wrote: > > On 2023/03/03 09:35:59 -0500, Josiah Frentsos <jfrent@tilde.team> wrote: > > > Fixes this: > > > > > > 2023-03-03T13:51:15.155Z silicon gotd[7715]: child PID 0 terminated; signal 12 > > > 2023-03-03T13:51:15.186Z silicon last message repeated 419 times > > > > I don't think the diff is wrong per-se, wait_for_child is just a > > wrapper around waitpid and we're interested in one child only anyway, > > so I *think* WNOHANG is not much important. I'm curious why it's > > needed though. > > > > waitpid(WNOHANG) returns 0 when "there are no stopped or exited > > children". Since we call it always after kill_proc() maybe it's a > > timing issue: we call waitpid(WNOHANG) before the signal is handled > > and the other process exits? > > Given that wait_for_child() runs in libevent handlers (via client_disconnect) > I don't think we want to call waitpid() in blocking mode. We should stay in > control over how long we are going to wait for the child to exit. > Otherwise, if waitpid() never returns for some reason (for example, because > the child is spinning on the CPU and doesn't exit) then all of gotd will > hang because further events will not be processed. > > The diff below gives up after 5 seconds. Good enough? Too long? > > I did consider retrying with kill -9 upon timeout. But for now I would rather > be able to attach gdb to a child that doesn't terminate to figure out why. we talked about this on irc for a bit and i think we agreed on killing the procs and setting a timeout for a kill -9 instead of looping on waitpid(WNOHANG) with sleeps inbetween. diff below is an attempt to do so. I've only tested it on localhost so far, but the logging output seems good. regress passes, but that tells us almost nothing because it doesn't catch zombies and other leaks. it's a bit crude and can be splitted in a few commits, but should be good enough for some testing. I'm running this on my mirror. There's some logging that should help understanding if something goes wrong. More semplification can land after. The idea is to keep a queue of gotd_child_procs. Upon kill_proc we set a timeout and if it fires before the corresponding SIGCHLD it gets killed the hard way with SIGKILL. For local debugging, the kill_proc call in kill_proc_timeout can de commented to attach gdb. diff refs/heads/main refs/heads/async-wait commit - 9fc9df27559545b0cacca46fc6d8db711fdcaf41 commit + 2b6f7f49603a594f2d231229a94497357087b9c3 blob - f15597dbd0e7299cbf018874f9d81d6c8dcaf403 blob + cae57edbe0fd64d98e48affe930440122149ea1e --- gotd/gotd.c +++ gotd/gotd.c @@ -73,6 +73,19 @@ struct gotd_client { GOTD_CLIENT_STATE_ACCESS_GRANTED, }; +struct gotd_child_proc { + pid_t pid; + enum gotd_procid type; + char repo_name[NAME_MAX]; + char repo_path[PATH_MAX]; + int pipe[2]; + struct gotd_imsgev iev; + struct event tmo; + + TAILQ_ENTRY(gotd_child_proc) procs; +}; +TAILQ_HEAD(gotd_procs, gotd_child_proc) procs; + struct gotd_client { STAILQ_ENTRY(gotd_client) entry; enum gotd_client_state state; @@ -267,88 +280,64 @@ wait_for_child(pid_t child_pid) } static void -wait_for_child(pid_t child_pid) -{ - pid_t pid; - int status; - - log_debug("waiting for child PID %ld to terminate", - (long)child_pid); - - do { - pid = waitpid(child_pid, &status, WNOHANG); - if (pid == -1) { - if (errno != EINTR && errno != ECHILD) - fatal("wait"); - } else if (WIFSIGNALED(status)) { - log_warnx("child PID %ld terminated; signal %d", - (long)pid, WTERMSIG(status)); - } - } while (pid != -1 || (pid == -1 && errno == EINTR)); -} - -static void proc_done(struct gotd_child_proc *proc) { - event_del(&proc->iev.ev); - msgbuf_clear(&proc->iev.ibuf.w); - close(proc->iev.ibuf.fd); - kill_proc(proc, 0); - wait_for_child(proc->pid); + TAILQ_REMOVE(&procs, proc, procs); + + evtimer_del(&proc->tmo); + + if (proc->iev.ibuf.fd != -1) { + event_del(&proc->iev.ev); + msgbuf_clear(&proc->iev.ibuf.w); + close(proc->iev.ibuf.fd); + } + free(proc); } static void +kill_repo_proc(struct gotd_client *client) +{ + if (client->repo == NULL) + return; + + kill_proc(client->repo, 0); + client->repo = NULL; +} + +static void kill_auth_proc(struct gotd_client *client) { - struct gotd_child_proc *proc; - if (client->auth == NULL) return; - proc = client->auth; + kill_proc(client->auth, 0); client->auth = NULL; - - proc_done(proc); } static void kill_session_proc(struct gotd_client *client) { - struct gotd_child_proc *proc; - if (client->session == NULL) return; - proc = client->session; + kill_proc(client->session, 0); client->session = NULL; - - proc_done(proc); } static void disconnect(struct gotd_client *client) { struct gotd_imsg_disconnect idisconnect; - struct gotd_child_proc *proc = client->repo; - struct gotd_child_proc *listen_proc = &gotd.listen_proc; + struct gotd_child_proc *listen_proc = gotd.listen_proc; uint64_t slot; log_debug("uid %d: disconnecting", client->euid); kill_auth_proc(client); kill_session_proc(client); + kill_repo_proc(client); - if (proc) { - event_del(&proc->iev.ev); - msgbuf_clear(&proc->iev.ibuf.w); - close(proc->iev.ibuf.fd); - kill_proc(proc, 0); - wait_for_child(proc->pid); - free(proc); - proc = NULL; - } - idisconnect.client_id = client->id; if (gotd_imsg_compose_event(&listen_proc->iev, GOTD_IMSG_DISCONNECT, PROC_GOTD, -1, @@ -703,7 +692,7 @@ done: client->euid, client->fd); done: if (err) { - struct gotd_child_proc *listen_proc = &gotd.listen_proc; + struct gotd_child_proc *listen_proc = gotd.listen_proc; struct gotd_imsg_disconnect idisconnect; idisconnect.client_id = client->id; @@ -732,6 +721,20 @@ kill_proc(struct gotd_child_proc *proc, int fatal) static void kill_proc(struct gotd_child_proc *proc, int fatal) { + struct timeval tv = { 5, 0 }; + + log_debug("kill -%d %d", fatal ? SIGKILL : SIGTERM, proc->pid); + + if (proc->iev.ibuf.fd != -1) { + event_del(&proc->iev.ev); + msgbuf_clear(&proc->iev.ibuf.w); + close(proc->iev.ibuf.fd); + proc->iev.ibuf.fd = -1; + } + + if (!evtimer_pending(&proc->tmo, NULL)) + evtimer_add(&proc->tmo, &tv); + if (fatal) { log_warnx("sending SIGKILL to PID %d", proc->pid); kill(proc->pid, SIGKILL); @@ -740,6 +743,15 @@ gotd_shutdown(void) } static void +kill_proc_timeout(int fd, short ev, void *d) +{ + struct gotd_child_proc *proc = d; + + log_warnx("wait timeout for PID %d terminated", proc->pid); + kill_proc(proc, 1); +} + +static void gotd_shutdown(void) { struct gotd_child_proc *proc; @@ -753,19 +765,32 @@ gotd_shutdown(void) disconnect(c); } - proc = &gotd.listen_proc; - msgbuf_clear(&proc->iev.ibuf.w); - close(proc->iev.ibuf.fd); - kill_proc(proc, 0); - wait_for_child(proc->pid); + TAILQ_FOREACH(proc, &procs, procs) + kill_proc(proc, 0); log_info("terminating"); exit(0); } +static struct gotd_child_proc * +find_proc_by_pid(pid_t pid) +{ + struct gotd_child_proc *proc; + + TAILQ_FOREACH(proc, &procs, procs) + if (proc->pid == pid) + break; + + return proc; +} + void gotd_sighdlr(int sig, short event, void *arg) { + struct gotd_child_proc *proc; + pid_t pid; + int status; + /* * Normal signal handler rules don't apply because libevent * decouples for us. @@ -782,6 +807,35 @@ gotd_sighdlr(int sig, short event, void *arg) case SIGINT: gotd_shutdown(); break; + case SIGCHLD: + for (;;) { + pid = waitpid(WAIT_ANY, &status, WNOHANG); + if (pid == -1) { + if (errno == EINTR) + continue; + if (errno == ECHILD) + break; + fatal("waitpid"); + } + if (pid == 0) + break; + + log_debug("reaped pid %d", pid); + proc = find_proc_by_pid(pid); + if (proc == NULL) { + log_info("caught exit of unknown child %d", + pid); + continue; + } + + if (WIFSIGNALED(status)) { + log_warnx("child PID %d terminated with" + " signal %d", pid, WTERMSIG(status)); + } + + proc_done(proc); + } + break; default: fatalx("unexpected signal"); } @@ -960,7 +1014,7 @@ gotd_dispatch_listener(int fd, short event, void *arg) { struct gotd_imsgev *iev = arg; struct imsgbuf *ibuf = &iev->ibuf; - struct gotd_child_proc *proc = &gotd.listen_proc; + struct gotd_child_proc *proc = gotd.listen_proc; ssize_t n; int shut = 0; struct imsg imsg; @@ -1488,8 +1542,19 @@ start_listener(char *argv0, const char *confpath, int static void start_listener(char *argv0, const char *confpath, int daemonize, int verbosity) { - struct gotd_child_proc *proc = &gotd.listen_proc; + struct gotd_child_proc *proc; + proc = calloc(1, sizeof(*proc)); + if (proc == NULL) + fatal("calloc"); + + TAILQ_INSERT_HEAD(&procs, proc, procs); + + /* + * XXX start_listener is called before event_init() so can't + * initialize proc->tmo here. + */ + proc->type = PROC_LISTEN; if (socketpair(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, @@ -1502,6 +1567,8 @@ start_listener(char *argv0, const char *confpath, int proc->iev.handler = gotd_dispatch_listener; proc->iev.events = EV_READ; proc->iev.handler_arg = NULL; + + gotd.listen_proc = proc; } static const struct got_error * @@ -1514,6 +1581,9 @@ start_session_child(struct gotd_client *client, struct if (proc == NULL) return got_error_from_errno("calloc"); + TAILQ_INSERT_HEAD(&procs, proc, procs); + evtimer_set(&proc->tmo, kill_proc_timeout, proc); + if (client_is_reading(client)) proc->type = PROC_SESSION_READ; else @@ -1560,6 +1630,9 @@ start_repo_child(struct gotd_client *client, enum gotd if (proc == NULL) return got_error_from_errno("calloc"); + TAILQ_INSERT_HEAD(&procs, proc, procs); + evtimer_set(&proc->tmo, kill_proc_timeout, proc); + proc->type = proc_type; if (strlcpy(proc->repo_name, repo->name, sizeof(proc->repo_name)) >= sizeof(proc->repo_name)) @@ -1612,6 +1685,9 @@ start_auth_child(struct gotd_client *client, int requi return err; } + TAILQ_INSERT_HEAD(&procs, proc, procs); + evtimer_set(&proc->tmo, kill_proc_timeout, proc); + proc->type = PROC_AUTH; if (strlcpy(proc->repo_name, repo->name, sizeof(proc->repo_name)) >= sizeof(proc->repo_name)) @@ -1712,10 +1788,12 @@ main(int argc, char **argv) struct passwd *pw = NULL; char *repo_path = NULL; enum gotd_procid proc_id = PROC_GOTD; - struct event evsigint, evsigterm, evsighup, evsigusr1; + struct event evsigint, evsigterm, evsighup, evsigusr1, evsigchld; int *pack_fds = NULL, *temp_fds = NULL; struct gotd_repo *repo = NULL; + TAILQ_INIT(&procs); + log_init(1, LOG_DAEMON); /* Log to stderr until daemonized. */ while ((ch = getopt(argc, argv, "Adf:LnP:RsSvW")) != -1) { @@ -1935,20 +2013,25 @@ main(int argc, char **argv) if (proc_id != PROC_GOTD) fatal("invalid process id %d", proc_id); + evtimer_set(&gotd.listen_proc->tmo, kill_proc_timeout, + gotd.listen_proc); + apply_unveil_selfexec(); signal_set(&evsigint, SIGINT, gotd_sighdlr, NULL); signal_set(&evsigterm, SIGTERM, gotd_sighdlr, NULL); signal_set(&evsighup, SIGHUP, gotd_sighdlr, NULL); signal_set(&evsigusr1, SIGUSR1, gotd_sighdlr, NULL); + signal_set(&evsigchld, SIGCHLD, gotd_sighdlr, NULL); signal(SIGPIPE, SIG_IGN); signal_add(&evsigint, NULL); signal_add(&evsigterm, NULL); signal_add(&evsighup, NULL); signal_add(&evsigusr1, NULL); + signal_add(&evsigchld, NULL); - gotd_imsg_event_add(&gotd.listen_proc.iev); + gotd_imsg_event_add(&gotd.listen_proc->iev); event_dispatch(); blob - 35d6e95f4881eb3c4ce60156fce684fa03070cb4 blob + e902d6f53b03f7682202e2b9b3d0af5fc28c25b6 --- gotd/gotd.h +++ gotd/gotd.h @@ -51,16 +51,6 @@ struct gotd_child_proc { short events; }; -struct gotd_child_proc { - pid_t pid; - enum gotd_procid type; - char repo_name[NAME_MAX]; - char repo_path[PATH_MAX]; - int pipe[2]; - struct gotd_imsgev iev; - size_t nhelpers; -}; - enum gotd_access { GOTD_ACCESS_PERMITTED = 1, GOTD_ACCESS_DENIED @@ -120,13 +110,15 @@ struct gotd { int max_connections; }; +struct gotd_child_proc; + struct gotd { pid_t pid; char unix_socket_path[PATH_MAX]; char user_name[32]; struct gotd_repolist repos; int nrepos; - struct gotd_child_proc listen_proc; + struct gotd_child_proc *listen_proc; struct timeval request_timeout; struct timeval auth_timeout; struct gotd_uid_connection_limit *connection_limits;
gotd: Don't use WNOHANG when waiting for child processes