From: Omar Polo Subject: Re: gotd: wait asynchronously for child termination To: Stefan Sperling Cc: gameoftrees@openbsd.org Date: Thu, 22 Jun 2023 17:40:37 +0200 On 2023/06/22 17:13:38 +0200, Stefan Sperling wrote: > On Thu, Jun 22, 2023 at 04:53:14PM +0200, Omar Polo wrote: > > diff /home/op/w/got > > commit - 9b9bad55e7a411af3e01d9e4a86c127876184726 > > path + /home/op/w/got > > blob - 39b1d2a54632934de2bfaf127e20f0b0f725f4c8 > > file + gotd/gotd.c > > --- gotd/gotd.c > > +++ gotd/gotd.c > > @@ -80,7 +80,11 @@ struct gotd_child_proc { > > char repo_path[PATH_MAX]; > > int pipe[2]; > > struct gotd_imsgev iev; > > + struct event tmo; > > + > > + TAILQ_ENTRY(gotd_child_proc) procs; > > By convention used so far, the above variable should be called 'entry'. right, fixed. > > -static void > > proc_done(struct gotd_child_proc *proc) > > { > > - event_del(&proc->iev.ev); > > - msgbuf_clear(&proc->iev.ibuf.w); > > - close(proc->iev.ibuf.fd); > > - kill_proc(proc, 0); > > - wait_for_child(proc->pid); > > + struct gotd_client *clt; > > Could you rename 'clt' to 'client' as used elsewhere? done > > + > > + TAILQ_REMOVE(&procs, proc, procs); > > + > > + clt = find_client_by_proc_fd(proc->iev.ibuf.fd); > > + if (clt != NULL) { > > + if (proc == clt->repo) > > + clt->repo = NULL; > > + if (proc == clt->auth) > > + clt->auth = NULL; > > + if (proc == clt->session) > > + clt->session = NULL; > > + disconnect_on_error(clt, got_error(GOT_ERR_PRIVSEP_DIED)); > > I believe this error might arrive at the client user's terminal, and > they would see "unprivileged process died unexpectedly", even in the > code path where the child process terminates normally. this shouldn't be executed in the normal code path because disconnect() first calls the various kill_*_proc() which set client->{repo,auth,session} to NULL. find_client_by_proc_fd() then won't find the client. > I would suggest to either use GOT_ERR_EOF which will be logged but > not be sent to the user, or simply call disconnect() directly. i've changed it to disconnect() anyway. we already have the logging if a process dies, and the (dis)connected user doesn't need to know. > > @@ -745,6 +735,20 @@ kill_proc(struct gotd_child_proc *proc, int fatal) > > static void > > kill_proc(struct gotd_child_proc *proc, int fatal) > > { > > + struct timeval tv = { 5, 0 }; > > + > > + log_debug("kill -%d %d", fatal ? SIGKILL : SIGTERM, proc->pid); > > + > > + if (proc->iev.ibuf.fd != -1) { > > + event_del(&proc->iev.ev); > > + msgbuf_clear(&proc->iev.ibuf.w); > > + close(proc->iev.ibuf.fd); > > + proc->iev.ibuf.fd = -1; > > + } > > + > > + if (!evtimer_pending(&proc->tmo, NULL)) > > + evtimer_add(&proc->tmo, &tv); > > + > > The above check needs && !fatal, otherwise we will schedule the > timeout again after it has already fired to trigger kill -9. ah, yes. it's pointless to schedule the timer here since we'll get immediately the SIGCHLD. > > static void > > +kill_proc_timeout(int fd, short ev, void *d) > > +{ > > + struct gotd_child_proc *proc = d; > > + > > + log_warnx("wait timeout for PID %d terminated", proc->pid); > > + kill_proc(proc, 1); > > I would rephrase the above as something like: > "timeout waiting for PID %d to terminate; retrying with force" done, thanks > > +static struct gotd_child_proc * > > +find_proc_by_pid(pid_t pid) > > +{ > > + struct gotd_child_proc *proc; > > Must initialize to NULL above, otherwise we return garbage below if no > PID matches. here I cheated a bit. procs contains always at least one process (the listener), but I agree with not being clever and initializing the variable. > > + > > + TAILQ_FOREACH(proc, &procs, procs) > > + if (proc->pid == pid) > > + break; > > + > > + return proc; > > +} > > > @@ -1508,6 +1560,13 @@ start_listener(char *argv0, const char *confpath, int > > if (proc == NULL) > > fatal("calloc"); > > > > + TAILQ_INSERT_HEAD(&procs, proc, procs); > > + > > + /* > > + * XXX start_listener is called before event_init() so can't > > + * initialize proc->tmo here. > > + */ > > + > > proc->type = PROC_LISTEN; > > I would remove the XXX, I suspect that not calling event_init() before > starting the listener was deliberate. At least I vaguely recall having > problems with forking and event_init(). the issue is calling daemon() which fork()s under the hood. we would need to either reinitialize libevent or doing something like the following: : log_init(daemonize ? 0 : 1, LOG_DAEMON); : log_setverbose(verbosity); : : + if (proc_id == PROC_GOTD && daemonize && daemon(1, 0) == -1) : + fatal("daemon"); : + : + event_init(); : + : if (proc_id == PROC_GOTD) { : snprintf(title, sizeof(title), "%s", gotd_proc_names[proc_id]); : arc4random_buf(&clients_hash_key, sizeof(clients_hash_key)); : - if (daemonize && daemon(1, 0) == -1) : - fatal("daemon"); : gotd.pid = getpid(); : start_listener(argv0, confpath, daemonize, verbosity); : } else if (proc_id == PROC_LISTEN) { : @@ -1932,8 +1935,6 @@ main(int argc, char **argv) : if (setuid(pw->pw_uid) == -1) : fatal("setuid %d failed", pw->pw_uid); : : - event_init(); : - : switch (proc_id) { : case PROC_GOTD: : #ifndef PROFILE but this initializes libevent before dropping privileges. > Perhaps mention that the timeout will be initialized in main() instead? done too. diff refs/remotes/got/main refs/heads/main commit - ba91039c1a0a3d55f4850e26c24730cbf4b5c239 commit + 4affeead3c9700295bc3ad55f06ecfc9e839014f blob - 39b1d2a54632934de2bfaf127e20f0b0f725f4c8 blob + 68a99a122229289490584888ded7f2beb2d99042 --- gotd/gotd.c +++ gotd/gotd.c @@ -80,7 +80,11 @@ struct gotd_child_proc { char repo_path[PATH_MAX]; int pipe[2]; struct gotd_imsgev iev; + struct event tmo; + + TAILQ_ENTRY(gotd_child_proc) entry; }; +TAILQ_HEAD(gotd_procs, gotd_child_proc) procs; struct gotd_client { STAILQ_ENTRY(gotd_client) entry; @@ -113,6 +117,7 @@ static void kill_proc(struct gotd_child_proc *, int); static const struct got_error *start_auth_child(struct gotd_client *, int, struct gotd_repo *, char *, const char *, int, int); static void kill_proc(struct gotd_child_proc *, int); +static void disconnect(struct gotd_client *); __dead static void usage(void) @@ -276,77 +281,62 @@ wait_for_child(pid_t child_pid) } static void -wait_for_child(pid_t child_pid) -{ - pid_t pid; - int status; - - log_debug("waiting for child PID %ld to terminate", - (long)child_pid); - - do { - pid = waitpid(child_pid, &status, WNOHANG); - if (pid == -1) { - if (errno != EINTR && errno != ECHILD) - fatal("wait"); - } else if (WIFSIGNALED(status)) { - log_warnx("child PID %ld terminated; signal %d", - (long)pid, WTERMSIG(status)); - } - } while (pid != -1 || (pid == -1 && errno == EINTR)); -} - -static void proc_done(struct gotd_child_proc *proc) { - event_del(&proc->iev.ev); - msgbuf_clear(&proc->iev.ibuf.w); - close(proc->iev.ibuf.fd); - kill_proc(proc, 0); - wait_for_child(proc->pid); + struct gotd_client *client; + + TAILQ_REMOVE(&procs, proc, entry); + + client = find_client_by_proc_fd(proc->iev.ibuf.fd); + if (client != NULL) { + if (proc == client->repo) + client->repo = NULL; + if (proc == client->auth) + client->auth = NULL; + if (proc == client->session) + client->session = NULL; + disconnect(client); + } + + evtimer_del(&proc->tmo); + + if (proc->iev.ibuf.fd != -1) { + event_del(&proc->iev.ev); + msgbuf_clear(&proc->iev.ibuf.w); + close(proc->iev.ibuf.fd); + } + free(proc); } static void kill_repo_proc(struct gotd_client *client) { - struct gotd_child_proc *proc; - if (client->repo == NULL) return; - proc = client->repo; + kill_proc(client->repo, 0); client->repo = NULL; - - proc_done(proc); } static void kill_auth_proc(struct gotd_client *client) { - struct gotd_child_proc *proc; - if (client->auth == NULL) return; - proc = client->auth; + kill_proc(client->auth, 0); client->auth = NULL; - - proc_done(proc); } static void kill_session_proc(struct gotd_client *client) { - struct gotd_child_proc *proc; - if (client->session == NULL) return; - proc = client->session; + kill_proc(client->session, 0); client->session = NULL; - - proc_done(proc); } static void @@ -745,6 +735,20 @@ kill_proc(struct gotd_child_proc *proc, int fatal) static void kill_proc(struct gotd_child_proc *proc, int fatal) { + struct timeval tv = { 5, 0 }; + + log_debug("kill -%d %d", fatal ? SIGKILL : SIGTERM, proc->pid); + + if (proc->iev.ibuf.fd != -1) { + event_del(&proc->iev.ev); + msgbuf_clear(&proc->iev.ibuf.w); + close(proc->iev.ibuf.fd); + proc->iev.ibuf.fd = -1; + } + + if (!evtimer_pending(&proc->tmo, NULL) && !fatal) + evtimer_add(&proc->tmo, &tv); + if (fatal) { log_warnx("sending SIGKILL to PID %d", proc->pid); kill(proc->pid, SIGKILL); @@ -753,9 +757,18 @@ gotd_shutdown(void) } static void +kill_proc_timeout(int fd, short ev, void *d) +{ + struct gotd_child_proc *proc = d; + + log_warnx("timeout waiting for PID %d to terminate;" + " retrying with force", proc->pid); + kill_proc(proc, 1); +} + +static void gotd_shutdown(void) { - struct gotd_child_proc *proc; uint64_t slot; log_debug("shutting down"); @@ -766,20 +779,31 @@ gotd_shutdown(void) disconnect(c); } - proc = gotd.listen_proc; - msgbuf_clear(&proc->iev.ibuf.w); - close(proc->iev.ibuf.fd); - kill_proc(proc, 0); - wait_for_child(proc->pid); - free(proc); + kill_proc(gotd.listen_proc, 0); log_info("terminating"); exit(0); } +static struct gotd_child_proc * +find_proc_by_pid(pid_t pid) +{ + struct gotd_child_proc *proc = NULL; + + TAILQ_FOREACH(proc, &procs, entry) + if (proc->pid == pid) + break; + + return proc; +} + void gotd_sighdlr(int sig, short event, void *arg) { + struct gotd_child_proc *proc; + pid_t pid; + int status; + /* * Normal signal handler rules don't apply because libevent * decouples for us. @@ -796,6 +820,35 @@ gotd_sighdlr(int sig, short event, void *arg) case SIGINT: gotd_shutdown(); break; + case SIGCHLD: + for (;;) { + pid = waitpid(WAIT_ANY, &status, WNOHANG); + if (pid == -1) { + if (errno == EINTR) + continue; + if (errno == ECHILD) + break; + fatal("waitpid"); + } + if (pid == 0) + break; + + log_debug("reaped pid %d", pid); + proc = find_proc_by_pid(pid); + if (proc == NULL) { + log_info("caught exit of unknown child %d", + pid); + continue; + } + + if (WIFSIGNALED(status)) { + log_warnx("child PID %d terminated with" + " signal %d", pid, WTERMSIG(status)); + } + + proc_done(proc); + } + break; default: fatalx("unexpected signal"); } @@ -1508,6 +1561,10 @@ start_listener(char *argv0, const char *confpath, int if (proc == NULL) fatal("calloc"); + TAILQ_INSERT_HEAD(&procs, proc, entry); + + /* proc->tmo is initialized in main() after event_init() */ + proc->type = PROC_LISTEN; if (socketpair(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, @@ -1534,6 +1591,9 @@ start_session_child(struct gotd_client *client, struct if (proc == NULL) return got_error_from_errno("calloc"); + TAILQ_INSERT_HEAD(&procs, proc, entry); + evtimer_set(&proc->tmo, kill_proc_timeout, proc); + if (client_is_reading(client)) proc->type = PROC_SESSION_READ; else @@ -1580,6 +1640,9 @@ start_repo_child(struct gotd_client *client, enum gotd if (proc == NULL) return got_error_from_errno("calloc"); + TAILQ_INSERT_HEAD(&procs, proc, entry); + evtimer_set(&proc->tmo, kill_proc_timeout, proc); + proc->type = proc_type; if (strlcpy(proc->repo_name, repo->name, sizeof(proc->repo_name)) >= sizeof(proc->repo_name)) @@ -1632,6 +1695,9 @@ start_auth_child(struct gotd_client *client, int requi return err; } + TAILQ_INSERT_HEAD(&procs, proc, entry); + evtimer_set(&proc->tmo, kill_proc_timeout, proc); + proc->type = PROC_AUTH; if (strlcpy(proc->repo_name, repo->name, sizeof(proc->repo_name)) >= sizeof(proc->repo_name)) @@ -1732,10 +1798,12 @@ main(int argc, char **argv) struct passwd *pw = NULL; char *repo_path = NULL; enum gotd_procid proc_id = PROC_GOTD; - struct event evsigint, evsigterm, evsighup, evsigusr1; + struct event evsigint, evsigterm, evsighup, evsigusr1, evsigchld; int *pack_fds = NULL, *temp_fds = NULL; struct gotd_repo *repo = NULL; + TAILQ_INIT(&procs); + log_init(1, LOG_DAEMON); /* Log to stderr until daemonized. */ while ((ch = getopt(argc, argv, "Adf:LnP:RsSvW")) != -1) { @@ -1955,18 +2023,23 @@ main(int argc, char **argv) if (proc_id != PROC_GOTD) fatal("invalid process id %d", proc_id); + evtimer_set(&gotd.listen_proc->tmo, kill_proc_timeout, + gotd.listen_proc); + apply_unveil_selfexec(); signal_set(&evsigint, SIGINT, gotd_sighdlr, NULL); signal_set(&evsigterm, SIGTERM, gotd_sighdlr, NULL); signal_set(&evsighup, SIGHUP, gotd_sighdlr, NULL); signal_set(&evsigusr1, SIGUSR1, gotd_sighdlr, NULL); + signal_set(&evsigchld, SIGCHLD, gotd_sighdlr, NULL); signal(SIGPIPE, SIG_IGN); signal_add(&evsigint, NULL); signal_add(&evsigterm, NULL); signal_add(&evsighup, NULL); signal_add(&evsigusr1, NULL); + signal_add(&evsigchld, NULL); gotd_imsg_event_add(&gotd.listen_proc->iev);