"GOT", but the "O" is a cute, smiling pufferfish. Index | Thread | Search

From:
Omar Polo <op@omarpolo.com>
Subject:
gotd: wait asynchronously for child termination
To:
gameoftrees@openbsd.org
Date:
Thu, 22 Jun 2023 16:53:14 +0200

Download raw body.

Thread
This should help with the current "child PID 0 terminated" messages
caused by races with waitpid().

The idea is to manage the subprocesses separately from the clients, in
their own queue, and, instead of the current kill() + waitpid(WNOHANG),
which doesn't reap the process unless we're very lucky, to schedule its
termination (still via kill(2)) and set a timer.  Then, if the timer
fires before the corresponding SIGCHLD arrives, the process is killed
abruptly.

There's a small catch in this: since now we listen for SIGCHLD and
tear down the process there, we don't get the imsg EOF.  So if a
subprocess dies unexpectedly, we need to disconnect() the matching
client too.

proc_done() is now only called from the SIGCHLD handler, and
kill_proc() only sends the signal.  There's a bit of redundancy
between these two because in both code paths (i.e. crash -> SIGCHLD ->
proc_done() or kill_proc()) there are resources to be released, and
closing the pipe on kill_proc() seems sensible.


diff /home/op/w/got
commit - 9b9bad55e7a411af3e01d9e4a86c127876184726
path + /home/op/w/got
blob - 39b1d2a54632934de2bfaf127e20f0b0f725f4c8
file + gotd/gotd.c
--- gotd/gotd.c
+++ gotd/gotd.c
@@ -80,7 +80,11 @@ struct gotd_child_proc {
 	char				 repo_path[PATH_MAX];
 	int				 pipe[2];
 	struct gotd_imsgev		 iev;
+	struct event			 tmo;
+
+	TAILQ_ENTRY(gotd_child_proc)	 procs;
 };
+TAILQ_HEAD(gotd_procs, gotd_child_proc) procs;
 
 struct gotd_client {
 	STAILQ_ENTRY(gotd_client)	 entry;
@@ -113,6 +117,7 @@ static void kill_proc(struct gotd_child_proc *, int);
 static const struct got_error *start_auth_child(struct gotd_client *, int,
     struct gotd_repo *, char *, const char *, int, int);
 static void kill_proc(struct gotd_child_proc *, int);
+static void disconnect_on_error(struct gotd_client *, const struct got_error *);
 
 __dead static void
 usage(void)
@@ -276,77 +281,62 @@ wait_for_child(pid_t child_pid)
 }
 
 static void
-wait_for_child(pid_t child_pid)
-{
-	pid_t pid;
-	int status;
-
-	log_debug("waiting for child PID %ld to terminate",
-	    (long)child_pid);
-
-	do {
-		pid = waitpid(child_pid, &status, WNOHANG);
-		if (pid == -1) {
-			if (errno != EINTR && errno != ECHILD)
-				fatal("wait");
-		} else if (WIFSIGNALED(status)) {
-			log_warnx("child PID %ld terminated; signal %d",
-			    (long)pid, WTERMSIG(status));
-		}
-	} while (pid != -1 || (pid == -1 && errno == EINTR));
-}
-
-static void
 proc_done(struct gotd_child_proc *proc)
 {
-	event_del(&proc->iev.ev);
-	msgbuf_clear(&proc->iev.ibuf.w);
-	close(proc->iev.ibuf.fd);
-	kill_proc(proc, 0);
-	wait_for_child(proc->pid);
+	struct gotd_client *clt;
+
+	TAILQ_REMOVE(&procs, proc, procs);
+
+	clt = find_client_by_proc_fd(proc->iev.ibuf.fd);
+	if (clt != NULL) {
+		if (proc == clt->repo)
+			clt->repo = NULL;
+		if (proc == clt->auth)
+			clt->auth = NULL;
+		if (proc == clt->session)
+			clt->session = NULL;
+		disconnect_on_error(clt, got_error(GOT_ERR_PRIVSEP_DIED));
+	}
+
+	evtimer_del(&proc->tmo);
+
+	if (proc->iev.ibuf.fd != -1) {
+		event_del(&proc->iev.ev);
+		msgbuf_clear(&proc->iev.ibuf.w);
+		close(proc->iev.ibuf.fd);
+	}
+
 	free(proc);
 }
 
 static void
 kill_repo_proc(struct gotd_client *client)
 {
-	struct gotd_child_proc *proc;
-
 	if (client->repo == NULL)
 		return;
 
-	proc = client->repo;
+	kill_proc(client->repo, 0);
 	client->repo = NULL;
-
-	proc_done(proc);
 }
 
 static void
 kill_auth_proc(struct gotd_client *client)
 {
-	struct gotd_child_proc *proc;
-
 	if (client->auth == NULL)
 		return;
 
-	proc = client->auth;
+	kill_proc(client->auth, 0);
 	client->auth = NULL;
-
-	proc_done(proc);
 }
 
 static void
 kill_session_proc(struct gotd_client *client)
 {
-	struct gotd_child_proc *proc;
-
 	if (client->session == NULL)
 		return;
 
-	proc = client->session;
+	kill_proc(client->session, 0);
 	client->session = NULL;
-
-	proc_done(proc);
 }
 
 static void
@@ -745,6 +735,20 @@ kill_proc(struct gotd_child_proc *proc, int fatal)
 static void
 kill_proc(struct gotd_child_proc *proc, int fatal)
 {
+	struct timeval tv = { 5, 0 };
+
+	log_debug("kill -%d %d", fatal ? SIGKILL : SIGTERM, proc->pid);
+
+	if (proc->iev.ibuf.fd != -1) {
+		event_del(&proc->iev.ev);
+		msgbuf_clear(&proc->iev.ibuf.w);
+		close(proc->iev.ibuf.fd);
+		proc->iev.ibuf.fd = -1;
+	}
+
+	if (!evtimer_pending(&proc->tmo, NULL))
+		evtimer_add(&proc->tmo, &tv);
+
 	if (fatal) {
 		log_warnx("sending SIGKILL to PID %d", proc->pid);
 		kill(proc->pid, SIGKILL);
@@ -753,9 +757,17 @@ gotd_shutdown(void)
 }
 
 static void
+kill_proc_timeout(int fd, short ev, void *d)
+{
+	struct gotd_child_proc *proc = d;
+
+	log_warnx("wait timeout for PID %d terminated", proc->pid);
+	kill_proc(proc, 1);
+}
+
+static void
 gotd_shutdown(void)
 {
-	struct gotd_child_proc *proc;
 	uint64_t slot;
 
 	log_debug("shutting down");
@@ -766,20 +778,31 @@ gotd_shutdown(void)
 			disconnect(c);
 	}
 
-	proc = gotd.listen_proc;
-	msgbuf_clear(&proc->iev.ibuf.w);
-	close(proc->iev.ibuf.fd);
-	kill_proc(proc, 0);
-	wait_for_child(proc->pid);
-	free(proc);
+	kill_proc(gotd.listen_proc, 0);
 
 	log_info("terminating");
 	exit(0);
 }
 
+static struct gotd_child_proc *
+find_proc_by_pid(pid_t pid)
+{
+	struct gotd_child_proc *proc;
+
+	TAILQ_FOREACH(proc, &procs, procs)
+		if (proc->pid == pid)
+			break;
+
+	return proc;
+}
+
 void
 gotd_sighdlr(int sig, short event, void *arg)
 {
+	struct gotd_child_proc *proc;
+	pid_t pid;
+	int status;
+
 	/*
 	 * Normal signal handler rules don't apply because libevent
 	 * decouples for us.
@@ -796,6 +819,35 @@ gotd_sighdlr(int sig, short event, void *arg)
 	case SIGINT:
 		gotd_shutdown();
 		break;
+	case SIGCHLD:
+		for (;;) {
+			pid = waitpid(WAIT_ANY, &status, WNOHANG);
+			if (pid == -1) {
+				if (errno == EINTR)
+					continue;
+				if (errno == ECHILD)
+					break;
+				fatal("waitpid");
+			}
+			if (pid == 0)
+				break;
+
+			log_debug("reaped pid %d", pid);
+			proc = find_proc_by_pid(pid);
+			if (proc == NULL) {
+				log_info("caught exit of unknown child %d",
+				    pid);
+				continue;
+			}
+
+			if (WIFSIGNALED(status)) {
+				log_warnx("child PID %d terminated with"
+				    " signal %d", pid, WTERMSIG(status));
+			}
+
+			proc_done(proc);
+		}
+		break;
 	default:
 		fatalx("unexpected signal");
 	}
@@ -1508,6 +1560,13 @@ start_listener(char *argv0, const char *confpath, int 
 	if (proc == NULL)
 		fatal("calloc");
 
+	TAILQ_INSERT_HEAD(&procs, proc, procs);
+
+	/*
+	 * XXX start_listener is called before event_init() so can't
+	 * initialize proc->tmo here.
+	 */
+
 	proc->type = PROC_LISTEN;
 
 	if (socketpair(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK,
@@ -1534,6 +1593,9 @@ start_session_child(struct gotd_client *client, struct
 	if (proc == NULL)
 		return got_error_from_errno("calloc");
 
+	TAILQ_INSERT_HEAD(&procs, proc, procs);
+	evtimer_set(&proc->tmo, kill_proc_timeout, proc);
+
 	if (client_is_reading(client))
 		proc->type = PROC_SESSION_READ;
 	else
@@ -1580,6 +1642,9 @@ start_repo_child(struct gotd_client *client, enum gotd
 	if (proc == NULL)
 		return got_error_from_errno("calloc");
 
+	TAILQ_INSERT_HEAD(&procs, proc, procs);
+	evtimer_set(&proc->tmo, kill_proc_timeout, proc);
+
 	proc->type = proc_type;
 	if (strlcpy(proc->repo_name, repo->name,
 	    sizeof(proc->repo_name)) >= sizeof(proc->repo_name))
@@ -1632,6 +1697,9 @@ start_auth_child(struct gotd_client *client, int requi
 		return err;
 	}
 
+	TAILQ_INSERT_HEAD(&procs, proc, procs);
+	evtimer_set(&proc->tmo, kill_proc_timeout, proc);
+
 	proc->type = PROC_AUTH;
 	if (strlcpy(proc->repo_name, repo->name,
 	    sizeof(proc->repo_name)) >= sizeof(proc->repo_name))
@@ -1732,10 +1800,12 @@ main(int argc, char **argv)
 	struct passwd *pw = NULL;
 	char *repo_path = NULL;
 	enum gotd_procid proc_id = PROC_GOTD;
-	struct event evsigint, evsigterm, evsighup, evsigusr1;
+	struct event evsigint, evsigterm, evsighup, evsigusr1, evsigchld;
 	int *pack_fds = NULL, *temp_fds = NULL;
 	struct gotd_repo *repo = NULL;
 
+	TAILQ_INIT(&procs);
+
 	log_init(1, LOG_DAEMON); /* Log to stderr until daemonized. */
 
 	while ((ch = getopt(argc, argv, "Adf:LnP:RsSvW")) != -1) {
@@ -1955,18 +2025,23 @@ main(int argc, char **argv)
 	if (proc_id != PROC_GOTD)
 		fatal("invalid process id %d", proc_id);
 
+	evtimer_set(&gotd.listen_proc->tmo, kill_proc_timeout,
+	    gotd.listen_proc);
+
 	apply_unveil_selfexec();
 
 	signal_set(&evsigint, SIGINT, gotd_sighdlr, NULL);
 	signal_set(&evsigterm, SIGTERM, gotd_sighdlr, NULL);
 	signal_set(&evsighup, SIGHUP, gotd_sighdlr, NULL);
 	signal_set(&evsigusr1, SIGUSR1, gotd_sighdlr, NULL);
+	signal_set(&evsigchld, SIGCHLD, gotd_sighdlr, NULL);
 	signal(SIGPIPE, SIG_IGN);
 
 	signal_add(&evsigint, NULL);
 	signal_add(&evsigterm, NULL);
 	signal_add(&evsighup, NULL);
 	signal_add(&evsigusr1, NULL);
+	signal_add(&evsigchld, NULL);
 
 	gotd_imsg_event_add(&gotd.listen_proc->iev);