"GOT", but the "O" is a cute, smiling pufferfish. Index | Thread | Search

From:
Omar Polo <op@omarpolo.com>
Subject:
Re: gotd: Don't use WNOHANG when waiting for child processes
To:
Stefan Sperling <stsp@stsp.name>
Cc:
Josiah Frentsos <jfrent@tilde.team>, gameoftrees@openbsd.org
Date:
Wed, 21 Jun 2023 14:05:24 +0200

Download raw body.

Thread
On 2023/06/19 23:39:04 +0200, Stefan Sperling <stsp@stsp.name> wrote:
> On Sat, Mar 04, 2023 at 11:58:46AM +0100, Omar Polo wrote:
> > On 2023/03/03 09:35:59 -0500, Josiah Frentsos <jfrent@tilde.team> wrote:
> > > Fixes this:
> > > 
> > >     2023-03-03T13:51:15.155Z silicon gotd[7715]: child PID 0 terminated; signal 12
> > >     2023-03-03T13:51:15.186Z silicon last message repeated 419 times
> > 
> > I don't think the diff is wrong per-se, wait_for_child is just a
> > wrapper around waitpid and we're interested in one child only anyway,
> > so I *think* WNOHANG is not much important.  I'm curious why it's
> > needed though.
> > 
> > waitpid(WNOHANG) returns 0 when "there are no stopped or exited
> > children".  Since we call it always after kill_proc() maybe it's a
> > timing issue: we call waitpid(WNOHANG) before the signal is handled
> > and the other process exits?
> 
> Given that wait_for_child() runs in libevent handlers (via client_disconnect)
> I don't think we want to call waitpid() in blocking mode. We should stay in
> control over how long we are going to wait for the child to exit.
> Otherwise, if waitpid() never returns for some reason (for example, because
> the child is spinning on the CPU and doesn't exit) then all of gotd will
> hang because further events will not be processed.
> 
> The diff below gives up after 5 seconds. Good enough? Too long?
> 
> I did consider retrying with kill -9 upon timeout. But for now I would rather
> be able to attach gdb to a child that doesn't terminate to figure out why.

we talked about this on irc for a bit and i think we agreed on killing
the procs and setting a timeout for a kill -9 instead of looping on
waitpid(WNOHANG) with sleeps inbetween.

diff below is an attempt to do so.  I've only tested it on localhost
so far, but the logging output seems good.  regress passes, but that
tells us almost nothing because it doesn't catch zombies and other
leaks.

it's a bit crude and can be splitted in a few commits, but should be
good enough for some testing.  I'm running this on my mirror.

There's some logging that should help understanding if something goes
wrong.  More semplification can land after.

The idea is to keep a queue of gotd_child_procs.  Upon kill_proc we
set a timeout and if it fires before the corresponding SIGCHLD it gets
killed the hard way with SIGKILL.  For local debugging, the kill_proc
call in kill_proc_timeout can de commented to attach gdb.

diff refs/heads/main refs/heads/async-wait
commit - 9fc9df27559545b0cacca46fc6d8db711fdcaf41
commit + 2b6f7f49603a594f2d231229a94497357087b9c3
blob - f15597dbd0e7299cbf018874f9d81d6c8dcaf403
blob + cae57edbe0fd64d98e48affe930440122149ea1e
--- gotd/gotd.c
+++ gotd/gotd.c
@@ -73,6 +73,19 @@ struct gotd_client {
 	GOTD_CLIENT_STATE_ACCESS_GRANTED,
 };
 
+struct gotd_child_proc {
+	pid_t				 pid;
+	enum gotd_procid		 type;
+	char				 repo_name[NAME_MAX];
+	char				 repo_path[PATH_MAX];
+	int				 pipe[2];
+	struct gotd_imsgev		 iev;
+	struct event			 tmo;
+
+	TAILQ_ENTRY(gotd_child_proc)	 procs;
+};
+TAILQ_HEAD(gotd_procs, gotd_child_proc) procs;
+
 struct gotd_client {
 	STAILQ_ENTRY(gotd_client)	 entry;
 	enum gotd_client_state		 state;
@@ -267,88 +280,64 @@ wait_for_child(pid_t child_pid)
 }
 
 static void
-wait_for_child(pid_t child_pid)
-{
-	pid_t pid;
-	int status;
-
-	log_debug("waiting for child PID %ld to terminate",
-	    (long)child_pid);
-
-	do {
-		pid = waitpid(child_pid, &status, WNOHANG);
-		if (pid == -1) {
-			if (errno != EINTR && errno != ECHILD)
-				fatal("wait");
-		} else if (WIFSIGNALED(status)) {
-			log_warnx("child PID %ld terminated; signal %d",
-			    (long)pid, WTERMSIG(status));
-		}
-	} while (pid != -1 || (pid == -1 && errno == EINTR));
-}
-
-static void
 proc_done(struct gotd_child_proc *proc)
 {
-	event_del(&proc->iev.ev);
-	msgbuf_clear(&proc->iev.ibuf.w);
-	close(proc->iev.ibuf.fd);
-	kill_proc(proc, 0);
-	wait_for_child(proc->pid);
+	TAILQ_REMOVE(&procs, proc, procs);
+
+	evtimer_del(&proc->tmo);
+
+	if (proc->iev.ibuf.fd != -1) {
+		event_del(&proc->iev.ev);
+		msgbuf_clear(&proc->iev.ibuf.w);
+		close(proc->iev.ibuf.fd);
+	}
+
 	free(proc);
 }
 
 static void
+kill_repo_proc(struct gotd_client *client)
+{
+	if (client->repo == NULL)
+		return;
+
+	kill_proc(client->repo, 0);
+	client->repo = NULL;
+}
+
+static void
 kill_auth_proc(struct gotd_client *client)
 {
-	struct gotd_child_proc *proc;
-
 	if (client->auth == NULL)
 		return;
 
-	proc = client->auth;
+	kill_proc(client->auth, 0);
 	client->auth = NULL;
-
-	proc_done(proc);
 }
 
 static void
 kill_session_proc(struct gotd_client *client)
 {
-	struct gotd_child_proc *proc;
-
 	if (client->session == NULL)
 		return;
 
-	proc = client->session;
+	kill_proc(client->session, 0);
 	client->session = NULL;
-
-	proc_done(proc);
 }
 
 static void
 disconnect(struct gotd_client *client)
 {
 	struct gotd_imsg_disconnect idisconnect;
-	struct gotd_child_proc *proc = client->repo;
-	struct gotd_child_proc *listen_proc = &gotd.listen_proc;
+	struct gotd_child_proc *listen_proc = gotd.listen_proc;
 	uint64_t slot;
 
 	log_debug("uid %d: disconnecting", client->euid);
 
 	kill_auth_proc(client);
 	kill_session_proc(client);
+	kill_repo_proc(client);
 
-	if (proc) {
-		event_del(&proc->iev.ev);
-		msgbuf_clear(&proc->iev.ibuf.w);
-		close(proc->iev.ibuf.fd);
-		kill_proc(proc, 0);
-		wait_for_child(proc->pid);
-		free(proc);
-		proc = NULL;
-	}
-
 	idisconnect.client_id = client->id;
 	if (gotd_imsg_compose_event(&listen_proc->iev,
 	    GOTD_IMSG_DISCONNECT, PROC_GOTD, -1,
@@ -703,7 +692,7 @@ done:
 	    client->euid, client->fd);
 done:
 	if (err) {
-		struct gotd_child_proc *listen_proc = &gotd.listen_proc;
+		struct gotd_child_proc *listen_proc = gotd.listen_proc;
 		struct gotd_imsg_disconnect idisconnect;
 
 		idisconnect.client_id = client->id;
@@ -732,6 +721,20 @@ kill_proc(struct gotd_child_proc *proc, int fatal)
 static void
 kill_proc(struct gotd_child_proc *proc, int fatal)
 {
+	struct timeval tv = { 5, 0 };
+
+	log_debug("kill -%d %d", fatal ? SIGKILL : SIGTERM, proc->pid);
+
+	if (proc->iev.ibuf.fd != -1) {
+		event_del(&proc->iev.ev);
+		msgbuf_clear(&proc->iev.ibuf.w);
+		close(proc->iev.ibuf.fd);
+		proc->iev.ibuf.fd = -1;
+	}
+
+	if (!evtimer_pending(&proc->tmo, NULL))
+		evtimer_add(&proc->tmo, &tv);
+
 	if (fatal) {
 		log_warnx("sending SIGKILL to PID %d", proc->pid);
 		kill(proc->pid, SIGKILL);
@@ -740,6 +743,15 @@ gotd_shutdown(void)
 }
 
 static void
+kill_proc_timeout(int fd, short ev, void *d)
+{
+	struct gotd_child_proc *proc = d;
+
+	log_warnx("wait timeout for PID %d terminated", proc->pid);
+	kill_proc(proc, 1);
+}
+
+static void
 gotd_shutdown(void)
 {
 	struct gotd_child_proc *proc;
@@ -753,19 +765,32 @@ gotd_shutdown(void)
 			disconnect(c);
 	}
 
-	proc = &gotd.listen_proc;
-	msgbuf_clear(&proc->iev.ibuf.w);
-	close(proc->iev.ibuf.fd);
-	kill_proc(proc, 0);
-	wait_for_child(proc->pid);
+	TAILQ_FOREACH(proc, &procs, procs)
+		kill_proc(proc, 0);
 
 	log_info("terminating");
 	exit(0);
 }
 
+static struct gotd_child_proc *
+find_proc_by_pid(pid_t pid)
+{
+	struct gotd_child_proc *proc;
+
+	TAILQ_FOREACH(proc, &procs, procs)
+		if (proc->pid == pid)
+			break;
+
+	return proc;
+}
+
 void
 gotd_sighdlr(int sig, short event, void *arg)
 {
+	struct gotd_child_proc *proc;
+	pid_t pid;
+	int status;
+
 	/*
 	 * Normal signal handler rules don't apply because libevent
 	 * decouples for us.
@@ -782,6 +807,35 @@ gotd_sighdlr(int sig, short event, void *arg)
 	case SIGINT:
 		gotd_shutdown();
 		break;
+	case SIGCHLD:
+		for (;;) {
+			pid = waitpid(WAIT_ANY, &status, WNOHANG);
+			if (pid == -1) {
+				if (errno == EINTR)
+					continue;
+				if (errno == ECHILD)
+					break;
+				fatal("waitpid");
+			}
+			if (pid == 0)
+				break;
+
+			log_debug("reaped pid %d", pid);
+			proc = find_proc_by_pid(pid);
+			if (proc == NULL) {
+				log_info("caught exit of unknown child %d",
+				    pid);
+				continue;
+			}
+
+			if (WIFSIGNALED(status)) {
+				log_warnx("child PID %d terminated with"
+				    " signal %d", pid, WTERMSIG(status));
+			}
+
+			proc_done(proc);
+		}
+		break;
 	default:
 		fatalx("unexpected signal");
 	}
@@ -960,7 +1014,7 @@ gotd_dispatch_listener(int fd, short event, void *arg)
 {
 	struct gotd_imsgev *iev = arg;
 	struct imsgbuf *ibuf = &iev->ibuf;
-	struct gotd_child_proc *proc = &gotd.listen_proc;
+	struct gotd_child_proc *proc = gotd.listen_proc;
 	ssize_t n;
 	int shut = 0;
 	struct imsg imsg;
@@ -1488,8 +1542,19 @@ start_listener(char *argv0, const char *confpath, int 
 static void
 start_listener(char *argv0, const char *confpath, int daemonize, int verbosity)
 {
-	struct gotd_child_proc *proc = &gotd.listen_proc;
+	struct gotd_child_proc *proc;
 
+	proc = calloc(1, sizeof(*proc));
+	if (proc == NULL)
+		fatal("calloc");
+
+	TAILQ_INSERT_HEAD(&procs, proc, procs);
+
+	/*
+	 * XXX start_listener is called before event_init() so can't
+	 * initialize proc->tmo here.
+	 */
+
 	proc->type = PROC_LISTEN;
 
 	if (socketpair(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK,
@@ -1502,6 +1567,8 @@ start_listener(char *argv0, const char *confpath, int 
 	proc->iev.handler = gotd_dispatch_listener;
 	proc->iev.events = EV_READ;
 	proc->iev.handler_arg = NULL;
+
+	gotd.listen_proc = proc;
 }
 
 static const struct got_error *
@@ -1514,6 +1581,9 @@ start_session_child(struct gotd_client *client, struct
 	if (proc == NULL)
 		return got_error_from_errno("calloc");
 
+	TAILQ_INSERT_HEAD(&procs, proc, procs);
+	evtimer_set(&proc->tmo, kill_proc_timeout, proc);
+
 	if (client_is_reading(client))
 		proc->type = PROC_SESSION_READ;
 	else
@@ -1560,6 +1630,9 @@ start_repo_child(struct gotd_client *client, enum gotd
 	if (proc == NULL)
 		return got_error_from_errno("calloc");
 
+	TAILQ_INSERT_HEAD(&procs, proc, procs);
+	evtimer_set(&proc->tmo, kill_proc_timeout, proc);
+
 	proc->type = proc_type;
 	if (strlcpy(proc->repo_name, repo->name,
 	    sizeof(proc->repo_name)) >= sizeof(proc->repo_name))
@@ -1612,6 +1685,9 @@ start_auth_child(struct gotd_client *client, int requi
 		return err;
 	}
 
+	TAILQ_INSERT_HEAD(&procs, proc, procs);
+	evtimer_set(&proc->tmo, kill_proc_timeout, proc);
+
 	proc->type = PROC_AUTH;
 	if (strlcpy(proc->repo_name, repo->name,
 	    sizeof(proc->repo_name)) >= sizeof(proc->repo_name))
@@ -1712,10 +1788,12 @@ main(int argc, char **argv)
 	struct passwd *pw = NULL;
 	char *repo_path = NULL;
 	enum gotd_procid proc_id = PROC_GOTD;
-	struct event evsigint, evsigterm, evsighup, evsigusr1;
+	struct event evsigint, evsigterm, evsighup, evsigusr1, evsigchld;
 	int *pack_fds = NULL, *temp_fds = NULL;
 	struct gotd_repo *repo = NULL;
 
+	TAILQ_INIT(&procs);
+
 	log_init(1, LOG_DAEMON); /* Log to stderr until daemonized. */
 
 	while ((ch = getopt(argc, argv, "Adf:LnP:RsSvW")) != -1) {
@@ -1935,20 +2013,25 @@ main(int argc, char **argv)
 	if (proc_id != PROC_GOTD)
 		fatal("invalid process id %d", proc_id);
 
+	evtimer_set(&gotd.listen_proc->tmo, kill_proc_timeout,
+	    gotd.listen_proc);
+
 	apply_unveil_selfexec();
 
 	signal_set(&evsigint, SIGINT, gotd_sighdlr, NULL);
 	signal_set(&evsigterm, SIGTERM, gotd_sighdlr, NULL);
 	signal_set(&evsighup, SIGHUP, gotd_sighdlr, NULL);
 	signal_set(&evsigusr1, SIGUSR1, gotd_sighdlr, NULL);
+	signal_set(&evsigchld, SIGCHLD, gotd_sighdlr, NULL);
 	signal(SIGPIPE, SIG_IGN);
 
 	signal_add(&evsigint, NULL);
 	signal_add(&evsigterm, NULL);
 	signal_add(&evsighup, NULL);
 	signal_add(&evsigusr1, NULL);
+	signal_add(&evsigchld, NULL);
 
-	gotd_imsg_event_add(&gotd.listen_proc.iev);
+	gotd_imsg_event_add(&gotd.listen_proc->iev);
 
 	event_dispatch();
 
blob - 35d6e95f4881eb3c4ce60156fce684fa03070cb4
blob + e902d6f53b03f7682202e2b9b3d0af5fc28c25b6
--- gotd/gotd.h
+++ gotd/gotd.h
@@ -51,16 +51,6 @@ struct gotd_child_proc {
 	short		 events;
 };
 
-struct gotd_child_proc {
-	pid_t pid;
-	enum gotd_procid type;
-	char repo_name[NAME_MAX];
-	char repo_path[PATH_MAX];
-	int pipe[2];
-	struct gotd_imsgev iev;
-	size_t nhelpers;
-};
-
 enum gotd_access {
 	GOTD_ACCESS_PERMITTED = 1,
 	GOTD_ACCESS_DENIED
@@ -120,13 +110,15 @@ struct gotd {
 	int max_connections;
 };
 
+struct gotd_child_proc;
+
 struct gotd {
 	pid_t pid;
 	char unix_socket_path[PATH_MAX];
 	char user_name[32];
 	struct gotd_repolist repos;
 	int nrepos;
-	struct gotd_child_proc listen_proc;
+	struct gotd_child_proc *listen_proc;
 	struct timeval request_timeout;
 	struct timeval auth_timeout;
 	struct gotd_uid_connection_limit *connection_limits;