"GOT", but the "O" is a cute, smiling pufferfish. Index | Thread | Search

From:
Omar Polo <op@omarpolo.com>
Subject:
Re: gotd: wait asynchronously for child termination
To:
Stefan Sperling <stsp@stsp.name>
Cc:
gameoftrees@openbsd.org
Date:
Thu, 22 Jun 2023 17:40:37 +0200

Download raw body.

Thread
On 2023/06/22 17:13:38 +0200, Stefan Sperling <stsp@stsp.name> wrote:
> On Thu, Jun 22, 2023 at 04:53:14PM +0200, Omar Polo wrote:
> > diff /home/op/w/got
> > commit - 9b9bad55e7a411af3e01d9e4a86c127876184726
> > path + /home/op/w/got
> > blob - 39b1d2a54632934de2bfaf127e20f0b0f725f4c8
> > file + gotd/gotd.c
> > --- gotd/gotd.c
> > +++ gotd/gotd.c
> > @@ -80,7 +80,11 @@ struct gotd_child_proc {
> >  	char				 repo_path[PATH_MAX];
> >  	int				 pipe[2];
> >  	struct gotd_imsgev		 iev;
> > +	struct event			 tmo;
> > +
> > +	TAILQ_ENTRY(gotd_child_proc)	 procs;
> 
> By convention used so far, the above variable should be called 'entry'.

right, fixed.

> > -static void
> >  proc_done(struct gotd_child_proc *proc)
> >  {
> > -	event_del(&proc->iev.ev);
> > -	msgbuf_clear(&proc->iev.ibuf.w);
> > -	close(proc->iev.ibuf.fd);
> > -	kill_proc(proc, 0);
> > -	wait_for_child(proc->pid);
> > +	struct gotd_client *clt;
> 
> Could you rename 'clt' to 'client' as used elsewhere?

done

> > +
> > +	TAILQ_REMOVE(&procs, proc, procs);
> > +
> > +	clt = find_client_by_proc_fd(proc->iev.ibuf.fd);
> > +	if (clt != NULL) {
> > +		if (proc == clt->repo)
> > +			clt->repo = NULL;
> > +		if (proc == clt->auth)
> > +			clt->auth = NULL;
> > +		if (proc == clt->session)
> > +			clt->session = NULL;
> > +		disconnect_on_error(clt, got_error(GOT_ERR_PRIVSEP_DIED));
> 
> I believe this error might arrive at the client user's terminal, and
> they would see "unprivileged process died unexpectedly", even in the
> code path where the child process terminates normally.

this shouldn't be executed in the normal code path because
disconnect() first calls the various kill_*_proc() which set
client->{repo,auth,session} to NULL.
find_client_by_proc_fd() then won't find the client.

> I would suggest to either use GOT_ERR_EOF which will be logged but
> not be sent to the user, or simply call disconnect() directly.

i've changed it to disconnect() anyway.  we already have the logging
if a process dies, and the (dis)connected user doesn't need to know.

> > @@ -745,6 +735,20 @@ kill_proc(struct gotd_child_proc *proc, int fatal)
> >  static void
> >  kill_proc(struct gotd_child_proc *proc, int fatal)
> >  {
> > +	struct timeval tv = { 5, 0 };
> > +
> > +	log_debug("kill -%d %d", fatal ? SIGKILL : SIGTERM, proc->pid);
> > +
> > +	if (proc->iev.ibuf.fd != -1) {
> > +		event_del(&proc->iev.ev);
> > +		msgbuf_clear(&proc->iev.ibuf.w);
> > +		close(proc->iev.ibuf.fd);
> > +		proc->iev.ibuf.fd = -1;
> > +	}
> > +
> > +	if (!evtimer_pending(&proc->tmo, NULL))
> > +		evtimer_add(&proc->tmo, &tv);
> > +
> 
> The above check needs && !fatal, otherwise we will schedule the
> timeout again after it has already fired to trigger kill -9.

ah, yes.  it's pointless to schedule the timer here since we'll get
immediately the SIGCHLD.

> >  static void
> > +kill_proc_timeout(int fd, short ev, void *d)
> > +{
> > +	struct gotd_child_proc *proc = d;
> > +
> > +	log_warnx("wait timeout for PID %d terminated", proc->pid);
> > +	kill_proc(proc, 1);
> 
> I would rephrase the above as something like:
>   "timeout waiting for PID %d to terminate; retrying with force"

done, thanks

> > +static struct gotd_child_proc *
> > +find_proc_by_pid(pid_t pid)
> > +{
> > +	struct gotd_child_proc *proc;
> 
> Must initialize to NULL above, otherwise we return garbage below if no
> PID matches.

here I cheated a bit.  procs contains always at least one process (the
listener), but I agree with not being clever and initializing the
variable.

> > +
> > +	TAILQ_FOREACH(proc, &procs, procs)
> > +		if (proc->pid == pid)
> > +			break;
> > +
> > +	return proc;
> > +}
> 
> > @@ -1508,6 +1560,13 @@ start_listener(char *argv0, const char *confpath, int 
> >  	if (proc == NULL)
> >  		fatal("calloc");
> >  
> > +	TAILQ_INSERT_HEAD(&procs, proc, procs);
> > +
> > +	/*
> > +	 * XXX start_listener is called before event_init() so can't
> > +	 * initialize proc->tmo here.
> > +	 */
> > +
> >  	proc->type = PROC_LISTEN;
> 
> I would remove the XXX, I suspect that not calling event_init() before
> starting the listener was deliberate. At least I vaguely recall having
> problems with forking and event_init().

the issue is calling daemon() which fork()s under the hood.  we would
need to either reinitialize libevent or doing something like the
following:

:  	log_init(daemonize ? 0 : 1, LOG_DAEMON);
:  	log_setverbose(verbosity);
:  
: +	if (proc_id == PROC_GOTD && daemonize && daemon(1, 0) == -1)
: +		fatal("daemon");
: +
: +	event_init();
: +
:  	if (proc_id == PROC_GOTD) {
:  		snprintf(title, sizeof(title), "%s", gotd_proc_names[proc_id]);
:  		arc4random_buf(&clients_hash_key, sizeof(clients_hash_key));
: -		if (daemonize && daemon(1, 0) == -1)
: -			fatal("daemon");
:  		gotd.pid = getpid();
:  		start_listener(argv0, confpath, daemonize, verbosity);
:  	} else if (proc_id == PROC_LISTEN) {
: @@ -1932,8 +1935,6 @@ main(int argc, char **argv)
:  	if (setuid(pw->pw_uid) == -1)
:  		fatal("setuid %d failed", pw->pw_uid);
:  
: -	event_init();
: -
:  	switch (proc_id) {
:  	case PROC_GOTD:
:  #ifndef PROFILE

but this initializes libevent before dropping privileges.

> Perhaps mention that the timeout will be initialized in main() instead?

done too.

diff refs/remotes/got/main refs/heads/main
commit - ba91039c1a0a3d55f4850e26c24730cbf4b5c239
commit + 4affeead3c9700295bc3ad55f06ecfc9e839014f
blob - 39b1d2a54632934de2bfaf127e20f0b0f725f4c8
blob + 68a99a122229289490584888ded7f2beb2d99042
--- gotd/gotd.c
+++ gotd/gotd.c
@@ -80,7 +80,11 @@ struct gotd_child_proc {
 	char				 repo_path[PATH_MAX];
 	int				 pipe[2];
 	struct gotd_imsgev		 iev;
+	struct event			 tmo;
+
+	TAILQ_ENTRY(gotd_child_proc)	 entry;
 };
+TAILQ_HEAD(gotd_procs, gotd_child_proc) procs;
 
 struct gotd_client {
 	STAILQ_ENTRY(gotd_client)	 entry;
@@ -113,6 +117,7 @@ static void kill_proc(struct gotd_child_proc *, int);
 static const struct got_error *start_auth_child(struct gotd_client *, int,
     struct gotd_repo *, char *, const char *, int, int);
 static void kill_proc(struct gotd_child_proc *, int);
+static void disconnect(struct gotd_client *);
 
 __dead static void
 usage(void)
@@ -276,77 +281,62 @@ wait_for_child(pid_t child_pid)
 }
 
 static void
-wait_for_child(pid_t child_pid)
-{
-	pid_t pid;
-	int status;
-
-	log_debug("waiting for child PID %ld to terminate",
-	    (long)child_pid);
-
-	do {
-		pid = waitpid(child_pid, &status, WNOHANG);
-		if (pid == -1) {
-			if (errno != EINTR && errno != ECHILD)
-				fatal("wait");
-		} else if (WIFSIGNALED(status)) {
-			log_warnx("child PID %ld terminated; signal %d",
-			    (long)pid, WTERMSIG(status));
-		}
-	} while (pid != -1 || (pid == -1 && errno == EINTR));
-}
-
-static void
 proc_done(struct gotd_child_proc *proc)
 {
-	event_del(&proc->iev.ev);
-	msgbuf_clear(&proc->iev.ibuf.w);
-	close(proc->iev.ibuf.fd);
-	kill_proc(proc, 0);
-	wait_for_child(proc->pid);
+	struct gotd_client *client;
+
+	TAILQ_REMOVE(&procs, proc, entry);
+
+	client = find_client_by_proc_fd(proc->iev.ibuf.fd);
+	if (client != NULL) {
+		if (proc == client->repo)
+			client->repo = NULL;
+		if (proc == client->auth)
+			client->auth = NULL;
+		if (proc == client->session)
+			client->session = NULL;
+		disconnect(client);
+	}
+
+	evtimer_del(&proc->tmo);
+
+	if (proc->iev.ibuf.fd != -1) {
+		event_del(&proc->iev.ev);
+		msgbuf_clear(&proc->iev.ibuf.w);
+		close(proc->iev.ibuf.fd);
+	}
+
 	free(proc);
 }
 
 static void
 kill_repo_proc(struct gotd_client *client)
 {
-	struct gotd_child_proc *proc;
-
 	if (client->repo == NULL)
 		return;
 
-	proc = client->repo;
+	kill_proc(client->repo, 0);
 	client->repo = NULL;
-
-	proc_done(proc);
 }
 
 static void
 kill_auth_proc(struct gotd_client *client)
 {
-	struct gotd_child_proc *proc;
-
 	if (client->auth == NULL)
 		return;
 
-	proc = client->auth;
+	kill_proc(client->auth, 0);
 	client->auth = NULL;
-
-	proc_done(proc);
 }
 
 static void
 kill_session_proc(struct gotd_client *client)
 {
-	struct gotd_child_proc *proc;
-
 	if (client->session == NULL)
 		return;
 
-	proc = client->session;
+	kill_proc(client->session, 0);
 	client->session = NULL;
-
-	proc_done(proc);
 }
 
 static void
@@ -745,6 +735,20 @@ kill_proc(struct gotd_child_proc *proc, int fatal)
 static void
 kill_proc(struct gotd_child_proc *proc, int fatal)
 {
+	struct timeval tv = { 5, 0 };
+
+	log_debug("kill -%d %d", fatal ? SIGKILL : SIGTERM, proc->pid);
+
+	if (proc->iev.ibuf.fd != -1) {
+		event_del(&proc->iev.ev);
+		msgbuf_clear(&proc->iev.ibuf.w);
+		close(proc->iev.ibuf.fd);
+		proc->iev.ibuf.fd = -1;
+	}
+
+	if (!evtimer_pending(&proc->tmo, NULL) && !fatal)
+		evtimer_add(&proc->tmo, &tv);
+
 	if (fatal) {
 		log_warnx("sending SIGKILL to PID %d", proc->pid);
 		kill(proc->pid, SIGKILL);
@@ -753,9 +757,18 @@ gotd_shutdown(void)
 }
 
 static void
+kill_proc_timeout(int fd, short ev, void *d)
+{
+	struct gotd_child_proc *proc = d;
+
+	log_warnx("timeout waiting for PID %d to terminate;"
+	    " retrying with force", proc->pid);
+	kill_proc(proc, 1);
+}
+
+static void
 gotd_shutdown(void)
 {
-	struct gotd_child_proc *proc;
 	uint64_t slot;
 
 	log_debug("shutting down");
@@ -766,20 +779,31 @@ gotd_shutdown(void)
 			disconnect(c);
 	}
 
-	proc = gotd.listen_proc;
-	msgbuf_clear(&proc->iev.ibuf.w);
-	close(proc->iev.ibuf.fd);
-	kill_proc(proc, 0);
-	wait_for_child(proc->pid);
-	free(proc);
+	kill_proc(gotd.listen_proc, 0);
 
 	log_info("terminating");
 	exit(0);
 }
 
+static struct gotd_child_proc *
+find_proc_by_pid(pid_t pid)
+{
+	struct gotd_child_proc *proc = NULL;
+
+	TAILQ_FOREACH(proc, &procs, entry)
+		if (proc->pid == pid)
+			break;
+
+	return proc;
+}
+
 void
 gotd_sighdlr(int sig, short event, void *arg)
 {
+	struct gotd_child_proc *proc;
+	pid_t pid;
+	int status;
+
 	/*
 	 * Normal signal handler rules don't apply because libevent
 	 * decouples for us.
@@ -796,6 +820,35 @@ gotd_sighdlr(int sig, short event, void *arg)
 	case SIGINT:
 		gotd_shutdown();
 		break;
+	case SIGCHLD:
+		for (;;) {
+			pid = waitpid(WAIT_ANY, &status, WNOHANG);
+			if (pid == -1) {
+				if (errno == EINTR)
+					continue;
+				if (errno == ECHILD)
+					break;
+				fatal("waitpid");
+			}
+			if (pid == 0)
+				break;
+
+			log_debug("reaped pid %d", pid);
+			proc = find_proc_by_pid(pid);
+			if (proc == NULL) {
+				log_info("caught exit of unknown child %d",
+				    pid);
+				continue;
+			}
+
+			if (WIFSIGNALED(status)) {
+				log_warnx("child PID %d terminated with"
+				    " signal %d", pid, WTERMSIG(status));
+			}
+
+			proc_done(proc);
+		}
+		break;
 	default:
 		fatalx("unexpected signal");
 	}
@@ -1508,6 +1561,10 @@ start_listener(char *argv0, const char *confpath, int 
 	if (proc == NULL)
 		fatal("calloc");
 
+	TAILQ_INSERT_HEAD(&procs, proc, entry);
+
+	/* proc->tmo is initialized in main() after event_init() */
+
 	proc->type = PROC_LISTEN;
 
 	if (socketpair(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK,
@@ -1534,6 +1591,9 @@ start_session_child(struct gotd_client *client, struct
 	if (proc == NULL)
 		return got_error_from_errno("calloc");
 
+	TAILQ_INSERT_HEAD(&procs, proc, entry);
+	evtimer_set(&proc->tmo, kill_proc_timeout, proc);
+
 	if (client_is_reading(client))
 		proc->type = PROC_SESSION_READ;
 	else
@@ -1580,6 +1640,9 @@ start_repo_child(struct gotd_client *client, enum gotd
 	if (proc == NULL)
 		return got_error_from_errno("calloc");
 
+	TAILQ_INSERT_HEAD(&procs, proc, entry);
+	evtimer_set(&proc->tmo, kill_proc_timeout, proc);
+
 	proc->type = proc_type;
 	if (strlcpy(proc->repo_name, repo->name,
 	    sizeof(proc->repo_name)) >= sizeof(proc->repo_name))
@@ -1632,6 +1695,9 @@ start_auth_child(struct gotd_client *client, int requi
 		return err;
 	}
 
+	TAILQ_INSERT_HEAD(&procs, proc, entry);
+	evtimer_set(&proc->tmo, kill_proc_timeout, proc);
+
 	proc->type = PROC_AUTH;
 	if (strlcpy(proc->repo_name, repo->name,
 	    sizeof(proc->repo_name)) >= sizeof(proc->repo_name))
@@ -1732,10 +1798,12 @@ main(int argc, char **argv)
 	struct passwd *pw = NULL;
 	char *repo_path = NULL;
 	enum gotd_procid proc_id = PROC_GOTD;
-	struct event evsigint, evsigterm, evsighup, evsigusr1;
+	struct event evsigint, evsigterm, evsighup, evsigusr1, evsigchld;
 	int *pack_fds = NULL, *temp_fds = NULL;
 	struct gotd_repo *repo = NULL;
 
+	TAILQ_INIT(&procs);
+
 	log_init(1, LOG_DAEMON); /* Log to stderr until daemonized. */
 
 	while ((ch = getopt(argc, argv, "Adf:LnP:RsSvW")) != -1) {
@@ -1955,18 +2023,23 @@ main(int argc, char **argv)
 	if (proc_id != PROC_GOTD)
 		fatal("invalid process id %d", proc_id);
 
+	evtimer_set(&gotd.listen_proc->tmo, kill_proc_timeout,
+	    gotd.listen_proc);
+
 	apply_unveil_selfexec();
 
 	signal_set(&evsigint, SIGINT, gotd_sighdlr, NULL);
 	signal_set(&evsigterm, SIGTERM, gotd_sighdlr, NULL);
 	signal_set(&evsighup, SIGHUP, gotd_sighdlr, NULL);
 	signal_set(&evsigusr1, SIGUSR1, gotd_sighdlr, NULL);
+	signal_set(&evsigchld, SIGCHLD, gotd_sighdlr, NULL);
 	signal(SIGPIPE, SIG_IGN);
 
 	signal_add(&evsigint, NULL);
 	signal_add(&evsigterm, NULL);
 	signal_add(&evsighup, NULL);
 	signal_add(&evsigusr1, NULL);
+	signal_add(&evsigchld, NULL);
 
 	gotd_imsg_event_add(&gotd.listen_proc->iev);