Download raw body.
gotd: wait asynchronously for child termination
On 2023/06/22 17:13:38 +0200, Stefan Sperling <stsp@stsp.name> wrote:
> On Thu, Jun 22, 2023 at 04:53:14PM +0200, Omar Polo wrote:
> > diff /home/op/w/got
> > commit - 9b9bad55e7a411af3e01d9e4a86c127876184726
> > path + /home/op/w/got
> > blob - 39b1d2a54632934de2bfaf127e20f0b0f725f4c8
> > file + gotd/gotd.c
> > --- gotd/gotd.c
> > +++ gotd/gotd.c
> > @@ -80,7 +80,11 @@ struct gotd_child_proc {
> > char repo_path[PATH_MAX];
> > int pipe[2];
> > struct gotd_imsgev iev;
> > + struct event tmo;
> > +
> > + TAILQ_ENTRY(gotd_child_proc) procs;
>
> By convention used so far, the above variable should be called 'entry'.
right, fixed.
> > -static void
> > proc_done(struct gotd_child_proc *proc)
> > {
> > - event_del(&proc->iev.ev);
> > - msgbuf_clear(&proc->iev.ibuf.w);
> > - close(proc->iev.ibuf.fd);
> > - kill_proc(proc, 0);
> > - wait_for_child(proc->pid);
> > + struct gotd_client *clt;
>
> Could you rename 'clt' to 'client' as used elsewhere?
done
> > +
> > + TAILQ_REMOVE(&procs, proc, procs);
> > +
> > + clt = find_client_by_proc_fd(proc->iev.ibuf.fd);
> > + if (clt != NULL) {
> > + if (proc == clt->repo)
> > + clt->repo = NULL;
> > + if (proc == clt->auth)
> > + clt->auth = NULL;
> > + if (proc == clt->session)
> > + clt->session = NULL;
> > + disconnect_on_error(clt, got_error(GOT_ERR_PRIVSEP_DIED));
>
> I believe this error might arrive at the client user's terminal, and
> they would see "unprivileged process died unexpectedly", even in the
> code path where the child process terminates normally.
this shouldn't be executed in the normal code path because
disconnect() first calls the various kill_*_proc() which set
client->{repo,auth,session} to NULL.
find_client_by_proc_fd() then won't find the client.
> I would suggest to either use GOT_ERR_EOF which will be logged but
> not be sent to the user, or simply call disconnect() directly.
i've changed it to disconnect() anyway. we already have the logging
if a process dies, and the (dis)connected user doesn't need to know.
> > @@ -745,6 +735,20 @@ kill_proc(struct gotd_child_proc *proc, int fatal)
> > static void
> > kill_proc(struct gotd_child_proc *proc, int fatal)
> > {
> > + struct timeval tv = { 5, 0 };
> > +
> > + log_debug("kill -%d %d", fatal ? SIGKILL : SIGTERM, proc->pid);
> > +
> > + if (proc->iev.ibuf.fd != -1) {
> > + event_del(&proc->iev.ev);
> > + msgbuf_clear(&proc->iev.ibuf.w);
> > + close(proc->iev.ibuf.fd);
> > + proc->iev.ibuf.fd = -1;
> > + }
> > +
> > + if (!evtimer_pending(&proc->tmo, NULL))
> > + evtimer_add(&proc->tmo, &tv);
> > +
>
> The above check needs && !fatal, otherwise we will schedule the
> timeout again after it has already fired to trigger kill -9.
ah, yes. it's pointless to schedule the timer here since we'll get
immediately the SIGCHLD.
> > static void
> > +kill_proc_timeout(int fd, short ev, void *d)
> > +{
> > + struct gotd_child_proc *proc = d;
> > +
> > + log_warnx("wait timeout for PID %d terminated", proc->pid);
> > + kill_proc(proc, 1);
>
> I would rephrase the above as something like:
> "timeout waiting for PID %d to terminate; retrying with force"
done, thanks
> > +static struct gotd_child_proc *
> > +find_proc_by_pid(pid_t pid)
> > +{
> > + struct gotd_child_proc *proc;
>
> Must initialize to NULL above, otherwise we return garbage below if no
> PID matches.
here I cheated a bit. procs contains always at least one process (the
listener), but I agree with not being clever and initializing the
variable.
> > +
> > + TAILQ_FOREACH(proc, &procs, procs)
> > + if (proc->pid == pid)
> > + break;
> > +
> > + return proc;
> > +}
>
> > @@ -1508,6 +1560,13 @@ start_listener(char *argv0, const char *confpath, int
> > if (proc == NULL)
> > fatal("calloc");
> >
> > + TAILQ_INSERT_HEAD(&procs, proc, procs);
> > +
> > + /*
> > + * XXX start_listener is called before event_init() so can't
> > + * initialize proc->tmo here.
> > + */
> > +
> > proc->type = PROC_LISTEN;
>
> I would remove the XXX, I suspect that not calling event_init() before
> starting the listener was deliberate. At least I vaguely recall having
> problems with forking and event_init().
the issue is calling daemon() which fork()s under the hood. we would
need to either reinitialize libevent or doing something like the
following:
: log_init(daemonize ? 0 : 1, LOG_DAEMON);
: log_setverbose(verbosity);
:
: + if (proc_id == PROC_GOTD && daemonize && daemon(1, 0) == -1)
: + fatal("daemon");
: +
: + event_init();
: +
: if (proc_id == PROC_GOTD) {
: snprintf(title, sizeof(title), "%s", gotd_proc_names[proc_id]);
: arc4random_buf(&clients_hash_key, sizeof(clients_hash_key));
: - if (daemonize && daemon(1, 0) == -1)
: - fatal("daemon");
: gotd.pid = getpid();
: start_listener(argv0, confpath, daemonize, verbosity);
: } else if (proc_id == PROC_LISTEN) {
: @@ -1932,8 +1935,6 @@ main(int argc, char **argv)
: if (setuid(pw->pw_uid) == -1)
: fatal("setuid %d failed", pw->pw_uid);
:
: - event_init();
: -
: switch (proc_id) {
: case PROC_GOTD:
: #ifndef PROFILE
but this initializes libevent before dropping privileges.
> Perhaps mention that the timeout will be initialized in main() instead?
done too.
diff refs/remotes/got/main refs/heads/main
commit - ba91039c1a0a3d55f4850e26c24730cbf4b5c239
commit + 4affeead3c9700295bc3ad55f06ecfc9e839014f
blob - 39b1d2a54632934de2bfaf127e20f0b0f725f4c8
blob + 68a99a122229289490584888ded7f2beb2d99042
--- gotd/gotd.c
+++ gotd/gotd.c
@@ -80,7 +80,11 @@ struct gotd_child_proc {
char repo_path[PATH_MAX];
int pipe[2];
struct gotd_imsgev iev;
+ struct event tmo;
+
+ TAILQ_ENTRY(gotd_child_proc) entry;
};
+TAILQ_HEAD(gotd_procs, gotd_child_proc) procs;
struct gotd_client {
STAILQ_ENTRY(gotd_client) entry;
@@ -113,6 +117,7 @@ static void kill_proc(struct gotd_child_proc *, int);
static const struct got_error *start_auth_child(struct gotd_client *, int,
struct gotd_repo *, char *, const char *, int, int);
static void kill_proc(struct gotd_child_proc *, int);
+static void disconnect(struct gotd_client *);
__dead static void
usage(void)
@@ -276,77 +281,62 @@ wait_for_child(pid_t child_pid)
}
static void
-wait_for_child(pid_t child_pid)
-{
- pid_t pid;
- int status;
-
- log_debug("waiting for child PID %ld to terminate",
- (long)child_pid);
-
- do {
- pid = waitpid(child_pid, &status, WNOHANG);
- if (pid == -1) {
- if (errno != EINTR && errno != ECHILD)
- fatal("wait");
- } else if (WIFSIGNALED(status)) {
- log_warnx("child PID %ld terminated; signal %d",
- (long)pid, WTERMSIG(status));
- }
- } while (pid != -1 || (pid == -1 && errno == EINTR));
-}
-
-static void
proc_done(struct gotd_child_proc *proc)
{
- event_del(&proc->iev.ev);
- msgbuf_clear(&proc->iev.ibuf.w);
- close(proc->iev.ibuf.fd);
- kill_proc(proc, 0);
- wait_for_child(proc->pid);
+ struct gotd_client *client;
+
+ TAILQ_REMOVE(&procs, proc, entry);
+
+ client = find_client_by_proc_fd(proc->iev.ibuf.fd);
+ if (client != NULL) {
+ if (proc == client->repo)
+ client->repo = NULL;
+ if (proc == client->auth)
+ client->auth = NULL;
+ if (proc == client->session)
+ client->session = NULL;
+ disconnect(client);
+ }
+
+ evtimer_del(&proc->tmo);
+
+ if (proc->iev.ibuf.fd != -1) {
+ event_del(&proc->iev.ev);
+ msgbuf_clear(&proc->iev.ibuf.w);
+ close(proc->iev.ibuf.fd);
+ }
+
free(proc);
}
static void
kill_repo_proc(struct gotd_client *client)
{
- struct gotd_child_proc *proc;
-
if (client->repo == NULL)
return;
- proc = client->repo;
+ kill_proc(client->repo, 0);
client->repo = NULL;
-
- proc_done(proc);
}
static void
kill_auth_proc(struct gotd_client *client)
{
- struct gotd_child_proc *proc;
-
if (client->auth == NULL)
return;
- proc = client->auth;
+ kill_proc(client->auth, 0);
client->auth = NULL;
-
- proc_done(proc);
}
static void
kill_session_proc(struct gotd_client *client)
{
- struct gotd_child_proc *proc;
-
if (client->session == NULL)
return;
- proc = client->session;
+ kill_proc(client->session, 0);
client->session = NULL;
-
- proc_done(proc);
}
static void
@@ -745,6 +735,20 @@ kill_proc(struct gotd_child_proc *proc, int fatal)
static void
kill_proc(struct gotd_child_proc *proc, int fatal)
{
+ struct timeval tv = { 5, 0 };
+
+ log_debug("kill -%d %d", fatal ? SIGKILL : SIGTERM, proc->pid);
+
+ if (proc->iev.ibuf.fd != -1) {
+ event_del(&proc->iev.ev);
+ msgbuf_clear(&proc->iev.ibuf.w);
+ close(proc->iev.ibuf.fd);
+ proc->iev.ibuf.fd = -1;
+ }
+
+ if (!evtimer_pending(&proc->tmo, NULL) && !fatal)
+ evtimer_add(&proc->tmo, &tv);
+
if (fatal) {
log_warnx("sending SIGKILL to PID %d", proc->pid);
kill(proc->pid, SIGKILL);
@@ -753,9 +757,18 @@ gotd_shutdown(void)
}
static void
+kill_proc_timeout(int fd, short ev, void *d)
+{
+ struct gotd_child_proc *proc = d;
+
+ log_warnx("timeout waiting for PID %d to terminate;"
+ " retrying with force", proc->pid);
+ kill_proc(proc, 1);
+}
+
+static void
gotd_shutdown(void)
{
- struct gotd_child_proc *proc;
uint64_t slot;
log_debug("shutting down");
@@ -766,20 +779,31 @@ gotd_shutdown(void)
disconnect(c);
}
- proc = gotd.listen_proc;
- msgbuf_clear(&proc->iev.ibuf.w);
- close(proc->iev.ibuf.fd);
- kill_proc(proc, 0);
- wait_for_child(proc->pid);
- free(proc);
+ kill_proc(gotd.listen_proc, 0);
log_info("terminating");
exit(0);
}
+static struct gotd_child_proc *
+find_proc_by_pid(pid_t pid)
+{
+ struct gotd_child_proc *proc = NULL;
+
+ TAILQ_FOREACH(proc, &procs, entry)
+ if (proc->pid == pid)
+ break;
+
+ return proc;
+}
+
void
gotd_sighdlr(int sig, short event, void *arg)
{
+ struct gotd_child_proc *proc;
+ pid_t pid;
+ int status;
+
/*
* Normal signal handler rules don't apply because libevent
* decouples for us.
@@ -796,6 +820,35 @@ gotd_sighdlr(int sig, short event, void *arg)
case SIGINT:
gotd_shutdown();
break;
+ case SIGCHLD:
+ for (;;) {
+ pid = waitpid(WAIT_ANY, &status, WNOHANG);
+ if (pid == -1) {
+ if (errno == EINTR)
+ continue;
+ if (errno == ECHILD)
+ break;
+ fatal("waitpid");
+ }
+ if (pid == 0)
+ break;
+
+ log_debug("reaped pid %d", pid);
+ proc = find_proc_by_pid(pid);
+ if (proc == NULL) {
+ log_info("caught exit of unknown child %d",
+ pid);
+ continue;
+ }
+
+ if (WIFSIGNALED(status)) {
+ log_warnx("child PID %d terminated with"
+ " signal %d", pid, WTERMSIG(status));
+ }
+
+ proc_done(proc);
+ }
+ break;
default:
fatalx("unexpected signal");
}
@@ -1508,6 +1561,10 @@ start_listener(char *argv0, const char *confpath, int
if (proc == NULL)
fatal("calloc");
+ TAILQ_INSERT_HEAD(&procs, proc, entry);
+
+ /* proc->tmo is initialized in main() after event_init() */
+
proc->type = PROC_LISTEN;
if (socketpair(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK,
@@ -1534,6 +1591,9 @@ start_session_child(struct gotd_client *client, struct
if (proc == NULL)
return got_error_from_errno("calloc");
+ TAILQ_INSERT_HEAD(&procs, proc, entry);
+ evtimer_set(&proc->tmo, kill_proc_timeout, proc);
+
if (client_is_reading(client))
proc->type = PROC_SESSION_READ;
else
@@ -1580,6 +1640,9 @@ start_repo_child(struct gotd_client *client, enum gotd
if (proc == NULL)
return got_error_from_errno("calloc");
+ TAILQ_INSERT_HEAD(&procs, proc, entry);
+ evtimer_set(&proc->tmo, kill_proc_timeout, proc);
+
proc->type = proc_type;
if (strlcpy(proc->repo_name, repo->name,
sizeof(proc->repo_name)) >= sizeof(proc->repo_name))
@@ -1632,6 +1695,9 @@ start_auth_child(struct gotd_client *client, int requi
return err;
}
+ TAILQ_INSERT_HEAD(&procs, proc, entry);
+ evtimer_set(&proc->tmo, kill_proc_timeout, proc);
+
proc->type = PROC_AUTH;
if (strlcpy(proc->repo_name, repo->name,
sizeof(proc->repo_name)) >= sizeof(proc->repo_name))
@@ -1732,10 +1798,12 @@ main(int argc, char **argv)
struct passwd *pw = NULL;
char *repo_path = NULL;
enum gotd_procid proc_id = PROC_GOTD;
- struct event evsigint, evsigterm, evsighup, evsigusr1;
+ struct event evsigint, evsigterm, evsighup, evsigusr1, evsigchld;
int *pack_fds = NULL, *temp_fds = NULL;
struct gotd_repo *repo = NULL;
+ TAILQ_INIT(&procs);
+
log_init(1, LOG_DAEMON); /* Log to stderr until daemonized. */
while ((ch = getopt(argc, argv, "Adf:LnP:RsSvW")) != -1) {
@@ -1955,18 +2023,23 @@ main(int argc, char **argv)
if (proc_id != PROC_GOTD)
fatal("invalid process id %d", proc_id);
+ evtimer_set(&gotd.listen_proc->tmo, kill_proc_timeout,
+ gotd.listen_proc);
+
apply_unveil_selfexec();
signal_set(&evsigint, SIGINT, gotd_sighdlr, NULL);
signal_set(&evsigterm, SIGTERM, gotd_sighdlr, NULL);
signal_set(&evsighup, SIGHUP, gotd_sighdlr, NULL);
signal_set(&evsigusr1, SIGUSR1, gotd_sighdlr, NULL);
+ signal_set(&evsigchld, SIGCHLD, gotd_sighdlr, NULL);
signal(SIGPIPE, SIG_IGN);
signal_add(&evsigint, NULL);
signal_add(&evsigterm, NULL);
signal_add(&evsighup, NULL);
signal_add(&evsigusr1, NULL);
+ signal_add(&evsigchld, NULL);
gotd_imsg_event_add(&gotd.listen_proc->iev);
gotd: wait asynchronously for child termination