Bug 1881386 - Check for zombie processes in ProcessWatcher on Linux. r=glandium

This patch checks whether child processes are in the zombie state
(`exit`ed but not `wait`ed), by parsing `/proc/{pid}/stat`, and treats
them as dead in that case.

Child processes can end up stuck in zombie state if the fork server
exits when child processes are still running, which causes them to be
reparented to pid 1, and pid 1 isn't acting as `init`, which can happen
in container environments like Docker depending on configuration.  In
particular, this is currently the case in the containers used by Mozilla
CI to run tests.  Without this patch, if the fork server is enabled, we
wait forever for the process to exit and then (in the Mozilla CI case)
some other timeout fires and causes the test run to fail.

Differential Revision: https://phabricator.services.mozilla.com/D204096
This commit is contained in:
Jed Davis 2024-03-19 21:05:40 +00:00
Родитель 435da5acce
Коммит 1dcd15d26b
1 изменённых файлов: 74 добавлений и 4 удалений

Просмотреть файл

@ -10,6 +10,7 @@
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/resource.h>
#include <sys/time.h>
#include <sys/types.h>
@ -37,6 +38,7 @@
#ifdef MOZ_ENABLE_FORKSERVER
# include "mozilla/ipc/ForkServiceChild.h"
# include "mozilla/Printf.h"
#endif
// We could configure-test for `waitid`, but it's been in POSIX for a
@ -195,6 +197,63 @@ void CloseSuperfluousFds(void* aCtx, bool (*aShouldPreserve)(void*, int)) {
}
}
#ifdef MOZ_ENABLE_FORKSERVER
// Returns whether a process (assumed to still exist) is in the zombie
// state (has exited but has not yet been waited for).
//
// On Linux this reads the start of /proc/{pid}/stat and checks whether
// the state field is 'Z'.  Any failure (the file can't be opened or
// read, e.g. because the process doesn't exist or /proc isn't mounted,
// or its contents can't be parsed) also returns true, so that we don't
// try again.  On other platforms this always returns false.
static bool IsZombieProcess(pid_t pid) {
#  ifdef XP_LINUX
  auto path = mozilla::Smprintf("/proc/%d/stat", pid);
  int fd = open(path.get(), O_RDONLY | O_CLOEXEC);
  if (fd < 0) {
    int e = errno;
    CHROMIUM_LOG(ERROR) << "failed to open " << path.get() << ": "
                        << strerror(e);
    return true;
  }

  // /proc/%d/stat format is approximately:
  //
  // %d (%s) %c %d %d %d %d %d ...
  //
  // The state is the third field; the second field is the thread
  // name, in parentheses, but it can contain arbitrary characters.
  // So, we read the whole line, check for the last ')' because all of
  // the following fields are numeric, and move forward from there.
  //
  // And because (unlike other uses of this info in the codebase) we
  // don't care about those other fields, we can read a smaller amount
  // of the file.
  char buffer[64];
  ssize_t len = HANDLE_EINTR(read(fd, buffer, sizeof(buffer) - 1));
  // Save errno before close() has a chance to overwrite it.
  int e = errno;
  close(fd);
  if (len < 1) {
    // Log the path, not `buffer`: nothing was read, so `buffer` is
    // uninitialized and not NUL-terminated.  A zero-length read is
    // end-of-file rather than an error, so `errno` isn't meaningful in
    // that case.
    CHROMIUM_LOG(ERROR) << "failed to read " << path.get() << ": "
                        << (len < 0 ? strerror(e) : "unexpected end of file");
    return true;
  }
  // One byte was held back in the read() above for this terminator.
  buffer[len] = '\0';

  // Expect ") " followed by the state character after the last ')'.
  char* rparen = strrchr(buffer, ')');
  if (!rparen || rparen[1] != ' ' || rparen[2] == '\0') {
    DCHECK(false) << "/proc/{pid}/stat parse error";
    CHROMIUM_LOG(ERROR) << "bad data in /proc/" << pid << "/stat";
    return true;
  }
  if (rparen[2] == 'Z') {
    CHROMIUM_LOG(ERROR) << "process " << pid << " is a zombie";
    return true;
  }
  return false;
#  else  // not XP_LINUX
  // The situation where this matters is Linux-specific (pid
  // namespaces), so we don't need to bother on other Unixes.
  return false;
#  endif
}
#endif // MOZ_ENABLE_FORKSERVER
bool IsProcessDead(ProcessHandle handle, bool blocking) {
auto handleForkServer = [handle]() -> mozilla::Maybe<bool> {
#ifdef MOZ_ENABLE_FORKSERVER
@ -205,10 +264,21 @@ bool IsProcessDead(ProcessHandle handle, bool blocking) {
// process any more, it is impossible to use |waitpid()| to wait for
// them.
const int r = kill(handle, 0);
// FIXME: for unexpected errors we should probably log a warning
// and return true, so that the caller doesn't loop / hang /
// try to kill the process. (Bug 1658072 will rewrite this code.)
return mozilla::Some(r < 0 && errno == ESRCH);
if (r < 0) {
const int e = errno;
if (e != ESRCH) {
CHROMIUM_LOG(WARNING) << "unexpected error checking for process "
<< handle << ": " << strerror(e);
// Return true for unknown errors, to avoid the possibility
// of getting stuck in a loop of failures.
}
return mozilla::Some(true);
}
// Annoying edge case (bug 1881386): if pid 1 isn't a real
// `init`, like in some container environments, and if the child
// exited after the fork server, it could become a permanent
// zombie. We treat it as dead in that case.
return mozilla::Some(IsZombieProcess(handle));
}
#else
mozilla::Unused << handle;