Bug 1793523 - Replace IPC's use of waitpid with waitid to avoid racing with the crash reporter. r=nika

When IPC is waiting for a process to exit at the same time the crash
reporter is trying to attach to the main thread with `ptrace`, both of
them will use `waitpid` for their respective state changes, and these
calls can race: the ptrace-stop is delivered to IPC, while the crash
reporter and crashed child process hang forever.

This patch changes IPC to use `waitid` with `WNOWAIT`, which allows us to
check the process status without side-effecting it; if it is an exit, the
call is reissued without that flag to obtain the status as before.

(This may not be a problem on macOS, where we use Mach APIs instead of
`ptrace`, but `waitid` has been in POSIX long enough that we should be
safe using it on all Unix platforms.)

This patch also adds some logging when child processes exit with a
failure status or due to a signal.

Differential Revision: https://phabricator.services.mozilla.com/D159186
This commit is contained in:
Jed Davis 2022-10-18 23:36:18 +00:00
Родитель fb2ce68a73
Коммит 9964ac8e24
3 изменённых файлов: 69 добавлений и 9 удалений

Просмотреть файл

@ -274,7 +274,13 @@ bool KillProcess(ProcessHandle process, int exit_code);
// In various error cases (e.g., the process doesn't exist or isn't a
// child of this process) it will also return true to indicate that
// the caller should give up and not try again.
bool IsProcessDead(ProcessHandle handle);
//
// If the `blocking` parameter is set to true, this function will try
// to block the calling thread indefinitely until the process exits.
// This may not be possible (if the child is also being debugged by
// the parent process, e.g. due to the crash reporter), in which case
// it will return false and the caller will need to wait and retry.
bool IsProcessDead(ProcessHandle handle, bool blocking = false);
#endif
} // namespace base

Просмотреть файл

@ -187,7 +187,7 @@ void CloseSuperfluousFds(void* aCtx, bool (*aShouldPreserve)(void*, int)) {
}
}
bool IsProcessDead(ProcessHandle handle) {
bool IsProcessDead(ProcessHandle handle, bool blocking) {
#ifdef MOZ_ENABLE_FORKSERVER
if (mozilla::ipc::ForkServiceChild::Get()) {
// We only know if a process exists, but not if it has crashed.
@ -202,8 +202,15 @@ bool IsProcessDead(ProcessHandle handle) {
return r < 0 && errno == ESRCH;
}
#endif
int status;
const int result = HANDLE_EINTR(waitpid(handle, &status, WNOHANG));
// We use `WNOWAIT` to read the process status without
// side-effecting it, in case it's something unexpected like a
// ptrace-stop for the crash reporter. If is an exit, the call is
// reissued (see the end of this function) without that flag in
// order to collect the process.
siginfo_t si{};
const int wflags = WEXITED | WNOWAIT | (blocking ? 0 : WNOHANG);
int result = HANDLE_EINTR(waitid(P_PID, handle, &si, wflags));
if (result == -1) {
// This shouldn't happen, but sometimes it does. The error is
// probably ECHILD and the reason is probably that a pid was
@ -215,14 +222,54 @@ bool IsProcessDead(ProcessHandle handle) {
// So, lacking reliable information, we indicate that the process
// is dead, in the hope that the caller will give up and stop
// calling us. See also bug 943174 and bug 933680.
CHROMIUM_LOG(ERROR) << "waitpid failed pid:" << handle
<< " errno:" << errno;
CHROMIUM_LOG(ERROR) << "waitid failed pid:" << handle << " errno:" << errno;
return true;
} else if (result == 0) {
}
if (si.si_pid == 0) {
// the child hasn't exited yet.
return false;
}
DCHECK(si.si_pid == handle);
switch (si.si_code) {
case CLD_STOPPED:
case CLD_CONTINUED:
DCHECK(false) << "waitid returned an event type that it shouldn't have";
[[fallthrough]];
case CLD_TRAPPED:
CHROMIUM_LOG(WARNING) << "ignoring non-exit event for process " << handle;
return false;
case CLD_KILLED:
case CLD_DUMPED:
CHROMIUM_LOG(WARNING)
<< "process " << handle << " exited on signal " << si.si_status;
break;
case CLD_EXITED:
if (si.si_status != 0) {
CHROMIUM_LOG(WARNING)
<< "process " << handle << " exited with status " << si.si_status;
}
break;
default:
CHROMIUM_LOG(ERROR) << "unexpected waitid si_code value: " << si.si_code;
DCHECK(false);
// This shouldn't happen, but assume that the process exited to
// avoid the caller possibly ending up in a loop.
}
// Now consume the status / collect the dead process
const int old_si_code = si.si_code;
si.si_pid = 0;
// In theory it shouldn't matter either way if we use `WNOHANG` at
// this point, but just in case, avoid unexpected blocking.
result = HANDLE_EINTR(waitid(P_PID, handle, &si, WEXITED | WNOHANG));
DCHECK(result == 0);
DCHECK(si.si_pid == handle);
DCHECK(si.si_code == old_si_code);
return true;
}

Просмотреть файл

@ -8,6 +8,7 @@
#include <signal.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include "base/eintr_wrapper.h"
#include "base/message_loop.h"
@ -46,8 +47,14 @@ class ChildReaper : public base::MessagePumpLibevent::SignalEvent,
protected:
void WaitForChildExit() {
DCHECK(process_);
HANDLE_EINTR(waitpid(process_, NULL, 0));
CHECK(process_);
while (!base::IsProcessDead(process_, true)) {
// It doesn't matter if this is interrupted; we just need to
// wait for some amount of time while the other process status
// event is (hopefully) handled. This is used only during an
// error case at shutdown, so a 1s wait won't be too noticeable.
sleep(1);
}
}
pid_t process_;