Bug 1793525 - Get crash reports for child processes shutdown hangs on debug/sanitizer/ccov builds. r=nika,yjuglaret

On the build types where child processes need to go through full
shutdown to write some kind of logs or data (debug / refcount logging,
sanitizers, code coverage), the parent process currently waits forever
for them to exit and blocks its own shutdown to do so.  If we were to
time out and kill the child processes, test runs would fail due to
missing the log files they were expected to write; instead, they fail
and blame the parent process for not exiting, which is not especially
helpful.

What we'd like is to know the state of the child process that's failing
to exit.  This patch attempts to get a crash report in that situation; on
Unix we send SIGABRT, and on Windows we inject a thread that calls
DbgBreakPoint, both of which result in invoking our crash reporter.

Differential Revision: https://phabricator.services.mozilla.com/D161098
This commit is contained in:
Jed Davis 2022-12-13 22:57:49 +00:00
Родитель dcf250eeb3
Коммит 46dbfb9167
2 изменённых файлов: 189 добавлений и 13 удалений

Просмотреть файл

@ -9,16 +9,23 @@
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <time.h>
#include "base/eintr_wrapper.h"
#include "base/message_loop.h"
#include "base/process_util.h"
#include "prenv.h"
#include "chrome/common/process_watcher.h"
// Maximum amount of time (in milliseconds) to wait for the process to exit.
// XXX/cjones: fairly arbitrary, chosen to match process_watcher_win.cc
static const int kMaxWaitMs = 2000;
static constexpr int kMaxWaitMs = 2000;
// This is also somewhat arbitrary, but loosely based on Try results.
// See also toolkit.asyncshutdown.crash_timeout (currently 60s) after
// which the parent process will be killed.
static constexpr int kShutdownWaitMs = 8000;
namespace {
@ -129,13 +136,18 @@ class ChildLaxReaper : public ChildReaper,
virtual void WillDestroyCurrentMessageLoop() override {
DCHECK(process_);
if (!process_) {
return;
}
#ifdef NS_FREE_PERMANENT_DATA
printf_stderr("Waiting in WillDestroyCurrentMessageLoop for pid %d\n",
process_);
#endif
// Exception for the fake hang tests in ipc/glue/test/browser
if (!PR_GetEnv("MOZ_TEST_CHILD_EXIT_HANG")) {
CrashProcessIfHanging();
}
if (process_) {
WaitForChildExit();
process_ = 0;
}
// XXX don't think this is necessary, since destruction can only
// be observed once, but can't hurt
@ -146,6 +158,50 @@ class ChildLaxReaper : public ChildReaper,
private:
ChildLaxReaper(const ChildLaxReaper&) = delete;
void CrashProcessIfHanging() {
if (base::IsProcessDead(process_)) {
process_ = 0;
return;
}
// If child processes seems to be hanging on shutdown, wait for a
// reasonable time. The wait is global instead of per-process
// because the child processes should be shutting down in
// parallel, and also we're potentially racing global timeouts
// like nsTerminator. (The counter doesn't need to be atomic;
// this is always called on the I/O thread.)
static int sWaitMs = kShutdownWaitMs;
if (sWaitMs > 0) {
CHROMIUM_LOG(WARNING)
<< "Process " << process_
<< " may be hanging at shutdown; will wait for up to " << sWaitMs
<< "ms";
}
// There isn't a way to do a time-limited wait that's both
// portable and doesn't require messing with signals. Instead, we
// sleep in short increments and poll the process status.
while (sWaitMs > 0) {
static constexpr int kWaitTickMs = 200;
struct timespec ts = {kWaitTickMs / 1000, (kWaitTickMs % 1000) * 1000000};
HANDLE_EINTR(nanosleep(&ts, &ts));
sWaitMs -= kWaitTickMs;
if (base::IsProcessDead(process_)) {
process_ = 0;
return;
}
}
// We want TreeHerder to flag this log line as an error, so that
// this is more obviously a deliberate crash; "fatal error" is one
// of the strings it looks for.
CHROMIUM_LOG(ERROR)
<< "Process " << process_
<< " hanging at shutdown; attempting crash report (fatal error).";
kill(process_, SIGABRT);
}
const ChildLaxReaper& operator=(const ChildLaxReaper&) = delete;
};

Просмотреть файл

@ -6,15 +6,27 @@
#include "chrome/common/process_watcher.h"
#include <algorithm>
#include <processthreadsapi.h>
#include <synchapi.h>
#include "base/message_loop.h"
#include "base/object_watcher.h"
#include "prenv.h"
// Maximum amount of time (in milliseconds) to wait for the process to exit.
static const int kWaitInterval = 2000;
static constexpr int kWaitInterval = 2000;
// This is somewhat arbitrary, but based on Try run results. When
// changing this, be aware of toolkit.asyncshutdown.crash_timeout
// (currently 60s), after which the parent process will be killed.
static constexpr DWORD kShutdownWaitMs = 8000;
namespace {
static bool IsProcessDead(base::ProcessHandle process) {
return WaitForSingleObject(process, 0) == WAIT_OBJECT_0;
}
class ChildReaper : public mozilla::Runnable,
public base::ObjectWatcher::Delegate,
public MessageLoop::DestructionObserver {
@ -36,10 +48,10 @@ class ChildReaper : public mozilla::Runnable,
virtual void WillDestroyCurrentMessageLoop() {
MOZ_ASSERT(!force_);
if (process_) {
#ifdef NS_FREE_PERMANENT_DATA
printf_stderr("Waiting in WillDestroyCurrentMessageLoop for pid %lu\n",
GetProcessId(process_));
#endif
// Exception for the fake hang tests in ipc/glue/test/browser
if (!PR_GetEnv("MOZ_TEST_CHILD_EXIT_HANG")) {
CrashProcessIfHanging();
}
WaitForSingleObject(process_, INFINITE);
base::CloseProcessHandle(process_);
process_ = 0;
@ -89,6 +101,114 @@ class ChildReaper : public mozilla::Runnable,
OnObjectSignaled(process_);
}
void CrashProcessIfHanging() {
if (IsProcessDead(process_)) {
return;
}
DWORD pid = GetProcessId(process_);
DCHECK(pid != 0);
// If child processes seems to be hanging on shutdown, wait for a
// reasonable time. The wait is global instead of per-process
// because the child processes should be shutting down in
// parallel, and also we're potentially racing global timeouts
// like nsTerminator. (The counter doesn't need to be atomic;
// this is always called on the I/O thread.)
static DWORD sWaitMs = kShutdownWaitMs;
if (sWaitMs > 0) {
CHROMIUM_LOG(WARNING)
<< "Process " << pid
<< " may be hanging at shutdown; will wait for up to " << sWaitMs
<< "ms";
}
const auto beforeWait = mozilla::TimeStamp::NowLoRes();
const DWORD waitStatus = WaitForSingleObject(process_, sWaitMs);
const double elapsed =
(mozilla::TimeStamp::NowLoRes() - beforeWait).ToMilliseconds();
sWaitMs -= static_cast<DWORD>(
std::clamp(elapsed, 0.0, static_cast<double>(sWaitMs)));
switch (waitStatus) {
case WAIT_TIMEOUT:
// The process is still running.
break;
case WAIT_OBJECT_0:
// The process exited.
return;
case WAIT_FAILED:
CHROMIUM_LOG(ERROR) << "Waiting for process " << pid
<< " failed; error " << GetLastError();
DCHECK(false) << "WaitForSingleObject failed";
// Process status unclear; assume it's gone.
return;
default:
DCHECK(false) << "WaitForSingleObject returned " << waitStatus;
// Again, not clear what's happening so avoid touching the process
return;
}
// We want TreeHerder to flag this log line as an error, so that
// this is more obviously a deliberate crash; "fatal error" is one
// of the strings it looks for.
CHROMIUM_LOG(ERROR)
<< "Process " << pid
<< " hanging at shutdown; attempting crash report (fatal error)";
// We're going to use CreateRemoteThread to call DbgBreakPoint in
// the target process; it's in a "known DLL" so it should be at
// the same address in all processes. (Normal libraries, like
// xul.dll, are usually at the same address but can be relocated
// in case of conflict.)
//
// DbgBreakPoint doesn't take an argument, so we can give it an
// arbitrary value to make it easier to identify these crash
// reports. (reinterpret_cast isn't constexpr, so this is
// declared as an integer and cast to the required type later.)
// The primary use case for all of this is in CI, where we'll also
// have log messages, but if these crashes end up in Socorro in
// significant numbers then we'll be able to look for this value.
static constexpr uint64_t kIpcMagic = 0x43504900435049;
const HMODULE ntdll = GetModuleHandleW(L"ntdll.dll");
if (!ntdll) {
CHROMIUM_LOG(ERROR) << "couldn't find ntdll.dll: error "
<< GetLastError();
return;
}
const auto dbgBreak = reinterpret_cast<LPTHREAD_START_ROUTINE>(
GetProcAddress(ntdll, "DbgBreakPoint"));
if (!dbgBreak) {
CHROMIUM_LOG(ERROR) << "couldn't find DbgBreakPoint: error "
<< GetLastError();
return;
}
const DWORD rights = PROCESS_CREATE_THREAD | PROCESS_QUERY_INFORMATION |
PROCESS_VM_OPERATION | PROCESS_VM_WRITE |
PROCESS_VM_READ;
HANDLE process_priv = nullptr;
if (!DuplicateHandle(GetCurrentProcess(), process_, GetCurrentProcess(),
&process_priv, rights, /* inherit */ FALSE,
/* options */ 0)) {
const auto error = GetLastError();
CHROMIUM_LOG(ERROR) << "OpenProcess: error " << error;
} else {
DCHECK(process_priv);
HANDLE thread =
CreateRemoteThread(process_priv, /* sec attr */ nullptr,
/* stack */ 0, dbgBreak, (LPVOID)kIpcMagic,
/* flags */ 0, nullptr);
if (!thread) {
const auto error = GetLastError();
CHROMIUM_LOG(ERROR) << "CreateRemoteThread: error " << error;
} else {
CloseHandle(thread);
}
CloseHandle(process_priv);
}
}
// The process that we are watching.
base::ProcessHandle process_;
@ -107,7 +227,7 @@ void ProcessWatcher::EnsureProcessTerminated(base::ProcessHandle process,
DCHECK(process != GetCurrentProcess());
// If already signaled, then we are done!
if (WaitForSingleObject(process, 0) == WAIT_OBJECT_0) {
if (IsProcessDead(process)) {
base::CloseProcessHandle(process);
return;
}