Bug 1865930: Add support for stopping the profiler using UNIX signals r=canaltinova,profiler-reviewers

Differential Revision: https://phabricator.services.mozilla.com/D203146
This commit is contained in:
Adam Brouwers-Harries 2024-04-02 18:35:50 +00:00
Родитель 9de2d01967
Коммит 8df870d509
5 изменённых файлов: 408 добавлений и 14 удалений

Просмотреть файл

@ -42,7 +42,12 @@
#include "ProfilerIOInterposeObserver.h"
#include "ProfilerParent.h"
#include "ProfilerRustBindings.h"
#include "mozilla/Assertions.h"
#include "mozilla/Maybe.h"
#include "mozilla/MozPromise.h"
#include "nsCOMPtr.h"
#include "nsDebug.h"
#include "nsXPCOM.h"
#include "shared-libraries.h"
#include "VTuneProfiler.h"
#include "ETWTools.h"
@ -95,6 +100,7 @@
#include "nsSystemInfo.h"
#include "nsThreadUtils.h"
#include "nsXULAppAPI.h"
#include "nsDirectoryServiceUtils.h"
#include "Tracing.h"
#include "prdtoa.h"
#include "prtime.h"
@ -108,6 +114,18 @@
#include <string_view>
#include <type_traits>
// The signals that we use to control the profiler conflict with the signals
// used to control the code coverage tool. Therefore, if coverage is enabled, we
// need to disable our own signal handling mechanisms.
#ifndef MOZ_CODE_COVERAGE
# ifdef XP_WIN
// TODO: Add support for windows "signal"-like behaviour. See Bug 1867328.
# else
# include <signal.h>
# include <unistd.h>
# endif
#endif
#if defined(GP_OS_android)
# include "JavaExceptions.h"
# include "mozilla/java/GeckoJavaSamplerNatives.h"
@ -234,6 +252,10 @@ ProfileChunkedBuffer& profiler_get_core_buffer() {
mozilla::Atomic<int, mozilla::MemoryOrdering::Relaxed> gSkipSampling;
// Atomic flag to stop the profiler from within the sampling loop
mozilla::Atomic<bool, mozilla::MemoryOrdering::Relaxed> gStopAndDumpFromSignal(
false);
#if defined(GP_OS_android)
class GeckoJavaSampler
: public java::GeckoJavaSampler::Natives<GeckoJavaSampler> {
@ -647,6 +669,7 @@ class CorePS {
PS_GET_AND_SET(const nsACString&, ProcessName)
PS_GET_AND_SET(const nsACString&, ETLDplus1)
PS_GET_AND_SET(const Maybe<nsCOMPtr<nsIFile>>&, DownloadDirectory)
static void SetBandwidthCounter(ProfilerBandwidthCounter* aBandwidthCounter) {
MOZ_ASSERT(sInstance);
@ -695,6 +718,9 @@ class CorePS {
// lock, so it is safe to have only one instance allocated for all of the
// threads.
JsFrameBuffer mJsFrames;
// Cached download directory for when we need to dump profiles to disk.
Maybe<nsCOMPtr<nsIFile>> mDownloadDirectory;
};
CorePS* CorePS::sInstance = nullptr;
@ -2933,16 +2959,6 @@ static void StreamMetaJSCustomObject(
ActivePS::WriteActiveConfiguration(aLock, aWriter,
MakeStringSpan("configuration"));
if (!NS_IsMainThread()) {
// Leave the rest of the properties out if we're not on the main thread.
// At the moment, the only case in which this function is called on a
// background thread is if we're in a content process and are going to
// send this profile to the parent process. In that case, the parent
// process profile's "meta" object already has the rest of the properties,
// and the parent process profile is dumped on that process's main thread.
return;
}
aWriter.DoubleProperty("interval", ActivePS::Interval(aLock));
aWriter.IntProperty("stackwalk", ActivePS::FeatureStackWalk(aLock));
@ -3019,6 +3035,16 @@ static void StreamMetaJSCustomObject(
}
aWriter.EndObject();
if (!NS_IsMainThread()) {
// Leave the rest of the properties out if we're not on the main thread.
// At the moment, the only case in which this function is called on a
// background thread is if we're in a content process and are going to
// send this profile to the parent process. In that case, the parent
// process profile's "meta" object already has the rest of the properties,
// and the parent process profile is dumped on that process's main thread.
return;
}
// We should avoid collecting extension metadata for profiler when there is no
// observer service, since a ExtensionPolicyService could not be created then.
if (nsCOMPtr<nsIObserverService> os = services::GetObserverService()) {
@ -4077,6 +4103,10 @@ static SamplerThread* NewSamplerThread(PSLockRef aLock, uint32_t aGeneration,
return new SamplerThread(aLock, aGeneration, aInterval, aFeatures);
}
// Forward declare the function to call when we need to dump + stop from within
// the sampler thread
void profiler_dump_and_stop();
// This function is the sampler thread. This implementation is used for all
// targets.
void SamplerThread::Run() {
@ -4732,6 +4762,27 @@ void SamplerThread::Run() {
scheduledSampleStart = beforeSleep + sampleInterval;
SleepMicro(static_cast<uint32_t>(sampleInterval.ToMicroseconds()));
}
// Check to see if the hard-reset flag has been set to stop the profiler.
// This should only be used on the worst occasions when we need to stop the
// profiler from within the sampling thread (e.g. if the main thread is
// stuck) We need to do this here as it is outside of the scope of the lock.
// Otherwise we'll encounter a race condition where `profiler_stop` tries to
// get the lock that we already hold. We also need to wait until after we
// have carried out post sampling callbacks, as otherwise we may reach a
// situation where another part of the program is waiting for us to finish
// sampling, but we have ended early!
if (gStopAndDumpFromSignal) {
// Reset the flag in case we restart the profiler at a later point
gStopAndDumpFromSignal = false;
// dump the profile, and stop the profiler
profiler_dump_and_stop();
// profiler_stop will try to destroy the active sampling thread. This will
// also destroy some data structures that are used further down this
// function, leading to invalid accesses. We therefore exit the function
// directly, rather than breaking from the loop.
return;
}
}
// End of `while` loop. We can only be here from a `break` inside the loop.
@ -5240,6 +5291,102 @@ static const char* get_size_suffix(const char* str) {
return ptr;
}
#if !defined(XP_WIN) && !defined(MOZ_CODE_COVERAGE)
static void profiler_stop_signal_handler(int signal, siginfo_t* info,
void* context) {
// We cannot really do any logging here, as this is a signal handler.
// Signal handlers are limited in what functions they can call, for more
// details see: https://man7.org/linux/man-pages/man7/signal-safety.7.html
// Writing to a file is allowed, but as signal handlers are also limited in
// how long they can run, we instead set an atomic variable to true to trigger
// the sampling thread to stop and dump the data in the profiler.
gStopAndDumpFromSignal = true;
}
#endif
// This may fail if we have previously had an issue finding the download
// directory, or if the directory has moved since we cached the path.
// This is non-ideal, but captured by Bug 1885000
Maybe<nsAutoCString> profiler_find_dump_path() {
Maybe<nsCOMPtr<nsIFile>> directory = Nothing();
nsAutoCString path;
{
// Acquire the lock so that we can get things from CorePS
PSAutoLock lock;
Maybe<nsCOMPtr<nsIFile>> downloadDir = Nothing();
downloadDir = CorePS::DownloadDirectory(lock);
// This needs to be done within the context of the lock, as otherwise
// another thread might modify CorePS::mDownloadDirectory while we're
// cloning the pointer.
if (downloadDir) {
nsCOMPtr<nsIFile> d;
downloadDir.value()->Clone(getter_AddRefs(d));
directory = Some(d);
} else {
return Nothing();
}
}
// Now, we can check to see if we have a directory, and use it to construct
// the output file
if (directory) {
// Set up the name of our profile file
path.AppendPrintf("profile_%i_%i.json", XRE_GetProcessType(), getpid());
// Append it to the directory we found
nsresult rv = directory.value()->AppendNative(path);
if (NS_FAILED(rv)) {
LOG("Failed to append path to profile file");
return Nothing();
}
// Write the result *back* to the original path
#if defined(XP_WIN)
rv = directory.value()->GetNativeTarget(path);
#else
rv = directory.value()->GetNativePath(path);
#endif
if (NS_FAILED(rv)) {
LOG("Failed to get native path for temp path");
return Nothing();
}
return Some(path);
}
return Nothing();
}
void profiler_dump_and_stop() {
// pause the profiler until we are done dumping
profiler_pause();
// Try to save the profile to a file
if (auto path = profiler_find_dump_path()) {
profiler_save_profile_to_file(path.value().get());
} else {
LOG("Failed to dump profile to disk");
}
// Stop the profiler
profiler_stop();
}
void profiler_init_signal_handlers() {
#if !defined(XP_WIN) && !defined(MOZ_CODE_COVERAGE)
// Set a handler to stop the profiler
struct sigaction prof_stop_sa {};
memset(&prof_stop_sa, 0, sizeof(struct sigaction));
prof_stop_sa.sa_sigaction = profiler_stop_signal_handler;
prof_stop_sa.sa_flags = SA_RESTART | SA_SIGINFO;
sigemptyset(&prof_stop_sa.sa_mask);
DebugOnly<int> rstop = sigaction(SIGUSR2, &prof_stop_sa, nullptr);
MOZ_ASSERT(rstop == 0, "Failed to install Profiler SIGUSR2 handler");
#endif
}
void profiler_init(void* aStackTop) {
LOG("profiler_init");
@ -5289,10 +5436,12 @@ void profiler_init(void* aStackTop) {
locked_register_thread(lock, offThreadRef);
}
}
// Platform-specific initialization.
PlatformInit(lock);
// Initialise the signal handlers needed to start/stop the profiler
profiler_init_signal_handlers();
#if defined(GP_OS_android)
if (jni::IsAvailable()) {
GeckoJavaSampler::Init();
@ -6309,6 +6458,32 @@ bool profiler_is_paused() {
return ActivePS::AppendPostSamplingCallback(lock, std::move(aCallback));
}
// See `ProfilerControl.h` for more details.
void profiler_lookup_download_directory() {
LOG("profiler_lookup_download_directory");
MOZ_ASSERT(
NS_IsMainThread(),
"We can only get access to the directory service from the main thread");
// Make sure the profiler is actually running~
MOZ_RELEASE_ASSERT(CorePS::Exists());
// take the lock so that we can write to CorePS
PSAutoLock lock;
nsCOMPtr<nsIFile> tDownloadDir;
nsresult rv = NS_GetSpecialDirectory(NS_OS_DEFAULT_DOWNLOAD_DIR,
getter_AddRefs(tDownloadDir));
if (NS_FAILED(rv)) {
LOG("Failed to find download directory. Profiler signal handling will not "
"be able to save to disk. Error: %s",
GetStaticErrorName(rv));
} else {
CorePS::SetDownloadDirectory(lock, Some(tDownloadDir));
}
}
RefPtr<GenericPromise> profiler_pause() {
LOG("profiler_pause");

Просмотреть файл

@ -123,6 +123,18 @@ void profiler_ensure_started(
const char** aFilters, uint32_t aFilterCount, uint64_t aActiveTabID,
const mozilla::Maybe<double>& aDuration = mozilla::Nothing());
// Tell the profiler to look up the download directory for writing profiles.
// With some features, such as signal control, we need to know the location of
// a directory where we can save profiles to disk. Because we start the
// profiler before we start the directory service, we can't access the
// download directory at profiler startup. Similarly, when we need to get the
// directory, we often can't, as we're running in non-main-thread contexts
// that don't have access to the directory service. This function gives us a
// third option, by giving us a hook to look for the download directory when
// the time is right. This might be triggered internally (e.g. when we start
// profiling), or externally, e.g. after the directory service is initialised.
void profiler_lookup_download_directory();
//---------------------------------------------------------------------------
// Control the profiler
//---------------------------------------------------------------------------

Просмотреть файл

@ -0,0 +1,194 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
ChromeUtils.defineESModuleGetters(this, {
Downloads: "resource://gre/modules/Downloads.sys.mjs",
FileUtils: "resource://gre/modules/FileUtils.sys.mjs",
BrowserTestUtils: "resource://testing-common/BrowserTestUtils.sys.mjs",
});
const { ctypes } = ChromeUtils.importESModule(
"resource://gre/modules/ctypes.sys.mjs"
);
// Derived from functionality in js/src/devtools/rootAnalysis/utility.js
function openLibrary(names) {
for (const name of names) {
try {
return ctypes.open(name);
} catch (e) {}
}
return undefined;
}
// Derived heavily from equivalent sandbox testing code.
// For more details see:
// https://searchfox.org/mozilla-central/rev/1aaacaeb4fa3aca6837ecc157e43e947229ba8ce/security/sandbox/test/browser_content_sandbox_utils.js#89
function raiseSignal(pid, sig) {
try {
const libc = openLibrary([
"libc.so.6",
"libc.so",
"libc.dylib",
"libSystem.B.dylib",
]);
if (!libc) {
info("Failed to open any libc shared object");
return { ok: false };
}
// c.f. https://man7.org/linux/man-pages/man2/kill.2.html
// This choice of typing for `pid` is complex, and brittle, as it's
// platform dependent. Getting it wrong can result in incoreect
// generation/calling of the `kill` function. Unfortunately, as it's
// defined as `pid_t` in a header, we can't easily get access to it.
// For now, we just use an integer, and hope that the system int size
// aligns with the `pid_t` size.
const kill = libc.declare(
"kill",
ctypes.default_abi,
ctypes.int, // return value
ctypes.int32_t, // pid
ctypes.int // sig
);
let kres = kill(pid, sig);
if (kres != 0) {
info(`Kill returned a non-zero result ${kres}.`);
return { ok: false };
}
libc.close();
} catch (e) {
info(`Exception ${e} thrown while trying to call kill`);
return { ok: false };
}
return { ok: true };
}
// We would like to use the following to wait for a stop signal to actually be
// handled:
// await Services.profiler.waitOnePeriodicSampling();
// However, as we are trying to shut down the profiler using the sampler
// thread, this can cause complications between the callback waiting for the
// sampling to be over, and the sampler thread actually finishing.
// Instead, we use the BrowserTestUtils.waitForCondition to wait until the
// profiler is no longer active.
async function waitUntilProfilerStopped(interval = 1000, maxTries = 100) {
await BrowserTestUtils.waitForCondition(
() => !Services.profiler.IsActive(),
"the profiler should be inactive",
interval,
maxTries
);
}
async function cleanupAfterTest() {
// We need to cleanup written profiles after a test
// Get the system downloads directory, and use it to build a profile file
let profile = FileUtils.File(await Downloads.getSystemDownloadsDirectory());
// Get the process ID
let pid = Services.appinfo.processID;
// write it to the profile file name
profile.append(`profile_0_${pid}.json`);
// remove the file!
await IOUtils.remove(profile.path, { ignoreAbsent: true });
// Make sure the profiler is fully stopped, even if the test failed
await Services.profiler.StopProfiler();
}
// Hardcode the constants SIGUSR1 and SIGUSR2.
// This is an absolutely terrible idea, as they are implementation defined!
// However, it turns out that for 99% of the platforms we care about, and for
// 99.999% of the platforms we test, these constants are, well, constant.
// Additionally, these constants are only for _testing_ the signal handling
// feature - the actual feature relies on platform specific definitions. This
// may cause a mismatch if we test on on, say, a gnu hurd kernel, or on a
// linux kernel running on sparc, but the feature will not break - only
// the testing.
// const SIGUSR1 = Services.appinfo.OS === "Darwin" ? 30 : 10;
const SIGUSR2 = Services.appinfo.OS === "Darwin" ? 31 : 12;
add_task(async () => {
info("Test that stopping the profiler with a posix signal works.");
registerCleanupFunction(cleanupAfterTest);
Assert.ok(
!Services.profiler.IsActive(),
"The profiler should not begin the test active."
);
const entries = 100;
const interval = 1;
const threads = [];
const features = [];
// Start the profiler, and ensure that it's active
await Services.profiler.StartProfiler(entries, interval, threads, features);
Assert.ok(Services.profiler.IsActive(), "The profiler should now be active.");
// Get the process ID
let pid = Services.appinfo.processID;
// Try and stop the profiler using a signal.
let result = raiseSignal(pid, SIGUSR2);
Assert.ok(result, "Raising a SIGUSR2 signal should succeed.");
await waitUntilProfilerStopped();
do_test_finished();
});
add_task(async () => {
info(
"Test that stopping the profiler with a posix signal writes a profile file to the system download directory."
);
registerCleanupFunction(cleanupAfterTest);
Assert.ok(
!Services.profiler.IsActive(),
"The profiler should not begin the test active."
);
const entries = 100;
const interval = 1;
const threads = [];
const features = [];
// Get the system downloads directory, and use it to build a profile file
let profile = FileUtils.File(await Downloads.getSystemDownloadsDirectory());
// Get the process ID
let pid = Services.appinfo.processID;
// use the pid to construct the name of the profile, and resulting file
profile.append(`profile_0_${pid}.json`);
// Start the profiler, and ensure that it's active
await Services.profiler.StartProfiler(entries, interval, threads, features);
Assert.ok(Services.profiler.IsActive(), "The profiler should now be active.");
// Try and stop the profiler using a signal.
let result = raiseSignal(pid, SIGUSR2);
Assert.ok(result, "Raising a SIGUSR2 signal should succeed.");
// Wait for the file to exist
await BrowserTestUtils.waitForCondition(
async () => await IOUtils.exists(profile.path),
"Waiting for a profile file to be written to disk."
);
await waitUntilProfilerStopped();
Assert.ok(
!Services.profiler.IsActive(),
"The profiler should now be inactive."
);
do_test_finished();
});

Просмотреть файл

@ -24,9 +24,6 @@ skip-if = ["!debug"]
["test_feature_fileioall.js"]
skip-if = ["release_or_beta"]
# The sanitizer checks appears to overwrite our own memory hooks in xpcshell tests,
# and no allocation markers are gathered. Skip this test in that configuration.
["test_feature_java.js"]
skip-if = ["os != 'android'"]
@ -50,6 +47,12 @@ skip-if = [
"socketprocess_networking",
]
["test_feature_posix_signals.js"]
skip-if = [
"ccov",
"os == 'win'",
]
# Native stackwalking is somewhat unreliable depending on the platform.
#
# We don't have frame pointers on macOS release and beta, so stack walking does not
@ -61,6 +64,9 @@ skip-if = [
# For sanitizer builds, there were many intermittents, and we're not getting much
# additional coverage there, so it's better to be a bit more reliable.
# The sanitizer checks appears to overwrite our own memory hooks in xpcshell tests,
# and no allocation markers are gathered. Skip this test in that configuration.
["test_feature_stackwalking.js"]
skip-if = [
"os == 'mac' && release_or_beta",

Просмотреть файл

@ -325,6 +325,8 @@ NS_InitXPCOM(nsIServiceManager** aResult, nsIFile* aBinDirectory,
if (NS_WARN_IF(NS_FAILED(rv))) {
return rv;
}
// Initialise the profiler
AUTO_PROFILER_INIT2;
// Set up the timer globals/timer thread
@ -464,6 +466,11 @@ NS_InitXPCOM(nsIServiceManager** aResult, nsIFile* aBinDirectory,
// to the directory service.
nsDirectoryService::gService->RegisterCategoryProviders();
// Now that both the profiler and directory services have been started
// we can find the download directory, where the profiler can write
// profiles if necessary
profiler_lookup_download_directory();
// Init mozilla::SharedThreadPool (which needs the service manager).
mozilla::SharedThreadPool::InitStatics();