Bug 1661016: aarch64: Invalidate icache when compiling on a background thread; r=nbp,lth

On real hardware, when a background thread finishes compilation, it must signal
to the other executing threads that they need to reload a new version of the
code. Ideally, each executing thread would run an ISB instruction to do so.
Instead, we use the membarrier system call, which interrupts every other running
thread and has the same effect as a local ISB would. It is heavyweight, so we
make sure to run it only when compiling on a background thread.

In the simulator, before this patch, pending icache flushing requests were never
satisfied when the request was emitted from a thread other than the main thread.
This patch emulates behavior similar to the hardware case described above.

Differential Revision: https://phabricator.services.mozilla.com/D88395
This commit is contained in:
Benjamin Bouvier 2020-09-02 08:17:33 +00:00
Родитель 2452f172af
Коммит e16e5f0a18
19 изменённых файлов: 212 добавлений и 36 удалений

Просмотреть файл

@ -177,9 +177,19 @@ class ExecutableAllocator {
}
MOZ_MUST_USE
static bool makeExecutableAndFlushICache(void* start, size_t size) {
static bool makeExecutableAndFlushICache(FlushICacheSpec flushSpec,
void* start, size_t size) {
MustFlushICache mustFlushICache;
switch (flushSpec) {
case FlushICacheSpec::LocalThreadOnly:
mustFlushICache = MustFlushICache::LocalThreadOnly;
break;
case FlushICacheSpec::AllThreads:
mustFlushICache = MustFlushICache::AllThreads;
break;
}
return ReprotectRegion(start, size, ProtectionSetting::Executable,
MustFlushICache::Yes);
mustFlushICache);
}
static void poisonCode(JSRuntime* rt, JitPoisonRangeVector& ranges);

Просмотреть файл

@ -18,18 +18,22 @@ namespace jit {
#if defined(JS_CODEGEN_X86) || defined(JS_CODEGEN_X64)
inline void FlushICache(void* code, size_t size) {
inline void FlushICache(void* code, size_t size,
bool codeIsThreadLocal = true) {
// No-op. Code and data caches are coherent on x86 and x64.
}
#elif (defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64)) || \
(defined(JS_CODEGEN_MIPS32) || defined(JS_CODEGEN_MIPS64))
extern void FlushICache(void* code, size_t size);
extern void FlushICache(void* code, size_t size, bool codeIsThreadLocal = true);
#elif defined(JS_CODEGEN_NONE)
inline void FlushICache(void* code, size_t size) { MOZ_CRASH(); }
inline void FlushICache(void* code, size_t size,
bool codeIsThreadLocal = true) {
MOZ_CRASH();
}
#else
# error "Unknown architecture!"

Просмотреть файл

@ -742,7 +742,8 @@ class MOZ_RAII AutoWritableJitCodeFallible {
}
~AutoWritableJitCodeFallible() {
if (!ExecutableAllocator::makeExecutableAndFlushICache(addr_, size_)) {
if (!ExecutableAllocator::makeExecutableAndFlushICache(
FlushICacheSpec::LocalThreadOnly, addr_, size_)) {
MOZ_CRASH();
}
rt_->toggleAutoWritableJitCodeActive(false);

Просмотреть файл

@ -738,9 +738,11 @@ bool js::jit::ReprotectRegion(void* start, size_t size,
ProtectionSetting protection,
MustFlushICache flushICache) {
// Flush ICache when making code executable, before we modify |size|.
if (flushICache == MustFlushICache::Yes) {
if (flushICache == MustFlushICache::LocalThreadOnly ||
flushICache == MustFlushICache::AllThreads) {
MOZ_ASSERT(protection == ProtectionSetting::Executable);
jit::FlushICache(start, size);
bool codeIsThreadLocal = flushICache == MustFlushICache::LocalThreadOnly;
jit::FlushICache(start, size, codeIsThreadLocal);
}
// Calculate the start of the page containing this region,

Просмотреть файл

@ -70,7 +70,15 @@ enum class ProtectionSetting {
Executable,
};
enum class MustFlushICache { No, Yes };
// Whether the instruction cache must be flushed:
// - No means no flushing will happen.
// - LocalThreadOnly means only the local thread's icache will be flushed.
// - AllThreads means all the threads' icaches will be flushed; this must be
//   used when the compiling thread and the executing thread might be different.
enum class MustFlushICache { No, LocalThreadOnly, AllThreads };
enum class FlushICacheSpec { LocalThreadOnly, AllThreads };
extern MOZ_MUST_USE bool ReprotectRegion(void* start, size_t size,
ProtectionSetting protection,

Просмотреть файл

@ -452,7 +452,7 @@ uint32_t FloatRegisters::ActualTotalPhys() {
return 16;
}
void FlushICache(void* code, size_t size) {
void FlushICache(void* code, size_t size, bool codeIsThreadLocal) {
#if defined(JS_SIMULATOR_ARM)
js::jit::SimulatorProcess::FlushICache(code, size);

Просмотреть файл

@ -78,8 +78,12 @@ uint32_t FloatRegister::getRegisterDumpOffsetInBytes() {
uint32_t GetARM64Flags() { return 0; }
void FlushICache(void* code, size_t size) {
vixl::CPU::EnsureIAndDCacheCoherency(code, size);
void FlushICache(void* code, size_t size, bool codeIsThreadLocal) {
vixl::CPU::EnsureIAndDCacheCoherency(code, size, codeIsThreadLocal);
}
bool CanFlushICacheFromBackgroundThreads() {
return vixl::CPU::CanFlushICacheFromBackgroundThreads();
}
} // namespace jit

Просмотреть файл

@ -616,6 +616,8 @@ inline bool hasMultiAlias() { return false; }
uint32_t GetARM64Flags();
bool CanFlushICacheFromBackgroundThreads();
} // namespace jit
} // namespace js

Просмотреть файл

@ -165,7 +165,11 @@ class CPU {
// the I and D caches. I and D caches are not automatically coherent on ARM
// so this operation is required before any dynamically generated code can
// safely run.
static void EnsureIAndDCacheCoherency(void *address, size_t length);
static void EnsureIAndDCacheCoherency(void *address, size_t length, bool codeIsThreadLocal);
// Returns true when the current machine supports flushing the instruction
// cache on a background thread.
static bool CanFlushICacheFromBackgroundThreads();
// Read and interpret the ID registers. This requires
// CPUFeatures::kIDRegisterEmulation, and therefore cannot be called on

Просмотреть файл

@ -33,6 +33,26 @@
# include <libkern/OSCacheControl.h>
#endif
#if defined(__aarch64__) && !defined(_MSC_VER) && !defined(XP_DARWIN)
# if defined(__linux__)
# include <linux/membarrier.h>
# include <sys/syscall.h>
# include <sys/utsname.h>
# elif defined(__ANDROID__)
# include <sys/syscall.h>
# include <unistd.h>
# else
# error "Missing platform-specific declarations for membarrier syscall!"
# endif // __linux__ / ANDROID
# include "vm/JSContext.h" // TlsContext
// Thin wrapper invoking the membarrier syscall directly via syscall(2),
// since it may not be exposed by the C library headers.
static int membarrier(int cmd, int flags) {
return syscall(__NR_membarrier, cmd, flags);
}
#endif // __aarch64__
namespace vixl {
@ -84,8 +104,47 @@ uint32_t CPU::GetCacheType() {
#endif
}
bool CPU::CanFlushICacheFromBackgroundThreads() {
#if defined(__aarch64__) && !defined(_MSC_VER) && !defined(XP_DARWIN)
// On Linux, check that the kernel supports membarrier(2), that is, that the
// kernel version is Linux 4.16 or above.
//
// Note: this code has been extracted (August 2020) from
// https://android.googlesource.com/platform/art/+/58520dfba31d6eeef75f5babff15e09aa28e5db8/libartbase/base/membarrier.cc#50
static constexpr int kRequiredMajor = 4;
static constexpr int kRequiredMinor = 16;
void CPU::EnsureIAndDCacheCoherency(void *address, size_t length) {
static bool computed = false;
static bool kernelHasMembarrier = false;
if (!computed) {
struct utsname uts;
int major, minor;
kernelHasMembarrier = uname(&uts) == 0 &&
strcmp(uts.sysname, "Linux") == 0 &&
sscanf(uts.release, "%d.%d", &major, &minor) == 2 &&
major >= kRequiredMajor && (major != kRequiredMajor || minor >= kRequiredMinor);
computed = true;
}
if (!kernelHasMembarrier) {
return false;
}
// As a test bed, try to run the syscall with the command registering the
// intent to use the actual membarrier we'll want to carry out later.
if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, 0) != 0) {
return false;
}
return true;
#else
// On other platforms, we assume that the provided syscall does the right thing.
return true;
#endif
}
void CPU::EnsureIAndDCacheCoherency(void *address, size_t length, bool codeIsThreadLocal) {
#if defined(JS_SIMULATOR_ARM64) && defined(JS_CACHE_SIMULATOR_ARM64)
// This code attempts to emulate what the following assembly sequence is
// doing, which is sending the information to all cores that some cache line
@ -107,6 +166,11 @@ void CPU::EnsureIAndDCacheCoherency(void *address, size_t length) {
Simulator* sim = vixl::Simulator::Current();
if (sim) {
sim->FlushICache();
} else if (!codeIsThreadLocal) {
// We're on a background thread; emulate what the real hardware would do by
// emitting a membarrier that'll interrupt and cause an icache invalidation
// on all the threads.
SimulatorProcess::membarrier();
}
#elif defined(_MSC_VER) && defined(_M_ARM64)
FlushInstructionCache(GetCurrentProcess(), address, length);
@ -199,6 +263,21 @@ void CPU::EnsureIAndDCacheCoherency(void *address, size_t length) {
// isb : Instruction Synchronisation Barrier
" isb\n"
: : : "memory");
if (!codeIsThreadLocal) {
// If we're on a background thread, emit a membarrier that will synchronize
// all the executing threads with the new version of the code.
JSContext* cx = js::TlsContext.get();
if (!cx || !cx->isMainThreadContext()) {
MOZ_RELEASE_ASSERT(CPU::CanFlushICacheFromBackgroundThreads());
// The intent to use this command has been carried over in
// CanFlushICacheFromBackgroundThreads.
if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, 0) != 0) {
// Better safe than sorry.
MOZ_CRASH("membarrier can't be executed");
}
}
}
#else
// If the host isn't AArch64, we must be using the simulator, so this function
// doesn't have to do anything.

Просмотреть файл

@ -208,6 +208,16 @@ void Simulator::Destroy(Simulator* sim) {
// Decodes and executes the instruction at the current simulated pc, first
// servicing any icache-flush request posted by another thread.
void Simulator::ExecuteInstruction() {
// The program counter should always be aligned.
VIXL_ASSERT(IsWordAligned(pc_));
if (pendingCacheRequests) {
// We're emulating here the effect that the membarrier carried out on real
// hardware would have; see the syscalls to membarrier in MozCpu-vixl.cpp.
// There's a slight difference in that the simulator is not being
// interrupted: instead, we effectively run the icache flush request
// before executing the next instruction, which is close enough and
// sufficient for our use case.
js::jit::AutoLockSimulatorCache alsc;
FlushICache();
}
decoder_->Decode(pc_);
increment_pc();
}
@ -889,6 +899,7 @@ Simulator::FlushICache()
decoder_->FlushICache(flush.start, flush.length);
}
vec.clear();
pendingCacheRequests = false;
}
void CachingDecoder::Decode(const Instruction* instr) {
@ -968,6 +979,13 @@ void SimulatorProcess::recordICacheFlush(void* start, size_t length) {
}
}
// Emulates the effect of the membarrier syscall in the simulator: flags a
// pending icache-flush request on every registered simulator thread, which
// each thread services before executing its next instruction (see
// Simulator::ExecuteInstruction). Caller must hold the process lock, as
// asserted below.
void SimulatorProcess::membarrier() {
MOZ_ASSERT(singleton_->lock_.ownedByCurrentThread());
for (auto& s : singleton_->pendingFlushes_) {
s.thread->pendingCacheRequests = true;
}
}
SimulatorProcess::ICacheFlushes& SimulatorProcess::getICacheFlushes(Simulator* sim) {
MOZ_ASSERT(singleton_->lock_.ownedByCurrentThread());
for (auto& s : singleton_->pendingFlushes_) {

Просмотреть файл

@ -503,6 +503,7 @@ class Simulator : public DecoderVisitor {
public:
#ifdef JS_CACHE_SIMULATOR_ARM64
using Decoder = CachingDecoder;
mozilla::Atomic<bool> pendingCacheRequests = mozilla::Atomic<bool>{ false };
#endif
explicit Simulator(Decoder* decoder, FILE* stream = stdout);
~Simulator();
@ -2522,6 +2523,7 @@ class SimulatorProcess
mozilla::Vector<SimFlushes, 1> pendingFlushes_;
static void recordICacheFlush(void* start, size_t length);
static void membarrier();
static ICacheFlushes& getICacheFlushes(Simulator* sim);
static MOZ_MUST_USE bool registerSimulator(Simulator* sim);
static void unregisterSimulator(Simulator* sim);

Просмотреть файл

@ -82,7 +82,7 @@ Registers::Code Registers::FromName(const char* name) {
return Invalid;
}
void FlushICache(void* code, size_t size) {
void FlushICache(void* code, size_t size, bool codeIsThreadLocal) {
#if defined(JS_SIMULATOR)
js::jit::SimulatorProcess::FlushICache(code, size);

Просмотреть файл

@ -876,8 +876,8 @@ bool InitializeJittedAtomics() {
masm.executableCopy(code);
// Reprotect the whole region to avoid having separate RW and RX mappings.
if (!ExecutableAllocator::makeExecutableAndFlushICache(code,
roundedCodeLength)) {
if (!ExecutableAllocator::makeExecutableAndFlushICache(
FlushICacheSpec::LocalThreadOnly, code, roundedCodeLength)) {
DeallocateExecutableMemory(code, roundedCodeLength);
return false;
}

Просмотреть файл

@ -49,8 +49,8 @@ static bool Execute(JSContext* cx, MacroAssembler& masm) {
if (!code) {
return false;
}
if (!ExecutableAllocator::makeExecutableAndFlushICache(code->raw(),
code->bufferSize())) {
if (!ExecutableAllocator::makeExecutableAndFlushICache(
FlushICacheSpec::LocalThreadOnly, code->raw(), code->bufferSize())) {
return false;
}

Просмотреть файл

@ -1458,8 +1458,9 @@ bool wasm::EnsureBuiltinThunksInitialized() {
MOZ_ASSERT(masm.callSiteTargets().empty());
MOZ_ASSERT(masm.trapSites().empty());
if (!ExecutableAllocator::makeExecutableAndFlushICache(thunks->codeBase,
thunks->codeSize)) {
if (!ExecutableAllocator::makeExecutableAndFlushICache(
FlushICacheSpec::LocalThreadOnly, thunks->codeBase,
thunks->codeSize)) {
return false;
}

Просмотреть файл

@ -376,7 +376,7 @@ UniqueModuleSegment ModuleSegment::create(Tier tier, const Bytes& unlinkedBytes,
linkData);
}
bool ModuleSegment::initialize(const CodeTier& codeTier,
bool ModuleSegment::initialize(IsTier2 isTier2, const CodeTier& codeTier,
const LinkData& linkData,
const Metadata& metadata,
const MetadataTier& metadataTier) {
@ -384,9 +384,15 @@ bool ModuleSegment::initialize(const CodeTier& codeTier,
return false;
}
// Optimized compilation finishes on a background thread, so we must make sure
// to flush the icaches of all the executing threads.
FlushICacheSpec flushIcacheSpec = isTier2 == IsTier2::Tier2
? FlushICacheSpec::AllThreads
: FlushICacheSpec::LocalThreadOnly;
// Reprotect the whole region to avoid having separate RW and RX mappings.
if (!ExecutableAllocator::makeExecutableAndFlushICache(
base(), RoundupCodeLength(length()))) {
flushIcacheSpec, base(), RoundupCodeLength(length()))) {
return false;
}
@ -661,6 +667,7 @@ static constexpr unsigned LAZY_STUB_LIFO_DEFAULT_CHUNK_SIZE = 8 * 1024;
bool LazyStubTier::createMany(const Uint32Vector& funcExportIndices,
const CodeTier& codeTier,
bool flushAllThreadsIcaches,
size_t* stubSegmentIndex) {
MOZ_ASSERT(funcExportIndices.length());
@ -739,7 +746,13 @@ bool LazyStubTier::createMany(const Uint32Vector& funcExportIndices,
Assembler::Bind(codePtr, label);
}
if (!ExecutableAllocator::makeExecutableAndFlushICache(codePtr, codeLength)) {
// Optimized compilation finishes on a background thread, so we must make sure
// to flush the icaches of all the executing threads.
FlushICacheSpec flushIcacheSpec = flushAllThreadsIcaches
? FlushICacheSpec::AllThreads
: FlushICacheSpec::LocalThreadOnly;
if (!ExecutableAllocator::makeExecutableAndFlushICache(flushIcacheSpec,
codePtr, codeLength)) {
return false;
}
@ -785,8 +798,13 @@ bool LazyStubTier::createOne(uint32_t funcExportIndex,
return false;
}
// This happens on the executing thread (called via GetInterpEntry), so no
// need to flush the icaches on all the threads.
bool flushAllThreadIcaches = false;
size_t stubSegmentIndex;
if (!createMany(funcExportIndexes, codeTier, &stubSegmentIndex)) {
if (!createMany(funcExportIndexes, codeTier, flushAllThreadIcaches,
&stubSegmentIndex)) {
return false;
}
@ -828,8 +846,13 @@ bool LazyStubTier::createTier2(const Uint32Vector& funcExportIndices,
return true;
}
// This compilation happens on a background compiler thread, so the icache may
// need to be flushed on all the threads.
bool flushAllThreadIcaches = true;
size_t stubSegmentIndex;
if (!createMany(funcExportIndices, codeTier, &stubSegmentIndex)) {
if (!createMany(funcExportIndices, codeTier, flushAllThreadIcaches,
&stubSegmentIndex)) {
return false;
}
@ -1031,15 +1054,15 @@ bool Metadata::getFuncName(NameContext ctx, uint32_t funcIndex,
return AppendFunctionIndexName(funcIndex, name);
}
bool CodeTier::initialize(const Code& code, const LinkData& linkData,
const Metadata& metadata) {
bool CodeTier::initialize(IsTier2 isTier2, const Code& code,
const LinkData& linkData, const Metadata& metadata) {
MOZ_ASSERT(!initialized());
code_ = &code;
MOZ_ASSERT(lazyStubs_.lock()->empty());
// See comments in CodeSegment::initialize() for why this must be last.
if (!segment_->initialize(*this, linkData, metadata, *metadata_)) {
if (!segment_->initialize(isTier2, *this, linkData, metadata, *metadata_)) {
return false;
}
@ -1150,7 +1173,7 @@ Code::Code(UniqueCodeTier tier1, const Metadata& metadata,
bool Code::initialize(const LinkData& linkData) {
MOZ_ASSERT(!initialized());
if (!tier1_->initialize(*this, linkData, *metadata_)) {
if (!tier1_->initialize(IsTier2::NotTier2, *this, linkData, *metadata_)) {
return false;
}
@ -1163,7 +1186,7 @@ bool Code::setTier2(UniqueCodeTier tier2, const LinkData& linkData) const {
MOZ_RELEASE_ASSERT(tier2->tier() == Tier::Optimized &&
tier1_->tier() == Tier::Baseline);
if (!tier2->initialize(*this, linkData, *metadata_)) {
if (!tier2->initialize(IsTier2::Tier2, *this, linkData, *metadata_)) {
return false;
}

Просмотреть файл

@ -162,6 +162,8 @@ class CodeSegment {
using UniqueModuleSegment = UniquePtr<ModuleSegment>;
enum IsTier2 { Tier2, NotTier2 };
class ModuleSegment : public CodeSegment {
const Tier tier_;
uint8_t* const trapCode_;
@ -175,8 +177,9 @@ class ModuleSegment : public CodeSegment {
static UniqueModuleSegment create(Tier tier, const Bytes& unlinkedBytes,
const LinkData& linkData);
bool initialize(const CodeTier& codeTier, const LinkData& linkData,
const Metadata& metadata, const MetadataTier& metadataTier);
bool initialize(IsTier2 compileMode, const CodeTier& codeTier,
const LinkData& linkData, const Metadata& metadata,
const MetadataTier& metadataTier);
Tier tier() const { return tier_; }
@ -519,7 +522,8 @@ class LazyStubTier {
size_t lastStubSegmentIndex_;
bool createMany(const Uint32Vector& funcExportIndices,
const CodeTier& codeTier, size_t* stubSegmentIndex);
const CodeTier& codeTier, bool flushAllThreadsIcaches,
size_t* stubSegmentIndex);
public:
LazyStubTier() : lastStubSegmentIndex_(0) {}
@ -579,7 +583,7 @@ class CodeTier {
lazyStubs_(mutexForTier(segment_->tier())) {}
bool initialized() const { return !!code_ && segment_->initialized(); }
bool initialize(const Code& code, const LinkData& linkData,
bool initialize(IsTier2 isTier2, const Code& code, const LinkData& linkData,
const Metadata& metadata);
Tier tier() const { return segment_->tier(); }

Просмотреть файл

@ -449,6 +449,20 @@ void CompilerEnvironment::computeParameters() {
state_ = Computed;
}
// Check that this architecture either:
// - is cache-coherent, which is the case for most tier-1 architectures we care
//   about,
// - or has the ability to invalidate the instruction cache of all threads, so
//   background compilation in tiered compilation can be synchronized across all
//   threads.
static bool IsICacheSafe() {
#ifdef JS_CODEGEN_ARM64
// aarch64 requires membarrier(2) support to invalidate other threads'
// icaches from a background compilation thread.
return jit::CanFlushICacheFromBackgroundThreads();
#else
return true;
#endif
}
void CompilerEnvironment::computeParameters(Decoder& d) {
MOZ_ASSERT(!isComputed());
@ -484,7 +498,7 @@ void CompilerEnvironment::computeParameters(Decoder& d) {
}
if (baselineEnabled && hasSecondTier && CanUseExtraThreads() &&
(TieringBeneficial(codeSectionSize) || forceTiering)) {
(TieringBeneficial(codeSectionSize) || forceTiering) && IsICacheSafe()) {
mode_ = CompileMode::Tier1;
tier_ = Tier::Baseline;
} else {