diff --git a/js/src/jit/ProcessExecutableMemory.cpp b/js/src/jit/ProcessExecutableMemory.cpp index 9224824c9166..5dd04def0522 100644 --- a/js/src/jit/ProcessExecutableMemory.cpp +++ b/js/src/jit/ProcessExecutableMemory.cpp @@ -619,6 +619,13 @@ js::jit::ReleaseProcessExecutableMemory() execMemory.release(); } +size_t +js::jit::LikelyAvailableExecutableMemory() +{ + // Round down available memory to the closest MB. + return MaxCodeBytesPerProcess - AlignBytes(execMemory.bytesAllocated(), 0x100000U); +} + bool js::jit::CanLikelyAllocateMoreExecutableMemory() { diff --git a/js/src/jit/ProcessExecutableMemory.h b/js/src/jit/ProcessExecutableMemory.h index c578becb80e6..26a997168c94 100644 --- a/js/src/jit/ProcessExecutableMemory.h +++ b/js/src/jit/ProcessExecutableMemory.h @@ -50,6 +50,11 @@ extern void DeallocateExecutableMemory(void* addr, size_t bytes); // function. extern bool CanLikelyAllocateMoreExecutableMemory(); +// Returns a rough guess of how much executable memory remains available, +// rounded down to MB limit. Note this can fluctuate as other threads within +// the process allocate executable memory. +extern size_t LikelyAvailableExecutableMemory(); + } // namespace jit } // namespace js diff --git a/js/src/wasm/WasmCompile.cpp b/js/src/wasm/WasmCompile.cpp index 18bce76b9060..0cc473b6f92f 100644 --- a/js/src/wasm/WasmCompile.cpp +++ b/js/src/wasm/WasmCompile.cpp @@ -23,6 +23,7 @@ #include "jsprf.h" +#include "jit/ProcessExecutableMemory.h" #include "wasm/WasmBaselineCompile.h" #include "wasm/WasmBinaryIterator.h" #include "wasm/WasmGenerator.h" @@ -103,10 +104,277 @@ CompileArgs::initFromContext(JSContext* cx, ScriptedCaller&& scriptedCaller) return assumptions.initBuildIdFromContext(cx); } -static bool -BackgroundWorkPossible() +// Classify the current system as one of a set of recognizable classes. This +// really needs to get our tier-1 systems right. +// +// TODO: We don't yet have a good measure of how fast a system is. 
We +// distinguish between mobile and desktop because these are very different kinds +// of systems, but we could further distinguish between low / medium / high end +// within those major classes. If we do so, then constants below would be +// provided for each (class, architecture, system-tier) combination, not just +// (class, architecture) as now. +// +// CPU clock speed is not by itself a good predictor of system performance, as +// there are high-performance systems with slow clocks (recent Intel) and +// low-performance systems with fast clocks (older AMD). We can also use +// physical memory, core configuration, OS details, CPU class and family, and +// CPU manufacturer to disambiguate. + +enum class SystemClass { - return CanUseExtraThreads() && HelperThreadState().cpuCount > 1; + DesktopX86, + DesktopX64, + DesktopUnknown32, + DesktopUnknown64, + MobileX86, + MobileArm32, + MobileArm64, + MobileUnknown32, + MobileUnknown64 +}; + +static SystemClass +Classify() +{ + bool isDesktop; + +#if defined(ANDROID) || defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64) + isDesktop = false; +#else + isDesktop = true; +#endif + + if (isDesktop) { +#if defined(JS_CODEGEN_X64) + return SystemClass::DesktopX64; +#elif defined(JS_CODEGEN_X86) + return SystemClass::DesktopX86; +#elif defined(JS_64BIT) + return SystemClass::DesktopUnknown64; +#else + return SystemClass::DesktopUnknown32; +#endif + } else { +#if defined(JS_CODEGEN_X86) + return SystemClass::MobileX86; +#elif defined(JS_CODEGEN_ARM) + return SystemClass::MobileArm32; +#elif defined(JS_CODEGEN_ARM64) + return SystemClass::MobileArm64; +#elif defined(JS_64BIT) + return SystemClass::MobileUnknown64; +#else + return SystemClass::MobileUnknown32; +#endif + } +} + +#ifndef JS_64BIT + +// Code sizes in machine code bytes per bytecode byte, again empirical except +// where marked as "Guess". 
+ +static const double x64Tox86Inflation = 1.25; + +static const double x64IonBytesPerBytecode = 2.45; +static const double x86IonBytesPerBytecode = x64IonBytesPerBytecode * x64Tox86Inflation; +static const double arm32IonBytesPerBytecode = 3.3; +static const double arm64IonBytesPerBytecode = 3.0; // Guess + +static const double x64BaselineBytesPerBytecode = x64IonBytesPerBytecode * 1.43; +static const double x86BaselineBytesPerBytecode = x64BaselineBytesPerBytecode * x64Tox86Inflation; +static const double arm32BaselineBytesPerBytecode = arm32IonBytesPerBytecode * 1.39; +static const double arm64BaselineBytesPerBytecode = arm64IonBytesPerBytecode * 1.39; // Guess + +static double +IonBytesPerBytecode(SystemClass cls) +{ + switch (cls) { + case SystemClass::DesktopX86: + case SystemClass::MobileX86: + case SystemClass::DesktopUnknown32: + return x86IonBytesPerBytecode; + case SystemClass::DesktopX64: + case SystemClass::DesktopUnknown64: + return x64IonBytesPerBytecode; + case SystemClass::MobileArm32: + case SystemClass::MobileUnknown32: + return arm32IonBytesPerBytecode; + case SystemClass::MobileArm64: + case SystemClass::MobileUnknown64: + return arm64IonBytesPerBytecode; + default: + MOZ_CRASH(); + } +} + +static double +BaselineBytesPerBytecode(SystemClass cls) +{ + switch (cls) { + case SystemClass::DesktopX86: + case SystemClass::MobileX86: + case SystemClass::DesktopUnknown32: + return x86BaselineBytesPerBytecode; + case SystemClass::DesktopX64: + case SystemClass::DesktopUnknown64: + return x64BaselineBytesPerBytecode; + case SystemClass::MobileArm32: + case SystemClass::MobileUnknown32: + return arm32BaselineBytesPerBytecode; + case SystemClass::MobileArm64: + case SystemClass::MobileUnknown64: + return arm64BaselineBytesPerBytecode; + default: + MOZ_CRASH(); + } +} + +#endif // !JS_64BIT + +// If parallel Ion compilation is going to take longer than this, we should tier. 
+ +static const double tierCutoffMs = 250; + +// Compilation rate values are empirical except when noted, the reference +// systems are: +// +// Late-2013 MacBook Pro (2.6GHz quad hyperthreaded Haswell) +// Late-2015 Nexus 5X (1.4GHz quad Cortex-A53 + 1.8GHz dual Cortex-A57) + +static const double x64BytecodesPerMs = 2100; +static const double x86BytecodesPerMs = 1500; +static const double arm32BytecodesPerMs = 450; +static const double arm64BytecodesPerMs = 650; // Guess + +// Tiering cutoff values: if code section sizes are below these values (when +// divided by the effective number of cores) we do not tier, because we guess +// that parallel Ion compilation will be fast enough. + +static const double x64DesktopTierCutoff = x64BytecodesPerMs * tierCutoffMs; +static const double x86DesktopTierCutoff = x86BytecodesPerMs * tierCutoffMs; +static const double x86MobileTierCutoff = x86DesktopTierCutoff / 2; // Guess +static const double arm32MobileTierCutoff = arm32BytecodesPerMs * tierCutoffMs; +static const double arm64MobileTierCutoff = arm64BytecodesPerMs * tierCutoffMs; + +static double +CodesizeCutoff(SystemClass cls, uint32_t codeSize) +{ + switch (cls) { + case SystemClass::DesktopX86: + case SystemClass::DesktopUnknown32: + return x86DesktopTierCutoff; + case SystemClass::DesktopX64: + case SystemClass::DesktopUnknown64: + return x64DesktopTierCutoff; + case SystemClass::MobileX86: + return x86MobileTierCutoff; + case SystemClass::MobileArm32: + case SystemClass::MobileUnknown32: + return arm32MobileTierCutoff; + case SystemClass::MobileArm64: + case SystemClass::MobileUnknown64: + return arm64MobileTierCutoff; + default: + MOZ_CRASH(); + } +} + +// As the number of cores grows the effectiveness of each core dwindles (on the +// systems we care about for SpiderMonkey). +// +// The data are empirical, computed from the observed compilation time of the +// Tanks demo code on a variable number of cores. 
+// +// The heuristic may fail on NUMA systems where the core count is high but the +// performance increase is nil or negative once the program moves beyond one +// socket. However, few browser users have such systems. + +static double +EffectiveCores(SystemClass cls, uint32_t cores) +{ + if (cores <= 3) + return pow(cores, 0.9); + return pow(cores, 0.75); +} + +#ifndef JS_64BIT +// Don't tier if tiering will fill code memory to more than this +// fraction. + +static const double spaceCutoffPct = 0.9; +#endif + +// Figure out whether we should use tiered compilation or not. +static bool +GetTieringEnabled(uint32_t codeSize) +{ + if (!CanUseExtraThreads()) + return false; + + uint32_t cpuCount = HelperThreadState().cpuCount; + MOZ_ASSERT(cpuCount > 0); + + // It's mostly sensible not to background compile when there's only one + // hardware thread as we want foreground computation to have access to that. + // However, if wasm background compilation helper threads can be given lower + // priority then background compilation on single-core systems still makes + // some kind of sense. That said, this is a non-issue: as of September 2017 + // 1-core was down to 3.5% of our population and falling. + + if (cpuCount == 1) + return false; + + MOZ_ASSERT(HelperThreadState().threadCount >= cpuCount); + + // Compute the max number of threads available to do actual background + // compilation work. + + uint32_t workers = HelperThreadState().maxWasmCompilationThreads(); + + // The number of cores we will use is bounded both by the CPU count and the + // worker count. + + uint32_t cores = Min(cpuCount, workers); + + SystemClass cls = Classify(); + + // Ion compilation on available cores must take long enough to be worth the + // bother. 
+ + double cutoffSize = CodesizeCutoff(cls, codeSize); + double effectiveCores = EffectiveCores(cls, cores); + + if ((codeSize / effectiveCores) < cutoffSize) + return false; + + // Do not implement a size cutoff for 64-bit systems since the code size + // budget for 64 bit is so large that it will hardly ever be an issue. + // (Also the cutoff percentage might be different on 64-bit.) + +#ifndef JS_64BIT + // If the amount of executable code for baseline compilation jeopardizes the + // availability of executable memory for ion code then do not tier, for now. + // + // TODO: For now we consider this module in isolation. We should really + // worry about what else is going on in this process and might be filling up + // the code memory. It's like we need some kind of code memory reservation + // system or JIT compilation for large modules. + + double ionRatio = IonBytesPerBytecode(cls); + double baselineRatio = BaselineBytesPerBytecode(cls); + double needMemory = codeSize * (ionRatio + baselineRatio); + double availMemory = LikelyAvailableExecutableMemory(); + double cutoff = spaceCutoffPct * MaxCodeBytesPerProcess; + + // If the sum of baseline and ion code makes us exceed some set percentage + // of the executable memory then disable tiering. + + if ((MaxCodeBytesPerProcess - availMemory) + needMemory > cutoff) + return false; +#endif + + return true; } SharedModule @@ -118,25 +386,30 @@ wasm::CompileInitialTier(const ShareableBytes& bytecode, const CompileArgs& args bool debugEnabled = BaselineCanCompile() && args.debugEnabled; bool ionEnabled = args.ionEnabled || !baselineEnabled; - CompileMode mode; - Tier tier; - DebugEnabled debug; - if (BackgroundWorkPossible() && baselineEnabled && ionEnabled && !debugEnabled) { - mode = CompileMode::Tier1; - tier = Tier::Baseline; - debug = DebugEnabled::False; - } else { - mode = CompileMode::Once; - tier = debugEnabled || !ionEnabled ? Tier::Baseline : Tier::Ion; - debug = debugEnabled ? 
DebugEnabled::True : DebugEnabled::False; - } + DebugEnabled debug = debugEnabled ? DebugEnabled::True : DebugEnabled::False; - ModuleEnvironment env(mode, tier, debug); + ModuleEnvironment env(ModuleEnvironment::UnknownMode, ModuleEnvironment::UnknownTier, debug); Decoder d(bytecode.bytes, error); if (!DecodeModuleEnvironment(d, &env)) return nullptr; + uint32_t codeSize; + if (!d.peekSectionSize(SectionId::Code, &env, "code", &codeSize)) + codeSize = 0; + + CompileMode mode; + Tier tier; + if (baselineEnabled && ionEnabled && !debugEnabled && GetTieringEnabled(codeSize)) { + mode = CompileMode::Tier1; + tier = Tier::Baseline; + } else { + mode = CompileMode::Once; + tier = debugEnabled || !ionEnabled ? Tier::Baseline : Tier::Ion; + } + + env.setModeAndTier(mode, tier); + ModuleGenerator mg(args, &env, nullptr, error); if (!mg.init()) return nullptr; diff --git a/js/src/wasm/WasmGenerator.h b/js/src/wasm/WasmGenerator.h index 9b83114e2ed8..a6a1707612e4 100644 --- a/js/src/wasm/WasmGenerator.h +++ b/js/src/wasm/WasmGenerator.h @@ -144,10 +144,10 @@ class CompileTask return units_; } Tier tier() const { - return env_.tier; + return env_.tier(); } CompileMode mode() const { - return env_.mode; + return env_.mode(); } bool debugEnabled() const { return env_.debug == DebugEnabled::True; @@ -243,8 +243,8 @@ class MOZ_STACK_CLASS ModuleGenerator MOZ_MUST_USE bool initWasm(); bool isAsmJS() const { return env_->isAsmJS(); } - Tier tier() const { return env_->tier; } - CompileMode mode() const { return env_->mode; } + Tier tier() const { return env_->tier(); } + CompileMode mode() const { return env_->mode(); } bool debugEnabled() const { return env_->debugEnabled(); } public: diff --git a/js/src/wasm/WasmValidate.cpp b/js/src/wasm/WasmValidate.cpp index 31bb3143c43a..f08e0b11b6a3 100644 --- a/js/src/wasm/WasmValidate.cpp +++ b/js/src/wasm/WasmValidate.cpp @@ -60,7 +60,7 @@ Decoder::fail(size_t errorOffset, const char* msg) bool Decoder::startSection(SectionId id, 
ModuleEnvironment* env, uint32_t* sectionStart, - uint32_t* sectionSize, const char* sectionName) + uint32_t* sectionSize, const char* sectionName, bool peeking) { // Record state at beginning of section to allow rewinding to this point // if, after skipping through several custom sections, we don't find the @@ -85,8 +85,11 @@ Decoder::startSection(SectionId id, ModuleEnvironment* env, uint32_t* sectionSta // Rewind to the beginning of the current section since this is what // skipCustomSection() assumes. cur_ = currentSectionStart; - if (!skipCustomSection(env)) + if (!skipCustomSection(env)) { + if (peeking) + goto rewind; return false; + } // Having successfully skipped a custom section, consider the next // section. @@ -97,22 +100,39 @@ Decoder::startSection(SectionId id, ModuleEnvironment* env, uint32_t* sectionSta // Found it, now start the section. - if (!readVarU32(sectionSize) || bytesRemain() < *sectionSize) + if (!readVarU32(sectionSize) || bytesRemain() < *sectionSize) { + if (peeking) + goto rewind; goto fail; + } *sectionStart = cur_ - beg_; + if (peeking) + goto rewind_peeking; return true; rewind: + peeking = false; + rewind_peeking: cur_ = initialCur; env->customSections.shrinkTo(initialCustomSectionsLength); - *sectionStart = NotStarted; + if (!peeking) + *sectionStart = NotStarted; return true; fail: return failf("failed to start %s section", sectionName); } +bool +Decoder::peekSectionSize(SectionId id, ModuleEnvironment* env, const char* sectionName, uint32_t* sectionSize) +{ + uint32_t sectionStart; + if (!startSection(id, env, &sectionStart, sectionSize, sectionName, /*peeking=*/true)) + return false; + return sectionStart != NotStarted; +} + bool Decoder::finishSection(uint32_t sectionStart, uint32_t sectionSize, const char* sectionName) { diff --git a/js/src/wasm/WasmValidate.h b/js/src/wasm/WasmValidate.h index 48dfeb1004b1..684eddd58a86 100644 --- a/js/src/wasm/WasmValidate.h +++ b/js/src/wasm/WasmValidate.h @@ -36,11 +36,14 @@ namespace wasm 
{ struct ModuleEnvironment { // Constant parameters for the entire compilation: - const CompileMode mode; - const Tier tier; const DebugEnabled debug; const ModuleKind kind; + // Constant parameters determined no later than at the start of the code + // section: + CompileMode mode_; + Tier tier_; + // Module fields filled out incrementally during decoding: MemoryUsage memoryUsage; Atomic minMemoryLength; @@ -59,18 +62,35 @@ struct ModuleEnvironment NameInBytecodeVector funcNames; CustomSectionVector customSections; + static const CompileMode UnknownMode = (CompileMode)-1; + static const Tier UnknownTier = (Tier)-1; + explicit ModuleEnvironment(CompileMode mode = CompileMode::Once, Tier tier = Tier::Ion, DebugEnabled debug = DebugEnabled::False, ModuleKind kind = ModuleKind::Wasm) - : mode(mode), - tier(tier), - debug(debug), + : debug(debug), kind(kind), + mode_(mode), + tier_(tier), memoryUsage(MemoryUsage::None), minMemoryLength(0) {} + CompileMode mode() const { + MOZ_ASSERT(mode_ != UnknownMode); + return mode_; + } + Tier tier() const { + MOZ_ASSERT(tier_ != UnknownTier); + return tier_; + } + void setModeAndTier(CompileMode mode, Tier tier) { + MOZ_ASSERT(mode_ == UnknownMode); + MOZ_ASSERT(tier_ == UnknownTier); + mode_ = mode; + tier_ = tier; + } size_t numTables() const { return tables.length(); } @@ -546,10 +566,15 @@ class Decoder ModuleEnvironment* env, uint32_t* sectionStart, uint32_t* sectionSize, - const char* sectionName); + const char* sectionName, + bool peeking = false); MOZ_MUST_USE bool finishSection(uint32_t sectionStart, uint32_t sectionSize, const char* sectionName); + MOZ_MUST_USE bool peekSectionSize(SectionId id, + ModuleEnvironment* env, + const char* sectionName, + uint32_t* sectionSize); // Custom sections do not cause validation errors unless the error is in // the section header itself.