Bug 1380033 - Tiering policy with space proxy. r=luke

--HG--
extra : rebase_source : df238cefe35aae6d90e1ad29c8768389a7bdf9a6
extra : amend_source : b7c5abf772eeab5a8f46491c0dc7748b1ad521f2
This commit is contained in:
Lars T Hansen 2017-08-31 12:24:13 +02:00
Родитель ba64ef4717
Коммит 71a8cbdccf
6 изменённых файлов: 360 добавлений и 30 удалений

Просмотреть файл

@ -619,6 +619,13 @@ js::jit::ReleaseProcessExecutableMemory()
execMemory.release();
}
size_t
js::jit::LikelyAvailableExecutableMemory()
{
    // Round the allocated amount up to an MB boundary, then report the rest of
    // the per-process code budget, i.e. the available amount rounded down to
    // the closest MB.
    size_t used = AlignBytes(execMemory.bytesAllocated(), 0x100000U);
    // Clamp: a concurrent allocation on another thread (bytesAllocated() can
    // change under us) could otherwise make the unsigned subtraction wrap
    // around to a huge value.
    return used < MaxCodeBytesPerProcess ? MaxCodeBytesPerProcess - used : 0;
}
bool
js::jit::CanLikelyAllocateMoreExecutableMemory()
{

Просмотреть файл

@ -50,6 +50,11 @@ extern void DeallocateExecutableMemory(void* addr, size_t bytes);
// function.
extern bool CanLikelyAllocateMoreExecutableMemory();
// Returns a rough guess of how much executable memory remains available,
// rounded down to the nearest MB. Note this can fluctuate as other threads
// within the process allocate executable memory.
extern size_t LikelyAvailableExecutableMemory();
} // namespace jit
} // namespace js

Просмотреть файл

@ -23,6 +23,7 @@
#include "jsprf.h"
#include "jit/ProcessExecutableMemory.h"
#include "wasm/WasmBaselineCompile.h"
#include "wasm/WasmBinaryIterator.h"
#include "wasm/WasmGenerator.h"
@ -103,10 +104,277 @@ CompileArgs::initFromContext(JSContext* cx, ScriptedCaller&& scriptedCaller)
return assumptions.initBuildIdFromContext(cx);
}
// Classify the current system as one of a set of recognizable classes. This
// really needs to get our tier-1 systems right.
//
// TODO: We don't yet have a good measure of how fast a system is. We
// distinguish between mobile and desktop because these are very different kinds
// of systems, but we could further distinguish between low / medium / high end
// within those major classes. If we do so, then constants below would be
// provided for each (class, architecture, system-tier) combination, not just
// (class, architecture) as now.
//
// CPU clock speed is not by itself a good predictor of system performance, as
// there are high-performance systems with slow clocks (recent Intel) and
// low-performance systems with fast clocks (older AMD). We can also use
// physical memory, core configuration, OS details, CPU class and family, and
// CPU manufacturer to disambiguate.

enum class SystemClass
{
    DesktopX86,
    DesktopX64,
    DesktopUnknown32,
    DesktopUnknown64,
    MobileX86,
    MobileArm32,
    MobileArm64,
    MobileUnknown32,
    MobileUnknown64
};
// Decide which SystemClass best describes the host.  Mobile vs desktop is
// keyed off the build target: Android, or an ARM codegen target, implies
// mobile.  The architecture comes from the codegen defines, falling back to
// "unknown" of the appropriate word size.
static SystemClass
Classify()
{
#if defined(ANDROID) || defined(JS_CODEGEN_ARM) || defined(JS_CODEGEN_ARM64)
    const bool mobile = true;
#else
    const bool mobile = false;
#endif

    if (mobile) {
#if defined(JS_CODEGEN_X86)
        return SystemClass::MobileX86;
#elif defined(JS_CODEGEN_ARM)
        return SystemClass::MobileArm32;
#elif defined(JS_CODEGEN_ARM64)
        return SystemClass::MobileArm64;
#elif defined(JS_64BIT)
        return SystemClass::MobileUnknown64;
#else
        return SystemClass::MobileUnknown32;
#endif
    }

#if defined(JS_CODEGEN_X64)
    return SystemClass::DesktopX64;
#elif defined(JS_CODEGEN_X86)
    return SystemClass::DesktopX86;
#elif defined(JS_64BIT)
    return SystemClass::DesktopUnknown64;
#else
    return SystemClass::DesktopUnknown32;
#endif
}
#ifndef JS_64BIT
// Code sizes in machine code bytes per bytecode byte, again empirical except
// where marked as "Guess".  These are only needed for the executable-memory
// space check, which applies on 32-bit systems only, hence the JS_64BIT guard.
// x86 code is taken to be 25% larger than the corresponding x64 code.
static const double x64Tox86Inflation = 1.25;
static const double x64IonBytesPerBytecode = 2.45;
static const double x86IonBytesPerBytecode = x64IonBytesPerBytecode * x64Tox86Inflation;
static const double arm32IonBytesPerBytecode = 3.3;
static const double arm64IonBytesPerBytecode = 3.0; // Guess
// Baseline code is taken to be 43% (x64/x86) resp. 39% (ARM) larger than the
// corresponding Ion code.
static const double x64BaselineBytesPerBytecode = x64IonBytesPerBytecode * 1.43;
static const double x86BaselineBytesPerBytecode = x64BaselineBytesPerBytecode * x64Tox86Inflation;
static const double arm32BaselineBytesPerBytecode = arm32IonBytesPerBytecode * 1.39;
static const double arm64BaselineBytesPerBytecode = arm64IonBytesPerBytecode * 1.39; // Guess
// Expected Ion machine-code bytes generated per bytecode byte for the given
// system class.  Crashes on an unrecognized class.
static double
IonBytesPerBytecode(SystemClass cls)
{
    if (cls == SystemClass::DesktopX64 || cls == SystemClass::DesktopUnknown64)
        return x64IonBytesPerBytecode;
    if (cls == SystemClass::DesktopX86 || cls == SystemClass::MobileX86 ||
        cls == SystemClass::DesktopUnknown32)
    {
        return x86IonBytesPerBytecode;
    }
    if (cls == SystemClass::MobileArm32 || cls == SystemClass::MobileUnknown32)
        return arm32IonBytesPerBytecode;
    if (cls == SystemClass::MobileArm64 || cls == SystemClass::MobileUnknown64)
        return arm64IonBytesPerBytecode;
    MOZ_CRASH();
}
// Expected baseline machine-code bytes generated per bytecode byte for the
// given system class.  Crashes on an unrecognized class.
static double
BaselineBytesPerBytecode(SystemClass cls)
{
    if (cls == SystemClass::DesktopX64 || cls == SystemClass::DesktopUnknown64)
        return x64BaselineBytesPerBytecode;
    if (cls == SystemClass::DesktopX86 || cls == SystemClass::MobileX86 ||
        cls == SystemClass::DesktopUnknown32)
    {
        return x86BaselineBytesPerBytecode;
    }
    if (cls == SystemClass::MobileArm32 || cls == SystemClass::MobileUnknown32)
        return arm32BaselineBytesPerBytecode;
    if (cls == SystemClass::MobileArm64 || cls == SystemClass::MobileUnknown64)
        return arm64BaselineBytesPerBytecode;
    MOZ_CRASH();
}
#endif // !JS_64BIT
// If parallel Ion compilation is going to take longer than this (in ms), we
// should tier.
static const double tierCutoffMs = 250;
// Compilation rate values (bytecode bytes compiled by Ion per ms) are
// empirical except when noted; the reference systems are:
//
// Late-2013 MacBook Pro (2.6GHz quad hyperthreaded Haswell)
// Late-2015 Nexus 5X (1.4GHz quad Cortex-A53 + 1.8GHz dual Cortex-A57)
static const double x64BytecodesPerMs = 2100;
static const double x86BytecodesPerMs = 1500;
static const double arm32BytecodesPerMs = 450;
static const double arm64BytecodesPerMs = 650; // Guess
// Tiering cutoff values: if code section sizes are below these values (when
// divided by the effective number of cores) we do not tier, because we guess
// that parallel Ion compilation will be fast enough.
static const double x64DesktopTierCutoff = x64BytecodesPerMs * tierCutoffMs;
static const double x86DesktopTierCutoff = x86BytecodesPerMs * tierCutoffMs;
static const double x86MobileTierCutoff = x86DesktopTierCutoff / 2; // Guess
static const double arm32MobileTierCutoff = arm32BytecodesPerMs * tierCutoffMs;
static const double arm64MobileTierCutoff = arm64BytecodesPerMs * tierCutoffMs;
// Bytecode-size threshold below which we expect parallel Ion compilation on
// this system class to finish within tierCutoffMs.  Crashes on an
// unrecognized class.
//
// NOTE(review): codeSize is currently unused; the cutoff depends only on the
// system class.
static double
CodesizeCutoff(SystemClass cls, uint32_t codeSize)
{
    if (cls == SystemClass::DesktopX64 || cls == SystemClass::DesktopUnknown64)
        return x64DesktopTierCutoff;
    if (cls == SystemClass::DesktopX86 || cls == SystemClass::DesktopUnknown32)
        return x86DesktopTierCutoff;
    if (cls == SystemClass::MobileX86)
        return x86MobileTierCutoff;
    if (cls == SystemClass::MobileArm32 || cls == SystemClass::MobileUnknown32)
        return arm32MobileTierCutoff;
    if (cls == SystemClass::MobileArm64 || cls == SystemClass::MobileUnknown64)
        return arm64MobileTierCutoff;
    MOZ_CRASH();
}
// Compute the effective number of cores for compilation purposes: as the core
// count grows, the effectiveness of each core dwindles (on the systems we care
// about for SpiderMonkey), so a sublinear power-law discount is applied.  The
// exponents are empirical, fitted to the observed compilation time of the
// Tanks demo code on a variable number of cores.
//
// The heuristic may fail on NUMA systems where the core count is high but the
// performance increase is nil or negative once the program moves beyond one
// socket.  However, few browser users have such systems.
//
// NOTE(review): cls is currently unused; the discount is the same for every
// system class.
static double
EffectiveCores(SystemClass cls, uint32_t cores)
{
    double exponent = (cores <= 3) ? 0.9 : 0.75;
    return pow(cores, exponent);
}
#ifndef JS_64BIT
// Don't tier if tiering would fill executable-code memory to more than this
// fraction of the per-process budget (32-bit only; the 64-bit budget is large
// enough that no cutoff is applied there).
static const double spaceCutoffPct = 0.9;
#endif
// Figure out whether we should use tiered compilation or not.  Tiering is
// enabled only when (a) background helper threads are available, (b) the Ion
// compile of this module is estimated to take long enough to be worth doing a
// fast baseline compile first, and (on 32-bit only) (c) holding both baseline
// and Ion code is unlikely to exhaust executable memory.
//
// codeSize is the size in bytes of the module's code section (0 if unknown).
static bool
GetTieringEnabled(uint32_t codeSize)
{
    if (!CanUseExtraThreads())
        return false;
    uint32_t cpuCount = HelperThreadState().cpuCount;
    MOZ_ASSERT(cpuCount > 0);
    // It's mostly sensible not to background compile when there's only one
    // hardware thread as we want foreground computation to have access to that.
    // However, if wasm background compilation helper threads can be given lower
    // priority then background compilation on single-core systems still makes
    // some kind of sense.  That said, this is a non-issue: as of September 2017
    // 1-core was down to 3.5% of our population and falling.
    if (cpuCount == 1)
        return false;
    MOZ_ASSERT(HelperThreadState().threadCount >= cpuCount);
    // Compute the max number of threads available to do actual background
    // compilation work.
    uint32_t workers = HelperThreadState().maxWasmCompilationThreads();
    // The number of cores we will use is bounded both by the CPU count and the
    // worker count.
    uint32_t cores = Min(cpuCount, workers);
    SystemClass cls = Classify();
    // Ion compilation on available cores must take long enough to be worth the
    // bother: compare the per-core bytecode load against the system-class
    // cutoff derived from tierCutoffMs.
    double cutoffSize = CodesizeCutoff(cls, codeSize);
    double effectiveCores = EffectiveCores(cls, cores);
    if ((codeSize / effectiveCores) < cutoffSize)
        return false;
    // Do not implement a size cutoff for 64-bit systems since the code size
    // budget for 64 bit is so large that it will hardly ever be an issue.
    // (Also the cutoff percentage might be different on 64-bit.)
#ifndef JS_64BIT
    // If the amount of executable code for baseline compilation jeopardizes the
    // availability of executable memory for ion code then do not tier, for now.
    //
    // TODO: For now we consider this module in isolation.  We should really
    // worry about what else is going on in this process and might be filling up
    // the code memory.  It's as if we need some kind of code memory reservation
    // system or JIT compilation for large modules.
    double ionRatio = IonBytesPerBytecode(cls);
    double baselineRatio = BaselineBytesPerBytecode(cls);
    double needMemory = codeSize * (ionRatio + baselineRatio);
    double availMemory = LikelyAvailableExecutableMemory();
    double cutoff = spaceCutoffPct * MaxCodeBytesPerProcess;
    // If the sum of baseline and ion code would make us exceed the set
    // percentage of executable memory, disable tiering.
    if ((MaxCodeBytesPerProcess - availMemory) + needMemory > cutoff)
        return false;
#endif
    return true;
}
SharedModule
@ -118,25 +386,30 @@ wasm::CompileInitialTier(const ShareableBytes& bytecode, const CompileArgs& args
bool debugEnabled = BaselineCanCompile() && args.debugEnabled;
bool ionEnabled = args.ionEnabled || !baselineEnabled;
CompileMode mode;
Tier tier;
DebugEnabled debug;
if (BackgroundWorkPossible() && baselineEnabled && ionEnabled && !debugEnabled) {
mode = CompileMode::Tier1;
tier = Tier::Baseline;
debug = DebugEnabled::False;
} else {
mode = CompileMode::Once;
tier = debugEnabled || !ionEnabled ? Tier::Baseline : Tier::Ion;
debug = debugEnabled ? DebugEnabled::True : DebugEnabled::False;
}
DebugEnabled debug = debugEnabled ? DebugEnabled::True : DebugEnabled::False;
ModuleEnvironment env(mode, tier, debug);
ModuleEnvironment env(ModuleEnvironment::UnknownMode, ModuleEnvironment::UnknownTier, debug);
Decoder d(bytecode.bytes, error);
if (!DecodeModuleEnvironment(d, &env))
return nullptr;
uint32_t codeSize;
if (!d.peekSectionSize(SectionId::Code, &env, "code", &codeSize))
codeSize = 0;
CompileMode mode;
Tier tier;
if (baselineEnabled && ionEnabled && !debugEnabled && GetTieringEnabled(codeSize)) {
mode = CompileMode::Tier1;
tier = Tier::Baseline;
} else {
mode = CompileMode::Once;
tier = debugEnabled || !ionEnabled ? Tier::Baseline : Tier::Ion;
}
env.setModeAndTier(mode, tier);
ModuleGenerator mg(args, &env, nullptr, error);
if (!mg.init())
return nullptr;

Просмотреть файл

@ -144,10 +144,10 @@ class CompileTask
return units_;
}
// Accessor for the environment's resolved compilation tier.
Tier tier() const {
    // Diff residue removed: the pre-patch direct field read (env_.tier) is
    // superseded by the accessor call.
    return env_.tier();
}
// Accessor for the environment's resolved compilation mode.
CompileMode mode() const {
    // Diff residue removed: the pre-patch direct field read (env_.mode) is
    // superseded by the accessor call.
    return env_.mode();
}
bool debugEnabled() const {
return env_.debug == DebugEnabled::True;
@ -243,8 +243,8 @@ class MOZ_STACK_CLASS ModuleGenerator
MOZ_MUST_USE bool initWasm();
bool isAsmJS() const { return env_->isAsmJS(); }
// Duplicate pre-patch definitions (direct field reads) removed; only the
// accessor-calling post-patch versions remain.
Tier tier() const { return env_->tier(); }
CompileMode mode() const { return env_->mode(); }
bool debugEnabled() const { return env_->debugEnabled(); }
public:

Просмотреть файл

@ -60,7 +60,7 @@ Decoder::fail(size_t errorOffset, const char* msg)
bool
Decoder::startSection(SectionId id, ModuleEnvironment* env, uint32_t* sectionStart,
uint32_t* sectionSize, const char* sectionName)
uint32_t* sectionSize, const char* sectionName, bool peeking)
{
// Record state at beginning of section to allow rewinding to this point
// if, after skipping through several custom sections, we don't find the
@ -85,8 +85,11 @@ Decoder::startSection(SectionId id, ModuleEnvironment* env, uint32_t* sectionSta
// Rewind to the beginning of the current section since this is what
// skipCustomSection() assumes.
cur_ = currentSectionStart;
if (!skipCustomSection(env))
if (!skipCustomSection(env)) {
if (peeking)
goto rewind;
return false;
}
// Having successfully skipped a custom section, consider the next
// section.
@ -97,22 +100,39 @@ Decoder::startSection(SectionId id, ModuleEnvironment* env, uint32_t* sectionSta
// Found it, now start the section.
if (!readVarU32(sectionSize) || bytesRemain() < *sectionSize)
if (!readVarU32(sectionSize) || bytesRemain() < *sectionSize) {
if (peeking)
goto rewind;
goto fail;
}
*sectionStart = cur_ - beg_;
if (peeking)
goto rewind_peeking;
return true;
rewind:
peeking = false;
rewind_peeking:
cur_ = initialCur;
env->customSections.shrinkTo(initialCustomSectionsLength);
*sectionStart = NotStarted;
if (!peeking)
*sectionStart = NotStarted;
return true;
fail:
return failf("failed to start %s section", sectionName);
}
// Report the payload size of section `id` without consuming any input: probe
// via startSection() in peeking mode, which rewinds the decoder before
// returning.  Returns false if the section header is malformed or the section
// is absent (startSection reports the latter via NotStarted).
bool
Decoder::peekSectionSize(SectionId id, ModuleEnvironment* env, const char* sectionName, uint32_t* sectionSize)
{
    uint32_t sectionStart;
    return startSection(id, env, &sectionStart, sectionSize, sectionName, /* peeking = */ true) &&
           sectionStart != NotStarted;
}
bool
Decoder::finishSection(uint32_t sectionStart, uint32_t sectionSize, const char* sectionName)
{

Просмотреть файл

@ -36,11 +36,14 @@ namespace wasm {
struct ModuleEnvironment
{
// Constant parameters for the entire compilation:
const CompileMode mode;
const Tier tier;
const DebugEnabled debug;
const ModuleKind kind;
// Constant parameters determined no later than at the start of the code
// section:
CompileMode mode_;
Tier tier_;
// Module fields filled out incrementally during decoding:
MemoryUsage memoryUsage;
Atomic<uint32_t> minMemoryLength;
@ -59,18 +62,35 @@ struct ModuleEnvironment
NameInBytecodeVector funcNames;
CustomSectionVector customSections;
static const CompileMode UnknownMode = (CompileMode)-1;
static const Tier UnknownTier = (Tier)-1;
// Construct with the given static parameters.  mode/tier may be passed as
// UnknownMode/UnknownTier and resolved later via setModeAndTier().
// (Diff residue removed: the pre-patch initializer list that referenced the
// deleted `mode`/`tier` members was interleaved with the post-patch one.)
explicit ModuleEnvironment(CompileMode mode = CompileMode::Once,
                           Tier tier = Tier::Ion,
                           DebugEnabled debug = DebugEnabled::False,
                           ModuleKind kind = ModuleKind::Wasm)
  : debug(debug),
    kind(kind),
    mode_(mode),
    tier_(tier),
    memoryUsage(MemoryUsage::None),
    minMemoryLength(0)
{}
// Return the compilation mode; asserts that it has been resolved (i.e. is not
// UnknownMode).
CompileMode mode() const {
    MOZ_ASSERT(mode_ != UnknownMode);
    return mode_;
}
// Return the compilation tier; asserts that it has been resolved (i.e. is not
// UnknownTier).
Tier tier() const {
    MOZ_ASSERT(tier_ != UnknownTier);
    return tier_;
}
// Resolve an initially-unknown mode and tier; asserts that both are still
// unresolved, so this may be called at most once per environment.
void setModeAndTier(CompileMode mode, Tier tier) {
    MOZ_ASSERT(mode_ == UnknownMode);
    MOZ_ASSERT(tier_ == UnknownTier);
    mode_ = mode;
    tier_ = tier;
}
size_t numTables() const {
return tables.length();
}
@ -546,10 +566,15 @@ class Decoder
ModuleEnvironment* env,
uint32_t* sectionStart,
uint32_t* sectionSize,
const char* sectionName);
const char* sectionName,
bool peeking = false);
MOZ_MUST_USE bool finishSection(uint32_t sectionStart,
uint32_t sectionSize,
const char* sectionName);
MOZ_MUST_USE bool peekSectionSize(SectionId id,
ModuleEnvironment* env,
const char* sectionName,
uint32_t* sectionSize);
// Custom sections do not cause validation errors unless the error is in
// the section header itself.