зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1316820 - Rabaldr, optimized zeroing of locals. patch=mds+lth, r=bbouvier
--HG-- extra : rebase_source : e9e61ceda40ca8d135568d19fa7a3d74ef194b42
This commit is contained in:
Родитель
ad11155416
Коммит
721e030388
|
@ -656,6 +656,7 @@ class BaseCompiler
|
|||
FuncOffsets finish();
|
||||
|
||||
MOZ_MUST_USE bool emitFunction();
|
||||
void emitInitStackLocals();
|
||||
|
||||
// Used by some of the ScratchRegister implementations.
|
||||
operator MacroAssembler&() const { return masm; }
|
||||
|
@ -2260,26 +2261,8 @@ class BaseCompiler
|
|||
}
|
||||
}
|
||||
|
||||
// Initialize the stack locals to zero.
|
||||
//
|
||||
// The following are all Bug 1316820:
|
||||
//
|
||||
// TODO / OPTIMIZE: on x64, at least, scratch will be a 64-bit
|
||||
// register and we can move 64 bits at a time.
|
||||
//
|
||||
// TODO / OPTIMIZE: On SSE2 or better SIMD systems we may be
|
||||
// able to store 128 bits at a time. (I suppose on some
|
||||
// systems we have 512-bit SIMD for that matter.)
|
||||
//
|
||||
// TODO / OPTIMIZE: if we have only one initializing store
|
||||
// then it's better to store a zero literal, probably.
|
||||
|
||||
if (varLow_ < varHigh_) {
|
||||
ScratchI32 scratch(*this);
|
||||
masm.mov(ImmWord(0), scratch);
|
||||
for (int32_t i = varLow_ ; i < varHigh_ ; i += 4)
|
||||
storeToFrameI32(scratch, i + 4);
|
||||
}
|
||||
if (varLow_ < varHigh_)
|
||||
emitInitStackLocals();
|
||||
|
||||
if (debugEnabled_)
|
||||
insertBreakablePoint(CallSiteDesc::EnterFrame);
|
||||
|
@ -7497,6 +7480,107 @@ BaseCompiler::emitFunction()
|
|||
return true;
|
||||
}
|
||||
|
||||
void
|
||||
BaseCompiler::emitInitStackLocals()
|
||||
{
|
||||
MOZ_ASSERT(varLow_ < varHigh_, "there should be stack locals to initialize");
|
||||
|
||||
static const uint32_t wordSize = sizeof(void*);
|
||||
|
||||
// A local's localOffset always points above it in the frame, so that when
|
||||
// translated to a stack address we end up with an address pointing to the
|
||||
// base of the local. Thus to go from a raw frame offset to an SP offset we
|
||||
// first add K to the frame offset to obtain a localOffset for a slot of
|
||||
// size K, and then map that to an SP offset. Hence all the adjustments to
|
||||
// `low` in the offset calculations below.
|
||||
|
||||
// On 64-bit systems we may have 32-bit alignment for the local area as it
|
||||
// may be preceded by parameters and prologue/debug data.
|
||||
|
||||
uint32_t low = varLow_;
|
||||
if (low % wordSize) {
|
||||
masm.store32(Imm32(0), Address(StackPointer, localOffsetToSPOffset(low + 4)));
|
||||
low += 4;
|
||||
}
|
||||
MOZ_ASSERT(low % wordSize == 0);
|
||||
|
||||
const uint32_t high = AlignBytes(varHigh_, wordSize);
|
||||
MOZ_ASSERT(high <= uint32_t(localSize_), "localSize_ should be aligned at least that");
|
||||
|
||||
// An unrollLimit of 16 is chosen so that we only need an 8-bit signed
|
||||
// immediate to represent the offset in the store instructions in the loop
|
||||
// on x64.
|
||||
|
||||
const uint32_t unrollLimit = 16;
|
||||
const uint32_t initWords = (high - low) / wordSize;
|
||||
const uint32_t tailWords = initWords % unrollLimit;
|
||||
const uint32_t loopHigh = high - (tailWords * wordSize);
|
||||
|
||||
// With only one word to initialize, just store an immediate zero.
|
||||
|
||||
if (initWords == 1) {
|
||||
masm.storePtr(ImmWord(0), Address(StackPointer, localOffsetToSPOffset(low + wordSize)));
|
||||
return;
|
||||
}
|
||||
|
||||
// For other cases, it's best to have a zero in a register.
|
||||
//
|
||||
// One can do more here with SIMD registers (store 16 bytes at a time) or
|
||||
// with instructions like STRD on ARM (store 8 bytes at a time), but that's
|
||||
// for another day.
|
||||
|
||||
RegI32 zero = needI32();
|
||||
masm.mov(ImmWord(0), zero);
|
||||
|
||||
// For the general case we want to have a loop body of unrollLimit stores
|
||||
// and then a tail of less than unrollLimit stores. When initWords is less
|
||||
// than 2*unrollLimit the loop trip count is at most 1 and there is no
|
||||
// benefit to having the pointer calculations and the compare-and-branch.
|
||||
// So we completely unroll when we have initWords < 2 * unrollLimit. (In
|
||||
// this case we'll end up using 32-bit offsets on x64 for up to half of the
|
||||
// stores, though.)
|
||||
|
||||
// Fully-unrolled case.
|
||||
|
||||
if (initWords < 2 * unrollLimit) {
|
||||
for (uint32_t i = low; i < high; i += wordSize)
|
||||
masm.storePtr(zero, Address(StackPointer, localOffsetToSPOffset(i + wordSize)));
|
||||
freeGPR(zero);
|
||||
return;
|
||||
}
|
||||
|
||||
// Unrolled loop with a tail. Stores will use negative offsets. That's OK
|
||||
// for x86 and ARM, at least.
|
||||
|
||||
// Compute pointer to the highest-addressed slot on the frame.
|
||||
RegI32 p = needI32();
|
||||
masm.computeEffectiveAddress(Address(StackPointer, localOffsetToSPOffset(low + wordSize)),
|
||||
p);
|
||||
|
||||
// Compute pointer to the lowest-addressed slot on the frame that will be
|
||||
// initialized by the loop body.
|
||||
RegI32 lim = needI32();
|
||||
masm.computeEffectiveAddress(Address(StackPointer,
|
||||
localOffsetToSPOffset(loopHigh + wordSize)),
|
||||
lim);
|
||||
|
||||
// The loop body. Eventually we'll have p == lim and exit the loop.
|
||||
Label again;
|
||||
masm.bind(&again);
|
||||
for (uint32_t i = 0; i < unrollLimit; ++i)
|
||||
masm.storePtr(zero, Address(p, -(wordSize * i)));
|
||||
masm.subPtr(Imm32(unrollLimit * wordSize), p);
|
||||
masm.branchPtr(Assembler::LessThan, lim, p, &again);
|
||||
|
||||
// The tail.
|
||||
for (uint32_t i = 0; i < tailWords; ++i)
|
||||
masm.storePtr(zero, Address(p, -(wordSize * i)));
|
||||
|
||||
freeGPR(p);
|
||||
freeGPR(lim);
|
||||
freeGPR(zero);
|
||||
}
|
||||
|
||||
BaseCompiler::BaseCompiler(const ModuleEnvironment& env,
|
||||
Decoder& decoder,
|
||||
const FuncBytes& func,
|
||||
|
|
Загрузка…
Ссылка в новой задаче