Bug 1316820 - Rabaldr, optimized zeroing of locals. patch=mds+lth, r=bbouvier

--HG--
extra : rebase_source : e9e61ceda40ca8d135568d19fa7a3d74ef194b42
This commit is contained in:
Lars T Hansen 2017-08-23 12:09:48 +02:00
Parent ad11155416
Commit 721e030388
1 changed file with 104 additions and 20 deletions

View file

@@ -656,6 +656,7 @@ class BaseCompiler
     FuncOffsets finish();
     MOZ_MUST_USE bool emitFunction();
+    void emitInitStackLocals();

     // Used by some of the ScratchRegister implementations.
     operator MacroAssembler&() const { return masm; }
@@ -2260,26 +2261,8 @@ class BaseCompiler
         }
     }

-        // Initialize the stack locals to zero.
-        //
-        // The following are all Bug 1316820:
-        //
-        // TODO / OPTIMIZE: on x64, at least, scratch will be a 64-bit
-        // register and we can move 64 bits at a time.
-        //
-        // TODO / OPTIMIZE: On SSE2 or better SIMD systems we may be
-        // able to store 128 bits at a time.  (I suppose on some
-        // systems we have 512-bit SIMD for that matter.)
-        //
-        // TODO / OPTIMIZE: if we have only one initializing store
-        // then it's better to store a zero literal, probably.
-        if (varLow_ < varHigh_) {
-            ScratchI32 scratch(*this);
-            masm.mov(ImmWord(0), scratch);
-            for (int32_t i = varLow_ ; i < varHigh_ ; i += 4)
-                storeToFrameI32(scratch, i + 4);
-        }
+        if (varLow_ < varHigh_)
+            emitInitStackLocals();

         if (debugEnabled_)
             insertBreakablePoint(CallSiteDesc::EnterFrame);
@@ -7497,6 +7480,107 @@ BaseCompiler::emitFunction()
     return true;
 }
// Emit machine code that zero-initializes the stack-allocated locals of the
// current function's frame, i.e. the frame-offset range [varLow_, varHigh_).
// The caller guarantees varLow_ < varHigh_ (asserted below).
//
// Strategy: align the range to the machine word size, then pick one of three
// code shapes by size — a single immediate store, a fully unrolled run of
// register stores, or an unrolled loop of `unrollLimit` stores plus a tail.
void
BaseCompiler::emitInitStackLocals()
{
    MOZ_ASSERT(varLow_ < varHigh_, "there should be stack locals to initialize");

    static const uint32_t wordSize = sizeof(void*);

    // A local's localOffset always points above it in the frame, so that when
    // translated to a stack address we end up with an address pointing to the
    // base of the local.  Thus to go from a raw frame offset to an SP offset we
    // first add K to the frame offset to obtain a localOffset for a slot of
    // size K, and then map that to an SP offset.  Hence all the adjustments to
    // `low` in the offset calculations below.

    // On 64-bit systems we may have 32-bit alignment for the local area as it
    // may be preceded by parameters and prologue/debug data.  Emit one 32-bit
    // store to word-align the start of the range before the word-sized stores.
    uint32_t low = varLow_;
    if (low % wordSize) {
        masm.store32(Imm32(0), Address(StackPointer, localOffsetToSPOffset(low + 4)));
        low += 4;
    }
    MOZ_ASSERT(low % wordSize == 0);

    // Round the top of the range up to a word boundary; the assertion checks
    // the rounded range still lies within the reserved local area.
    const uint32_t high = AlignBytes(varHigh_, wordSize);
    MOZ_ASSERT(high <= uint32_t(localSize_), "localSize_ should be aligned at least that");

    // An unrollLimit of 16 is chosen so that we only need an 8-bit signed
    // immediate to represent the offset in the store instructions in the loop
    // on x64.
    const uint32_t unrollLimit = 16;
    const uint32_t initWords = (high - low) / wordSize;   // word-sized slots to zero
    const uint32_t tailWords = initWords % unrollLimit;   // slots left over after the loop
    const uint32_t loopHigh = high - (tailWords * wordSize); // frame offset where the loop stops

    // With only one word to initialize, just store an immediate zero.
    if (initWords == 1) {
        masm.storePtr(ImmWord(0), Address(StackPointer, localOffsetToSPOffset(low + wordSize)));
        return;
    }

    // For other cases, it's best to have a zero in a register.
    //
    // One can do more here with SIMD registers (store 16 bytes at a time) or
    // with instructions like STRD on ARM (store 8 bytes at a time), but that's
    // for another day.
    RegI32 zero = needI32();
    masm.mov(ImmWord(0), zero);

    // For the general case we want to have a loop body of unrollLimit stores
    // and then a tail of less than unrollLimit stores.  When initWords is less
    // than 2*unrollLimit the loop trip count is at most 1 and there is no
    // benefit to having the pointer calculations and the compare-and-branch.
    // So we completely unroll when we have initWords < 2 * unrollLimit.  (In
    // this case we'll end up using 32-bit offsets on x64 for up to half of the
    // stores, though.)

    // Fully-unrolled case: one straight-line store per word.
    if (initWords < 2 * unrollLimit) {
        for (uint32_t i = low; i < high; i += wordSize)
            masm.storePtr(zero, Address(StackPointer, localOffsetToSPOffset(i + wordSize)));
        freeGPR(zero);
        return;
    }

    // Unrolled loop with a tail.  Stores will use negative offsets.  That's OK
    // for x86 and ARM, at least.

    // Compute pointer to the highest-addressed slot on the frame.
    RegI32 p = needI32();
    masm.computeEffectiveAddress(Address(StackPointer, localOffsetToSPOffset(low + wordSize)),
                                 p);

    // Compute pointer to the lowest-addressed slot on the frame that will be
    // initialized by the loop body.
    RegI32 lim = needI32();
    masm.computeEffectiveAddress(Address(StackPointer,
                                         localOffsetToSPOffset(loopHigh + wordSize)),
                                 lim);

    // The loop body.  Eventually we'll have p == lim and exit the loop.
    // Each trip zeroes unrollLimit words downward from p, then steps p down;
    // the backward branch is taken while lim < p, so the loop covers exactly
    // (initWords - tailWords) words.
    Label again;
    masm.bind(&again);
    for (uint32_t i = 0; i < unrollLimit; ++i)
        masm.storePtr(zero, Address(p, -(wordSize * i)));
    masm.subPtr(Imm32(unrollLimit * wordSize), p);
    masm.branchPtr(Assembler::LessThan, lim, p, &again);

    // The tail: on exit p == lim, so zero the remaining tailWords words
    // downward from there.
    for (uint32_t i = 0; i < tailWords; ++i)
        masm.storePtr(zero, Address(p, -(wordSize * i)));

    freeGPR(p);
    freeGPR(lim);
    freeGPR(zero);
}
 BaseCompiler::BaseCompiler(const ModuleEnvironment& env,
                            Decoder& decoder,
                            const FuncBytes& func,