Bug 1316820 - Rabaldr, optimized zeroing of locals. patch=mds+lth, r=bbouvier

--HG-- extra : rebase_source : e9e61ceda40ca8d135568d19fa7a3d74ef194b42
2017-08-23 12:09:48 +02:00 · 2017-08-23 12:09:48 +02:00 · 721e030388
--- a/js/src/wasm/WasmBaselineCompile.cpp
+++ b/js/src/wasm/WasmBaselineCompile.cpp
@ -656,6 +656,7 @@ class BaseCompiler
    FuncOffsets finish();

    MOZ_MUST_USE bool emitFunction();
+    void emitInitStackLocals();

    // Used by some of the ScratchRegister implementations.
    operator MacroAssembler&() const { return masm; }
@ -2260,26 +2261,8 @@ class BaseCompiler
            }
        }

-        // Initialize the stack locals to zero.
-        //
-        // The following are all Bug 1316820:
-        //
-        // TODO / OPTIMIZE: on x64, at least, scratch will be a 64-bit
-        // register and we can move 64 bits at a time.
-        //
-        // TODO / OPTIMIZE: On SSE2 or better SIMD systems we may be
-        // able to store 128 bits at a time.  (I suppose on some
-        // systems we have 512-bit SIMD for that matter.)
-        //
-        // TODO / OPTIMIZE: if we have only one initializing store
-        // then it's better to store a zero literal, probably.
-
-        if (varLow_ < varHigh_) {
-            ScratchI32 scratch(*this);
-            masm.mov(ImmWord(0), scratch);
-            for (int32_t i = varLow_ ; i < varHigh_ ; i += 4)
-                storeToFrameI32(scratch, i + 4);
-        }
+        if (varLow_ < varHigh_)
+            emitInitStackLocals();

        if (debugEnabled_)
            insertBreakablePoint(CallSiteDesc::EnterFrame);
@ -7497,6 +7480,107 @@ BaseCompiler::emitFunction()
    return true;
 }

+void
+BaseCompiler::emitInitStackLocals()
+{
+    MOZ_ASSERT(varLow_ < varHigh_, "there should be stack locals to initialize");
+
+    static const uint32_t wordSize = sizeof(void*);
+
+    // A local's localOffset always points above it in the frame, so that when
+    // translated to a stack address we end up with an address pointing to the
+    // base of the local.  Thus to go from a raw frame offset to an SP offset we
+    // first add K to the frame offset to obtain a localOffset for a slot of
+    // size K, and then map that to an SP offset.  Hence all the adjustments to
+    // `low` in the offset calculations below.
+
+    // On 64-bit systems we may have 32-bit alignment for the local area as it
+    // may be preceded by parameters and prologue/debug data.
+
+    uint32_t low = varLow_;
+    if (low % wordSize) {
+        masm.store32(Imm32(0), Address(StackPointer, localOffsetToSPOffset(low + 4)));
+        low += 4;
+    }
+    MOZ_ASSERT(low % wordSize == 0);
+
+    const uint32_t high = AlignBytes(varHigh_, wordSize);
+    MOZ_ASSERT(high <= uint32_t(localSize_), "localSize_ should be aligned at least that");
+
+    // An unrollLimit of 16 is chosen so that we only need an 8-bit signed
+    // immediate to represent the offset in the store instructions in the loop
+    // on x64.
+
+    const uint32_t unrollLimit = 16;
+    const uint32_t initWords = (high - low) / wordSize;
+    const uint32_t tailWords = initWords % unrollLimit;
+    const uint32_t loopHigh = high - (tailWords * wordSize);
+
+    // With only one word to initialize, just store an immediate zero.
+
+    if (initWords == 1) {
+        masm.storePtr(ImmWord(0), Address(StackPointer, localOffsetToSPOffset(low + wordSize)));
+        return;
+    }
+
+    // For other cases, it's best to have a zero in a register.
+    //
+    // One can do more here with SIMD registers (store 16 bytes at a time) or
+    // with instructions like STRD on ARM (store 8 bytes at a time), but that's
+    // for another day.
+
+    RegI32 zero = needI32();
+    masm.mov(ImmWord(0), zero);
+
+    // For the general case we want to have a loop body of unrollLimit stores
+    // and then a tail of less than unrollLimit stores.  When initWords is less
+    // than 2*unrollLimit the loop trip count is at most 1 and there is no
+    // benefit to having the pointer calculations and the compare-and-branch.
+    // So we completely unroll when we have initWords < 2 * unrollLimit.  (In
+    // this case we'll end up using 32-bit offsets on x64 for up to half of the
+    // stores, though.)
+
+    // Fully-unrolled case.
+
+    if (initWords < 2 * unrollLimit)  {
+        for (uint32_t i = low; i < high; i += wordSize)
+            masm.storePtr(zero, Address(StackPointer, localOffsetToSPOffset(i + wordSize)));
+        freeGPR(zero);
+        return;
+    }
+
+    // Unrolled loop with a tail. Stores will use negative offsets. That's OK
+    // for x86 and ARM, at least.
+
+    // Compute pointer to the highest-addressed slot on the frame.
+    RegI32 p = needI32();
+    masm.computeEffectiveAddress(Address(StackPointer, localOffsetToSPOffset(low + wordSize)),
+                                 p);
+
+    // Compute pointer to the lowest-addressed slot on the frame that will be
+    // initialized by the loop body.
+    RegI32 lim = needI32();
+    masm.computeEffectiveAddress(Address(StackPointer,
+                                         localOffsetToSPOffset(loopHigh + wordSize)),
+                                 lim);
+
+    // The loop body.  Eventually we'll have p == lim and exit the loop.
+    Label again;
+    masm.bind(&again);
+    for (uint32_t i = 0; i < unrollLimit; ++i)
+        masm.storePtr(zero, Address(p, -(wordSize * i)));
+    masm.subPtr(Imm32(unrollLimit * wordSize), p);
+    masm.branchPtr(Assembler::LessThan, lim, p, &again);
+
+    // The tail.
+    for (uint32_t i = 0; i < tailWords; ++i)
+        masm.storePtr(zero, Address(p, -(wordSize * i)));
+
+    freeGPR(p);
+    freeGPR(lim);
+    freeGPR(zero);
+}
+
 BaseCompiler::BaseCompiler(const ModuleEnvironment& env,
                           Decoder& decoder,
                           const FuncBytes& func,