Bug 916167 - IonMonkey: Explicitly break dependencies from partial-writes to xmm registers. r=jandem

2013-09-17 15:27:27 -07:00 · 2013-09-17 15:27:27 -07:00 · 93e8d15780
--- a/js/src/jit/shared/CodeGenerator-x86-shared.cpp
+++ b/js/src/jit/shared/CodeGenerator-x86-shared.cpp
@ -1365,7 +1365,7 @@ CodeGeneratorX86Shared::visitFloor(LFloor *lir)
                return false;

            // Test whether the input double was integer-valued.
-            masm.cvtsi2sd(output, scratch);
+            masm.convertInt32ToDouble(output, scratch);
            masm.branchDouble(Assembler::DoubleEqualOrUnordered, input, scratch, &end);

            // Input is not integer-valued, so we rounded off-by-one in the
@ -1453,7 +1453,7 @@ CodeGeneratorX86Shared::visitRound(LRound *lir)
                return false;

            // Test whether the truncated double was integer-valued.
-            masm.cvtsi2sd(output, scratch);
+            masm.convertInt32ToDouble(output, scratch);
            masm.branchDouble(Assembler::DoubleEqualOrUnordered, temp, scratch, &end);

            // Input is not integer-valued, so we rounded off-by-one in the
--- a/js/src/jit/shared/MacroAssembler-x86-shared.h
+++ b/js/src/jit/shared/MacroAssembler-x86-shared.h
@ -240,19 +240,38 @@ class MacroAssemblerX86Shared : public Assembler
    }

    void convertInt32ToDouble(const Register &src, const FloatRegister &dest) {
+        // cvtsi2sd and friends write only part of their output register, which
+        // creates a false dependency on its old value and causes slowdowns on
+        // out-of-order processors. Explicitly break the dependency with xorpd
+        // (and xorps elsewhere), a zeroing idiom that modern CPUs special-case.
+        // See sections 8.14, 9.8, 10.8, 12.9, 13.16, 14.14, and 15.8 of Agner
+        // Fog's microarchitecture manual.
+        zeroDouble(dest);
        cvtsi2sd(src, dest);
    }
    void convertInt32ToDouble(const Address &src, FloatRegister dest) {
+        convertInt32ToDouble(Operand(src), dest);
+    }
+    void convertInt32ToDouble(const Operand &src, FloatRegister dest) {
+        // Clear the output register first to break dependencies; see above.
+        zeroDouble(dest);
        cvtsi2sd(Operand(src), dest);
    }
    void convertInt32ToFloat32(const Register &src, const FloatRegister &dest) {
+        // Clear the output register first to break dependencies; see above.
+        zeroFloat32(dest);
        cvtsi2ss(src, dest);
    }
    void convertInt32ToFloat32(const Address &src, FloatRegister dest) {
-        cvtsi2ss(Operand(src), dest);
+        convertInt32ToFloat32(Operand(src), dest);
+    }
+    void convertInt32ToFloat32(const Operand &src, FloatRegister dest) {
+        // Clear the output register first to break dependencies; see above.
+        zeroFloat32(dest);
+        cvtsi2ss(src, dest);
    }
    Condition testDoubleTruthy(bool truthy, const FloatRegister &reg) {
-        xorpd(ScratchFloatReg, ScratchFloatReg);
+        zeroDouble(ScratchFloatReg);
        ucomisd(ScratchFloatReg, reg);
        return truthy ? NonZero : Zero;
    }
@ -325,6 +344,9 @@ class MacroAssemblerX86Shared : public Assembler
    void zeroDouble(FloatRegister reg) {
        xorpd(reg, reg);
    }
+    void zeroFloat32(FloatRegister reg) {
+        xorps(reg, reg);
+    }
    void negateDouble(FloatRegister reg) {
        // From MacroAssemblerX86Shared::maybeInlineDouble
        pcmpeqw(ScratchFloatReg, ScratchFloatReg);
--- a/js/src/jit/x64/MacroAssembler-x64.h
+++ b/js/src/jit/x64/MacroAssembler-x64.h
@ -984,7 +984,7 @@ class MacroAssemblerX64 : public MacroAssemblerX86Shared
        if (dest.isFloat()) {
            Label notInt32, end;
            branchTestInt32(Assembler::NotEqual, src, &notInt32);
-            cvtsi2sd(src.valueReg(), dest.fpu());
+            convertInt32ToDouble(src.valueReg(), dest.fpu());
            jump(&end);
            bind(&notInt32);
            unboxDouble(src, dest.fpu());
@ -996,17 +996,17 @@ class MacroAssemblerX64 : public MacroAssemblerX86Shared

    // These two functions use the low 32-bits of the full value register.
    void boolValueToDouble(const ValueOperand &operand, const FloatRegister &dest) {
-        cvtsi2sd(operand.valueReg(), dest);
+        convertInt32ToDouble(operand.valueReg(), dest);
    }
    void int32ValueToDouble(const ValueOperand &operand, const FloatRegister &dest) {
-        cvtsi2sd(operand.valueReg(), dest);
+        convertInt32ToDouble(operand.valueReg(), dest);
    }

    void boolValueToFloat32(const ValueOperand &operand, const FloatRegister &dest) {
-        cvtsi2ss(operand.valueReg(), dest);
+        convertInt32ToFloat32(operand.valueReg(), dest);
    }
    void int32ValueToFloat32(const ValueOperand &operand, const FloatRegister &dest) {
-        cvtsi2ss(operand.valueReg(), dest);
+        convertInt32ToFloat32(operand.valueReg(), dest);
    }

    void loadConstantDouble(double d, const FloatRegister &dest);
@ -1053,7 +1053,7 @@ class MacroAssemblerX64 : public MacroAssemblerX86Shared
    void loadInt32OrDouble(const Operand &operand, const FloatRegister &dest) {
        Label notInt32, end;
        branchTestInt32(Assembler::NotEqual, operand, &notInt32);
-        cvtsi2sd(operand, dest);
+        convertInt32ToDouble(operand, dest);
        jump(&end);
        bind(&notInt32);
        movsd(operand, dest);
--- a/js/src/jit/x86/MacroAssembler-x86.h
+++ b/js/src/jit/x86/MacroAssembler-x86.h
@ -808,7 +808,7 @@ class MacroAssemblerX86 : public MacroAssemblerX86Shared
        if (dest.isFloat()) {
            Label notInt32, end;
            branchTestInt32(Assembler::NotEqual, src, &notInt32);
-            cvtsi2sd(src.payloadReg(), dest.fpu());
+            convertInt32ToDouble(src.payloadReg(), dest.fpu());
            jump(&end);
            bind(&notInt32);
            unboxDouble(src, dest.fpu());
@ -852,16 +852,16 @@ class MacroAssemblerX86 : public MacroAssemblerX86Shared
    }

    void boolValueToDouble(const ValueOperand &operand, const FloatRegister &dest) {
-        cvtsi2sd(operand.payloadReg(), dest);
+        convertInt32ToDouble(operand.payloadReg(), dest);
    }
    void boolValueToFloat32(const ValueOperand &operand, const FloatRegister &dest) {
-        cvtsi2ss(operand.payloadReg(), dest);
+        convertInt32ToFloat32(operand.payloadReg(), dest);
    }
    void int32ValueToDouble(const ValueOperand &operand, const FloatRegister &dest) {
-        cvtsi2sd(operand.payloadReg(), dest);
+        convertInt32ToDouble(operand.payloadReg(), dest);
    }
    void int32ValueToFloat32(const ValueOperand &operand, const FloatRegister &dest) {
-        cvtsi2ss(operand.payloadReg(), dest);
+        convertInt32ToFloat32(operand.payloadReg(), dest);
    }

    void loadConstantDouble(double d, const FloatRegister &dest);
@ -903,7 +903,7 @@ class MacroAssemblerX86 : public MacroAssemblerX86Shared
    void loadInt32OrDouble(const Operand &operand, const FloatRegister &dest) {
        Label notInt32, end;
        branchTestInt32(Assembler::NotEqual, operand, &notInt32);
-        cvtsi2sd(ToPayload(operand), dest);
+        convertInt32ToDouble(ToPayload(operand), dest);
        jump(&end);
        bind(&notInt32);
        movsd(operand, dest);
@ -953,7 +953,7 @@ class MacroAssemblerX86 : public MacroAssemblerX86Shared
        subl(Imm32(0x80000000), src);

        // Now src is [-2^31, 2^31-1] - int range, but not the same value.
-        cvtsi2sd(src, dest);
+        convertInt32ToDouble(src, dest);

        // dest is now a double with the int range.
        // correct the double value by adding 0x80000000.
@ -966,7 +966,7 @@ class MacroAssemblerX86 : public MacroAssemblerX86Shared
        subl(Imm32(0x80000000), src);

        // Do it the GCC way
-        cvtsi2ss(src, dest);
+        convertInt32ToFloat32(src, dest);

        // dest is now a double with the int range.
        // correct the double value by adding 0x80000000.