Bug 1108825: Implement SIMD.int32x4.mul in Odin; r=sunfish

2014-12-11 12:10:35 +01:00 · 2014-12-11 12:10:35 +01:00 · fc140a436a
--- a/js/src/builtin/SIMD.h
+++ b/js/src/builtin/SIMD.h
@ -140,7 +140,6 @@
    _(reciprocalSqrt)                \
    _(fromInt32x4)                   \
    _(fromInt32x4Bits)               \
-    _(mul)                           \
    _(div)                           \
    _(max)                           \
    _(min)                           \
@ -149,6 +148,7 @@
 #define FOREACH_COMMONX4_SIMD_OP(_)  \
    _(add)                           \
    _(sub)                           \
+    _(mul)                           \
    _(lessThan)                      \
    _(lessThanOrEqual)               \
    _(equal)                         \
--- a/js/src/jit-test/tests/asm.js/testSIMD.js
+++ b/js/src/jit-test/tests/asm.js/testSIMD.js
@ -12,6 +12,7 @@ if (!isSimdAvailable() || typeof SIMD === 'undefined') {
 const I32 = 'var i4 = glob.SIMD.int32x4;'
 const I32A = 'var i4a = i4.add;'
 const I32S = 'var i4s = i4.sub;'
+const I32M = 'var i4m = i4.mul;'
 const F32 = 'var f4 = glob.SIMD.float32x4;'
 const F32A = 'var f4a = f4.add;'
 const F32S = 'var f4s = f4.sub;'
@ -453,9 +454,19 @@ CheckF4(F32S, 'var x=f4(13.37,2,3,4); var y=f4(4,3,5,2); x=f4s(x,y)', [Math.frou
 CheckF4(F32S, 'var x=f4(13.37,2,3,4); var y=f4(4,3,5,2); x=f4(f4s(x,y))', [Math.fround(13.37) - 4,-1,-2,2]);

 // 2.3.3. Multiplications / Divisions
-assertAsmTypeFail('glob', USE_ASM + I32 + "var f4m=i4.mul; function f() {} return f");
 assertAsmTypeFail('glob', USE_ASM + I32 + "var f4d=i4.div; function f() {} return f");

+CheckI4(I32M, 'var x=i4(1,2,3,4); var y=i4(-1,1,0,2); x=i4m(x,y)', [-1,2,0,8]);
+CheckI4(I32M, 'var x=i4(5,4,3,2); var y=i4(1,2,3,4); x=i4m(x,y)', [5,8,9,8]);
+CheckI4(I32M, 'var x=i4(1,2,3,4); x=i4m(x,x)', [1,4,9,16]);
+(function() {
+    var m = INT32_MIN, M = INT32_MAX, imul = Math.imul;
+    CheckI4(I32M, `var x=i4(${m},${m}, ${M}, ${M}); var y=i4(2,-3,4,-5); x=i4m(x,y)`,
+            [imul(m, 2), imul(m, -3), imul(M, 4), imul(M, -5)]);
+    CheckI4(I32M, `var x=i4(${m},${m}, ${M}, ${M}); var y=i4(${m}, ${M}, ${m}, ${M}); x=i4m(x,y)`,
+            [imul(m, m), imul(m, M), imul(M, m), imul(M, M)]);
+})();
+
 CheckF4(F32M, 'var x=f4(1,2,3,4); x=f4m(x,x)', [1,4,9,16]);
 CheckF4(F32M, 'var x=f4(1,2,3,4); var y=f4(4,3,5,2); x=f4m(x,y)', [4,6,15,8]);
 CheckF4(F32M, 'var x=f4(13.37,2,3,4); var y=f4(4,3,5,2); x=f4m(x,y)', [Math.fround(13.37) * 4,6,15,8]);
--- a/js/src/jit/LIR-Common.h
+++ b/js/src/jit/LIR-Common.h
@ -366,8 +366,7 @@ class LSimdBinaryCompFx4 : public LSimdBinaryComp
 };

 // Binary SIMD arithmetic operation between two SIMD operands
-template<size_t Temps>
-class LSimdBinaryArith : public LInstructionHelper<1, 2, Temps>
+class LSimdBinaryArith : public LInstructionHelper<1, 2, 1>
 {
  public:
    LSimdBinaryArith() {}
@ -378,6 +377,9 @@ class LSimdBinaryArith : public LInstructionHelper<1, 2, Temps>
    const LAllocation *rhs() {
        return this->getOperand(1);
    }
+    const LDefinition *temp() {
+        return getTemp(0);
+    }

    MSimdBinaryArith::Operation operation() const {
        return this->mir_->toSimdBinaryArith()->operation();
@ -388,23 +390,19 @@ class LSimdBinaryArith : public LInstructionHelper<1, 2, Temps>
 };

 // Binary SIMD arithmetic operation between two Int32x4 operands
-class LSimdBinaryArithIx4 : public LSimdBinaryArith<0>
+class LSimdBinaryArithIx4 : public LSimdBinaryArith
 {
  public:
    LIR_HEADER(SimdBinaryArithIx4);
-    LSimdBinaryArithIx4() : LSimdBinaryArith<0>() {}
+    LSimdBinaryArithIx4() : LSimdBinaryArith() {}
 };

 // Binary SIMD arithmetic operation between two Float32x4 operands
-class LSimdBinaryArithFx4 : public LSimdBinaryArith<1>
+class LSimdBinaryArithFx4 : public LSimdBinaryArith
 {
  public:
    LIR_HEADER(SimdBinaryArithFx4);
-    LSimdBinaryArithFx4() : LSimdBinaryArith<1>() {}
-
-    const LDefinition *temp() {
-        return getTemp(0);
-    }
+    LSimdBinaryArithFx4() : LSimdBinaryArith() {}
 };

 // Unary SIMD arithmetic operation on a SIMD operand
--- a/js/src/jit/Lowering.cpp
+++ b/js/src/jit/Lowering.cpp
@ -631,63 +631,6 @@ ReorderComparison(JSOp op, MDefinition **lhsp, MDefinition **rhsp)
    return op;
 }

-static bool
-ShouldReorderCommutative(MDefinition *lhs, MDefinition *rhs, MInstruction *ins)
-{
-    // lhs and rhs are used by the commutative operator.
-    MOZ_ASSERT(lhs->hasDefUses());
-    MOZ_ASSERT(rhs->hasDefUses());
-
-    // Ensure that if there is a constant, then it is in rhs.
-    if (rhs->isConstant())
-        return false;
-    if (lhs->isConstant())
-        return true;
-
-    // Since clobbering binary operations clobber the left operand, prefer a
-    // non-constant lhs operand with no further uses. To be fully precise, we
-    // should check whether this is the *last* use, but checking hasOneDefUse()
-    // is a decent approximation which doesn't require any extra analysis.
-    bool rhsSingleUse = rhs->hasOneDefUse();
-    bool lhsSingleUse = lhs->hasOneDefUse();
-    if (rhsSingleUse) {
-        if (!lhsSingleUse)
-            return true;
-    } else {
-        if (lhsSingleUse)
-            return false;
-    }
-
-    // If this is a reduction-style computation, such as
-    //
-    //   sum = 0;
-    //   for (...)
-    //      sum += ...;
-    //
-    // put the phi on the left to promote coalescing. This is fairly specific.
-    if (rhsSingleUse &&
-        rhs->isPhi() &&
-        rhs->block()->isLoopHeader() &&
-        ins == rhs->toPhi()->getLoopBackedgeOperand())
-    {
-        return true;
-    }
-
-    return false;
-}
-
-static void
-ReorderCommutative(MDefinition **lhsp, MDefinition **rhsp, MInstruction *ins)
-{
-    MDefinition *lhs = *lhsp;
-    MDefinition *rhs = *rhsp;
-
-    if (ShouldReorderCommutative(lhs, rhs, ins)) {
-        *rhsp = lhs;
-        *lhsp = rhs;
-    }
-}
-
 void
 LIRGenerator::visitTest(MTest *test)
 {
@ -4083,34 +4026,6 @@ LIRGenerator::visitSimdBinaryComp(MSimdBinaryComp *ins)
    }
 }

-void
-LIRGenerator::visitSimdBinaryArith(MSimdBinaryArith *ins)
-{
-    MOZ_ASSERT(IsSimdType(ins->type()));
-
-    MDefinition *lhs = ins->lhs();
-    MDefinition *rhs = ins->rhs();
-
-    if (ins->isCommutative())
-        ReorderCommutative(&lhs, &rhs, ins);
-
-    if (ins->type() == MIRType_Int32x4) {
-        lowerForFPU(new(alloc()) LSimdBinaryArithIx4(), ins, lhs, rhs);
-        return;
-    }
-
-    MOZ_ASSERT(ins->type() == MIRType_Float32x4, "unknown simd type on binary arith operation");
-
-    LSimdBinaryArithFx4 *lir = new(alloc()) LSimdBinaryArithFx4();
-
-    bool needsTemp = ins->operation() == MSimdBinaryArith::Max ||
-                     ins->operation() == MSimdBinaryArith::MinNum ||
-                     ins->operation() == MSimdBinaryArith::MaxNum;
-    lir->setTemp(0, needsTemp ? temp(LDefinition::FLOAT32X4) : LDefinition::BogusTemp());
-
-    lowerForFPU(lir, ins, lhs, rhs);
-}
-
 void
 LIRGenerator::visitSimdBinaryBitwise(MSimdBinaryBitwise *ins)
 {
--- a/js/src/jit/Lowering.h
+++ b/js/src/jit/Lowering.h
@ -283,7 +283,6 @@ class LIRGenerator : public LIRGeneratorSpecific
    void visitSimdShuffle(MSimdShuffle *ins);
    void visitSimdUnaryArith(MSimdUnaryArith *ins);
    void visitSimdBinaryComp(MSimdBinaryComp *ins);
-    void visitSimdBinaryArith(MSimdBinaryArith *ins);
    void visitSimdBinaryBitwise(MSimdBinaryBitwise *ins);
    void visitSimdShift(MSimdShift *ins);
    void visitSimdConstant(MSimdConstant *ins);
--- a/js/src/jit/MIR.h
+++ b/js/src/jit/MIR.h
@ -1883,7 +1883,7 @@ class MSimdBinaryArith : public MBinaryInstruction
    MSimdBinaryArith(MDefinition *left, MDefinition *right, Operation op, MIRType type)
      : MBinaryInstruction(left, right), operation_(op)
    {
-        MOZ_ASSERT_IF(type == MIRType_Int32x4, op == Add || op == Sub);
+        MOZ_ASSERT_IF(type == MIRType_Int32x4, op == Add || op == Sub || op == Mul);
        MOZ_ASSERT(IsSimdType(type));
        MOZ_ASSERT(left->type() == right->type());
        MOZ_ASSERT(left->type() == type);
--- a/js/src/jit/arm/Lowering-arm.cpp
+++ b/js/src/jit/arm/Lowering-arm.cpp
@ -556,6 +556,12 @@ LIRGeneratorARM::visitForkJoinGetSlice(MForkJoinGetSlice *ins)
    MOZ_CRASH("NYI");
 }

+void
+LIRGeneratorARM::visitSimdBinaryArith(MSimdBinaryArith *ins)
+{
+    MOZ_CRASH("NYI");
+}
+
 void
 LIRGeneratorARM::visitSimdTernaryBitwise(MSimdTernaryBitwise *ins)
 {
--- a/js/src/jit/arm/Lowering-arm.h
+++ b/js/src/jit/arm/Lowering-arm.h
@ -107,6 +107,7 @@ class LIRGeneratorARM : public LIRGeneratorShared
    void visitAsmJSAtomicBinopHeap(MAsmJSAtomicBinopHeap *ins);
    void visitStoreTypedArrayElementStatic(MStoreTypedArrayElementStatic *ins);
    void visitForkJoinGetSlice(MForkJoinGetSlice *ins);
+    void visitSimdBinaryArith(MSimdBinaryArith *ins);
    void visitSimdTernaryBitwise(MSimdTernaryBitwise *ins);
    void visitSimdSplatX4(MSimdSplatX4 *ins);
    void visitSimdValueX4(MSimdValueX4 *ins);
--- a/js/src/jit/mips/Lowering-mips.cpp
+++ b/js/src/jit/mips/Lowering-mips.cpp
@ -546,6 +546,12 @@ LIRGeneratorMIPS::visitForkJoinGetSlice(MForkJoinGetSlice *ins)
    MOZ_CRASH("NYI");
 }

+void
+LIRGeneratorMIPS::visitSimdBinaryArith(MSimdBinaryArith *ins)
+{
+    MOZ_CRASH("NYI");
+}
+
 void
 LIRGeneratorMIPS::visitSimdTernaryBitwise(MSimdTernaryBitwise *ins)
 {
--- a/js/src/jit/mips/Lowering-mips.h
+++ b/js/src/jit/mips/Lowering-mips.h
@ -107,6 +107,7 @@ class LIRGeneratorMIPS : public LIRGeneratorShared
    void visitAsmJSLoadFuncPtr(MAsmJSLoadFuncPtr *ins);
    void visitStoreTypedArrayElementStatic(MStoreTypedArrayElementStatic *ins);
    void visitForkJoinGetSlice(MForkJoinGetSlice *ins);
+    void visitSimdBinaryArith(MSimdBinaryArith *ins);
    void visitSimdTernaryBitwise(MSimdTernaryBitwise *ins);
    void visitSimdSplatX4(MSimdSplatX4 *ins);
    void visitSimdValueX4(MSimdValueX4 *ins);
--- a/js/src/jit/shared/Assembler-x86-shared.h
+++ b/js/src/jit/shared/Assembler-x86-shared.h
@ -602,6 +602,9 @@ class AssemblerX86Shared : public AssemblerShared
    void movdqa(const Operand &src, FloatRegister dest) {
        MOZ_ASSERT(HasSSE2());
        switch (src.kind()) {
+          case Operand::FPREG:
+            masm.movdqa_rr(src.fpu(), dest.code());
+            break;
          case Operand::MEM_REG_DISP:
            masm.movdqa_mr(src.disp(), src.base(), dest.code());
            break;
@ -1812,6 +1815,26 @@ class AssemblerX86Shared : public AssemblerShared
            MOZ_CRASH("unexpected operand kind");
        }
    }
+    void pmuludq(FloatRegister src, FloatRegister dest) {
+        MOZ_ASSERT(HasSSE2());
+        masm.pmuludq_rr(src.code(), dest.code());
+    }
+    void pmulld(const Operand &src, FloatRegister dest) {
+        MOZ_ASSERT(HasSSE41());
+        switch (src.kind()) {
+          case Operand::FPREG:
+            masm.pmulld_rr(src.fpu(), dest.code());
+            break;
+          case Operand::MEM_REG_DISP:
+            masm.pmulld_mr(src.disp(), src.base(), dest.code());
+            break;
+          case Operand::MEM_ADDRESS32:
+            masm.pmulld_mr(src.address(), dest.code());
+            break;
+          default:
+            MOZ_CRASH("unexpected operand kind");
+        }
+    }
    void vaddps(const Operand &src1, FloatRegister src0, FloatRegister dest) {
        MOZ_ASSERT(HasSSE2());
        switch (src1.kind()) {
@ -1981,6 +2004,22 @@ class AssemblerX86Shared : public AssemblerShared
        MOZ_ASSERT(HasSSE2());
        masm.pshufd_irr(mask, src.code(), dest.code());
    }
+    void pshufd(uint32_t mask, const Operand &src, FloatRegister dest) {
+        MOZ_ASSERT(HasSSE2());
+        switch (src.kind()) {
+          case Operand::FPREG:
+            masm.pshufd_irr(mask, src.fpu(), dest.code());
+            break;
+          case Operand::MEM_REG_DISP:
+            masm.pshufd_imr(mask, src.disp(), src.base(), dest.code());
+            break;
+          case Operand::MEM_ADDRESS32:
+            masm.pshufd_imr(mask, src.address(), dest.code());
+            break;
+          default:
+            MOZ_CRASH("unexpected operand kind");
+        }
+    }
    void movhlps(FloatRegister src, FloatRegister dest) {
        MOZ_ASSERT(HasSSE2());
        masm.movhlps_rr(src.code(), dest.code());
--- a/js/src/jit/shared/BaseAssembler-x86-shared.h
+++ b/js/src/jit/shared/BaseAssembler-x86-shared.h
@ -387,6 +387,7 @@ private:
        OP2_PSRAD_VdqWdq    = 0xE2,
        OP2_PXORDQ_VdqWdq   = 0xEF,
        OP2_PSLLD_VdqWdq    = 0xF2,
+        OP2_PMULUDQ_VdqWdq  = 0xF4,
        OP2_PSUBD_VdqWdq    = 0xFA,
        OP2_PADDD_VdqWdq    = 0xFE
    } TwoByteOpcodeID;
@ -400,11 +401,13 @@ private:
        OP3_PTEST_VdVd      = 0x17,
        OP3_INSERTPS_VpsUps = 0x21,
        OP3_PINSRD_VdqEdIb  = 0x22,
+        OP3_PMULLD_VdqWdq   = 0x40,
        OP3_VBLENDVPS_VdqWdq = 0x4A
    } ThreeByteOpcodeID;

    typedef enum {
        ESCAPE_BLENDVPS     = 0x38,
+        ESCAPE_PMULLD       = 0x38,
        ESCAPE_PTEST        = 0x38,
        ESCAPE_PINSRD       = 0x3A,
        ESCAPE_PEXTRD       = 0x3A,
@ -802,6 +805,33 @@ public:
        m_formatter.twoByteOp(OP2_PSUBD_VdqWdq, address, (RegisterID)dst);
    }

+    void pmuludq_rr(XMMRegisterID src, XMMRegisterID dst)
+    {
+        spew("pmuludq     %s, %s", nameFPReg(src), nameFPReg(dst));
+        m_formatter.prefix(PRE_SSE_66);
+        m_formatter.twoByteOp(OP2_PMULUDQ_VdqWdq, (RegisterID)src, (RegisterID)dst);
+    }
+
+    void pmulld_rr(XMMRegisterID src, XMMRegisterID dst)
+    {
+        spew("pmulld      %s, %s", nameFPReg(src), nameFPReg(dst));
+        m_formatter.prefix(PRE_SSE_66);
+        m_formatter.threeByteOp(OP3_PMULLD_VdqWdq, ESCAPE_PMULLD, (RegisterID)src, (RegisterID)dst);
+    }
+    void pmulld_mr(int offset, RegisterID base, XMMRegisterID dst)
+    {
+        spew("pmulld      %s0x%x(%s), %s",
+             PRETTY_PRINT_OFFSET(offset), nameIReg(base), nameFPReg(dst));
+        m_formatter.prefix(PRE_SSE_66);
+        m_formatter.threeByteOp(OP3_PMULLD_VdqWdq, ESCAPE_PMULLD, offset, base, (RegisterID)dst);
+    }
+    void pmulld_mr(const void* address, XMMRegisterID dst)
+    {
+        spew("pmulld      %p, %s", address, nameFPReg(dst));
+        m_formatter.prefix(PRE_SSE_66);
+        m_formatter.threeByteOp(OP3_PMULLD_VdqWdq, ESCAPE_PMULLD, address, (RegisterID)dst);
+    }
+
    void vaddps_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
    {
        twoByteOpSimd("vaddps", VEX_PS, OP2_ADDPS_VpsWps, src1, src0, dst);
@ -2941,6 +2971,24 @@ public:
        m_formatter.immediate8(uint8_t(mask));
    }

+    void pshufd_imr(uint32_t mask, int offset, RegisterID base, XMMRegisterID dst)
+    {
+        MOZ_ASSERT(mask < 256);
+        spew("pshufd     0x%x, %s0x%x(%s), %s",
+             mask, PRETTY_PRINT_OFFSET(offset), nameIReg(base), nameFPReg(dst));
+        m_formatter.prefix(PRE_SSE_66);
+        m_formatter.twoByteOp(OP2_PSHUFD_VdqWdqIb, offset, base, (RegisterID)dst);
+        m_formatter.immediate8(uint8_t(mask));
+    }
+
+    void pshufd_imr(uint32_t mask, const void* address, XMMRegisterID dst)
+    {
+        spew("pshufd     %x, %p, %s", mask, address, nameFPReg(dst));
+        m_formatter.prefix(PRE_SSE_66);
+        m_formatter.twoByteOp(OP2_PSHUFD_VdqWdqIb, address, (RegisterID)dst);
+        m_formatter.immediate8(uint8_t(mask));
+    }
+
    void shufps_irr(uint32_t mask, XMMRegisterID src, XMMRegisterID dst)
    {
        MOZ_ASSERT(mask < 256);
@ -2961,7 +3009,6 @@ public:
    void shufps_imr(uint32_t mask, const void* address, XMMRegisterID dst)
    {
        spew("shufps     %x, %p, %s", mask, address, nameFPReg(dst));
-        m_formatter.prefix(PRE_SSE_F3);
        m_formatter.twoByteOp(OP2_SHUFPS_VpsWpsIb, address, (RegisterID)dst);
        m_formatter.immediate8(uint8_t(mask));
    }
@ -4740,6 +4787,16 @@ private:
            memoryModRM(offset, base, reg);
        }

+        void threeByteOp(ThreeByteOpcodeID opcode, ThreeByteEscape escape, const void* address, int reg)
+        {
+            m_buffer.ensureSpace(maxInstructionSize);
+            emitRexIfNeeded(reg, 0, 0);
+            m_buffer.putByteUnchecked(OP_2BYTE_ESCAPE);
+            m_buffer.putByteUnchecked(escape);
+            m_buffer.putByteUnchecked(opcode);
+            memoryModRM(address, reg);
+        }
+
        void vblendvOpVex(VexOperandType ty, ThreeByteOpcodeID opcode, ThreeByteEscape escape,
                          XMMRegisterID mask, RegisterID rm, XMMRegisterID src0, int reg)
        {
--- a/js/src/jit/shared/CodeGenerator-x86-shared.cpp
+++ b/js/src/jit/shared/CodeGenerator-x86-shared.cpp
@ -2626,9 +2626,27 @@ CodeGeneratorX86Shared::visitSimdBinaryArithIx4(LSimdBinaryArithIx4 *ins)
      case MSimdBinaryArith::Sub:
        masm.packedSubInt32(rhs, lhs);
        return;
-      case MSimdBinaryArith::Mul:
-        // we can do mul with a single instruction only if we have SSE4.1
-        // using the PMULLD instruction.
+      case MSimdBinaryArith::Mul: {
+        if (AssemblerX86Shared::HasSSE41()) {
+            masm.pmulld(rhs, lhs);
+            return;
+        }
+
+        masm.loadAlignedInt32x4(rhs, ScratchSimdReg);
+        masm.pmuludq(lhs, ScratchSimdReg);
+        // ScratchSimdReg contains (Rx, _, Rz, _) where R is the resulting vector.
+
+        FloatRegister temp = ToFloatRegister(ins->temp());
+        masm.pshufd(MacroAssembler::ComputeShuffleMask(LaneY, LaneY, LaneW, LaneW), lhs, lhs);
+        masm.pshufd(MacroAssembler::ComputeShuffleMask(LaneY, LaneY, LaneW, LaneW), rhs, temp);
+        masm.pmuludq(temp, lhs);
+        // lhs contains (Ry, _, Rw, _) where R is the resulting vector.
+
+        masm.shufps(MacroAssembler::ComputeShuffleMask(LaneX, LaneZ, LaneX, LaneZ), ScratchSimdReg, lhs);
+        // lhs contains (Ry, Rw, Rx, Rz)
+        masm.shufps(MacroAssembler::ComputeShuffleMask(LaneZ, LaneX, LaneW, LaneY), lhs, lhs);
+        return;
+      }
      case MSimdBinaryArith::Div:
        // x86 doesn't have SIMD i32 div.
        break;
--- a/js/src/jit/shared/Lowering-shared.cpp
+++ b/js/src/jit/shared/Lowering-shared.cpp
@ -14,6 +14,63 @@
 using namespace js;
 using namespace jit;

+bool
+LIRGeneratorShared::ShouldReorderCommutative(MDefinition *lhs, MDefinition *rhs, MInstruction *ins)
+{
+    // lhs and rhs are used by the commutative operator.
+    MOZ_ASSERT(lhs->hasDefUses());
+    MOZ_ASSERT(rhs->hasDefUses());
+
+    // Ensure that if there is a constant, then it is in rhs.
+    if (rhs->isConstant())
+        return false;
+    if (lhs->isConstant())
+        return true;
+
+    // Since clobbering binary operations clobber the left operand, prefer a
+    // non-constant lhs operand with no further uses. To be fully precise, we
+    // should check whether this is the *last* use, but checking hasOneDefUse()
+    // is a decent approximation which doesn't require any extra analysis.
+    bool rhsSingleUse = rhs->hasOneDefUse();
+    bool lhsSingleUse = lhs->hasOneDefUse();
+    if (rhsSingleUse) {
+        if (!lhsSingleUse)
+            return true;
+    } else {
+        if (lhsSingleUse)
+            return false;
+    }
+
+    // If this is a reduction-style computation, such as
+    //
+    //   sum = 0;
+    //   for (...)
+    //      sum += ...;
+    //
+    // put the phi on the left to promote coalescing. This is fairly specific.
+    if (rhsSingleUse &&
+        rhs->isPhi() &&
+        rhs->block()->isLoopHeader() &&
+        ins == rhs->toPhi()->getLoopBackedgeOperand())
+    {
+        return true;
+    }
+
+    return false;
+}
+
+void
+LIRGeneratorShared::ReorderCommutative(MDefinition **lhsp, MDefinition **rhsp, MInstruction *ins)
+{
+    MDefinition *lhs = *lhsp;
+    MDefinition *rhs = *rhsp;
+
+    if (ShouldReorderCommutative(lhs, rhs, ins)) {
+        *rhsp = lhs;
+        *lhsp = rhs;
+    }
+}
+
 void
 LIRGeneratorShared::visitConstant(MConstant *ins)
 {
--- a/js/src/jit/shared/Lowering-shared.h
+++ b/js/src/jit/shared/Lowering-shared.h
@ -50,6 +50,10 @@ class LIRGeneratorShared : public MDefinitionVisitor
    }

  protected:
+
+    static void ReorderCommutative(MDefinition **lhsp, MDefinition **rhsp, MInstruction *ins);
+    static bool ShouldReorderCommutative(MDefinition *lhs, MDefinition *rhs, MInstruction *ins);
+
    // A backend can decide that an instruction should be emitted at its uses,
    // rather than at its definition. To communicate this, set the
    // instruction's virtual register set to 0. When using the instruction,
--- a/js/src/jit/shared/Lowering-x86-shared.cpp
+++ b/js/src/jit/shared/Lowering-x86-shared.cpp
@ -655,6 +655,45 @@ LIRGeneratorX86Shared::visitAsmJSAtomicBinopHeap(MAsmJSAtomicBinopHeap *ins)
    defineFixed(lir, ins, LAllocation(AnyRegister(eax)));
 }

+// Lower a binary SIMD arithmetic MIR node to LSimdBinaryArithIx4 or
+// LSimdBinaryArithFx4, allocating the temp register that the x86 code
+// generator needs for some operations.
+void
+LIRGeneratorX86Shared::visitSimdBinaryArith(MSimdBinaryArith *ins)
+{
+    MOZ_ASSERT(IsSimdType(ins->type()));
+
+    MDefinition *lhs = ins->lhs();
+    MDefinition *rhs = ins->rhs();
+
+    if (ins->isCommutative())
+        ReorderCommutative(&lhs, &rhs, ins);
+
+    if (ins->type() == MIRType_Int32x4) {
+        LSimdBinaryArithIx4 *lir = new(alloc()) LSimdBinaryArithIx4();
+
+        // Without SSE4.1 there is no pmulld, and the pmuludq-based fallback in
+        // visitSimdBinaryArithIx4 reads ins->temp(), so it must be a real register.
+        bool needsTemp = ins->operation() == MSimdBinaryArith::Mul &&
+                         !AssemblerX86Shared::HasSSE41();
+        lir->setTemp(0, needsTemp ? temp(LDefinition::INT32X4) : LDefinition::BogusTemp());
+
+        lowerForFPU(lir, ins, lhs, rhs);
+        return;
+    }
+
+    MOZ_ASSERT(ins->type() == MIRType_Float32x4, "unknown simd type on binary arith operation");
+
+    LSimdBinaryArithFx4 *lir = new(alloc()) LSimdBinaryArithFx4();
+
+    bool needsTemp = ins->operation() == MSimdBinaryArith::Max ||
+                     ins->operation() == MSimdBinaryArith::MinNum ||
+                     ins->operation() == MSimdBinaryArith::MaxNum;
+    lir->setTemp(0, needsTemp ? temp(LDefinition::FLOAT32X4) : LDefinition::BogusTemp());
+
+    lowerForFPU(lir, ins, lhs, rhs);
+}
+
 void
 LIRGeneratorX86Shared::visitSimdTernaryBitwise(MSimdTernaryBitwise *ins)
 {
--- a/js/src/jit/shared/Lowering-x86-shared.h
+++ b/js/src/jit/shared/Lowering-x86-shared.h
@ -53,6 +53,7 @@ class LIRGeneratorX86Shared : public LIRGeneratorShared
    void lowerTruncateDToInt32(MTruncateToInt32 *ins);
    void lowerTruncateFToInt32(MTruncateToInt32 *ins);
    void visitForkJoinGetSlice(MForkJoinGetSlice *ins);
+    void visitSimdBinaryArith(MSimdBinaryArith *ins);
    void visitSimdTernaryBitwise(MSimdTernaryBitwise *ins);
    void visitSimdSplatX4(MSimdSplatX4 *ins);
    void visitSimdValueX4(MSimdValueX4 *ins);