Bug 1021716: SIMD: Use specific instructions for special cases; r=sunfish

2014-10-21 15:39:31 +02:00 · 2014-10-21 15:39:31 +02:00 · 2c3d97c4d8
--- a/js/src/jit/LIR-Common.h
+++ b/js/src/jit/LIR-Common.h
@ -242,10 +242,14 @@ class LSimdSwizzleBase : public LInstructionHelper<1, 1, 0>
        return getOperand(0);
    }

-    int32_t laneX() const { return mir_->toSimdSwizzle()->laneX(); }
-    int32_t laneY() const { return mir_->toSimdSwizzle()->laneY(); }
-    int32_t laneZ() const { return mir_->toSimdSwizzle()->laneZ(); }
-    int32_t laneW() const { return mir_->toSimdSwizzle()->laneW(); }
+    uint32_t laneX() const { return mir_->toSimdSwizzle()->laneX(); }
+    uint32_t laneY() const { return mir_->toSimdSwizzle()->laneY(); }
+    uint32_t laneZ() const { return mir_->toSimdSwizzle()->laneZ(); }
+    uint32_t laneW() const { return mir_->toSimdSwizzle()->laneW(); }
+
+    bool lanesMatch(uint32_t x, uint32_t y, uint32_t z, uint32_t w) const {
+        return mir_->toSimdSwizzle()->lanesMatch(x, y, z, w);
+    }
 };

 // Shuffles a int32x4 into another int32x4 vector.
@ -287,10 +291,14 @@ class LSimdShuffle : public LInstructionHelper<1, 2, 1>
        return getTemp(0);
    }

-    int32_t laneX() const { return mir_->toSimdShuffle()->laneX(); }
-    int32_t laneY() const { return mir_->toSimdShuffle()->laneY(); }
-    int32_t laneZ() const { return mir_->toSimdShuffle()->laneZ(); }
-    int32_t laneW() const { return mir_->toSimdShuffle()->laneW(); }
+    uint32_t laneX() const { return mir_->toSimdShuffle()->laneX(); }
+    uint32_t laneY() const { return mir_->toSimdShuffle()->laneY(); }
+    uint32_t laneZ() const { return mir_->toSimdShuffle()->laneZ(); }
+    uint32_t laneW() const { return mir_->toSimdShuffle()->laneW(); }
+
+    bool lanesMatch(uint32_t x, uint32_t y, uint32_t z, uint32_t w) const {
+        return mir_->toSimdShuffle()->lanesMatch(x, y, z, w);
+    }
 };

 // Binary SIMD comparison operation between two SIMD operands
--- a/js/src/jit/MIR.cpp
+++ b/js/src/jit/MIR.cpp
@ -819,6 +819,14 @@ MSimdSplatX4::foldsTo(TempAllocator &alloc)
    return MSimdConstant::New(alloc, cst, type());
 }

+MDefinition *
+MSimdSwizzle::foldsTo(TempAllocator &alloc)
+{
+    // Selecting lanes (0, 1, 2, 3) in that order is the identity swizzle, so
+    // the node folds to its input; any other selection keeps the node as is.
+    return lanesMatch(0, 1, 2, 3) ? input() : this;
+}
+
 MCloneLiteral *
 MCloneLiteral::New(TempAllocator &alloc, MDefinition *obj)
 {
--- a/js/src/jit/MIR.h
+++ b/js/src/jit/MIR.h
@ -1585,7 +1585,7 @@ class MSimdShuffleBase
    uint32_t laneMask_;
    uint32_t arity_;

-    MSimdShuffleBase(int32_t laneX, int32_t laneY, int32_t laneZ, int32_t laneW, MIRType type)
+    MSimdShuffleBase(uint32_t laneX, uint32_t laneY, uint32_t laneZ, uint32_t laneW, MIRType type)
    {
        MOZ_ASSERT(SimdTypeToLength(type) == 4);
        MOZ_ASSERT(IsSimdType(type));
@ -1600,10 +1600,14 @@ class MSimdShuffleBase
  public:
    // For now, these formulas are fine for x4 types. They'll need to be
    // generalized for other SIMD type lengths.
-    int32_t laneX() const { MOZ_ASSERT(arity_ == 4); return laneMask_ & 7; }
-    int32_t laneY() const { MOZ_ASSERT(arity_ == 4); return (laneMask_ >> 3) & 7; }
-    int32_t laneZ() const { MOZ_ASSERT(arity_ == 4); return (laneMask_ >> 6) & 7; }
-    int32_t laneW() const { MOZ_ASSERT(arity_ == 4); return (laneMask_ >> 9) & 7; }
+    uint32_t laneX() const { MOZ_ASSERT(arity_ == 4); return laneMask_ & 7; }
+    uint32_t laneY() const { MOZ_ASSERT(arity_ == 4); return (laneMask_ >> 3) & 7; }
+    uint32_t laneZ() const { MOZ_ASSERT(arity_ == 4); return (laneMask_ >> 6) & 7; }
+    uint32_t laneW() const { MOZ_ASSERT(arity_ == 4); return (laneMask_ >> 9) & 7; }
+
+    bool lanesMatch(uint32_t x, uint32_t y, uint32_t z, uint32_t w) const {
+        MOZ_ASSERT(arity_ == 4); return ((x << 0) | (y << 3) | (z << 6) | (w << 9)) == laneMask_;
+    }
 };

 // Applies a shuffle operation to the input, putting the input lanes as
@ -1613,7 +1617,7 @@ class MSimdSwizzle : public MUnaryInstruction, public MSimdShuffleBase
 {
  protected:
    MSimdSwizzle(MDefinition *obj, MIRType type,
-                 int32_t laneX, int32_t laneY, int32_t laneZ, int32_t laneW)
+                 uint32_t laneX, uint32_t laneY, uint32_t laneZ, uint32_t laneW)
      : MUnaryInstruction(obj), MSimdShuffleBase(laneX, laneY, laneZ, laneW, type)
    {
        MOZ_ASSERT(laneX < 4 && laneY < 4 && laneZ < 4 && laneW < 4);
@ -1628,7 +1632,7 @@ class MSimdSwizzle : public MUnaryInstruction, public MSimdShuffleBase
    INSTRUCTION_HEADER(SimdSwizzle);

    static MSimdSwizzle *NewAsmJS(TempAllocator &alloc, MDefinition *obj, MIRType type,
-                                  int32_t laneX, int32_t laneY, int32_t laneZ, int32_t laneW)
+                                  uint32_t laneX, uint32_t laneY, uint32_t laneZ, uint32_t laneW)
    {
        return new(alloc) MSimdSwizzle(obj, type, laneX, laneY, laneZ, laneW);
    }
@ -1644,6 +1648,8 @@ class MSimdSwizzle : public MUnaryInstruction, public MSimdShuffleBase
        return AliasSet::None();
    }

+    MDefinition *foldsTo(TempAllocator &alloc);
+
    ALLOW_CLONE(MSimdSwizzle)
 };

@ -1653,7 +1659,7 @@ class MSimdSwizzle : public MUnaryInstruction, public MSimdShuffleBase
 class MSimdShuffle : public MBinaryInstruction, public MSimdShuffleBase
 {
    MSimdShuffle(MDefinition *lhs, MDefinition *rhs, MIRType type,
-                 int32_t laneX, int32_t laneY, int32_t laneZ, int32_t laneW)
+                 uint32_t laneX, uint32_t laneY, uint32_t laneZ, uint32_t laneW)
      : MBinaryInstruction(lhs, rhs), MSimdShuffleBase(laneX, laneY, laneZ, laneW, lhs->type())
    {
        MOZ_ASSERT(laneX < 8 && laneY < 8 && laneZ < 8 && laneW < 8);
@ -1670,8 +1676,8 @@ class MSimdShuffle : public MBinaryInstruction, public MSimdShuffleBase
    INSTRUCTION_HEADER(SimdShuffle);

    static MInstruction *NewAsmJS(TempAllocator &alloc, MDefinition *lhs, MDefinition *rhs,
-                                  MIRType type, int32_t laneX, int32_t laneY, int32_t laneZ,
-                                  int32_t laneW)
+                                  MIRType type, uint32_t laneX, uint32_t laneY, uint32_t laneZ,
+                                  uint32_t laneW)
    {
        // Swap operands so that new lanes come from LHS in majority.
        // In the balanced case, swap operands if needs be, in order to be able
--- a/js/src/jit/shared/Assembler-x86-shared.h
+++ b/js/src/jit/shared/Assembler-x86-shared.h
@ -1414,10 +1414,6 @@ class AssemblerX86Shared : public AssemblerShared
        masm.divl_r(divisor.code());
    }

-    void unpcklps(FloatRegister src, FloatRegister dest) {
-        MOZ_ASSERT(HasSSE2());
-        masm.unpcklps_rr(src.code(), dest.code());
-    }
    void pinsrd(unsigned lane, Register src, FloatRegister dest) {
        MOZ_ASSERT(HasSSE41());
        masm.pinsrd_irr(lane, src.code(), dest.code());
@ -1860,6 +1856,18 @@ class AssemblerX86Shared : public AssemblerShared
        MOZ_ASSERT(HasSSE2());
        masm.movhlps_rr(src.code(), dest.code());
    }
+    void movlhps(FloatRegister src, FloatRegister dest) {
+        MOZ_ASSERT(HasSSE2());
+        masm.movlhps_rr(src.code(), dest.code());
+    }
+    void unpcklps(FloatRegister src, FloatRegister dest) {
+        MOZ_ASSERT(HasSSE2());
+        masm.unpcklps_rr(src.code(), dest.code());
+    }
+    void unpckhps(FloatRegister src, FloatRegister dest) {
+        MOZ_ASSERT(HasSSE2());
+        masm.unpckhps_rr(src.code(), dest.code());
+    }
    void shufps(uint32_t mask, FloatRegister src, FloatRegister dest) {
        MOZ_ASSERT(HasSSE2());
        masm.shufps_irr(mask, src.code(), dest.code());
--- a/js/src/jit/shared/BaseAssembler-x86-shared.h
+++ b/js/src/jit/shared/BaseAssembler-x86-shared.h
@ -295,6 +295,8 @@ private:
        OP2_MOVPS_WpsVps    = 0x11,
        OP2_MOVHLPS_VqUq    = 0x12,
        OP2_UNPCKLPS_VsdWsd = 0x14,
+        OP2_UNPCKHPS_VsdWsd = 0x15,
+        OP2_MOVLHPS_VqUq    = 0x16,
        OP2_MOVAPD_VsdWsd   = 0x28,
        OP2_MOVAPS_VsdWsd   = 0x28,
        OP2_MOVAPS_WsdVsd   = 0x29,
@ -2921,6 +2923,13 @@ public:
        m_formatter.twoByteOp(OP2_UNPCKLPS_VsdWsd, (RegisterID)dst, (RegisterID)src);
    }

+    void unpckhps_rr(XMMRegisterID src, XMMRegisterID dst)
+    {
+        spew("unpckhps   %s, %s",
+             nameFPReg(src), nameFPReg(dst));
+        m_formatter.twoByteOp(OP2_UNPCKHPS_VsdWsd, (RegisterID)dst, (RegisterID)src);
+    }
+
    void movd_rr(RegisterID src, XMMRegisterID dst)
    {
        spew("movd       %s, %s",
@ -2981,6 +2990,13 @@ public:
        m_formatter.twoByteOp(OP2_MOVHLPS_VqUq, (RegisterID)dst, (RegisterID)src);
    }

+    void movlhps_rr(XMMRegisterID src, XMMRegisterID dst)
+    {
+        // Log the instruction via spew, then encode it with the two-byte opcode OP2_MOVLHPS_VqUq.
+        spew("movlhps    %s, %s", nameFPReg(src), nameFPReg(dst));
+        m_formatter.twoByteOp(OP2_MOVLHPS_VqUq, (RegisterID)dst, (RegisterID)src);
+    }
+
    void psrldq_ir(int shift, XMMRegisterID dest)
    {
        spew("psrldq     $%d, %s",
--- a/js/src/jit/shared/CodeGenerator-x86-shared.cpp
+++ b/js/src/jit/shared/CodeGenerator-x86-shared.cpp
@ -2394,8 +2394,12 @@ CodeGeneratorX86Shared::visitSimdSwizzleI(LSimdSwizzleI *ins)
    FloatRegister input = ToFloatRegister(ins->input());
    FloatRegister output = ToFloatRegister(ins->output());

-    uint32_t mask = MacroAssembler::ComputeShuffleMask(ins->laneX(), ins->laneY(), ins->laneZ(),
-                                                       ins->laneW());
+    uint32_t x = ins->laneX();
+    uint32_t y = ins->laneY();
+    uint32_t z = ins->laneZ();
+    uint32_t w = ins->laneW();
+
+    uint32_t mask = MacroAssembler::ComputeShuffleMask(x, y, z, w);
    masm.shuffleInt32(mask, input, output);
    return true;
 }
@ -2406,8 +2410,38 @@ CodeGeneratorX86Shared::visitSimdSwizzleF(LSimdSwizzleF *ins)
    FloatRegister input = ToFloatRegister(ins->input());
    FloatRegister output = ToFloatRegister(ins->output());

-    uint32_t mask = MacroAssembler::ComputeShuffleMask(ins->laneX(), ins->laneY(), ins->laneZ(),
-                                                       ins->laneW());
+    uint32_t x = ins->laneX();
+    uint32_t y = ins->laneY();
+    uint32_t z = ins->laneZ();
+    uint32_t w = ins->laneW();
+
+    // TODO: Here and below, arch-specific lowering could identify this pattern
+    // and use defineReuseInput to avoid the initial movaps (bug 1084404).
+    if (ins->lanesMatch(2, 3, 2, 3)) {
+        masm.movaps(input, output);
+        masm.movhlps(input, output);
+        return true;
+    }
+
+    if (ins->lanesMatch(0, 1, 0, 1)) {
+        masm.movaps(input, output);
+        masm.movlhps(input, output);
+        return true;
+    }
+
+    if (ins->lanesMatch(0, 0, 1, 1)) {
+        masm.movaps(input, output);
+        masm.unpcklps(input, output);
+        return true;
+    }
+
+    if (ins->lanesMatch(2, 2, 3, 3)) {
+        masm.movaps(input, output);
+        masm.unpckhps(input, output);
+        return true;
+    }
+
+    uint32_t mask = MacroAssembler::ComputeShuffleMask(x, y, z, w);
    masm.shuffleFloat32(mask, input, output);
    return true;
 }
@ -2448,6 +2482,11 @@ CodeGeneratorX86Shared::visitSimdShuffle(LSimdShuffle *ins)
    if (numLanesFromLHS == 3) {
        unsigned firstMask = -1, secondMask = -1;

+        if (ins->lanesMatch(4, 1, 2, 3)) {
+            masm.movss(rhs, out);
+            return true;
+        }
+
        FloatRegister rhsCopy = ToFloatRegister(ins->temp());

        if (x < 4 && y < 4) {
@ -2497,6 +2536,46 @@ CodeGeneratorX86Shared::visitSimdShuffle(LSimdShuffle *ins)
    // Two elements from one vector, two other elements from the other
    MOZ_ASSERT(numLanesFromLHS == 2);

+    // TODO: Here and below, the symmetric case would avoid a move, but it
+    // can't be reached because the operands would get swapped (bug 1084404).
+    if (ins->lanesMatch(2, 3, 6, 7)) {
+        masm.movaps(rhs, ScratchSimdReg);
+        masm.movhlps(lhs, ScratchSimdReg);
+        masm.movaps(ScratchSimdReg, out);
+        return true;
+    }
+
+    if (ins->lanesMatch(0, 1, 4, 5)) {
+        masm.movlhps(rhs, lhs);
+        return true;
+    }
+
+    if (ins->lanesMatch(0, 4, 1, 5)) {
+        masm.unpcklps(rhs, lhs);
+        return true;
+    }
+
+    // TODO swapped case would be better (bug 1084404)
+    if (ins->lanesMatch(4, 0, 5, 1)) {
+        masm.movaps(rhs, ScratchSimdReg);
+        masm.unpcklps(lhs, ScratchSimdReg);
+        masm.movaps(ScratchSimdReg, out);
+        return true;
+    }
+
+    if (ins->lanesMatch(2, 6, 3, 7)) {
+        masm.unpckhps(rhs, lhs);
+        return true;
+    }
+
+    // TODO swapped case would be better (bug 1084404)
+    if (ins->lanesMatch(6, 2, 7, 3)) {
+        masm.movaps(rhs, ScratchSimdReg);
+        masm.unpckhps(lhs, ScratchSimdReg);
+        masm.movaps(ScratchSimdReg, out);
+        return true;
+    }
+
    // In one shufps
    if (x < 4 && y < 4) {
        mask = MacroAssembler::ComputeShuffleMask(x, y, z % 4, w % 4);