Bug 1021716: SIMD x86-x64: Implement MSimdShuffleMix; r=sunfish

2014-08-27 19:24:41 +02:00 · 2014-08-27 19:24:41 +02:00 · b2b91caa03
--- a/js/src/jit/LIR-Common.h
+++ b/js/src/jit/LIR-Common.h
@ -242,10 +242,10 @@ class LSimdSwizzleBase : public LInstructionHelper<1, 1, 0>
        return getOperand(0);
    }

-    SimdLane laneX() const { return mir_->toSimdSwizzle()->laneX(); }
-    SimdLane laneY() const { return mir_->toSimdSwizzle()->laneY(); }
-    SimdLane laneZ() const { return mir_->toSimdSwizzle()->laneZ(); }
-    SimdLane laneW() const { return mir_->toSimdSwizzle()->laneW(); }
+    int32_t laneX() const { return mir_->toSimdSwizzle()->laneX(); }
+    int32_t laneY() const { return mir_->toSimdSwizzle()->laneY(); }
+    int32_t laneZ() const { return mir_->toSimdSwizzle()->laneZ(); }
+    int32_t laneW() const { return mir_->toSimdSwizzle()->laneW(); }
 };

 // Shuffles a int32x4 into another int32x4 vector.
@ -265,6 +265,27 @@ class LSimdSwizzleF : public LSimdSwizzleBase
    {}
 };

+// Shuffles the lanes of two SIMD inputs into one output vector. The same LIR
+// instruction is used for int32x4 and float32x4; the lane selectors are not
+// stored here but read from the attached MSimdShuffle node.
+class LSimdShuffle : public LInstructionHelper<1, 2, 0>
+{
+  public:
+    LIR_HEADER(SimdShuffle);
+    LSimdShuffle()
+    {}
+
+    // First input vector (operand 0).
+    const LAllocation *lhs() {
+        return getOperand(0);
+    }
+    // Second input vector (operand 1).
+    const LAllocation *rhs() {
+        return getOperand(1);
+    }
+
+    // Lane selectors, forwarded from the MIR node.
+    int32_t laneX() const { return mir_->toSimdShuffle()->laneX(); }
+    int32_t laneY() const { return mir_->toSimdShuffle()->laneY(); }
+    int32_t laneZ() const { return mir_->toSimdShuffle()->laneZ(); }
+    int32_t laneW() const { return mir_->toSimdShuffle()->laneW(); }
+};
+
 // Binary SIMD comparison operation between two SIMD operands
 class LSimdBinaryComp: public LInstructionHelper<1, 2, 0>
 {
--- a/js/src/jit/LOpcodes.h
+++ b/js/src/jit/LOpcodes.h
@ -26,6 +26,7 @@
    _(SimdSignMaskX4)               \
    _(SimdSwizzleI)                 \
    _(SimdSwizzleF)                 \
+    _(SimdShuffle)                  \
    _(SimdUnaryArithIx4)            \
    _(SimdUnaryArithFx4)            \
    _(SimdBinaryCompIx4)            \
--- a/js/src/jit/Lowering.cpp
+++ b/js/src/jit/Lowering.cpp
@ -3830,6 +3830,24 @@ LIRGenerator::visitSimdSwizzle(MSimdSwizzle *ins)
    return false;
 }

+// Lowers an MSimdShuffle to an LSimdShuffle. Int32x4 and Float32x4 share the
+// same LIR instruction; any other result type is unexpected and crashes.
+bool
+LIRGenerator::visitSimdShuffle(MSimdShuffle *ins)
+{
+    MOZ_ASSERT(IsSimdType(ins->lhs()->type()));
+    MOZ_ASSERT(IsSimdType(ins->rhs()->type()));
+    MOZ_ASSERT(IsSimdType(ins->type()));
+
+    if (ins->type() == MIRType_Int32x4 || ins->type() == MIRType_Float32x4) {
+        MDefinition *lhs = ins->lhs();
+        MDefinition *rhs = ins->rhs();
+        LSimdShuffle *lir = new (alloc()) LSimdShuffle;
+        return lowerForFPU(lir, ins, lhs, rhs);
+    }
+
+    // Name the operation actually being lowered so the crash is attributable.
+    MOZ_CRASH("Unknown SIMD kind when generating shuffle");
+    return false;
+}
+
 bool
 LIRGenerator::visitSimdUnaryArith(MSimdUnaryArith *ins)
 {
--- a/js/src/jit/Lowering.h
+++ b/js/src/jit/Lowering.h
@ -273,6 +273,7 @@ class LIRGenerator : public LIRGeneratorSpecific
    bool visitSimdInsertElement(MSimdInsertElement *ins);
    bool visitSimdSignMask(MSimdSignMask *ins);
    bool visitSimdSwizzle(MSimdSwizzle *ins);
+    bool visitSimdShuffle(MSimdShuffle *ins);
    bool visitSimdUnaryArith(MSimdUnaryArith *ins);
    bool visitSimdBinaryComp(MSimdBinaryComp *ins);
    bool visitSimdBinaryArith(MSimdBinaryArith *ins);
--- a/js/src/jit/MIR.h
+++ b/js/src/jit/MIR.h
@ -1575,34 +1575,51 @@ class MSimdSignMask : public MUnaryInstruction
    ALLOW_CLONE(MSimdSignMask)
 };

+// Base for the MSimdSwizzle and MSimdShuffle classes. Stores the four lane
+// selectors packed into a single word and exposes them through laneX..laneW.
+class MSimdShuffleBase
+{
+  protected:
+    // As of now, there are at most 4 lanes. For each lane, we need to know
+    // which input we choose and which of the 4 lanes we choose; that can be
+    // packed in 3 bits for each lane, so 12 bits in total.
+    uint32_t laneMask_;
+    uint32_t arity_;
+
+    MSimdShuffleBase(int32_t laneX, int32_t laneY, int32_t laneZ, int32_t laneW, MIRType type)
+    {
+        // Check that this is a SIMD type before asking for its length.
+        MOZ_ASSERT(IsSimdType(type));
+        MOZ_ASSERT(SimdTypeToLength(type) == 4);
+
+        // Each selector must fit its 3-bit field: a larger value would spill
+        // into the neighbouring field, and a negative one must not be shifted
+        // as a signed integer.
+        MOZ_ASSERT(0 <= laneX && laneX < 8);
+        MOZ_ASSERT(0 <= laneY && laneY < 8);
+        MOZ_ASSERT(0 <= laneZ && laneZ < 8);
+        MOZ_ASSERT(0 <= laneW && laneW < 8);
+
+        laneMask_ = (uint32_t(laneX) << 0) | (uint32_t(laneY) << 3) |
+                    (uint32_t(laneZ) << 6) | (uint32_t(laneW) << 9);
+        arity_ = 4;
+    }
+
+    // True when both instructions select exactly the same lanes.
+    bool sameLanes(const MSimdShuffleBase *other) const {
+        return laneMask_ == other->laneMask_;
+    }
+
+  public:
+    // For now, these formulas are fine for x4 types. They'll need to be
+    // generalized for other SIMD type lengths.
+    int32_t laneX() const { MOZ_ASSERT(arity_ == 4); return laneMask_ & 7; }
+    int32_t laneY() const { MOZ_ASSERT(arity_ == 4); return (laneMask_ >> 3) & 7; }
+    int32_t laneZ() const { MOZ_ASSERT(arity_ == 4); return (laneMask_ >> 6) & 7; }
+    int32_t laneW() const { MOZ_ASSERT(arity_ == 4); return (laneMask_ >> 9) & 7; }
+};
+
 // Applies a shuffle operation to the input, putting the input lanes as
 // indicated in the output register's lanes. This implements the SIMD.js
 // "shuffle" function, that takes one vector and one mask.
-class MSimdSwizzle : public MUnaryInstruction
+class MSimdSwizzle : public MUnaryInstruction, public MSimdShuffleBase
 {
  protected:
-    // As of now, there are at most 4 lanes.
-    SimdLane laneX_;
-    SimdLane laneY_;
-    SimdLane laneZ_;
-    SimdLane laneW_;
-
    MSimdSwizzle(MDefinition *obj, MIRType type,
-                 SimdLane laneX, SimdLane laneY, SimdLane laneZ, SimdLane laneW)
-      : MUnaryInstruction(obj),
-        laneX_(laneX), laneY_(laneY), laneZ_(laneZ), laneW_(laneW)
+                 int32_t laneX, int32_t laneY, int32_t laneZ, int32_t laneW)
+      : MUnaryInstruction(obj), MSimdShuffleBase(laneX, laneY, laneZ, laneW, type)
    {
+        MOZ_ASSERT(laneX < 4 && laneY < 4 && laneZ < 4 && laneW < 4);
        MOZ_ASSERT(IsSimdType(obj->type()));
-        // Returned value needs to be in a vector too
        MOZ_ASSERT(IsSimdType(type));
-        MOZ_ASSERT(SimdTypeToScalarType(obj->type()) == type);
-
-        mozilla::DebugOnly<uint32_t> expectedLength = SimdTypeToLength(obj->type());
-        MOZ_ASSERT(uint32_t(laneX_) < expectedLength);
-        MOZ_ASSERT(uint32_t(laneY_) < expectedLength);
-        MOZ_ASSERT(uint32_t(laneZ_) < expectedLength);
-        MOZ_ASSERT(uint32_t(laneW_) < expectedLength);
-
+        MOZ_ASSERT(obj->type() == type);
        setResultType(type);
        setMovable();
    }
@ -1611,36 +1628,68 @@ class MSimdSwizzle : public MUnaryInstruction
    INSTRUCTION_HEADER(SimdSwizzle);

    static MSimdSwizzle *NewAsmJS(TempAllocator &alloc, MDefinition *obj, MIRType type,
-                                  SimdLane laneX, SimdLane laneY, SimdLane laneZ, SimdLane laneW)
+                                  int32_t laneX, int32_t laneY, int32_t laneZ, int32_t laneW)
    {
        return new(alloc) MSimdSwizzle(obj, type, laneX, laneY, laneZ, laneW);
    }

-    SimdLane laneX() const { return laneX_; }
-    SimdLane laneY() const { return laneY_; }
-    SimdLane laneZ() const { return laneZ_; }
-    SimdLane laneW() const { return laneW_; }
-
-    AliasSet getAliasSet() const {
-        return AliasSet::None();
-    }
    bool congruentTo(const MDefinition *ins) const {
        if (!ins->isSimdSwizzle())
            return false;
        const MSimdSwizzle *other = ins->toSimdSwizzle();
-        if (other->laneX_ != laneX_ ||
-            other->laneY_ != laneY_ ||
-            other->laneZ_ != laneZ_ ||
-            other->laneW_ != laneW_)
-        {
-            return false;
-        }
-        return congruentIfOperandsEqual(other);
+        return sameLanes(other) && congruentIfOperandsEqual(other);
+    }
+
+    AliasSet getAliasSet() const {
+        return AliasSet::None();
    }

    ALLOW_CLONE(MSimdSwizzle)
 };

+// Applies a shuffle operation to the inputs, selecting the 2 first lanes of the
+// output from lanes of the first input, and the 2 last lanes of the output from
+// lanes of the second input.
+//
+// Lane selectors index the concatenation of both inputs: 0-3 name a lane of
+// lhs and 4-7 name a lane of rhs.
+class MSimdShuffle : public MBinaryInstruction, public MSimdShuffleBase
+{
+    MSimdShuffle(MDefinition *lhs, MDefinition *rhs, MIRType type,
+                 int32_t laneX, int32_t laneY, int32_t laneZ, int32_t laneW)
+      : MBinaryInstruction(lhs, rhs), MSimdShuffleBase(laneX, laneY, laneZ, laneW, lhs->type())
+    {
+        // Enforce the contract stated above: X and Y read from lhs, Z and W
+        // read from rhs. The x86 code generator rebases laneZ/laneW by 4 and
+        // cannot encode any other combination.
+        MOZ_ASSERT(0 <= laneX && laneX < 4 && 0 <= laneY && laneY < 4);
+        MOZ_ASSERT(4 <= laneZ && laneZ < 8 && 4 <= laneW && laneW < 8);
+        MOZ_ASSERT(IsSimdType(lhs->type()));
+        MOZ_ASSERT(IsSimdType(rhs->type()));
+        MOZ_ASSERT(lhs->type() == rhs->type());
+        MOZ_ASSERT(IsSimdType(type));
+        MOZ_ASSERT(lhs->type() == type);
+        setResultType(type);
+        setMovable();
+    }
+
+  public:
+    INSTRUCTION_HEADER(SimdShuffle);
+
+    static MSimdShuffle *NewAsmJS(TempAllocator &alloc, MDefinition *lhs, MDefinition *rhs,
+                                  MIRType type, int32_t laneX, int32_t laneY, int32_t laneZ,
+                                  int32_t laneW)
+    {
+        return new(alloc) MSimdShuffle(lhs, rhs, type, laneX, laneY, laneZ, laneW);
+    }
+
+    // Two shuffles are congruent when they select the same lanes from
+    // congruent operands.
+    bool congruentTo(const MDefinition *ins) const {
+        if (!ins->isSimdShuffle())
+            return false;
+        const MSimdShuffle *other = ins->toSimdShuffle();
+        return sameLanes(other) && binaryCongruentTo(other);
+    }
+
+    AliasSet getAliasSet() const {
+        return AliasSet::None();
+    }
+
+    ALLOW_CLONE(MSimdShuffle)
+};
+
 class MSimdUnaryArith : public MUnaryInstruction
 {
  public:
--- a/js/src/jit/MOpcodes.h
+++ b/js/src/jit/MOpcodes.h
@ -21,6 +21,7 @@ namespace jit {
    _(SimdInsertElement)                                                    \
    _(SimdSignMask)                                                         \
    _(SimdSwizzle)                                                          \
+    _(SimdShuffle)                                                          \
    _(SimdUnaryArith)                                                       \
    _(SimdBinaryComp)                                                       \
    _(SimdBinaryArith)                                                      \
--- a/js/src/jit/ParallelSafetyAnalysis.cpp
+++ b/js/src/jit/ParallelSafetyAnalysis.cpp
@ -120,6 +120,7 @@ class ParallelSafetyVisitor : public MDefinitionVisitor
    SAFE_OP(SimdInsertElement)
    SAFE_OP(SimdSignMask)
    SAFE_OP(SimdSwizzle)
+    SAFE_OP(SimdShuffle)
    SAFE_OP(SimdUnaryArith)
    SAFE_OP(SimdBinaryComp)
    SAFE_OP(SimdBinaryArith)
--- a/js/src/jit/shared/Assembler-x86-shared.h
+++ b/js/src/jit/shared/Assembler-x86-shared.h
@ -1864,6 +1864,22 @@ class AssemblerX86Shared : public AssemblerShared
        MOZ_ASSERT(HasSSE2());
        masm.shufps_irr(mask, src.code(), dest.code());
    }
+    // Emits shufps with the immediate |mask|, dispatching on the kind of |src|:
+    // an XMM register, a base+displacement memory operand, or an absolute
+    // address. Any other operand kind crashes.
+    void shufps(uint32_t mask, const Operand &src, FloatRegister dest) {
+        MOZ_ASSERT(HasSSE2());
+        switch (src.kind()) {
+          case Operand::FPREG:
+            masm.shufps_irr(mask, src.fpu(), dest.code());
+            break;
+          case Operand::MEM_REG_DISP:
+            masm.shufps_imr(mask, src.disp(), src.base(), dest.code());
+            break;
+          case Operand::MEM_ADDRESS32:
+            masm.shufps_imr(mask, src.address(), dest.code());
+            break;
+          default:
+            MOZ_CRASH("unexpected operand kind");
+        }
+    }
    void addsd(FloatRegister src, FloatRegister dest) {
        MOZ_ASSERT(HasSSE2());
        masm.addsd_rr(src.code(), dest.code());
--- a/js/src/jit/shared/BaseAssembler-x86-shared.h
+++ b/js/src/jit/shared/BaseAssembler-x86-shared.h
@ -2940,7 +2940,7 @@ public:
    void pshufd_irr(uint32_t mask, XMMRegisterID src, XMMRegisterID dst)
    {
        MOZ_ASSERT(mask < 256);
-        spew("pshufd      0x%x, %s, %s",
+        spew("pshufd     0x%x, %s, %s",
             mask, nameFPReg(src), nameFPReg(dst));
        m_formatter.prefix(PRE_SSE_66);
        m_formatter.twoByteOp(OP2_PSHUFD_VdqWdqIb, (RegisterID)dst, (RegisterID)src);
@ -2956,6 +2956,24 @@ public:
        m_formatter.immediate8(uint8_t(mask));
    }

+    // Emits shufps with a base+offset memory source: the two-byte opcode
+    // followed by |mask| as an 8-bit immediate. |mask| must fit in one byte.
+    void shufps_imr(uint32_t mask, int offset, RegisterID base, XMMRegisterID dst)
+    {
+        MOZ_ASSERT(mask < 256);
+        spew("shufps     0x%x, %s0x%x(%s), %s",
+             mask, PRETTY_PRINT_OFFSET(offset), nameIReg(base), nameFPReg(dst));
+        m_formatter.twoByteOp(OP2_SHUFPS_VpsWpsIb, (RegisterID)dst, base, offset);
+        m_formatter.immediate8(uint8_t(mask));
+    }
+
+    // Emits shufps with an absolute-address memory source: the two-byte opcode
+    // followed by |mask| as an 8-bit immediate. |mask| must fit in one byte.
+    void shufps_imr(uint32_t mask, const void* address, XMMRegisterID dst)
+    {
+        MOZ_ASSERT(mask < 256);
+        spew("shufps     0x%x, %p, %s",
+             mask, address, nameFPReg(dst));
+        // shufps is encoded as 0F C6 /r ib with no mandatory prefix, so none
+        // is emitted here, matching the other shufps_imr overload.
+        m_formatter.twoByteOp(OP2_SHUFPS_VpsWpsIb, (RegisterID)dst, address);
+        m_formatter.immediate8(uint8_t(mask));
+    }
+
    void movhlps_rr(XMMRegisterID src, XMMRegisterID dst)
    {
        spew("movhlps     %s, %s",
--- a/js/src/jit/shared/CodeGenerator-x86-shared.cpp
+++ b/js/src/jit/shared/CodeGenerator-x86-shared.cpp
@ -2412,6 +2412,19 @@ CodeGeneratorX86Shared::visitSimdSwizzleF(LSimdSwizzleF *ins)
    return true;
 }

+// Generates code for LSimdShuffle. The output register must be the same as
+// lhs, which is shuffled in place with rhs as the second source.
+bool
+CodeGeneratorX86Shared::visitSimdShuffle(LSimdShuffle *ins)
+{
+    FloatRegister lhs = ToFloatRegister(ins->lhs());
+    Operand rhs = ToOperand(ins->rhs());
+    MOZ_ASSERT(ToFloatRegister(ins->output()) == lhs);
+
+    // laneZ/laneW are rebased by 4 before building the immediate, since
+    // ComputeShuffleMask asserts that every selector is below 4. This relies
+    // on laneX/laneY being in 0-3 and laneZ/laneW being in 4-7; a smaller
+    // laneZ/laneW wraps around when converted to uint32_t and trips that
+    // assertion.
+    uint32_t mask = MacroAssembler::ComputeShuffleMask(ins->laneX(), ins->laneY(), ins->laneZ() - 4,
+                                                       ins->laneW() - 4);
+    masm.shuffleMix(mask, rhs, lhs);
+    return true;
+}
+
 bool
 CodeGeneratorX86Shared::visitSimdBinaryCompIx4(LSimdBinaryCompIx4 *ins)
 {
--- a/js/src/jit/shared/CodeGenerator-x86-shared.h
+++ b/js/src/jit/shared/CodeGenerator-x86-shared.h
@ -221,6 +221,7 @@ class CodeGeneratorX86Shared : public CodeGeneratorShared
    bool visitSimdSignMaskX4(LSimdSignMaskX4 *ins);
    bool visitSimdSwizzleI(LSimdSwizzleI *lir);
    bool visitSimdSwizzleF(LSimdSwizzleF *lir);
+    bool visitSimdShuffle(LSimdShuffle *lir);
    bool visitSimdUnaryArithIx4(LSimdUnaryArithIx4 *lir);
    bool visitSimdUnaryArithFx4(LSimdUnaryArithFx4 *lir);
    bool visitSimdBinaryCompIx4(LSimdBinaryCompIx4 *lir);
--- a/js/src/jit/shared/MacroAssembler-x86-shared.h
+++ b/js/src/jit/shared/MacroAssembler-x86-shared.h
@ -595,13 +595,12 @@ class MacroAssemblerX86Shared : public Assembler
    void packedDivFloat32(const Operand &src, FloatRegister dest) {
        divps(src, dest);
    }
-    static uint32_t ComputeShuffleMask(SimdLane x, SimdLane y = LaneX,
-                                       SimdLane z = LaneX, SimdLane w = LaneX)
+
+    static uint32_t ComputeShuffleMask(uint32_t x = LaneX, uint32_t y = LaneY,
+                                       uint32_t z = LaneZ, uint32_t w = LaneW)
    {
-        uint32_t r = (uint32_t(w) << 6) |
-                     (uint32_t(z) << 4) |
-                     (uint32_t(y) << 2) |
-                     uint32_t(x);
+        MOZ_ASSERT(x < 4 && y < 4 && z < 4 && w < 4);
+        uint32_t r = (w << 6) | (z << 4) | (y << 2) | (x << 0);
        MOZ_ASSERT(r < 256);
        return r;
    }
@ -626,6 +625,11 @@ class MacroAssemblerX86Shared : public Assembler
            moveAlignedFloat32x4(src, dest);
        shufps(mask, dest, dest);
    }
+    void shuffleMix(uint32_t mask, const Operand &src, FloatRegister dest) {
+        // Note this uses shufps, which incurs a cross-domain penalty on CPUs
+        // where it applies, but that's the way clang and gcc do it.
+        shufps(mask, src, dest);
+    }

    void moveFloatAsDouble(Register src, FloatRegister dest) {
        movd(src, dest);