Bug 1021716: SIMD x86-x64: Implement generic binary shuffle; r=sunfish

2014-10-21 15:39:15 +02:00 · 2014-10-21 15:39:15 +02:00 · ba86b71604
--- a/js/src/jit/LIR-Common.h
+++ b/js/src/jit/LIR-Common.h
@ -266,12 +266,16 @@ class LSimdSwizzleF : public LSimdSwizzleBase
 };

 // Base class for both int32x4 and float32x4 shuffle instructions.
-class LSimdShuffle : public LInstructionHelper<1, 2, 0>
+class LSimdShuffle : public LInstructionHelper<1, 2, 1>
 {
  public:
    LIR_HEADER(SimdShuffle);
-    LSimdShuffle()
-    {}
+    LSimdShuffle(const LAllocation &lhs, const LAllocation &rhs, const LDefinition &temp)
+    {
+        setOperand(0, lhs);
+        setOperand(1, rhs);
+        setTemp(0, temp);
+    }

    const LAllocation *lhs() {
        return getOperand(0);
@ -279,6 +283,9 @@ class LSimdShuffle : public LInstructionHelper<1, 2, 0>
    const LAllocation *rhs() {
        return getOperand(1);
    }
+    const LDefinition *temp() {
+        return getTemp(0);
+    }

    int32_t laneX() const { return mir_->toSimdShuffle()->laneX(); }
    int32_t laneY() const { return mir_->toSimdShuffle()->laneY(); }
--- a/js/src/jit/Lowering.cpp
+++ b/js/src/jit/Lowering.cpp
@ -3836,16 +3836,20 @@ LIRGenerator::visitSimdShuffle(MSimdShuffle *ins)
    MOZ_ASSERT(IsSimdType(ins->lhs()->type()));
    MOZ_ASSERT(IsSimdType(ins->rhs()->type()));
    MOZ_ASSERT(IsSimdType(ins->type()));
+    MOZ_ASSERT(ins->type() == MIRType_Int32x4 || ins->type() == MIRType_Float32x4);

-    if (ins->type() == MIRType_Int32x4 || ins->type() == MIRType_Float32x4) {
-        MDefinition *lhs = ins->lhs();
-        MDefinition *rhs = ins->rhs();
-        LSimdShuffle *lir = new (alloc()) LSimdShuffle;
-        return lowerForFPU(lir, ins, lhs, rhs);
-    }
+    bool zFromLHS = ins->laneZ() < 4;
+    bool wFromLHS = ins->laneW() < 4;
+    uint32_t lanesFromLHS = (ins->laneX() < 4) + (ins->laneY() < 4) + zFromLHS + wFromLHS;

-    MOZ_CRASH("Unknown SIMD kind when getting lane");
-    return false;
+    LUse lhs = useRegisterAtStart(ins->lhs());
+    LUse rhs = useRegister(ins->rhs());
+
+    // See codegen for requirements details.
+    LDefinition temp = (lanesFromLHS == 3) ? tempCopy(ins->rhs(), 1) : LDefinition::BogusTemp();
+
+    LSimdShuffle *lir = new (alloc()) LSimdShuffle(lhs, rhs, temp);
+    return defineReuseInput(lir, ins, 0);
 }

 bool
--- a/js/src/jit/MIR.h
+++ b/js/src/jit/MIR.h
@ -1673,6 +1673,18 @@ class MSimdShuffle : public MBinaryInstruction, public MSimdShuffleBase
                                  MIRType type, int32_t laneX, int32_t laneY, int32_t laneZ,
                                  int32_t laneW)
    {
+        // Swap operands so that new lanes come from LHS in majority.
+        // In the balanced case, swap operands if needs be, in order to be able
+        // to do only one shufps on x86.
+        unsigned lanesFromLHS = (laneX < 4) + (laneY < 4) + (laneZ < 4) + (laneW < 4);
+        if (lanesFromLHS < 2 || (lanesFromLHS == 2 && laneX >= 4 && laneY >=4)) {
+            laneX = (laneX + 4) % 8;
+            laneY = (laneY + 4) % 8;
+            laneZ = (laneZ + 4) % 8;
+            laneW = (laneW + 4) % 8;
+            mozilla::Swap(lhs, rhs);
+        }
+
        return new(alloc) MSimdShuffle(lhs, rhs, type, laneX, laneY, laneZ, laneW);
    }

--- a/js/src/jit/shared/CodeGenerator-x86-shared.cpp
+++ b/js/src/jit/shared/CodeGenerator-x86-shared.cpp
@ -2416,12 +2416,128 @@ bool
 CodeGeneratorX86Shared::visitSimdShuffle(LSimdShuffle *ins)
 {
    FloatRegister lhs = ToFloatRegister(ins->lhs());
-    Operand rhs = ToOperand(ins->rhs());
-    MOZ_ASSERT(ToFloatRegister(ins->output()) == lhs);
+    FloatRegister rhs = ToFloatRegister(ins->rhs());
+    FloatRegister out = ToFloatRegister(ins->output());
+    MOZ_ASSERT(out == lhs); // define reuse input

-    uint32_t mask = MacroAssembler::ComputeShuffleMask(ins->laneX(), ins->laneY(), ins->laneZ() - 4,
-                                                       ins->laneW() - 4);
-    masm.shuffleMix(mask, rhs, lhs);
+    uint32_t x = ins->laneX();
+    uint32_t y = ins->laneY();
+    uint32_t z = ins->laneZ();
+    uint32_t w = ins->laneW();
+
+    // Check that lanes come from LHS in majority:
+    unsigned numLanesFromLHS = (x < 4) + (y < 4) + (z < 4) + (w < 4);
+    MOZ_ASSERT(numLanesFromLHS >= 2);
+
+    // When reading this method, remember that shufps takes the two first
+    // inputs of the destination operand (right operand) and the two last
+    // inputs of the source operand (left operand).
+    //
+    // Legend for explanations:
+    // - L: LHS
+    // - R: RHS
+    // - T: temporary
+
+    uint32_t mask;
+
+    // Trivial cases: all lanes come from only one vector (Lx, Ly, Lz, Lz).
+    if (numLanesFromLHS == 4) {
+        mask = MacroAssembler::ComputeShuffleMask(x, y, z, w);
+        masm.shufps(mask, lhs, out);
+        return true;
+    }
+
+    // One element of the second, all other elements of the first
+    if (numLanesFromLHS == 3) {
+        unsigned firstMask = -1, secondMask = -1;
+
+        FloatRegister rhsCopy = ToFloatRegister(ins->temp());
+
+        if (x < 4 && y < 4) {
+            if (w >= 4) {
+                w %= 4;
+                // T = (Rw Rw Lz Lz) = shufps(firstMask, lhs, rhs)
+                firstMask = MacroAssembler::ComputeShuffleMask(w, w, z, z);
+                // (Lx Ly Lz Rw) = (Lx Ly Tz Tx) = shufps(secondMask, T, lhs)
+                secondMask = MacroAssembler::ComputeShuffleMask(x, y, LaneZ, LaneX);
+            } else {
+                MOZ_ASSERT(z >= 4);
+                z %= 4;
+                // T = (Rz Rz Lw Lw) = shufps(firstMask, lhs, rhs)
+                firstMask = MacroAssembler::ComputeShuffleMask(z, z, w, w);
+                // (Lx Ly Rz Lw) = (Lx Ly Tx Tz) = shufps(secondMask, T, lhs)
+                secondMask = MacroAssembler::ComputeShuffleMask(x, y, LaneX, LaneZ);
+            }
+
+            masm.shufps(firstMask, lhs, rhsCopy);
+            masm.shufps(secondMask, rhsCopy, lhs);
+            return true;
+        }
+
+        MOZ_ASSERT(z < 4 && w < 4);
+
+        if (y >= 4) {
+            y %= 4;
+            // T = (Ry Ry Lx Lx) = shufps(firstMask, lhs, rhs)
+            firstMask = MacroAssembler::ComputeShuffleMask(y, y, x, x);
+            // (Lx Ry Lz Lw) = (Tz Tx Lz Lw) = shufps(secondMask, lhs, T)
+            secondMask = MacroAssembler::ComputeShuffleMask(LaneZ, LaneX, z, w);
+        } else {
+            MOZ_ASSERT(x >= 4);
+            x %= 4;
+            // T = (Rx Rx Ly Ly) = shufps(firstMask, lhs, rhs)
+            firstMask = MacroAssembler::ComputeShuffleMask(x, x, y, y);
+            // (Rx Ly Lz Lw) = (Tx Tz Lz Lw) = shufps(secondMask, lhs, T)
+            secondMask = MacroAssembler::ComputeShuffleMask(LaneX, LaneZ, z, w);
+        }
+
+        masm.shufps(firstMask, lhs, rhsCopy);
+        masm.shufps(secondMask, lhs, rhsCopy);
+        masm.movaps(rhsCopy, out);
+        return true;
+    }
+
+    // Two elements from one vector, two other elements from the other
+    MOZ_ASSERT(numLanesFromLHS == 2);
+
+    // In one shufps
+    if (x < 4 && y < 4) {
+        mask = MacroAssembler::ComputeShuffleMask(x, y, z % 4, w % 4);
+        masm.shufps(mask, rhs, out);
+        return true;
+    }
+
+    // At creation, we should have explicitly swapped in this case.
+    MOZ_ASSERT(!(z >= 4 && w >= 4));
+
+    // In two shufps, for the most generic case:
+    uint32_t firstMask[4], secondMask[4];
+    unsigned i = 0, j = 2, k = 0;
+
+#define COMPUTE_MASK(lane)       \
+    if (lane >= 4) {             \
+        firstMask[j] = lane % 4; \
+        secondMask[k++] = j++;   \
+    } else {                     \
+        firstMask[i] = lane;     \
+        secondMask[k++] = i++;   \
+    }
+
+    COMPUTE_MASK(x)
+    COMPUTE_MASK(y)
+    COMPUTE_MASK(z)
+    COMPUTE_MASK(w)
+#undef COMPUTE_MASK
+
+    MOZ_ASSERT(i == 2 && j == 4 && k == 4);
+
+    mask = MacroAssembler::ComputeShuffleMask(firstMask[0], firstMask[1],
+                                              firstMask[2], firstMask[3]);
+    masm.shufps(mask, rhs, lhs);
+
+    mask = MacroAssembler::ComputeShuffleMask(secondMask[0], secondMask[1],
+                                              secondMask[2], secondMask[3]);
+    masm.shufps(mask, lhs, lhs);
    return true;
 }