Mirror of https://github.com/mozilla/gecko-dev.git
Bug 1136226 - Implement Bool8x16.splat and Bool16x8.splat. r=bbouvier
The scalar argument to this operation is expanded into MIR as either -1 or 0 in an Int32, so the 4-lane splat produces the correct result for 8-lane and 16-lane splats too: either an all-zeroes vector or an all-ones vector.
This commit is contained in:
Parent f85827cd00
Commit 9b1061c296
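As background for the change below (an illustrative sketch only, not part of the patch): because a boolean splat's scalar operand reaches the backend as an Int32 that is either 0 or -1, broadcasting it with the existing 4-lane splat already yields the all-zeroes or all-ones bit pattern that Bool8x16 and Bool16x8 expect. A minimal standalone C++ sketch, assuming that expansion and ordinary little-endian lane layout:

// Illustrative sketch; assumes the MIR expansion described above
// (boolean scalar -> Int32 0 or -1) and little-endian memory.
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    int32_t lane = -1;                          // boolean "true" expanded to -1
    int32_t x4[4] = { lane, lane, lane, lane }; // what a 4-lane splat produces

    uint8_t bool8x16[16];
    std::memcpy(bool8x16, x4, sizeof(bool8x16));
    for (int i = 0; i < 16; i++)
        std::printf("%02x ", bool8x16[i]);      // prints "ff" sixteen times:
    std::printf("\n");                          // an all-true Bool8x16
    return 0;
}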
@@ -1161,6 +1161,16 @@ MSimdSplat::foldsTo(TempAllocator& alloc)
         cst = SimdConstant::SplatX4(v);
         break;
       }
+      case MIRType::Int8x16: {
+        int32_t v = op->toConstant()->toInt32();
+        cst = SimdConstant::SplatX16(v);
+        break;
+      }
+      case MIRType::Int16x8: {
+        int32_t v = op->toConstant()->toInt32();
+        cst = SimdConstant::SplatX8(v);
+        break;
+      }
       case MIRType::Int32x4: {
         int32_t v = op->toConstant()->toInt32();
         cst = SimdConstant::SplatX4(v);
@@ -195,6 +195,36 @@ class LSimdUnbox : public LInstructionHelper<1, 1, 1>
     }
 };
 
+// Constructs a SIMD value with 16 equal components (int8x16).
+class LSimdSplatX16 : public LInstructionHelper<1, 1, 0>
+{
+  public:
+    LIR_HEADER(SimdSplatX16)
+    explicit LSimdSplatX16(const LAllocation& v)
+    {
+        setOperand(0, v);
+    }
+
+    MSimdSplat* mir() const {
+        return mir_->toSimdSplat();
+    }
+};
+
+// Constructs a SIMD value with 8 equal components (int16x8).
+class LSimdSplatX8 : public LInstructionHelper<1, 1, 0>
+{
+  public:
+    LIR_HEADER(SimdSplatX8)
+    explicit LSimdSplatX8(const LAllocation& v)
+    {
+        setOperand(0, v);
+    }
+
+    MSimdSplat* mir() const {
+        return mir_->toSimdSplat();
+    }
+};
+
 // Constructs a SIMD value with 4 equal components (e.g. int32x4, float32x4).
 class LSimdSplatX4 : public LInstructionHelper<1, 1, 0>
 {
@@ -20,6 +20,8 @@
     _(Float32) \
     _(SimdBox) \
     _(SimdUnbox) \
+    _(SimdSplatX16) \
+    _(SimdSplatX8) \
     _(SimdSplatX4) \
     _(Simd128Int) \
     _(Simd128Float) \
@@ -1085,6 +1085,7 @@ class AssemblerX86Shared : public AssemblerShared
 
     static bool HasSSE2() { return CPUInfo::IsSSE2Present(); }
     static bool HasSSE3() { return CPUInfo::IsSSE3Present(); }
+    static bool HasSSSE3() { return CPUInfo::IsSSSE3Present(); }
     static bool HasSSE41() { return CPUInfo::IsSSE41Present(); }
     static bool HasPOPCNT() { return CPUInfo::IsPOPCNTPresent(); }
     static bool SupportsFloatingPoint() { return CPUInfo::IsSSE2Present(); }
@@ -2996,6 +2997,19 @@ class AssemblerX86Shared : public AssemblerShared
             MOZ_CRASH("unexpected operand kind");
         }
     }
+
+    void vpshuflw(uint32_t mask, FloatRegister src, FloatRegister dest) {
+        MOZ_ASSERT(HasSSE2());
+        masm.vpshuflw_irr(mask, src.encoding(), dest.encoding());
+    }
+    void vpshufhw(uint32_t mask, FloatRegister src, FloatRegister dest) {
+        MOZ_ASSERT(HasSSE2());
+        masm.vpshufhw_irr(mask, src.encoding(), dest.encoding());
+    }
+    void vpshufb(FloatRegister mask, FloatRegister src, FloatRegister dest) {
+        MOZ_ASSERT(HasSSSE3());
+        masm.vpshufb_rr(mask.encoding(), src.encoding(), dest.encoding());
+    }
     void vmovddup(FloatRegister src, FloatRegister dest) {
         MOZ_ASSERT(HasSSE3());
         masm.vmovddup_rr(src.encoding(), dest.encoding());
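To make the new vpshufb wrapper's role concrete, here is a scalar emulation of PSHUFB's byte-select semantics (an illustrative sketch, not SpiderMonkey code): with an all-zero shuffle mask, every destination byte receives source byte 0, which is exactly the SSSE3 16-lane splat used by the code generator further down.

// Illustrative scalar emulation of PSHUFB (per the SSSE3 definition:
// a set high bit in a mask byte zeroes the lane, otherwise the low
// four bits select a source byte).
#include <cstdint>
#include <cstdio>

static void emulate_pshufb(const uint8_t mask[16], const uint8_t src[16], uint8_t dst[16]) {
    for (int i = 0; i < 16; i++)
        dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0F];
}

int main() {
    uint8_t src[16] = { 0xab };   // byte 0 holds the scalar; the rest is don't-care
    uint8_t mask[16] = { 0 };     // all-zero mask, as produced by zeroSimd128Int below
    uint8_t dst[16];
    emulate_pshufb(mask, src, dst);
    for (int i = 0; i < 16; i++)
        std::printf("%02x ", dst[i]);  // "ab" sixteen times
    std::printf("\n");
    return 0;
}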
@@ -2825,6 +2825,21 @@ public:
         twoByteOpImmSimd("vpshufd", VEX_PD, OP2_PSHUFD_VdqWdqIb, mask, address, invalid_xmm, dst);
     }
 
+    void vpshuflw_irr(uint32_t mask, XMMRegisterID src, XMMRegisterID dst)
+    {
+        twoByteOpImmSimd("vpshuflw", VEX_SD, OP2_PSHUFLW_VdqWdqIb, mask, src, invalid_xmm, dst);
+    }
+
+    void vpshufhw_irr(uint32_t mask, XMMRegisterID src, XMMRegisterID dst)
+    {
+        twoByteOpImmSimd("vpshufhw", VEX_SS, OP2_PSHUFHW_VdqWdqIb, mask, src, invalid_xmm, dst);
+    }
+
+    void vpshufb_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        threeByteOpSimd("vpshufb", VEX_PD, OP3_PSHUFB_VdqWdq, ESCAPE_38, src1, src0, dst);
+    }
+
     void vshufps_irr(uint32_t mask, XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
     {
         twoByteOpImmSimd("vshufps", VEX_PS, OP2_SHUFPS_VpsWpsIb, mask, src1, src0, dst);
@@ -2598,6 +2598,39 @@ CodeGeneratorX86Shared::visitSimdValueFloat32x4(LSimdValueFloat32x4* ins)
     masm.vunpcklps(tmp, output, output);
 }
 
+void
+CodeGeneratorX86Shared::visitSimdSplatX16(LSimdSplatX16* ins)
+{
+    MOZ_ASSERT(SimdTypeToLength(ins->mir()->type()) == 16);
+    Register input = ToRegister(ins->getOperand(0));
+    FloatRegister output = ToFloatRegister(ins->output());
+    masm.vmovd(input, output);
+    if (AssemblerX86Shared::HasSSSE3()) {
+        masm.zeroSimd128Int(ScratchSimd128Reg);
+        masm.vpshufb(ScratchSimd128Reg, output, output);
+    } else {
+        // Use two shifts to duplicate the low 8 bits into the low 16 bits.
+        masm.vpsllw(Imm32(8), output, output);
+        masm.vmovdqa(output, ScratchSimd128Reg);
+        masm.vpsrlw(Imm32(8), ScratchSimd128Reg, ScratchSimd128Reg);
+        masm.vpor(ScratchSimd128Reg, output, output);
+        // Then do an X8 splat.
+        masm.vpshuflw(0, output, output);
+        masm.vpshufd(0, output, output);
+    }
+}
+
+void
+CodeGeneratorX86Shared::visitSimdSplatX8(LSimdSplatX8* ins)
+{
+    MOZ_ASSERT(SimdTypeToLength(ins->mir()->type()) == 8);
+    Register input = ToRegister(ins->getOperand(0));
+    FloatRegister output = ToFloatRegister(ins->output());
+    masm.vmovd(input, output);
+    masm.vpshuflw(0, output, output);
+    masm.vpshufd(0, output, output);
+}
+
 void
 CodeGeneratorX86Shared::visitSimdSplatX4(LSimdSplatX4* ins)
 {
@@ -2607,22 +2640,14 @@ CodeGeneratorX86Shared::visitSimdSplatX4(LSimdSplatX4* ins)
     MOZ_ASSERT(IsSimdType(mir->type()));
     JS_STATIC_ASSERT(sizeof(float) == sizeof(int32_t));
 
-    switch (mir->type()) {
-      case MIRType::Int32x4:
-      case MIRType::Bool32x4: {
-        Register r = ToRegister(ins->getOperand(0));
-        masm.vmovd(r, output);
-        masm.vpshufd(0, output, output);
-        break;
-      }
-      case MIRType::Float32x4: {
+    if (mir->type() == MIRType::Float32x4) {
         FloatRegister r = ToFloatRegister(ins->getOperand(0));
         FloatRegister rCopy = masm.reusedInputFloat32x4(r, output);
         masm.vshufps(0, rCopy, rCopy, output);
-        break;
-      }
-      default:
-        MOZ_CRASH("Unknown SIMD kind");
+    } else {
+        Register r = ToRegister(ins->getOperand(0));
+        masm.vmovd(r, output);
+        masm.vpshufd(0, output, output);
     }
 }
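For readers who do not have the SSE2 fallback in visitSimdSplatX16 in their head, here is a sketch (illustrative only, not SpiderMonkey code) modelled on a single 16-bit lane: the two shifts and the OR duplicate the low byte into both halves of the lane, and vpshuflw(0) followed by vpshufd(0) then broadcasts that lane to the whole vector.

// Illustrative sketch of the SSE2 slow path, one 16-bit lane at a time.
#include <cstdint>
#include <cstdio>

int main() {
    uint16_t lane = 0xcdab;                 // low lane after vmovd; only the low
                                            // 8 bits of the scalar are meaningful
    uint16_t hi  = uint16_t(lane << 8);     // vpsllw $8  -> 0xab00
    uint16_t lo  = uint16_t(hi >> 8);       // vpsrlw $8 on a copy -> 0x00ab
    uint16_t dup = uint16_t(hi | lo);       // vpor -> 0xabab: byte duplicated

    uint16_t x8[8];
    for (int i = 0; i < 8; i++)             // effect of vpshuflw(0) + vpshufd(0):
        x8[i] = dup;                        // every 16-bit lane gets word 0
    for (int i = 0; i < 8; i++)
        std::printf("%04x ", x8[i]);        // "abab" eight times = 16 equal bytes
    std::printf("\n");
    return 0;
}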
@@ -291,6 +291,8 @@ class CodeGeneratorX86Shared : public CodeGeneratorShared
     // SIMD operators
     void visitSimdValueInt32x4(LSimdValueInt32x4* lir);
     void visitSimdValueFloat32x4(LSimdValueFloat32x4* lir);
+    void visitSimdSplatX16(LSimdSplatX16* lir);
+    void visitSimdSplatX8(LSimdSplatX8* lir);
     void visitSimdSplatX4(LSimdSplatX4* lir);
     void visitSimd128Int(LSimd128Int* ins);
     void visitSimd128Float(LSimd128Float* ins);
@@ -217,6 +217,8 @@ enum TwoByteOpcodeID {
     OP2_MOVDQ_VsdWsd = 0x6F,
     OP2_MOVDQ_VdqWdq = 0x6F,
     OP2_PSHUFD_VdqWdqIb = 0x70,
+    OP2_PSHUFLW_VdqWdqIb = 0x70,
+    OP2_PSHUFHW_VdqWdqIb = 0x70,
     OP2_PSLLW_UdqIb = 0x71,
     OP2_PSRAW_UdqIb = 0x71,
     OP2_PSRLW_UdqIb = 0x71,
@@ -281,6 +283,7 @@ enum TwoByteOpcodeID {
 };
 
 enum ThreeByteOpcodeID {
+    OP3_PSHUFB_VdqWdq = 0x00,
     OP3_ROUNDSS_VsdWsd = 0x0A,
     OP3_ROUNDSD_VsdWsd = 0x0B,
     OP3_BLENDVPS_VdqWdq = 0x14,
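A note on the shared opcode byte (background, not part of the patch): PSHUFD, PSHUFLW and PSHUFHW all use opcode 0x70 and are distinguished only by their mandatory prefix (66, F2 and F3 respectively, matching the VEX_PD, VEX_SD and VEX_SS arguments above), which is why the two new enum entries can reuse the same value as OP2_PSHUFD_VdqWdqIb.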
@@ -814,19 +814,27 @@ void
 LIRGeneratorX86Shared::visitSimdSplat(MSimdSplat* ins)
 {
     LAllocation x = useRegisterAtStart(ins->getOperand(0));
-    LSimdSplatX4* lir = new(alloc()) LSimdSplatX4(x);
 
     switch (ins->type()) {
-      case MIRType::Int32x4:
-      case MIRType::Bool32x4:
-        define(lir, ins);
+      case MIRType::Int8x16:
+        define(new (alloc()) LSimdSplatX16(x), ins);
         break;
+      case MIRType::Int16x8:
+        define(new (alloc()) LSimdSplatX8(x), ins);
+        break;
+      case MIRType::Int32x4:
       case MIRType::Float32x4:
-        // (Non-AVX) codegen actually wants the input and the output to be in
-        // the same register, but we can't currently use defineReuseInput
-        // because they have different types (scalar vs vector), so a spill slot
-        // for one may not be suitable for the other.
-        define(lir, ins);
+      case MIRType::Bool8x16:
+      case MIRType::Bool16x8:
+      case MIRType::Bool32x4:
+        // Use the SplatX4 instruction for all boolean splats. Since the input
+        // value is a 32-bit int that is either 0 or -1, the X4 splat gives
+        // the right result for all boolean geometries.
+        // For floats, (Non-AVX) codegen actually wants the input and the output
+        // to be in the same register, but we can't currently use
+        // defineReuseInput because they have different types (scalar vs
+        // vector), so a spill slot for one may not be suitable for the other.
+        define(new (alloc()) LSimdSplatX4(x), ins);
         break;
       default:
        MOZ_CRASH("Unknown SIMD kind");