Bug 1065339 - IonMonkey: Use vblendvps for SIMD minNum/maxNum r=jandem

2014-12-08 18:20:30 -08:00 · 2014-12-08 18:20:30 -08:00 · 1dffcd018f
--- a/js/src/jit/shared/Assembler-x86-shared.h
+++ b/js/src/jit/shared/Assembler-x86-shared.h
@ -2234,6 +2234,23 @@ class AssemblerX86Shared : public AssemblerShared
            MOZ_CRASH("unexpected operand kind");
        }
    }
+    void vblendvps(FloatRegister mask, FloatRegister src1, FloatRegister src0, FloatRegister dest) { // All-register form of vblendvps.
+        MOZ_ASSERT(HasSSE41()); // Asserts that HasSSE41() holds before anything is emitted.
+        masm.vblendvps_rr(mask.code(), src1.code(), src0.code(), dest.code()); // Forwards the four register codes unchanged, in the same order.
+    }
+    void vblendvps(FloatRegister mask, const Operand &src1, FloatRegister src0, FloatRegister dest) { // Operand form: src1 may be an FP register or a base+displacement memory operand.
+        MOZ_ASSERT(HasSSE41()); // Asserts that HasSSE41() holds before anything is emitted.
+        switch (src1.kind()) { // Dispatch on how src1 is addressed.
+          case Operand::FPREG: // src1 is a register: emit the register-register encoding.
+            masm.vblendvps_rr(mask.code(), src1.fpu(), src0.code(), dest.code());
+            break;
+          case Operand::MEM_REG_DISP: // src1 is memory, described by src1.disp() and src1.base().
+            masm.vblendvps_mr(mask.code(), src1.disp(), src1.base(), src0.code(), dest.code());
+            break;
+          default: // Every other operand kind is rejected.
+            MOZ_CRASH("unexpected operand kind");
+        }
+    }
    void movsldup(FloatRegister src, FloatRegister dest) {
        MOZ_ASSERT(HasSSE3());
        masm.movsldup_rr(src.code(), dest.code());
--- a/js/src/jit/shared/BaseAssembler-x86-shared.h
+++ b/js/src/jit/shared/BaseAssembler-x86-shared.h
@ -3788,6 +3788,9 @@ public:
    void vblendvps_rr(XMMRegisterID mask, XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst) {
        vblendvOpSimd(mask, src1, src0, dst);
    }
+    void vblendvps_mr(XMMRegisterID mask, int offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst) { // Memory-source form: one source is given as offset + base register.
+        vblendvOpSimd(mask, offset, base, src0, dst); // Delegates to the (offset, base) overload of vblendvOpSimd.
+    }

    void movsldup_rr(XMMRegisterID src, XMMRegisterID dst)
    {
@ -4397,6 +4400,25 @@ private:
                                 mask, (RegisterID)rm, src0, dst);
    }

+    void vblendvOpSimd(XMMRegisterID mask, int offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst) // Emits blendvps/vblendvps with a memory source at offset(base).
+    {
+        if (useLegacySSEEncodingForVblendv(mask, src0, dst)) { // Legacy path, chosen by the helper from mask, src0 and dst.
+            spew("blendvps   %s0x%x(%s), %s",
+                 PRETTY_PRINT_OFFSET(offset), nameIReg(base), nameFPReg(src0)); // Logs the memory operand and src0 only.
+            // Even though a "ps" instruction, vblendv is encoded with the "pd" prefix.
+            m_formatter.legacySSEPrefix(VEX_PD);
+            m_formatter.threeByteOp(OP3_BLENDVPS_VdqWdq, ESCAPE_BLENDVPS, offset, base, src0); // Only offset, base and src0 are passed; mask and dst are not encoded on this path.
+            return;
+        }
+
+        spew("vblendvps  %s, %s0x%x(%s), %s, %s",
+             nameFPReg(mask), PRETTY_PRINT_OFFSET(offset), nameIReg(base),
+             nameFPReg(src0), nameFPReg(dst)); // Logs all operands: mask, memory source, src0, dst.
+        // Even though a "ps" instruction, vblendv is encoded with the "pd" prefix.
+        m_formatter.vblendvOpVex(VEX_PD, OP3_VBLENDVPS_VdqWdq, ESCAPE_VBLENDVPS,
+                                 mask, offset, base, src0, dst); // VEX path: mask and dst are passed explicitly alongside the memory operand.
+    }
+
 #ifdef JS_CODEGEN_X64
    void twoByteOpSimd64(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
                         XMMRegisterID rm, XMMRegisterID src0, XMMRegisterID dst)
@ -4733,6 +4755,21 @@ private:
            immediate8(mask << 4);
        }

+        void vblendvOpVex(VexOperandType ty, ThreeByteOpcodeID opcode, ThreeByteEscape escape, // Memory-operand variant: emits the VEX-prefixed opcode, a ModRM for offset(base), then an immediate byte carrying the mask.
+                          XMMRegisterID mask, int offset, RegisterID base, XMMRegisterID src0, int reg)
+        {
+            int r = (reg >> 3), x = 0, b = (base >> 3); // r and b take the bits of reg and base above the low three; x is 0 since this form takes no index register.
+            int m = 0, w = 0, v = src0, l = 0; // v carries src0; w and l are left at 0.
+            switch (escape) { // Map the escape byte to the m value handed to threeOpVex.
+              case 0x38: m = 2; break; // 0x0F 0x38
+              case 0x3A: m = 3; break; // 0x0F 0x3A
+              default: MOZ_CRASH("unexpected escape");
+            }
+            threeOpVex(ty, r, x, b, m, w, v, l, opcode); // Prefix and opcode first...
+            memoryModRM(offset, base, reg); // ...then the memory operand with reg in the ModRM...
+            immediate8(mask << 4); // ...and finally the mask register id, shifted into the upper four bits of the immediate.
+        }
+
 #ifdef JS_CODEGEN_X64
        // Quad-word-sized operands:
        //
--- a/js/src/jit/shared/CodeGenerator-x86-shared.cpp
+++ b/js/src/jit/shared/CodeGenerator-x86-shared.cpp
@ -2704,12 +2704,18 @@ CodeGeneratorX86Shared::visitSimdBinaryArithFx4(LSimdBinaryArithFx4 *ins)
        FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, mask);
        masm.vcmpneqps(rhs, rhsCopy, mask);

-        // Emulates blendv
-        if (lhs != output)
-            masm.movaps(lhs, output);
-        masm.andps(Operand(mask), output);
-        masm.andnps(Operand(tmp), mask);
-        masm.orps(Operand(mask), output);
+        if (AssemblerX86Shared::HasAVX()) {
+            masm.vblendvps(mask, lhs, tmp, output);
+        } else {
+            // Emulate vblendvps.
+            // With SSE4.1 we could use blendvps; however, it's awkward since
+            // it requires the mask to be in xmm0.
+            if (lhs != output)
+                masm.movaps(lhs, output);
+            masm.andps(Operand(mask), output);
+            masm.andnps(Operand(tmp), mask);
+            masm.orps(Operand(mask), output);
+        }
        return;
      }
      case MSimdBinaryArith::MaxNum: {
@ -2732,12 +2738,18 @@ CodeGeneratorX86Shared::visitSimdBinaryArithFx4(LSimdBinaryArithFx4 *ins)
        FloatRegister rhsCopy = masm.reusedInputAlignedFloat32x4(rhs, mask);
        masm.vcmpneqps(rhs, rhsCopy, mask);

-        // Emulates blendv
-        if (lhs != output)
-            masm.movaps(lhs, output);
-        masm.andps(Operand(mask), output);
-        masm.andnps(Operand(tmp), mask);
-        masm.orps(Operand(mask), output);
+        if (AssemblerX86Shared::HasAVX()) {
+            masm.vblendvps(mask, lhs, tmp, output);
+        } else {
+            // Emulate vblendvps.
+            // With SSE4.1 we could use blendvps; however, it's awkward since
+            // it requires the mask to be in xmm0.
+            if (lhs != output)
+                masm.movaps(lhs, output);
+            masm.andps(Operand(mask), output);
+            masm.andnps(Operand(tmp), mask);
+            masm.orps(Operand(mask), output);
+        }
        return;
      }
    }