Bug 1642909 - Do not scalarize i64x2.mul on x64/x86. r=jseward

This is mostly code removal: we remove the platform-specific lowering
and code generation for scalarized i64x2.mul.  In its place we use
Andrew Brown's code for a platform-agnostic SIMD implementation, which
fits very neatly into the existing code-generation pipeline.

Differential Revision: https://phabricator.services.mozilla.com/D78015
Lars T Hansen 2020-06-08 07:57:41 +00:00
Parent 326908e0f8
Commit db2f9302ac
15 changed files: 42 additions, 168 deletions
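For reference, the per-lane algebra that the platform-agnostic lowering relies on (see MacroAssembler::mulInt64x2 below) can be checked in scalar code. A minimal sketch, assuming nothing beyond standard C++; mulLow64 is a hypothetical helper, not SpiderMonkey code:

#include <cassert>
#include <cstdint>

// Writing a = 2^32*aHi + aLo and b = 2^32*bHi + bLo, the low 64 bits of
// a*b are aLo*bLo + ((aLo*bHi + aHi*bLo) << 32): the aHi*bHi term is
// shifted entirely out of the low 64 bits.
static uint64_t mulLow64(uint64_t a, uint64_t b) {
  uint64_t aLo = a & 0xFFFFFFFFu, aHi = a >> 32;
  uint64_t bLo = b & 0xFFFFFFFFu, bHi = b >> 32;
  uint64_t cross = aLo * bHi + aHi * bLo;  // the two 32x32 cross products
  return aLo * bLo + (cross << 32);        // wraps mod 2^64, as i64x2.mul must
}

int main() {
  uint64_t a = 0x123456789ABCDEF0u, b = 0xFEDCBA9876543210u;
  assert(mulLow64(a, b) == a * b);  // unsigned multiply wraps mod 2^64
  return 0;
}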


@@ -256,7 +256,9 @@ Int32Array.rectify = (x) => sign_extend(x,32);
Uint32Array.inputs = Int32Array.inputs;
Uint32Array.rectify = (x) => zero_extend(x,32);
BigInt64Array.inputs = [[1,2],[2,1],[-1,-2],[-2,-1],[2n ** 32n, 2n ** 32n - 5n]];
BigInt64Array.inputs = [[1,2],[2,1],[-1,-2],[-2,-1],[2n ** 32n, 2n ** 32n - 5n],
[(2n ** 38n) / 5n, (2n ** 41n) / 7n],
[-((2n ** 38n) / 5n), (2n ** 41n) / 7n]];
BigInt64Array.rectify = (x) => BigInt(x);
Float32Array.inputs = [[1, -1, 1e10, -1e10],
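The two new input pairs appear chosen so that both 32-bit halves of each operand are nonzero and the full product overflows 64 bits, exercising both the cross-term and wrap-around paths of the new lowering. A quick check of that reading (a sketch; unsigned __int128 is a GCC/Clang extension):

#include <cstdint>
#include <cstdio>

int main() {
  // The magnitudes added to BigInt64Array.inputs above.
  uint64_t a = (uint64_t(1) << 38) / 5;  // 54975581388; high and low halves nonzero
  uint64_t b = (uint64_t(1) << 41) / 7;  // 314146179364; high and low halves nonzero
  unsigned __int128 full = (unsigned __int128)a * b;  // a ~74-bit product
  // i64x2.mul keeps only the low 64 bits, i.e. the product mod 2^64.
  printf("low 64 bits: %llu\n", (unsigned long long)full);
  return 0;
}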


@@ -2077,12 +2077,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
DEFINED_ON(x86_shared);
inline void mulInt64x2(FloatRegister rhs, FloatRegister lhsDest,
                       Register64 temp) DEFINED_ON(x64);
// `temp1` must be edx:eax
inline void mulInt64x2(FloatRegister rhs, FloatRegister lhsDest,
                       Register64 temp1, Register64 temp2, Register temp3)
    DEFINED_ON(x86);
inline void mulInt64x2(FloatRegister rhs, FloatRegister lhsDest,
                       FloatRegister temp) DEFINED_ON(x86_shared);
// Integer Negate


@@ -700,14 +700,3 @@ void CodeGenerator::visitTestI64AndBranch(LTestI64AndBranch* lir) {
masm.testq(input, input);
emitBranch(Assembler::NonZero, lir->ifTrue(), lir->ifFalse());
}
void CodeGenerator::visitWasmI64x2Mul(LWasmI64x2Mul* ins) {
#ifdef ENABLE_WASM_SIMD
FloatRegister lhsDest = ToFloatRegister(ins->lhsDest());
FloatRegister rhs = ToFloatRegister(ins->rhs());
Register64 temp = ToRegister64(ins->temp1());
masm.mulInt64x2(rhs, lhsDest, temp);
#else
MOZ_CRASH("No SIMD");
#endif
}


@@ -404,12 +404,3 @@ void LIRGenerator::visitSignExtendInt64(MSignExtendInt64* ins) {
LSignExtendInt64(useInt64RegisterAtStart(ins->input())),
ins);
}
void LIRGeneratorX64::lowerForWasmI64x2Mul(MWasmBinarySimd128* ins,
MDefinition* lhs, MDefinition* rhs) {
LAllocation lhsDestAlloc = useRegisterAtStart(lhs);
LAllocation rhsAlloc =
lhs != rhs ? useRegister(rhs) : useRegisterAtStart(rhs);
auto* lir = new (alloc()) LWasmI64x2Mul(lhsDestAlloc, rhsAlloc, tempInt64());
defineReuseInput(lir, ins, LWasmI64x2Mul::LhsDest);
}


@@ -28,8 +28,6 @@ class LIRGeneratorX64 : public LIRGeneratorX86Shared {
MDefinition* mir, MDefinition* lhs, MDefinition* rhs);
void lowerForMulInt64(LMulI64* ins, MMul* mir, MDefinition* lhs,
MDefinition* rhs);
void lowerForWasmI64x2Mul(MWasmBinarySimd128* ins, MDefinition* lhs,
MDefinition* rhs);
// Returns a box allocation. reg2 is ignored on 64-bit platforms.
LBoxAllocation useBoxFixed(MDefinition* mir, Register reg1, Register,


@@ -767,22 +767,6 @@ void MacroAssembler::anyTrueSimd128(FloatRegister src, Register dest) {
cmovCCl(NonZero, one, dest);
}
// Integer Multiply
void MacroAssembler::mulInt64x2(FloatRegister rhs, FloatRegister lhsDest,
Register64 temp) {
ScratchRegisterScope t1(*this);
Register t2 = temp.reg;
vpextrq(0, lhsDest, t1);
vpextrq(0, rhs, t2);
imulq(t2, t1);
vpinsrq(0, t1, lhsDest, lhsDest);
vpextrq(1, lhsDest, t1);
vpextrq(1, rhs, t2);
imulq(t2, t1);
vpinsrq(1, t1, lhsDest, lhsDest);
}
// Extract lane as scalar
void MacroAssembler::extractLaneInt64x2(uint32_t lane, FloatRegister src,


@@ -2452,6 +2452,9 @@ void CodeGenerator::visitWasmBinarySimd128(LWasmBinarySimd128* ins) {
case wasm::SimdOp::I64x2Sub:
masm.subInt64x2(rhs, lhsDest);
break;
case wasm::SimdOp::I64x2Mul:
masm.mulInt64x2(rhs, lhsDest, temp1);
break;
case wasm::SimdOp::F32x4Add:
masm.addFloat32x4(rhs, lhsDest);
break;


@@ -361,43 +361,6 @@ class LWasmBinarySimd128 : public LInstructionHelper<1, 2, 2> {
wasm::SimdOp simdOp() const { return mir_->toWasmBinarySimd128()->simdOp(); }
};
// (v128, v128) -> v128 effect-free operations for i64x2.mul
// lhs and dest are the same.
// x64: one i64 temp.
// x86: two i64 temps and one i32 temp
class LWasmI64x2Mul : public LInstructionHelper<1, 2, INT64_PIECES * 2 + 1> {
public:
LIR_HEADER(WasmI64x2Mul)
static constexpr uint32_t LhsDest = 0;
static constexpr uint32_t Rhs = 1;
LWasmI64x2Mul(const LAllocation& lhsDest, const LAllocation& rhs,
const LInt64Definition& temp)
: LInstructionHelper(classOpcode) {
setOperand(LhsDest, lhsDest);
setOperand(Rhs, rhs);
setInt64Temp(0, temp);
}
LWasmI64x2Mul(const LAllocation& lhsDest, const LAllocation& rhs,
const LInt64Definition& temp1, const LInt64Definition& temp2,
const LDefinition& temp3)
: LInstructionHelper(classOpcode) {
setOperand(LhsDest, lhsDest);
setOperand(Rhs, rhs);
setInt64Temp(0, temp1);
setInt64Temp(INT64_PIECES, temp2);
setTemp(INT64_PIECES * 2, temp3);
}
const LAllocation* lhsDest() { return getOperand(LhsDest); }
const LAllocation* rhs() { return getOperand(Rhs); }
const LInt64Definition temp1() { return getInt64Temp(0); }
const LInt64Definition temp2() { return getInt64Temp(INT64_PIECES); }
const LDefinition* temp3() { return getTemp(INT64_PIECES * 2); }
};
// (v128, i32) -> v128 effect-free variable-width shift operations
// lhs and dest are the same.
// temp0 is a GPR (if in use).


@@ -736,6 +736,7 @@ void LIRGenerator::visitWasmBinarySimd128(MWasmBinarySimd128* ins) {
}
case wasm::SimdOp::F32x4Max:
case wasm::SimdOp::F64x2Max:
case wasm::SimdOp::I64x2Mul:
case wasm::SimdOp::V8x16Swizzle:
tempReg0 = tempSimd128();
break;
@@ -758,16 +759,12 @@ void LIRGenerator::visitWasmBinarySimd128(MWasmBinarySimd128* ins) {
break;
}
if (ins->simdOp() == wasm::SimdOp::I64x2Mul) {
lowerForWasmI64x2Mul(ins, lhs, rhs);
} else {
LAllocation lhsDestAlloc = useRegisterAtStart(lhs);
LAllocation rhsAlloc =
lhs != rhs ? useRegister(rhs) : useRegisterAtStart(rhs);
auto* lir = new (alloc())
LWasmBinarySimd128(lhsDestAlloc, rhsAlloc, tempReg0, tempReg1);
defineReuseInput(lir, ins, LWasmBinarySimd128::LhsDest);
}
LAllocation lhsDestAlloc = useRegisterAtStart(lhs);
LAllocation rhsAlloc =
lhs != rhs ? useRegister(rhs) : useRegisterAtStart(rhs);
auto* lir = new (alloc())
LWasmBinarySimd128(lhsDestAlloc, rhsAlloc, tempReg0, tempReg1);
defineReuseInput(lir, ins, LWasmBinarySimd128::LhsDest);
}
void LIRGenerator::visitWasmShiftSimd128(MWasmShiftSimd128* ins) {


@@ -1432,6 +1432,28 @@ void MacroAssembler::mulInt32x4(FloatRegister rhs, FloatRegister lhsDest) {
vpmulld(Operand(rhs), lhsDest, lhsDest);
}
void MacroAssembler::mulInt64x2(FloatRegister rhs, FloatRegister lhsDest,
FloatRegister temp) {
ScratchSimd128Scope temp2(*this);
// lhsDest = <D C> <B A>
// rhs = <H G> <F E>
// result = <(DG+CH)_low+CG_high CG_low> <(BE+AF)_low+AE_high AE_low>
moveSimd128(lhsDest, temp); // temp = <D C> <B A>
vpsrlq(Imm32(32), temp, temp); // temp = <0 D> <0 B>
vpmuludq(rhs, temp, temp); // temp = <DG> <BE>
moveSimd128(rhs, temp2); // temp2 = <H G> <F E>
vpsrlq(Imm32(32), temp2, temp2); // temp2 = <0 H> <0 F>
vpmuludq(lhsDest, temp2, temp2); // temp2 = <CH> <AF>
vpaddq(Operand(temp), temp2, temp2); // temp2 = <DG+CH> <BE+AF>
vpsllq(Imm32(32), temp2, temp2); // temp2 = <(DG+CH)_low 0>
// <(BE+AF)_low 0>
vpmuludq(rhs, lhsDest, lhsDest); // lhsDest = <CG_high CG_low>
// <AE_high AE_low>
vpaddq(Operand(temp2), lhsDest, lhsDest); // lhsDest =
// <(DG+CH)_low+CG_high CG_low>
// <(BE+AF)_low+AE_high AE_low>
}
// Integer negate
void MacroAssembler::negInt8x16(FloatRegister src, FloatRegister dest) {
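The lane comments in this hunk can be cross-checked against a scalar transcription of the same instruction sequence. A sketch under the assumption (true of pmuludq) that each vpmuludq multiplies the low 32 bits of every 64-bit lane into a full 64-bit product; mulInt64x2Model is a hypothetical name, not SpiderMonkey code:

#include <cassert>
#include <cstdint>

static void mulInt64x2Model(uint64_t lhsDest[2], const uint64_t rhs[2]) {
  for (int lane = 0; lane < 2; lane++) {
    uint64_t lhs = lhsDest[lane], r = rhs[lane];
    uint64_t temp = lhs >> 32;                            // vpsrlq $32, temp
    temp = (temp & 0xFFFFFFFFu) * (r & 0xFFFFFFFFu);      // vpmuludq rhs, temp
    uint64_t temp2 = r >> 32;                             // vpsrlq $32, temp2
    temp2 = (temp2 & 0xFFFFFFFFu) * (lhs & 0xFFFFFFFFu);  // vpmuludq lhsDest, temp2
    temp2 += temp;                                        // vpaddq temp, temp2
    temp2 <<= 32;                                         // vpsllq $32, temp2
    lhs = (lhs & 0xFFFFFFFFu) * (r & 0xFFFFFFFFu);        // vpmuludq rhs, lhsDest
    lhsDest[lane] = lhs + temp2;                          // vpaddq temp2, lhsDest
  }
}

int main() {
  uint64_t lhs[2] = {0x0000000280000001u, 0xFFFFFFFFFFFFFFFFu};
  uint64_t rhs[2] = {0x0000000300000007u, 0x0000000000000002u};
  uint64_t expect[2] = {lhs[0] * rhs[0], lhs[1] * rhs[1]};  // wrap mod 2^64
  mulInt64x2Model(lhs, rhs);
  assert(lhs[0] == expect[0] && lhs[1] == expect[1]);
  return 0;
}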


@@ -1077,16 +1077,3 @@ void CodeGenerator::visitTestI64AndBranch(LTestI64AndBranch* lir) {
masm.testl(input.low, input.low);
emitBranch(Assembler::NonZero, lir->ifTrue(), lir->ifFalse());
}
void CodeGenerator::visitWasmI64x2Mul(LWasmI64x2Mul* ins) {
#ifdef ENABLE_WASM_SIMD
FloatRegister lhsDest = ToFloatRegister(ins->lhsDest());
FloatRegister rhs = ToFloatRegister(ins->rhs());
Register64 temp1 = ToRegister64(ins->temp1());
Register64 temp2 = ToRegister64(ins->temp2());
Register temp3 = ToRegister(ins->temp3());
masm.mulInt64x2(rhs, lhsDest, temp1, temp2, temp3);
#else
MOZ_CRASH("No SIMD");
#endif
}


@@ -681,14 +681,3 @@ void LIRGenerator::visitSignExtendInt64(MSignExtendInt64* ins) {
LInt64Allocation(LAllocation(AnyRegister(edx)),
LAllocation(AnyRegister(eax))));
}
void LIRGeneratorX86::lowerForWasmI64x2Mul(MWasmBinarySimd128* ins,
MDefinition* lhs, MDefinition* rhs) {
LAllocation lhsDestAlloc = useRegisterAtStart(lhs);
LAllocation rhsAlloc =
lhs != rhs ? useRegister(rhs) : useRegisterAtStart(rhs);
auto* lir = new (alloc())
LWasmI64x2Mul(lhsDestAlloc, rhsAlloc,
tempInt64Fixed(Register64(edx, eax)), tempInt64(), temp());
defineReuseInput(lir, ins, LWasmI64x2Mul::LhsDest);
}


@@ -47,8 +47,6 @@ class LIRGeneratorX86 : public LIRGeneratorX86Shared {
MDefinition* mir, MDefinition* lhs, MDefinition* rhs);
void lowerForMulInt64(LMulI64* ins, MMul* mir, MDefinition* lhs,
MDefinition* rhs);
void lowerForWasmI64x2Mul(MWasmBinarySimd128* ins, MDefinition* lhs,
MDefinition* rhs);
void lowerDivI64(MDiv* div);
void lowerModI64(MMod* mod);


@@ -997,19 +997,6 @@ void MacroAssembler::anyTrueSimd128(FloatRegister src, Register dest) {
bind(&done);
}
void MacroAssembler::mulInt64x2(FloatRegister rhs, FloatRegister lhsDest,
Register64 temp1, Register64 temp2,
Register temp3) {
extractLaneInt64x2(0, lhsDest, temp1);
extractLaneInt64x2(0, rhs, temp2);
mul64(temp2, temp1, temp3);
replaceLaneInt64x2(0, temp1, lhsDest);
extractLaneInt64x2(1, lhsDest, temp1);
extractLaneInt64x2(1, rhs, temp2);
mul64(temp2, temp1, temp3);
replaceLaneInt64x2(1, temp1, lhsDest);
}
void MacroAssembler::extractLaneInt64x2(uint32_t lane, FloatRegister src,
Register64 dest) {
vpextrd(2 * lane, src, dest.low);


@@ -8066,7 +8066,6 @@ class BaseCompiler final : public BaseCompilerInterface {
MOZ_MUST_USE bool emitBitselect();
MOZ_MUST_USE bool emitVectorShuffle();
MOZ_MUST_USE bool emitVectorShiftRightI64x2();
MOZ_MUST_USE bool emitVectorMulI64x2();
#endif
};
@@ -12693,6 +12692,11 @@ static void MulF32x4(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
masm.mulFloat32x4(rs, rsd);
}
static void MulI64x2(MacroAssembler& masm, RegV128 rs, RegV128 rsd,
RegV128 temp) {
masm.mulInt64x2(rs, rsd, temp);
}
static void MulF64x2(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
masm.mulFloat64x2(rs, rsd);
}
@@ -13415,41 +13419,6 @@ bool BaseCompiler::emitVectorShiftRightI64x2() {
return true;
}
// Must be scalarized on x86/x64 and requires different temp regs on the
// two architectures.
bool BaseCompiler::emitVectorMulI64x2() {
Nothing unused_a, unused_b;
if (!iter_.readBinary(ValType::V128, &unused_a, &unused_b)) {
return false;
}
if (deadCode_) {
return true;
}
# if defined(JS_CODEGEN_X64)
emitVectorBinopWithTemp<RegI64>(
[](MacroAssembler& masm, RegV128 rs, RegV128 rsd, RegI64 temp) {
masm.mulInt64x2(rs, rsd, temp);
});
# elif defined(JS_CODEGEN_X86)
RegV128 r, rs;
pop2xV128(&r, &rs);
needI64(specific_.edx_eax);
RegI64 temp1 = specific_.edx_eax;
RegI64 temp2 = needI64();
ScratchI32 temp3(*this);
masm.mulInt64x2(rs, r, temp1, temp2, temp3);
freeV128(rs);
freeI64(temp1);
freeI64(temp2);
pushV128(r);
# else
MOZ_CRASH("No porting API for MulI64x2");
# endif
return true;
}
#endif
bool BaseCompiler::emitBody() {
@@ -14463,7 +14432,7 @@ bool BaseCompiler::emitBody() {
case uint32_t(SimdOp::I64x2Sub):
CHECK_NEXT(dispatchVectorBinary(emitVectorBinop, SubI64x2));
case uint32_t(SimdOp::I64x2Mul):
CHECK_NEXT(emitVectorMulI64x2());
CHECK_NEXT(dispatchVectorBinary(emitVectorBinopWithTemp, MulI64x2));
case uint32_t(SimdOp::F32x4Add):
CHECK_NEXT(dispatchVectorBinary(emitVectorBinop, AddF32x4));
case uint32_t(SimdOp::F32x4Sub):