Bug 1753115 - Refactor usage of moveSimd128. r=jseward

* Replace with moveSimd128XXXIfNotAVX when possible (see the sketch below)
* Fix I16x8ExtaddPairwiseI8x16S lowering; relax lowering for AVX
* Add postMoveSimd128IntIfNotAVX utility
* Add extadd_pairwise tests
* Fix neg-abs-not codegen tests
* Fix shift by imm8 VEX encoding

Differential Revision: https://phabricator.services.mozilla.com/D137581
Yury Delendik 2022-02-11 13:27:12 +00:00
Parent e661e480a4
Commit bb2a780b8a
14 changed files with 227 additions and 117 deletions
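
The helpers this patch switches to share one pattern: skip the explicit register-to-register move whenever AVX's non-destructive three-operand (VEX) encodings can write the destination directly. A minimal sketch of that pattern, not the verbatim implementation, assuming a body consistent with the selectDestIfAVX helper and the "moveSimd128Int(src, dest); return dest;" tail visible in the hunks below:

  // Sketch only: with AVX, an instruction can read src and write dest in one
  // three-operand form, so no pre-move is needed and src itself is returned;
  // without AVX, src is copied into dest first and dest is used as the
  // (destructive) operand.
  FloatRegister moveSimd128IntIfNotAVX(FloatRegister src, FloatRegister dest) {
    if (HasAVX()) {
      return src;
    }
    moveSimd128Int(src, dest);
    return dest;
  }

  // Typical call site, as seen throughout the diff:
  //   src = moveSimd128IntIfNotAVX(src, dest);
  //   vpsraw(Imm32(15), src, dest);  // one move saved when AVX is available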

View file

@ -433,3 +433,30 @@ codegenTestX64_v128xLITERAL_v128_avxhack(
`c5 f1 eb 05 ${RIPRADDR} vporx ${RIPR}, %xmm1, %xmm0`],
['v128.xor', '(v128.const i8x16 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2)',
`c5 f1 ef 05 ${RIPRADDR} vpxorx ${RIPR}, %xmm1, %xmm0`]]);
// Shift by constant encodings
codegenTestX64_v128xLITERAL_v128_avxhack(
[['i8x16.shl', '(i32.const 2)', `
c5 f1 fc c1 vpaddb %xmm1, %xmm1, %xmm0
66 0f fc c0 paddb %xmm0, %xmm0`],
['i8x16.shl', '(i32.const 4)', `
c5 f1 db 05 ${RIPRADDR} vpandx ${RIPR}, %xmm1, %xmm0
66 0f 71 f0 04 psllw \\$0x04, %xmm0`],
['i16x8.shl', '(i32.const 1)',
'c5 f9 71 f1 01 vpsllw \\$0x01, %xmm1, %xmm0'],
['i16x8.shr_s', '(i32.const 3)',
'c5 f9 71 e1 03 vpsraw \\$0x03, %xmm1, %xmm0'],
['i16x8.shr_u', '(i32.const 2)',
'c5 f9 71 d1 02 vpsrlw \\$0x02, %xmm1, %xmm0'],
['i32x4.shl', '(i32.const 5)',
'c5 f9 72 f1 05 vpslld \\$0x05, %xmm1, %xmm0'],
['i32x4.shr_s', '(i32.const 2)',
'c5 f9 72 e1 02 vpsrad \\$0x02, %xmm1, %xmm0'],
['i32x4.shr_u', '(i32.const 5)',
'c5 f9 72 d1 05 vpsrld \\$0x05, %xmm1, %xmm0'],
['i64x2.shr_s', '(i32.const 7)', `
c5 79 70 f9 f5 vpshufd \\$0xF5, %xmm1, %xmm15
66 41 0f 72 e7 1f psrad \\$0x1F, %xmm15
c4 c1 71 ef c7 vpxor %xmm15, %xmm1, %xmm0
66 0f 73 d0 07 psrlq \\$0x07, %xmm0
66 41 0f ef c7 pxor %xmm15, %xmm0`]]);
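// Informal note on the i64x2.shr_s expansion above: x86 has no 64-bit
// arithmetic vector shift, so the sign is replicated first and the shift is
// done logically.
//   s = each lane's sign smeared over all 64 bits
//       (vpshufd 0xF5 copies the lane's high dword into both dword slots,
//        psrad 31 then smears that dword's sign bit)
//   result = ((x ^ s) >> n) ^ s, with >> a logical shift.
// For s == 0 this is a plain logical shift; for s == all-ones it computes
// ~((~x) >> n), which equals the arithmetic shift of a negative x.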

View file

@ -1,4 +1,4 @@
// |jit-test| skip-if: !wasmSimdEnabled() || !hasDisassembler() || wasmCompileMode() || getBuildConfiguration().simulator; include:codegen-x64-test.js
// |jit-test| skip-if: !wasmSimdEnabled() || !hasDisassembler() || wasmCompileMode() != "ion" || !getBuildConfiguration().x64 || getBuildConfiguration().simulator; include:codegen-x64-test.js
// Test that there are no extraneous moves for variable SIMD negate, abs, and
// not instructions. See README-codegen.md for general information about this

View file

@ -0,0 +1,38 @@
// |jit-test| skip-if: !wasmSimdEnabled() || !hasDisassembler() || wasmCompileMode() != "ion" || !getBuildConfiguration().x64 || getBuildConfiguration().simulator; include:codegen-x64-test.js
// Tests for SIMD add pairwise instructions.
if (!isAvxPresent()) {
codegenTestX64_IGNOREDxv128_v128(
[['i16x8.extadd_pairwise_i8x16_s', `
66 0f 6f 05 ${RIPRADDR} movdqax ${RIPR}, %xmm0
66 0f 38 04 c1 pmaddubsw %xmm1, %xmm0`],
['i16x8.extadd_pairwise_i8x16_u', `
66 0f 6f c1 movdqa %xmm1, %xmm0
66 0f 38 04 05 ${RIPRADDR} pmaddubswx ${RIPR}, %xmm0`],
['i32x4.extadd_pairwise_i16x8_s', `
66 0f 6f c1 movdqa %xmm1, %xmm0
66 0f f5 05 ${RIPRADDR} pmaddwdx ${RIPR}, %xmm0`],
['i32x4.extadd_pairwise_i16x8_u', `
66 0f 6f c1 movdqa %xmm1, %xmm0
66 0f ef 05 ${RIPRADDR} pxorx ${RIPR}, %xmm0
66 0f f5 05 ${RIPRADDR} pmaddwdx ${RIPR}, %xmm0
66 0f fe 05 ${RIPRADDR} padddx ${RIPR}, %xmm0`]]);
} else {
codegenTestX64_IGNOREDxv128_v128(
[['i16x8.extadd_pairwise_i8x16_s', `
66 0f 6f 05 ${RIPRADDR} movdqax ${RIPR}, %xmm0
66 0f 38 04 c1 pmaddubsw %xmm1, %xmm0`],
['i16x8.extadd_pairwise_i8x16_u', `
c4 e2 71 04 05 ${RIPRADDR} vpmaddubswx ${RIPR}, %xmm1, %xmm0`],
['i32x4.extadd_pairwise_i16x8_s', `
c5 f1 f5 05 ${RIPRADDR} vpmaddwdx ${RIPR}, %xmm1, %xmm0`],
['i32x4.extadd_pairwise_i16x8_u', `
c5 f1 ef 05 ${RIPRADDR} vpxorx ${RIPR}, %xmm1, %xmm0
66 0f f5 05 ${RIPRADDR} pmaddwdx ${RIPR}, %xmm0
66 0f fe 05 ${RIPRADDR} padddx ${RIPR}, %xmm0`]]);
}

View file

@ -1118,6 +1118,10 @@ class BaseAssemblerX64 : public BaseAssembler {
return twoByteRipOpImmSimd("vcmppd", VEX_PD, OP2_CMPPD_VpdWpd,
X86Encoding::ConditionCmp_LE, src, dst);
}
[[nodiscard]] JmpSrc vpmaddubsw_ripr(XMMRegisterID src, XMMRegisterID dst) {
return threeByteRipOpSimd("vpmaddubsw", VEX_PD, OP3_PMADDUBSW_VdqWdq,
ESCAPE_38, src, dst);
}
// BMI instructions:

View file

@ -408,6 +408,13 @@ void MacroAssemblerX64::vcmplepdSimd128(const SimdConstant& v,
vpRiprOpSimd128(v, lhs, dest, &X86Encoding::BaseAssemblerX64::vcmplepd_ripr);
}
void MacroAssemblerX64::vpmaddubswSimd128(const SimdConstant& v,
FloatRegister lhs,
FloatRegister dest) {
vpRiprOpSimd128(v, lhs, dest,
&X86Encoding::BaseAssemblerX64::vpmaddubsw_ripr);
}
void MacroAssemblerX64::bindOffsets(
const MacroAssemblerX86Shared::UsesVector& uses) {
for (JmpSrc src : uses) {

View file

@ -1100,6 +1100,8 @@ class MacroAssemblerX64 : public MacroAssemblerX86Shared {
FloatRegister dest);
void vcmplepdSimd128(const SimdConstant& v, FloatRegister lhs,
FloatRegister dest);
void vpmaddubswSimd128(const SimdConstant& v, FloatRegister lhs,
FloatRegister dest);
void loadWasmPinnedRegsFromTls() {
loadPtr(Address(WasmTlsReg, offsetof(wasm::TlsData, memoryBase)), HeapReg);

View file

@ -578,6 +578,11 @@ class BaseAssembler : public GenericAssembler {
threeByteOpSimd("vpmaddubsw", VEX_PD, OP3_PMADDUBSW_VdqWdq, ESCAPE_38, src1,
src0, dst);
}
void vpmaddubsw_mr(const void* address, XMMRegisterID src0,
XMMRegisterID dst) {
threeByteOpSimd("vpmaddubsw", VEX_PD, OP3_PMADDUBSW_VdqWdq, ESCAPE_38,
address, src0, dst);
}
void vpaddb_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst) {
twoByteOpSimd("vpaddb", VEX_PD, OP2_PADDB_VdqWdq, src1, src0, dst);
@ -5328,7 +5333,8 @@ class BaseAssembler : public GenericAssembler {
spew("%-11s$%d, %s, %s", name, int32_t(imm), XMMRegName(src),
XMMRegName(dst));
m_formatter.twoByteOpVex(VEX_PD, opcode, (RegisterID)dst, src,
// For shift instructions, destination is stored in vvvv field.
m_formatter.twoByteOpVex(VEX_PD, opcode, (RegisterID)src, dst,
(int)shiftKind);
m_formatter.immediate8u(imm);
}
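// As a cross-check of the operand-order fix, a rough decode of one of the
// immediate-shift encodings expected by the tests earlier in this commit:
//   c5 f9 71 f1 01      vpsllw $0x01, %xmm1, %xmm0
//   c5                  two-byte VEX prefix
//   f9                  inverted vvvv = 1111 -> xmm0 (the destination),
//                       L = 0 (128-bit), pp = 01 (0x66 operand-size prefix)
//   71 /6               group-12 shift opcode; ModRM.reg carries /6 = PSLLW
//   f1                  ModRM 11 110 001 -> rm = xmm1 (the source)
//   01                  imm8 shift count
// So for these shift forms the destination travels in vvvv and the source in
// ModRM.rm, which matches the swap in the call above.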

View file

@ -2984,9 +2984,7 @@ void CodeGenerator::visitWasmConstantShiftSimd128(
int32_t shift = ins->shift();
if (shift == 0) {
if (src != dest) {
masm.moveSimd128(src, dest);
}
masm.moveSimd128(src, dest);
return;
}

View file

@ -1231,7 +1231,12 @@ void LIRGenerator::visitWasmShiftSimd128(MWasmShiftSimd128* ins) {
case wasm::SimdOp::I64x2ShrS: {
auto* lir = new (alloc())
LWasmSignReplicationSimd128(useRegisterAtStart(lhs));
defineReuseInput(lir, ins, LWasmConstantShiftSimd128::Src);
if (isThreeOpAllowed()) {
define(lir, ins);
} else {
// For non-AVX, it is always beneficial to reuse the input.
defineReuseInput(lir, ins, LWasmConstantShiftSimd128::Src);
}
return;
}
default:
@ -1242,11 +1247,14 @@ void LIRGenerator::visitWasmShiftSimd128(MWasmShiftSimd128* ins) {
# ifdef DEBUG
js::wasm::ReportSimdAnalysis("shift -> constant shift");
# endif
// Almost always beneficial, and never detrimental, to reuse the input if
// possible.
auto* lir = new (alloc())
LWasmConstantShiftSimd128(useRegisterAtStart(lhs), shiftCount);
defineReuseInput(lir, ins, LWasmConstantShiftSimd128::Src);
if (isThreeOpAllowed()) {
define(lir, ins);
} else {
// For non-AVX, it is always beneficial to reuse the input.
defineReuseInput(lir, ins, LWasmConstantShiftSimd128::Src);
}
return;
}
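// Rough intuition for the two lowering paths: defineReuseInput ties the LIR
// output to the same register as the input operand, which suits destructive
// two-operand SSE encodings, while define() lets the allocator pick any output
// register, which the non-destructive VEX three-operand forms can write
// directly (isThreeOpAllowed() reports whether those forms are available).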
@ -1449,8 +1457,11 @@ void LIRGenerator::visitWasmUnarySimd128(MWasmUnarySimd128* ins) {
case wasm::SimdOp::I16x8Neg:
case wasm::SimdOp::I32x4Neg:
case wasm::SimdOp::I64x2Neg:
case wasm::SimdOp::I16x8ExtaddPairwiseI8x16S:
// Prefer src != dest to avoid an unconditional src->temp move.
MOZ_ASSERT(!useAtStart && !reuseInput);
MOZ_ASSERT(!reuseInput);
// If AVX is enabled, we prefer useRegisterAtStart.
useAtStart = isThreeOpAllowed();
break;
case wasm::SimdOp::F32x4Neg:
case wasm::SimdOp::F64x2Neg:
@ -1465,7 +1476,6 @@ void LIRGenerator::visitWasmUnarySimd128(MWasmUnarySimd128* ins) {
case wasm::SimdOp::I64x2Abs:
case wasm::SimdOp::I32x4TruncSatF32x4S:
case wasm::SimdOp::F32x4ConvertI32x4U:
case wasm::SimdOp::I16x8ExtaddPairwiseI8x16S:
case wasm::SimdOp::I16x8ExtaddPairwiseI8x16U:
case wasm::SimdOp::I32x4ExtaddPairwiseI16x8S:
case wasm::SimdOp::I32x4ExtaddPairwiseI16x8U:
@ -1476,18 +1486,19 @@ void LIRGenerator::visitWasmUnarySimd128(MWasmUnarySimd128* ins) {
case wasm::SimdOp::I64x2ExtendHighI32x4S:
case wasm::SimdOp::I64x2ExtendHighI32x4U:
// Prefer src == dest to avoid an unconditional src->dest move
// for better performance (e.g. non-PSHUFD use).
// for better performance in non-AVX mode (e.g. non-PSHUFD use).
useAtStart = true;
reuseInput = true;
reuseInput = !isThreeOpAllowed();
break;
case wasm::SimdOp::I32x4TruncSatF32x4U:
case wasm::SimdOp::I32x4TruncSatF64x2SZero:
case wasm::SimdOp::I32x4TruncSatF64x2UZero:
case wasm::SimdOp::I8x16Popcnt:
tempReg = tempSimd128();
// Prefer src == dest to avoid an unconditional src->dest move.
// Prefer src == dest to avoid an unconditional src->dest move
// in non-AVX mode.
useAtStart = true;
reuseInput = true;
reuseInput = !isThreeOpAllowed();
break;
case wasm::SimdOp::I16x8ExtendLowI8x16S:
case wasm::SimdOp::I16x8ExtendHighI8x16S:

View file

@ -55,8 +55,8 @@ void MacroAssemblerX86Shared::splatX4(FloatRegister input,
vbroadcastss(Operand(input), output);
return;
}
asMasm().moveSimd128Float(input.asSimd128(), output);
vshufps(0, output, output, output);
input = asMasm().moveSimd128FloatIfNotAVX(input.asSimd128(), output);
vshufps(0, input, input, output);
}
void MacroAssemblerX86Shared::splatX2(FloatRegister input,
@ -251,8 +251,9 @@ void MacroAssemblerX86Shared::compareInt8x16(FloatRegister lhs, Operand rhs,
loadAlignedSimd128Int(rhs, scratch);
}
// src := src > lhs (i.e. lhs < rhs)
vpcmpgtb(Operand(lhs), scratch, scratch);
moveSimd128Int(scratch, output);
FloatRegister outputTemp = selectDestIfAVX(scratch, output);
vpcmpgtb(Operand(lhs), scratch, outputTemp);
moveSimd128Int(outputTemp, output);
break;
}
case Assembler::Condition::NotEqual:
@ -351,8 +352,9 @@ void MacroAssemblerX86Shared::compareInt16x8(FloatRegister lhs, Operand rhs,
loadAlignedSimd128Int(rhs, scratch);
}
// src := src > lhs (i.e. lhs < rhs)
vpcmpgtw(Operand(lhs), scratch, scratch);
moveSimd128Int(scratch, output);
FloatRegister outputTemp = selectDestIfAVX(scratch, output);
vpcmpgtw(Operand(lhs), scratch, outputTemp);
moveSimd128Int(outputTemp, output);
break;
}
case Assembler::Condition::NotEqual:
@ -450,8 +452,9 @@ void MacroAssemblerX86Shared::compareInt32x4(FloatRegister lhs, Operand rhs,
loadAlignedSimd128Int(rhs, scratch);
}
// src := src > lhs (i.e. lhs < rhs)
vpcmpgtd(Operand(lhs), scratch, scratch);
moveSimd128Int(scratch, output);
FloatRegister outputTemp = selectDestIfAVX(scratch, output);
vpcmpgtd(Operand(lhs), scratch, outputTemp);
moveSimd128Int(outputTemp, output);
break;
}
case Assembler::Condition::NotEqual:
@ -583,8 +586,8 @@ void MacroAssemblerX86Shared::compareForOrderingInt64x2(
vpsubq(Operand(lhs), temp1, temp1);
vpcmpeqd(rhs, temp2, temp2);
vandpd(temp2, temp1, temp1);
asMasm().moveSimd128(lhs, output);
vpcmpgtd(rhs, output, output);
lhs = asMasm().moveSimd128IntIfNotAVX(lhs, output);
vpcmpgtd(rhs, lhs, output);
vpor(Operand(temp1), output, output);
vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), output, output);
break;
@ -593,8 +596,8 @@ void MacroAssemblerX86Shared::compareForOrderingInt64x2(
vmovdqa(Operand(lhs), temp2);
vpcmpgtd(Operand(lhs), temp1, temp1);
vpcmpeqd(Operand(rhs), temp2, temp2);
asMasm().moveSimd128(lhs, output);
vpsubq(rhs, output, output);
lhs = asMasm().moveSimd128IntIfNotAVX(lhs, output);
vpsubq(rhs, lhs, output);
vandpd(temp2, output, output);
vpor(Operand(temp1), output, output);
vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), output, output);
@ -604,8 +607,8 @@ void MacroAssemblerX86Shared::compareForOrderingInt64x2(
vmovdqa(Operand(lhs), temp2);
vpcmpgtd(Operand(lhs), temp1, temp1);
vpcmpeqd(Operand(rhs), temp2, temp2);
asMasm().moveSimd128(lhs, output);
vpsubq(rhs, output, output);
lhs = asMasm().moveSimd128IntIfNotAVX(lhs, output);
vpsubq(rhs, lhs, output);
vandpd(temp2, output, output);
vpor(Operand(temp1), output, output);
vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), output, output);
@ -617,8 +620,8 @@ void MacroAssemblerX86Shared::compareForOrderingInt64x2(
vpsubq(Operand(lhs), temp1, temp1);
vpcmpeqd(rhs, temp2, temp2);
vandpd(temp2, temp1, temp1);
asMasm().moveSimd128(lhs, output);
vpcmpgtd(rhs, output, output);
lhs = asMasm().moveSimd128IntIfNotAVX(lhs, output);
vpcmpgtd(rhs, lhs, output);
vpor(Operand(temp1), output, output);
vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), output, output);
asMasm().bitwiseXorSimd128(output, allOnes, output);
@ -967,17 +970,22 @@ void MacroAssemblerX86Shared::packedLeftShiftByScalarInt8x16(
void MacroAssemblerX86Shared::packedLeftShiftByScalarInt8x16(
Imm32 count, FloatRegister src, FloatRegister dest) {
MOZ_ASSERT(count.value <= 7);
asMasm().moveSimd128(src, dest);
if (MOZ_UNLIKELY(count.value == 0)) {
moveSimd128Int(src, dest);
return;
}
src = asMasm().moveSimd128IntIfNotAVX(src, dest);
// Use the doubling trick for low shift counts, otherwise mask off the bits
// that are shifted out of the low byte of each word and use word shifts. The
// optimal cutoff remains to be explored.
if (count.value <= 3) {
for (int32_t shift = count.value; shift > 0; --shift) {
asMasm().addInt8x16(dest, dest);
vpaddb(Operand(src), src, dest);
for (int32_t shift = count.value - 1; shift > 0; --shift) {
vpaddb(Operand(dest), dest, dest);
}
} else {
asMasm().bitwiseAndSimd128(
dest, SimdConstant::SplatX16(0xFF >> count.value), dest);
asMasm().bitwiseAndSimd128(src, SimdConstant::SplatX16(0xFF >> count.value),
dest);
vpsllw(count, dest, dest);
}
}
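// Why both strategies are safe (informally): x86 has no per-byte vector shift.
// For small counts, x + x == x << 1, so repeated vpaddb doubles every byte
// without bits ever crossing a byte boundary. For larger counts, masking each
// byte with (0xFF >> count) first clears exactly the bits that a 16-bit psllw
// would otherwise push into the neighboring byte, so the word shift then
// matches a true per-byte shift.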
@ -1070,10 +1078,10 @@ void MacroAssemblerX86Shared::packedRightShiftByScalarInt64x2(
FloatRegister in, Register count, FloatRegister temp, FloatRegister dest) {
ScratchSimd128Scope scratch(asMasm());
vmovd(count, temp);
asMasm().moveSimd128(in, dest);
asMasm().signReplicationInt64x2(in, scratch);
in = asMasm().moveSimd128FloatIfNotAVX(in, dest);
// Invert if negative, shift all, invert back if negative.
vpxor(Operand(scratch), dest, dest);
vpxor(Operand(scratch), in, dest);
vpsrlq(temp, dest, dest);
vpxor(Operand(scratch), dest, dest);
}
@ -1088,10 +1096,10 @@ void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt64x2(
void MacroAssemblerX86Shared::packedRightShiftByScalarInt64x2(
Imm32 count, FloatRegister src, FloatRegister dest) {
ScratchSimd128Scope scratch(asMasm());
asMasm().moveSimd128(src, dest);
asMasm().signReplicationInt64x2(src, scratch);
// Invert if negative, shift all, invert back if negative.
vpxor(Operand(scratch), dest, dest);
src = asMasm().moveSimd128FloatIfNotAVX(src, dest);
vpxor(Operand(scratch), src, dest);
vpsrlq(Imm32(count.value & 63), dest, dest);
vpxor(Operand(scratch), dest, dest);
}
@ -1104,11 +1112,16 @@ void MacroAssemblerX86Shared::selectSimd128(FloatRegister mask,
// Normally the codegen will attempt to enforce these register assignments so
// that the moves are avoided.
asMasm().moveSimd128Int(onTrue, output);
asMasm().moveSimd128Int(mask, temp);
onTrue = asMasm().moveSimd128IntIfNotAVX(onTrue, output);
if (MOZ_UNLIKELY(mask == onTrue)) {
vpor(Operand(onFalse), onTrue, output);
return;
}
vpand(Operand(temp), output, output);
vpandn(Operand(onFalse), temp, temp);
mask = asMasm().moveSimd128IntIfNotAVX(mask, temp);
vpand(Operand(mask), onTrue, output);
vpandn(Operand(onFalse), mask, temp);
vpor(Operand(temp), output, output);
}
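// The underlying identity here is output = (mask & onTrue) | (~mask & onFalse);
// vpandn supplies the ~mask & onFalse term. In the mask == onTrue special case
// this collapses to onTrue | onFalse, which is what the early-out emits.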
@ -1131,7 +1144,6 @@ void MacroAssemblerX86Shared::unsignedConvertInt32x4ToFloat32x4(
void MacroAssemblerX86Shared::truncSatFloat32x4ToInt32x4(FloatRegister src,
FloatRegister dest) {
ScratchSimd128Scope scratch(asMasm());
asMasm().moveSimd128Float(src, dest);
// The cvttps2dq instruction is the workhorse but does not handle NaN or out
// of range values as we need it to. We want to saturate too-large positive
@ -1139,9 +1151,10 @@ void MacroAssemblerX86Shared::truncSatFloat32x4ToInt32x4(FloatRegister src,
// become 0.
// Convert NaN to 0 by masking away values that compare unordered to itself.
vmovaps(dest, scratch);
vmovaps(src, scratch);
vcmpeqps(Operand(scratch), scratch, scratch);
vpand(Operand(scratch), dest, dest);
src = asMasm().moveSimd128FloatIfNotAVX(src, dest);
vpand(Operand(scratch), src, dest);
// Compute the complement of each non-NaN lane's sign bit, we'll need this to
// correct the result of cvttps2dq. All other output bits are garbage.
@ -1165,7 +1178,7 @@ void MacroAssemblerX86Shared::truncSatFloat32x4ToInt32x4(FloatRegister src,
void MacroAssemblerX86Shared::unsignedTruncSatFloat32x4ToInt32x4(
FloatRegister src, FloatRegister temp, FloatRegister dest) {
ScratchSimd128Scope scratch(asMasm());
asMasm().moveSimd128Float(src, dest);
src = asMasm().moveSimd128FloatIfNotAVX(src, dest);
// The cvttps2dq instruction is the workhorse but does not handle NaN or out
// of range values as we need it to. We want to saturate too-large positive
@ -1173,7 +1186,7 @@ void MacroAssemblerX86Shared::unsignedTruncSatFloat32x4ToInt32x4(
// Convert NaN and negative values to zeroes in dest.
vpxor(Operand(scratch), scratch, scratch);
vmaxps(Operand(scratch), dest, dest);
vmaxps(Operand(scratch), src, dest);
// Place the largest positive signed integer in all lanes in scratch.
// We use it to bias the conversion to handle edge cases.
@ -1217,14 +1230,14 @@ void MacroAssemblerX86Shared::unsignedTruncSatFloat32x4ToInt32x4(
void MacroAssemblerX86Shared::unsignedTruncSatFloat32x4ToInt32x4Relaxed(
FloatRegister src, FloatRegister dest) {
ScratchSimd128Scope scratch(asMasm());
asMasm().moveSimd128Float(src, dest);
src = asMasm().moveSimd128FloatIfNotAVX(src, dest);
// Place lanes below 80000000h into dest, otherwise into scratch.
// Keep dest or scratch 0 as default.
asMasm().loadConstantSimd128Float(SimdConstant::SplatX4(0x4f000000), scratch);
vcmpltps(Operand(src), scratch, scratch);
vpand(Operand(src), scratch, scratch);
vpxor(Operand(scratch), dest, dest);
vpxor(Operand(scratch), src, dest);
// Convert lanes below 80000000h into unsigned int without issues.
vcvttps2dq(dest, dest);
@ -1267,10 +1280,10 @@ void MacroAssemblerX86Shared::truncSatFloat64x2ToInt32x4(FloatRegister src,
void MacroAssemblerX86Shared::unsignedTruncSatFloat64x2ToInt32x4(
FloatRegister src, FloatRegister temp, FloatRegister dest) {
ScratchSimd128Scope scratch(asMasm());
asMasm().moveSimd128Float(src, dest);
src = asMasm().moveSimd128FloatIfNotAVX(src, dest);
vxorpd(scratch, scratch, scratch);
vmaxpd(Operand(scratch), dest, dest);
vmaxpd(Operand(scratch), src, dest);
asMasm().loadConstantSimd128Float(SimdConstant::SplatX2(4294967295.0), temp);
vminpd(Operand(temp), dest, dest);
@ -1284,11 +1297,10 @@ void MacroAssemblerX86Shared::unsignedTruncSatFloat64x2ToInt32x4(
void MacroAssemblerX86Shared::unsignedTruncSatFloat64x2ToInt32x4Relaxed(
FloatRegister src, FloatRegister dest) {
ScratchSimd128Scope scratch(asMasm());
asMasm().moveSimd128Float(src, dest);
// The same as unsignedConvertInt32x4ToFloat64x2, but without NaN
// and out-of-bounds checks.
vroundpd(SSERoundingMode::Trunc, Operand(dest), dest);
vroundpd(SSERoundingMode::Trunc, Operand(src), dest);
asMasm().loadConstantSimd128Float(SimdConstant::SplatX2(4503599627370496.0),
scratch);
vaddpd(Operand(scratch), dest, dest);
@ -1299,9 +1311,9 @@ void MacroAssemblerX86Shared::popcntInt8x16(FloatRegister src,
FloatRegister temp,
FloatRegister output) {
ScratchSimd128Scope scratch(asMasm());
asMasm().loadConstantSimd128Float(SimdConstant::SplatX16(0x0f), scratch);
asMasm().moveSimd128Int(src, temp);
vpand(scratch, temp, temp);
asMasm().loadConstantSimd128Int(SimdConstant::SplatX16(0x0f), scratch);
FloatRegister srcForTemp = asMasm().moveSimd128IntIfNotAVX(src, temp);
vpand(scratch, srcForTemp, temp);
vpandn(src, scratch, scratch);
int8_t counts[] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
asMasm().loadConstantSimd128(SimdConstant::CreateX16(counts), output);

View file

@ -1452,14 +1452,14 @@ void MacroAssembler::concatAndRightShiftSimd128(FloatRegister lhs,
void MacroAssembler::leftShiftSimd128(Imm32 count, FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
vpslldq(count, dest, dest);
src = moveSimd128IntIfNotAVX(src, dest);
vpslldq(count, src, dest);
}
void MacroAssembler::rightShiftSimd128(Imm32 count, FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
vpsrldq(count, dest, dest);
src = moveSimd128IntIfNotAVX(src, dest);
vpsrldq(count, src, dest);
}
// Reverse bytes in lanes.
@ -1467,10 +1467,10 @@ void MacroAssembler::rightShiftSimd128(Imm32 count, FloatRegister src,
void MacroAssembler::reverseInt16x8(FloatRegister src, FloatRegister dest) {
// Byteswap is MOV + PSLLW + PSRLW + POR, a small win over PSHUFB.
ScratchSimd128Scope scratch(*this);
moveSimd128(src, dest);
moveSimd128(src, scratch);
vpsllw(Imm32(8), dest, dest);
vpsrlw(Imm32(8), scratch, scratch);
FloatRegister srcForScratch = moveSimd128IntIfNotAVX(src, scratch);
vpsrlw(Imm32(8), srcForScratch, scratch);
src = moveSimd128IntIfNotAVX(src, dest);
vpsllw(Imm32(8), src, dest);
vpor(scratch, dest, dest);
}
@ -1556,8 +1556,8 @@ void MacroAssembler::bitmaskInt16x8(FloatRegister src, Register dest) {
// input and shifting rather than masking at the end, but creates a false
// dependency on the old value of scratch. The better fix is to allow src to
// be clobbered.
moveSimd128(src, scratch);
vpacksswb(Operand(scratch), scratch, scratch);
src = moveSimd128IntIfNotAVX(src, scratch);
vpacksswb(Operand(src), src, scratch);
vpmovmskb(scratch, dest);
andl(Imm32(0xFF), dest);
}
@ -1713,20 +1713,22 @@ void MacroAssembler::mulInt64x2(FloatRegister lhs, FloatRegister rhs,
// lhs = <D C> <B A>
// rhs = <H G> <F E>
// result = <(DG+CH)_low+CG_high CG_low> <(BE+AF)_low+AE_high AE_low>
moveSimd128(lhs, temp); // temp = <D C> <B A>
vpsrlq(Imm32(32), temp, temp); // temp = <0 D> <0 B>
vpmuludq(rhs, temp, temp); // temp = <DG> <BE>
moveSimd128(rhs, temp2); // temp2 = <H G> <F E>
vpsrlq(Imm32(32), temp2, temp2); // temp2 = <0 H> <0 F>
vpmuludq(lhs, temp2, temp2); // temp2 = <CH> <AF>
vpaddq(Operand(temp), temp2, temp2); // temp2 = <DG+CH> <BE+AF>
vpsllq(Imm32(32), temp2, temp2); // temp2 = <(DG+CH)_low 0>
// <(BE+AF)_low 0>
vpmuludq(rhs, dest, dest); // dest = <CG_high CG_low>
// <AE_high AE_low>
vpaddq(Operand(temp2), dest, dest); // dest =
// <(DG+CH)_low+CG_high CG_low>
// <(BE+AF)_low+AE_high AE_low>
FloatRegister lhsForTemp =
moveSimd128IntIfNotAVX(lhs, temp); // temp = <D C> <B A>
vpsrlq(Imm32(32), lhsForTemp, temp); // temp = <0 D> <0 B>
vpmuludq(rhs, temp, temp); // temp = <DG> <BE>
FloatRegister rhsForTemp =
moveSimd128IntIfNotAVX(rhs, temp2); // temp2 = <H G> <F E>
vpsrlq(Imm32(32), rhsForTemp, temp2); // temp2 = <0 H> <0 F>
vpmuludq(lhs, temp2, temp2); // temp2 = <CH> <AF>
vpaddq(Operand(temp), temp2, temp2); // temp2 = <DG+CH> <BE+AF>
vpsllq(Imm32(32), temp2, temp2); // temp2 = <(DG+CH)_low 0>
// <(BE+AF)_low 0>
vpmuludq(rhs, dest, dest); // dest = <CG_high CG_low>
// <AE_high AE_low>
vpaddq(Operand(temp2), dest, dest); // dest =
// <(DG+CH)_low+CG_high CG_low>
// <(BE+AF)_low+AE_high AE_low>
}
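// One-line derivation for lane 0 (lane 1 is analogous), writing the unsigned
// 32-bit halves as lhs = 2^32*B + A and rhs = 2^32*F + E:
//   (2^32*B + A)(2^32*F + E) mod 2^64 = 2^32*(B*E + A*F) + A*E
// (the 2^64*B*F term falls away), which is the <(BE+AF)_low+AE_high AE_low>
// layout described in the comment above.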
// Code generation from the PR: https://github.com/WebAssembly/simd/pull/376.
@ -2141,9 +2143,9 @@ void MacroAssembler::absInt32x4(FloatRegister src, FloatRegister dest) {
void MacroAssembler::absInt64x2(FloatRegister src, FloatRegister dest) {
ScratchSimd128Scope scratch(*this);
moveSimd128(src, dest);
signReplicationInt64x2(src, scratch);
vpxor(Operand(scratch), dest, dest);
src = moveSimd128IntIfNotAVX(src, dest);
vpxor(Operand(scratch), src, dest);
vpsubq(Operand(scratch), dest, dest);
}
@ -2167,7 +2169,7 @@ void MacroAssembler::leftShiftInt16x8(Register rhs, FloatRegister lhsDest) {
void MacroAssembler::leftShiftInt16x8(Imm32 count, FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
src = moveSimd128IntIfNotAVX(src, dest);
vpsllw(count, src, dest);
}
@ -2178,7 +2180,7 @@ void MacroAssembler::leftShiftInt32x4(Register rhs, FloatRegister lhsDest) {
void MacroAssembler::leftShiftInt32x4(Imm32 count, FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
src = moveSimd128IntIfNotAVX(src, dest);
vpslld(count, src, dest);
}
@ -2189,7 +2191,7 @@ void MacroAssembler::leftShiftInt64x2(Register rhs, FloatRegister lhsDest) {
void MacroAssembler::leftShiftInt64x2(Imm32 count, FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
src = moveSimd128IntIfNotAVX(src, dest);
vpsllq(count, src, dest);
}
@ -2226,7 +2228,7 @@ void MacroAssembler::rightShiftInt16x8(Register rhs, FloatRegister lhsDest) {
void MacroAssembler::rightShiftInt16x8(Imm32 count, FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
src = moveSimd128IntIfNotAVX(src, dest);
vpsraw(count, src, dest);
}
@ -2238,7 +2240,7 @@ void MacroAssembler::unsignedRightShiftInt16x8(Register rhs,
void MacroAssembler::unsignedRightShiftInt16x8(Imm32 count, FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
src = moveSimd128IntIfNotAVX(src, dest);
vpsrlw(count, src, dest);
}
@ -2249,7 +2251,7 @@ void MacroAssembler::rightShiftInt32x4(Register rhs, FloatRegister lhsDest) {
void MacroAssembler::rightShiftInt32x4(Imm32 count, FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
src = moveSimd128IntIfNotAVX(src, dest);
vpsrad(count, src, dest);
}
@ -2261,7 +2263,7 @@ void MacroAssembler::unsignedRightShiftInt32x4(Register rhs,
void MacroAssembler::unsignedRightShiftInt32x4(Imm32 count, FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
src = moveSimd128IntIfNotAVX(src, dest);
vpsrld(count, src, dest);
}
@ -2284,7 +2286,7 @@ void MacroAssembler::unsignedRightShiftInt64x2(Register rhs,
void MacroAssembler::unsignedRightShiftInt64x2(Imm32 count, FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
src = moveSimd128IntIfNotAVX(src, dest);
vpsrlq(count, src, dest);
}
@ -2299,14 +2301,14 @@ void MacroAssembler::signReplicationInt8x16(FloatRegister src,
void MacroAssembler::signReplicationInt16x8(FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
vpsraw(Imm32(15), dest, dest);
src = moveSimd128IntIfNotAVX(src, dest);
vpsraw(Imm32(15), src, dest);
}
void MacroAssembler::signReplicationInt32x4(FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
vpsrad(Imm32(31), dest, dest);
src = moveSimd128IntIfNotAVX(src, dest);
vpsrad(Imm32(31), src, dest);
}
void MacroAssembler::signReplicationInt64x2(FloatRegister src,
@ -2810,30 +2812,22 @@ void MacroAssembler::extAddPairwiseInt8x16(FloatRegister src,
void MacroAssembler::unsignedExtAddPairwiseInt8x16(FloatRegister src,
FloatRegister dest) {
ScratchSimd128Scope scratch(*this);
moveSimd128(src, dest);
loadConstantSimd128Int(SimdConstant::SplatX16(1), scratch);
vpmaddubsw(scratch, dest, dest);
src = moveSimd128IntIfNotAVX(src, dest);
vpmaddubswSimd128(SimdConstant::SplatX16(1), src, dest);
}
void MacroAssembler::extAddPairwiseInt16x8(FloatRegister src,
FloatRegister dest) {
ScratchSimd128Scope scratch(*this);
moveSimd128(src, dest);
loadConstantSimd128Int(SimdConstant::SplatX8(1), scratch);
vpmaddwd(Operand(scratch), dest, dest);
src = moveSimd128IntIfNotAVX(src, dest);
vpmaddwdSimd128(SimdConstant::SplatX8(1), src, dest);
}
void MacroAssembler::unsignedExtAddPairwiseInt16x8(FloatRegister src,
FloatRegister dest) {
ScratchSimd128Scope scratch(*this);
moveSimd128(src, dest);
loadConstantSimd128Int(SimdConstant::SplatX8(0x8000), scratch);
vpxor(scratch, dest, dest);
loadConstantSimd128Int(SimdConstant::SplatX8(1), scratch);
vpmaddwd(Operand(scratch), dest, dest);
loadConstantSimd128Int(SimdConstant::SplatX4(0x00010000), scratch);
vpaddd(Operand(scratch), dest, dest);
src = moveSimd128IntIfNotAVX(src, dest);
vpxorSimd128(SimdConstant::SplatX8(-0x8000), src, dest);
vpmaddwdSimd128(SimdConstant::SplatX8(1), dest, dest);
vpadddSimd128(SimdConstant::SplatX4(0x00010000), dest, dest);
}
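// A quick check of the bias trick in the unsigned case: for a 16-bit lane a,
// a ^ 0x8000 reinterpreted as signed equals a - 0x8000, so vpmaddwd with a
// splat of 1 produces (a0 - 0x8000) + (a1 - 0x8000) = a0 + a1 - 0x10000 per
// 32-bit lane, and the final vpaddd with SplatX4(0x00010000) adds the 0x10000
// back, leaving the exact unsigned sum (at most 0x1FFFE, well within 32 bits).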
// Floating square root
@ -3023,8 +3017,8 @@ void MacroAssembler::unsignedWidenLowInt32x4(FloatRegister src,
}
void MacroAssembler::widenHighInt32x4(FloatRegister src, FloatRegister dest) {
if (src == dest) {
vmovhlps(dest, dest, dest);
if (src == dest || HasAVX()) {
vmovhlps(src, src, dest);
} else {
vpshufd(ComputeShuffleMask(2, 3, 2, 3), src, dest);
}
@ -3033,11 +3027,10 @@ void MacroAssembler::widenHighInt32x4(FloatRegister src, FloatRegister dest) {
void MacroAssembler::unsignedWidenHighInt32x4(FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
ScratchSimd128Scope scratch(*this);
src = moveSimd128IntIfNotAVX(src, dest);
vpxor(scratch, scratch, scratch);
vpunpckhdq(scratch, dest, dest);
vpunpckhdq(scratch, src, dest);
}
// Floating multiply-accumulate: srcDest [+-]= src1 * src2

View file

@ -587,6 +587,10 @@ class MacroAssemblerX86Shared : public Assembler {
moveSimd128Int(src, dest);
return dest;
}
FloatRegister selectDestIfAVX(FloatRegister src, FloatRegister dest) {
MOZ_ASSERT(src.isSimd128() && dest.isSimd128());
return HasAVX() ? dest : src;
}
void loadUnalignedSimd128Int(const Address& src, FloatRegister dest) {
vmovdqu(Operand(src), dest);
}

View file

@ -421,6 +421,12 @@ void MacroAssemblerX86::vcmplepdSimd128(const SimdConstant& v,
vpPatchOpSimd128(v, lhs, dest, &X86Encoding::BaseAssemblerX86::vcmplepd_mr);
}
void MacroAssemblerX86::vpmaddubswSimd128(const SimdConstant& v,
FloatRegister lhs,
FloatRegister dest) {
vpPatchOpSimd128(v, lhs, dest, &X86Encoding::BaseAssemblerX86::vpmaddubsw_mr);
}
void MacroAssemblerX86::finish() {
// Last instruction may be an indirect jump so eagerly insert an undefined
// instruction byte to prevent processors from decoding data values into

View file

@ -1063,6 +1063,8 @@ class MacroAssemblerX86 : public MacroAssemblerX86Shared {
FloatRegister dest);
void vcmplepdSimd128(const SimdConstant& v, FloatRegister lhs,
FloatRegister dest);
void vpmaddubswSimd128(const SimdConstant& v, FloatRegister lhs,
FloatRegister dest);
Condition testInt32Truthy(bool truthy, const ValueOperand& operand) {
test32(operand.payloadReg(), operand.payloadReg());