From bb2a780b8af8ea40f9e7e34658a65d0d607e9d30 Mon Sep 17 00:00:00 2001 From: Yury Delendik Date: Fri, 11 Feb 2022 13:27:12 +0000 Subject: [PATCH] Bug 1753115 - Refactor usage of moveSimd128. r=jseward * Replace with moveSimd128XXXIfNotAVX, when possible * Fix I16x8ExtaddPairwiseI8x16S lowering; relax lowering for AVX * Add postMoveSimd128IntIfNotAVX utility * Add extadd_pairwise tests * Fix neg-abs-not codegen tests * Fix shift by imm8 VEX encoding Differential Revision: https://phabricator.services.mozilla.com/D137581 --- .../tests/wasm/simd/avx2-x64-ion-codegen.js | 27 +++++ .../wasm/simd/neg-abs-not-x64-ion-codegen.js | 2 +- .../wasm/simd/pairwise-x64-ion-codegen.js | 38 ++++++ js/src/jit/x64/BaseAssembler-x64.h | 4 + js/src/jit/x64/MacroAssembler-x64.cpp | 7 ++ js/src/jit/x64/MacroAssembler-x64.h | 2 + .../jit/x86-shared/BaseAssembler-x86-shared.h | 8 +- .../x86-shared/CodeGenerator-x86-shared.cpp | 4 +- js/src/jit/x86-shared/Lowering-x86-shared.cpp | 31 +++-- .../MacroAssembler-x86-shared-SIMD.cpp | 98 +++++++++------- .../MacroAssembler-x86-shared-inl.h | 111 ++++++++---------- .../x86-shared/MacroAssembler-x86-shared.h | 4 + js/src/jit/x86/MacroAssembler-x86.cpp | 6 + js/src/jit/x86/MacroAssembler-x86.h | 2 + 14 files changed, 227 insertions(+), 117 deletions(-) create mode 100644 js/src/jit-test/tests/wasm/simd/pairwise-x64-ion-codegen.js diff --git a/js/src/jit-test/tests/wasm/simd/avx2-x64-ion-codegen.js b/js/src/jit-test/tests/wasm/simd/avx2-x64-ion-codegen.js index 4479a3d42057..a70423241096 100644 --- a/js/src/jit-test/tests/wasm/simd/avx2-x64-ion-codegen.js +++ b/js/src/jit-test/tests/wasm/simd/avx2-x64-ion-codegen.js @@ -433,3 +433,30 @@ codegenTestX64_v128xLITERAL_v128_avxhack( `c5 f1 eb 05 ${RIPRADDR} vporx ${RIPR}, %xmm1, %xmm0`], ['v128.xor', '(v128.const i8x16 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2)', `c5 f1 ef 05 ${RIPRADDR} vpxorx ${RIPR}, %xmm1, %xmm0`]]); + +// Shift by constant encodings +codegenTestX64_v128xLITERAL_v128_avxhack( + [['i8x16.shl', '(i32.const 2)', ` +c5 f1 fc c1 vpaddb %xmm1, %xmm1, %xmm0 +66 0f fc c0 paddb %xmm0, %xmm0`], + ['i8x16.shl', '(i32.const 4)', ` +c5 f1 db 05 ${RIPRADDR} vpandx ${RIPR}, %xmm1, %xmm0 +66 0f 71 f0 04 psllw \\$0x04, %xmm0`], + ['i16x8.shl', '(i32.const 1)', + 'c5 f9 71 f1 01 vpsllw \\$0x01, %xmm1, %xmm0'], + ['i16x8.shr_s', '(i32.const 3)', + 'c5 f9 71 e1 03 vpsraw \\$0x03, %xmm1, %xmm0'], + ['i16x8.shr_u', '(i32.const 2)', + 'c5 f9 71 d1 02 vpsrlw \\$0x02, %xmm1, %xmm0'], + ['i32x4.shl', '(i32.const 5)', + 'c5 f9 72 f1 05 vpslld \\$0x05, %xmm1, %xmm0'], + ['i32x4.shr_s', '(i32.const 2)', + 'c5 f9 72 e1 02 vpsrad \\$0x02, %xmm1, %xmm0'], + ['i32x4.shr_u', '(i32.const 5)', + 'c5 f9 72 d1 05 vpsrld \\$0x05, %xmm1, %xmm0'], + ['i64x2.shr_s', '(i32.const 7)', ` +c5 79 70 f9 f5 vpshufd \\$0xF5, %xmm1, %xmm15 +66 41 0f 72 e7 1f psrad \\$0x1F, %xmm15 +c4 c1 71 ef c7 vpxor %xmm15, %xmm1, %xmm0 +66 0f 73 d0 07 psrlq \\$0x07, %xmm0 +66 41 0f ef c7 pxor %xmm15, %xmm0`]]); diff --git a/js/src/jit-test/tests/wasm/simd/neg-abs-not-x64-ion-codegen.js b/js/src/jit-test/tests/wasm/simd/neg-abs-not-x64-ion-codegen.js index 20768051c5bc..0ae75f38fb43 100644 --- a/js/src/jit-test/tests/wasm/simd/neg-abs-not-x64-ion-codegen.js +++ b/js/src/jit-test/tests/wasm/simd/neg-abs-not-x64-ion-codegen.js @@ -1,4 +1,4 @@ -// |jit-test| skip-if: !wasmSimdEnabled() || !hasDisassembler() || wasmCompileMode() || getBuildConfiguration().simulator; include:codegen-x64-test.js +// |jit-test| skip-if: !wasmSimdEnabled() || !hasDisassembler() || wasmCompileMode() != "ion" 
|| !getBuildConfiguration().x64 || getBuildConfiguration().simulator; include:codegen-x64-test.js // Test that there are no extraneous moves for variable SIMD negate, abs, and // not instructions. See README-codegen.md for general information about this diff --git a/js/src/jit-test/tests/wasm/simd/pairwise-x64-ion-codegen.js b/js/src/jit-test/tests/wasm/simd/pairwise-x64-ion-codegen.js new file mode 100644 index 000000000000..53ab47fdb8da --- /dev/null +++ b/js/src/jit-test/tests/wasm/simd/pairwise-x64-ion-codegen.js @@ -0,0 +1,38 @@ +// |jit-test| skip-if: !wasmSimdEnabled() || !hasDisassembler() || wasmCompileMode() != "ion" || !getBuildConfiguration().x64 || getBuildConfiguration().simulator; include:codegen-x64-test.js + +// Tests for SIMD add pairwise instructions. + +if (!isAvxPresent()) { + + codegenTestX64_IGNOREDxv128_v128( + [['i16x8.extadd_pairwise_i8x16_s', ` +66 0f 6f 05 ${RIPRADDR} movdqax ${RIPR}, %xmm0 +66 0f 38 04 c1 pmaddubsw %xmm1, %xmm0`], + ['i16x8.extadd_pairwise_i8x16_u', ` +66 0f 6f c1 movdqa %xmm1, %xmm0 +66 0f 38 04 05 ${RIPRADDR} pmaddubswx ${RIPR}, %xmm0`], + ['i32x4.extadd_pairwise_i16x8_s', ` +66 0f 6f c1 movdqa %xmm1, %xmm0 +66 0f f5 05 ${RIPRADDR} pmaddwdx ${RIPR}, %xmm0`], + ['i32x4.extadd_pairwise_i16x8_u', ` +66 0f 6f c1 movdqa %xmm1, %xmm0 +66 0f ef 05 ${RIPRADDR} pxorx ${RIPR}, %xmm0 +66 0f f5 05 ${RIPRADDR} pmaddwdx ${RIPR}, %xmm0 +66 0f fe 05 ${RIPRADDR} padddx ${RIPR}, %xmm0`]]); + +} else { + + codegenTestX64_IGNOREDxv128_v128( + [['i16x8.extadd_pairwise_i8x16_s', ` +66 0f 6f 05 ${RIPRADDR} movdqax ${RIPR}, %xmm0 +66 0f 38 04 c1 pmaddubsw %xmm1, %xmm0`], + ['i16x8.extadd_pairwise_i8x16_u', ` +c4 e2 71 04 05 ${RIPRADDR} vpmaddubswx ${RIPR}, %xmm1, %xmm0`], + ['i32x4.extadd_pairwise_i16x8_s', ` +c5 f1 f5 05 ${RIPRADDR} vpmaddwdx ${RIPR}, %xmm1, %xmm0`], + ['i32x4.extadd_pairwise_i16x8_u', ` +c5 f1 ef 05 ${RIPRADDR} vpxorx ${RIPR}, %xmm1, %xmm0 +66 0f f5 05 ${RIPRADDR} pmaddwdx ${RIPR}, %xmm0 +66 0f fe 05 ${RIPRADDR} padddx ${RIPR}, %xmm0`]]); + +} diff --git a/js/src/jit/x64/BaseAssembler-x64.h b/js/src/jit/x64/BaseAssembler-x64.h index 6b4d67338247..47b422d3b788 100644 --- a/js/src/jit/x64/BaseAssembler-x64.h +++ b/js/src/jit/x64/BaseAssembler-x64.h @@ -1118,6 +1118,10 @@ class BaseAssemblerX64 : public BaseAssembler { return twoByteRipOpImmSimd("vcmppd", VEX_PD, OP2_CMPPD_VpdWpd, X86Encoding::ConditionCmp_LE, src, dst); } + [[nodiscard]] JmpSrc vpmaddubsw_ripr(XMMRegisterID src, XMMRegisterID dst) { + return threeByteRipOpSimd("vpmaddubsw", VEX_PD, OP3_PMADDUBSW_VdqWdq, + ESCAPE_38, src, dst); + } // BMI instructions: diff --git a/js/src/jit/x64/MacroAssembler-x64.cpp b/js/src/jit/x64/MacroAssembler-x64.cpp index ec27719cccd4..eafa930bcbe3 100644 --- a/js/src/jit/x64/MacroAssembler-x64.cpp +++ b/js/src/jit/x64/MacroAssembler-x64.cpp @@ -408,6 +408,13 @@ void MacroAssemblerX64::vcmplepdSimd128(const SimdConstant& v, vpRiprOpSimd128(v, lhs, dest, &X86Encoding::BaseAssemblerX64::vcmplepd_ripr); } +void MacroAssemblerX64::vpmaddubswSimd128(const SimdConstant& v, + FloatRegister lhs, + FloatRegister dest) { + vpRiprOpSimd128(v, lhs, dest, + &X86Encoding::BaseAssemblerX64::vpmaddubsw_ripr); +} + void MacroAssemblerX64::bindOffsets( const MacroAssemblerX86Shared::UsesVector& uses) { for (JmpSrc src : uses) { diff --git a/js/src/jit/x64/MacroAssembler-x64.h b/js/src/jit/x64/MacroAssembler-x64.h index 2fb558d1d9c9..d25ae252db4c 100644 --- a/js/src/jit/x64/MacroAssembler-x64.h +++ b/js/src/jit/x64/MacroAssembler-x64.h @@ -1100,6 +1100,8 @@ class 
MacroAssemblerX64 : public MacroAssemblerX86Shared { FloatRegister dest); void vcmplepdSimd128(const SimdConstant& v, FloatRegister lhs, FloatRegister dest); + void vpmaddubswSimd128(const SimdConstant& v, FloatRegister lhs, + FloatRegister dest); void loadWasmPinnedRegsFromTls() { loadPtr(Address(WasmTlsReg, offsetof(wasm::TlsData, memoryBase)), HeapReg); diff --git a/js/src/jit/x86-shared/BaseAssembler-x86-shared.h b/js/src/jit/x86-shared/BaseAssembler-x86-shared.h index e62255f41458..d383f91b9f55 100644 --- a/js/src/jit/x86-shared/BaseAssembler-x86-shared.h +++ b/js/src/jit/x86-shared/BaseAssembler-x86-shared.h @@ -578,6 +578,11 @@ class BaseAssembler : public GenericAssembler { threeByteOpSimd("vpmaddubsw", VEX_PD, OP3_PMADDUBSW_VdqWdq, ESCAPE_38, src1, src0, dst); } + void vpmaddubsw_mr(const void* address, XMMRegisterID src0, + XMMRegisterID dst) { + threeByteOpSimd("vpmaddubsw", VEX_PD, OP3_PMADDUBSW_VdqWdq, ESCAPE_38, + address, src0, dst); + } void vpaddb_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst) { twoByteOpSimd("vpaddb", VEX_PD, OP2_PADDB_VdqWdq, src1, src0, dst); @@ -5328,7 +5333,8 @@ class BaseAssembler : public GenericAssembler { spew("%-11s$%d, %s, %s", name, int32_t(imm), XMMRegName(src), XMMRegName(dst)); - m_formatter.twoByteOpVex(VEX_PD, opcode, (RegisterID)dst, src, + // For shift instructions, destination is stored in vvvv field. + m_formatter.twoByteOpVex(VEX_PD, opcode, (RegisterID)src, dst, (int)shiftKind); m_formatter.immediate8u(imm); } diff --git a/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp b/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp index 051b02cf9efc..2eed1bcc3cf8 100644 --- a/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp +++ b/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp @@ -2984,9 +2984,7 @@ void CodeGenerator::visitWasmConstantShiftSimd128( int32_t shift = ins->shift(); if (shift == 0) { - if (src != dest) { - masm.moveSimd128(src, dest); - } + masm.moveSimd128(src, dest); return; } diff --git a/js/src/jit/x86-shared/Lowering-x86-shared.cpp b/js/src/jit/x86-shared/Lowering-x86-shared.cpp index c6220341f946..c2247849440b 100644 --- a/js/src/jit/x86-shared/Lowering-x86-shared.cpp +++ b/js/src/jit/x86-shared/Lowering-x86-shared.cpp @@ -1231,7 +1231,12 @@ void LIRGenerator::visitWasmShiftSimd128(MWasmShiftSimd128* ins) { case wasm::SimdOp::I64x2ShrS: { auto* lir = new (alloc()) LWasmSignReplicationSimd128(useRegisterAtStart(lhs)); - defineReuseInput(lir, ins, LWasmConstantShiftSimd128::Src); + if (isThreeOpAllowed()) { + define(lir, ins); + } else { + // For non-AVX, it is always beneficial to reuse the input. + defineReuseInput(lir, ins, LWasmConstantShiftSimd128::Src); + } return; } default: @@ -1242,11 +1247,14 @@ void LIRGenerator::visitWasmShiftSimd128(MWasmShiftSimd128* ins) { # ifdef DEBUG js::wasm::ReportSimdAnalysis("shift -> constant shift"); # endif - // Almost always beneficial, and never detrimental, to reuse the input if - // possible. auto* lir = new (alloc()) LWasmConstantShiftSimd128(useRegisterAtStart(lhs), shiftCount); - defineReuseInput(lir, ins, LWasmConstantShiftSimd128::Src); + if (isThreeOpAllowed()) { + define(lir, ins); + } else { + // For non-AVX, it is always beneficial to reuse the input. 
+ defineReuseInput(lir, ins, LWasmConstantShiftSimd128::Src); + } return; } @@ -1449,8 +1457,11 @@ void LIRGenerator::visitWasmUnarySimd128(MWasmUnarySimd128* ins) { case wasm::SimdOp::I16x8Neg: case wasm::SimdOp::I32x4Neg: case wasm::SimdOp::I64x2Neg: + case wasm::SimdOp::I16x8ExtaddPairwiseI8x16S: // Prefer src != dest to avoid an unconditional src->temp move. - MOZ_ASSERT(!useAtStart && !reuseInput); + MOZ_ASSERT(!reuseInput); + // If AVX is enabled, we prefer useRegisterAtStart. + useAtStart = isThreeOpAllowed(); break; case wasm::SimdOp::F32x4Neg: case wasm::SimdOp::F64x2Neg: @@ -1465,7 +1476,6 @@ void LIRGenerator::visitWasmUnarySimd128(MWasmUnarySimd128* ins) { case wasm::SimdOp::I64x2Abs: case wasm::SimdOp::I32x4TruncSatF32x4S: case wasm::SimdOp::F32x4ConvertI32x4U: - case wasm::SimdOp::I16x8ExtaddPairwiseI8x16S: case wasm::SimdOp::I16x8ExtaddPairwiseI8x16U: case wasm::SimdOp::I32x4ExtaddPairwiseI16x8S: case wasm::SimdOp::I32x4ExtaddPairwiseI16x8U: @@ -1476,18 +1486,19 @@ void LIRGenerator::visitWasmUnarySimd128(MWasmUnarySimd128* ins) { case wasm::SimdOp::I64x2ExtendHighI32x4S: case wasm::SimdOp::I64x2ExtendHighI32x4U: // Prefer src == dest to avoid an unconditional src->dest move - // for better performance (e.g. non-PSHUFD use). + // for better performance in non-AVX mode (e.g. non-PSHUFD use). useAtStart = true; - reuseInput = true; + reuseInput = !isThreeOpAllowed(); break; case wasm::SimdOp::I32x4TruncSatF32x4U: case wasm::SimdOp::I32x4TruncSatF64x2SZero: case wasm::SimdOp::I32x4TruncSatF64x2UZero: case wasm::SimdOp::I8x16Popcnt: tempReg = tempSimd128(); - // Prefer src == dest to avoid an unconditional src->dest move. + // Prefer src == dest to avoid an unconditional src->dest move + // in non-AVX mode. useAtStart = true; - reuseInput = true; + reuseInput = !isThreeOpAllowed(); break; case wasm::SimdOp::I16x8ExtendLowI8x16S: case wasm::SimdOp::I16x8ExtendHighI8x16S: diff --git a/js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp b/js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp index 142785009e2d..a6ceb6925033 100644 --- a/js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp +++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp @@ -55,8 +55,8 @@ void MacroAssemblerX86Shared::splatX4(FloatRegister input, vbroadcastss(Operand(input), output); return; } - asMasm().moveSimd128Float(input.asSimd128(), output); - vshufps(0, output, output, output); + input = asMasm().moveSimd128FloatIfNotAVX(input.asSimd128(), output); + vshufps(0, input, input, output); } void MacroAssemblerX86Shared::splatX2(FloatRegister input, @@ -251,8 +251,9 @@ void MacroAssemblerX86Shared::compareInt8x16(FloatRegister lhs, Operand rhs, loadAlignedSimd128Int(rhs, scratch); } // src := src > lhs (i.e. lhs < rhs) - vpcmpgtb(Operand(lhs), scratch, scratch); - moveSimd128Int(scratch, output); + FloatRegister outputTemp = selectDestIfAVX(scratch, output); + vpcmpgtb(Operand(lhs), scratch, outputTemp); + moveSimd128Int(outputTemp, output); break; } case Assembler::Condition::NotEqual: @@ -351,8 +352,9 @@ void MacroAssemblerX86Shared::compareInt16x8(FloatRegister lhs, Operand rhs, loadAlignedSimd128Int(rhs, scratch); } // src := src > lhs (i.e. 
lhs < rhs) - vpcmpgtw(Operand(lhs), scratch, scratch); - moveSimd128Int(scratch, output); + FloatRegister outputTemp = selectDestIfAVX(scratch, output); + vpcmpgtw(Operand(lhs), scratch, outputTemp); + moveSimd128Int(outputTemp, output); break; } case Assembler::Condition::NotEqual: @@ -450,8 +452,9 @@ void MacroAssemblerX86Shared::compareInt32x4(FloatRegister lhs, Operand rhs, loadAlignedSimd128Int(rhs, scratch); } // src := src > lhs (i.e. lhs < rhs) - vpcmpgtd(Operand(lhs), scratch, scratch); - moveSimd128Int(scratch, output); + FloatRegister outputTemp = selectDestIfAVX(scratch, output); + vpcmpgtd(Operand(lhs), scratch, outputTemp); + moveSimd128Int(outputTemp, output); break; } case Assembler::Condition::NotEqual: @@ -583,8 +586,8 @@ void MacroAssemblerX86Shared::compareForOrderingInt64x2( vpsubq(Operand(lhs), temp1, temp1); vpcmpeqd(rhs, temp2, temp2); vandpd(temp2, temp1, temp1); - asMasm().moveSimd128(lhs, output); - vpcmpgtd(rhs, output, output); + lhs = asMasm().moveSimd128IntIfNotAVX(lhs, output); + vpcmpgtd(rhs, lhs, output); vpor(Operand(temp1), output, output); vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), output, output); break; @@ -593,8 +596,8 @@ void MacroAssemblerX86Shared::compareForOrderingInt64x2( vmovdqa(Operand(lhs), temp2); vpcmpgtd(Operand(lhs), temp1, temp1); vpcmpeqd(Operand(rhs), temp2, temp2); - asMasm().moveSimd128(lhs, output); - vpsubq(rhs, output, output); + lhs = asMasm().moveSimd128IntIfNotAVX(lhs, output); + vpsubq(rhs, lhs, output); vandpd(temp2, output, output); vpor(Operand(temp1), output, output); vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), output, output); @@ -604,8 +607,8 @@ void MacroAssemblerX86Shared::compareForOrderingInt64x2( vmovdqa(Operand(lhs), temp2); vpcmpgtd(Operand(lhs), temp1, temp1); vpcmpeqd(Operand(rhs), temp2, temp2); - asMasm().moveSimd128(lhs, output); - vpsubq(rhs, output, output); + lhs = asMasm().moveSimd128IntIfNotAVX(lhs, output); + vpsubq(rhs, lhs, output); vandpd(temp2, output, output); vpor(Operand(temp1), output, output); vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), output, output); @@ -617,8 +620,8 @@ void MacroAssemblerX86Shared::compareForOrderingInt64x2( vpsubq(Operand(lhs), temp1, temp1); vpcmpeqd(rhs, temp2, temp2); vandpd(temp2, temp1, temp1); - asMasm().moveSimd128(lhs, output); - vpcmpgtd(rhs, output, output); + lhs = asMasm().moveSimd128IntIfNotAVX(lhs, output); + vpcmpgtd(rhs, lhs, output); vpor(Operand(temp1), output, output); vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), output, output); asMasm().bitwiseXorSimd128(output, allOnes, output); @@ -967,17 +970,22 @@ void MacroAssemblerX86Shared::packedLeftShiftByScalarInt8x16( void MacroAssemblerX86Shared::packedLeftShiftByScalarInt8x16( Imm32 count, FloatRegister src, FloatRegister dest) { MOZ_ASSERT(count.value <= 7); - asMasm().moveSimd128(src, dest); + if (MOZ_UNLIKELY(count.value == 0)) { + moveSimd128Int(src, dest); + return; + } + src = asMasm().moveSimd128IntIfNotAVX(src, dest); // Use the doubling trick for low shift counts, otherwise mask off the bits // that are shifted out of the low byte of each word and use word shifts. The // optimal cutoff remains to be explored. 
if (count.value <= 3) { - for (int32_t shift = count.value; shift > 0; --shift) { - asMasm().addInt8x16(dest, dest); + vpaddb(Operand(src), src, dest); + for (int32_t shift = count.value - 1; shift > 0; --shift) { + vpaddb(Operand(dest), dest, dest); } } else { - asMasm().bitwiseAndSimd128( - dest, SimdConstant::SplatX16(0xFF >> count.value), dest); + asMasm().bitwiseAndSimd128(src, SimdConstant::SplatX16(0xFF >> count.value), + dest); vpsllw(count, dest, dest); } } @@ -1070,10 +1078,10 @@ void MacroAssemblerX86Shared::packedRightShiftByScalarInt64x2( FloatRegister in, Register count, FloatRegister temp, FloatRegister dest) { ScratchSimd128Scope scratch(asMasm()); vmovd(count, temp); - asMasm().moveSimd128(in, dest); asMasm().signReplicationInt64x2(in, scratch); + in = asMasm().moveSimd128FloatIfNotAVX(in, dest); // Invert if negative, shift all, invert back if negative. - vpxor(Operand(scratch), dest, dest); + vpxor(Operand(scratch), in, dest); vpsrlq(temp, dest, dest); vpxor(Operand(scratch), dest, dest); } @@ -1088,10 +1096,10 @@ void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt64x2( void MacroAssemblerX86Shared::packedRightShiftByScalarInt64x2( Imm32 count, FloatRegister src, FloatRegister dest) { ScratchSimd128Scope scratch(asMasm()); - asMasm().moveSimd128(src, dest); asMasm().signReplicationInt64x2(src, scratch); // Invert if negative, shift all, invert back if negative. - vpxor(Operand(scratch), dest, dest); + src = asMasm().moveSimd128FloatIfNotAVX(src, dest); + vpxor(Operand(scratch), src, dest); vpsrlq(Imm32(count.value & 63), dest, dest); vpxor(Operand(scratch), dest, dest); } @@ -1104,11 +1112,16 @@ void MacroAssemblerX86Shared::selectSimd128(FloatRegister mask, // Normally the codegen will attempt to enforce these register assignments so // that the moves are avoided. - asMasm().moveSimd128Int(onTrue, output); - asMasm().moveSimd128Int(mask, temp); + onTrue = asMasm().moveSimd128IntIfNotAVX(onTrue, output); + if (MOZ_UNLIKELY(mask == onTrue)) { + vpor(Operand(onFalse), onTrue, output); + return; + } - vpand(Operand(temp), output, output); - vpandn(Operand(onFalse), temp, temp); + mask = asMasm().moveSimd128IntIfNotAVX(mask, temp); + + vpand(Operand(mask), onTrue, output); + vpandn(Operand(onFalse), mask, temp); vpor(Operand(temp), output, output); } @@ -1131,7 +1144,6 @@ void MacroAssemblerX86Shared::unsignedConvertInt32x4ToFloat32x4( void MacroAssemblerX86Shared::truncSatFloat32x4ToInt32x4(FloatRegister src, FloatRegister dest) { ScratchSimd128Scope scratch(asMasm()); - asMasm().moveSimd128Float(src, dest); // The cvttps2dq instruction is the workhorse but does not handle NaN or out // of range values as we need it to. We want to saturate too-large positive @@ -1139,9 +1151,10 @@ void MacroAssemblerX86Shared::truncSatFloat32x4ToInt32x4(FloatRegister src, // become 0. // Convert NaN to 0 by masking away values that compare unordered to itself. - vmovaps(dest, scratch); + vmovaps(src, scratch); vcmpeqps(Operand(scratch), scratch, scratch); - vpand(Operand(scratch), dest, dest); + src = asMasm().moveSimd128FloatIfNotAVX(src, dest); + vpand(Operand(scratch), src, dest); // Compute the complement of each non-NaN lane's sign bit, we'll need this to // correct the result of cvttps2dq. All other output bits are garbage. 
@@ -1165,7 +1178,7 @@ void MacroAssemblerX86Shared::truncSatFloat32x4ToInt32x4(FloatRegister src, void MacroAssemblerX86Shared::unsignedTruncSatFloat32x4ToInt32x4( FloatRegister src, FloatRegister temp, FloatRegister dest) { ScratchSimd128Scope scratch(asMasm()); - asMasm().moveSimd128Float(src, dest); + src = asMasm().moveSimd128FloatIfNotAVX(src, dest); // The cvttps2dq instruction is the workhorse but does not handle NaN or out // of range values as we need it to. We want to saturate too-large positive @@ -1173,7 +1186,7 @@ void MacroAssemblerX86Shared::unsignedTruncSatFloat32x4ToInt32x4( // Convert NaN and negative values to zeroes in dest. vpxor(Operand(scratch), scratch, scratch); - vmaxps(Operand(scratch), dest, dest); + vmaxps(Operand(scratch), src, dest); // Place the largest positive signed integer in all lanes in scratch. // We use it to bias the conversion to handle edge cases. @@ -1217,14 +1230,14 @@ void MacroAssemblerX86Shared::unsignedTruncSatFloat32x4ToInt32x4( void MacroAssemblerX86Shared::unsignedTruncSatFloat32x4ToInt32x4Relaxed( FloatRegister src, FloatRegister dest) { ScratchSimd128Scope scratch(asMasm()); - asMasm().moveSimd128Float(src, dest); + src = asMasm().moveSimd128FloatIfNotAVX(src, dest); // Place lanes below 80000000h into dest, otherwise into scratch. // Keep dest or scratch 0 as default. asMasm().loadConstantSimd128Float(SimdConstant::SplatX4(0x4f000000), scratch); vcmpltps(Operand(src), scratch, scratch); vpand(Operand(src), scratch, scratch); - vpxor(Operand(scratch), dest, dest); + vpxor(Operand(scratch), src, dest); // Convert lanes below 80000000h into unsigned int without issues. vcvttps2dq(dest, dest); @@ -1267,10 +1280,10 @@ void MacroAssemblerX86Shared::truncSatFloat64x2ToInt32x4(FloatRegister src, void MacroAssemblerX86Shared::unsignedTruncSatFloat64x2ToInt32x4( FloatRegister src, FloatRegister temp, FloatRegister dest) { ScratchSimd128Scope scratch(asMasm()); - asMasm().moveSimd128Float(src, dest); + src = asMasm().moveSimd128FloatIfNotAVX(src, dest); vxorpd(scratch, scratch, scratch); - vmaxpd(Operand(scratch), dest, dest); + vmaxpd(Operand(scratch), src, dest); asMasm().loadConstantSimd128Float(SimdConstant::SplatX2(4294967295.0), temp); vminpd(Operand(temp), dest, dest); @@ -1284,11 +1297,10 @@ void MacroAssemblerX86Shared::unsignedTruncSatFloat64x2ToInt32x4( void MacroAssemblerX86Shared::unsignedTruncSatFloat64x2ToInt32x4Relaxed( FloatRegister src, FloatRegister dest) { ScratchSimd128Scope scratch(asMasm()); - asMasm().moveSimd128Float(src, dest); // The same as unsignedConvertInt32x4ToFloat64x2, but without NaN // and out-of-bounds checks. 
- vroundpd(SSERoundingMode::Trunc, Operand(dest), dest); + vroundpd(SSERoundingMode::Trunc, Operand(src), dest); asMasm().loadConstantSimd128Float(SimdConstant::SplatX2(4503599627370496.0), scratch); vaddpd(Operand(scratch), dest, dest); @@ -1299,9 +1311,9 @@ void MacroAssemblerX86Shared::popcntInt8x16(FloatRegister src, FloatRegister temp, FloatRegister output) { ScratchSimd128Scope scratch(asMasm()); - asMasm().loadConstantSimd128Float(SimdConstant::SplatX16(0x0f), scratch); - asMasm().moveSimd128Int(src, temp); - vpand(scratch, temp, temp); + asMasm().loadConstantSimd128Int(SimdConstant::SplatX16(0x0f), scratch); + FloatRegister srcForTemp = asMasm().moveSimd128IntIfNotAVX(src, temp); + vpand(scratch, srcForTemp, temp); vpandn(src, scratch, scratch); int8_t counts[] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4}; asMasm().loadConstantSimd128(SimdConstant::CreateX16(counts), output); diff --git a/js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h b/js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h index 8a0b22bda761..1896ded29007 100644 --- a/js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h +++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h @@ -1452,14 +1452,14 @@ void MacroAssembler::concatAndRightShiftSimd128(FloatRegister lhs, void MacroAssembler::leftShiftSimd128(Imm32 count, FloatRegister src, FloatRegister dest) { - moveSimd128(src, dest); - vpslldq(count, dest, dest); + src = moveSimd128IntIfNotAVX(src, dest); + vpslldq(count, src, dest); } void MacroAssembler::rightShiftSimd128(Imm32 count, FloatRegister src, FloatRegister dest) { - moveSimd128(src, dest); - vpsrldq(count, dest, dest); + src = moveSimd128IntIfNotAVX(src, dest); + vpsrldq(count, src, dest); } // Reverse bytes in lanes. @@ -1467,10 +1467,10 @@ void MacroAssembler::rightShiftSimd128(Imm32 count, FloatRegister src, void MacroAssembler::reverseInt16x8(FloatRegister src, FloatRegister dest) { // Byteswap is MOV + PSLLW + PSRLW + POR, a small win over PSHUFB. ScratchSimd128Scope scratch(*this); - moveSimd128(src, dest); - moveSimd128(src, scratch); - vpsllw(Imm32(8), dest, dest); - vpsrlw(Imm32(8), scratch, scratch); + FloatRegister srcForScratch = moveSimd128IntIfNotAVX(src, scratch); + vpsrlw(Imm32(8), srcForScratch, scratch); + src = moveSimd128IntIfNotAVX(src, dest); + vpsllw(Imm32(8), src, dest); vpor(scratch, dest, dest); } @@ -1556,8 +1556,8 @@ void MacroAssembler::bitmaskInt16x8(FloatRegister src, Register dest) { // input and shifting rather than masking at the end, but creates a false // dependency on the old value of scratch. The better fix is to allow src to // be clobbered. 
- moveSimd128(src, scratch);
- vpacksswb(Operand(scratch), scratch, scratch);
+ src = moveSimd128IntIfNotAVX(src, scratch);
+ vpacksswb(Operand(src), src, scratch);
 vpmovmskb(scratch, dest);
 andl(Imm32(0xFF), dest);
 }
@@ -1713,20 +1713,22 @@ void MacroAssembler::mulInt64x2(FloatRegister lhs, FloatRegister rhs,
 // lhs = <D C> <B A>
 // rhs = <H G> <F E>
 // result = <(DG+CH)_low+CG_high CG_low> <(BE+AF)_low+AE_high AE_low>
- moveSimd128(lhs, temp); // temp = <D C> <B A>
- vpsrlq(Imm32(32), temp, temp); // temp = <0 D> <0 B>
- vpmuludq(rhs, temp, temp); // temp = <DG> <BE>
- moveSimd128(rhs, temp2); // temp2 = <H G> <F E>
- vpsrlq(Imm32(32), temp2, temp2); // temp2 = <0 H> <0 F>
- vpmuludq(lhs, temp2, temp2); // temp2 = <CH> <AF>
- vpaddq(Operand(temp), temp2, temp2); // temp2 = <DG+CH> <BE+AF>
- vpsllq(Imm32(32), temp2, temp2); // temp2 = <(DG+CH)_low 0>
- // <(BE+AF)_low 0>
- vpmuludq(rhs, dest, dest); // dest = <CG_high CG_low>
- // <AE_high AE_low>
- vpaddq(Operand(temp2), dest, dest); // dest =
- // <(DG+CH)_low+CG_high CG_low>
- // <(BE+AF)_low+AE_high AE_low>
+ FloatRegister lhsForTemp =
+ moveSimd128IntIfNotAVX(lhs, temp); // temp = <D C> <B A>
+ vpsrlq(Imm32(32), lhsForTemp, temp); // temp = <0 D> <0 B>
+ vpmuludq(rhs, temp, temp); // temp = <DG> <BE>
+ FloatRegister rhsForTemp =
+ moveSimd128IntIfNotAVX(rhs, temp2); // temp2 = <H G> <F E>
+ vpsrlq(Imm32(32), rhsForTemp, temp2); // temp2 = <0 H> <0 F>
+ vpmuludq(lhs, temp2, temp2); // temp2 = <CH> <AF>
+ vpaddq(Operand(temp), temp2, temp2); // temp2 = <DG+CH> <BE+AF>
+ vpsllq(Imm32(32), temp2, temp2); // temp2 = <(DG+CH)_low 0>
+ // <(BE+AF)_low 0>
+ vpmuludq(rhs, dest, dest); // dest = <CG_high CG_low>
+ // <AE_high AE_low>
+ vpaddq(Operand(temp2), dest, dest); // dest =
+ // <(DG+CH)_low+CG_high CG_low>
+ // <(BE+AF)_low+AE_high AE_low>
 }

 // Code generation from the PR: https://github.com/WebAssembly/simd/pull/376.
@@ -2141,9 +2143,9 @@ void MacroAssembler::absInt32x4(FloatRegister src, FloatRegister dest) {

 void MacroAssembler::absInt64x2(FloatRegister src, FloatRegister dest) {
 ScratchSimd128Scope scratch(*this);
- moveSimd128(src, dest);
 signReplicationInt64x2(src, scratch);
- vpxor(Operand(scratch), dest, dest);
+ src = moveSimd128IntIfNotAVX(src, dest);
+ vpxor(Operand(scratch), src, dest);
 vpsubq(Operand(scratch), dest, dest);
 }
@@ -2167,7 +2169,7 @@ void MacroAssembler::leftShiftInt16x8(Register rhs, FloatRegister lhsDest) {
 void MacroAssembler::leftShiftInt16x8(Imm32 count, FloatRegister src,
 FloatRegister dest) {
- moveSimd128(src, dest);
+ src = moveSimd128IntIfNotAVX(src, dest);
 vpsllw(count, src, dest);
 }
@@ -2178,7 +2180,7 @@ void MacroAssembler::leftShiftInt32x4(Register rhs, FloatRegister lhsDest) {
 void MacroAssembler::leftShiftInt32x4(Imm32 count, FloatRegister src,
 FloatRegister dest) {
- moveSimd128(src, dest);
+ src = moveSimd128IntIfNotAVX(src, dest);
 vpslld(count, src, dest);
 }
@@ -2189,7 +2191,7 @@ void MacroAssembler::leftShiftInt64x2(Register rhs, FloatRegister lhsDest) {
 void MacroAssembler::leftShiftInt64x2(Imm32 count, FloatRegister src,
 FloatRegister dest) {
- moveSimd128(src, dest);
+ src = moveSimd128IntIfNotAVX(src, dest);
 vpsllq(count, src, dest);
 }
@@ -2226,7 +2228,7 @@ void MacroAssembler::rightShiftInt16x8(Register rhs, FloatRegister lhsDest) {
 void MacroAssembler::rightShiftInt16x8(Imm32 count, FloatRegister src,
 FloatRegister dest) {
- moveSimd128(src, dest);
+ src = moveSimd128IntIfNotAVX(src, dest);
 vpsraw(count, src, dest);
 }
@@ -2238,7 +2240,7 @@ void MacroAssembler::unsignedRightShiftInt16x8(Register rhs,
 void MacroAssembler::unsignedRightShiftInt16x8(Imm32 count, FloatRegister src,
 FloatRegister dest) {
- moveSimd128(src, dest);
+ src = moveSimd128IntIfNotAVX(src, dest);
 vpsrlw(count, src, dest);
} @@ -2249,7 +2251,7 @@ void MacroAssembler::rightShiftInt32x4(Register rhs, FloatRegister lhsDest) { void MacroAssembler::rightShiftInt32x4(Imm32 count, FloatRegister src, FloatRegister dest) { - moveSimd128(src, dest); + src = moveSimd128IntIfNotAVX(src, dest); vpsrad(count, src, dest); } @@ -2261,7 +2263,7 @@ void MacroAssembler::unsignedRightShiftInt32x4(Register rhs, void MacroAssembler::unsignedRightShiftInt32x4(Imm32 count, FloatRegister src, FloatRegister dest) { - moveSimd128(src, dest); + src = moveSimd128IntIfNotAVX(src, dest); vpsrld(count, src, dest); } @@ -2284,7 +2286,7 @@ void MacroAssembler::unsignedRightShiftInt64x2(Register rhs, void MacroAssembler::unsignedRightShiftInt64x2(Imm32 count, FloatRegister src, FloatRegister dest) { - moveSimd128(src, dest); + src = moveSimd128IntIfNotAVX(src, dest); vpsrlq(count, src, dest); } @@ -2299,14 +2301,14 @@ void MacroAssembler::signReplicationInt8x16(FloatRegister src, void MacroAssembler::signReplicationInt16x8(FloatRegister src, FloatRegister dest) { - moveSimd128(src, dest); - vpsraw(Imm32(15), dest, dest); + src = moveSimd128IntIfNotAVX(src, dest); + vpsraw(Imm32(15), src, dest); } void MacroAssembler::signReplicationInt32x4(FloatRegister src, FloatRegister dest) { - moveSimd128(src, dest); - vpsrad(Imm32(31), dest, dest); + src = moveSimd128IntIfNotAVX(src, dest); + vpsrad(Imm32(31), src, dest); } void MacroAssembler::signReplicationInt64x2(FloatRegister src, @@ -2810,30 +2812,22 @@ void MacroAssembler::extAddPairwiseInt8x16(FloatRegister src, void MacroAssembler::unsignedExtAddPairwiseInt8x16(FloatRegister src, FloatRegister dest) { - ScratchSimd128Scope scratch(*this); - moveSimd128(src, dest); - loadConstantSimd128Int(SimdConstant::SplatX16(1), scratch); - vpmaddubsw(scratch, dest, dest); + src = moveSimd128IntIfNotAVX(src, dest); + vpmaddubswSimd128(SimdConstant::SplatX16(1), src, dest); } void MacroAssembler::extAddPairwiseInt16x8(FloatRegister src, FloatRegister dest) { - ScratchSimd128Scope scratch(*this); - moveSimd128(src, dest); - loadConstantSimd128Int(SimdConstant::SplatX8(1), scratch); - vpmaddwd(Operand(scratch), dest, dest); + src = moveSimd128IntIfNotAVX(src, dest); + vpmaddwdSimd128(SimdConstant::SplatX8(1), src, dest); } void MacroAssembler::unsignedExtAddPairwiseInt16x8(FloatRegister src, FloatRegister dest) { - ScratchSimd128Scope scratch(*this); - moveSimd128(src, dest); - loadConstantSimd128Int(SimdConstant::SplatX8(0x8000), scratch); - vpxor(scratch, dest, dest); - loadConstantSimd128Int(SimdConstant::SplatX8(1), scratch); - vpmaddwd(Operand(scratch), dest, dest); - loadConstantSimd128Int(SimdConstant::SplatX4(0x00010000), scratch); - vpaddd(Operand(scratch), dest, dest); + src = moveSimd128IntIfNotAVX(src, dest); + vpxorSimd128(SimdConstant::SplatX8(-0x8000), src, dest); + vpmaddwdSimd128(SimdConstant::SplatX8(1), dest, dest); + vpadddSimd128(SimdConstant::SplatX4(0x00010000), dest, dest); } // Floating square root @@ -3023,8 +3017,8 @@ void MacroAssembler::unsignedWidenLowInt32x4(FloatRegister src, } void MacroAssembler::widenHighInt32x4(FloatRegister src, FloatRegister dest) { - if (src == dest) { - vmovhlps(dest, dest, dest); + if (src == dest || HasAVX()) { + vmovhlps(src, src, dest); } else { vpshufd(ComputeShuffleMask(2, 3, 2, 3), src, dest); } @@ -3033,11 +3027,10 @@ void MacroAssembler::widenHighInt32x4(FloatRegister src, FloatRegister dest) { void MacroAssembler::unsignedWidenHighInt32x4(FloatRegister src, FloatRegister dest) { - moveSimd128(src, dest); - ScratchSimd128Scope scratch(*this); + 
src = moveSimd128IntIfNotAVX(src, dest); vpxor(scratch, scratch, scratch); - vpunpckhdq(scratch, dest, dest); + vpunpckhdq(scratch, src, dest); } // Floating multiply-accumulate: srcDest [+-]= src1 * src2 diff --git a/js/src/jit/x86-shared/MacroAssembler-x86-shared.h b/js/src/jit/x86-shared/MacroAssembler-x86-shared.h index c39c48510d53..4364151dc81a 100644 --- a/js/src/jit/x86-shared/MacroAssembler-x86-shared.h +++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared.h @@ -587,6 +587,10 @@ class MacroAssemblerX86Shared : public Assembler { moveSimd128Int(src, dest); return dest; } + FloatRegister selectDestIfAVX(FloatRegister src, FloatRegister dest) { + MOZ_ASSERT(src.isSimd128() && dest.isSimd128()); + return HasAVX() ? dest : src; + } void loadUnalignedSimd128Int(const Address& src, FloatRegister dest) { vmovdqu(Operand(src), dest); } diff --git a/js/src/jit/x86/MacroAssembler-x86.cpp b/js/src/jit/x86/MacroAssembler-x86.cpp index 4095bcc3dc7a..2baafdddf0fe 100644 --- a/js/src/jit/x86/MacroAssembler-x86.cpp +++ b/js/src/jit/x86/MacroAssembler-x86.cpp @@ -421,6 +421,12 @@ void MacroAssemblerX86::vcmplepdSimd128(const SimdConstant& v, vpPatchOpSimd128(v, lhs, dest, &X86Encoding::BaseAssemblerX86::vcmplepd_mr); } +void MacroAssemblerX86::vpmaddubswSimd128(const SimdConstant& v, + FloatRegister lhs, + FloatRegister dest) { + vpPatchOpSimd128(v, lhs, dest, &X86Encoding::BaseAssemblerX86::vpmaddubsw_mr); +} + void MacroAssemblerX86::finish() { // Last instruction may be an indirect jump so eagerly insert an undefined // instruction byte to prevent processors from decoding data values into diff --git a/js/src/jit/x86/MacroAssembler-x86.h b/js/src/jit/x86/MacroAssembler-x86.h index c5dd10665e86..74551a2c7f0e 100644 --- a/js/src/jit/x86/MacroAssembler-x86.h +++ b/js/src/jit/x86/MacroAssembler-x86.h @@ -1063,6 +1063,8 @@ class MacroAssemblerX86 : public MacroAssemblerX86Shared { FloatRegister dest); void vcmplepdSimd128(const SimdConstant& v, FloatRegister lhs, FloatRegister dest); + void vpmaddubswSimd128(const SimdConstant& v, FloatRegister lhs, + FloatRegister dest); Condition testInt32Truthy(bool truthy, const ValueOperand& operand) { test32(operand.payloadReg(), operand.payloadReg());
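
Note on the core pattern of this patch: the moveSimd128XXXIfNotAVX helpers
let one code path serve both instruction sets. With AVX they return `src`
unchanged, so the following three-operand VEX instruction reads `src` and
writes `dest` without a copy; without AVX they emit the move that the
destructive two-operand SSE encoding requires and return `dest`. A minimal
sketch of the helper's shape, following the context lines visible in the
MacroAssembler-x86-shared.h hunk above (illustrative, not a verbatim copy of
the tree):

  FloatRegister moveSimd128IntIfNotAVX(FloatRegister src, FloatRegister dest) {
    if (HasAVX()) {
      return src;               // AVX: e.g. vpsllw $imm, src, dest
    }
    moveSimd128Int(src, dest);  // SSE: movdqa src, dest; then psllw $imm, dest
    return dest;
  }

Typical call site, as in the leftShiftInt16x8 hunk above:

  src = moveSimd128IntIfNotAVX(src, dest);
  vpsllw(count, src, dest);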
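
Note on the "shift by imm8 VEX encoding" fix: in the VEX form of the
xmm-by-immediate shift group (opcodes 0F 71/72/73), ModRM.reg carries the
opcode extension, ModRM.rm carries the register being shifted, and the
destination lives in VEX.vvvv — the reverse of the usual reg/vvvv roles,
which is what swapping the twoByteOpVex arguments corrects. One expected
encoding from the AVX test above, decoded as a worked example:

  c5 f9 71 f1 01     vpsllw $0x01, %xmm1, %xmm0
  c5                 two-byte VEX prefix
  f9 = 1111 1001     vvvv = ~1111 = 0000 -> destination %xmm0; L=0; pp=01 (66)
  71                 packed-word shift-by-immediate opcode group
  f1 = 11 110 001    mod=11, reg=/6 (psllw), rm=001 -> source %xmm1
  01                 imm8 shift count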
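
Note on the extadd_pairwise lowerings exercised by the new test: pmaddubsw
multiplies the unsigned bytes of its first operand by the signed bytes of its
second and sums adjacent product pairs into i16 lanes, so multiplying by a
splat of 1 turns it into a pairwise add. The I16x8ExtaddPairwiseI8x16S fix
keeps the constant in the unsigned slot and the input in the signed slot,
which is why even the AVX expectation loads the constant into the output
register first: a RIP-relative constant can only encode as the second
operand. A scalar model of the lowerings, as a sketch:

  // i16x8.extadd_pairwise_i8x16_s: ones are the unsigned operand.
  //   dst.i16[i] = 1 * src.s8[2i] + 1 * src.s8[2i+1]
  // i16x8.extadd_pairwise_i8x16_u: src is the unsigned operand.
  //   dst.i16[i] = src.u8[2i] * 1 + src.u8[2i+1] * 1
  // i32x4.extadd_pairwise_i16x8_u: there is no unsigned pmaddwd, so bias
  // each u16 lane into signed range, add pairwise, then undo both biases:
  //   t   = src ^ 0x8000 per lane     (as i16: x - 0x8000)
  //   t   = pmaddwd(t, splat_i16(1))  (pairwise i16 add into i32 lanes)
  //   dst = t + 0x00010000 per i32    (restores 2 * 0x8000)
  // i32x4.extadd_pairwise_i16x8_s needs no trick: pmaddwd by a splat of 1
  // adds signed pairs directly.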