From bb2a780b8af8ea40f9e7e34658a65d0d607e9d30 Mon Sep 17 00:00:00 2001 From: Yury Delendik Date: Fri, 11 Feb 2022 13:27:12 +0000 Subject: [PATCH] Bug 1753115 - Refactor usage of moveSimd128. r=jseward * Replace with moveSimd128XXXIfNotAVX, when possible * Fix I16x8ExtaddPairwiseI8x16S lowering; relax lowering for AVX * Add postMoveSimd128IntIfNotAVX utility * Add extadd_pairwise tests * Fix neg-abs-not codegen tests * Fix shift by imm8 VEX encoding Differential Revision: https://phabricator.services.mozilla.com/D137581 --- .../tests/wasm/simd/avx2-x64-ion-codegen.js | 27 +++++ .../wasm/simd/neg-abs-not-x64-ion-codegen.js | 2 +- .../wasm/simd/pairwise-x64-ion-codegen.js | 38 ++++++ js/src/jit/x64/BaseAssembler-x64.h | 4 + js/src/jit/x64/MacroAssembler-x64.cpp | 7 ++ js/src/jit/x64/MacroAssembler-x64.h | 2 + .../jit/x86-shared/BaseAssembler-x86-shared.h | 8 +- .../x86-shared/CodeGenerator-x86-shared.cpp | 4 +- js/src/jit/x86-shared/Lowering-x86-shared.cpp | 31 +++-- .../MacroAssembler-x86-shared-SIMD.cpp | 98 +++++++++------- .../MacroAssembler-x86-shared-inl.h | 111 ++++++++---------- .../x86-shared/MacroAssembler-x86-shared.h | 4 + js/src/jit/x86/MacroAssembler-x86.cpp | 6 + js/src/jit/x86/MacroAssembler-x86.h | 2 + 14 files changed, 227 insertions(+), 117 deletions(-) create mode 100644 js/src/jit-test/tests/wasm/simd/pairwise-x64-ion-codegen.js diff --git a/js/src/jit-test/tests/wasm/simd/avx2-x64-ion-codegen.js b/js/src/jit-test/tests/wasm/simd/avx2-x64-ion-codegen.js index 4479a3d42057..a70423241096 100644 --- a/js/src/jit-test/tests/wasm/simd/avx2-x64-ion-codegen.js +++ b/js/src/jit-test/tests/wasm/simd/avx2-x64-ion-codegen.js @@ -433,3 +433,30 @@ codegenTestX64_v128xLITERAL_v128_avxhack( `c5 f1 eb 05 ${RIPRADDR} vporx ${RIPR}, %xmm1, %xmm0`], ['v128.xor', '(v128.const i8x16 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2)', `c5 f1 ef 05 ${RIPRADDR} vpxorx ${RIPR}, %xmm1, %xmm0`]]); + +// Shift by constant encodings +codegenTestX64_v128xLITERAL_v128_avxhack( + [['i8x16.shl', '(i32.const 2)', ` +c5 f1 fc c1 vpaddb %xmm1, %xmm1, %xmm0 +66 0f fc c0 paddb %xmm0, %xmm0`], + ['i8x16.shl', '(i32.const 4)', ` +c5 f1 db 05 ${RIPRADDR} vpandx ${RIPR}, %xmm1, %xmm0 +66 0f 71 f0 04 psllw \\$0x04, %xmm0`], + ['i16x8.shl', '(i32.const 1)', + 'c5 f9 71 f1 01 vpsllw \\$0x01, %xmm1, %xmm0'], + ['i16x8.shr_s', '(i32.const 3)', + 'c5 f9 71 e1 03 vpsraw \\$0x03, %xmm1, %xmm0'], + ['i16x8.shr_u', '(i32.const 2)', + 'c5 f9 71 d1 02 vpsrlw \\$0x02, %xmm1, %xmm0'], + ['i32x4.shl', '(i32.const 5)', + 'c5 f9 72 f1 05 vpslld \\$0x05, %xmm1, %xmm0'], + ['i32x4.shr_s', '(i32.const 2)', + 'c5 f9 72 e1 02 vpsrad \\$0x02, %xmm1, %xmm0'], + ['i32x4.shr_u', '(i32.const 5)', + 'c5 f9 72 d1 05 vpsrld \\$0x05, %xmm1, %xmm0'], + ['i64x2.shr_s', '(i32.const 7)', ` +c5 79 70 f9 f5 vpshufd \\$0xF5, %xmm1, %xmm15 +66 41 0f 72 e7 1f psrad \\$0x1F, %xmm15 +c4 c1 71 ef c7 vpxor %xmm15, %xmm1, %xmm0 +66 0f 73 d0 07 psrlq \\$0x07, %xmm0 +66 41 0f ef c7 pxor %xmm15, %xmm0`]]); diff --git a/js/src/jit-test/tests/wasm/simd/neg-abs-not-x64-ion-codegen.js b/js/src/jit-test/tests/wasm/simd/neg-abs-not-x64-ion-codegen.js index 20768051c5bc..0ae75f38fb43 100644 --- a/js/src/jit-test/tests/wasm/simd/neg-abs-not-x64-ion-codegen.js +++ b/js/src/jit-test/tests/wasm/simd/neg-abs-not-x64-ion-codegen.js @@ -1,4 +1,4 @@ -// |jit-test| skip-if: !wasmSimdEnabled() || !hasDisassembler() || wasmCompileMode() || getBuildConfiguration().simulator; include:codegen-x64-test.js +// |jit-test| skip-if: !wasmSimdEnabled() || !hasDisassembler() || wasmCompileMode() != "ion" 
|| !getBuildConfiguration().x64 || getBuildConfiguration().simulator; include:codegen-x64-test.js // Test that there are no extraneous moves for variable SIMD negate, abs, and // not instructions. See README-codegen.md for general information about this diff --git a/js/src/jit-test/tests/wasm/simd/pairwise-x64-ion-codegen.js b/js/src/jit-test/tests/wasm/simd/pairwise-x64-ion-codegen.js new file mode 100644 index 000000000000..53ab47fdb8da --- /dev/null +++ b/js/src/jit-test/tests/wasm/simd/pairwise-x64-ion-codegen.js @@ -0,0 +1,38 @@ +// |jit-test| skip-if: !wasmSimdEnabled() || !hasDisassembler() || wasmCompileMode() != "ion" || !getBuildConfiguration().x64 || getBuildConfiguration().simulator; include:codegen-x64-test.js + +// Tests for SIMD add pairwise instructions. + +if (!isAvxPresent()) { + + codegenTestX64_IGNOREDxv128_v128( + [['i16x8.extadd_pairwise_i8x16_s', ` +66 0f 6f 05 ${RIPRADDR} movdqax ${RIPR}, %xmm0 +66 0f 38 04 c1 pmaddubsw %xmm1, %xmm0`], + ['i16x8.extadd_pairwise_i8x16_u', ` +66 0f 6f c1 movdqa %xmm1, %xmm0 +66 0f 38 04 05 ${RIPRADDR} pmaddubswx ${RIPR}, %xmm0`], + ['i32x4.extadd_pairwise_i16x8_s', ` +66 0f 6f c1 movdqa %xmm1, %xmm0 +66 0f f5 05 ${RIPRADDR} pmaddwdx ${RIPR}, %xmm0`], + ['i32x4.extadd_pairwise_i16x8_u', ` +66 0f 6f c1 movdqa %xmm1, %xmm0 +66 0f ef 05 ${RIPRADDR} pxorx ${RIPR}, %xmm0 +66 0f f5 05 ${RIPRADDR} pmaddwdx ${RIPR}, %xmm0 +66 0f fe 05 ${RIPRADDR} padddx ${RIPR}, %xmm0`]]); + +} else { + + codegenTestX64_IGNOREDxv128_v128( + [['i16x8.extadd_pairwise_i8x16_s', ` +66 0f 6f 05 ${RIPRADDR} movdqax ${RIPR}, %xmm0 +66 0f 38 04 c1 pmaddubsw %xmm1, %xmm0`], + ['i16x8.extadd_pairwise_i8x16_u', ` +c4 e2 71 04 05 ${RIPRADDR} vpmaddubswx ${RIPR}, %xmm1, %xmm0`], + ['i32x4.extadd_pairwise_i16x8_s', ` +c5 f1 f5 05 ${RIPRADDR} vpmaddwdx ${RIPR}, %xmm1, %xmm0`], + ['i32x4.extadd_pairwise_i16x8_u', ` +c5 f1 ef 05 ${RIPRADDR} vpxorx ${RIPR}, %xmm1, %xmm0 +66 0f f5 05 ${RIPRADDR} pmaddwdx ${RIPR}, %xmm0 +66 0f fe 05 ${RIPRADDR} padddx ${RIPR}, %xmm0`]]); + +} diff --git a/js/src/jit/x64/BaseAssembler-x64.h b/js/src/jit/x64/BaseAssembler-x64.h index 6b4d67338247..47b422d3b788 100644 --- a/js/src/jit/x64/BaseAssembler-x64.h +++ b/js/src/jit/x64/BaseAssembler-x64.h @@ -1118,6 +1118,10 @@ class BaseAssemblerX64 : public BaseAssembler { return twoByteRipOpImmSimd("vcmppd", VEX_PD, OP2_CMPPD_VpdWpd, X86Encoding::ConditionCmp_LE, src, dst); } + [[nodiscard]] JmpSrc vpmaddubsw_ripr(XMMRegisterID src, XMMRegisterID dst) { + return threeByteRipOpSimd("vpmaddubsw", VEX_PD, OP3_PMADDUBSW_VdqWdq, + ESCAPE_38, src, dst); + } // BMI instructions: diff --git a/js/src/jit/x64/MacroAssembler-x64.cpp b/js/src/jit/x64/MacroAssembler-x64.cpp index ec27719cccd4..eafa930bcbe3 100644 --- a/js/src/jit/x64/MacroAssembler-x64.cpp +++ b/js/src/jit/x64/MacroAssembler-x64.cpp @@ -408,6 +408,13 @@ void MacroAssemblerX64::vcmplepdSimd128(const SimdConstant& v, vpRiprOpSimd128(v, lhs, dest, &X86Encoding::BaseAssemblerX64::vcmplepd_ripr); } +void MacroAssemblerX64::vpmaddubswSimd128(const SimdConstant& v, + FloatRegister lhs, + FloatRegister dest) { + vpRiprOpSimd128(v, lhs, dest, + &X86Encoding::BaseAssemblerX64::vpmaddubsw_ripr); +} + void MacroAssemblerX64::bindOffsets( const MacroAssemblerX86Shared::UsesVector& uses) { for (JmpSrc src : uses) { diff --git a/js/src/jit/x64/MacroAssembler-x64.h b/js/src/jit/x64/MacroAssembler-x64.h index 2fb558d1d9c9..d25ae252db4c 100644 --- a/js/src/jit/x64/MacroAssembler-x64.h +++ b/js/src/jit/x64/MacroAssembler-x64.h @@ -1100,6 +1100,8 @@ class 
MacroAssemblerX64 : public MacroAssemblerX86Shared { FloatRegister dest); void vcmplepdSimd128(const SimdConstant& v, FloatRegister lhs, FloatRegister dest); + void vpmaddubswSimd128(const SimdConstant& v, FloatRegister lhs, + FloatRegister dest); void loadWasmPinnedRegsFromTls() { loadPtr(Address(WasmTlsReg, offsetof(wasm::TlsData, memoryBase)), HeapReg); diff --git a/js/src/jit/x86-shared/BaseAssembler-x86-shared.h b/js/src/jit/x86-shared/BaseAssembler-x86-shared.h index e62255f41458..d383f91b9f55 100644 --- a/js/src/jit/x86-shared/BaseAssembler-x86-shared.h +++ b/js/src/jit/x86-shared/BaseAssembler-x86-shared.h @@ -578,6 +578,11 @@ class BaseAssembler : public GenericAssembler { threeByteOpSimd("vpmaddubsw", VEX_PD, OP3_PMADDUBSW_VdqWdq, ESCAPE_38, src1, src0, dst); } + void vpmaddubsw_mr(const void* address, XMMRegisterID src0, + XMMRegisterID dst) { + threeByteOpSimd("vpmaddubsw", VEX_PD, OP3_PMADDUBSW_VdqWdq, ESCAPE_38, + address, src0, dst); + } void vpaddb_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst) { twoByteOpSimd("vpaddb", VEX_PD, OP2_PADDB_VdqWdq, src1, src0, dst); @@ -5328,7 +5333,8 @@ class BaseAssembler : public GenericAssembler { spew("%-11s$%d, %s, %s", name, int32_t(imm), XMMRegName(src), XMMRegName(dst)); - m_formatter.twoByteOpVex(VEX_PD, opcode, (RegisterID)dst, src, + // For shift instructions, destination is stored in vvvv field. + m_formatter.twoByteOpVex(VEX_PD, opcode, (RegisterID)src, dst, (int)shiftKind); m_formatter.immediate8u(imm); } diff --git a/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp b/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp index 051b02cf9efc..2eed1bcc3cf8 100644 --- a/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp +++ b/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp @@ -2984,9 +2984,7 @@ void CodeGenerator::visitWasmConstantShiftSimd128( int32_t shift = ins->shift(); if (shift == 0) { - if (src != dest) { - masm.moveSimd128(src, dest); - } + masm.moveSimd128(src, dest); return; } diff --git a/js/src/jit/x86-shared/Lowering-x86-shared.cpp b/js/src/jit/x86-shared/Lowering-x86-shared.cpp index c6220341f946..c2247849440b 100644 --- a/js/src/jit/x86-shared/Lowering-x86-shared.cpp +++ b/js/src/jit/x86-shared/Lowering-x86-shared.cpp @@ -1231,7 +1231,12 @@ void LIRGenerator::visitWasmShiftSimd128(MWasmShiftSimd128* ins) { case wasm::SimdOp::I64x2ShrS: { auto* lir = new (alloc()) LWasmSignReplicationSimd128(useRegisterAtStart(lhs)); - defineReuseInput(lir, ins, LWasmConstantShiftSimd128::Src); + if (isThreeOpAllowed()) { + define(lir, ins); + } else { + // For non-AVX, it is always beneficial to reuse the input. + defineReuseInput(lir, ins, LWasmConstantShiftSimd128::Src); + } return; } default: @@ -1242,11 +1247,14 @@ void LIRGenerator::visitWasmShiftSimd128(MWasmShiftSimd128* ins) { # ifdef DEBUG js::wasm::ReportSimdAnalysis("shift -> constant shift"); # endif - // Almost always beneficial, and never detrimental, to reuse the input if - // possible. auto* lir = new (alloc()) LWasmConstantShiftSimd128(useRegisterAtStart(lhs), shiftCount); - defineReuseInput(lir, ins, LWasmConstantShiftSimd128::Src); + if (isThreeOpAllowed()) { + define(lir, ins); + } else { + // For non-AVX, it is always beneficial to reuse the input. 
+ defineReuseInput(lir, ins, LWasmConstantShiftSimd128::Src); + } return; } @@ -1449,8 +1457,11 @@ void LIRGenerator::visitWasmUnarySimd128(MWasmUnarySimd128* ins) { case wasm::SimdOp::I16x8Neg: case wasm::SimdOp::I32x4Neg: case wasm::SimdOp::I64x2Neg: + case wasm::SimdOp::I16x8ExtaddPairwiseI8x16S: // Prefer src != dest to avoid an unconditional src->temp move. - MOZ_ASSERT(!useAtStart && !reuseInput); + MOZ_ASSERT(!reuseInput); + // If AVX is enabled, we prefer useRegisterAtStart. + useAtStart = isThreeOpAllowed(); break; case wasm::SimdOp::F32x4Neg: case wasm::SimdOp::F64x2Neg: @@ -1465,7 +1476,6 @@ void LIRGenerator::visitWasmUnarySimd128(MWasmUnarySimd128* ins) { case wasm::SimdOp::I64x2Abs: case wasm::SimdOp::I32x4TruncSatF32x4S: case wasm::SimdOp::F32x4ConvertI32x4U: - case wasm::SimdOp::I16x8ExtaddPairwiseI8x16S: case wasm::SimdOp::I16x8ExtaddPairwiseI8x16U: case wasm::SimdOp::I32x4ExtaddPairwiseI16x8S: case wasm::SimdOp::I32x4ExtaddPairwiseI16x8U: @@ -1476,18 +1486,19 @@ void LIRGenerator::visitWasmUnarySimd128(MWasmUnarySimd128* ins) { case wasm::SimdOp::I64x2ExtendHighI32x4S: case wasm::SimdOp::I64x2ExtendHighI32x4U: // Prefer src == dest to avoid an unconditional src->dest move - // for better performance (e.g. non-PSHUFD use). + // for better performance in non-AVX mode (e.g. non-PSHUFD use). useAtStart = true; - reuseInput = true; + reuseInput = !isThreeOpAllowed(); break; case wasm::SimdOp::I32x4TruncSatF32x4U: case wasm::SimdOp::I32x4TruncSatF64x2SZero: case wasm::SimdOp::I32x4TruncSatF64x2UZero: case wasm::SimdOp::I8x16Popcnt: tempReg = tempSimd128(); - // Prefer src == dest to avoid an unconditional src->dest move. + // Prefer src == dest to avoid an unconditional src->dest move + // in non-AVX mode. useAtStart = true; - reuseInput = true; + reuseInput = !isThreeOpAllowed(); break; case wasm::SimdOp::I16x8ExtendLowI8x16S: case wasm::SimdOp::I16x8ExtendHighI8x16S: diff --git a/js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp b/js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp index 142785009e2d..a6ceb6925033 100644 --- a/js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp +++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp @@ -55,8 +55,8 @@ void MacroAssemblerX86Shared::splatX4(FloatRegister input, vbroadcastss(Operand(input), output); return; } - asMasm().moveSimd128Float(input.asSimd128(), output); - vshufps(0, output, output, output); + input = asMasm().moveSimd128FloatIfNotAVX(input.asSimd128(), output); + vshufps(0, input, input, output); } void MacroAssemblerX86Shared::splatX2(FloatRegister input, @@ -251,8 +251,9 @@ void MacroAssemblerX86Shared::compareInt8x16(FloatRegister lhs, Operand rhs, loadAlignedSimd128Int(rhs, scratch); } // src := src > lhs (i.e. lhs < rhs) - vpcmpgtb(Operand(lhs), scratch, scratch); - moveSimd128Int(scratch, output); + FloatRegister outputTemp = selectDestIfAVX(scratch, output); + vpcmpgtb(Operand(lhs), scratch, outputTemp); + moveSimd128Int(outputTemp, output); break; } case Assembler::Condition::NotEqual: @@ -351,8 +352,9 @@ void MacroAssemblerX86Shared::compareInt16x8(FloatRegister lhs, Operand rhs, loadAlignedSimd128Int(rhs, scratch); } // src := src > lhs (i.e. 
lhs < rhs) - vpcmpgtw(Operand(lhs), scratch, scratch); - moveSimd128Int(scratch, output); + FloatRegister outputTemp = selectDestIfAVX(scratch, output); + vpcmpgtw(Operand(lhs), scratch, outputTemp); + moveSimd128Int(outputTemp, output); break; } case Assembler::Condition::NotEqual: @@ -450,8 +452,9 @@ void MacroAssemblerX86Shared::compareInt32x4(FloatRegister lhs, Operand rhs, loadAlignedSimd128Int(rhs, scratch); } // src := src > lhs (i.e. lhs < rhs) - vpcmpgtd(Operand(lhs), scratch, scratch); - moveSimd128Int(scratch, output); + FloatRegister outputTemp = selectDestIfAVX(scratch, output); + vpcmpgtd(Operand(lhs), scratch, outputTemp); + moveSimd128Int(outputTemp, output); break; } case Assembler::Condition::NotEqual: @@ -583,8 +586,8 @@ void MacroAssemblerX86Shared::compareForOrderingInt64x2( vpsubq(Operand(lhs), temp1, temp1); vpcmpeqd(rhs, temp2, temp2); vandpd(temp2, temp1, temp1); - asMasm().moveSimd128(lhs, output); - vpcmpgtd(rhs, output, output); + lhs = asMasm().moveSimd128IntIfNotAVX(lhs, output); + vpcmpgtd(rhs, lhs, output); vpor(Operand(temp1), output, output); vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), output, output); break; @@ -593,8 +596,8 @@ void MacroAssemblerX86Shared::compareForOrderingInt64x2( vmovdqa(Operand(lhs), temp2); vpcmpgtd(Operand(lhs), temp1, temp1); vpcmpeqd(Operand(rhs), temp2, temp2); - asMasm().moveSimd128(lhs, output); - vpsubq(rhs, output, output); + lhs = asMasm().moveSimd128IntIfNotAVX(lhs, output); + vpsubq(rhs, lhs, output); vandpd(temp2, output, output); vpor(Operand(temp1), output, output); vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), output, output); @@ -604,8 +607,8 @@ void MacroAssemblerX86Shared::compareForOrderingInt64x2( vmovdqa(Operand(lhs), temp2); vpcmpgtd(Operand(lhs), temp1, temp1); vpcmpeqd(Operand(rhs), temp2, temp2); - asMasm().moveSimd128(lhs, output); - vpsubq(rhs, output, output); + lhs = asMasm().moveSimd128IntIfNotAVX(lhs, output); + vpsubq(rhs, lhs, output); vandpd(temp2, output, output); vpor(Operand(temp1), output, output); vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), output, output); @@ -617,8 +620,8 @@ void MacroAssemblerX86Shared::compareForOrderingInt64x2( vpsubq(Operand(lhs), temp1, temp1); vpcmpeqd(rhs, temp2, temp2); vandpd(temp2, temp1, temp1); - asMasm().moveSimd128(lhs, output); - vpcmpgtd(rhs, output, output); + lhs = asMasm().moveSimd128IntIfNotAVX(lhs, output); + vpcmpgtd(rhs, lhs, output); vpor(Operand(temp1), output, output); vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), output, output); asMasm().bitwiseXorSimd128(output, allOnes, output); @@ -967,17 +970,22 @@ void MacroAssemblerX86Shared::packedLeftShiftByScalarInt8x16( void MacroAssemblerX86Shared::packedLeftShiftByScalarInt8x16( Imm32 count, FloatRegister src, FloatRegister dest) { MOZ_ASSERT(count.value <= 7); - asMasm().moveSimd128(src, dest); + if (MOZ_UNLIKELY(count.value == 0)) { + moveSimd128Int(src, dest); + return; + } + src = asMasm().moveSimd128IntIfNotAVX(src, dest); // Use the doubling trick for low shift counts, otherwise mask off the bits // that are shifted out of the low byte of each word and use word shifts. The // optimal cutoff remains to be explored. 
if (count.value <= 3) { - for (int32_t shift = count.value; shift > 0; --shift) { - asMasm().addInt8x16(dest, dest); + vpaddb(Operand(src), src, dest); + for (int32_t shift = count.value - 1; shift > 0; --shift) { + vpaddb(Operand(dest), dest, dest); } } else { - asMasm().bitwiseAndSimd128( - dest, SimdConstant::SplatX16(0xFF >> count.value), dest); + asMasm().bitwiseAndSimd128(src, SimdConstant::SplatX16(0xFF >> count.value), + dest); vpsllw(count, dest, dest); } } @@ -1070,10 +1078,10 @@ void MacroAssemblerX86Shared::packedRightShiftByScalarInt64x2( FloatRegister in, Register count, FloatRegister temp, FloatRegister dest) { ScratchSimd128Scope scratch(asMasm()); vmovd(count, temp); - asMasm().moveSimd128(in, dest); asMasm().signReplicationInt64x2(in, scratch); + in = asMasm().moveSimd128FloatIfNotAVX(in, dest); // Invert if negative, shift all, invert back if negative. - vpxor(Operand(scratch), dest, dest); + vpxor(Operand(scratch), in, dest); vpsrlq(temp, dest, dest); vpxor(Operand(scratch), dest, dest); } @@ -1088,10 +1096,10 @@ void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt64x2( void MacroAssemblerX86Shared::packedRightShiftByScalarInt64x2( Imm32 count, FloatRegister src, FloatRegister dest) { ScratchSimd128Scope scratch(asMasm()); - asMasm().moveSimd128(src, dest); asMasm().signReplicationInt64x2(src, scratch); // Invert if negative, shift all, invert back if negative. - vpxor(Operand(scratch), dest, dest); + src = asMasm().moveSimd128FloatIfNotAVX(src, dest); + vpxor(Operand(scratch), src, dest); vpsrlq(Imm32(count.value & 63), dest, dest); vpxor(Operand(scratch), dest, dest); } @@ -1104,11 +1112,16 @@ void MacroAssemblerX86Shared::selectSimd128(FloatRegister mask, // Normally the codegen will attempt to enforce these register assignments so // that the moves are avoided. - asMasm().moveSimd128Int(onTrue, output); - asMasm().moveSimd128Int(mask, temp); + onTrue = asMasm().moveSimd128IntIfNotAVX(onTrue, output); + if (MOZ_UNLIKELY(mask == onTrue)) { + vpor(Operand(onFalse), onTrue, output); + return; + } - vpand(Operand(temp), output, output); - vpandn(Operand(onFalse), temp, temp); + mask = asMasm().moveSimd128IntIfNotAVX(mask, temp); + + vpand(Operand(mask), onTrue, output); + vpandn(Operand(onFalse), mask, temp); vpor(Operand(temp), output, output); } @@ -1131,7 +1144,6 @@ void MacroAssemblerX86Shared::unsignedConvertInt32x4ToFloat32x4( void MacroAssemblerX86Shared::truncSatFloat32x4ToInt32x4(FloatRegister src, FloatRegister dest) { ScratchSimd128Scope scratch(asMasm()); - asMasm().moveSimd128Float(src, dest); // The cvttps2dq instruction is the workhorse but does not handle NaN or out // of range values as we need it to. We want to saturate too-large positive @@ -1139,9 +1151,10 @@ void MacroAssemblerX86Shared::truncSatFloat32x4ToInt32x4(FloatRegister src, // become 0. // Convert NaN to 0 by masking away values that compare unordered to itself. - vmovaps(dest, scratch); + vmovaps(src, scratch); vcmpeqps(Operand(scratch), scratch, scratch); - vpand(Operand(scratch), dest, dest); + src = asMasm().moveSimd128FloatIfNotAVX(src, dest); + vpand(Operand(scratch), src, dest); // Compute the complement of each non-NaN lane's sign bit, we'll need this to // correct the result of cvttps2dq. All other output bits are garbage. 
@@ -1165,7 +1178,7 @@ void MacroAssemblerX86Shared::truncSatFloat32x4ToInt32x4(FloatRegister src, void MacroAssemblerX86Shared::unsignedTruncSatFloat32x4ToInt32x4( FloatRegister src, FloatRegister temp, FloatRegister dest) { ScratchSimd128Scope scratch(asMasm()); - asMasm().moveSimd128Float(src, dest); + src = asMasm().moveSimd128FloatIfNotAVX(src, dest); // The cvttps2dq instruction is the workhorse but does not handle NaN or out // of range values as we need it to. We want to saturate too-large positive @@ -1173,7 +1186,7 @@ void MacroAssemblerX86Shared::unsignedTruncSatFloat32x4ToInt32x4( // Convert NaN and negative values to zeroes in dest. vpxor(Operand(scratch), scratch, scratch); - vmaxps(Operand(scratch), dest, dest); + vmaxps(Operand(scratch), src, dest); // Place the largest positive signed integer in all lanes in scratch. // We use it to bias the conversion to handle edge cases. @@ -1217,14 +1230,14 @@ void MacroAssemblerX86Shared::unsignedTruncSatFloat32x4ToInt32x4( void MacroAssemblerX86Shared::unsignedTruncSatFloat32x4ToInt32x4Relaxed( FloatRegister src, FloatRegister dest) { ScratchSimd128Scope scratch(asMasm()); - asMasm().moveSimd128Float(src, dest); + src = asMasm().moveSimd128FloatIfNotAVX(src, dest); // Place lanes below 80000000h into dest, otherwise into scratch. // Keep dest or scratch 0 as default. asMasm().loadConstantSimd128Float(SimdConstant::SplatX4(0x4f000000), scratch); vcmpltps(Operand(src), scratch, scratch); vpand(Operand(src), scratch, scratch); - vpxor(Operand(scratch), dest, dest); + vpxor(Operand(scratch), src, dest); // Convert lanes below 80000000h into unsigned int without issues. vcvttps2dq(dest, dest); @@ -1267,10 +1280,10 @@ void MacroAssemblerX86Shared::truncSatFloat64x2ToInt32x4(FloatRegister src, void MacroAssemblerX86Shared::unsignedTruncSatFloat64x2ToInt32x4( FloatRegister src, FloatRegister temp, FloatRegister dest) { ScratchSimd128Scope scratch(asMasm()); - asMasm().moveSimd128Float(src, dest); + src = asMasm().moveSimd128FloatIfNotAVX(src, dest); vxorpd(scratch, scratch, scratch); - vmaxpd(Operand(scratch), dest, dest); + vmaxpd(Operand(scratch), src, dest); asMasm().loadConstantSimd128Float(SimdConstant::SplatX2(4294967295.0), temp); vminpd(Operand(temp), dest, dest); @@ -1284,11 +1297,10 @@ void MacroAssemblerX86Shared::unsignedTruncSatFloat64x2ToInt32x4( void MacroAssemblerX86Shared::unsignedTruncSatFloat64x2ToInt32x4Relaxed( FloatRegister src, FloatRegister dest) { ScratchSimd128Scope scratch(asMasm()); - asMasm().moveSimd128Float(src, dest); // The same as unsignedConvertInt32x4ToFloat64x2, but without NaN // and out-of-bounds checks. 
- vroundpd(SSERoundingMode::Trunc, Operand(dest), dest); + vroundpd(SSERoundingMode::Trunc, Operand(src), dest); asMasm().loadConstantSimd128Float(SimdConstant::SplatX2(4503599627370496.0), scratch); vaddpd(Operand(scratch), dest, dest); @@ -1299,9 +1311,9 @@ void MacroAssemblerX86Shared::popcntInt8x16(FloatRegister src, FloatRegister temp, FloatRegister output) { ScratchSimd128Scope scratch(asMasm()); - asMasm().loadConstantSimd128Float(SimdConstant::SplatX16(0x0f), scratch); - asMasm().moveSimd128Int(src, temp); - vpand(scratch, temp, temp); + asMasm().loadConstantSimd128Int(SimdConstant::SplatX16(0x0f), scratch); + FloatRegister srcForTemp = asMasm().moveSimd128IntIfNotAVX(src, temp); + vpand(scratch, srcForTemp, temp); vpandn(src, scratch, scratch); int8_t counts[] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4}; asMasm().loadConstantSimd128(SimdConstant::CreateX16(counts), output); diff --git a/js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h b/js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h index 8a0b22bda761..1896ded29007 100644 --- a/js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h +++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h @@ -1452,14 +1452,14 @@ void MacroAssembler::concatAndRightShiftSimd128(FloatRegister lhs, void MacroAssembler::leftShiftSimd128(Imm32 count, FloatRegister src, FloatRegister dest) { - moveSimd128(src, dest); - vpslldq(count, dest, dest); + src = moveSimd128IntIfNotAVX(src, dest); + vpslldq(count, src, dest); } void MacroAssembler::rightShiftSimd128(Imm32 count, FloatRegister src, FloatRegister dest) { - moveSimd128(src, dest); - vpsrldq(count, dest, dest); + src = moveSimd128IntIfNotAVX(src, dest); + vpsrldq(count, src, dest); } // Reverse bytes in lanes. @@ -1467,10 +1467,10 @@ void MacroAssembler::rightShiftSimd128(Imm32 count, FloatRegister src, void MacroAssembler::reverseInt16x8(FloatRegister src, FloatRegister dest) { // Byteswap is MOV + PSLLW + PSRLW + POR, a small win over PSHUFB. ScratchSimd128Scope scratch(*this); - moveSimd128(src, dest); - moveSimd128(src, scratch); - vpsllw(Imm32(8), dest, dest); - vpsrlw(Imm32(8), scratch, scratch); + FloatRegister srcForScratch = moveSimd128IntIfNotAVX(src, scratch); + vpsrlw(Imm32(8), srcForScratch, scratch); + src = moveSimd128IntIfNotAVX(src, dest); + vpsllw(Imm32(8), src, dest); vpor(scratch, dest, dest); } @@ -1556,8 +1556,8 @@ void MacroAssembler::bitmaskInt16x8(FloatRegister src, Register dest) { // input and shifting rather than masking at the end, but creates a false // dependency on the old value of scratch. The better fix is to allow src to // be clobbered. 
- moveSimd128(src, scratch);
- vpacksswb(Operand(scratch), scratch, scratch);
+ src = moveSimd128IntIfNotAVX(src, scratch);
+ vpacksswb(Operand(src), src, scratch);
 vpmovmskb(scratch, dest);
 andl(Imm32(0xFF), dest);
 }
@@ -1713,20 +1713,22 @@ void MacroAssembler::mulInt64x2(FloatRegister lhs, FloatRegister rhs,
 // lhs = <D C> <B A>
 // rhs = <H G> <F E>
 // result = <(DG+CH)_low+CG_high CG_low> <(BE+AF)_low+AE_high AE_low>
- moveSimd128(lhs, temp); // temp = <D C> <B A>
- vpsrlq(Imm32(32), temp, temp); // temp = <0 D> <0 B>
- vpmuludq(rhs, temp, temp); // temp = <DG> <BE>
- moveSimd128(rhs, temp2); // temp2 = <H G> <F E>
- vpsrlq(Imm32(32), temp2, temp2); // temp2 = <0 H> <0 F>
- vpmuludq(lhs, temp2, temp2); // temp2 = <CH> <AF>
- vpaddq(Operand(temp), temp2, temp2); // temp2 = <DG+CH> <BE+AF>
- vpsllq(Imm32(32), temp2, temp2); // temp2 = <(DG+CH)_low 0>
- // <(BE+AF)_low 0>
- vpmuludq(rhs, dest, dest); // dest = <CG_high CG_low>
- // <AE_high AE_low>
- vpaddq(Operand(temp2), dest, dest); // dest =
- // <(DG+CH)_low+CG_high CG_low>
- // <(BE+AF)_low+AE_high AE_low>
+ FloatRegister lhsForTemp =
+ moveSimd128IntIfNotAVX(lhs, temp); // temp = <D C> <B A>
+ vpsrlq(Imm32(32), lhsForTemp, temp); // temp = <0 D> <0 B>
+ vpmuludq(rhs, temp, temp); // temp = <DG> <BE>
+ FloatRegister rhsForTemp =
+ moveSimd128IntIfNotAVX(rhs, temp2); // temp2 = <H G> <F E>
+ vpsrlq(Imm32(32), rhsForTemp, temp2); // temp2 = <0 H> <0 F>
+ vpmuludq(lhs, temp2, temp2); // temp2 = <CH> <AF>
+ vpaddq(Operand(temp), temp2, temp2); // temp2 = <DG+CH> <BE+AF>
+ vpsllq(Imm32(32), temp2, temp2); // temp2 = <(DG+CH)_low 0>
+ // <(BE+AF)_low 0>
+ vpmuludq(rhs, dest, dest); // dest = <CG_high CG_low>
+ // <AE_high AE_low>
+ vpaddq(Operand(temp2), dest, dest); // dest =
+ // <(DG+CH)_low+CG_high CG_low>
+ // <(BE+AF)_low+AE_high AE_low>
 }

 // Code generation from the PR: https://github.com/WebAssembly/simd/pull/376.
@@ -2141,9 +2143,9 @@ void MacroAssembler::absInt32x4(FloatRegister src, FloatRegister dest) {

 void MacroAssembler::absInt64x2(FloatRegister src, FloatRegister dest) {
 ScratchSimd128Scope scratch(*this);
- moveSimd128(src, dest);
 signReplicationInt64x2(src, scratch);
- vpxor(Operand(scratch), dest, dest);
+ src = moveSimd128IntIfNotAVX(src, dest);
+ vpxor(Operand(scratch), src, dest);
 vpsubq(Operand(scratch), dest, dest);
 }
@@ -2167,7 +2169,7 @@ void MacroAssembler::leftShiftInt16x8(Register rhs, FloatRegister lhsDest) {
 void MacroAssembler::leftShiftInt16x8(Imm32 count, FloatRegister src,
 FloatRegister dest) {
- moveSimd128(src, dest);
+ src = moveSimd128IntIfNotAVX(src, dest);
 vpsllw(count, src, dest);
 }
@@ -2178,7 +2180,7 @@ void MacroAssembler::leftShiftInt32x4(Register rhs, FloatRegister lhsDest) {
 void MacroAssembler::leftShiftInt32x4(Imm32 count, FloatRegister src,
 FloatRegister dest) {
- moveSimd128(src, dest);
+ src = moveSimd128IntIfNotAVX(src, dest);
 vpslld(count, src, dest);
 }
@@ -2189,7 +2191,7 @@ void MacroAssembler::leftShiftInt64x2(Register rhs, FloatRegister lhsDest) {
 void MacroAssembler::leftShiftInt64x2(Imm32 count, FloatRegister src,
 FloatRegister dest) {
- moveSimd128(src, dest);
+ src = moveSimd128IntIfNotAVX(src, dest);
 vpsllq(count, src, dest);
 }
@@ -2226,7 +2228,7 @@ void MacroAssembler::rightShiftInt16x8(Register rhs, FloatRegister lhsDest) {
 void MacroAssembler::rightShiftInt16x8(Imm32 count, FloatRegister src,
 FloatRegister dest) {
- moveSimd128(src, dest);
+ src = moveSimd128IntIfNotAVX(src, dest);
 vpsraw(count, src, dest);
 }
@@ -2238,7 +2240,7 @@ void MacroAssembler::unsignedRightShiftInt16x8(Register rhs,
 void MacroAssembler::unsignedRightShiftInt16x8(Imm32 count, FloatRegister src,
 FloatRegister dest) {
- moveSimd128(src, dest);
+ src = moveSimd128IntIfNotAVX(src, dest);
 vpsrlw(count, src, dest);
} @@ -2249,7 +2251,7 @@ void MacroAssembler::rightShiftInt32x4(Register rhs, FloatRegister lhsDest) { void MacroAssembler::rightShiftInt32x4(Imm32 count, FloatRegister src, FloatRegister dest) { - moveSimd128(src, dest); + src = moveSimd128IntIfNotAVX(src, dest); vpsrad(count, src, dest); } @@ -2261,7 +2263,7 @@ void MacroAssembler::unsignedRightShiftInt32x4(Register rhs, void MacroAssembler::unsignedRightShiftInt32x4(Imm32 count, FloatRegister src, FloatRegister dest) { - moveSimd128(src, dest); + src = moveSimd128IntIfNotAVX(src, dest); vpsrld(count, src, dest); } @@ -2284,7 +2286,7 @@ void MacroAssembler::unsignedRightShiftInt64x2(Register rhs, void MacroAssembler::unsignedRightShiftInt64x2(Imm32 count, FloatRegister src, FloatRegister dest) { - moveSimd128(src, dest); + src = moveSimd128IntIfNotAVX(src, dest); vpsrlq(count, src, dest); } @@ -2299,14 +2301,14 @@ void MacroAssembler::signReplicationInt8x16(FloatRegister src, void MacroAssembler::signReplicationInt16x8(FloatRegister src, FloatRegister dest) { - moveSimd128(src, dest); - vpsraw(Imm32(15), dest, dest); + src = moveSimd128IntIfNotAVX(src, dest); + vpsraw(Imm32(15), src, dest); } void MacroAssembler::signReplicationInt32x4(FloatRegister src, FloatRegister dest) { - moveSimd128(src, dest); - vpsrad(Imm32(31), dest, dest); + src = moveSimd128IntIfNotAVX(src, dest); + vpsrad(Imm32(31), src, dest); } void MacroAssembler::signReplicationInt64x2(FloatRegister src, @@ -2810,30 +2812,22 @@ void MacroAssembler::extAddPairwiseInt8x16(FloatRegister src, void MacroAssembler::unsignedExtAddPairwiseInt8x16(FloatRegister src, FloatRegister dest) { - ScratchSimd128Scope scratch(*this); - moveSimd128(src, dest); - loadConstantSimd128Int(SimdConstant::SplatX16(1), scratch); - vpmaddubsw(scratch, dest, dest); + src = moveSimd128IntIfNotAVX(src, dest); + vpmaddubswSimd128(SimdConstant::SplatX16(1), src, dest); } void MacroAssembler::extAddPairwiseInt16x8(FloatRegister src, FloatRegister dest) { - ScratchSimd128Scope scratch(*this); - moveSimd128(src, dest); - loadConstantSimd128Int(SimdConstant::SplatX8(1), scratch); - vpmaddwd(Operand(scratch), dest, dest); + src = moveSimd128IntIfNotAVX(src, dest); + vpmaddwdSimd128(SimdConstant::SplatX8(1), src, dest); } void MacroAssembler::unsignedExtAddPairwiseInt16x8(FloatRegister src, FloatRegister dest) { - ScratchSimd128Scope scratch(*this); - moveSimd128(src, dest); - loadConstantSimd128Int(SimdConstant::SplatX8(0x8000), scratch); - vpxor(scratch, dest, dest); - loadConstantSimd128Int(SimdConstant::SplatX8(1), scratch); - vpmaddwd(Operand(scratch), dest, dest); - loadConstantSimd128Int(SimdConstant::SplatX4(0x00010000), scratch); - vpaddd(Operand(scratch), dest, dest); + src = moveSimd128IntIfNotAVX(src, dest); + vpxorSimd128(SimdConstant::SplatX8(-0x8000), src, dest); + vpmaddwdSimd128(SimdConstant::SplatX8(1), dest, dest); + vpadddSimd128(SimdConstant::SplatX4(0x00010000), dest, dest); } // Floating square root @@ -3023,8 +3017,8 @@ void MacroAssembler::unsignedWidenLowInt32x4(FloatRegister src, } void MacroAssembler::widenHighInt32x4(FloatRegister src, FloatRegister dest) { - if (src == dest) { - vmovhlps(dest, dest, dest); + if (src == dest || HasAVX()) { + vmovhlps(src, src, dest); } else { vpshufd(ComputeShuffleMask(2, 3, 2, 3), src, dest); } @@ -3033,11 +3027,10 @@ void MacroAssembler::widenHighInt32x4(FloatRegister src, FloatRegister dest) { void MacroAssembler::unsignedWidenHighInt32x4(FloatRegister src, FloatRegister dest) { - moveSimd128(src, dest); - ScratchSimd128Scope scratch(*this); + 
src = moveSimd128IntIfNotAVX(src, dest); vpxor(scratch, scratch, scratch); - vpunpckhdq(scratch, dest, dest); + vpunpckhdq(scratch, src, dest); } // Floating multiply-accumulate: srcDest [+-]= src1 * src2 diff --git a/js/src/jit/x86-shared/MacroAssembler-x86-shared.h b/js/src/jit/x86-shared/MacroAssembler-x86-shared.h index c39c48510d53..4364151dc81a 100644 --- a/js/src/jit/x86-shared/MacroAssembler-x86-shared.h +++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared.h @@ -587,6 +587,10 @@ class MacroAssemblerX86Shared : public Assembler { moveSimd128Int(src, dest); return dest; } + FloatRegister selectDestIfAVX(FloatRegister src, FloatRegister dest) { + MOZ_ASSERT(src.isSimd128() && dest.isSimd128()); + return HasAVX() ? dest : src; + } void loadUnalignedSimd128Int(const Address& src, FloatRegister dest) { vmovdqu(Operand(src), dest); } diff --git a/js/src/jit/x86/MacroAssembler-x86.cpp b/js/src/jit/x86/MacroAssembler-x86.cpp index 4095bcc3dc7a..2baafdddf0fe 100644 --- a/js/src/jit/x86/MacroAssembler-x86.cpp +++ b/js/src/jit/x86/MacroAssembler-x86.cpp @@ -421,6 +421,12 @@ void MacroAssemblerX86::vcmplepdSimd128(const SimdConstant& v, vpPatchOpSimd128(v, lhs, dest, &X86Encoding::BaseAssemblerX86::vcmplepd_mr); } +void MacroAssemblerX86::vpmaddubswSimd128(const SimdConstant& v, + FloatRegister lhs, + FloatRegister dest) { + vpPatchOpSimd128(v, lhs, dest, &X86Encoding::BaseAssemblerX86::vpmaddubsw_mr); +} + void MacroAssemblerX86::finish() { // Last instruction may be an indirect jump so eagerly insert an undefined // instruction byte to prevent processors from decoding data values into diff --git a/js/src/jit/x86/MacroAssembler-x86.h b/js/src/jit/x86/MacroAssembler-x86.h index c5dd10665e86..74551a2c7f0e 100644 --- a/js/src/jit/x86/MacroAssembler-x86.h +++ b/js/src/jit/x86/MacroAssembler-x86.h @@ -1063,6 +1063,8 @@ class MacroAssemblerX86 : public MacroAssemblerX86Shared { FloatRegister dest); void vcmplepdSimd128(const SimdConstant& v, FloatRegister lhs, FloatRegister dest); + void vpmaddubswSimd128(const SimdConstant& v, FloatRegister lhs, + FloatRegister dest); Condition testInt32Truthy(bool truthy, const ValueOperand& operand) { test32(operand.payloadReg(), operand.payloadReg());
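
Note on the core pattern of this patch: the moveSimd128XXXIfNotAVX helpers
let one code path serve both instruction sets. With AVX they return `src`
unchanged, so the following three-operand VEX instruction reads `src` and
writes `dest` without a copy; without AVX they emit the move that the
destructive two-operand SSE encoding requires and return `dest`. A minimal
sketch of the helper's shape, following the context lines visible in the
MacroAssembler-x86-shared.h hunk above (illustrative, not a verbatim copy of
the tree):

  FloatRegister moveSimd128IntIfNotAVX(FloatRegister src, FloatRegister dest) {
    if (HasAVX()) {
      return src;               // AVX: e.g. vpsllw $imm, src, dest
    }
    moveSimd128Int(src, dest);  // SSE: movdqa src, dest; then psllw $imm, dest
    return dest;
  }

Typical call site, as in the leftShiftInt16x8 hunk above:

  src = moveSimd128IntIfNotAVX(src, dest);
  vpsllw(count, src, dest);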
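
Note on the "shift by imm8 VEX encoding" fix: in the VEX form of the
xmm-by-immediate shift group (opcodes 0F 71/72/73), ModRM.reg carries the
opcode extension, ModRM.rm carries the register being shifted, and the
destination lives in VEX.vvvv — the reverse of the usual reg/vvvv roles,
which is what swapping the twoByteOpVex arguments corrects. One expected
encoding from the AVX test above, decoded as a worked example:

  c5 f9 71 f1 01     vpsllw $0x01, %xmm1, %xmm0
  c5                 two-byte VEX prefix
  f9 = 1111 1001     vvvv = ~1111 = 0000 -> destination %xmm0; L=0; pp=01 (66)
  71                 packed-word shift-by-immediate opcode group
  f1 = 11 110 001    mod=11, reg=/6 (psllw), rm=001 -> source %xmm1
  01                 imm8 shift count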
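
Note on the extadd_pairwise lowerings exercised by the new test: pmaddubsw
multiplies the unsigned bytes of its first operand by the signed bytes of its
second and sums adjacent product pairs into i16 lanes, so multiplying by a
splat of 1 turns it into a pairwise add. The I16x8ExtaddPairwiseI8x16S fix
keeps the constant in the unsigned slot and the input in the signed slot,
which is why even the AVX expectation loads the constant into the output
register first: a RIP-relative constant can only encode as the second
operand. A scalar model of the lowerings, as a sketch:

  // i16x8.extadd_pairwise_i8x16_s: ones are the unsigned operand.
  //   dst.i16[i] = 1 * src.s8[2i] + 1 * src.s8[2i+1]
  // i16x8.extadd_pairwise_i8x16_u: src is the unsigned operand.
  //   dst.i16[i] = src.u8[2i] * 1 + src.u8[2i+1] * 1
  // i32x4.extadd_pairwise_i16x8_u: there is no unsigned pmaddwd, so bias
  // each u16 lane into signed range, add pairwise, then undo both biases:
  //   t   = src ^ 0x8000 per lane     (as i16: x - 0x8000)
  //   t   = pmaddwd(t, splat_i16(1))  (pairwise i16 add into i32 lanes)
  //   dst = t + 0x00010000 per i32    (restores 2 * 0x8000)
  // i32x4.extadd_pairwise_i16x8_s needs no trick: pmaddwd by a splat of 1
  // adds signed pairs directly.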