Bug 1753115 - Refactor usage of moveSimd128. r=jseward

* Replace with moveSimd128XXXIfNotAVX when possible (see the sketch below)
* Fix I16x8ExtaddPairwiseI8x16S lowering; relax lowering for AVX
* Add postMoveSimd128IntIfNotAVX utility
* Add extadd_pairwise tests
* Fix neg-abs-not codegen tests
* Fix shift by imm8 VEX encoding

Differential Revision: https://phabricator.services.mozilla.com/D137581
Yury Delendik 2022-02-11 13:27:12 +00:00
Parent e661e480a4
Commit bb2a780b8a
14 changed files with 227 additions and 117 deletions
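
The helpers this patch switches to share one pattern: skip the explicit register-to-register move whenever AVX's non-destructive three-operand (VEX) encodings can write the destination directly. A minimal sketch of that pattern, not the verbatim implementation, assuming a body consistent with the selectDestIfAVX helper and the "moveSimd128Int(src, dest); return dest;" tail visible in the hunks below:

  // Sketch only: with AVX, an instruction can read src and write dest in one
  // three-operand form, so no pre-move is needed and src itself is returned;
  // without AVX, src is copied into dest first and dest is used as the
  // (destructive) operand.
  FloatRegister moveSimd128IntIfNotAVX(FloatRegister src, FloatRegister dest) {
    if (HasAVX()) {
      return src;
    }
    moveSimd128Int(src, dest);
    return dest;
  }

  // Typical call site, as seen throughout the diff:
  //   src = moveSimd128IntIfNotAVX(src, dest);
  //   vpsraw(Imm32(15), src, dest);  // one move saved when AVX is available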

View file

@ -433,3 +433,30 @@ codegenTestX64_v128xLITERAL_v128_avxhack(
`c5 f1 eb 05 ${RIPRADDR} vporx ${RIPR}, %xmm1, %xmm0`],
['v128.xor', '(v128.const i8x16 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2)',
`c5 f1 ef 05 ${RIPRADDR} vpxorx ${RIPR}, %xmm1, %xmm0`]]);
// Shift by constant encodings
codegenTestX64_v128xLITERAL_v128_avxhack(
[['i8x16.shl', '(i32.const 2)', `
c5 f1 fc c1 vpaddb %xmm1, %xmm1, %xmm0
66 0f fc c0 paddb %xmm0, %xmm0`],
['i8x16.shl', '(i32.const 4)', `
c5 f1 db 05 ${RIPRADDR} vpandx ${RIPR}, %xmm1, %xmm0
66 0f 71 f0 04 psllw \\$0x04, %xmm0`],
['i16x8.shl', '(i32.const 1)',
'c5 f9 71 f1 01 vpsllw \\$0x01, %xmm1, %xmm0'],
['i16x8.shr_s', '(i32.const 3)',
'c5 f9 71 e1 03 vpsraw \\$0x03, %xmm1, %xmm0'],
['i16x8.shr_u', '(i32.const 2)',
'c5 f9 71 d1 02 vpsrlw \\$0x02, %xmm1, %xmm0'],
['i32x4.shl', '(i32.const 5)',
'c5 f9 72 f1 05 vpslld \\$0x05, %xmm1, %xmm0'],
['i32x4.shr_s', '(i32.const 2)',
'c5 f9 72 e1 02 vpsrad \\$0x02, %xmm1, %xmm0'],
['i32x4.shr_u', '(i32.const 5)',
'c5 f9 72 d1 05 vpsrld \\$0x05, %xmm1, %xmm0'],
['i64x2.shr_s', '(i32.const 7)', `
c5 79 70 f9 f5 vpshufd \\$0xF5, %xmm1, %xmm15
66 41 0f 72 e7 1f psrad \\$0x1F, %xmm15
c4 c1 71 ef c7 vpxor %xmm15, %xmm1, %xmm0
66 0f 73 d0 07 psrlq \\$0x07, %xmm0
66 41 0f ef c7 pxor %xmm15, %xmm0`]]);
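// Informal note on the i64x2.shr_s expansion above: x86 has no 64-bit
// arithmetic vector shift, so the sign is replicated first and the shift is
// done logically.
//   s = each lane's sign smeared over all 64 bits
//       (vpshufd 0xF5 copies the lane's high dword into both dword slots,
//        psrad 31 then smears that dword's sign bit)
//   result = ((x ^ s) >> n) ^ s, with >> a logical shift.
// For s == 0 this is a plain logical shift; for s == all-ones it computes
// ~((~x) >> n), which equals the arithmetic shift of a negative x.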

View file

@ -1,4 +1,4 @@
// |jit-test| skip-if: !wasmSimdEnabled() || !hasDisassembler() || wasmCompileMode() || getBuildConfiguration().simulator; include:codegen-x64-test.js
// |jit-test| skip-if: !wasmSimdEnabled() || !hasDisassembler() || wasmCompileMode() != "ion" || !getBuildConfiguration().x64 || getBuildConfiguration().simulator; include:codegen-x64-test.js
// Test that there are no extraneous moves for variable SIMD negate, abs, and
// not instructions. See README-codegen.md for general information about this

View file

@ -0,0 +1,38 @@
// |jit-test| skip-if: !wasmSimdEnabled() || !hasDisassembler() || wasmCompileMode() != "ion" || !getBuildConfiguration().x64 || getBuildConfiguration().simulator; include:codegen-x64-test.js
// Tests for SIMD add pairwise instructions.
if (!isAvxPresent()) {
codegenTestX64_IGNOREDxv128_v128(
[['i16x8.extadd_pairwise_i8x16_s', `
66 0f 6f 05 ${RIPRADDR} movdqax ${RIPR}, %xmm0
66 0f 38 04 c1 pmaddubsw %xmm1, %xmm0`],
['i16x8.extadd_pairwise_i8x16_u', `
66 0f 6f c1 movdqa %xmm1, %xmm0
66 0f 38 04 05 ${RIPRADDR} pmaddubswx ${RIPR}, %xmm0`],
['i32x4.extadd_pairwise_i16x8_s', `
66 0f 6f c1 movdqa %xmm1, %xmm0
66 0f f5 05 ${RIPRADDR} pmaddwdx ${RIPR}, %xmm0`],
['i32x4.extadd_pairwise_i16x8_u', `
66 0f 6f c1 movdqa %xmm1, %xmm0
66 0f ef 05 ${RIPRADDR} pxorx ${RIPR}, %xmm0
66 0f f5 05 ${RIPRADDR} pmaddwdx ${RIPR}, %xmm0
66 0f fe 05 ${RIPRADDR} padddx ${RIPR}, %xmm0`]]);
} else {
codegenTestX64_IGNOREDxv128_v128(
[['i16x8.extadd_pairwise_i8x16_s', `
66 0f 6f 05 ${RIPRADDR} movdqax ${RIPR}, %xmm0
66 0f 38 04 c1 pmaddubsw %xmm1, %xmm0`],
['i16x8.extadd_pairwise_i8x16_u', `
c4 e2 71 04 05 ${RIPRADDR} vpmaddubswx ${RIPR}, %xmm1, %xmm0`],
['i32x4.extadd_pairwise_i16x8_s', `
c5 f1 f5 05 ${RIPRADDR} vpmaddwdx ${RIPR}, %xmm1, %xmm0`],
['i32x4.extadd_pairwise_i16x8_u', `
c5 f1 ef 05 ${RIPRADDR} vpxorx ${RIPR}, %xmm1, %xmm0
66 0f f5 05 ${RIPRADDR} pmaddwdx ${RIPR}, %xmm0
66 0f fe 05 ${RIPRADDR} padddx ${RIPR}, %xmm0`]]);
}

View file

@ -1118,6 +1118,10 @@ class BaseAssemblerX64 : public BaseAssembler {
return twoByteRipOpImmSimd("vcmppd", VEX_PD, OP2_CMPPD_VpdWpd,
X86Encoding::ConditionCmp_LE, src, dst);
}
[[nodiscard]] JmpSrc vpmaddubsw_ripr(XMMRegisterID src, XMMRegisterID dst) {
return threeByteRipOpSimd("vpmaddubsw", VEX_PD, OP3_PMADDUBSW_VdqWdq,
ESCAPE_38, src, dst);
}
// BMI instructions:

View file

@ -408,6 +408,13 @@ void MacroAssemblerX64::vcmplepdSimd128(const SimdConstant& v,
vpRiprOpSimd128(v, lhs, dest, &X86Encoding::BaseAssemblerX64::vcmplepd_ripr);
}
void MacroAssemblerX64::vpmaddubswSimd128(const SimdConstant& v,
FloatRegister lhs,
FloatRegister dest) {
vpRiprOpSimd128(v, lhs, dest,
&X86Encoding::BaseAssemblerX64::vpmaddubsw_ripr);
}
void MacroAssemblerX64::bindOffsets(
const MacroAssemblerX86Shared::UsesVector& uses) {
for (JmpSrc src : uses) {

View file

@ -1100,6 +1100,8 @@ class MacroAssemblerX64 : public MacroAssemblerX86Shared {
FloatRegister dest);
void vcmplepdSimd128(const SimdConstant& v, FloatRegister lhs,
FloatRegister dest);
void vpmaddubswSimd128(const SimdConstant& v, FloatRegister lhs,
FloatRegister dest);
void loadWasmPinnedRegsFromTls() {
loadPtr(Address(WasmTlsReg, offsetof(wasm::TlsData, memoryBase)), HeapReg);

View file

@ -578,6 +578,11 @@ class BaseAssembler : public GenericAssembler {
threeByteOpSimd("vpmaddubsw", VEX_PD, OP3_PMADDUBSW_VdqWdq, ESCAPE_38, src1,
src0, dst);
}
void vpmaddubsw_mr(const void* address, XMMRegisterID src0,
XMMRegisterID dst) {
threeByteOpSimd("vpmaddubsw", VEX_PD, OP3_PMADDUBSW_VdqWdq, ESCAPE_38,
address, src0, dst);
}
void vpaddb_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst) {
twoByteOpSimd("vpaddb", VEX_PD, OP2_PADDB_VdqWdq, src1, src0, dst);
@ -5328,7 +5333,8 @@ class BaseAssembler : public GenericAssembler {
spew("%-11s$%d, %s, %s", name, int32_t(imm), XMMRegName(src),
XMMRegName(dst));
m_formatter.twoByteOpVex(VEX_PD, opcode, (RegisterID)dst, src,
// For shift instructions, destination is stored in vvvv field.
m_formatter.twoByteOpVex(VEX_PD, opcode, (RegisterID)src, dst,
(int)shiftKind);
m_formatter.immediate8u(imm);
}
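// As a cross-check of the operand-order fix, a rough decode of one of the
// immediate-shift encodings expected by the tests earlier in this commit:
//   c5 f9 71 f1 01      vpsllw $0x01, %xmm1, %xmm0
//   c5                  two-byte VEX prefix
//   f9                  inverted vvvv = 1111 -> xmm0 (the destination),
//                       L = 0 (128-bit), pp = 01 (0x66 operand-size prefix)
//   71 /6               group-12 shift opcode; ModRM.reg carries /6 = PSLLW
//   f1                  ModRM 11 110 001 -> rm = xmm1 (the source)
//   01                  imm8 shift count
// So for these shift forms the destination travels in vvvv and the source in
// ModRM.rm, which matches the swap in the call above.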

View file

@ -2984,9 +2984,7 @@ void CodeGenerator::visitWasmConstantShiftSimd128(
int32_t shift = ins->shift();
if (shift == 0) {
if (src != dest) {
masm.moveSimd128(src, dest);
}
masm.moveSimd128(src, dest);
return;
}

View file

@ -1231,7 +1231,12 @@ void LIRGenerator::visitWasmShiftSimd128(MWasmShiftSimd128* ins) {
case wasm::SimdOp::I64x2ShrS: {
auto* lir = new (alloc())
LWasmSignReplicationSimd128(useRegisterAtStart(lhs));
defineReuseInput(lir, ins, LWasmConstantShiftSimd128::Src);
if (isThreeOpAllowed()) {
define(lir, ins);
} else {
// For non-AVX, it is always beneficial to reuse the input.
defineReuseInput(lir, ins, LWasmConstantShiftSimd128::Src);
}
return;
}
default:
@ -1242,11 +1247,14 @@ void LIRGenerator::visitWasmShiftSimd128(MWasmShiftSimd128* ins) {
# ifdef DEBUG
js::wasm::ReportSimdAnalysis("shift -> constant shift");
# endif
// Almost always beneficial, and never detrimental, to reuse the input if
// possible.
auto* lir = new (alloc())
LWasmConstantShiftSimd128(useRegisterAtStart(lhs), shiftCount);
defineReuseInput(lir, ins, LWasmConstantShiftSimd128::Src);
if (isThreeOpAllowed()) {
define(lir, ins);
} else {
// For non-AVX, it is always beneficial to reuse the input.
defineReuseInput(lir, ins, LWasmConstantShiftSimd128::Src);
}
return;
}
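// Rough intuition for the two lowering paths: defineReuseInput ties the LIR
// output to the same register as the input operand, which suits destructive
// two-operand SSE encodings, while define() lets the allocator pick any output
// register, which the non-destructive VEX three-operand forms can write
// directly (isThreeOpAllowed() reports whether those forms are available).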
@ -1449,8 +1457,11 @@ void LIRGenerator::visitWasmUnarySimd128(MWasmUnarySimd128* ins) {
case wasm::SimdOp::I16x8Neg:
case wasm::SimdOp::I32x4Neg:
case wasm::SimdOp::I64x2Neg:
case wasm::SimdOp::I16x8ExtaddPairwiseI8x16S:
// Prefer src != dest to avoid an unconditional src->temp move.
MOZ_ASSERT(!useAtStart && !reuseInput);
MOZ_ASSERT(!reuseInput);
// If AVX is enabled, we prefer useRegisterAtStart.
useAtStart = isThreeOpAllowed();
break;
case wasm::SimdOp::F32x4Neg:
case wasm::SimdOp::F64x2Neg:
@ -1465,7 +1476,6 @@ void LIRGenerator::visitWasmUnarySimd128(MWasmUnarySimd128* ins) {
case wasm::SimdOp::I64x2Abs:
case wasm::SimdOp::I32x4TruncSatF32x4S:
case wasm::SimdOp::F32x4ConvertI32x4U:
case wasm::SimdOp::I16x8ExtaddPairwiseI8x16S:
case wasm::SimdOp::I16x8ExtaddPairwiseI8x16U:
case wasm::SimdOp::I32x4ExtaddPairwiseI16x8S:
case wasm::SimdOp::I32x4ExtaddPairwiseI16x8U:
@ -1476,18 +1486,19 @@ void LIRGenerator::visitWasmUnarySimd128(MWasmUnarySimd128* ins) {
case wasm::SimdOp::I64x2ExtendHighI32x4S:
case wasm::SimdOp::I64x2ExtendHighI32x4U:
// Prefer src == dest to avoid an unconditional src->dest move
// for better performance (e.g. non-PSHUFD use).
// for better performance in non-AVX mode (e.g. non-PSHUFD use).
useAtStart = true;
reuseInput = true;
reuseInput = !isThreeOpAllowed();
break;
case wasm::SimdOp::I32x4TruncSatF32x4U:
case wasm::SimdOp::I32x4TruncSatF64x2SZero:
case wasm::SimdOp::I32x4TruncSatF64x2UZero:
case wasm::SimdOp::I8x16Popcnt:
tempReg = tempSimd128();
// Prefer src == dest to avoid an unconditional src->dest move.
// Prefer src == dest to avoid an unconditional src->dest move
// in non-AVX mode.
useAtStart = true;
reuseInput = true;
reuseInput = !isThreeOpAllowed();
break;
case wasm::SimdOp::I16x8ExtendLowI8x16S:
case wasm::SimdOp::I16x8ExtendHighI8x16S:

View file

@ -55,8 +55,8 @@ void MacroAssemblerX86Shared::splatX4(FloatRegister input,
vbroadcastss(Operand(input), output);
return;
}
asMasm().moveSimd128Float(input.asSimd128(), output);
vshufps(0, output, output, output);
input = asMasm().moveSimd128FloatIfNotAVX(input.asSimd128(), output);
vshufps(0, input, input, output);
}
void MacroAssemblerX86Shared::splatX2(FloatRegister input,
@ -251,8 +251,9 @@ void MacroAssemblerX86Shared::compareInt8x16(FloatRegister lhs, Operand rhs,
loadAlignedSimd128Int(rhs, scratch);
}
// src := src > lhs (i.e. lhs < rhs)
vpcmpgtb(Operand(lhs), scratch, scratch);
moveSimd128Int(scratch, output);
FloatRegister outputTemp = selectDestIfAVX(scratch, output);
vpcmpgtb(Operand(lhs), scratch, outputTemp);
moveSimd128Int(outputTemp, output);
break;
}
case Assembler::Condition::NotEqual:
@ -351,8 +352,9 @@ void MacroAssemblerX86Shared::compareInt16x8(FloatRegister lhs, Operand rhs,
loadAlignedSimd128Int(rhs, scratch);
}
// src := src > lhs (i.e. lhs < rhs)
vpcmpgtw(Operand(lhs), scratch, scratch);
moveSimd128Int(scratch, output);
FloatRegister outputTemp = selectDestIfAVX(scratch, output);
vpcmpgtw(Operand(lhs), scratch, outputTemp);
moveSimd128Int(outputTemp, output);
break;
}
case Assembler::Condition::NotEqual:
@ -450,8 +452,9 @@ void MacroAssemblerX86Shared::compareInt32x4(FloatRegister lhs, Operand rhs,
loadAlignedSimd128Int(rhs, scratch);
}
// src := src > lhs (i.e. lhs < rhs)
vpcmpgtd(Operand(lhs), scratch, scratch);
moveSimd128Int(scratch, output);
FloatRegister outputTemp = selectDestIfAVX(scratch, output);
vpcmpgtd(Operand(lhs), scratch, outputTemp);
moveSimd128Int(outputTemp, output);
break;
}
case Assembler::Condition::NotEqual:
@ -583,8 +586,8 @@ void MacroAssemblerX86Shared::compareForOrderingInt64x2(
vpsubq(Operand(lhs), temp1, temp1);
vpcmpeqd(rhs, temp2, temp2);
vandpd(temp2, temp1, temp1);
asMasm().moveSimd128(lhs, output);
vpcmpgtd(rhs, output, output);
lhs = asMasm().moveSimd128IntIfNotAVX(lhs, output);
vpcmpgtd(rhs, lhs, output);
vpor(Operand(temp1), output, output);
vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), output, output);
break;
@ -593,8 +596,8 @@ void MacroAssemblerX86Shared::compareForOrderingInt64x2(
vmovdqa(Operand(lhs), temp2);
vpcmpgtd(Operand(lhs), temp1, temp1);
vpcmpeqd(Operand(rhs), temp2, temp2);
asMasm().moveSimd128(lhs, output);
vpsubq(rhs, output, output);
lhs = asMasm().moveSimd128IntIfNotAVX(lhs, output);
vpsubq(rhs, lhs, output);
vandpd(temp2, output, output);
vpor(Operand(temp1), output, output);
vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), output, output);
@ -604,8 +607,8 @@ void MacroAssemblerX86Shared::compareForOrderingInt64x2(
vmovdqa(Operand(lhs), temp2);
vpcmpgtd(Operand(lhs), temp1, temp1);
vpcmpeqd(Operand(rhs), temp2, temp2);
asMasm().moveSimd128(lhs, output);
vpsubq(rhs, output, output);
lhs = asMasm().moveSimd128IntIfNotAVX(lhs, output);
vpsubq(rhs, lhs, output);
vandpd(temp2, output, output);
vpor(Operand(temp1), output, output);
vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), output, output);
@ -617,8 +620,8 @@ void MacroAssemblerX86Shared::compareForOrderingInt64x2(
vpsubq(Operand(lhs), temp1, temp1);
vpcmpeqd(rhs, temp2, temp2);
vandpd(temp2, temp1, temp1);
asMasm().moveSimd128(lhs, output);
vpcmpgtd(rhs, output, output);
lhs = asMasm().moveSimd128IntIfNotAVX(lhs, output);
vpcmpgtd(rhs, lhs, output);
vpor(Operand(temp1), output, output);
vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), output, output);
asMasm().bitwiseXorSimd128(output, allOnes, output);
@ -967,17 +970,22 @@ void MacroAssemblerX86Shared::packedLeftShiftByScalarInt8x16(
void MacroAssemblerX86Shared::packedLeftShiftByScalarInt8x16(
Imm32 count, FloatRegister src, FloatRegister dest) {
MOZ_ASSERT(count.value <= 7);
asMasm().moveSimd128(src, dest);
if (MOZ_UNLIKELY(count.value == 0)) {
moveSimd128Int(src, dest);
return;
}
src = asMasm().moveSimd128IntIfNotAVX(src, dest);
// Use the doubling trick for low shift counts, otherwise mask off the bits
// that are shifted out of the low byte of each word and use word shifts. The
// optimal cutoff remains to be explored.
if (count.value <= 3) {
for (int32_t shift = count.value; shift > 0; --shift) {
asMasm().addInt8x16(dest, dest);
vpaddb(Operand(src), src, dest);
for (int32_t shift = count.value - 1; shift > 0; --shift) {
vpaddb(Operand(dest), dest, dest);
}
} else {
asMasm().bitwiseAndSimd128(
dest, SimdConstant::SplatX16(0xFF >> count.value), dest);
asMasm().bitwiseAndSimd128(src, SimdConstant::SplatX16(0xFF >> count.value),
dest);
vpsllw(count, dest, dest);
}
}
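// Why both strategies are safe (informally): x86 has no per-byte vector shift.
// For small counts, x + x == x << 1, so repeated vpaddb doubles every byte
// without bits ever crossing a byte boundary. For larger counts, masking each
// byte with (0xFF >> count) first clears exactly the bits that a 16-bit psllw
// would otherwise push into the neighboring byte, so the word shift then
// matches a true per-byte shift.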
@ -1070,10 +1078,10 @@ void MacroAssemblerX86Shared::packedRightShiftByScalarInt64x2(
FloatRegister in, Register count, FloatRegister temp, FloatRegister dest) {
ScratchSimd128Scope scratch(asMasm());
vmovd(count, temp);
asMasm().moveSimd128(in, dest);
asMasm().signReplicationInt64x2(in, scratch);
in = asMasm().moveSimd128FloatIfNotAVX(in, dest);
// Invert if negative, shift all, invert back if negative.
vpxor(Operand(scratch), dest, dest);
vpxor(Operand(scratch), in, dest);
vpsrlq(temp, dest, dest);
vpxor(Operand(scratch), dest, dest);
}
@ -1088,10 +1096,10 @@ void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt64x2(
void MacroAssemblerX86Shared::packedRightShiftByScalarInt64x2(
Imm32 count, FloatRegister src, FloatRegister dest) {
ScratchSimd128Scope scratch(asMasm());
asMasm().moveSimd128(src, dest);
asMasm().signReplicationInt64x2(src, scratch);
// Invert if negative, shift all, invert back if negative.
vpxor(Operand(scratch), dest, dest);
src = asMasm().moveSimd128FloatIfNotAVX(src, dest);
vpxor(Operand(scratch), src, dest);
vpsrlq(Imm32(count.value & 63), dest, dest);
vpxor(Operand(scratch), dest, dest);
}
@ -1104,11 +1112,16 @@ void MacroAssemblerX86Shared::selectSimd128(FloatRegister mask,
// Normally the codegen will attempt to enforce these register assignments so
// that the moves are avoided.
asMasm().moveSimd128Int(onTrue, output);
asMasm().moveSimd128Int(mask, temp);
onTrue = asMasm().moveSimd128IntIfNotAVX(onTrue, output);
if (MOZ_UNLIKELY(mask == onTrue)) {
vpor(Operand(onFalse), onTrue, output);
return;
}
vpand(Operand(temp), output, output);
vpandn(Operand(onFalse), temp, temp);
mask = asMasm().moveSimd128IntIfNotAVX(mask, temp);
vpand(Operand(mask), onTrue, output);
vpandn(Operand(onFalse), mask, temp);
vpor(Operand(temp), output, output);
}
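// The underlying identity here is output = (mask & onTrue) | (~mask & onFalse);
// vpandn supplies the ~mask & onFalse term. In the mask == onTrue special case
// this collapses to onTrue | onFalse, which is what the early-out emits.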
@ -1131,7 +1144,6 @@ void MacroAssemblerX86Shared::unsignedConvertInt32x4ToFloat32x4(
void MacroAssemblerX86Shared::truncSatFloat32x4ToInt32x4(FloatRegister src,
FloatRegister dest) {
ScratchSimd128Scope scratch(asMasm());
asMasm().moveSimd128Float(src, dest);
// The cvttps2dq instruction is the workhorse but does not handle NaN or out
// of range values as we need it to. We want to saturate too-large positive
@ -1139,9 +1151,10 @@ void MacroAssemblerX86Shared::truncSatFloat32x4ToInt32x4(FloatRegister src,
// become 0.
// Convert NaN to 0 by masking away values that compare unordered to itself.
vmovaps(dest, scratch);
vmovaps(src, scratch);
vcmpeqps(Operand(scratch), scratch, scratch);
vpand(Operand(scratch), dest, dest);
src = asMasm().moveSimd128FloatIfNotAVX(src, dest);
vpand(Operand(scratch), src, dest);
// Compute the complement of each non-NaN lane's sign bit, we'll need this to
// correct the result of cvttps2dq. All other output bits are garbage.
@ -1165,7 +1178,7 @@ void MacroAssemblerX86Shared::truncSatFloat32x4ToInt32x4(FloatRegister src,
void MacroAssemblerX86Shared::unsignedTruncSatFloat32x4ToInt32x4(
FloatRegister src, FloatRegister temp, FloatRegister dest) {
ScratchSimd128Scope scratch(asMasm());
asMasm().moveSimd128Float(src, dest);
src = asMasm().moveSimd128FloatIfNotAVX(src, dest);
// The cvttps2dq instruction is the workhorse but does not handle NaN or out
// of range values as we need it to. We want to saturate too-large positive
@ -1173,7 +1186,7 @@ void MacroAssemblerX86Shared::unsignedTruncSatFloat32x4ToInt32x4(
// Convert NaN and negative values to zeroes in dest.
vpxor(Operand(scratch), scratch, scratch);
vmaxps(Operand(scratch), dest, dest);
vmaxps(Operand(scratch), src, dest);
// Place the largest positive signed integer in all lanes in scratch.
// We use it to bias the conversion to handle edge cases.
@ -1217,14 +1230,14 @@ void MacroAssemblerX86Shared::unsignedTruncSatFloat32x4ToInt32x4(
void MacroAssemblerX86Shared::unsignedTruncSatFloat32x4ToInt32x4Relaxed(
FloatRegister src, FloatRegister dest) {
ScratchSimd128Scope scratch(asMasm());
asMasm().moveSimd128Float(src, dest);
src = asMasm().moveSimd128FloatIfNotAVX(src, dest);
// Place lanes below 80000000h into dest, otherwise into scratch.
// Keep dest or scratch 0 as default.
asMasm().loadConstantSimd128Float(SimdConstant::SplatX4(0x4f000000), scratch);
vcmpltps(Operand(src), scratch, scratch);
vpand(Operand(src), scratch, scratch);
vpxor(Operand(scratch), dest, dest);
vpxor(Operand(scratch), src, dest);
// Convert lanes below 80000000h into unsigned int without issues.
vcvttps2dq(dest, dest);
@ -1267,10 +1280,10 @@ void MacroAssemblerX86Shared::truncSatFloat64x2ToInt32x4(FloatRegister src,
void MacroAssemblerX86Shared::unsignedTruncSatFloat64x2ToInt32x4(
FloatRegister src, FloatRegister temp, FloatRegister dest) {
ScratchSimd128Scope scratch(asMasm());
asMasm().moveSimd128Float(src, dest);
src = asMasm().moveSimd128FloatIfNotAVX(src, dest);
vxorpd(scratch, scratch, scratch);
vmaxpd(Operand(scratch), dest, dest);
vmaxpd(Operand(scratch), src, dest);
asMasm().loadConstantSimd128Float(SimdConstant::SplatX2(4294967295.0), temp);
vminpd(Operand(temp), dest, dest);
@ -1284,11 +1297,10 @@ void MacroAssemblerX86Shared::unsignedTruncSatFloat64x2ToInt32x4(
void MacroAssemblerX86Shared::unsignedTruncSatFloat64x2ToInt32x4Relaxed(
FloatRegister src, FloatRegister dest) {
ScratchSimd128Scope scratch(asMasm());
asMasm().moveSimd128Float(src, dest);
// The same as unsignedConvertInt32x4ToFloat64x2, but without NaN
// and out-of-bounds checks.
vroundpd(SSERoundingMode::Trunc, Operand(dest), dest);
vroundpd(SSERoundingMode::Trunc, Operand(src), dest);
asMasm().loadConstantSimd128Float(SimdConstant::SplatX2(4503599627370496.0),
scratch);
vaddpd(Operand(scratch), dest, dest);
@ -1299,9 +1311,9 @@ void MacroAssemblerX86Shared::popcntInt8x16(FloatRegister src,
FloatRegister temp,
FloatRegister output) {
ScratchSimd128Scope scratch(asMasm());
asMasm().loadConstantSimd128Float(SimdConstant::SplatX16(0x0f), scratch);
asMasm().moveSimd128Int(src, temp);
vpand(scratch, temp, temp);
asMasm().loadConstantSimd128Int(SimdConstant::SplatX16(0x0f), scratch);
FloatRegister srcForTemp = asMasm().moveSimd128IntIfNotAVX(src, temp);
vpand(scratch, srcForTemp, temp);
vpandn(src, scratch, scratch);
int8_t counts[] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
asMasm().loadConstantSimd128(SimdConstant::CreateX16(counts), output);

View file

@ -1452,14 +1452,14 @@ void MacroAssembler::concatAndRightShiftSimd128(FloatRegister lhs,
void MacroAssembler::leftShiftSimd128(Imm32 count, FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
vpslldq(count, dest, dest);
src = moveSimd128IntIfNotAVX(src, dest);
vpslldq(count, src, dest);
}
void MacroAssembler::rightShiftSimd128(Imm32 count, FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
vpsrldq(count, dest, dest);
src = moveSimd128IntIfNotAVX(src, dest);
vpsrldq(count, src, dest);
}
// Reverse bytes in lanes.
@ -1467,10 +1467,10 @@ void MacroAssembler::rightShiftSimd128(Imm32 count, FloatRegister src,
void MacroAssembler::reverseInt16x8(FloatRegister src, FloatRegister dest) {
// Byteswap is MOV + PSLLW + PSRLW + POR, a small win over PSHUFB.
ScratchSimd128Scope scratch(*this);
moveSimd128(src, dest);
moveSimd128(src, scratch);
vpsllw(Imm32(8), dest, dest);
vpsrlw(Imm32(8), scratch, scratch);
FloatRegister srcForScratch = moveSimd128IntIfNotAVX(src, scratch);
vpsrlw(Imm32(8), srcForScratch, scratch);
src = moveSimd128IntIfNotAVX(src, dest);
vpsllw(Imm32(8), src, dest);
vpor(scratch, dest, dest);
}
@ -1556,8 +1556,8 @@ void MacroAssembler::bitmaskInt16x8(FloatRegister src, Register dest) {
// input and shifting rather than masking at the end, but creates a false
// dependency on the old value of scratch. The better fix is to allow src to
// be clobbered.
moveSimd128(src, scratch);
vpacksswb(Operand(scratch), scratch, scratch);
src = moveSimd128IntIfNotAVX(src, scratch);
vpacksswb(Operand(src), src, scratch);
vpmovmskb(scratch, dest);
andl(Imm32(0xFF), dest);
}
@ -1713,20 +1713,22 @@ void MacroAssembler::mulInt64x2(FloatRegister lhs, FloatRegister rhs,
// lhs = <D C> <B A>
// rhs = <H G> <F E>
// result = <(DG+CH)_low+CG_high CG_low> <(BE+AF)_low+AE_high AE_low>
moveSimd128(lhs, temp); // temp = <D C> <B A>
vpsrlq(Imm32(32), temp, temp); // temp = <0 D> <0 B>
vpmuludq(rhs, temp, temp); // temp = <DG> <BE>
moveSimd128(rhs, temp2); // temp2 = <H G> <F E>
vpsrlq(Imm32(32), temp2, temp2); // temp2 = <0 H> <0 F>
vpmuludq(lhs, temp2, temp2); // temp2 = <CH> <AF>
vpaddq(Operand(temp), temp2, temp2); // temp2 = <DG+CH> <BE+AF>
vpsllq(Imm32(32), temp2, temp2); // temp2 = <(DG+CH)_low 0>
// <(BE+AF)_low 0>
vpmuludq(rhs, dest, dest); // dest = <CG_high CG_low>
// <AE_high AE_low>
vpaddq(Operand(temp2), dest, dest); // dest =
// <(DG+CH)_low+CG_high CG_low>
// <(BE+AF)_low+AE_high AE_low>
FloatRegister lhsForTemp =
moveSimd128IntIfNotAVX(lhs, temp); // temp = <D C> <B A>
vpsrlq(Imm32(32), lhsForTemp, temp); // temp = <0 D> <0 B>
vpmuludq(rhs, temp, temp); // temp = <DG> <BE>
FloatRegister rhsForTemp =
moveSimd128IntIfNotAVX(rhs, temp2); // temp2 = <H G> <F E>
vpsrlq(Imm32(32), rhsForTemp, temp2); // temp2 = <0 H> <0 F>
vpmuludq(lhs, temp2, temp2); // temp2 = <CH> <AF>
vpaddq(Operand(temp), temp2, temp2); // temp2 = <DG+CH> <BE+AF>
vpsllq(Imm32(32), temp2, temp2); // temp2 = <(DG+CH)_low 0>
// <(BE+AF)_low 0>
vpmuludq(rhs, dest, dest); // dest = <CG_high CG_low>
// <AE_high AE_low>
vpaddq(Operand(temp2), dest, dest); // dest =
// <(DG+CH)_low+CG_high CG_low>
// <(BE+AF)_low+AE_high AE_low>
}
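// One-line derivation for lane 0 (lane 1 is analogous), writing the unsigned
// 32-bit halves as lhs = 2^32*B + A and rhs = 2^32*F + E:
//   (2^32*B + A)(2^32*F + E) mod 2^64 = 2^32*(B*E + A*F) + A*E
// (the 2^64*B*F term falls away), which is the <(BE+AF)_low+AE_high AE_low>
// layout described in the comment above.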
// Code generation from the PR: https://github.com/WebAssembly/simd/pull/376.
@ -2141,9 +2143,9 @@ void MacroAssembler::absInt32x4(FloatRegister src, FloatRegister dest) {
void MacroAssembler::absInt64x2(FloatRegister src, FloatRegister dest) {
ScratchSimd128Scope scratch(*this);
moveSimd128(src, dest);
signReplicationInt64x2(src, scratch);
vpxor(Operand(scratch), dest, dest);
src = moveSimd128IntIfNotAVX(src, dest);
vpxor(Operand(scratch), src, dest);
vpsubq(Operand(scratch), dest, dest);
}
@ -2167,7 +2169,7 @@ void MacroAssembler::leftShiftInt16x8(Register rhs, FloatRegister lhsDest) {
void MacroAssembler::leftShiftInt16x8(Imm32 count, FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
src = moveSimd128IntIfNotAVX(src, dest);
vpsllw(count, src, dest);
}
@ -2178,7 +2180,7 @@ void MacroAssembler::leftShiftInt32x4(Register rhs, FloatRegister lhsDest) {
void MacroAssembler::leftShiftInt32x4(Imm32 count, FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
src = moveSimd128IntIfNotAVX(src, dest);
vpslld(count, src, dest);
}
@ -2189,7 +2191,7 @@ void MacroAssembler::leftShiftInt64x2(Register rhs, FloatRegister lhsDest) {
void MacroAssembler::leftShiftInt64x2(Imm32 count, FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
src = moveSimd128IntIfNotAVX(src, dest);
vpsllq(count, src, dest);
}
@ -2226,7 +2228,7 @@ void MacroAssembler::rightShiftInt16x8(Register rhs, FloatRegister lhsDest) {
void MacroAssembler::rightShiftInt16x8(Imm32 count, FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
src = moveSimd128IntIfNotAVX(src, dest);
vpsraw(count, src, dest);
}
@ -2238,7 +2240,7 @@ void MacroAssembler::unsignedRightShiftInt16x8(Register rhs,
void MacroAssembler::unsignedRightShiftInt16x8(Imm32 count, FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
src = moveSimd128IntIfNotAVX(src, dest);
vpsrlw(count, src, dest);
}
@ -2249,7 +2251,7 @@ void MacroAssembler::rightShiftInt32x4(Register rhs, FloatRegister lhsDest) {
void MacroAssembler::rightShiftInt32x4(Imm32 count, FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
src = moveSimd128IntIfNotAVX(src, dest);
vpsrad(count, src, dest);
}
@ -2261,7 +2263,7 @@ void MacroAssembler::unsignedRightShiftInt32x4(Register rhs,
void MacroAssembler::unsignedRightShiftInt32x4(Imm32 count, FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
src = moveSimd128IntIfNotAVX(src, dest);
vpsrld(count, src, dest);
}
@ -2284,7 +2286,7 @@ void MacroAssembler::unsignedRightShiftInt64x2(Register rhs,
void MacroAssembler::unsignedRightShiftInt64x2(Imm32 count, FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
src = moveSimd128IntIfNotAVX(src, dest);
vpsrlq(count, src, dest);
}
@ -2299,14 +2301,14 @@ void MacroAssembler::signReplicationInt8x16(FloatRegister src,
void MacroAssembler::signReplicationInt16x8(FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
vpsraw(Imm32(15), dest, dest);
src = moveSimd128IntIfNotAVX(src, dest);
vpsraw(Imm32(15), src, dest);
}
void MacroAssembler::signReplicationInt32x4(FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
vpsrad(Imm32(31), dest, dest);
src = moveSimd128IntIfNotAVX(src, dest);
vpsrad(Imm32(31), src, dest);
}
void MacroAssembler::signReplicationInt64x2(FloatRegister src,
@ -2810,30 +2812,22 @@ void MacroAssembler::extAddPairwiseInt8x16(FloatRegister src,
void MacroAssembler::unsignedExtAddPairwiseInt8x16(FloatRegister src,
FloatRegister dest) {
ScratchSimd128Scope scratch(*this);
moveSimd128(src, dest);
loadConstantSimd128Int(SimdConstant::SplatX16(1), scratch);
vpmaddubsw(scratch, dest, dest);
src = moveSimd128IntIfNotAVX(src, dest);
vpmaddubswSimd128(SimdConstant::SplatX16(1), src, dest);
}
void MacroAssembler::extAddPairwiseInt16x8(FloatRegister src,
FloatRegister dest) {
ScratchSimd128Scope scratch(*this);
moveSimd128(src, dest);
loadConstantSimd128Int(SimdConstant::SplatX8(1), scratch);
vpmaddwd(Operand(scratch), dest, dest);
src = moveSimd128IntIfNotAVX(src, dest);
vpmaddwdSimd128(SimdConstant::SplatX8(1), src, dest);
}
void MacroAssembler::unsignedExtAddPairwiseInt16x8(FloatRegister src,
FloatRegister dest) {
ScratchSimd128Scope scratch(*this);
moveSimd128(src, dest);
loadConstantSimd128Int(SimdConstant::SplatX8(0x8000), scratch);
vpxor(scratch, dest, dest);
loadConstantSimd128Int(SimdConstant::SplatX8(1), scratch);
vpmaddwd(Operand(scratch), dest, dest);
loadConstantSimd128Int(SimdConstant::SplatX4(0x00010000), scratch);
vpaddd(Operand(scratch), dest, dest);
src = moveSimd128IntIfNotAVX(src, dest);
vpxorSimd128(SimdConstant::SplatX8(-0x8000), src, dest);
vpmaddwdSimd128(SimdConstant::SplatX8(1), dest, dest);
vpadddSimd128(SimdConstant::SplatX4(0x00010000), dest, dest);
}
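// A quick check of the bias trick in the unsigned case: for a 16-bit lane a,
// a ^ 0x8000 reinterpreted as signed equals a - 0x8000, so vpmaddwd with a
// splat of 1 produces (a0 - 0x8000) + (a1 - 0x8000) = a0 + a1 - 0x10000 per
// 32-bit lane, and the final vpaddd with SplatX4(0x00010000) adds the 0x10000
// back, leaving the exact unsigned sum (at most 0x1FFFE, well within 32 bits).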
// Floating square root
@ -3023,8 +3017,8 @@ void MacroAssembler::unsignedWidenLowInt32x4(FloatRegister src,
}
void MacroAssembler::widenHighInt32x4(FloatRegister src, FloatRegister dest) {
if (src == dest) {
vmovhlps(dest, dest, dest);
if (src == dest || HasAVX()) {
vmovhlps(src, src, dest);
} else {
vpshufd(ComputeShuffleMask(2, 3, 2, 3), src, dest);
}
@ -3033,11 +3027,10 @@ void MacroAssembler::widenHighInt32x4(FloatRegister src, FloatRegister dest) {
void MacroAssembler::unsignedWidenHighInt32x4(FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
ScratchSimd128Scope scratch(*this);
src = moveSimd128IntIfNotAVX(src, dest);
vpxor(scratch, scratch, scratch);
vpunpckhdq(scratch, dest, dest);
vpunpckhdq(scratch, src, dest);
}
// Floating multiply-accumulate: srcDest [+-]= src1 * src2

View file

@ -587,6 +587,10 @@ class MacroAssemblerX86Shared : public Assembler {
moveSimd128Int(src, dest);
return dest;
}
FloatRegister selectDestIfAVX(FloatRegister src, FloatRegister dest) {
MOZ_ASSERT(src.isSimd128() && dest.isSimd128());
return HasAVX() ? dest : src;
}
void loadUnalignedSimd128Int(const Address& src, FloatRegister dest) {
vmovdqu(Operand(src), dest);
}

View file

@ -421,6 +421,12 @@ void MacroAssemblerX86::vcmplepdSimd128(const SimdConstant& v,
vpPatchOpSimd128(v, lhs, dest, &X86Encoding::BaseAssemblerX86::vcmplepd_mr);
}
void MacroAssemblerX86::vpmaddubswSimd128(const SimdConstant& v,
FloatRegister lhs,
FloatRegister dest) {
vpPatchOpSimd128(v, lhs, dest, &X86Encoding::BaseAssemblerX86::vpmaddubsw_mr);
}
void MacroAssemblerX86::finish() {
// Last instruction may be an indirect jump so eagerly insert an undefined
// instruction byte to prevent processors from decoding data values into

View file

@ -1063,6 +1063,8 @@ class MacroAssemblerX86 : public MacroAssemblerX86Shared {
FloatRegister dest);
void vcmplepdSimd128(const SimdConstant& v, FloatRegister lhs,
FloatRegister dest);
void vpmaddubswSimd128(const SimdConstant& v, FloatRegister lhs,
FloatRegister dest);
Condition testInt32Truthy(bool truthy, const ValueOperand& operand) {
test32(operand.payloadReg(), operand.payloadReg());