Mirror of https://github.com/mozilla/gecko-dev.git
Bug 1642909 - Do not scalarize i64x2.mul on x64/x86. r=jseward
This is mostly code removal: we remove the platform-specific lowering and code generation for scalarized i64x2.mul. In its place we use Andrew Brown's code for a platform-agnostic SIMD implementation; this fits very neatly into the existing code generation pipeline.

Differential Revision: https://phabricator.services.mozilla.com/D78015
This commit is contained in:
Parent
326908e0f8
Commit
db2f9302ac
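The vectorized lowering that replaces the scalarized one builds each 64-bit lane product out of 32x32->64-bit partial products, since SSE2 has no 64x64-bit vector multiply. Here is a minimal scalar model of that decomposition (standalone C++, not SpiderMonkey code; names are illustrative):

```cpp
#include <cstdint>
#include <cassert>

// One 64-bit lane of i64x2.mul built from 32x32->64 multiplies, the only
// widening multiply SSE2 provides (pmuludq). With a = (aHi:aLo) and
// b = (bHi:bLo), modulo 2^64:
//   a*b = aLo*bLo + ((aLo*bHi + aHi*bLo) << 32)
// The aHi*bHi term is shifted out entirely and can be dropped.
uint64_t mulLane64(uint64_t a, uint64_t b) {
  uint64_t aLo = a & 0xFFFFFFFFu, aHi = a >> 32;
  uint64_t bLo = b & 0xFFFFFFFFu, bHi = b >> 32;
  uint64_t cross = aLo * bHi + aHi * bLo;  // only its low 32 bits survive
  return aLo * bLo + (cross << 32);
}

int main() {
  assert(mulLane64(0x123456789ABCDEF0u, 37) == 0x123456789ABCDEF0u * 37);
  assert(mulLane64(~0ull, ~0ull) == 1);  // (2^64-1)^2 wraps to 1 mod 2^64
}
```

The SIMD sequence added to MacroAssembler::mulInt64x2 below performs exactly this computation on both lanes at once.
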
@@ -256,7 +256,9 @@ Int32Array.rectify = (x) => sign_extend(x,32);
 Uint32Array.inputs = Int32Array.inputs;
 Uint32Array.rectify = (x) => zero_extend(x,32);
 
-BigInt64Array.inputs = [[1,2],[2,1],[-1,-2],[-2,-1],[2n ** 32n, 2n ** 32n - 5n]];
+BigInt64Array.inputs = [[1,2],[2,1],[-1,-2],[-2,-1],[2n ** 32n, 2n ** 32n - 5n],
+                        [(2n ** 38n) / 5n, (2n ** 41n) / 7n],
+                        [-((2n ** 38n) / 5n), (2n ** 41n) / 7n]];
 BigInt64Array.rectify = (x) => BigInt(x);
 
 Float32Array.inputs = [[1, -1, 1e10, -1e10],

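The two new input pairs are chosen so that the full product no longer fits in 64 bits, exercising the wrap-around behaviour of i64x2.mul. A quick standalone check, assuming the GCC/Clang `unsigned __int128` extension:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // The two added BigInt64Array inputs, as 64-bit integers.
  uint64_t a = (uint64_t(1) << 38) / 5;  // 2n ** 38n / 5n
  uint64_t b = (uint64_t(1) << 41) / 7;  // 2n ** 41n / 7n
  unsigned __int128 full = static_cast<unsigned __int128>(a) * b;
  assert(static_cast<uint64_t>(full >> 64) != 0);  // true product needs >64 bits
  assert(a * b == static_cast<uint64_t>(full));    // i64x2.mul keeps the low 64
}
```
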
@@ -2077,12 +2077,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
       DEFINED_ON(x86_shared);
 
   inline void mulInt64x2(FloatRegister rhs, FloatRegister lhsDest,
-                         Register64 temp) DEFINED_ON(x64);
-
-  // `temp1` must be edx:eax
-  inline void mulInt64x2(FloatRegister rhs, FloatRegister lhsDest,
-                         Register64 temp1, Register64 temp2, Register temp3)
-      DEFINED_ON(x86);
+                         FloatRegister temp) DEFINED_ON(x86_shared);
 
   // Integer Negate
 

@@ -700,14 +700,3 @@ void CodeGenerator::visitTestI64AndBranch(LTestI64AndBranch* lir) {
   masm.testq(input, input);
   emitBranch(Assembler::NonZero, lir->ifTrue(), lir->ifFalse());
 }
-
-void CodeGenerator::visitWasmI64x2Mul(LWasmI64x2Mul* ins) {
-#ifdef ENABLE_WASM_SIMD
-  FloatRegister lhsDest = ToFloatRegister(ins->lhsDest());
-  FloatRegister rhs = ToFloatRegister(ins->rhs());
-  Register64 temp = ToRegister64(ins->temp1());
-  masm.mulInt64x2(rhs, lhsDest, temp);
-#else
-  MOZ_CRASH("No SIMD");
-#endif
-}

@@ -404,12 +404,3 @@ void LIRGenerator::visitSignExtendInt64(MSignExtendInt64* ins) {
                LSignExtendInt64(useInt64RegisterAtStart(ins->input())),
                ins);
 }
-
-void LIRGeneratorX64::lowerForWasmI64x2Mul(MWasmBinarySimd128* ins,
-                                           MDefinition* lhs, MDefinition* rhs) {
-  LAllocation lhsDestAlloc = useRegisterAtStart(lhs);
-  LAllocation rhsAlloc =
-      lhs != rhs ? useRegister(rhs) : useRegisterAtStart(rhs);
-  auto* lir = new (alloc()) LWasmI64x2Mul(lhsDestAlloc, rhsAlloc, tempInt64());
-  defineReuseInput(lir, ins, LWasmI64x2Mul::LhsDest);
-}

@@ -28,8 +28,6 @@ class LIRGeneratorX64 : public LIRGeneratorX86Shared {
                          MDefinition* mir, MDefinition* lhs, MDefinition* rhs);
   void lowerForMulInt64(LMulI64* ins, MMul* mir, MDefinition* lhs,
                         MDefinition* rhs);
-  void lowerForWasmI64x2Mul(MWasmBinarySimd128* ins, MDefinition* lhs,
-                            MDefinition* rhs);
 
   // Returns a box allocation. reg2 is ignored on 64-bit platforms.
   LBoxAllocation useBoxFixed(MDefinition* mir, Register reg1, Register,

@@ -767,22 +767,6 @@ void MacroAssembler::anyTrueSimd128(FloatRegister src, Register dest) {
   cmovCCl(NonZero, one, dest);
 }
 
-// Integer Multiply
-
-void MacroAssembler::mulInt64x2(FloatRegister rhs, FloatRegister lhsDest,
-                                Register64 temp) {
-  ScratchRegisterScope t1(*this);
-  Register t2 = temp.reg;
-  vpextrq(0, lhsDest, t1);
-  vpextrq(0, rhs, t2);
-  imulq(t2, t1);
-  vpinsrq(0, t1, lhsDest, lhsDest);
-  vpextrq(1, lhsDest, t1);
-  vpextrq(1, rhs, t2);
-  imulq(t2, t1);
-  vpinsrq(1, t1, lhsDest, lhsDest);
-}
-
 // Extract lane as scalar
 
 void MacroAssembler::extractLaneInt64x2(uint32_t lane, FloatRegister src,

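For contrast with the replacement algorithm, a standalone intrinsics model of the scalarized pattern deleted above: extract each lane to a GPR, multiply with scalar imul, and reinsert. This assumes SSE4.1 and is illustrative only, not SpiderMonkey code:

```cpp
#include <smmintrin.h>  // SSE4.1: _mm_extract_epi64 / _mm_insert_epi64
#include <cstdint>
#include <cassert>

// Mirrors the removed vpextrq/imulq/vpinsrq sequence.
__m128i mul_i64x2_scalarized(__m128i lhs, __m128i rhs) {
  int64_t lo = _mm_extract_epi64(lhs, 0) * _mm_extract_epi64(rhs, 0);
  int64_t hi = _mm_extract_epi64(lhs, 1) * _mm_extract_epi64(rhs, 1);
  lhs = _mm_insert_epi64(lhs, lo, 0);
  return _mm_insert_epi64(lhs, hi, 1);
}

int main() {
  __m128i a = _mm_set_epi64x(-2, 7);  // lane1 = -2, lane0 = 7
  __m128i b = _mm_set_epi64x(3, 5);
  __m128i c = mul_i64x2_scalarized(a, b);
  assert(_mm_extract_epi64(c, 0) == 35 && _mm_extract_epi64(c, 1) == -6);
}
```

Each lane pair round-trips between the XMM and GPR domains; the SSE2 sequence added below stays entirely in vector registers and, on x86, also drops the need for fixed edx:eax temps.
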
@@ -2452,6 +2452,9 @@ void CodeGenerator::visitWasmBinarySimd128(LWasmBinarySimd128* ins) {
     case wasm::SimdOp::I64x2Sub:
       masm.subInt64x2(rhs, lhsDest);
       break;
+    case wasm::SimdOp::I64x2Mul:
+      masm.mulInt64x2(rhs, lhsDest, temp1);
+      break;
     case wasm::SimdOp::F32x4Add:
       masm.addFloat32x4(rhs, lhsDest);
       break;

@@ -361,43 +361,6 @@ class LWasmBinarySimd128 : public LInstructionHelper<1, 2, 2> {
   wasm::SimdOp simdOp() const { return mir_->toWasmBinarySimd128()->simdOp(); }
 };
 
-// (v128, v128) -> v128 effect-free operations for i64x2.mul
-// lhs and dest are the same.
-// x64: one i64 temp.
-// x86: two i64 temps and one i32 temp
-class LWasmI64x2Mul : public LInstructionHelper<1, 2, INT64_PIECES * 2 + 1> {
- public:
-  LIR_HEADER(WasmI64x2Mul)
-
-  static constexpr uint32_t LhsDest = 0;
-  static constexpr uint32_t Rhs = 1;
-
-  LWasmI64x2Mul(const LAllocation& lhsDest, const LAllocation& rhs,
-                const LInt64Definition& temp)
-      : LInstructionHelper(classOpcode) {
-    setOperand(LhsDest, lhsDest);
-    setOperand(Rhs, rhs);
-    setInt64Temp(0, temp);
-  }
-
-  LWasmI64x2Mul(const LAllocation& lhsDest, const LAllocation& rhs,
-                const LInt64Definition& temp1, const LInt64Definition& temp2,
-                const LDefinition& temp3)
-      : LInstructionHelper(classOpcode) {
-    setOperand(LhsDest, lhsDest);
-    setOperand(Rhs, rhs);
-    setInt64Temp(0, temp1);
-    setInt64Temp(INT64_PIECES, temp2);
-    setTemp(INT64_PIECES * 2, temp3);
-  }
-
-  const LAllocation* lhsDest() { return getOperand(LhsDest); }
-  const LAllocation* rhs() { return getOperand(Rhs); }
-  const LInt64Definition temp1() { return getInt64Temp(0); }
-  const LInt64Definition temp2() { return getInt64Temp(INT64_PIECES); }
-  const LDefinition* temp3() { return getTemp(INT64_PIECES * 2); }
-};
-
 // (v128, i32) -> v128 effect-free variable-width shift operations
 // lhs and dest are the same.
 // temp0 is a GPR (if in use).

@@ -736,6 +736,7 @@ void LIRGenerator::visitWasmBinarySimd128(MWasmBinarySimd128* ins) {
     }
     case wasm::SimdOp::F32x4Max:
     case wasm::SimdOp::F64x2Max:
+    case wasm::SimdOp::I64x2Mul:
     case wasm::SimdOp::V8x16Swizzle:
       tempReg0 = tempSimd128();
       break;
@@ -758,16 +759,12 @@ void LIRGenerator::visitWasmBinarySimd128(MWasmBinarySimd128* ins) {
       break;
   }
 
-  if (ins->simdOp() == wasm::SimdOp::I64x2Mul) {
-    lowerForWasmI64x2Mul(ins, lhs, rhs);
-  } else {
-    LAllocation lhsDestAlloc = useRegisterAtStart(lhs);
-    LAllocation rhsAlloc =
-        lhs != rhs ? useRegister(rhs) : useRegisterAtStart(rhs);
-    auto* lir = new (alloc())
-        LWasmBinarySimd128(lhsDestAlloc, rhsAlloc, tempReg0, tempReg1);
-    defineReuseInput(lir, ins, LWasmBinarySimd128::LhsDest);
-  }
+  LAllocation lhsDestAlloc = useRegisterAtStart(lhs);
+  LAllocation rhsAlloc =
+      lhs != rhs ? useRegister(rhs) : useRegisterAtStart(rhs);
+  auto* lir = new (alloc())
+      LWasmBinarySimd128(lhsDestAlloc, rhsAlloc, tempReg0, tempReg1);
+  defineReuseInput(lir, ins, LWasmBinarySimd128::LhsDest);
 }
 
 void LIRGenerator::visitWasmShiftSimd128(MWasmShiftSimd128* ins) {

@@ -1432,6 +1432,28 @@ void MacroAssembler::mulInt32x4(FloatRegister rhs, FloatRegister lhsDest) {
   vpmulld(Operand(rhs), lhsDest, lhsDest);
 }
 
+void MacroAssembler::mulInt64x2(FloatRegister rhs, FloatRegister lhsDest,
+                                FloatRegister temp) {
+  ScratchSimd128Scope temp2(*this);
+  // lhsDest = <D C> <B A>
+  // rhs = <H G> <F E>
+  // result = <(DG+CH)_low+CG_high CG_low> <(BE+AF)_low+AE_high AE_low>
+  moveSimd128(lhsDest, temp);                // temp = <D C> <B A>
+  vpsrlq(Imm32(32), temp, temp);             // temp = <0 D> <0 B>
+  vpmuludq(rhs, temp, temp);                 // temp = <DG> <BE>
+  moveSimd128(rhs, temp2);                   // temp2 = <H G> <F E>
+  vpsrlq(Imm32(32), temp2, temp2);           // temp2 = <0 H> <0 F>
+  vpmuludq(lhsDest, temp2, temp2);           // temp2 = <CH> <AF>
+  vpaddq(Operand(temp), temp2, temp2);       // temp2 = <DG+CH> <BE+AF>
+  vpsllq(Imm32(32), temp2, temp2);           // temp2 = <(DG+CH)_low 0>
+                                             //         <(BE+AF)_low 0>
+  vpmuludq(rhs, lhsDest, lhsDest);           // lhsDest = <CG_high CG_low>
+                                             //           <AE_high AE_low>
+  vpaddq(Operand(temp2), lhsDest, lhsDest);  // lhsDest =
+                                             //    <(DG+CH)_low+CG_high CG_low>
+                                             //    <(BE+AF)_low+AE_high AE_low>
+}
+
 // Integer negate
 
 void MacroAssembler::negInt8x16(FloatRegister src, FloatRegister dest) {

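The same sequence expressed with SSE2 intrinsics as a standalone model (illustrative, not SpiderMonkey code; it mirrors the vpsrlq/vpmuludq/vpsllq/vpaddq chain above, with the same lane letters):

```cpp
#include <emmintrin.h>  // SSE2 only; no lane extracts needed
#include <cstdint>
#include <cassert>

// lhs = <D C> <B A>, rhs = <H G> <F E>, 32-bit letters as in the comments above.
// _mm_mul_epu32 (pmuludq) multiplies the low 32 bits of each 64-bit lane.
__m128i mul_i64x2(__m128i lhs, __m128i rhs) {
  __m128i lhsHi = _mm_srli_epi64(lhs, 32);                   // <0 D> <0 B>
  __m128i rhsHi = _mm_srli_epi64(rhs, 32);                   // <0 H> <0 F>
  __m128i cross = _mm_add_epi64(_mm_mul_epu32(lhsHi, rhs),   // <DG> <BE>
                                _mm_mul_epu32(rhsHi, lhs));  // + <CH> <AF>
  cross = _mm_slli_epi64(cross, 32);         // <(DG+CH)_low 0> <(BE+AF)_low 0>
  return _mm_add_epi64(_mm_mul_epu32(lhs, rhs), cross);      // + <CG> <AE>
}

int main() {
  __m128i a = _mm_set_epi64x(INT64_C(0x123456789ABCDEF0), -7);
  __m128i b = _mm_set_epi64x(INT64_C(1000003), 9);
  int64_t out[2];
  _mm_storeu_si128(reinterpret_cast<__m128i*>(out), mul_i64x2(a, b));
  assert(out[0] == -63);  // low lane: -7 * 9
  assert(out[1] ==
         static_cast<int64_t>(UINT64_C(0x123456789ABCDEF0) * 1000003));
}
```

Because the multiply is computed modulo 2^64, the unsigned decomposition gives the correct two's-complement result for signed lanes as well, as the -7 * 9 lane shows.
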
@@ -1077,16 +1077,3 @@ void CodeGenerator::visitTestI64AndBranch(LTestI64AndBranch* lir) {
   masm.testl(input.low, input.low);
   emitBranch(Assembler::NonZero, lir->ifTrue(), lir->ifFalse());
 }
-
-void CodeGenerator::visitWasmI64x2Mul(LWasmI64x2Mul* ins) {
-#ifdef ENABLE_WASM_SIMD
-  FloatRegister lhsDest = ToFloatRegister(ins->lhsDest());
-  FloatRegister rhs = ToFloatRegister(ins->rhs());
-  Register64 temp1 = ToRegister64(ins->temp1());
-  Register64 temp2 = ToRegister64(ins->temp2());
-  Register temp3 = ToRegister(ins->temp3());
-  masm.mulInt64x2(rhs, lhsDest, temp1, temp2, temp3);
-#else
-  MOZ_CRASH("No SIMD");
-#endif
-}

@@ -681,14 +681,3 @@ void LIRGenerator::visitSignExtendInt64(MSignExtendInt64* ins) {
              LInt64Allocation(LAllocation(AnyRegister(edx)),
                               LAllocation(AnyRegister(eax))));
 }
-
-void LIRGeneratorX86::lowerForWasmI64x2Mul(MWasmBinarySimd128* ins,
-                                           MDefinition* lhs, MDefinition* rhs) {
-  LAllocation lhsDestAlloc = useRegisterAtStart(lhs);
-  LAllocation rhsAlloc =
-      lhs != rhs ? useRegister(rhs) : useRegisterAtStart(rhs);
-  auto* lir = new (alloc())
-      LWasmI64x2Mul(lhsDestAlloc, rhsAlloc,
-                    tempInt64Fixed(Register64(edx, eax)), tempInt64(), temp());
-  defineReuseInput(lir, ins, LWasmI64x2Mul::LhsDest);
-}

@@ -47,8 +47,6 @@ class LIRGeneratorX86 : public LIRGeneratorX86Shared {
                          MDefinition* mir, MDefinition* lhs, MDefinition* rhs);
   void lowerForMulInt64(LMulI64* ins, MMul* mir, MDefinition* lhs,
                         MDefinition* rhs);
-  void lowerForWasmI64x2Mul(MWasmBinarySimd128* ins, MDefinition* lhs,
-                            MDefinition* rhs);
 
   void lowerDivI64(MDiv* div);
   void lowerModI64(MMod* mod);

@@ -997,19 +997,6 @@ void MacroAssembler::anyTrueSimd128(FloatRegister src, Register dest) {
   bind(&done);
 }
 
-void MacroAssembler::mulInt64x2(FloatRegister rhs, FloatRegister lhsDest,
-                                Register64 temp1, Register64 temp2,
-                                Register temp3) {
-  extractLaneInt64x2(0, lhsDest, temp1);
-  extractLaneInt64x2(0, rhs, temp2);
-  mul64(temp2, temp1, temp3);
-  replaceLaneInt64x2(0, temp1, lhsDest);
-  extractLaneInt64x2(1, lhsDest, temp1);
-  extractLaneInt64x2(1, rhs, temp2);
-  mul64(temp2, temp1, temp3);
-  replaceLaneInt64x2(1, temp1, lhsDest);
-}
-
 void MacroAssembler::extractLaneInt64x2(uint32_t lane, FloatRegister src,
                                         Register64 dest) {
   vpextrd(2 * lane, src, dest.low);

@@ -8066,7 +8066,6 @@ class BaseCompiler final : public BaseCompilerInterface {
   MOZ_MUST_USE bool emitBitselect();
   MOZ_MUST_USE bool emitVectorShuffle();
   MOZ_MUST_USE bool emitVectorShiftRightI64x2();
-  MOZ_MUST_USE bool emitVectorMulI64x2();
 #endif
 };
 

@@ -12693,6 +12692,11 @@ static void MulF32x4(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
   masm.mulFloat32x4(rs, rsd);
 }
 
+static void MulI64x2(MacroAssembler& masm, RegV128 rs, RegV128 rsd,
+                     RegV128 temp) {
+  masm.mulInt64x2(rs, rsd, temp);
+}
+
 static void MulF64x2(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
   masm.mulFloat64x2(rs, rsd);
 }

@@ -13415,41 +13419,6 @@ bool BaseCompiler::emitVectorShiftRightI64x2() {
 
   return true;
 }
-
-// Must be scalarized on x86/x64 and requires different temp regs on the
-// two architectures.
-bool BaseCompiler::emitVectorMulI64x2() {
-  Nothing unused_a, unused_b;
-  if (!iter_.readBinary(ValType::V128, &unused_a, &unused_b)) {
-    return false;
-  }
-
-  if (deadCode_) {
-    return true;
-  }
-
-#  if defined(JS_CODEGEN_X64)
-  emitVectorBinopWithTemp<RegI64>(
-      [](MacroAssembler& masm, RegV128 rs, RegV128 rsd, RegI64 temp) {
-        masm.mulInt64x2(rs, rsd, temp);
-      });
-#  elif defined(JS_CODEGEN_X86)
-  RegV128 r, rs;
-  pop2xV128(&r, &rs);
-  needI64(specific_.edx_eax);
-  RegI64 temp1 = specific_.edx_eax;
-  RegI64 temp2 = needI64();
-  ScratchI32 temp3(*this);
-  masm.mulInt64x2(rs, r, temp1, temp2, temp3);
-  freeV128(rs);
-  freeI64(temp1);
-  freeI64(temp2);
-  pushV128(r);
-#  else
-  MOZ_CRASH("No porting API for MulI64x2");
-#  endif
-  return true;
-}
 #endif
 
 bool BaseCompiler::emitBody() {

@@ -14463,7 +14432,7 @@ bool BaseCompiler::emitBody() {
       case uint32_t(SimdOp::I64x2Sub):
        CHECK_NEXT(dispatchVectorBinary(emitVectorBinop, SubI64x2));
       case uint32_t(SimdOp::I64x2Mul):
-        CHECK_NEXT(emitVectorMulI64x2());
+        CHECK_NEXT(dispatchVectorBinary(emitVectorBinopWithTemp, MulI64x2));
       case uint32_t(SimdOp::F32x4Add):
        CHECK_NEXT(dispatchVectorBinary(emitVectorBinop, AddF32x4));
       case uint32_t(SimdOp::F32x4Sub):