Bug 1642909 - Do not scalarize i64x2.mul on x64/x86. r=jseward

This is mostly code removal: we remove the platform-specific lowering
and code generation for scalarized i64x2.mul.  In its place we use
Andrew Brown's code for a platform-agnostic SIMD implementation, which
fits very neatly into the existing code-generation pipeline.

Differential Revision: https://phabricator.services.mozilla.com/D78015
Lars T Hansen 2020-06-08 07:57:41 +00:00
Parent 326908e0f8
Commit db2f9302ac
15 changed files: 42 additions, 168 deletions
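For reference, the per-lane algebra that the platform-agnostic lowering relies on (see MacroAssembler::mulInt64x2 below) can be checked in scalar code. A minimal sketch, assuming nothing beyond standard C++; mulLow64 is a hypothetical helper, not SpiderMonkey code:

#include <cassert>
#include <cstdint>

// Writing a = 2^32*aHi + aLo and b = 2^32*bHi + bLo, the low 64 bits of
// a*b are aLo*bLo + ((aLo*bHi + aHi*bLo) << 32): the aHi*bHi term is
// shifted entirely out of the low 64 bits.
static uint64_t mulLow64(uint64_t a, uint64_t b) {
  uint64_t aLo = a & 0xFFFFFFFFu, aHi = a >> 32;
  uint64_t bLo = b & 0xFFFFFFFFu, bHi = b >> 32;
  uint64_t cross = aLo * bHi + aHi * bLo;  // the two 32x32 cross products
  return aLo * bLo + (cross << 32);        // wraps mod 2^64, as i64x2.mul must
}

int main() {
  uint64_t a = 0x123456789ABCDEF0u, b = 0xFEDCBA9876543210u;
  assert(mulLow64(a, b) == a * b);  // unsigned multiply wraps mod 2^64
  return 0;
}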


@@ -256,7 +256,9 @@ Int32Array.rectify = (x) => sign_extend(x,32);
Uint32Array.inputs = Int32Array.inputs;
Uint32Array.rectify = (x) => zero_extend(x,32);
BigInt64Array.inputs = [[1,2],[2,1],[-1,-2],[-2,-1],[2n ** 32n, 2n ** 32n - 5n]];
BigInt64Array.inputs = [[1,2],[2,1],[-1,-2],[-2,-1],[2n ** 32n, 2n ** 32n - 5n],
[(2n ** 38n) / 5n, (2n ** 41n) / 7n],
[-((2n ** 38n) / 5n), (2n ** 41n) / 7n]];
BigInt64Array.rectify = (x) => BigInt(x);
Float32Array.inputs = [[1, -1, 1e10, -1e10],
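The two new input pairs appear chosen so that both 32-bit halves of each operand are nonzero and the full product overflows 64 bits, exercising both the cross-term and wrap-around paths of the new lowering. A quick check of that reading (a sketch; unsigned __int128 is a GCC/Clang extension):

#include <cstdint>
#include <cstdio>

int main() {
  // The magnitudes added to BigInt64Array.inputs above.
  uint64_t a = (uint64_t(1) << 38) / 5;  // 54975581388; high and low halves nonzero
  uint64_t b = (uint64_t(1) << 41) / 7;  // 314146179364; high and low halves nonzero
  unsigned __int128 full = (unsigned __int128)a * b;  // a ~74-bit product
  // i64x2.mul keeps only the low 64 bits, i.e. the product mod 2^64.
  printf("low 64 bits: %llu\n", (unsigned long long)full);
  return 0;
}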


@@ -2077,12 +2077,7 @@ class MacroAssembler : public MacroAssemblerSpecific {
DEFINED_ON(x86_shared);
inline void mulInt64x2(FloatRegister rhs, FloatRegister lhsDest,
                       Register64 temp) DEFINED_ON(x64);
// `temp1` must be edx:eax
inline void mulInt64x2(FloatRegister rhs, FloatRegister lhsDest,
                       Register64 temp1, Register64 temp2, Register temp3)
    DEFINED_ON(x86);
inline void mulInt64x2(FloatRegister rhs, FloatRegister lhsDest,
                       FloatRegister temp) DEFINED_ON(x86_shared);
// Integer Negate


@@ -700,14 +700,3 @@ void CodeGenerator::visitTestI64AndBranch(LTestI64AndBranch* lir) {
masm.testq(input, input);
emitBranch(Assembler::NonZero, lir->ifTrue(), lir->ifFalse());
}
void CodeGenerator::visitWasmI64x2Mul(LWasmI64x2Mul* ins) {
#ifdef ENABLE_WASM_SIMD
FloatRegister lhsDest = ToFloatRegister(ins->lhsDest());
FloatRegister rhs = ToFloatRegister(ins->rhs());
Register64 temp = ToRegister64(ins->temp1());
masm.mulInt64x2(rhs, lhsDest, temp);
#else
MOZ_CRASH("No SIMD");
#endif
}


@@ -404,12 +404,3 @@ void LIRGenerator::visitSignExtendInt64(MSignExtendInt64* ins) {
LSignExtendInt64(useInt64RegisterAtStart(ins->input())),
ins);
}
void LIRGeneratorX64::lowerForWasmI64x2Mul(MWasmBinarySimd128* ins,
MDefinition* lhs, MDefinition* rhs) {
LAllocation lhsDestAlloc = useRegisterAtStart(lhs);
LAllocation rhsAlloc =
lhs != rhs ? useRegister(rhs) : useRegisterAtStart(rhs);
auto* lir = new (alloc()) LWasmI64x2Mul(lhsDestAlloc, rhsAlloc, tempInt64());
defineReuseInput(lir, ins, LWasmI64x2Mul::LhsDest);
}


@@ -28,8 +28,6 @@ class LIRGeneratorX64 : public LIRGeneratorX86Shared {
MDefinition* mir, MDefinition* lhs, MDefinition* rhs);
void lowerForMulInt64(LMulI64* ins, MMul* mir, MDefinition* lhs,
MDefinition* rhs);
void lowerForWasmI64x2Mul(MWasmBinarySimd128* ins, MDefinition* lhs,
MDefinition* rhs);
// Returns a box allocation. reg2 is ignored on 64-bit platforms.
LBoxAllocation useBoxFixed(MDefinition* mir, Register reg1, Register,


@@ -767,22 +767,6 @@ void MacroAssembler::anyTrueSimd128(FloatRegister src, Register dest) {
cmovCCl(NonZero, one, dest);
}
// Integer Multiply
void MacroAssembler::mulInt64x2(FloatRegister rhs, FloatRegister lhsDest,
Register64 temp) {
ScratchRegisterScope t1(*this);
Register t2 = temp.reg;
vpextrq(0, lhsDest, t1);
vpextrq(0, rhs, t2);
imulq(t2, t1);
vpinsrq(0, t1, lhsDest, lhsDest);
vpextrq(1, lhsDest, t1);
vpextrq(1, rhs, t2);
imulq(t2, t1);
vpinsrq(1, t1, lhsDest, lhsDest);
}
// Extract lane as scalar
void MacroAssembler::extractLaneInt64x2(uint32_t lane, FloatRegister src,


@@ -2452,6 +2452,9 @@ void CodeGenerator::visitWasmBinarySimd128(LWasmBinarySimd128* ins) {
case wasm::SimdOp::I64x2Sub:
masm.subInt64x2(rhs, lhsDest);
break;
case wasm::SimdOp::I64x2Mul:
masm.mulInt64x2(rhs, lhsDest, temp1);
break;
case wasm::SimdOp::F32x4Add:
masm.addFloat32x4(rhs, lhsDest);
break;


@@ -361,43 +361,6 @@ class LWasmBinarySimd128 : public LInstructionHelper<1, 2, 2> {
wasm::SimdOp simdOp() const { return mir_->toWasmBinarySimd128()->simdOp(); }
};
// (v128, v128) -> v128 effect-free operations for i64x2.mul
// lhs and dest are the same.
// x64: one i64 temp.
// x86: two i64 temps and one i32 temp
class LWasmI64x2Mul : public LInstructionHelper<1, 2, INT64_PIECES * 2 + 1> {
public:
LIR_HEADER(WasmI64x2Mul)
static constexpr uint32_t LhsDest = 0;
static constexpr uint32_t Rhs = 1;
LWasmI64x2Mul(const LAllocation& lhsDest, const LAllocation& rhs,
const LInt64Definition& temp)
: LInstructionHelper(classOpcode) {
setOperand(LhsDest, lhsDest);
setOperand(Rhs, rhs);
setInt64Temp(0, temp);
}
LWasmI64x2Mul(const LAllocation& lhsDest, const LAllocation& rhs,
const LInt64Definition& temp1, const LInt64Definition& temp2,
const LDefinition& temp3)
: LInstructionHelper(classOpcode) {
setOperand(LhsDest, lhsDest);
setOperand(Rhs, rhs);
setInt64Temp(0, temp1);
setInt64Temp(INT64_PIECES, temp2);
setTemp(INT64_PIECES * 2, temp3);
}
const LAllocation* lhsDest() { return getOperand(LhsDest); }
const LAllocation* rhs() { return getOperand(Rhs); }
const LInt64Definition temp1() { return getInt64Temp(0); }
const LInt64Definition temp2() { return getInt64Temp(INT64_PIECES); }
const LDefinition* temp3() { return getTemp(INT64_PIECES * 2); }
};
// (v128, i32) -> v128 effect-free variable-width shift operations
// lhs and dest are the same.
// temp0 is a GPR (if in use).


@@ -736,6 +736,7 @@ void LIRGenerator::visitWasmBinarySimd128(MWasmBinarySimd128* ins) {
}
case wasm::SimdOp::F32x4Max:
case wasm::SimdOp::F64x2Max:
case wasm::SimdOp::I64x2Mul:
case wasm::SimdOp::V8x16Swizzle:
tempReg0 = tempSimd128();
break;
@@ -758,16 +759,12 @@ void LIRGenerator::visitWasmBinarySimd128(MWasmBinarySimd128* ins) {
break;
}
if (ins->simdOp() == wasm::SimdOp::I64x2Mul) {
lowerForWasmI64x2Mul(ins, lhs, rhs);
} else {
LAllocation lhsDestAlloc = useRegisterAtStart(lhs);
LAllocation rhsAlloc =
lhs != rhs ? useRegister(rhs) : useRegisterAtStart(rhs);
auto* lir = new (alloc())
LWasmBinarySimd128(lhsDestAlloc, rhsAlloc, tempReg0, tempReg1);
defineReuseInput(lir, ins, LWasmBinarySimd128::LhsDest);
}
LAllocation lhsDestAlloc = useRegisterAtStart(lhs);
LAllocation rhsAlloc =
lhs != rhs ? useRegister(rhs) : useRegisterAtStart(rhs);
auto* lir = new (alloc())
LWasmBinarySimd128(lhsDestAlloc, rhsAlloc, tempReg0, tempReg1);
defineReuseInput(lir, ins, LWasmBinarySimd128::LhsDest);
}
void LIRGenerator::visitWasmShiftSimd128(MWasmShiftSimd128* ins) {


@@ -1432,6 +1432,28 @@ void MacroAssembler::mulInt32x4(FloatRegister rhs, FloatRegister lhsDest) {
vpmulld(Operand(rhs), lhsDest, lhsDest);
}
void MacroAssembler::mulInt64x2(FloatRegister rhs, FloatRegister lhsDest,
FloatRegister temp) {
ScratchSimd128Scope temp2(*this);
// lhsDest = <D C> <B A>
// rhs = <H G> <F E>
// result = <(DG+CH)_low+CG_high CG_low> <(BE+AF)_low+AE_high AE_low>
moveSimd128(lhsDest, temp); // temp = <D C> <B A>
vpsrlq(Imm32(32), temp, temp); // temp = <0 D> <0 B>
vpmuludq(rhs, temp, temp); // temp = <DG> <BE>
moveSimd128(rhs, temp2); // temp2 = <H G> <F E>
vpsrlq(Imm32(32), temp2, temp2); // temp2 = <0 H> <0 F>
vpmuludq(lhsDest, temp2, temp2); // temp2 = <CH> <AF>
vpaddq(Operand(temp), temp2, temp2); // temp2 = <DG+CH> <BE+AF>
vpsllq(Imm32(32), temp2, temp2); // temp2 = <(DG+CH)_low 0>
// <(BE+AF)_low 0>
vpmuludq(rhs, lhsDest, lhsDest); // lhsDest = <CG_high CG_low>
// <AE_high AE_low>
vpaddq(Operand(temp2), lhsDest, lhsDest); // lhsDest =
// <(DG+CH)_low+CG_high CG_low>
// <(BE+AF)_low+AE_high AE_low>
}
// Integer negate
void MacroAssembler::negInt8x16(FloatRegister src, FloatRegister dest) {
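The lane comments in this hunk can be cross-checked against a scalar transcription of the same instruction sequence. A sketch under the assumption (true of pmuludq) that each vpmuludq multiplies the low 32 bits of every 64-bit lane into a full 64-bit product; mulInt64x2Model is a hypothetical name, not SpiderMonkey code:

#include <cassert>
#include <cstdint>

static void mulInt64x2Model(uint64_t lhsDest[2], const uint64_t rhs[2]) {
  for (int lane = 0; lane < 2; lane++) {
    uint64_t lhs = lhsDest[lane], r = rhs[lane];
    uint64_t temp = lhs >> 32;                            // vpsrlq $32, temp
    temp = (temp & 0xFFFFFFFFu) * (r & 0xFFFFFFFFu);      // vpmuludq rhs, temp
    uint64_t temp2 = r >> 32;                             // vpsrlq $32, temp2
    temp2 = (temp2 & 0xFFFFFFFFu) * (lhs & 0xFFFFFFFFu);  // vpmuludq lhsDest, temp2
    temp2 += temp;                                        // vpaddq temp, temp2
    temp2 <<= 32;                                         // vpsllq $32, temp2
    lhs = (lhs & 0xFFFFFFFFu) * (r & 0xFFFFFFFFu);        // vpmuludq rhs, lhsDest
    lhsDest[lane] = lhs + temp2;                          // vpaddq temp2, lhsDest
  }
}

int main() {
  uint64_t lhs[2] = {0x0000000280000001u, 0xFFFFFFFFFFFFFFFFu};
  uint64_t rhs[2] = {0x0000000300000007u, 0x0000000000000002u};
  uint64_t expect[2] = {lhs[0] * rhs[0], lhs[1] * rhs[1]};  // wrap mod 2^64
  mulInt64x2Model(lhs, rhs);
  assert(lhs[0] == expect[0] && lhs[1] == expect[1]);
  return 0;
}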


@@ -1077,16 +1077,3 @@ void CodeGenerator::visitTestI64AndBranch(LTestI64AndBranch* lir) {
masm.testl(input.low, input.low);
emitBranch(Assembler::NonZero, lir->ifTrue(), lir->ifFalse());
}
void CodeGenerator::visitWasmI64x2Mul(LWasmI64x2Mul* ins) {
#ifdef ENABLE_WASM_SIMD
FloatRegister lhsDest = ToFloatRegister(ins->lhsDest());
FloatRegister rhs = ToFloatRegister(ins->rhs());
Register64 temp1 = ToRegister64(ins->temp1());
Register64 temp2 = ToRegister64(ins->temp2());
Register temp3 = ToRegister(ins->temp3());
masm.mulInt64x2(rhs, lhsDest, temp1, temp2, temp3);
#else
MOZ_CRASH("No SIMD");
#endif
}


@@ -681,14 +681,3 @@ void LIRGenerator::visitSignExtendInt64(MSignExtendInt64* ins) {
LInt64Allocation(LAllocation(AnyRegister(edx)),
LAllocation(AnyRegister(eax))));
}
void LIRGeneratorX86::lowerForWasmI64x2Mul(MWasmBinarySimd128* ins,
MDefinition* lhs, MDefinition* rhs) {
LAllocation lhsDestAlloc = useRegisterAtStart(lhs);
LAllocation rhsAlloc =
lhs != rhs ? useRegister(rhs) : useRegisterAtStart(rhs);
auto* lir = new (alloc())
LWasmI64x2Mul(lhsDestAlloc, rhsAlloc,
tempInt64Fixed(Register64(edx, eax)), tempInt64(), temp());
defineReuseInput(lir, ins, LWasmI64x2Mul::LhsDest);
}


@@ -47,8 +47,6 @@ class LIRGeneratorX86 : public LIRGeneratorX86Shared {
MDefinition* mir, MDefinition* lhs, MDefinition* rhs);
void lowerForMulInt64(LMulI64* ins, MMul* mir, MDefinition* lhs,
MDefinition* rhs);
void lowerForWasmI64x2Mul(MWasmBinarySimd128* ins, MDefinition* lhs,
MDefinition* rhs);
void lowerDivI64(MDiv* div);
void lowerModI64(MMod* mod);


@@ -997,19 +997,6 @@ void MacroAssembler::anyTrueSimd128(FloatRegister src, Register dest) {
bind(&done);
}
void MacroAssembler::mulInt64x2(FloatRegister rhs, FloatRegister lhsDest,
Register64 temp1, Register64 temp2,
Register temp3) {
extractLaneInt64x2(0, lhsDest, temp1);
extractLaneInt64x2(0, rhs, temp2);
mul64(temp2, temp1, temp3);
replaceLaneInt64x2(0, temp1, lhsDest);
extractLaneInt64x2(1, lhsDest, temp1);
extractLaneInt64x2(1, rhs, temp2);
mul64(temp2, temp1, temp3);
replaceLaneInt64x2(1, temp1, lhsDest);
}
void MacroAssembler::extractLaneInt64x2(uint32_t lane, FloatRegister src,
Register64 dest) {
vpextrd(2 * lane, src, dest.low);


@@ -8066,7 +8066,6 @@ class BaseCompiler final : public BaseCompilerInterface {
MOZ_MUST_USE bool emitBitselect();
MOZ_MUST_USE bool emitVectorShuffle();
MOZ_MUST_USE bool emitVectorShiftRightI64x2();
MOZ_MUST_USE bool emitVectorMulI64x2();
#endif
};
@@ -12693,6 +12692,11 @@ static void MulF32x4(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
masm.mulFloat32x4(rs, rsd);
}
static void MulI64x2(MacroAssembler& masm, RegV128 rs, RegV128 rsd,
RegV128 temp) {
masm.mulInt64x2(rs, rsd, temp);
}
static void MulF64x2(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
masm.mulFloat64x2(rs, rsd);
}
@@ -13415,41 +13419,6 @@ bool BaseCompiler::emitVectorShiftRightI64x2() {
return true;
}
// Must be scalarized on x86/x64 and requires different temp regs on the
// two architectures.
bool BaseCompiler::emitVectorMulI64x2() {
Nothing unused_a, unused_b;
if (!iter_.readBinary(ValType::V128, &unused_a, &unused_b)) {
return false;
}
if (deadCode_) {
return true;
}
# if defined(JS_CODEGEN_X64)
emitVectorBinopWithTemp<RegI64>(
[](MacroAssembler& masm, RegV128 rs, RegV128 rsd, RegI64 temp) {
masm.mulInt64x2(rs, rsd, temp);
});
# elif defined(JS_CODEGEN_X86)
RegV128 r, rs;
pop2xV128(&r, &rs);
needI64(specific_.edx_eax);
RegI64 temp1 = specific_.edx_eax;
RegI64 temp2 = needI64();
ScratchI32 temp3(*this);
masm.mulInt64x2(rs, r, temp1, temp2, temp3);
freeV128(rs);
freeI64(temp1);
freeI64(temp2);
pushV128(r);
# else
MOZ_CRASH("No porting API for MulI64x2");
# endif
return true;
}
#endif
bool BaseCompiler::emitBody() {
@@ -14463,7 +14432,7 @@ bool BaseCompiler::emitBody() {
case uint32_t(SimdOp::I64x2Sub):
CHECK_NEXT(dispatchVectorBinary(emitVectorBinop, SubI64x2));
case uint32_t(SimdOp::I64x2Mul):
CHECK_NEXT(emitVectorMulI64x2());
CHECK_NEXT(dispatchVectorBinary(emitVectorBinopWithTemp, MulI64x2));
case uint32_t(SimdOp::F32x4Add):
CHECK_NEXT(dispatchVectorBinary(emitVectorBinop, AddF32x4));
case uint32_t(SimdOp::F32x4Sub):