зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1750049 - [wasm] Use AVX2 ops for splat instructions. r=jseward
Implements splat and loadXX_splat wasm simd instructions for AVX2. Differential Revision: https://phabricator.services.mozilla.com/D136034
This commit is contained in:
Родитель
1cb735a63a
Коммит
1ff25f1ff2
|
@ -23,6 +23,20 @@ function codegenTestX64_v128xv128_v128_avxhack(inputs, options = {}) {
|
|||
}
|
||||
}
|
||||
|
||||
// Utility function to test SIMD operations encoding, where the input argument
|
||||
// has the specified type (T).
|
||||
// inputs: [[type, complete-opname, expected-pattern], ...]
|
||||
// Runs one codegen check per entry of `inputs`.
//   inputs:  iterable of [ty, op, expected] triples; `ty` is spliced in as the
//            parameter type and `op` as the operation applied to local 0 of a
//            function exported as "f" with result type v128.
//   options: forwarded unchanged both to wrap() and to codegenTestX64_adhoc();
//            defaults to an empty object.
// The function text is passed through wrap(options, ...) and the result is
// handed to codegenTestX64_adhoc together with 'f', the expected pattern and
// the options. Nothing is returned.
function codegenTestX64_T_v128_avxhack(inputs, options = {}) {
|
||||
for ( let [ty, op, expected] of inputs ) {
|
||||
codegenTestX64_adhoc(wrap(options, `
|
||||
(func (export "f") (param ${ty}) (result v128)
|
||||
(${op} (local.get 0)))`),
|
||||
'f',
|
||||
expected,
|
||||
options);
|
||||
}
|
||||
}
|
||||
|
||||
// Simple binary ops: e.g. add, sub, mul
|
||||
codegenTestX64_v128xv128_v128_avxhack(
|
||||
[['i32x4.add', `c5 f1 fe c2 vpaddd %xmm2, %xmm1, %xmm0`],
|
||||
|
@ -70,3 +84,25 @@ codegenTestX64_adhoc(`(module
|
|||
c4 .. f1 22 .. 01 vpinsrq \\$0x01, %r\\w+, %xmm1, %xmm0` ); // rdi (Linux) or r8 (Win)
|
||||
|
||||
|
||||
// Everything in this block is skipped unless isAvxPresent(2) returns true
// (presumably "AVX2 is available" -- confirm against the helper's definition).
if (isAvxPresent(2)) {
|
||||
// First i32 arg is: edi on Linux, and ecx on Windows.
|
||||
// Splat from a scalar argument. The three integer cases expect a vmovd of the
// argument register into xmm0 followed by vpbroadcastb/w/d on xmm0; the f32
// case expects a single vbroadcastss on xmm0.
codegenTestX64_T_v128_avxhack(
|
||||
[['i32', 'i8x16.splat', `
|
||||
c5 f9 6e .. vmovd %e\\w+, %xmm0
|
||||
c4 e2 79 78 c0 vpbroadcastb %xmm0, %xmm0`],
|
||||
['i32', 'i16x8.splat', `
|
||||
c5 f9 6e .. vmovd %e\\w+, %xmm0
|
||||
c4 e2 79 79 c0 vpbroadcastw %xmm0, %xmm0`],
|
||||
['i32', 'i32x4.splat', `
|
||||
c5 f9 6e .. vmovd %e\\w+, %xmm0
|
||||
c4 e2 79 58 c0 vpbroadcastd %xmm0, %xmm0`],
|
||||
['f32', 'f32x4.splat', `c4 e2 79 18 c0 vbroadcastss %xmm0, %xmm0`]]);
|
||||
|
||||
// Load-and-splat from memory (run with options {memory: 1}). Each case expects
// one broadcast instruction whose source is the memory operand (%r15,%r..,1).
// NOTE(review): the mnemonics 'vpbroadcastbb', 'vpbroadcastww' and
// 'vbroadcastssl' look like typos but presumably carry an operand-size suffix
// added by the disassembler -- confirm before "fixing" them.
codegenTestX64_T_v128_avxhack(
|
||||
[['i32', 'v128.load8_splat',
|
||||
'c4 c2 79 78 04 .. vpbroadcastbb \\(%r15,%r\\w+,1\\), %xmm0'],
|
||||
['i32', 'v128.load16_splat',
|
||||
'c4 c2 79 79 04 .. vpbroadcastww \\(%r15,%r\\w+,1\\), %xmm0'],
|
||||
['i32', 'v128.load32_splat',
|
||||
'c4 c2 79 18 04 .. vbroadcastssl \\(%r15,%r\\w+,1\\), %xmm0']], {memory: 1});
|
||||
}
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
|
||||
codegenTestX64_PTYPE_v128(
|
||||
[['f32x4.splat', 'f32', `0f c6 c0 00 shufps \\$0x00, %xmm0, %xmm0`],
|
||||
['f64x2.splat', 'f64', `66 0f c6 c0 00 shufpd \\$0x00, %xmm0, %xmm0`]] );
|
||||
['f64x2.splat', 'f64', `f2 0f 12 c0 movddup %xmm0, %xmm0`]] , {log:true});
|
||||
|
||||
// Skip these on Win64 because the ABI differs and there's a different parameter
|
||||
// register, this changes not just the name slightly but the binary encoding in
|
||||
|
|
|
@ -562,7 +562,8 @@ class MemoryAccessDesc {
|
|||
}
|
||||
|
||||
void setSplatSimd128Load() {
|
||||
MOZ_ASSERT(type() == Scalar::Float64);
|
||||
MOZ_ASSERT(type() == Scalar::Uint8 || type() == Scalar::Uint16 ||
|
||||
type() == Scalar::Float32 || type() == Scalar::Float64);
|
||||
MOZ_ASSERT(!isAtomic());
|
||||
MOZ_ASSERT(loadOp_ == Plain);
|
||||
loadOp_ = Splat;
|
||||
|
|
|
@ -927,7 +927,10 @@ void MacroAssembler::wasmLoad(const wasm::MemoryAccessDesc& access,
|
|||
MOZ_ASSERT_IF(
|
||||
access.isZeroExtendSimd128Load(),
|
||||
access.type() == Scalar::Float32 || access.type() == Scalar::Float64);
|
||||
MOZ_ASSERT_IF(access.isSplatSimd128Load(), access.type() == Scalar::Float64);
|
||||
MOZ_ASSERT_IF(
|
||||
access.isSplatSimd128Load(),
|
||||
access.type() == Scalar::Uint8 || access.type() == Scalar::Uint16 ||
|
||||
access.type() == Scalar::Float32 || access.type() == Scalar::Float64);
|
||||
MOZ_ASSERT_IF(access.isWidenSimd128Load(), access.type() == Scalar::Float64);
|
||||
|
||||
append(access, size());
|
||||
|
@ -936,21 +939,33 @@ void MacroAssembler::wasmLoad(const wasm::MemoryAccessDesc& access,
|
|||
movsbl(srcAddr, out.gpr());
|
||||
break;
|
||||
case Scalar::Uint8:
|
||||
movzbl(srcAddr, out.gpr());
|
||||
if (access.isSplatSimd128Load()) {
|
||||
vbroadcastb(srcAddr, out.fpu());
|
||||
} else {
|
||||
movzbl(srcAddr, out.gpr());
|
||||
}
|
||||
break;
|
||||
case Scalar::Int16:
|
||||
movswl(srcAddr, out.gpr());
|
||||
break;
|
||||
case Scalar::Uint16:
|
||||
movzwl(srcAddr, out.gpr());
|
||||
if (access.isSplatSimd128Load()) {
|
||||
vbroadcastw(srcAddr, out.fpu());
|
||||
} else {
|
||||
movzwl(srcAddr, out.gpr());
|
||||
}
|
||||
break;
|
||||
case Scalar::Int32:
|
||||
case Scalar::Uint32:
|
||||
movl(srcAddr, out.gpr());
|
||||
break;
|
||||
case Scalar::Float32:
|
||||
// vmovss does the right thing also for access.isZeroExtendSimd128Load()
|
||||
vmovss(srcAddr, out.fpu());
|
||||
if (access.isSplatSimd128Load()) {
|
||||
vbroadcastss(srcAddr, out.fpu());
|
||||
} else {
|
||||
// vmovss does the right thing also for access.isZeroExtendSimd128Load()
|
||||
vmovss(srcAddr, out.fpu());
|
||||
}
|
||||
break;
|
||||
case Scalar::Float64:
|
||||
if (access.isSplatSimd128Load()) {
|
||||
|
|
|
@ -4678,6 +4678,92 @@ class AssemblerX86Shared : public AssemblerShared {
|
|||
}
|
||||
}
|
||||
|
||||
void vbroadcastb(const Operand& src, FloatRegister dest) {
|
||||
MOZ_ASSERT(HasAVX2());
|
||||
switch (src.kind()) {
|
||||
case Operand::FPREG:
|
||||
masm.vbroadcastb_rr(src.fpu(), dest.encoding());
|
||||
break;
|
||||
case Operand::MEM_REG_DISP:
|
||||
masm.vbroadcastb_mr(src.disp(), src.base(), dest.encoding());
|
||||
break;
|
||||
case Operand::MEM_SCALE:
|
||||
masm.vbroadcastb_mr(src.disp(), src.base(), src.index(), src.scale(),
|
||||
dest.encoding());
|
||||
break;
|
||||
default:
|
||||
MOZ_CRASH("unexpected operand kind");
|
||||
}
|
||||
}
|
||||
void vbroadcastw(const Operand& src, FloatRegister dest) {
|
||||
MOZ_ASSERT(HasAVX2());
|
||||
switch (src.kind()) {
|
||||
case Operand::FPREG:
|
||||
masm.vbroadcastw_rr(src.fpu(), dest.encoding());
|
||||
break;
|
||||
case Operand::MEM_REG_DISP:
|
||||
masm.vbroadcastw_mr(src.disp(), src.base(), dest.encoding());
|
||||
break;
|
||||
case Operand::MEM_SCALE:
|
||||
masm.vbroadcastw_mr(src.disp(), src.base(), src.index(), src.scale(),
|
||||
dest.encoding());
|
||||
break;
|
||||
default:
|
||||
MOZ_CRASH("unexpected operand kind");
|
||||
}
|
||||
}
|
||||
void vbroadcastd(const Operand& src, FloatRegister dest) {
|
||||
MOZ_ASSERT(HasAVX2());
|
||||
switch (src.kind()) {
|
||||
case Operand::FPREG:
|
||||
masm.vbroadcastd_rr(src.fpu(), dest.encoding());
|
||||
break;
|
||||
case Operand::MEM_REG_DISP:
|
||||
masm.vbroadcastd_mr(src.disp(), src.base(), dest.encoding());
|
||||
break;
|
||||
case Operand::MEM_SCALE:
|
||||
masm.vbroadcastd_mr(src.disp(), src.base(), src.index(), src.scale(),
|
||||
dest.encoding());
|
||||
break;
|
||||
default:
|
||||
MOZ_CRASH("unexpected operand kind");
|
||||
}
|
||||
}
|
||||
void vbroadcastq(const Operand& src, FloatRegister dest) {
|
||||
MOZ_ASSERT(HasAVX2());
|
||||
switch (src.kind()) {
|
||||
case Operand::FPREG:
|
||||
masm.vbroadcastq_rr(src.fpu(), dest.encoding());
|
||||
break;
|
||||
case Operand::MEM_REG_DISP:
|
||||
masm.vbroadcastq_mr(src.disp(), src.base(), dest.encoding());
|
||||
break;
|
||||
case Operand::MEM_SCALE:
|
||||
masm.vbroadcastq_mr(src.disp(), src.base(), src.index(), src.scale(),
|
||||
dest.encoding());
|
||||
break;
|
||||
default:
|
||||
MOZ_CRASH("unexpected operand kind");
|
||||
}
|
||||
}
|
||||
void vbroadcastss(const Operand& src, FloatRegister dest) {
|
||||
MOZ_ASSERT(HasAVX2());
|
||||
switch (src.kind()) {
|
||||
case Operand::FPREG:
|
||||
masm.vbroadcastss_rr(src.fpu(), dest.encoding());
|
||||
break;
|
||||
case Operand::MEM_REG_DISP:
|
||||
masm.vbroadcastss_mr(src.disp(), src.base(), dest.encoding());
|
||||
break;
|
||||
case Operand::MEM_SCALE:
|
||||
masm.vbroadcastss_mr(src.disp(), src.base(), src.index(), src.scale(),
|
||||
dest.encoding());
|
||||
break;
|
||||
default:
|
||||
MOZ_CRASH("unexpected operand kind");
|
||||
}
|
||||
}
|
||||
|
||||
void flushBuffer() {}
|
||||
|
||||
// Patching.
|
||||
|
|
|
@ -4271,6 +4271,72 @@ class BaseAssembler : public GenericAssembler {
|
|||
twoByteOpSimd("vpsubq", VEX_PD, OP2_PSUBQ_VdqWdq, src1, src0, dst);
|
||||
}
|
||||
|
||||
// Register form of vbroadcastb: source XMM register |src|, destination |dst|.
// Encoded through threeByteOpSimd with VEX_PD, the 0F 38 escape and opcode
// OP3_VBROADCASTB_VxWx. invalid_xmm fills the remaining XMM-register argument
// (presumably: no additional source register).
void vbroadcastb_rr(XMMRegisterID src, XMMRegisterID dst) {
  threeByteOpSimd("vbroadcastb", VEX_PD, OP3_VBROADCASTB_VxWx, ESCAPE_38,
                  src, invalid_xmm, dst);
}
|
||||
// Memory form of vbroadcastb with a base+displacement address
// (|offset| relative to |base|), destination |dst|. invalid_xmm fills the
// remaining XMM-register argument (presumably: no additional source register).
void vbroadcastb_mr(int32_t offset, RegisterID base, XMMRegisterID dst) {
  threeByteOpSimd("vbroadcastb", VEX_PD, OP3_VBROADCASTB_VxWx, ESCAPE_38,
                  offset, base,
                  invalid_xmm, dst);
}
|
||||
// Memory form of vbroadcastb with an indexed address described by |offset|,
// |base|, |index| and |scale|, destination |dst|. invalid_xmm fills the
// remaining XMM-register argument (presumably: no additional source register).
void vbroadcastb_mr(int32_t offset, RegisterID base, RegisterID index,
                    int32_t scale, XMMRegisterID dst) {
  threeByteOpSimd("vbroadcastb", VEX_PD, OP3_VBROADCASTB_VxWx, ESCAPE_38,
                  offset, base, index, scale,
                  invalid_xmm, dst);
}
|
||||
// Register form of vbroadcastw: source XMM register |src|, destination |dst|.
// Encoded through threeByteOpSimd with VEX_PD, the 0F 38 escape and opcode
// OP3_VBROADCASTW_VxWx. invalid_xmm fills the remaining XMM-register argument
// (presumably: no additional source register).
void vbroadcastw_rr(XMMRegisterID src, XMMRegisterID dst) {
  threeByteOpSimd("vbroadcastw", VEX_PD, OP3_VBROADCASTW_VxWx, ESCAPE_38,
                  src, invalid_xmm, dst);
}
|
||||
// Memory form of vbroadcastw with a base+displacement address
// (|offset| relative to |base|), destination |dst|. invalid_xmm fills the
// remaining XMM-register argument (presumably: no additional source register).
void vbroadcastw_mr(int32_t offset, RegisterID base, XMMRegisterID dst) {
  threeByteOpSimd("vbroadcastw", VEX_PD, OP3_VBROADCASTW_VxWx, ESCAPE_38,
                  offset, base,
                  invalid_xmm, dst);
}
|
||||
// Memory form of vbroadcastw with an indexed address described by |offset|,
// |base|, |index| and |scale|, destination |dst|. invalid_xmm fills the
// remaining XMM-register argument (presumably: no additional source register).
void vbroadcastw_mr(int32_t offset, RegisterID base, RegisterID index,
                    int32_t scale, XMMRegisterID dst) {
  threeByteOpSimd("vbroadcastw", VEX_PD, OP3_VBROADCASTW_VxWx, ESCAPE_38,
                  offset, base, index, scale,
                  invalid_xmm, dst);
}
|
||||
// Register form of vbroadcastd: source XMM register |src|, destination |dst|.
// Encoded through threeByteOpSimd with VEX_PD, the 0F 38 escape and opcode
// OP3_VBROADCASTD_VxWx. invalid_xmm fills the remaining XMM-register argument
// (presumably: no additional source register).
void vbroadcastd_rr(XMMRegisterID src, XMMRegisterID dst) {
  threeByteOpSimd("vbroadcastd", VEX_PD, OP3_VBROADCASTD_VxWx, ESCAPE_38,
                  src, invalid_xmm, dst);
}
|
||||
// Memory form of vbroadcastd with a base+displacement address
// (|offset| relative to |base|), destination |dst|. invalid_xmm fills the
// remaining XMM-register argument (presumably: no additional source register).
void vbroadcastd_mr(int32_t offset, RegisterID base, XMMRegisterID dst) {
  threeByteOpSimd("vbroadcastd", VEX_PD, OP3_VBROADCASTD_VxWx, ESCAPE_38,
                  offset, base,
                  invalid_xmm, dst);
}
|
||||
// Memory form of vbroadcastd with an indexed address described by |offset|,
// |base|, |index| and |scale|, destination |dst|. invalid_xmm fills the
// remaining XMM-register argument (presumably: no additional source register).
void vbroadcastd_mr(int32_t offset, RegisterID base, RegisterID index,
                    int32_t scale, XMMRegisterID dst) {
  threeByteOpSimd("vbroadcastd", VEX_PD, OP3_VBROADCASTD_VxWx, ESCAPE_38,
                  offset, base, index, scale,
                  invalid_xmm, dst);
}
|
||||
// Register form of vbroadcastq: source XMM register |src|, destination |dst|.
// Encoded through threeByteOpSimd with VEX_PD, the 0F 38 escape and opcode
// OP3_VBROADCASTQ_VxWx. invalid_xmm fills the remaining XMM-register argument
// (presumably: no additional source register).
void vbroadcastq_rr(XMMRegisterID src, XMMRegisterID dst) {
  threeByteOpSimd("vbroadcastq", VEX_PD, OP3_VBROADCASTQ_VxWx, ESCAPE_38,
                  src, invalid_xmm, dst);
}
|
||||
// Memory form of vbroadcastq with a base+displacement address
// (|offset| relative to |base|), destination |dst|. invalid_xmm fills the
// remaining XMM-register argument (presumably: no additional source register).
void vbroadcastq_mr(int32_t offset, RegisterID base, XMMRegisterID dst) {
  threeByteOpSimd("vbroadcastq", VEX_PD, OP3_VBROADCASTQ_VxWx, ESCAPE_38,
                  offset, base,
                  invalid_xmm, dst);
}
|
||||
// Memory form of vbroadcastq with an indexed address described by |offset|,
// |base|, |index| and |scale|, destination |dst|. invalid_xmm fills the
// remaining XMM-register argument (presumably: no additional source register).
void vbroadcastq_mr(int32_t offset, RegisterID base, RegisterID index,
                    int32_t scale, XMMRegisterID dst) {
  threeByteOpSimd("vbroadcastq", VEX_PD, OP3_VBROADCASTQ_VxWx, ESCAPE_38,
                  offset, base, index, scale,
                  invalid_xmm, dst);
}
|
||||
// Register form of vbroadcastss: source XMM register |src|, destination |dst|.
// Encoded through threeByteOpSimd with VEX_PD, the 0F 38 escape and opcode
// OP3_VBROADCASTSS_VxWd. invalid_xmm fills the remaining XMM-register argument
// (presumably: no additional source register).
void vbroadcastss_rr(XMMRegisterID src, XMMRegisterID dst) {
  threeByteOpSimd("vbroadcastss", VEX_PD, OP3_VBROADCASTSS_VxWd, ESCAPE_38,
                  src, invalid_xmm, dst);
}
|
||||
// Memory form of vbroadcastss with a base+displacement address
// (|offset| relative to |base|), destination |dst|. invalid_xmm fills the
// remaining XMM-register argument (presumably: no additional source register).
void vbroadcastss_mr(int32_t offset, RegisterID base, XMMRegisterID dst) {
  threeByteOpSimd("vbroadcastss", VEX_PD, OP3_VBROADCASTSS_VxWd, ESCAPE_38,
                  offset, base,
                  invalid_xmm, dst);
}
|
||||
// Memory form of vbroadcastss with an indexed address described by |offset|,
// |base|, |index| and |scale|, destination |dst|. invalid_xmm fills the
// remaining XMM-register argument (presumably: no additional source register).
void vbroadcastss_mr(int32_t offset, RegisterID base, RegisterID index,
                     int32_t scale, XMMRegisterID dst) {
  threeByteOpSimd("vbroadcastss", VEX_PD, OP3_VBROADCASTSS_VxWd, ESCAPE_38,
                  offset, base, index, scale,
                  invalid_xmm, dst);
}
|
||||
|
||||
// BMI instructions:
|
||||
|
||||
void sarxl_rrr(RegisterID src, RegisterID shift, RegisterID dst) {
|
||||
|
|
|
@ -360,6 +360,7 @@ enum ThreeByteOpcodeID {
|
|||
OP3_PEXTRQ_EvVdqIb = 0x16,
|
||||
OP3_PTEST_VdVd = 0x17,
|
||||
OP3_EXTRACTPS_EdVdqIb = 0x17,
|
||||
OP3_VBROADCASTSS_VxWd = 0x18,
|
||||
OP3_PABSB_VdqWdq = 0x1C,
|
||||
OP3_PABSW_VdqWdq = 0x1D,
|
||||
OP3_PABSD_VdqWdq = 0x1E,
|
||||
|
@ -387,6 +388,10 @@ enum ThreeByteOpcodeID {
|
|||
OP3_PMAXUD_VdqWdq = 0x3F,
|
||||
OP3_PMULLD_VdqWdq = 0x40,
|
||||
OP3_VBLENDVPS_VdqWdq = 0x4A,
|
||||
OP3_VBROADCASTD_VxWx = 0x58,
|
||||
OP3_VBROADCASTQ_VxWx = 0x59,
|
||||
OP3_VBROADCASTB_VxWx = 0x78,
|
||||
OP3_VBROADCASTW_VxWx = 0x79,
|
||||
OP3_SHLX_GyEyBy = 0xF7,
|
||||
OP3_SARX_GyEyBy = 0xF7,
|
||||
OP3_SHRX_GyEyBy = 0xF7,
|
||||
|
|
|
@ -21,24 +21,40 @@ void MacroAssemblerX86Shared::splatX16(Register input, FloatRegister output) {
|
|||
ScratchSimd128Scope scratch(asMasm());
|
||||
|
||||
vmovd(input, output);
|
||||
if (HasAVX2()) {
|
||||
vbroadcastb(Operand(output), output);
|
||||
return;
|
||||
}
|
||||
zeroSimd128Int(scratch);
|
||||
vpshufb(scratch, output, output);
|
||||
}
|
||||
|
||||
// Emits the splatX8 sequence for a general-purpose |input| register.
// The value is first moved into |output| with vmovd. With AVX2 it is then
// replicated by a single vbroadcastw on |output|; without AVX2 the sequence is
// vpshuflw followed by vpshufd, both with immediate 0.
void MacroAssemblerX86Shared::splatX8(Register input, FloatRegister output) {
  vmovd(input, output);
  if (HasAVX2()) {
    vbroadcastw(Operand(output), output);
  } else {
    vpshuflw(0, output, output);
    vpshufd(0, output, output);
  }
}
|
||||
|
||||
// Emits the splatX4 sequence for a general-purpose |input| register.
// The value is first moved into |output| with vmovd. With AVX2 it is then
// replicated by a single vbroadcastd on |output|; without AVX2 a vpshufd with
// immediate 0 is used instead.
void MacroAssemblerX86Shared::splatX4(Register input, FloatRegister output) {
  vmovd(input, output);
  if (HasAVX2()) {
    vbroadcastd(Operand(output), output);
  } else {
    vpshufd(0, output, output);
  }
}
|
||||
|
||||
// Emits the splatX4 sequence for a floating-point |input| register.
// |input| must be a single-precision register and |output| a SIMD128 register
// (asserted). With AVX2 a single vbroadcastss reads |input| directly; without
// AVX2 |input| is first copied into |output| via moveSimd128Float and then
// shuffled with vshufps, immediate 0.
void MacroAssemblerX86Shared::splatX4(FloatRegister input,
                                      FloatRegister output) {
  MOZ_ASSERT(input.isSingle() && output.isSimd128());
  if (HasAVX2()) {
    vbroadcastss(Operand(input), output);
  } else {
    asMasm().moveSimd128Float(input.asSimd128(), output);
    vshufps(0, output, output, output);
  }
}
|
||||
|
@ -46,8 +62,7 @@ void MacroAssemblerX86Shared::splatX4(FloatRegister input,
|
|||
void MacroAssemblerX86Shared::splatX2(FloatRegister input,
|
||||
FloatRegister output) {
|
||||
MOZ_ASSERT(input.isDouble() && output.isSimd128());
|
||||
asMasm().moveSimd128Float(input.asSimd128(), output);
|
||||
vshufpd(0, output, output, output);
|
||||
vmovddup(Operand(input.asSimd128()), output);
|
||||
}
|
||||
|
||||
void MacroAssemblerX86Shared::extractLaneInt32x4(FloatRegister input,
|
||||
|
|
|
@ -952,7 +952,10 @@ void MacroAssembler::wasmLoad(const wasm::MemoryAccessDesc& access,
|
|||
MOZ_ASSERT_IF(
|
||||
access.isZeroExtendSimd128Load(),
|
||||
access.type() == Scalar::Float32 || access.type() == Scalar::Float64);
|
||||
MOZ_ASSERT_IF(access.isSplatSimd128Load(), access.type() == Scalar::Float64);
|
||||
MOZ_ASSERT_IF(
|
||||
access.isSplatSimd128Load(),
|
||||
access.type() == Scalar::Uint8 || access.type() == Scalar::Uint16 ||
|
||||
access.type() == Scalar::Float32 || access.type() == Scalar::Float64);
|
||||
MOZ_ASSERT_IF(access.isWidenSimd128Load(), access.type() == Scalar::Float64);
|
||||
|
||||
memoryBarrierBefore(access.sync());
|
||||
|
@ -963,21 +966,33 @@ void MacroAssembler::wasmLoad(const wasm::MemoryAccessDesc& access,
|
|||
movsbl(srcAddr, out.gpr());
|
||||
break;
|
||||
case Scalar::Uint8:
|
||||
movzbl(srcAddr, out.gpr());
|
||||
if (access.isSplatSimd128Load()) {
|
||||
vbroadcastb(srcAddr, out.fpu());
|
||||
} else {
|
||||
movzbl(srcAddr, out.gpr());
|
||||
}
|
||||
break;
|
||||
case Scalar::Int16:
|
||||
movswl(srcAddr, out.gpr());
|
||||
break;
|
||||
case Scalar::Uint16:
|
||||
movzwl(srcAddr, out.gpr());
|
||||
if (access.isSplatSimd128Load()) {
|
||||
vbroadcastw(srcAddr, out.fpu());
|
||||
} else {
|
||||
movzwl(srcAddr, out.gpr());
|
||||
}
|
||||
break;
|
||||
case Scalar::Int32:
|
||||
case Scalar::Uint32:
|
||||
movl(srcAddr, out.gpr());
|
||||
break;
|
||||
case Scalar::Float32:
|
||||
// vmovss does the right thing also for access.isZeroExtendSimd128Load()
|
||||
vmovss(srcAddr, out.fpu());
|
||||
if (access.isSplatSimd128Load()) {
|
||||
vbroadcastss(srcAddr, out.fpu());
|
||||
} else {
|
||||
// vmovss does the right thing also for access.isZeroExtendSimd128Load()
|
||||
vmovss(srcAddr, out.fpu());
|
||||
}
|
||||
break;
|
||||
case Scalar::Float64:
|
||||
if (access.isSplatSimd128Load()) {
|
||||
|
|
|
@ -1607,7 +1607,14 @@ class FunctionCompiler {
|
|||
bytecodeIfNotAsmJS());
|
||||
|
||||
// Generate better code (on x86)
|
||||
if (viewType == Scalar::Float64) {
|
||||
// If AVX2 is enabled, more broadcast operators are available.
|
||||
if (viewType == Scalar::Float64
|
||||
# if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_X86)
|
||||
|| (js::jit::CPUInfo::IsAVX2Present() &&
|
||||
(viewType == Scalar::Uint8 || viewType == Scalar::Uint16 ||
|
||||
viewType == Scalar::Float32))
|
||||
# endif
|
||||
) {
|
||||
access.setSplatSimd128Load();
|
||||
return load(addr.base, &access, ValType::V128);
|
||||
}
|
||||
|
|
Загрузка…
Ссылка в новой задаче