Bug 1750049 - [wasm] Use AVX2 ops for splat instructions. r=jseward

Implements splat and loadXX_splat wasm simd instructions for AVX2.

Differential Revision: https://phabricator.services.mozilla.com/D136034
This commit is contained in:
Yury Delendik 2022-01-20 13:41:47 +00:00
Родитель 1cb735a63a
Коммит 1ff25f1ff2
10 изменённых файлов: 261 добавлений и 15 удалений

Просмотреть файл

@ -23,6 +23,20 @@ function codegenTestX64_v128xv128_v128_avxhack(inputs, options = {}) {
}
}
// Checks the x64 encoding of SIMD operations that take a single argument of
// the specified type (T) and produce a v128.
// inputs: [[type, complete-opname, expected-pattern], ...]; every triple is
//         checked on its own.
// options: passed both to wrap() and to codegenTestX64_adhoc().
function codegenTestX64_T_v128_avxhack(inputs, options = {}) {
  for (const entry of inputs) {
    const [ty, op, expected] = entry;
    // Exported function "f" applies the operation to its only parameter.
    const moduleText = wrap(options, `
(func (export "f") (param ${ty}) (result v128)
(${op} (local.get 0)))`);
    codegenTestX64_adhoc(moduleText, 'f', expected, options);
  }
}
// Simple binary ops: e.g. add, sub, mul
codegenTestX64_v128xv128_v128_avxhack(
[['i32x4.add', `c5 f1 fe c2 vpaddd %xmm2, %xmm1, %xmm0`],
@ -70,3 +84,25 @@ codegenTestX64_adhoc(`(module
c4 .. f1 22 .. 01 vpinsrq \\$0x01, %r\\w+, %xmm1, %xmm0` ); // rdi (Linux) or r8 (Win)
// The expectations below are only checked when isAvxPresent(2) is true
// (presumably "AVX2 available" -- confirm against the helper's definition).
if (isAvxPresent(2)) {
// Splats from a register argument. The integer patterns expect the argument
// to be moved into xmm0 with vmovd and then broadcast in place; the f32
// pattern expects a single vbroadcastss on xmm0.
// First i32 arg is: edi on Linux, and ecx on Windows.
codegenTestX64_T_v128_avxhack(
[['i32', 'i8x16.splat', `
c5 f9 6e .. vmovd %e\\w+, %xmm0
c4 e2 79 78 c0 vpbroadcastb %xmm0, %xmm0`],
['i32', 'i16x8.splat', `
c5 f9 6e .. vmovd %e\\w+, %xmm0
c4 e2 79 79 c0 vpbroadcastw %xmm0, %xmm0`],
['i32', 'i32x4.splat', `
c5 f9 6e .. vmovd %e\\w+, %xmm0
c4 e2 79 58 c0 vpbroadcastd %xmm0, %xmm0`],
['f32', 'f32x4.splat', `c4 e2 79 18 c0 vbroadcastss %xmm0, %xmm0`]]);
// Load-and-splat forms: each pattern expects one broadcast instruction that
// reads directly from a (%r15, index, 1) memory operand. The {memory: 1}
// option presumably makes wrap() give the module a memory -- confirm.
codegenTestX64_T_v128_avxhack(
[['i32', 'v128.load8_splat',
'c4 c2 79 78 04 .. vpbroadcastbb \\(%r15,%r\\w+,1\\), %xmm0'],
['i32', 'v128.load16_splat',
'c4 c2 79 79 04 .. vpbroadcastww \\(%r15,%r\\w+,1\\), %xmm0'],
['i32', 'v128.load32_splat',
'c4 c2 79 18 04 .. vbroadcastssl \\(%r15,%r\\w+,1\\), %xmm0']], {memory: 1});
}

Просмотреть файл

@ -7,7 +7,7 @@
codegenTestX64_PTYPE_v128(
[['f32x4.splat', 'f32', `0f c6 c0 00 shufps \\$0x00, %xmm0, %xmm0`],
['f64x2.splat', 'f64', `66 0f c6 c0 00 shufpd \\$0x00, %xmm0, %xmm0`]] );
['f64x2.splat', 'f64', `f2 0f 12 c0 movddup %xmm0, %xmm0`]] , {log:true});
// Skip these on Win64 because the ABI differs and there's a different parameter
// register, this changes not just the name slightly but the binary encoding in

Просмотреть файл

@ -562,7 +562,8 @@ class MemoryAccessDesc {
}
void setSplatSimd128Load() {
MOZ_ASSERT(type() == Scalar::Float64);
MOZ_ASSERT(type() == Scalar::Uint8 || type() == Scalar::Uint16 ||
type() == Scalar::Float32 || type() == Scalar::Float64);
MOZ_ASSERT(!isAtomic());
MOZ_ASSERT(loadOp_ == Plain);
loadOp_ = Splat;

Просмотреть файл

@ -927,7 +927,10 @@ void MacroAssembler::wasmLoad(const wasm::MemoryAccessDesc& access,
MOZ_ASSERT_IF(
access.isZeroExtendSimd128Load(),
access.type() == Scalar::Float32 || access.type() == Scalar::Float64);
MOZ_ASSERT_IF(access.isSplatSimd128Load(), access.type() == Scalar::Float64);
MOZ_ASSERT_IF(
access.isSplatSimd128Load(),
access.type() == Scalar::Uint8 || access.type() == Scalar::Uint16 ||
access.type() == Scalar::Float32 || access.type() == Scalar::Float64);
MOZ_ASSERT_IF(access.isWidenSimd128Load(), access.type() == Scalar::Float64);
append(access, size());
@ -936,21 +939,33 @@ void MacroAssembler::wasmLoad(const wasm::MemoryAccessDesc& access,
movsbl(srcAddr, out.gpr());
break;
case Scalar::Uint8:
movzbl(srcAddr, out.gpr());
if (access.isSplatSimd128Load()) {
vbroadcastb(srcAddr, out.fpu());
} else {
movzbl(srcAddr, out.gpr());
}
break;
case Scalar::Int16:
movswl(srcAddr, out.gpr());
break;
case Scalar::Uint16:
movzwl(srcAddr, out.gpr());
if (access.isSplatSimd128Load()) {
vbroadcastw(srcAddr, out.fpu());
} else {
movzwl(srcAddr, out.gpr());
}
break;
case Scalar::Int32:
case Scalar::Uint32:
movl(srcAddr, out.gpr());
break;
case Scalar::Float32:
// vmovss does the right thing also for access.isZeroExtendSimd128Load()
vmovss(srcAddr, out.fpu());
if (access.isSplatSimd128Load()) {
vbroadcastss(srcAddr, out.fpu());
} else {
// vmovss does the right thing also for access.isZeroExtendSimd128Load()
vmovss(srcAddr, out.fpu());
}
break;
case Scalar::Float64:
if (access.isSplatSimd128Load()) {

Просмотреть файл

@ -4678,6 +4678,92 @@ class AssemblerX86Shared : public AssemblerShared {
}
}
// Emits a byte broadcast from |src| into |dest|. AVX2 support is asserted.
// |src| must be an XMM register, a base+displacement memory operand, or a
// base+index (scaled) memory operand; any other operand kind crashes.
void vbroadcastb(const Operand& src, FloatRegister dest) {
  MOZ_ASSERT(HasAVX2());
  switch (src.kind()) {
    case Operand::MEM_SCALE:
      masm.vbroadcastb_mr(src.disp(), src.base(), src.index(), src.scale(),
                          dest.encoding());
      return;
    case Operand::MEM_REG_DISP:
      masm.vbroadcastb_mr(src.disp(), src.base(), dest.encoding());
      return;
    case Operand::FPREG:
      masm.vbroadcastb_rr(src.fpu(), dest.encoding());
      return;
    default:
      break;
  }
  MOZ_CRASH("unexpected operand kind");
}
// Emits a 16-bit word broadcast from |src| into |dest|. AVX2 support is
// asserted. |src| must be an XMM register, a base+displacement memory operand,
// or a base+index (scaled) memory operand; any other operand kind crashes.
void vbroadcastw(const Operand& src, FloatRegister dest) {
  MOZ_ASSERT(HasAVX2());
  switch (src.kind()) {
    case Operand::MEM_SCALE:
      masm.vbroadcastw_mr(src.disp(), src.base(), src.index(), src.scale(),
                          dest.encoding());
      return;
    case Operand::MEM_REG_DISP:
      masm.vbroadcastw_mr(src.disp(), src.base(), dest.encoding());
      return;
    case Operand::FPREG:
      masm.vbroadcastw_rr(src.fpu(), dest.encoding());
      return;
    default:
      break;
  }
  MOZ_CRASH("unexpected operand kind");
}
// Emits a 32-bit doubleword broadcast from |src| into |dest|. AVX2 support is
// asserted. |src| must be an XMM register, a base+displacement memory operand,
// or a base+index (scaled) memory operand; any other operand kind crashes.
void vbroadcastd(const Operand& src, FloatRegister dest) {
  MOZ_ASSERT(HasAVX2());
  switch (src.kind()) {
    case Operand::MEM_SCALE:
      masm.vbroadcastd_mr(src.disp(), src.base(), src.index(), src.scale(),
                          dest.encoding());
      return;
    case Operand::MEM_REG_DISP:
      masm.vbroadcastd_mr(src.disp(), src.base(), dest.encoding());
      return;
    case Operand::FPREG:
      masm.vbroadcastd_rr(src.fpu(), dest.encoding());
      return;
    default:
      break;
  }
  MOZ_CRASH("unexpected operand kind");
}
// Emits a 64-bit quadword broadcast from |src| into |dest|. AVX2 support is
// asserted. |src| must be an XMM register, a base+displacement memory operand,
// or a base+index (scaled) memory operand; any other operand kind crashes.
void vbroadcastq(const Operand& src, FloatRegister dest) {
  MOZ_ASSERT(HasAVX2());
  switch (src.kind()) {
    case Operand::MEM_SCALE:
      masm.vbroadcastq_mr(src.disp(), src.base(), src.index(), src.scale(),
                          dest.encoding());
      return;
    case Operand::MEM_REG_DISP:
      masm.vbroadcastq_mr(src.disp(), src.base(), dest.encoding());
      return;
    case Operand::FPREG:
      masm.vbroadcastq_rr(src.fpu(), dest.encoding());
      return;
    default:
      break;
  }
  MOZ_CRASH("unexpected operand kind");
}
// Emits a single-precision float broadcast from |src| into |dest|. AVX2
// support is asserted for every operand form. |src| must be an XMM register,
// a base+displacement memory operand, or a base+index (scaled) memory operand;
// any other operand kind crashes.
void vbroadcastss(const Operand& src, FloatRegister dest) {
  MOZ_ASSERT(HasAVX2());
  switch (src.kind()) {
    case Operand::MEM_SCALE:
      masm.vbroadcastss_mr(src.disp(), src.base(), src.index(), src.scale(),
                           dest.encoding());
      return;
    case Operand::MEM_REG_DISP:
      masm.vbroadcastss_mr(src.disp(), src.base(), dest.encoding());
      return;
    case Operand::FPREG:
      masm.vbroadcastss_rr(src.fpu(), dest.encoding());
      return;
    default:
      break;
  }
  MOZ_CRASH("unexpected operand kind");
}
void flushBuffer() {}
// Patching.

Просмотреть файл

@ -4271,6 +4271,72 @@ class BaseAssembler : public GenericAssembler {
twoByteOpSimd("vpsubq", VEX_PD, OP2_PSUBQ_VdqWdq, src1, src0, dst);
}
// Byte broadcast, register form: XMM |src| -> XMM |dst|. Forwards to
// threeByteOpSimd with the VEX_PD prefix and the ESCAPE_38 opcode map;
// invalid_xmm is passed in the remaining source-register slot.
void vbroadcastb_rr(XMMRegisterID src, XMMRegisterID dst) {
threeByteOpSimd("vbroadcastb", VEX_PD, OP3_VBROADCASTB_VxWx, ESCAPE_38, src,
invalid_xmm, dst);
}
// Byte broadcast, memory form: source is addressed by |base| plus |offset|,
// result goes to XMM |dst|. invalid_xmm is passed in the remaining
// source-register slot.
void vbroadcastb_mr(int32_t offset, RegisterID base, XMMRegisterID dst) {
threeByteOpSimd("vbroadcastb", VEX_PD, OP3_VBROADCASTB_VxWx, ESCAPE_38,
offset, base, invalid_xmm, dst);
}
// Byte broadcast, indexed memory form: source is addressed by |base|, |index|
// with |scale|, and |offset|; result goes to XMM |dst|. invalid_xmm is passed
// in the remaining source-register slot.
void vbroadcastb_mr(int32_t offset, RegisterID base, RegisterID index,
int32_t scale, XMMRegisterID dst) {
threeByteOpSimd("vbroadcastb", VEX_PD, OP3_VBROADCASTB_VxWx, ESCAPE_38,
offset, base, index, scale, invalid_xmm, dst);
}
// Word broadcast, register form: XMM |src| -> XMM |dst|. Forwards to
// threeByteOpSimd with the VEX_PD prefix and the ESCAPE_38 opcode map;
// invalid_xmm is passed in the remaining source-register slot.
void vbroadcastw_rr(XMMRegisterID src, XMMRegisterID dst) {
threeByteOpSimd("vbroadcastw", VEX_PD, OP3_VBROADCASTW_VxWx, ESCAPE_38, src,
invalid_xmm, dst);
}
// Word broadcast, memory form: source is addressed by |base| plus |offset|,
// result goes to XMM |dst|. invalid_xmm is passed in the remaining
// source-register slot.
void vbroadcastw_mr(int32_t offset, RegisterID base, XMMRegisterID dst) {
threeByteOpSimd("vbroadcastw", VEX_PD, OP3_VBROADCASTW_VxWx, ESCAPE_38,
offset, base, invalid_xmm, dst);
}
// Word broadcast, indexed memory form: source is addressed by |base|, |index|
// with |scale|, and |offset|; result goes to XMM |dst|. invalid_xmm is passed
// in the remaining source-register slot.
void vbroadcastw_mr(int32_t offset, RegisterID base, RegisterID index,
int32_t scale, XMMRegisterID dst) {
threeByteOpSimd("vbroadcastw", VEX_PD, OP3_VBROADCASTW_VxWx, ESCAPE_38,
offset, base, index, scale, invalid_xmm, dst);
}
// Doubleword broadcast, register form: XMM |src| -> XMM |dst|. Forwards to
// threeByteOpSimd with the VEX_PD prefix and the ESCAPE_38 opcode map;
// invalid_xmm is passed in the remaining source-register slot.
void vbroadcastd_rr(XMMRegisterID src, XMMRegisterID dst) {
threeByteOpSimd("vbroadcastd", VEX_PD, OP3_VBROADCASTD_VxWx, ESCAPE_38, src,
invalid_xmm, dst);
}
// Doubleword broadcast, memory form: source is addressed by |base| plus
// |offset|, result goes to XMM |dst|. invalid_xmm is passed in the remaining
// source-register slot.
void vbroadcastd_mr(int32_t offset, RegisterID base, XMMRegisterID dst) {
threeByteOpSimd("vbroadcastd", VEX_PD, OP3_VBROADCASTD_VxWx, ESCAPE_38,
offset, base, invalid_xmm, dst);
}
// Doubleword broadcast, indexed memory form: source is addressed by |base|,
// |index| with |scale|, and |offset|; result goes to XMM |dst|. invalid_xmm
// is passed in the remaining source-register slot.
void vbroadcastd_mr(int32_t offset, RegisterID base, RegisterID index,
int32_t scale, XMMRegisterID dst) {
threeByteOpSimd("vbroadcastd", VEX_PD, OP3_VBROADCASTD_VxWx, ESCAPE_38,
offset, base, index, scale, invalid_xmm, dst);
}
// Quadword broadcast, register form: XMM |src| -> XMM |dst|. Forwards to
// threeByteOpSimd with the VEX_PD prefix and the ESCAPE_38 opcode map;
// invalid_xmm is passed in the remaining source-register slot.
void vbroadcastq_rr(XMMRegisterID src, XMMRegisterID dst) {
threeByteOpSimd("vbroadcastq", VEX_PD, OP3_VBROADCASTQ_VxWx, ESCAPE_38, src,
invalid_xmm, dst);
}
// Quadword broadcast, memory form: source is addressed by |base| plus
// |offset|, result goes to XMM |dst|. invalid_xmm is passed in the remaining
// source-register slot.
void vbroadcastq_mr(int32_t offset, RegisterID base, XMMRegisterID dst) {
threeByteOpSimd("vbroadcastq", VEX_PD, OP3_VBROADCASTQ_VxWx, ESCAPE_38,
offset, base, invalid_xmm, dst);
}
// Quadword broadcast, indexed memory form: source is addressed by |base|,
// |index| with |scale|, and |offset|; result goes to XMM |dst|. invalid_xmm
// is passed in the remaining source-register slot.
void vbroadcastq_mr(int32_t offset, RegisterID base, RegisterID index,
int32_t scale, XMMRegisterID dst) {
threeByteOpSimd("vbroadcastq", VEX_PD, OP3_VBROADCASTQ_VxWx, ESCAPE_38,
offset, base, index, scale, invalid_xmm, dst);
}
// Single-precision float broadcast, register form: XMM |src| -> XMM |dst|.
// Forwards to threeByteOpSimd with the VEX_PD prefix and the ESCAPE_38 opcode
// map; invalid_xmm is passed in the remaining source-register slot.
void vbroadcastss_rr(XMMRegisterID src, XMMRegisterID dst) {
threeByteOpSimd("vbroadcastss", VEX_PD, OP3_VBROADCASTSS_VxWd, ESCAPE_38,
src, invalid_xmm, dst);
}
// Single-precision float broadcast, memory form: source is addressed by
// |base| plus |offset|, result goes to XMM |dst|. invalid_xmm is passed in
// the remaining source-register slot.
void vbroadcastss_mr(int32_t offset, RegisterID base, XMMRegisterID dst) {
threeByteOpSimd("vbroadcastss", VEX_PD, OP3_VBROADCASTSS_VxWd, ESCAPE_38,
offset, base, invalid_xmm, dst);
}
// Single-precision float broadcast, indexed memory form: source is addressed
// by |base|, |index| with |scale|, and |offset|; result goes to XMM |dst|.
// invalid_xmm is passed in the remaining source-register slot.
void vbroadcastss_mr(int32_t offset, RegisterID base, RegisterID index,
int32_t scale, XMMRegisterID dst) {
threeByteOpSimd("vbroadcastss", VEX_PD, OP3_VBROADCASTSS_VxWd, ESCAPE_38,
offset, base, index, scale, invalid_xmm, dst);
}
// BMI instructions:
void sarxl_rrr(RegisterID src, RegisterID shift, RegisterID dst) {

Просмотреть файл

@ -360,6 +360,7 @@ enum ThreeByteOpcodeID {
OP3_PEXTRQ_EvVdqIb = 0x16,
OP3_PTEST_VdVd = 0x17,
OP3_EXTRACTPS_EdVdqIb = 0x17,
OP3_VBROADCASTSS_VxWd = 0x18,
OP3_PABSB_VdqWdq = 0x1C,
OP3_PABSW_VdqWdq = 0x1D,
OP3_PABSD_VdqWdq = 0x1E,
@ -387,6 +388,10 @@ enum ThreeByteOpcodeID {
OP3_PMAXUD_VdqWdq = 0x3F,
OP3_PMULLD_VdqWdq = 0x40,
OP3_VBLENDVPS_VdqWdq = 0x4A,
OP3_VBROADCASTD_VxWx = 0x58,
OP3_VBROADCASTQ_VxWx = 0x59,
OP3_VBROADCASTB_VxWx = 0x78,
OP3_VBROADCASTW_VxWx = 0x79,
OP3_SHLX_GyEyBy = 0xF7,
OP3_SARX_GyEyBy = 0xF7,
OP3_SHRX_GyEyBy = 0xF7,

Просмотреть файл

@ -21,24 +21,40 @@ void MacroAssemblerX86Shared::splatX16(Register input, FloatRegister output) {
ScratchSimd128Scope scratch(asMasm());
vmovd(input, output);
if (HasAVX2()) {
vbroadcastb(Operand(output), output);
return;
}
zeroSimd128Int(scratch);
vpshufb(scratch, output, output);
}
// Splats the value in general-purpose register |input| across |output| as
// 16-bit lanes. The value is first moved into |output| with vmovd; with AVX2
// a single word broadcast finishes the job, otherwise a vpshuflw/vpshufd pair
// (both with immediate 0) is used.
void MacroAssemblerX86Shared::splatX8(Register input, FloatRegister output) {
  vmovd(input, output);
  if (!HasAVX2()) {
    vpshuflw(0, output, output);
    vpshufd(0, output, output);
    return;
  }
  vbroadcastw(Operand(output), output);
}
// Splats the value in general-purpose register |input| across |output| as
// 32-bit lanes. The value is first moved into |output| with vmovd; with AVX2
// a doubleword broadcast is emitted, otherwise vpshufd with immediate 0.
void MacroAssemblerX86Shared::splatX4(Register input, FloatRegister output) {
  vmovd(input, output);
  if (!HasAVX2()) {
    vpshufd(0, output, output);
    return;
  }
  vbroadcastd(Operand(output), output);
}
// Splats the single-precision value in |input| across the Simd128 |output|.
// With AVX2 this is one vbroadcastss straight from |input|; otherwise |input|
// is copied into |output| and shuffled with vshufps (immediate 0).
void MacroAssemblerX86Shared::splatX4(FloatRegister input,
                                      FloatRegister output) {
  MOZ_ASSERT(input.isSingle() && output.isSimd128());
  if (!HasAVX2()) {
    asMasm().moveSimd128Float(input.asSimd128(), output);
    vshufps(0, output, output, output);
    return;
  }
  vbroadcastss(Operand(input), output);
}
@ -46,8 +62,7 @@ void MacroAssemblerX86Shared::splatX4(FloatRegister input,
// Splats the double in |input| into the Simd128 register |output|.
// NOTE(review): the moveSimd128Float + vshufpd(0, ...) pair and the trailing
// vmovddup each look sufficient to produce the splat on their own, and the
// vmovddup reads from |input| again rather than from |output|. This reads
// like both the old and the new sequence were kept -- confirm which one is
// intended before relying on this text.
void MacroAssemblerX86Shared::splatX2(FloatRegister input,
FloatRegister output) {
MOZ_ASSERT(input.isDouble() && output.isSimd128());
asMasm().moveSimd128Float(input.asSimd128(), output);
vshufpd(0, output, output, output);
vmovddup(Operand(input.asSimd128()), output);
}
void MacroAssemblerX86Shared::extractLaneInt32x4(FloatRegister input,

Просмотреть файл

@ -952,7 +952,10 @@ void MacroAssembler::wasmLoad(const wasm::MemoryAccessDesc& access,
MOZ_ASSERT_IF(
access.isZeroExtendSimd128Load(),
access.type() == Scalar::Float32 || access.type() == Scalar::Float64);
MOZ_ASSERT_IF(access.isSplatSimd128Load(), access.type() == Scalar::Float64);
MOZ_ASSERT_IF(
access.isSplatSimd128Load(),
access.type() == Scalar::Uint8 || access.type() == Scalar::Uint16 ||
access.type() == Scalar::Float32 || access.type() == Scalar::Float64);
MOZ_ASSERT_IF(access.isWidenSimd128Load(), access.type() == Scalar::Float64);
memoryBarrierBefore(access.sync());
@ -963,21 +966,33 @@ void MacroAssembler::wasmLoad(const wasm::MemoryAccessDesc& access,
movsbl(srcAddr, out.gpr());
break;
case Scalar::Uint8:
movzbl(srcAddr, out.gpr());
if (access.isSplatSimd128Load()) {
vbroadcastb(srcAddr, out.fpu());
} else {
movzbl(srcAddr, out.gpr());
}
break;
case Scalar::Int16:
movswl(srcAddr, out.gpr());
break;
case Scalar::Uint16:
movzwl(srcAddr, out.gpr());
if (access.isSplatSimd128Load()) {
vbroadcastw(srcAddr, out.fpu());
} else {
movzwl(srcAddr, out.gpr());
}
break;
case Scalar::Int32:
case Scalar::Uint32:
movl(srcAddr, out.gpr());
break;
case Scalar::Float32:
// vmovss does the right thing also for access.isZeroExtendSimd128Load()
vmovss(srcAddr, out.fpu());
if (access.isSplatSimd128Load()) {
vbroadcastss(srcAddr, out.fpu());
} else {
// vmovss does the right thing also for access.isZeroExtendSimd128Load()
vmovss(srcAddr, out.fpu());
}
break;
case Scalar::Float64:
if (access.isSplatSimd128Load()) {

Просмотреть файл

@ -1607,7 +1607,14 @@ class FunctionCompiler {
bytecodeIfNotAsmJS());
// Generate better code (on x86)
if (viewType == Scalar::Float64) {
// If AVX2 is enabled, more broadcast operators are available.
if (viewType == Scalar::Float64
# if defined(JS_CODEGEN_X64) || defined(JS_CODEGEN_X86)
|| (js::jit::CPUInfo::IsAVX2Present() &&
(viewType == Scalar::Uint8 || viewType == Scalar::Uint16 ||
viewType == Scalar::Float32))
# endif
) {
access.setSplatSimd128Load();
return load(addr.base, &access, ValType::V128);
}