Mirror of https://github.com/mozilla/gecko-dev.git
Bug 1136226 - Implement Bool8x16.splat and Bool16x8.splat. r=bbouvier
The scalar argument to this operation is expanded into MIR as either -1 or 0 in an Int32, so the 4-lane splat produces the correct result for 8-lane and 16-lane splats too: either an all-zeroes vector or an all-ones vector.
This commit is contained in:
Parent f85827cd00
Commit 9b1061c296
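As background for the change below (an illustrative sketch only, not part of the patch): because a boolean splat's scalar operand reaches the backend as an Int32 that is either 0 or -1, broadcasting it with the existing 4-lane splat already yields the all-zeroes or all-ones bit pattern that Bool8x16 and Bool16x8 expect. A minimal standalone C++ sketch, assuming that expansion and ordinary little-endian lane layout:

// Illustrative sketch; assumes the MIR expansion described above
// (boolean scalar -> Int32 0 or -1) and little-endian memory.
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    int32_t lane = -1;                          // boolean "true" expanded to -1
    int32_t x4[4] = { lane, lane, lane, lane }; // what a 4-lane splat produces

    uint8_t bool8x16[16];
    std::memcpy(bool8x16, x4, sizeof(bool8x16));
    for (int i = 0; i < 16; i++)
        std::printf("%02x ", bool8x16[i]);      // prints "ff" sixteen times:
    std::printf("\n");                          // an all-true Bool8x16
    return 0;
}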
@@ -1161,6 +1161,16 @@ MSimdSplat::foldsTo(TempAllocator& alloc)
         cst = SimdConstant::SplatX4(v);
         break;
       }
+      case MIRType::Int8x16: {
+        int32_t v = op->toConstant()->toInt32();
+        cst = SimdConstant::SplatX16(v);
+        break;
+      }
+      case MIRType::Int16x8: {
+        int32_t v = op->toConstant()->toInt32();
+        cst = SimdConstant::SplatX8(v);
+        break;
+      }
       case MIRType::Int32x4: {
         int32_t v = op->toConstant()->toInt32();
         cst = SimdConstant::SplatX4(v);
@@ -195,6 +195,36 @@ class LSimdUnbox : public LInstructionHelper<1, 1, 1>
     }
 };
 
+// Constructs a SIMD value with 16 equal components (int8x16).
+class LSimdSplatX16 : public LInstructionHelper<1, 1, 0>
+{
+  public:
+    LIR_HEADER(SimdSplatX16)
+    explicit LSimdSplatX16(const LAllocation& v)
+    {
+        setOperand(0, v);
+    }
+
+    MSimdSplat* mir() const {
+        return mir_->toSimdSplat();
+    }
+};
+
+// Constructs a SIMD value with 8 equal components (int16x8).
+class LSimdSplatX8 : public LInstructionHelper<1, 1, 0>
+{
+  public:
+    LIR_HEADER(SimdSplatX8)
+    explicit LSimdSplatX8(const LAllocation& v)
+    {
+        setOperand(0, v);
+    }
+
+    MSimdSplat* mir() const {
+        return mir_->toSimdSplat();
+    }
+};
+
 // Constructs a SIMD value with 4 equal components (e.g. int32x4, float32x4).
 class LSimdSplatX4 : public LInstructionHelper<1, 1, 0>
 {
@@ -20,6 +20,8 @@
     _(Float32) \
     _(SimdBox) \
     _(SimdUnbox) \
+    _(SimdSplatX16) \
+    _(SimdSplatX8) \
     _(SimdSplatX4) \
     _(Simd128Int) \
     _(Simd128Float) \
@@ -1085,6 +1085,7 @@ class AssemblerX86Shared : public AssemblerShared
 
     static bool HasSSE2() { return CPUInfo::IsSSE2Present(); }
     static bool HasSSE3() { return CPUInfo::IsSSE3Present(); }
+    static bool HasSSSE3() { return CPUInfo::IsSSSE3Present(); }
     static bool HasSSE41() { return CPUInfo::IsSSE41Present(); }
     static bool HasPOPCNT() { return CPUInfo::IsPOPCNTPresent(); }
     static bool SupportsFloatingPoint() { return CPUInfo::IsSSE2Present(); }
@@ -2996,6 +2997,19 @@ class AssemblerX86Shared : public AssemblerShared
             MOZ_CRASH("unexpected operand kind");
         }
     }
+
+    void vpshuflw(uint32_t mask, FloatRegister src, FloatRegister dest) {
+        MOZ_ASSERT(HasSSE2());
+        masm.vpshuflw_irr(mask, src.encoding(), dest.encoding());
+    }
+    void vpshufhw(uint32_t mask, FloatRegister src, FloatRegister dest) {
+        MOZ_ASSERT(HasSSE2());
+        masm.vpshufhw_irr(mask, src.encoding(), dest.encoding());
+    }
+    void vpshufb(FloatRegister mask, FloatRegister src, FloatRegister dest) {
+        MOZ_ASSERT(HasSSSE3());
+        masm.vpshufb_rr(mask.encoding(), src.encoding(), dest.encoding());
+    }
     void vmovddup(FloatRegister src, FloatRegister dest) {
         MOZ_ASSERT(HasSSE3());
         masm.vmovddup_rr(src.encoding(), dest.encoding());
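To make the new vpshufb wrapper's role concrete, here is a scalar emulation of PSHUFB's byte-select semantics (an illustrative sketch, not SpiderMonkey code): with an all-zero shuffle mask, every destination byte receives source byte 0, which is exactly the SSSE3 16-lane splat used by the code generator further down.

// Illustrative scalar emulation of PSHUFB (per the SSSE3 definition:
// a set high bit in a mask byte zeroes the lane, otherwise the low
// four bits select a source byte).
#include <cstdint>
#include <cstdio>

static void emulate_pshufb(const uint8_t mask[16], const uint8_t src[16], uint8_t dst[16]) {
    for (int i = 0; i < 16; i++)
        dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0F];
}

int main() {
    uint8_t src[16] = { 0xab };   // byte 0 holds the scalar; the rest is don't-care
    uint8_t mask[16] = { 0 };     // all-zero mask, as produced by zeroSimd128Int below
    uint8_t dst[16];
    emulate_pshufb(mask, src, dst);
    for (int i = 0; i < 16; i++)
        std::printf("%02x ", dst[i]);  // "ab" sixteen times
    std::printf("\n");
    return 0;
}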
@@ -2825,6 +2825,21 @@ public:
         twoByteOpImmSimd("vpshufd", VEX_PD, OP2_PSHUFD_VdqWdqIb, mask, address, invalid_xmm, dst);
     }
 
+    void vpshuflw_irr(uint32_t mask, XMMRegisterID src, XMMRegisterID dst)
+    {
+        twoByteOpImmSimd("vpshuflw", VEX_SD, OP2_PSHUFLW_VdqWdqIb, mask, src, invalid_xmm, dst);
+    }
+
+    void vpshufhw_irr(uint32_t mask, XMMRegisterID src, XMMRegisterID dst)
+    {
+        twoByteOpImmSimd("vpshufhw", VEX_SS, OP2_PSHUFHW_VdqWdqIb, mask, src, invalid_xmm, dst);
+    }
+
+    void vpshufb_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
+    {
+        threeByteOpSimd("vpshufb", VEX_PD, OP3_PSHUFB_VdqWdq, ESCAPE_38, src1, src0, dst);
+    }
+
     void vshufps_irr(uint32_t mask, XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
     {
         twoByteOpImmSimd("vshufps", VEX_PS, OP2_SHUFPS_VpsWpsIb, mask, src1, src0, dst);
@@ -2598,6 +2598,39 @@ CodeGeneratorX86Shared::visitSimdValueFloat32x4(LSimdValueFloat32x4* ins)
     masm.vunpcklps(tmp, output, output);
 }
 
+void
+CodeGeneratorX86Shared::visitSimdSplatX16(LSimdSplatX16* ins)
+{
+    MOZ_ASSERT(SimdTypeToLength(ins->mir()->type()) == 16);
+    Register input = ToRegister(ins->getOperand(0));
+    FloatRegister output = ToFloatRegister(ins->output());
+    masm.vmovd(input, output);
+    if (AssemblerX86Shared::HasSSSE3()) {
+        masm.zeroSimd128Int(ScratchSimd128Reg);
+        masm.vpshufb(ScratchSimd128Reg, output, output);
+    } else {
+        // Use two shifts to duplicate the low 8 bits into the low 16 bits.
+        masm.vpsllw(Imm32(8), output, output);
+        masm.vmovdqa(output, ScratchSimd128Reg);
+        masm.vpsrlw(Imm32(8), ScratchSimd128Reg, ScratchSimd128Reg);
+        masm.vpor(ScratchSimd128Reg, output, output);
+        // Then do an X8 splat.
+        masm.vpshuflw(0, output, output);
+        masm.vpshufd(0, output, output);
+    }
+}
+
+void
+CodeGeneratorX86Shared::visitSimdSplatX8(LSimdSplatX8* ins)
+{
+    MOZ_ASSERT(SimdTypeToLength(ins->mir()->type()) == 8);
+    Register input = ToRegister(ins->getOperand(0));
+    FloatRegister output = ToFloatRegister(ins->output());
+    masm.vmovd(input, output);
+    masm.vpshuflw(0, output, output);
+    masm.vpshufd(0, output, output);
+}
+
 void
 CodeGeneratorX86Shared::visitSimdSplatX4(LSimdSplatX4* ins)
 {
@@ -2607,22 +2640,14 @@ CodeGeneratorX86Shared::visitSimdSplatX4(LSimdSplatX4* ins)
     MOZ_ASSERT(IsSimdType(mir->type()));
     JS_STATIC_ASSERT(sizeof(float) == sizeof(int32_t));
 
-    switch (mir->type()) {
-      case MIRType::Int32x4:
-      case MIRType::Bool32x4: {
-        Register r = ToRegister(ins->getOperand(0));
-        masm.vmovd(r, output);
-        masm.vpshufd(0, output, output);
-        break;
-      }
-      case MIRType::Float32x4: {
+    if (mir->type() == MIRType::Float32x4) {
         FloatRegister r = ToFloatRegister(ins->getOperand(0));
         FloatRegister rCopy = masm.reusedInputFloat32x4(r, output);
         masm.vshufps(0, rCopy, rCopy, output);
-        break;
-      }
-      default:
-        MOZ_CRASH("Unknown SIMD kind");
+    } else {
+        Register r = ToRegister(ins->getOperand(0));
+        masm.vmovd(r, output);
+        masm.vpshufd(0, output, output);
     }
 }
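For readers who do not have the SSE2 fallback in visitSimdSplatX16 in their head, here is a sketch (illustrative only, not SpiderMonkey code) modelled on a single 16-bit lane: the two shifts and the OR duplicate the low byte into both halves of the lane, and vpshuflw(0) followed by vpshufd(0) then broadcasts that lane to the whole vector.

// Illustrative sketch of the SSE2 slow path, one 16-bit lane at a time.
#include <cstdint>
#include <cstdio>

int main() {
    uint16_t lane = 0xcdab;                 // low lane after vmovd; only the low
                                            // 8 bits of the scalar are meaningful
    uint16_t hi  = uint16_t(lane << 8);     // vpsllw $8  -> 0xab00
    uint16_t lo  = uint16_t(hi >> 8);       // vpsrlw $8 on a copy -> 0x00ab
    uint16_t dup = uint16_t(hi | lo);       // vpor -> 0xabab: byte duplicated

    uint16_t x8[8];
    for (int i = 0; i < 8; i++)             // effect of vpshuflw(0) + vpshufd(0):
        x8[i] = dup;                        // every 16-bit lane gets word 0
    for (int i = 0; i < 8; i++)
        std::printf("%04x ", x8[i]);        // "abab" eight times = 16 equal bytes
    std::printf("\n");
    return 0;
}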
@@ -291,6 +291,8 @@ class CodeGeneratorX86Shared : public CodeGeneratorShared
     // SIMD operators
     void visitSimdValueInt32x4(LSimdValueInt32x4* lir);
     void visitSimdValueFloat32x4(LSimdValueFloat32x4* lir);
+    void visitSimdSplatX16(LSimdSplatX16* lir);
+    void visitSimdSplatX8(LSimdSplatX8* lir);
     void visitSimdSplatX4(LSimdSplatX4* lir);
     void visitSimd128Int(LSimd128Int* ins);
     void visitSimd128Float(LSimd128Float* ins);
@@ -217,6 +217,8 @@ enum TwoByteOpcodeID {
     OP2_MOVDQ_VsdWsd = 0x6F,
     OP2_MOVDQ_VdqWdq = 0x6F,
     OP2_PSHUFD_VdqWdqIb = 0x70,
+    OP2_PSHUFLW_VdqWdqIb = 0x70,
+    OP2_PSHUFHW_VdqWdqIb = 0x70,
     OP2_PSLLW_UdqIb = 0x71,
     OP2_PSRAW_UdqIb = 0x71,
     OP2_PSRLW_UdqIb = 0x71,
@@ -281,6 +283,7 @@ enum TwoByteOpcodeID {
 };
 
 enum ThreeByteOpcodeID {
+    OP3_PSHUFB_VdqWdq = 0x00,
     OP3_ROUNDSS_VsdWsd = 0x0A,
     OP3_ROUNDSD_VsdWsd = 0x0B,
     OP3_BLENDVPS_VdqWdq = 0x14,
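A note on the shared opcode byte (background, not part of the patch): PSHUFD, PSHUFLW and PSHUFHW all use opcode 0x70 and are distinguished only by their mandatory prefix (66, F2 and F3 respectively, matching the VEX_PD, VEX_SD and VEX_SS arguments above), which is why the two new enum entries can reuse the same value as OP2_PSHUFD_VdqWdqIb.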
@@ -814,19 +814,27 @@ void
 LIRGeneratorX86Shared::visitSimdSplat(MSimdSplat* ins)
 {
     LAllocation x = useRegisterAtStart(ins->getOperand(0));
-    LSimdSplatX4* lir = new(alloc()) LSimdSplatX4(x);
 
     switch (ins->type()) {
-      case MIRType::Int32x4:
-      case MIRType::Bool32x4:
-        define(lir, ins);
+      case MIRType::Int8x16:
+        define(new (alloc()) LSimdSplatX16(x), ins);
         break;
+      case MIRType::Int16x8:
+        define(new (alloc()) LSimdSplatX8(x), ins);
+        break;
+      case MIRType::Int32x4:
       case MIRType::Float32x4:
-        // (Non-AVX) codegen actually wants the input and the output to be in
-        // the same register, but we can't currently use defineReuseInput
-        // because they have different types (scalar vs vector), so a spill slot
-        // for one may not be suitable for the other.
-        define(lir, ins);
+      case MIRType::Bool8x16:
+      case MIRType::Bool16x8:
+      case MIRType::Bool32x4:
+        // Use the SplatX4 instruction for all boolean splats. Since the input
+        // value is a 32-bit int that is either 0 or -1, the X4 splat gives
+        // the right result for all boolean geometries.
+        // For floats, (Non-AVX) codegen actually wants the input and the output
+        // to be in the same register, but we can't currently use
+        // defineReuseInput because they have different types (scalar vs
+        // vector), so a spill slot for one may not be suitable for the other.
+        define(new (alloc()) LSimdSplatX4(x), ins);
         break;
       default:
        MOZ_CRASH("Unknown SIMD kind");