Bug 1136226 - Implement Bool8x16.splat and Bool16x8.splat. r=bbouvier

The scalar argument to this operation is expanded into MIR as either -1 or 0 in
an Int32, so the 4-lane splat produces the correct result for 8-lane and
16-lane boolean splats too: either an all-zeroes vector or an all-ones vector.
Jakob Stoklund Olesen 2016-05-31 09:00:19 -07:00
Parent f85827cd00
Commit 9b1061c296
9 changed files with 131 additions and 22 deletions
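
For illustration only (not part of the patch): a standalone C++ sketch of the equivalence the commit message relies on. Splatting an Int32 that is either 0 or -1 across 4 lanes yields exactly the same 128-bit pattern as splatting it across 8 or 16 narrower lanes, namely all-zeroes or all-ones.

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
        const int32_t booleans[] = {0, -1};       // false/true as expanded in MIR
        for (int32_t scalar : booleans) {
            int32_t x4[4];
            int16_t x8[8];
            int8_t  x16[16];
            for (auto& lane : x4)  lane = scalar;           // 4-lane splat
            for (auto& lane : x8)  lane = int16_t(scalar);  // 8-lane splat
            for (auto& lane : x16) lane = int8_t(scalar);   // 16-lane splat
            // All three give the same 128 bits: all-zeroes or all-ones.
            assert(std::memcmp(x4, x8, sizeof(x4)) == 0);
            assert(std::memcmp(x4, x16, sizeof(x4)) == 0);
        }
        return 0;
    }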

View file

@@ -1161,6 +1161,16 @@ MSimdSplat::foldsTo(TempAllocator& alloc)
cst = SimdConstant::SplatX4(v);
break;
}
case MIRType::Int8x16: {
int32_t v = op->toConstant()->toInt32();
cst = SimdConstant::SplatX16(v);
break;
}
case MIRType::Int16x8: {
int32_t v = op->toConstant()->toInt32();
cst = SimdConstant::SplatX8(v);
break;
}
case MIRType::Int32x4: {
int32_t v = op->toConstant()->toInt32();
cst = SimdConstant::SplatX4(v);
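
As a reading aid only (not part of the patch; the helper name is invented), the new folding cases amount to truncating the constant scalar to the lane width and repeating it across all lanes, roughly:

    #include <array>
    #include <cstdint>

    // Assumed model of SimdConstant::SplatX16: the int32 constant is truncated
    // to 8 bits and repeated in each of the 16 lanes (SplatX8 is analogous with
    // 16-bit lanes).
    std::array<int8_t, 16> splatX16Constant(int32_t v) {
        std::array<int8_t, 16> lanes;
        lanes.fill(int8_t(v));
        return lanes;
    }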

View file

@@ -195,6 +195,36 @@ class LSimdUnbox : public LInstructionHelper<1, 1, 1>
}
};
// Constructs a SIMD value with 16 equal components (int8x16).
class LSimdSplatX16 : public LInstructionHelper<1, 1, 0>
{
public:
LIR_HEADER(SimdSplatX16)
explicit LSimdSplatX16(const LAllocation& v)
{
setOperand(0, v);
}
MSimdSplat* mir() const {
return mir_->toSimdSplat();
}
};
// Constructs a SIMD value with 8 equal components (int16x8).
class LSimdSplatX8 : public LInstructionHelper<1, 1, 0>
{
public:
LIR_HEADER(SimdSplatX8)
explicit LSimdSplatX8(const LAllocation& v)
{
setOperand(0, v);
}
MSimdSplat* mir() const {
return mir_->toSimdSplat();
}
};
// Constructs a SIMD value with 4 equal components (e.g. int32x4, float32x4).
class LSimdSplatX4 : public LInstructionHelper<1, 1, 0>
{

View file

@@ -20,6 +20,8 @@
_(Float32) \
_(SimdBox) \
_(SimdUnbox) \
_(SimdSplatX16) \
_(SimdSplatX8) \
_(SimdSplatX4) \
_(Simd128Int) \
_(Simd128Float) \

View file

@@ -1085,6 +1085,7 @@ class AssemblerX86Shared : public AssemblerShared
static bool HasSSE2() { return CPUInfo::IsSSE2Present(); }
static bool HasSSE3() { return CPUInfo::IsSSE3Present(); }
static bool HasSSSE3() { return CPUInfo::IsSSSE3Present(); }
static bool HasSSE41() { return CPUInfo::IsSSE41Present(); }
static bool HasPOPCNT() { return CPUInfo::IsPOPCNTPresent(); }
static bool SupportsFloatingPoint() { return CPUInfo::IsSSE2Present(); }
@@ -2996,6 +2997,19 @@ class AssemblerX86Shared : public AssemblerShared
MOZ_CRASH("unexpected operand kind");
}
}
void vpshuflw(uint32_t mask, FloatRegister src, FloatRegister dest) {
MOZ_ASSERT(HasSSE2());
masm.vpshuflw_irr(mask, src.encoding(), dest.encoding());
}
void vpshufhw(uint32_t mask, FloatRegister src, FloatRegister dest) {
MOZ_ASSERT(HasSSE2());
masm.vpshufhw_irr(mask, src.encoding(), dest.encoding());
}
void vpshufb(FloatRegister mask, FloatRegister src, FloatRegister dest) {
MOZ_ASSERT(HasSSSE3());
masm.vpshufb_rr(mask.encoding(), src.encoding(), dest.encoding());
}
void vmovddup(FloatRegister src, FloatRegister dest) {
MOZ_ASSERT(HasSSE3());
masm.vmovddup_rr(src.encoding(), dest.encoding());

View file

@@ -2825,6 +2825,21 @@ public:
twoByteOpImmSimd("vpshufd", VEX_PD, OP2_PSHUFD_VdqWdqIb, mask, address, invalid_xmm, dst);
}
void vpshuflw_irr(uint32_t mask, XMMRegisterID src, XMMRegisterID dst)
{
twoByteOpImmSimd("vpshuflw", VEX_SD, OP2_PSHUFLW_VdqWdqIb, mask, src, invalid_xmm, dst);
}
void vpshufhw_irr(uint32_t mask, XMMRegisterID src, XMMRegisterID dst)
{
twoByteOpImmSimd("vpshufhw", VEX_SS, OP2_PSHUFHW_VdqWdqIb, mask, src, invalid_xmm, dst);
}
void vpshufb_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
{
threeByteOpSimd("vpshufb", VEX_PD, OP3_PSHUFB_VdqWdq, ESCAPE_38, src1, src0, dst);
}
void vshufps_irr(uint32_t mask, XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
{
twoByteOpImmSimd("vshufps", VEX_PS, OP2_SHUFPS_VpsWpsIb, mask, src1, src0, dst);

View file

@@ -2598,6 +2598,39 @@ CodeGeneratorX86Shared::visitSimdValueFloat32x4(LSimdValueFloat32x4* ins)
masm.vunpcklps(tmp, output, output);
}
void
CodeGeneratorX86Shared::visitSimdSplatX16(LSimdSplatX16* ins)
{
MOZ_ASSERT(SimdTypeToLength(ins->mir()->type()) == 16);
Register input = ToRegister(ins->getOperand(0));
FloatRegister output = ToFloatRegister(ins->output());
masm.vmovd(input, output);
if (AssemblerX86Shared::HasSSSE3()) {
masm.zeroSimd128Int(ScratchSimd128Reg);
masm.vpshufb(ScratchSimd128Reg, output, output);
} else {
// Use two shifts to duplicate the low 8 bits into the low 16 bits.
masm.vpsllw(Imm32(8), output, output);
masm.vmovdqa(output, ScratchSimd128Reg);
masm.vpsrlw(Imm32(8), ScratchSimd128Reg, ScratchSimd128Reg);
masm.vpor(ScratchSimd128Reg, output, output);
// Then do an X8 splat.
masm.vpshuflw(0, output, output);
masm.vpshufd(0, output, output);
}
}
void
CodeGeneratorX86Shared::visitSimdSplatX8(LSimdSplatX8* ins)
{
MOZ_ASSERT(SimdTypeToLength(ins->mir()->type()) == 8);
Register input = ToRegister(ins->getOperand(0));
FloatRegister output = ToFloatRegister(ins->output());
masm.vmovd(input, output);
masm.vpshuflw(0, output, output);
masm.vpshufd(0, output, output);
}
void
CodeGeneratorX86Shared::visitSimdSplatX4(LSimdSplatX4* ins)
{
@@ -2607,22 +2640,14 @@ CodeGeneratorX86Shared::visitSimdSplatX4(LSimdSplatX4* ins)
MOZ_ASSERT(IsSimdType(mir->type()));
JS_STATIC_ASSERT(sizeof(float) == sizeof(int32_t));
-switch (mir->type()) {
-case MIRType::Int32x4:
-case MIRType::Bool32x4: {
-Register r = ToRegister(ins->getOperand(0));
-masm.vmovd(r, output);
-masm.vpshufd(0, output, output);
-break;
-}
-case MIRType::Float32x4: {
+if (mir->type() == MIRType::Float32x4) {
FloatRegister r = ToFloatRegister(ins->getOperand(0));
FloatRegister rCopy = masm.reusedInputFloat32x4(r, output);
masm.vshufps(0, rCopy, rCopy, output);
-break;
-}
-default:
-MOZ_CRASH("Unknown SIMD kind");
+} else {
+Register r = ToRegister(ins->getOperand(0));
+masm.vmovd(r, output);
+masm.vpshufd(0, output, output);
}
}
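
For illustration only (not part of the patch; the function name is invented), a scalar model of the two paths in visitSimdSplatX16 above. With SSSE3, vpshufb with an all-zeroes control mask broadcasts byte 0 of the source into every lane; without it, the low byte is first duplicated into both halves of the low 16-bit word (vpsllw/vpsrlw/vpor) and that word is then splat across the vector (vpshuflw + vpshufd).

    #include <array>
    #include <cstdint>

    std::array<uint8_t, 16> splatX16Model(uint32_t input, bool hasSSSE3) {
        std::array<uint8_t, 16> out{};
        if (hasSSSE3) {
            // vpshufb with a zero mask selects byte 0 of the source for every lane.
            out.fill(uint8_t(input));
        } else {
            // SSE2 fallback: shift left 8, shift the copy right 8, then OR, leaving
            // the low byte duplicated in both halves of the low 16-bit word ...
            uint16_t word = uint16_t(uint16_t(input << 8) | uint8_t(input));
            // ... then vpshuflw(0) + vpshufd(0) repeat that word in all 8 word lanes.
            for (size_t i = 0; i < out.size(); i += 2) {
                out[i]     = uint8_t(word);        // low byte of each 16-bit lane
                out[i + 1] = uint8_t(word >> 8);   // high byte of each 16-bit lane
            }
        }
        return out;  // either way: 16 copies of the input's low byte
    }

Both paths produce the same 16 bytes; the SSSE3 branch just gets there with fewer instructions.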

View file

@@ -291,6 +291,8 @@ class CodeGeneratorX86Shared : public CodeGeneratorShared
// SIMD operators
void visitSimdValueInt32x4(LSimdValueInt32x4* lir);
void visitSimdValueFloat32x4(LSimdValueFloat32x4* lir);
void visitSimdSplatX16(LSimdSplatX16* lir);
void visitSimdSplatX8(LSimdSplatX8* lir);
void visitSimdSplatX4(LSimdSplatX4* lir);
void visitSimd128Int(LSimd128Int* ins);
void visitSimd128Float(LSimd128Float* ins);

View file

@@ -217,6 +217,8 @@ enum TwoByteOpcodeID {
OP2_MOVDQ_VsdWsd = 0x6F,
OP2_MOVDQ_VdqWdq = 0x6F,
OP2_PSHUFD_VdqWdqIb = 0x70,
OP2_PSHUFLW_VdqWdqIb = 0x70,
OP2_PSHUFHW_VdqWdqIb = 0x70,
OP2_PSLLW_UdqIb = 0x71,
OP2_PSRAW_UdqIb = 0x71,
OP2_PSRLW_UdqIb = 0x71,
@@ -281,6 +283,7 @@ enum TwoByteOpcodeID {
};
enum ThreeByteOpcodeID {
OP3_PSHUFB_VdqWdq = 0x00,
OP3_ROUNDSS_VsdWsd = 0x0A,
OP3_ROUNDSD_VsdWsd = 0x0B,
OP3_BLENDVPS_VdqWdq = 0x14,
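
A side note, not part of the patch: PSHUFD, PSHUFLW and PSHUFHW really do share the 0x70 opcode byte; the mandatory legacy prefix selects the variant, which is what the different VEX_* constants passed by the new vpshuflw_irr/vpshufhw_irr emitters encode. A small reference sketch (struct and names invented):

    #include <cstdint>
    #include <cstdio>

    int main() {
        // The prefix byte distinguishes the three shuffles that share opcode 0x70.
        struct Variant { const char* name; uint8_t prefix; const char* vexConstant; };
        const Variant variants[] = {
            {"PSHUFD",  0x66, "VEX_PD"},
            {"PSHUFLW", 0xF2, "VEX_SD"},
            {"PSHUFHW", 0xF3, "VEX_SS"},
        };
        for (const Variant& v : variants)
            std::printf("%-8s = %02X 0F 70 /r ib  (%s)\n", v.name, unsigned(v.prefix), v.vexConstant);
        return 0;
    }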

View file

@@ -814,19 +814,27 @@ void
LIRGeneratorX86Shared::visitSimdSplat(MSimdSplat* ins)
{
LAllocation x = useRegisterAtStart(ins->getOperand(0));
-LSimdSplatX4* lir = new(alloc()) LSimdSplatX4(x);
switch (ins->type()) {
-case MIRType::Int32x4:
-case MIRType::Bool32x4:
-define(lir, ins);
+case MIRType::Int8x16:
+define(new (alloc()) LSimdSplatX16(x), ins);
break;
+case MIRType::Int16x8:
+define(new (alloc()) LSimdSplatX8(x), ins);
+break;
+case MIRType::Int32x4:
case MIRType::Float32x4:
-// (Non-AVX) codegen actually wants the input and the output to be in
-// the same register, but we can't currently use defineReuseInput
-// because they have different types (scalar vs vector), so a spill slot
-// for one may not be suitable for the other.
-define(lir, ins);
+case MIRType::Bool8x16:
+case MIRType::Bool16x8:
+case MIRType::Bool32x4:
+// Use the SplatX4 instruction for all boolean splats. Since the input
+// value is a 32-bit int that is either 0 or -1, the X4 splat gives
+// the right result for all boolean geometries.
+// For floats, (Non-AVX) codegen actually wants the input and the output
+// to be in the same register, but we can't currently use
+// defineReuseInput because they have different types (scalar vs
+// vector), so a spill slot for one may not be suitable for the other.
+define(new (alloc()) LSimdSplatX4(x), ins);
break;
default:
MOZ_CRASH("Unknown SIMD kind");