Bug 1240796 - Implement Uint32x4 <==> Float32x4 conversions. r=sunfish

The conversion from Uint32x4 to Float32x4 is not available as an SSE
instruction, so we need to expand into a larger instruction sequence lifted
from LLVM. Make this expansion early when generating MIR so that it can be
exposed to LICM and GVN optimizations.

The conversion from Float32x4 to Uint32x4 can throw a RangeError. It is handled
similarly to LFloat32x4ToInt32x4. This expansion depends on the details of the
cvttps2dq instruction that can't be expressed in MIR, so it can't be expanded
early.
This commit is contained in:
Jakob Stoklund Olesen 2016-02-01 14:55:07 -08:00
Родитель d01dd37267
Коммит cccd80f8b7
15 изменённых файлов: 365 добавлений и 16 удалений

Просмотреть файл

@ -0,0 +1,81 @@
load(libdir + 'simd.js');
setJitCompilerOption("ion.warmup.trigger", 30);
// Testing Uint32 <-> Float32 conversions.
// These conversions deserve special attention because SSE doesn't provide
// simple conversion instructions.
// Reference conversion: copy a Uint32Array into a Float32Array one element at
// a time, letting the typed-array store perform the uint32 -> float32
// conversion. Both arrays must have the same length.
function cvt_utof_scalar(u32s, f32s) {
    var count = u32s.length;
    assertEq(count, f32s.length);
    for (var idx = 0; idx < count; ++idx)
        f32s[idx] = u32s[idx];
}
// SIMD conversion: copy a Uint32Array into a Float32Array four lanes at a
// time using SIMD.Float32x4.fromUint32x4. Both arrays must have the same
// length.
function cvt_utof_simd(u32s, f32s) {
    assertEq(u32s.length, f32s.length);
    for (var base = 0; base < u32s.length; base += 4) {
        var lanes = SIMD.Uint32x4.load(u32s, base);
        var converted = SIMD.Float32x4.fromUint32x4(lanes);
        SIMD.Float32x4.store(f32s, base, converted);
    }
}
// Reference conversion: copy a Float32Array into a Uint32Array one element at
// a time, letting the typed-array store perform the float32 -> uint32
// conversion. Both arrays must have the same length.
function cvt_ftou_scalar(f32s, u32s) {
    var count = f32s.length;
    assertEq(count, u32s.length);
    for (var idx = 0; idx < count; ++idx)
        u32s[idx] = f32s[idx];
}
// SIMD conversion: copy a Float32Array into a Uint32Array four lanes at a
// time using SIMD.Uint32x4.fromFloat32x4. Both arrays must have the same
// length.
function cvt_ftou_simd(f32s, u32s) {
    assertEq(f32s.length, u32s.length);
    for (var base = 0; base < f32s.length; base += 4) {
        var lanes = SIMD.Float32x4.load(f32s, base);
        var converted = SIMD.Uint32x4.fromFloat32x4(lanes);
        SIMD.Uint32x4.store(u32s, base, converted);
    }
}
// Assert that two array-likes have equal length and pairwise-equal elements.
function check(a, b) {
    assertEq(a.length, b.length);
    for (var pos = 0; pos < a.length; ++pos)
        assertEq(a[pos], b[pos]);
}
// Uint32x4 --> Float32x4 tests.
//
// The source array is filled with four bands of 2000 values each:
//   [0, 2000)     counting up from 0,
//   [2000, 4000)  counting down from 0x7fffffff,
//   [4000, 6000)  counting up from 0x80000000,
//   [6000, 8000)  counting down from 0xffffffff.
var src = new Uint32Array(8000);
var dst1 = new Float32Array(8000);
var dst2 = new Float32Array(8000);
for (var i = 0; i < 2000; i++) {
src[i] = i;
src[i + 2000] = 0x7fffffff - i;
src[i + 4000] = 0x80000000 + i;
src[i + 6000] = 0xffffffff - i;
}
// Convert with both the scalar and the SIMD implementation and require
// identical results. The comparison is repeated 10 times; presumably this is
// to let the conversion functions warm up past the "ion.warmup.trigger"
// threshold set at the top of the file.
for (var n = 0; n < 10; n++) {
cvt_utof_scalar(src, dst1);
cvt_utof_simd(src, dst2);
check(dst1, dst2);
}
// Float32x4 --> Uint32x4 tests.
//
// The float source is the scalar conversion output from the previous section.
// Note that fsrc is an alias of dst1, not a copy, so the writes below modify
// dst1 as well.
var fsrc = dst1;
var fdst1 = new Uint32Array(8000);
var fdst2 = new Uint32Array(8000);
// The 0xffffffff entries in fsrc round to 0x1.0p32f which throws.
// Go as high as 0x0.ffffffp32f.
// Overwrite the top band (indices [6000, 8000)) with values counting down
// from 0xffffff7f instead.
for (var i = 0; i < 2000; i++) {
fsrc[i + 6000] = 0xffffff7f - i;
}
// Convert with both the scalar and the SIMD implementation and require
// identical results, repeating the comparison 10 times.
for (var n = 0; n < 10; n++) {
cvt_ftou_scalar(fsrc, fdst1);
cvt_ftou_simd(fsrc, fdst2);
check(fdst1, fdst2);
}

Просмотреть файл

@ -4101,12 +4101,28 @@ LIRGenerator::visitSimdConvert(MSimdConvert* ins)
LUse use = useRegister(input);
if (ins->type() == MIRType_Int32x4) {
MOZ_ASSERT(input->type() == MIRType_Float32x4);
LFloat32x4ToInt32x4* lir = new(alloc()) LFloat32x4ToInt32x4(use, temp());
if (!gen->compilingAsmJS())
assignSnapshot(lir, Bailout_BoundsCheck);
define(lir, ins);
switch (ins->signedness()) {
case SimdSign::Signed: {
LFloat32x4ToInt32x4* lir = new(alloc()) LFloat32x4ToInt32x4(use, temp());
if (!gen->compilingAsmJS())
assignSnapshot(lir, Bailout_BoundsCheck);
define(lir, ins);
break;
}
case SimdSign::Unsigned: {
LFloat32x4ToUint32x4* lir =
new (alloc()) LFloat32x4ToUint32x4(use, temp(), temp(LDefinition::INT32X4));
if (!gen->compilingAsmJS())
assignSnapshot(lir, Bailout_BoundsCheck);
define(lir, ins);
break;
}
default:
MOZ_CRASH("Unexpected SimdConvert sign");
}
} else if (ins->type() == MIRType_Float32x4) {
MOZ_ASSERT(input->type() == MIRType_Int32x4);
MOZ_ASSERT(ins->signedness() == SimdSign::Signed, "Unexpected SimdConvert sign");
define(new(alloc()) LInt32x4ToFloat32x4(use), ins);
} else {
MOZ_CRASH("Unknown SIMD kind when generating constant");

Просмотреть файл

@ -3377,7 +3377,10 @@ IonBuilder::boxSimd(CallInfo& callInfo, MInstruction* ins, InlineTypedObject* te
{
MSimdBox* obj = MSimdBox::New(alloc(), constraints(), ins, templateObj,
templateObj->group()->initialHeap(constraints()));
current->add(ins);
// In some cases, ins has already been added to current.
if (!ins->block())
current->add(ins);
current->add(obj);
current->push(obj);
@ -3525,16 +3528,15 @@ IonBuilder::inlineSimdConvert(CallInfo& callInfo, JSNative native, bool isCast,
if (!canInlineSimd(callInfo, native, 1, &templateObj))
return InliningStatus_NotInlined;
// TODO JSO: Implement unsigned integer conversions.
if (sign == SimdSign::Unsigned)
return InliningStatus_NotInlined;
// See comment in inlineSimdBinary
MInstruction* ins;
if (isCast)
// Signed/Unsigned doesn't matter for bitcasts.
ins = MSimdReinterpretCast::New(alloc(), callInfo.getArg(0), fromType, toType);
else
ins = MSimdConvert::New(alloc(), callInfo.getArg(0), fromType, toType);
// Possibly expand into multiple instructions.
ins = MSimdConvert::AddLegalized(alloc(), current, callInfo.getArg(0),
fromType, toType, sign);
return boxSimd(callInfo, ins, templateObj);
}

Просмотреть файл

@ -1037,6 +1037,105 @@ MSimdGeneralShuffle::foldsTo(TempAllocator& alloc)
return MSimdShuffle::New(alloc, vector(0), vector(1), type(), lanes[0], lanes[1], lanes[2], lanes[3]);
}
// Add a SIMD conversion of |obj| from |fromType| to |toType| to |addTo| and
// return the instruction that produces the converted value.
//
// When the conversion can be represented by a single MSimdConvert, that
// instruction is created and added. Otherwise (unsigned Int32x4 -> Float32x4
// on a target without direct support) an equivalent sequence of MIR
// instructions is added and the last one is returned.
MInstruction*
MSimdConvert::AddLegalized(TempAllocator& alloc, MBasicBlock* addTo, MDefinition* obj,
                           MIRType fromType, MIRType toType, SimdSign sign)
{
    const bool intToFloat = fromType == MIRType_Int32x4 && toType == MIRType_Float32x4;
    const bool floatToInt = fromType == MIRType_Float32x4 && toType == MIRType_Int32x4;
    const bool directlySupported =
        SupportsUint32x4FloatConversions || sign != SimdSign::Unsigned;

    // A plain MSimdConvert is used in two situations:
    //  - the target handles the conversion directly (or it is not unsigned);
    //  - Float32x4 -> Uint32x4: that conversion can throw if the input is out
    //    of range, which is handled by the LFloat32x4ToUint32x4 expansion.
    if (directlySupported || floatToInt) {
        MInstruction* convert = New(alloc, obj, fromType, toType, sign);
        addTo->add(convert);
        return convert;
    }

    // This architecture can't do Uint32x4 <-> Float32x4 conversions (Hi SSE!)
    MOZ_ASSERT(sign == SimdSign::Unsigned);

    if (!intToFloat)
        MOZ_CRASH("Unhandled SIMD type conversion");

    // Converting Uint32x4 -> Float32x4. This algorithm is from LLVM.
    //
    // Split the input number into high and low parts:
    //
    //   uint32_t hi = x >> 16;
    //   uint32_t lo = x & 0xffff;
    //
    // Insert these parts as the low mantissa bits in a float32 number with
    // the corresponding exponent:
    //
    //   float fhi = (bits-as-float)(hi | 0x53000000); // 0x1.0p39f + hi*2^16
    //   float flo = (bits-as-float)(lo | 0x4b000000); // 0x1.0p23f + lo
    //
    // Subtract the bias from the hi part:
    //
    //   fhi -= (0x1.0p39 + 0x1.0p23)                  // hi*2^16 - 0x1.0p23
    //
    // And finally combine:
    //
    //   result = flo + fhi                            // lo + hi*2^16.

    // High half: obj >> 16, as a lane-wise unsigned shift.
    MInstruction* shiftAmount = MConstant::New(alloc, Int32Value(16));
    addTo->add(shiftAmount);
    MInstruction* highHalf =
        MSimdShift::New(alloc, obj, shiftAmount, MSimdShift::ursh, MIRType_Int32x4);
    addTo->add(highHalf);

    // Low half: obj & 0xffff, lane-wise.
    MInstruction* lowMask =
        MSimdConstant::New(alloc, SimdConstant::SplatX4(0xffff), MIRType_Int32x4);
    addTo->add(lowMask);
    MInstruction* lowHalf =
        MSimdBinaryBitwise::New(alloc, obj, lowMask, MSimdBinaryBitwise::and_, MIRType_Int32x4);
    addTo->add(lowHalf);

    // OR the float32 exponent patterns into each half.
    MInstruction* highExponent =
        MSimdConstant::New(alloc, SimdConstant::SplatX4(0x53000000), MIRType_Int32x4);
    addTo->add(highExponent);
    MInstruction* highBits =
        MSimdBinaryBitwise::New(alloc, highHalf, highExponent, MSimdBinaryBitwise::or_,
                                MIRType_Int32x4);
    addTo->add(highBits);

    MInstruction* lowExponent =
        MSimdConstant::New(alloc, SimdConstant::SplatX4(0x4b000000), MIRType_Int32x4);
    addTo->add(lowExponent);
    MInstruction* lowBits =
        MSimdBinaryBitwise::New(alloc, lowHalf, lowExponent, MSimdBinaryBitwise::or_,
                                MIRType_Int32x4);
    addTo->add(lowBits);

    // Reinterpret both bit patterns as Float32x4.
    MInstruction* highFloat =
        MSimdReinterpretCast::New(alloc, highBits, MIRType_Int32x4, MIRType_Float32x4);
    addTo->add(highFloat);
    MInstruction* lowFloat =
        MSimdReinterpretCast::New(alloc, lowBits, MIRType_Int32x4, MIRType_Float32x4);
    addTo->add(lowFloat);

    // Remove the bias 0x1.0p39f + 0x1.0p23f from the high part.
    // The constant is spelled in decimal because MSVC doesn't support the
    // hexadecimal float syntax.
    const float biasValue = 549755813888.f + 8388608.f;
    MInstruction* biasVector =
        MSimdConstant::New(alloc, SimdConstant::SplatX4(biasValue), MIRType_Float32x4);
    addTo->add(biasVector);
    MInstruction* highDebiased =
        MSimdBinaryArith::New(alloc, highFloat, biasVector, MSimdBinaryArith::Op_sub,
                              MIRType_Float32x4);
    addTo->add(highDebiased);

    // Sum the two parts to obtain the converted value.
    MInstruction* sum = MSimdBinaryArith::New(alloc, highDebiased, lowFloat,
                                              MSimdBinaryArith::Op_add, MIRType_Float32x4);
    addTo->add(sum);
    return sum;
}
template <typename T>
static void
PrintOpcodeOperation(T* mir, GenericPrinter& out)

Просмотреть файл

@ -1542,10 +1542,18 @@ class MSimdConvert
: public MUnaryInstruction,
public SimdPolicy<0>::Data
{
MSimdConvert(MDefinition* obj, MIRType fromType, MIRType toType)
: MUnaryInstruction(obj)
// When either fromType or toType is an integer vector, should it be treated
// as signed or unsigned. Note that we don't support int-int conversions -
// use MSimdReinterpretCast for that.
SimdSign sign_;
MSimdConvert(MDefinition* obj, MIRType fromType, MIRType toType, SimdSign sign)
: MUnaryInstruction(obj), sign_(sign)
{
MOZ_ASSERT(IsSimdType(toType));
// All conversions are int <-> float, so signedness is required.
MOZ_ASSERT(sign != SimdSign::NotApplicable);
setResultType(toType);
specialization_ = fromType; // expects fromType as input
@ -1562,20 +1570,35 @@ class MSimdConvert
MIRType toType)
{
MOZ_ASSERT(IsSimdType(obj->type()) && fromType == obj->type());
return new(alloc) MSimdConvert(obj, fromType, toType);
// AsmJS only has signed integer vectors for now.
return new(alloc) MSimdConvert(obj, fromType, toType, SimdSign::Signed);
}
static MSimdConvert* New(TempAllocator& alloc, MDefinition* obj, MIRType fromType,
MIRType toType)
MIRType toType, SimdSign sign)
{
return new(alloc) MSimdConvert(obj, fromType, toType);
return new(alloc) MSimdConvert(obj, fromType, toType, sign);
}
// Create a MSimdConvert instruction and add it to the basic block.
// Possibly create and add an equivalent sequence of instructions instead if
// the current target doesn't support the requested conversion directly.
// Return the inserted MInstruction that computes the converted value.
static MInstruction* AddLegalized(TempAllocator& alloc, MBasicBlock* addTo, MDefinition* obj,
MIRType fromType, MIRType toType, SimdSign sign);
SimdSign signedness() const {
return sign_;
}
AliasSet getAliasSet() const override {
return AliasSet::None();
}
bool congruentTo(const MDefinition* ins) const override {
return congruentIfOperandsEqual(ins);
if (!congruentIfOperandsEqual(ins))
return false;
const MSimdConvert* other = ins->toSimdConvert();
return sign_ == other->sign_;
}
ALLOW_CLONE(MSimdConvert)
};

Просмотреть файл

@ -220,6 +220,9 @@ static_assert(JitStackAlignment % SimdMemoryAlignment == 0,
static const uint32_t AsmJSStackAlignment = SimdMemoryAlignment;
// Does this architecture support SIMD conversions between Uint32x4 and Float32x4?
static MOZ_CONSTEXPR_VAR bool SupportsUint32x4FloatConversions = false;
static const Scale ScalePointer = TimesFour;
class Instruction;

Просмотреть файл

@ -176,6 +176,9 @@ static_assert(CodeAlignment % SimdMemoryAlignment == 0,
static const uint32_t AsmJSStackAlignment = SimdMemoryAlignment;
static const int32_t AsmJSGlobalRegBias = 1024;
// Does this architecture support SIMD conversions between Uint32x4 and Float32x4?
static MOZ_CONSTEXPR_VAR bool SupportsUint32x4FloatConversions = false;
class Assembler : public vixl::Assembler
{
public:

Просмотреть файл

@ -98,6 +98,9 @@ static_assert(JitStackAlignment % sizeof(Value) == 0 && JitStackValueAlignment >
static MOZ_CONSTEXPR_VAR uint32_t SimdMemoryAlignment = 8;
static MOZ_CONSTEXPR_VAR uint32_t AsmJSStackAlignment = SimdMemoryAlignment;
// Does this architecture support SIMD conversions between Uint32x4 and Float32x4?
static MOZ_CONSTEXPR_VAR bool SupportsUint32x4FloatConversions = false;
static MOZ_CONSTEXPR_VAR Scale ScalePointer = TimesFour;
class Assembler : public AssemblerMIPSShared

Просмотреть файл

@ -109,6 +109,9 @@ static MOZ_CONSTEXPR_VAR uint32_t SimdMemoryAlignment = 16;
static MOZ_CONSTEXPR_VAR uint32_t AsmJSStackAlignment = SimdMemoryAlignment;
// Does this architecture support SIMD conversions between Uint32x4 and Float32x4?
static MOZ_CONSTEXPR_VAR bool SupportsUint32x4FloatConversions = false;
static MOZ_CONSTEXPR_VAR Scale ScalePointer = TimesEight;
class Assembler : public AssemblerMIPSShared

Просмотреть файл

@ -18,6 +18,9 @@ static const bool SupportsSimd = false;
static const uint32_t SimdMemoryAlignment = 4; // Make it 4 to avoid a bunch of div-by-zero warnings
static const uint32_t AsmJSStackAlignment = 8;
// Does this architecture support SIMD conversions between Uint32x4 and Float32x4?
static MOZ_CONSTEXPR_VAR bool SupportsUint32x4FloatConversions = false;
class Registers
{
public:

Просмотреть файл

@ -3806,6 +3806,29 @@ class LFloat32x4ToInt32x4 : public LInstructionHelper<1, 1, 1>
}
};
// LIR instruction for the Float32x4 -> Uint32x4 conversion.
//
// Float32x4 to Uint32x4 needs one GPR temp and one FloatReg temp.
// The template arguments <1, 1, 2> are passed to LInstructionHelper; this
// class fills in operand 0 and temps 0 and 1.
class LFloat32x4ToUint32x4 : public LInstructionHelper<1, 1, 2>
{
public:
LIR_HEADER(Float32x4ToUint32x4);
// |input| is the vector to convert; |tempR| is stored as temp 0 and |tempF|
// as temp 1.
explicit LFloat32x4ToUint32x4(const LAllocation& input, const LDefinition& tempR,
const LDefinition& tempF)
{
setOperand(0, input);
setTemp(0, tempR);
setTemp(1, tempF);
}
// Temp 0: the general-purpose register temp.
const LDefinition* tempR() {
return getTemp(0);
}
// Temp 1: the float/vector register temp.
const LDefinition* tempF() {
return getTemp(1);
}
// The MIR node this instruction was lowered from, as an MSimdConvert.
const MSimdConvert* mir() const {
return mir_->toSimdConvert();
}
};
// Double raised to a half power.
class LPowHalfD : public LInstructionHelper<1, 1, 0>
{

Просмотреть файл

@ -185,6 +185,7 @@
_(ValueToObjectOrNull) \
_(Int32x4ToFloat32x4) \
_(Float32x4ToInt32x4) \
_(Float32x4ToUint32x4) \
_(Start) \
_(OsrEntry) \
_(OsrValue) \

Просмотреть файл

@ -20,6 +20,9 @@
namespace js {
namespace jit {
// Does this architecture support SIMD conversions between Uint32x4 and Float32x4?
static const bool SupportsUint32x4FloatConversions = false;
#if defined(JS_CODEGEN_X86)
// In bytes: slots needed for potential memory->memory move spills.
// +8 for cycles

Просмотреть файл

@ -2306,6 +2306,91 @@ CodeGeneratorX86Shared::visitOutOfLineSimdFloatToIntCheck(OutOfLineSimdFloatToIn
}
}
// Convert Float32x4 to Uint32x4.
//
// If any input lane value is out of range or NaN, bail out.
//
// Registers taken from |ins|: the input and output vector registers, a
// general-purpose temp (tempR) that receives the vmovmskps lane masks, and a
// vector temp (tempF) that holds the mask of non-A-lanes. The scratch SIMD
// register is used as well, to hold the B computation.
void
CodeGeneratorX86Shared::visitFloat32x4ToUint32x4(LFloat32x4ToUint32x4* ins)
{
FloatRegister in = ToFloatRegister(ins->input());
FloatRegister out = ToFloatRegister(ins->output());
Register temp = ToRegister(ins->tempR());
FloatRegister tempF = ToFloatRegister(ins->tempF());
// Classify lane values into 4 disjoint classes:
//
// N-lanes: in < -0.0
// A-lanes: -0.0 <= in <= 0x0.ffffffp31
// B-lanes: 0x1.0p31 <= in <= 0x0.ffffffp32
// V-lanes: 0x1.0p32 <= in, or isnan(in)
//
// We need to bail out to throw a RangeError if we see any N-lanes or
// V-lanes.
//
// For A-lanes and B-lanes, we make two float -> int32 conversions:
//
// A = cvttps2dq(in)
// B = cvttps2dq(in - 0x1.0p31f)
//
// Note that the subtraction for the B computation is exact for B-lanes.
// There is no rounding, so B is the low 31 bits of the correctly converted
// result.
//
// The cvttps2dq instruction produces 0x80000000 when the input is NaN or
// out of range for a signed int32_t. This conveniently provides the missing
// high bit for B, so the desired result is A for A-lanes and A|B for
// B-lanes.
ScratchSimd128Scope scratch(masm);
// First we need to filter out N-lanes. We need to use a floating point
// comparison to do that because cvttps2dq maps the negative range
// [-0x0.ffffffp0;-0.0] to 0. We can't simply look at the sign bits of in
// because -0.0 is a valid input.
// TODO: It may be faster to let ool code deal with -0.0 and skip the
// vcmpleps here.
// Compare a zero vector against |in| lane-wise, collect the per-lane results
// into a 4-bit mask in |temp|, and bail out unless all four bits are set.
masm.zeroFloat32x4(scratch);
masm.vcmpleps(Operand(in), scratch, scratch);
masm.vmovmskps(scratch, temp);
masm.cmp32(temp, Imm32(15));
bailoutIf(Assembler::NotEqual, ins->snapshot());
// TODO: If the majority of lanes are A-lanes, it could be faster to compute
// A first, use vmovmskps to check for any non-A-lanes and handle them in
// ool code. OTOH, if we're wrong about the lane distribution, that would be
// slower.
// Compute B in |scratch|.
// The bias is added as the negated constant -0x1.0p31f, which performs the
// subtraction described above.
static const float Adjust = 0x80000000; // 0x1.0p31f for the benefit of MSVC.
static const SimdConstant Bias = SimdConstant::SplatX4(-Adjust);
masm.loadConstantFloat32x4(Bias, scratch);
masm.packedAddFloat32(Operand(in), scratch);
masm.convertFloat32x4ToInt32x4(scratch, scratch);
// Compute A in |out|. This is the last time we use |in| and the first time
// we use |out|, so we can tolerate if they are the same register.
masm.convertFloat32x4ToInt32x4(in, out);
// Since we filtered out N-lanes, we can identify A-lanes by the sign bits
// in A: Any A-lanes will be non-negative in A, and B-lanes and V-lanes will
// be 0x80000000 in A. Compute a mask of non-A-lanes into |tempF|.
masm.zeroFloat32x4(tempF);
masm.packedGreaterThanInt32x4(Operand(out), tempF);
// Clear the A-lanes in B.
masm.bitwiseAndX4(Operand(tempF), scratch);
// Compute the final result: A for A-lanes, A|B for B-lanes.
masm.bitwiseOrX4(Operand(scratch), out);
// We still need to filter out the V-lanes. They would show up as 0x80000000
// in both A and B. Since we cleared the valid A-lanes in B, the V-lanes are
// the remaining negative lanes in B.
// Bail out if any lane of the masked B still has its sign bit set.
masm.vmovmskps(scratch, temp);
masm.cmp32(temp, Imm32(0));
bailoutIf(Assembler::NotEqual, ins->snapshot());
}
void
CodeGeneratorX86Shared::visitSimdValueInt32x4(LSimdValueInt32x4* ins)
{

Просмотреть файл

@ -261,6 +261,7 @@ class CodeGeneratorX86Shared : public CodeGeneratorShared
void visitFloat32x4(LFloat32x4* ins);
void visitInt32x4ToFloat32x4(LInt32x4ToFloat32x4* ins);
void visitFloat32x4ToInt32x4(LFloat32x4ToInt32x4* ins);
void visitFloat32x4ToUint32x4(LFloat32x4ToUint32x4* ins);
void visitSimdReinterpretCast(LSimdReinterpretCast* lir);
void visitSimdExtractElementB(LSimdExtractElementB* lir);
void visitSimdExtractElementI(LSimdExtractElementI* lir);