зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1021716: SIMD x86-x64: Implement generic binary shuffle; r=sunfish
This commit is contained in:
Родитель
b2b91caa03
Коммит
ba86b71604
|
@ -266,12 +266,16 @@ class LSimdSwizzleF : public LSimdSwizzleBase
|
|||
};
|
||||
|
||||
// Base class for both int32x4 and float32x4 shuffle instructions.
|
||||
class LSimdShuffle : public LInstructionHelper<1, 2, 0>
|
||||
class LSimdShuffle : public LInstructionHelper<1, 2, 1>
|
||||
{
|
||||
public:
|
||||
LIR_HEADER(SimdShuffle);
|
||||
LSimdShuffle()
|
||||
{}
|
||||
LSimdShuffle(const LAllocation &lhs, const LAllocation &rhs, const LDefinition &temp)
|
||||
{
|
||||
setOperand(0, lhs);
|
||||
setOperand(1, rhs);
|
||||
setTemp(0, temp);
|
||||
}
|
||||
|
||||
const LAllocation *lhs() {
|
||||
return getOperand(0);
|
||||
|
@ -279,6 +283,9 @@ class LSimdShuffle : public LInstructionHelper<1, 2, 0>
|
|||
const LAllocation *rhs() {
|
||||
return getOperand(1);
|
||||
}
|
||||
const LDefinition *temp() {
|
||||
return getTemp(0);
|
||||
}
|
||||
|
||||
int32_t laneX() const { return mir_->toSimdShuffle()->laneX(); }
|
||||
int32_t laneY() const { return mir_->toSimdShuffle()->laneY(); }
|
||||
|
|
|
@ -3836,16 +3836,20 @@ LIRGenerator::visitSimdShuffle(MSimdShuffle *ins)
|
|||
MOZ_ASSERT(IsSimdType(ins->lhs()->type()));
|
||||
MOZ_ASSERT(IsSimdType(ins->rhs()->type()));
|
||||
MOZ_ASSERT(IsSimdType(ins->type()));
|
||||
MOZ_ASSERT(ins->type() == MIRType_Int32x4 || ins->type() == MIRType_Float32x4);
|
||||
|
||||
if (ins->type() == MIRType_Int32x4 || ins->type() == MIRType_Float32x4) {
|
||||
MDefinition *lhs = ins->lhs();
|
||||
MDefinition *rhs = ins->rhs();
|
||||
LSimdShuffle *lir = new (alloc()) LSimdShuffle;
|
||||
return lowerForFPU(lir, ins, lhs, rhs);
|
||||
}
|
||||
bool zFromLHS = ins->laneZ() < 4;
|
||||
bool wFromLHS = ins->laneW() < 4;
|
||||
uint32_t lanesFromLHS = (ins->laneX() < 4) + (ins->laneY() < 4) + zFromLHS + wFromLHS;
|
||||
|
||||
MOZ_CRASH("Unknown SIMD kind when getting lane");
|
||||
return false;
|
||||
LUse lhs = useRegisterAtStart(ins->lhs());
|
||||
LUse rhs = useRegister(ins->rhs());
|
||||
|
||||
// See the codegen (visitSimdShuffle) for details about these requirements.
|
||||
LDefinition temp = (lanesFromLHS == 3) ? tempCopy(ins->rhs(), 1) : LDefinition::BogusTemp();
|
||||
|
||||
LSimdShuffle *lir = new (alloc()) LSimdShuffle(lhs, rhs, temp);
|
||||
return defineReuseInput(lir, ins, 0);
|
||||
}
|
||||
|
||||
bool
|
||||
|
|
|
@ -1673,6 +1673,18 @@ class MSimdShuffle : public MBinaryInstruction, public MSimdShuffleBase
|
|||
MIRType type, int32_t laneX, int32_t laneY, int32_t laneZ,
|
||||
int32_t laneW)
|
||||
{
|
||||
// Swap operands so that the majority of the new lanes come from the LHS.
|
||||
// In the balanced case, swap operands if need be, so that a single
|
||||
// shufps is enough on x86.
|
||||
unsigned lanesFromLHS = (laneX < 4) + (laneY < 4) + (laneZ < 4) + (laneW < 4);
|
||||
if (lanesFromLHS < 2 || (lanesFromLHS == 2 && laneX >= 4 && laneY >=4)) {
|
||||
laneX = (laneX + 4) % 8;
|
||||
laneY = (laneY + 4) % 8;
|
||||
laneZ = (laneZ + 4) % 8;
|
||||
laneW = (laneW + 4) % 8;
|
||||
mozilla::Swap(lhs, rhs);
|
||||
}
|
||||
|
||||
return new(alloc) MSimdShuffle(lhs, rhs, type, laneX, laneY, laneZ, laneW);
|
||||
}
|
||||
|
||||
|
|
|
@ -2416,12 +2416,128 @@ bool
|
|||
CodeGeneratorX86Shared::visitSimdShuffle(LSimdShuffle *ins)
|
||||
{
|
||||
FloatRegister lhs = ToFloatRegister(ins->lhs());
|
||||
Operand rhs = ToOperand(ins->rhs());
|
||||
MOZ_ASSERT(ToFloatRegister(ins->output()) == lhs);
|
||||
FloatRegister rhs = ToFloatRegister(ins->rhs());
|
||||
FloatRegister out = ToFloatRegister(ins->output());
|
||||
MOZ_ASSERT(out == lhs); // define reuse input
|
||||
|
||||
uint32_t mask = MacroAssembler::ComputeShuffleMask(ins->laneX(), ins->laneY(), ins->laneZ() - 4,
|
||||
ins->laneW() - 4);
|
||||
masm.shuffleMix(mask, rhs, lhs);
|
||||
uint32_t x = ins->laneX();
|
||||
uint32_t y = ins->laneY();
|
||||
uint32_t z = ins->laneZ();
|
||||
uint32_t w = ins->laneW();
|
||||
|
||||
// Check that lanes come from LHS in majority:
|
||||
unsigned numLanesFromLHS = (x < 4) + (y < 4) + (z < 4) + (w < 4);
|
||||
MOZ_ASSERT(numLanesFromLHS >= 2);
|
||||
|
||||
// When reading this method, remember that shufps selects the first two
|
||||
// lanes of its result from the destination operand (right operand) and
|
||||
// the last two lanes from the source operand (left operand).
|
||||
//
|
||||
// Legend for explanations:
|
||||
// - L: LHS
|
||||
// - R: RHS
|
||||
// - T: temporary
|
||||
|
||||
uint32_t mask;
|
||||
|
||||
// Trivial case: all lanes come from only one vector (Lx, Ly, Lz, Lw).
|
||||
if (numLanesFromLHS == 4) {
|
||||
mask = MacroAssembler::ComputeShuffleMask(x, y, z, w);
|
||||
masm.shufps(mask, lhs, out);
|
||||
return true;
|
||||
}
|
||||
|
||||
// One lane comes from the RHS, the three other lanes come from the LHS.
|
||||
if (numLanesFromLHS == 3) {
|
||||
unsigned firstMask = -1, secondMask = -1;
|
||||
|
||||
FloatRegister rhsCopy = ToFloatRegister(ins->temp());
|
||||
|
||||
if (x < 4 && y < 4) {
|
||||
if (w >= 4) {
|
||||
w %= 4;
|
||||
// T = (Rw Rw Lz Lz) = shufps(firstMask, lhs, rhs)
|
||||
firstMask = MacroAssembler::ComputeShuffleMask(w, w, z, z);
|
||||
// (Lx Ly Lz Rw) = (Lx Ly Tz Tx) = shufps(secondMask, T, lhs)
|
||||
secondMask = MacroAssembler::ComputeShuffleMask(x, y, LaneZ, LaneX);
|
||||
} else {
|
||||
MOZ_ASSERT(z >= 4);
|
||||
z %= 4;
|
||||
// T = (Rz Rz Lw Lw) = shufps(firstMask, lhs, rhs)
|
||||
firstMask = MacroAssembler::ComputeShuffleMask(z, z, w, w);
|
||||
// (Lx Ly Rz Lw) = (Lx Ly Tx Tz) = shufps(secondMask, T, lhs)
|
||||
secondMask = MacroAssembler::ComputeShuffleMask(x, y, LaneX, LaneZ);
|
||||
}
|
||||
|
||||
masm.shufps(firstMask, lhs, rhsCopy);
|
||||
masm.shufps(secondMask, rhsCopy, lhs);
|
||||
return true;
|
||||
}
|
||||
|
||||
MOZ_ASSERT(z < 4 && w < 4);
|
||||
|
||||
if (y >= 4) {
|
||||
y %= 4;
|
||||
// T = (Ry Ry Lx Lx) = shufps(firstMask, lhs, rhs)
|
||||
firstMask = MacroAssembler::ComputeShuffleMask(y, y, x, x);
|
||||
// (Lx Ry Lz Lw) = (Tz Tx Lz Lw) = shufps(secondMask, lhs, T)
|
||||
secondMask = MacroAssembler::ComputeShuffleMask(LaneZ, LaneX, z, w);
|
||||
} else {
|
||||
MOZ_ASSERT(x >= 4);
|
||||
x %= 4;
|
||||
// T = (Rx Rx Ly Ly) = shufps(firstMask, lhs, rhs)
|
||||
firstMask = MacroAssembler::ComputeShuffleMask(x, x, y, y);
|
||||
// (Rx Ly Lz Lw) = (Tx Tz Lz Lw) = shufps(secondMask, lhs, T)
|
||||
secondMask = MacroAssembler::ComputeShuffleMask(LaneX, LaneZ, z, w);
|
||||
}
|
||||
|
||||
masm.shufps(firstMask, lhs, rhsCopy);
|
||||
masm.shufps(secondMask, lhs, rhsCopy);
|
||||
masm.movaps(rhsCopy, out);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Two elements from one vector, two other elements from the other
|
||||
MOZ_ASSERT(numLanesFromLHS == 2);
|
||||
|
||||
// This can be done in one shufps when both x and y come from the LHS.
|
||||
if (x < 4 && y < 4) {
|
||||
mask = MacroAssembler::ComputeShuffleMask(x, y, z % 4, w % 4);
|
||||
masm.shufps(mask, rhs, out);
|
||||
return true;
|
||||
}
|
||||
|
||||
// At creation, we should have explicitly swapped in this case.
|
||||
MOZ_ASSERT(!(z >= 4 && w >= 4));
|
||||
|
||||
// In two shufps, for the most generic case:
|
||||
uint32_t firstMask[4], secondMask[4];
|
||||
unsigned i = 0, j = 2, k = 0;
|
||||
|
||||
#define COMPUTE_MASK(lane) \
|
||||
if (lane >= 4) { \
|
||||
firstMask[j] = lane % 4; \
|
||||
secondMask[k++] = j++; \
|
||||
} else { \
|
||||
firstMask[i] = lane; \
|
||||
secondMask[k++] = i++; \
|
||||
}
|
||||
|
||||
COMPUTE_MASK(x)
|
||||
COMPUTE_MASK(y)
|
||||
COMPUTE_MASK(z)
|
||||
COMPUTE_MASK(w)
|
||||
#undef COMPUTE_MASK
|
||||
|
||||
MOZ_ASSERT(i == 2 && j == 4 && k == 4);
|
||||
|
||||
mask = MacroAssembler::ComputeShuffleMask(firstMask[0], firstMask[1],
|
||||
firstMask[2], firstMask[3]);
|
||||
masm.shufps(mask, rhs, lhs);
|
||||
|
||||
mask = MacroAssembler::ComputeShuffleMask(secondMask[0], secondMask[1],
|
||||
secondMask[2], secondMask[3]);
|
||||
masm.shufps(mask, lhs, lhs);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче