Bug 1115752 - SpiderMonkey: VEX encodings for instructions with leading immediates r=jandem

Dan Gohman 2014-12-28 07:04:13 -08:00
Parent ba261c06cc
Commit e8c913ac57
6 changed files with 366 additions and 208 deletions
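Both encodings that the new *ImmSimd helpers below choose between put the immediate in the same trailing position; only the prefix bytes differ. A minimal sketch for one affected instruction, with byte values per the Intel manuals (register choices arbitrary):

// legacy SSE: pshufd  $0x1b, %xmm1, %xmm0  ->  66 0F 70 C1 1B
// VEX.128:    vpshufd $0x1b, %xmm1, %xmm0  ->  C5 F9 70 C1 1B
//
// 66 is the operand-size prefix and 0F the two-byte opcode escape; the
// two-byte VEX prefix C5 F9 folds both in, and the imm8 stays last.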

View file

@@ -1706,7 +1706,7 @@ class MSimdShuffle : public MBinaryInstruction, public MSimdShuffleBase
{
// Swap operands so that new lanes come from LHS in majority.
// In the balanced case, swap operands if needs be, in order to be able
// to do only one shufps on x86.
// to do only one vshufps on x86.
unsigned lanesFromLHS = (laneX < 4) + (laneY < 4) + (laneZ < 4) + (laneW < 4);
if (lanesFromLHS < 2 || (lanesFromLHS == 2 && laneX >= 4 && laneY >= 4)) {
laneX = (laneX + 4) % 8;

View file

@@ -1496,75 +1496,75 @@ class AssemblerX86Shared : public AssemblerShared
masm.divl_r(divisor.code());
}
void pinsrd(unsigned lane, Register src, FloatRegister dest) {
void vpinsrd(unsigned lane, Register src1, FloatRegister src0, FloatRegister dest) {
MOZ_ASSERT(HasSSE41());
masm.pinsrd_irr(lane, src.code(), dest.code());
masm.vpinsrd_irr(lane, src1.code(), src0.code(), dest.code());
}
void pinsrd(unsigned lane, const Operand &src, FloatRegister dest) {
void vpinsrd(unsigned lane, const Operand &src1, FloatRegister src0, FloatRegister dest) {
MOZ_ASSERT(HasSSE41());
switch (src.kind()) {
switch (src1.kind()) {
case Operand::REG:
masm.pinsrd_irr(lane, src.reg(), dest.code());
masm.vpinsrd_irr(lane, src1.reg(), src0.code(), dest.code());
break;
case Operand::MEM_REG_DISP:
masm.pinsrd_imr(lane, src.disp(), src.base(), dest.code());
masm.vpinsrd_imr(lane, src1.disp(), src1.base(), src0.code(), dest.code());
break;
default:
MOZ_CRASH("unexpected operand kind");
}
}
void pextrd(unsigned lane, FloatRegister src, Register dest) {
void vpextrd(unsigned lane, FloatRegister src, Register dest) {
MOZ_ASSERT(HasSSE41());
masm.pextrd_irr(lane, src.code(), dest.code());
masm.vpextrd_irr(lane, src.code(), dest.code());
}
void pextrd(unsigned lane, FloatRegister src, const Operand &dest) {
void vpextrd(unsigned lane, FloatRegister src, const Operand &dest) {
MOZ_ASSERT(HasSSE41());
switch (dest.kind()) {
case Operand::REG:
masm.pextrd_irr(lane, src.code(), dest.reg());
masm.vpextrd_irr(lane, src.code(), dest.reg());
break;
case Operand::MEM_REG_DISP:
masm.pextrd_imr(lane, src.code(), dest.disp(), dest.base());
masm.vpextrd_irm(lane, src.code(), dest.disp(), dest.base());
break;
default:
MOZ_CRASH("unexpected operand kind");
}
}
void psrldq(Imm32 shift, FloatRegister dest) {
void vpsrldq(Imm32 shift, FloatRegister src0, FloatRegister dest) {
MOZ_ASSERT(HasSSE2());
masm.psrldq_ir(shift.value, dest.code());
masm.vpsrldq_ir(shift.value, src0.code(), dest.code());
}
void psllq(Imm32 shift, FloatRegister dest) {
void vpsllq(Imm32 shift, FloatRegister src0, FloatRegister dest) {
MOZ_ASSERT(HasSSE2());
masm.psllq_ir(shift.value, dest.code());
masm.vpsllq_ir(shift.value, src0.code(), dest.code());
}
void psrlq(Imm32 shift, FloatRegister dest) {
void vpsrlq(Imm32 shift, FloatRegister src0, FloatRegister dest) {
MOZ_ASSERT(HasSSE2());
masm.psrlq_ir(shift.value, dest.code());
masm.vpsrlq_ir(shift.value, src0.code(), dest.code());
}
void vpslld(FloatRegister src1, FloatRegister src0, FloatRegister dest) {
MOZ_ASSERT(HasSSE2());
masm.vpslld_rr(src1.code(), src0.code(), dest.code());
}
void pslld(Imm32 count, FloatRegister dest) {
void vpslld(Imm32 count, FloatRegister src0, FloatRegister dest) {
MOZ_ASSERT(HasSSE2());
masm.pslld_ir(count.value, dest.code());
masm.vpslld_ir(count.value, src0.code(), dest.code());
}
void vpsrad(FloatRegister src1, FloatRegister src0, FloatRegister dest) {
MOZ_ASSERT(HasSSE2());
masm.vpsrad_rr(src1.code(), src0.code(), dest.code());
}
void psrad(Imm32 count, FloatRegister dest) {
void vpsrad(Imm32 count, FloatRegister src0, FloatRegister dest) {
MOZ_ASSERT(HasSSE2());
masm.psrad_ir(count.value, dest.code());
masm.vpsrad_ir(count.value, src0.code(), dest.code());
}
void vpsrld(FloatRegister src1, FloatRegister src0, FloatRegister dest) {
MOZ_ASSERT(HasSSE2());
masm.vpsrld_rr(src1.code(), src0.code(), dest.code());
}
void psrld(Imm32 count, FloatRegister dest) {
void vpsrld(Imm32 count, FloatRegister src0, FloatRegister dest) {
MOZ_ASSERT(HasSSE2());
masm.psrld_ir(count.value, dest.code());
masm.vpsrld_ir(count.value, src0.code(), dest.code());
}
void vcvtsi2sd(const Operand &src1, FloatRegister src0, FloatRegister dest) {
@@ -2073,21 +2073,21 @@ class AssemblerX86Shared : public AssemblerShared
}
}
void pshufd(uint32_t mask, FloatRegister src, FloatRegister dest) {
void vpshufd(uint32_t mask, FloatRegister src, FloatRegister dest) {
MOZ_ASSERT(HasSSE2());
masm.pshufd_irr(mask, src.code(), dest.code());
masm.vpshufd_irr(mask, src.code(), dest.code());
}
void pshufd(uint32_t mask, const Operand &src, FloatRegister dest) {
void vpshufd(uint32_t mask, const Operand &src1, FloatRegister dest) {
MOZ_ASSERT(HasSSE2());
switch (src.kind()) {
switch (src1.kind()) {
case Operand::FPREG:
masm.pshufd_irr(mask, src.fpu(), dest.code());
masm.vpshufd_irr(mask, src1.fpu(), dest.code());
break;
case Operand::MEM_REG_DISP:
masm.pshufd_imr(mask, src.disp(), src.base(), dest.code());
masm.vpshufd_imr(mask, src1.disp(), src1.base(), dest.code());
break;
case Operand::MEM_ADDRESS32:
masm.pshufd_imr(mask, src.address(), dest.code());
masm.vpshufd_imr(mask, src1.address(), dest.code());
break;
default:
MOZ_CRASH("unexpected operand kind");
@@ -2109,21 +2109,21 @@ class AssemblerX86Shared : public AssemblerShared
MOZ_ASSERT(HasSSE2());
masm.vunpckhps_rr(src1.code(), src0.code(), dest.code());
}
void shufps(uint32_t mask, FloatRegister src, FloatRegister dest) {
void vshufps(uint32_t mask, FloatRegister src1, FloatRegister src0, FloatRegister dest) {
MOZ_ASSERT(HasSSE2());
masm.shufps_irr(mask, src.code(), dest.code());
masm.vshufps_irr(mask, src1.code(), src0.code(), dest.code());
}
void shufps(uint32_t mask, const Operand &src, FloatRegister dest) {
void vshufps(uint32_t mask, const Operand &src1, FloatRegister src0, FloatRegister dest) {
MOZ_ASSERT(HasSSE2());
switch (src.kind()) {
switch (src1.kind()) {
case Operand::FPREG:
masm.shufps_irr(mask, src.fpu(), dest.code());
masm.vshufps_irr(mask, src1.fpu(), src0.code(), dest.code());
break;
case Operand::MEM_REG_DISP:
masm.shufps_imr(mask, src.disp(), src.base(), dest.code());
masm.vshufps_imr(mask, src1.disp(), src1.base(), src0.code(), dest.code());
break;
case Operand::MEM_ADDRESS32:
masm.shufps_imr(mask, src.address(), dest.code());
masm.vshufps_imr(mask, src1.address(), src0.code(), dest.code());
break;
default:
MOZ_CRASH("unexpected operand kind");
@@ -2303,15 +2303,15 @@ class AssemblerX86Shared : public AssemblerShared
MOZ_ASSERT(HasSSE2());
masm.vsqrtss_rr(src1.code(), src0.code(), dest.code());
}
void roundsd(X86Assembler::RoundingMode mode, FloatRegister src, FloatRegister dest) {
void vroundsd(X86Assembler::RoundingMode mode, FloatRegister src1, FloatRegister src0, FloatRegister dest) {
MOZ_ASSERT(HasSSE41());
masm.roundsd_rr(mode, src.code(), dest.code());
masm.vroundsd_irr(mode, src1.code(), src0.code(), dest.code());
}
void roundss(X86Assembler::RoundingMode mode, FloatRegister src, FloatRegister dest) {
void vroundss(X86Assembler::RoundingMode mode, FloatRegister src1, FloatRegister src0, FloatRegister dest) {
MOZ_ASSERT(HasSSE41());
masm.roundss_rr(mode, src.code(), dest.code());
masm.vroundss_irr(mode, src1.code(), src0.code(), dest.code());
}
unsigned insertpsMask(SimdLane sourceLane, SimdLane destLane, unsigned zeroMask = 0)
unsigned vinsertpsMask(SimdLane sourceLane, SimdLane destLane, unsigned zeroMask = 0)
{
// Note that the sourceLane bits are ignored in the case of a source
// memory operand, and the source is the given 32-bit memory location.
@@ -2322,9 +2322,9 @@ class AssemblerX86Shared : public AssemblerShared
MOZ_ASSERT(ret < 256);
return ret;
}
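// Assuming the standard SSE4.1 INSERTPS immediate layout (bits 7:6 =
// source lane, bits 5:4 = destination lane, bits 3:0 = zero mask), a
// worked example of the mask built above:
//   vinsertpsMask(LaneX, LaneZ) == (0 << 6) | (2 << 4) | 0 == 0x20,
//   i.e. take lane 0 of the source, write it to lane 2 of the
//   destination, and zero nothing.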
void insertps(FloatRegister src, FloatRegister dest, unsigned mask) {
void vinsertps(uint32_t mask, FloatRegister src1, FloatRegister src0, FloatRegister dest) {
MOZ_ASSERT(HasSSE41());
masm.insertps_irr(mask, src.code(), dest.code());
masm.vinsertps_irr(mask, src1.code(), src0.code(), dest.code());
}
unsigned blendpsMask(bool x, bool y, bool z, bool w) {
return x | (y << 1) | (z << 2) | (w << 3);
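// Per the SSE4.1 spec, each set bit in the blendps immediate selects that
// lane from the second source, so for example blendpsMask(true, false,
// true, false) == 0b0101 yields { src2.x, src1.y, src2.z, src1.w }
// (src1/src2 being placeholder names for the two inputs).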

View file

@@ -332,6 +332,15 @@ private:
OP_GROUP5_Ev = 0xFF
};
enum ShiftID {
Shift_vpsrld = 2,
Shift_vpsrlq = 2,
Shift_vpsrldq = 3,
Shift_vpsrad = 4,
Shift_vpslld = 6,
Shift_vpsllq = 6
};
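// These values are the ModRM reg-field opcode extensions (the /digit in
// the Intel manuals) that select the operation within the immediate-form
// shift group; a sketch of the legacy encodings they pick:
//   psrld $n, %xmm1  ->  66 0F 72 /2 ib   (Shift_vpsrld == 2)
//   psrad $n, %xmm1  ->  66 0F 72 /4 ib   (Shift_vpsrad == 4)
//   pslld $n, %xmm1  ->  66 0F 72 /6 ib   (Shift_vpslld == 6)
//   psrlq, psrldq and psllq use opcode 0F 73 with /2, /3 and /6.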
enum TwoByteOpcodeID {
OP2_UD2 = 0x0B,
OP2_MOVSD_VsdWsd = 0x10,
@@ -428,6 +437,7 @@ private:
case OP2_MOVSD_WsdVsd: // also OP2_MOVPS_WpsVps
case OP2_MOVAPS_WsdVsd:
case OP2_MOVDQ_WdqVdq:
case OP3_PEXTRD_EdVdqIb:
return true;
default:
break;
@@ -2755,18 +2765,15 @@ public:
void vcmpps_rr(uint8_t order, XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
{
twoByteOpSimd("vcmpps", VEX_PS, OP2_CMPPS_VpsWps, src1, src0, dst);
m_formatter.immediate8s(order);
twoByteOpImmSimd("vcmpps", VEX_PS, OP2_CMPPS_VpsWps, order, src1, src0, dst);
}
void vcmpps_mr(uint8_t order, int32_t offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
{
twoByteOpSimd("vcmpps", VEX_PS, OP2_CMPPS_VpsWps, offset, base, src0, dst);
m_formatter.immediate8s(order);
twoByteOpImmSimd("vcmpps", VEX_PS, OP2_CMPPS_VpsWps, order, offset, base, src0, dst);
}
void vcmpps_mr(uint8_t order, const void* address, XMMRegisterID src0, XMMRegisterID dst)
{
twoByteOpSimd("vcmpps", VEX_PS, OP2_CMPPS_VpsWps, address, src0, dst);
m_formatter.immediate8s(order);
twoByteOpImmSimd("vcmpps", VEX_PS, OP2_CMPPS_VpsWps, order, address, src0, dst);
}
void vrcpps_rr(XMMRegisterID src, XMMRegisterID dst) {
@@ -2977,50 +2984,30 @@ public:
twoByteOpSimd("vpandn", VEX_PD, OP2_PANDNDQ_VdqWdq, address, src0, dst);
}
void pshufd_irr(uint32_t mask, XMMRegisterID src, XMMRegisterID dst)
void vpshufd_irr(uint32_t mask, XMMRegisterID src, XMMRegisterID dst)
{
spew("pshufd $0x%x, %s, %s", mask, nameFPReg(src), nameFPReg(dst));
m_formatter.prefix(PRE_SSE_66);
m_formatter.twoByteOp(OP2_PSHUFD_VdqWdqIb, (RegisterID)src, (RegisterID)dst);
m_formatter.immediate8u(mask);
twoByteOpImmSimd("vpshufd", VEX_PD, OP2_PSHUFD_VdqWdqIb, mask, src, X86Registers::invalid_xmm, dst);
}
void vpshufd_imr(uint32_t mask, int32_t offset, RegisterID base, XMMRegisterID dst)
{
twoByteOpImmSimd("vpshufd", VEX_PD, OP2_PSHUFD_VdqWdqIb, mask, offset, base, X86Registers::invalid_xmm, dst);
}
void vpshufd_imr(uint32_t mask, const void* address, XMMRegisterID dst)
{
twoByteOpImmSimd("vpshufd", VEX_PD, OP2_PSHUFD_VdqWdqIb, mask, address, X86Registers::invalid_xmm, dst);
}
void pshufd_imr(uint32_t mask, int32_t offset, RegisterID base, XMMRegisterID dst)
void vshufps_irr(uint32_t mask, XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
{
MOZ_ASSERT(mask < 256);
spew("pshufd $0x%x, " MEM_ob ", %s", mask, ADDR_ob(offset, base), nameFPReg(dst));
m_formatter.prefix(PRE_SSE_66);
m_formatter.twoByteOp(OP2_PSHUFD_VdqWdqIb, offset, base, (RegisterID)dst);
m_formatter.immediate8u(mask);
twoByteOpImmSimd("vshufps", VEX_PS, OP2_SHUFPS_VpsWpsIb, mask, src1, src0, dst);
}
void pshufd_imr(uint32_t mask, const void* address, XMMRegisterID dst)
void vshufps_imr(uint32_t mask, int32_t offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
{
spew("pshufd $0x%x, %p, %s", mask, address, nameFPReg(dst));
m_formatter.prefix(PRE_SSE_66);
m_formatter.twoByteOp(OP2_PSHUFD_VdqWdqIb, address, (RegisterID)dst);
m_formatter.immediate8u(mask);
twoByteOpImmSimd("vshufps", VEX_PS, OP2_SHUFPS_VpsWpsIb, mask, offset, base, src0, dst);
}
void shufps_irr(uint32_t mask, XMMRegisterID src, XMMRegisterID dst)
void vshufps_imr(uint32_t mask, const void* address, XMMRegisterID src0, XMMRegisterID dst)
{
spew("shufps $0x%x, %s, %s", mask, nameFPReg(src), nameFPReg(dst));
m_formatter.twoByteOp(OP2_SHUFPS_VpsWpsIb, (RegisterID)src, (RegisterID)dst);
m_formatter.immediate8u(mask);
}
void shufps_imr(uint32_t mask, int32_t offset, RegisterID base, XMMRegisterID dst)
{
spew("shufps $0x%x, " MEM_ob ", %s", mask, ADDR_ob(offset, base), nameFPReg(dst));
m_formatter.twoByteOp(OP2_SHUFPS_VpsWpsIb, offset, base, (RegisterID)dst);
m_formatter.immediate8u(mask);
}
void shufps_imr(uint32_t mask, const void* address, XMMRegisterID dst)
{
spew("shufps $0x%x, %p, %s", mask, address, nameFPReg(dst));
m_formatter.twoByteOp(OP2_SHUFPS_VpsWpsIb, address, (RegisterID)dst);
m_formatter.immediate8u(mask);
twoByteOpImmSimd("vshufps", VEX_PS, OP2_SHUFPS_VpsWpsIb, mask, address, src0, dst);
}
void vmovhlps_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
@@ -3033,28 +3020,22 @@ public:
twoByteOpSimd("vmovlhps", VEX_PS, OP2_MOVLHPS_VqUq, src1, src0, dst);
}
void psrldq_ir(int shift, XMMRegisterID dest)
void vpsrldq_ir(uint32_t count, XMMRegisterID src, XMMRegisterID dst)
{
spew("psrldq $%d, %s", shift, nameFPReg(dest));
m_formatter.prefix(PRE_SSE_66);
m_formatter.twoByteOp(OP2_PSRLDQ_Vd, (RegisterID)dest, (RegisterID)3);
m_formatter.immediate8s(shift);
MOZ_ASSERT(count < 16);
shiftOpImmSimd("vpsrldq", OP2_PSRLDQ_Vd, Shift_vpsrldq, count, src, dst);
}
void psllq_ir(int shift, XMMRegisterID dest)
void vpsllq_ir(uint32_t count, XMMRegisterID src, XMMRegisterID dst)
{
spew("psllq $%d, %s", shift, nameFPReg(dest));
m_formatter.prefix(PRE_SSE_66);
m_formatter.twoByteOp(OP2_PSRLDQ_Vd, (RegisterID)dest, (RegisterID)6);
m_formatter.immediate8s(shift);
MOZ_ASSERT(count < 64);
shiftOpImmSimd("vpsllq", OP2_PSRLDQ_Vd, Shift_vpsllq, count, src, dst);
}
void psrlq_ir(int shift, XMMRegisterID dest)
void vpsrlq_ir(uint32_t count, XMMRegisterID src, XMMRegisterID dst)
{
spew("psrlq $%d, %s", shift, nameFPReg(dest));
m_formatter.prefix(PRE_SSE_66);
m_formatter.twoByteOp(OP2_PSRLDQ_Vd, (RegisterID)dest, (RegisterID)2);
m_formatter.immediate8s(shift);
MOZ_ASSERT(count < 64);
shiftOpImmSimd("vpsrlq", OP2_PSRLDQ_Vd, Shift_vpsrlq, count, src, dst);
}
void vpslld_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
@@ -3062,12 +3043,10 @@ public:
twoByteOpSimd("vpslld", VEX_PD, OP2_PSLLD_VdqWdq, src1, src0, dst);
}
void pslld_ir(int32_t count, XMMRegisterID dst)
void vpslld_ir(uint32_t count, XMMRegisterID src, XMMRegisterID dst)
{
spew("pslld $%d, %s", count, nameFPReg(dst));
m_formatter.prefix(PRE_SSE_66);
m_formatter.twoByteOp(OP2_PSLLD_UdqIb, (RegisterID)dst, (RegisterID)6);
m_formatter.immediate8s(int8_t(count));
MOZ_ASSERT(count < 32);
shiftOpImmSimd("vpslld", OP2_PSLLD_UdqIb, Shift_vpslld, count, src, dst);
}
void vpsrad_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
@@ -3075,12 +3054,10 @@ public:
twoByteOpSimd("vpsrad", VEX_PD, OP2_PSRAD_VdqWdq, src1, src0, dst);
}
void psrad_ir(int32_t count, XMMRegisterID dst)
void vpsrad_ir(int32_t count, XMMRegisterID src, XMMRegisterID dst)
{
spew("psrad $%d, %s", count, nameFPReg(dst));
m_formatter.prefix(PRE_SSE_66);
m_formatter.twoByteOp(OP2_PSRAD_UdqIb, (RegisterID)dst, (RegisterID)4);
m_formatter.immediate8s(int8_t(count));
MOZ_ASSERT(count < 32);
shiftOpImmSimd("vpsrad", OP2_PSRAD_UdqIb, Shift_vpsrad, count, src, dst);
}
void vpsrld_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
@@ -3088,12 +3065,10 @@ public:
twoByteOpSimd("vpsrld", VEX_PD, OP2_PSRLD_VdqWdq, src1, src0, dst);
}
void psrld_ir(int32_t count, XMMRegisterID dst)
void vpsrld_ir(uint32_t count, XMMRegisterID src, XMMRegisterID dst)
{
spew("psrld $%d, %s", count, nameFPReg(dst));
m_formatter.prefix(PRE_SSE_66);
m_formatter.twoByteOp(OP2_PSRLD_UdqIb, (RegisterID)dst, (RegisterID)2);
m_formatter.immediate8s(int8_t(count));
MOZ_ASSERT(count < 32);
shiftOpImmSimd("vpsrld", OP2_PSRLD_UdqIb, Shift_vpsrld, count, src, dst);
}
void vmovmskpd_rr(XMMRegisterID src, RegisterID dst)
@@ -3432,12 +3407,10 @@ public:
twoByteOpSimd("vmulss", VEX_SS, OP2_MULSD_VsdWsd, offset, base, src0, dst);
}
void pextrw_irr(int whichWord, XMMRegisterID src, RegisterID dst)
void vpextrw_irr(uint32_t whichWord, XMMRegisterID src, RegisterID dst)
{
FIXME_INSN_PRINTING;
m_formatter.prefix(PRE_SSE_66);
m_formatter.twoByteOp(OP2_PEXTRW_GdUdIb, (RegisterID)src, (RegisterID)dst);
m_formatter.immediate8(whichWord);
MOZ_ASSERT(whichWord < 8);
twoByteOpImmSimdInt32("vpextrw", VEX_PD, OP2_PEXTRW_GdUdIb, whichWord, src, dst);
}
void vsubsd_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
@@ -3580,58 +3553,40 @@ public:
twoByteOpSimd("vsqrtss", VEX_SS, OP2_SQRTSS_VssWss, src1, src0, dst);
}
void roundsd_rr(RoundingMode mode, XMMRegisterID src, XMMRegisterID dst)
void vroundsd_irr(RoundingMode mode, XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
{
spew("roundsd $%d, %s, %s", (int)mode, nameFPReg(src), nameFPReg(dst));
m_formatter.prefix(PRE_SSE_66);
m_formatter.threeByteOp(OP3_ROUNDSD_VsdWsd, ESCAPE_ROUNDSD, (RegisterID)src, (RegisterID)dst);
m_formatter.immediate8u(mode);
threeByteOpImmSimd("vroundsd", VEX_PD, OP3_ROUNDSD_VsdWsd, ESCAPE_ROUNDSD, mode, src1, src0, dst);
}
void roundss_rr(RoundingMode mode, XMMRegisterID src, XMMRegisterID dst)
void vroundss_irr(RoundingMode mode, XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
{
spew("roundss $%d, %s, %s", (int)mode, nameFPReg(src), nameFPReg(dst));
m_formatter.prefix(PRE_SSE_66);
m_formatter.threeByteOp(OP3_ROUNDSS_VsdWsd, ESCAPE_ROUNDSD, (RegisterID)src, (RegisterID)dst);
m_formatter.immediate8(mode); // modes are the same for roundsd and roundss
threeByteOpImmSimd("vroundss", VEX_PD, OP3_ROUNDSS_VsdWsd, ESCAPE_ROUNDSD, mode, src1, src0, dst);
}
void insertps_irr(unsigned mask, XMMRegisterID src, XMMRegisterID dst)
void vinsertps_irr(uint32_t mask, XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
{
spew("insertps $0x%x, %s, %s", mask, nameFPReg(src), nameFPReg(dst));
m_formatter.prefix(PRE_SSE_66);
m_formatter.threeByteOp(OP3_INSERTPS_VpsUps, ESCAPE_INSERTPS, (RegisterID)src, (RegisterID)dst);
m_formatter.immediate8u(mask);
threeByteOpImmSimd("vinsertps", VEX_PD, OP3_INSERTPS_VpsUps, ESCAPE_INSERTPS, mask, src1, src0, dst);
}
void pinsrd_irr(unsigned lane, RegisterID src, XMMRegisterID dst)
void vpinsrd_irr(unsigned lane, RegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
{
MOZ_ASSERT(lane < 4);
spew("pinsrd $0x%x, %s, %s", lane, nameIReg(4, src), nameFPReg(dst));
m_formatter.prefix(PRE_SSE_66);
m_formatter.threeByteOp(OP3_PINSRD_VdqEdIb, ESCAPE_PINSRD, (RegisterID)src, (RegisterID)dst);
m_formatter.immediate8u(lane);
threeByteOpImmInt32Simd("vpinsrd", VEX_PD, OP3_PINSRD_VdqEdIb, ESCAPE_PINSRD, lane, src1, src0, dst);
}
void pinsrd_imr(unsigned lane, int32_t offset, RegisterID base, XMMRegisterID dst)
void vpinsrd_imr(unsigned lane, int32_t offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
{
MOZ_ASSERT(lane < 4);
spew("pinsrd $0x%x, " MEM_ob ", %s", lane, ADDR_ob(offset, base), nameFPReg(dst));
m_formatter.prefix(PRE_SSE_66);
m_formatter.threeByteOp(OP3_PINSRD_VdqEdIb, ESCAPE_PINSRD, offset, base, (RegisterID)dst);
m_formatter.immediate8u(lane);
threeByteOpImmInt32Simd("vpinsrd", VEX_PD, OP3_PINSRD_VdqEdIb, ESCAPE_PINSRD, lane, offset, base, src0, dst);
}
void pextrd_irr(unsigned lane, XMMRegisterID src, RegisterID dst)
void vpextrd_irr(unsigned lane, XMMRegisterID src, RegisterID dst)
{
MOZ_ASSERT(lane < 4);
spew("pextrd $0x%x, %s, %s", lane, nameFPReg(src), nameIReg(4, dst));
m_formatter.prefix(PRE_SSE_66);
m_formatter.threeByteOp(OP3_PEXTRD_EdVdqIb, ESCAPE_PEXTRD, (RegisterID)dst, (RegisterID)src);
m_formatter.immediate8u(lane);
threeByteOpImmSimdInt32("vpextrd", VEX_PD, OP3_PEXTRD_EdVdqIb, ESCAPE_PEXTRD, lane, (XMMRegisterID)dst, (RegisterID)src);
}
void pextrd_imr(unsigned lane, XMMRegisterID src, int32_t offset, RegisterID base)
void vpextrd_irm(unsigned lane, XMMRegisterID src, int32_t offset, RegisterID base)
{
MOZ_ASSERT(lane < 4);
spew("pextrd $0x%x, %s, " MEM_ob, lane, nameFPReg(src), ADDR_ob(offset, base));
@@ -3644,16 +3599,14 @@ public:
{
MOZ_ASSERT(imm < 16);
// Despite being a "ps" instruction, vblendps is encoded with the "pd" prefix.
threeByteOpSimd("vblendps", VEX_PD, OP3_BLENDPS_VpsWpsIb, ESCAPE_BLENDPS, src1, src0, dst);
m_formatter.immediate8u(imm);
threeByteOpImmSimd("vblendps", VEX_PD, OP3_BLENDPS_VpsWpsIb, ESCAPE_BLENDPS, imm, src1, src0, dst);
}
void vblendps_imr(unsigned imm, int32_t offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
{
MOZ_ASSERT(imm < 16);
// Despite being a "ps" instruction, vblendps is encoded with the "pd" prefix.
threeByteOpSimd("vblendps", VEX_PD, OP3_BLENDPS_VpsWpsIb, ESCAPE_BLENDPS, offset, base, src0, dst);
m_formatter.immediate8u(imm);
threeByteOpImmSimd("vblendps", VEX_PD, OP3_BLENDPS_VpsWpsIb, ESCAPE_BLENDPS, imm, offset, base, src0, dst);
}
void vblendvps_rr(XMMRegisterID mask, XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst) {
@@ -4160,6 +4113,25 @@ private:
m_formatter.twoByteOpVex(ty, opcode, (RegisterID)rm, src0, dst);
}
void twoByteOpImmSimd(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
uint32_t imm, XMMRegisterID rm, XMMRegisterID src0, XMMRegisterID dst)
{
if (useLegacySSEEncoding(src0, dst)) {
spew("%-11s$0x%x, %s, %s", legacySSEOpName(name), imm, nameFPReg(rm), nameFPReg(dst));
m_formatter.legacySSEPrefix(ty);
m_formatter.twoByteOp(opcode, (RegisterID)rm, dst);
m_formatter.immediate8u(imm);
return;
}
if (src0 == X86Registers::invalid_xmm)
spew("%-11s$0x%x, %s, %s", name, imm, nameFPReg(rm), nameFPReg(dst));
else
spew("%-11s$0x%x, %s, %s, %s", name, imm, nameFPReg(rm), nameFPReg(src0), nameFPReg(dst));
m_formatter.twoByteOpVex(ty, opcode, (RegisterID)rm, src0, dst);
m_formatter.immediate8u(imm);
}
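// The callers above illustrate the src0 convention: instructions with no
// VEX.vvvv operand pass X86Registers::invalid_xmm (as vpshufd_irr does),
// which also selects the three-operand spew format, while genuine
// three-operand instructions such as vshufps_irr pass a real src0.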
void twoByteOpSimd(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
int32_t offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
{
@@ -4215,6 +4187,24 @@ private:
m_formatter.twoByteOpVex_disp32(ty, opcode, offset, base, src0, dst);
}
void twoByteOpImmSimd(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
uint32_t imm, int32_t offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
{
if (useLegacySSEEncoding(src0, dst)) {
spew("%-11s$0x%x, " MEM_ob ", %s", legacySSEOpName(name), imm,
ADDR_ob(offset, base), nameFPReg(dst));
m_formatter.legacySSEPrefix(ty);
m_formatter.twoByteOp(opcode, offset, base, dst);
m_formatter.immediate8u(imm);
return;
}
spew("%-11s$0x%x, " MEM_ob ", %s, %s", name, imm, ADDR_ob(offset, base),
nameFPReg(src0), nameFPReg(dst));
m_formatter.twoByteOpVex(ty, opcode, offset, base, src0, dst);
m_formatter.immediate8u(imm);
}
void twoByteOpSimd(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
int32_t offset, RegisterID base, RegisterID index, int scale,
XMMRegisterID src0, XMMRegisterID dst)
@@ -4271,6 +4261,22 @@ private:
m_formatter.twoByteOpVex(ty, opcode, address, src0, dst);
}
void twoByteOpImmSimd(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
uint32_t imm, const void *address, XMMRegisterID src0, XMMRegisterID dst)
{
if (useLegacySSEEncoding(src0, dst)) {
spew("%-11s$0x%x, %p, %s", legacySSEOpName(name), imm, address, nameFPReg(dst));
m_formatter.legacySSEPrefix(ty);
m_formatter.twoByteOp(opcode, address, dst);
m_formatter.immediate8u(imm);
return;
}
spew("%-11s$0x%x, %p, %s, %s", name, imm, address, nameFPReg(src0), nameFPReg(dst));
m_formatter.twoByteOpVex(ty, opcode, address, src0, dst);
m_formatter.immediate8u(imm);
}
void twoByteOpInt32Simd(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
RegisterID rm, XMMRegisterID src0, XMMRegisterID dst)
{
@@ -4345,6 +4351,22 @@ private:
m_formatter.twoByteOpVex(ty, opcode, (RegisterID)rm, X86Registers::invalid_xmm, dst);
}
void twoByteOpImmSimdInt32(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
uint32_t imm, XMMRegisterID rm, RegisterID dst)
{
if (useLegacySSEEncodingForOtherOutput()) {
spew("%-11s$0x%x, %s, %s", legacySSEOpName(name), imm, nameFPReg(rm), nameIReg(4, dst));
m_formatter.legacySSEPrefix(ty);
m_formatter.twoByteOp(opcode, (RegisterID)rm, dst);
m_formatter.immediate8u(imm);
return;
}
spew("%-11s$0x%x, %s, %s", name, imm, nameFPReg(rm), nameIReg(4, dst));
m_formatter.twoByteOpVex(ty, opcode, (RegisterID)rm, X86Registers::invalid_xmm, dst);
m_formatter.immediate8u(imm);
}
#ifdef JS_CODEGEN_X64
void twoByteOpSimdInt64(const char *name, VexOperandType ty, TwoByteOpcodeID opcode,
XMMRegisterID rm, RegisterID dst)
@@ -4416,6 +4438,23 @@ private:
m_formatter.threeByteOpVex(ty, opcode, escape, (RegisterID)rm, src0, dst);
}
void threeByteOpImmSimd(const char *name, VexOperandType ty, ThreeByteOpcodeID opcode,
ThreeByteEscape escape,
uint32_t imm, XMMRegisterID rm, XMMRegisterID src0, XMMRegisterID dst)
{
if (useLegacySSEEncoding(src0, dst)) {
spew("%-11s$0x%x, %s, %s", legacySSEOpName(name), imm, nameFPReg(rm), nameFPReg(dst));
m_formatter.legacySSEPrefix(ty);
m_formatter.threeByteOp(opcode, escape, (RegisterID)rm, dst);
m_formatter.immediate8u(imm);
return;
}
spew("%-11s$0x%x, %s, %s, %s", name, imm, nameFPReg(rm), nameFPReg(src0), nameFPReg(dst));
m_formatter.threeByteOpVex(ty, opcode, escape, (RegisterID)rm, src0, dst);
m_formatter.immediate8u(imm);
}
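// For the three-byte forms the escape byte selects the opcode map (0F 3A
// for the immediate forms here) and the imm8 still trails; for example,
// per the Intel manuals:
//   insertps:  66 0F 3A 21 /r ib
//   vinsertps: VEX.128.66.0F3A.WIG 21 /r ib  (three-byte C4 prefix)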
void threeByteOpSimd(const char *name, VexOperandType ty, ThreeByteOpcodeID opcode,
ThreeByteEscape escape,
int32_t offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
@@ -4433,6 +4472,25 @@ private:
m_formatter.threeByteOpVex(ty, opcode, escape, offset, base, src0, dst);
}
void threeByteOpImmSimd(const char *name, VexOperandType ty, ThreeByteOpcodeID opcode,
ThreeByteEscape escape,
uint32_t imm, int32_t offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
{
if (useLegacySSEEncoding(src0, dst)) {
spew("%-11s$0x%x, " MEM_ob ", %s", legacySSEOpName(name), imm,
ADDR_ob(offset, base), nameFPReg(dst));
m_formatter.legacySSEPrefix(ty);
m_formatter.threeByteOp(opcode, escape, offset, base, dst);
m_formatter.immediate8u(imm);
return;
}
spew("%-11s$0x%x, " MEM_ob ", %s, %s", name, imm, ADDR_ob(offset, base),
nameFPReg(src0), nameFPReg(dst));
m_formatter.threeByteOpVex(ty, opcode, escape, offset, base, src0, dst);
m_formatter.immediate8u(imm);
}
void threeByteOpSimd(const char *name, VexOperandType ty, ThreeByteOpcodeID opcode,
ThreeByteEscape escape,
const void *address, XMMRegisterID src0, XMMRegisterID dst)
@@ -4448,6 +4506,77 @@ private:
m_formatter.threeByteOpVex(ty, opcode, escape, address, src0, dst);
}
void threeByteOpImmInt32Simd(const char *name, VexOperandType ty, ThreeByteOpcodeID opcode,
ThreeByteEscape escape, uint32_t imm,
RegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
{
if (useLegacySSEEncoding(src0, dst)) {
spew("%-11s$0x%x, %s, %s", legacySSEOpName(name), imm, nameIReg(4, src1), nameFPReg(dst));
m_formatter.legacySSEPrefix(ty);
m_formatter.threeByteOp(opcode, escape, src1, dst);
m_formatter.immediate8u(imm);
return;
}
spew("%-11s$0x%x, %s, %s, %s", name, imm, nameIReg(4, src1), nameFPReg(src0), nameFPReg(dst));
m_formatter.threeByteOpVex(ty, opcode, escape, src1, src0, dst);
m_formatter.immediate8u(imm);
}
void threeByteOpImmInt32Simd(const char *name, VexOperandType ty, ThreeByteOpcodeID opcode,
ThreeByteEscape escape, uint32_t imm,
int32_t offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
{
if (useLegacySSEEncoding(src0, dst)) {
spew("%-11s$0x%x, " MEM_ob ", %s", legacySSEOpName(name), imm, ADDR_ob(offset, base), nameFPReg(dst));
m_formatter.legacySSEPrefix(ty);
m_formatter.threeByteOp(opcode, escape, offset, base, dst);
m_formatter.immediate8u(imm);
return;
}
spew("%-11s$0x%x, " MEM_ob ", %s, %s", name, imm, ADDR_ob(offset, base), nameFPReg(src0), nameFPReg(dst));
m_formatter.threeByteOpVex(ty, opcode, escape, offset, base, src0, dst);
m_formatter.immediate8u(imm);
}
void threeByteOpImmSimdInt32(const char *name, VexOperandType ty, ThreeByteOpcodeID opcode,
ThreeByteEscape escape, uint32_t imm,
XMMRegisterID src, RegisterID dst)
{
if (useLegacySSEEncodingForOtherOutput()) {
spew("%-11s$0x%x, %s, %s", legacySSEOpName(name), imm, nameFPReg(src), nameIReg(4, dst));
m_formatter.legacySSEPrefix(ty);
m_formatter.threeByteOp(opcode, escape, (RegisterID)src, dst);
m_formatter.immediate8u(imm);
return;
}
if (opcode == OP3_PEXTRD_EdVdqIb)
spew("%-11s$0x%x, %s, %s", name, imm, nameFPReg((XMMRegisterID)dst), nameIReg(4, (RegisterID)src));
else
spew("%-11s$0x%x, %s, %s", name, imm, nameFPReg(src), nameIReg(4, dst));
m_formatter.threeByteOpVex(ty, opcode, escape, (RegisterID)src, X86Registers::invalid_xmm, dst);
m_formatter.immediate8u(imm);
}
void threeByteOpImmSimdInt32(const char *name, VexOperandType ty, ThreeByteOpcodeID opcode,
ThreeByteEscape escape, uint32_t imm,
int32_t offset, RegisterID base, RegisterID dst)
{
if (useLegacySSEEncodingForOtherOutput()) {
spew("%-11s$0x%x, " MEM_ob ", %s", legacySSEOpName(name), imm, ADDR_ob(offset, base), nameIReg(4, dst));
m_formatter.legacySSEPrefix(ty);
m_formatter.threeByteOp(opcode, escape, offset, base, dst);
m_formatter.immediate8u(imm);
return;
}
spew("%-11s$0x%x, " MEM_ob ", %s", name, imm, ADDR_ob(offset, base), nameIReg(4, dst));
m_formatter.threeByteOpVex(ty, opcode, escape, offset, base, X86Registers::invalid_xmm, dst);
m_formatter.immediate8u(imm);
}
// Blendv is a three-byte op, but the VEX encoding has a different opcode
// than the SSE encoding, so we handle it specially.
void vblendvOpSimd(XMMRegisterID mask, XMMRegisterID rm, XMMRegisterID src0, XMMRegisterID dst)
@@ -4484,6 +4613,22 @@ private:
mask, offset, base, src0, dst);
}
void shiftOpImmSimd(const char *name, TwoByteOpcodeID opcode, ShiftID shiftKind,
uint32_t imm, XMMRegisterID src, XMMRegisterID dst)
{
if (useLegacySSEEncoding(src, dst)) {
spew("%-11s$%d, %s", legacySSEOpName(name), imm, nameFPReg(dst));
m_formatter.legacySSEPrefix(VEX_PD);
m_formatter.twoByteOp(opcode, (RegisterID)dst, (int)shiftKind);
m_formatter.immediate8u(imm);
return;
}
spew("%-11s$%d, %s, %s", name, imm, nameFPReg(src), nameFPReg(dst));
m_formatter.twoByteOpVex(VEX_PD, opcode, (RegisterID)src, dst, (int)shiftKind);
m_formatter.immediate8u(imm);
}
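// Note the operand roles above: the /digit opcode extension occupies
// ModRM.reg, so the VEX immediate-form shifts encode the destination in
// VEX.vvvv and the source in ModRM.rm (per the Intel manuals), e.g.:
//   vpsrld $imm, %xmm_src, %xmm_dst ->
//     ModRM.reg = /2 (Shift_vpsrld), ModRM.rm = src, VEX.vvvv = dst,
//     imm8 last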
static int32_t getInt32(void* where)
{
return reinterpret_cast<int32_t*>(where)[-1];

View file

@@ -1596,7 +1596,7 @@ CodeGeneratorX86Shared::visitFloor(LFloor *lir)
bailoutFrom(&bailout, lir->snapshot());
// Round toward -Infinity.
masm.roundsd(X86Assembler::RoundDown, input, scratch);
masm.vroundsd(X86Assembler::RoundDown, input, scratch, scratch);
bailoutCvttsd2si(scratch, output, lir->snapshot());
} else {
@@ -1653,7 +1653,7 @@ CodeGeneratorX86Shared::visitFloorF(LFloorF *lir)
bailoutFrom(&bailout, lir->snapshot());
// Round toward -Infinity.
masm.roundss(X86Assembler::RoundDown, input, scratch);
masm.vroundss(X86Assembler::RoundDown, input, scratch, scratch);
bailoutCvttss2si(scratch, output, lir->snapshot());
} else {
@@ -1718,7 +1718,7 @@ CodeGeneratorX86Shared::visitCeil(LCeil *lir)
// x <= -1 or x > -0
masm.bind(&lessThanMinusOne);
// Round toward +Infinity.
masm.roundsd(X86Assembler::RoundUp, input, scratch);
masm.vroundsd(X86Assembler::RoundUp, input, scratch, scratch);
bailoutCvttsd2si(scratch, output, lir->snapshot());
return;
}
@@ -1770,7 +1770,7 @@ CodeGeneratorX86Shared::visitCeilF(LCeilF *lir)
// x <= -1 or x > -0
masm.bind(&lessThanMinusOne);
// Round toward +Infinity.
masm.roundss(X86Assembler::RoundUp, input, scratch);
masm.vroundss(X86Assembler::RoundUp, input, scratch, scratch);
bailoutCvttss2si(scratch, output, lir->snapshot());
return;
}
@@ -1845,7 +1845,7 @@ CodeGeneratorX86Shared::visitRound(LRound *lir)
// Add 0.5 and round toward -Infinity. The result is stored in the temp
// register (currently contains 0.5).
masm.addDouble(input, temp);
masm.roundsd(X86Assembler::RoundDown, temp, scratch);
masm.vroundsd(X86Assembler::RoundDown, temp, scratch, scratch);
// Truncate.
bailoutCvttsd2si(scratch, output, lir->snapshot());
@@ -1928,7 +1928,7 @@ CodeGeneratorX86Shared::visitRoundF(LRoundF *lir)
// Add 0.5 and round toward -Infinity. The result is stored in the temp
// register (currently contains 0.5).
masm.addFloat32(input, temp);
masm.roundss(X86Assembler::RoundDown, temp, scratch);
masm.vroundss(X86Assembler::RoundDown, temp, scratch, scratch);
// Truncate.
bailoutCvttss2si(scratch, output, lir->snapshot());
@@ -2093,7 +2093,7 @@ CodeGeneratorX86Shared::visitSimdValueInt32x4(LSimdValueInt32x4 *ins)
masm.vmovd(ToRegister(ins->getOperand(0)), output);
for (size_t i = 1; i < 4; ++i) {
Register r = ToRegister(ins->getOperand(i));
masm.pinsrd(i, r, output);
masm.vpinsrd(i, r, output, output);
}
return;
}
@@ -2140,14 +2140,14 @@ CodeGeneratorX86Shared::visitSimdSplatX4(LSimdSplatX4 *ins)
case MIRType_Int32x4: {
Register r = ToRegister(ins->getOperand(0));
masm.vmovd(r, output);
masm.pshufd(0, output, output);
masm.vpshufd(0, output, output);
break;
}
case MIRType_Float32x4: {
FloatRegister r = ToFloatRegister(ins->getOperand(0));
if (r != output)
masm.moveFloat32x4(r, output);
masm.shufps(0, output, output);
masm.vshufps(0, output, output, output);
break;
}
default:
@@ -2166,7 +2166,7 @@ CodeGeneratorX86Shared::visitSimdExtractElementI(LSimdExtractElementI *ins)
// The value we want to extract is in the low double-word
masm.moveLowInt32(input, output);
} else if (AssemblerX86Shared::HasSSE41()) {
masm.pextrd(lane, input, output);
masm.vpextrd(lane, input, output);
} else {
uint32_t mask = MacroAssembler::ComputeShuffleMask(lane);
masm.shuffleInt32(mask, input, ScratchSimdReg);
@@ -2208,7 +2208,8 @@ CodeGeneratorX86Shared::visitSimdInsertElementI(LSimdInsertElementI *ins)
// value goes into the first component, as vmovd clears out the higher lanes
// of the output.
if (AssemblerX86Shared::HasSSE41()) {
masm.pinsrd(component, value, output);
// TODO: Teach Lowering that we don't need defineReuseInput if we have AVX.
masm.vpinsrd(component, value, vector, output);
return;
}
@@ -2237,7 +2238,7 @@ CodeGeneratorX86Shared::visitSimdInsertElementF(LSimdInsertElementF *ins)
if (AssemblerX86Shared::HasSSE41()) {
// The input value is in the low float32 of the 'value' FloatRegister.
masm.insertps(value, output, masm.insertpsMask(SimdLane::LaneX, ins->lane()));
masm.vinsertps(masm.vinsertpsMask(SimdLane::LaneX, ins->lane()), value, output, output);
return;
}
@@ -2343,7 +2344,7 @@ CodeGeneratorX86Shared::visitSimdShuffle(LSimdShuffle *ins)
unsigned numLanesFromLHS = (x < 4) + (y < 4) + (z < 4) + (w < 4);
MOZ_ASSERT(numLanesFromLHS >= 2);
// When reading this method, remember that shufps takes the first two
// When reading this method, remember that vshufps takes the first two
// lanes of the destination operand (right operand) and the last two
// lanes of the source operand (left operand).
//
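// A scalar model of that selection (a sketch; the mask packs four 2-bit
// lane indices, low bits first):
//   result[0] = dst[mask & 3];
//   result[1] = dst[(mask >> 2) & 3];
//   result[2] = src[(mask >> 4) & 3];
//   result[3] = src[(mask >> 6) & 3];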
@@ -2376,7 +2377,7 @@ CodeGeneratorX86Shared::visitSimdShuffle(LSimdShuffle *ins)
return;
}
// SSE4.1 insertps can handle any single element.
// SSE4.1 vinsertps can handle any single element.
unsigned numLanesUnchanged = (x == 0) + (y == 1) + (z == 2) + (w == 3);
if (AssemblerX86Shared::HasSSE41() && numLanesUnchanged == 3) {
SimdLane srcLane;
@@ -2395,7 +2396,7 @@ CodeGeneratorX86Shared::visitSimdShuffle(LSimdShuffle *ins)
srcLane = SimdLane(w - 4);
dstLane = LaneW;
}
masm.insertps(rhs, out, masm.insertpsMask(srcLane, dstLane));
masm.vinsertps(masm.vinsertpsMask(srcLane, dstLane), rhs, out, out);
return;
}
@@ -2404,21 +2405,21 @@ CodeGeneratorX86Shared::visitSimdShuffle(LSimdShuffle *ins)
if (x < 4 && y < 4) {
if (w >= 4) {
w %= 4;
// T = (Rw Rw Lz Lz) = shufps(firstMask, lhs, rhs)
// T = (Rw Rw Lz Lz) = vshufps(firstMask, lhs, rhs, rhs)
firstMask = MacroAssembler::ComputeShuffleMask(w, w, z, z);
// (Lx Ly Lz Rw) = (Lx Ly Tz Tx) = shufps(secondMask, T, lhs)
// (Lx Ly Lz Rw) = (Lx Ly Tz Tx) = vshufps(secondMask, T, lhs, lhs)
secondMask = MacroAssembler::ComputeShuffleMask(x, y, LaneZ, LaneX);
} else {
MOZ_ASSERT(z >= 4);
z %= 4;
// T = (Rz Rz Lw Lw) = shufps(firstMask, lhs, rhs)
// T = (Rz Rz Lw Lw) = vshufps(firstMask, lhs, rhs, rhs)
firstMask = MacroAssembler::ComputeShuffleMask(z, z, w, w);
// (Lx Ly Rz Lw) = (Lx Ly Tx Tz) = shufps(secondMask, T, lhs)
// (Lx Ly Rz Lw) = (Lx Ly Tx Tz) = vshufps(secondMask, T, lhs, lhs)
secondMask = MacroAssembler::ComputeShuffleMask(x, y, LaneX, LaneZ);
}
masm.shufps(firstMask, lhs, rhsCopy);
masm.shufps(secondMask, rhsCopy, lhs);
masm.vshufps(firstMask, lhs, rhsCopy, rhsCopy);
masm.vshufps(secondMask, rhsCopy, lhs, lhs);
return;
}
@@ -2426,21 +2427,21 @@ CodeGeneratorX86Shared::visitSimdShuffle(LSimdShuffle *ins)
if (y >= 4) {
y %= 4;
// T = (Ry Ry Lx Lx) = shufps(firstMask, lhs, rhs)
// T = (Ry Ry Lx Lx) = vshufps(firstMask, lhs, rhs, rhs)
firstMask = MacroAssembler::ComputeShuffleMask(y, y, x, x);
// (Lx Ry Lz Lw) = (Tz Tx Lz Lw) = shufps(secondMask, lhs, T)
// (Lx Ry Lz Lw) = (Tz Tx Lz Lw) = vshufps(secondMask, lhs, T, T)
secondMask = MacroAssembler::ComputeShuffleMask(LaneZ, LaneX, z, w);
} else {
MOZ_ASSERT(x >= 4);
x %= 4;
// T = (Rx Rx Ly Ly) = shufps(firstMask, lhs, rhs)
// T = (Rx Rx Ly Ly) = vshufps(firstMask, lhs, rhs, rhs)
firstMask = MacroAssembler::ComputeShuffleMask(x, x, y, y);
// (Rx Ly Lz Lw) = (Tx Tz Lz Lw) = shufps(secondMask, lhs, T)
// (Rx Ly Lz Lw) = (Tx Tz Lz Lw) = vshufps(secondMask, lhs, T, T)
secondMask = MacroAssembler::ComputeShuffleMask(LaneX, LaneZ, z, w);
}
masm.shufps(firstMask, lhs, rhsCopy);
masm.shufps(secondMask, lhs, rhsCopy);
masm.vshufps(firstMask, lhs, rhsCopy, rhsCopy);
masm.vshufps(secondMask, lhs, rhsCopy, rhsCopy);
masm.moveFloat32x4(rhsCopy, out);
return;
}
@@ -2500,17 +2501,17 @@ CodeGeneratorX86Shared::visitSimdShuffle(LSimdShuffle *ins)
return;
}
// In one shufps
// In one vshufps
if (x < 4 && y < 4) {
mask = MacroAssembler::ComputeShuffleMask(x, y, z % 4, w % 4);
masm.shufps(mask, rhs, out);
masm.vshufps(mask, rhs, out, out);
return;
}
// At creation, we should have explicitly swapped in this case.
MOZ_ASSERT(!(z >= 4 && w >= 4));
// In two shufps, for the most generic case:
// In two vshufps, for the most generic case:
uint32_t firstMask[4], secondMask[4];
unsigned i = 0, j = 2, k = 0;
@@ -2533,11 +2534,11 @@ CodeGeneratorX86Shared::visitSimdShuffle(LSimdShuffle *ins)
mask = MacroAssembler::ComputeShuffleMask(firstMask[0], firstMask[1],
firstMask[2], firstMask[3]);
masm.shufps(mask, rhs, lhs);
masm.vshufps(mask, rhs, lhs, lhs);
mask = MacroAssembler::ComputeShuffleMask(secondMask[0], secondMask[1],
secondMask[2], secondMask[3]);
masm.shufps(mask, lhs, lhs);
masm.vshufps(mask, lhs, lhs, lhs);
}
void
@@ -2653,14 +2654,14 @@ CodeGeneratorX86Shared::visitSimdBinaryArithIx4(LSimdBinaryArithIx4 *ins)
// ScratchSimdReg contains (Rx, _, Rz, _) where R is the resulting vector.
FloatRegister temp = ToFloatRegister(ins->temp());
masm.pshufd(MacroAssembler::ComputeShuffleMask(LaneY, LaneY, LaneW, LaneW), lhs, lhs);
masm.pshufd(MacroAssembler::ComputeShuffleMask(LaneY, LaneY, LaneW, LaneW), rhs, temp);
masm.vpshufd(MacroAssembler::ComputeShuffleMask(LaneY, LaneY, LaneW, LaneW), lhs, lhs);
masm.vpshufd(MacroAssembler::ComputeShuffleMask(LaneY, LaneY, LaneW, LaneW), rhs, temp);
masm.vpmuludq(temp, lhs, lhs);
// lhs contains (Ry, _, Rw, _) where R is the resulting vector.
masm.shufps(MacroAssembler::ComputeShuffleMask(LaneX, LaneZ, LaneX, LaneZ), ScratchSimdReg, lhs);
masm.vshufps(MacroAssembler::ComputeShuffleMask(LaneX, LaneZ, LaneX, LaneZ), ScratchSimdReg, lhs, lhs);
// lhs contains (Ry, Rw, Rx, Rz)
masm.shufps(MacroAssembler::ComputeShuffleMask(LaneZ, LaneX, LaneW, LaneY), lhs, lhs);
masm.vshufps(MacroAssembler::ComputeShuffleMask(LaneZ, LaneX, LaneW, LaneY), lhs, lhs, lhs);
return;
}
case MSimdBinaryArith::Div:
@@ -2904,7 +2905,19 @@ CodeGeneratorX86Shared::visitSimdShift(LSimdShift *ins)
// 1068028.
const LAllocation *val = ins->value();
if (val->isConstant()) {
Imm32 count(ToInt32(val));
int32_t c = ToInt32(val);
if (c > 31) {
switch (ins->operation()) {
case MSimdShift::lsh:
case MSimdShift::ursh:
masm.zeroInt32x4(out);
return;
default:
c = 31;
break;
}
}
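// The clamp above gives constant out-of-range counts these per-lane
// semantics (a scalar model of one int32 lane, as a sketch):
//   lsh:  c > 31 ? 0 : v << c
//   ursh: c > 31 ? 0 : int32_t(uint32_t(v) >> c)
//   rsh:  v >> (c > 31 ? 31 : c)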
Imm32 count(c);
switch (ins->operation()) {
case MSimdShift::lsh:
masm.packedLeftShiftByScalar(count, out);

View file

@@ -789,14 +789,14 @@ class MacroAssemblerX86Shared : public Assembler
void negateDouble(FloatRegister reg) {
// From MacroAssemblerX86Shared::maybeInlineDouble
vpcmpeqw(ScratchDoubleReg, ScratchDoubleReg, ScratchDoubleReg);
psllq(Imm32(63), ScratchDoubleReg);
vpsllq(Imm32(63), ScratchDoubleReg, ScratchDoubleReg);
// XOR the float in a float register with -0.0.
vxorpd(ScratchDoubleReg, reg, reg); // s ^ 0x80000000000000
}
void negateFloat(FloatRegister reg) {
vpcmpeqw(ScratchFloat32Reg, ScratchFloat32Reg, ScratchFloat32Reg);
psllq(Imm32(31), ScratchFloat32Reg);
vpsllq(Imm32(31), ScratchFloat32Reg, ScratchFloat32Reg);
// XOR the float in a float register with -0.0.
vxorps(ScratchFloat32Reg, reg, reg); // s ^ 0x80000000
@@ -922,19 +922,19 @@ class MacroAssemblerX86Shared : public Assembler
vpslld(src, dest, dest);
}
void packedLeftShiftByScalar(Imm32 count, FloatRegister dest) {
pslld(count, dest);
vpslld(count, dest, dest);
}
void packedRightShiftByScalar(FloatRegister src, FloatRegister dest) {
vpsrad(src, dest, dest);
}
void packedRightShiftByScalar(Imm32 count, FloatRegister dest) {
psrad(count, dest);
vpsrad(count, dest, dest);
}
void packedUnsignedRightShiftByScalar(FloatRegister src, FloatRegister dest) {
vpsrld(src, dest, dest);
}
void packedUnsignedRightShiftByScalar(Imm32 count, FloatRegister dest) {
psrld(count, dest);
vpsrld(count, dest, dest);
}
void loadAlignedFloat32x4(const Address &src, FloatRegister dest) {
@@ -996,7 +996,7 @@ class MacroAssemblerX86Shared : public Assembler
}
void shuffleInt32(uint32_t mask, FloatRegister src, FloatRegister dest) {
pshufd(mask, src, dest);
vpshufd(mask, src, dest);
}
void moveLowInt32(FloatRegister src, Register dest) {
vmovd(src, dest);
@@ -1013,12 +1013,12 @@ class MacroAssemblerX86Shared : public Assembler
// Note: this is useAtStart-safe because src isn't read afterwards.
if (src != dest)
moveFloat32x4(src, dest);
shufps(mask, dest, dest);
vshufps(mask, dest, dest, dest);
}
void shuffleMix(uint32_t mask, const Operand &src, FloatRegister dest) {
// Note this uses shufps, which is a cross-domain penalty on CPUs where it
// Note this uses vshufps, which is a cross-domain penalty on CPUs where it
// applies, but that's the way clang and gcc do it.
shufps(mask, src, dest);
vshufps(mask, src, dest, dest);
}
void moveFloatAsDouble(Register src, FloatRegister dest) {
@@ -1134,7 +1134,7 @@ class MacroAssemblerX86Shared : public Assembler
}
// It is also possible to load several common constants using vpcmpeqw
// to get all ones and then psllq and psrlq to get zeros at the ends,
// to get all ones and then vpsllq and vpsrlq to get zeros at the ends,
// as described in "13.4 Generating constants" of
// "2. Optimizing subroutines in assembly language" by Agner Fog, and as
// previously implemented here. However, with x86 and x64 both using
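// As an example of that idiom, negateDouble above materializes the
// sign-bit mask (the bit pattern of -0.0) without a memory load:
//   vpcmpeqw %xmm0, %xmm0, %xmm0   ; all bits set
//   vpsllq   $63, %xmm0, %xmm0     ; keep only bit 63 of each 64-bit lane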

View file

@@ -868,10 +868,10 @@ class MacroAssemblerX86 : public MacroAssemblerX86Shared
void boxDouble(FloatRegister src, const ValueOperand &dest) {
if (Assembler::HasSSE41()) {
vmovd(src, dest.payloadReg());
pextrd(1, src, dest.typeReg());
vpextrd(1, src, dest.typeReg());
} else {
vmovd(src, dest.payloadReg());
psrldq(Imm32(4), src);
vpsrldq(Imm32(4), src, src);
vmovd(src, dest.typeReg());
}
}
@@ -905,7 +905,7 @@ class MacroAssemblerX86 : public MacroAssemblerX86Shared
MOZ_ASSERT(dest != ScratchDoubleReg);
if (Assembler::HasSSE41()) {
vmovd(src.payloadReg(), dest);
pinsrd(1, src.typeReg(), dest);
vpinsrd(1, src.typeReg(), dest, dest);
} else {
vmovd(src.payloadReg(), dest);
vmovd(src.typeReg(), ScratchDoubleReg);
@@ -919,7 +919,7 @@ class MacroAssemblerX86 : public MacroAssemblerX86Shared
movl(payload, scratch);
vmovd(scratch, dest);
movl(type, scratch);
pinsrd(1, scratch, dest);
vpinsrd(1, scratch, dest, dest);
} else {
movl(payload, scratch);
vmovd(scratch, dest);