diff --git a/js/src/nanojit/NativeX64.cpp b/js/src/nanojit/NativeX64.cpp
index 0a8caacd79d..298f984e9ef 100644
--- a/js/src/nanojit/NativeX64.cpp
+++ b/js/src/nanojit/NativeX64.cpp
@@ -99,6 +99,11 @@ namespace nanojit
         "ah", "ch", "dh", "bh"
     };
 
+    const char *gpRegNames16[] = {
+        "ax",  "cx",  "dx",   "bx",   "spx",  "bpx",  "six",  "dix",
+        "r8x", "r9x", "r10x", "r11x", "r12x", "r13x", "r14x", "r15x"
+    };
+
 #ifdef _DEBUG
     #define TODO(x) todo(#x)
     static void todo(const char *s) {
@@ -240,6 +245,11 @@ namespace nanojit
         emit(rexprb(mod_rr(op, r, b), r, b));
     }
 
+    // disp32 modrm8 form, when the disp fits in the instruction (opcode is 1-3 bytes)
+    void Assembler::emitrm8(uint64_t op, Register r, int32_t d, Register b) {
+        emit(rexrb8(mod_disp32(op, r, b, d), r, b));
+    }
+
     // disp32 modrm form, when the disp fits in the instruction (opcode is 1-3 bytes)
     void Assembler::emitrm(uint64_t op, Register r, int32_t d, Register b) {
         emit(rexrb(mod_disp32(op, r, b, d), r, b));
@@ -343,6 +353,7 @@ namespace nanojit
     }
 
 #define RB(r)   gpRegNames8[(r)]
+#define RS(r)   gpRegNames16[(r)]
 #define RBhi(r) gpRegNames8hi[(r)]
 #define RL(r)   gpRegNames32[(r)]
 #define RQ(r)   gpn(r)
@@ -439,6 +450,8 @@ namespace nanojit
     void Assembler::SUBSD(   R l, R r)  { emitprr(X64_subsd,   l,r); asm_output("subsd %s, %s",   RQ(l),RQ(r)); }
     void Assembler::CVTSQ2SD(R l, R r)  { emitprr(X64_cvtsq2sd,l,r); asm_output("cvtsq2sd %s, %s",RQ(l),RQ(r)); }
     void Assembler::CVTSI2SD(R l, R r)  { emitprr(X64_cvtsi2sd,l,r); asm_output("cvtsi2sd %s, %s",RQ(l),RL(r)); }
+    void Assembler::CVTSS2SD(R l, R r)  { emitprr(X64_cvtss2sd,l,r); asm_output("cvtss2sd %s, %s",RQ(l),RL(r)); }
+    void Assembler::CVTSD2SS(R l, R r)  { emitprr(X64_cvtsd2ss,l,r); asm_output("cvtsd2ss %s, %s",RL(l),RQ(r)); }
     void Assembler::UCOMISD( R l, R r)  { emitprr(X64_ucomisd, l,r); asm_output("ucomisd %s, %s", RQ(l),RQ(r)); }
     void Assembler::MOVQRX(  R l, R r)  { emitprr(X64_movqrx,  r,l); asm_output("movq %s, %s",    RQ(l),RQ(r)); } // Nb: r and l are deliberately reversed within the emitprr() call.
     void Assembler::MOVQXR(  R l, R r)  { emitprr(X64_movqxr,  l,r); asm_output("movq %s, %s",    RQ(l),RQ(r)); }
@@ -483,14 +496,21 @@ namespace nanojit
     void Assembler::LEAQRM(R r1, I d, R r2)  { emitrm(X64_leaqrm,r1,d,r2); asm_output("leaq %s, %d(%s)",RQ(r1),d,RQ(r2)); }
     void Assembler::MOVLRM(R r1, I d, R r2)  { emitrm(X64_movlrm,r1,d,r2); asm_output("movl %s, %d(%s)",RL(r1),d,RQ(r2)); }
     void Assembler::MOVQRM(R r1, I d, R r2)  { emitrm(X64_movqrm,r1,d,r2); asm_output("movq %s, %d(%s)",RQ(r1),d,RQ(r2)); }
+    void Assembler::MOVBMR(R r1, I d, R r2)  { emitrm8(X64_movbmr,r1,d,r2); asm_output("movb %d(%s), %s",d,RQ(r1),RB(r2)); }
+    void Assembler::MOVSMR(R r1, I d, R r2)  { emitprm(X64_movsmr,r1,d,r2); asm_output("movs %d(%s), %s",d,RQ(r1),RS(r2)); }
     void Assembler::MOVLMR(R r1, I d, R r2)  { emitrm(X64_movlmr,r1,d,r2); asm_output("movl %d(%s), %s",d,RQ(r1),RL(r2)); }
     void Assembler::MOVQMR(R r1, I d, R r2)  { emitrm(X64_movqmr,r1,d,r2); asm_output("movq %d(%s), %s",d,RQ(r1),RQ(r2)); }
 
     void Assembler::MOVZX8M( R r1, I d, R r2) { emitrm_wide(X64_movzx8m, r1,d,r2); asm_output("movzxb %s, %d(%s)",RQ(r1),d,RQ(r2)); }
     void Assembler::MOVZX16M(R r1, I d, R r2) { emitrm_wide(X64_movzx16m,r1,d,r2); asm_output("movzxs %s, %d(%s)",RQ(r1),d,RQ(r2)); }
 
+    void Assembler::MOVSX8M( R r1, I d, R r2) { emitrm_wide(X64_movsx8m, r1,d,r2); asm_output("movsxb %s, %d(%s)",RQ(r1),d,RQ(r2)); }
+    void Assembler::MOVSX16M(R r1, I d, R r2) { emitrm_wide(X64_movsx16m,r1,d,r2); asm_output("movsxs %s, %d(%s)",RQ(r1),d,RQ(r2)); }
+
+    void Assembler::MOVSDRM(R r1, I d, R r2)  { emitprm(X64_movsdrm,r1,d,r2); asm_output("movsd %s, %d(%s)",RQ(r1),d,RQ(r2)); }
     void Assembler::MOVSDMR(R r1, I d, R r2)  { emitprm(X64_movsdmr,r1,d,r2); asm_output("movsd %d(%s), %s",d,RQ(r1),RQ(r2)); }
+    void Assembler::MOVSSRM(R r1, I d, R r2)  { emitprm(X64_movssrm,r1,d,r2); asm_output("movss %s, %d(%s)",RQ(r1),d,RQ(r2)); }
+    void Assembler::MOVSSMR(R r1, I d, R r2)  { emitprm(X64_movssmr,r1,d,r2); asm_output("movss %d(%s), %s",d,RQ(r1),RQ(r2)); }
 
     void Assembler::JMP8( S n, NIns* t)    { emit_target8(n, X64_jmp8,t); asm_output("jmp %p", t); }
@@ -1339,58 +1359,62 @@ namespace nanojit
             // xmm <- xmm: use movaps. movsd r,r causes partial register stall
             MOVAPSR(d, s);
         } else {
+            NanoAssert(IsFpReg(d) && !IsFpReg(s));
             // xmm <- gpr: use movq xmm, r/m64 (66 REX.W 0F 6E /r)
             MOVQXR(d, s);
         }
     }
 
-    void Assembler::regalloc_load(LIns *ins, Register &rr, int32_t &dr, Register &rb) {
+    void Assembler::regalloc_load(LIns *ins, RegisterMask allow, Register &rr, int32_t &dr, Register &rb) {
         dr = ins->disp();
         LIns *base = ins->oprnd1();
         rb = getBaseReg(ins->opcode(), base, dr, BaseRegs);
-        if (ins->isUnusedOrHasUnknownReg()) {
-            // use a gpr in case we're copying a non-double
-            rr = prepResultReg(ins, GpRegs & ~rmask(rb));
+        if (ins->isUnusedOrHasUnknownReg() || !(allow & rmask(ins->getReg()))) {
+            rr = prepResultReg(ins, allow & ~rmask(rb));
         } else {
             // keep already assigned register
             rr = ins->getReg();
+            NanoAssert(allow & rmask(rr));
             freeRsrcOf(ins, false);
         }
     }
 
     void Assembler::asm_load64(LIns *ins) {
+        Register rr, rb;
+        int32_t dr;
         switch (ins->opcode()) {
         case LIR_ldq:
         case LIR_ldqc:
-            // handled by mainline code below for now
+            regalloc_load(ins, GpRegs, rr, dr, rb);
+            if (IsGpReg(rr)) {
+                // general 64bit load, 32bit const displacement
+                MOVQRM(rr, dr, rb);
+            } else {
+                NanoAssert(IsFpReg(rr));
+                // load 64bits into XMM. don't know if double or int64, assume double.
+                MOVSDRM(rr, dr, rb);
+            }
             break;
         case LIR_ld32f:
         case LIR_ldc32f:
-            NanoAssertMsg(0, "NJ_EXPANDED_LOADSTORE_SUPPORTED not yet supported for this architecture");
-            return;
+            regalloc_load(ins, FpRegs, rr, dr, rb);
+            NanoAssert(IsFpReg(rr));
+            CVTSS2SD(rr, rr);
+            MOVSSRM(rr, dr, rb);
+            break;
         default:
             NanoAssertMsg(0, "asm_load64 should never receive this LIR opcode");
-            return;
+            break;
         }
-
-        Register rr, rb;
-        int32_t dr;
-        regalloc_load(ins, rr, dr, rb);
-        if (IsGpReg(rr)) {
-            // general 64bit load, 32bit const displacement
-            MOVQRM(rr, dr, rb);
-        } else {
-            // load 64bits into XMM. don't know if double or int64, assume double.
-            MOVSDRM(rr, dr, rb);
-        }
     }
 
     void Assembler::asm_load32(LIns *ins) {
         NanoAssert(!ins->isQuad());
         Register r, b;
         int32_t d;
-        regalloc_load(ins, r, d, b);
+        regalloc_load(ins, GpRegs, r, d, b);
         LOpcode op = ins->opcode();
         switch(op) {
         case LIR_ldzb:
@@ -1406,40 +1430,32 @@ namespace nanojit
             MOVLRM( r, d, b);
             break;
         case LIR_ldsb:
-        case LIR_ldss:
         case LIR_ldcsb:
+            MOVSX8M( r, d, b);
+            break;
+        case LIR_ldss:
         case LIR_ldcss:
-            NanoAssertMsg(0, "NJ_EXPANDED_LOADSTORE_SUPPORTED not yet supported for this architecture");
-            return;
+            MOVSX16M( r, d, b);
+            break;
         default:
             NanoAssertMsg(0, "asm_load32 should never receive this LIR opcode");
-            return;
+            break;
         }
     }
 
     void Assembler::asm_store64(LOpcode op, LIns *value, int d, LIns *base) {
         NanoAssert(value->isQuad());
-        switch (op) {
-        case LIR_stqi:
-            // handled by mainline code below for now
-            break;
-        case LIR_st32f:
-            NanoAssertMsg(0, "NJ_EXPANDED_LOADSTORE_SUPPORTED not yet supported for this architecture");
-            return;
-        default:
-            NanoAssertMsg(0, "asm_store64 should never receive this LIR opcode");
-            return;
-        }
-
         Register b = getBaseReg(LIR_stqi, base, d, BaseRegs);
+        Register r;
 
         // if we have to choose a register, use a GPR, but not the base reg
-        Register r;
         if (value->isUnusedOrHasUnknownReg()) {
             RegisterMask allow;
+            // If op is LIR_st32f and we have no reg, prefer FPR over GPR: saves an instruction later,
+            // and the value is almost certainly going to be operated on as FP later anyway.
            // XXX: isFloat doesn't cover float/fmod! see bug 520208.
-            if (value->isFloat() || value->isop(LIR_float) || value->isop(LIR_fmod)) {
+            if (op == LIR_st32f || value->isFloat() || value->isop(LIR_float) || value->isop(LIR_fmod)) {
                 allow = FpRegs;
             } else {
                 allow = GpRegs;
@@ -1449,37 +1465,76 @@ namespace nanojit
            r = value->getReg();
        }
 
-        if (IsGpReg(r)) {
-            // gpr store
-            MOVQMR(r, d, b);
-        }
-        else {
-            // xmm store
-            MOVSDMR(r, d, b);
+        switch (op) {
+            case LIR_stqi:
+            {
+                if (IsGpReg(r)) {
+                    // gpr store
+                    MOVQMR(r, d, b);
+                }
+                else {
+                    // xmm store
+                    MOVSDMR(r, d, b);
+                }
+                break;
+            }
+            case LIR_st32f:
+            {
+                // need a scratch FPR reg
+                Register t = registerAllocTmp(FpRegs & ~rmask(r));
+
+                // store
+                MOVSSMR(t, d, b);
+
+                // cvt to single-precision
+                if (IsGpReg(r))
+                {
+                    CVTSD2SS(t, t);
+                    MOVQXR(t, r); // xmm <- gpr: use movq xmm, r/m64 (66 REX.W 0F 6E /r)
+                }
+                else
+                {
+                    NanoAssert(IsFpReg(r));
+                    CVTSD2SS(t, r);
+                }
+                XORPS(t); // break dependency chains
+                break;
+            }
+            default:
+                NanoAssertMsg(0, "asm_store64 should never receive this LIR opcode");
+                break;
         }
     }
 
     void Assembler::asm_store32(LOpcode op, LIns *value, int d, LIns *base) {
-        switch (op) {
-        case LIR_sti:
-            // handled by mainline code below for now
-            break;
-        case LIR_stb:
-        case LIR_sts:
-            NanoAssertMsg(0, "NJ_EXPANDED_LOADSTORE_SUPPORTED not yet supported for this architecture");
-            return;
-        default:
-            NanoAssertMsg(0, "asm_store32 should never receive this LIR opcode");
-            return;
-        }
+        // quirk of x86-64: reg cannot appear to be ah/bh/ch/dh
+        // for single-byte stores with REX prefix
+        const RegisterMask SrcRegs =
+            (op == LIR_stb) ?
+            (GpRegs & ~(1<<RSP) & ~(1<<RBP) & ~(1<<RSI) & ~(1<<RDI)) :
+            GpRegs;
 
         NanoAssert(!value->isQuad());
         Register b = getBaseReg(LIR_sti, base, d, BaseRegs);
-        Register r = findRegFor(value, GpRegs & ~rmask(b));
+        Register r = findRegFor(value, SrcRegs & ~rmask(b));
+
+        switch (op) {
+            case LIR_stb:
+                MOVBMR(r, d, b);
+                break;
+            case LIR_sts:
+                MOVSMR(r, d, b);
+                break;
+            case LIR_sti:
+                MOVLMR(r, d, b);
+                break;
+            default:
+                NanoAssertMsg(0, "asm_store32 should never receive this LIR opcode");
+                break;
+        }
+
-        // store 32bits to 64bit addr. use rex so we can use all 16 regs
-        MOVLMR(r, d, b);
     }
 
     // generate a 64bit constant, must not affect condition codes!
diff --git a/js/src/nanojit/NativeX64.h b/js/src/nanojit/NativeX64.h
index b2e10a3de1b..8bbd7041d9c 100644
--- a/js/src/nanojit/NativeX64.h
+++ b/js/src/nanojit/NativeX64.h
@@ -61,7 +61,7 @@ namespace nanojit
 #define NJ_MAX_STACK_ENTRY 256
 #define NJ_ALIGN_STACK 16
 #define NJ_JTBL_SUPPORTED 1
-#define NJ_EXPANDED_LOADSTORE_SUPPORTED 0
+#define NJ_EXPANDED_LOADSTORE_SUPPORTED 1
 
     enum Register {
         RAX = 0, // 1st int return, # of sse varargs
@@ -191,6 +191,8 @@ namespace nanojit
         X64_cmpqr8  = 0x00F8834800000004LL, // 64bit compare r,int64(imm8)
         X64_cvtsi2sd= 0xC02A0F40F2000005LL, // convert int32 to double r = (double) b
         X64_cvtsq2sd= 0xC02A0F48F2000005LL, // convert int64 to double r = (double) b
+        X64_cvtss2sd= 0xC05A0F40F3000005LL, // convert float to double r = (double) b
+        X64_cvtsd2ss= 0xC05A0F40F2000005LL, // convert double to float r = (float) b
         X64_divsd   = 0xC05E0F40F2000005LL, // divide scalar double r /= b
         X64_mulsd   = 0xC0590F40F2000005LL, // multiply scalar double r *= b
         X64_addsd   = 0xC0580F40F2000005LL, // add scalar double r += b
@@ -230,6 +232,8 @@ namespace nanojit
         X64_learm   = 0x00000000808D4007LL, // 32bit load effective addr reg <- disp32+base
         X64_learip  = 0x00000000058D4807LL, // 64bit RIP-relative lea. reg <- disp32+rip (modrm = 00rrr101 = 05)
         X64_movlr   = 0xC08B400000000003LL, // 32bit mov r <- b
+        X64_movbmr  = 0x0000000080884007LL, // 8bit store r -> [b+d32]
+        X64_movsmr  = 0x8089406600000004LL, // 16bit store r -> [b+d32]
         X64_movlmr  = 0x0000000080894007LL, // 32bit store r -> [b+d32]
         X64_movlrm  = 0x00000000808B4007LL, // 32bit load r <- [b+d32]
         X64_movqmr  = 0x0000000080894807LL, // 64bit store gpr -> [b+d32]
@@ -245,10 +249,14 @@ namespace nanojit
         X64_movsdrr = 0xC0100F40F2000005LL, // 64bit mov xmm-r <- xmm-b (upper 64bits unchanged)
         X64_movsdrm = 0x80100F40F2000005LL, // 64bit load xmm-r <- [b+d32] (upper 64 cleared)
         X64_movsdmr = 0x80110F40F2000005LL, // 64bit store xmm-r -> [b+d32]
+        X64_movssrm = 0x80100F40F3000005LL, // 32bit load xmm-r <- [b+d32] (upper 96 cleared)
+        X64_movssmr = 0x80110F40F3000005LL, // 32bit store xmm-r -> [b+d32]
         X64_movsxdr = 0xC063480000000003LL, // sign extend i32 to i64 r = (int64)(int32) b
         X64_movzx8  = 0xC0B60F4000000004LL, // zero extend i8 to i64 r = (uint64)(uint8) b
         X64_movzx8m = 0x80B60F4000000004LL, // zero extend i8 load to i32 r <- [b+d32]
         X64_movzx16m= 0x80B70F4000000004LL, // zero extend i16 load to i32 r <- [b+d32]
+        X64_movsx8m = 0x80BE0F4000000004LL, // sign extend i8 load to i32 r <- [b+d32]
+        X64_movsx16m= 0x80BF0F4000000004LL, // sign extend i16 load to i32 r <- [b+d32]
         X64_neg     = 0xD8F7400000000003LL, // 32bit two's compliment b = -b
         X64_nop1    = 0x9000000000000001LL, // one byte NOP
         X64_nop2    = 0x9066000000000002LL, // two byte NOP
@@ -359,6 +367,7 @@ namespace nanojit
         void emitr(uint64_t op, Register b) { emitrr(op, (Register)0, b); }\
         void emitr8(uint64_t op, Register b) { emitrr8(op, (Register)0, b); }\
         void emitprr(uint64_t op, Register r, Register b);\
+        void emitrm8(uint64_t op, Register r, int32_t d, Register b);\
         void emitrm(uint64_t op, Register r, int32_t d, Register b);\
         void emitrm_wide(uint64_t op, Register r, int32_t d, Register b);\
         uint64_t emit_disp32(uint64_t op, int32_t d);\
@@ -380,7 +389,7 @@ namespace nanojit
         void asm_arith_imm(LIns*);\
         void regalloc_unary(LIns *ins, RegisterMask allow, Register &rr, Register &ra);\
         void regalloc_binary(LIns *ins, RegisterMask allow, Register &rr, Register &ra, Register &rb);\
-        void regalloc_load(LIns *ins, Register &rr, int32_t &d, Register &rb);\
+        void regalloc_load(LIns *ins, RegisterMask allow, Register &rr, int32_t &d, Register &rb);\
         void dis(NIns *p, int bytes);\
         void asm_cmp(LIns*);\
         void asm_cmp_imm(LIns*);\
@@ -460,6 +469,8 @@ namespace nanojit
         void SUBSD(Register l, Register r);\
         void CVTSQ2SD(Register l, Register r);\
         void CVTSI2SD(Register l, Register r);\
+        void CVTSS2SD(Register l, Register r);\
+        void CVTSD2SS(Register l, Register r);\
         void UCOMISD(Register l, Register r);\
         void MOVQRX(Register l, Register r);\
         void MOVQXR(Register l, Register r);\
@@ -495,12 +506,18 @@ namespace nanojit
         void LEAQRM(Register r1, int d, Register r2);\
         void MOVLRM(Register r1, int d, Register r2);\
         void MOVQRM(Register r1, int d, Register r2);\
+        void MOVBMR(Register r1, int d, Register r2);\
+        void MOVSMR(Register r1, int d, Register r2);\
         void MOVLMR(Register r1, int d, Register r2);\
         void MOVQMR(Register r1, int d, Register r2);\
         void MOVZX8M(Register r1, int d, Register r2);\
         void MOVZX16M(Register r1, int d, Register r2);\
+        void MOVSX8M(Register r1, int d, Register r2);\
+        void MOVSX16M(Register r1, int d, Register r2);\
         void MOVSDRM(Register r1, int d, Register r2);\
         void MOVSDMR(Register r1, int d, Register r2);\
+        void MOVSSMR(Register r1, int d, Register r2);\
+        void MOVSSRM(Register r1, int d, Register r2);\
         void JMP8(size_t n, NIns* t);\
         void JMP32(size_t n, NIns* t);\
         void JMPX(Register indexreg, NIns** table);\