Redo X64 asm_fneg to only allocate XMM regs, fix indirect calls, and revert asm_restore to old logic (bug 535706 r=nnethercote+)

The code for indirect calls needed shuffling; we must freeResourcesOf() before
assigning the call address to a register.  The old code was just getting lucky,
and the regstate fixes tickled the latent bug.
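Concretely, in the rewritten indirect-call path the CALL is emitted first (nanojit
assembles backwards), freeResourcesOf() then releases RAX (which usually holds the
call's return value), and only then is the address argument moved into RAX.  A
condensed sketch, using the same helpers that appear in the hunk below:

    // Sketch of the fixed indirect-call ordering; CALLRAX, freeResourcesOf and
    // asm_regarg are the assembler's own helpers, exactly as in the diff below.
    CALLRAX();                                      // call rax (emitted first; codegen runs backwards)

    // Call this now so that the arg setup can involve 'rr'.
    freeResourcesOf(ins);                           // releases RAX, normally allocated as the call's result

    // Only after RAX is released may the call address be assigned to it.
    asm_regarg(ARGSIZE_P, ins->arg(--argc), RAX);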

asm_restore() can be stricter once we eliminate all cases where an F64 instruction
can be assigned to a GPR.  The only known remaining case is asm_quad, which is used
for both LIR_float and LIR_quad; that should be fixed by bug 534310.
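Given that caveat, the reverted asm_restore() keys the reload off the register class
rather than the instruction type.  A condensed sketch of that logic, matching the
hunk below (d is the value's stack slot, r the register being restored):

    // Dispatch on the register class: an XMM register can only hold an F64,
    // while a GPR may hold a 64-bit or 32-bit value -- and, until bug 534310,
    // occasionally an F64 that asm_quad put there, so ins->isF64() alone
    // cannot drive the choice.
    int d = findMemFor(ins);
    if (IsFpReg(r)) {
        NanoAssert(ins->isF64());
        MOVSDRM(r, d, FP);      // reload double from stack slot d into an XMM reg
    } else if (ins->isN64()) {
        MOVQRM(r, d, FP);       // 64-bit reload into a GPR
    } else {
        NanoAssert(ins->isI32());
        NanoAssert(IsGpReg(r));
        MOVLRM(r, d, FP);       // 32-bit reload into a GPR
    }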

--HG--
extra : convert_revision : e46657bf61cade04961da7e8abbb004385aaad7b
Edwin Smith 2010-03-15 21:52:41 -04:00
Parent 148417c45f
Commit a64e9de660
2 changed files with 44 additions and 32 deletions

View file

@@ -451,7 +451,8 @@ namespace nanojit
// XORPD because it's one byte shorter. This is ok because it's only used for
// zeroing an XMM register; hence the single argument.
// Also note that (unlike most SSE2 instructions) XORPS does not have a prefix, thus emitrr() should be used.
void Assembler::XORPS( R r) { emitrr(X64_xorps, r,r); asm_output("xorps %s, %s", RQ(r),RQ(r)); }
void Assembler::XORPS( R r) { emitrr(X64_xorps, r,r); asm_output("xorps %s, %s", RQ(r),RQ(r)); }
void Assembler::XORPS( R l, R r) { emitrr(X64_xorps, l,r); asm_output("xorps %s, %s", RQ(l),RQ(r)); }
void Assembler::DIVSD( R l, R r) { emitprr(X64_divsd, l,r); asm_output("divsd %s, %s", RQ(l),RQ(r)); }
void Assembler::MULSD( R l, R r) { emitprr(X64_mulsd, l,r); asm_output("mulsd %s, %s", RQ(l),RQ(r)); }
void Assembler::ADDSD( R l, R r) { emitprr(X64_addsd, l,r); asm_output("addsd %s, %s", RQ(l),RQ(r)); }
@@ -804,7 +805,7 @@ namespace nanojit
// binary op with integer registers
void Assembler::asm_arith(LIns *ins) {
Register rr, ra, rb;
Register rr, ra, rb = UnspecifiedReg; // init to shut GCC up
switch (ins->opcode()) {
case LIR_lsh: case LIR_qilsh:
@@ -895,8 +896,7 @@ namespace nanojit
ArgSize sizes[MAXARGS];
int argc = call->get_sizes(sizes);
bool indirect = call->isIndirect();
if (!indirect) {
if (!call->isIndirect()) {
verbose_only(if (_logc->lcbits & LC_Assembly)
outputf(" %p:", _nIns);
)
@@ -908,16 +908,21 @@ namespace nanojit
CALLRAX();
asm_quad(RAX, (uint64_t)target, /*canClobberCCs*/true);
}
// Call this now so that the arg setup can involve 'rr'.
freeResourcesOf(ins);
} else {
// Indirect call: we assign the address arg to RAX since it's not
// used for regular arguments, and is otherwise scratch since it's
// clobbered by the call.
asm_regarg(ARGSIZE_P, ins->arg(--argc), RAX);
CALLRAX();
}
// Call this now so that the arg setup can involve 'rr'.
freeResourcesOf(ins);
// Call this now so that the arg setup can involve 'rr'.
freeResourcesOf(ins);
// Assign the call address to RAX. Must happen after freeResourcesOf()
// since RAX is usually the return value and will be allocated until that point.
asm_regarg(ARGSIZE_P, ins->arg(--argc), RAX);
}
#ifdef _WIN64
int stk_used = 32; // always reserve 32byte shadow area
@@ -1380,15 +1385,13 @@ namespace nanojit
}
else {
int d = findMemFor(ins);
if (ins->isF64()) {
NanoAssert(IsFpReg(r));
if (IsFpReg(r)) {
NanoAssert(ins->isF64());
MOVSDRM(r, d, FP);
} else if (ins->isI64()) {
NanoAssert(IsGpReg(r));
} else if (ins->isN64()) {
MOVQRM(r, d, FP);
} else {
NanoAssert(ins->isI32());
NanoAssert(IsGpReg(r));
MOVLRM(r, d, FP);
}
}
@@ -1689,6 +1692,8 @@ namespace nanojit
// Register clean-up for 2-address style unary ops of the form R = (op) R.
// Pairs with beginOp1Regs() and beginOp2Regs().
void Assembler::endOpRegs(LIns* ins, Register rr, Register ra) {
(void) rr; // quell warnings when NanoAssert is compiled out.
LIns* a = ins->oprnd1();
// We're finished with 'ins'.
@@ -1706,29 +1711,35 @@ namespace nanojit
void Assembler::asm_fneg(LIns *ins) {
Register rr, ra;
if (isS32((uintptr_t)negateMask) || isTargetWithinS32((NIns*)negateMask)) {
beginOp1Regs(ins, FpRegs, rr, ra);
if (isS32((uintptr_t)negateMask)) {
// builtin code is in bottom or top 2GB addr space, use absolute addressing
XORPSA(rr, (int32_t)(uintptr_t)negateMask);
} else {
// jit code is within +/-2GB of builtin code, use rip-relative
XORPSM(rr, (NIns*)negateMask);
}
if (ra != rr)
asm_nongp_copy(rr,ra);
endOpRegs(ins, rr, ra);
beginOp1Regs(ins, FpRegs, rr, ra);
if (isS32((uintptr_t)negateMask)) {
// builtin code is in bottom or top 2GB addr space, use absolute addressing
XORPSA(rr, (int32_t)(uintptr_t)negateMask);
} else if (isTargetWithinS32((NIns*)negateMask)) {
// jit code is within +/-2GB of builtin code, use rip-relative
XORPSM(rr, (NIns*)negateMask);
} else {
// This is just hideous - can't use RIP-relative load, can't use
// absolute-address load, and can't move imm64 const to XMM.
// so do it all in a GPR. hrmph.
rr = prepareResultReg(ins, GpRegs);
ra = findRegFor(ins->oprnd1(), GpRegs & ~rmask(rr));
XORQRR(rr, ra); // xor rr, ra
asm_quad(rr, negateMask[0], /*canClobberCCs*/true); // mov rr, 0x8000000000000000
freeResourcesOf(ins);
// Solution: move negateMask into a temp GP register, then copy to
// a temp XMM register.
// Nb: we don't want any F64 values to end up in a GpReg, nor any
// I64 values to end up in an FpReg.
//
// # 'gt' and 'ga' are temporary GpRegs.
// # ins->oprnd1() is in 'rr' (FpRegs)
// mov gt, 0x8000000000000000
// mov rt, gt
// xorps rr, rt
Register rt = registerAllocTmp(FpRegs & ~(rmask(ra)|rmask(rr)));
Register gt = registerAllocTmp(GpRegs);
XORPS(rr, rt);
MOVQXR(rt, gt);
asm_quad(gt, negateMask[0], /*canClobberCCs*/true);
}
if (ra != rr)
asm_nongp_copy(rr,ra);
endOpRegs(ins, rr, ra);
}
void Assembler::asm_spill(Register rr, int d, bool /*pop*/, bool quad) {

View file

@@ -478,6 +478,7 @@ namespace nanojit
void MOVSXDR(Register l, Register r);\
void MOVZX8(Register l, Register r);\
void XORPS(Register r);\
void XORPS(Register l, Register r);\
void DIVSD(Register l, Register r);\
void MULSD(Register l, Register r);\
void ADDSD(Register l, Register r);\