Add VFP for floating point ops to nanojit ARM backend.

This commit is contained in:
Vladimir Vukicevic 2008-09-02 22:29:23 -07:00
Parent 0fe0d78272
Commit 05c3cd68da
8 changed files with 872 additions and 268 deletions

View file

@@ -119,7 +119,7 @@ static bool nesting_enabled = true;
static bool oracle_enabled = true;
static bool did_we_check_sse2 = false;
#ifdef DEBUG
#if defined(DEBUG) || defined(INCLUDE_VERBOSE_OUTPUT)
static bool verbose_debug = getenv("TRACEMONKEY") && strstr(getenv("TRACEMONKEY"), "verbose");
#define debug_only_v(x) if (verbose_debug) { x; }
#else
@@ -282,7 +282,7 @@ static bool isi2f(LInsp i)
if (i->isop(LIR_i2f))
return true;
#ifdef NANOJIT_ARM
#if defined(NANOJIT_ARM) && defined(NJ_SOFTFLOAT)
if (i->isop(LIR_qjoin) &&
i->oprnd1()->isop(LIR_call) &&
i->oprnd2()->isop(LIR_callh))
@@ -300,7 +300,7 @@ static bool isu2f(LInsp i)
if (i->isop(LIR_u2f))
return true;
#ifdef NANOJIT_ARM
#if defined(NANOJIT_ARM) && defined(NJ_SOFTFLOAT)
if (i->isop(LIR_qjoin) &&
i->oprnd1()->isop(LIR_call) &&
i->oprnd2()->isop(LIR_callh))
@@ -315,7 +315,7 @@ static bool isu2f(LInsp i)
static LInsp iu2fArg(LInsp i)
{
#ifdef NANOJIT_ARM
#if defined(NANOJIT_ARM) && defined(NJ_SOFTFLOAT)
if (i->isop(LIR_qjoin))
return i->oprnd1()->arg(0);
#endif
@@ -371,7 +371,7 @@ static bool overflowSafe(LIns* i)
((c->constval() > 0)));
}
#ifdef NANOJIT_ARM
#if defined(NJ_SOFTFLOAT)
class SoftFloatFilter: public LirWriter
{
@@ -428,19 +428,6 @@ public:
return out->ins2(LIR_eq, bv, out->insImm(1));
}
// not really a softfloat filter, but needed on ARM --
// arm doesn't mask shifts to 31 like x86 does
if (v == LIR_lsh ||
v == LIR_rsh ||
v == LIR_ush)
{
if (s1->isconst())
s1->setimm16(s1->constval() & 31);
else
s1 = out->ins2(LIR_and, s1, out->insImm(31));
return out->ins2(v, s0, s1);
}
return out->ins2(v, s0, s1);
}
@@ -455,7 +442,7 @@ public:
}
};
#endif
#endif // NJ_SOFTFLOAT
class FuncFilter: public LirWriter
{
@@ -550,6 +537,20 @@ public:
return out->ins2(LIR_add, x, y);
}
}
#ifdef NANOJIT_ARM
else if (v == LIR_lsh ||
v == LIR_rsh ||
v == LIR_ush)
{
// needed on ARM -- arm doesn't mask shifts to 31 like x86 does
if (s1->isconst())
s1->setimm16(s1->constval() & 31);
else
s1 = out->ins2(LIR_and, s1, out->insImm(31));
return out->ins2(v, s0, s1);
}
#endif
return out->ins2(v, s0, s1);
}
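Context for the hunk above: x86's SHL/SHR mask the shift count to 5 bits in hardware, which is what JS semantics require, but ARM's register-specified shifts honor the low byte of the count register, so a count of 32 shifts the value out entirely. A minimal host-side sketch (illustrative, not part of the patch) of the guarantee the inserted LIR_and provides:

#include <stdint.h>

// Mask the count to 0..31 before shifting, as the filter does for
// non-constant counts; constant counts get the same mask via setimm16.
static inline uint32_t jsShl(uint32_t v, uint32_t count)
{
    return v << (count & 31);
}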
@@ -604,7 +605,7 @@ public:
/* In debug mode vpname contains a textual description of the type of the
slot during the forall iteration over all slots. */
#ifdef DEBUG
#if defined(DEBUG) || defined(INCLUDE_VERBOSE_OUTPUT)
#define DEF_VPNAME const char* vpname; unsigned vpnum
#define SET_VPNAME(name) do { vpname = name; vpnum = 0; } while(0)
#define INC_VPNUM() do { ++vpnum; } while(0)
@@ -821,7 +822,7 @@ TraceRecorder::TraceRecorder(JSContext* cx, GuardRecord* _anchor, Fragment* _fra
if (verbose_debug)
lir = verbose_filter = new (&gc) VerboseWriter(&gc, lir, lirbuf->names);
#endif
#ifdef NANOJIT_ARM
#ifdef NJ_SOFTFLOAT
lir = float_filter = new (&gc) SoftFloatFilter(lir);
#endif
lir = cse_filter = new (&gc) CseFilter(lir, &gc);
@@ -867,7 +868,7 @@ TraceRecorder::~TraceRecorder()
delete cse_filter;
delete expr_filter;
delete func_filter;
#ifdef NANOJIT_ARM
#ifdef NJ_SOFTFLOAT
delete float_filter;
#endif
delete lir_buf_writer;
@@ -2277,8 +2278,10 @@ js_ExecuteTree(JSContext* cx, Fragment** treep, uintN& inlineCallCount,
union { NIns *code; GuardRecord* (FASTCALL *func)(InterpState*, Fragment*); } u;
u.code = f->code();
#if defined(DEBUG) && defined(NANOJIT_IA32)
#ifdef DEBUG
#if defined(NANOJIT_IA32)
uint64 start = rdtsc();
#endif
#endif
/*
@@ -2362,19 +2365,18 @@ js_ExecuteTree(JSContext* cx, Fragment** treep, uintN& inlineCallCount,
js_ReconstructStackDepth(cx, fp->script, fp->regs->pc) == fp->regs->sp);
#if defined(DEBUG) && defined(NANOJIT_IA32)
if (verbose_debug) {
printf("leaving trace at %s:%u@%u, op=%s, lr=%p, exitType=%d, sp=%d, ip=%p, "
"cycles=%llu\n",
fp->script->filename, js_PCToLineNumber(cx, fp->script, fp->regs->pc),
fp->regs->pc - fp->script->code,
js_CodeName[*fp->regs->pc],
lr,
lr->exit->exitType,
fp->regs->sp - StackBase(fp), lr->jmp,
(rdtsc() - start));
}
uint64 cycles = rdtsc() - start;
#else
uint64 cycles = 0;
#endif
debug_only_v(printf("leaving trace at %s:%u@%u, exitType=%d, sp=%d, ip=%p, cycles=%llu\n",
fp->script->filename, js_PCToLineNumber(cx, fp->script, fp->regs->pc),
fp->regs->pc - fp->script->code,
lr->exit->exitType,
fp->regs->sp - StackBase(fp), lr->jmp,
cycles));
/* If this trace is part of a tree, later branches might have added additional globals for
which we don't have any type information available in the side exit. We merge in this
information from the entry type-map. See also comment in the constructor of TraceRecorder

View file

@@ -221,7 +221,7 @@ class TraceRecorder {
nanojit::LirWriter* cse_filter;
nanojit::LirWriter* expr_filter;
nanojit::LirWriter* func_filter;
#ifdef NANOJIT_ARM
#ifdef NJ_SOFTFLOAT
nanojit::LirWriter* float_filter;
#endif
nanojit::LIns* cx_ins;

View file

@@ -44,6 +44,7 @@
#if defined(AVMPLUS_LINUX) && defined(AVMPLUS_ARM)
#include <asm/unistd.h>
extern "C" void __clear_cache(char *BEG, char *END);
#endif
namespace nanojit
@@ -178,6 +179,8 @@ namespace nanojit
// nothing free, steal one
// LSRA says pick the one with the furthest use
LIns* vic = findVictim(regs,allow,prefer);
NanoAssert(vic != NULL);
Reservation* resv = getresv(vic);
// restore vic
@@ -446,25 +449,37 @@ namespace nanojit
Reservation* resv = getresv(i);
Register r;
// if we have an existing reservation and it has a non-unknown
// register allocated, and that register is in our allowed mask,
// return it.
if (resv && (r=resv->reg) != UnknownReg && (rmask(r) & allow)) {
return r;
}
// figure out what registers are preferred for this instruction
RegisterMask prefer = hint(i, allow);
// if we didn't have a reservation, allocate one now
if (!resv)
resv = reserveAlloc(i);
// if the reservation doesn't have a register assigned to it...
if ((r=resv->reg) == UnknownReg)
{
// .. if the cost is 2 and the allowed mask includes
// the saved regs, then prefer just those.
if (resv->cost == 2 && (allow&SavedRegs))
prefer = allow&SavedRegs;
// grab one.
r = resv->reg = registerAlloc(prefer);
_allocator.addActive(r, i);
return r;
}
else
{
// r not allowed
// the already-allocated register isn't in the allowed mask;
// we need to grab a new one and then copy over the old
// contents to the new.
resv->reg = UnknownReg;
_allocator.retire(r);
if (resv->cost == 2 && (allow&SavedRegs))
@@ -795,12 +810,15 @@ namespace nanojit
# if defined(UNDER_CE)
FlushInstructionCache(GetCurrentProcess(), NULL, NULL);
# elif defined(AVMPLUS_LINUX)
// XXX fixme flush adjacent pages together
for (int i = 0; i < 2; i++) {
Page *p = (i == 0) ? _nativePages : _nativeExitPages;
Page *first = p;
while (p) {
flushCache((NIns*)p, (NIns*)((intptr_t)(p) + NJ_PAGE_SIZE));
if (!p->next || p->next != p+1) {
__clear_cache((char*)first, (char*)(p+1));
first = p->next;
}
p = p->next;
}
}
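The rewritten loop batches cache flushes: it walks the page list and calls __clear_cache once per maximal run of physically adjacent pages instead of once per page. A standalone sketch of the same coalescing walk (Page and __clear_cache as in nanojit; assumes adjacent list nodes in a run are also contiguous in memory):

// Flush [first, p+1) each time a run of contiguous pages ends.
static void flushPageList(Page* head)
{
    Page* first = head;
    for (Page* p = head; p; p = p->next) {
        if (!p->next || p->next != p + 1) {
            __clear_cache((char*)first, (char*)(p + 1));
            first = p->next;
        }
    }
}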
@@ -852,7 +870,7 @@ namespace nanojit
switch(op)
{
default:
NanoAssertMsgf(false, ("unsupported LIR instruction: %d (~0x40: %d)\n",op, op&~LIR64));
NanoAssertMsgf(false, "unsupported LIR instruction: %d (~0x40: %d)\n", op, op&~LIR64);
break;
case LIR_short:
@@ -1208,13 +1226,20 @@ namespace nanojit
LIns* cond = ins->oprnd1();
LOpcode condop = cond->opcode();
NanoAssert(cond->isCond());
#ifndef NJ_SOFTFLOAT
#if !defined(NJ_SOFTFLOAT)
if (condop >= LIR_feq && condop <= LIR_fge)
{
#if defined(NJ_ARM_VFP)
if (op == LIR_xf)
JNE(exit);
else
JE(exit);
#else
if (op == LIR_xf)
JP(exit);
else
JNP(exit);
#endif
asm_fcmp(cond);
break;
}
@@ -1313,9 +1338,13 @@ namespace nanojit
{
// only want certain regs
Register r = prepResultReg(ins, AllowableFlagRegs);
#ifdef NJ_ARM_VFP
SETE(r);
#else
// SETcc only sets low 8 bits, so extend
MOVZX8(r,r);
SETNP(r);
#endif
asm_fcmp(ins);
break;
}
@@ -1437,8 +1466,13 @@ namespace nanojit
uint32_t Assembler::arFree(uint32_t idx)
{
// nothing to free
if (idx == 0)
return 0;
if (idx > 0 && _activation.entry[idx] == _activation.entry[idx+stack_direction(1)])
_activation.entry[idx+stack_direction(1)] = 0; // clear 2 slots for doubles
_activation.entry[idx] = 0;
return 0;
}

View file

@@ -376,8 +376,6 @@ namespace nanojit
return l;
}
#define isS24(x) (((int32_t(x)<<8)>>8) == (x))
LInsp LirBufWriter::insFar(LOpcode op, LInsp target)
{
NanoAssert(op == LIR_skip || op == LIR_tramp);

View file

@@ -49,14 +49,17 @@
#if defined(AVMPLUS_LINUX)
#include <asm/unistd.h>
extern "C" void __clear_cache(char *BEG, char *END);
#endif
#ifdef FEATURE_NANOJIT
namespace nanojit
{
#ifdef FEATURE_NANOJIT
#ifdef NJ_VERBOSE
const char* regNames[] = {"r0","r1","r2","r3","r4","r5","r6","r7","r8","r9","r10","r11","IP","SP","LR","PC"};
const char* regNames[] = {"r0","r1","r2","r3","r4","r5","r6","r7","r8","r9","r10","FP","IP","SP","LR","PC",
"d0","d1","d2","d3","d4","d5","d6","d7","s14"};
#endif
const Register Assembler::argRegs[] = { R0, R1, R2, R3 };
@@ -122,6 +125,7 @@ Assembler::nFragExit(LInsp guard)
// for us; always force a far jump here.
BL_far(_epilogue);
// point the jmp pointer at the start of the sequence
lr->jmp = _nIns;
}
@@ -155,18 +159,26 @@ void
Assembler::asm_call(LInsp ins)
{
const CallInfo* call = callInfoFor(ins->fid());
Reservation *callRes = getresv(ins);
uint32_t atypes = call->_argtypes;
uint32_t roffset = 0;
// skip return type
#ifdef NJ_ARM_VFP
ArgSize rsize = (ArgSize)(atypes & 3);
#endif
atypes >>= 2;
// we need to detect if we have arg0 as LO followed by arg1 as F;
// in that case, we need to skip using r1 -- the F needs to be
// loaded in r2/r3, at least according to the ARM EABI and gcc 4.2's
// generated code.
bool arg0IsInt32FollowedByFloat = false;
while ((atypes & 3) != ARGSIZE_NONE) {
if (((atypes >> 4) & 3) == ARGSIZE_LO &&
((atypes >> 2) & 3) == ARGSIZE_F &&
((atypes >> 6) & 3) == ARGSIZE_NONE)
if (((atypes >> 2) & 3) == ARGSIZE_LO &&
((atypes >> 0) & 3) == ARGSIZE_F &&
((atypes >> 4) & 3) == ARGSIZE_NONE)
{
arg0IsInt32FollowedByFloat = true;
break;
@@ -174,17 +186,68 @@ Assembler::asm_call(LInsp ins)
atypes >>= 2;
}
#ifdef NJ_ARM_VFP
if (rsize == ARGSIZE_F) {
NanoAssert(ins->opcode() == LIR_fcall);
NanoAssert(callRes);
//fprintf (stderr, "call ins: %p callRes: %p reg: %d ar: %d\n", ins, callRes, callRes->reg, callRes->arIndex);
Register rr = callRes->reg;
int d = disp(callRes);
freeRsrcOf(ins, rr != UnknownReg);
if (rr != UnknownReg) {
NanoAssert(IsFpReg(rr));
FMDRR(rr,R0,R1);
} else {
NanoAssert(d);
//fprintf (stderr, "call ins d: %d\n", d);
STMIA(Scratch, 1<<R0 | 1<<R1);
arm_ADDi(Scratch, FP, d);
}
}
#endif
CALL(call);
ArgSize sizes[10];
uint32_t argc = call->get_sizes(sizes);
for(uint32_t i=0; i < argc; i++) {
for(uint32_t i = 0; i < argc; i++) {
uint32_t j = argc - i - 1;
ArgSize sz = sizes[j];
NanoAssert(sz == ARGSIZE_LO || sz == ARGSIZE_Q);
LInsp arg = ins->arg(j);
// pre-assign registers R0-R3 for arguments (if they fit)
Register r = (i+roffset) < 4 ? argRegs[i+roffset] : UnknownReg;
asm_arg(sz, ins->arg(j), r);
Register r = (i + roffset) < 4 ? argRegs[i+roffset] : UnknownReg;
#ifdef NJ_ARM_VFP
if (sz == ARGSIZE_F) {
if (r == R0 || r == R2) {
roffset++;
} else if (r == R1) {
r = R2;
roffset++;
} else {
r = UnknownReg;
}
// XXX move this into asm_farg
Register sr = findRegFor(arg, FpRegs);
if (r != UnknownReg) {
// stick it into our scratch fp reg, and then copy into the base reg
//fprintf (stderr, "FMRRD: %d %d <- %d\n", r, nextreg(r), sr);
FMRRD(r, nextreg(r), sr);
} else {
asm_pusharg(arg);
}
} else {
asm_arg(sz, arg, r);
}
#else
NanoAssert(sz == ARGSIZE_LO || sz == ARGSIZE_Q);
asm_arg(sz, arg, r);
#endif
if (i == 0 && arg0IsInt32FollowedByFloat)
roffset = 1;
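The arg0IsInt32FollowedByFloat detection and the roffset adjustments above encode an ARM EABI rule: a double passed in core registers must start at an even register, so for a call like f(int32, double) the int takes r0, r1 is skipped, and the double travels in r2/r3. A one-line model of that alignment step (helper name is illustrative):

// If the next free core register is odd, a double skips it so it can
// start on an even register: with r0 taken, alignDoubleReg(1) == 2,
// matching the r = R2 / roffset++ branch above.
static inline int alignDoubleReg(int nextFreeCoreReg)
{
    return (nextFreeCoreReg & 1) ? nextFreeCoreReg + 1 : nextFreeCoreReg;
}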
@@ -238,7 +301,7 @@ Assembler::nRegisterResetAll(RegAlloc& a)
// add scratch registers to our free list for the allocator
a.clear();
a.used = 0;
a.free = rmask(R0) | rmask(R1) | rmask(R2) | rmask(R3) | rmask(R4) | rmask(R5);
a.free = rmask(R0) | rmask(R1) | rmask(R2) | rmask(R3) | rmask(R4) | rmask(R5) | FpRegs;
debug_only(a.managed = a.free);
}
@@ -251,16 +314,15 @@ Assembler::nPatchBranch(NIns* branch, NIns* target)
// Which is really 2 instructions, so we need to modify both
// XXX -- this is B, not BL, at least on non-Thumb..
// branch+2 because PC is always 2 instructions ahead on ARM/Thumb
int32_t offset = int(target) - int(branch+2);
int32_t offset = PC_OFFSET_FROM(target, branch);
//printf("---patching branch at 0x%08x to location 0x%08x (%d-0x%08x)\n", branch, target, offset, offset);
// We have 2 words to work with here -- if offset is in range of a 24-bit
// relative jump, emit that; otherwise, we do a pc-relative load into pc.
if (-(1<<24) <= offset & offset < (1<<24)) {
if (isS24(offset)) {
// ARM goodness, using unconditional B
*branch = (NIns)( COND_AL | (0xA<<24) | ((offset >>2) & 0xFFFFFF) );
*branch = (NIns)( COND_AL | (0xA<<24) | ((offset>>2) & 0xFFFFFF) );
} else {
// LDR pc,[pc]
*branch++ = (NIns)( COND_AL | (0x51<<20) | (PC<<16) | (PC<<12) | ( 0x004 ) );
@@ -295,11 +357,11 @@ Assembler::asm_qjoin(LIns *ins)
LIns* hi = ins->oprnd2();
Register r = findRegFor(hi, GpRegs);
ST(FP, d+4, r);
STR(r, FP, d+4);
// okay if r gets recycled.
r = findRegFor(lo, GpRegs);
ST(FP, d, r);
STR(r, FP, d);
freeRsrcOf(ins, false); // if we had a reg in use, emit a ST to flush it to mem
}
@@ -311,7 +373,7 @@ Assembler::asm_store32(LIns *value, int dr, LIns *base)
findRegFor2(GpRegs, value, rA, base, rB);
Register ra = rA->reg;
Register rb = rB->reg;
ST(rb, dr, ra);
STR(ra, rb, dr);
}
void
@@ -319,7 +381,17 @@ Assembler::asm_restore(LInsp i, Reservation *resv, Register r)
{
(void)resv;
int d = findMemFor(i);
LD(r, d, FP);
if (IsFpReg(r)) {
if (isS8(d >> 2)) {
FLDD(r, FP, d);
} else {
FLDD(r, Scratch, 0);
arm_ADDi(Scratch, FP, d);
}
} else {
LDR(r, FP, d);
}
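The isS8(d >> 2) tests here and in asm_spill below mirror the VFP encoding: FLDD/FSTD carry an 8-bit immediate counted in words, so only 4-byte-aligned displacements whose word count fits that field are directly encodable (isS8 is the conservative signed reading, roughly +/-508 bytes); anything farther goes through Scratch via arm_ADDi. The same test as a hedged helper:

#include <stdint.h>

// True if byte offset d is directly encodable in FLDD/FSTD as this
// code uses it: a multiple of 4 whose word count passes isS8.
static inline bool vfpDispOK(int32_t d)
{
    return (d & 3) == 0 && (d >> 2) == (int32_t)(int8_t)(d >> 2);
}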
verbose_only(
if (_verbose)
@@ -332,12 +404,21 @@ Assembler::asm_spill(LInsp i, Reservation *resv, bool pop)
{
(void)i;
(void)pop;
//fprintf (stderr, "resv->arIndex: %d\n", resv->arIndex);
if (resv->arIndex) {
int d = disp(resv);
// save to spill location
Register rr = resv->reg;
ST(FP, d, rr);
if (IsFpReg(rr)) {
if (isS8(d >> 2)) {
FSTD(rr, FP, d);
} else {
FSTD(rr, Scratch, 0);
arm_ADDi(Scratch, FP, d);
}
} else {
STR(rr, FP, d);
}
verbose_only(if (_verbose){
outputf(" spill %s",_thisfrag->lirbuf->names->formatRef(i));
@@ -349,38 +430,164 @@ Assembler::asm_spill(LInsp i, Reservation *resv, bool pop)
void
Assembler::asm_load64(LInsp ins)
{
LIns* base = ins->oprnd1();
int db = ins->oprnd2()->constval();
Reservation *resv = getresv(ins);
int dr = disp(resv);
NanoAssert(resv->reg == UnknownReg && dr != 0);
///asm_output("<<< load64");
LIns* base = ins->oprnd1();
int offset = ins->oprnd2()->constval();
Reservation *resv = getresv(ins);
Register rr = resv->reg;
int d = disp(resv);
Register rb = findRegFor(base, GpRegs);
resv->reg = UnknownReg;
asm_mmq(FP, dr, rb, db);
freeRsrcOf(ins, false);
#ifdef NJ_ARM_VFP
Register rb = findRegFor(base, GpRegs);
NanoAssert(rb != UnknownReg);
NanoAssert(rr == UnknownReg || IsFpReg(rr));
if (rr != UnknownReg) {
if (!isS8(offset >> 2) || (offset&3) != 0) {
underrunProtect(LD32_size + 8);
FLDD(rr,Scratch,0);
ADD(Scratch, rb);
LD32_nochk(Scratch, offset);
} else {
FLDD(rr,rb,offset);
}
} else {
asm_mmq(FP, d, rb, offset);
}
// *(FP+dr) <- *(rb+db)
#else
NanoAssert(resv->reg == UnknownReg && d != 0);
Register rb = findRegFor(base, GpRegs);
asm_mmq(FP, d, rb, offset);
#endif
//asm_output(">>> load64");
}
void
Assembler::asm_store64(LInsp value, int dr, LInsp base)
{
//asm_output1("<<< store64 (dr: %d)", dr);
#ifdef NJ_ARM_VFP
Reservation *valResv = getresv(value);
Register rb = findRegFor(base, GpRegs);
Register rv = findRegFor(value, FpRegs);
NanoAssert(rb != UnknownReg);
NanoAssert(rv != UnknownReg);
Register baseReg = rb;
intptr_t baseOffset = dr;
if (!isS8(dr)) {
baseReg = Scratch;
baseOffset = 0;
}
FSTD(rv, baseReg, baseOffset);
if (!isS8(dr)) {
underrunProtect(4 + LD32_size);
ADD(Scratch, rb);
LD32_nochk(Scratch, dr);
}
// if it's a constant, make sure our baseReg/baseOffset location
// has the right value
if (value->isconstq()) {
const int32_t* p = (const int32_t*) (value-2);
underrunProtect(12 + LD32_size);
asm_quad_nochk(rv, p);
}
#else
int da = findMemFor(value);
Register rb = findRegFor(base, GpRegs);
asm_mmq(rb, dr, FP, da);
#endif
//asm_output(">>> store64");
}
// stick a quad into register rr, where p points to the two
// 32-bit parts of the quad, optionally also storing at FP+d
void
Assembler::asm_quad_nochk(Register rr, const int32_t* p)
{
*(++_nSlot) = p[0];
*(++_nSlot) = p[1];
intptr_t constAddr = (intptr_t) (_nSlot-1);
intptr_t realOffset = PC_OFFSET_FROM(constAddr, _nIns-1);
intptr_t offset = realOffset;
Register baseReg = PC;
//int32_t *q = (int32_t*) constAddr;
//fprintf (stderr, "asm_quad_nochk: rr = %d cAddr: 0x%x quad: %08x:%08x q: %f @0x%08x\n", rr, constAddr, p[0], p[1], *(double*)q, _nIns);
// for FLDD, we only get a left-shifted 8-bit offset
if (!isS8(realOffset >> 2)) {
offset = 0;
baseReg = Scratch;
}
FLDD(rr, baseReg, offset);
if (!isS8(realOffset >> 2))
LD32_nochk(Scratch, constAddr);
}
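asm_quad_nochk parks the double's two halves in the next two constant-pool slots and emits an FLDD relative to PC, falling back to loading the pool address into Scratch when the scaled 8-bit offset can't reach. The (const int32_t*)(ins - 2) casts in its callers rely on a LIR quad storing its payload as two raw 32-bit words, low word first; a host-side sketch of that reinterpretation:

#include <stdint.h>
#include <cstring>

// Reassemble a double from two pool words -- the same little-endian
// layout FLDD reads back on ARM.
static double quadFromSlots(const int32_t p[2])
{
    double d;
    std::memcpy(&d, p, sizeof d);
    return d;
}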
void
Assembler::asm_quad(LInsp ins)
{
Reservation *rR = getresv(ins);
int d = disp(rR);
//asm_output(">>> asm_quad");
Reservation *res = getresv(ins);
int d = disp(res);
Register rr = res->reg;
NanoAssert(d || rr != UnknownReg);
const int32_t* p = (const int32_t*) (ins-2);
#ifdef NJ_ARM_VFP
freeRsrcOf(ins, false);
// XXX We probably want nochk versions of FLDD/FSTD
underrunProtect(16 + LD32_size);
// grab a register to do the load into if we don't have one already;
// XXX -- maybe do a mmq in this case? We're going to use our
// D7 register that's never allocated (since it's the one we use
// for int-to-double conversions), so we don't have to worry about
// spilling something in a fp reg.
if (rr == UnknownReg)
rr = D7;
if (d)
FSTD(rr, FP, d);
asm_quad_nochk(rr, p);
#else
freeRsrcOf(ins, false);
if (d) {
const int32_t* p = (const int32_t*) (ins-2);
STi(FP,d+4,p[1]);
STi(FP,d,p[0]);
underrunProtect(LD32_size * 2 + 8);
STR(Scratch, FP, d+4);
LD32_nochk(Scratch, p[1]);
STR(Scratch, FP, d);
LD32_nochk(Scratch, p[0]);
}
#endif
//asm_output("<<< asm_quad");
}
bool
@@ -393,9 +600,17 @@ Assembler::asm_qlo(LInsp ins, LInsp q)
void
Assembler::asm_nongp_copy(Register r, Register s)
{
// we will need this for VFP support
(void)r; (void)s;
NanoAssert(false);
if ((rmask(r) & FpRegs) && (rmask(s) & FpRegs)) {
// fp->fp
FCPYD(r, s);
} else if ((rmask(r) & GpRegs) && (rmask(s) & FpRegs)) {
// fp->gp
// who's doing this and why?
NanoAssert(0);
// FMRS(r, loSingleVfp(s));
} else {
NanoAssert(0);
}
}
Register
@@ -416,31 +631,41 @@ Assembler::asm_mmq(Register rd, int dd, Register rs, int ds)
// get a scratch reg
Register t = registerAlloc(GpRegs & ~(rmask(rd)|rmask(rs)));
_allocator.addFree(t);
ST(rd, dd+4, t);
LD(t, ds+4, rs);
ST(rd, dd, t);
LD(t, ds, rs);
// XXX use LDM,STM
STR(t, rd, dd+4);
LDR(t, rs, ds+4);
STR(t, rd, dd);
LDR(t, rs, ds);
}
void
Assembler::asm_pusharg(LInsp p)
Assembler::asm_pusharg(LInsp arg)
{
// arg goes on stack
Reservation* rA = getresv(p);
if (rA == 0)
{
Register ra = findRegFor(p, GpRegs);
ST(SP,0,ra);
}
else if (rA->reg == UnknownReg)
{
ST(SP,0,Scratch);
LD(Scratch,disp(rA),FP);
}
Reservation* argRes = getresv(arg);
bool quad = arg->isQuad();
intptr_t stack_growth = quad ? 8 : 4;
Register ra;
if (argRes)
ra = argRes->reg;
else
{
ST(SP,0,rA->reg);
ra = findRegFor(arg, quad ? FpRegs : GpRegs);
if (ra == UnknownReg) {
STR(Scratch, SP, 0);
LDR(Scratch, FP, disp(argRes));
} else {
if (!quad) {
Register ra = findRegFor(arg, GpRegs);
STR(ra, SP, 0);
} else {
Register ra = findRegFor(arg, FpRegs);
FSTD(ra, SP, 0);
}
}
SUBi(SP, stack_growth);
}
void
@@ -470,22 +695,6 @@ Assembler::nativePageSetup()
}
}
void
Assembler::flushCache(NIns* n1, NIns* n2) {
#if defined(UNDER_CE)
// we changed the code, so we need to do this (sadly)
FlushInstructionCache(GetCurrentProcess(), NULL, NULL);
#elif defined(AVMPLUS_LINUX)
// Just need to clear this one page (not even the whole page really)
//Page *page = (Page*)pageTop(_nIns);
register unsigned long _beg __asm("a1") = (unsigned long)(n1);
register unsigned long _end __asm("a2") = (unsigned long)(n2);
register unsigned long _flg __asm("a3") = 0;
register unsigned long _swi __asm("r7") = 0xF0002;
__asm __volatile ("swi 0 @ sys_cacheflush" : "=r" (_beg) : "0" (_beg), "r" (_end), "r" (_flg), "r" (_swi));
#endif
}
NIns*
Assembler::asm_adjustBranch(NIns* at, NIns* target)
{
@@ -497,9 +706,16 @@ Assembler::asm_adjustBranch(NIns* at, NIns* target)
NIns* was = (NIns*) at[3];
//fprintf (stderr, "Adjusting branch @ 0x%8x: 0x%x -> 0x%x\n", at+3, at[3], target);
at[3] = (NIns)target;
flushCache(at, at+4);
#if defined(UNDER_CE)
// we changed the code, so we need to do this (sadly)
FlushInstructionCache(GetCurrentProcess(), NULL, NULL);
#elif defined(AVMPLUS_LINUX)
__clear_cache((char*)at, (char*)(at+4));
#endif
#ifdef AVMPLUS_PORTING_API
NanoJIT_PortAPI_FlushInstructionCache(at, at+4);
@@ -550,6 +766,9 @@ Assembler::BL_far(NIns* addr)
// point to the right spot before branching
underrunProtect(16);
// TODO use a slot in const pool for address, but emit single insn
// for branch if offset fits
// the address
*(--_nIns) = (NIns)((addr));
// bx ip // branch to the address we loaded earlier
@@ -558,17 +777,29 @@ Assembler::BL_far(NIns* addr)
*(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<23) | (PC<<16) | (LR<<12) | (4) );
// ldr ip, [pc + #4] // load the address into ip, reading it from [pc+4]
*(--_nIns) = (NIns)( COND_AL | (0x59<<20) | (PC<<16) | (IP<<12) | (4));
//fprintf (stderr, "BL_far sequence @ 0x%08x\n", _nIns);
asm_output1("bl %p (32-bit)", addr);
}
void
Assembler::BL(NIns* addr)
{
intptr_t offs = PC_OFFSET_FROM(addr,(intptr_t)_nIns-4);
if (JMP_S24_OFFSET_OK(offs)) {
// we can do this with a single BL call
intptr_t offs = PC_OFFSET_FROM(addr,_nIns-1);
//fprintf (stderr, "BL: 0x%x (offs: %d [%x]) @ 0x%08x\n", addr, offs, offs, (intptr_t)(_nIns-1));
if (isS24(offs)) {
// try to do this with a single S24 call;
// recompute offset in case underrunProtect had to allocate a new page
underrunProtect(4);
*(--_nIns) = (NIns)( COND_AL | (0xB<<24) | (((offs)>>2) & 0xFFFFFF) ); \
offs = PC_OFFSET_FROM(addr,_nIns-1);
}
if (isS24(offs)) {
// already did underrunProtect above
*(--_nIns) = (NIns)( COND_AL | (0xB<<24) | (((offs)>>2) & 0xFFFFFF) );
asm_output1("bl %p", addr);
} else {
BL_far(addr);
@@ -579,6 +810,7 @@ void
Assembler::CALL(const CallInfo *ci)
{
intptr_t addr = ci->_address;
BL((NIns*)addr);
asm_output1(" (call %s)", ci->_name);
}
@@ -586,21 +818,226 @@ Assembler::CALL(const CallInfo *ci)
void
Assembler::LD32_nochk(Register r, int32_t imm)
{
// We can always reach the const pool, since it's on the same page (<4096)
underrunProtect(8);
// We should always reach the const pool, since it's on the same page (<4096);
// if we can't, someone didn't underrunProtect enough.
*(++_nSlot) = (int)imm;
//fprintf (stderr, "wrote slot(2) %p with %08x, jmp @ %p\n", _nSlot, (intptr_t)imm, _nIns-1);
int offset = PC_OFFSET_FROM(_nSlot,(intptr_t)(_nIns)-4);
int offset = PC_OFFSET_FROM(_nSlot,_nIns-1);
NanoAssert(JMP_S24_OFFSET_OK(offset) && (offset < 0));
NanoAssert(isS12(offset) && (offset < 0));
*(--_nIns) = (NIns)( COND_AL | (0x51<<20) | (PC<<16) | ((r)<<12) | ((-offset) & 0xFFFFFF) );
asm_output2("ld %s,%d",gpn(r),imm);
asm_output2(" (%d(PC) = 0x%x)", offset, imm);
LDR_nochk(r,PC,offset);
}
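The tightened assert above (isS12 rather than the 24-bit branch check) matches what LDR actually encodes: a 12-bit byte offset from PC. Since the pool slot always sits earlier on the same 4K page, the offset must also be negative. A sketch of the reach test, using the insn+8 PC rule from PC_OFFSET_FROM:

#include <stdint.h>

// Can an LDR at 'insn' reach a constant-pool slot at 'slot'? The
// offset is measured from insn+8 and must pass the isS12/negative
// check asserted above.
static bool poolReachable(intptr_t slot, intptr_t insn)
{
    intptr_t off = slot - (insn + 8);
    return off < 0 && off >= -4096;
}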
// Branch to target address _t with condition _c, doing underrun
// checks (_chk == 1) or skipping them (_chk == 0).
//
// If the jump fits in a relative jump (+/-32MB), emit that.
// If the jump is unconditional, emit the dest address inline in
// the instruction stream and load it into pc.
// If the jump has a condition, but no one's mucked with _nIns and our _nSlot
// pointer is valid, stick the constant in the slot and emit a conditional
// load into pc.
// Otherwise, emit the conditional load into pc from a nearby constant,
// and emit a jump to jump over it in case the condition fails.
//
// NB: JMP_nochk depends on this not calling samepage() when _c == AL
void
Assembler::B_cond_chk(ConditionCode _c, NIns* _t, bool _chk)
{
int32 offs = PC_OFFSET_FROM(_t,_nIns-1);
//fprintf(stderr, "B_cond_chk target: 0x%08x offset: %d @0x%08x\n", _t, offs, _nIns-1);
if (isS24(offs)) {
if (_chk) underrunProtect(4);
offs = PC_OFFSET_FROM(_t,_nIns-1);
}
if (isS24(offs)) {
*(--_nIns) = (NIns)( ((_c)<<28) | (0xA<<24) | (((offs)>>2) & 0xFFFFFF) );
} else if (_c == AL) {
if(_chk) underrunProtect(8);
*(--_nIns) = (NIns)(_t);
*(--_nIns) = (NIns)( COND_AL | (0x51<<20) | (PC<<16) | (PC<<12) | 0x4 );
} else if (samepage(_nIns,_nSlot)) {
if(_chk) underrunProtect(8);
*(++_nSlot) = (NIns)(_t);
offs = PC_OFFSET_FROM(_nSlot,_nIns-1);
NanoAssert(offs < 0);
*(--_nIns) = (NIns)( ((_c)<<28) | (0x51<<20) | (PC<<16) | (PC<<12) | ((-offs) & 0xFFFFFF) );
} else {
if(_chk) underrunProtect(12);
*(--_nIns) = (NIns)(_t);
*(--_nIns) = (NIns)( COND_AL | (0xA<<24) | ((-4)>>2) & 0xFFFFFF );
*(--_nIns) = (NIns)( ((_c)<<28) | (0x51<<20) | (PC<<16) | (PC<<12) | 0x0 );
}
asm_output2("%s %p", _c == AL ? "jmp" : "b(cnd)", (void*)(_t));
}
/*
* VFP
*/
#ifdef NJ_ARM_VFP
void
Assembler::asm_i2f(LInsp ins)
{
Register rr = prepResultReg(ins, FpRegs);
Register srcr = findRegFor(ins->oprnd1(), GpRegs);
// todo: support int value in memory, as per x86
NanoAssert(srcr != UnknownReg);
FSITOD(rr, FpSingleScratch);
FMSR(FpSingleScratch, srcr);
}
void
Assembler::asm_u2f(LInsp ins)
{
Register rr = prepResultReg(ins, FpRegs);
Register sr = findRegFor(ins->oprnd1(), GpRegs);
// todo: support int value in memory, as per x86
NanoAssert(sr != UnknownReg);
FUITOD(rr, FpSingleScratch);
FMSR(FpSingleScratch, sr);
}
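A note on ordering: nanojit emits code backwards through *--_nIns, so although FSITOD/FUITOD appear before FMSR in the source above, at run time FMSR first moves the integer into the single-precision scratch (S14, the low half of the otherwise-unallocated D7) and the conversion executes second. Functionally each pair is a plain int-to-double conversion; trivial host models for reference:

#include <stdint.h>

static double i2f_model(int32_t v)  { return (double)v; }   // FMSR + FSITOD
static double u2f_model(uint32_t v) { return (double)v; }   // FMSR + FUITOD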
void
Assembler::asm_fneg(LInsp ins)
{
LInsp lhs = ins->oprnd1();
Register rr = prepResultReg(ins, FpRegs);
Reservation* rA = getresv(lhs);
Register sr;
if (!rA || rA->reg == UnknownReg)
sr = findRegFor(lhs, FpRegs);
else
sr = rA->reg;
FNEGD(rr, sr);
}
void
Assembler::asm_fop(LInsp ins)
{
LInsp lhs = ins->oprnd1();
LInsp rhs = ins->oprnd2();
LOpcode op = ins->opcode();
NanoAssert(op >= LIR_fadd && op <= LIR_fdiv);
// rr = ra OP rb
Register rr = prepResultReg(ins, FpRegs);
Register ra = findRegFor(lhs, FpRegs);
Register rb = (rhs == lhs) ? ra : findRegFor(rhs, FpRegs);
// XXX special-case 1.0 and 0.0
if (op == LIR_fadd)
FADDD(rr,ra,rb);
else if (op == LIR_fsub)
FSUBD(rr,ra,rb);
else if (op == LIR_fmul)
FMULD(rr,ra,rb);
else //if (op == LIR_fdiv)
FDIVD(rr,ra,rb);
}
void
Assembler::asm_fcmp(LInsp ins)
{
LInsp lhs = ins->oprnd1();
LInsp rhs = ins->oprnd2();
LOpcode op = ins->opcode();
NanoAssert(op >= LIR_feq && op <= LIR_fge);
Register ra = findRegFor(lhs, FpRegs);
Register rb = findRegFor(rhs, FpRegs);
// We can't uniquely identify fge/fle via a single bit
// pattern (since equality and lt/gt are separate bits);
// so convert to the single-bit variant.
if (op == LIR_fge) {
Register temp = ra;
ra = rb;
rb = temp;
op = LIR_flt;
} else if (op == LIR_fle) {
Register temp = ra;
ra = rb;
rb = temp;
op = LIR_fgt;
}
// There is no way to test for an unordered result using
// the conditional form of an instruction; the encoding (C=1 V=1)
// ends up having overlaps with a few other tests. So, test for
// the explicit mask.
uint8_t mask = 0x0;
// NZCV
// for a valid ordered result, V is always 0 from VFP
if (op == LIR_feq)
// ZC // cond EQ (both equal and "not less than")
mask = 0x6;
else if (op == LIR_flt)
// N // cond MI
mask = 0x8;
else if (op == LIR_fgt)
// C // cond CS
mask = 0x2;
else
NanoAssert(0);
/*
// these were converted into gt and lt above.
if (op == LIR_fle)
// NZ // cond LE
mask = 0xC;
else if (op == LIR_fge)
// ZC // cond fail?
mask = 0x6;
*/
// TODO XXX could do this as fcmpd; fmstat; tstvs rX, #0 -- the tstvs
// would reset the status bits if V (NaN flag) is set, but that
// doesn't work for NE. For NE we could teqvs rX, #1. rX needs to
// be any register that has lsb == 0, such as sp/fp/pc.
// Test explicitly with the full mask; if V is set, the test will fail.
// Assumption is that this will be followed up by a BEQ/BNE
CMPi(Scratch, mask);
// grab just the condition fields
SHRi(Scratch, 28);
MRS(Scratch);
// do the comparison and get results loaded in ARM status register
FMSTAT();
FCMPD(ra, rb);
}
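For reference, the NZCV patterns FCMPD+FMSTAT can produce are: equal -> 0110, less than -> 1000, greater than -> 0010, unordered -> 0011. The MRS / SHRi 28 / CMPi sequence extracts all four flag bits and demands an exact match with the chosen mask, so the unordered pattern (V set) can never satisfy any of the three masks. A host-side model of the exact-match test:

#include <stdint.h>

// Model of MRS + SHR #28 + CMP #mask: true iff the top four status
// bits (NZCV) equal the mask exactly. Unordered (0x3) never matches
// 0x6 (feq), 0x8 (flt), or 0x2 (fgt).
static inline bool fcmpMaskMatches(uint32_t cpsr, uint32_t mask)
{
    return (cpsr >> 28) == mask;
}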
Register
Assembler::asm_prep_fcall(Reservation* rR, LInsp ins)
{
// We have nothing to do here; we do it all in asm_call.
return UnknownReg;
}
#endif /* NJ_ARM_VFP */
}
#endif /* FEATURE_NANOJIT */
}

View file

@@ -47,14 +47,28 @@ namespace nanojit
const int NJ_LOG2_PAGE_SIZE = 12; // 4K
#define NJ_MAX_REGISTERS 11
// If NJ_ARM_VFP is defined, then VFP is assumed to
// be present. If it's not defined, then softfloat
// is used, and NJ_SOFTFLOAT is defined.
#define NJ_ARM_VFP
#ifdef NJ_ARM_VFP
// only d0-d7; we'll use d7 as s14-s15 for i2f/u2f/etc.
#define NJ_VFP_MAX_REGISTERS 8
#else
#define NJ_VFP_MAX_REGISTERS 0
#define NJ_SOFTFLOAT
#endif
#define NJ_MAX_REGISTERS (11 + NJ_VFP_MAX_REGISTERS)
#define NJ_MAX_STACK_ENTRY 256
#define NJ_MAX_PARAMETERS 16
#define NJ_ALIGN_STACK 8
#define NJ_STACK_OFFSET 8
#define NJ_SOFTFLOAT
#define NJ_STACK_GROWTH_UP
#define NJ_STACK_OFFSET 0
#define NJ_CONSTANT_POOLS
const int NJ_MAX_CPOOL_OFFSET = 4096;
@@ -75,25 +89,40 @@ typedef enum {
R8 = 8,
R9 = 9,
R10 = 10,
//FP =11,
FP = 11,
IP = 12,
SP = 13,
LR = 14,
PC = 15,
FP = 13,
// Pseudo-register for floating point
F0 = 0,
// FP regs
D0 = 16,
D1 = 17,
D2 = 18,
D3 = 19,
D4 = 20,
D5 = 21,
D6 = 22,
D7 = 23,
FirstFloatReg = 16,
LastFloatReg = 22,
// helpers
FRAME_PTR = 11,
ESP = 13,
ESP = SP,
FirstReg = 0,
#ifdef NJ_ARM_VFP
LastReg = 23,
#else
LastReg = 10,
Scratch = 12,
UnknownReg = 11
#endif
Scratch = IP,
UnknownReg = 31,
// special value referring to S14
FpSingleScratch = 24
} Register;
/* ARM condition codes */
@@ -123,13 +152,30 @@ typedef struct _FragInfo {
NIns* epilogue;
} FragInfo;
static const RegisterMask SavedRegs = 1<<R4 | 1<<R5 | 1<<R6 | 1<<R7 | 1<<R8 | 1<<R9 | 1<<R10;
static const RegisterMask FpRegs = 0x0000; // FST0-FST7
#ifdef NJ_ARM_VFP
static const RegisterMask SavedFpRegs = 1<<D0 | 1<<D1 | 1<<D2 | 1<<D3 | 1<<D4 | 1<<D5 | 1<<D6 | 1<<D7;
#else
static const RegisterMask SavedFpRegs = 0;
#endif
static const RegisterMask SavedRegs = 1<<R4 | 1<<R5 | 1<<R6 | 1<<R7 | 1<<R8 | 1<<R9 | 1<<R10 | SavedFpRegs;
static const RegisterMask FpRegs = 1<<D0 | 1<<D1 | 1<<D2 | 1<<D3 | 1<<D4 | 1<<D5 | 1<<D6; // no D7; S14-S15 are used for i2f/u2f.
static const RegisterMask GpRegs = 0x07FF;
static const RegisterMask AllowableFlagRegs = 1<<R0 | 1<<R1 | 1<<R2 | 1<<R3 | 1<<R4 | 1<<R5 | 1<<R6 | 1<<R7 | 1<<R8 | 1<<R9 | 1<<R10;
#define IsFpReg(_r) ((rmask(_r) & (FpRegs | (1<<D7))) != 0)
#define IsGpReg(_r) ((rmask(_r) & (GpRegs | (1<<Scratch))) != 0)
#define FpRegNum(_fpr) ((_fpr) - FirstFloatReg)
#define firstreg() R0
#define nextreg(r) (Register)((int)r+1)
#define nextreg(r) ((Register)((int)(r)+1))
#if 0
static Register nextreg(Register r) {
if (r == R10)
return D0;
return (Register)(r+1);
}
#endif
// only good for normal regs
#define imm2register(c) (Register)(c-1)
verbose_only( extern const char* regNames[]; )
@@ -148,11 +194,12 @@ verbose_only( extern const char* regNames[]; )
void BL(NIns*); \
void BL_far(NIns*); \
void CALL(const CallInfo*); \
void B_cond_chk(ConditionCode, NIns*, bool); \
void underrunProtect(int bytes); \
bool has_cmov; \
void nativePageReset(); \
void nativePageSetup(); \
void flushCache(NIns*,NIns*); \
void asm_quad_nochk(Register, const int32_t*); \
int* _nSlot; \
int* _nExitSlot;
@@ -174,6 +221,7 @@ verbose_only( extern const char* regNames[]; )
#define FUNCADDR(addr) ( ((int)addr) )
#define OP_IMM (1<<25)
#define OP_STAT (1<<20)
#define COND_AL (0xE<<28)
@@ -189,7 +237,7 @@ typedef enum {
ROR_reg = 7 // Rotate Right
} ShiftOperator;
#define LD32_size 4
#define LD32_size 8
#define BEGIN_NATIVE_CODE(x) \
{ DWORD* _nIns = (uint8_t*)x
@@ -251,45 +299,58 @@ typedef enum {
*(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<21) | ((_r)<<16) | ((_r)<<12) | ((_imm)&0xFF) ); \
asm_output2("eor %s,%d",gpn(_r),(_imm)); } while(0)
// _l = _l + _r
#define ADD(_l,_r) do { \
// _d = _n + _m
#define arm_ADD(_d,_n,_m) do { \
underrunProtect(4); \
*(--_nIns) = (NIns)( COND_AL | (1<<23) | ((_r)<<16) | ((_l)<<12) | (_l)); \
asm_output2("add %s,%s",gpn(_l),gpn(_r)); } while(0)
*(--_nIns) = (NIns)( COND_AL | OP_STAT | (1<<23) | ((_n)<<16) | ((_d)<<12) | (_m)); \
asm_output3("add %s,%s+%s",gpn(_d),gpn(_n),gpn(_m)); } while(0)
// _r = _r + _imm
#define ADDi(_r,_imm) do { \
if ((_imm)>-256 && (_imm)<256) { \
// _l = _l + _r
#define ADD(_l,_r) arm_ADD(_l,_l,_r)
// TODO: we can do better here, since we can rotate the 8-bit immediate left by
// an even number of bits; should count zeros at the end.
// Note that this sometimes converts negative immediate values to a sub.
// _d = _r + _imm
#define arm_ADDi(_d,_n,_imm) do { \
if ((_imm) > -256 && (_imm) < 256) { \
underrunProtect(4); \
if ((_imm)>=0) \
*(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<23) | ((_r)<<16) | ((_r)<<12) | ((_imm)&0xFF) ); \
*(--_nIns) = (NIns)( COND_AL | OP_IMM | OP_STAT | (1<<23) | ((_n)<<16) | ((_d)<<12) | ((_imm)&0xFF) ); \
else \
*(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<22) | ((_r)<<16) | ((_r)<<12) | ((-(_imm))&0xFF) ); \
*(--_nIns) = (NIns)( COND_AL | OP_IMM | OP_STAT | (1<<22) | ((_n)<<16) | ((_d)<<12) | ((-(_imm))&0xFF) ); \
} else { \
if ((_imm)>=0) { \
if ((_imm)<=1020 && (((_imm)&3)==0) ) { \
underrunProtect(4); \
*(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<23) | ((_r)<<16) | ((_r)<<12) | (15<<8)| ((_imm)>>2) ); \
*(--_nIns) = (NIns)( COND_AL | OP_IMM | OP_STAT | (1<<23) | ((_n)<<16) | ((_d)<<12) | (15<<8)| ((_imm)>>2) ); \
} else { \
underrunProtect(4+LD32_size); \
*(--_nIns) = (NIns)( COND_AL | (1<<23) | ((_r)<<16) | ((_r)<<12) | (Scratch)); \
*(--_nIns) = (NIns)( COND_AL | OP_STAT | (1<<23) | ((_n)<<16) | ((_d)<<12) | (Scratch)); \
LD32_nochk(Scratch, _imm); \
} \
} else { \
underrunProtect(4+LD32_size); \
*(--_nIns) = (NIns)( COND_AL | OP_STAT | (1<<22) | ((_n)<<16) | ((_d)<<12) | (Scratch)); \
LD32_nochk(Scratch, -(_imm)); \
} \
} \
asm_output3("add %s,%s,%d",gpn(_d),gpn(_n),(_imm)); \
} while(0)
/*
* There used to be a:
if ((_imm)>=-510) { \
underrunProtect(8); \
int rem = -(_imm) - 255; \
*(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<22) | ((_r)<<16) | ((_r)<<12) | ((rem)&0xFF) ); \
*(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<22) | ((_r)<<16) | ((_r)<<12) | (0xFF) ); \
} else { \
underrunProtect(4+LD32_size); \
*(--_nIns) = (NIns)( COND_AL | (1<<22) | ((_r)<<16) | ((_r)<<12) | (Scratch)); \
LD32_nochk(Scratch, -(_imm)); \
} \
} \
} \
asm_output2("addi %s,%d",gpn(_r),(_imm)); \
} while(0)
*(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<22) | ((_n)<<16) | ((_d)<<12) | ((rem)&0xFF) ); \
*(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<22) | ((_n)<<16) | ((_d)<<12) | (0xFF) ); \
} else {
* above, but if we do that we can't really update the status registers. So don't do that.
*/
#define ADDi(_r,_imm) arm_ADDi(_r,_r,_imm)
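The TODO before arm_ADDi alludes to the general ARM data-processing immediate: an 8-bit value rotated right by an even amount, which admits more constants than the 0..255 and word-aligned 0..1020 cases handled above. A hedged encodability test for that scheme:

#include <stdint.h>

// True if v is expressible as an ARM modified immediate (an 8-bit
// value rotated right by an even shift): rotating v left by the same
// even amount must recover a value <= 0xFF.
static bool armImmEncodable(uint32_t v)
{
    for (unsigned rot = 0; rot < 32; rot += 2) {
        uint32_t undone = rot ? ((v << rot) | (v >> (32 - rot))) : v;
        if (undone <= 0xFF)
            return true;
    }
    return false;
}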
// _l = _l - _r
#define SUB(_l,_r) do { \
@@ -402,6 +463,13 @@ typedef enum {
*(--_nIns) = (NIns)( COND_AL | (0x11<<20) | ((_d)<<16) | (_s) ); \
asm_output2("test %s,%s",gpn(_d),gpn(_s)); } while(0)
#define TSTi(_d,_imm) do { \
underrunProtect(4); \
NanoAssert(((_imm) & 0xff) == (_imm)); \
*(--_nIns) = (NIns)( COND_AL | OP_IMM | (0x11<<20) | ((_d) << 16) | (0xF<<12) | ((_imm) & 0xff) ); \
asm_output2("tst %s,#0x%x", gpn(_d), _imm); \
} while (0);
// CMP
#define CMP(_l,_r) do { \
underrunProtect(4); \
@@ -429,7 +497,7 @@ typedef enum {
LD32_nochk(Scratch, (_imm)); \
} \
} \
asm_output2("cmp %s,%X",gpn(_r),(_imm)); \
asm_output2("cmp %s,0x%x",gpn(_r),(_imm)); \
} while(0)
// MOV
@@ -457,25 +525,33 @@ typedef enum {
#define MRNO(dr,sr) MR_cond(dr, sr, VC, "movvc") // overflow clear
#define MRNC(dr,sr) MR_cond(dr, sr, CC, "movcc") // carry clear
#define LD(_d,_off,_b) do { \
if ((_off)<0) { \
underrunProtect(4); \
#define LDR_chk(_d,_b,_off,_chk) do { \
if (IsFpReg(_d)) { \
FLDD_chk(_d,_b,_off,_chk); \
} else if ((_off)<0) { \
if (_chk) underrunProtect(4); \
NanoAssert((_off)>-4096); \
*(--_nIns) = (NIns)( COND_AL | (0x51<<20) | ((_b)<<16) | ((_d)<<12) | ((-(_off))&0xFFF) ); \
} else { \
if (isS16(_off) || isU16(_off)) { \
underrunProtect(4); \
if (_chk) underrunProtect(4); \
NanoAssert((_off)<4096); \
*(--_nIns) = (NIns)( COND_AL | (0x59<<20) | ((_b)<<16) | ((_d)<<12) | ((_off)&0xFFF) ); \
} else { \
underrunProtect(4+LD32_size); \
if (_chk) underrunProtect(4+LD32_size); \
*(--_nIns) = (NIns)( COND_AL | (0x79<<20) | ((_b)<<16) | ((_d)<<12) | Scratch ); \
LD32_nochk(Scratch, _off); \
} \
} \
asm_output3("ld %s,%d(%s)",gpn((_d)),(_off),gpn((_b))); \
asm_output3("ldr %s,%d(%s)",gpn((_d)),(_off),gpn((_b))); \
} while(0)
#define LDR(_d,_b,_off) LDR_chk(_d,_b,_off,0)
#define LDR_nochk(_d,_b,_off) LDR_chk(_d,_b,_off,1)
// i386 compat, for Assembler.cpp
#define LD(reg,offset,base) LDR_chk(reg,base,offset,1)
#define ST(base,offset,reg) STR(reg,base,offset)
#define LDi(_d,_imm) do { \
if (isS8((_imm)) || isU8((_imm))) { \
@@ -486,7 +562,7 @@ typedef enum {
underrunProtect(LD32_size); \
LD32_nochk(_d, (_imm)); \
} \
asm_output2("ld %s,%d",gpn((_d)),(_imm)); \
asm_output2("ld %s,0x%x",gpn((_d)),(_imm)); \
} while(0)
@@ -501,29 +577,13 @@ typedef enum {
asm_output3("ldrb %s,%d(%s)", gpn(_d),(_off),gpn(_b)); \
} while(0)
#define ST(_b,_off,_r) do { \
#define STR(_d,_n,_off) do { \
NanoAssert(!IsFpReg(_d) && isS12(_off)); \
underrunProtect(4); \
if ((_off)<0) *(--_nIns) = (NIns)( COND_AL | (0x50<<20) | ((_b)<<16) | ((_r)<<12) | ((-(_off))&0xFFF) ); \
else *(--_nIns) = (NIns)( COND_AL | (0x58<<20) | ((_b)<<16) | ((_r)<<12) | ((_off)&0xFFF) ); \
asm_output3("str %s, %d(%s)",gpn(_r), (_off),gpn(_b)); } while(0)
#define STi(_b,_off,_imm) do { \
NanoAssert((_off)>0); \
if (isS8((_imm)) || isU8((_imm))) { \
underrunProtect(8); \
*(--_nIns) = (NIns)( COND_AL | (0x58<<20) | ((_b)<<16) | ((Scratch)<<12) | ((_off)&0xFFF) ); \
asm_output3("str %s, %d(%s)",gpn(Scratch), (_off),gpn(_b)); \
if ((_imm)<0) *(--_nIns) = (NIns)( COND_AL | (0x3E<<20) | (Scratch<<12) | (((_imm)^0xFFFFFFFF)&0xFF) ); \
else *(--_nIns) = (NIns)( COND_AL | (0x3B<<20) | (Scratch<<12) | ((_imm)&0xFF) ); \
asm_output2("ld %s,%d",gpn((Scratch)),(_imm)); \
} else { \
underrunProtect(4+LD32_size); \
*(--_nIns) = (NIns)( COND_AL | (0x58<<20) | ((_b)<<16) | ((Scratch)<<12) | ((_off)&0xFFF) ); \
asm_output3("str %s, %d(%s)",gpn(Scratch), (_off),gpn(_b)); \
LD32_nochk(Scratch, (_imm)); \
} \
} while(0);
if ((_off)<0) *(--_nIns) = (NIns)( COND_AL | (0x50<<20) | ((_n)<<16) | ((_d)<<12) | ((-(_off))&0xFFF) ); \
else *(--_nIns) = (NIns)( COND_AL | (0x58<<20) | ((_n)<<16) | ((_d)<<12) | ((_off)&0xFFF) ); \
asm_output3("str %s, %d(%s)",gpn(_d), (_off), gpn(_n)); \
} while(0)
#define LEA(_r,_d,_b) do { \
@@ -548,7 +608,7 @@ typedef enum {
//#define RET() INT3()
#define BKPT_nochk() do { \
*(--_nIns) = (NIns)( (0xE<<24) | (0x12<<20) | (0x7<<4) ); } while (0);
*(--_nIns) = (NIns)( (0xE<<24) | (0x12<<20) | (0x7<<4) ); } while (0)
// this is pushing a reg
#define PUSHr(_r) do { \
@@ -581,47 +641,10 @@ typedef enum {
*(--_nIns) = (NIns)( COND_AL | (0x8B<<20) | (SP<<16) | (_mask) ); \
asm_output1("pop %x", (_mask));} while (0)
// PC always points to current instruction + 8, so when calculating pc-relative
// offsets, use PC+8.
#define PC_OFFSET_FROM(target,frompc) ((intptr_t)(target) - ((intptr_t)(frompc) + 8))
#define JMP_S24_OFFSET_OK(offs) ((-(1<<24)) <= (offs) && (offs) < (1<<24))
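A worked example of the +8 bias: reading PC on ARM yields the current instruction's address plus 8 (two instructions ahead), so a branch at 0x1000 targeting 0x1008 encodes an offset of 0. The macro in function form:

#include <stdint.h>

// PC_OFFSET_FROM as a function: offsets are measured from the
// instruction's address plus the two-instruction pipeline bias,
// e.g. pcOffsetFrom(0x1008, 0x1000) == 0.
static inline intptr_t pcOffsetFrom(intptr_t target, intptr_t frompc)
{
    return target - (frompc + 8);
}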
// (XXX This ought to be a function instead of a macro)
//
// Branch to target address _t with condition _c, doing underrun
// checks (_chk == 1) or skipping them (_chk == 0).
//
// If the jump fits in a relative jump (+/-32MB), emit that.
// If the jump is unconditional, emit the dest address inline in
// the instruction stream and load it into pc.
// If the jump has a condition, but no one's mucked with _nIns and our _nSlot
// pointer is valid, stick the constant in the slot and emit a conditional
// load into pc.
// Otherwise, emit the conditional load into pc from a nearby constant,
// and emit a jump to jump over it in case the condition fails.
//
// NB: JMP_nochk depends on this not calling samepage() when _c == AL
#define B_cond_chk(_c,_t,_chk) do { \
int32 offs = PC_OFFSET_FROM(_t,(intptr_t)(_nIns)-4); \
if (JMP_S24_OFFSET_OK(offs)) { \
if(_chk) underrunProtect(4); \
*(--_nIns) = (NIns)( ((_c)<<28) | (0xA<<24) | (((offs)>>2) & 0xFFFFFF) ); \
} else if (_c == AL) { \
if(_chk) underrunProtect(8); \
*(--_nIns) = (NIns)(_t); \
*(--_nIns) = (NIns)( COND_AL | (0x51<<20) | (PC<<16) | (PC<<12) | 0x4 ); \
} else if (samepage(_nIns,_nSlot)) { \
if(_chk) underrunProtect(8); \
*(++_nSlot) = (NIns)(_t); \
offs = PC_OFFSET_FROM(_nSlot,(intptr_t)(_nIns)-4); \
NanoAssert(offs < 0); \
*(--_nIns) = (NIns)( ((_c)<<28) | (0x51<<20) | (PC<<16) | (PC<<12) | ((-offs) & 0xFFFFFF) ); \
} else { \
if(_chk) underrunProtect(24); \
*(--_nIns) = (NIns)(_t); \
*(--_nIns) = (NIns)( COND_AL | (0xA<<24) | ((-4)>>2) & 0xFFFFFF ); \
*(--_nIns) = (NIns)( ((_c)<<28) | (0x51<<20) | (PC<<16) | (PC<<12) | 0x0 ); \
} \
asm_output2("%s %p\n", _c == AL ? "jmp" : "b(cnd)", (void*)(_t)); \
} while(0)
#define isS12(offs) ((-(1<<12)) <= (offs) && (offs) < (1<<12))
#define B_cond(_c,_t) \
B_cond_chk(_c,_t,1)
@@ -665,35 +688,12 @@ typedef enum {
#define JO(t) do {B_cond(VS,t); asm_output1("bvs 0x%08x",(unsigned int)t); } while(0)
#define JNO(t) do {B_cond(VC,t); asm_output1("bvc 0x%08x",(unsigned int)t); } while(0)
// used for testing result of an FP compare
// used for testing result of an FP compare on x86; not used on arm.
// JP = comparison false
#define JP(t) do {B_cond(EQ,NE,t); asm_output1("jp 0x%08x",t); } while(0)
#define JP(t) do {NanoAssert(0); B_cond(NE,t); asm_output1("jp 0x%08x",t); } while(0)
// JNP = comparison true
#define JNP(t) do {B_cond(NE,EQ,t); asm_output1("jnp 0x%08x",t); } while(0)
// floating point
#define FNSTSW_AX() do {NanoAssert(0); asm_output("fnstsw_ax"); } while(0)
#define FFREE(r) do {NanoAssert(0); asm_output1("ffree %s",gpn(b)); } while(0)
#define FSTQ(p,d,b) do {NanoAssert(0); asm_output2("fstq %d(%s)",d,gpn(b)); } while(0)
#define FSTPQ(d,b) FSTQ(1,d,b)
//#define FSTPQ(d,b) do {NanoAssert(0); asm_output2("fstpq %d(%s)",d,gpn(b)); } while(0)
#define FCOM(p,d,b) do {NanoAssert(0); asm_output2("fcom %d(%s)",d,gpn(b)); } while(0)
#define FCOMP(d,b) do {NanoAssert(0); asm_output2("fcomp %d(%s)",d,gpn(b)); } while(0)
#define FLDQ(d,b) do {NanoAssert(0); asm_output2("fldq %d(%s)",d,gpn(b)); } while(0)
#define FILDQ(d,b) do {NanoAssert(0); asm_output2("fildq %d(%s)",d,gpn(b)); } while(0)
#define FILD(d,b) do {NanoAssert(0); asm_output2("fild %d(%s)",d,gpn(b)); } while(0)
#define FADD(d,b) do {NanoAssert(0); asm_output2("faddq %d(%s)",d,gpn(b)); } while(0)
#define FSUB(d,b) do {NanoAssert(0); asm_output2("fsubq %d(%s)",d,gpn(b)); } while(0)
#define FSUBR(d,b) do {NanoAssert(0); asm_output2("fsubr %d(%s)",d,gpn(b)); } while(0)
#define FMUL(d,b) do {NanoAssert(0); asm_output2("fmulq %d(%s)",d,gpn(b)); } while(0)
#define FDIV(d,b) do {NanoAssert(0); asm_output2("fdivq %d(%s)",d,gpn(b)); } while(0)
#define FDIVR(d,b) do {NanoAssert(0); asm_output2("fdivr %d(%s)",d,gpn(b)); } while(0)
#define FSTP(r) do {NanoAssert(0); asm_output1("fst st(%d)",r); } while(0)
#define FLD1() do {NanoAssert(0); asm_output("fld1"); } while(0)
#define FLDZ() do {NanoAssert(0); asm_output("fldz"); } while(0)
#define JNP(t) do {NanoAssert(0); B_cond(EQ,t); asm_output1("jnp 0x%08x",t); } while(0)
// MOV(EQ) _r, #1
@@ -758,17 +758,147 @@ typedef enum {
} while(0)
#define STMIA(_b, _mask) do { \
underrunProtect(2); \
underrunProtect(4); \
NanoAssert(((_mask)&rmask(_b))==0 && isU8(_mask)); \
*(--_nIns) = (NIns)(COND_AL | (0x8A<<20) | ((_b)<<16) | (_mask)&0xFF); \
asm_output2("stmia %s!,{%x}", gpn(_b), _mask); \
asm_output2("stmia %s!,{0x%x}", gpn(_b), _mask); \
} while (0)
#define LDMIA(_b, _mask) do { \
underrunProtect(2); \
underrunProtect(4); \
NanoAssert(((_mask)&rmask(_b))==0 && isU8(_mask)); \
*(--_nIns) = (NIns)(COND_AL | (0x8B<<20) | ((_b)<<16) | (_mask)&0xFF); \
asm_output2("ldmia %s!,{%x}", gpn(_b), (_mask)); \
asm_output2("ldmia %s!,{0x%x}", gpn(_b), (_mask)); \
} while (0)
#define MRS(_d) do { \
underrunProtect(4); \
*(--_nIns) = (NIns)(COND_AL | (0x10<<20) | (0xF<<16) | ((_d)<<12)); \
asm_output1("msr %s", gpn(_d)); \
} while (0)
/*
* VFP
*/
#define FMDRR(_Dm,_Rd,_Rn) do { \
underrunProtect(4); \
NanoAssert(IsFpReg(_Dm) && IsGpReg(_Rd) && IsGpReg(_Rn)); \
*(--_nIns) = (NIns)( COND_AL | (0xC4<<20) | ((_Rn)<<16) | ((_Rd)<<12) | (0xB1<<4) | (FpRegNum(_Dm)) ); \
asm_output3("fmdrr %s,%s,%s", gpn(_Dm), gpn(_Rd), gpn(_Rn)); \
} while (0)
#define FMRRD(_Rd,_Rn,_Dm) do { \
underrunProtect(4); \
NanoAssert(IsGpReg(_Rd) && IsGpReg(_Rn) && IsFpReg(_Dm)); \
*(--_nIns) = (NIns)( COND_AL | (0xC5<<20) | ((_Rn)<<16) | ((_Rd)<<12) | (0xB1<<4) | (FpRegNum(_Dm)) ); \
asm_output3("fmrrd %s,%s,%s", gpn(_Rd), gpn(_Rn), gpn(_Dm)); \
} while (0)
#define FSTD(_Dd,_Rn,_offs) do { \
underrunProtect(4); \
NanoAssert((((_offs) & 3) == 0) && isS8((_offs) >> 2)); \
NanoAssert(IsFpReg(_Dd) && !IsFpReg(_Rn)); \
int negflag = 1<<23; \
intptr_t offs = (_offs); \
if (_offs < 0) { \
negflag = 0<<23; \
offs = -(offs); \
} \
*(--_nIns) = (NIns)( COND_AL | (0xD0<<20) | ((_Rn)<<16) | (FpRegNum(_Dd)<<12) | (0xB<<8) | negflag | ((offs>>2)&0xff) ); \
asm_output3("fstd %s,%s(%d)", gpn(_Dd), gpn(_Rn), _offs); \
} while (0)
#define FLDD_chk(_Dd,_Rn,_offs,_chk) do { \
if(_chk) underrunProtect(4); \
NanoAssert((((_offs) & 3) == 0) && isS8((_offs) >> 2)); \
NanoAssert(IsFpReg(_Dd) && !IsFpReg(_Rn)); \
int negflag = 1<<23; \
intptr_t offs = (_offs); \
if (_offs < 0) { \
negflag = 0<<23; \
offs = -(offs); \
} \
*(--_nIns) = (NIns)( COND_AL | (0xD1<<20) | ((_Rn)<<16) | (FpRegNum(_Dd)<<12) | (0xB<<8) | negflag | ((offs>>2)&0xff) ); \
asm_output3("fldd %s,%s(%d)", gpn(_Dd), gpn(_Rn), _offs); \
} while (0)
#define FLDD(_Dd,_Rn,_offs) FLDD_chk(_Dd,_Rn,_offs,1)
#define FSITOD(_Dd,_Sm) do { \
underrunProtect(4); \
NanoAssert(IsFpReg(_Dd) && ((_Sm) == FpSingleScratch)); \
*(--_nIns) = (NIns)( COND_AL | (0xEB8<<16) | (FpRegNum(_Dd)<<12) | (0x2F<<6) | (0<<5) | (0x7) ); \
asm_output2("fsitod %s,%s", gpn(_Dd), gpn(_Sm)); \
} while (0)
#define FUITOD(_Dd,_Sm) do { \
underrunProtect(4); \
NanoAssert(IsFpReg(_Dd) && ((_Sm) == FpSingleScratch)); \
*(--_nIns) = (NIns)( COND_AL | (0xEB8<<16) | (FpRegNum(_Dd)<<12) | (0x2D<<6) | (0<<5) | (0x7) ); \
asm_output2("fuitod %s,%s", gpn(_Dd), gpn(_Sm)); \
} while (0)
#define FMSR(_Sn,_Rd) do { \
underrunProtect(4); \
NanoAssert(((_Sn) == FpSingleScratch) && IsGpReg(_Rd)); \
*(--_nIns) = (NIns)( COND_AL | (0xE0<<20) | (0x7<<16) | ((_Rd)<<12) | (0xA<<8) | (0<<7) | (0x1<<4) ); \
asm_output2("fmsr %s,%s", gpn(_Sn), gpn(_Rd)); \
} while (0)
#define FNEGD(_Dd,_Dm) do { \
underrunProtect(4); \
NanoAssert(IsFpReg(_Dd) && IsFpReg(_Dm)); \
*(--_nIns) = (NIns)( COND_AL | (0xEB1<<16) | (FpRegNum(_Dd)<<12) | (0xB4<<4) | (FpRegNum(_Dm)) ); \
asm_output2("fnegd %s,%s", gpn(_Dd), gpn(_Dm)); \
} while (0)
#define FADDD(_Dd,_Dn,_Dm) do { \
underrunProtect(4); \
NanoAssert(IsFpReg(_Dd) && IsFpReg(_Dn) && IsFpReg(_Dm)); \
*(--_nIns) = (NIns)( COND_AL | (0xE3<<20) | (FpRegNum(_Dn)<<16) | (FpRegNum(_Dd)<<12) | (0xB0<<4) | (FpRegNum(_Dm)) ); \
asm_output3("faddd %s,%s,%s", gpn(_Dd), gpn(_Dn), gpn(_Dm)); \
} while (0)
#define FSUBD(_Dd,_Dn,_Dm) do { \
underrunProtect(4); \
NanoAssert(IsFpReg(_Dd) && IsFpReg(_Dn) && IsFpReg(_Dm)); \
*(--_nIns) = (NIns)( COND_AL | (0xE3<<20) | (FpRegNum(_Dn)<<16) | (FpRegNum(_Dd)<<12) | (0xB4<<4) | (FpRegNum(_Dm)) ); \
asm_output3("fsubd %s,%s,%s", gpn(_Dd), gpn(_Dn), gpn(_Dm)); \
} while (0)
#define FMULD(_Dd,_Dn,_Dm) do { \
underrunProtect(4); \
NanoAssert(IsFpReg(_Dd) && IsFpReg(_Dn) && IsFpReg(_Dm)); \
*(--_nIns) = (NIns)( COND_AL | (0xE2<<20) | (FpRegNum(_Dn)<<16) | (FpRegNum(_Dd)<<12) | (0xB0<<4) | (FpRegNum(_Dm)) ); \
asm_output3("fmuld %s,%s,%s", gpn(_Dd), gpn(_Dn), gpn(_Dm)); \
} while (0)
#define FDIVD(_Dd,_Dn,_Dm) do { \
underrunProtect(4); \
NanoAssert(IsFpReg(_Dd) && IsFpReg(_Dn) && IsFpReg(_Dm)); \
*(--_nIns) = (NIns)( COND_AL | (0xE8<<20) | (FpRegNum(_Dn)<<16) | (FpRegNum(_Dd)<<12) | (0xB0<<4) | (FpRegNum(_Dm)) ); \
asm_output3("fmuld %s,%s,%s", gpn(_Dd), gpn(_Dn), gpn(_Dm)); \
} while (0)
#define FMSTAT() do { \
underrunProtect(4); \
*(--_nIns) = (NIns)( COND_AL | 0x0EF1FA10); \
asm_output("fmstat"); \
} while (0)
#define FCMPD(_Dd,_Dm) do { \
underrunProtect(4); \
NanoAssert(IsFpReg(_Dd) && IsFpReg(_Dm)); \
*(--_nIns) = (NIns)( COND_AL | (0xEB4<<16) | (FpRegNum(_Dd)<<12) | (0xB4<<4) | (FpRegNum(_Dm)) ); \
asm_output2("fcmpd %s,%s", gpn(_Dd), gpn(_Dm)); \
} while (0)
#define FCPYD(_Dd,_Dm) do { \
underrunProtect(4); \
NanoAssert(IsFpReg(_Dd) && IsFpReg(_Dm)); \
*(--_nIns) = (NIns)( COND_AL | (0xEB0<<16) | (FpRegNum(_Dd)<<12) | (0xB4<<4) | (FpRegNum(_Dm)) ); \
asm_output2("fcpyd %s,%s", gpn(_Dd), gpn(_Dm)); \
} while (0)
}
#endif // __nanojit_NativeThumb__

View file

@@ -68,7 +68,9 @@ namespace nanojit
debug_only( uint32_t count; )
debug_only( RegisterMask managed; ) // bitfield of 0..NJ_MAX_REGISTERS denoting which are under our management
LIns* active[NJ_MAX_REGISTERS]; // active[r] = OP that defines r
// RegisterMask is a 32-bit value, so we can never have more than 32 active.
// hardcode 32 here in case we have non-contiguous register numbers
LIns* active[32]; // active[r] = OP that defines r
RegisterMask free;
RegisterMask used;

View file

@@ -151,6 +151,7 @@ namespace nanojit
#define isU8(i) ( int32_t(i) == uint8_t(i) )
#define isS16(i) ( int32_t(i) == int16_t(i) )
#define isU16(i) ( int32_t(i) == uint16_t(i) )
#define isS24(i) ( ((int32_t(i)<<8)>>8) == (i) )
#define alignTo(x,s) ((((uintptr_t)(x)))&~(((uintptr_t)s)-1))
#define alignUp(x,s) ((((uintptr_t)(x))+(((uintptr_t)s)-1))&~(((uintptr_t)s)-1))
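How the new isS24 (and the isSN family above it) works: shifting left by 8 then arithmetic-right by 8 sign-extends bit 23 through the top byte, so the value fits in 24 signed bits exactly when that round trip is lossless. A small self-check mirroring the macro's own arithmetic:

#include <stdint.h>
#include <cassert>

int main()
{
    // 0x007FFFFF is the largest signed 24-bit value; one past it fails
    // the round trip because bit 23 sign-extends the top byte.
    assert(((int32_t(0x007FFFFF) << 8) >> 8) == 0x007FFFFF);
    assert(((int32_t(0x00800000) << 8) >> 8) != 0x00800000);
    return 0;
}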