From 88f0a9bfc3883205539ae9f8850b2124b7e35ef1 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Tue, 6 Apr 2010 15:55:43 -0700 Subject: [PATCH] Bug 557483 - nanojit: convert i386 codegen macros to functions. r=edwsmith. --HG-- extra : convert_revision : 4062fae8baf26ee9fcdf5c8d1125b2a1fa806515 --- js/src/nanojit/NativeX64.cpp | 10 +- js/src/nanojit/Nativei386.cpp | 783 ++++++++++++++++++++++++- js/src/nanojit/Nativei386.h | 1042 ++++++++------------------------- 3 files changed, 1033 insertions(+), 802 deletions(-) diff --git a/js/src/nanojit/NativeX64.cpp b/js/src/nanojit/NativeX64.cpp index ca9658d66373..a08b0b0a151b 100644 --- a/js/src/nanojit/NativeX64.cpp +++ b/js/src/nanojit/NativeX64.cpp @@ -366,11 +366,11 @@ namespace nanojit #define RL(r) gpRegNames32[(r)] #define RQ(r) gpn(r) -#define R Register -#define I int -#define I32 int32_t -#define U64 uint64_t -#define S size_t + typedef Register R; + typedef int I; + typedef int32_t I32; + typedef uint64_t U64; + typedef size_t S; void Assembler::PUSHR(R r) { emitr(X64_pushr,r); asm_output("push %s", RQ(r)); } void Assembler::POPR( R r) { emitr(X64_popr, r); asm_output("pop %s", RQ(r)); } diff --git a/js/src/nanojit/Nativei386.cpp b/js/src/nanojit/Nativei386.cpp index db7873b7e20e..5e847a581427 100644 --- a/js/src/nanojit/Nativei386.cpp +++ b/js/src/nanojit/Nativei386.cpp @@ -70,6 +70,783 @@ namespace nanojit 0 /* ABI_CDECL */ }; + typedef Register R; + typedef int32_t I32; + + // XXX rearrange NanoAssert() expression to workaround apparent gcc 4.3 bug: + // XXX "error: logical && with non-zero constant will always evaluate as true" + // underrunProtect(6) is necessary for worst-case + inline void Assembler::MODRMs(I32 r, I32 d, R b, I32 l, I32 i) { + NanoAssert(unsigned(i)<8 && unsigned(b)<8 && unsigned(r)<8); + if (d == 0 && b != EBP) { + _nIns -= 2; + _nIns[0] = (uint8_t) ( 0<<6 | r<<3 | 4); + _nIns[1] = (uint8_t) ( l<<6 | i<<3 | b); + } else if (isS8(d)) { + _nIns -= 3; + _nIns[0] = (uint8_t) ( 1<<6 | r<<3 | 4 ); + _nIns[1] = (uint8_t) ( l<<6 | i<<3 | b ); + _nIns[2] = (uint8_t) d; + } else { + IMM32(d); + *(--_nIns) = (uint8_t) ( l<<6 | i<<3 | b ); + *(--_nIns) = (uint8_t) ( 2<<6 | r<<3 | 4 ); + } + } + + // underrunProtect(6) is necessary for worst-case + inline void Assembler::MODRMm(I32 r, I32 d, R b) { + NanoAssert(unsigned(r)<8 && ((b)==UnspecifiedReg || unsigned(b)<8)); + if ((b) == UnspecifiedReg) { + IMM32(d); + *(--_nIns) = (uint8_t) (0<<6 | (r)<<3 | 5); + } else if ((b) == ESP) { + MODRMs(r, d, b, 0, (Register)4); + } else if ( (d) == 0 && (b) != EBP) { + *(--_nIns) = (uint8_t) ( 0<<6 | r<<3 | b ); + } else if (isS8(d)) { + *(--_nIns) = (uint8_t) (d); + *(--_nIns) = (uint8_t) ( 1<<6 | r<<3 | b ); + } else { + IMM32(d); + *(--_nIns) = (uint8_t) ( 2<<6 | r<<3 | b ); + } + } + + inline void Assembler::MODRMSIB(R reg, R base, I32 index, I32 scale, I32 disp) { + if (disp != 0 || base == EBP) { + if (isS8(disp)) { + *(--_nIns) = int8_t(disp); + } else { + IMM32(disp); + } + } + *(--_nIns) = uint8_t( scale<<6 | index<<3 | base ); + if (disp == 0 && base != EBP) { + *(--_nIns) = uint8_t( (reg<<3) | 4); + } else if (isS8(disp)) { + *(--_nIns) = uint8_t( (1<<6) | (reg<<3) | 4 ); + } else { + *(--_nIns) = uint8_t( (2<<6) | (reg<<3) | 4 ); + } + } + + inline void Assembler::MODRMdm(I32 r, I32 addr) { + NanoAssert(unsigned(r)<8); + IMM32(addr); + *(--_nIns) = (uint8_t)( r<<3 | 5 ); + } + + inline void Assembler::ALU0(I32 o) { + underrunProtect(1); + *(--_nIns) = uint8_t(o); + } + + inline void Assembler::ALUm(I32 c, I32 r, I32 
d, R b) { + underrunProtect(8); + MODRMm(r, d, b); + *(--_nIns) = uint8_t(c); + } + + inline void Assembler::ALUdm(I32 c, I32 r, I32 addr) { + underrunProtect(6); + MODRMdm(r, addr); + *(--_nIns) = uint8_t(c); + } + + inline void Assembler::ALUsib(I32 c, R r, R base, I32 index, I32 scale, I32 disp) { + underrunProtect(7); + MODRMSIB(r, base, index, scale, disp); + *(--_nIns) = uint8_t(c); + } + + inline void Assembler::ALUm16(I32 c, I32 r, I32 d, R b) { + underrunProtect(9); + MODRMm(r, d, b); + *(--_nIns) = uint8_t(c); + *(--_nIns) = 0x66; + } + + inline void Assembler::ALU2dm(I32 c, I32 r, I32 addr) { + underrunProtect(7); + MODRMdm(r, addr); + *(--_nIns) = uint8_t(c); + *(--_nIns) = uint8_t(c>>8); + } + + inline void Assembler::ALU2m(I32 c, I32 r, I32 d, R b) { + underrunProtect(9); + MODRMm(r, d, b); + *(--_nIns) = uint8_t(c); + *(--_nIns) = uint8_t(c>>8); + } + + inline void Assembler::ALU2sib(I32 c, Register r, R base, I32 index, I32 scale, I32 disp) { + underrunProtect(8); + MODRMSIB(r, base, index, scale, disp); + *(--_nIns) = uint8_t(c); + *(--_nIns) = uint8_t(c>>8); + } + + inline void Assembler::ALUi(I32 c, I32 r, I32 i) { + underrunProtect(6); + NanoAssert(unsigned(r)<8); + if (isS8(i)) { + *(--_nIns) = uint8_t(i); + MODRM(c>>3, r); + *(--_nIns) = uint8_t(0x83); + } else { + IMM32(i); + if ( r == EAX) { + *(--_nIns) = uint8_t(c); + } else { + MODRM((c>>3),(r)); + *(--_nIns) = uint8_t(0x81); + } + } + } + + inline void Assembler::ALUmi(I32 c, I32 d, Register b, I32 i) { + underrunProtect(10); + NanoAssert(((unsigned)b)<8); + if (isS8(i)) { + *(--_nIns) = uint8_t(i); + MODRMm(c>>3, d, b); + *(--_nIns) = uint8_t(0x83); + } else { + IMM32(i); + MODRMm(c>>3, d, b); + *(--_nIns) = uint8_t(0x81); + } + } + + inline void Assembler::ALU2(I32 c, I32 d, I32 s) { + underrunProtect(3); + MODRM((d),(s)); + _nIns -= 2; + _nIns[0] = uint8_t(c>>8); + _nIns[1] = uint8_t(c); + } + + inline void Assembler::LAHF() { count_alu(); ALU0(0x9F); asm_output("lahf"); } + inline void Assembler::SAHF() { count_alu(); ALU0(0x9E); asm_output("sahf"); } + inline void Assembler::OR(R l, R r) { count_alu(); ALU(0x0b, (l),(r)); asm_output("or %s,%s",gpn(l),gpn(r)); } + inline void Assembler::AND(R l, R r) { count_alu(); ALU(0x23, (l),(r)); asm_output("and %s,%s",gpn(l),gpn(r)); } + inline void Assembler::XOR(R l, R r) { count_alu(); ALU(0x33, (l),(r)); asm_output("xor %s,%s",gpn(l),gpn(r)); } + inline void Assembler::ADD(R l, R r) { count_alu(); ALU(0x03, (l),(r)); asm_output("add %s,%s",gpn(l),gpn(r)); } + inline void Assembler::SUB(R l, R r) { count_alu(); ALU(0x2b, (l),(r)); asm_output("sub %s,%s",gpn(l),gpn(r)); } + inline void Assembler::MUL(R l, R r) { count_alu(); ALU2(0x0faf,(l),(r)); asm_output("mul %s,%s",gpn(l),gpn(r)); } + inline void Assembler::DIV(R r) { count_alu(); ALU(0xf7, (Register)7,(r)); asm_output("idiv edx:eax, %s",gpn(r)); } + inline void Assembler::NOT(R r) { count_alu(); ALU(0xf7, (Register)2,(r)); asm_output("not %s",gpn(r)); } + inline void Assembler::NEG(R r) { count_alu(); ALU(0xf7, (Register)3,(r)); asm_output("neg %s",gpn(r)); } + inline void Assembler::SHR(R r, R s) { count_alu(); ALU(0xd3, (Register)5,(r)); asm_output("shr %s,%s",gpn(r),gpn(s)); } + inline void Assembler::SAR(R r, R s) { count_alu(); ALU(0xd3, (Register)7,(r)); asm_output("sar %s,%s",gpn(r),gpn(s)); } + inline void Assembler::SHL(R r, R s) { count_alu(); ALU(0xd3, (Register)4,(r)); asm_output("shl %s,%s",gpn(r),gpn(s)); } + + inline void Assembler::SHIFT(I32 c, R r, I32 i) { + underrunProtect(3); + *--_nIns = 
(uint8_t)(i); + MODRM((Register)c,r); + *--_nIns = 0xc1; + } + + inline void Assembler::SHLi(R r, I32 i) { count_alu(); SHIFT(4,r,i); asm_output("shl %s,%d", gpn(r),i); } + inline void Assembler::SHRi(R r, I32 i) { count_alu(); SHIFT(5,r,i); asm_output("shr %s,%d", gpn(r),i); } + inline void Assembler::SARi(R r, I32 i) { count_alu(); SHIFT(7,r,i); asm_output("sar %s,%d", gpn(r),i); } + + inline void Assembler::MOVZX8(R d, R s) { count_alu(); ALU2(0x0fb6,d,s); asm_output("movzx %s,%s", gpn(d),gpn(s)); } + + inline void Assembler::SUBi(R r, I32 i) { count_alu(); ALUi(0x2d,r,i); asm_output("sub %s,%d",gpn(r),i); } + inline void Assembler::ADDi(R r, I32 i) { count_alu(); ALUi(0x05,r,i); asm_output("add %s,%d",gpn(r),i); } + inline void Assembler::ANDi(R r, I32 i) { count_alu(); ALUi(0x25,r,i); asm_output("and %s,%d",gpn(r),i); } + inline void Assembler::ORi(R r, I32 i) { count_alu(); ALUi(0x0d,r,i); asm_output("or %s,%d",gpn(r),i); } + inline void Assembler::XORi(R r, I32 i) { count_alu(); ALUi(0x35,r,i); asm_output("xor %s,%d",gpn(r),i); } + + inline void Assembler::ADDmi(I32 d, R b, I32 i) { count_alust(); ALUmi(0x05, d, b, i); asm_output("add %d(%s), %d", d, gpn(b), i); } + + inline void Assembler::TEST(R d, R s) { count_alu(); ALU(0x85,d,s); asm_output("test %s,%s",gpn(d),gpn(s)); } + inline void Assembler::CMP(R l, R r) { count_alu(); ALU(0x3b,l,r); asm_output("cmp %s,%s",gpn(l),gpn(r)); } + inline void Assembler::CMPi(R r, I32 i) { count_alu(); ALUi(0x3d,r,i); asm_output("cmp %s,%d",gpn(r),i); } + + inline void Assembler::LEA(R r, I32 d, R b) { count_alu(); ALUm(0x8d, r,d,b); asm_output("lea %s,%d(%s)",gpn(r),d,gpn(b)); } + // lea %r, d(%i*4) + // This addressing mode is not supported by the MODRMSIB macro. + inline void Assembler::LEAmi4(R r, I32 d, I32 i) { + count_alu(); + IMM32(int32_t(d)); + *(--_nIns) = (2<<6) | ((uint8_t)i<<3) | 5; + *(--_nIns) = (0<<6) | ((uint8_t)r<<3) | 4; + *(--_nIns) = 0x8d; + asm_output("lea %s, %p(%s*4)", gpn(r), (void*)d, gpn(i)); + } + + inline void Assembler::CDQ() { SARi(EDX, 31); MR(EDX, EAX); } + + inline void Assembler::INCLi(I32 p) { + count_alu(); + underrunProtect(6); + IMM32((uint32_t)(ptrdiff_t)p); *(--_nIns) = 0x05; *(--_nIns) = 0xFF; + asm_output("incl (%p)", (void*)p); + } + + inline void Assembler::SETE( R r) { count_alu(); ALU2(0x0f94,(r),(r)); asm_output("sete %s", gpn(r)); } + inline void Assembler::SETNP(R r) { count_alu(); ALU2(0x0f9B,(r),(r)); asm_output("setnp %s",gpn(r)); } + inline void Assembler::SETL( R r) { count_alu(); ALU2(0x0f9C,(r),(r)); asm_output("setl %s", gpn(r)); } + inline void Assembler::SETLE(R r) { count_alu(); ALU2(0x0f9E,(r),(r)); asm_output("setle %s",gpn(r)); } + inline void Assembler::SETG( R r) { count_alu(); ALU2(0x0f9F,(r),(r)); asm_output("setg %s", gpn(r)); } + inline void Assembler::SETGE(R r) { count_alu(); ALU2(0x0f9D,(r),(r)); asm_output("setge %s",gpn(r)); } + inline void Assembler::SETB( R r) { count_alu(); ALU2(0x0f92,(r),(r)); asm_output("setb %s", gpn(r)); } + inline void Assembler::SETBE(R r) { count_alu(); ALU2(0x0f96,(r),(r)); asm_output("setbe %s",gpn(r)); } + inline void Assembler::SETA( R r) { count_alu(); ALU2(0x0f97,(r),(r)); asm_output("seta %s", gpn(r)); } + inline void Assembler::SETAE(R r) { count_alu(); ALU2(0x0f93,(r),(r)); asm_output("setae %s",gpn(r)); } + inline void Assembler::SETO( R r) { count_alu(); ALU2(0x0f92,(r),(r)); asm_output("seto %s", gpn(r)); } + + inline void Assembler::MREQ(R d, R s) { count_alu(); ALU2(0x0f44,d,s); asm_output("cmove %s,%s", gpn(d),gpn(s)); } + 
inline void Assembler::MRNE(R d, R s) { count_alu(); ALU2(0x0f45,d,s); asm_output("cmovne %s,%s", gpn(d),gpn(s)); } + inline void Assembler::MRL( R d, R s) { count_alu(); ALU2(0x0f4C,d,s); asm_output("cmovl %s,%s", gpn(d),gpn(s)); } + inline void Assembler::MRLE(R d, R s) { count_alu(); ALU2(0x0f4E,d,s); asm_output("cmovle %s,%s", gpn(d),gpn(s)); } + inline void Assembler::MRG( R d, R s) { count_alu(); ALU2(0x0f4F,d,s); asm_output("cmovg %s,%s", gpn(d),gpn(s)); } + inline void Assembler::MRGE(R d, R s) { count_alu(); ALU2(0x0f4D,d,s); asm_output("cmovge %s,%s", gpn(d),gpn(s)); } + inline void Assembler::MRB( R d, R s) { count_alu(); ALU2(0x0f42,d,s); asm_output("cmovb %s,%s", gpn(d),gpn(s)); } + inline void Assembler::MRBE(R d, R s) { count_alu(); ALU2(0x0f46,d,s); asm_output("cmovbe %s,%s", gpn(d),gpn(s)); } + inline void Assembler::MRA( R d, R s) { count_alu(); ALU2(0x0f47,d,s); asm_output("cmova %s,%s", gpn(d),gpn(s)); } + inline void Assembler::MRAE(R d, R s) { count_alu(); ALU2(0x0f43,d,s); asm_output("cmovae %s,%s", gpn(d),gpn(s)); } + inline void Assembler::MRNO(R d, R s) { count_alu(); ALU2(0x0f41,d,s); asm_output("cmovno %s,%s", gpn(d),gpn(s)); } + + // these aren't currently used but left in for reference + //#define LDEQ(r,d,b) do { ALU2m(0x0f44,r,d,b); asm_output("cmove %s,%d(%s)", gpn(r),d,gpn(b)); } while(0) + //#define LDNEQ(r,d,b) do { ALU2m(0x0f45,r,d,b); asm_output("cmovne %s,%d(%s)", gpn(r),d,gpn(b)); } while(0) + + inline void Assembler::LD(R reg, I32 disp, R base) { + count_ld(); + ALUm(0x8b,reg,disp,base); + asm_output("mov %s,%d(%s)",gpn(reg),disp,gpn(base)); + } + + inline void Assembler::LDdm(R reg, I32 addr) { + count_ld(); + ALUdm(0x8b,reg,addr); + asm_output("mov %s,0(%lx)",gpn(reg),(unsigned long)addr); + } + +#define SIBIDX(n) "1248"[n] + + inline void Assembler::LDsib(R reg, I32 disp, R base, I32 index, I32 scale) { + count_ld(); + ALUsib(0x8b, reg, base, index, scale, disp); + asm_output("mov %s,%d(%s+%s*%c)",gpn(reg),disp,gpn(base),gpn(index),SIBIDX(scale)); + } + + // note: movzx/movsx are being output with an 8/16 suffix to indicate the + // size being loaded. This doesn't really match standard intel format + // (though is arguably terser and more obvious in this case) and would + // probably be nice to fix. (Likewise, the 8/16 bit stores being output + // as "mov8" and "mov16" respectively.) + + // Load 16-bit, sign extend. + inline void Assembler::LD16S(R r, I32 d, R b) { + count_ld(); + ALU2m(0x0fbf, r, d, b); + asm_output("movsx16 %s,%d(%s)", gpn(r),d,gpn(b)); + } + + inline void Assembler::LD16Sdm(R r, I32 addr) { + count_ld(); + ALU2dm(0x0fbf, r, addr); + asm_output("movsx16 %s,0(%lx)", gpn(r),(unsigned long)addr); + } + + inline void Assembler::LD16Ssib(R r, I32 disp, R base, I32 index, I32 scale) { + count_ld(); + ALU2sib(0x0fbf, r, base, index, scale, disp); + asm_output("movsx16 %s,%d(%s+%s*%c)",gpn(r),disp,gpn(base),gpn(index),SIBIDX(scale)); + } + + // Load 16-bit, zero extend. + inline void Assembler::LD16Z(R r, I32 d, R b) { + count_ld(); + ALU2m(0x0fb7, r, d, b); + asm_output("movzx16 %s,%d(%s)", gpn(r),d,gpn(b)); + } + + inline void Assembler::LD16Zdm(R r, I32 addr) { + count_ld(); + ALU2dm(0x0fb7, r, addr); + asm_output("movzx16 %s,0(%lx)", gpn(r),(unsigned long)addr); + } + + inline void Assembler::LD16Zsib(R r, I32 disp, R base, I32 index, I32 scale) { + count_ld(); + ALU2sib(0x0fb7, r, base, index, scale, disp); + asm_output("movzx16 %s,%d(%s+%s*%c)",gpn(r),disp,gpn(base),gpn(index),SIBIDX(scale)); + } + + // Load 8-bit, zero extend. 
+ inline void Assembler::LD8Z(R r, I32 d, R b) { + count_ld(); + ALU2m(0x0fb6, r, d, b); + asm_output("movzx8 %s,%d(%s)", gpn(r),d,gpn(b)); + } + + inline void Assembler::LD8Zdm(R r, I32 addr) { + count_ld(); + ALU2dm(0x0fb6, r, addr); + asm_output("movzx8 %s,0(%lx)", gpn(r),(long unsigned)addr); + } + + inline void Assembler::LD8Zsib(R r, I32 disp, R base, I32 index, I32 scale) { + count_ld(); + ALU2sib(0x0fb6, r, base, index, scale, disp); + asm_output("movzx8 %s,%d(%s+%s*%c)",gpn(r),disp,gpn(base),gpn(index),SIBIDX(scale)); + } + + // Load 8-bit, sign extend. + inline void Assembler::LD8S(R r, I32 d, R b) { + count_ld(); + ALU2m(0x0fbe, r, d, b); + asm_output("movsx8 %s,%d(%s)", gpn(r),d,gpn(b)); + } + + inline void Assembler::LD8Sdm(R r, I32 addr) { + count_ld(); + ALU2dm(0x0fbe, r, addr); + asm_output("movsx8 %s,0(%lx)", gpn(r),(long unsigned)addr); + } + + inline void Assembler::LD8Ssib(R r, I32 disp, R base, I32 index, I32 scale) { + count_ld(); + ALU2sib(0x0fbe, r, base, index, scale, disp); + asm_output("movsx8 %s,%d(%s+%s*%c)",gpn(r),disp,gpn(base),gpn(index),SIBIDX(scale)); + } + + inline void Assembler::LDi(R r, I32 i) { + count_ld(); + underrunProtect(5); + IMM32(i); + NanoAssert(((unsigned)r)<8); + *(--_nIns) = (uint8_t) ( 0xb8 | r ); + asm_output("mov %s,%d",gpn(r),i); + } + + // Quirk of x86-32: reg must be a/b/c/d for byte stores here. + inline void Assembler::ST8(R base, I32 disp, R reg) { + count_st(); + NanoAssert(((unsigned)reg)<4); + ALUm(0x88, reg, disp, base); + asm_output("mov8 %d(%s),%s",disp,base==UnspecifiedReg?"0":gpn(base),gpn(reg)); + } + + inline void Assembler::ST16(R base, I32 disp, R reg) { + count_st(); + ALUm16(0x89, reg, disp, base); + asm_output("mov16 %d(%s),%s",disp,base==UnspecifiedReg?"0":gpn(base),gpn(reg)); + } + + inline void Assembler::ST(R base, I32 disp, R reg) { + count_st(); + ALUm(0x89, reg, disp, base); + asm_output("mov %d(%s),%s",disp,base==UnspecifiedReg?"0":gpn(base),gpn(reg)); + } + + inline void Assembler::ST8i(R base, I32 disp, I32 imm) { + count_st(); + underrunProtect(8); + IMM8(imm); + MODRMm(0, disp, base); + *(--_nIns) = 0xc6; + asm_output("mov8 %d(%s),%d",disp,gpn(base),imm); + } + + inline void Assembler::ST16i(R base, I32 disp, I32 imm) { + count_st(); + underrunProtect(10); + IMM16(imm); + MODRMm(0, disp, base); + *(--_nIns) = 0xc7; + *(--_nIns) = 0x66; + asm_output("mov16 %d(%s),%d",disp,gpn(base),imm); + } + + inline void Assembler::STi(R base, I32 disp, I32 imm) { + count_st(); + underrunProtect(11); + IMM32(imm); + MODRMm(0, disp, base); + *(--_nIns) = 0xc7; + asm_output("mov %d(%s),%d",disp,gpn(base),imm); + } + + inline void Assembler::RET() { count_ret(); ALU0(0xc3); asm_output("ret"); } + inline void Assembler::NOP() { count_alu(); ALU0(0x90); asm_output("nop"); } + inline void Assembler::INT3() { ALU0(0xcc); asm_output("int3"); } + + inline void Assembler::PUSHi(I32 i) { + count_push(); + if (isS8(i)) { + underrunProtect(2); + _nIns-=2; _nIns[0] = 0x6a; _nIns[1] = uint8_t(i); + asm_output("push %d",i); + } else { + PUSHi32(i); + } + } + + inline void Assembler::PUSHi32(I32 i) { + count_push(); + underrunProtect(5); + IMM32(i); + *(--_nIns) = 0x68; + asm_output("push %d",i); + } + + inline void Assembler::PUSHr(R r) { + count_push(); + underrunProtect(1); + NanoAssert(((unsigned)r)<8); + *(--_nIns) = (uint8_t) ( 0x50 | r ); + asm_output("push %s",gpn(r)); + } + + inline void Assembler::PUSHm(I32 d, R b) { + count_pushld(); + ALUm(0xff, 6, d, b); + asm_output("push %d(%s)",d,gpn(b)); + } + + inline void 
Assembler::POPr(R r) { + count_pop(); + underrunProtect(1); + NanoAssert(((unsigned)r)<8); + *(--_nIns) = (uint8_t) ( 0x58 | (r) ); + asm_output("pop %s",gpn(r)); + } + +#define JCC32 0x0f +#define JMP8 0xeb +#define JMP32 0xe9 + + inline void Assembler::JCC(I32 o, NIns* t, const char* n) { + count_jcc(); + underrunProtect(6); + intptr_t tt = (intptr_t)t - (intptr_t)_nIns; + if (isS8(tt)) { + verbose_only( NIns* next = _nIns; (void)next; ) + _nIns -= 2; + _nIns[0] = uint8_t( 0x70 | o ); + _nIns[1] = uint8_t(tt); + asm_output("%-5s %p",n,next+tt); + } else { + verbose_only( NIns* next = _nIns; ) + IMM32(tt); + _nIns -= 2; + _nIns[0] = JCC32; + _nIns[1] = (uint8_t) ( 0x80 | o ); + asm_output("%-5s %p",n,next+tt); + } + } + + inline void Assembler::JMP_long(NIns* t) { + count_jmp(); + underrunProtect(5); + intptr_t tt = (intptr_t)t - (intptr_t)_nIns; + JMP_long_nochk_offset(tt); + verbose_only( verbose_outputf("%010lx:", (unsigned long)_nIns); ) + } + + inline void Assembler::JMP_indirect(R r) { + underrunProtect(2); + MODRMm(4, 0, r); + *(--_nIns) = 0xff; + asm_output("jmp *(%s)", gpn(r)); + } + + inline void Assembler::JMP_indexed(Register x, I32 ss, NIns** addr) { + underrunProtect(7); + IMM32(int32_t(addr)); + _nIns -= 3; + _nIns[0] = (NIns) 0xff; /* jmp */ + _nIns[1] = (NIns) (0<<6 | 4<<3 | 4); /* modrm: base=sib + disp32 */ + _nIns[2] = (NIns) (ss<<6 | (x)<<3 | 5); /* sib: x<>16) & 0xff); + _nIns[1] = uint8_t((c>>8) & 0xff); + _nIns[2] = uint8_t(c&0xff); + } + + inline void Assembler::SSEm(I32 c, I32 r, I32 d, R b) { + underrunProtect(9); + MODRMm(r, d, b); + _nIns -= 3; + _nIns[0] = uint8_t((c>>16) & 0xff); + _nIns[1] = uint8_t((c>>8) & 0xff); + _nIns[2] = uint8_t(c & 0xff); + } + + inline void Assembler::LDSDm(R r, const double* addr) { + count_ldq(); + underrunProtect(8); + IMM32(int32_t(addr)); + *(--_nIns) = uint8_t(((r)&7)<<3|5); + *(--_nIns) = 0x10; + *(--_nIns) = 0x0f; + *(--_nIns) = 0xf2; + asm_output("movsd %s,(%p) // =%f",gpn(r),(void*)addr,*addr); + } + + inline void Assembler::SSE_LDSD(R r, I32 d, R b) { count_ldq(); SSEm(0xf20f10, r&7, d, b); asm_output("movsd %s,%d(%s)",gpn(r),(d),gpn(b)); } + inline void Assembler::SSE_LDQ( R r, I32 d, R b) { count_ldq(); SSEm(0xf30f7e, r&7, d, b); asm_output("movq %s,%d(%s)",gpn(r),d,gpn(b)); } + inline void Assembler::SSE_LDSS(R r, I32 d, R b) { count_ld(); SSEm(0xf30f10, r&7, d, b); asm_output("movss %s,%d(%s)",gpn(r),d,gpn(b)); } + inline void Assembler::SSE_STSD(I32 d, R b, R r) { count_stq(); SSEm(0xf20f11, r&7, d, b); asm_output("movsd %d(%s),%s",(d),gpn(b),gpn(r)); } + inline void Assembler::SSE_STQ( I32 d, R b, R r) { count_stq(); SSEm(0x660fd6, r&7, d, b); asm_output("movq %d(%s),%s",(d),gpn(b),gpn(r)); } + inline void Assembler::SSE_STSS(I32 d, R b, R r) { count_st(); SSEm(0xf30f11, r&7, d, b); asm_output("movss %d(%s),%s",(d),gpn(b),gpn(r)); } + + inline void Assembler::SSE_CVTSI2SD(R xr, R gr) { count_fpu(); SSE(0xf20f2a, xr&7, gr&7); asm_output("cvtsi2sd %s,%s",gpn(xr),gpn(gr)); } + inline void Assembler::SSE_CVTSD2SI(R gr, R xr) { count_fpu(); SSE(0xf20f2d, gr&7, xr&7); asm_output("cvtsd2si %s,%s",gpn(gr),gpn(xr)); } + inline void Assembler::SSE_CVTSD2SS(R xr, R gr) { count_fpu(); SSE(0xf20f5a, xr&7, gr&7); asm_output("cvtsd2ss %s,%s",gpn(xr),gpn(gr)); } + inline void Assembler::SSE_CVTSS2SD(R xr, R gr) { count_fpu(); SSE(0xf30f5a, xr&7, gr&7); asm_output("cvtss2sd %s,%s",gpn(xr),gpn(gr)); } + inline void Assembler::SSE_CVTDQ2PD(R d, R r) { count_fpu(); SSE(0xf30fe6, d&7, r&7); asm_output("cvtdq2pd %s,%s",gpn(d),gpn(r)); } 
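(Editorial aside, not part of the patch.) All of the emitters above write code backwards: `_nIns` starts at the end of the code buffer, `underrunProtect()` reserves worst-case space, and each helper lays down its bytes with `*(--_nIns)`, so immediates and ModRM bytes land in the buffer before the opcode that precedes them in memory. A minimal standalone sketch of that scheme, using my own simplified buffer and helper names rather than nanojit's, for the `mov r32, imm32` form that `Assembler::LDi` emits:

```cpp
#include <cstdint>
#include <cstdio>

static uint8_t  buf[32];
static uint8_t* nIns = buf + sizeof(buf);      // plays the role of Assembler::_nIns

static void IMM32(int32_t i) {                 // same shape as the patch's IMM32
    nIns -= 4;
    *(int32_t*)nIns = i;
}

// "mov r32, imm32" is 0xB8+reg followed by the immediate, so the immediate
// goes into the buffer first and the opcode byte is written just below it.
static void emit_mov_ri(unsigned reg, int32_t imm) {
    IMM32(imm);
    *(--nIns) = uint8_t(0xB8 | reg);
}

int main() {
    emit_mov_ri(0 /* eax */, 42);
    for (uint8_t* p = nIns; p != buf + sizeof(buf); ++p)
        printf("%02x ", *p);                   // prints: b8 2a 00 00 00
    printf("\n");
}
```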
+ + // Move and zero-extend GP reg to XMM reg. + inline void Assembler::SSE_MOVD(R d, R s) { + count_mov(); + if (_is_xmm_reg_(s)) { + NanoAssert(_is_gp_reg_(d)); + SSE(0x660f7e, s&7, d&7); + } else { + NanoAssert(_is_gp_reg_(s)); + NanoAssert(_is_xmm_reg_(d)); + SSE(0x660f6e, d&7, s&7); + } + asm_output("movd %s,%s",gpn(d),gpn(s)); + } + + inline void Assembler::SSE_MOVSD(R rd, R rs) { + count_mov(); + NanoAssert(_is_xmm_reg_(rd) && _is_xmm_reg_(rs)); + SSE(0xf20f10, rd&7, rs&7); + asm_output("movsd %s,%s",gpn(rd),gpn(rs)); + } + + inline void Assembler::SSE_MOVDm(R d, R b, R xrs) { + count_st(); + NanoAssert(_is_xmm_reg_(xrs) && (_is_gp_reg_(b) || b==FP)); + SSEm(0x660f7e, xrs&7, d, b); + asm_output("movd %d(%s),%s", d, gpn(b), gpn(xrs)); + } + + inline void Assembler::SSE_ADDSD(R rd, R rs) { + count_fpu(); + NanoAssert(_is_xmm_reg_(rd) && _is_xmm_reg_(rs)); + SSE(0xf20f58, rd&7, rs&7); + asm_output("addsd %s,%s",gpn(rd),gpn(rs)); + } + + inline void Assembler::SSE_ADDSDm(R r, const double* addr) { + count_fpuld(); + underrunProtect(8); + NanoAssert(_is_xmm_reg_(r)); + const double* daddr = addr; + IMM32(int32_t(daddr)); + *(--_nIns) = uint8_t((r&7)<<3 | 5); + *(--_nIns) = 0x58; + *(--_nIns) = 0x0f; + *(--_nIns) = 0xf2; + asm_output("addsd %s,%p // =%f",gpn(r),(void*)daddr,*daddr); + } + + inline void Assembler::SSE_SUBSD(R rd, R rs) { + count_fpu(); + NanoAssert(_is_xmm_reg_(rd) && _is_xmm_reg_(rs)); + SSE(0xf20f5c, rd&7, rs&7); + asm_output("subsd %s,%s",gpn(rd),gpn(rs)); + } + + inline void Assembler::SSE_MULSD(R rd, R rs) { + count_fpu(); + NanoAssert(_is_xmm_reg_(rd) && _is_xmm_reg_(rs)); + SSE(0xf20f59, rd&7, rs&7); + asm_output("mulsd %s,%s",gpn(rd),gpn(rs)); + } + + inline void Assembler::SSE_DIVSD(R rd, R rs) { + count_fpu(); + NanoAssert(_is_xmm_reg_(rd) && _is_xmm_reg_(rs)); + SSE(0xf20f5e, rd&7, rs&7); + asm_output("divsd %s,%s",gpn(rd),gpn(rs)); + } + + inline void Assembler::SSE_UCOMISD(R rl, R rr) { + count_fpu(); + NanoAssert(_is_xmm_reg_(rl) && _is_xmm_reg_(rr)); + SSE(0x660f2e, rl&7, rr&7); + asm_output("ucomisd %s,%s",gpn(rl),gpn(rr)); + } + + inline void Assembler::SSE_CVTSI2SDm(R xr, R d, R b) { + count_fpu(); + NanoAssert(_is_xmm_reg_(xr) && _is_gp_reg_(b)); + SSEm(0xf20f2a, xr&7, d, b); + asm_output("cvtsi2sd %s,%d(%s)",gpn(xr),d,gpn(b)); + } + + inline void Assembler::SSE_XORPD(R r, const uint32_t* maskaddr) { + count_fpuld(); + underrunProtect(8); + IMM32(int32_t(maskaddr)); + *(--_nIns) = uint8_t((r&7)<<3 | 5); + *(--_nIns) = 0x57; + *(--_nIns) = 0x0f; + *(--_nIns) = 0x66; + asm_output("xorpd %s,[%p]",gpn(r),(void*)maskaddr); + } + + inline void Assembler::SSE_XORPDr(R rd, R rs) { + count_fpu(); + SSE(0x660f57, rd&7, rs&7); + asm_output("xorpd %s,%s",gpn(rd),gpn(rs)); + } + + // floating point unit + inline void Assembler::FPUc(I32 o) { + underrunProtect(2); + *(--_nIns) = (uint8_t)(o & 0xff); + *(--_nIns) = (uint8_t)((o>>8) & 0xff); + } + + inline void Assembler::FPUm(I32 o, I32 d, R b) { + underrunProtect(7); + MODRMm(uint8_t(o), d, b); + *(--_nIns) = (uint8_t)(o>>8); + } + + inline void Assembler::FPUdm(I32 o, const double* const m) { + underrunProtect(6); + MODRMdm(uint8_t(o), int32_t(m)); + *(--_nIns) = uint8_t(o>>8); + } + + inline void Assembler::TEST_AH(I32 i) { + count_alu(); + underrunProtect(3); + *(--_nIns) = uint8_t(i); + *(--_nIns) = 0xc4; + *(--_nIns) = 0xf6; + asm_output("test ah, %d",i); + } + + inline void Assembler::TEST_AX(I32 i) { + count_fpu(); + underrunProtect(5); + *(--_nIns) = 0; + *(--_nIns) = uint8_t(i); + *(--_nIns) = 
uint8_t((i)>>8); + *(--_nIns) = 0; + *(--_nIns) = 0xa9; + asm_output("test ax, %d",i); + } + + inline void Assembler::FNSTSW_AX() { count_fpu(); FPUc(0xdfe0); asm_output("fnstsw_ax"); } + inline void Assembler::FCHS() { count_fpu(); FPUc(0xd9e0); asm_output("fchs"); } + inline void Assembler::FLD1() { count_fpu(); FPUc(0xd9e8); asm_output("fld1"); fpu_push(); } + inline void Assembler::FLDZ() { count_fpu(); FPUc(0xd9ee); asm_output("fldz"); fpu_push(); } + + inline void Assembler::FFREE(R r) { count_fpu(); FPU(0xddc0, r); asm_output("ffree %s",gpn(r)); } + + inline void Assembler::FST32(bool p, I32 d, R b){ count_stq(); FPUm(0xd902|p, d, b); asm_output("fst%s32 %d(%s)",(p?"p":""),d,gpn(b)); if (p) fpu_pop(); } + inline void Assembler::FSTQ(bool p, I32 d, R b) { count_stq(); FPUm(0xdd02|p, d, b); asm_output("fst%sq %d(%s)",(p?"p":""),d,gpn(b)); if (p) fpu_pop(); } + + inline void Assembler::FSTPQ(I32 d, R b) { FSTQ(1, d, b); } + + inline void Assembler::FCOM(bool p, I32 d, R b) { count_fpuld(); FPUm(0xdc02|p, d, b); asm_output("fcom%s %d(%s)",(p?"p":""),d,gpn(b)); if (p) fpu_pop(); } + inline void Assembler::FCOMdm(bool p, const double* dm) { + count_fpuld(); + FPUdm(0xdc02|p, dm); + asm_output("fcom%s (%p)",(p?"p":""),(void*)dm); + if (p) fpu_pop(); + } + + inline void Assembler::FLD32(I32 d, R b) { count_ldq(); FPUm(0xd900, d, b); asm_output("fld32 %d(%s)",d,gpn(b)); fpu_push();} + inline void Assembler::FLDQ(I32 d, R b) { count_ldq(); FPUm(0xdd00, d, b); asm_output("fldq %d(%s)",d,gpn(b)); fpu_push();} + inline void Assembler::FLDQdm(const double* dm) { count_ldq(); FPUdm(0xdd00, dm); asm_output("fldq (%p)",(void*)dm); fpu_push();} + inline void Assembler::FILDQ(I32 d, R b) { count_fpuld(); FPUm(0xdf05, d, b); asm_output("fildq %d(%s)",d,gpn(b)); fpu_push(); } + inline void Assembler::FILD(I32 d, R b) { count_fpuld(); FPUm(0xdb00, d, b); asm_output("fild %d(%s)",d,gpn(b)); fpu_push(); } + + inline void Assembler::FIST(bool p, I32 d, R b) { count_fpu(); FPUm(0xdb02|p, d, b); asm_output("fist%s %d(%s)",(p?"p":""),d,gpn(b)); if(p) fpu_pop(); } + + inline void Assembler::FADD( I32 d, R b) { count_fpu(); FPUm(0xdc00, d, b); asm_output("fadd %d(%s)", d,gpn(b)); } + inline void Assembler::FSUB( I32 d, R b) { count_fpu(); FPUm(0xdc04, d, b); asm_output("fsub %d(%s)", d,gpn(b)); } + inline void Assembler::FSUBR(I32 d, R b) { count_fpu(); FPUm(0xdc05, d, b); asm_output("fsubr %d(%s)",d,gpn(b)); } + inline void Assembler::FMUL( I32 d, R b) { count_fpu(); FPUm(0xdc01, d, b); asm_output("fmul %d(%s)", d,gpn(b)); } + inline void Assembler::FDIV( I32 d, R b) { count_fpu(); FPUm(0xdc06, d, b); asm_output("fdiv %d(%s)", d,gpn(b)); } + inline void Assembler::FDIVR(I32 d, R b) { count_fpu(); FPUm(0xdc07, d, b); asm_output("fdivr %d(%s)",d,gpn(b)); } + + inline void Assembler::FADDdm( const double *dm) { count_ldq(); FPUdm(0xdc00, dm); asm_output("fadd (%p)", (void*)dm); } + inline void Assembler::FSUBRdm(const double* dm) { count_ldq(); FPUdm(0xdc05, dm); asm_output("fsubr (%p)",(void*)dm); } + inline void Assembler::FMULdm( const double* dm) { count_ldq(); FPUdm(0xdc01, dm); asm_output("fmul (%p)", (void*)dm); } + inline void Assembler::FDIVRdm(const double* dm) { count_ldq(); FPUdm(0xdc07, dm); asm_output("fdivr (%p)",(void*)dm); } + + inline void Assembler::FINCSTP() { count_fpu(); FPUc(0xd9f7); asm_output("fincstp"); } + + inline void Assembler::FCOMP() { count_fpu(); FPUc(0xD8D9); asm_output("fcomp"); fpu_pop();} + inline void Assembler::FCOMPP() { count_fpu(); FPUc(0xDED9); asm_output("fcompp"); 
fpu_pop();fpu_pop();} + inline void Assembler::FLDr(R r) { count_ldq(); FPU(0xd9c0,r); asm_output("fld %s",gpn(r)); fpu_push(); } + inline void Assembler::EMMS() { count_fpu(); FPUc(0x0f77); asm_output("emms"); } + + // standard direct call + inline void Assembler::CALL(const CallInfo* ci) { + count_call(); + underrunProtect(5); + int offset = (ci->_address) - ((int)_nIns); + IMM32( (uint32_t)offset ); + *(--_nIns) = 0xE8; + verbose_only(asm_output("call %s",(ci->_name));) + debug_only(if (ci->returnType()==ARGTYPE_F) fpu_push();) + } + + // indirect call thru register + inline void Assembler::CALLr(const CallInfo* ci, Register r) { + count_calli(); + underrunProtect(2); + ALU(0xff, 2, (r)); + verbose_only(asm_output("call %s",gpn(r));) + debug_only(if (ci->returnType()==ARGTYPE_F) fpu_push();) + } + void Assembler::nInit(AvmCore*) { } @@ -122,7 +899,7 @@ namespace nanojit _epilogue = genEpilogue(); emitJumpTable(si, _epilogue); JMP_indirect(r); - LEAmi4(r, si->table, r); + LEAmi4(r, int32_t(si->table), r); } else { // If the guard already exists, use a simple jump. if (destKnown) { @@ -140,7 +917,7 @@ namespace nanojit // profiling for the exit verbose_only( if (_logc->lcbits & LC_FragProfile) { - INCLi( &guard->record()->profCount ); + INCLi( int32_t(&guard->record()->profCount) ); } ) @@ -1910,7 +2687,7 @@ namespace nanojit verbose_only( void Assembler::asm_inc_m32(uint32_t* pCtr) { - INCLi(pCtr); + INCLi(int32_t(pCtr)); } ) diff --git a/js/src/nanojit/Nativei386.h b/js/src/nanojit/Nativei386.h index 5bf19769fa30..017b0e7ca6f0 100644 --- a/js/src/nanojit/Nativei386.h +++ b/js/src/nanojit/Nativei386.h @@ -112,7 +112,7 @@ namespace nanojit typedef uint8_t NIns; - // Bytes of icache to flush after Assembler::patch + // Bytes of icache to flush after patch const size_t LARGEST_BRANCH_PATCH = 16 * sizeof(NIns); // These are used as register numbers in various parts of the code @@ -191,801 +191,255 @@ namespace nanojit void asm_cmp(LIns *cond); \ void asm_div_mod(LIns *cond); \ void asm_load(int d, Register r); \ - void asm_immf(Register r, uint64_t q, double d, bool canClobberCCs); - - #define IMM8(i) \ - _nIns -= 1; \ - *((int8_t*)_nIns) = (int8_t)(i) - - #define IMM16(i) \ - _nIns -= 2; \ - *((int16_t*)_nIns) = (int16_t)(i) - -#define IMM32(i) \ - _nIns -= 4; \ - *((int32_t*)_nIns) = (int32_t)(i) - -// XXX rearrange NanoAssert() expression to workaround apparent gcc 4.3 bug: -// XXX "error: logical && with non-zero constant will always evaluate as true" -// underrunProtect(6) is necessary for worst-case -#define MODRMs(r,d,b,l,i) \ - NanoAssert(unsigned(i)<8 && unsigned(b)<8 && unsigned(r)<8); \ - if ((d) == 0 && (b) != EBP) { \ + void asm_immf(Register r, uint64_t q, double d, bool canClobberCCs); \ + void IMM8(int32_t i) { \ + _nIns -= 1; \ + *((int8_t*)_nIns) = (int8_t)(i); \ + }; \ + void IMM16(int32_t i) { \ _nIns -= 2; \ - _nIns[0] = (uint8_t) ( 0<<6 | (r)<<3 | 4); \ - _nIns[1] = (uint8_t) ((l)<<6 | (i)<<3 | (b)); \ - } else if (isS8(d)) { \ - _nIns -= 3; \ - _nIns[0] = (uint8_t) ( 1<<6 | (r)<<3 | 4 ); \ - _nIns[1] = (uint8_t) ( (l)<<6 | (i)<<3 | (b) ); \ - _nIns[2] = (uint8_t) (d); \ - } else { \ - IMM32(d); \ - *(--_nIns) = (uint8_t) ( (l)<<6 | (i)<<3 | (b) ); \ - *(--_nIns) = (uint8_t) ( 2<<6 | (r)<<3 | 4 ); \ - } - -// underrunProtect(6) is necessary for worst-case -#define MODRMm(r,d,b) \ - NanoAssert(unsigned(r)<8 && ((b)==UnspecifiedReg || unsigned(b)<8)); \ - if ((b) == UnspecifiedReg) {\ - IMM32(d);\ - *(--_nIns) = (uint8_t) (0<<6 | (r)<<3 | 5);\ - } else if ((b) == ESP) { \ 
- MODRMs(r, d, b, 0, (Register)4); \ - } \ - else if ( (d) == 0 && (b) != EBP) { \ - *(--_nIns) = (uint8_t) ( 0<<6 | (r)<<3 | (b) ); \ - } else if (isS8(d)) { \ - *(--_nIns) = (uint8_t) (d); \ - *(--_nIns) = (uint8_t) ( 1<<6 | (r)<<3 | (b) ); \ - } else { \ - IMM32(d); \ - *(--_nIns) = (uint8_t) ( 2<<6 | (r)<<3 | (b) ); \ - } - -#define MODRMSIB(reg,base,index,scale,disp) \ - if (disp != 0 || base == EBP) { \ - if (isS8(disp)) { \ - *(--_nIns) = int8_t(disp); \ - } else { \ - IMM32(disp); \ - } \ - } \ - *(--_nIns) = uint8_t((scale)<<6|(index)<<3|(base)); \ - if (disp == 0 && base != EBP) { \ - *(--_nIns) = uint8_t(((reg)<<3)|4); \ - } else { \ - if (isS8(disp)) \ - *(--_nIns) = uint8_t((1<<6)|(reg<<3)|4); \ - else \ - *(--_nIns) = uint8_t((2<<6)|(reg<<3)|4); \ - } - -#define MODRMdm(r,addr) \ - NanoAssert(unsigned(r)<8); \ - IMM32(addr); \ - *(--_nIns) = (uint8_t)( (r)<<3 | 5 ); - -#define MODRM(d,s) \ - NanoAssert(((unsigned)(d))<8 && ((unsigned)(s))<8); \ - *(--_nIns) = (uint8_t) ( 3<<6|(d)<<3|(s) ) - -#define ALU0(o) \ - underrunProtect(1);\ - *(--_nIns) = (uint8_t) (o) - -#define ALUm(c,r,d,b) \ - underrunProtect(8); \ - MODRMm(r,d,b); \ - *(--_nIns) = uint8_t(c) - -#define ALUdm(c,r,addr) \ - underrunProtect(6); \ - MODRMdm(r,addr); \ - *(--_nIns) = uint8_t(c) - -#define ALUsib(c,r,base,index,scale,disp) \ - underrunProtect(7); \ - MODRMSIB(r,base,index,scale,disp); \ - *(--_nIns) = uint8_t(c) - -#define ALUm16(c,r,d,b) \ - underrunProtect(9); \ - MODRMm(r,d,b); \ - *(--_nIns) = uint8_t(c);\ - *(--_nIns) = 0x66 - -#define ALU2dm(c,r,addr) \ - underrunProtect(7); \ - MODRMdm(r,addr); \ - *(--_nIns) = (uint8_t) (c);\ - *(--_nIns) = (uint8_t) ((c)>>8) - -#define ALU2m(c,r,d,b) \ - underrunProtect(9); \ - MODRMm(r,d,b); \ - *(--_nIns) = (uint8_t) (c);\ - *(--_nIns) = (uint8_t) ((c)>>8) - -#define ALU2sib(c,r,base,index,scale,disp) \ - underrunProtect(8); \ - MODRMSIB(r,base,index,scale,disp); \ - *(--_nIns) = (uint8_t) (c); \ - *(--_nIns) = (uint8_t) ((c)>>8) - -#define ALU(c,d,s) \ - underrunProtect(2);\ - MODRM(d,s); \ - *(--_nIns) = (uint8_t) (c) - -#define ALUi(c,r,i) \ - underrunProtect(6); \ - NanoAssert(unsigned(r)<8);\ - if (isS8(i)) { \ - *(--_nIns) = uint8_t(i); \ - MODRM((c>>3),(r)); \ - *(--_nIns) = uint8_t(0x83); \ - } else { \ - IMM32(i); \ - if ( (r) == EAX) { \ - *(--_nIns) = (uint8_t) (c); \ + *((int16_t*)_nIns) = (int16_t)(i); \ + }; \ + void IMM32(int32_t i) { \ + _nIns -= 4; \ + *((int32_t*)_nIns) = (int32_t)(i); \ + }; \ + void MODRMs(int32_t r, int32_t d, Register b, int32_t l, int32_t i); \ + void MODRMm(int32_t r, int32_t d, Register b); \ + void MODRMSIB(Register reg, Register base, int32_t index, int32_t scale, int32_t disp); \ + void MODRMdm(int32_t r, int32_t addr); \ + void MODRM(int32_t d, int32_t s) { \ + NanoAssert((unsigned(d))<8 && (unsigned(s))<8); \ + *(--_nIns) = (uint8_t) ( 3<<6 | d<<3 | s ); \ + }; \ + void ALU0(int32_t o); \ + void ALUm(int32_t c, int32_t r, int32_t d, Register b); \ + void ALUdm(int32_t c, int32_t r, int32_t addr); \ + void ALUsib(int32_t c, Register r, Register base, int32_t index, int32_t scale, int32_t disp); \ + void ALUm16(int32_t c, int32_t r, int32_t d, Register b); \ + void ALU2dm(int32_t c, int32_t r, int32_t addr); \ + void ALU2m(int32_t c, int32_t r, int32_t d, Register b); \ + void ALU2sib(int32_t c, Register r, Register base, int32_t index, int32_t scale, int32_t disp); \ + void ALU(int32_t c, int32_t d, int32_t s) { \ + underrunProtect(2); \ + MODRM(d,s); \ + *(--_nIns) = uint8_t(c); \ + }; \ + void ALUi(int32_t c, 
int32_t r, int32_t i); \ + void ALUmi(int32_t c, int32_t d, Register b, int32_t i); \ + void ALU2(int32_t c, int32_t d, int32_t s); \ + void LAHF(); \ + void SAHF(); \ + void OR(Register l, Register r); \ + void AND(Register l, Register r); \ + void XOR(Register l, Register r); \ + void ADD(Register l, Register r); \ + void SUB(Register l, Register r); \ + void MUL(Register l, Register r); \ + void DIV(Register r); \ + void NOT(Register r); \ + void NEG(Register r); \ + void SHR(Register r, Register s); \ + void SAR(Register r, Register s); \ + void SHL(Register r, Register s); \ + void SHIFT(int32_t c, Register r, int32_t i); \ + void SHLi(Register r, int32_t i); \ + void SHRi(Register r, int32_t i); \ + void SARi(Register r, int32_t i); \ + void MOVZX8(Register d, Register s); \ + void SUBi(Register r, int32_t i); \ + void ADDi(Register r, int32_t i); \ + void ANDi(Register r, int32_t i); \ + void ORi(Register r, int32_t i); \ + void XORi(Register r, int32_t i); \ + void ADDmi(int32_t d, Register b, int32_t i); \ + void TEST(Register d, Register s); \ + void CMP(Register l, Register r); \ + void CMPi(Register r, int32_t i); \ + void MR(Register d, Register s) { \ + count_mov(); \ + ALU(0x8b,d,s); \ + asm_output("mov %s,%s",gpn(d),gpn(s)); \ + }; \ + void LEA(Register r, int32_t d, Register b); \ + void LEAmi4(Register r, int32_t d, int32_t i); \ + void CDQ(); \ + void INCLi(int32_t p); \ + void SETE( Register r); \ + void SETNP(Register r); \ + void SETL( Register r); \ + void SETLE(Register r); \ + void SETG( Register r); \ + void SETGE(Register r); \ + void SETB( Register r); \ + void SETBE(Register r); \ + void SETA( Register r); \ + void SETAE(Register r); \ + void SETO( Register r); \ + void MREQ(Register d, Register s); \ + void MRNE(Register d, Register s); \ + void MRL( Register d, Register s); \ + void MRLE(Register d, Register s); \ + void MRG( Register d, Register s); \ + void MRGE(Register d, Register s); \ + void MRB( Register d, Register s); \ + void MRBE(Register d, Register s); \ + void MRA( Register d, Register s); \ + void MRAE(Register d, Register s); \ + void MRNO(Register d, Register s); \ + void LD(Register reg, int32_t disp, Register base); \ + void LDdm(Register reg, int32_t addr); \ + void LDsib(Register reg, int32_t disp, Register base, int32_t index, int32_t scale); \ + void LD16S(Register r, int32_t d, Register b); \ + void LD16Sdm(Register r, int32_t addr); \ + void LD16Ssib(Register r, int32_t disp, Register base, int32_t index, int32_t scale); \ + void LD16Z(Register r, int32_t d, Register b); \ + void LD16Zdm(Register r, int32_t addr); \ + void LD16Zsib(Register r, int32_t disp, Register base, int32_t index, int32_t scale); \ + void LD8Z(Register r, int32_t d, Register b); \ + void LD8Zdm(Register r, int32_t addr); \ + void LD8Zsib(Register r, int32_t disp, Register base, int32_t ndex, int32_t scale); \ + void LD8S(Register r, int32_t d, Register b); \ + void LD8Sdm(Register r, int32_t addr); \ + void LD8Ssib(Register r, int32_t disp, Register base, int32_t index, int32_t scale); \ + void LDi(Register r, int32_t i); \ + void ST8(Register base, int32_t disp, Register reg); \ + void ST16(Register base, int32_t disp, Register reg); \ + void ST(Register base, int32_t disp, Register reg); \ + void ST8i(Register base, int32_t disp, int32_t imm); \ + void ST16i(Register base, int32_t disp, int32_t imm); \ + void STi(Register base, int32_t disp, int32_t imm); \ + void RET(); \ + void NOP(); \ + void INT3(); \ + void PUSHi(int32_t i); \ + void PUSHi32(int32_t i); \ 
+ void PUSHr(Register r); \ + void PUSHm(int32_t d, Register b); \ + void POPr(Register r); \ + void JCC(int32_t o, NIns* t, const char* n); \ + void JMP_long(NIns* t); \ + void JMP(NIns* t) { \ + count_jmp(); \ + underrunProtect(5); \ + intptr_t tt = (intptr_t)t - (intptr_t)_nIns; \ + if (isS8(tt)) { \ + verbose_only( NIns* next = _nIns; (void)next; ) \ + _nIns -= 2; \ + _nIns[0] = /*JMP8*/0xeb; \ + _nIns[1] = uint8_t(tt & 0xff); \ + asm_output("jmp %p",next+tt); \ } else { \ - MODRM((c>>3),(r)); \ - *(--_nIns) = uint8_t(0x81); \ + JMP_long_nochk_offset(tt); \ } \ - } - -#define ALUmi(c,d,b,i) \ - underrunProtect(10); \ - NanoAssert(((unsigned)b)<8); \ - if (isS8(i)) { \ - *(--_nIns) = uint8_t(i); \ - MODRMm((c>>3),(d),(b)); \ - *(--_nIns) = uint8_t(0x83); \ - } else { \ - IMM32(i); \ - MODRMm((c>>3),(d),(b)); \ - *(--_nIns) = uint8_t(0x81); \ - } - -#define ALU2(c,d,s) \ - underrunProtect(3); \ - MODRM((d),(s)); \ - _nIns -= 2; \ - _nIns[0] = (uint8_t) ( ((c)>>8) ); \ - _nIns[1] = (uint8_t) ( (c) ) - -#define LAHF() do { count_alu(); ALU0(0x9F); asm_output("lahf"); } while(0) -#define SAHF() do { count_alu(); ALU0(0x9E); asm_output("sahf"); } while(0) -#define OR(l,r) do { count_alu(); ALU(0x0b, (l),(r)); asm_output("or %s,%s",gpn(l),gpn(r)); } while(0) -#define AND(l,r) do { count_alu(); ALU(0x23, (l),(r)); asm_output("and %s,%s",gpn(l),gpn(r)); } while(0) -#define XOR(l,r) do { count_alu(); ALU(0x33, (l),(r)); asm_output("xor %s,%s",gpn(l),gpn(r)); } while(0) -#define ADD(l,r) do { count_alu(); ALU(0x03, (l),(r)); asm_output("add %s,%s",gpn(l),gpn(r)); } while(0) -#define SUB(l,r) do { count_alu(); ALU(0x2b, (l),(r)); asm_output("sub %s,%s",gpn(l),gpn(r)); } while(0) -#define MUL(l,r) do { count_alu(); ALU2(0x0faf,(l),(r)); asm_output("mul %s,%s",gpn(l),gpn(r)); } while(0) -#define DIV(r) do { count_alu(); ALU(0xf7, (Register)7,(r)); asm_output("idiv edx:eax, %s",gpn(r)); } while(0) -#define NOT(r) do { count_alu(); ALU(0xf7, (Register)2,(r)); asm_output("not %s",gpn(r)); } while(0) -#define NEG(r) do { count_alu(); ALU(0xf7, (Register)3,(r)); asm_output("neg %s",gpn(r)); } while(0) -#define SHR(r,s) do { count_alu(); ALU(0xd3, (Register)5,(r)); asm_output("shr %s,%s",gpn(r),gpn(s)); } while(0) -#define SAR(r,s) do { count_alu(); ALU(0xd3, (Register)7,(r)); asm_output("sar %s,%s",gpn(r),gpn(s)); } while(0) -#define SHL(r,s) do { count_alu(); ALU(0xd3, (Register)4,(r)); asm_output("shl %s,%s",gpn(r),gpn(s)); } while(0) - -#define SHIFT(c,r,i) \ - underrunProtect(3);\ - *--_nIns = (uint8_t)(i);\ - MODRM((Register)c,r);\ - *--_nIns = 0xc1; - -#define SHLi(r,i) do { count_alu(); SHIFT(4,r,i); asm_output("shl %s,%d", gpn(r),i); } while(0) -#define SHRi(r,i) do { count_alu(); SHIFT(5,r,i); asm_output("shr %s,%d", gpn(r),i); } while(0) -#define SARi(r,i) do { count_alu(); SHIFT(7,r,i); asm_output("sar %s,%d", gpn(r),i); } while(0) - -#define MOVZX8(d,s) do { count_alu(); ALU2(0x0fb6,d,s); asm_output("movzx %s,%s", gpn(d),gpn(s)); } while(0) - -#define SUBi(r,i) do { count_alu(); ALUi(0x2d,r,i); asm_output("sub %s,%d",gpn(r),i); } while(0) -#define ADDi(r,i) do { count_alu(); ALUi(0x05,r,i); asm_output("add %s,%d",gpn(r),i); } while(0) -#define ANDi(r,i) do { count_alu(); ALUi(0x25,r,i); asm_output("and %s,%d",gpn(r),i); } while(0) -#define ORi(r,i) do { count_alu(); ALUi(0x0d,r,i); asm_output("or %s,%d",gpn(r),i); } while(0) -#define XORi(r,i) do { count_alu(); ALUi(0x35,r,i); asm_output("xor %s,%d",gpn(r),i); } while(0) - -#define ADDmi(d,b,i) do { count_alust(); ALUmi(0x05, d, b, i); 
asm_output("add %d(%s), %d", d, gpn(b), i); } while(0) - -#define TEST(d,s) do { count_alu(); ALU(0x85,d,s); asm_output("test %s,%s",gpn(d),gpn(s)); } while(0) -#define CMP(l,r) do { count_alu(); ALU(0x3b, (l),(r)); asm_output("cmp %s,%s",gpn(l),gpn(r)); } while(0) -#define CMPi(r,i) do { count_alu(); ALUi(0x3d,r,i); asm_output("cmp %s,%d",gpn(r),i); } while(0) - -#define MR(d,s) do { count_mov(); ALU(0x8b,d,s); asm_output("mov %s,%s",gpn(d),gpn(s)); } while(0) -#define LEA(r,d,b) do { count_alu(); ALUm(0x8d, r,d,b); asm_output("lea %s,%d(%s)",gpn(r),d,gpn(b)); } while(0) -// lea %r, d(%i*4) -// This addressing mode is not supported by the MODRMSIB macro. -#define LEAmi4(r,d,i) do { count_alu(); IMM32(d); *(--_nIns) = (2<<6)|((uint8_t)i<<3)|5; *(--_nIns) = (0<<6)|((uint8_t)r<<3)|4; *(--_nIns) = 0x8d; asm_output("lea %s, %p(%s*4)", gpn(r), (void*)d, gpn(i)); } while(0) - -#define CDQ() do { SARi(EDX, 31); MR(EDX, EAX); } while(0) - -#define INCLi(p) do { count_alu(); \ - underrunProtect(6); \ - IMM32((uint32_t)(ptrdiff_t)p); *(--_nIns) = 0x05; *(--_nIns) = 0xFF; \ - asm_output("incl (%p)", (void*)p); } while (0) - -#define SETE(r) do { count_alu(); ALU2(0x0f94,(r),(r)); asm_output("sete %s",gpn(r)); } while(0) -#define SETNP(r) do { count_alu(); ALU2(0x0f9B,(r),(r)); asm_output("setnp %s",gpn(r)); } while(0) -#define SETL(r) do { count_alu(); ALU2(0x0f9C,(r),(r)); asm_output("setl %s",gpn(r)); } while(0) -#define SETLE(r) do { count_alu(); ALU2(0x0f9E,(r),(r)); asm_output("setle %s",gpn(r)); } while(0) -#define SETG(r) do { count_alu(); ALU2(0x0f9F,(r),(r)); asm_output("setg %s",gpn(r)); } while(0) -#define SETGE(r) do { count_alu(); ALU2(0x0f9D,(r),(r)); asm_output("setge %s",gpn(r)); } while(0) -#define SETB(r) do { count_alu(); ALU2(0x0f92,(r),(r)); asm_output("setb %s",gpn(r)); } while(0) -#define SETBE(r) do { count_alu(); ALU2(0x0f96,(r),(r)); asm_output("setbe %s",gpn(r)); } while(0) -#define SETA(r) do { count_alu(); ALU2(0x0f97,(r),(r)); asm_output("seta %s",gpn(r)); } while(0) -#define SETAE(r) do { count_alu(); ALU2(0x0f93,(r),(r)); asm_output("setae %s",gpn(r)); } while(0) -#define SETO(r) do { count_alu(); ALU2(0x0f92,(r),(r)); asm_output("seto %s",gpn(r)); } while(0) - -#define MREQ(dr,sr) do { count_alu(); ALU2(0x0f44,dr,sr); asm_output("cmove %s,%s", gpn(dr),gpn(sr)); } while(0) -#define MRNE(dr,sr) do { count_alu(); ALU2(0x0f45,dr,sr); asm_output("cmovne %s,%s", gpn(dr),gpn(sr)); } while(0) -#define MRL(dr,sr) do { count_alu(); ALU2(0x0f4C,dr,sr); asm_output("cmovl %s,%s", gpn(dr),gpn(sr)); } while(0) -#define MRLE(dr,sr) do { count_alu(); ALU2(0x0f4E,dr,sr); asm_output("cmovle %s,%s", gpn(dr),gpn(sr)); } while(0) -#define MRG(dr,sr) do { count_alu(); ALU2(0x0f4F,dr,sr); asm_output("cmovg %s,%s", gpn(dr),gpn(sr)); } while(0) -#define MRGE(dr,sr) do { count_alu(); ALU2(0x0f4D,dr,sr); asm_output("cmovge %s,%s", gpn(dr),gpn(sr)); } while(0) -#define MRB(dr,sr) do { count_alu(); ALU2(0x0f42,dr,sr); asm_output("cmovb %s,%s", gpn(dr),gpn(sr)); } while(0) -#define MRBE(dr,sr) do { count_alu(); ALU2(0x0f46,dr,sr); asm_output("cmovbe %s,%s", gpn(dr),gpn(sr)); } while(0) -#define MRA(dr,sr) do { count_alu(); ALU2(0x0f47,dr,sr); asm_output("cmova %s,%s", gpn(dr),gpn(sr)); } while(0) -#define MRAE(dr,sr) do { count_alu(); ALU2(0x0f43,dr,sr); asm_output("cmovae %s,%s", gpn(dr),gpn(sr)); } while(0) -#define MRNO(dr,sr) do { count_alu(); ALU2(0x0f41,dr,sr); asm_output("cmovno %s,%s", gpn(dr),gpn(sr)); } while(0) - -// these aren't currently used but left in for reference -//#define 
LDEQ(r,d,b) do { ALU2m(0x0f44,r,d,b); asm_output("cmove %s,%d(%s)", gpn(r),d,gpn(b)); } while(0) -//#define LDNEQ(r,d,b) do { ALU2m(0x0f45,r,d,b); asm_output("cmovne %s,%d(%s)", gpn(r),d,gpn(b)); } while(0) - -#define LD(reg,disp,base) do { \ - count_ld();\ - ALUm(0x8b,reg,disp,base); \ - asm_output("mov %s,%d(%s)",gpn(reg),disp,gpn(base)); } while(0) - -#define LDdm(reg,addr) do { \ - count_ld(); \ - ALUdm(0x8b,reg,addr); \ - asm_output("mov %s,0(%lx)",gpn(reg),(unsigned long)addr); \ - } while (0) - - -#define SIBIDX(n) "1248"[n] - -#define LDsib(reg,disp,base,index,scale) do { \ - count_ld(); \ - ALUsib(0x8b,reg,base,index,scale,disp); \ - asm_output("mov %s,%d(%s+%s*%c)",gpn(reg),disp,gpn(base),gpn(index),SIBIDX(scale)); \ - } while (0) - -// note: movzx/movsx are being output with an 8/16 suffix to indicate the size -// being loaded. this doesn't really match standard intel format (though is arguably -// terser and more obvious in this case) and would probably be nice to fix. -// (likewise, the 8/16 bit stores being output as "mov8" and "mov16" respectively.) - -// load 16-bit, sign extend -#define LD16S(r,d,b) do { count_ld(); ALU2m(0x0fbf,r,d,b); asm_output("movsx16 %s,%d(%s)", gpn(r),d,gpn(b)); } while(0) - -#define LD16Sdm(r,addr) do { count_ld(); ALU2dm(0x0fbf,r,addr); asm_output("movsx16 %s,0(%lx)", gpn(r),(unsigned long)addr); } while (0) - -#define LD16Ssib(r,disp,base,index,scale) do { \ - count_ld(); \ - ALU2sib(0x0fbf,r,base,index,scale,disp); \ - asm_output("movsx16 %s,%d(%s+%s*%c)",gpn(r),disp,gpn(base),gpn(index),SIBIDX(scale)); \ - } while (0) - -// load 16-bit, zero extend -#define LD16Z(r,d,b) do { count_ld(); ALU2m(0x0fb7,r,d,b); asm_output("movzx16 %s,%d(%s)", gpn(r),d,gpn(b)); } while(0) - -#define LD16Zdm(r,addr) do { count_ld(); ALU2dm(0x0fb7,r,addr); asm_output("movzx16 %s,0(%lx)", gpn(r),(unsigned long)addr); } while (0) - -#define LD16Zsib(r,disp,base,index,scale) do { \ - count_ld(); \ - ALU2sib(0x0fb7,r,base,index,scale,disp); \ - asm_output("movzx16 %s,%d(%s+%s*%c)",gpn(r),disp,gpn(base),gpn(index),SIBIDX(scale)); \ - } while (0) - -// load 8-bit, zero extend -#define LD8Z(r,d,b) do { count_ld(); ALU2m(0x0fb6,r,d,b); asm_output("movzx8 %s,%d(%s)", gpn(r),d,gpn(b)); } while(0) - -#define LD8Zdm(r,addr) do { \ - count_ld(); \ - ALU2dm(0x0fb6,r,addr); \ - asm_output("movzx8 %s,0(%lx)", gpn(r),(long unsigned)addr); \ - } while(0) - -#define LD8Zsib(r,disp,base,index,scale) do { \ - count_ld(); \ - ALU2sib(0x0fb6,r,base,index,scale,disp); \ - asm_output("movzx8 %s,%d(%s+%s*%c)",gpn(r),disp,gpn(base),gpn(index),SIBIDX(scale)); \ - } while(0) - -// load 8-bit, sign extend -#define LD8S(r,d,b) do { count_ld(); ALU2m(0x0fbe,r,d,b); asm_output("movsx8 %s,%d(%s)", gpn(r),d,gpn(b)); } while(0) - -#define LD8Sdm(r,addr) do { \ - count_ld(); \ - ALU2dm(0x0fbe,r,addr); \ - asm_output("movsx8 %s,0(%lx)", gpn(r),(long unsigned)addr); \ - } while(0) - -#define LD8Ssib(r,disp,base,index,scale) do { \ - count_ld(); \ - ALU2sib(0x0fbe,r,base,index,scale,disp); \ - asm_output("movsx8 %s,%d(%s+%s*%c)",gpn(r),disp,gpn(base),gpn(index),SIBIDX(scale)); \ - } while(0) - -#define LDi(r,i) do { \ - count_ld();\ - underrunProtect(5); \ - IMM32(i); \ - NanoAssert(((unsigned)r)<8); \ - *(--_nIns) = (uint8_t) (0xb8 | (r) ); \ - asm_output("mov %s,%d",gpn(r),i); } while(0) - -// quirk of x86-32: reg must be a/b/c/d for byte stores here -#define ST8(base,disp,reg) do { \ - count_st();\ - NanoAssert(((unsigned)reg)<4); \ - ALUm(0x88,reg,disp,base); \ - asm_output("mov8 
%d(%s),%s",disp,base==UnspecifiedReg?"0":gpn(base),gpn(reg)); } while(0) - -#define ST16(base,disp,reg) do { \ - count_st();\ - ALUm16(0x89,reg,disp,base); \ - asm_output("mov16 %d(%s),%s",disp,base==UnspecifiedReg?"0":gpn(base),gpn(reg)); } while(0) - -#define ST(base,disp,reg) do { \ - count_st();\ - ALUm(0x89,reg,disp,base); \ - asm_output("mov %d(%s),%s",disp,base==UnspecifiedReg?"0":gpn(base),gpn(reg)); } while(0) - -#define ST8i(base,disp,imm) do { \ - count_st();\ - underrunProtect(8); \ - IMM8(imm); \ - MODRMm(0, disp, base); \ - *(--_nIns) = 0xc6; \ - asm_output("mov8 %d(%s),%d",disp,gpn(base),imm); } while(0) - -#define ST16i(base,disp,imm) do { \ - count_st();\ - underrunProtect(10); \ - IMM16(imm); \ - MODRMm(0, disp, base); \ - *(--_nIns) = 0xc7; \ - *(--_nIns) = 0x66; \ - asm_output("mov16 %d(%s),%d",disp,gpn(base),imm); } while(0) - -#define STi(base,disp,imm) do { \ - count_st();\ - underrunProtect(11); \ - IMM32(imm); \ - MODRMm(0, disp, base); \ - *(--_nIns) = 0xc7; \ - asm_output("mov %d(%s),%d",disp,gpn(base),imm); } while(0) - -#define RET() do { count_ret(); ALU0(0xc3); asm_output("ret"); } while(0) -#define NOP() do { count_alu(); ALU0(0x90); asm_output("nop"); } while(0) -#define INT3() do { ALU0(0xcc); asm_output("int3"); } while(0) - -#define PUSHi(i) do { \ - count_push();\ - if (isS8(i)) { \ - underrunProtect(2); \ - _nIns-=2; _nIns[0] = 0x6a; _nIns[1] = (uint8_t)(i); \ - asm_output("push %d",i); \ - } else \ - { PUSHi32(i); } } while(0) - -#define PUSHi32(i) do { \ - count_push();\ - underrunProtect(5); \ - IMM32(i); \ - *(--_nIns) = 0x68; \ - asm_output("push %d",i); } while(0) - -#define PUSHr(r) do { \ - count_push();\ - underrunProtect(1); \ - NanoAssert(((unsigned)r)<8); \ - *(--_nIns) = (uint8_t) ( 0x50 | (r) ); \ - asm_output("push %s",gpn(r)); } while(0) - -#define PUSHm(d,b) do { \ - count_pushld();\ - ALUm(0xff, 6, d, b); \ - asm_output("push %d(%s)",d,gpn(b)); } while(0) - -#define POPr(r) do { \ - count_pop();\ - underrunProtect(1); \ - NanoAssert(((unsigned)r)<8); \ - *(--_nIns) = (uint8_t) ( 0x58 | (r) ); \ - asm_output("pop %s",gpn(r)); } while(0) - -#define JCC32 0x0f -#define JMP8 0xeb -#define JMP32 0xe9 - -#define JCC(o,t,n) do { \ - count_jcc();\ - underrunProtect(6); \ - intptr_t tt = (intptr_t)t - (intptr_t)_nIns; \ - if (isS8(tt)) { \ - verbose_only( NIns* next = _nIns; (void)next; ) \ - _nIns -= 2; \ - _nIns[0] = (uint8_t) ( 0x70 | (o) ); \ - _nIns[1] = (uint8_t) (tt); \ - asm_output("%-5s %p",(n),(next+tt)); \ - } else { \ - verbose_only( NIns* next = _nIns; ) \ - IMM32(tt); \ - _nIns -= 2; \ - _nIns[0] = JCC32; \ - _nIns[1] = (uint8_t) ( 0x80 | (o) ); \ - asm_output("%-5s %p",(n),(next+tt)); \ - } } while(0) - -#define JMP_long(t) do { \ - count_jmp();\ - underrunProtect(5); \ - intptr_t tt = (intptr_t)t - (intptr_t)_nIns; \ - JMP_long_nochk_offset(tt); \ - verbose_only( verbose_outputf("%010lx:", (unsigned long)_nIns); ) \ - } while(0) - -#define JMP(t) do { \ - count_jmp();\ - underrunProtect(5); \ - intptr_t tt = (intptr_t)t - (intptr_t)_nIns; \ - if (isS8(tt)) { \ - verbose_only( NIns* next = _nIns; (void)next; ) \ - _nIns -= 2; \ - _nIns[0] = JMP8; \ - _nIns[1] = (uint8_t) ( (tt)&0xff ); \ - asm_output("jmp %p",(next+tt)); \ - } else { \ - JMP_long_nochk_offset(tt); \ - } } while(0) - -// this should only be used when you can guarantee there is enough room on the page -#define JMP_long_nochk_offset(o) do {\ - verbose_only( NIns* next = _nIns; (void)next; ) \ - IMM32((o)); \ - *(--_nIns) = JMP32; \ - asm_output("jmp 
%p",(next+(o))); } while(0) - -#define JMP_indirect(r) do { \ - underrunProtect(2); \ - MODRMm(4, 0, r); \ - *(--_nIns) = 0xff; \ - asm_output("jmp *(%s)", gpn(r)); } while (0) - -#define JMP_indexed(x, ss, addr) do { \ - underrunProtect(7); \ - IMM32(addr); \ - _nIns -= 3;\ - _nIns[0] = (NIns) 0xff; /* jmp */ \ - _nIns[1] = (NIns) (0<<6 | 4<<3 | 4); /* modrm: base=sib + disp32 */ \ - _nIns[2] = (NIns) ((ss)<<6 | (x)<<3 | 5); /* sib: x<>16)&0xff); \ - _nIns[1] = (uint8_t)(((c)>>8)&0xff); \ - _nIns[2] = (uint8_t)((c)&0xff) - -#define SSEm(c,r,d,b) \ - underrunProtect(9); \ - MODRMm((r),(d),(b)); \ - _nIns -= 3; \ - _nIns[0] = (uint8_t)(((c)>>16)&0xff); \ - _nIns[1] = (uint8_t)(((c)>>8)&0xff); \ - _nIns[2] = (uint8_t)((c)&0xff) - -#define LDSD(r,d,b)do { \ - count_ldq();\ - SSEm(0xf20f10, (r)&7, (d), (b)); \ - asm_output("movsd %s,%d(%s)",gpn(r),(d),gpn(b)); \ - } while(0) - -#define LDSDm(r,addr)do { \ - count_ldq();\ - underrunProtect(8); \ - const double* daddr = addr; \ - IMM32(int32_t(daddr));\ - *(--_nIns) = uint8_t(((r)&7)<<3|5); \ - *(--_nIns) = 0x10;\ - *(--_nIns) = 0x0f;\ - *(--_nIns) = 0xf2;\ - asm_output("movsd %s,(%p) // =%f",gpn(r),(void*)daddr,*daddr); \ - } while(0) - -#define STSD(d,b,r)do { \ - count_stq();\ - SSEm(0xf20f11, (r)&7, (d), (b)); \ - asm_output("movsd %d(%s),%s",(d),gpn(b),gpn(r)); \ - } while(0) - -#define SSE_LDQ(r,d,b)do { \ - count_ldq();\ - SSEm(0xf30f7e, (r)&7, (d), (b)); \ - asm_output("movq %s,%d(%s)",gpn(r),d,gpn(b)); \ - } while(0) - -#define SSE_STQ(d,b,r)do { \ - count_stq();\ - SSEm(0x660fd6, (r)&7, (d), (b)); \ - asm_output("movq %d(%s),%s",(d),gpn(b),gpn(r)); \ - } while(0) - -#define SSE_LDSS(r,d,b)do { \ - count_ld();\ - SSEm(0xf30f10, (r)&7, (d), (b)); \ - asm_output("movss %s,%d(%s)",gpn(r),d,gpn(b)); \ - } while(0) - -#define SSE_STSS(d,b,r)do { \ - count_st();\ - SSEm(0xf30f11, (r)&7, (d), (b)); \ - asm_output("movss %d(%s),%s",(d),gpn(b),gpn(r)); \ - } while(0) - -#define SSE_CVTSI2SD(xr,gr) do{ \ - count_fpu();\ - SSE(0xf20f2a, (xr)&7, (gr)&7); \ - asm_output("cvtsi2sd %s,%s",gpn(xr),gpn(gr)); \ - } while(0) - - #define SSE_CVTSD2SI(gr,xr) do{ \ - count_fpu();\ - SSE(0xf20f2d, (gr)&7, (xr)&7); \ - asm_output("cvtsd2si %s,%s",gpn(gr),gpn(xr)); \ - } while(0) - -#define SSE_CVTSD2SS(xr,gr) do{ \ - count_fpu();\ - SSE(0xf20f5a, (xr)&7, (gr)&7); \ - asm_output("cvtsd2ss %s,%s",gpn(xr),gpn(gr)); \ - } while(0) - -#define SSE_CVTSS2SD(xr,gr) do{ \ - count_fpu();\ - SSE(0xf30f5a, (xr)&7, (gr)&7); \ - asm_output("cvtss2sd %s,%s",gpn(xr),gpn(gr)); \ - } while(0) - -#define CVTDQ2PD(dstr,srcr) do{ \ - count_fpu();\ - SSE(0xf30fe6, (dstr)&7, (srcr)&7); \ - asm_output("cvtdq2pd %s,%s",gpn(dstr),gpn(srcr)); \ - } while(0) - -// move and zero-extend gpreg to xmm reg -#define SSE_MOVD(d,s) do{ \ - count_mov();\ - if (_is_xmm_reg_(s)) { \ - NanoAssert(_is_gp_reg_(d)); \ - SSE(0x660f7e, (s)&7, (d)&7); \ - } else { \ - NanoAssert(_is_gp_reg_(s)); \ - NanoAssert(_is_xmm_reg_(d)); \ - SSE(0x660f6e, (d)&7, (s)&7); \ - } \ - asm_output("movd %s,%s",gpn(d),gpn(s)); \ - } while(0) - -#define SSE_MOVSD(rd,rs) do{ \ - count_mov();\ - NanoAssert(_is_xmm_reg_(rd) && _is_xmm_reg_(rs));\ - SSE(0xf20f10, (rd)&7, (rs)&7); \ - asm_output("movsd %s,%s",gpn(rd),gpn(rs)); \ - } while(0) - -#define SSE_MOVDm(d,b,xrs) do {\ - count_st();\ - NanoAssert(_is_xmm_reg_(xrs) && (_is_gp_reg_(b) || b==FP));\ - SSEm(0x660f7e, (xrs)&7, d, b);\ - asm_output("movd %d(%s),%s", d, gpn(b), gpn(xrs));\ - } while(0) - -#define SSE_ADDSD(rd,rs) do{ \ - count_fpu();\ - 
NanoAssert(_is_xmm_reg_(rd) && _is_xmm_reg_(rs));\ - SSE(0xf20f58, (rd)&7, (rs)&7); \ - asm_output("addsd %s,%s",gpn(rd),gpn(rs)); \ - } while(0) - -#define SSE_ADDSDm(r,addr)do { \ - count_fpuld();\ - underrunProtect(8); \ - NanoAssert(_is_xmm_reg_(r));\ - const double* daddr = addr; \ - IMM32(int32_t(daddr));\ - *(--_nIns) = uint8_t(((r)&7)<<3|5); \ - *(--_nIns) = 0x58;\ - *(--_nIns) = 0x0f;\ - *(--_nIns) = 0xf2;\ - asm_output("addsd %s,%p // =%f",gpn(r),(void*)daddr,*daddr); \ - } while(0) - -#define SSE_SUBSD(rd,rs) do{ \ - count_fpu();\ - NanoAssert(_is_xmm_reg_(rd) && _is_xmm_reg_(rs));\ - SSE(0xf20f5c, (rd)&7, (rs)&7); \ - asm_output("subsd %s,%s",gpn(rd),gpn(rs)); \ - } while(0) -#define SSE_MULSD(rd,rs) do{ \ - count_fpu();\ - NanoAssert(_is_xmm_reg_(rd) && _is_xmm_reg_(rs));\ - SSE(0xf20f59, (rd)&7, (rs)&7); \ - asm_output("mulsd %s,%s",gpn(rd),gpn(rs)); \ - } while(0) -#define SSE_DIVSD(rd,rs) do{ \ - count_fpu();\ - NanoAssert(_is_xmm_reg_(rd) && _is_xmm_reg_(rs));\ - SSE(0xf20f5e, (rd)&7, (rs)&7); \ - asm_output("divsd %s,%s",gpn(rd),gpn(rs)); \ - } while(0) -#define SSE_UCOMISD(rl,rr) do{ \ - count_fpu();\ - NanoAssert(_is_xmm_reg_(rl) && _is_xmm_reg_(rr));\ - SSE(0x660f2e, (rl)&7, (rr)&7); \ - asm_output("ucomisd %s,%s",gpn(rl),gpn(rr)); \ - } while(0) - -#define CVTSI2SDm(xr,d,b) do{ \ - count_fpu();\ - NanoAssert(_is_xmm_reg_(xr) && _is_gp_reg_(b));\ - SSEm(0xf20f2a, (xr)&7, (d), (b)); \ - asm_output("cvtsi2sd %s,%d(%s)",gpn(xr),(d),gpn(b)); \ - } while(0) - -#define SSE_XORPD(r, maskaddr) do {\ - count_fpuld();\ - underrunProtect(8); \ - IMM32(maskaddr);\ - *(--_nIns) = uint8_t(((r)&7)<<3|5); \ - *(--_nIns) = 0x57;\ - *(--_nIns) = 0x0f;\ - *(--_nIns) = 0x66;\ - asm_output("xorpd %s,[%p]",gpn(r),(void*)(maskaddr));\ - } while(0) - -#define SSE_XORPDr(rd,rs) do{ \ - count_fpu();\ - SSE(0x660f57, (rd)&7, (rs)&7); \ - asm_output("xorpd %s,%s",gpn(rd),gpn(rs)); \ - } while(0) - -// floating point unit -#define FPUc(o) \ - underrunProtect(2); \ - *(--_nIns) = ((uint8_t)(o)&0xff); \ - *(--_nIns) = (uint8_t)(((o)>>8)&0xff) - -#define FPU(o,r) \ - underrunProtect(2); \ - *(--_nIns) = uint8_t(((uint8_t)(o)&0xff) | (r&7));\ - *(--_nIns) = (uint8_t)(((o)>>8)&0xff) - -#define FPUm(o,d,b) \ - underrunProtect(7); \ - MODRMm((uint8_t)(o), d, b); \ - *(--_nIns) = (uint8_t)((o)>>8) - -#define FPUdm(o, m) \ - underrunProtect(6); \ - MODRMdm((uint8_t)(o), m); \ - *(--_nIns) = (uint8_t)((o)>>8) - -#define TEST_AH(i) do { \ - count_alu();\ - underrunProtect(3); \ - *(--_nIns) = ((uint8_t)(i)); \ - *(--_nIns) = 0xc4; \ - *(--_nIns) = 0xf6; \ - asm_output("test ah, %d",i); } while(0) - -#define TEST_AX(i) do { \ - count_fpu();\ - underrunProtect(5); \ - *(--_nIns) = (0); \ - *(--_nIns) = ((uint8_t)(i)); \ - *(--_nIns) = ((uint8_t)((i)>>8)); \ - *(--_nIns) = (0); \ - *(--_nIns) = 0xa9; \ - asm_output("test ax, %d",i); } while(0) - -#define FNSTSW_AX() do { count_fpu(); FPUc(0xdfe0); asm_output("fnstsw_ax"); } while(0) -#define FCHS() do { count_fpu(); FPUc(0xd9e0); asm_output("fchs"); } while(0) -#define FLD1() do { count_fpu(); FPUc(0xd9e8); asm_output("fld1"); fpu_push(); } while(0) -#define FLDZ() do { count_fpu(); FPUc(0xd9ee); asm_output("fldz"); fpu_push(); } while(0) -#define FFREE(r) do { count_fpu(); FPU(0xddc0, r); asm_output("ffree %s",gpn(r)); } while(0) -#define FST32(p,d,b) do { count_stq(); FPUm(0xd902|(p), d, b); asm_output("fst%s32 %d(%s)",((p)?"p":""),d,gpn(b)); if (p) fpu_pop(); } while(0) -#define FSTQ(p,d,b) do { count_stq(); FPUm(0xdd02|(p), d, b); asm_output("fst%sq 
%d(%s)",((p)?"p":""),d,gpn(b)); if (p) fpu_pop(); } while(0) -#define FSTPQ(d,b) FSTQ(1,d,b) -#define FCOM(p,d,b) do { count_fpuld(); FPUm(0xdc02|(p), d, b); asm_output("fcom%s %d(%s)",((p)?"p":""),d,gpn(b)); if (p) fpu_pop(); } while(0) -#define FCOMdm(p,m) do { const double* const dm = m; \ - count_fpuld(); FPUdm(0xdc02|(p), dm); asm_output("fcom%s (%p)",((p)?"p":""),(void*)dm); if (p) fpu_pop(); } while(0) -#define FLD32(d,b) do { count_ldq(); FPUm(0xd900, d, b); asm_output("fld32 %d(%s)",d,gpn(b)); fpu_push();} while(0) -#define FLDQ(d,b) do { count_ldq(); FPUm(0xdd00, d, b); asm_output("fldq %d(%s)",d,gpn(b)); fpu_push();} while(0) -#define FLDQdm(m) do { const double* const dm = m; \ - count_ldq(); FPUdm(0xdd00, dm); asm_output("fldq (%p)",(void*)dm); fpu_push();} while(0) -#define FILDQ(d,b) do { count_fpuld(); FPUm(0xdf05, d, b); asm_output("fildq %d(%s)",d,gpn(b)); fpu_push(); } while(0) -#define FILD(d,b) do { count_fpuld(); FPUm(0xdb00, d, b); asm_output("fild %d(%s)",d,gpn(b)); fpu_push(); } while(0) -#define FIST(p,d,b) do { count_fpu(); FPUm(0xdb02|(p), d, b); asm_output("fist%s %d(%s)",((p)?"p":""),d,gpn(b)); if(p) fpu_pop(); } while(0) -#define FADD(d,b) do { count_fpu(); FPUm(0xdc00, d, b); asm_output("fadd %d(%s)",d,gpn(b)); } while(0) -#define FADDdm(m) do { const double* const dm = m; \ - count_ldq(); FPUdm(0xdc00, dm); asm_output("fadd (%p)",(void*)dm); } while(0) -#define FSUB(d,b) do { count_fpu(); FPUm(0xdc04, d, b); asm_output("fsub %d(%s)",d,gpn(b)); } while(0) -#define FSUBR(d,b) do { count_fpu(); FPUm(0xdc05, d, b); asm_output("fsubr %d(%s)",d,gpn(b)); } while(0) -#define FSUBRdm(m) do { const double* const dm = m; \ - count_ldq(); FPUdm(0xdc05, dm); asm_output("fsubr (%p)",(void*)dm); } while(0) -#define FMUL(d,b) do { count_fpu(); FPUm(0xdc01, d, b); asm_output("fmul %d(%s)",d,gpn(b)); } while(0) -#define FMULdm(m) do { const double* const dm = m; \ - count_ldq(); FPUdm(0xdc01, dm); asm_output("fmul (%p)",(void*)dm); } while(0) -#define FDIV(d,b) do { count_fpu(); FPUm(0xdc06, d, b); asm_output("fdiv %d(%s)",d,gpn(b)); } while(0) -#define FDIVR(d,b) do { count_fpu(); FPUm(0xdc07, d, b); asm_output("fdivr %d(%s)",d,gpn(b)); } while(0) -#define FDIVRdm(m) do { const double* const dm = m; \ - count_ldq(); FPUdm(0xdc07, dm); asm_output("fdivr (%p)",(void*)dm); } while(0) -#define FINCSTP() do { count_fpu(); FPUc(0xd9f7); asm_output("fincstp"); } while(0) -#define FSTP(r) do { count_fpu(); FPU(0xddd8, r&7); asm_output("fstp %s",gpn(r)); fpu_pop();} while(0) -#define FCOMP() do { count_fpu(); FPUc(0xD8D9); asm_output("fcomp"); fpu_pop();} while(0) -#define FCOMPP() do { count_fpu(); FPUc(0xDED9); asm_output("fcompp"); fpu_pop();fpu_pop();} while(0) -#define FLDr(r) do { count_ldq(); FPU(0xd9c0,r); asm_output("fld %s",gpn(r)); fpu_push(); } while(0) -#define EMMS() do { count_fpu(); FPUc(0x0f77); asm_output("emms"); } while (0) - -// standard direct call -#define CALL(ci) do { \ - count_call();\ - underrunProtect(5); \ - int offset = (ci->_address) - ((int)_nIns); \ - IMM32( (uint32_t)offset ); \ - *(--_nIns) = 0xE8; \ - verbose_only(asm_output("call %s",(ci->_name));) \ - debug_only(if (ci->returnType()==ARGTYPE_F) fpu_push();)\ -} while (0) - -// indirect call thru register -#define CALLr(ci,r) do { \ - count_calli();\ - underrunProtect(2);\ - ALU(0xff, 2, (r));\ - verbose_only(asm_output("call %s",gpn(r));) \ - debug_only(if (ci->returnType()==ARGTYPE_F) fpu_push();)\ -} while (0) - + }; \ + /* This should only be used when you can guarantee there is enough room 
+        void JMP_long_nochk_offset(int32_t o) { \
+            verbose_only( NIns* next = _nIns; (void)next; ) \
+            IMM32(o); \
+            *(--_nIns) = /*JMP32*/0xe9; \
+            asm_output("jmp %p",next+o); \
+        }; \
+        void JMP_indirect(Register r); \
+        void JMP_indexed(Register x, int32_t ss, NIns** addr); \
+        void JE(NIns* t); \
+        void JNE(NIns* t); \
+        void JP(NIns* t); \
+        void JNP(NIns* t); \
+        void JB(NIns* t); \
+        void JNB(NIns* t); \
+        void JBE(NIns* t); \
+        void JNBE(NIns* t); \
+        void JA(NIns* t); \
+        void JNA(NIns* t); \
+        void JAE(NIns* t); \
+        void JNAE(NIns* t); \
+        void JL(NIns* t); \
+        void JNL(NIns* t); \
+        void JLE(NIns* t); \
+        void JNLE(NIns* t); \
+        void JG(NIns* t); \
+        void JNG(NIns* t); \
+        void JGE(NIns* t); \
+        void JNGE(NIns* t); \
+        void JO(NIns* t); \
+        void JNO(NIns* t); \
+        void SSE(int32_t c, int32_t d, int32_t s); \
+        void SSEm(int32_t c, int32_t r, int32_t d, Register b); \
+        void LDSDm(Register r, const double* addr); \
+        void SSE_LDSD(Register r, int32_t d, Register b); \
+        void SSE_LDQ( Register r, int32_t d, Register b); \
+        void SSE_LDSS(Register r, int32_t d, Register b); \
+        void SSE_STSD(int32_t d, Register b, Register r); \
+        void SSE_STQ( int32_t d, Register b, Register r); \
+        void SSE_STSS(int32_t d, Register b, Register r); \
+        void SSE_CVTSI2SD(Register xr, Register gr); \
+        void SSE_CVTSD2SI(Register gr, Register xr); \
+        void SSE_CVTSD2SS(Register xr, Register gr); \
+        void SSE_CVTSS2SD(Register xr, Register gr); \
+        void SSE_CVTDQ2PD(Register d, Register r); \
+        void SSE_MOVD(Register d, Register s); \
+        void SSE_MOVSD(Register rd, Register rs); \
+        void SSE_MOVDm(Register d, Register b, Register xrs); \
+        void SSE_ADDSD(Register rd, Register rs); \
+        void SSE_ADDSDm(Register r, const double* addr); \
+        void SSE_SUBSD(Register rd, Register rs); \
+        void SSE_MULSD(Register rd, Register rs); \
+        void SSE_DIVSD(Register rd, Register rs); \
+        void SSE_UCOMISD(Register rl, Register rr); \
+        void SSE_CVTSI2SDm(Register xr, Register d, Register b); \
+        void SSE_XORPD(Register r, const uint32_t* maskaddr); \
+        void SSE_XORPDr(Register rd, Register rs); \
+        void FPUc(int32_t o); \
+        void FPU(int32_t o, Register r) { \
+            underrunProtect(2); \
+            *(--_nIns) = uint8_t(((uint8_t)(o)&0xff) | (r&7)); \
+            *(--_nIns) = (uint8_t)((o>>8)&0xff); \
+        }; \
+        void FPUm(int32_t o, int32_t d, Register b); \
+        void FPUdm(int32_t o, const double* const m); \
+        void TEST_AH(int32_t i); \
+        void TEST_AX(int32_t i); \
+        void FNSTSW_AX(); \
+        void FCHS(); \
+        void FLD1(); \
+        void FLDZ(); \
+        void FFREE(Register r); \
+        void FST32(bool p, int32_t d, Register b); \
+        void FSTQ(bool p, int32_t d, Register b); \
+        void FSTPQ(int32_t d, Register b); \
+        void FCOM(bool p, int32_t d, Register b); \
+        void FCOMdm(bool p, const double* dm); \
+        void FLD32(int32_t d, Register b); \
+        void FLDQ(int32_t d, Register b); \
+        void FLDQdm(const double* dm); \
+        void FILDQ(int32_t d, Register b); \
+        void FILD(int32_t d, Register b); \
+        void FIST(bool p, int32_t d, Register b); \
+        void FADD( int32_t d, Register b); \
+        void FSUB( int32_t d, Register b); \
+        void FSUBR(int32_t d, Register b); \
+        void FMUL( int32_t d, Register b); \
+        void FDIV( int32_t d, Register b); \
+        void FDIVR(int32_t d, Register b); \
+        void FADDdm( const double *dm); \
+        void FSUBRdm(const double* dm); \
+        void FMULdm( const double* dm); \
+        void FDIVRdm(const double* dm); \
+        void FINCSTP(); \
+        void FSTP(Register r) { \
+            count_fpu(); \
+            FPU(0xddd8, r); \
+            asm_output("fstp %s",gpn(r)); fpu_pop(); \
+        }; \
+        void FCOMP(); \
+        void FCOMPP(); \
+        void FLDr(Register r); \
+        void EMMS(); \
+        void CALL(const CallInfo* ci); \
+        void CALLr(const CallInfo* ci, Register r); }
+
+
 #endif // __nanojit_Nativei386__
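
For readers skimming the patch, the following standalone sketch (not part of the change; ToyEmitter and TOY_PUSHR are made-up names) illustrates the general pattern applied throughout: a statement-style byte-poking macro becomes an inline member function with a typed, asserted argument, while the bytes it emits stay identical.

    // Illustrative sketch only; ToyEmitter/TOY_PUSHR are hypothetical, not nanojit code.
    #include <cassert>
    #include <cstdint>

    struct ToyEmitter {
        uint8_t  buf[64];
        uint8_t* ins;                                 // like _nIns: code is emitted backwards
        ToyEmitter() : ins(buf + sizeof(buf)) {}

        // New style: an inline member function with an assertion and a typed parameter.
        void PUSHR(int r) {
            assert(unsigned(r) < 8);
            *(--ins) = uint8_t(0x50 | (r & 7));       // single-byte "push r32" encoding
        }
    };

    // Old style: a macro that pokes the same byte directly.
    #define TOY_PUSHR(e, r)  do { *(--(e).ins) = uint8_t(0x50 | ((r) & 7)); } while (0)

    int main() {
        ToyEmitter a, b;
        TOY_PUSHR(a, 3);                              // macro form
        b.PUSHR(3);                                   // function form
        assert(*a.ins == *b.ins && *a.ins == 0x53);   // both emit "push ebx"
        return 0;
    }

The usual payoff of this conversion is type checking on register arguments, a place to put assertions, easier debugging (functions show up in stack traces and can be stepped into), and less preprocessor expansion in every caller.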