merge

2010-08-19 16:46:27 -07:00 · 2010-08-19 16:46:27 -07:00 · eae82f529a
--- a/js/src/lirasm/tests/call1.in
+++ b/js/src/lirasm/tests/call1.in
@ -1,12 +1,12 @@
 ptr = allocp 8
 a = immi 65
-sti a ptr 0
+sti2c a ptr 0
 b = immi 66
-sti b ptr 1
+sti2c b ptr 1
 c = immi 67
-sti c ptr 2
+sti2c c ptr 2
 zero = immi 0
-sti zero ptr 3
+sti2c zero ptr 3
 ss = calli puts cdecl ptr
 nn = gei ss zero
 reti nn
--- a/js/src/lirasm/tests/multfrag1.in
+++ b/js/src/lirasm/tests/multfrag1.in
@ -1,13 +1,13 @@
 .begin a
 ptr = allocp 8
 a = immi 65
-sti a ptr 0
+sti2c a ptr 0
 b = immi 66
-sti b ptr 1
+sti2c b ptr 1
 c = immi 67
-sti c ptr 2
+sti2c c ptr 2
 zero = immi 0
-sti zero ptr 3
+sti2c zero ptr 3
 ss = calli puts cdecl ptr
 nn = gei ss zero
 reti nn
--- a/js/src/nanojit-import-rev
+++ b/js/src/nanojit-import-rev
@ -1 +1 @@
-982cd218ddb049bdbbcdda4fa3a9d7e40e45e0be
+c7009f5cd83ea028b98f59e1f8830a76ba27c1dd
--- a/js/src/nanojit/Assembler.cpp
+++ b/js/src/nanojit/Assembler.cpp
@ -41,7 +41,7 @@

 #ifdef FEATURE_NANOJIT

-#ifdef VTUNE
+#ifdef VMCFG_VTUNE
 #include "../core/CodegenLIR.h"
 #endif

@ -50,6 +50,18 @@
    #pragma warning(disable:4310) // cast truncates constant value
 #endif

+#ifdef VMCFG_VTUNE
+namespace vtune {
+    using namespace nanojit;
+    void vtuneStart(void*, NIns*);
+    void vtuneEnd(void*, NIns*);
+    void vtuneLine(void*, int, NIns*);
+    void vtuneFile(void*, void*);
+}
+using namespace vtune;
+#endif // VMCFG_VTUNE
+
+
 namespace nanojit
 {
    /**
@ -74,8 +86,8 @@ namespace nanojit
    #if PEDANTIC
        , pedanticTop(NULL)
    #endif
-    #ifdef VTUNE
-        , cgen(NULL)
+    #ifdef VMCFG_VTUNE
+        , vtuneHandle(NULL)
    #endif
        , _config(config)
    {
@ -186,10 +198,11 @@ namespace nanojit
    void Assembler::registerResetAll()
    {
        nRegisterResetAll(_allocator);
+        _allocator.managed = _allocator.free;

        // At start, should have some registers free and none active.
        NanoAssert(0 != _allocator.free);
-        NanoAssert(0 == _allocator.countActive());
+        NanoAssert(0 == _allocator.activeMask());
 #ifdef NANOJIT_IA32
        debug_only(_fpuStkDepth = 0; )
 #endif
@ -273,14 +286,6 @@ namespace nanojit
        verbose_only( nBytes += (end - start) * sizeof(NIns); )
        NanoAssert(uintptr_t(end) - uintptr_t(start) >= (size_t)LARGEST_UNDERRUN_PROT);
        eip = end;
-
-        #ifdef VTUNE
-        if (_nIns && _nExitIns) {
-            //cgen->jitAddRecord((uintptr_t)list->code, 0, 0, true); // add placeholder record for top of page
-            cgen->jitCodePosUpdate((uintptr_t)list->code);
-            cgen->jitPushInfo(); // new page requires new entry
-        }
-        #endif
    }

    void Assembler::reset()
@ -360,8 +365,7 @@ namespace nanojit
    void Assembler::registerConsistencyCheck()
    {
        RegisterMask managed = _allocator.managed;
-        for (Register r = FirstReg; r <= LastReg; r = nextreg(r)) {
-            if (rmask(r) & managed) {
+        for (Register r = lsReg(managed); managed; r = nextLsReg(managed, r)) {
            // A register managed by register allocation must be either
            // free or active, but not both.
            if (_allocator.isFree(r)) {
@ -374,9 +378,13 @@ namespace nanojit
                NanoAssert(ins);
                NanoAssertMsg(r == ins->getReg(), "Register record mismatch");
            }
-            } else {
+        }
+
+        RegisterMask not_managed = ~_allocator.managed;
+        for (Register r = lsReg(not_managed); not_managed; r = nextLsReg(not_managed, r)) {
            // A register not managed by register allocation must be
            // neither free nor active.
+            if (r <= LastReg) {
                NanoAssert(!_allocator.isFree(r));
                NanoAssert(!_allocator.getActive(r));
            }
@ -1108,6 +1116,15 @@ namespace nanojit
        // save entry point pointers
        frag->fragEntry = fragEntry;
        frag->setCode(_nIns);
+
+#ifdef VMCFG_VTUNE
+        if (vtuneHandle)
+        {
+            vtuneEnd(vtuneHandle, codeEnd);
+            vtuneStart(vtuneHandle, _nIns);
+        }
+#endif
+
        PERFM_NVPROF("code", CodeAlloc::size(codeList));

 #ifdef NANOJIT_IA32
@ -1120,17 +1137,16 @@ namespace nanojit

    void Assembler::releaseRegisters()
    {
-        for (Register r = FirstReg; r <= LastReg; r = nextreg(r))
+        RegisterMask active = _allocator.activeMask();
+        for (Register r = lsReg(active); active; r = nextLsReg(active, r))
        {
            LIns *ins = _allocator.getActive(r);
-            if (ins) {
            // Clear reg allocation, preserve stack allocation.
            _allocator.retire(r);
            NanoAssert(r == ins->getReg());
            ins->clearReg();
        }
    }
-    }

 #ifdef PERFM
 #define countlir_live() _nvprof("lir-live",1)
@ -1731,7 +1747,7 @@ namespace nanojit
                    // Out of range indices aren't allowed or checked.
                    // Code after this jtbl instruction is unreachable.
                    releaseRegisters();
-                    NanoAssert(_allocator.countActive() == 0);
+                    NanoAssert(_allocator.activeMask() == 0);

                    uint32_t count = ins->getTableSize();
                    bool has_back_edges = false;
@ -1756,7 +1772,7 @@ namespace nanojit
                    // to reconcile registers.  So, frontends *must* insert LIR_regfence at labels of
                    // forward jtbl jumps.  Check here to make sure no registers were picked up from
                    // any forward edges.
-                    NanoAssert(_allocator.countActive() == 0);
+                    NanoAssert(_allocator.activeMask() == 0);

                    if (has_back_edges) {
                        handleLoopCarriedExprs(pending_lives);
@ -1928,27 +1944,28 @@ namespace nanojit
                    asm_call(ins);
                    break;

-                #ifdef VTUNE
+                #ifdef VMCFG_VTUNE
                case LIR_file: {
                     // we traverse backwards so we are now hitting the file
                     // that is associated with a bunch of LIR_lines we already have seen
-                    ins->oprnd1()->setResultLive();
-                    uintptr_t currentFile = ins->oprnd1()->immI();
-                    cgen->jitFilenameUpdate(currentFile);
+                    if (vtuneHandle) {
+                        void * currentFile = (void *) ins->oprnd1()->immI();
+                        vtuneFile(vtuneHandle, currentFile);
+                    }
                    break;
                }
-
                case LIR_line: {
                     // add a new table entry, we don't yet knwo which file it belongs
                     // to so we need to add it to the update table too
                     // note the alloc, actual act is delayed; see above
-                    ins->oprnd1()->setResultLive();
+                    if (vtuneHandle) {
                        uint32_t currentLine = (uint32_t) ins->oprnd1()->immI();
-                    cgen->jitLineNumUpdate(currentLine);
-                    cgen->jitAddRecord((uintptr_t)_nIns, 0, currentLine, true);
+                        vtuneLine(vtuneHandle, currentLine, _nIns);
+                    }
                    break;
                }
-                #endif // VTUNE
+               #endif // VMCFG_VTUNE
+
            }

 #ifdef NJ_VERBOSE
@ -1968,10 +1985,6 @@ namespace nanojit
            if (error())
                return;

-        #ifdef VTUNE
-            cgen->jitCodePosUpdate((uintptr_t)_nIns);
-        #endif
-
            // check that all is well (don't check in exit paths since its more complicated)
            debug_only( pageValidate(); )
            debug_only( resourceConsistencyCheck();  )
@ -2073,9 +2086,9 @@ namespace nanojit
        VMPI_sprintf(s, "RR");
        s += VMPI_strlen(s);

-        for (Register r = FirstReg; r <= LastReg; r = nextreg(r)) {
+        RegisterMask active = _allocator.activeMask();
+        for (Register r = lsReg(active); active != 0; r = nextLsReg(active, r)) {
            LIns *ins = _allocator.getActive(r);
-            if (ins) {
            NanoAssertMsg(!_allocator.isFree(r),
                          "Coding error; register is both free and active! " );
            RefBuf b;
@ -2091,7 +2104,6 @@ namespace nanojit
            VMPI_sprintf(s, " %s(%s)", gpn(r), n);
            s += VMPI_strlen(s);
        }
-        }
        output();
    }

@ -2236,10 +2248,9 @@ namespace nanojit
        Register tosave[LastReg-FirstReg+1];
        int len=0;
        RegAlloc *regs = &_allocator;
-        for (Register r = FirstReg; r <= LastReg; r = nextreg(r)) {
-            if (rmask(r) & GpRegs & ~ignore) {
+        RegisterMask evict_set = regs->activeMask() & GpRegs & ~ignore;
+        for (Register r = lsReg(evict_set); evict_set; r = nextLsReg(evict_set, r)) {
            LIns *ins = regs->getActive(r);
-                if (ins) {
            if (canRemat(ins)) {
                NanoAssert(ins->getReg() == r);
                evict(ins);
@ -2256,8 +2267,6 @@ namespace nanojit
                tosave[j] = r;
            }
        }
-            }
-        }

        // Now primap has the live exprs in priority order.
        // Allocate each of the top priority exprs to a SavedReg.
@ -2297,24 +2306,12 @@ namespace nanojit
        evictSomeActiveRegs(~(SavedRegs | ignore));
    }

-    void Assembler::evictAllActiveRegs()
-    {
-        // generate code to restore callee saved registers
-        // @todo speed this up
-        for (Register r = FirstReg; r <= LastReg; r = nextreg(r)) {
-            evictIfActive(r);
-        }
-    }
-
+    // Generate code to restore any registers in 'regs' that are currently active.
    void Assembler::evictSomeActiveRegs(RegisterMask regs)
    {
-        // generate code to restore callee saved registers
-        // @todo speed this up
-        for (Register r = FirstReg; r <= LastReg; r = nextreg(r)) {
-            if ((rmask(r) & regs)) {
-                evictIfActive(r);
-            }
-        }
+        RegisterMask evict_set = regs & _allocator.activeMask();
+        for (Register r = lsReg(evict_set); evict_set; r = nextLsReg(evict_set, r))
+            evict(_allocator.getActive(r));
    }

    /**
@ -2337,19 +2334,13 @@ namespace nanojit
        // Do evictions and pops first.
        verbose_only(bool shouldMention=false; )
        // The obvious thing to do here is to iterate from FirstReg to LastReg.
-        // viz: for (Register r = FirstReg; r <= LastReg; r = nextreg(r)) ...
        // However, on ARM that causes lower-numbered integer registers
        // to be be saved at higher addresses, which inhibits the formation
        // of load/store multiple instructions.  Hence iterate the loop the
-        // other way.  The "r <= LastReg" guards against wraparound in
-        // the case where Register is treated as unsigned and FirstReg is zero.
-        //
-        // Note, the loop var is deliberately typed as int (*not* Register)
-        // to outsmart compilers that will otherwise report
-        // "error: comparison is always true due to limited range of data type".
-        for (int ri = LastReg; ri >= FirstReg && ri <= LastReg; ri = int(prevreg(Register(ri))))
+        // other way.
+        RegisterMask reg_set = _allocator.activeMask() | saved.activeMask();
+        for (Register r = msReg(reg_set); reg_set; r = nextMsReg(reg_set, r))
        {
-            Register const r = Register(ri);
            LIns* curins = _allocator.getActive(r);
            LIns* savedins = saved.getActive(r);
            if (curins != savedins)
@ -2403,7 +2394,8 @@ namespace nanojit

        // Do evictions and pops first.
        verbose_only(bool shouldMention=false; )
-        for (Register r = FirstReg; r <= LastReg; r = nextreg(r))
+        RegisterMask reg_set = _allocator.activeMask() | saved.activeMask();
+        for (Register r = lsReg(reg_set); reg_set; r = nextLsReg(reg_set, r))
        {
            LIns* curins = _allocator.getActive(r);
            LIns* savedins = saved.getActive(r);
@ -2453,17 +2445,16 @@ namespace nanojit
        NanoAssert(allow);
        LIns *ins, *vic = 0;
        int allow_pri = 0x7fffffff;
-        for (Register r = FirstReg; r <= LastReg; r = nextreg(r))
-        {
-            if ((allow & rmask(r)) && (ins = _allocator.getActive(r)) != 0)
+        RegisterMask vic_set = allow & _allocator.activeMask();
+        for (Register r = lsReg(vic_set); vic_set; r = nextLsReg(vic_set, r))
        {
+            ins = _allocator.getActive(r);
            int pri = canRemat(ins) ? 0 : _allocator.getPriority(r);
            if (!vic || pri < allow_pri) {
                vic = ins;
                allow_pri = pri;
            }
        }
-        }
        NanoAssert(vic != 0);
        return vic;
    }
--- a/js/src/nanojit/Assembler.h
+++ b/js/src/nanojit/Assembler.h
@ -196,7 +196,7 @@ namespace nanojit
    typedef HashMap<uint64_t, uint64_t*> ImmDPoolMap;
 #endif

-#ifdef VTUNE
+#ifdef VMCFG_VTUNE
    class avmplus::CodegenLIR;
 #endif

@ -271,8 +271,8 @@ namespace nanojit
            #endif // NJ_VERBOSE

        public:
-            #ifdef VTUNE
-            avmplus::CodegenLIR *cgen;
+            #ifdef VMCFG_VTUNE
+            void* vtuneHandle;
            #endif

            Assembler(CodeAlloc& codeAlloc, Allocator& dataAlloc, Allocator& alloc, AvmCore* core, LogControl* logc, const Config& config);
@ -315,7 +315,11 @@ namespace nanojit
            Register    registerAlloc(LIns* ins, RegisterMask allow, RegisterMask prefer);
            Register    registerAllocTmp(RegisterMask allow);
            void        registerResetAll();
-            void        evictAllActiveRegs();
+            void        evictAllActiveRegs() {
+                // The evicted set will be intersected with activeMask(),
+                // so use an all-1s mask to avoid an extra load or call.
+                evictSomeActiveRegs(~RegisterMask(0));
+            }
            void        evictSomeActiveRegs(RegisterMask regs);
            void        evictScratchRegsExcept(RegisterMask ignore);
            void        intersectRegisterState(RegAlloc& saved);
--- a/js/src/nanojit/CodeAlloc.cpp
+++ b/js/src/nanojit/CodeAlloc.cpp
@ -47,7 +47,11 @@
 namespace nanojit
 {
    static const bool verbose = false;
-#if defined(NANOJIT_ARM)
+#ifdef VMCFG_VTUNE
+    // The VTune JIT profiling API can't handle non-contiguous methods,
+    // so make each allocation huge to keep every method contiguous.
+    static const int pagesPerAlloc = 128; // 1MB
+#elif defined(NANOJIT_ARM)
    // ARM requires single-page allocations, due to the constant pool that
    // lives on each page that must be reachable by a 4kb pcrel load.
    static const int pagesPerAlloc = 1;
--- a/js/src/nanojit/LIR.cpp
+++ b/js/src/nanojit/LIR.cpp
@ -1976,13 +1976,16 @@ namespace nanojit
        m_capNL[LIns3]     = 16;
        m_capNL[LInsCall]  = 64;

-        for (NLKind nlkind = LInsFirst; nlkind <= LInsLast; nlkind = nextNLKind(nlkind))
+        for (NLKind nlkind = LInsFirst; nlkind <= LInsLast; nlkind = nextNLKind(nlkind)) {
            m_listNL[nlkind] = new (alloc) LIns*[m_capNL[nlkind]];
+            m_usedNL[nlkind] = 1; // Force memset in clearAll().
+        }

        // Note that this allocates the CONST and MULTIPLE tables as well.
        for (CseAcc a = 0; a < CSE_NUM_USED_ACCS; a++) {
            m_capL[a] = 16;
            m_listL[a] = new (alloc) LIns*[m_capL[a]];
+            m_usedL[a] = 1; // Force memset(0) in first clearAll().
        }

        clearAll();
@ -2484,7 +2487,7 @@ namespace nanojit
                // this function.  
                AccSet a = storesSinceLastLoad & ((1 << EMB_NUM_USED_ACCS) - 1);
                while (a) {
-                    int acc = msbSet(a);
+                    int acc = msbSet32(a);
                    clearL((CseAcc)acc);
                    a &= ~(1 << acc);
                }
@ -3038,7 +3041,7 @@ namespace nanojit

        case LIR_file:
        case LIR_line:
-            // XXX: not sure about these ones.  Ignore for the moment.
+            // These will never get hit since VTUNE implies !DEBUG.  Ignore for the moment.
            nArgs = 0;
            break;

--- a/js/src/nanojit/LIR.h
+++ b/js/src/nanojit/LIR.h
@ -289,32 +289,9 @@ namespace nanojit
    struct MiniAccSet { MiniAccSetVal val; };
    static const MiniAccSet MINI_ACCSET_MULTIPLE = { 99 };

-#if defined(_WIN32) && (_MSC_VER >= 1300) && (defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
-    extern "C" unsigned char _BitScanReverse(unsigned long * Index, unsigned long Mask);
-    # pragma intrinsic(_BitScanReverse)
-
-    // Returns the index of the most significant bit that is set.
-    static int msbSet(uint32_t x) {
-        unsigned long idx;
-        _BitScanReverse(&idx, (unsigned long)(x | 1)); // the '| 1' ensures a 0 result when x==0
-        return idx;
-    }
-#elif (__GNUC__ >= 4) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)
-    static int msbSet(uint32_t x) {
-        return 31 - __builtin_clz(x | 1);
-    }
-#else
-    static int msbSet(uint32_t x) {     // slow fallback version
-        for (int i = 31; i >= 0; i--)
-            if ((1 << i) & x) 
-                return i;
-        return 0;
-    }
-#endif
-
    static MiniAccSet compressAccSet(AccSet accSet) {
        if (isSingletonAccSet(accSet)) {
-            MiniAccSet ret = { uint8_t(msbSet(accSet)) };
+            MiniAccSet ret = { uint8_t(msbSet32(accSet)) };
            return ret;
        }

@ -1143,8 +1120,12 @@ namespace nanojit
        // Nb: the types of these bitfields are all 32-bit integers to ensure
        // they are fully packed on Windows, sigh.  Also, 'loadQual' is
        // unsigned to ensure the values 0, 1, and 2 all fit in 2 bits.
-        int32_t     disp:16;
-        int32_t     miniAccSetVal:8;
+        //
+        // Nb: the explicit 'signed' keyword on these bitfield types is required;
+        // without it, some compilers may treat the bitfields as unsigned.
+        // See Bugzilla 584219 comment #18
+        signed int  disp:16;
+        signed int  miniAccSetVal:8;
        uint32_t    loadQual:2;

        LIns*       oprnd_1;
--- a/js/src/nanojit/Native.h
+++ b/js/src/nanojit/Native.h
@ -99,15 +99,6 @@

 namespace nanojit {

-    inline Register nextreg(Register r) {
-        return Register(r+1);
-    }
-
-    inline Register prevreg(Register r) {
-        return Register(r-1);
-    }
-
-
    class Fragment;
    struct SideExit;
    struct SwitchInfo;
@ -153,7 +144,7 @@ namespace nanojit {
            if (_logc->lcbits & LC_Native) { \
                outline[0]='\0'; \
                VMPI_sprintf(outline, "%p   ", _nIns);  \
-                sprintf(&outline[13], ##__VA_ARGS__); \
+                VMPI_sprintf(outline+VMPI_strlen(outline), ##__VA_ARGS__);   \
                output();                               \
            } \
        } while (0) /* no semi */
--- a/js/src/nanojit/NativeARM.cpp
+++ b/js/src/nanojit/NativeARM.cpp
@ -43,7 +43,6 @@

 #ifdef UNDER_CE
 #include <cmnintrin.h>
-extern "C" bool blx_lr_broken();
 #endif

 #if defined(FEATURE_NANOJIT) && defined(NANOJIT_ARM)
@ -114,13 +113,14 @@ Assembler::CountLeadingZeroes(uint32_t data)
    // ARMCC can do this with an intrinsic.
    leading_zeroes = __clz(data);

-// current Android GCC compiler incorrectly refuses to compile 'clz' for armv5
-// (even though this is a legal instruction there). Since we currently only compile for ARMv5
-// for emulation, we don't care too much (but we DO care for ARMv6+ since those are "real"
-// devices).
-#elif defined(__GNUC__) && !(defined(ANDROID) && __ARM_ARCH__ <= 5)
+#elif defined(__GNUC__) && (NJ_COMPILER_ARM_ARCH >= 5)
    // GCC can use inline assembler to insert a CLZ instruction.
    __asm (
+#if defined(ANDROID) && (NJ_COMPILER_ARM_ARCH < 7)
+    // The Android GCC toolchain rejects the clz instruction when targeting
+    // anything below ARMv7, even though it is legal on ARMv5 and later.
+        "   .arch armv7-a\n"
+#endif
        "   clz     %0, %1  \n"
        :   "=r"    (leading_zeroes)
        :   "r"     (data)
@ -463,11 +463,6 @@ Assembler::asm_eor_imm(Register rd, Register rn, int32_t imm, int stat /* =0 */)
 void
 Assembler::nInit(AvmCore*)
 {
-#ifdef UNDER_CE
-    blx_lr_bug = blx_lr_broken();
-#else
-    blx_lr_bug = 0;
-#endif
    nHints[LIR_calli]  = rmask(retRegs[0]);
    nHints[LIR_hcalli] = rmask(retRegs[1]);
    nHints[LIR_paramp] = PREFER_SPECIAL;
@ -628,7 +623,7 @@ Assembler::asm_arg(ArgType ty, LIns* arg, Register& r, int& stkd)
        // pre-assign registers R0-R3 for arguments (if they fit)
        if (r < R4) {
            asm_regarg(ty, arg, r);
-            r = nextreg(r);
+            r = Register(r + 1);
        } else {
            asm_stkarg(arg, stkd);
            stkd += 4;
@ -662,14 +657,14 @@ Assembler::asm_arg_64(LIns* arg, Register& r, int& stkd)
    // R3 if r is R3 to start with, and will force the argument to go on
    // the stack.
    if ((r == R1) || (r == R3)) {
-        r = nextreg(r);
+        r = Register(r + 1);
    }
 #endif

    if (r < R3) {
        Register    ra = r;
-        Register    rb = nextreg(r);
-        r = nextreg(rb);
+        Register    rb = Register(r + 1);
+        r = Register(rb + 1);

 #ifdef NJ_ARM_EABI
        // EABI requires that 64-bit arguments are aligned on even-numbered
@ -692,12 +687,8 @@ Assembler::asm_arg_64(LIns* arg, Register& r, int& stkd)
        // We only have one register left, but the legacy ABI requires that we
        // put 32 bits of the argument in the register (R3) and the remaining
        // 32 bits on the stack.
-        Register    ra = r;
-        r = nextreg(r);
-
-        // This really just checks that nextreg() works properly, as we know
-        // that r was previously R3.
-        NanoAssert(r == R4);
+        Register    ra = r; // R3
+        r = R4;

        // We're splitting the argument between registers and the stack.  This
        // must be the first time that the stack is used, so stkd must be at 0.
@ -912,26 +903,17 @@ Assembler::asm_call(LIns* ins)
            outputf("        %p:", _nIns);
        )

-        // Direct call: on v5 and above (where the calling sequence doesn't
-        // corrupt LR until the actual branch instruction), we can avoid an
-        // interlock in the "long" branch sequence by manually loading the
-        // target address into LR ourselves before setting up the parameters
-        // in other registers.
        BranchWithLink((NIns*)ci->_address);
    } else {
-        // Indirect call: we assign the address arg to LR since it's not
-        // used for regular arguments, and is otherwise scratch since it's
-        // clobberred by the call. On v4/v4T, where we have to manually do
-        // the equivalent of a BLX, move LR into IP before corrupting LR
-        // with the return address.
-        if (blx_lr_bug) {
+        // Indirect call: we assign the address arg to LR
+#ifdef UNDER_CE
        // workaround for msft device emulator bug (blx lr emulated as no-op)
        underrunProtect(8);
        BLX(IP);
        MOV(IP, LR);
-        } else {
+#else
        BLX(LR);
-        }
+#endif
        asm_regarg(ARGTYPE_I, ins->arg(--argc), LR);
    }

@ -981,8 +963,6 @@ Assembler::nRegisterResetAll(RegAlloc& a)
        rmask(R10) | rmask(LR);
    if (_config.arm_vfp)
        a.free |= FpRegs;
-
-    debug_only(a.managed = a.free);
 }

 static inline ConditionCode
@ -1925,17 +1905,19 @@ Assembler::BLX(Register addr, bool chk /* = true */)
    NanoAssert(_config.arm_arch >= 5);

    NanoAssert(IsGpReg(addr));
+#ifdef UNDER_CE
    // There is a bug in the WinCE device emulator which stops "BLX LR" from
    // working as expected. Assert that we never do that!
-    if (blx_lr_bug) { NanoAssert(addr != LR); }
+    NanoAssert(addr != LR);
+#endif

    if (chk) {
        underrunProtect(4);
    }

-    // BLX IP
+    // BLX reg
    *(--_nIns) = (NIns)( (COND_AL) | (0x12<<20) | (0xFFF<<8) | (0x3<<4) | (addr) );
-    asm_output("blx ip");
+    asm_output("blx %s", gpn(addr));
 }

 // Emit the code required to load a memory address into a register as follows:
@ -2777,14 +2759,13 @@ Assembler::asm_cmov(LIns* ins)

    Register rf = findRegFor(iffalse, allow & ~rmask(rr));

+    // If 'iftrue' isn't in a register, it can be clobbered by 'ins'.
+    Register rt = iftrue->isInReg() ? iftrue->getReg() : rr;
+
    if (ins->isop(LIR_cmovd)) {
        NIns* target = _nIns;
        asm_nongp_copy(rr, rf);
        asm_branch(false, condval, target);
-
-        // If 'iftrue' isn't in a register, it can be clobbered by 'ins'.
-        Register rt = iftrue->isInReg() ? iftrue->getReg() : rr;
-
        if (rr != rt)
            asm_nongp_copy(rr, rt);
        freeResourcesOf(ins);
@ -2795,9 +2776,6 @@ Assembler::asm_cmov(LIns* ins)
        return;
    }

-    // If 'iftrue' isn't in a register, it can be clobbered by 'ins'.
-    Register rt = iftrue->isInReg() ? iftrue->getReg() : rr;
-
    // WARNING: We cannot generate any code that affects the condition
    // codes between the MRcc generation here and the asm_cmp() call
    // below.  See asm_cmp() for more details.
--- a/js/src/nanojit/NativeARM.h
+++ b/js/src/nanojit/NativeARM.h
@ -240,7 +240,6 @@ verbose_only( extern const char* shiftNames[]; )
    inline uint32_t CountLeadingZeroes(uint32_t data);                          \
    int *       _nSlot;                                                         \
    int *       _nExitSlot;                                                     \
-    bool        blx_lr_bug;                                                     \
    int         max_out_args; /* bytes */

 #define IMM32(imm)  *(--_nIns) = (NIns)((imm));
--- a/js/src/nanojit/NativeMIPS.cpp
+++ b/js/src/nanojit/NativeMIPS.cpp
@ -481,8 +481,8 @@ namespace nanojit
        // where we are
        if (stkd & 4) {
            if (stkd < 16) {
-                r = nextreg(r);
-                fr = nextreg(fr);
+                r = Register(r + 1);
+                fr = Register(fr + 1);
            }
            stkd += 4;
        }
@ -496,11 +496,11 @@ namespace nanojit
                // Move it to the integer pair
                Register fpupair = arg->getReg();
                Register intpair = fr;
-                MFC1(mswregpair(intpair), nextreg(fpupair));       // Odd fpu register contains sign,expt,manthi
+                MFC1(mswregpair(intpair), Register(fpupair + 1));  // Odd fpu register contains sign,expt,manthi
                MFC1(lswregpair(intpair), fpupair);                // Even fpu register contains mantlo
            }
-            r = nextreg(nextreg(r));
-            fr = nextreg(nextreg(fr));
+            r = Register(r + 2);
+            fr = Register(fr + 2);
        }
        else
            asm_stkarg(arg, stkd);
@ -1578,8 +1578,8 @@ namespace nanojit
            NanoAssert(ty == ARGTYPE_I || ty == ARGTYPE_UI);
            if (stkd < 16) {
                asm_regarg(ty, arg, r);
-                fr = nextreg(fr);
-                r = nextreg(r);
+                fr = Register(fr + 1);
+                r = Register(r + 1);
            }
            else
                asm_stkarg(arg, stkd);
@ -1684,7 +1684,6 @@ namespace nanojit
        regs.free = GpRegs;
        if (cpu_has_fpu)
            regs.free |= FpRegs;
-        debug_only(regs.managed = regs.free;)
    }

 #define signextend16(s) ((int32_t(s)<<16)>>16)
--- a/js/src/nanojit/NativePPC.cpp
+++ b/js/src/nanojit/NativePPC.cpp
@ -736,7 +736,7 @@ namespace nanojit
                // GP arg
                if (r <= R10) {
                    asm_regarg(ty, arg, r);
-                    r = nextreg(r);
+                    r = Register(r + 1);
                    param_size += sizeof(void*);
                } else {
                    // put arg on stack
@ -746,11 +746,11 @@ namespace nanojit
                // double
                if (fr <= F13) {
                    asm_regarg(ty, arg, fr);
-                    fr = nextreg(fr);
+                    fr = Register(fr + 1);
                #ifdef NANOJIT_64BIT
-                    r = nextreg(r);
+                    r = Register(r + 1);
                #else
-                    r = nextreg(nextreg(r)); // skip 2 gpr's
+                    r = Register(r + 2); // skip 2 gpr's
                #endif
                    param_size += sizeof(double);
                } else {
@ -1040,11 +1040,11 @@ namespace nanojit
        }
    }

-    void Assembler::asm_dasq(LIns *ins) {
+    void Assembler::asm_dasq(LIns*) {
        TODO(asm_dasq);
    }

-    void Assembler::asm_qasd(LIns *ins) {
+    void Assembler::asm_qasd(LIns*) {
        TODO(asm_qasd);
    }

@ -1390,7 +1390,6 @@ namespace nanojit
    void Assembler::nRegisterResetAll(RegAlloc &regs) {
        regs.clear();
        regs.free = SavedRegs | 0x1ff8 /* R3-12 */ | 0x3ffe00000000LL /* F1-13 */;
-        debug_only(regs.managed = regs.free);
    }

 #ifdef NANOJIT_64BIT
--- a/js/src/nanojit/NativeSparc.cpp
+++ b/js/src/nanojit/NativeSparc.cpp
@ -234,7 +234,6 @@ namespace nanojit
    {
        a.clear();
        a.free = GpRegs | FpRegs;
-        debug_only( a.managed = a.free; )
    }

    void Assembler::nPatchBranch(NIns* branch, NIns* location)
@ -537,7 +536,7 @@ namespace nanojit
        return at;
    }

-    NIns* Assembler::asm_branch_ov(LOpcode, NIns* targ)
+    NIns* Assembler::asm_branch_ov(LOpcode op, NIns* targ)
    {
        NIns* at = 0;
        underrunProtect(32);
@ -552,6 +551,9 @@ namespace nanojit
        }
        NOP();

+        if( op == LIR_mulxovi || op == LIR_muljovi )
+            BNE(0, tt);
+        else
            BVS(0, tt);
        return at;
    }
@ -645,7 +647,7 @@ namespace nanojit

        Register rb = deprecated_UnknownReg;
        RegisterMask allow = GpRegs;
-        bool forceReg = (op == LIR_muli || op == LIR_mulxovi || !rhs->isImmI());
+        bool forceReg = (op == LIR_muli || op == LIR_mulxovi || op == LIR_muljovi || !rhs->isImmI());

        if (lhs != rhs && forceReg)
            {
@ -679,8 +681,14 @@ namespace nanojit
                    ADDCC(rr, rb, rr);
                else if (op == LIR_subi || op == LIR_subxovi)
                    SUBCC(rr, rb, rr);
-                else if (op == LIR_muli || op == LIR_mulxovi)
-                    MULX(rr, rb, rr);
+                else if (op == LIR_muli)
+                    SMULCC(rr, rb, rr);
+                else if (op == LIR_mulxovi || op == LIR_muljovi) {
+                    SUBCC(L4, L6, L4);
+                    SRAI(rr, 31, L6);
+                    RDY(L4);
+                    SMULCC(rr, rb, rr);
+                }
                else if (op == LIR_andi)
                    AND(rr, rb, rr);
                else if (op == LIR_ori)
--- a/js/src/nanojit/NativeSparc.h
+++ b/js/src/nanojit/NativeSparc.h
@ -737,10 +737,10 @@ namespace nanojit
    asm_output("movvs %d, %s", simm11, gpn(rd)); \
    } while (0)

-#define MULX(rs1, rs2, rd) \
+#define SMULCC(rs1, rs2, rd) \
    do { \
-    Format_3_1(2, rd, 0x9, rs1, 0, rs2); \
-    asm_output("mul %s, %s, %s", gpn(rs1), gpn(rs2), gpn(rd)); \
+    Format_3_1(2, rd, 0x1b, rs1, 0, rs2); \
+    asm_output("smulcc %s, %s, %s", gpn(rs1), gpn(rs2), gpn(rd)); \
    } while (0)

 #define NOP() \
@ -773,6 +773,12 @@ namespace nanojit
    asm_output("andcc %s, %s, %s", gpn(rs1), gpn(rs2), gpn(rd)); \
    } while (0)

+#define RDY(rd) \
+    do { \
+    Format_3_1(2, rd, 0x28, 0, 0, 0); \
+    asm_output("rdy %s", gpn(rd)); \
+    } while (0)
+
 #define RESTORE(rs1, rs2, rd) \
    do { \
    Format_3_1(2, rd, 0x3D, rs1, 0, rs2); \
@ -809,6 +815,12 @@ namespace nanojit
    asm_output("sra %s, %s, %s", gpn(rs1), gpn(rs2), gpn(rd)); \
    } while (0)

+#define SRAI(rs1, shcnt32, rd) \
+    do { \
+    Format_3_6(2, rd, 0x27, rs1, shcnt32); \
+    asm_output("sra %s, %d, %s", gpn(rs1), shcnt32, gpn(rd)); \
+    } while (0)
+
 #define SRL(rs1, rs2, rd) \
    do { \
    Format_3_5(2, rd, 0x26, rs1, 0, rs2); \
--- a/js/src/nanojit/NativeX64.cpp
+++ b/js/src/nanojit/NativeX64.cpp
@ -966,7 +966,7 @@ namespace nanojit
            else if (ty == ARGTYPE_D && fr < XMM8) {
                // double goes in next available XMM register
                asm_regarg(ty, arg, fr);
-                fr = nextreg(fr);
+                fr = Register(fr + 1);
            }
        #endif
            else {
@ -1119,14 +1119,13 @@ namespace nanojit

        Register rf = findRegFor(iffalse, allow & ~rmask(rr));

+        // If 'iftrue' isn't in a register, it can be clobbered by 'ins'.
+        Register rt = iftrue->isInReg() ? iftrue->getReg() : rr;
+
        if (ins->isop(LIR_cmovd)) {
            NIns* target = _nIns;
            asm_nongp_copy(rr, rf);
            asm_branch(false, cond, target);
-
-            // If 'iftrue' isn't in a register, it can be clobbered by 'ins'.
-            Register rt = iftrue->isInReg() ? iftrue->getReg() : rr;
-
            if (rr != rt)
                asm_nongp_copy(rr, rt);
            freeResourcesOf(ins);
@ -1137,9 +1136,6 @@ namespace nanojit
            return;
        }

-        // If 'iftrue' isn't in a register, it can be clobbered by 'ins'.
-        Register rt = iftrue->isInReg() ? iftrue->getReg() : rr;
-
        // WARNING: We cannot generate any code that affects the condition
        // codes between the MRcc generation here and the asm_cmp() call
        // below.  See asm_cmp() for more details.
@ -1905,7 +1901,6 @@ namespace nanojit
 #else
        a.free = 0xffffffff & ~(1<<RSP | 1<<RBP);
 #endif
-        debug_only( a.managed = a.free; )
    }

    void Assembler::nPatchBranch(NIns *patch, NIns *target) {
--- a/js/src/nanojit/Nativei386.cpp
+++ b/js/src/nanojit/Nativei386.cpp
@ -1112,7 +1112,6 @@ namespace nanojit
        a.free = SavedRegs | ScratchRegs;
        if (!_config.i386_sse2)
            a.free &= ~XmmRegs;
-        debug_only( a.managed = a.free; )
    }

    void Assembler::nPatchBranch(NIns* branch, NIns* targ)
@ -2059,14 +2058,13 @@ namespace nanojit

        Register rf = findRegFor(iffalse, allow & ~rmask(rr));

+        // If 'iftrue' isn't in a register, it can be clobbered by 'ins'.
+        Register rt = iftrue->isInReg() ? iftrue->getReg() : rr;
+
        if (ins->isop(LIR_cmovd)) {
            NIns* target = _nIns;
            asm_nongp_copy(rr, rf);
            asm_branch(false, condval, target);
-
-            // If 'iftrue' isn't in a register, it can be clobbered by 'ins'.
-            Register rt = iftrue->isInReg() ? iftrue->getReg() : rr;
-
            if (rr != rt)
                asm_nongp_copy(rr, rt);
            freeResourcesOf(ins);
@ -2077,9 +2075,6 @@ namespace nanojit
            return;
        }

-        // If 'iftrue' isn't in a register, it can be clobbered by 'ins'.
-        Register rt = iftrue->isInReg() ? iftrue->getReg() : rr;
-
        NanoAssert(ins->isop(LIR_cmovi));

        // WARNING: We cannot generate any code that affects the condition
--- a/js/src/nanojit/RegAlloc.cpp
+++ b/js/src/nanojit/RegAlloc.cpp
@ -45,14 +45,6 @@ namespace nanojit

    #ifdef _DEBUG

-    uint32_t RegAlloc::countActive()
-    {
-        int cnt = 0;
-        for(Register i=FirstReg; i <= LastReg; i = nextreg(i))
-            cnt += active[i] ? 1 : 0;
-        return cnt;
-    }
-
    bool RegAlloc::isConsistent(Register r, LIns* i) const
    {
        NanoAssert(r != deprecated_UnknownReg);
--- a/js/src/nanojit/RegAlloc.h
+++ b/js/src/nanojit/RegAlloc.h
@ -120,9 +120,13 @@ namespace nanojit
            return active[r];
        }

-        debug_only( uint32_t    countActive(); )
+        // Return a mask containing the active registers.  For each register
+        // in this set, getActive(register) will be a nonzero LIns pointer.
+        RegisterMask activeMask() const {
+            return ~free & managed;
+        }
+
        debug_only( bool        isConsistent(Register r, LIns* v) const; )
-        debug_only( RegisterMask managed; )     // the registers managed by the register allocator

        // Some basics:
        //
@ -171,10 +175,41 @@ namespace nanojit
        //
        LIns*           active[LastReg + 1];    // active[r] = LIns that defines r
        int32_t         usepri[LastReg + 1];    // used priority. lower = more likely to spill.
-        RegisterMask    free;
+        RegisterMask    free;       // Registers currently free.
+        RegisterMask    managed;    // Registers under management (invariant).
        int32_t         priority;

        DECLARE_PLATFORM_REGALLOC()
    };
+
+    // Return the lowest numbered Register in mask.
+    inline Register lsReg(RegisterMask mask) {
+        // This is faster than it looks; we rely on the C++ optimizer
+        // to strip the dead branch and inline just one alternative.
+        if (sizeof(RegisterMask) == 4)
+            return (Register) lsbSet32(mask);
+        else
+            return (Register) lsbSet64(mask);
+    }
+
+    // Return the highest numbered Register in mask.
+    inline Register msReg(RegisterMask mask) {
+        // This is faster than it looks; we rely on the C++ optimizer
+        // to strip the dead branch and inline just one alternative.
+        if (sizeof(RegisterMask) == 4)
+            return (Register) msbSet32(mask);
+        else
+            return (Register) msbSet64(mask);
+    }
+
+    // Clear bit r in mask, then return lsReg(mask).
+    inline Register nextLsReg(RegisterMask& mask, Register r) {
+        return lsReg(mask &= ~rmask(r));
+    }
+
+    // Clear bit r in mask, then return msReg(mask).
+    inline Register nextMsReg(RegisterMask& mask, Register r) {
+        return msReg(mask &= ~rmask(r));
+    }
 }
 #endif // __nanojit_RegAlloc__
--- a/js/src/nanojit/avmplus.cpp
+++ b/js/src/nanojit/avmplus.cpp
@ -41,13 +41,6 @@
    typedef void *maddr_ptr;
 #endif

-#if defined(AVMPLUS_ARM) && defined(UNDER_CE)
-extern "C" bool
-blx_lr_broken() {
-    return false;
-}
-#endif
-
 using namespace avmplus;

 nanojit::Config AvmCore::config;
--- a/js/src/nanojit/nanojit.h
+++ b/js/src/nanojit/nanojit.h
@ -189,6 +189,121 @@ static inline bool isU32(uintptr_t i) {
 #define alignTo(x,s)        ((((uintptr_t)(x)))&~(((uintptr_t)s)-1))
 #define alignUp(x,s)        ((((uintptr_t)(x))+(((uintptr_t)s)-1))&~(((uintptr_t)s)-1))

+namespace nanojit
+{
+// Define msbSet32(), lsbSet32(), msbSet64(), and lsbSet64() functions using
+// fast find-first-bit intrinsics when available.
+// The fall-back implementations use iteration.
+#if defined(_WIN32) && (_MSC_VER >= 1300) && (defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
+
+    extern "C" unsigned char _BitScanForward(unsigned long * Index, unsigned long Mask);
+    extern "C" unsigned char _BitScanReverse(unsigned long * Index, unsigned long Mask);
+    # pragma intrinsic(_BitScanForward)
+    # pragma intrinsic(_BitScanReverse)
+
+    // Returns the index of the most significant bit that is set.
+    static inline int msbSet32(uint32_t x) {
+        unsigned long idx;
+        _BitScanReverse(&idx, (unsigned long)(x | 1)); // the '| 1' ensures a 0 result when x==0
+        return idx;
+    }
+
+    // Returns the index of the least significant bit that is set.
+    static inline int lsbSet32(uint32_t x) {
+        unsigned long idx;
+        _BitScanForward(&idx, (unsigned long)(x | 0x80000000)); // the '| 0x80000000' ensures a result of 31 when x==0
+        return idx;
+    }
+
+#if defined(_M_AMD64) || defined(_M_X64)
+    extern "C" unsigned char _BitScanForward64(unsigned long * Index, unsigned __int64 Mask);
+    extern "C" unsigned char _BitScanReverse64(unsigned long * Index, unsigned __int64 Mask);
+    # pragma intrinsic(_BitScanForward64)
+    # pragma intrinsic(_BitScanReverse64)
+
+    // Returns the index of the most significant bit that is set.
+    static inline int msbSet64(uint64_t x) {
+        unsigned long idx;
+        _BitScanReverse64(&idx, (unsigned __int64)(x | 1)); // the '| 1' ensures a 0 result when x==0
+        return idx;
+    }
+
+    // Returns the index of the least significant bit that is set.
+    static inline int lsbSet64(uint64_t x) {
+        unsigned long idx;
+        _BitScanForward64(&idx, (unsigned __int64)(x | 0x8000000000000000LL)); // the '| 0x8000000000000000' ensures a result of 63 when x==0
+        return idx;
+    }
+#else
+    // Returns the index of the most significant bit that is set, built from two 32-bit scans (0 when x==0).
+    static int msbSet64(uint64_t x) {
+        return (x & 0xffffffff00000000LL) ? msbSet32(uint32_t(x >> 32)) + 32 : msbSet32(uint32_t(x));
+    }
+    // Returns the index of the least significant bit that is set, built from two 32-bit scans (63 when x==0).
+    static int lsbSet64(uint64_t x) {
+        return (x & 0x00000000ffffffffLL) ? lsbSet32(uint32_t(x)) : lsbSet32(uint32_t(x >> 32)) + 32;
+    }
+#endif
+
+#elif (__GNUC__ >= 4) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)
+
+    // Returns the index of the most significant bit that is set; the '| 1' keeps the argument nonzero, giving 0 when x==0.
+    static inline int msbSet32(uint32_t x) {
+        return 31 - __builtin_clz(x | 1);
+    }
+
+    // Returns the index of the least significant bit that is set; the '| 0x80000000' keeps the argument nonzero, giving 31 when x==0.
+    static inline int lsbSet32(uint32_t x) {
+        return __builtin_ctz(x | 0x80000000);
+    }
+
+    // Returns the index of the most significant bit that is set; the '| 1' keeps the argument nonzero, giving 0 when x==0.
+    static inline int msbSet64(uint64_t x) {
+        return 63 - __builtin_clzll(x | 1);
+    }
+
+    // Returns the index of the least significant bit that is set; the OR with bit 63 keeps the argument nonzero, giving 63 when x==0.
+    static inline int lsbSet64(uint64_t x) {
+        return __builtin_ctzll(x | 0x8000000000000000LL);
+    }
+
+#else
+
+    // Slow fall-back: scan from bit 31 downwards for the most significant set bit; returns 0 when x==0.
+    static inline int msbSet32(uint32_t x) {
+        for (int i = 31; i >= 0; i--)
+            if ((uint32_t(1) << i) & x)    // unsigned operand: never shifts into a sign bit
+                return i;
+        return 0;
+    }
+
+    // Slow fall-back: scan from bit 0 upwards for the least significant set bit; returns 31 when x==0.
+    static inline int lsbSet32(uint32_t x) {
+        for (int i = 0; i < 32; i++)
+            if ((uint32_t(1) << i) & x)    // unsigned operand: never shifts into a sign bit
+                return i;
+        return 31;
+    }
+
+    // Slow fall-back: scan from bit 63 downwards for the most significant set bit; returns 0 when x==0.
+    static inline int msbSet64(uint64_t x) {
+        for (int i = 63; i >= 0; i--)
+            if ((uint64_t(1) << i) & x)    // unsigned operand: never shifts into a sign bit
+                return i;
+        return 0;
+    }
+
+    // Slow fall-back: scan from bit 0 upwards for the least significant set bit; returns 63 when x==0.
+    static inline int lsbSet64(uint64_t x) {
+        for (int i = 0; i < 64; i++)
+            if ((uint64_t(1) << i) & x)    // unsigned operand: never shifts into a sign bit
+                return i;
+        return 63;
+    }
+
+#endif // select compiler
+} // namespace nanojit
+
 // -------------------------------------------------------------------
 // START debug-logging definitions
 // -------------------------------------------------------------------