Nicholas Nethercote 2010-11-10 15:52:26 -08:00
Parents 2f2dfb62c3 8910109b16
Commit 39e1d582a2
9 changed files with 292 additions and 160 deletions

View file

@@ -1 +1 @@
f348fd5b02118c7151d991f51d76abe69976952e
04d7771f3f85877cf12395ffecfc4f2f6d4a0b50

View file: Assembler.cpp

@@ -70,8 +70,7 @@ namespace nanojit
* - merging paths ( build a graph? ), possibly use external rep to drive codegen
*/
Assembler::Assembler(CodeAlloc& codeAlloc, Allocator& dataAlloc, Allocator& alloc, AvmCore* core, LogControl* logc, const Config& config)
: codeList(NULL)
, alloc(alloc)
: alloc(alloc)
, _codeAlloc(codeAlloc)
, _dataAlloc(dataAlloc)
, _thisfrag(NULL)
@@ -82,6 +81,7 @@ namespace nanojit
#if NJ_USES_IMMD_POOL
, _immDPool(alloc)
#endif
, codeList(NULL)
, _epilogue(NULL)
, _err(None)
#if PEDANTIC
@@ -1125,6 +1125,7 @@ namespace nanojit
_codeAlloc.free(exitStart, exitEnd);
_codeAlloc.free(codeStart, codeEnd);
codeList = NULL;
_codeAlloc.markAllExec(); // expensive but safe; we mark all code pages R-X
}
void Assembler::endAssembly(Fragment* frag)
@@ -1162,6 +1163,9 @@ namespace nanojit
verbose_only( codeBytes -= (_nIns - codeStart) * sizeof(NIns); )
#endif
// note: the code pages are no longer writable from this point onwards
_codeAlloc.markExec(codeList);
// at this point all our new code is in the d-cache and not the i-cache,
// so flush the i-cache on CPUs that need it.
CodeAlloc::flushICache(codeList);
@@ -1488,13 +1492,10 @@ namespace nanojit
{
size_t delta = (uintptr_t)priorIns - (uintptr_t)_nIns; // # bytes that have been emitted since last go-around
if (codeList) {
codeList = codeList;
}
// if there is no codeList then we know priorIns and _nIns are on the same page; otherwise make sure priorIns was not in the previous code block
if (!codeList || !codeList->isInBlock(priorIns)) {
NanoAssert(delta < VMPI_getVMPageSize()); // sanity check
nopInsertTrigger -= delta;
nopInsertTrigger -= (int32_t) delta;
if (nopInsertTrigger < 0)
{
nopInsertTrigger = noiseForNopInsertion(_noise);
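A note on the two calls above: endAssembly() flips the finished chunk list to R-X via markExec() and then flushes the i-cache, while the cleanup path uses the blanket markAllExec() (expensive but safe). As a minimal sketch of what such page-permission and cache hooks boil down to, assuming a POSIX target and GCC/Clang builtins (the real work happens behind nanojit's embedder SPI, and makeExecutable/flushInstructionCache are hypothetical names):

#include <sys/mman.h>
#include <cstddef>

// Flip a page-aligned chunk to read+execute once codegen is done; after
// this the chunk is no longer writable, matching the comment above.
static bool makeExecutable(void* chunk, size_t bytes) {
    return mprotect(chunk, bytes, PROT_READ | PROT_EXEC) == 0;
}

// Freshly generated code sits in the d-cache; publish it to the i-cache
// before jumping into the fragment (a no-op on x86, needed on ARM/SPARC).
static void flushInstructionCache(char* start, char* end) {
    __builtin___clear_cache(start, end);
}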

View file: Assembler.h

@@ -313,8 +313,6 @@ namespace nanojit
debug_only( void resourceConsistencyCheck(); )
debug_only( void registerConsistencyCheck(); )
CodeList* codeList; // finished blocks of code.
private:
void gen(LirFilter* toCompile);
NIns* genPrologue();
@@ -401,6 +399,7 @@ namespace nanojit
// temporarily swap all the code/exit variables below (using
// swapCodeChunks()). Afterwards we swap them all back and set
// _inExit to false again.
CodeList* codeList; // finished blocks of code.
bool _inExit, vpad2[3];
NIns *codeStart, *codeEnd; // current normal code chunk
NIns *exitStart, *exitEnd; // current exit code chunk

View file: CodeAlloc.cpp

@@ -128,28 +128,20 @@ namespace nanojit
}
void CodeAlloc::alloc(NIns* &start, NIns* &end) {
// Reuse a block if possible.
if (availblocks) {
markBlockWrite(availblocks);
CodeList* b = removeBlock(availblocks);
b->isFree = false;
start = b->start();
end = b->end;
if (verbose)
avmplus::AvmLog("alloc %p-%p %d\n", start, end, int(end-start));
return;
if (!availblocks) {
// no free mem, get more
addMem();
}
// no suitable block found, get more memory
void *mem = allocCodeChunk(bytesPerAlloc); // allocations never fail
totalAllocated += bytesPerAlloc;
NanoAssert(mem != NULL); // see allocCodeChunk contract in CodeAlloc.h
_nvprof("alloc page", uintptr_t(mem)>>12);
CodeList* b = addMem(mem, bytesPerAlloc);
// grab a block
markBlockWrite(availblocks);
CodeList* b = removeBlock(availblocks);
b->isFree = false;
start = b->start();
end = b->end;
if (verbose)
avmplus::AvmLog("alloc %p-%p %d\n", start, end, int(end-start));
avmplus::AvmLog("CodeAlloc(%p).alloc %p-%p %d\n", this, start, end, int(end-start));
debug_only(sanity_check();)
}
void CodeAlloc::free(NIns* start, NIns *end) {
@@ -349,11 +341,16 @@ extern "C" void sync_instruction_memory(caddr_t v, u_int len);
blocks = b;
}
CodeList* CodeAlloc::addMem(void *mem, size_t bytes) {
void CodeAlloc::addMem() {
void *mem = allocCodeChunk(bytesPerAlloc); // allocations never fail
totalAllocated += bytesPerAlloc;
NanoAssert(mem != NULL); // see allocCodeChunk contract in CodeAlloc.h
_nvprof("alloc page", uintptr_t(mem)>>12);
CodeList* b = (CodeList*)mem;
b->lower = 0;
b->end = (NIns*) (uintptr_t(mem) + bytes - sizeofMinBlock);
b->next = 0;
b->end = (NIns*) (uintptr_t(mem) + bytesPerAlloc - sizeofMinBlock);
b->isFree = true;
// create a tiny terminator block, add to fragmented list, this way
@@ -370,7 +367,8 @@ extern "C" void sync_instruction_memory(caddr_t v, u_int len);
// add terminator to heapblocks list so we can track whole blocks
terminator->next = heapblocks;
heapblocks = terminator;
return b;
addBlock(availblocks, b); // add to free list
}
CodeList* CodeAlloc::getBlock(NIns* start, NIns* end) {
@@ -509,6 +507,15 @@ extern "C" void sync_instruction_memory(caddr_t v, u_int len);
}
#endif
// Loop through a list of blocks marking the chunks executable. If we encounter
// multiple blocks in the same chunk, only the first block will cause the
// chunk to become executable; the other calls will no-op (isExec flag checked)
void CodeAlloc::markExec(CodeList* &blocks) {
for (CodeList *b = blocks; b != 0; b = b->next) {
markChunkExec(b->terminator);
}
}
// Variant of markExec(CodeList*) that walks all heapblocks (i.e. chunks) marking
// each one executable. On systems where bytesPerAlloc is low (i.e. the list
// has many elements) this can be expensive.
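The shape of the alloc()/addMem() refactor above: addMem() now both obtains a fresh chunk and pushes its single free block onto availblocks, so alloc() has exactly one block-grabbing path. A self-contained sketch of that control flow, with malloc and 4096 as stand-ins for allocCodeChunk and bytesPerAlloc (the real code also sets up terminator blocks, permissions, and coalescing metadata):

#include <cstdlib>

struct Block { Block* next; };

static Block* availblocks = nullptr;   // free list, refilled one chunk at a time

static void refillFromNewChunk() {     // plays the role of CodeAlloc::addMem()
    // stand-in for allocCodeChunk, whose contract is to never return NULL
    Block* b = static_cast<Block*>(std::malloc(4096));
    b->next = availblocks;             // the whole chunk becomes one free block
    availblocks = b;
}

static Block* allocBlock() {           // plays the role of CodeAlloc::alloc()
    if (!availblocks)
        refillFromNewChunk();          // slow path: get more memory
    Block* b = availblocks;            // common path: plain free-list pop
    availblocks = b->next;
    return b;
}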

View file: CodeAlloc.h

@@ -43,9 +43,9 @@
namespace nanojit
{
/**
* CodeList is a linked list of non-contiguous blocks of code. Clients use CodeList*
* to point to a list, and each CodeList instance tracks a single contiguous
* block of code.
* CodeList is a single block of code. The next field is used to
* form linked lists of non-contiguous blocks of code. Clients use CodeList*
* to point to the first block in a list.
*/
class CodeList
{
@@ -95,13 +95,24 @@ namespace nanojit
};
/**
* Code memory allocator.
* Long lived manager for many code blocks,
* Code memory allocator is a long lived manager for many code blocks that
* manages interaction with an underlying code memory allocator,
* setting page permissions, api's for allocating and freeing
* sets page permissions. CodeAlloc provides APIs for allocating and freeing
* individual blocks of code memory (for methods, stubs, or compiled
* traces), and also static functions for managing lists of allocated
* code.
* traces), static functions for managing lists of allocated code, and has
* a few pure virtual methods that embedders must implement to provide
* memory to the allocator.
*
* A "chunk" is a region of memory obtained from allocCodeChunk; it must
* be page aligned and be a multiple of the system page size.
*
* A "block" is a region of memory within a chunk. It can be arbitrarily
* sized and aligned, but is always contained within a single chunk.
* class CodeList represents one block; the members of CodeList track the
* extent of the block and support creating lists of blocks.
*
* The allocator coalesces free blocks when it can, in free(), but never
* coalesces chunks.
*/
class CodeAlloc
{
@@ -133,7 +144,7 @@ namespace nanojit
static CodeList* getBlock(NIns* start, NIns* end);
/** add raw memory to the free list */
CodeList* addMem(void* mem, size_t bytes);
void addMem();
/** make sure all the higher/lower pointers are correct for every block */
void sanity_check();
@@ -142,9 +153,9 @@ namespace nanojit
CodeList* firstBlock(CodeList* term);
//
// CodeAlloc's SPI. Implementations must be defined by nanojit embedder.
// allocation failures should cause an exception or longjmp; nanojit
// intentionally does not check for null.
// CodeAlloc's SPI (Service Provider Interface). Implementations must be
// defined by nanojit embedder. Allocation failures should cause an exception
// or longjmp; nanojit intentionally does not check for null.
//
/** allocate nbytes of memory to hold code. Never return null! */
@@ -203,9 +214,12 @@ namespace nanojit
/** print out stats about heap usage */
void logStats();
/** protect all code in this code alloc */
/** protect all code managed by this CodeAlloc */
void markAllExec();
/** protect all mem in the block list */
void markExec(CodeList* &blocks);
/** protect an entire chunk */
void markChunkExec(CodeList* term);
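To make the chunk/block terminology in the comment above concrete, here is a toy model of the containment invariant (fields simplified from the real CodeList, which also carries isFree/isExec bits and higher/lower neighbour links):

#include <cstdint>
#include <cstddef>

struct BlockSketch {
    BlockSketch* next;    // links non-contiguous blocks into a client list
    uintptr_t    start;   // first byte of the block
    uintptr_t    end;     // one past the last byte
};

// A block never spans chunks, so tests during free() and coalescing only
// ever compare against the bounds of a single chunk.
static bool inChunk(const BlockSketch& b, uintptr_t chunkBase, size_t chunkBytes) {
    return chunkBase <= b.start && b.end <= chunkBase + chunkBytes;
}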

View file: LIR.cpp

@@ -167,7 +167,6 @@ namespace nanojit
// clear the stats, etc
_unused = 0;
_limit = 0;
_bytesAllocated = 0;
_stats.lir = 0;
for (int i = 0; i < NumSavedRegs; ++i)
savedRegs[i] = NULL;
@@ -186,11 +185,6 @@ namespace nanojit
return _stats.lir;
}
size_t LirBuffer::byteCount()
{
return _bytesAllocated - (_limit - _unused);
}
// Allocate a new page, and write the first instruction to it -- a skip
// linking to last instruction of the previous page.
void LirBuffer::moveToNewChunk(uintptr_t addrOfLastLInsOnCurrentChunk)
@@ -2070,23 +2064,25 @@ namespace nanojit
suspended(false)
{
m_findNL[LInsImmI] = &CseFilter::findImmI;
m_findNL[LInsImmQ] = PTR_SIZE(NULL, &CseFilter::findImmQ);
m_findNL[LInsImmD] = &CseFilter::findImmD;
m_findNL[LIns1] = &CseFilter::find1;
m_findNL[LIns2] = &CseFilter::find2;
m_findNL[LIns3] = &CseFilter::find3;
m_findNL[LInsCall] = &CseFilter::findCall;
m_findNL[NLImmISmall] = &CseFilter::findImmISmall;
m_findNL[NLImmILarge] = &CseFilter::findImmILarge;
m_findNL[NLImmQ] = PTR_SIZE(NULL, &CseFilter::findImmQ);
m_findNL[NLImmD] = &CseFilter::findImmD;
m_findNL[NL1] = &CseFilter::find1;
m_findNL[NL2] = &CseFilter::find2;
m_findNL[NL3] = &CseFilter::find3;
m_findNL[NLCall] = &CseFilter::findCall;
m_capNL[LInsImmI] = 128;
m_capNL[LInsImmQ] = PTR_SIZE(0, 16);
m_capNL[LInsImmD] = 16;
m_capNL[LIns1] = 256;
m_capNL[LIns2] = 512;
m_capNL[LIns3] = 16;
m_capNL[LInsCall] = 64;
m_capNL[NLImmISmall] = 17; // covers 0..16, which is over half the cases for TraceMonkey
m_capNL[NLImmILarge] = 64;
m_capNL[NLImmQ] = PTR_SIZE(0, 16);
m_capNL[NLImmD] = 16;
m_capNL[NL1] = 256;
m_capNL[NL2] = 512;
m_capNL[NL3] = 16;
m_capNL[NLCall] = 64;
for (NLKind nlkind = LInsFirst; nlkind <= LInsLast; nlkind = nextNLKind(nlkind)) {
for (NLKind nlkind = NLFirst; nlkind <= NLLast; nlkind = nextNLKind(nlkind)) {
m_listNL[nlkind] = new (alloc) LIns*[m_capNL[nlkind]];
m_usedNL[nlkind] = 1; // Force memset in clearAll().
}
@@ -2162,7 +2158,7 @@ namespace nanojit
}
void CseFilter::clearAll() {
for (NLKind nlkind = LInsFirst; nlkind <= LInsLast; nlkind = nextNLKind(nlkind))
for (NLKind nlkind = NLFirst; nlkind <= NLLast; nlkind = nextNLKind(nlkind))
clearNL(nlkind);
// Note that this clears the CONST and MULTIPLE load tables as well.
@@ -2216,6 +2212,7 @@ namespace nanojit
void CseFilter::growNL(NLKind nlkind)
{
NanoAssert(nlkind != NLImmISmall);
const uint32_t oldcap = m_capNL[nlkind];
m_capNL[nlkind] <<= 1;
LIns** oldlist = m_listNL[nlkind];
@@ -2248,6 +2245,16 @@ namespace nanojit
}
}
void CseFilter::addNLImmISmall(LIns* ins, uint32_t k)
{
if (suspended) return;
NLKind nlkind = NLImmISmall;
NanoAssert(k < m_capNL[nlkind]);
NanoAssert(!m_listNL[nlkind][k]);
m_usedNL[nlkind]++;
m_listNL[nlkind][k] = ins;
}
void CseFilter::addNL(NLKind nlkind, LIns* ins, uint32_t k)
{
if (suspended) return;
@@ -2271,9 +2278,26 @@ namespace nanojit
}
}
inline LIns* CseFilter::findImmI(int32_t a, uint32_t &k)
inline LIns* CseFilter::findImmISmall(int32_t a, uint32_t &k)
{
NLKind nlkind = LInsImmI;
// This one is a direct array lookup rather than a hashtable lookup.
NLKind nlkind = NLImmISmall;
k = a;
LIns* ins = m_listNL[nlkind][k];
NanoAssert(!ins || ins->isImmI(a));
return ins;
}
uint32_t CseFilter::findImmISmall(LIns* ins)
{
uint32_t k;
findImmISmall(ins->immI(), k);
return k;
}
inline LIns* CseFilter::findImmILarge(int32_t a, uint32_t &k)
{
NLKind nlkind = NLImmILarge;
const uint32_t bitmask = m_capNL[nlkind] - 1;
k = hashImmI(a) & bitmask;
uint32_t n = 1;
@@ -2296,17 +2320,17 @@ namespace nanojit
}
}
uint32_t CseFilter::findImmI(LIns* ins)
uint32_t CseFilter::findImmILarge(LIns* ins)
{
uint32_t k;
findImmI(ins->immI(), k);
findImmILarge(ins->immI(), k);
return k;
}
#ifdef NANOJIT_64BIT
inline LIns* CseFilter::findImmQ(uint64_t a, uint32_t &k)
{
NLKind nlkind = LInsImmQ;
NLKind nlkind = NLImmQ;
const uint32_t bitmask = m_capNL[nlkind] - 1;
k = hashImmQorD(a) & bitmask;
uint32_t n = 1;
@ -2332,7 +2356,7 @@ namespace nanojit
inline LIns* CseFilter::findImmD(uint64_t a, uint32_t &k)
{
NLKind nlkind = LInsImmD;
NLKind nlkind = NLImmD;
const uint32_t bitmask = m_capNL[nlkind] - 1;
k = hashImmQorD(a) & bitmask;
uint32_t n = 1;
@ -2357,7 +2381,7 @@ namespace nanojit
inline LIns* CseFilter::find1(LOpcode op, LIns* a, uint32_t &k)
{
NLKind nlkind = LIns1;
NLKind nlkind = NL1;
const uint32_t bitmask = m_capNL[nlkind] - 1;
k = hash1(op, a) & bitmask;
uint32_t n = 1;
@ -2381,7 +2405,7 @@ namespace nanojit
inline LIns* CseFilter::find2(LOpcode op, LIns* a, LIns* b, uint32_t &k)
{
NLKind nlkind = LIns2;
NLKind nlkind = NL2;
const uint32_t bitmask = m_capNL[nlkind] - 1;
k = hash2(op, a, b) & bitmask;
uint32_t n = 1;
@ -2405,7 +2429,7 @@ namespace nanojit
inline LIns* CseFilter::find3(LOpcode op, LIns* a, LIns* b, LIns* c, uint32_t &k)
{
NLKind nlkind = LIns3;
NLKind nlkind = NL3;
const uint32_t bitmask = m_capNL[nlkind] - 1;
k = hash3(op, a, b, c) & bitmask;
uint32_t n = 1;
@ -2466,7 +2490,7 @@ namespace nanojit
inline LIns* CseFilter::findCall(const CallInfo *ci, uint32_t argc, LIns* args[], uint32_t &k)
{
NLKind nlkind = LInsCall;
NLKind nlkind = NLCall;
const uint32_t bitmask = m_capNL[nlkind] - 1;
k = hashCall(ci, argc, args) & bitmask;
uint32_t n = 1;
@ -2496,10 +2520,19 @@ namespace nanojit
LIns* CseFilter::insImmI(int32_t imm)
{
uint32_t k;
LIns* ins = findImmI(imm, k);
if (!ins) {
ins = out->insImmI(imm);
addNL(LInsImmI, ins, k);
LIns* ins;
if (0 <= imm && imm < int32_t(m_capNL[NLImmISmall])) {
ins = findImmISmall(imm, k);
if (!ins) {
ins = out->insImmI(imm);
addNLImmISmall(ins, k);
}
} else {
ins = findImmILarge(imm, k);
if (!ins) {
ins = out->insImmI(imm);
addNL(NLImmILarge, ins, k);
}
}
// We assume that downstream stages do not modify the instruction, so
// that we can insert 'ins' into slot 'k'. Check this.
@@ -2514,7 +2547,7 @@ namespace nanojit
LIns* ins = findImmQ(q, k);
if (!ins) {
ins = out->insImmQ(q);
addNL(LInsImmQ, ins, k);
addNL(NLImmQ, ins, k);
}
NanoAssert(ins->isop(LIR_immq) && ins->immQ() == q);
return ins;
@@ -2534,7 +2567,7 @@ namespace nanojit
LIns* ins = findImmD(u.u64, k);
if (!ins) {
ins = out->insImmD(d);
addNL(LInsImmD, ins, k);
addNL(NLImmD, ins, k);
}
NanoAssert(ins->isop(LIR_immd) && ins->immDasQ() == u.u64);
return ins;
@@ -2555,7 +2588,7 @@ namespace nanojit
ins = find1(op, a, k);
if (!ins) {
ins = out->ins1(op, a);
addNL(LIns1, ins, k);
addNL(NL1, ins, k);
}
} else {
ins = out->ins1(op, a);
@@ -2572,7 +2605,7 @@ namespace nanojit
ins = find2(op, a, b, k);
if (!ins) {
ins = out->ins2(op, a, b);
addNL(LIns2, ins, k);
addNL(NL2, ins, k);
} else if (ins->isCmp()) {
if (knownCmpValues.containsKey(ins)) {
// We've seen this comparison before, and it was previously
@@ -2594,7 +2627,7 @@ namespace nanojit
LIns* ins = find3(op, a, b, c, k);
if (!ins) {
ins = out->ins3(op, a, b, c);
addNL(LIns3, ins, k);
addNL(NL3, ins, k);
}
NanoAssert(ins->isop(op) && ins->oprnd1() == a && ins->oprnd2() == b && ins->oprnd3() == c);
return ins;
@@ -2694,7 +2727,7 @@ namespace nanojit
ins = find1(op, c, k);
if (!ins) {
ins = out->insGuard(op, c, gr);
addNL(LIns1, ins, k);
addNL(NL1, ins, k);
}
// After this guard, we know that 'c's result was true (if
// op==LIR_xf) or false (if op==LIR_xt), else we would have
@@ -2719,7 +2752,7 @@ namespace nanojit
LIns* ins = find2(op, a, b, k);
if (!ins) {
ins = out->insGuardXov(op, a, b, gr);
addNL(LIns2, ins, k);
addNL(NL2, ins, k);
}
NanoAssert(ins->isop(op) && ins->oprnd1() == a && ins->oprnd2() == b);
return ins;
@@ -2737,7 +2770,7 @@ namespace nanojit
ins = findCall(ci, argc, args, k);
if (!ins) {
ins = out->insCall(ci, args);
addNL(LInsCall, ins, k);
addNL(NLCall, ins, k);
}
} else {
// We only need to worry about aliasing if !ci->_isPure.
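The insImmI() change above is the core of the CseFilter patch: constants 0..16 are common enough in TraceMonkey to earn a direct-indexed 17-entry table, while all other immediates keep using the hash table. A sketch of that dispatch, with the table sizes mirroring the patch but the hash standing in for nanojit's hashImmI (and without the probing and growth the real findImmILarge does):

#include <cstdint>

struct Ins;                            // opaque stand-in for LIns

static Ins*     smallTab[17];          // slot k caches the constant k itself
static Ins*     largeTab[64];          // power-of-two hash table
static uint32_t hashImm(int32_t a) { return uint32_t(a) * 2654435761u; }

static Ins*& slotForImm(int32_t imm) {
    if (0 <= imm && imm < 17)
        return smallTab[imm];          // direct lookup: no hashing, no probing
    // First probe only; on collision the real code walks a probe sequence
    // and grows the table when it gets too full.
    return largeTab[hashImm(imm) & (64 - 1)];
}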

View file: LIR.h

@@ -1924,18 +1924,19 @@ namespace nanojit
// We divide instruction kinds into groups. LIns0 isn't present
// because we don't need to record any 0-ary instructions. Loads
// aren't here, they're handled separately.
LInsImmI = 0,
LInsImmQ = 1, // only occurs on 64-bit platforms
LInsImmD = 2,
LIns1 = 3,
LIns2 = 4,
LIns3 = 5,
LInsCall = 6,
NLImmISmall = 0,
NLImmILarge = 1,
NLImmQ = 2, // only occurs on 64-bit platforms
NLImmD = 3,
NL1 = 4,
NL2 = 5,
NL3 = 6,
NLCall = 7,
LInsFirst = 0,
LInsLast = 6,
NLFirst = 0,
NLLast = 7,
// Need a value after "last" to outsmart compilers that insist last+1 is impossible.
LInsInvalid = 7
NLInvalid = 8
};
#define nextNLKind(kind) NLKind(kind+1)
@@ -1948,11 +1949,11 @@ namespace nanojit
// Don't start m_capNL too small, or we'll waste time growing and rehashing.
// Don't start m_capNL too large, will waste memory.
//
LIns** m_listNL[LInsLast + 1];
uint32_t m_capNL[ LInsLast + 1];
uint32_t m_usedNL[LInsLast + 1];
LIns** m_listNL[NLLast + 1];
uint32_t m_capNL[ NLLast + 1];
uint32_t m_usedNL[NLLast + 1];
typedef uint32_t (CseFilter::*find_t)(LIns*);
find_t m_findNL[LInsLast + 1];
find_t m_findNL[NLLast + 1];
// Similarly, for loads, there is one table for each CseAcc. A CseAcc
// is like a normal access region, but there are two extra possible
@@ -2021,7 +2022,8 @@ namespace nanojit
static uint32_t hashCall(const CallInfo *call, uint32_t argc, LIns* args[]);
// These versions are used before an LIns has been created.
LIns* findImmI(int32_t a, uint32_t &k);
LIns* findImmISmall(int32_t a, uint32_t &k);
LIns* findImmILarge(int32_t a, uint32_t &k);
#ifdef NANOJIT_64BIT
LIns* findImmQ(uint64_t a, uint32_t &k);
#endif
@@ -2036,7 +2038,8 @@ namespace nanojit
// These versions are used after an LIns has been created; they are
// used for rehashing after growing. They just call onto the
// multi-arg versions above.
uint32_t findImmI(LIns* ins);
uint32_t findImmISmall(LIns* ins);
uint32_t findImmILarge(LIns* ins);
#ifdef NANOJIT_64BIT
uint32_t findImmQ(LIns* ins);
#endif
@@ -2050,6 +2053,7 @@ namespace nanojit
void growNL(NLKind kind);
void growL(CseAcc cseAcc);
void addNLImmISmall(LIns* ins, uint32_t k);
// 'k' is the index found by findXYZ().
void addNL(NLKind kind, LIns* ins, uint32_t k);
void addL(LIns* ins, uint32_t k);
@@ -2096,7 +2100,6 @@ namespace nanojit
verbose_only(LInsPrinter* printer;)
int32_t insCount();
size_t byteCount();
// stats
struct
@@ -2123,7 +2126,6 @@ namespace nanojit
Allocator& _allocator;
uintptr_t _unused; // next unused instruction slot in the current LIR chunk
uintptr_t _limit; // one past the last usable byte of the current LIR chunk
size_t _bytesAllocated;
};
class LirBufWriter : public LirWriter
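A detail worth noting about the capacities above: every hashed table starts at a power of two so the find functions can reduce a hash with a mask instead of a modulo, and growNL() doubles capacity to preserve that. NLImmISmall is the deliberate exception: its 17 entries are direct-indexed, and growNL() asserts it never grows. The masking idiom, as a sketch (indexFor is a hypothetical helper):

#include <cstdint>

// Requires cap to be a power of two, which doubling in growNL() preserves.
static uint32_t indexFor(uint32_t hash, uint32_t cap) {
    uint32_t bitmask = cap - 1;   // e.g. cap 512 -> mask 0x1FF
    return hash & bitmask;        // same result as hash % cap, but cheaper
}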

View file: NativeSparc.cpp

@@ -115,16 +115,16 @@ namespace nanojit
}
inline void Assembler::SUB(Register rs1, Register rs2, Register rd) {
IntegerOperation(rs1, rs2, rd, 0x4, "sub");
};
}
inline void Assembler::SUBCC(Register rs1, Register rs2, Register rd) {
IntegerOperation(rs1, rs2, rd, 0x14, "subcc");
};
}
inline void Assembler::SUBI(Register rs1, int32_t simm13, Register rd) {
IntegerOperationI(rs1, simm13, rd, 0x4, "sub");
}
inline void Assembler::XOR(Register rs1, Register rs2, Register rd) {
IntegerOperation(rs1, rs2, rd, 0x3, "xor");
};
}
inline void Assembler::Bicc(int32_t a, int32_t dsp22, int32_t cond, const char *opcode) {
Format_2_2(a, cond, 0x2, dsp22);
@@ -208,6 +208,12 @@ namespace nanojit
inline void Assembler::FITOD(Register rs2, Register rd) {
FloatOperation(G0, rs2, rd, 0xc8, "fitod");
}
inline void Assembler::FDTOS(Register rs2, Register rd) {
FloatOperation(G0, rs2, rd, 0xc6, "fdtos");
}
inline void Assembler::FSTOD(Register rs2, Register rd) {
FloatOperation(G0, rs2, rd, 0xc9, "fstod");
}
inline void Assembler::JMPL(Register rs1, Register rs2, Register rd) {
Format_3_1(2, rd, 0x38, rs1, 0, rs2);
@@ -238,6 +244,15 @@ namespace nanojit
LoadOperationI(rs1, simm13, rd, 0x20, "ldf");
}
inline void Assembler::LDF32(Register rs1, int32_t immI, Register rd) {
if (isIMM13(immI)) {
LDFI(rs1, immI, rd);
} else {
LDF(rs1, L0, rd);
SET32(immI, L0);
}
}
inline void Assembler::LDDF32(Register rs1, int32_t immI, Register rd) {
if (isIMM13(immI+4)) {
LDFI(rs1, immI+4, rd + 1);
@@ -266,6 +281,22 @@ namespace nanojit
}
}
inline void Assembler::LDSB(Register rs1, Register rs2, Register rd) {
LoadOperation(rs1, rs2, rd, 0x9, "ldsb");
}
inline void Assembler::LDSBI(Register rs1, int32_t simm13, Register rd) {
LoadOperationI(rs1, simm13, rd, 0x9, "ldsb");
}
inline void Assembler::LDSB32(Register rs1, int32_t immI, Register rd) {
if (isIMM13(immI)) {
LDSBI(rs1, immI, rd);
} else {
LDSB(rs1, L0, rd);
SET32(immI, L0);
}
}
inline void Assembler::LDUH(Register rs1, Register rs2, Register rd) {
LoadOperation(rs1, rs2, rd, 0x2, "lduh");
}
@@ -282,6 +313,22 @@ namespace nanojit
}
}
inline void Assembler::LDSH(Register rs1, Register rs2, Register rd) {
LoadOperation(rs1, rs2, rd, 0xa, "ldsh");
}
inline void Assembler::LDSHI(Register rs1, int32_t simm13, Register rd) {
LoadOperationI(rs1, simm13, rd, 0xa, "ldsh");
}
inline void Assembler::LDSH32(Register rs1, int32_t immI, Register rd) {
if (isIMM13(immI)) {
LDSHI(rs1, immI, rd);
} else {
LDSH(rs1, L0, rd);
SET32(immI, L0);
}
}
inline void Assembler::LDSW(Register rs1, Register rs2, Register rd) {
LoadOperation(rs1, rs2, rd, 0x8, "ldsw");
}
@@ -475,6 +522,22 @@ namespace nanojit
}
}
inline void Assembler::STH(Register rd, Register rs1, Register rs2) {
Store(rd, rs1, rs2, 0x6, "sth");
}
inline void Assembler::STHI(Register rd, int32_t simm13, Register rs1) {
StoreI(rd, simm13, rs1, 0x6, "sth");
}
inline void Assembler::STH32(Register rd, int32_t immI, Register rs1) {
if (isIMM13(immI)) {
STHI(rd, immI, rs1);
} else {
STH(rd, L0, rs1);
SET32(immI, L0);
}
}
inline void Assembler::STB(Register rd, Register rs1, Register rs2) {
Store(rd, rs1, rs2, 0x5, "stb");
}
@@ -753,11 +816,9 @@ namespace nanojit
switch (op) {
case LIR_sti:
case LIR_sti2c:
case LIR_sti2s:
// handled by mainline code below for now
break;
case LIR_sti2s:
NanoAssertMsg(0, "NJ_EXPANDED_LOADSTORE_SUPPORTED not yet supported for this architecture");
return;
default:
NanoAssertMsg(0, "asm_store32 should never receive this LIR opcode");
return;
@@ -775,6 +836,9 @@ namespace nanojit
case LIR_sti2c:
STB32(L2, dr, rb);
break;
case LIR_sti2s:
STH32(L2, dr, rb);
break;
}
SET32(c, L2);
}
@@ -797,6 +861,9 @@ namespace nanojit
case LIR_sti2c:
STB32(ra, dr, rb);
break;
case LIR_sti2s:
STH32(ra, dr, rb);
break;
}
}
}
@@ -817,65 +884,75 @@ namespace nanojit
{
switch (ins->opcode()) {
case LIR_ldd:
case LIR_ldf2d:
// handled by mainline code below for now
break;
case LIR_ldf2d:
NanoAssertMsg(0, "NJ_EXPANDED_LOADSTORE_SUPPORTED not yet supported for this architecture");
return;
default:
NanoAssertMsg(0, "asm_load64 should never receive this LIR opcode");
return;
}
underrunProtect(72);
underrunProtect(48);
LIns* base = ins->oprnd1();
int db = ins->disp();
Register rr = ins->deprecated_getReg();
Register rb = getBaseReg(base, db, GpRegs);
int dr = deprecated_disp(ins);
Register rb;
if (base->isop(LIR_allocp)) {
rb = FP;
db += findMemFor(base);
} else {
rb = findRegFor(base, GpRegs);
}
ins->clearReg();
if (ins->isInReg()) {
Register rr = ins->getReg();
asm_maybe_spill(ins, false);
NanoAssert(rmask(rr) & FpRegs);
// don't use an fpu reg to simply load & store the value.
if (dr)
asm_mmq(FP, dr, rb, db);
deprecated_freeRsrcOf(ins);
if (rr != deprecated_UnknownReg)
{
NanoAssert(rmask(rr)&FpRegs);
_allocator.retire(rr);
if (ins->opcode() == LIR_ldd) {
LDDF32(rb, db, rr);
} else {
FSTOD(F28, rr);
LDF32(rb, db, F28);
}
} else {
NanoAssert(ins->isInAr());
int dr = arDisp(ins);
if (ins->opcode() == LIR_ldd) {
// don't use an fpu reg to simply load & store the value.
asm_mmq(FP, dr, rb, db);
} else {
STDF32(F28, dr, FP);
FSTOD(F28, F28);
LDF32(rb, db, F28);
}
}
freeResourcesOf(ins);
}
void Assembler::asm_store64(LOpcode op, LIns* value, int dr, LIns* base)
{
switch (op) {
case LIR_std:
case LIR_std2f:
// handled by mainline code below for now
break;
case LIR_std2f:
NanoAssertMsg(0, "NJ_EXPANDED_LOADSTORE_SUPPORTED not yet supported for this architecture");
return;
default:
NanoAssertMsg(0, "asm_store64 should never receive this LIR opcode");
return;
}
underrunProtect(48);
Register rb = getBaseReg(base, dr, GpRegs);
if (op == LIR_std2f) {
Register rv = ( !value->isInReg()
? findRegFor(value, FpRegs)
: value->getReg() );
NanoAssert(rmask(rv) & FpRegs);
STF32(F28, dr, rb);
FDTOS(rv, F28);
return;
}
if (value->isImmD())
{
// if a constant 64-bit value just store it now rather than
// generating a pointless store/load/store sequence
Register rb = findRegFor(base, GpRegs);
STW32(L2, dr+4, rb);
SET32(value->immDlo(), L2);
STW32(L2, dr, rb);
@@ -895,30 +972,15 @@ namespace nanojit
// c) maybe it's a double just being stored. oh well.
int da = findMemFor(value);
Register rb;
if (base->isop(LIR_allocp)) {
rb = FP;
dr += findMemFor(base);
} else {
rb = findRegFor(base, GpRegs);
}
asm_mmq(rb, dr, FP, da);
return;
}
Register rb;
if (base->isop(LIR_allocp)) {
rb = FP;
dr += findMemFor(base);
} else {
rb = findRegFor(base, GpRegs);
}
// if value already in a reg, use that, otherwise
// try to get it into XMM regs before FPU regs.
// get it into FPU regs.
Register rv = ( !value->isInReg()
? findRegFor(value, FpRegs)
: value->deprecated_getReg() );
: value->getReg() );
STDF32(rv, dr, rb);
}
@@ -1244,9 +1306,11 @@ namespace nanojit
LDSW32(ra, d, rr);
break;
case LIR_ldc2i:
LDSB32(ra, d, rr);
break;
case LIR_lds2i:
NanoAssertMsg(0, "NJ_EXPANDED_LOADSTORE_SUPPORTED not yet supported for this architecture");
return;
LDSH32(ra, d, rr);
break;
default:
NanoAssertMsg(0, "asm_load32 should never receive this LIR opcode");
return;
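The new LDSB32/LDSH32/STH32 helpers all follow one SPARC idiom: if the displacement fits the instruction's 13-bit signed immediate field, emit the short form; otherwise build the displacement in the scratch register L0 first. Because nanojit emits code backwards, the SET32 appears after the memory op in the source yet executes before it. A stand-alone sketch of the shape (the emit* functions are hypothetical placeholders for the real encoders):

#include <cstdint>

static bool isSimm13(int32_t v) { return -4096 <= v && v <= 4095; }

static void emitMemOpImm(int32_t disp) { (void)disp; /* ld [rs1 + simm13], rd */ }
static void emitMemOpReg()             { /* ld [rs1 + L0], rd */ }
static void emitSet32(int32_t v)       { (void)v; /* sethi/or pair into L0 */ }

static void memOp32(int32_t disp) {
    if (isSimm13(disp)) {
        emitMemOpImm(disp);   // displacement encoded directly in the insn
    } else {
        emitMemOpReg();       // emitted first, executes second
        emitSet32(disp);      // emitted second, executes first
    }
}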

View file: NativeSparc.h

@@ -77,7 +77,7 @@ namespace nanojit
#define NJ_MAX_PARAMETERS 1
#define NJ_JTBL_SUPPORTED 0
#define NJ_EXPANDED_LOADSTORE_SUPPORTED 0
#define NJ_EXPANDED_LOADSTORE_SUPPORTED 1
#define NJ_F2I_SUPPORTED 1
#define NJ_SOFTFLOAT_SUPPORTED 0
#define NJ_DIVI_SUPPORTED 0
@@ -296,7 +296,7 @@ namespace nanojit
Format_3A(2, rd, op3, (cond & 0xF) << 14 | (opf_cc & 0x7) << 11 | (opf_low & 0x3F) << 5 | _reg_(rs2)); \
} \
void IntegerOperation(Register rs1, Register rs2, Register rd, int32_t op3, const char *opcode); \
void Assembler::IntegerOperationI(Register rs1, int32_t simm13, Register rd, int32_t op3, const char *opcode); \
void IntegerOperationI(Register rs1, int32_t simm13, Register rd, int32_t op3, const char *opcode); \
void FloatOperation(Register rs1, Register rs2, Register rd, int32_t op3, const char *opcode); \
void Bicc(int32_t a, int32_t dsp22, int32_t cond, const char *opcode); \
void FBfcc(int32_t a, int32_t dsp22, int32_t cond, const char *opcode); \
@@ -308,7 +308,7 @@ namespace nanojit
void ShiftOperation(Register rs1, Register rs2, Register rd, int32_t op3, const char* opcode); \
void ShiftOperationI(Register rs1, int32_t shcnt32, Register rd, int32_t op3, const char* opcode); \
void Store(Register rd, Register rs1, Register rs2, int32_t op3, const char* opcode); \
void Assembler::StoreI(Register rd, int32_t simm13, Register rs1, int32_t op3, const char* opcode); \
void StoreI(Register rd, int32_t simm13, Register rs1, int32_t op3, const char* opcode); \
void ADD(Register rs1, Register rs2, Register rd); \
void ADDCC(Register rs1, Register rs2, Register rd); \
void AND(Register rs1, Register rs2, Register rd); \
@@ -355,17 +355,26 @@ namespace nanojit
void FMOVD(Register rs2, Register rd); \
void FNEGD(Register rs2, Register rd); \
void FITOD(Register rs2, Register rd); \
void FDTOS(Register rs2, Register rd); \
void FSTOD(Register rs2, Register rd); \
void JMPL(Register rs1, Register rs2, Register rd); \
void JMPLI(Register rs1, int32_t simm13, Register rd); \
void LDF(Register rs1, Register rs2, Register rd); \
void LDFI(Register rs1, int32_t simm13, Register rd); \
void LDF32(Register rs1, int32_t immI, Register rd); \
void LDDF32(Register rs1, int32_t immI, Register rd); \
void LDUB(Register rs1, Register rs2, Register rd); \
void LDUBI(Register rs1, int32_t simm13, Register rd); \
void LDUB32(Register rs1, int32_t immI, Register rd); \
void LDSB(Register rs1, Register rs2, Register rd); \
void LDSBI(Register rs1, int32_t simm13, Register rd); \
void LDSB32(Register rs1, int32_t immI, Register rd); \
void LDUH(Register rs1, Register rs2, Register rd); \
void LDUHI(Register rs1, int32_t simm13, Register rd); \
void LDUH32(Register rs1, int32_t immI, Register rd); \
void LDSH(Register rs1, Register rs2, Register rd); \
void LDSHI(Register rs1, int32_t simm13, Register rd); \
void LDSH32(Register rs1, int32_t immI, Register rd); \
void LDSW(Register rs1, Register rs2, Register rd); \
void LDSWI(Register rs1, int32_t simm13, Register rd); \
void LDSW32(Register rs1, int32_t immI, Register rd); \
@@ -428,6 +437,9 @@ namespace nanojit
void STW(Register rd, Register rs1, Register rs2); \
void STWI(Register rd, int32_t simm13, Register rs1); \
void STW32(Register rd, int32_t immI, Register rs1); \
void STH(Register rd, Register rs1, Register rs2); \
void STHI(Register rd, int32_t simm13, Register rs1); \
void STH32(Register rd, int32_t immI, Register rs1); \
void STB(Register rd, Register rs1, Register rs2); \
void STBI(Register rd, int32_t simm13, Register rs1); \
void STB32(Register rd, int32_t immI, Register rs1); \