Merge tamarin-redux (nanojit2) into tracemonkey (457786, r=edwsmith,gal,danderson).

This commit is contained in:
Graydon Hoare 2008-10-13 13:29:18 -07:00
Parent 180d9758af
Commit c6c4b6404b
21 changed files with 2074 additions and 1097 deletions

View file

@ -145,7 +145,7 @@ struct JSTraceableNative {
#define _JS_DEFINE_CALLINFO(name, crtype, cargtypes, argtypes, cse, fold) \
crtype FASTCALL js_##name cargtypes; \
const nanojit::CallInfo ci_##name = \
{ (intptr_t) &js_##name, argtypes, cse, fold _JS_CI_NAME(name) };
{ (intptr_t) &js_##name, argtypes, cse, fold, nanojit::ABI_FASTCALL _JS_CI_NAME(name) };
/*
* Declare a C function named js_<op> and a CallInfo struct named ci_<op> so
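The one functional change here is the added nanojit::ABI_FASTCALL initializer: the merged nanojit's CallInfo (its new definition appears in the LIR.h hunk below) records which calling convention each builtin uses. A minimal stand-alone sketch of what that extra field lines up with, using stand-in types and a made-up builtin rather than the real headers:

#include <cstdint>

enum AbiKind { ABI_FASTCALL, ABI_THISCALL, ABI_STDCALL, ABI_CDECL };

struct CallInfoSketch {
    intptr_t _address;
    uint32_t _argtypes;
    uint8_t  _cse, _fold;
    AbiKind  _abi;              // the field this merge adds
};

static int32_t fake_builtin(int32_t v) { return v + 1; }

const CallInfoSketch ci_fake_builtin = {
    (intptr_t) &fake_builtin,   // _address
    0,                          // _argtypes (encoding omitted in this sketch)
    1, 1,                       // _cse, _fold
    ABI_FASTCALL                // new: which calling convention the builtin uses
};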

View file

@ -344,16 +344,12 @@ math_max(JSContext *cx, uintN argc, jsval *vp)
*vp = DOUBLE_TO_JSVAL(cx->runtime->jsNaN);
return JS_TRUE;
}
if (x == 0 && x == z && fd_copysign(1.0, z) == -1)
z = x;
else
/*
* Note: it is essential that you write the ternary expression
* here such that the false branch produces z not x, as the case
* of x=-0, z=0, for which we wind up in this expression but
* evaluate either > order as false, whether we do x>z *or* z>x.
*/
if (x == 0 && x == z) {
if (fd_copysign(1.0, z) == -1)
z = x;
} else {
z = (x > z) ? x : z;
}
}
return js_NewNumberInRootedValue(cx, z, vp);
}
@ -378,9 +374,10 @@ math_min(JSContext *cx, uintN argc, jsval *vp)
*vp = DOUBLE_TO_JSVAL(cx->runtime->jsNaN);
return JS_TRUE;
}
if (x == 0 && x == z && fd_copysign(1.0,x) == -1)
z = x;
else
if (x == 0 && x == z) {
if (fd_copysign(1.0, x) == -1)
z = x;
} else
z = (x < z) ? x : z;
}
return js_NewNumberInRootedValue(cx, z, vp);
@ -623,9 +620,13 @@ js_Math_max(jsdouble d, jsdouble p)
if (JSDOUBLE_IS_NaN(d) || JSDOUBLE_IS_NaN(p))
return js_NaN;
if (p == 0 && p == d && fd_copysign(1.0, d) == -1)
return p;
return (d > p) ? d : p;
if (p == 0 && p == d) {
if (fd_copysign(1.0, d) == -1)
return p;
else
return d;
}
return (p > d) ? p : d;
}
jsdouble FASTCALL
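The rewritten max/min paths preserve ECMAScript's signed-zero rules: Math.max(-0, +0) must return +0 and Math.min(-0, +0) must return -0, which an ordinary comparison cannot decide because -0.0 == +0.0. A stand-alone sketch of the same logic, with std::copysign standing in for fd_copysign:

#include <cmath>
#include <cstdio>

// Sketch of js_Math_max above; std::copysign stands in for fd_copysign.
static double max_sketch(double d, double p) {
    if (std::isnan(d) || std::isnan(p))
        return NAN;
    if (p == 0 && p == d)                        // both operands are zeros
        return std::copysign(1.0, d) == -1 ? p   // d is -0, so the other zero wins
                                           : d;  // d is +0 (or both are +0)
    return (p > d) ? p : d;
}

int main() {
    printf("%g\n", std::copysign(1.0, max_sketch(-0.0, 0.0)));  // prints 1: result is +0
}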

View file

@ -1000,12 +1000,14 @@ TraceRecorder::TraceRecorder(JSContext* cx, GuardRecord* _anchor, Fragment* _fra
lir = cse_filter = new (&gc) CseFilter(lir, &gc);
lir = expr_filter = new (&gc) ExprFilter(lir);
lir = func_filter = new (&gc) FuncFilter(lir, *this);
lir->ins0(LIR_trace);
lir->ins0(LIR_start);
if (!nanojit::AvmCore::config.tree_opt || fragment->root == fragment) {
lirbuf->state = addName(lir->insParam(0), "state");
lirbuf->param1 = addName(lir->insParam(1), "param1");
lirbuf->state = addName(lir->insParam(0, 0), "state");
lirbuf->param1 = addName(lir->insParam(1, 0), "param1");
}
loop_header_ins = addName(lir->ins0(LIR_label), "loop_header");
lirbuf->sp = addName(lir->insLoad(LIR_ldp, lirbuf->state, (int)offsetof(InterpState, sp)), "sp");
lirbuf->rp = addName(lir->insLoad(LIR_ldp, lirbuf->state, offsetof(InterpState, rp)), "rp");
cx_ins = addName(lir->insLoad(LIR_ldp, lirbuf->state, offsetof(InterpState, cx)), "cx");
@ -1955,10 +1957,9 @@ TraceRecorder::closeLoop(Fragmento* fragmento)
SideExit *exit = snapshot(LOOP_EXIT);
exit->target = fragment->root;
if (fragment == fragment->root) {
fragment->lastIns = lir->insGuard(LIR_loop, lir->insImm(1), exit);
} else {
fragment->lastIns = lir->insGuard(LIR_x, lir->insImm(1), exit);
fragment->lastIns = lir->insBranch(LIR_j, NULL, loop_header_ins);
}
fragment->lastIns = lir->insGuard(LIR_x, lir->insImm(1), exit);
compile(fragmento);
debug_only_v(printf("recording completed at %s:%u@%u via closeLoop\n", cx->fp->script->filename,
@ -2107,9 +2108,9 @@ TraceRecorder::fuseIf(jsbytecode* pc, bool cond, LIns* x)
int
nanojit::StackFilter::getTop(LInsp guard)
{
if (sp == frag->lirbuf->sp)
if (sp == lirbuf->sp)
return guard->exit()->sp_adj;
JS_ASSERT(sp == frag->lirbuf->rp);
JS_ASSERT(sp == lirbuf->rp);
return guard->exit()->rp_adj;
}
@ -2387,7 +2388,7 @@ js_RecordTree(JSContext* cx, JSTraceMonitor* tm, Fragment* f)
while (f->code() && f->peer)
f = f->peer;
if (f->code())
f = JS_TRACE_MONITOR(cx).fragmento->newLoop(f->ip);
f = JS_TRACE_MONITOR(cx).fragmento->getAnchor(f->ip);
f->calldepth = 0;
f->root = f;
@ -2604,7 +2605,7 @@ js_ExecuteTree(JSContext* cx, Fragment** treep, uintN& inlineCallCount,
OBJ_SHAPE(globalObj), tm->globalShape);)
const void* ip = f->ip;
js_FlushJITCache(cx);
*treep = tm->fragmento->newLoop(ip);
*treep = tm->fragmento->getAnchor(ip);
return NULL;
}
@ -2624,7 +2625,7 @@ js_ExecuteTree(JSContext* cx, Fragment** treep, uintN& inlineCallCount,
bool didGC;
const void* ip = f->ip;
if (!ReplenishReservePool(cx, tm, didGC) || didGC) {
*treep = tm->fragmento->newLoop(ip);
*treep = tm->fragmento->getAnchor(ip);
return NULL;
}
}
@ -2663,6 +2664,7 @@ js_ExecuteTree(JSContext* cx, Fragment** treep, uintN& inlineCallCount,
tm->onTrace = true;
GuardRecord* lr;
debug_only(fflush(NULL);)
#if defined(JS_NO_FASTCALL) && defined(NANOJIT_IA32)
SIMULATE_FASTCALL(lr, &state, NULL, u.func);
#else
@ -2854,7 +2856,7 @@ js_MonitorLoopEdge(JSContext* cx, uintN& inlineCallCount)
} else {
f = tm->fragmento->getLoop(pc);
if (!f)
f = tm->fragmento->newLoop(pc);
f = tm->fragmento->getAnchor(pc);
cacheEntry->pc = pc;
cacheEntry->fragment = f;
}

View file

@ -220,6 +220,7 @@ class TraceRecorder : public GCObject {
#ifdef NJ_SOFTFLOAT
nanojit::LirWriter* float_filter;
#endif
nanojit::LIns* loop_header_ins;
nanojit::LIns* cx_ins;
nanojit::LIns* gp_ins;
nanojit::LIns* eos_ins;

File diff not shown because it is too large.

View file

@ -73,8 +73,8 @@ namespace nanojit
struct Reservation
{
uint32_t arIndex:16; /* index into stack frame. displ is -4*arIndex */
Register reg:8; /* register UnknownReg implies not in register */
int cost:8;
Register reg:15; /* register UnknownReg implies not in register */
uint32_t used:1;
};
struct AR
@ -83,32 +83,6 @@ namespace nanojit
uint32_t tos; /* current top of stack entry */
uint32_t highwatermark; /* max tos hit */
uint32_t lowwatermark; /* we pre-allocate entries from 0 upto this index-1; so dynamic entries are added above this index */
LIns* parameter[ NJ_MAX_PARAMETERS ]; /* incoming parameters */
};
enum ArgSize {
ARGSIZE_NONE = 0,
ARGSIZE_F = 1,
ARGSIZE_LO = 2,
ARGSIZE_Q = 3,
_ARGSIZE_MASK_INT = 2,
_ARGSIZE_MASK_ANY = 3
};
struct CallInfo
{
intptr_t _address;
uint16_t _argtypes; // 6 2-bit fields indicating arg type, by ARGSIZE above (including ret type): a1 a2 a3 a4 a5 ret
uint8_t _cse; // true if no side effects
uint8_t _fold; // true if no side effects
verbose_only ( const char* _name; )
uint32_t FASTCALL _count_args(uint32_t mask) const;
uint32_t get_sizes(ArgSize*) const;
inline uint32_t FASTCALL count_args() const { return _count_args(_ARGSIZE_MASK_ANY); }
inline uint32_t FASTCALL count_iargs() const { return _count_args(_ARGSIZE_MASK_INT); }
// fargs = args - iargs
};
#ifdef AVMPLUS_WIN32
@ -124,6 +98,10 @@ namespace nanojit
counter_define(spills;)
counter_define(native;)
counter_define(exitnative;)
int32_t pages;
NIns* codeStart;
NIns* codeExitStart;
DECLARE_PLATFORM_STATS()
#ifdef __GNUC__
@ -146,10 +124,34 @@ namespace nanojit
,MaxExit
,MaxXJump
,UnknownPrim
,UnknownBranch
};
typedef avmplus::List<NIns*, avmplus::LIST_NonGCObjects> NInsList;
typedef avmplus::SortedMap<LIns*,NIns*,avmplus::LIST_NonGCObjects> InsMap;
typedef avmplus::SortedMap<NIns*,LIns*,avmplus::LIST_NonGCObjects> NInsMap;
class LabelState MMGC_SUBCLASS_DECL
{
public:
RegAlloc regs;
NIns *addr;
LabelState(NIns *a, RegAlloc &r) : regs(r), addr(a)
{}
};
class LabelStateMap
{
GC *gc;
avmplus::SortedMap<LIns*, LabelState*, avmplus::LIST_GCObjects> labels;
public:
LabelStateMap(GC *gc) : gc(gc), labels(gc)
{}
void clear() { labels.clear(); }
void add(LIns *label, NIns *addr, RegAlloc &regs);
LabelState *get(LIns *);
};
/**
* Information about the activation record for the method is built up
* as we generate machine code. As part of the prologue, we issue
@ -190,6 +192,7 @@ namespace nanojit
void setError(AssmError e) { _err = e; }
void setCallTable(const CallInfo *functions);
void pageReset();
int32_t codeBytes();
Page* handoverPages(bool exitPages=false);
debug_only ( void pageValidate(); )
@ -197,30 +200,32 @@ namespace nanojit
// support calling out from a fragment ; used to debug the jit
debug_only( void resourceConsistencyCheck(); )
debug_only( void registerConsistencyCheck(LIns** resv); )
debug_only( void registerConsistencyCheck(); )
Stats _stats;
int hasLoop;
private:
void gen(LirFilter* toCompile, NInsList& loopJumps);
NIns* genPrologue(RegisterMask);
NIns* genEpilogue(RegisterMask);
bool ignoreInstruction(LInsp ins);
NIns* genPrologue();
NIns* genEpilogue();
GuardRecord* placeGuardRecord(LInsp guard);
void initGuardRecord(LInsp guard, GuardRecord*);
uint32_t arReserve(LIns* l);
uint32_t arFree(uint32_t idx);
void arFree(uint32_t idx);
void arReset();
Register registerAlloc(RegisterMask allow);
void registerResetAll();
void restoreCallerSaved();
void mergeRegisterState(RegAlloc& saved);
LInsp findVictim(RegAlloc& regs, RegisterMask allow, RegisterMask prefer);
void evictRegs(RegisterMask regs);
void evictScratchRegs();
void intersectRegisterState(RegAlloc& saved);
void unionRegisterState(RegAlloc& saved);
void assignSaved(RegAlloc &saved, RegisterMask skip);
LInsp findVictim(RegAlloc& regs, RegisterMask allow);
int findMemFor(LIns* i);
Register findRegFor(LIns* i, RegisterMask allow);
@ -234,12 +239,16 @@ namespace nanojit
NIns* pageAlloc(bool exitPage=false);
void pagesFree(Page*& list);
void internalReset();
bool canRemat(LIns*);
Reservation* reserveAlloc(LInsp i);
void reserveFree(LInsp i);
void reserveReset();
Reservation* getresv(LIns *x) { return x->resv() ? &_resvTable[x->resv()] : 0; }
Reservation* getresv(LIns *x) {
uint32_t resv_index = x->resv();
return resv_index ? &_resvTable[resv_index] : 0;
}
DWB(Fragmento*) _frago;
GC* _gc;
@ -259,13 +268,18 @@ namespace nanojit
AR _activation;
RegAlloc _allocator;
LabelStateMap _labels;
NInsMap _patches;
Reservation _resvTable[ NJ_MAX_STACK_ENTRY ]; // table where we house stack and register information
uint32_t _resvFree;
bool _inExit,vpad2[3];
bool _inExit, vpad2[3];
avmplus::List<LIns*, avmplus::LIST_GCObjects> pending_lives;
void asm_cmp(LIns *cond);
#ifndef NJ_SOFTFLOAT
void asm_fcmp(LIns *cond);
void asm_setcc(Register res, LIns *cond);
NIns * asm_jmpcc(bool brOnFalse, LIns *cond, NIns *target);
#endif
void asm_mmq(Register rd, int dd, Register rs, int ds);
NIns* asm_exit(LInsp guard);
@ -274,7 +288,9 @@ namespace nanojit
void asm_store32(LIns *val, int d, LIns *base);
void asm_store64(LIns *val, int d, LIns *base);
void asm_restore(LInsp, Reservation*, Register);
void asm_spill(LInsp i, Reservation *resv, bool pop);
void asm_load(int d, Register r);
void asm_spilli(LInsp i, Reservation *resv, bool pop);
void asm_spill(Register rr, int d, bool pop=false, bool quad=false);
void asm_load64(LInsp i);
void asm_pusharg(LInsp p);
NIns* asm_adjustBranch(NIns* at, NIns* target);
@ -290,6 +306,10 @@ namespace nanojit
void asm_call(LInsp);
void asm_arg(ArgSize, LInsp, Register);
Register asm_binop_rhs_reg(LInsp ins);
NIns* asm_branch(bool branchOnFalse, LInsp cond, NIns* targ);
void assignSavedParams();
void reserveSavedParams();
void handleLoopCarriedExprs();
// platform specific implementation (see NativeXXX.cpp file)
void nInit(uint32_t flags);
@ -303,6 +323,7 @@ namespace nanojit
// platform specific methods
public:
const static Register savedRegs[NumSavedRegs];
DECLARE_PLATFORM_ASSEMBLER()
private:
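The new LabelStateMap and _patches members exist because nanojit assembles LIR from back to front: when a branch is reached, its target label has either already been assembled (so its native address and register state can be looked up) or not yet (so the branch site must be recorded and patched when the label is finally emitted). A toy model of that protocol, with std::map standing in for the real containers:

#include <cstdint>
#include <map>
#include <vector>

typedef uint8_t NIns;        // stand-in for a native instruction byte
typedef int     LabelId;     // stand-in for the LIns* of a LIR_label

std::map<LabelId, NIns*> labels;    // label -> native address   (cf. LabelStateMap)
std::map<NIns*, LabelId> patches;   // branch site -> label      (cf. _patches)

// Called when a LIR_j/jt/jf is assembled. Code is emitted backwards, so a
// loop back-edge's label has not been seen yet, while a forward branch's has.
NIns* emitBranch(NIns* pc, LabelId target) {
    *--pc = 0xEB;                          // placeholder jump byte
    if (labels.find(target) == labels.end())
        patches[pc] = target;              // target unknown: leave it for patching
    // otherwise the real code would encode labels[target] directly
    return pc;
}

// Called when the LIR_label itself is assembled.
NIns* emitLabel(NIns* pc, LabelId label) {
    labels[label] = pc;
    for (std::map<NIns*, LabelId>::iterator it = patches.begin(); it != patches.end(); ++it)
        if (it->second == label) {
            // here the real assembler rewrites the branch via asm_adjustBranch()
        }
    return pc;
}

int main() {
    std::vector<NIns> buf(64);
    NIns* pc = &buf[0] + buf.size();
    pc = emitBranch(pc, /*label*/ 1);      // backward branch: label 1 not seen yet
    pc = emitLabel(pc, 1);                 // label reached: pending branch gets patched
    return 0;
}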

View file

@ -39,6 +39,7 @@
* ***** END LICENSE BLOCK ***** */
#include "nanojit.h"
#undef MEMORY_INFO
namespace nanojit
{
@ -58,16 +59,17 @@ namespace nanojit
*/
Fragmento::Fragmento(AvmCore* core, uint32_t cacheSizeLog2)
: _allocList(core->GetGC()),
_max_pages(1 << (calcSaneCacheSize(cacheSizeLog2) - NJ_LOG2_PAGE_SIZE))
_max_pages(1 << (calcSaneCacheSize(cacheSizeLog2) - NJ_LOG2_PAGE_SIZE)),
_pagesGrowth(1)
{
#ifdef MEMORY_INFO
_allocList.set_meminfo_name("Fragmento._allocList");
#endif
NanoAssert(_max_pages > _pagesGrowth); // shrink growth if needed
_core = core;
GC *gc = core->GetGC();
_frags = new (gc) FragmentMap(gc, 128);
_assm = new (gc) nanojit::Assembler(this);
_pageGrowth = 1;
verbose_only( enterCounts = new (gc) BlockHist(gc); )
verbose_only( mergeCounts = new (gc) BlockHist(gc); )
}
@ -109,10 +111,10 @@ namespace nanojit
{
NanoAssert(sizeof(Page) == NJ_PAGE_SIZE);
if (!_pageList) {
pagesGrow(_pageGrowth); // try to get more mem
if ((_pageGrowth << 1) < _max_pages)
_pageGrowth <<= 1;
}
pagesGrow(_pagesGrowth); // try to get more mem
if ((_pagesGrowth << 1) < _max_pages)
_pagesGrowth <<= 1;
}
Page *page = _pageList;
if (page)
{
@ -221,7 +223,7 @@ namespace nanojit
return _core;
}
Fragment* Fragmento::newLoop(const void* ip)
Fragment* Fragmento::getAnchor(const void* ip)
{
Fragment *f = newFrag(ip);
Fragment *p = _frags->get(ip);
@ -480,7 +482,7 @@ namespace nanojit
{
int c = hist->count(ip);
if (_assm->_verbose)
_assm->outputf("++ %s %d", core()->interp.labels->format(ip), c);
_assm->outputf("++ %s %d", labels->format(ip), c);
}
void Fragmento::countIL(uint32_t il, uint32_t abc)

View file

@ -54,7 +54,6 @@ namespace nanojit
struct PageHeader
{
struct Page *next;
verbose_only (int seq;) // sequence # of page
};
struct Page: public PageHeader
{
@ -101,8 +100,8 @@ namespace nanojit
Page* pageAlloc();
void pageFree(Page* page);
Fragment* newLoop(const void* ip);
Fragment* getLoop(const void* ip);
Fragment* getAnchor(const void* ip);
void clearFrags(); // clear all fragments from the cache
Fragment* getMerge(GuardRecord *lr, const void* ip);
Fragment* createBranch(GuardRecord *lr, const void* ip);
@ -145,13 +144,13 @@ namespace nanojit
DWB(Assembler*) _assm;
DWB(FragmentMap*) _frags; /* map from ip -> Fragment ptr */
Page* _pageList;
uint32_t _pageGrowth;
/* unmanaged mem */
AllocList _allocList;
GCHeap* _gcHeap;
const uint32_t _max_pages;
uint32_t _pagesGrowth;
};
enum TraceKind {
@ -236,18 +235,5 @@ namespace nanojit
int32_t _hits;
Page* _pages; // native code pages
};
#ifdef NJ_VERBOSE
inline int nbr(LInsp x)
{
Page *p = x->page();
return (p->seq * NJ_PAGE_SIZE + (intptr_t(x)-intptr_t(p))) / sizeof(LIns);
}
#else
inline int nbr(LInsp x)
{
return (int)(intptr_t(x) & intptr_t(NJ_PAGE_SIZE-1));
}
#endif
}
#endif // __nanojit_Fragmento__

File diff not shown because it is too large.

View file

@ -63,23 +63,34 @@ namespace nanojit
LIR64 = 0x40, // result is double or quad
// special operations (must be 0..N)
LIR_trace = 2,
LIR_nearskip = 3, // must be LIR_skip-1 and lsb=1
LIR_skip = 4,
LIR_neartramp = 5, // must be LIR_tramp-1 and lsb=1
LIR_tramp = 6,
LIR_start = 0,
LIR_nearskip = 1, // must be LIR_skip-1 and lsb=1
LIR_skip = 2,
LIR_neartramp = 3, // must be LIR_tramp-1 and lsb=1
LIR_tramp = 4,
// non-pure operations
LIR_addp = 9,
LIR_param = 10,
LIR_st = 11, // 32-bit store
LIR_ld = 12, // 32-bit load
LIR_alloc = 13, // alloca some stack space
LIR_sti = 14,
LIR_call = 18, // subrouting call returning a 32-bit value
LIR_ret = 15,
LIR_live = 16, // extend live range of reference
LIR_calli = 17, // indirect call
LIR_call = 18, // subroutine call returning a 32-bit value
// guards
LIR_loop = 19, // loop fragment
LIR_x = 20, // exit always
// branches
LIR_j = 21, // jump always
LIR_jt = 22, // jump true
LIR_jf = 23, // jump false
LIR_label = 24, // a jump target
LIR_ji = 25, // jump indirect
// operators
// LIR_feq though LIR_fge must only be used on float arguments. They
@ -137,18 +148,25 @@ namespace nanojit
LIR_ule = 62, // 0x3E 0011 1110
LIR_uge = 63, // 0x3F 0011 1111
// non-64bit ops, but we're out of code space below 64
LIR_file = 1 | LIR64,
LIR_line = 2 | LIR64,
/**
* 64bit operations
*/
LIR_stq = LIR_st | LIR64, // quad store
LIR_stqi = LIR_sti | LIR64,
LIR_fret = LIR_ret | LIR64,
LIR_quad = LIR_int | LIR64, // quad constant value
LIR_ldq = LIR_ld | LIR64, // quad load
LIR_ldqc = LIR_ldc | LIR64,
LIR_qiand = 24 | LIR64,
LIR_qiadd = 25 | LIR64,
LIR_qilsh = LIR_lsh | LIR64,
LIR_fcall = LIR_call | LIR64, // subroutine call returning quad
LIR_fcalli = LIR_calli | LIR64,
LIR_fneg = LIR_neg | LIR64, // floating-point numeric negation
LIR_fadd = LIR_add | LIR64, // floating-point addition
LIR_fsub = LIR_sub | LIR64, // floating-point subtraction
@ -164,6 +182,7 @@ namespace nanojit
#if defined NANOJIT_64BIT
#define LIR_ldp LIR_ldq
#define LIR_stp LIR_stq
#define LIR_piadd LIR_qiadd
#define LIR_piand LIR_qiand
#define LIR_pilsh LIR_qilsh
@ -171,6 +190,7 @@ namespace nanojit
#define LIR_pior LIR_qior
#else
#define LIR_ldp LIR_ld
#define LIR_stp LIR_st
#define LIR_piadd LIR_add
#define LIR_piand LIR_and
#define LIR_pilsh LIR_lsh
@ -184,7 +204,71 @@ namespace nanojit
struct SideExit;
struct Page;
struct CallInfo;
enum AbiKind {
ABI_FASTCALL,
ABI_THISCALL,
ABI_STDCALL,
ABI_CDECL
};
enum ArgSize {
ARGSIZE_NONE = 0,
ARGSIZE_F = 1,
ARGSIZE_LO = 2,
ARGSIZE_Q = 3,
_ARGSIZE_MASK_INT = 2,
_ARGSIZE_MASK_ANY = 3
};
struct CallInfo
{
uintptr_t _address;
uint32_t _argtypes:18; // 9 2-bit fields indicating arg type, by ARGSIZE above (including ret type): a1 a2 a3 a4 a5 ret
uint8_t _cse:1; // true if no side effects
uint8_t _fold:1; // true if no side effects
AbiKind _abi:3;
verbose_only ( const char* _name; )
uint32_t FASTCALL _count_args(uint32_t mask) const;
uint32_t get_sizes(ArgSize*) const;
inline bool isInterface() const {
return _address == 2 || _address == 3; /* hack! */
}
inline bool isIndirect() const {
return _address < 256;
}
inline uint32_t FASTCALL count_args() const {
return _count_args(_ARGSIZE_MASK_ANY) + isIndirect();
}
inline uint32_t FASTCALL count_iargs() const {
return _count_args(_ARGSIZE_MASK_INT);
}
// fargs = args - iargs
};
inline bool isGuard(LOpcode op) {
return op==LIR_x || op==LIR_xf || op==LIR_xt || op==LIR_loop;
}
inline bool isCall(LOpcode op) {
op = LOpcode(op & ~LIR64);
return op == LIR_call || op == LIR_calli;
}
inline bool isStore(LOpcode op) {
op = LOpcode(op & ~LIR64);
return op == LIR_st || op == LIR_sti;
}
inline bool isConst(LOpcode op) {
return (op & ~1) == LIR_short;
}
inline bool isLoad(LOpcode op) {
return op == LIR_ldq || op == LIR_ld || op == LIR_ldc || op == LIR_ldqc;
}
// Low-level Instruction 4B
// had to lay it out as a union with duplicate code fields since msvc couldn't figure out how to compact it otherwise.
@ -290,7 +374,9 @@ namespace nanojit
inline LOpcode opcode() const { return u.code; }
inline uint8_t imm8() const { return c.imm8a; }
inline uint8_t imm8b() const { return c.imm8b; }
inline int16_t imm16() const { return i.imm16; }
inline int32_t imm24() const { return t.imm24; }
inline LIns* ref() const {
#if defined NANOJIT_64BIT
return (t.code & 1) ? (LIns*)this+t.imm24 : *(LIns**)(this-2);
@ -302,6 +388,14 @@ namespace nanojit
inline uint8_t resv() const { return g.resv; }
void* payload() const;
inline Page* page() { return (Page*) alignTo(this,NJ_PAGE_SIZE); }
inline int32_t size() const {
NanoAssert(isop(LIR_alloc));
return i.imm16<<2;
}
inline void setSize(int32_t bytes) {
NanoAssert(isop(LIR_alloc) && (bytes&3)==0 && isU16(bytes>>2));
i.imm16 = bytes>>2;
}
// index args in r-l order. arg(0) is rightmost arg
inline LIns* arg(uint32_t i) {
@ -375,12 +469,12 @@ namespace nanojit
bool isQuad() const;
bool isCond() const;
bool isCmp() const;
bool isCall() const;
bool isStore() const;
bool isLoad() const;
bool isGuard() const;
bool isCall() const { return nanojit::isCall(u.code); }
bool isStore() const { return nanojit::isStore(u.code); }
bool isLoad() const { return nanojit::isLoad(u.code); }
bool isGuard() const { return nanojit::isGuard(u.code); }
// True if the instruction is a 32-bit or smaller constant integer.
bool isconst() const;
bool isconst() const { return nanojit::isConst(u.code); }
// True if the instruction is a 32-bit or smaller constant integer and
// has the value val when treated as a 32-bit signed integer.
bool isconstval(int32_t val) const;
@ -391,10 +485,13 @@ namespace nanojit
bool isTramp() {
return isop(LIR_neartramp) || isop(LIR_tramp);
}
bool isBranch() const {
return isop(LIR_jt) || isop(LIR_jf) || isop(LIR_j);
}
// Set the imm16 member. Should only be used on instructions that use
// that. If you're not sure, you shouldn't be calling it.
void setimm16(int32_t i);
void setimm24(int32_t x);
// Set the resv member. Should only be used on instructions that use
// that. If you're not sure, you shouldn't be calling it.
void setresv(uint32_t resv);
@ -405,6 +502,9 @@ namespace nanojit
void setOprnd2(LIns*);
void setOprnd3(LIns*);
void setDisp(int8_t d);
void target(LIns* t);
LIns **targetAddr();
LIns* getTarget();
SideExit *exit();
@ -424,19 +524,21 @@ namespace nanojit
bool FASTCALL isCse(LOpcode v);
bool FASTCALL isCmp(LOpcode v);
bool FASTCALL isCond(LOpcode v);
inline bool isRet(LOpcode c) {
return (c & ~LIR64) == LIR_ret;
}
bool FASTCALL isFloat(LOpcode v);
LIns* FASTCALL callArgN(LInsp i, uint32_t n);
extern const uint8_t operandCount[];
class Fragmento; // @todo remove this ; needed for minbuild for some reason?!? Should not be compiling this code at all
class LirFilter;
struct CallInfo;
// make it a GCObject so we can explicitly delete it early
class LirWriter : public GCObject
{
public:
LirWriter *out;
public:
const CallInfo *_functions;
virtual ~LirWriter() {}
@ -455,8 +557,13 @@ namespace nanojit
virtual LInsp insGuard(LOpcode v, LIns *c, SideExit *x) {
return out->insGuard(v, c, x);
}
virtual LInsp insParam(int32_t i) {
return out->insParam(i);
virtual LInsp insBranch(LOpcode v, LInsp condition, LInsp to) {
return out->insBranch(v, condition, to);
}
// arg: 0=first, 1=second, ...
// kind: 0=arg 1=saved-reg
virtual LInsp insParam(int32_t arg, int32_t kind) {
return out->insParam(arg, kind);
}
virtual LInsp insImm(int32_t imm) {
return out->insImm(imm);
@ -477,10 +584,14 @@ namespace nanojit
virtual LInsp insCall(const CallInfo *call, LInsp args[]) {
return out->insCall(call, args);
}
virtual LInsp insAlloc(int32_t size) {
return out->insAlloc(size);
}
// convenience
LIns* insLoadi(LIns *base, int disp);
LIns* insLoad(LOpcode op, LIns *base, int disp);
LIns* store(LIns* value, LIns* base, int32_t d);
// Inserts a conditional to execute and branches to execute if
// the condition is true and false respectively.
LIns* ins_choose(LIns* cond, LIns* iftrue, LIns* iffalse);
@ -491,6 +602,7 @@ namespace nanojit
LIns* ins2i(LOpcode op, LIns *oprnd1, int32_t);
LIns* qjoin(LInsp lo, LInsp hi);
LIns* insImmPtr(const void *ptr);
LIns* insImmf(double f);
};
#ifdef NJ_VERBOSE
@ -516,8 +628,8 @@ namespace nanojit
char buf[1000], *end;
void formatAddr(const void *p, char *buf);
public:
AvmCore *core;
LabelMap(AvmCore *, LabelMap* parent);
avmplus::AvmCore *core;
LabelMap(avmplus::AvmCore *, LabelMap* parent);
~LabelMap();
void add(const void *p, size_t size, size_t align, const char *name);
void add(const void *p, size_t size, size_t align, avmplus::String*);
@ -579,50 +691,63 @@ namespace nanojit
class VerboseWriter : public LirWriter
{
avmplus::List<LInsp, avmplus::LIST_NonGCObjects> code;
LirNameMap *names;
DWB(LirNameMap*) names;
public:
VerboseWriter(GC *gc, LirWriter *out, LirNameMap* names)
: LirWriter(out), code(gc), names(names)
{}
LInsp add(LInsp i) {
code.add(i);
if (i)
code.add(i);
return i;
}
LInsp add_flush(LInsp i) {
if ((i = add(i)) != 0)
flush();
return i;
}
void flush()
{
for (int j=0, n=code.size(); j < n; j++)
printf(" %s\n",names->formatIns(code[j]));
code.clear();
printf("\n");
int n = code.size();
if (n) {
for (int i=0; i < n; i++)
printf(" %s\n",names->formatIns(code[i]));
code.clear();
if (n > 1)
printf("\n");
}
}
LIns* insGuard(LOpcode op, LInsp cond, SideExit *x) {
LInsp i = add(out->insGuard(op,cond,x));
if (i)
flush();
return i;
return add_flush(out->insGuard(op,cond,x));
}
LIns* insBranch(LOpcode v, LInsp condition, LInsp to) {
return add_flush(out->insBranch(v, condition, to));
}
LIns* ins0(LOpcode v) {
LInsp i = add(out->ins0(v));
if (i)
flush();
return i;
if (v == LIR_label || v == LIR_start) {
flush();
}
return add(out->ins0(v));
}
LIns* ins1(LOpcode v, LInsp a) {
return add(out->ins1(v, a));
return isRet(v) ? add_flush(out->ins1(v, a)) : add(out->ins1(v, a));
}
LIns* ins2(LOpcode v, LInsp a, LInsp b) {
return v == LIR_2 ? out->ins2(v,a,b) : add(out->ins2(v, a, b));
}
LIns* insCall(const CallInfo *call, LInsp args[]) {
return add(out->insCall(call, args));
return add_flush(out->insCall(call, args));
}
LIns* insParam(int32_t i) {
return add(out->insParam(i));
LIns* insParam(int32_t i, int32_t kind) {
return add(out->insParam(i, kind));
}
LIns* insLoad(LOpcode v, LInsp base, LInsp disp) {
return add(out->insLoad(v, base, disp));
@ -633,6 +758,9 @@ namespace nanojit
LIns* insStorei(LInsp v, LInsp b, int32_t d) {
return add(out->insStorei(v, b, d));
}
LIns* insAlloc(int32_t size) {
return add(out->insAlloc(size));
}
};
#endif
@ -643,7 +771,8 @@ namespace nanojit
ExprFilter(LirWriter *out) : LirWriter(out) {}
LIns* ins1(LOpcode v, LIns* a);
LIns* ins2(LOpcode v, LIns* a, LIns* b);
LIns* insGuard(LOpcode v, LIns *c, SideExit *x);
LIns* insGuard(LOpcode, LIns *cond, SideExit *);
LIns* insBranch(LOpcode, LIns *cond, LIns *target);
};
// @todo, this could be replaced by a generic HashMap or HashSet, if we had one
@ -652,14 +781,14 @@ namespace nanojit
// must be a power of 2.
// don't start too small, or we'll waste time growing and rehashing.
// don't start too large, will waste memory.
static const uint32_t kInitialCap = 2048;
static const uint32_t kInitialCap = 64;
InsList m_list;
uint32_t m_used;
LInsp *m_list; // explicit WB's are used, no DWB needed.
uint32_t m_used, m_cap;
GC* m_gc;
static uint32_t FASTCALL hashcode(LInsp i);
uint32_t FASTCALL find(LInsp name, uint32_t hash, const InsList& list, uint32_t cap);
uint32_t FASTCALL find(LInsp name, uint32_t hash, const LInsp *list, uint32_t cap);
static bool FASTCALL equals(LInsp a, LInsp b);
void FASTCALL grow();
@ -673,6 +802,7 @@ namespace nanojit
LInsp findcall(const CallInfo *call, uint32_t argc, LInsp args[], uint32_t &i);
LInsp add(LInsp i, uint32_t k);
void replace(LInsp i);
void clear();
static uint32_t FASTCALL hashimm(int32_t);
static uint32_t FASTCALL hashimmq(uint64_t);
@ -695,7 +825,6 @@ namespace nanojit
LIns* insGuard(LOpcode op, LInsp cond, SideExit *x);
};
struct Page;
class LirBuffer : public GCFinalizedObject
{
public:
@ -704,13 +833,13 @@ namespace nanojit
virtual ~LirBuffer();
void clear();
LInsp next();
LInsp commit(uint32_t count);
bool addPage();
bool outOmem() { return _noMem != 0; }
debug_only (void validate() const;)
debug_only (void validate() const;)
verbose_only(DWB(LirNameMap*) names;)
verbose_only(int insCount();)
verbose_only(int byteCount();)
int32_t insCount();
int32_t byteCount();
// stats
struct
@ -721,14 +850,20 @@ namespace nanojit
_stats;
const CallInfo* _functions;
AbiKind abi;
LInsp state,param1,sp,rp;
LInsp savedParams[NumSavedRegs];
private:
protected:
friend class LirBufWriter;
LInsp commit(uint32_t count);
bool addPage();
Page* pageAlloc();
Page* _start; // first page
LInsp _unused; // next unused instruction slot
int _noMem; // set if ran out of memory when writing to buffer
Page* _start; // first page
LInsp _unused; // next unused instruction slot
int _noMem; // set if ran out of memory when writing to buffer
};
class LirBufWriter : public LirWriter
@ -749,17 +884,24 @@ namespace nanojit
LInsp ins0(LOpcode op);
LInsp ins1(LOpcode op, LInsp o1);
LInsp ins2(LOpcode op, LInsp o1, LInsp o2);
LInsp insParam(int32_t i);
LInsp insParam(int32_t i, int32_t kind);
LInsp insImm(int32_t imm);
LInsp insImmq(uint64_t imm);
LInsp insCall(const CallInfo *call, LInsp args[]);
LInsp insGuard(LOpcode op, LInsp cond, SideExit *x);
LInsp insBranch(LOpcode v, LInsp condition, LInsp to);
LInsp insAlloc(int32_t size);
// buffer mgmt
LInsp skip(size_t);
protected:
LInsp insFar(LOpcode op, LInsp target);
LInsp insLink(LOpcode op, LInsp target);
LInsp ensureReferenceable(LInsp i, int32_t addedDistance);
bool ensureRoom(uint32_t count);
bool can8bReach(LInsp from, LInsp to) { return isU8(from-to-1); }
bool can24bReach(LInsp from, LInsp to){ return isS24(from-to); }
bool canReference(LInsp from, LInsp to) {
return isU8(from-to-1);
}
@ -795,24 +937,27 @@ namespace nanojit
LInsp pos() {
return _i;
}
void setpos(LIns *i) {
_i = i;
}
};
class Assembler;
void compile(Assembler *assm, Fragment *frag);
verbose_only( void printTracker(const char* s, avmplus::RegionTracker& trk, Assembler* assm); )
verbose_only(void live(GC *gc, Assembler *assm, Fragment *frag);)
verbose_only(void live(GC *gc, LirBuffer *lirbuf);)
class StackFilter: public LirFilter
{
GC *gc;
Fragment *frag;
LirBuffer *lirbuf;
LInsp sp;
avmplus::BitSet stk;
int top;
int getTop(LInsp guard);
int getTop(LInsp br);
public:
StackFilter(LirFilter *in, GC *gc, Fragment *frag, LInsp sp);
StackFilter(LirFilter *in, GC *gc, LirBuffer *lirbuf, LInsp sp);
virtual ~StackFilter() {}
LInsp read();
};
@ -825,5 +970,23 @@ namespace nanojit
CseReader(LirFilter *in, LInsHashSet *exprs, const CallInfo*);
LInsp read();
};
// eliminate redundant loads by watching for stores & mutator calls
class LoadFilter: public LirWriter
{
public:
LInsp sp, rp;
LInsHashSet exprs;
void clear(LInsp p);
public:
LoadFilter(LirWriter *out, GC *gc)
: LirWriter(out), exprs(gc) { }
LInsp ins0(LOpcode);
LInsp insLoad(LOpcode, LInsp base, LInsp disp);
LInsp insStore(LInsp v, LInsp b, LInsp d);
LInsp insStorei(LInsp v, LInsp b, int32_t d);
LInsp insCall(const CallInfo *call, LInsp args[]);
};
}
#endif // __nanojit_LIR__
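Of the new filters declared above, LoadFilter is the easiest to picture: it remembers the value last loaded from each address and reuses it until a store or a non-cse call could have clobbered memory. A simplified stand-alone model of the idea (the real filter keys on LIR instructions and exempts stack traffic through sp and rp):

#include <cstdint>
#include <map>
#include <utility>

typedef std::pair<int, int32_t> Addr;   // (base value id, displacement)
typedef int ValueId;                    // stand-in for an LIns*

std::map<Addr, ValueId> known_loads;

ValueId insLoad(int base, int32_t disp, ValueId fresh) {
    Addr key(base, disp);
    std::map<Addr, ValueId>::iterator it = known_loads.find(key);
    if (it != known_loads.end())
        return it->second;              // same address already loaded: reuse it
    known_loads[key] = fresh;           // otherwise record the new load
    return fresh;
}

void insStore()        { known_loads.clear(); }                // any store may alias: forget everything
void insCall(bool cse) { if (!cse) known_loads.clear(); }      // impure calls likewise

int main() {
    ValueId a = insLoad(/*base*/ 7, /*disp*/ 8, /*fresh*/ 100);
    ValueId b = insLoad(7, 8, 101);     // redundant: returns 100 again
    insStore();
    ValueId c = insLoad(7, 8, 102);     // after a store the load must be redone
    return (a == b && c == 102) ? 0 : 1;
}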

View file

@ -109,6 +109,7 @@ namespace nanojit
typedef int RegisterMask;
/* RBX, R13-R15 */
static const int NumSavedRegs = 3;
static const RegisterMask SavedRegs = /*(1<<RBX) |*/ /*(1<<R12) |*/ (1<<R13) | (1<<R14) | (1<<R15);
/* RAX, RCX, RDX, RDI, RSI, R8-R11 */
static const RegisterMask TempRegs = (1<<RAX) | (1<<RCX) | (1<<RDX) | (1<<R8) | (1<<R9) | (1<<R10) | (1<<R11) | (1<<RDI) | (1<<RSI);

View file

@ -157,6 +157,7 @@ static const RegisterMask SavedFpRegs = 1<<D0 | 1<<D1 | 1<<D2 | 1<<D3 | 1<<D4 |
#else
static const RegisterMask SavedFpRegs = 0;
#endif
static const int NumSavedRegs = 7;
static const RegisterMask SavedRegs = 1<<R4 | 1<<R5 | 1<<R6 | 1<<R7 | 1<<R8 | 1<<R9 | 1<<R10 | SavedFpRegs;
static const RegisterMask FpRegs = 1<<D0 | 1<<D1 | 1<<D2 | 1<<D3 | 1<<D4 | 1<<D5 | 1<<D6; // no D7; S14-S15 are used for i2f/u2f.
static const RegisterMask GpRegs = 0x07FF;

View file

@ -61,6 +61,12 @@ namespace nanojit
const Register Assembler::argRegs[] = { R0, R1, R2, R3 };
const Register Assembler::retRegs[] = { R0, R1 };
#ifdef NJ_THUMB_JIT
const Register Assembler::savedRegs[] = { R4, R5, R6, R7 };
#else
const Register Assembler::savedRegs[] = { R4, R5, R6, R7, R8, R9, R10 };
#endif
void Assembler::nInit(AvmCore*)
{
// Thumb mode does not have conditional move, alas
@ -269,7 +275,7 @@ namespace nanojit
else if (op == LIR_callh)
prefer = rmask(R1);
else if (op == LIR_param)
prefer = rmask(imm2register(i->imm8()));
prefer = rmask(imm2register(argRegs[i->imm8()]));
if (_allocator.free & allow & prefer)
allow &= prefer;

View file

@ -101,6 +101,7 @@ namespace nanojit
}
FragInfo;
static const int NumSavedRegs = 4;
static const RegisterMask SavedRegs = 1<<R4 | 1<<R5 | 1<<R6 | 1<<R7;
static const RegisterMask FpRegs = 0x0000; // FST0-FST7
static const RegisterMask GpRegs = 0x003F;

View file

@ -71,6 +71,7 @@ namespace nanojit
#if defined NANOJIT_IA32
const Register Assembler::argRegs[] = { ECX, EDX };
const Register Assembler::retRegs[] = { EAX, EDX };
const Register Assembler::savedRegs[] = { EBX, ESI, EDI };
#elif defined NANOJIT_AMD64
#if defined WIN64
const Register Assembler::argRegs[] = { R8, R9, RCX, RDX };
@ -78,8 +79,17 @@ namespace nanojit
const Register Assembler::argRegs[] = { RDI, RSI, RDX, RCX, R8, R9 };
#endif
const Register Assembler::retRegs[] = { RAX, RDX };
const Register Assembler::savedRegs[] = { R13, R14, R15 };
#endif
const static uint8_t max_abi_regs[] = {
2, /* ABI_FASTCALL */
1, /* ABI_THISCALL */
0, /* ABI_STDCALL */
0 /* ABI_CDECL */
};
void Assembler::nInit(AvmCore* core)
{
OSDep::getDate();
@ -89,21 +99,16 @@ namespace nanojit
#endif
}
NIns* Assembler::genPrologue(RegisterMask needSaving)
NIns* Assembler::genPrologue()
{
/**
* Prologue
*/
uint32_t stackNeeded = STACK_GRANULARITY * _activation.highwatermark;
uint32_t savingCount = 0;
for(Register i=FirstReg; i <= LastReg; i = nextreg(i))
if (needSaving&rmask(i))
savingCount++;
// After forcing alignment, we've pushed the pre-alignment SP
// and savingCount registers.
uint32_t stackPushed = STACK_GRANULARITY * (1+savingCount);
uint32_t stackPushed =
STACK_GRANULARITY + // returnaddr
STACK_GRANULARITY; // ebp
uint32_t aligned = alignUp(stackNeeded + stackPushed, NJ_ALIGN_STACK);
uint32_t amt = aligned - stackPushed;
@ -122,37 +127,37 @@ namespace nanojit
verbose_only( verbose_output(" patch entry:"); )
NIns *patchEntry = _nIns;
MR(FP, SP); // Establish our own FP.
PUSHr(FP); // Save caller's FP.
// Save pre-alignment SP value here, where the FP will point,
// to preserve the illusion of a valid frame chain for
// functions like MMgc::GetStackTrace. The 'return address'
// of this 'frame' will be the last-saved register, but that's
// fine, because the next-older frame will be legit.
PUSHr(FP);
for(Register i=FirstReg; i <= LastReg; i = nextreg(i))
if (needSaving&rmask(i))
PUSHr(i);
// We'd like to be able to use SSE instructions like MOVDQA on
// stack slots; it requires 16B alignment. Darwin requires a
// 16B stack alignment, and Linux GCC seems to intend to
// establish and preserve the same, but we're told that GCC
// has not always done this right. To avoid doubt, do it on
// all platforms. The prologue runs only when we enter
// fragments from the interpreter, so forcing 16B alignment
// here is cheap.
#if defined NANOJIT_IA32
ANDi(SP, -NJ_ALIGN_STACK);
#elif defined NANOJIT_AMD64
ANDQi(SP, -NJ_ALIGN_STACK);
#endif
MR(FP,SP);
PUSHr(FP); // Save caller's FP.
// align the entry point
asm_align_code();
return patchEntry;
}
void Assembler::asm_align_code() {
static char nop[][9] = {
{0x90},
{0x66,0x90},
{0x0f,0x1f,0x00},
{0x0f,0x1f,0x40,0x00},
{0x0f,0x1f,0x44,0x00,0x00},
{0x66,0x0f,0x1f,0x44,0x00,0x00},
{0x0f,0x1f,0x80,0x00,0x00,0x00,0x00},
{0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00},
{0x66,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00},
};
unsigned n;
while((n = uintptr_t(_nIns) & 15) != 0) {
if (n > 9)
n = 9;
underrunProtect(n);
_nIns -= n;
memcpy(_nIns, nop[n-1], n);
asm_output1("nop%d", n);
}
}
void Assembler::nFragExit(LInsp guard)
{
SideExit *exit = guard->exit();
@ -191,7 +196,6 @@ namespace nanojit
// first restore ESP from EBP, undoing SUBi(SP,amt) from genPrologue
MR(SP,FP);
#ifdef NJ_VERBOSE
if (_frago->core()->config.show_stats) {
// load EDX (arg1) with Fragment *fromFrag, target fragment
@ -213,17 +217,11 @@ namespace nanojit
#endif
}
NIns *Assembler::genEpilogue(RegisterMask restore)
NIns *Assembler::genEpilogue()
{
RET();
POPr(FP); // Restore caller's FP.
MR(SP,FP); // Undo forced alignment.
// Restore saved registers.
for (Register i=UnknownReg; i >= FirstReg; i = prevreg(i))
if (restore&rmask(i)) { POPr(i); }
POPr(FP); // Pop the pre-alignment SP.
MR(SP,FP); // pop the stack frame
return _nIns;
}
@ -232,75 +230,92 @@ namespace nanojit
{
const CallInfo* call = ins->callInfo();
// must be signed, not unsigned
const uint32_t iargs = call->count_iargs();
int32_t fstack = call->count_args() - iargs;
uint32_t iargs = call->count_iargs();
int32_t fargs = call->count_args() - iargs - call->isIndirect();
bool imt = call->isInterface();
if (imt)
iargs --;
uint32_t max_regs = max_abi_regs[call->_abi];
if (max_regs > iargs)
max_regs = iargs;
int32_t istack = iargs-max_regs; // first 2 4B args are in registers
int32_t extra = 0;
const int32_t pushsize = 4*istack + 8*fargs; // actual stack space used
#if defined NJ_NO_FASTCALL
int32_t istack = iargs;
#if _MSC_VER
// msc is slack, and MIR doesn't do anything extra, so lets use this
// call-site alignment to at least have code size parity with MIR.
uint32_t align = 4;//NJ_ALIGN_STACK;
#else
int32_t istack = iargs-2; // first 2 4B args are in registers
if (istack <= 0)
{
istack = 0;
}
uint32_t align = NJ_ALIGN_STACK;
#endif
const int32_t size = 4*istack + 8*fstack; // actual stack space used
if (size) {
if (pushsize) {
// stack re-alignment
// only pop our adjustment amount since callee pops args in FASTCALL mode
extra = alignUp(size, NJ_ALIGN_STACK) - (size);
#ifndef NJ_NO_FASTCALL
if (extra > 0)
{
extra = alignUp(pushsize, align) - pushsize;
if (call->_abi == ABI_CDECL) {
// with CDECL only, caller pops args
ADDi(SP, extra+pushsize);
} else if (extra > 0) {
ADDi(SP, extra);
}
#endif
}
#ifdef NJ_NO_FASTCALL
// In C calling conventions, callee doesn't pop args.
ADDi(SP, 4*iargs + 8*fstack + extra);
#endif
CALL(call);
#ifdef NJ_NO_FASTCALL
if (iargs >= 1) {
PUSHr(ECX);
if (iargs >= 2) {
PUSHr(EDX);
}
}
#endif
bool indirect = false;
if (ins->isop(LIR_call) || ins->isop(LIR_fcall)) {
verbose_only(if (_verbose)
outputf(" %p:", _nIns);
)
CALL(call);
}
else {
// indirect call. x86 Calling conventions don't use EAX as an
// argument, and do use EAX as a return value. We need a register
// for the address to call, so we use EAX since it will always be
// available
NanoAssert(ins->isop(LIR_calli) || ins->isop(LIR_fcalli));
CALLr(call, EAX);
indirect = true;
}
// make sure fpu stack is empty before call (restoreCallerSaved)
NanoAssert(_allocator.isFree(FST0));
// note: this code requires that ref arguments (ARGSIZE_Q)
// be one of the first two arguments
// pre-assign registers to the first 2 4B args
const int max_regs = (iargs < 2) ? iargs : 2;
int n = 0;
// pre-assign registers to the first N 4B args based on the calling convention
uint32_t n = 0;
ArgSize sizes[10];
ArgSize sizes[2*MAXARGS];
uint32_t argc = call->get_sizes(sizes);
if (indirect) {
argc--;
asm_arg(ARGSIZE_LO, ins->arg(argc), EAX);
}
if (imt) {
// interface thunk calling convention: put iid in EDX
NanoAssert(call->_abi == ABI_CDECL);
argc--;
asm_arg(ARGSIZE_LO, ins->arg(argc), EDX);
}
for(uint32_t i=0; i < argc; i++)
{
uint32_t j = argc-i-1;
ArgSize sz = sizes[j];
Register r = UnknownReg;
if (n < max_regs && sz != ARGSIZE_F)
r = argRegs[n++]; // tell asm_arg what reg to use
if (n < max_regs && sz != ARGSIZE_F) {
r = argRegs[n++]; // tell asm_arg what reg to use
}
asm_arg(sz, ins->arg(j), r);
}
if (extra > 0)
{
SUBi(SP, extra);
}
}
#elif defined NANOJIT_AMD64
@ -310,7 +325,7 @@ namespace nanojit
Register fpu_reg = XMM0;
const CallInfo* call = ins->callInfo();
int n = 0;
CALL(call);
ArgSize sizes[10];
@ -417,25 +432,26 @@ namespace nanojit
{
uint32_t op = i->opcode();
int prefer = allow;
if (op == LIR_call)
#if defined NANOJIT_IA32
prefer &= rmask(EAX);
#elif defined NANOJIT_AMD64
prefer &= rmask(RAX);
#endif
else if (op == LIR_param)
prefer &= rmask(Register(i->imm8()));
#if defined NANOJIT_IA32
else if (op == LIR_callh || op == LIR_rsh && i->oprnd1()->opcode()==LIR_callh)
prefer &= rmask(EDX);
#else
else if (op == LIR_callh)
prefer &= rmask(RAX);
#endif
else if (i->isCmp())
if (op == LIR_call || op == LIR_calli) {
prefer &= rmask(retRegs[0]);
}
else if (op == LIR_fcall || op == LIR_fcalli) {
prefer &= rmask(FST0);
}
else if (op == LIR_param) {
uint32_t max_regs = max_abi_regs[_thisfrag->lirbuf->abi];
if (i->imm8() < max_regs)
prefer &= rmask(Register(i->imm8()));
}
else if (op == LIR_callh || op == LIR_rsh && i->oprnd1()->opcode()==LIR_callh) {
prefer &= rmask(retRegs[1]);
}
else if (i->isCmp()) {
prefer &= AllowableFlagRegs;
else if (i->isconst())
}
else if (i->isconst()) {
prefer &= ScratchRegs;
}
return (_allocator.free & prefer) ? prefer : allow;
}
@ -476,38 +492,49 @@ namespace nanojit
freeRsrcOf(ins, false); // if we had a reg in use, emit a ST to flush it to mem
}
void Assembler::asm_load(int d, Register r)
{
if (rmask(r) & FpRegs)
{
#if defined NANOJIT_IA32
if (rmask(r) & XmmRegs) {
#endif
SSE_LDQ(r, d, FP);
#if defined NANOJIT_IA32
} else {
FLDQ(d, FP);
}
#endif
}
#if defined NANOJIT_AMD64
else if (i->opcode() == LIR_param)
{
LDQ(r, d, FP);
}
#endif
else
{
LD(r, d, FP);
}
}
void Assembler::asm_restore(LInsp i, Reservation *resv, Register r)
{
if (i->isconst())
{
if (i->isop(LIR_alloc)) {
LEA(r, disp(resv), FP);
verbose_only(if (_verbose) {
outputf(" remat %s size %d", _thisfrag->lirbuf->names->formatRef(i), i->size());
})
}
else if (i->isconst()) {
if (!resv->arIndex) {
reserveFree(i);
}
LDi(r, i->constval());
}
else
{
else {
int d = findMemFor(i);
if (rmask(r) & FpRegs)
{
#if defined NANOJIT_IA32
if (rmask(r) & XmmRegs) {
#endif
SSE_LDQ(r, d, FP);
#if defined NANOJIT_IA32
} else {
FLDQ(d, FP);
}
#endif
}
else
{
#if defined NANOJIT_AMD64
LDQ(r, d, FP);
#else
LD(r, d, FP);
#endif
}
asm_load(d,r);
verbose_only(if (_verbose) {
outputf(" restore %s", _thisfrag->lirbuf->names->formatRef(i));
})
@ -518,7 +545,13 @@ namespace nanojit
{
if (value->isconst())
{
Register rb = findRegFor(base, GpRegs);
Register rb;
if (base->isop(LIR_alloc)) {
rb = FP;
dr += findMemFor(base);
} else {
rb = findRegFor(base, GpRegs);
}
int c = value->constval();
STi(rb, dr, c);
}
@ -526,18 +559,28 @@ namespace nanojit
{
// make sure what is in a register
Reservation *rA, *rB;
findRegFor2(GpRegs, value, rA, base, rB);
Register ra = rA->reg;
Register rb = rB->reg;
Register ra, rb;
if (base->isop(LIR_alloc)) {
rb = FP;
dr += findMemFor(base);
ra = findRegFor(value, GpRegs);
} else if (base->isconst()) {
// absolute address
dr += base->constval();
ra = findRegFor(value, GpRegs);
rb = UnknownReg;
} else {
findRegFor2(GpRegs, value, rA, base, rB);
ra = rA->reg;
rb = rB->reg;
}
ST(rb, dr, ra);
}
}
void Assembler::asm_spill(LInsp i, Reservation *resv, bool pop)
void Assembler::asm_spill(Register rr, int d, bool pop, bool quad)
{
(void)i;
int d = disp(resv);
Register rr = resv->reg;
(void)quad;
if (d)
{
// save to spill location
@ -553,17 +596,16 @@ namespace nanojit
}
#endif
}
#if defined NANOJIT_AMD64
else if (quad)
{
STQ(FP, d, rr);
}
#endif
else
{
#if defined NANOJIT_AMD64
STQ(FP, d, rr);
#else
ST(FP, d, rr);
#endif
}
verbose_only(if (_verbose) {
outputf(" spill %s",_thisfrag->lirbuf->names->formatRef(i));
})
}
#if defined NANOJIT_IA32
else if (pop && (rmask(rr) & x87Regs))
@ -571,7 +613,21 @@ namespace nanojit
// pop the fpu result since it isn't used
FSTP(FST0);
}
#endif
#endif
}
void Assembler::asm_spilli(LInsp i, Reservation *resv, bool pop)
{
int d = disp(resv);
Register rr = resv->reg;
bool quad = i->opcode() == LIR_param || i->isQuad();
asm_spill(rr, d, pop, quad);
if (d)
{
verbose_only(if (_verbose) {
outputf(" spill %s",_thisfrag->lirbuf->names->formatRef(i));
})
}
}
void Assembler::asm_load64(LInsp ins)
@ -584,7 +640,13 @@ namespace nanojit
if (rr != UnknownReg && rmask(rr) & XmmRegs)
{
freeRsrcOf(ins, false);
Register rb = findRegFor(base, GpRegs);
Register rb;
if (base->isop(LIR_alloc)) {
rb = FP;
db += findMemFor(base);
} else {
rb = findRegFor(base, GpRegs);
}
SSE_LDQ(rr, db, rb);
}
#if defined NANOJIT_AMD64
@ -614,7 +676,13 @@ namespace nanojit
else
{
int dr = disp(resv);
Register rb = findRegFor(base, GpRegs);
Register rb;
if (base->isop(LIR_alloc)) {
rb = FP;
db += findMemFor(base);
} else {
rb = findRegFor(base, GpRegs);
}
resv->reg = UnknownReg;
// don't use an fpu reg to simply load & store the value.
@ -639,7 +707,13 @@ namespace nanojit
{
// if a constant 64-bit value just store it now rather than
// generating a pointless store/load/store sequence
Register rb = findRegFor(base, GpRegs);
Register rb;
if (base->isop(LIR_alloc)) {
rb = FP;
dr += findMemFor(base);
} else {
rb = findRegFor(base, GpRegs);
}
const int32_t* p = (const int32_t*) (value-2);
STi(rb, dr+4, p[1]);
STi(rb, dr, p[0]);
@ -647,7 +721,7 @@ namespace nanojit
}
#if defined NANOJIT_IA32
if (value->isop(LIR_ldq) || value->isop(LIR_qjoin))
if (value->isop(LIR_ldq) || value->isop(LIR_ldqc) || value->isop(LIR_qjoin))
{
// value is 64bit struct or int64_t, or maybe a double.
// it may be live in an FPU reg. Either way, don't
@ -660,21 +734,47 @@ namespace nanojit
if (avmplus::AvmCore::use_sse2()) {
Register rv = findRegFor(value, XmmRegs);
Register rb = findRegFor(base, GpRegs);
Register rb;
if (base->isop(LIR_alloc)) {
rb = FP;
dr += findMemFor(base);
} else {
rb = findRegFor(base, GpRegs);
}
SSE_STQ(dr, rb, rv);
return;
}
int da = findMemFor(value);
Register rb = findRegFor(base, GpRegs);
Register rb;
if (base->isop(LIR_alloc)) {
rb = FP;
dr += findMemFor(base);
} else {
rb = findRegFor(base, GpRegs);
}
asm_mmq(rb, dr, FP, da);
return;
}
Register rb;
if (base->isop(LIR_alloc)) {
rb = FP;
dr += findMemFor(base);
} else {
rb = findRegFor(base, GpRegs);
}
// if value already in a reg, use that, otherwise
// try to get it into XMM regs before FPU regs.
Reservation* rA = getresv(value);
Register rv;
int pop = !rA || rA->reg==UnknownReg;
Register rv = findRegFor(value, avmplus::AvmCore::use_sse2() ? XmmRegs : FpRegs);
Register rb = findRegFor(base, GpRegs);
if (pop) {
rv = findRegFor(value, avmplus::AvmCore::use_sse2() ? XmmRegs : FpRegs);
} else {
rv = rA->reg;
}
if (rmask(rv) & XmmRegs) {
SSE_STQ(dr, rb, rv);
@ -763,8 +863,10 @@ namespace nanojit
NanoAssert((rmask(rr) & FpRegs) != 0);
const double d = ins->constvalf();
const uint64_t q = ins->constvalq();
if (rmask(rr) & XmmRegs) {
if (d == 0.0) {
if (q == 0.0) {
// test (int64)0 since -0.0 == 0.0
SSE_XORPDr(rr, rr);
} else if (d == 1.0) {
// 1.0 is extremely frequent and worth special-casing!
@ -776,7 +878,8 @@ namespace nanojit
SSE_LDQ(rr, d, FP);
}
} else {
if (d == 0.0) {
if (q == 0.0) {
// test (int64)0 since -0.0 == 0.0
FLDZ();
} else if (d == 1.0) {
FLD1();
@ -803,24 +906,23 @@ namespace nanojit
if (rR->reg != UnknownReg)
{
Register rr = rR->reg;
freeRsrcOf(ins, false);
if (rmask(rr) & GpRegs)
if (rmask(rR->reg) & GpRegs)
{
LDQi(rr, val);
LDQi(rR->reg, val);
}
else if (rmask(rr) & XmmRegs)
else if (rmask(rR->reg) & XmmRegs)
{
if (ins->constvalf() == 0.0)
{
SSE_XORPDr(rr, rr);
SSE_XORPDr(rR->reg, rR->reg);
}
else
{
/* Get a short-lived register, not associated with instruction */
Register rd = rR->reg;
Register rs = registerAlloc(GpRegs);
SSE_MOVD(rr, rs);
SSE_MOVD(rd, rs);
LDQi(rs, val);
_allocator.addFree(rs);
@ -831,10 +933,11 @@ namespace nanojit
{
const int32_t* p = (const int32_t*) (ins-2);
int dr = disp(rR);
freeRsrcOf(ins, false);
STi(FP, dr+4, p[1]);
STi(FP, dr, p[0]);
}
freeRsrcOf(ins, false);
#endif
}
@ -925,22 +1028,75 @@ namespace nanojit
#endif
}
void Assembler::asm_arg(ArgSize sz, LInsp p, Register r)
{
if (sz == ARGSIZE_Q)
{
// ref arg - use lea
if (r != UnknownReg)
{
// arg in specific reg
int da = findMemFor(p);
LEA(r, da, FP);
}
else
{
NanoAssert(0); // not supported
}
}
else if (sz == ARGSIZE_LO)
{
if (r != UnknownReg) {
// arg goes in specific register
if (p->isconst()) {
LDi(r, p->constval());
} else {
Reservation* rA = getresv(p);
if (rA) {
if (rA->reg == UnknownReg) {
// load it into the arg reg
int d = findMemFor(p);
if (p->isop(LIR_alloc)) {
LEA(r, d, FP);
} else {
LD(r, d, FP);
}
} else {
// it must be in a saved reg
MR(r, rA->reg);
}
}
else {
// this is the last use, so fine to assign it
// to the scratch reg, it's dead after this point.
findSpecificRegFor(p, r);
}
}
}
else {
asm_pusharg(p);
}
}
else
{
NanoAssert(sz == ARGSIZE_F);
asm_farg(p);
}
}
void Assembler::asm_pusharg(LInsp p)
{
// arg goes on stack
Reservation* rA = getresv(p);
if (rA == 0)
if (rA == 0 && p->isconst())
{
if (p->isconst())
{
// small const we push directly
PUSHi(p->constval());
}
else
{
Register ra = findRegFor(p, GpRegs);
PUSHr(ra);
}
// small const we push directly
PUSHi(p->constval());
}
else if (rA == 0 || p->isop(LIR_alloc))
{
Register ra = findRegFor(p, GpRegs);
PUSHr(ra);
}
else if (rA->reg == UnknownReg)
{
@ -955,14 +1111,16 @@ namespace nanojit
void Assembler::asm_farg(LInsp p)
{
#if defined NANOJIT_IA32
NanoAssert(p->isQuad());
Register r = findRegFor(p, FpRegs);
if (rmask(r) & XmmRegs) {
SSE_STQ(0, SP, r);
} else {
FSTPQ(0, SP);
}
PUSHr(ECX); // 2*pushr is smaller than sub
PUSHr(ECX);
SUBi(ESP,8);
//PUSHr(ECX); // 2*pushr is smaller than sub
//PUSHr(ECX);
#endif
}
@ -997,7 +1155,10 @@ namespace nanojit
*/
ra = findRegFor(lhs, XmmRegs);
}
// else, rA already has a register assigned.
else {
// rA already has a register assigned but maybe not from the allow set
ra = findRegFor(lhs, allow);
}
if (lhs == rhs)
rb = ra;
@ -1190,6 +1351,75 @@ namespace nanojit
}
}
NIns * Assembler::asm_jmpcc(bool branchOnFalse, LIns *cond, NIns *targ)
{
LOpcode c = cond->opcode();
if (avmplus::AvmCore::use_sse2() && c != LIR_feq) {
LIns *lhs = cond->oprnd1();
LIns *rhs = cond->oprnd2();
if (c == LIR_flt) {
LIns *t = lhs; lhs = rhs; rhs = t;
c = LIR_fgt;
}
else if (c == LIR_fle) {
LIns *t = lhs; lhs = rhs; rhs = t;
c = LIR_fge;
}
if (c == LIR_fgt) {
if (branchOnFalse) { JNA(targ); } else { JA(targ); }
}
else { // if (c == LIR_fge)
if (branchOnFalse) { JNAE(targ); } else { JAE(targ); }
}
NIns *at = _nIns;
Reservation *rA, *rB;
findRegFor2(XmmRegs, lhs, rA, rhs, rB);
SSE_UCOMISD(rA->reg, rB->reg);
return at;
}
if (branchOnFalse)
JP(targ);
else
JNP(targ);
NIns *at = _nIns;
asm_fcmp(cond);
return at;
}
void Assembler::asm_setcc(Register r, LIns *cond)
{
LOpcode c = cond->opcode();
if (avmplus::AvmCore::use_sse2() && c != LIR_feq) {
MOVZX8(r,r);
LIns *lhs = cond->oprnd1();
LIns *rhs = cond->oprnd2();
if (c == LIR_flt) {
LIns *t = lhs; lhs = rhs; rhs = t;
SETA(r);
}
else if (c == LIR_fle) {
LIns *t = lhs; lhs = rhs; rhs = t;
SETAE(r);
}
else if (c == LIR_fgt) {
SETA(r);
}
else { // if (c == LIR_fge)
SETAE(r);
}
Reservation *rA, *rB;
findRegFor2(XmmRegs, lhs, rA, rhs, rB);
SSE_UCOMISD(rA->reg, rB->reg);
return;
}
// SETcc only sets low 8 bits, so extend
MOVZX8(r,r);
SETNP(r);
asm_fcmp(cond);
}
void Assembler::asm_fcmp(LIns *cond)
{
LOpcode condop = cond->opcode();
@ -1206,10 +1436,12 @@ namespace nanojit
mask = 0x05;
else if (condop == LIR_fge) {
// swap, use le
condop = LIR_fle;
LIns* t = lhs; lhs = rhs; rhs = t;
mask = 0x41;
} else { // if (condop == LIR_fgt)
// swap, use lt
condop = LIR_flt;
LIns* t = lhs; lhs = rhs; rhs = t;
mask = 0x05;
}
@ -1227,7 +1459,8 @@ namespace nanojit
// nan check
Register r = findRegFor(lhs, XmmRegs);
SSE_UCOMISD(r, r);
} else {
}
else {
#if defined NANOJIT_IA32
evict(EAX);
TEST_AH(mask);
@ -1384,5 +1617,19 @@ namespace nanojit
if (!_nIns) _nIns = pageAlloc();
if (!_nExitIns) _nExitIns = pageAlloc(true);
}
// enough room for n bytes
void Assembler::underrunProtect(int n)
{
NIns *eip = this->_nIns;
Page *p = (Page*)pageTop(eip-1);
NIns *top = (NIns*) &p->code[0];
if (eip - n < top) {
_nIns = pageAlloc(_inExit);
JMP(eip);
}
}
#endif /* FEATURE_NANOJIT */
}
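The reworked prologue accounts for just the return address and the saved EBP, forces SP down to a 16-byte boundary so SSE loads and stores to stack slots are safe, and then reserves the activation area. A worked example of the size arithmetic, with assumed values STACK_GRANULARITY = 4, NJ_ALIGN_STACK = 16 and 21 spill slots:

#include <cstdint>
#include <cstdio>

static uint32_t alignUp(uint32_t v, uint32_t align) {   // round up to a multiple of align
    return (v + align - 1) & ~(align - 1);
}

int main() {
    const uint32_t STACK_GRANULARITY = 4, NJ_ALIGN_STACK = 16;
    uint32_t highwatermark = 21;                              // assumed activation size
    uint32_t stackNeeded = STACK_GRANULARITY * highwatermark; // 84 bytes of spills
    uint32_t stackPushed = STACK_GRANULARITY                  // return address
                         + STACK_GRANULARITY;                 // saved EBP
    uint32_t aligned = alignUp(stackNeeded + stackPushed, NJ_ALIGN_STACK);
    printf("SUBi(SP, %u)\n", aligned - stackPushed);          // 96 - 8 = 88
    return 0;
}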

View file

@ -101,6 +101,7 @@ namespace nanojit
typedef int RegisterMask;
static const int NumSavedRegs = 3;
static const RegisterMask SavedRegs = 1<<EBX | 1<<EDI | 1<<ESI;
static const RegisterMask GpRegs = SavedRegs | 1<<EAX | 1<<ECX | 1<<EDX;
static const RegisterMask XmmRegs = 1<<XMM0|1<<XMM1|1<<XMM2|1<<XMM3|1<<XMM4|1<<XMM5|1<<XMM6|1<<XMM7;
@ -132,23 +133,12 @@ namespace nanojit
bool pad[3];\
void nativePageReset();\
void nativePageSetup();\
void asm_farg(LInsp);
void underrunProtect(int);\
void asm_farg(LInsp);\
void asm_align_code();
#define swapptrs() { NIns* _tins = _nIns; _nIns=_nExitIns; _nExitIns=_tins; }
// enough room for n bytes
#define underrunProtect(n) \
{ \
intptr_t u = n + sizeof(PageHeader)/sizeof(NIns) + 5; \
if ( !samepage(_nIns-u,_nIns-1) ) \
{ \
NIns *tt = _nIns; \
_nIns = pageAlloc(_inExit); \
int d = tt-_nIns; \
JMP_long_nochk_offset(d); \
} \
} \
#define IMM32(i) \
_nIns -= 4; \
*((int32_t*)_nIns) = (int32_t)(i)
@ -171,8 +161,11 @@ namespace nanojit
}
#define MODRMm(r,d,b) \
NanoAssert(unsigned(r)<8 && unsigned(b)<8); \
if ((b) == ESP) { \
NanoAssert(unsigned(r)<8 && ((b)==UnknownReg || unsigned(b)<8)); \
if ((b) == UnknownReg) {\
IMM32(d);\
*(--_nIns) = (uint8_t) (0<<6 | (r)<<3 | 5);\
} else if ((b) == ESP) { \
MODRMs(r, d, b, 0, (Register)4); \
} \
else if ( (d) == 0 && (b) != EBP) { \
@ -344,7 +337,7 @@ namespace nanojit
#define ST(base,disp,reg) do { \
ALUm(0x89,reg,disp,base); \
asm_output3("mov %d(%s),%s",disp,gpn(base),gpn(reg)); } while(0)
asm_output3("mov %d(%s),%s",disp,base==UnknownReg?"0":gpn(base),gpn(reg)); } while(0)
#define STi(base,disp,imm) do { \
underrunProtect(12); \
@ -497,7 +490,7 @@ namespace nanojit
*(--_nIns) = 0x10;\
*(--_nIns) = 0x0f;\
*(--_nIns) = 0xf2;\
asm_output3("movsd %s,%p // =%f",gpn(r),daddr,*daddr); \
asm_output3("movsd %s,(#%p) // =%f",gpn(r),(void*)daddr,*daddr); \
} while(0)
#define STSD(d,b,r)do { \
@ -539,61 +532,70 @@ namespace nanojit
} while(0)
#define SSE_MOVSD(rd,rs) do{ \
NanoAssert(_is_xmm_reg_(rd) && _is_xmm_reg_(rs));\
SSE(0xf20f10, (rd)&7, (rs)&7); \
asm_output2("movsd %s,%s",gpn(rd),gpn(rs)); \
} while(0)
#define SSE_MOVDm(d,b,xrs) do {\
NanoAssert(_is_xmm_reg_(xrs) && _is_gp_reg_(b));\
SSEm(0x660f7e, (xrs)&7, d, b);\
asm_output3("movd %d(%s),%s", d, gpn(b), gpn(xrs));\
} while(0)
#define SSE_ADDSD(rd,rs) do{ \
NanoAssert(_is_xmm_reg_(rd) && _is_xmm_reg_(rs));\
SSE(0xf20f58, (rd)&7, (rs)&7); \
asm_output2("addsd %s,%s",gpn(rd),gpn(rs)); \
} while(0)
#define SSE_ADDSDm(r,addr)do { \
underrunProtect(8); \
NanoAssert(_is_xmm_reg_(r));\
const double* daddr = addr; \
IMM32(int32_t(daddr));\
*(--_nIns) = uint8_t(((r)&7)<<3|5); \
*(--_nIns) = 0x58;\
*(--_nIns) = 0x0f;\
*(--_nIns) = 0xf2;\
asm_output3("addsd %s,%p // =%f",gpn(r),daddr,*daddr); \
asm_output3("addsd %s,%p // =%f",gpn(r),(void*)daddr,*daddr); \
} while(0)
#define SSE_SUBSD(rd,rs) do{ \
NanoAssert(_is_xmm_reg_(rd) && _is_xmm_reg_(rs));\
SSE(0xf20f5c, (rd)&7, (rs)&7); \
asm_output2("subsd %s,%s",gpn(rd),gpn(rs)); \
} while(0)
#define SSE_MULSD(rd,rs) do{ \
NanoAssert(_is_xmm_reg_(rd) && _is_xmm_reg_(rs));\
SSE(0xf20f59, (rd)&7, (rs)&7); \
asm_output2("mulsd %s,%s",gpn(rd),gpn(rs)); \
} while(0)
#define SSE_DIVSD(rd,rs) do{ \
NanoAssert(_is_xmm_reg_(rd) && _is_xmm_reg_(rs));\
SSE(0xf20f5e, (rd)&7, (rs)&7); \
asm_output2("divsd %s,%s",gpn(rd),gpn(rs)); \
} while(0)
#define SSE_UCOMISD(rl,rr) do{ \
NanoAssert(_is_xmm_reg_(rl) && _is_xmm_reg_(rr));\
SSE(0x660f2e, (rl)&7, (rr)&7); \
asm_output2("ucomisd %s,%s",gpn(rl),gpn(rr)); \
} while(0)
#define CVTSI2SDm(xr,d,b) do{ \
NanoAssert(_is_xmm_reg_(xr) && _is_gp_reg_(b));\
SSEm(0xf20f2a, (xr)&7, (d), (b)); \
asm_output3("cvtsi2sd %s,%d(%s)",gpn(xr),(d),gpn(b)); \
} while(0)
#define SSE_XORPD(r, maskaddr) do {\
underrunProtect(8); \
underrunProtect(8); \
IMM32(maskaddr);\
*(--_nIns) = uint8_t(((r)&7)<<3|5); \
*(--_nIns) = 0x57;\
*(--_nIns) = 0x0f;\
*(--_nIns) = 0x66;\
asm_output2("xorpd %s,[0x%p]",gpn(r),(maskaddr));\
asm_output2("xorpd %s,[0x%p]",gpn(r),(void*)(maskaddr));\
} while(0)
#define SSE_XORPDr(rd,rs) do{ \
@ -657,6 +659,7 @@ namespace nanojit
#define FLDr(r) do { FPU(0xd9c0,r); asm_output1("fld %s",fpn(r)); fpu_push(); } while(0)
#define EMMS() do { FPUc(0x0f77); asm_output("emms"); } while (0)
// standard direct call
#define CALL(c) do { \
underrunProtect(5); \
int offset = (c->_address) - ((int)_nIns); \
@ -666,5 +669,14 @@ namespace nanojit
debug_only(if ((c->_argtypes&3)==ARGSIZE_F) fpu_push();)\
} while (0)
// indirect call thru register
#define CALLr(c,r) do { \
underrunProtect(2);\
ALU(0xff, 2, (r));\
verbose_only(asm_output1("call %s",gpn(r));) \
debug_only(if ((c->_argtypes&3)==ARGSIZE_F) fpu_push();)\
} while (0)
}
#endif // __nanojit_Nativei386__
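The MODRMm change adds an UnknownReg case so a store or load can address a constant absolute location: in 32-bit x86, mod = 00 with r/m = 101 encodes a bare [disp32] operand. The byte arithmetic used in that branch:

#include <cstdint>
#include <cstdio>

int main() {
    unsigned r = 2;                                   // e.g. EDX in the reg field
    uint8_t modrm = uint8_t(0 << 6 | r << 3 | 5);     // mod=00, reg=r, r/m=101 -> [disp32]
    printf("modrm byte: 0x%02x\n", modrm);            // 0x15; a 4-byte displacement follows
    return 0;
}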

View file

@ -72,11 +72,18 @@ namespace nanojit
void RegAlloc::addActive(Register r, LIns* v)
{
//addActiveCount++;
// Count++;
NanoAssert(v && r != UnknownReg && active[r] == NULL );
active[r] = v;
useActive(r);
}
void RegAlloc::useActive(Register r)
{
NanoAssert(r != UnknownReg && active[r] != NULL);
usepri[r] = priority++;
}
void RegAlloc::removeActive(Register r)
{
//registerReleaseCount++;
@ -87,12 +94,6 @@ namespace nanojit
active[r] = NULL;
}
LIns* RegAlloc::getActive(Register r)
{
NanoAssert(r != UnknownReg);
return active[r];
}
void RegAlloc::retire(Register r)
{
NanoAssert(r != UnknownReg);
@ -101,30 +102,26 @@ namespace nanojit
free |= rmask(r);
}
// scan table for instruction with longest span
LIns* Assembler::findVictim(RegAlloc &regs, RegisterMask allow, RegisterMask prefer)
// scan table for instruction with the lowest priority, meaning it is used
// furthest in the future.
LIns* Assembler::findVictim(RegAlloc &regs, RegisterMask allow)
{
NanoAssert(allow != 0 && (allow&prefer)==prefer);
LIns *i, *a=0, *p = 0;
int acost=10, pcost=10;
NanoAssert(allow != 0);
LIns *i, *a=0;
int allow_pri = 0x7fffffff;
for (Register r=FirstReg; r <= LastReg; r = nextreg(r))
{
if ((allow & rmask(r)) && (i = regs.getActive(r)) != 0)
{
int cost = getresv(i)->cost;
if (!a || cost < acost || cost == acost && nbr(i) < nbr(a)) {
int pri = canRemat(i) ? 0 : regs.getPriority(r);
if (!a || pri < allow_pri) {
a = i;
acost = cost;
}
if (prefer & rmask(r)) {
if (!p || cost < pcost || cost == pcost && nbr(i) < nbr(p)) {
p = i;
pcost = cost;
}
allow_pri = pri;
}
}
}
return acost < pcost ? a : p;
NanoAssert(a != 0);
return a;
}
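findVictim no longer consults the per-reservation cost byte (which is gone from Reservation); instead each use of a register bumps a monotonically increasing priority, and the active register with the lowest priority is chosen, i.e. the one whose value is next needed furthest away in execution order, since code is generated back to front (values that canRemat() can rebuild for free get priority 0). A toy model of the selection:

#include <cstdint>
#include <cstdio>

int main() {
    const int NREGS = 4;
    int32_t usepri[NREGS] = {0, 0, 0, 0};   // cf. RegAlloc::usepri
    int32_t priority = 1;                   // cf. RegAlloc::priority

    // Simulated uses while assembling backwards (last executed instruction first).
    int uses[] = {3, 0, 2, 1, 2};
    for (int i = 0; i < 5; i++)
        usepri[uses[i]] = priority++;

    int victim = 0;
    for (int r = 1; r < NREGS; r++)         // cf. Assembler::findVictim
        if (usepri[r] < usepri[victim])
            victim = r;

    // r3 was touched earliest during backward assembly, so its next use in
    // execution order is the furthest away: the cheapest value to evict.
    printf("spill r%d\n", victim);
    return 0;
}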
#ifdef NJ_VERBOSE

View file

@ -51,15 +51,28 @@ namespace nanojit
class RegAlloc MMGC_SUBCLASS_DECL
{
public:
RegAlloc() {}
RegAlloc() : free(0), used(0), priority(0) {}
void clear();
bool isFree(Register r);
void addFree(Register r);
void removeFree(Register r);
void addActive(Register r, LIns* ins);
void useActive(Register r);
void removeActive(Register r);
LIns* getActive(Register r);
void retire(Register r);
bool isValid() {
return (free|used) != 0;
}
int32_t getPriority(Register r) {
NanoAssert(r != UnknownReg && active[r]);
return usepri[r];
}
LIns* getActive(Register r) {
NanoAssert(r != UnknownReg);
return active[r];
}
debug_only( uint32_t countFree(); )
debug_only( uint32_t countActive(); )
@ -68,11 +81,11 @@ namespace nanojit
debug_only( uint32_t count; )
debug_only( RegisterMask managed; ) // bitfield of 0..NJ_MAX_REGISTERS denoting which are under our management
// RegisterMask is a 32-bit value, so we can never have more than 32 active.
// hardcode 32 here in case we have non-contiguous register numbers
LIns* active[32]; // active[r] = OP that defines r
LIns* active[LastReg + 1]; // active[r] = OP that defines r
int32_t usepri[LastReg + 1]; // used priority. lower = more likely to spill.
RegisterMask free;
RegisterMask used;
int32_t priority;
verbose_only( static void formatRegisters(RegAlloc& regs, char* s, Fragment*); )

View file

@ -326,6 +326,8 @@ public:
#define DWB(x) x
#define DRCWB(x) x
#define WB(gc, container, addr, value) do { *(addr) = (value); } while(0)
#define WBRC(gc, container, addr, value) do { *(addr) = (value); } while(0)
#define MMGC_MEM_TYPE(x)

View file

@ -42,6 +42,8 @@
#include <stddef.h>
#include "avmplus.h"
#ifdef FEATURE_NANOJIT
#ifdef AVMPLUS_IA32
#define NANOJIT_IA32
#elif AVMPLUS_ARM
@ -73,6 +75,8 @@ namespace nanojit
typedef avmplus::List<LIns*,avmplus::LIST_NonGCObjects> InsList;
typedef avmplus::List<char*, avmplus::LIST_GCObjects> StringList;
const uint32_t MAXARGS = 8;
#if defined(_MSC_VER) && _MSC_VER < 1400
static void NanoAssertMsgf(bool a,const char *f,...) {}
static void NanoAssertMsg(bool a,const char *m) {}
@ -113,12 +117,12 @@ namespace nanojit
#define verbose_output if (verbose_enabled()) Assembler::output
#define verbose_outputf if (verbose_enabled()) Assembler::outputf
#define verbose_enabled() (_verbose)
#define verbose_only(x) x
#define verbose_only(...) __VA_ARGS__
#else
#define verbose_output
#define verbose_outputf
#define verbose_enabled()
#define verbose_only(x)
#define verbose_only(...)
#endif /*NJ_VERBOSE*/
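One plausible reason for switching verbose_only from a single parameter to __VA_ARGS__ (an assumption; the diff does not state the motivation) is that an argument containing a top-level comma is split into several macro arguments by a one-parameter macro, while a variadic macro forwards it intact. A minimal illustration, with my_verbose_only and trace_hits as made-up names:

    // new style: forwards commas intact
    #define my_verbose_only(...) __VA_ARGS__

    int main()
    {
        my_verbose_only(int trace_hits[2] = {0, 0};)   // expands to the declaration
        // With the old "#define my_verbose_only(x) x", the line above fails to
        // preprocess: the comma inside {0, 0} makes the call look like two
        // arguments, because braces (unlike parentheses) do not group commas.
        return trace_hits[0];
    }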
#ifdef _DEBUG
@ -172,4 +176,5 @@ namespace nanojit
#include "Assembler.h"
#include "TraceTreeDrawer.h"
#endif // FEATURE_NANOJIT
#endif // __nanojit_h__

View file

@ -1,224 +0,0 @@
/*
* A JavaScript implementation of the Secure Hash Algorithm, SHA-1, as defined
* in FIPS PUB 180-1
* Version 2.1a Copyright Paul Johnston 2000 - 2002.
* Other contributors: Greg Holt, Andrew Kepert, Ydnar, Lostinet
* Distributed under the BSD License
* See http://pajhome.org.uk/crypt/md5 for details.
*/
/*
* Configurable variables. You may need to tweak these to be compatible with
* the server-side, but the defaults work in most cases.
*/
var hexcase = 0; /* hex output format. 0 - lowercase; 1 - uppercase */
var b64pad = ""; /* base-64 pad character. "=" for strict RFC compliance */
var chrsz = 8; /* bits per input character. 8 - ASCII; 16 - Unicode */
/*
* These are the functions you'll usually want to call
* They take string arguments and return either hex or base-64 encoded strings
*/
function hex_sha1(s){return binb2hex(core_sha1(str2binb(s),s.length * chrsz));}
function b64_sha1(s){return binb2b64(core_sha1(str2binb(s),s.length * chrsz));}
function str_sha1(s){return binb2str(core_sha1(str2binb(s),s.length * chrsz));}
function hex_hmac_sha1(key, data){ return binb2hex(core_hmac_sha1(key, data));}
function b64_hmac_sha1(key, data){ return binb2b64(core_hmac_sha1(key, data));}
function str_hmac_sha1(key, data){ return binb2str(core_hmac_sha1(key, data));}
/*
* Perform a simple self-test to see if the VM is working
*/
function sha1_vm_test()
{
return hex_sha1("abc") == "a9993e364706816aba3e25717850c26c9cd0d89d";
}
/*
* Calculate the SHA-1 of an array of big-endian words, and a bit length
*/
function core_sha1(x, len)
{
/* append padding */
x[len >> 5] |= 0x80 << (24 - len % 32);
x[((len + 64 >> 9) << 4) + 15] = len;
var w = Array(80);
var a = 1732584193;
var b = -271733879;
var c = -1732584194;
var d = 271733878;
var e = -1009589776;
for(var i = 0; i < x.length; i += 16)
{
var olda = a;
var oldb = b;
var oldc = c;
var oldd = d;
var olde = e;
for(var j = 0; j < 80; j++)
{
if(j < 16) w[j] = x[i + j];
else w[j] = rol(w[j-3] ^ w[j-8] ^ w[j-14] ^ w[j-16], 1);
var t = safe_add(safe_add(rol(a, 5), sha1_ft(j, b, c, d)),
safe_add(safe_add(e, w[j]), sha1_kt(j)));
e = d;
d = c;
c = rol(b, 30);
b = a;
a = t;
}
a = safe_add(a, olda);
b = safe_add(b, oldb);
c = safe_add(c, oldc);
d = safe_add(d, oldd);
e = safe_add(e, olde);
}
return Array(a, b, c, d, e);
}
/*
* Perform the appropriate triplet combination function for the current
* iteration
*/
function sha1_ft(t, b, c, d)
{
if(t < 20) return (b & c) | ((~b) & d);
if(t < 40) return b ^ c ^ d;
if(t < 60) return (b & c) | (b & d) | (c & d);
return b ^ c ^ d;
}
/*
* Determine the appropriate additive constant for the current iteration
*/
function sha1_kt(t)
{
return (t < 20) ? 1518500249 : (t < 40) ? 1859775393 :
(t < 60) ? -1894007588 : -899497514;
}
/*
* Calculate the HMAC-SHA1 of a key and some data
*/
function core_hmac_sha1(key, data)
{
var bkey = str2binb(key);
if(bkey.length > 16) bkey = core_sha1(bkey, key.length * chrsz);
var ipad = Array(16), opad = Array(16);
for(var i = 0; i < 16; i++)
{
ipad[i] = bkey[i] ^ 0x36363636;
opad[i] = bkey[i] ^ 0x5C5C5C5C;
}
var hash = core_sha1(ipad.concat(str2binb(data)), 512 + data.length * chrsz);
return core_sha1(opad.concat(hash), 512 + 160);
}
/*
* Add integers, wrapping at 2^32. This uses 16-bit operations internally
* to work around bugs in some JS interpreters.
*/
function safe_add(x, y)
{
var lsw = (x & 0xFFFF) + (y & 0xFFFF);
var msw = (x >> 16) + (y >> 16) + (lsw >> 16);
return (msw << 16) | (lsw & 0xFFFF);
}
/*
* Bitwise rotate a 32-bit number to the left.
*/
function rol(num, cnt)
{
return (num << cnt) | (num >>> (32 - cnt));
}
/*
* Convert an 8-bit or 16-bit string to an array of big-endian words
* In 8-bit function, characters >255 have their hi-byte silently ignored.
*/
function str2binb(str)
{
var bin = Array();
var mask = (1 << chrsz) - 1;
for(var i = 0; i < str.length * chrsz; i += chrsz)
bin[i>>5] |= (str.charCodeAt(i / chrsz) & mask) << (32 - chrsz - i%32);
return bin;
}
/*
* Convert an array of big-endian words to a string
*/
function binb2str(bin)
{
var str = "";
var mask = (1 << chrsz) - 1;
for(var i = 0; i < bin.length * 32; i += chrsz)
str += String.fromCharCode((bin[i>>5] >>> (32 - chrsz - i%32)) & mask);
return str;
}
/*
* Convert an array of big-endian words to a hex string.
*/
function binb2hex(binarray)
{
var hex_tab = hexcase ? "0123456789ABCDEF" : "0123456789abcdef";
var str = "";
for(var i = 0; i < binarray.length * 4; i++)
{
str += hex_tab.charAt((binarray[i>>2] >> ((3 - i%4)*8+4)) & 0xF) +
hex_tab.charAt((binarray[i>>2] >> ((3 - i%4)*8 )) & 0xF);
}
return str;
}
/*
* Convert an array of big-endian words to a base-64 string
*/
function binb2b64(binarray)
{
var tab = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
var str = "";
for(var i = 0; i < binarray.length * 4; i += 3)
{
var triplet = (((binarray[i >> 2] >> 8 * (3 - i %4)) & 0xFF) << 16)
| (((binarray[i+1 >> 2] >> 8 * (3 - (i+1)%4)) & 0xFF) << 8 )
| ((binarray[i+2 >> 2] >> 8 * (3 - (i+2)%4)) & 0xFF);
for(var j = 0; j < 4; j++)
{
if(i * 8 + j * 6 > binarray.length * 32) str += b64pad;
else str += tab.charAt((triplet >> 6*(3-j)) & 0x3F);
}
}
return str;
}
var plainText = "Two households, both alike in dignity,\n\
In fair Verona, where we lay our scene,\n\
From ancient grudge break to new mutiny,\n\
Where civil blood makes civil hands unclean.\n\
From forth the fatal loins of these two foes\n\
A pair of star-cross'd lovers take their life;\n\
Whole misadventured piteous overthrows\n\
Do with their death bury their parents' strife.\n\
The fearful passage of their death-mark'd love,\n\
And the continuance of their parents' rage,\n\
Which, but their children's end, nought could remove,\n\
Is now the two hours' traffic of our stage;\n\
The which if you with patient ears attend,\n\
What here shall miss, our toil shall strive to mend.";
for (var i = 0; i <4; i++) {
plainText += plainText;
}
var sha1Output = hex_sha1(plainText);