From 228379824a47f0c20a44c919a06d78d279514daf Mon Sep 17 00:00:00 2001
From: Jacob Bramley <Jacob.Bramley@arm.com>
Date: Tue, 27 Apr 2010 08:31:36 +0100
Subject: [PATCH] Optimize DoubleToECMAInt32 for ARM. [Bug 551837][r=vlad]

---
 js/src/jsnum.h | 121 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 121 insertions(+)

diff --git a/js/src/jsnum.h b/js/src/jsnum.h
index 1a0580679ef..d782ca8f587 100644
--- a/js/src/jsnum.h
+++ b/js/src/jsnum.h
@@ -442,6 +442,127 @@ js_DoubleToECMAInt32(jsdouble d)
     }
 
     return int32(du.d);
+#elif defined (__arm__) && defined (__GNUC__)
+    int32_t i;
+    uint32_t    tmp0;
+    uint32_t    tmp1;
+    uint32_t    tmp2;
+    asm (
+    // We use a pure integer solution here. In the 'softfp' ABI, the argument
+    // will start in r0 and r1, and VFP can't do all of the necessary ECMA
+    // conversions by itself so some integer code will be required anyway. A
+    // hybrid solution is faster on A9, but this pure integer solution is
+    // notably faster for A8.
+
+    // %0 is the result register; it is only written by the final instructions.
+    // %Q4 holds the lower part of the mantissa.
+    // %R4 holds the sign, exponent, and the upper part of the mantissa.
+    // %1, %2 and %3 are used as temporary values.
+
+    // Extract the exponent.
+"   mov     %1, %R4, LSR #20\n"
+"   bic     %1, %1, #(1 << 11)\n"  // Clear the sign.
+
+    // Set the implicit top bit of the mantissa. This clobbers a bit of the
+    // exponent, but we have already extracted that.
+"   orr     %R4, %R4, #(1 << 20)\n"
+
+    // Special Cases
+    //   We should return zero in the following special cases:
+    //    - Exponent is 0x000 - 1023: +/-0 or subnormal.
+    //    - Exponent is 0x7ff - 1023: +/-INFINITY or NaN
+    //      - This case is implicitly handled by the standard code path anyway,
+    //        as shifting the mantissa up by the exponent will result in '0'.
+    //
+    // The result is composed of the mantissa, prepended with '1' and
+    // bit-shifted left by the (decoded) exponent. Note that because the r1[20]
+    // is the bit with value '1', r1 is effectively already shifted (left) by
+    // 20 bits, and r0 is already shifted by 52 bits.
+    
+    // Adjust the exponent to remove the encoding offset. If the decoded
+    // exponent is negative, quickly bail out with '0' as such values round to
+    // zero anyway. This also catches +/-0 and subnormals.
+"   sub     %1, %1, #0xff\n"
+"   subs    %1, %1, #0x300\n"
+"   bmi     8f\n"
+
+    //  %1 = (decoded) exponent >= 0
+    //  %R4 = upper mantissa and sign
+
+    // ---- Lower Mantissa ----
+"   subs    %3, %1, #52\n"         // Calculate exp-52
+"   bmi     1f\n"
+
+    // Shift r0 left by exp-52.
+    // Ensure that we don't overflow ARM's 8-bit shift operand range.
+    // We need to handle anything up to an 11-bit value here as we know that
+    // 52 <= exp <= 1024 (0x400). Any shift beyond 31 bits results in zero
+    // anyway, so as long as we don't touch the bottom 5 bits, we can use
+    // a logical OR to push long shifts into the 32 <= (exp&0xff) <= 255 range.
+"   bic     %2, %3, #0xff\n"
+"   orr     %3, %3, %2, LSR #3\n"
+    // We can now perform a straight shift, avoiding the need for any
+    // conditional instructions or extra branches.
+"   mov     %Q4, %Q4, LSL %3\n"
+"   b       2f\n"
+"1:\n" // Shift r0 right by 52-exp.
+    // We know that 0 <= exp < 52, and we can shift up to 255 bits so 52-exp
+    // will always be a valid shift and we can skip the range check for this case.
+"   rsb     %3, %1, #52\n"
+"   mov     %Q4, %Q4, LSR %3\n"
+
+    //  %1 = (decoded) exponent
+    //  %R4 = upper mantissa and sign
+    //  %Q4 = partially-converted integer
+
+"2:\n"
+    // ---- Upper Mantissa ----
+    // This is much the same as the lower mantissa, with a few different
+    // boundary checks and some masking to hide the exponent & sign bit in the
+    // upper word.
+    // Note that the upper mantissa is pre-shifted by 20 in %R4, but we shift
+    // it left more to remove the sign and exponent so it is effectively
+    // pre-shifted by 31 bits.
+"   subs    %3, %1, #31\n"          // Calculate exp-31
+"   mov     %1, %R4, LSL #11\n"     // Re-use %1 as a temporary register.
+"   bmi     3f\n"
+
+    // Shift %R4 left by exp-31.
+    // Avoid overflowing the 8-bit shift range, as before.
+"   bic     %2, %3, #0xff\n"
+"   orr     %3, %3, %2, LSR #3\n"
+    // Perform the shift.
+"   mov     %2, %1, LSL %3\n"
+"   b       4f\n"
+"3:\n" // Shift r1 right by 31-exp.
+    // We know that 0 <= exp < 31, and we can shift up to 255 bits so 31-exp
+    // will always be a valid shift and we can skip the range check for this case.
+"   rsb     %3, %3, #0\n"          // Calculate 31-exp from -(exp-31)
+"   mov     %2, %1, LSR %3\n"      // Thumb-2 can't do "LSR %3" in "orr".
+
+    //  %Q4 = partially-converted integer (lower)
+    //  %R4 = upper mantissa and sign
+    //  %2 = partially-converted integer (upper)
+
+"4:\n"
+    // Combine the converted parts.
+"   orr     %Q4, %Q4, %2\n"
+    // Negate the result if we have to, and move it to %0 in the process. To
+    // avoid conditionals, we can do this by inverting on %R4[31], then adding
+    // %R4[31]>>31.
+"   eor     %Q4, %Q4, %R4, ASR #31\n"
+"   add     %0, %Q4, %R4, LSR #31\n"
+"   b       9f\n"
+"8:\n"
+    // +/-INFINITY, +/-0, subnormals, NaNs, and anything else out-of-range that
+    // will result in a conversion of '0'.
+"   mov     %0, #0\n"
+"9:\n"
+    : "=r" (i), "=&r" (tmp0), "=&r" (tmp1), "=&r" (tmp2), "+r" (d)
+    : // No input-only operands: 'd' (%4) is read and modified in place.
+    : "cc"
+        );
+    return i;
 #else
     int32 i;
     jsdouble two32, two31;