diff --git a/intl/uconv/src/nsUTF8ToUnicode.cpp b/intl/uconv/src/nsUTF8ToUnicode.cpp index 6563dbdc0e2a..f15f45b2e4ce 100644 --- a/intl/uconv/src/nsUTF8ToUnicode.cpp +++ b/intl/uconv/src/nsUTF8ToUnicode.cpp @@ -38,6 +38,25 @@ #include "nsUCSupport.h" #include "nsUTF8ToUnicode.h" +#if defined(__GNUC__) && defined(__i386__) && defined(__APPLE__) +#define MAC_SSE2 +#endif +#if defined(XP_WIN32) && defined(_M_IX86) && !defined(__GNUC__) +#define WIN_SSE2 +#endif + +#if defined(MAC_SSE2) || defined(WIN_SSE2) +#include "emmintrin.h" +#endif + +#if defined(MAC_SSE2) +#define __sse2_available 1 +#endif + +#if defined(WIN_SSE2) +extern "C" int __sse2_available; +#endif + #define UNICODE_BYTE_ORDER_MARK 0xFEFF NS_IMETHODIMP NS_NewUTF8ToUnicode(nsISupports* aOuter, @@ -119,6 +138,167 @@ NS_IMETHODIMP nsUTF8ToUnicode::Reset() //---------------------------------------------------------------------- // Subclassing of nsBasicDecoderSupport class [implementation] +// Fast ASCII -> UTF16 inner loop implementations +// +// Convert_ascii_run will update src and dst to the new values, and +// len must be the maximum number of ASCII chars that it would be valid +// to take from src and place into dst. (That is, the minimum of the +// number of bytes left in src and the number of unichars available in +// dst.) 
+
+#if defined(MAC_SSE2) || defined(WIN_SSE2)
+// SSE2 variant: converts 16 ASCII bytes per pass when len > 15 and __sse2_available is set
+static inline void
+Convert_ascii_run (const char *&src,
+                   PRUnichar *&dst,
+                   PRInt32 len)
+{
+  if (len > 15 && __sse2_available) {
+    __m128i in, out1, out2;
+    __m128d *outp1, *outp2;  // alias out1/out2 so _mm_storeh_pd can store their high halves
+    __m128i zeroes;
+    PRUint32 offset;         // dst address modulo 16; selects the store strategy below
+
+    // align input to 16 bytes for the aligned loads; stop at the first non-ASCII byte
+    while ((NS_PTR_TO_UINT32(src) & 15) && len > 0) {
+      if (*src & 0x80U)
+        return;
+      *dst++ = (PRUnichar) *src++;
+      len--;
+    }
+
+    zeroes = _mm_setzero_si128();
+
+    offset = NS_PTR_TO_UINT32(dst) & 15;
+
+    // Note: all these inner loops have to break, not return; we need
+    // to let the single-char loop below catch any leftover
+    // byte-at-a-time ASCII chars, since this function must consume
+    // all available ASCII chars before it returns
+
+    if (offset == 0) {  // dst is 16-byte aligned: aligned non-temporal stores
+      while (len > 15) {
+        in = _mm_load_si128((__m128i *) src);
+        if (_mm_movemask_epi8(in))  // some byte has its high bit set: not ASCII
+          break;
+        out1 = _mm_unpacklo_epi8(in, zeroes);  // low 8 bytes widened to 8 unichars
+        out2 = _mm_unpackhi_epi8(in, zeroes);  // high 8 bytes widened to 8 unichars
+        _mm_stream_si128((__m128i *) dst, out1);
+        _mm_stream_si128((__m128i *) (dst + 8), out2);
+        dst += 16;
+        src += 16;
+        len -= 16;
+      }
+    } else if (offset == 8) {  // dst is only 8-byte aligned: write four 64-bit halves
+      outp1 = (__m128d *) &out1;
+      outp2 = (__m128d *) &out2;
+      while (len > 15) {
+        in = _mm_load_si128((__m128i *) src);
+        if (_mm_movemask_epi8(in))
+          break;
+        out1 = _mm_unpacklo_epi8(in, zeroes);
+        out2 = _mm_unpackhi_epi8(in, zeroes);
+        _mm_storel_epi64((__m128i *) dst, out1);         // unichars 0..3
+        _mm_storel_epi64((__m128i *) (dst + 8), out2);   // unichars 8..11
+        _mm_storeh_pd((double *) (dst + 4), *outp1);     // unichars 4..7
+        _mm_storeh_pd((double *) (dst + 12), *outp2);    // unichars 12..15
+        src += 16;
+        dst += 16;
+        len -= 16;
+      }
+    } else {  // any other dst alignment: unaligned 128-bit stores
+      while (len > 15) {
+        in = _mm_load_si128((__m128i *) src);
+        if (_mm_movemask_epi8(in))
+          break;
+        out1 = _mm_unpacklo_epi8(in, zeroes);
+        out2 = _mm_unpackhi_epi8(in, zeroes);
+        _mm_storeu_si128((__m128i *) dst, out1);
+        _mm_storeu_si128((__m128i *) (dst + 8), out2);
+        src += 16;
+        dst += 16;
+        len -= 16;
+      }
+    }
+  }
+
+  // finish off a byte at a time, until len runs out or a non-ASCII byte is reached
+
+  while (len-- > 0 && (*src & 0x80U) == 0) {
+    *dst++ = (PRUnichar) *src++;
+  }
+}
+
+#elif defined(__arm__) || defined(_M_ARM)
+
+// on ARM, do extra work to avoid byte/halfword reads/writes by
+// reading/writing a word at a time for as long as we can
+static inline void
+Convert_ascii_run (const char *&src,
+                   PRUnichar *&dst,
+                   PRInt32 len)
+{
+  const PRUint32 *src32;
+  PRUint32 *dst32;
+
+  // Each char copied moves src by 1 byte and dst by 2, so src and dst can only become
+  // word-aligned together from the starting alignments tested here; otherwise skip to finish
+  if ((((NS_PTR_TO_UINT32(dst) & 3) == 0) && ((NS_PTR_TO_UINT32(src) & 1) == 0)) ||
+      (((NS_PTR_TO_UINT32(dst) & 3) == 2) && ((NS_PTR_TO_UINT32(src) & 1) == 1)))
+  {
+    while (((NS_PTR_TO_UINT32(src) & 3) ||
+            (NS_PTR_TO_UINT32(dst) & 3)) &&
+           len > 0)
+    {
+      if (*src & 0x80U)
+        return;
+      *dst++ = (PRUnichar) *src++;
+      len--;
+    }
+  } else {
+    goto finish;
+  }
+
+  // then go 4 bytes at a time
+  src32 = (const PRUint32*) src;
+  dst32 = (PRUint32*) dst;
+  // one pass consumes exactly 4 chars, so it may run whenever at least 4 remain
+  while (len > 3) {
+    PRUint32 in = *src32++;
+    // a high bit in any of the 4 bytes ends the run; step back so the byte loop rescans this word
+    if (in & 0x80808080U) {
+      src32--;
+      break;
+    }
+    // pack two chars per 32-bit store; this byte order assumes a little-endian target
+    *dst32++ = ((in & 0x000000ff) >> 0) | ((in & 0x0000ff00) << 8);
+    *dst32++ = ((in & 0x00ff0000) >> 16) | ((in & 0xff000000) >> 8);
+
+    len -= 4;
+  }
+
+  src = (const char *) src32;
+  dst = (PRUnichar *) dst32;
+
+finish:
+  while (len-- > 0 && (*src & 0x80U) == 0) {
+    *dst++ = (PRUnichar) *src++;
+  }
+}
+
+#else /* generic code */
+// Portable variant: copies one char at a time until len runs out or a non-ASCII byte is reached
+static inline void
+Convert_ascii_run (const char *&src,
+                   PRUnichar *&dst,
+                   PRInt32 len)
+{
+  while (len-- > 0 && (*src & 0x80U) == 0) {
+    *dst++ = (PRUnichar) *src++;
+  }
+}
+
+#endif
 NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLength, @@ -136,6 +316,12 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc, nsresult res = NS_OK; // conversion result + // alias these locally for speed + PRInt32 mUcs4 = this->mUcs4; + PRUint8 mState = this->mState; + PRUint8 mBytes = this->mBytes; + PRUint8 mFirst = this->mFirst; + // Set mFirst to PR_FALSE now so we don't have to every time through the ASCII //
branch within the loop. if (mFirst && aSrcLen && (0 == (0x80 & (*aSrc)))) @@ -146,8 +332,9 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc, // When mState is zero we expect either a US-ASCII character or a // multi-octet sequence. if (0 == (0x80 & (*in))) { - // US-ASCII, pass straight through. - *out++ = (PRUnichar)*in; + PRInt32 max_loops = PR_MIN(inend - in, outend - out); + Convert_ascii_run(in, out, max_loops); + --in; // match the rest of the cases mBytes = 1; } else if (0xC0 == (0xE0 & (*in))) { // First octet of 2 octet sequence @@ -266,5 +453,10 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc, *aSrcLength = in - aSrc; *aDestLength = out - aDest; + this->mUcs4 = mUcs4; + this->mState = mState; + this->mBytes = mBytes; + this->mFirst = mFirst; + return(res); }