From 586c7b1a14afbfe83509ea4d048eb25cde8705c0 Mon Sep 17 00:00:00 2001 From: Tooru Fujisawa Date: Sat, 13 Aug 2016 23:03:31 +0900 Subject: [PATCH] Bug 1289003 - Part 1: Add UTF8CharsToNewLatin1CharsZ, LossyUTF8CharsToNewLatin1CharsZ. r=jwalden --- js/public/CharacterEncoding.h | 33 ++++++++++++++ js/src/vm/CharacterEncoding.cpp | 81 ++++++++++++++++++++------------- 2 files changed, 83 insertions(+), 31 deletions(-) diff --git a/js/public/CharacterEncoding.h b/js/public/CharacterEncoding.h index 99a6b4cdfce9..8c166013c788 100644 --- a/js/public/CharacterEncoding.h +++ b/js/public/CharacterEncoding.h @@ -31,6 +31,8 @@ class Latin1Chars : public mozilla::Range typedef mozilla::Range Base; public: + using CharT = Latin1Char; + Latin1Chars() : Base() {} Latin1Chars(char* aBytes, size_t aLength) : Base(reinterpret_cast(aBytes), aLength) {} Latin1Chars(const Latin1Char* aBytes, size_t aLength) @@ -49,6 +51,8 @@ class Latin1CharsZ : public mozilla::RangedPtr typedef mozilla::RangedPtr Base; public: + using CharT = Latin1Char; + Latin1CharsZ() : Base(nullptr, 0) {} Latin1CharsZ(char* aBytes, size_t aLength) @@ -73,6 +77,8 @@ class UTF8Chars : public mozilla::Range typedef mozilla::Range Base; public: + using CharT = unsigned char; + UTF8Chars() : Base() {} UTF8Chars(char* aBytes, size_t aLength) : Base(reinterpret_cast(aBytes), aLength) @@ -90,6 +96,8 @@ class UTF8CharsZ : public mozilla::RangedPtr typedef mozilla::RangedPtr Base; public: + using CharT = unsigned char; + UTF8CharsZ() : Base(nullptr, 0) {} UTF8CharsZ(char* aBytes, size_t aLength) @@ -120,6 +128,8 @@ class ConstUTF8CharsZ const char* data_; public: + using CharT = unsigned char; + ConstUTF8CharsZ() : data_(nullptr) {} @@ -157,6 +167,8 @@ class TwoByteChars : public mozilla::Range typedef mozilla::Range Base; public: + using CharT = char16_t; + TwoByteChars() : Base() {} TwoByteChars(char16_t* aChars, size_t aLength) : Base(aChars, aLength) {} TwoByteChars(const char16_t* aChars, size_t aLength) : 
Base(const_cast(aChars), aLength) {} @@ -170,6 +182,8 @@ class TwoByteCharsZ : public mozilla::RangedPtr typedef mozilla::RangedPtr Base; public: + using CharT = char16_t; + TwoByteCharsZ() : Base(nullptr, 0) {} TwoByteCharsZ(char16_t* chars, size_t length) @@ -191,6 +205,8 @@ class ConstTwoByteChars : public mozilla::Range typedef mozilla::Range Base; public: + using CharT = char16_t; + ConstTwoByteChars() : Base() {} ConstTwoByteChars(const char16_t* aChars, size_t aLength) : Base(aChars, aLength) {} }; @@ -272,6 +288,23 @@ JS_PUBLIC_API(void) DeflateStringToUTF8Buffer(JSFlatString* src, mozilla::RangedPtr dst, size_t* dstlenp = nullptr, size_t* numcharsp = nullptr); +/* + * Return a null-terminated Latin-1 string copied from the input string, + * storing its length (excluding null terminator) in |*outlen|. Fail and + * report an error if the string contains non-Latin-1 codepoints. Returns + * Latin1CharsZ() on failure. + */ +extern Latin1CharsZ +UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen); + +/* + * Return a null-terminated Latin-1 string copied from the input string, + * storing its length (excluding null terminator) in |*outlen|. Non-Latin-1 + * codepoints are replaced by '?'. Returns Latin1CharsZ() on failure. 
+ */ +extern Latin1CharsZ +LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen); + } // namespace JS inline void JS_free(JS::Latin1CharsZ& ptr) { js_free((void*)ptr.get()); } diff --git a/js/src/vm/CharacterEncoding.cpp b/js/src/vm/CharacterEncoding.cpp index fb4d6e12a71b..cd2cbb557b84 100644 --- a/js/src/vm/CharacterEncoding.cpp +++ b/js/src/vm/CharacterEncoding.cpp @@ -8,6 +8,8 @@ #include "mozilla/Range.h" +#include <type_traits> + #include "jscntxt.h" #include "jsprf.h" @@ -253,19 +255,20 @@ enum InflateUTF8Action { Copy }; -static const uint32_t REPLACE_UTF8 = 0xFFFD; +static const char16_t REPLACE_UTF8 = 0xFFFD; +static const Latin1Char REPLACE_UTF8_LATIN1 = '?'; // If making changes to this algorithm, make sure to also update // LossyConvertUTF8toUTF16() in dom/wifi/WifiUtils.cpp -template +template static bool -InflateUTF8StringToBuffer(JSContext* cx, const UTF8Chars src, char16_t* dst, size_t* dstlenp, +InflateUTF8StringToBuffer(JSContext* cx, const UTF8Chars src, CharT* dst, size_t* dstlenp, bool* isAsciip) { if (Action != AssertNoInvalids) *isAsciip = true; - // Count how many char16_t characters need to be in the inflated string. + // Count how many code units need to be in the inflated string. // |i| is the index into |src|, and |j| is the index into |dst|. size_t srclen = src.length(); uint32_t j = 0; @@ -274,7 +277,7 @@ InflateUTF8StringToBuffer(JSContext* cx, const UTF8Chars src, char16_t* dst, siz if (!(v & 0x80)) { // ASCII code unit. Simple copy. if (Action == Copy) - dst[j] = char16_t(v); + dst[j] = CharT(v); } else { // Non-ASCII code unit. Determine its length in bytes (n). 
@@ -292,10 +295,14 @@ InflateUTF8StringToBuffer(JSContext* cx, const UTF8Chars src, char16_t* dst, siz } else if (Action == AssertNoInvalids) { \ MOZ_CRASH("invalid UTF-8 string: " # report); \ } else { \ - if (Action == Copy) \ - dst[j] = char16_t(REPLACE_UTF8); \ - else \ + if (Action == Copy) { \ + if (std::is_same::value) \ + dst[j] = CharT(REPLACE_UTF8_LATIN1); \ + else \ + dst[j] = CharT(REPLACE_UTF8); \ + } else { \ MOZ_ASSERT(Action == CountAndIgnoreInvalids); \ + } \ n = n2; \ goto invalidMultiByteCodeUnit; \ } \ @@ -324,25 +331,24 @@ InflateUTF8StringToBuffer(JSContext* cx, const UTF8Chars src, char16_t* dst, siz if ((src[i + m] & 0xC0) != 0x80) INVALID(ReportInvalidCharacter, i, m); - // Determine the code unit's length in char16_t and act accordingly. + // Determine the code unit's length in CharT and act accordingly. v = JS::Utf8ToOneUcs4Char((uint8_t*)&src[i], n); if (v < 0x10000) { - // The n-byte UTF8 code unit will fit in a single char16_t. + // The n-byte UTF8 code unit will fit in a single CharT. if (Action == Copy) - dst[j] = char16_t(v); - + dst[j] = CharT(v); } else { v -= 0x10000; if (v <= 0xFFFFF) { - // The n-byte UTF8 code unit will fit in two char16_t units. + // The n-byte UTF8 code unit will fit in two CharT units. if (Action == Copy) - dst[j] = char16_t((v >> 10) + 0xD800); + dst[j] = CharT((v >> 10) + 0xD800); j++; if (Action == Copy) - dst[j] = char16_t((v & 0x3FF) + 0xDC00); + dst[j] = CharT((v & 0x3FF) + 0xDC00); } else { - // The n-byte UTF8 code unit won't fit in two char16_t units. + // The n-byte UTF8 code unit won't fit in two CharT units. 
INVALID(ReportTooBigCharacter, v, 1); } } @@ -361,61 +367,73 @@ InflateUTF8StringToBuffer(JSContext* cx, const UTF8Chars src, char16_t* dst, siz return true; } -template -static TwoByteCharsZ +template +static CharsT InflateUTF8StringHelper(JSContext* cx, const UTF8Chars src, size_t* outlen) { + using CharT = typename CharsT::CharT; *outlen = 0; bool isAscii; - if (!InflateUTF8StringToBuffer(cx, src, /* dst = */ nullptr, outlen, &isAscii)) - return TwoByteCharsZ(); + if (!InflateUTF8StringToBuffer(cx, src, /* dst = */ nullptr, outlen, &isAscii)) + return CharsT(); - char16_t* dst = cx->pod_malloc(*outlen + 1); // +1 for NUL + CharT* dst = cx->pod_malloc(*outlen + 1); // +1 for NUL if (!dst) { ReportOutOfMemory(cx); - return TwoByteCharsZ(); + return CharsT(); } if (isAscii) { size_t srclen = src.length(); MOZ_ASSERT(*outlen == srclen); for (uint32_t i = 0; i < srclen; i++) - dst[i] = char16_t(src[i]); - + dst[i] = CharT(src[i]); } else { - JS_ALWAYS_TRUE(InflateUTF8StringToBuffer(cx, src, dst, outlen, &isAscii)); + JS_ALWAYS_TRUE((InflateUTF8StringToBuffer(cx, src, dst, outlen, &isAscii))); } dst[*outlen] = 0; // NUL char - return TwoByteCharsZ(dst, *outlen); + return CharsT(dst, *outlen); } TwoByteCharsZ JS::UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen) { - return InflateUTF8StringHelper(cx, utf8, outlen); + return InflateUTF8StringHelper(cx, utf8, outlen); } TwoByteCharsZ JS::UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, size_t* outlen) { UTF8Chars chars(utf8.c_str(), strlen(utf8.c_str())); - return InflateUTF8StringHelper(cx, chars, outlen); + return InflateUTF8StringHelper(cx, chars, outlen); } TwoByteCharsZ JS::LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen) { - return InflateUTF8StringHelper(cx, utf8, outlen); + return InflateUTF8StringHelper(cx, utf8, outlen); } TwoByteCharsZ JS::LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, 
size_t* outlen) { UTF8Chars chars(utf8.c_str(), strlen(utf8.c_str())); - return InflateUTF8StringHelper(cx, chars, outlen); + return InflateUTF8StringHelper(cx, chars, outlen); +} + +Latin1CharsZ +JS::UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen) +{ + return InflateUTF8StringHelper(cx, utf8, outlen); +} + +Latin1CharsZ +JS::LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen) +{ + return InflateUTF8StringHelper(cx, utf8, outlen); } #ifdef DEBUG @@ -424,6 +442,7 @@ JS::ConstUTF8CharsZ::validate(size_t aLength) { MOZ_ASSERT(data_); UTF8Chars chars(data_, aLength); - InflateUTF8StringToBuffer(nullptr, chars, nullptr, nullptr, nullptr); + InflateUTF8StringToBuffer(nullptr, chars, nullptr, nullptr, + nullptr); } #endif