Bug 1289003 - Part 1: Add UTF8CharsToNewLatin1CharsZ, LossyUTF8CharsToNewLatin1CharsZ. r=jwalden

2016-08-13 23:03:31 +09:00 · 2016-08-13 23:03:31 +09:00 · 586c7b1a14
--- a/js/public/CharacterEncoding.h
+++ b/js/public/CharacterEncoding.h
@ -31,6 +31,8 @@ class Latin1Chars : public mozilla::Range<Latin1Char>
    typedef mozilla::Range<Latin1Char> Base;

  public:
+    using CharT = Latin1Char;
+
    Latin1Chars() : Base() {}
    Latin1Chars(char* aBytes, size_t aLength) : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength) {}
    Latin1Chars(const Latin1Char* aBytes, size_t aLength)
@ -49,6 +51,8 @@ class Latin1CharsZ : public mozilla::RangedPtr<Latin1Char>
    typedef mozilla::RangedPtr<Latin1Char> Base;

  public:
+    using CharT = Latin1Char;
+
    Latin1CharsZ() : Base(nullptr, 0) {}

    Latin1CharsZ(char* aBytes, size_t aLength)
@ -73,6 +77,8 @@ class UTF8Chars : public mozilla::Range<unsigned char>
    typedef mozilla::Range<unsigned char> Base;

  public:
+    using CharT = unsigned char;
+
    UTF8Chars() : Base() {}
    UTF8Chars(char* aBytes, size_t aLength)
      : Base(reinterpret_cast<unsigned char*>(aBytes), aLength)
@ -90,6 +96,8 @@ class UTF8CharsZ : public mozilla::RangedPtr<unsigned char>
    typedef mozilla::RangedPtr<unsigned char> Base;

  public:
+    using CharT = unsigned char;
+
    UTF8CharsZ() : Base(nullptr, 0) {}

    UTF8CharsZ(char* aBytes, size_t aLength)
@ -120,6 +128,8 @@ class ConstUTF8CharsZ
    const char* data_;

  public:
+    using CharT = unsigned char;
+
    ConstUTF8CharsZ() : data_(nullptr)
    {}

@ -157,6 +167,8 @@ class TwoByteChars : public mozilla::Range<char16_t>
    typedef mozilla::Range<char16_t> Base;

  public:
+    using CharT = char16_t;
+
    TwoByteChars() : Base() {}
    TwoByteChars(char16_t* aChars, size_t aLength) : Base(aChars, aLength) {}
    TwoByteChars(const char16_t* aChars, size_t aLength) : Base(const_cast<char16_t*>(aChars), aLength) {}
@ -170,6 +182,8 @@ class TwoByteCharsZ : public mozilla::RangedPtr<char16_t>
    typedef mozilla::RangedPtr<char16_t> Base;

  public:
+    using CharT = char16_t;
+
    TwoByteCharsZ() : Base(nullptr, 0) {}

    TwoByteCharsZ(char16_t* chars, size_t length)
@ -191,6 +205,8 @@ class ConstTwoByteChars : public mozilla::Range<const char16_t>
    typedef mozilla::Range<const char16_t> Base;

  public:
+    using CharT = char16_t;
+
    ConstTwoByteChars() : Base() {}
    ConstTwoByteChars(const char16_t* aChars, size_t aLength) : Base(aChars, aLength) {}
 };
@ -272,6 +288,23 @@ JS_PUBLIC_API(void)
 DeflateStringToUTF8Buffer(JSFlatString* src, mozilla::RangedPtr<char> dst,
                          size_t* dstlenp = nullptr, size_t* numcharsp = nullptr);

+/*
+ * Return a null-terminated Latin-1 string copied from the input string,
+ * storing its length (excluding null terminator) in |*outlen|.  Fail and
+ * report an error if the string contains non-Latin-1 codepoints.  Returns
+ * Latin1CharsZ() on failure.
+ */
+extern Latin1CharsZ
+UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
+
+/*
+ * Return a null-terminated Latin-1 string copied from the input string,
+ * storing its length (excluding null terminator) in |*outlen|.  Non-Latin-1
+ * codepoints are replaced by '?'.  Returns Latin1CharsZ() on failure.
+ */
+extern Latin1CharsZ
+LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
+
 } // namespace JS

 inline void JS_free(JS::Latin1CharsZ& ptr) { js_free((void*)ptr.get()); }
--- a/js/src/vm/CharacterEncoding.cpp
+++ b/js/src/vm/CharacterEncoding.cpp
@ -8,6 +8,8 @@

 #include "mozilla/Range.h"

+#include <type_traits>
+
 #include "jscntxt.h"
 #include "jsprf.h"

@ -253,19 +255,20 @@ enum InflateUTF8Action {
    Copy
 };

-static const uint32_t REPLACE_UTF8 = 0xFFFD;
+static const char16_t REPLACE_UTF8 = 0xFFFD;
+static const Latin1Char REPLACE_UTF8_LATIN1 = '?';

 // If making changes to this algorithm, make sure to also update
 // LossyConvertUTF8toUTF16() in dom/wifi/WifiUtils.cpp
-template <InflateUTF8Action Action>
+template <InflateUTF8Action Action, typename CharT>
 static bool
-InflateUTF8StringToBuffer(JSContext* cx, const UTF8Chars src, char16_t* dst, size_t* dstlenp,
+InflateUTF8StringToBuffer(JSContext* cx, const UTF8Chars src, CharT* dst, size_t* dstlenp,
                          bool* isAsciip)
 {
    if (Action != AssertNoInvalids)
        *isAsciip = true;

-    // Count how many char16_t characters need to be in the inflated string.
+    // Count how many code units need to be in the inflated string.
    // |i| is the index into |src|, and |j| is the the index into |dst|.
    size_t srclen = src.length();
    uint32_t j = 0;
@ -274,7 +277,7 @@ InflateUTF8StringToBuffer(JSContext* cx, const UTF8Chars src, char16_t* dst, siz
        if (!(v & 0x80)) {
            // ASCII code unit.  Simple copy.
            if (Action == Copy)
-                dst[j] = char16_t(v);
+                dst[j] = CharT(v);

        } else {
            // Non-ASCII code unit.  Determine its length in bytes (n).
@ -292,10 +295,14 @@ InflateUTF8StringToBuffer(JSContext* cx, const UTF8Chars src, char16_t* dst, siz
                } else if (Action == AssertNoInvalids) {                \
                    MOZ_CRASH("invalid UTF-8 string: " # report);       \
                } else {                                                \
-                    if (Action == Copy)                                 \
-                        dst[j] = char16_t(REPLACE_UTF8);                \
-                    else                                                \
+                    if (Action == Copy) {                               \
+                        if (std::is_same<CharT, Latin1Char>::value)     \
+                            dst[j] = CharT(REPLACE_UTF8_LATIN1);        \
+                        else                                            \
+                            dst[j] = CharT(REPLACE_UTF8);               \
+                    } else {                                            \
                         MOZ_ASSERT(Action == CountAndIgnoreInvalids);   \
+                    }                                                   \
                    n = n2;                                             \
                    goto invalidMultiByteCodeUnit;                      \
                }                                                       \
@ -324,25 +331,24 @@ InflateUTF8StringToBuffer(JSContext* cx, const UTF8Chars src, char16_t* dst, siz
                if ((src[i + m] & 0xC0) != 0x80)
                    INVALID(ReportInvalidCharacter, i, m);

-            // Determine the code unit's length in char16_t and act accordingly.
+            // Determine the code unit's length in CharT and act accordingly.
            v = JS::Utf8ToOneUcs4Char((uint8_t*)&src[i], n);
            if (v < 0x10000) {
-                // The n-byte UTF8 code unit will fit in a single char16_t.
+                // The n-byte UTF8 code unit will fit in a single CharT.
                if (Action == Copy)
-                    dst[j] = char16_t(v);
-
+                    dst[j] = CharT(v);
            } else {
                v -= 0x10000;
                if (v <= 0xFFFFF) {
-                    // The n-byte UTF8 code unit will fit in two char16_t units.
+                    // The n-byte UTF8 code unit will fit in two CharT units.
                    if (Action == Copy)
-                        dst[j] = char16_t((v >> 10) + 0xD800);
+                        dst[j] = CharT((v >> 10) + 0xD800);
                    j++;
                    if (Action == Copy)
-                        dst[j] = char16_t((v & 0x3FF) + 0xDC00);
+                        dst[j] = CharT((v & 0x3FF) + 0xDC00);

                } else {
-                    // The n-byte UTF8 code unit won't fit in two char16_t units.
+                    // The n-byte UTF8 code unit won't fit in two CharT units.
                    INVALID(ReportTooBigCharacter, v, 1);
                }
            }
@ -361,61 +367,73 @@ InflateUTF8StringToBuffer(JSContext* cx, const UTF8Chars src, char16_t* dst, siz
    return true;
 }

-template <InflateUTF8Action Action>
-static TwoByteCharsZ
+template <InflateUTF8Action Action, typename CharsT>
+static CharsT
 InflateUTF8StringHelper(JSContext* cx, const UTF8Chars src, size_t* outlen)
 {
+    using CharT = typename CharsT::CharT;
    *outlen = 0;

    bool isAscii;
-    if (!InflateUTF8StringToBuffer<Action>(cx, src, /* dst = */ nullptr, outlen, &isAscii))
-        return TwoByteCharsZ();
+    if (!InflateUTF8StringToBuffer<Action, CharT>(cx, src, /* dst = */ nullptr, outlen, &isAscii))
+        return CharsT();

-    char16_t* dst = cx->pod_malloc<char16_t>(*outlen + 1);  // +1 for NUL
+    CharT* dst = cx->pod_malloc<CharT>(*outlen + 1);  // +1 for NUL
    if (!dst) {
        ReportOutOfMemory(cx);
-        return TwoByteCharsZ();
+        return CharsT();
    }

    if (isAscii) {
        size_t srclen = src.length();
        MOZ_ASSERT(*outlen == srclen);
        for (uint32_t i = 0; i < srclen; i++)
-            dst[i] = char16_t(src[i]);
-
+            dst[i] = CharT(src[i]);
    } else {
-        JS_ALWAYS_TRUE(InflateUTF8StringToBuffer<Copy>(cx, src, dst, outlen, &isAscii));
+        JS_ALWAYS_TRUE((InflateUTF8StringToBuffer<Copy, CharT>(cx, src, dst, outlen, &isAscii)));
    }

    dst[*outlen] = 0;    // NUL char

-    return TwoByteCharsZ(dst, *outlen);
+    return CharsT(dst, *outlen);
 }

 TwoByteCharsZ
 JS::UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen)
 {
-    return InflateUTF8StringHelper<CountAndReportInvalids>(cx, utf8, outlen);
+    return InflateUTF8StringHelper<CountAndReportInvalids, TwoByteCharsZ>(cx, utf8, outlen);
 }

 TwoByteCharsZ
 JS::UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, size_t* outlen)
 {
    UTF8Chars chars(utf8.c_str(), strlen(utf8.c_str()));
-    return InflateUTF8StringHelper<CountAndReportInvalids>(cx, chars, outlen);
+    return InflateUTF8StringHelper<CountAndReportInvalids, TwoByteCharsZ>(cx, chars, outlen);
 }

 TwoByteCharsZ
 JS::LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen)
 {
-    return InflateUTF8StringHelper<CountAndIgnoreInvalids>(cx, utf8, outlen);
+    return InflateUTF8StringHelper<CountAndIgnoreInvalids, TwoByteCharsZ>(cx, utf8, outlen);
 }

 TwoByteCharsZ
 JS::LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, size_t* outlen)
 {
    UTF8Chars chars(utf8.c_str(), strlen(utf8.c_str()));
-    return InflateUTF8StringHelper<CountAndIgnoreInvalids>(cx, chars, outlen);
+    return InflateUTF8StringHelper<CountAndIgnoreInvalids, TwoByteCharsZ>(cx, chars, outlen);
+}
+
+Latin1CharsZ
+JS::UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen)
+{
+    return InflateUTF8StringHelper<CountAndReportInvalids, Latin1CharsZ>(cx, utf8, outlen);
+}
+
+Latin1CharsZ
+JS::LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen)
+{
+    return InflateUTF8StringHelper<CountAndIgnoreInvalids, Latin1CharsZ>(cx, utf8, outlen);
 }

 #ifdef DEBUG
@ -424,6 +442,7 @@ JS::ConstUTF8CharsZ::validate(size_t aLength)
 {
    MOZ_ASSERT(data_);
    UTF8Chars chars(data_, aLength);
-    InflateUTF8StringToBuffer<AssertNoInvalids>(nullptr, chars, nullptr, nullptr, nullptr);
+    InflateUTF8StringToBuffer<AssertNoInvalids, char16_t>(nullptr, chars, nullptr, nullptr,
+                                                          nullptr);
 }
 #endif