From 586c7b1a14afbfe83509ea4d048eb25cde8705c0 Mon Sep 17 00:00:00 2001
From: Tooru Fujisawa <arai_a@mac.com>
Date: Sat, 13 Aug 2016 23:03:31 +0900
Subject: [PATCH] Bug 1289003 - Part 1: Add UTF8CharsToNewLatin1CharsZ,
 LossyUTF8CharsToNewLatin1CharsZ. r=jwalden

---
 js/public/CharacterEncoding.h   | 33 ++++++++++++++
 js/src/vm/CharacterEncoding.cpp | 81 ++++++++++++++++++++-------------
 2 files changed, 83 insertions(+), 31 deletions(-)
diff --git a/js/public/CharacterEncoding.h b/js/public/CharacterEncoding.h
index 99a6b4cdfce9..8c166013c788 100644
--- a/js/public/CharacterEncoding.h
+++ b/js/public/CharacterEncoding.h
@@ -31,6 +31,8 @@ class Latin1Chars : public mozilla::Range<Latin1Char>
     typedef mozilla::Range<Latin1Char> Base;
 
   public:
+    using CharT = Latin1Char;
+
     Latin1Chars() : Base() {}
     Latin1Chars(char* aBytes, size_t aLength) : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength) {}
     Latin1Chars(const Latin1Char* aBytes, size_t aLength)
@@ -49,6 +51,8 @@ class Latin1CharsZ : public mozilla::RangedPtr<Latin1Char>
     typedef mozilla::RangedPtr<Latin1Char> Base;
 
   public:
+    using CharT = Latin1Char;
+
     Latin1CharsZ() : Base(nullptr, 0) {}
 
     Latin1CharsZ(char* aBytes, size_t aLength)
@@ -73,6 +77,8 @@ class UTF8Chars : public mozilla::Range<unsigned char>
     typedef mozilla::Range<unsigned char> Base;
 
   public:
+    using CharT = unsigned char;
+
     UTF8Chars() : Base() {}
     UTF8Chars(char* aBytes, size_t aLength)
       : Base(reinterpret_cast<unsigned char*>(aBytes), aLength)
@@ -90,6 +96,8 @@ class UTF8CharsZ : public mozilla::RangedPtr<unsigned char>
     typedef mozilla::RangedPtr<unsigned char> Base;
 
   public:
+    using CharT = unsigned char;
+
     UTF8CharsZ() : Base(nullptr, 0) {}
 
     UTF8CharsZ(char* aBytes, size_t aLength)
@@ -120,6 +128,8 @@ class ConstUTF8CharsZ
     const char* data_;
 
   public:
+    using CharT = unsigned char;
+
     ConstUTF8CharsZ() : data_(nullptr)
     {}
 
@@ -157,6 +167,8 @@ class TwoByteChars : public mozilla::Range<char16_t>
     typedef mozilla::Range<char16_t> Base;
 
   public:
+    using CharT = char16_t;
+
     TwoByteChars() : Base() {}
     TwoByteChars(char16_t* aChars, size_t aLength) : Base(aChars, aLength) {}
     TwoByteChars(const char16_t* aChars, size_t aLength) : Base(const_cast<char16_t*>(aChars), aLength) {}
@@ -170,6 +182,8 @@ class TwoByteCharsZ : public mozilla::RangedPtr<char16_t>
     typedef mozilla::RangedPtr<char16_t> Base;
 
   public:
+    using CharT = char16_t;
+
     TwoByteCharsZ() : Base(nullptr, 0) {}
 
     TwoByteCharsZ(char16_t* chars, size_t length)
@@ -191,6 +205,8 @@ class ConstTwoByteChars : public mozilla::Range<const char16_t>
     typedef mozilla::Range<const char16_t> Base;
 
   public:
+    using CharT = char16_t;
+
     ConstTwoByteChars() : Base() {}
     ConstTwoByteChars(const char16_t* aChars, size_t aLength) : Base(aChars, aLength) {}
 };
@@ -272,6 +288,23 @@ JS_PUBLIC_API(void)
 DeflateStringToUTF8Buffer(JSFlatString* src, mozilla::RangedPtr<char> dst,
                           size_t* dstlenp = nullptr, size_t* numcharsp = nullptr);
 
+/*
+ * Return a null-terminated Latin-1 string copied from the input string,
+ * storing its length (excluding null terminator) in |*outlen|.  Fail and
+ * report an error if the string contains non-Latin-1 codepoints.  Returns
+ * Latin1CharsZ() on failure.
+ */
+extern Latin1CharsZ
+UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
+
+/*
+ * Return a null-terminated Latin-1 string copied from the input string,
+ * storing its length (excluding null terminator) in |*outlen|.  Non-Latin-1
+ * codepoints are replaced by '?'.  Returns Latin1CharsZ() on failure.
+ */
+extern Latin1CharsZ
+LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
+
 } // namespace JS
 
 inline void JS_free(JS::Latin1CharsZ& ptr) { js_free((void*)ptr.get()); }
diff --git a/js/src/vm/CharacterEncoding.cpp b/js/src/vm/CharacterEncoding.cpp
index fb4d6e12a71b..cd2cbb557b84 100644
--- a/js/src/vm/CharacterEncoding.cpp
+++ b/js/src/vm/CharacterEncoding.cpp
@@ -8,6 +8,8 @@
 
 #include "mozilla/Range.h"
 
+#include <type_traits>
+
 #include "jscntxt.h"
 #include "jsprf.h"
 
@@ -253,19 +255,20 @@ enum InflateUTF8Action {
     Copy
 };
 
-static const uint32_t REPLACE_UTF8 = 0xFFFD;
+static const char16_t REPLACE_UTF8 = 0xFFFD;
+static const Latin1Char REPLACE_UTF8_LATIN1 = '?';
 
 // If making changes to this algorithm, make sure to also update
 // LossyConvertUTF8toUTF16() in dom/wifi/WifiUtils.cpp
-template <InflateUTF8Action Action>
+template <InflateUTF8Action Action, typename CharT>
 static bool
-InflateUTF8StringToBuffer(JSContext* cx, const UTF8Chars src, char16_t* dst, size_t* dstlenp,
+InflateUTF8StringToBuffer(JSContext* cx, const UTF8Chars src, CharT* dst, size_t* dstlenp,
                           bool* isAsciip)
 {
     if (Action != AssertNoInvalids)
         *isAsciip = true;
 
-    // Count how many char16_t characters need to be in the inflated string.
+    // Count how many code units need to be in the inflated string.
     // |i| is the index into |src|, and |j| is the the index into |dst|.
     size_t srclen = src.length();
     uint32_t j = 0;
@@ -274,7 +277,7 @@ InflateUTF8StringToBuffer(JSContext* cx, const UTF8Chars src, char16_t* dst, siz
         if (!(v & 0x80)) {
             // ASCII code unit.  Simple copy.
             if (Action == Copy)
-                dst[j] = char16_t(v);
+                dst[j] = CharT(v);
 
         } else {
             // Non-ASCII code unit.  Determine its length in bytes (n).
@@ -292,10 +295,14 @@ InflateUTF8StringToBuffer(JSContext* cx, const UTF8Chars src, char16_t* dst, siz
                 } else if (Action == AssertNoInvalids) {                \
                     MOZ_CRASH("invalid UTF-8 string: " # report);       \
                 } else {                                                \
-                    if (Action == Copy)                                 \
-                        dst[j] = char16_t(REPLACE_UTF8);                \
-                    else                                                \
+                    if (Action == Copy) {                               \
+                        if (std::is_same<CharT, Latin1Char>::value)     \
+                            dst[j] = CharT(REPLACE_UTF8_LATIN1);        \
+                        else                                            \
+                            dst[j] = CharT(REPLACE_UTF8);               \
+                    } else {                                            \
                         MOZ_ASSERT(Action == CountAndIgnoreInvalids);   \
+                    }                                                   \
                     n = n2;                                             \
                     goto invalidMultiByteCodeUnit;                      \
                 }                                                       \
@@ -324,25 +331,24 @@ InflateUTF8StringToBuffer(JSContext* cx, const UTF8Chars src, char16_t* dst, siz
                 if ((src[i + m] & 0xC0) != 0x80)
                     INVALID(ReportInvalidCharacter, i, m);
 
-            // Determine the code unit's length in char16_t and act accordingly.
+            // Determine the code unit's length in CharT and act accordingly.
             v = JS::Utf8ToOneUcs4Char((uint8_t*)&src[i], n);
             if (v < 0x10000) {
-                // The n-byte UTF8 code unit will fit in a single char16_t.
+                // The n-byte UTF8 code unit will fit in a single CharT.
                 if (Action == Copy)
-                    dst[j] = char16_t(v);
-
+                    dst[j] = CharT(v);
             } else {
                 v -= 0x10000;
                 if (v <= 0xFFFFF) {
-                    // The n-byte UTF8 code unit will fit in two char16_t units.
+                    // The n-byte UTF8 code unit will fit in two CharT units.
                     if (Action == Copy)
-                        dst[j] = char16_t((v >> 10) + 0xD800);
+                        dst[j] = CharT((v >> 10) + 0xD800);
                     j++;
                     if (Action == Copy)
-                        dst[j] = char16_t((v & 0x3FF) + 0xDC00);
+                        dst[j] = CharT((v & 0x3FF) + 0xDC00);
 
                 } else {
-                    // The n-byte UTF8 code unit won't fit in two char16_t units.
+                    // The n-byte UTF8 code unit won't fit in two CharT units.
                     INVALID(ReportTooBigCharacter, v, 1);
                 }
             }
@@ -361,61 +367,73 @@ InflateUTF8StringToBuffer(JSContext* cx, const UTF8Chars src, char16_t* dst, siz
     return true;
 }
 
-template <InflateUTF8Action Action>
-static TwoByteCharsZ
+template <InflateUTF8Action Action, typename CharsT>
+static CharsT
 InflateUTF8StringHelper(JSContext* cx, const UTF8Chars src, size_t* outlen)
 {
+    using CharT = typename CharsT::CharT;
     *outlen = 0;
 
     bool isAscii;
-    if (!InflateUTF8StringToBuffer<Action>(cx, src, /* dst = */ nullptr, outlen, &isAscii))
-        return TwoByteCharsZ();
+    if (!InflateUTF8StringToBuffer<Action, CharT>(cx, src, /* dst = */ nullptr, outlen, &isAscii))
+        return CharsT();
 
-    char16_t* dst = cx->pod_malloc<char16_t>(*outlen + 1);  // +1 for NUL
+    CharT* dst = cx->pod_malloc<CharT>(*outlen + 1);  // +1 for NUL
     if (!dst) {
         ReportOutOfMemory(cx);
-        return TwoByteCharsZ();
+        return CharsT();
     }
 
     if (isAscii) {
         size_t srclen = src.length();
         MOZ_ASSERT(*outlen == srclen);
         for (uint32_t i = 0; i < srclen; i++)
-            dst[i] = char16_t(src[i]);
-
+            dst[i] = CharT(src[i]);
     } else {
-        JS_ALWAYS_TRUE(InflateUTF8StringToBuffer<Copy>(cx, src, dst, outlen, &isAscii));
+        JS_ALWAYS_TRUE((InflateUTF8StringToBuffer<Copy, CharT>(cx, src, dst, outlen, &isAscii)));
     }
 
     dst[*outlen] = 0;    // NUL char
 
-    return TwoByteCharsZ(dst, *outlen);
+    return CharsT(dst, *outlen);
 }
 
 TwoByteCharsZ
 JS::UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen)
 {
-    return InflateUTF8StringHelper<CountAndReportInvalids>(cx, utf8, outlen);
+    return InflateUTF8StringHelper<CountAndReportInvalids, TwoByteCharsZ>(cx, utf8, outlen);
 }
 
 TwoByteCharsZ
 JS::UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, size_t* outlen)
 {
     UTF8Chars chars(utf8.c_str(), strlen(utf8.c_str()));
-    return InflateUTF8StringHelper<CountAndReportInvalids>(cx, chars, outlen);
+    return InflateUTF8StringHelper<CountAndReportInvalids, TwoByteCharsZ>(cx, chars, outlen);
 }
 
 TwoByteCharsZ
 JS::LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen)
 {
-    return InflateUTF8StringHelper<CountAndIgnoreInvalids>(cx, utf8, outlen);
+    return InflateUTF8StringHelper<CountAndIgnoreInvalids, TwoByteCharsZ>(cx, utf8, outlen);
 }
 
 TwoByteCharsZ
 JS::LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, size_t* outlen)
 {
     UTF8Chars chars(utf8.c_str(), strlen(utf8.c_str()));
-    return InflateUTF8StringHelper<CountAndIgnoreInvalids>(cx, chars, outlen);
+    return InflateUTF8StringHelper<CountAndIgnoreInvalids, TwoByteCharsZ>(cx, chars, outlen);
+}
+
+Latin1CharsZ
+JS::UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen)
+{
+    return InflateUTF8StringHelper<CountAndReportInvalids, Latin1CharsZ>(cx, utf8, outlen);
+}
+
+Latin1CharsZ
+JS::LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen)
+{
+    return InflateUTF8StringHelper<CountAndIgnoreInvalids, Latin1CharsZ>(cx, utf8, outlen);
 }
 
 #ifdef DEBUG
@@ -424,6 +442,7 @@ JS::ConstUTF8CharsZ::validate(size_t aLength)
 {
     MOZ_ASSERT(data_);
     UTF8Chars chars(data_, aLength);
-    InflateUTF8StringToBuffer<AssertNoInvalids>(nullptr, chars, nullptr, nullptr, nullptr);
+    InflateUTF8StringToBuffer<AssertNoInvalids, char16_t>(nullptr, chars, nullptr, nullptr,
+                                                          nullptr);
 }
 #endif