Bug 1492090 - Part 4: Add encoding functions to/from UTF-8 to system narrow/wide encoding. r=nbp

Differential Revision: https://phabricator.services.mozilla.com/D151448
2023-05-22 12:28:15 +00:00 · 2023-05-22 12:28:15 +00:00 · 5c1fd21f5e
--- a/js/public/CharacterEncoding.h
+++ b/js/public/CharacterEncoding.h
@ -341,6 +341,40 @@ extern JS_PUBLIC_API bool StringIsASCII(const char* s);
 */
 extern JS_PUBLIC_API bool StringIsASCII(mozilla::Span<const char> s);

+/**
+ * Encode a narrow multibyte character string to a UTF-8 string.
+ *
+ * |chars| must be null-terminated. It is decoded with std::mbsrtowcs, so the
+ * result depends on the multibyte encoding of the current C locale.
+ *
+ * Returns a newly allocated, null-terminated UTF-8 string, or nullptr on
+ * failure. A conversion failure is reported as an error on |cx|.
+ *
+ * NOTE: Should only be used when interacting with POSIX/OS functions and not
+ *       for encoding ASCII/Latin-1/etc. strings to UTF-8.
+ */
+extern JS_PUBLIC_API JS::UniqueChars EncodeNarrowToUtf8(JSContext* cx,
+                                                        const char* chars);
+
+/**
+ * Encode a wide string to a UTF-8 string.
+ *
+ * |chars| must be null-terminated; its length is determined with wcslen.
+ *
+ * Returns a newly allocated, null-terminated UTF-8 string, or nullptr on
+ * failure. A conversion failure or a buffer-size overflow is reported as an
+ * error on |cx|.
+ *
+ * NOTE: Should only be used when interacting with Windows API functions.
+ */
+extern JS_PUBLIC_API JS::UniqueChars EncodeWideToUtf8(JSContext* cx,
+                                                      const wchar_t* chars);
+
+/**
+ * Encode a UTF-8 string to a narrow multibyte character string.
+ *
+ * |chars| must be a null-terminated, valid UTF-8 string. The output is
+ * produced with std::wcsrtombs, so it depends on the multibyte encoding of
+ * the current C locale.
+ *
+ * Returns a newly allocated, null-terminated narrow string, or nullptr on
+ * failure. A conversion failure is reported as an error on |cx|.
+ *
+ * NOTE: Should only be used when interacting with POSIX/OS functions and not
+ *       for encoding UTF-8 to ASCII/Latin-1/etc. strings.
+ */
+extern JS_PUBLIC_API JS::UniqueChars EncodeUtf8ToNarrow(JSContext* cx,
+                                                        const char* chars);
+
+/**
+ * Encode a UTF-8 string to a wide string.
+ *
+ * |chars| must be a null-terminated, valid UTF-8 string (this is asserted in
+ * debug builds).
+ *
+ * Returns a newly allocated, null-terminated wide string, or nullptr on
+ * failure.
+ *
+ * NOTE: Should only be used when interacting with Windows API functions.
+ */
+extern JS_PUBLIC_API JS::UniqueWideChars EncodeUtf8ToWide(JSContext* cx,
+                                                          const char* chars);
+
 }  // namespace JS

 inline void JS_free(JS::Latin1CharsZ& ptr) { js_free((void*)ptr.get()); }
--- a/js/public/Utility.h
+++ b/js/public/Utility.h
@ -632,9 +632,10 @@ struct FreePolicy {
  void operator()(const void* ptr) { js_free(const_cast<void*>(ptr)); }
 };

-typedef mozilla::UniquePtr<char[], JS::FreePolicy> UniqueChars;
-typedef mozilla::UniquePtr<char16_t[], JS::FreePolicy> UniqueTwoByteChars;
-typedef mozilla::UniquePtr<JS::Latin1Char[], JS::FreePolicy> UniqueLatin1Chars;
+// Owning pointers to character arrays of the various code-unit types. The
+// JS::FreePolicy deleter releases the array with js_free().
+using UniqueChars = mozilla::UniquePtr<char[], JS::FreePolicy>;
+using UniqueTwoByteChars = mozilla::UniquePtr<char16_t[], JS::FreePolicy>;
+using UniqueLatin1Chars = mozilla::UniquePtr<JS::Latin1Char[], JS::FreePolicy>;
+using UniqueWideChars = mozilla::UniquePtr<wchar_t[], JS::FreePolicy>;

 }  // namespace JS

--- a/js/public/friend/ErrorNumbers.msg
+++ b/js/public/friend/ErrorNumbers.msg
@ -400,6 +400,12 @@ MSG_DEF(JSMSG_BAD_TRAILING_UTF8_UNIT,  1, JSEXN_SYNTAXERR, "bad trailing UTF-8 b
 MSG_DEF(JSMSG_FORBIDDEN_UTF8_CODE_POINT,2,JSEXN_SYNTAXERR, "{0} isn't a valid code point because {1}")
 MSG_DEF(JSMSG_BAD_CODE_UNITS,          1, JSEXN_NOTE, "the code units comprising this invalid code point were: {0}")

+// System encoding errors, reported by the JS::Encode{Narrow,Wide}ToUtf8 and
+// JS::EncodeUtf8To{Narrow,Wide} conversion functions. None of these messages
+// takes a format argument.
+MSG_DEF(JSMSG_CANT_CONVERT_TO_NARROW,  0, JSEXN_NOTE, "can't convert to narrow string")
+MSG_DEF(JSMSG_CANT_CONVERT_TO_WIDE,    0, JSEXN_NOTE, "can't convert to wide string")
+MSG_DEF(JSMSG_CANT_CONVERT_WIDE_TO_UTF8, 0, JSEXN_NOTE, "can't convert wide string to UTF-8")
+MSG_DEF(JSMSG_CANT_CONVERT_UTF8_TO_WIDE, 0, JSEXN_NOTE, "can't convert UTF-8 to wide string")
+
 // SmooshMonkey
 MSG_DEF(JSMSG_SMOOSH_COMPILE_ERROR,    1, JSEXN_SYNTAXERR, "{0}")
 MSG_DEF(JSMSG_SMOOSH_UNIMPLEMENTED,    1, JSEXN_INTERNALERR, "{0}")
--- a/js/src/NamespaceImports.h
+++ b/js/src/NamespaceImports.h
@ -65,6 +65,7 @@ using JS::Latin1Char;
 using JS::UniqueChars;
 using JS::UniqueLatin1Chars;
 using JS::UniqueTwoByteChars;
+using JS::UniqueWideChars;

 using JS::Ok;
 using JS::OOM;
--- a/js/src/vm/CharacterEncoding.cpp
+++ b/js/src/vm/CharacterEncoding.cpp
@ -6,14 +6,23 @@

 #include "js/CharacterEncoding.h"

+#include "mozilla/CheckedInt.h"
+#include "mozilla/DebugOnly.h"
 #include "mozilla/Latin1.h"
+#include "mozilla/Maybe.h"
 #include "mozilla/Range.h"
 #include "mozilla/Span.h"
 #include "mozilla/Sprintf.h"
 #include "mozilla/TextUtils.h"
 #include "mozilla/Utf8.h"

+#ifndef XP_LINUX
+// We still support libstdc++ versions without codecvt support on Linux.
+#  include <codecvt>
+#endif
+#include <cwchar>
 #include <limits>
+#include <locale>
 #include <type_traits>

 #include "frontend/FrontendContext.h"
@ -587,6 +596,230 @@ bool JS::StringIsASCII(const char* s) {

 bool JS::StringIsASCII(Span<const char> s) { return IsAscii(s); }

+JS_PUBLIC_API JS::UniqueChars JS::EncodeNarrowToUtf8(JSContext* cx,
+                                                     const char* chars) {
+  // Two-step conversion: narrow multibyte -> wide via std::mbsrtowcs, then
+  // wide -> UTF-8 via EncodeWideToUtf8().
+
+  std::mbstate_t state{};
+
+  // First pass: only measure the required length.  On Android NDK older than
+  // v16 the source pointer is overwritten even though the destination is
+  // nullptr, so pass a scratch copy and keep `chars` intact for the second
+  // pass.  See bug 1492090.
+  const char* scratch = chars;
+  const size_t wideLen = std::mbsrtowcs(nullptr, &scratch, 0, &state);
+  if (wideLen == size_t(-1)) {
+    JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
+                              JSMSG_CANT_CONVERT_TO_WIDE);
+    return nullptr;
+  }
+  MOZ_ASSERT(std::mbsinit(&state),
+             "multi-byte state is in its initial state when no conversion "
+             "error occured");
+
+  // Reserve one extra element for the terminating null wide character.
+  const size_t capacity = wideLen + 1;
+  auto wide = cx->make_pod_array<wchar_t>(capacity);
+  if (!wide) {
+    return nullptr;
+  }
+
+  // Second pass: convert into the freshly allocated buffer.  The result is
+  // only inspected in debug builds.
+  mozilla::DebugOnly<size_t> written =
+      std::mbsrtowcs(wide.get(), &chars, capacity, &state);
+  MOZ_ASSERT(wideLen == written);
+  MOZ_ASSERT(wide[written] == '\0');
+
+  return EncodeWideToUtf8(cx, wide.get());
+}
+
+JS_PUBLIC_API JS::UniqueChars JS::EncodeWideToUtf8(JSContext* cx,
+                                                   const wchar_t* chars) {
+  // Converts the null-terminated wide string |chars| into a newly allocated,
+  // null-terminated UTF-8 string.  Returns nullptr on failure; conversion
+  // failures and buffer-size overflow are reported on |cx|.
+
+  using CheckedSizeT = mozilla::CheckedInt<size_t>;
+
+#ifndef XP_LINUX
+  // Use the standard codecvt facet to convert a wide string to UTF-8.
+  std::codecvt_utf8<wchar_t> cv;
+
+  // Worst case every wide character expands to |cv.max_length()| bytes, plus
+  // one byte for the terminating null.
+  size_t len = std::wcslen(chars);
+  CheckedSizeT utf8MaxLen = CheckedSizeT(len) * cv.max_length();
+  CheckedSizeT utf8BufLen = utf8MaxLen + 1;
+  if (!utf8BufLen.isValid()) {
+    JS_ReportAllocationOverflow(cx);
+    return nullptr;
+  }
+  auto utf8 = cx->make_pod_array<char>(utf8BufLen.value());
+  if (!utf8) {
+    return nullptr;
+  }
+
+  // STL returns |codecvt_base::partial| for empty strings, so handle them
+  // here.  The buffer is not known to be zero-initialized, so write the
+  // terminator explicitly instead of returning the buffer untouched.
+  if (len == 0) {
+    utf8[0] = '\0';
+    return utf8;
+  }
+
+  std::mbstate_t mb{};
+  const wchar_t* fromNext;
+  char* toNext;
+  std::codecvt_base::result result =
+      cv.out(mb, chars, chars + len, fromNext, utf8.get(),
+             utf8.get() + utf8MaxLen.value(), toNext);
+  if (result != std::codecvt_base::ok) {
+    MOZ_ASSERT(result == std::codecvt_base::error);
+    JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
+                              JSMSG_CANT_CONVERT_WIDE_TO_UTF8);
+    return nullptr;
+  }
+  *toNext = '\0';  // Explicit null-termination required.
+
+  // codecvt_utf8 doesn't validate its output and may produce WTF-8 instead
+  // of UTF-8 on some platforms when the input contains unpaired surrogate
+  // characters. We don't allow this.
+  if (!mozilla::IsUtf8(
+          mozilla::Span(utf8.get(), size_t(toNext - utf8.get())))) {
+    JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
+                              JSMSG_CANT_CONVERT_WIDE_TO_UTF8);
+    return nullptr;
+  }
+
+  return utf8;
+#else
+  static_assert(sizeof(wchar_t) == 4,
+                "Assume wchar_t is UTF-32 on Linux systems");
+
+  constexpr size_t MaxUtf8CharLength = 4;
+  constexpr char32_t MaxCodePoint = 0x10FFFF;
+
+  // Worst case every wide character expands to MaxUtf8CharLength bytes, plus
+  // one byte for the terminating null.
+  size_t len = std::wcslen(chars);
+  CheckedSizeT utf8MaxLen = CheckedSizeT(len) * MaxUtf8CharLength;
+  CheckedSizeT utf8BufLen = utf8MaxLen + 1;
+  if (!utf8BufLen.isValid()) {
+    JS_ReportAllocationOverflow(cx);
+    return nullptr;
+  }
+  auto utf8 = cx->make_pod_array<char>(utf8BufLen.value());
+  if (!utf8) {
+    return nullptr;
+  }
+
+  char* dst = utf8.get();
+  for (size_t i = 0; i < len; i++) {
+    // Values above U+10FFFF (including negative wchar_t values, which wrap to
+    // large unsigned values) are not code points and cannot be encoded in
+    // MaxUtf8CharLength bytes, which is all that |utf8buf| and the output
+    // buffer budget for.  Reject them instead of passing them to the encoder.
+    //
+    // NOTE(review): unlike the codecvt branch above, unpaired surrogates are
+    // not rejected here; confirm whether that difference is intended.
+    char32_t codePoint = char32_t(chars[i]);
+    if (codePoint > MaxCodePoint) {
+      JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
+                                JSMSG_CANT_CONVERT_WIDE_TO_UTF8);
+      return nullptr;
+    }
+
+    uint8_t utf8buf[MaxUtf8CharLength];
+    uint32_t utf8Len = OneUcs4ToUtf8Char(utf8buf, codePoint);
+    for (size_t j = 0; j < utf8Len; j++) {
+      *dst++ = char(utf8buf[j]);
+    }
+  }
+  *dst = '\0';
+
+  return utf8;
+#endif
+}
+
+JS_PUBLIC_API JS::UniqueChars JS::EncodeUtf8ToNarrow(JSContext* cx,
+                                                     const char* chars) {
+  // Two-step conversion: UTF-8 -> wide via EncodeUtf8ToWide(), then
+  // wide -> narrow multibyte via std::wcsrtombs.
+
+  auto wide = EncodeUtf8ToWide(cx, chars);
+  if (!wide) {
+    return nullptr;
+  }
+
+  std::mbstate_t state{};
+  const wchar_t* cursor = wide.get();
+
+  // First pass: only measure the number of bytes needed (the count excludes
+  // the terminating null byte).
+  const size_t narrowLen = std::wcsrtombs(nullptr, &cursor, 0, &state);
+  if (narrowLen == size_t(-1)) {
+    JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
+                              JSMSG_CANT_CONVERT_TO_NARROW);
+    return nullptr;
+  }
+  MOZ_ASSERT(std::mbsinit(&state),
+             "multi-byte state is in its initial state when no conversion "
+             "error occured");
+
+  // Reserve one extra byte for the terminating null.
+  const size_t capacity = narrowLen + 1;
+  auto result = cx->make_pod_array<char>(capacity);
+  if (!result) {
+    return nullptr;
+  }
+
+  // Second pass: convert into the freshly allocated buffer.  The result is
+  // only inspected in debug builds.
+  mozilla::DebugOnly<size_t> written =
+      std::wcsrtombs(result.get(), &cursor, capacity, &state);
+  MOZ_ASSERT(narrowLen == written);
+  MOZ_ASSERT(result[written] == '\0');
+
+  return result;
+}
+
+JS_PUBLIC_API JS::UniqueWideChars JS::EncodeUtf8ToWide(JSContext* cx,
+                                                       const char* chars) {
+  // Converts the null-terminated UTF-8 string |chars| into a newly allocated,
+  // null-terminated wide string.  Returns nullptr on failure; conversion
+  // failures are reported on |cx|.
+
+  size_t len = strlen(chars);
+
+  // Only valid UTF-8 strings should be passed to this function.
+  MOZ_ASSERT(mozilla::IsUtf8(mozilla::Span(chars, len)));
+
+  // A UTF-8 string never decodes to more wide characters than it has bytes,
+  // so |len| elements plus one for the terminating null always suffice.
+  auto wideChars = cx->make_pod_array<wchar_t>(len + 1);
+  if (!wideChars) {
+    return nullptr;
+  }
+
+#ifndef XP_LINUX
+  // Use the standard codecvt facet to convert from UTF-8 to a wide string.
+  std::codecvt_utf8<wchar_t> cv;
+
+  // STL returns |codecvt_base::partial| for empty strings, so handle them
+  // here.  The buffer is not known to be zero-initialized, so write the
+  // terminator explicitly instead of returning the buffer untouched.
+  if (len == 0) {
+    wideChars[0] = L'\0';
+    return wideChars;
+  }
+
+  std::mbstate_t mb{};
+  const char* fromNext;
+  wchar_t* toNext;
+  std::codecvt_base::result result =
+      cv.in(mb, chars, chars + len, fromNext, wideChars.get(),
+            wideChars.get() + len, toNext);
+  if (result != std::codecvt_base::ok) {
+    MOZ_ASSERT(result == std::codecvt_base::error);
+    JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
+                              JSMSG_CANT_CONVERT_UTF8_TO_WIDE);
+    return nullptr;
+  }
+  *toNext = L'\0';  // Explicit null-termination required.
+
+  return wideChars;
+#else
+  static_assert(sizeof(wchar_t) == 4,
+                "Assume wchar_t is UTF-32 on Linux systems");
+
+  const auto* s = reinterpret_cast<const unsigned char*>(chars);
+  const auto* const limit = s + len;
+
+  wchar_t* dst = wideChars.get();
+  while (s < limit) {
+    unsigned char c = *s++;
+
+    // Fast path: ASCII code units map directly to one wide character.
+    if (mozilla::IsAscii(c)) {
+      *dst++ = wchar_t(c);
+      continue;
+    }
+
+    // |c| is the lead unit of a multi-unit sequence; decoding advances |s|
+    // past the trailing units.
+    mozilla::Utf8Unit utf8(c);
+    mozilla::Maybe<char32_t> codePoint =
+        mozilla::DecodeOneUtf8CodePoint(utf8, &s, limit);
+
+    // Invalid input trips the assertion at the top in debug builds.  In
+    // release builds report the failure, as the codecvt branch does, instead
+    // of dereferencing an empty Maybe.
+    if (!codePoint.isSome()) {
+      JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
+                                JSMSG_CANT_CONVERT_UTF8_TO_WIDE);
+      return nullptr;
+    }
+    *dst++ = wchar_t(*codePoint);
+  }
+  *dst = L'\0';
+
+  return wideChars;
+#endif
+}
+
 bool StringBuffer::append(const Utf8Unit* units, size_t len) {
  MOZ_ASSERT(maybeCx_);