From 5c1fd21f5e30d266711c445adc86e1bd97552d43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Bargull?= Date: Mon, 22 May 2023 12:28:15 +0000 Subject: [PATCH] Bug 1492090 - Part 4: Add encoding functions to/from UTF-8 to system narrow/wide encoding. r=nbp Differential Revision: https://phabricator.services.mozilla.com/D151448 --- js/public/CharacterEncoding.h | 34 +++++ js/public/Utility.h | 7 +- js/public/friend/ErrorNumbers.msg | 6 + js/src/NamespaceImports.h | 1 + js/src/vm/CharacterEncoding.cpp | 233 ++++++++++++++++++++++++++++++ 5 files changed, 278 insertions(+), 3 deletions(-) diff --git a/js/public/CharacterEncoding.h b/js/public/CharacterEncoding.h index 0cca266283bf..bf67e27aca30 100644 --- a/js/public/CharacterEncoding.h +++ b/js/public/CharacterEncoding.h @@ -341,6 +341,40 @@ extern JS_PUBLIC_API bool StringIsASCII(const char* s); */ extern JS_PUBLIC_API bool StringIsASCII(mozilla::Span s); +/** + * Encode a narrow multibyte character string to a UTF-8 string. + * + * NOTE: Should only be used when interacting with POSIX/OS functions and not + * for encoding ASCII/Latin-1/etc. strings to UTF-8. + */ +extern JS_PUBLIC_API JS::UniqueChars EncodeNarrowToUtf8(JSContext* cx, + const char* chars); + +/** + * Encode a wide string to a UTF-8 string. + * + * NOTE: Should only be used when interacting with Windows API functions. + */ +extern JS_PUBLIC_API JS::UniqueChars EncodeWideToUtf8(JSContext* cx, + const wchar_t* chars); + +/** + * Encode a UTF-8 string to a narrow multibyte character string. + * + * NOTE: Should only be used when interacting with POSIX/OS functions and not + * for encoding UTF-8 to ASCII/Latin-1/etc. strings. + */ +extern JS_PUBLIC_API JS::UniqueChars EncodeUtf8ToNarrow(JSContext* cx, + const char* chars); + +/** + * Encode a UTF-8 string to a wide string. + * + * NOTE: Should only be used when interacting with Windows API functions. 
+ */ +extern JS_PUBLIC_API JS::UniqueWideChars EncodeUtf8ToWide(JSContext* cx, + const char* chars); + } // namespace JS inline void JS_free(JS::Latin1CharsZ& ptr) { js_free((void*)ptr.get()); } diff --git a/js/public/Utility.h b/js/public/Utility.h index 7979740bedf1..5a3002b08fc2 100644 --- a/js/public/Utility.h +++ b/js/public/Utility.h @@ -632,9 +632,10 @@ struct FreePolicy { void operator()(const void* ptr) { js_free(const_cast(ptr)); } }; -typedef mozilla::UniquePtr UniqueChars; -typedef mozilla::UniquePtr UniqueTwoByteChars; -typedef mozilla::UniquePtr UniqueLatin1Chars; +using UniqueChars = mozilla::UniquePtr; +using UniqueTwoByteChars = mozilla::UniquePtr; +using UniqueLatin1Chars = mozilla::UniquePtr; +using UniqueWideChars = mozilla::UniquePtr; } // namespace JS diff --git a/js/public/friend/ErrorNumbers.msg b/js/public/friend/ErrorNumbers.msg index 0048498cbb47..90f56f066257 100644 --- a/js/public/friend/ErrorNumbers.msg +++ b/js/public/friend/ErrorNumbers.msg @@ -400,6 +400,12 @@ MSG_DEF(JSMSG_BAD_TRAILING_UTF8_UNIT, 1, JSEXN_SYNTAXERR, "bad trailing UTF-8 b MSG_DEF(JSMSG_FORBIDDEN_UTF8_CODE_POINT,2,JSEXN_SYNTAXERR, "{0} isn't a valid code point because {1}") MSG_DEF(JSMSG_BAD_CODE_UNITS, 1, JSEXN_NOTE, "the code units comprising this invalid code point were: {0}") +// System encoding errors +MSG_DEF(JSMSG_CANT_CONVERT_TO_NARROW, 0, JSEXN_NOTE, "can't convert to narrow string") +MSG_DEF(JSMSG_CANT_CONVERT_TO_WIDE, 0, JSEXN_NOTE, "can't convert to wide string") +MSG_DEF(JSMSG_CANT_CONVERT_WIDE_TO_UTF8, 0, JSEXN_NOTE, "can't convert wide string to UTF-8") +MSG_DEF(JSMSG_CANT_CONVERT_UTF8_TO_WIDE, 0, JSEXN_NOTE, "can't convert UTF-8 to wide string") + // SmooshMonkey MSG_DEF(JSMSG_SMOOSH_COMPILE_ERROR, 1, JSEXN_SYNTAXERR, "{0}") MSG_DEF(JSMSG_SMOOSH_UNIMPLEMENTED, 1, JSEXN_INTERNALERR, "{0}") diff --git a/js/src/NamespaceImports.h b/js/src/NamespaceImports.h index 235483d7656e..6a94263c8ca9 100644 --- a/js/src/NamespaceImports.h +++ 
b/js/src/NamespaceImports.h @@ -65,6 +65,7 @@ using JS::Latin1Char; using JS::UniqueChars; using JS::UniqueLatin1Chars; using JS::UniqueTwoByteChars; +using JS::UniqueWideChars; using JS::Ok; using JS::OOM; diff --git a/js/src/vm/CharacterEncoding.cpp b/js/src/vm/CharacterEncoding.cpp index 35499855810c..52edcae45ef7 100644 --- a/js/src/vm/CharacterEncoding.cpp +++ b/js/src/vm/CharacterEncoding.cpp @@ -6,14 +6,23 @@ #include "js/CharacterEncoding.h" +#include "mozilla/CheckedInt.h" +#include "mozilla/DebugOnly.h" #include "mozilla/Latin1.h" +#include "mozilla/Maybe.h" #include "mozilla/Range.h" #include "mozilla/Span.h" #include "mozilla/Sprintf.h" #include "mozilla/TextUtils.h" #include "mozilla/Utf8.h" +#ifndef XP_LINUX +// We still support libstdc++ versions without codecvt support on Linux. +# include +#endif +#include #include +#include #include #include "frontend/FrontendContext.h" @@ -587,6 +596,236 @@ bool JS::StringIsASCII(const char* s) { bool JS::StringIsASCII(Span s) { return IsAscii(s); } +JS_PUBLIC_API JS::UniqueChars JS::EncodeNarrowToUtf8(JSContext* cx, + const char* chars) { + // Convert the narrow multibyte character string to a wide string and then + // use EncodeWideToUtf8() to convert the wide string to a UTF-8 string. + + std::mbstate_t mb{}; + + // NOTE: The 2nd parameter is overwritten even if the 1st parameter is nullptr + // on Android NDK older than v16. Use a temporary variable to save the + // `chars` for the subsequent call. See bug 1492090.
+ const char* tmpChars = chars; + + size_t wideLen = std::mbsrtowcs(nullptr, &tmpChars, 0, &mb); + if (wideLen == size_t(-1)) { + JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, + JSMSG_CANT_CONVERT_TO_WIDE); + return nullptr; + } + MOZ_ASSERT(std::mbsinit(&mb), + "multi-byte state is in its initial state when no conversion " + "error occured"); + + size_t bufLen = wideLen + 1; + auto wideChars = cx->make_pod_array(bufLen); + if (!wideChars) { + return nullptr; + } + + mozilla::DebugOnly actualLen = + std::mbsrtowcs(wideChars.get(), &chars, bufLen, &mb); + MOZ_ASSERT(wideLen == actualLen); + MOZ_ASSERT(wideChars[actualLen] == '\0'); + + return EncodeWideToUtf8(cx, wideChars.get()); +} + +JS_PUBLIC_API JS::UniqueChars JS::EncodeWideToUtf8(JSContext* cx, + const wchar_t* chars) { + using CheckedSizeT = mozilla::CheckedInt; + +#ifndef XP_LINUX + // Use the standard codecvt facet to convert a wide string to UTF-8. + std::codecvt_utf8 cv; + + size_t len = std::wcslen(chars); + CheckedSizeT utf8MaxLen = CheckedSizeT(len) * cv.max_length(); + CheckedSizeT utf8BufLen = utf8MaxLen + 1; + if (!utf8BufLen.isValid()) { + JS_ReportAllocationOverflow(cx); + return nullptr; + } + auto utf8 = cx->make_pod_array(utf8BufLen.value()); + if (!utf8) { + return nullptr; + } + + // STL returns |codecvt_base::partial| for empty strings. Nothing has been + // written to |utf8| yet, so null-terminate it explicitly before returning. + if (len == 0) { + utf8[0] = '\0'; + return utf8; + } + + std::mbstate_t mb{}; + const wchar_t* fromNext; + char* toNext; + std::codecvt_base::result result = + cv.out(mb, chars, chars + len, fromNext, utf8.get(), + utf8.get() + utf8MaxLen.value(), toNext); + if (result != std::codecvt_base::ok) { + MOZ_ASSERT(result == std::codecvt_base::error); + JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, + JSMSG_CANT_CONVERT_WIDE_TO_UTF8); + return nullptr; + } + *toNext = '\0'; // Explicit null-termination required.
+ + // codecvt_utf8 doesn't validate its output and may produce WTF-8 instead + // of UTF-8 on some platforms when the input contains unpaired surrogate + // characters. We don't allow this. + if (!mozilla::IsUtf8( + mozilla::Span(utf8.get(), size_t(toNext - utf8.get())))) { + JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, + JSMSG_CANT_CONVERT_WIDE_TO_UTF8); + return nullptr; + } + + return utf8; +#else + static_assert(sizeof(wchar_t) == 4, + "Assume wchar_t is UTF-32 on Linux systems"); + + constexpr size_t MaxUtf8CharLength = 4; + + size_t len = std::wcslen(chars); + CheckedSizeT utf8MaxLen = CheckedSizeT(len) * MaxUtf8CharLength; + CheckedSizeT utf8BufLen = utf8MaxLen + 1; + if (!utf8BufLen.isValid()) { + JS_ReportAllocationOverflow(cx); + return nullptr; + } + auto utf8 = cx->make_pod_array(utf8BufLen.value()); + if (!utf8) { + return nullptr; + } + + // NOTE(review): unlike the codecvt branch, the output is not checked with + // mozilla::IsUtf8() here; confirm how OneUcs4ToUtf8Char treats bad input. + char* dst = utf8.get(); + for (size_t i = 0; i < len; i++) { + uint8_t utf8buf[MaxUtf8CharLength]; + uint32_t utf8Len = OneUcs4ToUtf8Char(utf8buf, chars[i]); + for (size_t j = 0; j < utf8Len; j++) { + *dst++ = char(utf8buf[j]); + } + } + *dst = '\0'; + + return utf8; +#endif +} + +JS_PUBLIC_API JS::UniqueChars JS::EncodeUtf8ToNarrow(JSContext* cx, + const char* chars) { + // Convert the UTF-8 string to a wide string via EncodeUtf8ToWide() and + // then convert the resulting wide string to a narrow multibyte character + // string.
+ + auto wideChars = EncodeUtf8ToWide(cx, chars); + if (!wideChars) { + return nullptr; + } + + const wchar_t* cWideChars = wideChars.get(); + std::mbstate_t mb{}; + size_t narrowLen = std::wcsrtombs(nullptr, &cWideChars, 0, &mb); + if (narrowLen == size_t(-1)) { + JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, + JSMSG_CANT_CONVERT_TO_NARROW); + return nullptr; + } + MOZ_ASSERT(std::mbsinit(&mb), + "multi-byte state is in its initial state when no conversion " + "error occured"); + + size_t bufLen = narrowLen + 1; + auto narrow = cx->make_pod_array(bufLen); + if (!narrow) { + return nullptr; + } + + mozilla::DebugOnly actualLen = + std::wcsrtombs(narrow.get(), &cWideChars, bufLen, &mb); + MOZ_ASSERT(narrowLen == actualLen); + MOZ_ASSERT(narrow[actualLen] == '\0'); + + return narrow; +} + +JS_PUBLIC_API JS::UniqueWideChars JS::EncodeUtf8ToWide(JSContext* cx, + const char* chars) { + // Only valid UTF-8 strings should be passed to this function. + MOZ_ASSERT(mozilla::IsUtf8(mozilla::Span(chars, strlen(chars)))); + +#ifndef XP_LINUX + // Use the standard codecvt facet to convert from UTF-8 to a wide string. + std::codecvt_utf8 cv; + + size_t len = strlen(chars); + auto wideChars = cx->make_pod_array(len + 1); + if (!wideChars) { + return nullptr; + } + + // STL returns |codecvt_base::partial| for empty strings. Nothing has been + // written to |wideChars| yet, so null-terminate it before returning. + if (len == 0) { + wideChars[0] = '\0'; + return wideChars; + } + + std::mbstate_t mb{}; + const char* fromNext; + wchar_t* toNext; + std::codecvt_base::result result = + cv.in(mb, chars, chars + len, fromNext, wideChars.get(), + wideChars.get() + len, toNext); + if (result != std::codecvt_base::ok) { + MOZ_ASSERT(result == std::codecvt_base::error); + JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, + JSMSG_CANT_CONVERT_UTF8_TO_WIDE); + return nullptr; + } + *toNext = '\0'; // Explicit null-termination required.
+ + return wideChars; +#else + static_assert(sizeof(wchar_t) == 4, + "Assume wchar_t is UTF-32 on Linux systems"); + + size_t len = strlen(chars); + auto wideChars = cx->make_pod_array(len + 1); + if (!wideChars) { + return nullptr; + } + + const auto* s = reinterpret_cast(chars); + const auto* const limit = s + len; + + wchar_t* dst = wideChars.get(); + while (s < limit) { + unsigned char c = *s++; + + if (mozilla::IsAscii(c)) { + *dst++ = wchar_t(c); + continue; + } + + mozilla::Utf8Unit utf8(c); + mozilla::Maybe codePoint = + mozilla::DecodeOneUtf8CodePoint(utf8, &s, limit); + MOZ_ASSERT(codePoint.isSome()); + *dst++ = wchar_t(*codePoint); + } + *dst++ = '\0'; + + return wideChars; +#endif +} + bool StringBuffer::append(const Utf8Unit* units, size_t len) { MOZ_ASSERT(maybeCx_);