Bug 1492090 - Part 4: Add encoding functions to/from UTF-8 to system narrow/wide encoding. r=nbp

Differential Revision: https://phabricator.services.mozilla.com/D151448
This commit is contained in:
André Bargull 2023-05-22 12:28:15 +00:00
Родитель f627124742
Коммит 5c1fd21f5e
5 изменённых файлов: 278 добавлений и 3 удалений

Просмотреть файл

@ -341,6 +341,40 @@ extern JS_PUBLIC_API bool StringIsASCII(const char* s);
*/
extern JS_PUBLIC_API bool StringIsASCII(mozilla::Span<const char> s);
/**
* Encode a narrow multibyte character string to a UTF-8 string.
*
* |chars| must be null-terminated. The string is widened with std::mbsrtowcs
* before being encoded, so its bytes are interpreted in the multibyte encoding
* of the current C locale.
*
* Returns nullptr on failure. When |chars| is not a valid multibyte sequence,
* an error is reported on |cx| before returning.
*
* NOTE: Should only be used when interacting with POSIX/OS functions and not
* for encoding ASCII/Latin-1/etc. strings to UTF-8.
*/
extern JS_PUBLIC_API JS::UniqueChars EncodeNarrowToUtf8(JSContext* cx,
const char* chars);
/**
* Encode a wide string to a UTF-8 string.
*
* |chars| must be null-terminated; its length is measured with std::wcslen.
*
* Returns nullptr on failure. An output-size overflow is reported on |cx| as
* an allocation overflow. On non-Linux platforms, input that cannot be
* converted to valid UTF-8 (for example unpaired surrogates) is reported on
* |cx| as a conversion error.
*
* NOTE: Should only be used when interacting with Windows API functions.
*/
extern JS_PUBLIC_API JS::UniqueChars EncodeWideToUtf8(JSContext* cx,
const wchar_t* chars);
/**
* Encode a UTF-8 string to a narrow multibyte character string.
*
* |chars| must be a null-terminated, valid UTF-8 string. It is first converted
* to a wide string with EncodeUtf8ToWide() and then narrowed with
* std::wcsrtombs, so the result uses the multibyte encoding of the current C
* locale.
*
* Returns nullptr on failure. When the wide string cannot be narrowed, an
* error is reported on |cx| before returning.
*
* NOTE: Should only be used when interacting with POSIX/OS functions and not
* for encoding UTF-8 to ASCII/Latin-1/etc. strings.
*/
extern JS_PUBLIC_API JS::UniqueChars EncodeUtf8ToNarrow(JSContext* cx,
const char* chars);
/**
* Encode a UTF-8 string to a wide string.
*
* |chars| must be a null-terminated, valid UTF-8 string; the implementation
* asserts this.
*
* Returns nullptr on failure. On non-Linux platforms a failed conversion is
* reported on |cx| before returning.
*
* NOTE: Should only be used when interacting with Windows API functions.
*/
extern JS_PUBLIC_API JS::UniqueWideChars EncodeUtf8ToWide(JSContext* cx,
const char* chars);
} // namespace JS
// Hands the pointer held by |ptr| to js_free().
inline void JS_free(JS::Latin1CharsZ& ptr) {
  void* rawBuffer = (void*)ptr.get();
  js_free(rawBuffer);
}

Просмотреть файл

@ -632,9 +632,10 @@ struct FreePolicy {
void operator()(const void* ptr) { js_free(const_cast<void*>(ptr)); }
};
// Owning pointers to character buffers. All of them release their buffer
// through JS::FreePolicy, which forwards to js_free().
using UniqueChars = mozilla::UniquePtr<char[], JS::FreePolicy>;
using UniqueTwoByteChars = mozilla::UniquePtr<char16_t[], JS::FreePolicy>;
using UniqueLatin1Chars = mozilla::UniquePtr<JS::Latin1Char[], JS::FreePolicy>;
// Buffer of wchar_t units, e.g. the result of JS::EncodeUtf8ToWide().
using UniqueWideChars = mozilla::UniquePtr<wchar_t[], JS::FreePolicy>;
} // namespace JS

Просмотреть файл

@ -400,6 +400,12 @@ MSG_DEF(JSMSG_BAD_TRAILING_UTF8_UNIT, 1, JSEXN_SYNTAXERR, "bad trailing UTF-8 b
MSG_DEF(JSMSG_FORBIDDEN_UTF8_CODE_POINT,2,JSEXN_SYNTAXERR, "{0} isn't a valid code point because {1}")
MSG_DEF(JSMSG_BAD_CODE_UNITS, 1, JSEXN_NOTE, "the code units comprising this invalid code point were: {0}")
// System encoding errors
//
// Reported by the JS::EncodeNarrowToUtf8 / JS::EncodeWideToUtf8 /
// JS::EncodeUtf8ToNarrow / JS::EncodeUtf8ToWide helpers when a conversion
// between UTF-8 and the system narrow/wide encoding fails. None of these
// messages takes a format argument (the second field is 0).
MSG_DEF(JSMSG_CANT_CONVERT_TO_NARROW, 0, JSEXN_NOTE, "can't convert to narrow string")
MSG_DEF(JSMSG_CANT_CONVERT_TO_WIDE, 0, JSEXN_NOTE, "can't convert to wide string")
MSG_DEF(JSMSG_CANT_CONVERT_WIDE_TO_UTF8, 0, JSEXN_NOTE, "can't convert wide string to UTF-8")
MSG_DEF(JSMSG_CANT_CONVERT_UTF8_TO_WIDE, 0, JSEXN_NOTE, "can't convert UTF-8 to wide string")
// SmooshMonkey
MSG_DEF(JSMSG_SMOOSH_COMPILE_ERROR, 1, JSEXN_SYNTAXERR, "{0}")
MSG_DEF(JSMSG_SMOOSH_UNIMPLEMENTED, 1, JSEXN_INTERNALERR, "{0}")

Просмотреть файл

@ -65,6 +65,7 @@ using JS::Latin1Char;
using JS::UniqueChars;
using JS::UniqueLatin1Chars;
using JS::UniqueTwoByteChars;
using JS::UniqueWideChars;
using JS::Ok;
using JS::OOM;

Просмотреть файл

@ -6,14 +6,23 @@
#include "js/CharacterEncoding.h"
#include "mozilla/CheckedInt.h"
#include "mozilla/DebugOnly.h"
#include "mozilla/Latin1.h"
#include "mozilla/Maybe.h"
#include "mozilla/Range.h"
#include "mozilla/Span.h"
#include "mozilla/Sprintf.h"
#include "mozilla/TextUtils.h"
#include "mozilla/Utf8.h"
#ifndef XP_LINUX
// We still support libstdc++ versions without <codecvt> support on Linux.
# include <codecvt>
#endif
#include <cwchar>
#include <limits>
#include <locale>
#include <type_traits>
#include "frontend/FrontendContext.h"
@ -587,6 +596,230 @@ bool JS::StringIsASCII(const char* s) {
// Span overload: forwards the whole span to IsAscii() and returns its answer.
bool JS::StringIsASCII(Span<const char> s) {
  const bool onlyAscii = IsAscii(s);
  return onlyAscii;
}
// Converts the null-terminated narrow multibyte string |chars| to UTF-8.
//
// The work is done in two steps: the narrow string is widened with
// std::mbsrtowcs, and the wide string is then handed to EncodeWideToUtf8().
// Returns nullptr on failure; an invalid multibyte sequence is reported on
// |cx|.
JS_PUBLIC_API JS::UniqueChars JS::EncodeNarrowToUtf8(JSContext* cx,
                                                     const char* chars) {
  std::mbstate_t state{};

  // Measuring pass: a null destination only counts the wide characters.
  //
  // NOTE: The 2nd parameter is overwritten even if the 1st parameter is
  // nullptr on Android NDK older than v16, so the measuring pass works on a
  // copy of the pointer and leaves |chars| intact for the converting pass.
  // See bug 1492090.
  const char* probe = chars;
  const size_t wideLength = std::mbsrtowcs(nullptr, &probe, 0, &state);
  if (wideLength == static_cast<size_t>(-1)) {
    JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
                              JSMSG_CANT_CONVERT_TO_WIDE);
    return nullptr;
  }
  MOZ_ASSERT(std::mbsinit(&state),
             "multi-byte state is in its initial state when no conversion "
             "error occured");

  // One extra slot for the terminating null character.
  const size_t capacity = wideLength + 1;
  auto wide = cx->make_pod_array<wchar_t>(capacity);
  if (!wide) {
    return nullptr;
  }

  // Converting pass: must produce exactly what the measuring pass promised.
  mozilla::DebugOnly<size_t> written =
      std::mbsrtowcs(wide.get(), &chars, capacity, &state);
  MOZ_ASSERT(wideLength == written);
  MOZ_ASSERT(wide[written] == '\0');

  return EncodeWideToUtf8(cx, wide.get());
}
// Encodes the null-terminated wide string |chars| as UTF-8.
//
// Returns a newly allocated, null-terminated UTF-8 buffer, or nullptr on
// failure. Failures that are reported on |cx|:
//   - the worst-case output size overflows size_t (allocation overflow);
//   - the input can't be represented as valid UTF-8, e.g. it contains
//     unpaired surrogates or values above U+10FFFF
//     (JSMSG_CANT_CONVERT_WIDE_TO_UTF8).
JS_PUBLIC_API JS::UniqueChars JS::EncodeWideToUtf8(JSContext* cx,
                                                   const wchar_t* chars) {
  using CheckedSizeT = mozilla::CheckedInt<size_t>;

#ifndef XP_LINUX
  // Use the standard codecvt facet to convert a wide string to UTF-8.
  std::codecvt_utf8<wchar_t> cv;

  size_t len = std::wcslen(chars);

  // Worst case: every wide character needs cv.max_length() bytes, plus one
  // byte for the terminating null.
  CheckedSizeT utf8MaxLen = CheckedSizeT(len) * cv.max_length();
  CheckedSizeT utf8BufLen = utf8MaxLen + 1;
  if (!utf8BufLen.isValid()) {
    JS_ReportAllocationOverflow(cx);
    return nullptr;
  }
  auto utf8 = cx->make_pod_array<char>(utf8BufLen.value());
  if (!utf8) {
    return nullptr;
  }

  // STL returns |codecvt_base::partial| for empty strings, so skip the
  // conversion. The buffer still has to be null-terminated explicitly, just
  // like after a successful conversion below.
  if (len == 0) {
    utf8[0] = '\0';
    return utf8;
  }

  std::mbstate_t mb{};
  const wchar_t* fromNext;
  char* toNext;
  std::codecvt_base::result result =
      cv.out(mb, chars, chars + len, fromNext, utf8.get(),
             utf8.get() + utf8MaxLen.value(), toNext);
  if (result != std::codecvt_base::ok) {
    MOZ_ASSERT(result == std::codecvt_base::error);
    JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
                              JSMSG_CANT_CONVERT_WIDE_TO_UTF8);
    return nullptr;
  }
  *toNext = '\0';  // Explicit null-termination required.

  // codecvt_utf8 doesn't validate its output and may produce WTF-8 instead
  // of UTF-8 on some platforms when the input contains unpaired surrogate
  // characters. We don't allow this.
  if (!mozilla::IsUtf8(
          mozilla::Span(utf8.get(), size_t(toNext - utf8.get())))) {
    JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
                              JSMSG_CANT_CONVERT_WIDE_TO_UTF8);
    return nullptr;
  }

  return utf8;
#else
  static_assert(sizeof(wchar_t) == 4,
                "Assume wchar_t is UTF-32 on Linux systems");

  constexpr size_t MaxUtf8CharLength = 4;
  constexpr uint32_t MaxCodePoint = 0x10FFFF;
  constexpr uint32_t SurrogateMin = 0xD800;
  constexpr uint32_t SurrogateMax = 0xDFFF;

  size_t len = std::wcslen(chars);

  // Worst case: four bytes per code point, plus the terminating null.
  CheckedSizeT utf8MaxLen = CheckedSizeT(len) * MaxUtf8CharLength;
  CheckedSizeT utf8BufLen = utf8MaxLen + 1;
  if (!utf8BufLen.isValid()) {
    JS_ReportAllocationOverflow(cx);
    return nullptr;
  }
  auto utf8 = cx->make_pod_array<char>(utf8BufLen.value());
  if (!utf8) {
    return nullptr;
  }

  char* dst = utf8.get();
  for (size_t i = 0; i < len; i++) {
    // Only Unicode scalar values have a UTF-8 encoding, and only those are
    // guaranteed to fit into MaxUtf8CharLength bytes. Reject surrogates and
    // values above U+10FFFF (negative wchar_t values become large unsigned
    // values here), matching the non-Linux branch which refuses to return
    // anything that isn't valid UTF-8.
    const uint32_t codePoint = static_cast<uint32_t>(chars[i]);
    if (codePoint > MaxCodePoint ||
        (codePoint >= SurrogateMin && codePoint <= SurrogateMax)) {
      JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
                                JSMSG_CANT_CONVERT_WIDE_TO_UTF8);
      return nullptr;
    }

    uint8_t utf8buf[MaxUtf8CharLength];
    uint32_t utf8Len = OneUcs4ToUtf8Char(utf8buf, char32_t(codePoint));
    for (size_t j = 0; j < utf8Len; j++) {
      *dst++ = char(utf8buf[j]);
    }
  }
  *dst = '\0';

  return utf8;
#endif
}
// Converts the null-terminated UTF-8 string |chars| to a narrow multibyte
// string.
//
// The UTF-8 input is first turned into a wide string by EncodeUtf8ToWide();
// that wide string is then narrowed with std::wcsrtombs. Returns nullptr on
// failure; a wide string that cannot be narrowed is reported on |cx|.
JS_PUBLIC_API JS::UniqueChars JS::EncodeUtf8ToNarrow(JSContext* cx,
                                                     const char* chars) {
  auto wide = EncodeUtf8ToWide(cx, chars);
  if (!wide) {
    return nullptr;
  }

  std::mbstate_t state{};
  const wchar_t* cursor = wide.get();

  // Measuring pass: a null destination only counts the bytes needed.
  const size_t narrowLength = std::wcsrtombs(nullptr, &cursor, 0, &state);
  if (narrowLength == static_cast<size_t>(-1)) {
    JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
                              JSMSG_CANT_CONVERT_TO_NARROW);
    return nullptr;
  }
  MOZ_ASSERT(std::mbsinit(&state),
             "multi-byte state is in its initial state when no conversion "
             "error occured");

  // One extra byte for the terminating null character.
  const size_t capacity = narrowLength + 1;
  auto narrowChars = cx->make_pod_array<char>(capacity);
  if (!narrowChars) {
    return nullptr;
  }

  // Converting pass: must produce exactly what the measuring pass promised.
  mozilla::DebugOnly<size_t> written =
      std::wcsrtombs(narrowChars.get(), &cursor, capacity, &state);
  MOZ_ASSERT(narrowLength == written);
  MOZ_ASSERT(narrowChars[written] == '\0');

  return narrowChars;
}
// Converts the null-terminated UTF-8 string |chars| to a wide string.
//
// Callers must pass valid UTF-8; this is asserted. Returns a newly allocated,
// null-terminated wide buffer, or nullptr on failure. A failed conversion is
// reported on |cx| as JSMSG_CANT_CONVERT_UTF8_TO_WIDE.
JS_PUBLIC_API JS::UniqueWideChars JS::EncodeUtf8ToWide(JSContext* cx,
                                                       const char* chars) {
  // Only valid UTF-8 strings should be passed to this function.
  MOZ_ASSERT(mozilla::IsUtf8(mozilla::Span(chars, strlen(chars))));

#ifndef XP_LINUX
  // Use the standard codecvt facet to convert from UTF-8 to a wide string.
  std::codecvt_utf8<wchar_t> cv;

  // Every wide character consumes at least one UTF-8 byte, so |len| wide
  // characters plus the terminating null are always enough.
  size_t len = strlen(chars);
  auto wideChars = cx->make_pod_array<wchar_t>(len + 1);
  if (!wideChars) {
    return nullptr;
  }

  // STL returns |codecvt_base::partial| for empty strings, so skip the
  // conversion. The buffer still has to be null-terminated explicitly, just
  // like after a successful conversion below.
  if (len == 0) {
    wideChars[0] = L'\0';
    return wideChars;
  }

  std::mbstate_t mb{};
  const char* fromNext;
  wchar_t* toNext;
  std::codecvt_base::result result =
      cv.in(mb, chars, chars + len, fromNext, wideChars.get(),
            wideChars.get() + len, toNext);
  if (result != std::codecvt_base::ok) {
    MOZ_ASSERT(result == std::codecvt_base::error);
    JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
                              JSMSG_CANT_CONVERT_UTF8_TO_WIDE);
    return nullptr;
  }
  *toNext = '\0';  // Explicit null-termination required.

  return wideChars;
#else
  static_assert(sizeof(wchar_t) == 4,
                "Assume wchar_t is UTF-32 on Linux systems");

  // Every code point consumes at least one UTF-8 byte, so |len| wide
  // characters plus the terminating null are always enough.
  size_t len = strlen(chars);
  auto wideChars = cx->make_pod_array<wchar_t>(len + 1);
  if (!wideChars) {
    return nullptr;
  }

  const auto* s = reinterpret_cast<const unsigned char*>(chars);
  const auto* const limit = s + len;

  wchar_t* dst = wideChars.get();
  while (s < limit) {
    unsigned char c = *s++;

    // Fast path: ASCII bytes map to themselves.
    if (mozilla::IsAscii(c)) {
      *dst++ = wchar_t(c);
      continue;
    }

    // Multi-byte sequence: |c| is the lead unit, the decoder consumes the
    // trailing units and advances |s| past them.
    mozilla::Utf8Unit utf8(c);
    mozilla::Maybe<char32_t> codePoint =
        mozilla::DecodeOneUtf8CodePoint(utf8, &s, limit);
    MOZ_ASSERT(codePoint.isSome());
    if (!codePoint.isSome()) {
      // Invalid UTF-8 is a caller bug (see the assertion at the top), but
      // when assertions are compiled out fail the same way the non-Linux
      // branch does instead of reading an empty Maybe.
      JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
                                JSMSG_CANT_CONVERT_UTF8_TO_WIDE);
      return nullptr;
    }
    *dst++ = wchar_t(*codePoint);
  }
  *dst = L'\0';

  return wideChars;
#endif
}
bool StringBuffer::append(const Utf8Unit* units, size_t len) {
MOZ_ASSERT(maybeCx_);