зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1492090
- Part 4: Add encoding functions to/from UTF-8 to system narrow/wide encoding. r=nbp
Differential Revision: https://phabricator.services.mozilla.com/D151448
This commit is contained in:
Родитель
f627124742
Коммит
5c1fd21f5e
|
@ -341,6 +341,40 @@ extern JS_PUBLIC_API bool StringIsASCII(const char* s);
|
|||
*/
|
||||
extern JS_PUBLIC_API bool StringIsASCII(mozilla::Span<const char> s);
|
||||
|
||||
/**
|
||||
* Encode a narrow multibyte character string to a UTF-8 string.
|
||||
*
|
||||
* NOTE: Should only be used when interacting with POSIX/OS functions and not
|
||||
* for encoding ASCII/Latin-1/etc. strings to UTF-8.
|
||||
*/
|
||||
extern JS_PUBLIC_API JS::UniqueChars EncodeNarrowToUtf8(JSContext* cx,
|
||||
const char* chars);
|
||||
|
||||
/**
|
||||
* Encode a wide string to a UTF-8 string.
|
||||
*
|
||||
* NOTE: Should only be used when interacting with Windows API functions.
|
||||
*/
|
||||
extern JS_PUBLIC_API JS::UniqueChars EncodeWideToUtf8(JSContext* cx,
|
||||
const wchar_t* chars);
|
||||
|
||||
/**
|
||||
* Encode a UTF-8 string to a narrow multibyte character string.
|
||||
*
|
||||
* NOTE: Should only be used when interacting with POSIX/OS functions and not
|
||||
* for encoding UTF-8 to ASCII/Latin-1/etc. strings.
|
||||
*/
|
||||
extern JS_PUBLIC_API JS::UniqueChars EncodeUtf8ToNarrow(JSContext* cx,
|
||||
const char* chars);
|
||||
|
||||
/**
|
||||
* Encode a UTF-8 string to a wide string.
|
||||
*
|
||||
* NOTE: Should only be used when interacting with Windows API functions.
|
||||
*/
|
||||
extern JS_PUBLIC_API JS::UniqueWideChars EncodeUtf8ToWide(JSContext* cx,
|
||||
const char* chars);
|
||||
|
||||
} // namespace JS
|
||||
|
||||
// Hands the pointer held by |ptr| to js_free().
inline void JS_free(JS::Latin1CharsZ& ptr) {
  void* rawChars = (void*)ptr.get();
  js_free(rawChars);
}
|
||||
|
|
|
@ -632,9 +632,10 @@ struct FreePolicy {
|
|||
void operator()(const void* ptr) { js_free(const_cast<void*>(ptr)); }
|
||||
};
|
||||
|
||||
typedef mozilla::UniquePtr<char[], JS::FreePolicy> UniqueChars;
|
||||
typedef mozilla::UniquePtr<char16_t[], JS::FreePolicy> UniqueTwoByteChars;
|
||||
typedef mozilla::UniquePtr<JS::Latin1Char[], JS::FreePolicy> UniqueLatin1Chars;
|
||||
using UniqueChars = mozilla::UniquePtr<char[], JS::FreePolicy>;
|
||||
using UniqueTwoByteChars = mozilla::UniquePtr<char16_t[], JS::FreePolicy>;
|
||||
using UniqueLatin1Chars = mozilla::UniquePtr<JS::Latin1Char[], JS::FreePolicy>;
|
||||
using UniqueWideChars = mozilla::UniquePtr<wchar_t[], JS::FreePolicy>;
|
||||
|
||||
} // namespace JS
|
||||
|
||||
|
|
|
@ -400,6 +400,12 @@ MSG_DEF(JSMSG_BAD_TRAILING_UTF8_UNIT, 1, JSEXN_SYNTAXERR, "bad trailing UTF-8 b
|
|||
MSG_DEF(JSMSG_FORBIDDEN_UTF8_CODE_POINT,2,JSEXN_SYNTAXERR, "{0} isn't a valid code point because {1}")
|
||||
MSG_DEF(JSMSG_BAD_CODE_UNITS, 1, JSEXN_NOTE, "the code units comprising this invalid code point were: {0}")
|
||||
|
||||
// System encoding errors
|
||||
MSG_DEF(JSMSG_CANT_CONVERT_TO_NARROW, 0, JSEXN_NOTE, "can't convert to narrow string")
|
||||
MSG_DEF(JSMSG_CANT_CONVERT_TO_WIDE, 0, JSEXN_NOTE, "can't convert to wide string")
|
||||
MSG_DEF(JSMSG_CANT_CONVERT_WIDE_TO_UTF8, 0, JSEXN_NOTE, "can't convert wide string to UTF-8")
|
||||
MSG_DEF(JSMSG_CANT_CONVERT_UTF8_TO_WIDE, 0, JSEXN_NOTE, "can't convert UTF-8 to wide string")
|
||||
|
||||
// SmooshMonkey
|
||||
MSG_DEF(JSMSG_SMOOSH_COMPILE_ERROR, 1, JSEXN_SYNTAXERR, "{0}")
|
||||
MSG_DEF(JSMSG_SMOOSH_UNIMPLEMENTED, 1, JSEXN_INTERNALERR, "{0}")
|
||||
|
|
|
@ -65,6 +65,7 @@ using JS::Latin1Char;
|
|||
using JS::UniqueChars;
|
||||
using JS::UniqueLatin1Chars;
|
||||
using JS::UniqueTwoByteChars;
|
||||
using JS::UniqueWideChars;
|
||||
|
||||
using JS::Ok;
|
||||
using JS::OOM;
|
||||
|
|
|
@ -6,14 +6,23 @@
|
|||
|
||||
#include "js/CharacterEncoding.h"
|
||||
|
||||
#include "mozilla/CheckedInt.h"
|
||||
#include "mozilla/DebugOnly.h"
|
||||
#include "mozilla/Latin1.h"
|
||||
#include "mozilla/Maybe.h"
|
||||
#include "mozilla/Range.h"
|
||||
#include "mozilla/Span.h"
|
||||
#include "mozilla/Sprintf.h"
|
||||
#include "mozilla/TextUtils.h"
|
||||
#include "mozilla/Utf8.h"
|
||||
|
||||
#ifndef XP_LINUX
|
||||
// We still support libstdc++ versions without codecvt support on Linux.
|
||||
# include <codecvt>
|
||||
#endif
|
||||
#include <cwchar>
|
||||
#include <limits>
|
||||
#include <locale>
|
||||
#include <type_traits>
|
||||
|
||||
#include "frontend/FrontendContext.h"
|
||||
|
@ -587,6 +596,230 @@ bool JS::StringIsASCII(const char* s) {
|
|||
|
||||
// Reports whether |s| is ASCII, as decided by IsAscii().
bool JS::StringIsASCII(Span<const char> s) {
  const bool allAscii = IsAscii(s);
  return allAscii;
}
|
||||
|
||||
// Encode the null-terminated narrow multibyte string |chars| as UTF-8.
//
// Returns nullptr if the narrow-to-wide conversion fails (reported as
// JSMSG_CANT_CONVERT_TO_WIDE), if allocation fails, or if EncodeWideToUtf8()
// returns nullptr.
JS_PUBLIC_API JS::UniqueChars JS::EncodeNarrowToUtf8(JSContext* cx,
                                                     const char* chars) {
  // Two-step conversion: narrow multibyte -> wide via std::mbsrtowcs(), then
  // wide -> UTF-8 via EncodeWideToUtf8().

  std::mbstate_t state{};

  // Measure the wide length through a copy of |chars|. On Android NDK older
  // than v16 the source-pointer argument is overwritten even when the
  // destination is nullptr, so |chars| itself must stay untouched for the
  // conversion call further down. See bug 1492090.
  const char* probe = chars;
  const size_t wideLen = std::mbsrtowcs(nullptr, &probe, 0, &state);
  if (wideLen == size_t(-1)) {
    JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
                              JSMSG_CANT_CONVERT_TO_WIDE);
    return nullptr;
  }
  MOZ_ASSERT(std::mbsinit(&state),
             "multi-byte state is in its initial state when no conversion "
             "error occured");

  // One extra slot for the terminating null wide character.
  const size_t capacity = wideLen + 1;
  auto wide = cx->make_pod_array<wchar_t>(capacity);
  if (!wide) {
    return nullptr;
  }

  // The real conversion; it must produce exactly the measured length and
  // leave a terminator behind it.
  mozilla::DebugOnly<size_t> written =
      std::mbsrtowcs(wide.get(), &chars, capacity, &state);
  MOZ_ASSERT(wideLen == written);
  MOZ_ASSERT(wide[written] == '\0');

  return EncodeWideToUtf8(cx, wide.get());
}
|
||||
|
||||
// Encode the null-terminated wide string |chars| as UTF-8.
//
// Returns a newly allocated, null-terminated UTF-8 string. Returns nullptr
// when the output size computation overflows, when allocation fails, or (in
// the codecvt path) when the conversion fails or does not yield valid UTF-8;
// the conversion failures are reported as JSMSG_CANT_CONVERT_WIDE_TO_UTF8.
JS_PUBLIC_API JS::UniqueChars JS::EncodeWideToUtf8(JSContext* cx,
                                                   const wchar_t* chars) {
  using CheckedSizeT = mozilla::CheckedInt<size_t>;

#ifndef XP_LINUX
  // Use the standard codecvt facet to convert a wide string to UTF-8.
  std::codecvt_utf8<wchar_t> cv;

  size_t len = std::wcslen(chars);
  // Worst-case output size, plus one byte for the terminator.
  CheckedSizeT utf8MaxLen = CheckedSizeT(len) * cv.max_length();
  CheckedSizeT utf8BufLen = utf8MaxLen + 1;
  if (!utf8BufLen.isValid()) {
    JS_ReportAllocationOverflow(cx);
    return nullptr;
  }
  auto utf8 = cx->make_pod_array<char>(utf8BufLen.value());
  if (!utf8) {
    return nullptr;
  }

  // STL returns |codecvt_base::partial| for empty strings.
  if (len == 0) {
    // Nothing has written to the buffer yet, so terminate it explicitly;
    // callers expect a null-terminated (here: empty) string.
    utf8[0] = '\0';
    return utf8;
  }

  std::mbstate_t mb{};
  const wchar_t* fromNext;
  char* toNext;
  std::codecvt_base::result result =
      cv.out(mb, chars, chars + len, fromNext, utf8.get(),
             utf8.get() + utf8MaxLen.value(), toNext);
  if (result != std::codecvt_base::ok) {
    MOZ_ASSERT(result == std::codecvt_base::error);
    JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
                              JSMSG_CANT_CONVERT_WIDE_TO_UTF8);
    return nullptr;
  }
  *toNext = '\0';  // Explicit null-termination required.

  // codecvt_utf8 doesn't validate its output and may produce WTF-8 instead
  // of UTF-8 on some platforms when the input contains unpaired surrogate
  // characters. We don't allow this.
  if (!mozilla::IsUtf8(
          mozilla::Span(utf8.get(), size_t(toNext - utf8.get())))) {
    JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
                              JSMSG_CANT_CONVERT_WIDE_TO_UTF8);
    return nullptr;
  }

  return utf8;
#else
  static_assert(sizeof(wchar_t) == 4,
                "Assume wchar_t is UTF-32 on Linux systems");

  constexpr size_t MaxUtf8CharLength = 4;

  size_t len = std::wcslen(chars);
  // Worst-case output size, plus one byte for the terminator.
  CheckedSizeT utf8MaxLen = CheckedSizeT(len) * MaxUtf8CharLength;
  CheckedSizeT utf8BufLen = utf8MaxLen + 1;
  if (!utf8BufLen.isValid()) {
    JS_ReportAllocationOverflow(cx);
    return nullptr;
  }
  auto utf8 = cx->make_pod_array<char>(utf8BufLen.value());
  if (!utf8) {
    return nullptr;
  }

  // Encode each wide character into a small scratch buffer and append the
  // resulting bytes to the output.
  // NOTE(review): unlike the codecvt path above, the output is not checked
  // with mozilla::IsUtf8 here; confirm how OneUcs4ToUtf8Char treats
  // surrogates and values above U+10FFFF.
  char* dst = utf8.get();
  for (size_t i = 0; i < len; i++) {
    uint8_t utf8buf[MaxUtf8CharLength];
    uint32_t utf8Len = OneUcs4ToUtf8Char(utf8buf, chars[i]);
    for (size_t j = 0; j < utf8Len; j++) {
      *dst++ = char(utf8buf[j]);
    }
  }
  *dst = '\0';

  return utf8;
#endif
}
|
||||
|
||||
// Encode the null-terminated UTF-8 string |chars| as a narrow multibyte
// string.
//
// Returns nullptr if EncodeUtf8ToWide() returns nullptr, if the wide-to-narrow
// conversion fails (reported as JSMSG_CANT_CONVERT_TO_NARROW), or if
// allocation fails.
JS_PUBLIC_API JS::UniqueChars JS::EncodeUtf8ToNarrow(JSContext* cx,
                                                     const char* chars) {
  // Convert the UTF-8 string to a wide string via EncodeUtf8ToWide() and
  // then convert the resulting wide string to a narrow multibyte character
  // string.

  auto wideChars = EncodeUtf8ToWide(cx, chars);
  if (!wideChars) {
    return nullptr;
  }

  const wchar_t* cWideChars = wideChars.get();
  std::mbstate_t mb{};

  // Run the length query through a scratch pointer so |cWideChars| is
  // guaranteed to be untouched for the conversion call below. This mirrors
  // the precaution EncodeNarrowToUtf8() takes for std::mbsrtowcs().
  // NOTE(review): not confirmed that any supported libc modifies the source
  // pointer for std::wcsrtombs() with a null destination; on conforming
  // implementations this is a no-op.
  const wchar_t* tmpWideChars = cWideChars;

  size_t narrowLen = std::wcsrtombs(nullptr, &tmpWideChars, 0, &mb);
  if (narrowLen == size_t(-1)) {
    JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
                              JSMSG_CANT_CONVERT_TO_NARROW);
    return nullptr;
  }
  MOZ_ASSERT(std::mbsinit(&mb),
             "multi-byte state is in its initial state when no conversion "
             "error occured");

  // One extra byte for the terminating null character.
  size_t bufLen = narrowLen + 1;
  auto narrow = cx->make_pod_array<char>(bufLen);
  if (!narrow) {
    return nullptr;
  }

  mozilla::DebugOnly<size_t> actualLen =
      std::wcsrtombs(narrow.get(), &cWideChars, bufLen, &mb);
  MOZ_ASSERT(narrowLen == actualLen);
  MOZ_ASSERT(narrow[actualLen] == '\0');

  return narrow;
}
|
||||
|
||||
// Encode the null-terminated UTF-8 string |chars| as a wide string.
//
// |chars| must be valid UTF-8; this is asserted, not checked at runtime.
// Returns a newly allocated, null-terminated wide string. Returns nullptr if
// allocation fails or (in the codecvt path) if the conversion fails, which is
// reported as JSMSG_CANT_CONVERT_UTF8_TO_WIDE.
JS_PUBLIC_API JS::UniqueWideChars JS::EncodeUtf8ToWide(JSContext* cx,
                                                       const char* chars) {
  // Only valid UTF-8 strings should be passed to this function.
  MOZ_ASSERT(mozilla::IsUtf8(mozilla::Span(chars, strlen(chars))));

#ifndef XP_LINUX
  // Use the standard codecvt facet to convert from UTF-8 to a wide string.
  std::codecvt_utf8<wchar_t> cv;

  // The buffer is sized for one wide character per input byte, plus the
  // terminator.
  size_t len = strlen(chars);
  auto wideChars = cx->make_pod_array<wchar_t>(len + 1);
  if (!wideChars) {
    return nullptr;
  }

  // STL returns |codecvt_base::partial| for empty strings.
  if (len == 0) {
    // Nothing has written to the buffer yet, so terminate it explicitly;
    // the other path of this function also returns a terminated empty
    // string for empty input.
    wideChars[0] = L'\0';
    return wideChars;
  }

  std::mbstate_t mb{};
  const char* fromNext;
  wchar_t* toNext;
  std::codecvt_base::result result =
      cv.in(mb, chars, chars + len, fromNext, wideChars.get(),
            wideChars.get() + len, toNext);
  if (result != std::codecvt_base::ok) {
    MOZ_ASSERT(result == std::codecvt_base::error);
    JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr,
                              JSMSG_CANT_CONVERT_UTF8_TO_WIDE);
    return nullptr;
  }
  *toNext = '\0';  // Explicit null-termination required.

  return wideChars;
#else
  static_assert(sizeof(wchar_t) == 4,
                "Assume wchar_t is UTF-32 on Linux systems");

  // The buffer is sized for one wide character per input byte, plus the
  // terminator.
  size_t len = strlen(chars);
  auto wideChars = cx->make_pod_array<wchar_t>(len + 1);
  if (!wideChars) {
    return nullptr;
  }

  const auto* s = reinterpret_cast<const unsigned char*>(chars);
  const auto* const limit = s + len;

  wchar_t* dst = wideChars.get();
  while (s < limit) {
    unsigned char c = *s++;

    // ASCII bytes are copied through directly.
    if (mozilla::IsAscii(c)) {
      *dst++ = wchar_t(c);
      continue;
    }

    // Otherwise |c| is a lead unit; decode the whole code point, which
    // advances |s| past the units it consumes.
    mozilla::Utf8Unit utf8(c);
    mozilla::Maybe<char32_t> codePoint =
        mozilla::DecodeOneUtf8CodePoint(utf8, &s, limit);
    MOZ_ASSERT(codePoint.isSome());
    *dst++ = wchar_t(*codePoint);
  }
  *dst++ = '\0';

  return wideChars;
#endif
}
|
||||
|
||||
bool StringBuffer::append(const Utf8Unit* units, size_t len) {
|
||||
MOZ_ASSERT(maybeCx_);
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче