2014-05-05 21:30:46 +04:00
|
|
|
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
|
|
|
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
|
2012-05-21 15:12:37 +04:00
|
|
|
/* This Source Code Form is subject to the terms of the Mozilla Public
|
|
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
|
|
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
2003-05-22 02:20:27 +04:00
|
|
|
#ifndef nsUTF8Utils_h_
|
|
|
|
#define nsUTF8Utils_h_
|
|
|
|
|
2007-12-31 18:15:43 +03:00
|
|
|
// This file may be used in two ways: if MOZILLA_INTERNAL_API is defined, this
|
|
|
|
// file will provide signatures for the Mozilla abstract string types. It will
|
|
|
|
// use XPCOM assertion/debugging macros, etc.
|
|
|
|
|
|
|
|
#include <type_traits>

#include "nscore.h"
#include "mozilla/Assertions.h"
#include "mozilla/EndianUtils.h"
#include "mozilla/TypeTraits.h"

#include "nsCharTraits.h"
|
|
|
|
|
2016-11-04 21:12:15 +03:00
|
|
|
#ifdef MOZILLA_INTERNAL_API
|
|
|
|
#define UTF8UTILS_WARNING(msg) NS_WARNING(msg)
|
|
|
|
#else
|
|
|
|
#define UTF8UTILS_WARNING(msg)
|
|
|
|
#endif
|
|
|
|
|
2003-05-22 02:20:27 +04:00
|
|
|
class UTF8traits {
|
2014-05-05 21:30:46 +04:00
|
|
|
public:
|
2014-05-27 11:15:35 +04:00
|
|
|
static bool isASCII(char aChar) { return (aChar & 0x80) == 0x00; }
|
|
|
|
static bool isInSeq(char aChar) { return (aChar & 0xC0) == 0x80; }
|
|
|
|
static bool is2byte(char aChar) { return (aChar & 0xE0) == 0xC0; }
|
|
|
|
static bool is3byte(char aChar) { return (aChar & 0xF0) == 0xE0; }
|
|
|
|
static bool is4byte(char aChar) { return (aChar & 0xF8) == 0xF0; }
|
|
|
|
static bool is5byte(char aChar) { return (aChar & 0xFC) == 0xF8; }
|
|
|
|
static bool is6byte(char aChar) { return (aChar & 0xFE) == 0xFC; }
|
2018-03-16 22:57:00 +03:00
|
|
|
// return the number of bytes in a sequence beginning with aChar
|
|
|
|
static int bytes(char aChar) {
|
|
|
|
if (isASCII(aChar)) {
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
if (is2byte(aChar)) {
|
|
|
|
return 2;
|
|
|
|
}
|
|
|
|
if (is3byte(aChar)) {
|
|
|
|
return 3;
|
|
|
|
}
|
|
|
|
if (is4byte(aChar)) {
|
|
|
|
return 4;
|
|
|
|
}
|
|
|
|
MOZ_ASSERT_UNREACHABLE("should not be used for in-sequence characters");
|
|
|
|
return 1;
|
|
|
|
}
|
2014-05-05 21:30:46 +04:00
|
|
|
};
|
2003-05-22 02:20:27 +04:00
|
|
|
|
2005-11-04 22:52:18 +03:00
|
|
|
/**
|
2018-07-06 10:44:43 +03:00
|
|
|
* Extract the next Unicode scalar value from the buffer and return it. The
|
2005-11-04 22:52:18 +03:00
|
|
|
* pointer passed in is advanced to the start of the next character in the
|
2018-07-06 10:44:43 +03:00
|
|
|
* buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced
|
|
|
|
* over the maximal valid prefix and *aErr is set to true (if aErr is not
|
|
|
|
* null).
|
|
|
|
*
|
|
|
|
* Note: This method never sets *aErr to false to allow error accumulation
|
|
|
|
* across multiple calls.
|
|
|
|
*
|
|
|
|
* Precondition: *aBuffer < aEnd
|
2005-11-04 22:52:18 +03:00
|
|
|
*/
|
|
|
|
class UTF8CharEnumerator {
|
|
|
|
public:
|
2018-07-06 10:44:43 +03:00
|
|
|
static inline char32_t NextChar(const char** aBuffer, const char* aEnd,
|
|
|
|
bool* aErr = nullptr) {
|
|
|
|
MOZ_ASSERT(aBuffer, "null buffer pointer pointer");
|
|
|
|
MOZ_ASSERT(aEnd, "null end pointer");
|
2005-11-04 22:52:18 +03:00
|
|
|
|
2018-07-06 10:44:43 +03:00
|
|
|
const unsigned char* p = reinterpret_cast<const unsigned char*>(*aBuffer);
|
|
|
|
const unsigned char* end = reinterpret_cast<const unsigned char*>(aEnd);
|
2005-11-04 22:52:18 +03:00
|
|
|
|
2018-07-06 10:44:43 +03:00
|
|
|
MOZ_ASSERT(p, "null buffer");
|
|
|
|
MOZ_ASSERT(p < end, "Bogus range");
|
2005-11-04 22:52:18 +03:00
|
|
|
|
2018-07-06 10:44:43 +03:00
|
|
|
unsigned char first = *p++;
|
2010-02-23 20:38:10 +03:00
|
|
|
|
2018-07-06 10:44:43 +03:00
|
|
|
if (MOZ_LIKELY(first < 0x80U)) {
|
|
|
|
*aBuffer = reinterpret_cast<const char*>(p);
|
|
|
|
return first;
|
2005-11-04 22:52:18 +03:00
|
|
|
}
|
|
|
|
|
2018-07-06 10:44:43 +03:00
|
|
|
// Unsigned underflow is defined behavior
|
|
|
|
if (MOZ_UNLIKELY((p == end) || ((first - 0xC2U) >= (0xF5U - 0xC2U)))) {
|
|
|
|
*aBuffer = reinterpret_cast<const char*>(p);
|
|
|
|
if (aErr) {
|
2014-05-27 11:15:35 +04:00
|
|
|
*aErr = true;
|
2005-11-04 22:52:18 +03:00
|
|
|
}
|
2018-07-06 10:44:43 +03:00
|
|
|
return 0xFFFDU;
|
2014-05-05 21:30:46 +04:00
|
|
|
}
|
2005-11-04 22:52:18 +03:00
|
|
|
|
2018-07-06 10:44:43 +03:00
|
|
|
unsigned char second = *p;
|
2005-11-04 22:52:18 +03:00
|
|
|
|
2018-07-06 10:44:43 +03:00
|
|
|
if (first < 0xE0U) {
|
|
|
|
// Two-byte
|
|
|
|
if (MOZ_LIKELY((second & 0xC0U) == 0x80U)) {
|
|
|
|
*aBuffer = reinterpret_cast<const char*>(++p);
|
|
|
|
return ((uint32_t(first) & 0x1FU) << 6) | (uint32_t(second) & 0x3FU);
|
|
|
|
}
|
|
|
|
*aBuffer = reinterpret_cast<const char*>(p);
|
2014-05-27 11:15:35 +04:00
|
|
|
if (aErr) {
|
|
|
|
*aErr = true;
|
|
|
|
}
|
2018-07-06 10:44:43 +03:00
|
|
|
return 0xFFFDU;
|
2014-05-05 21:30:46 +04:00
|
|
|
}
|
2005-11-04 22:52:18 +03:00
|
|
|
|
2018-07-06 10:44:43 +03:00
|
|
|
if (MOZ_LIKELY(first < 0xF0U)) {
|
|
|
|
// Three-byte
|
|
|
|
unsigned char lower = 0x80U;
|
|
|
|
unsigned char upper = 0xBFU;
|
|
|
|
if (first == 0xE0U) {
|
|
|
|
lower = 0xA0U;
|
|
|
|
} else if (first == 0xEDU) {
|
|
|
|
upper = 0x9FU;
|
2014-05-27 11:15:35 +04:00
|
|
|
}
|
2018-07-06 10:44:43 +03:00
|
|
|
if (MOZ_LIKELY(second >= lower && second <= upper)) {
|
|
|
|
if (MOZ_LIKELY(p != end)) {
|
|
|
|
unsigned char third = *++p;
|
|
|
|
if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) {
|
|
|
|
*aBuffer = reinterpret_cast<const char*>(++p);
|
|
|
|
return ((uint32_t(first) & 0xFU) << 12) |
|
|
|
|
((uint32_t(second) & 0x3FU) << 6) |
|
|
|
|
(uint32_t(third) & 0x3FU);
|
|
|
|
}
|
2014-05-27 11:15:35 +04:00
|
|
|
}
|
2005-11-04 22:52:18 +03:00
|
|
|
}
|
2018-07-06 10:44:43 +03:00
|
|
|
*aBuffer = reinterpret_cast<const char*>(p);
|
2014-05-27 11:15:35 +04:00
|
|
|
if (aErr) {
|
|
|
|
*aErr = true;
|
|
|
|
}
|
2018-07-06 10:44:43 +03:00
|
|
|
return 0xFFFDU;
|
2014-05-05 21:30:46 +04:00
|
|
|
}
|
2005-11-04 22:52:18 +03:00
|
|
|
|
2018-07-06 10:44:43 +03:00
|
|
|
// Four-byte
|
|
|
|
unsigned char lower = 0x80U;
|
|
|
|
unsigned char upper = 0xBFU;
|
|
|
|
if (first == 0xF0U) {
|
|
|
|
lower = 0x90U;
|
|
|
|
} else if (first == 0xF4U) {
|
|
|
|
upper = 0x8FU;
|
2014-05-05 21:30:46 +04:00
|
|
|
}
|
2018-07-06 10:44:43 +03:00
|
|
|
if (MOZ_LIKELY(second >= lower && second <= upper)) {
|
|
|
|
if (MOZ_LIKELY(p != end)) {
|
|
|
|
unsigned char third = *++p;
|
|
|
|
if (MOZ_LIKELY((third & 0xC0U) == 0x80U)) {
|
|
|
|
if (MOZ_LIKELY(p != end)) {
|
|
|
|
unsigned char fourth = *++p;
|
|
|
|
if (MOZ_LIKELY((fourth & 0xC0U) == 0x80U)) {
|
|
|
|
*aBuffer = reinterpret_cast<const char*>(++p);
|
|
|
|
return ((uint32_t(first) & 0x7U) << 18) |
|
|
|
|
((uint32_t(second) & 0x3FU) << 12) |
|
|
|
|
((uint32_t(third) & 0x3FU) << 6) |
|
|
|
|
(uint32_t(fourth) & 0x3FU);
|
|
|
|
}
|
2014-05-27 11:15:35 +04:00
|
|
|
}
|
2014-05-05 21:30:46 +04:00
|
|
|
}
|
2003-05-22 02:20:27 +04:00
|
|
|
}
|
2014-05-05 21:30:46 +04:00
|
|
|
}
|
2018-07-06 10:44:43 +03:00
|
|
|
*aBuffer = reinterpret_cast<const char*>(p);
|
|
|
|
if (aErr) {
|
|
|
|
*aErr = true;
|
2014-05-05 21:30:46 +04:00
|
|
|
}
|
2018-07-06 10:44:43 +03:00
|
|
|
return 0xFFFDU;
|
2014-05-05 21:30:46 +04:00
|
|
|
}
|
|
|
|
};
|
2003-05-22 02:20:27 +04:00
|
|
|
|
2003-06-11 08:27:13 +04:00
|
|
|
/**
|
2018-07-06 10:44:43 +03:00
|
|
|
* Extract the next Unicode scalar value from the buffer and return it. The
|
|
|
|
* pointer passed in is advanced to the start of the next character in the
|
|
|
|
* buffer. Upon error, the return value is 0xFFFD, *aBuffer is advanced over
|
|
|
|
* the unpaired surrogate and *aErr is set to true (if aErr is not null).
|
|
|
|
*
|
|
|
|
* Note: This method never sets *aErr to false to allow error accumulation
|
|
|
|
* across multiple calls.
|
|
|
|
*
|
|
|
|
* Precondition: *aBuffer < aEnd
|
2003-06-11 08:27:13 +04:00
|
|
|
*/
|
2018-07-06 10:44:43 +03:00
|
|
|
class UTF16CharEnumerator {
|
2014-05-05 21:30:46 +04:00
|
|
|
public:
|
2018-07-06 10:44:43 +03:00
|
|
|
static inline char32_t NextChar(const char16_t** aBuffer,
|
|
|
|
const char16_t* aEnd, bool* aErr = nullptr) {
|
|
|
|
MOZ_ASSERT(aBuffer, "null buffer pointer pointer");
|
|
|
|
MOZ_ASSERT(aEnd, "null end pointer");
|
2014-05-05 21:30:46 +04:00
|
|
|
|
2018-07-06 10:44:43 +03:00
|
|
|
const char16_t* p = *aBuffer;
|
2014-05-05 21:30:46 +04:00
|
|
|
|
2018-07-06 10:44:43 +03:00
|
|
|
MOZ_ASSERT(p, "null buffer");
|
|
|
|
MOZ_ASSERT(p < aEnd, "Bogus range");
|
2014-05-05 21:30:46 +04:00
|
|
|
|
2018-07-06 10:44:43 +03:00
|
|
|
char16_t c = *p++;
|
2014-05-05 21:30:46 +04:00
|
|
|
|
2018-07-06 10:44:43 +03:00
|
|
|
// Let's use encoding_rs-style code golf here.
|
|
|
|
// Unsigned underflow is defined behavior
|
|
|
|
char16_t cMinusSurrogateStart = c - 0xD800U;
|
|
|
|
if (MOZ_LIKELY(cMinusSurrogateStart > (0xDFFFU - 0xD800U))) {
|
|
|
|
*aBuffer = p;
|
|
|
|
return c;
|
2014-05-05 21:30:46 +04:00
|
|
|
}
|
2018-07-06 10:44:43 +03:00
|
|
|
if (MOZ_LIKELY(cMinusSurrogateStart <= (0xDBFFU - 0xD800U))) {
|
|
|
|
// High surrogate
|
|
|
|
if (MOZ_LIKELY(p != aEnd)) {
|
|
|
|
char16_t second = *p;
|
|
|
|
// Unsigned underflow is defined behavior
|
|
|
|
if (MOZ_LIKELY((second - 0xDC00U) <= (0xDFFFU - 0xDC00U))) {
|
|
|
|
*aBuffer = ++p;
|
|
|
|
return (uint32_t(c) << 10) + uint32_t(second) -
|
|
|
|
(((0xD800U << 10) - 0x10000U) + 0xDC00U);
|
2014-05-05 21:30:46 +04:00
|
|
|
}
|
2003-06-11 08:27:13 +04:00
|
|
|
}
|
2014-05-05 21:30:46 +04:00
|
|
|
}
|
2018-07-06 10:44:43 +03:00
|
|
|
// Unpaired surrogate
|
|
|
|
*aBuffer = p;
|
|
|
|
if (aErr) {
|
|
|
|
*aErr = true;
|
2014-05-27 11:15:35 +04:00
|
|
|
}
|
2018-07-06 10:44:43 +03:00
|
|
|
return 0xFFFDU;
|
2014-05-05 21:30:46 +04:00
|
|
|
}
|
|
|
|
};
|
2016-08-14 14:39:31 +03:00
|
|
|
|
|
|
|
/**
 * Step backwards from |index| until it no longer refers to a UTF-8
 * continuation byte (bit pattern 10xxxxxx) or until it reaches 0, and return
 * the resulting index.
 *
 * @param utf8Chars buffer of 8-bit UTF-8 code units; every position from the
 *                  returned index up to |index| is read, so |index| must be
 *                  a readable position of the buffer.
 * @param index     position to start from; must be of an unsigned type.
 * @return |index| itself when utf8Chars[index] is not a continuation byte,
 *         otherwise the closest smaller index holding a non-continuation
 *         byte, or 0 if none is found (the byte at 0 is not examined).
 */
template <typename Char, typename UnsignedT>
inline UnsignedT RewindToPriorUTF8Codepoint(const Char* utf8Chars,
                                            UnsignedT index) {
  static_assert(std::is_same<Char, char>::value ||
                    std::is_same<Char, unsigned char>::value ||
                    std::is_same<Char, signed char>::value,
                "UTF-8 data must be in 8-bit units");
  static_assert(std::is_unsigned<UnsignedT>::value,
                "index type must be unsigned");

  // The mask works for signed code units too: integer promotion sign-extends
  // only into bits above the low eight, which 0xC0 discards.
  while (index > 0 && (utf8Chars[index] & 0xC0) == 0x80) {
    --index;
  }

  return index;
}
|
|
|
|
|
2016-11-04 21:12:15 +03:00
|
|
|
#undef UTF8UTILS_WARNING
|
|
|
|
|
2003-05-22 02:20:27 +04:00
|
|
|
#endif /* !defined(nsUTF8Utils_h_) */
|