2014-05-05 21:30:46 +04:00
|
|
|
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
|
|
|
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
|
2012-05-21 15:12:37 +04:00
|
|
|
/* This Source Code Form is subject to the terms of the Mozilla Public
|
|
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
|
|
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
2003-05-22 02:20:27 +04:00
|
|
|
#ifndef nsUTF8Utils_h_
|
|
|
|
#define nsUTF8Utils_h_
|
|
|
|
|
2007-12-31 18:15:43 +03:00
|
|
|
// This file may be used in two ways: if MOZILLA_INTERNAL_API is defined, this
|
|
|
|
// file will provide signatures for the Mozilla abstract string types. It will
|
|
|
|
// use XPCOM assertion/debugging macros, etc.
|
|
|
|
|
|
|
|
#include "nscore.h"
|
2014-08-02 00:23:48 +04:00
|
|
|
#include "mozilla/Assertions.h"
|
2010-08-13 20:15:44 +04:00
|
|
|
#include "mozilla/SSE.h"
|
2007-12-31 18:15:43 +03:00
|
|
|
|
2005-11-15 21:17:22 +03:00
|
|
|
#include "nsCharTraits.h"
|
|
|
|
|
2003-05-22 02:20:27 +04:00
|
|
|
class UTF8traits
{
public:
  // Each predicate classifies one UTF-8 code unit by looking only at its
  // leading bits; no other bytes of the sequence are examined.

  // 0xxxxxxx: a complete single-byte (ASCII) character.
  static bool isASCII(char aChar)
  {
    return HasLeadingBits(aChar, 0x80, 0x00);
  }

  // 10xxxxxx: a continuation byte inside a multi-byte sequence.
  static bool isInSeq(char aChar)
  {
    return HasLeadingBits(aChar, 0xC0, 0x80);
  }

  // 110xxxxx: lead byte of a 2-byte sequence.
  static bool is2byte(char aChar)
  {
    return HasLeadingBits(aChar, 0xE0, 0xC0);
  }

  // 1110xxxx: lead byte of a 3-byte sequence.
  static bool is3byte(char aChar)
  {
    return HasLeadingBits(aChar, 0xF0, 0xE0);
  }

  // 11110xxx: lead byte of a 4-byte sequence.
  static bool is4byte(char aChar)
  {
    return HasLeadingBits(aChar, 0xF8, 0xF0);
  }

  // 111110xx: lead byte of a 5-byte sequence.
  static bool is5byte(char aChar)
  {
    return HasLeadingBits(aChar, 0xFC, 0xF8);
  }

  // 1111110x: lead byte of a 6-byte sequence.
  static bool is6byte(char aChar)
  {
    return HasLeadingBits(aChar, 0xFE, 0xFC);
  }

private:
  // True when the bits of |aChar| selected by |aMask| equal |aPattern|.
  // The masks only cover the low eight bits, so sign extension of a
  // negative |char| during integer promotion does not affect the result.
  static bool HasLeadingBits(char aChar, int aMask, int aPattern)
  {
    return (aChar & aMask) == aPattern;
  }
};
|
2003-05-22 02:20:27 +04:00
|
|
|
|
2005-11-04 22:52:18 +03:00
|
|
|
/**
|
|
|
|
* Extract the next UCS-4 character from the buffer and return it. The
|
|
|
|
* pointer passed in is advanced to the start of the next character in the
|
|
|
|
* buffer. If non-null, the parameters err and overlong are filled in to
|
|
|
|
* indicate that the character was represented by an overlong sequence, or
|
|
|
|
* that an error occurred.
|
|
|
|
*/
|
|
|
|
|
|
|
|
class UTF8CharEnumerator
|
|
|
|
{
|
|
|
|
public:
|
2014-05-27 11:15:35 +04:00
|
|
|
static uint32_t NextChar(const char** aBuffer, const char* aEnd, bool* aErr)
|
2005-11-04 22:52:18 +03:00
|
|
|
{
|
2014-05-27 11:15:35 +04:00
|
|
|
NS_ASSERTION(aBuffer && *aBuffer, "null buffer!");
|
2005-11-04 22:52:18 +03:00
|
|
|
|
2014-05-27 11:15:35 +04:00
|
|
|
const char* p = *aBuffer;
|
|
|
|
*aErr = false;
|
2005-11-04 22:52:18 +03:00
|
|
|
|
2014-05-27 11:15:35 +04:00
|
|
|
if (p >= aEnd) {
|
|
|
|
*aErr = true;
|
2005-11-04 22:52:18 +03:00
|
|
|
|
2014-05-05 21:30:46 +04:00
|
|
|
return 0;
|
|
|
|
}
|
2005-11-04 22:52:18 +03:00
|
|
|
|
|
|
|
char c = *p++;
|
|
|
|
|
2014-05-27 11:15:35 +04:00
|
|
|
if (UTF8traits::isASCII(c)) {
|
|
|
|
*aBuffer = p;
|
2014-05-05 21:30:46 +04:00
|
|
|
return c;
|
|
|
|
}
|
2005-11-04 22:52:18 +03:00
|
|
|
|
2012-08-22 19:56:38 +04:00
|
|
|
uint32_t ucs4;
|
|
|
|
uint32_t minUcs4;
|
|
|
|
int32_t state = 0;
|
2005-11-04 22:52:18 +03:00
|
|
|
|
|
|
|
if (!CalcState(c, ucs4, minUcs4, state)) {
|
2014-05-05 21:30:46 +04:00
|
|
|
NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
|
2014-05-27 11:15:35 +04:00
|
|
|
*aErr = true;
|
2010-02-23 20:38:10 +03:00
|
|
|
|
2014-05-05 21:30:46 +04:00
|
|
|
return 0;
|
2005-11-04 22:52:18 +03:00
|
|
|
}
|
|
|
|
|
2014-05-27 11:15:35 +04:00
|
|
|
while (state--) {
|
|
|
|
if (p == aEnd) {
|
|
|
|
*aErr = true;
|
2005-11-04 22:52:18 +03:00
|
|
|
|
2014-05-05 21:30:46 +04:00
|
|
|
return 0;
|
|
|
|
}
|
2005-11-04 22:52:18 +03:00
|
|
|
|
2014-05-05 21:30:46 +04:00
|
|
|
c = *p++;
|
2005-11-04 22:52:18 +03:00
|
|
|
|
2014-05-27 11:15:35 +04:00
|
|
|
if (!AddByte(c, state, ucs4)) {
|
|
|
|
*aErr = true;
|
2005-11-04 22:52:18 +03:00
|
|
|
|
2014-05-05 21:30:46 +04:00
|
|
|
return 0;
|
2005-11-04 22:52:18 +03:00
|
|
|
}
|
2014-05-05 21:30:46 +04:00
|
|
|
}
|
2005-11-04 22:52:18 +03:00
|
|
|
|
2014-05-27 11:15:35 +04:00
|
|
|
if (ucs4 < minUcs4) {
|
2014-05-05 21:30:46 +04:00
|
|
|
// Overlong sequence
|
|
|
|
ucs4 = UCS2_REPLACEMENT_CHAR;
|
2014-05-27 11:15:35 +04:00
|
|
|
} else if (ucs4 >= 0xD800 &&
|
|
|
|
(ucs4 <= 0xDFFF || ucs4 >= UCS_END)) {
|
2014-05-05 21:30:46 +04:00
|
|
|
// Surrogates and code points outside the Unicode range.
|
|
|
|
ucs4 = UCS2_REPLACEMENT_CHAR;
|
|
|
|
}
|
2010-02-23 20:38:10 +03:00
|
|
|
|
2014-05-27 11:15:35 +04:00
|
|
|
*aBuffer = p;
|
2005-11-04 22:52:18 +03:00
|
|
|
return ucs4;
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
2014-05-27 11:15:35 +04:00
|
|
|
static bool CalcState(char aChar, uint32_t& aUcs4, uint32_t& aMinUcs4,
|
|
|
|
int32_t& aState)
|
|
|
|
{
|
|
|
|
if (UTF8traits::is2byte(aChar)) {
|
|
|
|
aUcs4 = (uint32_t(aChar) << 6) & 0x000007C0L;
|
|
|
|
aState = 1;
|
|
|
|
aMinUcs4 = 0x00000080;
|
|
|
|
} else if (UTF8traits::is3byte(aChar)) {
|
|
|
|
aUcs4 = (uint32_t(aChar) << 12) & 0x0000F000L;
|
|
|
|
aState = 2;
|
|
|
|
aMinUcs4 = 0x00000800;
|
|
|
|
} else if (UTF8traits::is4byte(aChar)) {
|
|
|
|
aUcs4 = (uint32_t(aChar) << 18) & 0x001F0000L;
|
|
|
|
aState = 3;
|
|
|
|
aMinUcs4 = 0x00010000;
|
|
|
|
} else if (UTF8traits::is5byte(aChar)) {
|
|
|
|
aUcs4 = (uint32_t(aChar) << 24) & 0x03000000L;
|
|
|
|
aState = 4;
|
|
|
|
aMinUcs4 = 0x00200000;
|
|
|
|
} else if (UTF8traits::is6byte(aChar)) {
|
|
|
|
aUcs4 = (uint32_t(aChar) << 30) & 0x40000000L;
|
|
|
|
aState = 5;
|
|
|
|
aMinUcs4 = 0x04000000;
|
|
|
|
} else {
|
2014-05-05 21:30:46 +04:00
|
|
|
return false;
|
|
|
|
}
|
2005-11-04 22:52:18 +03:00
|
|
|
|
2011-10-17 18:59:28 +04:00
|
|
|
return true;
|
2005-11-04 22:52:18 +03:00
|
|
|
}
|
|
|
|
|
2014-05-27 11:15:35 +04:00
|
|
|
static bool AddByte(char aChar, int32_t aState, uint32_t& aUcs4)
|
2005-11-04 22:52:18 +03:00
|
|
|
{
|
2014-05-27 11:15:35 +04:00
|
|
|
if (UTF8traits::isInSeq(aChar)) {
|
|
|
|
int32_t shift = aState * 6;
|
|
|
|
aUcs4 |= (uint32_t(aChar) & 0x3F) << shift;
|
2014-05-05 21:30:46 +04:00
|
|
|
return true;
|
|
|
|
}
|
2005-11-04 22:52:18 +03:00
|
|
|
|
2011-10-17 18:59:28 +04:00
|
|
|
return false;
|
2005-11-04 22:52:18 +03:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Extract the next UCS-4 character from the buffer and return it. The
|
|
|
|
* pointer passed in is advanced to the start of the next character in the
|
|
|
|
* buffer. If non-null, the err parameter is filled in if an error occurs.
|
2014-08-02 00:23:48 +04:00
|
|
|
*
|
|
|
|
* If an error occurs that causes UCS2_REPLACEMENT_CHAR to be returned, then
|
|
|
|
* the buffer will be updated to move only a single UCS-2 character.
|
|
|
|
*
|
|
|
|
* Any other error returns 0 and does not move the buffer position.
|
2005-11-04 22:52:18 +03:00
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
class UTF16CharEnumerator
|
|
|
|
{
|
|
|
|
public:
|
2014-05-27 11:15:35 +04:00
|
|
|
static uint32_t NextChar(const char16_t** aBuffer, const char16_t* aEnd,
|
|
|
|
bool* aErr = nullptr)
|
2005-11-04 22:52:18 +03:00
|
|
|
{
|
2014-05-27 11:15:35 +04:00
|
|
|
NS_ASSERTION(aBuffer && *aBuffer, "null buffer!");
|
2005-11-04 22:52:18 +03:00
|
|
|
|
2014-05-27 11:15:35 +04:00
|
|
|
const char16_t* p = *aBuffer;
|
2005-11-04 22:52:18 +03:00
|
|
|
|
2014-05-27 11:15:35 +04:00
|
|
|
if (p >= aEnd) {
|
2014-05-05 21:30:46 +04:00
|
|
|
NS_ERROR("No input to work with");
|
2014-05-27 11:15:35 +04:00
|
|
|
if (aErr) {
|
|
|
|
*aErr = true;
|
|
|
|
}
|
2005-11-04 22:52:18 +03:00
|
|
|
|
2014-05-05 21:30:46 +04:00
|
|
|
return 0;
|
|
|
|
}
|
2005-11-04 22:52:18 +03:00
|
|
|
|
2014-01-04 19:02:17 +04:00
|
|
|
char16_t c = *p++;
|
2005-11-04 22:52:18 +03:00
|
|
|
|
2014-05-27 11:15:35 +04:00
|
|
|
if (!IS_SURROGATE(c)) { // U+0000 - U+D7FF,U+E000 - U+FFFF
|
|
|
|
if (aErr) {
|
|
|
|
*aErr = false;
|
|
|
|
}
|
|
|
|
*aBuffer = p;
|
2014-05-05 21:30:46 +04:00
|
|
|
return c;
|
2014-05-27 11:15:35 +04:00
|
|
|
} else if (NS_IS_HIGH_SURROGATE(c)) { // U+D800 - U+DBFF
|
|
|
|
if (p == aEnd) {
|
|
|
|
// Found a high surrogate at the end of the buffer. Flag this
|
2014-05-05 21:30:46 +04:00
|
|
|
// as an error and return the Unicode replacement
|
|
|
|
// character 0xFFFD.
|
2007-09-15 03:09:49 +04:00
|
|
|
|
2014-05-05 21:30:46 +04:00
|
|
|
NS_WARNING("Unexpected end of buffer after high surrogate");
|
2007-09-15 03:09:49 +04:00
|
|
|
|
2014-05-27 11:15:35 +04:00
|
|
|
if (aErr) {
|
|
|
|
*aErr = true;
|
|
|
|
}
|
|
|
|
*aBuffer = p;
|
2014-05-05 21:30:46 +04:00
|
|
|
return 0xFFFD;
|
|
|
|
}
|
2005-11-04 22:52:18 +03:00
|
|
|
|
2014-05-05 21:30:46 +04:00
|
|
|
// D800- DBFF - High Surrogate
|
|
|
|
char16_t h = c;
|
2005-11-04 22:52:18 +03:00
|
|
|
|
2014-05-05 21:30:46 +04:00
|
|
|
c = *p++;
|
2005-11-04 22:52:18 +03:00
|
|
|
|
2014-05-27 11:15:35 +04:00
|
|
|
if (NS_IS_LOW_SURROGATE(c)) {
|
2005-11-04 22:52:18 +03:00
|
|
|
// DC00- DFFF - Low Surrogate
|
2014-05-05 21:30:46 +04:00
|
|
|
// N = (H - D800) *400 + 10000 + (L - DC00)
|
|
|
|
uint32_t ucs4 = SURROGATE_TO_UCS4(h, c);
|
2014-05-27 11:15:35 +04:00
|
|
|
if (aErr) {
|
|
|
|
*aErr = false;
|
|
|
|
}
|
|
|
|
*aBuffer = p;
|
2014-05-05 21:30:46 +04:00
|
|
|
return ucs4;
|
2014-05-27 11:15:35 +04:00
|
|
|
} else {
|
2014-05-05 21:30:46 +04:00
|
|
|
// Found a high surrogate followed by something other than
|
|
|
|
// a low surrogate. Flag this as an error and return the
|
|
|
|
// Unicode replacement character 0xFFFD. Note that the
|
|
|
|
// pointer to the next character points to the second 16-bit
|
|
|
|
// value, not beyond it, as per Unicode 5.0.0 Chapter 3 C10,
|
|
|
|
// only the first code unit of an illegal sequence must be
|
|
|
|
// treated as an illegally terminated code unit sequence
|
|
|
|
// (also Chapter 3 D91, "isolated [not paired and ill-formed]
|
|
|
|
// UTF-16 code units in the range D800..DFFF are ill-formed").
|
|
|
|
NS_WARNING("got a High Surrogate but no low surrogate");
|
2007-09-15 03:09:49 +04:00
|
|
|
|
2014-05-27 11:15:35 +04:00
|
|
|
if (aErr) {
|
|
|
|
*aErr = true;
|
|
|
|
}
|
|
|
|
*aBuffer = p - 1;
|
2007-09-15 03:09:49 +04:00
|
|
|
return 0xFFFD;
|
2005-11-04 22:52:18 +03:00
|
|
|
}
|
2014-05-27 11:15:35 +04:00
|
|
|
} else { // U+DC00 - U+DFFF
|
2014-05-05 21:30:46 +04:00
|
|
|
// DC00- DFFF - Low Surrogate
|
|
|
|
|
|
|
|
// Found a low surrogate w/o a preceding high surrogate. Flag
|
|
|
|
// this as an error and return the Unicode replacement
|
|
|
|
// character 0xFFFD.
|
|
|
|
|
|
|
|
NS_WARNING("got a low Surrogate but no high surrogate");
|
2014-05-27 11:15:35 +04:00
|
|
|
if (aErr) {
|
|
|
|
*aErr = true;
|
|
|
|
}
|
|
|
|
*aBuffer = p;
|
2014-05-05 21:30:46 +04:00
|
|
|
return 0xFFFD;
|
|
|
|
}
|
2005-11-04 22:52:18 +03:00
|
|
|
|
2014-08-02 00:23:48 +04:00
|
|
|
MOZ_ASSERT_UNREACHABLE("Impossible UCS-2 character value.");
|
2005-11-04 22:52:18 +03:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
|
2003-05-23 01:25:43 +04:00
|
|
|
/**
|
|
|
|
* A character sink (see |copy_string| in nsAlgorithm.h) for converting
|
2003-06-23 08:30:57 +04:00
|
|
|
* UTF-8 to UTF-16
|
2003-05-23 01:25:43 +04:00
|
|
|
*/
|
2003-06-23 08:30:57 +04:00
|
|
|
class ConvertUTF8toUTF16
|
2014-05-05 21:30:46 +04:00
|
|
|
{
|
|
|
|
public:
|
|
|
|
typedef char value_type;
|
|
|
|
typedef char16_t buffer_type;
|
2003-05-22 02:20:27 +04:00
|
|
|
|
2014-05-27 11:15:35 +04:00
|
|
|
explicit ConvertUTF8toUTF16(buffer_type* aBuffer)
|
|
|
|
: mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(false)
|
|
|
|
{
|
|
|
|
}
|
2003-05-22 02:20:27 +04:00
|
|
|
|
2014-05-27 11:15:35 +04:00
|
|
|
size_t Length() const
|
|
|
|
{
|
|
|
|
return mBuffer - mStart;
|
|
|
|
}
|
2003-05-22 02:20:27 +04:00
|
|
|
|
2014-05-27 11:15:35 +04:00
|
|
|
bool ErrorEncountered() const
|
|
|
|
{
|
|
|
|
return mErrorEncountered;
|
|
|
|
}
|
2010-02-23 20:38:10 +03:00
|
|
|
|
2014-05-27 11:15:35 +04:00
|
|
|
void write(const value_type* aStart, uint32_t aN)
|
2014-05-05 21:30:46 +04:00
|
|
|
{
|
2014-05-27 11:15:35 +04:00
|
|
|
if (mErrorEncountered) {
|
2014-05-05 21:30:46 +04:00
|
|
|
return;
|
2014-05-27 11:15:35 +04:00
|
|
|
}
|
2014-05-05 21:30:46 +04:00
|
|
|
|
|
|
|
// algorithm assumes utf8 units won't
|
|
|
|
// be spread across fragments
|
2014-05-27 11:15:35 +04:00
|
|
|
const value_type* p = aStart;
|
|
|
|
const value_type* end = aStart + aN;
|
2014-05-05 21:30:46 +04:00
|
|
|
buffer_type* out = mBuffer;
|
2014-05-27 11:15:35 +04:00
|
|
|
for (; p != end /* && *p */;) {
|
2014-05-05 21:30:46 +04:00
|
|
|
bool err;
|
|
|
|
uint32_t ucs4 = UTF8CharEnumerator::NextChar(&p, end, &err);
|
|
|
|
|
2014-05-27 11:15:35 +04:00
|
|
|
if (err) {
|
2014-05-05 21:30:46 +04:00
|
|
|
mErrorEncountered = true;
|
2003-06-11 08:27:13 +04:00
|
|
|
mBuffer = out;
|
2014-05-05 21:30:46 +04:00
|
|
|
return;
|
2003-05-22 02:20:27 +04:00
|
|
|
}
|
|
|
|
|
2014-05-27 11:15:35 +04:00
|
|
|
if (ucs4 >= PLANE1_BASE) {
|
2014-05-05 21:30:46 +04:00
|
|
|
*out++ = (buffer_type)H_SURROGATE(ucs4);
|
|
|
|
*out++ = (buffer_type)L_SURROGATE(ucs4);
|
2014-05-27 11:15:35 +04:00
|
|
|
} else {
|
2014-05-05 21:30:46 +04:00
|
|
|
*out++ = ucs4;
|
2003-05-23 01:25:43 +04:00
|
|
|
}
|
2014-05-05 21:30:46 +04:00
|
|
|
}
|
|
|
|
mBuffer = out;
|
|
|
|
}
|
|
|
|
|
|
|
|
void write_terminator()
|
|
|
|
{
|
|
|
|
*mBuffer = buffer_type(0);
|
|
|
|
}
|
2003-05-23 01:25:43 +04:00
|
|
|
|
2014-05-05 21:30:46 +04:00
|
|
|
private:
|
|
|
|
buffer_type* const mStart;
|
|
|
|
buffer_type* mBuffer;
|
|
|
|
bool mErrorEncountered;
|
|
|
|
};
|
2003-05-22 02:20:27 +04:00
|
|
|
|
2003-05-23 01:25:43 +04:00
|
|
|
/**
|
|
|
|
* A character sink (see |copy_string| in nsAlgorithm.h) for computing
|
2004-05-08 02:21:26 +04:00
|
|
|
* the length of the UTF-16 string equivalent to a UTF-8 string.
|
2003-05-23 01:25:43 +04:00
|
|
|
*/
|
2003-05-22 02:20:27 +04:00
|
|
|
class CalculateUTF8Length
|
2014-05-05 21:30:46 +04:00
|
|
|
{
|
|
|
|
public:
|
|
|
|
typedef char value_type;
|
2003-05-22 02:20:27 +04:00
|
|
|
|
2014-05-27 11:15:35 +04:00
|
|
|
CalculateUTF8Length()
|
|
|
|
: mLength(0), mErrorEncountered(false)
|
|
|
|
{
|
|
|
|
}
|
2003-05-22 02:20:27 +04:00
|
|
|
|
2014-05-27 11:15:35 +04:00
|
|
|
size_t Length() const
|
|
|
|
{
|
|
|
|
return mLength;
|
|
|
|
}
|
2003-05-22 02:20:27 +04:00
|
|
|
|
2014-05-27 11:15:35 +04:00
|
|
|
void write(const value_type* aStart, uint32_t aN)
|
2014-05-05 21:30:46 +04:00
|
|
|
{
|
|
|
|
// ignore any further requests
|
2014-05-27 11:15:35 +04:00
|
|
|
if (mErrorEncountered) {
|
2014-05-05 21:30:46 +04:00
|
|
|
return;
|
2014-05-27 11:15:35 +04:00
|
|
|
}
|
2014-05-05 21:30:46 +04:00
|
|
|
|
|
|
|
// algorithm assumes utf8 units won't
|
|
|
|
// be spread across fragments
|
2014-05-27 11:15:35 +04:00
|
|
|
const value_type* p = aStart;
|
|
|
|
const value_type* end = aStart + aN;
|
|
|
|
for (; p < end /* && *p */; ++mLength) {
|
|
|
|
if (UTF8traits::isASCII(*p)) {
|
2014-05-05 21:30:46 +04:00
|
|
|
p += 1;
|
2014-05-27 11:15:35 +04:00
|
|
|
} else if (UTF8traits::is2byte(*p)) {
|
2014-05-05 21:30:46 +04:00
|
|
|
p += 2;
|
2014-05-27 11:15:35 +04:00
|
|
|
} else if (UTF8traits::is3byte(*p)) {
|
2014-05-05 21:30:46 +04:00
|
|
|
p += 3;
|
2014-05-27 11:15:35 +04:00
|
|
|
} else if (UTF8traits::is4byte(*p)) {
|
2014-05-05 21:30:46 +04:00
|
|
|
// Because a UTF-8 sequence of 4 bytes represents a codepoint
|
|
|
|
// greater than 0xFFFF, it will become a surrogate pair in the
|
|
|
|
// UTF-16 string, so add 1 more to mLength.
|
|
|
|
// This doesn't happen with is5byte and is6byte because they
|
|
|
|
// are illegal UTF-8 sequences (greater than 0x10FFFF) so get
|
|
|
|
// converted to a single replacement character.
|
|
|
|
|
|
|
|
// However, there is one case when a 4 byte UTF-8 sequence will
|
|
|
|
// only generate 2 UTF-16 bytes. If we have a properly encoded
|
|
|
|
// sequence, but with an invalid value (too small or too big),
|
|
|
|
// that will result in a replacement character being written
|
|
|
|
// This replacement character is encoded as just 1 single
|
|
|
|
// UTF-16 character, which is 2 bytes.
|
|
|
|
|
|
|
|
// The below code therefore only adds 1 to mLength if the UTF8
|
|
|
|
// data will produce a decoded character which is greater than
|
|
|
|
// or equal to 0x010000 and less than 0x0110000.
|
|
|
|
|
|
|
|
// A 4byte UTF8 character is encoded as
|
|
|
|
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
|
|
|
// Bit 1-3 on the first byte, and bit 5-6 on the second byte,
|
|
|
|
// map to bit 17-21 in the final result. If these bits are
|
|
|
|
// between 0x01 and 0x11, that means that the final result is
|
|
|
|
// between 0x010000 and 0x110000. The below code reads these
|
|
|
|
// bits out and assigns them to c, but shifted up 4 bits to
|
|
|
|
// avoid having to shift twice.
|
|
|
|
|
|
|
|
// It doesn't matter what to do in the case where p + 4 > end
|
|
|
|
// since no UTF16 characters will be written in that case by
|
|
|
|
// ConvertUTF8toUTF16. Likewise it doesn't matter what we do if
|
|
|
|
// any of the surrogate bits are wrong since no UTF16
|
|
|
|
// characters will be written in that case either.
|
|
|
|
|
|
|
|
if (p + 4 <= end) {
|
|
|
|
uint32_t c = ((uint32_t)(p[0] & 0x07)) << 6 |
|
|
|
|
((uint32_t)(p[1] & 0x30));
|
2014-05-27 11:15:35 +04:00
|
|
|
if (c >= 0x010 && c < 0x110) {
|
2014-05-05 21:30:46 +04:00
|
|
|
++mLength;
|
2014-05-27 11:15:35 +04:00
|
|
|
}
|
2014-05-05 21:30:46 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
p += 4;
|
2014-05-27 11:15:35 +04:00
|
|
|
} else if (UTF8traits::is5byte(*p)) {
|
2014-05-05 21:30:46 +04:00
|
|
|
p += 5;
|
2014-05-27 11:15:35 +04:00
|
|
|
} else if (UTF8traits::is6byte(*p)) {
|
2014-05-05 21:30:46 +04:00
|
|
|
p += 6;
|
2014-05-27 11:15:35 +04:00
|
|
|
} else { // error
|
2014-05-05 21:30:46 +04:00
|
|
|
++mLength; // to account for the decrement below
|
|
|
|
break;
|
2003-05-22 02:20:27 +04:00
|
|
|
}
|
2014-05-05 21:30:46 +04:00
|
|
|
}
|
2014-05-27 11:15:35 +04:00
|
|
|
if (p != end) {
|
2014-05-05 21:30:46 +04:00
|
|
|
NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
|
|
|
|
--mLength; // The last multi-byte char wasn't complete, discard it.
|
|
|
|
mErrorEncountered = true;
|
|
|
|
}
|
|
|
|
}
|
2003-05-22 02:20:27 +04:00
|
|
|
|
2014-05-05 21:30:46 +04:00
|
|
|
private:
|
|
|
|
size_t mLength;
|
|
|
|
bool mErrorEncountered;
|
|
|
|
};
|
2003-05-22 02:20:27 +04:00
|
|
|
|
2003-06-11 08:27:13 +04:00
|
|
|
/**
|
2007-07-12 00:46:43 +04:00
|
|
|
* A character sink (see |copy_string| in nsAlgorithm.h) for
|
|
|
|
* converting UTF-16 to UTF-8. Treats invalid UTF-16 data as 0xFFFD
|
|
|
|
* (0xEFBFBD in UTF-8).
|
2003-06-11 08:27:13 +04:00
|
|
|
*/
|
2003-06-23 08:30:57 +04:00
|
|
|
class ConvertUTF16toUTF8
|
2014-05-05 21:30:46 +04:00
|
|
|
{
|
|
|
|
public:
|
|
|
|
typedef char16_t value_type;
|
|
|
|
typedef char buffer_type;
|
2003-06-11 08:27:13 +04:00
|
|
|
|
2014-05-05 21:30:46 +04:00
|
|
|
// The error handling here is more lenient than that in
|
|
|
|
// |ConvertUTF8toUTF16|, but it's that way for backwards
|
|
|
|
// compatibility.
|
2003-06-11 08:27:13 +04:00
|
|
|
|
2014-05-27 11:15:35 +04:00
|
|
|
explicit ConvertUTF16toUTF8(buffer_type* aBuffer)
|
|
|
|
: mStart(aBuffer), mBuffer(aBuffer)
|
|
|
|
{
|
|
|
|
}
|
2003-06-11 08:27:13 +04:00
|
|
|
|
2014-05-27 11:15:35 +04:00
|
|
|
size_t Size() const
|
|
|
|
{
|
|
|
|
return mBuffer - mStart;
|
|
|
|
}
|
2003-06-11 08:27:13 +04:00
|
|
|
|
2014-05-27 11:15:35 +04:00
|
|
|
void write(const value_type* aStart, uint32_t aN)
|
2014-05-05 21:30:46 +04:00
|
|
|
{
|
2014-05-27 11:15:35 +04:00
|
|
|
buffer_type* out = mBuffer; // gcc isn't smart enough to do this!
|
2003-06-11 08:27:13 +04:00
|
|
|
|
2014-05-27 11:15:35 +04:00
|
|
|
for (const value_type* p = aStart, *end = aStart + aN; p < end; ++p) {
|
2014-05-05 21:30:46 +04:00
|
|
|
value_type c = *p;
|
2014-05-27 11:15:35 +04:00
|
|
|
if (!(c & 0xFF80)) { // U+0000 - U+007F
|
2014-05-05 21:30:46 +04:00
|
|
|
*out++ = (char)c;
|
2014-05-27 11:15:35 +04:00
|
|
|
} else if (!(c & 0xF800)) { // U+0100 - U+07FF
|
2014-05-05 21:30:46 +04:00
|
|
|
*out++ = 0xC0 | (char)(c >> 6);
|
|
|
|
*out++ = 0x80 | (char)(0x003F & c);
|
2014-05-27 11:15:35 +04:00
|
|
|
} else if (!IS_SURROGATE(c)) { // U+0800 - U+D7FF,U+E000 - U+FFFF
|
2014-05-05 21:30:46 +04:00
|
|
|
*out++ = 0xE0 | (char)(c >> 12);
|
|
|
|
*out++ = 0x80 | (char)(0x003F & (c >> 6));
|
2014-05-27 11:15:35 +04:00
|
|
|
*out++ = 0x80 | (char)(0x003F & c);
|
|
|
|
} else if (NS_IS_HIGH_SURROGATE(c)) { // U+D800 - U+DBFF
|
2014-05-05 21:30:46 +04:00
|
|
|
// D800- DBFF - High Surrogate
|
|
|
|
value_type h = c;
|
|
|
|
|
|
|
|
++p;
|
2014-05-27 11:15:35 +04:00
|
|
|
if (p == end) {
|
2014-05-05 21:30:46 +04:00
|
|
|
// Treat broken characters as the Unicode
|
|
|
|
// replacement character 0xFFFD (0xEFBFBD in
|
|
|
|
// UTF-8)
|
|
|
|
*out++ = '\xEF';
|
|
|
|
*out++ = '\xBF';
|
|
|
|
*out++ = '\xBD';
|
|
|
|
|
|
|
|
NS_WARNING("String ending in half a surrogate pair!");
|
2003-06-11 08:27:13 +04:00
|
|
|
|
2014-05-05 21:30:46 +04:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
c = *p;
|
|
|
|
|
2014-05-27 11:15:35 +04:00
|
|
|
if (NS_IS_LOW_SURROGATE(c)) {
|
2014-05-05 21:30:46 +04:00
|
|
|
// DC00- DFFF - Low Surrogate
|
|
|
|
// N = (H - D800) *400 + 10000 + ( L - DC00 )
|
|
|
|
uint32_t ucs4 = SURROGATE_TO_UCS4(h, c);
|
|
|
|
|
|
|
|
// 0001 0000-001F FFFF
|
|
|
|
*out++ = 0xF0 | (char)(ucs4 >> 18);
|
|
|
|
*out++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
|
|
|
|
*out++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
|
|
|
|
*out++ = 0x80 | (char)(0x003F & ucs4);
|
2014-05-27 11:15:35 +04:00
|
|
|
} else {
|
2014-05-05 21:30:46 +04:00
|
|
|
// Treat broken characters as the Unicode
|
|
|
|
// replacement character 0xFFFD (0xEFBFBD in
|
|
|
|
// UTF-8)
|
|
|
|
*out++ = '\xEF';
|
|
|
|
*out++ = '\xBF';
|
|
|
|
*out++ = '\xBD';
|
|
|
|
|
|
|
|
// The pointer to the next character points to the second
|
|
|
|
// 16-bit value, not beyond it, as per Unicode 5.0.0
|
|
|
|
// Chapter 3 C10, only the first code unit of an illegal
|
|
|
|
// sequence must be treated as an illegally terminated
|
|
|
|
// code unit sequence (also Chapter 3 D91, "isolated [not
|
|
|
|
// paired and ill-formed] UTF-16 code units in the range
|
|
|
|
// D800..DFFF are ill-formed").
|
|
|
|
p--;
|
|
|
|
|
|
|
|
NS_WARNING("got a High Surrogate but no low surrogate");
|
|
|
|
}
|
2014-05-27 11:15:35 +04:00
|
|
|
} else { // U+DC00 - U+DFFF
|
2014-05-05 21:30:46 +04:00
|
|
|
// Treat broken characters as the Unicode replacement
|
|
|
|
// character 0xFFFD (0xEFBFBD in UTF-8)
|
|
|
|
*out++ = '\xEF';
|
|
|
|
*out++ = '\xBF';
|
|
|
|
*out++ = '\xBD';
|
|
|
|
|
|
|
|
// DC00- DFFF - Low Surrogate
|
|
|
|
NS_WARNING("got a low Surrogate but no high surrogate");
|
2003-06-11 08:27:13 +04:00
|
|
|
}
|
2014-05-05 21:30:46 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
mBuffer = out;
|
|
|
|
}
|
2003-06-11 08:27:13 +04:00
|
|
|
|
2014-05-05 21:30:46 +04:00
|
|
|
void write_terminator()
|
|
|
|
{
|
|
|
|
*mBuffer = buffer_type(0);
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
buffer_type* const mStart;
|
|
|
|
buffer_type* mBuffer;
|
|
|
|
};
|
2003-06-11 08:27:13 +04:00
|
|
|
|
|
|
|
/**
|
|
|
|
* A character sink (see |copy_string| in nsAlgorithm.h) for computing
|
2007-07-12 00:46:43 +04:00
|
|
|
* the number of bytes a UTF-16 would occupy in UTF-8. Treats invalid
|
|
|
|
* UTF-16 data as 0xFFFD (0xEFBFBD in UTF-8).
|
2003-06-11 08:27:13 +04:00
|
|
|
*/
|
|
|
|
class CalculateUTF8Size
|
2014-05-05 21:30:46 +04:00
|
|
|
{
|
|
|
|
public:
|
|
|
|
typedef char16_t value_type;
|
|
|
|
|
|
|
|
CalculateUTF8Size()
|
2014-05-27 11:15:35 +04:00
|
|
|
: mSize(0)
|
|
|
|
{
|
|
|
|
}
|
2014-05-05 21:30:46 +04:00
|
|
|
|
2014-05-27 11:15:35 +04:00
|
|
|
size_t Size() const
|
|
|
|
{
|
|
|
|
return mSize;
|
|
|
|
}
|
2014-05-05 21:30:46 +04:00
|
|
|
|
2014-05-27 11:15:35 +04:00
|
|
|
void write(const value_type* aStart, uint32_t aN)
|
2003-06-11 08:27:13 +04:00
|
|
|
{
|
2014-05-05 21:30:46 +04:00
|
|
|
// Assume UCS2 surrogate pairs won't be spread across fragments.
|
2014-05-27 11:15:35 +04:00
|
|
|
for (const value_type* p = aStart, *end = aStart + aN; p < end; ++p) {
|
2014-05-05 21:30:46 +04:00
|
|
|
value_type c = *p;
|
2014-05-27 11:15:35 +04:00
|
|
|
if (!(c & 0xFF80)) { // U+0000 - U+007F
|
2014-05-05 21:30:46 +04:00
|
|
|
mSize += 1;
|
2014-05-27 11:15:35 +04:00
|
|
|
} else if (!(c & 0xF800)) { // U+0100 - U+07FF
|
2014-05-05 21:30:46 +04:00
|
|
|
mSize += 2;
|
2014-05-27 11:15:35 +04:00
|
|
|
} else if (0xD800 != (0xF800 & c)) { // U+0800 - U+D7FF,U+E000 - U+FFFF
|
2014-05-05 21:30:46 +04:00
|
|
|
mSize += 3;
|
2014-05-27 11:15:35 +04:00
|
|
|
} else if (0xD800 == (0xFC00 & c)) { // U+D800 - U+DBFF
|
2014-05-05 21:30:46 +04:00
|
|
|
++p;
|
2014-05-27 11:15:35 +04:00
|
|
|
if (p == end) {
|
2014-05-05 21:30:46 +04:00
|
|
|
// Treat broken characters as the Unicode
|
|
|
|
// replacement character 0xFFFD (0xEFBFBD in
|
|
|
|
// UTF-8)
|
|
|
|
mSize += 3;
|
2003-06-11 08:27:13 +04:00
|
|
|
|
2014-05-05 21:30:46 +04:00
|
|
|
NS_WARNING("String ending in half a surrogate pair!");
|
2003-06-11 08:27:13 +04:00
|
|
|
|
2014-05-05 21:30:46 +04:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
c = *p;
|
2003-06-11 08:27:13 +04:00
|
|
|
|
2014-05-27 11:15:35 +04:00
|
|
|
if (0xDC00 == (0xFC00 & c)) {
|
2014-05-05 21:30:46 +04:00
|
|
|
mSize += 4;
|
2014-05-27 11:15:35 +04:00
|
|
|
} else {
|
2014-05-05 21:30:46 +04:00
|
|
|
// Treat broken characters as the Unicode
|
|
|
|
// replacement character 0xFFFD (0xEFBFBD in
|
|
|
|
// UTF-8)
|
|
|
|
mSize += 3;
|
|
|
|
|
|
|
|
// The next code unit is the second 16-bit value, not
|
|
|
|
// the one beyond it, as per Unicode 5.0.0 Chapter 3 C10,
|
|
|
|
// only the first code unit of an illegal sequence must
|
|
|
|
// be treated as an illegally terminated code unit
|
|
|
|
// sequence (also Chapter 3 D91, "isolated [not paired and
|
|
|
|
// ill-formed] UTF-16 code units in the range D800..DFFF
|
|
|
|
// are ill-formed").
|
|
|
|
p--;
|
|
|
|
|
|
|
|
NS_WARNING("got a high Surrogate but no low surrogate");
|
|
|
|
}
|
2014-05-27 11:15:35 +04:00
|
|
|
} else { // U+DC00 - U+DFFF
|
2014-05-05 21:30:46 +04:00
|
|
|
// Treat broken characters as the Unicode replacement
|
|
|
|
// character 0xFFFD (0xEFBFBD in UTF-8)
|
|
|
|
mSize += 3;
|
|
|
|
|
|
|
|
NS_WARNING("got a low Surrogate but no high surrogate");
|
2003-06-11 08:27:13 +04:00
|
|
|
}
|
2014-05-05 21:30:46 +04:00
|
|
|
}
|
|
|
|
}
|
2003-06-11 08:27:13 +04:00
|
|
|
|
2014-05-05 21:30:46 +04:00
|
|
|
private:
|
|
|
|
size_t mSize;
|
|
|
|
};
|
2003-06-11 08:27:13 +04:00
|
|
|
|
2007-12-31 18:15:43 +03:00
|
|
|
#ifdef MOZILLA_INTERNAL_API
|
2004-02-19 05:44:03 +03:00
|
|
|
/**
|
2010-08-13 20:15:44 +04:00
|
|
|
* A character sink that performs a |reinterpret_cast|-style conversion
|
2014-01-04 19:02:17 +04:00
|
|
|
* from char to char16_t.
|
2004-02-19 05:44:03 +03:00
|
|
|
*/
|
2010-08-13 20:15:44 +04:00
|
|
|
class LossyConvertEncoding8to16
{
public:
  typedef char value_type;
  typedef char input_type;
  typedef char16_t output_type;

  // aDestination receives the widened characters; nothing here checks its
  // capacity.
  explicit LossyConvertEncoding8to16(char16_t* aDestination)
    : mDestination(aDestination)
  {
  }

  // Appends aSourceLength characters from aSource, widening each byte to
  // one 16-bit unit.
  void
  write(const char* aSource, uint32_t aSourceLength)
  {
#ifdef MOZILLA_MAY_SUPPORT_SSE2
    if (mozilla::supports_sse2()) {
      write_sse2(aSource, aSourceLength);
      return;
    }
#endif
    for (uint32_t i = 0; i < aSourceLength; ++i) {
      // Go through unsigned char so bytes >= 0x80 map to 0x0080 - 0x00FF
      // instead of being sign-extended.
      *mDestination++ = (char16_t)(unsigned char)(aSource[i]);
    }
  }

  // SSE2 flavour of write(); only declared here.
  void
  write_sse2(const char* aSource, uint32_t aSourceLength);

  // Stores a terminating zero at the current position without advancing.
  void
  write_terminator()
  {
    *mDestination = (char16_t)(0);
  }

private:
  char16_t* mDestination;
};
|
2010-08-13 20:15:44 +04:00
|
|
|
|
|
|
|
/**
|
|
|
|
* A character sink that performs a |reinterpret_cast|-style conversion
|
2014-01-04 19:02:17 +04:00
|
|
|
* from char16_t to char.
|
2010-08-13 20:15:44 +04:00
|
|
|
*/
|
|
|
|
class LossyConvertEncoding16to8
{
public:
  typedef char16_t value_type;
  typedef char16_t input_type;
  typedef char output_type;

  // aDestination receives the narrowed characters; nothing here checks its
  // capacity.
  explicit LossyConvertEncoding16to8(char* aDestination)
    : mDestination(aDestination)
  {
  }

  // Appends aSourceLength characters from aSource, narrowing each 16-bit
  // unit to a |char| with a plain cast.
  void
  write(const char16_t* aSource, uint32_t aSourceLength)
  {
#ifdef MOZILLA_MAY_SUPPORT_SSE2
    if (mozilla::supports_sse2()) {
      write_sse2(aSource, aSourceLength);
      return;
    }
#endif
    for (uint32_t i = 0; i < aSourceLength; ++i) {
      *mDestination++ = (char)(aSource[i]);
    }
  }

#ifdef MOZILLA_MAY_SUPPORT_SSE2
  // SSE2 flavour of write(); only declared here.
  void
  write_sse2(const char16_t* aSource, uint32_t aSourceLength);
#endif

  // Stores a terminating zero at the current position without advancing.
  void
  write_terminator()
  {
    *mDestination = '\0';
  }

private:
  char* mDestination;
};
|
2007-12-31 18:15:43 +03:00
|
|
|
#endif // MOZILLA_INTERNAL_API
|
2004-02-19 05:44:03 +03:00
|
|
|
|
2003-05-22 02:20:27 +04:00
|
|
|
#endif /* !defined(nsUTF8Utils_h_) */
|