зеркало из https://github.com/mozilla/gecko-dev.git
Bug 746900 - Implement "Best Practices for Using U+FFFD" from the Unicode standard. r=smontagu
This commit is contained in:
Родитель
7f4216bbb3
Коммит
cd71c36381
|
@ -214,52 +214,38 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc,
|
|||
mFirst = false;
|
||||
|
||||
for (in = aSrc; ((in < inend) && (out < outend)); ++in) {
|
||||
uint8_t c = *in;
|
||||
if (0 == mState) {
|
||||
// When mState is zero we expect either a US-ASCII character or a
|
||||
// multi-octet sequence.
|
||||
if (0 == (0x80 & (*in))) {
|
||||
if (c < 0x80) { // 00..7F
|
||||
int32_t max_loops = NS_MIN(inend - in, outend - out);
|
||||
Convert_ascii_run(in, out, max_loops);
|
||||
--in; // match the rest of the cases
|
||||
mBytes = 1;
|
||||
} else if (0xC0 == (0xE0 & (*in))) {
|
||||
} else if (c < 0xC2) { // C0/C1
|
||||
// Overlong 2 octet sequence
|
||||
res = NS_ERROR_ILLEGAL_INPUT;
|
||||
break;
|
||||
} else if (c < 0xE0) { // C2..DF
|
||||
// First octet of 2 octet sequence
|
||||
mUcs4 = (uint32_t)(*in);
|
||||
mUcs4 = c;
|
||||
mUcs4 = (mUcs4 & 0x1F) << 6;
|
||||
mState = 1;
|
||||
mBytes = 2;
|
||||
} else if (0xE0 == (0xF0 & (*in))) {
|
||||
} else if (c < 0xF0) { // E0..EF
|
||||
// First octet of 3 octet sequence
|
||||
mUcs4 = (uint32_t)(*in);
|
||||
mUcs4 = c;
|
||||
mUcs4 = (mUcs4 & 0x0F) << 12;
|
||||
mState = 2;
|
||||
mBytes = 3;
|
||||
} else if (0xF0 == (0xF8 & (*in))) {
|
||||
} else if (c < 0xF5) { // F0..F4
|
||||
// First octet of 4 octet sequence
|
||||
mUcs4 = (uint32_t)(*in);
|
||||
mUcs4 = c;
|
||||
mUcs4 = (mUcs4 & 0x07) << 18;
|
||||
mState = 3;
|
||||
mBytes = 4;
|
||||
} else if (0xF8 == (0xFC & (*in))) {
|
||||
/* First octet of 5 octet sequence.
|
||||
*
|
||||
* This is illegal because the encoded codepoint must be either
|
||||
* (a) not the shortest form or
|
||||
* (b) outside the Unicode range of 0-0x10FFFF.
|
||||
* Rather than trying to resynchronize, we will carry on until the end
|
||||
* of the sequence and let the later error handling code catch it.
|
||||
*/
|
||||
mUcs4 = (uint32_t)(*in);
|
||||
mUcs4 = (mUcs4 & 0x03) << 24;
|
||||
mState = 4;
|
||||
mBytes = 5;
|
||||
} else if (0xFC == (0xFE & (*in))) {
|
||||
// First octet of 6 octet sequence, see comments for 5 octet sequence.
|
||||
mUcs4 = (uint32_t)(*in);
|
||||
mUcs4 = (mUcs4 & 1) << 30;
|
||||
mState = 5;
|
||||
mBytes = 6;
|
||||
} else {
|
||||
} else { // F5..FF
|
||||
/* Current octet is neither in the US-ASCII range nor a legal first
|
||||
* octet of a multi-octet sequence.
|
||||
*
|
||||
|
@ -272,32 +258,34 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc,
|
|||
} else {
|
||||
// When mState is non-zero, we expect a continuation of the multi-octet
|
||||
// sequence
|
||||
if (0x80 == (0xC0 & (*in))) {
|
||||
if (0x80 == (0xC0 & c)) {
|
||||
if (mState > 1) {
|
||||
// If we are here, all possibilities are:
|
||||
// mState == 2 && mBytes == 3 ||
|
||||
// mState == 2 && mBytes == 4 ||
|
||||
// mState == 3 && mBytes == 4
|
||||
if (mBytes == 3 && (!mUcs4 && c < 0xA0 || // E0 80..9F
|
||||
mUcs4 == 0xD000 && c > 0x9F) || // ED A0..BF
|
||||
mState == 3 && (!mUcs4 && c < 0x90 || // F0 80..8F
|
||||
mUcs4 == 0x100000 && c > 0x8F)) { // F4 90..BF
|
||||
// illegal sequences or sequences converted into illegal ranges.
|
||||
in--;
|
||||
res = NS_ERROR_ILLEGAL_INPUT;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Legal continuation.
|
||||
uint32_t shift = (mState - 1) * 6;
|
||||
uint32_t tmp = *in;
|
||||
uint32_t tmp = c;
|
||||
tmp = (tmp & 0x0000003FL) << shift;
|
||||
mUcs4 |= tmp;
|
||||
|
||||
if (0 == --mState) {
|
||||
/* End of the multi-octet sequence. mUcs4 now contains the final
|
||||
* Unicode codepoint to be output
|
||||
*
|
||||
* Check for illegal sequences and codepoints.
|
||||
*/
|
||||
|
||||
// From Unicode 3.1, non-shortest form is illegal
|
||||
if (((2 == mBytes) && (mUcs4 < 0x0080)) ||
|
||||
((3 == mBytes) && (mUcs4 < 0x0800)) ||
|
||||
((4 == mBytes) && (mUcs4 < 0x10000)) ||
|
||||
(4 < mBytes) ||
|
||||
// From Unicode 3.2, surrogate characters are illegal
|
||||
((mUcs4 & 0xFFFFF800) == 0xD800) ||
|
||||
// Codepoints outside the Unicode range are illegal
|
||||
(mUcs4 > 0x10FFFF)) {
|
||||
res = NS_ERROR_ILLEGAL_INPUT;
|
||||
break;
|
||||
}
|
||||
if (mUcs4 > 0xFFFF) {
|
||||
// mUcs4 is in the range 0x10000 - 0x10FFFF. Output a UTF-16 pair
|
||||
if (out + 2 > outend) {
|
||||
|
@ -320,7 +308,7 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc,
|
|||
mFirst = false;
|
||||
}
|
||||
} else {
|
||||
/* ((0xC0 & (*in) != 0x80) && (mState != 0))
|
||||
/* (((0xC0 & c) != 0x80) && (mState != 0))
|
||||
*
|
||||
* Incomplete multi-octet sequence. Unconsume this
|
||||
* octet and return an error condition. Caller is responsible
|
||||
|
|
|
@ -50,9 +50,9 @@ NS_IMETHODIMP nsUnicodeToUTF8::Convert(const PRUnichar * aSrc,
|
|||
return NS_OK_UENC_MOREOUTPUT;
|
||||
}
|
||||
if (*src < (PRUnichar)0xdc00 || *src > (PRUnichar)0xdfff) { //not a pair
|
||||
*dest++ = (char)0xe0 | (mHighSurrogate >> 12);
|
||||
*dest++ = (char)0x80 | ((mHighSurrogate >> 6) & 0x003f);
|
||||
*dest++ = (char)0x80 | (mHighSurrogate & 0x003f);
|
||||
*dest++ = (char)0xef; //replacement character
|
||||
*dest++ = (char)0xbf;
|
||||
*dest++ = (char)0xbd;
|
||||
destLen -= 3;
|
||||
} else {
|
||||
n = ((mHighSurrogate - (PRUnichar)0xd800) << 10) +
|
||||
|
@ -79,7 +79,17 @@ NS_IMETHODIMP nsUnicodeToUTF8::Convert(const PRUnichar * aSrc,
|
|||
*dest++ = (char)0xc0 | (*src >> 6);
|
||||
*dest++ = (char)0x80 | (*src & 0x003f);
|
||||
destLen -= 2;
|
||||
} else if (*src >= (PRUnichar)0xD800 && *src < (PRUnichar)0xDC00) {
|
||||
} else if (*src >= (PRUnichar)0xd800 && *src <= (PRUnichar)0xdfff) {
|
||||
if (*src >= (PRUnichar)0xdc00) { //not a pair
|
||||
if (destLen < 3)
|
||||
goto error_more_output;
|
||||
*dest++ = (char)0xef; //replacement character
|
||||
*dest++ = (char)0xbf;
|
||||
*dest++ = (char)0xbd;
|
||||
destLen -= 3;
|
||||
++src;
|
||||
continue;
|
||||
}
|
||||
if ((src+1) >= srcEnd) {
|
||||
//we need another surrogate to complete this unicode char
|
||||
mHighSurrogate = *src;
|
||||
|
@ -90,9 +100,9 @@ NS_IMETHODIMP nsUnicodeToUTF8::Convert(const PRUnichar * aSrc,
|
|||
if (destLen < 4)
|
||||
goto error_more_output;
|
||||
if (*(src+1) < (PRUnichar)0xdc00 || *(src+1) > 0xdfff) { //not a pair
|
||||
*dest++ = (char)0xe0 | (*src >> 12);
|
||||
*dest++ = (char)0x80 | ((*src >> 6) & 0x003f);
|
||||
*dest++ = (char)0x80 | (*src & 0x003f);
|
||||
*dest++ = (char)0xef; //replacement character
|
||||
*dest++ = (char)0xbf;
|
||||
*dest++ = (char)0xbd;
|
||||
destLen -= 3;
|
||||
} else {
|
||||
n = ((*src - (PRUnichar)0xd800) << 10) + (*(src+1) - (PRUnichar)0xdc00) + (uint32_t)0x10000;
|
||||
|
@ -133,9 +143,9 @@ NS_IMETHODIMP nsUnicodeToUTF8::Finish(char * aDest, int32_t * aDestLength)
|
|||
*aDestLength = 0;
|
||||
return NS_OK_UENC_MOREOUTPUT;
|
||||
}
|
||||
*dest++ = (char)0xe0 | (mHighSurrogate >> 12);
|
||||
*dest++ = (char)0x80 | ((mHighSurrogate >> 6) & 0x003f);
|
||||
*dest++ = (char)0x80 | (mHighSurrogate & 0x003f);
|
||||
*dest++ = (char)0xef; //replacement character
|
||||
*dest++ = (char)0xbf;
|
||||
*dest++ = (char)0xbd;
|
||||
mHighSurrogate = 0;
|
||||
*aDestLength = 3;
|
||||
return NS_OK;
|
||||
|
|
|
@ -3,30 +3,92 @@
|
|||
const Cc = Components.Constructor;
|
||||
const Ci = Components.interfaces;
|
||||
|
||||
const inStrings1 = new Array("%c0%af", // long forms of 0x2F
|
||||
"%e0%80%af",
|
||||
"%f0%80%80%af",
|
||||
"%f8%80%80%80%af",
|
||||
"%fc%80%80%80%80%af",
|
||||
// lone surrogates
|
||||
const tests = [
|
||||
{ inStrings: ["%80", // Illegal or incomplete sequences
|
||||
"%8f",
|
||||
"%90",
|
||||
"%9f",
|
||||
"%a0",
|
||||
"%bf",
|
||||
"%c0",
|
||||
"%c1",
|
||||
"%c2",
|
||||
"%df",
|
||||
"%e0",
|
||||
"%e0%a0",
|
||||
"%e0%bf",
|
||||
"%ed%80",
|
||||
"%ed%9f",
|
||||
"%ef",
|
||||
"%ef%bf",
|
||||
"%f0",
|
||||
"%f0%90",
|
||||
"%f0%90%80",
|
||||
"%f0%90%bf",
|
||||
"%f0%bf",
|
||||
"%f0%bf%80",
|
||||
"%f0%bf%bf",
|
||||
"%f4",
|
||||
"%f4%80",
|
||||
"%f4%80%80",
|
||||
"%f4%80%bf",
|
||||
"%f4%8f",
|
||||
"%f4%8f%80",
|
||||
"%f4%8f%bf",
|
||||
"%f5",
|
||||
"%f7",
|
||||
"%f8",
|
||||
"%fb",
|
||||
"%fc",
|
||||
"%fd"],
|
||||
expected: "ABC\ufffdXYZ" },
|
||||
|
||||
{ inStrings: ["%c0%af", // Illegal bytes in 2-octet
|
||||
"%c1%af"], // sequences
|
||||
expected: "ABC\ufffd\ufffdXYZ" },
|
||||
|
||||
{ inStrings: ["%e0%80%80", // Illegal bytes in 3-octet
|
||||
"%e0%80%af", // sequences
|
||||
"%e0%9f%bf",
|
||||
// lone surrogates
|
||||
"%ed%a0%80", // D800
|
||||
"%ed%ad%bf", // DB7F
|
||||
"%ed%ae%80", // DB80
|
||||
"%ed%af%bf", // DBFF
|
||||
"%ed%b0%80", // DC00
|
||||
"%ed%be%80", // DF80
|
||||
"%ed%bf%bf"); // DFFF
|
||||
const expected1 = "ABC\ufffdXYZ";
|
||||
"%ed%bf%bf"], // DFFF
|
||||
expected: "ABC\ufffd\ufffd\ufffdXYZ" },
|
||||
|
||||
{ inStrings: ["%f0%80%80%80", // Illegal bytes in 4-octet
|
||||
"%f0%80%80%af", // sequences
|
||||
"%f0%8f%bf%bf",
|
||||
"%f4%90%80%80",
|
||||
"%f4%bf%bf%bf",
|
||||
"%f5%80%80%80",
|
||||
"%f7%bf%bf%bf"],
|
||||
expected: "ABC\ufffd\ufffd\ufffd\ufffdXYZ" },
|
||||
|
||||
{ inStrings: ["%f8%80%80%80%80", // Illegal bytes in 5-octet
|
||||
"%f8%80%80%80%af", // sequences
|
||||
"%fb%bf%bf%bf%bf"],
|
||||
expected: "ABC\ufffd\ufffd\ufffd\ufffd\ufffdXYZ" },
|
||||
|
||||
// Surrogate pairs
|
||||
const inStrings2 = new Array("%ed%a0%80%ed%b0%80", // D800 DC00
|
||||
{ inStrings: ["%ed%a0%80%ed%b0%80", // D800 DC00
|
||||
"%ed%a0%80%ed%bf%bf", // D800 DFFF
|
||||
"%ed%ad%bf%ed%b0%80", // DB7F DC00
|
||||
"%ed%ad%bf%ed%bf%bf", // DB7F DFFF
|
||||
"%ed%ae%80%ed%b0%80", // DB80 DC00
|
||||
"%ed%ae%80%ed%bf%bf", // DB80 DFFF
|
||||
"%ed%af%bf%ed%b0%80", // DBFF DC00
|
||||
"%ed%ad%bf%ed%bf%bf"); // DBFF DFFF
|
||||
const expected2 = "ABC\ufffd\ufffdXYZ";
|
||||
"%ed%ad%bf%ed%bf%bf", // DBFF DFFF
|
||||
"%fc%80%80%80%80%80", // Illegal bytes in 6-octet
|
||||
"%fc%80%80%80%80%af", // sequences
|
||||
"%fd%bf%bf%bf%bf%bf"],
|
||||
expected: "ABC\ufffd\ufffd\ufffd\ufffd\ufffd\ufffdXYZ" },
|
||||
];
|
||||
|
||||
|
||||
function testCaseInputStream(inStr, expected)
|
||||
{
|
||||
|
@ -66,12 +128,9 @@ function testCaseInputStream(inStr, expected)
|
|||
}
|
||||
|
||||
function run_test() {
|
||||
for (var i = 0; i < inStrings1.length; ++i) {
|
||||
var inStr = inStrings1[i];
|
||||
testCaseInputStream(inStr, expected1);
|
||||
for (var t of tests) {
|
||||
for (var inStr of t.inStrings) {
|
||||
testCaseInputStream(inStr, t.expected);
|
||||
}
|
||||
for (var i = 0; i < inStrings2.length; ++i) {
|
||||
var inStr = inStrings2[i];
|
||||
testCaseInputStream(inStr, expected2);
|
||||
}
|
||||
}
|
||||
|
|
Загрузка…
Ссылка в новой задаче