Bug 746900 - Implement "Best Practices for Using U+FFFD" from the Unicode standard. r=smontagu

This commit is contained in:
Masatoshi Kimura 2012-11-26 20:38:19 -05:00
Родитель 7f4216bbb3
Коммит cd71c36381
3 изменённых файлов: 143 добавлений и 86 удалений

Просмотреть файл

@ -214,52 +214,38 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc,
mFirst = false;
for (in = aSrc; ((in < inend) && (out < outend)); ++in) {
uint8_t c = *in;
if (0 == mState) {
// When mState is zero we expect either a US-ASCII character or a
// multi-octet sequence.
if (0 == (0x80 & (*in))) {
if (c < 0x80) { // 00..7F
int32_t max_loops = NS_MIN(inend - in, outend - out);
Convert_ascii_run(in, out, max_loops);
--in; // match the rest of the cases
mBytes = 1;
} else if (0xC0 == (0xE0 & (*in))) {
} else if (c < 0xC2) { // C0/C1
// Overlong 2 octet sequence
res = NS_ERROR_ILLEGAL_INPUT;
break;
} else if (c < 0xE0) { // C2..DF
// First octet of 2 octet sequence
mUcs4 = (uint32_t)(*in);
mUcs4 = c;
mUcs4 = (mUcs4 & 0x1F) << 6;
mState = 1;
mBytes = 2;
} else if (0xE0 == (0xF0 & (*in))) {
} else if (c < 0xF0) { // E0..EF
// First octet of 3 octet sequence
mUcs4 = (uint32_t)(*in);
mUcs4 = c;
mUcs4 = (mUcs4 & 0x0F) << 12;
mState = 2;
mBytes = 3;
} else if (0xF0 == (0xF8 & (*in))) {
} else if (c < 0xF5) { // F0..F4
// First octet of 4 octet sequence
mUcs4 = (uint32_t)(*in);
mUcs4 = c;
mUcs4 = (mUcs4 & 0x07) << 18;
mState = 3;
mBytes = 4;
} else if (0xF8 == (0xFC & (*in))) {
/* First octet of 5 octet sequence.
*
* This is illegal because the encoded codepoint must be either
* (a) not the shortest form or
* (b) outside the Unicode range of 0-0x10FFFF.
* Rather than trying to resynchronize, we will carry on until the end
* of the sequence and let the later error handling code catch it.
*/
mUcs4 = (uint32_t)(*in);
mUcs4 = (mUcs4 & 0x03) << 24;
mState = 4;
mBytes = 5;
} else if (0xFC == (0xFE & (*in))) {
// First octet of 6 octet sequence, see comments for 5 octet sequence.
mUcs4 = (uint32_t)(*in);
mUcs4 = (mUcs4 & 1) << 30;
mState = 5;
mBytes = 6;
} else {
} else { // F5..FF
/* Current octet is neither in the US-ASCII range nor a legal first
* octet of a multi-octet sequence.
*
@ -272,32 +258,34 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc,
} else {
// When mState is non-zero, we expect a continuation of the multi-octet
// sequence
if (0x80 == (0xC0 & (*in))) {
if (0x80 == (0xC0 & c)) {
if (mState > 1) {
// If we are here, all possibilities are:
// mState == 2 && mBytes == 3 ||
// mState == 2 && mBytes == 4 ||
// mState == 3 && mBytes == 4
if (mBytes == 3 && (!mUcs4 && c < 0xA0 || // E0 80..9F
mUcs4 == 0xD000 && c > 0x9F) || // ED A0..BF
mState == 3 && (!mUcs4 && c < 0x90 || // F0 80..8F
mUcs4 == 0x100000 && c > 0x8F)) { // F4 90..BF
// illegal sequences or sequences converted into illegal ranges.
in--;
res = NS_ERROR_ILLEGAL_INPUT;
break;
}
}
// Legal continuation.
uint32_t shift = (mState - 1) * 6;
uint32_t tmp = *in;
uint32_t tmp = c;
tmp = (tmp & 0x0000003FL) << shift;
mUcs4 |= tmp;
if (0 == --mState) {
/* End of the multi-octet sequence. mUcs4 now contains the final
* Unicode codepoint to be output
*
* Check for illegal sequences and codepoints.
*/
// From Unicode 3.1, non-shortest form is illegal
if (((2 == mBytes) && (mUcs4 < 0x0080)) ||
((3 == mBytes) && (mUcs4 < 0x0800)) ||
((4 == mBytes) && (mUcs4 < 0x10000)) ||
(4 < mBytes) ||
// From Unicode 3.2, surrogate characters are illegal
((mUcs4 & 0xFFFFF800) == 0xD800) ||
// Codepoints outside the Unicode range are illegal
(mUcs4 > 0x10FFFF)) {
res = NS_ERROR_ILLEGAL_INPUT;
break;
}
if (mUcs4 > 0xFFFF) {
// mUcs4 is in the range 0x10000 - 0x10FFFF. Output a UTF-16 pair
if (out + 2 > outend) {
@ -320,7 +308,7 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc,
mFirst = false;
}
} else {
/* (((0xC0 & (*in)) != 0x80) && (mState != 0))
/* (((0xC0 & c) != 0x80) && (mState != 0))
*
* Incomplete multi-octet sequence. Unconsume this
* octet and return an error condition. Caller is responsible

Просмотреть файл

@ -50,9 +50,9 @@ NS_IMETHODIMP nsUnicodeToUTF8::Convert(const PRUnichar * aSrc,
return NS_OK_UENC_MOREOUTPUT;
}
if (*src < (PRUnichar)0xdc00 || *src > (PRUnichar)0xdfff) { //not a pair
*dest++ = (char)0xe0 | (mHighSurrogate >> 12);
*dest++ = (char)0x80 | ((mHighSurrogate >> 6) & 0x003f);
*dest++ = (char)0x80 | (mHighSurrogate & 0x003f);
*dest++ = (char)0xef; //replacement character
*dest++ = (char)0xbf;
*dest++ = (char)0xbd;
destLen -= 3;
} else {
n = ((mHighSurrogate - (PRUnichar)0xd800) << 10) +
@ -79,7 +79,17 @@ NS_IMETHODIMP nsUnicodeToUTF8::Convert(const PRUnichar * aSrc,
*dest++ = (char)0xc0 | (*src >> 6);
*dest++ = (char)0x80 | (*src & 0x003f);
destLen -= 2;
} else if (*src >= (PRUnichar)0xD800 && *src < (PRUnichar)0xDC00) {
} else if (*src >= (PRUnichar)0xd800 && *src <= (PRUnichar)0xdfff) {
if (*src >= (PRUnichar)0xdc00) { //not a pair
if (destLen < 3)
goto error_more_output;
*dest++ = (char)0xef; //replacement character
*dest++ = (char)0xbf;
*dest++ = (char)0xbd;
destLen -= 3;
++src;
continue;
}
if ((src+1) >= srcEnd) {
//we need another surrogate to complete this unicode char
mHighSurrogate = *src;
@ -90,9 +100,9 @@ NS_IMETHODIMP nsUnicodeToUTF8::Convert(const PRUnichar * aSrc,
if (destLen < 4)
goto error_more_output;
if (*(src+1) < (PRUnichar)0xdc00 || *(src+1) > 0xdfff) { //not a pair
*dest++ = (char)0xe0 | (*src >> 12);
*dest++ = (char)0x80 | ((*src >> 6) & 0x003f);
*dest++ = (char)0x80 | (*src & 0x003f);
*dest++ = (char)0xef; //replacement character
*dest++ = (char)0xbf;
*dest++ = (char)0xbd;
destLen -= 3;
} else {
n = ((*src - (PRUnichar)0xd800) << 10) + (*(src+1) - (PRUnichar)0xdc00) + (uint32_t)0x10000;
@ -133,9 +143,9 @@ NS_IMETHODIMP nsUnicodeToUTF8::Finish(char * aDest, int32_t * aDestLength)
*aDestLength = 0;
return NS_OK_UENC_MOREOUTPUT;
}
*dest++ = (char)0xe0 | (mHighSurrogate >> 12);
*dest++ = (char)0x80 | ((mHighSurrogate >> 6) & 0x003f);
*dest++ = (char)0x80 | (mHighSurrogate & 0x003f);
*dest++ = (char)0xef; //replacement character
*dest++ = (char)0xbf;
*dest++ = (char)0xbd;
mHighSurrogate = 0;
*aDestLength = 3;
return NS_OK;

Просмотреть файл

@ -3,30 +3,92 @@
const Cc = Components.Constructor;
const Ci = Components.interfaces;
// Percent-escaped byte sequences that are not well-formed UTF-8.
// run_test() passes each entry, together with expected1, to
// testCaseInputStream().
const inStrings1 = new Array("%c0%af", // long forms of 0x2F
// (overlong, i.e. non-shortest, encodings of U+002F using 2 to 6 octets)
"%e0%80%af",
"%f0%80%80%af",
"%f8%80%80%80%af",
"%fc%80%80%80%80%af",
// lone surrogates
// (a single UTF-16 surrogate code unit encoded as a 3-octet sequence;
// the trailing comment on each line gives the surrogate's value)
"%ed%a0%80", // D800
"%ed%ad%bf", // DB7F
"%ed%ae%80", // DB80
"%ed%af%bf", // DBFF
"%ed%b0%80", // DC00
"%ed%be%80", // DF80
"%ed%bf%bf"); // DFFF
// Expected decoded text for every entry above: one U+FFFD between "ABC"
// and "XYZ".
// NOTE(review): presumably testCaseInputStream() wraps the escaped bytes in
// "ABC"/"XYZ" before decoding -- confirm against its body, not shown here.
const expected1 = "ABC\ufffdXYZ";
// Surrogate pairs: a high surrogate followed by a low surrogate, each one
// encoded on its own as a 3-octet sequence. The trailing comment on each
// line gives the two surrogate values. The eight entries cover every
// combination of the boundary high surrogates (D800, DB7F, DB80, DBFF)
// with the boundary low surrogates (DC00, DFFF).
const inStrings2 = new Array("%ed%a0%80%ed%b0%80",  // D800 DC00
                             "%ed%a0%80%ed%bf%bf",  // D800 DFFF
                             "%ed%ad%bf%ed%b0%80",  // DB7F DC00
                             "%ed%ad%bf%ed%bf%bf",  // DB7F DFFF
                             "%ed%ae%80%ed%b0%80",  // DB80 DC00
                             "%ed%ae%80%ed%bf%bf",  // DB80 DFFF
                             "%ed%af%bf%ed%b0%80",  // DBFF DC00
                             // Previously a copy of the DB7F DFFF entry, so
                             // the DBFF DFFF pair was never tested.
                             "%ed%af%bf%ed%bf%bf"); // DBFF DFFF
const expected2 = "ABC\ufffd\ufffdXYZ";
// Table of decoder test cases. Each element groups percent-escaped byte
// sequences (inStrings) that must all decode to the same text (expected).
// run_test() passes every inStrings entry, together with the group's
// expected string, to testCaseInputStream().
//
// The groups are ordered by the number of U+FFFD replacement characters the
// input is expected to produce, following the Unicode "Best Practices for
// Using U+FFFD": one U+FFFD for each maximal ill-formed subsequence.
const tests = [
  // One U+FFFD: a single stray or illegal octet, or a truncated sequence
  // whose octets so far are all legal.
  { inStrings: ["%80",              // Illegal or incomplete sequences
                "%8f",
                "%90",
                "%9f",
                "%a0",
                "%bf",
                "%c0",
                "%c1",
                "%c2",
                "%df",
                "%e0",
                "%e0%a0",
                "%e0%bf",
                "%ed%80",
                "%ed%9f",
                "%ef",
                "%ef%bf",
                "%f0",
                "%f0%90",
                "%f0%90%80",
                "%f0%90%bf",
                "%f0%bf",
                "%f0%bf%80",
                "%f0%bf%bf",
                "%f4",
                "%f4%80",
                "%f4%80%80",
                "%f4%80%bf",
                "%f4%8f",
                "%f4%8f%80",
                "%f4%8f%bf",
                "%f5",
                "%f7",
                "%f8",
                "%fb",
                "%fc",
                "%fd"],
    expected: "ABC\ufffdXYZ" },

  // Two U+FFFD: C0 and C1 can only start overlong forms, so the lead octet
  // and the following octet are each replaced.
  { inStrings: ["%c0%af",           // Illegal bytes in 2-octet
                "%c1%af"],          // sequences
    expected: "ABC\ufffd\ufffdXYZ" },

  // Three U+FFFD: the second octet is outside the range allowed after the
  // lead octet (overlong forms after E0, surrogates after ED), so every
  // octet is replaced separately.
  { inStrings: ["%e0%80%80",        // Illegal bytes in 3-octet
                "%e0%80%af",        // sequences
                "%e0%9f%bf",
                // lone surrogates
                "%ed%a0%80",        // D800
                "%ed%ad%bf",        // DB7F
                "%ed%ae%80",        // DB80
                "%ed%af%bf",        // DBFF
                "%ed%b0%80",        // DC00
                "%ed%be%80",        // DF80
                "%ed%bf%bf"],       // DFFF
    expected: "ABC\ufffd\ufffd\ufffdXYZ" },

  // Four U+FFFD: overlong forms after F0, values above U+10FFFF after F4,
  // and the illegal lead octets F5..F7.
  { inStrings: ["%f0%80%80%80",     // Illegal bytes in 4-octet
                "%f0%80%80%af",     // sequences
                "%f0%8f%bf%bf",
                "%f4%90%80%80",
                "%f4%bf%bf%bf",
                "%f5%80%80%80",
                "%f7%bf%bf%bf"],
    expected: "ABC\ufffd\ufffd\ufffd\ufffdXYZ" },

  // Five U+FFFD: F8..FB are never legal lead octets.
  { inStrings: ["%f8%80%80%80%80",  // Illegal bytes in 5-octet
                "%f8%80%80%80%af",  // sequences
                "%fb%bf%bf%bf%bf"],
    expected: "ABC\ufffd\ufffd\ufffd\ufffd\ufffdXYZ" },

  // Six U+FFFD: surrogate pairs encoded as two 3-octet sequences (three
  // replacements per encoded surrogate), and 6-octet forms starting with
  // the illegal lead octets FC/FD.
  { inStrings: [// Surrogate pairs
                "%ed%a0%80%ed%b0%80", // D800 DC00
                "%ed%a0%80%ed%bf%bf", // D800 DFFF
                "%ed%ad%bf%ed%b0%80", // DB7F DC00
                "%ed%ad%bf%ed%bf%bf", // DB7F DFFF
                "%ed%ae%80%ed%b0%80", // DB80 DC00
                "%ed%ae%80%ed%bf%bf", // DB80 DFFF
                "%ed%af%bf%ed%b0%80", // DBFF DC00
                // Previously a copy of the DB7F DFFF entry, so the
                // DBFF DFFF pair was never tested.
                "%ed%af%bf%ed%bf%bf", // DBFF DFFF
                "%fc%80%80%80%80%80", // Illegal bytes in 6-octet
                "%fc%80%80%80%80%af", // sequences
                "%fd%bf%bf%bf%bf%bf"],
    expected: "ABC\ufffd\ufffd\ufffd\ufffd\ufffd\ufffdXYZ" },
];
function testCaseInputStream(inStr, expected)
{
@ -66,12 +128,9 @@ function testCaseInputStream(inStr, expected)
}
function run_test() {
for (var i = 0; i < inStrings1.length; ++i) {
var inStr = inStrings1[i];
testCaseInputStream(inStr, expected1);
}
for (var i = 0; i < inStrings2.length; ++i) {
var inStr = inStrings2[i];
testCaseInputStream(inStr, expected2);
for (var t of tests) {
for (var inStr of t.inStrings) {
testCaseInputStream(inStr, t.expected);
}
}
}