diff --git a/intl/uconv/src/nsUTF8ToUnicode.cpp b/intl/uconv/src/nsUTF8ToUnicode.cpp index fb3810efae26..fc76259c26b0 100644 --- a/intl/uconv/src/nsUTF8ToUnicode.cpp +++ b/intl/uconv/src/nsUTF8ToUnicode.cpp @@ -214,52 +214,38 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc, mFirst = false; for (in = aSrc; ((in < inend) && (out < outend)); ++in) { + uint8_t c = *in; if (0 == mState) { // When mState is zero we expect either a US-ASCII character or a // multi-octet sequence. - if (0 == (0x80 & (*in))) { + if (c < 0x80) { // 00..7F int32_t max_loops = NS_MIN(inend - in, outend - out); Convert_ascii_run(in, out, max_loops); --in; // match the rest of the cases mBytes = 1; - } else if (0xC0 == (0xE0 & (*in))) { + } else if (c < 0xC2) { // C0/C1 + // Overlong 2 octet sequence + res = NS_ERROR_ILLEGAL_INPUT; + break; + } else if (c < 0xE0) { // C2..DF // First octet of 2 octet sequence - mUcs4 = (uint32_t)(*in); + mUcs4 = c; mUcs4 = (mUcs4 & 0x1F) << 6; mState = 1; mBytes = 2; - } else if (0xE0 == (0xF0 & (*in))) { + } else if (c < 0xF0) { // E0..EF // First octet of 3 octet sequence - mUcs4 = (uint32_t)(*in); + mUcs4 = c; mUcs4 = (mUcs4 & 0x0F) << 12; mState = 2; mBytes = 3; - } else if (0xF0 == (0xF8 & (*in))) { + } else if (c < 0xF5) { // F0..F4 // First octet of 4 octet sequence - mUcs4 = (uint32_t)(*in); + mUcs4 = c; mUcs4 = (mUcs4 & 0x07) << 18; mState = 3; mBytes = 4; - } else if (0xF8 == (0xFC & (*in))) { - /* First octet of 5 octet sequence. - * - * This is illegal because the encoded codepoint must be either - * (a) not the shortest form or - * (b) outside the Unicode range of 0-0x10FFFF. - * Rather than trying to resynchronize, we will carry on until the end - * of the sequence and let the later error handling code catch it. - */ - mUcs4 = (uint32_t)(*in); - mUcs4 = (mUcs4 & 0x03) << 24; - mState = 4; - mBytes = 5; - } else if (0xFC == (0xFE & (*in))) { - // First octet of 6 octet sequence, see comments for 5 octet sequence. 
- mUcs4 = (uint32_t)(*in); - mUcs4 = (mUcs4 & 1) << 30; - mState = 5; - mBytes = 6; - } else { + } else { // F5..FF /* Current octet is neither in the US-ASCII range nor a legal first * octet of a multi-octet sequence. * @@ -272,32 +258,34 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc, } else { // When mState is non-zero, we expect a continuation of the multi-octet // sequence - if (0x80 == (0xC0 & (*in))) { + if (0x80 == (0xC0 & c)) { + if (mState > 1) { + // If we are here, all possibilities are: + // mState == 2 && mBytes == 3 || + // mState == 2 && mBytes == 4 || + // mState == 3 && mBytes == 4 + if (mBytes == 3 && (!mUcs4 && c < 0xA0 || // E0 80..9F + mUcs4 == 0xD000 && c > 0x9F) || // ED A0..BF + mState == 3 && (!mUcs4 && c < 0x90 || // F0 80..8F + mUcs4 == 0x100000 && c > 0x8F)) { // F4 90..BF + // illegal sequences or sequences converted into illegal ranges. + in--; + res = NS_ERROR_ILLEGAL_INPUT; + break; + } + } + // Legal continuation. uint32_t shift = (mState - 1) * 6; - uint32_t tmp = *in; + uint32_t tmp = c; tmp = (tmp & 0x0000003FL) << shift; mUcs4 |= tmp; if (0 == --mState) { /* End of the multi-octet sequence. mUcs4 now contains the final * Unicode codepoint to be output - * - * Check for illegal sequences and codepoints. */ - // From Unicode 3.1, non-shortest form is illegal - if (((2 == mBytes) && (mUcs4 < 0x0080)) || - ((3 == mBytes) && (mUcs4 < 0x0800)) || - ((4 == mBytes) && (mUcs4 < 0x10000)) || - (4 < mBytes) || - // From Unicode 3.2, surrogate characters are illegal - ((mUcs4 & 0xFFFFF800) == 0xD800) || - // Codepoints outside the Unicode range are illegal - (mUcs4 > 0x10FFFF)) { - res = NS_ERROR_ILLEGAL_INPUT; - break; - } if (mUcs4 > 0xFFFF) { // mUcs4 is in the range 0x10000 - 0x10FFFF. 
Output a UTF-16 pair if (out + 2 > outend) { @@ -320,7 +308,7 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc, mFirst = false; } } else { - /* ((0xC0 & (*in) != 0x80) && (mState != 0)) + /* ((0xC0 & c != 0x80) && (mState != 0)) * * Incomplete multi-octet sequence. Unconsume this * octet and return an error condition. Caller is responsible diff --git a/intl/uconv/src/nsUnicodeToUTF8.cpp b/intl/uconv/src/nsUnicodeToUTF8.cpp index b3fcee5e07e8..8fc76bde989f 100644 --- a/intl/uconv/src/nsUnicodeToUTF8.cpp +++ b/intl/uconv/src/nsUnicodeToUTF8.cpp @@ -50,9 +50,9 @@ NS_IMETHODIMP nsUnicodeToUTF8::Convert(const PRUnichar * aSrc, return NS_OK_UENC_MOREOUTPUT; } if (*src < (PRUnichar)0xdc00 || *src > (PRUnichar)0xdfff) { //not a pair - *dest++ = (char)0xe0 | (mHighSurrogate >> 12); - *dest++ = (char)0x80 | ((mHighSurrogate >> 6) & 0x003f); - *dest++ = (char)0x80 | (mHighSurrogate & 0x003f); + *dest++ = (char)0xef; //replacement character + *dest++ = (char)0xbf; + *dest++ = (char)0xbd; destLen -= 3; } else { n = ((mHighSurrogate - (PRUnichar)0xd800) << 10) + @@ -79,7 +79,17 @@ NS_IMETHODIMP nsUnicodeToUTF8::Convert(const PRUnichar * aSrc, *dest++ = (char)0xc0 | (*src >> 6); *dest++ = (char)0x80 | (*src & 0x003f); destLen -= 2; - } else if (*src >= (PRUnichar)0xD800 && *src < (PRUnichar)0xDC00) { + } else if (*src >= (PRUnichar)0xd800 && *src <= (PRUnichar)0xdfff) { + if (*src >= (PRUnichar)0xdc00) { //not a pair + if (destLen < 3) + goto error_more_output; + *dest++ = (char)0xef; //replacement character + *dest++ = (char)0xbf; + *dest++ = (char)0xbd; + destLen -= 3; + ++src; + continue; + } if ((src+1) >= srcEnd) { //we need another surrogate to complete this unicode char mHighSurrogate = *src; @@ -90,9 +100,9 @@ NS_IMETHODIMP nsUnicodeToUTF8::Convert(const PRUnichar * aSrc, if (destLen < 4) goto error_more_output; if (*(src+1) < (PRUnichar)0xdc00 || *(src+1) > 0xdfff) { //not a pair - *dest++ = (char)0xe0 | (*src >> 12); - *dest++ = (char)0x80 | ((*src >> 6) & 
0x003f); - *dest++ = (char)0x80 | (*src & 0x003f); + *dest++ = (char)0xef; //replacement character + *dest++ = (char)0xbf; + *dest++ = (char)0xbd; destLen -= 3; } else { n = ((*src - (PRUnichar)0xd800) << 10) + (*(src+1) - (PRUnichar)0xdc00) + (uint32_t)0x10000; @@ -133,9 +143,9 @@ NS_IMETHODIMP nsUnicodeToUTF8::Finish(char * aDest, int32_t * aDestLength) *aDestLength = 0; return NS_OK_UENC_MOREOUTPUT; } - *dest++ = (char)0xe0 | (mHighSurrogate >> 12); - *dest++ = (char)0x80 | ((mHighSurrogate >> 6) & 0x003f); - *dest++ = (char)0x80 | (mHighSurrogate & 0x003f); + *dest++ = (char)0xef; //replacement character + *dest++ = (char)0xbf; + *dest++ = (char)0xbd; mHighSurrogate = 0; *aDestLength = 3; return NS_OK; diff --git a/intl/uconv/tests/unit/test_utf8_illegals.js b/intl/uconv/tests/unit/test_utf8_illegals.js index f54073726a64..a1a029d63149 100644 --- a/intl/uconv/tests/unit/test_utf8_illegals.js +++ b/intl/uconv/tests/unit/test_utf8_illegals.js @@ -2,31 +2,93 @@ const Cc = Components.Constructor; const Ci = Components.interfaces; - -const inStrings1 = new Array("%c0%af", // long forms of 0x2F - "%e0%80%af", - "%f0%80%80%af", - "%f8%80%80%80%af", - "%fc%80%80%80%80%af", - // lone surrogates - "%ed%a0%80", // D800 - "%ed%ad%bf", // DB7F - "%ed%ae%80", // DB80 - "%ed%af%bf", // DBFF - "%ed%b0%80", // DC00 - "%ed%be%80", // DF80 - "%ed%bf%bf"); // DFFF -const expected1 = "ABC\ufffdXYZ"; - // Surrogate pairs -const inStrings2 = new Array("%ed%a0%80%ed%b0%80", // D800 DC00 - "%ed%a0%80%ed%bf%bf", // D800 DFFF - "%ed%ad%bf%ed%b0%80", // DB7F DC00 - "%ed%ad%bf%ed%bf%bf", // DB7F DFFF - "%ed%ae%80%ed%b0%80", // DB80 DC00 - "%ed%ae%80%ed%bf%bf", // DB80 DFFF - "%ed%af%bf%ed%b0%80", // DBFF DC00 - "%ed%ad%bf%ed%bf%bf"); // DBFF DFFF -const expected2 = "ABC\ufffd\ufffdXYZ"; + +const tests = [ +{ inStrings: ["%80", // Illegal or incomplete sequences + "%8f", + "%90", + "%9f", + "%a0", + "%bf", + "%c0", + "%c1", + "%c2", + "%df", + "%e0", + "%e0%a0", + "%e0%bf", + "%ed%80", + 
"%ed%9f", + "%ef", + "%ef%bf", + "%f0", + "%f0%90", + "%f0%90%80", + "%f0%90%bf", + "%f0%bf", + "%f0%bf%80", + "%f0%bf%bf", + "%f4", + "%f4%80", + "%f4%80%80", + "%f4%80%bf", + "%f4%8f", + "%f4%8f%80", + "%f4%8f%bf", + "%f5", + "%f7", + "%f8", + "%fb", + "%fc", + "%fd"], + expected: "ABC\ufffdXYZ" }, + +{ inStrings: ["%c0%af", // Illegal bytes in 2-octet + "%c1%af"], // sequences + expected: "ABC\ufffd\ufffdXYZ" }, + +{ inStrings: ["%e0%80%80", // Illegal bytes in 3-octet + "%e0%80%af", // sequences + "%e0%9f%bf", + // lone surrogates + "%ed%a0%80", // D800 + "%ed%ad%bf", // DB7F + "%ed%ae%80", // DB80 + "%ed%af%bf", // DBFF + "%ed%b0%80", // DC00 + "%ed%be%80", // DF80 + "%ed%bf%bf"], // DFFF + expected: "ABC\ufffd\ufffd\ufffdXYZ" }, + +{ inStrings: ["%f0%80%80%80", // Illegal bytes in 4-octet + "%f0%80%80%af", // sequences + "%f0%8f%bf%bf", + "%f4%90%80%80", + "%f4%bf%bf%bf", + "%f5%80%80%80", + "%f7%bf%bf%bf"], + expected: "ABC\ufffd\ufffd\ufffd\ufffdXYZ" }, + +{ inStrings: ["%f8%80%80%80%80", // Illegal bytes in 5-octet + "%f8%80%80%80%af", // sequences + "%fb%bf%bf%bf%bf"], + expected: "ABC\ufffd\ufffd\ufffd\ufffd\ufffdXYZ" }, + + // Surrogate pairs +{ inStrings: ["%ed%a0%80%ed%b0%80", // D800 DC00 + "%ed%a0%80%ed%bf%bf", // D800 DFFF + "%ed%ad%bf%ed%b0%80", // DB7F DC00 + "%ed%ad%bf%ed%bf%bf", // DB7F DFFF + "%ed%ae%80%ed%b0%80", // DB80 DC00 + "%ed%ae%80%ed%bf%bf", // DB80 DFFF + "%ed%af%bf%ed%b0%80", // DBFF DC00 + "%ed%ad%bf%ed%bf%bf", // DB7F DFFF (repeats the entry above; DBFF DFFF would be %ed%af%bf%ed%bf%bf) + "%fc%80%80%80%80%80", // Illegal bytes in 6-octet + "%fc%80%80%80%80%af", // sequences + "%fd%bf%bf%bf%bf%bf"], + expected: "ABC\ufffd\ufffd\ufffd\ufffd\ufffd\ufffdXYZ" }, +]; + function testCaseInputStream(inStr, expected) { @@ -66,12 +128,9 @@ function testCaseInputStream(inStr, expected) } function run_test() { - for (var i = 0; i < inStrings1.length; ++i) { - var inStr = inStrings1[i]; - testCaseInputStream(inStr, expected1); - } - for (var i = 0; i < inStrings2.length; ++i) { - var inStr = inStrings2[i];
- testCaseInputStream(inStr, expected2); + for (var t of tests) { + for (var inStr of t.inStrings) { + testCaseInputStream(inStr, t.expected); } + } }