From bf2788d424cd2377e52c0401e536ea002897ae22 Mon Sep 17 00:00:00 2001 From: "jshin%mailaps.org" Date: Tue, 1 Jun 2004 17:26:27 +0000 Subject: [PATCH] bug 236941 (patch by Jean-Marc Desperrier) : UTF-8 converter loses full lines of text if there's any invalid character (r=jshin, sr=bienvenu) --- mailnews/mime/src/mimemoz2.cpp | 81 +++++++++++++++++++++++++--------- mailnews/mime/src/mimetext.cpp | 4 ++ 2 files changed, 63 insertions(+), 22 deletions(-) diff --git a/mailnews/mime/src/mimemoz2.cpp b/mailnews/mime/src/mimemoz2.cpp index ea3751bdf5df..5e7c37cc2078 100644 --- a/mailnews/mime/src/mimemoz2.cpp +++ b/mailnews/mime/src/mimemoz2.cpp @@ -765,31 +765,68 @@ int ConvertUsingEncoderAndDecoder(const char *stringToUse, PRInt32 inLength, rv = NS_ERROR_OUT_OF_MEMORY; } else { - // convert to unicode - rv = decoder->Convert(stringToUse, &srcLen, unichars, &unicharLength); - if (NS_SUCCEEDED(rv)) { - rv = encoder->GetMaxLength(unichars, unicharLength, &dstLength); - // allocale an output buffer - dstPtr = (char *) PR_Malloc(dstLength + 1); - if (dstPtr == nsnull) { - rv = NS_ERROR_OUT_OF_MEMORY; - } - else { - PRInt32 buffLength = dstLength; - // convert from unicode - rv = encoder->SetOutputErrorBehavior(nsIUnicodeEncoder::kOnError_Replace, nsnull, '?'); + // convert to unicode, replacing failed chars with 0xFFFD as in + // the method used in nsXMLHttpRequest::ConvertBodyToText and nsScanner::Append + // + // We will need several passes to convert the whole string if it has invalid characters + // 'totalChars' accumulates the total number of converted characters + // 'dataLen' is the number of characters left to convert + // 'outLen' is the number of characters still available in the output buffer as input to decoder->Convert + // and the number of characters written to it as output. 
+ PRInt32 totalChars = 0, + inBufferIndex = 0, + outBufferIndex = 0; + PRInt32 dataLen = srcLen, + outLen = unicharLength; + + do { + PRInt32 inBufferLength = dataLen; + rv = decoder->Convert(&stringToUse[inBufferIndex], + &inBufferLength, + &unichars[outBufferIndex], + &outLen); + totalChars += outLen; + // Done if conversion successful + if (NS_SUCCEEDED(rv)) + break; + + // We consume one byte, replace it with U+FFFD + // and try the conversion again. + outBufferIndex += outLen; + unichars[outBufferIndex++] = PRUnichar(0xFFFD); + // totalChars is updated here + outLen = unicharLength - (++totalChars); + + inBufferIndex += inBufferLength + 1; + dataLen -= inBufferLength + 1; + + decoder->Reset(); + + // If there is not at least one byte available after the one we + // consumed, we're done + } while ( dataLen > 0 ); + + rv = encoder->GetMaxLength(unichars, totalChars, &dstLength); + // allocale an output buffer + dstPtr = (char *) PR_Malloc(dstLength + 1); + if (dstPtr == nsnull) { + rv = NS_ERROR_OUT_OF_MEMORY; + } + else { + PRInt32 buffLength = dstLength; + // convert from unicode + rv = encoder->SetOutputErrorBehavior(nsIUnicodeEncoder::kOnError_Replace, nsnull, '?'); + if (NS_SUCCEEDED(rv)) { + rv = encoder->Convert(unichars, &totalChars, dstPtr, &dstLength); if (NS_SUCCEEDED(rv)) { - rv = encoder->Convert(unichars, &unicharLength, dstPtr, &dstLength); + PRInt32 finLen = buffLength - dstLength; + rv = encoder->Finish((char *)(dstPtr+dstLength), &finLen); if (NS_SUCCEEDED(rv)) { - PRInt32 finLen = buffLength - dstLength; - rv = encoder->Finish((char *)(dstPtr+dstLength), &finLen); - if (NS_SUCCEEDED(rv)) { - dstLength += finLen; - } - dstPtr[dstLength] = '\0'; - *pConvertedString = dstPtr; // set the result string - *outLength = dstLength; + dstLength += finLen; } + dstPtr[dstLength] = '\0'; + *pConvertedString = dstPtr; // set the result string + *outLength = dstLength; } } } diff --git a/mailnews/mime/src/mimetext.cpp b/mailnews/mime/src/mimetext.cpp index 
c96e46740c05..ce042a6f61ef 100644 --- a/mailnews/mime/src/mimetext.cpp +++ b/mailnews/mime/src/mimetext.cpp @@ -390,6 +390,10 @@ MimeInlineText_convert_and_parse_line(char *line, PRInt32 length, MimeObject *ob //initiate decoder if not yet if (text->inputDecoder == nsnull) MIME_get_unicode_decoder(text->charset, getter_AddRefs(text->inputDecoder)); + // If no decoder found, use "UTF-8", which will map most non-US-ASCII chars as invalid + // A pure-ASCII only decoder would be better, but there is none + if (text->inputDecoder == nsnull) + MIME_get_unicode_decoder("UTF-8", getter_AddRefs(text->inputDecoder)); if (text->utf8Encoder == nsnull) MIME_get_unicode_encoder("UTF-8", getter_AddRefs(text->utf8Encoder));