header decoding should use folder charset: bug 65277, allow spaces bug 69251 r=ducarroz r=nhotta sr=sspitzer

2001-02-22 03:02:00 +00:00 · 2001-02-22 03:02:00 +00:00 · 7a36aa7cf0
--- a/mailnews/mime/src/comi18n.cpp
+++ b/mailnews/mime/src/comi18n.cpp
@ -56,7 +56,7 @@ extern "C"  char * MIME_StripContinuations(char *original);

 ////////////////////////////////////////////////////////////////////////////////
 //  Pasted from the old code (xp_wrap.c)
-//  Removed multi byte support because we use utf-8 internally and no overwrap with us-ascii.
+//  Removed multi byte support because we use UTF-8 internally and no overwrap with us-ascii.

 #undef OUTPUT
 #define OUTPUT(b) \
@ -528,7 +528,7 @@ static char *intlmime_encode_next8bitword(char *src)
    {
      break;
    }
-    p++; // the string is utf-8 thus no conflict with scanning chars (which is all us-ascii).
+    p++; // the string is UTF-8 thus no conflict with scanning chars (which is all us-ascii).
  }

  if (non_ascii)
@ -628,7 +628,7 @@ char * utf8_mime_encode_mail_address(char *charset, const char *src, int maxLine
  *retbuf = '\0';

  // loop for separating encoded words by the separators
-  // the input string is utf-8 at this point
+  // the input string is UTF-8 at this point
  srclen = nsCRT::strlen(srcbuf);

 convert_and_encode:
@ -685,17 +685,17 @@ convert_and_encode:
          end = q;
          break;
        }
-        q++;  // the string is utf-8 thus no conflict with scanning chars (which is all us-ascii).
+        q++;  // the string is UTF-8 thus no conflict with scanning chars (which is all us-ascii).
      }
    }

-    // convert utf-8 to mail charset
+    // convert UTF-8 to mail charset
    /* get the to_be_converted_buffer's len */
    len = nsCRT::strlen(begin);

    if ( !intlmime_only_ascii_str(begin) )
    {
-      // now the input is utf-8, a character may be more than 2 bytes len
+      // now the input is UTF-8, a character may be more than 2 bytes len
      // so we may over estimate (i.e. threshold may be smaller) but wrapping early is no problem, I think.

      /*
@ -720,7 +720,7 @@ convert_and_encode:
      }

      // loop for line wrapping: estimate converted/encoded length 
-      // and apply conversion (utf-8 to mail charset)
+      // and apply conversion (UTF-8 to mail charset)

      /* iEffectLen - the max byte-string length of JIS ( converted form S-JIS )
         name - such as "iso-2022-jp", the encoding name, MUST be shorter than 23 bytes
@ -749,7 +749,7 @@ convert_and_encode:
          */
          return NULL;
        }
-        // utf-8 to mail charset conversion (or iso-8859-1 in case of us-ascii).
+        // UTF-8 to mail charset conversion (or iso-8859-1 in case of us-ascii).
        PRUnichar *u = NULL;
        nsAutoString fmt; fmt.AssignWithConversion("%s");
        char aChar = begin[len];
@ -957,7 +957,7 @@ convert_and_encode:

 */

-// input utf-8, return NULL in case of error.
+// input UTF-8, return NULL in case of error.
 static
 char *utf8_EncodeMimePartIIStr(const char *subject, char *charset, int maxLineLen)
 {
@ -1048,12 +1048,89 @@ static char *intlmime_decode_q(const char *in, unsigned length)
  return NULL;
 }

+static PRBool intl_is_legal_utf8(const char *input, unsigned len)  // PR_TRUE unless input[0..len) holds a malformed UTF-8 sequence
+{
+  int c;  // current lead byte, read as unsigned so the bit tests below see 0..255
+
+  while (len) {
+    c = (unsigned char)*input++;
+    len--;
+    if (c == 0x1B) break;  // ESC: stop scanning; the bytes seen so far were acceptable, so the result is PR_TRUE
+    if ((c & 0x80) == 0) continue;  // 7-bit byte
+    if ((c & 0xE0) == 0xC0) {  // 110xxxxx: lead byte of a 2-byte sequence
+      if (len < 1 || (*input & 0xC0) != 0x80 ||
+        ((c & 0x1F)<<6) + (*input & 0x3f) < 0x80) {  // also reject overlong forms of values below 0x80
+        return PR_FALSE;
+      }
+      input++;
+      len--;
+    } else if ((c & 0xF0) == 0xE0) {  // 1110xxxx: lead byte of a 3-byte sequence
+      if (len < 2 || (input[0] & 0xC0) != 0x80 ||
+        (input[1] & 0xC0) != 0x80) {
+        return PR_FALSE;
+      }
+      input += 2;
+      len -= 2;
+    } else if ((c & 0xF8) == 0xF0) {  // 11110xxx: lead byte of a 4-byte sequence
+      if (len < 3 || (input[0] & 0xC0) != 0x80 ||
+        (input[1] & 0xC0) != 0x80 || (input[2] & 0xC0) != 0x80) {
+        return PR_FALSE;
+      }
+      input += 3;  // consume all three continuation bytes; skipping only two leaves the third to be misread as a lead byte
+      len -= 3;
+    } else {  // stray continuation byte (10xxxxxx) or a lead byte of 0xF8 and above
+      return PR_FALSE;
+    }
+  }
+  return PR_TRUE;
+}
+
+static void intl_copy_uncoded_header(char **output, const char *input,
+  unsigned len, const char *default_charset)  // appends input[0..len) at *output (converted to UTF-8 if needed) and advances *output past what was written
+{  // NOTE(review): no destination capacity is passed in; the caller must guarantee room for the converted text — confirm buffer sizing
+  int c;
+  char *dest = *output;
+  char *utf8_text;
+  PRInt32 output_len;
+
+  if (!default_charset) {  // no fallback charset given: copy the bytes through unchanged
+    memcpy(dest, input, len);
+    *output = dest + len;
+    return;
+  }
+
+  // Copy the leading run of US-ASCII verbatim; stop at the first 8-bit byte or at ESC, which may indicate ISO 2022
+  while (len && (c = (unsigned char)*input++) != 0x1B && !(c & 0x80)) {
+    *dest++ = c;
+    len--;
+  }
+  if (!len) {  // the whole input was plain 7-bit text
+    *output = dest;
+    return;
+  }
+  input--;  // the loop's post-increment consumed the byte that stopped it; step back onto that byte
+
+  // If the remainder is not legal UTF-8, convert it from default_charset; if it is legal, or conversion fails, copy it as-is
+  if (!intl_is_legal_utf8(input, len) &&
+      MIME_ConvertCharset(PR_FALSE, default_charset, "UTF-8",
+                          input, len, &utf8_text, &output_len, NULL) == 0) {
+    memcpy(dest, utf8_text, output_len);
+    *output = dest + output_len;
+    PR_Free(utf8_text);
+  } else {
+    memcpy(dest, input, len);
+    *output = dest + len;
+  }
+}
+
 static char *especials = "()<>@,;:\\\"/[]?.=";

 static
-char *intl_decode_mime_part2_str(const char *header)
+char *intl_decode_mime_part2_str(const char *header,
+  const char *default_charset, PRBool override_charset)
 {
  char *output_p = NULL;
+  PRInt32 output_len;
  char *retbuff = NULL;
  const char *p, *q, *r;
  char *decoded_text, *utf8_text;
@ -1066,7 +1143,7 @@ char *intl_decode_mime_part2_str(const char *header)
  charset[0] = '\0';

  /* Assume no more than 2X expansion due to UTF-8 conversion */
-  retbuff = (char *)PR_Malloc(2*strlen(header)+1);
+  retbuff = (char *)PR_Malloc(2*nsCRT::strlen(header)+1);

  if (retbuff == NULL)
    return NULL;
@ -1084,8 +1161,7 @@ char *intl_decode_mime_part2_str(const char *header)

    if (!last_saw_encoded_word || q < p) {
      /* copy the part before the encoded-word */
-      PL_strncpy(output_p, begin, p - begin);
-      output_p += p - begin;
+      intl_copy_uncoded_header(&output_p, begin, p - begin, default_charset);
      begin = p;
    }

@ -1123,7 +1199,7 @@ char *intl_decode_mime_part2_str(const char *header)

    r = q;
    for (r = q + 2; *r != '?'; r++) {
-      if (*r <= ' ') goto badsyntax;
+      if (*r < ' ') goto badsyntax;
    }
    if (r == q + 2 || r[1] != '=') goto badsyntax;

@ -1135,9 +1211,16 @@ char *intl_decode_mime_part2_str(const char *header)
    if (decoded_text == NULL)
      goto badsyntax;

-    if (MIME_ConvertString(charset, "UTF-8", decoded_text, &utf8_text) == 0) {
-      PL_strcpy(output_p, (char *)utf8_text);
-      output_p += nsCRT::strlen(utf8_text);
+    // Override charset if requested.  Never override labeled UTF-8.
+    if (override_charset && 0 != nsCRT::strcasecmp(charset, "UTF-8")) {
+      PL_strcpy(charset, default_charset);
+    }
+
+    if (MIME_ConvertCharset(PR_FALSE, charset, "UTF-8", 
+                            decoded_text, nsCRT::strlen(decoded_text),
+                            &utf8_text, &output_len, NULL) == 0) {
+      memcpy(output_p, utf8_text, output_len);
+      output_p += output_len;
      PR_Free(utf8_text);
    } else {
      PL_strcpy(output_p, "\347\277\275"); /* UTF-8 encoding of U+FFFD */
@ -1157,13 +1240,53 @@ char *intl_decode_mime_part2_str(const char *header)
    begin = p;
    last_saw_encoded_word = 0;
  }
-  PL_strcpy(output_p, (char *)begin);     /* put the tail back  */
+
+  /* put the tail back  */
+  intl_copy_uncoded_header(&output_p, begin, nsCRT::strlen(begin), default_charset);
+  *output_p = '\0';

  return retbuff;
 }

 ////////////////////////////////////////////////////////////////////////////////

+class MimeCharsetConverterClass {  // charset converter holding a Unicode decoder/encoder pair, with optional charset auto-detection
+public:
+  MimeCharsetConverterClass();
+  virtual ~MimeCharsetConverterClass();
+
+  // Initialize converters for charsets, fails if converter not available.
+  // autoDetect defaults to PR_FALSE and maxNumCharsDetect to -1 (presumably "no limit", as for mMaxNumCharsDetect).
+  PRInt32 Initialize(const char* from_charset, const char* to_charset, 
+                     const PRBool autoDetect=PR_FALSE, const PRInt32 maxNumCharsDetect=-1);
+
+  // Converts input buffer or duplicates input if converters not available (and returns 0).
+  // Also duplicates input if conversion is not needed.
+  // A C string is generated for the converted string.
+  PRInt32 Convert(const char* inBuffer, const PRInt32 inLength, 
+                  char** outBuffer, PRInt32* outLength,
+                  PRInt32* numUnConverted);
+
+  static nsIStringCharsetDetector *mDetector;  // charset detector (static: one per class, not per instance)
+
+protected:
+  nsIUnicodeDecoder * GetUnicodeDecoder() {return (mAutoDetect && NULL != mDecoderDetected) ? mDecoderDetected : mDecoder;}
+  nsIUnicodeEncoder * GetUnicodeEncoder() {return mEncoder;}
+  PRBool NeedCharsetConversion(const nsString& from_charset, const nsString& to_charset);
+
+private:
+  nsIUnicodeDecoder *mDecoder;          // decoder (convert to unicode)
+  nsIUnicodeEncoder *mEncoder;          // encoder (convert from unicode)
+  nsIUnicodeDecoder *mDecoderDetected;  // decoder of the detected charset (once auto detection has succeeded)
+  PRInt32 mMaxNumCharsDetect;           // maximum number of characters, in bytes, after which auto detection is aborted
+                                        // (-1 for no limit)
+  PRInt32 mNumChars;                    // accumulated number of characters converted, in bytes
+  PRBool mAutoDetect;                   // true if auto detection should be applied
+  nsString mInputCharset;               // input charset: auto detection hint, also used by the need-conversion check
+  nsString mOutputCharset;              // output charset, used by the need-conversion check
+  static nsCString mDetectorContractID;     // ContractID of charset detector
+};
+
 nsIStringCharsetDetector* MimeCharsetConverterClass::mDetector = NULL;
 nsCString MimeCharsetConverterClass::mDetectorContractID;

@ -1456,6 +1579,32 @@ PRInt32 MIME_ConvertCharset(const PRBool autoDetection, const char* from_charset
  return res;
 }

+extern "C" char *MIME_DecodeMimeHeader(const char *header, 
+                                       const char *default_charset,
+                                       PRBool override_charset,
+                                       PRBool eatContinuations)
+{  // returns a decoded or duplicated string, or nsnull when header is null or needs neither decoding nor unfolding
+  char *result = nsnull;
+
+  if (header == 0)
+    return nsnull;
+
+  // Decode when the header contains an encoded-word marker ("=?") or, given a default charset, is not legal UTF-8.
+  if (PL_strstr(header, "=?") ||
+      (default_charset && !intl_is_legal_utf8(header, nsCRT::strlen(header)))) {
+	  result = intl_decode_mime_part2_str(header, default_charset, override_charset);
+  } else if (eatContinuations && 
+             (PL_strchr(header, '\n') || PL_strchr(header, '\r'))) {
+    result = nsCRT::strdup(header);  // nothing to decode, but CR/LF is present: duplicate so continuations can be stripped
+  } else {
+    eatContinuations = PR_FALSE;  // nothing to do: leave result null and skip the strip below
+  }
+  if (eatContinuations)
+    result = MIME_StripContinuations(result);  // NOTE(review): result may be null here if decode/strdup failed — confirm the callee tolerates null
+
+  return result;
+}  
+
 extern "C" char *MIME_DecodeMimePartIIStr(const char *header, char *charset,
                                          PRBool eatContinuations)
 {
@ -1466,7 +1615,7 @@ extern "C" char *MIME_DecodeMimePartIIStr(const char *header, char *charset,

  // If no MIME encoded then do nothing otherwise decode the input.
  if (*header != '\0' && PL_strstr(header, "=?")) {
-	  result = intl_decode_mime_part2_str(header);
+	  result = intl_decode_mime_part2_str(header, NULL, PR_FALSE);
      if (charset) PL_strcpy(charset, "UTF-8");
  }
  else if (charset && *charset == '\0') {
@ -1515,18 +1664,3 @@ void comi18n_destructor()
 } /* end of extern "C" */
 // END PUBLIC INTERFACE

-/*
-main()
-{
-        char *encoded, *decoded;
-        printf("mime\n");
-        encoded = intl_EncodeMimePartIIStr("hello worldÉ", INTL_CsidToCharsetNamePt(0), PR_TRUE,
-kMIME_ENCODED_WORD_SIZE);
-        printf("%s\n", encoded);
-        decoded = intl_DecodeMimePartIIStr((const char *) encoded,
-nsCRT::strlen(encoded), PR_TRUE);
-
-        return 0;
-}
-*/
-
--- a/mailnews/mime/src/comi18n.h
+++ b/mailnews/mime/src/comi18n.h
@ -36,54 +36,32 @@ class nsIUnicodeDecoder;
 class nsIUnicodeEncoder;
 class nsIStringCharsetDetector;

-class MimeCharsetConverterClass {
-public:
-  MimeCharsetConverterClass();
-  virtual ~MimeCharsetConverterClass();
-
-  // Initialize converters for charsets, fails if converter not available.
-  // 
-  PRInt32 Initialize(const char* from_charset, const char* to_charset, 
-                     const PRBool autoDetect=PR_FALSE, const PRInt32 maxNumCharsDetect=-1);
-
-  // Converts input buffer or duplicates input if converters not available (and returns 0).
-  // Also duplicates input if convertion not needed.
-  // C string is generated for converted string.
-  PRInt32 Convert(const char* inBuffer, const PRInt32 inLength, 
-                  char** outBuffer, PRInt32* outLength,
-                  PRInt32* numUnConverted);
-
-  static nsIStringCharsetDetector *mDetector;  // charset detector
-
-protected:
-  nsIUnicodeDecoder * GetUnicodeDecoder() {return (mAutoDetect && NULL != mDecoderDetected) ? mDecoderDetected : mDecoder;}
-  nsIUnicodeEncoder * GetUnicodeEncoder() {return mEncoder;}
-  PRBool NeedCharsetConversion(const nsString& from_charset, const nsString& to_charset);
-
-private:
-  nsIUnicodeDecoder *mDecoder;          // decoder (convert to unicode)  
-  nsIUnicodeEncoder *mEncoder;          // encoder (convert from unicode)
-  nsIUnicodeDecoder *mDecoderDetected;  // decoder of detected charset (after when auto detection succeeded)
-  PRInt32 mMaxNumCharsDetect;           // maximum number of characters in bytes to abort auto detection 
-                                        // (-1 for no limit)
-  PRInt32 mNumChars;                    // accumulated number of characters converted in bytes
-  PRBool mAutoDetect;                   // true if apply auto detection
-  nsString mInputCharset;               // input charset for auto detection hint as well as need conversion check
-  nsString mOutputCharset;              // output charset for need conversion check
-  static nsCString mDetectorContractID;     // ContractID of charset detector
-};
-
-  

 #ifdef __cplusplus
 extern "C" {
 #endif /* __cplusplus */

+/**
+ * Decode MIME header to UTF-8.
+ * This is a replacement for MIME_DecodeMimePartIIStr
+ * Uses MIME_ConvertCharset if the decoded string needs a conversion.
+ *
+ *
+ * @param header      [IN] A header to decode.
+ * @param default_charset     [IN] Default charset to apply to unlabeled non-UTF-8 8bit data
+ * @param override_charset    [IN] If PR_TRUE, default_charset used instead of any charset labeling other than UTF-8
+ * @param eatContinuations    [IN] If PR_TRUE, unfold headers
+ * @return            Decoded buffer (in C string) or return NULL if the header needs no conversion
+ */
+extern "C" char *MIME_DecodeMimeHeader(const char *header, 
+                                       const char *default_charset,
+                                       PRBool override_charset,
+                                       PRBool eatContinuations);
+
 /**
 * If a header is MIME encoded then decode a header and sets a charset name.
- * This is a replacement for INTL_DecodeMimePartIIStr.
- * Unlike INTL_DecodeMimePartIIStr, this does not apply any charset conversion.
- * Use MIME_ConvertCharset if the decoded string needs a conversion.
+ * Obsolete.
+ * Uses MIME_ConvertCharset if the decoded string needs a conversion.
 *
 *
 * @param header      [IN] A header to decode.
--- a/mailnews/mime/src/nsMsgHeaderParser.cpp
+++ b/mailnews/mime/src/nsMsgHeaderParser.cpp
@ -226,35 +226,15 @@ nsMsgHeaderParser::nsMsgHeaderParser()
 {
  /* the following macro is used to initialize the ref counting data */
  NS_INIT_REFCNT();
-  m_USAsciiToUtf8CharsetConverter = nsnull;
  mUnicodeConverter = do_GetService(kCMimeConverterCID);
 }

 nsMsgHeaderParser::~nsMsgHeaderParser()
 {
-	delete m_USAsciiToUtf8CharsetConverter;
 }

 NS_IMPL_ISUPPORTS1(nsMsgHeaderParser, nsIMsgHeaderParser)

-MimeCharsetConverterClass *nsMsgHeaderParser::GetUSAsciiToUtf8CharsetConverter()
-{
-	if (!m_USAsciiToUtf8CharsetConverter)
-	{
-		m_USAsciiToUtf8CharsetConverter = new MimeCharsetConverterClass;
-		if (m_USAsciiToUtf8CharsetConverter)
-		{
-			nsresult rv = m_USAsciiToUtf8CharsetConverter->Initialize("us-ascii","UTF-8", PR_FALSE);
-			if (!NS_SUCCEEDED(rv))
-			{
-				delete m_USAsciiToUtf8CharsetConverter;
-				m_USAsciiToUtf8CharsetConverter = nsnull;
-			}
-		}
-	}
-	return m_USAsciiToUtf8CharsetConverter;
-}
-
 NS_IMETHODIMP nsMsgHeaderParser::ParseHeadersWithEnumerator(const PRUnichar *line, 
                                                            nsISimpleEnumerator **aResultEnumerator)
 {
@ -288,7 +268,6 @@ NS_IMETHODIMP nsMsgHeaderParser::ParseHeadersWithEnumerator(const PRUnichar *lin
 nsresult nsMsgHeaderParser::ParseHeaderAddresses (const char *charset, const char *line, char **names, char **addresses, PRUint32 *numAddresses)
 {
  char *utf8Str, *outStrings;
-  MimeCharsetConverterClass *converter = nsnull;
  nsresult rv=NS_OK;

  if (nsnull == line || MIME_ConvertString(CHARSET(charset), "UTF-8", line, &utf8Str) != 0) {
@ -310,12 +289,9 @@ nsresult nsMsgHeaderParser::ParseHeaderAddresses (const char *charset, const cha
    // convert array of strings
 	if (!charset)
 	{
-		converter = GetUSAsciiToUtf8CharsetConverter();
-		if (converter)
-			rv = converter->Convert(*names, len_all, &outStrings, &outStrLen, nsnull);
-	}
-	if (!converter)
-	{
+    outStrings = (char *)PL_strdup(*names);
+    rv = NS_OK;
+  } else {
 		rv = MIME_ConvertCharset(PR_FALSE, "UTF-8", CHARSET(charset), *names, 
                            len_all, &outStrings, &outStrLen, NULL) ; 
 	}
@ -336,14 +312,9 @@ nsresult nsMsgHeaderParser::ParseHeaderAddresses (const char *charset, const cha
    // convert array of strings
 	if (!charset)
 	{
-		converter = GetUSAsciiToUtf8CharsetConverter();
-		if (converter)
-			rv = converter->Convert(*addresses, 
-                            len_all, &outStrings, &outStrLen, nsnull);
-	}
-	// if non null charset, or couldn't get a converter, use MIME_ function.
-	if (!converter)
-	{
+    outStrings = (char *)PL_strdup(*addresses);
+    rv = NS_OK;
+  } else {
 		rv = MIME_ConvertCharset(PR_FALSE, "UTF-8", CHARSET(charset), *addresses, 
                            len_all, &outStrings, &outStrLen, NULL);
 	}
--- a/mailnews/mime/src/nsMsgHeaderParser.h
+++ b/mailnews/mime/src/nsMsgHeaderParser.h
@ -50,10 +50,8 @@ public:

  NS_DECL_NSIMSGHEADERPARSER
 	
-	MimeCharsetConverterClass *GetUSAsciiToUtf8CharsetConverter();
 protected:
  nsCOMPtr<nsIMimeConverter> mUnicodeConverter;
-  MimeCharsetConverterClass *m_USAsciiToUtf8CharsetConverter;
 }; 

 #endif /* nsMSGRFCPARSER_h__ */