Adding I18N functions for pref migration, bug 14349, r=ftang.

2000-01-11 01:24:21 +00:00 · 2000-01-11 01:24:21 +00:00 · 98b37023a9
--- a/profile/pref-migrator/src/nsPrefMigration.cpp
+++ b/profile/pref-migrator/src/nsPrefMigration.cpp
@ -49,6 +49,12 @@
 #include "nsNetUtil.h"
 #include "nsCRT.h"

+#define NS_IMPL_IDS
+#include "nsICharsetConverterManager.h"
+#include "nsIPlatformCharset.h"
+#undef NS_IMPL_IDS
+
+
 /* Network */

 #include "net.h"
@ -203,6 +209,8 @@ static NS_DEFINE_CID(kCommonDialogsCID, NS_CommonDialog_CID);
 static NS_DEFINE_CID(kDialogParamBlockCID, NS_DialogParamBlock_CID);
 static NS_DEFINE_CID(kFileLocatorCID,       NS_FILELOCATOR_CID);

+static NS_DEFINE_CID(kCharsetConverterManagerCID, NS_ICHARSETCONVERTERMANAGER_CID);
+
 nsPrefMigration* nsPrefMigration::mInstance = nsnull;

 nsPrefMigration *
@ -1937,3 +1945,189 @@ nsPrefMigration::ShowPMDialogEngine(nsIDialogParamBlock *ioParamBlock, const cha

 #endif /* 0 */

+
+// This is to be called per string pref to check whether the input pref needs
+// charset conversion (from platform charset to UTF-8) for the pref migration.
+PRBool
+nsPrefMigration::PrefStringNeedsCharsetConversion(const char* prefName)
+{
+  //TODO: we need a complete list of 4.x prefs which are saved as the platform charset.
+  // also may need an extensibility in addition to the hard coded names.
+  const char *names[] = {
+    "mail.identity.username",
+     ""
+  };
+
+  for (int i = 0; names[i][0]; i++) {
+    if (!nsCRT::strcasecmp(prefName, names[i]))
+      return PR_TRUE;
+  }
+
+  return PR_FALSE;
+}
+
+// for UTF-8 detection
+#define kLeft1BitMask  0x80
+#define kLeft2BitsMask 0xC0
+#define kLeft3BitsMask 0xE0
+#define kLeft4BitsMask 0xF0
+#define kLeft5BitsMask 0xF8
+#define kLeft6BitsMask 0xFC
+#define kLeft7BitsMask 0xFE
+
+#define k2BytesLeadByte kLeft2BitsMask
+#define k3BytesLeadByte kLeft3BitsMask
+#define k4BytesLeadByte kLeft4BitsMask
+#define k5BytesLeadByte kLeft5BitsMask
+#define k6BytesLeadByte kLeft6BitsMask
+#define kTrialByte      kLeft1BitMask
+
+#define UTF8_1Byte(c) ( 0 == ((c) & kLeft1BitMask))
+#define UTF8_2Bytes(c) ( k2BytesLeadByte == ((c) & kLeft3BitsMask))
+#define UTF8_3Bytes(c) ( k3BytesLeadByte == ((c) & kLeft4BitsMask))
+#define UTF8_4Bytes(c) ( k4BytesLeadByte == ((c) & kLeft5BitsMask))
+#define UTF8_5Bytes(c) ( k5BytesLeadByte == ((c) & kLeft6BitsMask))
+#define UTF8_6Bytes(c) ( k6BytesLeadByte == ((c) & kLeft7BitsMask))
+#define UTF8_ValidTrialByte(c) ( kTrialByte == ((c) & kLeft2BitsMask))
+
+// Check if the given C string is UTF-8 or not.
+PRBool
+nsPrefMigration::IsUTF8String(const unsigned char* utf8)
+{
+  if(NULL == utf8)
+    return PR_TRUE;
+  return IsUTF8Text(utf8, nsCRT::strlen((char *)utf8));}
+
+// Check if the given buffer is UTF-8 or not.
+PRBool
+nsPrefMigration::IsUTF8Text(const unsigned char* utf8, PRInt32 len)
+{
+   PRInt32 i;
+   PRInt32 j;
+   PRInt32 clen;
+   for(i =0; i < len; i += clen)
+   {
+      if(UTF8_1Byte(utf8[i]))
+      {
+        clen = 1;
+      } else if(UTF8_2Bytes(utf8[i])) {
+        clen = 2;
+        /* No enough trail bytes */
+        if( (i + clen) > len) 
+          return PR_FALSE;
+        /* 0000 0000 - 0000 007F : should encode in less bytes */
+        if(0 ==  (utf8[i] & 0x1E )) 
+          return PR_FALSE;
+      } else if(UTF8_3Bytes(utf8[i])) {
+        clen = 3;
+        /* No enough trail bytes */
+        if( (i + clen) > len) 
+          return PR_FALSE;
+        /* a single Surrogate should not show in 3 bytes UTF8, instead, the pair 
+           should be intepreted
+           as one single UCS4 char and encoded UTF8 in 4 bytes */
+        if((0xED == utf8[i] ) && (0xA0 ==  (utf8[i+1] & 0xA0 ) )) 
+          return PR_FALSE;
+        /* 0000 0000 - 0000 07FF : should encode in less bytes */
+        if((0 ==  (utf8[i] & 0x0F )) && (0 ==  (utf8[i+1] & 0x20 ) )) 
+          return PR_FALSE;
+      } else if(UTF8_4Bytes(utf8[i])) {
+        clen = 4;
+        /* No enough trail bytes */
+        if( (i + clen) > len) 
+          return PR_FALSE;
+        /* 0000 0000 - 0000 FFFF : should encode in less bytes */
+        if((0 ==  (utf8[i] & 0x07 )) && (0 ==  (utf8[i+1] & 0x30 )) ) 
+          return PR_FALSE;
+      } else if(UTF8_5Bytes(utf8[i])) {
+        clen = 5;
+        /* No enough trail bytes */
+        if( (i + clen) > len) 
+          return FALSE;
+        /* 0000 0000 - 001F FFFF : should encode in less bytes */
+        if((0 ==  (utf8[i] & 0x03 )) && (0 ==  (utf8[i+1] & 0x38 )) ) 
+          return PR_FALSE;
+      } else if(UTF8_6Bytes(utf8[i])) {
+        clen = 6;
+        /* No enough trail bytes */
+        if( (i + clen) > len) 
+          return PR_FALSE;
+        /* 0000 0000 - 03FF FFFF : should encode in less bytes */
+        if((0 ==  (utf8[i] & 0x01 )) && (0 ==  (utf8[i+1] & 0x3E )) ) 
+          return PR_FALSE;
+      } else {
+        return PR_FALSE;
+      }
+      for(j = 1; j<clen ;j++)
+      {
+        if(! UTF8_ValidTrialByte(utf8[i+j])) /* Trail bytes invalid */
+          return PR_FALSE;
+      }
+   }
+   return PR_TRUE;
+}
+
+// A wrapper function to call the interface to get a platform file charset.
+nsresult 
+nsPrefMigration::GetPlatformCharset(nsAutoString& aCharset)
+{
+  nsresult rv;
+
+  // we may cache it since the platform charset will not change through application life
+  nsCOMPtr <nsIPlatformCharset> platformCharset;
+  rv = nsComponentManager::CreateInstance(NS_PLATFORMCHARSET_PROGID, nsnull, 
+                                         NS_GET_IID(nsIPlatformCharset), getter_AddRefs(platformCharset));
+  if (NS_SUCCEEDED(rv)) {
+   rv = platformCharset->GetCharset(kPlatformCharsetSel_FileName, aCharset);
+  }
+  if (NS_FAILED(rv)) {
+   aCharset.SetString("ISO-8859-1");  // use ISO-8859-1 in case of any error
+  }
+ 
+  return rv;
+}
+
+// Apply a charset conversion from the given charset to UTF-8 for the input C string.
+nsresult 
+nsPrefMigration::ConvertStringToUTF8(nsAutoString& aCharset, const char* inString, char** outString)
+{
+  if (nsnull == outString)
+    return NS_ERROR_NULL_POINTER;
+
+  nsresult rv;
+  // convert result to unicode
+  NS_WITH_SERVICE(nsICharsetConverterManager, ccm, kCharsetConverterManagerCID, &rv);
+
+  if(NS_SUCCEEDED(rv)) {
+    nsCOMPtr <nsIUnicodeDecoder> decoder; // this may be cached
+
+    rv = ccm->GetUnicodeDecoder(&aCharset, getter_AddRefs(decoder));
+    if(NS_SUCCEEDED(rv) && decoder) {
+      PRInt32 uniLength = 0;
+      PRInt32 srcLength = nsCRT::strlen(inString);
+      rv = decoder->GetMaxLength(inString, srcLength, &uniLength);
+      if (NS_SUCCEEDED(rv)) {
+        PRUnichar *unichars = new PRUnichar [uniLength];
+
+        if (nsnull != unichars) {
+          // convert to unicode
+          rv = decoder->Convert(inString, &srcLength, unichars, &uniLength);
+          if (NS_SUCCEEDED(rv)) {
+            nsAutoString aString;
+            aString.SetString(unichars, uniLength);
+            // convert to UTF-8
+            *outString = aString.ToNewUTF8String();
+          }
+          delete [] unichars;
+        }
+        else {
+          rv = NS_ERROR_OUT_OF_MEMORY;
+        }
+      }
+    }    
+  }
+
+  return rv;
+}
+
+
--- a/profile/pref-migrator/src/nsPrefMigration.h
+++ b/profile/pref-migrator/src/nsPrefMigration.h
@ -121,7 +121,37 @@ class nsPrefMigration: public nsIPrefMigration
      nsresult GetPremigratedFilePref(const char *pref_name, nsIFileSpec **filePath);
 #endif /* NEED_TO_COPY_AND_RENAME_NEWSRC_FILES */
      
-      
+      // I18N pref migration:
+      //
+      // 5.0 stores pref strings are UTF-8 while 4.x stores them either plat form charset or UTF-8
+      // depends on the pref.
+      // Functions here provide possible two ways to deal with the I18N migration.
+      // 
+      // 1) Use the knowleage of which 4.x pref strings are platform charset. 
+      // If PrefStringNeedsCharsetConversion() returns true then the string to be converted to UTF-8.
+      //
+      // 2) Apply UTF-8 detection to all string pres. Apply the conversion if the string is detected as UTF-8.
+      //
+      // The user of the functions need to decide 1) or 2).
+      // The functions to get platform charset and charset conversion code to UTF-8 are also provided.
+      //
+
+      // This is to be called per string pref to check whether the input pref needs
+      // charset conversion (from platform charset to UTF-8) for the pref migration.
+      PRBool PrefStringNeedsCharsetConversion(const char* prefName);
+
+      // Check if the given C string is UTF-8 or not.
+      PRBool IsUTF8String(const unsigned char* utf8);
+
+      // Check if the given buffer is UTF-8 or not.
+      PRBool IsUTF8Text(const unsigned char* utf8, PRInt32 len);
+
+      // A wrapper function to call the interface to get a platform file charset.
+      nsresult GetPlatformCharset(nsAutoString& aCharset);
+
+      // Apply a charset conversion from the given charset to UTF-8 for the input C string.
+      nsresult ConvertStringToUTF8(nsAutoString& aCharset, const char* inString, char** outString);
+
      nsIPref* m_prefs;
      nsresult getPrefService();
      nsCOMPtr<nsIFileSpec> m_prefsFile;