bug 191542 : Add UTF-8 equivalent of |IsASCII|, IsUTF8. r=smontagu, sr=alecf

2003-03-25 08:11:13 +00:00 · 2003-03-25 08:11:13 +00:00 · b9b88d097e
--- a/intl/uconv/src/nsTextToSubURI.cpp
+++ b/intl/uconv/src/nsTextToSubURI.cpp
@ -191,9 +191,8 @@ nsresult nsTextToSubURI::convertURItoUnicode(const nsAFlatCString &aCharset,
  }

  if (!isStatefulCharset && aIRI) {
-    NS_ConvertUTF8toUCS2 ucs2(aURI);
-    if (aURI.Equals(NS_ConvertUCS2toUTF8(ucs2))) {
-      _retval.Assign(ucs2);
+    if (IsUTF8(aURI)) {
+      _retval.Assign(NS_ConvertUTF8toUCS2(aURI));
      return rv;
    }
  }
--- a/string/public/nsReadableUtils.h
+++ b/string/public/nsReadableUtils.h
@ -174,6 +174,26 @@ NS_COM PRBool IsASCII( const nsAString& aString );
 NS_COM PRBool IsASCII( const nsACString& aString );


+  /**
+   * Returns |PR_TRUE| if |aString| is a valid UTF-8 string.
+   * XXX This is neither bullet-proof nor an all-purpose UTF-8 validator.
+   * It is mainly written to replace, and is roughly equivalent to,
+   *
+   *    str.Equals(NS_ConvertUCS2toUTF8(NS_ConvertUTF8toUCS2(str)))
+   *
+   * (see bug 191541)
+   * As such, it does not check for non-UTF-8 7-bit encodings such as
+   * ISO-2022-JP and HZ. However, it filters out UTF-8 representations
+   * of surrogate codepoints and non-characters (0xFFFE and 0xFFFF
+   * in planes 0 through 16) as well as overlong UTF-8 sequences.
+   * Also note that it regards UTF-8 sequences corresponding to
+   * codepoints above 0x10FFFF as invalid in accordance with
+   * http://www.ietf.org/internet-drafts/draft-yergeau-rfc2279bis-04.txt
+   *
+   * @param aString an 8-bit wide string to scan
+   */
+NS_COM PRBool IsUTF8( const nsACString& aString );
+

  /**
   * Converts case in place in the argument string.
--- a/string/src/nsReadableUtils.cpp
+++ b/string/src/nsReadableUtils.cpp
@ -367,7 +367,102 @@ IsASCII( const nsACString& aString )
    return PR_TRUE;
  }

+NS_COM // Validates |aString| as UTF-8 using a byte-at-a-time state machine.
+PRBool // PR_TRUE when no check fails and the string does not end mid-sequence (an empty string passes).
+IsUTF8( const nsACString& aString )
+  {
+    nsReadingIterator<char> done_reading;
+    aString.EndReading(done_reading);

+    PRInt32 state = 0; // continuation bytes still expected; 0 means the next byte starts a new sequence.
+    PRBool overlong = PR_FALSE; // set by lead bytes E0/F0: the next byte is checked against |olupper|.
+    PRBool surrogate = PR_FALSE; // set by lead bytes ED/F4: the next byte is checked against |slower|.
+    PRBool nonchar = PR_FALSE; // the sequence in progress still matches a non-character pattern.
+    PRUint16 olupper = 0; // overlong byte upper bound.
+    PRUint16 slower = 0;  // surrogate byte lower bound.
+
+      // for each chunk of |aString|... (the variables above live outside the loop, so a sequence may span chunks)
+    PRUint32 fragmentLength = 0;
+    nsReadingIterator<char> iter;
+
+    for ( aString.BeginReading(iter); iter != done_reading; iter.advance( PRInt32(fragmentLength) ) )
+      {
+        fragmentLength = PRUint32(iter.size_forward());
+        const char* ptr = iter.get();
+        const char* fragmentEnd = ptr + fragmentLength;
+
+          // for each character in this chunk...
+        while ( ptr < fragmentEnd )
+          {
+            PRUint8 c;
+            
+            if (0 == state) // no sequence in progress: |c| is either ASCII or a lead byte.
+              {
+                c = *ptr++;
+
+                if ( UTF8traits::isASCII(c) ) 
+                  continue;
+
+                if ( c <= 0xC1 ) // [80-BF] where not expected, [C0-C1] for overlong.
+                  return PR_FALSE;
+                else if ( UTF8traits::is2byte(c) ) 
+                    state = 1; // one continuation byte to come.
+                else if ( UTF8traits::is3byte(c) ) 
+                  {
+                    state = 2;
+                    if ( c == 0xE0 ) // to exclude overlong E0[80-9F][80-BF]
+                      {
+                        overlong = PR_TRUE;
+                        olupper = 0x9F;
+                      }
+                    else if ( c == 0xED ) // ED[A0-BF][80-BF] : surrogate codepoint
+                      {
+                        surrogate = PR_TRUE;
+                        slower = 0xA0;
+                      }
+                    else if ( c == 0xEF ) // EF BF [BE-BF] : non-character
+                      nonchar = PR_TRUE;
+                  }
+                else if ( c <= 0xF4 ) // XXX replace /w UTF8traits::is4byte when it's updated to exclude [F5-F7].(bug 199090)
+                  {
+                    state = 3;
+                    nonchar = PR_TRUE; // assume a non-character until a continuation byte rules it out.
+                    if ( c == 0xF0 ) // to exclude overlong F0[80-8F][80-BF]{2}
+                      {
+                        overlong = PR_TRUE;
+                        olupper = 0x8F;
+                      }
+                    else if ( c == 0xF4 ) // to exclude F4[90-BF][80-BF]{2}
+                      {
+                        // actually not surrogates but codepoints beyond 0x10FFFF
+                        surrogate = PR_TRUE;
+                        slower = 0x90;
+                      }
+                  }
+                else
+                  return PR_FALSE; // Not UTF8 string
+              }
+              
+              while (ptr < fragmentEnd && state) // consume the pending continuation bytes available in this chunk.
+                {
+                  c = *ptr++;
+                  --state; // |state| now counts the bytes still expected after |c|.
+
+                  // non-character : EF BF [BE-BF] or F[0-4] [89AB]F BF [BE-BF]; clear |nonchar| once a byte breaks the pattern.
+                  if ( nonchar &&  ( !state &&  c < 0xBE ||
+                       state == 1 && c != 0xBF  ||
+                       state == 2 && 0x0F != (0x0F & c) ))
+                     nonchar = PR_FALSE;
+
+                  if ( !UTF8traits::isInSeq(c) || overlong && c <= olupper || // isInSeq presumably tests for a continuation byte -- confirm in UTF8traits.
+                       surrogate && slower <= c || nonchar && !state ) // |nonchar| surviving to the last byte means a full non-character.
+                    return PR_FALSE; // Not UTF8 string
+                  overlong = surrogate = PR_FALSE; // both bounds apply to the first continuation byte only.
+                }
+            }
+        }
+    return !state; // state != 0 at the end indicates an invalid UTF-8 seq. 
+  }

  /**
   * A character sink for in-place case conversion.
--- a/xpcom/string/public/nsReadableUtils.h
+++ b/xpcom/string/public/nsReadableUtils.h
@ -174,6 +174,26 @@ NS_COM PRBool IsASCII( const nsAString& aString );
 NS_COM PRBool IsASCII( const nsACString& aString );


+  /**
+   * Returns |PR_TRUE| if |aString| is a valid UTF-8 string.
+   * XXX This is neither bullet-proof nor an all-purpose UTF-8 validator.
+   * It is mainly written to replace, and is roughly equivalent to,
+   *
+   *    str.Equals(NS_ConvertUCS2toUTF8(NS_ConvertUTF8toUCS2(str)))
+   *
+   * (see bug 191541)
+   * As such, it does not check for non-UTF-8 7-bit encodings such as
+   * ISO-2022-JP and HZ. However, it filters out UTF-8 representations
+   * of surrogate codepoints and non-characters (0xFFFE and 0xFFFF
+   * in planes 0 through 16) as well as overlong UTF-8 sequences.
+   * Also note that it regards UTF-8 sequences corresponding to
+   * codepoints above 0x10FFFF as invalid in accordance with
+   * http://www.ietf.org/internet-drafts/draft-yergeau-rfc2279bis-04.txt
+   *
+   * @param aString an 8-bit wide string to scan
+   */
+NS_COM PRBool IsUTF8( const nsACString& aString );
+

  /**
   * Converts case in place in the argument string.
--- a/xpcom/string/src/nsReadableUtils.cpp
+++ b/xpcom/string/src/nsReadableUtils.cpp
@ -367,7 +367,102 @@ IsASCII( const nsACString& aString )
    return PR_TRUE;
  }

+NS_COM // Validates |aString| as UTF-8 using a byte-at-a-time state machine.
+PRBool // PR_TRUE when no check fails and the string does not end mid-sequence (an empty string passes).
+IsUTF8( const nsACString& aString )
+  {
+    nsReadingIterator<char> done_reading;
+    aString.EndReading(done_reading);

+    PRInt32 state = 0; // continuation bytes still expected; 0 means the next byte starts a new sequence.
+    PRBool overlong = PR_FALSE; // set by lead bytes E0/F0: the next byte is checked against |olupper|.
+    PRBool surrogate = PR_FALSE; // set by lead bytes ED/F4: the next byte is checked against |slower|.
+    PRBool nonchar = PR_FALSE; // the sequence in progress still matches a non-character pattern.
+    PRUint16 olupper = 0; // overlong byte upper bound.
+    PRUint16 slower = 0;  // surrogate byte lower bound.
+
+      // for each chunk of |aString|... (the variables above live outside the loop, so a sequence may span chunks)
+    PRUint32 fragmentLength = 0;
+    nsReadingIterator<char> iter;
+
+    for ( aString.BeginReading(iter); iter != done_reading; iter.advance( PRInt32(fragmentLength) ) )
+      {
+        fragmentLength = PRUint32(iter.size_forward());
+        const char* ptr = iter.get();
+        const char* fragmentEnd = ptr + fragmentLength;
+
+          // for each character in this chunk...
+        while ( ptr < fragmentEnd )
+          {
+            PRUint8 c;
+            
+            if (0 == state) // no sequence in progress: |c| is either ASCII or a lead byte.
+              {
+                c = *ptr++;
+
+                if ( UTF8traits::isASCII(c) ) 
+                  continue;
+
+                if ( c <= 0xC1 ) // [80-BF] where not expected, [C0-C1] for overlong.
+                  return PR_FALSE;
+                else if ( UTF8traits::is2byte(c) ) 
+                    state = 1; // one continuation byte to come.
+                else if ( UTF8traits::is3byte(c) ) 
+                  {
+                    state = 2;
+                    if ( c == 0xE0 ) // to exclude overlong E0[80-9F][80-BF]
+                      {
+                        overlong = PR_TRUE;
+                        olupper = 0x9F;
+                      }
+                    else if ( c == 0xED ) // ED[A0-BF][80-BF] : surrogate codepoint
+                      {
+                        surrogate = PR_TRUE;
+                        slower = 0xA0;
+                      }
+                    else if ( c == 0xEF ) // EF BF [BE-BF] : non-character
+                      nonchar = PR_TRUE;
+                  }
+                else if ( c <= 0xF4 ) // XXX replace /w UTF8traits::is4byte when it's updated to exclude [F5-F7].(bug 199090)
+                  {
+                    state = 3;
+                    nonchar = PR_TRUE; // assume a non-character until a continuation byte rules it out.
+                    if ( c == 0xF0 ) // to exclude overlong F0[80-8F][80-BF]{2}
+                      {
+                        overlong = PR_TRUE;
+                        olupper = 0x8F;
+                      }
+                    else if ( c == 0xF4 ) // to exclude F4[90-BF][80-BF]{2}
+                      {
+                        // actually not surrogates but codepoints beyond 0x10FFFF
+                        surrogate = PR_TRUE;
+                        slower = 0x90;
+                      }
+                  }
+                else
+                  return PR_FALSE; // Not UTF8 string
+              }
+              
+              while (ptr < fragmentEnd && state) // consume the pending continuation bytes available in this chunk.
+                {
+                  c = *ptr++;
+                  --state; // |state| now counts the bytes still expected after |c|.
+
+                  // non-character : EF BF [BE-BF] or F[0-4] [89AB]F BF [BE-BF]; clear |nonchar| once a byte breaks the pattern.
+                  if ( nonchar &&  ( !state &&  c < 0xBE ||
+                       state == 1 && c != 0xBF  ||
+                       state == 2 && 0x0F != (0x0F & c) ))
+                     nonchar = PR_FALSE;
+
+                  if ( !UTF8traits::isInSeq(c) || overlong && c <= olupper || // isInSeq presumably tests for a continuation byte -- confirm in UTF8traits.
+                       surrogate && slower <= c || nonchar && !state ) // |nonchar| surviving to the last byte means a full non-character.
+                    return PR_FALSE; // Not UTF8 string
+                  overlong = surrogate = PR_FALSE; // both bounds apply to the first continuation byte only.
+                }
+            }
+        }
+    return !state; // state != 0 at the end indicates an invalid UTF-8 seq. 
+  }

  /**
   * A character sink for in-place case conversion.