Add StringBeginsWith, StringEndsWith, and UTF8ToNewUnicode. Move UTF8 handling utilities to separate file. b=131293 r=jst sr=alecf,jag a=asa b=131293

2003-05-21 22:20:27 +00:00 · 2003-05-21 22:20:27 +00:00 · 77bf6f4e26
--- a/string/obsolete/nsString2.cpp
+++ b/string/obsolete/nsString2.cpp
@ -44,6 +44,7 @@
 #include "nsString.h"
 #include "nsReadableUtils.h"
 #include "nsDebug.h"
+#include "nsUTF8Utils.h"

 #ifndef nsCharTraits_h___
 #include "nsCharTraits.h"
@ -1348,61 +1349,6 @@ NS_ConvertASCIItoUCS2::NS_ConvertASCIItoUCS2( const nsACString& aCString )
      }
  }

-class CalculateUTF8Length
-  {
-    public:
-      typedef nsACString::char_type value_type;
-
-    CalculateUTF8Length() : mLength(0), mErrorEncountered(PR_FALSE) { }
-
-    size_t Length() const { return mLength; }
-
-    PRUint32 write( const value_type* start, PRUint32 N )
-      {
-          // ignore any further requests
-        if ( mErrorEncountered )
-            return N;
-
-        // algorithm assumes utf8 units won't
-        // be spread across fragments
-        const value_type* p = start;
-        const value_type* end = start + N;
-        for ( ; p < end /* && *p */; ++mLength )
-          {
-            if ( UTF8traits::isASCII(*p) )
-                p += 1;
-            else if ( UTF8traits::is2byte(*p) )
-                p += 2;
-            else if ( UTF8traits::is3byte(*p) )
-                p += 3;
-            else if ( UTF8traits::is4byte(*p) ) {
-                p += 4;
-                ++mLength;
-            }
-            else if ( UTF8traits::is5byte(*p) )
-                p += 5;
-            else if ( UTF8traits::is6byte(*p) )
-                p += 6;
-            else
-              {
-                break;
-              }
-          }
-        if ( p != end )
-          {
-            NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
-            mErrorEncountered = PR_TRUE;
-            mLength = 0;
-            return N;
-          }
-        return p - start;
-      }
-
-    private:
-      size_t mLength;
-      PRBool mErrorEncountered;
-  };
-
 void
 NS_ConvertUTF8toUCS2::Init( const nsACString& aCString )
 {
--- a/string/obsolete/nsString2.h
+++ b/string/obsolete/nsString2.h
@ -76,18 +76,6 @@

 #include "nsStr.h"

-class UTF8traits
-  {
-    public:
-      static PRBool isASCII(char c) { return (c & 0x80) == 0x00; }
-      static PRBool isInSeq(char c) { return (c & 0xC0) == 0x80; }
-      static PRBool is2byte(char c) { return (c & 0xE0) == 0xC0; }
-      static PRBool is3byte(char c) { return (c & 0xF0) == 0xE0; }
-      static PRBool is4byte(char c) { return (c & 0xF8) == 0xF0; }
-      static PRBool is5byte(char c) { return (c & 0xFC) == 0xF8; }
-      static PRBool is6byte(char c) { return (c & 0xFE) == 0xFC; }
-  };
-
 #ifdef STANDALONE_MI_STRING_TESTS
  class nsAFlatString { public: virtual ~nsAString() { } };
 #endif
@ -566,140 +554,4 @@ class NS_COM NS_ConvertUTF8toUCS2
      NS_ConvertUTF8toUCS2( PRUnichar );
  };

-#define PLANE1_BASE           0x00010000  
-#define UCS2_REPLACEMENT_CHAR 0xfffd     
-
-class ConvertUTF8toUCS2
-  {
-    public:
-      typedef nsACString::char_type value_type;
-      typedef nsAString::char_type  buffer_type;
-
-    ConvertUTF8toUCS2( buffer_type* aBuffer )
-        : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(PR_FALSE) {}
-
-    size_t Length() const { return mBuffer - mStart; }
-
-    PRUint32 write( const value_type* start, PRUint32 N )
-      {
-        if ( mErrorEncountered )
-          return N;
-
-        // algorithm assumes utf8 units won't
-        // be spread across fragments
-        const value_type* p = start;
-        const value_type* end = start + N;
-        for ( ; p != end /* && *p */; )
-          {
-            char c = *p++;
-
-            if ( UTF8traits::isASCII(c) )
-              {
-                *mBuffer++ = buffer_type(c);
-                continue;
-              }
-
-            PRUint32 ucs4;
-            PRUint32 minUcs4;
-            PRInt32 state = 0;
-
-            if ( UTF8traits::is2byte(c) )
-              {
-                ucs4 = (PRUint32(c) << 6) & 0x000007C0L;
-                state = 1;
-                minUcs4 = 0x00000080;
-              }
-            else if ( UTF8traits::is3byte(c) )
-              {
-                ucs4 = (PRUint32(c) << 12) & 0x0000F000L;
-                state = 2;
-                minUcs4 = 0x00000800;
-              }
-            else if ( UTF8traits::is4byte(c) )
-              {
-                ucs4 = (PRUint32(c) << 18) & 0x001F0000L;
-                state = 3;
-                minUcs4 = 0x00010000;
-              }
-            else if ( UTF8traits::is5byte(c) )
-              {
-                ucs4 = (PRUint32(c) << 24) & 0x03000000L;
-                state = 4;
-                minUcs4 = 0x00200000;
-              }
-            else if ( UTF8traits::is6byte(c) )
-              {
-                ucs4 = (PRUint32(c) << 30) & 0x40000000L;
-                state = 5;
-                minUcs4 = 0x04000000;
-              }
-            else
-              {
-                NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
-                mErrorEncountered = PR_TRUE;
-                return N;
-              }
-
-            while ( state-- )
-              {
-                c = *p++;
-
-                if ( UTF8traits::isInSeq(c) )
-                  {
-                    PRInt32 shift = state * 6;
-                    ucs4 |= (PRUint32(c) & 0x3F) << shift;
-                  }
-                else
-                  {
-                    NS_ERROR("not a UTF8 string");
-                    mErrorEncountered = PR_TRUE;
-                    return N;
-                  }
-              }
-
-            if ( ucs4 < minUcs4 )
-              {
-                // Overlong sequence
-                *mBuffer++ = UCS2_REPLACEMENT_CHAR;
-              }
-            else if ( ucs4 <= 0xD7FF )
-              {
-                *mBuffer++ = ucs4;
-              }
-            else if ( /* ucs4 >= 0xD800 && */ ucs4 <= 0xDFFF )
-              {
-                // Surrogates
-                *mBuffer++ = UCS2_REPLACEMENT_CHAR;
-              }
-            else if ( ucs4 == 0xFFFE || ucs4 == 0xFFFF )
-              {
-                // Prohibited characters
-                *mBuffer++ = UCS2_REPLACEMENT_CHAR;
-              }
-            else if ( ucs4 >= PLANE1_BASE )
-              {
-                if ( ucs4 >= 0x00110000 )
-                  *mBuffer++ = UCS2_REPLACEMENT_CHAR;
-                else {
-                  // surrogate, see unicode specification 3.7 for following math.
-                  ucs4 -= PLANE1_BASE;
-                  *mBuffer++ = (PRUnichar)(ucs4 >> 10) | 0xd800u;
-                  *mBuffer++ = (PRUnichar)(ucs4 & 0x3ff) | 0xdc00u;
-                }
-              }
-            else
-              {
-                if ( ucs4 != 0xFEFF ) // ignore BOM
-                    *mBuffer++ = ucs4;
-              }
-          }
-        return p - start;
-      }
-
-    private:
-      buffer_type* mStart;
-      buffer_type* mBuffer;
-      PRBool mErrorEncountered;
-  };
-
 #endif /* !defined(nsString2_h__) */
--- a/string/public/Makefile.in
+++ b/string/public/Makefile.in
@ -59,6 +59,7 @@ EXPORTS		=				\
 		nsSharableString.h		\
 		nsSharedBufferList.h		\
 		nsSlidingString.h		\
+		nsUTF8Utils.h			\
 		nsXPIDLString.h			\
 		$(NULL)			

--- a/string/public/nsReadableUtils.h
+++ b/string/public/nsReadableUtils.h
@ -107,11 +107,28 @@ NS_COM PRUnichar* ToNewUnicode( const nsAString& aSource );
   * This conversion is not well defined; but it reproduces legacy string behavior.
   * The new buffer is zero-terminated, but that may not help you if |aSource| contains embedded nulls.
   *
-   * @param aSource an 8-bit wide string
+   * @param aSource an 8-bit wide string (a C-string, NOT UTF-8)
   * @return a new |PRUnichar| buffer you must free with |nsMemory::Free|.
   */
 NS_COM PRUnichar* ToNewUnicode( const nsACString& aSource );

+  /**
+   * Returns a new |PRUnichar| buffer containing a zero-terminated copy
+   * of |aSource|, converted from UTF-8 to UTF-16.
+   *
+   * Allocates and returns a new |PRUnichar| buffer which you must free
+   * with |nsMemory::Free|.  Performs an encoding conversion by decoding
+   * the UTF-8 sequences in |aSource| into 16-bit code units (characters
+   * outside the BMP become surrogate pairs) while copying |aSource| to
+   * your new buffer.  |aSource| is expected to be valid UTF-8.  The new
+   * buffer is zero-terminated, but that may not help you if |aSource|
+   * contains embedded nulls.
+   *
+   * @param aSource an 8-bit wide string, UTF-8 encoded
+   * @return a new |PRUnichar| buffer you must free with |nsMemory::Free|.
+   */
+NS_COM PRUnichar* UTF8ToNewUnicode( const nsACString& aSource );
+
  /**
   * Copies |aLength| 16-bit characters from the start of |aSource| to the
   * |PRUnichar| buffer |aDest|.
@ -281,6 +298,15 @@ NS_COM PRUint32 CountCharInReadable( const nsAString& aStr,
 NS_COM PRUint32 CountCharInReadable( const nsACString& aStr,
                                     char aChar );

+  /**
+   * Returns |PR_TRUE| if |aSource| starts with |aSubstring|, and
+   * |PR_FALSE| otherwise (including when |aSubstring| is longer than
+   * |aSource|).
+   */
+NS_COM PRBool StringBeginsWith( const nsAString& aSource,
+                                const nsAString& aSubstring);
+NS_COM PRBool StringBeginsWith( const nsACString& aSource,
+                                const nsACString& aSubstring);
+  /**
+   * Returns |PR_TRUE| if |aSource| ends with |aSubstring|, and
+   * |PR_FALSE| otherwise (including when |aSubstring| is longer than
+   * |aSource|).
+   */
+NS_COM PRBool StringEndsWith( const nsAString& aSource,
+                              const nsAString& aSubstring);
+NS_COM PRBool StringEndsWith( const nsACString& aSource,
+                              const nsACString& aSubstring);
+
 NS_COM PRUint32 HashString( const nsAString& aStr );
 NS_COM PRUint32 HashString( const nsACString& aStr );

--- a/string/public/nsUTF8Utils.h
+++ b/string/public/nsUTF8Utils.h
@ -0,0 +1,245 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* ***** BEGIN LICENSE BLOCK *****
+ * Version: NPL 1.1/GPL 2.0/LGPL 2.1
+ *
+ * The contents of this file are subject to the Netscape Public License
+ * Version 1.1 (the "License"); you may not use this file except in
+ * compliance with the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/NPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * The Original Code is mozilla.org code.
+ *
+ * The Initial Developer of the Original Code is 
+ * Netscape Communications Corporation.
+ * Portions created by the Initial Developer are Copyright (C) 2001
+ * the Initial Developer. All Rights Reserved.
+ *
+ * Contributor(s):
+ *   Peter Annema <jaggernaut@netscape.com> (original author)
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * either the GNU General Public License Version 2 or later (the "GPL"), or 
+ * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ * in which case the provisions of the GPL or the LGPL are applicable instead
+ * of those above. If you wish to allow use of your version of this file only
+ * under the terms of either the GPL or the LGPL, and not to allow others to
+ * use your version of this file under the terms of the NPL, indicate your
+ * decision by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL or the LGPL. If you do not delete
+ * the provisions above, a recipient may use your version of this file under
+ * the terms of any one of the NPL, the GPL or the LGPL.
+ *
+ * ***** END LICENSE BLOCK ***** */
+
+#ifndef nsUTF8Utils_h_
+#define nsUTF8Utils_h_
+
+/**
+ * Bit-pattern tests that classify a single byte of a UTF-8 stream by
+ * looking at its high-order bits.
+ */
+class UTF8traits
+  {
+    public:
+      // 0xxxxxxx
+      static PRBool isASCII(char c) { return hasPrefix(c, 0x80, 0x00); }
+      // 10xxxxxx
+      static PRBool isInSeq(char c) { return hasPrefix(c, 0xC0, 0x80); }
+      // 110xxxxx
+      static PRBool is2byte(char c) { return hasPrefix(c, 0xE0, 0xC0); }
+      // 1110xxxx
+      static PRBool is3byte(char c) { return hasPrefix(c, 0xF0, 0xE0); }
+      // 11110xxx
+      static PRBool is4byte(char c) { return hasPrefix(c, 0xF8, 0xF0); }
+      // 111110xx
+      static PRBool is5byte(char c) { return hasPrefix(c, 0xFC, 0xF8); }
+      // 1111110x
+      static PRBool is6byte(char c) { return hasPrefix(c, 0xFE, 0xFC); }
+
+    private:
+      // true when the bits of |c| selected by |mask| are exactly |bits|
+      static PRBool hasPrefix(char c, int mask, int bits)
+        {
+          return (c & mask) == bits;
+        }
+  };
+
+#define PLANE1_BASE           0x00010000  
+#define UCS2_REPLACEMENT_CHAR 0xfffd     
+
+/**
+ * A character sink that decodes UTF-8 input and writes the result as
+ * 16-bit code units into a caller-supplied buffer.
+ *
+ * The output buffer is never bounds-checked, so the caller must size
+ * it before converting.  Each well-formed sequence produces one unit,
+ * or a surrogate pair for code points at or above |PLANE1_BASE|.
+ * Overlong sequences, encoded surrogates, U+FFFE/U+FFFF and values at
+ * or above 0x110000 are replaced by |UCS2_REPLACEMENT_CHAR|; U+FEFF is
+ * dropped.  Once malformed input is seen, that call and every later
+ * |write| call report their input as consumed without producing any
+ * further output.
+ */
+class ConvertUTF8toUCS2
+  {
+    public:
+      typedef nsACString::char_type value_type;
+      typedef nsAString::char_type  buffer_type;
+
+    // |aBuffer| is where the converted units are stored.
+    ConvertUTF8toUCS2( buffer_type* aBuffer )
+        : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(PR_FALSE) {}
+
+    // Number of 16-bit units written to the buffer so far.
+    size_t Length() const { return mBuffer - mStart; }
+
+    /**
+     * Decodes the |N| bytes beginning at |start| and appends the
+     * result to the output buffer.
+     *
+     * @return the number of input bytes consumed; |N| if malformed
+     *         input was encountered now or on an earlier call.
+     */
+    PRUint32 write( const value_type* start, PRUint32 N )
+      {
+        if ( mErrorEncountered )
+          return N;
+
+        // algorithm assumes utf8 units won't
+        // be spread across fragments
+        const value_type* p = start;
+        const value_type* end = start + N;
+        for ( ; p != end /* && *p */; )
+          {
+            char c = *p++;
+
+            if ( UTF8traits::isASCII(c) )
+              {
+                *mBuffer++ = buffer_type(c);
+                continue;
+              }
+
+            PRUint32 ucs4;      // code point being assembled
+            PRUint32 minUcs4;   // smallest value that needs this length
+            PRInt32 state = 0;  // continuation bytes still expected
+
+            if ( UTF8traits::is2byte(c) )
+              {
+                ucs4 = (PRUint32(c) << 6) & 0x000007C0L;
+                state = 1;
+                minUcs4 = 0x00000080;
+              }
+            else if ( UTF8traits::is3byte(c) )
+              {
+                ucs4 = (PRUint32(c) << 12) & 0x0000F000L;
+                state = 2;
+                minUcs4 = 0x00000800;
+              }
+            else if ( UTF8traits::is4byte(c) )
+              {
+                ucs4 = (PRUint32(c) << 18) & 0x001F0000L;
+                state = 3;
+                minUcs4 = 0x00010000;
+              }
+            else if ( UTF8traits::is5byte(c) )
+              {
+                ucs4 = (PRUint32(c) << 24) & 0x03000000L;
+                state = 4;
+                minUcs4 = 0x00200000;
+              }
+            else if ( UTF8traits::is6byte(c) )
+              {
+                ucs4 = (PRUint32(c) << 30) & 0x40000000L;
+                state = 5;
+                minUcs4 = 0x04000000;
+              }
+            else
+              {
+                NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
+                mErrorEncountered = PR_TRUE;
+                return N;
+              }
+
+            while ( state-- )
+              {
+                // A sequence cut short by the end of the fragment is
+                // malformed; never read (or advance) past |end|.
+                if ( p == end )
+                  {
+                    NS_ERROR("not a UTF8 string");
+                    mErrorEncountered = PR_TRUE;
+                    return N;
+                  }
+
+                c = *p++;
+
+                if ( UTF8traits::isInSeq(c) )
+                  {
+                    // each continuation byte contributes six payload bits
+                    PRInt32 shift = state * 6;
+                    ucs4 |= (PRUint32(c) & 0x3F) << shift;
+                  }
+                else
+                  {
+                    NS_ERROR("not a UTF8 string");
+                    mErrorEncountered = PR_TRUE;
+                    return N;
+                  }
+              }
+
+            if ( ucs4 < minUcs4 )
+              {
+                // Overlong sequence
+                *mBuffer++ = UCS2_REPLACEMENT_CHAR;
+              }
+            else if ( ucs4 <= 0xD7FF )
+              {
+                *mBuffer++ = ucs4;
+              }
+            else if ( /* ucs4 >= 0xD800 && */ ucs4 <= 0xDFFF )
+              {
+                // Surrogates
+                *mBuffer++ = UCS2_REPLACEMENT_CHAR;
+              }
+            else if ( ucs4 == 0xFFFE || ucs4 == 0xFFFF )
+              {
+                // Prohibited characters
+                *mBuffer++ = UCS2_REPLACEMENT_CHAR;
+              }
+            else if ( ucs4 >= PLANE1_BASE )
+              {
+                if ( ucs4 >= 0x00110000 )
+                  *mBuffer++ = UCS2_REPLACEMENT_CHAR;
+                else {
+                  // surrogate, see unicode specification 3.7 for following math.
+                  ucs4 -= PLANE1_BASE;
+                  *mBuffer++ = (PRUnichar)(ucs4 >> 10) | 0xd800u;
+                  *mBuffer++ = (PRUnichar)(ucs4 & 0x3ff) | 0xdc00u;
+                }
+              }
+            else
+              {
+                if ( ucs4 != 0xFEFF ) // ignore BOM
+                    *mBuffer++ = ucs4;
+              }
+          }
+        return p - start;
+      }
+
+    private:
+      buffer_type* mStart;            // first unit of the output buffer
+      buffer_type* mBuffer;           // next unit to be written
+      PRBool mErrorEncountered;       // set once malformed input is seen
+  };
+
+/**
+ * A character sink that counts how many 16-bit units are needed to
+ * hold the UTF-8 input passed to |write|.
+ *
+ * Only lead bytes are inspected.  A four-byte sequence counts as two
+ * units; every other sequence counts as one.  If a byte that is not a
+ * valid lead byte is found, or a sequence runs past the end of a
+ * fragment, the count is reset to 0 and all further input is ignored.
+ */
+class CalculateUTF8Length
+  {
+    public:
+      typedef nsACString::char_type value_type;
+
+    CalculateUTF8Length() : mLength(0), mErrorEncountered(PR_FALSE) { }
+
+    // Units counted so far; 0 after an error.
+    size_t Length() const { return mLength; }
+
+    /**
+     * Accounts for the |N| bytes beginning at |start|.
+     *
+     * @return the number of bytes consumed; |N| once an error has
+     *         been encountered.
+     */
+    PRUint32 write( const value_type* start, PRUint32 N )
+      {
+        // after a failure, later fragments are swallowed unexamined
+        if ( mErrorEncountered )
+            return N;
+
+        // a multi-byte sequence is assumed never to straddle
+        // two fragments
+        const value_type* const limit = start + N;
+        const value_type* cursor = start;
+        while ( cursor < limit )
+          {
+            const value_type lead = *cursor;
+            PRUint32 seqBytes;
+            size_t units = 1;
+
+            if ( UTF8traits::isASCII(lead) )
+                seqBytes = 1;
+            else if ( UTF8traits::is2byte(lead) )
+                seqBytes = 2;
+            else if ( UTF8traits::is3byte(lead) )
+                seqBytes = 3;
+            else if ( UTF8traits::is4byte(lead) )
+              {
+                seqBytes = 4;
+                units = 2;  // reserve room for two units
+              }
+            else if ( UTF8traits::is5byte(lead) )
+                seqBytes = 5;
+            else if ( UTF8traits::is6byte(lead) )
+                seqBytes = 6;
+            else
+                break;      // not a lead byte; reported below
+
+            cursor += seqBytes;
+            mLength += units;
+          }
+
+        if ( cursor == limit )
+            return cursor - start;
+
+        // either stopped on a bad lead byte or overshot the fragment
+        NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
+        mErrorEncountered = PR_TRUE;
+        mLength = 0;
+        return N;
+      }
+
+    private:
+      size_t mLength;
+      PRBool mErrorEncountered;
+  };
+
+#endif /* !defined(nsUTF8Utils_h_) */
--- a/string/src/nsReadableUtils.cpp
+++ b/string/src/nsReadableUtils.cpp
@ -25,6 +25,7 @@
 #include "nsMemory.h"
 #include "nsString.h"
 #include "nsCRT.h"
+#include "nsUTF8Utils.h"

 #ifndef nsStringTraits_h___
 #include "nsStringTraits.h"
@ -208,6 +209,8 @@ NS_COM
 char*
 ToNewUTF8String( const nsAString& aSource )
  {
+    // XXX The conversion code in NS_ConvertUCS2toUTF8 needs to be
+    // refactored so that we can use it here without a double-copy.
    NS_ConvertUCS2toUTF8 temp(aSource);

    char* result;
@ -268,6 +271,26 @@ ToNewUnicode( const nsACString& aSource )
    return result;
  }

+/**
+ * Converts the UTF-8 encoded |aSource| into a newly allocated,
+ * zero-terminated |PRUnichar| buffer.
+ *
+ * The source is scanned twice: once with |CalculateUTF8Length| to size
+ * the buffer, then with |ConvertUTF8toUCS2| to fill it.
+ *
+ * @param aSource an 8-bit wide string, UTF-8 encoded
+ * @return a buffer obtained from |nsMemory::Alloc|, or a null pointer
+ *         if the allocation failed.  If |aSource| is empty or was
+ *         rejected by |CalculateUTF8Length|, the buffer holds only the
+ *         terminator.
+ */
+NS_COM
+PRUnichar*
+UTF8ToNewUnicode( const nsACString& aSource )
+  {
+    nsACString::const_iterator start, end;
+    CalculateUTF8Length calculator;
+    copy_string(aSource.BeginReading(start), aSource.EndReading(end),
+                calculator);
+
+    PRUnichar *result = NS_STATIC_CAST(PRUnichar*,
+        nsMemory::Alloc(sizeof(PRUnichar) * (calculator.Length() + 1)));
+    if ( !result )
+      return result;
+
+    // |calculator| reports a length of 0 both for an empty source and
+    // after it hit malformed input.  In the latter case the buffer only
+    // has room for the terminator, so the converter must not run.
+    if ( calculator.Length() == 0 )
+      {
+        result[0] = PRUnichar(0);
+        return result;
+      }
+
+    ConvertUTF8toUCS2 converter(result);
+    copy_string(aSource.BeginReading(start), aSource.EndReading(end),
+                converter);
+    NS_ASSERTION(calculator.Length() == converter.Length(), "length mismatch");
+
+    // The converter does not write a terminator, and it may have
+    // produced fewer units than were reserved (e.g. a skipped BOM),
+    // so terminate at the position it actually reached.
+    result[converter.Length()] = PRUnichar(0);
+
+    return result;
+  }
+
 NS_COM
 PRUnichar*
 CopyUnicodeTo( const nsAString& aSource, PRUint32 aSrcOffset, PRUnichar* aDest, PRUint32 aLength )
@ -1080,6 +1103,48 @@ CountCharInReadable( const nsACString& aStr,
  return count;
 }

+/**
+ * Tests whether the 16-bit string |aSource| starts with |aSubstring|.
+ *
+ * @return PR_FALSE if |aSubstring| is longer than |aSource|; otherwise
+ *         the result of comparing the leading |aSubstring.Length()|
+ *         characters of |aSource| with |aSubstring|.
+ */
+NS_COM PRBool
+StringBeginsWith( const nsAString& aSource, const nsAString& aSubstring)
+  {
+    const nsAString::size_type haystackLen = aSource.Length();
+    const nsAString::size_type needleLen = aSubstring.Length();
+
+    // only compare when the candidate prefix fits inside the source
+    if (needleLen <= haystackLen)
+      return Substring(aSource, 0, needleLen) == aSubstring;
+
+    return PR_FALSE;
+  }
+
+/**
+ * Tests whether the 8-bit string |aSource| starts with |aSubstring|.
+ *
+ * @return PR_FALSE if |aSubstring| is longer than |aSource|; otherwise
+ *         the result of comparing the leading |aSubstring.Length()|
+ *         characters of |aSource| with |aSubstring|.
+ */
+NS_COM PRBool
+StringBeginsWith( const nsACString& aSource, const nsACString& aSubstring)
+  {
+    const nsACString::size_type haystackLen = aSource.Length();
+    const nsACString::size_type needleLen = aSubstring.Length();
+
+    // only compare when the candidate prefix fits inside the source
+    if (needleLen <= haystackLen)
+      return Substring(aSource, 0, needleLen) == aSubstring;
+
+    return PR_FALSE;
+  }
+
+/**
+ * Tests whether the 16-bit string |aSource| ends with |aSubstring|.
+ *
+ * @return PR_FALSE if |aSubstring| is longer than |aSource|; otherwise
+ *         the result of comparing the trailing |aSubstring.Length()|
+ *         characters of |aSource| with |aSubstring|.
+ */
+NS_COM PRBool
+StringEndsWith( const nsAString& aSource, const nsAString& aSubstring)
+  {
+    const nsAString::size_type haystackLen = aSource.Length();
+    const nsAString::size_type needleLen = aSubstring.Length();
+
+    // only compare when the candidate suffix fits inside the source;
+    // this also keeps the start offset below from underflowing
+    if (needleLen <= haystackLen)
+      return Substring(aSource, haystackLen - needleLen, needleLen) == aSubstring;
+
+    return PR_FALSE;
+  }
+
+/**
+ * Tests whether the 8-bit string |aSource| ends with |aSubstring|.
+ *
+ * @return PR_FALSE if |aSubstring| is longer than |aSource|; otherwise
+ *         the result of comparing the trailing |aSubstring.Length()|
+ *         characters of |aSource| with |aSubstring|.
+ */
+NS_COM PRBool
+StringEndsWith( const nsACString& aSource, const nsACString& aSubstring)
+  {
+    const nsACString::size_type haystackLen = aSource.Length();
+    const nsACString::size_type needleLen = aSubstring.Length();
+
+    // only compare when the candidate suffix fits inside the source;
+    // this also keeps the start offset below from underflowing
+    if (needleLen <= haystackLen)
+      return Substring(aSource, haystackLen - needleLen, needleLen) == aSubstring;
+
+    return PR_FALSE;
+  }
+
+
+
 template <class CharT>
 class CalculateHashCode
  {
--- a/xpcom/io/nsUnicharInputStream.cpp
+++ b/xpcom/io/nsUnicharInputStream.cpp
@ -42,6 +42,7 @@
 #include "nsIServiceManager.h"
 #include "nsString.h"
 #include "nsCRT.h"
+#include "nsUTF8Utils.h"
 #include <fcntl.h>
 #if defined(NS_WIN32) || defined(XP_OS2_VACPP)
 #include <io.h>
--- a/xpcom/string/obsolete/nsString2.cpp
+++ b/xpcom/string/obsolete/nsString2.cpp
@ -44,6 +44,7 @@
 #include "nsString.h"
 #include "nsReadableUtils.h"
 #include "nsDebug.h"
+#include "nsUTF8Utils.h"

 #ifndef nsCharTraits_h___
 #include "nsCharTraits.h"
@ -1348,61 +1349,6 @@ NS_ConvertASCIItoUCS2::NS_ConvertASCIItoUCS2( const nsACString& aCString )
      }
  }

-class CalculateUTF8Length
-  {
-    public:
-      typedef nsACString::char_type value_type;
-
-    CalculateUTF8Length() : mLength(0), mErrorEncountered(PR_FALSE) { }
-
-    size_t Length() const { return mLength; }
-
-    PRUint32 write( const value_type* start, PRUint32 N )
-      {
-          // ignore any further requests
-        if ( mErrorEncountered )
-            return N;
-
-        // algorithm assumes utf8 units won't
-        // be spread across fragments
-        const value_type* p = start;
-        const value_type* end = start + N;
-        for ( ; p < end /* && *p */; ++mLength )
-          {
-            if ( UTF8traits::isASCII(*p) )
-                p += 1;
-            else if ( UTF8traits::is2byte(*p) )
-                p += 2;
-            else if ( UTF8traits::is3byte(*p) )
-                p += 3;
-            else if ( UTF8traits::is4byte(*p) ) {
-                p += 4;
-                ++mLength;
-            }
-            else if ( UTF8traits::is5byte(*p) )
-                p += 5;
-            else if ( UTF8traits::is6byte(*p) )
-                p += 6;
-            else
-              {
-                break;
-              }
-          }
-        if ( p != end )
-          {
-            NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
-            mErrorEncountered = PR_TRUE;
-            mLength = 0;
-            return N;
-          }
-        return p - start;
-      }
-
-    private:
-      size_t mLength;
-      PRBool mErrorEncountered;
-  };
-
 void
 NS_ConvertUTF8toUCS2::Init( const nsACString& aCString )
 {
--- a/xpcom/string/obsolete/nsString2.h
+++ b/xpcom/string/obsolete/nsString2.h
@ -76,18 +76,6 @@

 #include "nsStr.h"

-class UTF8traits
-  {
-    public:
-      static PRBool isASCII(char c) { return (c & 0x80) == 0x00; }
-      static PRBool isInSeq(char c) { return (c & 0xC0) == 0x80; }
-      static PRBool is2byte(char c) { return (c & 0xE0) == 0xC0; }
-      static PRBool is3byte(char c) { return (c & 0xF0) == 0xE0; }
-      static PRBool is4byte(char c) { return (c & 0xF8) == 0xF0; }
-      static PRBool is5byte(char c) { return (c & 0xFC) == 0xF8; }
-      static PRBool is6byte(char c) { return (c & 0xFE) == 0xFC; }
-  };
-
 #ifdef STANDALONE_MI_STRING_TESTS
  class nsAFlatString { public: virtual ~nsAString() { } };
 #endif
@ -566,140 +554,4 @@ class NS_COM NS_ConvertUTF8toUCS2
      NS_ConvertUTF8toUCS2( PRUnichar );
  };

-#define PLANE1_BASE           0x00010000  
-#define UCS2_REPLACEMENT_CHAR 0xfffd     
-
-class ConvertUTF8toUCS2
-  {
-    public:
-      typedef nsACString::char_type value_type;
-      typedef nsAString::char_type  buffer_type;
-
-    ConvertUTF8toUCS2( buffer_type* aBuffer )
-        : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(PR_FALSE) {}
-
-    size_t Length() const { return mBuffer - mStart; }
-
-    PRUint32 write( const value_type* start, PRUint32 N )
-      {
-        if ( mErrorEncountered )
-          return N;
-
-        // algorithm assumes utf8 units won't
-        // be spread across fragments
-        const value_type* p = start;
-        const value_type* end = start + N;
-        for ( ; p != end /* && *p */; )
-          {
-            char c = *p++;
-
-            if ( UTF8traits::isASCII(c) )
-              {
-                *mBuffer++ = buffer_type(c);
-                continue;
-              }
-
-            PRUint32 ucs4;
-            PRUint32 minUcs4;
-            PRInt32 state = 0;
-
-            if ( UTF8traits::is2byte(c) )
-              {
-                ucs4 = (PRUint32(c) << 6) & 0x000007C0L;
-                state = 1;
-                minUcs4 = 0x00000080;
-              }
-            else if ( UTF8traits::is3byte(c) )
-              {
-                ucs4 = (PRUint32(c) << 12) & 0x0000F000L;
-                state = 2;
-                minUcs4 = 0x00000800;
-              }
-            else if ( UTF8traits::is4byte(c) )
-              {
-                ucs4 = (PRUint32(c) << 18) & 0x001F0000L;
-                state = 3;
-                minUcs4 = 0x00010000;
-              }
-            else if ( UTF8traits::is5byte(c) )
-              {
-                ucs4 = (PRUint32(c) << 24) & 0x03000000L;
-                state = 4;
-                minUcs4 = 0x00200000;
-              }
-            else if ( UTF8traits::is6byte(c) )
-              {
-                ucs4 = (PRUint32(c) << 30) & 0x40000000L;
-                state = 5;
-                minUcs4 = 0x04000000;
-              }
-            else
-              {
-                NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
-                mErrorEncountered = PR_TRUE;
-                return N;
-              }
-
-            while ( state-- )
-              {
-                c = *p++;
-
-                if ( UTF8traits::isInSeq(c) )
-                  {
-                    PRInt32 shift = state * 6;
-                    ucs4 |= (PRUint32(c) & 0x3F) << shift;
-                  }
-                else
-                  {
-                    NS_ERROR("not a UTF8 string");
-                    mErrorEncountered = PR_TRUE;
-                    return N;
-                  }
-              }
-
-            if ( ucs4 < minUcs4 )
-              {
-                // Overlong sequence
-                *mBuffer++ = UCS2_REPLACEMENT_CHAR;
-              }
-            else if ( ucs4 <= 0xD7FF )
-              {
-                *mBuffer++ = ucs4;
-              }
-            else if ( /* ucs4 >= 0xD800 && */ ucs4 <= 0xDFFF )
-              {
-                // Surrogates
-                *mBuffer++ = UCS2_REPLACEMENT_CHAR;
-              }
-            else if ( ucs4 == 0xFFFE || ucs4 == 0xFFFF )
-              {
-                // Prohibited characters
-                *mBuffer++ = UCS2_REPLACEMENT_CHAR;
-              }
-            else if ( ucs4 >= PLANE1_BASE )
-              {
-                if ( ucs4 >= 0x00110000 )
-                  *mBuffer++ = UCS2_REPLACEMENT_CHAR;
-                else {
-                  // surrogate, see unicode specification 3.7 for following math.
-                  ucs4 -= PLANE1_BASE;
-                  *mBuffer++ = (PRUnichar)(ucs4 >> 10) | 0xd800u;
-                  *mBuffer++ = (PRUnichar)(ucs4 & 0x3ff) | 0xdc00u;
-                }
-              }
-            else
-              {
-                if ( ucs4 != 0xFEFF ) // ignore BOM
-                    *mBuffer++ = ucs4;
-              }
-          }
-        return p - start;
-      }
-
-    private:
-      buffer_type* mStart;
-      buffer_type* mBuffer;
-      PRBool mErrorEncountered;
-  };
-
 #endif /* !defined(nsString2_h__) */
--- a/xpcom/string/public/Makefile.in
+++ b/xpcom/string/public/Makefile.in
@ -59,6 +59,7 @@ EXPORTS		=				\
 		nsSharableString.h		\
 		nsSharedBufferList.h		\
 		nsSlidingString.h		\
+		nsUTF8Utils.h			\
 		nsXPIDLString.h			\
 		$(NULL)			

--- a/xpcom/string/public/nsReadableUtils.h
+++ b/xpcom/string/public/nsReadableUtils.h
@ -107,11 +107,28 @@ NS_COM PRUnichar* ToNewUnicode( const nsAString& aSource );
   * This conversion is not well defined; but it reproduces legacy string behavior.
   * The new buffer is zero-terminated, but that may not help you if |aSource| contains embedded nulls.
   *
-   * @param aSource an 8-bit wide string
+   * @param aSource an 8-bit wide string (a C-string, NOT UTF-8)
   * @return a new |PRUnichar| buffer you must free with |nsMemory::Free|.
   */
 NS_COM PRUnichar* ToNewUnicode( const nsACString& aSource );

+  /**
+   * Returns a new |PRUnichar| buffer containing a zero-terminated copy
+   * of |aSource|, converted from UTF-8 to UTF-16.
+   *
+   * Allocates and returns a new |PRUnichar| buffer which you must free
+   * with |nsMemory::Free|.  Performs an encoding conversion by decoding
+   * the UTF-8 sequences in |aSource| into 16-bit code units (characters
+   * outside the BMP become surrogate pairs) while copying |aSource| to
+   * your new buffer.  |aSource| is expected to be valid UTF-8.  The new
+   * buffer is zero-terminated, but that may not help you if |aSource|
+   * contains embedded nulls.
+   *
+   * @param aSource an 8-bit wide string, UTF-8 encoded
+   * @return a new |PRUnichar| buffer you must free with |nsMemory::Free|.
+   */
+NS_COM PRUnichar* UTF8ToNewUnicode( const nsACString& aSource );
+
  /**
   * Copies |aLength| 16-bit characters from the start of |aSource| to the
   * |PRUnichar| buffer |aDest|.
@ -281,6 +298,15 @@ NS_COM PRUint32 CountCharInReadable( const nsAString& aStr,
 NS_COM PRUint32 CountCharInReadable( const nsACString& aStr,
                                     char aChar );

+  /**
+   * Returns |PR_TRUE| if |aSource| starts with |aSubstring|, and
+   * |PR_FALSE| otherwise (including when |aSubstring| is longer than
+   * |aSource|).
+   */
+NS_COM PRBool StringBeginsWith( const nsAString& aSource,
+                                const nsAString& aSubstring);
+NS_COM PRBool StringBeginsWith( const nsACString& aSource,
+                                const nsACString& aSubstring);
+  /**
+   * Returns |PR_TRUE| if |aSource| ends with |aSubstring|, and
+   * |PR_FALSE| otherwise (including when |aSubstring| is longer than
+   * |aSource|).
+   */
+NS_COM PRBool StringEndsWith( const nsAString& aSource,
+                              const nsAString& aSubstring);
+NS_COM PRBool StringEndsWith( const nsACString& aSource,
+                              const nsACString& aSubstring);
+
 NS_COM PRUint32 HashString( const nsAString& aStr );
 NS_COM PRUint32 HashString( const nsACString& aStr );

--- a/xpcom/string/public/nsUTF8Utils.h
+++ b/xpcom/string/public/nsUTF8Utils.h
@ -0,0 +1,245 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* ***** BEGIN LICENSE BLOCK *****
+ * Version: NPL 1.1/GPL 2.0/LGPL 2.1
+ *
+ * The contents of this file are subject to the Netscape Public License
+ * Version 1.1 (the "License"); you may not use this file except in
+ * compliance with the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/NPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * The Original Code is mozilla.org code.
+ *
+ * The Initial Developer of the Original Code is 
+ * Netscape Communications Corporation.
+ * Portions created by the Initial Developer are Copyright (C) 2001
+ * the Initial Developer. All Rights Reserved.
+ *
+ * Contributor(s):
+ *   Peter Annema <jaggernaut@netscape.com> (original author)
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * either the GNU General Public License Version 2 or later (the "GPL"), or 
+ * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ * in which case the provisions of the GPL or the LGPL are applicable instead
+ * of those above. If you wish to allow use of your version of this file only
+ * under the terms of either the GPL or the LGPL, and not to allow others to
+ * use your version of this file under the terms of the NPL, indicate your
+ * decision by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL or the LGPL. If you do not delete
+ * the provisions above, a recipient may use your version of this file under
+ * the terms of any one of the NPL, the GPL or the LGPL.
+ *
+ * ***** END LICENSE BLOCK ***** */
+
+#ifndef nsUTF8Utils_h_
+#define nsUTF8Utils_h_
+
+/**
+ * Classifies a single byte of a UTF-8 stream by the bit pattern of its
+ * high-order bits.  Each predicate masks off the bits that identify the
+ * byte's role and compares them with the expected pattern.
+ */
+class UTF8traits
+  {
+    public:
+        // 0xxxxxxx: a complete one-byte character
+      static PRBool isASCII(char aByte) { return hasBits(aByte, 0x80, 0x00); }
+        // 10xxxxxx: a continuation byte inside a multi-byte sequence
+      static PRBool isInSeq(char aByte) { return hasBits(aByte, 0xC0, 0x80); }
+        // 110xxxxx: lead byte of a two-byte sequence
+      static PRBool is2byte(char aByte) { return hasBits(aByte, 0xE0, 0xC0); }
+        // 1110xxxx: lead byte of a three-byte sequence
+      static PRBool is3byte(char aByte) { return hasBits(aByte, 0xF0, 0xE0); }
+        // 11110xxx: lead byte of a four-byte sequence
+      static PRBool is4byte(char aByte) { return hasBits(aByte, 0xF8, 0xF0); }
+        // 111110xx: lead byte of a five-byte sequence
+      static PRBool is5byte(char aByte) { return hasBits(aByte, 0xFC, 0xF8); }
+        // 1111110x: lead byte of a six-byte sequence
+      static PRBool is6byte(char aByte) { return hasBits(aByte, 0xFE, 0xFC); }
+
+    private:
+        // true when the bits of |aByte| selected by |aMask| equal |aPattern|
+      static PRBool hasBits(char aByte, int aMask, int aPattern)
+        {
+          return (aByte & aMask) == aPattern;
+        }
+  };
+
+#define PLANE1_BASE           0x00010000  
+#define UCS2_REPLACEMENT_CHAR 0xfffd     
+
+/**
+ * A character sink (usable with |copy_string|) that decodes UTF-8 input
+ * and appends the resulting 16-bit code units to a caller-supplied
+ * buffer.
+ *
+ * No bounds checking is performed on the destination buffer; the caller
+ * must make it large enough for everything that will be written.
+ *
+ * Overlong sequences, encoded surrogates, U+FFFE/U+FFFF and values of
+ * 0x110000 and above are written as |UCS2_REPLACEMENT_CHAR|.  U+FEFF
+ * (BOM) is dropped.  Values from |PLANE1_BASE| up to 0x10FFFF are
+ * written as a surrogate pair.
+ *
+ * Once a malformed sequence has been seen, the sink stops writing and
+ * ignores all further input.
+ */
+class ConvertUTF8toUCS2
+  {
+    public:
+      typedef nsACString::char_type value_type;
+      typedef nsAString::char_type  buffer_type;
+
+    ConvertUTF8toUCS2( buffer_type* aBuffer )
+        : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(PR_FALSE) {}
+
+      // number of 16-bit code units written so far
+    size_t Length() const { return mBuffer - mStart; }
+
+      /**
+       * Decodes the |N| bytes starting at |start|.
+       *
+       * @return the number of input bytes consumed; |N| if an error was
+       *         encountered now or on an earlier call.
+       */
+    PRUint32 write( const value_type* start, PRUint32 N )
+      {
+          // ignore any further requests
+        if ( mErrorEncountered )
+          return N;
+
+        // algorithm assumes utf8 units won't
+        // be spread across fragments
+        const value_type* p = start;
+        const value_type* end = start + N;
+        for ( ; p != end /* && *p */; )
+          {
+            char c = *p++;
+
+            if ( UTF8traits::isASCII(c) )
+              {
+                *mBuffer++ = buffer_type(c);
+                continue;
+              }
+
+            PRUint32 ucs4;     // value accumulated from the sequence
+            PRUint32 minUcs4;  // smallest value that needs this length
+            PRInt32 state = 0; // continuation bytes still expected
+
+            if ( UTF8traits::is2byte(c) )
+              {
+                ucs4 = (PRUint32(c) << 6) & 0x000007C0L;
+                state = 1;
+                minUcs4 = 0x00000080;
+              }
+            else if ( UTF8traits::is3byte(c) )
+              {
+                ucs4 = (PRUint32(c) << 12) & 0x0000F000L;
+                state = 2;
+                minUcs4 = 0x00000800;
+              }
+            else if ( UTF8traits::is4byte(c) )
+              {
+                ucs4 = (PRUint32(c) << 18) & 0x001F0000L;
+                state = 3;
+                minUcs4 = 0x00010000;
+              }
+            else if ( UTF8traits::is5byte(c) )
+              {
+                ucs4 = (PRUint32(c) << 24) & 0x03000000L;
+                state = 4;
+                minUcs4 = 0x00200000;
+              }
+            else if ( UTF8traits::is6byte(c) )
+              {
+                ucs4 = (PRUint32(c) << 30) & 0x40000000L;
+                state = 5;
+                minUcs4 = 0x04000000;
+              }
+            else
+              {
+                NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
+                mErrorEncountered = PR_TRUE;
+                return N;
+              }
+
+            while ( state-- )
+              {
+                  // A sequence cut short by the end of the fragment must
+                  // not make us read past the input.
+                if ( p == end )
+                  {
+                    NS_ERROR("Buffer ended in the middle of a multibyte sequence");
+                    mErrorEncountered = PR_TRUE;
+                    return N;
+                  }
+
+                c = *p++;
+
+                if ( UTF8traits::isInSeq(c) )
+                  {
+                    PRInt32 shift = state * 6;
+                    ucs4 |= (PRUint32(c) & 0x3F) << shift;
+                  }
+                else
+                  {
+                    NS_ERROR("not a UTF8 string");
+                    mErrorEncountered = PR_TRUE;
+                    return N;
+                  }
+              }
+
+            if ( ucs4 < minUcs4 )
+              {
+                // Overlong sequence
+                *mBuffer++ = UCS2_REPLACEMENT_CHAR;
+              }
+            else if ( ucs4 <= 0xD7FF )
+              {
+                *mBuffer++ = ucs4;
+              }
+            else if ( /* ucs4 >= 0xD800 && */ ucs4 <= 0xDFFF )
+              {
+                // Surrogates
+                *mBuffer++ = UCS2_REPLACEMENT_CHAR;
+              }
+            else if ( ucs4 == 0xFFFE || ucs4 == 0xFFFF )
+              {
+                // Prohibited characters
+                *mBuffer++ = UCS2_REPLACEMENT_CHAR;
+              }
+            else if ( ucs4 >= PLANE1_BASE )
+              {
+                if ( ucs4 >= 0x00110000 )
+                  *mBuffer++ = UCS2_REPLACEMENT_CHAR;
+                else {
+                  // surrogate, see unicode specification 3.7 for following math.
+                  ucs4 -= PLANE1_BASE;
+                  *mBuffer++ = (PRUnichar)(ucs4 >> 10) | 0xd800u;
+                  *mBuffer++ = (PRUnichar)(ucs4 & 0x3ff) | 0xdc00u;
+                }
+              }
+            else
+              {
+                if ( ucs4 != 0xFEFF ) // ignore BOM
+                    *mBuffer++ = ucs4;
+              }
+          }
+        return p - start;
+      }
+
+    private:
+      buffer_type* mStart;         // first slot of the destination buffer
+      buffer_type* mBuffer;        // next slot to be written
+      PRBool mErrorEncountered;    // set once malformed input is seen
+  };
+
+/**
+ * A character sink (usable with |copy_string|) that counts how many
+ * 16-bit code units a UTF-8 input is expected to occupy once decoded.
+ *
+ * Every sequence counts as one unit, except four-byte sequences, which
+ * count as two.  If the input does not parse as a run of whole
+ * sequences, the count is reset to 0 and all further input is ignored.
+ */
+class CalculateUTF8Length
+  {
+    public:
+      typedef nsACString::char_type value_type;
+
+    CalculateUTF8Length() : mLength(0), mErrorEncountered(PR_FALSE) { }
+
+      // units counted so far; 0 after an error
+    size_t Length() const { return mLength; }
+
+      /**
+       * Scans the |N| bytes starting at |start|.
+       *
+       * @return the number of input bytes consumed; |N| if an error was
+       *         encountered now or on an earlier call.
+       */
+    PRUint32 write( const value_type* start, PRUint32 N )
+      {
+          // after a failure, swallow whatever else arrives
+        if ( mErrorEncountered )
+            return N;
+
+        // algorithm assumes utf8 units won't
+        // be spread across fragments
+        const value_type* const limit = start + N;
+        const value_type* cursor = start;
+        while ( cursor < limit )
+          {
+            const value_type lead = *cursor;
+            PRUint32 seqLen;    // bytes taken by this sequence
+            PRUint32 units = 1; // 16-bit units it contributes
+
+            if ( UTF8traits::isASCII(lead) )
+                seqLen = 1;
+            else if ( UTF8traits::is2byte(lead) )
+                seqLen = 2;
+            else if ( UTF8traits::is3byte(lead) )
+                seqLen = 3;
+            else if ( UTF8traits::is4byte(lead) )
+              {
+                seqLen = 4;
+                units = 2;
+              }
+            else if ( UTF8traits::is5byte(lead) )
+                seqLen = 5;
+            else if ( UTF8traits::is6byte(lead) )
+                seqLen = 6;
+            else
+                break;  // not a valid lead byte; reported below
+
+            cursor += seqLen;
+            mLength += units;
+          }
+
+          // Either an invalid lead byte stopped the scan early, or the
+          // last sequence claimed more bytes than the fragment holds.
+        if ( cursor != limit )
+          {
+            NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
+            mErrorEncountered = PR_TRUE;
+            mLength = 0;
+            return N;
+          }
+        return cursor - start;
+      }
+
+    private:
+      size_t mLength;
+      PRBool mErrorEncountered;
+  };
+
+#endif /* !defined(nsUTF8Utils_h_) */
--- a/xpcom/string/src/nsReadableUtils.cpp
+++ b/xpcom/string/src/nsReadableUtils.cpp
@ -25,6 +25,7 @@
 #include "nsMemory.h"
 #include "nsString.h"
 #include "nsCRT.h"
+#include "nsUTF8Utils.h"

 #ifndef nsStringTraits_h___
 #include "nsStringTraits.h"
@ -208,6 +209,8 @@ NS_COM
 char*
 ToNewUTF8String( const nsAString& aSource )
  {
+    // XXX The conversion code in NS_ConvertUCS2toUTF8 needs to be
+    // refactored so that we can use it here without a double-copy.
    NS_ConvertUCS2toUTF8 temp(aSource);

    char* result;
@ -268,6 +271,26 @@ ToNewUnicode( const nsACString& aSource )
    return result;
  }

+/**
+ * Decodes the UTF-8 string |aSource| into a freshly allocated,
+ * zero-terminated |PRUnichar| buffer.
+ *
+ * The input is scanned twice: once to size the buffer and once to fill
+ * it.  If the sizing pass rejects the input, an empty (but still
+ * zero-terminated) buffer is returned.
+ *
+ * @param aSource an 8-bit wide string, UTF-8 encoded
+ * @return the new buffer, to be released with |nsMemory::Free|, or a
+ *         null pointer if the allocation failed
+ */
+NS_COM
+PRUnichar*
+UTF8ToNewUnicode( const nsACString& aSource )
+  {
+    nsACString::const_iterator start, end;
+
+    // First pass: find out how many 16-bit units the result needs.
+    CalculateUTF8Length calculator;
+    copy_string(aSource.BeginReading(start), aSource.EndReading(end),
+                calculator);
+    const size_t expected = calculator.Length();
+
+    PRUnichar *result = NS_STATIC_CAST(PRUnichar*,
+        nsMemory::Alloc(sizeof(PRUnichar) * (expected + 1)));
+    if ( !result )
+      return result;
+
+    // Second pass: decode into the buffer.  A calculated length of 0 means
+    // the source is empty or was rejected as malformed; in the latter case
+    // the buffer only has room for the terminator, so the converter must
+    // not run at all.
+    size_t written = 0;
+    if ( expected != 0 )
+      {
+        ConvertUTF8toUCS2 converter(result);
+        copy_string(aSource.BeginReading(start), aSource.EndReading(end),
+                    converter);
+        written = converter.Length();
+
+        // The converter may legitimately produce fewer units than were
+        // counted (it drops a BOM and may replace a four-byte sequence
+        // with a single replacement character), but never more.
+        NS_ASSERTION(written <= expected, "length mismatch");
+      }
+
+    result[written] = PRUnichar(0);
+    return result;
+  }
+
 NS_COM
 PRUnichar*
 CopyUnicodeTo( const nsAString& aSource, PRUint32 aSrcOffset, PRUnichar* aDest, PRUint32 aLength )
@ -1080,6 +1103,48 @@ CountCharInReadable( const nsACString& aStr,
  return count;
 }

+/**
+ * Tests whether the 16-bit string |aSource| starts with |aSubstring|.
+ */
+NS_COM PRBool
+StringBeginsWith( const nsAString& aSource, const nsAString& aSubstring)
+  {
+    const nsAString::size_type prefixLen = aSubstring.Length();
+
+    // a prefix longer than the whole string can never match
+    if (aSource.Length() < prefixLen)
+      return PR_FALSE;
+
+    // compare just the leading |prefixLen| characters
+    return Substring(aSource, 0, prefixLen) == aSubstring;
+  }
+
+/**
+ * Tests whether the 8-bit string |aSource| starts with |aSubstring|.
+ */
+NS_COM PRBool
+StringBeginsWith( const nsACString& aSource, const nsACString& aSubstring)
+  {
+    const nsACString::size_type prefixLen = aSubstring.Length();
+
+    // a prefix longer than the whole string can never match
+    if (aSource.Length() < prefixLen)
+      return PR_FALSE;
+
+    // compare just the leading |prefixLen| characters
+    return Substring(aSource, 0, prefixLen) == aSubstring;
+  }
+
+/**
+ * Tests whether the 16-bit string |aSource| ends with |aSubstring|.
+ */
+NS_COM PRBool
+StringEndsWith( const nsAString& aSource, const nsAString& aSubstring)
+  {
+    const nsAString::size_type sourceLen = aSource.Length();
+    const nsAString::size_type suffixLen = aSubstring.Length();
+
+    // a suffix longer than the whole string can never match
+    if (sourceLen < suffixLen)
+      return PR_FALSE;
+
+    // compare just the trailing |suffixLen| characters
+    const nsAString::size_type tailStart = sourceLen - suffixLen;
+    return Substring(aSource, tailStart, suffixLen) == aSubstring;
+  }
+
+/**
+ * Tests whether the 8-bit string |aSource| ends with |aSubstring|.
+ */
+NS_COM PRBool
+StringEndsWith( const nsACString& aSource, const nsACString& aSubstring)
+  {
+    const nsACString::size_type sourceLen = aSource.Length();
+    const nsACString::size_type suffixLen = aSubstring.Length();
+
+    // a suffix longer than the whole string can never match
+    if (sourceLen < suffixLen)
+      return PR_FALSE;
+
+    // compare just the trailing |suffixLen| characters
+    const nsACString::size_type tailStart = sourceLen - suffixLen;
+    return Substring(aSource, tailStart, suffixLen) == aSubstring;
+  }
+
+
+
 template <class CharT>
 class CalculateHashCode
  {