From 46c1433563ef69c075094e9e0508446f3b95acbd Mon Sep 17 00:00:00 2001
From: Justin Lebar <justin.lebar@gmail.com>
Date: Tue, 31 Aug 2010 18:03:40 -0700
Subject: [PATCH] Bug 570975 - Don't convert UTF-8 strings to UTF-16 in
 MatchAutoCompleteFunction r=sdwilsh, a2.0=blocking

--HG--
extra : rebase_source : 70dd986a655e1670fec86e21c8cea7a596e3b0ac
---
 .../components/places/src/SQLFunctions.cpp    | 284 +++++++++++++-----
 toolkit/components/places/src/SQLFunctions.h  |  67 ++---
 2 files changed, 225 insertions(+), 126 deletions(-)

diff --git a/toolkit/components/places/src/SQLFunctions.cpp b/toolkit/components/places/src/SQLFunctions.cpp
index a7757c4657d4..79d7d472181a 100644
--- a/toolkit/components/places/src/SQLFunctions.cpp
+++ b/toolkit/components/places/src/SQLFunctions.cpp
@@ -43,9 +43,161 @@
 #include "nsEscape.h"
 #include "mozIPlacesAutoComplete.h"
 #include "SQLFunctions.h"
+#include "nsUTF8Utils.h"
 
 using namespace mozilla::storage;
 
+////////////////////////////////////////////////////////////////////////////////
+//// Anonymous Helpers
+
+namespace {
+
+  typedef nsACString::const_char_iterator const_char_iterator;
+
+  /**
+   * Get a pointer to the word boundary after aStart if aStart points to an
+   * ASCII letter (i.e. [a-zA-Z]).  Otherwise, return aNext, which we assume
+   * points to the next character in the UTF-8 sequence.
+   *
+   * We define a word boundary as anything that's not [a-z] -- this lets us
+   * match CamelCase words.
+   *
+   * @param aStart the beginning of the UTF-8 sequence
+   * @param aNext the next character in the sequence
+   * @param aEnd the first byte which is not part of the sequence
+   *
+   * @return a pointer to the next word boundary after aStart
+   */
+  static
+  NS_ALWAYS_INLINE const_char_iterator
+  nextWordBoundary(const_char_iterator const aStart,
+                   const_char_iterator const aNext,
+                   const_char_iterator const aEnd) {
+
+    const_char_iterator cur = aStart;
+    if (('a' <= *cur && *cur <= 'z') ||
+        ('A' <= *cur && *cur <= 'Z')) {
+
+      // Since we'll halt as soon as we see a non-ASCII letter, we can do a
+      // simple byte-by-byte comparison here and avoid the overhead of a
+      // UTF8CharEnumerator.
+      do {
+        cur++;
+      } while (cur < aEnd && 'a' <= *cur && *cur <= 'z');
+    }
+    else {
+      cur = aNext;
+    }
+
+    return cur;
+  }
+
+  enum FindInStringBehavior {
+    eFindOnBoundary,
+    eFindAnywhere
+  };
+
+  /**
+   * findAnywhere and findOnBoundary do almost the same thing, so it's natural
+   * to implement them in terms of a single function.  They're both
+   * performance-critical functions, however, and checking aBehavior makes them
+   * a bit slower.  Our solution is to define findInString as NS_ALWAYS_INLINE
+   * and rely on the compiler to optimize out the aBehavior check.
+   *
+   * @param aToken
+   *        The token we're searching for
+   * @param aSourceString
+   *        The string in which we're searching
+   * @param aBehavior
+   *        eFindOnBoundary if we should only consider matchines which occur on
+   *        word boundaries, or eFindAnywhere if we should consider matches
+   *        which appear anywhere.
+   *
+   * @return true if aToken was found in aSourceString, false otherwise.
+   */
+  static
+  NS_ALWAYS_INLINE bool
+  findInString(const nsDependentCSubstring &aToken,
+               const nsACString &aSourceString,
+               FindInStringBehavior aBehavior)
+  {
+    // CaseInsensitiveUTF8CharsEqual assumes that there's at least one byte in
+    // the both strings, so don't pass an empty token here.
+    NS_PRECONDITION(!aToken.IsEmpty(), "Don't search for an empty token!");
+
+    // We cannot match anything if there is nothing to search.
+    if (aSourceString.IsEmpty()) {
+      return false;
+    }
+
+    const_char_iterator tokenStart(aToken.BeginReading()),
+                        tokenEnd(aToken.EndReading()),
+                        sourceStart(aSourceString.BeginReading()),
+                        sourceEnd(aSourceString.EndReading());
+
+    do {
+      // We are on a word boundary (if aBehavior == eFindOnBoundary).  See if
+      // aToken matches sourceStart.
+
+      // Check whether the first character in the token matches the character
+      // at sourceStart.  At the same time, get a pointer to the next character
+      // in both the token and the source.
+      const_char_iterator sourceNext, tokenCur;
+      PRBool error;
+      if (CaseInsensitiveUTF8CharsEqual(sourceStart, tokenStart,
+                                        sourceEnd, tokenEnd,
+                                        &sourceNext, &tokenCur, &error)) {
+
+        // We don't need to check |error| here -- if
+        // CaseInsensitiveUTF8CharCompare encounters an error, it'll also
+        // return false and we'll catch the error outside the if.
+
+        const_char_iterator sourceCur = sourceNext;
+        while (true) {
+          if (tokenCur >= tokenEnd) {
+            // We matched the whole token!
+            return true;
+          }
+
+          if (sourceCur >= sourceEnd) {
+            // We ran into the end of source while matching a token.  This
+            // means we'll never find the token we're looking for.
+            return false;
+          }
+
+          if (!CaseInsensitiveUTF8CharsEqual(sourceCur, tokenCur,
+                                             sourceEnd, tokenEnd,
+                                             &sourceCur, &tokenCur, &error)) {
+            // sourceCur doesn't match tokenCur (or there's an error), so break
+            // out of this loop.
+            break;
+          }
+        }
+      }
+
+      // If something went wrong above, get out of here!
+      if (NS_UNLIKELY(error)) {
+        return false;
+      }
+
+      // We didn't match the token.  If we're searching for matches on word
+      // boundaries, skip to the next word boundary.  Otherwise, advance
+      // forward one character, using the sourceNext pointer we saved earlier.
+
+      if (aBehavior == eFindOnBoundary) {
+        sourceStart = nextWordBoundary(sourceStart, sourceNext, sourceEnd);
+      }
+      else {
+        sourceStart = sourceNext;
+      }
+
+    } while (sourceStart < sourceEnd);
+
+    return false;
+  }
+
+} // End anonymous namespace
+
 namespace mozilla {
 namespace places {
 
@@ -73,117 +225,87 @@ namespace places {
   /* static */
   void
   MatchAutoCompleteFunction::fixupURISpec(const nsCString &aURISpec,
-                                          nsString &_fixedSpec)
+                                          nsCString &_fixedSpec)
   {
     nsCString unescapedSpec;
     (void)NS_UnescapeURL(aURISpec, esc_SkipControl | esc_AlwaysCopy,
                          unescapedSpec);
 
-    // If this unescaped string is valid UTF-8, we'll convert it.  Otherwise,
-    // we will simply convert our original string.
+    // If this unescaped string is valid UTF-8, we'll use it.  Otherwise,
+    // we will simply use our original string.
     NS_ASSERTION(_fixedSpec.IsEmpty(),
                  "Passing a non-empty string as an out parameter!");
     if (IsUTF8(unescapedSpec))
-      CopyUTF8toUTF16(unescapedSpec, _fixedSpec);
+      _fixedSpec.Assign(unescapedSpec);
     else
-      CopyUTF8toUTF16(aURISpec, _fixedSpec);
+      _fixedSpec.Assign(aURISpec);
 
-    if (StringBeginsWith(_fixedSpec, NS_LITERAL_STRING("http://")))
+    if (StringBeginsWith(_fixedSpec, NS_LITERAL_CSTRING("http://")))
       _fixedSpec.Cut(0, 7);
-    else if (StringBeginsWith(_fixedSpec, NS_LITERAL_STRING("https://")))
+    else if (StringBeginsWith(_fixedSpec, NS_LITERAL_CSTRING("https://")))
       _fixedSpec.Cut(0, 8);
-    else if (StringBeginsWith(_fixedSpec, NS_LITERAL_STRING("ftp://")))
+    else if (StringBeginsWith(_fixedSpec, NS_LITERAL_CSTRING("ftp://")))
       _fixedSpec.Cut(0, 6);
 
-    if (StringBeginsWith(_fixedSpec, NS_LITERAL_STRING("www.")))
+    if (StringBeginsWith(_fixedSpec, NS_LITERAL_CSTRING("www.")))
       _fixedSpec.Cut(0, 4);
   }
 
   /* static */
   bool
-  MatchAutoCompleteFunction::findAnywhere(const nsDependentSubstring &aToken,
-                                          const nsAString &aSourceString)
+  MatchAutoCompleteFunction::findAnywhere(const nsDependentCSubstring &aToken,
+                                          const nsACString &aSourceString)
   {
-    return !!CaseInsensitiveFindInReadable(aToken, aSourceString);
+    // We can't use FindInReadable here; it works only for ASCII.
+
+    return findInString(aToken, aSourceString, eFindAnywhere);
   }
 
   /* static */
   bool
-  MatchAutoCompleteFunction::findBeginning(const nsDependentSubstring &aToken,
-                                           const nsAString &aSourceString)
+  MatchAutoCompleteFunction::findOnBoundary(const nsDependentCSubstring &aToken,
+                                            const nsACString &aSourceString)
   {
-    return !!StringBeginsWith(aSourceString, aToken,
-                              nsCaseInsensitiveStringComparator());
+    return findInString(aToken, aSourceString, eFindOnBoundary);
   }
 
   /* static */
   bool
-  MatchAutoCompleteFunction::findOnBoundary(const nsDependentSubstring &aToken,
-                                            const nsAString &aSourceString)
+  MatchAutoCompleteFunction::findBeginning(const nsDependentCSubstring &aToken,
+                                           const nsACString &aSourceString)
   {
-    // We cannot match anything if there is nothing to search.
-    if (aSourceString.IsEmpty())
-      return false;
+    NS_PRECONDITION(!aToken.IsEmpty(), "Don't search for an empty token!");
 
-    // Define a const instance of this class so it is created once.
-    const nsCaseInsensitiveStringComparator caseInsensitiveCompare;
+    // We can't use StringBeginsWith here, unfortunately.  Although it will
+    // happily take a case-insensitive UTF8 comparator, it eventually calls
+    // nsACString::Equals, which checks that the two strings contain the same
+    // number of bytes before calling the comparator.  This is clearly not what
+    // we want.
 
-    const_wchar_iterator tokenStart(aToken.BeginReading()),
-                         tokenEnd(aToken.EndReading()),
-                         sourceStart(aSourceString.BeginReading()),
-                         sourceEnd(aSourceString.EndReading());
+    const_char_iterator tokenStart(aToken.BeginReading()),
+                        tokenEnd(aToken.EndReading()),
+                        sourceStart(aSourceString.BeginReading()),
+                        sourceEnd(aSourceString.EndReading());
 
-    // The start of aSourceString is considered a word boundary, so start there.
-    do {
-      // We are on a word boundary, so start by copying the iterators.
-      const_wchar_iterator testTokenItr(tokenStart),
-                           testSourceItr(sourceStart);
+    PRBool dummy;
+    while (sourceStart < sourceEnd &&
+           CaseInsensitiveUTF8CharsEqual(sourceStart, tokenStart,
+                                         sourceEnd, tokenEnd,
+                                         &sourceStart, &tokenStart, &dummy)) {
 
-      // Keep trying to match the token one by one until it doesn't match.
-      while (!caseInsensitiveCompare(testTokenItr, testSourceItr, 1, 1)) {
-        // We matched something, so move down one.
-        testTokenItr++;
-        testSourceItr++;
-
-        // Matched the full token, so we are done!
-        if (testTokenItr == tokenEnd)
-          return true;
-
-        // However, if we ran into the end of the source while matching the
-        // token, we will not find it.
-        if (testSourceItr == sourceEnd)
-          return false;
+      // We found the token!
+      if (tokenStart >= tokenEnd) {
+        return true;
       }
+    }
 
-      // Always advance our starting iterator, and if we are not currently on a
-      // word boundary, advance to the next word boundary.
-      if (!isWordBoundary(ToLowerCase(*sourceStart++)))
-        sourceStart = nextWordBoundary(sourceStart, sourceEnd);
-    } while (sourceStart != sourceEnd);
+    // We don't need to check CaseInsensitiveUTF8CharsEqual's error condition
+    // (stored in |dummy|), since the function will return false if it
+    // encounters an error.
 
     return false;
   }
 
-  /* static */
-  MatchAutoCompleteFunction::const_wchar_iterator
-  MatchAutoCompleteFunction::nextWordBoundary(const_wchar_iterator aStart,
-                                              const_wchar_iterator aEnd)
-  {
-    while (aStart != aEnd && !isWordBoundary(*aStart))
-      aStart++;
-    return aStart;
-  }
-
-  /* static */
-  bool
-  MatchAutoCompleteFunction::isWordBoundary(const PRUnichar &aChar)
-  {
-    // Only check lowercase alphabetic characters so we can match CamelCase
-    // words.  This means that matches will happen after an upper-case
-    // character.
-    return !(PRUnichar('a') <= aChar && aChar <= PRUnichar('z'));
-  }
-
   /* static */
   MatchAutoCompleteFunction::searchFunctionPtr
   MatchAutoCompleteFunction::getSearchFunction(PRInt32 aBehavior)
@@ -217,15 +339,15 @@ namespace places {
     #define HAS_BEHAVIOR(aBitName) \
       (searchBehavior & mozIPlacesAutoComplete::BEHAVIOR_##aBitName)
 
-    nsAutoString searchString;
-    (void)aArguments->GetString(kArgSearchString, searchString);
+    nsCAutoString searchString;
+    (void)aArguments->GetUTF8String(kArgSearchString, searchString);
     nsCString url;
     (void)aArguments->GetUTF8String(kArgIndexURL, url);
 
     // We only want to filter javascript: URLs if we are not supposed to search
     // for them, and the search does not start with "javascript:".
     if (!HAS_BEHAVIOR(JAVASCRIPT) &&
-        !StringBeginsWith(searchString, NS_LITERAL_STRING("javascript:")) &&
+        !StringBeginsWith(searchString, NS_LITERAL_CSTRING("javascript:")) &&
         StringBeginsWith(url, NS_LITERAL_CSTRING("javascript:"))) {
       NS_IF_ADDREF(*_result = new IntegerVariant(0));
       NS_ENSURE_TRUE(*_result, NS_ERROR_OUT_OF_MEMORY);
@@ -235,8 +357,8 @@ namespace places {
     PRInt32 visitCount = aArguments->AsInt32(kArgIndexVisitCount);
     bool typed = aArguments->AsInt32(kArgIndexTyped) ? true : false;
     bool bookmark = aArguments->AsInt32(kArgIndexBookmark) ? true : false;
-    nsAutoString tags;
-    (void)aArguments->GetString(kArgIndexTags, tags);
+    nsCAutoString tags;
+    (void)aArguments->GetUTF8String(kArgIndexTags, tags);
     PRInt32 openPageCount = aArguments->AsInt32(kArgIndexOpenPageCount);
 
     // Make sure we match all the filter requirements.  If a given restriction
@@ -255,21 +377,21 @@ namespace places {
     }
 
     // Clean up our URI spec and prepare it for searching.
-    nsString fixedURI;
+    nsCString fixedURI;
     fixupURISpec(url, fixedURI);
 
     // Obtain our search function.
     PRInt32 matchBehavior = aArguments->AsInt32(kArgIndexMatchBehavior);
     searchFunctionPtr searchFunction = getSearchFunction(matchBehavior);
 
-    nsAutoString title;
-    (void)aArguments->GetString(kArgIndexTitle, title);
+    nsCAutoString title;
+    (void)aArguments->GetUTF8String(kArgIndexTitle, title);
 
     // Determine if every token matches either the bookmark title, tags, page
     // title, or page URL.
-    nsWhitespaceTokenizer tokenizer(searchString);
+    nsCWhitespaceTokenizer tokenizer(searchString);
     while (matches && tokenizer.hasMoreTokens()) {
-      const nsDependentSubstring &token = tokenizer.nextToken();
+      const nsDependentCSubstring &token = tokenizer.nextToken();
 
       bool matchTags = searchFunction(token, tags);
       bool matchTitle = searchFunction(token, title);
diff --git a/toolkit/components/places/src/SQLFunctions.h b/toolkit/components/places/src/SQLFunctions.h
index 85699612a74c..28d4f4ffc6e2 100644
--- a/toolkit/components/places/src/SQLFunctions.h
+++ b/toolkit/components/places/src/SQLFunctions.h
@@ -15,7 +15,8 @@
  * The Original Code is Places code.
  *
  * The Initial Developer of the Original Code is
- * Mozilla Corporation.
+ * the Mozilla Foundation.
+ *
  * Portions created by the Initial Developer are Copyright (C) 2009
  * the Initial Developer. All Rights Reserved.
  *
@@ -118,10 +119,10 @@ private:
   /**
    * Typedefs
    */
-  typedef bool (*searchFunctionPtr)(const nsDependentSubstring &aToken,
-                                    const nsAString &aSourceString);
+  typedef bool (*searchFunctionPtr)(const nsDependentCSubstring &aToken,
+                                    const nsACString &aSourceString);
 
-  typedef nsAString::const_char_iterator const_wchar_iterator;
+  typedef nsACString::const_char_iterator const_char_iterator;
 
   /**
    * Obtains the search function to match on.
@@ -133,6 +134,18 @@ private:
    */
   static searchFunctionPtr getSearchFunction(PRInt32 aBehavior);
 
+  /**
+   * Tests if aSourceString starts with aToken.
+   *
+   * @param aToken
+   *        The string to search for.
+   * @param aSourceString
+   *        The string to search.
+   * @return true if found, false otherwise.
+   */
+  static bool findBeginning(const nsDependentCSubstring &aToken,
+                            const nsACString &aSourceString);
+
   /**
    * Searches aSourceString for aToken anywhere in the string in a case-
    * insensitive way.
@@ -143,20 +156,8 @@ private:
    *        The string to search.
    * @return true if found, false otherwise.
    */
-  static bool findAnywhere(const nsDependentSubstring &aToken,
-                           const nsAString &aSourceString);
-
-  /**
-   * Tests if aSourceString starts with aToken.
-   *
-   * @param aToken
-   *        The string to search for.
-   * @param aSourceString
-   *        The string to search.
-   * @return true if found, false otherwise.
-   */
-  static bool findBeginning(const nsDependentSubstring &aToken,
-                            const nsAString &aSourceString);
+  static bool findAnywhere(const nsDependentCSubstring &aToken,
+                           const nsACString &aSourceString);
 
   /**
    * Tests if aToken is found on a word boundary in aSourceString.
@@ -167,33 +168,9 @@ private:
    *        The string to search.
    * @return true if found, false otherwise.
    */
-  static bool findOnBoundary(const nsDependentSubstring &aToken,
-                             const nsAString &aSourceString);
+  static bool findOnBoundary(const nsDependentCSubstring &aToken,
+                             const nsACString &aSourceString);
 
-  /**
-   * Obtains an iterator to the next word boundary as defined by isWordBoundary.
-   *
-   * @param aStart
-   *        An iterator pointing to the start of the string.
-   * @param aEnd
-   *        An iterator pointing to the end of the string.
-   * @return an iterator pointing to the next word boundary.
-   */
-  static const_wchar_iterator nextWordBoundary(const_wchar_iterator aStart,
-                                               const_wchar_iterator aEnd);
-  /**
-   * Determines if aChar is a word boundary.  A 'word boundary' is anything that
-   * is not used to build up a word from a string of characters.  We are very
-   * conservative here because anything that we do not list will be treated as a
-   * word boundary.  This means searching for that not-actually-a-word-boundary
-   * character can still be matched in the middle of a word.
-   *
-   * @param aChar
-   *        The Unicode character to check against.
-   * @return true if the character is considered a word boundary, false
-   *          otherwise.
-   */
-  static inline bool isWordBoundary(const PRUnichar &aChar);
 
   /**
    * Fixes a URI's spec such that it is ready to be searched.  This includes
@@ -205,7 +182,7 @@ private:
    * @param _fixedSpec
    *        An out parameter that is the fixed up string.
    */
-  static void fixupURISpec(const nsCString &aURISpec, nsString &_fixedSpec);
+  static void fixupURISpec(const nsCString &aURISpec, nsCString &_fixedSpec);
 };
 
 } // namespace places