Bug 570975 - Don't convert UTF-8 strings to UTF-16 in MatchAutoCompleteFunction r=sdwilsh, a2.0=blocking

--HG-- extra : rebase_source : 70dd986a655e1670fec86e21c8cea7a596e3b0ac
2010-08-31 18:03:40 -07:00 · 2010-08-31 18:03:40 -07:00 · 46c1433563
--- a/toolkit/components/places/src/SQLFunctions.cpp
+++ b/toolkit/components/places/src/SQLFunctions.cpp
@ -43,9 +43,161 @@
 #include "nsEscape.h"
 #include "mozIPlacesAutoComplete.h"
 #include "SQLFunctions.h"
 #include "nsUTF8Utils.h"
 using namespace mozilla::storage;
 ////////////////////////////////////////////////////////////////////////////////
 //// Anonymous Helpers
 namespace {
  typedef nsACString::const_char_iterator const_char_iterator;
  /**
   * Compute where the search should resume after the character at aStart.
   *
   * If the byte at aStart is an ASCII letter ([a-zA-Z]), the result is the
   * first following position that is not a lowercase ASCII letter (or aEnd,
   * whichever comes first).  For any other starting byte the result is simply
   * aNext, which the caller supplies as the start of the next character in
   * the UTF-8 sequence.
   *
   * A word boundary is anything that's not [a-z]; treating upper-case letters
   * as boundaries is what lets us match CamelCase words.
   *
   * @param aStart the beginning of the UTF-8 sequence
   * @param aNext the next character in the sequence
   * @param aEnd the first byte which is not part of the sequence
   *
   * @return a pointer to the next word boundary after aStart
   */
  static
  NS_ALWAYS_INLINE const_char_iterator
  nextWordBoundary(const_char_iterator const aStart,
                   const_char_iterator const aNext,
                   const_char_iterator const aEnd) {
    const bool startsOnAsciiLetter =
      ('a' <= *aStart && *aStart <= 'z') ||
      ('A' <= *aStart && *aStart <= 'Z');
    if (!startsOnAsciiLetter) {
      // Not inside an ASCII word: just step to the next character.
      return aNext;
    }
    // We stop at the first byte that is not a lowercase ASCII letter, so a
    // plain byte-by-byte scan is enough here and we avoid the overhead of a
    // UTF8CharEnumerator.
    const_char_iterator boundary = aStart;
    ++boundary;
    while (boundary < aEnd && 'a' <= *boundary && *boundary <= 'z') {
      ++boundary;
    }
    return boundary;
  }
  /**
   * Selects which starting positions findInString considers when looking for
   * a token in a source string.
   */
  enum FindInStringBehavior {
    // Only consider matches which occur on word boundaries.
    eFindOnBoundary,
    // Consider matches which appear anywhere in the source string.
    eFindAnywhere
  };
  /**
   * findAnywhere and findOnBoundary do almost the same thing, so it's natural
   * to implement them in terms of a single function.  They're both
   * performance-critical functions, however, and checking aBehavior makes them
   * a bit slower.  Our solution is to define findInString as NS_ALWAYS_INLINE
   * and rely on the compiler to optimize out the aBehavior check.
   *
   * @param aToken
   *        The token we're searching for.  Must not be empty.
   * @param aSourceString
   *        The string in which we're searching
   * @param aBehavior
   *        eFindOnBoundary if we should only consider matches which occur on
   *        word boundaries, or eFindAnywhere if we should consider matches
   *        which appear anywhere.
   *
   * @return true if aToken was found in aSourceString, false otherwise
   *         (including when aSourceString is empty or when the character
   *         comparison reports an error).
   */
  static
  NS_ALWAYS_INLINE bool
  findInString(const nsDependentCSubstring &aToken,
               const nsACString &aSourceString,
               FindInStringBehavior aBehavior)
  {
    // CaseInsensitiveUTF8CharsEqual assumes that there's at least one byte in
    // both strings, so don't pass an empty token here.
    NS_PRECONDITION(!aToken.IsEmpty(), "Don't search for an empty token!");
    // We cannot match anything if there is nothing to search.
    if (aSourceString.IsEmpty()) {
      return false;
    }
    const_char_iterator tokenStart(aToken.BeginReading()),
                        tokenEnd(aToken.EndReading()),
                        sourceStart(aSourceString.BeginReading()),
                        sourceEnd(aSourceString.EndReading());
    // Each iteration tries to match the whole token starting at sourceStart,
    // then moves sourceStart forward to the next candidate position.
    do {
      // We are on a word boundary (if aBehavior == eFindOnBoundary).  See if
      // aToken matches sourceStart.
      // Check whether the first character in the token matches the character
      // at sourceStart.  At the same time, get a pointer to the next character
      // in both the token and the source.
      const_char_iterator sourceNext, tokenCur;
      PRBool error;
      if (CaseInsensitiveUTF8CharsEqual(sourceStart, tokenStart,
                                        sourceEnd, tokenEnd,
                                        &sourceNext, &tokenCur, &error)) {
        // We don't need to check |error| here -- if
        // CaseInsensitiveUTF8CharsEqual encounters an error, it'll also
        // return false and we'll catch the error outside the if.
        // sourceNext is kept untouched so we can still advance by exactly one
        // character below if the rest of the token fails to match.
        const_char_iterator sourceCur = sourceNext;
        while (true) {
          if (tokenCur >= tokenEnd) {
            // We matched the whole token!
            return true;
          }
          if (sourceCur >= sourceEnd) {
            // We ran into the end of source while matching a token.  This
            // means we'll never find the token we're looking for.
            return false;
          }
          if (!CaseInsensitiveUTF8CharsEqual(sourceCur, tokenCur,
                                             sourceEnd, tokenEnd,
                                             &sourceCur, &tokenCur, &error)) {
            // sourceCur doesn't match tokenCur (or there's an error), so break
            // out of this loop.
            break;
          }
        }
      }
      // If something went wrong above, get out of here!
      if (NS_UNLIKELY(error)) {
        return false;
      }
      // We didn't match the token.  If we're searching for matches on word
      // boundaries, skip to the next word boundary.  Otherwise, advance
      // forward one character, using the sourceNext pointer we saved earlier.
      if (aBehavior == eFindOnBoundary) {
        sourceStart = nextWordBoundary(sourceStart, sourceNext, sourceEnd);
      }
      else {
        sourceStart = sourceNext;
      }
    } while (sourceStart < sourceEnd);
    // We ran off the end of the source without ever matching the full token.
    return false;
  }
 } // End anonymous namespace
 namespace mozilla {
 namespace places {
@ -73,117 +225,87 @@ namespace places {
  /* static */
  void
  MatchAutoCompleteFunction::fixupURISpec(const nsCString &aURISpec,
-                                          nsString &_fixedSpec)
+                                          nsCString &_fixedSpec)
  {
    nsCString unescapedSpec;
    (void)NS_UnescapeURL(aURISpec, esc_SkipControl | esc_AlwaysCopy,
                         unescapedSpec);
-    // If this unescaped string is valid UTF-8, we'll convert it.  Otherwise,
+    // If this unescaped string is valid UTF-8, we'll use it.  Otherwise,
-    // we will simply convert our original string.
+    // we will simply use our original string.
    NS_ASSERTION(_fixedSpec.IsEmpty(),
                 "Passing a non-empty string as an out parameter!");
    if (IsUTF8(unescapedSpec))
-      CopyUTF8toUTF16(unescapedSpec, _fixedSpec);
+      _fixedSpec.Assign(unescapedSpec);
    else
-      CopyUTF8toUTF16(aURISpec, _fixedSpec);
+      _fixedSpec.Assign(aURISpec);
-    if (StringBeginsWith(_fixedSpec, NS_LITERAL_STRING("http://")))
+    if (StringBeginsWith(_fixedSpec, NS_LITERAL_CSTRING("http://")))
      _fixedSpec.Cut(0, 7);
-    else if (StringBeginsWith(_fixedSpec, NS_LITERAL_STRING("https://")))
+    else if (StringBeginsWith(_fixedSpec, NS_LITERAL_CSTRING("https://")))
      _fixedSpec.Cut(0, 8);
-    else if (StringBeginsWith(_fixedSpec, NS_LITERAL_STRING("ftp://")))
+    else if (StringBeginsWith(_fixedSpec, NS_LITERAL_CSTRING("ftp://")))
      _fixedSpec.Cut(0, 6);
-    if (StringBeginsWith(_fixedSpec, NS_LITERAL_STRING("www.")))
+    if (StringBeginsWith(_fixedSpec, NS_LITERAL_CSTRING("www.")))
      _fixedSpec.Cut(0, 4);
  }
  /* static */
  bool
-  MatchAutoCompleteFunction::findAnywhere(const nsDependentSubstring &aToken,
+  MatchAutoCompleteFunction::findAnywhere(const nsDependentCSubstring &aToken,
-                                          const nsAString &aSourceString)
+                                          const nsACString &aSourceString)
  {
-    return !!CaseInsensitiveFindInReadable(aToken, aSourceString);
+    // We can't use FindInReadable here; it works only for ASCII.
    return findInString(aToken, aSourceString, eFindAnywhere);
  }
  /* static */
  bool
-  MatchAutoCompleteFunction::findBeginning(const nsDependentSubstring &aToken,
+  MatchAutoCompleteFunction::findOnBoundary(const nsDependentCSubstring &aToken,
-                                           const nsAString &aSourceString)
+                                            const nsACString &aSourceString)
  {
-    return !!StringBeginsWith(aSourceString, aToken,
+    return findInString(aToken, aSourceString, eFindOnBoundary);
                              nsCaseInsensitiveStringComparator());
  }
  /* static */
  bool
-  MatchAutoCompleteFunction::findOnBoundary(const nsDependentSubstring &aToken,
+  MatchAutoCompleteFunction::findBeginning(const nsDependentCSubstring &aToken,
-                                            const nsAString &aSourceString)
+                                           const nsACString &aSourceString)
  {
-    // We cannot match anything if there is nothing to search.
+    NS_PRECONDITION(!aToken.IsEmpty(), "Don't search for an empty token!");
    if (aSourceString.IsEmpty())
      return false;
-    // Define a const instance of this class so it is created once.
+    // We can't use StringBeginsWith here, unfortunately.  Although it will
-    const nsCaseInsensitiveStringComparator caseInsensitiveCompare;
+    // happily take a case-insensitive UTF8 comparator, it eventually calls
    // nsACString::Equals, which checks that the two strings contain the same
    // number of bytes before calling the comparator.  This is clearly not what
    // we want.
-    const_wchar_iterator tokenStart(aToken.BeginReading()),
+    const_char_iterator tokenStart(aToken.BeginReading()),
-                         tokenEnd(aToken.EndReading()),
+                        tokenEnd(aToken.EndReading()),
-                         sourceStart(aSourceString.BeginReading()),
+                        sourceStart(aSourceString.BeginReading()),
-                         sourceEnd(aSourceString.EndReading());
+                        sourceEnd(aSourceString.EndReading());
-    // The start of aSourceString is considered a word boundary, so start there.
+    PRBool dummy;
-    do {
+    while (sourceStart < sourceEnd &&
-      // We are on a word boundary, so start by copying the iterators.
+           CaseInsensitiveUTF8CharsEqual(sourceStart, tokenStart,
-      const_wchar_iterator testTokenItr(tokenStart),
+                                         sourceEnd, tokenEnd,
-                           testSourceItr(sourceStart);
+                                         &sourceStart, &tokenStart, &dummy)) {
-      // Keep trying to match the token one by one until it doesn't match.
+      // We found the token!
-      while (!caseInsensitiveCompare(testTokenItr, testSourceItr, 1, 1)) {
+      if (tokenStart >= tokenEnd) {
-        // We matched something, so move down one.
+        return true;
        testTokenItr++;
        testSourceItr++;
        // Matched the full token, so we are done!
        if (testTokenItr == tokenEnd)
          return true;
        // However, if we ran into the end of the source while matching the
        // token, we will not find it.
        if (testSourceItr == sourceEnd)
          return false;
      }
    }
-      // Always advance our starting iterator, and if we are not currently on a
+    // We don't need to check CaseInsensitiveUTF8CharsEqual's error condition
-      // word boundary, advance to the next word boundary.
+    // (stored in |dummy|), since the function will return false if it
-      if (!isWordBoundary(ToLowerCase(*sourceStart++)))
+    // encounters an error.
        sourceStart = nextWordBoundary(sourceStart, sourceEnd);
    } while (sourceStart != sourceEnd);
    return false;
  }
  /* static */
  MatchAutoCompleteFunction::const_wchar_iterator
  MatchAutoCompleteFunction::nextWordBoundary(const_wchar_iterator aStart,
                                              const_wchar_iterator aEnd)
  {
    // Step forward until we either exhaust the range or land on a character
    // that isWordBoundary accepts; that position is the result.
    for (; aStart != aEnd; aStart++) {
      if (isWordBoundary(*aStart)) {
        break;
      }
    }
    return aStart;
  }
  /* static */
  bool
  MatchAutoCompleteFunction::isWordBoundary(const PRUnichar &aChar)
  {
    // Lowercase ASCII letters are the only characters that are not
    // boundaries.  Upper-case letters therefore count as boundaries, which
    // lets us match CamelCase words: a match can begin right after an
    // upper-case character.
    const bool isLowercaseAsciiLetter =
      PRUnichar('a') <= aChar && aChar <= PRUnichar('z');
    return !isLowercaseAsciiLetter;
  }
  /* static */
  MatchAutoCompleteFunction::searchFunctionPtr
  MatchAutoCompleteFunction::getSearchFunction(PRInt32 aBehavior)
@ -217,15 +339,15 @@ namespace places {
    #define HAS_BEHAVIOR(aBitName) \
      (searchBehavior & mozIPlacesAutoComplete::BEHAVIOR_##aBitName)
-    nsAutoString searchString;
+    nsCAutoString searchString;
-    (void)aArguments->GetString(kArgSearchString, searchString);
+    (void)aArguments->GetUTF8String(kArgSearchString, searchString);
    nsCString url;
    (void)aArguments->GetUTF8String(kArgIndexURL, url);
    // We only want to filter javascript: URLs if we are not supposed to search
    // for them, and the search does not start with "javascript:".
    if (!HAS_BEHAVIOR(JAVASCRIPT) &&
-        !StringBeginsWith(searchString, NS_LITERAL_STRING("javascript:")) &&
+        !StringBeginsWith(searchString, NS_LITERAL_CSTRING("javascript:")) &&
        StringBeginsWith(url, NS_LITERAL_CSTRING("javascript:"))) {
      NS_IF_ADDREF(*_result = new IntegerVariant(0));
      NS_ENSURE_TRUE(*_result, NS_ERROR_OUT_OF_MEMORY);
@ -235,8 +357,8 @@ namespace places {
    PRInt32 visitCount = aArguments->AsInt32(kArgIndexVisitCount);
    bool typed = aArguments->AsInt32(kArgIndexTyped) ? true : false;
    bool bookmark = aArguments->AsInt32(kArgIndexBookmark) ? true : false;
-    nsAutoString tags;
+    nsCAutoString tags;
-    (void)aArguments->GetString(kArgIndexTags, tags);
+    (void)aArguments->GetUTF8String(kArgIndexTags, tags);
    PRInt32 openPageCount = aArguments->AsInt32(kArgIndexOpenPageCount);
    // Make sure we match all the filter requirements.  If a given restriction
@ -255,21 +377,21 @@ namespace places {
    }
    // Clean up our URI spec and prepare it for searching.
-    nsString fixedURI;
+    nsCString fixedURI;
    fixupURISpec(url, fixedURI);
    // Obtain our search function.
    PRInt32 matchBehavior = aArguments->AsInt32(kArgIndexMatchBehavior);
    searchFunctionPtr searchFunction = getSearchFunction(matchBehavior);
-    nsAutoString title;
+    nsCAutoString title;
-    (void)aArguments->GetString(kArgIndexTitle, title);
+    (void)aArguments->GetUTF8String(kArgIndexTitle, title);
    // Determine if every token matches either the bookmark title, tags, page
    // title, or page URL.
-    nsWhitespaceTokenizer tokenizer(searchString);
+    nsCWhitespaceTokenizer tokenizer(searchString);
    while (matches && tokenizer.hasMoreTokens()) {
-      const nsDependentSubstring &token = tokenizer.nextToken();
+      const nsDependentCSubstring &token = tokenizer.nextToken();
      bool matchTags = searchFunction(token, tags);
      bool matchTitle = searchFunction(token, title);
--- a/toolkit/components/places/src/SQLFunctions.h
+++ b/toolkit/components/places/src/SQLFunctions.h
@ -15,7 +15,8 @@
 * The Original Code is Places code.
 *
 * The Initial Developer of the Original Code is
- * Mozilla Corporation.
+ * the Mozilla Foundation.
 *
 * Portions created by the Initial Developer are Copyright (C) 2009
 * the Initial Developer. All Rights Reserved.
 *
@ -118,10 +119,10 @@ private:
  /**
   * Typedefs
   */
-  typedef bool (*searchFunctionPtr)(const nsDependentSubstring &aToken,
+  typedef bool (*searchFunctionPtr)(const nsDependentCSubstring &aToken,
-                                    const nsAString &aSourceString);
+                                    const nsACString &aSourceString);
-  typedef nsAString::const_char_iterator const_wchar_iterator;
+  typedef nsACString::const_char_iterator const_char_iterator;
  /**
   * Obtains the search function to match on.
@ -133,6 +134,18 @@ private:
   */
  static searchFunctionPtr getSearchFunction(PRInt32 aBehavior);
  /**
   * Tests if aSourceString starts with aToken.
   *
   * @param aToken
   *        The string to search for.
   * @param aSourceString
   *        The string to search.
   * @return true if found, false otherwise.
   */
  static bool findBeginning(const nsDependentCSubstring &aToken,
                            const nsACString &aSourceString);
  /**
   * Searches aSourceString for aToken anywhere in the string in a case-
   * insensitive way.
@ -143,20 +156,8 @@ private:
   *        The string to search.
   * @return true if found, false otherwise.
   */
-  static bool findAnywhere(const nsDependentSubstring &aToken,
+  static bool findAnywhere(const nsDependentCSubstring &aToken,
-                           const nsAString &aSourceString);
+                           const nsACString &aSourceString);
  /**
   * Tests if aSourceString starts with aToken.
   *
   * @param aToken
   *        The string to search for.
   * @param aSourceString
   *        The string to search.
   * @return true if found, false otherwise.
   */
  static bool findBeginning(const nsDependentSubstring &aToken,
                            const nsAString &aSourceString);
  /**
   * Tests if aToken is found on a word boundary in aSourceString.
@ -167,33 +168,9 @@ private:
   *        The string to search.
   * @return true if found, false otherwise.
   */
-  static bool findOnBoundary(const nsDependentSubstring &aToken,
+  static bool findOnBoundary(const nsDependentCSubstring &aToken,
-                             const nsAString &aSourceString);
+                             const nsACString &aSourceString);
  /**
   * Obtains an iterator to the next word boundary as defined by isWordBoundary.
   *
   * @param aStart
   *        An iterator pointing to the start of the string.
   * @param aEnd
   *        An iterator pointing to the end of the string.
   * @return an iterator pointing to the next word boundary.
   */
  static const_wchar_iterator nextWordBoundary(const_wchar_iterator aStart,
                                               const_wchar_iterator aEnd);
  /**
   * Determines if aChar is a word boundary.  A 'word boundary' is anything that
   * is not used to build up a word from a string of characters.  We are very
   * conservative here because anything that we do not list will be treated as a
   * word boundary.  This means that a search for such a character (one that is
   * not really a word boundary) can still match in the middle of a word.
   *
   * @param aChar
   *        The Unicode character to check against.
   * @return true if the character is considered a word boundary, false
   *          otherwise.
   */
  static inline bool isWordBoundary(const PRUnichar &aChar);
  /**
   * Fixes a URI's spec such that it is ready to be searched.  This includes
@ -205,7 +182,7 @@ private:
   * @param _fixedSpec
   *        An out parameter that is the fixed up string.
   */
-  static void fixupURISpec(const nsCString &aURISpec, nsString &_fixedSpec);
+  static void fixupURISpec(const nsCString &aURISpec, nsCString &_fixedSpec);
 };
 } // namespace places