Bug 570975 - Don't convert UTF-8 strings to UTF-16 in MatchAutoCompleteFunction r=sdwilsh, a2.0=blocking

--HG--
extra : rebase_source : 70dd986a655e1670fec86e21c8cea7a596e3b0ac
This commit is contained in:
Justin Lebar 2010-08-31 18:03:40 -07:00
Родитель a6e293112f
Коммит 46c1433563
2 изменённых файлов: 225 добавлений и 126 удалений

Просмотреть файл

@ -43,9 +43,161 @@
#include "nsEscape.h" #include "nsEscape.h"
#include "mozIPlacesAutoComplete.h" #include "mozIPlacesAutoComplete.h"
#include "SQLFunctions.h" #include "SQLFunctions.h"
#include "nsUTF8Utils.h"
using namespace mozilla::storage; using namespace mozilla::storage;
////////////////////////////////////////////////////////////////////////////////
//// Anonymous Helpers
namespace {
typedef nsACString::const_char_iterator const_char_iterator;
/**
* Get a pointer to the word boundary after aStart if aStart points to an
* ASCII letter (i.e. [a-zA-Z]). Otherwise, return aNext, which we assume
* points to the next character in the UTF-8 sequence.
*
* We define a word boundary as anything that's not [a-z] -- this lets us
* match CamelCase words.
*
* @param aStart the beginning of the UTF-8 sequence
* @param aNext the next character in the sequence
* @param aEnd the first byte which is not part of the sequence
*
* @return a pointer to the next word boundary after aStart
*/
static
NS_ALWAYS_INLINE const_char_iterator
nextWordBoundary(const_char_iterator const aStart,
const_char_iterator const aNext,
const_char_iterator const aEnd) {
const_char_iterator cur = aStart;
if (('a' <= *cur && *cur <= 'z') ||
('A' <= *cur && *cur <= 'Z')) {
// Since we'll halt as soon as we see a non-ASCII letter, we can do a
// simple byte-by-byte comparison here and avoid the overhead of a
// UTF8CharEnumerator.
do {
cur++;
} while (cur < aEnd && 'a' <= *cur && *cur <= 'z');
}
else {
cur = aNext;
}
return cur;
}
enum FindInStringBehavior {
eFindOnBoundary,
eFindAnywhere
};
/**
* findAnywhere and findOnBoundary do almost the same thing, so it's natural
* to implement them in terms of a single function. They're both
* performance-critical functions, however, and checking aBehavior makes them
* a bit slower. Our solution is to define findInString as NS_ALWAYS_INLINE
* and rely on the compiler to optimize out the aBehavior check.
*
* @param aToken
* The token we're searching for
* @param aSourceString
* The string in which we're searching
* @param aBehavior
* eFindOnBoundary if we should only consider matchines which occur on
* word boundaries, or eFindAnywhere if we should consider matches
* which appear anywhere.
*
* @return true if aToken was found in aSourceString, false otherwise.
*/
static
NS_ALWAYS_INLINE bool
findInString(const nsDependentCSubstring &aToken,
const nsACString &aSourceString,
FindInStringBehavior aBehavior)
{
// CaseInsensitiveUTF8CharsEqual assumes that there's at least one byte in
// the both strings, so don't pass an empty token here.
NS_PRECONDITION(!aToken.IsEmpty(), "Don't search for an empty token!");
// We cannot match anything if there is nothing to search.
if (aSourceString.IsEmpty()) {
return false;
}
const_char_iterator tokenStart(aToken.BeginReading()),
tokenEnd(aToken.EndReading()),
sourceStart(aSourceString.BeginReading()),
sourceEnd(aSourceString.EndReading());
do {
// We are on a word boundary (if aBehavior == eFindOnBoundary). See if
// aToken matches sourceStart.
// Check whether the first character in the token matches the character
// at sourceStart. At the same time, get a pointer to the next character
// in both the token and the source.
const_char_iterator sourceNext, tokenCur;
PRBool error;
if (CaseInsensitiveUTF8CharsEqual(sourceStart, tokenStart,
sourceEnd, tokenEnd,
&sourceNext, &tokenCur, &error)) {
// We don't need to check |error| here -- if
// CaseInsensitiveUTF8CharCompare encounters an error, it'll also
// return false and we'll catch the error outside the if.
const_char_iterator sourceCur = sourceNext;
while (true) {
if (tokenCur >= tokenEnd) {
// We matched the whole token!
return true;
}
if (sourceCur >= sourceEnd) {
// We ran into the end of source while matching a token. This
// means we'll never find the token we're looking for.
return false;
}
if (!CaseInsensitiveUTF8CharsEqual(sourceCur, tokenCur,
sourceEnd, tokenEnd,
&sourceCur, &tokenCur, &error)) {
// sourceCur doesn't match tokenCur (or there's an error), so break
// out of this loop.
break;
}
}
}
// If something went wrong above, get out of here!
if (NS_UNLIKELY(error)) {
return false;
}
// We didn't match the token. If we're searching for matches on word
// boundaries, skip to the next word boundary. Otherwise, advance
// forward one character, using the sourceNext pointer we saved earlier.
if (aBehavior == eFindOnBoundary) {
sourceStart = nextWordBoundary(sourceStart, sourceNext, sourceEnd);
}
else {
sourceStart = sourceNext;
}
} while (sourceStart < sourceEnd);
return false;
}
} // End anonymous namespace
namespace mozilla { namespace mozilla {
namespace places { namespace places {
@ -73,117 +225,87 @@ namespace places {
/* static */ /* static */
void void
MatchAutoCompleteFunction::fixupURISpec(const nsCString &aURISpec, MatchAutoCompleteFunction::fixupURISpec(const nsCString &aURISpec,
nsString &_fixedSpec) nsCString &_fixedSpec)
{ {
nsCString unescapedSpec; nsCString unescapedSpec;
(void)NS_UnescapeURL(aURISpec, esc_SkipControl | esc_AlwaysCopy, (void)NS_UnescapeURL(aURISpec, esc_SkipControl | esc_AlwaysCopy,
unescapedSpec); unescapedSpec);
// If this unescaped string is valid UTF-8, we'll convert it. Otherwise, // If this unescaped string is valid UTF-8, we'll use it. Otherwise,
// we will simply convert our original string. // we will simply use our original string.
NS_ASSERTION(_fixedSpec.IsEmpty(), NS_ASSERTION(_fixedSpec.IsEmpty(),
"Passing a non-empty string as an out parameter!"); "Passing a non-empty string as an out parameter!");
if (IsUTF8(unescapedSpec)) if (IsUTF8(unescapedSpec))
CopyUTF8toUTF16(unescapedSpec, _fixedSpec); _fixedSpec.Assign(unescapedSpec);
else else
CopyUTF8toUTF16(aURISpec, _fixedSpec); _fixedSpec.Assign(aURISpec);
if (StringBeginsWith(_fixedSpec, NS_LITERAL_STRING("http://"))) if (StringBeginsWith(_fixedSpec, NS_LITERAL_CSTRING("http://")))
_fixedSpec.Cut(0, 7); _fixedSpec.Cut(0, 7);
else if (StringBeginsWith(_fixedSpec, NS_LITERAL_STRING("https://"))) else if (StringBeginsWith(_fixedSpec, NS_LITERAL_CSTRING("https://")))
_fixedSpec.Cut(0, 8); _fixedSpec.Cut(0, 8);
else if (StringBeginsWith(_fixedSpec, NS_LITERAL_STRING("ftp://"))) else if (StringBeginsWith(_fixedSpec, NS_LITERAL_CSTRING("ftp://")))
_fixedSpec.Cut(0, 6); _fixedSpec.Cut(0, 6);
if (StringBeginsWith(_fixedSpec, NS_LITERAL_STRING("www."))) if (StringBeginsWith(_fixedSpec, NS_LITERAL_CSTRING("www.")))
_fixedSpec.Cut(0, 4); _fixedSpec.Cut(0, 4);
} }
/* static */ /* static */
bool bool
MatchAutoCompleteFunction::findAnywhere(const nsDependentSubstring &aToken, MatchAutoCompleteFunction::findAnywhere(const nsDependentCSubstring &aToken,
const nsAString &aSourceString) const nsACString &aSourceString)
{ {
return !!CaseInsensitiveFindInReadable(aToken, aSourceString); // We can't use FindInReadable here; it works only for ASCII.
return findInString(aToken, aSourceString, eFindAnywhere);
} }
/* static */ /* static */
bool bool
MatchAutoCompleteFunction::findBeginning(const nsDependentSubstring &aToken, MatchAutoCompleteFunction::findOnBoundary(const nsDependentCSubstring &aToken,
const nsAString &aSourceString) const nsACString &aSourceString)
{ {
return !!StringBeginsWith(aSourceString, aToken, return findInString(aToken, aSourceString, eFindOnBoundary);
nsCaseInsensitiveStringComparator());
} }
/* static */ /* static */
bool bool
MatchAutoCompleteFunction::findOnBoundary(const nsDependentSubstring &aToken, MatchAutoCompleteFunction::findBeginning(const nsDependentCSubstring &aToken,
const nsAString &aSourceString) const nsACString &aSourceString)
{ {
// We cannot match anything if there is nothing to search. NS_PRECONDITION(!aToken.IsEmpty(), "Don't search for an empty token!");
if (aSourceString.IsEmpty())
return false;
// Define a const instance of this class so it is created once. // We can't use StringBeginsWith here, unfortunately. Although it will
const nsCaseInsensitiveStringComparator caseInsensitiveCompare; // happily take a case-insensitive UTF8 comparator, it eventually calls
// nsACString::Equals, which checks that the two strings contain the same
// number of bytes before calling the comparator. This is clearly not what
// we want.
const_wchar_iterator tokenStart(aToken.BeginReading()), const_char_iterator tokenStart(aToken.BeginReading()),
tokenEnd(aToken.EndReading()), tokenEnd(aToken.EndReading()),
sourceStart(aSourceString.BeginReading()), sourceStart(aSourceString.BeginReading()),
sourceEnd(aSourceString.EndReading()); sourceEnd(aSourceString.EndReading());
// The start of aSourceString is considered a word boundary, so start there. PRBool dummy;
do { while (sourceStart < sourceEnd &&
// We are on a word boundary, so start by copying the iterators. CaseInsensitiveUTF8CharsEqual(sourceStart, tokenStart,
const_wchar_iterator testTokenItr(tokenStart), sourceEnd, tokenEnd,
testSourceItr(sourceStart); &sourceStart, &tokenStart, &dummy)) {
// Keep trying to match the token one by one until it doesn't match. // We found the token!
while (!caseInsensitiveCompare(testTokenItr, testSourceItr, 1, 1)) { if (tokenStart >= tokenEnd) {
// We matched something, so move down one. return true;
testTokenItr++;
testSourceItr++;
// Matched the full token, so we are done!
if (testTokenItr == tokenEnd)
return true;
// However, if we ran into the end of the source while matching the
// token, we will not find it.
if (testSourceItr == sourceEnd)
return false;
} }
}
// Always advance our starting iterator, and if we are not currently on a // We don't need to check CaseInsensitiveUTF8CharsEqual's error condition
// word boundary, advance to the next word boundary. // (stored in |dummy|), since the function will return false if it
if (!isWordBoundary(ToLowerCase(*sourceStart++))) // encounters an error.
sourceStart = nextWordBoundary(sourceStart, sourceEnd);
} while (sourceStart != sourceEnd);
return false; return false;
} }
/* static */
MatchAutoCompleteFunction::const_wchar_iterator
MatchAutoCompleteFunction::nextWordBoundary(const_wchar_iterator aStart,
const_wchar_iterator aEnd)
{
while (aStart != aEnd && !isWordBoundary(*aStart))
aStart++;
return aStart;
}
/* static */
bool
MatchAutoCompleteFunction::isWordBoundary(const PRUnichar &aChar)
{
// Only check lowercase alphabetic characters so we can match CamelCase
// words. This means that matches will happen after an upper-case
// character.
return !(PRUnichar('a') <= aChar && aChar <= PRUnichar('z'));
}
/* static */ /* static */
MatchAutoCompleteFunction::searchFunctionPtr MatchAutoCompleteFunction::searchFunctionPtr
MatchAutoCompleteFunction::getSearchFunction(PRInt32 aBehavior) MatchAutoCompleteFunction::getSearchFunction(PRInt32 aBehavior)
@ -217,15 +339,15 @@ namespace places {
#define HAS_BEHAVIOR(aBitName) \ #define HAS_BEHAVIOR(aBitName) \
(searchBehavior & mozIPlacesAutoComplete::BEHAVIOR_##aBitName) (searchBehavior & mozIPlacesAutoComplete::BEHAVIOR_##aBitName)
nsAutoString searchString; nsCAutoString searchString;
(void)aArguments->GetString(kArgSearchString, searchString); (void)aArguments->GetUTF8String(kArgSearchString, searchString);
nsCString url; nsCString url;
(void)aArguments->GetUTF8String(kArgIndexURL, url); (void)aArguments->GetUTF8String(kArgIndexURL, url);
// We only want to filter javascript: URLs if we are not supposed to search // We only want to filter javascript: URLs if we are not supposed to search
// for them, and the search does not start with "javascript:". // for them, and the search does not start with "javascript:".
if (!HAS_BEHAVIOR(JAVASCRIPT) && if (!HAS_BEHAVIOR(JAVASCRIPT) &&
!StringBeginsWith(searchString, NS_LITERAL_STRING("javascript:")) && !StringBeginsWith(searchString, NS_LITERAL_CSTRING("javascript:")) &&
StringBeginsWith(url, NS_LITERAL_CSTRING("javascript:"))) { StringBeginsWith(url, NS_LITERAL_CSTRING("javascript:"))) {
NS_IF_ADDREF(*_result = new IntegerVariant(0)); NS_IF_ADDREF(*_result = new IntegerVariant(0));
NS_ENSURE_TRUE(*_result, NS_ERROR_OUT_OF_MEMORY); NS_ENSURE_TRUE(*_result, NS_ERROR_OUT_OF_MEMORY);
@ -235,8 +357,8 @@ namespace places {
PRInt32 visitCount = aArguments->AsInt32(kArgIndexVisitCount); PRInt32 visitCount = aArguments->AsInt32(kArgIndexVisitCount);
bool typed = aArguments->AsInt32(kArgIndexTyped) ? true : false; bool typed = aArguments->AsInt32(kArgIndexTyped) ? true : false;
bool bookmark = aArguments->AsInt32(kArgIndexBookmark) ? true : false; bool bookmark = aArguments->AsInt32(kArgIndexBookmark) ? true : false;
nsAutoString tags; nsCAutoString tags;
(void)aArguments->GetString(kArgIndexTags, tags); (void)aArguments->GetUTF8String(kArgIndexTags, tags);
PRInt32 openPageCount = aArguments->AsInt32(kArgIndexOpenPageCount); PRInt32 openPageCount = aArguments->AsInt32(kArgIndexOpenPageCount);
// Make sure we match all the filter requirements. If a given restriction // Make sure we match all the filter requirements. If a given restriction
@ -255,21 +377,21 @@ namespace places {
} }
// Clean up our URI spec and prepare it for searching. // Clean up our URI spec and prepare it for searching.
nsString fixedURI; nsCString fixedURI;
fixupURISpec(url, fixedURI); fixupURISpec(url, fixedURI);
// Obtain our search function. // Obtain our search function.
PRInt32 matchBehavior = aArguments->AsInt32(kArgIndexMatchBehavior); PRInt32 matchBehavior = aArguments->AsInt32(kArgIndexMatchBehavior);
searchFunctionPtr searchFunction = getSearchFunction(matchBehavior); searchFunctionPtr searchFunction = getSearchFunction(matchBehavior);
nsAutoString title; nsCAutoString title;
(void)aArguments->GetString(kArgIndexTitle, title); (void)aArguments->GetUTF8String(kArgIndexTitle, title);
// Determine if every token matches either the bookmark title, tags, page // Determine if every token matches either the bookmark title, tags, page
// title, or page URL. // title, or page URL.
nsWhitespaceTokenizer tokenizer(searchString); nsCWhitespaceTokenizer tokenizer(searchString);
while (matches && tokenizer.hasMoreTokens()) { while (matches && tokenizer.hasMoreTokens()) {
const nsDependentSubstring &token = tokenizer.nextToken(); const nsDependentCSubstring &token = tokenizer.nextToken();
bool matchTags = searchFunction(token, tags); bool matchTags = searchFunction(token, tags);
bool matchTitle = searchFunction(token, title); bool matchTitle = searchFunction(token, title);

Просмотреть файл

@ -15,7 +15,8 @@
* The Original Code is Places code. * The Original Code is Places code.
* *
* The Initial Developer of the Original Code is * The Initial Developer of the Original Code is
* Mozilla Corporation. * the Mozilla Foundation.
*
* Portions created by the Initial Developer are Copyright (C) 2009 * Portions created by the Initial Developer are Copyright (C) 2009
* the Initial Developer. All Rights Reserved. * the Initial Developer. All Rights Reserved.
* *
@ -118,10 +119,10 @@ private:
/** /**
* Typedefs * Typedefs
*/ */
typedef bool (*searchFunctionPtr)(const nsDependentSubstring &aToken, typedef bool (*searchFunctionPtr)(const nsDependentCSubstring &aToken,
const nsAString &aSourceString); const nsACString &aSourceString);
typedef nsAString::const_char_iterator const_wchar_iterator; typedef nsACString::const_char_iterator const_char_iterator;
/** /**
* Obtains the search function to match on. * Obtains the search function to match on.
@ -133,6 +134,18 @@ private:
*/ */
static searchFunctionPtr getSearchFunction(PRInt32 aBehavior); static searchFunctionPtr getSearchFunction(PRInt32 aBehavior);
/**
* Tests if aSourceString starts with aToken.
*
* @param aToken
* The string to search for.
* @param aSourceString
* The string to search.
* @return true if found, false otherwise.
*/
static bool findBeginning(const nsDependentCSubstring &aToken,
const nsACString &aSourceString);
/** /**
* Searches aSourceString for aToken anywhere in the string in a case- * Searches aSourceString for aToken anywhere in the string in a case-
* insensitive way. * insensitive way.
@ -143,20 +156,8 @@ private:
* The string to search. * The string to search.
* @return true if found, false otherwise. * @return true if found, false otherwise.
*/ */
static bool findAnywhere(const nsDependentSubstring &aToken, static bool findAnywhere(const nsDependentCSubstring &aToken,
const nsAString &aSourceString); const nsACString &aSourceString);
/**
* Tests if aSourceString starts with aToken.
*
* @param aToken
* The string to search for.
* @param aSourceString
* The string to search.
* @return true if found, false otherwise.
*/
static bool findBeginning(const nsDependentSubstring &aToken,
const nsAString &aSourceString);
/** /**
* Tests if aToken is found on a word boundary in aSourceString. * Tests if aToken is found on a word boundary in aSourceString.
@ -167,33 +168,9 @@ private:
* The string to search. * The string to search.
* @return true if found, false otherwise. * @return true if found, false otherwise.
*/ */
static bool findOnBoundary(const nsDependentSubstring &aToken, static bool findOnBoundary(const nsDependentCSubstring &aToken,
const nsAString &aSourceString); const nsACString &aSourceString);
/**
* Obtains an iterator to the next word boundary as defined by isWordBoundary.
*
* @param aStart
* An iterator pointing to the start of the string.
* @param aEnd
* An iterator pointing to the end of the string.
* @return an iterator pointing to the next word boundary.
*/
static const_wchar_iterator nextWordBoundary(const_wchar_iterator aStart,
const_wchar_iterator aEnd);
/**
* Determines if aChar is a word boundary. A 'word boundary' is anything that
* is not used to build up a word from a string of characters. We are very
* conservative here because anything that we do not list will be treated as a
* word boundary. This means searching for that not-actually-a-word-boundary
* character can still be matched in the middle of a word.
*
* @param aChar
* The Unicode character to check against.
* @return true if the character is considered a word boundary, false
* otherwise.
*/
static inline bool isWordBoundary(const PRUnichar &aChar);
/** /**
* Fixes a URI's spec such that it is ready to be searched. This includes * Fixes a URI's spec such that it is ready to be searched. This includes
@ -205,7 +182,7 @@ private:
* @param _fixedSpec * @param _fixedSpec
* An out parameter that is the fixed up string. * An out parameter that is the fixed up string.
*/ */
static void fixupURISpec(const nsCString &aURISpec, nsString &_fixedSpec); static void fixupURISpec(const nsCString &aURISpec, nsCString &_fixedSpec);
}; };
} // namespace places } // namespace places