Bug 1368646 - Move code from nsSemanticUnitScanner.cpp to mailnews. r=aceman DONTBUILD

This commit is contained in:
Jorg K 2017-06-05 07:54:08 +02:00
Родитель 095a85de33
Коммит 6fe0dc2a87
2 изменённых файлов: 75 добавлений и 29 удалений

Просмотреть файл

@ -28,6 +28,7 @@
#include "nsIObserverService.h"
#include "nsIChannel.h"
#include "nsDependentSubstring.h"
#include "nsLWBrkCIID.h"
#include "mozilla/ArenaAllocatorExtensions.h" // for ArenaStrdup
@ -713,6 +714,61 @@ nsresult Tokenizer::stripHTML(const nsAString& inString, nsAString& outString)
return utils->ConvertToPlainText(inString, flags, 80, outString);
}
// Copied from nsSemanticUnitScanner.cpp which was removed in bug 1368418.
nsresult Tokenizer::ScannerNext(const char16_t *text, int32_t length, int32_t pos, bool isLastBuffer, int32_t *begin, int32_t *end, bool *_retval)
{
if (!mWordBreaker) {
nsresult rv;
mWordBreaker = do_CreateInstance(NS_WBRK_CONTRACTID, &rv);
if (NS_FAILED(rv))
return rv;
}
// if we reach the end, just return
if (pos >= length) {
*begin = pos;
*end = pos;
*_retval = false;
return NS_OK;
}
nsWordBreakClass char_class = nsIWordBreaker::GetClass(text[pos]);
// If we are in Chinese mode, return one Han letter at a time.
// We should not do this if we are in Japanese or Korean mode.
if (kWbClassHanLetter == char_class) {
*begin = pos;
*end = pos+1;
*_retval = true;
return NS_OK;
}
int32_t next;
// Find the next "word".
next = mWordBreaker->NextWord(text, (uint32_t) length, (uint32_t) pos);
// If we don't have enough text to make decision, return.
if (next == NS_WORDBREAKER_NEED_MORE_TEXT) {
*begin = pos;
*end = isLastBuffer ? length : pos;
*_retval = isLastBuffer;
return NS_OK;
}
// If what we got is space or punct, look at the next break.
if ((char_class == kWbClassSpace) || (char_class == kWbClassPunct)) {
// If the next "word" is not letters,
// call itself recursively with the new pos.
return ScannerNext(text, length, next, isLastBuffer, begin, end, _retval);
}
// For the rest, return.
*begin = pos;
*end = next;
*_retval = true;
return NS_OK;
}
void Tokenizer::tokenize(const char* aText)
{
MOZ_LOG(BayesianFilterLogModule, LogLevel::Debug, ("tokenize: %s", aText));
@ -762,40 +818,26 @@ void Tokenizer::tokenize(const char* aText)
tokenize_ascii_word(word);
else if (isJapanese(word))
tokenize_japanese_word(word);
#if 0
M-C removed this in bug 1368418
else {
nsresult rv;
// use I18N scanner to break this word into meaningful semantic units.
if (!mScanner) {
mScanner = do_CreateInstance(NS_SEMANTICUNITSCANNER_CONTRACTID, &rv);
NS_ASSERTION(NS_SUCCEEDED(rv), "couldn't create semantic unit scanner!");
if (NS_FAILED(rv)) {
return;
}
}
if (mScanner) {
mScanner->Start("UTF-8");
// convert this word from UTF-8 into UCS2.
NS_ConvertUTF8toUTF16 uword(word);
ToLowerCase(uword);
const char16_t* utext = uword.get();
int32_t len = uword.Length(), pos = 0, begin, end;
bool gotUnit;
while (pos < len) {
rv = mScanner->Next(utext, len, pos, true, &begin, &end, &gotUnit);
if (NS_SUCCEEDED(rv) && gotUnit) {
NS_ConvertUTF16toUTF8 utfUnit(utext + begin, end - begin);
add(utfUnit.get());
// advance to end of current unit.
pos = end;
} else {
break;
}
// Convert this word from UTF-8 into UCS2.
NS_ConvertUTF8toUTF16 uword(word);
ToLowerCase(uword);
const char16_t* utext = uword.get();
int32_t len = uword.Length(), pos = 0, begin, end;
bool gotUnit;
while (pos < len) {
rv = ScannerNext(utext, len, pos, true, &begin, &end, &gotUnit);
if (NS_SUCCEEDED(rv) && gotUnit) {
NS_ConvertUTF16toUTF8 utfUnit(utext + begin, end - begin);
add(utfUnit.get());
// Advance to end of current unit.
pos = end;
} else {
break;
}
}
}
#endif
}
}

Просмотреть файл

@ -15,6 +15,7 @@
#include "nsString.h"
#include "nsWeakReference.h"
#include "nsIObserver.h"
#include "nsIWordBreaker.h"
#include "mozilla/ArenaAllocator.h"
@ -148,6 +149,9 @@ private:
nsresult stripHTML(const nsAString& inString, nsAString& outString);
// helper function to escape \n, \t, etc from a CString
void UnescapeCString(nsCString& aCString);
nsresult ScannerNext(const char16_t *text, int32_t length, int32_t pos,
bool isLastBuffer, int32_t *begin, int32_t *end, bool *_retval);
nsCOMPtr<nsIWordBreaker> mWordBreaker;
};
/**