Add support for REP tables in spellchecker, to improve suggestions by knowing about common misspellings

bug 227214, patch by mscott and me, r=mscott, sr=bienvenu
This commit is contained in:
mvl%exedo.nl 2004-04-13 12:09:44 +00:00
Родитель 071d8d7188
Коммит ebf5bcc2b5
4 изменённых файлов: 164 добавлений и 20 удалений

Просмотреть файл

@ -62,6 +62,7 @@
#include "nsNetUtil.h"
#include "nsICharsetConverterManager.h"
#include "nsUnicharUtilCIID.h"
#include "nsUnicharUtils.h"
static NS_DEFINE_CID(kCharsetConverterManagerCID, NS_ICHARSETCONVERTERMANAGER_CID);
static NS_DEFINE_CID(kUnicharUtilCID, NS_UNICHARUTIL_CID);
@ -69,7 +70,8 @@ static NS_DEFINE_CID(kUnicharUtilCID, NS_UNICHARUTIL_CID);
static PRInt32 SplitString(nsACString &in,nsCString out[],PRInt32 size);
static void doubleReverseHack(nsACString &s);
myspAffixMgr::myspAffixMgr()
// Constructs an affix manager with no REP (replacement) table loaded.
// Both the table pointer and its length are initialised so that
// getReplaceTableLength() never returns an indeterminate value when it is
// called before an affix file with a REP section has been parsed.
myspAffixMgr::myspAffixMgr() :
  mReplaceTable(nsnull),
  mReplaceTableLength(0)
{
}
@ -77,6 +79,7 @@ myspAffixMgr::myspAffixMgr()
// Tears down the affix manager: drops the personal dictionary reference and
// frees the REP (replacement) table.
myspAffixMgr::~myspAffixMgr()
{
mPersonalDictionary = nsnull;
// mReplaceTable is allocated with new[] in parse_file, hence delete[].
// It is nsnull when no REP section was read; delete[] on null is a no-op.
delete[] mReplaceTable;
}
nsresult myspAffixMgr::GetPersonalDictionary(mozIPersonalDictionary * *aPersonalDictionary)
@ -160,7 +163,6 @@ nsresult myspAffixMgr::parse_file(nsIInputStream *strm)
PRInt32 numents;
nsLineBuffer *lineBuffer;
nsresult rv = NS_InitLineBuffer(&lineBuffer);
nsCAutoString line;
PRBool moreData=PR_TRUE;
PRInt32 pos;
nsCString cmds[5];
@ -169,6 +171,9 @@ nsresult myspAffixMgr::parse_file(nsIInputStream *strm)
prefixes.clear();
suffixes.clear();
nsCOMPtr<nsICharsetConverterManager> ccm = do_GetService(kCharsetConverterManagerCID, &rv);
NS_ENSURE_SUCCESS(rv, rv);
numents = 0; // number of affentry structures to parse
char flag='\0'; // affix char identifier
{
@ -179,6 +184,7 @@ nsresult myspAffixMgr::parse_file(nsIInputStream *strm)
// read in each line ignoring any that do not
// start with PFX or SFX
nsCAutoString line;
while (moreData) {
NS_ReadLine(strm,lineBuffer,line,&moreData);
/* parse in the try string */
@ -191,11 +197,56 @@ nsresult myspAffixMgr::parse_file(nsIInputStream *strm)
/* parse in the name of the character set used by the .dict and .aff */
if (Substring(line,0,3).Equals("SET")) {
pos = line.FindChar(' ');
if(pos != -1){
mEncoding.Assign(Substring(line,pos+1,line.Length()-pos-1));
mEncoding.CompressWhitespace(PR_TRUE,PR_TRUE);
rv = ccm->GetUnicodeDecoder(mEncoding.get(), getter_AddRefs(mDecoder));
NS_ENSURE_SUCCESS(rv, rv);
rv = ccm->GetUnicodeEncoder(mEncoding.get(), getter_AddRefs(mEncoder));
if (mEncoder && NS_SUCCEEDED(rv)) {
mEncoder->SetOutputErrorBehavior(mEncoder->kOnError_Signal, nsnull, '?');
}
NS_ENSURE_SUCCESS(rv, rv);
}
}
/* parse in the typical fault correcting table */
if (Substring(line,0,3).Equals("REP")) {
PRInt32 numFields = SplitString(line, cmds, 3);
if (numFields == 2)
numents = atoi(cmds[1].get());
mReplaceTable = new mozReplaceTable[numents];
mReplaceTableLength = numents;
PRInt32 i = 0;
nsAutoString pattern, replacement;
for (j = 0; (j < numents) && moreData; j++) {
NS_ReadLine(strm,lineBuffer,line,&moreData);
numFields = SplitString(line, cmds, 3);
if(!cmds[0].Equals("REP")) { //consistency check
NS_WARNING("REP line from .aff file is inconsitent");
continue;
}
rv = DecodeString(cmds[1], pattern);
NS_ENSURE_SUCCESS(rv, rv);
rv = DecodeString(cmds[2], replacement);
NS_ENSURE_SUCCESS(rv, rv);
// Make sure the replacements are lower case.
// We don't want to convert them for every lookup.
ToLowerCase(pattern);
ToLowerCase(replacement);
mReplaceTable[i].pattern = pattern.get();
mReplaceTable[i].replacement = replacement.get();
i++;
}
}
@ -207,7 +258,7 @@ nsresult myspAffixMgr::parse_file(nsIInputStream *strm)
numents = 0;
ff=0;
// split line into pieces
PRInt32 numFields=SplitString(line,cmds,5);
PRInt32 numFields=SplitString(line, cmds, 5);
if(numFields > 1)flag=cmds[1].First();
if((numFields > 2)&&(cmds[2].First()=='Y'))ff=XPRODUCT;
if(numFields >3)numents = atoi(cmds[3].get());
@ -219,7 +270,7 @@ nsresult myspAffixMgr::parse_file(nsIInputStream *strm)
nsString tempStr;
if((numFields < 5)||(cmds[1].First()!=flag)){ //consistency check
//complain loudly
NS_WARNING("PFX/SFX line from .aff file is inconsitent");
continue;
}
if(cmds[3].Equals("0")){
@ -239,7 +290,7 @@ nsresult myspAffixMgr::parse_file(nsIInputStream *strm)
else{ // cmds[2] != 0
newMod.mAppend.Assign( cmds[2]);
if((cmds[2].Length()>cmds[4].Length())||!cmds[2].Equals(Substring(cmds[4],0,cmds[2].Length()))){
//complain loudly
NS_WARNING("PFX/SFX line from .aff file is inconsitent");
continue;
}
cmds[3].Append(Substring(cmds[4],cmds[2].Length(),cmds[4].Length()-cmds[2].Length()));
@ -262,7 +313,7 @@ nsresult myspAffixMgr::parse_file(nsIInputStream *strm)
newMod.mAppend.Assign( cmds[2]);
if((cmds[2].Length()>cmds[4].Length())||
!cmds[2].Equals(Substring(cmds[4],cmds[4].Length()-cmds[2].Length(),cmds[2].Length()))){
//complain loudly
NS_WARNING("PFX/SFX line from .aff file is inconsitent");
continue;
}
suffixTest=Substring(cmds[4],0,cmds[4].Length()-cmds[2].Length());
@ -276,17 +327,7 @@ nsresult myspAffixMgr::parse_file(nsIInputStream *strm)
}
}
// We do this here, instead of where we set the charset,
// to prevent all kind of leakage in case it fails.
nsCOMPtr<nsICharsetConverterManager> ccm = do_GetService(kCharsetConverterManagerCID, &rv);
NS_ENSURE_SUCCESS(rv, rv);
rv = ccm->GetUnicodeDecoder(mEncoding.get(), getter_AddRefs(mDecoder));
NS_ENSURE_SUCCESS(rv, rv);
rv = ccm->GetUnicodeEncoder(mEncoding.get(), getter_AddRefs(mEncoder));
if (mEncoder && NS_SUCCEEDED(rv)) {
mEncoder->SetOutputErrorBehavior(mEncoder->kOnError_Signal, nsnull, '?');
}
return rv;
return NS_OK;
}
@ -356,6 +397,16 @@ void myspAffixMgr::get_try_string(nsAString &aTryString)
}
}
// Returns the REP (replacement) table read from the affix file, or nsnull
// when none has been loaded. The array stays owned by this object (it is
// freed in the destructor), so callers must not delete it.
mozReplaceTable *myspAffixMgr::getReplaceTable()
{
return mReplaceTable;
}
// Returns the number of entries in the array handed out by getReplaceTable().
// Only meaningful when getReplaceTable() returns a non-null table.
PRUint32 myspAffixMgr::getReplaceTableLength()
{
return mReplaceTableLength;
}
PRBool
myspAffixMgr::prefixCheck(const nsAFlatCString &word)
{
@ -465,6 +516,27 @@ PRBool myspAffixMgr::check(const nsAFlatString &word)
return good;
}
// Decodes aSource (bytes in the dictionary's character set) into aDest
// using mDecoder.
//
// When no decoder has been set up, aDest is emptied and NS_OK is returned.
// Otherwise the result of the decoder's Convert call is returned; aDest
// receives whatever the decoder produced, even when that result is a failure.
// Returns NS_ERROR_OUT_OF_MEMORY if the scratch buffer cannot be allocated.
nsresult
myspAffixMgr::DecodeString(const nsAFlatCString &aSource, nsAString &aDest)
{
  if (!mDecoder) {
    // Nothing to decode with: report success with an empty result.
    aDest.Truncate();
    return NS_OK;
  }

  PRInt32 srcLen = aSource.Length();
  PRInt32 dstLen;
  nsresult rv = mDecoder->GetMaxLength(aSource.get(), srcLen, &dstLen);
  NS_ENSURE_SUCCESS(rv, rv);

  // Scratch buffer sized by the decoder's estimate, plus one slot for the
  // terminating zero written below.
  PRUnichar *buffer = (PRUnichar *)malloc((dstLen + 1) * sizeof(PRUnichar));
  if (!buffer)
    return NS_ERROR_OUT_OF_MEMORY;

  // NOTE(review): relies on Convert updating dstLen to the number of units
  // actually written -- confirm against nsIUnicodeDecoder.
  rv = mDecoder->Convert(aSource.get(), &srcLen, buffer, &dstLen);
  buffer[dstLen] = 0;

  aDest.Assign(buffer);
  free(buffer);
  return rv;
}
static PRInt32
SplitString(nsACString &in,nsCString out[],PRInt32 size)

Просмотреть файл

@ -79,6 +79,10 @@
#define XPRODUCT 1
// One entry of the REP (replacement) table read from the affix file:
// a commonly mistyped character sequence and the text to try in its place.
// parse_file lower-cases both strings before storing them.
struct mozReplaceTable {
nsString pattern;      // sequence to look for in the misspelled word
nsString replacement;  // text substituted for the pattern
};
class myspPrefix;
class myspSuffix;
@ -91,6 +95,8 @@ public:
~myspAffixMgr();
nsresult GetPersonalDictionary(mozIPersonalDictionary * *aPersonalDictionary);
nsresult SetPersonalDictionary(mozIPersonalDictionary * aPersonalDictionary);
mozReplaceTable *getReplaceTable();
PRUint32 getReplaceTableLength();
PRBool check(const nsAFlatString &word);
void get_try_string(nsAString &aTryString);
nsresult Load(const nsString &aDictionary);
@ -101,7 +107,9 @@ protected:
PRBool suffixCheck(const nsAFlatCString &word,PRBool cross=PR_FALSE,char crossID=' ');
nsresult LoadDictionary(nsIInputStream *strm);
nsresult parse_file(nsIInputStream *strm);
nsresult parse_file(nsIInputStream *strm);
nsresult DecodeString(const nsAFlatCString &aSource, nsAString &aDest);
mozAffixState prefixes;
mozAffixState suffixes;
@ -110,6 +118,8 @@ protected:
nsCString mEncoding;
nsString mLanguage;
mozCStr2CStrHashtable mHashTable;
mozReplaceTable *mReplaceTable;
PRUint32 mReplaceTableLength;
nsCOMPtr<mozIPersonalDictionary> mPersonalDictionary;
nsCOMPtr<nsIUnicodeEncoder> mEncoder;
nsCOMPtr<nsIUnicodeDecoder> mDecoder;

Просмотреть файл

@ -57,6 +57,7 @@
#include "plstr.h"
#include "nsReadableUtils.h"
#include "nsMemory.h"
#include "nsUnicharUtils.h"
myspSuggestMgr::myspSuggestMgr()
{
@ -104,9 +105,15 @@ nsresult myspSuggestMgr::suggest(PRUnichar ***slst,const nsAFlatString &word, PR
nsug=*num;
}
// perhaps we made a typical spelling error.
res = replacechars(wlst, word, &nsug);
// did we forget to add a char
res = forgotchar(wlst, word, &nsug);
if ((nsug < maxSug) && NS_SUCCEEDED(res)){
res = forgotchar(wlst, word, &nsug);
}
// did we swap the order of chars by mistake
if ((nsug < maxSug) && NS_SUCCEEDED(res)){
res = swapchar(wlst, word, &nsug);
@ -141,6 +148,60 @@ nsresult myspSuggestMgr::suggest(PRUnichar ***slst,const nsAFlatString &word, PR
}
// Suggestions for a typical spelling error that may differ by more than
// one letter from the right spelling, driven by the affix manager's REP
// (replacement) table.
//
// For every table entry, each non-overlapping occurrence of the entry's
// pattern in the lower-cased word is replaced (one occurrence at a time) by
// the entry's replacement. A resulting candidate is appended to wlst when it
// is not already listed and the affix manager accepts it as a word.
//
//   wlst - suggestion list; new entries are allocated with ToNewUnicode
//   word - the misspelled word
//   ns   - in/out: number of entries currently in wlst
//
// Returns NS_OK (also when there is nothing to do or the list is full), or
// NS_ERROR_OUT_OF_MEMORY when a suggestion cannot be allocated.
nsresult myspSuggestMgr::replacechars(PRUnichar ** wlst,const nsAFlatString &word, PRUint32 *ns)
{
  nsAutoString lowerWord, candidate;
  PRBool cwrd;
  PRUint32 i, k;
  PRInt32 startOffset, findOffset;

  if (word.Length() < 2 || !pAMgr)
    return NS_OK;

  PRUint32 replaceTableLength = pAMgr->getReplaceTableLength();
  struct mozReplaceTable *replaceTable = pAMgr->getReplaceTable();
  if (replaceTable == nsnull)
    return NS_OK;

  // The table is stored lower-cased, so match against a lower-cased copy of
  // the word. Build it once instead of once per match.
  lowerWord.Assign(word);
  ToLowerCase(lowerWord);

  for (i = 0; i < replaceTableLength; i++) {
    const nsString &pattern = replaceTable[i].pattern;
    const nsString &replacement = replaceTable[i].replacement;

    // An empty pattern has length 0 and so could never advance the search
    // position below; skip it rather than risk looping forever. Entries can
    // be empty when a REP line was skipped or could not be decoded.
    if (pattern.IsEmpty())
      continue;

    startOffset = 0;
    // Always search the unmodified lower-cased word. Searching the previous
    // candidate would yield offsets that no longer line up with the word
    // once a replacement of a different length has been spliced in.
    while ((findOffset = lowerWord.Find(pattern, startOffset)) != -1) {
      candidate.Assign(lowerWord);
      candidate.Replace(findOffset, pattern.Length(), replacement);

      // Skip candidates that are already in the suggestion list.
      cwrd = PR_TRUE;
      for (k = 0; k < *ns; k++) {
        if (candidate.Equals(wlst[k])) {
          cwrd = PR_FALSE;
          break;
        }
      }

      if (cwrd && pAMgr->check(candidate)) {
        if (*ns < maxSug) {
          wlst[*ns] = ToNewUnicode(candidate);
          if (!wlst[*ns])
            return NS_ERROR_OUT_OF_MEMORY;
          (*ns)++;
        } else {
          // The list is full; nothing more can be added.
          return NS_OK;
        }
      }

      // Continue after this occurrence (matches do not overlap).
      startOffset = findOffset + pattern.Length();
    }
  }
  return NS_OK;
}
// error is wrong char in place of correct one
nsresult myspSuggestMgr::badchar(PRUnichar ** wlst,const nsAFlatString &word, PRUint32 *ns)

Просмотреть файл

@ -78,6 +78,7 @@ public:
nsresult suggest(PRUnichar ***slst, const nsAFlatString &word, PRUint32 *num);
protected:
nsresult replacechars(PRUnichar **wlst,const nsAFlatString &word, PRUint32 *num);
nsresult forgotchar(PRUnichar **wlst,const nsAFlatString &word, PRUint32 *num);
nsresult swapchar(PRUnichar **wlst,const nsAFlatString &word, PRUint32 *num);
nsresult extrachar(PRUnichar **wlst,const nsAFlatString &word, PRUint32 *num);