зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1326277 - Upgrade Hunspell to version 1.6.0. r=masayuki
--HG-- extra : rebase_source : dfd3b592a52a708fe62c49c826bfaedea801769d
This commit is contained in:
Родитель
5b5403b0a2
Коммит
7afab1ba7f
|
@ -2048,7 +2048,7 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
|
|||
wordnum = oldwordnum2;
|
||||
|
||||
// perhaps second word is a compound word (recursive call)
|
||||
if (wordnum < maxwordnum) {
|
||||
if (wordnum + 2 < maxwordnum) {
|
||||
rv = compound_check(st.substr(i), wordnum + 1,
|
||||
numsyllable, maxwordnum, wnum + 1, words, rwords, 0,
|
||||
is_sug, info);
|
||||
|
@ -2621,7 +2621,7 @@ int AffixMgr::compound_check_morph(const char* word,
|
|||
wordnum = oldwordnum2;
|
||||
|
||||
// perhaps second word is a compound word (recursive call)
|
||||
if ((wordnum < maxwordnum) && (ok == 0)) {
|
||||
if ((wordnum + 2 < maxwordnum) && (ok == 0)) {
|
||||
compound_check_morph((word + i), strlen(word + i), wordnum + 1,
|
||||
numsyllable, maxwordnum, wnum + 1, words, rwords, 0,
|
||||
result, &presult);
|
||||
|
|
|
@ -83,9 +83,12 @@
|
|||
|
||||
#define MAXSUGGESTION 15
|
||||
#define MAXSHARPS 5
|
||||
#define MAXWORDLEN 100
|
||||
|
||||
#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)
|
||||
#ifndef MAXWORDLEN
|
||||
#define MAXWORDLEN 100
|
||||
#endif
|
||||
|
||||
#if defined __GNUC__ && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
|
||||
# define H_DEPRECATED __attribute__((__deprecated__))
|
||||
#elif defined(_MSC_VER) && (_MSC_VER >= 1300)
|
||||
# define H_DEPRECATED __declspec(deprecated)
|
||||
|
|
|
@ -1,24 +0,0 @@
|
|||
Bug 1322666 - Change MAXWORDLEN to 100
|
||||
|
||||
diff --git a/extensions/spellcheck/hunspell/src/hunspell.hxx b/extensions/spellcheck/hunspell/src/hunspell.hxx
|
||||
--- a/extensions/spellcheck/hunspell/src/hunspell.hxx
|
||||
+++ b/extensions/spellcheck/hunspell/src/hunspell.hxx
|
||||
@@ -78,17 +78,17 @@
|
||||
#include "atypes.hxx"
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#define SPELL_XML "<?xml?>"
|
||||
|
||||
#define MAXSUGGESTION 15
|
||||
#define MAXSHARPS 5
|
||||
-#define MAXWORDLEN 176
|
||||
+#define MAXWORDLEN 100
|
||||
|
||||
#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)
|
||||
# define H_DEPRECATED __attribute__((__deprecated__))
|
||||
#elif defined(_MSC_VER) && (_MSC_VER >= 1300)
|
||||
# define H_DEPRECATED __declspec(deprecated)
|
||||
#else
|
||||
# define H_DEPRECATED
|
||||
#endif
|
|
@ -1050,12 +1050,12 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
|
|||
phonetable* ph = (pAMgr) ? pAMgr->get_phonetable() : NULL;
|
||||
std::string target;
|
||||
std::string candidate;
|
||||
std::vector<w_char> w_candidate;
|
||||
if (ph) {
|
||||
if (utf8) {
|
||||
std::vector<w_char> _w;
|
||||
u8_u16(_w, word);
|
||||
mkallcap_utf(_w, langnum);
|
||||
u16_u8(candidate, _w);
|
||||
u8_u16(w_candidate, word);
|
||||
mkallcap_utf(w_candidate, langnum);
|
||||
u16_u8(candidate, w_candidate);
|
||||
} else {
|
||||
candidate.assign(word);
|
||||
if (!nonbmp)
|
||||
|
@ -1069,6 +1069,17 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
|
|||
FLAG nongramsuggest = pAMgr ? pAMgr->get_nongramsuggest() : FLAG_NULL;
|
||||
FLAG onlyincompound = pAMgr ? pAMgr->get_onlyincompound() : FLAG_NULL;
|
||||
|
||||
std::vector<w_char> w_word, w_target;
|
||||
if (utf8) {
|
||||
u8_u16(w_word, word);
|
||||
u8_u16(w_target, target);
|
||||
}
|
||||
|
||||
std::vector<w_char> w_entry;
|
||||
std::string f;
|
||||
std::vector<w_char> w_f;
|
||||
std::vector<w_char> w_target2;
|
||||
|
||||
for (size_t i = 0; i < rHMgr.size(); ++i) {
|
||||
while (0 != (hp = rHMgr[i]->walk_hashtable(col, hp))) {
|
||||
if ((hp->astr) && (pAMgr) &&
|
||||
|
@ -1079,15 +1090,30 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
|
|||
TESTAFF(hp->astr, onlyincompound, hp->alen)))
|
||||
continue;
|
||||
|
||||
sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + low) +
|
||||
leftcommonsubstring(word, HENTRY_WORD(hp));
|
||||
if (utf8) {
|
||||
w_entry.clear();
|
||||
u8_u16(w_entry, HENTRY_WORD(hp));
|
||||
sc = ngram(3, w_word, w_entry, NGRAM_LONGER_WORSE + low) +
|
||||
leftcommonsubstring(w_word, w_entry);
|
||||
} else {
|
||||
sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + low) +
|
||||
leftcommonsubstring(word, HENTRY_WORD(hp));
|
||||
}
|
||||
|
||||
// check special pronounciation
|
||||
std::string f;
|
||||
f.clear();
|
||||
if ((hp->var & H_OPT_PHON) &&
|
||||
copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) {
|
||||
int sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + low) +
|
||||
+leftcommonsubstring(word, f.c_str());
|
||||
int sc2;
|
||||
if (utf8) {
|
||||
w_f.clear();
|
||||
u8_u16(w_f, f.c_str());
|
||||
sc2 = ngram(3, w_word, w_f, NGRAM_LONGER_WORSE + low) +
|
||||
leftcommonsubstring(w_word, w_f);
|
||||
} else {
|
||||
sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + low) +
|
||||
leftcommonsubstring(word, f.c_str());
|
||||
}
|
||||
if (sc2 > sc)
|
||||
sc = sc2;
|
||||
}
|
||||
|
@ -1095,16 +1121,24 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
|
|||
int scphon = -20000;
|
||||
if (ph && (sc > 2) && (abs(n - (int)hp->clen) <= 3)) {
|
||||
if (utf8) {
|
||||
std::vector<w_char> _w;
|
||||
u8_u16(_w, HENTRY_WORD(hp));
|
||||
mkallcap_utf(_w, langnum);
|
||||
u16_u8(candidate, _w);
|
||||
w_candidate.clear();
|
||||
u8_u16(w_candidate, HENTRY_WORD(hp));
|
||||
mkallcap_utf(w_candidate, langnum);
|
||||
u16_u8(candidate, w_candidate);
|
||||
} else {
|
||||
candidate.assign(HENTRY_WORD(hp));
|
||||
candidate = HENTRY_WORD(hp);
|
||||
mkallcap(candidate, csconv);
|
||||
}
|
||||
std::string target2 = phonet(candidate, *ph);
|
||||
scphon = 2 * ngram(3, target, target2, NGRAM_LONGER_WORSE);
|
||||
w_target2.clear();
|
||||
if (utf8) {
|
||||
u8_u16(w_target2, target2.c_str());
|
||||
scphon = 2 * ngram(3, w_target, w_target2,
|
||||
NGRAM_LONGER_WORSE);
|
||||
} else {
|
||||
scphon = 2 * ngram(3, target, target2,
|
||||
NGRAM_LONGER_WORSE);
|
||||
}
|
||||
}
|
||||
|
||||
if (sc > scores[lp]) {
|
||||
|
@ -1134,22 +1168,21 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
|
|||
// find minimum threshold for a passable suggestion
|
||||
// mangle original word three differnt ways
|
||||
// and score them to generate a minimum acceptable score
|
||||
std::vector<w_char> w_mw;
|
||||
int thresh = 0;
|
||||
for (int sp = 1; sp < 4; sp++) {
|
||||
if (utf8) {
|
||||
u8_u16(u8, word);
|
||||
w_mw = w_word;
|
||||
for (int k = sp; k < n; k += 4) {
|
||||
u8[k].l = '*';
|
||||
u8[k].h = 0;
|
||||
w_mw[k].l = '*';
|
||||
w_mw[k].h = 0;
|
||||
}
|
||||
std::string mw;
|
||||
u16_u8(mw, u8);
|
||||
thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low);
|
||||
thresh += ngram(n, w_word, w_mw, NGRAM_ANY_MISMATCH + low);
|
||||
} else {
|
||||
std::string mw(word);
|
||||
std::string mw = word;
|
||||
for (int k = sp; k < n; k += 4)
|
||||
mw[k] = '*';
|
||||
thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low);
|
||||
thresh += ngram(n, word, mw, NGRAM_ANY_MISMATCH + low);
|
||||
}
|
||||
}
|
||||
thresh = thresh / 3;
|
||||
|
@ -1177,11 +1210,12 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
|
|||
return;
|
||||
}
|
||||
|
||||
std::vector<w_char> w_glst_word;
|
||||
for (int i = 0; i < MAX_ROOTS; i++) {
|
||||
if (roots[i]) {
|
||||
struct hentry* rp = roots[i];
|
||||
|
||||
std::string f;
|
||||
f.clear();
|
||||
const char *field = NULL;
|
||||
if ((rp->var & H_OPT_PHON) && copy_field(f, HENTRY_DATA(rp), MORPH_PHON))
|
||||
field = f.c_str();
|
||||
|
@ -1190,8 +1224,17 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
|
|||
nc, field);
|
||||
|
||||
for (int k = 0; k < nw; k++) {
|
||||
sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH + low) +
|
||||
leftcommonsubstring(word, glst[k].word);
|
||||
if (utf8) {
|
||||
w_glst_word.clear();
|
||||
u8_u16(w_glst_word, glst[k].word);
|
||||
sc = ngram(n, w_word, w_glst_word,
|
||||
NGRAM_ANY_MISMATCH + low) +
|
||||
leftcommonsubstring(w_word, w_glst_word);
|
||||
} else {
|
||||
sc = ngram(n, word, glst[k].word,
|
||||
NGRAM_ANY_MISMATCH + low) +
|
||||
leftcommonsubstring(word, glst[k].word);
|
||||
}
|
||||
|
||||
if (sc > thresh) {
|
||||
if (sc > gscore[lp]) {
|
||||
|
@ -1245,16 +1288,17 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
|
|||
fact = (10.0 - maxd) / 5.0;
|
||||
}
|
||||
|
||||
std::vector<w_char> w_gl;
|
||||
for (int i = 0; i < MAX_GUESS; i++) {
|
||||
if (guess[i]) {
|
||||
// lowering guess[i]
|
||||
std::string gl;
|
||||
int len;
|
||||
if (utf8) {
|
||||
std::vector<w_char> _w;
|
||||
len = u8_u16(_w, guess[i]);
|
||||
mkallsmall_utf(_w, langnum);
|
||||
u16_u8(gl, _w);
|
||||
w_gl.clear();
|
||||
len = u8_u16(w_gl, guess[i]);
|
||||
mkallsmall_utf(w_gl, langnum);
|
||||
u16_u8(gl, w_gl);
|
||||
} else {
|
||||
gl.assign(guess[i]);
|
||||
if (!nonbmp)
|
||||
|
@ -1271,14 +1315,29 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
|
|||
}
|
||||
// using 2-gram instead of 3, and other weightening
|
||||
|
||||
re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) +
|
||||
ngram(2, gl, word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED);
|
||||
w_gl.clear();
|
||||
if (utf8) {
|
||||
u8_u16(w_gl, gl);
|
||||
re = ngram(2, w_word, w_gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) +
|
||||
ngram(2, w_gl, w_word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED);
|
||||
} else {
|
||||
re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) +
|
||||
ngram(2, gl, word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED);
|
||||
}
|
||||
|
||||
int ngram_score, leftcommon_score;
|
||||
if (utf8) {
|
||||
ngram_score = ngram(4, w_word, w_gl, NGRAM_ANY_MISMATCH + low);
|
||||
leftcommon_score = leftcommonsubstring(w_word, w_gl);
|
||||
} else {
|
||||
ngram_score = ngram(4, word, gl, NGRAM_ANY_MISMATCH + low);
|
||||
leftcommon_score = leftcommonsubstring(word, gl.c_str());
|
||||
}
|
||||
gscore[i] =
|
||||
// length of longest common subsequent minus length difference
|
||||
2 * _lcs - abs((int)(n - len)) +
|
||||
// weight length of the left common substring
|
||||
leftcommonsubstring(word, gl.c_str()) +
|
||||
leftcommon_score +
|
||||
// weight equal character positions
|
||||
(!nonbmp && commoncharacterpositions(word, gl.c_str(), &is_swap)
|
||||
? 1
|
||||
|
@ -1286,7 +1345,7 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
|
|||
// swap character (not neighboring)
|
||||
((is_swap) ? 10 : 0) +
|
||||
// ngram
|
||||
ngram(4, word, gl, NGRAM_ANY_MISMATCH + low) +
|
||||
ngram_score +
|
||||
// weighted ngrams
|
||||
re +
|
||||
// different limit for dictionaries with PHONE rules
|
||||
|
@ -1304,11 +1363,11 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
|
|||
// lowering rootphon[i]
|
||||
std::string gl;
|
||||
int len;
|
||||
w_gl.clear();
|
||||
if (utf8) {
|
||||
std::vector<w_char> _w;
|
||||
len = u8_u16(_w, rootsphon[i]);
|
||||
mkallsmall_utf(_w, langnum);
|
||||
u16_u8(gl, _w);
|
||||
len = u8_u16(w_gl, rootsphon[i]);
|
||||
mkallsmall_utf(w_gl, langnum);
|
||||
u16_u8(gl, w_gl);
|
||||
} else {
|
||||
gl.assign(rootsphon[i]);
|
||||
if (!nonbmp)
|
||||
|
@ -1316,10 +1375,15 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
|
|||
len = strlen(rootsphon[i]);
|
||||
}
|
||||
|
||||
// weight length of the left common substring
|
||||
int leftcommon_score;
|
||||
if (utf8)
|
||||
leftcommon_score = leftcommonsubstring(w_word, w_gl);
|
||||
else
|
||||
leftcommon_score = leftcommonsubstring(word, gl.c_str());
|
||||
// heuristic weigthing of ngram scores
|
||||
scoresphon[i] += 2 * lcslen(word, gl) - abs((int)(n - len)) +
|
||||
// weight length of the left common substring
|
||||
leftcommonsubstring(word, gl.c_str());
|
||||
leftcommon_score;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1724,10 +1788,10 @@ std::string SuggestMgr::suggest_gen(const std::vector<std::string>& desc, const
|
|||
return result2;
|
||||
}
|
||||
|
||||
// generate an n-gram score comparing s1 and s2
|
||||
// generate an n-gram score comparing s1 and s2, UTF16 version
|
||||
int SuggestMgr::ngram(int n,
|
||||
const std::string& s1,
|
||||
const std::string& s2,
|
||||
const std::vector<w_char>& su1,
|
||||
const std::vector<w_char>& su2,
|
||||
int opt) {
|
||||
int nscore = 0;
|
||||
int ns;
|
||||
|
@ -1735,68 +1799,44 @@ int SuggestMgr::ngram(int n,
|
|||
int l2;
|
||||
int test = 0;
|
||||
|
||||
if (utf8) {
|
||||
std::vector<w_char> su1;
|
||||
std::vector<w_char> su2;
|
||||
l1 = u8_u16(su1, s1);
|
||||
l2 = u8_u16(su2, s2);
|
||||
if ((l2 <= 0) || (l1 == -1))
|
||||
return 0;
|
||||
// lowering dictionary word
|
||||
if (opt & NGRAM_LOWERING)
|
||||
mkallsmall_utf(su2, langnum);
|
||||
for (int j = 1; j <= n; j++) {
|
||||
ns = 0;
|
||||
for (int i = 0; i <= (l1 - j); i++) {
|
||||
int k = 0;
|
||||
for (int l = 0; l <= (l2 - j); l++) {
|
||||
for (k = 0; k < j; k++) {
|
||||
w_char& c1 = su1[i + k];
|
||||
w_char& c2 = su2[l + k];
|
||||
if ((c1.l != c2.l) || (c1.h != c2.h))
|
||||
break;
|
||||
}
|
||||
if (k == j) {
|
||||
ns++;
|
||||
l1 = su1.size();
|
||||
l2 = su2.size();
|
||||
if (l2 == 0)
|
||||
return 0;
|
||||
// lowering dictionary word
|
||||
const std::vector<w_char>* p_su2 = &su2;
|
||||
std::vector<w_char> su2_copy;
|
||||
if (opt & NGRAM_LOWERING) {
|
||||
su2_copy = su2;
|
||||
mkallsmall_utf(su2_copy, langnum);
|
||||
p_su2 = &su2_copy;
|
||||
}
|
||||
for (int j = 1; j <= n; j++) {
|
||||
ns = 0;
|
||||
for (int i = 0; i <= (l1 - j); i++) {
|
||||
int k = 0;
|
||||
for (int l = 0; l <= (l2 - j); l++) {
|
||||
for (k = 0; k < j; k++) {
|
||||
const w_char& c1 = su1[i + k];
|
||||
const w_char& c2 = (*p_su2)[l + k];
|
||||
if ((c1.l != c2.l) || (c1.h != c2.h))
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (k != j && opt & NGRAM_WEIGHTED) {
|
||||
ns--;
|
||||
test++;
|
||||
if (i == 0 || i == l1 - j)
|
||||
ns--; // side weight
|
||||
}
|
||||
}
|
||||
nscore = nscore + ns;
|
||||
if (ns < 2 && !(opt & NGRAM_WEIGHTED))
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
l2 = s2.size();
|
||||
if (l2 == 0)
|
||||
return 0;
|
||||
l1 = s1.size();
|
||||
std::string t(s2);
|
||||
if (opt & NGRAM_LOWERING)
|
||||
mkallsmall(t, csconv);
|
||||
for (int j = 1; j <= n; j++) {
|
||||
ns = 0;
|
||||
for (int i = 0; i <= (l1 - j); i++) {
|
||||
//t is haystack, s1[i..i+j) is needle
|
||||
if (t.find(s1.c_str()+i, 0, j) != std::string::npos) {
|
||||
if (k == j) {
|
||||
ns++;
|
||||
} else if (opt & NGRAM_WEIGHTED) {
|
||||
ns--;
|
||||
test++;
|
||||
if (i == 0 || i == l1 - j)
|
||||
ns--; // side weight
|
||||
break;
|
||||
}
|
||||
}
|
||||
nscore = nscore + ns;
|
||||
if (ns < 2 && !(opt & NGRAM_WEIGHTED))
|
||||
break;
|
||||
if (k != j && opt & NGRAM_WEIGHTED) {
|
||||
ns--;
|
||||
test++;
|
||||
if (i == 0 || i == l1 - j)
|
||||
ns--; // side weight
|
||||
}
|
||||
}
|
||||
nscore = nscore + ns;
|
||||
if (ns < 2 && !(opt & NGRAM_WEIGHTED))
|
||||
break;
|
||||
}
|
||||
|
||||
ns = 0;
|
||||
|
@ -1808,46 +1848,95 @@ int SuggestMgr::ngram(int n,
|
|||
return ns;
|
||||
}
|
||||
|
||||
// length of the left common substring of s1 and (decapitalised) s2
|
||||
int SuggestMgr::leftcommonsubstring(const char* s1, const char* s2) {
|
||||
if (utf8) {
|
||||
std::vector<w_char> su1;
|
||||
std::vector<w_char> su2;
|
||||
int l1 = u8_u16(su1, s1);
|
||||
int l2 = u8_u16(su2, s2);
|
||||
// decapitalize dictionary word
|
||||
if (complexprefixes) {
|
||||
if (su1[l1 - 1] == su2[l2 - 1])
|
||||
return 1;
|
||||
} else {
|
||||
unsigned short idx = su2.empty() ? 0 : (su2[0].h << 8) + su2[0].l;
|
||||
unsigned short otheridx = su1.empty() ? 0 : (su1[0].h << 8) + su1[0].l;
|
||||
if (otheridx != idx && (otheridx != unicodetolower(idx, langnum)))
|
||||
return 0;
|
||||
int i;
|
||||
for (i = 1; (i < l1) && (i < l2) && (su1[i].l == su2[i].l) &&
|
||||
(su1[i].h == su2[i].h);
|
||||
i++)
|
||||
;
|
||||
return i;
|
||||
// generate an n-gram score comparing s1 and s2, non-UTF16 version
|
||||
int SuggestMgr::ngram(int n,
|
||||
const std::string& s1,
|
||||
const std::string& s2,
|
||||
int opt) {
|
||||
int nscore = 0;
|
||||
int ns;
|
||||
int l1;
|
||||
int l2;
|
||||
int test = 0;
|
||||
|
||||
l2 = s2.size();
|
||||
if (l2 == 0)
|
||||
return 0;
|
||||
l1 = s1.size();
|
||||
std::string t(s2);
|
||||
if (opt & NGRAM_LOWERING)
|
||||
mkallsmall(t, csconv);
|
||||
for (int j = 1; j <= n; j++) {
|
||||
ns = 0;
|
||||
for (int i = 0; i <= (l1 - j); i++) {
|
||||
//t is haystack, s1[i..i+j) is needle
|
||||
if (t.find(s1.c_str()+i, 0, j) != std::string::npos) {
|
||||
ns++;
|
||||
} else if (opt & NGRAM_WEIGHTED) {
|
||||
ns--;
|
||||
test++;
|
||||
if (i == 0 || i == l1 - j)
|
||||
ns--; // side weight
|
||||
}
|
||||
}
|
||||
nscore = nscore + ns;
|
||||
if (ns < 2 && !(opt & NGRAM_WEIGHTED))
|
||||
break;
|
||||
}
|
||||
|
||||
ns = 0;
|
||||
if (opt & NGRAM_LONGER_WORSE)
|
||||
ns = (l2 - l1) - 2;
|
||||
if (opt & NGRAM_ANY_MISMATCH)
|
||||
ns = abs(l2 - l1) - 2;
|
||||
ns = (nscore - ((ns > 0) ? ns : 0));
|
||||
return ns;
|
||||
}
|
||||
|
||||
// length of the left common substring of s1 and (decapitalised) s2, UTF version
|
||||
int SuggestMgr::leftcommonsubstring(
|
||||
const std::vector<w_char>& su1,
|
||||
const std::vector<w_char>& su2) {
|
||||
int l1 = su1.size();
|
||||
int l2 = su2.size();
|
||||
// decapitalize dictionary word
|
||||
if (complexprefixes) {
|
||||
if (su1[l1 - 1] == su2[l2 - 1])
|
||||
return 1;
|
||||
} else {
|
||||
if (complexprefixes) {
|
||||
int l1 = strlen(s1);
|
||||
int l2 = strlen(s2);
|
||||
if (l1 <= l2 && s2[l1 - 1] == s2[l2 - 1])
|
||||
return 1;
|
||||
} else if (csconv) {
|
||||
const char* olds = s1;
|
||||
// decapitalise dictionary word
|
||||
if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower))
|
||||
return 0;
|
||||
do {
|
||||
s1++;
|
||||
s2++;
|
||||
} while ((*s1 == *s2) && (*s1 != '\0'));
|
||||
return (int)(s1 - olds);
|
||||
}
|
||||
unsigned short idx = su2.empty() ? 0 : (su2[0].h << 8) + su2[0].l;
|
||||
unsigned short otheridx = su1.empty() ? 0 : (su1[0].h << 8) + su1[0].l;
|
||||
if (otheridx != idx && (otheridx != unicodetolower(idx, langnum)))
|
||||
return 0;
|
||||
int i;
|
||||
for (i = 1; (i < l1) && (i < l2) && (su1[i].l == su2[i].l) &&
|
||||
(su1[i].h == su2[i].h);
|
||||
i++)
|
||||
;
|
||||
return i;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// length of the left common substring of s1 and (decapitalised) s2, non-UTF
|
||||
int SuggestMgr::leftcommonsubstring(
|
||||
const char* s1,
|
||||
const char* s2) {
|
||||
if (complexprefixes) {
|
||||
int l1 = strlen(s1);
|
||||
int l2 = strlen(s2);
|
||||
if (l1 <= l2 && s2[l1 - 1] == s2[l2 - 1])
|
||||
return 1;
|
||||
} else if (csconv) {
|
||||
const char* olds = s1;
|
||||
// decapitalise dictionary word
|
||||
if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower))
|
||||
return 0;
|
||||
do {
|
||||
s1++;
|
||||
s2++;
|
||||
} while ((*s1 == *s2) && (*s1 != '\0'));
|
||||
return (int)(s1 - olds);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -173,8 +173,12 @@ class SuggestMgr {
|
|||
const std::vector<mapentry>&,
|
||||
int*,
|
||||
clock_t*);
|
||||
int ngram(int n, const std::vector<w_char>& su1,
|
||||
const std::vector<w_char>& su2, int opt);
|
||||
int ngram(int n, const std::string& s1, const std::string& s2, int opt);
|
||||
int mystrlen(const char* word);
|
||||
int leftcommonsubstring(const std::vector<w_char>& su1,
|
||||
const std::vector<w_char>& su2);
|
||||
int leftcommonsubstring(const char* s1, const char* s2);
|
||||
int commoncharacterpositions(const char* s1, const char* s2, int* is_swap);
|
||||
void bubblesort(char** rwd, char** rwd2, int* rsc, int n);
|
||||
|
|
Загрузка…
Ссылка в новой задаче