diff --git a/extensions/spellcheck/hunspell/src/affixmgr.cxx b/extensions/spellcheck/hunspell/src/affixmgr.cxx index f288fcb8300d..658a8aa091f1 100644 --- a/extensions/spellcheck/hunspell/src/affixmgr.cxx +++ b/extensions/spellcheck/hunspell/src/affixmgr.cxx @@ -2048,7 +2048,7 @@ struct hentry* AffixMgr::compound_check(const std::string& word, wordnum = oldwordnum2; // perhaps second word is a compound word (recursive call) - if (wordnum < maxwordnum) { + if (wordnum + 2 < maxwordnum) { rv = compound_check(st.substr(i), wordnum + 1, numsyllable, maxwordnum, wnum + 1, words, rwords, 0, is_sug, info); @@ -2621,7 +2621,7 @@ int AffixMgr::compound_check_morph(const char* word, wordnum = oldwordnum2; // perhaps second word is a compound word (recursive call) - if ((wordnum < maxwordnum) && (ok == 0)) { + if ((wordnum + 2 < maxwordnum) && (ok == 0)) { compound_check_morph((word + i), strlen(word + i), wordnum + 1, numsyllable, maxwordnum, wnum + 1, words, rwords, 0, result, &presult); diff --git a/extensions/spellcheck/hunspell/src/hunspell.hxx b/extensions/spellcheck/hunspell/src/hunspell.hxx index bd25b017ddf4..43af66b5ac13 100644 --- a/extensions/spellcheck/hunspell/src/hunspell.hxx +++ b/extensions/spellcheck/hunspell/src/hunspell.hxx @@ -83,9 +83,12 @@ #define MAXSUGGESTION 15 #define MAXSHARPS 5 -#define MAXWORDLEN 100 -#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1) +#ifndef MAXWORDLEN +#define MAXWORDLEN 100 +#endif + +#if defined __GNUC__ && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) # define H_DEPRECATED __attribute__((__deprecated__)) #elif defined(_MSC_VER) && (_MSC_VER >= 1300) # define H_DEPRECATED __declspec(deprecated) diff --git a/extensions/spellcheck/hunspell/src/patches/1322666 b/extensions/spellcheck/hunspell/src/patches/1322666 deleted file mode 100644 index 861c43f10f53..000000000000 --- a/extensions/spellcheck/hunspell/src/patches/1322666 +++ /dev/null @@ -1,24 +0,0 @@ -Bug 1322666 - Change MAXWORDLEN to 100 - -diff --git a/extensions/spellcheck/hunspell/src/hunspell.hxx b/extensions/spellcheck/hunspell/src/hunspell.hxx ---- a/extensions/spellcheck/hunspell/src/hunspell.hxx -+++ b/extensions/spellcheck/hunspell/src/hunspell.hxx -@@ -78,17 +78,17 @@ - #include "atypes.hxx" - #include - #include - - #define SPELL_XML "" - - #define MAXSUGGESTION 15 - #define MAXSHARPS 5 --#define MAXWORDLEN 176 -+#define MAXWORDLEN 100 - - #if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1) - # define H_DEPRECATED __attribute__((__deprecated__)) - #elif defined(_MSC_VER) && (_MSC_VER >= 1300) - # define H_DEPRECATED __declspec(deprecated) - #else - # define H_DEPRECATED - #endif diff --git a/extensions/spellcheck/hunspell/src/suggestmgr.cxx b/extensions/spellcheck/hunspell/src/suggestmgr.cxx index 6dff02f10237..b9983417cb20 100644 --- a/extensions/spellcheck/hunspell/src/suggestmgr.cxx +++ b/extensions/spellcheck/hunspell/src/suggestmgr.cxx @@ -1050,12 +1050,12 @@ void SuggestMgr::ngsuggest(std::vector& wlst, phonetable* ph = (pAMgr) ? pAMgr->get_phonetable() : NULL; std::string target; std::string candidate; + std::vector w_candidate; if (ph) { if (utf8) { - std::vector _w; - u8_u16(_w, word); - mkallcap_utf(_w, langnum); - u16_u8(candidate, _w); + u8_u16(w_candidate, word); + mkallcap_utf(w_candidate, langnum); + u16_u8(candidate, w_candidate); } else { candidate.assign(word); if (!nonbmp) @@ -1069,6 +1069,17 @@ void SuggestMgr::ngsuggest(std::vector& wlst, FLAG nongramsuggest = pAMgr ? pAMgr->get_nongramsuggest() : FLAG_NULL; FLAG onlyincompound = pAMgr ? pAMgr->get_onlyincompound() : FLAG_NULL; + std::vector w_word, w_target; + if (utf8) { + u8_u16(w_word, word); + u8_u16(w_target, target); + } + + std::vector w_entry; + std::string f; + std::vector w_f; + std::vector w_target2; + for (size_t i = 0; i < rHMgr.size(); ++i) { while (0 != (hp = rHMgr[i]->walk_hashtable(col, hp))) { if ((hp->astr) && (pAMgr) && @@ -1079,15 +1090,30 @@ void SuggestMgr::ngsuggest(std::vector& wlst, TESTAFF(hp->astr, onlyincompound, hp->alen))) continue; - sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + low) + - leftcommonsubstring(word, HENTRY_WORD(hp)); + if (utf8) { + w_entry.clear(); + u8_u16(w_entry, HENTRY_WORD(hp)); + sc = ngram(3, w_word, w_entry, NGRAM_LONGER_WORSE + low) + + leftcommonsubstring(w_word, w_entry); + } else { + sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + low) + + leftcommonsubstring(word, HENTRY_WORD(hp)); + } // check special pronounciation - std::string f; + f.clear(); if ((hp->var & H_OPT_PHON) && copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) { - int sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + low) + - +leftcommonsubstring(word, f.c_str()); + int sc2; + if (utf8) { + w_f.clear(); + u8_u16(w_f, f.c_str()); + sc2 = ngram(3, w_word, w_f, NGRAM_LONGER_WORSE + low) + + leftcommonsubstring(w_word, w_f); + } else { + sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + low) + + leftcommonsubstring(word, f.c_str()); + } if (sc2 > sc) sc = sc2; } @@ -1095,16 +1121,24 @@ void SuggestMgr::ngsuggest(std::vector& wlst, int scphon = -20000; if (ph && (sc > 2) && (abs(n - (int)hp->clen) <= 3)) { if (utf8) { - std::vector _w; - u8_u16(_w, HENTRY_WORD(hp)); - mkallcap_utf(_w, langnum); - u16_u8(candidate, _w); + w_candidate.clear(); + u8_u16(w_candidate, HENTRY_WORD(hp)); + mkallcap_utf(w_candidate, langnum); + u16_u8(candidate, w_candidate); } else { - candidate.assign(HENTRY_WORD(hp)); + candidate = HENTRY_WORD(hp); mkallcap(candidate, csconv); } std::string target2 = phonet(candidate, *ph); - scphon = 2 * ngram(3, target, target2, NGRAM_LONGER_WORSE); + w_target2.clear(); + if (utf8) { + u8_u16(w_target2, target2.c_str()); + scphon = 2 * ngram(3, w_target, w_target2, + NGRAM_LONGER_WORSE); + } else { + scphon = 2 * ngram(3, target, target2, + NGRAM_LONGER_WORSE); + } } if (sc > scores[lp]) { @@ -1134,22 +1168,21 @@ void SuggestMgr::ngsuggest(std::vector& wlst, // find minimum threshold for a passable suggestion // mangle original word three differnt ways // and score them to generate a minimum acceptable score + std::vector w_mw; int thresh = 0; for (int sp = 1; sp < 4; sp++) { if (utf8) { - u8_u16(u8, word); + w_mw = w_word; for (int k = sp; k < n; k += 4) { - u8[k].l = '*'; - u8[k].h = 0; + w_mw[k].l = '*'; + w_mw[k].h = 0; } - std::string mw; - u16_u8(mw, u8); - thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low); + thresh += ngram(n, w_word, w_mw, NGRAM_ANY_MISMATCH + low); } else { - std::string mw(word); + std::string mw = word; for (int k = sp; k < n; k += 4) mw[k] = '*'; - thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low); + thresh += ngram(n, word, mw, NGRAM_ANY_MISMATCH + low); } } thresh = thresh / 3; @@ -1177,11 +1210,12 @@ void SuggestMgr::ngsuggest(std::vector& wlst, return; } + std::vector w_glst_word; for (int i = 0; i < MAX_ROOTS; i++) { if (roots[i]) { struct hentry* rp = roots[i]; - std::string f; + f.clear(); const char *field = NULL; if ((rp->var & H_OPT_PHON) && copy_field(f, HENTRY_DATA(rp), MORPH_PHON)) field = f.c_str(); @@ -1190,8 +1224,17 @@ void SuggestMgr::ngsuggest(std::vector& wlst, nc, field); for (int k = 0; k < nw; k++) { - sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH + low) + - leftcommonsubstring(word, glst[k].word); + if (utf8) { + w_glst_word.clear(); + u8_u16(w_glst_word, glst[k].word); + sc = ngram(n, w_word, w_glst_word, + NGRAM_ANY_MISMATCH + low) + + leftcommonsubstring(w_word, w_glst_word); + } else { + sc = ngram(n, word, glst[k].word, + NGRAM_ANY_MISMATCH + low) + + leftcommonsubstring(word, glst[k].word); + } if (sc > thresh) { if (sc > gscore[lp]) { @@ -1245,16 +1288,17 @@ void SuggestMgr::ngsuggest(std::vector& wlst, fact = (10.0 - maxd) / 5.0; } + std::vector w_gl; for (int i = 0; i < MAX_GUESS; i++) { if (guess[i]) { // lowering guess[i] std::string gl; int len; if (utf8) { - std::vector _w; - len = u8_u16(_w, guess[i]); - mkallsmall_utf(_w, langnum); - u16_u8(gl, _w); + w_gl.clear(); + len = u8_u16(w_gl, guess[i]); + mkallsmall_utf(w_gl, langnum); + u16_u8(gl, w_gl); } else { gl.assign(guess[i]); if (!nonbmp) @@ -1271,14 +1315,29 @@ void SuggestMgr::ngsuggest(std::vector& wlst, } // using 2-gram instead of 3, and other weightening - re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) + - ngram(2, gl, word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED); + w_gl.clear(); + if (utf8) { + u8_u16(w_gl, gl); + re = ngram(2, w_word, w_gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) + + ngram(2, w_gl, w_word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED); + } else { + re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) + + ngram(2, gl, word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED); + } + int ngram_score, leftcommon_score; + if (utf8) { + ngram_score = ngram(4, w_word, w_gl, NGRAM_ANY_MISMATCH + low); + leftcommon_score = leftcommonsubstring(w_word, w_gl); + } else { + ngram_score = ngram(4, word, gl, NGRAM_ANY_MISMATCH + low); + leftcommon_score = leftcommonsubstring(word, gl.c_str()); + } gscore[i] = // length of longest common subsequent minus length difference 2 * _lcs - abs((int)(n - len)) + // weight length of the left common substring - leftcommonsubstring(word, gl.c_str()) + + leftcommon_score + // weight equal character positions (!nonbmp && commoncharacterpositions(word, gl.c_str(), &is_swap) ? 1 @@ -1286,7 +1345,7 @@ void SuggestMgr::ngsuggest(std::vector& wlst, // swap character (not neighboring) ((is_swap) ? 10 : 0) + // ngram - ngram(4, word, gl, NGRAM_ANY_MISMATCH + low) + + ngram_score + // weighted ngrams re + // different limit for dictionaries with PHONE rules @@ -1304,11 +1363,11 @@ void SuggestMgr::ngsuggest(std::vector& wlst, // lowering rootphon[i] std::string gl; int len; + w_gl.clear(); if (utf8) { - std::vector _w; - len = u8_u16(_w, rootsphon[i]); - mkallsmall_utf(_w, langnum); - u16_u8(gl, _w); + len = u8_u16(w_gl, rootsphon[i]); + mkallsmall_utf(w_gl, langnum); + u16_u8(gl, w_gl); } else { gl.assign(rootsphon[i]); if (!nonbmp) @@ -1316,10 +1375,15 @@ void SuggestMgr::ngsuggest(std::vector& wlst, len = strlen(rootsphon[i]); } + // weight length of the left common substring + int leftcommon_score; + if (utf8) + leftcommon_score = leftcommonsubstring(w_word, w_gl); + else + leftcommon_score = leftcommonsubstring(word, gl.c_str()); // heuristic weigthing of ngram scores scoresphon[i] += 2 * lcslen(word, gl) - abs((int)(n - len)) + - // weight length of the left common substring - leftcommonsubstring(word, gl.c_str()); + leftcommon_score; } } @@ -1724,10 +1788,10 @@ std::string SuggestMgr::suggest_gen(const std::vector& desc, const return result2; } -// generate an n-gram score comparing s1 and s2 +// generate an n-gram score comparing s1 and s2, UTF16 version int SuggestMgr::ngram(int n, - const std::string& s1, - const std::string& s2, + const std::vector& su1, + const std::vector& su2, int opt) { int nscore = 0; int ns; @@ -1735,68 +1799,44 @@ int SuggestMgr::ngram(int n, int l2; int test = 0; - if (utf8) { - std::vector su1; - std::vector su2; - l1 = u8_u16(su1, s1); - l2 = u8_u16(su2, s2); - if ((l2 <= 0) || (l1 == -1)) - return 0; - // lowering dictionary word - if (opt & NGRAM_LOWERING) - mkallsmall_utf(su2, langnum); - for (int j = 1; j <= n; j++) { - ns = 0; - for (int i = 0; i <= (l1 - j); i++) { - int k = 0; - for (int l = 0; l <= (l2 - j); l++) { - for (k = 0; k < j; k++) { - w_char& c1 = su1[i + k]; - w_char& c2 = su2[l + k]; - if ((c1.l != c2.l) || (c1.h != c2.h)) - break; - } - if (k == j) { - ns++; + l1 = su1.size(); + l2 = su2.size(); + if (l2 == 0) + return 0; + // lowering dictionary word + const std::vector* p_su2 = &su2; + std::vector su2_copy; + if (opt & NGRAM_LOWERING) { + su2_copy = su2; + mkallsmall_utf(su2_copy, langnum); + p_su2 = &su2_copy; + } + for (int j = 1; j <= n; j++) { + ns = 0; + for (int i = 0; i <= (l1 - j); i++) { + int k = 0; + for (int l = 0; l <= (l2 - j); l++) { + for (k = 0; k < j; k++) { + const w_char& c1 = su1[i + k]; + const w_char& c2 = (*p_su2)[l + k]; + if ((c1.l != c2.l) || (c1.h != c2.h)) break; - } } - if (k != j && opt & NGRAM_WEIGHTED) { - ns--; - test++; - if (i == 0 || i == l1 - j) - ns--; // side weight - } - } - nscore = nscore + ns; - if (ns < 2 && !(opt & NGRAM_WEIGHTED)) - break; - } - } else { - l2 = s2.size(); - if (l2 == 0) - return 0; - l1 = s1.size(); - std::string t(s2); - if (opt & NGRAM_LOWERING) - mkallsmall(t, csconv); - for (int j = 1; j <= n; j++) { - ns = 0; - for (int i = 0; i <= (l1 - j); i++) { - //t is haystack, s1[i..i+j) is needle - if (t.find(s1.c_str()+i, 0, j) != std::string::npos) { + if (k == j) { ns++; - } else if (opt & NGRAM_WEIGHTED) { - ns--; - test++; - if (i == 0 || i == l1 - j) - ns--; // side weight + break; } } - nscore = nscore + ns; - if (ns < 2 && !(opt & NGRAM_WEIGHTED)) - break; + if (k != j && opt & NGRAM_WEIGHTED) { + ns--; + test++; + if (i == 0 || i == l1 - j) + ns--; // side weight + } } + nscore = nscore + ns; + if (ns < 2 && !(opt & NGRAM_WEIGHTED)) + break; } ns = 0; @@ -1808,46 +1848,95 @@ int SuggestMgr::ngram(int n, return ns; } -// length of the left common substring of s1 and (decapitalised) s2 -int SuggestMgr::leftcommonsubstring(const char* s1, const char* s2) { - if (utf8) { - std::vector su1; - std::vector su2; - int l1 = u8_u16(su1, s1); - int l2 = u8_u16(su2, s2); - // decapitalize dictionary word - if (complexprefixes) { - if (su1[l1 - 1] == su2[l2 - 1]) - return 1; - } else { - unsigned short idx = su2.empty() ? 0 : (su2[0].h << 8) + su2[0].l; - unsigned short otheridx = su1.empty() ? 0 : (su1[0].h << 8) + su1[0].l; - if (otheridx != idx && (otheridx != unicodetolower(idx, langnum))) - return 0; - int i; - for (i = 1; (i < l1) && (i < l2) && (su1[i].l == su2[i].l) && - (su1[i].h == su2[i].h); - i++) - ; - return i; +// generate an n-gram score comparing s1 and s2, non-UTF16 version +int SuggestMgr::ngram(int n, + const std::string& s1, + const std::string& s2, + int opt) { + int nscore = 0; + int ns; + int l1; + int l2; + int test = 0; + + l2 = s2.size(); + if (l2 == 0) + return 0; + l1 = s1.size(); + std::string t(s2); + if (opt & NGRAM_LOWERING) + mkallsmall(t, csconv); + for (int j = 1; j <= n; j++) { + ns = 0; + for (int i = 0; i <= (l1 - j); i++) { + //t is haystack, s1[i..i+j) is needle + if (t.find(s1.c_str()+i, 0, j) != std::string::npos) { + ns++; + } else if (opt & NGRAM_WEIGHTED) { + ns--; + test++; + if (i == 0 || i == l1 - j) + ns--; // side weight + } } + nscore = nscore + ns; + if (ns < 2 && !(opt & NGRAM_WEIGHTED)) + break; + } + + ns = 0; + if (opt & NGRAM_LONGER_WORSE) + ns = (l2 - l1) - 2; + if (opt & NGRAM_ANY_MISMATCH) + ns = abs(l2 - l1) - 2; + ns = (nscore - ((ns > 0) ? ns : 0)); + return ns; +} + +// length of the left common substring of s1 and (decapitalised) s2, UTF version +int SuggestMgr::leftcommonsubstring( + const std::vector& su1, + const std::vector& su2) { + int l1 = su1.size(); + int l2 = su2.size(); + // decapitalize dictionary word + if (complexprefixes) { + if (su1[l1 - 1] == su2[l2 - 1]) + return 1; } else { - if (complexprefixes) { - int l1 = strlen(s1); - int l2 = strlen(s2); - if (l1 <= l2 && s2[l1 - 1] == s2[l2 - 1]) - return 1; - } else if (csconv) { - const char* olds = s1; - // decapitalise dictionary word - if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower)) - return 0; - do { - s1++; - s2++; - } while ((*s1 == *s2) && (*s1 != '\0')); - return (int)(s1 - olds); - } + unsigned short idx = su2.empty() ? 0 : (su2[0].h << 8) + su2[0].l; + unsigned short otheridx = su1.empty() ? 0 : (su1[0].h << 8) + su1[0].l; + if (otheridx != idx && (otheridx != unicodetolower(idx, langnum))) + return 0; + int i; + for (i = 1; (i < l1) && (i < l2) && (su1[i].l == su2[i].l) && + (su1[i].h == su2[i].h); + i++) + ; + return i; + } + return 0; +} + +// length of the left common substring of s1 and (decapitalised) s2, non-UTF +int SuggestMgr::leftcommonsubstring( + const char* s1, + const char* s2) { + if (complexprefixes) { + int l1 = strlen(s1); + int l2 = strlen(s2); + if (l1 <= l2 && s2[l1 - 1] == s2[l2 - 1]) + return 1; + } else if (csconv) { + const char* olds = s1; + // decapitalise dictionary word + if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower)) + return 0; + do { + s1++; + s2++; + } while ((*s1 == *s2) && (*s1 != '\0')); + return (int)(s1 - olds); } return 0; } diff --git a/extensions/spellcheck/hunspell/src/suggestmgr.hxx b/extensions/spellcheck/hunspell/src/suggestmgr.hxx index ea6ad79e6f59..6ba9dc8e3703 100644 --- a/extensions/spellcheck/hunspell/src/suggestmgr.hxx +++ b/extensions/spellcheck/hunspell/src/suggestmgr.hxx @@ -173,8 +173,12 @@ class SuggestMgr { const std::vector&, int*, clock_t*); + int ngram(int n, const std::vector& su1, + const std::vector& su2, int opt); int ngram(int n, const std::string& s1, const std::string& s2, int opt); int mystrlen(const char* word); + int leftcommonsubstring(const std::vector& su1, + const std::vector& su2); int leftcommonsubstring(const char* s1, const char* s2); int commoncharacterpositions(const char* s1, const char* s2, int* is_swap); void bubblesort(char** rwd, char** rwd2, int* rsc, int n);