Bug 1326277 - Upgrade Hunspell to version 1.6.0. r=masayuki

--HG-- extra : rebase_source : dfd3b592a52a708fe62c49c826bfaedea801769d
2017-01-05 23:42:31 -05:00 · 2017-01-05 23:42:31 -05:00 · 7afab1ba7f
--- a/extensions/spellcheck/hunspell/src/affixmgr.cxx
+++ b/extensions/spellcheck/hunspell/src/affixmgr.cxx
@ -2048,7 +2048,7 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
            wordnum = oldwordnum2;

            // perhaps second word is a compound word (recursive call)
-            if (wordnum < maxwordnum) {
+            if (wordnum + 2 < maxwordnum) {
              rv = compound_check(st.substr(i), wordnum + 1,
                                  numsyllable, maxwordnum, wnum + 1, words, rwords, 0,
                                  is_sug, info);
@ -2621,7 +2621,7 @@ int AffixMgr::compound_check_morph(const char* word,
        wordnum = oldwordnum2;

        // perhaps second word is a compound word (recursive call)
-        if ((wordnum < maxwordnum) && (ok == 0)) {
+        if ((wordnum + 2 < maxwordnum) && (ok == 0)) {
          compound_check_morph((word + i), strlen(word + i), wordnum + 1,
                               numsyllable, maxwordnum, wnum + 1, words, rwords, 0,
                               result, &presult);
--- a/extensions/spellcheck/hunspell/src/hunspell.hxx
+++ b/extensions/spellcheck/hunspell/src/hunspell.hxx
@ -83,9 +83,12 @@

 #define MAXSUGGESTION 15
 #define MAXSHARPS 5
-#define MAXWORDLEN 100

-#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)
+#ifndef MAXWORDLEN
+#define MAXWORDLEN 100
+#endif
+
+#if defined __GNUC__ && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
 #  define H_DEPRECATED __attribute__((__deprecated__))
 #elif defined(_MSC_VER) && (_MSC_VER >= 1300)
 #  define H_DEPRECATED __declspec(deprecated)
--- a/extensions/spellcheck/hunspell/src/patches/1322666
+++ b/extensions/spellcheck/hunspell/src/patches/1322666
@ -1,24 +0,0 @@
-Bug 1322666 - Change MAXWORDLEN to 100
-
-diff --git a/extensions/spellcheck/hunspell/src/hunspell.hxx b/extensions/spellcheck/hunspell/src/hunspell.hxx
--- a/extensions/spellcheck/hunspell/src/hunspell.hxx
-+++ b/extensions/spellcheck/hunspell/src/hunspell.hxx
-@@ -78,17 +78,17 @@
- #include "atypes.hxx"
- #include <string>
- #include <vector>
- 
- #define SPELL_XML "<?xml?>"
- 
- #define MAXSUGGESTION 15
- #define MAXSHARPS 5
-#define MAXWORDLEN 176
-+#define MAXWORDLEN 100
- 
- #if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)
- #  define H_DEPRECATED __attribute__((__deprecated__))
- #elif defined(_MSC_VER) && (_MSC_VER >= 1300)
- #  define H_DEPRECATED __declspec(deprecated)
- #else
- #  define H_DEPRECATED
- #endif
--- a/extensions/spellcheck/hunspell/src/suggestmgr.cxx
+++ b/extensions/spellcheck/hunspell/src/suggestmgr.cxx
@ -1050,12 +1050,12 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
  phonetable* ph = (pAMgr) ? pAMgr->get_phonetable() : NULL;
  std::string target;
  std::string candidate;
+  std::vector<w_char> w_candidate;
  if (ph) {
    if (utf8) {
-      std::vector<w_char> _w;
-      u8_u16(_w, word);
-      mkallcap_utf(_w, langnum);
-      u16_u8(candidate, _w);
+      u8_u16(w_candidate, word);
+      mkallcap_utf(w_candidate, langnum);
+      u16_u8(candidate, w_candidate);
    } else {
      candidate.assign(word);
      if (!nonbmp)
@ -1069,6 +1069,17 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
  FLAG nongramsuggest = pAMgr ? pAMgr->get_nongramsuggest() : FLAG_NULL;
  FLAG onlyincompound = pAMgr ? pAMgr->get_onlyincompound() : FLAG_NULL;

+  std::vector<w_char> w_word, w_target;
+  if (utf8) {
+    u8_u16(w_word, word);
+    u8_u16(w_target, target);
+  }
+  
+  std::vector<w_char> w_entry;
+  std::string f;
+  std::vector<w_char> w_f;
+  std::vector<w_char> w_target2;
+  
  for (size_t i = 0; i < rHMgr.size(); ++i) {
    while (0 != (hp = rHMgr[i]->walk_hashtable(col, hp))) {
      if ((hp->astr) && (pAMgr) &&
@ -1079,15 +1090,30 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
           TESTAFF(hp->astr, onlyincompound, hp->alen)))
        continue;

-      sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + low) +
-           leftcommonsubstring(word, HENTRY_WORD(hp));
+      if (utf8) {
+        w_entry.clear();
+        u8_u16(w_entry, HENTRY_WORD(hp));
+        sc = ngram(3, w_word, w_entry, NGRAM_LONGER_WORSE + low) +
+             leftcommonsubstring(w_word, w_entry);
+      } else {
+        sc = ngram(3, word, HENTRY_WORD(hp), NGRAM_LONGER_WORSE + low) +
+             leftcommonsubstring(word, HENTRY_WORD(hp));
+      }

      // check special pronounciation
-      std::string f;
+      f.clear();
      if ((hp->var & H_OPT_PHON) &&
          copy_field(f, HENTRY_DATA(hp), MORPH_PHON)) {
-        int sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + low) +
-                  +leftcommonsubstring(word, f.c_str());
+        int sc2;
+        if (utf8) {
+          w_f.clear();
+          u8_u16(w_f, f.c_str());
+          sc2 = ngram(3, w_word, w_f, NGRAM_LONGER_WORSE + low) +
+                leftcommonsubstring(w_word, w_f);
+        } else {
+          sc2 = ngram(3, word, f, NGRAM_LONGER_WORSE + low) +
+                leftcommonsubstring(word, f.c_str());
+        }
        if (sc2 > sc)
          sc = sc2;
      }
@ -1095,16 +1121,24 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
      int scphon = -20000;
      if (ph && (sc > 2) && (abs(n - (int)hp->clen) <= 3)) {
        if (utf8) {
-          std::vector<w_char> _w;
-          u8_u16(_w, HENTRY_WORD(hp));
-          mkallcap_utf(_w, langnum);
-          u16_u8(candidate, _w);
+          w_candidate.clear();
+          u8_u16(w_candidate, HENTRY_WORD(hp));
+          mkallcap_utf(w_candidate, langnum);
+          u16_u8(candidate, w_candidate);
        } else {
-          candidate.assign(HENTRY_WORD(hp));
+          candidate = HENTRY_WORD(hp);
          mkallcap(candidate, csconv);
        }
        std::string target2 = phonet(candidate, *ph);
-        scphon = 2 * ngram(3, target, target2, NGRAM_LONGER_WORSE);
+        w_target2.clear();
+        if (utf8) {
+          u8_u16(w_target2, target2.c_str());
+          scphon = 2 * ngram(3, w_target, w_target2,
+                             NGRAM_LONGER_WORSE);
+        } else {
+          scphon = 2 * ngram(3, target, target2,
+                             NGRAM_LONGER_WORSE);
+        }
      }

      if (sc > scores[lp]) {
@ -1134,22 +1168,21 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
  // find minimum threshold for a passable suggestion
  // mangle original word three differnt ways
  // and score them to generate a minimum acceptable score
+  std::vector<w_char> w_mw;
  int thresh = 0;
  for (int sp = 1; sp < 4; sp++) {
    if (utf8) {
-      u8_u16(u8, word);
+      w_mw = w_word;
      for (int k = sp; k < n; k += 4) {
-        u8[k].l = '*';
-        u8[k].h = 0;
+        w_mw[k].l = '*';
+        w_mw[k].h = 0;
      }
-      std::string mw;
-      u16_u8(mw, u8);
-      thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low);
+      thresh += ngram(n, w_word, w_mw, NGRAM_ANY_MISMATCH + low);
    } else {
-      std::string mw(word);
+      std::string mw = word;
      for (int k = sp; k < n; k += 4)
        mw[k] = '*';
-      thresh = thresh + ngram(n, word, mw, NGRAM_ANY_MISMATCH + low);
+      thresh += ngram(n, word, mw, NGRAM_ANY_MISMATCH + low);
    }
  }
  thresh = thresh / 3;
@ -1177,11 +1210,12 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
    return;
  }

+  std::vector<w_char> w_glst_word;
  for (int i = 0; i < MAX_ROOTS; i++) {
    if (roots[i]) {
      struct hentry* rp = roots[i];

-      std::string f;
+      f.clear();
      const char *field = NULL;
      if ((rp->var & H_OPT_PHON) && copy_field(f, HENTRY_DATA(rp), MORPH_PHON))
          field = f.c_str();
@ -1190,8 +1224,17 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
          nc, field);

      for (int k = 0; k < nw; k++) {
-        sc = ngram(n, word, glst[k].word, NGRAM_ANY_MISMATCH + low) +
-             leftcommonsubstring(word, glst[k].word);
+        if (utf8) {
+          w_glst_word.clear();
+          u8_u16(w_glst_word, glst[k].word);
+          sc = ngram(n, w_word, w_glst_word,
+                     NGRAM_ANY_MISMATCH + low) +
+               leftcommonsubstring(w_word, w_glst_word);
+        } else {
+          sc = ngram(n, word, glst[k].word,
+                     NGRAM_ANY_MISMATCH + low) +
+               leftcommonsubstring(word, glst[k].word);
+        }

        if (sc > thresh) {
          if (sc > gscore[lp]) {
@ -1245,16 +1288,17 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
      fact = (10.0 - maxd) / 5.0;
  }

+  std::vector<w_char> w_gl;
  for (int i = 0; i < MAX_GUESS; i++) {
    if (guess[i]) {
      // lowering guess[i]
      std::string gl;
      int len;
      if (utf8) {
-        std::vector<w_char> _w;
-        len = u8_u16(_w, guess[i]);
-        mkallsmall_utf(_w, langnum);
-        u16_u8(gl, _w);
+        w_gl.clear();
+        len = u8_u16(w_gl, guess[i]);
+        mkallsmall_utf(w_gl, langnum);
+        u16_u8(gl, w_gl);
      } else {
        gl.assign(guess[i]);
        if (!nonbmp)
@ -1271,14 +1315,29 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
      }
      // using 2-gram instead of 3, and other weightening

-      re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) +
-           ngram(2, gl, word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED);
+      w_gl.clear();
+      if (utf8) {
+        u8_u16(w_gl, gl);
+        re = ngram(2, w_word, w_gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) +
+             ngram(2, w_gl, w_word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED);
+      } else {
+        re = ngram(2, word, gl, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED) +
+             ngram(2, gl, word, NGRAM_ANY_MISMATCH + low + NGRAM_WEIGHTED);
+      }

+      int ngram_score, leftcommon_score;
+      if (utf8) {
+        ngram_score = ngram(4, w_word, w_gl, NGRAM_ANY_MISMATCH + low);
+        leftcommon_score = leftcommonsubstring(w_word, w_gl);
+      } else {
+        ngram_score = ngram(4, word, gl, NGRAM_ANY_MISMATCH + low);
+        leftcommon_score = leftcommonsubstring(word, gl.c_str());
+      }
      gscore[i] =
          // length of longest common subsequent minus length difference
          2 * _lcs - abs((int)(n - len)) +
          // weight length of the left common substring
-          leftcommonsubstring(word, gl.c_str()) +
+          leftcommon_score +
          // weight equal character positions
          (!nonbmp && commoncharacterpositions(word, gl.c_str(), &is_swap)
               ? 1
@ -1286,7 +1345,7 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
          // swap character (not neighboring)
          ((is_swap) ? 10 : 0) +
          // ngram
-          ngram(4, word, gl, NGRAM_ANY_MISMATCH + low) +
+          ngram_score +
          // weighted ngrams
          re +
          // different limit for dictionaries with PHONE rules
@ -1304,11 +1363,11 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
        // lowering rootphon[i]
        std::string gl;
        int len;
+        w_gl.clear();
        if (utf8) {
-          std::vector<w_char> _w;
-          len = u8_u16(_w, rootsphon[i]);
-          mkallsmall_utf(_w, langnum);
-          u16_u8(gl, _w);
+          len = u8_u16(w_gl, rootsphon[i]);
+          mkallsmall_utf(w_gl, langnum);
+          u16_u8(gl, w_gl);
        } else {
          gl.assign(rootsphon[i]);
          if (!nonbmp)
@ -1316,10 +1375,15 @@ void SuggestMgr::ngsuggest(std::vector<std::string>& wlst,
          len = strlen(rootsphon[i]);
        }

+        // weight length of the left common substring
+        int leftcommon_score;
+        if (utf8)
+          leftcommon_score = leftcommonsubstring(w_word, w_gl);
+        else
+          leftcommon_score = leftcommonsubstring(word, gl.c_str());
        // heuristic weigthing of ngram scores
        scoresphon[i] += 2 * lcslen(word, gl) - abs((int)(n - len)) +
-                         // weight length of the left common substring
-                         leftcommonsubstring(word, gl.c_str());
+                         leftcommon_score;
      }
    }

@ -1724,10 +1788,10 @@ std::string SuggestMgr::suggest_gen(const std::vector<std::string>& desc, const
  return result2;
 }

-// generate an n-gram score comparing s1 and s2
+// generate an n-gram score comparing su1 and su2, UTF16 version
 int SuggestMgr::ngram(int n,
-                      const std::string& s1,
-                      const std::string& s2,
+                      const std::vector<w_char>& su1,
+                      const std::vector<w_char>& su2,
                      int opt) {
  int nscore = 0;
  int ns;
@ -1735,68 +1799,44 @@ int SuggestMgr::ngram(int n,
  int l2;
  int test = 0;

-  if (utf8) {
-    std::vector<w_char> su1;
-    std::vector<w_char> su2;
-    l1 = u8_u16(su1, s1);
-    l2 = u8_u16(su2, s2);
-    if ((l2 <= 0) || (l1 == -1))
-      return 0;
-    // lowering dictionary word
-    if (opt & NGRAM_LOWERING)
-      mkallsmall_utf(su2, langnum);
-    for (int j = 1; j <= n; j++) {
-      ns = 0;
-      for (int i = 0; i <= (l1 - j); i++) {
-        int k = 0;
-        for (int l = 0; l <= (l2 - j); l++) {
-          for (k = 0; k < j; k++) {
-            w_char& c1 = su1[i + k];
-            w_char& c2 = su2[l + k];
-            if ((c1.l != c2.l) || (c1.h != c2.h))
-              break;
-          }
-          if (k == j) {
-            ns++;
+  l1 = su1.size();
+  l2 = su2.size();
+  if (l2 == 0)
+    return 0;
+  // lowering dictionary word
+  const std::vector<w_char>* p_su2 = &su2;
+  std::vector<w_char> su2_copy;
+  if (opt & NGRAM_LOWERING) {
+    su2_copy = su2;
+    mkallsmall_utf(su2_copy, langnum);
+    p_su2 = &su2_copy;
+  }
+  for (int j = 1; j <= n; j++) {
+    ns = 0;
+    for (int i = 0; i <= (l1 - j); i++) {
+      int k = 0;
+      for (int l = 0; l <= (l2 - j); l++) {
+        for (k = 0; k < j; k++) {
+          const w_char& c1 = su1[i + k];
+          const w_char& c2 = (*p_su2)[l + k];
+          if ((c1.l != c2.l) || (c1.h != c2.h))
            break;
-          }
        }
-        if (k != j && opt & NGRAM_WEIGHTED) {
-          ns--;
-          test++;
-          if (i == 0 || i == l1 - j)
-            ns--;  // side weight
-        }
-      }
-      nscore = nscore + ns;
-      if (ns < 2 && !(opt & NGRAM_WEIGHTED))
-        break;
-    }
-  } else {
-    l2 = s2.size();
-    if (l2 == 0)
-      return 0;
-    l1 = s1.size();
-    std::string t(s2);
-    if (opt & NGRAM_LOWERING)
-      mkallsmall(t, csconv);
-    for (int j = 1; j <= n; j++) {
-      ns = 0;
-      for (int i = 0; i <= (l1 - j); i++) {
-        //t is haystack, s1[i..i+j) is needle
-        if (t.find(s1.c_str()+i, 0, j) != std::string::npos) {
+        if (k == j) {
          ns++;
-        } else if (opt & NGRAM_WEIGHTED) {
-          ns--;
-          test++;
-          if (i == 0 || i == l1 - j)
-            ns--;  // side weight
+          break;
        }
      }
-      nscore = nscore + ns;
-      if (ns < 2 && !(opt & NGRAM_WEIGHTED))
-        break;
+      if (k != j && opt & NGRAM_WEIGHTED) {
+        ns--;
+        test++;
+        if (i == 0 || i == l1 - j)
+          ns--;  // side weight
+      }
    }
+    nscore = nscore + ns;
+    if (ns < 2 && !(opt & NGRAM_WEIGHTED))
+      break;
  }

  ns = 0;
@ -1808,46 +1848,95 @@ int SuggestMgr::ngram(int n,
  return ns;
 }

-// length of the left common substring of s1 and (decapitalised) s2
-int SuggestMgr::leftcommonsubstring(const char* s1, const char* s2) {
-  if (utf8) {
-    std::vector<w_char> su1;
-    std::vector<w_char> su2;
-    int l1 = u8_u16(su1, s1);
-    int l2 = u8_u16(su2, s2);
-    // decapitalize dictionary word
-    if (complexprefixes) {
-      if (su1[l1 - 1] == su2[l2 - 1])
-        return 1;
-    } else {
-      unsigned short idx = su2.empty() ? 0 : (su2[0].h << 8) + su2[0].l;
-      unsigned short otheridx = su1.empty() ? 0 : (su1[0].h << 8) + su1[0].l;
-      if (otheridx != idx && (otheridx != unicodetolower(idx, langnum)))
-        return 0;
-      int i;
-      for (i = 1; (i < l1) && (i < l2) && (su1[i].l == su2[i].l) &&
-                  (su1[i].h == su2[i].h);
-           i++)
-        ;
-      return i;
+// generate an n-gram score comparing s1 and s2, non-UTF16 version
+int SuggestMgr::ngram(int n,
+                      const std::string& s1,
+                      const std::string& s2,
+                      int opt) {
+  int nscore = 0;
+  int ns;
+  int l1;
+  int l2;
+  int test = 0;
+
+  l2 = s2.size();
+  if (l2 == 0)
+    return 0;
+  l1 = s1.size();
+  std::string t(s2);
+  if (opt & NGRAM_LOWERING)
+    mkallsmall(t, csconv);
+  for (int j = 1; j <= n; j++) {
+    ns = 0;
+    for (int i = 0; i <= (l1 - j); i++) {
+      //t is haystack, s1[i..i+j) is needle
+      if (t.find(s1.c_str()+i, 0, j) != std::string::npos) {
+        ns++;
+      } else if (opt & NGRAM_WEIGHTED) {
+        ns--;
+        test++;
+        if (i == 0 || i == l1 - j)
+          ns--;  // side weight
+      }
    }
+    nscore = nscore + ns;
+    if (ns < 2 && !(opt & NGRAM_WEIGHTED))
+      break;
+  }
+
+  ns = 0;
+  if (opt & NGRAM_LONGER_WORSE)
+    ns = (l2 - l1) - 2;
+  if (opt & NGRAM_ANY_MISMATCH)
+    ns = abs(l2 - l1) - 2;
+  ns = (nscore - ((ns > 0) ? ns : 0));
+  return ns;
+}
+
+// length of the left common substring of su1 and (decapitalised) su2, UTF version
+int SuggestMgr::leftcommonsubstring(
+    const std::vector<w_char>& su1,
+    const std::vector<w_char>& su2) {
+  int l1 = su1.size();
+  int l2 = su2.size();
+  // complexprefixes: compare only the last characters (skipped if either is empty)
+  if (complexprefixes) {
+    if (l1 > 0 && l2 > 0 && su1[l1 - 1] == su2[l2 - 1])
+      return 1;
  } else {
-    if (complexprefixes) {
-      int l1 = strlen(s1);
-      int l2 = strlen(s2);
-      if (l1 <= l2 && s2[l1 - 1] == s2[l2 - 1])
-        return 1;
-    } else if (csconv) {
-      const char* olds = s1;
-      // decapitalise dictionary word
-      if ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower))
-        return 0;
-      do {
-        s1++;
-        s2++;
-      } while ((*s1 == *s2) && (*s1 != '\0'));
-      return (int)(s1 - olds);
-    }
+    unsigned short idx = su2.empty() ? 0 : (su2[0].h << 8) + su2[0].l;
+    unsigned short otheridx = su1.empty() ? 0 : (su1[0].h << 8) + su1[0].l;
+    if (otheridx != idx && (otheridx != unicodetolower(idx, langnum)))
+      return 0;
+    int i;  // first characters agree (exactly or after lowering su2's); scan the rest
+    for (i = 1; (i < l1) && (i < l2) && (su1[i].l == su2[i].l) &&
+                (su1[i].h == su2[i].h);
+         i++)
+      ;
+    return i;
+  }
+  return 0;
 }
+
+// length of the left common substring of s1 and (decapitalised) s2, non-UTF
+int SuggestMgr::leftcommonsubstring(
+    const char* s1,
+    const char* s2) {
+  if (complexprefixes) {
+    int l1 = strlen(s1);
+    int l2 = strlen(s2);
+    if (l1 > 0 && l1 <= l2 && s2[l1 - 1] == s2[l2 - 1])
+      return 1;  // NOTE(review): test above reads s2 twice (UTF overload uses su1/su2) - confirm
+  } else if (csconv) {
+    const char* olds = s1;
+    // decapitalise dictionary word; an empty s1 has no common prefix
+    if ((*s1 == '\0') || ((*s1 != *s2) && (*s1 != csconv[((unsigned char)*s2)].clower)))
+      return 0;
+    do {
+      s1++;
+      s2++;
+    } while ((*s1 == *s2) && (*s1 != '\0'));
+    return (int)(s1 - olds);
  }
  return 0;
 }
--- a/extensions/spellcheck/hunspell/src/suggestmgr.hxx
+++ b/extensions/spellcheck/hunspell/src/suggestmgr.hxx
@ -173,8 +173,12 @@ class SuggestMgr {
                  const std::vector<mapentry>&,
                  int*,
                  clock_t*);
+  int ngram(int n, const std::vector<w_char>& su1,
+            const std::vector<w_char>& su2, int opt);
  int ngram(int n, const std::string& s1, const std::string& s2, int opt);
  int mystrlen(const char* word);
+  int leftcommonsubstring(const std::vector<w_char>& su1,
+                          const std::vector<w_char>& su2);
  int leftcommonsubstring(const char* s1, const char* s2);
  int commoncharacterpositions(const char* s1, const char* s2, int* is_swap);
  void bubblesort(char** rwd, char** rwd2, int* rsc, int n);