gecko-dev/browser/extensions/formautofill/FormAutofillNameUtils.jsm

/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

"use strict";

var EXPORTED_SYMBOLS = ["FormAutofillNameUtils"];

// FormAutofillNameUtils is initially translated from
// https://cs.chromium.org/chromium/src/components/autofill/core/browser/autofill_data_util.cc?rcl=b861deff77abecff11ae6a9f6946e9cc844b9817
var FormAutofillNameUtils = {
  NAME_PREFIXES: [
    "1lt",
    "1st",
    "2lt",
    "2nd",
    "3rd",
    "admiral",
    "capt",
    "captain",
    "col",
    "cpt",
    "dr",
    "gen",
    "general",
    "lcdr",
    "lt",
    "ltc",
    "ltg",
    "ltjg",
    "maj",
    "major",
    "mg",
    "mr",
    "mrs",
    "ms",
    "pastor",
    "prof",
    "rep",
    "reverend",
    "rev",
    "sen",
    "st",
  ],

  NAME_SUFFIXES: [
    "b.a",
    "ba",
    "d.d.s",
    "dds",
    "i",
    "ii",
    "iii",
    "iv",
    "ix",
    "jr",
    "m.a",
    "m.d",
    "ma",
    "md",
    "ms",
    "ph.d",
    "phd",
    "sr",
    "v",
    "vi",
    "vii",
    "viii",
    "x",
  ],

  FAMILY_NAME_PREFIXES: [
    "d'",
    "de",
    "del",
    "der",
    "di",
    "la",
    "le",
    "mc",
    "san",
    "st",
    "ter",
    "van",
    "von",
  ],

  // The common and non-ambiguous CJK surnames (last names) that have more than
  // one character.
  COMMON_CJK_MULTI_CHAR_SURNAMES: [
    // Korean, taken from the list of surnames:
    // https://ko.wikipedia.org/wiki/%ED%95%9C%EA%B5%AD%EC%9D%98_%EC%84%B1%EC%94%A8_%EB%AA%A9%EB%A1%9D
    "남궁",
    "사공",
    "서문",
    "선우",
    "제갈",
    "황보",
    "독고",
    "망절",

    // Chinese, taken from the top 10 Chinese 2-character surnames:
    // https://zh.wikipedia.org/wiki/%E8%A4%87%E5%A7%93#.E5.B8.B8.E8.A6.8B.E7.9A.84.E8.A4.87.E5.A7.93
    // Simplified Chinese (mostly mainland China)
    "欧阳",
    "令狐",
    "皇甫",
    "上官",
    "司徒",
    "诸葛",
    "司马",
    "宇文",
    "呼延",
    "端木",
    // Traditional Chinese (mostly Taiwan)
    "張簡",
    "歐陽",
    "諸葛",
    "申屠",
    "尉遲",
    "司馬",
    "軒轅",
    "夏侯",
  ],

  // All Korean surnames that have more than one character, even the
  // rare/ambiguous ones.
  KOREAN_MULTI_CHAR_SURNAMES: [
    "강전",
    "남궁",
    "독고",
    "동방",
    "망절",
    "사공",
    "서문",
    "선우",
    "소봉",
    "어금",
    "장곡",
    "제갈",
    "황목",
    "황보",
  ],

  // The whitespace definition based on
  // https://cs.chromium.org/chromium/src/base/strings/string_util_constants.cc?l=9&rcl=b861deff77abecff11ae6a9f6946e9cc844b9817
  WHITESPACE: [
    "\u0009", // CHARACTER TABULATION
    "\u000A", // LINE FEED (LF)
    "\u000B", // LINE TABULATION
    "\u000C", // FORM FEED (FF)
    "\u000D", // CARRIAGE RETURN (CR)
    "\u0020", // SPACE
    "\u0085", // NEXT LINE (NEL)
    "\u00A0", // NO-BREAK SPACE
    "\u1680", // OGHAM SPACE MARK
    "\u2000", // EN QUAD
    "\u2001", // EM QUAD
    "\u2002", // EN SPACE
    "\u2003", // EM SPACE
    "\u2004", // THREE-PER-EM SPACE
    "\u2005", // FOUR-PER-EM SPACE
    "\u2006", // SIX-PER-EM SPACE
    "\u2007", // FIGURE SPACE
    "\u2008", // PUNCTUATION SPACE
    "\u2009", // THIN SPACE
    "\u200A", // HAIR SPACE
    "\u2028", // LINE SEPARATOR
    "\u2029", // PARAGRAPH SEPARATOR
    "\u202F", // NARROW NO-BREAK SPACE
    "\u205F", // MEDIUM MATHEMATICAL SPACE
    "\u3000", // IDEOGRAPHIC SPACE
  ],

  // The middle dot is used as a separator for foreign names in Japanese.
  MIDDLE_DOT: [
    "\u30FB", // KATAKANA MIDDLE DOT
    "\u00B7", // A (common?) typo for "KATAKANA MIDDLE DOT"
  ],

  // The Unicode range is based on Wiki:
  // https://en.wikipedia.org/wiki/CJK_Unified_Ideographs
  // https://en.wikipedia.org/wiki/Hangul
  // https://en.wikipedia.org/wiki/Japanese_writing_system
  CJK_RANGE: [
    "\u1100-\u11FF", // Hangul Jamo
    "\u3040-\u309F", // Hiragana
    "\u30A0-\u30FF", // Katakana
    "\u3105-\u312C", // Bopomofo
    "\u3130-\u318F", // Hangul Compatibility Jamo
    "\u31F0-\u31FF", // Katakana Phonetic Extensions
    "\u3200-\u32FF", // Enclosed CJK Letters and Months
    "\u3400-\u4DBF", // CJK unified ideographs Extension A
    "\u4E00-\u9FFF", // CJK Unified Ideographs
    "\uA960-\uA97F", // Hangul Jamo Extended-A
    "\uAC00-\uD7AF", // Hangul Syllables
    "\uD7B0-\uD7FF", // Hangul Jamo Extended-B
    "\uFF00-\uFFEF", // Halfwidth and Fullwidth Forms
  ],

  HANGUL_RANGE: [
    "\u1100-\u11FF", // Hangul Jamo
    "\u3130-\u318F", // Hangul Compatibility Jamo
    "\uA960-\uA97F", // Hangul Jamo Extended-A
    "\uAC00-\uD7AF", // Hangul Syllables
    "\uD7B0-\uD7FF", // Hangul Jamo Extended-B
  ],

  _dataLoaded: false,

  // Returns true if |set| contains |token|, modulo a final period.
  _containsString(set, token) {
    let target = token.replace(/\.$/, "").toLowerCase();
    return set.includes(target);
  },

  // Removes common name prefixes from |name_tokens|.
  _stripPrefixes(nameTokens) {
    for (let i in nameTokens) {
      if (!this._containsString(this.NAME_PREFIXES, nameTokens[i])) {
        return nameTokens.slice(i);
      }
    }
    return [];
  },

  // Removes common name suffixes from |name_tokens|.
  _stripSuffixes(nameTokens) {
    for (let i = nameTokens.length - 1; i >= 0; i--) {
      if (!this._containsString(this.NAME_SUFFIXES, nameTokens[i])) {
        return nameTokens.slice(0, i + 1);
      }
    }
    return [];
  },

  _isCJKName(name) {
    // The name is considered to be a CJK name if it is only CJK characters,
    // spaces, and "middle dot" separators, with at least one CJK character, and
    // no more than 2 words.
    //
    // Chinese and Japanese names are usually spelled out using the Han
    // characters (logographs), which constitute the "CJK Unified Ideographs"
    // block in Unicode, also referred to as Unihan. Korean names are usually
    // spelled out in the Korean alphabet (Hangul), although they do have a Han
    // equivalent as well.

    if (!name) {
      return false;
    }

    let previousWasCJK = false;
    let wordCount = 0;

    for (let c of name) {
      let isMiddleDot = this.MIDDLE_DOT.includes(c);
      let isCJK = !isMiddleDot && this.reCJK.test(c);
      if (!isCJK && !isMiddleDot && !this.WHITESPACE.includes(c)) {
        return false;
      }
      if (isCJK && !previousWasCJK) {
        wordCount++;
      }
      previousWasCJK = isCJK;
    }

    return wordCount > 0 && wordCount < 3;
  },

  // Tries to split a Chinese, Japanese, or Korean name into its given name &
  // surname parts. If splitting did not work for whatever reason, returns null.
  _splitCJKName(nameTokens) {
    // The convention for CJK languages is to put the surname (last name) first,
    // and the given name (first name) second. In a continuous text, there is
    // normally no space between the two parts of the name. When entering their
    // name into a field, though, some people add a space to disambiguate. CJK
    // names (almost) never have a middle name.

    let reHangulName = new RegExp(
      "^[" + this.HANGUL_RANGE.join("") + this.WHITESPACE.join("") + "]+$",
      "u"
    );
    let nameParts = {
      given: "",
      middle: "",
      family: "",
    };

    if (nameTokens.length == 1) {
      // There is no space between the surname and given name. Try to infer
      // where to separate between the two. Most Chinese and Korean surnames
      // have only one character, but there are a few that have 2. If the name
      // does not start with a surname from a known list, default to one
      // character.
      let name = nameTokens[0];
      let isKorean = reHangulName.test(name);
      let surnameLength = 0;

      // 4-character Korean names are more likely to be 2/2 than 1/3, so use
      // the full list of Korean 2-char surnames. (instead of only the common
      // ones)
      let multiCharSurnames =
        isKorean && name.length > 3
          ? this.KOREAN_MULTI_CHAR_SURNAMES
          : this.COMMON_CJK_MULTI_CHAR_SURNAMES;

      // Default to 1 character if the surname is not in the list.
      surnameLength = multiCharSurnames.some(surname =>
        name.startsWith(surname)
      )
        ? 2
        : 1;

      nameParts.family = name.substr(0, surnameLength);
      nameParts.given = name.substr(surnameLength);
    } else if (nameTokens.length == 2) {
      // The user entered a space between the two name parts. This makes our job
      // easier. Family name first, given name second.
      nameParts.family = nameTokens[0];
      nameParts.given = nameTokens[1];
    } else {
      return null;
    }

    return nameParts;
  },

  init() {
    if (this._dataLoaded) {
      return;
    }
    this._dataLoaded = true;

    this.reCJK = new RegExp("[" + this.CJK_RANGE.join("") + "]", "u");
  },

  splitName(name) {
    let nameParts = {
      given: "",
      middle: "",
      family: "",
    };

    if (!name) {
      return nameParts;
    }

    let nameTokens = name.trim().split(/[ ,\u3000\u30FB\u00B7]+/);
    nameTokens = this._stripPrefixes(nameTokens);

    if (this._isCJKName(name)) {
      let parts = this._splitCJKName(nameTokens);
      if (parts) {
        return parts;
      }
    }

    // Don't assume "Ma" is a suffix in John Ma.
    if (nameTokens.length > 2) {
      nameTokens = this._stripSuffixes(nameTokens);
    }

    if (!nameTokens.length) {
      // Bad things have happened; just assume the whole thing is a given name.
      nameParts.given = name;
      return nameParts;
    }

    // Only one token, assume given name.
    if (nameTokens.length == 1) {
      nameParts.given = nameTokens[0];
      return nameParts;
    }

    // 2 or more tokens. Grab the family, which is the last word plus any
    // recognizable family prefixes.
    let familyTokens = [nameTokens.pop()];
    while (nameTokens.length) {
      let lastToken = nameTokens[nameTokens.length - 1];
      if (!this._containsString(this.FAMILY_NAME_PREFIXES, lastToken)) {
        break;
      }
      familyTokens.unshift(lastToken);
      nameTokens.pop();
    }
    nameParts.family = familyTokens.join(" ");

    // Take the last remaining token as the middle name (if there are at least 2
    // tokens).
    if (nameTokens.length >= 2) {
      nameParts.middle = nameTokens.pop();
    }

    // Remainder is given name.
    nameParts.given = nameTokens.join(" ");

    return nameParts;
  },

  joinNameParts({ given, middle, family }) {
    if (this._isCJKName(given) && this._isCJKName(family) && !middle) {
      return family + given;
    }
    return [given, middle, family]
      .filter(part => part && part.length)
      .join(" ");
  },
};

FormAutofillNameUtils.init();