зеркало из https://github.com/mozilla/gecko-dev.git
411 строки
11 KiB
JavaScript
411 строки
11 KiB
JavaScript
/* This Source Code Form is subject to the terms of the Mozilla Public
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
|
|
|
"use strict";
|
|
|
|
// Names this module exposes to importers. NOTE(review): this looks like the
// legacy JSM export convention, where the loader reads this array — confirm
// before converting `var` to `const`/`let`.
var EXPORTED_SYMBOLS = ["FormAutofillNameUtils"];
|
|
|
|
// FormAutofillNameUtils is initially translated from
|
|
// https://cs.chromium.org/chromium/src/components/autofill/core/browser/autofill_data_util.cc?rcl=b861deff77abecff11ae6a9f6946e9cc844b9817
|
|
/**
 * Helpers for splitting a full name into given/middle/family parts and for
 * joining those parts back into a full name, with special handling for
 * Chinese, Japanese and Korean (CJK) names.
 *
 * FormAutofillNameUtils is initially translated from
 * https://cs.chromium.org/chromium/src/components/autofill/core/browser/autofill_data_util.cc?rcl=b861deff77abecff11ae6a9f6946e9cc844b9817
 */
var FormAutofillNameUtils = {
  // Honorifics and ranks that are stripped from the front of a name.
  // Entries are lowercase and carry no trailing period (see _containsString).
  NAME_PREFIXES: [
    "1lt",
    "1st",
    "2lt",
    "2nd",
    "3rd",
    "admiral",
    "capt",
    "captain",
    "col",
    "cpt",
    "dr",
    "gen",
    "general",
    "lcdr",
    "lt",
    "ltc",
    "ltg",
    "ltjg",
    "maj",
    "major",
    "mg",
    "mr",
    "mrs",
    "ms",
    "pastor",
    "prof",
    "rep",
    "reverend",
    "rev",
    "sen",
    "st",
  ],

  // Generational and academic suffixes that are stripped from the end of a
  // name. Entries are lowercase and carry no trailing period.
  NAME_SUFFIXES: [
    "b.a",
    "ba",
    "d.d.s",
    "dds",
    "i",
    "ii",
    "iii",
    "iv",
    "ix",
    "jr",
    "m.a",
    "m.d",
    "ma",
    "md",
    "ms",
    "ph.d",
    "phd",
    "sr",
    "v",
    "vi",
    "vii",
    "viii",
    "x",
  ],

  // Particles that belong to the family name when they directly precede it
  // (e.g. "van der Berg").
  FAMILY_NAME_PREFIXES: [
    "d'",
    "de",
    "del",
    "der",
    "di",
    "la",
    "le",
    "mc",
    "san",
    "st",
    "ter",
    "van",
    "von",
  ],

  // The common and non-ambiguous CJK surnames (last names) that have more than
  // one character.
  COMMON_CJK_MULTI_CHAR_SURNAMES: [
    // Korean, taken from the list of surnames:
    // https://ko.wikipedia.org/wiki/%ED%95%9C%EA%B5%AD%EC%9D%98_%EC%84%B1%EC%94%A8_%EB%AA%A9%EB%A1%9D
    "남궁",
    "사공",
    "서문",
    "선우",
    "제갈",
    "황보",
    "독고",
    "망절",

    // Chinese, taken from the top 10 Chinese 2-character surnames:
    // https://zh.wikipedia.org/wiki/%E8%A4%87%E5%A7%93#.E5.B8.B8.E8.A6.8B.E7.9A.84.E8.A4.87.E5.A7.93
    // Simplified Chinese (mostly mainland China)
    "欧阳",
    "令狐",
    "皇甫",
    "上官",
    "司徒",
    "诸葛",
    "司马",
    "宇文",
    "呼延",
    "端木",
    // Traditional Chinese (mostly Taiwan)
    "張簡",
    "歐陽",
    "諸葛",
    "申屠",
    "尉遲",
    "司馬",
    "軒轅",
    "夏侯",
  ],

  // All Korean surnames that have more than one character, even the
  // rare/ambiguous ones.
  KOREAN_MULTI_CHAR_SURNAMES: [
    "강전",
    "남궁",
    "독고",
    "동방",
    "망절",
    "사공",
    "서문",
    "선우",
    "소봉",
    "어금",
    "장곡",
    "제갈",
    "황목",
    "황보",
  ],

  // The whitespace definition based on
  // https://cs.chromium.org/chromium/src/base/strings/string_util_constants.cc?l=9&rcl=b861deff77abecff11ae6a9f6946e9cc844b9817
  WHITESPACE: [
    "\u0009", // CHARACTER TABULATION
    "\u000A", // LINE FEED (LF)
    "\u000B", // LINE TABULATION
    "\u000C", // FORM FEED (FF)
    "\u000D", // CARRIAGE RETURN (CR)
    "\u0020", // SPACE
    "\u0085", // NEXT LINE (NEL)
    "\u00A0", // NO-BREAK SPACE
    "\u1680", // OGHAM SPACE MARK
    "\u2000", // EN QUAD
    "\u2001", // EM QUAD
    "\u2002", // EN SPACE
    "\u2003", // EM SPACE
    "\u2004", // THREE-PER-EM SPACE
    "\u2005", // FOUR-PER-EM SPACE
    "\u2006", // SIX-PER-EM SPACE
    "\u2007", // FIGURE SPACE
    "\u2008", // PUNCTUATION SPACE
    "\u2009", // THIN SPACE
    "\u200A", // HAIR SPACE
    "\u2028", // LINE SEPARATOR
    "\u2029", // PARAGRAPH SEPARATOR
    "\u202F", // NARROW NO-BREAK SPACE
    "\u205F", // MEDIUM MATHEMATICAL SPACE
    "\u3000", // IDEOGRAPHIC SPACE
  ],

  // The middle dot is used as a separator for foreign names in Japanese.
  MIDDLE_DOT: [
    "\u30FB", // KATAKANA MIDDLE DOT
    "\u00B7", // A (common?) typo for "KATAKANA MIDDLE DOT"
  ],

  // Character-class fragments (each entry is "<first>-<last>") that are
  // concatenated into a RegExp character class by init().
  //
  // The Unicode range is based on Wiki:
  // https://en.wikipedia.org/wiki/CJK_Unified_Ideographs
  // https://en.wikipedia.org/wiki/Hangul
  // https://en.wikipedia.org/wiki/Japanese_writing_system
  CJK_RANGE: [
    "\u1100-\u11FF", // Hangul Jamo
    "\u3040-\u309F", // Hiragana
    "\u30A0-\u30FF", // Katakana
    "\u3105-\u312C", // Bopomofo
    "\u3130-\u318F", // Hangul Compatibility Jamo
    "\u31F0-\u31FF", // Katakana Phonetic Extensions
    "\u3200-\u32FF", // Enclosed CJK Letters and Months
    "\u3400-\u4DBF", // CJK unified ideographs Extension A
    "\u4E00-\u9FFF", // CJK Unified Ideographs
    "\uA960-\uA97F", // Hangul Jamo Extended-A
    "\uAC00-\uD7AF", // Hangul Syllables
    "\uD7B0-\uD7FF", // Hangul Jamo Extended-B
    "\uFF00-\uFFEF", // Halfwidth and Fullwidth Forms
  ],

  // Same fragment format as CJK_RANGE, restricted to the Hangul blocks.
  HANGUL_RANGE: [
    "\u1100-\u11FF", // Hangul Jamo
    "\u3130-\u318F", // Hangul Compatibility Jamo
    "\uA960-\uA97F", // Hangul Jamo Extended-A
    "\uAC00-\uD7AF", // Hangul Syllables
    "\uD7B0-\uD7FF", // Hangul Jamo Extended-B
  ],

  // Set to true by init() once the derived regular expressions exist.
  _dataLoaded: false,

  /**
   * Returns true if |set| contains |token|, modulo a final period.
   * The comparison is case-insensitive on the token side: the token is
   * lowercased, so |set| is expected to hold lowercase entries.
   *
   * @param {string[]} set Lowercase candidates to search.
   * @param {string} token The name token to look up.
   * @returns {boolean}
   */
  _containsString(set, token) {
    const normalized = token.replace(/\.$/, "").toLowerCase();
    return set.includes(normalized);
  },

  /**
   * Removes common name prefixes from the front of |nameTokens|.
   *
   * @param {string[]} nameTokens Tokens of a name, in order. Not mutated.
   * @returns {string[]} The tokens starting at the first one that is not a
   *   known prefix; an empty array if every token is a prefix.
   */
  _stripPrefixes(nameTokens) {
    // findIndex yields a numeric index, unlike a for...in walk over the array
    // which hands back string keys (and any enumerable prototype additions).
    const firstKept = nameTokens.findIndex(
      token => !this._containsString(this.NAME_PREFIXES, token)
    );
    return firstKept === -1 ? [] : nameTokens.slice(firstKept);
  },

  /**
   * Removes common name suffixes from the end of |nameTokens|.
   *
   * @param {string[]} nameTokens Tokens of a name, in order. Not mutated.
   * @returns {string[]} The tokens up to and including the last one that is
   *   not a known suffix; an empty array if every token is a suffix.
   */
  _stripSuffixes(nameTokens) {
    for (let i = nameTokens.length - 1; i >= 0; i--) {
      if (!this._containsString(this.NAME_SUFFIXES, nameTokens[i])) {
        return nameTokens.slice(0, i + 1);
      }
    }
    return [];
  },

  /**
   * Decides whether |name| should be treated as a CJK name.
   *
   * The name is considered to be a CJK name if it is only CJK characters,
   * spaces, and "middle dot" separators, with at least one CJK character, and
   * no more than 2 words.
   *
   * Chinese and Japanese names are usually spelled out using the Han
   * characters (logographs), which constitute the "CJK Unified Ideographs"
   * block in Unicode, also referred to as Unihan. Korean names are usually
   * spelled out in the Korean alphabet (Hangul), although they do have a Han
   * equivalent as well.
   *
   * @param {string} name The full name, or a single name part.
   * @returns {boolean}
   */
  _isCJKName(name) {
    if (!name) {
      return false;
    }

    // Idempotent; guarantees this.reCJK exists even if nobody called init().
    this.init();

    let previousWasCJK = false;
    let wordCount = 0;

    // for...of walks the string by code point, not by UTF-16 code unit.
    for (const c of name) {
      // U+30FB lies inside the Katakana range, so the middle-dot check has to
      // win over the CJK check for it to act as a separator.
      const isMiddleDot = this.MIDDLE_DOT.includes(c);
      const isCJK = !isMiddleDot && this.reCJK.test(c);
      if (!isCJK && !isMiddleDot && !this.WHITESPACE.includes(c)) {
        return false;
      }
      // A word starts at each transition from separator to CJK character.
      if (isCJK && !previousWasCJK) {
        wordCount++;
      }
      previousWasCJK = isCJK;
    }

    return wordCount > 0 && wordCount < 3;
  },

  /**
   * Tries to split a Chinese, Japanese, or Korean name into its given name &
   * surname parts.
   *
   * The convention for CJK languages is to put the surname (last name) first,
   * and the given name (first name) second. In a continuous text, there is
   * normally no space between the two parts of the name. When entering their
   * name into a field, though, some people add a space to disambiguate. CJK
   * names (almost) never have a middle name.
   *
   * @param {string[]} nameTokens The name, already split on separators.
   * @returns {?{given: string, middle: string, family: string}} The parts
   *   (middle is always ""), or null when there are not exactly 1 or 2 tokens.
   */
  _splitCJKName(nameTokens) {
    if (nameTokens.length === 2) {
      // The user entered a space between the two name parts. This makes our
      // job easier. Family name first, given name second.
      return { given: nameTokens[1], middle: "", family: nameTokens[0] };
    }

    if (nameTokens.length !== 1) {
      return null;
    }

    // Idempotent; guarantees this.reHangulName exists.
    this.init();

    // There is no space between the surname and given name. Try to infer
    // where to separate between the two. Most Chinese and Korean surnames
    // have only one character, but there are a few that have 2. If the name
    // does not start with a surname from a known list, default to one
    // character.
    const name = nameTokens[0];
    const isKorean = this.reHangulName.test(name);

    // 4-character Korean names are more likely to be 2/2 than 1/3, so use
    // the full list of Korean 2-char surnames. (instead of only the common
    // ones)
    const multiCharSurnames =
      isKorean && name.length > 3
        ? this.KOREAN_MULTI_CHAR_SURNAMES
        : this.COMMON_CJK_MULTI_CHAR_SURNAMES;

    // Default to 1 character if the surname is not in the list.
    const surnameLength = multiCharSurnames.some(surname =>
      name.startsWith(surname)
    )
      ? 2
      : 1;

    return {
      given: name.slice(surnameLength),
      middle: "",
      family: name.slice(0, surnameLength),
    };
  },

  /**
   * Builds the regular expressions derived from CJK_RANGE, HANGUL_RANGE and
   * WHITESPACE. Safe to call repeatedly; only the first call does any work.
   */
  init() {
    if (this._dataLoaded) {
      return;
    }
    this._dataLoaded = true;

    // Matches a single CJK character.
    this.reCJK = new RegExp(`[${this.CJK_RANGE.join("")}]`, "u");
    // Matches a string made only of Hangul characters and whitespace.
    this.reHangulName = new RegExp(
      `^[${this.HANGUL_RANGE.join("")}${this.WHITESPACE.join("")}]+$`,
      "u"
    );
  },

  /**
   * Splits a full name into given, middle and family name.
   *
   * Known prefixes ("Dr.") are dropped; known suffixes ("Jr.") are dropped
   * only when more than two tokens remain. CJK names are split surname-first.
   *
   * @param {string} name The full name as entered by the user.
   * @returns {{given: string, middle: string, family: string}} Parts that
   *   could not be determined are empty strings.
   */
  splitName(name) {
    const nameParts = {
      given: "",
      middle: "",
      family: "",
    };

    if (!name) {
      return nameParts;
    }

    // trim() only removes whitespace, so a leading/trailing comma or middle
    // dot would make split() emit an empty token. Such a token would otherwise
    // be mistaken for a name part (e.g. "John Smith," -> family ""), so drop
    // them, as the Chromium original does with SPLIT_WANT_NONEMPTY.
    let nameTokens = name
      .trim()
      .split(/[ ,\u3000\u30FB\u00B7]+/)
      .filter(token => token.length);
    nameTokens = this._stripPrefixes(nameTokens);

    if (this._isCJKName(name)) {
      const parts = this._splitCJKName(nameTokens);
      if (parts) {
        return parts;
      }
    }

    // Don't assume "Ma" is a suffix in John Ma.
    if (nameTokens.length > 2) {
      nameTokens = this._stripSuffixes(nameTokens);
    }

    if (!nameTokens.length) {
      // Bad things have happened; just assume the whole thing is a given name.
      nameParts.given = name;
      return nameParts;
    }

    // Only one token, assume given name.
    if (nameTokens.length === 1) {
      nameParts.given = nameTokens[0];
      return nameParts;
    }

    // 2 or more tokens. Grab the family, which is the last word plus any
    // recognizable family prefixes.
    const familyTokens = [nameTokens.pop()];
    while (nameTokens.length) {
      const lastToken = nameTokens[nameTokens.length - 1];
      if (!this._containsString(this.FAMILY_NAME_PREFIXES, lastToken)) {
        break;
      }
      familyTokens.unshift(lastToken);
      nameTokens.pop();
    }
    nameParts.family = familyTokens.join(" ");

    // Take the last remaining token as the middle name (if there are at least 2
    // tokens).
    if (nameTokens.length >= 2) {
      nameParts.middle = nameTokens.pop();
    }

    // Remainder is given name.
    nameParts.given = nameTokens.join(" ");

    return nameParts;
  },

  /**
   * Joins name parts into a full name.
   *
   * When both given and family names are CJK and there is no middle name, the
   * result is family followed by given with no separator. Otherwise the
   * non-empty parts are joined as "given middle family" with single spaces.
   *
   * @param {{given: string, middle: string, family: string}} parts
   * @returns {string}
   */
  joinNameParts({ given, middle, family }) {
    if (this._isCJKName(given) && this._isCJKName(family) && !middle) {
      return family + given;
    }
    return [given, middle, family]
      .filter(part => part && part.length)
      .join(" ");
  },
};
|
|
|
|
// Run the one-time setup at load: `_isCJKName` reads `this.reCJK`, which is
// only assigned inside `init()`.
FormAutofillNameUtils.init();
|