Merge pull request #3 from Microsoft/soundex-baseline
Add Soundex baseline to test set.
This commit is contained in:
Коммит
2e5389e3fe
|
@ -3,6 +3,7 @@
|
|||
|
||||
import { EnContactMatcher, ContactMatcherConfig, ContactFields,
|
||||
EnPlaceMatcher, PlaceMatcherConfig, PlaceFields } from "../../../ts/matchers"
|
||||
import Soundex from "./soundex";
|
||||
|
||||
interface TestElement<Element> {
|
||||
/**
|
||||
|
@ -40,7 +41,7 @@ interface Transcription {
|
|||
// If the expected answer is within the top MAX_RETURNS, then it is a pass.
|
||||
const MAX_RETURNS = 3;
|
||||
|
||||
function matcherAccuracy<Element>(matcher: any, testSet: TestElement<Element>[], label: string): number {
|
||||
function matcherAccuracy<Element>(matcher: any, testSet: TestElement<Element>[], label: string, transformQuery?: (query: string) => string): number {
|
||||
let total = 0;
|
||||
let failed = 0;
|
||||
const failedTests = [];
|
||||
|
@ -49,7 +50,11 @@ function matcherAccuracy<Element>(matcher: any, testSet: TestElement<Element>[],
|
|||
testQuery.transcriptions.forEach((transcription) => {
|
||||
let result;
|
||||
try {
|
||||
result = matcher.find(transcription.utterance);
|
||||
let utterance = transcription.utterance;
|
||||
if (transformQuery) {
|
||||
utterance = transformQuery(utterance);
|
||||
}
|
||||
result = matcher.find(utterance);
|
||||
expect(result).toEqual(expect.arrayContaining([
|
||||
expect.objectContaining({
|
||||
id: test.element.id
|
||||
|
@ -71,65 +76,92 @@ function matcherAccuracy<Element>(matcher: any, testSet: TestElement<Element>[],
|
|||
}
|
||||
|
||||
describe("TESTSET contacts", () => {
|
||||
const contacts: TestElement<ContactFields>[] = require("./contacts.json");
|
||||
const contactsTestSet: TestElement<ContactFields>[] = require("./contacts.json");
|
||||
const contacts = contactsTestSet.map((test) => test.element);
|
||||
|
||||
const baselineExactConfig = new ContactMatcherConfig({phoneticWeightPercentage: 0, maxReturns: MAX_RETURNS, findThreshold: 0});
|
||||
const baselineExact = new EnContactMatcher(contacts.map((test) => test.element), (element) => element, baselineExactConfig);
|
||||
const baselineExact = new EnContactMatcher(contacts, (element) => element, baselineExactConfig);
|
||||
|
||||
const baselineStringConfig = new ContactMatcherConfig({phoneticWeightPercentage: 0, maxReturns: MAX_RETURNS});
|
||||
const baselineString = new EnContactMatcher(contacts.map((test) => test.element), (element) => element, baselineStringConfig);
|
||||
const baselineString = new EnContactMatcher(contacts, (element) => element, baselineStringConfig);
|
||||
|
||||
const baselineSoundexConfig = new ContactMatcherConfig({phoneticWeightPercentage: 0, maxReturns: MAX_RETURNS});
|
||||
const baselineSoundex = new EnContactMatcher(contacts, (element) => {
|
||||
const soundexContact = {...element};
|
||||
soundexContact.name = Soundex.encode(soundexContact.name);
|
||||
soundexContact.aliases = soundexContact.aliases ? soundexContact.aliases.map((alias) => Soundex.encode(alias)) : [];
|
||||
return soundexContact;
|
||||
}, baselineSoundexConfig);
|
||||
|
||||
const baselinePhoneticConfig = new ContactMatcherConfig({phoneticWeightPercentage: 1, maxReturns: MAX_RETURNS});
|
||||
const baselinePhonetic = new EnContactMatcher(contacts.map((test) => test.element), (element) => element, baselinePhoneticConfig);
|
||||
const baselinePhonetic = new EnContactMatcher(contacts, (element) => element, baselinePhoneticConfig);
|
||||
|
||||
const config = new ContactMatcherConfig({maxReturns: MAX_RETURNS});
|
||||
const matcher = new EnContactMatcher(contacts.map((test) => test.element), (element) => element, config);
|
||||
const matcher = new EnContactMatcher(contacts, (element) => element, config);
|
||||
|
||||
test("accuracy - default config", () => {
|
||||
expect(matcherAccuracy(matcher, contacts, "Contacts matcher (default)")).toBeGreaterThan(0);
|
||||
expect(matcherAccuracy(matcher, contactsTestSet, "Contacts matcher (default)")).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
test("accuracy - pure string distance", () => {
|
||||
expect(matcherAccuracy(baselineString, contacts, "Contacts baseline (100% string distance)")).toBeGreaterThanOrEqual(0);
|
||||
expect(matcherAccuracy(baselineString, contactsTestSet, "Contacts baseline (100% string distance)")).toBeGreaterThanOrEqual(0);
|
||||
});
|
||||
|
||||
test("accuracy - Soundex distance", () => {
|
||||
expect(matcherAccuracy(baselineSoundex, contactsTestSet, "Contacts baseline (Soundex distance)", Soundex.encode)).toBeGreaterThanOrEqual(0);
|
||||
});
|
||||
|
||||
test("accuracy - pure phonetic distance", () => {
|
||||
expect(matcherAccuracy(baselinePhonetic, contacts, "Contacts baseline (100% phonetic distance)")).toBeGreaterThanOrEqual(0);
|
||||
expect(matcherAccuracy(baselinePhonetic, contactsTestSet, "Contacts baseline (100% phonetic distance)")).toBeGreaterThanOrEqual(0);
|
||||
});
|
||||
|
||||
test("accuracy - baseline (exact match)", () => {
|
||||
expect(matcherAccuracy(baselineExact, contacts, "Contacts baseline (exact match)")).toBeGreaterThanOrEqual(0);
|
||||
expect(matcherAccuracy(baselineExact, contactsTestSet, "Contacts baseline (exact match)")).toBeGreaterThanOrEqual(0);
|
||||
});
|
||||
});
|
||||
|
||||
describe("TESTSET places", () => {
|
||||
const places: TestElement<PlaceFields>[] = require("./places.json");
|
||||
const placesTestSet: TestElement<PlaceFields>[] = require("./places.json");
|
||||
const places = placesTestSet.map((test) => test.element);
|
||||
|
||||
const baselineExactConfig = new PlaceMatcherConfig({phoneticWeightPercentage: 0, maxReturns: MAX_RETURNS, findThreshold: 0});
|
||||
const baselineExact = new EnPlaceMatcher(places.map((test) => test.element), (element) => element, baselineExactConfig);
|
||||
const baselineExact = new EnPlaceMatcher(places, (element) => element, baselineExactConfig);
|
||||
|
||||
const baselineStringConfig = new PlaceMatcherConfig({phoneticWeightPercentage: 0, maxReturns: MAX_RETURNS});
|
||||
const baselineString = new EnPlaceMatcher(places.map((test) => test.element), (element) => element, baselineStringConfig);
|
||||
const baselineString = new EnPlaceMatcher(places, (element) => element, baselineStringConfig);
|
||||
|
||||
const baselineSoundexConfig = new PlaceMatcherConfig({phoneticWeightPercentage: 0, maxReturns: MAX_RETURNS});
|
||||
const baselineSoundex = new EnPlaceMatcher(places, (element) => {
|
||||
const soundexPlace = {...element};
|
||||
soundexPlace.name = Soundex.encode(soundexPlace.name);
|
||||
soundexPlace.address = Soundex.encode(soundexPlace.address);
|
||||
soundexPlace.types = soundexPlace.types ? soundexPlace.types.map((type) => Soundex.encode(type)) : [];
|
||||
return soundexPlace;
|
||||
}, baselineSoundexConfig);
|
||||
|
||||
const baselinePhoneticConfig = new PlaceMatcherConfig({phoneticWeightPercentage: 1, maxReturns: MAX_RETURNS});
|
||||
const baselinePhonetic = new EnPlaceMatcher(places.map((test) => test.element), (element) => element, baselinePhoneticConfig);
|
||||
const baselinePhonetic = new EnPlaceMatcher(places, (element) => element, baselinePhoneticConfig);
|
||||
|
||||
const config = new PlaceMatcherConfig({maxReturns: MAX_RETURNS});
|
||||
const matcher = new EnPlaceMatcher(places.map((test) => test.element), (element) => element, config);
|
||||
const matcher = new EnPlaceMatcher(places, (element) => element, config);
|
||||
|
||||
test("accuracy - default config", () => {
|
||||
expect(matcherAccuracy(matcher, places, "Places matcher (default)")).toBeGreaterThan(0);
|
||||
expect(matcherAccuracy(matcher, placesTestSet, "Places matcher (default)")).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
test("accuracy - pure string distance", () => {
|
||||
expect(matcherAccuracy(baselineString, places, "Places baseline (100% string distance)")).toBeGreaterThanOrEqual(0);
|
||||
expect(matcherAccuracy(baselineString, placesTestSet, "Places baseline (100% string distance)")).toBeGreaterThanOrEqual(0);
|
||||
});
|
||||
|
||||
test("accuracy - Soundex distance", () => {
|
||||
expect(matcherAccuracy(baselineSoundex, placesTestSet, "Places baseline (Soundex distance)", Soundex.encode)).toBeGreaterThanOrEqual(0);
|
||||
});
|
||||
|
||||
test("accuracy - pure phonetic distance", () => {
|
||||
expect(matcherAccuracy(baselinePhonetic, places, "Places baseline (100% phonetic distance)")).toBeGreaterThanOrEqual(0);
|
||||
expect(matcherAccuracy(baselinePhonetic, placesTestSet, "Places baseline (100% phonetic distance)")).toBeGreaterThanOrEqual(0);
|
||||
});
|
||||
|
||||
test("accuracy - baseline (exact match)", () => {
|
||||
expect(matcherAccuracy(baselineExact, places, "Places baseline (exact match)")).toBeGreaterThanOrEqual(0);
|
||||
expect(matcherAccuracy(baselineExact, placesTestSet, "Places baseline (exact match)")).toBeGreaterThanOrEqual(0);
|
||||
});
|
||||
});
|
||||
|
|
|
@ -0,0 +1,20 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
import Soundex from "./soundex";
|
||||
|
||||
// Table-driven check of Soundex.encode: each row is [input text, expected code].
test("Soundex.", () => {
    const expectations: Array<[string, string]> = [
        // Empty and whitespace-only input produce an empty encoding.
        ["", ""],
        [" ", ""],

        // Single-word reference encodings.
        ["Robert", "R163"],
        ["Rupert", "R163"],
        ["Rubin", "R150"],
        ["Ashcraft", "A261"],
        ["Ashcroft", "A261"],
        ["Tymczak", "T522"],
        ["Pfister", "P236"],
        ["Honeyman", "H555"],

        // Multi-word input: every word is encoded separately, joined by a space.
        ["Robert Robert", "R163 R163"],
    ];

    expectations.forEach((row) => {
        const input = row[0];
        const expected = row[1];
        expect(Soundex.encode(input)).toBe(expected);
    });
});
|
|
@ -0,0 +1,93 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
import { WhitespaceTokenizer } from "../../../ts/nlp";
|
||||
|
||||
/**
 * Maps a single upper-case character to its Soundex digit.
 *
 * @param c The character to look up. Only the upper-case consonants listed
 * below are coded; lower-case letters are NOT recognised.
 * @returns The digit "1"-"6" for a coded consonant. Any other character
 * (vowels, H, W, Y, digits, punctuation, ...) is returned unchanged, which
 * callers use as the "this character has no code" signal.
 */
function soundexNumber(c: string): string {
    switch (c) {
        case "B":
        case "F":
        case "P":
        case "V":
            return "1";
        case "C":
        case "G":
        case "J":
        case "K":
        case "Q":
        case "S":
        case "X":
        case "Z":
            return "2";
        case "D":
        case "T":
            return "3";
        case "L":
            return "4";
        case "M":
        case "N":
            return "5";
        case "R":
            return "6";

        default:
            return c;
    }
}

/**
 * Encodes one word as a fixed-length, four character Soundex code: the first
 * character of the word followed by up to three digits, right-padded with "0".
 *
 * @param word The word to encode. It is expected to be upper-cased already,
 * because `soundexNumber` only recognises upper-case letters.
 * @returns The four character code, or "" when `word` is empty.
 */
function encodeWord(word: string): string {
    if (word.length === 0) {
        return "";
    }

    const first = word.charAt(0);
    const firstCode = soundexNumber(first);

    // `n` is the code of the previous significant character, or "0" when there
    // is none. soundexNumber echoes uncoded characters back, so an uncoded
    // first character (vowel, H, W, digit, ...) must be mapped to "0" here;
    // otherwise a leading digit such as "1" would be mistaken for the code "1"
    // and swallow the consonant that follows it.
    let n = firstCode === first ? "0" : firstCode;
    let soundex = first;

    // Only the first four characters are kept, so stop once they are known.
    for (let i = 1; i < word.length && soundex.length < 4; ++i) {
        const c = word.charAt(i);
        if (c === "H" || c === "W") {
            // Completely ignore H and W: they do not separate equal codes.
            continue;
        }

        const newN = soundexNumber(c);
        if (newN === c) {
            // Uncoded character (vowel etc.): emit nothing, but reset the
            // previous code so that equal consonants on either side are both
            // encoded (i.e., "SIS" => "22").
            n = "0";
            continue;
        }

        if (n !== newN) {
            // Adjacent characters with the same code collapse into one digit.
            n = newN;
            soundex += n;
        }
    }

    // Pad with zeros and cut to exactly four characters.
    return (soundex + "000").slice(0, 4);
}
|
||||
|
||||
|
||||
/**
 * Word-wise Soundex encoder: the text is upper-cased, split into tokens by a
 * WhitespaceTokenizer, each token is run through the fixed-length Soundex
 * (`encodeWord`), and the per-word codes are joined with single spaces.
 *
 * Only exposes static members; it is declared abstract so that it cannot be
 * instantiated.
 *
 * @abstract
 * @class Soundex
 */
abstract class Soundex {
    /** Tokenizer shared by every call to `encode`. */
    private static readonly tokenizer = new WhitespaceTokenizer();

    /**
     * Encodes every token of `text` and returns the codes joined by " ".
     *
     * @param text The text to encode; it is upper-cased before tokenizing.
     * @returns The space-separated codes, one per token produced by the tokenizer.
     */
    static encode(text: string): string {
        const upperCased = text.toUpperCase();
        const encodedWords = Soundex.tokenizer
            .tokenize(upperCased)
            .map(({ value }) => encodeWord(value));
        return encodedWords.join(" ");
    }
}
|
||||
|
||||
export default Soundex;
|
Загрузка…
Ссылка в новой задаче