Merge pull request #3 from Microsoft/soundex-baseline

Add Soundex baseline to test set.
This commit is contained in:
Mmdixon 2018-07-19 15:51:21 -04:00 коммит произвёл GitHub
Родитель 773936b246 375299f156
Коммит 2e5389e3fe
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
3 изменённых файлов: 165 добавлений и 20 удалений

Просмотреть файл

@ -3,6 +3,7 @@
import { EnContactMatcher, ContactMatcherConfig, ContactFields,
EnPlaceMatcher, PlaceMatcherConfig, PlaceFields } from "../../../ts/matchers"
import Soundex from "./soundex";
interface TestElement<Element> {
/**
@ -40,7 +41,7 @@ interface Transcription {
// If the expected answer is within the top MAX_RETURNS, then it is a pass.
const MAX_RETURNS = 3;
function matcherAccuracy<Element>(matcher: any, testSet: TestElement<Element>[], label: string): number {
function matcherAccuracy<Element>(matcher: any, testSet: TestElement<Element>[], label: string, transformQuery?: (query: string) => string): number {
let total = 0;
let failed = 0;
const failedTests = [];
@ -49,7 +50,11 @@ function matcherAccuracy<Element>(matcher: any, testSet: TestElement<Element>[],
testQuery.transcriptions.forEach((transcription) => {
let result;
try {
result = matcher.find(transcription.utterance);
let utterance = transcription.utterance;
if (transformQuery) {
utterance = transformQuery(utterance);
}
result = matcher.find(utterance);
expect(result).toEqual(expect.arrayContaining([
expect.objectContaining({
id: test.element.id
@ -71,65 +76,92 @@ function matcherAccuracy<Element>(matcher: any, testSet: TestElement<Element>[],
}
describe("TESTSET contacts", () => {
const contacts: TestElement<ContactFields>[] = require("./contacts.json");
const contactsTestSet: TestElement<ContactFields>[] = require("./contacts.json");
const contacts = contactsTestSet.map((test) => test.element);
const baselineExactConfig = new ContactMatcherConfig({phoneticWeightPercentage: 0, maxReturns: MAX_RETURNS, findThreshold: 0});
const baselineExact = new EnContactMatcher(contacts.map((test) => test.element), (element) => element, baselineExactConfig);
const baselineExact = new EnContactMatcher(contacts, (element) => element, baselineExactConfig);
const baselineStringConfig = new ContactMatcherConfig({phoneticWeightPercentage: 0, maxReturns: MAX_RETURNS});
const baselineString = new EnContactMatcher(contacts.map((test) => test.element), (element) => element, baselineStringConfig);
const baselineString = new EnContactMatcher(contacts, (element) => element, baselineStringConfig);
const baselineSoundexConfig = new ContactMatcherConfig({phoneticWeightPercentage: 0, maxReturns: MAX_RETURNS});
const baselineSoundex = new EnContactMatcher(contacts, (element) => {
const soundexContact = {...element};
soundexContact.name = Soundex.encode(soundexContact.name);
soundexContact.aliases = soundexContact.aliases ? soundexContact.aliases.map((alias) => Soundex.encode(alias)) : [];
return soundexContact;
}, baselineSoundexConfig);
const baselinePhoneticConfig = new ContactMatcherConfig({phoneticWeightPercentage: 1, maxReturns: MAX_RETURNS});
const baselinePhonetic = new EnContactMatcher(contacts.map((test) => test.element), (element) => element, baselinePhoneticConfig);
const baselinePhonetic = new EnContactMatcher(contacts, (element) => element, baselinePhoneticConfig);
const config = new ContactMatcherConfig({maxReturns: MAX_RETURNS});
const matcher = new EnContactMatcher(contacts.map((test) => test.element), (element) => element, config);
const matcher = new EnContactMatcher(contacts, (element) => element, config);
test("accuracy - default config", () => {
expect(matcherAccuracy(matcher, contacts, "Contacts matcher (default)")).toBeGreaterThan(0);
expect(matcherAccuracy(matcher, contactsTestSet, "Contacts matcher (default)")).toBeGreaterThan(0);
});
test("accuracy - pure string distance", () => {
expect(matcherAccuracy(baselineString, contacts, "Contacts baseline (100% string distance)")).toBeGreaterThanOrEqual(0);
expect(matcherAccuracy(baselineString, contactsTestSet, "Contacts baseline (100% string distance)")).toBeGreaterThanOrEqual(0);
});
test("accuracy - Soundex distance", () => {
expect(matcherAccuracy(baselineSoundex, contactsTestSet, "Contacts baseline (Soundex distance)", Soundex.encode)).toBeGreaterThanOrEqual(0);
});
test("accuracy - pure phonetic distance", () => {
expect(matcherAccuracy(baselinePhonetic, contacts, "Contacts baseline (100% phonetic distance)")).toBeGreaterThanOrEqual(0);
expect(matcherAccuracy(baselinePhonetic, contactsTestSet, "Contacts baseline (100% phonetic distance)")).toBeGreaterThanOrEqual(0);
});
test("accuracy - baseline (exact match)", () => {
expect(matcherAccuracy(baselineExact, contacts, "Contacts baseline (exact match)")).toBeGreaterThanOrEqual(0);
expect(matcherAccuracy(baselineExact, contactsTestSet, "Contacts baseline (exact match)")).toBeGreaterThanOrEqual(0);
});
});
describe("TESTSET places", () => {
const places: TestElement<PlaceFields>[] = require("./places.json");
const placesTestSet: TestElement<PlaceFields>[] = require("./places.json");
const places = placesTestSet.map((test) => test.element);
const baselineExactConfig = new PlaceMatcherConfig({phoneticWeightPercentage: 0, maxReturns: MAX_RETURNS, findThreshold: 0});
const baselineExact = new EnPlaceMatcher(places.map((test) => test.element), (element) => element, baselineExactConfig);
const baselineExact = new EnPlaceMatcher(places, (element) => element, baselineExactConfig);
const baselineStringConfig = new PlaceMatcherConfig({phoneticWeightPercentage: 0, maxReturns: MAX_RETURNS});
const baselineString = new EnPlaceMatcher(places.map((test) => test.element), (element) => element, baselineStringConfig);
const baselineString = new EnPlaceMatcher(places, (element) => element, baselineStringConfig);
const baselineSoundexConfig = new PlaceMatcherConfig({phoneticWeightPercentage: 0, maxReturns: MAX_RETURNS});
const baselineSoundex = new EnPlaceMatcher(places, (element) => {
const soundexPlace = {...element};
soundexPlace.name = Soundex.encode(soundexPlace.name);
soundexPlace.address = Soundex.encode(soundexPlace.address);
soundexPlace.types = soundexPlace.types ? soundexPlace.types.map((type) => Soundex.encode(type)) : [];
return soundexPlace;
}, baselineSoundexConfig);
const baselinePhoneticConfig = new PlaceMatcherConfig({phoneticWeightPercentage: 1, maxReturns: MAX_RETURNS});
const baselinePhonetic = new EnPlaceMatcher(places.map((test) => test.element), (element) => element, baselinePhoneticConfig);
const baselinePhonetic = new EnPlaceMatcher(places, (element) => element, baselinePhoneticConfig);
const config = new PlaceMatcherConfig({maxReturns: MAX_RETURNS});
const matcher = new EnPlaceMatcher(places.map((test) => test.element), (element) => element, config);
const matcher = new EnPlaceMatcher(places, (element) => element, config);
test("accuracy - default config", () => {
expect(matcherAccuracy(matcher, places, "Places matcher (default)")).toBeGreaterThan(0);
expect(matcherAccuracy(matcher, placesTestSet, "Places matcher (default)")).toBeGreaterThan(0);
});
test("accuracy - pure string distance", () => {
expect(matcherAccuracy(baselineString, places, "Places baseline (100% string distance)")).toBeGreaterThanOrEqual(0);
expect(matcherAccuracy(baselineString, placesTestSet, "Places baseline (100% string distance)")).toBeGreaterThanOrEqual(0);
});
test("accuracy - Soundex distance", () => {
expect(matcherAccuracy(baselineSoundex, placesTestSet, "Places baseline (Soundex distance)", Soundex.encode)).toBeGreaterThanOrEqual(0);
});
test("accuracy - pure phonetic distance", () => {
expect(matcherAccuracy(baselinePhonetic, places, "Places baseline (100% phonetic distance)")).toBeGreaterThanOrEqual(0);
expect(matcherAccuracy(baselinePhonetic, placesTestSet, "Places baseline (100% phonetic distance)")).toBeGreaterThanOrEqual(0);
});
test("accuracy - baseline (exact match)", () => {
expect(matcherAccuracy(baselineExact, places, "Places baseline (exact match)")).toBeGreaterThanOrEqual(0);
expect(matcherAccuracy(baselineExact, placesTestSet, "Places baseline (exact match)")).toBeGreaterThanOrEqual(0);
});
});

Просмотреть файл

@ -0,0 +1,20 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
import Soundex from "./soundex";
// Table-driven check of Soundex.encode: each entry pairs an input phrase with
// the encoding the test expects for it.
test("Soundex.", () => {
    const cases: Array<[string, string]> = [
        // Empty and whitespace-only input produce an empty encoding.
        ["", ""],
        [" ", ""],
        // Similar-sounding names share a code.
        ["Robert", "R163"],
        ["Rupert", "R163"],
        ["Rubin", "R150"],
        // H between two same-coded consonants does not split them.
        ["Ashcraft", "A261"],
        ["Ashcroft", "A261"],
        // A vowel between two same-coded consonants does split them.
        ["Tymczak", "T522"],
        // The first letter's code suppresses an identical code right after it.
        ["Pfister", "P236"],
        ["Honeyman", "H555"],
        // Multi-word input is encoded word by word and joined with a space.
        ["Robert Robert", "R163 R163"],
    ];
    cases.forEach((entry) => {
        const input = entry[0];
        const expected = entry[1];
        expect(Soundex.encode(input)).toBe(expected);
    });
});

Просмотреть файл

@ -0,0 +1,93 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
import { WhitespaceTokenizer } from "../../../ts/nlp";
/** Soundex digit for every upper-case consonant that has one. */
const SOUNDEX_CODES: ReadonlyMap<string, string> = new Map<string, string>([
    ["B", "1"], ["F", "1"], ["P", "1"], ["V", "1"],
    ["C", "2"], ["G", "2"], ["J", "2"], ["K", "2"],
    ["Q", "2"], ["S", "2"], ["X", "2"], ["Z", "2"],
    ["D", "3"], ["T", "3"],
    ["L", "4"],
    ["M", "5"], ["N", "5"],
    ["R", "6"],
]);

/**
 * Maps an upper-case consonant to its Soundex digit ("1" through "6").
 *
 * Any other input (vowels, H, W, Y, lower-case letters, digits, punctuation,
 * empty or multi-character strings) has no code and is returned unchanged, so
 * a caller can detect "no code" by comparing the result with the argument.
 *
 * @param c The character to look up.
 * @returns The Soundex digit for `c`, or `c` itself when it has none.
 */
function soundexNumber(c: string) {
    const digit = SOUNDEX_CODES.get(c);
    return digit === undefined ? c : digit;
}
/**
 * Encodes one word as a fixed-length, four-character Soundex code: the word's
 * first character followed by up to three digits, right-padded with "0".
 *
 * The word is expected to be upper-case already; `soundexNumber` only
 * recognises upper-case letters.
 *
 * @param word The word to encode.
 * @returns The four-character code, or "" when `word` is empty.
 */
function encodeWord(word: string): string {
    if (word.length === 0) {
        return "";
    }

    const first = word.charAt(0);
    const firstCode = soundexNumber(first);
    // soundexNumber echoes characters that have no code. Track those as "0" so
    // that a leading digit such as "3" is never mistaken for the Soundex code
    // "3" and does not swallow the consonant that follows it ("3D" => "3300").
    let previousCode = firstCode === first ? "0" : firstCode;
    let soundex = first;

    for (let i = 1; i < word.length; ++i) {
        const c = word.charAt(i);
        if (c === "H" || c === "W") {
            // H and W are skipped entirely: consonants with the same code on
            // either side of them are still collapsed into one digit.
            continue;
        }

        const code = soundexNumber(c);
        if (code === c) {
            // Uncoded character (vowel, Y, digit, punctuation): emit nothing,
            // but reset the tracker so equal codes on either side are both
            // encoded (i.e., "SIS" => "22").
            previousCode = "0";
            continue;
        }

        if (code !== previousCode) {
            previousCode = code;
            soundex += code;
        }
    }

    // Normalise to exactly four characters: pad short codes, truncate long ones.
    if (soundex.length < 4) {
        soundex += "0".repeat(4 - soundex.length);
    }
    return soundex.slice(0, 4);
}
/**
 * Phrase-level Soundex encoder.
 *
 * This is a modified Soundex: the input is upper-cased and tokenized with a
 * WhitespaceTokenizer, the original fixed-length Soundex is applied to each
 * token, and the per-word codes are joined with single spaces.
 *
 * The class is abstract and exposes only static members, so it is used as
 * `Soundex.encode(...)` and never instantiated.
 */
abstract class Soundex {
    /** Tokenizer shared by every call to `encode`. */
    private static readonly tokenizer = new WhitespaceTokenizer();

    /**
     * Encodes every word of `text` and joins the codes with a space.
     *
     * @param text The phrase to encode; case is ignored.
     * @returns The space-separated Soundex codes, one per token.
     */
    static encode(text: string): string {
        const upperCased = text.toUpperCase();
        const encodedWords = Soundex.tokenizer
            .tokenize(upperCased)
            .map((word) => encodeWord(word.value));
        return encodedWords.join(" ");
    }
}
export default Soundex;