Initial changes to support speech.phrase in translation service reco (#603)

This commit is contained in:
Glenn Harper 2022-12-07 14:21:41 -08:00 committed by GitHub
Parent ead45a3438
Commit fa92888500
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 357 additions and 151 deletions

View file

@@ -0,0 +1,217 @@
import { IAudioSource } from "../common/Exports";
import {
CancellationErrorCode,
CancellationReason,
OutputFormat,
PropertyCollection,
PropertyId,
ResultReason,
SpeechRecognitionEventArgs,
SpeechRecognitionResult,
TranslationRecognitionEventArgs,
TranslationRecognitionResult,
TranslationRecognizer
} from "../sdk/Exports";
import {
DetailedSpeechPhrase,
EnumTranslation,
IAuthentication,
IConnectionFactory,
OutputFormatPropertyName,
RecognitionStatus,
RecognizerConfig,
ServiceRecognizerBase,
SimpleSpeechPhrase,
SpeechHypothesis,
TranscriberRecognizer
} from "./Exports";
import { SpeechConnectionMessage } from "./SpeechConnectionMessage.Internal";
/**
 * Base service recognizer shared by the conversation scenarios (translation
 * and transcription). It wires the common "speech.hypothesis",
 * "speech.fragment" and "speech.phrase" service-message paths to handlers
 * that build SpeechRecognitionResults and raise the appropriate
 * recognizing/recognized events on the owning recognizer.
 */
export class ConversationServiceRecognizer extends ServiceRecognizerBase {

    public constructor(
        authentication: IAuthentication,
        connectionFactory: IConnectionFactory,
        audioSource: IAudioSource,
        recognizerConfig: RecognizerConfig,
        recognizer: TranslationRecognizer | TranscriberRecognizer) {
        super(authentication, connectionFactory, audioSource, recognizerConfig, recognizer);
        // Install the shared handlers on the base-class hooks so that
        // processTypeSpecificMessages (here and in subclasses) delegates to them.
        this.handleSpeechPhraseMessage = async (textBody: string): Promise<void> => this.handleSpeechPhrase(textBody);
        this.handleSpeechHypothesisMessage = (textBody: string): void => this.handleSpeechHypothesis(textBody);
    }

    /**
     * Routes the common speech message paths to the registered handlers.
     * @param connectionMessage - Incoming service message.
     * @returns true when the message path was handled here; false so a
     * subclass (or caller) can process other paths.
     */
    protected async processTypeSpecificMessages(connectionMessage: SpeechConnectionMessage): Promise<boolean> {
        let processed: boolean = false;
        switch (connectionMessage.path.toLowerCase()) {
            case "speech.hypothesis":
            case "speech.fragment":
                if (!!this.handleSpeechHypothesisMessage) {
                    this.handleSpeechHypothesisMessage(connectionMessage.textBody);
                }
                processed = true;
                break;
            case "speech.phrase":
                if (!!this.handleSpeechPhraseMessage) {
                    await this.handleSpeechPhraseMessage(connectionMessage.textBody);
                }
                processed = true;
                break;
            default:
                break;
        }
        return processed;
    }

    /**
     * Intentionally a no-op here: declared so subclasses can inherit or
     * override the cancellation behavior. The `void` statements silence
     * unused-parameter lint warnings.
     */
    protected cancelRecognition(
        sessionId: string,
        requestId: string,
        cancellationReason: CancellationReason,
        errorCode: CancellationErrorCode,
        error: string): void {
        // Implementing to allow inheritance
        void sessionId;
        void requestId;
        void cancellationReason;
        void errorCode;
        void error;
    }

    /**
     * Handles a final "speech.phrase" message: translates the service
     * RecognitionStatus into a ResultReason, cancels locally on Canceled,
     * otherwise builds a SpeechRecognitionResult (Simple or Detailed output
     * format) and fires the recognized event on the owning recognizer.
     * @param textBody - Raw JSON body of the speech.phrase message.
     */
    protected async handleSpeechPhrase(textBody: string): Promise<void> {
        const simple: SimpleSpeechPhrase = SimpleSpeechPhrase.fromJSON(textBody);
        const resultReason: ResultReason = EnumTranslation.implTranslateRecognitionResult(simple.RecognitionStatus);
        let result: SpeechRecognitionResult;
        const resultProps: PropertyCollection = new PropertyCollection();
        resultProps.setProperty(PropertyId.SpeechServiceResponse_JsonResult, textBody);
        // Service offsets are relative to the current turn; shift to the
        // session-absolute audio offset.
        const simpleOffset = simple.Offset + this.privRequestSession.currentTurnAudioOffset;
        this.privRequestSession.onPhraseRecognized(this.privRequestSession.currentTurnAudioOffset + simple.Offset + simple.Duration);
        if (ResultReason.Canceled === resultReason) {
            const cancelReason: CancellationReason = EnumTranslation.implTranslateCancelResult(simple.RecognitionStatus);
            const cancellationErrorCode: CancellationErrorCode = EnumTranslation.implTranslateCancelErrorCode(simple.RecognitionStatus);
            await this.cancelRecognitionLocal(
                cancelReason,
                cancellationErrorCode,
                EnumTranslation.implTranslateErrorDetails(cancellationErrorCode));
        } else {
            // Suppress a trailing NoMatch once speech has already ended,
            // unless the status is an initial-silence timeout.
            if (!(this.privRequestSession.isSpeechEnded && resultReason === ResultReason.NoMatch && simple.RecognitionStatus !== RecognitionStatus.InitialSilenceTimeout)) {
                if (this.privRecognizerConfig.parameters.getProperty(OutputFormatPropertyName) === OutputFormat[OutputFormat.Simple]) {
                    result = new SpeechRecognitionResult(
                        this.privRequestSession.requestId,
                        resultReason,
                        simple.DisplayText,
                        simple.Duration,
                        simpleOffset,
                        simple.Language,
                        simple.LanguageDetectionConfidence,
                        simple.SpeakerId,
                        undefined,
                        textBody,
                        resultProps);
                    if (this.privRecognizer instanceof TranslationRecognizer) {
                        try {
                            const ev = new TranslationRecognitionEventArgs(TranslationRecognitionResult.fromSpeechRecognitionResult(result), simpleOffset, this.privRequestSession.sessionId);
                            this.privRecognizer.recognized(this.privRecognizer, ev);
                        } catch (error) {
                            // Not going to let errors in the event handler
                            // trip things up.
                        }
                        // Translation path is fully handled here; skip the
                        // transcriber/callback handling below.
                        return;
                    }
                } else {
                    // Detailed output: rewrite the JSON so embedded offsets
                    // are corrected to session-absolute values.
                    const detailed: DetailedSpeechPhrase = DetailedSpeechPhrase.fromJSON(textBody);
                    const totalOffset: number = detailed.Offset + this.privRequestSession.currentTurnAudioOffset;
                    const offsetCorrectedJson: string = detailed.getJsonWithCorrectedOffsets(totalOffset);
                    result = new SpeechRecognitionResult(
                        this.privRequestSession.requestId,
                        resultReason,
                        detailed.Text,
                        detailed.Duration,
                        totalOffset,
                        detailed.Language,
                        detailed.LanguageDetectionConfidence,
                        detailed.SpeakerId,
                        undefined,
                        offsetCorrectedJson,
                        resultProps);
                }
                if (this.privRecognizer instanceof TranscriberRecognizer) {
                    try {
                        const event: SpeechRecognitionEventArgs = new SpeechRecognitionEventArgs(result, result.offset, this.privRequestSession.sessionId);
                        this.privRecognizer.recognized(this.privRecognizer, event);
                        if (!!this.privSuccessCallback) {
                            try {
                                this.privSuccessCallback(result);
                            } catch (e) {
                                if (!!this.privErrorCallback) {
                                    this.privErrorCallback(e as string);
                                }
                            }
                            // Only invoke the call back once.
                            // and if it's successful don't invoke the
                            // error after that.
                            this.privSuccessCallback = undefined;
                            this.privErrorCallback = undefined;
                        }
                        /* eslint-disable no-empty */
                    } catch (error) {
                        // Not going to let errors in the event handler
                        // trip things up.
                    }
                }
            }
        }
    }

    /**
     * Handles an interim "speech.hypothesis" / "speech.fragment" message:
     * builds a RecognizingSpeech result and fires the recognizing event on
     * the owning recognizer (transcriber or translation).
     * @param textBody - Raw JSON body of the hypothesis message.
     */
    protected handleSpeechHypothesis(textBody: string): void {
        const hypothesis: SpeechHypothesis = SpeechHypothesis.fromJSON(textBody);
        // Turn-relative offset -> session-absolute audio offset.
        const offset: number = hypothesis.Offset + this.privRequestSession.currentTurnAudioOffset;
        const resultProps: PropertyCollection = new PropertyCollection();
        resultProps.setProperty(PropertyId.SpeechServiceResponse_JsonResult, textBody);
        const result = new SpeechRecognitionResult(
            this.privRequestSession.requestId,
            ResultReason.RecognizingSpeech,
            hypothesis.Text,
            hypothesis.Duration,
            offset,
            hypothesis.Language,
            hypothesis.LanguageDetectionConfidence,
            hypothesis.SpeakerId,
            undefined,
            textBody,
            resultProps);
        this.privRequestSession.onHypothesis(offset);
        if (this.privRecognizer instanceof TranscriberRecognizer) {
            if (!!this.privRecognizer.recognizing) {
                try {
                    // NOTE(review): hypothesis.Duration is passed as the event
                    // offset argument (not `offset`) — preserved as-is; confirm
                    // this is intended.
                    const ev = new SpeechRecognitionEventArgs(result, hypothesis.Duration, this.privRequestSession.sessionId);
                    this.privRecognizer.recognizing(this.privRecognizer, ev);
                    /* eslint-disable no-empty */
                } catch (error) {
                    // Not going to let errors in the event handler
                    // trip things up.
                }
            }
        } else {
            if (this.privRecognizer instanceof TranslationRecognizer) {
                try {
                    // NOTE(review): same Duration-as-offset quirk as above.
                    const ev = new TranslationRecognitionEventArgs(TranslationRecognitionResult.fromSpeechRecognitionResult(result), hypothesis.Duration, this.privRequestSession.sessionId);
                    this.privRecognizer.recognizing(this.privRecognizer, ev);
                    /* eslint-disable no-empty */
                } catch (error) {
                    // Not going to let errors in the event handler
                    // trip things up.
                }
            }
        }
    }
}

View file

@@ -11,6 +11,7 @@ export * from "./ISynthesisConnectionFactory";
export * from "./IntentConnectionFactory";
export * from "./RecognitionEvents";
export * from "./ServiceRecognizerBase";
export * from "./ConversationServiceRecognizer";
export * from "./RecognizerConfig";
export * from "./SpeechServiceInterfaces";
export * from "./WebsocketMessageFormatter";

View file

@@ -10,7 +10,7 @@ export interface ITranslationPhrase {
RecognitionStatus: RecognitionStatus;
Offset: number;
Duration: number;
Translation: ITranslations;
Translation?: ITranslations;
Text: string;
DisplayText?: string;
}

View file

@@ -571,6 +571,8 @@ export abstract class ServiceRecognizerBase implements IDisposable {
}
protected configConnectionOverride: (connection: IConnection) => Promise<IConnection> = undefined;
protected handleSpeechPhraseMessage: (textBody: string) => Promise<void> = undefined;
protected handleSpeechHypothesisMessage: (textBody: string) => void = undefined;
protected sendSpeechServiceConfig(connection: IConnection, requestSession: RequestSession, SpeechServiceConfigJson: string): Promise<void> {
requestSession.onSpeechContext();

View file

@@ -10,24 +10,16 @@ import {
CancellationErrorCode,
CancellationReason,
ConversationTranscriptionCanceledEventArgs,
OutputFormat,
PropertyCollection,
PropertyId,
ResultReason,
SpeechRecognitionEventArgs,
SpeechRecognitionResult,
} from "../sdk/Exports";
import { ConversationInfo } from "../sdk/Transcription/Exports";
import { ConversationProperties } from "../sdk/Transcription/IConversation";
import {
CancellationErrorCodePropertyName,
DetailedSpeechPhrase,
EnumTranslation,
OutputFormatPropertyName,
RecognitionStatus,
ServiceRecognizerBase,
SimpleSpeechPhrase,
SpeechHypothesis,
ConversationServiceRecognizer,
TranscriberRecognizer
} from "./Exports";
import { IAuthentication } from "./IAuthentication";
@@ -36,7 +28,7 @@ import { RecognizerConfig } from "./RecognizerConfig";
import { SpeechConnectionMessage } from "./SpeechConnectionMessage.Internal";
// eslint-disable-next-line max-classes-per-file
export class TranscriptionServiceRecognizer extends ServiceRecognizerBase {
export class TranscriptionServiceRecognizer extends ConversationServiceRecognizer {
private privTranscriberRecognizer: TranscriberRecognizer;
@@ -62,129 +54,7 @@ export class TranscriptionServiceRecognizer extends ServiceRecognizerBase {
}
protected async processTypeSpecificMessages(connectionMessage: SpeechConnectionMessage): Promise<boolean> {
let result: SpeechRecognitionResult;
const resultProps: PropertyCollection = new PropertyCollection();
resultProps.setProperty(PropertyId.SpeechServiceResponse_JsonResult, connectionMessage.textBody);
let processed: boolean = false;
switch (connectionMessage.path.toLowerCase()) {
case "speech.hypothesis":
case "speech.fragment":
const hypothesis: SpeechHypothesis = SpeechHypothesis.fromJSON(connectionMessage.textBody);
const offset: number = hypothesis.Offset + this.privRequestSession.currentTurnAudioOffset;
result = new SpeechRecognitionResult(
this.privRequestSession.requestId,
ResultReason.RecognizingSpeech,
hypothesis.Text,
hypothesis.Duration,
offset,
hypothesis.Language,
hypothesis.LanguageDetectionConfidence,
hypothesis.SpeakerId,
undefined,
connectionMessage.textBody,
resultProps);
this.privRequestSession.onHypothesis(offset);
const ev = new SpeechRecognitionEventArgs(result, hypothesis.Duration, this.privRequestSession.sessionId);
if (!!this.privTranscriberRecognizer.recognizing) {
try {
this.privTranscriberRecognizer.recognizing(this.privTranscriberRecognizer, ev);
/* eslint-disable no-empty */
} catch (error) {
// Not going to let errors in the event handler
// trip things up.
}
}
processed = true;
break;
case "speech.phrase":
const simple: SimpleSpeechPhrase = SimpleSpeechPhrase.fromJSON(connectionMessage.textBody);
const resultReason: ResultReason = EnumTranslation.implTranslateRecognitionResult(simple.RecognitionStatus);
this.privRequestSession.onPhraseRecognized(this.privRequestSession.currentTurnAudioOffset + simple.Offset + simple.Duration);
if (ResultReason.Canceled === resultReason) {
const cancelReason: CancellationReason = EnumTranslation.implTranslateCancelResult(simple.RecognitionStatus);
const cancellationErrorCode: CancellationErrorCode = EnumTranslation.implTranslateCancelErrorCode(simple.RecognitionStatus);
await this.cancelRecognitionLocal(
cancelReason,
cancellationErrorCode,
EnumTranslation.implTranslateErrorDetails(cancellationErrorCode));
} else {
if (!(this.privRequestSession.isSpeechEnded && resultReason === ResultReason.NoMatch && simple.RecognitionStatus !== RecognitionStatus.InitialSilenceTimeout)) {
if (this.privRecognizerConfig.parameters.getProperty(OutputFormatPropertyName) === OutputFormat[OutputFormat.Simple]) {
result = new SpeechRecognitionResult(
this.privRequestSession.requestId,
resultReason,
simple.DisplayText,
simple.Duration,
simple.Offset + this.privRequestSession.currentTurnAudioOffset,
simple.Language,
simple.LanguageDetectionConfidence,
simple.SpeakerId,
undefined,
connectionMessage.textBody,
resultProps);
} else {
const detailed: DetailedSpeechPhrase = DetailedSpeechPhrase.fromJSON(connectionMessage.textBody);
const totalOffset: number = detailed.Offset + this.privRequestSession.currentTurnAudioOffset;
const offsetCorrectedJson: string = detailed.getJsonWithCorrectedOffsets(totalOffset);
result = new SpeechRecognitionResult(
this.privRequestSession.requestId,
resultReason,
detailed.Text,
detailed.Duration,
totalOffset,
detailed.Language,
detailed.LanguageDetectionConfidence,
detailed.SpeakerId,
undefined,
offsetCorrectedJson,
resultProps);
}
const event: SpeechRecognitionEventArgs = new SpeechRecognitionEventArgs(result, result.offset, this.privRequestSession.sessionId);
if (!!this.privTranscriberRecognizer.recognized) {
try {
this.privTranscriberRecognizer.recognized(this.privTranscriberRecognizer, event);
/* eslint-disable no-empty */
} catch (error) {
// Not going to let errors in the event handler
// trip things up.
}
}
}
if (!!this.privSuccessCallback) {
try {
this.privSuccessCallback(result);
} catch (e) {
if (!!this.privErrorCallback) {
this.privErrorCallback(e as string);
}
}
// Only invoke the call back once.
// and if it's successful don't invoke the
// error after that.
this.privSuccessCallback = undefined;
this.privErrorCallback = undefined;
}
}
processed = true;
break;
default:
break;
}
return processed;
return super.processTypeSpecificMessages(connectionMessage);
}
// Cancels recognition.

View file

@@ -23,9 +23,9 @@ import {
} from "../sdk/Exports";
import {
CancellationErrorCodePropertyName,
ConversationServiceRecognizer,
EnumTranslation,
RecognitionStatus,
ServiceRecognizerBase,
SynthesisStatus,
TranslationHypothesis,
TranslationPhrase,
@@ -38,7 +38,7 @@ import { ITranslationPhrase } from "./ServiceMessages/TranslationPhrase";
import { SpeechConnectionMessage } from "./SpeechConnectionMessage.Internal";
// eslint-disable-next-line max-classes-per-file
export class TranslationServiceRecognizer extends ServiceRecognizerBase {
export class TranslationServiceRecognizer extends ConversationServiceRecognizer {
private privTranslationRecognizer: TranslationRecognizer;
public constructor(
@@ -63,7 +63,10 @@ export class TranslationServiceRecognizer extends ServiceRecognizerBase {
protected async processTypeSpecificMessages(connectionMessage: SpeechConnectionMessage): Promise<boolean> {
const resultProps: PropertyCollection = new PropertyCollection();
let processed: boolean = false;
let processed: boolean = await super.processTypeSpecificMessages(connectionMessage);
if (processed) {
return true;
}
const handleTranslationPhrase = async (translatedPhrase: TranslationPhrase): Promise<void> => {
this.privRequestSession.onPhraseRecognized(this.privRequestSession.currentTurnAudioOffset + translatedPhrase.Offset + translatedPhrase.Duration);
@@ -300,7 +303,7 @@ export class TranslationServiceRecognizer extends ServiceRecognizerBase {
let resultReason: ResultReason;
if (serviceResult instanceof TranslationPhrase) {
if (serviceResult.Translation.TranslationStatus === TranslationStatus.Success) {
if (!!serviceResult.Translation && serviceResult.Translation.TranslationStatus === TranslationStatus.Success) {
resultReason = ResultReason.TranslatedSpeech;
} else {
resultReason = ResultReason.RecognizedSpeech;

View file

@@ -62,16 +62,25 @@ class ConversationTranslationRecognizer extends TranslationRecognizer {
this.privSpeechState = SpeechState.Inactive;
};
this.recognizing = (tr: TranslationRecognizer, e: TranslationRecognitionEventArgs): void => {
if (!!this.privTranslator.recognizing) {
this.privTranslator.recognizing(this.privTranslator, e);
}
};
// eslint-disable-next-line @typescript-eslint/no-misused-promises
this.recognized = async (tr: TranslationRecognizer, e: TranslationRecognitionEventArgs): Promise<void> => {
// TODO: add support for getting recognitions from here if own speech
// if there is an error connecting to the conversation service from the speech service the error will be returned in the ErrorDetails field.
if (e.result?.errorDetails) {
await this.cancelSpeech();
// TODO: format the error message contained in 'errorDetails'
this.fireCancelEvent(e.result.errorDetails);
} else {
if (!!this.privTranslator.recognized) {
this.privTranslator.recognized(this.privTranslator, e);
}
}
return;
};
// eslint-disable-next-line @typescript-eslint/no-misused-promises
@ -146,9 +155,15 @@ export class ConversationTranslator extends ConversationCommon implements IConve
public sessionStarted: (sender: ConversationHandler, event: SessionEventArgs) => void;
public sessionStopped: (sender: ConversationHandler, event: SessionEventArgs) => void;
public textMessageReceived: (sender: IConversationTranslator, event: ConversationTranslationEventArgs) => void;
// Callbacks for whole conversation results
public transcribed: (sender: IConversationTranslator, event: ConversationTranslationEventArgs) => void;
public transcribing: (sender: IConversationTranslator, event: ConversationTranslationEventArgs) => void;
// Callbacks for detecting speech/translation results from self
public recognized: (sender: IConversationTranslator, event: TranslationRecognitionEventArgs) => void;
public recognizing: (sender: IConversationTranslator, event: TranslationRecognitionEventArgs) => void;
private privSpeechRecognitionLanguage: string;
private privProperties: PropertyCollection;
private privIsDisposed: boolean;

View file

@@ -31,6 +31,10 @@ export class TranslationRecognitionResult extends SpeechRecognitionResult {
this.privTranslations = translations;
}
public static fromSpeechRecognitionResult(result: SpeechRecognitionResult): TranslationRecognitionResult {
return new TranslationRecognitionResult(undefined, result.resultId, result.reason, result.text, result.duration, result.offset, result.errorDetails, result.json, result.properties);
}
/**
* Presents the translation results. Each item in the dictionary represents
* a translation result in one of target languages, where the key is the name

View file

@@ -591,18 +591,111 @@ describe("conversation service tests", () => {
done(error);
}
});
ct.transcribed = ((s: sdk.ConversationTranslator, e: sdk.ConversationTranslationEventArgs) => {
expect(e.result.text).toContain("weather");
ct.stopTranscribingAsync(
() => {
ct.leaveConversationAsync(() => {
c.endConversationAsync(
done,
(e: string) => { done(e); });
ct.recognized = ((s: sdk.ConversationTranslator, e: sdk.TranslationRecognitionEventArgs) => {
if (e.result.text !== "") {
expect(e.result.text).toContain("weather");
ct.stopTranscribingAsync(
() => {
ct.leaveConversationAsync(() => {
c.endConversationAsync(
done,
(e: string) => { done(e); });
},
(e: string) => { done(e); });
},
(e: string) => { done(e); });
},
(e: string) => { done(e); });
}
});
ct.transcribed = ((s: sdk.ConversationTranslator, e: sdk.ConversationTranslationEventArgs) => {
expect(e.result.text).toContain("weather");
});
const lang: string = "en-US";
const nickname: string = "Tester";
ct.joinConversationAsync(c.conversationId, nickname, lang,
(() => {
// continue
}),
((error: any) => {
done(error);
}));
});
}),
((error: any) => {
done();
}));
});
test("Start Conversation, join as host and connect to CTS endpoint", (done: jest.DoneCallback) => {
// eslint-disable-next-line no-console
console.info("Start Conversation, join as host and connect to CTS endpoint");
// start a conversation
const config = sdk.SpeechTranslationConfig.fromSubscription(Settings.ConversationTranscriptionKey, Settings.ConversationTranscriptionRegion);
if (endpointHost !== "") { config.setProperty(sdk.PropertyId[sdk.PropertyId.ConversationTranslator_Host], endpointHost); }
config.setProperty(sdk.PropertyId[sdk.PropertyId.SpeechServiceConnection_Endpoint], Settings.ConversationTranslatorSwedenEndpoint);
const c: sdk.Conversation = sdk.Conversation.createConversationAsync(config, (() => {
objsToClose.push(c);
// audio config
const audioConfig: sdk.AudioConfig = WaveFileAudioInput.getAudioConfigFromFile(Settings.WaveFile);
objsToClose.push(audioConfig);
const ct: sdk.ConversationTranslator = new sdk.ConversationTranslator(audioConfig);
objsToClose.push(ct);
if (endpointHost !== "") { ct.properties.setProperty(sdk.PropertyId.ConversationTranslator_Host, endpointHost); }
ct.properties.setProperty(sdk.PropertyId[sdk.PropertyId.SpeechServiceConnection_Endpoint], Settings.ConversationTranslatorSwedenEndpoint);
const propName: string = "foo";
const propValue: string = "bar";
ct.setServiceProperty(propName, propValue);
const currentProperties: IStringDictionary<string> = JSON.parse(ct.properties.getProperty(ServicePropertiesPropertyName, "{}")) as IStringDictionary<string>;
expect(currentProperties[propName]).toEqual(propValue);
c.startConversationAsync(() => {
// Check that uri for service connection contains service property and value
const detachObject: IDetachable = Events.instance.attachListener({
onEvent: (event: PlatformEvent): void => {
if (event instanceof ConnectionStartEvent) {
const connectionEvent: ConnectionStartEvent = event as ConnectionStartEvent;
const uri: string = connectionEvent.uri;
expect(uri).not.toBeUndefined();
if(!uri.includes("capito")){
// Make sure there's only a single ? in the URL.
expect(uri.indexOf("?")).toEqual(uri.lastIndexOf("?"));
expect(uri).toContain(`${propName}=${propValue}`);
void detachObject.detach();
}
}
},
});
ct.participantsChanged = ((s: sdk.ConversationTranslator, e: sdk.ConversationParticipantsChangedEventArgs) => {
try {
ct.startTranscribingAsync();
} catch (error) {
done(error);
}
});
ct.recognized = ((s: sdk.ConversationTranslator, e: sdk.TranslationRecognitionEventArgs) => {
if (e.result.text !== "") {
expect(e.result.text).toContain("weather");
ct.stopTranscribingAsync(
() => {
ct.leaveConversationAsync(() => {
c.endConversationAsync(
done,
(e: string) => { done(e); });
},
(e: string) => { done(e); });
},
(e: string) => { done(e); });
}
});
const lang: string = "en-US";
@@ -621,7 +714,6 @@ describe("conversation service tests", () => {
}));
});
});
// Conversation Translator tests: begin
describe("conversation translator constructor tests", () => {

View file

@@ -14,6 +14,8 @@ export class Settings {
public static SpeechTestEndpointId: string = "<<YOUR_TEST_ENDPOINT_ID>>";
public static ConversationTranslatorSwedenEndpoint: string = "wss://transcribe.westus.cts.speech.microsoft.com/speech/recognition/dynamicaudio";
// Endpoint and key for timeout testing.
// Endpoint should reduce standard speech timeout to value specified in SpeechServiceTimeoutSeconds
// If undefined, production timeout of 10 seconds will be used, but at the cost of greatly increased test