From 7be3deda65e88864479b4bb01ef7c51957ac67cd Mon Sep 17 00:00:00 2001
From: Andre Natal
Date: Thu, 11 Sep 2014 10:47:00 -0400
Subject: [PATCH] Bug 1051118 - Pass the sample rate of captured audio to speech recognition services. r=smaug

---
 .../webspeech/recognition/SpeechRecognition.cpp    | 15 ++++++++-------
 .../webspeech/recognition/SpeechRecognition.h      |  6 ++++--
 .../recognition/SpeechStreamListener.cpp           | 17 ++++++++++-------
 .../recognition/SpeechStreamListener.h             |  2 +-
 .../recognition/nsISpeechRecognitionService.idl    |  4 ++--
 .../test/FakeSpeechRecognitionService.cpp          |  2 +-
 6 files changed, 26 insertions(+), 20 deletions(-)

diff --git a/content/media/webspeech/recognition/SpeechRecognition.cpp b/content/media/webspeech/recognition/SpeechRecognition.cpp
index 8fbe9e116504..490f258f40cf 100644
--- a/content/media/webspeech/recognition/SpeechRecognition.cpp
+++ b/content/media/webspeech/recognition/SpeechRecognition.cpp
@@ -313,7 +313,7 @@ SpeechRecognition::Transition(SpeechEvent* aEvent)
  * Returns the number of samples that were processed.
  */
 uint32_t
-SpeechRecognition::ProcessAudioSegment(AudioSegment* aSegment)
+SpeechRecognition::ProcessAudioSegment(AudioSegment* aSegment, TrackRate aTrackRate)
 {
   AudioSegment::ChunkIterator iterator(*aSegment);
   uint32_t samples = 0;
@@ -324,7 +324,7 @@ SpeechRecognition::ProcessAudioSegment(AudioSegment* aSegment)
     iterator.Next();
   }
 
-  mRecognitionService->ProcessAudioSegment(aSegment);
+  mRecognitionService->ProcessAudioSegment(aSegment, aTrackRate);
 
   return samples;
 }
@@ -400,7 +400,7 @@ SpeechRecognition::StartedAudioCapture(SpeechEvent* aEvent)
   SetState(STATE_ESTIMATING);
 
   mEndpointer.SetEnvironmentEstimationMode();
-  mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment);
+  mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);
 
   DispatchTrustedEvent(NS_LITERAL_STRING("audiostart"));
   if (mCurrentState == STATE_ESTIMATING) {
@@ -424,7 +424,7 @@ SpeechRecognition::WaitForEstimation(SpeechEvent* aEvent)
 {
   SetState(STATE_ESTIMATING);
 
-  mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment);
+  mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);
   if (mEstimationSamples > kESTIMATION_SAMPLES) {
     mEndpointer.SetUserInputMode();
     SetState(STATE_WAITING_FOR_SPEECH);
@@ -436,7 +436,7 @@ SpeechRecognition::DetectSpeech(SpeechEvent* aEvent)
 {
   SetState(STATE_WAITING_FOR_SPEECH);
 
-  ProcessAudioSegment(aEvent->mAudioSegment);
+  ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);
   if (mEndpointer.DidStartReceivingSpeech()) {
     mSpeechDetectionTimer->Cancel();
     SetState(STATE_RECOGNIZING);
@@ -449,7 +449,7 @@ SpeechRecognition::WaitForSpeechEnd(SpeechEvent* aEvent)
 {
   SetState(STATE_RECOGNIZING);
 
-  ProcessAudioSegment(aEvent->mAudioSegment);
+  ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);
 
   if (mEndpointer.speech_input_complete()) {
     DispatchTrustedEvent(NS_LITERAL_STRING("speechend"));
@@ -828,7 +828,7 @@ SpeechRecognition::CreateAudioSegment(nsTArray<nsRefPtr<SharedBuffer>>& aChunks)
 void
 SpeechRecognition::FeedAudioData(already_AddRefed<SharedBuffer> aSamples,
                                  uint32_t aDuration,
-                                 MediaStreamListener* aProvider)
+                                 MediaStreamListener* aProvider, TrackRate aTrackRate)
 {
   NS_ASSERTION(!NS_IsMainThread(),
                "FeedAudioData should not be called in the main thread");
@@ -876,6 +876,7 @@ SpeechRecognition::FeedAudioData(already_AddRefed<SharedBuffer> aSamples,
   nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_AUDIO_DATA);
   event->mAudioSegment = segment;
   event->mProvider = aProvider;
+  event->mTrackRate = aTrackRate;
 
   NS_DispatchToMainThread(event);
   return;
diff --git a/content/media/webspeech/recognition/SpeechRecognition.h b/content/media/webspeech/recognition/SpeechRecognition.h
index 331e09a5baba..ea5aa64a4534 100644
--- a/content/media/webspeech/recognition/SpeechRecognition.h
+++ b/content/media/webspeech/recognition/SpeechRecognition.h
@@ -130,7 +130,7 @@ public:
   uint32_t FillSamplesBuffer(const int16_t* aSamples, uint32_t aSampleCount);
   uint32_t SplitSamplesBuffer(const int16_t* aSamplesBuffer, uint32_t aSampleCount, nsTArray<nsRefPtr<SharedBuffer>>& aResult);
   AudioSegment* CreateAudioSegment(nsTArray<nsRefPtr<SharedBuffer>>& aChunks);
-  void FeedAudioData(already_AddRefed<SharedBuffer> aSamples, uint32_t aDuration, MediaStreamListener* aProvider);
+  void FeedAudioData(already_AddRefed<SharedBuffer> aSamples, uint32_t aDuration, MediaStreamListener* aProvider, TrackRate aTrackRate);
 
   static struct TestConfig
   {
@@ -211,7 +211,7 @@ private:
   NS_IMETHOD StartRecording(DOMMediaStream* aDOMStream);
   NS_IMETHOD StopRecording();
 
-  uint32_t ProcessAudioSegment(AudioSegment* aSegment);
+  uint32_t ProcessAudioSegment(AudioSegment* aSegment, TrackRate aTrackRate);
 
   void NotifyError(SpeechEvent* aEvent);
   void ProcessEvent(SpeechEvent* aEvent);
@@ -266,6 +266,7 @@ public:
   , mError(0)
   , mRecognition(aRecognition)
   , mType(aType)
+  , mTrackRate(0)
   {
   }
 
@@ -286,6 +287,7 @@ private:
   // event gets processed.
   nsRefPtr<MediaStreamListener> mProvider;
   SpeechRecognition::EventType mType;
+  TrackRate mTrackRate;
 };
 
 } // namespace dom
diff --git a/content/media/webspeech/recognition/SpeechStreamListener.cpp b/content/media/webspeech/recognition/SpeechStreamListener.cpp
index d8f77c3bb9e4..a9a4492f33c6 100644
--- a/content/media/webspeech/recognition/SpeechStreamListener.cpp
+++ b/content/media/webspeech/recognition/SpeechStreamListener.cpp
@@ -50,18 +50,20 @@ SpeechStreamListener::NotifyQueuedTrackChanges(MediaStreamGraph* aGraph,
     if (iterator->IsNull()) {
       nsTArray<int16_t> nullData;
       PodZero(nullData.AppendElements(duration), duration);
-      ConvertAndDispatchAudioChunk(duration, iterator->mVolume, nullData.Elements());
+      ConvertAndDispatchAudioChunk(duration, iterator->mVolume, nullData.Elements(), aTrackRate);
     } else {
       AudioSampleFormat format = iterator->mBufferFormat;
 
       MOZ_ASSERT(format == AUDIO_FORMAT_S16 || format == AUDIO_FORMAT_FLOAT32);
 
       if (format == AUDIO_FORMAT_S16) {
-        ConvertAndDispatchAudioChunk(duration, iterator->mVolume,
-                                     static_cast<const int16_t*>(iterator->mChannelData[0]));
+        ConvertAndDispatchAudioChunk(duration, iterator->mVolume,
+                                     static_cast<const int16_t*>(iterator->mChannelData[0]),
+                                     aTrackRate);
       } else if (format == AUDIO_FORMAT_FLOAT32) {
-        ConvertAndDispatchAudioChunk(duration, iterator->mVolume,
-                                     static_cast<const float*>(iterator->mChannelData[0]));
+        ConvertAndDispatchAudioChunk(duration, iterator->mVolume,
+                                     static_cast<const float*>(iterator->mChannelData[0]),
+                                     aTrackRate);
       }
     }
 
@@ -71,7 +73,8 @@ SpeechStreamListener::NotifyQueuedTrackChanges(MediaStreamGraph* aGraph,
 
 template<typename SampleFormatType> void
 SpeechStreamListener::ConvertAndDispatchAudioChunk(int aDuration, float aVolume,
-                                                   SampleFormatType* aData)
+                                                   SampleFormatType* aData,
+                                                   TrackRate aTrackRate)
 {
   nsRefPtr<SharedBuffer> samples(SharedBuffer::Create(aDuration *
                                                       1 * // channel
@@ -80,7 +83,7 @@ SpeechStreamListener::ConvertAndDispatchAudioChunk(int aDuration, float aVolume,
                                                       sizeof(int16_t)));
   int16_t* to = static_cast<int16_t*>(samples->Data());
   ConvertAudioSamplesWithScale(aData, to, aDuration, aVolume);
-  mRecognition->FeedAudioData(samples.forget(), aDuration, this);
+  mRecognition->FeedAudioData(samples.forget(), aDuration, this, aTrackRate);
 }
 
 void
diff --git a/content/media/webspeech/recognition/SpeechStreamListener.h b/content/media/webspeech/recognition/SpeechStreamListener.h
index 55b8db95faf0..74da950c60da 100644
--- a/content/media/webspeech/recognition/SpeechStreamListener.h
+++ b/content/media/webspeech/recognition/SpeechStreamListener.h
@@ -35,7 +35,7 @@ public:
 
 private:
   template<typename SampleFormatType>
-  void ConvertAndDispatchAudioChunk(int aDuration, float aVolume, SampleFormatType* aData);
+  void ConvertAndDispatchAudioChunk(int aDuration, float aVolume, SampleFormatType* aData, TrackRate aTrackRate);
 
   nsRefPtr<SpeechRecognition> mRecognition;
 };
diff --git a/content/media/webspeech/recognition/nsISpeechRecognitionService.idl b/content/media/webspeech/recognition/nsISpeechRecognitionService.idl
index 884778d522ca..45299b42c0a2 100644
--- a/content/media/webspeech/recognition/nsISpeechRecognitionService.idl
+++ b/content/media/webspeech/recognition/nsISpeechRecognitionService.idl
@@ -12,10 +12,10 @@ native SpeechRecognitionWeakPtr(mozilla::WeakPtr<mozilla::dom::SpeechRecognition>);
 [ptr] native AudioSegmentPtr(mozilla::AudioSegment);
 
-[uuid(cb98d929-81cd-4a51-a214-80d3e6281d24)]
+[uuid(857f3fa2-a980-4d3e-a959-a2f53af74232)]
 interface nsISpeechRecognitionService : nsISupports {
     void initialize(in SpeechRecognitionWeakPtr aSpeechRecognition);
-    void processAudioSegment(in AudioSegmentPtr aAudioSegment);
+    void processAudioSegment(in AudioSegmentPtr aAudioSegment, in long aSampleRate);
     void soundEnd();
     void abort();
 };
 
diff --git a/content/media/webspeech/recognition/test/FakeSpeechRecognitionService.cpp b/content/media/webspeech/recognition/test/FakeSpeechRecognitionService.cpp
index 422b8b9054f8..d7a119279fef 100644
--- a/content/media/webspeech/recognition/test/FakeSpeechRecognitionService.cpp
+++ b/content/media/webspeech/recognition/test/FakeSpeechRecognitionService.cpp
@@ -40,7 +40,7 @@ FakeSpeechRecognitionService::Initialize(WeakPtr<SpeechRecognition> aSpeechRecognition)
 }
 
 NS_IMETHODIMP
-FakeSpeechRecognitionService::ProcessAudioSegment(AudioSegment* aAudioSegment)
+FakeSpeechRecognitionService::ProcessAudioSegment(AudioSegment* aAudioSegment, int32_t aSampleRate)
 {
   return NS_OK;
 }
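
For context, a minimal sketch (not part of the patch) of how a concrete nsISpeechRecognitionService implementation might consume the new sample-rate argument. ExampleSpeechRecognitionService, mEngine, and FeedPcm16 are hypothetical names introduced here for illustration; only the ProcessAudioSegment(AudioSegment*, int32_t) signature and the ChunkIterator usage come from the patch above.

NS_IMETHODIMP
ExampleSpeechRecognitionService::ProcessAudioSegment(AudioSegment* aAudioSegment,
                                                     int32_t aSampleRate)
{
  // Walk the segment chunk by chunk, as SpeechRecognition::ProcessAudioSegment does.
  AudioSegment::ChunkIterator iterator(*aAudioSegment);
  while (!iterator.IsEnded()) {
    // Before this patch a service had to assume a fixed capture rate; with
    // aSampleRate it can hand PCM to its engine at the true track rate, e.g.:
    // mEngine->FeedPcm16(static_cast<const int16_t*>(iterator->mChannelData[0]),
    //                    iterator->GetDuration(), aSampleRate); // hypothetical engine API
    iterator.Next();
  }
  return NS_OK;
}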