зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1051118 - Pass the sample rate of captured audio to speech recognition services. r=smaug
This commit is contained in:
Родитель
ac745b44f1
Коммит
7be3deda65
|
@@ -313,7 +313,7 @@ SpeechRecognition::Transition(SpeechEvent* aEvent)
|
|||
* Returns the number of samples that were processed.
|
||||
*/
|
||||
uint32_t
|
||||
SpeechRecognition::ProcessAudioSegment(AudioSegment* aSegment)
|
||||
SpeechRecognition::ProcessAudioSegment(AudioSegment* aSegment, TrackRate aTrackRate)
|
||||
{
|
||||
AudioSegment::ChunkIterator iterator(*aSegment);
|
||||
uint32_t samples = 0;
|
||||
|
@@ -324,7 +324,7 @@ SpeechRecognition::ProcessAudioSegment(AudioSegment* aSegment)
|
|||
iterator.Next();
|
||||
}
|
||||
|
||||
mRecognitionService->ProcessAudioSegment(aSegment);
|
||||
mRecognitionService->ProcessAudioSegment(aSegment, aTrackRate);
|
||||
return samples;
|
||||
}
|
||||
|
||||
|
@@ -400,7 +400,7 @@ SpeechRecognition::StartedAudioCapture(SpeechEvent* aEvent)
|
|||
SetState(STATE_ESTIMATING);
|
||||
|
||||
mEndpointer.SetEnvironmentEstimationMode();
|
||||
mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment);
|
||||
mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);
|
||||
|
||||
DispatchTrustedEvent(NS_LITERAL_STRING("audiostart"));
|
||||
if (mCurrentState == STATE_ESTIMATING) {
|
||||
|
@@ -424,7 +424,7 @@ SpeechRecognition::WaitForEstimation(SpeechEvent* aEvent)
|
|||
{
|
||||
SetState(STATE_ESTIMATING);
|
||||
|
||||
mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment);
|
||||
mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);
|
||||
if (mEstimationSamples > kESTIMATION_SAMPLES) {
|
||||
mEndpointer.SetUserInputMode();
|
||||
SetState(STATE_WAITING_FOR_SPEECH);
|
||||
|
@@ -436,7 +436,7 @@ SpeechRecognition::DetectSpeech(SpeechEvent* aEvent)
|
|||
{
|
||||
SetState(STATE_WAITING_FOR_SPEECH);
|
||||
|
||||
ProcessAudioSegment(aEvent->mAudioSegment);
|
||||
ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);
|
||||
if (mEndpointer.DidStartReceivingSpeech()) {
|
||||
mSpeechDetectionTimer->Cancel();
|
||||
SetState(STATE_RECOGNIZING);
|
||||
|
@@ -449,7 +449,7 @@ SpeechRecognition::WaitForSpeechEnd(SpeechEvent* aEvent)
|
|||
{
|
||||
SetState(STATE_RECOGNIZING);
|
||||
|
||||
ProcessAudioSegment(aEvent->mAudioSegment);
|
||||
ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);
|
||||
if (mEndpointer.speech_input_complete()) {
|
||||
DispatchTrustedEvent(NS_LITERAL_STRING("speechend"));
|
||||
|
||||
|
@@ -828,7 +828,7 @@ SpeechRecognition::CreateAudioSegment(nsTArray<nsRefPtr<SharedBuffer>>& aChunks)
|
|||
void
|
||||
SpeechRecognition::FeedAudioData(already_AddRefed<SharedBuffer> aSamples,
|
||||
uint32_t aDuration,
|
||||
MediaStreamListener* aProvider)
|
||||
MediaStreamListener* aProvider, TrackRate aTrackRate)
|
||||
{
|
||||
NS_ASSERTION(!NS_IsMainThread(),
|
||||
"FeedAudioData should not be called in the main thread");
|
||||
|
@@ -876,6 +876,7 @@ SpeechRecognition::FeedAudioData(already_AddRefed<SharedBuffer> aSamples,
|
|||
nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_AUDIO_DATA);
|
||||
event->mAudioSegment = segment;
|
||||
event->mProvider = aProvider;
|
||||
event->mTrackRate = aTrackRate;
|
||||
NS_DispatchToMainThread(event);
|
||||
|
||||
return;
|
||||
|
|
|
@@ -130,7 +130,7 @@ public:
|
|||
uint32_t FillSamplesBuffer(const int16_t* aSamples, uint32_t aSampleCount);
|
||||
uint32_t SplitSamplesBuffer(const int16_t* aSamplesBuffer, uint32_t aSampleCount, nsTArray<nsRefPtr<SharedBuffer>>& aResult);
|
||||
AudioSegment* CreateAudioSegment(nsTArray<nsRefPtr<SharedBuffer>>& aChunks);
|
||||
void FeedAudioData(already_AddRefed<SharedBuffer> aSamples, uint32_t aDuration, MediaStreamListener* aProvider);
|
||||
void FeedAudioData(already_AddRefed<SharedBuffer> aSamples, uint32_t aDuration, MediaStreamListener* aProvider, TrackRate aTrackRate);
|
||||
|
||||
static struct TestConfig
|
||||
{
|
||||
|
@@ -211,7 +211,7 @@ private:
|
|||
NS_IMETHOD StartRecording(DOMMediaStream* aDOMStream);
|
||||
NS_IMETHOD StopRecording();
|
||||
|
||||
uint32_t ProcessAudioSegment(AudioSegment* aSegment);
|
||||
uint32_t ProcessAudioSegment(AudioSegment* aSegment, TrackRate aTrackRate);
|
||||
void NotifyError(SpeechEvent* aEvent);
|
||||
|
||||
void ProcessEvent(SpeechEvent* aEvent);
|
||||
|
@@ -266,6 +266,7 @@ public:
|
|||
, mError(0)
|
||||
, mRecognition(aRecognition)
|
||||
, mType(aType)
|
||||
, mTrackRate(0)
|
||||
{
|
||||
}
|
||||
|
||||
|
@@ -286,6 +287,7 @@ private:
|
|||
// event gets processed.
|
||||
nsRefPtr<MediaStreamListener> mProvider;
|
||||
SpeechRecognition::EventType mType;
|
||||
TrackRate mTrackRate;
|
||||
};
|
||||
|
||||
} // namespace dom
|
||||
|
|
|
@@ -50,18 +50,20 @@ SpeechStreamListener::NotifyQueuedTrackChanges(MediaStreamGraph* aGraph,
|
|||
if (iterator->IsNull()) {
|
||||
nsTArray<int16_t> nullData;
|
||||
PodZero(nullData.AppendElements(duration), duration);
|
||||
ConvertAndDispatchAudioChunk(duration, iterator->mVolume, nullData.Elements());
|
||||
ConvertAndDispatchAudioChunk(duration, iterator->mVolume, nullData.Elements(), aTrackRate);
|
||||
} else {
|
||||
AudioSampleFormat format = iterator->mBufferFormat;
|
||||
|
||||
MOZ_ASSERT(format == AUDIO_FORMAT_S16 || format == AUDIO_FORMAT_FLOAT32);
|
||||
|
||||
if (format == AUDIO_FORMAT_S16) {
|
||||
ConvertAndDispatchAudioChunk(duration, iterator->mVolume,
|
||||
static_cast<const int16_t*>(iterator->mChannelData[0]));
|
||||
ConvertAndDispatchAudioChunk(duration,iterator->mVolume,
|
||||
static_cast<const int16_t*>(iterator->mChannelData[0]),
|
||||
aTrackRate);
|
||||
} else if (format == AUDIO_FORMAT_FLOAT32) {
|
||||
ConvertAndDispatchAudioChunk(duration, iterator->mVolume,
|
||||
static_cast<const float*>(iterator->mChannelData[0]));
|
||||
ConvertAndDispatchAudioChunk(duration,iterator->mVolume,
|
||||
static_cast<const float*>(iterator->mChannelData[0]),
|
||||
aTrackRate);
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -71,7 +73,8 @@ SpeechStreamListener::NotifyQueuedTrackChanges(MediaStreamGraph* aGraph,
|
|||
|
||||
template<typename SampleFormatType> void
|
||||
SpeechStreamListener::ConvertAndDispatchAudioChunk(int aDuration, float aVolume,
|
||||
SampleFormatType* aData)
|
||||
SampleFormatType* aData,
|
||||
TrackRate aTrackRate)
|
||||
{
|
||||
nsRefPtr<SharedBuffer> samples(SharedBuffer::Create(aDuration *
|
||||
1 * // channel
|
||||
|
@@ -80,7 +83,7 @@ SpeechStreamListener::ConvertAndDispatchAudioChunk(int aDuration, float aVolume,
|
|||
int16_t* to = static_cast<int16_t*>(samples->Data());
|
||||
ConvertAudioSamplesWithScale(aData, to, aDuration, aVolume);
|
||||
|
||||
mRecognition->FeedAudioData(samples.forget(), aDuration, this);
|
||||
mRecognition->FeedAudioData(samples.forget(), aDuration, this, aTrackRate);
|
||||
}
|
||||
|
||||
void
|
||||
|
|
|
@@ -35,7 +35,7 @@ public:
|
|||
|
||||
private:
|
||||
template<typename SampleFormatType>
|
||||
void ConvertAndDispatchAudioChunk(int aDuration, float aVolume, SampleFormatType* aData);
|
||||
void ConvertAndDispatchAudioChunk(int aDuration, float aVolume, SampleFormatType* aData, TrackRate aTrackRate);
|
||||
nsRefPtr<SpeechRecognition> mRecognition;
|
||||
};
|
||||
|
||||
|
|
|
@@ -12,10 +12,10 @@
|
|||
native SpeechRecognitionWeakPtr(mozilla::WeakPtr<mozilla::dom::SpeechRecognition>);
|
||||
[ptr] native AudioSegmentPtr(mozilla::AudioSegment);
|
||||
|
||||
[uuid(cb98d929-81cd-4a51-a214-80d3e6281d24)]
|
||||
[uuid(857f3fa2-a980-4d3e-a959-a2f53af74232)]
|
||||
interface nsISpeechRecognitionService : nsISupports {
|
||||
void initialize(in SpeechRecognitionWeakPtr aSpeechRecognition);
|
||||
void processAudioSegment(in AudioSegmentPtr aAudioSegment);
|
||||
void processAudioSegment(in AudioSegmentPtr aAudioSegment, in long aSampleRate);
|
||||
void soundEnd();
|
||||
void abort();
|
||||
};
|
||||
|
|
|
@@ -40,7 +40,7 @@ FakeSpeechRecognitionService::Initialize(WeakPtr<SpeechRecognition> aSpeechRecog
|
|||
}
|
||||
|
||||
NS_IMETHODIMP
|
||||
FakeSpeechRecognitionService::ProcessAudioSegment(AudioSegment* aAudioSegment)
|
||||
FakeSpeechRecognitionService::ProcessAudioSegment(AudioSegment* aAudioSegment, int32_t aSampleRate)
|
||||
{
|
||||
return NS_OK;
|
||||
}
|
||||
|
|
Загрузка…
Ссылка в новой задаче