Bug 1397793 - Move to APM - Part 2 - Actual processing. r=pehrsons

This one is also long, but simple.

First, we switch to floats everywhere. This allows working at any rate, is
more flexible with channel layouts, and targets a stable API (see
audio_processing.h in webrtc.org).
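
For reference, the float entry points we now target look roughly like this
(abridged from webrtc.org's audio_processing.h, not verbatim): planar float
buffers plus explicit StreamConfigs, so the rate and channel layout are
described per call.

namespace webrtc {
class AudioProcessing {
public:
  // Near-end (microphone) data: deinterleaved float channels in and out.
  virtual int ProcessStream(const float* const* src,
                            const StreamConfig& input_config,
                            const StreamConfig& output_config,
                            float* const* dest) = 0;
  // Far-end (speaker) data, fed back to the echo canceller.
  virtual int ProcessReverseStream(const float* const* src,
                                   const StreamConfig& input_config,
                                   const StreamConfig& output_config,
                                   float* const* dest) = 0;
};
} // namespace webrtc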

Then, 10ms worth of audio (already at the graph rate) is popped from the
lock-free queue (fed on the other end by the MSG mixer), and the following is
done (see the sketch after this list):
- Down mixing to stereo (if needed)
- De-interleaving into planar buffer
- Prepare input and output config
- Actually make the API call
- Free the data
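
A sketch of that far-end path (illustrative only, assuming a float build,
i.e. MOZ_SAMPLE_TYPE_F32, so AudioDataValue is float; FeedFarEndSketch is a
hypothetical helper, error handling is trimmed, and a scratch buffer stands
in for the patch's in-place deinterleave):

// Hypothetical helper showing the shape of the far-end path.
void FeedFarEndSketch(webrtc::AudioProcessing* aApm,
                      AudioOutputObserver* aObserver)
{
  while (aObserver->Size() > 0) {
    // Pop() transfers ownership; the chunk is freed at the end of the
    // iteration (bug 1414837 tracks replacing this with a ring buffer).
    UniquePtr<FarEndAudioChunk> chunk(aObserver->Pop());
    if (!chunk) {
      continue;
    }
    uint32_t channels = aObserver->PlayoutChannels(); // <= MAX_CHANNELS after downmix
    uint32_t frames = chunk->mSamples;
    // Deinterleave [L R L R ...] into planar [L L ... | R R ...].
    nsTArray<float> planarStorage;
    planarStorage.SetLength(channels * frames);
    AutoTArray<float*, MAX_CHANNELS> planar;
    for (uint32_t c = 0; c < channels; ++c) {
      planar.AppendElement(planarStorage.Elements() + c * frames);
    }
    Deinterleave(chunk->mData, frames, channels, planar.Elements());
    // Identical input and output configs let the APM skip internal
    // conversions; the API still requires a valid output pointer.
    webrtc::StreamConfig config(aObserver->PlayoutFrequency(), channels,
                                false /* no keyboard channel */);
    aApm->ProcessReverseStream(planar.Elements(), config, config,
                               planar.Elements());
  }
}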

Eventually, we should use a ring buffer and not have to free any data, and we
should drop the lock-free queue and process the reverse stream synchronously,
but this is enough code already.

Then, the actual mic data processing:
- Pop a packet from the packetizer (which gives us 10ms worth of audio; note
that we switch from int16_t to float, i.e. we don't do this conversion
anymore).
- Convert to planar buffers by deinterleaving
- Prepare input and output config
- Allocate a SharedBuffer of the right size
- Process the data with the processing algorithm selected in UpdateSingleSource
- Append to a MediaSegment, and append that to the right MediaStreamTrack for
the correct SourceMediaStream (the data is already planar, as needed). A
sketch of this hand-off follows the list.
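
The hand-off at the end of this list looks roughly like the following
(illustrative only: frames, channels, principalHandle, source and trackId are
placeholder names, and the per-channel pointers alias a single SharedBuffer
allocation that AppendFrames takes ownership of):

// Illustrative: hand 10ms of processed planar float audio to the MSG.
RefPtr<SharedBuffer> out =
  SharedBuffer::Create(frames * channels * sizeof(float));
AutoTArray<const float*, MAX_CHANNELS> planar;
for (uint32_t c = 0; c < channels; ++c) {
  planar.AppendElement(static_cast<float*>(out->Data()) + c * frames);
}
AudioSegment segment;
segment.AppendFrames(out.forget(), planar, frames, principalHandle);
// The SourceMediaStream consumes the segment.
source->AppendToTrack(trackId, &segment);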

MozReview-Commit-ID: 2IjgHP0GAmw

--HG--
extra : rebase_source : 1e08c4a781db8778e0532f9ef1a8e369513a2c66
extra : source : 0107b3feb84bbe0e643f505ec58e303dfd94e1a7
Paul Adenot 2017-12-04 13:34:14 +01:00
Parent 49545441d4
Commit 14fb321adc
3 changed files: 196 additions and 32 deletions

@@ -17,13 +17,13 @@ class SingleRwFifo;
namespace mozilla {
typedef struct FarEndAudioChunk_ {
uint16_t mSamples;
size_t mSamples;
bool mOverrun;
int16_t mData[1]; // variable-length
AudioDataValue mData[1]; // variable-length
} FarEndAudioChunk;
// This class is used to packetize and send the mixed audio from an MSG, in
// int16, to the AEC module of WebRTC.org.
// float, to the AEC module of WebRTC.org.
class AudioOutputObserver
{
public:

@@ -516,7 +516,7 @@ private:
const UniquePtr<webrtc::AudioProcessing> mAudioProcessing;
// accessed from the GraphDriver thread except for deletion
nsAutoPtr<AudioPacketizer<AudioDataValue, AudioDataValue>> mPacketizer;
nsAutoPtr<AudioPacketizer<AudioDataValue, float>> mPacketizer;
ScopedCustomReleasePtr<webrtc::VoEExternalMedia> mVoERenderListener;
// mMonitor protects mSources[] and mPrinicpalIds[] access/changes, and
@@ -549,7 +549,8 @@ private:
MediaEnginePrefs mLastPrefs;
AlignedFloatBuffer mInputBuffer;
AlignedFloatBuffer mInputDownmixBuffer;
AlignedFloatBuffer mDeinterleavedBuffer;
AlignedAudioBuffer mInputDownmixBuffer;
};
class MediaEngineWebRTC : public MediaEngine

@@ -22,13 +22,6 @@
using namespace webrtc;
#define CHANNELS 1
#define ENCODING "L16"
#define DEFAULT_PORT 5555
#define SAMPLE_RATE(freq) ((freq)*2*8) // bps, 16-bit samples
#define SAMPLE_LENGTH(freq) (((freq)*10)/1000)
// These are restrictions from the webrtc.org code
#define MAX_CHANNELS 2
#define MAX_SAMPLING_FREQ 48000 // Hz - multiple of 100
@@ -144,7 +137,7 @@ AudioOutputObserver::InsertFarEnd(const AudioDataValue *aBuffer, uint32_t aFrame
while (aFrames) {
if (!mSaved) {
mSaved = (FarEndAudioChunk *) moz_xmalloc(sizeof(FarEndAudioChunk) +
(mChunkSize * channels - 1)*sizeof(int16_t));
(mChunkSize * channels - 1)*sizeof(AudioDataValue));
mSaved->mSamples = mChunkSize;
mSaved->mOverrun = aOverran;
aOverran = false;
@@ -154,7 +147,7 @@ AudioOutputObserver::InsertFarEnd(const AudioDataValue *aBuffer, uint32_t aFrame
to_copy = aFrames;
}
int16_t* dest = &(mSaved->mData[mSamplesSaved * channels]);
AudioDataValue* dest = &(mSaved->mData[mSamplesSaved * channels]);
if (aChannels > MAX_CHANNELS) {
AudioConverter converter(AudioConfig(aChannels, 0), AudioConfig(channels, 0));
converter.Process(mDownmixBuffer, aBuffer, to_copy);
@@ -165,7 +158,7 @@ AudioOutputObserver::InsertFarEnd(const AudioDataValue *aBuffer, uint32_t aFrame
#ifdef LOG_FAREND_INSERTION
if (fp) {
fwrite(&(mSaved->mData[mSamplesSaved * aChannels]), to_copy * aChannels, sizeof(int16_t), fp);
fwrite(&(mSaved->mData[mSamplesSaved * aChannels]), to_copy * aChannels, sizeof(AudioDataValue), fp);
}
#endif
aFrames -= to_copy;
@@ -203,7 +196,7 @@ MediaEngineWebRTCMicrophoneSource::MediaEngineWebRTCMicrophoneSource(
, mExtendedFilter(aExtendedFilter)
, mTrackID(TRACK_NONE)
, mStarted(false)
, mSampleFrequency(MediaEngine::DEFAULT_SAMPLE_RATE)
, mSampleFrequency(MediaEngine::USE_GRAPH_RATE)
, mTotalFrames(0)
, mLastLogFrames(0)
, mSkipProcessing(false)
@@ -454,12 +447,24 @@ MediaEngineWebRTCMicrophoneSource::UpdateSingleSource(
// (Bug 1238038) fail allocation for a second device
return NS_ERROR_FAILURE;
}
if (mAudioInput->SetRecordingDevice(mCapIndex)) {
return NS_ERROR_FAILURE;
}
mAudioInput->SetUserChannelCount(prefs.mChannels);
if (!AllocChannel()) {
FreeChannel();
LOG(("Audio device is not initalized"));
return NS_ERROR_FAILURE;
}
LOG(("Audio device %d allocated", mCapIndex));
{
// Update with the actual applied channelCount in order
// to store it in settings.
uint32_t channelCount = 0;
mAudioInput->GetChannelCount(channelCount);
MOZ_ASSERT(channelCount > 0);
prefs.mChannels = channelCount;
}
break;
case kStarted:
@@ -676,6 +681,7 @@ MediaEngineWebRTCMicrophoneSource::NotifyOutputData(MediaStreamGraph* aGraph,
}
}
// Only called if we're not in passthrough mode
void
MediaEngineWebRTCMicrophoneSource::PacketizeAndProcess(MediaStreamGraph* aGraph,
const AudioDataValue* aBuffer,
@@ -691,7 +697,7 @@ MediaEngineWebRTCMicrophoneSource::PacketizeAndProcess(MediaStreamGraph* aGraph,
mPacketizer->Channels() != aChannels) {
// It's ok to drop the audio still in the packetizer here.
mPacketizer =
new AudioPacketizer<AudioDataValue, AudioDataValue>(aRate/100, aChannels);
new AudioPacketizer<AudioDataValue, float>(aRate/100, aChannels);
}
// On initial capture, throw away all far-end data except the most recent sample
@@ -704,6 +710,110 @@ MediaEngineWebRTCMicrophoneSource::PacketizeAndProcess(MediaStreamGraph* aGraph,
}
}
// Feed the far-end audio data (speakers) to the feedback input of the AEC.
while (mAudioOutputObserver->Size() > 0) {
// Bug 1414837: This will call `free()`, and we should remove it.
// Pop gives ownership.
UniquePtr<FarEndAudioChunk> buffer(mAudioOutputObserver->Pop()); // only call if size() > 0
if (!buffer) {
continue;
}
AudioDataValue* packetDataPointer = buffer->mData;
AutoTArray<AudioDataValue*, MAX_CHANNELS> deinterleavedPacketDataChannelPointers;
AudioDataValue* interleavedFarend = nullptr;
uint32_t channelCountFarend = 0;
uint32_t framesPerPacketFarend = 0;
// Downmix from aChannels to MAX_CHANNELS if needed
if (mAudioOutputObserver->PlayoutChannels() > MAX_CHANNELS) {
AudioConverter converter(AudioConfig(aChannels, 0, AudioConfig::FORMAT_DEFAULT),
AudioConfig(MAX_CHANNELS, 0, AudioConfig::FORMAT_DEFAULT));
framesPerPacketFarend =
buffer->mSamples;
framesPerPacketFarend =
converter.Process(mInputDownmixBuffer,
packetDataPointer,
framesPerPacketFarend);
interleavedFarend = mInputDownmixBuffer.Data();
channelCountFarend = MAX_CHANNELS;
deinterleavedPacketDataChannelPointers.SetLength(MAX_CHANNELS);
} else {
uint32_t outputChannels = mAudioOutputObserver->PlayoutChannels();
interleavedFarend = packetDataPointer;
channelCountFarend = outputChannels;
framesPerPacketFarend = buffer->mSamples;
deinterleavedPacketDataChannelPointers.SetLength(outputChannels);
}
MOZ_ASSERT(interleavedFarend &&
(channelCountFarend == 1 || channelCountFarend == 2) &&
framesPerPacketFarend);
offset = 0;
for (size_t i = 0; i < deinterleavedPacketDataChannelPointers.Length(); ++i) {
deinterleavedPacketDataChannelPointers[i] = packetDataPointer + offset;
offset += framesPerPacketFarend;
}
// deinterleave back into the FarEndAudioChunk buffer to save an alloc.
// There is enough room because either there is the same number of
// channels/frames or we've just downmixed.
Deinterleave(interleavedFarend,
framesPerPacketFarend,
channelCountFarend,
deinterleavedPacketDataChannelPointers.Elements());
// Having the same config for input and output means we potentially save
// some CPU. We won't need the output here, the API forces us to set a
// valid pointer with enough space.
StreamConfig inputConfig(mAudioOutputObserver->PlayoutFrequency(),
channelCountFarend,
false /* we don't use typing detection*/);
StreamConfig outputConfig = inputConfig;
// Prepare a channel pointers array, with enough storage for the
// frames.
//
// If this is a platform that uses s16 for audio input and output,
// convert to floats, the APM API we use only accepts floats.
float* inputData = nullptr;
#ifdef MOZ_SAMPLE_TYPE_S16
// Convert to floats, use mInputBuffer for this.
size_t sampleCount = framesPerPacketFarend * channelCountFarend;
if (mInputBuffer.Length() < sampleCount) {
mInputBuffer.SetLength(sampleCount);
}
ConvertAudioSamples(buffer->mData, mInputBuffer.Data(), sampleCount);
inputData = mInputBuffer.Data();
#else // MOZ_SAMPLE_TYPE_F32
inputData = buffer->mData;
#endif
AutoTArray<float*, MAX_CHANNELS> channelsPointers;
channelsPointers.SetLength(channelCountFarend);
offset = 0;
for (size_t i = 0; i < channelsPointers.Length(); ++i) {
channelsPointers[i] = inputData + offset;
offset += framesPerPacketFarend;
}
// Passing the same pointers here saves a copy inside this function.
int err =
mAudioProcessing->ProcessReverseStream(channelsPointers.Elements(),
inputConfig,
outputConfig,
channelsPointers.Elements());
if (err) {
MOZ_LOG(GetMediaManagerLog(), LogLevel::Error,
("error in audio ProcessReverseStream(): %d", err));
return;
}
}
// Packetize our input data into 10ms chunks, deinterleave into planar channel
// buffers, process, and append to the right MediaStreamTrack.
mPacketizer->Input(aBuffer, static_cast<uint32_t>(aFrames));
while (mPacketizer->PacketsAvailable()) {
@@ -711,19 +821,72 @@ MediaEngineWebRTCMicrophoneSource::PacketizeAndProcess(MediaStreamGraph* aGraph,
mPacketizer->Channels();
if (mInputBuffer.Length() < samplesPerPacket) {
mInputBuffer.SetLength(samplesPerPacket);
mDeinterleavedBuffer.SetLength(samplesPerPacket);
}
int16_t* packet = mInputBuffer.Elements();
float* packet = mInputBuffer.Data();
mPacketizer->Output(packet);
if (aChannels > MAX_CHANNELS) {
AudioConverter converter(AudioConfig(aChannels, 0, AudioConfig::FORMAT_S16),
AudioConfig(MAX_CHANNELS, 0, AudioConfig::FORMAT_S16));
converter.Process(mInputDownmixBuffer, packet, mPacketizer->PacketSize());
mVoERender->ExternalRecordingInsertData(mInputDownmixBuffer.Data(),
mPacketizer->PacketSize() * MAX_CHANNELS,
aRate, 0);
} else {
mVoERender->ExternalRecordingInsertData(packet, samplesPerPacket, aRate, 0);
// Deinterleave the input data
// Prepare an array pointing to deinterleaved channels.
AutoTArray<float*, 8> deinterleavedPacketizedInputDataChannelPointers;
deinterleavedPacketizedInputDataChannelPointers.SetLength(aChannels);
offset = 0;
for (size_t i = 0; i < deinterleavedPacketizedInputDataChannelPointers.Length(); ++i) {
deinterleavedPacketizedInputDataChannelPointers[i] = mDeinterleavedBuffer.Data() + offset;
offset += aFrames;
}
// Deinterleave to mInputBuffer, pointed to by inputBufferChannelPointers.
Deinterleave(packet, mPacketizer->PacketSize(), aChannels,
deinterleavedPacketizedInputDataChannelPointers.Elements());
StreamConfig inputConfig(aRate,
aChannels,
false /* we don't use typing detection*/);
StreamConfig outputConfig = inputConfig;
// Bug 1404965: Get the right delay here, it saves some work down the line.
mAudioProcessing->set_stream_delay_ms(0);
// Bug 1414837: find a way to not allocate here.
RefPtr<SharedBuffer> buffer =
SharedBuffer::Create(mPacketizer->PacketSize() * aChannels * sizeof(float));
AudioSegment segment;
// Prepare channel pointers to the SharedBuffer created above.
AutoTArray<float*, 8> processedOutputChannelPointers;
AutoTArray<const float*, 8> processedOutputChannelPointersConst;
processedOutputChannelPointers.SetLength(aChannels);
processedOutputChannelPointersConst.SetLength(aChannels);
offset = 0;
for (size_t i = 0; i < processedOutputChannelPointers.Length(); ++i) {
processedOutputChannelPointers[i] = static_cast<float*>(buffer->Data()) + offset;
processedOutputChannelPointersConst[i] = static_cast<float*>(buffer->Data()) + offset;
offset += aFrames;
}
mAudioProcessing->ProcessStream(deinterleavedPacketizedInputDataChannelPointers.Elements(),
inputConfig,
outputConfig,
processedOutputChannelPointers.Elements());
MonitorAutoLock lock(mMonitor);
if (mState != kStarted)
return;
for (size_t i = 0; i < mSources.Length(); ++i) {
if (!mSources[i]) { // why ?!
continue;
}
// We already have planar audio data of the right format. Insert into the
// MSG.
MOZ_ASSERT(processedOutputChannelPointers.Length() == aChannels);
segment.AppendFrames(buffer.forget(),
processedOutputChannelPointersConst,
mPacketizer->PacketSize(),
mPrincipalHandles[i]);
mSources[i]->AppendToTrack(mTrackID, &segment);
}
}
}
@@ -750,7 +913,7 @@ MediaEngineWebRTCMicrophoneSource::InsertInGraph(const T* aBuffer,
}
size_t len = mSources.Length();
for (size_t i = 0; i < len; i++) {
for (size_t i = 0; i < len; ++i) {
if (!mSources[i]) {
continue;
}
@@ -766,7 +929,7 @@ MediaEngineWebRTCMicrophoneSource::InsertInGraph(const T* aBuffer,
MOZ_ASSERT(aChannels >= 1 && aChannels <= 8,
"Support up to 8 channels");
nsAutoPtr<AudioSegment> segment(new AudioSegment());
AudioSegment segment;
RefPtr<SharedBuffer> buffer =
SharedBuffer::Create(aFrames * aChannels * sizeof(T));
AutoTArray<const T*, 8> channels;
@@ -792,11 +955,11 @@ MediaEngineWebRTCMicrophoneSource::InsertInGraph(const T* aBuffer,
}
MOZ_ASSERT(aChannels == channels.Length());
segment->AppendFrames(buffer.forget(), channels, aFrames,
segment.AppendFrames(buffer.forget(), channels, aFrames,
mPrincipalHandles[i]);
segment->GetStartTime(insertTime);
segment.GetStartTime(insertTime);
mSources[i]->AppendToTrack(mTrackID, segment);
mSources[i]->AppendToTrack(mTrackID, &segment);
}
}