Bug 1869043 interleave audio output after AudioMixer mixing r=padenot

This will enable PlayAudio() to be used in a subsequent patch to mix audio
output for CrossGraphReceiver::EnqueueAudio().

An AudioChunk is used to clarify the planar storage and simplify use of
InterleaveAndConvertBuffer().

AudioMixer is also changed to initialize to zero only as many samples in its
buffer as will be used.  Resetting to zero is no longer performed in
FinishMixing() because that is done on first call to Mix().

Differential Revision: https://phabricator.services.mozilla.com/D198220
This commit is contained in:
Karl Tomlinson 2024-01-11 23:42:47 +00:00
Parent 1e2b9263a3
Commit 7fa20ab1f1
11 changed files with 100 additions and 146 deletions

Просмотреть файл

@ -6,6 +6,7 @@
#ifndef MOZILLA_SCRATCHBUFFER_H_
#define MOZILLA_SCRATCHBUFFER_H_
#include "AudioSegment.h"
#include "mozilla/PodOperations.h"
#include "mozilla/UniquePtr.h"
#include "nsDebug.h"
@ -80,7 +81,7 @@ class AudioCallbackBufferWrapper {
/**
* Write some frames to the internal buffer. Free space in the buffer should
* be check prior to calling this.
* be checked prior to calling these.
*/
void WriteFrames(T* aBuffer, uint32_t aFrames) {
MOZ_ASSERT(aFrames <= Available(),
@ -90,6 +91,15 @@ class AudioCallbackBufferWrapper {
FramesToSamples(mChannels, aFrames));
mSampleWriteOffset += FramesToSamples(mChannels, aFrames);
}
// Interleave the first aFrames frames of planar audio from aChunk into the
// internal buffer, converting to T and applying aChunk.mVolume via
// InterleaveAndConvertBuffer().  Free space in the buffer should be checked
// with Available() prior to calling this.
// NOTE(review): the interleave uses aChunk.ChannelCount() but the write
// offset advances by mChannels — assumes aChunk.ChannelCount() == mChannels;
// confirm against callers.
void WriteFrames(const AudioChunk& aChunk, uint32_t aFrames) {
  MOZ_ASSERT(aFrames <= Available(),
             "Writing more than we can in the audio buffer.");
  InterleaveAndConvertBuffer(aChunk.ChannelData<T>().Elements(), aFrames,
                             aChunk.mVolume, aChunk.ChannelCount(),
                             mBuffer + mSampleWriteOffset);
  mSampleWriteOffset += FramesToSamples(mChannels, aFrames);
}
/**
* Number of frames that can be written to the buffer.
@ -178,16 +188,18 @@ class SpillBuffer {
return framesToWrite;
}
/* Fill the spill buffer from aInput, containing aFrames frames, return the
* number of frames written to the spill buffer */
uint32_t Fill(T* aInput, uint32_t aFrames) {
/* Fill the spill buffer from aInput.
* Return the number of frames written to the spill buffer */
uint32_t Fill(const AudioChunk& aInput) {
uint32_t framesToWrite =
std::min(aFrames, BLOCK_SIZE - SamplesToFrames(mChannels, mPosition));
std::min(static_cast<uint32_t>(aInput.mDuration),
BLOCK_SIZE - SamplesToFrames(mChannels, mPosition));
MOZ_ASSERT(FramesToSamples(mChannels, framesToWrite) + mPosition <=
BLOCK_SIZE * mChannels);
PodCopy(mBuffer.get() + mPosition, aInput,
FramesToSamples(mChannels, framesToWrite));
InterleaveAndConvertBuffer(
aInput.ChannelData<T>().Elements(), framesToWrite, aInput.mVolume,
aInput.ChannelCount(), mBuffer.get() + mPosition);
mPosition += FramesToSamples(mChannels, framesToWrite);

Просмотреть файл

@ -96,35 +96,10 @@ uint32_t AudioCaptureTrack::NumberOfChannels() const {
return GetData<AudioSegment>()->MaxChannelCount();
}
void AudioCaptureTrack::MixerCallback(AudioDataValue* aMixedBuffer,
AudioSampleFormat aFormat,
uint32_t aChannels, uint32_t aFrames,
void AudioCaptureTrack::MixerCallback(AudioChunk* aMixedBuffer,
uint32_t aSampleRate) {
AutoTArray<nsTArray<AudioDataValue>, MONO> output;
AutoTArray<const AudioDataValue*, MONO> bufferPtrs;
output.SetLength(MONO);
bufferPtrs.SetLength(MONO);
uint32_t written = 0;
// We need to copy here, because the mixer will reuse the storage, we should
// not hold onto it. Buffers are in planar format.
for (uint32_t channel = 0; channel < aChannels; channel++) {
AudioDataValue* out = output[channel].AppendElements(aFrames);
PodCopy(out, aMixedBuffer + written, aFrames);
bufferPtrs[channel] = out;
written += aFrames;
}
AudioChunk chunk;
chunk.mBuffer =
new mozilla::SharedChannelArrayBuffer<AudioDataValue>(std::move(output));
chunk.mDuration = aFrames;
chunk.mBufferFormat = aFormat;
chunk.mChannelData.SetLength(MONO);
for (uint32_t channel = 0; channel < aChannels; channel++) {
chunk.mChannelData[channel] = bufferPtrs[channel];
}
MOZ_ASSERT(aMixedBuffer->ChannelCount() == MONO);
// Now we have mixed data, simply append it.
GetData<AudioSegment>()->AppendAndConsumeChunk(std::move(chunk));
GetData<AudioSegment>()->AppendAndConsumeChunk(std::move(*aMixedBuffer));
}
} // namespace mozilla

Просмотреть файл

@ -31,9 +31,7 @@ class AudioCaptureTrack : public ProcessedMediaTrack,
uint32_t NumberOfChannels() const override;
protected:
void MixerCallback(AudioDataValue* aMixedBuffer, AudioSampleFormat aFormat,
uint32_t aChannels, uint32_t aFrames,
uint32_t aSampleRate) override;
void MixerCallback(AudioChunk* aMixedBuffer, uint32_t aSampleRate) override;
AudioMixer mMixer;
bool mStarted;
bool mTrackCreated;

Просмотреть файл

@ -7,6 +7,7 @@
#define MOZILLA_AUDIOMIXER_H_
#include "AudioSampleFormat.h"
#include "AudioSegment.h"
#include "AudioStream.h"
#include "nsTArray.h"
#include "mozilla/LinkedList.h"
@ -16,9 +17,11 @@
namespace mozilla {
struct MixerCallbackReceiver {
virtual void MixerCallback(AudioDataValue* aMixedBuffer,
AudioSampleFormat aFormat, uint32_t aChannels,
uint32_t aFrames, uint32_t aSampleRate) = 0;
// MixerCallback MAY modify aMixedBuffer but MUST clear
// aMixedBuffer->mBuffer if its data is to live longer than the duration of
// the callback.
virtual void MixerCallback(AudioChunk* aMixedBuffer,
uint32_t aSampleRate) = 0;
};
/**
* This class mixes multiple streams of audio together to output a single audio
@ -26,8 +29,7 @@ struct MixerCallbackReceiver {
*
* AudioMixer::Mix is to be called repeatedly with buffers that have the same
* length, sample rate, sample format and channel count. This class works with
* interleaved and plannar buffers, but the buffer mixed must be of the same
* type during a mixing cycle.
* planar buffers.
*
* When all the tracks have been mixed, calling FinishMixing will call back with
* a buffer containing the mixed audio data.
@ -36,7 +38,7 @@ struct MixerCallbackReceiver {
*/
class AudioMixer {
public:
AudioMixer() : mFrames(0), mChannels(0), mSampleRate(0) {}
AudioMixer() { mChunk.mBufferFormat = AUDIO_OUTPUT_FORMAT; }
~AudioMixer() {
MixerCallback* cb;
@ -45,37 +47,39 @@ class AudioMixer {
}
}
void StartMixing() { mSampleRate = mChannels = mFrames = 0; }
// Reset per-cycle state.  Zeroing mChunk.mDuration makes the next Mix()
// call re-initialize the chunk's duration and channel layout, and zeroing
// mSampleRate makes it record the new cycle's sample rate.
void StartMixing() {
mChunk.mDuration = 0;
mSampleRate = 0;
}
/* Get the data from the mixer. This is supposed to be called when all the
* tracks have been mixed in. The caller should not hold onto the data. */
void FinishMixing() {
MOZ_ASSERT(mChannels && mSampleRate, "Mix not called for this cycle?");
MOZ_ASSERT(mSampleRate, "Mix not called for this cycle?");
for (MixerCallback* cb = mCallbacks.getFirst(); cb != nullptr;
cb = cb->getNext()) {
MixerCallbackReceiver* receiver = cb->mReceiver;
MOZ_ASSERT(receiver);
receiver->MixerCallback(mMixedAudio.Elements(),
AudioSampleTypeToFormat<AudioDataValue>::Format,
mChannels, mFrames, mSampleRate);
receiver->MixerCallback(&mChunk, mSampleRate);
}
PodZero(mMixedAudio.Elements(), mMixedAudio.Length());
mSampleRate = mChannels = mFrames = 0;
mChunk.mDuration = 0;
mSampleRate = 0;
}
/* Add a buffer to the mix. The buffer can be null if there's nothing to mix
* but the callback is still needed. */
void Mix(AudioDataValue* aSamples, uint32_t aChannels, uint32_t aFrames,
uint32_t aSampleRate) {
if (!mFrames && !mChannels) {
mFrames = aFrames;
mChannels = aChannels;
if (!mChunk.mDuration) {
mChunk.mDuration = aFrames;
MOZ_ASSERT(aChannels > 0);
mChunk.mChannelData.SetLength(aChannels);
mSampleRate = aSampleRate;
EnsureCapacityAndSilence();
}
MOZ_ASSERT(aFrames == mFrames);
MOZ_ASSERT(aChannels == mChannels);
MOZ_ASSERT(aFrames == mChunk.mDuration);
MOZ_ASSERT(aChannels == mChunk.ChannelCount());
MOZ_ASSERT(aSampleRate == mSampleRate);
if (!aSamples) {
@ -83,7 +87,7 @@ class AudioMixer {
}
for (uint32_t i = 0; i < aFrames * aChannels; i++) {
mMixedAudio[i] += aSamples[i];
mChunk.ChannelDataForWrite<AudioDataValue>(0)[i] += aSamples[i];
}
}
@ -115,10 +119,21 @@ class AudioMixer {
private:
void EnsureCapacityAndSilence() {
if (mFrames * mChannels > mMixedAudio.Length()) {
mMixedAudio.SetLength(mFrames * mChannels);
uint32_t sampleCount = mChunk.mDuration * mChunk.ChannelCount();
if (!mChunk.mBuffer || sampleCount > mSampleCapacity) {
CheckedInt<size_t> bufferSize(sizeof(AudioDataValue));
bufferSize *= sampleCount;
mChunk.mBuffer = SharedBuffer::Create(bufferSize);
mSampleCapacity = sampleCount;
}
PodZero(mMixedAudio.Elements(), mMixedAudio.Length());
MOZ_ASSERT(!mChunk.mBuffer->IsShared());
mChunk.mChannelData[0] =
static_cast<SharedBuffer*>(mChunk.mBuffer.get())->Data();
for (size_t i = 1; i < mChunk.ChannelCount(); ++i) {
mChunk.mChannelData[i] =
mChunk.ChannelData<AudioDataValue>()[0] + i * mChunk.mDuration;
}
PodZero(mChunk.ChannelDataForWrite<AudioDataValue>(0), sampleCount);
}
class MixerCallback : public LinkedListElement<MixerCallback> {
@ -130,14 +145,12 @@ class AudioMixer {
/* Function that is called when the mixing is done. */
LinkedList<MixerCallback> mCallbacks;
/* Number of frames for this mixing block. */
uint32_t mFrames;
/* Number of channels for this mixing block. */
uint32_t mChannels;
/* Sample rate the of the mixed data. */
uint32_t mSampleRate;
/* Buffer containing the mixed audio data. */
nsTArray<AudioDataValue> mMixedAudio;
AudioChunk mChunk;
/* Size allocated for mChunk.mBuffer. */
uint32_t mSampleCapacity = 0;
/* Sample rate the of the mixed data. */
uint32_t mSampleRate = 0;
};
} // namespace mozilla

Просмотреть файл

@ -281,49 +281,4 @@ void AudioSegment::Mix(AudioMixer& aMixer, uint32_t aOutputChannels,
}
}
void AudioSegment::WriteTo(AudioMixer& aMixer, uint32_t aOutputChannels,
uint32_t aSampleRate) {
AutoTArray<AudioDataValue,
SilentChannel::AUDIO_PROCESSING_FRAMES * GUESS_AUDIO_CHANNELS>
buf;
// Offset in the buffer that will be written to the mixer, in samples.
uint32_t offset = 0;
if (GetDuration() <= 0) {
MOZ_ASSERT(GetDuration() == 0);
return;
}
uint32_t outBufferLength = GetDuration() * aOutputChannels;
buf.SetLength(outBufferLength);
for (ChunkIterator ci(*this); !ci.IsEnded(); ci.Next()) {
AudioChunk& c = *ci;
switch (c.mBufferFormat) {
case AUDIO_FORMAT_S16:
WriteChunk<int16_t>(c, aOutputChannels, c.mVolume,
buf.Elements() + offset);
break;
case AUDIO_FORMAT_FLOAT32:
WriteChunk<float>(c, aOutputChannels, c.mVolume,
buf.Elements() + offset);
break;
case AUDIO_FORMAT_SILENCE:
// The mixer is expecting interleaved data, so this is ok.
PodZero(buf.Elements() + offset, c.mDuration * aOutputChannels);
break;
default:
MOZ_ASSERT(false, "Not handled");
}
offset += c.mDuration * aOutputChannels;
}
if (offset) {
aMixer.Mix(buf.Elements(), aOutputChannels, offset / aOutputChannels,
aSampleRate);
}
}
} // namespace mozilla

Просмотреть файл

@ -404,11 +404,6 @@ class AudioSegment : public MediaSegmentBase<AudioSegment, AudioChunk> {
chunk = AppendChunk(aChunk.mDuration);
}
void ApplyVolume(float aVolume);
// Mix the segment into a mixer, interleaved. This is useful to output a
// segment to a system audio callback. It up or down mixes to aChannelCount
// channels.
void WriteTo(AudioMixer& aMixer, uint32_t aChannelCount,
uint32_t aSampleRate);
// Mix the segment into a mixer, keeping it planar, up or down mixing to
// aChannelCount channels.
void Mix(AudioMixer& aMixer, uint32_t aChannelCount, uint32_t aSampleRate);

Просмотреть файл

@ -1092,26 +1092,28 @@ void AudioCallbackDriver::StateCallback(cubeb_state aState) {
}
}
void AudioCallbackDriver::MixerCallback(AudioDataValue* aMixedBuffer,
AudioSampleFormat aFormat,
uint32_t aChannels, uint32_t aFrames,
void AudioCallbackDriver::MixerCallback(AudioChunk* aMixedBuffer,
uint32_t aSampleRate) {
MOZ_ASSERT(InIteration());
uint32_t toWrite = mBuffer.Available();
if (!mBuffer.Available() && aFrames > 0) {
TrackTime frameCount = aMixedBuffer->mDuration;
if (!mBuffer.Available() && frameCount > 0) {
NS_WARNING("DataCallback buffer full, expect frame drops.");
}
MOZ_ASSERT(mBuffer.Available() <= aFrames);
MOZ_ASSERT(mBuffer.Available() <= frameCount);
mBuffer.WriteFrames(aMixedBuffer, mBuffer.Available());
mBuffer.WriteFrames(*aMixedBuffer, mBuffer.Available());
MOZ_ASSERT(mBuffer.Available() == 0,
"Missing frames to fill audio callback's buffer.");
if (toWrite == frameCount) {
return;
}
DebugOnly<uint32_t> written = mScratchBuffer.Fill(
aMixedBuffer + toWrite * aChannels, aFrames - toWrite);
NS_WARNING_ASSERTION(written == aFrames - toWrite, "Dropping frames.");
aMixedBuffer->SliceTo(toWrite, frameCount);
DebugOnly<uint32_t> written = mScratchBuffer.Fill(*aMixedBuffer);
NS_WARNING_ASSERTION(written == frameCount - toWrite, "Dropping frames.");
};
void AudioCallbackDriver::PanOutputIfNeeded(bool aMicrophoneActive) {

Просмотреть файл

@ -595,9 +595,7 @@ class AudioCallbackDriver : public GraphDriver, public MixerCallbackReceiver {
/* This function gets called when the graph has produced the audio frames for
* this iteration. */
void MixerCallback(AudioDataValue* aMixedBuffer, AudioSampleFormat aFormat,
uint32_t aChannels, uint32_t aFrames,
uint32_t aSampleRate) override;
void MixerCallback(AudioChunk* aMixedBuffer, uint32_t aSampleRate) override;
AudioCallbackDriver* AsAudioCallbackDriver() override { return this; }
const AudioCallbackDriver* AsAudioCallbackDriver() const override {

Просмотреть файл

@ -745,7 +745,7 @@ TrackTime MediaTrackGraphImpl::PlayAudio(AudioMixer* aMixer,
} else {
outputChannels = AudioOutputChannelCount();
}
output.WriteTo(*aMixer, outputChannels, mSampleRate);
output.Mix(*aMixer, outputChannels, mSampleRate);
}
return ticksWritten;
}

Просмотреть файл

@ -17,13 +17,21 @@ void test_for_number_of_channels(const uint32_t channels) {
mozilla::SpillBuffer<float, 128> b(channels);
std::vector<float> fromCallback(samples, 0.0);
std::vector<float> other(samples, 1.0);
mozilla::AudioChunk chunk;
chunk.mBufferFormat = mozilla::AUDIO_FORMAT_FLOAT32;
chunk.mChannelData.SetLength(channels);
for (uint32_t i = 0; i < channels; ++i) {
chunk.mChannelData[i] = other.data() + i * channels;
}
// Set the buffer in the wrapper from the callback
mBuffer.SetBuffer(fromCallback.data(), FRAMES);
// Fill the SpillBuffer with data.
ASSERT_TRUE(b.Fill(other.data(), 15) == 15);
ASSERT_TRUE(b.Fill(other.data(), 17) == 17);
chunk.mDuration = 15;
ASSERT_TRUE(b.Fill(chunk) == 15);
chunk.mDuration = 17;
ASSERT_TRUE(b.Fill(chunk) == 17);
for (uint32_t i = 0; i < 32 * channels; i++) {
other[i] = 0.0;
}
@ -46,8 +54,9 @@ void test_for_number_of_channels(const uint32_t channels) {
<< ")\n";
}
ASSERT_TRUE(b.Fill(other.data(), FRAMES) == 128);
ASSERT_TRUE(b.Fill(other.data(), FRAMES) == 0);
chunk.mDuration = FRAMES;
ASSERT_TRUE(b.Fill(chunk) == 128);
ASSERT_TRUE(b.Fill(chunk) == 0);
ASSERT_TRUE(b.Empty(mBuffer) == 0);
}

Просмотреть файл

@ -14,20 +14,17 @@ namespace audio_mixer {
struct MixerConsumer : public mozilla::MixerCallbackReceiver {
/* In this test, the different audio stream and channels are always created to
* cancel each other. */
void MixerCallback(AudioDataValue* aData, AudioSampleFormat aFormat,
uint32_t aChannels, uint32_t aFrames,
uint32_t aSampleRate) {
void MixerCallback(mozilla::AudioChunk* aMixedBuffer, uint32_t aSampleRate) {
bool silent = true;
for (uint32_t i = 0; i < aChannels * aFrames; i++) {
if (aData[i] != 0.0) {
if (aFormat == mozilla::AUDIO_FORMAT_S16) {
fprintf(stderr, "Sample at %d is not silent: %d\n", i,
(short)aData[i]);
} else {
fprintf(stderr, "Sample at %d is not silent: %f\n", i,
(float)aData[i]);
ASSERT_EQ(aMixedBuffer->mBufferFormat, mozilla::AUDIO_FORMAT_FLOAT32);
for (uint32_t c = 0; c < aMixedBuffer->ChannelCount(); c++) {
const float* channelData = aMixedBuffer->ChannelData<AudioDataValue>()[c];
for (uint32_t i = 0; i < aMixedBuffer->mDuration; i++) {
if (channelData[i] != 0.0) {
fprintf(stderr, "Sample at %d in channel %c is not silent: %f\n", i,
c, channelData[i]);
silent = false;
}
silent = false;
}
}
ASSERT_TRUE(silent);