Mirror of https://github.com/mozilla/gecko-dev.git

Bug 982490 - Ensure for each MSG cycle that every MediaStream writes the same number of frames to its AudioStream. r=jesup,roc

Parent: af25ad4b1b
Commit: c906c38e32

@@ -0,0 +1,85 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef MOZILLA_AUDIOMIXER_H_
+#define MOZILLA_AUDIOMIXER_H_
+
+#include "AudioSampleFormat.h"
+#include "nsTArray.h"
+#include "mozilla/PodOperations.h"
+
+namespace mozilla {
+typedef void(*MixerFunc)(AudioDataValue* aMixedBuffer,
+                         AudioSampleFormat aFormat,
+                         uint32_t aChannels,
+                         uint32_t aFrames);
+
+/**
+ * This class mixes multiple streams of audio together to output a single audio
+ * stream.
+ *
+ * AudioMixer::Mix is to be called repeatedly with buffers that have the same
+ * length, sample rate, sample format and channel count.
+ *
+ * When all the tracks have been mixed, calling FinishMixing will call back with
+ * a buffer containing the mixed audio data.
+ *
+ * This class is not thread safe.
+ */
+class AudioMixer
+{
+public:
+  AudioMixer(MixerFunc aCallback)
+    : mCallback(aCallback),
+      mFrames(0),
+      mChannels(0)
+  { }
+
+  /* Get the data from the mixer. This is supposed to be called when all the
+   * tracks have been mixed in. The caller should not hold onto the data. */
+  void FinishMixing() {
+    mCallback(mMixedAudio.Elements(),
+              AudioSampleTypeToFormat<AudioDataValue>::Format,
+              mChannels,
+              mFrames);
+    PodZero(mMixedAudio.Elements(), mMixedAudio.Length());
+    mChannels = mFrames = 0;
+  }
+
+  /* Add a buffer to the mix. aSamples is interleaved. */
+  void Mix(AudioDataValue* aSamples, uint32_t aChannels, uint32_t aFrames) {
+    if (!mFrames && !mChannels) {
+      mFrames = aFrames;
+      mChannels = aChannels;
+      EnsureCapacityAndSilence();
+    }
+
+    MOZ_ASSERT(aFrames == mFrames);
+    MOZ_ASSERT(aChannels == mChannels);
+
+    for (uint32_t i = 0; i < aFrames * aChannels; i++) {
+      mMixedAudio[i] += aSamples[i];
+    }
+  }
+private:
+  void EnsureCapacityAndSilence() {
+    if (mFrames * mChannels > mMixedAudio.Length()) {
+      mMixedAudio.SetLength(mFrames * mChannels);
+    }
+    PodZero(mMixedAudio.Elements(), mMixedAudio.Length());
+  }
+
+  /* Function that is called when the mixing is done. */
+  MixerFunc mCallback;
+  /* Number of frames for this mixing block. */
+  uint32_t mFrames;
+  /* Number of channels for this mixing block. */
+  uint32_t mChannels;
+  /* Buffer containing the mixed audio data. */
+  nsTArray<AudioDataValue> mMixedAudio;
+};
+}
+
+#endif // MOZILLA_AUDIOMIXER_H_

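For orientation, here is a minimal sketch of how a caller might drive the class above. The callback and function names are hypothetical; the call pattern (repeated Mix with identical frame and channel counts, then a single FinishMixing) is exactly what the assertions in Mix enforce.

#include "AudioMixer.h"

static void MixedAudioAvailable(mozilla::AudioDataValue* aMixed,
                                mozilla::AudioSampleFormat aFormat,
                                uint32_t aChannels, uint32_t aFrames)
{
  // Hypothetical consumer: hand the mixed block to an output device here.
}

void MixOneCycle(mozilla::AudioDataValue* aStream1, mozilla::AudioDataValue* aStream2)
{
  mozilla::AudioMixer mixer(MixedAudioAvailable);
  mixer.Mix(aStream1, 2, 128); // 128 stereo frames, interleaved
  mixer.Mix(aStream2, 2, 128); // must match the first buffer's shape
  mixer.FinishMixing();        // fires the callback with the sum, then resets
}
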
@@ -49,7 +49,19 @@ public:
 
 typedef AudioSampleTraits<AUDIO_OUTPUT_FORMAT>::Type AudioDataValue;
 
+// Single-sample conversion
+template<typename T> class AudioSampleTypeToFormat;
+
+template <> class AudioSampleTypeToFormat<float> {
+public:
+  static const AudioSampleFormat Format = AUDIO_FORMAT_FLOAT32;
+};
+
+template <> class AudioSampleTypeToFormat<short> {
+public:
+  static const AudioSampleFormat Format = AUDIO_FORMAT_S16;
+};
+
 // Single-sample conversion
 /*
  * Use "2^N" conversion since it's simple, fast, "bit transparent", used by
  * many other libraries and apparently behaves reasonably.

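The trait added above maps a C++ sample type to its AudioSampleFormat tag at compile time; AudioMixer::FinishMixing uses it to report the format matching the build's AudioDataValue. A hedged illustration of the intent, in isolation:

using namespace mozilla;
// Illustrative only: resolved at compile time, so there is no runtime
// format plumbing and a mismatch is a compile error.
static_assert(AudioSampleTypeToFormat<float>::Format == AUDIO_FORMAT_FLOAT32,
              "float samples carry the FLOAT32 tag");
AudioSampleFormat fmt = AudioSampleTypeToFormat<AudioDataValue>::Format;
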
@@ -6,6 +6,7 @@
 #include "AudioSegment.h"
 
 #include "AudioStream.h"
+#include "AudioMixer.h"
 #include "AudioChannelFormat.h"
 #include "Latency.h"
 #include "speex/speex_resampler.h"

@@ -134,69 +135,74 @@ void AudioSegment::ResampleChunks(SpeexResamplerState* aResampler)
 }
 
 void
-AudioSegment::WriteTo(uint64_t aID, AudioStream* aOutput)
+AudioSegment::WriteTo(uint64_t aID, AudioStream* aOutput, AudioMixer* aMixer)
 {
   uint32_t outputChannels = aOutput->GetChannels();
   nsAutoTArray<AudioDataValue,AUDIO_PROCESSING_FRAMES*GUESS_AUDIO_CHANNELS> buf;
   nsAutoTArray<const void*,GUESS_AUDIO_CHANNELS> channelData;
 
+  if (!GetDuration()) {
+    return;
+  }
+
+  uint32_t outBufferLength = GetDuration() * outputChannels;
+  buf.SetLength(outBufferLength);
+
+  // Offset in the buffer that will end up sent to the AudioStream.
+  uint32_t offset = 0;
+
   for (ChunkIterator ci(*this); !ci.IsEnded(); ci.Next()) {
     AudioChunk& c = *ci;
-    TrackTicks offset = 0;
-    while (offset < c.mDuration) {
-      TrackTicks durationTicks =
-        std::min<TrackTicks>(c.mDuration - offset, AUDIO_PROCESSING_FRAMES);
-      if (uint64_t(outputChannels)*durationTicks > INT32_MAX || offset > INT32_MAX) {
-        NS_ERROR("Buffer overflow");
-        return;
-      }
-
-      uint32_t duration = uint32_t(durationTicks);
-
-      // If we have written data in the past, or we have real (non-silent) data
-      // to write, we can proceed. Otherwise, it means we just started the
-      // AudioStream, and we don't have real data to write to it (just silence).
-      // To avoid overbuffering in the AudioStream, we simply drop the silence,
-      // here. The stream will underrun and output silence anyways.
-      if (c.mBuffer || aOutput->GetWritten()) {
-        buf.SetLength(outputChannels*duration);
-        if (c.mBuffer) {
-          channelData.SetLength(c.mChannelData.Length());
-          for (uint32_t i = 0; i < channelData.Length(); ++i) {
-            channelData[i] =
-              AddAudioSampleOffset(c.mChannelData[i], c.mBufferFormat, int32_t(offset));
-          }
-
-          if (channelData.Length() < outputChannels) {
-            // Up-mix. Note that this might actually make channelData have more
-            // than outputChannels temporarily.
-            AudioChannelsUpMix(&channelData, outputChannels, gZeroChannel);
-          }
-
-          if (channelData.Length() > outputChannels) {
-            // Down-mix.
-            DownmixAndInterleave(channelData, c.mBufferFormat, duration,
-                                 c.mVolume, outputChannels, buf.Elements());
-          } else {
-            InterleaveAndConvertBuffer(channelData.Elements(), c.mBufferFormat,
-                                       duration, c.mVolume,
-                                       outputChannels,
-                                       buf.Elements());
-          }
-        } else {
-          // Assumes that a bit pattern of zeroes == 0.0f
-          memset(buf.Elements(), 0, buf.Length()*sizeof(AudioDataValue));
-        }
-        aOutput->Write(buf.Elements(), int32_t(duration), &(c.mTimeStamp));
-      }
-      if(!c.mTimeStamp.IsNull()) {
-        TimeStamp now = TimeStamp::Now();
-        // would be more efficient to c.mTimeStamp to ms on create time then pass here
-        LogTime(AsyncLatencyLogger::AudioMediaStreamTrack, aID,
-                (now - c.mTimeStamp).ToMilliseconds(), c.mTimeStamp);
-      }
-      offset += duration;
-    }
+    uint32_t frames = c.mDuration;
+
+    // If we have written data in the past, or we have real (non-silent) data
+    // to write, we can proceed. Otherwise, it means we just started the
+    // AudioStream, and we don't have real data to write to it (just silence).
+    // To avoid overbuffering in the AudioStream, we simply drop the silence
+    // here. The stream will underrun and output silence anyway.
+    if (c.mBuffer || aOutput->GetWritten()) {
+      if (c.mBuffer) {
+        channelData.SetLength(c.mChannelData.Length());
+        for (uint32_t i = 0; i < channelData.Length(); ++i) {
+          channelData[i] = c.mChannelData[i];
+        }
+
+        if (channelData.Length() < outputChannels) {
+          // Up-mix. Note that this might actually make channelData have more
+          // than outputChannels temporarily.
+          AudioChannelsUpMix(&channelData, outputChannels, gZeroChannel);
+        }
+
+        if (channelData.Length() > outputChannels) {
+          // Down-mix.
+          DownmixAndInterleave(channelData, c.mBufferFormat, frames,
+                               c.mVolume, outputChannels, buf.Elements() + offset);
+        } else {
+          InterleaveAndConvertBuffer(channelData.Elements(), c.mBufferFormat,
+                                     frames, c.mVolume,
+                                     outputChannels,
+                                     buf.Elements() + offset);
+        }
+      } else {
+        // Assumes that a bit pattern of zeroes == 0.0f
+        memset(buf.Elements() + offset, 0, outputChannels * frames * sizeof(AudioDataValue));
+      }
+    }
+
+    offset += frames * outputChannels;
+
+    if (!c.mTimeStamp.IsNull()) {
+      TimeStamp now = TimeStamp::Now();
+      // It would be more efficient to convert c.mTimeStamp to ms at creation
+      // time and pass that here.
+      LogTime(AsyncLatencyLogger::AudioMediaStreamTrack, aID,
+              (now - c.mTimeStamp).ToMilliseconds(), c.mTimeStamp);
+    }
   }
 
+  aOutput->Write(buf.Elements(), GetDuration(), &(mChunks[mChunks.Length() - 1].mTimeStamp));
+
+  if (aMixer) {
+    aMixer->Mix(buf.Elements(), outputChannels, GetDuration());
+  }
   aOutput->Start();
 }

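The reshaped WriteTo interleaves the whole segment into one buffer sized GetDuration() * channels, pushes it to the AudioStream in a single Write call, and, when a mixer is supplied, contributes the identical buffer to the mix. A hedged sketch of a call site (the callback, id and stream objects are stand-ins, not real MSG names):

// Illustrative only: every segment mixed during one MSG cycle must have the
// same duration, since AudioMixer::Mix asserts matching frame/channel counts.
mozilla::AudioMixer mixer(SomeMixerCallback);       // SomeMixerCallback: hypothetical
segment.WriteTo(someStreamId, audioStream, &mixer); // write and add to the mix
mixer.FinishMixing();                               // once, after all segments
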
@@ -27,6 +27,7 @@ public:
 };
 
 class AudioStream;
+class AudioMixer;
 
 /**
  * For auto-arrays etc, guess this as the common number of channels.

@@ -215,7 +216,7 @@ public:
     return chunk;
   }
   void ApplyVolume(float aVolume);
-  void WriteTo(uint64_t aID, AudioStream* aOutput);
+  void WriteTo(uint64_t aID, AudioStream* aOutput, AudioMixer* aMixer = nullptr);
 
   int ChannelCount() {
     NS_WARN_IF_FALSE(!mChunks.IsEmpty(),

@@ -267,9 +267,8 @@ protected:
   void AppendSliceInternal(const MediaSegmentBase<C, Chunk>& aSource,
                            TrackTicks aStart, TrackTicks aEnd)
   {
-    NS_ASSERTION(aStart <= aEnd, "Endpoints inverted");
-    NS_WARN_IF_FALSE(aStart >= 0 && aEnd <= aSource.mDuration,
-                     "Slice out of range");
+    MOZ_ASSERT(aStart <= aEnd, "Endpoints inverted");
+    MOZ_ASSERT(aStart >= 0 && aEnd <= aSource.mDuration, "Slice out of range");
     mDuration += aEnd - aStart;
     TrackTicks offset = 0;
     for (uint32_t i = 0; i < aSource.mChunks.Length() && offset < aEnd; ++i) {

@@ -577,17 +577,30 @@ MediaStreamGraphImpl::UpdateStreamOrderForStream(mozilla::LinkedList<MediaStream
   *mStreams.AppendElement() = stream.forget();
 }
 
+static void AudioMixerCallback(AudioDataValue* aMixedBuffer,
+                               AudioSampleFormat aFormat,
+                               uint32_t aChannels,
+                               uint32_t aFrames)
+{
+  // Need an api to register mixer callbacks, bug 989921
+}
+
 void
 MediaStreamGraphImpl::UpdateStreamOrder()
 {
   mOldStreams.SwapElements(mStreams);
   mStreams.ClearAndRetainStorage();
+  bool shouldMix = false;
   for (uint32_t i = 0; i < mOldStreams.Length(); ++i) {
     MediaStream* stream = mOldStreams[i];
     stream->mHasBeenOrdered = false;
     stream->mIsConsumed = false;
     stream->mIsOnOrderingStack = false;
     stream->mInBlockingSet = false;
+    if (stream->AsSourceStream() &&
+        stream->AsSourceStream()->NeedsMixing()) {
+      shouldMix = true;
+    }
     ProcessedMediaStream* ps = stream->AsProcessedStream();
     if (ps) {
       ps->mInCycle = false;

@@ -598,6 +611,12 @@ MediaStreamGraphImpl::UpdateStreamOrder()
     }
   }
 
+  if (!mMixer && shouldMix) {
+    mMixer = new AudioMixer(AudioMixerCallback);
+  } else if (mMixer && !shouldMix) {
+    mMixer = nullptr;
+  }
+
   mozilla::LinkedList<MediaStream> stack;
   for (uint32_t i = 0; i < mOldStreams.Length(); ++i) {
     nsRefPtr<MediaStream>& s = mOldStreams[i];

@@ -810,6 +829,7 @@ MediaStreamGraphImpl::CreateOrDestroyAudioStreams(GraphTime aAudioOutputStartTim
       aStream->mAudioOutputStreams.AppendElement();
     audioOutputStream->mAudioPlaybackStartTime = aAudioOutputStartTime;
     audioOutputStream->mBlockedAudioTime = 0;
+    audioOutputStream->mLastTickWritten = 0;
     audioOutputStream->mStream = new AudioStream();
     // XXX for now, allocate stereo output. But we need to fix this to
     // match the system's ideal channel configuration.

@@ -831,14 +851,22 @@ MediaStreamGraphImpl::CreateOrDestroyAudioStreams(GraphTime aAudioOutputStartTim
   }
 }
 
-void
+TrackTicks
 MediaStreamGraphImpl::PlayAudio(MediaStream* aStream,
                                 GraphTime aFrom, GraphTime aTo)
 {
   MOZ_ASSERT(mRealtime, "Should only attempt to play audio in realtime mode");
 
+  TrackTicks ticksWritten = 0;
+  // We compute the number of needed ticks by converting a difference of graph
+  // times rather than by subtracting two converted stream times, so that the
+  // rounding between {Graph,Stream}Time and track ticks does not depend on the
+  // absolute value of the {Graph,Stream}Time, and the number of ticks to play
+  // is the same for each cycle.
+  TrackTicks ticksNeeded = TimeToTicksRoundDown(IdealAudioRate(), aTo) - TimeToTicksRoundDown(IdealAudioRate(), aFrom);
+
   if (aStream->mAudioOutputStreams.IsEmpty()) {
-    return;
+    return 0;
   }
 
   // When we're playing multiple copies of this stream at the same time, they're

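Why derive the tick count from the difference of graph times rather than from two separately converted endpoints: floor-rounding each endpoint makes the result depend on the endpoints' phase, so equal-length cycles could play one tick more or less than each other. A self-contained numeric sketch (the floor arithmetic below is illustrative, not the real TimeToTicksRoundDown):

#include <cstdint>
#include <cmath>
#include <cstdio>

int main() {
  const double rate = 48000.0;  // stand-in for IdealAudioRate()
  double from = 7.8 / rate;     // endpoint landing at 7.8 ticks
  double to   = 484.2 / rate;   // exactly 476.4 ticks later
  int64_t perEndpoint = int64_t(std::floor(rate * to)) -
                        int64_t(std::floor(rate * from));          // 484 - 7 = 477
  int64_t fromDifference = int64_t(std::floor(rate * (to - from))); // floor(476.4) = 476
  // The per-endpoint count wobbles with phase; the difference-based count is
  // identical for every cycle of the same graph-time length.
  printf("per-endpoint: %lld, from-difference: %lld\n",
         (long long)perEndpoint, (long long)fromDifference);
  return 0;
}
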
@@ -852,6 +880,25 @@ MediaStreamGraphImpl::PlayAudio(MediaStream* aStream,
     MediaStream::AudioOutputStream& audioOutput = aStream->mAudioOutputStreams[i];
     StreamBuffer::Track* track = aStream->mBuffer.FindTrack(audioOutput.mTrackID);
     AudioSegment* audio = track->Get<AudioSegment>();
+    AudioSegment output;
+    MOZ_ASSERT(track->GetRate() == IdealAudioRate());
+
+    // offset and audioOutput.mLastTickWritten can differ by at most one sample,
+    // because of the rounding issue. We track that to ensure we don't skip a
+    // sample, or play a sample twice.
+    TrackTicks offset = track->TimeToTicksRoundDown(GraphTimeToStreamTime(aStream, aFrom));
+    if (!audioOutput.mLastTickWritten) {
+      audioOutput.mLastTickWritten = offset;
+    }
+    if (audioOutput.mLastTickWritten != offset) {
+      // If there is a global underrun of the MSG, this property won't hold, and
+      // we reset the sample count tracking.
+      if (std::abs(audioOutput.mLastTickWritten - offset) != 1) {
+        audioOutput.mLastTickWritten = offset;
+      } else {
+        offset = audioOutput.mLastTickWritten;
+      }
+    }
 
     // We don't update aStream->mBufferStartTime here to account for
     // time spent blocked. Instead, we'll update it in UpdateCurrentTime after the

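A hedged restatement of the correction above, in isolation: rounding can land the freshly computed offset one tick on either side of the last tick actually written, in which case playback resumes from the written position; a gap larger than one tick means a real MSG underrun, and the tracking is reset instead.

#include <cstdint>
#include <cstdlib>

typedef int64_t TrackTicks;

// Illustrative helper, not MSG code: reconcile a rounded read position with
// the last tick written, following the rules used in PlayAudio above.
TrackTicks ReconcileOffset(TrackTicks aRoundedOffset, TrackTicks& aLastTickWritten)
{
  if (!aLastTickWritten) {
    aLastTickWritten = aRoundedOffset;   // first cycle: adopt the offset
  }
  if (aLastTickWritten != aRoundedOffset) {
    if (std::llabs(aLastTickWritten - aRoundedOffset) != 1) {
      aLastTickWritten = aRoundedOffset; // global underrun: reset the tracking
    } else {
      return aLastTickWritten;           // off by one: resume where we wrote
    }
  }
  return aRoundedOffset;
}
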
@@ -859,54 +906,59 @@ MediaStreamGraphImpl::PlayAudio(MediaStream* aStream,
     // right offsets in the stream buffer, even if we've already written silence for
     // some amount of blocked time after the current time.
     GraphTime t = aFrom;
-    while (t < aTo) {
+    while (ticksNeeded) {
       GraphTime end;
       bool blocked = aStream->mBlocked.GetAt(t, &end);
       end = std::min(end, aTo);
 
-      AudioSegment output;
-      if (blocked) {
-        // Track total blocked time in aStream->mBlockedAudioTime so that
-        // the amount of silent samples we've inserted for blocking never gets
-        // more than one sample away from the ideal amount.
-        TrackTicks startTicks =
-          TimeToTicksRoundDown(IdealAudioRate(), audioOutput.mBlockedAudioTime);
-        audioOutput.mBlockedAudioTime += end - t;
-        TrackTicks endTicks =
-          TimeToTicksRoundDown(IdealAudioRate(), audioOutput.mBlockedAudioTime);
-
-        output.InsertNullDataAtStart(endTicks - startTicks);
-        STREAM_LOG(PR_LOG_DEBUG+1, ("MediaStream %p writing blocking-silence samples for %f to %f",
-                   aStream, MediaTimeToSeconds(t), MediaTimeToSeconds(end)));
-      } else {
-        TrackTicks startTicks =
-          track->TimeToTicksRoundDown(GraphTimeToStreamTime(aStream, t));
-        TrackTicks endTicks =
-          track->TimeToTicksRoundDown(GraphTimeToStreamTime(aStream, end));
-
-        // If startTicks is before the track start, then that part of 'audio'
-        // will just be silence, which is fine here. But if endTicks is after
-        // the track end, then 'audio' won't be long enough, so we'll need
-        // to explicitly play silence.
-        TrackTicks sliceEnd = std::min(endTicks, audio->GetDuration());
-        if (sliceEnd > startTicks) {
-          output.AppendSlice(*audio, startTicks, sliceEnd);
-        }
-        // Play silence where the track has ended
-        output.AppendNullData(endTicks - sliceEnd);
-        NS_ASSERTION(endTicks == sliceEnd || track->IsEnded(),
-                     "Ran out of data but track not ended?");
-        output.ApplyVolume(volume);
-        STREAM_LOG(PR_LOG_DEBUG+1, ("MediaStream %p writing samples for %f to %f (samples %lld to %lld)",
-                   aStream, MediaTimeToSeconds(t), MediaTimeToSeconds(end),
-                   startTicks, endTicks));
+      // Check how many ticks of sound we can provide if we are blocked some
+      // time in the middle of this cycle.
+      TrackTicks toWrite = 0;
+      if (end >= aTo) {
+        toWrite = ticksNeeded;
+      } else {
+        toWrite = TimeToTicksRoundDown(IdealAudioRate(), end - aFrom);
       }
 
-      // Need unique id for stream & track - and we want it to match the inserter
-      output.WriteTo(LATENCY_STREAM_ID(aStream, track->GetID()),
-                     audioOutput.mStream);
+      if (blocked) {
+        output.InsertNullDataAtStart(toWrite);
+        STREAM_LOG(PR_LOG_DEBUG+1, ("MediaStream %p writing %ld blocking-silence samples for %f to %f (%ld to %ld)\n",
+                   aStream, toWrite, MediaTimeToSeconds(t), MediaTimeToSeconds(end),
+                   offset, offset + toWrite));
+        ticksNeeded -= toWrite;
+      } else {
+        TrackTicks endTicksNeeded = offset + toWrite;
+        TrackTicks endTicksAvailable = audio->GetDuration();
+        if (endTicksNeeded <= endTicksAvailable) {
+          output.AppendSlice(*audio, offset, endTicksNeeded);
+        } else {
+          MOZ_ASSERT(track->IsEnded(), "Not enough data, and track not ended.");
+          // If we are at the end of the track, maybe write the remaining
+          // samples, and pad with silence.
+          if (endTicksNeeded > endTicksAvailable &&
+              offset < endTicksAvailable) {
+            output.AppendSlice(*audio, offset, endTicksAvailable);
+            ticksNeeded -= endTicksAvailable - offset;
+            toWrite -= endTicksAvailable - offset;
+          }
+          output.AppendNullData(toWrite);
+        }
+        output.ApplyVolume(volume);
+        STREAM_LOG(PR_LOG_DEBUG+1, ("MediaStream %p writing %ld samples for %f to %f (samples %ld to %ld)\n",
+                   aStream, toWrite, MediaTimeToSeconds(t), MediaTimeToSeconds(end),
+                   offset, endTicksNeeded));
+        ticksNeeded -= toWrite;
+      }
       t = end;
+      offset += toWrite;
+      audioOutput.mLastTickWritten += toWrite;
+      ticksWritten += toWrite;
     }
+
+    // Need unique id for stream & track - and we want it to match the inserter
+    output.WriteTo(LATENCY_STREAM_ID(aStream, track->GetID()),
+                   audioOutput.mStream, mMixer);
   }
+  return ticksWritten;
 }
 
 static void

@@ -1241,6 +1293,9 @@ MediaStreamGraphImpl::RunThread()
     bool allBlockedForever = true;
     // True when we've done ProcessInput for all processed streams.
     bool doneAllProducing = false;
+    // This is the number of frames written to the AudioStreams during this
+    // cycle.
+    TrackTicks ticksPlayed = 0;
     // Figure out what each stream wants to do
     for (uint32_t i = 0; i < mStreams.Length(); ++i) {
       MediaStream* stream = mStreams[i];

@@ -1277,7 +1332,13 @@ MediaStreamGraphImpl::RunThread()
       if (mRealtime) {
         // Only playback audio and video in real-time mode
         CreateOrDestroyAudioStreams(prevComputedTime, stream);
-        PlayAudio(stream, prevComputedTime, mStateComputedTime);
+        TrackTicks ticksPlayedForThisStream = PlayAudio(stream, prevComputedTime, mStateComputedTime);
+        if (!ticksPlayed) {
+          ticksPlayed = ticksPlayedForThisStream;
+        } else {
+          MOZ_ASSERT(!ticksPlayedForThisStream || ticksPlayedForThisStream == ticksPlayed,
+                     "Each stream should have written the same number of frames.");
+        }
         PlayVideo(stream);
       }
       SourceMediaStream* is = stream->AsSourceStream();

@@ -1289,6 +1350,11 @@ MediaStreamGraphImpl::RunThread()
         allBlockedForever = false;
       }
     }
+
+    if (mMixer) {
+      mMixer->FinishMixing();
+    }
+
     if (ensureNextIteration || !allBlockedForever) {
       EnsureNextIteration();
     }

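Taken together, the cycle now looks like this (a hedged stand-alone model, not actual MSG code): every stream played during one iteration must contribute the same number of ticks, which is what makes a single FinishMixing call per cycle well defined.

#include <cassert>
#include <cstdint>
#include <vector>

// Stand-in for the invariant enforced by RunThread: each stream's PlayAudio
// returns the ticks it wrote; all non-zero counts must agree, and the mixer
// is flushed exactly once per cycle with that many frames.
int64_t RunOneCycle(const std::vector<int64_t>& perStreamTicks)
{
  int64_t ticksPlayed = 0;
  for (int64_t ticks : perStreamTicks) {
    if (!ticksPlayed) {
      ticksPlayed = ticks;
    } else {
      assert(!ticks || ticks == ticksPlayed); // mirrors the MOZ_ASSERT above
    }
  }
  // mMixer->FinishMixing() would fire here, once, with ticksPlayed frames.
  return ticksPlayed;
}
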
@@ -2317,6 +2383,20 @@ SourceMediaStream::GetBufferedTicks(TrackID aID)
   return 0;
 }
 
+void
+SourceMediaStream::RegisterForAudioMixing()
+{
+  MutexAutoLock lock(mMutex);
+  mNeedsMixing = true;
+}
+
+bool
+SourceMediaStream::NeedsMixing()
+{
+  MutexAutoLock lock(mMutex);
+  return mNeedsMixing;
+}
+
 void
 MediaInputPort::Init()
 {

@@ -2501,6 +2581,7 @@ MediaStreamGraphImpl::MediaStreamGraphImpl(bool aRealtime)
   , mNonRealtimeProcessing(false)
   , mStreamOrderDirty(false)
   , mLatencyLog(AsyncLatencyLogger::Get())
+  , mMixer(nullptr)
 {
 #ifdef PR_LOGGING
   if (!gMediaStreamGraphLog) {

@@ -18,6 +18,7 @@
 #include "MainThreadUtils.h"
 #include "nsAutoRef.h"
 #include "speex/speex_resampler.h"
+#include "AudioMixer.h"
 
 class nsIRunnable;
 

@@ -572,6 +573,8 @@ protected:
     // Amount of time that we've wanted to play silence because of the stream
     // blocking.
     MediaTime mBlockedAudioTime;
+    // Last tick written to the audio output.
+    TrackTicks mLastTickWritten;
     nsAutoPtr<AudioStream> mStream;
     TrackID mTrackID;
   };

@@ -782,6 +785,9 @@ public:
     bool mHaveEnough;
   };
 
+  void RegisterForAudioMixing();
+  bool NeedsMixing();
+
 protected:
   TrackData* FindDataForTrack(TrackID aID)
   {

@@ -815,6 +821,7 @@ protected:
   bool mPullEnabled;
   bool mUpdateFinished;
   bool mDestroyed;
+  bool mNeedsMixing;
 };
 
 /**

@@ -13,12 +13,15 @@
 #include "nsIThread.h"
 #include "nsIRunnable.h"
 #include "Latency.h"
 #include "mozilla/WeakPtr.h"
 
 namespace mozilla {
 
 template <typename T>
 class LinkedList;
 
+class AudioMixer;
+
 /**
  * Assume we can run an iteration of the MediaStreamGraph loop in this much time
  * or less.

@@ -52,10 +55,6 @@ static const int AUDIO_TARGET_MS = 2*MEDIA_GRAPH_TARGET_PERIOD_MS +
 static const int VIDEO_TARGET_MS = 2*MEDIA_GRAPH_TARGET_PERIOD_MS +
                                    SCHEDULE_SAFETY_MARGIN_MS;
 
-/**
- * Rate at which we run the video tracks.
- */
-
 /**
  * A per-stream update message passed from the media graph thread to the
  * main thread.

@@ -327,9 +326,9 @@ public:
                                        MediaStream* aStream);
   /**
    * Queue audio (mix of stream audio and silence for blocked intervals)
-   * to the audio output stream.
+   * to the audio output stream. Returns the number of frames played.
    */
-  void PlayAudio(MediaStream* aStream, GraphTime aFrom, GraphTime aTo);
+  TrackTicks PlayAudio(MediaStream* aStream, GraphTime aFrom, GraphTime aTo);
   /**
    * Set the correct current video frame for stream aStream.
    */

@@ -575,6 +574,10 @@ public:
    * Hold a ref to the Latency logger
    */
   nsRefPtr<AsyncLatencyLogger> mLatencyLog;
+  /**
+   * If this is not null, all the audio output for the MSG will be mixed down.
+   */
+  nsAutoPtr<AudioMixer> mMixer;
 };
 
 }

@@ -0,0 +1,155 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "AudioMixer.h"
+#include <assert.h>
+
+using mozilla::AudioDataValue;
+using mozilla::AudioSampleFormat;
+
+/* In this test, the different audio stream and channels are always created to
+ * cancel each other. */
+void MixingDone(AudioDataValue* aData, AudioSampleFormat aFormat, uint32_t aChannels, uint32_t aFrames)
+{
+  bool silent = true;
+  for (uint32_t i = 0; i < aChannels * aFrames; i++) {
+    if (aData[i] != 0.0) {
+      if (aFormat == mozilla::AUDIO_FORMAT_S16) {
+        fprintf(stderr, "Sample at %d is not silent: %d\n", i, (short)aData[i]);
+      } else {
+        fprintf(stderr, "Sample at %d is not silent: %f\n", i, (float)aData[i]);
+      }
+      silent = false;
+    }
+  }
+  if (!silent) {
+    MOZ_CRASH();
+  }
+}
+
+/* Helper functions to give us the maximum and minimum value that don't clip,
+ * for a given sample format (integer or floating-point). */
+template<typename T>
+T GetLowValue();
+
+template<typename T>
+T GetHighValue();
+
+template<>
+float GetLowValue<float>() {
+  return -1.0;
+}
+
+template<>
+short GetLowValue<short>() {
+  return -INT16_MAX;
+}
+
+template<>
+float GetHighValue<float>() {
+  return 1.0;
+}
+
+template<>
+short GetHighValue<short>() {
+  return INT16_MAX;
+}
+
+void FillBuffer(AudioDataValue* aBuffer, uint32_t aLength, AudioDataValue aValue)
+{
+  AudioDataValue* end = aBuffer + aLength;
+  while (aBuffer != end) {
+    *aBuffer++ = aValue;
+  }
+}
+
+int main(int argc, char* argv[]) {
+  const uint32_t CHANNEL_LENGTH = 256;
+  AudioDataValue a[CHANNEL_LENGTH * 2];
+  AudioDataValue b[CHANNEL_LENGTH * 2];
+  FillBuffer(a, CHANNEL_LENGTH, GetLowValue<AudioDataValue>());
+  FillBuffer(a + CHANNEL_LENGTH, CHANNEL_LENGTH, GetHighValue<AudioDataValue>());
+  FillBuffer(b, CHANNEL_LENGTH, GetHighValue<AudioDataValue>());
+  FillBuffer(b + CHANNEL_LENGTH, CHANNEL_LENGTH, GetLowValue<AudioDataValue>());
+
+  {
+    int iterations = 2;
+    mozilla::AudioMixer mixer(MixingDone);
+
+    fprintf(stderr, "Test AudioMixer constant buffer length.\n");
+
+    while (iterations--) {
+      mixer.Mix(a, 2, CHANNEL_LENGTH);
+      mixer.Mix(b, 2, CHANNEL_LENGTH);
+      mixer.FinishMixing();
+    }
+  }
+
+  {
+    mozilla::AudioMixer mixer(MixingDone);
+
+    fprintf(stderr, "Test AudioMixer variable buffer length.\n");
+
+    FillBuffer(a, CHANNEL_LENGTH / 2, GetLowValue<AudioDataValue>());
+    FillBuffer(a + CHANNEL_LENGTH / 2, CHANNEL_LENGTH / 2, GetLowValue<AudioDataValue>());
+    FillBuffer(b, CHANNEL_LENGTH / 2, GetHighValue<AudioDataValue>());
+    FillBuffer(b + CHANNEL_LENGTH / 2, CHANNEL_LENGTH / 2, GetHighValue<AudioDataValue>());
+    mixer.Mix(a, 2, CHANNEL_LENGTH / 2);
+    mixer.Mix(b, 2, CHANNEL_LENGTH / 2);
+    mixer.FinishMixing();
+    FillBuffer(a, CHANNEL_LENGTH, GetLowValue<AudioDataValue>());
+    FillBuffer(a + CHANNEL_LENGTH, CHANNEL_LENGTH, GetHighValue<AudioDataValue>());
+    FillBuffer(b, CHANNEL_LENGTH, GetHighValue<AudioDataValue>());
+    FillBuffer(b + CHANNEL_LENGTH, CHANNEL_LENGTH, GetLowValue<AudioDataValue>());
+    mixer.Mix(a, 2, CHANNEL_LENGTH);
+    mixer.Mix(b, 2, CHANNEL_LENGTH);
+    mixer.FinishMixing();
+    FillBuffer(a, CHANNEL_LENGTH / 2, GetLowValue<AudioDataValue>());
+    FillBuffer(a + CHANNEL_LENGTH / 2, CHANNEL_LENGTH / 2, GetLowValue<AudioDataValue>());
+    FillBuffer(b, CHANNEL_LENGTH / 2, GetHighValue<AudioDataValue>());
+    FillBuffer(b + CHANNEL_LENGTH / 2, CHANNEL_LENGTH / 2, GetHighValue<AudioDataValue>());
+    mixer.Mix(a, 2, CHANNEL_LENGTH / 2);
+    mixer.Mix(b, 2, CHANNEL_LENGTH / 2);
+    mixer.FinishMixing();
+  }
+
+  FillBuffer(a, CHANNEL_LENGTH, GetLowValue<AudioDataValue>());
+  FillBuffer(b, CHANNEL_LENGTH, GetHighValue<AudioDataValue>());
+
+  {
+    mozilla::AudioMixer mixer(MixingDone);
+    fprintf(stderr, "Test AudioMixer variable channel count.\n");
+
+    mixer.Mix(a, 1, CHANNEL_LENGTH);
+    mixer.Mix(b, 1, CHANNEL_LENGTH);
+    mixer.FinishMixing();
+    mixer.Mix(a, 1, CHANNEL_LENGTH);
+    mixer.Mix(b, 1, CHANNEL_LENGTH);
+    mixer.FinishMixing();
+    mixer.Mix(a, 1, CHANNEL_LENGTH);
+    mixer.Mix(b, 1, CHANNEL_LENGTH);
+    mixer.FinishMixing();
+  }
+
+  {
+    mozilla::AudioMixer mixer(MixingDone);
+    fprintf(stderr, "Test AudioMixer variable stream count.\n");
+
+    mixer.Mix(a, 2, CHANNEL_LENGTH);
+    mixer.Mix(b, 2, CHANNEL_LENGTH);
+    mixer.FinishMixing();
+    mixer.Mix(a, 2, CHANNEL_LENGTH);
+    mixer.Mix(b, 2, CHANNEL_LENGTH);
+    mixer.Mix(a, 2, CHANNEL_LENGTH);
+    mixer.Mix(b, 2, CHANNEL_LENGTH);
+    mixer.FinishMixing();
+    mixer.Mix(a, 2, CHANNEL_LENGTH);
+    mixer.Mix(b, 2, CHANNEL_LENGTH);
+    mixer.FinishMixing();
+  }
+
+  return 0;
+}

@@ -0,0 +1,16 @@
+# -*- Mode: python; c-basic-offset: 4; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+CPP_UNIT_TESTS += [
+    'TestAudioMixer.cpp',
+]
+
+FAIL_ON_WARNINGS = True
+
+LOCAL_INCLUDES += [
+    '..',
+]

@@ -12,6 +12,8 @@ PARALLEL_DIRS += [
     'webvtt'
 ]
 
+TEST_TOOL_DIRS += ['compiledtest']
+
 if CONFIG['MOZ_RAW']:
     PARALLEL_DIRS += ['raw']

@@ -57,6 +59,7 @@ EXPORTS += [
     'AudioChannelFormat.h',
     'AudioCompactor.h',
     'AudioEventTimeline.h',
+    'AudioMixer.h',
    'AudioNodeEngine.h',
    'AudioNodeExternalInputStream.h',
    'AudioNodeStream.h',

@@ -158,6 +158,8 @@ MediaEngineWebRTCAudioSource::Start(SourceMediaStream* aStream, TrackID aID)
   AudioSegment* segment = new AudioSegment();
   aStream->AddTrack(aID, SAMPLE_FREQUENCY, 0, segment);
   aStream->AdvanceKnownTracksTime(STREAM_TIME_MAX);
+  // XXX Make this based on the pref.
+  aStream->RegisterForAudioMixing();
   LOG(("Start audio for stream %p", aStream));
 
   if (mState == kStarted) {