From 3b43fdba8cb9fb37033fc9ad2cfaffdffc690352 Mon Sep 17 00:00:00 2001
From: Paul Adenot <paul@paul.cx>
Date: Mon, 24 Mar 2014 11:06:06 +0100
Subject: [PATCH] Bug 982490 - Ensure for MSG cycle that each MediaStream write
 the same number of frames to their AudioStream. r=jesup,roc

---
 content/media/AudioMixer.h                    |  85 +++++++++
 content/media/AudioSampleFormat.h             |  14 +-
 content/media/AudioSegment.cpp                | 110 ++++++------
 content/media/AudioSegment.h                  |   3 +-
 content/media/MediaSegment.h                  |   5 +-
 content/media/MediaStreamGraph.cpp            | 165 +++++++++++++-----
 content/media/MediaStreamGraph.h              |   7 +
 content/media/MediaStreamGraphImpl.h          |  15 +-
 content/media/compiledtest/TestAudioMixer.cpp | 155 ++++++++++++++++
 content/media/compiledtest/moz.build          |  16 ++
 content/media/moz.build                       |   3 +
 .../media/webrtc/MediaEngineWebRTCAudio.cpp   |   2 +
 12 files changed, 475 insertions(+), 105 deletions(-)
 create mode 100644 content/media/AudioMixer.h
 create mode 100644 content/media/compiledtest/TestAudioMixer.cpp
 create mode 100644 content/media/compiledtest/moz.build
diff --git a/content/media/AudioMixer.h b/content/media/AudioMixer.h
new file mode 100644
index 000000000000..0c6e6799b4aa
--- /dev/null
+++ b/content/media/AudioMixer.h
@@ -0,0 +1,85 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef MOZILLA_AUDIOMIXER_H_
+#define MOZILLA_AUDIOMIXER_H_
+
+#include "AudioSampleFormat.h"
+#include "nsTArray.h"
+#include "mozilla/PodOperations.h"
+
+namespace mozilla {
+typedef void(*MixerFunc)(AudioDataValue* aMixedBuffer,
+                         AudioSampleFormat aFormat,
+                         uint32_t aChannels,
+                         uint32_t aFrames);
+
+/**
+ * This class mixes multiple streams of audio together to output a single audio
+ * stream.
+ *
+ * AudioMixer::Mix is to be called repeatedly with buffers that have the same
+ * length, sample rate, sample format and channel count.
+ *
+ * When all the tracks have been mixed, calling FinishMixing will call back with
+ * a buffer containing the mixed audio data.
+ *
+ * This class is not thread safe.
+ */
+class AudioMixer
+{
+public:
+  explicit AudioMixer(MixerFunc aCallback)
+    : mCallback(aCallback), mFrames(0), mChannels(0) { }
+
+  /* Hand the mixed data to the callback, then reset for the next block. This
+   * is supposed to be called when all the tracks have been mixed in. The
+   * callback should not hold onto the data. */
+  void FinishMixing() {
+    mCallback(mMixedAudio.Elements(),
+              AudioSampleTypeToFormat<AudioDataValue>::Format,
+              mChannels,
+              mFrames);
+    PodZero(mMixedAudio.Elements(), mMixedAudio.Length());
+    mChannels = mFrames = 0;
+  }
+
+  /* Add a buffer to the mix. aSamples is interleaved. The first buffer of a
+   * block sets the expected size; a buffer of another size is not mixed. */
+  void Mix(AudioDataValue* aSamples, uint32_t aChannels, uint32_t aFrames) {
+    if (!mFrames && !mChannels) {
+      mFrames = aFrames;
+      mChannels = aChannels;
+      EnsureCapacityAndSilence();
+    }
+
+    MOZ_ASSERT(aFrames == mFrames);
+    MOZ_ASSERT(aChannels == mChannels);
+    if (aFrames != mFrames || aChannels != mChannels) {
+      return; // Asserts compile out in release: never overrun mMixedAudio.
+    }
+    for (uint32_t i = 0, count = aFrames * aChannels; i < count; i++) {
+      mMixedAudio[i] += aSamples[i];
+    }
+  }
+private:
+  void EnsureCapacityAndSilence() {
+    if (mFrames * mChannels > mMixedAudio.Length()) {
+      mMixedAudio.SetLength(mFrames * mChannels);
+    }
+    PodZero(mMixedAudio.Elements(), mMixedAudio.Length());
+  }
+
+  /* Function that is called when the mixing is done. */
+  MixerFunc mCallback;
+  /* Number of frames for this mixing block. */
+  uint32_t mFrames;
+  /* Number of channels for this mixing block. */
+  uint32_t mChannels;
+  /* Buffer containing the mixed audio data. */
+  nsTArray<AudioDataValue> mMixedAudio;
+};
+}
+
+#endif // MOZILLA_AUDIOMIXER_H_
diff --git a/content/media/AudioSampleFormat.h b/content/media/AudioSampleFormat.h
index f35c83712335..89a38ff9f550 100644
--- a/content/media/AudioSampleFormat.h
+++ b/content/media/AudioSampleFormat.h
@@ -49,7 +49,19 @@ public:
 
 typedef AudioSampleTraits<AUDIO_OUTPUT_FORMAT>::Type AudioDataValue;
 
-// Single-sample conversion 
+template<typename T> class AudioSampleTypeToFormat;
+
+template <> class AudioSampleTypeToFormat<float> {
+public:
+  static const AudioSampleFormat Format = AUDIO_FORMAT_FLOAT32;
+};
+
+template <> class AudioSampleTypeToFormat<short> {
+public:
+  static const AudioSampleFormat Format = AUDIO_FORMAT_S16;
+};
+
+// Single-sample conversion
 /*
  * Use "2^N" conversion since it's simple, fast, "bit transparent", used by
  * many other libraries and apparently behaves reasonably.
diff --git a/content/media/AudioSegment.cpp b/content/media/AudioSegment.cpp
index 68b80765b601..359bc43e379e 100644
--- a/content/media/AudioSegment.cpp
+++ b/content/media/AudioSegment.cpp
@@ -6,6 +6,7 @@
 #include "AudioSegment.h"
 
 #include "AudioStream.h"
+#include "AudioMixer.h"
 #include "AudioChannelFormat.h"
 #include "Latency.h"
 #include "speex/speex_resampler.h"
@@ -134,69 +135,74 @@ void AudioSegment::ResampleChunks(SpeexResamplerState* aResampler)
 }
 
 void
-AudioSegment::WriteTo(uint64_t aID, AudioStream* aOutput)
+AudioSegment::WriteTo(uint64_t aID, AudioStream* aOutput, AudioMixer* aMixer)
 {
   uint32_t outputChannels = aOutput->GetChannels();
   nsAutoTArray<AudioDataValue,AUDIO_PROCESSING_FRAMES*GUESS_AUDIO_CHANNELS> buf;
   nsAutoTArray<const void*,GUESS_AUDIO_CHANNELS> channelData;
 
+  if (!GetDuration()) {
+    return;
+  }
+
+  uint32_t outBufferLength = GetDuration() * outputChannels;
+  buf.SetLength(outBufferLength);
+
+  // Samples filled so far in buf; chunks dropped below do not advance it.
+  uint32_t offset = 0;
+
   for (ChunkIterator ci(*this); !ci.IsEnded(); ci.Next()) {
     AudioChunk& c = *ci;
-    TrackTicks offset = 0;
-    while (offset < c.mDuration) {
-      TrackTicks durationTicks =
-        std::min<TrackTicks>(c.mDuration - offset, AUDIO_PROCESSING_FRAMES);
-      if (uint64_t(outputChannels)*durationTicks > INT32_MAX || offset > INT32_MAX) {
-        NS_ERROR("Buffer overflow");
-        return;
-      }
+    uint32_t frames = c.mDuration;
 
-      uint32_t duration = uint32_t(durationTicks);
-
-      // If we have written data in the past, or we have real (non-silent) data
-      // to write, we can proceed. Otherwise, it means we just started the
-      // AudioStream, and we don't have real data to write to it (just silence).
-      // To avoid overbuffering in the AudioStream, we simply drop the silence,
-      // here. The stream will underrun and output silence anyways.
-      if (c.mBuffer || aOutput->GetWritten()) {
-        buf.SetLength(outputChannels*duration);
-        if (c.mBuffer) {
-          channelData.SetLength(c.mChannelData.Length());
-          for (uint32_t i = 0; i < channelData.Length(); ++i) {
-            channelData[i] =
-              AddAudioSampleOffset(c.mChannelData[i], c.mBufferFormat, int32_t(offset));
-          }
-
-          if (channelData.Length() < outputChannels) {
-            // Up-mix. Note that this might actually make channelData have more
-            // than outputChannels temporarily.
-            AudioChannelsUpMix(&channelData, outputChannels, gZeroChannel);
-          }
-
-          if (channelData.Length() > outputChannels) {
-            // Down-mix.
-            DownmixAndInterleave(channelData, c.mBufferFormat, duration,
-                                 c.mVolume, outputChannels, buf.Elements());
-          } else {
-            InterleaveAndConvertBuffer(channelData.Elements(), c.mBufferFormat,
-                                       duration, c.mVolume,
-                                       outputChannels,
-                                       buf.Elements());
-          }
-        } else {
-          // Assumes that a bit pattern of zeroes == 0.0f
-          memset(buf.Elements(), 0, buf.Length()*sizeof(AudioDataValue));
+    // If we have written data in the past, or we have real (non-silent) data
+    // to write, we can proceed. Otherwise, it means we just started the
+    // AudioStream, and we don't have real data to write to it (just silence).
+    // To avoid overbuffering in the AudioStream, we simply drop the silence,
+    // here. The stream will underrun and output silence anyways.
+    if (c.mBuffer || aOutput->GetWritten()) {
+      if (c.mBuffer) {
+        channelData.SetLength(c.mChannelData.Length());
+        for (uint32_t i = 0; i < channelData.Length(); ++i) {
+          channelData[i] = c.mChannelData[i];
         }
-        aOutput->Write(buf.Elements(), int32_t(duration), &(c.mTimeStamp));
+
+        if (channelData.Length() < outputChannels) {
+          // Up-mix. Note that this might actually make channelData have more
+          // than outputChannels temporarily.
+          AudioChannelsUpMix(&channelData, outputChannels, gZeroChannel);
+        }
+
+        if (channelData.Length() > outputChannels) {
+          // Down-mix.
+          DownmixAndInterleave(channelData, c.mBufferFormat, frames,
+                               c.mVolume, outputChannels, buf.Elements() + offset);
+        } else {
+          InterleaveAndConvertBuffer(channelData.Elements(), c.mBufferFormat,
+                                     frames, c.mVolume,
+                                     outputChannels,
+                                     buf.Elements() + offset);
+        }
+      } else {
+        // Assumes that a bit pattern of zeroes == 0.0f
+        memset(buf.Elements() + offset, 0, outputChannels * frames * sizeof(AudioDataValue));
       }
-      if(!c.mTimeStamp.IsNull()) {
-        TimeStamp now = TimeStamp::Now();
-        // would be more efficient to c.mTimeStamp to ms on create time then pass here
-        LogTime(AsyncLatencyLogger::AudioMediaStreamTrack, aID,
-                (now - c.mTimeStamp).ToMilliseconds(), c.mTimeStamp);
-      }
-      offset += duration;
+      offset += frames * outputChannels;
     }
+
+    if (!c.mTimeStamp.IsNull()) {
+      TimeStamp now = TimeStamp::Now();
+      // would be more efficient to c.mTimeStamp to ms on create time then pass here
+      LogTime(AsyncLatencyLogger::AudioMediaStreamTrack, aID,
+              (now - c.mTimeStamp).ToMilliseconds(), c.mTimeStamp);
+    }
+  }
+
+  aOutput->Write(buf.Elements(), offset / outputChannels, &(mChunks[mChunks.Length() - 1].mTimeStamp));
+  if (aMixer) {
+    // The mixer is given a full block: silence the part that was not filled.
+    memset(buf.Elements() + offset, 0, (outBufferLength - offset) * sizeof(AudioDataValue));
+    aMixer->Mix(buf.Elements(), outputChannels, GetDuration());
   }
   aOutput->Start();
 }
diff --git a/content/media/AudioSegment.h b/content/media/AudioSegment.h
index c80a3ae45bc3..d30d0051722f 100644
--- a/content/media/AudioSegment.h
+++ b/content/media/AudioSegment.h
@@ -27,6 +27,7 @@ public:
 };
 
 class AudioStream;
+class AudioMixer;
 
 /**
  * For auto-arrays etc, guess this as the common number of channels.
@@ -215,7 +216,7 @@ public:
     return chunk;
   }
   void ApplyVolume(float aVolume);
-  void WriteTo(uint64_t aID, AudioStream* aOutput);
+  void WriteTo(uint64_t aID, AudioStream* aOutput, AudioMixer* aMixer = nullptr);
 
   int ChannelCount() {
     NS_WARN_IF_FALSE(!mChunks.IsEmpty(),
diff --git a/content/media/MediaSegment.h b/content/media/MediaSegment.h
index bbad4ec1397c..ef3793aa1c21 100644
--- a/content/media/MediaSegment.h
+++ b/content/media/MediaSegment.h
@@ -267,9 +267,8 @@ protected:
   void AppendSliceInternal(const MediaSegmentBase<C, Chunk>& aSource,
                            TrackTicks aStart, TrackTicks aEnd)
   {
-    NS_ASSERTION(aStart <= aEnd, "Endpoints inverted");
-    NS_WARN_IF_FALSE(aStart >= 0 && aEnd <= aSource.mDuration,
-                     "Slice out of range");
+    MOZ_ASSERT(aStart <= aEnd, "Endpoints inverted");
+    MOZ_ASSERT(aStart >= 0 && aEnd <= aSource.mDuration, "Slice out of range");
     mDuration += aEnd - aStart;
     TrackTicks offset = 0;
     for (uint32_t i = 0; i < aSource.mChunks.Length() && offset < aEnd; ++i) {
diff --git a/content/media/MediaStreamGraph.cpp b/content/media/MediaStreamGraph.cpp
index db17f4d7df27..108fd6648631 100644
--- a/content/media/MediaStreamGraph.cpp
+++ b/content/media/MediaStreamGraph.cpp
@@ -577,17 +577,30 @@ MediaStreamGraphImpl::UpdateStreamOrderForStream(mozilla::LinkedList<MediaStream
   *mStreams.AppendElement() = stream.forget();
 }
 
+static void AudioMixerCallback(AudioDataValue* aMixedBuffer,
+                               AudioSampleFormat aFormat,
+                               uint32_t aChannels,
+                               uint32_t aFrames)
+{
+  // Placeholder: still need an API to register mixer callbacks, bug 989921.
+}
+
 void
 MediaStreamGraphImpl::UpdateStreamOrder()
 {
   mOldStreams.SwapElements(mStreams);
   mStreams.ClearAndRetainStorage();
+  bool shouldMix = false;
   for (uint32_t i = 0; i < mOldStreams.Length(); ++i) {
     MediaStream* stream = mOldStreams[i];
     stream->mHasBeenOrdered = false;
     stream->mIsConsumed = false;
     stream->mIsOnOrderingStack = false;
     stream->mInBlockingSet = false;
+    if (stream->AsSourceStream() &&
+        stream->AsSourceStream()->NeedsMixing()) {
+      shouldMix = true;
+    }
     ProcessedMediaStream* ps = stream->AsProcessedStream();
     if (ps) {
       ps->mInCycle = false;
@@ -598,6 +611,12 @@ MediaStreamGraphImpl::UpdateStreamOrder()
     }
   }
 
+  if (!mMixer && shouldMix) {
+    mMixer = new AudioMixer(AudioMixerCallback);
+  } else if (mMixer && !shouldMix) {
+    mMixer = nullptr;
+  }
+
   mozilla::LinkedList<MediaStream> stack;
   for (uint32_t i = 0; i < mOldStreams.Length(); ++i) {
     nsRefPtr<MediaStream>& s = mOldStreams[i];
@@ -810,6 +829,7 @@ MediaStreamGraphImpl::CreateOrDestroyAudioStreams(GraphTime aAudioOutputStartTim
           aStream->mAudioOutputStreams.AppendElement();
         audioOutputStream->mAudioPlaybackStartTime = aAudioOutputStartTime;
         audioOutputStream->mBlockedAudioTime = 0;
+        audioOutputStream->mLastTickWritten = 0;
         audioOutputStream->mStream = new AudioStream();
         // XXX for now, allocate stereo output. But we need to fix this to
         // match the system's ideal channel configuration.
@@ -831,14 +851,22 @@ MediaStreamGraphImpl::CreateOrDestroyAudioStreams(GraphTime aAudioOutputStartTim
   }
 }
 
-void
+TrackTicks
 MediaStreamGraphImpl::PlayAudio(MediaStream* aStream,
                                 GraphTime aFrom, GraphTime aTo)
 {
   MOZ_ASSERT(mRealtime, "Should only attempt to play audio in realtime mode");
 
+  TrackTicks ticksWritten = 0;
+  // We compute the number of needed ticks by converting a difference of graph
+  // times rather than by subtracting two converted stream times, to ensure that
+  // the rounding between {Graph,Stream}Time and track ticks is not dependent
+  // on the absolute value of the {Graph,Stream}Time, and so that the number of
+  // ticks to play is the same for each cycle.
+  TrackTicks ticksNeeded = TimeToTicksRoundDown(IdealAudioRate(), aTo) - TimeToTicksRoundDown(IdealAudioRate(), aFrom);
+
   if (aStream->mAudioOutputStreams.IsEmpty()) {
-    return;
+    return 0;
   }
 
   // When we're playing multiple copies of this stream at the same time, they're
@@ -852,6 +880,25 @@ MediaStreamGraphImpl::PlayAudio(MediaStream* aStream,
     MediaStream::AudioOutputStream& audioOutput = aStream->mAudioOutputStreams[i];
     StreamBuffer::Track* track = aStream->mBuffer.FindTrack(audioOutput.mTrackID);
     AudioSegment* audio = track->Get<AudioSegment>();
+    AudioSegment output;
+    MOZ_ASSERT(track->GetRate() == IdealAudioRate());
+
+    // offset and audioOutput.mLastTickWritten can differ by at most one sample,
+    // because of the rounding issue. We track that to ensure we don't skip a
+    // sample, or play a sample twice.
+    TrackTicks offset = track->TimeToTicksRoundDown(GraphTimeToStreamTime(aStream, aFrom));
+    if (!audioOutput.mLastTickWritten) {
+        audioOutput.mLastTickWritten = offset;
+    }
+    if (audioOutput.mLastTickWritten != offset) {
+      // If there is a global underrun of the MSG, this property won't hold, and
+      // we reset the sample count tracking.
+      if (std::abs(audioOutput.mLastTickWritten - offset) != 1) {
+        audioOutput.mLastTickWritten = offset;
+      } else {
+        offset = audioOutput.mLastTickWritten;
+      }
+    }
 
     // We don't update aStream->mBufferStartTime here to account for
     // time spent blocked. Instead, we'll update it in UpdateCurrentTime after the
@@ -859,54 +906,59 @@ MediaStreamGraphImpl::PlayAudio(MediaStream* aStream,
     // right offsets in the stream buffer, even if we've already written silence for
     // some amount of blocked time after the current time.
     GraphTime t = aFrom;
-    while (t < aTo) {
+    while (ticksNeeded) {
       GraphTime end;
       bool blocked = aStream->mBlocked.GetAt(t, &end);
       end = std::min(end, aTo);
 
-      AudioSegment output;
-      if (blocked) {
-        // Track total blocked time in aStream->mBlockedAudioTime so that
-        // the amount of silent samples we've inserted for blocking never gets
-        // more than one sample away from the ideal amount.
-        TrackTicks startTicks =
-            TimeToTicksRoundDown(IdealAudioRate(), audioOutput.mBlockedAudioTime);
-        audioOutput.mBlockedAudioTime += end - t;
-        TrackTicks endTicks =
-            TimeToTicksRoundDown(IdealAudioRate(), audioOutput.mBlockedAudioTime);
-
-        output.InsertNullDataAtStart(endTicks - startTicks);
-        STREAM_LOG(PR_LOG_DEBUG+1, ("MediaStream %p writing blocking-silence samples for %f to %f",
-                                    aStream, MediaTimeToSeconds(t), MediaTimeToSeconds(end)));
+      // Ticks covered by the [t, end) interval. The last interval of the cycle
+      // takes whatever is left, so that rounding cannot add or lose a tick.
+      TrackTicks toWrite = 0;
+      if (end >= aTo) {
+        toWrite = ticksNeeded;
       } else {
-        TrackTicks startTicks =
-            track->TimeToTicksRoundDown(GraphTimeToStreamTime(aStream, t));
-        TrackTicks endTicks =
-            track->TimeToTicksRoundDown(GraphTimeToStreamTime(aStream, end));
-
-        // If startTicks is before the track start, then that part of 'audio'
-        // will just be silence, which is fine here. But if endTicks is after
-        // the track end, then 'audio' won't be long enough, so we'll need
-        // to explicitly play silence.
-        TrackTicks sliceEnd = std::min(endTicks, audio->GetDuration());
-        if (sliceEnd > startTicks) {
-          output.AppendSlice(*audio, startTicks, sliceEnd);
-        }
-        // Play silence where the track has ended
-        output.AppendNullData(endTicks - sliceEnd);
-        NS_ASSERTION(endTicks == sliceEnd || track->IsEnded(),
-                     "Ran out of data but track not ended?");
-        output.ApplyVolume(volume);
-        STREAM_LOG(PR_LOG_DEBUG+1, ("MediaStream %p writing samples for %f to %f (samples %lld to %lld)",
-                                    aStream, MediaTimeToSeconds(t), MediaTimeToSeconds(end),
-                                    startTicks, endTicks));
+        toWrite = TimeToTicksRoundDown(IdealAudioRate(), end - t);
+      }
+
+      if (blocked) {
+        output.InsertNullDataAtStart(toWrite);
+        STREAM_LOG(PR_LOG_DEBUG+1, ("MediaStream %p writing %lld blocking-silence samples for %f to %f (%lld to %lld)\n",
+                                    aStream, toWrite, MediaTimeToSeconds(t), MediaTimeToSeconds(end),
+                                    offset, offset + toWrite));
+        ticksNeeded -= toWrite;
+      } else {
+        TrackTicks endTicksNeeded = offset + toWrite;
+        TrackTicks endTicksAvailable = audio->GetDuration();
+        if (endTicksNeeded <= endTicksAvailable) {
+          output.AppendSlice(*audio, offset, endTicksNeeded);
+        } else {
+          MOZ_ASSERT(track->IsEnded(), "Not enough data, and track not ended.");
+          // If we are at the end of the track, maybe write the remaining
+          // samples, and pad with/output silence.
+          if (offset < endTicksAvailable) {
+            output.AppendSlice(*audio, offset, endTicksAvailable);
+            ticksNeeded -= endTicksAvailable - offset;
+            toWrite -= endTicksAvailable - offset;
+          }
+          output.AppendNullData(toWrite);
+        }
+        output.ApplyVolume(volume);
+        STREAM_LOG(PR_LOG_DEBUG+1, ("MediaStream %p writing %lld samples for %f to %f (samples %lld to %lld)\n",
+                                     aStream, toWrite, MediaTimeToSeconds(t), MediaTimeToSeconds(end),
+                                     offset, endTicksNeeded));
+        ticksNeeded -= toWrite;
       }
-      // Need unique id for stream & track - and we want it to match the inserter
-      output.WriteTo(LATENCY_STREAM_ID(aStream, track->GetID()),
-                     audioOutput.mStream);
       t = end;
+      offset += toWrite;
+      audioOutput.mLastTickWritten += toWrite;
     }
+    // Report the frames queued for this output; 0 was returned unconditionally.
+    ticksWritten = std::max(ticksWritten, output.GetDuration());
+    // Need unique id for stream & track - and we want it to match the inserter
+    output.WriteTo(LATENCY_STREAM_ID(aStream, track->GetID()),
+                   audioOutput.mStream, mMixer);
   }
+  return ticksWritten;
 }
 
 static void
@@ -1241,6 +1293,9 @@ MediaStreamGraphImpl::RunThread()
     bool allBlockedForever = true;
     // True when we've done ProcessInput for all processed streams.
     bool doneAllProducing = false;
+    // This is the number of frames that are written to the AudioStreams, for
+    // this cycle.
+    TrackTicks ticksPlayed = 0;
     // Figure out what each stream wants to do
     for (uint32_t i = 0; i < mStreams.Length(); ++i) {
       MediaStream* stream = mStreams[i];
@@ -1277,7 +1332,13 @@ MediaStreamGraphImpl::RunThread()
       if (mRealtime) {
         // Only playback audio and video in real-time mode
         CreateOrDestroyAudioStreams(prevComputedTime, stream);
-        PlayAudio(stream, prevComputedTime, mStateComputedTime);
+        TrackTicks ticksPlayedForThisStream = PlayAudio(stream, prevComputedTime, mStateComputedTime);
+        if (!ticksPlayed) {
+          ticksPlayed = ticksPlayedForThisStream;
+        } else {
+          MOZ_ASSERT(!ticksPlayedForThisStream || ticksPlayedForThisStream == ticksPlayed,
+              "Each stream should have the same number of frame.");
+        }
         PlayVideo(stream);
       }
       SourceMediaStream* is = stream->AsSourceStream();
@@ -1289,6 +1350,11 @@ MediaStreamGraphImpl::RunThread()
         allBlockedForever = false;
       }
     }
+
+    if (mMixer) {
+      mMixer->FinishMixing();
+    }
+
     if (ensureNextIteration || !allBlockedForever) {
       EnsureNextIteration();
     }
@@ -2317,6 +2383,20 @@ SourceMediaStream::GetBufferedTicks(TrackID aID)
   return 0;
 }
 
+void
+SourceMediaStream::RegisterForAudioMixing()
+{
+  MutexAutoLock lock(mMutex);
+  mNeedsMixing = true;
+}
+
+bool
+SourceMediaStream::NeedsMixing()
+{
+  MutexAutoLock lock(mMutex);
+  return mNeedsMixing;
+}
+
 void
 MediaInputPort::Init()
 {
@@ -2501,6 +2581,7 @@ MediaStreamGraphImpl::MediaStreamGraphImpl(bool aRealtime)
   , mNonRealtimeProcessing(false)
   , mStreamOrderDirty(false)
   , mLatencyLog(AsyncLatencyLogger::Get())
+  , mMixer(nullptr)
 {
 #ifdef PR_LOGGING
   if (!gMediaStreamGraphLog) {
diff --git a/content/media/MediaStreamGraph.h b/content/media/MediaStreamGraph.h
index 0889a9c2fde5..cf2dd23df0d7 100644
--- a/content/media/MediaStreamGraph.h
+++ b/content/media/MediaStreamGraph.h
@@ -18,6 +18,7 @@
 #include "MainThreadUtils.h"
 #include "nsAutoRef.h"
 #include "speex/speex_resampler.h"
+#include "AudioMixer.h"
 
 class nsIRunnable;
 
@@ -572,6 +573,8 @@ protected:
     // Amount of time that we've wanted to play silence because of the stream
     // blocking.
     MediaTime mBlockedAudioTime;
+    // Last tick written to the audio output.
+    TrackTicks mLastTickWritten;
     nsAutoPtr<AudioStream> mStream;
     TrackID mTrackID;
   };
@@ -782,6 +785,9 @@ public:
     bool mHaveEnough;
   };
 
+  void RegisterForAudioMixing();
+  bool NeedsMixing();
+
 protected:
   TrackData* FindDataForTrack(TrackID aID)
   {
@@ -815,6 +821,7 @@ protected:
   bool mPullEnabled;
   bool mUpdateFinished;
   bool mDestroyed;
+  bool mNeedsMixing;
 };
 
 /**
diff --git a/content/media/MediaStreamGraphImpl.h b/content/media/MediaStreamGraphImpl.h
index f6e1ea1db64d..e0beef184d17 100644
--- a/content/media/MediaStreamGraphImpl.h
+++ b/content/media/MediaStreamGraphImpl.h
@@ -13,12 +13,15 @@
 #include "nsIThread.h"
 #include "nsIRunnable.h"
 #include "Latency.h"
+#include "mozilla/WeakPtr.h"
 
 namespace mozilla {
 
 template <typename T>
 class LinkedList;
 
+class AudioMixer;
+
 /**
  * Assume we can run an iteration of the MediaStreamGraph loop in this much time
  * or less.
@@ -52,10 +55,6 @@ static const int AUDIO_TARGET_MS = 2*MEDIA_GRAPH_TARGET_PERIOD_MS +
 static const int VIDEO_TARGET_MS = 2*MEDIA_GRAPH_TARGET_PERIOD_MS +
     SCHEDULE_SAFETY_MARGIN_MS;
 
-/**
- * Rate at which we run the video tracks.
- */
-
 /**
  * A per-stream update message passed from the media graph thread to the
  * main thread.
@@ -327,9 +326,9 @@ public:
                                    MediaStream* aStream);
   /**
    * Queue audio (mix of stream audio and silence for blocked intervals)
-   * to the audio output stream.
+   * to the audio output stream. Returns the number of frames played.
    */
-  void PlayAudio(MediaStream* aStream, GraphTime aFrom, GraphTime aTo);
+  TrackTicks PlayAudio(MediaStream* aStream, GraphTime aFrom, GraphTime aTo);
   /**
    * Set the correct current video frame for stream aStream.
    */
@@ -575,6 +574,10 @@ public:
    * Hold a ref to the Latency logger
    */
   nsRefPtr<AsyncLatencyLogger> mLatencyLog;
+  /**
+   * If this is not null, all the audio output for the MSG will be mixed down.
+   */
+  nsAutoPtr<AudioMixer> mMixer;
 };
 
 }
diff --git a/content/media/compiledtest/TestAudioMixer.cpp b/content/media/compiledtest/TestAudioMixer.cpp
new file mode 100644
index 000000000000..10f6cb8354b2
--- /dev/null
+++ b/content/media/compiledtest/TestAudioMixer.cpp
@@ -0,0 +1,155 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "AudioMixer.h"
+#include <assert.h>
+
+using mozilla::AudioDataValue;
+using mozilla::AudioSampleFormat;
+
+/* Mixer callback: crashes if any mixed sample is non-zero. In this test, the
+ * audio streams and channels are always created to cancel each other out. */
+void MixingDone(AudioDataValue* aData, AudioSampleFormat aFormat, uint32_t aChannels, uint32_t aFrames)
+{
+  bool silent = true;
+  for (uint32_t i = 0; i < aChannels * aFrames; i++) {
+    if (aData[i] != 0.0) {
+      if (aFormat == mozilla::AUDIO_FORMAT_S16) {
+        fprintf(stderr, "Sample at %u is not silent: %d\n", i, (short)aData[i]);
+      } else {
+        fprintf(stderr, "Sample at %u is not silent: %f\n", i, (float)aData[i]);
+      }
+      silent = false;
+    }
+  }
+  if (!silent) {
+    MOZ_CRASH();
+  }
+}
+
+/* Helper functions to give us the maximum and minimum values that don't clip,
+ * for a given sample format (integer or floating-point). */
+template<typename T>
+T GetLowValue();
+
+template<typename T>
+T GetHighValue();
+
+template<>
+float GetLowValue<float>() {
+  return -1.0;
+}
+
+template<>
+short GetLowValue<short>() {
+  return -INT16_MAX;
+}
+
+template<>
+float GetHighValue<float>() {
+  return 1.0;
+}
+
+template<>
+short GetHighValue<short>() {
+  return INT16_MAX;
+}
+
+void FillBuffer(AudioDataValue* aBuffer, uint32_t aLength, AudioDataValue aValue)
+{
+  AudioDataValue* end = aBuffer + aLength;
+  while (aBuffer != end) {
+    *aBuffer++ = aValue;
+  }
+}
+
+int main(int argc, char* argv[]) {
+  const uint32_t CHANNEL_LENGTH = 256;
+  AudioDataValue a[CHANNEL_LENGTH * 2];
+  AudioDataValue b[CHANNEL_LENGTH * 2];
+  FillBuffer(a, CHANNEL_LENGTH, GetLowValue<AudioDataValue>());
+  FillBuffer(a + CHANNEL_LENGTH, CHANNEL_LENGTH, GetHighValue<AudioDataValue>());
+  FillBuffer(b, CHANNEL_LENGTH, GetHighValue<AudioDataValue>());
+  FillBuffer(b + CHANNEL_LENGTH, CHANNEL_LENGTH, GetLowValue<AudioDataValue>());
+
+  {
+    int iterations = 2;
+    mozilla::AudioMixer mixer(MixingDone);
+
+    fprintf(stderr, "Test AudioMixer constant buffer length.\n");
+
+    while (iterations--) {
+      mixer.Mix(a, 2, CHANNEL_LENGTH);
+      mixer.Mix(b, 2, CHANNEL_LENGTH);
+      mixer.FinishMixing();
+    }
+  }
+
+  {
+    mozilla::AudioMixer mixer(MixingDone);
+
+    fprintf(stderr, "Test AudioMixer variable buffer length.\n");
+
+    FillBuffer(a, CHANNEL_LENGTH / 2, GetLowValue<AudioDataValue>());
+    FillBuffer(a + CHANNEL_LENGTH / 2, CHANNEL_LENGTH / 2, GetLowValue<AudioDataValue>());
+    FillBuffer(b, CHANNEL_LENGTH / 2, GetHighValue<AudioDataValue>());
+    FillBuffer(b + CHANNEL_LENGTH / 2, CHANNEL_LENGTH / 2, GetHighValue<AudioDataValue>());
+    mixer.Mix(a, 2, CHANNEL_LENGTH / 2);
+    mixer.Mix(b, 2, CHANNEL_LENGTH / 2);
+    mixer.FinishMixing();
+    FillBuffer(a, CHANNEL_LENGTH, GetLowValue<AudioDataValue>());
+    FillBuffer(a + CHANNEL_LENGTH, CHANNEL_LENGTH, GetHighValue<AudioDataValue>());
+    FillBuffer(b, CHANNEL_LENGTH, GetHighValue<AudioDataValue>());
+    FillBuffer(b + CHANNEL_LENGTH, CHANNEL_LENGTH, GetLowValue<AudioDataValue>());
+    mixer.Mix(a, 2, CHANNEL_LENGTH);
+    mixer.Mix(b, 2, CHANNEL_LENGTH);
+    mixer.FinishMixing();
+    FillBuffer(a, CHANNEL_LENGTH / 2, GetLowValue<AudioDataValue>());
+    FillBuffer(a + CHANNEL_LENGTH / 2, CHANNEL_LENGTH / 2, GetLowValue<AudioDataValue>());
+    FillBuffer(b, CHANNEL_LENGTH / 2, GetHighValue<AudioDataValue>());
+    FillBuffer(b + CHANNEL_LENGTH / 2, CHANNEL_LENGTH / 2, GetHighValue<AudioDataValue>());
+    mixer.Mix(a, 2, CHANNEL_LENGTH / 2);
+    mixer.Mix(b, 2, CHANNEL_LENGTH / 2);
+    mixer.FinishMixing();
+  }
+
+  FillBuffer(a, CHANNEL_LENGTH, GetLowValue<AudioDataValue>());
+  FillBuffer(b, CHANNEL_LENGTH, GetHighValue<AudioDataValue>());
+
+  {
+    mozilla::AudioMixer mixer(MixingDone);
+    fprintf(stderr, "Test AudioMixer variable channel count.\n");
+
+    mixer.Mix(a, 1, CHANNEL_LENGTH);
+    mixer.Mix(b, 1, CHANNEL_LENGTH);
+    mixer.FinishMixing();
+    mixer.Mix(a, 1, CHANNEL_LENGTH);
+    mixer.Mix(b, 1, CHANNEL_LENGTH);
+    mixer.FinishMixing();
+    mixer.Mix(a, 1, CHANNEL_LENGTH);
+    mixer.Mix(b, 1, CHANNEL_LENGTH);
+    mixer.FinishMixing();
+  }
+
+  {
+    mozilla::AudioMixer mixer(MixingDone);
+    fprintf(stderr, "Test AudioMixer variable stream count.\n");
+
+    mixer.Mix(a, 2, CHANNEL_LENGTH);
+    mixer.Mix(b, 2, CHANNEL_LENGTH);
+    mixer.FinishMixing();
+    mixer.Mix(a, 2, CHANNEL_LENGTH);
+    mixer.Mix(b, 2, CHANNEL_LENGTH);
+    mixer.Mix(a, 2, CHANNEL_LENGTH);
+    mixer.Mix(b, 2, CHANNEL_LENGTH);
+    mixer.FinishMixing();
+    mixer.Mix(a, 2, CHANNEL_LENGTH);
+    mixer.Mix(b, 2, CHANNEL_LENGTH);
+    mixer.FinishMixing();
+  }
+
+  return 0;
+}
+
diff --git a/content/media/compiledtest/moz.build b/content/media/compiledtest/moz.build
new file mode 100644
index 000000000000..8a1cffa507b9
--- /dev/null
+++ b/content/media/compiledtest/moz.build
@@ -0,0 +1,16 @@
+# -*- Mode: python; c-basic-offset: 4; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+CPP_UNIT_TESTS += [
+    'TestAudioMixer.cpp',
+]
+
+FAIL_ON_WARNINGS = True
+
+LOCAL_INCLUDES += [
+    '..',
+]
+
diff --git a/content/media/moz.build b/content/media/moz.build
index 868ab143c0ae..f48fce4940e0 100644
--- a/content/media/moz.build
+++ b/content/media/moz.build
@@ -12,6 +12,8 @@ PARALLEL_DIRS += [
   'webvtt'
 ]
 
+TEST_TOOL_DIRS += ['compiledtest']
+
 if CONFIG['MOZ_RAW']:
     PARALLEL_DIRS += ['raw']
 
@@ -57,6 +59,7 @@ EXPORTS += [
     'AudioChannelFormat.h',
     'AudioCompactor.h',
     'AudioEventTimeline.h',
+    'AudioMixer.h',
     'AudioNodeEngine.h',
     'AudioNodeExternalInputStream.h',
     'AudioNodeStream.h',
diff --git a/content/media/webrtc/MediaEngineWebRTCAudio.cpp b/content/media/webrtc/MediaEngineWebRTCAudio.cpp
index 9b69d22985aa..09eb4e207d5e 100644
--- a/content/media/webrtc/MediaEngineWebRTCAudio.cpp
+++ b/content/media/webrtc/MediaEngineWebRTCAudio.cpp
@@ -158,6 +158,8 @@ MediaEngineWebRTCAudioSource::Start(SourceMediaStream* aStream, TrackID aID)
   AudioSegment* segment = new AudioSegment();
   aStream->AddTrack(aID, SAMPLE_FREQUENCY, 0, segment);
   aStream->AdvanceKnownTracksTime(STREAM_TIME_MAX);
+  // XXX Make this based on the pref.
+  aStream->RegisterForAudioMixing();
   LOG(("Start audio for stream %p", aStream));
 
   if (mState == kStarted) {