Mirror of https://github.com/mozilla/gecko-dev.git
Bug 1741959 - Move audio data processing to ProcessInput r=padenot,pehrsons
The interface for getting the data source of the AudioInputProcessing in AudioInputTrack is moved from AudioInputProcessing::NotifyInputData to ::ProcessInput, which takes an AudioSegment forwarded from the AudioInputTrack's source track.

Depends on D131870

Differential Revision: https://phabricator.services.mozilla.com/D122513
This commit is contained in:
Parent
8d5a436659
Commit
082eb01c54
@@ -61,6 +61,60 @@ void AudioSegment::ResampleChunks(nsAutoRef<SpeexResamplerState>& aResampler,
  }
}

size_t AudioSegment::WriteToInterleavedBuffer(nsTArray<AudioDataValue>& aBuffer,
                                              uint32_t aChannels) const {
  size_t offset = 0;
  if (GetDuration() <= 0) {
    MOZ_ASSERT(GetDuration() == 0);
    return offset;
  }

  // Calculate how many samples are in this segment
  size_t frames = static_cast<size_t>(GetDuration());
  CheckedInt<size_t> samples(frames);
  samples *= static_cast<size_t>(aChannels);
  MOZ_ASSERT(samples.isValid());
  if (!samples.isValid()) {
    return offset;
  }

  // Enlarge buffer space if needed
  if (samples.value() > aBuffer.Capacity()) {
    aBuffer.SetCapacity(samples.value());
  }
  aBuffer.SetLengthAndRetainStorage(samples.value());
  aBuffer.ClearAndRetainStorage();

  // Convert the de-interleaved chunks into an interleaved buffer. Note that
  // we may upmix or downmix the audio data if the channel count in the
  // chunks mismatches with aChannels
  for (ConstChunkIterator ci(*this); !ci.IsEnded(); ci.Next()) {
    const AudioChunk& c = *ci;
    size_t samplesInChunk = static_cast<size_t>(c.mDuration) * aChannels;
    switch (c.mBufferFormat) {
      case AUDIO_FORMAT_S16:
        WriteChunk<int16_t>(c, aChannels, c.mVolume,
                            aBuffer.Elements() + offset);
        break;
      case AUDIO_FORMAT_FLOAT32:
        WriteChunk<float>(c, aChannels, c.mVolume, aBuffer.Elements() + offset);
        break;
      case AUDIO_FORMAT_SILENCE:
        PodZero(aBuffer.Elements() + offset, samplesInChunk);
        break;
      default:
        MOZ_ASSERT_UNREACHABLE("Unknown format");
        PodZero(aBuffer.Elements() + offset, samplesInChunk);
        break;
    }
    offset += samplesInChunk;
  }
  MOZ_DIAGNOSTIC_ASSERT(samples.value() == offset,
                        "Segment's duration is incorrect");
  aBuffer.SetLengthAndRetainStorage(offset);
  return offset;
}
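
// A usage sketch, not part of this patch: converting an AudioSegment into an
// interleaved buffer with the helper above. `segment` and the helper name are
// hypothetical; the return value is in samples, so dividing by the channel
// count recovers the frame count.
static size_t InterleaveForTwoChannels(const AudioSegment& segment,
                                       nsTArray<AudioDataValue>& interleaved) {
  const uint32_t kChannels = 2;  // chunks are up/down-mixed to this count
  size_t samples = segment.WriteToInterleavedBuffer(interleaved, kChannels);
  return samples / kChannels;  // number of frames written
}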

// This helps to safely get a pointer to the position we want to start
// writing a planar audio buffer, depending on the channel and the offset in
// the buffer.
@@ -422,6 +422,12 @@ class AudioSegment : public MediaSegmentBase<AudioSegment, AudioChunk> {
    MOZ_ASSERT(aChannels == channels.Length());
    AppendFrames(buffer.forget(), channels, aFrames, aPrincipalHandle);
  }
  // Write the segment data into an interleaved buffer. Do mixing if the
  // AudioChunk's channel count in the segment is different from aChannels.
  // Returns the sample count of the converted audio data. The converted data
  // will be stored into aBuffer.
  size_t WriteToInterleavedBuffer(nsTArray<AudioDataValue>& aBuffer,
                                  uint32_t aChannels) const;
  // Consumes aChunk, and returns a pointer to the persistent copy of aChunk
  // in the segment.
  AudioChunk* AppendAndConsumeChunk(AudioChunk&& aChunk) {
@@ -486,8 +492,8 @@ class AudioSegment : public MediaSegmentBase<AudioSegment, AudioChunk> {
};

template <typename SrcT>
void WriteChunk(AudioChunk& aChunk, uint32_t aOutputChannels, float aVolume,
                AudioDataValue* aOutputBuffer) {
void WriteChunk(const AudioChunk& aChunk, uint32_t aOutputChannels,
                float aVolume, AudioDataValue* aOutputBuffer) {
  AutoTArray<const SrcT*, GUESS_AUDIO_CHANNELS> channelData;

  channelData = aChunk.ChannelData<SrcT>().Clone();
@@ -11,11 +11,45 @@

namespace mozilla {

#ifdef LOG_INTERNAL
#  undef LOG_INTERNAL
#endif  // LOG_INTERNAL
#define LOG_INTERNAL(level, msg, ...) \
  MOZ_LOG(gMediaTrackGraphLog, LogLevel::level, (msg, ##__VA_ARGS__))

#ifdef LOG
#  undef LOG
#endif  // LOG
#define LOG(msg, ...) LOG_INTERNAL(Debug, msg, ##__VA_ARGS__)

// This can only be used on the graph thread since mGraph->CurrentDriver() is
// graph-thread only
#ifdef TRACK_GRAPH_LOG_INTERNAL
#  undef TRACK_GRAPH_LOG_INTERNAL
#endif  // TRACK_GRAPH_LOG_INTERNAL
#define TRACK_GRAPH_LOG_INTERNAL(level, msg, ...)                        \
  LOG_INTERNAL(level, "(Graph %p, Driver %p) NativeInputTrack %p, " msg, \
               this->mGraph, this->mGraph->CurrentDriver(), this,        \
               ##__VA_ARGS__)

#ifdef TRACK_GRAPH_LOG
#  undef TRACK_GRAPH_LOG
#endif  // TRACK_GRAPH_LOG
#define TRACK_GRAPH_LOG(msg, ...) \
  TRACK_GRAPH_LOG_INTERNAL(Debug, msg, ##__VA_ARGS__)

#ifdef TRACK_GRAPH_LOGV
#  undef TRACK_GRAPH_LOGV
#endif  // TRACK_GRAPH_LOGV
#define TRACK_GRAPH_LOGV(msg, ...) \
  TRACK_GRAPH_LOG_INTERNAL(Verbose, msg, ##__VA_ARGS__)

/* static */
NativeInputTrack* NativeInputTrack::Create(MediaTrackGraphImpl* aGraph) {
  MOZ_ASSERT(NS_IsMainThread());

  NativeInputTrack* track = new NativeInputTrack(aGraph->GraphRate());
  LOG("Create NativeInputTrack %p in MTG %p", track, aGraph);
  aGraph->AddTrack(track);
  return track;
}
@@ -35,7 +69,7 @@ size_t NativeInputTrack::RemoveUser() {

void NativeInputTrack::DestroyImpl() {
  MOZ_ASSERT(mGraph->OnGraphThreadOrNotRunning());
  mInputData.Clear();
  mPendingData.Clear();
  ProcessedMediaTrack::DestroyImpl();
}
@@ -44,23 +78,27 @@ void NativeInputTrack::ProcessInput(GraphTime aFrom, GraphTime aTo,
  MOZ_ASSERT(mGraph->OnGraphThreadOrNotRunning());
  TRACE_COMMENT("NativeInputTrack::ProcessInput", "%p", this);

  if (mInputData.IsEmpty()) {
  TRACK_GRAPH_LOGV("ProcessInput from %" PRId64 " to %" PRId64
                   ", needs %" PRId64 " frames",
                   aFrom, aTo, aTo - aFrom);

  TrackTime from = GraphTimeToTrackTime(aFrom);
  TrackTime to = GraphTimeToTrackTime(aTo);
  if (from >= to) {
    return;
  }

  // The number of NotifyInputData and ProcessInput calls could be different.
  // We always process the input data from NotifyInputData in the first
  // ProcessInput call after the NotifyInputData
  MOZ_ASSERT_IF(!mIsBufferingAppended, mPendingData.IsEmpty());

  // The mSegment will be the de-interleaved audio data converted from
  // mInputData
  TrackTime need = to - from;
  TrackTime dataNeed = std::min(mPendingData.GetDuration(), need);
  TrackTime silenceNeed = std::max(need - dataNeed, (TrackTime)0);

  GetData<AudioSegment>()->Clear();
  GetData<AudioSegment>()->AppendFromInterleavedBuffer(
      mInputData.Data(), mInputData.FrameCount(), mInputData.Channels(),
      PRINCIPAL_HANDLE_NONE);
  MOZ_ASSERT_IF(dataNeed > 0, silenceNeed == 0);

  mInputData.Clear();
  GetData<AudioSegment>()->AppendSlice(mPendingData, 0, dataNeed);
  mPendingData.RemoveLeading(dataNeed);
  GetData<AudioSegment>()->AppendNullData(silenceNeed);
}
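
// A standalone sketch, not part of this patch, of the split computed in
// ProcessInput above: given `buffered` frames of pending real input and a
// request for `need` frames, the track is filled with real data first and
// padded with silence. Names are illustrative only.
static std::pair<TrackTime, TrackTime> SplitDataAndSilence(TrackTime buffered,
                                                           TrackTime need) {
  TrackTime data = std::min(buffered, need);
  TrackTime silence = need - data;  // >= 0 because data <= need
  return {data, silence};
}
// e.g. SplitDataAndSilence(80, 128) == {80, 48}: 80 real frames, 48 silent.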

uint32_t NativeInputTrack::NumberOfChannels() const {
@@ -82,8 +120,10 @@ void NativeInputTrack::NotifyInputStopped(MediaTrackGraphImpl* aGraph) {
  MOZ_ASSERT(aGraph->OnGraphThreadOrNotRunning());
  MOZ_ASSERT(aGraph == mGraph,
             "Receive input stopped signal from another graph");
  TRACK_GRAPH_LOG("NotifyInputStopped");
  mInputChannels = 0;
  mInputData.Clear();
  mIsBufferingAppended = false;
  mPendingData.Clear();
  for (auto& listener : mDataUsers) {
    listener->NotifyInputStopped(aGraph);
  }
@@ -96,12 +136,30 @@ void NativeInputTrack::NotifyInputData(MediaTrackGraphImpl* aGraph,
                                       uint32_t aAlreadyBuffered) {
  MOZ_ASSERT(aGraph->OnGraphThreadOrNotRunning());
  MOZ_ASSERT(aGraph == mGraph, "Receive input data from another graph");
  TRACK_GRAPH_LOGV(
      "NotifyInputData: frames=%zu, rate=%d, channel=%u, alreadyBuffered=%u",
      aFrames, aRate, aChannels, aAlreadyBuffered);

  if (!mIsBufferingAppended) {
    // First time we see live frames getting added. Use what's already buffered
    // in the driver's scratch buffer as a starting point.
    MOZ_ASSERT(mPendingData.IsEmpty());
    constexpr TrackTime buffering = WEBAUDIO_BLOCK_SIZE;
    const TrackTime remaining =
        buffering - static_cast<TrackTime>(aAlreadyBuffered);
    mPendingData.AppendNullData(remaining);
    mIsBufferingAppended = true;
    TRACK_GRAPH_LOG("Set mIsBufferingAppended by appending %" PRId64 " frames.",
                    remaining);
  }

  MOZ_ASSERT(aChannels);
  if (!mInputChannels) {
    mInputChannels = aChannels;
  }
  mInputData.Push(aBuffer, aFrames, aRate, aChannels);
  mPendingData.AppendFromInterleavedBuffer(aBuffer, aFrames, aChannels,
                                           PRINCIPAL_HANDLE_NONE);

  for (auto& listener : mDataUsers) {
    listener->NotifyInputData(aGraph, aBuffer, aFrames, aRate, aChannels,
                              aAlreadyBuffered);
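
// A sketch, not part of this patch, of the one-time pre-buffering above: the
// driver already holds aAlreadyBuffered frames, so only the difference up to
// one WebAudio block (128 frames) is appended as silence.
static TrackTime PreBufferingToAppend(uint32_t aAlreadyBuffered) {
  constexpr TrackTime buffering = WEBAUDIO_BLOCK_SIZE;  // 128 frames
  return buffering - static_cast<TrackTime>(aAlreadyBuffered);
}
// e.g. PreBufferingToAppend(72) == 56 silent frames appended to mPendingData.
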
@@ -112,10 +170,16 @@ void NativeInputTrack::DeviceChanged(MediaTrackGraphImpl* aGraph) {
  MOZ_ASSERT(aGraph->OnGraphThreadOrNotRunning());
  MOZ_ASSERT(aGraph == mGraph,
             "Receive device changed signal from another graph");
  mInputData.Clear();
  TRACK_GRAPH_LOG("DeviceChanged");
  for (auto& listener : mDataUsers) {
    listener->DeviceChanged(aGraph);
  }
}

#undef LOG_INTERNAL
#undef LOG
#undef TRACK_GRAPH_LOG_INTERNAL
#undef TRACK_GRAPH_LOG
#undef TRACK_GRAPH_LOGV

}  // namespace mozilla
@@ -17,7 +17,8 @@ class NativeInputTrack : public ProcessedMediaTrack {
  ~NativeInputTrack() = default;
  explicit NativeInputTrack(TrackRate aSampleRate)
      : ProcessedMediaTrack(aSampleRate, MediaSegment::AUDIO,
                            new AudioSegment()) {}
                            new AudioSegment()),
        mIsBufferingAppended(false) {}

 public:
  // Main Thread API
@@ -49,9 +50,13 @@ class NativeInputTrack : public ProcessedMediaTrack {
  nsTArray<RefPtr<AudioDataListener>> mDataUsers;

 private:
  // Indicates whether we have appended extra frames in mPendingData. The
  // extra number of frames is in the [0, WEBAUDIO_BLOCK_SIZE] range.
  bool mIsBufferingAppended;

  // Queues the audio input data coming from NotifyInputData. Used on the
  // graph thread only.
  AudioInputSamples mInputData;
  AudioSegment mPendingData;

  // Only accessed on the graph thread.
  uint32_t mInputChannels = 0;
@@ -37,286 +37,163 @@ class MockGraph : public MediaTrackGraphImpl {
  ~MockGraph() = default;
};

TEST(TestAudioInputProcessing, UnaccountedPacketizerBuffering)
// AudioInputProcessing will put extra frames as pre-buffering data to avoid
// glitches in non-pass-through mode. The main goal of the test is to check how
// many frames are left in the AudioInputProcessing's mSegment in various
// situations after input data has been processed.
TEST(TestAudioInputProcessing, Buffering)
{
  const TrackRate rate = 48000;
  const uint32_t channels = 2;
  auto graph = MakeRefPtr<NiceMock<MockGraph>>(48000, 2);
  auto aip = MakeRefPtr<AudioInputProcessing>(channels, PRINCIPAL_HANDLE_NONE);
  AudioGenerator<AudioDataValue> generator(channels, rate);

  // The packetizer takes 480 frames. To trigger this we need to populate the
  // packetizer without filling it completely the first iteration, then trigger
  // the unbounded-buffering-assertion on the second iteration.

  const size_t nrFrames = 440;
  const size_t bufferSize = nrFrames * channels;
  GraphTime processedTime;
  GraphTime nextTime;
  nsTArray<AudioDataValue> buffer(bufferSize);
  buffer.AppendElements(bufferSize);
  AudioSegment segment;
  bool ended;

  aip->Start();

  {
    // First iteration.
    // 440 does not fill the packetizer but accounts for pre-silence buffering.
    // Iterations have processed 72 frames more than provided by callbacks:
    // 512 - 440 = 72
    // Thus the total amount of pre-silence buffering added is:
    // 480 + 128 - 72 = 536
    // The iteration pulls in 512 frames of silence, leaving 24 frames buffered.
    processedTime = 0;
    nextTime = MediaTrackGraphImpl::RoundUpToEndOfAudioBlock(nrFrames);
    generator.GenerateInterleaved(buffer.Elements(), nrFrames);
    aip->NotifyInputData(graph, buffer.Elements(), nrFrames, rate, channels,
                         nextTime - nrFrames);
    aip->ProcessInput(graph, nullptr);
    aip->Pull(graph, processedTime, nextTime, segment.GetDuration(), &segment,
              true, &ended);
    EXPECT_EQ(aip->NumBufferedFrames(graph), 24U);
  }

  {
    // Second iteration.
    // 880 fills a packet of 480 frames. 400 are left in the packetizer.
    // The last iteration left 24 frames buffered, making this iteration have
    // 504 frames in the buffer while pulling 384 frames.
    // That leaves 120 frames buffered, which must be no more than the total
    // intended buffering of 480 + 128 = 608 frames.
    processedTime = nextTime;
    nextTime = MediaTrackGraphImpl::RoundUpToEndOfAudioBlock(2 * nrFrames);
    generator.GenerateInterleaved(buffer.Elements(), nrFrames);
    aip->NotifyInputData(graph, buffer.Elements(), nrFrames, rate, channels,
                         nextTime - (2 * nrFrames));
    aip->ProcessInput(graph, nullptr);
    aip->Pull(graph, processedTime, nextTime, segment.GetDuration(), &segment,
              true, &ended);
    EXPECT_EQ(aip->NumBufferedFrames(graph), 120U);
  }

  graph->Destroy();
}
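
// Worked numbers for TEST(Buffering) above, restating the in-test comments
// (rate 48000 -> packet size 480, WEBAUDIO_BLOCK_SIZE = 128):
//   pre-silence      = 480 + 128 - (512 - 440)   = 536 frames in mSegment
//   first iteration  : 536 buffered - 512 pulled = 24 frames left
//   second iteration : packetizer 440 + 440 emits one 480-frame packet;
//                      24 + 480 = 504 buffered - (896 - 512) pulled = 120 left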

TEST(TestAudioInputProcessing, InputDataCapture)
{
  // This test simulates an audio-cut issue that happens when using Redmi
  // AirDots. Similar issues could happen when using other Bluetooth devices
  // like Bose QC 35 II or Sony WH-XB900N.

  const TrackRate rate = 8000;  // So the packetizer takes 80 frames
  const TrackRate rate = 8000;  // So packet size is 80
  const uint32_t channels = 1;
  auto graph = MakeRefPtr<NiceMock<MockGraph>>(rate, channels);
  auto aip = MakeRefPtr<AudioInputProcessing>(channels, PRINCIPAL_HANDLE_NONE);
  AudioGenerator<AudioDataValue> generator(channels, rate);

  const size_t frames = 72;
  const size_t bufferSize = frames * channels;
  nsTArray<AudioDataValue> buffer(bufferSize);
  buffer.AppendElements(bufferSize);

  AudioGenerator<AudioDataValue> generator(channels, rate);
  GraphTime processedTime;
  GraphTime nextTime;
  AudioSegment segment;
  bool ended;
  AudioSegment output;

  aip->Start();
  // Toggle pass-through mode without starting
  {
    EXPECT_EQ(aip->PassThrough(graph), false);
    EXPECT_EQ(aip->NumBufferedFrames(graph), 0);

    aip->SetPassThrough(graph, true);
    EXPECT_EQ(aip->NumBufferedFrames(graph), 0);

    aip->SetPassThrough(graph, false);
    EXPECT_EQ(aip->NumBufferedFrames(graph), 0);

    aip->SetPassThrough(graph, true);
    EXPECT_EQ(aip->NumBufferedFrames(graph), 0);
  }

  {
    // First iteration.
    // aip will fill (WEBAUDIO_BLOCK_SIZE + packetizer-size) = 128 + 80 = 208
    // silence frames at the beginning of its data storage. The iteration will
    // take (nextTime - segment-duration) = (128 - 0) = 128 frames to segment,
    // leaving 208 - 128 = 80 silence frames.
    const TrackTime bufferedFrames = 80U;
    // Need (nextTime - processedTime) = 128 - 0 = 128 frames this round.
    // aip has not been started and set to processing mode yet, so output will
    // be filled with silence data directly.
    processedTime = 0;
    nextTime = MediaTrackGraphImpl::RoundUpToEndOfAudioBlock(frames);

    generator.GenerateInterleaved(buffer.Elements(), frames);
    aip->NotifyInputData(graph, buffer.Elements(), frames, rate, channels, 0);
    buffer.ClearAndRetainStorage();
    aip->ProcessInput(graph, nullptr);
    aip->Pull(graph, processedTime, nextTime, segment.GetDuration(), &segment,
              true, &ended);
    EXPECT_EQ(aip->NumBufferedFrames(graph), bufferedFrames);
    AudioSegment input;
    generator.Generate(input, nextTime - processedTime);

    aip->Process(graph, processedTime, nextTime, &input, &output);
    EXPECT_EQ(input.GetDuration(), nextTime - processedTime);
    EXPECT_EQ(output.GetDuration(), nextTime);
    EXPECT_EQ(aip->NumBufferedFrames(graph), 0);
  }

  // Set aip to processing/non-pass-through mode
  aip->SetPassThrough(graph, false);
  {
    // Second iteration.
    // We will packetize 80 frames to aip's data storage. The last round left
    // 80 frames, so we have 80 + 80 = 160 frames. The iteration will take
    // (nextTime - segment-duration) = (256 - 128) = 128 frames to segment,
    // leaving 160 - 128 = 32 frames.
    const TrackTime bufferedFrames = 32U;
    // Need (nextTime - processedTime) = 256 - 128 = 128 frames this round.
    // aip has not been started yet, so output will be filled with silence data
    // directly.
    processedTime = nextTime;
    nextTime = MediaTrackGraphImpl::RoundUpToEndOfAudioBlock(2 * frames);

    generator.GenerateInterleaved(buffer.Elements(), frames);
    aip->NotifyInputData(graph, buffer.Elements(), frames, rate, channels,
                         0 /* ignored */);
    buffer.ClearAndRetainStorage();
    aip->ProcessInput(graph, nullptr);
    aip->Pull(graph, processedTime, nextTime, segment.GetDuration(), &segment,
              true, &ended);
    EXPECT_EQ(aip->NumBufferedFrames(graph), bufferedFrames);
    AudioSegment input;
    generator.Generate(input, nextTime - processedTime);

    aip->Process(graph, processedTime, nextTime, &input, &output);
    EXPECT_EQ(input.GetDuration(), nextTime - processedTime);
    EXPECT_EQ(output.GetDuration(), nextTime);
    EXPECT_EQ(aip->NumBufferedFrames(graph), 0);
  }

  // aip has now been started and set to processing mode, so it will insert 80
  // frames into aip's internal buffer as pre-buffering.
  aip->Start(graph);
  {
    // Third iteration.
    // Sometimes AudioCallbackDriver's buffer, whose type is
    // AudioCallbackBufferWrapper, could be unavailable, and therefore
    // ProcessInput won't be called. In this case, we should queue the audio
    // data and process it when ProcessInput can be called again.
    // Need (nextTime - processedTime) = 256 - 256 = 0 frames this round.
    // In Process(), aip will take 0 frames from input, packetize and process
    // these frames into zero 80-frame packets (0 frames left in the
    // packetizer), insert the packets into aip's internal buffer, then move 0
    // frames from the internal buffer to output, leaving 80 + 0 - 0 = 80
    // frames in aip's internal buffer.
    processedTime = nextTime;
    nextTime = MediaTrackGraphImpl::RoundUpToEndOfAudioBlock(3 * frames);
    // Note that processedTime is *equal* to nextTime (processedTime ==
    // nextTime) now, but that is ok since we don't call ProcessInput here.

    generator.GenerateInterleaved(buffer.Elements(), frames);
    aip->NotifyInputData(graph, buffer.Elements(), frames, rate, channels,
                         0 /* ignored */);
    Unused << processedTime;
    buffer.ClearAndRetainStorage();
    AudioSegment input;
    generator.Generate(input, nextTime - processedTime);

    aip->Process(graph, processedTime, nextTime, &input, &output);
    EXPECT_EQ(input.GetDuration(), nextTime - processedTime);
    EXPECT_EQ(output.GetDuration(), nextTime);
    EXPECT_EQ(aip->NumBufferedFrames(graph), 80);
  }

  {
    // Fourth iteration.
    // We will packetize 80 (previous round) + 80 (this round) = 160 frames to
    // aip's data storage. 32 frames are left after the second iteration, so we
    // have 160 + 32 = 192 frames. The iteration will take (nextTime -
    // segment-duration) = (384 - 256) = 128 frames to segment, leaving 192 -
    // 128 = 64 frames.
    const TrackTime bufferedFrames = 64U;
    // Need (nextTime - processedTime) = 384 - 256 = 128 frames this round.
    // In Process(), aip will take 128 frames from input, packetize and process
    // these frames into floor(128 / 80) = 1 80-frame packet (48 frames left in
    // the packetizer), insert the packet into aip's internal buffer, then move
    // 128 frames from the internal buffer to output, leaving 80 + 80 - 128 =
    // 32 frames in aip's internal buffer.
    processedTime = nextTime;
    nextTime = MediaTrackGraphImpl::RoundUpToEndOfAudioBlock(4 * frames);
    generator.GenerateInterleaved(buffer.Elements(), frames);
    aip->NotifyInputData(graph, buffer.Elements(), frames, rate, channels,
                         0 /* ignored */);
    buffer.ClearAndRetainStorage();
    aip->ProcessInput(graph, nullptr);
    aip->Pull(graph, processedTime, nextTime, segment.GetDuration(), &segment,
              true, &ended);
    EXPECT_EQ(aip->NumBufferedFrames(graph), bufferedFrames);

    AudioSegment input;
    generator.Generate(input, nextTime - processedTime);

    aip->Process(graph, processedTime, nextTime, &input, &output);
    EXPECT_EQ(input.GetDuration(), nextTime - processedTime);
    EXPECT_EQ(output.GetDuration(), nextTime);
    EXPECT_EQ(aip->NumBufferedFrames(graph), 32);
  }

  graph->Destroy();
}
  {
    // Need (nextTime - processedTime) = 384 - 384 = 0 frames this round.
    processedTime = nextTime;
    nextTime = MediaTrackGraphImpl::RoundUpToEndOfAudioBlock(5 * frames);

TEST(TestAudioInputProcessing, InputDataCapturePassThrough)
{
  // This test simulates an audio-cut issue that happens when using Redmi
  // AirDots. Similar issues could happen when using other Bluetooth devices
  // like Bose QC 35 II or Sony WH-XB900N.
    AudioSegment input;
    generator.Generate(input, nextTime - processedTime);

  const TrackRate rate = 8000;  // So the packetizer takes 80 frames
  const uint32_t channels = 1;
  auto graph = MakeRefPtr<NiceMock<MockGraph>>(rate, channels);
  auto aip = MakeRefPtr<AudioInputProcessing>(channels, PRINCIPAL_HANDLE_NONE);
  AudioGenerator<AudioDataValue> generator(channels, rate);
    aip->Process(graph, processedTime, nextTime, &input, &output);
    EXPECT_EQ(input.GetDuration(), nextTime - processedTime);
    EXPECT_EQ(output.GetDuration(), nextTime);
    EXPECT_EQ(aip->NumBufferedFrames(graph), 32);
  }

  const size_t frames = 72;
  const size_t bufferSize = frames * channels;
  nsTArray<AudioDataValue> buffer(bufferSize);
  buffer.AppendElements(bufferSize);
  {
    // Need (nextTime - processedTime) = 512 - 384 = 128 frames this round.
    // In Process(), aip will take 128 frames from input, packetize and process
    // these frames into floor((128 + 48) / 80) = 2 80-frame packets (16 frames
    // left in the packetizer), insert the packets into aip's internal buffer,
    // then move 128 frames from the internal buffer to output, leaving
    // 32 + 2 * 80 - 128 = 64 frames in aip's internal buffer.
    processedTime = nextTime;
    nextTime = MediaTrackGraphImpl::RoundUpToEndOfAudioBlock(6 * frames);

  GraphTime processedTime;
  GraphTime nextTime;
  AudioSegment segment;
  AudioSegment source;
  bool ended;
    AudioSegment input;
    generator.Generate(input, nextTime - processedTime);

    aip->Process(graph, processedTime, nextTime, &input, &output);
    EXPECT_EQ(input.GetDuration(), nextTime - processedTime);
    EXPECT_EQ(output.GetDuration(), nextTime);
    EXPECT_EQ(aip->NumBufferedFrames(graph), 64);
  }

  aip->SetPassThrough(graph, true);
  aip->Start();

  {
    // First iteration.
    // aip will fill (WEBAUDIO_BLOCK_SIZE + frames) = 128 + 72 = 200 frames at
    // the beginning of its data storage. The iteration will take (nextTime -
    // segment-duration) = (128 - 0) = 128 frames to segment, leaving 200 - 128
    // = 72 silence frames.
    const TrackTime bufferedFrames = 72U;
    processedTime = 0;
    nextTime = MediaTrackGraphImpl::RoundUpToEndOfAudioBlock(frames);

    generator.GenerateInterleaved(buffer.Elements(), frames);
    source.AppendFromInterleavedBuffer(buffer.Elements(), frames, channels,
                                       PRINCIPAL_HANDLE_NONE);
    aip->NotifyInputData(graph, buffer.Elements(), frames, rate, channels, 0);
    buffer.ClearAndRetainStorage();
    aip->ProcessInput(graph, &source);
    aip->Pull(graph, processedTime, nextTime, segment.GetDuration(), &segment,
              true, &ended);
    EXPECT_EQ(aip->NumBufferedFrames(graph), bufferedFrames);
    source.Clear();
  }

  {
    // Second iteration.
    // We will feed 72 frames to aip's data storage. The last round left 72
    // frames, so we have 72 + 72 = 144 frames. The iteration will take
    // (nextTime - segment-duration) = (256 - 128) = 128 frames to segment,
    // leaving 144 - 128 = 16 frames.
    const TrackTime bufferedFrames = 16U;
    // Need (nextTime - processedTime) = 512 - 512 = 0 frames this round.
    // No buffering in pass-through mode
    processedTime = nextTime;
    nextTime = MediaTrackGraphImpl::RoundUpToEndOfAudioBlock(2 * frames);
    nextTime = MediaTrackGraphImpl::RoundUpToEndOfAudioBlock(7 * frames);

    generator.GenerateInterleaved(buffer.Elements(), frames);
    source.AppendFromInterleavedBuffer(buffer.Elements(), frames, channels,
                                       PRINCIPAL_HANDLE_NONE);
    aip->NotifyInputData(graph, buffer.Elements(), frames, rate, channels,
                         0 /* ignored */);
    buffer.ClearAndRetainStorage();
    aip->ProcessInput(graph, &source);
    aip->Pull(graph, processedTime, nextTime, segment.GetDuration(), &segment,
              true, &ended);
    EXPECT_EQ(aip->NumBufferedFrames(graph), bufferedFrames);
    source.Clear();
  }

  {
    // Third iteration.
    // Sometimes AudioCallbackDriver's buffer, whose type is
    // AudioCallbackBufferWrapper, could be unavailable, and therefore
    // ProcessInput won't be called. In this case, we should queue the audio
    // data and process it when ProcessInput can be called again.
    processedTime = nextTime;
    nextTime = MediaTrackGraphImpl::RoundUpToEndOfAudioBlock(3 * frames);
    // Note that processedTime is *equal* to nextTime (processedTime ==
    // nextTime) now, but that is ok since we don't call ProcessInput here.

    generator.GenerateInterleaved(buffer.Elements(), frames);
    source.AppendFromInterleavedBuffer(buffer.Elements(), frames, channels,
                                       PRINCIPAL_HANDLE_NONE);
    aip->NotifyInputData(graph, buffer.Elements(), frames, rate, channels,
                         0 /* ignored */);
    Unused << processedTime;
    buffer.ClearAndRetainStorage();
  }

  {
    // Fourth iteration.
    // We will feed 72 (previous round) + 72 (this round) = 144 frames to aip's
    // data storage. 16 frames are left after the second iteration, so we have
    // 144 + 16 = 160 frames. The iteration will take (nextTime -
    // segment-duration) = (384 - 256) = 128 frames to segment, leaving 160 -
    // 128 = 32 frames.
    const TrackTime bufferedFrames = 32U;
    processedTime = nextTime;
    nextTime = MediaTrackGraphImpl::RoundUpToEndOfAudioBlock(4 * frames);
    generator.GenerateInterleaved(buffer.Elements(), frames);
    source.AppendFromInterleavedBuffer(buffer.Elements(), frames, channels,
                                       PRINCIPAL_HANDLE_NONE);
    aip->NotifyInputData(graph, buffer.Elements(), frames, rate, channels,
                         0 /* ignored */);
    buffer.ClearAndRetainStorage();
    aip->ProcessInput(graph, &source);
    aip->Pull(graph, processedTime, nextTime, segment.GetDuration(), &segment,
              true, &ended);
    EXPECT_EQ(aip->NumBufferedFrames(graph), bufferedFrames);
    source.Clear();
    AudioSegment input;
    generator.Generate(input, nextTime - processedTime);

    aip->Process(graph, processedTime, nextTime, &input, &output);
    EXPECT_EQ(input.GetDuration(), nextTime - processedTime);
    EXPECT_EQ(output.GetDuration(), processedTime);
    EXPECT_EQ(aip->NumBufferedFrames(graph), 0);
  }

  aip->Stop(graph);
  graph->Destroy();
}
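
// Worked numbers for the pass-through iterations above (no packetizer, so
// input frames go straight into aip's internal segment), restating the
// in-test comments:
//   iteration 1: 128 pre-silence + 72 input - 128 pulled = 72 left
//   iteration 2: 72 + 72 - 128                           = 16 left
//   iteration 3: ProcessInput not called; 72 frames stay queued
//   iteration 4: 16 + 72 + 72 - 128                      = 32 left
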
@@ -48,15 +48,16 @@ struct StartInputProcessing : public ControlMessage {
      : ControlMessage(aTrack),
        mInputTrack(aTrack),
        mInputProcessing(aInputProcessing) {}
  void Run() override { mInputProcessing->Start(); }
  void Run() override { mInputProcessing->Start(mTrack->GraphImpl()); }
};

struct StopInputProcessing : public ControlMessage {
  const RefPtr<AudioInputProcessing> mInputProcessing;

  explicit StopInputProcessing(AudioInputProcessing* aInputProcessing)
      : ControlMessage(nullptr), mInputProcessing(aInputProcessing) {}
  void Run() override { mInputProcessing->Stop(); }
  explicit StopInputProcessing(AudioInputTrack* aTrack,
                               AudioInputProcessing* aInputProcessing)
      : ControlMessage(aTrack), mInputProcessing(aInputProcessing) {}
  void Run() override { mInputProcessing->Stop(mTrack->GraphImpl()); }
};

struct SetPassThrough : public ControlMessage {
@@ -279,7 +280,7 @@ TEST(TestAudioTrackGraph, ErrorCallback)
  // Clean up.
  DispatchFunction([&] {
    inputTrack->GraphImpl()->AppendMessage(
        MakeUnique<StopInputProcessing>(listener));
        MakeUnique<StopInputProcessing>(inputTrack, listener));
    inputTrack->CloseAudioInput();
    inputTrack->Destroy();
  });
@@ -348,7 +349,7 @@ TEST(TestAudioTrackGraph, AudioInputTrack)
    outputTrack->Destroy();
    port->Destroy();
    inputTrack->GraphImpl()->AppendMessage(
        MakeUnique<StopInputProcessing>(listener));
        MakeUnique<StopInputProcessing>(inputTrack, listener));
    inputTrack->CloseAudioInput();
    inputTrack->Destroy();
  });
@@ -363,7 +364,7 @@ TEST(TestAudioTrackGraph, AudioInputTrack)

  EXPECT_EQ(estimatedFreq, inputFrequency);
  std::cerr << "PreSilence: " << preSilenceSamples << std::endl;
  // We buffer 128 frames in passthrough mode. See AudioInputProcessing::Pull.
  // We buffer 128 frames. See DeviceInputTrack::ProcessInput.
  EXPECT_GE(preSilenceSamples, 128U);
  // If the fallback system clock driver is doing a graph iteration before the
  // first audio driver iteration comes in, that iteration is ignored and
@@ -485,7 +486,7 @@ TEST(TestAudioTrackGraph, ReOpenAudioInput)
    outputTrack->Destroy();
    port->Destroy();
    inputTrack->GraphImpl()->AppendMessage(
        MakeUnique<StopInputProcessing>(listener));
        MakeUnique<StopInputProcessing>(inputTrack, listener));
    inputTrack->CloseAudioInput();
    inputTrack->Destroy();
  });
@@ -501,7 +502,8 @@ TEST(TestAudioTrackGraph, ReOpenAudioInput)
  EXPECT_EQ(estimatedFreq, inputFrequency);
  std::cerr << "PreSilence: " << preSilenceSamples << std::endl;
  // We buffer 10ms worth of frames in non-passthrough mode, plus up to 128
  // frames as we round up to the nearest block. See AudioInputProcessing::Pull.
  // frames as we round up to the nearest block. See
  // AudioInputProcessing::Process and DeviceInputTrack::ProcessInput.
  EXPECT_GE(preSilenceSamples, 128U + inputRate / 100);
  // If the fallback system clock driver is doing a graph iteration before the
  // first audio driver iteration comes in, that iteration is ignored and
@@ -605,7 +607,7 @@ TEST(TestAudioTrackGraph, AudioInputTrackDisabling)
    outputTrack->Destroy();
    port->Destroy();
    inputTrack->GraphImpl()->AppendMessage(
        MakeUnique<StopInputProcessing>(listener));
        MakeUnique<StopInputProcessing>(inputTrack, listener));
    inputTrack->CloseAudioInput();
    inputTrack->Destroy();
  });
@@ -686,7 +688,7 @@ struct AudioTrackSet {
    mOutputTrack->Destroy();
    mPort->Destroy();
    mInputTrack->GraphImpl()->AppendMessage(
        MakeUnique<StopInputProcessing>(mListener));
        MakeUnique<StopInputProcessing>(mInputTrack, mListener));
    mInputTrack->CloseAudioInput();
    mInputTrack->Destroy();
@@ -1028,7 +1030,7 @@ void TestCrossGraphPort(uint32_t aInputRate, uint32_t aOutputRate,
    transmitter->Destroy();
    port->Destroy();
    inputTrack->GraphImpl()->AppendMessage(
        MakeUnique<StopInputProcessing>(listener));
        MakeUnique<StopInputProcessing>(inputTrack, listener));
    inputTrack->CloseAudioInput();
    inputTrack->Destroy();
  });
@@ -388,18 +388,19 @@ class StartStopMessage : public ControlMessage {
 public:
  enum StartStop { Start, Stop };

  StartStopMessage(AudioInputProcessing* aInputProcessing, StartStop aAction)
      : ControlMessage(nullptr),
  StartStopMessage(MediaTrack* aTrack, AudioInputProcessing* aInputProcessing,
                   StartStop aAction)
      : ControlMessage(aTrack),
        mInputProcessing(aInputProcessing),
        mAction(aAction) {}

  void Run() override {
    if (mAction == StartStopMessage::Start) {
      TRACE("InputProcessing::Start")
      mInputProcessing->Start();
      mInputProcessing->Start(mTrack->GraphImpl());
    } else if (mAction == StartStopMessage::Stop) {
      TRACE("InputProcessing::Stop")
      mInputProcessing->Stop();
      mInputProcessing->Stop(mTrack->GraphImpl());
    } else {
      MOZ_CRASH("Invalid enum value");
    }
@@ -439,7 +440,7 @@ nsresult MediaEngineWebRTCMicrophoneSource::Start() {
        }

        track->GraphImpl()->AppendMessage(MakeUnique<StartStopMessage>(
            inputProcessing, StartStopMessage::Start));
            track, inputProcessing, StartStopMessage::Start));
        track->OpenAudioInput(deviceID, inputProcessing);
      }));
@@ -470,7 +471,7 @@ nsresult MediaEngineWebRTCMicrophoneSource::Stop() {
        }

        track->GraphImpl()->AppendMessage(MakeUnique<StartStopMessage>(
            inputProcessing, StartStopMessage::Stop));
            track, inputProcessing, StartStopMessage::Stop));
        MOZ_ASSERT(track->DeviceId().value() == deviceInfo->DeviceID());
        track->CloseAudioInput();
      }));
@@ -493,7 +494,6 @@ AudioInputProcessing::AudioInputProcessing(
      mRequestedInputChannelCount(aMaxChannelCount),
      mSkipProcessing(false),
      mInputDownmixBuffer(MAX_SAMPLING_FREQ * MAX_CHANNELS / 100),
      mLiveBufferingAppended(Nothing()),
      mPrincipal(aPrincipalHandle),
      mEnabled(false),
      mEnded(false),
@@ -513,22 +513,24 @@ void AudioInputProcessing::SetPassThrough(MediaTrackGraphImpl* aGraph,
                                          bool aPassThrough) {
  MOZ_ASSERT(aGraph->OnGraphThread());

  if (!mSkipProcessing && aPassThrough) {
    // Reset AudioProcessing so that if we resume processing in the future it
    // doesn't depend on old state.
    mAudioProcessing->Initialize();

    if (mPacketizerInput) {
      MOZ_ASSERT(mPacketizerInput->PacketsAvailable() == 0);
      LOG_FRAME(
          "AudioInputProcessing %p Appending %u frames of null data for data "
          "discarded in the packetizer",
          this, mPacketizerInput->FramesAvailable());
      mSegment.AppendNullData(mPacketizerInput->FramesAvailable());
      mPacketizerInput->Clear();
    }
  if (aPassThrough == mSkipProcessing) {
    return;
  }
  mSkipProcessing = aPassThrough;

  if (!mEnabled) {
    MOZ_ASSERT(!mPacketizerInput);
    return;
  }

  if (aPassThrough) {
    // Turn on pass-through
    ResetAudioProcessing(aGraph);
  } else {
    // Turn off pass-through
    MOZ_ASSERT(!mPacketizerInput);
    EnsureAudioProcessing(aGraph, mRequestedInputChannelCount);
  }
}

uint32_t AudioInputProcessing::GetRequestedInputChannelCount() {
@@ -542,104 +544,222 @@ void AudioInputProcessing::SetRequestedInputChannelCount(
  aGraph->ReevaluateInputDevice();
}

void AudioInputProcessing::Start() {
  mEnabled = true;
  mLiveBufferingAppended = Nothing();
}

void AudioInputProcessing::Stop() { mEnabled = false; }

void AudioInputProcessing::Pull(MediaTrackGraphImpl* aGraph, GraphTime aFrom,
                                GraphTime aTo, GraphTime aTrackEnd,
                                AudioSegment* aSegment,
                                bool aLastPullThisIteration, bool* aEnded) {
void AudioInputProcessing::Start(MediaTrackGraphImpl* aGraph) {
  MOZ_ASSERT(aGraph->OnGraphThread());

  if (mEnded) {
    *aEnded = true;
  if (mEnabled) {
    return;
  }
  mEnabled = true;

  if (mSkipProcessing) {
    return;
  }

  TrackTime delta = aTo - aTrackEnd;
  MOZ_ASSERT(delta >= 0, "We shouldn't append more than requested");
  TrackTime buffering = 0;
  MOZ_ASSERT(!mPacketizerInput);
  EnsureAudioProcessing(aGraph, mRequestedInputChannelCount);
}

  // Add the amount of buffering required to not underrun and glitch.
void AudioInputProcessing::Stop(MediaTrackGraphImpl* aGraph) {
  MOZ_ASSERT(aGraph->OnGraphThread());

  // Make sure there's at least one extra block buffered until audio callbacks
  // come in, since we round graph iteration durations up to the nearest block.
  buffering += WEBAUDIO_BLOCK_SIZE;

  if (!PassThrough(aGraph) && mPacketizerInput) {
    // Processing is active and is processed in chunks of 10ms through the
    // input packetizer. We allow for 10ms of silence on the track to
    // accommodate the buffering worst-case.
    buffering += mPacketizerInput->mPacketSize;
  }

  if (delta <= 0) {
  if (!mEnabled) {
    return;
  }

  if (MOZ_LIKELY(mLiveBufferingAppended)) {
    if (MOZ_UNLIKELY(buffering > *mLiveBufferingAppended)) {
      // We need to buffer more data. This could happen the first time we pull
      // input data, or the first iteration after starting to use the
      // packetizer.
      TrackTime silence = buffering - *mLiveBufferingAppended;
      LOG_FRAME("AudioInputProcessing %p Inserting %" PRId64
                " frames of silence due to buffer increase",
                this, silence);
      mSegment.InsertNullDataAtStart(silence);
      mLiveBufferingAppended = Some(buffering);
    } else if (MOZ_UNLIKELY(buffering < *mLiveBufferingAppended)) {
      // We need to clear some buffered data to reduce latency now that the
      // packetizer is no longer used.
      MOZ_ASSERT(PassThrough(aGraph), "Must have turned on passthrough");
      TrackTime removal = *mLiveBufferingAppended - buffering;
      MOZ_ASSERT(mSegment.GetDuration() >= removal);
      TrackTime frames = std::min(mSegment.GetDuration(), removal);
      LOG_FRAME("AudioInputProcessing %p Removing %" PRId64
                " frames of silence due to buffer decrease",
                this, frames);
      *mLiveBufferingAppended -= frames;
      mSegment.RemoveLeading(frames);
    }
  }
  mEnabled = false;

  if (mSegment.GetDuration() > 0) {
    MOZ_ASSERT(buffering == *mLiveBufferingAppended);
    TrackTime frames = std::min(mSegment.GetDuration(), delta);
    LOG_FRAME("AudioInputProcessing %p Appending %" PRId64
              " frames of real data for %u channels.",
              this, frames, mRequestedInputChannelCount);
    aSegment->AppendSlice(mSegment, 0, frames);
    mSegment.RemoveLeading(frames);
    delta -= frames;

    // Assert that the amount of data buffered doesn't grow unboundedly.
    MOZ_ASSERT_IF(aLastPullThisIteration, mSegment.GetDuration() <= buffering);
  }

  if (delta <= 0) {
    if (mSegment.GetDuration() == 0) {
      mLiveBufferingAppended = Some(-delta);
    }
  if (mSkipProcessing) {
    return;
  }

  LOG_FRAME("AudioInputProcessing %p Pulling %" PRId64
            " frames of silence for %u channels.",
            this, delta, mRequestedInputChannelCount);
  // Packetizer is active and we were just stopped. Stop the packetizer and
  // processing.
  ResetAudioProcessing(aGraph);
}

  // This assertion fails if we append silence here after having appended live
  // frames. Before appending live frames we should add sufficient buffering to
  // not have to glitch (aka append silence). Failing this meant the buffering
  // was not sufficient.
  MOZ_ASSERT_IF(mEnabled, !mLiveBufferingAppended);
  mLiveBufferingAppended = Nothing();
// The following is how Process() works in pass-through and non-pass-through
// mode. In both modes, Process() outputs the same number of frames as its
// input data.
//
// I. In non-pass-through mode:
//
// We use webrtc::AudioProcessing to process the input audio data in this
// mode. The data fed to webrtc::AudioProcessing needs to be 10ms chunks,
// while the input data passed to Process() does not necessarily have a
// length that is a multiple of a 10ms chunk. To divide the input data into
// 10ms chunks, mPacketizerInput is introduced.
//
// We add one 10ms chunk of silence into the internal buffer before Process()
// runs. Those extra frames are called pre-buffering. It aims to avoid the
// glitches we may have when producing data in mPacketizerInput. Without
// pre-buffering, when the input data length is not a multiple of 10ms, we
// could end up not having enough data for the output needs, since
// mPacketizerInput would keep some input data, namely the remainder of the
// 10ms-chunk length. To force processing the data left in mPacketizerInput,
// we would need to add some extra frames to make mPacketizerInput produce a
// 10ms chunk. For example, if the sample rate is 44100 Hz, then the packet
// size is 441 frames. When we only have 384 input frames, we would need to
// put an additional 57 frames into mPacketizerInput to produce a packet.
// However, those extra 57 frames result in a glitch sound.
//
// By adding one 10ms chunk of silence in advance to the internal buffer, we
// won't need to add extra frames between the input data no matter what
// length it is. The only drawback is that the input data won't be processed
// and sent to the output immediately. Process() will consume the
// pre-buffering data for its output first. The below describes how it works:
//
//
//                          Process()
//               +-----------------------------+
//   input D(N)  |  +--------+    +--------+   |  output D(N)
// --------------|->|  P(N)  |--->|  S(N)  |---|-------------->
//               |  +--------+    +--------+   |
//               |  packetizer    mSegment     |
//               +-----------------------------+
//               <------ internal buffer ------>
//
//
// D(N): number of frames from the input and the output needs in round N
// Z:    number of frames of a 10ms chunk (packet) in mPacketizerInput, Z >= 1
//       (if Z = 1, the packetizer has no effect)
// P(N): number of frames left in mPacketizerInput after round N. Once the
//       frames in the packetizer >= Z, the packetizer will produce a packet
//       to mSegment, so P(N) = (P(N-1) + D(N)) % Z, 0 <= P(N) <= Z-1
// S(N): number of frames left in mSegment after round N. The input D(N)
//       frames will be passed to mPacketizerInput first, and then
//       mPacketizerInput may append some packets to mSegment, so
//       S(N) = S(N-1) + Z * floor((P(N-1) + D(N)) / Z) - D(N)
//
// Initially, we set P(0) = 0, S(0) = X, where X >= Z-1. X is the
// pre-buffering put in the internal buffer. With these settings,
// P(K) + S(K) = X always holds.
//
// Intuitively, this seems true: we put X frames in the internal buffer at
// first. If the data won't be blocked in the packetizer, then after
// Process() the internal buffer should still hold X frames, since the number
// of frames coming from the input is the same as what the output needs. The
// key to having enough data for the output needs, while the input data piles
// up in the packetizer, is to put in at least Z-1 frames as pre-buffering,
// since the maximum number of frames stuck in the packetizer before it can
// emit a packet is packet-size - 1. Otherwise, we don't have enough data for
// the output if the new input data plus the data left in the packetizer
// produce a smaller-than-10ms chunk, which will be left in the packetizer.
// Thus we must have some pre-buffering frames in mSegment to make up the
// length of the leftover chunk we need for output. This can also be shown by
// induction:
// (1) This holds when K = 0
// (2) Assume this holds when K = N: so P(N) + S(N) = X
//     => P(N) + S(N) = X >= Z-1 => S(N) >= Z-1-P(N)
// (3) When K = N+1, D(N+1) input frames come in
//     a. if P(N) + D(N+1) < Z, then the packetizer doesn't have enough data
//        for one packet. No data is produced by the packetizer, so mSegment
//        still has S(N) >= Z-1-P(N) frames. The output needs
//        D(N+1) < Z-P(N) frames, so it needs at most Z-P(N)-1 frames, and
//        mSegment has enough frames for the output. Then,
//        P(N+1) = P(N) + D(N+1) and S(N+1) = S(N) - D(N+1)
//        => P(N+1) + S(N+1) = P(N) + S(N) = X
//     b. if P(N) + D(N+1) = Z, then the packetizer will produce one packet
//        for mSegment, so mSegment now has S(N) + Z frames. The output needs
//        D(N+1) = Z-P(N) frames. mSegment has at least
//        Z-1-P(N) + Z >= Z-P(N) frames, since Z >= 1, so mSegment has enough
//        frames for the output. Then, P(N+1) = 0 and
//        S(N+1) = S(N) + Z - D(N+1) = S(N) + P(N)
//        => P(N+1) + S(N+1) = P(N) + S(N) = X
//     c. if P(N) + D(N+1) > Z, let P(N) + D(N+1) = q * Z + r, where q >= 1
//        and 0 <= r <= Z-1. Then the packetizer can produce q packets for
//        mSegment. The output needs D(N+1) = q * Z - P(N) + r frames, and
//        mSegment has S(N) + q * Z >= q * Z - P(N) + Z - 1
//        >= q * Z - P(N) + r, since r <= Z-1, so mSegment has enough frames
//        for the output. Then, P(N+1) = r and
//        S(N+1) = S(N) + q * Z - D(N+1)
//        => P(N+1) + S(N+1) = S(N) + (q * Z + r - D(N+1)) = S(N) + P(N) = X
// => P(K) + S(K) = X always holds
//
// Since P(K) + S(K) = X and P(K) is in the [0, Z-1] range, S(K) is in the
// [X-Z+1, X] range. In our implementation, X is set to Z, so S(K) is in
// [1, Z]. By the above workflow, we always have enough data for the output
// and no extra frames are put into the packetizer. It means we don't have
// any glitch!
//
// II. In pass-through mode:
//
//              Process()
//             +---------+
// input D(N)  |         |  output D(N)
// ------------|-------->|-------------->
//             |         |
//             +---------+
//
// The D(N) frames of data are just forwarded from input to output without
// any processing.
void AudioInputProcessing::Process(MediaTrackGraphImpl* aGraph, GraphTime aFrom,
                                   GraphTime aTo, AudioSegment* aInput,
                                   AudioSegment* aOutput) {
  MOZ_ASSERT(aGraph->OnGraphThread());
  MOZ_ASSERT(aFrom <= aTo);
  MOZ_ASSERT(!mEnded);

  aSegment->AppendNullData(delta);
  TrackTime need = aTo - aFrom;
  if (need == 0) {
    return;
  }

  if (!mEnabled) {
    LOG_FRAME("(Graph %p, Driver %p) AudioInputProcessing %p Filling %" PRId64
              " frames of silence to output (disabled)",
              aGraph, aGraph->CurrentDriver(), this, need);
    aOutput->AppendNullData(need);
    return;
  }

  MOZ_ASSERT(aInput->GetDuration() == need,
             "Wrong data length from input port source");

  if (PassThrough(aGraph)) {
    LOG_FRAME(
        "(Graph %p, Driver %p) AudioInputProcessing %p Forwarding %" PRId64
        " frames of input data to output directly (PassThrough)",
        aGraph, aGraph->CurrentDriver(), this, aInput->GetDuration());
    aOutput->AppendSegment(aInput, mPrincipal);
    return;
  }

  // SetPassThrough(false) must be called before reaching here.
  MOZ_ASSERT(mPacketizerInput);
  // If mRequestedInputChannelCount is updated, create a new packetizer. No
  // need to change the pre-buffering since the rate is always the same. The
  // frames left in the packetizer would be replaced by null data and then
  // transferred to mSegment.
  EnsureAudioProcessing(aGraph, mRequestedInputChannelCount);

  // Preconditions of the audio-processing logic.
  MOZ_ASSERT(static_cast<uint32_t>(mSegment.GetDuration()) +
                 mPacketizerInput->FramesAvailable() ==
             mPacketizerInput->mPacketSize);
  // We pre-buffer mPacketSize frames, but the maximum number of frames stuck
  // in the packetizer before it can emit a packet is mPacketSize-1. Thus that
  // remaining 1 frame will always be present in mSegment.
  MOZ_ASSERT(mSegment.GetDuration() >= 1);
  MOZ_ASSERT(mSegment.GetDuration() <= mPacketizerInput->mPacketSize);

  PacketizeAndProcess(aGraph, *aInput);
  LOG_FRAME("(Graph %p, Driver %p) AudioInputProcessing %p Buffer has %" PRId64
            " frames of data now, after packetizing and processing",
            aGraph, aGraph->CurrentDriver(), this, mSegment.GetDuration());

  // By setting pre-buffering to the number of frames of one packet, and
  // because the maximum number of frames stuck in the packetizer before
  // it can emit a packet is mPacketSize-1, we always have at least
  // one more frame than the output needs.
  MOZ_ASSERT(mSegment.GetDuration() > need);
  aOutput->AppendSlice(mSegment, 0, need);
  mSegment.RemoveLeading(need);
  LOG_FRAME("(Graph %p, Driver %p) AudioInputProcessing %p moving %" PRId64
            " frames of data to output, leaving %" PRId64 " frames in buffer",
            aGraph, aGraph->CurrentDriver(), this, need,
            mSegment.GetDuration());

  // Postconditions of the audio-processing logic.
  MOZ_ASSERT(static_cast<uint32_t>(mSegment.GetDuration()) +
                 mPacketizerInput->FramesAvailable() ==
             mPacketizerInput->mPacketSize);
  MOZ_ASSERT(mSegment.GetDuration() >= 1);
  MOZ_ASSERT(mSegment.GetDuration() <= mPacketizerInput->mPacketSize);
}
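
// A standalone sketch, not part of this patch, that numerically checks the
// invariant P(K) + S(K) = X derived in the comment above Process(): with
// pre-buffering X = Z, any sequence of input sizes keeps the packetizer
// remainder P plus the segment level S constant, so the output request can
// always be satisfied. Compile it separately with any C++11 compiler.
//
//   #include <cassert>
//   #include <cstdint>
//
//   int main() {
//     const int64_t Z = 441;  // 10ms packet at 44100 Hz
//     int64_t P = 0;          // frames stuck in the packetizer
//     int64_t S = Z;          // pre-buffered frames, X = Z
//     const int64_t inputs[] = {384, 512, 441, 1, 1000};  // arbitrary D(N)
//     for (int64_t D : inputs) {
//       int64_t packets = (P + D) / Z;  // whole packets emitted this round
//       P = (P + D) % Z;
//       S += packets * Z - D;      // packets flow in, D frames flow out
//       assert(S >= 1 && S <= Z);  // output always satisfiable
//       assert(P + S == Z);        // the invariant
//     }
//     return 0;
//   }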

void AudioInputProcessing::NotifyOutputData(MediaTrackGraphImpl* aGraph,
@@ -653,12 +773,13 @@ void AudioInputProcessing::NotifyOutputData(MediaTrackGraphImpl* aGraph,
    return;
  }

  if (!mPacketizerOutput || mPacketizerOutput->mPacketSize != aRate / 100u ||
  if (!mPacketizerOutput ||
      mPacketizerOutput->mPacketSize != GetPacketSize(aRate) ||
      mPacketizerOutput->mChannels != aChannels) {
    // It's ok to drop the audio still in the packetizer here: if this changes,
    // we changed devices or something.
    mPacketizerOutput = Nothing();
    mPacketizerOutput.emplace(aRate / 100, aChannels);
    mPacketizerOutput.emplace(GetPacketSize(aRate), aChannels);
  }

  mPacketizerOutput->Input(aBuffer, aFrames);
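
// GetPacketSize is not shown in this excerpt. Since it replaces the literal
// `aRate / 100u` above, it presumably returns the number of frames in a 10ms
// packet; a sketch of what such a helper could look like:
static uint32_t GetPacketSize(TrackRate aRate) {
  return static_cast<uint32_t>(aRate) / 100;  // 10ms worth of frames
}
// e.g. GetPacketSize(48000) == 480, GetPacketSize(8000) == 80.
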
@@ -736,27 +857,35 @@ void AudioInputProcessing::NotifyOutputData(MediaTrackGraphImpl* aGraph,

// Only called if we're not in passthrough mode
void AudioInputProcessing::PacketizeAndProcess(MediaTrackGraphImpl* aGraph,
                                               const AudioDataValue* aBuffer,
                                               size_t aFrames, TrackRate aRate,
                                               uint32_t aChannels) {
                                               const AudioSegment& aSegment) {
  MOZ_ASSERT(!PassThrough(aGraph),
             "This should be bypassed when in PassThrough mode.");
  MOZ_ASSERT(mEnabled);
  size_t offset = 0;
  MOZ_ASSERT(mPacketizerInput);
  MOZ_ASSERT(mPacketizerInput->mPacketSize ==
             GetPacketSize(aGraph->GraphRate()));

  if (!mPacketizerInput || mPacketizerInput->mPacketSize != aRate / 100u ||
      mPacketizerInput->mChannels != aChannels) {
    // It's ok to drop the audio still in the packetizer here.
    mPacketizerInput = Nothing();
    mPacketizerInput.emplace(aRate / 100, aChannels);
  }

  LOG_FRAME("AudioInputProcessing %p Appending %zu frames to packetizer", this,
            aFrames);
  // WriteToInterleavedBuffer will do upmixing or downmixing if the channel
  // count in aSegment's chunks is different from mPacketizerInput->mChannels.
  // WriteToInterleavedBuffer could be avoided once Bug 1729041 is done.
  size_t sampleCount = aSegment.WriteToInterleavedBuffer(
      mInterleavedBuffer, mPacketizerInput->mChannels);
  size_t frameCount =
      sampleCount / static_cast<size_t>(mPacketizerInput->mChannels);

  // Packetize our input data into 10ms chunks, deinterleave into planar
  // channel buffers, process, and append to the right MediaStreamTrack.
  mPacketizerInput->Input(aBuffer, static_cast<uint32_t>(aFrames));
  mPacketizerInput->Input(mInterleavedBuffer.Elements(),
                          static_cast<uint32_t>(frameCount));

  LOG_FRAME(
      "(Graph %p, Driver %p) AudioInputProcessing %p Packetizing %zu frames. "
      "Packetizer has %u frames (enough for %u packets) now",
      aGraph, aGraph->CurrentDriver(), this, frameCount,
      mPacketizerInput->FramesAvailable(),
      mPacketizerInput->PacketsAvailable());

  size_t offset = 0;

  while (mPacketizerInput->PacketsAvailable()) {
    mPacketCount++;
@@ -771,15 +900,15 @@ void AudioInputProcessing::PacketizeAndProcess(MediaTrackGraphImpl* aGraph,
    float* packet = mInputBuffer.Data();
    mPacketizerInput->Output(packet);

    // Downmix from aChannels to mono if needed. We always have floats
    // here, the packetizer performed the conversion. This handles sound cards
    // with multiple physical jacks exposed as a single device with _n_
    // discrete channels, where only a single mic is plugged in. Those channels
    // are not correlated temporally since they are discrete channels; mixing
    // is just a sum.
    // Downmix from mPacketizerInput->mChannels to mono if needed. We always
    // have floats here, the packetizer performed the conversion. This handles
    // sound cards with multiple physical jacks exposed as a single device with
    // _n_ discrete channels, where only a single mic is plugged in. Those
    // channels are not correlated temporally since they are discrete channels;
    // mixing is just a sum.
    AutoTArray<float*, 8> deinterleavedPacketizedInputDataChannelPointers;
    uint32_t channelCountInput = 0;
    if (aChannels > MAX_CHANNELS) {
    if (mPacketizerInput->mChannels > MAX_CHANNELS) {
      channelCountInput = MONO;
      deinterleavedPacketizedInputDataChannelPointers.SetLength(
          channelCountInput);
@@ -790,12 +919,12 @@ void AudioInputProcessing::PacketizeAndProcess(MediaTrackGraphImpl* aGraph,
      size_t readIndex = 0;
      for (size_t i = 0; i < mPacketizerInput->mPacketSize; i++) {
        mDeinterleavedBuffer.Data()[i] = 0.;
        for (size_t j = 0; j < aChannels; j++) {
        for (size_t j = 0; j < mPacketizerInput->mChannels; j++) {
          mDeinterleavedBuffer.Data()[i] += packet[readIndex++];
        }
      }
    } else {
      channelCountInput = aChannels;
      channelCountInput = mPacketizerInput->mChannels;
      // Deinterleave the input data
      // Prepare an array pointing to deinterleaved channels.
      deinterleavedPacketizedInputDataChannelPointers.SetLength(

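The comment above describes downmixing uncorrelated discrete channels by plain summation. A self-contained sketch of the same inner loop outside of Gecko (function and variable names are hypothetical, mono output only):

#include <cstddef>
#include <vector>

// Sum an interleaved N-channel packet down to mono, mirroring the
// readIndex loop above: no scaling, mixing is just a sum.
std::vector<float> DownmixToMono(const float* interleaved, size_t frames,
                                 size_t channels) {
  std::vector<float> mono(frames, 0.0f);
  size_t readIndex = 0;
  for (size_t i = 0; i < frames; ++i) {
    for (size_t j = 0; j < channels; ++j) {
      mono[i] += interleaved[readIndex++];
    }
  }
  return mono;
}
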
@@ -812,7 +941,7 @@ void AudioInputProcessing::PacketizeAndProcess(MediaTrackGraphImpl* aGraph,
          deinterleavedPacketizedInputDataChannelPointers.Elements());
    }

    StreamConfig inputConfig(aRate, channelCountInput,
    StreamConfig inputConfig(aGraph->GraphRate(), channelCountInput,
                             false /* we don't use typing detection */);
    StreamConfig outputConfig = inputConfig;

@@ -873,8 +1002,11 @@ void AudioInputProcessing::PacketizeAndProcess(MediaTrackGraphImpl* aGraph,
      continue;
    }

    LOG_FRAME("AudioInputProcessing %p Appending %u frames of packetized audio",
              this, mPacketizerInput->mPacketSize);
    LOG_FRAME(
        "(Graph %p, Driver %p) AudioInputProcessing %p Appending %u frames of "
        "packetized audio, leaving %u frames in packetizer",
        aGraph, aGraph->CurrentDriver(), this, mPacketizerInput->mPacketSize,
        mPacketizerInput->FramesAvailable());

    // We already have planar audio data of the right format. Insert into the
    // MTG.

@@ -886,54 +1018,10 @@ void AudioInputProcessing::PacketizeAndProcess(MediaTrackGraphImpl* aGraph,
  }
}

void AudioInputProcessing::ProcessInput(MediaTrackGraphImpl* aGraph,
                                        const AudioSegment* aSegment) {
  MOZ_ASSERT(aGraph);
  MOZ_ASSERT(aGraph->OnGraphThread());

  if (mEnded || !mEnabled || !mLiveBufferingAppended ||
      mPendingData.IsEmpty()) {
    return;
  }

  // The number of NotifyInputData and ProcessInput calls could be different.
  // We always process the input data from NotifyInputData in the first
  // ProcessInput after the NotifyInputData.

  // If some processing is necessary, packetize and insert in the WebRTC.org
  // code. Otherwise, directly insert the mic data in the MTG, bypassing all
  // processing.
  if (PassThrough(aGraph)) {
    if (aSegment && !aSegment->IsEmpty()) {
      mSegment.AppendSegment(aSegment, mPrincipal);
    } else {
      mSegment.AppendFromInterleavedBuffer(mPendingData.Data(),
                                           mPendingData.FrameCount(),
                                           mPendingData.Channels(), mPrincipal);
    }
  } else {
    MOZ_ASSERT(aGraph->GraphRate() == mPendingData.Rate());
    // Bug 1729041: Feed aSegment to PacketizeAndProcess so mPendingData can be
    // removed, and save a copy.
    PacketizeAndProcess(aGraph, mPendingData.Data(), mPendingData.FrameCount(),
                        mPendingData.Rate(), mPendingData.Channels());
  }

  mPendingData.Clear();
}

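ProcessInput either hands the raw segment straight to the output (pass-through) or routes it into the packetized WebRTC processing path. A rough stand-alone illustration of that branch, with std::vector standing in for AudioSegment and both helpers hypothetical:

#include <vector>

using Segment = std::vector<float>;  // stand-in for AudioSegment

// Route input either directly to the output or into a processing queue,
// mirroring the PassThrough branch above (illustrative only).
void RouteInput(bool passThrough, const Segment& input, Segment& output,
                std::vector<Segment>& toPacketize) {
  if (passThrough) {
    output.insert(output.end(), input.begin(), input.end());  // append as-is
  } else {
    toPacketize.push_back(input);  // later packetized into 10ms chunks
  }
}
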
void AudioInputProcessing::NotifyInputStopped(MediaTrackGraphImpl* aGraph) {
  MOZ_ASSERT(aGraph->OnGraphThread());
  // This is called when an AudioCallbackDriver switch has happened for any
  // reason, including other reasons than starting this audio input stream. We
  // reset state when this happens, as a fallback driver may have fiddled with
  // the amount of buffered silence during the switch.
  mLiveBufferingAppended = Nothing();
  mSegment.Clear();
  if (mPacketizerInput) {
    mPacketizerInput->Clear();
  }
  mPendingData.Clear();
  // reason, including other reasons than starting this audio input stream.
}

// Called back on GraphDriver thread!

@@ -944,17 +1032,9 @@ void AudioInputProcessing::NotifyInputData(MediaTrackGraphImpl* aGraph,
                                           uint32_t aChannels,
                                           uint32_t aAlreadyBuffered) {
  MOZ_ASSERT(aGraph->OnGraphThread());
  TRACE("AudioInputProcessing::NotifyInputData");

  MOZ_ASSERT(aGraph->GraphRate() == aRate);
  MOZ_ASSERT(mEnabled);

  if (!mLiveBufferingAppended) {
    // First time we see live frames getting added. Use what's already buffered
    // in the driver's scratch buffer as a starting point.
    mLiveBufferingAppended = Some(aAlreadyBuffered);
  }

  mPendingData.Push(aBuffer, aFrames, aRate, aChannels);
  TRACE("AudioInputProcessing::NotifyInputData");
}

void AudioInputProcessing::DeviceChanged(MediaTrackGraphImpl* aGraph) {

@@ -962,6 +1042,10 @@ void AudioInputProcessing::DeviceChanged(MediaTrackGraphImpl* aGraph) {

  // Reset some processing
  mAudioProcessing->Initialize();
  LOG_FRAME(
      "(Graph %p, Driver %p) AudioInputProcessing %p Reinitializing audio "
      "processing",
      aGraph, aGraph->CurrentDriver(), this);
}

void AudioInputProcessing::ApplyConfig(MediaTrackGraphImpl* aGraph,

@@ -973,7 +1057,6 @@ void AudioInputProcessing::ApplyConfig(MediaTrackGraphImpl* aGraph,
void AudioInputProcessing::End() {
  mEnded = true;
  mSegment.Clear();
  mPendingData.Clear();
}

TrackTime AudioInputProcessing::NumBufferedFrames(

@@ -982,6 +1065,74 @@ TrackTime AudioInputProcessing::NumBufferedFrames(
  return mSegment.GetDuration();
}

void AudioInputProcessing::EnsureAudioProcessing(MediaTrackGraphImpl* aGraph,
                                                 uint32_t aChannels) {
  MOZ_ASSERT(aGraph->OnGraphThread());
  MOZ_ASSERT(aChannels > 0);
  MOZ_ASSERT(mEnabled);
  MOZ_ASSERT(!mSkipProcessing);

  if (mPacketizerInput && mPacketizerInput->mChannels == aChannels) {
    return;
  }

  // If mPacketizerInput exists but with a different channel count, there is no
  // need to change the pre-buffering: the packet size is the same as the old
  // one, because the rate is constant.
  MOZ_ASSERT_IF(mPacketizerInput, mPacketizerInput->mPacketSize ==
                                      GetPacketSize(aGraph->GraphRate()));
  bool needPreBuffering = !mPacketizerInput;
  if (mPacketizerInput) {
    const TrackTime numBufferedFrames =
        static_cast<TrackTime>(mPacketizerInput->FramesAvailable());
    mSegment.AppendNullData(numBufferedFrames);
    mPacketizerInput = Nothing();
  }

  mPacketizerInput.emplace(GetPacketSize(aGraph->GraphRate()), aChannels);

  if (needPreBuffering) {
    LOG_FRAME(
        "(Graph %p, Driver %p) AudioInputProcessing %p: Adding %u frames of "
        "silence as pre-buffering",
        aGraph, aGraph->CurrentDriver(), this, mPacketizerInput->mPacketSize);

    AudioSegment buffering;
    buffering.AppendNullData(
        static_cast<TrackTime>(mPacketizerInput->mPacketSize));
    PacketizeAndProcess(aGraph, buffering);
  }
}

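When the packetizer is created for the first time, one packet of silence is processed up front so the consumer never underruns while the packetizer holds back a partial packet. A simplified sketch of that pre-buffering decision (plain C++, all types and names illustrative):

#include <cstdint>
#include <vector>

struct PacketizerSketch {
  uint32_t packetSize;         // frames per 10ms packet, rate / 100
  std::vector<float> pending;  // frames queued but not yet emitted
};

// On first creation, push one packet of silence through the pipeline so a
// full packet is buffered ahead of the first real pull.
void PreBufferIfNeeded(bool firstCreation, PacketizerSketch& p) {
  if (!firstCreation) {
    return;  // a channel-count change keeps the same packet size
  }
  p.pending.assign(p.packetSize, 0.0f);  // one packet of silence
}
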
void AudioInputProcessing::ResetAudioProcessing(MediaTrackGraphImpl* aGraph) {
  MOZ_ASSERT(aGraph->OnGraphThread());
  MOZ_ASSERT(mSkipProcessing || !mEnabled);
  MOZ_ASSERT(mPacketizerInput);

  LOG_FRAME(
      "(Graph %p, Driver %p) AudioInputProcessing %p Resetting audio "
      "processing",
      aGraph, aGraph->CurrentDriver(), this);

  // Reset AudioProcessing so that if we resume processing in the future it
  // doesn't depend on old state.
  mAudioProcessing->Initialize();

  MOZ_ASSERT(static_cast<uint32_t>(mSegment.GetDuration()) +
                 mPacketizerInput->FramesAvailable() ==
             mPacketizerInput->mPacketSize);

  // It's ok to clear all the internal buffer here since we won't use mSegment
  // in pass-through mode or when audio processing is disabled.
  LOG_FRAME(
      "(Graph %p, Driver %p) AudioInputProcessing %p Emptying out %" PRId64
      " frames of data",
      aGraph, aGraph->CurrentDriver(), this, mSegment.GetDuration());
  mSegment.Clear();

  mPacketizerInput = Nothing();
}

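The assertion above encodes the pre-buffering invariant: the frames still buffered in mSegment plus the partial packet held by the packetizer always total exactly one packet. In numbers (values here are illustrative):

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t packetSize = 480;    // 10ms at 48kHz
  uint32_t framesInPacketizer = 128;  // partial packet not yet emitted
  uint32_t framesInSegment = packetSize - framesInPacketizer;  // 352 buffered
  assert(framesInSegment + framesInPacketizer == packetSize);
  return 0;
}
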
void AudioInputTrack::Destroy() {
  MOZ_ASSERT(NS_IsMainThread());
  CloseAudioInput();

@@ -1031,38 +1182,101 @@ void AudioInputTrack::DestroyImpl() {
void AudioInputTrack::ProcessInput(GraphTime aFrom, GraphTime aTo,
                                   uint32_t aFlags) {
  TRACE_COMMENT("AudioInputTrack::ProcessInput", "AudioInputTrack %p", this);
  MOZ_ASSERT(mInputProcessing);

  // Check if there is a connected NativeInputTrack
  NativeInputTrack* source = nullptr;
  if (!mInputs.IsEmpty()) {
    for (const MediaInputPort* input : mInputs) {
      MOZ_ASSERT(input->GetSource());
      if (input->GetSource()->AsNativeInputTrack()) {
        source = input->GetSource()->AsNativeInputTrack();
        break;
      }
  LOG_FRAME(
      "(Graph %p, Driver %p) AudioInputTrack %p ProcessInput from %" PRId64
      " to %" PRId64 ", needs %" PRId64 " frames",
      mGraph, mGraph->CurrentDriver(), this, aFrom, aTo, aTo - aFrom);

  if (aFrom >= aTo) {
    return;
  }

  if (!mInputProcessing->IsEnded()) {
    MOZ_ASSERT(TrackTimeToGraphTime(GetEnd()) == aFrom);
    if (mInputs.IsEmpty()) {
      GetData<AudioSegment>()->AppendNullData(aTo - aFrom);
      LOG_FRAME("(Graph %p, Driver %p) AudioInputTrack %p Filling %" PRId64
                " frames of null data (no input source)",
                mGraph, mGraph->CurrentDriver(), this, aTo - aFrom);
    } else {
      MOZ_ASSERT(mInputs.Length() == 1);
      AudioSegment data;
      GetInputSourceData(data, mInputProcessing->GetPrincipalHandle(),
                         mInputs[0], aFrom, aTo);
      mInputProcessing->Process(GraphImpl(), aFrom, aTo, &data,
                                GetData<AudioSegment>());
    }
  }
  MOZ_ASSERT(TrackTimeToGraphTime(GetEnd()) == aTo);

  // Push the input data from the connected NativeInputTrack to mInputProcessing
  if (source) {
    MOZ_ASSERT(source->GraphImpl() == GraphImpl());
    MOZ_ASSERT(source->mSampleRate == mSampleRate);
    MOZ_ASSERT(GraphImpl()->GraphRate() == mSampleRate);
    mInputProcessing->ProcessInput(GraphImpl(),
                                   source->GetData<AudioSegment>());
  }

  bool ended = false;
  mInputProcessing->Pull(
      GraphImpl(), aFrom, aTo, TrackTimeToGraphTime(GetEnd()),
      GetData<AudioSegment>(), aTo == GraphImpl()->mStateComputedTime, &ended);
  ApplyTrackDisabling(mSegment.get());
  if (ended && (aFlags & ALLOW_END)) {
    ApplyTrackDisabling(mSegment.get());
  } else if (aFlags & ALLOW_END) {
    mEnded = true;
  }
}

void AudioInputTrack::GetInputSourceData(AudioSegment& aOutput,
                                         const PrincipalHandle& aPrincipal,
                                         const MediaInputPort* aPort,
                                         GraphTime aFrom, GraphTime aTo) const {
  MOZ_ASSERT(mGraph->OnGraphThread());
  MOZ_ASSERT(aOutput.IsEmpty());

  MediaTrack* source = aPort->GetSource();
  GraphTime next;
  for (GraphTime t = aFrom; t < aTo; t = next) {
    MediaInputPort::InputInterval interval =
        MediaInputPort::GetNextInputInterval(aPort, t);
    interval.mEnd = std::min(interval.mEnd, aTo);

    const bool inputEnded =
        source->Ended() &&
        source->GetEnd() <=
            source->GraphTimeToTrackTimeWithBlocking(interval.mStart);

    TrackTime ticks = interval.mEnd - interval.mStart;
    next = interval.mEnd;

    if (interval.mStart >= interval.mEnd) {
      break;
    }

    if (inputEnded) {
      aOutput.AppendNullData(ticks);
      LOG_FRAME("(Graph %p, Driver %p) AudioInputTrack %p Getting %" PRId64
                " ticks of null data from input port source (ended input)",
                mGraph, mGraph->CurrentDriver(), this, ticks);
    } else if (interval.mInputIsBlocked) {
      aOutput.AppendNullData(ticks);
      LOG_FRAME("(Graph %p, Driver %p) AudioInputTrack %p Getting %" PRId64
                " ticks of null data from input port source (blocked input)",
                mGraph, mGraph->CurrentDriver(), this, ticks);
    } else if (source->IsSuspended()) {
      aOutput.AppendNullData(ticks);
      LOG_FRAME(
          "(Graph %p, Driver %p) AudioInputTrack %p Getting %" PRId64
          " ticks of null data from input port source (source is suspended)",
          mGraph, mGraph->CurrentDriver(), this, ticks);
    } else {
      TrackTime start =
          source->GraphTimeToTrackTimeWithBlocking(interval.mStart);
      TrackTime end = source->GraphTimeToTrackTimeWithBlocking(interval.mEnd);
      MOZ_ASSERT(source->GetData<AudioSegment>()->GetDuration() >= end);

      AudioSegment data;
      data.AppendSlice(*source->GetData<AudioSegment>(), start, end);

      // Replace the principal
      aOutput.AppendSegment(&data, aPrincipal);

      LOG_FRAME("(Graph %p, Driver %p) AudioInputTrack %p Getting %" PRId64
                " ticks of real data from input port source %p",
                mGraph, mGraph->CurrentDriver(), this, end - start, source);
    }
  }
}

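GetInputSourceData walks [aFrom, aTo) one input interval at a time, appending silence for ended, blocked, or suspended intervals and a slice of real data otherwise. A reduced sketch of the interval walk (Interval is a hypothetical stand-in for MediaInputPort::InputInterval):

#include <algorithm>
#include <cstdint>
#include <vector>

struct Interval {
  int64_t start, end;
  bool blocked;  // collapses the ended/blocked/suspended cases above
};

// Count how many ticks in [from, to) come out as null (silent) data.
int64_t CountNullTicks(const std::vector<Interval>& intervals, int64_t from,
                       int64_t to) {
  int64_t nullTicks = 0;
  for (Interval i : intervals) {
    i.start = std::max(i.start, from);
    i.end = std::min(i.end, to);  // clamp to the requested range
    if (i.start >= i.end) {
      continue;
    }
    if (i.blocked) {
      nullTicks += i.end - i.start;  // would be AppendNullData(ticks)
    }
  }
  return nullTicks;
}
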
void AudioInputTrack::SetInputProcessingImpl(
    RefPtr<AudioInputProcessing> aInputProcessing) {
  MOZ_ASSERT(GraphImpl()->OnGraphThread());

@@ -126,10 +126,8 @@ class AudioInputProcessing : public AudioDataListener {
 public:
  AudioInputProcessing(uint32_t aMaxChannelCount,
                       const PrincipalHandle& aPrincipalHandle);

  void Pull(MediaTrackGraphImpl* aGraph, GraphTime aFrom, GraphTime aTo,
            GraphTime aTrackEnd, AudioSegment* aSegment,
            bool aLastPullThisIteration, bool* aEnded);
  void Process(MediaTrackGraphImpl* aGraph, GraphTime aFrom, GraphTime aTo,
               AudioSegment* aInput, AudioSegment* aOutput);

  void NotifyOutputData(MediaTrackGraphImpl* aGraph, AudioDataValue* aBuffer,
                        size_t aFrames, TrackRate aRate,

@@ -146,8 +144,8 @@ class AudioInputProcessing : public AudioDataListener {
    return !PassThrough(aGraph);
  }

  void Start();
  void Stop();
  void Start(MediaTrackGraphImpl* aGraph);
  void Stop(MediaTrackGraphImpl* aGraph);

  void DeviceChanged(MediaTrackGraphImpl* aGraph) override;

@@ -157,12 +155,8 @@ class AudioInputProcessing : public AudioDataListener {

  void Disconnect(MediaTrackGraphImpl* aGraph) override;

  // aSegment stores the unprocessed non-interleaved audio input data from mic
  void ProcessInput(MediaTrackGraphImpl* aGraph, const AudioSegment* aSegment);

  void PacketizeAndProcess(MediaTrackGraphImpl* aGraph,
                           const AudioDataValue* aBuffer, size_t aFrames,
                           TrackRate aRate, uint32_t aChannels);
                           const AudioSegment& aSegment);

  void SetPassThrough(MediaTrackGraphImpl* aGraph, bool aPassThrough);
  uint32_t GetRequestedInputChannelCount();

@@ -182,8 +176,19 @@ class AudioInputProcessing : public AudioDataListener {

  TrackTime NumBufferedFrames(MediaTrackGraphImpl* aGraph) const;

  // The packet size is the number of frames in 10ms. The unit of aRate is Hz.
  constexpr static uint32_t GetPacketSize(TrackRate aRate) {
    return static_cast<uint32_t>(aRate) / 100u;
  }

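Since the packetizer always works in 10ms packets, the packet size is simply the rate divided by 100. A standalone restatement with the arithmetic checked at compile time (the helper name is illustrative):

#include <cstdint>

constexpr uint32_t PacketSize(uint32_t aRateHz) { return aRateHz / 100u; }

static_assert(PacketSize(48000) == 480, "10ms at 48kHz is 480 frames");
static_assert(PacketSize(44100) == 441, "10ms at 44.1kHz is 441 frames");
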
  bool IsEnded() const { return mEnded; }

  const PrincipalHandle& GetPrincipalHandle() const { return mPrincipal; }

 private:
  ~AudioInputProcessing() = default;
  void EnsureAudioProcessing(MediaTrackGraphImpl* aGraph, uint32_t aChannels);
  void ResetAudioProcessing(MediaTrackGraphImpl* aGraph);
  // This implements the processing algorithm to apply to the input (e.g. a
  // microphone). If all algorithms are disabled, this class is not used. This
  // class only accepts audio chunks of 10ms. It has two inputs and one output:

@@ -215,13 +220,6 @@ class AudioInputProcessing : public AudioDataListener {
  AlignedFloatBuffer mInputDownmixBuffer;
  // Stores data waiting to be pulled.
  AudioSegment mSegment;
  // Set to Nothing() by Start(). Once live frames have been appended from the
  // audio callback, this is the number of frames appended as pre-buffer for
  // that data, to avoid underruns. Buffering in the track might be needed
  // because of the AUDIO_BLOCK interval at which we run the graph, and because
  // the packetizer keeps some input data. Care must be taken when turning the
  // packetizer on and off.
  Maybe<TrackTime> mLiveBufferingAppended;
  // Principal for the data that flows through this class.
  const PrincipalHandle mPrincipal;
  // Whether or not this MediaEngine is enabled. If it's not enabled, it

@@ -230,11 +228,15 @@ class AudioInputProcessing : public AudioDataListener {
  bool mEnabled;
  // Whether or not we've ended and removed the AudioInputTrack.
  bool mEnded;
  // Stores the unprocessed interleaved audio input data.
  AudioInputSamples mPendingData;
  // When processing is enabled, the number of packets received by this
  // instance, to implement periodic logging.
  uint64_t mPacketCount;
  // A storage holding the interleaved audio data converted from the
  // AudioSegment. This will be used as an input parameter for
  // PacketizeAndProcess. This should be removed once bug 1729041 is done.
  AutoTArray<AudioDataValue,
             SilentChannel::AUDIO_PROCESSING_FRAMES * GUESS_AUDIO_CHANNELS>
      mInterleavedBuffer;
};

// MediaTrack subclass tailored for MediaEngineWebRTCMicrophoneSource.

@@ -283,6 +285,12 @@ class AudioInputTrack : public ProcessedMediaTrack {
               "Must set mInputProcessing before exposing to content");
    return mInputProcessing->GetRequestedInputChannelCount();
  }
  // Get the data in [aFrom, aTo) from aPort->GetSource() to aOutput. aOutput
  // needs to be empty.
  void GetInputSourceData(AudioSegment& aOutput,
                          const PrincipalHandle& aPrincipal,
                          const MediaInputPort* aPort, GraphTime aFrom,
                          GraphTime aTo) const;

  // Any thread
  AudioInputTrack* AsAudioInputTrack() override { return this; }