bug 1474222 change ConvolverNode output to mono for single channel convolution r=padenot

This also returns to using a single convolver for processing of mono input,
which introduces complexity in up-mixing the state of the convolver when a
second channel is added.

MozReview-Commit-ID: KeBrAswQbtF

--HG--
extra : rebase_source : d793bd967e0291069e4e6cc418de53c4b4cf3253
Karl Tomlinson 2018-08-06 21:24:15 +12:00
Parent 53de4f8c06
Commit 7ee8880ce6
2 changed files with 240 additions and 39 deletions

View file

@@ -30,13 +30,64 @@ class ConvolverNodeEngine final : public AudioNodeEngine
public:
ConvolverNodeEngine(AudioNode* aNode, bool aNormalize)
: AudioNodeEngine(aNode)
, mLeftOverData(INT32_MIN)
, mSampleRate(0.0f)
, mUseBackgroundThreads(!aNode->Context()->IsOffline())
, mNormalize(aNormalize)
{
}
// Indicates how the right output channel is generated.
enum class RightConvolverMode {
// A right convolver is always used when there is more than one impulse
// response channel.
Always,
// With a single response channel, the mode may be either Direct or
// Difference. The decision on which to use is made when stereo input is
// received. Once the right convolver is in use, convolver state is
// suitable only for the selected mode, and so the mode cannot change
// until the right convolver contains only silent history.
//
// With Direct mode, each convolver processes a corresponding channel.
// This mode is selected when input is initially stereo or
// channelInterpretation is "discrete" at the time of starting the right
// convolver when input changes from non-silent mono to stereo.
Direct,
// Difference mode is selected if channelInterpretation is "speakers" at
// the time of starting the right convolver, when the input changes from
// mono to stereo.
//
// When non-silent input is initially mono, with a single response
// channel, the right output channel is not produced until input becomes
// stereo. Only a single convolver is used for mono processing. When
// stereo input arrives after mono input, output must be as if the mono
// signal remaining in the left convolver is up-mixed, but the right
// convolver has not been initialized with the history of the mono input.
// Copying the state of the left convolver into the right convolver is not
// desirable, because there is considerable state to copy, and the
// different convolvers are intended to process out of phase, which means
// that state from one convolver would not directly map to state in
// another convolver.
//
// Instead the distributive property of convolution is used to generate
// the right output channel using information in the left output channel.
// Using l and r to denote the left and right channel input signals, g the
// impulse response, and * convolution, the convolution of the right
// channel can be given by
//
// r * g = (l + (r - l)) * g
// = l * g + (r - l) * g
//
// The left convolver continues to process the left channel l to produce
// l * g. The right convolver processes the difference of input channel
// signals r - l to produce (r - l) * g. The outputs of the two
// convolvers are added to generate the right channel output r * g.
//
// The benefit of doing this is that the history of the r - l input for a
// "speakers" up-mixed mono signal is zero, and so an empty convolver
// already has exactly the right history for mixing the previous mono
// signal with the new stereo signal.
Difference
};
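For reference, the identity that Difference mode relies on can be checked numerically with a naive direct convolution. The sketch below is illustrative only and is not part of this patch; all names in it are hypothetical.

// Illustrative check of r * g == l * g + (r - l) * g for short sequences.
#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

// Naive O(n*m) convolution of a signal with an impulse response.
static std::vector<float> NaiveConvolve(const std::vector<float>& aSignal,
                                        const std::vector<float>& aResponse)
{
  std::vector<float> out(aSignal.size() + aResponse.size() - 1, 0.0f);
  for (size_t i = 0; i < aSignal.size(); ++i) {
    for (size_t j = 0; j < aResponse.size(); ++j) {
      out[i + j] += aSignal[i] * aResponse[j];
    }
  }
  return out;
}

int main()
{
  std::vector<float> l = { 0.5f, -0.25f, 1.0f, 0.0f };   // left input
  std::vector<float> r = { 1.0f, 0.75f, -0.5f, 0.25f };  // right input
  std::vector<float> g = { 0.9f, 0.3f, 0.1f };           // impulse response

  // The signal fed to the right convolver in Difference mode: r - l.
  std::vector<float> diff(l.size());
  for (size_t i = 0; i < l.size(); ++i) {
    diff[i] = r[i] - l[i];
  }

  std::vector<float> rg = NaiveConvolve(r, g);     // r * g
  std::vector<float> lg = NaiveConvolve(l, g);     // l * g
  std::vector<float> dg = NaiveConvolve(diff, g);  // (r - l) * g

  // l * g + (r - l) * g matches r * g sample for sample.
  for (size_t i = 0; i < rg.size(); ++i) {
    assert(std::fabs(rg[i] - (lg[i] + dg[i])) < 1e-5f);
  }
  return 0;
}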
enum Parameters {
SAMPLE_RATE,
NORMALIZE
@@ -73,17 +124,45 @@ public:
// Very large FFTs will have worse phase errors. Given these constraints 32768 is a good compromise.
const size_t MaxFFTSize = 32768;
mLeftOverData = INT32_MIN; // reset
// Reset.
mRemainingLeftOutput = INT32_MIN;
mRemainingRightOutput = 0;
mRemainingRightHistory = 0;
if (aBuffer.IsNull() || !mSampleRate) {
mReverb = nullptr;
return;
}
// Assume for now that convolution of channel difference is not required.
// Direct may change to Difference during processing.
mRightConvolverMode =
aBuffer.ChannelCount() == 1 ? RightConvolverMode::Direct
: RightConvolverMode::Always;
mReverb = new WebCore::Reverb(aBuffer, MaxFFTSize, mUseBackgroundThreads,
mNormalize, mSampleRate);
}
void AllocateReverbInput(const AudioBlock& aInput,
uint32_t aTotalChannelCount)
{
uint32_t inputChannelCount = aInput.ChannelCount();
MOZ_ASSERT(inputChannelCount <= aTotalChannelCount);
mReverbInput.AllocateChannels(aTotalChannelCount);
// Pre-multiply the input's volume
for (uint32_t i = 0; i < inputChannelCount; ++i) {
const float* src = static_cast<const float*>(aInput.mChannelData[i]);
float* dest = mReverbInput.ChannelFloatsForWrite(i);
AudioBlockCopyChannelWithScale(src, aInput.mVolume, dest);
}
// Fill remaining channels with silence
for (uint32_t i = inputChannelCount; i < aTotalChannelCount; ++i) {
float* dest = mReverbInput.ChannelFloatsForWrite(i);
std::fill_n(dest, WEBAUDIO_BLOCK_SIZE, 0.0f);
}
}
void ProcessBlock(AudioNodeStream* aStream,
GraphTime aFrom,
const AudioBlock& aInput,
@@ -92,7 +171,7 @@ public:
bool IsActive() const override
{
return mLeftOverData != INT32_MIN;
return mRemainingLeftOutput != INT32_MIN;
}
size_t SizeOfExcludingThis(MallocSizeOf aMallocSizeOf) const override
@@ -117,12 +196,34 @@ private:
// Keeping mReverbInput across process calls avoids unnecessary reallocation.
AudioBlock mReverbInput;
nsAutoPtr<WebCore::Reverb> mReverb;
int32_t mLeftOverData;
float mSampleRate;
// Tracks samples of the tail remaining to be output. INT32_MIN is a
// special value to indicate that the end of any previous tail has been
// handled.
int32_t mRemainingLeftOutput = INT32_MIN;
// mRemainingRightOutput and mRemainingRightHistory are only used when
// mRightConvolverMode != Always. There is no special handling required at the
// end of tail times and so INT32_MIN is not used.
// mRemainingRightOutput tracks how much longer this node needs to continue
// to produce a right output channel.
int32_t mRemainingRightOutput = 0;
// mRemainingRightHistory tracks how much silent input would be required to
// drain the right convolver, which may sometimes be longer than the period
// a right output channel is required.
int32_t mRemainingRightHistory = 0;
float mSampleRate = 0.0f;
RightConvolverMode mRightConvolverMode = RightConvolverMode::Always;
bool mUseBackgroundThreads;
bool mNormalize;
};
static void
AddScaledLeftToRight(AudioBlock* aBlock, float aScale)
{
const float* left = static_cast<const float*>(aBlock->mChannelData[0]);
float* right = aBlock->ChannelFloatsForWrite(1);
AudioBlockAddChannelWithScale(left, aScale, right);
}
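AudioBlockAddChannelWithScale, used above, is Gecko's existing block helper for accumulating a scaled copy of one channel into another. A scalar sketch of that operation, assuming it computes aOutput[i] += aScale * aInput[i] over one 128-sample block (the real helper may be SIMD-optimized), could look like the hypothetical function below.

// Scalar sketch only; assumes the accumulate-with-scale semantics described
// above. A Web Audio render quantum (WEBAUDIO_BLOCK_SIZE) is 128 frames.
static void
AddChannelWithScaleScalar(const float aInput[128],
                          float aScale,
                          float aOutput[128])
{
  for (int i = 0; i < 128; ++i) {
    aOutput[i] += aScale * aInput[i];
  }
}

With aScale of -1.0f this forms the r - l difference signal in the reverb input before it reaches the second convolver; with 1.0f it adds the left output l * g back into the right channel after processing.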
void
ConvolverNodeEngine::ProcessBlock(AudioNodeStream* aStream,
GraphTime aFrom,
@@ -135,14 +236,16 @@ ConvolverNodeEngine::ProcessBlock(AudioNodeStream* aStream,
return;
}
uint32_t inputChannelCount = aInput.ChannelCount();
if (aInput.IsNull()) {
if (mLeftOverData > 0) {
mLeftOverData -= WEBAUDIO_BLOCK_SIZE;
mReverbInput.AllocateChannels(1);
WriteZeroesToAudioBlock(&mReverbInput, 0, WEBAUDIO_BLOCK_SIZE);
if (mRemainingLeftOutput > 0) {
mRemainingLeftOutput -= WEBAUDIO_BLOCK_SIZE;
AllocateReverbInput(aInput, 1); // floats for silence
} else {
if (mLeftOverData != INT32_MIN) {
mLeftOverData = INT32_MIN;
if (mRemainingLeftOutput != INT32_MIN) {
mRemainingLeftOutput = INT32_MIN;
MOZ_ASSERT(mRemainingRightOutput <= 0);
MOZ_ASSERT(mRemainingRightHistory <= 0);
aStream->ScheduleCheckForInactive();
RefPtr<PlayingRefChanged> refchanged =
new PlayingRefChanged(aStream, PlayingRefChanged::RELEASE);
@@ -153,31 +256,138 @@ ConvolverNodeEngine::ProcessBlock(AudioNodeStream* aStream,
return;
}
} else {
if (aInput.mVolume != 1.0f) {
// Pre-multiply the input's volume
uint32_t numChannels = aInput.ChannelCount();
mReverbInput.AllocateChannels(numChannels);
for (uint32_t i = 0; i < numChannels; ++i) {
const float* src = static_cast<const float*>(aInput.mChannelData[i]);
float* dest = mReverbInput.ChannelFloatsForWrite(i);
AudioBlockCopyChannelWithScale(src, aInput.mVolume, dest);
}
} else {
mReverbInput = aInput;
}
if (mLeftOverData <= 0) {
if (mRemainingLeftOutput <= 0) {
RefPtr<PlayingRefChanged> refchanged =
new PlayingRefChanged(aStream, PlayingRefChanged::ADDREF);
aStream->Graph()->
DispatchToMainThreadAfterStreamStateUpdate(refchanged.forget());
}
mLeftOverData = mReverb->impulseResponseLength();
MOZ_ASSERT(mLeftOverData > 0);
// Use mVolume as a flag to detect whether AllocateReverbInput() gets
// called.
mReverbInput.mVolume = 0.0f;
// Special handling of input channel count changes is used when there is
// only a single impulse response channel. See RightConvolverMode.
if (mRightConvolverMode != RightConvolverMode::Always) {
ChannelInterpretation channelInterpretation =
aStream->GetChannelInterpretation();
if (inputChannelCount == 2) {
if (mRemainingRightHistory <= 0) {
// Will start the second convolver. Choose to convolve the right
// channel directly if there is no left tail to up-mix or up-mixing
// is "discrete".
mRightConvolverMode =
(mRemainingLeftOutput <= 0 ||
channelInterpretation == ChannelInterpretation::Discrete) ?
RightConvolverMode::Direct : RightConvolverMode::Difference;
}
// The extra WEBAUDIO_BLOCK_SIZE is subtracted below.
mRemainingRightOutput =
mReverb->impulseResponseLength() + WEBAUDIO_BLOCK_SIZE;
mRemainingRightHistory = mRemainingRightOutput;
if (mRightConvolverMode == RightConvolverMode::Difference) {
AllocateReverbInput(aInput, 2);
// Subtract left from right.
AddScaledLeftToRight(&mReverbInput, -1.0f);
}
} else if (mRemainingRightHistory > 0) {
// There is one channel of input, but a second convolver also
// requires input. Up-mix appropriately for the second convolver.
if ((mRightConvolverMode == RightConvolverMode::Difference) ^
(channelInterpretation == ChannelInterpretation::Discrete)) {
MOZ_ASSERT(
(mRightConvolverMode == RightConvolverMode::Difference &&
channelInterpretation == ChannelInterpretation::Speakers) ||
(mRightConvolverMode == RightConvolverMode::Direct &&
channelInterpretation == ChannelInterpretation::Discrete));
// The state is one of the following combinations:
// 1) Difference and speakers.
// Up-mixing gives r = l.
// The input to the second convolver is r - l.
// 2) Direct and discrete.
// Up-mixing gives r = 0.
// The input to the second convolver is r.
//
// In each case the input for the second convolver is silence, which
// will drain the convolver.
AllocateReverbInput(aInput, 2);
} else {
if (channelInterpretation == ChannelInterpretation::Discrete) {
MOZ_ASSERT(mRightConvolverMode == RightConvolverMode::Difference);
// channelInterpretation has changed since the second convolver
// was added. "discrete" up-mixing of input would produce a
// silent right channel r = 0, but the second convolver needs
// r - l for RightConvolverMode::Difference.
AllocateReverbInput(aInput, 2);
AddScaledLeftToRight(&mReverbInput, -1.0f);
} else {
MOZ_ASSERT(channelInterpretation ==
ChannelInterpretation::Speakers);
MOZ_ASSERT(mRightConvolverMode == RightConvolverMode::Direct);
// The Reverb will essentially up-mix the single input channel by
// feeding it into both convolvers.
}
// The second convolver does not have silent input, and so it will
// not drain. It will need to continue processing up-mixed input
// because the next input block may be stereo, which would be mixed
// with the signal remaining in the convolvers.
// The extra WEBAUDIO_BLOCK_SIZE is subtracted below.
mRemainingRightHistory =
mReverb->impulseResponseLength() + WEBAUDIO_BLOCK_SIZE;
}
}
}
if (mReverbInput.mVolume == 0.0f) { // not yet set
if (aInput.mVolume != 1.0f) {
AllocateReverbInput(aInput, inputChannelCount); // pre-multiply
} else {
mReverbInput = aInput;
}
}
mRemainingLeftOutput = mReverb->impulseResponseLength();
MOZ_ASSERT(mRemainingLeftOutput > 0);
}
aOutput->AllocateChannels(2);
// "The ConvolverNode produces a mono output only in the single case where
// there is a single input channel and a single-channel buffer."
uint32_t outputChannelCount = 2;
uint32_t reverbOutputChannelCount = 2;
if (mRightConvolverMode != RightConvolverMode::Always) {
// When the input changes from stereo to mono, the output continues to be
// stereo for the length of the tail time, during which the two channels
// may differ.
if (mRemainingRightOutput > 0) {
MOZ_ASSERT(mRemainingRightHistory > 0);
mRemainingRightOutput -= WEBAUDIO_BLOCK_SIZE;
} else {
outputChannelCount = 1;
}
// The second convolver keeps processing until it drains.
if (mRemainingRightHistory > 0) {
mRemainingRightHistory -= WEBAUDIO_BLOCK_SIZE;
} else {
reverbOutputChannelCount = 1;
}
}
// If there are two convolvers, then they each need an output buffer, even
// if the second convolver is only processing to keep history of up-mixed
// input.
aOutput->AllocateChannels(reverbOutputChannelCount);
mReverb->process(&mReverbInput, aOutput);
if (mRightConvolverMode == RightConvolverMode::Difference &&
outputChannelCount == 2) {
// Add left to right.
AddScaledLeftToRight(aOutput, 1.0f);
} else {
// Trim if outputChannelCount < reverbOutputChannelCount
aOutput->mChannelData.TruncateLength(outputChannelCount);
}
}
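To summarize the branching above, the mode chosen when input first becomes stereo while the right convolver holds no history can be restated as a small standalone predicate. This is a hypothetical condensation for illustration only, not code from this patch; the enums are redeclared locally.

// Hypothetical condensation of the mode choice made in ProcessBlock.
enum class Interpretation { Speakers, Discrete };
enum class Mode { Direct, Difference };

static Mode
ChooseRightConvolverMode(bool aLeftTailRemaining, Interpretation aInterpretation)
{
  // Difference mode is only needed when a "speakers" up-mix of the mono
  // signal still ringing in the left convolver must be reproduced.
  if (!aLeftTailRemaining || aInterpretation == Interpretation::Discrete) {
    return Mode::Direct;
  }
  return Mode::Difference;
}

In ProcessBlock, aLeftTailRemaining corresponds to mRemainingLeftOutput > 0, and the interpretation comes from aStream->GetChannelInterpretation().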
ConvolverNode::ConvolverNode(AudioContext* aContext)

View file

@@ -1,10 +1,4 @@
[convolver-response-1-chan.html]
[X 1: Channel 1: Expected 0 for all values but found 1280 unexpected values: \n\tIndex\tActual\n\t[0\]\t-1.1920928955078125e-7\n\t[1\]\t-4.470348358154297e-8\n\t[2\]\t0.3311062455177307\n\t[3\]\t0.6248593926429749\n\t...and 1276 more errors.]
expected: FAIL
[< [1-channel input\] 1 out of 2 assertions were failed.]
expected: FAIL
[X 2: Channel 0 expected to be equal to the array [0,0,0.9458408951759338,0.8448333740234375,0.8210252523422241,0.8620985746383667,0.8430315852165222,0.855602502822876,0.7933436632156372,0.9865825176239014,0.3972480297088623,-0.7786127924919128,-0.9223549962043762,-0.7896472215652466,-0.8727429509162903,-0.8325281143188477...\] but differs in 966 places:\n\tIndex\tActual\t\t\tExpected\n\t[0\]\t2.9802322387695313e-8\t0.0000000000000000e+0\n\t[1\]\t-7.4505805969238281e-8\t0.0000000000000000e+0\n\t[2\]\t9.4584077596664429e-1\t9.4584089517593384e-1\n\t[3\]\t8.4483331441879272e-1\t8.4483337402343750e-1\n\t...and 962 more errors.]
expected: FAIL
@@ -41,10 +35,7 @@
[< [5.1-channel input\] 2 out of 2 assertions were failed.]
expected: FAIL
[# AUDIT TASK RUNNER FINISHED: 5 out of 6 tasks were failed.]
expected: FAIL
[X 1: Channel 1: Expected 0 for all values but found 1279 unexpected values: \n\tIndex\tActual\n\t[1\]\t-2.9802322387695312e-8\n\t[2\]\t0.33110618591308594\n\t[3\]\t0.6248594522476196\n\t[4\]\t0.8481202721595764\n\t...and 1275 more errors.]
[# AUDIT TASK RUNNER FINISHED: 4 out of 6 tasks were failed.]
expected: FAIL
[X 2: Channel 0 expected to be equal to the array [0,0,0.9458407163619995,0.844833254814148,0.821025013923645,0.8620984554290771,0.8430314660072327,0.8556023836135864,0.7933435440063477,0.9865822792053223,0.39724797010421753,-0.7786126136779785,-0.9223548769950867,-0.7896471619606018,-0.8727428317070007,-0.8325279355049133...\] but differs in 993 places:\n\tIndex\tActual\t\t\tExpected\n\t[0\]\t-2.0861625671386719e-7\t0.0000000000000000e+0\n\t[1\]\t-2.9802322387695313e-8\t0.0000000000000000e+0\n\t[2\]\t9.4584059715270996e-1\t9.4584071636199951e-1\n\t[4\]\t8.2102489471435547e-1\t8.2102501392364502e-1\n\t...and 989 more errors.]