Mirror of https://github.com/mozilla/gecko-dev.git
Bug 1649817 - Verify audio buffers in drift correction tests, r=padenot
In the AudioDriftCorrection gtests, the audio in the buffers is simulated by a sine tone of a specific amplitude and frequency. This patch implements a way to automatically verify that the expected signal is received in the output.

Differential Revision: https://phabricator.services.mozilla.com/D81988
This commit is contained in:
Parent: 53bed70ef3
Commit: 70eee99400
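Before the diff, a note on the approach: the verifier added below estimates the tone's frequency from the spacing of positive-to-negative zero crossings and compares it with the generated tone. The following is a minimal, self-contained sketch of that idea, not code from the patch; the helper name EstimateFrequency and the 48 kHz rate are illustrative assumptions only.

// Standalone sketch (illustrative, not part of the patch): estimate a tone's
// frequency by averaging the distance between positive-to-negative zero
// crossings, then compare it with the frequency that was generated.
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

static int32_t EstimateFrequency(const std::vector<float>& aSamples,
                                 int32_t aRate) {
  int32_t crossings = 0;
  int64_t firstCross = -1;
  int64_t lastCross = -1;
  for (size_t i = 1; i < aSamples.size(); ++i) {
    // A positive-to-negative crossing happens once per period.
    if (aSamples[i - 1] > 0.0f && aSamples[i] <= 0.0f) {
      if (firstCross < 0) {
        firstCross = static_cast<int64_t>(i);
      }
      lastCross = static_cast<int64_t>(i);
      ++crossings;
    }
  }
  if (crossings <= 1) {
    return 0;  // Not enough periods to estimate a frequency.
  }
  // Average period, in samples, between consecutive crossings.
  const double period =
      static_cast<double>(lastCross - firstCross) / (crossings - 1);
  return static_cast<int32_t>(aRate / period);
}

int main() {
  const int32_t rate = 48000;        // assumed rate, for illustration only
  const int32_t frequency = 100;     // same default frequency as the patch
  std::vector<float> samples(rate);  // one second of audio
  for (size_t i = 0; i < samples.size(); ++i) {
    samples[i] =
        0.5f * static_cast<float>(sin(2.0 * M_PI * frequency * i / rate));
  }
  // Prints 100 Hz for a clean, uninterrupted tone.
  printf("estimated frequency: %d Hz\n", EstimateFrequency(samples, rate));
  return 0;
}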
@@ -199,13 +199,159 @@ TEST(TestClockDrift, SmallBufferedFrames)
   EXPECT_FLOAT_EQ(c.GetCorrection(), 1.1);
 }
 
+template <typename T>
+class AudioToneGenerator {
+ public:
+  static_assert(std::is_same<T, int16_t>::value ||
+                std::is_same<T, float>::value);
+
+  explicit AudioToneGenerator(int32_t aRate) : mRate(aRate) {
+    MOZ_ASSERT(mRate > 0);
+  }
+
+  void Write(AudioChunk& aChunk) {
+    float time = mTime;
+    for (uint32_t i = 0; i < aChunk.ChannelCount(); ++i) {
+      mTime = time;  // reset the time for each channel
+      T* buffer = aChunk.ChannelDataForWrite<T>(i);
+      Write(buffer, aChunk.GetDuration());
+    }
+  }
+
+  void Write(T* aBuffer, int32_t aFrames) {
+    for (int i = 0; i < aFrames; ++i) {
+      double value = Amplitude() * sin(2 * M_PI * mFrequency * mTime + mPhase);
+      aBuffer[i] = static_cast<T>(value);
+      mTime += mDeltaTime;
+    }
+  }
+
+  T MaxMagnitudeDifference() {
+    return static_cast<T>(Amplitude() *
+                          sin(2 * M_PI * mFrequency * mDeltaTime + mPhase));
+  }
+
+  const int32_t mRate;
+  const int32_t mFrequency = 100;
+  const float mPhase = 0.0;
+  const float mDeltaTime = 1.0f / mRate;
+
+  static T Amplitude() {
+    if (std::is_same<T, int16_t>::value) {
+      return 16384;  // int16_t max / 2
+    }
+    return 0.5f;  // 1.0f / 2
+  }
+
+ private:
+  float mTime = 0.0;
+};
+
+template <typename T>
+class AudioToneVerifier {
+ public:
+  explicit AudioToneVerifier(int32_t aRate)
+      : mRate(aRate), mExpectedTone(aRate) {
+    MOZ_ASSERT(mRate > 0);
+  }
+
+  // Only the mono channel is taken into account.
+  void AppendData(const AudioSegment& segment) {
+    for (AudioSegment::ConstChunkIterator iter(segment); !iter.IsEnded();
+         iter.Next()) {
+      const AudioChunk& c = *iter;
+      CheckBuffer(c.ChannelData<T>()[0], c.GetDuration());
+    }
+  }
+
+  int32_t EstimatedFreq() const {
+    if (mTotalFramesSoFar == PreSilenceSamples() || mZeroCrossCount <= 1) {
+      return 0;
+    }
+    MOZ_ASSERT(mZeroCrossCount > 1);
+    return mRate / (mSumPeriodInSamples / (mZeroCrossCount - 1));
+  }
+
+  int64_t PreSilenceSamples() const {
+    // The first sample of the sinewave is zero.
+    MOZ_ASSERT(mPreSilenceSamples >= 1);
+    return mPreSilenceSamples - 1;
+  }
+
+  int32_t CountDiscontinuities() const {
+    // Every discontinuity is counted twice: once on the current sample and
+    // once more on the previous sample.
+    return mDiscontinuitiesCount / 2;
+  }
+
+ private:
+  void CheckBuffer(const T* aBuffer, int32_t aSamples) {
+    for (int i = 0; i < aSamples; ++i) {
+      ++mTotalFramesSoFar;
+      // Avoid pre-silence
+      if (!CountPreSilence(aBuffer[i])) {
+        CountZeroCrossing(aBuffer[i]);
+        CountDiscontinuities(aBuffer[i]);
+      }
+
+      mPrevious = aBuffer[i];
+    }
+  }
+
+  bool CountPreSilence(T aCurrentSample) {
+    if (IsZero(aCurrentSample) && mPreSilenceSamples == mTotalFramesSoFar - 1) {
+      ++mPreSilenceSamples;
+      return true;
+    }
+    return false;
+  }
+
+  // Positive-to-negative direction.
+  void CountZeroCrossing(T aCurrentSample) {
+    if (mPrevious > 0 && aCurrentSample <= 0) {
+      if (mZeroCrossCount++) {
+        MOZ_ASSERT(mZeroCrossCount > 1);
+        mSumPeriodInSamples += mTotalFramesSoFar - mLastZeroCrossPosition;
+      }
+      mLastZeroCrossPosition = mTotalFramesSoFar;
+    }
+  }
+
+  void CountDiscontinuities(T aCurrentSample) {
+    mDiscontinuitiesCount += fabs(fabs(aCurrentSample) - fabs(mPrevious)) >
+                             2 * mExpectedTone.MaxMagnitudeDifference();
+  }
+
+  bool IsZero(float a) { return fabs(a) < 1e-8; }
+  bool IsZero(short a) { return a == 0; }
+
+ private:
+  const int32_t mRate;
+  AudioToneGenerator<T> mExpectedTone;
+
+  int32_t mZeroCrossCount = 0;
+  int64_t mLastZeroCrossPosition = 0;
+  int64_t mSumPeriodInSamples = 0;
+
+  int64_t mTotalFramesSoFar = 0;
+  int64_t mPreSilenceSamples = 0;
+
+  int32_t mDiscontinuitiesCount = 0;
+  // This is needed to connect the current buffer with the previous one.
+  T mPrevious = {};
+};
+
 // Print the mono channel of a segment.
 void printAudioSegment(const AudioSegment& segment) {
   for (AudioSegment::ConstChunkIterator iter(segment); !iter.IsEnded();
        iter.Next()) {
     const AudioChunk& c = *iter;
-    const float* buffer = c.ChannelData<float>()[0];
     for (int i = 0; i < c.GetDuration(); ++i) {
-      printf("%f\n", buffer[i]);
+      if (c.mBufferFormat == AUDIO_FORMAT_FLOAT32) {
+        printf("%f\n", c.ChannelData<float>()[0][i]);
+      } else {
+        printf("%d\n", c.ChannelData<int16_t>()[0][i]);
+      }
     }
   }
 }
@@ -219,11 +365,9 @@ void testAudioCorrection(int32_t aSourceRate, int32_t aTargetRate) {
   const int32_t sampleRateReceiver = aTargetRate;
   AudioDriftCorrection ad(sampleRateTransmitter, sampleRateReceiver);
 
-  const float amplitude = 0.5;
-  const float frequency = 10;
-  const float phase = 0.0;
-  float time = 0.0;
-  const float deltaTime = 1.0f / sampleRateTransmitter;
+  AudioToneGenerator<AudioDataValue> tone(sampleRateTransmitter);
+  AudioToneVerifier<AudioDataValue> inToneVerifier(sampleRateTransmitter);
+  AudioToneVerifier<AudioDataValue> outToneVerifier(sampleRateReceiver);
 
   int32_t sourceFrames;
   const int32_t targetFrames = sampleRateReceiver / 100;
@@ -239,16 +383,12 @@ void testAudioCorrection(int32_t aSourceRate, int32_t aTargetRate) {
 
     for (int n = 0; n < 250; ++n) {
       // Create the input (sine tone)
-      AudioChunk chunk =
-          CreateAudioChunk<float>(sourceFrames, 1, AUDIO_FORMAT_FLOAT32);
-      float* monoBuffer = chunk.ChannelDataForWrite<float>(0);
-      for (int i = 0; i < sourceFrames; ++i) {
-        double value = amplitude * sin(2 * M_PI * frequency * time + phase);
-        monoBuffer[i] = static_cast<float>(value);
-        time += deltaTime;
-      }
+      AudioChunk chunk = CreateAudioChunk<AudioDataValue>(sourceFrames, 1,
+                                                          AUDIO_OUTPUT_FORMAT);
+      tone.Write(chunk);
       AudioSegment inSegment;
       inSegment.AppendAndConsumeChunk(&chunk);
+      inToneVerifier.AppendData(inSegment);
       // Print the input for debugging
       // printAudioSegment(inSegment);
 
@@ -257,8 +397,18 @@ void testAudioCorrection(int32_t aSourceRate, int32_t aTargetRate) {
       EXPECT_EQ(outSegment.GetDuration(), targetFrames);
       // Print the output for debugging
       // printAudioSegment(outSegment);
+      outToneVerifier.AppendData(outSegment);
     }
   }
+  EXPECT_EQ(inToneVerifier.EstimatedFreq(), tone.mFrequency);
+  EXPECT_EQ(inToneVerifier.PreSilenceSamples(), 0);
+  EXPECT_EQ(inToneVerifier.CountDiscontinuities(), 0);
+
+  EXPECT_EQ(outToneVerifier.EstimatedFreq(), tone.mFrequency);
+  // The expected pre-silence is 50ms plus the resampling; this is roughly more
+  // than 2000 frames for the sample rates being used here.
+  EXPECT_GT(outToneVerifier.PreSilenceSamples(), 2000);
+  EXPECT_EQ(outToneVerifier.CountDiscontinuities(), 0);
 }
 
 TEST(TestAudioDriftCorrection, Basic)
@@ -273,11 +423,9 @@ void testMonoToStereoInput(int aSourceRate, int aTargetRate) {
   const int32_t sampleRateReceiver = aTargetRate;
   AudioDriftCorrection ad(sampleRateTransmitter, sampleRateReceiver);
 
-  const float amplitude = 0.5;
-  const float frequency = 10;
-  const float phase = 0.0;
-  float time = 0.0;
-  const float deltaTime = 1.0f / sampleRateTransmitter;
+  AudioToneGenerator<AudioDataValue> tone(sampleRateTransmitter);
+  AudioToneVerifier<AudioDataValue> inToneVerify(sampleRateTransmitter);
+  AudioToneVerifier<AudioDataValue> outToneVerify(sampleRateReceiver);
 
   int32_t sourceFrames;
   const int32_t targetFrames = sampleRateReceiver / 100;
@@ -293,25 +441,18 @@ void testMonoToStereoInput(int aSourceRate, int aTargetRate) {
 
     for (int n = 0; n < 250; ++n) {
      // Create the input (sine tone)
-      AudioChunk chunk =
-          CreateAudioChunk<float>(sourceFrames / 2, 1, AUDIO_FORMAT_FLOAT32);
-      float* monoBuffer = chunk.ChannelDataForWrite<float>(0);
-      for (int i = 0; i < chunk.GetDuration(); ++i) {
-        double value = amplitude * sin(2 * M_PI * frequency * time + phase);
-        monoBuffer[i] = static_cast<float>(value);
-        time += deltaTime;
-      }
-      AudioChunk chunk2 =
-          CreateAudioChunk<float>(sourceFrames / 2, 2, AUDIO_FORMAT_FLOAT32);
-      for (int i = 0; i < chunk2.GetDuration(); ++i) {
-        double value = amplitude * sin(2 * M_PI * frequency * time + phase);
-        chunk2.ChannelDataForWrite<float>(0)[i] =
-            chunk2.ChannelDataForWrite<float>(1)[i] = static_cast<float>(value);
-        time += deltaTime;
-      }
+      AudioChunk chunk = CreateAudioChunk<AudioDataValue>(sourceFrames / 2, 1,
+                                                          AUDIO_OUTPUT_FORMAT);
+      tone.Write(chunk);
+
+      AudioChunk chunk2 = CreateAudioChunk<AudioDataValue>(sourceFrames / 2, 2,
+                                                           AUDIO_OUTPUT_FORMAT);
+      tone.Write(chunk2);
+
       AudioSegment inSegment;
       inSegment.AppendAndConsumeChunk(&chunk);
       inSegment.AppendAndConsumeChunk(&chunk2);
+      inToneVerify.AppendData(inSegment);
       // Print the input for debugging
       // printAudioSegment(inSegment);
 
@@ -320,8 +461,21 @@ void testMonoToStereoInput(int aSourceRate, int aTargetRate) {
       EXPECT_EQ(outSegment.GetDuration(), targetFrames);
       // Print the output for debugging
       // printAudioSegment(outSegment);
+      outToneVerify.AppendData(outSegment);
     }
   }
+  EXPECT_EQ(inToneVerify.EstimatedFreq(), tone.mFrequency);
+  EXPECT_EQ(inToneVerify.PreSilenceSamples(), 0);
+  EXPECT_EQ(inToneVerify.CountDiscontinuities(), 0);
+
+  EXPECT_GT(outToneVerify.CountDiscontinuities(), 0)
+      << "Expect discontinuities";
+  EXPECT_NE(outToneVerify.EstimatedFreq(), tone.mFrequency)
+      << "Estimation is not accurate due to discontinuities";
+  // The expected pre-silence is 50ms plus the resampling. However, due to
+  // discontinuities, pre-silence is expected only in the first iteration,
+  // which is roughly a little more than 400 frames for the chosen sample rates.
+  EXPECT_GT(outToneVerify.PreSilenceSamples(), 400);
 }
 
 TEST(TestAudioDriftCorrection, MonoToStereoInput)
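A rough sanity check on the numbers used in the expectations above (illustrative only; the tests run at several sample rates, and 48 kHz is just an assumption here): a 100 Hz tone of amplitude 0.5 changes by at most about 0.5 * sin(2 * pi * 100 / 48000) ≈ 0.0065 between consecutive samples, so any jump larger than twice that is flagged as a discontinuity, and 50 ms of buffering is 2205 to 2400 frames at 44.1 to 48 kHz, which is where the "more than 2000 frames" pre-silence expectation comes from.

// Back-of-the-envelope check of the discontinuity threshold (illustrative).
#include <cmath>
#include <cstdio>

int main() {
  const double amplitude = 0.5;    // float amplitude used by AudioToneGenerator
  const double frequency = 100.0;  // default mFrequency of the generated tone
  const double rate = 48000.0;     // assumed sample rate for this example
  const double deltaTime = 1.0 / rate;

  // Largest step between two consecutive samples of the tone; this is what
  // MaxMagnitudeDifference() computes.
  const double maxStep = amplitude * sin(2 * M_PI * frequency * deltaTime);

  // The verifier flags any jump larger than twice this as a discontinuity.
  printf("max per-sample step: %f, threshold: %f\n", maxStep, 2 * maxStep);
  return 0;
}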