gecko-dev/dom/media/AudioSegment.cpp

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
 * You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "AudioSegment.h"
#include "AudioMixer.h"
#include "AudioChannelFormat.h"
#include "MediaTrackGraph.h"  // for nsAutoRefTraits<SpeexResamplerState>
#include <speex/speex_resampler.h>

namespace mozilla {

// Zero-filled backing storage for silent audio:
// MAX_AUDIO_SAMPLE_SIZE * AUDIO_PROCESSING_FRAMES bytes. The
// SilentChannel::ZeroChannel<T>() specializations in this file return this
// same array reinterpreted as float or int16_t samples.
const uint8_t
    SilentChannel::gZeroChannel[MAX_AUDIO_SAMPLE_SIZE *
                                SilentChannel::AUDIO_PROCESSING_FRAMES] = {0};

// Returns the shared zero-filled buffer (gZeroChannel) viewed as float
// samples.
template <>
const float* SilentChannel::ZeroChannel<float>() {
  const uint8_t* zeroBytes = SilentChannel::gZeroChannel;
  return reinterpret_cast<const float*>(zeroBytes);
}

// Returns the shared zero-filled buffer (gZeroChannel) viewed as int16_t
// samples.
template <>
const int16_t* SilentChannel::ZeroChannel<int16_t>() {
  const uint8_t* zeroBytes = SilentChannel::gZeroChannel;
  return reinterpret_cast<const int16_t*>(zeroBytes);
}

void AudioSegment::ApplyVolume(float aVolume) {
  for (ChunkIterator ci(*this); !ci.IsEnded(); ci.Next()) {
    ci->mVolume *= aVolume;
  }
}

// Resamples every chunk of this segment from aInRate to aOutRate, treating the
// sample data as type T.
//
// - aResampler: the Speex resampler to use. It is (re)created whenever the
//   channel count of the chunk being processed differs from
//   *aResamplerChannelCount.
// - aResamplerChannelCount: in/out; the channel count aResampler was created
//   for. Updated when the resampler is re-created.
// - aInRate / aOutRate: source and destination sample rates.
//
// Null chunks are not resampled; only their duration is rescaled. For other
// chunks, the buffer and channel pointers are replaced by freshly allocated,
// resampled data. mDuration is recomputed as the sum of the new chunk
// durations.
template <typename T>
void AudioSegment::Resample(nsAutoRef<SpeexResamplerState>& aResampler,
                            uint32_t* aResamplerChannelCount, uint32_t aInRate,
                            uint32_t aOutRate) {
  mDuration = 0;

  for (ChunkIterator ci(*this); !ci.IsEnded(); ci.Next()) {
    AutoTArray<nsTArray<T>, GUESS_AUDIO_CHANNELS> output;
    AutoTArray<const T*, GUESS_AUDIO_CHANNELS> bufferPtrs;
    AudioChunk& c = *ci;
    // If this chunk is null, don't bother resampling, just alter its duration
    if (c.IsNull()) {
      c.mDuration = (c.mDuration * aOutRate) / aInRate;
      mDuration += c.mDuration;
      continue;
    }
    uint32_t channels = c.mChannelData.Length();
    // This might introduce a discontinuity, but a channel count change in the
    // middle of a stream is not that common. This also initializes the
    // resampler as late as possible.
    if (channels != *aResamplerChannelCount) {
      SpeexResamplerState* state =
          speex_resampler_init(channels, aInRate, aOutRate,
                               SPEEX_RESAMPLER_QUALITY_DEFAULT, nullptr);
      MOZ_ASSERT(state);
      aResampler.own(state);
      *aResamplerChannelCount = channels;
    }
    output.SetLength(channels);
    bufferPtrs.SetLength(channels);
    // Round up to allocate; the last frame may not be used.
    NS_ASSERTION((UINT64_MAX - aInRate + 1) / c.mDuration >= aOutRate,
                 "Dropping samples");
    uint32_t outSize =
        (static_cast<uint64_t>(c.mDuration) * aOutRate + aInRate - 1) / aInRate;
    for (uint32_t i = 0; i < channels; i++) {
      T* out = output[i].AppendElements(outSize);
      // Both frame counts are in/out parameters of the resampler: on return
      // they hold the number of frames actually consumed / produced. Reset
      // them for every channel so that a short read on one channel cannot
      // shrink the input handed to the following channels.
      uint32_t inFrames = c.mDuration;
      uint32_t outFrames = outSize;

      const T* in = static_cast<const T*>(c.mChannelData[i]);
      dom::WebAudioUtils::SpeexResamplerProcess(aResampler.get(), i, in,
                                                &inFrames, out, &outFrames);
      MOZ_ASSERT(inFrames == c.mDuration);

      bufferPtrs[i] = out;
      // Trim the allocation down to the frames that were actually produced.
      output[i].SetLength(outFrames);
    }
    // The new chunk duration is read from channel 0, so there must be one.
    MOZ_ASSERT(channels > 0);
    c.mDuration = output[0].Length();
    c.mBuffer = new mozilla::SharedChannelArrayBuffer<T>(std::move(output));
    // Point the chunk's channels at the resampled data now owned by mBuffer.
    for (uint32_t i = 0; i < channels; i++) {
      c.mChannelData[i] = bufferPtrs[i];
    }
    mDuration += c.mDuration;
  }
}

// Resamples this segment from aInRate to aOutRate by dispatching to the
// Resample<T>() instantiation that matches the segment's sample format.
// Does nothing when the segment has no chunks. aResampler and
// aResamplerChannelCount are forwarded unchanged to Resample<T>().
void AudioSegment::ResampleChunks(nsAutoRef<SpeexResamplerState>& aResampler,
                                  uint32_t* aResamplerChannelCount,
                                  uint32_t aInRate, uint32_t aOutRate) {
  if (mChunks.IsEmpty()) {
    return;
  }

  // Scan all chunks; the format of the last non-silent chunk is the one used.
  AudioSampleFormat sampleFormat = AUDIO_FORMAT_SILENCE;
  ChunkIterator iter(*this);
  while (!iter.IsEnded()) {
    const AudioSampleFormat chunkFormat = iter->mBufferFormat;
    if (chunkFormat != AUDIO_FORMAT_SILENCE) {
      sampleFormat = chunkFormat;
    }
    iter.Next();
  }

  if (sampleFormat == AUDIO_FORMAT_S16) {
    Resample<int16_t>(aResampler, aResamplerChannelCount, aInRate, aOutRate);
    return;
  }

  // When the format is still silence here, every chunk is silent. Which
  // instantiation runs then does not matter: only the chunk durations are
  // adjusted.
  if (sampleFormat == AUDIO_FORMAT_SILENCE ||
      sampleFormat == AUDIO_FORMAT_FLOAT32) {
    Resample<float>(aResampler, aResamplerChannelCount, aInRate, aOutRate);
    return;
  }

  MOZ_ASSERT(false);
}

// Writes the whole segment into aBuffer as interleaved samples with aChannels
// channels, and returns the number of samples written (frames * aChannels).
// Returns 0 without touching aBuffer when the segment is empty or when the
// total sample count does not fit in a size_t.
size_t AudioSegment::WriteToInterleavedBuffer(nsTArray<AudioDataValue>& aBuffer,
                                              uint32_t aChannels) const {
  if (GetDuration() <= 0) {
    MOZ_ASSERT(GetDuration() == 0);
    return 0;
  }

  // Total number of samples in the segment, with overflow detection.
  CheckedInt<size_t> samples(static_cast<size_t>(GetDuration()));
  samples *= static_cast<size_t>(aChannels);
  MOZ_ASSERT(samples.isValid());
  if (!samples.isValid()) {
    return 0;
  }

  // Grow the storage if it is too small for the whole segment.
  const size_t required = samples.value();
  if (aBuffer.Capacity() < required) {
    aBuffer.SetCapacity(required);
  }
  aBuffer.SetLengthAndRetainStorage(required);
  aBuffer.ClearAndRetainStorage();

  // Turn the planar chunks into one interleaved buffer. Chunks whose channel
  // count differs from aChannels may be up-mixed or down-mixed on the way.
  size_t offset = 0;
  ConstChunkIterator iter(*this);
  while (!iter.IsEnded()) {
    const AudioChunk& chunk = *iter;
    const size_t chunkSamples =
        static_cast<size_t>(chunk.mDuration) * aChannels;
    AudioDataValue* const dest = aBuffer.Elements() + offset;
    const auto format = chunk.mBufferFormat;

    if (format == AUDIO_FORMAT_S16) {
      WriteChunk<int16_t>(chunk, aChannels, chunk.mVolume, dest);
    } else if (format == AUDIO_FORMAT_FLOAT32) {
      WriteChunk<float>(chunk, aChannels, chunk.mVolume, dest);
    } else {
      // Silence, or a format we do not know about: emit zeros either way.
      if (format != AUDIO_FORMAT_SILENCE) {
        MOZ_ASSERT_UNREACHABLE("Unknown format");
      }
      PodZero(dest, chunkSamples);
    }

    offset += chunkSamples;
    iter.Next();
  }

  MOZ_DIAGNOSTIC_ASSERT(samples.value() == offset,
                        "Segment's duration is incorrect");
  aBuffer.SetLengthAndRetainStorage(offset);
  return offset;
}

// Returns the write position for a given channel and sample offset inside a
// planar audio buffer. The buffer at aData holds aLengthSamples samples and is
// treated as aChannelCount consecutive regions of
// aLengthSamples / aChannelCount samples each; the result points
// aOffsetSamples samples into the region of channel aChannel. Asserts that the
// resulting position lies inside the buffer.
static AudioDataValue* PointerForOffsetInChannel(AudioDataValue* aData,
                                                 size_t aLengthSamples,
                                                 uint32_t aChannelCount,
                                                 uint32_t aChannel,
                                                 uint32_t aOffsetSamples) {
  const size_t samplesPerChannel = aLengthSamples / aChannelCount;
  MOZ_ASSERT(aChannel * samplesPerChannel + aOffsetSamples < aLengthSamples,
             "Offset request out of bounds.");
  AudioDataValue* const channelStart = aData + samplesPerChannel * aChannel;
  return channelStart + aOffsetSamples;
}

template <typename SrcT>
static void DownMixChunk(const AudioChunk& aChunk,
                         Span<AudioDataValue* const> aOutputChannels) {
  Span<const SrcT* const> channelData = aChunk.ChannelData<SrcT>();
  uint32_t frameCount = aChunk.mDuration;
  if (channelData.Length() > aOutputChannels.Length()) {
    // Down mix.
    AudioChannelsDownMix(channelData, aOutputChannels, frameCount);
    for (AudioDataValue* outChannel : aOutputChannels) {
      ScaleAudioSamples(outChannel, frameCount, aChunk.mVolume);
    }
  } else {
    // The channel count is already what we want.
    for (uint32_t channel = 0; channel < aOutputChannels.Length(); channel++) {
      ConvertAudioSamplesWithScale(channelData[channel],
                                   aOutputChannels[channel], frameCount,
                                   aChunk.mVolume);
    }
  }
}

// Writes this chunk into the planar output channels given by
// aOutputChannelPtrs. Silent chunks fill mDuration zero samples into every
// output channel; float and int16 chunks are handed to DownMixChunk<>() with
// the matching sample type.
void AudioChunk::DownMixTo(
    Span<AudioDataValue* const> aOutputChannelPtrs) const {
  // There is deliberately no `default:` label, so that `-Wswitch` reports any
  // enumerator that is not handled here at compile time.
  switch (mBufferFormat) {
    case AUDIO_FORMAT_SILENCE: {
      const AudioDataValue silence = static_cast<AudioDataValue>(0);
      for (AudioDataValue* dest : aOutputChannelPtrs) {
        std::fill_n(dest, mDuration, silence);
      }
      return;
    }
    case AUDIO_FORMAT_S16:
      DownMixChunk<int16_t>(*this, aOutputChannelPtrs);
      return;
    case AUDIO_FORMAT_FLOAT32:
      DownMixChunk<float>(*this, aOutputChannelPtrs);
      return;
  }
  MOZ_ASSERT_UNREACHABLE("buffer format");
}

// Renders the whole segment into a temporary planar buffer with
// aOutputChannels channels and hands it to aMixer.Mix() together with
// aSampleRate. Silent chunks are written as zeros; chunks with fewer channels
// than aOutputChannels are up-mixed first, and DownMixTo() then writes each
// chunk into the output channels. Nothing is mixed when the segment has no
// duration.
void AudioSegment::Mix(AudioMixer& aMixer, uint32_t aOutputChannels,
                       uint32_t aSampleRate) {
  uint32_t duration = GetDuration();
  if (duration <= 0) {
    MOZ_ASSERT(duration == 0);
    return;
  }

  // Planar scratch buffer: one region of `duration` frames per channel.
  uint32_t outBufferLength = duration * aOutputChannels;
  AutoTArray<AudioDataValue,
             SilentChannel::AUDIO_PROCESSING_FRAMES * GUESS_AUDIO_CHANNELS>
      mixBuffer;
  mixBuffer.SetLength(outBufferLength);

  // Per-channel write positions, recomputed for every chunk.
  AutoTArray<AudioDataValue*, GUESS_AUDIO_CHANNELS> channelOut;
  channelOut.SetLength(aOutputChannels);

  AudioChunk upMixed;
  uint32_t offsetSamples = 0;
  ChunkIterator iter(*this);
  while (!iter.IsEnded()) {
    const AudioChunk& chunk = *iter;
    const uint32_t chunkFrames = chunk.mDuration;

    for (uint32_t ch = 0; ch < aOutputChannels; ++ch) {
      channelOut[ch] =
          PointerForOffsetInChannel(mixBuffer.Elements(), outBufferLength,
                                    aOutputChannels, ch, offsetSamples);
    }

    if (chunk.mBufferFormat == AUDIO_FORMAT_SILENCE) {
      // Silent chunk: just write the right amount of zeros to every channel.
      for (AudioDataValue* dest : channelOut) {
        PodZero(dest, chunkFrames);
      }
    } else if (chunk.ChannelCount() < aOutputChannels) {
      // Too few channels: up-mix a copy of the chunk, padding with the silent
      // channel, before writing it out.
      upMixed = chunk;
      AudioChannelsUpMix<void>(&upMixed.mChannelData, aOutputChannels,
                               SilentChannel::gZeroChannel);
      upMixed.DownMixTo(channelOut);
    } else {
      // Enough channels already; DownMixTo() down-mixes if there are more.
      chunk.DownMixTo(channelOut);
    }

    offsetSamples += chunkFrames;
    iter.Next();
  }

  if (offsetSamples) {
    MOZ_ASSERT(offsetSamples == outBufferLength / aOutputChannels,
               "We forgot to write some samples?");
    aMixer.Mix(mixBuffer.Elements(), aOutputChannels, offsetSamples,
               aSampleRate);
  }
}

}  // namespace mozilla