Bug 926838 - [Part 4] Implement AlignedTArray, since 32-byte alignment is required by OpenMAX DL. Also modify callers. r=ehsan

2013-11-13 11:07:31 +08:00 · 2013-11-13 11:07:31 +08:00 · de10d6d328
--- a/dom/media/webaudio/AlignedTArray.h
+++ b/dom/media/webaudio/AlignedTArray.h
@ -0,0 +1,85 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim:set ts=2 sw=2 sts=2 et cindent: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef AlignedTArray_h__
+#define AlignedTArray_h__
+
+#include "mozilla/Alignment.h"
+#include "nsTArray.h"
+
+/**
+ * E: element type, must be a POD type.
+ * N: byte alignment of the first element (power of 2; wrappers default to 32).
+ */
+template <typename E, int N, typename Alloc>
+class AlignedTArray_Impl : public nsTArray_Impl<E, Alloc>
+{
+  static_assert((N & (N-1)) == 0, "N must be power of 2");
+  typedef nsTArray_Impl<E, Alloc>                    base_type;
+public:
+  typedef E                                          elem_type;
+  typedef typename base_type::size_type              size_type;
+  typedef typename base_type::index_type             index_type;
+  // The base array carries sExtra spare elements so Elements() can be aligned.
+  AlignedTArray_Impl() {}
+  explicit AlignedTArray_Impl(size_type capacity) : base_type(capacity+sExtra) {}
+  elem_type* Elements() { return getAligned(base_type::Elements()); }
+  const elem_type* Elements() const { return getAligned(base_type::Elements()); }
+  elem_type& operator[](index_type i) { return Elements()[i];}
+  const elem_type& operator[](index_type i) const { return Elements()[i]; }
+  // SetLength()/Length() add and subtract sExtra to hide the spare elements.
+  typename Alloc::ResultType SetLength(size_type newLen) {
+    return base_type::SetLength(newLen + sExtra);
+  }
+  size_type Length() const {
+    return base_type::Length() <= sExtra ? 0 : base_type::Length() - sExtra;
+  }
+
+private:
+  AlignedTArray_Impl(const AlignedTArray_Impl& other) = delete;
+  void operator=(const AlignedTArray_Impl& other) = delete;
+  // sPadding: slack bytes (N - alignof(E)); sExtra: that slack in whole elements.
+  static const size_type sPadding = N <= MOZ_ALIGNOF(E) ? 0 : N - MOZ_ALIGNOF(E);
+  static const size_type sExtra = (sPadding + sizeof(E) - 1) / sizeof(E);
+  // Rounds p up to the next multiple of N (p itself when already aligned).
+  template <typename U>
+  static U* getAligned(U* p)
+  {
+    return reinterpret_cast<U*>(((uintptr_t)p + N - 1) & ~(N-1));
+  }
+};
+
+template <typename E, int N=32> // N: alignment in bytes, 32 unless overridden
+class AlignedTArray : public AlignedTArray_Impl<E, N, nsTArrayInfallibleAllocator>
+{ // AlignedTArray_Impl bound to nsTArrayInfallibleAllocator
+public:
+  typedef AlignedTArray_Impl<E, N, nsTArrayInfallibleAllocator> base_type;
+  typedef AlignedTArray<E, N>                                   self_type;
+  typedef typename base_type::size_type                         size_type;
+
+  AlignedTArray() {}
+  explicit AlignedTArray(size_type capacity) : base_type(capacity) {}
+private: // copy construction and copy assignment are deleted
+  AlignedTArray(const AlignedTArray& other) = delete;
+  void operator=(const AlignedTArray& other) = delete;
+};
+
+template <typename E, int N=32> // N: alignment in bytes, 32 unless overridden
+class AlignedFallibleTArray : public AlignedTArray_Impl<E, N, nsTArrayFallibleAllocator>
+{ // AlignedTArray_Impl bound to nsTArrayFallibleAllocator
+public:
+  typedef AlignedTArray_Impl<E, N, nsTArrayFallibleAllocator> base_type;
+  typedef AlignedFallibleTArray<E, N>                         self_type;
+  typedef typename base_type::size_type                       size_type;
+
+  AlignedFallibleTArray() {}
+  explicit AlignedFallibleTArray(size_type capacity) : base_type(capacity) {}
+private: // copy construction and copy assignment are deleted
+  AlignedFallibleTArray(const AlignedFallibleTArray& other) = delete;
+  void operator=(const AlignedFallibleTArray& other) = delete;
+};
+
+#endif // AlignedTArray_h__
--- a/dom/media/webaudio/AnalyserNode.cpp
+++ b/dom/media/webaudio/AnalyserNode.cpp
@ -251,17 +251,16 @@ bool
 AnalyserNode::FFTAnalysis()
 {
  float* inputBuffer;
-  bool allocated = false;
+  AlignedFallibleTArray<float> tmpBuffer;
  if (mWriteIndex == 0) {
    inputBuffer = mBuffer.Elements();
  } else {
-    inputBuffer = static_cast<float*>(malloc(FftSize() * sizeof(float)));
-    if (!inputBuffer) {
+    if (!tmpBuffer.SetLength(FftSize())) { // truthy result means success
      return false;
    }
+    inputBuffer = tmpBuffer.Elements();
    memcpy(inputBuffer, mBuffer.Elements() + mWriteIndex, sizeof(float) * (FftSize() - mWriteIndex));
    memcpy(inputBuffer + FftSize() - mWriteIndex, mBuffer.Elements(), sizeof(float) * mWriteIndex);
-    allocated = true;
  }

  ApplyBlackmanWindow(inputBuffer, FftSize());
@ -279,9 +278,6 @@ AnalyserNode::FFTAnalysis()
                       (1.0 - mSmoothingTimeConstant) * scalarMagnitude;
  }

-  if (allocated) {
-    free(inputBuffer);
-  }
  return true;
 }

@ -305,16 +301,16 @@ AnalyserNode::AllocateBuffer()
 {
  bool result = true;
  if (mBuffer.Length() != FftSize()) {
-    result = mBuffer.SetLength(FftSize());
-    if (result) {
-      memset(mBuffer.Elements(), 0, sizeof(float) * FftSize());
-      mWriteIndex = 0;
-
-      result = mOutputBuffer.SetLength(FrequencyBinCount());
-      if (result) {
-        memset(mOutputBuffer.Elements(), 0, sizeof(float) * FrequencyBinCount());
-      }
+    if (!mBuffer.SetLength(FftSize())) { // truthy result means success
+      return false;
    }
+    memset(mBuffer.Elements(), 0, sizeof(float) * FftSize());
+    mWriteIndex = 0;
+
+    if (!mOutputBuffer.SetLength(FrequencyBinCount())) { // bail out on failure
+      return false;
+    }
+    memset(mOutputBuffer.Elements(), 0, sizeof(float) * FrequencyBinCount());
  }
  return result;
 }
--- a/dom/media/webaudio/AnalyserNode.h
+++ b/dom/media/webaudio/AnalyserNode.h
@ -9,6 +9,7 @@

 #include "AudioNode.h"
 #include "FFTBlock.h"
+#include "AlignedTArray.h"

 namespace mozilla {
 namespace dom {
@ -77,8 +78,8 @@ private:
  double mMaxDecibels;
  double mSmoothingTimeConstant;
  uint32_t mWriteIndex;
-  FallibleTArray<float> mBuffer;
-  FallibleTArray<float> mOutputBuffer;
+  AlignedFallibleTArray<float> mBuffer;
+  AlignedFallibleTArray<float> mOutputBuffer;
 };

 }
--- a/dom/media/webaudio/FFTBlock.cpp
+++ b/dom/media/webaudio/FFTBlock.cpp
@ -44,8 +44,7 @@ FFTBlock* FFTBlock::CreateInterpolatedBlock(const FFTBlock& block0, const FFTBlo

    // In the time-domain, the 2nd half of the response must be zero, to avoid circular convolution aliasing...
    int fftSize = newBlock->FFTSize();
-    nsTArray<float> buffer;
-    buffer.SetLength(fftSize);
+    AlignedTArray<float> buffer(fftSize);
    newBlock->GetInverseWithoutScaling(buffer.Elements());
    AudioBufferInPlaceScale(buffer.Elements(), 1.0f / fftSize, fftSize / 2);
    PodZero(buffer.Elements() + fftSize / 2, fftSize / 2);
@ -60,10 +59,10 @@ void FFTBlock::InterpolateFrequencyComponents(const FFTBlock& block0, const FFTB
 {
    // FIXME : with some work, this method could be optimized

-    kiss_fft_cpx* dft = mOutputBuffer.Elements();
+    ComplexU* dft = mOutputBuffer.Elements();

-    const kiss_fft_cpx* dft1 = block0.mOutputBuffer.Elements();
-    const kiss_fft_cpx* dft2 = block1.mOutputBuffer.Elements();
+    const ComplexU* dft1 = block0.mOutputBuffer.Elements();
+    const ComplexU* dft2 = block1.mOutputBuffer.Elements();

    MOZ_ASSERT(mFFTSize == block0.FFTSize());
    MOZ_ASSERT(mFFTSize == block1.FFTSize());
@ -154,7 +153,7 @@ void FFTBlock::InterpolateFrequencyComponents(const FFTBlock& block0, const FFTB

 double FFTBlock::ExtractAverageGroupDelay()
 {
-    kiss_fft_cpx* dft = mOutputBuffer.Elements();
+    ComplexU* dft = mOutputBuffer.Elements();

    double aveSum = 0.0;
    double weightSum = 0.0;
@ -205,7 +204,7 @@ void FFTBlock::AddConstantGroupDelay(double sampleFrameDelay)
 {
    int halfSize = FFTSize() / 2;

-    kiss_fft_cpx* dft = mOutputBuffer.Elements();
+    ComplexU* dft = mOutputBuffer.Elements();

    const double kSamplePhaseDelay = (2.0 * M_PI) / double(FFTSize());

--- a/dom/media/webaudio/FFTBlock.h
+++ b/dom/media/webaudio/FFTBlock.h
@ -7,7 +7,13 @@
 #ifndef FFTBlock_h_
 #define FFTBlock_h_

-#include "nsTArray.h"
+#ifdef BUILD_ARM_NEON
+#include <cmath>
+#include "mozilla/arm.h"
+#include "dl/sp/api/omxSP.h"
+#endif
+
+#include "AlignedTArray.h"
 #include "AudioNodeEngine.h"
 #include "kiss_fft/kiss_fftr.h"

@ -18,15 +24,26 @@ namespace mozilla {
 // Currently it's implemented on top of KissFFT on all platforms.
 class FFTBlock final
 {
+  union ComplexU { // one complex value, viewable three ways over one storage
+    kiss_fft_cpx c; // view passed to kiss_fftr / kiss_fftri
+    float f[2]; // flat float view passed to the omxSP_* calls and Multiply()
+    struct { // anonymous struct giving direct .r / .i access
+      float r; // real part
+      float i; // imaginary part
+    };
+  };
+
 public:
  explicit FFTBlock(uint32_t aFFTSize)
-    : mFFT(nullptr)
-    , mIFFT(nullptr)
-    , mFFTSize(aFFTSize)
+    : mKissFFT(nullptr)
+    , mKissIFFT(nullptr)
+#ifdef BUILD_ARM_NEON
+    , mOmxFFT(nullptr)
+    , mOmxIFFT(nullptr)
+#endif
  {
    MOZ_COUNT_CTOR(FFTBlock);
-    mOutputBuffer.SetLength(aFFTSize / 2 + 1);
-    PodZero(mOutputBuffer.Elements(), aFFTSize / 2 + 1);
+    SetFFTSize(aFFTSize);
  }
  ~FFTBlock()
  {
@ -44,10 +61,17 @@ public:
  void PerformFFT(const float* aData)
  {
    EnsureFFT();
-    kiss_fftr(mFFT, aData, mOutputBuffer.Elements());
+#ifdef BUILD_ARM_NEON
+    if (mozilla::supports_neon()) {
+      omxSP_FFTFwd_RToCCS_F32_Sfs(aData, mOutputBuffer.Elements()->f, mOmxFFT);
+    } else
+#endif
+    {
+      kiss_fftr(mKissFFT, aData, &(mOutputBuffer.Elements()->c));
+    }
  }
  // Inverse-transform internal data and store the resulting FFTSize()
-  // points in aData.
+  // points in aDataOut.
  void GetInverse(float* aDataOut)
  {
    GetInverseWithoutScaling(aDataOut);
@ -59,7 +83,17 @@ public:
  void GetInverseWithoutScaling(float* aDataOut)
  {
    EnsureIFFT();
-    kiss_fftri(mIFFT, mOutputBuffer.Elements(), aDataOut);
+#ifdef BUILD_ARM_NEON
+    if (mozilla::supports_neon()) {
+      omxSP_FFTInv_CCSToR_F32_Sfs(mOutputBuffer.Elements()->f, aDataOut, mOmxIFFT);
+      // There is no function that computes the inverse FFT without scaling, so
+      // we have to scale back up here. Bug 1158741.
+      AudioBufferInPlaceScale(aDataOut, mFFTSize, mFFTSize);
+    } else
+#endif
+    {
+      kiss_fftri(mKissIFFT, &(mOutputBuffer.Elements()->c), aDataOut);
+    }
  }
  // Inverse-transform the FFTSize()/2+1 points of data in each
  // of aRealDataIn and aImagDataIn and store the resulting
@ -70,23 +104,30 @@ public:
  {
    EnsureIFFT();
    const uint32_t inputSize = mFFTSize / 2 + 1;
-    nsTArray<kiss_fft_cpx> inputBuffer;
-    inputBuffer.SetLength(inputSize);
+    AlignedTArray<ComplexU> inputBuffer(inputSize);
    for (uint32_t i = 0; i < inputSize; ++i) {
      inputBuffer[i].r = aRealDataIn[i];
      inputBuffer[i].i = aImagDataIn[i];
    }
-    kiss_fftri(mIFFT, inputBuffer.Elements(), aRealDataOut);
-    for (uint32_t i = 0; i < mFFTSize; ++i) {
-      aRealDataOut[i] /= mFFTSize;
+#ifdef BUILD_ARM_NEON
+    if (mozilla::supports_neon()) {
+      omxSP_FFTInv_CCSToR_F32_Sfs(inputBuffer.Elements()->f,
+                                  aRealDataOut, mOmxIFFT);
+    } else
+#endif
+    {
+      kiss_fftri(mKissIFFT, &(inputBuffer.Elements()->c), aRealDataOut);
+      for (uint32_t i = 0; i < mFFTSize; ++i) {
+        aRealDataOut[i] /= mFFTSize;
+      }
    }
  }

  void Multiply(const FFTBlock& aFrame)
  {
-    BufferComplexMultiply(reinterpret_cast<const float*>(mOutputBuffer.Elements()),
-                          reinterpret_cast<const float*>(aFrame.mOutputBuffer.Elements()),
-                          reinterpret_cast<float*>(mOutputBuffer.Elements()),
+    BufferComplexMultiply(mOutputBuffer.Elements()->f,
+                          aFrame.mOutputBuffer.Elements()->f,
+                          mOutputBuffer.Elements()->f,
                          mFFTSize / 2 + 1);
  }

@ -97,7 +138,7 @@ public:
  void PadAndMakeScaledDFT(const float* aData, size_t dataSize)
  {
    MOZ_ASSERT(dataSize <= FFTSize());
-    nsTArray<float> paddedData;
+    AlignedTArray<float> paddedData;
    paddedData.SetLength(FFTSize());
    AudioBufferCopyWithScale(aData, 1.0f / FFTSize(),
                             paddedData.Elements(), dataSize);
@ -132,8 +173,8 @@ public:
  size_t SizeOfExcludingThis(MallocSizeOf aMallocSizeOf) const
  {
    size_t amount = 0;
-    amount += aMallocSizeOf(mFFT);
-    amount += aMallocSizeOf(mIFFT);
+    amount += aMallocSizeOf(mKissFFT);
+    amount += aMallocSizeOf(mKissIFFT);
    amount += mOutputBuffer.SizeOfExcludingThis(aMallocSizeOf);
    return amount;
  }
@ -149,31 +190,78 @@ private:

  void EnsureFFT()
  {
-    if (!mFFT) {
-      mFFT = kiss_fftr_alloc(mFFTSize, 0, nullptr, nullptr);
+#ifdef BUILD_ARM_NEON
+    if (mozilla::supports_neon()) {
+      if (!mOmxFFT) {
+        mOmxFFT = createOmxFFT(mFFTSize);
+      }
+    } else
+#endif
+    {
+      if (!mKissFFT) {
+        mKissFFT = kiss_fftr_alloc(mFFTSize, 0, nullptr, nullptr);
+      }
    }
  }
  void EnsureIFFT()
  {
-    if (!mIFFT) {
-      mIFFT = kiss_fftr_alloc(mFFTSize, 1, nullptr, nullptr);
+#ifdef BUILD_ARM_NEON
+    if (mozilla::supports_neon()) {
+      if (!mOmxIFFT) {
+        mOmxIFFT = createOmxFFT(mFFTSize);
+      }
+    } else
+#endif
+    {
+      if (!mKissIFFT) {
+        mKissIFFT = kiss_fftr_alloc(mFFTSize, 1, nullptr, nullptr);
+      }
    }
  }
+
+#ifdef BUILD_ARM_NEON
+  static OMXFFTSpec_R_F32* createOmxFFT(uint32_t aFFTSize)
+  {
+    MOZ_ASSERT((aFFTSize & (aFFTSize-1)) == 0);
+    OMX_INT bufSize; // spec buffer size in bytes, filled in by OMX below
+    OMX_INT order = log((double)aFFTSize)/M_LN2; // log2 of the FFT size
+    MOZ_ASSERT(aFFTSize>>order == 1);
+    if (omxSP_FFTGetBufSize_R_F32(order, &bufSize) != OMX_Sts_NoErr) {
+      return nullptr;
+    }
+    OMXFFTSpec_R_F32* context = static_cast<OMXFFTSpec_R_F32*>(malloc(bufSize));
+    if (!context || omxSP_FFTInit_R_F32(context, order) != OMX_Sts_NoErr) {
+      free(context); // no-op for nullptr; otherwise releases the unusable spec
+      return nullptr;
+    }
+    return context; // malloc'ed: the owner releases it with free()
+  }
+#endif
+
  void Clear()
  {
-    free(mFFT);
-    free(mIFFT);
-    mFFT = mIFFT = nullptr;
+#ifdef BUILD_ARM_NEON
+    free(mOmxFFT);
+    free(mOmxIFFT);
+    mOmxFFT = mOmxIFFT = nullptr;
+#endif
+    free(mKissFFT);
+    free(mKissIFFT);
+    mKissFFT = mKissIFFT = nullptr;
  }
  void AddConstantGroupDelay(double sampleFrameDelay);
  void InterpolateFrequencyComponents(const FFTBlock& block0,
                                      const FFTBlock& block1, double interp);

-  kiss_fftr_cfg mFFT, mIFFT;
-  nsTArray<kiss_fft_cpx> mOutputBuffer;
+  kiss_fftr_cfg mKissFFT;
+  kiss_fftr_cfg mKissIFFT;
+#ifdef BUILD_ARM_NEON
+  OMXFFTSpec_R_F32* mOmxFFT;
+  OMXFFTSpec_R_F32* mOmxIFFT;
+#endif
+  AlignedTArray<ComplexU> mOutputBuffer;
  uint32_t mFFTSize;
 };
-
 }

 #endif
--- a/dom/media/webaudio/blink/FFTConvolver.h
+++ b/dom/media/webaudio/blink/FFTConvolver.h
@ -35,7 +35,7 @@

 namespace WebCore {

-typedef nsTArray<float> AudioFloatArray;
+typedef AlignedTArray<float> AlignedAudioFloatArray;
 using mozilla::FFTBlock;

 class FFTConvolver {
@ -66,13 +66,13 @@ private:

    // Buffer input until we get fftSize / 2 samples then do an FFT
    size_t m_readWriteIndex;
-    AudioFloatArray m_inputBuffer;
+    AlignedAudioFloatArray m_inputBuffer;

    // Stores output which we read a little at a time
-    AudioFloatArray m_outputBuffer;
+    AlignedAudioFloatArray m_outputBuffer;

    // Saves the 2nd half of the FFT buffer, so we can do an overlap-add with the 1st half of the next one
-    AudioFloatArray m_lastOverlapBuffer;
+    AlignedAudioFloatArray m_lastOverlapBuffer;
 };

 } // namespace WebCore
--- a/dom/media/webaudio/blink/HRTFKernel.cpp
+++ b/dom/media/webaudio/blink/HRTFKernel.cpp
@ -51,6 +51,14 @@ HRTFKernel::HRTFKernel(float* impulseResponse, size_t length, float sampleRate)
    : m_frameDelay(0)
    , m_sampleRate(sampleRate)
 {
+    AlignedTArray<float> buffer;
+    // copy to a 32-byte aligned buffer
+    if (((uintptr_t)impulseResponse & 31) != 0) {
+      buffer.SetLength(length);
+      mozilla::PodCopy(buffer.Elements(), impulseResponse, length);
+      impulseResponse = buffer.Elements();
+    }
+
    // Determine the leading delay (average group delay) for the response.
    m_frameDelay = extractAverageGroupDelay(impulseResponse, length);

@ -79,18 +87,18 @@ nsReturnRef<HRTFKernel> HRTFKernel::createInterpolatedKernel(HRTFKernel* kernel1
    MOZ_ASSERT(kernel1 && kernel2);
    if (!kernel1 || !kernel2)
        return nsReturnRef<HRTFKernel>();
- 
+
    MOZ_ASSERT(x >= 0.0 && x < 1.0);
    x = mozilla::clamped(x, 0.0f, 1.0f);
-    
+
    float sampleRate1 = kernel1->sampleRate();
    float sampleRate2 = kernel2->sampleRate();
    MOZ_ASSERT(sampleRate1 == sampleRate2);
    if (sampleRate1 != sampleRate2)
        return nsReturnRef<HRTFKernel>();
-    
+
    float frameDelay = (1 - x) * kernel1->frameDelay() + x * kernel2->frameDelay();
-    
+
    nsAutoPtr<FFTBlock> interpolatedFrame(
        FFTBlock::CreateInterpolatedBlock(*kernel1->fftFrame(), *kernel2->fftFrame(), x));
    return HRTFKernel::create(interpolatedFrame, frameDelay, sampleRate1);
--- a/dom/media/webaudio/blink/HRTFPanner.h
+++ b/dom/media/webaudio/blink/HRTFPanner.h
@ -35,6 +35,8 @@ struct AudioChunk;

 namespace WebCore {

+typedef nsTArray<float> AudioFloatArray;
+
 class HRTFDatabaseLoader;

 using mozilla::AudioChunk;
--- a/dom/media/webaudio/blink/PeriodicWave.cpp
+++ b/dom/media/webaudio/blink/PeriodicWave.cpp
@ -220,7 +220,7 @@ void PeriodicWave::createBandLimitedTables(const float* realData, const float* i
        imagP[halfSize-1] = 0;

        // Create the band-limited table.
-        AudioFloatArray* table = new AudioFloatArray(m_periodicWaveSize);
+        AlignedAudioFloatArray* table = new AlignedAudioFloatArray(m_periodicWaveSize);
        m_bandLimitedTables.AppendElement(table);

        // Apply an inverse FFT to generate the time-domain table data.
--- a/dom/media/webaudio/blink/PeriodicWave.h
+++ b/dom/media/webaudio/blink/PeriodicWave.h
@ -32,10 +32,12 @@
 #include "mozilla/dom/OscillatorNodeBinding.h"
 #include <nsAutoPtr.h>
 #include <nsTArray.h>
+#include "AlignedTArray.h"
 #include "mozilla/MemoryReporting.h"

 namespace WebCore {

+typedef AlignedTArray<float> AlignedAudioFloatArray;
 typedef nsTArray<float> AudioFloatArray;

 class PeriodicWave {
@ -98,7 +100,7 @@ private:

    // Creates tables based on numberOfComponents Fourier coefficients.
    void createBandLimitedTables(const float* real, const float* imag, unsigned numberOfComponents);
-    nsTArray<nsAutoPtr<AudioFloatArray> > m_bandLimitedTables;
+    nsTArray<nsAutoPtr<AlignedAudioFloatArray> > m_bandLimitedTables;
 };

 } // namespace WebCore
--- a/dom/media/webaudio/moz.build
+++ b/dom/media/webaudio/moz.build
@ -21,6 +21,7 @@ MOCHITEST_CHROME_MANIFESTS += ['test/chrome.ini']
 BROWSER_CHROME_MANIFESTS += ['test/browser.ini']

 EXPORTS += [
+    'AlignedTArray.h',
    'AudioContext.h',
    'AudioEventTimeline.h',
    'AudioNodeEngine.h',