Bug 1801557 - Use xsimd to implement dom/media/webaudio/AudioNodeEngine r=padenot

This patch contains a generic implementation of the algorithms in AudioNodeEngine.cpp, and this generic implementation is instantiated for SSE2 and NEON. Note that with this approach, supporting AVX would only require a few lines. Differential Revision: https://phabricator.services.mozilla.com/D162494
2023-01-13 13:31:43 +00:00 · 2023-01-13 13:31:43 +00:00 · 938cca0cc4
--- a/dom/media/webaudio/AudioNodeEngine.cpp
+++ b/dom/media/webaudio/AudioNodeEngine.cpp
@ -9,12 +9,11 @@
 #include "mozilla/AbstractThread.h"
 #ifdef USE_NEON
 #  include "mozilla/arm.h"
-#  include "AudioNodeEngineNEON.h"
+#  include "AudioNodeEngineGeneric.h"
 #endif
 #ifdef USE_SSE2
 #  include "mozilla/SSE.h"
-#  include "AlignmentUtils.h"
-#  include "AudioNodeEngineSSE2.h"
+#  include "AudioNodeEngineGeneric.h"
 #endif
 #include "AudioBlock.h"
 #include "Tracing.h"
@ -67,39 +66,17 @@ void AudioBufferAddWithScale(const float* aInput, float aScale, float* aOutput,
                             uint32_t aSize) {
 #ifdef USE_NEON
  if (mozilla::supports_neon()) {
-    AudioBufferAddWithScale_NEON(aInput, aScale, aOutput, aSize);
+    Engine<xsimd::neon>::AudioBufferAddWithScale(aInput, aScale, aOutput,
+                                                 aSize);
    return;
  }
 #endif

 #ifdef USE_SSE2
  if (mozilla::supports_sse2()) {
-    if (aScale == 1.0f) {
-      while (aSize && (!IS_ALIGNED16(aInput) || !IS_ALIGNED16(aOutput))) {
-        *aOutput += *aInput;
-        ++aOutput;
-        ++aInput;
-        --aSize;
-      }
-    } else {
-      while (aSize && (!IS_ALIGNED16(aInput) || !IS_ALIGNED16(aOutput))) {
-        *aOutput += *aInput * aScale;
-        ++aOutput;
-        ++aInput;
-        --aSize;
-      }
-    }
-
-    // we need to round aSize down to the nearest multiple of 16
-    uint32_t alignedSize = aSize & ~0x0F;
-    if (alignedSize > 0) {
-      AudioBufferAddWithScale_SSE(aInput, aScale, aOutput, alignedSize);
-
-      // adjust parameters for use with scalar operations below
-      aInput += alignedSize;
-      aOutput += alignedSize;
-      aSize -= alignedSize;
-    }
+    Engine<xsimd::sse2>::AudioBufferAddWithScale(aInput, aScale, aOutput,
+                                                 aSize);
+    return;
  }
 #endif

@ -127,14 +104,16 @@ void AudioBlockCopyChannelWithScale(const float* aInput, float aScale,
  } else {
 #ifdef USE_NEON
    if (mozilla::supports_neon()) {
-      AudioBlockCopyChannelWithScale_NEON(aInput, aScale, aOutput);
+      Engine<xsimd::neon>::AudioBlockCopyChannelWithScale(aInput, aScale,
+                                                          aOutput);
      return;
    }
 #endif

 #ifdef USE_SSE2
    if (mozilla::supports_sse2()) {
-      AudioBlockCopyChannelWithScale_SSE(aInput, aScale, aOutput);
+      Engine<xsimd::sse2>::AudioBlockCopyChannelWithScale(aInput, aScale,
+                                                          aOutput);
      return;
    }
 #endif
@ -147,9 +126,15 @@ void AudioBlockCopyChannelWithScale(const float* aInput, float aScale,

 void BufferComplexMultiply(const float* aInput, const float* aScale,
                           float* aOutput, uint32_t aSize) {
+#ifdef USE_NEON
+  if (mozilla::supports_neon()) {
+    Engine<xsimd::neon>::BufferComplexMultiply(aInput, aScale, aOutput, aSize);
+    return;
+  }
+#endif
 #ifdef USE_SSE2
  if (mozilla::supports_sse()) {
-    BufferComplexMultiply_SSE(aInput, aScale, aOutput, aSize);
+    Engine<xsimd::sse2>::BufferComplexMultiply(aInput, aScale, aOutput, aSize);
    return;
  }
 #endif
@ -182,14 +167,16 @@ void AudioBlockCopyChannelWithScale(const float aInput[WEBAUDIO_BLOCK_SIZE],
                                    float aOutput[WEBAUDIO_BLOCK_SIZE]) {
 #ifdef USE_NEON
  if (mozilla::supports_neon()) {
-    AudioBlockCopyChannelWithScale_NEON(aInput, aScale, aOutput);
+    Engine<xsimd::neon>::AudioBlockCopyChannelWithScale(aInput, aScale,
+                                                        aOutput);
    return;
  }
 #endif

 #ifdef USE_SSE2
  if (mozilla::supports_sse2()) {
-    AudioBlockCopyChannelWithScale_SSE(aInput, aScale, aOutput);
+    Engine<xsimd::sse2>::AudioBlockCopyChannelWithScale(aInput, aScale,
+                                                        aOutput);
    return;
  }
 #endif
@ -214,14 +201,14 @@ void AudioBufferInPlaceScale(float* aBlock, float aScale, uint32_t aSize) {
  }
 #ifdef USE_NEON
  if (mozilla::supports_neon()) {
-    AudioBufferInPlaceScale_NEON(aBlock, aScale, aSize);
+    Engine<xsimd::neon>::AudioBufferInPlaceScale(aBlock, aScale, aSize);
    return;
  }
 #endif

 #ifdef USE_SSE2
  if (mozilla::supports_sse2()) {
-    AudioBufferInPlaceScale_SSE(aBlock, aScale, aSize);
+    Engine<xsimd::sse2>::AudioBufferInPlaceScale(aBlock, aScale, aSize);
    return;
  }
 #endif
@ -234,14 +221,14 @@ void AudioBufferInPlaceScale(float* aBlock, float aScale, uint32_t aSize) {
 void AudioBufferInPlaceScale(float* aBlock, float* aScale, uint32_t aSize) {
 #ifdef USE_NEON
  if (mozilla::supports_neon()) {
-    AudioBufferInPlaceScale_NEON(aBlock, aScale, aSize);
+    Engine<xsimd::neon>::AudioBufferInPlaceScale(aBlock, aScale, aSize);
    return;
  }
 #endif

 #ifdef USE_SSE2
  if (mozilla::supports_sse2()) {
-    AudioBufferInPlaceScale_SSE(aBlock, aScale, aSize);
+    Engine<xsimd::sse2>::AudioBufferInPlaceScale(aBlock, aScale, aSize);
    return;
  }
 #endif
@ -275,16 +262,16 @@ void AudioBlockPanStereoToStereo(const float aInputL[WEBAUDIO_BLOCK_SIZE],
                                 float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
 #ifdef USE_NEON
  if (mozilla::supports_neon()) {
-    AudioBlockPanStereoToStereo_NEON(aInputL, aInputR, aGainL, aGainR,
-                                     aIsOnTheLeft, aOutputL, aOutputR);
+    Engine<xsimd::neon>::AudioBlockPanStereoToStereo(
+        aInputL, aInputR, aGainL, aGainR, aIsOnTheLeft, aOutputL, aOutputR);
    return;
  }
 #endif

 #ifdef USE_SSE2
  if (mozilla::supports_sse2()) {
-    AudioBlockPanStereoToStereo_SSE(aInputL, aInputR, aGainL, aGainR,
-                                    aIsOnTheLeft, aOutputL, aOutputR);
+    Engine<xsimd::sse2>::AudioBlockPanStereoToStereo(
+        aInputL, aInputR, aGainL, aGainR, aIsOnTheLeft, aOutputL, aOutputR);
    return;
  }
 #endif
@ -313,8 +300,16 @@ void AudioBlockPanStereoToStereo(const float aInputL[WEBAUDIO_BLOCK_SIZE],
                                 float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
 #ifdef USE_NEON
  if (mozilla::supports_neon()) {
-    AudioBlockPanStereoToStereo_NEON(aInputL, aInputR, aGainL, aGainR,
-                                     aIsOnTheLeft, aOutputL, aOutputR);
+    Engine<xsimd::neon>::AudioBlockPanStereoToStereo(
+        aInputL, aInputR, aGainL, aGainR, aIsOnTheLeft, aOutputL, aOutputR);
+    return;
+  }
+#endif
+
+#ifdef USE_SSE2
+  if (mozilla::supports_sse2()) {
+    Engine<xsimd::sse2>::AudioBlockPanStereoToStereo(
+        aInputL, aInputR, aGainL, aGainR, aIsOnTheLeft, aOutputL, aOutputR);
    return;
  }
 #endif
@ -332,32 +327,19 @@ void AudioBlockPanStereoToStereo(const float aInputL[WEBAUDIO_BLOCK_SIZE],
 }

 float AudioBufferSumOfSquares(const float* aInput, uint32_t aLength) {
-  float sum = 0.0f;
-
-#ifdef USE_SSE2
-  if (mozilla::supports_sse()) {
-    const float* alignedInput = ALIGNED16(aInput);
-
-    // use scalar operations for any unaligned data at the beginning
-    while (aInput != alignedInput) {
-      if (!aLength) {
-        return sum;
-      }
-      sum += *aInput * *aInput;
-      ++aInput;
-      --aLength;
-    }
-
-    uint32_t vLength = (aLength >> 4) << 4;
-    sum += AudioBufferSumOfSquares_SSE(alignedInput, vLength);
-
-    // adjust aInput and aLength to use scalar operations for any
-    // remaining values
-    aInput = alignedInput + vLength;
-    aLength -= vLength;
+#ifdef USE_NEON
+  if (mozilla::supports_neon()) {
+    return Engine<xsimd::neon>::AudioBufferSumOfSquares(aInput, aLength);
  }
 #endif

+#ifdef USE_SSE2
+  if (mozilla::supports_sse()) {
+    return Engine<xsimd::sse2>::AudioBufferSumOfSquares(aInput, aLength);
+  }
+#endif
+
+  float sum = 0.f;
  while (aLength--) {
    sum += *aInput * *aInput;
    ++aInput;
@ -368,7 +350,7 @@ float AudioBufferSumOfSquares(const float* aInput, uint32_t aLength) {
 void NaNToZeroInPlace(float* aSamples, size_t aCount) {
 #ifdef USE_SSE2
  if (mozilla::supports_sse2()) {
-    NaNToZeroInPlace_SSE(aSamples, aCount);
+    Engine<xsimd::sse2>::NaNToZeroInPlace(aSamples, aCount);
    return;
  }
 #endif
--- a/dom/media/webaudio/AudioNodeEngineGeneric.h
+++ b/dom/media/webaudio/AudioNodeEngineGeneric.h
@ -0,0 +1,335 @@
+/* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* this source code form is subject to the terms of the mozilla public
+ * license, v. 2.0. if a copy of the mpl was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef MOZILLA_AUDIONODEENGINEGENERIC_H_
+#define MOZILLA_AUDIONODEENGINEGENERIC_H_
+
+#include "AudioNodeEngine.h"
+#include "AlignmentUtils.h"
+
+#include "xsimd/xsimd.hpp"
+
+#if defined(__GNUC__) && __GNUC__ > 7
+#  define MOZ_PRAGMA(tokens) _Pragma(#tokens)
+#  define MOZ_UNROLL(factor) MOZ_PRAGMA(GCC unroll factor)
+#elif defined(__INTEL_COMPILER) || (defined(__clang__) && __clang_major__ > 3)
+#  define MOZ_PRAGMA(tokens) _Pragma(#tokens)
+#  define MOZ_UNROLL(factor) MOZ_PRAGMA(unroll factor)
+#else
+#  define MOZ_UNROLL(_)
+#endif
+
+namespace mozilla {
+
+template <class Arch>
+static bool is_aligned(const void* ptr) {
+  return (reinterpret_cast<uintptr_t>(ptr) &
+          ~(static_cast<uintptr_t>(Arch::alignment()) - 1)) ==
+         reinterpret_cast<uintptr_t>(ptr);
+};
+
+template <class Arch>
+struct Engine {
+  static void AudioBufferAddWithScale(const float* aInput, float aScale,
+                                      float* aOutput, uint32_t aSize) {
+    if constexpr (Arch::requires_alignment()) {
+      if (aScale == 1.0f) {
+        while (!is_aligned<Arch>(aInput) || !is_aligned<Arch>(aOutput)) {
+          if (!aSize) return;
+          *aOutput += *aInput;
+          ++aOutput;
+          ++aInput;
+          --aSize;
+        }
+      } else {
+        while (!is_aligned<Arch>(aInput) || !is_aligned<Arch>(aOutput)) {
+          if (!aSize) return;
+          *aOutput += *aInput * aScale;
+          ++aOutput;
+          ++aInput;
+          --aSize;
+        }
+      }
+    }
+    MOZ_ASSERT(is_aligned<Arch>(aInput), "aInput is aligned");
+    MOZ_ASSERT(is_aligned<Arch>(aOutput), "aOutput is aligned");
+
+    xsimd::batch<float, Arch> vgain(aScale);
+
+    uint32_t aVSize = aSize & ~(xsimd::batch<float, Arch>::size - 1);
+    MOZ_UNROLL(4)
+    for (unsigned i = 0; i < aVSize; i += xsimd::batch<float, Arch>::size) {
+      auto vin1 = xsimd::batch<float, Arch>::load_aligned(&aInput[i]);
+      auto vin2 = xsimd::batch<float, Arch>::load_aligned(&aOutput[i]);
+      auto vout = xsimd::fma(vin1, vgain, vin2);
+      vout.store_aligned(&aOutput[i]);
+    }
+
+    for (unsigned i = aVSize; i < aSize; ++i) {
+      aOutput[i] += aInput[i] * aScale;
+    }
+  };
+
+  static void AudioBlockCopyChannelWithScale(const float* aInput, float aScale,
+                                             float* aOutput) {
+    MOZ_ASSERT(is_aligned<Arch>(aInput), "aInput is aligned");
+    MOZ_ASSERT(is_aligned<Arch>(aOutput), "aOutput is aligned");
+
+    MOZ_ASSERT((WEBAUDIO_BLOCK_SIZE % xsimd::batch<float, Arch>::size == 0),
+               "requires tail processing");
+
+    xsimd::batch<float, Arch> vgain = (aScale);
+
+    MOZ_UNROLL(4)
+    for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE;
+         i += xsimd::batch<float, Arch>::size) {
+      auto vin = xsimd::batch<float, Arch>::load_aligned(&aInput[i]);
+      auto vout = vin * vgain;
+      vout.store_aligned(&aOutput[i]);
+    }
+  };
+
+  static void AudioBlockCopyChannelWithScale(
+      const float aInput[WEBAUDIO_BLOCK_SIZE],
+      const float aScale[WEBAUDIO_BLOCK_SIZE],
+      float aOutput[WEBAUDIO_BLOCK_SIZE]) {
+    MOZ_ASSERT(is_aligned<Arch>(aInput), "aInput is aligned");
+    MOZ_ASSERT(is_aligned<Arch>(aOutput), "aOutput is aligned");
+    MOZ_ASSERT(is_aligned<Arch>(aScale), "aScale is aligned");
+
+    MOZ_ASSERT((WEBAUDIO_BLOCK_SIZE % xsimd::batch<float, Arch>::size == 0),
+               "requires tail processing");
+
+    MOZ_UNROLL(4)
+    for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE;
+         i += xsimd::batch<float, Arch>::size) {
+      auto vscaled = xsimd::batch<float, Arch>::load_aligned(&aScale[i]);
+      auto vin = xsimd::batch<float, Arch>::load_aligned(&aInput[i]);
+      auto vout = vin * vscaled;
+      vout.store_aligned(&aOutput[i]);
+    }
+  };
+
+  static void AudioBufferInPlaceScale(float* aBlock, float aScale,
+                                      uint32_t aSize) {
+    MOZ_ASSERT(is_aligned<Arch>(aBlock), "aBlock is aligned");
+
+    xsimd::batch<float, Arch> vgain(aScale);
+
+    uint32_t aVSize = aSize & ~(xsimd::batch<float, Arch>::size - 1);
+    MOZ_UNROLL(4)
+    for (unsigned i = 0; i < aVSize; i += xsimd::batch<float, Arch>::size) {
+      auto vin = xsimd::batch<float, Arch>::load_aligned(&aBlock[i]);
+      auto vout = vin * vgain;
+      vout.store_aligned(&aBlock[i]);
+    }
+    for (unsigned i = aVSize; i < aSize; ++i) aBlock[i] *= aScale;
+  };
+
+  static void AudioBufferInPlaceScale(float* aBlock, float* aScale,
+                                      uint32_t aSize) {
+    MOZ_ASSERT(is_aligned<Arch>(aBlock), "aBlock is aligned");
+    MOZ_ASSERT(is_aligned<Arch>(aScale), "aScale is aligned");
+
+    uint32_t aVSize = aSize & ~(xsimd::batch<float, Arch>::size - 1);
+    MOZ_UNROLL(4)
+    for (unsigned i = 0; i < aVSize; i += xsimd::batch<float, Arch>::size) {
+      auto vin = xsimd::batch<float, Arch>::load_aligned(&aBlock[i]);
+      auto vgain = xsimd::batch<float, Arch>::load_aligned(&aScale[i]);
+      auto vout = vin * vgain;
+      vout.store_aligned(&aBlock[i]);
+    }
+    for (uint32_t i = aVSize; i < aSize; ++i) {
+      *aBlock++ *= *aScale++;
+    }
+  };
+
+  static void AudioBlockPanStereoToStereo(
+      const float aInputL[WEBAUDIO_BLOCK_SIZE],
+      const float aInputR[WEBAUDIO_BLOCK_SIZE], float aGainL, float aGainR,
+      bool aIsOnTheLeft, float aOutputL[WEBAUDIO_BLOCK_SIZE],
+      float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
+    MOZ_ASSERT(is_aligned<Arch>(aInputL), "aInputL is aligned");
+    MOZ_ASSERT(is_aligned<Arch>(aInputR), "aInputR is aligned");
+    MOZ_ASSERT(is_aligned<Arch>(aOutputL), "aOutputL is aligned");
+    MOZ_ASSERT(is_aligned<Arch>(aOutputR), "aOutputR is aligned");
+
+    MOZ_ASSERT((WEBAUDIO_BLOCK_SIZE % xsimd::batch<float, Arch>::size == 0),
+               "requires tail processing");
+
+    xsimd::batch<float, Arch> vgainl(aGainL);
+    xsimd::batch<float, Arch> vgainr(aGainR);
+
+    if (aIsOnTheLeft) {
+      MOZ_UNROLL(2)
+      for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE;
+           i += xsimd::batch<float, Arch>::size) {
+        auto vinl = xsimd::batch<float, Arch>::load_aligned(&aInputL[i]);
+        auto vinr = xsimd::batch<float, Arch>::load_aligned(&aInputR[i]);
+
+        /* left channel : aOutputL  = aInputL + aInputR * gainL */
+        auto vout = xsimd::fma(vinr, vgainl, vinl);
+        vout.store_aligned(&aOutputL[i]);
+
+        /* right channel : aOutputR = aInputR * gainR */
+        auto vscaled = vinr * vgainr;
+        vscaled.store_aligned(&aOutputR[i]);
+      }
+    } else {
+      MOZ_UNROLL(2)
+      for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE;
+           i += xsimd::batch<float, Arch>::size) {
+        auto vinl = xsimd::batch<float, Arch>::load_aligned(&aInputL[i]);
+        auto vinr = xsimd::batch<float, Arch>::load_aligned(&aInputR[i]);
+
+        /* left channel : aInputL * gainL */
+        auto vscaled = vinl * vgainl;
+        vscaled.store_aligned(&aOutputL[i]);
+
+        /* right channel: aOutputR = aInputR + aInputL * gainR */
+        auto vout = xsimd::fma(vinl, vgainr, vinr);
+        vout.store_aligned(&aOutputR[i]);
+      }
+    }
+  };
+
+  static void BufferComplexMultiply(const float* aInput, const float* aScale,
+                                    float* aOutput, uint32_t aSize) {
+    MOZ_ASSERT(is_aligned<Arch>(aInput), "aInput is aligned");
+    MOZ_ASSERT(is_aligned<Arch>(aOutput), "aOutput is aligned");
+    MOZ_ASSERT(is_aligned<Arch>(aScale), "aScale is aligned");
+    MOZ_ASSERT((aSize % xsimd::batch<float, Arch>::size == 0),
+               "requires tail processing");
+
+    MOZ_UNROLL(2)
+    for (unsigned i = 0; i < aSize * 2;
+         i += 2 * xsimd::batch<std::complex<float>>::size) {
+      auto in1 = xsimd::batch<std::complex<float>>::load_aligned(
+          reinterpret_cast<const std::complex<float>*>(&aInput[i]));
+      auto in2 = xsimd::batch<std::complex<float>>::load_aligned(
+          reinterpret_cast<const std::complex<float>*>(&aScale[i]));
+      auto out = in1 * in2;
+      out.store_aligned(reinterpret_cast<std::complex<float>*>(&aOutput[i]));
+    }
+  };
+
+  static float AudioBufferSumOfSquares(const float* aInput, uint32_t aLength) {
+    float sum = 0.f;
+
+    if constexpr (Arch::requires_alignment()) {
+      while (!is_aligned<Arch>(aInput)) {
+        if (!aLength) {
+          return sum;
+        }
+        sum += *aInput * *aInput;
+        ++aInput;
+        --aLength;
+      }
+    }
+
+    MOZ_ASSERT(is_aligned<Arch>(aInput), "aInput is aligned");
+
+    constexpr uint32_t unroll_factor = 4;
+    xsimd::batch<float, Arch> accs[unroll_factor] = {0.f, 0.f, 0.f, 0.f};
+
+    uint32_t vLength =
+        aLength & ~(unroll_factor * xsimd::batch<float, Arch>::size - 1);
+
+    for (uint32_t i = 0; i < vLength;
+         i += unroll_factor * xsimd::batch<float, Arch>::size) {
+      MOZ_UNROLL(4)
+      for (uint32_t j = 0; j < unroll_factor; ++j) {
+        auto in = xsimd::batch<float, Arch>::load_aligned(
+            &aInput[i + xsimd::batch<float, Arch>::size * j]);
+        accs[j] = xsimd::fma(in, in, accs[j]);
+      }
+    }
+
+    sum += reduce_add((accs[0] + accs[1]) + (accs[2] + accs[3]));
+    for (uint32_t i = vLength; i < aLength; ++i) sum += aInput[i] * aInput[i];
+    return sum;
+  };
+
+  static void NaNToZeroInPlace(float* aSamples, size_t aCount) {
+    if constexpr (Arch::requires_alignment()) {
+      while (!is_aligned<Arch>(aSamples)) {
+        if (!aCount) {
+          return;
+        }
+        if (*aSamples != *aSamples) {
+          *aSamples = 0.0;
+        }
+        ++aSamples;
+        --aCount;
+      }
+    }
+
+    MOZ_ASSERT(is_aligned<Arch>(aSamples), "aSamples is aligned");
+
+    uint32_t vCount = aCount & ~(xsimd::batch<float, Arch>::size - 1);
+
+    MOZ_UNROLL(4)
+    for (uint32_t i = 0; i < vCount; i += xsimd::batch<float, Arch>::size) {
+      auto vin = xsimd::batch<float, Arch>::load_aligned(&aSamples[i]);
+      auto vout =
+          xsimd::select(xsimd::isnan(vin), xsimd::batch<float, Arch>(0.f), vin);
+      vout.store_aligned(&aSamples[i]);
+    }
+
+    for (uint32_t i = vCount; i < aCount; i++) {
+      if (aSamples[i] != aSamples[i]) {
+        aSamples[i] = 0.0;
+      }
+    }
+  };
+
+  static void AudioBlockPanStereoToStereo(
+      const float aInputL[WEBAUDIO_BLOCK_SIZE],
+      const float aInputR[WEBAUDIO_BLOCK_SIZE],
+      const float aGainL[WEBAUDIO_BLOCK_SIZE],
+      const float aGainR[WEBAUDIO_BLOCK_SIZE],
+      const bool aIsOnTheLeft[WEBAUDIO_BLOCK_SIZE],
+      float aOutputL[WEBAUDIO_BLOCK_SIZE],
+      float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
+    MOZ_ASSERT(is_aligned<Arch>(aInputL), "aInputL is aligned");
+    MOZ_ASSERT(is_aligned<Arch>(aInputR), "aInputR is aligned");
+    MOZ_ASSERT(is_aligned<Arch>(aGainL), "aGainL is aligned");
+    MOZ_ASSERT(is_aligned<Arch>(aGainR), "aGainR is aligned");
+    MOZ_ASSERT(is_aligned<Arch>(aIsOnTheLeft), "aIsOnTheLeft is aligned");
+    MOZ_ASSERT(is_aligned<Arch>(aOutputL), "aOutputL is aligned");
+    MOZ_ASSERT(is_aligned<Arch>(aOutputR), "aOutputR is aligned");
+
+    MOZ_ASSERT((WEBAUDIO_BLOCK_SIZE % xsimd::batch<float, Arch>::size == 0),
+               "requires tail processing");
+
+    MOZ_UNROLL(2)
+    for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE;
+         i += xsimd::batch<float, Arch>::size) {
+      auto mask =
+          xsimd::batch_bool<float, Arch>::load_aligned(&aIsOnTheLeft[i]);
+
+      auto inputL = xsimd::batch<float, Arch>::load_aligned(&aInputL[i]);
+      auto inputR = xsimd::batch<float, Arch>::load_aligned(&aInputR[i]);
+      auto gainL = xsimd::batch<float, Arch>::load_aligned(&aGainL[i]);
+      auto gainR = xsimd::batch<float, Arch>::load_aligned(&aGainR[i]);
+
+      auto outL_true = xsimd::fma(inputR, gainL, inputL);
+      auto outR_true = inputR * gainR;
+
+      auto outL_false = inputL * gainL;
+      auto outR_false = xsimd::fma(inputL, gainR, inputR);
+
+      auto outL = xsimd::select(mask, outL_true, outL_false);
+      auto outR = xsimd::select(mask, outR_true, outR_false);
+
+      outL.store_aligned(&aOutputL[i]);
+      outR.store_aligned(&aOutputR[i]);
+    }
+  }
+};
+
+}  // namespace mozilla
+
+#endif
--- a/dom/media/webaudio/AudioNodeEngineNEON.cpp
+++ b/dom/media/webaudio/AudioNodeEngineNEON.cpp
@ -3,350 +3,7 @@
 * license, v. 2.0. if a copy of the mpl was not distributed with this file,
 * You can obtain one at http://mozilla.org/MPL/2.0/. */

-#include "AudioNodeEngineNEON.h"
-#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__)
-#  include <arm64_neon.h>
-#else
-#  include <arm_neon.h>
-#endif
-
-//#ifdef DEBUG
-#if 0  // see bug 921099
-#  define ASSERT_ALIGNED(ptr)                                     \
-    MOZ_ASSERT((((uintptr_t)ptr + 15) & ~0x0F) == (uintptr_t)ptr, \
-               #ptr " has to be aligned 16-bytes aligned.");
-#else
-#  define ASSERT_ALIGNED(ptr)
-#endif
-
-#define ADDRESS_OF(array, index) ((float32_t*)&array[index])
-
+#include "AudioNodeEngineGeneric.h"
 namespace mozilla {
-void AudioBufferAddWithScale_NEON(const float* aInput, float aScale,
-                                  float* aOutput, uint32_t aSize) {
-  ASSERT_ALIGNED(aInput);
-  ASSERT_ALIGNED(aOutput);
-
-  float32x4_t vin0, vin1, vin2, vin3;
-  float32x4_t vout0, vout1, vout2, vout3;
-  float32x4_t vscale = vmovq_n_f32(aScale);
-
-  uint32_t dif = aSize % 16;
-  aSize -= dif;
-  unsigned i = 0;
-  for (; i < aSize; i += 16) {
-    vin0 = vld1q_f32(ADDRESS_OF(aInput, i));
-    vin1 = vld1q_f32(ADDRESS_OF(aInput, i + 4));
-    vin2 = vld1q_f32(ADDRESS_OF(aInput, i + 8));
-    vin3 = vld1q_f32(ADDRESS_OF(aInput, i + 12));
-
-    vout0 = vld1q_f32(ADDRESS_OF(aOutput, i));
-    vout1 = vld1q_f32(ADDRESS_OF(aOutput, i + 4));
-    vout2 = vld1q_f32(ADDRESS_OF(aOutput, i + 8));
-    vout3 = vld1q_f32(ADDRESS_OF(aOutput, i + 12));
-
-    vout0 = vmlaq_f32(vout0, vin0, vscale);
-    vout1 = vmlaq_f32(vout1, vin1, vscale);
-    vout2 = vmlaq_f32(vout2, vin2, vscale);
-    vout3 = vmlaq_f32(vout3, vin3, vscale);
-
-    vst1q_f32(ADDRESS_OF(aOutput, i), vout0);
-    vst1q_f32(ADDRESS_OF(aOutput, i + 4), vout1);
-    vst1q_f32(ADDRESS_OF(aOutput, i + 8), vout2);
-    vst1q_f32(ADDRESS_OF(aOutput, i + 12), vout3);
-  }
-
-  for (unsigned j = 0; j < dif; ++i, ++j) {
-    aOutput[i] += aInput[i] * aScale;
-  }
-}
-void AudioBlockCopyChannelWithScale_NEON(const float* aInput, float aScale,
-                                         float* aOutput) {
-  ASSERT_ALIGNED(aInput);
-  ASSERT_ALIGNED(aOutput);
-
-  float32x4_t vin0, vin1, vin2, vin3;
-  float32x4_t vout0, vout1, vout2, vout3;
-  float32x4_t vscale = vmovq_n_f32(aScale);
-
-  for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 16) {
-    vin0 = vld1q_f32(ADDRESS_OF(aInput, i));
-    vin1 = vld1q_f32(ADDRESS_OF(aInput, i + 4));
-    vin2 = vld1q_f32(ADDRESS_OF(aInput, i + 8));
-    vin3 = vld1q_f32(ADDRESS_OF(aInput, i + 12));
-
-    vout0 = vmulq_f32(vin0, vscale);
-    vout1 = vmulq_f32(vin1, vscale);
-    vout2 = vmulq_f32(vin2, vscale);
-    vout3 = vmulq_f32(vin3, vscale);
-
-    vst1q_f32(ADDRESS_OF(aOutput, i), vout0);
-    vst1q_f32(ADDRESS_OF(aOutput, i + 4), vout1);
-    vst1q_f32(ADDRESS_OF(aOutput, i + 8), vout2);
-    vst1q_f32(ADDRESS_OF(aOutput, i + 12), vout3);
-  }
-}
-
-void AudioBlockCopyChannelWithScale_NEON(
-    const float aInput[WEBAUDIO_BLOCK_SIZE],
-    const float aScale[WEBAUDIO_BLOCK_SIZE],
-    float aOutput[WEBAUDIO_BLOCK_SIZE]) {
-  ASSERT_ALIGNED(aInput);
-  ASSERT_ALIGNED(aScale);
-  ASSERT_ALIGNED(aOutput);
-
-  float32x4_t vin0, vin1, vin2, vin3;
-  float32x4_t vout0, vout1, vout2, vout3;
-  float32x4_t vscale0, vscale1, vscale2, vscale3;
-
-  for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 16) {
-    vin0 = vld1q_f32(ADDRESS_OF(aInput, i));
-    vin1 = vld1q_f32(ADDRESS_OF(aInput, i + 4));
-    vin2 = vld1q_f32(ADDRESS_OF(aInput, i + 8));
-    vin3 = vld1q_f32(ADDRESS_OF(aInput, i + 12));
-
-    vscale0 = vld1q_f32(ADDRESS_OF(aScale, i));
-    vscale1 = vld1q_f32(ADDRESS_OF(aScale, i + 4));
-    vscale2 = vld1q_f32(ADDRESS_OF(aScale, i + 8));
-    vscale3 = vld1q_f32(ADDRESS_OF(aScale, i + 12));
-
-    vout0 = vmulq_f32(vin0, vscale0);
-    vout1 = vmulq_f32(vin1, vscale1);
-    vout2 = vmulq_f32(vin2, vscale2);
-    vout3 = vmulq_f32(vin3, vscale3);
-
-    vst1q_f32(ADDRESS_OF(aOutput, i), vout0);
-    vst1q_f32(ADDRESS_OF(aOutput, i + 4), vout1);
-    vst1q_f32(ADDRESS_OF(aOutput, i + 8), vout2);
-    vst1q_f32(ADDRESS_OF(aOutput, i + 12), vout3);
-  }
-}
-
-void AudioBufferInPlaceScale_NEON(float* aBlock, float aScale, uint32_t aSize) {
-  ASSERT_ALIGNED(aBlock);
-
-  float32x4_t vin0, vin1, vin2, vin3;
-  float32x4_t vout0, vout1, vout2, vout3;
-  float32x4_t vscale = vmovq_n_f32(aScale);
-
-  uint32_t dif = aSize % 16;
-  uint32_t vectorSize = aSize - dif;
-  uint32_t i = 0;
-  for (; i < vectorSize; i += 16) {
-    vin0 = vld1q_f32(ADDRESS_OF(aBlock, i));
-    vin1 = vld1q_f32(ADDRESS_OF(aBlock, i + 4));
-    vin2 = vld1q_f32(ADDRESS_OF(aBlock, i + 8));
-    vin3 = vld1q_f32(ADDRESS_OF(aBlock, i + 12));
-
-    vout0 = vmulq_f32(vin0, vscale);
-    vout1 = vmulq_f32(vin1, vscale);
-    vout2 = vmulq_f32(vin2, vscale);
-    vout3 = vmulq_f32(vin3, vscale);
-
-    vst1q_f32(ADDRESS_OF(aBlock, i), vout0);
-    vst1q_f32(ADDRESS_OF(aBlock, i + 4), vout1);
-    vst1q_f32(ADDRESS_OF(aBlock, i + 8), vout2);
-    vst1q_f32(ADDRESS_OF(aBlock, i + 12), vout3);
-  }
-
-  for (unsigned j = 0; j < dif; ++i, ++j) {
-    aBlock[i] *= aScale;
-  }
-}
-
-void AudioBufferInPlaceScale_NEON(float* aBlock, float* aScale,
-                                  uint32_t aSize) {
-  ASSERT_ALIGNED(aBlock);
-
-  float32x4_t vin0, vin1, vin2, vin3;
-  float32x4_t vout0, vout1, vout2, vout3;
-  float32x4_t vscale0, vscale1, vscale2, vscale3;
-
-  uint32_t dif = aSize % 16;
-  uint32_t vectorSize = aSize - dif;
-  uint32_t i = 0;
-  for (; i < vectorSize; i += 16) {
-    vin0 = vld1q_f32(ADDRESS_OF(aBlock, i));
-    vin1 = vld1q_f32(ADDRESS_OF(aBlock, i + 4));
-    vin2 = vld1q_f32(ADDRESS_OF(aBlock, i + 8));
-    vin3 = vld1q_f32(ADDRESS_OF(aBlock, i + 12));
-
-    vscale0 = vld1q_f32(ADDRESS_OF(aScale, i));
-    vscale1 = vld1q_f32(ADDRESS_OF(aScale, i + 4));
-    vscale2 = vld1q_f32(ADDRESS_OF(aScale, i + 8));
-    vscale3 = vld1q_f32(ADDRESS_OF(aScale, i + 12));
-
-    vout0 = vmulq_f32(vin0, vscale0);
-    vout1 = vmulq_f32(vin1, vscale1);
-    vout2 = vmulq_f32(vin2, vscale2);
-    vout3 = vmulq_f32(vin3, vscale3);
-
-    vst1q_f32(ADDRESS_OF(aBlock, i), vout0);
-    vst1q_f32(ADDRESS_OF(aBlock, i + 4), vout1);
-    vst1q_f32(ADDRESS_OF(aBlock, i + 8), vout2);
-    vst1q_f32(ADDRESS_OF(aBlock, i + 12), vout3);
-  }
-
-  for (unsigned j = 0; j < dif; ++i, ++j) {
-    aBlock[i] *= aScale[i];
-  }
-}
-
-void AudioBlockPanStereoToStereo_NEON(const float aInputL[WEBAUDIO_BLOCK_SIZE],
-                                      const float aInputR[WEBAUDIO_BLOCK_SIZE],
-                                      float aGainL, float aGainR,
-                                      bool aIsOnTheLeft,
-                                      float aOutputL[WEBAUDIO_BLOCK_SIZE],
-                                      float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
-  ASSERT_ALIGNED(aInputL);
-  ASSERT_ALIGNED(aInputR);
-  ASSERT_ALIGNED(aOutputL);
-  ASSERT_ALIGNED(aOutputR);
-
-  float32x4_t vinL0, vinL1;
-  float32x4_t vinR0, vinR1;
-  float32x4_t voutL0, voutL1;
-  float32x4_t voutR0, voutR1;
-  float32x4_t vscaleL = vmovq_n_f32(aGainL);
-  float32x4_t vscaleR = vmovq_n_f32(aGainR);
-
-  if (aIsOnTheLeft) {
-    for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
-      vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i));
-      vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i + 4));
-
-      vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i));
-      vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i + 4));
-
-      voutL0 = vmlaq_f32(vinL0, vinR0, vscaleL);
-      voutL1 = vmlaq_f32(vinL1, vinR1, vscaleL);
-
-      vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0);
-      vst1q_f32(ADDRESS_OF(aOutputL, i + 4), voutL1);
-
-      voutR0 = vmulq_f32(vinR0, vscaleR);
-      voutR1 = vmulq_f32(vinR1, vscaleR);
-
-      vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0);
-      vst1q_f32(ADDRESS_OF(aOutputR, i + 4), voutR1);
-    }
-  } else {
-    for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
-      vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i));
-      vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i + 4));
-
-      vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i));
-      vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i + 4));
-
-      voutL0 = vmulq_f32(vinL0, vscaleL);
-      voutL1 = vmulq_f32(vinL1, vscaleL);
-
-      vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0);
-      vst1q_f32(ADDRESS_OF(aOutputL, i + 4), voutL1);
-
-      voutR0 = vmlaq_f32(vinR0, vinL0, vscaleR);
-      voutR1 = vmlaq_f32(vinR1, vinL1, vscaleR);
-
-      vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0);
-      vst1q_f32(ADDRESS_OF(aOutputR, i + 4), voutR1);
-    }
-  }
-}
-
-void AudioBlockPanStereoToStereo_NEON(
-    const float aInputL[WEBAUDIO_BLOCK_SIZE],
-    const float aInputR[WEBAUDIO_BLOCK_SIZE],
-    const float aGainL[WEBAUDIO_BLOCK_SIZE],
-    const float aGainR[WEBAUDIO_BLOCK_SIZE],
-    const bool aIsOnTheLeft[WEBAUDIO_BLOCK_SIZE],
-    float aOutputL[WEBAUDIO_BLOCK_SIZE], float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
-  ASSERT_ALIGNED(aInputL);
-  ASSERT_ALIGNED(aInputR);
-  ASSERT_ALIGNED(aGainL);
-  ASSERT_ALIGNED(aGainR);
-  ASSERT_ALIGNED(aIsOnTheLeft);
-  ASSERT_ALIGNED(aOutputL);
-  ASSERT_ALIGNED(aOutputR);
-
-  float32x4_t vinL0, vinL1;
-  float32x4_t vinR0, vinR1;
-  float32x4_t voutL0, voutL1;
-  float32x4_t voutR0, voutR1;
-  float32x4_t vscaleL0, vscaleL1;
-  float32x4_t vscaleR0, vscaleR1;
-  float32x4_t onleft0, onleft1, notonleft0, notonleft1;
-
-  float32x4_t zero = vmovq_n_f32(0);
-  uint8x8_t isOnTheLeft;
-
-  // Although MSVC throws uninitialized value warning for voutL0 and voutL1,
-  // since we fill all lanes by vsetq_lane_f32, we can ignore it. But to avoid
-  // compiler warning, set zero.
-  voutL0 = zero;
-  voutL1 = zero;
-
-  for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
-    vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i));
-    vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i + 4));
-
-    vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i));
-    vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i + 4));
-
-    vscaleL0 = vld1q_f32(ADDRESS_OF(aGainL, i));
-    vscaleL1 = vld1q_f32(ADDRESS_OF(aGainL, i + 4));
-
-    vscaleR0 = vld1q_f32(ADDRESS_OF(aGainR, i));
-    vscaleR1 = vld1q_f32(ADDRESS_OF(aGainR, i + 4));
-
-    // Load output with boolean "on the left" values. This assumes that
-    // bools are stored as a single byte.
-    isOnTheLeft = vld1_u8((uint8_t*)&aIsOnTheLeft[i]);
-    voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 0), voutL0, 0);
-    voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 1), voutL0, 1);
-    voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 2), voutL0, 2);
-    voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 3), voutL0, 3);
-    voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 4), voutL1, 0);
-    voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 5), voutL1, 1);
-    voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 6), voutL1, 2);
-    voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 7), voutL1, 3);
-
-    // Convert the boolean values into masks by setting all bits to 1
-    // if true.
-    voutL0 = (float32x4_t)vcgtq_f32(voutL0, zero);
-    voutL1 = (float32x4_t)vcgtq_f32(voutL1, zero);
-
-    // The right output masks are the same as the left masks
-    voutR0 = voutL0;
-    voutR1 = voutL1;
-
-    // Calculate left channel assuming isOnTheLeft
-    onleft0 = vmlaq_f32(vinL0, vinR0, vscaleL0);
-    onleft1 = vmlaq_f32(vinL1, vinR1, vscaleL1);
-
-    // Calculate left channel assuming not isOnTheLeft
-    notonleft0 = vmulq_f32(vinL0, vscaleL0);
-    notonleft1 = vmulq_f32(vinL1, vscaleL1);
-
-    // Write results using previously stored masks
-    voutL0 = vbslq_f32((uint32x4_t)voutL0, onleft0, notonleft0);
-    voutL1 = vbslq_f32((uint32x4_t)voutL1, onleft1, notonleft1);
-
-    // Calculate right channel assuming isOnTheLeft
-    onleft0 = vmulq_f32(vinR0, vscaleR0);
-    onleft1 = vmulq_f32(vinR1, vscaleR1);
-
-    // Calculate right channel assuming not isOnTheLeft
-    notonleft0 = vmlaq_f32(vinR0, vinL0, vscaleR0);
-    notonleft1 = vmlaq_f32(vinR1, vinL1, vscaleR1);
-
-    // Write results using previously stored masks
-    voutR0 = vbslq_f32((uint32x4_t)voutR0, onleft0, notonleft0);
-    voutR1 = vbslq_f32((uint32x4_t)voutR1, onleft1, notonleft1);
-
-    vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0);
-    vst1q_f32(ADDRESS_OF(aOutputL, i + 4), voutL1);
-    vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0);
-    vst1q_f32(ADDRESS_OF(aOutputR, i + 4), voutR1);
-  }
-}
+template struct Engine<xsimd::neon>;
 }  // namespace mozilla
--- a/dom/media/webaudio/AudioNodeEngineNEON.h
+++ b/dom/media/webaudio/AudioNodeEngineNEON.h
@ -1,42 +0,0 @@
-/* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
-/* this source code form is subject to the terms of the mozilla public
- * license, v. 2.0. if a copy of the mpl was not distributed with this file,
- * You can obtain one at http://mozilla.org/MPL/2.0/. */
-
-#ifndef MOZILLA_AUDIONODEENGINENEON_H_
-#define MOZILLA_AUDIONODEENGINENEON_H_
-
-#include "AudioNodeEngine.h"
-
-namespace mozilla {
-void AudioBufferAddWithScale_NEON(const float* aInput, float aScale,
-                                  float* aOutput, uint32_t aSize);
-
-void AudioBlockCopyChannelWithScale_NEON(const float* aInput, float aScale,
-                                         float* aOutput);
-
-void AudioBlockCopyChannelWithScale_NEON(
-    const float aInput[WEBAUDIO_BLOCK_SIZE],
-    const float aScale[WEBAUDIO_BLOCK_SIZE],
-    float aOutput[WEBAUDIO_BLOCK_SIZE]);
-
-void AudioBufferInPlaceScale_NEON(float* aBlock, float aScale, uint32_t aSize);
-void AudioBufferInPlaceScale_NEON(float* aBlock, float* aScale, uint32_t aSize);
-
-void AudioBlockPanStereoToStereo_NEON(const float aInputL[WEBAUDIO_BLOCK_SIZE],
-                                      const float aInputR[WEBAUDIO_BLOCK_SIZE],
-                                      float aGainL, float aGainR,
-                                      bool aIsOnTheLeft,
-                                      float aOutputL[WEBAUDIO_BLOCK_SIZE],
-                                      float aOutputR[WEBAUDIO_BLOCK_SIZE]);
-
-void AudioBlockPanStereoToStereo_NEON(
-    const float aInputL[WEBAUDIO_BLOCK_SIZE],
-    const float aInputR[WEBAUDIO_BLOCK_SIZE],
-    const float aGainL[WEBAUDIO_BLOCK_SIZE],
-    const float aGainR[WEBAUDIO_BLOCK_SIZE],
-    const bool aIsOnTheLeft[WEBAUDIO_BLOCK_SIZE],
-    float aOutputL[WEBAUDIO_BLOCK_SIZE], float aOutputR[WEBAUDIO_BLOCK_SIZE]);
-}  // namespace mozilla
-
-#endif /* MOZILLA_AUDIONODEENGINENEON_H_ */
--- a/dom/media/webaudio/AudioNodeEngineSSE2.cpp
+++ b/dom/media/webaudio/AudioNodeEngineSSE2.cpp
@ -3,361 +3,8 @@
 * license, v. 2.0. if a copy of the mpl was not distributed with this file,
 * You can obtain one at http://mozilla.org/MPL/2.0/. */

-#include "AudioNodeEngineSSE2.h"
-#include "AlignmentUtils.h"
-#include <emmintrin.h>
+#include "AudioNodeEngineGeneric.h"

 namespace mozilla {
-void AudioBufferAddWithScale_SSE(const float* aInput, float aScale,
-                                 float* aOutput, uint32_t aSize) {
-  __m128 vin0, vin1, vin2, vin3, vscaled0, vscaled1, vscaled2, vscaled3, vout0,
-      vout1, vout2, vout3, vgain;
-
-  ASSERT_ALIGNED16(aInput);
-  ASSERT_ALIGNED16(aOutput);
-  ASSERT_MULTIPLE16(aSize);
-
-  vgain = _mm_load1_ps(&aScale);
-
-  for (unsigned i = 0; i < aSize; i += 16) {
-    vin0 = _mm_load_ps(&aInput[i]);
-    vin1 = _mm_load_ps(&aInput[i + 4]);
-    vin2 = _mm_load_ps(&aInput[i + 8]);
-    vin3 = _mm_load_ps(&aInput[i + 12]);
-
-    vscaled0 = _mm_mul_ps(vin0, vgain);
-    vscaled1 = _mm_mul_ps(vin1, vgain);
-    vscaled2 = _mm_mul_ps(vin2, vgain);
-    vscaled3 = _mm_mul_ps(vin3, vgain);
-
-    vin0 = _mm_load_ps(&aOutput[i]);
-    vin1 = _mm_load_ps(&aOutput[i + 4]);
-    vin2 = _mm_load_ps(&aOutput[i + 8]);
-    vin3 = _mm_load_ps(&aOutput[i + 12]);
-
-    vout0 = _mm_add_ps(vin0, vscaled0);
-    vout1 = _mm_add_ps(vin1, vscaled1);
-    vout2 = _mm_add_ps(vin2, vscaled2);
-    vout3 = _mm_add_ps(vin3, vscaled3);
-
-    _mm_store_ps(&aOutput[i], vout0);
-    _mm_store_ps(&aOutput[i + 4], vout1);
-    _mm_store_ps(&aOutput[i + 8], vout2);
-    _mm_store_ps(&aOutput[i + 12], vout3);
-  }
-}
-
-void AudioBlockCopyChannelWithScale_SSE(const float* aInput, float aScale,
-                                        float* aOutput) {
-  __m128 vin0, vin1, vin2, vin3, vout0, vout1, vout2, vout3;
-
-  ASSERT_ALIGNED16(aInput);
-  ASSERT_ALIGNED16(aOutput);
-
-  __m128 vgain = _mm_load1_ps(&aScale);
-
-  for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 16) {
-    vin0 = _mm_load_ps(&aInput[i]);
-    vin1 = _mm_load_ps(&aInput[i + 4]);
-    vin2 = _mm_load_ps(&aInput[i + 8]);
-    vin3 = _mm_load_ps(&aInput[i + 12]);
-    vout0 = _mm_mul_ps(vin0, vgain);
-    vout1 = _mm_mul_ps(vin1, vgain);
-    vout2 = _mm_mul_ps(vin2, vgain);
-    vout3 = _mm_mul_ps(vin3, vgain);
-    _mm_store_ps(&aOutput[i], vout0);
-    _mm_store_ps(&aOutput[i + 4], vout1);
-    _mm_store_ps(&aOutput[i + 8], vout2);
-    _mm_store_ps(&aOutput[i + 12], vout3);
-  }
-}
-
-void AudioBlockCopyChannelWithScale_SSE(const float aInput[WEBAUDIO_BLOCK_SIZE],
-                                        const float aScale[WEBAUDIO_BLOCK_SIZE],
-                                        float aOutput[WEBAUDIO_BLOCK_SIZE]) {
-  __m128 vin0, vin1, vin2, vin3, vscaled0, vscaled1, vscaled2, vscaled3, vout0,
-      vout1, vout2, vout3;
-
-  ASSERT_ALIGNED16(aInput);
-  ASSERT_ALIGNED16(aScale);
-  ASSERT_ALIGNED16(aOutput);
-
-  for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 16) {
-    vscaled0 = _mm_load_ps(&aScale[i]);
-    vscaled1 = _mm_load_ps(&aScale[i + 4]);
-    vscaled2 = _mm_load_ps(&aScale[i + 8]);
-    vscaled3 = _mm_load_ps(&aScale[i + 12]);
-
-    vin0 = _mm_load_ps(&aInput[i]);
-    vin1 = _mm_load_ps(&aInput[i + 4]);
-    vin2 = _mm_load_ps(&aInput[i + 8]);
-    vin3 = _mm_load_ps(&aInput[i + 12]);
-
-    vout0 = _mm_mul_ps(vin0, vscaled0);
-    vout1 = _mm_mul_ps(vin1, vscaled1);
-    vout2 = _mm_mul_ps(vin2, vscaled2);
-    vout3 = _mm_mul_ps(vin3, vscaled3);
-
-    _mm_store_ps(&aOutput[i], vout0);
-    _mm_store_ps(&aOutput[i + 4], vout1);
-    _mm_store_ps(&aOutput[i + 8], vout2);
-    _mm_store_ps(&aOutput[i + 12], vout3);
-  }
-}
-
-void AudioBufferInPlaceScale_SSE(float* aBlock, float aScale, uint32_t aSize) {
-  __m128 vout0, vout1, vout2, vout3, vin0, vin1, vin2, vin3;
-
-  ASSERT_ALIGNED16(aBlock);
-  ASSERT_MULTIPLE16(aSize);
-
-  __m128 vgain = _mm_load1_ps(&aScale);
-
-  for (unsigned i = 0; i < aSize; i += 16) {
-    vin0 = _mm_load_ps(&aBlock[i]);
-    vin1 = _mm_load_ps(&aBlock[i + 4]);
-    vin2 = _mm_load_ps(&aBlock[i + 8]);
-    vin3 = _mm_load_ps(&aBlock[i + 12]);
-    vout0 = _mm_mul_ps(vin0, vgain);
-    vout1 = _mm_mul_ps(vin1, vgain);
-    vout2 = _mm_mul_ps(vin2, vgain);
-    vout3 = _mm_mul_ps(vin3, vgain);
-    _mm_store_ps(&aBlock[i], vout0);
-    _mm_store_ps(&aBlock[i + 4], vout1);
-    _mm_store_ps(&aBlock[i + 8], vout2);
-    _mm_store_ps(&aBlock[i + 12], vout3);
-  }
-}
-
-void AudioBufferInPlaceScale_SSE(float* aBlock, float* aScale, uint32_t aSize) {
-  __m128 vout0, vout1, vout2, vout3, vgain0, vgain1, vgain2, vgain3, vin0, vin1,
-      vin2, vin3;
-
-  ASSERT_ALIGNED16(aBlock);
-  ASSERT_MULTIPLE16(aSize);
-
-  for (unsigned i = 0; i < aSize; i += 16) {
-    vin0 = _mm_load_ps(&aBlock[i]);
-    vin1 = _mm_load_ps(&aBlock[i + 4]);
-    vin2 = _mm_load_ps(&aBlock[i + 8]);
-    vin3 = _mm_load_ps(&aBlock[i + 12]);
-    vgain0 = _mm_load_ps(&aScale[i]);
-    vgain1 = _mm_load_ps(&aScale[i + 4]);
-    vgain2 = _mm_load_ps(&aScale[i + 8]);
-    vgain3 = _mm_load_ps(&aScale[i + 12]);
-    vout0 = _mm_mul_ps(vin0, vgain0);
-    vout1 = _mm_mul_ps(vin1, vgain1);
-    vout2 = _mm_mul_ps(vin2, vgain2);
-    vout3 = _mm_mul_ps(vin3, vgain3);
-    _mm_store_ps(&aBlock[i], vout0);
-    _mm_store_ps(&aBlock[i + 4], vout1);
-    _mm_store_ps(&aBlock[i + 8], vout2);
-    _mm_store_ps(&aBlock[i + 12], vout3);
-  }
-}
-
-void AudioBlockPanStereoToStereo_SSE(const float aInputL[WEBAUDIO_BLOCK_SIZE],
-                                     const float aInputR[WEBAUDIO_BLOCK_SIZE],
-                                     float aGainL, float aGainR,
-                                     bool aIsOnTheLeft,
-                                     float aOutputL[WEBAUDIO_BLOCK_SIZE],
-                                     float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
-  __m128 vinl0, vinr0, vinl1, vinr1, vout0, vout1, vscaled0, vscaled1, vgainl,
-      vgainr;
-
-  ASSERT_ALIGNED16(aInputL);
-  ASSERT_ALIGNED16(aInputR);
-  ASSERT_ALIGNED16(aOutputL);
-  ASSERT_ALIGNED16(aOutputR);
-
-  vgainl = _mm_load1_ps(&aGainL);
-  vgainr = _mm_load1_ps(&aGainR);
-
-  if (aIsOnTheLeft) {
-    for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
-      vinl0 = _mm_load_ps(&aInputL[i]);
-      vinr0 = _mm_load_ps(&aInputR[i]);
-      vinl1 = _mm_load_ps(&aInputL[i + 4]);
-      vinr1 = _mm_load_ps(&aInputR[i + 4]);
-
-      /* left channel : aOutputL  = aInputL + aInputR * gainL */
-      vscaled0 = _mm_mul_ps(vinr0, vgainl);
-      vscaled1 = _mm_mul_ps(vinr1, vgainl);
-      vout0 = _mm_add_ps(vscaled0, vinl0);
-      vout1 = _mm_add_ps(vscaled1, vinl1);
-      _mm_store_ps(&aOutputL[i], vout0);
-      _mm_store_ps(&aOutputL[i + 4], vout1);
-
-      /* right channel : aOutputR = aInputR * gainR */
-      vscaled0 = _mm_mul_ps(vinr0, vgainr);
-      vscaled1 = _mm_mul_ps(vinr1, vgainr);
-      _mm_store_ps(&aOutputR[i], vscaled0);
-      _mm_store_ps(&aOutputR[i + 4], vscaled1);
-    }
-  } else {
-    for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
-      vinl0 = _mm_load_ps(&aInputL[i]);
-      vinr0 = _mm_load_ps(&aInputR[i]);
-      vinl1 = _mm_load_ps(&aInputL[i + 4]);
-      vinr1 = _mm_load_ps(&aInputR[i + 4]);
-
-      /* left channel : aInputL * gainL */
-      vscaled0 = _mm_mul_ps(vinl0, vgainl);
-      vscaled1 = _mm_mul_ps(vinl1, vgainl);
-      _mm_store_ps(&aOutputL[i], vscaled0);
-      _mm_store_ps(&aOutputL[i + 4], vscaled1);
-
-      /* right channel: aOutputR = aInputR + aInputL * gainR */
-      vscaled0 = _mm_mul_ps(vinl0, vgainr);
-      vscaled1 = _mm_mul_ps(vinl1, vgainr);
-      vout0 = _mm_add_ps(vscaled0, vinr0);
-      vout1 = _mm_add_ps(vscaled1, vinr1);
-      _mm_store_ps(&aOutputR[i], vout0);
-      _mm_store_ps(&aOutputR[i + 4], vout1);
-    }
-  }
-}
-
-void BufferComplexMultiply_SSE(const float* aInput, const float* aScale,
-                               float* aOutput, uint32_t aSize) {
-  unsigned i;
-  __m128 in0, in1, in2, in3, outreal0, outreal1, outreal2, outreal3, outimag0,
-      outimag1, outimag2, outimag3;
-
-  ASSERT_ALIGNED16(aInput);
-  ASSERT_ALIGNED16(aScale);
-  ASSERT_ALIGNED16(aOutput);
-  ASSERT_MULTIPLE16(aSize);
-
-  for (i = 0; i < aSize * 2; i += 16) {
-    in0 = _mm_load_ps(&aInput[i]);
-    in1 = _mm_load_ps(&aInput[i + 4]);
-    in2 = _mm_load_ps(&aInput[i + 8]);
-    in3 = _mm_load_ps(&aInput[i + 12]);
-
-    outreal0 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(2, 0, 2, 0));
-    outimag0 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(3, 1, 3, 1));
-    outreal2 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(2, 0, 2, 0));
-    outimag2 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(3, 1, 3, 1));
-
-    in0 = _mm_load_ps(&aScale[i]);
-    in1 = _mm_load_ps(&aScale[i + 4]);
-    in2 = _mm_load_ps(&aScale[i + 8]);
-    in3 = _mm_load_ps(&aScale[i + 12]);
-
-    outreal1 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(2, 0, 2, 0));
-    outimag1 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(3, 1, 3, 1));
-    outreal3 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(2, 0, 2, 0));
-    outimag3 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(3, 1, 3, 1));
-
-    in0 = _mm_sub_ps(_mm_mul_ps(outreal0, outreal1),
-                     _mm_mul_ps(outimag0, outimag1));
-    in1 = _mm_add_ps(_mm_mul_ps(outreal0, outimag1),
-                     _mm_mul_ps(outimag0, outreal1));
-    in2 = _mm_sub_ps(_mm_mul_ps(outreal2, outreal3),
-                     _mm_mul_ps(outimag2, outimag3));
-    in3 = _mm_add_ps(_mm_mul_ps(outreal2, outimag3),
-                     _mm_mul_ps(outimag2, outreal3));
-
-    outreal0 = _mm_unpacklo_ps(in0, in1);
-    outreal1 = _mm_unpackhi_ps(in0, in1);
-    outreal2 = _mm_unpacklo_ps(in2, in3);
-    outreal3 = _mm_unpackhi_ps(in2, in3);
-
-    _mm_store_ps(&aOutput[i], outreal0);
-    _mm_store_ps(&aOutput[i + 4], outreal1);
-    _mm_store_ps(&aOutput[i + 8], outreal2);
-    _mm_store_ps(&aOutput[i + 12], outreal3);
-  }
-}
-
-float AudioBufferSumOfSquares_SSE(const float* aInput, uint32_t aLength) {
-  unsigned i;
-  __m128 in0, in1, in2, in3, acc0, acc1, acc2, acc3;
-  float out[4];
-
-  ASSERT_ALIGNED16(aInput);
-  ASSERT_MULTIPLE16(aLength);
-
-  acc0 = _mm_setzero_ps();
-  acc1 = _mm_setzero_ps();
-  acc2 = _mm_setzero_ps();
-  acc3 = _mm_setzero_ps();
-
-  for (i = 0; i < aLength; i += 16) {
-    in0 = _mm_load_ps(&aInput[i]);
-    in1 = _mm_load_ps(&aInput[i + 4]);
-    in2 = _mm_load_ps(&aInput[i + 8]);
-    in3 = _mm_load_ps(&aInput[i + 12]);
-
-    in0 = _mm_mul_ps(in0, in0);
-    in1 = _mm_mul_ps(in1, in1);
-    in2 = _mm_mul_ps(in2, in2);
-    in3 = _mm_mul_ps(in3, in3);
-
-    acc0 = _mm_add_ps(acc0, in0);
-    acc1 = _mm_add_ps(acc1, in1);
-    acc2 = _mm_add_ps(acc2, in2);
-    acc3 = _mm_add_ps(acc3, in3);
-  }
-
-  acc0 = _mm_add_ps(acc0, acc1);
-  acc0 = _mm_add_ps(acc0, acc2);
-  acc0 = _mm_add_ps(acc0, acc3);
-
-  _mm_store_ps(out, acc0);
-
-  return out[0] + out[1] + out[2] + out[3];
-}
-
-void NaNToZeroInPlace_SSE(float* aSamples, size_t aCount) {
-  __m128 vin0, vin1, vin2, vin3;
-  __m128 vmask0, vmask1, vmask2, vmask3;
-  __m128 vout0, vout1, vout2, vout3;
-
-  float* samplesAligned16 = ALIGNED16(aSamples);
-  size_t leadingElementsScalar =
-      std::min(static_cast<size_t>(samplesAligned16 - aSamples), aCount);
-  size_t remainingElements = aCount - leadingElementsScalar;
-  size_t vectoredEnd = aCount - remainingElements % 16;
-
-  MOZ_ASSERT(!((vectoredEnd - leadingElementsScalar) % 16));
-
-  size_t i = 0;
-  for (; i < leadingElementsScalar; i++) {
-    if (aSamples[i] != aSamples[i]) {
-      aSamples[i] = 0.0;
-    }
-  }
-
-  ASSERT_ALIGNED16(&aSamples[i]);
-
-  for (; i < vectoredEnd; i += 16) {
-    vin0 = _mm_load_ps(&aSamples[i + 0]);
-    vin1 = _mm_load_ps(&aSamples[i + 4]);
-    vin2 = _mm_load_ps(&aSamples[i + 8]);
-    vin3 = _mm_load_ps(&aSamples[i + 12]);
-
-    vmask0 = _mm_cmpord_ps(vin0, vin0);
-    vmask1 = _mm_cmpord_ps(vin1, vin1);
-    vmask2 = _mm_cmpord_ps(vin2, vin2);
-    vmask3 = _mm_cmpord_ps(vin3, vin3);
-
-    vout0 = _mm_and_ps(vin0, vmask0);
-    vout1 = _mm_and_ps(vin1, vmask1);
-    vout2 = _mm_and_ps(vin2, vmask2);
-    vout3 = _mm_and_ps(vin3, vmask3);
-
-    _mm_store_ps(&aSamples[i + 0], vout0);
-    _mm_store_ps(&aSamples[i + 4], vout1);
-    _mm_store_ps(&aSamples[i + 8], vout2);
-    _mm_store_ps(&aSamples[i + 12], vout3);
-  }
-  for (; i < aCount; i++) {
-    if (aSamples[i] != aSamples[i]) {
-      aSamples[i] = 0.0;
-    }
-  }
-}
-
+template struct Engine<xsimd::sse2>;
 }  // namespace mozilla
--- a/dom/media/webaudio/AudioNodeEngineSSE2.h
+++ b/dom/media/webaudio/AudioNodeEngineSSE2.h
@ -1,35 +0,0 @@
-/* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
-/* this source code form is subject to the terms of the mozilla public
- * license, v. 2.0. if a copy of the mpl was not distributed with this file,
- * You can obtain one at http://mozilla.org/MPL/2.0/. */
-
-#include "AudioNodeEngine.h"
-
-namespace mozilla {
-void AudioBufferAddWithScale_SSE(const float* aInput, float aScale,
-                                 float* aOutput, uint32_t aSize);
-
-void AudioBlockCopyChannelWithScale_SSE(const float* aInput, float aScale,
-                                        float* aOutput);
-
-void AudioBlockCopyChannelWithScale_SSE(const float aInput[WEBAUDIO_BLOCK_SIZE],
-                                        const float aScale[WEBAUDIO_BLOCK_SIZE],
-                                        float aOutput[WEBAUDIO_BLOCK_SIZE]);
-
-void AudioBufferInPlaceScale_SSE(float* aBlock, float aScale, uint32_t aSize);
-void AudioBufferInPlaceScale_SSE(float* aBlock, float* aScale, uint32_t aSize);
-
-void AudioBlockPanStereoToStereo_SSE(const float aInputL[WEBAUDIO_BLOCK_SIZE],
-                                     const float aInputR[WEBAUDIO_BLOCK_SIZE],
-                                     float aGainL, float aGainR,
-                                     bool aIsOnTheLeft,
-                                     float aOutputL[WEBAUDIO_BLOCK_SIZE],
-                                     float aOutputR[WEBAUDIO_BLOCK_SIZE]);
-
-float AudioBufferSumOfSquares_SSE(const float* aInput, uint32_t aLength);
-
-void BufferComplexMultiply_SSE(const float* aInput, const float* aScale,
-                               float* aOutput, uint32_t aSize);
-
-void NaNToZeroInPlace_SSE(float* aSamples, size_t aCount);
-}  // namespace mozilla
--- a/dom/media/webaudio/moz.build
+++ b/dom/media/webaudio/moz.build
@ -127,6 +127,7 @@ UNIFIED_SOURCES += [

 if CONFIG["CPU_ARCH"] == "aarch64" or CONFIG["BUILD_ARM_NEON"]:
    DEFINES["USE_NEON"] = True
+    LOCAL_INCLUDES += ["/third_party/xsimd/include"]
    SOURCES += ["AudioNodeEngineNEON.cpp"]
    SOURCES["AudioNodeEngineNEON.cpp"].flags += CONFIG["NEON_FLAGS"]
    if CONFIG["BUILD_ARM_NEON"]:
@ -136,6 +137,7 @@ if CONFIG["CPU_ARCH"] == "aarch64" or CONFIG["BUILD_ARM_NEON"]:
 if CONFIG["INTEL_ARCHITECTURE"]:
    SOURCES += ["AudioNodeEngineSSE2.cpp"]
    DEFINES["USE_SSE2"] = True
+    LOCAL_INCLUDES += ["/third_party/xsimd/include"]
    SOURCES["AudioNodeEngineSSE2.cpp"].flags += CONFIG["SSE2_FLAGS"]

 include("/ipc/chromium/chromium-config.mozbuild")