Bug 1801557 - Use xsimd to implement dom/media/webaudio/AudioNodeEngine r=padenot

This patch contains a generic implementation of the algorithms in
AudioNodeEngine.cpp, and this generic implementation is instantiated for
SSE2 and NEON. Note that with this approach, supporting AVX would only
require a few lines.

Differential Revision: https://phabricator.services.mozilla.com/D162494
This commit is contained in:
serge-sans-paille 2023-01-13 13:31:43 +00:00
Родитель 149f737bd0
Коммит 938cca0cc4
7 изменённых файлов: 392 добавлений и 846 удалений

Просмотреть файл

@ -9,12 +9,11 @@
#include "mozilla/AbstractThread.h"
#ifdef USE_NEON
# include "mozilla/arm.h"
# include "AudioNodeEngineNEON.h"
# include "AudioNodeEngineGeneric.h"
#endif
#ifdef USE_SSE2
# include "mozilla/SSE.h"
# include "AlignmentUtils.h"
# include "AudioNodeEngineSSE2.h"
# include "AudioNodeEngineGeneric.h"
#endif
#include "AudioBlock.h"
#include "Tracing.h"
@ -67,39 +66,17 @@ void AudioBufferAddWithScale(const float* aInput, float aScale, float* aOutput,
uint32_t aSize) {
#ifdef USE_NEON
if (mozilla::supports_neon()) {
AudioBufferAddWithScale_NEON(aInput, aScale, aOutput, aSize);
Engine<xsimd::neon>::AudioBufferAddWithScale(aInput, aScale, aOutput,
aSize);
return;
}
#endif
#ifdef USE_SSE2
if (mozilla::supports_sse2()) {
if (aScale == 1.0f) {
while (aSize && (!IS_ALIGNED16(aInput) || !IS_ALIGNED16(aOutput))) {
*aOutput += *aInput;
++aOutput;
++aInput;
--aSize;
}
} else {
while (aSize && (!IS_ALIGNED16(aInput) || !IS_ALIGNED16(aOutput))) {
*aOutput += *aInput * aScale;
++aOutput;
++aInput;
--aSize;
}
}
// we need to round aSize down to the nearest multiple of 16
uint32_t alignedSize = aSize & ~0x0F;
if (alignedSize > 0) {
AudioBufferAddWithScale_SSE(aInput, aScale, aOutput, alignedSize);
// adjust parameters for use with scalar operations below
aInput += alignedSize;
aOutput += alignedSize;
aSize -= alignedSize;
}
Engine<xsimd::sse2>::AudioBufferAddWithScale(aInput, aScale, aOutput,
aSize);
return;
}
#endif
@ -127,14 +104,16 @@ void AudioBlockCopyChannelWithScale(const float* aInput, float aScale,
} else {
#ifdef USE_NEON
if (mozilla::supports_neon()) {
AudioBlockCopyChannelWithScale_NEON(aInput, aScale, aOutput);
Engine<xsimd::neon>::AudioBlockCopyChannelWithScale(aInput, aScale,
aOutput);
return;
}
#endif
#ifdef USE_SSE2
if (mozilla::supports_sse2()) {
AudioBlockCopyChannelWithScale_SSE(aInput, aScale, aOutput);
Engine<xsimd::sse2>::AudioBlockCopyChannelWithScale(aInput, aScale,
aOutput);
return;
}
#endif
@ -147,9 +126,15 @@ void AudioBlockCopyChannelWithScale(const float* aInput, float aScale,
void BufferComplexMultiply(const float* aInput, const float* aScale,
float* aOutput, uint32_t aSize) {
#ifdef USE_NEON
if (mozilla::supports_neon()) {
Engine<xsimd::neon>::BufferComplexMultiply(aInput, aScale, aOutput, aSize);
return;
}
#endif
#ifdef USE_SSE2
if (mozilla::supports_sse()) {
BufferComplexMultiply_SSE(aInput, aScale, aOutput, aSize);
Engine<xsimd::sse2>::BufferComplexMultiply(aInput, aScale, aOutput, aSize);
return;
}
#endif
@ -182,14 +167,16 @@ void AudioBlockCopyChannelWithScale(const float aInput[WEBAUDIO_BLOCK_SIZE],
float aOutput[WEBAUDIO_BLOCK_SIZE]) {
#ifdef USE_NEON
if (mozilla::supports_neon()) {
AudioBlockCopyChannelWithScale_NEON(aInput, aScale, aOutput);
Engine<xsimd::neon>::AudioBlockCopyChannelWithScale(aInput, aScale,
aOutput);
return;
}
#endif
#ifdef USE_SSE2
if (mozilla::supports_sse2()) {
AudioBlockCopyChannelWithScale_SSE(aInput, aScale, aOutput);
Engine<xsimd::sse2>::AudioBlockCopyChannelWithScale(aInput, aScale,
aOutput);
return;
}
#endif
@ -214,14 +201,14 @@ void AudioBufferInPlaceScale(float* aBlock, float aScale, uint32_t aSize) {
}
#ifdef USE_NEON
if (mozilla::supports_neon()) {
AudioBufferInPlaceScale_NEON(aBlock, aScale, aSize);
Engine<xsimd::neon>::AudioBufferInPlaceScale(aBlock, aScale, aSize);
return;
}
#endif
#ifdef USE_SSE2
if (mozilla::supports_sse2()) {
AudioBufferInPlaceScale_SSE(aBlock, aScale, aSize);
Engine<xsimd::sse2>::AudioBufferInPlaceScale(aBlock, aScale, aSize);
return;
}
#endif
@ -234,14 +221,14 @@ void AudioBufferInPlaceScale(float* aBlock, float aScale, uint32_t aSize) {
void AudioBufferInPlaceScale(float* aBlock, float* aScale, uint32_t aSize) {
#ifdef USE_NEON
if (mozilla::supports_neon()) {
AudioBufferInPlaceScale_NEON(aBlock, aScale, aSize);
Engine<xsimd::neon>::AudioBufferInPlaceScale(aBlock, aScale, aSize);
return;
}
#endif
#ifdef USE_SSE2
if (mozilla::supports_sse2()) {
AudioBufferInPlaceScale_SSE(aBlock, aScale, aSize);
Engine<xsimd::sse2>::AudioBufferInPlaceScale(aBlock, aScale, aSize);
return;
}
#endif
@ -275,16 +262,16 @@ void AudioBlockPanStereoToStereo(const float aInputL[WEBAUDIO_BLOCK_SIZE],
float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
#ifdef USE_NEON
if (mozilla::supports_neon()) {
AudioBlockPanStereoToStereo_NEON(aInputL, aInputR, aGainL, aGainR,
aIsOnTheLeft, aOutputL, aOutputR);
Engine<xsimd::neon>::AudioBlockPanStereoToStereo(
aInputL, aInputR, aGainL, aGainR, aIsOnTheLeft, aOutputL, aOutputR);
return;
}
#endif
#ifdef USE_SSE2
if (mozilla::supports_sse2()) {
AudioBlockPanStereoToStereo_SSE(aInputL, aInputR, aGainL, aGainR,
aIsOnTheLeft, aOutputL, aOutputR);
Engine<xsimd::sse2>::AudioBlockPanStereoToStereo(
aInputL, aInputR, aGainL, aGainR, aIsOnTheLeft, aOutputL, aOutputR);
return;
}
#endif
@ -313,8 +300,16 @@ void AudioBlockPanStereoToStereo(const float aInputL[WEBAUDIO_BLOCK_SIZE],
float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
#ifdef USE_NEON
if (mozilla::supports_neon()) {
AudioBlockPanStereoToStereo_NEON(aInputL, aInputR, aGainL, aGainR,
aIsOnTheLeft, aOutputL, aOutputR);
Engine<xsimd::neon>::AudioBlockPanStereoToStereo(
aInputL, aInputR, aGainL, aGainR, aIsOnTheLeft, aOutputL, aOutputR);
return;
}
#endif
#ifdef USE_SSE2
if (mozilla::supports_sse2()) {
Engine<xsimd::sse2>::AudioBlockPanStereoToStereo(
aInputL, aInputR, aGainL, aGainR, aIsOnTheLeft, aOutputL, aOutputR);
return;
}
#endif
@ -332,32 +327,19 @@ void AudioBlockPanStereoToStereo(const float aInputL[WEBAUDIO_BLOCK_SIZE],
}
float AudioBufferSumOfSquares(const float* aInput, uint32_t aLength) {
float sum = 0.0f;
#ifdef USE_SSE2
if (mozilla::supports_sse()) {
const float* alignedInput = ALIGNED16(aInput);
// use scalar operations for any unaligned data at the beginning
while (aInput != alignedInput) {
if (!aLength) {
return sum;
}
sum += *aInput * *aInput;
++aInput;
--aLength;
}
uint32_t vLength = (aLength >> 4) << 4;
sum += AudioBufferSumOfSquares_SSE(alignedInput, vLength);
// adjust aInput and aLength to use scalar operations for any
// remaining values
aInput = alignedInput + vLength;
aLength -= vLength;
#ifdef USE_NEON
if (mozilla::supports_neon()) {
return Engine<xsimd::neon>::AudioBufferSumOfSquares(aInput, aLength);
}
#endif
#ifdef USE_SSE2
if (mozilla::supports_sse()) {
return Engine<xsimd::sse2>::AudioBufferSumOfSquares(aInput, aLength);
}
#endif
float sum = 0.f;
while (aLength--) {
sum += *aInput * *aInput;
++aInput;
@ -368,7 +350,7 @@ float AudioBufferSumOfSquares(const float* aInput, uint32_t aLength) {
void NaNToZeroInPlace(float* aSamples, size_t aCount) {
#ifdef USE_SSE2
if (mozilla::supports_sse2()) {
NaNToZeroInPlace_SSE(aSamples, aCount);
Engine<xsimd::sse2>::NaNToZeroInPlace(aSamples, aCount);
return;
}
#endif

Просмотреть файл

@ -0,0 +1,335 @@
/* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* this source code form is subject to the terms of the mozilla public
* license, v. 2.0. if a copy of the mpl was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef MOZILLA_AUDIONODEENGINEGENERIC_H_
#define MOZILLA_AUDIONODEENGINEGENERIC_H_
#include "AudioNodeEngine.h"
#include "AlignmentUtils.h"
#include "xsimd/xsimd.hpp"
#if defined(__GNUC__) && __GNUC__ > 7
# define MOZ_PRAGMA(tokens) _Pragma(#tokens)
# define MOZ_UNROLL(factor) MOZ_PRAGMA(GCC unroll factor)
#elif defined(__INTEL_COMPILER) || (defined(__clang__) && __clang_major__ > 3)
# define MOZ_PRAGMA(tokens) _Pragma(#tokens)
# define MOZ_UNROLL(factor) MOZ_PRAGMA(unroll factor)
#else
# define MOZ_UNROLL(_)
#endif
namespace mozilla {
template <class Arch>
static bool is_aligned(const void* ptr) {
return (reinterpret_cast<uintptr_t>(ptr) &
~(static_cast<uintptr_t>(Arch::alignment()) - 1)) ==
reinterpret_cast<uintptr_t>(ptr);
};
template <class Arch>
struct Engine {
static void AudioBufferAddWithScale(const float* aInput, float aScale,
float* aOutput, uint32_t aSize) {
if constexpr (Arch::requires_alignment()) {
if (aScale == 1.0f) {
while (!is_aligned<Arch>(aInput) || !is_aligned<Arch>(aOutput)) {
if (!aSize) return;
*aOutput += *aInput;
++aOutput;
++aInput;
--aSize;
}
} else {
while (!is_aligned<Arch>(aInput) || !is_aligned<Arch>(aOutput)) {
if (!aSize) return;
*aOutput += *aInput * aScale;
++aOutput;
++aInput;
--aSize;
}
}
}
MOZ_ASSERT(is_aligned<Arch>(aInput), "aInput is aligned");
MOZ_ASSERT(is_aligned<Arch>(aOutput), "aOutput is aligned");
xsimd::batch<float, Arch> vgain(aScale);
uint32_t aVSize = aSize & ~(xsimd::batch<float, Arch>::size - 1);
MOZ_UNROLL(4)
for (unsigned i = 0; i < aVSize; i += xsimd::batch<float, Arch>::size) {
auto vin1 = xsimd::batch<float, Arch>::load_aligned(&aInput[i]);
auto vin2 = xsimd::batch<float, Arch>::load_aligned(&aOutput[i]);
auto vout = xsimd::fma(vin1, vgain, vin2);
vout.store_aligned(&aOutput[i]);
}
for (unsigned i = aVSize; i < aSize; ++i) {
aOutput[i] += aInput[i] * aScale;
}
};
static void AudioBlockCopyChannelWithScale(const float* aInput, float aScale,
float* aOutput) {
MOZ_ASSERT(is_aligned<Arch>(aInput), "aInput is aligned");
MOZ_ASSERT(is_aligned<Arch>(aOutput), "aOutput is aligned");
MOZ_ASSERT((WEBAUDIO_BLOCK_SIZE % xsimd::batch<float, Arch>::size == 0),
"requires tail processing");
xsimd::batch<float, Arch> vgain = (aScale);
MOZ_UNROLL(4)
for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE;
i += xsimd::batch<float, Arch>::size) {
auto vin = xsimd::batch<float, Arch>::load_aligned(&aInput[i]);
auto vout = vin * vgain;
vout.store_aligned(&aOutput[i]);
}
};
static void AudioBlockCopyChannelWithScale(
const float aInput[WEBAUDIO_BLOCK_SIZE],
const float aScale[WEBAUDIO_BLOCK_SIZE],
float aOutput[WEBAUDIO_BLOCK_SIZE]) {
MOZ_ASSERT(is_aligned<Arch>(aInput), "aInput is aligned");
MOZ_ASSERT(is_aligned<Arch>(aOutput), "aOutput is aligned");
MOZ_ASSERT(is_aligned<Arch>(aScale), "aScale is aligned");
MOZ_ASSERT((WEBAUDIO_BLOCK_SIZE % xsimd::batch<float, Arch>::size == 0),
"requires tail processing");
MOZ_UNROLL(4)
for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE;
i += xsimd::batch<float, Arch>::size) {
auto vscaled = xsimd::batch<float, Arch>::load_aligned(&aScale[i]);
auto vin = xsimd::batch<float, Arch>::load_aligned(&aInput[i]);
auto vout = vin * vscaled;
vout.store_aligned(&aOutput[i]);
}
};
static void AudioBufferInPlaceScale(float* aBlock, float aScale,
uint32_t aSize) {
MOZ_ASSERT(is_aligned<Arch>(aBlock), "aBlock is aligned");
xsimd::batch<float, Arch> vgain(aScale);
uint32_t aVSize = aSize & ~(xsimd::batch<float, Arch>::size - 1);
MOZ_UNROLL(4)
for (unsigned i = 0; i < aVSize; i += xsimd::batch<float, Arch>::size) {
auto vin = xsimd::batch<float, Arch>::load_aligned(&aBlock[i]);
auto vout = vin * vgain;
vout.store_aligned(&aBlock[i]);
}
for (unsigned i = aVSize; i < aSize; ++i) aBlock[i] *= aScale;
};
static void AudioBufferInPlaceScale(float* aBlock, float* aScale,
uint32_t aSize) {
MOZ_ASSERT(is_aligned<Arch>(aBlock), "aBlock is aligned");
MOZ_ASSERT(is_aligned<Arch>(aScale), "aScale is aligned");
uint32_t aVSize = aSize & ~(xsimd::batch<float, Arch>::size - 1);
MOZ_UNROLL(4)
for (unsigned i = 0; i < aVSize; i += xsimd::batch<float, Arch>::size) {
auto vin = xsimd::batch<float, Arch>::load_aligned(&aBlock[i]);
auto vgain = xsimd::batch<float, Arch>::load_aligned(&aScale[i]);
auto vout = vin * vgain;
vout.store_aligned(&aBlock[i]);
}
for (uint32_t i = aVSize; i < aSize; ++i) {
*aBlock++ *= *aScale++;
}
};
static void AudioBlockPanStereoToStereo(
const float aInputL[WEBAUDIO_BLOCK_SIZE],
const float aInputR[WEBAUDIO_BLOCK_SIZE], float aGainL, float aGainR,
bool aIsOnTheLeft, float aOutputL[WEBAUDIO_BLOCK_SIZE],
float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
MOZ_ASSERT(is_aligned<Arch>(aInputL), "aInputL is aligned");
MOZ_ASSERT(is_aligned<Arch>(aInputR), "aInputR is aligned");
MOZ_ASSERT(is_aligned<Arch>(aOutputL), "aOutputL is aligned");
MOZ_ASSERT(is_aligned<Arch>(aOutputR), "aOutputR is aligned");
MOZ_ASSERT((WEBAUDIO_BLOCK_SIZE % xsimd::batch<float, Arch>::size == 0),
"requires tail processing");
xsimd::batch<float, Arch> vgainl(aGainL);
xsimd::batch<float, Arch> vgainr(aGainR);
if (aIsOnTheLeft) {
MOZ_UNROLL(2)
for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE;
i += xsimd::batch<float, Arch>::size) {
auto vinl = xsimd::batch<float, Arch>::load_aligned(&aInputL[i]);
auto vinr = xsimd::batch<float, Arch>::load_aligned(&aInputR[i]);
/* left channel : aOutputL = aInputL + aInputR * gainL */
auto vout = xsimd::fma(vinr, vgainl, vinl);
vout.store_aligned(&aOutputL[i]);
/* right channel : aOutputR = aInputR * gainR */
auto vscaled = vinr * vgainr;
vscaled.store_aligned(&aOutputR[i]);
}
} else {
MOZ_UNROLL(2)
for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE;
i += xsimd::batch<float, Arch>::size) {
auto vinl = xsimd::batch<float, Arch>::load_aligned(&aInputL[i]);
auto vinr = xsimd::batch<float, Arch>::load_aligned(&aInputR[i]);
/* left channel : aInputL * gainL */
auto vscaled = vinl * vgainl;
vscaled.store_aligned(&aOutputL[i]);
/* right channel: aOutputR = aInputR + aInputL * gainR */
auto vout = xsimd::fma(vinl, vgainr, vinr);
vout.store_aligned(&aOutputR[i]);
}
}
};
static void BufferComplexMultiply(const float* aInput, const float* aScale,
float* aOutput, uint32_t aSize) {
MOZ_ASSERT(is_aligned<Arch>(aInput), "aInput is aligned");
MOZ_ASSERT(is_aligned<Arch>(aOutput), "aOutput is aligned");
MOZ_ASSERT(is_aligned<Arch>(aScale), "aScale is aligned");
MOZ_ASSERT((aSize % xsimd::batch<float, Arch>::size == 0),
"requires tail processing");
MOZ_UNROLL(2)
for (unsigned i = 0; i < aSize * 2;
i += 2 * xsimd::batch<std::complex<float>>::size) {
auto in1 = xsimd::batch<std::complex<float>>::load_aligned(
reinterpret_cast<const std::complex<float>*>(&aInput[i]));
auto in2 = xsimd::batch<std::complex<float>>::load_aligned(
reinterpret_cast<const std::complex<float>*>(&aScale[i]));
auto out = in1 * in2;
out.store_aligned(reinterpret_cast<std::complex<float>*>(&aOutput[i]));
}
};
static float AudioBufferSumOfSquares(const float* aInput, uint32_t aLength) {
float sum = 0.f;
if constexpr (Arch::requires_alignment()) {
while (!is_aligned<Arch>(aInput)) {
if (!aLength) {
return sum;
}
sum += *aInput * *aInput;
++aInput;
--aLength;
}
}
MOZ_ASSERT(is_aligned<Arch>(aInput), "aInput is aligned");
constexpr uint32_t unroll_factor = 4;
xsimd::batch<float, Arch> accs[unroll_factor] = {0.f, 0.f, 0.f, 0.f};
uint32_t vLength =
aLength & ~(unroll_factor * xsimd::batch<float, Arch>::size - 1);
for (uint32_t i = 0; i < vLength;
i += unroll_factor * xsimd::batch<float, Arch>::size) {
MOZ_UNROLL(4)
for (uint32_t j = 0; j < unroll_factor; ++j) {
auto in = xsimd::batch<float, Arch>::load_aligned(
&aInput[i + xsimd::batch<float, Arch>::size * j]);
accs[j] = xsimd::fma(in, in, accs[j]);
}
}
sum += reduce_add((accs[0] + accs[1]) + (accs[2] + accs[3]));
for (uint32_t i = vLength; i < aLength; ++i) sum += aInput[i] * aInput[i];
return sum;
};
static void NaNToZeroInPlace(float* aSamples, size_t aCount) {
if constexpr (Arch::requires_alignment()) {
while (!is_aligned<Arch>(aSamples)) {
if (!aCount) {
return;
}
if (*aSamples != *aSamples) {
*aSamples = 0.0;
}
++aSamples;
--aCount;
}
}
MOZ_ASSERT(is_aligned<Arch>(aSamples), "aSamples is aligned");
uint32_t vCount = aCount & ~(xsimd::batch<float, Arch>::size - 1);
MOZ_UNROLL(4)
for (uint32_t i = 0; i < vCount; i += xsimd::batch<float, Arch>::size) {
auto vin = xsimd::batch<float, Arch>::load_aligned(&aSamples[i]);
auto vout =
xsimd::select(xsimd::isnan(vin), xsimd::batch<float, Arch>(0.f), vin);
vout.store_aligned(&aSamples[i]);
}
for (uint32_t i = vCount; i < aCount; i++) {
if (aSamples[i] != aSamples[i]) {
aSamples[i] = 0.0;
}
}
};
static void AudioBlockPanStereoToStereo(
const float aInputL[WEBAUDIO_BLOCK_SIZE],
const float aInputR[WEBAUDIO_BLOCK_SIZE],
const float aGainL[WEBAUDIO_BLOCK_SIZE],
const float aGainR[WEBAUDIO_BLOCK_SIZE],
const bool aIsOnTheLeft[WEBAUDIO_BLOCK_SIZE],
float aOutputL[WEBAUDIO_BLOCK_SIZE],
float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
MOZ_ASSERT(is_aligned<Arch>(aInputL), "aInputL is aligned");
MOZ_ASSERT(is_aligned<Arch>(aInputR), "aInputR is aligned");
MOZ_ASSERT(is_aligned<Arch>(aGainL), "aGainL is aligned");
MOZ_ASSERT(is_aligned<Arch>(aGainR), "aGainR is aligned");
MOZ_ASSERT(is_aligned<Arch>(aIsOnTheLeft), "aIsOnTheLeft is aligned");
MOZ_ASSERT(is_aligned<Arch>(aOutputL), "aOutputL is aligned");
MOZ_ASSERT(is_aligned<Arch>(aOutputR), "aOutputR is aligned");
MOZ_ASSERT((WEBAUDIO_BLOCK_SIZE % xsimd::batch<float, Arch>::size == 0),
"requires tail processing");
MOZ_UNROLL(2)
for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE;
i += xsimd::batch<float, Arch>::size) {
auto mask =
xsimd::batch_bool<float, Arch>::load_aligned(&aIsOnTheLeft[i]);
auto inputL = xsimd::batch<float, Arch>::load_aligned(&aInputL[i]);
auto inputR = xsimd::batch<float, Arch>::load_aligned(&aInputR[i]);
auto gainL = xsimd::batch<float, Arch>::load_aligned(&aGainL[i]);
auto gainR = xsimd::batch<float, Arch>::load_aligned(&aGainR[i]);
auto outL_true = xsimd::fma(inputR, gainL, inputL);
auto outR_true = inputR * gainR;
auto outL_false = inputL * gainL;
auto outR_false = xsimd::fma(inputL, gainR, inputR);
auto outL = xsimd::select(mask, outL_true, outL_false);
auto outR = xsimd::select(mask, outR_true, outR_false);
outL.store_aligned(&aOutputL[i]);
outR.store_aligned(&aOutputR[i]);
}
}
};
} // namespace mozilla
#endif

Просмотреть файл

@ -3,350 +3,7 @@
* license, v. 2.0. if a copy of the mpl was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "AudioNodeEngineNEON.h"
#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__)
# include <arm64_neon.h>
#else
# include <arm_neon.h>
#endif
//#ifdef DEBUG
#if 0 // see bug 921099
# define ASSERT_ALIGNED(ptr) \
MOZ_ASSERT((((uintptr_t)ptr + 15) & ~0x0F) == (uintptr_t)ptr, \
#ptr " has to be aligned 16-bytes aligned.");
#else
# define ASSERT_ALIGNED(ptr)
#endif
#define ADDRESS_OF(array, index) ((float32_t*)&array[index])
#include "AudioNodeEngineGeneric.h"
namespace mozilla {
void AudioBufferAddWithScale_NEON(const float* aInput, float aScale,
float* aOutput, uint32_t aSize) {
ASSERT_ALIGNED(aInput);
ASSERT_ALIGNED(aOutput);
float32x4_t vin0, vin1, vin2, vin3;
float32x4_t vout0, vout1, vout2, vout3;
float32x4_t vscale = vmovq_n_f32(aScale);
uint32_t dif = aSize % 16;
aSize -= dif;
unsigned i = 0;
for (; i < aSize; i += 16) {
vin0 = vld1q_f32(ADDRESS_OF(aInput, i));
vin1 = vld1q_f32(ADDRESS_OF(aInput, i + 4));
vin2 = vld1q_f32(ADDRESS_OF(aInput, i + 8));
vin3 = vld1q_f32(ADDRESS_OF(aInput, i + 12));
vout0 = vld1q_f32(ADDRESS_OF(aOutput, i));
vout1 = vld1q_f32(ADDRESS_OF(aOutput, i + 4));
vout2 = vld1q_f32(ADDRESS_OF(aOutput, i + 8));
vout3 = vld1q_f32(ADDRESS_OF(aOutput, i + 12));
vout0 = vmlaq_f32(vout0, vin0, vscale);
vout1 = vmlaq_f32(vout1, vin1, vscale);
vout2 = vmlaq_f32(vout2, vin2, vscale);
vout3 = vmlaq_f32(vout3, vin3, vscale);
vst1q_f32(ADDRESS_OF(aOutput, i), vout0);
vst1q_f32(ADDRESS_OF(aOutput, i + 4), vout1);
vst1q_f32(ADDRESS_OF(aOutput, i + 8), vout2);
vst1q_f32(ADDRESS_OF(aOutput, i + 12), vout3);
}
for (unsigned j = 0; j < dif; ++i, ++j) {
aOutput[i] += aInput[i] * aScale;
}
}
void AudioBlockCopyChannelWithScale_NEON(const float* aInput, float aScale,
float* aOutput) {
ASSERT_ALIGNED(aInput);
ASSERT_ALIGNED(aOutput);
float32x4_t vin0, vin1, vin2, vin3;
float32x4_t vout0, vout1, vout2, vout3;
float32x4_t vscale = vmovq_n_f32(aScale);
for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 16) {
vin0 = vld1q_f32(ADDRESS_OF(aInput, i));
vin1 = vld1q_f32(ADDRESS_OF(aInput, i + 4));
vin2 = vld1q_f32(ADDRESS_OF(aInput, i + 8));
vin3 = vld1q_f32(ADDRESS_OF(aInput, i + 12));
vout0 = vmulq_f32(vin0, vscale);
vout1 = vmulq_f32(vin1, vscale);
vout2 = vmulq_f32(vin2, vscale);
vout3 = vmulq_f32(vin3, vscale);
vst1q_f32(ADDRESS_OF(aOutput, i), vout0);
vst1q_f32(ADDRESS_OF(aOutput, i + 4), vout1);
vst1q_f32(ADDRESS_OF(aOutput, i + 8), vout2);
vst1q_f32(ADDRESS_OF(aOutput, i + 12), vout3);
}
}
void AudioBlockCopyChannelWithScale_NEON(
const float aInput[WEBAUDIO_BLOCK_SIZE],
const float aScale[WEBAUDIO_BLOCK_SIZE],
float aOutput[WEBAUDIO_BLOCK_SIZE]) {
ASSERT_ALIGNED(aInput);
ASSERT_ALIGNED(aScale);
ASSERT_ALIGNED(aOutput);
float32x4_t vin0, vin1, vin2, vin3;
float32x4_t vout0, vout1, vout2, vout3;
float32x4_t vscale0, vscale1, vscale2, vscale3;
for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 16) {
vin0 = vld1q_f32(ADDRESS_OF(aInput, i));
vin1 = vld1q_f32(ADDRESS_OF(aInput, i + 4));
vin2 = vld1q_f32(ADDRESS_OF(aInput, i + 8));
vin3 = vld1q_f32(ADDRESS_OF(aInput, i + 12));
vscale0 = vld1q_f32(ADDRESS_OF(aScale, i));
vscale1 = vld1q_f32(ADDRESS_OF(aScale, i + 4));
vscale2 = vld1q_f32(ADDRESS_OF(aScale, i + 8));
vscale3 = vld1q_f32(ADDRESS_OF(aScale, i + 12));
vout0 = vmulq_f32(vin0, vscale0);
vout1 = vmulq_f32(vin1, vscale1);
vout2 = vmulq_f32(vin2, vscale2);
vout3 = vmulq_f32(vin3, vscale3);
vst1q_f32(ADDRESS_OF(aOutput, i), vout0);
vst1q_f32(ADDRESS_OF(aOutput, i + 4), vout1);
vst1q_f32(ADDRESS_OF(aOutput, i + 8), vout2);
vst1q_f32(ADDRESS_OF(aOutput, i + 12), vout3);
}
}
void AudioBufferInPlaceScale_NEON(float* aBlock, float aScale, uint32_t aSize) {
ASSERT_ALIGNED(aBlock);
float32x4_t vin0, vin1, vin2, vin3;
float32x4_t vout0, vout1, vout2, vout3;
float32x4_t vscale = vmovq_n_f32(aScale);
uint32_t dif = aSize % 16;
uint32_t vectorSize = aSize - dif;
uint32_t i = 0;
for (; i < vectorSize; i += 16) {
vin0 = vld1q_f32(ADDRESS_OF(aBlock, i));
vin1 = vld1q_f32(ADDRESS_OF(aBlock, i + 4));
vin2 = vld1q_f32(ADDRESS_OF(aBlock, i + 8));
vin3 = vld1q_f32(ADDRESS_OF(aBlock, i + 12));
vout0 = vmulq_f32(vin0, vscale);
vout1 = vmulq_f32(vin1, vscale);
vout2 = vmulq_f32(vin2, vscale);
vout3 = vmulq_f32(vin3, vscale);
vst1q_f32(ADDRESS_OF(aBlock, i), vout0);
vst1q_f32(ADDRESS_OF(aBlock, i + 4), vout1);
vst1q_f32(ADDRESS_OF(aBlock, i + 8), vout2);
vst1q_f32(ADDRESS_OF(aBlock, i + 12), vout3);
}
for (unsigned j = 0; j < dif; ++i, ++j) {
aBlock[i] *= aScale;
}
}
void AudioBufferInPlaceScale_NEON(float* aBlock, float* aScale,
uint32_t aSize) {
ASSERT_ALIGNED(aBlock);
float32x4_t vin0, vin1, vin2, vin3;
float32x4_t vout0, vout1, vout2, vout3;
float32x4_t vscale0, vscale1, vscale2, vscale3;
uint32_t dif = aSize % 16;
uint32_t vectorSize = aSize - dif;
uint32_t i = 0;
for (; i < vectorSize; i += 16) {
vin0 = vld1q_f32(ADDRESS_OF(aBlock, i));
vin1 = vld1q_f32(ADDRESS_OF(aBlock, i + 4));
vin2 = vld1q_f32(ADDRESS_OF(aBlock, i + 8));
vin3 = vld1q_f32(ADDRESS_OF(aBlock, i + 12));
vscale0 = vld1q_f32(ADDRESS_OF(aScale, i));
vscale1 = vld1q_f32(ADDRESS_OF(aScale, i + 4));
vscale2 = vld1q_f32(ADDRESS_OF(aScale, i + 8));
vscale3 = vld1q_f32(ADDRESS_OF(aScale, i + 12));
vout0 = vmulq_f32(vin0, vscale0);
vout1 = vmulq_f32(vin1, vscale1);
vout2 = vmulq_f32(vin2, vscale2);
vout3 = vmulq_f32(vin3, vscale3);
vst1q_f32(ADDRESS_OF(aBlock, i), vout0);
vst1q_f32(ADDRESS_OF(aBlock, i + 4), vout1);
vst1q_f32(ADDRESS_OF(aBlock, i + 8), vout2);
vst1q_f32(ADDRESS_OF(aBlock, i + 12), vout3);
}
for (unsigned j = 0; j < dif; ++i, ++j) {
aBlock[i] *= aScale[i];
}
}
void AudioBlockPanStereoToStereo_NEON(const float aInputL[WEBAUDIO_BLOCK_SIZE],
const float aInputR[WEBAUDIO_BLOCK_SIZE],
float aGainL, float aGainR,
bool aIsOnTheLeft,
float aOutputL[WEBAUDIO_BLOCK_SIZE],
float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
ASSERT_ALIGNED(aInputL);
ASSERT_ALIGNED(aInputR);
ASSERT_ALIGNED(aOutputL);
ASSERT_ALIGNED(aOutputR);
float32x4_t vinL0, vinL1;
float32x4_t vinR0, vinR1;
float32x4_t voutL0, voutL1;
float32x4_t voutR0, voutR1;
float32x4_t vscaleL = vmovq_n_f32(aGainL);
float32x4_t vscaleR = vmovq_n_f32(aGainR);
if (aIsOnTheLeft) {
for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i));
vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i + 4));
vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i));
vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i + 4));
voutL0 = vmlaq_f32(vinL0, vinR0, vscaleL);
voutL1 = vmlaq_f32(vinL1, vinR1, vscaleL);
vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0);
vst1q_f32(ADDRESS_OF(aOutputL, i + 4), voutL1);
voutR0 = vmulq_f32(vinR0, vscaleR);
voutR1 = vmulq_f32(vinR1, vscaleR);
vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0);
vst1q_f32(ADDRESS_OF(aOutputR, i + 4), voutR1);
}
} else {
for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i));
vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i + 4));
vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i));
vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i + 4));
voutL0 = vmulq_f32(vinL0, vscaleL);
voutL1 = vmulq_f32(vinL1, vscaleL);
vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0);
vst1q_f32(ADDRESS_OF(aOutputL, i + 4), voutL1);
voutR0 = vmlaq_f32(vinR0, vinL0, vscaleR);
voutR1 = vmlaq_f32(vinR1, vinL1, vscaleR);
vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0);
vst1q_f32(ADDRESS_OF(aOutputR, i + 4), voutR1);
}
}
}
void AudioBlockPanStereoToStereo_NEON(
const float aInputL[WEBAUDIO_BLOCK_SIZE],
const float aInputR[WEBAUDIO_BLOCK_SIZE],
const float aGainL[WEBAUDIO_BLOCK_SIZE],
const float aGainR[WEBAUDIO_BLOCK_SIZE],
const bool aIsOnTheLeft[WEBAUDIO_BLOCK_SIZE],
float aOutputL[WEBAUDIO_BLOCK_SIZE], float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
ASSERT_ALIGNED(aInputL);
ASSERT_ALIGNED(aInputR);
ASSERT_ALIGNED(aGainL);
ASSERT_ALIGNED(aGainR);
ASSERT_ALIGNED(aIsOnTheLeft);
ASSERT_ALIGNED(aOutputL);
ASSERT_ALIGNED(aOutputR);
float32x4_t vinL0, vinL1;
float32x4_t vinR0, vinR1;
float32x4_t voutL0, voutL1;
float32x4_t voutR0, voutR1;
float32x4_t vscaleL0, vscaleL1;
float32x4_t vscaleR0, vscaleR1;
float32x4_t onleft0, onleft1, notonleft0, notonleft1;
float32x4_t zero = vmovq_n_f32(0);
uint8x8_t isOnTheLeft;
// Although MSVC throws uninitialized value warning for voutL0 and voutL1,
// since we fill all lanes by vsetq_lane_f32, we can ignore it. But to avoid
// compiler warning, set zero.
voutL0 = zero;
voutL1 = zero;
for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i));
vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i + 4));
vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i));
vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i + 4));
vscaleL0 = vld1q_f32(ADDRESS_OF(aGainL, i));
vscaleL1 = vld1q_f32(ADDRESS_OF(aGainL, i + 4));
vscaleR0 = vld1q_f32(ADDRESS_OF(aGainR, i));
vscaleR1 = vld1q_f32(ADDRESS_OF(aGainR, i + 4));
// Load output with boolean "on the left" values. This assumes that
// bools are stored as a single byte.
isOnTheLeft = vld1_u8((uint8_t*)&aIsOnTheLeft[i]);
voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 0), voutL0, 0);
voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 1), voutL0, 1);
voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 2), voutL0, 2);
voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 3), voutL0, 3);
voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 4), voutL1, 0);
voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 5), voutL1, 1);
voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 6), voutL1, 2);
voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 7), voutL1, 3);
// Convert the boolean values into masks by setting all bits to 1
// if true.
voutL0 = (float32x4_t)vcgtq_f32(voutL0, zero);
voutL1 = (float32x4_t)vcgtq_f32(voutL1, zero);
// The right output masks are the same as the left masks
voutR0 = voutL0;
voutR1 = voutL1;
// Calculate left channel assuming isOnTheLeft
onleft0 = vmlaq_f32(vinL0, vinR0, vscaleL0);
onleft1 = vmlaq_f32(vinL1, vinR1, vscaleL1);
// Calculate left channel assuming not isOnTheLeft
notonleft0 = vmulq_f32(vinL0, vscaleL0);
notonleft1 = vmulq_f32(vinL1, vscaleL1);
// Write results using previously stored masks
voutL0 = vbslq_f32((uint32x4_t)voutL0, onleft0, notonleft0);
voutL1 = vbslq_f32((uint32x4_t)voutL1, onleft1, notonleft1);
// Calculate right channel assuming isOnTheLeft
onleft0 = vmulq_f32(vinR0, vscaleR0);
onleft1 = vmulq_f32(vinR1, vscaleR1);
// Calculate right channel assuming not isOnTheLeft
notonleft0 = vmlaq_f32(vinR0, vinL0, vscaleR0);
notonleft1 = vmlaq_f32(vinR1, vinL1, vscaleR1);
// Write results using previously stored masks
voutR0 = vbslq_f32((uint32x4_t)voutR0, onleft0, notonleft0);
voutR1 = vbslq_f32((uint32x4_t)voutR1, onleft1, notonleft1);
vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0);
vst1q_f32(ADDRESS_OF(aOutputL, i + 4), voutL1);
vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0);
vst1q_f32(ADDRESS_OF(aOutputR, i + 4), voutR1);
}
}
template struct Engine<xsimd::neon>;
} // namespace mozilla

Просмотреть файл

@ -1,42 +0,0 @@
/* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* this source code form is subject to the terms of the mozilla public
* license, v. 2.0. if a copy of the mpl was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef MOZILLA_AUDIONODEENGINENEON_H_
#define MOZILLA_AUDIONODEENGINENEON_H_
#include "AudioNodeEngine.h"
namespace mozilla {
void AudioBufferAddWithScale_NEON(const float* aInput, float aScale,
float* aOutput, uint32_t aSize);
void AudioBlockCopyChannelWithScale_NEON(const float* aInput, float aScale,
float* aOutput);
void AudioBlockCopyChannelWithScale_NEON(
const float aInput[WEBAUDIO_BLOCK_SIZE],
const float aScale[WEBAUDIO_BLOCK_SIZE],
float aOutput[WEBAUDIO_BLOCK_SIZE]);
void AudioBufferInPlaceScale_NEON(float* aBlock, float aScale, uint32_t aSize);
void AudioBufferInPlaceScale_NEON(float* aBlock, float* aScale, uint32_t aSize);
void AudioBlockPanStereoToStereo_NEON(const float aInputL[WEBAUDIO_BLOCK_SIZE],
const float aInputR[WEBAUDIO_BLOCK_SIZE],
float aGainL, float aGainR,
bool aIsOnTheLeft,
float aOutputL[WEBAUDIO_BLOCK_SIZE],
float aOutputR[WEBAUDIO_BLOCK_SIZE]);
void AudioBlockPanStereoToStereo_NEON(
const float aInputL[WEBAUDIO_BLOCK_SIZE],
const float aInputR[WEBAUDIO_BLOCK_SIZE],
const float aGainL[WEBAUDIO_BLOCK_SIZE],
const float aGainR[WEBAUDIO_BLOCK_SIZE],
const bool aIsOnTheLeft[WEBAUDIO_BLOCK_SIZE],
float aOutputL[WEBAUDIO_BLOCK_SIZE], float aOutputR[WEBAUDIO_BLOCK_SIZE]);
} // namespace mozilla
#endif /* MOZILLA_AUDIONODEENGINENEON_H_ */

Просмотреть файл

@ -3,361 +3,8 @@
* license, v. 2.0. if a copy of the mpl was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "AudioNodeEngineSSE2.h"
#include "AlignmentUtils.h"
#include <emmintrin.h>
#include "AudioNodeEngineGeneric.h"
namespace mozilla {
void AudioBufferAddWithScale_SSE(const float* aInput, float aScale,
float* aOutput, uint32_t aSize) {
__m128 vin0, vin1, vin2, vin3, vscaled0, vscaled1, vscaled2, vscaled3, vout0,
vout1, vout2, vout3, vgain;
ASSERT_ALIGNED16(aInput);
ASSERT_ALIGNED16(aOutput);
ASSERT_MULTIPLE16(aSize);
vgain = _mm_load1_ps(&aScale);
for (unsigned i = 0; i < aSize; i += 16) {
vin0 = _mm_load_ps(&aInput[i]);
vin1 = _mm_load_ps(&aInput[i + 4]);
vin2 = _mm_load_ps(&aInput[i + 8]);
vin3 = _mm_load_ps(&aInput[i + 12]);
vscaled0 = _mm_mul_ps(vin0, vgain);
vscaled1 = _mm_mul_ps(vin1, vgain);
vscaled2 = _mm_mul_ps(vin2, vgain);
vscaled3 = _mm_mul_ps(vin3, vgain);
vin0 = _mm_load_ps(&aOutput[i]);
vin1 = _mm_load_ps(&aOutput[i + 4]);
vin2 = _mm_load_ps(&aOutput[i + 8]);
vin3 = _mm_load_ps(&aOutput[i + 12]);
vout0 = _mm_add_ps(vin0, vscaled0);
vout1 = _mm_add_ps(vin1, vscaled1);
vout2 = _mm_add_ps(vin2, vscaled2);
vout3 = _mm_add_ps(vin3, vscaled3);
_mm_store_ps(&aOutput[i], vout0);
_mm_store_ps(&aOutput[i + 4], vout1);
_mm_store_ps(&aOutput[i + 8], vout2);
_mm_store_ps(&aOutput[i + 12], vout3);
}
}
void AudioBlockCopyChannelWithScale_SSE(const float* aInput, float aScale,
float* aOutput) {
__m128 vin0, vin1, vin2, vin3, vout0, vout1, vout2, vout3;
ASSERT_ALIGNED16(aInput);
ASSERT_ALIGNED16(aOutput);
__m128 vgain = _mm_load1_ps(&aScale);
for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 16) {
vin0 = _mm_load_ps(&aInput[i]);
vin1 = _mm_load_ps(&aInput[i + 4]);
vin2 = _mm_load_ps(&aInput[i + 8]);
vin3 = _mm_load_ps(&aInput[i + 12]);
vout0 = _mm_mul_ps(vin0, vgain);
vout1 = _mm_mul_ps(vin1, vgain);
vout2 = _mm_mul_ps(vin2, vgain);
vout3 = _mm_mul_ps(vin3, vgain);
_mm_store_ps(&aOutput[i], vout0);
_mm_store_ps(&aOutput[i + 4], vout1);
_mm_store_ps(&aOutput[i + 8], vout2);
_mm_store_ps(&aOutput[i + 12], vout3);
}
}
void AudioBlockCopyChannelWithScale_SSE(const float aInput[WEBAUDIO_BLOCK_SIZE],
const float aScale[WEBAUDIO_BLOCK_SIZE],
float aOutput[WEBAUDIO_BLOCK_SIZE]) {
__m128 vin0, vin1, vin2, vin3, vscaled0, vscaled1, vscaled2, vscaled3, vout0,
vout1, vout2, vout3;
ASSERT_ALIGNED16(aInput);
ASSERT_ALIGNED16(aScale);
ASSERT_ALIGNED16(aOutput);
for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 16) {
vscaled0 = _mm_load_ps(&aScale[i]);
vscaled1 = _mm_load_ps(&aScale[i + 4]);
vscaled2 = _mm_load_ps(&aScale[i + 8]);
vscaled3 = _mm_load_ps(&aScale[i + 12]);
vin0 = _mm_load_ps(&aInput[i]);
vin1 = _mm_load_ps(&aInput[i + 4]);
vin2 = _mm_load_ps(&aInput[i + 8]);
vin3 = _mm_load_ps(&aInput[i + 12]);
vout0 = _mm_mul_ps(vin0, vscaled0);
vout1 = _mm_mul_ps(vin1, vscaled1);
vout2 = _mm_mul_ps(vin2, vscaled2);
vout3 = _mm_mul_ps(vin3, vscaled3);
_mm_store_ps(&aOutput[i], vout0);
_mm_store_ps(&aOutput[i + 4], vout1);
_mm_store_ps(&aOutput[i + 8], vout2);
_mm_store_ps(&aOutput[i + 12], vout3);
}
}
void AudioBufferInPlaceScale_SSE(float* aBlock, float aScale, uint32_t aSize) {
__m128 vout0, vout1, vout2, vout3, vin0, vin1, vin2, vin3;
ASSERT_ALIGNED16(aBlock);
ASSERT_MULTIPLE16(aSize);
__m128 vgain = _mm_load1_ps(&aScale);
for (unsigned i = 0; i < aSize; i += 16) {
vin0 = _mm_load_ps(&aBlock[i]);
vin1 = _mm_load_ps(&aBlock[i + 4]);
vin2 = _mm_load_ps(&aBlock[i + 8]);
vin3 = _mm_load_ps(&aBlock[i + 12]);
vout0 = _mm_mul_ps(vin0, vgain);
vout1 = _mm_mul_ps(vin1, vgain);
vout2 = _mm_mul_ps(vin2, vgain);
vout3 = _mm_mul_ps(vin3, vgain);
_mm_store_ps(&aBlock[i], vout0);
_mm_store_ps(&aBlock[i + 4], vout1);
_mm_store_ps(&aBlock[i + 8], vout2);
_mm_store_ps(&aBlock[i + 12], vout3);
}
}
void AudioBufferInPlaceScale_SSE(float* aBlock, float* aScale, uint32_t aSize) {
__m128 vout0, vout1, vout2, vout3, vgain0, vgain1, vgain2, vgain3, vin0, vin1,
vin2, vin3;
ASSERT_ALIGNED16(aBlock);
ASSERT_MULTIPLE16(aSize);
for (unsigned i = 0; i < aSize; i += 16) {
vin0 = _mm_load_ps(&aBlock[i]);
vin1 = _mm_load_ps(&aBlock[i + 4]);
vin2 = _mm_load_ps(&aBlock[i + 8]);
vin3 = _mm_load_ps(&aBlock[i + 12]);
vgain0 = _mm_load_ps(&aScale[i]);
vgain1 = _mm_load_ps(&aScale[i + 4]);
vgain2 = _mm_load_ps(&aScale[i + 8]);
vgain3 = _mm_load_ps(&aScale[i + 12]);
vout0 = _mm_mul_ps(vin0, vgain0);
vout1 = _mm_mul_ps(vin1, vgain1);
vout2 = _mm_mul_ps(vin2, vgain2);
vout3 = _mm_mul_ps(vin3, vgain3);
_mm_store_ps(&aBlock[i], vout0);
_mm_store_ps(&aBlock[i + 4], vout1);
_mm_store_ps(&aBlock[i + 8], vout2);
_mm_store_ps(&aBlock[i + 12], vout3);
}
}
void AudioBlockPanStereoToStereo_SSE(const float aInputL[WEBAUDIO_BLOCK_SIZE],
const float aInputR[WEBAUDIO_BLOCK_SIZE],
float aGainL, float aGainR,
bool aIsOnTheLeft,
float aOutputL[WEBAUDIO_BLOCK_SIZE],
float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
__m128 vinl0, vinr0, vinl1, vinr1, vout0, vout1, vscaled0, vscaled1, vgainl,
vgainr;
ASSERT_ALIGNED16(aInputL);
ASSERT_ALIGNED16(aInputR);
ASSERT_ALIGNED16(aOutputL);
ASSERT_ALIGNED16(aOutputR);
vgainl = _mm_load1_ps(&aGainL);
vgainr = _mm_load1_ps(&aGainR);
if (aIsOnTheLeft) {
for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
vinl0 = _mm_load_ps(&aInputL[i]);
vinr0 = _mm_load_ps(&aInputR[i]);
vinl1 = _mm_load_ps(&aInputL[i + 4]);
vinr1 = _mm_load_ps(&aInputR[i + 4]);
/* left channel : aOutputL = aInputL + aInputR * gainL */
vscaled0 = _mm_mul_ps(vinr0, vgainl);
vscaled1 = _mm_mul_ps(vinr1, vgainl);
vout0 = _mm_add_ps(vscaled0, vinl0);
vout1 = _mm_add_ps(vscaled1, vinl1);
_mm_store_ps(&aOutputL[i], vout0);
_mm_store_ps(&aOutputL[i + 4], vout1);
/* right channel : aOutputR = aInputR * gainR */
vscaled0 = _mm_mul_ps(vinr0, vgainr);
vscaled1 = _mm_mul_ps(vinr1, vgainr);
_mm_store_ps(&aOutputR[i], vscaled0);
_mm_store_ps(&aOutputR[i + 4], vscaled1);
}
} else {
for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
vinl0 = _mm_load_ps(&aInputL[i]);
vinr0 = _mm_load_ps(&aInputR[i]);
vinl1 = _mm_load_ps(&aInputL[i + 4]);
vinr1 = _mm_load_ps(&aInputR[i + 4]);
/* left channel : aInputL * gainL */
vscaled0 = _mm_mul_ps(vinl0, vgainl);
vscaled1 = _mm_mul_ps(vinl1, vgainl);
_mm_store_ps(&aOutputL[i], vscaled0);
_mm_store_ps(&aOutputL[i + 4], vscaled1);
/* right channel: aOutputR = aInputR + aInputL * gainR */
vscaled0 = _mm_mul_ps(vinl0, vgainr);
vscaled1 = _mm_mul_ps(vinl1, vgainr);
vout0 = _mm_add_ps(vscaled0, vinr0);
vout1 = _mm_add_ps(vscaled1, vinr1);
_mm_store_ps(&aOutputR[i], vout0);
_mm_store_ps(&aOutputR[i + 4], vout1);
}
}
}
void BufferComplexMultiply_SSE(const float* aInput, const float* aScale,
float* aOutput, uint32_t aSize) {
unsigned i;
__m128 in0, in1, in2, in3, outreal0, outreal1, outreal2, outreal3, outimag0,
outimag1, outimag2, outimag3;
ASSERT_ALIGNED16(aInput);
ASSERT_ALIGNED16(aScale);
ASSERT_ALIGNED16(aOutput);
ASSERT_MULTIPLE16(aSize);
for (i = 0; i < aSize * 2; i += 16) {
in0 = _mm_load_ps(&aInput[i]);
in1 = _mm_load_ps(&aInput[i + 4]);
in2 = _mm_load_ps(&aInput[i + 8]);
in3 = _mm_load_ps(&aInput[i + 12]);
outreal0 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(2, 0, 2, 0));
outimag0 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(3, 1, 3, 1));
outreal2 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(2, 0, 2, 0));
outimag2 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(3, 1, 3, 1));
in0 = _mm_load_ps(&aScale[i]);
in1 = _mm_load_ps(&aScale[i + 4]);
in2 = _mm_load_ps(&aScale[i + 8]);
in3 = _mm_load_ps(&aScale[i + 12]);
outreal1 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(2, 0, 2, 0));
outimag1 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(3, 1, 3, 1));
outreal3 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(2, 0, 2, 0));
outimag3 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(3, 1, 3, 1));
in0 = _mm_sub_ps(_mm_mul_ps(outreal0, outreal1),
_mm_mul_ps(outimag0, outimag1));
in1 = _mm_add_ps(_mm_mul_ps(outreal0, outimag1),
_mm_mul_ps(outimag0, outreal1));
in2 = _mm_sub_ps(_mm_mul_ps(outreal2, outreal3),
_mm_mul_ps(outimag2, outimag3));
in3 = _mm_add_ps(_mm_mul_ps(outreal2, outimag3),
_mm_mul_ps(outimag2, outreal3));
outreal0 = _mm_unpacklo_ps(in0, in1);
outreal1 = _mm_unpackhi_ps(in0, in1);
outreal2 = _mm_unpacklo_ps(in2, in3);
outreal3 = _mm_unpackhi_ps(in2, in3);
_mm_store_ps(&aOutput[i], outreal0);
_mm_store_ps(&aOutput[i + 4], outreal1);
_mm_store_ps(&aOutput[i + 8], outreal2);
_mm_store_ps(&aOutput[i + 12], outreal3);
}
}
float AudioBufferSumOfSquares_SSE(const float* aInput, uint32_t aLength) {
unsigned i;
__m128 in0, in1, in2, in3, acc0, acc1, acc2, acc3;
float out[4];
ASSERT_ALIGNED16(aInput);
ASSERT_MULTIPLE16(aLength);
acc0 = _mm_setzero_ps();
acc1 = _mm_setzero_ps();
acc2 = _mm_setzero_ps();
acc3 = _mm_setzero_ps();
for (i = 0; i < aLength; i += 16) {
in0 = _mm_load_ps(&aInput[i]);
in1 = _mm_load_ps(&aInput[i + 4]);
in2 = _mm_load_ps(&aInput[i + 8]);
in3 = _mm_load_ps(&aInput[i + 12]);
in0 = _mm_mul_ps(in0, in0);
in1 = _mm_mul_ps(in1, in1);
in2 = _mm_mul_ps(in2, in2);
in3 = _mm_mul_ps(in3, in3);
acc0 = _mm_add_ps(acc0, in0);
acc1 = _mm_add_ps(acc1, in1);
acc2 = _mm_add_ps(acc2, in2);
acc3 = _mm_add_ps(acc3, in3);
}
acc0 = _mm_add_ps(acc0, acc1);
acc0 = _mm_add_ps(acc0, acc2);
acc0 = _mm_add_ps(acc0, acc3);
_mm_store_ps(out, acc0);
return out[0] + out[1] + out[2] + out[3];
}
void NaNToZeroInPlace_SSE(float* aSamples, size_t aCount) {
__m128 vin0, vin1, vin2, vin3;
__m128 vmask0, vmask1, vmask2, vmask3;
__m128 vout0, vout1, vout2, vout3;
float* samplesAligned16 = ALIGNED16(aSamples);
size_t leadingElementsScalar =
std::min(static_cast<size_t>(samplesAligned16 - aSamples), aCount);
size_t remainingElements = aCount - leadingElementsScalar;
size_t vectoredEnd = aCount - remainingElements % 16;
MOZ_ASSERT(!((vectoredEnd - leadingElementsScalar) % 16));
size_t i = 0;
for (; i < leadingElementsScalar; i++) {
if (aSamples[i] != aSamples[i]) {
aSamples[i] = 0.0;
}
}
ASSERT_ALIGNED16(&aSamples[i]);
for (; i < vectoredEnd; i += 16) {
vin0 = _mm_load_ps(&aSamples[i + 0]);
vin1 = _mm_load_ps(&aSamples[i + 4]);
vin2 = _mm_load_ps(&aSamples[i + 8]);
vin3 = _mm_load_ps(&aSamples[i + 12]);
vmask0 = _mm_cmpord_ps(vin0, vin0);
vmask1 = _mm_cmpord_ps(vin1, vin1);
vmask2 = _mm_cmpord_ps(vin2, vin2);
vmask3 = _mm_cmpord_ps(vin3, vin3);
vout0 = _mm_and_ps(vin0, vmask0);
vout1 = _mm_and_ps(vin1, vmask1);
vout2 = _mm_and_ps(vin2, vmask2);
vout3 = _mm_and_ps(vin3, vmask3);
_mm_store_ps(&aSamples[i + 0], vout0);
_mm_store_ps(&aSamples[i + 4], vout1);
_mm_store_ps(&aSamples[i + 8], vout2);
_mm_store_ps(&aSamples[i + 12], vout3);
}
for (; i < aCount; i++) {
if (aSamples[i] != aSamples[i]) {
aSamples[i] = 0.0;
}
}
}
template struct Engine<xsimd::sse2>;
} // namespace mozilla

Просмотреть файл

@ -1,35 +0,0 @@
/* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* this source code form is subject to the terms of the mozilla public
* license, v. 2.0. if a copy of the mpl was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "AudioNodeEngine.h"
namespace mozilla {
void AudioBufferAddWithScale_SSE(const float* aInput, float aScale,
float* aOutput, uint32_t aSize);
void AudioBlockCopyChannelWithScale_SSE(const float* aInput, float aScale,
float* aOutput);
void AudioBlockCopyChannelWithScale_SSE(const float aInput[WEBAUDIO_BLOCK_SIZE],
const float aScale[WEBAUDIO_BLOCK_SIZE],
float aOutput[WEBAUDIO_BLOCK_SIZE]);
void AudioBufferInPlaceScale_SSE(float* aBlock, float aScale, uint32_t aSize);
void AudioBufferInPlaceScale_SSE(float* aBlock, float* aScale, uint32_t aSize);
void AudioBlockPanStereoToStereo_SSE(const float aInputL[WEBAUDIO_BLOCK_SIZE],
const float aInputR[WEBAUDIO_BLOCK_SIZE],
float aGainL, float aGainR,
bool aIsOnTheLeft,
float aOutputL[WEBAUDIO_BLOCK_SIZE],
float aOutputR[WEBAUDIO_BLOCK_SIZE]);
float AudioBufferSumOfSquares_SSE(const float* aInput, uint32_t aLength);
void BufferComplexMultiply_SSE(const float* aInput, const float* aScale,
float* aOutput, uint32_t aSize);
void NaNToZeroInPlace_SSE(float* aSamples, size_t aCount);
} // namespace mozilla

Просмотреть файл

@ -127,6 +127,7 @@ UNIFIED_SOURCES += [
if CONFIG["CPU_ARCH"] == "aarch64" or CONFIG["BUILD_ARM_NEON"]:
DEFINES["USE_NEON"] = True
LOCAL_INCLUDES += ["/third_party/xsimd/include"]
SOURCES += ["AudioNodeEngineNEON.cpp"]
SOURCES["AudioNodeEngineNEON.cpp"].flags += CONFIG["NEON_FLAGS"]
if CONFIG["BUILD_ARM_NEON"]:
@ -136,6 +137,7 @@ if CONFIG["CPU_ARCH"] == "aarch64" or CONFIG["BUILD_ARM_NEON"]:
if CONFIG["INTEL_ARCHITECTURE"]:
SOURCES += ["AudioNodeEngineSSE2.cpp"]
DEFINES["USE_SSE2"] = True
LOCAL_INCLUDES += ["/third_party/xsimd/include"]
SOURCES["AudioNodeEngineSSE2.cpp"].flags += CONFIG["SSE2_FLAGS"]
include("/ipc/chromium/chromium-config.mozbuild")