зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1845651 - Part 2. Convert OpenMAX DL assembler to clang compatible. r=padenot
Newer Android NDKs have no GNU assembler, so I converted the OpenMAX DL sources in Gecko to clang-compatible assembler. Gecko's OpenMAX DL was imported from the Chromium tree, but it has since been removed from Chromium (https://crbug.com/917355), so there is no upstream code now. Also, when MOZ_LIBAV_FF is used on arm32 builds, it is slower than OpenMAX DL. Convolution reverb - https://padenot.github.io/webaudio-benchmark/index.html - OpenMAX DL ... 16x - MOZ_LIBAV_FF ... 13x Depends on D184717 Differential Revision: https://phabricator.services.mozilla.com/D184719
This commit is contained in:
Родитель
98a5909e60
Коммит
ccf77507b4
|
@ -168,10 +168,13 @@
|
|||
|
||||
@ Define the function and make it external.
|
||||
.global \name
|
||||
#ifndef __clang__
|
||||
.func \name
|
||||
#endif
|
||||
.section .text.\name,"ax",%progbits
|
||||
.arch armv7-a
|
||||
.fpu neon
|
||||
.syntax unified
|
||||
.object_arch armv4
|
||||
.align 2
|
||||
\name :
|
||||
|
@ -203,7 +206,9 @@
|
|||
@ Restore any saved R or D registers.
|
||||
_M_RET
|
||||
.fnend
|
||||
#ifndef __clang__
|
||||
.endfunc
|
||||
#endif
|
||||
@ Reset the global stack tracking variables back to their
|
||||
@ initial values.
|
||||
.set _SBytes, 0
|
||||
|
|
|
@ -47,9 +47,3 @@ if CONFIG['CPU_ARCH'] == 'arm' and CONFIG['BUILD_ARM_NEON']:
|
|||
DEFINES['BIG_FFT_TABLE'] = True
|
||||
|
||||
FINAL_LIBRARY = 'xul'
|
||||
|
||||
if CONFIG['CC_TYPE'] == 'clang':
|
||||
ASFLAGS += [
|
||||
'-no-integrated-as',
|
||||
]
|
||||
|
||||
|
|
|
@ -85,40 +85,40 @@
|
|||
|
||||
@// Neon registers
|
||||
|
||||
#define dX0 D0.F32
|
||||
#define dShift D1.F32
|
||||
#define dX1 D1.F32
|
||||
#define dY0 D2.F32
|
||||
#define dY1 D3.F32
|
||||
#define dX0r D0.F32
|
||||
#define dX0i D1.F32
|
||||
#define dX1r D2.F32
|
||||
#define dX1i D3.F32
|
||||
#define dW0r D4.F32
|
||||
#define dW0i D5.F32
|
||||
#define dW1r D6.F32
|
||||
#define dW1i D7.F32
|
||||
#define dT0 D8.F32
|
||||
#define dT1 D9.F32
|
||||
#define dT2 D10.F32
|
||||
#define dT3 D11.F32
|
||||
#define qT0 D12.F32
|
||||
#define qT1 D14.F32
|
||||
#define qT2 D16.F32
|
||||
#define qT3 D18.F32
|
||||
#define dY0r D4.F32
|
||||
#define dY0i D5.F32
|
||||
#define dY1r D6.F32
|
||||
#define dY1i D7.F32
|
||||
#define dX0 D0
|
||||
#define dShift D1
|
||||
#define dX1 D1
|
||||
#define dY0 D2
|
||||
#define dY1 D3
|
||||
#define dX0r D0
|
||||
#define dX0i D1
|
||||
#define dX1r D2
|
||||
#define dX1i D3
|
||||
#define dW0r D4
|
||||
#define dW0i D5
|
||||
#define dW1r D6
|
||||
#define dW1i D7
|
||||
#define dT0 D8
|
||||
#define dT1 D9
|
||||
#define dT2 D10
|
||||
#define dT3 D11
|
||||
#define qT0 D12
|
||||
#define qT1 D14
|
||||
#define qT2 D16
|
||||
#define qT3 D18
|
||||
#define dY0r D4
|
||||
#define dY0i D5
|
||||
#define dY1r D6
|
||||
#define dY1i D7
|
||||
|
||||
#define dY2 D4.F32
|
||||
#define dY3 D5.F32
|
||||
#define dW0 D6.F32
|
||||
#define dW1 D7.F32
|
||||
#define dW0Tmp D10.F32
|
||||
#define dW1Neg D11.F32
|
||||
#define dY2 D4
|
||||
#define dY3 D5
|
||||
#define dW0 D6
|
||||
#define dW1 D7
|
||||
#define dW0Tmp D10
|
||||
#define dW1Neg D11
|
||||
|
||||
#define half D13.F32
|
||||
#define half D13
|
||||
|
||||
@ Structure offsets for the FFTSpec
|
||||
.set ARMsFFTSpec_N, 0
|
||||
|
@ -135,7 +135,7 @@
|
|||
LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
|
||||
LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
|
||||
|
||||
VMOV half, 0.5
|
||||
VMOV.F32 half, #0.5
|
||||
|
||||
|
||||
MOV size,N,ASR #1 @// preserve the contents of N
|
||||
|
@ -149,33 +149,33 @@
|
|||
@// Z(0) : no need of twiddle multiply
|
||||
@// Z(0) = 1/2 { [F(0) + F'(N/2)] +j [F(0) - F'(N/2)] }
|
||||
|
||||
VLD1 dX0,[pSrc],step
|
||||
VLD1.F32 dX0,[pSrc],step
|
||||
ADD pOut1,pOut,step @// pOut1 = pOut+ N/2*8 bytes
|
||||
|
||||
VLD1 dX1,[pSrc]!
|
||||
VLD1.F32 dX1,[pSrc]!
|
||||
@// twStep = 3N/8 * 8 bytes pointing to W^1
|
||||
SUB twStep,step,size,LSL #1
|
||||
|
||||
MOV step1,size,LSL #2 @// step1 = N/4 * 8 = N/2*4 bytes
|
||||
SUB step1,step1,#8 @// (N/4-1)*8 bytes
|
||||
|
||||
VADD dY0,dX0,dX1 @// [b+d | a+c]
|
||||
VSUB dY1,dX0,dX1 @// [b-d | a-c]
|
||||
VMUL dY0, dY0, half[0]
|
||||
VMUL dY1, dY1, half[0]
|
||||
VADD.F32 dY0,dX0,dX1 @// [b+d | a+c]
|
||||
VSUB.F32 dY1,dX0,dX1 @// [b-d | a-c]
|
||||
VMUL.F32 dY0, dY0, half[0]
|
||||
VMUL.F32 dY1, dY1, half[0]
|
||||
|
||||
@// dY0= [a-c | a+c] ;dY1= [b-d | b+d]
|
||||
VZIP dY0,dY1
|
||||
VZIP.F32 dY0,dY1
|
||||
|
||||
VSUB dX0,dY0,dY1
|
||||
VSUB.F32 dX0,dY0,dY1
|
||||
SUBS size,size,#2
|
||||
VADD dX1,dY0,dY1
|
||||
VADD.F32 dX1,dY0,dY1
|
||||
|
||||
SUB pSrc,pSrc,step
|
||||
|
||||
VST1 dX0[0],[pOut1]!
|
||||
VST1.F32 dX0[0],[pOut1]!
|
||||
ADD pTwiddleTmp,pTwiddle,#8 @// W^2
|
||||
VST1 dX1[1],[pOut1]!
|
||||
VST1.F32 dX1[1],[pOut1]!
|
||||
ADD argTwiddle1,pTwiddle,twStep @// W^1
|
||||
|
||||
|
||||
|
@ -195,65 +195,65 @@
|
|||
evenOddButterflyLoop\name :
|
||||
|
||||
|
||||
VLD1 dW0r,[argTwiddle1],step1
|
||||
VLD1 dW1r,[argTwiddle1]!
|
||||
VLD1.F32 dW0r,[argTwiddle1],step1
|
||||
VLD1.F32 dW1r,[argTwiddle1]!
|
||||
|
||||
VLD2 {dX0r,dX0i},[pSrc],step
|
||||
VLD2.F32 {dX0r,dX0i},[pSrc],step
|
||||
SUB argTwiddle1,argTwiddle1,step1
|
||||
VLD2 {dX1r,dX1i},[pSrc]!
|
||||
VLD2.F32 {dX1r,dX1i},[pSrc]!
|
||||
|
||||
SUB step1,step1,#8 @// (N/4-2)*8 bytes
|
||||
VLD1 dW0i,[pTwiddleTmp],step1
|
||||
VLD1 dW1i,[pTwiddleTmp]!
|
||||
VLD1.F32 dW0i,[pTwiddleTmp],step1
|
||||
VLD1.F32 dW1i,[pTwiddleTmp]!
|
||||
SUB pSrc,pSrc,step
|
||||
|
||||
SUB pTwiddleTmp,pTwiddleTmp,step1
|
||||
VREV64 dX1r,dX1r
|
||||
VREV64 dX1i,dX1i
|
||||
VREV64.F32 dX1r,dX1r
|
||||
VREV64.F32 dX1i,dX1i
|
||||
SUBS size,size,#4
|
||||
|
||||
|
||||
VSUB dT2,dX0r,dX1r @// a-c
|
||||
VADD dT3,dX0i,dX1i @// b+d
|
||||
VADD dT0,dX0r,dX1r @// a+c
|
||||
VSUB dT1,dX0i,dX1i @// b-d
|
||||
VSUB.F32 dT2,dX0r,dX1r @// a-c
|
||||
VADD.F32 dT3,dX0i,dX1i @// b+d
|
||||
VADD.F32 dT0,dX0r,dX1r @// a+c
|
||||
VSUB.F32 dT1,dX0i,dX1i @// b-d
|
||||
SUB step1,step1,#8
|
||||
|
||||
VMUL dT2, dT2, half[0]
|
||||
VMUL dT3, dT3, half[0]
|
||||
VMUL.F32 dT2, dT2, half[0]
|
||||
VMUL.F32 dT3, dT3, half[0]
|
||||
|
||||
VMUL dT0, dT0, half[0]
|
||||
VMUL dT1, dT1, half[0]
|
||||
VMUL.F32 dT0, dT0, half[0]
|
||||
VMUL.F32 dT1, dT1, half[0]
|
||||
|
||||
VZIP dW1r,dW1i
|
||||
VZIP dW0r,dW0i
|
||||
VZIP.F32 dW1r,dW1i
|
||||
VZIP.F32 dW0r,dW0i
|
||||
|
||||
|
||||
VMUL dX1r,dW1r,dT2
|
||||
VMUL dX1i,dW1r,dT3
|
||||
VMUL dX0r,dW0r,dT2
|
||||
VMUL dX0i,dW0r,dT3
|
||||
VMUL.F32 dX1r,dW1r,dT2
|
||||
VMUL.F32 dX1i,dW1r,dT3
|
||||
VMUL.F32 dX0r,dW0r,dT2
|
||||
VMUL.F32 dX0i,dW0r,dT3
|
||||
|
||||
VMLS dX1r,dW1i,dT3
|
||||
VMLA dX1i,dW1i,dT2
|
||||
VMLS.F32 dX1r,dW1i,dT3
|
||||
VMLA.F32 dX1i,dW1i,dT2
|
||||
|
||||
VMLA dX0r,dW0i,dT3
|
||||
VMLS dX0i,dW0i,dT2
|
||||
VMLA.F32 dX0r,dW0i,dT3
|
||||
VMLS.F32 dX0i,dW0i,dT2
|
||||
|
||||
|
||||
VADD dY1r,dT0,dX1i @// F(N/2 -1)
|
||||
VSUB dY1i,dX1r,dT1
|
||||
VADD.F32 dY1r,dT0,dX1i @// F(N/2 -1)
|
||||
VSUB.F32 dY1i,dX1r,dT1
|
||||
|
||||
VREV64 dY1r,dY1r
|
||||
VREV64 dY1i,dY1i
|
||||
VREV64.F32 dY1r,dY1r
|
||||
VREV64.F32 dY1i,dY1i
|
||||
|
||||
|
||||
VADD dY0r,dT0,dX0i @// F(1)
|
||||
VSUB dY0i,dT1,dX0r
|
||||
VADD.F32 dY0r,dT0,dX0i @// F(1)
|
||||
VSUB.F32 dY0i,dT1,dX0r
|
||||
|
||||
|
||||
VST2 {dY0r,dY0i},[pOut1],step
|
||||
VST2 {dY1r,dY1i},[pOut1]!
|
||||
VST2.F32 {dY0r,dY0i},[pOut1],step
|
||||
VST2.F32 {dY1r,dY1i},[pOut1]!
|
||||
SUB pOut1,pOut1,step
|
||||
SUB step,step,#32 @// (N/2-4)*8 bytes
|
||||
|
||||
|
@ -274,11 +274,11 @@ evenOddButterflyLoop\name :
|
|||
@// Since (c,d) = (0,1) for the last element, result is just (a,-b)
|
||||
|
||||
lastElement\name :
|
||||
VLD1 dX0r,[pSrc]
|
||||
VLD1.F32 dX0r,[pSrc]
|
||||
|
||||
VST1 dX0r[0],[pOut1]!
|
||||
VNEG dX0r,dX0r
|
||||
VST1 dX0r[1],[pOut1]
|
||||
VST1.F32 dX0r[0],[pOut1]!
|
||||
VNEG.F32 dX0r,dX0r
|
||||
VST1.F32 dX0r[1],[pOut1]
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -67,10 +67,10 @@
|
|||
|
||||
@// Neon Registers
|
||||
|
||||
#define dX0 D0.F32
|
||||
#define dX1 D1.F32
|
||||
#define dY0 D2.F32
|
||||
#define dY1 D3.F32
|
||||
#define dX0 D0
|
||||
#define dX1 D1
|
||||
#define dY0 D2
|
||||
#define dY1 D3
|
||||
|
||||
|
||||
.MACRO FFTSTAGE scaled, inverse, name
|
||||
|
@ -99,16 +99,16 @@
|
|||
|
||||
grpZeroSetLoop\name :
|
||||
|
||||
VLD1 dX0,[pSrc],pointStep
|
||||
VLD1 dX1,[pSrc],step @// step = -pointStep + 8
|
||||
VLD1.F32 dX0,[pSrc],pointStep
|
||||
VLD1.F32 dX1,[pSrc],step @// step = -pointStep + 8
|
||||
SUBS setCount,setCount,#1
|
||||
|
||||
VADD dY0,dX0,dX1
|
||||
VSUB dY1,dX0,dX1
|
||||
VADD.F32 dY0,dX0,dX1
|
||||
VSUB.F32 dY1,dX0,dX1
|
||||
|
||||
VST1 dY0,[pDst],outPointStep
|
||||
VST1.F32 dY0,[pDst],outPointStep
|
||||
@// dstStep = step = -pointStep + 8
|
||||
VST1 dY1,[pDst],dstStep
|
||||
VST1.F32 dY1,[pDst],dstStep
|
||||
|
||||
BGT grpZeroSetLoop\name
|
||||
|
||||
|
|
|
@ -60,18 +60,18 @@
|
|||
|
||||
@// Neon Registers
|
||||
|
||||
#define dWr d0.f32
|
||||
#define dWi d1.f32
|
||||
#define dXr0 d2.f32
|
||||
#define dXi0 d3.f32
|
||||
#define dXr1 d4.f32
|
||||
#define dXi1 d5.f32
|
||||
#define dYr0 d6.f32
|
||||
#define dYi0 d7.f32
|
||||
#define dYr1 d8.f32
|
||||
#define dYi1 d9.f32
|
||||
#define qT0 d10.f32
|
||||
#define qT1 d12.f32
|
||||
#define dWr d0
|
||||
#define dWi d1
|
||||
#define dXr0 d2
|
||||
#define dXi0 d3
|
||||
#define dXr1 d4
|
||||
#define dXi1 d5
|
||||
#define dYr0 d6
|
||||
#define dYi0 d7
|
||||
#define dYr1 d8
|
||||
#define dYi1 d9
|
||||
#define qT0 d10
|
||||
#define qT1 d12
|
||||
|
||||
.MACRO FFTSTAGE scaled, inverse, name
|
||||
|
||||
|
@ -93,37 +93,37 @@
|
|||
radix2lsGrpLoop\name :
|
||||
@ dWr = [pTwiddle[0].Re, pTwiddle[1].Re]
|
||||
@ dWi = [pTwiddle[0].Im, pTwiddle[1].Im]
|
||||
VLD2 {dWr,dWi},[pTwiddle, :64]!
|
||||
VLD2.F32 {dWr,dWi},[pTwiddle, :64]!
|
||||
|
||||
@ dXr0 = [pSrc[0].Re, pSrc[2].Re]
|
||||
@ dXi0 = [pSrc[0].Im, pSrc[2].Im]
|
||||
@ dXr1 = [pSrc[1].Re, pSrc[3].Re]
|
||||
@ dXi1 = [pSrc[1].Im, pSrc[3].Im]
|
||||
VLD4 {dXr0,dXi0,dXr1,dXi1},[pSrc, :128]!
|
||||
VLD4.F32 {dXr0,dXi0,dXr1,dXi1},[pSrc, :128]!
|
||||
SUBS grpCount,grpCount,#4 @// grpCount is multiplied by 2
|
||||
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
VMUL qT0,dWr,dXr1
|
||||
VMLA qT0,dWi,dXi1 @// real part
|
||||
VMUL qT1,dWr,dXi1
|
||||
VMLS qT1,dWi,dXr1 @// imag part
|
||||
VMUL.F32 qT0,dWr,dXr1
|
||||
VMLA.F32 qT0,dWi,dXi1 @// real part
|
||||
VMUL.F32 qT1,dWr,dXi1
|
||||
VMLS.F32 qT1,dWi,dXr1 @// imag part
|
||||
|
||||
.else
|
||||
|
||||
VMUL qT0,dWr,dXr1
|
||||
VMLS qT0,dWi,dXi1 @// real part
|
||||
VMUL qT1,dWr,dXi1
|
||||
VMLA qT1,dWi,dXr1 @// imag part
|
||||
VMUL.F32 qT0,dWr,dXr1
|
||||
VMLS.F32 qT0,dWi,dXi1 @// real part
|
||||
VMUL.F32 qT1,dWr,dXi1
|
||||
VMLA.F32 qT1,dWi,dXr1 @// imag part
|
||||
|
||||
.endif
|
||||
|
||||
VSUB dYr0,dXr0,qT0
|
||||
VSUB dYi0,dXi0,qT1
|
||||
VADD dYr1,dXr0,qT0
|
||||
VADD dYi1,dXi0,qT1
|
||||
VSUB.F32 dYr0,dXr0,qT0
|
||||
VSUB.F32 dYi0,dXi0,qT1
|
||||
VADD.F32 dYr1,dXr0,qT0
|
||||
VADD.F32 dYi1,dXi0,qT1
|
||||
|
||||
VST2 {dYr0,dYi0},[pDst],outPointStep
|
||||
VST2 {dYr1,dYi1},[pDst],dstStep @// dstStep = step = -outPointStep + 16
|
||||
VST2.F32 {dYr0,dYi0},[pDst],outPointStep
|
||||
VST2.F32 {dYr1,dYi1},[pDst],dstStep @// dstStep = step = -outPointStep + 16
|
||||
|
||||
BGT radix2lsGrpLoop\name
|
||||
|
||||
|
|
|
@ -69,17 +69,17 @@
|
|||
|
||||
@// Neon Registers
|
||||
|
||||
#define dW D0.F32
|
||||
#define dX0 D2.F32
|
||||
#define dX1 D3.F32
|
||||
#define dX2 D4.F32
|
||||
#define dX3 D5.F32
|
||||
#define dY0 D6.F32
|
||||
#define dY1 D7.F32
|
||||
#define dY2 D8.F32
|
||||
#define dY3 D9.F32
|
||||
#define qT0 D10.F32
|
||||
#define qT1 D11.F32
|
||||
#define dW D0
|
||||
#define dX0 D2
|
||||
#define dX1 D3
|
||||
#define dX2 D4
|
||||
#define dX3 D5
|
||||
#define dY0 D6
|
||||
#define dY1 D7
|
||||
#define dY2 D8
|
||||
#define dY3 D9
|
||||
#define qT0 D10
|
||||
#define qT1 D11
|
||||
|
||||
|
||||
.MACRO FFTSTAGE scaled, inverse, name
|
||||
|
@ -115,7 +115,7 @@
|
|||
|
||||
radix2GrpLoop\name :
|
||||
MOV setCount,pointStep,LSR #3
|
||||
VLD1 dW,[pTwiddle],pointStep @//[wi | wr]
|
||||
VLD1.F32 dW,[pTwiddle],pointStep @//[wi | wr]
|
||||
|
||||
|
||||
@// Loop on the sets
|
||||
|
@ -125,35 +125,35 @@ radix2SetLoop\name :
|
|||
|
||||
|
||||
@// point0: dX0-real part dX1-img part
|
||||
VLD2 {dX0,dX1},[pSrc],pointStep
|
||||
VLD2.F32 {dX0,dX1},[pSrc],pointStep
|
||||
@// point1: dX2-real part dX3-img part
|
||||
VLD2 {dX2,dX3},[pSrc],step
|
||||
VLD2.F32 {dX2,dX3},[pSrc],step
|
||||
|
||||
SUBS setCount,setCount,#2
|
||||
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
VMUL qT0,dX2,dW[0]
|
||||
VMLA qT0,dX3,dW[1] @// real part
|
||||
VMUL qT1,dX3,dW[0]
|
||||
VMLS qT1,dX2,dW[1] @// imag part
|
||||
VMUL.F32 qT0,dX2,dW[0]
|
||||
VMLA.F32 qT0,dX3,dW[1] @// real part
|
||||
VMUL.F32 qT1,dX3,dW[0]
|
||||
VMLS.F32 qT1,dX2,dW[1] @// imag part
|
||||
|
||||
.else
|
||||
|
||||
VMUL qT0,dX2,dW[0]
|
||||
VMLS qT0,dX3,dW[1] @// real part
|
||||
VMUL qT1,dX3,dW[0]
|
||||
VMLA qT1,dX2,dW[1] @// imag part
|
||||
VMUL.F32 qT0,dX2,dW[0]
|
||||
VMLS.F32 qT0,dX3,dW[1] @// real part
|
||||
VMUL.F32 qT1,dX3,dW[0]
|
||||
VMLA.F32 qT1,dX2,dW[1] @// imag part
|
||||
|
||||
.endif
|
||||
|
||||
VSUB dY0,dX0,qT0
|
||||
VSUB dY1,dX1,qT1
|
||||
VADD dY2,dX0,qT0
|
||||
VADD dY3,dX1,qT1
|
||||
VSUB.F32 dY0,dX0,qT0
|
||||
VSUB.F32 dY1,dX1,qT1
|
||||
VADD.F32 dY2,dX0,qT0
|
||||
VADD.F32 dY3,dX1,qT1
|
||||
|
||||
VST2 {dY0,dY1},[pDst],outPointStep
|
||||
VST2.F32 {dY0,dY1},[pDst],outPointStep
|
||||
@// dstStep = -outPointStep + 16
|
||||
VST2 {dY2,dY3},[pDst],dstStep
|
||||
VST2.F32 {dY2,dY3},[pDst],dstStep
|
||||
|
||||
BGT radix2SetLoop\name
|
||||
|
||||
|
|
|
@ -68,42 +68,42 @@
|
|||
|
||||
@// Neon Registers
|
||||
|
||||
#define dXr0 D0.F32
|
||||
#define dXi0 D1.F32
|
||||
#define dXr1 D2.F32
|
||||
#define dXi1 D3.F32
|
||||
#define dXr2 D4.F32
|
||||
#define dXi2 D5.F32
|
||||
#define dXr3 D6.F32
|
||||
#define dXi3 D7.F32
|
||||
#define dYr0 D8.F32
|
||||
#define dYi0 D9.F32
|
||||
#define dYr1 D10.F32
|
||||
#define dYi1 D11.F32
|
||||
#define dYr2 D12.F32
|
||||
#define dYi2 D13.F32
|
||||
#define dYr3 D14.F32
|
||||
#define dYi3 D15.F32
|
||||
#define qX0 Q0.F32
|
||||
#define qX1 Q1.F32
|
||||
#define qX2 Q2.F32
|
||||
#define qX3 Q3.F32
|
||||
#define qY0 Q4.F32
|
||||
#define qY1 Q5.F32
|
||||
#define qY2 Q6.F32
|
||||
#define qY3 Q7.F32
|
||||
#define dZr0 D16.F32
|
||||
#define dZi0 D17.F32
|
||||
#define dZr1 D18.F32
|
||||
#define dZi1 D19.F32
|
||||
#define dZr2 D20.F32
|
||||
#define dZi2 D21.F32
|
||||
#define dZr3 D22.F32
|
||||
#define dZi3 D23.F32
|
||||
#define qZ0 Q8.F32
|
||||
#define qZ1 Q9.F32
|
||||
#define qZ2 Q10.F32
|
||||
#define qZ3 Q11.F32
|
||||
#define dXr0 D0
|
||||
#define dXi0 D1
|
||||
#define dXr1 D2
|
||||
#define dXi1 D3
|
||||
#define dXr2 D4
|
||||
#define dXi2 D5
|
||||
#define dXr3 D6
|
||||
#define dXi3 D7
|
||||
#define dYr0 D8
|
||||
#define dYi0 D9
|
||||
#define dYr1 D10
|
||||
#define dYi1 D11
|
||||
#define dYr2 D12
|
||||
#define dYi2 D13
|
||||
#define dYr3 D14
|
||||
#define dYi3 D15
|
||||
#define qX0 Q0
|
||||
#define qX1 Q1
|
||||
#define qX2 Q2
|
||||
#define qX3 Q3
|
||||
#define qY0 Q4
|
||||
#define qY1 Q5
|
||||
#define qY2 Q6
|
||||
#define qY3 Q7
|
||||
#define dZr0 D16
|
||||
#define dZi0 D17
|
||||
#define dZr1 D18
|
||||
#define dZi1 D19
|
||||
#define dZr2 D20
|
||||
#define dZi2 D21
|
||||
#define dZr3 D22
|
||||
#define dZi3 D23
|
||||
#define qZ0 Q8
|
||||
#define qZ1 Q9
|
||||
#define qZ2 Q10
|
||||
#define qZ3 Q11
|
||||
|
||||
|
||||
.MACRO FFTSTAGE scaled, inverse, name
|
||||
|
@ -118,31 +118,31 @@
|
|||
|
||||
|
||||
@// Update pSubFFTSize and pSubFFTNum regs
|
||||
VLD2 {dXr0,dXi0},[pSrc, :128],pointStep @// data[0]
|
||||
VLD2.F32 {dXr0,dXi0},[pSrc, :128],pointStep @// data[0]
|
||||
@// subFFTSize = 1 for the first stage
|
||||
MOV subFFTSize,#4
|
||||
|
||||
@// Note: setCount = subFFTNum/4 (reuse the grpSize reg for setCount)
|
||||
LSR grpSize,subFFTNum,#2
|
||||
VLD2 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
|
||||
VLD2.F32 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
|
||||
MOV subFFTNum,grpSize
|
||||
|
||||
|
||||
@// Calculate the step of input data for the next set
|
||||
@//MOV setStep,pointStep,LSL #1
|
||||
MOV setStep,grpSize,LSL #4
|
||||
VLD2 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
|
||||
VLD2.F32 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
|
||||
@// setStep = 3*pointStep
|
||||
ADD setStep,setStep,pointStep
|
||||
@// setStep = - 3*pointStep+16
|
||||
RSB setStep,setStep,#16
|
||||
|
||||
@// data[3] & update pSrc for the next set
|
||||
VLD2 {dXr3,dXi3},[pSrc, :128],setStep
|
||||
VLD2.F32 {dXr3,dXi3},[pSrc, :128],setStep
|
||||
@// step1 = 2*pointStep
|
||||
MOV step1,pointStep,LSL #1
|
||||
|
||||
VADD qY0,qX0,qX2
|
||||
VADD.F32 qY0,qX0,qX2
|
||||
|
||||
@// step3 = -pointStep
|
||||
RSB step3,pointStep,#0
|
||||
|
@ -161,68 +161,68 @@ radix4fsGrpZeroSetLoop\name :
|
|||
@// finish first stage of 4 point FFT
|
||||
|
||||
|
||||
VSUB qY2,qX0,qX2
|
||||
VSUB.F32 qY2,qX0,qX2
|
||||
|
||||
VLD2 {dXr0,dXi0},[pSrc, :128],step1 @// data[0]
|
||||
VADD qY1,qX1,qX3
|
||||
VLD2 {dXr2,dXi2},[pSrc, :128],step3 @// data[2]
|
||||
VSUB qY3,qX1,qX3
|
||||
VLD2.F32 {dXr0,dXi0},[pSrc, :128],step1 @// data[0]
|
||||
VADD.F32 qY1,qX1,qX3
|
||||
VLD2.F32 {dXr2,dXi2},[pSrc, :128],step3 @// data[2]
|
||||
VSUB.F32 qY3,qX1,qX3
|
||||
|
||||
|
||||
@// finish second stage of 4 point FFT
|
||||
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
|
||||
VLD2 {dXr1,dXi1},[pSrc, :128],step1 @// data[1]
|
||||
VADD qZ0,qY0,qY1
|
||||
VLD2.F32 {dXr1,dXi1},[pSrc, :128],step1 @// data[1]
|
||||
VADD.F32 qZ0,qY0,qY1
|
||||
|
||||
@// data[3] & update pSrc for the next set, but not if it's the
|
||||
@// last iteration so that we don't read past the end of the
|
||||
@// input array.
|
||||
BEQ radix4SkipLastUpdateInv\name
|
||||
VLD2 {dXr3,dXi3},[pSrc, :128],setStep
|
||||
VLD2.F32 {dXr3,dXi3},[pSrc, :128],setStep
|
||||
radix4SkipLastUpdateInv\name:
|
||||
VSUB dZr3,dYr2,dYi3
|
||||
VSUB.F32 dZr3,dYr2,dYi3
|
||||
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VADD dZi3,dYi2,dYr3
|
||||
VST2.F32 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VADD.F32 dZi3,dYi2,dYr3
|
||||
|
||||
VSUB qZ1,qY0,qY1
|
||||
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
|
||||
VSUB.F32 qZ1,qY0,qY1
|
||||
VST2.F32 {dZr3,dZi3},[pDst, :128],outPointStep
|
||||
|
||||
VADD dZr2,dYr2,dYi3
|
||||
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
|
||||
VSUB dZi2,dYi2,dYr3
|
||||
VADD.F32 dZr2,dYr2,dYi3
|
||||
VST2.F32 {dZr1,dZi1},[pDst, :128],outPointStep
|
||||
VSUB.F32 dZi2,dYi2,dYr3
|
||||
|
||||
VADD qY0,qX0,qX2 @// u0 for next iteration
|
||||
VST2 {dZr2,dZi2},[pDst, :128],setStep
|
||||
VADD.F32 qY0,qX0,qX2 @// u0 for next iteration
|
||||
VST2.F32 {dZr2,dZi2},[pDst, :128],setStep
|
||||
|
||||
|
||||
.else
|
||||
|
||||
VLD2 {dXr1,dXi1},[pSrc, :128],step1 @// data[1]
|
||||
VADD qZ0,qY0,qY1
|
||||
VLD2.F32 {dXr1,dXi1},[pSrc, :128],step1 @// data[1]
|
||||
VADD.F32 qZ0,qY0,qY1
|
||||
|
||||
@// data[3] & update pSrc for the next set, but not if it's the
|
||||
@// last iteration so that we don't read past the end of the
|
||||
@// input array.
|
||||
BEQ radix4SkipLastUpdateFwd\name
|
||||
VLD2 {dXr3,dXi3},[pSrc, :128],setStep
|
||||
VLD2.F32 {dXr3,dXi3},[pSrc, :128],setStep
|
||||
radix4SkipLastUpdateFwd\name:
|
||||
VADD dZr2,dYr2,dYi3
|
||||
VADD.F32 dZr2,dYr2,dYi3
|
||||
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VSUB dZi2,dYi2,dYr3
|
||||
VST2.F32 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VSUB.F32 dZi2,dYi2,dYr3
|
||||
|
||||
VSUB qZ1,qY0,qY1
|
||||
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
|
||||
VSUB.F32 qZ1,qY0,qY1
|
||||
VST2.F32 {dZr2,dZi2},[pDst, :128],outPointStep
|
||||
|
||||
VSUB dZr3,dYr2,dYi3
|
||||
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
|
||||
VADD dZi3,dYi2,dYr3
|
||||
VSUB.F32 dZr3,dYr2,dYi3
|
||||
VST2.F32 {dZr1,dZi1},[pDst, :128],outPointStep
|
||||
VADD.F32 dZi3,dYi2,dYr3
|
||||
|
||||
VADD qY0,qX0,qX2 @// u0 for next iteration
|
||||
VST2 {dZr3,dZi3},[pDst, :128],setStep
|
||||
VADD.F32 qY0,qX0,qX2 @// u0 for next iteration
|
||||
VST2.F32 {dZr3,dZi3},[pDst, :128],setStep
|
||||
|
||||
.endif
|
||||
|
||||
|
|
|
@ -68,63 +68,63 @@
|
|||
|
||||
@// Neon Registers
|
||||
|
||||
#define dButterfly1Real02 D0.F32
|
||||
#define dButterfly1Imag02 D1.F32
|
||||
#define dButterfly1Real13 D2.F32
|
||||
#define dButterfly1Imag13 D3.F32
|
||||
#define dButterfly2Real02 D4.F32
|
||||
#define dButterfly2Imag02 D5.F32
|
||||
#define dButterfly2Real13 D6.F32
|
||||
#define dButterfly2Imag13 D7.F32
|
||||
#define dXr0 D0.F32
|
||||
#define dXi0 D1.F32
|
||||
#define dXr1 D2.F32
|
||||
#define dXi1 D3.F32
|
||||
#define dXr2 D4.F32
|
||||
#define dXi2 D5.F32
|
||||
#define dXr3 D6.F32
|
||||
#define dXi3 D7.F32
|
||||
#define dButterfly1Real02 D0
|
||||
#define dButterfly1Imag02 D1
|
||||
#define dButterfly1Real13 D2
|
||||
#define dButterfly1Imag13 D3
|
||||
#define dButterfly2Real02 D4
|
||||
#define dButterfly2Imag02 D5
|
||||
#define dButterfly2Real13 D6
|
||||
#define dButterfly2Imag13 D7
|
||||
#define dXr0 D0
|
||||
#define dXi0 D1
|
||||
#define dXr1 D2
|
||||
#define dXi1 D3
|
||||
#define dXr2 D4
|
||||
#define dXi2 D5
|
||||
#define dXr3 D6
|
||||
#define dXi3 D7
|
||||
|
||||
#define dYr0 D16.F32
|
||||
#define dYi0 D17.F32
|
||||
#define dYr1 D18.F32
|
||||
#define dYi1 D19.F32
|
||||
#define dYr2 D20.F32
|
||||
#define dYi2 D21.F32
|
||||
#define dYr3 D22.F32
|
||||
#define dYi3 D23.F32
|
||||
#define dYr0 D16
|
||||
#define dYi0 D17
|
||||
#define dYr1 D18
|
||||
#define dYi1 D19
|
||||
#define dYr2 D20
|
||||
#define dYi2 D21
|
||||
#define dYr3 D22
|
||||
#define dYi3 D23
|
||||
|
||||
#define dW1r D8.F32
|
||||
#define dW1i D9.F32
|
||||
#define dW2r D10.F32
|
||||
#define dW2i D11.F32
|
||||
#define dW3r D12.F32
|
||||
#define dW3i D13.F32
|
||||
#define qT0 d14.f32
|
||||
#define qT1 d16.F32
|
||||
#define qT2 d18.F32
|
||||
#define qT3 d20.f32
|
||||
#define qT4 d22.f32
|
||||
#define qT5 d24.f32
|
||||
#define dW1r D8
|
||||
#define dW1i D9
|
||||
#define dW2r D10
|
||||
#define dW2i D11
|
||||
#define dW3r D12
|
||||
#define dW3i D13
|
||||
#define qT0 d14
|
||||
#define qT1 d16
|
||||
#define qT2 d18
|
||||
#define qT3 d20
|
||||
#define qT4 d22
|
||||
#define qT5 d24
|
||||
|
||||
#define dZr0 D14.F32
|
||||
#define dZi0 D15.F32
|
||||
#define dZr1 D26.F32
|
||||
#define dZi1 D27.F32
|
||||
#define dZr2 D28.F32
|
||||
#define dZi2 D29.F32
|
||||
#define dZr3 D30.F32
|
||||
#define dZi3 D31.F32
|
||||
#define dZr0 D14
|
||||
#define dZi0 D15
|
||||
#define dZr1 D26
|
||||
#define dZi1 D27
|
||||
#define dZr2 D28
|
||||
#define dZi2 D29
|
||||
#define dZr3 D30
|
||||
#define dZi3 D31
|
||||
|
||||
#define qX0 Q0.F32
|
||||
#define qY0 Q8.F32
|
||||
#define qY1 Q9.F32
|
||||
#define qY2 Q10.F32
|
||||
#define qY3 Q11.F32
|
||||
#define qZ0 Q7.F32
|
||||
#define qZ1 Q13.F32
|
||||
#define qZ2 Q14.F32
|
||||
#define qZ3 Q15.F32
|
||||
#define qX0 Q0
|
||||
#define qY0 Q8
|
||||
#define qY1 Q9
|
||||
#define qY2 Q10
|
||||
#define qY3 Q11
|
||||
#define qZ0 Q7
|
||||
#define qZ1 Q13
|
||||
#define qZ2 Q14
|
||||
#define qZ3 Q15
|
||||
|
||||
|
||||
|
||||
|
@ -139,172 +139,172 @@
|
|||
|
||||
@// Update grpCount and grpSize rightaway
|
||||
|
||||
VLD2 {dW1r,dW1i},[pTwiddle, :128] @// [wi|wr]
|
||||
VLD2.F32 {dW1r,dW1i},[pTwiddle, :128] @// [wi|wr]
|
||||
MOV step16,#16
|
||||
LSL grpCount,subFFTSize,#2
|
||||
|
||||
VLD1 dW2r,[pTwiddle, :64] @// [wi|wr]
|
||||
VLD1.F32 dW2r,[pTwiddle, :64] @// [wi|wr]
|
||||
MOV subFFTNum,#1 @//after the last stage
|
||||
|
||||
VLD1 dW3r,[pTwiddle, :64],step16 @// [wi|wr]
|
||||
VLD1.F32 dW3r,[pTwiddle, :64],step16 @// [wi|wr]
|
||||
MOV stepTwiddle,#0
|
||||
|
||||
VLD1 dW2i,[pTwiddle, :64]! @// [wi|wr]
|
||||
VLD1.F32 dW2i,[pTwiddle, :64]! @// [wi|wr]
|
||||
SUB grpTwStep,stepTwiddle,#8 @// grpTwStep = -8 to start with
|
||||
|
||||
@// update subFFTSize for the next stage
|
||||
MOV subFFTSize,grpCount
|
||||
VLD1 dW3i,[pTwiddle, :64],grpTwStep @// [wi|wr]
|
||||
VLD1.F32 dW3i,[pTwiddle, :64],grpTwStep @// [wi|wr]
|
||||
MOV dstStep,outPointStep,LSL #1
|
||||
|
||||
@// AC.r AC.i BD.r BD.i
|
||||
VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc, :256]!
|
||||
VLD4.F32 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc, :256]!
|
||||
ADD dstStep,dstStep,outPointStep @// dstStep = 3*outPointStep
|
||||
RSB dstStep,dstStep,#16 @// dstStep = - 3*outPointStep+16
|
||||
MOV step24,#24
|
||||
|
||||
@// AC.r AC.i BD.r BD.i
|
||||
VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc, :256]!
|
||||
VLD4.F32 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc, :256]!
|
||||
|
||||
|
||||
@// Process two groups at a time
|
||||
|
||||
radix4lsGrpLoop\name :
|
||||
|
||||
VZIP dW2r,dW2i
|
||||
VZIP.F32 dW2r,dW2i
|
||||
ADD stepTwiddle,stepTwiddle,#16
|
||||
VZIP dW3r,dW3i
|
||||
VZIP.F32 dW3r,dW3i
|
||||
ADD grpTwStep,stepTwiddle,#4
|
||||
VUZP dButterfly1Real13, dButterfly2Real13 @// B.r D.r
|
||||
VUZP.F32 dButterfly1Real13, dButterfly2Real13 @// B.r D.r
|
||||
SUB twStep,stepTwiddle,#16 @// -16+stepTwiddle
|
||||
VUZP dButterfly1Imag13, dButterfly2Imag13 @// B.i D.i
|
||||
VUZP.F32 dButterfly1Imag13, dButterfly2Imag13 @// B.i D.i
|
||||
MOV grpTwStep,grpTwStep,LSL #1
|
||||
VUZP dButterfly1Real02, dButterfly2Real02 @// A.r C.r
|
||||
VUZP.F32 dButterfly1Real02, dButterfly2Real02 @// A.r C.r
|
||||
RSB grpTwStep,grpTwStep,#0 @// -8-2*stepTwiddle
|
||||
|
||||
|
||||
VUZP dButterfly1Imag02, dButterfly2Imag02 @// A.i C.i
|
||||
VUZP.F32 dButterfly1Imag02, dButterfly2Imag02 @// A.i C.i
|
||||
|
||||
|
||||
@// grpCount is multiplied by 4
|
||||
SUBS grpCount,grpCount,#8
|
||||
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
VMUL dZr1,dW1r,dXr1
|
||||
VMLA dZr1,dW1i,dXi1 @// real part
|
||||
VMUL dZi1,dW1r,dXi1
|
||||
VMLS dZi1,dW1i,dXr1 @// imag part
|
||||
VMUL.F32 dZr1,dW1r,dXr1
|
||||
VMLA.F32 dZr1,dW1i,dXi1 @// real part
|
||||
VMUL.F32 dZi1,dW1r,dXi1
|
||||
VMLS.F32 dZi1,dW1i,dXr1 @// imag part
|
||||
|
||||
.else
|
||||
|
||||
VMUL dZr1,dW1r,dXr1
|
||||
VMLS dZr1,dW1i,dXi1 @// real part
|
||||
VMUL dZi1,dW1r,dXi1
|
||||
VMLA dZi1,dW1i,dXr1 @// imag part
|
||||
VMUL.F32 dZr1,dW1r,dXr1
|
||||
VMLS.F32 dZr1,dW1i,dXi1 @// real part
|
||||
VMUL.F32 dZi1,dW1r,dXi1
|
||||
VMLA.F32 dZi1,dW1i,dXr1 @// imag part
|
||||
|
||||
.endif
|
||||
|
||||
VLD2 {dW1r,dW1i},[pTwiddle, :128],stepTwiddle @// [wi|wr]
|
||||
VLD2.F32 {dW1r,dW1i},[pTwiddle, :128],stepTwiddle @// [wi|wr]
|
||||
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
VMUL dZr2,dW2r,dXr2
|
||||
VMLA dZr2,dW2i,dXi2 @// real part
|
||||
VMUL dZi2,dW2r,dXi2
|
||||
VLD1 dW2r,[pTwiddle, :64],step16 @// [wi|wr]
|
||||
VMLS dZi2,dW2i,dXr2 @// imag part
|
||||
VMUL.F32 dZr2,dW2r,dXr2
|
||||
VMLA.F32 dZr2,dW2i,dXi2 @// real part
|
||||
VMUL.F32 dZi2,dW2r,dXi2
|
||||
VLD1.F32 dW2r,[pTwiddle, :64],step16 @// [wi|wr]
|
||||
VMLS.F32 dZi2,dW2i,dXr2 @// imag part
|
||||
|
||||
.else
|
||||
|
||||
VMUL dZr2,dW2r,dXr2
|
||||
VMLS dZr2,dW2i,dXi2 @// real part
|
||||
VMUL dZi2,dW2r,dXi2
|
||||
VLD1 dW2r,[pTwiddle, :64],step16 @// [wi|wr]
|
||||
VMLA dZi2,dW2i,dXr2 @// imag part
|
||||
VMUL.F32 dZr2,dW2r,dXr2
|
||||
VMLS.F32 dZr2,dW2i,dXi2 @// real part
|
||||
VMUL.F32 dZi2,dW2r,dXi2
|
||||
VLD1.F32 dW2r,[pTwiddle, :64],step16 @// [wi|wr]
|
||||
VMLA.F32 dZi2,dW2i,dXr2 @// imag part
|
||||
|
||||
.endif
|
||||
|
||||
|
||||
VLD1 dW2i,[pTwiddle, :64],twStep @// [wi|wr]
|
||||
VLD1.F32 dW2i,[pTwiddle, :64],twStep @// [wi|wr]
|
||||
|
||||
@// move qX0 so as to load for the next iteration
|
||||
VMOV qZ0,qX0
|
||||
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
VMUL dZr3,dW3r,dXr3
|
||||
VMLA dZr3,dW3i,dXi3 @// real part
|
||||
VMUL dZi3,dW3r,dXi3
|
||||
VLD1 dW3r,[pTwiddle, :64],step24
|
||||
VMLS dZi3,dW3i,dXr3 @// imag part
|
||||
VMUL.F32 dZr3,dW3r,dXr3
|
||||
VMLA.F32 dZr3,dW3i,dXi3 @// real part
|
||||
VMUL.F32 dZi3,dW3r,dXi3
|
||||
VLD1.F32 dW3r,[pTwiddle, :64],step24
|
||||
VMLS.F32 dZi3,dW3i,dXr3 @// imag part
|
||||
|
||||
.else
|
||||
|
||||
VMUL dZr3,dW3r,dXr3
|
||||
VMLS dZr3,dW3i,dXi3 @// real part
|
||||
VMUL dZi3,dW3r,dXi3
|
||||
VLD1 dW3r,[pTwiddle, :64],step24
|
||||
VMLA dZi3,dW3i,dXr3 @// imag part
|
||||
VMUL.F32 dZr3,dW3r,dXr3
|
||||
VMLS.F32 dZr3,dW3i,dXi3 @// real part
|
||||
VMUL.F32 dZi3,dW3r,dXi3
|
||||
VLD1.F32 dW3r,[pTwiddle, :64],step24
|
||||
VMLA.F32 dZi3,dW3i,dXr3 @// imag part
|
||||
|
||||
.endif
|
||||
|
||||
VLD1 dW3i,[pTwiddle, :64],grpTwStep @// [wi|wr]
|
||||
VLD1.F32 dW3i,[pTwiddle, :64],grpTwStep @// [wi|wr]
|
||||
|
||||
@// Don't do the load on the last iteration so we don't read past the end
|
||||
@// of pSrc.
|
||||
addeq pSrc, pSrc, #64
|
||||
beq radix4lsSkipRead\name
|
||||
@// AC.r AC.i BD.r BD.i
|
||||
VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc, :256]!
|
||||
VLD4.F32 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc, :256]!
|
||||
|
||||
@// AC.r AC.i BD.r BD.i
|
||||
VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc, :256]!
|
||||
VLD4.F32 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc, :256]!
|
||||
radix4lsSkipRead\name:
|
||||
|
||||
@// finish first stage of 4 point FFT
|
||||
|
||||
VADD qY0,qZ0,qZ2
|
||||
VSUB qY2,qZ0,qZ2
|
||||
VADD qY1,qZ1,qZ3
|
||||
VSUB qY3,qZ1,qZ3
|
||||
VADD.F32 qY0,qZ0,qZ2
|
||||
VSUB.F32 qY2,qZ0,qZ2
|
||||
VADD.F32 qY1,qZ1,qZ3
|
||||
VSUB.F32 qY3,qZ1,qZ3
|
||||
|
||||
|
||||
@// finish second stage of 4 point FFT
|
||||
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
|
||||
VSUB qZ0,qY2,qY1
|
||||
VSUB.F32 qZ0,qY2,qY1
|
||||
|
||||
VADD dZr3,dYr0,dYi3
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VSUB dZi3,dYi0,dYr3
|
||||
VADD.F32 dZr3,dYr0,dYi3
|
||||
VST2.F32 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VSUB.F32 dZi3,dYi0,dYr3
|
||||
|
||||
VADD qZ2,qY2,qY1
|
||||
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
|
||||
VADD.F32 qZ2,qY2,qY1
|
||||
VST2.F32 {dZr3,dZi3},[pDst, :128],outPointStep
|
||||
|
||||
VSUB dZr1,dYr0,dYi3
|
||||
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
|
||||
VADD dZi1,dYi0,dYr3
|
||||
VSUB.F32 dZr1,dYr0,dYi3
|
||||
VST2.F32 {dZr2,dZi2},[pDst, :128],outPointStep
|
||||
VADD.F32 dZi1,dYi0,dYr3
|
||||
|
||||
@// dstStep = -outPointStep + 16
|
||||
VST2 {dZr1,dZi1},[pDst, :128],dstStep
|
||||
VST2.F32 {dZr1,dZi1},[pDst, :128],dstStep
|
||||
|
||||
|
||||
.else
|
||||
|
||||
VSUB qZ0,qY2,qY1
|
||||
VSUB.F32 qZ0,qY2,qY1
|
||||
|
||||
VSUB dZr1,dYr0,dYi3
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VADD dZi1,dYi0,dYr3
|
||||
VSUB.F32 dZr1,dYr0,dYi3
|
||||
VST2.F32 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VADD.F32 dZi1,dYi0,dYr3
|
||||
|
||||
VADD qZ2,qY2,qY1
|
||||
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
|
||||
VADD.F32 qZ2,qY2,qY1
|
||||
VST2.F32 {dZr1,dZi1},[pDst, :128],outPointStep
|
||||
|
||||
VADD dZr3,dYr0,dYi3
|
||||
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
|
||||
VSUB dZi3,dYi0,dYr3
|
||||
VADD.F32 dZr3,dYr0,dYi3
|
||||
VST2.F32 {dZr2,dZi2},[pDst, :128],outPointStep
|
||||
VSUB.F32 dZi3,dYi0,dYr3
|
||||
|
||||
@// dstStep = -outPointStep + 16
|
||||
VST2 {dZr3,dZi3},[pDst, :128],dstStep
|
||||
VST2.F32 {dZr3,dZi3},[pDst, :128],dstStep
|
||||
|
||||
|
||||
.endif
|
||||
|
|
|
@ -76,48 +76,48 @@
|
|||
|
||||
@// Neon Registers
|
||||
|
||||
#define dW1 D0.F32
|
||||
#define dW2 D1.F32
|
||||
#define dW3 D2.F32
|
||||
#define dW1 D0
|
||||
#define dW2 D1
|
||||
#define dW3 D2
|
||||
|
||||
#define dXr0 D4.F32
|
||||
#define dXi0 D5.F32
|
||||
#define dXr1 D6.F32
|
||||
#define dXi1 D7.F32
|
||||
#define dXr2 D8.F32
|
||||
#define dXi2 D9.F32
|
||||
#define dXr3 D10.F32
|
||||
#define dXi3 D11.F32
|
||||
#define dYr0 D12.F32
|
||||
#define dYi0 D13.F32
|
||||
#define dYr1 D14.F32
|
||||
#define dYi1 D15.F32
|
||||
#define dYr2 D16.F32
|
||||
#define dYi2 D17.F32
|
||||
#define dYr3 D18.F32
|
||||
#define dYi3 D19.F32
|
||||
#define qT0 d16.f32
|
||||
#define qT1 d18.f32
|
||||
#define qT2 d12.f32
|
||||
#define qT3 d14.f32
|
||||
#define dZr0 D20.F32
|
||||
#define dZi0 D21.F32
|
||||
#define dZr1 D22.F32
|
||||
#define dZi1 D23.F32
|
||||
#define dZr2 D24.F32
|
||||
#define dZi2 D25.F32
|
||||
#define dZr3 D26.F32
|
||||
#define dZi3 D27.F32
|
||||
#define dXr0 D4
|
||||
#define dXi0 D5
|
||||
#define dXr1 D6
|
||||
#define dXi1 D7
|
||||
#define dXr2 D8
|
||||
#define dXi2 D9
|
||||
#define dXr3 D10
|
||||
#define dXi3 D11
|
||||
#define dYr0 D12
|
||||
#define dYi0 D13
|
||||
#define dYr1 D14
|
||||
#define dYi1 D15
|
||||
#define dYr2 D16
|
||||
#define dYi2 D17
|
||||
#define dYr3 D18
|
||||
#define dYi3 D19
|
||||
#define qT0 d16
|
||||
#define qT1 d18
|
||||
#define qT2 d12
|
||||
#define qT3 d14
|
||||
#define dZr0 D20
|
||||
#define dZi0 D21
|
||||
#define dZr1 D22
|
||||
#define dZi1 D23
|
||||
#define dZr2 D24
|
||||
#define dZi2 D25
|
||||
#define dZr3 D26
|
||||
#define dZi3 D27
|
||||
|
||||
#define qY0 Q6.F32
|
||||
#define qY1 Q7.F32
|
||||
#define qY2 Q8.F32
|
||||
#define qY3 Q9.F32
|
||||
#define qX0 Q2.F32
|
||||
#define qZ0 Q10.F32
|
||||
#define qZ1 Q11.F32
|
||||
#define qZ2 Q12.F32
|
||||
#define qZ3 Q13.F32
|
||||
#define qY0 Q6
|
||||
#define qY1 Q7
|
||||
#define qY2 Q8
|
||||
#define qY3 Q9
|
||||
#define qX0 Q2
|
||||
#define qZ0 Q10
|
||||
#define qZ1 Q11
|
||||
#define qZ2 Q12
|
||||
#define qZ3 Q13
|
||||
|
||||
.MACRO FFTSTAGE scaled, inverse , name
|
||||
|
||||
|
@ -131,7 +131,7 @@
|
|||
LSR subFFTNum,subFFTNum,#2
|
||||
MOV subFFTSize,grpCount
|
||||
|
||||
VLD1 dW1,[pTwiddle] @//[wi | wr]
|
||||
VLD1.F32 dW1,[pTwiddle] @//[wi | wr]
|
||||
@// pT0+1 increments pT0 by 8 bytes
|
||||
@// pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes
|
||||
MOV pointStep,subFFTNum,LSL #1
|
||||
|
@ -142,11 +142,11 @@
|
|||
@// = 2*size bytes
|
||||
|
||||
MOV stepTwiddle,#0
|
||||
VLD1 dW2,[pTwiddle] @//[wi | wr]
|
||||
VLD1.F32 dW2,[pTwiddle] @//[wi | wr]
|
||||
SMULBB outPointStep,grpCount,pointStep
|
||||
LSL pointStep,pointStep,#2 @// 2*grpSize
|
||||
|
||||
VLD1 dW3,[pTwiddle] @//[wi | wr]
|
||||
VLD1.F32 dW3,[pTwiddle] @//[wi | wr]
|
||||
MOV srcStep,pointStep,LSL #1 @// srcStep = 2*pointStep
|
||||
ADD setStep,srcStep,pointStep @// setStep = 3*pointStep
|
||||
|
||||
|
@ -162,16 +162,16 @@
|
|||
|
||||
radix4GrpLoop\name :
|
||||
|
||||
VLD2 {dXr0,dXi0},[pSrc],pointStep @// data[0]
|
||||
VLD2.F32 {dXr0,dXi0},[pSrc],pointStep @// data[0]
|
||||
ADD stepTwiddle,stepTwiddle,pointStep
|
||||
VLD2 {dXr1,dXi1},[pSrc],pointStep @// data[1]
|
||||
VLD2.F32 {dXr1,dXi1},[pSrc],pointStep @// data[1]
|
||||
@// set pTwiddle to the first point
|
||||
ADD pTwiddle,pTwiddle,stepTwiddle
|
||||
VLD2 {dXr2,dXi2},[pSrc],pointStep @// data[2]
|
||||
VLD2.F32 {dXr2,dXi2},[pSrc],pointStep @// data[2]
|
||||
MOV twStep,stepTwiddle,LSL #2
|
||||
|
||||
@// data[3] & update pSrc for the next set
|
||||
VLD2 {dXr3,dXi3},[pSrc],setStep
|
||||
VLD2.F32 {dXr3,dXi3},[pSrc],setStep
|
||||
SUB twStep,stepTwiddle,twStep @// twStep = -3*stepTwiddle
|
||||
|
||||
MOV setCount,pointStep,LSR #3
|
||||
|
@ -188,49 +188,49 @@ radix4SetLoop\name :
|
|||
|
||||
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
VMUL dZr1,dXr1,dW1[0]
|
||||
VMUL dZi1,dXi1,dW1[0]
|
||||
VMUL dZr2,dXr2,dW2[0]
|
||||
VMUL dZi2,dXi2,dW2[0]
|
||||
VMUL dZr3,dXr3,dW3[0]
|
||||
VMUL dZi3,dXi3,dW3[0]
|
||||
VMUL.F32 dZr1,dXr1,dW1[0]
|
||||
VMUL.F32 dZi1,dXi1,dW1[0]
|
||||
VMUL.F32 dZr2,dXr2,dW2[0]
|
||||
VMUL.F32 dZi2,dXi2,dW2[0]
|
||||
VMUL.F32 dZr3,dXr3,dW3[0]
|
||||
VMUL.F32 dZi3,dXi3,dW3[0]
|
||||
|
||||
VMLA dZr1,dXi1,dW1[1] @// real part
|
||||
VMLS dZi1,dXr1,dW1[1] @// imag part
|
||||
VMLA.F32 dZr1,dXi1,dW1[1] @// real part
|
||||
VMLS.F32 dZi1,dXr1,dW1[1] @// imag part
|
||||
|
||||
@// data[1] for next iteration
|
||||
VLD2 {dXr1,dXi1},[pSrc],pointStep
|
||||
VLD2.F32 {dXr1,dXi1},[pSrc],pointStep
|
||||
|
||||
VMLA dZr2,dXi2,dW2[1] @// real part
|
||||
VMLS dZi2,dXr2,dW2[1] @// imag part
|
||||
VMLA.F32 dZr2,dXi2,dW2[1] @// real part
|
||||
VMLS.F32 dZi2,dXr2,dW2[1] @// imag part
|
||||
|
||||
@// data[2] for next iteration
|
||||
VLD2 {dXr2,dXi2},[pSrc],pointStep
|
||||
VLD2.F32 {dXr2,dXi2},[pSrc],pointStep
|
||||
|
||||
VMLA dZr3,dXi3,dW3[1] @// real part
|
||||
VMLS dZi3,dXr3,dW3[1] @// imag part
|
||||
VMLA.F32 dZr3,dXi3,dW3[1] @// real part
|
||||
VMLS.F32 dZi3,dXr3,dW3[1] @// imag part
|
||||
.else
|
||||
VMUL dZr1,dXr1,dW1[0]
|
||||
VMUL dZi1,dXi1,dW1[0]
|
||||
VMUL dZr2,dXr2,dW2[0]
|
||||
VMUL dZi2,dXi2,dW2[0]
|
||||
VMUL dZr3,dXr3,dW3[0]
|
||||
VMUL dZi3,dXi3,dW3[0]
|
||||
VMUL.F32 dZr1,dXr1,dW1[0]
|
||||
VMUL.F32 dZi1,dXi1,dW1[0]
|
||||
VMUL.F32 dZr2,dXr2,dW2[0]
|
||||
VMUL.F32 dZi2,dXi2,dW2[0]
|
||||
VMUL.F32 dZr3,dXr3,dW3[0]
|
||||
VMUL.F32 dZi3,dXi3,dW3[0]
|
||||
|
||||
VMLS dZr1,dXi1,dW1[1] @// real part
|
||||
VMLA dZi1,dXr1,dW1[1] @// imag part
|
||||
VMLS.F32 dZr1,dXi1,dW1[1] @// real part
|
||||
VMLA.F32 dZi1,dXr1,dW1[1] @// imag part
|
||||
|
||||
@// data[1] for next iteration
|
||||
VLD2 {dXr1,dXi1},[pSrc],pointStep
|
||||
VLD2.F32 {dXr1,dXi1},[pSrc],pointStep
|
||||
|
||||
VMLS dZr2,dXi2,dW2[1] @// real part
|
||||
VMLA dZi2,dXr2,dW2[1] @// imag part
|
||||
VMLS.F32 dZr2,dXi2,dW2[1] @// real part
|
||||
VMLA.F32 dZi2,dXr2,dW2[1] @// imag part
|
||||
|
||||
@// data[2] for next iteration
|
||||
VLD2 {dXr2,dXi2},[pSrc],pointStep
|
||||
VLD2.F32 {dXr2,dXi2},[pSrc],pointStep
|
||||
|
||||
VMLS dZr3,dXi3,dW3[1] @// real part
|
||||
VMLA dZi3,dXr3,dW3[1] @// imag part
|
||||
VMLS.F32 dZr3,dXi3,dW3[1] @// real part
|
||||
VMLA.F32 dZi3,dXr3,dW3[1] @// imag part
|
||||
.endif
|
||||
|
||||
@// data[3] & update pSrc to data[0]
|
||||
|
@ -241,54 +241,54 @@ radix4SetLoop\name :
|
|||
@// These are executed only if both grpCount = 4 and setCount = 2
|
||||
addeq pSrc, pSrc, setStep
|
||||
beq radix4SkipRead\name
|
||||
VLD2 {dXr3,dXi3},[pSrc],setStep
|
||||
VLD2.F32 {dXr3,dXi3},[pSrc],setStep
|
||||
radix4SkipRead\name:
|
||||
SUBS setCount,setCount,#2
|
||||
|
||||
@// finish first stage of 4 point FFT
|
||||
VADD qY0,qX0,qZ2
|
||||
VSUB qY2,qX0,qZ2
|
||||
VADD.F32 qY0,qX0,qZ2
|
||||
VSUB.F32 qY2,qX0,qZ2
|
||||
|
||||
@// data[0] for next iteration
|
||||
VLD2 {dXr0,dXi0},[pSrc, :128]!
|
||||
VADD qY1,qZ1,qZ3
|
||||
VSUB qY3,qZ1,qZ3
|
||||
VLD2.F32 {dXr0,dXi0},[pSrc, :128]!
|
||||
VADD.F32 qY1,qZ1,qZ3
|
||||
VSUB.F32 qY3,qZ1,qZ3
|
||||
|
||||
@// finish second stage of 4 point FFT
|
||||
|
||||
VSUB qZ0,qY2,qY1
|
||||
VSUB.F32 qZ0,qY2,qY1
|
||||
|
||||
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
|
||||
VADD dZr3,dYr0,dYi3
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VSUB dZi3,dYi0,dYr3
|
||||
VADD.F32 dZr3,dYr0,dYi3
|
||||
VST2.F32 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VSUB.F32 dZi3,dYi0,dYr3
|
||||
|
||||
VADD qZ2,qY2,qY1
|
||||
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
|
||||
VADD.F32 qZ2,qY2,qY1
|
||||
VST2.F32 {dZr3,dZi3},[pDst, :128],outPointStep
|
||||
|
||||
VSUB dZr1,dYr0,dYi3
|
||||
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
|
||||
VADD dZi1,dYi0,dYr3
|
||||
VSUB.F32 dZr1,dYr0,dYi3
|
||||
VST2.F32 {dZr2,dZi2},[pDst, :128],outPointStep
|
||||
VADD.F32 dZi1,dYi0,dYr3
|
||||
|
||||
VST2 {dZr1,dZi1},[pDst, :128],dstStep
|
||||
VST2.F32 {dZr1,dZi1},[pDst, :128],dstStep
|
||||
|
||||
|
||||
.else
|
||||
|
||||
VSUB dZr1,dYr0,dYi3
|
||||
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VADD dZi1,dYi0,dYr3
|
||||
VSUB.F32 dZr1,dYr0,dYi3
|
||||
VST2.F32 {dZr0,dZi0},[pDst, :128],outPointStep
|
||||
VADD.F32 dZi1,dYi0,dYr3
|
||||
|
||||
VADD qZ2,qY2,qY1
|
||||
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
|
||||
VADD.F32 qZ2,qY2,qY1
|
||||
VST2.F32 {dZr1,dZi1},[pDst, :128],outPointStep
|
||||
|
||||
VADD dZr3,dYr0,dYi3
|
||||
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
|
||||
VSUB dZi3,dYi0,dYr3
|
||||
VADD.F32 dZr3,dYr0,dYi3
|
||||
VST2.F32 {dZr2,dZi2},[pDst, :128],outPointStep
|
||||
VSUB.F32 dZi3,dYi0,dYr3
|
||||
|
||||
VST2 {dZr3,dZi3},[pDst, :128],dstStep
|
||||
VST2.F32 {dZr3,dZi3},[pDst, :128],dstStep
|
||||
|
||||
|
||||
.endif
|
||||
|
@ -298,13 +298,13 @@ radix4SkipRead\name:
|
|||
BGT radix4SetLoop\name
|
||||
|
||||
|
||||
VLD1 dW1,[pTwiddle, :64],stepTwiddle @//[wi | wr]
|
||||
VLD1.F32 dW1,[pTwiddle, :64],stepTwiddle @//[wi | wr]
|
||||
@// subtract 4 since grpCount multiplied by 4
|
||||
SUBS grpCount,grpCount,#4
|
||||
VLD1 dW2,[pTwiddle, :64],stepTwiddle @//[wi | wr]
|
||||
VLD1.F32 dW2,[pTwiddle, :64],stepTwiddle @//[wi | wr]
|
||||
@// increment pSrc for the next grp
|
||||
ADD pSrc,pSrc,srcStep
|
||||
VLD1 dW3,[pTwiddle, :64],twStep @//[wi | wr]
|
||||
VLD1.F32 dW3,[pTwiddle, :64],twStep @//[wi | wr]
|
||||
BGT radix4GrpLoop\name
|
||||
|
||||
|
||||
|
|
|
@ -68,110 +68,110 @@
|
|||
|
||||
@// Neon Registers
|
||||
|
||||
#define dXr0 D0.F32
|
||||
#define dXi0 D1.F32
|
||||
#define dXr1 D2.F32
|
||||
#define dXi1 D3.F32
|
||||
#define dXr2 D4.F32
|
||||
#define dXi2 D5.F32
|
||||
#define dXr3 D6.F32
|
||||
#define dXi3 D7.F32
|
||||
#define dXr4 D8.F32
|
||||
#define dXi4 D9.F32
|
||||
#define dXr5 D10.F32
|
||||
#define dXi5 D11.F32
|
||||
#define dXr6 D12.F32
|
||||
#define dXi6 D13.F32
|
||||
#define dXr7 D14.F32
|
||||
#define dXi7 D15.F32
|
||||
#define qX0 Q0.F32
|
||||
#define qX1 Q1.F32
|
||||
#define qX2 Q2.F32
|
||||
#define qX3 Q3.F32
|
||||
#define qX4 Q4.F32
|
||||
#define qX5 Q5.F32
|
||||
#define qX6 Q6.F32
|
||||
#define qX7 Q7.F32
|
||||
#define dXr0 D0
|
||||
#define dXi0 D1
|
||||
#define dXr1 D2
|
||||
#define dXi1 D3
|
||||
#define dXr2 D4
|
||||
#define dXi2 D5
|
||||
#define dXr3 D6
|
||||
#define dXi3 D7
|
||||
#define dXr4 D8
|
||||
#define dXi4 D9
|
||||
#define dXr5 D10
|
||||
#define dXi5 D11
|
||||
#define dXr6 D12
|
||||
#define dXi6 D13
|
||||
#define dXr7 D14
|
||||
#define dXi7 D15
|
||||
#define qX0 Q0
|
||||
#define qX1 Q1
|
||||
#define qX2 Q2
|
||||
#define qX3 Q3
|
||||
#define qX4 Q4
|
||||
#define qX5 Q5
|
||||
#define qX6 Q6
|
||||
#define qX7 Q7
|
||||
|
||||
#define dUr0 D16.F32
|
||||
#define dUi0 D17.F32
|
||||
#define dUr2 D18.F32
|
||||
#define dUi2 D19.F32
|
||||
#define dUr4 D20.F32
|
||||
#define dUi4 D21.F32
|
||||
#define dUr6 D22.F32
|
||||
#define dUi6 D23.F32
|
||||
#define dUr1 D24.F32
|
||||
#define dUi1 D25.F32
|
||||
#define dUr3 D26.F32
|
||||
#define dUi3 D27.F32
|
||||
#define dUr5 D28.F32
|
||||
#define dUi5 D29.F32
|
||||
#define dUr0 D16
|
||||
#define dUi0 D17
|
||||
#define dUr2 D18
|
||||
#define dUi2 D19
|
||||
#define dUr4 D20
|
||||
#define dUi4 D21
|
||||
#define dUr6 D22
|
||||
#define dUi6 D23
|
||||
#define dUr1 D24
|
||||
#define dUi1 D25
|
||||
#define dUr3 D26
|
||||
#define dUi3 D27
|
||||
#define dUr5 D28
|
||||
#define dUi5 D29
|
||||
@// reuse dXr7 and dXi7
|
||||
#define dUr7 D30.F32
|
||||
#define dUi7 D31.F32
|
||||
#define qU0 Q8.F32
|
||||
#define qU1 Q12.F32
|
||||
#define qU2 Q9.F32
|
||||
#define qU3 Q13.F32
|
||||
#define qU4 Q10.F32
|
||||
#define qU5 Q14.F32
|
||||
#define qU6 Q11.F32
|
||||
#define qU7 Q15.F32
|
||||
#define dUr7 D30
|
||||
#define dUi7 D31
|
||||
#define qU0 Q8
|
||||
#define qU1 Q12
|
||||
#define qU2 Q9
|
||||
#define qU3 Q13
|
||||
#define qU4 Q10
|
||||
#define qU5 Q14
|
||||
#define qU6 Q11
|
||||
#define qU7 Q15
|
||||
|
||||
|
||||
#define dVr0 D24.F32
|
||||
#define dVi0 D25.F32
|
||||
#define dVr2 D26.F32
|
||||
#define dVi2 D27.F32
|
||||
#define dVr4 D28.F32
|
||||
#define dVi4 D29.F32
|
||||
#define dVr6 D30.F32
|
||||
#define dVi6 D31.F32
|
||||
#define dVr1 D16.F32
|
||||
#define dVi1 D17.F32
|
||||
#define dVr3 D18.F32
|
||||
#define dVi3 D19.F32
|
||||
#define dVr5 D20.F32
|
||||
#define dVi5 D21.F32
|
||||
#define dVr7 D22.F32
|
||||
#define dVi7 D23.F32
|
||||
#define qV0 Q12.F32
|
||||
#define qV1 Q8.F32
|
||||
#define qV2 Q13.F32
|
||||
#define qV3 Q9.F32
|
||||
#define qV4 Q14.F32
|
||||
#define qV5 Q10.F32
|
||||
#define qV6 Q15.F32
|
||||
#define qV7 Q11.F32
|
||||
#define dVr0 D24
|
||||
#define dVi0 D25
|
||||
#define dVr2 D26
|
||||
#define dVi2 D27
|
||||
#define dVr4 D28
|
||||
#define dVi4 D29
|
||||
#define dVr6 D30
|
||||
#define dVi6 D31
|
||||
#define dVr1 D16
|
||||
#define dVi1 D17
|
||||
#define dVr3 D18
|
||||
#define dVi3 D19
|
||||
#define dVr5 D20
|
||||
#define dVi5 D21
|
||||
#define dVr7 D22
|
||||
#define dVi7 D23
|
||||
#define qV0 Q12
|
||||
#define qV1 Q8
|
||||
#define qV2 Q13
|
||||
#define qV3 Q9
|
||||
#define qV4 Q14
|
||||
#define qV5 Q10
|
||||
#define qV6 Q15
|
||||
#define qV7 Q11
|
||||
|
||||
#define dYr0 D16.F32
|
||||
#define dYi0 D17.F32
|
||||
#define dYr2 D18.F32
|
||||
#define dYi2 D19.F32
|
||||
#define dYr4 D20.F32
|
||||
#define dYi4 D21.F32
|
||||
#define dYr6 D22.F32
|
||||
#define dYi6 D23.F32
|
||||
#define dYr1 D24.F32
|
||||
#define dYi1 D25.F32
|
||||
#define dYr3 D26.F32
|
||||
#define dYi3 D27.F32
|
||||
#define dYr5 D28.F32
|
||||
#define dYi5 D29.F32
|
||||
#define dYr7 D30.F32
|
||||
#define dYi7 D31.F32
|
||||
#define qY0 Q8.F32
|
||||
#define qY1 Q12.F32
|
||||
#define qY2 Q9.F32
|
||||
#define qY3 Q13.F32
|
||||
#define qY4 Q10.F32
|
||||
#define qY5 Q14.F32
|
||||
#define qY6 Q11.F32
|
||||
#define qY7 Q15.F32
|
||||
#define dYr0 D16
|
||||
#define dYi0 D17
|
||||
#define dYr2 D18
|
||||
#define dYi2 D19
|
||||
#define dYr4 D20
|
||||
#define dYi4 D21
|
||||
#define dYr6 D22
|
||||
#define dYi6 D23
|
||||
#define dYr1 D24
|
||||
#define dYi1 D25
|
||||
#define dYr3 D26
|
||||
#define dYi3 D27
|
||||
#define dYr5 D28
|
||||
#define dYi5 D29
|
||||
#define dYr7 D30
|
||||
#define dYi7 D31
|
||||
#define qY0 Q8
|
||||
#define qY1 Q12
|
||||
#define qY2 Q9
|
||||
#define qY3 Q13
|
||||
#define qY4 Q10
|
||||
#define qY5 Q14
|
||||
#define qY6 Q11
|
||||
#define qY7 Q15
|
||||
|
||||
#define dT0 D14.F32
|
||||
#define dT1 D15.F32
|
||||
#define dT0 D14
|
||||
#define dT1 D15
|
||||
|
||||
|
||||
.MACRO FFTSTAGE scaled, inverse, name
|
||||
|
@ -197,23 +197,23 @@
|
|||
|
||||
@// Calculate the step of input data for the next set
|
||||
@//MOV step1,pointStep,LSL #1 @// step1 = 2*pointStep
|
||||
VLD2 {dXr0,dXi0},[pSrc, :128],pointStep @// data[0]
|
||||
VLD2.F32 {dXr0,dXi0},[pSrc, :128],pointStep @// data[0]
|
||||
MOV step1,grpSize,LSL #4
|
||||
|
||||
MOV step2,pointStep,LSL #3
|
||||
VLD2 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
|
||||
VLD2.F32 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
|
||||
SUB step2,step2,pointStep @// step2 = 7*pointStep
|
||||
@// setStep = - 7*pointStep+16
|
||||
RSB setStep,step2,#16
|
||||
|
||||
VLD2 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
|
||||
VLD2 {dXr3,dXi3},[pSrc, :128],pointStep @// data[3]
|
||||
VLD2 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
|
||||
VLD2 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
|
||||
VLD2 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
|
||||
VLD2.F32 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
|
||||
VLD2.F32 {dXr3,dXi3},[pSrc, :128],pointStep @// data[3]
|
||||
VLD2.F32 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
|
||||
VLD2.F32 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
|
||||
VLD2.F32 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
|
||||
@// data[7] & update pSrc for the next set
|
||||
@// setStep = -7*pointStep + 16
|
||||
VLD2 {dXr7,dXi7},[pSrc, :128],setStep
|
||||
VLD2.F32 {dXr7,dXi7},[pSrc, :128],setStep
|
||||
@// grp = 0 a special case since all the twiddle factors are 1
|
||||
@// Loop on the sets
|
||||
|
||||
|
@ -225,168 +225,168 @@ radix8fsGrpZeroSetLoop\name :
|
|||
|
||||
@// finish first stage of 8 point FFT
|
||||
|
||||
VADD qU0,qX0,qX4
|
||||
VADD qU2,qX1,qX5
|
||||
VADD qU4,qX2,qX6
|
||||
VADD qU6,qX3,qX7
|
||||
VADD.F32 qU0,qX0,qX4
|
||||
VADD.F32 qU2,qX1,qX5
|
||||
VADD.F32 qU4,qX2,qX6
|
||||
VADD.F32 qU6,qX3,qX7
|
||||
|
||||
@// finish second stage of 8 point FFT
|
||||
|
||||
VADD qV0,qU0,qU4
|
||||
VSUB qV2,qU0,qU4
|
||||
VADD qV4,qU2,qU6
|
||||
VSUB qV6,qU2,qU6
|
||||
VADD.F32 qV0,qU0,qU4
|
||||
VSUB.F32 qV2,qU0,qU4
|
||||
VADD.F32 qV4,qU2,qU6
|
||||
VSUB.F32 qV6,qU2,qU6
|
||||
|
||||
@// finish third stage of 8 point FFT
|
||||
|
||||
VADD qY0,qV0,qV4
|
||||
VSUB qY4,qV0,qV4
|
||||
VST2 {dYr0,dYi0},[pDst, :128],step1 @// store y0
|
||||
VADD.F32 qY0,qV0,qV4
|
||||
VSUB.F32 qY4,qV0,qV4
|
||||
VST2.F32 {dYr0,dYi0},[pDst, :128],step1 @// store y0
|
||||
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
|
||||
VSUB dYr2,dVr2,dVi6
|
||||
VADD dYi2,dVi2,dVr6
|
||||
VSUB.F32 dYr2,dVr2,dVi6
|
||||
VADD.F32 dYi2,dVi2,dVr6
|
||||
|
||||
VADD dYr6,dVr2,dVi6
|
||||
VST2 {dYr2,dYi2},[pDst, :128],step1 @// store y2
|
||||
VSUB dYi6,dVi2,dVr6
|
||||
VADD.F32 dYr6,dVr2,dVi6
|
||||
VST2.F32 {dYr2,dYi2},[pDst, :128],step1 @// store y2
|
||||
VSUB.F32 dYi6,dVi2,dVr6
|
||||
|
||||
VSUB qU1,qX0,qX4
|
||||
VST2 {dYr4,dYi4},[pDst, :128],step1 @// store y4
|
||||
VSUB.F32 qU1,qX0,qX4
|
||||
VST2.F32 {dYr4,dYi4},[pDst, :128],step1 @// store y4
|
||||
|
||||
VSUB qU3,qX1,qX5
|
||||
VSUB qU5,qX2,qX6
|
||||
VST2 {dYr6,dYi6},[pDst, :128],step1 @// store y6
|
||||
VSUB.F32 qU3,qX1,qX5
|
||||
VSUB.F32 qU5,qX2,qX6
|
||||
VST2.F32 {dYr6,dYi6},[pDst, :128],step1 @// store y6
|
||||
|
||||
.ELSE
|
||||
|
||||
VADD dYr6,dVr2,dVi6
|
||||
VSUB dYi6,dVi2,dVr6
|
||||
VADD.F32 dYr6,dVr2,dVi6
|
||||
VSUB.F32 dYi6,dVi2,dVr6
|
||||
|
||||
VSUB dYr2,dVr2,dVi6
|
||||
VST2 {dYr6,dYi6},[pDst, :128],step1 @// store y2
|
||||
VADD dYi2,dVi2,dVr6
|
||||
VSUB.F32 dYr2,dVr2,dVi6
|
||||
VST2.F32 {dYr6,dYi6},[pDst, :128],step1 @// store y2
|
||||
VADD.F32 dYi2,dVi2,dVr6
|
||||
|
||||
|
||||
VSUB qU1,qX0,qX4
|
||||
VST2 {dYr4,dYi4},[pDst, :128],step1 @// store y4
|
||||
VSUB qU3,qX1,qX5
|
||||
VSUB qU5,qX2,qX6
|
||||
VST2 {dYr2,dYi2},[pDst, :128],step1 @// store y6
|
||||
VSUB.F32 qU1,qX0,qX4
|
||||
VST2.F32 {dYr4,dYi4},[pDst, :128],step1 @// store y4
|
||||
VSUB.F32 qU3,qX1,qX5
|
||||
VSUB.F32 qU5,qX2,qX6
|
||||
VST2.F32 {dYr2,dYi2},[pDst, :128],step1 @// store y6
|
||||
|
||||
|
||||
.ENDIF
|
||||
|
||||
@// finish first stage of 8 point FFT
|
||||
|
||||
VSUB qU7,qX3,qX7
|
||||
VLD1 dT0[0], [t0]
|
||||
VSUB.F32 qU7,qX3,qX7
|
||||
VLD1.F32 dT0[0], [t0]
|
||||
|
||||
@// finish second stage of 8 point FFT
|
||||
|
||||
VSUB dVr1,dUr1,dUi5
|
||||
VSUB.F32 dVr1,dUr1,dUi5
|
||||
@// data[0] for next iteration
|
||||
VLD2 {dXr0,dXi0},[pSrc, :128],pointStep
|
||||
VADD dVi1,dUi1,dUr5
|
||||
VADD dVr3,dUr1,dUi5
|
||||
VLD2 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
|
||||
VSUB dVi3,dUi1,dUr5
|
||||
VLD2.F32 {dXr0,dXi0},[pSrc, :128],pointStep
|
||||
VADD.F32 dVi1,dUi1,dUr5
|
||||
VADD.F32 dVr3,dUr1,dUi5
|
||||
VLD2.F32 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
|
||||
VSUB.F32 dVi3,dUi1,dUr5
|
||||
|
||||
VSUB dVr5,dUr3,dUi7
|
||||
VLD2 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
|
||||
VADD dVi5,dUi3,dUr7
|
||||
VADD dVr7,dUr3,dUi7
|
||||
VLD2 {dXr3,dXi3},[pSrc, :128],pointStep @// data[3]
|
||||
VSUB dVi7,dUi3,dUr7
|
||||
VSUB.F32 dVr5,dUr3,dUi7
|
||||
VLD2.F32 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
|
||||
VADD.F32 dVi5,dUi3,dUr7
|
||||
VADD.F32 dVr7,dUr3,dUi7
|
||||
VLD2.F32 {dXr3,dXi3},[pSrc, :128],pointStep @// data[3]
|
||||
VSUB.F32 dVi7,dUi3,dUr7
|
||||
|
||||
@// finish third stage of 8 point FFT
|
||||
|
||||
.ifeqs "\inverse", "TRUE"
|
||||
|
||||
@// calculate a*v5
|
||||
VMUL dT1,dVr5,dT0[0] @// use dVi0 for dT1
|
||||
VMUL.F32 dT1,dVr5,dT0[0] @// use dVi0 for dT1
|
||||
|
||||
VLD2 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
|
||||
VMUL dVi5,dVi5,dT0[0]
|
||||
VLD2.F32 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
|
||||
VMUL.F32 dVi5,dVi5,dT0[0]
|
||||
|
||||
VLD2 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
|
||||
VSUB dVr5,dT1,dVi5 @// a * V5
|
||||
VADD dVi5,dT1,dVi5
|
||||
VLD2.F32 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
|
||||
VSUB.F32 dVr5,dT1,dVi5 @// a * V5
|
||||
VADD.F32 dVi5,dT1,dVi5
|
||||
|
||||
VLD2 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
|
||||
VLD2.F32 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
|
||||
|
||||
@// calculate b*v7
|
||||
VMUL dT1,dVr7,dT0[0]
|
||||
VMUL dVi7,dVi7,dT0[0]
|
||||
VMUL.F32 dT1,dVr7,dT0[0]
|
||||
VMUL.F32 dVi7,dVi7,dT0[0]
|
||||
|
||||
VADD qY1,qV1,qV5
|
||||
VSUB qY5,qV1,qV5
|
||||
VADD.F32 qY1,qV1,qV5
|
||||
VSUB.F32 qY5,qV1,qV5
|
||||
|
||||
|
||||
VADD dVr7,dT1,dVi7 @// b * V7
|
||||
VSUB dVi7,dVi7,dT1
|
||||
VADD.F32 dVr7,dT1,dVi7 @// b * V7
|
||||
VSUB.F32 dVi7,dVi7,dT1
|
||||
SUB pDst, pDst, step2 @// set pDst to y1
|
||||
|
||||
@// On the last iteration, this will read past the end of pSrc,
|
||||
@// so skip this read.
|
||||
BEQ radix8SkipLastUpdateInv\name
|
||||
VLD2 {dXr7,dXi7},[pSrc, :128],setStep @// data[7]
|
||||
VLD2.F32 {dXr7,dXi7},[pSrc, :128],setStep @// data[7]
|
||||
radix8SkipLastUpdateInv\name:
|
||||
|
||||
VSUB dYr3,dVr3,dVr7
|
||||
VSUB dYi3,dVi3,dVi7
|
||||
VST2 {dYr1,dYi1},[pDst, :128],step1 @// store y1
|
||||
VADD dYr7,dVr3,dVr7
|
||||
VADD dYi7,dVi3,dVi7
|
||||
VSUB.F32 dYr3,dVr3,dVr7
|
||||
VSUB.F32 dYi3,dVi3,dVi7
|
||||
VST2.F32 {dYr1,dYi1},[pDst, :128],step1 @// store y1
|
||||
VADD.F32 dYr7,dVr3,dVr7
|
||||
VADD.F32 dYi7,dVi3,dVi7
|
||||
|
||||
|
||||
VST2 {dYr3,dYi3},[pDst, :128],step1 @// store y3
|
||||
VST2 {dYr5,dYi5},[pDst, :128],step1 @// store y5
|
||||
VST2 {dYr7,dYi7},[pDst, :128] @// store y7
|
||||
VST2.F32 {dYr3,dYi3},[pDst, :128],step1 @// store y3
|
||||
VST2.F32 {dYr5,dYi5},[pDst, :128],step1 @// store y5
|
||||
VST2.F32 {dYr7,dYi7},[pDst, :128] @// store y7
|
||||
ADD pDst, pDst, #16
|
||||
|
||||
.ELSE
|
||||
|
||||
@// calculate b*v7
|
||||
VMUL dT1,dVr7,dT0[0]
|
||||
VLD2 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
|
||||
VMUL dVi7,dVi7,dT0[0]
|
||||
VMUL.F32 dT1,dVr7,dT0[0]
|
||||
VLD2.F32 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
|
||||
VMUL.F32 dVi7,dVi7,dT0[0]
|
||||
|
||||
VLD2 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
|
||||
VADD dVr7,dT1,dVi7 @// b * V7
|
||||
VSUB dVi7,dVi7,dT1
|
||||
VLD2.F32 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
|
||||
VADD.F32 dVr7,dT1,dVi7 @// b * V7
|
||||
VSUB.F32 dVi7,dVi7,dT1
|
||||
|
||||
VLD2 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
|
||||
VLD2.F32 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
|
||||
|
||||
@// calculate a*v5
|
||||
VMUL dT1,dVr5,dT0[0] @// use dVi0 for dT1
|
||||
VMUL dVi5,dVi5,dT0[0]
|
||||
VMUL.F32 dT1,dVr5,dT0[0] @// use dVi0 for dT1
|
||||
VMUL.F32 dVi5,dVi5,dT0[0]
|
||||
|
||||
VADD dYr7,dVr3,dVr7
|
||||
VADD dYi7,dVi3,dVi7
|
||||
VADD.F32 dYr7,dVr3,dVr7
|
||||
VADD.F32 dYi7,dVi3,dVi7
|
||||
SUB pDst, pDst, step2 @// set pDst to y1
|
||||
|
||||
VSUB dVr5,dT1,dVi5 @// a * V5
|
||||
VADD dVi5,dT1,dVi5
|
||||
VSUB.F32 dVr5,dT1,dVi5 @// a * V5
|
||||
VADD.F32 dVi5,dT1,dVi5
|
||||
|
||||
@// On the last iteration, this will read past the end of pSrc,
|
||||
@// so skip this read.
|
||||
BEQ radix8SkipLastUpdateFwd\name
|
||||
VLD2 {dXr7,dXi7},[pSrc, :128],setStep @// data[7]
|
||||
VLD2.F32 {dXr7,dXi7},[pSrc, :128],setStep @// data[7]
|
||||
radix8SkipLastUpdateFwd\name:
|
||||
|
||||
VSUB qY5,qV1,qV5
|
||||
VSUB.F32 qY5,qV1,qV5
|
||||
|
||||
VSUB dYr3,dVr3,dVr7
|
||||
VST2 {dYr7,dYi7},[pDst, :128],step1 @// store y1
|
||||
VSUB dYi3,dVi3,dVi7
|
||||
VADD qY1,qV1,qV5
|
||||
VSUB.F32 dYr3,dVr3,dVr7
|
||||
VST2.F32 {dYr7,dYi7},[pDst, :128],step1 @// store y1
|
||||
VSUB.F32 dYi3,dVi3,dVi7
|
||||
VADD.F32 qY1,qV1,qV5
|
||||
|
||||
|
||||
VST2 {dYr5,dYi5},[pDst, :128],step1 @// store y3
|
||||
VST2 {dYr3,dYi3},[pDst, :128],step1 @// store y5
|
||||
VST2 {dYr1,dYi1},[pDst, :128]! @// store y7
|
||||
VST2.F32 {dYr5,dYi5},[pDst, :128],step1 @// store y3
|
||||
VST2.F32 {dYr3,dYi3},[pDst, :128],step1 @// store y5
|
||||
VST2.F32 {dYr1,dYi1},[pDst, :128]! @// store y7
|
||||
|
||||
.ENDIF
|
||||
|
||||
|
|
|
@ -91,42 +91,42 @@
|
|||
|
||||
@// Neon registers
|
||||
|
||||
#define dX0 d0.f32
|
||||
#define dzero d1.f32
|
||||
#define dZero d2.f32
|
||||
#define dShift d3.f32
|
||||
#define dX0r d2.f32
|
||||
#define dX0i d3.f32
|
||||
#define dX1r d4.f32
|
||||
#define dX1i d5.f32
|
||||
#define dT0 d6.f32
|
||||
#define dT1 d7.f32
|
||||
#define dT2 d8.f32
|
||||
#define dT3 d9.f32
|
||||
#define qT0 d10.f32
|
||||
#define qT1 d12.f32
|
||||
#define dW0r d14.f32
|
||||
#define dW0i d15.f32
|
||||
#define dW1r d16.f32
|
||||
#define dW1i d17.f32
|
||||
#define dY0r d14.f32
|
||||
#define dY0i d15.f32
|
||||
#define dY1r d16.f32
|
||||
#define dY1i d17.f32
|
||||
#define dX0 d0
|
||||
#define dzero d1
|
||||
#define dZero d2
|
||||
#define dShift d3
|
||||
#define dX0r d2
|
||||
#define dX0i d3
|
||||
#define dX1r d4
|
||||
#define dX1i d5
|
||||
#define dT0 d6
|
||||
#define dT1 d7
|
||||
#define dT2 d8
|
||||
#define dT3 d9
|
||||
#define qT0 d10
|
||||
#define qT1 d12
|
||||
#define dW0r d14
|
||||
#define dW0i d15
|
||||
#define dW1r d16
|
||||
#define dW1i d17
|
||||
#define dY0r d14
|
||||
#define dY0i d15
|
||||
#define dY1r d16
|
||||
#define dY1i d17
|
||||
#define dY0rS64 d14.s64
|
||||
#define dY0iS64 d15.s64
|
||||
#define qT2 d18.f32
|
||||
#define qT3 d20.f32
|
||||
#define qT2 d18
|
||||
#define qT3 d20
|
||||
@// Registers used for the last three elements
|
||||
#define dX1 d3.f32
|
||||
#define dW0 d4.f32
|
||||
#define dW1 d5.f32
|
||||
#define dY0 d10.f32
|
||||
#define dY1 d11.f32
|
||||
#define dY2 d12.f32
|
||||
#define dY3 d13.f32
|
||||
#define dX1 d3
|
||||
#define dW0 d4
|
||||
#define dW1 d5
|
||||
#define dY0 d10
|
||||
#define dY1 d11
|
||||
#define dY2 d12
|
||||
#define dY3 d13
|
||||
|
||||
#define half d0.f32
|
||||
#define half d0
|
||||
|
||||
@// Allocate stack memory required by the function
|
||||
|
||||
|
@ -151,11 +151,11 @@
|
|||
@// N=1: treat separately
|
||||
CMP N,#1
|
||||
BGT sizeGreaterThanOne
|
||||
VLD1 dX0[0],[pSrc]
|
||||
VLD1.F32 dX0[0],[pSrc]
|
||||
MOV zero,#0
|
||||
VMOV dzero[0],zero
|
||||
VMOV dZero[0],zero
|
||||
VST3 {dX0[0],dzero[0],dZero[0]},[pDst]
|
||||
VMOV.F32 dzero[0],zero
|
||||
VMOV.F32 dZero[0],zero
|
||||
VST3.F32 {dX0[0],dzero[0],dZero[0]},[pDst]
|
||||
|
||||
B End
|
||||
|
||||
|
@ -176,8 +176,8 @@ sizeGreaterThanOne:
|
|||
|
||||
CMP order,#1
|
||||
BGE orderGreaterthan0 @// order > 0
|
||||
VLD1 dX0,[pSrc]
|
||||
VST1 dX0,[pOut]
|
||||
VLD1.F32 dX0,[pSrc]
|
||||
VST1.F32 dX0,[pOut]
|
||||
MOV pSrc,pOut
|
||||
MOV argDst,pDst
|
||||
BLT FFTEnd
|
||||
|
@ -266,25 +266,25 @@ finalComplexToRealFixup:
|
|||
@// (a-b, 0)
|
||||
|
||||
@// F(0) and F(N/2)
|
||||
VLD2 {dX0r[0],dX0i[0]},[pSrc]!
|
||||
VLD2.F32 {dX0r[0],dX0i[0]},[pSrc]!
|
||||
MOV zero,#0
|
||||
VMOV dX0r[1],zero
|
||||
VMOV.F32 dX0r[1],zero
|
||||
MOV step,subFFTSize,LSL #3 @// step = N/2 * 8 bytes
|
||||
VMOV dX0i[1],zero
|
||||
VMOV.F32 dX0i[1],zero
|
||||
@// twStep = 3N/8 * 8 bytes pointing to W^1
|
||||
SUB twStep,step,subFFTSize,LSL #1
|
||||
|
||||
VADD dY0r,dX0r,dX0i @// F(0) = ((Z0.r+Z0.i) , 0)
|
||||
VADD.F32 dY0r,dX0r,dX0i @// F(0) = ((Z0.r+Z0.i) , 0)
|
||||
MOV step1,subFFTSize,LSL #2 @// step1 = N/2 * 4 bytes
|
||||
VSUB dY0i,dX0r,dX0i @// F(N/2) = ((Z0.r-Z0.i) , 0)
|
||||
VSUB.F32 dY0i,dX0r,dX0i @// F(N/2) = ((Z0.r-Z0.i) , 0)
|
||||
SUBS subFFTSize,subFFTSize,#2
|
||||
|
||||
VST1 dY0r,[argDst],step
|
||||
VST1.F32 dY0r,[argDst],step
|
||||
ADD pTwiddleTmp,argTwiddle,#8 @// W^2
|
||||
VST1 dY0i,[argDst]!
|
||||
VST1.F32 dY0i,[argDst]!
|
||||
ADD argTwiddle,argTwiddle,twStep @// W^1
|
||||
|
||||
VDUP dzero,zero
|
||||
VDUP.F32 dzero,zero
|
||||
SUB argDst,argDst,step
|
||||
|
||||
BLT End
|
||||
|
@ -299,75 +299,75 @@ finalComplexToRealFixup:
|
|||
|
||||
|
||||
ADR t0, HALF
|
||||
VLD1 half[0], [t0]
|
||||
VLD1.F32 half[0], [t0]
|
||||
|
||||
evenOddButterflyLoop:
|
||||
|
||||
|
||||
VLD1 dW0r,[argTwiddle],step1
|
||||
VLD1 dW1r,[argTwiddle]!
|
||||
VLD1.F32 dW0r,[argTwiddle],step1
|
||||
VLD1.F32 dW1r,[argTwiddle]!
|
||||
|
||||
VLD2 {dX0r,dX0i},[pSrc],step
|
||||
VLD2.F32 {dX0r,dX0i},[pSrc],step
|
||||
SUB argTwiddle,argTwiddle,step1
|
||||
VLD2 {dX1r,dX1i},[pSrc]!
|
||||
VLD2.F32 {dX1r,dX1i},[pSrc]!
|
||||
|
||||
|
||||
|
||||
SUB step1,step1,#8 @// (N/4-2)*8 bytes
|
||||
VLD1 dW0i,[pTwiddleTmp],step1
|
||||
VLD1 dW1i,[pTwiddleTmp]!
|
||||
VLD1.F32 dW0i,[pTwiddleTmp],step1
|
||||
VLD1.F32 dW1i,[pTwiddleTmp]!
|
||||
SUB pSrc,pSrc,step
|
||||
|
||||
SUB pTwiddleTmp,pTwiddleTmp,step1
|
||||
VREV64 dX1r,dX1r
|
||||
VREV64 dX1i,dX1i
|
||||
VREV64.F32 dX1r,dX1r
|
||||
VREV64.F32 dX1i,dX1i
|
||||
SUBS subFFTSize,subFFTSize,#4
|
||||
|
||||
|
||||
|
||||
VSUB dT2,dX0r,dX1r @// a-c
|
||||
VSUB.F32 dT2,dX0r,dX1r @// a-c
|
||||
SUB step1,step1,#8
|
||||
VADD dT0,dX0r,dX1r @// a+c
|
||||
VSUB dT1,dX0i,dX1i @// b-d
|
||||
VADD dT3,dX0i,dX1i @// b+d
|
||||
VMUL dT0,dT0,half[0]
|
||||
VMUL dT1,dT1,half[0]
|
||||
VZIP dW1r,dW1i
|
||||
VZIP dW0r,dW0i
|
||||
VADD.F32 dT0,dX0r,dX1r @// a+c
|
||||
VSUB.F32 dT1,dX0i,dX1i @// b-d
|
||||
VADD.F32 dT3,dX0i,dX1i @// b+d
|
||||
VMUL.F32 dT0,dT0,half[0]
|
||||
VMUL.F32 dT1,dT1,half[0]
|
||||
VZIP.F32 dW1r,dW1i
|
||||
VZIP.F32 dW0r,dW0i
|
||||
|
||||
|
||||
VMUL qT0,dW1r,dT2
|
||||
VMUL qT1,dW1r,dT3
|
||||
VMUL qT2,dW0r,dT2
|
||||
VMUL qT3,dW0r,dT3
|
||||
VMUL.F32 qT0,dW1r,dT2
|
||||
VMUL.F32 qT1,dW1r,dT3
|
||||
VMUL.F32 qT2,dW0r,dT2
|
||||
VMUL.F32 qT3,dW0r,dT3
|
||||
|
||||
VMLA qT0,dW1i,dT3
|
||||
VMLS qT1,dW1i,dT2
|
||||
VMLA.F32 qT0,dW1i,dT3
|
||||
VMLS.F32 qT1,dW1i,dT2
|
||||
|
||||
VMLS qT2,dW0i,dT3
|
||||
VMLA qT3,dW0i,dT2
|
||||
VMLS.F32 qT2,dW0i,dT3
|
||||
VMLA.F32 qT3,dW0i,dT2
|
||||
|
||||
|
||||
VMUL dX1r,qT0,half[0]
|
||||
VMUL dX1i,qT1,half[0]
|
||||
VMUL.F32 dX1r,qT0,half[0]
|
||||
VMUL.F32 dX1i,qT1,half[0]
|
||||
|
||||
VSUB dY1r,dT0,dX1i @// F(N/2 -1)
|
||||
VADD dY1i,dT1,dX1r
|
||||
VNEG dY1i,dY1i
|
||||
VSUB.F32 dY1r,dT0,dX1i @// F(N/2 -1)
|
||||
VADD.F32 dY1i,dT1,dX1r
|
||||
VNEG.F32 dY1i,dY1i
|
||||
|
||||
VREV64 dY1r,dY1r
|
||||
VREV64 dY1i,dY1i
|
||||
VREV64.F32 dY1r,dY1r
|
||||
VREV64.F32 dY1i,dY1i
|
||||
|
||||
|
||||
VMUL dX0r,qT2,half[0]
|
||||
VMUL dX0i,qT3,half[0]
|
||||
VMUL.F32 dX0r,qT2,half[0]
|
||||
VMUL.F32 dX0i,qT3,half[0]
|
||||
|
||||
VSUB dY0r,dT0,dX0i @// F(1)
|
||||
VADD dY0i,dT1,dX0r
|
||||
VSUB.F32 dY0r,dT0,dX0i @// F(1)
|
||||
VADD.F32 dY0i,dT1,dX0r
|
||||
|
||||
|
||||
VST2 {dY0r,dY0i},[argDst],step
|
||||
VST2 {dY1r,dY1i},[argDst]!
|
||||
VST2.F32 {dY0r,dY0i},[argDst],step
|
||||
VST2.F32 {dY1r,dY1i},[argDst]!
|
||||
SUB argDst,argDst,step
|
||||
SUB step,step,#32 @// (N/2-4)*8 bytes
|
||||
|
||||
|
@ -388,11 +388,11 @@ evenOddButterflyLoop:
|
|||
@// Since (c,d) = (0,1) for the last element, result is just (a,-b)
|
||||
|
||||
lastElement:
|
||||
VLD1 dX0r,[pSrc]
|
||||
VLD1.F32 dX0r,[pSrc]
|
||||
|
||||
VST1 dX0r[0],[argDst]!
|
||||
VNEG dX0r,dX0r
|
||||
VST1 dX0r[1],[argDst]!
|
||||
VST1.F32 dX0r[0],[argDst]!
|
||||
VNEG.F32 dX0r,dX0r
|
||||
VST1.F32 dX0r[1],[argDst]!
|
||||
|
||||
End:
|
||||
@// Set return value
|
||||
|
|
|
@ -100,45 +100,45 @@
|
|||
|
||||
@// Neon registers
|
||||
|
||||
#define dX0 D0.F32
|
||||
#define dShift D1.F32
|
||||
#define dX1 D1.F32
|
||||
#define dY0 D2.F32
|
||||
#define dY1 D3.F32
|
||||
#define dX0r D0.F32
|
||||
#define dX0i D1.F32
|
||||
#define dX1r D2.F32
|
||||
#define dX1i D3.F32
|
||||
#define dW0r D4.F32
|
||||
#define dW0i D5.F32
|
||||
#define dW1r D6.F32
|
||||
#define dW1i D7.F32
|
||||
#define dT0 D8.F32
|
||||
#define dT1 D9.F32
|
||||
#define dT2 D10.F32
|
||||
#define dT3 D11.F32
|
||||
#define qT0 d12.F32
|
||||
#define qT1 d14.F32
|
||||
#define qT2 d16.F32
|
||||
#define qT3 d18.F32
|
||||
#define dY0r D4.F32
|
||||
#define dY0i D5.F32
|
||||
#define dY1r D6.F32
|
||||
#define dY1i D7.F32
|
||||
#define dzero D20.F32
|
||||
#define dX0 D0
|
||||
#define dShift D1
|
||||
#define dX1 D1
|
||||
#define dY0 D2
|
||||
#define dY1 D3
|
||||
#define dX0r D0
|
||||
#define dX0i D1
|
||||
#define dX1r D2
|
||||
#define dX1i D3
|
||||
#define dW0r D4
|
||||
#define dW0i D5
|
||||
#define dW1r D6
|
||||
#define dW1i D7
|
||||
#define dT0 D8
|
||||
#define dT1 D9
|
||||
#define dT2 D10
|
||||
#define dT3 D11
|
||||
#define qT0 d12
|
||||
#define qT1 d14
|
||||
#define qT2 d16
|
||||
#define qT3 d18
|
||||
#define dY0r D4
|
||||
#define dY0i D5
|
||||
#define dY1r D6
|
||||
#define dY1i D7
|
||||
#define dzero D20
|
||||
|
||||
#define dY2 D4.F32
|
||||
#define dY3 D5.F32
|
||||
#define dW0 D6.F32
|
||||
#define dW1 D7.F32
|
||||
#define dW0Tmp D10.F32
|
||||
#define dW1Neg D11.F32
|
||||
#define dY2 D4
|
||||
#define dY3 D5
|
||||
#define dW0 D6
|
||||
#define dW1 D7
|
||||
#define dW0Tmp D10
|
||||
#define dW1Neg D11
|
||||
|
||||
#define sN S0.S32
|
||||
#define fN S1.F32
|
||||
#define fN S1
|
||||
@// two must be the same as dScale[0]!
|
||||
#define dScale D2.F32
|
||||
#define two S4.F32
|
||||
#define dScale D2
|
||||
#define two S4
|
||||
|
||||
|
||||
@// Allocate stack memory required by the function
|
||||
|
@ -165,8 +165,8 @@
|
|||
@// N=1 Treat separately
|
||||
CMP N,#1
|
||||
BGT sizeGreaterThanOne
|
||||
VLD1 dX0[0],[pSrc]
|
||||
VST1 dX0[0],[pDst]
|
||||
VLD1.F32 dX0[0],[pSrc]
|
||||
VST1.F32 dX0[0],[pDst]
|
||||
|
||||
B End
|
||||
|
||||
|
@ -195,8 +195,8 @@ complexIFFT:
|
|||
CMP order,#1
|
||||
BGE orderGreaterthan0 @// order > 0
|
||||
|
||||
VLD1 dX0,[pSrc]
|
||||
VST1 dX0,[pDst]
|
||||
VLD1.F32 dX0,[pSrc]
|
||||
VST1.F32 dX0,[pDst]
|
||||
MOV pSrc,pDst
|
||||
BLT FFTEnd
|
||||
|
||||
|
@ -260,14 +260,14 @@ lastStageUnscaledRadix4:
|
|||
|
||||
FFTEnd: @// Does only the scaling
|
||||
@ Scale inverse FFT result by 2 for consistency with other FFTs
|
||||
VMOV two, 2.0 @ two = dScale[0]
|
||||
VMOV.F32 two, #2.0 @ two = dScale[0]
|
||||
|
||||
@// N = subFFTSize ; dataptr = pDst
|
||||
scaleFFTData:
|
||||
VLD1 {dX0},[pSrc] @// pSrc contains pDst pointer
|
||||
VLD1.F32 {dX0},[pSrc] @// pSrc contains pDst pointer
|
||||
SUBS subFFTSize,subFFTSize,#1
|
||||
VMUL dX0, dX0, dScale[0]
|
||||
VST1 {dX0},[pSrc]!
|
||||
VMUL.F32 dX0, dX0, dScale[0]
|
||||
VST1.F32 {dX0},[pSrc]!
|
||||
|
||||
BGT scaleFFTData
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче