Bug 1845651 - Part 2. Convert OpenMAX DL assembler to clang compatible. r=padenot

Newer Android NDKs no longer ship the GNU assembler, so convert the OpenMAX DL
assembly in Gecko to be compatible with clang's integrated assembler.
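
To illustrate the shape of the conversion (a minimal sketch based on the hunks
below, reusing register names from this patch; it is not an extra file in the
change): GNU as accepts a NEON data type attached to a register alias and also
understands the .func/.endfunc directives, neither of which clang's integrated
assembler handles, so the data type moves onto the mnemonic and the directives
get guarded:

  @ GNU as only: data type carried by the register alias
  #define dY0 D2.F32
  VADD     dY0,dX0,dX1

  @ clang's integrated assembler (and GNU as): data type on the mnemonic
  #define dY0 D2
  VADD.F32 dY0,dX0,dX1

  @ .func/.endfunc are not accepted by clang's integrated assembler
  #ifndef __clang__
  .func \name
  #endif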

Gecko's copy of OpenMAX DL was imported from the Chromium tree, but it has
since been removed from Chromium (https://crbug.com/917355), so there is no
upstream for this code anymore.

Also, using MOZ_LIBAV_FF on an arm32 build is slower than OpenMAX DL.

Convolution reverb (https://padenot.github.io/webaudio-benchmark/index.html):
- OpenMAX DL ... 16x
- MOZ_LIBAV_FF ... 13x

Depends on D184717

Differential Revision: https://phabricator.services.mozilla.com/D184719
Makoto Kato 2023-08-05 07:16:21 +00:00
Parent 98a5909e60
Commit ccf77507b4
12 changed files: 777 additions and 778 deletions

View file

@ -168,10 +168,13 @@
@ Define the function and make it external.
.global \name
#ifndef __clang__
.func \name
#endif
.section .text.\name,"ax",%progbits
.arch armv7-a
.fpu neon
.syntax unified
.object_arch armv4
.align 2
\name :
@ -203,7 +206,9 @@
@ Restore any saved R or D registers.
_M_RET
.fnend
#ifndef __clang__
.endfunc
#endif
@ Reset the global stack tracking variables back to their
@ initial values.
.set _SBytes, 0

View file

@ -47,9 +47,3 @@ if CONFIG['CPU_ARCH'] == 'arm' and CONFIG['BUILD_ARM_NEON']:
DEFINES['BIG_FFT_TABLE'] = True
FINAL_LIBRARY = 'xul'
if CONFIG['CC_TYPE'] == 'clang':
ASFLAGS += [
'-no-integrated-as',
]

View file

@ -85,40 +85,40 @@
@// Neon registers
#define dX0 D0.F32
#define dShift D1.F32
#define dX1 D1.F32
#define dY0 D2.F32
#define dY1 D3.F32
#define dX0r D0.F32
#define dX0i D1.F32
#define dX1r D2.F32
#define dX1i D3.F32
#define dW0r D4.F32
#define dW0i D5.F32
#define dW1r D6.F32
#define dW1i D7.F32
#define dT0 D8.F32
#define dT1 D9.F32
#define dT2 D10.F32
#define dT3 D11.F32
#define qT0 D12.F32
#define qT1 D14.F32
#define qT2 D16.F32
#define qT3 D18.F32
#define dY0r D4.F32
#define dY0i D5.F32
#define dY1r D6.F32
#define dY1i D7.F32
#define dX0 D0
#define dShift D1
#define dX1 D1
#define dY0 D2
#define dY1 D3
#define dX0r D0
#define dX0i D1
#define dX1r D2
#define dX1i D3
#define dW0r D4
#define dW0i D5
#define dW1r D6
#define dW1i D7
#define dT0 D8
#define dT1 D9
#define dT2 D10
#define dT3 D11
#define qT0 D12
#define qT1 D14
#define qT2 D16
#define qT3 D18
#define dY0r D4
#define dY0i D5
#define dY1r D6
#define dY1i D7
#define dY2 D4.F32
#define dY3 D5.F32
#define dW0 D6.F32
#define dW1 D7.F32
#define dW0Tmp D10.F32
#define dW1Neg D11.F32
#define dY2 D4
#define dY3 D5
#define dW0 D6
#define dW1 D7
#define dW0Tmp D10
#define dW1Neg D11
#define half D13.F32
#define half D13
@ Structure offsets for the FFTSpec
.set ARMsFFTSpec_N, 0
@ -135,7 +135,7 @@
LDR pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
LDR pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
VMOV half, 0.5
VMOV.F32 half, #0.5
MOV size,N,ASR #1 @// preserve the contents of N
@ -149,33 +149,33 @@
@// Z(0) : no need of twiddle multiply
@// Z(0) = 1/2 { [F(0) + F'(N/2)] +j [F(0) - F'(N/2)] }
VLD1 dX0,[pSrc],step
VLD1.F32 dX0,[pSrc],step
ADD pOut1,pOut,step @// pOut1 = pOut+ N/2*8 bytes
VLD1 dX1,[pSrc]!
VLD1.F32 dX1,[pSrc]!
@// twStep = 3N/8 * 8 bytes pointing to W^1
SUB twStep,step,size,LSL #1
MOV step1,size,LSL #2 @// step1 = N/4 * 8 = N/2*4 bytes
SUB step1,step1,#8 @// (N/4-1)*8 bytes
VADD dY0,dX0,dX1 @// [b+d | a+c]
VSUB dY1,dX0,dX1 @// [b-d | a-c]
VMUL dY0, dY0, half[0]
VMUL dY1, dY1, half[0]
VADD.F32 dY0,dX0,dX1 @// [b+d | a+c]
VSUB.F32 dY1,dX0,dX1 @// [b-d | a-c]
VMUL.F32 dY0, dY0, half[0]
VMUL.F32 dY1, dY1, half[0]
@// dY0= [a-c | a+c] ;dY1= [b-d | b+d]
VZIP dY0,dY1
VZIP.F32 dY0,dY1
VSUB dX0,dY0,dY1
VSUB.F32 dX0,dY0,dY1
SUBS size,size,#2
VADD dX1,dY0,dY1
VADD.F32 dX1,dY0,dY1
SUB pSrc,pSrc,step
VST1 dX0[0],[pOut1]!
VST1.F32 dX0[0],[pOut1]!
ADD pTwiddleTmp,pTwiddle,#8 @// W^2
VST1 dX1[1],[pOut1]!
VST1.F32 dX1[1],[pOut1]!
ADD argTwiddle1,pTwiddle,twStep @// W^1
@ -195,65 +195,65 @@
evenOddButterflyLoop\name :
VLD1 dW0r,[argTwiddle1],step1
VLD1 dW1r,[argTwiddle1]!
VLD1.F32 dW0r,[argTwiddle1],step1
VLD1.F32 dW1r,[argTwiddle1]!
VLD2 {dX0r,dX0i},[pSrc],step
VLD2.F32 {dX0r,dX0i},[pSrc],step
SUB argTwiddle1,argTwiddle1,step1
VLD2 {dX1r,dX1i},[pSrc]!
VLD2.F32 {dX1r,dX1i},[pSrc]!
SUB step1,step1,#8 @// (N/4-2)*8 bytes
VLD1 dW0i,[pTwiddleTmp],step1
VLD1 dW1i,[pTwiddleTmp]!
VLD1.F32 dW0i,[pTwiddleTmp],step1
VLD1.F32 dW1i,[pTwiddleTmp]!
SUB pSrc,pSrc,step
SUB pTwiddleTmp,pTwiddleTmp,step1
VREV64 dX1r,dX1r
VREV64 dX1i,dX1i
VREV64.F32 dX1r,dX1r
VREV64.F32 dX1i,dX1i
SUBS size,size,#4
VSUB dT2,dX0r,dX1r @// a-c
VADD dT3,dX0i,dX1i @// b+d
VADD dT0,dX0r,dX1r @// a+c
VSUB dT1,dX0i,dX1i @// b-d
VSUB.F32 dT2,dX0r,dX1r @// a-c
VADD.F32 dT3,dX0i,dX1i @// b+d
VADD.F32 dT0,dX0r,dX1r @// a+c
VSUB.F32 dT1,dX0i,dX1i @// b-d
SUB step1,step1,#8
VMUL dT2, dT2, half[0]
VMUL dT3, dT3, half[0]
VMUL.F32 dT2, dT2, half[0]
VMUL.F32 dT3, dT3, half[0]
VMUL dT0, dT0, half[0]
VMUL dT1, dT1, half[0]
VMUL.F32 dT0, dT0, half[0]
VMUL.F32 dT1, dT1, half[0]
VZIP dW1r,dW1i
VZIP dW0r,dW0i
VZIP.F32 dW1r,dW1i
VZIP.F32 dW0r,dW0i
VMUL dX1r,dW1r,dT2
VMUL dX1i,dW1r,dT3
VMUL dX0r,dW0r,dT2
VMUL dX0i,dW0r,dT3
VMUL.F32 dX1r,dW1r,dT2
VMUL.F32 dX1i,dW1r,dT3
VMUL.F32 dX0r,dW0r,dT2
VMUL.F32 dX0i,dW0r,dT3
VMLS dX1r,dW1i,dT3
VMLA dX1i,dW1i,dT2
VMLS.F32 dX1r,dW1i,dT3
VMLA.F32 dX1i,dW1i,dT2
VMLA dX0r,dW0i,dT3
VMLS dX0i,dW0i,dT2
VMLA.F32 dX0r,dW0i,dT3
VMLS.F32 dX0i,dW0i,dT2
VADD dY1r,dT0,dX1i @// F(N/2 -1)
VSUB dY1i,dX1r,dT1
VADD.F32 dY1r,dT0,dX1i @// F(N/2 -1)
VSUB.F32 dY1i,dX1r,dT1
VREV64 dY1r,dY1r
VREV64 dY1i,dY1i
VREV64.F32 dY1r,dY1r
VREV64.F32 dY1i,dY1i
VADD dY0r,dT0,dX0i @// F(1)
VSUB dY0i,dT1,dX0r
VADD.F32 dY0r,dT0,dX0i @// F(1)
VSUB.F32 dY0i,dT1,dX0r
VST2 {dY0r,dY0i},[pOut1],step
VST2 {dY1r,dY1i},[pOut1]!
VST2.F32 {dY0r,dY0i},[pOut1],step
VST2.F32 {dY1r,dY1i},[pOut1]!
SUB pOut1,pOut1,step
SUB step,step,#32 @// (N/2-4)*8 bytes
@ -274,11 +274,11 @@ evenOddButterflyLoop\name :
@// Since (c,d) = (0,1) for the last element, result is just (a,-b)
lastElement\name :
VLD1 dX0r,[pSrc]
VLD1.F32 dX0r,[pSrc]
VST1 dX0r[0],[pOut1]!
VNEG dX0r,dX0r
VST1 dX0r[1],[pOut1]
VST1.F32 dX0r[0],[pOut1]!
VNEG.F32 dX0r,dX0r
VST1.F32 dX0r[1],[pOut1]

View file

@ -67,10 +67,10 @@
@// Neon Registers
#define dX0 D0.F32
#define dX1 D1.F32
#define dY0 D2.F32
#define dY1 D3.F32
#define dX0 D0
#define dX1 D1
#define dY0 D2
#define dY1 D3
.MACRO FFTSTAGE scaled, inverse, name
@ -99,16 +99,16 @@
grpZeroSetLoop\name :
VLD1 dX0,[pSrc],pointStep
VLD1 dX1,[pSrc],step @// step = -pointStep + 8
VLD1.F32 dX0,[pSrc],pointStep
VLD1.F32 dX1,[pSrc],step @// step = -pointStep + 8
SUBS setCount,setCount,#1
VADD dY0,dX0,dX1
VSUB dY1,dX0,dX1
VADD.F32 dY0,dX0,dX1
VSUB.F32 dY1,dX0,dX1
VST1 dY0,[pDst],outPointStep
VST1.F32 dY0,[pDst],outPointStep
@// dstStep = step = -pointStep + 8
VST1 dY1,[pDst],dstStep
VST1.F32 dY1,[pDst],dstStep
BGT grpZeroSetLoop\name

View file

@ -60,18 +60,18 @@
@// Neon Registers
#define dWr d0.f32
#define dWi d1.f32
#define dXr0 d2.f32
#define dXi0 d3.f32
#define dXr1 d4.f32
#define dXi1 d5.f32
#define dYr0 d6.f32
#define dYi0 d7.f32
#define dYr1 d8.f32
#define dYi1 d9.f32
#define qT0 d10.f32
#define qT1 d12.f32
#define dWr d0
#define dWi d1
#define dXr0 d2
#define dXi0 d3
#define dXr1 d4
#define dXi1 d5
#define dYr0 d6
#define dYi0 d7
#define dYr1 d8
#define dYi1 d9
#define qT0 d10
#define qT1 d12
.MACRO FFTSTAGE scaled, inverse, name
@ -93,37 +93,37 @@
radix2lsGrpLoop\name :
@ dWr = [pTwiddle[0].Re, pTwiddle[1].Re]
@ dWi = [pTwiddle[0].Im, pTwiddle[1].Im]
VLD2 {dWr,dWi},[pTwiddle, :64]!
VLD2.F32 {dWr,dWi},[pTwiddle, :64]!
@ dXr0 = [pSrc[0].Re, pSrc[2].Re]
@ dXi0 = [pSrc[0].Im, pSrc[2].Im]
@ dXr1 = [pSrc[1].Re, pSrc[3].Re]
@ dXi1 = [pSrc[1].Im, pSrc[3].Im]
VLD4 {dXr0,dXi0,dXr1,dXi1},[pSrc, :128]!
VLD4.F32 {dXr0,dXi0,dXr1,dXi1},[pSrc, :128]!
SUBS grpCount,grpCount,#4 @// grpCount is multiplied by 2
.ifeqs "\inverse", "TRUE"
VMUL qT0,dWr,dXr1
VMLA qT0,dWi,dXi1 @// real part
VMUL qT1,dWr,dXi1
VMLS qT1,dWi,dXr1 @// imag part
VMUL.F32 qT0,dWr,dXr1
VMLA.F32 qT0,dWi,dXi1 @// real part
VMUL.F32 qT1,dWr,dXi1
VMLS.F32 qT1,dWi,dXr1 @// imag part
.else
VMUL qT0,dWr,dXr1
VMLS qT0,dWi,dXi1 @// real part
VMUL qT1,dWr,dXi1
VMLA qT1,dWi,dXr1 @// imag part
VMUL.F32 qT0,dWr,dXr1
VMLS.F32 qT0,dWi,dXi1 @// real part
VMUL.F32 qT1,dWr,dXi1
VMLA.F32 qT1,dWi,dXr1 @// imag part
.endif
VSUB dYr0,dXr0,qT0
VSUB dYi0,dXi0,qT1
VADD dYr1,dXr0,qT0
VADD dYi1,dXi0,qT1
VSUB.F32 dYr0,dXr0,qT0
VSUB.F32 dYi0,dXi0,qT1
VADD.F32 dYr1,dXr0,qT0
VADD.F32 dYi1,dXi0,qT1
VST2 {dYr0,dYi0},[pDst],outPointStep
VST2 {dYr1,dYi1},[pDst],dstStep @// dstStep = step = -outPointStep + 16
VST2.F32 {dYr0,dYi0},[pDst],outPointStep
VST2.F32 {dYr1,dYi1},[pDst],dstStep @// dstStep = step = -outPointStep + 16
BGT radix2lsGrpLoop\name

View file

@ -69,17 +69,17 @@
@// Neon Registers
#define dW D0.F32
#define dX0 D2.F32
#define dX1 D3.F32
#define dX2 D4.F32
#define dX3 D5.F32
#define dY0 D6.F32
#define dY1 D7.F32
#define dY2 D8.F32
#define dY3 D9.F32
#define qT0 D10.F32
#define qT1 D11.F32
#define dW D0
#define dX0 D2
#define dX1 D3
#define dX2 D4
#define dX3 D5
#define dY0 D6
#define dY1 D7
#define dY2 D8
#define dY3 D9
#define qT0 D10
#define qT1 D11
.MACRO FFTSTAGE scaled, inverse, name
@ -115,7 +115,7 @@
radix2GrpLoop\name :
MOV setCount,pointStep,LSR #3
VLD1 dW,[pTwiddle],pointStep @//[wi | wr]
VLD1.F32 dW,[pTwiddle],pointStep @//[wi | wr]
@// Loop on the sets
@ -125,35 +125,35 @@ radix2SetLoop\name :
@// point0: dX0-real part dX1-img part
VLD2 {dX0,dX1},[pSrc],pointStep
VLD2.F32 {dX0,dX1},[pSrc],pointStep
@// point1: dX2-real part dX3-img part
VLD2 {dX2,dX3},[pSrc],step
VLD2.F32 {dX2,dX3},[pSrc],step
SUBS setCount,setCount,#2
.ifeqs "\inverse", "TRUE"
VMUL qT0,dX2,dW[0]
VMLA qT0,dX3,dW[1] @// real part
VMUL qT1,dX3,dW[0]
VMLS qT1,dX2,dW[1] @// imag part
VMUL.F32 qT0,dX2,dW[0]
VMLA.F32 qT0,dX3,dW[1] @// real part
VMUL.F32 qT1,dX3,dW[0]
VMLS.F32 qT1,dX2,dW[1] @// imag part
.else
VMUL qT0,dX2,dW[0]
VMLS qT0,dX3,dW[1] @// real part
VMUL qT1,dX3,dW[0]
VMLA qT1,dX2,dW[1] @// imag part
VMUL.F32 qT0,dX2,dW[0]
VMLS.F32 qT0,dX3,dW[1] @// real part
VMUL.F32 qT1,dX3,dW[0]
VMLA.F32 qT1,dX2,dW[1] @// imag part
.endif
VSUB dY0,dX0,qT0
VSUB dY1,dX1,qT1
VADD dY2,dX0,qT0
VADD dY3,dX1,qT1
VSUB.F32 dY0,dX0,qT0
VSUB.F32 dY1,dX1,qT1
VADD.F32 dY2,dX0,qT0
VADD.F32 dY3,dX1,qT1
VST2 {dY0,dY1},[pDst],outPointStep
VST2.F32 {dY0,dY1},[pDst],outPointStep
@// dstStep = -outPointStep + 16
VST2 {dY2,dY3},[pDst],dstStep
VST2.F32 {dY2,dY3},[pDst],dstStep
BGT radix2SetLoop\name

View file

@ -68,42 +68,42 @@
@// Neon Registers
#define dXr0 D0.F32
#define dXi0 D1.F32
#define dXr1 D2.F32
#define dXi1 D3.F32
#define dXr2 D4.F32
#define dXi2 D5.F32
#define dXr3 D6.F32
#define dXi3 D7.F32
#define dYr0 D8.F32
#define dYi0 D9.F32
#define dYr1 D10.F32
#define dYi1 D11.F32
#define dYr2 D12.F32
#define dYi2 D13.F32
#define dYr3 D14.F32
#define dYi3 D15.F32
#define qX0 Q0.F32
#define qX1 Q1.F32
#define qX2 Q2.F32
#define qX3 Q3.F32
#define qY0 Q4.F32
#define qY1 Q5.F32
#define qY2 Q6.F32
#define qY3 Q7.F32
#define dZr0 D16.F32
#define dZi0 D17.F32
#define dZr1 D18.F32
#define dZi1 D19.F32
#define dZr2 D20.F32
#define dZi2 D21.F32
#define dZr3 D22.F32
#define dZi3 D23.F32
#define qZ0 Q8.F32
#define qZ1 Q9.F32
#define qZ2 Q10.F32
#define qZ3 Q11.F32
#define dXr0 D0
#define dXi0 D1
#define dXr1 D2
#define dXi1 D3
#define dXr2 D4
#define dXi2 D5
#define dXr3 D6
#define dXi3 D7
#define dYr0 D8
#define dYi0 D9
#define dYr1 D10
#define dYi1 D11
#define dYr2 D12
#define dYi2 D13
#define dYr3 D14
#define dYi3 D15
#define qX0 Q0
#define qX1 Q1
#define qX2 Q2
#define qX3 Q3
#define qY0 Q4
#define qY1 Q5
#define qY2 Q6
#define qY3 Q7
#define dZr0 D16
#define dZi0 D17
#define dZr1 D18
#define dZi1 D19
#define dZr2 D20
#define dZi2 D21
#define dZr3 D22
#define dZi3 D23
#define qZ0 Q8
#define qZ1 Q9
#define qZ2 Q10
#define qZ3 Q11
.MACRO FFTSTAGE scaled, inverse, name
@ -118,31 +118,31 @@
@// Update pSubFFTSize and pSubFFTNum regs
VLD2 {dXr0,dXi0},[pSrc, :128],pointStep @// data[0]
VLD2.F32 {dXr0,dXi0},[pSrc, :128],pointStep @// data[0]
@// subFFTSize = 1 for the first stage
MOV subFFTSize,#4
@// Note: setCount = subFFTNum/4 (reuse the grpSize reg for setCount)
LSR grpSize,subFFTNum,#2
VLD2 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
VLD2.F32 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
MOV subFFTNum,grpSize
@// Calculate the step of input data for the next set
@//MOV setStep,pointStep,LSL #1
MOV setStep,grpSize,LSL #4
VLD2 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
VLD2.F32 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
@// setStep = 3*pointStep
ADD setStep,setStep,pointStep
@// setStep = - 3*pointStep+16
RSB setStep,setStep,#16
@// data[3] & update pSrc for the next set
VLD2 {dXr3,dXi3},[pSrc, :128],setStep
VLD2.F32 {dXr3,dXi3},[pSrc, :128],setStep
@// step1 = 2*pointStep
MOV step1,pointStep,LSL #1
VADD qY0,qX0,qX2
VADD.F32 qY0,qX0,qX2
@// step3 = -pointStep
RSB step3,pointStep,#0
@ -161,68 +161,68 @@ radix4fsGrpZeroSetLoop\name :
@// finish first stage of 4 point FFT
VSUB qY2,qX0,qX2
VSUB.F32 qY2,qX0,qX2
VLD2 {dXr0,dXi0},[pSrc, :128],step1 @// data[0]
VADD qY1,qX1,qX3
VLD2 {dXr2,dXi2},[pSrc, :128],step3 @// data[2]
VSUB qY3,qX1,qX3
VLD2.F32 {dXr0,dXi0},[pSrc, :128],step1 @// data[0]
VADD.F32 qY1,qX1,qX3
VLD2.F32 {dXr2,dXi2},[pSrc, :128],step3 @// data[2]
VSUB.F32 qY3,qX1,qX3
@// finish second stage of 4 point FFT
.ifeqs "\inverse", "TRUE"
VLD2 {dXr1,dXi1},[pSrc, :128],step1 @// data[1]
VADD qZ0,qY0,qY1
VLD2.F32 {dXr1,dXi1},[pSrc, :128],step1 @// data[1]
VADD.F32 qZ0,qY0,qY1
@// data[3] & update pSrc for the next set, but not if it's the
@// last iteration so that we don't read past the end of the
@// input array.
BEQ radix4SkipLastUpdateInv\name
VLD2 {dXr3,dXi3},[pSrc, :128],setStep
VLD2.F32 {dXr3,dXi3},[pSrc, :128],setStep
radix4SkipLastUpdateInv\name:
VSUB dZr3,dYr2,dYi3
VSUB.F32 dZr3,dYr2,dYi3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VADD dZi3,dYi2,dYr3
VST2.F32 {dZr0,dZi0},[pDst, :128],outPointStep
VADD.F32 dZi3,dYi2,dYr3
VSUB qZ1,qY0,qY1
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
VSUB.F32 qZ1,qY0,qY1
VST2.F32 {dZr3,dZi3},[pDst, :128],outPointStep
VADD dZr2,dYr2,dYi3
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
VSUB dZi2,dYi2,dYr3
VADD.F32 dZr2,dYr2,dYi3
VST2.F32 {dZr1,dZi1},[pDst, :128],outPointStep
VSUB.F32 dZi2,dYi2,dYr3
VADD qY0,qX0,qX2 @// u0 for next iteration
VST2 {dZr2,dZi2},[pDst, :128],setStep
VADD.F32 qY0,qX0,qX2 @// u0 for next iteration
VST2.F32 {dZr2,dZi2},[pDst, :128],setStep
.else
VLD2 {dXr1,dXi1},[pSrc, :128],step1 @// data[1]
VADD qZ0,qY0,qY1
VLD2.F32 {dXr1,dXi1},[pSrc, :128],step1 @// data[1]
VADD.F32 qZ0,qY0,qY1
@// data[3] & update pSrc for the next set, but not if it's the
@// last iteration so that we don't read past the end of the
@// input array.
BEQ radix4SkipLastUpdateFwd\name
VLD2 {dXr3,dXi3},[pSrc, :128],setStep
VLD2.F32 {dXr3,dXi3},[pSrc, :128],setStep
radix4SkipLastUpdateFwd\name:
VADD dZr2,dYr2,dYi3
VADD.F32 dZr2,dYr2,dYi3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VSUB dZi2,dYi2,dYr3
VST2.F32 {dZr0,dZi0},[pDst, :128],outPointStep
VSUB.F32 dZi2,dYi2,dYr3
VSUB qZ1,qY0,qY1
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
VSUB.F32 qZ1,qY0,qY1
VST2.F32 {dZr2,dZi2},[pDst, :128],outPointStep
VSUB dZr3,dYr2,dYi3
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
VADD dZi3,dYi2,dYr3
VSUB.F32 dZr3,dYr2,dYi3
VST2.F32 {dZr1,dZi1},[pDst, :128],outPointStep
VADD.F32 dZi3,dYi2,dYr3
VADD qY0,qX0,qX2 @// u0 for next iteration
VST2 {dZr3,dZi3},[pDst, :128],setStep
VADD.F32 qY0,qX0,qX2 @// u0 for next iteration
VST2.F32 {dZr3,dZi3},[pDst, :128],setStep
.endif

View file

@ -68,63 +68,63 @@
@// Neon Registers
#define dButterfly1Real02 D0.F32
#define dButterfly1Imag02 D1.F32
#define dButterfly1Real13 D2.F32
#define dButterfly1Imag13 D3.F32
#define dButterfly2Real02 D4.F32
#define dButterfly2Imag02 D5.F32
#define dButterfly2Real13 D6.F32
#define dButterfly2Imag13 D7.F32
#define dXr0 D0.F32
#define dXi0 D1.F32
#define dXr1 D2.F32
#define dXi1 D3.F32
#define dXr2 D4.F32
#define dXi2 D5.F32
#define dXr3 D6.F32
#define dXi3 D7.F32
#define dButterfly1Real02 D0
#define dButterfly1Imag02 D1
#define dButterfly1Real13 D2
#define dButterfly1Imag13 D3
#define dButterfly2Real02 D4
#define dButterfly2Imag02 D5
#define dButterfly2Real13 D6
#define dButterfly2Imag13 D7
#define dXr0 D0
#define dXi0 D1
#define dXr1 D2
#define dXi1 D3
#define dXr2 D4
#define dXi2 D5
#define dXr3 D6
#define dXi3 D7
#define dYr0 D16.F32
#define dYi0 D17.F32
#define dYr1 D18.F32
#define dYi1 D19.F32
#define dYr2 D20.F32
#define dYi2 D21.F32
#define dYr3 D22.F32
#define dYi3 D23.F32
#define dYr0 D16
#define dYi0 D17
#define dYr1 D18
#define dYi1 D19
#define dYr2 D20
#define dYi2 D21
#define dYr3 D22
#define dYi3 D23
#define dW1r D8.F32
#define dW1i D9.F32
#define dW2r D10.F32
#define dW2i D11.F32
#define dW3r D12.F32
#define dW3i D13.F32
#define qT0 d14.f32
#define qT1 d16.F32
#define qT2 d18.F32
#define qT3 d20.f32
#define qT4 d22.f32
#define qT5 d24.f32
#define dW1r D8
#define dW1i D9
#define dW2r D10
#define dW2i D11
#define dW3r D12
#define dW3i D13
#define qT0 d14
#define qT1 d16
#define qT2 d18
#define qT3 d20
#define qT4 d22
#define qT5 d24
#define dZr0 D14.F32
#define dZi0 D15.F32
#define dZr1 D26.F32
#define dZi1 D27.F32
#define dZr2 D28.F32
#define dZi2 D29.F32
#define dZr3 D30.F32
#define dZi3 D31.F32
#define dZr0 D14
#define dZi0 D15
#define dZr1 D26
#define dZi1 D27
#define dZr2 D28
#define dZi2 D29
#define dZr3 D30
#define dZi3 D31
#define qX0 Q0.F32
#define qY0 Q8.F32
#define qY1 Q9.F32
#define qY2 Q10.F32
#define qY3 Q11.F32
#define qZ0 Q7.F32
#define qZ1 Q13.F32
#define qZ2 Q14.F32
#define qZ3 Q15.F32
#define qX0 Q0
#define qY0 Q8
#define qY1 Q9
#define qY2 Q10
#define qY3 Q11
#define qZ0 Q7
#define qZ1 Q13
#define qZ2 Q14
#define qZ3 Q15
@ -139,172 +139,172 @@
@// Update grpCount and grpSize rightaway
VLD2 {dW1r,dW1i},[pTwiddle, :128] @// [wi|wr]
VLD2.F32 {dW1r,dW1i},[pTwiddle, :128] @// [wi|wr]
MOV step16,#16
LSL grpCount,subFFTSize,#2
VLD1 dW2r,[pTwiddle, :64] @// [wi|wr]
VLD1.F32 dW2r,[pTwiddle, :64] @// [wi|wr]
MOV subFFTNum,#1 @//after the last stage
VLD1 dW3r,[pTwiddle, :64],step16 @// [wi|wr]
VLD1.F32 dW3r,[pTwiddle, :64],step16 @// [wi|wr]
MOV stepTwiddle,#0
VLD1 dW2i,[pTwiddle, :64]! @// [wi|wr]
VLD1.F32 dW2i,[pTwiddle, :64]! @// [wi|wr]
SUB grpTwStep,stepTwiddle,#8 @// grpTwStep = -8 to start with
@// update subFFTSize for the next stage
MOV subFFTSize,grpCount
VLD1 dW3i,[pTwiddle, :64],grpTwStep @// [wi|wr]
VLD1.F32 dW3i,[pTwiddle, :64],grpTwStep @// [wi|wr]
MOV dstStep,outPointStep,LSL #1
@// AC.r AC.i BD.r BD.i
VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc, :256]!
VLD4.F32 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc, :256]!
ADD dstStep,dstStep,outPointStep @// dstStep = 3*outPointStep
RSB dstStep,dstStep,#16 @// dstStep = - 3*outPointStep+16
MOV step24,#24
@// AC.r AC.i BD.r BD.i
VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc, :256]!
VLD4.F32 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc, :256]!
@// Process two groups at a time
radix4lsGrpLoop\name :
VZIP dW2r,dW2i
VZIP.F32 dW2r,dW2i
ADD stepTwiddle,stepTwiddle,#16
VZIP dW3r,dW3i
VZIP.F32 dW3r,dW3i
ADD grpTwStep,stepTwiddle,#4
VUZP dButterfly1Real13, dButterfly2Real13 @// B.r D.r
VUZP.F32 dButterfly1Real13, dButterfly2Real13 @// B.r D.r
SUB twStep,stepTwiddle,#16 @// -16+stepTwiddle
VUZP dButterfly1Imag13, dButterfly2Imag13 @// B.i D.i
VUZP.F32 dButterfly1Imag13, dButterfly2Imag13 @// B.i D.i
MOV grpTwStep,grpTwStep,LSL #1
VUZP dButterfly1Real02, dButterfly2Real02 @// A.r C.r
VUZP.F32 dButterfly1Real02, dButterfly2Real02 @// A.r C.r
RSB grpTwStep,grpTwStep,#0 @// -8-2*stepTwiddle
VUZP dButterfly1Imag02, dButterfly2Imag02 @// A.i C.i
VUZP.F32 dButterfly1Imag02, dButterfly2Imag02 @// A.i C.i
@// grpCount is multiplied by 4
SUBS grpCount,grpCount,#8
.ifeqs "\inverse", "TRUE"
VMUL dZr1,dW1r,dXr1
VMLA dZr1,dW1i,dXi1 @// real part
VMUL dZi1,dW1r,dXi1
VMLS dZi1,dW1i,dXr1 @// imag part
VMUL.F32 dZr1,dW1r,dXr1
VMLA.F32 dZr1,dW1i,dXi1 @// real part
VMUL.F32 dZi1,dW1r,dXi1
VMLS.F32 dZi1,dW1i,dXr1 @// imag part
.else
VMUL dZr1,dW1r,dXr1
VMLS dZr1,dW1i,dXi1 @// real part
VMUL dZi1,dW1r,dXi1
VMLA dZi1,dW1i,dXr1 @// imag part
VMUL.F32 dZr1,dW1r,dXr1
VMLS.F32 dZr1,dW1i,dXi1 @// real part
VMUL.F32 dZi1,dW1r,dXi1
VMLA.F32 dZi1,dW1i,dXr1 @// imag part
.endif
VLD2 {dW1r,dW1i},[pTwiddle, :128],stepTwiddle @// [wi|wr]
VLD2.F32 {dW1r,dW1i},[pTwiddle, :128],stepTwiddle @// [wi|wr]
.ifeqs "\inverse", "TRUE"
VMUL dZr2,dW2r,dXr2
VMLA dZr2,dW2i,dXi2 @// real part
VMUL dZi2,dW2r,dXi2
VLD1 dW2r,[pTwiddle, :64],step16 @// [wi|wr]
VMLS dZi2,dW2i,dXr2 @// imag part
VMUL.F32 dZr2,dW2r,dXr2
VMLA.F32 dZr2,dW2i,dXi2 @// real part
VMUL.F32 dZi2,dW2r,dXi2
VLD1.F32 dW2r,[pTwiddle, :64],step16 @// [wi|wr]
VMLS.F32 dZi2,dW2i,dXr2 @// imag part
.else
VMUL dZr2,dW2r,dXr2
VMLS dZr2,dW2i,dXi2 @// real part
VMUL dZi2,dW2r,dXi2
VLD1 dW2r,[pTwiddle, :64],step16 @// [wi|wr]
VMLA dZi2,dW2i,dXr2 @// imag part
VMUL.F32 dZr2,dW2r,dXr2
VMLS.F32 dZr2,dW2i,dXi2 @// real part
VMUL.F32 dZi2,dW2r,dXi2
VLD1.F32 dW2r,[pTwiddle, :64],step16 @// [wi|wr]
VMLA.F32 dZi2,dW2i,dXr2 @// imag part
.endif
VLD1 dW2i,[pTwiddle, :64],twStep @// [wi|wr]
VLD1.F32 dW2i,[pTwiddle, :64],twStep @// [wi|wr]
@// move qX0 so as to load for the next iteration
VMOV qZ0,qX0
.ifeqs "\inverse", "TRUE"
VMUL dZr3,dW3r,dXr3
VMLA dZr3,dW3i,dXi3 @// real part
VMUL dZi3,dW3r,dXi3
VLD1 dW3r,[pTwiddle, :64],step24
VMLS dZi3,dW3i,dXr3 @// imag part
VMUL.F32 dZr3,dW3r,dXr3
VMLA.F32 dZr3,dW3i,dXi3 @// real part
VMUL.F32 dZi3,dW3r,dXi3
VLD1.F32 dW3r,[pTwiddle, :64],step24
VMLS.F32 dZi3,dW3i,dXr3 @// imag part
.else
VMUL dZr3,dW3r,dXr3
VMLS dZr3,dW3i,dXi3 @// real part
VMUL dZi3,dW3r,dXi3
VLD1 dW3r,[pTwiddle, :64],step24
VMLA dZi3,dW3i,dXr3 @// imag part
VMUL.F32 dZr3,dW3r,dXr3
VMLS.F32 dZr3,dW3i,dXi3 @// real part
VMUL.F32 dZi3,dW3r,dXi3
VLD1.F32 dW3r,[pTwiddle, :64],step24
VMLA.F32 dZi3,dW3i,dXr3 @// imag part
.endif
VLD1 dW3i,[pTwiddle, :64],grpTwStep @// [wi|wr]
VLD1.F32 dW3i,[pTwiddle, :64],grpTwStep @// [wi|wr]
@// Don't do the load on the last iteration so we don't read past the end
@// of pSrc.
addeq pSrc, pSrc, #64
beq radix4lsSkipRead\name
@// AC.r AC.i BD.r BD.i
VLD4 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc, :256]!
VLD4.F32 {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc, :256]!
@// AC.r AC.i BD.r BD.i
VLD4 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc, :256]!
VLD4.F32 {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc, :256]!
radix4lsSkipRead\name:
@// finish first stage of 4 point FFT
VADD qY0,qZ0,qZ2
VSUB qY2,qZ0,qZ2
VADD qY1,qZ1,qZ3
VSUB qY3,qZ1,qZ3
VADD.F32 qY0,qZ0,qZ2
VSUB.F32 qY2,qZ0,qZ2
VADD.F32 qY1,qZ1,qZ3
VSUB.F32 qY3,qZ1,qZ3
@// finish second stage of 4 point FFT
.ifeqs "\inverse", "TRUE"
VSUB qZ0,qY2,qY1
VSUB.F32 qZ0,qY2,qY1
VADD dZr3,dYr0,dYi3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VSUB dZi3,dYi0,dYr3
VADD.F32 dZr3,dYr0,dYi3
VST2.F32 {dZr0,dZi0},[pDst, :128],outPointStep
VSUB.F32 dZi3,dYi0,dYr3
VADD qZ2,qY2,qY1
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
VADD.F32 qZ2,qY2,qY1
VST2.F32 {dZr3,dZi3},[pDst, :128],outPointStep
VSUB dZr1,dYr0,dYi3
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
VADD dZi1,dYi0,dYr3
VSUB.F32 dZr1,dYr0,dYi3
VST2.F32 {dZr2,dZi2},[pDst, :128],outPointStep
VADD.F32 dZi1,dYi0,dYr3
@// dstStep = -outPointStep + 16
VST2 {dZr1,dZi1},[pDst, :128],dstStep
VST2.F32 {dZr1,dZi1},[pDst, :128],dstStep
.else
VSUB qZ0,qY2,qY1
VSUB.F32 qZ0,qY2,qY1
VSUB dZr1,dYr0,dYi3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VADD dZi1,dYi0,dYr3
VSUB.F32 dZr1,dYr0,dYi3
VST2.F32 {dZr0,dZi0},[pDst, :128],outPointStep
VADD.F32 dZi1,dYi0,dYr3
VADD qZ2,qY2,qY1
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
VADD.F32 qZ2,qY2,qY1
VST2.F32 {dZr1,dZi1},[pDst, :128],outPointStep
VADD dZr3,dYr0,dYi3
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
VSUB dZi3,dYi0,dYr3
VADD.F32 dZr3,dYr0,dYi3
VST2.F32 {dZr2,dZi2},[pDst, :128],outPointStep
VSUB.F32 dZi3,dYi0,dYr3
@// dstStep = -outPointStep + 16
VST2 {dZr3,dZi3},[pDst, :128],dstStep
VST2.F32 {dZr3,dZi3},[pDst, :128],dstStep
.endif

View file

@ -76,48 +76,48 @@
@// Neon Registers
#define dW1 D0.F32
#define dW2 D1.F32
#define dW3 D2.F32
#define dW1 D0
#define dW2 D1
#define dW3 D2
#define dXr0 D4.F32
#define dXi0 D5.F32
#define dXr1 D6.F32
#define dXi1 D7.F32
#define dXr2 D8.F32
#define dXi2 D9.F32
#define dXr3 D10.F32
#define dXi3 D11.F32
#define dYr0 D12.F32
#define dYi0 D13.F32
#define dYr1 D14.F32
#define dYi1 D15.F32
#define dYr2 D16.F32
#define dYi2 D17.F32
#define dYr3 D18.F32
#define dYi3 D19.F32
#define qT0 d16.f32
#define qT1 d18.f32
#define qT2 d12.f32
#define qT3 d14.f32
#define dZr0 D20.F32
#define dZi0 D21.F32
#define dZr1 D22.F32
#define dZi1 D23.F32
#define dZr2 D24.F32
#define dZi2 D25.F32
#define dZr3 D26.F32
#define dZi3 D27.F32
#define dXr0 D4
#define dXi0 D5
#define dXr1 D6
#define dXi1 D7
#define dXr2 D8
#define dXi2 D9
#define dXr3 D10
#define dXi3 D11
#define dYr0 D12
#define dYi0 D13
#define dYr1 D14
#define dYi1 D15
#define dYr2 D16
#define dYi2 D17
#define dYr3 D18
#define dYi3 D19
#define qT0 d16
#define qT1 d18
#define qT2 d12
#define qT3 d14
#define dZr0 D20
#define dZi0 D21
#define dZr1 D22
#define dZi1 D23
#define dZr2 D24
#define dZi2 D25
#define dZr3 D26
#define dZi3 D27
#define qY0 Q6.F32
#define qY1 Q7.F32
#define qY2 Q8.F32
#define qY3 Q9.F32
#define qX0 Q2.F32
#define qZ0 Q10.F32
#define qZ1 Q11.F32
#define qZ2 Q12.F32
#define qZ3 Q13.F32
#define qY0 Q6
#define qY1 Q7
#define qY2 Q8
#define qY3 Q9
#define qX0 Q2
#define qZ0 Q10
#define qZ1 Q11
#define qZ2 Q12
#define qZ3 Q13
.MACRO FFTSTAGE scaled, inverse , name
@ -131,7 +131,7 @@
LSR subFFTNum,subFFTNum,#2
MOV subFFTSize,grpCount
VLD1 dW1,[pTwiddle] @//[wi | wr]
VLD1.F32 dW1,[pTwiddle] @//[wi | wr]
@// pT0+1 increments pT0 by 8 bytes
@// pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes
MOV pointStep,subFFTNum,LSL #1
@ -142,11 +142,11 @@
@// = 2*size bytes
MOV stepTwiddle,#0
VLD1 dW2,[pTwiddle] @//[wi | wr]
VLD1.F32 dW2,[pTwiddle] @//[wi | wr]
SMULBB outPointStep,grpCount,pointStep
LSL pointStep,pointStep,#2 @// 2*grpSize
VLD1 dW3,[pTwiddle] @//[wi | wr]
VLD1.F32 dW3,[pTwiddle] @//[wi | wr]
MOV srcStep,pointStep,LSL #1 @// srcStep = 2*pointStep
ADD setStep,srcStep,pointStep @// setStep = 3*pointStep
@ -162,16 +162,16 @@
radix4GrpLoop\name :
VLD2 {dXr0,dXi0},[pSrc],pointStep @// data[0]
VLD2.F32 {dXr0,dXi0},[pSrc],pointStep @// data[0]
ADD stepTwiddle,stepTwiddle,pointStep
VLD2 {dXr1,dXi1},[pSrc],pointStep @// data[1]
VLD2.F32 {dXr1,dXi1},[pSrc],pointStep @// data[1]
@// set pTwiddle to the first point
ADD pTwiddle,pTwiddle,stepTwiddle
VLD2 {dXr2,dXi2},[pSrc],pointStep @// data[2]
VLD2.F32 {dXr2,dXi2},[pSrc],pointStep @// data[2]
MOV twStep,stepTwiddle,LSL #2
@// data[3] & update pSrc for the next set
VLD2 {dXr3,dXi3},[pSrc],setStep
VLD2.F32 {dXr3,dXi3},[pSrc],setStep
SUB twStep,stepTwiddle,twStep @// twStep = -3*stepTwiddle
MOV setCount,pointStep,LSR #3
@ -188,49 +188,49 @@ radix4SetLoop\name :
.ifeqs "\inverse", "TRUE"
VMUL dZr1,dXr1,dW1[0]
VMUL dZi1,dXi1,dW1[0]
VMUL dZr2,dXr2,dW2[0]
VMUL dZi2,dXi2,dW2[0]
VMUL dZr3,dXr3,dW3[0]
VMUL dZi3,dXi3,dW3[0]
VMUL.F32 dZr1,dXr1,dW1[0]
VMUL.F32 dZi1,dXi1,dW1[0]
VMUL.F32 dZr2,dXr2,dW2[0]
VMUL.F32 dZi2,dXi2,dW2[0]
VMUL.F32 dZr3,dXr3,dW3[0]
VMUL.F32 dZi3,dXi3,dW3[0]
VMLA dZr1,dXi1,dW1[1] @// real part
VMLS dZi1,dXr1,dW1[1] @// imag part
VMLA.F32 dZr1,dXi1,dW1[1] @// real part
VMLS.F32 dZi1,dXr1,dW1[1] @// imag part
@// data[1] for next iteration
VLD2 {dXr1,dXi1},[pSrc],pointStep
VLD2.F32 {dXr1,dXi1},[pSrc],pointStep
VMLA dZr2,dXi2,dW2[1] @// real part
VMLS dZi2,dXr2,dW2[1] @// imag part
VMLA.F32 dZr2,dXi2,dW2[1] @// real part
VMLS.F32 dZi2,dXr2,dW2[1] @// imag part
@// data[2] for next iteration
VLD2 {dXr2,dXi2},[pSrc],pointStep
VLD2.F32 {dXr2,dXi2},[pSrc],pointStep
VMLA dZr3,dXi3,dW3[1] @// real part
VMLS dZi3,dXr3,dW3[1] @// imag part
VMLA.F32 dZr3,dXi3,dW3[1] @// real part
VMLS.F32 dZi3,dXr3,dW3[1] @// imag part
.else
VMUL dZr1,dXr1,dW1[0]
VMUL dZi1,dXi1,dW1[0]
VMUL dZr2,dXr2,dW2[0]
VMUL dZi2,dXi2,dW2[0]
VMUL dZr3,dXr3,dW3[0]
VMUL dZi3,dXi3,dW3[0]
VMUL.F32 dZr1,dXr1,dW1[0]
VMUL.F32 dZi1,dXi1,dW1[0]
VMUL.F32 dZr2,dXr2,dW2[0]
VMUL.F32 dZi2,dXi2,dW2[0]
VMUL.F32 dZr3,dXr3,dW3[0]
VMUL.F32 dZi3,dXi3,dW3[0]
VMLS dZr1,dXi1,dW1[1] @// real part
VMLA dZi1,dXr1,dW1[1] @// imag part
VMLS.F32 dZr1,dXi1,dW1[1] @// real part
VMLA.F32 dZi1,dXr1,dW1[1] @// imag part
@// data[1] for next iteration
VLD2 {dXr1,dXi1},[pSrc],pointStep
VLD2.F32 {dXr1,dXi1},[pSrc],pointStep
VMLS dZr2,dXi2,dW2[1] @// real part
VMLA dZi2,dXr2,dW2[1] @// imag part
VMLS.F32 dZr2,dXi2,dW2[1] @// real part
VMLA.F32 dZi2,dXr2,dW2[1] @// imag part
@// data[2] for next iteration
VLD2 {dXr2,dXi2},[pSrc],pointStep
VLD2.F32 {dXr2,dXi2},[pSrc],pointStep
VMLS dZr3,dXi3,dW3[1] @// real part
VMLA dZi3,dXr3,dW3[1] @// imag part
VMLS.F32 dZr3,dXi3,dW3[1] @// real part
VMLA.F32 dZi3,dXr3,dW3[1] @// imag part
.endif
@// data[3] & update pSrc to data[0]
@ -241,54 +241,54 @@ radix4SetLoop\name :
@// These are executed only if both grpCount = 4 and setCount = 2
addeq pSrc, pSrc, setStep
beq radix4SkipRead\name
VLD2 {dXr3,dXi3},[pSrc],setStep
VLD2.F32 {dXr3,dXi3},[pSrc],setStep
radix4SkipRead\name:
SUBS setCount,setCount,#2
@// finish first stage of 4 point FFT
VADD qY0,qX0,qZ2
VSUB qY2,qX0,qZ2
VADD.F32 qY0,qX0,qZ2
VSUB.F32 qY2,qX0,qZ2
@// data[0] for next iteration
VLD2 {dXr0,dXi0},[pSrc, :128]!
VADD qY1,qZ1,qZ3
VSUB qY3,qZ1,qZ3
VLD2.F32 {dXr0,dXi0},[pSrc, :128]!
VADD.F32 qY1,qZ1,qZ3
VSUB.F32 qY3,qZ1,qZ3
@// finish second stage of 4 point FFT
VSUB qZ0,qY2,qY1
VSUB.F32 qZ0,qY2,qY1
.ifeqs "\inverse", "TRUE"
VADD dZr3,dYr0,dYi3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VSUB dZi3,dYi0,dYr3
VADD.F32 dZr3,dYr0,dYi3
VST2.F32 {dZr0,dZi0},[pDst, :128],outPointStep
VSUB.F32 dZi3,dYi0,dYr3
VADD qZ2,qY2,qY1
VST2 {dZr3,dZi3},[pDst, :128],outPointStep
VADD.F32 qZ2,qY2,qY1
VST2.F32 {dZr3,dZi3},[pDst, :128],outPointStep
VSUB dZr1,dYr0,dYi3
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
VADD dZi1,dYi0,dYr3
VSUB.F32 dZr1,dYr0,dYi3
VST2.F32 {dZr2,dZi2},[pDst, :128],outPointStep
VADD.F32 dZi1,dYi0,dYr3
VST2 {dZr1,dZi1},[pDst, :128],dstStep
VST2.F32 {dZr1,dZi1},[pDst, :128],dstStep
.else
VSUB dZr1,dYr0,dYi3
VST2 {dZr0,dZi0},[pDst, :128],outPointStep
VADD dZi1,dYi0,dYr3
VSUB.F32 dZr1,dYr0,dYi3
VST2.F32 {dZr0,dZi0},[pDst, :128],outPointStep
VADD.F32 dZi1,dYi0,dYr3
VADD qZ2,qY2,qY1
VST2 {dZr1,dZi1},[pDst, :128],outPointStep
VADD.F32 qZ2,qY2,qY1
VST2.F32 {dZr1,dZi1},[pDst, :128],outPointStep
VADD dZr3,dYr0,dYi3
VST2 {dZr2,dZi2},[pDst, :128],outPointStep
VSUB dZi3,dYi0,dYr3
VADD.F32 dZr3,dYr0,dYi3
VST2.F32 {dZr2,dZi2},[pDst, :128],outPointStep
VSUB.F32 dZi3,dYi0,dYr3
VST2 {dZr3,dZi3},[pDst, :128],dstStep
VST2.F32 {dZr3,dZi3},[pDst, :128],dstStep
.endif
@ -298,13 +298,13 @@ radix4SkipRead\name:
BGT radix4SetLoop\name
VLD1 dW1,[pTwiddle, :64],stepTwiddle @//[wi | wr]
VLD1.F32 dW1,[pTwiddle, :64],stepTwiddle @//[wi | wr]
@// subtract 4 since grpCount multiplied by 4
SUBS grpCount,grpCount,#4
VLD1 dW2,[pTwiddle, :64],stepTwiddle @//[wi | wr]
VLD1.F32 dW2,[pTwiddle, :64],stepTwiddle @//[wi | wr]
@// increment pSrc for the next grp
ADD pSrc,pSrc,srcStep
VLD1 dW3,[pTwiddle, :64],twStep @//[wi | wr]
VLD1.F32 dW3,[pTwiddle, :64],twStep @//[wi | wr]
BGT radix4GrpLoop\name

View file

@ -68,110 +68,110 @@
@// Neon Registers
#define dXr0 D0.F32
#define dXi0 D1.F32
#define dXr1 D2.F32
#define dXi1 D3.F32
#define dXr2 D4.F32
#define dXi2 D5.F32
#define dXr3 D6.F32
#define dXi3 D7.F32
#define dXr4 D8.F32
#define dXi4 D9.F32
#define dXr5 D10.F32
#define dXi5 D11.F32
#define dXr6 D12.F32
#define dXi6 D13.F32
#define dXr7 D14.F32
#define dXi7 D15.F32
#define qX0 Q0.F32
#define qX1 Q1.F32
#define qX2 Q2.F32
#define qX3 Q3.F32
#define qX4 Q4.F32
#define qX5 Q5.F32
#define qX6 Q6.F32
#define qX7 Q7.F32
#define dXr0 D0
#define dXi0 D1
#define dXr1 D2
#define dXi1 D3
#define dXr2 D4
#define dXi2 D5
#define dXr3 D6
#define dXi3 D7
#define dXr4 D8
#define dXi4 D9
#define dXr5 D10
#define dXi5 D11
#define dXr6 D12
#define dXi6 D13
#define dXr7 D14
#define dXi7 D15
#define qX0 Q0
#define qX1 Q1
#define qX2 Q2
#define qX3 Q3
#define qX4 Q4
#define qX5 Q5
#define qX6 Q6
#define qX7 Q7
#define dUr0 D16.F32
#define dUi0 D17.F32
#define dUr2 D18.F32
#define dUi2 D19.F32
#define dUr4 D20.F32
#define dUi4 D21.F32
#define dUr6 D22.F32
#define dUi6 D23.F32
#define dUr1 D24.F32
#define dUi1 D25.F32
#define dUr3 D26.F32
#define dUi3 D27.F32
#define dUr5 D28.F32
#define dUi5 D29.F32
#define dUr0 D16
#define dUi0 D17
#define dUr2 D18
#define dUi2 D19
#define dUr4 D20
#define dUi4 D21
#define dUr6 D22
#define dUi6 D23
#define dUr1 D24
#define dUi1 D25
#define dUr3 D26
#define dUi3 D27
#define dUr5 D28
#define dUi5 D29
@// reuse dXr7 and dXi7
#define dUr7 D30.F32
#define dUi7 D31.F32
#define qU0 Q8.F32
#define qU1 Q12.F32
#define qU2 Q9.F32
#define qU3 Q13.F32
#define qU4 Q10.F32
#define qU5 Q14.F32
#define qU6 Q11.F32
#define qU7 Q15.F32
#define dUr7 D30
#define dUi7 D31
#define qU0 Q8
#define qU1 Q12
#define qU2 Q9
#define qU3 Q13
#define qU4 Q10
#define qU5 Q14
#define qU6 Q11
#define qU7 Q15
#define dVr0 D24.F32
#define dVi0 D25.F32
#define dVr2 D26.F32
#define dVi2 D27.F32
#define dVr4 D28.F32
#define dVi4 D29.F32
#define dVr6 D30.F32
#define dVi6 D31.F32
#define dVr1 D16.F32
#define dVi1 D17.F32
#define dVr3 D18.F32
#define dVi3 D19.F32
#define dVr5 D20.F32
#define dVi5 D21.F32
#define dVr7 D22.F32
#define dVi7 D23.F32
#define qV0 Q12.F32
#define qV1 Q8.F32
#define qV2 Q13.F32
#define qV3 Q9.F32
#define qV4 Q14.F32
#define qV5 Q10.F32
#define qV6 Q15.F32
#define qV7 Q11.F32
#define dVr0 D24
#define dVi0 D25
#define dVr2 D26
#define dVi2 D27
#define dVr4 D28
#define dVi4 D29
#define dVr6 D30
#define dVi6 D31
#define dVr1 D16
#define dVi1 D17
#define dVr3 D18
#define dVi3 D19
#define dVr5 D20
#define dVi5 D21
#define dVr7 D22
#define dVi7 D23
#define qV0 Q12
#define qV1 Q8
#define qV2 Q13
#define qV3 Q9
#define qV4 Q14
#define qV5 Q10
#define qV6 Q15
#define qV7 Q11
#define dYr0 D16.F32
#define dYi0 D17.F32
#define dYr2 D18.F32
#define dYi2 D19.F32
#define dYr4 D20.F32
#define dYi4 D21.F32
#define dYr6 D22.F32
#define dYi6 D23.F32
#define dYr1 D24.F32
#define dYi1 D25.F32
#define dYr3 D26.F32
#define dYi3 D27.F32
#define dYr5 D28.F32
#define dYi5 D29.F32
#define dYr7 D30.F32
#define dYi7 D31.F32
#define qY0 Q8.F32
#define qY1 Q12.F32
#define qY2 Q9.F32
#define qY3 Q13.F32
#define qY4 Q10.F32
#define qY5 Q14.F32
#define qY6 Q11.F32
#define qY7 Q15.F32
#define dYr0 D16
#define dYi0 D17
#define dYr2 D18
#define dYi2 D19
#define dYr4 D20
#define dYi4 D21
#define dYr6 D22
#define dYi6 D23
#define dYr1 D24
#define dYi1 D25
#define dYr3 D26
#define dYi3 D27
#define dYr5 D28
#define dYi5 D29
#define dYr7 D30
#define dYi7 D31
#define qY0 Q8
#define qY1 Q12
#define qY2 Q9
#define qY3 Q13
#define qY4 Q10
#define qY5 Q14
#define qY6 Q11
#define qY7 Q15
#define dT0 D14.F32
#define dT1 D15.F32
#define dT0 D14
#define dT1 D15
.MACRO FFTSTAGE scaled, inverse, name
@ -197,23 +197,23 @@
@// Calculate the step of input data for the next set
@//MOV step1,pointStep,LSL #1 @// step1 = 2*pointStep
VLD2 {dXr0,dXi0},[pSrc, :128],pointStep @// data[0]
VLD2.F32 {dXr0,dXi0},[pSrc, :128],pointStep @// data[0]
MOV step1,grpSize,LSL #4
MOV step2,pointStep,LSL #3
VLD2 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
VLD2.F32 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
SUB step2,step2,pointStep @// step2 = 7*pointStep
@// setStep = - 7*pointStep+16
RSB setStep,step2,#16
VLD2 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
VLD2 {dXr3,dXi3},[pSrc, :128],pointStep @// data[3]
VLD2 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
VLD2 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
VLD2 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
VLD2.F32 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
VLD2.F32 {dXr3,dXi3},[pSrc, :128],pointStep @// data[3]
VLD2.F32 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
VLD2.F32 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
VLD2.F32 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
@// data[7] & update pSrc for the next set
@// setStep = -7*pointStep + 16
VLD2 {dXr7,dXi7},[pSrc, :128],setStep
VLD2.F32 {dXr7,dXi7},[pSrc, :128],setStep
@// grp = 0 a special case since all the twiddle factors are 1
@// Loop on the sets
@ -225,168 +225,168 @@ radix8fsGrpZeroSetLoop\name :
@// finish first stage of 8 point FFT
VADD qU0,qX0,qX4
VADD qU2,qX1,qX5
VADD qU4,qX2,qX6
VADD qU6,qX3,qX7
VADD.F32 qU0,qX0,qX4
VADD.F32 qU2,qX1,qX5
VADD.F32 qU4,qX2,qX6
VADD.F32 qU6,qX3,qX7
@// finish second stage of 8 point FFT
VADD qV0,qU0,qU4
VSUB qV2,qU0,qU4
VADD qV4,qU2,qU6
VSUB qV6,qU2,qU6
VADD.F32 qV0,qU0,qU4
VSUB.F32 qV2,qU0,qU4
VADD.F32 qV4,qU2,qU6
VSUB.F32 qV6,qU2,qU6
@// finish third stage of 8 point FFT
VADD qY0,qV0,qV4
VSUB qY4,qV0,qV4
VST2 {dYr0,dYi0},[pDst, :128],step1 @// store y0
VADD.F32 qY0,qV0,qV4
VSUB.F32 qY4,qV0,qV4
VST2.F32 {dYr0,dYi0},[pDst, :128],step1 @// store y0
.ifeqs "\inverse", "TRUE"
VSUB dYr2,dVr2,dVi6
VADD dYi2,dVi2,dVr6
VSUB.F32 dYr2,dVr2,dVi6
VADD.F32 dYi2,dVi2,dVr6
VADD dYr6,dVr2,dVi6
VST2 {dYr2,dYi2},[pDst, :128],step1 @// store y2
VSUB dYi6,dVi2,dVr6
VADD.F32 dYr6,dVr2,dVi6
VST2.F32 {dYr2,dYi2},[pDst, :128],step1 @// store y2
VSUB.F32 dYi6,dVi2,dVr6
VSUB qU1,qX0,qX4
VST2 {dYr4,dYi4},[pDst, :128],step1 @// store y4
VSUB.F32 qU1,qX0,qX4
VST2.F32 {dYr4,dYi4},[pDst, :128],step1 @// store y4
VSUB qU3,qX1,qX5
VSUB qU5,qX2,qX6
VST2 {dYr6,dYi6},[pDst, :128],step1 @// store y6
VSUB.F32 qU3,qX1,qX5
VSUB.F32 qU5,qX2,qX6
VST2.F32 {dYr6,dYi6},[pDst, :128],step1 @// store y6
.ELSE
VADD dYr6,dVr2,dVi6
VSUB dYi6,dVi2,dVr6
VADD.F32 dYr6,dVr2,dVi6
VSUB.F32 dYi6,dVi2,dVr6
VSUB dYr2,dVr2,dVi6
VST2 {dYr6,dYi6},[pDst, :128],step1 @// store y2
VADD dYi2,dVi2,dVr6
VSUB.F32 dYr2,dVr2,dVi6
VST2.F32 {dYr6,dYi6},[pDst, :128],step1 @// store y2
VADD.F32 dYi2,dVi2,dVr6
VSUB qU1,qX0,qX4
VST2 {dYr4,dYi4},[pDst, :128],step1 @// store y4
VSUB qU3,qX1,qX5
VSUB qU5,qX2,qX6
VST2 {dYr2,dYi2},[pDst, :128],step1 @// store y6
VSUB.F32 qU1,qX0,qX4
VST2.F32 {dYr4,dYi4},[pDst, :128],step1 @// store y4
VSUB.F32 qU3,qX1,qX5
VSUB.F32 qU5,qX2,qX6
VST2.F32 {dYr2,dYi2},[pDst, :128],step1 @// store y6
.ENDIF
@// finish first stage of 8 point FFT
VSUB qU7,qX3,qX7
VLD1 dT0[0], [t0]
VSUB.F32 qU7,qX3,qX7
VLD1.F32 dT0[0], [t0]
@// finish second stage of 8 point FFT
VSUB dVr1,dUr1,dUi5
VSUB.F32 dVr1,dUr1,dUi5
@// data[0] for next iteration
VLD2 {dXr0,dXi0},[pSrc, :128],pointStep
VADD dVi1,dUi1,dUr5
VADD dVr3,dUr1,dUi5
VLD2 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
VSUB dVi3,dUi1,dUr5
VLD2.F32 {dXr0,dXi0},[pSrc, :128],pointStep
VADD.F32 dVi1,dUi1,dUr5
VADD.F32 dVr3,dUr1,dUi5
VLD2.F32 {dXr1,dXi1},[pSrc, :128],pointStep @// data[1]
VSUB.F32 dVi3,dUi1,dUr5
VSUB dVr5,dUr3,dUi7
VLD2 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
VADD dVi5,dUi3,dUr7
VADD dVr7,dUr3,dUi7
VLD2 {dXr3,dXi3},[pSrc, :128],pointStep @// data[3]
VSUB dVi7,dUi3,dUr7
VSUB.F32 dVr5,dUr3,dUi7
VLD2.F32 {dXr2,dXi2},[pSrc, :128],pointStep @// data[2]
VADD.F32 dVi5,dUi3,dUr7
VADD.F32 dVr7,dUr3,dUi7
VLD2.F32 {dXr3,dXi3},[pSrc, :128],pointStep @// data[3]
VSUB.F32 dVi7,dUi3,dUr7
@// finish third stage of 8 point FFT
.ifeqs "\inverse", "TRUE"
@// calculate a*v5
VMUL dT1,dVr5,dT0[0] @// use dVi0 for dT1
VMUL.F32 dT1,dVr5,dT0[0] @// use dVi0 for dT1
VLD2 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
VMUL dVi5,dVi5,dT0[0]
VLD2.F32 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
VMUL.F32 dVi5,dVi5,dT0[0]
VLD2 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
VSUB dVr5,dT1,dVi5 @// a * V5
VADD dVi5,dT1,dVi5
VLD2.F32 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
VSUB.F32 dVr5,dT1,dVi5 @// a * V5
VADD.F32 dVi5,dT1,dVi5
VLD2 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
VLD2.F32 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
@// calculate b*v7
VMUL dT1,dVr7,dT0[0]
VMUL dVi7,dVi7,dT0[0]
VMUL.F32 dT1,dVr7,dT0[0]
VMUL.F32 dVi7,dVi7,dT0[0]
VADD qY1,qV1,qV5
VSUB qY5,qV1,qV5
VADD.F32 qY1,qV1,qV5
VSUB.F32 qY5,qV1,qV5
VADD dVr7,dT1,dVi7 @// b * V7
VSUB dVi7,dVi7,dT1
VADD.F32 dVr7,dT1,dVi7 @// b * V7
VSUB.F32 dVi7,dVi7,dT1
SUB pDst, pDst, step2 @// set pDst to y1
@// On the last iteration, this will read past the end of pSrc,
@// so skip this read.
BEQ radix8SkipLastUpdateInv\name
VLD2 {dXr7,dXi7},[pSrc, :128],setStep @// data[7]
VLD2.F32 {dXr7,dXi7},[pSrc, :128],setStep @// data[7]
radix8SkipLastUpdateInv\name:
VSUB dYr3,dVr3,dVr7
VSUB dYi3,dVi3,dVi7
VST2 {dYr1,dYi1},[pDst, :128],step1 @// store y1
VADD dYr7,dVr3,dVr7
VADD dYi7,dVi3,dVi7
VSUB.F32 dYr3,dVr3,dVr7
VSUB.F32 dYi3,dVi3,dVi7
VST2.F32 {dYr1,dYi1},[pDst, :128],step1 @// store y1
VADD.F32 dYr7,dVr3,dVr7
VADD.F32 dYi7,dVi3,dVi7
VST2 {dYr3,dYi3},[pDst, :128],step1 @// store y3
VST2 {dYr5,dYi5},[pDst, :128],step1 @// store y5
VST2 {dYr7,dYi7},[pDst, :128] @// store y7
VST2.F32 {dYr3,dYi3},[pDst, :128],step1 @// store y3
VST2.F32 {dYr5,dYi5},[pDst, :128],step1 @// store y5
VST2.F32 {dYr7,dYi7},[pDst, :128] @// store y7
ADD pDst, pDst, #16
.ELSE
@// calculate b*v7
VMUL dT1,dVr7,dT0[0]
VLD2 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
VMUL dVi7,dVi7,dT0[0]
VMUL.F32 dT1,dVr7,dT0[0]
VLD2.F32 {dXr4,dXi4},[pSrc, :128],pointStep @// data[4]
VMUL.F32 dVi7,dVi7,dT0[0]
VLD2 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
VADD dVr7,dT1,dVi7 @// b * V7
VSUB dVi7,dVi7,dT1
VLD2.F32 {dXr5,dXi5},[pSrc, :128],pointStep @// data[5]
VADD.F32 dVr7,dT1,dVi7 @// b * V7
VSUB.F32 dVi7,dVi7,dT1
VLD2 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
VLD2.F32 {dXr6,dXi6},[pSrc, :128],pointStep @// data[6]
@// calculate a*v5
VMUL dT1,dVr5,dT0[0] @// use dVi0 for dT1
VMUL dVi5,dVi5,dT0[0]
VMUL.F32 dT1,dVr5,dT0[0] @// use dVi0 for dT1
VMUL.F32 dVi5,dVi5,dT0[0]
VADD dYr7,dVr3,dVr7
VADD dYi7,dVi3,dVi7
VADD.F32 dYr7,dVr3,dVr7
VADD.F32 dYi7,dVi3,dVi7
SUB pDst, pDst, step2 @// set pDst to y1
VSUB dVr5,dT1,dVi5 @// a * V5
VADD dVi5,dT1,dVi5
VSUB.F32 dVr5,dT1,dVi5 @// a * V5
VADD.F32 dVi5,dT1,dVi5
@// On the last iteration, this will read past the end of pSrc,
@// so skip this read.
BEQ radix8SkipLastUpdateFwd\name
VLD2 {dXr7,dXi7},[pSrc, :128],setStep @// data[7]
VLD2.F32 {dXr7,dXi7},[pSrc, :128],setStep @// data[7]
radix8SkipLastUpdateFwd\name:
VSUB qY5,qV1,qV5
VSUB.F32 qY5,qV1,qV5
VSUB dYr3,dVr3,dVr7
VST2 {dYr7,dYi7},[pDst, :128],step1 @// store y1
VSUB dYi3,dVi3,dVi7
VADD qY1,qV1,qV5
VSUB.F32 dYr3,dVr3,dVr7
VST2.F32 {dYr7,dYi7},[pDst, :128],step1 @// store y1
VSUB.F32 dYi3,dVi3,dVi7
VADD.F32 qY1,qV1,qV5
VST2 {dYr5,dYi5},[pDst, :128],step1 @// store y3
VST2 {dYr3,dYi3},[pDst, :128],step1 @// store y5
VST2 {dYr1,dYi1},[pDst, :128]! @// store y7
VST2.F32 {dYr5,dYi5},[pDst, :128],step1 @// store y3
VST2.F32 {dYr3,dYi3},[pDst, :128],step1 @// store y5
VST2.F32 {dYr1,dYi1},[pDst, :128]! @// store y7
.ENDIF

View file

@ -91,42 +91,42 @@
@// Neon registers
#define dX0 d0.f32
#define dzero d1.f32
#define dZero d2.f32
#define dShift d3.f32
#define dX0r d2.f32
#define dX0i d3.f32
#define dX1r d4.f32
#define dX1i d5.f32
#define dT0 d6.f32
#define dT1 d7.f32
#define dT2 d8.f32
#define dT3 d9.f32
#define qT0 d10.f32
#define qT1 d12.f32
#define dW0r d14.f32
#define dW0i d15.f32
#define dW1r d16.f32
#define dW1i d17.f32
#define dY0r d14.f32
#define dY0i d15.f32
#define dY1r d16.f32
#define dY1i d17.f32
#define dX0 d0
#define dzero d1
#define dZero d2
#define dShift d3
#define dX0r d2
#define dX0i d3
#define dX1r d4
#define dX1i d5
#define dT0 d6
#define dT1 d7
#define dT2 d8
#define dT3 d9
#define qT0 d10
#define qT1 d12
#define dW0r d14
#define dW0i d15
#define dW1r d16
#define dW1i d17
#define dY0r d14
#define dY0i d15
#define dY1r d16
#define dY1i d17
#define dY0rS64 d14.s64
#define dY0iS64 d15.s64
#define qT2 d18.f32
#define qT3 d20.f32
#define qT2 d18
#define qT3 d20
@// lastThreeelements
#define dX1 d3.f32
#define dW0 d4.f32
#define dW1 d5.f32
#define dY0 d10.f32
#define dY1 d11.f32
#define dY2 d12.f32
#define dY3 d13.f32
#define dX1 d3
#define dW0 d4
#define dW1 d5
#define dY0 d10
#define dY1 d11
#define dY2 d12
#define dY3 d13
#define half d0.f32
#define half d0
@// Allocate stack memory required by the function
@ -151,11 +151,11 @@
@// N=1 Treat seperately
CMP N,#1
BGT sizeGreaterThanOne
VLD1 dX0[0],[pSrc]
VLD1.F32 dX0[0],[pSrc]
MOV zero,#0
VMOV dzero[0],zero
VMOV dZero[0],zero
VST3 {dX0[0],dzero[0],dZero[0]},[pDst]
VMOV.F32 dzero[0],zero
VMOV.F32 dZero[0],zero
VST3.F32 {dX0[0],dzero[0],dZero[0]},[pDst]
B End
@ -176,8 +176,8 @@ sizeGreaterThanOne:
CMP order,#1
BGE orderGreaterthan0 @// order > 0
VLD1 dX0,[pSrc]
VST1 dX0,[pOut]
VLD1.F32 dX0,[pSrc]
VST1.F32 dX0,[pOut]
MOV pSrc,pOut
MOV argDst,pDst
BLT FFTEnd
@ -266,25 +266,25 @@ finalComplexToRealFixup:
@// (a-b, 0)
@// F(0) and F(N/2)
VLD2 {dX0r[0],dX0i[0]},[pSrc]!
VLD2.F32 {dX0r[0],dX0i[0]},[pSrc]!
MOV zero,#0
VMOV dX0r[1],zero
VMOV.F32 dX0r[1],zero
MOV step,subFFTSize,LSL #3 @// step = N/2 * 8 bytes
VMOV dX0i[1],zero
VMOV.F32 dX0i[1],zero
@// twStep = 3N/8 * 8 bytes pointing to W^1
SUB twStep,step,subFFTSize,LSL #1
VADD dY0r,dX0r,dX0i @// F(0) = ((Z0.r+Z0.i) , 0)
VADD.F32 dY0r,dX0r,dX0i @// F(0) = ((Z0.r+Z0.i) , 0)
MOV step1,subFFTSize,LSL #2 @// step1 = N/2 * 4 bytes
VSUB dY0i,dX0r,dX0i @// F(N/2) = ((Z0.r-Z0.i) , 0)
VSUB.F32 dY0i,dX0r,dX0i @// F(N/2) = ((Z0.r-Z0.i) , 0)
SUBS subFFTSize,subFFTSize,#2
VST1 dY0r,[argDst],step
VST1.F32 dY0r,[argDst],step
ADD pTwiddleTmp,argTwiddle,#8 @// W^2
VST1 dY0i,[argDst]!
VST1.F32 dY0i,[argDst]!
ADD argTwiddle,argTwiddle,twStep @// W^1
VDUP dzero,zero
VDUP.F32 dzero,zero
SUB argDst,argDst,step
BLT End
@ -299,75 +299,75 @@ finalComplexToRealFixup:
ADR t0, HALF
VLD1 half[0], [t0]
VLD1.F32 half[0], [t0]
evenOddButterflyLoop:
VLD1 dW0r,[argTwiddle],step1
VLD1 dW1r,[argTwiddle]!
VLD1.F32 dW0r,[argTwiddle],step1
VLD1.F32 dW1r,[argTwiddle]!
VLD2 {dX0r,dX0i},[pSrc],step
VLD2.F32 {dX0r,dX0i},[pSrc],step
SUB argTwiddle,argTwiddle,step1
VLD2 {dX1r,dX1i},[pSrc]!
VLD2.F32 {dX1r,dX1i},[pSrc]!
SUB step1,step1,#8 @// (N/4-2)*8 bytes
VLD1 dW0i,[pTwiddleTmp],step1
VLD1 dW1i,[pTwiddleTmp]!
VLD1.F32 dW0i,[pTwiddleTmp],step1
VLD1.F32 dW1i,[pTwiddleTmp]!
SUB pSrc,pSrc,step
SUB pTwiddleTmp,pTwiddleTmp,step1
VREV64 dX1r,dX1r
VREV64 dX1i,dX1i
VREV64.F32 dX1r,dX1r
VREV64.F32 dX1i,dX1i
SUBS subFFTSize,subFFTSize,#4
VSUB dT2,dX0r,dX1r @// a-c
VSUB.F32 dT2,dX0r,dX1r @// a-c
SUB step1,step1,#8
VADD dT0,dX0r,dX1r @// a+c
VSUB dT1,dX0i,dX1i @// b-d
VADD dT3,dX0i,dX1i @// b+d
VMUL dT0,dT0,half[0]
VMUL dT1,dT1,half[0]
VZIP dW1r,dW1i
VZIP dW0r,dW0i
VADD.F32 dT0,dX0r,dX1r @// a+c
VSUB.F32 dT1,dX0i,dX1i @// b-d
VADD.F32 dT3,dX0i,dX1i @// b+d
VMUL.F32 dT0,dT0,half[0]
VMUL.F32 dT1,dT1,half[0]
VZIP.F32 dW1r,dW1i
VZIP.F32 dW0r,dW0i
VMUL qT0,dW1r,dT2
VMUL qT1,dW1r,dT3
VMUL qT2,dW0r,dT2
VMUL qT3,dW0r,dT3
VMUL.F32 qT0,dW1r,dT2
VMUL.F32 qT1,dW1r,dT3
VMUL.F32 qT2,dW0r,dT2
VMUL.F32 qT3,dW0r,dT3
VMLA qT0,dW1i,dT3
VMLS qT1,dW1i,dT2
VMLA.F32 qT0,dW1i,dT3
VMLS.F32 qT1,dW1i,dT2
VMLS qT2,dW0i,dT3
VMLA qT3,dW0i,dT2
VMLS.F32 qT2,dW0i,dT3
VMLA.F32 qT3,dW0i,dT2
VMUL dX1r,qT0,half[0]
VMUL dX1i,qT1,half[0]
VMUL.F32 dX1r,qT0,half[0]
VMUL.F32 dX1i,qT1,half[0]
VSUB dY1r,dT0,dX1i @// F(N/2 -1)
VADD dY1i,dT1,dX1r
VNEG dY1i,dY1i
VSUB.F32 dY1r,dT0,dX1i @// F(N/2 -1)
VADD.F32 dY1i,dT1,dX1r
VNEG.F32 dY1i,dY1i
VREV64 dY1r,dY1r
VREV64 dY1i,dY1i
VREV64.F32 dY1r,dY1r
VREV64.F32 dY1i,dY1i
VMUL dX0r,qT2,half[0]
VMUL dX0i,qT3,half[0]
VMUL.F32 dX0r,qT2,half[0]
VMUL.F32 dX0i,qT3,half[0]
VSUB dY0r,dT0,dX0i @// F(1)
VADD dY0i,dT1,dX0r
VSUB.F32 dY0r,dT0,dX0i @// F(1)
VADD.F32 dY0i,dT1,dX0r
VST2 {dY0r,dY0i},[argDst],step
VST2 {dY1r,dY1i},[argDst]!
VST2.F32 {dY0r,dY0i},[argDst],step
VST2.F32 {dY1r,dY1i},[argDst]!
SUB argDst,argDst,step
SUB step,step,#32 @// (N/2-4)*8 bytes
@ -388,11 +388,11 @@ evenOddButterflyLoop:
@// Since (c,d) = (0,1) for the last element, result is just (a,-b)
lastElement:
VLD1 dX0r,[pSrc]
VLD1.F32 dX0r,[pSrc]
VST1 dX0r[0],[argDst]!
VNEG dX0r,dX0r
VST1 dX0r[1],[argDst]!
VST1.F32 dX0r[0],[argDst]!
VNEG.F32 dX0r,dX0r
VST1.F32 dX0r[1],[argDst]!
End:
@// Set return value

View file

@ -100,45 +100,45 @@
@// Neon registers
#define dX0 D0.F32
#define dShift D1.F32
#define dX1 D1.F32
#define dY0 D2.F32
#define dY1 D3.F32
#define dX0r D0.F32
#define dX0i D1.F32
#define dX1r D2.F32
#define dX1i D3.F32
#define dW0r D4.F32
#define dW0i D5.F32
#define dW1r D6.F32
#define dW1i D7.F32
#define dT0 D8.F32
#define dT1 D9.F32
#define dT2 D10.F32
#define dT3 D11.F32
#define qT0 d12.F32
#define qT1 d14.F32
#define qT2 d16.F32
#define qT3 d18.F32
#define dY0r D4.F32
#define dY0i D5.F32
#define dY1r D6.F32
#define dY1i D7.F32
#define dzero D20.F32
#define dX0 D0
#define dShift D1
#define dX1 D1
#define dY0 D2
#define dY1 D3
#define dX0r D0
#define dX0i D1
#define dX1r D2
#define dX1i D3
#define dW0r D4
#define dW0i D5
#define dW1r D6
#define dW1i D7
#define dT0 D8
#define dT1 D9
#define dT2 D10
#define dT3 D11
#define qT0 d12
#define qT1 d14
#define qT2 d16
#define qT3 d18
#define dY0r D4
#define dY0i D5
#define dY1r D6
#define dY1i D7
#define dzero D20
#define dY2 D4.F32
#define dY3 D5.F32
#define dW0 D6.F32
#define dW1 D7.F32
#define dW0Tmp D10.F32
#define dW1Neg D11.F32
#define dY2 D4
#define dY3 D5
#define dW0 D6
#define dW1 D7
#define dW0Tmp D10
#define dW1Neg D11
#define sN S0.S32
#define fN S1.F32
#define fN S1
@// two must be the same as dScale[0]!
#define dScale D2.F32
#define two S4.F32
#define dScale D2
#define two S4
@// Allocate stack memory required by the function
@ -165,8 +165,8 @@
@// N=1 Treat seperately
CMP N,#1
BGT sizeGreaterThanOne
VLD1 dX0[0],[pSrc]
VST1 dX0[0],[pDst]
VLD1.F32 dX0[0],[pSrc]
VST1.F32 dX0[0],[pDst]
B End
@ -195,8 +195,8 @@ complexIFFT:
CMP order,#1
BGE orderGreaterthan0 @// order > 0
VLD1 dX0,[pSrc]
VST1 dX0,[pDst]
VLD1.F32 dX0,[pSrc]
VST1.F32 dX0,[pDst]
MOV pSrc,pDst
BLT FFTEnd
@ -260,14 +260,14 @@ lastStageUnscaledRadix4:
FFTEnd: @// Does only the scaling
@ Scale inverse FFT result by 2 for consistency with other FFTs
VMOV two, 2.0 @ two = dScale[0]
VMOV.F32 two, #2.0 @ two = dScale[0]
@// N = subFFTSize ; dataptr = pDst
scaleFFTData:
VLD1 {dX0},[pSrc] @// pSrc contains pDst pointer
VLD1.F32 {dX0},[pSrc] @// pSrc contains pDst pointer
SUBS subFFTSize,subFFTSize,#1
VMUL dX0, dX0, dScale[0]
VST1 {dX0},[pSrc]!
VMUL.F32 dX0, dX0, dScale[0]
VST1.F32 {dX0},[pSrc]!
BGT scaleFFTData