Mirror of https://github.com/mozilla/gecko-dev.git
Bug 1540760 - Add missing aarch64 files for ffvpx; r=jya
Differential Revision: https://phabricator.services.mozilla.com/D27789

--HG--
extra : moz-landing-system : lando
Parent: 8bcd86c1fe
Commit: 23afbbacd1
@@ -0,0 +1,50 @@
/*
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"

#include "libavcodec/fft.h"

void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);

void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);

av_cold void ff_fft_init_aarch64(FFTContext *s)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
        s->fft_permute      = ff_fft_permute_neon;
        s->fft_calc         = ff_fft_calc_neon;
#if CONFIG_MDCT
        s->imdct_calc       = ff_imdct_calc_neon;
        s->imdct_half       = ff_imdct_half_neon;
        s->mdct_calc        = ff_mdct_calc_neon;
        s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
#endif
    }
}
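
The init file above (FFmpeg's fft_init_aarch64.c) shows the runtime-dispatch pattern that every *_init_aarch64.c file in this commit follows: the context carries function pointers to portable C routines, and the init hook overwrites them with NEON versions only when the CPU probe reports NEON support. A minimal self-contained C sketch of that pattern follows; the struct, flag constant and probe function are illustrative stand-ins, not FFmpeg's real API (FFmpeg uses av_get_cpu_flags() and the have_neon() predicate):

#include <stdio.h>

/* Illustrative stand-in for FFTContext and its function pointers. */
typedef struct FFT {
    void (*fft_calc)(struct FFT *s, float *z);
} FFT;

static void fft_calc_c(FFT *s, float *z)    { (void)s; (void)z; puts("C path"); }
static void fft_calc_neon(FFT *s, float *z) { (void)s; (void)z; puts("NEON path"); }

#define CPU_FLAG_NEON (1 << 0)

/* Hypothetical probe; real code queries the OS/CPU once at startup. */
static int get_cpu_flags(void) { return CPU_FLAG_NEON; }

static void fft_init(FFT *s)
{
    s->fft_calc = fft_calc_c;            /* portable default */
    if (get_cpu_flags() & CPU_FLAG_NEON)
        s->fft_calc = fft_calc_neon;     /* override when SIMD is available */
}

int main(void)
{
    FFT s;
    float z[8] = {0};
    fft_init(&s);
    s.fft_calc(&s, z);   /* dispatches through the pointer set at init time */
    return 0;
}

The probe runs once during init, so each later call pays only a plain indirect call.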

@@ -0,0 +1,442 @@
/*
 * ARM NEON optimised FFT
 *
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2009 Naotoshi Nojiri
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This algorithm (though not any of the implementation details) is
 * based on libdjbfft by D. J. Bernstein.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

#define M_SQRT1_2 0.70710678118654752440

.macro transpose d0, d1, s0, s1
        trn1 \d0, \s0, \s1
        trn2 \d1, \s0, \s1
.endm


function fft4_neon
        ld1 {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]

        fadd v4.2s, v0.2s, v1.2s   // r0+r1,i0+i1
        fsub v6.2s, v0.2s, v1.2s   // r0-r1,i0-i1

        ext v16.8b, v2.8b, v3.8b, #4
        ext v17.8b, v3.8b, v2.8b, #4

        fadd v5.2s, v2.2s, v3.2s   // i2+i3,r2+r3
        fsub v7.2s, v16.2s, v17.2s // r3-r2,i2-i3

        fadd v0.2s, v4.2s, v5.2s
        fsub v2.2s, v4.2s, v5.2s
        fadd v1.2s, v6.2s, v7.2s
        fsub v3.2s, v6.2s, v7.2s

        st1 {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]

        ret
endfunc

function fft8_neon
        mov x1, x0
        ld1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x0], #32
        ld1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]
        ext v22.8b, v2.8b, v3.8b, #4
        ext v23.8b, v3.8b, v2.8b, #4
        fadd v4.2s, v16.2s, v17.2s  // r4+r5,i4+i5
        fadd v5.2s, v18.2s, v19.2s  // r6+r7,i6+i7
        fsub v17.2s, v16.2s, v17.2s // r4-r5,i4-i5
        fsub v19.2s, v18.2s, v19.2s // r6-r7,i6-i7
        rev64 v27.2s, v28.2s        // ???
        fadd v20.2s, v0.2s, v1.2s   // r0+r1,i0+i1
        fadd v21.2s, v2.2s, v3.2s   // r2+r3,i2+i3
        fmul v26.2s, v17.2s, v28.2s // -a2r*w,a2i*w
        ext v6.8b, v4.8b, v5.8b, #4
        ext v7.8b, v5.8b, v4.8b, #4
        fmul v27.2s, v19.2s, v27.2s // a3r*w,-a3i*w
        fsub v23.2s, v22.2s, v23.2s // i2-i3,r3-r2
        fsub v22.2s, v0.2s, v1.2s   // r0-r1,i0-i1
        fmul v24.2s, v17.2s, v28.s[1] // a2r*w,a2i*w
        fmul v25.2s, v19.2s, v28.s[1] // a3r*w,a3i*w
        fadd v0.2s, v20.2s, v21.2s
        fsub v2.2s, v20.2s, v21.2s
        fadd v1.2s, v22.2s, v23.2s
        rev64 v26.2s, v26.2s
        rev64 v27.2s, v27.2s
        fsub v3.2s, v22.2s, v23.2s
        fsub v6.2s, v6.2s, v7.2s
        fadd v24.2s, v24.2s, v26.2s // a2r+a2i,a2i-a2r t1,t2
        fadd v25.2s, v25.2s, v27.2s // a3r-a3i,a3i+a3r t5,t6
        fadd v7.2s, v4.2s, v5.2s
        fsub v18.2s, v2.2s, v6.2s
        ext v26.8b, v24.8b, v25.8b, #4
        ext v27.8b, v25.8b, v24.8b, #4
        fadd v2.2s, v2.2s, v6.2s
        fsub v16.2s, v0.2s, v7.2s
        fadd v5.2s, v25.2s, v24.2s
        fsub v4.2s, v26.2s, v27.2s
        fadd v0.2s, v0.2s, v7.2s
        fsub v17.2s, v1.2s, v5.2s
        fsub v19.2s, v3.2s, v4.2s
        fadd v3.2s, v3.2s, v4.2s
        fadd v1.2s, v1.2s, v5.2s

        st1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]
        st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x1]

        ret
endfunc

function fft16_neon
        mov x1, x0
        ld1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x0], #32
        ld1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0], #32
        ext v22.8b, v2.8b, v3.8b, #4
        ext v23.8b, v3.8b, v2.8b, #4
        fadd v4.2s, v16.2s, v17.2s  // r4+r5,i4+i5
        fadd v5.2s, v18.2s, v19.2s  // r6+r7,i6+i7
        fsub v17.2s, v16.2s, v17.2s // r4-r5,i4-i5
        fsub v19.2s, v18.2s, v19.2s // r6-r7,i6-i7
        rev64 v27.2s, v28.2s        // ???
        fadd v20.2s, v0.2s, v1.2s   // r0+r1,i0+i1
        fadd v21.2s, v2.2s, v3.2s   // r2+r3,i2+i3
        fmul v26.2s, v17.2s, v28.2s // -a2r*w,a2i*w
        ext v6.8b, v4.8b, v5.8b, #4
        ext v7.8b, v5.8b, v4.8b, #4
        fmul v27.2s, v19.2s, v27.2s // a3r*w,-a3i*w
        fsub v23.2s, v22.2s, v23.2s // i2-i3,r3-r2
        fsub v22.2s, v0.2s, v1.2s   // r0-r1,i0-i1
        fmul v24.2s, v17.2s, v28.s[1] // a2r*w,a2i*w
        fmul v25.2s, v19.2s, v28.s[1] // a3r*w,a3i*w
        fadd v0.2s, v20.2s, v21.2s
        fsub v2.2s, v20.2s, v21.2s
        fadd v1.2s, v22.2s, v23.2s
        rev64 v26.2s, v26.2s
        rev64 v27.2s, v27.2s
        fsub v3.2s, v22.2s, v23.2s
        fsub v6.2s, v6.2s, v7.2s
        fadd v24.2s, v24.2s, v26.2s // a2r+a2i,a2i-a2r t1,t2
        fadd v25.2s, v25.2s, v27.2s // a3r-a3i,a3i+a3r t5,t6
        fadd v7.2s, v4.2s, v5.2s
        fsub v18.2s, v2.2s, v6.2s
        ld1 {v20.4s,v21.4s}, [x0], #32
        ld1 {v22.4s,v23.4s}, [x0], #32
        ext v26.8b, v24.8b, v25.8b, #4
        ext v27.8b, v25.8b, v24.8b, #4
        fadd v2.2s, v2.2s, v6.2s
        fsub v16.2s, v0.2s, v7.2s
        fadd v5.2s, v25.2s, v24.2s
        fsub v4.2s, v26.2s, v27.2s
        transpose v24.2d, v25.2d, v20.2d, v22.2d
        transpose v26.2d, v27.2d, v21.2d, v23.2d
        fadd v0.2s, v0.2s, v7.2s
        fsub v17.2s, v1.2s, v5.2s
        fsub v19.2s, v3.2s, v4.2s
        fadd v3.2s, v3.2s, v4.2s
        fadd v1.2s, v1.2s, v5.2s
        ext v20.16b, v21.16b, v21.16b, #4
        ext v21.16b, v23.16b, v23.16b, #4

        zip1 v0.2d, v0.2d, v1.2d   // {z[0],   z[1]}
        zip1 v1.2d, v2.2d, v3.2d   // {z[2],   z[3]}
        zip1 v2.2d, v16.2d, v17.2d // {z[o1],  z[o1+1]}
        zip1 v3.2d, v18.2d, v19.2d // {z[o1+2],z[o1+3]}

        // 2 x fft4
        transpose v22.2d, v23.2d, v20.2d, v21.2d

        fadd v4.4s, v24.4s, v25.4s
        fadd v5.4s, v26.4s, v27.4s
        fsub v6.4s, v24.4s, v25.4s
        fsub v7.4s, v22.4s, v23.4s

        ld1 {v23.4s}, [x14]

        fadd v24.4s, v4.4s, v5.4s // {z[o2+0],z[o2+1]}
        fsub v26.4s, v4.4s, v5.4s // {z[o2+2],z[o2+3]}
        fadd v25.4s, v6.4s, v7.4s // {z[o3+0],z[o3+1]}
        fsub v27.4s, v6.4s, v7.4s // {z[o3+2],z[o3+3]}

        //fft_pass_neon_16
        rev64 v7.4s, v25.4s
        fmul v25.4s, v25.4s, v23.s[1]
        fmul v7.4s, v7.4s, v29.4s
        fmla v25.4s, v7.4s, v23.s[3] // {t1a,t2a,t5a,t6a}

        zip1 v20.4s, v24.4s, v25.4s
        zip2 v21.4s, v24.4s, v25.4s
        fneg v22.4s, v20.4s
        fadd v4.4s, v21.4s, v20.4s
        fsub v6.4s, v20.4s, v21.4s // just the second half
        fadd v5.4s, v21.4s, v22.4s // just the first half

        tbl v4.16b, {v4.16b}, v30.16b        // trans4_float
        tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float

        fsub v20.4s, v0.4s, v4.4s // {z[o2],z[o2+1]}
        fadd v16.4s, v0.4s, v4.4s // {z[0], z[1]}
        fsub v22.4s, v2.4s, v5.4s // {z[o3],z[o3+1]}
        fadd v18.4s, v2.4s, v5.4s // {z[o1],z[o1+1]}

        //second half
        rev64 v6.4s, v26.4s
        fmul v26.4s, v26.4s, v23.s[2]
        rev64 v7.4s, v27.4s
        fmul v27.4s, v27.4s, v23.s[3]
        fmul v6.4s, v6.4s, v29.4s
        fmul v7.4s, v7.4s, v29.4s
        fmla v26.4s, v6.4s, v23.s[2] // {t1,t2,t5,t6}
        fmla v27.4s, v7.4s, v23.s[1] // {t1a,t2a,t5a,t6a}

        zip1 v24.4s, v26.4s, v27.4s
        zip2 v25.4s, v26.4s, v27.4s
        fneg v26.4s, v24.4s
        fadd v4.4s, v25.4s, v24.4s
        fsub v6.4s, v24.4s, v25.4s // just the second half
        fadd v5.4s, v25.4s, v26.4s // just the first half

        tbl v4.16b, {v4.16b}, v30.16b        // trans4_float
        tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float

        fadd v17.4s, v1.4s, v4.4s // {z[2],   z[3]}
        fsub v21.4s, v1.4s, v4.4s // {z[o2+2],z[o2+3]}
        fadd v19.4s, v3.4s, v5.4s // {z[o1+2],z[o1+3]}
        fsub v23.4s, v3.4s, v5.4s // {z[o3+2],z[o3+3]}

        st1 {v16.4s,v17.4s}, [x1], #32
        st1 {v18.4s,v19.4s}, [x1], #32
        st1 {v20.4s,v21.4s}, [x1], #32
        st1 {v22.4s,v23.4s}, [x1], #32

        ret
endfunc


const trans4_float, align=4
        .byte 0, 1, 2, 3
        .byte 8, 9, 10, 11
        .byte 4, 5, 6, 7
        .byte 12, 13, 14, 15
endconst

const trans8_float, align=4
        .byte 24, 25, 26, 27
        .byte 0, 1, 2, 3
        .byte 28, 29, 30, 31
        .byte 4, 5, 6, 7
endconst

function fft_pass_neon
        sub x6, x2, #1          // n - 1, loop counter
        lsl x5, x2, #3          // 2 * n * sizeof FFTSample
        lsl x1, x2, #4          // 2 * n * sizeof FFTComplex
        add x5, x4, x5          // wim
        add x3, x1, x2, lsl #5  // 4 * n * sizeof FFTComplex
        add x2, x0, x2, lsl #5  // &z[o2]
        add x3, x0, x3          // &z[o3]
        add x1, x0, x1          // &z[o1]
        ld1 {v20.4s},[x2]       // {z[o2],z[o2+1]}
        ld1 {v22.4s},[x3]       // {z[o3],z[o3+1]}
        ld1 {v4.2s}, [x4], #8   // {wre[0],wre[1]}
        trn2 v25.2d, v20.2d, v22.2d
        sub x5, x5, #4          // wim--
        trn1 v24.2d, v20.2d, v22.2d
        ld1 {v5.s}[0], [x5], x7 // d5[0] = wim[-1]
        rev64 v7.4s, v25.4s
        fmul v25.4s, v25.4s, v4.s[1]
        ld1 {v16.4s}, [x0]      // {z[0],z[1]}
        fmul v7.4s, v7.4s, v29.4s
        ld1 {v17.4s}, [x1]      // {z[o1],z[o1+1]}
        prfm pldl1keep, [x2, #16]
        prfm pldl1keep, [x3, #16]
        fmla v25.4s, v7.4s, v5.s[0] // {t1a,t2a,t5a,t6a}
        prfm pldl1keep, [x0, #16]
        prfm pldl1keep, [x1, #16]

        zip1 v20.4s, v24.4s, v25.4s
        zip2 v21.4s, v24.4s, v25.4s
        fneg v22.4s, v20.4s
        fadd v4.4s, v21.4s, v20.4s
        fsub v6.4s, v20.4s, v21.4s // just the second half
        fadd v5.4s, v21.4s, v22.4s // just the first half

        tbl v4.16b, {v4.16b}, v30.16b        // trans4_float
        tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float

        fadd v20.4s, v16.4s, v4.4s
        fsub v22.4s, v16.4s, v4.4s
        fadd v21.4s, v17.4s, v5.4s
        st1 {v20.4s}, [x0], #16 // {z[0], z[1]}
        fsub v23.4s, v17.4s, v5.4s

        st1 {v21.4s}, [x1], #16 // {z[o1],z[o1+1]}
        st1 {v22.4s}, [x2], #16 // {z[o2],z[o2+1]}
        st1 {v23.4s}, [x3], #16 // {z[o3],z[o3+1]}
1:
        ld1 {v20.4s},[x2]       // {z[o2],z[o2+1]}
        ld1 {v22.4s},[x3]       // {z[o3],z[o3+1]}
        ld1 {v4.2s}, [x4], #8   // {wre[0],wre[1]}
        transpose v26.2d, v27.2d, v20.2d, v22.2d
        ld1 {v5.2s}, [x5], x7   // {wim[-1],wim[0]}
        rev64 v6.4s, v26.4s
        fmul v26.4s, v26.4s, v4.s[0]
        rev64 v7.4s, v27.4s
        fmul v27.4s, v27.4s, v4.s[1]
        fmul v6.4s, v6.4s, v29.4s
        fmul v7.4s, v7.4s, v29.4s
        ld1 {v16.4s},[x0]       // {z[0],z[1]}
        fmla v26.4s, v6.4s, v5.s[1] // {t1,t2,t5,t6}
        fmla v27.4s, v7.4s, v5.s[0] // {t1a,t2a,t5a,t6a}
        ld1 {v17.4s},[x1]       // {z[o1],z[o1+1]}

        subs x6, x6, #1         // n--

        zip1 v20.4s, v26.4s, v27.4s
        zip2 v21.4s, v26.4s, v27.4s
        fneg v22.4s, v20.4s
        fadd v4.4s, v21.4s, v20.4s
        fsub v6.4s, v20.4s, v21.4s // just the second half
        fadd v5.4s, v21.4s, v22.4s // just the first half

        tbl v4.16b, {v4.16b}, v30.16b        // trans4_float
        tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float

        fadd v20.4s, v16.4s, v4.4s
        fsub v22.4s, v16.4s, v4.4s
        fadd v21.4s, v17.4s, v5.4s
        st1 {v20.4s}, [x0], #16 // {z[0], z[1]}
        fsub v23.4s, v17.4s, v5.4s

        st1 {v21.4s}, [x1], #16 // {z[o1],z[o1+1]}
        st1 {v22.4s}, [x2], #16 // {z[o2],z[o2+1]}
        st1 {v23.4s}, [x3], #16 // {z[o3],z[o3+1]}
        b.ne 1b

        ret
endfunc

.macro def_fft n, n2, n4
function fft\n\()_neon, align=6
        sub sp, sp, #16
        stp x28, x30, [sp]
        add x28, x0, #\n4*2*8
        bl fft\n2\()_neon
        mov x0, x28
        bl fft\n4\()_neon
        add x0, x28, #\n4*1*8
        bl fft\n4\()_neon
        sub x0, x28, #\n4*2*8
        ldp x28, x30, [sp], #16
        movrel x4, X(ff_cos_\n)
        mov x2, #\n4>>1
        b fft_pass_neon
endfunc
.endm

def_fft 32, 16, 8
def_fft 64, 32, 16
def_fft 128, 64, 32
def_fft 256, 128, 64
def_fft 512, 256, 128
def_fft 1024, 512, 256
def_fft 2048, 1024, 512
def_fft 4096, 2048, 1024
def_fft 8192, 4096, 2048
def_fft 16384, 8192, 4096
def_fft 32768, 16384, 8192
def_fft 65536, 32768, 16384

function ff_fft_calc_neon, export=1
        prfm pldl1keep, [x1]
        movrel x10, trans4_float
        ldr w2, [x0]
        movrel x11, trans8_float
        sub w2, w2, #2
        movrel x3, fft_tab_neon
        ld1 {v30.16b}, [x10]
        mov x7, #-8
        movrel x12, pmmp
        ldr x3, [x3, x2, lsl #3]
        movrel x13, mppm
        movrel x14, X(ff_cos_16)
        ld1 {v31.16b}, [x11]
        mov x0, x1
        ld1 {v29.4s}, [x12]     // pmmp
        ld1 {v28.4s}, [x13]
        br x3
endfunc

function ff_fft_permute_neon, export=1
        mov x6, #1
        ldr w2, [x0]            // nbits
        ldr x3, [x0, #16]       // tmp_buf
        ldr x0, [x0, #8]        // revtab
        lsl x6, x6, x2
        mov x2, x6
1:
        ld1 {v0.2s,v1.2s}, [x1], #16
        ldr w4, [x0], #4
        uxth w5, w4
        lsr w4, w4, #16
        add x5, x3, x5, lsl #3
        add x4, x3, x4, lsl #3
        st1 {v0.2s}, [x5]
        st1 {v1.2s}, [x4]
        subs x6, x6, #2
        b.gt 1b

        sub x1, x1, x2, lsl #3
1:
        ld1 {v0.4s,v1.4s}, [x3], #32
        st1 {v0.4s,v1.4s}, [x1], #32
        subs x2, x2, #4
        b.gt 1b

        ret
endfunc

const fft_tab_neon, relocate=1
        .quad fft4_neon
        .quad fft8_neon
        .quad fft16_neon
        .quad fft32_neon
        .quad fft64_neon
        .quad fft128_neon
        .quad fft256_neon
        .quad fft512_neon
        .quad fft1024_neon
        .quad fft2048_neon
        .quad fft4096_neon
        .quad fft8192_neon
        .quad fft16384_neon
        .quad fft32768_neon
        .quad fft65536_neon
endconst

const pmmp, align=4
        .float +1.0, -1.0, -1.0, +1.0
endconst

const mppm, align=4
        .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
endconst
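
Note how ff_fft_calc_neon above dispatches on transform size: it reads nbits from the context, indexes fft_tab_neon with nbits - 2 (`ldr x3, [x3, x2, lsl #3]`) and jumps straight to the matching fftN_neon entry with `br x3`, so there is no size loop at all. A rough C analogue of that table dispatch, with hypothetical names standing in for the assembly entry points:

typedef struct FFTComplex { float re, im; } FFTComplex;
typedef void (*fft_fn)(FFTComplex *z);

/* Stand-ins for fft4_neon ... fft65536_neon. */
static void fft4(FFTComplex *z)  { (void)z; /* 4-point transform  */ }
static void fft8(FFTComplex *z)  { (void)z; /* 8-point transform  */ }
static void fft16(FFTComplex *z) { (void)z; /* 16-point transform */ }

static const fft_fn fft_tab[] = { fft4, fft8, fft16 /* ..., fft65536 */ };

/* nbits = log2 of the transform size; the table starts at nbits == 2. */
static void fft_calc(int nbits, FFTComplex *z)
{
    fft_tab[nbits - 2](z);  /* same as: ldr x3, [x3, x2, lsl #3]; br x3 */
}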

@@ -0,0 +1,59 @@
/*
 * ARM NEON optimised H.264 chroma functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/h264chroma.h"

#include "config.h"

void ff_put_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);
void ff_put_h264_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);
void ff_put_h264_chroma_mc2_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);

void ff_avg_h264_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);
void ff_avg_h264_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);
void ff_avg_h264_chroma_mc2_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                 int h, int x, int y);

av_cold void ff_h264chroma_init_aarch64(H264ChromaContext *c, int bit_depth)
{
    const int high_bit_depth = bit_depth > 8;
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags) && !high_bit_depth) {
        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon;
        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_neon;

        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_neon;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_neon;
        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_neon;
    }
}

@@ -0,0 +1,450 @@
/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

/* chroma_mc8(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
.macro h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
  .ifc \type,avg
        mov x8, x0
  .endif
        prfm pldl1strm, [x1]
        prfm pldl1strm, [x1, x2]
  .ifc \codec,rv40
        movrel x6, rv40bias
        lsr w9, w5, #1
        lsr w10, w4, #1
        lsl w9, w9, #3
        lsl w10, w10, #1
        add w9, w9, w10
        add x6, x6, w9, UXTW
        ld1r {v22.8H}, [x6]
  .endif
  .ifc \codec,vc1
        movi v22.8H, #28
  .endif
        mul w7, w4, w5
        lsl w14, w5, #3
        lsl w13, w4, #3
        cmp w7, #0
        sub w6, w14, w7
        sub w12, w13, w7
        sub w4, w7, w13
        sub w4, w4, w14
        add w4, w4, #64
        b.eq 2f

        dup v0.8B, w4
        dup v1.8B, w12
        ld1 {v4.8B, v5.8B}, [x1], x2
        dup v2.8B, w6
        dup v3.8B, w7
        ext v5.8B, v4.8B, v5.8B, #1
1:      ld1 {v6.8B, v7.8B}, [x1], x2
        umull v16.8H, v4.8B, v0.8B
        umlal v16.8H, v5.8B, v1.8B
        ext v7.8B, v6.8B, v7.8B, #1
        ld1 {v4.8B, v5.8B}, [x1], x2
        umlal v16.8H, v6.8B, v2.8B
        prfm pldl1strm, [x1]
        ext v5.8B, v4.8B, v5.8B, #1
        umlal v16.8H, v7.8B, v3.8B
        umull v17.8H, v6.8B, v0.8B
        subs w3, w3, #2
        umlal v17.8H, v7.8B, v1.8B
        umlal v17.8H, v4.8B, v2.8B
        umlal v17.8H, v5.8B, v3.8B
        prfm pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn v16.8B, v16.8H, #6
        rshrn v17.8B, v17.8H, #6
  .else
        add v16.8H, v16.8H, v22.8H
        add v17.8H, v17.8H, v22.8H
        shrn v16.8B, v16.8H, #6
        shrn v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1 {v20.8B}, [x8], x2
        ld1 {v21.8B}, [x8], x2
        urhadd v16.8B, v16.8B, v20.8B
        urhadd v17.8B, v17.8B, v21.8B
  .endif
        st1 {v16.8B}, [x0], x2
        st1 {v17.8B}, [x0], x2
        b.gt 1b
        ret

2:      adds w12, w12, w6
        dup v0.8B, w4
        b.eq 5f
        tst w6, w6
        dup v1.8B, w12
        b.eq 4f

        ld1 {v4.8B}, [x1], x2
3:      ld1 {v6.8B}, [x1], x2
        umull v16.8H, v4.8B, v0.8B
        umlal v16.8H, v6.8B, v1.8B
        ld1 {v4.8B}, [x1], x2
        umull v17.8H, v6.8B, v0.8B
        umlal v17.8H, v4.8B, v1.8B
        prfm pldl1strm, [x1]
  .ifc \codec,h264
        rshrn v16.8B, v16.8H, #6
        rshrn v17.8B, v17.8H, #6
  .else
        add v16.8H, v16.8H, v22.8H
        add v17.8H, v17.8H, v22.8H
        shrn v16.8B, v16.8H, #6
        shrn v17.8B, v17.8H, #6
  .endif
        prfm pldl1strm, [x1, x2]
  .ifc \type,avg
        ld1 {v20.8B}, [x8], x2
        ld1 {v21.8B}, [x8], x2
        urhadd v16.8B, v16.8B, v20.8B
        urhadd v17.8B, v17.8B, v21.8B
  .endif
        subs w3, w3, #2
        st1 {v16.8B}, [x0], x2
        st1 {v17.8B}, [x0], x2
        b.gt 3b
        ret

4:      ld1 {v4.8B, v5.8B}, [x1], x2
        ld1 {v6.8B, v7.8B}, [x1], x2
        ext v5.8B, v4.8B, v5.8B, #1
        ext v7.8B, v6.8B, v7.8B, #1
        prfm pldl1strm, [x1]
        subs w3, w3, #2
        umull v16.8H, v4.8B, v0.8B
        umlal v16.8H, v5.8B, v1.8B
        umull v17.8H, v6.8B, v0.8B
        umlal v17.8H, v7.8B, v1.8B
        prfm pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn v16.8B, v16.8H, #6
        rshrn v17.8B, v17.8H, #6
  .else
        add v16.8H, v16.8H, v22.8H
        add v17.8H, v17.8H, v22.8H
        shrn v16.8B, v16.8H, #6
        shrn v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1 {v20.8B}, [x8], x2
        ld1 {v21.8B}, [x8], x2
        urhadd v16.8B, v16.8B, v20.8B
        urhadd v17.8B, v17.8B, v21.8B
  .endif
        st1 {v16.8B}, [x0], x2
        st1 {v17.8B}, [x0], x2
        b.gt 4b
        ret

5:      ld1 {v4.8B}, [x1], x2
        ld1 {v5.8B}, [x1], x2
        prfm pldl1strm, [x1]
        subs w3, w3, #2
        umull v16.8H, v4.8B, v0.8B
        umull v17.8H, v5.8B, v0.8B
        prfm pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn v16.8B, v16.8H, #6
        rshrn v17.8B, v17.8H, #6
  .else
        add v16.8H, v16.8H, v22.8H
        add v17.8H, v17.8H, v22.8H
        shrn v16.8B, v16.8H, #6
        shrn v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1 {v20.8B}, [x8], x2
        ld1 {v21.8B}, [x8], x2
        urhadd v16.8B, v16.8B, v20.8B
        urhadd v17.8B, v17.8B, v21.8B
  .endif
        st1 {v16.8B}, [x0], x2
        st1 {v17.8B}, [x0], x2
        b.gt 5b
        ret
endfunc
.endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
.macro h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
  .ifc \type,avg
        mov x8, x0
  .endif
        prfm pldl1strm, [x1]
        prfm pldl1strm, [x1, x2]
  .ifc \codec,rv40
        movrel x6, rv40bias
        lsr w9, w5, #1
        lsr w10, w4, #1
        lsl w9, w9, #3
        lsl w10, w10, #1
        add w9, w9, w10
        add x6, x6, w9, UXTW
        ld1r {v22.8H}, [x6]
  .endif
  .ifc \codec,vc1
        movi v22.8H, #28
  .endif
        mul w7, w4, w5
        lsl w14, w5, #3
        lsl w13, w4, #3
        cmp w7, #0
        sub w6, w14, w7
        sub w12, w13, w7
        sub w4, w7, w13
        sub w4, w4, w14
        add w4, w4, #64
        b.eq 2f

        dup v24.8B, w4
        dup v25.8B, w12
        ld1 {v4.8B}, [x1], x2
        dup v26.8B, w6
        dup v27.8B, w7
        ext v5.8B, v4.8B, v5.8B, #1
        trn1 v0.2S, v24.2S, v25.2S
        trn1 v2.2S, v26.2S, v27.2S
        trn1 v4.2S, v4.2S, v5.2S
1:      ld1 {v6.8B}, [x1], x2
        ext v7.8B, v6.8B, v7.8B, #1
        trn1 v6.2S, v6.2S, v7.2S
        umull v18.8H, v4.8B, v0.8B
        umlal v18.8H, v6.8B, v2.8B
        ld1 {v4.8B}, [x1], x2
        ext v5.8B, v4.8B, v5.8B, #1
        trn1 v4.2S, v4.2S, v5.2S
        prfm pldl1strm, [x1]
        umull v19.8H, v6.8B, v0.8B
        umlal v19.8H, v4.8B, v2.8B
        trn1 v30.2D, v18.2D, v19.2D
        trn2 v31.2D, v18.2D, v19.2D
        add v18.8H, v30.8H, v31.8H
  .ifc \codec,h264
        rshrn v16.8B, v18.8H, #6
  .else
        add v18.8H, v18.8H, v22.8H
        shrn v16.8B, v18.8H, #6
  .endif
        subs w3, w3, #2
        prfm pldl1strm, [x1, x2]
  .ifc \type,avg
        ld1 {v20.S}[0], [x8], x2
        ld1 {v20.S}[1], [x8], x2
        urhadd v16.8B, v16.8B, v20.8B
  .endif
        st1 {v16.S}[0], [x0], x2
        st1 {v16.S}[1], [x0], x2
        b.gt 1b
        ret

2:      adds w12, w12, w6
        dup v30.8B, w4
        b.eq 5f
        tst w6, w6
        dup v31.8B, w12
        trn1 v0.2S, v30.2S, v31.2S
        trn2 v1.2S, v30.2S, v31.2S
        b.eq 4f

        ext v1.8B, v0.8B, v1.8B, #4
        ld1 {v4.S}[0], [x1], x2
3:      ld1 {v4.S}[1], [x1], x2
        umull v18.8H, v4.8B, v0.8B
        ld1 {v4.S}[0], [x1], x2
        umull v19.8H, v4.8B, v1.8B
        trn1 v30.2D, v18.2D, v19.2D
        trn2 v31.2D, v18.2D, v19.2D
        add v18.8H, v30.8H, v31.8H
        prfm pldl1strm, [x1]
  .ifc \codec,h264
        rshrn v16.8B, v18.8H, #6
  .else
        add v18.8H, v18.8H, v22.8H
        shrn v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1 {v20.S}[0], [x8], x2
        ld1 {v20.S}[1], [x8], x2
        urhadd v16.8B, v16.8B, v20.8B
  .endif
        subs w3, w3, #2
        prfm pldl1strm, [x1, x2]
        st1 {v16.S}[0], [x0], x2
        st1 {v16.S}[1], [x0], x2
        b.gt 3b
        ret

4:      ld1 {v4.8B}, [x1], x2
        ld1 {v6.8B}, [x1], x2
        ext v5.8B, v4.8B, v5.8B, #1
        ext v7.8B, v6.8B, v7.8B, #1
        trn1 v4.2S, v4.2S, v5.2S
        trn1 v6.2S, v6.2S, v7.2S
        umull v18.8H, v4.8B, v0.8B
        umull v19.8H, v6.8B, v0.8B
        subs w3, w3, #2
        trn1 v30.2D, v18.2D, v19.2D
        trn2 v31.2D, v18.2D, v19.2D
        add v18.8H, v30.8H, v31.8H
        prfm pldl1strm, [x1]
  .ifc \codec,h264
        rshrn v16.8B, v18.8H, #6
  .else
        add v18.8H, v18.8H, v22.8H
        shrn v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1 {v20.S}[0], [x8], x2
        ld1 {v20.S}[1], [x8], x2
        urhadd v16.8B, v16.8B, v20.8B
  .endif
        prfm pldl1strm, [x1]
        st1 {v16.S}[0], [x0], x2
        st1 {v16.S}[1], [x0], x2
        b.gt 4b
        ret

5:      ld1 {v4.S}[0], [x1], x2
        ld1 {v4.S}[1], [x1], x2
        umull v18.8H, v4.8B, v30.8B
        subs w3, w3, #2
        prfm pldl1strm, [x1]
  .ifc \codec,h264
        rshrn v16.8B, v18.8H, #6
  .else
        add v18.8H, v18.8H, v22.8H
        shrn v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1 {v20.S}[0], [x8], x2
        ld1 {v20.S}[1], [x8], x2
        urhadd v16.8B, v16.8B, v20.8B
  .endif
        prfm pldl1strm, [x1]
        st1 {v16.S}[0], [x0], x2
        st1 {v16.S}[1], [x0], x2
        b.gt 5b
        ret
endfunc
.endm

.macro h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        prfm pldl1strm, [x1]
        prfm pldl1strm, [x1, x2]
        orr w7, w4, w5
        cbz w7, 2f

        mul w7, w4, w5
        lsl w14, w5, #3
        lsl w13, w4, #3
        sub w6, w14, w7
        sub w12, w13, w7
        sub w4, w7, w13
        sub w4, w4, w14
        add w4, w4, #64
        dup v0.8B, w4
        dup v2.8B, w12
        dup v1.8B, w6
        dup v3.8B, w7
        trn1 v0.4H, v0.4H, v2.4H
        trn1 v1.4H, v1.4H, v3.4H
1:
        ld1 {v4.S}[0], [x1], x2
        ld1 {v4.S}[1], [x1], x2
        rev64 v5.2S, v4.2S
        ld1 {v5.S}[1], [x1]
        ext v6.8B, v4.8B, v5.8B, #1
        ext v7.8B, v5.8B, v4.8B, #1
        trn1 v4.4H, v4.4H, v6.4H
        trn1 v5.4H, v5.4H, v7.4H
        umull v16.8H, v4.8B, v0.8B
        umlal v16.8H, v5.8B, v1.8B
  .ifc \type,avg
        ld1 {v18.H}[0], [x0], x2
        ld1 {v18.H}[2], [x0]
        sub x0, x0, x2
  .endif
        rev64 v17.4S, v16.4S
        add v16.8H, v16.8H, v17.8H
        rshrn v16.8B, v16.8H, #6
  .ifc \type,avg
        urhadd v16.8B, v16.8B, v18.8B
  .endif
        st1 {v16.H}[0], [x0], x2
        st1 {v16.H}[2], [x0], x2
        subs w3, w3, #2
        b.gt 1b
        ret

2:
        ld1 {v16.H}[0], [x1], x2
        ld1 {v16.H}[1], [x1], x2
  .ifc \type,avg
        ld1 {v18.H}[0], [x0], x2
        ld1 {v18.H}[1], [x0]
        sub x0, x0, x2
        urhadd v16.8B, v16.8B, v18.8B
  .endif
        st1 {v16.H}[0], [x0], x2
        st1 {v16.H}[1], [x0], x2
        subs w3, w3, #2
        b.gt 2b
        ret
endfunc
.endm

h264_chroma_mc8 put
h264_chroma_mc8 avg
h264_chroma_mc4 put
h264_chroma_mc4 avg
h264_chroma_mc2 put
h264_chroma_mc2 avg

#if CONFIG_RV40_DECODER
const rv40bias
        .short 0, 16, 32, 16
        .short 32, 28, 32, 28
        .short 0, 32, 16, 32
        .short 32, 28, 32, 28
endconst

h264_chroma_mc8 put, rv40
h264_chroma_mc8 avg, rv40
h264_chroma_mc4 put, rv40
h264_chroma_mc4 avg, rv40
#endif

#if CONFIG_VC1DSP
h264_chroma_mc8 put, vc1
h264_chroma_mc8 avg, vc1
h264_chroma_mc4 put, vc1
h264_chroma_mc4 avg, vc1
#endif
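
Every mc8/mc4/mc2 variant above evaluates the standard H.264 eighth-pel bilinear chroma filter; the four weights `dup`ed from x and y at the top of each macro are (8-x)(8-y), x(8-y), (8-x)y and xy, and the `b.eq` branches peel off the cheaper x == 0, y == 0 and x == y == 0 cases. A plain scalar reference for the `put` 8-wide case, as a sketch of the arithmetic only (FFmpeg's real C fallback is equivalent but organised differently):

#include <stddef.h>
#include <stdint.h>

/* dst = ((8-x)(8-y)A + x(8-y)B + (8-x)yC + xyD + 32) >> 6, with 0 <= x,y < 8 */
static void put_chroma_mc8_c(uint8_t *dst, const uint8_t *src,
                             ptrdiff_t stride, int h, int x, int y)
{
    const int a = (8 - x) * (8 - y);   /* weight of the top-left sample     */
    const int b = x * (8 - y);         /* weight of the top-right sample    */
    const int c = (8 - x) * y;         /* weight of the bottom-left sample  */
    const int d = x * y;               /* weight of the bottom-right sample */

    for (int i = 0; i < h; i++) {
        for (int j = 0; j < 8; j++)
            dst[j] = (a * src[j]          + b * src[j + 1] +
                      c * src[j + stride] + d * src[j + stride + 1] + 32) >> 6;
        dst += stride;
        src += stride;
    }
}

The rv40/vc1 variants differ only in rounding: instead of the +32 folded into rshrn, they add a codec-specific bias (rv40bias, or the constant 28 for VC-1) before a plain shrn.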

@@ -0,0 +1,102 @@
/*
 * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/h264dsp.h"

void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
                                     int beta, int8_t *tc0);
void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
                                     int beta, int8_t *tc0);
void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
                                       int beta, int8_t *tc0);
void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
                                       int beta, int8_t *tc0);

void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
                                   int log2_den, int weight, int offset);
void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height,
                                  int log2_den, int weight, int offset);
void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height,
                                  int log2_den, int weight, int offset);

void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride,
                                     int height, int log2_den, int weightd,
                                     int weights, int offset);
void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride,
                                    int height, int log2_den, int weightd,
                                    int weights, int offset);
void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride,
                                    int height, int log2_den, int weightd,
                                    int weights, int offset);

void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
                             int16_t *block, int stride,
                             const uint8_t nnzc[6*8]);
void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset,
                                  int16_t *block, int stride,
                                  const uint8_t nnzc[6*8]);
void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset,
                            int16_t *block, int stride,
                            const uint8_t nnzc[6*8]);

void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct8_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset,
                             int16_t *block, int stride,
                             const uint8_t nnzc[6*8]);

av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
                                     const int chroma_format_idc)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags) && bit_depth == 8) {
        c->h264_v_loop_filter_luma   = ff_h264_v_loop_filter_luma_neon;
        c->h264_h_loop_filter_luma   = ff_h264_h_loop_filter_luma_neon;
        c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
        if (chroma_format_idc <= 1)
            c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;

        c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
        c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
        c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon;

        c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon;
        c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon;
        c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon;

        c->h264_idct_add        = ff_h264_idct_add_neon;
        c->h264_idct_dc_add     = ff_h264_idct_dc_add_neon;
        c->h264_idct_add16      = ff_h264_idct_add16_neon;
        c->h264_idct_add16intra = ff_h264_idct_add16intra_neon;
        if (chroma_format_idc <= 1)
            c->h264_idct_add8   = ff_h264_idct_add8_neon;
        c->h264_idct8_add       = ff_h264_idct8_add_neon;
        c->h264_idct8_dc_add    = ff_h264_idct8_dc_add_neon;
        c->h264_idct8_add4      = ff_h264_idct8_add4_neon;
    }
}

@@ -0,0 +1,498 @@
/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

.macro h264_loop_filter_start
        cmp w2, #0
        ldr w6, [x4]
        ccmp w3, #0, #0, ne
        mov v24.S[0], w6
        and w6, w6, w6, lsl #16
        b.eq 1f
        ands w6, w6, w6, lsl #8
        b.ge 2f
1:
        ret
2:
.endm

.macro h264_loop_filter_luma
        dup v22.16B, w2              // alpha
        uxtl v24.8H, v24.8B
        uabd v21.16B, v16.16B, v0.16B  // abs(p0 - q0)
        uxtl v24.4S, v24.4H
        uabd v28.16B, v18.16B, v16.16B // abs(p1 - p0)
        sli v24.8H, v24.8H, #8
        uabd v30.16B, v2.16B, v0.16B   // abs(q1 - q0)
        sli v24.4S, v24.4S, #16
        cmhi v21.16B, v22.16B, v21.16B // < alpha
        dup v22.16B, w3              // beta
        cmlt v23.16B, v24.16B, #0
        cmhi v28.16B, v22.16B, v28.16B // < beta
        cmhi v30.16B, v22.16B, v30.16B // < beta
        bic v21.16B, v21.16B, v23.16B
        uabd v17.16B, v20.16B, v16.16B // abs(p2 - p0)
        and v21.16B, v21.16B, v28.16B
        uabd v19.16B, v4.16B, v0.16B   // abs(q2 - q0)
        cmhi v17.16B, v22.16B, v17.16B // < beta
        and v21.16B, v21.16B, v30.16B
        cmhi v19.16B, v22.16B, v19.16B // < beta
        and v17.16B, v17.16B, v21.16B
        and v19.16B, v19.16B, v21.16B
        and v24.16B, v24.16B, v21.16B
        urhadd v28.16B, v16.16B, v0.16B
        sub v21.16B, v24.16B, v17.16B
        uqadd v23.16B, v18.16B, v24.16B
        uhadd v20.16B, v20.16B, v28.16B
        sub v21.16B, v21.16B, v19.16B
        uhadd v28.16B, v4.16B, v28.16B
        umin v23.16B, v23.16B, v20.16B
        uqsub v22.16B, v18.16B, v24.16B
        uqadd v4.16B, v2.16B, v24.16B
        umax v23.16B, v23.16B, v22.16B
        uqsub v22.16B, v2.16B, v24.16B
        umin v28.16B, v4.16B, v28.16B
        uxtl v4.8H, v0.8B
        umax v28.16B, v28.16B, v22.16B
        uxtl2 v20.8H, v0.16B
        usubw v4.8H, v4.8H, v16.8B
        usubw2 v20.8H, v20.8H, v16.16B
        shl v4.8H, v4.8H, #2
        shl v20.8H, v20.8H, #2
        uaddw v4.8H, v4.8H, v18.8B
        uaddw2 v20.8H, v20.8H, v18.16B
        usubw v4.8H, v4.8H, v2.8B
        usubw2 v20.8H, v20.8H, v2.16B
        rshrn v4.8B, v4.8H, #3
        rshrn2 v4.16B, v20.8H, #3
        bsl v17.16B, v23.16B, v18.16B
        bsl v19.16B, v28.16B, v2.16B
        neg v23.16B, v21.16B
        uxtl v28.8H, v16.8B
        smin v4.16B, v4.16B, v21.16B
        uxtl2 v21.8H, v16.16B
        smax v4.16B, v4.16B, v23.16B
        uxtl v22.8H, v0.8B
        uxtl2 v24.8H, v0.16B
        saddw v28.8H, v28.8H, v4.8B
        saddw2 v21.8H, v21.8H, v4.16B
        ssubw v22.8H, v22.8H, v4.8B
        ssubw2 v24.8H, v24.8H, v4.16B
        sqxtun v16.8B, v28.8H
        sqxtun2 v16.16B, v21.8H
        sqxtun v0.8B, v22.8H
        sqxtun2 v0.16B, v24.8H
.endm

function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        sxtw x1, w1

        ld1 {v0.16B}, [x0], x1
        ld1 {v2.16B}, [x0], x1
        ld1 {v4.16B}, [x0], x1
        sub x0, x0, x1, lsl #2
        sub x0, x0, x1, lsl #1
        ld1 {v20.16B}, [x0], x1
        ld1 {v18.16B}, [x0], x1
        ld1 {v16.16B}, [x0], x1

        h264_loop_filter_luma

        sub x0, x0, x1, lsl #1
        st1 {v17.16B}, [x0], x1
        st1 {v16.16B}, [x0], x1
        st1 {v0.16B}, [x0], x1
        st1 {v19.16B}, [x0]

        ret
endfunc

function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        sub x0, x0, #4
        ld1 {v6.8B}, [x0], x1
        ld1 {v20.8B}, [x0], x1
        ld1 {v18.8B}, [x0], x1
        ld1 {v16.8B}, [x0], x1
        ld1 {v0.8B}, [x0], x1
        ld1 {v2.8B}, [x0], x1
        ld1 {v4.8B}, [x0], x1
        ld1 {v26.8B}, [x0], x1
        ld1 {v6.D}[1], [x0], x1
        ld1 {v20.D}[1], [x0], x1
        ld1 {v18.D}[1], [x0], x1
        ld1 {v16.D}[1], [x0], x1
        ld1 {v0.D}[1], [x0], x1
        ld1 {v2.D}[1], [x0], x1
        ld1 {v4.D}[1], [x0], x1
        ld1 {v26.D}[1], [x0], x1

        transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23

        h264_loop_filter_luma

        transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27

        sub x0, x0, x1, lsl #4
        add x0, x0, #2
        st1 {v17.S}[0], [x0], x1
        st1 {v16.S}[0], [x0], x1
        st1 {v0.S}[0], [x0], x1
        st1 {v19.S}[0], [x0], x1
        st1 {v17.S}[1], [x0], x1
        st1 {v16.S}[1], [x0], x1
        st1 {v0.S}[1], [x0], x1
        st1 {v19.S}[1], [x0], x1
        st1 {v17.S}[2], [x0], x1
        st1 {v16.S}[2], [x0], x1
        st1 {v0.S}[2], [x0], x1
        st1 {v19.S}[2], [x0], x1
        st1 {v17.S}[3], [x0], x1
        st1 {v16.S}[3], [x0], x1
        st1 {v0.S}[3], [x0], x1
        st1 {v19.S}[3], [x0], x1

        ret
endfunc

.macro h264_loop_filter_chroma
        dup v22.8B, w2             // alpha
        uxtl v24.8H, v24.8B
        uabd v26.8B, v16.8B, v0.8B   // abs(p0 - q0)
        uxtl v4.8H, v0.8B
        uabd v28.8B, v18.8B, v16.8B  // abs(p1 - p0)
        usubw v4.8H, v4.8H, v16.8B
        sli v24.8H, v24.8H, #8
        shl v4.8H, v4.8H, #2
        uabd v30.8B, v2.8B, v0.8B    // abs(q1 - q0)
        uaddw v4.8H, v4.8H, v18.8B
        cmhi v26.8B, v22.8B, v26.8B  // < alpha
        usubw v4.8H, v4.8H, v2.8B
        dup v22.8B, w3             // beta
        rshrn v4.8B, v4.8H, #3
        cmhi v28.8B, v22.8B, v28.8B  // < beta
        cmhi v30.8B, v22.8B, v30.8B  // < beta
        smin v4.8B, v4.8B, v24.8B
        neg v25.8B, v24.8B
        and v26.8B, v26.8B, v28.8B
        smax v4.8B, v4.8B, v25.8B
        and v26.8B, v26.8B, v30.8B
        uxtl v22.8H, v0.8B
        and v4.8B, v4.8B, v26.8B
        uxtl v28.8H, v16.8B
        saddw v28.8H, v28.8H, v4.8B
        ssubw v22.8H, v22.8H, v4.8B
        sqxtun v16.8B, v28.8H
        sqxtun v0.8B, v22.8H
.endm

function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub x0, x0, x1, lsl #1
        ld1 {v18.8B}, [x0], x1
        ld1 {v16.8B}, [x0], x1
        ld1 {v0.8B}, [x0], x1
        ld1 {v2.8B}, [x0]

        h264_loop_filter_chroma

        sub x0, x0, x1, lsl #1
        st1 {v16.8B}, [x0], x1
        st1 {v0.8B}, [x0], x1

        ret
endfunc

function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub x0, x0, #2
        ld1 {v18.S}[0], [x0], x1
        ld1 {v16.S}[0], [x0], x1
        ld1 {v0.S}[0], [x0], x1
        ld1 {v2.S}[0], [x0], x1
        ld1 {v18.S}[1], [x0], x1
        ld1 {v16.S}[1], [x0], x1
        ld1 {v0.S}[1], [x0], x1
        ld1 {v2.S}[1], [x0], x1

        transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31

        h264_loop_filter_chroma

        transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31

        sub x0, x0, x1, lsl #3
        st1 {v18.S}[0], [x0], x1
        st1 {v16.S}[0], [x0], x1
        st1 {v0.S}[0], [x0], x1
        st1 {v2.S}[0], [x0], x1
        st1 {v18.S}[1], [x0], x1
        st1 {v16.S}[1], [x0], x1
        st1 {v0.S}[1], [x0], x1
        st1 {v2.S}[1], [x0], x1

        ret
endfunc

.macro biweight_16 macs, macd
        dup v0.16B, w5
        dup v1.16B, w6
        mov v4.16B, v16.16B
        mov v6.16B, v16.16B
1:      subs w3, w3, #2
        ld1 {v20.16B}, [x0], x2
        \macd v4.8H, v0.8B, v20.8B
        \macd\()2 v6.8H, v0.16B, v20.16B
        ld1 {v22.16B}, [x1], x2
        \macs v4.8H, v1.8B, v22.8B
        \macs\()2 v6.8H, v1.16B, v22.16B
        mov v24.16B, v16.16B
        ld1 {v28.16B}, [x0], x2
        mov v26.16B, v16.16B
        \macd v24.8H, v0.8B, v28.8B
        \macd\()2 v26.8H, v0.16B, v28.16B
        ld1 {v30.16B}, [x1], x2
        \macs v24.8H, v1.8B, v30.8B
        \macs\()2 v26.8H, v1.16B, v30.16B
        sshl v4.8H, v4.8H, v18.8H
        sshl v6.8H, v6.8H, v18.8H
        sqxtun v4.8B, v4.8H
        sqxtun2 v4.16B, v6.8H
        sshl v24.8H, v24.8H, v18.8H
        sshl v26.8H, v26.8H, v18.8H
        sqxtun v24.8B, v24.8H
        sqxtun2 v24.16B, v26.8H
        mov v6.16B, v16.16B
        st1 {v4.16B}, [x7], x2
        mov v4.16B, v16.16B
        st1 {v24.16B}, [x7], x2
        b.ne 1b
        ret
.endm

.macro biweight_8 macs, macd
        dup v0.8B, w5
        dup v1.8B, w6
        mov v2.16B, v16.16B
        mov v20.16B, v16.16B
1:      subs w3, w3, #2
        ld1 {v4.8B}, [x0], x2
        \macd v2.8H, v0.8B, v4.8B
        ld1 {v5.8B}, [x1], x2
        \macs v2.8H, v1.8B, v5.8B
        ld1 {v6.8B}, [x0], x2
        \macd v20.8H, v0.8B, v6.8B
        ld1 {v7.8B}, [x1], x2
        \macs v20.8H, v1.8B, v7.8B
        sshl v2.8H, v2.8H, v18.8H
        sqxtun v2.8B, v2.8H
        sshl v20.8H, v20.8H, v18.8H
        sqxtun v4.8B, v20.8H
        mov v20.16B, v16.16B
        st1 {v2.8B}, [x7], x2
        mov v2.16B, v16.16B
        st1 {v4.8B}, [x7], x2
        b.ne 1b
        ret
.endm

.macro biweight_4 macs, macd
        dup v0.8B, w5
        dup v1.8B, w6
        mov v2.16B, v16.16B
        mov v20.16B, v16.16B
1:      subs w3, w3, #4
        ld1 {v4.S}[0], [x0], x2
        ld1 {v4.S}[1], [x0], x2
        \macd v2.8H, v0.8B, v4.8B
        ld1 {v5.S}[0], [x1], x2
        ld1 {v5.S}[1], [x1], x2
        \macs v2.8H, v1.8B, v5.8B
        b.lt 2f
        ld1 {v6.S}[0], [x0], x2
        ld1 {v6.S}[1], [x0], x2
        \macd v20.8H, v0.8B, v6.8B
        ld1 {v7.S}[0], [x1], x2
        ld1 {v7.S}[1], [x1], x2
        \macs v20.8H, v1.8B, v7.8B
        sshl v2.8H, v2.8H, v18.8H
        sqxtun v2.8B, v2.8H
        sshl v20.8H, v20.8H, v18.8H
        sqxtun v4.8B, v20.8H
        mov v20.16B, v16.16B
        st1 {v2.S}[0], [x7], x2
        st1 {v2.S}[1], [x7], x2
        mov v2.16B, v16.16B
        st1 {v4.S}[0], [x7], x2
        st1 {v4.S}[1], [x7], x2
        b.ne 1b
        ret
2:      sshl v2.8H, v2.8H, v18.8H
        sqxtun v2.8B, v2.8H
        st1 {v2.S}[0], [x7], x2
        st1 {v2.S}[1], [x7], x2
        ret
.endm

.macro biweight_func w
function ff_biweight_h264_pixels_\w\()_neon, export=1
        sxtw x2, w2
        lsr w8, w5, #31
        add w7, w7, #1
        eor w8, w8, w6, lsr #30
        orr w7, w7, #1
        dup v18.8H, w4
        lsl w7, w7, w4
        not v18.16B, v18.16B
        dup v16.8H, w7
        mov x7, x0
        cbz w8, 10f
        subs w8, w8, #1
        b.eq 20f
        subs w8, w8, #1
        b.eq 30f
        b 40f
10:     biweight_\w umlal, umlal
20:     neg w5, w5
        biweight_\w umlal, umlsl
30:     neg w5, w5
        neg w6, w6
        biweight_\w umlsl, umlsl
40:     neg w6, w6
        biweight_\w umlsl, umlal
endfunc
.endm

biweight_func 16
biweight_func 8
biweight_func 4

.macro weight_16 add
        dup v0.16B, w4
1:      subs w2, w2, #2
        ld1 {v20.16B}, [x0], x1
        umull v4.8H, v0.8B, v20.8B
        umull2 v6.8H, v0.16B, v20.16B
        ld1 {v28.16B}, [x0], x1
        umull v24.8H, v0.8B, v28.8B
        umull2 v26.8H, v0.16B, v28.16B
        \add v4.8H, v16.8H, v4.8H
        srshl v4.8H, v4.8H, v18.8H
        \add v6.8H, v16.8H, v6.8H
        srshl v6.8H, v6.8H, v18.8H
        sqxtun v4.8B, v4.8H
        sqxtun2 v4.16B, v6.8H
        \add v24.8H, v16.8H, v24.8H
        srshl v24.8H, v24.8H, v18.8H
        \add v26.8H, v16.8H, v26.8H
        srshl v26.8H, v26.8H, v18.8H
        sqxtun v24.8B, v24.8H
        sqxtun2 v24.16B, v26.8H
        st1 {v4.16B}, [x5], x1
        st1 {v24.16B}, [x5], x1
        b.ne 1b
        ret
.endm

.macro weight_8 add
        dup v0.8B, w4
1:      subs w2, w2, #2
        ld1 {v4.8B}, [x0], x1
        umull v2.8H, v0.8B, v4.8B
        ld1 {v6.8B}, [x0], x1
        umull v20.8H, v0.8B, v6.8B
        \add v2.8H, v16.8H, v2.8H
        srshl v2.8H, v2.8H, v18.8H
        sqxtun v2.8B, v2.8H
        \add v20.8H, v16.8H, v20.8H
        srshl v20.8H, v20.8H, v18.8H
        sqxtun v4.8B, v20.8H
        st1 {v2.8B}, [x5], x1
        st1 {v4.8B}, [x5], x1
        b.ne 1b
        ret
.endm

.macro weight_4 add
        dup v0.8B, w4
1:      subs w2, w2, #4
        ld1 {v4.S}[0], [x0], x1
        ld1 {v4.S}[1], [x0], x1
        umull v2.8H, v0.8B, v4.8B
        b.lt 2f
        ld1 {v6.S}[0], [x0], x1
        ld1 {v6.S}[1], [x0], x1
        umull v20.8H, v0.8B, v6.8B
        \add v2.8H, v16.8H, v2.8H
        srshl v2.8H, v2.8H, v18.8H
        sqxtun v2.8B, v2.8H
        \add v20.8H, v16.8H, v20.8H
        srshl v20.8H, v20.8H, v18.8H
        sqxtun v4.8B, v20.8H
        st1 {v2.S}[0], [x5], x1
        st1 {v2.S}[1], [x5], x1
        st1 {v4.S}[0], [x5], x1
        st1 {v4.S}[1], [x5], x1
        b.ne 1b
        ret
2:      \add v2.8H, v16.8H, v2.8H
        srshl v2.8H, v2.8H, v18.8H
        sqxtun v2.8B, v2.8H
        st1 {v2.S}[0], [x5], x1
        st1 {v2.S}[1], [x5], x1
        ret
.endm

.macro weight_func w
function ff_weight_h264_pixels_\w\()_neon, export=1
        sxtw x1, w1
        cmp w3, #1
        mov w6, #1
        lsl w5, w5, w3
        dup v16.8H, w5
        mov x5, x0
        b.le 20f
        sub w6, w6, w3
        dup v18.8H, w6
        cmp w4, #0
        b.lt 10f
        weight_\w shadd
10:     neg w4, w4
        weight_\w shsub
20:     neg w6, w3
        dup v18.8H, w6
        cmp w4, #0
        b.lt 10f
        weight_\w add
10:     neg w4, w4
        weight_\w sub
endfunc
.endm

weight_func 16
weight_func 8
weight_func 4
|
@ -0,0 +1,409 @@
|
|||
/*
|
||||
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
||||
* Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/aarch64/asm.S"
|
||||
#include "neon.S"
|
||||
|
||||
function ff_h264_idct_add_neon, export=1
|
||||
ld1 {v0.4H, v1.4H, v2.4H, v3.4H}, [x1]
|
||||
sxtw x2, w2
|
||||
movi v30.8H, #0
|
||||
|
||||
add v4.4H, v0.4H, v2.4H
|
||||
sshr v16.4H, v1.4H, #1
|
||||
st1 {v30.8H}, [x1], #16
|
||||
sshr v17.4H, v3.4H, #1
|
||||
st1 {v30.8H}, [x1], #16
|
||||
sub v5.4H, v0.4H, v2.4H
|
||||
sub v6.4H, v16.4H, v3.4H
|
||||
add v7.4H, v1.4H, v17.4H
|
||||
add v0.4H, v4.4H, v7.4H
|
||||
add v1.4H, v5.4H, v6.4H
|
||||
sub v2.4H, v5.4H, v6.4H
|
||||
sub v3.4H, v4.4H, v7.4H
|
||||
|
||||
transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7
|
||||
|
||||
add v4.4H, v0.4H, v2.4H
|
||||
ld1 {v18.S}[0], [x0], x2
|
||||
sshr v16.4H, v3.4H, #1
|
||||
sshr v17.4H, v1.4H, #1
|
||||
ld1 {v18.S}[1], [x0], x2
|
||||
sub v5.4H, v0.4H, v2.4H
|
||||
ld1 {v19.S}[1], [x0], x2
|
||||
add v6.4H, v16.4H, v1.4H
|
||||
ins v4.D[1], v5.D[0]
|
||||
sub v7.4H, v17.4H, v3.4H
|
||||
ld1 {v19.S}[0], [x0], x2
|
||||
ins v6.D[1], v7.D[0]
|
||||
sub x0, x0, x2, lsl #2
|
||||
add v0.8H, v4.8H, v6.8H
|
||||
sub v1.8H, v4.8H, v6.8H
|
||||
|
||||
srshr v0.8H, v0.8H, #6
|
||||
srshr v1.8H, v1.8H, #6
|
||||
|
||||
uaddw v0.8H, v0.8H, v18.8B
|
||||
uaddw v1.8H, v1.8H, v19.8B
|
||||
|
||||
sqxtun v0.8B, v0.8H
|
||||
sqxtun v1.8B, v1.8H
|
||||
|
||||
st1 {v0.S}[0], [x0], x2
|
||||
st1 {v0.S}[1], [x0], x2
|
||||
st1 {v1.S}[1], [x0], x2
|
||||
st1 {v1.S}[0], [x0], x2
|
||||
|
||||
sub x1, x1, #32
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function ff_h264_idct_dc_add_neon, export=1
|
||||
sxtw x2, w2
|
||||
mov w3, #0
|
||||
ld1r {v2.8H}, [x1]
|
||||
strh w3, [x1]
|
||||
srshr v2.8H, v2.8H, #6
|
||||
ld1 {v0.S}[0], [x0], x2
|
||||
ld1 {v0.S}[1], [x0], x2
|
||||
uaddw v3.8H, v2.8H, v0.8B
|
||||
ld1 {v1.S}[0], [x0], x2
|
||||
ld1 {v1.S}[1], [x0], x2
|
||||
uaddw v4.8H, v2.8H, v1.8B
|
||||
sqxtun v0.8B, v3.8H
|
||||
sqxtun v1.8B, v4.8H
|
||||
sub x0, x0, x2, lsl #2
|
||||
st1 {v0.S}[0], [x0], x2
|
||||
st1 {v0.S}[1], [x0], x2
|
||||
st1 {v1.S}[0], [x0], x2
|
||||
st1 {v1.S}[1], [x0], x2
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function ff_h264_idct_add16_neon, export=1
|
||||
mov x12, x30
|
||||
mov x6, x0 // dest
|
||||
mov x5, x1 // block_offset
|
||||
mov x1, x2 // block
|
||||
mov w9, w3 // stride
|
||||
movrel x7, scan8
|
||||
mov x10, #16
|
||||
movrel x13, X(ff_h264_idct_dc_add_neon)
|
||||
movrel x14, X(ff_h264_idct_add_neon)
|
||||
1: mov w2, w9
|
||||
ldrb w3, [x7], #1
|
||||
ldrsw x0, [x5], #4
|
||||
ldrb w3, [x4, w3, uxtw]
|
||||
subs w3, w3, #1
|
||||
b.lt 2f
|
||||
ldrsh w3, [x1]
|
||||
add x0, x0, x6
|
||||
ccmp w3, #0, #4, eq
|
||||
csel x15, x13, x14, ne
|
||||
blr x15
|
||||
2: subs x10, x10, #1
|
||||
add x1, x1, #32
|
||||
b.ne 1b
|
||||
ret x12
|
||||
endfunc

function ff_h264_idct_add16intra_neon, export=1
        mov x12, x30
        mov x6, x0          // dest
        mov x5, x1          // block_offset
        mov x1, x2          // block
        mov w9, w3          // stride
        movrel x7, scan8
        mov x10, #16
        movrel x13, X(ff_h264_idct_dc_add_neon)
        movrel x14, X(ff_h264_idct_add_neon)
1:      mov w2, w9
        ldrb w3, [x7], #1
        ldrsw x0, [x5], #4
        ldrb w3, [x4, w3, uxtw]
        add x0, x0, x6
        cmp w3, #0
        ldrsh w3, [x1]
        csel x15, x13, x14, eq
        ccmp w3, #0, #0, eq
        b.eq 2f
        blr x15
2:      subs x10, x10, #1
        add x1, x1, #32
        b.ne 1b
        ret x12
endfunc

function ff_h264_idct_add8_neon, export=1
        sub sp, sp, #0x40
        stp x19, x20, [sp]
        mov x12, x30
        ldp x6, x15, [x0]           // dest[0], dest[1]
        add x5, x1, #16*4           // block_offset
        add x9, x2, #16*32          // block
        mov w19, w3                 // stride
        movrel x13, X(ff_h264_idct_dc_add_neon)
        movrel x14, X(ff_h264_idct_add_neon)
        movrel x7, scan8, 16
        mov x10, #0
        mov x11, #16
1:      mov w2, w19
        ldrb w3, [x7, x10]          // scan8[i]
        ldrsw x0, [x5, x10, lsl #2] // block_offset[i]
        ldrb w3, [x4, w3, uxtw]     // nnzc[ scan8[i] ]
        add x0, x0, x6              // block_offset[i] + dst[j-1]
        add x1, x9, x10, lsl #5     // block + i * 16
        cmp w3, #0
        ldrsh w3, [x1]              // block[i*16]
        csel x20, x13, x14, eq
        ccmp w3, #0, #0, eq
        b.eq 2f
        blr x20
2:      add x10, x10, #1
        cmp x10, #4
        csel x10, x11, x10, eq      // mov x10, #16
        csel x6, x15, x6, eq
        cmp x10, #20
        b.lt 1b
        ldp x19, x20, [sp]
        add sp, sp, #0x40
        ret x12
endfunc

.macro idct8x8_cols pass
.if \pass == 0
        va .req v18
        vb .req v30
        sshr v18.8H, v26.8H, #1
        add v16.8H, v24.8H, v28.8H
        ld1 {v30.8H, v31.8H}, [x1]
        st1 {v19.8H}, [x1], #16
        st1 {v19.8H}, [x1], #16
        sub v17.8H, v24.8H, v28.8H
        sshr v19.8H, v30.8H, #1
        sub v18.8H, v18.8H, v30.8H
        add v19.8H, v19.8H, v26.8H
.else
        va .req v30
        vb .req v18
        sshr v30.8H, v26.8H, #1
        sshr v19.8H, v18.8H, #1
        add v16.8H, v24.8H, v28.8H
        sub v17.8H, v24.8H, v28.8H
        sub v30.8H, v30.8H, v18.8H
        add v19.8H, v19.8H, v26.8H
.endif
        add v26.8H, v17.8H, va.8H
        sub v28.8H, v17.8H, va.8H
        add v24.8H, v16.8H, v19.8H
        sub vb.8H, v16.8H, v19.8H
        sub v16.8H, v29.8H, v27.8H
        add v17.8H, v31.8H, v25.8H
        sub va.8H, v31.8H, v25.8H
        add v19.8H, v29.8H, v27.8H
        sub v16.8H, v16.8H, v31.8H
        sub v17.8H, v17.8H, v27.8H
        add va.8H, va.8H, v29.8H
        add v19.8H, v19.8H, v25.8H
        sshr v25.8H, v25.8H, #1
        sshr v27.8H, v27.8H, #1
        sshr v29.8H, v29.8H, #1
        sshr v31.8H, v31.8H, #1
        sub v16.8H, v16.8H, v31.8H
        sub v17.8H, v17.8H, v27.8H
        add va.8H, va.8H, v29.8H
        add v19.8H, v19.8H, v25.8H
        sshr v25.8H, v16.8H, #2
        sshr v27.8H, v17.8H, #2
        sshr v29.8H, va.8H, #2
        sshr v31.8H, v19.8H, #2
        sub v19.8H, v19.8H, v25.8H
        sub va.8H, v27.8H, va.8H
        add v17.8H, v17.8H, v29.8H
        add v16.8H, v16.8H, v31.8H
.if \pass == 0
        sub v31.8H, v24.8H, v19.8H
        add v24.8H, v24.8H, v19.8H
        add v25.8H, v26.8H, v18.8H
        sub v18.8H, v26.8H, v18.8H
        add v26.8H, v28.8H, v17.8H
        add v27.8H, v30.8H, v16.8H
        sub v29.8H, v28.8H, v17.8H
        sub v28.8H, v30.8H, v16.8H
.else
        sub v31.8H, v24.8H, v19.8H
        add v24.8H, v24.8H, v19.8H
        add v25.8H, v26.8H, v30.8H
        sub v30.8H, v26.8H, v30.8H
        add v26.8H, v28.8H, v17.8H
        sub v29.8H, v28.8H, v17.8H
        add v27.8H, v18.8H, v16.8H
        sub v28.8H, v18.8H, v16.8H
.endif
        .unreq va
        .unreq vb
.endm

function ff_h264_idct8_add_neon, export=1
        movi v19.8H, #0
        sxtw x2, w2
        ld1 {v24.8H, v25.8H}, [x1]
        st1 {v19.8H}, [x1], #16
        st1 {v19.8H}, [x1], #16
        ld1 {v26.8H, v27.8H}, [x1]
        st1 {v19.8H}, [x1], #16
        st1 {v19.8H}, [x1], #16
        ld1 {v28.8H, v29.8H}, [x1]
        st1 {v19.8H}, [x1], #16
        st1 {v19.8H}, [x1], #16

        idct8x8_cols 0
        transpose_8x8H v24, v25, v26, v27, v28, v29, v18, v31, v6, v7
        idct8x8_cols 1

        mov x3, x0
        srshr v24.8H, v24.8H, #6
        ld1 {v0.8B}, [x0], x2
        srshr v25.8H, v25.8H, #6
        ld1 {v1.8B}, [x0], x2
        srshr v26.8H, v26.8H, #6
        ld1 {v2.8B}, [x0], x2
        srshr v27.8H, v27.8H, #6
        ld1 {v3.8B}, [x0], x2
        srshr v28.8H, v28.8H, #6
        ld1 {v4.8B}, [x0], x2
        srshr v29.8H, v29.8H, #6
        ld1 {v5.8B}, [x0], x2
        srshr v30.8H, v30.8H, #6
        ld1 {v6.8B}, [x0], x2
        srshr v31.8H, v31.8H, #6
        ld1 {v7.8B}, [x0], x2
        uaddw v24.8H, v24.8H, v0.8B
        uaddw v25.8H, v25.8H, v1.8B
        uaddw v26.8H, v26.8H, v2.8B
        sqxtun v0.8B, v24.8H
        uaddw v27.8H, v27.8H, v3.8B
        sqxtun v1.8B, v25.8H
        uaddw v28.8H, v28.8H, v4.8B
        sqxtun v2.8B, v26.8H
        st1 {v0.8B}, [x3], x2
        uaddw v29.8H, v29.8H, v5.8B
        sqxtun v3.8B, v27.8H
        st1 {v1.8B}, [x3], x2
        uaddw v30.8H, v30.8H, v6.8B
        sqxtun v4.8B, v28.8H
        st1 {v2.8B}, [x3], x2
        uaddw v31.8H, v31.8H, v7.8B
        sqxtun v5.8B, v29.8H
        st1 {v3.8B}, [x3], x2
        sqxtun v6.8B, v30.8H
        sqxtun v7.8B, v31.8H
        st1 {v4.8B}, [x3], x2
        st1 {v5.8B}, [x3], x2
        st1 {v6.8B}, [x3], x2
        st1 {v7.8B}, [x3], x2

        sub x1, x1, #128
        ret
endfunc

function ff_h264_idct8_dc_add_neon, export=1
        mov w3, #0
        sxtw x2, w2
        ld1r {v31.8H}, [x1]
        strh w3, [x1]
        ld1 {v0.8B}, [x0], x2
        srshr v31.8H, v31.8H, #6
        ld1 {v1.8B}, [x0], x2
        ld1 {v2.8B}, [x0], x2
        uaddw v24.8H, v31.8H, v0.8B
        ld1 {v3.8B}, [x0], x2
        uaddw v25.8H, v31.8H, v1.8B
        ld1 {v4.8B}, [x0], x2
        uaddw v26.8H, v31.8H, v2.8B
        ld1 {v5.8B}, [x0], x2
        uaddw v27.8H, v31.8H, v3.8B
        ld1 {v6.8B}, [x0], x2
        uaddw v28.8H, v31.8H, v4.8B
        ld1 {v7.8B}, [x0], x2
        uaddw v29.8H, v31.8H, v5.8B
        uaddw v30.8H, v31.8H, v6.8B
        uaddw v31.8H, v31.8H, v7.8B
        sqxtun v0.8B, v24.8H
        sqxtun v1.8B, v25.8H
        sqxtun v2.8B, v26.8H
        sqxtun v3.8B, v27.8H
        sub x0, x0, x2, lsl #3
        st1 {v0.8B}, [x0], x2
        sqxtun v4.8B, v28.8H
        st1 {v1.8B}, [x0], x2
        sqxtun v5.8B, v29.8H
        st1 {v2.8B}, [x0], x2
        sqxtun v6.8B, v30.8H
        st1 {v3.8B}, [x0], x2
        sqxtun v7.8B, v31.8H
        st1 {v4.8B}, [x0], x2
        st1 {v5.8B}, [x0], x2
        st1 {v6.8B}, [x0], x2
        st1 {v7.8B}, [x0], x2
        ret
endfunc

function ff_h264_idct8_add4_neon, export=1
        mov x12, x30
        mov x6, x0
        mov x5, x1
        mov x1, x2
        mov w2, w3
        movrel x7, scan8
        mov w10, #16
        movrel x13, X(ff_h264_idct8_dc_add_neon)
        movrel x14, X(ff_h264_idct8_add_neon)
1:      ldrb w9, [x7], #4
        ldrsw x0, [x5], #16
        ldrb w9, [x4, w9, UXTW]
        subs w9, w9, #1
        b.lt 2f
        ldrsh w11, [x1]
        add x0, x6, x0
        ccmp w11, #0, #4, eq
        csel x15, x13, x14, ne
        blr x15
2:      subs w10, w10, #4
        add x1, x1, #128
        b.ne 1b
        ret x12
endfunc

const scan8
        .byte 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
        .byte 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
        .byte 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
        .byte 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
        .byte 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
        .byte 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
        .byte 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
        .byte 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
        .byte 4+11*8, 5+11*8, 4+12*8, 5+12*8
        .byte 6+11*8, 7+11*8, 6+12*8, 7+12*8
        .byte 4+13*8, 5+13*8, 4+14*8, 5+14*8
        .byte 6+13*8, 7+13*8, 6+14*8, 7+14*8
endconst
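Each scan8 entry is written as "x + y*8" on purpose: it packs a coordinate in the 8-column non-zero-count cache into one byte, so the loops above fetch nnzc[scan8[i]] with a single byte load. A small C illustration of the encoding (the decoder helper is hypothetical, for exposition only):

#include <stdint.h>

/* Decode a scan8 entry such as "4 + 1*8" back into cache coordinates. */
static void scan8_coords(uint8_t entry, int *x, int *y)
{
    *x = entry % 8;   /* column in the nnz cache (4..7 for luma) */
    *y = entry / 8;   /* row in the nnz cache                    */
}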

@@ -0,0 +1,93 @@
/*
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/h264pred.h"

void ff_pred16x16_vert_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_hor_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_plane_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_128_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_left_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred16x16_top_dc_neon(uint8_t *src, ptrdiff_t stride);

void ff_pred8x8_vert_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_hor_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_plane_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_128_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_left_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_top_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_l0t_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride);
void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride);

static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
                                        const int bit_depth,
                                        const int chroma_format_idc)
{
    const int high_depth = bit_depth > 8;

    if (high_depth)
        return;

    if (chroma_format_idc <= 1) {
        h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vert_neon;
        h->pred8x8[HOR_PRED8x8  ] = ff_pred8x8_hor_neon;
        if (codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
            h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon;
        h->pred8x8[DC_128_PRED8x8 ] = ff_pred8x8_128_dc_neon;
        if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP7 &&
            codec_id != AV_CODEC_ID_VP8) {
            h->pred8x8[DC_PRED8x8     ] = ff_pred8x8_dc_neon;
            h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon;
            h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon;
            h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon;
            h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon;
            h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon;
            h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon;
        }
    }

    h->pred16x16[DC_PRED8x8     ] = ff_pred16x16_dc_neon;
    h->pred16x16[VERT_PRED8x8   ] = ff_pred16x16_vert_neon;
    h->pred16x16[HOR_PRED8x8    ] = ff_pred16x16_hor_neon;
    h->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_neon;
    h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon;
    h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_neon;
    if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 &&
        codec_id != AV_CODEC_ID_VP7 && codec_id != AV_CODEC_ID_VP8)
        h->pred16x16[PLANE_PRED8x8  ] = ff_pred16x16_plane_neon;
}

av_cold void ff_h264_pred_init_aarch64(H264PredContext *h, int codec_id,
                                       int bit_depth, const int chroma_format_idc)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags))
        h264_pred_init_neon(h, codec_id, bit_depth, chroma_format_idc);
}
}
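For orientation, this init hook only overwrites the generic C function pointers when the CPU reports NEON (always true on aarch64); unsupported modes keep their C fallbacks. A sketch of the call-site shape (the real wiring lives in libavcodec's generic h264pred init, not in this patch):

#include "libavcodec/avcodec.h"
#include "libavcodec/h264pred.h"

/* Illustrative only: 8-bit 4:2:0 H.264 prediction context setup. */
static void init_h264_pred_for_aarch64(H264PredContext *h)
{
    ff_h264_pred_init_aarch64(h, AV_CODEC_ID_H264, 8, 1);
    /* h->pred16x16[...] / h->pred8x8[...] now point at the NEON
     * versions where one was registered above. */
}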

@@ -0,0 +1,361 @@
/*
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

.macro ldcol.8 rd, rs, rt, n=8, hi=0
.if \n >= 8 || \hi == 0
        ld1 {\rd\().b}[0], [\rs], \rt
        ld1 {\rd\().b}[1], [\rs], \rt
        ld1 {\rd\().b}[2], [\rs], \rt
        ld1 {\rd\().b}[3], [\rs], \rt
.endif
.if \n >= 8 || \hi == 1
        ld1 {\rd\().b}[4], [\rs], \rt
        ld1 {\rd\().b}[5], [\rs], \rt
        ld1 {\rd\().b}[6], [\rs], \rt
        ld1 {\rd\().b}[7], [\rs], \rt
.endif
.if \n == 16
        ld1 {\rd\().b}[8], [\rs], \rt
        ld1 {\rd\().b}[9], [\rs], \rt
        ld1 {\rd\().b}[10], [\rs], \rt
        ld1 {\rd\().b}[11], [\rs], \rt
        ld1 {\rd\().b}[12], [\rs], \rt
        ld1 {\rd\().b}[13], [\rs], \rt
        ld1 {\rd\().b}[14], [\rs], \rt
        ld1 {\rd\().b}[15], [\rs], \rt
.endif
.endm

function ff_pred16x16_128_dc_neon, export=1
        movi v0.16b, #128
        b .L_pred16x16_dc_end
endfunc

function ff_pred16x16_top_dc_neon, export=1
        sub x2, x0, x1
        ld1 {v0.16b}, [x2]
        uaddlv h0, v0.16b
        rshrn v0.8b, v0.8h, #4
        dup v0.16b, v0.b[0]
        b .L_pred16x16_dc_end
endfunc

function ff_pred16x16_left_dc_neon, export=1
        sub x2, x0, #1
        ldcol.8 v0, x2, x1, 16
        uaddlv h0, v0.16b
        rshrn v0.8b, v0.8h, #4
        dup v0.16b, v0.b[0]
        b .L_pred16x16_dc_end
endfunc

function ff_pred16x16_dc_neon, export=1
        sub x2, x0, x1
        sub x3, x0, #1
        ld1 {v0.16b}, [x2]
        ldcol.8 v1, x3, x1, 16
        uaddlv h0, v0.16b
        uaddlv h1, v1.16b
        add v0.4h, v0.4h, v1.4h
        rshrn v0.8b, v0.8h, #5
        dup v0.16b, v0.b[0]
.L_pred16x16_dc_end:
        mov w3, #8
6:      st1 {v0.16b}, [x0], x1
        st1 {v0.16b}, [x0], x1
        subs w3, w3, #1
        b.ne 6b
        ret
endfunc
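The full-DC case above sums 16 top and 16 left neighbours with uaddlv and rounds with rshrn #5 (the top-only and left-only variants shift by #4 over 16 samples). An equivalent scalar C model (a sketch; names are illustrative):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Sketch of ff_pred16x16_dc_neon: DC from both edges. */
static void pred16x16_dc_ref(uint8_t *src, ptrdiff_t stride)
{
    int sum = 0;
    for (int i = 0; i < 16; i++)
        sum += src[i - stride]        /* top row     */
             + src[i * stride - 1];   /* left column */
    int dc = (sum + 16) >> 5;         /* rshrn #5: rounding shift */
    for (int y = 0; y < 16; y++)
        memset(src + y * stride, dc, 16);
}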

function ff_pred16x16_hor_neon, export=1
        sub x2, x0, #1
        mov w3, #16
1:      ld1r {v0.16b}, [x2], x1
        st1 {v0.16b}, [x0], x1
        subs w3, w3, #1
        b.ne 1b
        ret
endfunc

function ff_pred16x16_vert_neon, export=1
        sub x2, x0, x1
        add x1, x1, x1
        ld1 {v0.16b}, [x2], x1
        mov w3, #8
1:      st1 {v0.16b}, [x0], x1
        st1 {v0.16b}, [x2], x1
        subs w3, w3, #1
        b.ne 1b
        ret
endfunc

function ff_pred16x16_plane_neon, export=1
        sub x3, x0, x1
        movrel x4, p16weight
        add x2, x3, #8
        sub x3, x3, #1
        ld1 {v0.8b}, [x3]
        ld1 {v2.8b}, [x2], x1
        ldcol.8 v1, x3, x1
        add x3, x3, x1
        ldcol.8 v3, x3, x1
        rev64 v0.8b, v0.8b
        rev64 v1.8b, v1.8b
        uaddl v7.8h, v2.8b, v3.8b
        usubl v2.8h, v2.8b, v0.8b
        usubl v3.8h, v3.8b, v1.8b
        ld1 {v0.8h}, [x4]
        mul v2.8h, v2.8h, v0.8h
        mul v3.8h, v3.8h, v0.8h
        addp v2.8h, v2.8h, v3.8h
        addp v2.8h, v2.8h, v2.8h
        addp v2.4h, v2.4h, v2.4h
        sshll v3.4s, v2.4h, #2
        saddw v2.4s, v3.4s, v2.4h
        rshrn v4.4h, v2.4s, #6
        trn2 v5.4h, v4.4h, v4.4h
        add v2.4h, v4.4h, v5.4h
        shl v3.4h, v2.4h, #3
        ext v7.16b, v7.16b, v7.16b, #14
        sub v3.4h, v3.4h, v2.4h     // 7 * (b + c)
        add v7.4h, v7.4h, v0.4h
        shl v2.4h, v7.4h, #4
        sub v2.4h, v2.4h, v3.4h
        shl v3.4h, v4.4h, #4
        ext v0.16b, v0.16b, v0.16b, #14
        sub v6.4h, v5.4h, v3.4h
        mov v0.h[0], wzr
        mul v0.8h, v0.8h, v4.h[0]
        dup v1.8h, v2.h[0]
        dup v2.8h, v4.h[0]
        dup v3.8h, v6.h[0]
        shl v2.8h, v2.8h, #3
        add v1.8h, v1.8h, v0.8h
        add v3.8h, v3.8h, v2.8h
        mov w3, #16
1:
        sqshrun v0.8b, v1.8h, #5
        add v1.8h, v1.8h, v2.8h
        sqshrun2 v0.16b, v1.8h, #5
        add v1.8h, v1.8h, v3.8h
        st1 {v0.16b}, [x0], x1
        subs w3, w3, #1
        b.ne 1b
        ret
endfunc
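The p16weight multipliers (1..8) implement the gradient sums of the H.264 16x16 plane predictor; the sshll #2 / saddw pair is the *5 scaling and rshrn #6 the (.. + 32) >> 6 rounding. A scalar C model of the plane equations the NEON code is computing (a sketch from the standard formulation; names are illustrative):

#include <stddef.h>
#include <stdint.h>

static uint8_t clip8(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

/* Sketch of H.264 16x16 plane prediction. */
static void pred16x16_plane_ref(uint8_t *src, ptrdiff_t stride)
{
    const uint8_t *top = src - stride;
    int H = 0, V = 0;
    for (int i = 1; i <= 8; i++) {
        H += i * (top[7 + i] - top[7 - i]);
        V += i * (src[(7 + i) * stride - 1] - src[(7 - i) * stride - 1]);
    }
    int a = 16 * (top[15] + src[15 * stride - 1]);
    int b = (5 * H + 32) >> 6;
    int c = (5 * V + 32) >> 6;
    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
            src[y * stride + x] = clip8((a + b * (x - 7) + c * (y - 7) + 16) >> 5);
}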

const p16weight, align=4
        .short 1,2,3,4,5,6,7,8
endconst
const p8weight, align=4
        .short 1,2,3,4,1,2,3,4
endconst

function ff_pred8x8_hor_neon, export=1
        sub x2, x0, #1
        mov w3, #8
1:      ld1r {v0.8b}, [x2], x1
        st1 {v0.8b}, [x0], x1
        subs w3, w3, #1
        b.ne 1b
        ret
endfunc

function ff_pred8x8_vert_neon, export=1
        sub x2, x0, x1
        lsl x1, x1, #1
        ld1 {v0.8b}, [x2], x1
        mov w3, #4
1:      st1 {v0.8b}, [x0], x1
        st1 {v0.8b}, [x2], x1
        subs w3, w3, #1
        b.ne 1b
        ret
endfunc

function ff_pred8x8_plane_neon, export=1
        sub x3, x0, x1
        movrel x4, p8weight
        movrel x5, p16weight
        add x2, x3, #4
        sub x3, x3, #1
        ld1 {v0.s}[0], [x3]
        ld1 {v2.s}[0], [x2], x1
        ldcol.8 v0, x3, x1, 4, hi=1
        add x3, x3, x1
        ldcol.8 v3, x3, x1, 4
        uaddl v7.8h, v2.8b, v3.8b
        rev32 v0.8b, v0.8b
        trn1 v2.2s, v2.2s, v3.2s
        usubl v2.8h, v2.8b, v0.8b
        ld1 {v6.8h}, [x4]
        mul v2.8h, v2.8h, v6.8h
        ld1 {v0.8h}, [x5]
        saddlp v2.4s, v2.8h
        addp v2.4s, v2.4s, v2.4s
        shl v3.4s, v2.4s, #4
        add v2.4s, v3.4s, v2.4s
        rshrn v5.4h, v2.4s, #5
        addp v2.4h, v5.4h, v5.4h
        shl v3.4h, v2.4h, #1
        add v3.4h, v3.4h, v2.4h
        rev64 v7.4h, v7.4h
        add v7.4h, v7.4h, v0.4h
        shl v2.4h, v7.4h, #4
        sub v2.4h, v2.4h, v3.4h
        ext v0.16b, v0.16b, v0.16b, #14
        mov v0.h[0], wzr
        mul v0.8h, v0.8h, v5.h[0]
        dup v1.8h, v2.h[0]
        dup v2.8h, v5.h[1]
        add v1.8h, v1.8h, v0.8h
        mov w3, #8
1:
        sqshrun v0.8b, v1.8h, #5
        add v1.8h, v1.8h, v2.8h
        st1 {v0.8b}, [x0], x1
        subs w3, w3, #1
        b.ne 1b
        ret
endfunc

function ff_pred8x8_128_dc_neon, export=1
        movi v0.8b, #128
        movi v1.8b, #128
        b .L_pred8x8_dc_end
endfunc

function ff_pred8x8_top_dc_neon, export=1
        sub x2, x0, x1
        ld1 {v0.8b}, [x2]
        uaddlp v0.4h, v0.8b
        addp v0.4h, v0.4h, v0.4h
        zip1 v0.8h, v0.8h, v0.8h
        rshrn v2.8b, v0.8h, #2
        zip1 v0.8b, v2.8b, v2.8b
        zip1 v1.8b, v2.8b, v2.8b
        b .L_pred8x8_dc_end
endfunc

function ff_pred8x8_left_dc_neon, export=1
        sub x2, x0, #1
        ldcol.8 v0, x2, x1
        uaddlp v0.4h, v0.8b
        addp v0.4h, v0.4h, v0.4h
        rshrn v2.8b, v0.8h, #2
        dup v1.8b, v2.b[1]
        dup v0.8b, v2.b[0]
        b .L_pred8x8_dc_end
endfunc

function ff_pred8x8_dc_neon, export=1
        sub x2, x0, x1
        sub x3, x0, #1
        ld1 {v0.8b}, [x2]
        ldcol.8 v1, x3, x1
        uaddlp v0.4h, v0.8b
        uaddlp v1.4h, v1.8b
        trn1 v2.2s, v0.2s, v1.2s
        trn2 v3.2s, v0.2s, v1.2s
        addp v4.4h, v2.4h, v3.4h
        addp v5.4h, v4.4h, v4.4h
        rshrn v6.8b, v5.8h, #3
        rshrn v7.8b, v4.8h, #2
        dup v0.8b, v6.b[0]
        dup v2.8b, v7.b[2]
        dup v1.8b, v7.b[3]
        dup v3.8b, v6.b[1]
        zip1 v0.2s, v0.2s, v2.2s
        zip1 v1.2s, v1.2s, v3.2s
.L_pred8x8_dc_end:
        mov w3, #4
        add x2, x0, x1, lsl #2
6:      st1 {v0.8b}, [x0], x1
        st1 {v1.8b}, [x2], x1
        subs w3, w3, #1
        b.ne 6b
        ret
endfunc
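The 8x8 chroma DC predictor produces four quadrant DCs at once (the rshrn #3 results average two edges, the rshrn #2 results one edge). A C model of the assumed per-quadrant maths, following the usual H.264 chroma DC rules (a sketch; identifiers are illustrative):

#include <stddef.h>
#include <stdint.h>

/* Sketch of ff_pred8x8_dc_neon's four quadrant DCs. */
static void pred8x8_dc_ref(uint8_t *src, ptrdiff_t stride)
{
    const uint8_t *top = src - stride;
    int t0 = 0, t1 = 0, l0 = 0, l1 = 0;
    for (int i = 0; i < 4; i++) {
        t0 += top[i];                     /* top, columns 0..3 */
        t1 += top[4 + i];                 /* top, columns 4..7 */
        l0 += src[i * stride - 1];        /* left, rows 0..3   */
        l1 += src[(4 + i) * stride - 1];  /* left, rows 4..7   */
    }
    int dc[2][2] = {
        { (t0 + l0 + 4) >> 3, (t1 + 2) >> 2 },      /* top-left,    top-right    */
        { (l1 + 2) >> 2, (t1 + l1 + 4) >> 3 },      /* bottom-left, bottom-right */
    };
    for (int y = 0; y < 8; y++)
        for (int x = 0; x < 8; x++)
            src[y * stride + x] = (uint8_t)dc[y >> 2][x >> 2];
}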

function ff_pred8x8_l0t_dc_neon, export=1
        sub x2, x0, x1
        sub x3, x0, #1
        ld1 {v0.8b}, [x2]
        ldcol.8 v1, x3, x1, 4
        zip1 v0.4s, v0.4s, v1.4s
        uaddlp v0.8h, v0.16b
        addp v0.8h, v0.8h, v0.8h
        addp v1.4h, v0.4h, v0.4h
        rshrn v2.8b, v0.8h, #2
        rshrn v3.8b, v1.8h, #3
        dup v4.8b, v3.b[0]
        dup v6.8b, v2.b[2]
        dup v5.8b, v2.b[0]
        zip1 v0.2s, v4.2s, v6.2s
        zip1 v1.2s, v5.2s, v6.2s
        b .L_pred8x8_dc_end
endfunc

function ff_pred8x8_l00_dc_neon, export=1
        sub x2, x0, #1
        ldcol.8 v0, x2, x1, 4
        uaddlp v0.4h, v0.8b
        addp v0.4h, v0.4h, v0.4h
        rshrn v0.8b, v0.8h, #2
        movi v1.8b, #128
        dup v0.8b, v0.b[0]
        b .L_pred8x8_dc_end
endfunc

function ff_pred8x8_0lt_dc_neon, export=1
        add x3, x0, x1, lsl #2
        sub x2, x0, x1
        sub x3, x3, #1
        ld1 {v0.8b}, [x2]
        ldcol.8 v1, x3, x1, 4, hi=1
        zip1 v0.4s, v0.4s, v1.4s
        uaddlp v0.8h, v0.16b
        addp v0.8h, v0.8h, v0.8h
        addp v1.4h, v0.4h, v0.4h
        rshrn v2.8b, v0.8h, #2
        rshrn v3.8b, v1.8h, #3
        dup v4.8b, v2.b[0]
        dup v5.8b, v2.b[3]
        dup v6.8b, v2.b[2]
        dup v7.8b, v3.b[1]
        zip1 v0.2s, v4.2s, v6.2s
        zip1 v1.2s, v5.2s, v7.2s
        b .L_pred8x8_dc_end
endfunc

function ff_pred8x8_0l0_dc_neon, export=1
        add x2, x0, x1, lsl #2
        sub x2, x2, #1
        ldcol.8 v1, x2, x1, 4
        uaddlp v2.4h, v1.8b
        addp v2.4h, v2.4h, v2.4h
        rshrn v1.8b, v2.8h, #2
        movi v0.8b, #128
        dup v1.8b, v1.b[0]
        b .L_pred8x8_dc_end
endfunc

@@ -0,0 +1,123 @@
/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stddef.h>
#include <stdint.h>

#include "config.h"

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/hpeldsp.h"

void ff_put_pixels16_neon(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_neon(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_pixels16_y2_neon(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_pixels16_xy2_neon(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels8_neon(uint8_t *block, const uint8_t *pixels,
                         ptrdiff_t line_size, int h);
void ff_put_pixels8_x2_neon(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_neon(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h);
void ff_put_pixels8_xy2_neon(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);

void ff_put_pixels16_x2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);
void ff_put_pixels16_y2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);
void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
void ff_put_pixels8_x2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h);
void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);

void ff_avg_pixels16_neon(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);
void ff_avg_pixels16_x2_neon(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels16_y2_neon(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels16_xy2_neon(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_avg_pixels8_neon(uint8_t *block, const uint8_t *pixels,
                         ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_neon(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_neon(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_neon(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);

void ff_avg_pixels16_x2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);
void ff_avg_pixels16_y2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);
void ff_avg_pixels16_xy2_no_rnd_neon(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);

av_cold void ff_hpeldsp_init_aarch64(HpelDSPContext *c, int flags)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
        c->put_pixels_tab[0][0] = ff_put_pixels16_neon;
        c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon;
        c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon;
        c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon;
        c->put_pixels_tab[1][0] = ff_put_pixels8_neon;
        c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon;
        c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon;
        c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon;

        c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon;
        c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon;
        c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon;
        c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon;
        c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon;
        c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon;
        c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon;

        c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon;
        c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_neon;
        c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_neon;
        c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_neon;
        c->avg_pixels_tab[1][0] = ff_avg_pixels8_neon;
        c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_neon;
        c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_neon;
        c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_neon;

        c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_neon;
        c->avg_no_rnd_pixels_tab[1] = ff_avg_pixels16_x2_no_rnd_neon;
        c->avg_no_rnd_pixels_tab[2] = ff_avg_pixels16_y2_no_rnd_neon;
        c->avg_no_rnd_pixels_tab[3] = ff_avg_pixels16_xy2_no_rnd_neon;
    }
}
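The table layout assumed by the assignments above: the first index selects the block width (0 = 16 pixels, 1 = 8), the second the half-pel case (0 = aligned copy, 1 = horizontal half-pel, 2 = vertical, 3 = both). A usage sketch (the buffers and wrapper are illustrative):

#include <stddef.h>
#include <stdint.h>
#include "libavcodec/hpeldsp.h"

/* Copy a 16x16 block with a half-pel offset in both x and y. */
static void copy_halfpel_xy(HpelDSPContext *c, uint8_t *dst,
                            const uint8_t *src, ptrdiff_t stride)
{
    c->put_pixels_tab[0][3](dst, src, stride, 16);
}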

@@ -0,0 +1,397 @@
/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

.macro pixels16 rnd=1, avg=0
.if \avg
        mov x12, x0
.endif
1:      ld1 {v0.16B}, [x1], x2
        ld1 {v1.16B}, [x1], x2
        ld1 {v2.16B}, [x1], x2
        ld1 {v3.16B}, [x1], x2
.if \avg
        ld1 {v4.16B}, [x12], x2
        urhadd v0.16B, v0.16B, v4.16B
        ld1 {v5.16B}, [x12], x2
        urhadd v1.16B, v1.16B, v5.16B
        ld1 {v6.16B}, [x12], x2
        urhadd v2.16B, v2.16B, v6.16B
        ld1 {v7.16B}, [x12], x2
        urhadd v3.16B, v3.16B, v7.16B
.endif
        subs w3, w3, #4
        st1 {v0.16B}, [x0], x2
        st1 {v1.16B}, [x0], x2
        st1 {v2.16B}, [x0], x2
        st1 {v3.16B}, [x0], x2
        b.ne 1b
        ret
.endm

.macro pixels16_x2 rnd=1, avg=0
1:      ld1 {v0.16B, v1.16B}, [x1], x2
        ld1 {v2.16B, v3.16B}, [x1], x2
        subs w3, w3, #2
        ext v1.16B, v0.16B, v1.16B, #1
        avg v0.16B, v0.16B, v1.16B
        ext v3.16B, v2.16B, v3.16B, #1
        avg v2.16B, v2.16B, v3.16B
.if \avg
        ld1 {v1.16B}, [x0], x2
        ld1 {v3.16B}, [x0]
        urhadd v0.16B, v0.16B, v1.16B
        urhadd v2.16B, v2.16B, v3.16B
        sub x0, x0, x2
.endif
        st1 {v0.16B}, [x0], x2
        st1 {v2.16B}, [x0], x2
        b.ne 1b
        ret
.endm

.macro pixels16_y2 rnd=1, avg=0
        sub w3, w3, #2
        ld1 {v0.16B}, [x1], x2
        ld1 {v1.16B}, [x1], x2
1:      subs w3, w3, #2
        avg v2.16B, v0.16B, v1.16B
        ld1 {v0.16B}, [x1], x2
        avg v3.16B, v0.16B, v1.16B
        ld1 {v1.16B}, [x1], x2
.if \avg
        ld1 {v4.16B}, [x0], x2
        ld1 {v5.16B}, [x0]
        urhadd v2.16B, v2.16B, v4.16B
        urhadd v3.16B, v3.16B, v5.16B
        sub x0, x0, x2
.endif
        st1 {v2.16B}, [x0], x2
        st1 {v3.16B}, [x0], x2
        b.ne 1b

        avg v2.16B, v0.16B, v1.16B
        ld1 {v0.16B}, [x1], x2
        avg v3.16B, v0.16B, v1.16B
.if \avg
        ld1 {v4.16B}, [x0], x2
        ld1 {v5.16B}, [x0]
        urhadd v2.16B, v2.16B, v4.16B
        urhadd v3.16B, v3.16B, v5.16B
        sub x0, x0, x2
.endif
        st1 {v2.16B}, [x0], x2
        st1 {v3.16B}, [x0], x2

        ret
.endm

.macro pixels16_xy2 rnd=1, avg=0
        sub w3, w3, #2
        ld1 {v0.16B, v1.16B}, [x1], x2
        ld1 {v4.16B, v5.16B}, [x1], x2
        NRND movi v26.8H, #1
        ext v1.16B, v0.16B, v1.16B, #1
        ext v5.16B, v4.16B, v5.16B, #1
        uaddl v16.8H, v0.8B, v1.8B
        uaddl2 v20.8H, v0.16B, v1.16B
        uaddl v18.8H, v4.8B, v5.8B
        uaddl2 v22.8H, v4.16B, v5.16B
1:      subs w3, w3, #2
        ld1 {v0.16B, v1.16B}, [x1], x2
        add v24.8H, v16.8H, v18.8H
        NRND add v24.8H, v24.8H, v26.8H
        ext v30.16B, v0.16B, v1.16B, #1
        add v1.8H, v20.8H, v22.8H
        mshrn v28.8B, v24.8H, #2
        NRND add v1.8H, v1.8H, v26.8H
        mshrn2 v28.16B, v1.8H, #2
.if \avg
        ld1 {v16.16B}, [x0]
        urhadd v28.16B, v28.16B, v16.16B
.endif
        uaddl v16.8H, v0.8B, v30.8B
        ld1 {v2.16B, v3.16B}, [x1], x2
        uaddl2 v20.8H, v0.16B, v30.16B
        st1 {v28.16B}, [x0], x2
        add v24.8H, v16.8H, v18.8H
        NRND add v24.8H, v24.8H, v26.8H
        ext v3.16B, v2.16B, v3.16B, #1
        add v0.8H, v20.8H, v22.8H
        mshrn v30.8B, v24.8H, #2
        NRND add v0.8H, v0.8H, v26.8H
        mshrn2 v30.16B, v0.8H, #2
.if \avg
        ld1 {v18.16B}, [x0]
        urhadd v30.16B, v30.16B, v18.16B
.endif
        uaddl v18.8H, v2.8B, v3.8B
        uaddl2 v22.8H, v2.16B, v3.16B
        st1 {v30.16B}, [x0], x2
        b.gt 1b

        ld1 {v0.16B, v1.16B}, [x1], x2
        add v24.8H, v16.8H, v18.8H
        NRND add v24.8H, v24.8H, v26.8H
        ext v30.16B, v0.16B, v1.16B, #1
        add v1.8H, v20.8H, v22.8H
        mshrn v28.8B, v24.8H, #2
        NRND add v1.8H, v1.8H, v26.8H
        mshrn2 v28.16B, v1.8H, #2
.if \avg
        ld1 {v16.16B}, [x0]
        urhadd v28.16B, v28.16B, v16.16B
.endif
        uaddl v16.8H, v0.8B, v30.8B
        uaddl2 v20.8H, v0.16B, v30.16B
        st1 {v28.16B}, [x0], x2
        add v24.8H, v16.8H, v18.8H
        NRND add v24.8H, v24.8H, v26.8H
        add v0.8H, v20.8H, v22.8H
        mshrn v30.8B, v24.8H, #2
        NRND add v0.8H, v0.8H, v26.8H
        mshrn2 v30.16B, v0.8H, #2
.if \avg
        ld1 {v18.16B}, [x0]
        urhadd v30.16B, v30.16B, v18.16B
.endif
        st1 {v30.16B}, [x0], x2

        ret
.endm

.macro pixels8 rnd=1, avg=0
1:      ld1 {v0.8B}, [x1], x2
        ld1 {v1.8B}, [x1], x2
        ld1 {v2.8B}, [x1], x2
        ld1 {v3.8B}, [x1], x2
.if \avg
        ld1 {v4.8B}, [x0], x2
        urhadd v0.8B, v0.8B, v4.8B
        ld1 {v5.8B}, [x0], x2
        urhadd v1.8B, v1.8B, v5.8B
        ld1 {v6.8B}, [x0], x2
        urhadd v2.8B, v2.8B, v6.8B
        ld1 {v7.8B}, [x0], x2
        urhadd v3.8B, v3.8B, v7.8B
        sub x0, x0, x2, lsl #2
.endif
        subs w3, w3, #4
        st1 {v0.8B}, [x0], x2
        st1 {v1.8B}, [x0], x2
        st1 {v2.8B}, [x0], x2
        st1 {v3.8B}, [x0], x2
        b.ne 1b
        ret
.endm

.macro pixels8_x2 rnd=1, avg=0
1:      ld1 {v0.8B, v1.8B}, [x1], x2
        ext v1.8B, v0.8B, v1.8B, #1
        ld1 {v2.8B, v3.8B}, [x1], x2
        ext v3.8B, v2.8B, v3.8B, #1
        subs w3, w3, #2
        avg v0.8B, v0.8B, v1.8B
        avg v2.8B, v2.8B, v3.8B
.if \avg
        ld1 {v4.8B}, [x0], x2
        ld1 {v5.8B}, [x0]
        urhadd v0.8B, v0.8B, v4.8B
        urhadd v2.8B, v2.8B, v5.8B
        sub x0, x0, x2
.endif
        st1 {v0.8B}, [x0], x2
        st1 {v2.8B}, [x0], x2
        b.ne 1b
        ret
.endm

.macro pixels8_y2 rnd=1, avg=0
        sub w3, w3, #2
        ld1 {v0.8B}, [x1], x2
        ld1 {v1.8B}, [x1], x2
1:      subs w3, w3, #2
        avg v4.8B, v0.8B, v1.8B
        ld1 {v0.8B}, [x1], x2
        avg v5.8B, v0.8B, v1.8B
        ld1 {v1.8B}, [x1], x2
.if \avg
        ld1 {v2.8B}, [x0], x2
        ld1 {v3.8B}, [x0]
        urhadd v4.8B, v4.8B, v2.8B
        urhadd v5.8B, v5.8B, v3.8B
        sub x0, x0, x2
.endif
        st1 {v4.8B}, [x0], x2
        st1 {v5.8B}, [x0], x2
        b.ne 1b

        avg v4.8B, v0.8B, v1.8B
        ld1 {v0.8B}, [x1], x2
        avg v5.8B, v0.8B, v1.8B
.if \avg
        ld1 {v2.8B}, [x0], x2
        ld1 {v3.8B}, [x0]
        urhadd v4.8B, v4.8B, v2.8B
        urhadd v5.8B, v5.8B, v3.8B
        sub x0, x0, x2
.endif
        st1 {v4.8B}, [x0], x2
        st1 {v5.8B}, [x0], x2

        ret
.endm

.macro pixels8_xy2 rnd=1, avg=0
        sub w3, w3, #2
        ld1 {v0.16B}, [x1], x2
        ld1 {v1.16B}, [x1], x2
        NRND movi v19.8H, #1
        ext v4.16B, v0.16B, v4.16B, #1
        ext v6.16B, v1.16B, v6.16B, #1
        uaddl v16.8H, v0.8B, v4.8B
        uaddl v17.8H, v1.8B, v6.8B
1:      subs w3, w3, #2
        ld1 {v0.16B}, [x1], x2
        add v18.8H, v16.8H, v17.8H
        ext v4.16B, v0.16B, v4.16B, #1
        NRND add v18.8H, v18.8H, v19.8H
        uaddl v16.8H, v0.8B, v4.8B
        mshrn v5.8B, v18.8H, #2
        ld1 {v1.16B}, [x1], x2
        add v18.8H, v16.8H, v17.8H
.if \avg
        ld1 {v7.8B}, [x0]
        urhadd v5.8B, v5.8B, v7.8B
.endif
        NRND add v18.8H, v18.8H, v19.8H
        st1 {v5.8B}, [x0], x2
        mshrn v7.8B, v18.8H, #2
.if \avg
        ld1 {v5.8B}, [x0]
        urhadd v7.8B, v7.8B, v5.8B
.endif
        ext v6.16B, v1.16B, v6.16B, #1
        uaddl v17.8H, v1.8B, v6.8B
        st1 {v7.8B}, [x0], x2
        b.gt 1b

        ld1 {v0.16B}, [x1], x2
        add v18.8H, v16.8H, v17.8H
        ext v4.16B, v0.16B, v4.16B, #1
        NRND add v18.8H, v18.8H, v19.8H
        uaddl v16.8H, v0.8B, v4.8B
        mshrn v5.8B, v18.8H, #2
        add v18.8H, v16.8H, v17.8H
.if \avg
        ld1 {v7.8B}, [x0]
        urhadd v5.8B, v5.8B, v7.8B
.endif
        NRND add v18.8H, v18.8H, v19.8H
        st1 {v5.8B}, [x0], x2
        mshrn v7.8B, v18.8H, #2
.if \avg
        ld1 {v5.8B}, [x0]
        urhadd v7.8B, v7.8B, v5.8B
.endif
        st1 {v7.8B}, [x0], x2

        ret
.endm

.macro pixfunc pfx, name, suf, rnd=1, avg=0
.if \rnd
        .macro avg rd, rn, rm
        urhadd \rd, \rn, \rm
        .endm
        .macro mshrn rd, rn, rm
        rshrn \rd, \rn, \rm
        .endm
        .macro mshrn2 rd, rn, rm
        rshrn2 \rd, \rn, \rm
        .endm
        .macro NRND insn:vararg
        .endm
.else
        .macro avg rd, rn, rm
        uhadd \rd, \rn, \rm
        .endm
        .macro mshrn rd, rn, rm
        shrn \rd, \rn, \rm
        .endm
        .macro mshrn2 rd, rn, rm
        shrn2 \rd, \rn, \rm
        .endm
        .macro NRND insn:vararg
        \insn
        .endm
.endif
function ff_\pfx\name\suf\()_neon, export=1
        \name \rnd, \avg
endfunc
        .purgem avg
        .purgem mshrn
        .purgem mshrn2
        .purgem NRND
.endm
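The avg/mshrn/NRND helper macros above select between rounded and truncating averages: urhadd/rshrn for the _rnd variants, and uhadd/shrn plus an explicit +1 bias (the NRND adds in the xy2 paths) otherwise. Scalar equivalents of the four cases (a sketch):

#include <stdint.h>

static inline uint8_t avg2_rnd(uint8_t a, uint8_t b)    { return (a + b + 1) >> 1; } /* urhadd */
static inline uint8_t avg2_no_rnd(uint8_t a, uint8_t b) { return (a + b) >> 1; }     /* uhadd  */

/* xy2 paths: s is the sum of four neighbouring pixels. */
static inline uint8_t avg4_rnd(int s)    { return (s + 2) >> 2; } /* rshrn #2               */
static inline uint8_t avg4_no_rnd(int s) { return (s + 1) >> 2; } /* NRND "+1", then shrn #2 */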

.macro pixfunc2 pfx, name, avg=0
        pixfunc \pfx, \name, rnd=1, avg=\avg
        pixfunc \pfx, \name, _no_rnd, rnd=0, avg=\avg
.endm

function ff_put_h264_qpel16_mc00_neon, export=1
        mov w3, #16
endfunc

        pixfunc put_, pixels16, avg=0
        pixfunc2 put_, pixels16_x2, avg=0
        pixfunc2 put_, pixels16_y2, avg=0
        pixfunc2 put_, pixels16_xy2, avg=0

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov w3, #16
endfunc

        pixfunc avg_, pixels16, avg=1
        pixfunc2 avg_, pixels16_x2, avg=1
        pixfunc2 avg_, pixels16_y2, avg=1
        pixfunc2 avg_, pixels16_xy2, avg=1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov w3, #8
endfunc

        pixfunc put_, pixels8, avg=0
        pixfunc2 put_, pixels8_x2, avg=0
        pixfunc2 put_, pixels8_y2, avg=0
        pixfunc2 put_, pixels8_xy2, avg=0

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov w3, #8
endfunc

        pixfunc avg_, pixels8, avg=1
        pixfunc avg_, pixels8_x2, avg=1
        pixfunc avg_, pixels8_y2, avg=1
        pixfunc avg_, pixels8_xy2, avg=1

@@ -0,0 +1,28 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_AARCH64_IDCT_H
#define AVCODEC_AARCH64_IDCT_H

#include <stdint.h>

void ff_simple_idct_neon(int16_t *data);
void ff_simple_idct_put_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);
void ff_simple_idct_add_neon(uint8_t *dest, ptrdiff_t line_size, int16_t *data);

#endif /* AVCODEC_AARCH64_IDCT_H */

@@ -0,0 +1,41 @@
/*
 * ARM-NEON-optimized IDCT functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2017 Matthieu Bouron <matthieu.bouron@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/idctdsp.h"
#include "idct.h"

av_cold void ff_idctdsp_init_aarch64(IDCTDSPContext *c, AVCodecContext *avctx,
                                     unsigned high_bit_depth)
{
    if (!avctx->lowres && !high_bit_depth) {
        if (avctx->idct_algo == FF_IDCT_AUTO ||
            avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
            avctx->idct_algo == FF_IDCT_SIMPLENEON) {
            c->idct_put  = ff_simple_idct_put_neon;
            c->idct_add  = ff_simple_idct_add_neon;
            c->idct      = ff_simple_idct_neon;
            c->perm_type = FF_IDCT_PERM_PARTTRANS;
        }
    }
}

@@ -0,0 +1,323 @@
/*
 * AArch64 NEON optimised MDCT
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

function ff_imdct_half_neon, export=1
        sub sp, sp, #32
        stp x19, x20, [sp]
        str x30, [sp, #16]
        mov x12, #1
        ldr w14, [x0, #28]          // mdct_bits
        ldr x4, [x0, #32]           // tcos
        ldr x3, [x0, #8]            // revtab
        lsl x12, x12, x14           // n  = 1 << nbits
        lsr x14, x12, #2            // n4 = n >> 2
        add x7, x2, x12, lsl #1
        mov x12, #-16
        sub x7, x7, #16

        ld2 {v16.2s,v17.2s}, [x7], x12      // d16=x,n1 d17=x,n0
        ld2 {v0.2s,v1.2s}, [x2], #16        // d0 =m0,x d1 =m1,x
        rev64 v17.2s, v17.2s
        ld2 {v2.2s,v3.2s}, [x4], #16        // d2=c0,c1 d3=s0,s2
        fmul v6.2s, v17.2s, v2.2s
        fmul v7.2s, v0.2s, v2.2s
1:
        subs x14, x14, #2
        ldr w6, [x3], #4
        fmul v4.2s, v0.2s, v3.2s
        fmul v5.2s, v17.2s, v3.2s
        fsub v4.2s, v6.2s, v4.2s
        fadd v5.2s, v5.2s, v7.2s
        ubfm x8, x6, #16, #31
        ubfm x6, x6, #0, #15
        add x8, x1, x8, lsl #3
        add x6, x1, x6, lsl #3
        b.eq 2f
        ld2 {v16.2s,v17.2s}, [x7], x12
        ld2 {v0.2s,v1.2s}, [x2], #16
        rev64 v17.2s, v17.2s
        ld2 {v2.2s,v3.2s}, [x4], #16        // d2=c0,c1 d3=s0,s2
        fmul v6.2s, v17.2s, v2.2s
        fmul v7.2s, v0.2s, v2.2s
        st2 {v4.s,v5.s}[0], [x6]
        st2 {v4.s,v5.s}[1], [x8]
        b 1b
2:
        st2 {v4.s,v5.s}[0], [x6]
        st2 {v4.s,v5.s}[1], [x8]

        mov x19, x0
        mov x20, x1
        bl X(ff_fft_calc_neon)

        mov x12, #1
        ldr w14, [x19, #28]         // mdct_bits
        ldr x4, [x19, #32]          // tcos
        lsl x12, x12, x14           // n  = 1 << nbits
        lsr x14, x12, #3            // n8 = n >> 3

        add x4, x4, x14, lsl #3
        add x6, x20, x14, lsl #3
        sub x1, x4, #16
        sub x3, x6, #16

        mov x7, #-16
        mov x8, x6
        mov x0, x3

        ld2 {v0.2s,v1.2s}, [x3], x7         // d0 =i1,r1 d1 =i0,r0
        ld2 {v20.2s,v21.2s}, [x6], #16      // d20=i2,r2 d21=i3,r3
        ld2 {v16.2s,v17.2s}, [x1], x7       // d16=c1,c0 d18=s1,s0
3:
        subs x14, x14, #2
        fmul v7.2s, v0.2s, v17.2s
        ld2 {v18.2s,v19.2s}, [x4], #16      // d17=c2,c3 d19=s2,s3
        fmul v4.2s, v1.2s, v17.2s
        fmul v6.2s, v21.2s, v19.2s
        fmul v5.2s, v20.2s, v19.2s
        fmul v22.2s, v1.2s, v16.2s
        fmul v23.2s, v21.2s, v18.2s
        fmul v24.2s, v0.2s, v16.2s
        fmul v25.2s, v20.2s, v18.2s
        fadd v7.2s, v7.2s, v22.2s
        fadd v5.2s, v5.2s, v23.2s
        fsub v4.2s, v4.2s, v24.2s
        fsub v6.2s, v6.2s, v25.2s
        b.eq 4f
        ld2 {v0.2s,v1.2s}, [x3], x7
        ld2 {v20.2s,v21.2s}, [x6], #16
        ld2 {v16.2s,v17.2s}, [x1], x7       // d16=c1,c0 d18=s1,s0
        rev64 v5.2s, v5.2s
        rev64 v7.2s, v7.2s
        st2 {v4.2s,v5.2s}, [x0], x7
        st2 {v6.2s,v7.2s}, [x8], #16
        b 3b
4:
        rev64 v5.2s, v5.2s
        rev64 v7.2s, v7.2s
        st2 {v4.2s,v5.2s}, [x0]
        st2 {v6.2s,v7.2s}, [x8]

        ldp x19, x20, [sp]
        ldr x30, [sp, #16]
        add sp, sp, #32

        ret
endfunc

function ff_imdct_calc_neon, export=1
        sub sp, sp, #32
        stp x19, x20, [sp]
        str x30, [sp, #16]
        ldr w3, [x0, #28]           // mdct_bits
        mov x19, #1
        mov x20, x1
        lsl x19, x19, x3
        add x1, x1, x19

        bl X(ff_imdct_half_neon)

        add x0, x20, x19, lsl #2
        add x1, x20, x19, lsl #1
        sub x0, x0, #8
        sub x2, x1, #16
        mov x3, #-16
        mov x6, #-8
1:
        ld1 {v0.4s}, [x2], x3
        prfum pldl1keep, [x0, #-16]
        rev64 v0.4s, v0.4s
        ld1 {v2.2s,v3.2s}, [x1], #16
        fneg v4.4s, v0.4s
        prfum pldl1keep, [x2, #-16]
        rev64 v2.2s, v2.2s
        rev64 v3.2s, v3.2s
        ext v4.16b, v4.16b, v4.16b, #8
        st1 {v2.2s}, [x0], x6
        st1 {v3.2s}, [x0], x6
        st1 {v4.4s}, [x20], #16
        subs x19, x19, #16
        b.gt 1b

        ldp x19, x20, [sp], #16
        ldr x30, [sp], #16

        ret
endfunc
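ff_imdct_calc_neon computes the half transform into the upper part of the output buffer, then derives the full-length IMDCT by mirroring with sign flips (the fneg/rev64 tail above). This matches the usual C formulation of the wrap, sketched here (the function-pointer signature is illustrative, not FFmpeg's API):

#include <stdint.h>

typedef float FFTSample;

/* Full IMDCT from the half transform (a sketch of the tail loop above). */
static void imdct_calc_ref(void (*imdct_half)(FFTSample *, const FFTSample *),
                           FFTSample *output, const FFTSample *input,
                           int mdct_bits)
{
    int n = 1 << mdct_bits, n2 = n >> 1, n4 = n >> 2;

    imdct_half(output + n4, input);
    for (int k = 0; k < n4; k++) {
        output[k]         = -output[n2 - k - 1];  /* fneg + rev64 path  */
        output[n - k - 1] =  output[n2 + k];      /* mirrored plain copy */
    }
}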


function ff_mdct_calc_neon, export=1
        sub sp, sp, #32
        stp x19, x20, [sp]
        str x30, [sp, #16]

        mov x12, #1
        ldr w14, [x0, #28]          // mdct_bits
        ldr x4, [x0, #32]           // tcos
        ldr x3, [x0, #8]            // revtab
        lsl x14, x12, x14           // n = 1 << nbits
        add x7, x2, x14             // in4u
        sub x9, x7, #16             // in4d
        add x2, x7, x14, lsl #1     // in3u
        add x8, x9, x14, lsl #1     // in3d
        add x5, x4, x14, lsl #1
        sub x5, x5, #16
        sub x3, x3, #4
        mov x12, #-16
        lsr x13, x14, #1

        ld2 {v16.2s,v17.2s}, [x9], x12      // in0u0,in0u1 in4d1,in4d0
        ld2 {v18.2s,v19.2s}, [x8], x12      // in2u0,in2u1 in3d1,in3d0
        ld2 {v0.2s, v1.2s}, [x7], #16       // in4u0,in4u1 in2d1,in2d0
        rev64 v17.2s, v17.2s                // in4d0,in4d1 in3d0,in3d1
        rev64 v19.2s, v19.2s                // in4d0,in4d1 in3d0,in3d1
        ld2 {v2.2s, v3.2s}, [x2], #16       // in3u0,in3u1 in1d1,in1d0
        fsub v0.2s, v17.2s, v0.2s           // in4d-in4u I
        ld2 {v20.2s,v21.2s}, [x4], #16      // c0,c1 s0,s1
        rev64 v1.2s, v1.2s                  // in2d0,in2d1 in1d0,in1d1
        rev64 v3.2s, v3.2s                  // in2d0,in2d1 in1d0,in1d1
        ld2 {v30.2s,v31.2s}, [x5], x12      // c2,c3 s2,s3
        fadd v2.2s, v2.2s, v19.2s           // in3u+in3d -R
        fsub v16.2s, v16.2s, v1.2s          // in0u-in2d R
        fadd v18.2s, v18.2s, v3.2s          // in2u+in1d -I
1:
        fmul v7.2s, v0.2s, v21.2s           // I*s
        ldr w10, [x3, x13]
        fmul v6.2s, v2.2s, v20.2s           // -R*c
        ldr w6, [x3, #4]!
        fmul v4.2s, v2.2s, v21.2s           // -R*s
        fmul v5.2s, v0.2s, v20.2s           // I*c
        fmul v24.2s, v16.2s, v30.2s         // R*c
        fmul v25.2s, v18.2s, v31.2s         // -I*s
        fmul v22.2s, v16.2s, v31.2s         // R*s
        fmul v23.2s, v18.2s, v30.2s         // I*c
        subs x14, x14, #16
        subs x13, x13, #8
        fsub v6.2s, v6.2s, v7.2s            // -R*c-I*s
        fadd v7.2s, v4.2s, v5.2s            // -R*s+I*c
        fsub v24.2s, v25.2s, v24.2s         // I*s-R*c
        fadd v25.2s, v22.2s, v23.2s         // R*s-I*c
        b.eq 1f
        mov x12, #-16
        ld2 {v16.2s,v17.2s}, [x9], x12      // in0u0,in0u1 in4d1,in4d0
        ld2 {v18.2s,v19.2s}, [x8], x12      // in2u0,in2u1 in3d1,in3d0
        fneg v7.2s, v7.2s                   // R*s-I*c
        ld2 {v0.2s, v1.2s}, [x7], #16       // in4u0,in4u1 in2d1,in2d0
        rev64 v17.2s, v17.2s                // in4d0,in4d1 in3d0,in3d1
        rev64 v19.2s, v19.2s                // in4d0,in4d1 in3d0,in3d1
        ld2 {v2.2s, v3.2s}, [x2], #16       // in3u0,in3u1 in1d1,in1d0
        fsub v0.2s, v17.2s, v0.2s           // in4d-in4u I
        ld2 {v20.2s,v21.2s}, [x4], #16      // c0,c1 s0,s1
        rev64 v1.2s, v1.2s                  // in2d0,in2d1 in1d0,in1d1
        rev64 v3.2s, v3.2s                  // in2d0,in2d1 in1d0,in1d1
        ld2 {v30.2s,v31.2s}, [x5], x12      // c2,c3 s2,s3
        fadd v2.2s, v2.2s, v19.2s           // in3u+in3d -R
        fsub v16.2s, v16.2s, v1.2s          // in0u-in2d R
        fadd v18.2s, v18.2s, v3.2s          // in2u+in1d -I
        ubfm x12, x6, #16, #31
        ubfm x6, x6, #0, #15
        add x12, x1, x12, lsl #3
        add x6, x1, x6, lsl #3
        st2 {v6.s,v7.s}[0], [x6]
        st2 {v6.s,v7.s}[1], [x12]
        ubfm x6, x10, #16, #31
        ubfm x10, x10, #0, #15
        add x6, x1, x6, lsl #3
        add x10, x1, x10, lsl #3
        st2 {v24.s,v25.s}[0], [x10]
        st2 {v24.s,v25.s}[1], [x6]
        b 1b
1:
        fneg v7.2s, v7.2s                   // R*s-I*c
        ubfm x12, x6, #16, #31
        ubfm x6, x6, #0, #15
        add x12, x1, x12, lsl #3
        add x6, x1, x6, lsl #3
        st2 {v6.s,v7.s}[0], [x6]
        st2 {v6.s,v7.s}[1], [x12]
        ubfm x6, x10, #16, #31
        ubfm x10, x10, #0, #15
        add x6, x1, x6, lsl #3
        add x10, x1, x10, lsl #3
        st2 {v24.s,v25.s}[0], [x10]
        st2 {v24.s,v25.s}[1], [x6]

        mov x19, x0
        mov x20, x1
        bl X(ff_fft_calc_neon)

        mov x12, #1
        ldr w14, [x19, #28]         // mdct_bits
        ldr x4, [x19, #32]          // tcos
        lsl x12, x12, x14           // n  = 1 << nbits
        lsr x14, x12, #3            // n8 = n >> 3

        add x4, x4, x14, lsl #3
        add x6, x20, x14, lsl #3
        sub x1, x4, #16
        sub x3, x6, #16

        mov x7, #-16
        mov x8, x6
        mov x0, x3

        ld2 {v0.2s,v1.2s}, [x3], x7         // d0 =r1,i1 d1 =r0,i0
        ld2 {v20.2s,v21.2s}, [x6], #16      // d20=r2,i2 d21=r3,i3
        ld2 {v16.2s,v17.2s}, [x1], x7       // c1,c0 s1,s0
1:
        subs x14, x14, #2
        fmul v7.2s, v0.2s, v17.2s           // r1*s1,r0*s0
        ld2 {v18.2s,v19.2s}, [x4], #16      // c2,c3 s2,s3
        fmul v4.2s, v1.2s, v17.2s           // i1*s1,i0*s0
        fmul v6.2s, v21.2s, v19.2s          // i2*s2,i3*s3
        fmul v5.2s, v20.2s, v19.2s          // r2*s2,r3*s3
        fmul v24.2s, v0.2s, v16.2s          // r1*c1,r0*c0
        fmul v25.2s, v20.2s, v18.2s         // r2*c2,r3*c3
        fmul v22.2s, v21.2s, v18.2s         // i2*c2,i3*c3
        fmul v23.2s, v1.2s, v16.2s          // i1*c1,i0*c0
        fadd v4.2s, v4.2s, v24.2s           // i1*s1+r1*c1,i0*s0+r0*c0
        fadd v6.2s, v6.2s, v25.2s           // i2*s2+r2*c2,i3*s3+r3*c3
        fsub v5.2s, v22.2s, v5.2s           // i2*c2-r2*s2,i3*c3-r3*s3
        fsub v7.2s, v23.2s, v7.2s           // i1*c1-r1*s1,i0*c0-r0*s0
        fneg v4.2s, v4.2s
        fneg v6.2s, v6.2s
        b.eq 1f
        ld2 {v0.2s, v1.2s}, [x3], x7
        ld2 {v20.2s,v21.2s}, [x6], #16
        ld2 {v16.2s,v17.2s}, [x1], x7       // c1,c0 s1,s0
        rev64 v5.2s, v5.2s
        rev64 v7.2s, v7.2s
        st2 {v4.2s,v5.2s}, [x0], x7
        st2 {v6.2s,v7.2s}, [x8], #16
        b 1b
1:
        rev64 v5.2s, v5.2s
        rev64 v7.2s, v7.2s
        st2 {v4.2s,v5.2s}, [x0]
        st2 {v6.2s,v7.2s}, [x8]

        ldp x19, x20, [sp], #16
        ldr x30, [sp], #16
        ret
endfunc

@@ -0,0 +1,149 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

.macro transpose_8x8B r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
        trn1 \r8\().8B, \r0\().8B, \r1\().8B
        trn2 \r9\().8B, \r0\().8B, \r1\().8B
        trn1 \r1\().8B, \r2\().8B, \r3\().8B
        trn2 \r3\().8B, \r2\().8B, \r3\().8B
        trn1 \r0\().8B, \r4\().8B, \r5\().8B
        trn2 \r5\().8B, \r4\().8B, \r5\().8B
        trn1 \r2\().8B, \r6\().8B, \r7\().8B
        trn2 \r7\().8B, \r6\().8B, \r7\().8B

        trn1 \r4\().4H, \r0\().4H, \r2\().4H
        trn2 \r2\().4H, \r0\().4H, \r2\().4H
        trn1 \r6\().4H, \r5\().4H, \r7\().4H
        trn2 \r7\().4H, \r5\().4H, \r7\().4H
        trn1 \r5\().4H, \r9\().4H, \r3\().4H
        trn2 \r9\().4H, \r9\().4H, \r3\().4H
        trn1 \r3\().4H, \r8\().4H, \r1\().4H
        trn2 \r8\().4H, \r8\().4H, \r1\().4H

        trn1 \r0\().2S, \r3\().2S, \r4\().2S
        trn2 \r4\().2S, \r3\().2S, \r4\().2S

        trn1 \r1\().2S, \r5\().2S, \r6\().2S
        trn2 \r5\().2S, \r5\().2S, \r6\().2S

        trn2 \r6\().2S, \r8\().2S, \r2\().2S
        trn1 \r2\().2S, \r8\().2S, \r2\().2S

        trn1 \r3\().2S, \r9\().2S, \r7\().2S
        trn2 \r7\().2S, \r9\().2S, \r7\().2S
.endm

.macro transpose_8x16B r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
        trn1 \t0\().16B, \r0\().16B, \r1\().16B
        trn2 \t1\().16B, \r0\().16B, \r1\().16B
        trn1 \r1\().16B, \r2\().16B, \r3\().16B
        trn2 \r3\().16B, \r2\().16B, \r3\().16B
        trn1 \r0\().16B, \r4\().16B, \r5\().16B
        trn2 \r5\().16B, \r4\().16B, \r5\().16B
        trn1 \r2\().16B, \r6\().16B, \r7\().16B
        trn2 \r7\().16B, \r6\().16B, \r7\().16B

        trn1 \r4\().8H, \r0\().8H, \r2\().8H
        trn2 \r2\().8H, \r0\().8H, \r2\().8H
        trn1 \r6\().8H, \r5\().8H, \r7\().8H
        trn2 \r7\().8H, \r5\().8H, \r7\().8H
        trn1 \r5\().8H, \t1\().8H, \r3\().8H
        trn2 \t1\().8H, \t1\().8H, \r3\().8H
        trn1 \r3\().8H, \t0\().8H, \r1\().8H
        trn2 \t0\().8H, \t0\().8H, \r1\().8H

        trn1 \r0\().4S, \r3\().4S, \r4\().4S
        trn2 \r4\().4S, \r3\().4S, \r4\().4S

        trn1 \r1\().4S, \r5\().4S, \r6\().4S
        trn2 \r5\().4S, \r5\().4S, \r6\().4S

        trn2 \r6\().4S, \t0\().4S, \r2\().4S
        trn1 \r2\().4S, \t0\().4S, \r2\().4S

        trn1 \r3\().4S, \t1\().4S, \r7\().4S
        trn2 \r7\().4S, \t1\().4S, \r7\().4S
.endm

.macro transpose_4x16B r0, r1, r2, r3, t4, t5, t6, t7
        trn1 \t4\().16B, \r0\().16B, \r1\().16B
        trn2 \t5\().16B, \r0\().16B, \r1\().16B
        trn1 \t6\().16B, \r2\().16B, \r3\().16B
        trn2 \t7\().16B, \r2\().16B, \r3\().16B

        trn1 \r0\().8H, \t4\().8H, \t6\().8H
        trn2 \r2\().8H, \t4\().8H, \t6\().8H
        trn1 \r1\().8H, \t5\().8H, \t7\().8H
        trn2 \r3\().8H, \t5\().8H, \t7\().8H
.endm

.macro transpose_4x8B r0, r1, r2, r3, t4, t5, t6, t7
        trn1 \t4\().8B, \r0\().8B, \r1\().8B
        trn2 \t5\().8B, \r0\().8B, \r1\().8B
        trn1 \t6\().8B, \r2\().8B, \r3\().8B
        trn2 \t7\().8B, \r2\().8B, \r3\().8B

        trn1 \r0\().4H, \t4\().4H, \t6\().4H
        trn2 \r2\().4H, \t4\().4H, \t6\().4H
        trn1 \r1\().4H, \t5\().4H, \t7\().4H
        trn2 \r3\().4H, \t5\().4H, \t7\().4H
.endm

.macro transpose_4x4H r0, r1, r2, r3, r4, r5, r6, r7
        trn1 \r4\().4H, \r0\().4H, \r1\().4H
        trn2 \r5\().4H, \r0\().4H, \r1\().4H
        trn1 \r6\().4H, \r2\().4H, \r3\().4H
        trn2 \r7\().4H, \r2\().4H, \r3\().4H
        trn1 \r0\().2S, \r4\().2S, \r6\().2S
        trn2 \r2\().2S, \r4\().2S, \r6\().2S
        trn1 \r1\().2S, \r5\().2S, \r7\().2S
        trn2 \r3\().2S, \r5\().2S, \r7\().2S
.endm

.macro transpose_8x8H r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
        trn1 \r8\().8H, \r0\().8H, \r1\().8H
        trn2 \r9\().8H, \r0\().8H, \r1\().8H
        trn1 \r1\().8H, \r2\().8H, \r3\().8H
        trn2 \r3\().8H, \r2\().8H, \r3\().8H
        trn1 \r0\().8H, \r4\().8H, \r5\().8H
        trn2 \r5\().8H, \r4\().8H, \r5\().8H
        trn1 \r2\().8H, \r6\().8H, \r7\().8H
        trn2 \r7\().8H, \r6\().8H, \r7\().8H

        trn1 \r4\().4S, \r0\().4S, \r2\().4S
        trn2 \r2\().4S, \r0\().4S, \r2\().4S
        trn1 \r6\().4S, \r5\().4S, \r7\().4S
        trn2 \r7\().4S, \r5\().4S, \r7\().4S
        trn1 \r5\().4S, \r9\().4S, \r3\().4S
        trn2 \r9\().4S, \r9\().4S, \r3\().4S
        trn1 \r3\().4S, \r8\().4S, \r1\().4S
        trn2 \r8\().4S, \r8\().4S, \r1\().4S

        trn1 \r0\().2D, \r3\().2D, \r4\().2D
        trn2 \r4\().2D, \r3\().2D, \r4\().2D

        trn1 \r1\().2D, \r5\().2D, \r6\().2D
        trn2 \r5\().2D, \r5\().2D, \r6\().2D

        trn2 \r6\().2D, \r8\().2D, \r2\().2D
        trn1 \r2\().2D, \r8\().2D, \r2\().2D

        trn1 \r3\().2D, \r9\().2D, \r7\().2D
        trn2 \r7\().2D, \r9\().2D, \r7\().2D

.endm
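
// A note on the macro above (added for the reader, not in the upstream
// source): after transpose_8x8H the eight transposed rows end up back in
// \r0-\r7 in row order; \r8 and \r9 only carry intermediates and are
// clobbered as temporaries.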
@ -0,0 +1,362 @@
/*
 * ARM NEON IDCT
 *
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2017 Matthieu Bouron <matthieu.bouron@gmail.com>
 *
 * Based on Simple IDCT
 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

#define Z1  22725  //cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z2  21407  //cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z3  19266  //cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z4  16383  //cos(4*M_PI/16)*sqrt(2)*(1<<14) + 0.5, stored as 16383 rather than the rounded 16384
#define Z5  12873  //cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z6   8867  //cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z7   4520  //cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define Z4c ((1<<(COL_SHIFT-1))/Z4)
#define ROW_SHIFT 11
#define COL_SHIFT 20
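
// For reference, Z1-Z7 can be regenerated with a small C sketch like the
// one below (a hypothetical helper, not part of the build); note that Z4
// is stored as 16383 even though the formula rounds to 16384:
//
//     #include <math.h>
//     #include <stdio.h>
//
//     int main(void)
//     {
//         for (int i = 1; i <= 7; i++)
//             printf("Z%d = %d\n", i,
//                    (int)(cos(i * M_PI / 16) * sqrt(2) * (1 << 14) + 0.5));
//         return 0;
//     }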

#define z1 v0.H[0]
#define z2 v0.H[1]
#define z3 v0.H[2]
#define z4 v0.H[3]
#define z5 v0.H[4]
#define z6 v0.H[5]
#define z7 v0.H[6]
#define z4c v0.H[7]

const   idct_coeff_neon, align=4
        .short Z1, Z2, Z3, Z4, Z5, Z6, Z7, Z4c
endconst

.macro  idct_start      data
        prfm            pldl1keep, [\data]
        mov             x10, x30
        movrel          x3,  idct_coeff_neon
        ld1             {v0.2D}, [x3]
.endm

.macro  idct_end
        br              x10
.endm
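
// Reader's note: idct_start saves the return address in x10 and idct_end
// returns through it, because the function bodies below overwrite the
// normal link register x30 with their bl calls to idct_col4_neon1/2.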

.macro  smull1  a, b, c
        smull           \a, \b, \c
.endm

.macro  smlal1  a, b, c
        smlal           \a, \b, \c
.endm

.macro  smlsl1  a, b, c
        smlsl           \a, \b, \c
.endm
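
// Reader's note: these one-instruction wrappers let the macros below pick
// a vector half by suffix: "smull\i" expands to smull1 (plain smull, low
// halves) when \i is 1 and to the real smull2 instruction (high halves)
// when \i is 2; likewise for smlal\i and smlsl\i.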

.macro  idct_col4_top   y1, y2, y3, y4, i, l
        smull\i         v7.4S,  \y3\l, z2
        smull\i         v16.4S, \y3\l, z6
        smull\i         v17.4S, \y2\l, z1
        add             v19.4S, v23.4S, v7.4S
        smull\i         v18.4S, \y2\l, z3
        add             v20.4S, v23.4S, v16.4S
        smull\i         v5.4S,  \y2\l, z5
        sub             v21.4S, v23.4S, v16.4S
        smull\i         v6.4S,  \y2\l, z7
        sub             v22.4S, v23.4S, v7.4S

        smlal\i         v17.4S, \y4\l, z3
        smlsl\i         v18.4S, \y4\l, z7
        smlsl\i         v5.4S,  \y4\l, z1
        smlsl\i         v6.4S,  \y4\l, z5
.endm

.macro  idct_row4_neon  y1, y2, y3, y4, pass
        ld1             {\y1\().2D,\y2\().2D}, [x2], #32
        movi            v23.4S, #1<<2, lsl #8
        orr             v5.16B, \y1\().16B, \y2\().16B
        ld1             {\y3\().2D,\y4\().2D}, [x2], #32
        orr             v6.16B, \y3\().16B, \y4\().16B
        orr             v5.16B, v5.16B, v6.16B
        mov             x3,  v5.D[1]
        smlal           v23.4S, \y1\().4H, z4

        idct_col4_top   \y1, \y2, \y3, \y4, 1, .4H

        cmp             x3,  #0
        b.eq            \pass\()f

        smull2          v7.4S,  \y1\().8H, z4
        smlal2          v17.4S, \y2\().8H, z5
        smlsl2          v18.4S, \y2\().8H, z1
        smull2          v16.4S, \y3\().8H, z2
        smlal2          v5.4S,  \y2\().8H, z7
        add             v19.4S, v19.4S, v7.4S
        sub             v20.4S, v20.4S, v7.4S
        sub             v21.4S, v21.4S, v7.4S
        add             v22.4S, v22.4S, v7.4S
        smlal2          v6.4S,  \y2\().8H, z3
        smull2          v7.4S,  \y3\().8H, z6
        smlal2          v17.4S, \y4\().8H, z7
        smlsl2          v18.4S, \y4\().8H, z5
        smlal2          v5.4S,  \y4\().8H, z3
        smlsl2          v6.4S,  \y4\().8H, z1
        add             v19.4S, v19.4S, v7.4S
        sub             v20.4S, v20.4S, v16.4S
        add             v21.4S, v21.4S, v16.4S
        sub             v22.4S, v22.4S, v7.4S

\pass:  add             \y3\().4S, v19.4S, v17.4S
        add             \y4\().4S, v20.4S, v18.4S
        shrn            \y1\().4H, \y3\().4S, #ROW_SHIFT
        shrn            \y2\().4H, \y4\().4S, #ROW_SHIFT
        add             v7.4S,  v21.4S, v5.4S
        add             v16.4S, v22.4S, v6.4S
        shrn            \y3\().4H, v7.4S,  #ROW_SHIFT
        shrn            \y4\().4H, v16.4S, #ROW_SHIFT
        sub             v22.4S, v22.4S, v6.4S
        sub             v19.4S, v19.4S, v17.4S
        sub             v21.4S, v21.4S, v5.4S
        shrn2           \y1\().8H, v22.4S, #ROW_SHIFT
        sub             v20.4S, v20.4S, v18.4S
        shrn2           \y2\().8H, v21.4S, #ROW_SHIFT
        shrn2           \y3\().8H, v20.4S, #ROW_SHIFT
        shrn2           \y4\().8H, v19.4S, #ROW_SHIFT

        trn1            v16.8H, \y1\().8H, \y2\().8H
        trn2            v17.8H, \y1\().8H, \y2\().8H
        trn1            v18.8H, \y3\().8H, \y4\().8H
        trn2            v19.8H, \y3\().8H, \y4\().8H
        trn1            \y1\().4S, v16.4S, v18.4S
        trn1            \y2\().4S, v17.4S, v19.4S
        trn2            \y3\().4S, v16.4S, v18.4S
        trn2            \y4\().4S, v17.4S, v19.4S
.endm
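
// Reader's note: idct_row4_neon ORs the four input rows together and
// tests the upper 64 bits (x3), so the smull2/smlal2 half of the row
// pass is skipped whenever columns 4-7 are all zero. v23 is preloaded
// with the rounding bias 1 << (ROW_SHIFT - 1): the movi above encodes
// (1 << 2) << 8 == 1024.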

.macro  declare_idct_col4_neon i, l
function idct_col4_neon\i
        dup             v23.4H, z4c
.if \i == 1
        add             v23.4H, v23.4H, v24.4H
.else
        mov             v5.D[0], v24.D[1]
        add             v23.4H, v23.4H, v5.4H
.endif
        smull           v23.4S, v23.4H, z4

        idct_col4_top   v24, v25, v26, v27, \i, \l

        mov             x4,  v28.D[\i - 1]
        mov             x5,  v29.D[\i - 1]
        cmp             x4,  #0
        b.eq            1f

        smull\i         v7.4S,  v28\l, z4
        add             v19.4S, v19.4S, v7.4S
        sub             v20.4S, v20.4S, v7.4S
        sub             v21.4S, v21.4S, v7.4S
        add             v22.4S, v22.4S, v7.4S

1:      mov             x4,  v30.D[\i - 1]
        cmp             x5,  #0
        b.eq            2f

        smlal\i         v17.4S, v29\l, z5
        smlsl\i         v18.4S, v29\l, z1
        smlal\i         v5.4S,  v29\l, z7
        smlal\i         v6.4S,  v29\l, z3

2:      mov             x5,  v31.D[\i - 1]
        cmp             x4,  #0
        b.eq            3f

        smull\i         v7.4S,  v30\l, z6
        smull\i         v16.4S, v30\l, z2
        add             v19.4S, v19.4S, v7.4S
        sub             v22.4S, v22.4S, v7.4S
        sub             v20.4S, v20.4S, v16.4S
        add             v21.4S, v21.4S, v16.4S

3:      cmp             x5,  #0
        b.eq            4f

        smlal\i         v17.4S, v31\l, z7
        smlsl\i         v18.4S, v31\l, z5
        smlal\i         v5.4S,  v31\l, z3
        smlsl\i         v6.4S,  v31\l, z1

4:      addhn           v7.4H,  v19.4S, v17.4S
        addhn2          v7.8H,  v20.4S, v18.4S
        subhn           v18.4H, v20.4S, v18.4S
        subhn2          v18.8H, v19.4S, v17.4S

        addhn           v16.4H, v21.4S, v5.4S
        addhn2          v16.8H, v22.4S, v6.4S
        subhn           v17.4H, v22.4S, v6.4S
        subhn2          v17.8H, v21.4S, v5.4S

        ret
endfunc
.endm

declare_idct_col4_neon 1, .4H
declare_idct_col4_neon 2, .8H
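
// Reader's note: the addhn/subhn narrowing at label 4 above keeps only
// the high 16 bits of each 32-bit column sum, i.e. an implicit >> 16;
// the callers below apply the remaining COL_SHIFT - 16 shift when
// narrowing down to pixels.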

function ff_simple_idct_put_neon, export=1
        idct_start      x2

        idct_row4_neon  v24, v25, v26, v27, 1
        idct_row4_neon  v28, v29, v30, v31, 2
        bl              idct_col4_neon1

        sqshrun         v1.8B,  v7.8H,  #COL_SHIFT-16
        sqshrun2        v1.16B, v16.8H, #COL_SHIFT-16
        sqshrun         v3.8B,  v17.8H, #COL_SHIFT-16
        sqshrun2        v3.16B, v18.8H, #COL_SHIFT-16

        bl              idct_col4_neon2

        sqshrun         v2.8B,  v7.8H,  #COL_SHIFT-16
        sqshrun2        v2.16B, v16.8H, #COL_SHIFT-16
        sqshrun         v4.8B,  v17.8H, #COL_SHIFT-16
        sqshrun2        v4.16B, v18.8H, #COL_SHIFT-16

        zip1            v16.4S, v1.4S, v2.4S
        zip2            v17.4S, v1.4S, v2.4S

        st1             {v16.D}[0], [x0], x1
        st1             {v16.D}[1], [x0], x1

        zip1            v18.4S, v3.4S, v4.4S
        zip2            v19.4S, v3.4S, v4.4S

        st1             {v17.D}[0], [x0], x1
        st1             {v17.D}[1], [x0], x1
        st1             {v18.D}[0], [x0], x1
        st1             {v18.D}[1], [x0], x1
        st1             {v19.D}[0], [x0], x1
        st1             {v19.D}[1], [x0], x1

        idct_end
endfunc

function ff_simple_idct_add_neon, export=1
        idct_start      x2

        idct_row4_neon  v24, v25, v26, v27, 1
        idct_row4_neon  v28, v29, v30, v31, 2
        bl              idct_col4_neon1

        sshr            v1.8H, v7.8H,  #COL_SHIFT-16
        sshr            v2.8H, v16.8H, #COL_SHIFT-16
        sshr            v3.8H, v17.8H, #COL_SHIFT-16
        sshr            v4.8H, v18.8H, #COL_SHIFT-16

        bl              idct_col4_neon2

        sshr            v7.8H,  v7.8H,  #COL_SHIFT-16
        sshr            v16.8H, v16.8H, #COL_SHIFT-16
        sshr            v17.8H, v17.8H, #COL_SHIFT-16
        sshr            v18.8H, v18.8H, #COL_SHIFT-16

        mov             x9,  x0
        ld1             {v19.D}[0], [x0], x1
        zip1            v23.2D, v1.2D, v7.2D
        zip2            v24.2D, v1.2D, v7.2D
        ld1             {v19.D}[1], [x0], x1
        zip1            v25.2D, v2.2D, v16.2D
        zip2            v26.2D, v2.2D, v16.2D
        ld1             {v20.D}[0], [x0], x1
        zip1            v27.2D, v3.2D, v17.2D
        zip2            v28.2D, v3.2D, v17.2D
        ld1             {v20.D}[1], [x0], x1
        zip1            v29.2D, v4.2D, v18.2D
        zip2            v30.2D, v4.2D, v18.2D
        ld1             {v21.D}[0], [x0], x1
        uaddw           v23.8H, v23.8H, v19.8B
        uaddw2          v24.8H, v24.8H, v19.16B
        ld1             {v21.D}[1], [x0], x1
        sqxtun          v23.8B,  v23.8H
        sqxtun2         v23.16B, v24.8H
        ld1             {v22.D}[0], [x0], x1
        uaddw           v24.8H, v25.8H, v20.8B
        uaddw2          v25.8H, v26.8H, v20.16B
        ld1             {v22.D}[1], [x0], x1
        sqxtun          v24.8B,  v24.8H
        sqxtun2         v24.16B, v25.8H
        st1             {v23.D}[0], [x9], x1
        uaddw           v25.8H, v27.8H, v21.8B
        uaddw2          v26.8H, v28.8H, v21.16B
        st1             {v23.D}[1], [x9], x1
        sqxtun          v25.8B,  v25.8H
        sqxtun2         v25.16B, v26.8H
        st1             {v24.D}[0], [x9], x1
        uaddw           v26.8H, v29.8H, v22.8B
        uaddw2          v27.8H, v30.8H, v22.16B
        st1             {v24.D}[1], [x9], x1
        sqxtun          v26.8B,  v26.8H
        sqxtun2         v26.16B, v27.8H
        st1             {v25.D}[0], [x9], x1
        st1             {v25.D}[1], [x9], x1
        st1             {v26.D}[0], [x9], x1
        st1             {v26.D}[1], [x9], x1

        idct_end
endfunc

function ff_simple_idct_neon, export=1
        idct_start      x0

        mov             x2,  x0
        idct_row4_neon  v24, v25, v26, v27, 1
        idct_row4_neon  v28, v29, v30, v31, 2
        sub             x2,  x2,  #128
        bl              idct_col4_neon1

        sshr            v1.8H, v7.8H,  #COL_SHIFT-16
        sshr            v2.8H, v16.8H, #COL_SHIFT-16
        sshr            v3.8H, v17.8H, #COL_SHIFT-16
        sshr            v4.8H, v18.8H, #COL_SHIFT-16

        bl              idct_col4_neon2

        sshr            v7.8H,  v7.8H,  #COL_SHIFT-16
        sshr            v16.8H, v16.8H, #COL_SHIFT-16
        sshr            v17.8H, v17.8H, #COL_SHIFT-16
        sshr            v18.8H, v18.8H, #COL_SHIFT-16

        zip1            v23.2D, v1.2D, v7.2D
        zip2            v24.2D, v1.2D, v7.2D
        st1             {v23.2D,v24.2D}, [x2], #32
        zip1            v25.2D, v2.2D, v16.2D
        zip2            v26.2D, v2.2D, v16.2D
        st1             {v25.2D,v26.2D}, [x2], #32
        zip1            v27.2D, v3.2D, v17.2D
        zip2            v28.2D, v3.2D, v17.2D
        st1             {v27.2D,v28.2D}, [x2], #32
        zip1            v29.2D, v4.2D, v18.2D
        zip2            v30.2D, v4.2D, v18.2D
        st1             {v29.2D,v30.2D}, [x2], #32

        idct_end
endfunc
@ -0,0 +1,47 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/vc1dsp.h"

#include "config.h"

void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                int h, int x, int y);
void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                int h, int x, int y);
void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                int h, int x, int y);
void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
                                int h, int x, int y);

av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
        dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon;
        dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
        dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
        dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
    }
}
@ -0,0 +1,28 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

function ff_prefetch_aarch64, export=1
        subs            w2,  w2,  #2
        prfm            pldl1strm, [x0]
        prfm            pldl1strm, [x0, x1]
        add             x0,  x0,  x1, lsl #1
        b.gt            X(ff_prefetch_aarch64)
        ret
endfunc
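
// Reader's note: each pass prefetches two rows ([x0] and [x0 + x1]) with
// a streaming L1 hint and advances x0 by two rows; the b.gt back to the
// function entry point is the loop, running until the row count in w2 is
// used up.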
@ -0,0 +1,32 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/videodsp.h"

void ff_prefetch_aarch64(uint8_t *mem, ptrdiff_t stride, int h);

av_cold void ff_videodsp_init_aarch64(VideoDSPContext *ctx, int bpc)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_armv8(cpu_flags))
        ctx->prefetch = ff_prefetch_aarch64;
}
@ -0,0 +1,29 @@
/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_AARCH64_VP9DSP_INIT_H
#define AVCODEC_AARCH64_VP9DSP_INIT_H

#include "libavcodec/vp9dsp.h"

void ff_vp9dsp_init_10bpp_aarch64(VP9DSPContext *dsp);
void ff_vp9dsp_init_12bpp_aarch64(VP9DSPContext *dsp);

#endif /* AVCODEC_AARCH64_VP9DSP_INIT_H */
@ -0,0 +1,23 @@
/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#define BPP 10
#define INIT_FUNC ff_vp9dsp_init_10bpp_aarch64
#include "vp9dsp_init_16bpp_aarch64_template.c"
@ -0,0 +1,23 @@
/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#define BPP 12
#define INIT_FUNC ff_vp9dsp_init_12bpp_aarch64
#include "vp9dsp_init_16bpp_aarch64_template.c"
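
/* Reader's note: the two small files above compile the shared 16 bpp
 * template twice, once per bit depth: BPP selects the _10/_12
 * function-name suffixes and INIT_FUNC names the generated init entry
 * point. */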
@ -0,0 +1,273 @@
/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/internal.h"
#include "libavutil/aarch64/cpu.h"
#include "vp9dsp_init.h"

#define declare_fpel(type, sz, suffix)                                    \
void ff_vp9_##type##sz##suffix##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
                                      const uint8_t *src, ptrdiff_t src_stride, \
                                      int h, int mx, int my)

#define decl_mc_func(op, filter, dir, sz, bpp)                                             \
void ff_vp9_##op##_##filter##sz##_##dir##_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
                                                       const uint8_t *src, ptrdiff_t src_stride, \
                                                       int h, int mx, int my)

#define define_8tap_2d_fn(op, filter, sz, bpp)                                      \
static void op##_##filter##sz##_hv_##bpp##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
                                                const uint8_t *src,                 \
                                                ptrdiff_t src_stride,               \
                                                int h, int mx, int my)              \
{                                                                                   \
    LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (sz < 64)) * sz + 8) * sz * 2]);         \
    /* We only need h + 7 lines, but the horizontal filter assumes an               \
     * even number of rows, so filter h + 8 lines here. */                          \
    ff_vp9_put_##filter##sz##_h_##bpp##_neon(temp, 2 * sz,                          \
                                             src - 3 * src_stride, src_stride,      \
                                             h + 8, mx, 0);                         \
    ff_vp9_##op##_##filter##sz##_v_##bpp##_neon(dst, dst_stride,                    \
                                                temp + 3 * 2 * sz, 2 * sz,          \
                                                h, 0, my);                          \
}
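
/* Reader's note: each generated hv function chains the two 1-D passes:
 * the horizontal filter writes h + 8 rows into the on-stack temp buffer
 * (the vertical 8-tap pass needs 3 rows above and 4 below each output
 * row, i.e. h + 7, rounded up to an even count), and the vertical pass
 * then reads from temp + 3 rows into dst. */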

#define decl_filter_funcs(op, dir, sz, bpp)  \
    decl_mc_func(op, regular, dir, sz, bpp); \
    decl_mc_func(op, sharp,   dir, sz, bpp); \
    decl_mc_func(op, smooth,  dir, sz, bpp)

#define decl_mc_funcs(sz, bpp)           \
    decl_filter_funcs(put, h,  sz, bpp); \
    decl_filter_funcs(avg, h,  sz, bpp); \
    decl_filter_funcs(put, v,  sz, bpp); \
    decl_filter_funcs(avg, v,  sz, bpp); \
    decl_filter_funcs(put, hv, sz, bpp); \
    decl_filter_funcs(avg, hv, sz, bpp)

#define ff_vp9_copy32_neon  ff_vp9_copy32_aarch64
#define ff_vp9_copy64_neon  ff_vp9_copy64_aarch64
#define ff_vp9_copy128_neon ff_vp9_copy128_aarch64

declare_fpel(copy, 128, );
declare_fpel(copy, 64,  );
declare_fpel(copy, 32,  );
declare_fpel(copy, 16,  );
declare_fpel(copy, 8,   );
declare_fpel(avg,  64,  _16);
declare_fpel(avg,  32,  _16);
declare_fpel(avg,  16,  _16);
declare_fpel(avg,  8,   _16);
declare_fpel(avg,  4,   _16);

decl_mc_funcs(64, BPP);
decl_mc_funcs(32, BPP);
decl_mc_funcs(16, BPP);
decl_mc_funcs(8,  BPP);
decl_mc_funcs(4,  BPP);

#define define_8tap_2d_funcs(sz, bpp)        \
    define_8tap_2d_fn(put, regular, sz, bpp) \
    define_8tap_2d_fn(put, sharp,   sz, bpp) \
    define_8tap_2d_fn(put, smooth,  sz, bpp) \
    define_8tap_2d_fn(avg, regular, sz, bpp) \
    define_8tap_2d_fn(avg, sharp,   sz, bpp) \
    define_8tap_2d_fn(avg, smooth,  sz, bpp)

define_8tap_2d_funcs(64, BPP)
define_8tap_2d_funcs(32, BPP)
define_8tap_2d_funcs(16, BPP)
define_8tap_2d_funcs(8,  BPP)
define_8tap_2d_funcs(4,  BPP)

static av_cold void vp9dsp_mc_init_aarch64(VP9DSPContext *dsp)
{
    int cpu_flags = av_get_cpu_flags();

#define init_fpel(idx1, idx2, sz, type, suffix)      \
    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = \
    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_vp9_##type##sz##suffix

#define init_copy(idx, sz, suffix) \
    init_fpel(idx, 0, sz, copy, suffix)

#define init_avg(idx, sz, suffix) \
    init_fpel(idx, 1, sz, avg, suffix)

#define init_copy_avg(idx, sz1, sz2) \
    init_copy(idx, sz2, _neon);      \
    init_avg (idx, sz1, _16_neon)

    if (have_armv8(cpu_flags)) {
        init_copy(0, 128, _aarch64);
        init_copy(1, 64,  _aarch64);
        init_copy(2, 32,  _aarch64);
    }

    if (have_neon(cpu_flags)) {
#define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx, bpp) \
    dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_##bpp##_neon

#define init_mc_funcs(idx, dir, mx, my, sz, pfx, bpp)                                   \
    init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \
    init_mc_func(idx, 0, put, FILTER_8TAP_SHARP,   sharp,   dir, mx, my, sz, pfx, bpp); \
    init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH,  smooth,  dir, mx, my, sz, pfx, bpp); \
    init_mc_func(idx, 1, avg, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx, bpp); \
    init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP,   sharp,   dir, mx, my, sz, pfx, bpp); \
    init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH,  smooth,  dir, mx, my, sz, pfx, bpp)

#define init_mc_funcs_dirs(idx, sz, bpp)            \
    init_mc_funcs(idx, v,  0, 1, sz, ff_vp9_, bpp); \
    init_mc_funcs(idx, h,  1, 0, sz, ff_vp9_, bpp); \
    init_mc_funcs(idx, hv, 1, 1, sz,        , bpp)


        init_avg(0, 64, _16_neon);
        init_avg(1, 32, _16_neon);
        init_avg(2, 16, _16_neon);
        init_copy_avg(3, 8,  16);
        init_copy_avg(4, 4,  8);

        init_mc_funcs_dirs(0, 64, BPP);
        init_mc_funcs_dirs(1, 32, BPP);
        init_mc_funcs_dirs(2, 16, BPP);
        init_mc_funcs_dirs(3, 8,  BPP);
        init_mc_funcs_dirs(4, 4,  BPP);
    }
}

#define define_itxfm2(type_a, type_b, sz, bpp)                                  \
void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_##bpp##_neon(uint8_t *_dst, \
                                                                 ptrdiff_t stride, \
                                                                 int16_t *_block, int eob)
#define define_itxfm(type_a, type_b, sz, bpp) define_itxfm2(type_a, type_b, sz, bpp)

#define define_itxfm_funcs(sz, bpp)      \
    define_itxfm(idct,  idct,  sz, bpp); \
    define_itxfm(iadst, idct,  sz, bpp); \
    define_itxfm(idct,  iadst, sz, bpp); \
    define_itxfm(iadst, iadst, sz, bpp)

define_itxfm_funcs(4,  BPP);
define_itxfm_funcs(8,  BPP);
define_itxfm_funcs(16, BPP);
define_itxfm(idct, idct, 32, BPP);
define_itxfm(iwht, iwht, 4,  BPP);


static av_cold void vp9dsp_itxfm_init_aarch64(VP9DSPContext *dsp)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
#define init_itxfm2(tx, sz, bpp)                                               \
    dsp->itxfm_add[tx][DCT_DCT]   = ff_vp9_idct_idct_##sz##_add_##bpp##_neon;  \
    dsp->itxfm_add[tx][DCT_ADST]  = ff_vp9_iadst_idct_##sz##_add_##bpp##_neon; \
    dsp->itxfm_add[tx][ADST_DCT]  = ff_vp9_idct_iadst_##sz##_add_##bpp##_neon; \
    dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_##bpp##_neon
#define init_itxfm(tx, sz, bpp) init_itxfm2(tx, sz, bpp)

#define init_idct2(tx, nm, bpp)                                   \
    dsp->itxfm_add[tx][DCT_DCT]   =                               \
    dsp->itxfm_add[tx][ADST_DCT]  =                               \
    dsp->itxfm_add[tx][DCT_ADST]  =                               \
    dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_##bpp##_neon
#define init_idct(tx, nm, bpp) init_idct2(tx, nm, bpp)

        init_itxfm(TX_4X4,   4x4,   BPP);
        init_itxfm(TX_8X8,   8x8,   BPP);
        init_itxfm(TX_16X16, 16x16, BPP);
        init_idct(TX_32X32, idct_idct_32x32, BPP);
        init_idct(4, iwht_iwht_4x4, BPP);
    }
}

#define define_loop_filter(dir, wd, size, bpp) \
void ff_vp9_loop_filter_##dir##_##wd##_##size##_##bpp##_neon(uint8_t *dst, ptrdiff_t stride, int E, int I, int H)

#define define_loop_filters(wd, size, bpp) \
    define_loop_filter(h, wd, size, bpp);  \
    define_loop_filter(v, wd, size, bpp)

define_loop_filters(4,  8, BPP);
define_loop_filters(8,  8, BPP);
define_loop_filters(16, 8, BPP);

define_loop_filters(16, 16, BPP);

define_loop_filters(44, 16, BPP);
define_loop_filters(48, 16, BPP);
define_loop_filters(84, 16, BPP);
define_loop_filters(88, 16, BPP);

static av_cold void vp9dsp_loopfilter_init_aarch64(VP9DSPContext *dsp)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
#define init_lpf_func_8(idx1, idx2, dir, wd, bpp) \
    dsp->loop_filter_8[idx1][idx2] = ff_vp9_loop_filter_##dir##_##wd##_8_##bpp##_neon

#define init_lpf_func_16(idx, dir, bpp) \
    dsp->loop_filter_16[idx] = ff_vp9_loop_filter_##dir##_16_16_##bpp##_neon

#define init_lpf_func_mix2(idx1, idx2, idx3, dir, wd, bpp) \
    dsp->loop_filter_mix2[idx1][idx2][idx3] = ff_vp9_loop_filter_##dir##_##wd##_16_##bpp##_neon

#define init_lpf_funcs_8_wd(idx, wd, bpp) \
    init_lpf_func_8(idx, 0, h, wd, bpp);  \
    init_lpf_func_8(idx, 1, v, wd, bpp)

#define init_lpf_funcs_16(bpp)   \
    init_lpf_func_16(0, h, bpp); \
    init_lpf_func_16(1, v, bpp)

#define init_lpf_funcs_mix2_wd(idx1, idx2, wd, bpp) \
    init_lpf_func_mix2(idx1, idx2, 0, h, wd, bpp);  \
    init_lpf_func_mix2(idx1, idx2, 1, v, wd, bpp)

#define init_lpf_funcs_8(bpp)        \
    init_lpf_funcs_8_wd(0, 4,  bpp); \
    init_lpf_funcs_8_wd(1, 8,  bpp); \
    init_lpf_funcs_8_wd(2, 16, bpp)

#define init_lpf_funcs_mix2(bpp)           \
    init_lpf_funcs_mix2_wd(0, 0, 44, bpp); \
    init_lpf_funcs_mix2_wd(0, 1, 48, bpp); \
    init_lpf_funcs_mix2_wd(1, 0, 84, bpp); \
    init_lpf_funcs_mix2_wd(1, 1, 88, bpp)

        init_lpf_funcs_8(BPP);
        init_lpf_funcs_16(BPP);
        init_lpf_funcs_mix2(BPP);
    }
}

av_cold void INIT_FUNC(VP9DSPContext *dsp)
{
    vp9dsp_mc_init_aarch64(dsp);
    vp9dsp_loopfilter_init_aarch64(dsp);
    vp9dsp_itxfm_init_aarch64(dsp);
}
@ -0,0 +1,258 @@
/*
 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/internal.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/vp9dsp.h"
#include "vp9dsp_init.h"

#define declare_fpel(type, sz)                                    \
void ff_vp9_##type##sz##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
                              const uint8_t *src, ptrdiff_t src_stride, \
                              int h, int mx, int my)

#define declare_copy_avg(sz) \
    declare_fpel(copy, sz);  \
    declare_fpel(avg , sz)

#define decl_mc_func(op, filter, dir, sz)                                          \
void ff_vp9_##op##_##filter##sz##_##dir##_neon(uint8_t *dst, ptrdiff_t dst_stride, \
                                               const uint8_t *src, ptrdiff_t src_stride, \
                                               int h, int mx, int my)

#define define_8tap_2d_fn(op, filter, sz)                                    \
static void op##_##filter##sz##_hv_neon(uint8_t *dst, ptrdiff_t dst_stride,  \
                                        const uint8_t *src, ptrdiff_t src_stride, \
                                        int h, int mx, int my)               \
{                                                                            \
    LOCAL_ALIGNED_16(uint8_t, temp, [((1 + (sz < 64)) * sz + 8) * sz]);      \
    /* We only need h + 7 lines, but the horizontal filter assumes an        \
     * even number of rows, so filter h + 8 lines here. */                   \
    ff_vp9_put_##filter##sz##_h_neon(temp, sz,                               \
                                     src - 3 * src_stride, src_stride,       \
                                     h + 8, mx, 0);                          \
    ff_vp9_##op##_##filter##sz##_v_neon(dst, dst_stride,                     \
                                        temp + 3 * sz, sz,                   \
                                        h, 0, my);                           \
}

#define decl_filter_funcs(op, dir, sz)  \
    decl_mc_func(op, regular, dir, sz); \
    decl_mc_func(op, sharp,   dir, sz); \
    decl_mc_func(op, smooth,  dir, sz)

#define decl_mc_funcs(sz)           \
    decl_filter_funcs(put, h,  sz); \
    decl_filter_funcs(avg, h,  sz); \
    decl_filter_funcs(put, v,  sz); \
    decl_filter_funcs(avg, v,  sz); \
    decl_filter_funcs(put, hv, sz); \
    decl_filter_funcs(avg, hv, sz)

#define ff_vp9_copy32_neon ff_vp9_copy32_aarch64
#define ff_vp9_copy64_neon ff_vp9_copy64_aarch64

declare_copy_avg(64);
declare_copy_avg(32);
declare_copy_avg(16);
declare_copy_avg(8);
declare_copy_avg(4);

decl_mc_funcs(64);
decl_mc_funcs(32);
decl_mc_funcs(16);
decl_mc_funcs(8);
decl_mc_funcs(4);

#define define_8tap_2d_funcs(sz)        \
    define_8tap_2d_fn(put, regular, sz) \
    define_8tap_2d_fn(put, sharp,   sz) \
    define_8tap_2d_fn(put, smooth,  sz) \
    define_8tap_2d_fn(avg, regular, sz) \
    define_8tap_2d_fn(avg, sharp,   sz) \
    define_8tap_2d_fn(avg, smooth,  sz)

define_8tap_2d_funcs(64)
define_8tap_2d_funcs(32)
define_8tap_2d_funcs(16)
define_8tap_2d_funcs(8)
define_8tap_2d_funcs(4)

static av_cold void vp9dsp_mc_init_aarch64(VP9DSPContext *dsp)
{
    int cpu_flags = av_get_cpu_flags();

#define init_fpel(idx1, idx2, sz, type, suffix)      \
    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = \
    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_vp9_##type##sz##suffix

#define init_copy(idx, sz, suffix) \
    init_fpel(idx, 0, sz, copy, suffix)

#define init_avg(idx, sz, suffix) \
    init_fpel(idx, 1, sz, avg, suffix)

#define init_copy_avg(idx, sz) \
    init_copy(idx, sz, _neon); \
    init_avg (idx, sz, _neon)

    if (have_armv8(cpu_flags)) {
        init_copy(0, 64, _aarch64);
        init_copy(1, 32, _aarch64);
    }

    if (have_neon(cpu_flags)) {
#define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx) \
    dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_neon

#define init_mc_funcs(idx, dir, mx, my, sz, pfx)                                   \
    init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx); \
    init_mc_func(idx, 0, put, FILTER_8TAP_SHARP,   sharp,   dir, mx, my, sz, pfx); \
    init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH,  smooth,  dir, mx, my, sz, pfx); \
    init_mc_func(idx, 1, avg, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx); \
    init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP,   sharp,   dir, mx, my, sz, pfx); \
    init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH,  smooth,  dir, mx, my, sz, pfx)

#define init_mc_funcs_dirs(idx, sz)            \
    init_mc_funcs(idx, h,  1, 0, sz, ff_vp9_); \
    init_mc_funcs(idx, v,  0, 1, sz, ff_vp9_); \
    init_mc_funcs(idx, hv, 1, 1, sz,)

        init_avg(0, 64, _neon);
        init_avg(1, 32, _neon);
        init_copy_avg(2, 16);
        init_copy_avg(3, 8);
        init_copy_avg(4, 4);

        init_mc_funcs_dirs(0, 64);
        init_mc_funcs_dirs(1, 32);
        init_mc_funcs_dirs(2, 16);
        init_mc_funcs_dirs(3, 8);
        init_mc_funcs_dirs(4, 4);
    }
}

#define define_itxfm(type_a, type_b, sz)                                \
void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_neon(uint8_t *_dst, \
                                                         ptrdiff_t stride, \
                                                         int16_t *_block, int eob)

#define define_itxfm_funcs(sz)      \
    define_itxfm(idct,  idct,  sz); \
    define_itxfm(iadst, idct,  sz); \
    define_itxfm(idct,  iadst, sz); \
    define_itxfm(iadst, iadst, sz)

define_itxfm_funcs(4);
define_itxfm_funcs(8);
define_itxfm_funcs(16);
define_itxfm(idct, idct, 32);
define_itxfm(iwht, iwht, 4);


static av_cold void vp9dsp_itxfm_init_aarch64(VP9DSPContext *dsp)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
#define init_itxfm(tx, sz)                                             \
    dsp->itxfm_add[tx][DCT_DCT]   = ff_vp9_idct_idct_##sz##_add_neon;  \
    dsp->itxfm_add[tx][DCT_ADST]  = ff_vp9_iadst_idct_##sz##_add_neon; \
    dsp->itxfm_add[tx][ADST_DCT]  = ff_vp9_idct_iadst_##sz##_add_neon; \
    dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_neon

#define init_idct(tx, nm)                              \
    dsp->itxfm_add[tx][DCT_DCT]   =                    \
    dsp->itxfm_add[tx][ADST_DCT]  =                    \
    dsp->itxfm_add[tx][DCT_ADST]  =                    \
    dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_neon

        init_itxfm(TX_4X4,   4x4);
        init_itxfm(TX_8X8,   8x8);
        init_itxfm(TX_16X16, 16x16);
        init_idct(TX_32X32, idct_idct_32x32);
        init_idct(4, iwht_iwht_4x4);
    }
}

#define define_loop_filter(dir, wd, len) \
void ff_vp9_loop_filter_##dir##_##wd##_##len##_neon(uint8_t *dst, ptrdiff_t stride, int E, int I, int H)

#define define_loop_filters(wd, len) \
    define_loop_filter(h, wd, len);  \
    define_loop_filter(v, wd, len)

define_loop_filters(4,  8);
define_loop_filters(8,  8);
define_loop_filters(16, 8);

define_loop_filters(16, 16);

define_loop_filters(44, 16);
define_loop_filters(48, 16);
define_loop_filters(84, 16);
define_loop_filters(88, 16);

static av_cold void vp9dsp_loopfilter_init_aarch64(VP9DSPContext *dsp)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
        dsp->loop_filter_8[0][1] = ff_vp9_loop_filter_v_4_8_neon;
        dsp->loop_filter_8[0][0] = ff_vp9_loop_filter_h_4_8_neon;
        dsp->loop_filter_8[1][1] = ff_vp9_loop_filter_v_8_8_neon;
        dsp->loop_filter_8[1][0] = ff_vp9_loop_filter_h_8_8_neon;
        dsp->loop_filter_8[2][1] = ff_vp9_loop_filter_v_16_8_neon;
        dsp->loop_filter_8[2][0] = ff_vp9_loop_filter_h_16_8_neon;

        dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_neon;
        dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_neon;

        dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_neon;
        dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_neon;
        dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_neon;
        dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_neon;
        dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_neon;
        dsp->loop_filter_mix2[1][0][1] = ff_vp9_loop_filter_v_84_16_neon;
        dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_neon;
        dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_neon;
    }
}

av_cold void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp, int bpp)
{
    if (bpp == 10) {
        ff_vp9dsp_init_10bpp_aarch64(dsp);
        return;
    } else if (bpp == 12) {
        ff_vp9dsp_init_12bpp_aarch64(dsp);
        return;
    } else if (bpp != 8)
        return;

    vp9dsp_mc_init_aarch64(dsp);
    vp9dsp_loopfilter_init_aarch64(dsp);
    vp9dsp_itxfm_init_aarch64(dsp);
}
[Diff not shown for two files because of their large size.]
@ -0,0 +1,873 @@
/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"


.macro  transpose_4x8H  r0, r1, r2, r3, t4, t5, t6, t7
        trn1            \t4\().8h, \r0\().8h, \r1\().8h
        trn2            \t5\().8h, \r0\().8h, \r1\().8h
        trn1            \t6\().8h, \r2\().8h, \r3\().8h
        trn2            \t7\().8h, \r2\().8h, \r3\().8h

        trn1            \r0\().4s, \t4\().4s, \t6\().4s
        trn2            \r2\().4s, \t4\().4s, \t6\().4s
        trn1            \r1\().4s, \t5\().4s, \t7\().4s
        trn2            \r3\().4s, \t5\().4s, \t7\().4s
.endm
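
// Reader's note: this local 4x8H transpose complements the transposes in
// neon.S (included above), which does not appear to provide one.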

// The input to and output from this macro are in the registers v16-v31;
// v0-v7 are used as scratch registers.
// p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31
// Depending on the width of the loop filter, we either use v16-v19
// and v28-v31 as temp registers, or v8-v15.
.macro  loop_filter     wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
        dup             v0.8h,  w2                    // E
        dup             v2.8h,  w3                    // I
        dup             v3.8h,  w4                    // H

        uabd            v4.8h,  v20.8h, v21.8h        // abs(p3 - p2)
        uabd            v5.8h,  v21.8h, v22.8h        // abs(p2 - p1)
        uabd            v6.8h,  v22.8h, v23.8h        // abs(p1 - p0)
        uabd            v7.8h,  v24.8h, v25.8h        // abs(q0 - q1)
        uabd            \tmp1\().8h, v25.8h, v26.8h   // abs(q1 - q2)
        uabd            \tmp2\().8h, v26.8h, v27.8h   // abs(q2 - q3)
        umax            v4.8h,  v4.8h,  v5.8h
        umax            v5.8h,  v6.8h,  v7.8h
        umax            \tmp1\().8h, \tmp1\().8h, \tmp2\().8h
        uabd            v6.8h,  v23.8h, v24.8h        // abs(p0 - q0)
        umax            v4.8h,  v4.8h,  v5.8h
        add             v6.8h,  v6.8h,  v6.8h         // abs(p0 - q0) * 2
        uabd            v5.8h,  v22.8h, v25.8h        // abs(p1 - q1)
        umax            v4.8h,  v4.8h,  \tmp1\().8h   // max(abs(p3 - p2), ..., abs(q2 - q3))
        ushr            v5.8h,  v5.8h,  #1
        cmhs            v4.8h,  v2.8h,  v4.8h         // max(abs()) <= I
        add             v6.8h,  v6.8h,  v5.8h         // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
        cmhs            v6.8h,  v0.8h,  v6.8h
        and             v4.16b, v4.16b, v6.16b        // fm

        // If no pixels need filtering, just exit as soon as possible
        mov             x11, v4.d[0]
        mov             x12, v4.d[1]
        adds            x11, x11, x12
        b.ne            1f
        br              x10
1:

.if \wd >= 8
        dup             v0.8h,  w5

        uabd            v6.8h,  v20.8h, v23.8h        // abs(p3 - p0)
        uabd            v2.8h,  v21.8h, v23.8h        // abs(p2 - p0)
        uabd            v1.8h,  v22.8h, v23.8h        // abs(p1 - p0)
        uabd            \tmp1\().8h, v25.8h, v24.8h   // abs(q1 - q0)
        uabd            \tmp2\().8h, v26.8h, v24.8h   // abs(q2 - q0)
        uabd            \tmp3\().8h, v27.8h, v24.8h   // abs(q3 - q0)
        umax            v6.8h,  v6.8h,  v2.8h
        umax            v1.8h,  v1.8h,  \tmp1\().8h
        umax            \tmp2\().8h, \tmp2\().8h, \tmp3\().8h
.if \wd == 16
        uabd            v7.8h,  v16.8h, v23.8h        // abs(p7 - p0)
        umax            v6.8h,  v6.8h,  v1.8h
        uabd            v2.8h,  v17.8h, v23.8h        // abs(p6 - p0)
        umax            v6.8h,  v6.8h,  \tmp2\().8h
        uabd            v1.8h,  v18.8h, v23.8h        // abs(p5 - p0)
        cmhs            v6.8h,  v0.8h,  v6.8h         // flat8in
        uabd            v8.8h,  v19.8h, v23.8h        // abs(p4 - p0)
        and             v6.16b, v6.16b, v4.16b        // flat8in && fm
        uabd            v9.8h,  v28.8h, v24.8h        // abs(q4 - q0)
        bic             v4.16b, v4.16b, v6.16b        // fm && !flat8in
        uabd            v10.8h, v29.8h, v24.8h        // abs(q5 - q0)
        uabd            v11.8h, v30.8h, v24.8h        // abs(q6 - q0)
        uabd            v12.8h, v31.8h, v24.8h        // abs(q7 - q0)

        umax            v7.8h,  v7.8h,  v2.8h
        umax            v1.8h,  v1.8h,  v8.8h
        umax            v9.8h,  v9.8h,  v10.8h
        umax            v11.8h, v11.8h, v12.8h
        // The rest of the calculation of flat8out is interleaved below
.else
        // The rest of the calculation of flat8in is interleaved below
.endif
.endif

        // Calculate the normal inner loop filter for 2 or 4 pixels
        uabd            v5.8h,  v22.8h, v23.8h        // abs(p1 - p0)
.if \wd == 16
        umax            v7.8h,  v7.8h,  v1.8h
        umax            v9.8h,  v9.8h,  v11.8h
.elseif \wd == 8
        umax            v6.8h,  v6.8h,  v1.8h
.endif
        uabd            v1.8h,  v25.8h, v24.8h        // abs(q1 - q0)
.if \wd == 16
        umax            v7.8h,  v7.8h,  v9.8h
.elseif \wd == 8
        umax            v6.8h,  v6.8h,  \tmp2\().8h
.endif
        dup             \tmp2\().8h, w6               // left shift for saturation
        sub             \tmp1\().8h, v22.8h, v25.8h   // p1 - q1
        neg             \tmp6\().8h, \tmp2\().8h      // negative left shift after saturation
        umax            v5.8h,  v5.8h,  v1.8h         // max(abs(p1 - p0), abs(q1 - q0))
        sub             \tmp3\().8h, v24.8h, v23.8h   // q0 - p0
        movi            \tmp5\().8h, #3
.if \wd == 8
        cmhs            v6.8h,  v0.8h,  v6.8h         // flat8in
.endif
        cmhs            v5.8h,  v3.8h,  v5.8h         // !hev
.if \wd == 8
        and             v6.16b, v6.16b, v4.16b        // flat8in && fm
.endif
        sqshl           \tmp1\().8h, \tmp1\().8h, \tmp2\().8h
.if \wd == 16
        cmhs            v7.8h,  v0.8h,  v7.8h         // flat8out
.elseif \wd == 8
        bic             v4.16b, v4.16b, v6.16b        // fm && !flat8in
.endif
        and             v5.16b, v5.16b, v4.16b        // !hev && fm && !flat8in
.if \wd == 16
        and             v7.16b, v7.16b, v6.16b        // flat8out && flat8in && fm
.endif
        sshl            \tmp1\().8h, \tmp1\().8h, \tmp6\().8h // av_clip_int2p(p1 - q1, BIT_DEPTH - 1)

        mul             \tmp3\().8h, \tmp3\().8h, \tmp5\().8h // 3 * (q0 - p0)
        bic             \tmp1\().16b, \tmp1\().16b, v5.16b    // if (!hev) av_clip_int8 = 0
        movi            v2.8h,  #4
        add             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h // 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
        movi            v3.8h,  #3
        sqshl           \tmp1\().8h, \tmp3\().8h, \tmp2\().8h
        movi            \tmp5\().8h, #0
        sshl            \tmp1\().8h, \tmp1\().8h, \tmp6\().8h // av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
        dup             \tmp6\().8h, w7               // max pixel value
.if \wd == 16
        bic             v6.16b, v6.16b, v7.16b        // fm && flat8in && !flat8out
.endif

        ushr            \tmp2\().8h, \tmp6\().8h, #1  // (1 << (BIT_DEPTH - 1)) - 1

        add             \tmp3\().8h, \tmp1\().8h, v2.8h // f + 4
        add             \tmp4\().8h, \tmp1\().8h, v3.8h // f + 3
        smin            \tmp3\().8h, \tmp3\().8h, \tmp2\().8h // FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
        smin            \tmp4\().8h, \tmp4\().8h, \tmp2\().8h // FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
        sshr            \tmp3\().8h, \tmp3\().8h, #3  // f1
        sshr            \tmp4\().8h, \tmp4\().8h, #3  // f2

        add             v0.8h,  v23.8h, \tmp4\().8h   // p0 + f2
        sub             v2.8h,  v24.8h, \tmp3\().8h   // q0 - f1
        smin            v0.8h,  v0.8h,  \tmp6\().8h
        smin            v2.8h,  v2.8h,  \tmp6\().8h
        srshr           \tmp3\().8h, \tmp3\().8h, #1  // f = (f1 + 1) >> 1
        smax            v0.8h,  v0.8h,  \tmp5\().8h   // out p0
        smax            v2.8h,  v2.8h,  \tmp5\().8h   // out q0
        bit             v23.16b, v0.16b, v4.16b       // if (fm && !flat8in)
        bit             v24.16b, v2.16b, v4.16b

        add             v0.8h,  v22.8h, \tmp3\().8h   // p1 + f
        sub             v2.8h,  v25.8h, \tmp3\().8h   // q1 - f
.if \wd >= 8
        mov             x11, v6.d[0]
.endif
        smin            v0.8h,  v0.8h,  \tmp6\().8h
        smin            v2.8h,  v2.8h,  \tmp6\().8h
.if \wd >= 8
        mov             x12, v6.d[1]
.endif
        smax            v0.8h,  v0.8h,  \tmp5\().8h   // out p1
        smax            v2.8h,  v2.8h,  \tmp5\().8h   // out q1
.if \wd >= 8
        adds            x11, x11, x12
.endif
        bit             v22.16b, v0.16b, v5.16b       // if (!hev && fm && !flat8in)
        bit             v25.16b, v2.16b, v5.16b

        // If no pixels need flat8in, jump to flat8out
        // (or to a writeout of the inner 4 pixels, for wd=8)
.if \wd >= 8
.if \wd == 16
        b.eq            6f
.else
        b.ne            1f
        br              x13
1:
.endif

        // flat8in
        add             \tmp1\().8h, v20.8h, v21.8h
        add             \tmp3\().8h, v22.8h, v25.8h
        add             \tmp5\().8h, v20.8h, v22.8h
        add             \tmp7\().8h, v23.8h, v26.8h
        add             v0.8h,  \tmp1\().8h, \tmp1\().8h
        add             v0.8h,  v0.8h,  v23.8h
        add             v0.8h,  v0.8h,  v24.8h
        add             v0.8h,  v0.8h,  \tmp5\().8h
        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
        sub             \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
        urshr           v2.8h,  v0.8h,  #3            // out p2

        add             v0.8h,  v0.8h,  \tmp3\().8h
        add             \tmp1\().8h, v20.8h, v23.8h
        add             \tmp3\().8h, v24.8h, v27.8h
        urshr           v3.8h,  v0.8h,  #3            // out p1

        add             v0.8h,  v0.8h,  \tmp7\().8h
        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
        add             \tmp5\().8h, v21.8h, v24.8h
        add             \tmp7\().8h, v25.8h, v27.8h
        urshr           v4.8h,  v0.8h,  #3            // out p0

        add             v0.8h,  v0.8h,  \tmp3\().8h
        sub             \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
        add             \tmp1\().8h, v22.8h, v25.8h
        add             \tmp3\().8h, v26.8h, v27.8h
        urshr           v5.8h,  v0.8h,  #3            // out q0

        add             v0.8h,  v0.8h,  \tmp7\().8h
        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
        urshr           \tmp5\().8h, v0.8h, #3        // out q1

        add             v0.8h,  v0.8h,  \tmp3\().8h
        // The output here is written back into the input registers. This doesn't
        // matter for the flat8part below, since we only update those pixels
        // which won't be touched below.
        bit             v21.16b, v2.16b,  v6.16b
        bit             v22.16b, v3.16b,  v6.16b
        bit             v23.16b, v4.16b,  v6.16b
        urshr           \tmp6\().8h, v0.8h, #3        // out q2
        bit             v24.16b, v5.16b,  v6.16b
        bit             v25.16b, \tmp5\().16b, v6.16b
        bit             v26.16b, \tmp6\().16b, v6.16b
.endif
.if \wd == 16
6:
        orr             v2.16b, v6.16b, v7.16b
        mov             x11, v2.d[0]
        mov             x12, v2.d[1]
        adds            x11, x11, x12
        b.ne            1f
        // If no pixels needed flat8in nor flat8out, jump to a
        // writeout of the inner 4 pixels
        br              x14
1:

        mov             x11, v7.d[0]
        mov             x12, v7.d[1]
        adds            x11, x11, x12
        b.ne            1f
        // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
        br              x15

1:
        // flat8out
        // This writes all outputs into v2-v17 (skipping v6 and v16).
        // If this part is skipped, the output is read from v21-v26 (which is the input
        // to this section).
        shl             v0.8h,  v16.8h, #3            // 8 * v16
        sub             v0.8h,  v0.8h,  v16.8h        // 7 * v16
        add             v0.8h,  v0.8h,  v17.8h
        add             v8.8h,  v17.8h, v18.8h
        add             v10.8h, v19.8h, v20.8h
        add             v0.8h,  v0.8h,  v8.8h
        add             v8.8h,  v16.8h, v17.8h
        add             v12.8h, v21.8h, v22.8h
        add             v0.8h,  v0.8h,  v10.8h
        add             v10.8h, v18.8h, v25.8h
        add             v14.8h, v23.8h, v24.8h
        sub             v10.8h, v10.8h, v8.8h
        add             v0.8h,  v0.8h,  v12.8h
        add             v0.8h,  v0.8h,  v14.8h
        add             v12.8h, v16.8h, v18.8h
        add             v14.8h, v19.8h, v26.8h
        urshr           v2.8h,  v0.8h,  #4

        add             v0.8h,  v0.8h,  v10.8h
        add             v8.8h,  v16.8h, v19.8h
        add             v10.8h, v20.8h, v27.8h
        sub             v14.8h, v14.8h, v12.8h
        bif             v2.16b, v17.16b, v7.16b
        urshr           v3.8h,  v0.8h,  #4

        add             v0.8h,  v0.8h,  v14.8h
        add             v12.8h, v16.8h, v20.8h
        add             v14.8h, v21.8h, v28.8h
        sub             v10.8h, v10.8h, v8.8h
        bif             v3.16b, v18.16b, v7.16b
        urshr           v4.8h,  v0.8h,  #4

        add             v0.8h,  v0.8h,  v10.8h
        add             v8.8h,  v16.8h, v21.8h
        add             v10.8h, v22.8h, v29.8h
        sub             v14.8h, v14.8h, v12.8h
        bif             v4.16b, v19.16b, v7.16b
        urshr           v5.8h,  v0.8h,  #4

        add             v0.8h,  v0.8h,  v14.8h
        add             v12.8h, v16.8h, v22.8h
        add             v14.8h, v23.8h, v30.8h
        sub             v10.8h, v10.8h, v8.8h
        bif             v5.16b, v20.16b, v7.16b
        urshr           v6.8h,  v0.8h,  #4

        add             v0.8h,  v0.8h,  v10.8h
        add             v10.8h, v16.8h, v23.8h
        sub             v14.8h, v14.8h, v12.8h
        add             v12.8h, v24.8h, v31.8h
        bif             v6.16b, v21.16b, v7.16b
        urshr           v8.8h,  v0.8h,  #4

        add             v0.8h,  v0.8h,  v14.8h
        sub             v10.8h, v12.8h, v10.8h
        add             v12.8h, v17.8h, v24.8h
        add             v14.8h, v25.8h, v31.8h
        bif             v8.16b, v22.16b, v7.16b
        urshr           v9.8h,  v0.8h,  #4

        add             v0.8h,  v0.8h,  v10.8h
        sub             v14.8h, v14.8h, v12.8h
        add             v12.8h, v26.8h, v31.8h
        bif             v9.16b, v23.16b, v7.16b
        urshr           v10.8h, v0.8h,  #4

        add             v0.8h,  v0.8h,  v14.8h
        add             v14.8h, v18.8h, v25.8h
        add             v18.8h, v19.8h, v26.8h
        sub             v12.8h, v12.8h, v14.8h
        add             v14.8h, v27.8h, v31.8h
        bif             v10.16b, v24.16b, v7.16b
        urshr           v11.8h, v0.8h,  #4

        add             v0.8h,  v0.8h,  v12.8h
        add             v12.8h, v20.8h, v27.8h
        sub             v14.8h, v14.8h, v18.8h
        add             v18.8h, v28.8h, v31.8h
        bif             v11.16b, v25.16b, v7.16b
        sub             v18.8h, v18.8h, v12.8h
        urshr           v12.8h, v0.8h,  #4

        add             v0.8h,  v0.8h,  v14.8h
        add             v14.8h, v21.8h, v28.8h
        add             v20.8h, v29.8h, v31.8h
        bif             v12.16b, v26.16b, v7.16b
        urshr           v13.8h, v0.8h,  #4

        add             v0.8h,  v0.8h,  v18.8h
        sub             v20.8h, v20.8h, v14.8h
        add             v18.8h, v22.8h, v29.8h
        add             v22.8h, v30.8h, v31.8h
        bif             v13.16b, v27.16b, v7.16b
        urshr           v14.8h, v0.8h,  #4

        add             v0.8h,  v0.8h,  v20.8h
        sub             v22.8h, v22.8h, v18.8h
        bif             v14.16b, v28.16b, v7.16b
        urshr           v15.8h, v0.8h,  #4

        add             v0.8h,  v0.8h,  v22.8h
        bif             v15.16b, v29.16b, v7.16b
        urshr           v17.8h, v0.8h,  #4
        bif             v17.16b, v30.16b, v7.16b
.endif
.endm

// For wd <= 8, we use v16-v19 and v28-v31 for temp registers,
// while we need those for inputs/outputs in wd=16 and use v8-v15
// for temp registers there instead.
function vp9_loop_filter_4
        loop_filter     4,  v16, v17, v18, v19, v28, v29, v30, v31
        ret
endfunc

function vp9_loop_filter_8
        loop_filter     8,  v16, v17, v18, v19, v28, v29, v30, v31
        ret
endfunc

function vp9_loop_filter_16
        loop_filter     16, v8,  v9,  v10, v11, v12, v13, v14, v15
        ret
endfunc

.macro  loop_filter_4
        bl              vp9_loop_filter_4
.endm

.macro  loop_filter_8
        // calculate alternative 'return' targets
        adr             x13, 6f
        bl              vp9_loop_filter_8
.endm

.macro  loop_filter_16
        // calculate alternative 'return' targets
        adr             x14, 7f
        adr             x15, 8f
        bl              vp9_loop_filter_16
.endm
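
// Reader's note: the loop_filter macro "returns" early through these
// registers: x13 (wd=8) and x14/x15 (wd=16) are set by the callers to
// local labels (6f/7f/8f) so the macro can br straight to the narrower
// writeout when the flat8in/flat8out paths are not taken.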

// The public functions in this file have the following signature:
// void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);

.macro  bpp_frontend    func, bpp, push
function ff_\func\()_\bpp\()_neon, export=1
.if \push
        mov             x16, x30
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8,  d9,  [sp, #-0x10]!
.endif
        lsl             w2,  w2,  #\bpp - 8
        lsl             w3,  w3,  #\bpp - 8
        lsl             w4,  w4,  #\bpp - 8
        mov             x5,  #1 << (\bpp - 8)
        mov             x6,  #16 - \bpp
        mov             x7,  #((1 << \bpp) - 1)
.if \push
        bl              \func\()_16_neon
        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
        br              x16
.else
        b               \func\()_16_neon
.endif
endfunc
.endm
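
// Reader's note: the frontends scale the 8-bit E/I/H thresholds up to the
// target bit depth (<< bpp - 8) and pass the bit-depth parameters onward
// in x5 = 1 << (bpp - 8) (the 'flat' threshold), x6 = 16 - bpp (the
// saturation shift used in loop_filter) and x7 = (1 << bpp) - 1 (the max
// pixel value).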
|
||||
|
||||
.macro bpp_frontends func, push=0
|
||||
bpp_frontend \func, 10, \push
|
||||
bpp_frontend \func, 12, \push
|
||||
.endm
|
||||
|
||||
.macro bpp_frontend_rep func, suffix, int_suffix, dir, bpp, push
|
||||
function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
|
||||
mov x16, x30
|
||||
.if \push
|
||||
stp d14, d15, [sp, #-0x10]!
|
||||
stp d12, d13, [sp, #-0x10]!
|
||||
stp d10, d11, [sp, #-0x10]!
|
||||
stp d8, d9, [sp, #-0x10]!
|
||||
.endif
|
||||
lsl w2, w2, #\bpp - 8
|
||||
lsl w3, w3, #\bpp - 8
|
||||
lsl w4, w4, #\bpp - 8
|
||||
mov x5, #1 << (\bpp - 8)
|
||||
mov x6, #16 - \bpp
|
||||
mov x7, #((1 << \bpp) - 1)
|
||||
bl \func\()_\int_suffix\()_16_neon
|
||||
.ifc \dir,h
|
||||
add x0, x0, x1, lsl #3
|
||||
.else
|
||||
add x0, x0, #16
|
||||
.endif
|
||||
bl \func\()_\int_suffix\()_16_neon
|
||||
.if \push
|
||||
ldp d8, d9, [sp], 0x10
|
||||
ldp d10, d11, [sp], 0x10
|
||||
ldp d12, d13, [sp], 0x10
|
||||
ldp d14, d15, [sp], 0x10
|
||||
.endif
|
||||
br x16
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
.macro bpp_frontends_rep func, suffix, int_suffix, dir, push=0
|
||||
bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 10, \push
|
||||
bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 12, \push
|
||||
.endm
|
||||
|
||||
.macro bpp_frontend_mix2 wd1, wd2, dir, bpp
function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1
        mov x16, x30
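        // w2-w4 each carry two packed 8-bit threshold values, one per
        // 8 pixel half of the mixed-size block; split them apart before
        // scaling them up to the target bit depth.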
        lsr w8, w2, #8
        lsr w14, w3, #8
        lsr w15, w4, #8
        and w2, w2, #0xff
        and w3, w3, #0xff
        and w4, w4, #0xff
        lsl w2, w2, #\bpp - 8
        lsl w3, w3, #\bpp - 8
        lsl w4, w4, #\bpp - 8
        mov x5, #1 << (\bpp - 8)
        mov x6, #16 - \bpp
        mov x7, #((1 << \bpp) - 1)
        bl vp9_loop_filter_\dir\()_\wd1\()_8_16_neon
        .ifc \dir,h
        add x0, x0, x1, lsl #3
        .else
        add x0, x0, #16
        .endif
        lsl w2, w8, #\bpp - 8
        lsl w3, w14, #\bpp - 8
        lsl w4, w15, #\bpp - 8
        bl vp9_loop_filter_\dir\()_\wd2\()_8_16_neon
        br x16
endfunc
.endm

.macro bpp_frontends_mix2 wd1, wd2
        bpp_frontend_mix2 \wd1, \wd2, v, 10
        bpp_frontend_mix2 \wd1, \wd2, v, 12
        bpp_frontend_mix2 \wd1, \wd2, h, 10
        bpp_frontend_mix2 \wd1, \wd2, h, 12
.endm

function vp9_loop_filter_v_4_8_16_neon
        mov x10, x30
        sub x9, x0, x1, lsl #2
        ld1 {v20.8h}, [x9], x1 // p3
        ld1 {v24.8h}, [x0], x1 // q0
        ld1 {v21.8h}, [x9], x1 // p2
        ld1 {v25.8h}, [x0], x1 // q1
        ld1 {v22.8h}, [x9], x1 // p1
        ld1 {v26.8h}, [x0], x1 // q2
        ld1 {v23.8h}, [x9], x1 // p0
        ld1 {v27.8h}, [x0], x1 // q3
        sub x0, x0, x1, lsl #2
        sub x9, x9, x1, lsl #1

        loop_filter_4

        st1 {v22.8h}, [x9], x1
        st1 {v24.8h}, [x0], x1
        st1 {v23.8h}, [x9], x1
        st1 {v25.8h}, [x0], x1
        sub x0, x0, x1, lsl #1

        br x10
endfunc

bpp_frontends vp9_loop_filter_v_4_8

function vp9_loop_filter_h_4_8_16_neon
        mov x10, x30
        sub x9, x0, #8
        add x0, x9, x1, lsl #2
        ld1 {v20.8h}, [x9], x1
        ld1 {v24.8h}, [x0], x1
        ld1 {v21.8h}, [x9], x1
        ld1 {v25.8h}, [x0], x1
        ld1 {v22.8h}, [x9], x1
        ld1 {v26.8h}, [x0], x1
        ld1 {v23.8h}, [x9], x1
        ld1 {v27.8h}, [x0], x1

        sub x9, x9, x1, lsl #2
        sub x0, x0, x1, lsl #3
        add x0, x0, #8

        transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        loop_filter_4

        // Move x9 forward by 2 pixels; we don't need to rewrite the
        // outermost 2 pixels since they aren't changed.
        add x9, x9, #4
        add x0, x9, x1, lsl #2

        // We will only write the middle 4 pixels back; after the loop filter,
        // these are in v22, v23, v24, v25, ordered as rows (8x4 pixels).
        // We need to transpose them to columns, done with a 4x8 transpose
        // (which in practice is two 4x4 transposes of the two 4x4 halves
        // of the 8x4 pixels, into 4x8 pixels).
        transpose_4x8H v22, v23, v24, v25, v26, v27, v28, v29
        st1 {v22.d}[0], [x9], x1
        st1 {v22.d}[1], [x0], x1
        st1 {v23.d}[0], [x9], x1
        st1 {v23.d}[1], [x0], x1
        st1 {v24.d}[0], [x9], x1
        st1 {v24.d}[1], [x0], x1
        st1 {v25.d}[0], [x9], x1
        st1 {v25.d}[1], [x0], x1
        sub x0, x0, x1, lsl #3
        add x0, x0, #4

        br x10
endfunc

bpp_frontends vp9_loop_filter_h_4_8

function vp9_loop_filter_v_8_8_16_neon
        mov x10, x30
        sub x9, x0, x1, lsl #2
        ld1 {v20.8h}, [x9], x1 // p3
        ld1 {v24.8h}, [x0], x1 // q0
        ld1 {v21.8h}, [x9], x1 // p2
        ld1 {v25.8h}, [x0], x1 // q1
        ld1 {v22.8h}, [x9], x1 // p1
        ld1 {v26.8h}, [x0], x1 // q2
        ld1 {v23.8h}, [x9], x1 // p0
        ld1 {v27.8h}, [x0], x1 // q3
        sub x9, x9, x1, lsl #2
        sub x0, x0, x1, lsl #2
        add x9, x9, x1

        loop_filter_8

        st1 {v21.8h}, [x9], x1
        st1 {v24.8h}, [x0], x1
        st1 {v22.8h}, [x9], x1
        st1 {v25.8h}, [x0], x1
        st1 {v23.8h}, [x9], x1
        st1 {v26.8h}, [x0], x1
        sub x0, x0, x1, lsl #1
        sub x0, x0, x1

        br x10
6:
        sub x9, x0, x1, lsl #1
        st1 {v22.8h}, [x9], x1
        st1 {v24.8h}, [x0], x1
        st1 {v23.8h}, [x9], x1
        st1 {v25.8h}, [x0], x1
        sub x0, x0, x1, lsl #1
        br x10
endfunc

bpp_frontends vp9_loop_filter_v_8_8

function vp9_loop_filter_h_8_8_16_neon
        mov x10, x30
        sub x9, x0, #8
        add x0, x9, x1, lsl #2
        ld1 {v20.8h}, [x9], x1
        ld1 {v24.8h}, [x0], x1
        ld1 {v21.8h}, [x9], x1
        ld1 {v25.8h}, [x0], x1
        ld1 {v22.8h}, [x9], x1
        ld1 {v26.8h}, [x0], x1
        ld1 {v23.8h}, [x9], x1
        ld1 {v27.8h}, [x0], x1

        sub x9, x9, x1, lsl #2
        sub x0, x0, x1, lsl #3
        add x0, x0, #8

        transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        loop_filter_8

        add x0, x9, x1, lsl #2

        // Even though only 6 pixels per row have been changed, we write the
        // full 8 pixel registers.
        transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1 {v20.8h}, [x9], x1
        st1 {v24.8h}, [x0], x1
        st1 {v21.8h}, [x9], x1
        st1 {v25.8h}, [x0], x1
        st1 {v22.8h}, [x9], x1
        st1 {v26.8h}, [x0], x1
        st1 {v23.8h}, [x9], x1
        st1 {v27.8h}, [x0], x1
        sub x0, x0, x1, lsl #3
        add x0, x0, #8
        br x10
6:
        // If we didn't need to do the flat8in part, we use the same writeback
        // as in loop_filter_h_4_8.
        add x9, x9, #4
        add x0, x9, x1, lsl #2
        transpose_4x8H v22, v23, v24, v25, v26, v27, v28, v29
        st1 {v22.d}[0], [x9], x1
        st1 {v22.d}[1], [x0], x1
        st1 {v23.d}[0], [x9], x1
        st1 {v23.d}[1], [x0], x1
        st1 {v24.d}[0], [x9], x1
        st1 {v24.d}[1], [x0], x1
        st1 {v25.d}[0], [x9], x1
        st1 {v25.d}[1], [x0], x1
        sub x0, x0, x1, lsl #3
        add x0, x0, #4
        br x10
endfunc

bpp_frontends vp9_loop_filter_h_8_8

bpp_frontends_mix2 4, 4
bpp_frontends_mix2 4, 8
bpp_frontends_mix2 8, 4
bpp_frontends_mix2 8, 8

function vp9_loop_filter_v_16_8_16_neon
        mov x10, x30
        sub x9, x0, x1, lsl #3
        ld1 {v16.8h}, [x9], x1 // p7
        ld1 {v24.8h}, [x0], x1 // q0
        ld1 {v17.8h}, [x9], x1 // p6
        ld1 {v25.8h}, [x0], x1 // q1
        ld1 {v18.8h}, [x9], x1 // p5
        ld1 {v26.8h}, [x0], x1 // q2
        ld1 {v19.8h}, [x9], x1 // p4
        ld1 {v27.8h}, [x0], x1 // q3
        ld1 {v20.8h}, [x9], x1 // p3
        ld1 {v28.8h}, [x0], x1 // q4
        ld1 {v21.8h}, [x9], x1 // p2
        ld1 {v29.8h}, [x0], x1 // q5
        ld1 {v22.8h}, [x9], x1 // p1
        ld1 {v30.8h}, [x0], x1 // q6
        ld1 {v23.8h}, [x9], x1 // p0
        ld1 {v31.8h}, [x0], x1 // q7
        sub x9, x9, x1, lsl #3
        sub x0, x0, x1, lsl #3
        add x9, x9, x1

        loop_filter_16

        // If we did the flat8out part, we get the output in
        // v2-v17 (skipping v7 and v16). x9 points to x0 - 7 * stride,
        // store v2-v9 there, and v10-v17 into x0.
        st1 {v2.8h}, [x9], x1
        st1 {v10.8h}, [x0], x1
        st1 {v3.8h}, [x9], x1
        st1 {v11.8h}, [x0], x1
        st1 {v4.8h}, [x9], x1
        st1 {v12.8h}, [x0], x1
        st1 {v5.8h}, [x9], x1
        st1 {v13.8h}, [x0], x1
        st1 {v6.8h}, [x9], x1
        st1 {v14.8h}, [x0], x1
        st1 {v8.8h}, [x9], x1
        st1 {v15.8h}, [x0], x1
        st1 {v9.8h}, [x9], x1
        st1 {v17.8h}, [x0], x1
        sub x0, x0, x1, lsl #3
        add x0, x0, x1

        br x10
8:
        add x9, x9, x1, lsl #2
        // If we didn't do the flat8out part, the output is left in the
        // input registers.
        st1 {v21.8h}, [x9], x1
        st1 {v24.8h}, [x0], x1
        st1 {v22.8h}, [x9], x1
        st1 {v25.8h}, [x0], x1
        st1 {v23.8h}, [x9], x1
        st1 {v26.8h}, [x0], x1
        sub x0, x0, x1, lsl #1
        sub x0, x0, x1
        br x10
7:
        sub x9, x0, x1, lsl #1
        st1 {v22.8h}, [x9], x1
        st1 {v24.8h}, [x0], x1
        st1 {v23.8h}, [x9], x1
        st1 {v25.8h}, [x0], x1
        sub x0, x0, x1, lsl #1
        br x10
endfunc

bpp_frontends vp9_loop_filter_v_16_8, push=1
bpp_frontends_rep vp9_loop_filter_v_16, 16, 8, v, push=1

function vp9_loop_filter_h_16_8_16_neon
        mov x10, x30
        sub x9, x0, #16
        ld1 {v16.8h}, [x9], x1
        ld1 {v24.8h}, [x0], x1
        ld1 {v17.8h}, [x9], x1
        ld1 {v25.8h}, [x0], x1
        ld1 {v18.8h}, [x9], x1
        ld1 {v26.8h}, [x0], x1
        ld1 {v19.8h}, [x9], x1
        ld1 {v27.8h}, [x0], x1
        ld1 {v20.8h}, [x9], x1
        ld1 {v28.8h}, [x0], x1
        ld1 {v21.8h}, [x9], x1
        ld1 {v29.8h}, [x0], x1
        ld1 {v22.8h}, [x9], x1
        ld1 {v30.8h}, [x0], x1
        ld1 {v23.8h}, [x9], x1
        ld1 {v31.8h}, [x0], x1
        sub x0, x0, x1, lsl #3
        sub x9, x9, x1, lsl #3

        // The 16x8 pixels read above are in two 8x8 blocks; the left
        // half in v16-v23, and the right half in v24-v31. Do two 8x8 transposes
        // of this, to get one column per register.
        transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
        transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v0, v1

        loop_filter_16

        transpose_8x8H v16, v2, v3, v4, v5, v6, v8, v9, v0, v1
        transpose_8x8H v10, v11, v12, v13, v14, v15, v17, v31, v0, v1

        st1 {v16.8h}, [x9], x1
        st1 {v10.8h}, [x0], x1
        st1 {v2.8h}, [x9], x1
        st1 {v11.8h}, [x0], x1
        st1 {v3.8h}, [x9], x1
        st1 {v12.8h}, [x0], x1
        st1 {v4.8h}, [x9], x1
        st1 {v13.8h}, [x0], x1
        st1 {v5.8h}, [x9], x1
        st1 {v14.8h}, [x0], x1
        st1 {v6.8h}, [x9], x1
        st1 {v15.8h}, [x0], x1
        st1 {v8.8h}, [x9], x1
        st1 {v17.8h}, [x0], x1
        st1 {v9.8h}, [x9], x1
        st1 {v31.8h}, [x0], x1
        sub x0, x0, x1, lsl #3

        br x10
8:
        // The same writeback as in loop_filter_h_8_8
        sub x9, x0, #8
        add x0, x9, x1, lsl #2
        transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1 {v20.8h}, [x9], x1
        st1 {v24.8h}, [x0], x1
        st1 {v21.8h}, [x9], x1
        st1 {v25.8h}, [x0], x1
        st1 {v22.8h}, [x9], x1
        st1 {v26.8h}, [x0], x1
        st1 {v23.8h}, [x9], x1
        st1 {v27.8h}, [x0], x1
        sub x0, x0, x1, lsl #3
        add x0, x0, #8
        br x10
7:
        // The same writeback as in loop_filter_h_4_8
        sub x9, x0, #4
        add x0, x9, x1, lsl #2
        transpose_4x8H v22, v23, v24, v25, v26, v27, v28, v29
        st1 {v22.d}[0], [x9], x1
        st1 {v22.d}[1], [x0], x1
        st1 {v23.d}[0], [x9], x1
        st1 {v23.d}[1], [x0], x1
        st1 {v24.d}[0], [x9], x1
        st1 {v24.d}[1], [x0], x1
        st1 {v25.d}[0], [x9], x1
        st1 {v25.d}[1], [x0], x1
        sub x0, x0, x1, lsl #3
        add x0, x0, #4
        br x10
endfunc

bpp_frontends vp9_loop_filter_h_16_8, push=1
bpp_frontends_rep vp9_loop_filter_h_16, 16, 8, h, push=1
@@ -0,0 +1,631 @@
/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

// All public functions in this file have the following signature:
// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
//                             const uint8_t *ref, ptrdiff_t ref_stride,
//                             int h, int mx, int my);

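// Note that in this 16 bpp version each pixel is two bytes wide; e.g.
// ff_vp9_copy128_aarch64 below copies 128 bytes per row, i.e. one row
// of a 64 pixel wide block.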
function ff_vp9_copy128_aarch64, export=1
1:
        ldp x5, x6, [x2]
        ldp x7, x8, [x2, #16]
        stp x5, x6, [x0]
        ldp x9, x10, [x2, #32]
        stp x7, x8, [x0, #16]
        subs w4, w4, #1
        ldp x11, x12, [x2, #48]
        stp x9, x10, [x0, #32]
        stp x11, x12, [x0, #48]
        ldp x5, x6, [x2, #64]
        ldp x7, x8, [x2, #80]
        stp x5, x6, [x0, #64]
        ldp x9, x10, [x2, #96]
        stp x7, x8, [x0, #80]
        ldp x11, x12, [x2, #112]
        stp x9, x10, [x0, #96]
        stp x11, x12, [x0, #112]
        add x2, x2, x3
        add x0, x0, x1
        b.ne 1b
        ret
endfunc

function ff_vp9_avg64_16_neon, export=1
        mov x5, x0
        sub x1, x1, #64
        sub x3, x3, #64
1:
        ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3
        urhadd v0.8h, v0.8h, v4.8h
        urhadd v1.8h, v1.8h, v5.8h
        ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1
        urhadd v2.8h, v2.8h, v6.8h
        urhadd v3.8h, v3.8h, v7.8h
        subs w4, w4, #1
        urhadd v16.8h, v16.8h, v20.8h
        urhadd v17.8h, v17.8h, v21.8h
        st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x5], #64
        urhadd v18.8h, v18.8h, v22.8h
        urhadd v19.8h, v19.8h, v23.8h
        st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x1
        b.ne 1b
        ret
endfunc

function ff_vp9_avg32_16_neon, export=1
        mov x5, x0
1:
        ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x3
        ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], x3
        urhadd v0.8h, v0.8h, v4.8h
        urhadd v1.8h, v1.8h, v5.8h
        ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x1
        urhadd v2.8h, v2.8h, v6.8h
        urhadd v3.8h, v3.8h, v7.8h
        subs w4, w4, #2
        urhadd v16.8h, v16.8h, v20.8h
        urhadd v17.8h, v17.8h, v21.8h
        st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x5], x1
        urhadd v18.8h, v18.8h, v22.8h
        urhadd v19.8h, v19.8h, v23.8h
        st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x1
        b.ne 1b
        ret
endfunc

function ff_vp9_avg16_16_neon, export=1
1:
        ld1 {v2.8h, v3.8h}, [x2], x3
        ld1 {v0.8h, v1.8h}, [x0]
        urhadd v0.8h, v0.8h, v2.8h
        urhadd v1.8h, v1.8h, v3.8h
        subs w4, w4, #1
        st1 {v0.8h, v1.8h}, [x0], x1
        b.ne 1b
        ret
endfunc

function ff_vp9_avg8_16_neon, export=1
        mov x5, x0
1:
        ld1 {v2.8h}, [x2], x3
        ld1 {v0.8h}, [x0], x1
        ld1 {v3.8h}, [x2], x3
        urhadd v0.8h, v0.8h, v2.8h
        ld1 {v1.8h}, [x0], x1
        urhadd v1.8h, v1.8h, v3.8h
        subs w4, w4, #2
        st1 {v0.8h}, [x5], x1
        st1 {v1.8h}, [x5], x1
        b.ne 1b
        ret
endfunc

function ff_vp9_avg4_16_neon, export=1
        mov x5, x0
1:
        ld1 {v2.4h}, [x2], x3
        ld1 {v0.4h}, [x0], x1
        ld1 {v3.4h}, [x2], x3
        urhadd v0.4h, v0.4h, v2.4h
        ld1 {v1.4h}, [x0], x1
        urhadd v1.4h, v1.4h, v3.4h
        subs w4, w4, #2
        st1 {v0.4h}, [x5], x1
        st1 {v1.4h}, [x5], x1
        b.ne 1b
        ret
endfunc


// Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
// for size >= 16), and multiply-accumulate into dst1 and dst5 (or
// dst1-dst2 and dst5-dst6 for size >= 8 and dst1-dst4 and dst5-dst8
// for size >= 16)
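// Each ext shifts the concatenated source pair left by \offset 16-bit
// pixels (2*\offset bytes), so that lane 0 of the extracted vector is
// the input sample matching filter tap \offset.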
.macro extmlal dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, src1, src2, src3, src4, src5, src6, offset, size
        ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
        ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
        smlal \dst1\().4s, v20.4h, v0.h[\offset]
        smlal \dst5\().4s, v22.4h, v0.h[\offset]
        .if \size >= 16
        ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
        ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
        .endif
        .if \size >= 8
        smlal2 \dst2\().4s, v20.8h, v0.h[\offset]
        smlal2 \dst6\().4s, v22.8h, v0.h[\offset]
        .endif
        .if \size >= 16
        smlal \dst3\().4s, v21.4h, v0.h[\offset]
        smlal \dst7\().4s, v23.4h, v0.h[\offset]
        smlal2 \dst4\().4s, v21.8h, v0.h[\offset]
        smlal2 \dst8\().4s, v23.8h, v0.h[\offset]
        .endif
.endm


// Instantiate a horizontal filter function for the given size.
// This can work on 4, 8 or 16 pixels in parallel; for larger
// widths it will do 16 pixels at a time and loop horizontally.
// The actual width (in bytes) is passed in x5, the height in w4 and
// the filter coefficients in x9.
.macro do_8tap_h type, size
function \type\()_8tap_\size\()h
        sub x2, x2, #6
        add x6, x0, x1
        add x7, x2, x3
        add x1, x1, x1
        add x3, x3, x3
        // Only size >= 16 loops horizontally and needs
        // reduced dst stride
        .if \size >= 16
        sub x1, x1, x5
        .endif
        // size >= 16 loads two qwords and increments x2,
        // for size 4/8 one qword is enough, with no
        // postincrement
        .if \size >= 16
        sub x3, x3, x5
        sub x3, x3, #16
        .endif
        // Load the filter vector
        ld1 {v0.8h}, [x9]
1:
        .if \size >= 16
        mov x9, x5
        .endif
        // Load src
        .if \size >= 16
        ld1 {v5.8h, v6.8h, v7.8h}, [x2], #48
        ld1 {v16.8h, v17.8h, v18.8h}, [x7], #48
        .else
        ld1 {v5.8h, v6.8h}, [x2]
        ld1 {v16.8h, v17.8h}, [x7]
        .endif
2:

        smull v1.4s, v5.4h, v0.h[0]
        smull v24.4s, v16.4h, v0.h[0]
        .if \size >= 8
        smull2 v2.4s, v5.8h, v0.h[0]
        smull2 v25.4s, v16.8h, v0.h[0]
        .endif
        .if \size >= 16
        smull v3.4s, v6.4h, v0.h[0]
        smull v26.4s, v17.4h, v0.h[0]
        smull2 v4.4s, v6.8h, v0.h[0]
        smull2 v27.4s, v17.8h, v0.h[0]
        .endif
        extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 1, \size
        extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 2, \size
        extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 3, \size
        extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 4, \size
        extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 5, \size
        extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 6, \size
        extmlal v1, v2, v3, v4, v24, v25, v26, v27, v5, v6, v7, v16, v17, v18, 7, \size

        // Round, shift and saturate.
        // The sqrshrun takes care of clamping negative values to zero, but
        // we still need to manually clamp against the max pixel value, with umin.
        sqrshrun v1.4h, v1.4s, #7
        sqrshrun v24.4h, v24.4s, #7
        .if \size >= 8
        sqrshrun2 v1.8h, v2.4s, #7
        sqrshrun2 v24.8h, v25.4s, #7
        umin v1.8h, v1.8h, v31.8h
        umin v24.8h, v24.8h, v31.8h
        .if \size >= 16
        sqrshrun v2.4h, v3.4s, #7
        sqrshrun v25.4h, v26.4s, #7
        sqrshrun2 v2.8h, v4.4s, #7
        sqrshrun2 v25.8h, v27.4s, #7
        umin v2.8h, v2.8h, v31.8h
        umin v25.8h, v25.8h, v31.8h
        .endif
        .else
        umin v1.4h, v1.4h, v31.4h
        umin v24.4h, v24.4h, v31.4h
        .endif
        // Average
        .ifc \type,avg
        .if \size >= 16
        ld1 {v3.8h, v4.8h}, [x0]
        ld1 {v29.8h, v30.8h}, [x6]
        urhadd v1.8h, v1.8h, v3.8h
        urhadd v2.8h, v2.8h, v4.8h
        urhadd v24.8h, v24.8h, v29.8h
        urhadd v25.8h, v25.8h, v30.8h
        .elseif \size >= 8
        ld1 {v3.8h}, [x0]
        ld1 {v4.8h}, [x6]
        urhadd v1.8h, v1.8h, v3.8h
        urhadd v24.8h, v24.8h, v4.8h
        .else
        ld1 {v3.4h}, [x0]
        ld1 {v4.4h}, [x6]
        urhadd v1.4h, v1.4h, v3.4h
        urhadd v24.4h, v24.4h, v4.4h
        .endif
        .endif
        // Store and loop horizontally (for size >= 16)
        .if \size >= 16
        subs x9, x9, #32
        st1 {v1.8h, v2.8h}, [x0], #32
        st1 {v24.8h, v25.8h}, [x6], #32
        b.eq 3f
        mov v5.16b, v7.16b
        mov v16.16b, v18.16b
        ld1 {v6.8h, v7.8h}, [x2], #32
        ld1 {v17.8h, v18.8h}, [x7], #32
        b 2b
        .elseif \size == 8
        st1 {v1.8h}, [x0]
        st1 {v24.8h}, [x6]
        .else // \size == 4
        st1 {v1.4h}, [x0]
        st1 {v24.4h}, [x6]
        .endif
3:
        // Loop vertically
        add x0, x0, x1
        add x6, x6, x1
        add x2, x2, x3
        add x7, x7, x3
        subs w4, w4, #2
        b.ne 1b
        ret
endfunc
.endm

.macro do_8tap_h_size size
        do_8tap_h put, \size
        do_8tap_h avg, \size
.endm

do_8tap_h_size 4
do_8tap_h_size 8
do_8tap_h_size 16

.macro do_8tap_h_func type, filter, offset, size, bpp
function ff_vp9_\type\()_\filter\()\size\()_h_\bpp\()_neon, export=1
        mvni v31.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
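        // For \bpp=10 this mvni materializes 0x03ff in every lane of v31,
        // and for \bpp=12 it gives 0x0fff, i.e. the (1 << \bpp) - 1 value
        // used for clamping above.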
        movrel x6, X(ff_vp9_subpel_filters), 256*\offset
        cmp w5, #8
        add x9, x6, w5, uxtw #4
        mov x5, #2*\size
        .if \size >= 16
        b \type\()_8tap_16h
        .else
        b \type\()_8tap_\size\()h
        .endif
endfunc
.endm

.macro do_8tap_h_filters size, bpp
        do_8tap_h_func put, regular, 1, \size, \bpp
        do_8tap_h_func avg, regular, 1, \size, \bpp
        do_8tap_h_func put, sharp, 2, \size, \bpp
        do_8tap_h_func avg, sharp, 2, \size, \bpp
        do_8tap_h_func put, smooth, 0, \size, \bpp
        do_8tap_h_func avg, smooth, 0, \size, \bpp
.endm

.macro do_8tap_h_filters_bpp bpp
        do_8tap_h_filters 64, \bpp
        do_8tap_h_filters 32, \bpp
        do_8tap_h_filters 16, \bpp
        do_8tap_h_filters 8, \bpp
        do_8tap_h_filters 4, \bpp
.endm

do_8tap_h_filters_bpp 10
do_8tap_h_filters_bpp 12


// Vertical filters

// Round, shift and saturate and store reg1-reg4
.macro do_store4 reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, minreg, type
        sqrshrun \reg1\().4h, \reg1\().4s, #7
        sqrshrun \reg2\().4h, \reg2\().4s, #7
        sqrshrun \reg3\().4h, \reg3\().4s, #7
        sqrshrun \reg4\().4h, \reg4\().4s, #7
        .ifc \type,avg
        ld1 {\tmp1\().4h}, [x7], x1
        ld1 {\tmp2\().4h}, [x7], x1
        ld1 {\tmp3\().4h}, [x7], x1
        ld1 {\tmp4\().4h}, [x7], x1
        .endif
        umin \reg1\().4h, \reg1\().4h, \minreg\().4h
        umin \reg2\().4h, \reg2\().4h, \minreg\().4h
        umin \reg3\().4h, \reg3\().4h, \minreg\().4h
        umin \reg4\().4h, \reg4\().4h, \minreg\().4h
        .ifc \type,avg
        urhadd \reg1\().4h, \reg1\().4h, \tmp1\().4h
        urhadd \reg2\().4h, \reg2\().4h, \tmp2\().4h
        urhadd \reg3\().4h, \reg3\().4h, \tmp3\().4h
        urhadd \reg4\().4h, \reg4\().4h, \tmp4\().4h
        .endif
        st1 {\reg1\().4h}, [x0], x1
        st1 {\reg2\().4h}, [x0], x1
        st1 {\reg3\().4h}, [x0], x1
        st1 {\reg4\().4h}, [x0], x1
.endm

// Round, shift and saturate and store reg1-8, where
// reg1-2, reg3-4 etc pairwise correspond to 4 rows.
.macro do_store8 reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, minreg, type
        sqrshrun \reg1\().4h, \reg1\().4s, #7
        sqrshrun2 \reg1\().8h, \reg2\().4s, #7
        sqrshrun \reg2\().4h, \reg3\().4s, #7
        sqrshrun2 \reg2\().8h, \reg4\().4s, #7
        sqrshrun \reg3\().4h, \reg5\().4s, #7
        sqrshrun2 \reg3\().8h, \reg6\().4s, #7
        sqrshrun \reg4\().4h, \reg7\().4s, #7
        sqrshrun2 \reg4\().8h, \reg8\().4s, #7
        .ifc \type,avg
        ld1 {\reg5\().8h}, [x7], x1
        ld1 {\reg6\().8h}, [x7], x1
        ld1 {\reg7\().8h}, [x7], x1
        ld1 {\reg8\().8h}, [x7], x1
        .endif
        umin \reg1\().8h, \reg1\().8h, \minreg\().8h
        umin \reg2\().8h, \reg2\().8h, \minreg\().8h
        umin \reg3\().8h, \reg3\().8h, \minreg\().8h
        umin \reg4\().8h, \reg4\().8h, \minreg\().8h
        .ifc \type,avg
        urhadd \reg1\().8h, \reg1\().8h, \reg5\().8h
        urhadd \reg2\().8h, \reg2\().8h, \reg6\().8h
        urhadd \reg3\().8h, \reg3\().8h, \reg7\().8h
        urhadd \reg4\().8h, \reg4\().8h, \reg8\().8h
        .endif
        st1 {\reg1\().8h}, [x0], x1
        st1 {\reg2\().8h}, [x0], x1
        st1 {\reg3\().8h}, [x0], x1
        st1 {\reg4\().8h}, [x0], x1
.endm

// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
// (src1-src8 into dst1, src2-src9 into dst2).
.macro convolve4 dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, tmp1, tmp2
        smull \dst1\().4s, \src1\().4h, v0.h[0]
        smull \dst2\().4s, \src2\().4h, v0.h[0]
        smull \tmp1\().4s, \src2\().4h, v0.h[1]
        smull \tmp2\().4s, \src3\().4h, v0.h[1]
        smlal \dst1\().4s, \src3\().4h, v0.h[2]
        smlal \dst2\().4s, \src4\().4h, v0.h[2]
        smlal \tmp1\().4s, \src4\().4h, v0.h[3]
        smlal \tmp2\().4s, \src5\().4h, v0.h[3]
        smlal \dst1\().4s, \src5\().4h, v0.h[4]
        smlal \dst2\().4s, \src6\().4h, v0.h[4]
        smlal \tmp1\().4s, \src6\().4h, v0.h[5]
        smlal \tmp2\().4s, \src7\().4h, v0.h[5]
        smlal \dst1\().4s, \src7\().4h, v0.h[6]
        smlal \dst2\().4s, \src8\().4h, v0.h[6]
        smlal \tmp1\().4s, \src8\().4h, v0.h[7]
        smlal \tmp2\().4s, \src9\().4h, v0.h[7]
        add \dst1\().4s, \dst1\().4s, \tmp1\().4s
        add \dst2\().4s, \dst2\().4s, \tmp2\().4s
.endm

// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst4
// (src1-src8 into dst1-dst2, src2-src9 into dst3-dst4).
.macro convolve8 dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8, src9
        smull \dst1\().4s, \src1\().4h, v0.h[0]
        smull2 \dst2\().4s, \src1\().8h, v0.h[0]
        smull \dst3\().4s, \src2\().4h, v0.h[0]
        smull2 \dst4\().4s, \src2\().8h, v0.h[0]
        smlal \dst1\().4s, \src2\().4h, v0.h[1]
        smlal2 \dst2\().4s, \src2\().8h, v0.h[1]
        smlal \dst3\().4s, \src3\().4h, v0.h[1]
        smlal2 \dst4\().4s, \src3\().8h, v0.h[1]
        smlal \dst1\().4s, \src3\().4h, v0.h[2]
        smlal2 \dst2\().4s, \src3\().8h, v0.h[2]
        smlal \dst3\().4s, \src4\().4h, v0.h[2]
        smlal2 \dst4\().4s, \src4\().8h, v0.h[2]
        smlal \dst1\().4s, \src4\().4h, v0.h[3]
        smlal2 \dst2\().4s, \src4\().8h, v0.h[3]
        smlal \dst3\().4s, \src5\().4h, v0.h[3]
        smlal2 \dst4\().4s, \src5\().8h, v0.h[3]
        smlal \dst1\().4s, \src5\().4h, v0.h[4]
        smlal2 \dst2\().4s, \src5\().8h, v0.h[4]
        smlal \dst3\().4s, \src6\().4h, v0.h[4]
        smlal2 \dst4\().4s, \src6\().8h, v0.h[4]
        smlal \dst1\().4s, \src6\().4h, v0.h[5]
        smlal2 \dst2\().4s, \src6\().8h, v0.h[5]
        smlal \dst3\().4s, \src7\().4h, v0.h[5]
        smlal2 \dst4\().4s, \src7\().8h, v0.h[5]
        smlal \dst1\().4s, \src7\().4h, v0.h[6]
        smlal2 \dst2\().4s, \src7\().8h, v0.h[6]
        smlal \dst3\().4s, \src8\().4h, v0.h[6]
        smlal2 \dst4\().4s, \src8\().8h, v0.h[6]
        smlal \dst1\().4s, \src8\().4h, v0.h[7]
        smlal2 \dst2\().4s, \src8\().8h, v0.h[7]
        smlal \dst3\().4s, \src9\().4h, v0.h[7]
        smlal2 \dst4\().4s, \src9\().8h, v0.h[7]
.endm

// Instantiate a vertical filter function for filtering 8 pixels at a time.
// The height is passed in x4, the width in x5 and the filter coefficients
// in x6.
.macro do_8tap_8v type
function \type\()_8tap_8v
        sub x2, x2, x3, lsl #1
        sub x2, x2, x3
        ld1 {v0.8h}, [x6]
1:
        .ifc \type,avg
        mov x7, x0
        .endif
        mov x6, x4

        ld1 {v17.8h}, [x2], x3
        ld1 {v18.8h}, [x2], x3
        ld1 {v19.8h}, [x2], x3
        ld1 {v20.8h}, [x2], x3
        ld1 {v21.8h}, [x2], x3
        ld1 {v22.8h}, [x2], x3
        ld1 {v23.8h}, [x2], x3
2:
        ld1 {v24.8h}, [x2], x3
        ld1 {v25.8h}, [x2], x3
        ld1 {v26.8h}, [x2], x3
        ld1 {v27.8h}, [x2], x3

        convolve8 v2, v3, v4, v5, v17, v18, v19, v20, v21, v22, v23, v24, v25
        convolve8 v6, v7, v30, v31, v19, v20, v21, v22, v23, v24, v25, v26, v27
        do_store8 v2, v3, v4, v5, v6, v7, v30, v31, v1, \type

        subs x6, x6, #4
        b.eq 8f

        ld1 {v16.8h}, [x2], x3
        ld1 {v17.8h}, [x2], x3
        ld1 {v18.8h}, [x2], x3
        ld1 {v19.8h}, [x2], x3
        convolve8 v2, v3, v4, v5, v21, v22, v23, v24, v25, v26, v27, v16, v17
        convolve8 v6, v7, v20, v21, v23, v24, v25, v26, v27, v16, v17, v18, v19
        do_store8 v2, v3, v4, v5, v6, v7, v20, v21, v1, \type

        subs x6, x6, #4
        b.eq 8f

        ld1 {v20.8h}, [x2], x3
        ld1 {v21.8h}, [x2], x3
        ld1 {v22.8h}, [x2], x3
        ld1 {v23.8h}, [x2], x3
        convolve8 v2, v3, v4, v5, v25, v26, v27, v16, v17, v18, v19, v20, v21
        convolve8 v6, v7, v24, v25, v27, v16, v17, v18, v19, v20, v21, v22, v23
        do_store8 v2, v3, v4, v5, v6, v7, v24, v25, v1, \type

        subs x6, x6, #4
        b.ne 2b

8:
        subs x5, x5, #8
        b.eq 9f
        // x0 -= h * dst_stride
        msub x0, x1, x4, x0
        // x2 -= h * src_stride
        msub x2, x3, x4, x2
        // x2 -= 8 * src_stride
        sub x2, x2, x3, lsl #3
        // x2 += 1 * src_stride
        add x2, x2, x3
        add x2, x2, #16
        add x0, x0, #16
        b 1b
9:
        ret
endfunc
.endm

do_8tap_8v put
do_8tap_8v avg


// Instantiate a vertical filter function for filtering a 4 pixel wide
// slice. This is only designed to work for 4 or 8 output lines.
.macro do_8tap_4v type
function \type\()_8tap_4v
        sub x2, x2, x3, lsl #1
        sub x2, x2, x3
        ld1 {v0.8h}, [x6]
        .ifc \type,avg
        mov x7, x0
        .endif

        ld1 {v16.4h}, [x2], x3
        ld1 {v17.4h}, [x2], x3
        ld1 {v18.4h}, [x2], x3
        ld1 {v19.4h}, [x2], x3
        ld1 {v20.4h}, [x2], x3
        ld1 {v21.4h}, [x2], x3
        ld1 {v22.4h}, [x2], x3
        ld1 {v23.4h}, [x2], x3
        ld1 {v24.4h}, [x2], x3
        ld1 {v25.4h}, [x2], x3
        ld1 {v26.4h}, [x2], x3

        convolve4 v2, v3, v16, v17, v18, v19, v20, v21, v22, v23, v24, v30, v31
        convolve4 v4, v5, v18, v19, v20, v21, v22, v23, v24, v25, v26, v30, v31
        do_store4 v2, v3, v4, v5, v28, v29, v30, v31, v1, \type

        subs x4, x4, #4
        b.eq 9f

        ld1 {v27.4h}, [x2], x3
        ld1 {v28.4h}, [x2], x3
        ld1 {v29.4h}, [x2], x3
        ld1 {v30.4h}, [x2], x3

        convolve4 v2, v3, v20, v21, v22, v23, v24, v25, v26, v27, v28, v16, v17
        convolve4 v4, v5, v22, v23, v24, v25, v26, v27, v28, v29, v30, v16, v17
        do_store4 v2, v3, v4, v5, v16, v17, v18, v19, v1, \type

9:
        ret
endfunc
.endm

do_8tap_4v put
do_8tap_4v avg


.macro do_8tap_v_func type, filter, offset, size, bpp
function ff_vp9_\type\()_\filter\()\size\()_v_\bpp\()_neon, export=1
        uxtw x4, w4
        mvni v1.8h, #((0xff << (\bpp - 8)) & 0xff), lsl #8
        movrel x5, X(ff_vp9_subpel_filters), 256*\offset
        add x6, x5, w6, uxtw #4
        mov x5, #\size
        .if \size >= 8
        b \type\()_8tap_8v
        .else
        b \type\()_8tap_4v
        .endif
endfunc
.endm

.macro do_8tap_v_filters size, bpp
        do_8tap_v_func put, regular, 1, \size, \bpp
        do_8tap_v_func avg, regular, 1, \size, \bpp
        do_8tap_v_func put, sharp, 2, \size, \bpp
        do_8tap_v_func avg, sharp, 2, \size, \bpp
        do_8tap_v_func put, smooth, 0, \size, \bpp
        do_8tap_v_func avg, smooth, 0, \size, \bpp
.endm

.macro do_8tap_v_filters_bpp bpp
        do_8tap_v_filters 64, \bpp
        do_8tap_v_filters 32, \bpp
        do_8tap_v_filters 16, \bpp
        do_8tap_v_filters 8, \bpp
        do_8tap_v_filters 4, \bpp
.endm

do_8tap_v_filters_bpp 10
do_8tap_v_filters_bpp 12
@@ -0,0 +1,687 @@
/*
 * Copyright (c) 2016 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

// All public functions in this file have the following signature:
// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
//                             const uint8_t *ref, ptrdiff_t ref_stride,
//                             int h, int mx, int my);

function ff_vp9_copy64_aarch64, export=1
1:
        ldp x5, x6, [x2]
        ldp x7, x8, [x2, #16]
        stp x5, x6, [x0]
        ldp x9, x10, [x2, #32]
        stp x7, x8, [x0, #16]
        subs w4, w4, #1
        ldp x11, x12, [x2, #48]
        stp x9, x10, [x0, #32]
        stp x11, x12, [x0, #48]
        add x2, x2, x3
        add x0, x0, x1
        b.ne 1b
        ret
endfunc

function ff_vp9_avg64_neon, export=1
        mov x5, x0
1:
        ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x2], x3
        ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3
        urhadd v0.16b, v0.16b, v4.16b
        urhadd v1.16b, v1.16b, v5.16b
        ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
        urhadd v2.16b, v2.16b, v6.16b
        urhadd v3.16b, v3.16b, v7.16b
        subs w4, w4, #2
        urhadd v16.16b, v16.16b, v20.16b
        urhadd v17.16b, v17.16b, v21.16b
        st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x5], x1
        urhadd v18.16b, v18.16b, v22.16b
        urhadd v19.16b, v19.16b, v23.16b
        st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x5], x1
        b.ne 1b
        ret
endfunc

function ff_vp9_copy32_aarch64, export=1
1:
        ldp x5, x6, [x2]
        ldp x7, x8, [x2, #16]
        stp x5, x6, [x0]
        subs w4, w4, #1
        stp x7, x8, [x0, #16]
        add x2, x2, x3
        add x0, x0, x1
        b.ne 1b
        ret
endfunc

function ff_vp9_avg32_neon, export=1
1:
        ld1 {v2.16b, v3.16b}, [x2], x3
        ld1 {v0.16b, v1.16b}, [x0]
        urhadd v0.16b, v0.16b, v2.16b
        urhadd v1.16b, v1.16b, v3.16b
        subs w4, w4, #1
        st1 {v0.16b, v1.16b}, [x0], x1
        b.ne 1b
        ret
endfunc

function ff_vp9_copy16_neon, export=1
        add x5, x0, x1
        lsl x1, x1, #1
        add x6, x2, x3
        lsl x3, x3, #1
1:
        ld1 {v0.16b}, [x2], x3
        ld1 {v1.16b}, [x6], x3
        ld1 {v2.16b}, [x2], x3
        ld1 {v3.16b}, [x6], x3
        subs w4, w4, #4
        st1 {v0.16b}, [x0], x1
        st1 {v1.16b}, [x5], x1
        st1 {v2.16b}, [x0], x1
        st1 {v3.16b}, [x5], x1
        b.ne 1b
        ret
endfunc

function ff_vp9_avg16_neon, export=1
        mov x5, x0
1:
        ld1 {v2.16b}, [x2], x3
        ld1 {v0.16b}, [x0], x1
        ld1 {v3.16b}, [x2], x3
        urhadd v0.16b, v0.16b, v2.16b
        ld1 {v1.16b}, [x0], x1
        urhadd v1.16b, v1.16b, v3.16b
        subs w4, w4, #2
        st1 {v0.16b}, [x5], x1
        st1 {v1.16b}, [x5], x1
        b.ne 1b
        ret
endfunc

function ff_vp9_copy8_neon, export=1
1:
        ld1 {v0.8b}, [x2], x3
        ld1 {v1.8b}, [x2], x3
        subs w4, w4, #2
        st1 {v0.8b}, [x0], x1
        st1 {v1.8b}, [x0], x1
        b.ne 1b
        ret
endfunc

function ff_vp9_avg8_neon, export=1
        mov x5, x0
1:
        ld1 {v2.8b}, [x2], x3
        ld1 {v0.8b}, [x0], x1
        ld1 {v3.8b}, [x2], x3
        urhadd v0.8b, v0.8b, v2.8b
        ld1 {v1.8b}, [x0], x1
        urhadd v1.8b, v1.8b, v3.8b
        subs w4, w4, #2
        st1 {v0.8b}, [x5], x1
        st1 {v1.8b}, [x5], x1
        b.ne 1b
        ret
endfunc

function ff_vp9_copy4_neon, export=1
1:
        ld1 {v0.s}[0], [x2], x3
        ld1 {v1.s}[0], [x2], x3
        st1 {v0.s}[0], [x0], x1
        ld1 {v2.s}[0], [x2], x3
        st1 {v1.s}[0], [x0], x1
        ld1 {v3.s}[0], [x2], x3
        subs w4, w4, #4
        st1 {v2.s}[0], [x0], x1
        st1 {v3.s}[0], [x0], x1
        b.ne 1b
        ret
endfunc

function ff_vp9_avg4_neon, export=1
        mov x5, x0
1:
        ld1 {v2.s}[0], [x2], x3
        ld1 {v0.s}[0], [x0], x1
        ld1 {v2.s}[1], [x2], x3
        ld1 {v0.s}[1], [x0], x1
        ld1 {v3.s}[0], [x2], x3
        ld1 {v1.s}[0], [x0], x1
        ld1 {v3.s}[1], [x2], x3
        ld1 {v1.s}[1], [x0], x1
        subs w4, w4, #4
        urhadd v0.8b, v0.8b, v2.8b
        urhadd v1.8b, v1.8b, v3.8b
        st1 {v0.s}[0], [x5], x1
        st1 {v0.s}[1], [x5], x1
        st1 {v1.s}[0], [x5], x1
        st1 {v1.s}[1], [x5], x1
        b.ne 1b
        ret
endfunc


// Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
// for size >= 16), and multiply-accumulate into dst1 and dst3 (or
// dst1-dst2 and dst3-dst4 for size >= 16)
.macro extmla dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
        ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
        ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
        .if \size >= 16
        mla \dst1\().8h, v20.8h, v0.h[\offset]
        ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
        mla \dst3\().8h, v22.8h, v0.h[\offset]
        ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
        mla \dst2\().8h, v21.8h, v0.h[\offset]
        mla \dst4\().8h, v23.8h, v0.h[\offset]
        .elseif \size == 8
        mla \dst1\().8h, v20.8h, v0.h[\offset]
        mla \dst3\().8h, v22.8h, v0.h[\offset]
        .else
        mla \dst1\().4h, v20.4h, v0.h[\offset]
        mla \dst3\().4h, v22.4h, v0.h[\offset]
        .endif
.endm
// The same as above, except that it doesn't accumulate straight into the
// destination, but uses a temp register and accumulates with saturation.
.macro extmulqadd dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
        ext v20.16b, \src1\().16b, \src2\().16b, #(2*\offset)
        ext v22.16b, \src4\().16b, \src5\().16b, #(2*\offset)
        .if \size >= 16
        mul v20.8h, v20.8h, v0.h[\offset]
        ext v21.16b, \src2\().16b, \src3\().16b, #(2*\offset)
        mul v22.8h, v22.8h, v0.h[\offset]
        ext v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
        mul v21.8h, v21.8h, v0.h[\offset]
        mul v23.8h, v23.8h, v0.h[\offset]
        .elseif \size == 8
        mul v20.8h, v20.8h, v0.h[\offset]
        mul v22.8h, v22.8h, v0.h[\offset]
        .else
        mul v20.4h, v20.4h, v0.h[\offset]
        mul v22.4h, v22.4h, v0.h[\offset]
        .endif
        .if \size == 4
        sqadd \dst1\().4h, \dst1\().4h, v20.4h
        sqadd \dst3\().4h, \dst3\().4h, v22.4h
        .else
        sqadd \dst1\().8h, \dst1\().8h, v20.8h
        sqadd \dst3\().8h, \dst3\().8h, v22.8h
        .if \size >= 16
        sqadd \dst2\().8h, \dst2\().8h, v21.8h
        sqadd \dst4\().8h, \dst4\().8h, v23.8h
        .endif
        .endif
.endm


// Instantiate a horizontal filter function for the given size.
// This can work on 4, 8 or 16 pixels in parallel; for larger
// widths it will do 16 pixels at a time and loop horizontally.
// The actual width is passed in x5, the height in w4 and the
// filter coefficients in x9. idx2 is the index of the largest
// filter coefficient (3 or 4) and idx1 is the other one of them.
.macro do_8tap_h type, size, idx1, idx2
function \type\()_8tap_\size\()h_\idx1\idx2
        sub x2, x2, #3
        add x6, x0, x1
        add x7, x2, x3
        add x1, x1, x1
        add x3, x3, x3
        // Only size >= 16 loops horizontally and needs
        // reduced dst stride
        .if \size >= 16
        sub x1, x1, x5
        .endif
        // size >= 16 loads two qwords and increments x2,
        // for size 4/8 one qword is enough, with no
        // postincrement
        .if \size >= 16
        sub x3, x3, x5
        sub x3, x3, #8
        .endif
        // Load the filter vector
        ld1 {v0.8h}, [x9]
1:
        .if \size >= 16
        mov x9, x5
        .endif
        // Load src
        .if \size >= 16
        ld1 {v4.8b, v5.8b, v6.8b}, [x2], #24
        ld1 {v16.8b, v17.8b, v18.8b}, [x7], #24
        .else
        ld1 {v4.8b, v5.8b}, [x2]
        ld1 {v16.8b, v17.8b}, [x7]
        .endif
        uxtl v4.8h, v4.8b
        uxtl v5.8h, v5.8b
        uxtl v16.8h, v16.8b
        uxtl v17.8h, v17.8b
        .if \size >= 16
        uxtl v6.8h, v6.8b
        uxtl v18.8h, v18.8b
        .endif
2:

        // Accumulate, adding idx2 last with a separate
        // saturating add. The positive filter coefficients
        // for all indices except idx2 must add up to less
        // than 127 for this not to overflow.
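        // (In the VP9 8-tap filters, the largest coefficient is always at
        // index 3 or 4; this is why separate _34 and _43 variants of each
        // function are instantiated further down.)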
        mul v1.8h, v4.8h, v0.h[0]
        mul v24.8h, v16.8h, v0.h[0]
        .if \size >= 16
        mul v2.8h, v5.8h, v0.h[0]
        mul v25.8h, v17.8h, v0.h[0]
        .endif
        extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 1, \size
        extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 2, \size
        extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, \idx1, \size
        extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 5, \size
        extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 6, \size
        extmla v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, 7, \size
        extmulqadd v1, v2, v24, v25, v4, v5, v6, v16, v17, v18, \idx2, \size

        // Round, shift and saturate
        sqrshrun v1.8b, v1.8h, #7
        sqrshrun v24.8b, v24.8h, #7
        .if \size >= 16
        sqrshrun2 v1.16b, v2.8h, #7
        sqrshrun2 v24.16b, v25.8h, #7
        .endif
        // Average
        .ifc \type,avg
        .if \size >= 16
        ld1 {v2.16b}, [x0]
        ld1 {v3.16b}, [x6]
        urhadd v1.16b, v1.16b, v2.16b
        urhadd v24.16b, v24.16b, v3.16b
        .elseif \size == 8
        ld1 {v2.8b}, [x0]
        ld1 {v3.8b}, [x6]
        urhadd v1.8b, v1.8b, v2.8b
        urhadd v24.8b, v24.8b, v3.8b
        .else
        ld1 {v2.s}[0], [x0]
        ld1 {v3.s}[0], [x6]
        urhadd v1.8b, v1.8b, v2.8b
        urhadd v24.8b, v24.8b, v3.8b
        .endif
        .endif
        // Store and loop horizontally (for size >= 16)
        .if \size >= 16
        subs x9, x9, #16
        st1 {v1.16b}, [x0], #16
        st1 {v24.16b}, [x6], #16
        b.eq 3f
        mov v4.16b, v6.16b
        mov v16.16b, v18.16b
        ld1 {v6.16b}, [x2], #16
        ld1 {v18.16b}, [x7], #16
        uxtl v5.8h, v6.8b
        uxtl2 v6.8h, v6.16b
        uxtl v17.8h, v18.8b
        uxtl2 v18.8h, v18.16b
        b 2b
        .elseif \size == 8
        st1 {v1.8b}, [x0]
        st1 {v24.8b}, [x6]
        .else // \size == 4
        st1 {v1.s}[0], [x0]
        st1 {v24.s}[0], [x6]
        .endif
3:
        // Loop vertically
        add x0, x0, x1
        add x6, x6, x1
        add x2, x2, x3
        add x7, x7, x3
        subs w4, w4, #2
        b.ne 1b
        ret
endfunc
.endm

.macro do_8tap_h_size size
        do_8tap_h put, \size, 3, 4
        do_8tap_h avg, \size, 3, 4
        do_8tap_h put, \size, 4, 3
        do_8tap_h avg, \size, 4, 3
.endm

do_8tap_h_size 4
do_8tap_h_size 8
do_8tap_h_size 16

.macro do_8tap_h_func type, filter, offset, size
function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
        movrel x6, X(ff_vp9_subpel_filters), 256*\offset
        cmp w5, #8
        add x9, x6, w5, uxtw #4
        mov x5, #\size
        .if \size >= 16
        b.ge \type\()_8tap_16h_34
        b \type\()_8tap_16h_43
        .else
        b.ge \type\()_8tap_\size\()h_34
        b \type\()_8tap_\size\()h_43
        .endif
endfunc
.endm

.macro do_8tap_h_filters size
        do_8tap_h_func put, regular, 1, \size
        do_8tap_h_func avg, regular, 1, \size
        do_8tap_h_func put, sharp, 2, \size
        do_8tap_h_func avg, sharp, 2, \size
        do_8tap_h_func put, smooth, 0, \size
        do_8tap_h_func avg, smooth, 0, \size
.endm

do_8tap_h_filters 64
do_8tap_h_filters 32
do_8tap_h_filters 16
do_8tap_h_filters 8
do_8tap_h_filters 4


// Vertical filters

// Round, shift and saturate and store reg1-reg2 over 4 lines
.macro do_store4 reg1, reg2, tmp1, tmp2, type
        sqrshrun \reg1\().8b, \reg1\().8h, #7
        sqrshrun \reg2\().8b, \reg2\().8h, #7
        .ifc \type,avg
        ld1 {\tmp1\().s}[0], [x7], x1
        ld1 {\tmp2\().s}[0], [x7], x1
        ld1 {\tmp1\().s}[1], [x7], x1
        ld1 {\tmp2\().s}[1], [x7], x1
        urhadd \reg1\().8b, \reg1\().8b, \tmp1\().8b
        urhadd \reg2\().8b, \reg2\().8b, \tmp2\().8b
        .endif
        st1 {\reg1\().s}[0], [x0], x1
        st1 {\reg2\().s}[0], [x0], x1
        st1 {\reg1\().s}[1], [x0], x1
        st1 {\reg2\().s}[1], [x0], x1
.endm

// Round, shift and saturate and store reg1-4
.macro do_store reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, type
        sqrshrun \reg1\().8b, \reg1\().8h, #7
        sqrshrun \reg2\().8b, \reg2\().8h, #7
        sqrshrun \reg3\().8b, \reg3\().8h, #7
        sqrshrun \reg4\().8b, \reg4\().8h, #7
        .ifc \type,avg
        ld1 {\tmp1\().8b}, [x7], x1
        ld1 {\tmp2\().8b}, [x7], x1
        ld1 {\tmp3\().8b}, [x7], x1
        ld1 {\tmp4\().8b}, [x7], x1
        urhadd \reg1\().8b, \reg1\().8b, \tmp1\().8b
        urhadd \reg2\().8b, \reg2\().8b, \tmp2\().8b
        urhadd \reg3\().8b, \reg3\().8b, \tmp3\().8b
        urhadd \reg4\().8b, \reg4\().8b, \tmp4\().8b
        .endif
        st1 {\reg1\().8b}, [x0], x1
        st1 {\reg2\().8b}, [x0], x1
        st1 {\reg3\().8b}, [x0], x1
        st1 {\reg4\().8b}, [x0], x1
.endm

// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
// (src1-src8 into dst1, src2-src9 into dst2), adding idx2 separately
// at the end with saturation. Indices 0 and 7 always have negative or zero
// coefficients, so they can be accumulated into tmp1-tmp2 together with the
// largest coefficient.
.macro convolve dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, idx1, idx2, tmp1, tmp2
        mul \dst1\().8h, \src2\().8h, v0.h[1]
        mul \dst2\().8h, \src3\().8h, v0.h[1]
        mul \tmp1\().8h, \src1\().8h, v0.h[0]
        mul \tmp2\().8h, \src2\().8h, v0.h[0]
        mla \dst1\().8h, \src3\().8h, v0.h[2]
        mla \dst2\().8h, \src4\().8h, v0.h[2]
        .if \idx1 == 3
        mla \dst1\().8h, \src4\().8h, v0.h[3]
        mla \dst2\().8h, \src5\().8h, v0.h[3]
        .else
        mla \dst1\().8h, \src5\().8h, v0.h[4]
        mla \dst2\().8h, \src6\().8h, v0.h[4]
        .endif
        mla \dst1\().8h, \src6\().8h, v0.h[5]
        mla \dst2\().8h, \src7\().8h, v0.h[5]
        mla \tmp1\().8h, \src8\().8h, v0.h[7]
        mla \tmp2\().8h, \src9\().8h, v0.h[7]
        mla \dst1\().8h, \src7\().8h, v0.h[6]
        mla \dst2\().8h, \src8\().8h, v0.h[6]
        .if \idx2 == 3
        mla \tmp1\().8h, \src4\().8h, v0.h[3]
        mla \tmp2\().8h, \src5\().8h, v0.h[3]
        .else
        mla \tmp1\().8h, \src5\().8h, v0.h[4]
        mla \tmp2\().8h, \src6\().8h, v0.h[4]
        .endif
        sqadd \dst1\().8h, \dst1\().8h, \tmp1\().8h
        sqadd \dst2\().8h, \dst2\().8h, \tmp2\().8h
.endm

// Load pixels and extend them to 16 bit
.macro loadl dst1, dst2, dst3, dst4
        ld1 {v1.8b}, [x2], x3
        ld1 {v2.8b}, [x2], x3
        ld1 {v3.8b}, [x2], x3
        .ifnb \dst4
        ld1 {v4.8b}, [x2], x3
        .endif
        uxtl \dst1\().8h, v1.8b
        uxtl \dst2\().8h, v2.8b
        uxtl \dst3\().8h, v3.8b
        .ifnb \dst4
        uxtl \dst4\().8h, v4.8b
        .endif
.endm

// Instantiate a vertical filter function for filtering 8 pixels at a time.
// The height is passed in x4, the width in x5 and the filter coefficients
// in x6. idx2 is the index of the largest filter coefficient (3 or 4)
// and idx1 is the other one of them.
.macro do_8tap_8v type, idx1, idx2
function \type\()_8tap_8v_\idx1\idx2
        sub x2, x2, x3, lsl #1
        sub x2, x2, x3
        ld1 {v0.8h}, [x6]
1:
        .ifc \type,avg
        mov x7, x0
        .endif
        mov x6, x4

        loadl v17, v18, v19

        loadl v20, v21, v22, v23
2:
        loadl v24, v25, v26, v27
        convolve v1, v2, v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v5, v6
        convolve v3, v4, v19, v20, v21, v22, v23, v24, v25, v26, v27, \idx1, \idx2, v5, v6
        do_store v1, v2, v3, v4, v5, v6, v7, v28, \type

        subs x6, x6, #4
        b.eq 8f

        loadl v16, v17, v18, v19
        convolve v1, v2, v21, v22, v23, v24, v25, v26, v27, v16, v17, \idx1, \idx2, v5, v6
        convolve v3, v4, v23, v24, v25, v26, v27, v16, v17, v18, v19, \idx1, \idx2, v5, v6
        do_store v1, v2, v3, v4, v5, v6, v7, v28, \type

        subs x6, x6, #4
        b.eq 8f

        loadl v20, v21, v22, v23
        convolve v1, v2, v25, v26, v27, v16, v17, v18, v19, v20, v21, \idx1, \idx2, v5, v6
        convolve v3, v4, v27, v16, v17, v18, v19, v20, v21, v22, v23, \idx1, \idx2, v5, v6
        do_store v1, v2, v3, v4, v5, v6, v7, v28, \type

        subs x6, x6, #4
        b.ne 2b

8:
        subs x5, x5, #8
        b.eq 9f
        // x0 -= h * dst_stride
        msub x0, x1, x4, x0
        // x2 -= h * src_stride
        msub x2, x3, x4, x2
        // x2 -= 8 * src_stride
        sub x2, x2, x3, lsl #3
        // x2 += 1 * src_stride
        add x2, x2, x3
        add x2, x2, #8
        add x0, x0, #8
        b 1b
9:
        ret
endfunc
.endm

do_8tap_8v put, 3, 4
do_8tap_8v put, 4, 3
do_8tap_8v avg, 3, 4
do_8tap_8v avg, 4, 3


// Instantiate a vertical filter function for filtering a 4 pixel wide
// slice. The first half of the registers contains one row, while the second
// half of a register contains the second-next row (also stored in the first
// half of the register two steps ahead). The convolution does two outputs
// at a time; the output of v17-v24 goes into one, and that of v18-v25 into
// the other. The first half of the first output is the first output row,
// and the first half of the other output is the second output row; the
// second halves of the registers are rows 3 and 4.
// This is only designed to work for 4 or 8 output lines.
.macro do_8tap_4v type, idx1, idx2
function \type\()_8tap_4v_\idx1\idx2
        sub x2, x2, x3, lsl #1
        sub x2, x2, x3
        ld1 {v0.8h}, [x6]
        .ifc \type,avg
        mov x7, x0
        .endif

        ld1 {v1.s}[0], [x2], x3
        ld1 {v2.s}[0], [x2], x3
        ld1 {v3.s}[0], [x2], x3
        ld1 {v4.s}[0], [x2], x3
        ld1 {v5.s}[0], [x2], x3
        ld1 {v6.s}[0], [x2], x3
        trn1 v1.2s, v1.2s, v3.2s
        ld1 {v7.s}[0], [x2], x3
        trn1 v2.2s, v2.2s, v4.2s
        ld1 {v26.s}[0], [x2], x3
        uxtl v17.8h, v1.8b
        trn1 v3.2s, v3.2s, v5.2s
        ld1 {v27.s}[0], [x2], x3
        uxtl v18.8h, v2.8b
        trn1 v4.2s, v4.2s, v6.2s
        ld1 {v28.s}[0], [x2], x3
        uxtl v19.8h, v3.8b
        trn1 v5.2s, v5.2s, v7.2s
        ld1 {v29.s}[0], [x2], x3
        uxtl v20.8h, v4.8b
        trn1 v6.2s, v6.2s, v26.2s
        uxtl v21.8h, v5.8b
        trn1 v7.2s, v7.2s, v27.2s
        uxtl v22.8h, v6.8b
        trn1 v26.2s, v26.2s, v28.2s
        uxtl v23.8h, v7.8b
        trn1 v27.2s, v27.2s, v29.2s
        uxtl v24.8h, v26.8b
        uxtl v25.8h, v27.8b

        convolve v1, v2, v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v3, v4
        do_store4 v1, v2, v5, v6, \type

        subs x4, x4, #4
        b.eq 9f

        ld1 {v1.s}[0], [x2], x3
        ld1 {v2.s}[0], [x2], x3
        trn1 v28.2s, v28.2s, v1.2s
        trn1 v29.2s, v29.2s, v2.2s
        ld1 {v1.s}[1], [x2], x3
        uxtl v26.8h, v28.8b
        ld1 {v2.s}[1], [x2], x3
        uxtl v27.8h, v29.8b
        uxtl v28.8h, v1.8b
        uxtl v29.8h, v2.8b

        convolve v1, v2, v21, v22, v23, v24, v25, v26, v27, v28, v29, \idx1, \idx2, v3, v4
        do_store4 v1, v2, v5, v6, \type

9:
        ret
endfunc
.endm

do_8tap_4v put, 3, 4
do_8tap_4v put, 4, 3
do_8tap_4v avg, 3, 4
do_8tap_4v avg, 4, 3


.macro do_8tap_v_func type, filter, offset, size
function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
        uxtw x4, w4
        movrel x5, X(ff_vp9_subpel_filters), 256*\offset
        cmp w6, #8
        add x6, x5, w6, uxtw #4
        mov x5, #\size
        .if \size >= 8
        b.ge \type\()_8tap_8v_34
        b \type\()_8tap_8v_43
        .else
        b.ge \type\()_8tap_4v_34
        b \type\()_8tap_4v_43
        .endif
endfunc
.endm

.macro do_8tap_v_filters size
        do_8tap_v_func put, regular, 1, \size
        do_8tap_v_func avg, regular, 1, \size
        do_8tap_v_func put, sharp, 2, \size
        do_8tap_v_func avg, sharp, 2, \size
        do_8tap_v_func put, smooth, 0, \size
        do_8tap_v_func avg, smooth, 0, \size
.endm

do_8tap_v_filters 64
do_8tap_v_filters 32
do_8tap_v_filters 16
do_8tap_v_filters 8
do_8tap_v_filters 4
@@ -0,0 +1,104 @@
/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"

#ifdef __ELF__
# define ELF
#else
# define ELF #
#endif

#if HAVE_AS_FUNC
# define FUNC
#else
# define FUNC #
#endif

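// On targets where they are not applicable, ELF and FUNC expand to the
// comment character '#', effectively dropping the prefixed ELF-only and
// .func/.endfunc directives from the assembled output.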
.macro function name, export=0, align=2
    .macro endfunc
ELF     .size \name, . - \name
FUNC    .endfunc
        .purgem endfunc
    .endm
        .text
        .align \align
    .if \export
        .global EXTERN_ASM\name
ELF     .type EXTERN_ASM\name, %function
FUNC    .func EXTERN_ASM\name
EXTERN_ASM\name:
    .else
ELF     .type \name, %function
FUNC    .func \name
\name:
    .endif
.endm

.macro const name, align=2, relocate=0
    .macro endconst
ELF     .size \name, . - \name
        .purgem endconst
    .endm
#if HAVE_SECTION_DATA_REL_RO
    .if \relocate
        .section .data.rel.ro
    .else
        .section .rodata
    .endif
#elif !defined(__MACH__)
        .section .rodata
#else
        .const_data
#endif
        .align \align
\name:
.endm

.macro movrel rd, val, offset=0
#if CONFIG_PIC && defined(__APPLE__)
    .if \offset < 0
        adrp \rd, \val@PAGE
        add \rd, \rd, \val@PAGEOFF
        sub \rd, \rd, -(\offset)
    .else
        adrp \rd, \val+(\offset)@PAGE
        add \rd, \rd, \val+(\offset)@PAGEOFF
    .endif
#elif CONFIG_PIC && defined(_WIN32)
    .if \offset < 0
        adrp \rd, \val
        add \rd, \rd, :lo12:\val
        sub \rd, \rd, -(\offset)
    .else
        adrp \rd, \val+(\offset)
        add \rd, \rd, :lo12:\val+(\offset)
    .endif
#elif CONFIG_PIC
        adrp \rd, \val+(\offset)
        add \rd, \rd, :lo12:\val+(\offset)
#else
        ldr \rd, =\val+\offset
#endif
.endm
|
||||
#define GLUE(a, b) a ## b
|
||||
#define JOIN(a, b) GLUE(a, b)
|
||||
#define X(s) JOIN(EXTERN_ASM, s)
|
|
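The preprocessor helpers at the end of this file exist so assembly can reference C symbols under the platform's name mangling. A small illustration of the token-pasting chain — the EXTERN_ASM value shown is a stand-in; configure defines the real one in config.h (empty on ELF, "_" on Mach-O):

/* Mock value for demonstration only. */
#define EXTERN_ASM _

#define GLUE(a, b) a ## b
#define JOIN(a, b) GLUE(a, b)
#define X(s) JOIN(EXTERN_ASM, s)

/* JOIN expands its arguments before GLUE pastes them, so EXTERN_ASM is
 * substituted first: X(ff_vp9_subpel_filters) -> _ff_vp9_subpel_filters,
 * matching the label that "function ..., export=1" emits via
 * EXTERN_ASM\name on that platform. */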
@@ -0,0 +1,51 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVUTIL_AARCH64_BSWAP_H
#define AVUTIL_AARCH64_BSWAP_H

#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"

#if HAVE_INLINE_ASM

#define av_bswap16 av_bswap16
static av_always_inline av_const unsigned av_bswap16(unsigned x)
{
    __asm__("rev16 %w0, %w0" : "+r"(x));
    return x;
}

#define av_bswap32 av_bswap32
static av_always_inline av_const uint32_t av_bswap32(uint32_t x)
{
    __asm__("rev %w0, %w0" : "+r"(x));
    return x;
}

#define av_bswap64 av_bswap64
static av_always_inline av_const uint64_t av_bswap64(uint64_t x)
{
    __asm__("rev %0, %0" : "+r"(x));
    return x;
}

#endif /* HAVE_INLINE_ASM */

#endif /* AVUTIL_AARCH64_BSWAP_H */
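A usage sketch for the byte-swap helpers above — a stand-alone test, assuming FFmpeg's include paths so that the generic libavutil/bswap.h pulls in this aarch64 override:

#include <assert.h>
#include "libavutil/bswap.h"

int main(void)
{
    /* rev16 swaps bytes within each 16-bit halfword; rev/rev %w0 swap
     * the whole 64-/32-bit register. */
    assert(av_bswap16(0x1122u)             == 0x2211u);
    assert(av_bswap32(0x11223344u)         == 0x44332211u);
    assert(av_bswap64(0x1122334455667788u) == 0x8877665544332211u);
    return 0;
}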
@@ -0,0 +1,38 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/cpu.h"
#include "libavutil/cpu_internal.h"
#include "config.h"

int ff_get_cpu_flags_aarch64(void)
{
    return AV_CPU_FLAG_ARMV8 * HAVE_ARMV8 |
           AV_CPU_FLAG_NEON  * HAVE_NEON  |
           AV_CPU_FLAG_VFP   * HAVE_VFP;
}

size_t ff_get_cpu_max_align_aarch64(void)
{
    int flags = av_get_cpu_flags();

    if (flags & AV_CPU_FLAG_NEON)
        return 16;

    return 8;
}
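Note the branch-free idiom in ff_get_cpu_flags_aarch64: configure emits HAVE_ARMV8/HAVE_NEON/HAVE_VFP as literal 0 or 1 in config.h, so multiplying selects each flag at compile time without any #ifdef. A minimal sketch of the same pattern, with hypothetical macro values:

#define HAVE_FEATURE        1       /* 0 or 1, as config.h would define it */
#define AV_CPU_FLAG_FEATURE 0x40    /* hypothetical flag bit */

/* Evaluates to 0x40 when the feature is built in, 0 otherwise. */
static const int example_flags = AV_CPU_FLAG_FEATURE * HAVE_FEATURE;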
@@ -0,0 +1,29 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVUTIL_AARCH64_CPU_H
#define AVUTIL_AARCH64_CPU_H

#include "libavutil/cpu.h"
#include "libavutil/cpu_internal.h"

#define have_armv8(flags) CPUEXT(flags, ARMV8)
#define have_neon(flags)  CPUEXT(flags, NEON)
#define have_vfp(flags)   CPUEXT(flags, VFP)

#endif /* AVUTIL_AARCH64_CPU_H */
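The have_*() checks combine a build-time switch with the run-time CPU flag. A simplified sketch of what CPUEXT (from cpu_internal.h) boils down to — the real macro is more general (it also supports suffixed variants), so treat this as an approximation:

/* Approximate expansion: true only if NEON was compiled in AND the
 * running CPU reports it. */
#define have_neon_sketch(flags) (HAVE_NEON && ((flags) & AV_CPU_FLAG_NEON))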
@@ -0,0 +1,69 @@
/*
 * ARM NEON optimised Float DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/float_dsp.h"
#include "cpu.h"

void ff_vector_fmul_neon(float *dst, const float *src0, const float *src1,
                         int len);

void ff_vector_fmac_scalar_neon(float *dst, const float *src, float mul,
                                int len);

void ff_vector_fmul_scalar_neon(float *dst, const float *src, float mul,
                                int len);

void ff_vector_dmul_scalar_neon(double *dst, const double *src, double mul,
                                int len);

void ff_vector_fmul_window_neon(float *dst, const float *src0,
                                const float *src1, const float *win, int len);

void ff_vector_fmul_add_neon(float *dst, const float *src0, const float *src1,
                             const float *src2, int len);

void ff_vector_fmul_reverse_neon(float *dst, const float *src0,
                                 const float *src1, int len);

void ff_butterflies_float_neon(float *v1, float *v2, int len);

float ff_scalarproduct_float_neon(const float *v1, const float *v2, int len);

av_cold void ff_float_dsp_init_aarch64(AVFloatDSPContext *fdsp)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_neon(cpu_flags)) {
        fdsp->butterflies_float   = ff_butterflies_float_neon;
        fdsp->scalarproduct_float = ff_scalarproduct_float_neon;
        fdsp->vector_dmul_scalar  = ff_vector_dmul_scalar_neon;
        fdsp->vector_fmul         = ff_vector_fmul_neon;
        fdsp->vector_fmac_scalar  = ff_vector_fmac_scalar_neon;
        fdsp->vector_fmul_add     = ff_vector_fmul_add_neon;
        fdsp->vector_fmul_reverse = ff_vector_fmul_reverse_neon;
        fdsp->vector_fmul_scalar  = ff_vector_fmul_scalar_neon;
        fdsp->vector_fmul_window  = ff_vector_fmul_window_neon;
    }
}
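A hedged usage sketch for the context initialised above: callers normally obtain an AVFloatDSPContext through avpriv_float_dsp_alloc() and invoke the function pointers, which this init routine has pointed at the NEON kernels when the CPU reports NEON. The NEON loops are unrolled, so lengths are expected to be suitable multiples (e.g. 16 floats for vector_fmul):

#include "libavutil/float_dsp.h"
#include "libavutil/mem.h"

static void multiply_arrays(float *dst, const float *a, const float *b)
{
    /* 0 = bit-exactness not required, so optimised versions may be used */
    AVFloatDSPContext *fdsp = avpriv_float_dsp_alloc(0);
    if (!fdsp)
        return;
    fdsp->vector_fmul(dst, a, b, 16);   /* dst[i] = a[i] * b[i], i < 16 */
    av_free(fdsp);
}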
@@ -0,0 +1,202 @@
/*
 * ARM NEON optimised Float DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

function ff_vector_fmul_neon, export=1
1:      subs            w3,  w3,  #16
        ld1             {v0.4S, v1.4S},   [x1], #32
        ld1             {v2.4S, v3.4S},   [x1], #32
        ld1             {v4.4S, v5.4S},   [x2], #32
        ld1             {v6.4S, v7.4S},   [x2], #32
        fmul            v16.4S, v0.4S,  v4.4S
        fmul            v17.4S, v1.4S,  v5.4S
        fmul            v18.4S, v2.4S,  v6.4S
        fmul            v19.4S, v3.4S,  v7.4S
        st1             {v16.4S, v17.4S}, [x0], #32
        st1             {v18.4S, v19.4S}, [x0], #32
        b.ne            1b
        ret
endfunc

function ff_vector_fmac_scalar_neon, export=1
        mov             x3,  #-32
1:      subs            w2,  w2,  #16
        ld1             {v16.4S, v17.4S}, [x0], #32
        ld1             {v18.4S, v19.4S}, [x0], x3
        ld1             {v4.4S, v5.4S},   [x1], #32
        ld1             {v6.4S, v7.4S},   [x1], #32
        fmla            v16.4S, v4.4S,  v0.S[0]
        fmla            v17.4S, v5.4S,  v0.S[0]
        fmla            v18.4S, v6.4S,  v0.S[0]
        fmla            v19.4S, v7.4S,  v0.S[0]
        st1             {v16.4S, v17.4S}, [x0], #32
        st1             {v18.4S, v19.4S}, [x0], #32
        b.ne            1b
        ret
endfunc

function ff_vector_fmul_scalar_neon, export=1
        mov             w4,  #15
        bics            w3,  w2,  w4
        dup             v16.4S, v0.S[0]
        b.eq            3f
        ld1             {v0.4S, v1.4S}, [x1], #32
1:      subs            w3,  w3,  #16
        fmul            v0.4S,  v0.4S,  v16.4S
        ld1             {v2.4S, v3.4S}, [x1], #32
        fmul            v1.4S,  v1.4S,  v16.4S
        fmul            v2.4S,  v2.4S,  v16.4S
        st1             {v0.4S, v1.4S}, [x0], #32
        fmul            v3.4S,  v3.4S,  v16.4S
        b.eq            2f
        ld1             {v0.4S, v1.4S}, [x1], #32
        st1             {v2.4S, v3.4S}, [x0], #32
        b               1b
2:      ands            w2,  w2,  #15
        st1             {v2.4S, v3.4S}, [x0], #32
        b.eq            4f
3:      ld1             {v0.4S}, [x1], #16
        fmul            v0.4S,  v0.4S,  v16.4S
        st1             {v0.4S}, [x0], #16
        subs            w2,  w2,  #4
        b.gt            3b
4:      ret
endfunc

function ff_vector_dmul_scalar_neon, export=1
        dup             v16.2D, v0.D[0]
        ld1             {v0.2D, v1.2D}, [x1], #32
1:      subs            w2,  w2,  #8
        fmul            v0.2D,  v0.2D,  v16.2D
        ld1             {v2.2D, v3.2D}, [x1], #32
        fmul            v1.2D,  v1.2D,  v16.2D
        fmul            v2.2D,  v2.2D,  v16.2D
        st1             {v0.2D, v1.2D}, [x0], #32
        fmul            v3.2D,  v3.2D,  v16.2D
        ld1             {v0.2D, v1.2D}, [x1], #32
        st1             {v2.2D, v3.2D}, [x0], #32
        b.gt            1b
        ret
endfunc

function ff_vector_fmul_window_neon, export=1
        sxtw            x4,  w4                 // len
        sub             x2,  x2,  #8
        sub             x5,  x4,  #2
        add             x2,  x2,  x5, lsl #2    // src1 + 4 * (len - 4)
        add             x6,  x3,  x5, lsl #3    // win  + 8 * (len - 2)
        add             x5,  x0,  x5, lsl #3    // dst  + 8 * (len - 2)
        mov             x7,  #-16
        ld1             {v0.4S}, [x1], #16      // s0
        ld1             {v2.4S}, [x3], #16      // wi
        ld1             {v1.4S}, [x2], x7       // s1
1:      ld1             {v3.4S}, [x6], x7       // wj
        subs            x4,  x4,  #4
        fmul            v17.4S, v0.4S,  v2.4S   // s0 * wi
        rev64           v4.4S,  v1.4S
        rev64           v5.4S,  v3.4S
        rev64           v17.4S, v17.4S
        ext             v4.16B,  v4.16B,  v4.16B,  #8   // s1_r
        ext             v5.16B,  v5.16B,  v5.16B,  #8   // wj_r
        ext             v17.16B, v17.16B, v17.16B, #8   // (s0 * wi)_rev
        fmul            v16.4S, v0.4S,  v5.4S   // s0 * wj_r
        fmla            v17.4S, v1.4S,  v3.4S   // (s0 * wi)_rev + s1 * wj
        b.eq            2f
        ld1             {v0.4S}, [x1], #16
        fmls            v16.4S, v4.4S,  v2.4S   // s0 * wj_r - s1_r * wi
        st1             {v17.4S}, [x5], x7
        ld1             {v2.4S}, [x3], #16
        ld1             {v1.4S}, [x2], x7
        st1             {v16.4S}, [x0], #16
        b               1b
2:
        fmls            v16.4S, v4.4S,  v2.4S   // s0 * wj_r - s1_r * wi
        st1             {v17.4S}, [x5], x7
        st1             {v16.4S}, [x0], #16
        ret
endfunc

function ff_vector_fmul_add_neon, export=1
        ld1             {v0.4S, v1.4S}, [x1], #32
        ld1             {v2.4S, v3.4S}, [x2], #32
        ld1             {v4.4S, v5.4S}, [x3], #32
1:      subs            w4,  w4,  #8
        fmla            v4.4S,  v0.4S,  v2.4S
        fmla            v5.4S,  v1.4S,  v3.4S
        b.eq            2f
        ld1             {v0.4S, v1.4S}, [x1], #32
        ld1             {v2.4S, v3.4S}, [x2], #32
        st1             {v4.4S, v5.4S}, [x0], #32
        ld1             {v4.4S, v5.4S}, [x3], #32
        b               1b
2:      st1             {v4.4S, v5.4S}, [x0], #32
        ret
endfunc

function ff_vector_fmul_reverse_neon, export=1
        sxtw            x3,  w3
        add             x2,  x2,  x3,  lsl #2
        sub             x2,  x2,  #32
        mov             x4,  #-32
        ld1             {v2.4S, v3.4S}, [x2], x4
        ld1             {v0.4S, v1.4S}, [x1], #32
1:      subs            x3,  x3,  #8
        rev64           v3.4S,  v3.4S
        rev64           v2.4S,  v2.4S
        ext             v3.16B, v3.16B, v3.16B, #8
        ext             v2.16B, v2.16B, v2.16B, #8
        fmul            v16.4S, v0.4S,  v3.4S
        fmul            v17.4S, v1.4S,  v2.4S
        b.eq            2f
        ld1             {v2.4S, v3.4S}, [x2], x4
        ld1             {v0.4S, v1.4S}, [x1], #32
        st1             {v16.4S, v17.4S}, [x0], #32
        b               1b
2:      st1             {v16.4S, v17.4S}, [x0], #32
        ret
endfunc

function ff_butterflies_float_neon, export=1
1:      ld1             {v0.4S}, [x0]
        ld1             {v1.4S}, [x1]
        subs            w2,  w2,  #4
        fsub            v2.4S,  v0.4S,  v1.4S
        fadd            v3.4S,  v0.4S,  v1.4S
        st1             {v2.4S}, [x1], #16
        st1             {v3.4S}, [x0], #16
        b.gt            1b
        ret
endfunc

function ff_scalarproduct_float_neon, export=1
        movi            v2.4S,  #0
1:      ld1             {v0.4S}, [x0], #16
        ld1             {v1.4S}, [x1], #16
        subs            w2,  w2,  #4
        fmla            v2.4S,  v0.4S,  v1.4S
        b.gt            1b
        faddp           v0.4S,  v2.4S,  v2.4S
        faddp           s0,     v0.2S
        ret
endfunc
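To make the rev64/ext shuffles in ff_vector_fmul_window_neon easier to follow, here is the scalar reference it mirrors (believed to match FFmpeg's generic C version, reproduced as an illustration): the kernel walks src0 and the first window half forward while walking src1 and the second half backward, producing the symmetric windowed overlap-add used by MDCT-based audio codecs.

static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    dst  += len;
    win  += len;
    src0 += len;
    for (int i = -len, j = len - 1; i < 0; i++, j--) {
        float s0 = src0[i], s1 = src1[j];
        float wi = win[i],  wj = win[j];
        dst[i] = s0 * wj - s1 * wi;   /* the fmls result, stored via x0 */
        dst[j] = s0 * wi + s1 * wj;   /* the fmla result, stored via x5 */
    }
}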
@@ -0,0 +1,44 @@
/*
 * Copyright (c) 2015 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVUTIL_AARCH64_TIMER_H
#define AVUTIL_AARCH64_TIMER_H

#include <stdint.h>
#include "config.h"

#if HAVE_INLINE_ASM

#define AV_READ_TIME read_time

static inline uint64_t read_time(void)
{
    uint64_t cycle_counter;
    __asm__ volatile("isb                   \t\n"
                     "mrs %0, pmccntr_el0   "
                     : "=r"(cycle_counter) :: "memory" );

    return cycle_counter;
}

#endif /* HAVE_INLINE_ASM */

#endif /* AVUTIL_AARCH64_TIMER_H */
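A usage sketch for AV_READ_TIME, the primitive that FFmpeg's timer-based benchmarking macros build on. One caveat: pmccntr_el0 is only readable from user space if the kernel has enabled EL0 access to the cycle counter; otherwise the read traps.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static void benchmark(void (*fn)(void))
{
    uint64_t t0 = AV_READ_TIME();   /* isb serialises, then mrs reads the counter */
    fn();
    uint64_t t1 = AV_READ_TIME();
    printf("%"PRIu64" cycles\n", t1 - t0);
}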