diff --git a/media/ffvpx/config_common.h b/media/ffvpx/config_common.h index e62c0223b347..bb30c9b0bfc7 100644 --- a/media/ffvpx/config_common.h +++ b/media/ffvpx/config_common.h @@ -11,4 +11,11 @@ #define HAVE_AVX2_EXTERNAL 0 #endif +#ifdef MOZ_LIBAV_FFT +#undef CONFIG_FFT +#undef CONFIG_RDFT +#define CONFIG_FFT 1 +#define CONFIG_RDFT 1 +#endif + #endif diff --git a/media/ffvpx/libavcodec/avcodec.symbols b/media/ffvpx/libavcodec/avcodec.symbols index 486d4a3e22e8..a66717deb4e8 100644 --- a/media/ffvpx/libavcodec/avcodec.symbols +++ b/media/ffvpx/libavcodec/avcodec.symbols @@ -53,6 +53,9 @@ av_parser_close av_parser_init av_parser_next av_parser_parse2 +av_rdft_calc +av_rdft_end +av_rdft_init av_register_codec_parser av_register_hwaccel av_shrink_packet diff --git a/media/ffvpx/libavcodec/avfft.c b/media/ffvpx/libavcodec/avfft.c new file mode 100644 index 000000000000..2200f37708f0 --- /dev/null +++ b/media/ffvpx/libavcodec/avfft.c @@ -0,0 +1,145 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/mem.h" +#include "avfft.h" +#include "fft.h" +#include "rdft.h" +#include "dct.h" + +/* FFT */ + +FFTContext *av_fft_init(int nbits, int inverse) +{ + FFTContext *s = av_mallocz(sizeof(*s)); + + if (s && ff_fft_init(s, nbits, inverse)) + av_freep(&s); + + return s; +} + +void av_fft_permute(FFTContext *s, FFTComplex *z) +{ + s->fft_permute(s, z); +} + +void av_fft_calc(FFTContext *s, FFTComplex *z) +{ + s->fft_calc(s, z); +} + +av_cold void av_fft_end(FFTContext *s) +{ + if (s) { + ff_fft_end(s); + av_free(s); + } +} + +#if CONFIG_MDCT + +FFTContext *av_mdct_init(int nbits, int inverse, double scale) +{ + FFTContext *s = av_malloc(sizeof(*s)); + + if (s && ff_mdct_init(s, nbits, inverse, scale)) + av_freep(&s); + + return s; +} + +void av_imdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input) +{ + s->imdct_calc(s, output, input); +} + +void av_imdct_half(FFTContext *s, FFTSample *output, const FFTSample *input) +{ + s->imdct_half(s, output, input); +} + +void av_mdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input) +{ + s->mdct_calc(s, output, input); +} + +av_cold void av_mdct_end(FFTContext *s) +{ + if (s) { + ff_mdct_end(s); + av_free(s); + } +} + +#endif /* CONFIG_MDCT */ + +#if CONFIG_RDFT + +RDFTContext *av_rdft_init(int nbits, enum RDFTransformType trans) +{ + RDFTContext *s = av_mallocz(sizeof(*s)); /* zeroed so ff_fft_init()'s fail path never av_freep()s an unassigned tmp_buf */ + + if (s && ff_rdft_init(s, nbits, trans)) + av_freep(&s); + + return s; +} + +void av_rdft_calc(RDFTContext *s, FFTSample *data) +{ + s->rdft_calc(s, data); +} + +av_cold void av_rdft_end(RDFTContext *s) +{ + if (s) { + ff_rdft_end(s); + av_free(s); + } +} + +#endif /* CONFIG_RDFT */ + +#if CONFIG_DCT + +DCTContext *av_dct_init(int nbits, enum 
DCTTransformType inverse) +{ + DCTContext *s = av_malloc(sizeof(*s)); + + if (s && ff_dct_init(s, nbits, inverse)) + av_freep(&s); + + return s; +} + +void av_dct_calc(DCTContext *s, FFTSample *data) +{ + s->dct_calc(s, data); +} + +av_cold void av_dct_end(DCTContext *s) +{ + if (s) { + ff_dct_end(s); + av_free(s); + } +} + +#endif /* CONFIG_DCT */ diff --git a/media/ffvpx/libavcodec/avfft.h b/media/ffvpx/libavcodec/avfft.h new file mode 100644 index 000000000000..0c0f9b8d8dae --- /dev/null +++ b/media/ffvpx/libavcodec/avfft.h @@ -0,0 +1,118 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_AVFFT_H +#define AVCODEC_AVFFT_H + +/** + * @file + * @ingroup lavc_fft + * FFT functions + */ + +/** + * @defgroup lavc_fft FFT functions + * @ingroup lavc_misc + * + * @{ + */ + +typedef float FFTSample; + +typedef struct FFTComplex { + FFTSample re, im; +} FFTComplex; + +typedef struct FFTContext FFTContext; + +/** + * Set up a complex FFT. + * @param nbits log2 of the length of the input array + * @param inverse if 0 perform the forward transform, if 1 perform the inverse + */ +FFTContext *av_fft_init(int nbits, int inverse); + +/** + * Do the permutation needed BEFORE calling ff_fft_calc(). 
+ */ +void av_fft_permute(FFTContext *s, FFTComplex *z); + +/** + * Do a complex FFT with the parameters defined in av_fft_init(). The + * input data must be permuted before. No 1.0/sqrt(n) normalization is done. + */ +void av_fft_calc(FFTContext *s, FFTComplex *z); + +void av_fft_end(FFTContext *s); + +FFTContext *av_mdct_init(int nbits, int inverse, double scale); +void av_imdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input); +void av_imdct_half(FFTContext *s, FFTSample *output, const FFTSample *input); +void av_mdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input); +void av_mdct_end(FFTContext *s); + +/* Real Discrete Fourier Transform */ + +enum RDFTransformType { + DFT_R2C, + IDFT_C2R, + IDFT_R2C, + DFT_C2R, +}; + +typedef struct RDFTContext RDFTContext; + +/** + * Set up a real FFT. + * @param nbits log2 of the length of the input array + * @param trans the type of transform + */ +RDFTContext *av_rdft_init(int nbits, enum RDFTransformType trans); +void av_rdft_calc(RDFTContext *s, FFTSample *data); +void av_rdft_end(RDFTContext *s); + +/* Discrete Cosine Transform */ + +typedef struct DCTContext DCTContext; + +enum DCTTransformType { + DCT_II = 0, + DCT_III, + DCT_I, + DST_I, +}; + +/** + * Set up DCT. 
+ * + * @param nbits size of the input array: + * (1 << nbits) for DCT-II, DCT-III and DST-I + * (1 << nbits) + 1 for DCT-I + * @param type the type of transform + * + * @note the first element of the input of DST-I is ignored + */ +DCTContext *av_dct_init(int nbits, enum DCTTransformType type); +void av_dct_calc(DCTContext *s, FFTSample *data); +void av_dct_end (DCTContext *s); + +/** + * @} + */ + +#endif /* AVCODEC_AVFFT_H */ diff --git a/media/ffvpx/libavcodec/dct.h b/media/ffvpx/libavcodec/dct.h new file mode 100644 index 000000000000..0a03e256d136 --- /dev/null +++ b/media/ffvpx/libavcodec/dct.h @@ -0,0 +1,69 @@ +/* + * (I)DCT Transforms + * Copyright (c) 2009 Peter Ross + * Copyright (c) 2010 Alex Converse + * Copyright (c) 2010 Vitor Sessak + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#if !defined(AVCODEC_DCT_H) && (!defined(FFT_FLOAT) || FFT_FLOAT) +#define AVCODEC_DCT_H + +#include +#include + +#include "rdft.h" + +struct DCTContext { + int nbits; + int inverse; + RDFTContext rdft; + const float *costab; + FFTSample *csc2; + void (*dct_calc)(struct DCTContext *s, FFTSample *data); + void (*dct32)(FFTSample *out, const FFTSample *in); +}; + +/** + * Set up DCT. 
+ * @param nbits size of the input array: + * (1 << nbits) for DCT-II, DCT-III and DST-I + * (1 << nbits) + 1 for DCT-I + * + * @note the first element of the input of DST-I is ignored + */ +int ff_dct_init(DCTContext *s, int nbits, enum DCTTransformType type); +void ff_dct_end (DCTContext *s); + +void ff_dct_init_x86(DCTContext *s); + +void ff_fdct_ifast(int16_t *data); +void ff_fdct_ifast248(int16_t *data); +void ff_jpeg_fdct_islow_8(int16_t *data); +void ff_jpeg_fdct_islow_10(int16_t *data); +void ff_fdct248_islow_8(int16_t *data); +void ff_fdct248_islow_10(int16_t *data); + +void ff_j_rev_dct(int16_t *data); +void ff_j_rev_dct4(int16_t *data); +void ff_j_rev_dct2(int16_t *data); +void ff_j_rev_dct1(int16_t *data); +void ff_jref_idct_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block); +void ff_jref_idct_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block); + +#endif /* AVCODEC_DCT_H */ diff --git a/media/ffvpx/libavcodec/dummy_funcs.c b/media/ffvpx/libavcodec/dummy_funcs.c index 9f2cff784050..d2d2d1720333 100644 --- a/media/ffvpx/libavcodec/dummy_funcs.c +++ b/media/ffvpx/libavcodec/dummy_funcs.c @@ -6,7 +6,9 @@ #include "avcodec.h" +typedef struct FFTContext FFTContext; typedef struct H264PredContext H264PredContext; +typedef struct RDFTContext RDFTContext; typedef struct VideoDSPContext VideoDSPContext; typedef struct VP8DSPContext VP8DSPContext; typedef struct VP9DSPContext VP9DSPContext; @@ -856,6 +858,11 @@ AVBitStreamFilter ff_noise_bsf; AVBitStreamFilter ff_remove_extradata_bsf; AVBitStreamFilter ff_text2movsub_bsf; +void ff_fft_init_aarch64(FFTContext *s) {} +void ff_fft_init_arm(FFTContext *s) {} +void ff_fft_init_mips(FFTContext *s) {} +void ff_fft_init_ppc(FFTContext *s) {} +void ff_rdft_init_arm(RDFTContext *s) {} void ff_h264_pred_init_aarch64(H264PredContext *h, int codec_id, const int bit_depth, const int chroma_format_idc) {} diff --git a/media/ffvpx/libavcodec/fft-internal.h b/media/ffvpx/libavcodec/fft-internal.h new file mode 100644 
index 000000000000..0a8f7d05cf8d --- /dev/null +++ b/media/ffvpx/libavcodec/fft-internal.h @@ -0,0 +1,94 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_FFT_INTERNAL_H +#define AVCODEC_FFT_INTERNAL_H + +#if FFT_FLOAT + +#define FIX15(v) (v) +#define sqrthalf (float)M_SQRT1_2 + +#define BF(x, y, a, b) do { \ + x = a - b; \ + y = a + b; \ + } while (0) + +#define CMUL(dre, dim, are, aim, bre, bim) do { \ + (dre) = (are) * (bre) - (aim) * (bim); \ + (dim) = (are) * (bim) + (aim) * (bre); \ + } while (0) + +#else + +#define SCALE_FLOAT(a, bits) lrint((a) * (double)(1 << (bits))) + +#if FFT_FIXED_32 + +#define CMUL(dre, dim, are, aim, bre, bim) do { \ + int64_t accu; \ + (accu) = (int64_t)(bre) * (are); \ + (accu) -= (int64_t)(bim) * (aim); \ + (dre) = (int)(((accu) + 0x40000000) >> 31); \ + (accu) = (int64_t)(bre) * (aim); \ + (accu) += (int64_t)(bim) * (are); \ + (dim) = (int)(((accu) + 0x40000000) >> 31); \ + } while (0) + +#define FIX15(a) av_clip(SCALE_FLOAT(a, 31), -2147483647, 2147483647) + +#else /* FFT_FIXED_32 */ + +#include "fft.h" +#include "mathops.h" + +void ff_mdct_calcw_c(FFTContext *s, FFTDouble *output, const FFTSample *input); + +#define FIX15(a) av_clip(SCALE_FLOAT(a, 15), -32767, 32767) + +#define sqrthalf 
((int16_t)((1<<15)*M_SQRT1_2)) + +#define BF(x, y, a, b) do { \ + x = (a - b) >> 1; \ + y = (a + b) >> 1; \ + } while (0) + +#define CMULS(dre, dim, are, aim, bre, bim, sh) do { \ + (dre) = (MUL16(are, bre) - MUL16(aim, bim)) >> sh; \ + (dim) = (MUL16(are, bim) + MUL16(aim, bre)) >> sh; \ + } while (0) + +#define CMUL(dre, dim, are, aim, bre, bim) \ + CMULS(dre, dim, are, aim, bre, bim, 15) + +#define CMULL(dre, dim, are, aim, bre, bim) \ + CMULS(dre, dim, are, aim, bre, bim, 0) + +#endif /* FFT_FIXED_32 */ + +#endif /* FFT_FLOAT */ + +#define ff_imdct_calc_c FFT_NAME(ff_imdct_calc_c) +#define ff_imdct_half_c FFT_NAME(ff_imdct_half_c) +#define ff_mdct_calc_c FFT_NAME(ff_mdct_calc_c) + +void ff_imdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_imdct_half_c(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_mdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input); + +#endif /* AVCODEC_FFT_INTERNAL_H */ diff --git a/media/ffvpx/libavcodec/fft.h b/media/ffvpx/libavcodec/fft.h new file mode 100644 index 000000000000..c858570a21c0 --- /dev/null +++ b/media/ffvpx/libavcodec/fft.h @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2000, 2001, 2002 Fabrice Bellard + * Copyright (c) 2002-2004 Michael Niedermayer + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_FFT_H +#define AVCODEC_FFT_H + +#ifndef FFT_FLOAT +#define FFT_FLOAT 1 +#endif + +#ifndef FFT_FIXED_32 +#define FFT_FIXED_32 0 +#endif + +#include +#include "config.h" +#include "libavutil/mem.h" + +#if FFT_FLOAT + +#include "avfft.h" + +#define FFT_NAME(x) x + +typedef float FFTDouble; + +#else + +#if FFT_FIXED_32 + +#define Q31(x) (int)((x)*2147483648.0 + 0.5) +#define FFT_NAME(x) x ## _fixed_32 + +typedef int32_t FFTSample; + +#else /* FFT_FIXED_32 */ + +#define FFT_NAME(x) x ## _fixed + +typedef int16_t FFTSample; + +#endif /* FFT_FIXED_32 */ + +typedef struct FFTComplex { + FFTSample re, im; +} FFTComplex; + +typedef int FFTDouble; +typedef struct FFTContext FFTContext; + +#endif /* FFT_FLOAT */ + +typedef struct FFTDComplex { + FFTDouble re, im; +} FFTDComplex; + +/* FFT computation */ + +enum fft_permutation_type { + FF_FFT_PERM_DEFAULT, + FF_FFT_PERM_SWAP_LSBS, + FF_FFT_PERM_AVX, +}; + +enum mdct_permutation_type { + FF_MDCT_PERM_NONE, + FF_MDCT_PERM_INTERLEAVE, +}; + +struct FFTContext { + int nbits; + int inverse; + uint16_t *revtab; + FFTComplex *tmp_buf; + int mdct_size; /* size of MDCT (i.e. number of input data * 2) */ + int mdct_bits; /* n = 2^nbits */ + /* pre/post rotation tables */ + FFTSample *tcos; + FFTSample *tsin; + /** + * Do the permutation needed BEFORE calling fft_calc(). + */ + void (*fft_permute)(struct FFTContext *s, FFTComplex *z); + /** + * Do a complex FFT with the parameters defined in ff_fft_init(). The + * input data must be permuted before. No 1.0/sqrt(n) normalization is done. 
+ */ + void (*fft_calc)(struct FFTContext *s, FFTComplex *z); + void (*imdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input); + void (*imdct_half)(struct FFTContext *s, FFTSample *output, const FFTSample *input); + void (*mdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input); + void (*mdct_calcw)(struct FFTContext *s, FFTDouble *output, const FFTSample *input); + enum fft_permutation_type fft_permutation; + enum mdct_permutation_type mdct_permutation; + uint32_t *revtab32; +}; + +#if CONFIG_HARDCODED_TABLES +#define COSTABLE_CONST const +#else +#define COSTABLE_CONST +#endif + +#define COSTABLE(size) \ + COSTABLE_CONST DECLARE_ALIGNED(32, FFTSample, FFT_NAME(ff_cos_##size))[size/2] + +extern COSTABLE(16); +extern COSTABLE(32); +extern COSTABLE(64); +extern COSTABLE(128); +extern COSTABLE(256); +extern COSTABLE(512); +extern COSTABLE(1024); +extern COSTABLE(2048); +extern COSTABLE(4096); +extern COSTABLE(8192); +extern COSTABLE(16384); +extern COSTABLE(32768); +extern COSTABLE(65536); +extern COSTABLE(131072); +extern COSTABLE_CONST FFTSample* const FFT_NAME(ff_cos_tabs)[18]; + +#define ff_init_ff_cos_tabs FFT_NAME(ff_init_ff_cos_tabs) + +/** + * Initialize the cosine table in ff_cos_tabs[index] + * @param index index in ff_cos_tabs array of the table to initialize + */ +void ff_init_ff_cos_tabs(int index); + +#define ff_fft_init FFT_NAME(ff_fft_init) +#define ff_fft_end FFT_NAME(ff_fft_end) + +/** + * Set up a complex FFT. 
+ * @param nbits log2 of the length of the input array + * @param inverse if 0 perform the forward transform, if 1 perform the inverse + */ +int ff_fft_init(FFTContext *s, int nbits, int inverse); + +void ff_fft_init_aarch64(FFTContext *s); +void ff_fft_init_x86(FFTContext *s); +void ff_fft_init_arm(FFTContext *s); +void ff_fft_init_mips(FFTContext *s); +void ff_fft_init_ppc(FFTContext *s); + +void ff_fft_fixed_init_arm(FFTContext *s); + +void ff_fft_end(FFTContext *s); + +#define ff_mdct_init FFT_NAME(ff_mdct_init) +#define ff_mdct_end FFT_NAME(ff_mdct_end) + +int ff_mdct_init(FFTContext *s, int nbits, int inverse, double scale); +void ff_mdct_end(FFTContext *s); + +#endif /* AVCODEC_FFT_H */ diff --git a/media/ffvpx/libavcodec/fft_float.c b/media/ffvpx/libavcodec/fft_float.c new file mode 100644 index 000000000000..73cc98d0d4b1 --- /dev/null +++ b/media/ffvpx/libavcodec/fft_float.c @@ -0,0 +1,21 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define FFT_FLOAT 1 +#define FFT_FIXED_32 0 +#include "fft_template.c" diff --git a/media/ffvpx/libavcodec/fft_template.c b/media/ffvpx/libavcodec/fft_template.c new file mode 100644 index 000000000000..6c77854e411b --- /dev/null +++ b/media/ffvpx/libavcodec/fft_template.c @@ -0,0 +1,613 @@ +/* + * FFT/IFFT transforms + * Copyright (c) 2008 Loren Merritt + * Copyright (c) 2002 Fabrice Bellard + * Partly based on libdjbfft by D. J. Bernstein + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * FFT/IFFT transforms. 
+ */ + +#include +#include +#include "libavutil/mathematics.h" +#include "libavutil/thread.h" +#include "fft.h" +#include "fft-internal.h" + +#if FFT_FIXED_32 +#include "fft_table.h" + +static void av_cold fft_lut_init(void) +{ + int n = 0; + ff_fft_lut_init(ff_fft_offsets_lut, 0, 1 << 17, &n); +} + +#else /* FFT_FIXED_32 */ + +/* cos(2*pi*x/n) for 0<=x<=n/4, followed by its reverse */ +#if !CONFIG_HARDCODED_TABLES +COSTABLE(16); +COSTABLE(32); +COSTABLE(64); +COSTABLE(128); +COSTABLE(256); +COSTABLE(512); +COSTABLE(1024); +COSTABLE(2048); +COSTABLE(4096); +COSTABLE(8192); +COSTABLE(16384); +COSTABLE(32768); +COSTABLE(65536); +COSTABLE(131072); + +static av_cold void init_ff_cos_tabs(int index) +{ + int i; + int m = 1<> 1; + if(!(i&m)) return split_radix_permutation(i, m, inverse)*2; + m >>= 1; + if(inverse == !(i&m)) return split_radix_permutation(i, m, inverse)*4 + 1; + else return split_radix_permutation(i, m, inverse)*4 - 1; +} + +av_cold void ff_init_ff_cos_tabs(int index) +{ +#if (!CONFIG_HARDCODED_TABLES) && (!FFT_FIXED_32) + ff_thread_once(&cos_tabs_init_once[index].control, cos_tabs_init_once[index].func); +#endif +} + +static const int avx_tab[] = { + 0, 4, 1, 5, 8, 12, 9, 13, 2, 6, 3, 7, 10, 14, 11, 15 +}; + +static int is_second_half_of_fft32(int i, int n) +{ + if (n <= 32) + return i >= 16; + else if (i < n/2) + return is_second_half_of_fft32(i, n/2); + else if (i < 3*n/4) + return is_second_half_of_fft32(i - n/2, n/4); + else + return is_second_half_of_fft32(i - 3*n/4, n/4); +} + +static av_cold void fft_perm_avx(FFTContext *s) +{ + int i; + int n = 1 << s->nbits; + + for (i = 0; i < n; i += 16) { + int k; + if (is_second_half_of_fft32(i, n)) { + for (k = 0; k < 16; k++) + s->revtab[-split_radix_permutation(i + k, n, s->inverse) & (n - 1)] = + i + avx_tab[k]; + + } else { + for (k = 0; k < 16; k++) { + int j = i + k; + j = (j & ~7) | ((j >> 1) & 3) | ((j << 2) & 4); + s->revtab[-split_radix_permutation(i + k, n, s->inverse) & (n - 1)] = j; + } + } + } 
+} + +av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse) +{ + int i, j, n; + + s->revtab = NULL; + s->revtab32 = NULL; + + if (nbits < 2 || nbits > 17) + goto fail; + s->nbits = nbits; + n = 1 << nbits; + + if (nbits <= 16) { + s->revtab = av_malloc(n * sizeof(uint16_t)); + if (!s->revtab) + goto fail; + } else { + s->revtab32 = av_malloc(n * sizeof(uint32_t)); + if (!s->revtab32) + goto fail; + } + s->tmp_buf = av_malloc(n * sizeof(FFTComplex)); + if (!s->tmp_buf) + goto fail; + s->inverse = inverse; + s->fft_permutation = FF_FFT_PERM_DEFAULT; + + s->fft_permute = fft_permute_c; + s->fft_calc = fft_calc_c; +#if CONFIG_MDCT + s->imdct_calc = ff_imdct_calc_c; + s->imdct_half = ff_imdct_half_c; + s->mdct_calc = ff_mdct_calc_c; +#endif + +#if FFT_FIXED_32 + { + static AVOnce control = AV_ONCE_INIT; + ff_thread_once(&control, fft_lut_init); + } +#else /* FFT_FIXED_32 */ +#if FFT_FLOAT + if (ARCH_AARCH64) ff_fft_init_aarch64(s); + if (ARCH_ARM) ff_fft_init_arm(s); + if (ARCH_PPC) ff_fft_init_ppc(s); + if (ARCH_X86) ff_fft_init_x86(s); + if (CONFIG_MDCT) s->mdct_calcw = s->mdct_calc; + if (HAVE_MIPSFPU) ff_fft_init_mips(s); +#else + if (CONFIG_MDCT) s->mdct_calcw = ff_mdct_calcw_c; + if (ARCH_ARM) ff_fft_fixed_init_arm(s); +#endif + for(j=4; j<=nbits; j++) { + ff_init_ff_cos_tabs(j); + } +#endif /* FFT_FIXED_32 */ + + + if (s->fft_permutation == FF_FFT_PERM_AVX) { + fft_perm_avx(s); + } else { + for(i=0; ifft_permutation == FF_FFT_PERM_SWAP_LSBS) + j = (j&~3) | ((j>>1)&1) | ((j<<1)&2); + k = -split_radix_permutation(i, n, s->inverse) & (n-1); + if (s->revtab) + s->revtab[k] = j; + if (s->revtab32) + s->revtab32[k] = j; + } + } + + return 0; + fail: + av_freep(&s->revtab); + av_freep(&s->revtab32); + av_freep(&s->tmp_buf); + return -1; +} + +static void fft_permute_c(FFTContext *s, FFTComplex *z) +{ + int j, np; + const uint16_t *revtab = s->revtab; + const uint32_t *revtab32 = s->revtab32; + np = 1 << s->nbits; + /* TODO: handle split-radix permute in a more 
optimal way, probably in-place */ + if (revtab) { + for(j=0;jtmp_buf[revtab[j]] = z[j]; + } else + for(j=0;jtmp_buf[revtab32[j]] = z[j]; + + memcpy(z, s->tmp_buf, np * sizeof(FFTComplex)); +} + +av_cold void ff_fft_end(FFTContext *s) +{ + av_freep(&s->revtab); + av_freep(&s->revtab32); + av_freep(&s->tmp_buf); +} + +#if FFT_FIXED_32 + +static void fft_calc_c(FFTContext *s, FFTComplex *z) { + + int nbits, i, n, num_transforms, offset, step; + int n4, n2, n34; + unsigned tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + FFTComplex *tmpz; + const int fft_size = (1 << s->nbits); + int64_t accu; + + num_transforms = (0x2aab >> (16 - s->nbits)) | 1; + + for (n=0; n> 1) | 1; + + for (n=0; n> 31); + accu = (int64_t)Q31(M_SQRT1_2)*(int)(tmp3 - tmp4); + tmp7 = (int32_t)((accu + 0x40000000) >> 31); + accu = (int64_t)Q31(M_SQRT1_2)*(int)(tmp2 - tmp1); + tmp6 = (int32_t)((accu + 0x40000000) >> 31); + accu = (int64_t)Q31(M_SQRT1_2)*(int)(tmp3 + tmp4); + tmp8 = (int32_t)((accu + 0x40000000) >> 31); + tmp1 = tmp5 + tmp7; + tmp3 = tmp5 - tmp7; + tmp2 = tmp6 + tmp8; + tmp4 = tmp6 - tmp8; + + tmpz[5].re = tmpz[1].re - tmp1; + tmpz[1].re = tmpz[1].re + tmp1; + tmpz[5].im = tmpz[1].im - tmp2; + tmpz[1].im = tmpz[1].im + tmp2; + tmpz[7].re = tmpz[3].re - tmp4; + tmpz[3].re = tmpz[3].re + tmp4; + tmpz[7].im = tmpz[3].im + tmp3; + tmpz[3].im = tmpz[3].im - tmp3; + } + + step = 1 << ((MAX_LOG2_NFFT-4) - 4); + n4 = 4; + + for (nbits=4; nbits<=s->nbits; nbits++){ + n2 = 2*n4; + n34 = 3*n4; + num_transforms = (num_transforms >> 1) | 1; + + for (n=0; n> 31); + accu = (int64_t)w_re*tmpz[ n2+i].im; + accu -= (int64_t)w_im*tmpz[ n2+i].re; + tmp2 = (int32_t)((accu + 0x40000000) >> 31); + accu = (int64_t)w_re*tmpz[n34+i].re; + accu -= (int64_t)w_im*tmpz[n34+i].im; + tmp3 = (int32_t)((accu + 0x40000000) >> 31); + accu = (int64_t)w_re*tmpz[n34+i].im; + accu += (int64_t)w_im*tmpz[n34+i].re; + tmp4 = (int32_t)((accu + 0x40000000) >> 31); + + tmp5 = tmp1 + tmp3; + tmp1 = tmp1 - tmp3; + tmp6 = tmp2 + 
tmp4; + tmp2 = tmp2 - tmp4; + + tmpz[ n2+i].re = tmpz[ i].re - tmp5; + tmpz[ i].re = tmpz[ i].re + tmp5; + tmpz[ n2+i].im = tmpz[ i].im - tmp6; + tmpz[ i].im = tmpz[ i].im + tmp6; + tmpz[n34+i].re = tmpz[n4+i].re - tmp2; + tmpz[ n4+i].re = tmpz[n4+i].re + tmp2; + tmpz[n34+i].im = tmpz[n4+i].im + tmp1; + tmpz[ n4+i].im = tmpz[n4+i].im - tmp1; + + w_re_ptr += step; + w_im_ptr -= step; + } + } + step >>= 1; + n4 <<= 1; + } +} + +#else /* FFT_FIXED_32 */ + +#define BUTTERFLIES(a0,a1,a2,a3) {\ + BF(t3, t5, t5, t1);\ + BF(a2.re, a0.re, a0.re, t5);\ + BF(a3.im, a1.im, a1.im, t3);\ + BF(t4, t6, t2, t6);\ + BF(a3.re, a1.re, a1.re, t4);\ + BF(a2.im, a0.im, a0.im, t6);\ +} + +// force loading all the inputs before storing any. +// this is slightly slower for small data, but avoids store->load aliasing +// for addresses separated by large powers of 2. +#define BUTTERFLIES_BIG(a0,a1,a2,a3) {\ + FFTSample r0=a0.re, i0=a0.im, r1=a1.re, i1=a1.im;\ + BF(t3, t5, t5, t1);\ + BF(a2.re, a0.re, r0, t5);\ + BF(a3.im, a1.im, i1, t3);\ + BF(t4, t6, t2, t6);\ + BF(a3.re, a1.re, r1, t4);\ + BF(a2.im, a0.im, i0, t6);\ +} + +#define TRANSFORM(a0,a1,a2,a3,wre,wim) {\ + CMUL(t1, t2, a2.re, a2.im, wre, -wim);\ + CMUL(t5, t6, a3.re, a3.im, wre, wim);\ + BUTTERFLIES(a0,a1,a2,a3)\ +} + +#define TRANSFORM_ZERO(a0,a1,a2,a3) {\ + t1 = a2.re;\ + t2 = a2.im;\ + t5 = a3.re;\ + t6 = a3.im;\ + BUTTERFLIES(a0,a1,a2,a3)\ +} + +/* z[0...8n-1], w[1...2n-1] */ +#define PASS(name)\ +static void name(FFTComplex *z, const FFTSample *wre, unsigned int n)\ +{\ + FFTDouble t1, t2, t3, t4, t5, t6;\ + int o1 = 2*n;\ + int o2 = 4*n;\ + int o3 = 6*n;\ + const FFTSample *wim = wre+o1;\ + n--;\ +\ + TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3]);\ + TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\ + do {\ + z += 2;\ + wre += 2;\ + wim -= 2;\ + TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0]);\ + TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\ + } while(--n);\ +} + +PASS(pass) +#undef BUTTERFLIES +#define BUTTERFLIES 
BUTTERFLIES_BIG +PASS(pass_big) + +#define DECL_FFT(n,n2,n4)\ +static void fft##n(FFTComplex *z)\ +{\ + fft##n2(z);\ + fft##n4(z+n4*2);\ + fft##n4(z+n4*3);\ + pass(z,FFT_NAME(ff_cos_##n),n4/2);\ +} + +static void fft4(FFTComplex *z) +{ + FFTDouble t1, t2, t3, t4, t5, t6, t7, t8; + + BF(t3, t1, z[0].re, z[1].re); + BF(t8, t6, z[3].re, z[2].re); + BF(z[2].re, z[0].re, t1, t6); + BF(t4, t2, z[0].im, z[1].im); + BF(t7, t5, z[2].im, z[3].im); + BF(z[3].im, z[1].im, t4, t8); + BF(z[3].re, z[1].re, t3, t7); + BF(z[2].im, z[0].im, t2, t5); +} + +static void fft8(FFTComplex *z) +{ + FFTDouble t1, t2, t3, t4, t5, t6; + + fft4(z); + + BF(t1, z[5].re, z[4].re, -z[5].re); + BF(t2, z[5].im, z[4].im, -z[5].im); + BF(t5, z[7].re, z[6].re, -z[7].re); + BF(t6, z[7].im, z[6].im, -z[7].im); + + BUTTERFLIES(z[0],z[2],z[4],z[6]); + TRANSFORM(z[1],z[3],z[5],z[7],sqrthalf,sqrthalf); +} + +#if !CONFIG_SMALL +static void fft16(FFTComplex *z) +{ + FFTDouble t1, t2, t3, t4, t5, t6; + FFTSample cos_16_1 = FFT_NAME(ff_cos_16)[1]; + FFTSample cos_16_3 = FFT_NAME(ff_cos_16)[3]; + + fft8(z); + fft4(z+8); + fft4(z+12); + + TRANSFORM_ZERO(z[0],z[4],z[8],z[12]); + TRANSFORM(z[2],z[6],z[10],z[14],sqrthalf,sqrthalf); + TRANSFORM(z[1],z[5],z[9],z[13],cos_16_1,cos_16_3); + TRANSFORM(z[3],z[7],z[11],z[15],cos_16_3,cos_16_1); +} +#else +DECL_FFT(16,8,4) +#endif +DECL_FFT(32,16,8) +DECL_FFT(64,32,16) +DECL_FFT(128,64,32) +DECL_FFT(256,128,64) +DECL_FFT(512,256,128) +#if !CONFIG_SMALL +#define pass pass_big +#endif +DECL_FFT(1024,512,256) +DECL_FFT(2048,1024,512) +DECL_FFT(4096,2048,1024) +DECL_FFT(8192,4096,2048) +DECL_FFT(16384,8192,4096) +DECL_FFT(32768,16384,8192) +DECL_FFT(65536,32768,16384) +DECL_FFT(131072,65536,32768) + +static void (* const fft_dispatch[])(FFTComplex*) = { + fft4, fft8, fft16, fft32, fft64, fft128, fft256, fft512, fft1024, + fft2048, fft4096, fft8192, fft16384, fft32768, fft65536, fft131072 +}; + +static void fft_calc_c(FFTContext *s, FFTComplex *z) +{ + fft_dispatch[s->nbits-2](z); 
+} +#endif /* FFT_FIXED_32 */ diff --git a/media/ffvpx/libavcodec/moz.build b/media/ffvpx/libavcodec/moz.build index ad457d18441d..0982d4480bee 100644 --- a/media/ffvpx/libavcodec/moz.build +++ b/media/ffvpx/libavcodec/moz.build @@ -72,6 +72,13 @@ if not CONFIG['MOZ_FFVPX_FLACONLY']: 'vp9recon.c' ] +if CONFIG['MOZ_LIBAV_FFT']: + SOURCES += [ + 'avfft.c', + 'fft_float.c', + 'rdft.c', + ] + SYMBOLS_FILE = 'avcodec.symbols' NoVisibilityFlags() diff --git a/media/ffvpx/libavcodec/rdft.c b/media/ffvpx/libavcodec/rdft.c new file mode 100644 index 000000000000..6ba748423853 --- /dev/null +++ b/media/ffvpx/libavcodec/rdft.c @@ -0,0 +1,117 @@ +/* + * (I)RDFT transforms + * Copyright (c) 2009 Alex Converse + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include +#include +#include "libavutil/mathematics.h" +#include "rdft.h" + +/** + * @file + * (Inverse) Real Discrete Fourier Transforms. + */ + +/** Map one real FFT into two parallel real even and odd FFTs. Then interleave + * the two real FFTs into one complex FFT. Unmangle the results. 
+ * ref: http://www.engineeringproductivitytools.com/stuff/T0001/PT10.HTM + */ +static void rdft_calc_c(RDFTContext *s, FFTSample *data) +{ + int i, i1, i2; + FFTComplex ev, od, odsum; + const int n = 1 << s->nbits; + const float k1 = 0.5; + const float k2 = 0.5 - s->inverse; + const FFTSample *tcos = s->tcos; + const FFTSample *tsin = s->tsin; + + if (!s->inverse) { + s->fft.fft_permute(&s->fft, (FFTComplex*)data); + s->fft.fft_calc(&s->fft, (FFTComplex*)data); + } + /* i=0 is a special case because of packing, the DC term is real, so we + are going to throw the N/2 term (also real) in with it. */ + ev.re = data[0]; + data[0] = ev.re+data[1]; + data[1] = ev.re-data[1]; + +#define RDFT_UNMANGLE(sign0, sign1) \ + for (i = 1; i < (n>>2); i++) { \ + i1 = 2*i; \ + i2 = n-i1; \ + /* Separate even and odd FFTs */ \ + ev.re = k1*(data[i1 ]+data[i2 ]); \ + od.im = k2*(data[i2 ]-data[i1 ]); \ + ev.im = k1*(data[i1+1]-data[i2+1]); \ + od.re = k2*(data[i1+1]+data[i2+1]); \ + /* Apply twiddle factors to the odd FFT and add to the even FFT */ \ + odsum.re = od.re*tcos[i] sign0 od.im*tsin[i]; \ + odsum.im = od.im*tcos[i] sign1 od.re*tsin[i]; \ + data[i1 ] = ev.re + odsum.re; \ + data[i1+1] = ev.im + odsum.im; \ + data[i2 ] = ev.re - odsum.re; \ + data[i2+1] = odsum.im - ev.im; \ + } + + if (s->negative_sin) { + RDFT_UNMANGLE(+,-) + } else { + RDFT_UNMANGLE(-,+) + } + + data[2*i+1]=s->sign_convention*data[2*i+1]; + if (s->inverse) { + data[0] *= k1; + data[1] *= k1; + s->fft.fft_permute(&s->fft, (FFTComplex*)data); + s->fft.fft_calc(&s->fft, (FFTComplex*)data); + } +} + +av_cold int ff_rdft_init(RDFTContext *s, int nbits, enum RDFTransformType trans) +{ + int n = 1 << nbits; + int ret; + + s->nbits = nbits; + s->inverse = trans == IDFT_C2R || trans == DFT_C2R; + s->sign_convention = trans == IDFT_R2C || trans == DFT_C2R ? 
1 : -1; + s->negative_sin = trans == DFT_C2R || trans == DFT_R2C; + + if (nbits < 4 || nbits > 16) + return AVERROR(EINVAL); + + if ((ret = ff_fft_init(&s->fft, nbits-1, trans == IDFT_C2R || trans == IDFT_R2C)) < 0) + return ret; + + ff_init_ff_cos_tabs(nbits); + s->tcos = ff_cos_tabs[nbits]; + s->tsin = ff_cos_tabs[nbits] + (n >> 2); + s->rdft_calc = rdft_calc_c; + + if (ARCH_ARM) ff_rdft_init_arm(s); + + return 0; +} + +av_cold void ff_rdft_end(RDFTContext *s) +{ + ff_fft_end(&s->fft); +} diff --git a/media/ffvpx/libavcodec/rdft.h b/media/ffvpx/libavcodec/rdft.h new file mode 100644 index 000000000000..ffafca7f2427 --- /dev/null +++ b/media/ffvpx/libavcodec/rdft.h @@ -0,0 +1,52 @@ +/* + * (I)RDFT transforms + * Copyright (c) 2009 Alex Converse + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#if !defined(AVCODEC_RDFT_H) && (!defined(FFT_FLOAT) || FFT_FLOAT) +#define AVCODEC_RDFT_H + +#include "config.h" +#include "fft.h" + +struct RDFTContext { + int nbits; + int inverse; + int sign_convention; + + /* pre/post rotation tables */ + const FFTSample *tcos; + const FFTSample *tsin; + int negative_sin; + FFTContext fft; + void (*rdft_calc)(struct RDFTContext *s, FFTSample *z); +}; + +/** + * Set up a real FFT. 
+ * @param nbits log2 of the length of the input array + * @param trans the type of transform + */ +int ff_rdft_init(RDFTContext *s, int nbits, enum RDFTransformType trans); +void ff_rdft_end(RDFTContext *s); + +void ff_rdft_init_arm(RDFTContext *s); + + +#endif /* AVCODEC_RDFT_H */ diff --git a/media/ffvpx/libavcodec/x86/fft.asm b/media/ffvpx/libavcodec/x86/fft.asm new file mode 100644 index 000000000000..cdbfd66e821f --- /dev/null +++ b/media/ffvpx/libavcodec/x86/fft.asm @@ -0,0 +1,1093 @@ +;****************************************************************************** +;* FFT transform with SSE/3DNow optimizations +;* Copyright (c) 2008 Loren Merritt +;* Copyright (c) 2011 Vitor Sessak +;* +;* This algorithm (though not any of the implementation details) is +;* based on libdjbfft by D. J. Bernstein. +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +; These functions are not individually interchangeable with the C versions. +; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results +; in blocks as convenient to the vector size. +; i.e. 
{4x real, 4x imaginary, 4x real, ...} (or 2x respectively) + +%include "libavutil/x86/x86util.asm" + +%if ARCH_X86_64 +%define pointer resq +%else +%define pointer resd +%endif + +struc FFTContext + .nbits: resd 1 + .reverse: resd 1 + .revtab: pointer 1 + .tmpbuf: pointer 1 + .mdctsize: resd 1 + .mdctbits: resd 1 + .tcos: pointer 1 + .tsin: pointer 1 + .fftperm: pointer 1 + .fftcalc: pointer 1 + .imdctcalc:pointer 1 + .imdcthalf:pointer 1 +endstruc + +SECTION_RODATA 32 + +%define M_SQRT1_2 0.70710678118654752440 +%define M_COS_PI_1_8 0.923879532511287 +%define M_COS_PI_3_8 0.38268343236509 + +ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8 +ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8 + +ps_root2: times 8 dd M_SQRT1_2 +ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 +ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0 + +perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01 +perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03 +ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2 +ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31 +ps_m1p1: dd 1<<31, 0 + +cextern ps_neg + +%assign i 16 +%rep 14 +cextern cos_ %+ i +%assign i i<<1 +%endrep + +%if ARCH_X86_64 + %define pointer dq +%else + %define pointer dd +%endif + +%macro IF0 1+ +%endmacro +%macro IF1 1+ + %1 +%endmacro + +SECTION .text + +%macro T2_3DNOW 4 ; z0, z1, mem0, mem1 + mova %1, %3 + mova %2, %1 + pfadd %1, %4 + pfsub %2, %4 +%endmacro + +%macro T4_3DNOW 6 ; z0, z1, z2, z3, tmp0, tmp1 + mova %5, %3 + pfsub %3, %4 + pfadd %5, %4 ; {t6,t5} + pxor %3, [ps_m1p1] ; {t8,t7} + mova %6, %1 + movd [r0+12], %3 + punpckhdq %3, [r0+8] + pfadd %1, %5 ; {r0,i0} + pfsub %6, %5 ; {r2,i2} + mova %4, %2 + pfadd %2, %3 ; {r1,i1} + pfsub %4, %3 ; {r3,i3} + SWAP %3, %6 +%endmacro + +; in: %1 = 
{r0,i0,r2,i2,r4,i4,r6,i6} +; %2 = {r1,i1,r3,i3,r5,i5,r7,i7} +; %3, %4, %5 tmp +; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3} +; %2 = {r4,r5,r6,r7,i4,i5,i6,i7} +%macro T8_AVX 5 + vsubps %5, %1, %2 ; v = %1 - %2 + vaddps %3, %1, %2 ; w = %1 + %2 + vmulps %2, %5, [ps_p1p1m1p1root2] ; v *= vals1 + vpermilps %2, %2, [perm1] + vblendps %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6} + vshufps %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5} + vsubps %4, %5, %1 ; s = r - q + vaddps %1, %5, %1 ; u = r + q + vpermilps %1, %1, [perm2] ; k = {u1,u2,u3,u4,u6,u5,u7,u8} + vshufps %5, %4, %1, 0xbb + vshufps %3, %4, %1, 0xee + vperm2f128 %3, %3, %5, 0x13 + vxorps %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {1,1,-1,-1,1,-1,-1,-1} + vshufps %2, %1, %4, 0xdd + vshufps %1, %1, %4, 0x88 + vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4} + vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7} + vsubps %5, %1, %3 + vblendps %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8} + vsubps %2, %4, %1 ; %2 = v - w + vaddps %1, %4, %1 ; %1 = v + w +%endmacro + +; In SSE mode do one fft4 transform +; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3} +; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} +; +; In AVX mode do two fft4 transforms +; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7} +; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7} +%macro T4_SSE 3 + subps %3, %1, %2 ; {t3,t4,-t8,t7} + addps %1, %1, %2 ; {t1,t2,t6,t5} + xorps %3, %3, [ps_p1p1m1p1] + shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8} + shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4} + subps %3, %1, %2 ; {r2,i2,r3,i3} + addps %1, %1, %2 ; {r0,i0,r1,i1} + shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3} + shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3} +%endmacro + +; In SSE mode do one FFT8 +; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7} +; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7} +; +; In AVX mode do two FFT8 +; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11} +; 
%3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15} +; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11} +; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15} +%macro T8_SSE 6 + addps %6, %3, %4 ; {t1,t2,t3,t4} + subps %3, %3, %4 ; {r5,i5,r7,i7} + shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7} + mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7} + mulps %4, %4, [ps_root2] + addps %3, %3, %4 ; {t8,t7,ta,t9} + shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta} + shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8} + subps %3, %6, %4 ; {t6,t5,tc,tb} + addps %6, %6, %4 ; {t1,t2,t9,ta} + shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc} + shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb} + subps %3, %1, %6 ; {r4,r5,r6,r7} + addps %1, %1, %6 ; {r0,r1,r2,r3} + subps %4, %2, %5 ; {i4,i5,i6,i7} + addps %2, %2, %5 ; {i0,i1,i2,i3} +%endmacro + +; scheduled for cpu-bound sizes +%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim +IF%1 mova m4, Z(4) +IF%1 mova m5, Z(5) + mova m0, %2 ; wre + mova m1, %3 ; wim + mulps m2, m4, m0 ; r2*wre +IF%1 mova m6, Z2(6) + mulps m3, m5, m1 ; i2*wim +IF%1 mova m7, Z2(7) + mulps m4, m4, m1 ; r2*wim + mulps m5, m5, m0 ; i2*wre + addps m2, m2, m3 ; r2*wre + i2*wim + mulps m3, m1, m7 ; i3*wim + subps m5, m5, m4 ; i2*wre - r2*wim + mulps m1, m1, m6 ; r3*wim + mulps m4, m0, m6 ; r3*wre + mulps m0, m0, m7 ; i3*wre + subps m4, m4, m3 ; r3*wre - i3*wim + mova m3, Z(0) + addps m0, m0, m1 ; i3*wre + r3*wim + subps m1, m4, m2 ; t3 + addps m4, m4, m2 ; t5 + subps m3, m3, m4 ; r2 + addps m4, m4, Z(0) ; r0 + mova m6, Z(2) + mova Z(4), m3 + mova Z(0), m4 + subps m3, m5, m0 ; t4 + subps m4, m6, m3 ; r3 + addps m3, m3, m6 ; r1 + mova Z2(6), m4 + mova Z(2), m3 + mova m2, Z(3) + addps m3, m5, m0 ; t6 + subps m2, m2, m1 ; i3 + mova m7, Z(1) + addps m1, m1, Z(3) ; i1 + mova Z2(7), m2 + mova Z(3), m1 + subps m4, m7, m3 ; i2 + addps m3, m3, m7 ; i0 + mova Z(5), m4 + mova Z(1), m3 +%endmacro + +; scheduled to avoid store->load aliasing +%macro PASS_BIG 1 ; (!interleave) + mova 
m4, Z(4) ; r2 + mova m5, Z(5) ; i2 + mova m0, [wq] ; wre + mova m1, [wq+o1q] ; wim + mulps m2, m4, m0 ; r2*wre + mova m6, Z2(6) ; r3 + mulps m3, m5, m1 ; i2*wim + mova m7, Z2(7) ; i3 + mulps m4, m4, m1 ; r2*wim + mulps m5, m5, m0 ; i2*wre + addps m2, m2, m3 ; r2*wre + i2*wim + mulps m3, m1, m7 ; i3*wim + mulps m1, m1, m6 ; r3*wim + subps m5, m5, m4 ; i2*wre - r2*wim + mulps m4, m0, m6 ; r3*wre + mulps m0, m0, m7 ; i3*wre + subps m4, m4, m3 ; r3*wre - i3*wim + mova m3, Z(0) + addps m0, m0, m1 ; i3*wre + r3*wim + subps m1, m4, m2 ; t3 + addps m4, m4, m2 ; t5 + subps m3, m3, m4 ; r2 + addps m4, m4, Z(0) ; r0 + mova m6, Z(2) + mova Z(4), m3 + mova Z(0), m4 + subps m3, m5, m0 ; t4 + subps m4, m6, m3 ; r3 + addps m3, m3, m6 ; r1 +IF%1 mova Z2(6), m4 +IF%1 mova Z(2), m3 + mova m2, Z(3) + addps m5, m5, m0 ; t6 + subps m2, m2, m1 ; i3 + mova m7, Z(1) + addps m1, m1, Z(3) ; i1 +IF%1 mova Z2(7), m2 +IF%1 mova Z(3), m1 + subps m6, m7, m5 ; i2 + addps m5, m5, m7 ; i0 +IF%1 mova Z(5), m6 +IF%1 mova Z(1), m5 +%if %1==0 + INTERL m1, m3, m7, Z, 2 + INTERL m2, m4, m0, Z2, 6 + + mova m1, Z(0) + mova m2, Z(4) + + INTERL m5, m1, m3, Z, 0 + INTERL m6, m2, m7, Z, 4 +%endif +%endmacro + +%macro PUNPCK 3 + mova %3, %1 + punpckldq %1, %2 + punpckhdq %3, %2 +%endmacro + +%define Z(x) [r0+mmsize*x] +%define Z2(x) [r0+mmsize*x] +%define ZH(x) [r0+mmsize*x+mmsize/2] + +INIT_YMM avx + +%if HAVE_AVX_EXTERNAL +align 16 +fft8_avx: + mova m0, Z(0) + mova m1, Z(1) + T8_AVX m0, m1, m2, m3, m4 + mova Z(0), m0 + mova Z(1), m1 + ret + + +align 16 +fft16_avx: + mova m2, Z(2) + mova m3, Z(3) + T4_SSE m2, m3, m7 + + mova m0, Z(0) + mova m1, Z(1) + T8_AVX m0, m1, m4, m5, m7 + + mova m4, [ps_cos16_1] + mova m5, [ps_cos16_2] + vmulps m6, m2, m4 + vmulps m7, m3, m5 + vaddps m7, m7, m6 + vmulps m2, m2, m5 + vmulps m3, m3, m4 + vsubps m3, m3, m2 + vblendps m2, m7, m3, 0xf0 + vperm2f128 m3, m7, m3, 0x21 + vaddps m4, m2, m3 + vsubps m2, m3, m2 + vperm2f128 m2, m2, m2, 0x01 + vsubps m3, m1, m2 + vaddps m1, m1, m2 + 
vsubps m5, m0, m4 + vaddps m0, m0, m4 + vextractf128 Z(0), m0, 0 + vextractf128 ZH(0), m1, 0 + vextractf128 Z(1), m0, 1 + vextractf128 ZH(1), m1, 1 + vextractf128 Z(2), m5, 0 + vextractf128 ZH(2), m3, 0 + vextractf128 Z(3), m5, 1 + vextractf128 ZH(3), m3, 1 + ret + +align 16 +fft32_avx: + call fft16_avx + + mova m0, Z(4) + mova m1, Z(5) + + T4_SSE m0, m1, m4 + + mova m2, Z(6) + mova m3, Z(7) + + T8_SSE m0, m1, m2, m3, m4, m6 + ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11} + ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15} + + vperm2f128 m4, m0, m2, 0x20 + vperm2f128 m5, m1, m3, 0x20 + vperm2f128 m6, m0, m2, 0x31 + vperm2f128 m7, m1, m3, 0x31 + + PASS_SMALL 0, [cos_32], [cos_32+32] + + ret + +fft32_interleave_avx: + call fft32_avx + mov r2d, 32 +.deint_loop: + mova m2, Z(0) + mova m3, Z(1) + vunpcklps m0, m2, m3 + vunpckhps m1, m2, m3 + vextractf128 Z(0), m0, 0 + vextractf128 ZH(0), m1, 0 + vextractf128 Z(1), m0, 1 + vextractf128 ZH(1), m1, 1 + add r0, mmsize*2 + sub r2d, mmsize/4 + jg .deint_loop + ret + +%endif + +INIT_XMM sse + +align 16 +fft4_avx: +fft4_sse: + mova m0, Z(0) + mova m1, Z(1) + T4_SSE m0, m1, m2 + mova Z(0), m0 + mova Z(1), m1 + ret + +align 16 +fft8_sse: + mova m0, Z(0) + mova m1, Z(1) + T4_SSE m0, m1, m2 + mova m2, Z(2) + mova m3, Z(3) + T8_SSE m0, m1, m2, m3, m4, m5 + mova Z(0), m0 + mova Z(1), m1 + mova Z(2), m2 + mova Z(3), m3 + ret + +align 16 +fft16_sse: + mova m0, Z(0) + mova m1, Z(1) + T4_SSE m0, m1, m2 + mova m2, Z(2) + mova m3, Z(3) + T8_SSE m0, m1, m2, m3, m4, m5 + mova m4, Z(4) + mova m5, Z(5) + mova Z(0), m0 + mova Z(1), m1 + mova Z(2), m2 + mova Z(3), m3 + T4_SSE m4, m5, m6 + mova m6, Z2(6) + mova m7, Z2(7) + T4_SSE m6, m7, m0 + PASS_SMALL 0, [cos_16], [cos_16+16] + ret + + +%macro FFT48_3DNOW 0 +align 16 +fft4 %+ SUFFIX: + T2_3DNOW m0, m1, Z(0), Z(1) + mova m2, Z(2) + mova m3, Z(3) + T4_3DNOW m0, m1, m2, m3, m4, m5 + PUNPCK m0, m1, m4 + PUNPCK m2, m3, m5 + mova Z(0), m0 + mova Z(1), m4 + mova 
Z(2), m2 + mova Z(3), m5 + ret + +align 16 +fft8 %+ SUFFIX: + T2_3DNOW m0, m1, Z(0), Z(1) + mova m2, Z(2) + mova m3, Z(3) + T4_3DNOW m0, m1, m2, m3, m4, m5 + mova Z(0), m0 + mova Z(2), m2 + T2_3DNOW m4, m5, Z(4), Z(5) + T2_3DNOW m6, m7, Z2(6), Z2(7) + PSWAPD m0, m5 + PSWAPD m2, m7 + pxor m0, [ps_m1p1] + pxor m2, [ps_m1p1] + pfsub m5, m0 + pfadd m7, m2 + pfmul m5, [ps_root2] + pfmul m7, [ps_root2] + T4_3DNOW m1, m3, m5, m7, m0, m2 + mova Z(5), m5 + mova Z2(7), m7 + mova m0, Z(0) + mova m2, Z(2) + T4_3DNOW m0, m2, m4, m6, m5, m7 + PUNPCK m0, m1, m5 + PUNPCK m2, m3, m7 + mova Z(0), m0 + mova Z(1), m5 + mova Z(2), m2 + mova Z(3), m7 + PUNPCK m4, Z(5), m5 + PUNPCK m6, Z2(7), m7 + mova Z(4), m4 + mova Z(5), m5 + mova Z2(6), m6 + mova Z2(7), m7 + ret +%endmacro + +%if ARCH_X86_32 +INIT_MMX 3dnowext +FFT48_3DNOW + +INIT_MMX 3dnow +FFT48_3DNOW +%endif + +%define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)] +%define Z2(x) [zcq + o3q + mmsize*(x&1)] +%define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2] +%define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2] + +%macro DECL_PASS 2+ ; name, payload +align 16 +%1: +DEFINE_ARGS zc, w, n, o1, o3 + lea o3q, [nq*3] + lea o1q, [nq*8] + shl o3q, 4 +.loop: + %2 + add zcq, mmsize*2 + add wq, mmsize + sub nd, mmsize/8 + jg .loop + rep ret +%endmacro + +%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs + lea r2, [dispatch_tab%1] + mov r2, [r2 + (%2q-2)*gprsize] +%ifdef PIC + lea r3, [$$] + add r2, r3 +%endif + call r2 +%endmacro ; FFT_DISPATCH + +INIT_YMM avx + +%if HAVE_AVX_EXTERNAL +%macro INTERL_AVX 5 + vunpckhps %3, %2, %1 + vunpcklps %2, %2, %1 + vextractf128 %4(%5), %2, 0 + vextractf128 %4 %+ H(%5), %3, 0 + vextractf128 %4(%5 + 1), %2, 1 + vextractf128 %4 %+ H(%5 + 1), %3, 1 +%endmacro + +%define INTERL INTERL_AVX + +DECL_PASS pass_avx, PASS_BIG 1 +DECL_PASS pass_interleave_avx, PASS_BIG 0 + +cglobal fft_calc, 2,5,8 + mov r3d, [r0 + FFTContext.nbits] + mov r0, r1 + mov r1, r3 + FFT_DISPATCH _interleave %+ SUFFIX, r1 + REP_RET + +%endif + 
+INIT_XMM sse + +%macro INTERL_SSE 5 + mova %3, %2 + unpcklps %2, %1 + unpckhps %3, %1 + mova %4(%5), %2 + mova %4(%5+1), %3 +%endmacro + +%define INTERL INTERL_SSE + +DECL_PASS pass_sse, PASS_BIG 1 +DECL_PASS pass_interleave_sse, PASS_BIG 0 + +%macro FFT_CALC_FUNC 0 +cglobal fft_calc, 2,5,8 + mov r3d, [r0 + FFTContext.nbits] + PUSH r1 + PUSH r3 + mov r0, r1 + mov r1, r3 + FFT_DISPATCH _interleave %+ SUFFIX, r1 + POP rcx + POP r4 + cmp rcx, 3+(mmsize/16) + jg .end + mov r2, -1 + add rcx, 3 + shl r2, cl + sub r4, r2 +.loop: +%if mmsize == 8 + PSWAPD m0, [r4 + r2 + 4] + mova [r4 + r2 + 4], m0 +%else + movaps xmm0, [r4 + r2] + movaps xmm1, xmm0 + unpcklps xmm0, [r4 + r2 + 16] + unpckhps xmm1, [r4 + r2 + 16] + movaps [r4 + r2], xmm0 + movaps [r4 + r2 + 16], xmm1 +%endif + add r2, mmsize*2 + jl .loop +.end: +%if cpuflag(3dnow) + femms + RET +%else + REP_RET +%endif +%endmacro + +%if ARCH_X86_32 +INIT_MMX 3dnow +FFT_CALC_FUNC +INIT_MMX 3dnowext +FFT_CALC_FUNC +%endif +INIT_XMM sse +FFT_CALC_FUNC + +cglobal fft_permute, 2,7,1 + mov r4, [r0 + FFTContext.revtab] + mov r5, [r0 + FFTContext.tmpbuf] + mov ecx, [r0 + FFTContext.nbits] + mov r2, 1 + shl r2, cl + xor r0, r0 +%if ARCH_X86_32 + mov r1, r1m +%endif +.loop: + movaps xmm0, [r1 + 8*r0] + movzx r6, word [r4 + 2*r0] + movzx r3, word [r4 + 2*r0 + 2] + movlps [r5 + 8*r6], xmm0 + movhps [r5 + 8*r3], xmm0 + add r0, 2 + cmp r0, r2 + jl .loop + shl r2, 3 + add r1, r2 + add r5, r2 + neg r2 +; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B +.loopcopy: + movaps xmm0, [r5 + r2] + movaps xmm1, [r5 + r2 + 16] + movaps [r1 + r2], xmm0 + movaps [r1 + r2 + 16], xmm1 + add r2, 32 + jl .loopcopy + REP_RET + +%macro IMDCT_CALC_FUNC 0 +cglobal imdct_calc, 3,5,3 + mov r3d, [r0 + FFTContext.mdctsize] + mov r4, [r0 + FFTContext.imdcthalf] + add r1, r3 + PUSH r3 + PUSH r1 +%if ARCH_X86_32 + push r2 + push r1 + push r0 +%else + sub rsp, 8+32*WIN64 ; allocate win64 shadow space +%endif + call r4 +%if ARCH_X86_32 + add esp, 12 +%else 
+ add rsp, 8+32*WIN64 +%endif + POP r1 + POP r3 + lea r0, [r1 + 2*r3] + mov r2, r3 + sub r3, mmsize + neg r2 + mova m2, [ps_neg] +.loop: +%if mmsize == 8 + PSWAPD m0, [r1 + r3] + PSWAPD m1, [r0 + r2] + pxor m0, m2 +%else + mova m0, [r1 + r3] + mova m1, [r0 + r2] + shufps m0, m0, 0x1b + shufps m1, m1, 0x1b + xorps m0, m2 +%endif + mova [r0 + r3], m1 + mova [r1 + r2], m0 + sub r3, mmsize + add r2, mmsize + jl .loop +%if cpuflag(3dnow) + femms + RET +%else + REP_RET +%endif +%endmacro + +%if ARCH_X86_32 +INIT_MMX 3dnow +IMDCT_CALC_FUNC +INIT_MMX 3dnowext +IMDCT_CALC_FUNC +%endif + +INIT_XMM sse +IMDCT_CALC_FUNC + +%if ARCH_X86_32 +INIT_MMX 3dnow +%define mulps pfmul +%define addps pfadd +%define subps pfsub +%define unpcklps punpckldq +%define unpckhps punpckhdq +DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q] +DECL_PASS pass_interleave_3dnow, PASS_BIG 0 +%define pass_3dnowext pass_3dnow +%define pass_interleave_3dnowext pass_interleave_3dnow +%endif + +%ifdef PIC +%define SECTION_REL - $$ +%else +%define SECTION_REL +%endif + +%macro DECL_FFT 1-2 ; nbits, suffix +%ifidn %0, 1 +%xdefine fullsuffix SUFFIX +%else +%xdefine fullsuffix %2 %+ SUFFIX +%endif +%xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL +%if %1>=5 +%xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL +%endif +%if %1>=6 +%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL +%endif + +%assign n 1<<%1 +%rep 18-%1 +%assign n2 n/2 +%assign n4 n/4 +%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL + +align 16 +fft %+ n %+ fullsuffix: + call fft %+ n2 %+ SUFFIX + add r0, n*4 - (n&(-2<<%1)) + call fft %+ n4 %+ SUFFIX + add r0, n*2 - (n2&(-2<<%1)) + call fft %+ n4 %+ SUFFIX + sub r0, n*6 + (n2&(-2<<%1)) + lea r1, [cos_ %+ n] + mov r2d, n4/2 + jmp pass %+ fullsuffix + +%assign n n*2 +%endrep +%undef n + +align 8 +dispatch_tab %+ fullsuffix: pointer list_of_fft +%endmacro ; DECL_FFT + +%if HAVE_AVX_EXTERNAL +INIT_YMM avx +DECL_FFT 6 +DECL_FFT 
6, _interleave +%endif +INIT_XMM sse +DECL_FFT 5 +DECL_FFT 5, _interleave +%if ARCH_X86_32 +INIT_MMX 3dnow +DECL_FFT 4 +DECL_FFT 4, _interleave +INIT_MMX 3dnowext +DECL_FFT 4 +DECL_FFT 4, _interleave +%endif + +INIT_XMM sse +%undef mulps +%undef addps +%undef subps +%undef unpcklps +%undef unpckhps + +%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8 +%if mmsize == 8 ; j*2+2-n4, n4-2-j*2, input+n4, tcos+n8, tsin+n8 + PSWAPD m0, [%3+%2*4] + movq m2, [%3+%1*4-8] + movq m3, m0 + punpckldq m0, m2 + punpckhdq m2, m3 + movd m1, [%4+%1*2-4] ; tcos[j] + movd m3, [%4+%2*2] ; tcos[n4-j-1] + punpckldq m1, [%5+%1*2-4] ; tsin[j] + punpckldq m3, [%5+%2*2] ; tsin[n4-j-1] + + mova m4, m0 + PSWAPD m5, m1 + pfmul m0, m1 + pfmul m4, m5 + mova m6, m2 + PSWAPD m5, m3 + pfmul m2, m3 + pfmul m6, m5 +%if cpuflag(3dnowext) + pfpnacc m0, m4 + pfpnacc m2, m6 +%else + SBUTTERFLY dq, 0, 4, 1 + SBUTTERFLY dq, 2, 6, 3 + pxor m4, m7 + pxor m6, m7 + pfadd m0, m4 + pfadd m2, m6 +%endif +%else + movaps xmm0, [%3+%2*4] + movaps xmm1, [%3+%1*4-0x10] + movaps xmm2, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm1, xmm2, 0x77 + movlps xmm4, [%4+%2*2] + movlps xmm5, [%5+%2*2+0x0] + movhps xmm4, [%4+%1*2-0x8] + movhps xmm5, [%5+%1*2-0x8] + movaps xmm2, xmm0 + movaps xmm3, xmm1 + mulps xmm0, xmm5 + mulps xmm1, xmm4 + mulps xmm2, xmm4 + mulps xmm3, xmm5 + subps xmm1, xmm0 + addps xmm2, xmm3 + movaps xmm0, xmm1 + unpcklps xmm1, xmm2 + unpckhps xmm0, xmm2 +%endif +%endmacro + +%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5 + mulps m6, %3, [%5+%1] + mulps m7, %2, [%5+%1] + mulps %2, %2, [%6+%1] + mulps %3, %3, [%6+%1] + subps %2, %2, m6 + addps %3, %3, m7 +%endmacro + +%macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8 +.post: + vmovaps ymm1, [%3+%1*2] + vmovaps ymm0, [%3+%1*2+0x20] + vmovaps ymm3, [%3+%2*2] + vmovaps ymm2, [%3+%2*2+0x20] + + CMUL %1, ymm0, ymm1, %3, %4, %5 + CMUL %2, ymm2, ymm3, %3, %4, %5 + vshufps ymm1, ymm1, ymm1, 0x1b + vshufps ymm3, ymm3, ymm3, 0x1b + vperm2f128 ymm1, ymm1, ymm1, 0x01 
+ vperm2f128 ymm3, ymm3, ymm3, 0x01 + vunpcklps ymm6, ymm2, ymm1 + vunpckhps ymm4, ymm2, ymm1 + vunpcklps ymm7, ymm0, ymm3 + vunpckhps ymm5, ymm0, ymm3 + + vextractf128 [%3+%1*2], ymm7, 0 + vextractf128 [%3+%1*2+0x10], ymm5, 0 + vextractf128 [%3+%1*2+0x20], ymm7, 1 + vextractf128 [%3+%1*2+0x30], ymm5, 1 + + vextractf128 [%3+%2*2], ymm6, 0 + vextractf128 [%3+%2*2+0x10], ymm4, 0 + vextractf128 [%3+%2*2+0x20], ymm6, 1 + vextractf128 [%3+%2*2+0x30], ymm4, 1 + sub %2, 0x20 + add %1, 0x20 + jl .post +%endmacro + +%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8 +.post: + movaps xmm1, [%3+%1*2] + movaps xmm0, [%3+%1*2+0x10] + CMUL %1, xmm0, xmm1, %3, %4, %5 + movaps xmm5, [%3+%2*2] + movaps xmm4, [%3+%2*2+0x10] + CMUL %2, xmm4, xmm5, %3, %4, %5 + shufps xmm1, xmm1, 0x1b + shufps xmm5, xmm5, 0x1b + movaps xmm6, xmm4 + unpckhps xmm4, xmm1 + unpcklps xmm6, xmm1 + movaps xmm2, xmm0 + unpcklps xmm0, xmm5 + unpckhps xmm2, xmm5 + movaps [%3+%2*2], xmm6 + movaps [%3+%2*2+0x10], xmm4 + movaps [%3+%1*2], xmm0 + movaps [%3+%1*2+0x10], xmm2 + sub %2, 0x10 + add %1, 0x10 + jl .post +%endmacro + +%macro CMUL_3DNOW 6 + mova m6, [%1+%2*2] + mova %3, [%1+%2*2+8] + mova %4, m6 + mova m7, %3 + pfmul m6, [%5+%2] + pfmul %3, [%6+%2] + pfmul %4, [%6+%2] + pfmul m7, [%5+%2] + pfsub %3, m6 + pfadd %4, m7 +%endmacro + +%macro POSROTATESHUF_3DNOW 5 ;j, k, z+n8, tcos+n8, tsin+n8 +.post: + CMUL_3DNOW %3, %1, m0, m1, %4, %5 + CMUL_3DNOW %3, %2, m2, m3, %4, %5 + movd [%3+%1*2+ 0], m0 + movd [%3+%2*2+12], m1 + movd [%3+%2*2+ 0], m2 + movd [%3+%1*2+12], m3 + psrlq m0, 32 + psrlq m1, 32 + psrlq m2, 32 + psrlq m3, 32 + movd [%3+%1*2+ 8], m0 + movd [%3+%2*2+ 4], m1 + movd [%3+%2*2+ 8], m2 + movd [%3+%1*2+ 4], m3 + sub %2, 8 + add %1, 8 + jl .post +%endmacro + +%macro DECL_IMDCT 1 +cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input +%if ARCH_X86_64 +%define rrevtab r7 +%define rtcos r8 +%define rtsin r9 +%else +%define rrevtab r6 +%define rtsin r6 +%define rtcos r5 
+%endif + mov r3d, [r0+FFTContext.mdctsize] + add r2, r3 + shr r3, 1 + mov rtcos, [r0+FFTContext.tcos] + mov rtsin, [r0+FFTContext.tsin] + add rtcos, r3 + add rtsin, r3 +%if ARCH_X86_64 == 0 + push rtcos + push rtsin +%endif + shr r3, 1 + mov rrevtab, [r0+FFTContext.revtab] + add rrevtab, r3 +%if ARCH_X86_64 == 0 + push rrevtab +%endif + +%if mmsize == 8 + sub r3, 2 +%else + sub r3, 4 +%endif +%if ARCH_X86_64 || mmsize == 8 + xor r4, r4 + sub r4, r3 +%endif +%if notcpuflag(3dnowext) && mmsize == 8 + movd m7, [ps_neg] +%endif +.pre: +%if ARCH_X86_64 == 0 +;unspill +%if mmsize != 8 + xor r4, r4 + sub r4, r3 +%endif + mov rtcos, [esp+8] + mov rtsin, [esp+4] +%endif + + PREROTATER r4, r3, r2, rtcos, rtsin +%if mmsize == 8 + mov r6, [esp] ; rrevtab = ptr+n8 + movzx r5, word [rrevtab+r4-2] ; rrevtab[j] + movzx r6, word [rrevtab+r3] ; rrevtab[n4-j-1] + mova [r1+r5*8], m0 + mova [r1+r6*8], m2 + add r4, 2 + sub r3, 2 +%else +%if ARCH_X86_64 + movzx r5, word [rrevtab+r4-4] + movzx r6, word [rrevtab+r4-2] + movzx r10, word [rrevtab+r3] + movzx r11, word [rrevtab+r3+2] + movlps [r1+r5 *8], xmm0 + movhps [r1+r6 *8], xmm0 + movlps [r1+r10*8], xmm1 + movhps [r1+r11*8], xmm1 + add r4, 4 +%else + mov r6, [esp] + movzx r5, word [r6+r4-4] + movzx r4, word [r6+r4-2] + movlps [r1+r5*8], xmm0 + movhps [r1+r4*8], xmm0 + movzx r5, word [r6+r3] + movzx r4, word [r6+r3+2] + movlps [r1+r5*8], xmm1 + movhps [r1+r4*8], xmm1 +%endif + sub r3, 4 +%endif + jns .pre + + mov r5, r0 + mov r6, r1 + mov r0, r1 + mov r1d, [r5+FFTContext.nbits] + + FFT_DISPATCH SUFFIX, r1 + + mov r0d, [r5+FFTContext.mdctsize] + add r6, r0 + shr r0, 1 +%if ARCH_X86_64 == 0 +%define rtcos r2 +%define rtsin r3 + mov rtcos, [esp+8] + mov rtsin, [esp+4] +%endif + neg r0 + mov r1, -mmsize + sub r1, r0 + %1 r0, r1, r6, rtcos, rtsin +%if ARCH_X86_64 == 0 + add esp, 12 +%endif +%if mmsize == 8 + femms +%endif + RET +%endmacro + +DECL_IMDCT POSROTATESHUF + +%if ARCH_X86_32 +INIT_MMX 3dnow +DECL_IMDCT POSROTATESHUF_3DNOW + 
+INIT_MMX 3dnowext +DECL_IMDCT POSROTATESHUF_3DNOW +%endif + +INIT_YMM avx + +%if HAVE_AVX_EXTERNAL +DECL_IMDCT POSROTATESHUF_AVX +%endif diff --git a/media/ffvpx/libavcodec/x86/fft.h b/media/ffvpx/libavcodec/x86/fft.h new file mode 100644 index 000000000000..398091eb1f58 --- /dev/null +++ b/media/ffvpx/libavcodec/x86/fft.h @@ -0,0 +1,38 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_X86_FFT_H +#define AVCODEC_X86_FFT_H + +#include "libavcodec/fft.h" + +void ff_fft_permute_sse(FFTContext *s, FFTComplex *z); +void ff_fft_calc_avx(FFTContext *s, FFTComplex *z); +void ff_fft_calc_sse(FFTContext *s, FFTComplex *z); +void ff_fft_calc_3dnow(FFTContext *s, FFTComplex *z); +void ff_fft_calc_3dnowext(FFTContext *s, FFTComplex *z); + +void ff_imdct_calc_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_imdct_half_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_imdct_calc_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_imdct_half_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input); 
+void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input); + +#endif /* AVCODEC_X86_FFT_H */ diff --git a/media/ffvpx/libavcodec/x86/fft_init.c b/media/ffvpx/libavcodec/x86/fft_init.c new file mode 100644 index 000000000000..928f1dcda72c --- /dev/null +++ b/media/ffvpx/libavcodec/x86/fft_init.c @@ -0,0 +1,61 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" + +#include "fft.h" + +av_cold void ff_fft_init_x86(FFTContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (s->nbits > 16) + return; + +#if ARCH_X86_32 + if (EXTERNAL_AMD3DNOW(cpu_flags)) { + s->imdct_calc = ff_imdct_calc_3dnow; + s->imdct_half = ff_imdct_half_3dnow; + s->fft_calc = ff_fft_calc_3dnow; + } + + if (EXTERNAL_AMD3DNOWEXT(cpu_flags)) { + s->imdct_calc = ff_imdct_calc_3dnowext; + s->imdct_half = ff_imdct_half_3dnowext; + s->fft_calc = ff_fft_calc_3dnowext; + } +#endif /* ARCH_X86_32 */ + + if (EXTERNAL_SSE(cpu_flags)) { + s->imdct_calc = ff_imdct_calc_sse; + s->imdct_half = ff_imdct_half_sse; + s->fft_permute = ff_fft_permute_sse; + s->fft_calc = ff_fft_calc_sse; + s->fft_permutation = FF_FFT_PERM_SWAP_LSBS; + } + + if 
(EXTERNAL_AVX_FAST(cpu_flags) && s->nbits >= 5) { + s->imdct_half = ff_imdct_half_avx; + s->fft_calc = ff_fft_calc_avx; + s->fft_permutation = FF_FFT_PERM_AVX; + } +} diff --git a/media/ffvpx/libavcodec/x86/moz.build b/media/ffvpx/libavcodec/x86/moz.build index 3f0740df1b19..e9325617e0c2 100644 --- a/media/ffvpx/libavcodec/x86/moz.build +++ b/media/ffvpx/libavcodec/x86/moz.build @@ -30,6 +30,12 @@ SOURCES += [ 'vp9mc_16bpp.asm', ] +if CONFIG['MOZ_LIBAV_FFT']: + SOURCES += [ + 'fft.asm', + 'fft_init.c', + ] + FINAL_LIBRARY = 'mozavcodec' include('/media/ffvpx/ffvpxcommon.mozbuild')