Clean up dead forward transform functions
Clean up related wrappers and unit tests. Change-Id: I2d37a8c80de63dbeaef584e3d5fa842c0b2ee6db
This commit is contained in:
Родитель
8e3da0973f
Коммит
d405f8a627
|
@ -341,24 +341,15 @@ if ((aom_config("CONFIG_AV1_ENCODER") eq "yes") || (aom_config("CONFIG_PVQ") eq
|
||||||
add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
|
add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
|
||||||
specialize qw/aom_fdct8x8 sse2/, "$ssse3_x86_64";
|
specialize qw/aom_fdct8x8 sse2/, "$ssse3_x86_64";
|
||||||
|
|
||||||
add_proto qw/void aom_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
|
|
||||||
specialize qw/aom_fdct8x8_1 sse2/;
|
|
||||||
|
|
||||||
add_proto qw/void aom_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
|
add_proto qw/void aom_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
|
||||||
specialize qw/aom_fdct16x16 sse2/;
|
specialize qw/aom_fdct16x16 sse2/;
|
||||||
|
|
||||||
add_proto qw/void aom_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
|
|
||||||
specialize qw/aom_fdct16x16_1 sse2 avx2/;
|
|
||||||
|
|
||||||
add_proto qw/void aom_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
|
add_proto qw/void aom_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
|
||||||
specialize qw/aom_fdct32x32 sse2 avx2/;
|
specialize qw/aom_fdct32x32 sse2 avx2/;
|
||||||
|
|
||||||
add_proto qw/void aom_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
|
add_proto qw/void aom_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
|
||||||
specialize qw/aom_fdct32x32_rd sse2 avx2/;
|
specialize qw/aom_fdct32x32_rd sse2 avx2/;
|
||||||
|
|
||||||
add_proto qw/void aom_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
|
|
||||||
specialize qw/aom_fdct32x32_1 sse2 avx2/;
|
|
||||||
|
|
||||||
# High bit depth
|
# High bit depth
|
||||||
add_proto qw/void aom_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
|
add_proto qw/void aom_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
|
||||||
specialize qw/aom_highbd_fdct4x4 sse2/;
|
specialize qw/aom_highbd_fdct4x4 sse2/;
|
||||||
|
@ -366,20 +357,15 @@ if ((aom_config("CONFIG_AV1_ENCODER") eq "yes") || (aom_config("CONFIG_PVQ") eq
|
||||||
add_proto qw/void aom_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
|
add_proto qw/void aom_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
|
||||||
specialize qw/aom_highbd_fdct8x8 sse2/;
|
specialize qw/aom_highbd_fdct8x8 sse2/;
|
||||||
|
|
||||||
add_proto qw/void aom_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
|
|
||||||
|
|
||||||
add_proto qw/void aom_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
|
add_proto qw/void aom_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
|
||||||
specialize qw/aom_highbd_fdct16x16 sse2/;
|
specialize qw/aom_highbd_fdct16x16 sse2/;
|
||||||
|
|
||||||
add_proto qw/void aom_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
|
|
||||||
|
|
||||||
add_proto qw/void aom_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
|
add_proto qw/void aom_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
|
||||||
specialize qw/aom_highbd_fdct32x32 sse2/;
|
specialize qw/aom_highbd_fdct32x32 sse2/;
|
||||||
|
|
||||||
add_proto qw/void aom_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
|
add_proto qw/void aom_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
|
||||||
specialize qw/aom_highbd_fdct32x32_rd sse2/;
|
specialize qw/aom_highbd_fdct32x32_rd sse2/;
|
||||||
|
|
||||||
add_proto qw/void aom_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
|
|
||||||
} else {
|
} else {
|
||||||
add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
|
add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
|
||||||
specialize qw/aom_fdct4x4 sse2 msa/;
|
specialize qw/aom_fdct4x4 sse2 msa/;
|
||||||
|
@ -390,23 +376,14 @@ if ((aom_config("CONFIG_AV1_ENCODER") eq "yes") || (aom_config("CONFIG_PVQ") eq
|
||||||
add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
|
add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
|
||||||
specialize qw/aom_fdct8x8 sse2 neon msa/, "$ssse3_x86_64";
|
specialize qw/aom_fdct8x8 sse2 neon msa/, "$ssse3_x86_64";
|
||||||
|
|
||||||
add_proto qw/void aom_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
|
|
||||||
specialize qw/aom_fdct8x8_1 sse2 neon msa/;
|
|
||||||
|
|
||||||
add_proto qw/void aom_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
|
add_proto qw/void aom_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
|
||||||
specialize qw/aom_fdct16x16 sse2 msa/;
|
specialize qw/aom_fdct16x16 sse2 msa/;
|
||||||
|
|
||||||
add_proto qw/void aom_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
|
|
||||||
specialize qw/aom_fdct16x16_1 sse2 avx2 msa/;
|
|
||||||
|
|
||||||
add_proto qw/void aom_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
|
add_proto qw/void aom_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
|
||||||
specialize qw/aom_fdct32x32 sse2 avx2 msa/;
|
specialize qw/aom_fdct32x32 sse2 avx2 msa/;
|
||||||
|
|
||||||
add_proto qw/void aom_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
|
add_proto qw/void aom_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
|
||||||
specialize qw/aom_fdct32x32_rd sse2 avx2 msa/;
|
specialize qw/aom_fdct32x32_rd sse2 avx2 msa/;
|
||||||
|
|
||||||
add_proto qw/void aom_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
|
|
||||||
specialize qw/aom_fdct32x32_1 sse2 avx2 msa/;
|
|
||||||
} # CONFIG_HIGHBITDEPTH
|
} # CONFIG_HIGHBITDEPTH
|
||||||
} # CONFIG_AV1_ENCODER
|
} # CONFIG_AV1_ENCODER
|
||||||
|
|
||||||
|
|
|
@ -172,15 +172,6 @@ void aom_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void aom_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
|
|
||||||
int r, c;
|
|
||||||
tran_low_t sum = 0;
|
|
||||||
for (r = 0; r < 8; ++r)
|
|
||||||
for (c = 0; c < 8; ++c) sum += input[r * stride + c];
|
|
||||||
|
|
||||||
output[0] = sum;
|
|
||||||
}
|
|
||||||
|
|
||||||
void aom_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
|
void aom_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
|
||||||
// The 2D transform is done with two passes which are actually pretty
|
// The 2D transform is done with two passes which are actually pretty
|
||||||
// similar. In the first one, we transform the columns and transpose
|
// similar. In the first one, we transform the columns and transpose
|
||||||
|
@ -361,15 +352,6 @@ void aom_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void aom_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
|
|
||||||
int r, c;
|
|
||||||
int sum = 0;
|
|
||||||
for (r = 0; r < 16; ++r)
|
|
||||||
for (c = 0; c < 16; ++c) sum += input[r * stride + c];
|
|
||||||
|
|
||||||
output[0] = (tran_low_t)(sum >> 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
static INLINE tran_high_t dct_32_round(tran_high_t input) {
|
static INLINE tran_high_t dct_32_round(tran_high_t input) {
|
||||||
tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
|
tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
|
||||||
// TODO(debargha, peter.derivaz): Find new bounds for this assert,
|
// TODO(debargha, peter.derivaz): Find new bounds for this assert,
|
||||||
|
@ -758,15 +740,6 @@ void aom_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void aom_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {
|
|
||||||
int r, c;
|
|
||||||
int sum = 0;
|
|
||||||
for (r = 0; r < 32; ++r)
|
|
||||||
for (c = 0; c < 32; ++c) sum += input[r * stride + c];
|
|
||||||
|
|
||||||
output[0] = (tran_low_t)(sum >> 3);
|
|
||||||
}
|
|
||||||
|
|
||||||
#if CONFIG_HIGHBITDEPTH
|
#if CONFIG_HIGHBITDEPTH
|
||||||
void aom_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output,
|
void aom_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output,
|
||||||
int stride) {
|
int stride) {
|
||||||
|
@ -778,32 +751,17 @@ void aom_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
|
||||||
aom_fdct8x8_c(input, final_output, stride);
|
aom_fdct8x8_c(input, final_output, stride);
|
||||||
}
|
}
|
||||||
|
|
||||||
void aom_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,
|
|
||||||
int stride) {
|
|
||||||
aom_fdct8x8_1_c(input, final_output, stride);
|
|
||||||
}
|
|
||||||
|
|
||||||
void aom_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
|
void aom_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
|
||||||
int stride) {
|
int stride) {
|
||||||
aom_fdct16x16_c(input, output, stride);
|
aom_fdct16x16_c(input, output, stride);
|
||||||
}
|
}
|
||||||
|
|
||||||
void aom_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
|
|
||||||
int stride) {
|
|
||||||
aom_fdct16x16_1_c(input, output, stride);
|
|
||||||
}
|
|
||||||
|
|
||||||
void aom_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
|
void aom_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
|
||||||
aom_fdct32x32_c(input, out, stride);
|
aom_fdct32x32_c(input, out, stride);
|
||||||
}
|
}
|
||||||
|
|
||||||
void aom_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
|
void aom_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
|
||||||
int stride) {
|
int stride) {
|
||||||
aom_fdct32x32_rd_c(input, out, stride);
|
aom_fdct32x32_rd_c(input, out, stride);
|
||||||
}
|
}
|
||||||
|
|
||||||
void aom_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *out,
|
|
||||||
int stride) {
|
|
||||||
aom_fdct32x32_1_c(input, out, stride);
|
|
||||||
}
|
|
||||||
#endif // CONFIG_HIGHBITDEPTH
|
#endif // CONFIG_HIGHBITDEPTH
|
||||||
|
|
|
@ -926,23 +926,3 @@ void aom_fdct32x32_rd_msa(const int16_t *input, int16_t *out,
|
||||||
out + (8 * i * 32));
|
out + (8 * i * 32));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void aom_fdct32x32_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
|
|
||||||
int sum = LD_HADD(input, stride);
|
|
||||||
sum += LD_HADD(input + 8, stride);
|
|
||||||
sum += LD_HADD(input + 16, stride);
|
|
||||||
sum += LD_HADD(input + 24, stride);
|
|
||||||
sum += LD_HADD(input + 32 * 8, stride);
|
|
||||||
sum += LD_HADD(input + 32 * 8 + 8, stride);
|
|
||||||
sum += LD_HADD(input + 32 * 8 + 16, stride);
|
|
||||||
sum += LD_HADD(input + 32 * 8 + 24, stride);
|
|
||||||
sum += LD_HADD(input + 32 * 16, stride);
|
|
||||||
sum += LD_HADD(input + 32 * 16 + 8, stride);
|
|
||||||
sum += LD_HADD(input + 32 * 16 + 16, stride);
|
|
||||||
sum += LD_HADD(input + 32 * 16 + 24, stride);
|
|
||||||
sum += LD_HADD(input + 32 * 24, stride);
|
|
||||||
sum += LD_HADD(input + 32 * 24 + 8, stride);
|
|
||||||
sum += LD_HADD(input + 32 * 24 + 16, stride);
|
|
||||||
sum += LD_HADD(input + 32 * 24 + 24, stride);
|
|
||||||
out[0] = (int16_t)(sum >> 3);
|
|
||||||
}
|
|
||||||
|
|
|
@ -236,11 +236,3 @@ void aom_fdct16x16_msa(const int16_t *input, int16_t *output,
|
||||||
fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i)));
|
fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void aom_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
|
|
||||||
int sum = LD_HADD(input, stride);
|
|
||||||
sum += LD_HADD(input + 8, stride);
|
|
||||||
sum += LD_HADD(input + 16 * 8, stride);
|
|
||||||
sum += LD_HADD(input + 16 * 8 + 8, stride);
|
|
||||||
out[0] = (int16_t)(sum >> 1);
|
|
||||||
}
|
|
||||||
|
|
|
@ -85,147 +85,6 @@ void aom_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
|
||||||
output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
|
output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
|
||||||
}
|
}
|
||||||
|
|
||||||
void aom_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
|
|
||||||
int stride) {
|
|
||||||
__m128i in0, in1, in2, in3;
|
|
||||||
__m128i u0, u1;
|
|
||||||
__m128i sum = _mm_setzero_si128();
|
|
||||||
int i;
|
|
||||||
|
|
||||||
for (i = 0; i < 2; ++i) {
|
|
||||||
in0 = _mm_load_si128((const __m128i *)(input + 0 * stride + 0));
|
|
||||||
in1 = _mm_load_si128((const __m128i *)(input + 0 * stride + 8));
|
|
||||||
in2 = _mm_load_si128((const __m128i *)(input + 1 * stride + 0));
|
|
||||||
in3 = _mm_load_si128((const __m128i *)(input + 1 * stride + 8));
|
|
||||||
|
|
||||||
u0 = _mm_add_epi16(in0, in1);
|
|
||||||
u1 = _mm_add_epi16(in2, in3);
|
|
||||||
sum = _mm_add_epi16(sum, u0);
|
|
||||||
|
|
||||||
in0 = _mm_load_si128((const __m128i *)(input + 2 * stride + 0));
|
|
||||||
in1 = _mm_load_si128((const __m128i *)(input + 2 * stride + 8));
|
|
||||||
in2 = _mm_load_si128((const __m128i *)(input + 3 * stride + 0));
|
|
||||||
in3 = _mm_load_si128((const __m128i *)(input + 3 * stride + 8));
|
|
||||||
|
|
||||||
sum = _mm_add_epi16(sum, u1);
|
|
||||||
u0 = _mm_add_epi16(in0, in1);
|
|
||||||
u1 = _mm_add_epi16(in2, in3);
|
|
||||||
sum = _mm_add_epi16(sum, u0);
|
|
||||||
|
|
||||||
in0 = _mm_load_si128((const __m128i *)(input + 4 * stride + 0));
|
|
||||||
in1 = _mm_load_si128((const __m128i *)(input + 4 * stride + 8));
|
|
||||||
in2 = _mm_load_si128((const __m128i *)(input + 5 * stride + 0));
|
|
||||||
in3 = _mm_load_si128((const __m128i *)(input + 5 * stride + 8));
|
|
||||||
|
|
||||||
sum = _mm_add_epi16(sum, u1);
|
|
||||||
u0 = _mm_add_epi16(in0, in1);
|
|
||||||
u1 = _mm_add_epi16(in2, in3);
|
|
||||||
sum = _mm_add_epi16(sum, u0);
|
|
||||||
|
|
||||||
in0 = _mm_load_si128((const __m128i *)(input + 6 * stride + 0));
|
|
||||||
in1 = _mm_load_si128((const __m128i *)(input + 6 * stride + 8));
|
|
||||||
in2 = _mm_load_si128((const __m128i *)(input + 7 * stride + 0));
|
|
||||||
in3 = _mm_load_si128((const __m128i *)(input + 7 * stride + 8));
|
|
||||||
|
|
||||||
sum = _mm_add_epi16(sum, u1);
|
|
||||||
u0 = _mm_add_epi16(in0, in1);
|
|
||||||
u1 = _mm_add_epi16(in2, in3);
|
|
||||||
sum = _mm_add_epi16(sum, u0);
|
|
||||||
|
|
||||||
sum = _mm_add_epi16(sum, u1);
|
|
||||||
input += 8 * stride;
|
|
||||||
}
|
|
||||||
|
|
||||||
u0 = _mm_setzero_si128();
|
|
||||||
in0 = _mm_unpacklo_epi16(u0, sum);
|
|
||||||
in1 = _mm_unpackhi_epi16(u0, sum);
|
|
||||||
in0 = _mm_srai_epi32(in0, 16);
|
|
||||||
in1 = _mm_srai_epi32(in1, 16);
|
|
||||||
|
|
||||||
sum = _mm_add_epi32(in0, in1);
|
|
||||||
in0 = _mm_unpacklo_epi32(sum, u0);
|
|
||||||
in1 = _mm_unpackhi_epi32(sum, u0);
|
|
||||||
|
|
||||||
sum = _mm_add_epi32(in0, in1);
|
|
||||||
in0 = _mm_srli_si128(sum, 8);
|
|
||||||
|
|
||||||
in1 = _mm_add_epi32(sum, in0);
|
|
||||||
in1 = _mm_srai_epi32(in1, 1);
|
|
||||||
output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
|
|
||||||
}
|
|
||||||
|
|
||||||
void aom_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
|
|
||||||
int stride) {
|
|
||||||
__m128i in0, in1, in2, in3;
|
|
||||||
__m128i u0, u1;
|
|
||||||
__m128i sum = _mm_setzero_si128();
|
|
||||||
int i;
|
|
||||||
|
|
||||||
for (i = 0; i < 8; ++i) {
|
|
||||||
in0 = _mm_load_si128((const __m128i *)(input + 0));
|
|
||||||
in1 = _mm_load_si128((const __m128i *)(input + 8));
|
|
||||||
in2 = _mm_load_si128((const __m128i *)(input + 16));
|
|
||||||
in3 = _mm_load_si128((const __m128i *)(input + 24));
|
|
||||||
|
|
||||||
input += stride;
|
|
||||||
u0 = _mm_add_epi16(in0, in1);
|
|
||||||
u1 = _mm_add_epi16(in2, in3);
|
|
||||||
sum = _mm_add_epi16(sum, u0);
|
|
||||||
|
|
||||||
in0 = _mm_load_si128((const __m128i *)(input + 0));
|
|
||||||
in1 = _mm_load_si128((const __m128i *)(input + 8));
|
|
||||||
in2 = _mm_load_si128((const __m128i *)(input + 16));
|
|
||||||
in3 = _mm_load_si128((const __m128i *)(input + 24));
|
|
||||||
|
|
||||||
input += stride;
|
|
||||||
sum = _mm_add_epi16(sum, u1);
|
|
||||||
u0 = _mm_add_epi16(in0, in1);
|
|
||||||
u1 = _mm_add_epi16(in2, in3);
|
|
||||||
sum = _mm_add_epi16(sum, u0);
|
|
||||||
|
|
||||||
in0 = _mm_load_si128((const __m128i *)(input + 0));
|
|
||||||
in1 = _mm_load_si128((const __m128i *)(input + 8));
|
|
||||||
in2 = _mm_load_si128((const __m128i *)(input + 16));
|
|
||||||
in3 = _mm_load_si128((const __m128i *)(input + 24));
|
|
||||||
|
|
||||||
input += stride;
|
|
||||||
sum = _mm_add_epi16(sum, u1);
|
|
||||||
u0 = _mm_add_epi16(in0, in1);
|
|
||||||
u1 = _mm_add_epi16(in2, in3);
|
|
||||||
sum = _mm_add_epi16(sum, u0);
|
|
||||||
|
|
||||||
in0 = _mm_load_si128((const __m128i *)(input + 0));
|
|
||||||
in1 = _mm_load_si128((const __m128i *)(input + 8));
|
|
||||||
in2 = _mm_load_si128((const __m128i *)(input + 16));
|
|
||||||
in3 = _mm_load_si128((const __m128i *)(input + 24));
|
|
||||||
|
|
||||||
input += stride;
|
|
||||||
sum = _mm_add_epi16(sum, u1);
|
|
||||||
u0 = _mm_add_epi16(in0, in1);
|
|
||||||
u1 = _mm_add_epi16(in2, in3);
|
|
||||||
sum = _mm_add_epi16(sum, u0);
|
|
||||||
|
|
||||||
sum = _mm_add_epi16(sum, u1);
|
|
||||||
}
|
|
||||||
|
|
||||||
u0 = _mm_setzero_si128();
|
|
||||||
in0 = _mm_unpacklo_epi16(u0, sum);
|
|
||||||
in1 = _mm_unpackhi_epi16(u0, sum);
|
|
||||||
in0 = _mm_srai_epi32(in0, 16);
|
|
||||||
in1 = _mm_srai_epi32(in1, 16);
|
|
||||||
|
|
||||||
sum = _mm_add_epi32(in0, in1);
|
|
||||||
in0 = _mm_unpacklo_epi32(sum, u0);
|
|
||||||
in1 = _mm_unpackhi_epi32(sum, u0);
|
|
||||||
|
|
||||||
sum = _mm_add_epi32(in0, in1);
|
|
||||||
in0 = _mm_srli_si128(sum, 8);
|
|
||||||
|
|
||||||
in1 = _mm_add_epi32(sum, in0);
|
|
||||||
in1 = _mm_srai_epi32(in1, 3);
|
|
||||||
output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
|
|
||||||
}
|
|
||||||
|
|
||||||
#define DCT_HIGH_BIT_DEPTH 0
|
#define DCT_HIGH_BIT_DEPTH 0
|
||||||
#define FDCT4x4_2D aom_fdct4x4_sse2
|
#define FDCT4x4_2D aom_fdct4x4_sse2
|
||||||
#define FDCT8x8_2D aom_fdct8x8_sse2
|
#define FDCT8x8_2D aom_fdct8x8_sse2
|
||||||
|
|
|
@ -18,51 +18,6 @@
|
||||||
#include "aom_dsp/txfm_common.h"
|
#include "aom_dsp/txfm_common.h"
|
||||||
#include "aom_dsp/x86/txfm_common_avx2.h"
|
#include "aom_dsp/x86/txfm_common_avx2.h"
|
||||||
|
|
||||||
static int32_t get_16x16_sum(const int16_t *input, int stride) {
|
|
||||||
__m256i r0, r1, r2, r3, u0, u1;
|
|
||||||
__m256i zero = _mm256_setzero_si256();
|
|
||||||
__m256i sum = _mm256_setzero_si256();
|
|
||||||
const int16_t *blockBound = input + (stride << 4);
|
|
||||||
__m128i v0, v1;
|
|
||||||
|
|
||||||
while (input < blockBound) {
|
|
||||||
r0 = _mm256_loadu_si256((__m256i const *)input);
|
|
||||||
r1 = _mm256_loadu_si256((__m256i const *)(input + stride));
|
|
||||||
r2 = _mm256_loadu_si256((__m256i const *)(input + 2 * stride));
|
|
||||||
r3 = _mm256_loadu_si256((__m256i const *)(input + 3 * stride));
|
|
||||||
|
|
||||||
u0 = _mm256_add_epi16(r0, r1);
|
|
||||||
u1 = _mm256_add_epi16(r2, r3);
|
|
||||||
sum = _mm256_add_epi16(sum, u0);
|
|
||||||
sum = _mm256_add_epi16(sum, u1);
|
|
||||||
|
|
||||||
input += stride << 2;
|
|
||||||
}
|
|
||||||
|
|
||||||
// unpack 16 int16_t into 2x8 int32_t
|
|
||||||
u0 = _mm256_unpacklo_epi16(zero, sum);
|
|
||||||
u1 = _mm256_unpackhi_epi16(zero, sum);
|
|
||||||
u0 = _mm256_srai_epi32(u0, 16);
|
|
||||||
u1 = _mm256_srai_epi32(u1, 16);
|
|
||||||
sum = _mm256_add_epi32(u0, u1);
|
|
||||||
|
|
||||||
u0 = _mm256_srli_si256(sum, 8);
|
|
||||||
u1 = _mm256_add_epi32(sum, u0);
|
|
||||||
|
|
||||||
v0 = _mm_add_epi32(_mm256_extracti128_si256(u1, 1),
|
|
||||||
_mm256_castsi256_si128(u1));
|
|
||||||
v1 = _mm_srli_si128(v0, 4);
|
|
||||||
v0 = _mm_add_epi32(v0, v1);
|
|
||||||
return (int32_t)_mm_extract_epi32(v0, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
void aom_fdct16x16_1_avx2(const int16_t *input, tran_low_t *output,
|
|
||||||
int stride) {
|
|
||||||
int32_t dc = get_16x16_sum(input, stride);
|
|
||||||
output[0] = (tran_low_t)(dc >> 1);
|
|
||||||
_mm256_zeroupper();
|
|
||||||
}
|
|
||||||
|
|
||||||
static INLINE void load_buffer_16x16(const int16_t *input, int stride,
|
static INLINE void load_buffer_16x16(const int16_t *input, int stride,
|
||||||
int flipud, int fliplr, __m256i *in) {
|
int flipud, int fliplr, __m256i *in) {
|
||||||
if (!flipud) {
|
if (!flipud) {
|
||||||
|
@ -1084,22 +1039,6 @@ void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride,
|
||||||
_mm256_zeroupper();
|
_mm256_zeroupper();
|
||||||
}
|
}
|
||||||
|
|
||||||
void aom_fdct32x32_1_avx2(const int16_t *input, tran_low_t *output,
|
|
||||||
int stride) {
|
|
||||||
// left and upper corner
|
|
||||||
int32_t sum = get_16x16_sum(input, stride);
|
|
||||||
// right and upper corner
|
|
||||||
sum += get_16x16_sum(input + 16, stride);
|
|
||||||
// left and lower corner
|
|
||||||
sum += get_16x16_sum(input + (stride << 4), stride);
|
|
||||||
// right and lower corner
|
|
||||||
sum += get_16x16_sum(input + (stride << 4) + 16, stride);
|
|
||||||
|
|
||||||
sum >>= 3;
|
|
||||||
output[0] = (tran_low_t)sum;
|
|
||||||
_mm256_zeroupper();
|
|
||||||
}
|
|
||||||
|
|
||||||
static void mm256_vectors_swap(__m256i *a0, __m256i *a1, const int size) {
|
static void mm256_vectors_swap(__m256i *a0, __m256i *a1, const int size) {
|
||||||
int i = 0;
|
int i = 0;
|
||||||
__m256i temp;
|
__m256i temp;
|
||||||
|
|
|
@ -796,11 +796,6 @@ INSTANTIATE_TEST_CASE_P(
|
||||||
make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, 1, AOM_BITS_8),
|
make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, 1, AOM_BITS_8),
|
||||||
make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, 2, AOM_BITS_8),
|
make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, 2, AOM_BITS_8),
|
||||||
make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, 3, AOM_BITS_8)));
|
make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, 3, AOM_BITS_8)));
|
||||||
INSTANTIATE_TEST_CASE_P(
|
|
||||||
C, PartialTrans16x16Test,
|
|
||||||
::testing::Values(make_tuple(&aom_highbd_fdct16x16_1_c, AOM_BITS_8),
|
|
||||||
make_tuple(&aom_highbd_fdct16x16_1_c, AOM_BITS_10),
|
|
||||||
make_tuple(&aom_highbd_fdct16x16_1_c, AOM_BITS_12)));
|
|
||||||
#else
|
#else
|
||||||
INSTANTIATE_TEST_CASE_P(
|
INSTANTIATE_TEST_CASE_P(
|
||||||
C, Trans16x16HT,
|
C, Trans16x16HT,
|
||||||
|
@ -809,9 +804,6 @@ INSTANTIATE_TEST_CASE_P(
|
||||||
make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, 1, AOM_BITS_8),
|
make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, 1, AOM_BITS_8),
|
||||||
make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, 2, AOM_BITS_8),
|
make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, 2, AOM_BITS_8),
|
||||||
make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, 3, AOM_BITS_8)));
|
make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, 3, AOM_BITS_8)));
|
||||||
INSTANTIATE_TEST_CASE_P(C, PartialTrans16x16Test,
|
|
||||||
::testing::Values(make_tuple(&aom_fdct16x16_1_c,
|
|
||||||
AOM_BITS_8)));
|
|
||||||
#endif // CONFIG_HIGHBITDEPTH
|
#endif // CONFIG_HIGHBITDEPTH
|
||||||
|
|
||||||
#if HAVE_NEON_ASM && !CONFIG_HIGHBITDEPTH
|
#if HAVE_NEON_ASM && !CONFIG_HIGHBITDEPTH
|
||||||
|
@ -836,17 +828,8 @@ INSTANTIATE_TEST_CASE_P(
|
||||||
2, AOM_BITS_8),
|
2, AOM_BITS_8),
|
||||||
make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2,
|
make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2,
|
||||||
3, AOM_BITS_8)));
|
3, AOM_BITS_8)));
|
||||||
INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans16x16Test,
|
|
||||||
::testing::Values(make_tuple(&aom_fdct16x16_1_sse2,
|
|
||||||
AOM_BITS_8)));
|
|
||||||
#endif // HAVE_SSE2 && !CONFIG_HIGHBITDEPTH
|
#endif // HAVE_SSE2 && !CONFIG_HIGHBITDEPTH
|
||||||
|
|
||||||
#if HAVE_AVX2 && !CONFIG_HIGHBITDEPTH
|
|
||||||
INSTANTIATE_TEST_CASE_P(AVX2, PartialTrans16x16Test,
|
|
||||||
::testing::Values(make_tuple(&aom_fdct16x16_1_avx2,
|
|
||||||
AOM_BITS_8)));
|
|
||||||
#endif // HAVE_AVX2 && !CONFIG_HIGHBITDEPTH
|
|
||||||
|
|
||||||
#if HAVE_SSE2 && CONFIG_HIGHBITDEPTH
|
#if HAVE_SSE2 && CONFIG_HIGHBITDEPTH
|
||||||
INSTANTIATE_TEST_CASE_P(SSE2, Trans16x16DCT,
|
INSTANTIATE_TEST_CASE_P(SSE2, Trans16x16DCT,
|
||||||
::testing::Values(make_tuple(&aom_fdct16x16_sse2,
|
::testing::Values(make_tuple(&aom_fdct16x16_sse2,
|
||||||
|
@ -860,14 +843,6 @@ INSTANTIATE_TEST_CASE_P(
|
||||||
make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_c, 2, AOM_BITS_8),
|
make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_c, 2, AOM_BITS_8),
|
||||||
make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_c, 3,
|
make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_c, 3,
|
||||||
AOM_BITS_8)));
|
AOM_BITS_8)));
|
||||||
// TODO(luoyi):
|
|
||||||
// For this test case, we should test function: aom_highbd_fdct16x16_1_sse2.
|
|
||||||
// However this function is not available yet. If we mistakenly test
|
|
||||||
// aom_fdct16x16_1_sse2, it could only pass AOM_BITS_8/AOM_BITS_10 but not
|
|
||||||
// AOM_BITS_12.
|
|
||||||
INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans16x16Test,
|
|
||||||
::testing::Values(make_tuple(&aom_fdct16x16_1_sse2,
|
|
||||||
AOM_BITS_8)));
|
|
||||||
#endif // HAVE_SSE2 && CONFIG_HIGHBITDEPTH
|
#endif // HAVE_SSE2 && CONFIG_HIGHBITDEPTH
|
||||||
|
|
||||||
#if HAVE_MSA && !CONFIG_HIGHBITDEPTH
|
#if HAVE_MSA && !CONFIG_HIGHBITDEPTH
|
||||||
|
@ -886,8 +861,5 @@ INSTANTIATE_TEST_CASE_P(
|
||||||
make_tuple(&av1_fht16x16_msa, &av1_iht16x16_256_add_msa, 3,
|
make_tuple(&av1_fht16x16_msa, &av1_iht16x16_256_add_msa, 3,
|
||||||
AOM_BITS_8)));
|
AOM_BITS_8)));
|
||||||
#endif // !CONFIG_EXT_TX
|
#endif // !CONFIG_EXT_TX
|
||||||
INSTANTIATE_TEST_CASE_P(MSA, PartialTrans16x16Test,
|
|
||||||
::testing::Values(make_tuple(&aom_fdct16x16_1_msa,
|
|
||||||
AOM_BITS_8)));
|
|
||||||
#endif // HAVE_MSA && !CONFIG_HIGHBITDEPTH
|
#endif // HAVE_MSA && !CONFIG_HIGHBITDEPTH
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
|
@ -350,11 +350,6 @@ INSTANTIATE_TEST_CASE_P(
|
||||||
AOM_BITS_8),
|
AOM_BITS_8),
|
||||||
make_tuple(&aom_fdct32x32_rd_c, &aom_idct32x32_1024_add_c,
|
make_tuple(&aom_fdct32x32_rd_c, &aom_idct32x32_1024_add_c,
|
||||||
1, AOM_BITS_8)));
|
1, AOM_BITS_8)));
|
||||||
INSTANTIATE_TEST_CASE_P(
|
|
||||||
C, PartialTrans32x32Test,
|
|
||||||
::testing::Values(make_tuple(&aom_highbd_fdct32x32_1_c, AOM_BITS_8),
|
|
||||||
make_tuple(&aom_highbd_fdct32x32_1_c, AOM_BITS_10),
|
|
||||||
make_tuple(&aom_highbd_fdct32x32_1_c, AOM_BITS_12)));
|
|
||||||
#else
|
#else
|
||||||
INSTANTIATE_TEST_CASE_P(
|
INSTANTIATE_TEST_CASE_P(
|
||||||
C, Trans32x32Test,
|
C, Trans32x32Test,
|
||||||
|
@ -362,9 +357,6 @@ INSTANTIATE_TEST_CASE_P(
|
||||||
AOM_BITS_8),
|
AOM_BITS_8),
|
||||||
make_tuple(&aom_fdct32x32_rd_c, &aom_idct32x32_1024_add_c,
|
make_tuple(&aom_fdct32x32_rd_c, &aom_idct32x32_1024_add_c,
|
||||||
1, AOM_BITS_8)));
|
1, AOM_BITS_8)));
|
||||||
INSTANTIATE_TEST_CASE_P(C, PartialTrans32x32Test,
|
|
||||||
::testing::Values(make_tuple(&aom_fdct32x32_1_c,
|
|
||||||
AOM_BITS_8)));
|
|
||||||
#endif // CONFIG_HIGHBITDEPTH
|
#endif // CONFIG_HIGHBITDEPTH
|
||||||
|
|
||||||
#if HAVE_NEON && !CONFIG_HIGHBITDEPTH
|
#if HAVE_NEON && !CONFIG_HIGHBITDEPTH
|
||||||
|
@ -383,17 +375,8 @@ INSTANTIATE_TEST_CASE_P(
|
||||||
&aom_idct32x32_1024_add_sse2, 0, AOM_BITS_8),
|
&aom_idct32x32_1024_add_sse2, 0, AOM_BITS_8),
|
||||||
make_tuple(&aom_fdct32x32_rd_sse2,
|
make_tuple(&aom_fdct32x32_rd_sse2,
|
||||||
&aom_idct32x32_1024_add_sse2, 1, AOM_BITS_8)));
|
&aom_idct32x32_1024_add_sse2, 1, AOM_BITS_8)));
|
||||||
INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans32x32Test,
|
|
||||||
::testing::Values(make_tuple(&aom_fdct32x32_1_sse2,
|
|
||||||
AOM_BITS_8)));
|
|
||||||
#endif // HAVE_SSE2 && !CONFIG_HIGHBITDEPTH
|
#endif // HAVE_SSE2 && !CONFIG_HIGHBITDEPTH
|
||||||
|
|
||||||
#if HAVE_AVX2 && !CONFIG_HIGHBITDEPTH
|
|
||||||
INSTANTIATE_TEST_CASE_P(AVX2, PartialTrans32x32Test,
|
|
||||||
::testing::Values(make_tuple(&aom_fdct32x32_1_avx2,
|
|
||||||
AOM_BITS_8)));
|
|
||||||
#endif // HAVE_AVX2 && !CONFIG_HIGHBITDEPTH
|
|
||||||
|
|
||||||
#if HAVE_SSE2 && CONFIG_HIGHBITDEPTH
|
#if HAVE_SSE2 && CONFIG_HIGHBITDEPTH
|
||||||
INSTANTIATE_TEST_CASE_P(
|
INSTANTIATE_TEST_CASE_P(
|
||||||
SSE2, Trans32x32Test,
|
SSE2, Trans32x32Test,
|
||||||
|
@ -401,9 +384,6 @@ INSTANTIATE_TEST_CASE_P(
|
||||||
0, AOM_BITS_8),
|
0, AOM_BITS_8),
|
||||||
make_tuple(&aom_fdct32x32_rd_sse2,
|
make_tuple(&aom_fdct32x32_rd_sse2,
|
||||||
&aom_idct32x32_1024_add_c, 1, AOM_BITS_8)));
|
&aom_idct32x32_1024_add_c, 1, AOM_BITS_8)));
|
||||||
INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans32x32Test,
|
|
||||||
::testing::Values(make_tuple(&aom_fdct32x32_1_sse2,
|
|
||||||
AOM_BITS_8)));
|
|
||||||
#endif // HAVE_SSE2 && CONFIG_HIGHBITDEPTH
|
#endif // HAVE_SSE2 && CONFIG_HIGHBITDEPTH
|
||||||
|
|
||||||
#if HAVE_AVX2 && !CONFIG_HIGHBITDEPTH
|
#if HAVE_AVX2 && !CONFIG_HIGHBITDEPTH
|
||||||
|
@ -431,8 +411,5 @@ INSTANTIATE_TEST_CASE_P(
|
||||||
&aom_idct32x32_1024_add_msa, 0, AOM_BITS_8),
|
&aom_idct32x32_1024_add_msa, 0, AOM_BITS_8),
|
||||||
make_tuple(&aom_fdct32x32_rd_msa,
|
make_tuple(&aom_fdct32x32_rd_msa,
|
||||||
&aom_idct32x32_1024_add_msa, 1, AOM_BITS_8)));
|
&aom_idct32x32_1024_add_msa, 1, AOM_BITS_8)));
|
||||||
INSTANTIATE_TEST_CASE_P(MSA, PartialTrans32x32Test,
|
|
||||||
::testing::Values(make_tuple(&aom_fdct32x32_1_msa,
|
|
||||||
AOM_BITS_8)));
|
|
||||||
#endif // HAVE_MSA && !CONFIG_HIGHBITDEPTH
|
#endif // HAVE_MSA && !CONFIG_HIGHBITDEPTH
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
Загрузка…
Ссылка в новой задаче