зеркало из https://github.com/mozilla/gecko-dev.git
1039 строки
38 KiB
C
1039 строки
38 KiB
C
/*
|
|
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
|
|
*
|
|
* This source code is subject to the terms of the BSD 2 Clause License and
|
|
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
|
|
* was not distributed with this source code in the LICENSE file, you can
|
|
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
|
|
* Media Patent License 1.0 was not distributed with this source code in the
|
|
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
|
|
*/
|
|
|
|
#include <immintrin.h>
|
|
|
|
#include "config/aom_config.h"
|
|
#include "config/aom_dsp_rtcd.h"
|
|
|
|
#include "aom/aom_integer.h"
|
|
#include "aom_dsp/x86/synonyms_avx2.h"
|
|
#include "aom_ports/mem.h"
|
|
|
|
// SAD
|
|
static INLINE unsigned int get_sad_from_mm256_epi32(const __m256i *v) {
|
|
// input 8 32-bit summation
|
|
__m128i lo128, hi128;
|
|
__m256i u = _mm256_srli_si256(*v, 8);
|
|
u = _mm256_add_epi32(u, *v);
|
|
|
|
// 4 32-bit summation
|
|
hi128 = _mm256_extracti128_si256(u, 1);
|
|
lo128 = _mm256_castsi256_si128(u);
|
|
lo128 = _mm_add_epi32(hi128, lo128);
|
|
|
|
// 2 32-bit summation
|
|
hi128 = _mm_srli_si128(lo128, 4);
|
|
lo128 = _mm_add_epi32(lo128, hi128);
|
|
|
|
return (unsigned int)_mm_cvtsi128_si32(lo128);
|
|
}
|
|
|
|
unsigned int aom_highbd_sad16x8_avx2(const uint8_t *src, int src_stride,
|
|
const uint8_t *ref, int ref_stride) {
|
|
const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);
|
|
const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
|
|
|
|
// first 4 rows
|
|
__m256i s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
|
|
__m256i s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
|
|
__m256i s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
|
|
__m256i s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
|
|
|
|
__m256i r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
|
|
__m256i r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
|
|
__m256i r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
|
|
__m256i r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
|
|
|
|
__m256i u0 = _mm256_sub_epi16(s0, r0);
|
|
__m256i u1 = _mm256_sub_epi16(s1, r1);
|
|
__m256i u2 = _mm256_sub_epi16(s2, r2);
|
|
__m256i u3 = _mm256_sub_epi16(s3, r3);
|
|
__m256i zero = _mm256_setzero_si256();
|
|
__m256i sum0, sum1;
|
|
|
|
u0 = _mm256_abs_epi16(u0);
|
|
u1 = _mm256_abs_epi16(u1);
|
|
u2 = _mm256_abs_epi16(u2);
|
|
u3 = _mm256_abs_epi16(u3);
|
|
|
|
sum0 = _mm256_add_epi16(u0, u1);
|
|
sum0 = _mm256_add_epi16(sum0, u2);
|
|
sum0 = _mm256_add_epi16(sum0, u3);
|
|
|
|
// second 4 rows
|
|
src_ptr += src_stride << 2;
|
|
ref_ptr += ref_stride << 2;
|
|
s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
|
|
s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
|
|
s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
|
|
s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
|
|
|
|
r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
|
|
r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
|
|
r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
|
|
r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
|
|
|
|
u0 = _mm256_sub_epi16(s0, r0);
|
|
u1 = _mm256_sub_epi16(s1, r1);
|
|
u2 = _mm256_sub_epi16(s2, r2);
|
|
u3 = _mm256_sub_epi16(s3, r3);
|
|
|
|
u0 = _mm256_abs_epi16(u0);
|
|
u1 = _mm256_abs_epi16(u1);
|
|
u2 = _mm256_abs_epi16(u2);
|
|
u3 = _mm256_abs_epi16(u3);
|
|
|
|
sum1 = _mm256_add_epi16(u0, u1);
|
|
sum1 = _mm256_add_epi16(sum1, u2);
|
|
sum1 = _mm256_add_epi16(sum1, u3);
|
|
|
|
// find out the SAD
|
|
s0 = _mm256_unpacklo_epi16(sum0, zero);
|
|
s1 = _mm256_unpackhi_epi16(sum0, zero);
|
|
r0 = _mm256_unpacklo_epi16(sum1, zero);
|
|
r1 = _mm256_unpackhi_epi16(sum1, zero);
|
|
s0 = _mm256_add_epi32(s0, s1);
|
|
r0 = _mm256_add_epi32(r0, r1);
|
|
sum0 = _mm256_add_epi32(s0, r0);
|
|
// 8 32-bit summation
|
|
|
|
return (unsigned int)get_sad_from_mm256_epi32(&sum0);
|
|
}
|
|
|
|
unsigned int aom_highbd_sad16x16_avx2(const uint8_t *src, int src_stride,
|
|
const uint8_t *ref, int ref_stride) {
|
|
const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);
|
|
const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
|
|
__m256i s0, s1, s2, s3, r0, r1, r2, r3, u0, u1, u2, u3;
|
|
__m256i sum0;
|
|
__m256i sum = _mm256_setzero_si256();
|
|
const __m256i zero = _mm256_setzero_si256();
|
|
int row = 0;
|
|
|
|
// Loop for every 4 rows
|
|
while (row < 16) {
|
|
s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
|
|
s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
|
|
s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
|
|
s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
|
|
|
|
r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
|
|
r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
|
|
r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
|
|
r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
|
|
|
|
u0 = _mm256_sub_epi16(s0, r0);
|
|
u1 = _mm256_sub_epi16(s1, r1);
|
|
u2 = _mm256_sub_epi16(s2, r2);
|
|
u3 = _mm256_sub_epi16(s3, r3);
|
|
|
|
u0 = _mm256_abs_epi16(u0);
|
|
u1 = _mm256_abs_epi16(u1);
|
|
u2 = _mm256_abs_epi16(u2);
|
|
u3 = _mm256_abs_epi16(u3);
|
|
|
|
sum0 = _mm256_add_epi16(u0, u1);
|
|
sum0 = _mm256_add_epi16(sum0, u2);
|
|
sum0 = _mm256_add_epi16(sum0, u3);
|
|
|
|
s0 = _mm256_unpacklo_epi16(sum0, zero);
|
|
s1 = _mm256_unpackhi_epi16(sum0, zero);
|
|
sum = _mm256_add_epi32(sum, s0);
|
|
sum = _mm256_add_epi32(sum, s1);
|
|
// 8 32-bit summation
|
|
|
|
row += 4;
|
|
src_ptr += src_stride << 2;
|
|
ref_ptr += ref_stride << 2;
|
|
}
|
|
return get_sad_from_mm256_epi32(&sum);
|
|
}
|
|
|
|
static void sad32x4(const uint16_t *src_ptr, int src_stride,
|
|
const uint16_t *ref_ptr, int ref_stride,
|
|
const uint16_t *sec_ptr, __m256i *sad_acc) {
|
|
__m256i s0, s1, s2, s3, r0, r1, r2, r3;
|
|
const __m256i zero = _mm256_setzero_si256();
|
|
int row_sections = 0;
|
|
|
|
while (row_sections < 2) {
|
|
s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
|
|
s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
|
|
s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
|
|
s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16));
|
|
|
|
r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
|
|
r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
|
|
r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
|
|
r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16));
|
|
|
|
if (sec_ptr) {
|
|
r0 = _mm256_avg_epu16(r0, _mm256_loadu_si256((const __m256i *)sec_ptr));
|
|
r1 = _mm256_avg_epu16(
|
|
r1, _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
|
|
r2 = _mm256_avg_epu16(
|
|
r2, _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
|
|
r3 = _mm256_avg_epu16(
|
|
r3, _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
|
|
}
|
|
s0 = _mm256_sub_epi16(s0, r0);
|
|
s1 = _mm256_sub_epi16(s1, r1);
|
|
s2 = _mm256_sub_epi16(s2, r2);
|
|
s3 = _mm256_sub_epi16(s3, r3);
|
|
|
|
s0 = _mm256_abs_epi16(s0);
|
|
s1 = _mm256_abs_epi16(s1);
|
|
s2 = _mm256_abs_epi16(s2);
|
|
s3 = _mm256_abs_epi16(s3);
|
|
|
|
s0 = _mm256_add_epi16(s0, s1);
|
|
s0 = _mm256_add_epi16(s0, s2);
|
|
s0 = _mm256_add_epi16(s0, s3);
|
|
|
|
r0 = _mm256_unpacklo_epi16(s0, zero);
|
|
r1 = _mm256_unpackhi_epi16(s0, zero);
|
|
|
|
r0 = _mm256_add_epi32(r0, r1);
|
|
*sad_acc = _mm256_add_epi32(*sad_acc, r0);
|
|
|
|
row_sections += 1;
|
|
src_ptr += src_stride << 1;
|
|
ref_ptr += ref_stride << 1;
|
|
if (sec_ptr) sec_ptr += 32 << 1;
|
|
}
|
|
}
|
|
|
|
unsigned int aom_highbd_sad32x16_avx2(const uint8_t *src, int src_stride,
|
|
const uint8_t *ref, int ref_stride) {
|
|
__m256i sad = _mm256_setzero_si256();
|
|
uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
|
|
uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
|
|
const int left_shift = 2;
|
|
int row_section = 0;
|
|
|
|
while (row_section < 4) {
|
|
sad32x4(srcp, src_stride, refp, ref_stride, NULL, &sad);
|
|
srcp += src_stride << left_shift;
|
|
refp += ref_stride << left_shift;
|
|
row_section += 1;
|
|
}
|
|
return get_sad_from_mm256_epi32(&sad);
|
|
}
|
|
|
|
unsigned int aom_highbd_sad16x32_avx2(const uint8_t *src, int src_stride,
|
|
const uint8_t *ref, int ref_stride) {
|
|
uint32_t sum = aom_highbd_sad16x16_avx2(src, src_stride, ref, ref_stride);
|
|
src += src_stride << 4;
|
|
ref += ref_stride << 4;
|
|
sum += aom_highbd_sad16x16_avx2(src, src_stride, ref, ref_stride);
|
|
return sum;
|
|
}
|
|
|
|
unsigned int aom_highbd_sad32x32_avx2(const uint8_t *src, int src_stride,
|
|
const uint8_t *ref, int ref_stride) {
|
|
uint32_t sum = aom_highbd_sad32x16_avx2(src, src_stride, ref, ref_stride);
|
|
src += src_stride << 4;
|
|
ref += ref_stride << 4;
|
|
sum += aom_highbd_sad32x16_avx2(src, src_stride, ref, ref_stride);
|
|
return sum;
|
|
}
|
|
|
|
unsigned int aom_highbd_sad32x64_avx2(const uint8_t *src, int src_stride,
|
|
const uint8_t *ref, int ref_stride) {
|
|
uint32_t sum = aom_highbd_sad32x32_avx2(src, src_stride, ref, ref_stride);
|
|
src += src_stride << 5;
|
|
ref += ref_stride << 5;
|
|
sum += aom_highbd_sad32x32_avx2(src, src_stride, ref, ref_stride);
|
|
return sum;
|
|
}
|
|
|
|
static void sad64x2(const uint16_t *src_ptr, int src_stride,
|
|
const uint16_t *ref_ptr, int ref_stride,
|
|
const uint16_t *sec_ptr, __m256i *sad_acc) {
|
|
__m256i s[8], r[8];
|
|
const __m256i zero = _mm256_setzero_si256();
|
|
|
|
s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
|
|
s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
|
|
s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32));
|
|
s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48));
|
|
s[4] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
|
|
s[5] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16));
|
|
s[6] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 32));
|
|
s[7] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 48));
|
|
|
|
r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
|
|
r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
|
|
r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32));
|
|
r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48));
|
|
r[4] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
|
|
r[5] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16));
|
|
r[6] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 32));
|
|
r[7] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 48));
|
|
|
|
if (sec_ptr) {
|
|
r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
|
|
r[1] = _mm256_avg_epu16(
|
|
r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
|
|
r[2] = _mm256_avg_epu16(
|
|
r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
|
|
r[3] = _mm256_avg_epu16(
|
|
r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
|
|
r[4] = _mm256_avg_epu16(
|
|
r[4], _mm256_loadu_si256((const __m256i *)(sec_ptr + 64)));
|
|
r[5] = _mm256_avg_epu16(
|
|
r[5], _mm256_loadu_si256((const __m256i *)(sec_ptr + 80)));
|
|
r[6] = _mm256_avg_epu16(
|
|
r[6], _mm256_loadu_si256((const __m256i *)(sec_ptr + 96)));
|
|
r[7] = _mm256_avg_epu16(
|
|
r[7], _mm256_loadu_si256((const __m256i *)(sec_ptr + 112)));
|
|
}
|
|
|
|
s[0] = _mm256_sub_epi16(s[0], r[0]);
|
|
s[1] = _mm256_sub_epi16(s[1], r[1]);
|
|
s[2] = _mm256_sub_epi16(s[2], r[2]);
|
|
s[3] = _mm256_sub_epi16(s[3], r[3]);
|
|
s[4] = _mm256_sub_epi16(s[4], r[4]);
|
|
s[5] = _mm256_sub_epi16(s[5], r[5]);
|
|
s[6] = _mm256_sub_epi16(s[6], r[6]);
|
|
s[7] = _mm256_sub_epi16(s[7], r[7]);
|
|
|
|
s[0] = _mm256_abs_epi16(s[0]);
|
|
s[1] = _mm256_abs_epi16(s[1]);
|
|
s[2] = _mm256_abs_epi16(s[2]);
|
|
s[3] = _mm256_abs_epi16(s[3]);
|
|
s[4] = _mm256_abs_epi16(s[4]);
|
|
s[5] = _mm256_abs_epi16(s[5]);
|
|
s[6] = _mm256_abs_epi16(s[6]);
|
|
s[7] = _mm256_abs_epi16(s[7]);
|
|
|
|
s[0] = _mm256_add_epi16(s[0], s[1]);
|
|
s[0] = _mm256_add_epi16(s[0], s[2]);
|
|
s[0] = _mm256_add_epi16(s[0], s[3]);
|
|
|
|
s[4] = _mm256_add_epi16(s[4], s[5]);
|
|
s[4] = _mm256_add_epi16(s[4], s[6]);
|
|
s[4] = _mm256_add_epi16(s[4], s[7]);
|
|
|
|
r[0] = _mm256_unpacklo_epi16(s[0], zero);
|
|
r[1] = _mm256_unpackhi_epi16(s[0], zero);
|
|
r[2] = _mm256_unpacklo_epi16(s[4], zero);
|
|
r[3] = _mm256_unpackhi_epi16(s[4], zero);
|
|
|
|
r[0] = _mm256_add_epi32(r[0], r[1]);
|
|
r[0] = _mm256_add_epi32(r[0], r[2]);
|
|
r[0] = _mm256_add_epi32(r[0], r[3]);
|
|
*sad_acc = _mm256_add_epi32(*sad_acc, r[0]);
|
|
}
|
|
|
|
unsigned int aom_highbd_sad64x32_avx2(const uint8_t *src, int src_stride,
|
|
const uint8_t *ref, int ref_stride) {
|
|
__m256i sad = _mm256_setzero_si256();
|
|
uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
|
|
uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
|
|
const int left_shift = 1;
|
|
int row_section = 0;
|
|
|
|
while (row_section < 16) {
|
|
sad64x2(srcp, src_stride, refp, ref_stride, NULL, &sad);
|
|
srcp += src_stride << left_shift;
|
|
refp += ref_stride << left_shift;
|
|
row_section += 1;
|
|
}
|
|
return get_sad_from_mm256_epi32(&sad);
|
|
}
|
|
|
|
unsigned int aom_highbd_sad64x64_avx2(const uint8_t *src, int src_stride,
|
|
const uint8_t *ref, int ref_stride) {
|
|
uint32_t sum = aom_highbd_sad64x32_avx2(src, src_stride, ref, ref_stride);
|
|
src += src_stride << 5;
|
|
ref += ref_stride << 5;
|
|
sum += aom_highbd_sad64x32_avx2(src, src_stride, ref, ref_stride);
|
|
return sum;
|
|
}
|
|
|
|
static void sad128x1(const uint16_t *src_ptr, const uint16_t *ref_ptr,
|
|
const uint16_t *sec_ptr, __m256i *sad_acc) {
|
|
__m256i s[8], r[8];
|
|
const __m256i zero = _mm256_setzero_si256();
|
|
|
|
s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
|
|
s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
|
|
s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32));
|
|
s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48));
|
|
s[4] = _mm256_loadu_si256((const __m256i *)(src_ptr + 64));
|
|
s[5] = _mm256_loadu_si256((const __m256i *)(src_ptr + 80));
|
|
s[6] = _mm256_loadu_si256((const __m256i *)(src_ptr + 96));
|
|
s[7] = _mm256_loadu_si256((const __m256i *)(src_ptr + 112));
|
|
|
|
r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
|
|
r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
|
|
r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32));
|
|
r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48));
|
|
r[4] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 64));
|
|
r[5] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 80));
|
|
r[6] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 96));
|
|
r[7] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 112));
|
|
|
|
if (sec_ptr) {
|
|
r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
|
|
r[1] = _mm256_avg_epu16(
|
|
r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
|
|
r[2] = _mm256_avg_epu16(
|
|
r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
|
|
r[3] = _mm256_avg_epu16(
|
|
r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
|
|
r[4] = _mm256_avg_epu16(
|
|
r[4], _mm256_loadu_si256((const __m256i *)(sec_ptr + 64)));
|
|
r[5] = _mm256_avg_epu16(
|
|
r[5], _mm256_loadu_si256((const __m256i *)(sec_ptr + 80)));
|
|
r[6] = _mm256_avg_epu16(
|
|
r[6], _mm256_loadu_si256((const __m256i *)(sec_ptr + 96)));
|
|
r[7] = _mm256_avg_epu16(
|
|
r[7], _mm256_loadu_si256((const __m256i *)(sec_ptr + 112)));
|
|
}
|
|
|
|
s[0] = _mm256_sub_epi16(s[0], r[0]);
|
|
s[1] = _mm256_sub_epi16(s[1], r[1]);
|
|
s[2] = _mm256_sub_epi16(s[2], r[2]);
|
|
s[3] = _mm256_sub_epi16(s[3], r[3]);
|
|
s[4] = _mm256_sub_epi16(s[4], r[4]);
|
|
s[5] = _mm256_sub_epi16(s[5], r[5]);
|
|
s[6] = _mm256_sub_epi16(s[6], r[6]);
|
|
s[7] = _mm256_sub_epi16(s[7], r[7]);
|
|
|
|
s[0] = _mm256_abs_epi16(s[0]);
|
|
s[1] = _mm256_abs_epi16(s[1]);
|
|
s[2] = _mm256_abs_epi16(s[2]);
|
|
s[3] = _mm256_abs_epi16(s[3]);
|
|
s[4] = _mm256_abs_epi16(s[4]);
|
|
s[5] = _mm256_abs_epi16(s[5]);
|
|
s[6] = _mm256_abs_epi16(s[6]);
|
|
s[7] = _mm256_abs_epi16(s[7]);
|
|
|
|
s[0] = _mm256_add_epi16(s[0], s[1]);
|
|
s[0] = _mm256_add_epi16(s[0], s[2]);
|
|
s[0] = _mm256_add_epi16(s[0], s[3]);
|
|
|
|
s[4] = _mm256_add_epi16(s[4], s[5]);
|
|
s[4] = _mm256_add_epi16(s[4], s[6]);
|
|
s[4] = _mm256_add_epi16(s[4], s[7]);
|
|
|
|
r[0] = _mm256_unpacklo_epi16(s[0], zero);
|
|
r[1] = _mm256_unpackhi_epi16(s[0], zero);
|
|
r[2] = _mm256_unpacklo_epi16(s[4], zero);
|
|
r[3] = _mm256_unpackhi_epi16(s[4], zero);
|
|
|
|
r[0] = _mm256_add_epi32(r[0], r[1]);
|
|
r[0] = _mm256_add_epi32(r[0], r[2]);
|
|
r[0] = _mm256_add_epi32(r[0], r[3]);
|
|
*sad_acc = _mm256_add_epi32(*sad_acc, r[0]);
|
|
}
|
|
|
|
unsigned int aom_highbd_sad128x64_avx2(const uint8_t *src, int src_stride,
|
|
const uint8_t *ref, int ref_stride) {
|
|
__m256i sad = _mm256_setzero_si256();
|
|
uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
|
|
uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
|
|
int row = 0;
|
|
while (row < 64) {
|
|
sad128x1(srcp, refp, NULL, &sad);
|
|
srcp += src_stride;
|
|
refp += ref_stride;
|
|
row += 1;
|
|
}
|
|
return get_sad_from_mm256_epi32(&sad);
|
|
}
|
|
|
|
unsigned int aom_highbd_sad64x128_avx2(const uint8_t *src, int src_stride,
|
|
const uint8_t *ref, int ref_stride) {
|
|
uint32_t sum = aom_highbd_sad64x64_avx2(src, src_stride, ref, ref_stride);
|
|
src += src_stride << 6;
|
|
ref += ref_stride << 6;
|
|
sum += aom_highbd_sad64x64_avx2(src, src_stride, ref, ref_stride);
|
|
return sum;
|
|
}
|
|
|
|
unsigned int aom_highbd_sad128x128_avx2(const uint8_t *src, int src_stride,
|
|
const uint8_t *ref, int ref_stride) {
|
|
uint32_t sum = aom_highbd_sad128x64_avx2(src, src_stride, ref, ref_stride);
|
|
src += src_stride << 6;
|
|
ref += ref_stride << 6;
|
|
sum += aom_highbd_sad128x64_avx2(src, src_stride, ref, ref_stride);
|
|
return sum;
|
|
}
|
|
|
|
// If sec_ptr = 0, calculate regular SAD. Otherwise, calculate average SAD.
|
|
static INLINE void sad16x4(const uint16_t *src_ptr, int src_stride,
|
|
const uint16_t *ref_ptr, int ref_stride,
|
|
const uint16_t *sec_ptr, __m256i *sad_acc) {
|
|
__m256i s0, s1, s2, s3, r0, r1, r2, r3;
|
|
const __m256i zero = _mm256_setzero_si256();
|
|
|
|
s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
|
|
s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
|
|
s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
|
|
s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
|
|
|
|
r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
|
|
r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
|
|
r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
|
|
r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
|
|
|
|
if (sec_ptr) {
|
|
r0 = _mm256_avg_epu16(r0, _mm256_loadu_si256((const __m256i *)sec_ptr));
|
|
r1 = _mm256_avg_epu16(r1,
|
|
_mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
|
|
r2 = _mm256_avg_epu16(r2,
|
|
_mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
|
|
r3 = _mm256_avg_epu16(r3,
|
|
_mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
|
|
}
|
|
|
|
s0 = _mm256_sub_epi16(s0, r0);
|
|
s1 = _mm256_sub_epi16(s1, r1);
|
|
s2 = _mm256_sub_epi16(s2, r2);
|
|
s3 = _mm256_sub_epi16(s3, r3);
|
|
|
|
s0 = _mm256_abs_epi16(s0);
|
|
s1 = _mm256_abs_epi16(s1);
|
|
s2 = _mm256_abs_epi16(s2);
|
|
s3 = _mm256_abs_epi16(s3);
|
|
|
|
s0 = _mm256_add_epi16(s0, s1);
|
|
s0 = _mm256_add_epi16(s0, s2);
|
|
s0 = _mm256_add_epi16(s0, s3);
|
|
|
|
r0 = _mm256_unpacklo_epi16(s0, zero);
|
|
r1 = _mm256_unpackhi_epi16(s0, zero);
|
|
|
|
r0 = _mm256_add_epi32(r0, r1);
|
|
*sad_acc = _mm256_add_epi32(*sad_acc, r0);
|
|
}
|
|
|
|
unsigned int aom_highbd_sad16x8_avg_avx2(const uint8_t *src, int src_stride,
|
|
const uint8_t *ref, int ref_stride,
|
|
const uint8_t *second_pred) {
|
|
__m256i sad = _mm256_setzero_si256();
|
|
uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
|
|
uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
|
|
uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
|
|
|
|
sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad);
|
|
|
|
// Next 4 rows
|
|
srcp += src_stride << 2;
|
|
refp += ref_stride << 2;
|
|
secp += 64;
|
|
sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad);
|
|
return get_sad_from_mm256_epi32(&sad);
|
|
}
|
|
|
|
unsigned int aom_highbd_sad16x16_avg_avx2(const uint8_t *src, int src_stride,
|
|
const uint8_t *ref, int ref_stride,
|
|
const uint8_t *second_pred) {
|
|
const int left_shift = 3;
|
|
uint32_t sum = aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride,
|
|
second_pred);
|
|
src += src_stride << left_shift;
|
|
ref += ref_stride << left_shift;
|
|
second_pred += 16 << left_shift;
|
|
sum += aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride,
|
|
second_pred);
|
|
return sum;
|
|
}
|
|
|
|
unsigned int aom_highbd_sad16x32_avg_avx2(const uint8_t *src, int src_stride,
|
|
const uint8_t *ref, int ref_stride,
|
|
const uint8_t *second_pred) {
|
|
const int left_shift = 4;
|
|
uint32_t sum = aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride,
|
|
second_pred);
|
|
src += src_stride << left_shift;
|
|
ref += ref_stride << left_shift;
|
|
second_pred += 16 << left_shift;
|
|
sum += aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride,
|
|
second_pred);
|
|
return sum;
|
|
}
|
|
|
|
unsigned int aom_highbd_sad32x16_avg_avx2(const uint8_t *src, int src_stride,
|
|
const uint8_t *ref, int ref_stride,
|
|
const uint8_t *second_pred) {
|
|
__m256i sad = _mm256_setzero_si256();
|
|
uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
|
|
uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
|
|
uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
|
|
const int left_shift = 2;
|
|
int row_section = 0;
|
|
|
|
while (row_section < 4) {
|
|
sad32x4(srcp, src_stride, refp, ref_stride, secp, &sad);
|
|
srcp += src_stride << left_shift;
|
|
refp += ref_stride << left_shift;
|
|
secp += 32 << left_shift;
|
|
row_section += 1;
|
|
}
|
|
return get_sad_from_mm256_epi32(&sad);
|
|
}
|
|
|
|
unsigned int aom_highbd_sad32x32_avg_avx2(const uint8_t *src, int src_stride,
|
|
const uint8_t *ref, int ref_stride,
|
|
const uint8_t *second_pred) {
|
|
const int left_shift = 4;
|
|
uint32_t sum = aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride,
|
|
second_pred);
|
|
src += src_stride << left_shift;
|
|
ref += ref_stride << left_shift;
|
|
second_pred += 32 << left_shift;
|
|
sum += aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride,
|
|
second_pred);
|
|
return sum;
|
|
}
|
|
|
|
unsigned int aom_highbd_sad32x64_avg_avx2(const uint8_t *src, int src_stride,
|
|
const uint8_t *ref, int ref_stride,
|
|
const uint8_t *second_pred) {
|
|
const int left_shift = 5;
|
|
uint32_t sum = aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride,
|
|
second_pred);
|
|
src += src_stride << left_shift;
|
|
ref += ref_stride << left_shift;
|
|
second_pred += 32 << left_shift;
|
|
sum += aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride,
|
|
second_pred);
|
|
return sum;
|
|
}
|
|
|
|
unsigned int aom_highbd_sad64x32_avg_avx2(const uint8_t *src, int src_stride,
|
|
const uint8_t *ref, int ref_stride,
|
|
const uint8_t *second_pred) {
|
|
__m256i sad = _mm256_setzero_si256();
|
|
uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
|
|
uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
|
|
uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
|
|
const int left_shift = 1;
|
|
int row_section = 0;
|
|
|
|
while (row_section < 16) {
|
|
sad64x2(srcp, src_stride, refp, ref_stride, secp, &sad);
|
|
srcp += src_stride << left_shift;
|
|
refp += ref_stride << left_shift;
|
|
secp += 64 << left_shift;
|
|
row_section += 1;
|
|
}
|
|
return get_sad_from_mm256_epi32(&sad);
|
|
}
|
|
|
|
unsigned int aom_highbd_sad64x64_avg_avx2(const uint8_t *src, int src_stride,
|
|
const uint8_t *ref, int ref_stride,
|
|
const uint8_t *second_pred) {
|
|
const int left_shift = 5;
|
|
uint32_t sum = aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride,
|
|
second_pred);
|
|
src += src_stride << left_shift;
|
|
ref += ref_stride << left_shift;
|
|
second_pred += 64 << left_shift;
|
|
sum += aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride,
|
|
second_pred);
|
|
return sum;
|
|
}
|
|
|
|
unsigned int aom_highbd_sad64x128_avg_avx2(const uint8_t *src, int src_stride,
|
|
const uint8_t *ref, int ref_stride,
|
|
const uint8_t *second_pred) {
|
|
const int left_shift = 6;
|
|
uint32_t sum = aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride,
|
|
second_pred);
|
|
src += src_stride << left_shift;
|
|
ref += ref_stride << left_shift;
|
|
second_pred += 64 << left_shift;
|
|
sum += aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride,
|
|
second_pred);
|
|
return sum;
|
|
}
|
|
|
|
unsigned int aom_highbd_sad128x64_avg_avx2(const uint8_t *src, int src_stride,
|
|
const uint8_t *ref, int ref_stride,
|
|
const uint8_t *second_pred) {
|
|
__m256i sad = _mm256_setzero_si256();
|
|
uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
|
|
uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
|
|
uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
|
|
int row = 0;
|
|
while (row < 64) {
|
|
sad128x1(srcp, refp, secp, &sad);
|
|
srcp += src_stride;
|
|
refp += ref_stride;
|
|
secp += 16 << 3;
|
|
row += 1;
|
|
}
|
|
return get_sad_from_mm256_epi32(&sad);
|
|
}
|
|
|
|
unsigned int aom_highbd_sad128x128_avg_avx2(const uint8_t *src, int src_stride,
|
|
const uint8_t *ref, int ref_stride,
|
|
const uint8_t *second_pred) {
|
|
unsigned int sum;
|
|
const int left_shift = 6;
|
|
|
|
sum = aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride,
|
|
second_pred);
|
|
src += src_stride << left_shift;
|
|
ref += ref_stride << left_shift;
|
|
second_pred += 128 << left_shift;
|
|
sum += aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride,
|
|
second_pred);
|
|
return sum;
|
|
}
|
|
|
|
// SAD 4D
|
|
// Combine 4 __m256i vectors to uint32_t result[4]
|
|
static INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v,
|
|
uint32_t *res) {
|
|
__m256i u0, u1, u2, u3;
|
|
const __m256i mask = yy_set1_64_from_32i(UINT32_MAX);
|
|
__m128i sad;
|
|
|
|
// 8 32-bit summation
|
|
u0 = _mm256_srli_si256(v[0], 4);
|
|
u1 = _mm256_srli_si256(v[1], 4);
|
|
u2 = _mm256_srli_si256(v[2], 4);
|
|
u3 = _mm256_srli_si256(v[3], 4);
|
|
|
|
u0 = _mm256_add_epi32(u0, v[0]);
|
|
u1 = _mm256_add_epi32(u1, v[1]);
|
|
u2 = _mm256_add_epi32(u2, v[2]);
|
|
u3 = _mm256_add_epi32(u3, v[3]);
|
|
|
|
u0 = _mm256_and_si256(u0, mask);
|
|
u1 = _mm256_and_si256(u1, mask);
|
|
u2 = _mm256_and_si256(u2, mask);
|
|
u3 = _mm256_and_si256(u3, mask);
|
|
// 4 32-bit summation, evenly positioned
|
|
|
|
u1 = _mm256_slli_si256(u1, 4);
|
|
u3 = _mm256_slli_si256(u3, 4);
|
|
|
|
u0 = _mm256_or_si256(u0, u1);
|
|
u2 = _mm256_or_si256(u2, u3);
|
|
// 8 32-bit summation, interleaved
|
|
|
|
u1 = _mm256_unpacklo_epi64(u0, u2);
|
|
u3 = _mm256_unpackhi_epi64(u0, u2);
|
|
|
|
u0 = _mm256_add_epi32(u1, u3);
|
|
sad = _mm_add_epi32(_mm256_extractf128_si256(u0, 1),
|
|
_mm256_castsi256_si128(u0));
|
|
_mm_storeu_si128((__m128i *)res, sad);
|
|
}
|
|
|
|
static void convert_pointers(const uint8_t *const ref8[],
|
|
const uint16_t *ref[]) {
|
|
ref[0] = CONVERT_TO_SHORTPTR(ref8[0]);
|
|
ref[1] = CONVERT_TO_SHORTPTR(ref8[1]);
|
|
ref[2] = CONVERT_TO_SHORTPTR(ref8[2]);
|
|
ref[3] = CONVERT_TO_SHORTPTR(ref8[3]);
|
|
}
|
|
|
|
static void init_sad(__m256i *s) {
|
|
s[0] = _mm256_setzero_si256();
|
|
s[1] = _mm256_setzero_si256();
|
|
s[2] = _mm256_setzero_si256();
|
|
s[3] = _mm256_setzero_si256();
|
|
}
|
|
|
|
void aom_highbd_sad16x8x4d_avx2(const uint8_t *src, int src_stride,
|
|
const uint8_t *const ref_array[],
|
|
int ref_stride, uint32_t *sad_array) {
|
|
__m256i sad_vec[4];
|
|
const uint16_t *refp[4];
|
|
const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
|
|
const uint16_t *srcp;
|
|
const int shift_for_4_rows = 2;
|
|
int i;
|
|
|
|
init_sad(sad_vec);
|
|
convert_pointers(ref_array, refp);
|
|
|
|
for (i = 0; i < 4; ++i) {
|
|
srcp = keep;
|
|
sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
|
|
srcp += src_stride << shift_for_4_rows;
|
|
refp[i] += ref_stride << shift_for_4_rows;
|
|
sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
|
|
}
|
|
get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
|
|
}
|
|
|
|
void aom_highbd_sad16x16x4d_avx2(const uint8_t *src, int src_stride,
|
|
const uint8_t *const ref_array[],
|
|
int ref_stride, uint32_t *sad_array) {
|
|
uint32_t first8rows[4];
|
|
uint32_t second8rows[4];
|
|
const uint8_t *ref[4];
|
|
const int shift_for_8_rows = 3;
|
|
|
|
ref[0] = ref_array[0];
|
|
ref[1] = ref_array[1];
|
|
ref[2] = ref_array[2];
|
|
ref[3] = ref_array[3];
|
|
|
|
aom_highbd_sad16x8x4d_avx2(src, src_stride, ref, ref_stride, first8rows);
|
|
src += src_stride << shift_for_8_rows;
|
|
ref[0] += ref_stride << shift_for_8_rows;
|
|
ref[1] += ref_stride << shift_for_8_rows;
|
|
ref[2] += ref_stride << shift_for_8_rows;
|
|
ref[3] += ref_stride << shift_for_8_rows;
|
|
aom_highbd_sad16x8x4d_avx2(src, src_stride, ref, ref_stride, second8rows);
|
|
sad_array[0] = first8rows[0] + second8rows[0];
|
|
sad_array[1] = first8rows[1] + second8rows[1];
|
|
sad_array[2] = first8rows[2] + second8rows[2];
|
|
sad_array[3] = first8rows[3] + second8rows[3];
|
|
}
|
|
|
|
void aom_highbd_sad16x32x4d_avx2(const uint8_t *src, int src_stride,
|
|
const uint8_t *const ref_array[],
|
|
int ref_stride, uint32_t *sad_array) {
|
|
uint32_t first_half[4];
|
|
uint32_t second_half[4];
|
|
const uint8_t *ref[4];
|
|
const int shift_for_rows = 4;
|
|
|
|
ref[0] = ref_array[0];
|
|
ref[1] = ref_array[1];
|
|
ref[2] = ref_array[2];
|
|
ref[3] = ref_array[3];
|
|
|
|
aom_highbd_sad16x16x4d_avx2(src, src_stride, ref, ref_stride, first_half);
|
|
src += src_stride << shift_for_rows;
|
|
ref[0] += ref_stride << shift_for_rows;
|
|
ref[1] += ref_stride << shift_for_rows;
|
|
ref[2] += ref_stride << shift_for_rows;
|
|
ref[3] += ref_stride << shift_for_rows;
|
|
aom_highbd_sad16x16x4d_avx2(src, src_stride, ref, ref_stride, second_half);
|
|
sad_array[0] = first_half[0] + second_half[0];
|
|
sad_array[1] = first_half[1] + second_half[1];
|
|
sad_array[2] = first_half[2] + second_half[2];
|
|
sad_array[3] = first_half[3] + second_half[3];
|
|
}
|
|
|
|
void aom_highbd_sad32x16x4d_avx2(const uint8_t *src, int src_stride,
|
|
const uint8_t *const ref_array[],
|
|
int ref_stride, uint32_t *sad_array) {
|
|
__m256i sad_vec[4];
|
|
const uint16_t *refp[4];
|
|
const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
|
|
const uint16_t *srcp;
|
|
const int shift_for_4_rows = 2;
|
|
int i;
|
|
int rows_section;
|
|
|
|
init_sad(sad_vec);
|
|
convert_pointers(ref_array, refp);
|
|
|
|
for (i = 0; i < 4; ++i) {
|
|
srcp = keep;
|
|
rows_section = 0;
|
|
while (rows_section < 4) {
|
|
sad32x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
|
|
srcp += src_stride << shift_for_4_rows;
|
|
refp[i] += ref_stride << shift_for_4_rows;
|
|
rows_section++;
|
|
}
|
|
}
|
|
get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
|
|
}
|
|
|
|
void aom_highbd_sad32x32x4d_avx2(const uint8_t *src, int src_stride,
|
|
const uint8_t *const ref_array[],
|
|
int ref_stride, uint32_t *sad_array) {
|
|
uint32_t first_half[4];
|
|
uint32_t second_half[4];
|
|
const uint8_t *ref[4];
|
|
const int shift_for_rows = 4;
|
|
|
|
ref[0] = ref_array[0];
|
|
ref[1] = ref_array[1];
|
|
ref[2] = ref_array[2];
|
|
ref[3] = ref_array[3];
|
|
|
|
aom_highbd_sad32x16x4d_avx2(src, src_stride, ref, ref_stride, first_half);
|
|
src += src_stride << shift_for_rows;
|
|
ref[0] += ref_stride << shift_for_rows;
|
|
ref[1] += ref_stride << shift_for_rows;
|
|
ref[2] += ref_stride << shift_for_rows;
|
|
ref[3] += ref_stride << shift_for_rows;
|
|
aom_highbd_sad32x16x4d_avx2(src, src_stride, ref, ref_stride, second_half);
|
|
sad_array[0] = first_half[0] + second_half[0];
|
|
sad_array[1] = first_half[1] + second_half[1];
|
|
sad_array[2] = first_half[2] + second_half[2];
|
|
sad_array[3] = first_half[3] + second_half[3];
|
|
}
|
|
|
|
void aom_highbd_sad32x64x4d_avx2(const uint8_t *src, int src_stride,
|
|
const uint8_t *const ref_array[],
|
|
int ref_stride, uint32_t *sad_array) {
|
|
uint32_t first_half[4];
|
|
uint32_t second_half[4];
|
|
const uint8_t *ref[4];
|
|
const int shift_for_rows = 5;
|
|
|
|
ref[0] = ref_array[0];
|
|
ref[1] = ref_array[1];
|
|
ref[2] = ref_array[2];
|
|
ref[3] = ref_array[3];
|
|
|
|
aom_highbd_sad32x32x4d_avx2(src, src_stride, ref, ref_stride, first_half);
|
|
src += src_stride << shift_for_rows;
|
|
ref[0] += ref_stride << shift_for_rows;
|
|
ref[1] += ref_stride << shift_for_rows;
|
|
ref[2] += ref_stride << shift_for_rows;
|
|
ref[3] += ref_stride << shift_for_rows;
|
|
aom_highbd_sad32x32x4d_avx2(src, src_stride, ref, ref_stride, second_half);
|
|
sad_array[0] = first_half[0] + second_half[0];
|
|
sad_array[1] = first_half[1] + second_half[1];
|
|
sad_array[2] = first_half[2] + second_half[2];
|
|
sad_array[3] = first_half[3] + second_half[3];
|
|
}
|
|
|
|
void aom_highbd_sad64x32x4d_avx2(const uint8_t *src, int src_stride,
|
|
const uint8_t *const ref_array[],
|
|
int ref_stride, uint32_t *sad_array) {
|
|
__m256i sad_vec[4];
|
|
const uint16_t *refp[4];
|
|
const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
|
|
const uint16_t *srcp;
|
|
const int shift_for_rows = 1;
|
|
int i;
|
|
int rows_section;
|
|
|
|
init_sad(sad_vec);
|
|
convert_pointers(ref_array, refp);
|
|
|
|
for (i = 0; i < 4; ++i) {
|
|
srcp = keep;
|
|
rows_section = 0;
|
|
while (rows_section < 16) {
|
|
sad64x2(srcp, src_stride, refp[i], ref_stride, NULL, &sad_vec[i]);
|
|
srcp += src_stride << shift_for_rows;
|
|
refp[i] += ref_stride << shift_for_rows;
|
|
rows_section++;
|
|
}
|
|
}
|
|
get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
|
|
}
|
|
|
|
void aom_highbd_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
|
|
const uint8_t *const ref_array[],
|
|
int ref_stride, uint32_t *sad_array) {
|
|
uint32_t first_half[4];
|
|
uint32_t second_half[4];
|
|
const uint8_t *ref[4];
|
|
const int shift_for_rows = 5;
|
|
|
|
ref[0] = ref_array[0];
|
|
ref[1] = ref_array[1];
|
|
ref[2] = ref_array[2];
|
|
ref[3] = ref_array[3];
|
|
|
|
aom_highbd_sad64x32x4d_avx2(src, src_stride, ref, ref_stride, first_half);
|
|
src += src_stride << shift_for_rows;
|
|
ref[0] += ref_stride << shift_for_rows;
|
|
ref[1] += ref_stride << shift_for_rows;
|
|
ref[2] += ref_stride << shift_for_rows;
|
|
ref[3] += ref_stride << shift_for_rows;
|
|
aom_highbd_sad64x32x4d_avx2(src, src_stride, ref, ref_stride, second_half);
|
|
sad_array[0] = first_half[0] + second_half[0];
|
|
sad_array[1] = first_half[1] + second_half[1];
|
|
sad_array[2] = first_half[2] + second_half[2];
|
|
sad_array[3] = first_half[3] + second_half[3];
|
|
}
|
|
|
|
void aom_highbd_sad64x128x4d_avx2(const uint8_t *src, int src_stride,
|
|
const uint8_t *const ref_array[],
|
|
int ref_stride, uint32_t *sad_array) {
|
|
uint32_t first_half[4];
|
|
uint32_t second_half[4];
|
|
const uint8_t *ref[4];
|
|
const int shift_for_rows = 6;
|
|
|
|
ref[0] = ref_array[0];
|
|
ref[1] = ref_array[1];
|
|
ref[2] = ref_array[2];
|
|
ref[3] = ref_array[3];
|
|
|
|
aom_highbd_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, first_half);
|
|
src += src_stride << shift_for_rows;
|
|
ref[0] += ref_stride << shift_for_rows;
|
|
ref[1] += ref_stride << shift_for_rows;
|
|
ref[2] += ref_stride << shift_for_rows;
|
|
ref[3] += ref_stride << shift_for_rows;
|
|
aom_highbd_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, second_half);
|
|
sad_array[0] = first_half[0] + second_half[0];
|
|
sad_array[1] = first_half[1] + second_half[1];
|
|
sad_array[2] = first_half[2] + second_half[2];
|
|
sad_array[3] = first_half[3] + second_half[3];
|
|
}
|
|
|
|
void aom_highbd_sad128x64x4d_avx2(const uint8_t *src, int src_stride,
|
|
const uint8_t *const ref_array[],
|
|
int ref_stride, uint32_t *sad_array) {
|
|
__m256i sad_vec[4];
|
|
const uint16_t *refp[4];
|
|
const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
|
|
const uint16_t *srcp;
|
|
int i;
|
|
int rows_section;
|
|
|
|
init_sad(sad_vec);
|
|
convert_pointers(ref_array, refp);
|
|
|
|
for (i = 0; i < 4; ++i) {
|
|
srcp = keep;
|
|
rows_section = 0;
|
|
while (rows_section < 64) {
|
|
sad128x1(srcp, refp[i], NULL, &sad_vec[i]);
|
|
srcp += src_stride;
|
|
refp[i] += ref_stride;
|
|
rows_section++;
|
|
}
|
|
}
|
|
get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
|
|
}
|
|
|
|
void aom_highbd_sad128x128x4d_avx2(const uint8_t *src, int src_stride,
|
|
const uint8_t *const ref_array[],
|
|
int ref_stride, uint32_t *sad_array) {
|
|
uint32_t first_half[4];
|
|
uint32_t second_half[4];
|
|
const uint8_t *ref[4];
|
|
const int shift_for_rows = 6;
|
|
|
|
ref[0] = ref_array[0];
|
|
ref[1] = ref_array[1];
|
|
ref[2] = ref_array[2];
|
|
ref[3] = ref_array[3];
|
|
|
|
aom_highbd_sad128x64x4d_avx2(src, src_stride, ref, ref_stride, first_half);
|
|
src += src_stride << shift_for_rows;
|
|
ref[0] += ref_stride << shift_for_rows;
|
|
ref[1] += ref_stride << shift_for_rows;
|
|
ref[2] += ref_stride << shift_for_rows;
|
|
ref[3] += ref_stride << shift_for_rows;
|
|
aom_highbd_sad128x64x4d_avx2(src, src_stride, ref, ref_stride, second_half);
|
|
sad_array[0] = first_half[0] + second_half[0];
|
|
sad_array[1] = first_half[1] + second_half[1];
|
|
sad_array[2] = first_half[2] + second_half[2];
|
|
sad_array[3] = first_half[3] + second_half[3];
|
|
}
|