Merge "Optimize vp9_highbd_block_error_8bit assembly."
Commit 35cae7f1b3
test/vp9_error_block_test.cc

@@ -67,12 +67,22 @@ TEST_P(ErrorBlockTest, OperationCheck) {
   int64_t ret;
   int64_t ref_ssz;
   int64_t ref_ret;
+  const int msb = bit_depth_ + 8 - 1;
   for (int i = 0; i < kNumIterations; ++i) {
     int err_count = 0;
     block_size = 16 << (i % 9);  // All block sizes from 4x4, 8x4 ..64x64
     for (int j = 0; j < block_size; j++) {
-      coeff[j] = rnd(2 << 20) - (1 << 20);
-      dqcoeff[j] = rnd(2 << 20) - (1 << 20);
+      // coeff and dqcoeff will always have at least the same sign, and this
+      // can be used for optimization, so generate test input precisely.
+      if (rnd(2)) {
+        // Positive number
+        coeff[j] = rnd(1 << msb);
+        dqcoeff[j] = rnd(1 << msb);
+      } else {
+        // Negative number
+        coeff[j] = -rnd(1 << msb);
+        dqcoeff[j] = -rnd(1 << msb);
+      }
     }
     ref_ret = ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz,
                                   bit_depth_);

@@ -85,7 +95,7 @@ TEST_P(ErrorBlockTest, OperationCheck) {
     err_count_total += err_count;
   }
   EXPECT_EQ(0, err_count_total)
-      << "Error: Error Block Test, C output doesn't match SSE2 output. "
+      << "Error: Error Block Test, C output doesn't match optimized output. "
       << "First failed at test case " << first_failure;
 }
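Why the new bounds matter: with msb = bit_depth_ + 8 - 1 magnitude bits and no strictly opposite signs, every generated value survives the signed-saturating pack to 16 bits that the SIMD code performs, and the difference of any coeff/dqcoeff pair still fits in an int16_t. A minimal C sketch of that range argument for the 8-bit case (the helper below is illustrative, not part of the test):

#include <assert.h>
#include <stdint.h>

/* Sketch: with bit_depth = 8, msb = 8 + 8 - 1 = 15, so generated values lie
 * in (-2^15, 2^15).  They survive a signed-saturating pack to int16_t
 * unchanged, and the difference of two such values that never have strictly
 * opposite signs also fits in int16_t, which is what lets the SIMD code use
 * pmaddwd on packed 16-bit lanes without losing precision. */
static void check_range(int32_t c, int32_t d) {
  assert(c > -(1 << 15) && c < (1 << 15));            /* packs losslessly */
  assert(d > -(1 << 15) && d < (1 << 15));
  assert(c == 0 || d == 0 || (c > 0) == (d > 0));     /* "same sign" rule */
  const int32_t diff = d - c;                         /* |diff| < 2^15 */
  assert(diff >= INT16_MIN && diff <= INT16_MAX);
}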
@@ -100,23 +110,36 @@ TEST_P(ErrorBlockTest, ExtremeValues) {
   int64_t ret;
   int64_t ref_ssz;
   int64_t ref_ret;
-  int max_val = ((1 << 20) - 1);
+  const int msb = bit_depth_ + 8 - 1;
+  int max_val = ((1 << msb) - 1);
   for (int i = 0; i < kNumIterations; ++i) {
     int err_count = 0;
-    int k = (i / 9) % 5;
+    int k = (i / 9) % 9;

     // Change the maximum coeff value, to test different bit boundaries
-    if ( k == 4 && (i % 9) == 0 ) {
+    if ( k == 8 && (i % 9) == 0 ) {
       max_val >>= 1;
     }
     block_size = 16 << (i % 9);  // All block sizes from 4x4, 8x4 ..64x64
     for (int j = 0; j < block_size; j++) {
-      if (k < 4) {  // Test at maximum values
-        coeff[j] = k % 2 ? max_val : -max_val;
-        dqcoeff[j] = (k >> 1) % 2 ? max_val : -max_val;
+      if (k < 4) {
+        // Test at positive maximum values
+        coeff[j] = k % 2 ? max_val : 0;
+        dqcoeff[j] = (k >> 1) % 2 ? max_val : 0;
+      } else if (k < 8) {
+        // Test at negative maximum values
+        coeff[j] = k % 2 ? -max_val : 0;
+        dqcoeff[j] = (k >> 1) % 2 ? -max_val : 0;
       } else {
-        coeff[j] = rnd(2 << 14) - (1 << 14);
-        dqcoeff[j] = rnd(2 << 14) - (1 << 14);
+        if (rnd(2)) {
+          // Positive number
+          coeff[j] = rnd(1 << 14);
+          dqcoeff[j] = rnd(1 << 14);
+        } else {
+          // Negative number
+          coeff[j] = -rnd(1 << 14);
+          dqcoeff[j] = -rnd(1 << 14);
+        }
       }
     }
     ref_ret = ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz,
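The widened k = (i / 9) % 9 cycle now covers four positive-extreme patterns, four negative-extreme patterns, and one random case, so extremes are never mixed with opposite signs. A compact sketch of the pattern selection (a hypothetical helper mirroring the test logic above):

/* Sketch of the per-element pattern selected by k = (i / 9) % 9:
 *   k 0..3: positive extremes -> (coeff, dqcoeff) drawn from {0, max_val}
 *   k 4..7: negative extremes -> (coeff, dqcoeff) drawn from {0, -max_val}
 *   k 8   : random same-sign values bounded by 2^14 (handled in the test) */
static void extreme_pattern(int k, int max_val, int *coeff, int *dqcoeff) {
  if (k < 4) {
    *coeff = (k % 2) ? max_val : 0;
    *dqcoeff = ((k >> 1) % 2) ? max_val : 0;
  } else if (k < 8) {
    *coeff = (k % 2) ? -max_val : 0;
    *dqcoeff = ((k >> 1) % 2) ? -max_val : 0;
  }
  /* k == 8 falls through to the random case in the real test. */
}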
@@ -130,21 +153,13 @@ TEST_P(ErrorBlockTest, ExtremeValues) {
     err_count_total += err_count;
   }
   EXPECT_EQ(0, err_count_total)
-      << "Error: Error Block Test, C output doesn't match SSE2 output. "
+      << "Error: Error Block Test, C output doesn't match optimized output. "
       << "First failed at test case " << first_failure;
 }

 using std::tr1::make_tuple;

-#if CONFIG_USE_X86INC && HAVE_SSE2
-int64_t wrap_vp9_highbd_block_error_8bit_sse2(const tran_low_t *coeff,
-                                              const tran_low_t *dqcoeff,
-                                              intptr_t block_size,
-                                              int64_t *ssz, int bps) {
-  assert(bps == 8);
-  return vp9_highbd_block_error_8bit_sse2(coeff, dqcoeff, block_size, ssz);
-}
-
+#if CONFIG_USE_X86INC
 int64_t wrap_vp9_highbd_block_error_8bit_c(const tran_low_t *coeff,
                                            const tran_low_t *dqcoeff,
                                            intptr_t block_size,
@@ -153,6 +168,15 @@ int64_t wrap_vp9_highbd_block_error_8bit_c(const tran_low_t *coeff,
   return vp9_highbd_block_error_8bit_c(coeff, dqcoeff, block_size, ssz);
 }

+#if HAVE_SSE2
+int64_t wrap_vp9_highbd_block_error_8bit_sse2(const tran_low_t *coeff,
+                                              const tran_low_t *dqcoeff,
+                                              intptr_t block_size,
+                                              int64_t *ssz, int bps) {
+  assert(bps == 8);
+  return vp9_highbd_block_error_8bit_sse2(coeff, dqcoeff, block_size, ssz);
+}
+
 INSTANTIATE_TEST_CASE_P(
     SSE2, ErrorBlockTest,
     ::testing::Values(
@@ -165,5 +189,23 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(&wrap_vp9_highbd_block_error_8bit_sse2,
                    &wrap_vp9_highbd_block_error_8bit_c, VPX_BITS_8)));
+#endif  // HAVE_SSE2
+
+#if HAVE_AVX
+int64_t wrap_vp9_highbd_block_error_8bit_avx(const tran_low_t *coeff,
+                                             const tran_low_t *dqcoeff,
+                                             intptr_t block_size,
+                                             int64_t *ssz, int bps) {
+  assert(bps == 8);
+  return vp9_highbd_block_error_8bit_avx(coeff, dqcoeff, block_size, ssz);
+}
+
+INSTANTIATE_TEST_CASE_P(
+    AVX, ErrorBlockTest,
+    ::testing::Values(
+        make_tuple(&wrap_vp9_highbd_block_error_8bit_avx,
+                   &wrap_vp9_highbd_block_error_8bit_c, VPX_BITS_8)));
+#endif  // HAVE_AVX
+
 #endif  // CONFIG_USE_X86INC
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace
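Both the SSE2 and AVX instantiations are checked against the same C reference through the wrapped _c function. That reference computes the block error and the sum of squared coefficients; roughly, as a sketch reconstructed from the surrounding definitions (not the exact libvpx source):

#include <stdint.h>

/* Sketch of what the block-error reference computes:
 * returns sum (coeff - dqcoeff)^2 and writes sum coeff^2 to *ssz,
 * both accumulated in 64 bits. */
static int64_t block_error_ref(const int32_t *coeff, const int32_t *dqcoeff,
                               intptr_t block_size, int64_t *ssz) {
  int64_t error = 0, sqcoeff = 0;
  for (intptr_t i = 0; i < block_size; ++i) {
    const int64_t diff = coeff[i] - dqcoeff[i];
    error += diff * diff;
    sqcoeff += (int64_t)coeff[i] * coeff[i];
  }
  *ssz = sqcoeff;
  return error;
}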
vp9/common/vp9_rtcd_defs.pl

@@ -248,7 +248,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vp9_highbd_block_error/, "$sse2_x86inc";

   add_proto qw/int64_t vp9_highbd_block_error_8bit/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
-  specialize qw/vp9_highbd_block_error_8bit/, "$sse2_x86inc";
+  specialize qw/vp9_highbd_block_error_8bit/, "$sse2_x86inc", "$avx_x86inc";

   add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_quantize_fp/;
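Adding "$avx_x86inc" to the specialize line is what exposes the new AVX routine through libvpx's run-time CPU dispatch. A hand-written sketch of the shape of that dispatch (the real vp9_rtcd.h is machine generated and differs in detail; setup_rtcd_sketch and the flag constants below are stand-ins, not the real names):

#include <stdint.h>

typedef int32_t tran_low_t;  /* as in high-bitdepth builds of libvpx */

/* The three implementations that exist after this change. */
int64_t vp9_highbd_block_error_8bit_c(const tran_low_t *, const tran_low_t *,
                                      intptr_t, int64_t *);
int64_t vp9_highbd_block_error_8bit_sse2(const tran_low_t *,
                                         const tran_low_t *, intptr_t,
                                         int64_t *);
int64_t vp9_highbd_block_error_8bit_avx(const tran_low_t *,
                                        const tran_low_t *, intptr_t,
                                        int64_t *);

/* A function pointer that setup code retargets once at init time. */
typedef int64_t (*block_error_8bit_fn)(const tran_low_t *, const tran_low_t *,
                                       intptr_t, int64_t *);
block_error_8bit_fn vp9_highbd_block_error_8bit = vp9_highbd_block_error_8bit_c;

enum { HAS_SSE2_SKETCH = 1 << 0, HAS_AVX_SKETCH = 1 << 1 };

void setup_rtcd_sketch(int cpu_flags) {
  if (cpu_flags & HAS_SSE2_SKETCH)
    vp9_highbd_block_error_8bit = vp9_highbd_block_error_8bit_sse2;
  if (cpu_flags & HAS_AVX_SKETCH)
    vp9_highbd_block_error_8bit = vp9_highbd_block_error_8bit_avx;
}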
vp9/encoder/vp9_rdopt.c

@@ -296,30 +296,11 @@ int64_t vp9_highbd_block_error_8bit_c(const tran_low_t *coeff,
                                       const tran_low_t *dqcoeff,
                                       intptr_t block_size,
                                       int64_t *ssz) {
-  int i;
-  int32_t c, d;
-  int64_t error = 0, sqcoeff = 0;
-  int16_t diff;
-
-  const int32_t hi = 0x00007fff;
-  const int32_t lo = 0xffff8000;
-
-  for (i = 0; i < block_size; i++) {
-    c = coeff[i];
-    d = dqcoeff[i];
-
-    // Saturate to 16 bits
-    c = (c > hi) ? hi : ((c < lo) ? lo : c);
-    d = (d > hi) ? hi : ((d < lo) ? lo : d);
-
-    diff = d - c;
-    error += diff * diff;
-    sqcoeff += c * c;
-  }
-  assert(error >= 0 && sqcoeff >= 0);
-
-  *ssz = sqcoeff;
-  return error;
+  // Note that the C versions of these 2 functions (vp9_block_error and
+  // vp9_highbd_block_error_8bit) are the same, but the optimized assembly
+  // routines are not compatible in the non high bitdepth configuration, so
+  // they still cannot share the same name.
+  return vp9_block_error_c(coeff, dqcoeff, block_size, ssz);
 }

 static int64_t vp9_highbd_block_error_dispatch(const tran_low_t *coeff,
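The deleted body saturated each input to 16 bits before squaring. In the 8-bit path the inputs are already bounded by 15 bits plus sign, so the clamp can never change a value, which is why delegating to vp9_block_error_c is safe. A small sketch of that argument (illustrative helpers, not library code):

#include <assert.h>
#include <stdint.h>

/* Sketch: the clamp to [-2^15, 2^15 - 1] is the identity on values that
 * already fit in 15 bits + sign, which is the input guarantee in 8-bit
 * mode.  Hence the old saturating loop and vp9_block_error_c agree on
 * every valid input. */
static int32_t sat16(int32_t v) {
  return (v > 0x7fff) ? 0x7fff : ((v < -0x8000) ? -0x8000 : v);
}

static void equivalence_check(int32_t c) {
  assert(c >= -(1 << 15) && c < (1 << 15));  /* 8-bit mode input range */
  assert(sat16(c) == c);                     /* saturation is a no-op */
}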
vp9/encoder/x86/vp9_highbd_error_avx.asm (new file; every line added)

@@ -0,0 +1,261 @@
;
;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%define private_prefix vp9

%include "third_party/x86inc/x86inc.asm"

SECTION .text
ALIGN 16

;
; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
;                                     intptr_t block_size, int64_t *ssz)
;

INIT_XMM avx
cglobal highbd_block_error_8bit, 4, 5, 8, uqc, dqc, size, ssz
  vzeroupper

  ; If only one iteration is required, then handle this as a special case.
  ; It is the most frequent case, so we can have a significant gain here
  ; by not setting up a loop and accumulators.
  cmp       sizeq, 16
  jne      .generic

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Common case of size == 16
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

  ; Load input vectors
  mova      xm0, [dqcq]
  packssdw  xm0, [dqcq+16]
  mova      xm2, [uqcq]
  packssdw  xm2, [uqcq+16]

  mova      xm1, [dqcq+32]
  packssdw  xm1, [dqcq+48]
  mova      xm3, [uqcq+32]
  packssdw  xm3, [uqcq+48]

  ; Compute the errors.
  psubw     xm0, xm2
  psubw     xm1, xm3

  ; Individual errors are max 15bit+sign, so squares are 30bit, and
  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
  pmaddwd   xm2, xm2
  pmaddwd   xm3, xm3

  pmaddwd   xm0, xm0
  pmaddwd   xm1, xm1

  ; Squares are always positive, so we can use unsigned arithmetic after
  ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
  ; fit in 32bits
  paddd     xm2, xm3
  paddd     xm0, xm1

  ; Accumulate horizontally in 64 bits, there is no chance of overflow here
  pxor      xm5, xm5

  pblendw   xm3, xm5, xm2, 0x33 ; Zero extended low of a pair of 32 bits
  psrlq     xm2, 32             ; Zero extended high of a pair of 32 bits

  pblendw   xm1, xm5, xm0, 0x33 ; Zero extended low of a pair of 32 bits
  psrlq     xm0, 32             ; Zero extended high of a pair of 32 bits

  paddq     xm2, xm3
  paddq     xm0, xm1

  psrldq    xm3, xm2, 8
  psrldq    xm1, xm0, 8

  paddq     xm2, xm3
  paddq     xm0, xm1

  ; Store the return value
%if ARCH_X86_64
  movq      rax, xm0
  movq   [sszq], xm2
%else
  movd      eax, xm0
  pextrd    edx, xm0, 1
  movq   [sszd], xm2
%endif
  RET

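An intrinsics paraphrase of this size == 16 fast path may help readers who do not speak x86 assembly. It is a sketch under the same input assumptions (16-byte-aligned buffers, values that pack losslessly to 16 bits), with the final horizontal reduction done in scalar code rather than with pblendw/psrlq:

#include <immintrin.h>
#include <stdint.h>

/* Sketch of the 16-element fast path: pack to 16-bit lanes, subtract,
 * square-and-pair with pmaddwd, then reduce the dword lanes in 64 bits. */
static int64_t block_error_16_sketch(const int32_t *uqc, const int32_t *dqc,
                                     int64_t *ssz) {
  /* Pack 16 32-bit inputs down to 16-bit lanes (mirrors mova + packssdw). */
  __m128i d0 = _mm_packs_epi32(_mm_load_si128((const __m128i *)(dqc + 0)),
                               _mm_load_si128((const __m128i *)(dqc + 4)));
  __m128i c0 = _mm_packs_epi32(_mm_load_si128((const __m128i *)(uqc + 0)),
                               _mm_load_si128((const __m128i *)(uqc + 4)));
  __m128i d1 = _mm_packs_epi32(_mm_load_si128((const __m128i *)(dqc + 8)),
                               _mm_load_si128((const __m128i *)(dqc + 12)));
  __m128i c1 = _mm_packs_epi32(_mm_load_si128((const __m128i *)(uqc + 8)),
                               _mm_load_si128((const __m128i *)(uqc + 12)));
  /* Differences, then multiply-accumulate pairs: 16 squares -> 8 dwords. */
  __m128i e0 = _mm_sub_epi16(d0, c0), e1 = _mm_sub_epi16(d1, c1);
  __m128i sse = _mm_add_epi32(_mm_madd_epi16(e0, e0), _mm_madd_epi16(e1, e1));
  __m128i sqc = _mm_add_epi32(_mm_madd_epi16(c0, c0), _mm_madd_epi16(c1, c1));
  /* Horizontal 64-bit reduction of 4 dword lanes (scalar for clarity). */
  uint32_t s[4], q[4];
  _mm_storeu_si128((__m128i *)s, sse);
  _mm_storeu_si128((__m128i *)q, sqc);
  *ssz = (int64_t)q[0] + q[1] + q[2] + q[3];
  return (int64_t)s[0] + s[1] + s[2] + s[3];
}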
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Generic case of size != 16, speculative low precision
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
ALIGN 16
.generic:
  pxor      xm4, xm4   ; sse accumulator
  pxor      xm5, xm5   ; overflow detection register for xm4
  pxor      xm6, xm6   ; ssz accumulator
  pxor      xm7, xm7   ; overflow detection register for xm6
  lea       uqcq, [uqcq+sizeq*4]
  lea       dqcq, [dqcq+sizeq*4]
  neg       sizeq

  ; Push the negative size as the high precision code might need it
  push      sizeq

.loop:
  ; Load input vectors
  mova      xm0, [dqcq+sizeq*4]
  packssdw  xm0, [dqcq+sizeq*4+16]
  mova      xm2, [uqcq+sizeq*4]
  packssdw  xm2, [uqcq+sizeq*4+16]

  mova      xm1, [dqcq+sizeq*4+32]
  packssdw  xm1, [dqcq+sizeq*4+48]
  mova      xm3, [uqcq+sizeq*4+32]
  packssdw  xm3, [uqcq+sizeq*4+48]

  add       sizeq, 16

  ; Compute the squared errors.
  ; Individual errors are max 15bit+sign, so squares are 30bit, and
  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
  psubw     xm0, xm2
  pmaddwd   xm2, xm2
  pmaddwd   xm0, xm0

  psubw     xm1, xm3
  pmaddwd   xm3, xm3
  pmaddwd   xm1, xm1

  ; Squares are always positive, so we can use unsigned arithmetic after
  ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
  ; fit in 32bits
  paddd     xm2, xm3
  paddd     xm0, xm1

  ; We accumulate using 32 bit arithmetic, but detect potential overflow
  ; by checking if the MSB of the accumulators has ever been a set bit.
  ; If yes, we redo the whole compute at the end on higher precision, but
  ; this happens extremely rarely, so we still achieve a net gain.
  paddd     xm4, xm0
  paddd     xm6, xm2
  por       xm5, xm4   ; OR in the accumulator for overflow detection
  por       xm7, xm6   ; OR in the accumulator for overflow detection

  jnz      .loop

  ; Add pairs horizontally (still only on 32 bits)
  phaddd    xm4, xm4
  por       xm5, xm4   ; OR in the accumulator for overflow detection
  phaddd    xm6, xm6
  por       xm7, xm6   ; OR in the accumulator for overflow detection

  ; Check for the possibility of overflow by testing if the MSB of each
  ; dword lane has ever been set. If it never was, there was no overflow
  ; and the final sum fits in 32 bits. If overflow may have happened, we
  ; redo the whole computation on higher precision.
  por       xm7, xm5
  pmovmskb  r4, xm7
  test      r4, 0x8888
  jnz      .highprec

  phaddd    xm4, xm4
  phaddd    xm6, xm6
  pmovzxdq  xm4, xm4
  pmovzxdq  xm6, xm6

  ; Restore stack
  pop       sizeq

  ; Store the return value
%if ARCH_X86_64
  movq      rax, xm4
  movq   [sszq], xm6
%else
  movd      eax, xm4
  pextrd    edx, xm4, 1
  movq   [sszd], xm6
%endif
  RET

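The speculative strategy in scalar form: accumulate in 32 bits, OR every intermediate accumulator value into a mask, and fall back to a full 64-bit pass if the mask's sign bit was ever set. A sketch assuming each added term stays below 2^31 (which makes the MSB test exact; the asm argues per-pmaddwd lanes stay within 31 bits):

#include <stdint.h>

/* Scalar paraphrase of the speculative accumulation above.  Assumes each
 * term < 2^31, so if the accumulator's MSB is never set, no 32-bit wrap
 * can have occurred; otherwise redo the sum in 64 bits (the rare case). */
static int64_t speculative_sum_sketch(const uint32_t *terms, int n) {
  uint32_t acc = 0, seen = 0;
  int i;
  for (i = 0; i < n; ++i) {
    acc += terms[i];
    seen |= acc;                 /* mirrors: por xm5, xm4 */
  }
  if (seen & 0x80000000u) {      /* mirrors: pmovmskb + test r4, 0x8888 */
    int64_t wide = 0;            /* mirrors the .highprec fallback */
    for (i = 0; i < n; ++i) wide += terms[i];
    return wide;
  }
  return acc;
}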
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Generic case of size != 16, high precision case
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.highprec:
  pxor      xm4, xm4   ; sse accumulator
  pxor      xm5, xm5   ; dedicated zero register
  pxor      xm6, xm6   ; ssz accumulator
  pop       sizeq

.loophp:
  mova      xm0, [dqcq+sizeq*4]
  packssdw  xm0, [dqcq+sizeq*4+16]
  mova      xm2, [uqcq+sizeq*4]
  packssdw  xm2, [uqcq+sizeq*4+16]

  mova      xm1, [dqcq+sizeq*4+32]
  packssdw  xm1, [dqcq+sizeq*4+48]
  mova      xm3, [uqcq+sizeq*4+32]
  packssdw  xm3, [uqcq+sizeq*4+48]

  add       sizeq, 16

  ; individual errors are max. 15bit+sign, so squares are 30bit, and
  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)

  psubw     xm0, xm2
  pmaddwd   xm2, xm2
  pmaddwd   xm0, xm0

  psubw     xm1, xm3
  pmaddwd   xm3, xm3
  pmaddwd   xm1, xm1

  ; accumulate in 64bit
  punpckldq xm7, xm0, xm5
  punpckhdq xm0, xm5
  paddq     xm4, xm7

  punpckldq xm7, xm2, xm5
  punpckhdq xm2, xm5
  paddq     xm6, xm7

  punpckldq xm7, xm1, xm5
  punpckhdq xm1, xm5
  paddq     xm4, xm7

  punpckldq xm7, xm3, xm5
  punpckhdq xm3, xm5
  paddq     xm6, xm7

  paddq     xm4, xm0
  paddq     xm4, xm1
  paddq     xm6, xm2
  paddq     xm6, xm3

  jnz      .loophp

  ; Accumulate horizontally
  movhlps   xm5, xm4
  movhlps   xm7, xm6
  paddq     xm4, xm5
  paddq     xm6, xm7

  ; Store the return value
%if ARCH_X86_64
  movq      rax, xm4
  movq   [sszq], xm6
%else
  movd      eax, xm4
  pextrd    edx, xm4, 1
  movq   [sszd], xm6
%endif
  RET

END
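The 64-bit accumulation in .loophp widens each group of four dword lanes by interleaving them with a zero register before adding. An intrinsics paraphrase of one such step (illustrative only; the squares are non-negative, so zero extension is correct):

#include <immintrin.h>

/* Paraphrase of one widening step in .loophp: interleave four 32-bit lanes
 * with zero to form two pairs of 64-bit lanes, then add both into a 64-bit
 * accumulator. */
static __m128i accumulate_widen_sketch(__m128i acc, __m128i dwords) {
  const __m128i zero = _mm_setzero_si128();
  __m128i lo = _mm_unpacklo_epi32(dwords, zero);  /* punpckldq xm7, xmN, xm5 */
  __m128i hi = _mm_unpackhi_epi32(dwords, zero);  /* punpckhdq xmN, xm5 */
  acc = _mm_add_epi64(acc, lo);
  return _mm_add_epi64(acc, hi);
}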
vp9/vp9cx.mk

@@ -102,6 +102,7 @@ ifeq ($(CONFIG_USE_X86INC),yes)
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_error_sse2.asm
+VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_highbd_error_avx.asm
 else
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
 endif