Unify subtract function used in VP8/9

This commit replaces the vp8_-prefixed subtract functions with the
common vpx_subtract_block function. It removes the redundant SIMD
optimization code and the corresponding unit tests.

Change-Id: I42e086c32c93c6125e452dcaa6ed04337fe028d9
This commit is contained in:
Jingning Han 2015-07-06 16:52:24 -07:00
Родитель 9cb3a13426
Коммит 0ede9f52b7
12 изменённых файлов: 24 добавлений и 847 удалений

Просмотреть файл

@ -1,123 +0,0 @@
/*
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "./vpx_config.h"
#include "./vp8_rtcd.h"
#include "vp8/common/blockd.h"
#include "vp8/encoder/block.h"
#include "vpx_mem/vpx_mem.h"
typedef void (*SubtractBlockFunc)(BLOCK *be, BLOCKD *bd, int pitch);
namespace {
// Value-parameterized fixture: the parameter of each instantiation is the
// subtract implementation (a SubtractBlockFunc) that the test body invokes.
class SubtractBlockTest : public ::testing::TestWithParam<SubtractBlockFunc> {
 public:
  // Calls libvpx_test::ClearSystemState() when a test case is torn down.
  virtual void TearDown() { libvpx_test::ClearSystemState(); }
};
using libvpx_test::ACMRandom;
// Checks a 4x4 subtract implementation. For every non-zero entry of
// kSrcStride the source and predictor blocks are filled with pseudo-random
// bytes, the function under test is invoked, and each output sample must
// satisfy source == difference + predictor.
TEST_P(SubtractBlockTest, SimpleSubtract) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  // in libvpx, this stride is always 16
  const int kDiffPredStride = 16;
  // Source strides to exercise; the trailing 0 ends the loop below.
  const int kSrcStride[] = {32, 16, 8, 4, 0};
  const int kBlockWidth = 4;
  const int kBlockHeight = 4;
  BLOCK be;
  BLOCKD bd;

  // Allocate... align to 16 for mmx/sse tests. The source buffer is sized
  // for kSrcStride[0] and reused for every stride in the list.
  uint8_t *source = reinterpret_cast<uint8_t*>(
      vpx_memalign(16, kBlockHeight * kSrcStride[0] * sizeof(*source)));
  be.src_diff = reinterpret_cast<int16_t*>(
      vpx_memalign(16, kBlockHeight * kDiffPredStride * sizeof(*be.src_diff)));
  bd.predictor = reinterpret_cast<unsigned char*>(
      vpx_memalign(16, kBlockHeight * kDiffPredStride * sizeof(*bd.predictor)));

  for (const int *stride = kSrcStride; *stride > 0; ++stride) {
    // start at block0
    be.src = 0;
    be.base_src = &source;
    be.src_stride = *stride;

    // Pre-fill the difference block with the 0xa5a5 pattern.
    for (int r = 0; r < kBlockHeight; ++r) {
      for (int c = 0; c < kBlockWidth; ++c) {
        be.src_diff[r * kDiffPredStride + c] = static_cast<int16_t>(0xa5a5u);
      }
    }

    // Random source pixels; these are drawn before the predictor pixels.
    for (int r = 0; r < kBlockHeight; ++r) {
      for (int c = 0; c < kBlockWidth; ++c) {
        (*be.base_src)[r * be.src_stride + c] = rnd.Rand8();
      }
    }

    // Random predictor pixels.
    for (int r = 0; r < kBlockHeight; ++r) {
      for (int c = 0; c < kBlockWidth; ++c) {
        bd.predictor[r * kDiffPredStride + c] = rnd.Rand8();
      }
    }

    ASM_REGISTER_STATE_CHECK(GetParam()(&be, &bd, kDiffPredStride));

    // Every sample must reconstruct: source == difference + predictor.
    for (int r = 0; r < kBlockHeight; ++r) {
      const uint8_t *base_src = *be.base_src + r * be.src_stride;
      const int16_t *src_diff = be.src_diff + r * kDiffPredStride;
      const uint8_t *predictor = bd.predictor + r * kDiffPredStride;
      for (int c = 0; c < kBlockWidth; ++c) {
        EXPECT_EQ(base_src[c], (src_diff[c] + predictor[c])) << "r = " << r
                                                             << ", c = " << c;
      }
    }
  }

  vpx_free(be.src_diff);
  vpx_free(source);
  vpx_free(bd.predictor);
}
// Test-suite registrations. The suite is always instantiated for
// vp8_subtract_b_c; each of the NEON, MMX and SSE2 variants is registered
// only when the matching HAVE_* configuration macro is non-zero.
INSTANTIATE_TEST_CASE_P(C, SubtractBlockTest,
::testing::Values(vp8_subtract_b_c));
#if HAVE_NEON
// vp8_subtract_b_neon
INSTANTIATE_TEST_CASE_P(NEON, SubtractBlockTest,
::testing::Values(vp8_subtract_b_neon));
#endif
#if HAVE_MMX
// vp8_subtract_b_mmx
INSTANTIATE_TEST_CASE_P(MMX, SubtractBlockTest,
::testing::Values(vp8_subtract_b_mmx));
#endif
#if HAVE_SSE2
// vp8_subtract_b_sse2
INSTANTIATE_TEST_CASE_P(SSE2, SubtractBlockTest,
::testing::Values(vp8_subtract_b_sse2));
#endif
} // namespace

Просмотреть файл

@ -104,7 +104,6 @@ endif
LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC) += pp_filter_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += vp8_decrypt_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += subtract_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += quantize_test.cc

Просмотреть файл

@ -343,15 +343,6 @@ add_proto qw/int vp8_mbuverror/, "struct macroblock *mb";
specialize qw/vp8_mbuverror mmx sse2/;
$vp8_mbuverror_sse2=vp8_mbuverror_xmm;
add_proto qw/void vp8_subtract_b/, "struct block *be, struct blockd *bd, int pitch";
specialize qw/vp8_subtract_b mmx sse2 neon/;
add_proto qw/void vp8_subtract_mby/, "short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride";
specialize qw/vp8_subtract_mby mmx sse2 neon/;
add_proto qw/void vp8_subtract_mbuv/, "short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride";
specialize qw/vp8_subtract_mbuv mmx sse2 neon/;
#
# Motion search
#

Просмотреть файл

@ -1,154 +0,0 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "vp8/encoder/block.h"
/* Writes a 4x4 block of 16-bit differences (source minus predictor).
 *
 * be->base_src/be->src/be->src_stride locate the source rows, bd->predictor
 * holds the prediction, and be->src_diff receives the result. `pitch` is the
 * row step for both the predictor (in bytes) and the difference buffer (in
 * int16_t elements).
 *
 * Each vld1_u8 reads 8 bytes of a row although only the low four widened
 * results are stored. All eight loads are issued before the first store.
 */
void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch) {
  const unsigned char *const src_base = *be->base_src + be->src;
  const unsigned char *const pred_base = bd->predictor;
  const int src_stride = be->src_stride;
  int16_t *const diff_base = be->src_diff;
  uint8x8_t src_rows[4];
  uint8x8_t pred_rows[4];
  int r;

  /* Source rows first, then predictor rows. */
  for (r = 0; r < 4; ++r) {
    src_rows[r] = vld1_u8(src_base + r * src_stride);
  }
  for (r = 0; r < 4; ++r) {
    pred_rows[r] = vld1_u8(pred_base + r * pitch);
  }

  /* Widening subtract, then store the low four lanes of each row. */
  for (r = 0; r < 4; ++r) {
    const uint16x8_t row_diff = vsubl_u8(src_rows[r], pred_rows[r]);
    vst1_u16((uint16_t *)(diff_base + r * pitch), vget_low_u16(row_diff));
  }
}
/* Computes the 16x16 difference block diff = src - pred.
 *
 * `diff` is written contiguously (16 int16_t per row, 256 in total). `src`
 * and `pred` advance by src_stride and pred_stride bytes per row. Two rows
 * are handled per loop iteration; both source rows and both predictor rows
 * are loaded before any result of that iteration is stored.
 */
void vp8_subtract_mby_neon(int16_t *diff, unsigned char *src, int src_stride,
                           unsigned char *pred, int pred_stride) {
  int row;

  for (row = 0; row < 16; row += 2) {
    const uint8x16_t src0 = vld1q_u8(src);
    const uint8x16_t src1 = vld1q_u8(src + src_stride);
    const uint8x16_t pred0 = vld1q_u8(pred);
    const uint8x16_t pred1 = vld1q_u8(pred + pred_stride);
    uint16_t *const out = (uint16_t *)diff;

    /* Each 16-pixel row is widened as a low half and a high half. */
    vst1q_u16(out, vsubl_u8(vget_low_u8(src0), vget_low_u8(pred0)));
    vst1q_u16(out + 8, vsubl_u8(vget_high_u8(src0), vget_high_u8(pred0)));
    vst1q_u16(out + 16, vsubl_u8(vget_low_u8(src1), vget_low_u8(pred1)));
    vst1q_u16(out + 24, vsubl_u8(vget_high_u8(src1), vget_high_u8(pred1)));

    src += 2 * src_stride;
    pred += 2 * pred_stride;
    diff += 32;
  }
}
/* Computes the two 8x8 chroma difference blocks.
 *
 * Output starts at diff + 256: the 64 U differences (usrc - upred) are
 * written first, immediately followed by the 64 V differences
 * (vsrc - vpred), 8 int16_t per row. Source rows advance by src_stride
 * bytes and predictor rows by pred_stride bytes.
 */
void vp8_subtract_mbuv_neon(int16_t *diff, unsigned char *usrc,
                            unsigned char *vsrc, int src_stride,
                            unsigned char *upred, unsigned char *vpred,
                            int pred_stride) {
  int plane, group, k;

  diff += 256;

  for (plane = 0; plane < 2; ++plane) {
    /* plane 0 is U, plane 1 is V. */
    const unsigned char *src_ptr = (plane == 0) ? usrc : vsrc;
    const unsigned char *pred_ptr = (plane == 0) ? upred : vpred;

    /* Two groups of four rows cover the eight rows of a plane. */
    for (group = 0; group < 2; ++group) {
      uint16x8_t row_diff[4];

      /* Load and subtract four rows before storing any of them. */
      for (k = 0; k < 4; ++k) {
        const uint8x8_t s = vld1_u8(src_ptr);
        const uint8x8_t p = vld1_u8(pred_ptr);
        row_diff[k] = vsubl_u8(s, p);
        src_ptr += src_stride;
        pred_ptr += pred_stride;
      }

      for (k = 0; k < 4; ++k) {
        vst1q_u16((uint16_t *)diff, row_diff[k]);
        diff += 8;
      }
    }
  }
}

Просмотреть файл

@ -8,6 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "./vpx_dsp_rtcd.h"
#include "vpx_config.h"
#include "vp8_rtcd.h"
@ -19,80 +20,29 @@
#include "vpx_mem/vpx_mem.h"
#include "rdopt.h"
// TODO(jingning,johannkoenig): use vpx_subtract_block to replace
// codec specified vp9_subtract_ functions.
void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch)
{
unsigned char *src_ptr = (*(be->base_src) + be->src);
short *diff_ptr = be->src_diff;
unsigned char *pred_ptr = bd->predictor;
int src_stride = be->src_stride;
void vp8_subtract_b(BLOCK *be, BLOCKD *bd, int pitch) {
unsigned char *src_ptr = (*(be->base_src) + be->src);
short *diff_ptr = be->src_diff;
unsigned char *pred_ptr = bd->predictor;
int src_stride = be->src_stride;
int r, c;
for (r = 0; r < 4; r++)
{
for (c = 0; c < 4; c++)
{
diff_ptr[c] = src_ptr[c] - pred_ptr[c];
}
diff_ptr += pitch;
pred_ptr += pitch;
src_ptr += src_stride;
}
vpx_subtract_block(4, 4, diff_ptr, pitch, src_ptr, src_stride,
pred_ptr, pitch);
}
void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc,
void vp8_subtract_mbuv(short *diff, unsigned char *usrc, unsigned char *vsrc,
int src_stride, unsigned char *upred,
unsigned char *vpred, int pred_stride)
{
short *udiff = diff + 256;
short *vdiff = diff + 320;
unsigned char *vpred, int pred_stride) {
short *udiff = diff + 256;
short *vdiff = diff + 320;
int r, c;
for (r = 0; r < 8; r++)
{
for (c = 0; c < 8; c++)
{
udiff[c] = usrc[c] - upred[c];
}
udiff += 8;
upred += pred_stride;
usrc += src_stride;
}
for (r = 0; r < 8; r++)
{
for (c = 0; c < 8; c++)
{
vdiff[c] = vsrc[c] - vpred[c];
}
vdiff += 8;
vpred += pred_stride;
vsrc += src_stride;
}
vpx_subtract_block(8, 8, udiff, 8, usrc, src_stride, upred, pred_stride);
vpx_subtract_block(8, 8, vdiff, 8, vsrc, src_stride, vpred, pred_stride);
}
void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride,
unsigned char *pred, int pred_stride)
{
int r, c;
for (r = 0; r < 16; r++)
{
for (c = 0; c < 16; c++)
{
diff[c] = src[c] - pred[c];
}
diff += 16;
pred += pred_stride;
src += src_stride;
}
void vp8_subtract_mby(short *diff, unsigned char *src, int src_stride,
unsigned char *pred, int pred_stride) {
vpx_subtract_block(16, 16, diff, 16, src, src_stride, pred, pred_stride);
}
static void vp8_subtract_mb(MACROBLOCK *x)

Просмотреть файл

@ -19,6 +19,13 @@ extern "C" {
#endif
void vp8_encode_inter16x16(MACROBLOCK *x);
/* 4x4 subtract: source block located via be (base_src, src, src_stride)
 * minus bd->predictor, written to be->src_diff. `pitch` is used as the
 * stride of both the predictor and the difference buffer. */
void vp8_subtract_b(BLOCK *be, BLOCKD *bd, int pitch);
/* Two 8x8 chroma subtracts: U differences go to diff + 256 and V
 * differences to diff + 320, each stored with a stride of 8. */
void vp8_subtract_mbuv(short *diff, unsigned char *usrc, unsigned char *vsrc,
int src_stride, unsigned char *upred,
unsigned char *vpred, int pred_stride);
/* 16x16 luma subtract: src - pred written to diff with a stride of 16. */
void vp8_subtract_mby(short *diff, unsigned char *src, int src_stride,
unsigned char *pred, int pred_stride);
void vp8_build_dcblock(MACROBLOCK *b);
void vp8_transform_mb(MACROBLOCK *mb);
void vp8_transform_mbuv(MACROBLOCK *x);

Просмотреть файл

@ -1,223 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
; short *diff, unsigned char *Predictor,
; int pitch);
;
; Computes a 4x4 block of 16-bit differences. For each of four rows, four
; bytes at z and four bytes at Predictor are zero-extended to words,
; subtracted (z - Predictor) and stored to diff as four words.
; z steps by src_stride bytes per row; Predictor steps by pitch bytes and
; diff by pitch words (pitch*2 bytes) per row.
; NOTE(review): MMX registers are used and no emms is issued before ret;
; presumably the caller is responsible for clearing MMX state - confirm.
global sym(vp8_subtract_b_mmx_impl) PRIVATE
sym(vp8_subtract_b_mmx_impl):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
; end prolog
mov rdi, arg(2) ;diff
mov rax, arg(3) ;Predictor
mov rsi, arg(0) ;z
movsxd rdx, dword ptr arg(1);src_stride;
movsxd rcx, dword ptr arg(4);pitch
; mm7 = 0, the zero source for the byte->word unpacks below
pxor mm7, mm7
; row 0
movd mm0, [rsi]
movd mm1, [rax]
punpcklbw mm0, mm7
punpcklbw mm1, mm7
psubw mm0, mm1
movq [rdi], mm0
; row 1: diff offset is pitch words = pitch*2 bytes
movd mm0, [rsi+rdx]
movd mm1, [rax+rcx]
punpcklbw mm0, mm7
punpcklbw mm1, mm7
psubw mm0, mm1
movq [rdi+rcx*2],mm0
; row 2
movd mm0, [rsi+rdx*2]
movd mm1, [rax+rcx*2]
punpcklbw mm0, mm7
punpcklbw mm1, mm7
psubw mm0, mm1
movq [rdi+rcx*4], mm0
; row 3: rsi = z + 2*src_stride, rcx = 3*pitch
lea rsi, [rsi+rdx*2]
lea rcx, [rcx+rcx*2]
movd mm0, [rsi+rdx]
movd mm1, [rax+rcx]
punpcklbw mm0, mm7
punpcklbw mm1, mm7
psubw mm0, mm1
movq [rdi+rcx*2], mm0
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret
;void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride,
;unsigned char *pred, int pred_stride)
;
; Computes 16 rows of 16 word differences (src - pred). diff is written
; contiguously, 32 bytes per row; src steps by src_stride and pred by
; pred_stride bytes per row. Each row is processed as two 8-byte halves.
global sym(vp8_subtract_mby_mmx) PRIVATE
sym(vp8_subtract_mby_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
; end prolog
mov rdi, arg(0) ;diff
mov rsi, arg(1) ;src
movsxd rdx, dword ptr arg(2);src_stride
mov rax, arg(3) ;pred
; rbx is saved here because it is used below to hold pred_stride
push rbx
movsxd rbx, dword ptr arg(4);pred_stride
; mm0 = 0, the zero source for the byte->word unpacks
pxor mm0, mm0
; rcx = row counter
mov rcx, 16
.submby_loop:
; pixels 0-7 of the row
movq mm1, [rsi]
movq mm3, [rax]
movq mm2, mm1
movq mm4, mm3
punpcklbw mm1, mm0
punpcklbw mm3, mm0
punpckhbw mm2, mm0
punpckhbw mm4, mm0
psubw mm1, mm3
psubw mm2, mm4
movq [rdi], mm1
movq [rdi+8], mm2
; pixels 8-15 of the row
movq mm1, [rsi+8]
movq mm3, [rax+8]
movq mm2, mm1
movq mm4, mm3
punpcklbw mm1, mm0
punpcklbw mm3, mm0
punpckhbw mm2, mm0
punpckhbw mm4, mm0
psubw mm1, mm3
psubw mm2, mm4
movq [rdi+16], mm1
movq [rdi+24], mm2
; advance to the next row of diff, pred and src
add rdi, 32
lea rax, [rax+rbx]
lea rsi, [rsi+rdx]
dec rcx
jnz .submby_loop
pop rbx
pop rdi
pop rsi
; begin epilog
UNSHADOW_ARGS
pop rbp
ret
;vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc,
; int src_stride, unsigned char *upred,
; unsigned char *vpred, int pred_stride)
;
; Computes two 8x8 blocks of word differences. Output starts at
; diff + 256 words: eight rows of (usrc - upred) are written first, then
; eight rows of (vsrc - vpred) continue directly after them. Each output
; row is 16 bytes; sources step by src_stride and predictors by pred_stride.
global sym(vp8_subtract_mbuv_mmx) PRIVATE
sym(vp8_subtract_mbuv_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
push rsi
push rdi
; end prolog
mov rdi, arg(0) ;diff
mov rsi, arg(1) ;usrc
movsxd rdx, dword ptr arg(3);src_stride;
mov rax, arg(4) ;upred
add rdi, 256*2 ;diff = diff + 256 (shorts)
; rcx = row counter for the U block
mov rcx, 8
; rbx is saved here because it is used below to hold pred_stride
push rbx
movsxd rbx, dword ptr arg(6);pred_stride
; mm7 = 0, the zero source for the byte->word unpacks
pxor mm7, mm7
.submbu_loop:
movq mm0, [rsi]
movq mm1, [rax]
movq mm3, mm0
movq mm4, mm1
punpcklbw mm0, mm7
punpcklbw mm1, mm7
punpckhbw mm3, mm7
punpckhbw mm4, mm7
psubw mm0, mm1
psubw mm3, mm4
movq [rdi], mm0
movq [rdi+8], mm3
add rdi, 16
add rsi, rdx
add rax, rbx
dec rcx
jnz .submbu_loop
; V block: rdi is not reset, so output continues after the U rows
mov rsi, arg(2) ;vsrc
mov rax, arg(5) ;vpred
mov rcx, 8
.submbv_loop:
movq mm0, [rsi]
movq mm1, [rax]
movq mm3, mm0
movq mm4, mm1
punpcklbw mm0, mm7
punpcklbw mm1, mm7
punpckhbw mm3, mm7
punpckhbw mm4, mm7
psubw mm0, mm1
psubw mm3, mm4
movq [rdi], mm0
movq [rdi+8], mm3
add rdi, 16
add rsi, rdx
add rax, rbx
dec rcx
jnz .submbv_loop
pop rbx
; begin epilog
pop rdi
pop rsi
UNSHADOW_ARGS
pop rbp
ret

Просмотреть файл

@ -1,245 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
;void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride,
; short *diff, unsigned char *Predictor,
; int pitch);
;
; Computes a 4x4 block of 16-bit differences (z - Predictor), one row of
; four words at a time. z steps by src_stride bytes per row; Predictor
; steps by pitch bytes and diff by pitch words (pitch*2 bytes) per row.
; Despite the _sse2 name, the body uses only MMX registers (mm0, mm1, mm7).
; NOTE(review): no emms is issued before ret; presumably the caller is
; responsible for clearing MMX state - confirm.
global sym(vp8_subtract_b_sse2_impl) PRIVATE
sym(vp8_subtract_b_sse2_impl):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
GET_GOT rbx
push rsi
push rdi
; end prolog
mov rdi, arg(2) ;diff
mov rax, arg(3) ;Predictor
mov rsi, arg(0) ;z
movsxd rdx, dword ptr arg(1);src_stride;
movsxd rcx, dword ptr arg(4);pitch
; mm7 = 0, the zero source for the byte->word unpacks below
pxor mm7, mm7
; row 0
movd mm0, [rsi]
movd mm1, [rax]
punpcklbw mm0, mm7
punpcklbw mm1, mm7
psubw mm0, mm1
movq MMWORD PTR [rdi], mm0
; row 1: diff offset is pitch words = pitch*2 bytes
movd mm0, [rsi+rdx]
movd mm1, [rax+rcx]
punpcklbw mm0, mm7
punpcklbw mm1, mm7
psubw mm0, mm1
movq MMWORD PTR [rdi+rcx*2], mm0
; row 2
movd mm0, [rsi+rdx*2]
movd mm1, [rax+rcx*2]
punpcklbw mm0, mm7
punpcklbw mm1, mm7
psubw mm0, mm1
movq MMWORD PTR [rdi+rcx*4], mm0
; row 3: rsi = z + 2*src_stride, rcx = 3*pitch
lea rsi, [rsi+rdx*2]
lea rcx, [rcx+rcx*2]
movd mm0, [rsi+rdx]
movd mm1, [rax+rcx]
punpcklbw mm0, mm7
punpcklbw mm1, mm7
psubw mm0, mm1
movq MMWORD PTR [rdi+rcx*2], mm0
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride,
;unsigned char *pred, int pred_stride)
;
; Computes 16 rows of 16 word differences (src - pred), two rows per loop
; iteration. diff is written contiguously (32 bytes per row).
; src, pred and diff are all accessed with movdqa, i.e. as aligned 16-byte
; loads/stores.
;
; Sign handling: psubb yields only the low 8 bits of each difference. Both
; operands are XORed with 0x80 so that the signed compare pcmpgtb orders
; them as unsigned values; the resulting mask is 0xFF where pred > src
; (negative difference) and 0x00 otherwise. Interleaving the low byte with
; that mask byte forms the sign-extended 16-bit difference.
global sym(vp8_subtract_mby_sse2) PRIVATE
sym(vp8_subtract_mby_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
GET_GOT rbx
push rsi
push rdi
; end prolog
mov rdi, arg(0) ;diff
mov rsi, arg(1) ;src
movsxd rdx, dword ptr arg(2);src_stride
mov rax, arg(3) ;pred
; xmm4 = sixteen 0x80 bytes; loaded before rbx is reused for pred_stride
movdqa xmm4, [GLOBAL(t80)]
push rbx
mov rcx, 8 ; do two lines at one time
movsxd rbx, dword ptr arg(4);pred_stride
.submby_loop:
; first row of the pair
movdqa xmm0, [rsi] ; src
movdqa xmm1, [rax] ; pred
movdqa xmm2, xmm0
psubb xmm0, xmm1
pxor xmm1, xmm4 ;convert to signed values
pxor xmm2, xmm4
pcmpgtb xmm1, xmm2 ; obtain sign information
movdqa xmm2, xmm0
punpcklbw xmm0, xmm1 ; put sign back to subtraction
punpckhbw xmm2, xmm1 ; put sign back to subtraction
; second row of the pair is loaded before the first row is stored
movdqa xmm3, [rsi + rdx]
movdqa xmm5, [rax + rbx]
lea rsi, [rsi+rdx*2]
lea rax, [rax+rbx*2]
movdqa [rdi], xmm0
movdqa [rdi +16], xmm2
movdqa xmm1, xmm3
psubb xmm3, xmm5
pxor xmm5, xmm4 ;convert to signed values
pxor xmm1, xmm4
pcmpgtb xmm5, xmm1 ; obtain sign information
movdqa xmm1, xmm3
punpcklbw xmm3, xmm5 ; put sign back to subtraction
punpckhbw xmm1, xmm5 ; put sign back to subtraction
movdqa [rdi +32], xmm3
movdqa [rdi +48], xmm1
; two output rows = 64 bytes
add rdi, 64
dec rcx
jnz .submby_loop
pop rbx
pop rdi
pop rsi
; begin epilog
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc,
; int src_stride, unsigned char *upred,
; unsigned char *vpred, int pred_stride)
;
; Computes two 8x8 blocks of word differences. Output starts at
; diff + 256 words: the (usrc - upred) rows are written first and the
; (vsrc - vpred) rows continue directly after them. Each loop iteration
; packs two 8-pixel rows into one xmm register (punpcklqdq) and produces
; 32 bytes of output. The sign byte of each difference is rebuilt with the
; 0x80 XOR + pcmpgtb mask: 0xFF where pred > src, 0x00 otherwise.
global sym(vp8_subtract_mbuv_sse2) PRIVATE
sym(vp8_subtract_mbuv_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
GET_GOT rbx
push rsi
push rdi
; end prolog
; xmm4 = sixteen 0x80 bytes; loaded before rbx is reused for pred_stride
movdqa xmm4, [GLOBAL(t80)]
mov rdi, arg(0) ;diff
mov rsi, arg(1) ;usrc
movsxd rdx, dword ptr arg(3);src_stride;
mov rax, arg(4) ;upred
add rdi, 256*2 ;diff = diff + 256 (shorts)
; rcx = iteration counter: 4 iterations x 2 rows = 8 rows
mov rcx, 4
push rbx
movsxd rbx, dword ptr arg(6);pred_stride
;u
.submbu_loop:
movq xmm0, [rsi] ; src
movq xmm2, [rsi+rdx] ; src -- next line
movq xmm1, [rax] ; pred
movq xmm3, [rax+rbx] ; pred -- next line
lea rsi, [rsi + rdx*2]
lea rax, [rax + rbx*2]
; combine the two rows: low qword = first row, high qword = next row
punpcklqdq xmm0, xmm2
punpcklqdq xmm1, xmm3
movdqa xmm2, xmm0
psubb xmm0, xmm1 ; subtraction with sign missed
pxor xmm1, xmm4 ;convert to signed values
pxor xmm2, xmm4
pcmpgtb xmm1, xmm2 ; obtain sign information
movdqa xmm2, xmm0
movdqa xmm3, xmm1
punpcklbw xmm0, xmm1 ; put sign back to subtraction
punpckhbw xmm2, xmm3 ; put sign back to subtraction
movdqa [rdi], xmm0 ; store difference
movdqa [rdi +16], xmm2 ; store difference
add rdi, 32
sub rcx, 1
jnz .submbu_loop
; V block: rdi is not reset, so output continues after the U rows
mov rsi, arg(2) ;vsrc
mov rax, arg(5) ;vpred
mov rcx, 4
;v
.submbv_loop:
movq xmm0, [rsi] ; src
movq xmm2, [rsi+rdx] ; src -- next line
movq xmm1, [rax] ; pred
movq xmm3, [rax+rbx] ; pred -- next line
lea rsi, [rsi + rdx*2]
lea rax, [rax + rbx*2]
punpcklqdq xmm0, xmm2
punpcklqdq xmm1, xmm3
movdqa xmm2, xmm0
psubb xmm0, xmm1 ; subtraction with sign missed
pxor xmm1, xmm4 ;convert to signed values
pxor xmm2, xmm4
pcmpgtb xmm1, xmm2 ; obtain sign information
movdqa xmm2, xmm0
movdqa xmm3, xmm1
punpcklbw xmm0, xmm1 ; put sign back to subtraction
punpckhbw xmm2, xmm3 ; put sign back to subtraction
movdqa [rdi], xmm0 ; store difference
movdqa [rdi +16], xmm2 ; store difference
add rdi, 32
sub rcx, 1
jnz .submbv_loop
pop rbx
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
; t80: sixteen bytes of 0x80, 16-byte aligned. Loaded into xmm4 by the
; SSE2 subtract routines and XORed into src/pred before pcmpgtb.
align 16
t80:
times 16 db 0x80

Просмотреть файл

@ -65,14 +65,3 @@ int vp8_mbuverror_mmx(MACROBLOCK *mb)
return vp8_mbuverror_mmx_impl(s_ptr, d_ptr);
}
/* Assembly routine operating on raw pointers and strides. */
void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
                             short *diff, unsigned char *predictor,
                             int pitch);

/* Adapter from the BLOCK/BLOCKD interface to vp8_subtract_b_mmx_impl: the
 * source pointer is *be->base_src + be->src, the source stride is
 * be->src_stride, and the output and prediction buffers are be->src_diff
 * and bd->predictor. `pitch` is forwarded unchanged. */
void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
{
    vp8_subtract_b_mmx_impl(*(be->base_src) + be->src, be->src_stride,
                            &be->src_diff[0], &bd->predictor[0], pitch);
}

Просмотреть файл

@ -30,14 +30,3 @@ int vp8_mbuverror_xmm(MACROBLOCK *mb)
return vp8_mbuverror_xmm_impl(s_ptr, d_ptr);
}
/* Assembly routine operating on raw pointers and strides. */
void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride,
                              short *diff, unsigned char *predictor,
                              int pitch);

/* Adapter from the BLOCK/BLOCKD interface to vp8_subtract_b_sse2_impl: the
 * source pointer is *be->base_src + be->src, the source stride is
 * be->src_stride, and the output and prediction buffers are be->src_diff
 * and bd->predictor. `pitch` is forwarded unchanged. */
void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)
{
    vp8_subtract_b_sse2_impl(*(be->base_src) + be->src, be->src_stride,
                             &be->src_diff[0], &bd->predictor[0], pitch);
}

Просмотреть файл

@ -82,7 +82,6 @@ VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.c
endif
VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/dct_mmx.asm
VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/subtract_mmx.asm
VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
@ -94,7 +93,6 @@ ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c
endif
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm

Просмотреть файл

@ -25,5 +25,4 @@ VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/walsh_v6$(ASM)
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/denoising_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/fastquantizeb_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/shortfdct_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/subtract_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_shortwalsh4x4_neon.c