Merge "Add SSE2 subtract functions"
This commit is contained in:
Коммит
0918747520
|
@ -55,7 +55,9 @@ extern prototype_submbuv(vp8_subtract_mbuv_mmx);
|
|||
extern prototype_berr(vp8_block_error_xmm);
|
||||
extern prototype_mberr(vp8_mbblock_error_xmm);
|
||||
extern prototype_mbuverr(vp8_mbuverror_xmm);
|
||||
|
||||
extern prototype_subb(vp8_subtract_b_sse2);
|
||||
extern prototype_submby(vp8_subtract_mby_sse2);
|
||||
extern prototype_submbuv(vp8_subtract_mbuv_sse2);
|
||||
|
||||
#if !CONFIG_RUNTIME_CPU_DETECT
|
||||
#undef vp8_encodemb_berr
|
||||
|
@ -67,6 +69,15 @@ extern prototype_mbuverr(vp8_mbuverror_xmm);
|
|||
#undef vp8_encodemb_mbuverr
|
||||
#define vp8_encodemb_mbuverr vp8_mbuverror_xmm
|
||||
|
||||
#undef vp8_encodemb_subb
|
||||
#define vp8_encodemb_subb vp8_subtract_b_sse2
|
||||
|
||||
#undef vp8_encodemb_submby
|
||||
#define vp8_encodemb_submby vp8_subtract_mby_sse2
|
||||
|
||||
#undef vp8_encodemb_submbuv
|
||||
#define vp8_encodemb_submbuv vp8_subtract_mbuv_sse2
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
|
|
@ -12,7 +12,7 @@
|
|||
%include "vpx_ports/x86_abi_support.asm"
|
||||
|
||||
;void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
|
||||
; unsigned short *diff, unsigned char *Predictor,
|
||||
; short *diff, unsigned char *Predictor,
|
||||
; int pitch);
|
||||
global sym(vp8_subtract_b_mmx_impl)
|
||||
sym(vp8_subtract_b_mmx_impl):
|
||||
|
|
|
@ -0,0 +1,348 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
%include "vpx_ports/x86_abi_support.asm"
|
||||
|
||||
;void vp8_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
;                              short *diff, unsigned char *Predictor,
;                              int pitch);
;
; Computes a 4x4 block of 16-bit residuals:
;     diff[r*pitch + c] = z[r*src_stride + c] - Predictor[r*pitch + c]
; for r, c in 0..3.  Each row reads 4 source bytes and 4 predictor bytes,
; zero-extends them to words, subtracts, and stores 4 shorts (8 bytes).
;
; Register roles:
;     rsi = z (source pixels)      rdx = src_stride (sign-extended)
;     rax = Predictor              rcx = pitch (sign-extended), later 3*pitch
;     rdi = diff                   xmm2 = all zeros (for byte->word unpack)
;
; Only xmm0-xmm2 are used.  No MMX register is touched, so the x87/MMX
; state is left untouched and the caller does not need an EMMS afterwards.
global sym(vp8_subtract_b_sse2_impl)
sym(vp8_subtract_b_sse2_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov         rdi,        arg(2)              ;diff
    mov         rax,        arg(3)              ;Predictor
    mov         rsi,        arg(0)              ;z
    movsxd      rdx,        dword ptr arg(1)    ;src_stride
    movsxd      rcx,        dword ptr arg(4)    ;pitch
    pxor        xmm2,       xmm2                ; zero, used to widen bytes to words

    ; row 0
    movd        xmm0,       [rsi]
    movd        xmm1,       [rax]
    punpcklbw   xmm0,       xmm2
    punpcklbw   xmm1,       xmm2
    psubw       xmm0,       xmm1
    movq        [rdi],      xmm0                ; store low 4 words only

    ; row 1
    movd        xmm0,       [rsi+rdx]
    movd        xmm1,       [rax+rcx]
    punpcklbw   xmm0,       xmm2
    punpcklbw   xmm1,       xmm2
    psubw       xmm0,       xmm1
    movq        [rdi+rcx*2], xmm0               ; diff + pitch shorts

    ; row 2
    movd        xmm0,       [rsi+rdx*2]
    movd        xmm1,       [rax+rcx*2]
    punpcklbw   xmm0,       xmm2
    punpcklbw   xmm1,       xmm2
    psubw       xmm0,       xmm1
    movq        [rdi+rcx*4], xmm0               ; diff + 2*pitch shorts

    ; row 3: there is no *3 scale factor, so advance the source by two
    ; rows and turn rcx into 3*pitch
    lea         rsi,        [rsi+rdx*2]
    lea         rcx,        [rcx+rcx*2]

    movd        xmm0,       [rsi+rdx]           ; z + 3*src_stride
    movd        xmm1,       [rax+rcx]           ; Predictor + 3*pitch
    punpcklbw   xmm0,       xmm2
    punpcklbw   xmm1,       xmm2
    psubw       xmm0,       xmm1
    movq        [rdi+rcx*2], xmm0               ; diff + 3*pitch shorts

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
|
||||
|
||||
|
||||
;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, unsigned char *pred, int stride)
;
; Computes 16 rows of 16 residuals, diff = src - pred, as signed 16-bit
; values.  Two rows are processed per loop iteration (8 iterations):
;     src  advances by 2*stride bytes per iteration,
;     pred advances by 32 bytes (16 predictor bytes per row),
;     diff advances by 64 bytes (16 shorts per row).
;
; Method: psubb yields the low 8 bits of each difference.  The high byte
; is 0xFF when pred > src (unsigned) and 0x00 otherwise; it is obtained by
; flipping the top bit of both operands (xor with 0x80) so that the signed
; pcmpgtb behaves as an unsigned compare.  Interleaving low bytes with
; those masks produces the sign-extended 16-bit result.
;
; All loads and stores use movdqa, so src, src+stride, pred and diff must
; be 16-byte aligned.
;
; Register roles:
;     rsi = src    rax = pred    rdi = diff    rdx = stride    rcx = loop count
; Only xmm0-xmm5 are used; xmm6/xmm7 are deliberately avoided because they
; are callee-saved in the Microsoft x64 calling convention.
;
; NOTE(review): GLOBAL(t80) is referenced without a GET_GOT prologue;
; confirm against x86_abi_support.asm whether 32-bit PIC builds need one.
global sym(vp8_subtract_mby_sse2)
sym(vp8_subtract_mby_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(1)              ;src
    mov         rdi,        arg(0)              ;diff

    mov         rax,        arg(2)              ;pred
    movsxd      rdx,        dword ptr arg(3)    ;stride

    mov         rcx,        8                   ; 16 rows, two rows per iteration

submby_loop:
    ; ---- first row of the pair ----
    movdqa      xmm0,       [rsi]               ; src
    movdqa      xmm1,       [rax]               ; pred

    movdqa      xmm2,       xmm0
    psubb       xmm0,       xmm1                ; low 8 bits of src - pred

    pxor        xmm1,       [GLOBAL(t80)]       ; bias both so signed compare
    pxor        xmm2,       [GLOBAL(t80)]       ; acts as an unsigned compare
    pcmpgtb     xmm1,       xmm2                ; 0xFF where pred > src (negative diff)

    movdqa      xmm2,       xmm0
    punpcklbw   xmm0,       xmm1                ; pixels 0-7  -> signed words
    punpckhbw   xmm2,       xmm1                ; pixels 8-15 -> signed words

    movdqa      [rdi],      xmm0
    movdqa      [rdi+16],   xmm2

    ; ---- second row of the pair ----
    movdqa      xmm3,       [rsi+rdx]           ; src, next row
    movdqa      xmm4,       [rax+16]            ; pred, next 16 bytes

    movdqa      xmm5,       xmm3
    psubb       xmm3,       xmm4                ; low 8 bits of src - pred

    pxor        xmm4,       [GLOBAL(t80)]
    pxor        xmm5,       [GLOBAL(t80)]
    pcmpgtb     xmm4,       xmm5                ; 0xFF where pred > src

    movdqa      xmm5,       xmm3
    punpcklbw   xmm3,       xmm4                ; pixels 0-7  -> signed words
    punpckhbw   xmm5,       xmm4                ; pixels 8-15 -> signed words

    movdqa      [rdi+32],   xmm3
    movdqa      [rdi+48],   xmm5

    ; ---- advance to the next pair of rows ----
    add         rdi,        64
    add         rax,        32
    lea         rsi,        [rsi+rdx*2]

    sub         rcx,        1
    jnz         submby_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
|
||||
|
||||
|
||||
;void vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
;
; Writes the chroma residuals of a macroblock: an 8x8 U block followed by
; an 8x8 V block.  U output starts at diff + 256 shorts and uses
; pred + 256; V output starts 64 shorts / 64 predictor bytes after that.
; Source rows are `stride` bytes apart; predictor rows are packed 8 bytes
; apart, so two rows are fetched with a single 16-byte load.
;
; movdqa is used for every predictor load and diff store, so pred and diff
; must be 16-byte aligned.
;
; NOTE(review): GLOBAL(t80) is referenced without a GET_GOT prologue;
; confirm against x86_abi_support.asm whether 32-bit PIC builds need one.

; SUBTRACT_MBUV_ROW_PAIR srcA, srcB, pred_off, diff_off
;   %1 = address expression of the first 8-pixel source row
;   %2 = address expression of the second 8-pixel source row
;   %3 = byte offset from rax of the 16 predictor bytes for both rows
;   %4 = byte offset from rdi where the 16 output shorts are stored
; Packs both source rows into one register, subtracts the predictor
; bytewise, rebuilds the sign byte with a biased compare, and stores the
; sign-extended words.  Clobbers xmm0-xmm3.
%macro SUBTRACT_MBUV_ROW_PAIR 4
    movq        xmm0,       [%1]                ; first source row
    movq        xmm2,       [%2]                ; second source row
    movdqa      xmm1,       [rax + %3]          ; predictor for both rows
    punpcklqdq  xmm0,       xmm2                ; 16 source bytes in xmm0

    movdqa      xmm2,       xmm0
    psubb       xmm0,       xmm1                ; low 8 bits of src - pred

    pxor        xmm1,       [GLOBAL(t80)]       ; bias so the signed compare
    pxor        xmm2,       [GLOBAL(t80)]       ; orders the bytes as unsigned
    pcmpgtb     xmm1,       xmm2                ; 0xFF where pred > src

    movdqa      xmm2,       xmm0
    movdqa      xmm3,       xmm1
    punpcklbw   xmm0,       xmm1                ; first row  -> signed words
    punpckhbw   xmm2,       xmm3                ; second row -> signed words

    movdqa      [rdi + %4],      xmm0
    movdqa      [rdi + %4 + 16], xmm2
%endmacro

global sym(vp8_subtract_mbuv_sse2)
sym(vp8_subtract_mbuv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov         rdi,        arg(0)              ;diff
    mov         rax,        arg(3)              ;pred
    mov         rsi,        arg(1)              ;z = usrc
    add         rdi,        256*2               ;diff = diff + 256 (shorts)
    add         rax,        256                 ;Predictor = pred + 256
    movsxd      rdx,        dword ptr arg(4)    ;stride
    lea         rcx,        [rdx + rdx*2]       ;rcx = 3*stride

    ;---- U plane ----
    SUBTRACT_MBUV_ROW_PAIR  rsi,        rsi+rdx,  0,   0    ;rows 0 1
    SUBTRACT_MBUV_ROW_PAIR  rsi+rdx*2,  rsi+rcx,  16,  32   ;rows 2 3

    lea         rsi,        [rsi + rdx*4]       ;step down four source rows

    SUBTRACT_MBUV_ROW_PAIR  rsi,        rsi+rdx,  32,  64   ;rows 4 5
    SUBTRACT_MBUV_ROW_PAIR  rsi+rdx*2,  rsi+rcx,  48,  96   ;rows 6 7

    ;---- V plane ----
    mov         rsi,        arg(2)              ;z = vsrc
    add         rdi,        64*2                ;diff = diff + 320 (shorts)
    add         rax,        64                  ;Predictor = pred + 320

    SUBTRACT_MBUV_ROW_PAIR  rsi,        rsi+rdx,  0,   0    ;rows 0 1
    SUBTRACT_MBUV_ROW_PAIR  rsi+rdx*2,  rsi+rcx,  16,  32   ;rows 2 3

    lea         rsi,        [rsi + rdx*4]       ;step down four source rows

    SUBTRACT_MBUV_ROW_PAIR  rsi,        rsi+rdx,  32,  64   ;rows 4 5
    SUBTRACT_MBUV_ROW_PAIR  rsi+rdx*2,  rsi+rcx,  48,  96   ;rows 6 7

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
|
||||
|
||||
SECTION_RODATA
|
||||
align 16
|
||||
t80:
|
||||
times 16 db 0x80
|
|
@ -165,6 +165,18 @@ int vp8_mbuverror_xmm(MACROBLOCK *mb)
|
|||
return vp8_mbuverror_xmm_impl(s_ptr, d_ptr);
|
||||
}
|
||||
|
||||
void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride,
|
||||
short *diff, unsigned char *predictor,
|
||||
int pitch);
|
||||
void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)
{
    /* Unpack the block descriptors into the raw pointers and stride that
     * the assembly routine takes, then hand off to it. The source pixels
     * start at offset be->src within the buffer that be->base_src points
     * to; the residual goes to be->src_diff and the prediction comes from
     * bd->predictor.
     */
    unsigned char *pred_ptr = bd->predictor;
    short *residual = be->src_diff;
    unsigned int stride = be->src_stride;
    unsigned char *src_ptr = *(be->base_src) + be->src;

    vp8_subtract_b_sse2_impl(src_ptr, stride, residual, pred_ptr, pitch);
}
|
||||
|
||||
#endif
|
||||
|
||||
void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
|
||||
|
@ -282,12 +294,12 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
|
|||
cpi->rtcd.encodemb.berr = vp8_block_error_xmm;
|
||||
cpi->rtcd.encodemb.mberr = vp8_mbblock_error_xmm;
|
||||
cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_xmm;
|
||||
/* cpi->rtcd.encodemb.sub* not implemented for wmt */
|
||||
cpi->rtcd.encodemb.subb = vp8_subtract_b_sse2;
|
||||
cpi->rtcd.encodemb.submby = vp8_subtract_mby_sse2;
|
||||
cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_sse2;
|
||||
|
||||
/*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2;*/
|
||||
|
||||
cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse2;
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
@ -104,6 +104,7 @@ VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/variance_impl_sse2.asm
|
|||
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/sad_sse2.asm
|
||||
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
|
||||
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm
|
||||
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
|
||||
VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm
|
||||
VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm
|
||||
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
|
||||
|
|
Загрузка…
Ссылка в новой задаче