Fix decoder mismatch with ssse3 enabled

This patch fixes issue 661: "Decoder produces mismatched outputs
with ssse3 enabled and disabled." In the sub-pixel filters, each pixel
value is multiplied by a filter coefficient and the products are
summed. The order in which these products are added has to be
arranged carefully to prevent incorrect overflow of the intermediate sums.

Change-Id: Id08af4200fea9e1b896fc40157b8651c2c7e80f2
This commit is contained in:
Yunqing Wang 2013-11-19 14:29:25 -08:00
Родитель 69541e1dec
Коммит 3d50da5397
1 изменённых файлов: 37 добавлений и 19 удалений

Просмотреть файл

@ -11,17 +11,6 @@
%include "vpx_ports/x86_abi_support.asm"
;/************************************************************************************
; Notes: filter_block1d_h6 applies a 6-tap filter horizontally to the input pixels. The
; input pixel array has output_height rows. This routine assumes that output_height is an
; even number. This function handles 8 pixels in the horizontal direction, calculating ONE
; row each iteration to take advantage of 128-bit operations.
;
; This is an implementation of some of the SSE optimizations first seen in ffvp8
;
;*************************************************************************************/
%macro VERTx4 1
mov rdx, arg(5) ;filter ptr
mov rsi, arg(0) ;src_ptr
@ -81,11 +70,14 @@
pmaddubsw xmm4, k4k5
pmaddubsw xmm6, k6k7
movdqa xmm1, xmm2
paddsw xmm0, xmm6
paddsw xmm0, xmm2
pmaxsw xmm2, xmm4
pminsw xmm4, xmm1
paddsw xmm0, xmm4
paddsw xmm0, krd
paddsw xmm0, xmm2
paddsw xmm0, krd
psraw xmm0, 7
packuswb xmm0, xmm0
@ -538,14 +530,22 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
movdqa %2, %1
pshufb %1, [GLOBAL(shuf_t0t1)]
pshufb %2, [GLOBAL(shuf_t2t3)]
pmaddubsw %1, xmm6
pmaddubsw %2, xmm7
pmaddubsw %1, k0k1k4k5
pmaddubsw %2, k2k3k6k7
paddsw %1, %2
movdqa %2, %1
movdqa xmm4, %1
movdqa xmm5, %2
psrldq %1, 8
psrldq %2, 8
paddsw %1, %2
paddsw %1, xmm5
movdqa xmm6, xmm5
paddsw xmm4, %2
pmaxsw xmm5, %1
pminsw %1, xmm6
paddsw %1, xmm4
paddsw %1, xmm5
paddsw %1, krd
psraw %1, 7
packuswb %1, %1
%endm
@ -565,6 +565,10 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
pshufhw xmm7, xmm7, 11111111b ;k2_k3_k6_k7
pshufd xmm5, xmm5, 0 ;rounding
movdqa k0k1k4k5, xmm6
movdqa k2k3k6k7, xmm7
movdqa krd, xmm5
movsxd rax, dword ptr arg(1) ;src_pixels_per_line
movsxd rdx, dword ptr arg(3) ;output_pitch
movsxd rcx, dword ptr arg(4) ;output_height
@ -826,8 +830,15 @@ sym(vp9_filter_block1d4_h8_ssse3):
push rdi
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 3
%define k0k1k4k5 [rsp + 16 * 0]
%define k2k3k6k7 [rsp + 16 * 1]
%define krd [rsp + 16 * 2]
HORIZx4 0
add rsp, 16 * 3
; begin epilog
pop rdi
pop rsi
@ -932,8 +943,15 @@ sym(vp9_filter_block1d4_h8_avg_ssse3):
push rdi
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16 * 3
%define k0k1k4k5 [rsp + 16 * 0]
%define k2k3k6k7 [rsp + 16 * 1]
%define krd [rsp + 16 * 2]
HORIZx4 1
add rsp, 16 * 3
; begin epilog
pop rdi
pop rsi