From 3d50da5397d20abc932d81453b26cde758293a40 Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Tue, 19 Nov 2013 14:29:25 -0800 Subject: [PATCH] Fix decoder mismatch with ssse3 enabled This patch fixes issue 661: "Decoder produces mismatched outputs with ssse3 enabled and disabled." In the sub-pixel filters, each pixel value is multiplied by a filter coefficient, and the products are added up. The order in which these products are added has to be arranged carefully so that the intermediate saturating 16-bit additions do not overflow when the final sum is still in range. Change-Id: Id08af4200fea9e1b896fc40157b8651c2c7e80f2 --- vp9/common/x86/vp9_subpixel_8t_ssse3.asm | 56 ++++++++++++++++-------- 1 file changed, 37 insertions(+), 19 deletions(-) diff --git a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm index 7a5cca056..dbc17ec0f 100644 --- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm +++ b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm @@ -11,17 +11,6 @@ %include "vpx_ports/x86_abi_support.asm" -;/************************************************************************************ -; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The -; input pixel array has output_height rows. This routine assumes that output_height is an -; even number. This function handles 8 pixels in horizontal direction, calculating ONE -; rows each iteration to take advantage of the 128 bits operations. 
-; -; This is an implementation of some of the SSE optimizations first seen in ffvp8 -; -;*************************************************************************************/ - - %macro VERTx4 1 mov rdx, arg(5) ;filter ptr mov rsi, arg(0) ;src_ptr @@ -81,11 +70,14 @@ pmaddubsw xmm4, k4k5 pmaddubsw xmm6, k6k7 + movdqa xmm1, xmm2 paddsw xmm0, xmm6 - paddsw xmm0, xmm2 + pmaxsw xmm2, xmm4 + pminsw xmm4, xmm1 paddsw xmm0, xmm4 - paddsw xmm0, krd + paddsw xmm0, xmm2 + paddsw xmm0, krd psraw xmm0, 7 packuswb xmm0, xmm0 @@ -538,14 +530,22 @@ sym(vp9_filter_block1d16_v8_avg_ssse3): movdqa %2, %1 pshufb %1, [GLOBAL(shuf_t0t1)] pshufb %2, [GLOBAL(shuf_t2t3)] - pmaddubsw %1, xmm6 - pmaddubsw %2, xmm7 + pmaddubsw %1, k0k1k4k5 + pmaddubsw %2, k2k3k6k7 - paddsw %1, %2 - movdqa %2, %1 + movdqa xmm4, %1 + movdqa xmm5, %2 + psrldq %1, 8 psrldq %2, 8 - paddsw %1, %2 - paddsw %1, xmm5 + movdqa xmm6, xmm5 + + paddsw xmm4, %2 + pmaxsw xmm5, %1 + pminsw %1, xmm6 + paddsw %1, xmm4 + paddsw %1, xmm5 + + paddsw %1, krd psraw %1, 7 packuswb %1, %1 %endm @@ -565,6 +565,10 @@ sym(vp9_filter_block1d16_v8_avg_ssse3): pshufhw xmm7, xmm7, 11111111b ;k2_k3_k6_k7 pshufd xmm5, xmm5, 0 ;rounding + movdqa k0k1k4k5, xmm6 + movdqa k2k3k6k7, xmm7 + movdqa krd, xmm5 + movsxd rax, dword ptr arg(1) ;src_pixels_per_line movsxd rdx, dword ptr arg(3) ;output_pitch movsxd rcx, dword ptr arg(4) ;output_height @@ -826,8 +830,15 @@ sym(vp9_filter_block1d4_h8_ssse3): push rdi ; end prolog + ALIGN_STACK 16, rax + sub rsp, 16 * 3 + %define k0k1k4k5 [rsp + 16 * 0] + %define k2k3k6k7 [rsp + 16 * 1] + %define krd [rsp + 16 * 2] + HORIZx4 0 + add rsp, 16 * 3 ; begin epilog pop rdi pop rsi @@ -932,8 +943,15 @@ sym(vp9_filter_block1d4_h8_avg_ssse3): push rdi ; end prolog + ALIGN_STACK 16, rax + sub rsp, 16 * 3 + %define k0k1k4k5 [rsp + 16 * 0] + %define k2k3k6k7 [rsp + 16 * 1] + %define krd [rsp + 16 * 2] + HORIZx4 1 + add rsp, 16 * 3 ; begin epilog pop rdi pop rsi