Rewrite vp8_short_walsh4x4_sse2()
This rewrite reflects the changes made in the commit "Improve the accuracy of forward walsh-hadamard transform". Since this function is not called often, only a small encoder performance gain (~0.5%) is seen. Change-Id: Ie9df58a43028a11fd5b115c4bbe3141f7596578b
This commit is contained in:
Родитель
4db2076594
Коммит
fc94ffcea4
|
@ -21,94 +21,122 @@ sym(vp8_short_walsh4x4_sse2):
|
|||
push rdi
|
||||
; end prolog
|
||||
|
||||
mov rsi, arg(0)
|
||||
mov rdi, arg(1)
|
||||
mov rsi, arg(0) ; input
|
||||
mov rdi, arg(1) ; output
|
||||
movsxd rdx, dword ptr arg(2) ; pitch
|
||||
|
||||
movdqu xmm4, [rsi + 0] ;ip[4] ip[0]
|
||||
movdqu xmm0, [rsi + 16] ;ip[12] ip[8]
|
||||
; first for loop
|
||||
movq xmm0, MMWORD PTR [rsi] ; load input
|
||||
movq xmm1, MMWORD PTR [rsi + rdx]
|
||||
lea rsi, [rsi + rdx*2]
|
||||
movq xmm2, MMWORD PTR [rsi]
|
||||
movq xmm3, MMWORD PTR [rsi + rdx]
|
||||
|
||||
pxor xmm7, xmm7
|
||||
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
; 13 12 11 10 03 02 01 00
|
||||
;
|
||||
; 33 32 31 30 23 22 21 20
|
||||
;
|
||||
movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00
|
||||
punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00
|
||||
punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10
|
||||
movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00
|
||||
punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00
|
||||
punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02
|
||||
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
|
||||
movdqa xmm3, xmm4 ;ip[4] ip[0]
|
||||
punpcklwd xmm0, xmm1
|
||||
punpcklwd xmm2, xmm3
|
||||
|
||||
paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
|
||||
psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
|
||||
movdqa xmm1, xmm0
|
||||
punpckldq xmm0, xmm2 ; ip[1] ip[0]
|
||||
punpckhdq xmm1, xmm2 ; ip[3] ip[2]
|
||||
|
||||
movdqa xmm2, xmm0
|
||||
paddw xmm0, xmm1
|
||||
psubw xmm2, xmm1
|
||||
|
||||
psllw xmm0, 2 ; d1 a1
|
||||
psllw xmm2, 2 ; c1 b1
|
||||
|
||||
movdqa xmm1, xmm0
|
||||
punpcklqdq xmm0, xmm2 ; b1 a1
|
||||
punpckhqdq xmm1, xmm2 ; c1 d1
|
||||
|
||||
pxor xmm6, xmm6
|
||||
movq xmm6, xmm0
|
||||
pxor xmm7, xmm7
|
||||
pcmpeqw xmm7, xmm6
|
||||
paddw xmm7, [GLOBAL(c1)]
|
||||
|
||||
movdqa xmm2, xmm0
|
||||
paddw xmm0, xmm1 ; b1+c1 a1+d1
|
||||
psubw xmm2, xmm1 ; b1-c1 a1-d1
|
||||
paddw xmm0, xmm7 ; b1+c1 a1+d1+(a1!=0)
|
||||
|
||||
; second for loop
|
||||
; input: 13 9 5 1 12 8 4 0 (xmm0)
|
||||
; 14 10 6 2 15 11 7 3 (xmm2)
|
||||
; after shuffle:
|
||||
; 13 5 9 1 12 4 8 0 (xmm0)
|
||||
; 14 6 10 2 15 7 11 3 (xmm1)
|
||||
pshuflw xmm3, xmm0, 0xd8
|
||||
pshufhw xmm0, xmm3, 0xd8
|
||||
pshuflw xmm3, xmm2, 0xd8
|
||||
pshufhw xmm1, xmm3, 0xd8
|
||||
|
||||
movdqa xmm2, xmm0
|
||||
pmaddwd xmm0, [GLOBAL(c1)] ; d11 a11 d10 a10
|
||||
pmaddwd xmm2, [GLOBAL(cn1)] ; c11 b11 c10 b10
|
||||
movdqa xmm3, xmm1
|
||||
pmaddwd xmm1, [GLOBAL(c1)] ; d12 a12 d13 a13
|
||||
pmaddwd xmm3, [GLOBAL(cn1)] ; c12 b12 c13 b13
|
||||
|
||||
pshufd xmm4, xmm0, 0xd8 ; d11 d10 a11 a10
|
||||
pshufd xmm5, xmm2, 0xd8 ; c11 c10 b11 b10
|
||||
pshufd xmm6, xmm1, 0x72 ; d13 d12 a13 a12
|
||||
pshufd xmm7, xmm3, 0x72 ; c13 c12 b13 b12
|
||||
|
||||
movdqa xmm0, xmm4
|
||||
punpcklqdq xmm0, xmm5 ; b11 b10 a11 a10
|
||||
punpckhqdq xmm4, xmm5 ; c11 c10 d11 d10
|
||||
movdqa xmm1, xmm6
|
||||
punpcklqdq xmm1, xmm7 ; b13 b12 a13 a12
|
||||
punpckhqdq xmm6, xmm7 ; c13 c12 d13 d12
|
||||
|
||||
movdqa xmm2, xmm0
|
||||
paddd xmm0, xmm4 ; b21 b20 a21 a20
|
||||
psubd xmm2, xmm4 ; c21 c20 d21 d20
|
||||
movdqa xmm3, xmm1
|
||||
paddd xmm1, xmm6 ; b23 b22 a23 a22
|
||||
psubd xmm3, xmm6 ; c23 c22 d23 d22
|
||||
|
||||
pxor xmm4, xmm4
|
||||
movdqa xmm5, xmm4
|
||||
punpcklqdq xmm4, xmm3 ;d1 a1
|
||||
punpckhqdq xmm5, xmm3 ;c1 b1
|
||||
pcmpgtd xmm4, xmm0
|
||||
pcmpgtd xmm5, xmm2
|
||||
pand xmm4, [GLOBAL(cd1)]
|
||||
pand xmm5, [GLOBAL(cd1)]
|
||||
|
||||
movdqa xmm1, xmm5 ;c1 b1
|
||||
paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0]
|
||||
psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
|
||||
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
; 13 12 11 10 03 02 01 00
|
||||
;
|
||||
; 33 32 31 30 23 22 21 20
|
||||
;
|
||||
movdqa xmm0, xmm5 ; 13 12 11 10 03 02 01 00
|
||||
punpcklwd xmm5, xmm4 ; 23 03 22 02 21 01 20 00
|
||||
punpckhwd xmm0, xmm4 ; 33 13 32 12 31 11 30 10
|
||||
movdqa xmm1, xmm5 ; 23 03 22 02 21 01 20 00
|
||||
punpcklwd xmm5, xmm0 ; 31 21 11 01 30 20 10 00
|
||||
punpckhwd xmm1, xmm0 ; 33 23 13 03 32 22 12 02
|
||||
;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
pshufd xmm2, xmm1, 4eh ;ip[8] ip[12]
|
||||
movdqa xmm3, xmm5 ;ip[4] ip[0]
|
||||
pxor xmm6, xmm6
|
||||
movdqa xmm7, xmm6
|
||||
pcmpgtd xmm6, xmm1
|
||||
pcmpgtd xmm7, xmm3
|
||||
pand xmm6, [GLOBAL(cd1)]
|
||||
pand xmm7, [GLOBAL(cd1)]
|
||||
|
||||
paddw xmm5, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
|
||||
psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
|
||||
paddd xmm0, xmm4
|
||||
paddd xmm2, xmm5
|
||||
paddd xmm0, [GLOBAL(cd3)]
|
||||
paddd xmm2, [GLOBAL(cd3)]
|
||||
paddd xmm1, xmm6
|
||||
paddd xmm3, xmm7
|
||||
paddd xmm1, [GLOBAL(cd3)]
|
||||
paddd xmm3, [GLOBAL(cd3)]
|
||||
|
||||
movdqa xmm6, xmm5
|
||||
punpcklqdq xmm5, xmm3 ;d1 a1
|
||||
punpckhqdq xmm6, xmm3 ;c1 b1
|
||||
psrad xmm0, 3
|
||||
psrad xmm1, 3
|
||||
psrad xmm2, 3
|
||||
psrad xmm3, 3
|
||||
movdqa xmm4, xmm0
|
||||
punpcklqdq xmm0, xmm1 ; a23 a22 a21 a20
|
||||
punpckhqdq xmm4, xmm1 ; b23 b22 b21 b20
|
||||
movdqa xmm5, xmm2
|
||||
punpckhqdq xmm2, xmm3 ; c23 c22 c21 c20
|
||||
punpcklqdq xmm5, xmm3 ; d23 d22 d21 d20
|
||||
|
||||
movdqa xmm1, xmm6 ;c1 b1
|
||||
paddw xmm6, xmm5 ;dl+cl a1+b1 aka op[4] op[0]
|
||||
psubw xmm5, xmm1 ;d1-c1 a1-b1 aka op[12] op[8]
|
||||
packssdw xmm0, xmm4 ; b23 b22 b21 b20 a23 a22 a21 a20
|
||||
packssdw xmm2, xmm5 ; d23 d22 d21 d20 c23 c22 c21 c20
|
||||
|
||||
movdqa xmm0, xmm6 ;aka b2 a2
|
||||
movdqa xmm1, xmm5 ;aka d2 c2
|
||||
|
||||
pcmpgtw xmm0, xmm7
|
||||
pcmpgtw xmm1, xmm7
|
||||
|
||||
psrlw xmm0, 15
|
||||
psrlw xmm1, 15
|
||||
|
||||
paddw xmm6, xmm0
|
||||
paddw xmm5, xmm1
|
||||
|
||||
psraw xmm6, 1
|
||||
psraw xmm5, 1
|
||||
|
||||
; a2 = a1 + b1;
|
||||
; b2 = c1 + d1;
|
||||
; c2 = a1 - b1;
|
||||
; d2 = d1 - c1;
|
||||
; a2 += (a2>0);
|
||||
; b2 += (b2>0);
|
||||
; c2 += (c2>0);
|
||||
; d2 += (d2>0);
|
||||
; op[0] = (a2)>>1;
|
||||
; op[4] = (b2)>>1;
|
||||
; op[8] = (c2)>>1;
|
||||
; op[12]= (d2)>>1;
|
||||
|
||||
movdqu [rdi + 0], xmm6
|
||||
movdqu [rdi + 16], xmm5
|
||||
movdqa XMMWORD PTR [rdi], xmm0
|
||||
movdqa XMMWORD PTR [rdi + 16], xmm2
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
|
@ -116,3 +144,17 @@ sym(vp8_short_walsh4x4_sse2):
|
|||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
SECTION_RODATA
|
||||
align 16
|
||||
c1:
|
||||
dw 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001
|
||||
align 16
|
||||
cn1:
|
||||
dw 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff
|
||||
align 16
|
||||
cd1:
|
||||
dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
|
||||
align 16
|
||||
cd3:
|
||||
dd 0x00000003, 0x00000003, 0x00000003, 0x00000003
|
||||
|
|
|
@ -289,7 +289,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
|
|||
cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_sse2;
|
||||
cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_sse2;
|
||||
|
||||
cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c ;
|
||||
cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_sse2 ;
|
||||
|
||||
cpi->rtcd.encodemb.berr = vp8_block_error_xmm;
|
||||
cpi->rtcd.encodemb.mberr = vp8_mbblock_error_xmm;
|
||||
|
|
Загрузка…
Ссылка в новой задаче