Add save/restore xmm registers in x86 assembly code

Went through the x86 assembly and added SAVE_XMM/RESTORE_XMM to the
functions that use xmm6 or xmm7. Verified on Windows.

Where possible, remove dependencies on xmm6 and xmm7.

The current code relies on rbp having been pushed to the stack to get
16-byte alignment. This broke when rbp wasn't pushed
(vp8/encoder/x86/sad_sse3.asm). Work around this by using unaligned
memory accesses (movdqu instead of movdqa) in SAVE_XMM/RESTORE_XMM.
Revisit this and the stack offsets in vp8/encoder/x86/sad_sse3.asm in
another change to SAVE_XMM.

Change-Id: I5f940994d3ebfd977c3d68446cef20fd78b07877
This commit is contained in:
Johann 2011-04-07 13:17:22 -04:00
Родитель d889035fe6
Коммит c7cfde42a9
9 изменённых файлов: 120 добавлений и 78 удалений

Просмотреть файл

@ -32,9 +32,6 @@ sym(idct_dequant_0_2x_sse2):
mov rdx, arg(1) ; dequant
mov rax, arg(0) ; qcoeff
; Zero out xmm7, for use unpacking
pxor xmm7, xmm7
movd xmm4, [rax]
movd xmm5, [rdx]
@ -43,9 +40,12 @@ sym(idct_dequant_0_2x_sse2):
pmullw xmm4, xmm5
; Zero out xmm5, for use unpacking
pxor xmm5, xmm5
; clear coeffs
movd [rax], xmm7
movd [rax+32], xmm7
movd [rax], xmm5
movd [rax+32], xmm5
;pshufb
pshuflw xmm4, xmm4, 00000000b
pshufhw xmm4, xmm4, 00000000b
@ -62,10 +62,10 @@ sym(idct_dequant_0_2x_sse2):
lea rcx, [3*rcx]
movq xmm3, [rax+rcx]
punpcklbw xmm0, xmm7
punpcklbw xmm1, xmm7
punpcklbw xmm2, xmm7
punpcklbw xmm3, xmm7
punpcklbw xmm0, xmm5
punpcklbw xmm1, xmm5
punpcklbw xmm2, xmm5
punpcklbw xmm3, xmm5
mov rax, arg(3) ; dst
movsxd rdx, dword ptr arg(4) ; dst_stride
@ -77,10 +77,10 @@ sym(idct_dequant_0_2x_sse2):
paddw xmm3, xmm4
; pack up before storing
packuswb xmm0, xmm7
packuswb xmm1, xmm7
packuswb xmm2, xmm7
packuswb xmm3, xmm7
packuswb xmm0, xmm5
packuswb xmm1, xmm5
packuswb xmm2, xmm5
packuswb xmm3, xmm5
; store blocks back out
movq [rax], xmm0
@ -102,6 +102,7 @@ sym(idct_dequant_full_2x_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@ -347,6 +348,7 @@ sym(idct_dequant_full_2x_sse2):
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@ -377,8 +379,8 @@ sym(idct_dequant_dc_0_2x_sse2):
mov rdi, arg(3) ; dst
mov rdx, arg(5) ; dc
; Zero out xmm7, for use unpacking
pxor xmm7, xmm7
; Zero out xmm5, for use unpacking
pxor xmm5, xmm5
; load up 2 dc words here == 2*16 = doubleword
movd xmm4, [rdx]
@ -398,10 +400,10 @@ sym(idct_dequant_dc_0_2x_sse2):
psraw xmm4, 3
; Predict buffer needs to be expanded from bytes to words
punpcklbw xmm0, xmm7
punpcklbw xmm1, xmm7
punpcklbw xmm2, xmm7
punpcklbw xmm3, xmm7
punpcklbw xmm0, xmm5
punpcklbw xmm1, xmm5
punpcklbw xmm2, xmm5
punpcklbw xmm3, xmm5
; Add to predict buffer
paddw xmm0, xmm4
@ -410,10 +412,10 @@ sym(idct_dequant_dc_0_2x_sse2):
paddw xmm3, xmm4
; pack up before storing
packuswb xmm0, xmm7
packuswb xmm1, xmm7
packuswb xmm2, xmm7
packuswb xmm3, xmm7
packuswb xmm0, xmm5
packuswb xmm1, xmm5
packuswb xmm2, xmm5
packuswb xmm3, xmm5
; Load destination stride before writing out,
; doesn't need to persist
@ -441,6 +443,7 @@ sym(idct_dequant_dc_full_2x_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@ -692,6 +695,7 @@ sym(idct_dequant_dc_full_2x_sse2):
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret

Просмотреть файл

@ -39,6 +39,7 @@ sym(vp8_filter_block1d8_h6_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@ -107,6 +108,7 @@ filter_block1d8_h6_rowloop_ssse3:
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@ -162,6 +164,7 @@ filter_block1d8_h4_rowloop_ssse3:
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@ -286,6 +289,7 @@ sym(vp8_filter_block1d4_h6_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@ -393,6 +397,7 @@ filter_block1d4_h4_rowloop_ssse3:
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@ -413,6 +418,7 @@ sym(vp8_filter_block1d16_v6_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@ -508,6 +514,7 @@ vp8_filter_block1d16_v6_ssse3_loop:
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@ -580,6 +587,7 @@ vp8_filter_block1d16_v4_ssse3_loop:
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@ -598,6 +606,7 @@ sym(vp8_filter_block1d8_v6_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@ -670,6 +679,7 @@ vp8_filter_block1d8_v6_ssse3_loop:
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@ -718,6 +728,7 @@ vp8_filter_block1d8_v4_ssse3_loop:
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@ -808,6 +819,7 @@ vp8_filter_block1d4_v6_ssse3_loop:
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret

Просмотреть файл

@ -33,6 +33,7 @@
%define input rcx
%define output rdx
%define pitch r8
SAVE_XMM
%else
%define input rdi
%define output rsi
@ -53,6 +54,7 @@
pop rbp
%else
%ifidn __OUTPUT_FORMAT__,x64
RESTORE_XMM
%endif
%endif
ret

Просмотреть файл

@ -22,33 +22,33 @@ sym(vp8_block_error_xmm):
; end prologue
mov rsi, arg(0) ;coeff_ptr
mov rdi, arg(1) ;dcoef_ptr
movdqa xmm3, [rsi]
movdqa xmm4, [rdi]
movdqa xmm5, [rsi+16]
movdqa xmm0, [rsi]
movdqa xmm1, [rdi]
movdqa xmm6, [rdi+16]
psubw xmm3, xmm4
movdqa xmm2, [rsi+16]
movdqa xmm3, [rdi+16]
psubw xmm5, xmm6
pmaddwd xmm3, xmm3
pmaddwd xmm5, xmm5
psubw xmm0, xmm1
psubw xmm2, xmm3
paddd xmm3, xmm5
pmaddwd xmm0, xmm0
pmaddwd xmm2, xmm2
pxor xmm7, xmm7
movdqa xmm0, xmm3
paddd xmm0, xmm2
punpckldq xmm0, xmm7
punpckhdq xmm3, xmm7
pxor xmm5, xmm5
movdqa xmm1, xmm0
paddd xmm0, xmm3
movdqa xmm3, xmm0
punpckldq xmm0, xmm5
punpckhdq xmm1, xmm5
paddd xmm0, xmm1
movdqa xmm1, xmm0
psrldq xmm0, 8
paddd xmm0, xmm3
paddd xmm0, xmm1
movq rax, xmm0
@ -208,53 +208,54 @@ sym(vp8_mbblock_error_xmm_impl):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 3
SAVE_XMM ; 6
push rsi
push rdi
; end prolog
mov rsi, arg(0) ;coeff_ptr
pxor xmm7, xmm7
pxor xmm6, xmm6
mov rdi, arg(1) ;dcoef_ptr
pxor xmm2, xmm2
pxor xmm4, xmm4
movd xmm1, dword ptr arg(2) ;dc
por xmm1, xmm2
movd xmm5, dword ptr arg(2) ;dc
por xmm5, xmm4
pcmpeqw xmm1, xmm7
pcmpeqw xmm5, xmm6
mov rcx, 16
mberror_loop:
movdqa xmm3, [rsi]
movdqa xmm4, [rdi]
movdqa xmm0, [rsi]
movdqa xmm1, [rdi]
movdqa xmm5, [rsi+16]
movdqa xmm6, [rdi+16]
movdqa xmm2, [rsi+16]
movdqa xmm3, [rdi+16]
psubw xmm5, xmm6
pmaddwd xmm5, xmm5
psubw xmm2, xmm3
pmaddwd xmm2, xmm2
psubw xmm3, xmm4
pand xmm3, xmm1
psubw xmm0, xmm1
pand xmm0, xmm5
pmaddwd xmm3, xmm3
pmaddwd xmm0, xmm0
add rsi, 32
add rdi, 32
sub rcx, 1
paddd xmm2, xmm5
paddd xmm4, xmm2
paddd xmm2, xmm3
paddd xmm4, xmm0
jnz mberror_loop
movdqa xmm0, xmm2
punpckldq xmm0, xmm7
movdqa xmm0, xmm4
punpckldq xmm0, xmm6
punpckhdq xmm2, xmm7
paddd xmm0, xmm2
punpckhdq xmm4, xmm6
paddd xmm0, xmm4
movdqa xmm1, xmm0
psrldq xmm0, 8
@ -265,6 +266,7 @@ mberror_loop:
pop rdi
pop rsi
; begin epilog
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@ -342,7 +344,7 @@ sym(vp8_mbuverror_xmm_impl):
mov rdi, arg(1) ;d_ptr
mov rcx, 16
pxor xmm7, xmm7
pxor xmm3, xmm3
mbuverror_loop:
@ -352,7 +354,7 @@ mbuverror_loop:
psubw xmm1, xmm2
pmaddwd xmm1, xmm1
paddd xmm7, xmm1
paddd xmm3, xmm1
add rsi, 16
add rdi, 16
@ -361,7 +363,7 @@ mbuverror_loop:
jnz mbuverror_loop
pxor xmm0, xmm0
movdqa xmm1, xmm7
movdqa xmm1, xmm3
movdqa xmm2, xmm1
punpckldq xmm1, xmm0

Просмотреть файл

@ -21,6 +21,7 @@ sym(vp8_sad16x16_wmt):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
SAVE_XMM ; 6
push rsi
push rdi
; end prolog
@ -34,7 +35,7 @@ sym(vp8_sad16x16_wmt):
lea rcx, [rsi+rax*8]
lea rcx, [rcx+rax*8]
pxor xmm7, xmm7
pxor xmm6, xmm6
x16x16sad_wmt_loop:
@ -52,32 +53,33 @@ x16x16sad_wmt_loop:
punpcklbw xmm1, xmm3
psadbw xmm0, xmm1
movq xmm6, QWORD PTR [rsi+rax+8]
movq xmm2, QWORD PTR [rsi+rax+8]
movq xmm3, QWORD PTR [rdi+rdx+8]
lea rsi, [rsi+rax*2]
lea rdi, [rdi+rdx*2]
punpcklbw xmm4, xmm6
punpcklbw xmm4, xmm2
punpcklbw xmm5, xmm3
psadbw xmm4, xmm5
paddw xmm7, xmm0
paddw xmm7, xmm4
paddw xmm6, xmm0
paddw xmm6, xmm4
cmp rsi, rcx
jne x16x16sad_wmt_loop
movq xmm0, xmm7
psrldq xmm7, 8
movq xmm0, xmm6
psrldq xmm6, 8
paddw xmm0, xmm7
paddw xmm0, xmm6
movq rax, xmm0
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret

Просмотреть файл

@ -39,8 +39,9 @@
%define ref_stride r9
%define end_ptr r10
%define ret_var r11
%define result_ptr [rsp+8+4*8]
%define max_err [rsp+8+4*8]
%define result_ptr [rsp+40+4*8]
%define max_err [rsp+40+4*8]
SAVE_XMM
%else
%define src_ptr rdi
%define src_stride rsi
@ -72,6 +73,7 @@
pop rbp
%else
%ifidn __OUTPUT_FORMAT__,x64
RESTORE_XMM
%endif
%endif
ret
@ -113,7 +115,8 @@
%define r2_ptr r11
%define r3_ptr r8
%define ref_stride r9
%define result_ptr [rsp+16+4*8]
%define result_ptr [rsp+48+4*8]
SAVE_XMM
push rsi
LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
@ -151,6 +154,7 @@
%else
%ifidn __OUTPUT_FORMAT__,x64
pop rsi
RESTORE_XMM
%endif
%endif
ret

Просмотреть файл

@ -157,6 +157,7 @@ sym(vp8_sad16x16x3_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
SAVE_XMM
push rsi
push rdi
push rcx
@ -253,6 +254,7 @@ vp8_sad16x16x3_ssse3_store_off:
pop rcx
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@ -268,6 +270,7 @@ sym(vp8_sad16x8x3_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
SAVE_XMM
push rsi
push rdi
push rcx
@ -361,6 +364,7 @@ vp8_sad16x8x3_ssse3_store_off:
pop rcx
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret

Просмотреть файл

@ -85,6 +85,7 @@ sym(vp8_get16x16var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM
push rbx
push rsi
push rdi
@ -206,6 +207,7 @@ var16loop:
pop rdi
pop rsi
pop rbx
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@ -223,6 +225,7 @@ sym(vp8_get16x16pred_error_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@ -321,6 +324,7 @@ var16peloop:
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@ -341,6 +345,7 @@ sym(vp8_get8x8var_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@ -506,6 +511,7 @@ sym(vp8_get8x8var_sse2):
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@ -805,6 +811,7 @@ sym(vp8_half_horiz_vert_variance8x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@ -906,6 +913,7 @@ vp8_half_horiz_vert_variance8x_h_1:
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@ -1041,6 +1049,7 @@ sym(vp8_half_vert_variance8x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@ -1127,6 +1136,7 @@ vp8_half_vert_variance8x_h_1:
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
@ -1254,6 +1264,7 @@ sym(vp8_half_horiz_variance8x_h_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
SAVE_XMM
GET_GOT rbx
push rsi
push rdi
@ -1338,6 +1349,7 @@ vp8_half_horiz_variance8x_h_1:
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret

Просмотреть файл

@ -260,12 +260,12 @@
%ifidn __OUTPUT_FORMAT__,x64
%macro SAVE_XMM 0
sub rsp, 32
movdqa XMMWORD PTR [rsp], xmm6
movdqa XMMWORD PTR [rsp+16], xmm7
movdqu XMMWORD PTR [rsp], xmm6
movdqu XMMWORD PTR [rsp+16], xmm7
%endmacro
%macro RESTORE_XMM 0
movdqa xmm6, XMMWORD PTR [rsp]
movdqa xmm7, XMMWORD PTR [rsp+16]
movdqu xmm6, XMMWORD PTR [rsp]
movdqu xmm7, XMMWORD PTR [rsp+16]
add rsp, 32
%endmacro
%else