Add prefetch before variance calculation
This improved encoding performance by 0.5% (good, speed 1) to 1.5% (good, speed 5).

Change-Id: I843d72a0d68a90b5f694adf770943e4a4618f50e
This commit is contained in:
Родитель
945dad277d
Коммит
d96ba65a23
|
@ -85,10 +85,9 @@ sym(vp8_get16x16var_sse2):
|
|||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 6
|
||||
GET_GOT rbx
|
||||
push rbx
|
||||
push rsi
|
||||
push rdi
|
||||
sub rsp, 16
|
||||
; end prolog
|
||||
|
||||
mov rsi, arg(0) ;[src_ptr]
|
||||
|
@ -97,6 +96,29 @@ sym(vp8_get16x16var_sse2):
|
|||
movsxd rax, DWORD PTR arg(1) ;[source_stride]
|
||||
movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
|
||||
|
||||
; Prefetch data
|
||||
lea rcx, [rax+rax*2]
|
||||
prefetcht0 [rsi]
|
||||
prefetcht0 [rsi+rax]
|
||||
prefetcht0 [rsi+rax*2]
|
||||
prefetcht0 [rsi+rcx]
|
||||
lea rbx, [rsi+rax*4]
|
||||
prefetcht0 [rbx]
|
||||
prefetcht0 [rbx+rax]
|
||||
prefetcht0 [rbx+rax*2]
|
||||
prefetcht0 [rbx+rcx]
|
||||
|
||||
lea rcx, [rdx+rdx*2]
|
||||
prefetcht0 [rdi]
|
||||
prefetcht0 [rdi+rdx]
|
||||
prefetcht0 [rdi+rdx*2]
|
||||
prefetcht0 [rdi+rcx]
|
||||
lea rbx, [rdi+rdx*4]
|
||||
prefetcht0 [rbx]
|
||||
prefetcht0 [rbx+rdx]
|
||||
prefetcht0 [rbx+rdx*2]
|
||||
prefetcht0 [rbx+rcx]
|
||||
|
||||
pxor xmm0, xmm0 ; clear xmm0 for unpack
|
||||
pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
|
||||
|
||||
|
@ -107,6 +129,9 @@ var16loop:
|
|||
movdqu xmm1, XMMWORD PTR [rsi]
|
||||
movdqu xmm2, XMMWORD PTR [rdi]
|
||||
|
||||
prefetcht0 [rsi+rax*8]
|
||||
prefetcht0 [rdi+rdx*8]
|
||||
|
||||
movdqa xmm3, xmm1
|
||||
movdqa xmm4, xmm2
|
||||
|
||||
|
@ -178,10 +203,9 @@ var16loop:
|
|||
|
||||
|
||||
; begin epilog
|
||||
add rsp, 16
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_GOT
|
||||
pop rbx
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
|
Загрузка…
Ссылка в новой задаче