Merge remote branch 'origin/master' into experimental
Change-Id: If53ec5c1219b31e5ef9ae552d9cc79432ebda267
This commit is contained in:
Коммит
d2a2d5a6d5
|
@ -40,7 +40,7 @@ sym(vp8_loop_filter_horizontal_edge_mmx):
|
|||
movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
|
||||
|
||||
movsxd rcx, dword ptr arg(5) ;count
|
||||
next8_h:
|
||||
.next8_h:
|
||||
mov rdx, arg(3) ;limit
|
||||
movq mm7, [rdx]
|
||||
mov rdi, rsi ; rdi points to row +1 for indirect addressing
|
||||
|
@ -211,7 +211,7 @@ next8_h:
|
|||
add rsi,8
|
||||
neg rax
|
||||
dec rcx
|
||||
jnz next8_h
|
||||
jnz .next8_h
|
||||
|
||||
add rsp, 32
|
||||
pop rsp
|
||||
|
@ -255,7 +255,7 @@ sym(vp8_loop_filter_vertical_edge_mmx):
|
|||
lea rsi, [rsi + rax*4 - 4]
|
||||
|
||||
movsxd rcx, dword ptr arg(5) ;count
|
||||
next8_v:
|
||||
.next8_v:
|
||||
mov rdi, rsi ; rdi points to row +1 for indirect addressing
|
||||
add rdi, rax
|
||||
|
||||
|
@ -581,7 +581,7 @@ next8_v:
|
|||
|
||||
lea rsi, [rsi+rax*8]
|
||||
dec rcx
|
||||
jnz next8_v
|
||||
jnz .next8_v
|
||||
|
||||
add rsp, 64
|
||||
pop rsp
|
||||
|
@ -622,7 +622,7 @@ sym(vp8_mbloop_filter_horizontal_edge_mmx):
|
|||
movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
|
||||
|
||||
movsxd rcx, dword ptr arg(5) ;count
|
||||
next8_mbh:
|
||||
.next8_mbh:
|
||||
mov rdx, arg(3) ;limit
|
||||
movq mm7, [rdx]
|
||||
mov rdi, rsi ; rdi points to row +1 for indirect addressing
|
||||
|
@ -898,7 +898,7 @@ next8_mbh:
|
|||
neg rax
|
||||
add rsi,8
|
||||
dec rcx
|
||||
jnz next8_mbh
|
||||
jnz .next8_mbh
|
||||
|
||||
add rsp, 32
|
||||
pop rsp
|
||||
|
@ -942,7 +942,7 @@ sym(vp8_mbloop_filter_vertical_edge_mmx):
|
|||
lea rsi, [rsi + rax*4 - 4]
|
||||
|
||||
movsxd rcx, dword ptr arg(5) ;count
|
||||
next8_mbv:
|
||||
.next8_mbv:
|
||||
lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
|
||||
|
||||
;transpose
|
||||
|
@ -1365,7 +1365,7 @@ next8_mbv:
|
|||
lea rsi, [rsi+rax*8]
|
||||
dec rcx
|
||||
|
||||
jnz next8_mbv
|
||||
jnz .next8_mbv
|
||||
|
||||
add rsp, 96
|
||||
pop rsp
|
||||
|
@ -1398,7 +1398,7 @@ sym(vp8_loop_filter_simple_horizontal_edge_mmx):
|
|||
movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
|
||||
|
||||
mov rcx, 2 ; count
|
||||
nexts8_h:
|
||||
.nexts8_h:
|
||||
mov rdx, arg(2) ;blimit ; get blimit
|
||||
movq mm3, [rdx] ;
|
||||
|
||||
|
@ -1483,7 +1483,7 @@ nexts8_h:
|
|||
add rsi,8
|
||||
neg rax
|
||||
dec rcx
|
||||
jnz nexts8_h
|
||||
jnz .nexts8_h
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
|
@ -1520,7 +1520,7 @@ sym(vp8_loop_filter_simple_vertical_edge_mmx):
|
|||
|
||||
lea rsi, [rsi + rax*4- 2]; ;
|
||||
mov rcx, 2 ; count
|
||||
nexts8_v:
|
||||
.nexts8_v:
|
||||
|
||||
lea rdi, [rsi + rax];
|
||||
movd mm0, [rdi + rax * 2] ; xx xx xx xx 73 72 71 70
|
||||
|
@ -1695,7 +1695,7 @@ nexts8_v:
|
|||
lea rsi, [rsi+rax*8] ; next 8
|
||||
|
||||
dec rcx
|
||||
jnz nexts8_v
|
||||
jnz .nexts8_v
|
||||
|
||||
add rsp, 32
|
||||
pop rsp
|
||||
|
|
|
@ -58,10 +58,10 @@ sym(vp8_post_proc_down_and_across_mmx):
|
|||
movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
|
||||
pxor mm0, mm0 ; mm0 = 00000000
|
||||
|
||||
nextrow:
|
||||
.nextrow:
|
||||
|
||||
xor rdx, rdx ; clear out rdx for use as loop counter
|
||||
nextcol:
|
||||
.nextcol:
|
||||
|
||||
pxor mm7, mm7 ; mm7 = 00000000
|
||||
movq mm6, [rbx + 32 ] ; mm6 = kernel 2 taps
|
||||
|
@ -146,7 +146,7 @@ nextcol:
|
|||
add rdx, 4
|
||||
|
||||
cmp edx, dword ptr arg(5) ;cols
|
||||
jl nextcol
|
||||
jl .nextcol
|
||||
; done with all the cols; start the across filtering in place
|
||||
sub rsi, rdx
|
||||
sub rdi, rdx
|
||||
|
@ -156,7 +156,7 @@ nextcol:
|
|||
xor rdx, rdx
|
||||
mov rax, [rdi-4];
|
||||
|
||||
acrossnextcol:
|
||||
.acrossnextcol:
|
||||
pxor mm7, mm7 ; mm7 = 00000000
|
||||
movq mm6, [rbx + 32 ] ;
|
||||
movq mm4, [rdi+rdx] ; mm4 = p0..p7
|
||||
|
@ -237,7 +237,7 @@ acrossnextcol:
|
|||
|
||||
add rdx, 4
|
||||
cmp edx, dword ptr arg(5) ;cols
|
||||
jl acrossnextcol;
|
||||
jl .acrossnextcol;
|
||||
|
||||
mov DWORD PTR [rdi+rdx-4], eax
|
||||
pop rax
|
||||
|
@ -249,7 +249,7 @@ acrossnextcol:
|
|||
movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; destination pitch?
|
||||
|
||||
dec rcx ; decrement count
|
||||
jnz nextrow ; next row
|
||||
jnz .nextrow ; next row
|
||||
pop rbx
|
||||
|
||||
; begin epilog
|
||||
|
@ -293,7 +293,7 @@ sym(vp8_mbpost_proc_down_mmx):
|
|||
add dword ptr arg(2), 8
|
||||
|
||||
;for(c=0; c<cols; c+=4)
|
||||
loop_col:
|
||||
.loop_col:
|
||||
mov rsi, arg(0) ;s
|
||||
pxor mm0, mm0 ;
|
||||
|
||||
|
@ -312,7 +312,7 @@ loop_col:
|
|||
|
||||
mov rcx, 15 ;
|
||||
|
||||
loop_initvar:
|
||||
.loop_initvar:
|
||||
movd mm1, DWORD PTR [rdi];
|
||||
punpcklbw mm1, mm0 ;
|
||||
|
||||
|
@ -329,10 +329,10 @@ loop_initvar:
|
|||
lea rdi, [rdi+rax] ;
|
||||
|
||||
dec rcx
|
||||
jne loop_initvar
|
||||
jne .loop_initvar
|
||||
;save the var and sum
|
||||
xor rdx, rdx
|
||||
loop_row:
|
||||
.loop_row:
|
||||
movd mm1, DWORD PTR [rsi] ; [s-pitch*8]
|
||||
movd mm2, DWORD PTR [rdi] ; [s+pitch*7]
|
||||
|
||||
|
@ -438,13 +438,13 @@ loop_row:
|
|||
add rdx, 1
|
||||
|
||||
cmp edx, dword arg(2) ;rows
|
||||
jl loop_row
|
||||
jl .loop_row
|
||||
|
||||
|
||||
add dword arg(0), 4 ; s += 4
|
||||
sub dword arg(3), 4 ; cols -= 4
|
||||
cmp dword arg(3), 0
|
||||
jg loop_col
|
||||
jg .loop_col
|
||||
|
||||
add rsp, 136
|
||||
pop rsp
|
||||
|
@ -475,7 +475,7 @@ sym(vp8_plane_add_noise_mmx):
|
|||
push rdi
|
||||
; end prolog
|
||||
|
||||
addnoise_loop:
|
||||
.addnoise_loop:
|
||||
call sym(rand) WRT_PLT
|
||||
mov rcx, arg(1) ;noise
|
||||
and rax, 0xff
|
||||
|
@ -492,7 +492,7 @@ addnoise_loop:
|
|||
mov rsi, arg(0) ;Pos
|
||||
xor rax,rax
|
||||
|
||||
addnoise_nextset:
|
||||
.addnoise_nextset:
|
||||
movq mm1,[rsi+rax] ; get the source
|
||||
|
||||
psubusb mm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise
|
||||
|
@ -506,12 +506,12 @@ addnoise_nextset:
|
|||
add rax,8 ; move to the next line
|
||||
|
||||
cmp rax, rcx
|
||||
jl addnoise_nextset
|
||||
jl .addnoise_nextset
|
||||
|
||||
movsxd rax, dword arg(7) ; Pitch
|
||||
add arg(0), rax ; Start += Pitch
|
||||
sub dword arg(6), 1 ; Height -= 1
|
||||
jg addnoise_loop
|
||||
jg .addnoise_loop
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
|
|
|
@ -57,10 +57,10 @@ sym(vp8_post_proc_down_and_across_xmm):
|
|||
movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
|
||||
pxor xmm0, xmm0 ; xmm0 = 0
|
||||
|
||||
nextrow:
|
||||
.nextrow:
|
||||
|
||||
xor rdx, rdx ; clear out rdx for use as loop counter
|
||||
nextcol:
|
||||
.nextcol:
|
||||
movq xmm3, QWORD PTR [rsi] ; xmm3 = r0 p0..p7
|
||||
punpcklbw xmm3, xmm0 ; xmm3 = p0..p7, bytes interleaved with xmm0
|
||||
movdqa xmm1, xmm3 ; xmm1 = copy of xmm3
|
||||
|
@ -133,7 +133,7 @@ nextcol:
|
|||
add rdx, 8
|
||||
cmp edx, dword arg(5) ;cols
|
||||
|
||||
jl nextcol
|
||||
jl .nextcol
|
||||
|
||||
; done with all the cols; start the across filtering in place
|
||||
sub rsi, rdx
|
||||
|
@ -142,7 +142,7 @@ nextcol:
|
|||
xor rdx, rdx
|
||||
movq mm0, QWORD PTR [rdi-8];
|
||||
|
||||
acrossnextcol:
|
||||
.acrossnextcol:
|
||||
movq xmm7, QWORD PTR [rdi +rdx -2]
|
||||
movd xmm4, DWORD PTR [rdi +rdx +6]
|
||||
|
||||
|
@ -219,7 +219,7 @@ acrossnextcol:
|
|||
|
||||
add rdx, 8
|
||||
cmp edx, dword arg(5) ;cols
|
||||
jl acrossnextcol;
|
||||
jl .acrossnextcol;
|
||||
|
||||
; last 8 pixels
|
||||
movq QWORD PTR [rdi+rdx-8], mm0
|
||||
|
@ -231,7 +231,7 @@ acrossnextcol:
|
|||
mov eax, dword arg(2) ;src_pixels_per_line ; destination pitch?
|
||||
|
||||
dec rcx ; decrement count
|
||||
jnz nextrow ; next row
|
||||
jnz .nextrow ; next row
|
||||
|
||||
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
|
||||
add rsp,16
|
||||
|
@ -282,7 +282,7 @@ sym(vp8_mbpost_proc_down_xmm):
|
|||
add dword arg(2), 8
|
||||
|
||||
;for(c=0; c<cols; c+=8)
|
||||
loop_col:
|
||||
.loop_col:
|
||||
mov rsi, arg(0) ; s
|
||||
pxor xmm0, xmm0 ;
|
||||
|
||||
|
@ -301,7 +301,7 @@ loop_col:
|
|||
|
||||
mov rcx, 15 ;
|
||||
|
||||
loop_initvar:
|
||||
.loop_initvar:
|
||||
movq xmm1, QWORD PTR [rdi];
|
||||
punpcklbw xmm1, xmm0 ;
|
||||
|
||||
|
@ -318,10 +318,10 @@ loop_initvar:
|
|||
lea rdi, [rdi+rax] ;
|
||||
|
||||
dec rcx
|
||||
jne loop_initvar
|
||||
jne .loop_initvar
|
||||
;save the var and sum
|
||||
xor rdx, rdx
|
||||
loop_row:
|
||||
.loop_row:
|
||||
movq xmm1, QWORD PTR [rsi] ; [s-pitch*8]
|
||||
movq xmm2, QWORD PTR [rdi] ; [s+pitch*7]
|
||||
|
||||
|
@ -428,12 +428,12 @@ loop_row:
|
|||
add rdx, 1
|
||||
|
||||
cmp edx, dword arg(2) ;rows
|
||||
jl loop_row
|
||||
jl .loop_row
|
||||
|
||||
add dword arg(0), 8 ; s += 8
|
||||
sub dword arg(3), 8 ; cols -= 8
|
||||
cmp dword arg(3), 0
|
||||
jg loop_col
|
||||
jg .loop_col
|
||||
|
||||
add rsp, 128+16
|
||||
pop rsp
|
||||
|
@ -475,13 +475,13 @@ sym(vp8_mbpost_proc_across_ip_xmm):
|
|||
|
||||
|
||||
;for(r=0;r<rows;r++)
|
||||
ip_row_loop:
|
||||
.ip_row_loop:
|
||||
|
||||
xor rdx, rdx ;sumsq=0;
|
||||
xor rcx, rcx ;sum=0;
|
||||
mov rsi, arg(0); s
|
||||
mov rdi, -8
|
||||
ip_var_loop:
|
||||
.ip_var_loop:
|
||||
;for(i=-8;i<=6;i++)
|
||||
;{
|
||||
; sumsq += s[i]*s[i];
|
||||
|
@ -493,7 +493,7 @@ ip_var_loop:
|
|||
add edx, eax
|
||||
add rdi, 1
|
||||
cmp rdi, 6
|
||||
jle ip_var_loop
|
||||
jle .ip_var_loop
|
||||
|
||||
|
||||
;mov rax, sumsq
|
||||
|
@ -513,7 +513,7 @@ ip_var_loop:
|
|||
pxor mm1, mm1
|
||||
|
||||
pxor xmm0, xmm0
|
||||
nextcol4:
|
||||
.nextcol4:
|
||||
|
||||
movd xmm1, DWORD PTR [rsi+rcx-8] ; -8 -7 -6 -5
|
||||
movd xmm2, DWORD PTR [rsi+rcx+7] ; +7 +8 +9 +10
|
||||
|
@ -600,7 +600,7 @@ nextcol4:
|
|||
add rcx, 4
|
||||
|
||||
cmp rcx, rdx
|
||||
jl nextcol4
|
||||
jl .nextcol4
|
||||
|
||||
;s+=pitch;
|
||||
movsxd rax, dword arg(1)
|
||||
|
@ -608,7 +608,7 @@ nextcol4:
|
|||
|
||||
sub dword arg(2), 1 ;rows-=1
|
||||
cmp dword arg(2), 0
|
||||
jg ip_row_loop
|
||||
jg .ip_row_loop
|
||||
|
||||
add rsp, 16
|
||||
pop rsp
|
||||
|
@ -640,7 +640,7 @@ sym(vp8_plane_add_noise_wmt):
|
|||
push rdi
|
||||
; end prolog
|
||||
|
||||
addnoise_loop:
|
||||
.addnoise_loop:
|
||||
call sym(rand) WRT_PLT
|
||||
mov rcx, arg(1) ;noise
|
||||
and rax, 0xff
|
||||
|
@ -657,7 +657,7 @@ addnoise_loop:
|
|||
mov rsi, arg(0) ;Pos
|
||||
xor rax,rax
|
||||
|
||||
addnoise_nextset:
|
||||
.addnoise_nextset:
|
||||
movdqu xmm1,[rsi+rax] ; get the source
|
||||
|
||||
psubusb xmm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise
|
||||
|
@ -671,12 +671,12 @@ addnoise_nextset:
|
|||
add rax,16 ; move to the next line
|
||||
|
||||
cmp rax, rcx
|
||||
jl addnoise_nextset
|
||||
jl .addnoise_nextset
|
||||
|
||||
movsxd rax, dword arg(7) ; Pitch
|
||||
add arg(0), rax ; Start += Pitch
|
||||
sub dword arg(6), 1 ; Height -= 1
|
||||
jg addnoise_loop
|
||||
jg .addnoise_loop
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
|
|
|
@ -503,7 +503,7 @@ sym(vp8_intra_pred_uv_tm_%1):
|
|||
mov rdi, arg(0) ;dst;
|
||||
movsxd rcx, dword ptr arg(1) ;dst_stride
|
||||
|
||||
vp8_intra_pred_uv_tm_%1_loop:
|
||||
.vp8_intra_pred_uv_tm_%1_loop:
|
||||
movd xmm3, [rsi]
|
||||
movd xmm5, [rsi+rax]
|
||||
%ifidn %1, sse2
|
||||
|
@ -525,7 +525,7 @@ vp8_intra_pred_uv_tm_%1_loop:
|
|||
lea rsi, [rsi+rax*2]
|
||||
lea rdi, [rdi+rcx*2]
|
||||
dec edx
|
||||
jnz vp8_intra_pred_uv_tm_%1_loop
|
||||
jnz .vp8_intra_pred_uv_tm_%1_loop
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
|
@ -615,7 +615,7 @@ sym(vp8_intra_pred_uv_ho_%1):
|
|||
%endif
|
||||
dec rsi
|
||||
%ifidn %1, mmx2
|
||||
vp8_intra_pred_uv_ho_%1_loop:
|
||||
.vp8_intra_pred_uv_ho_%1_loop:
|
||||
movd mm0, [rsi]
|
||||
movd mm1, [rsi+rax]
|
||||
punpcklbw mm0, mm0
|
||||
|
@ -627,7 +627,7 @@ vp8_intra_pred_uv_ho_%1_loop:
|
|||
lea rsi, [rsi+rax*2]
|
||||
lea rdi, [rdi+rcx*2]
|
||||
dec edx
|
||||
jnz vp8_intra_pred_uv_ho_%1_loop
|
||||
jnz .vp8_intra_pred_uv_ho_%1_loop
|
||||
%else
|
||||
movd xmm0, [rsi]
|
||||
movd xmm3, [rsi+rax]
|
||||
|
|
|
@ -50,7 +50,7 @@ sym(vp8_filter_block1d_h6_mmx):
|
|||
movsxd rax, dword ptr arg(5) ;output_width ; destination pitch?
|
||||
pxor mm0, mm0 ; mm0 = 00000000
|
||||
|
||||
nextrow:
|
||||
.nextrow:
|
||||
movq mm3, [rsi-2] ; mm3 = p-2..p5
|
||||
movq mm4, mm3 ; mm4 = p-2..p5
|
||||
psrlq mm3, 8 ; mm3 = p-1..p5
|
||||
|
@ -102,7 +102,7 @@ nextrow:
|
|||
%endif
|
||||
|
||||
dec rcx ; decrement count
|
||||
jnz nextrow ; next row
|
||||
jnz .nextrow ; next row
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
|
@ -152,7 +152,7 @@ sym(vp8_filter_block1dc_v6_mmx):
|
|||
pxor mm0, mm0 ; mm0 = 00000000
|
||||
|
||||
|
||||
nextrow_cv:
|
||||
.nextrow_cv:
|
||||
movq mm3, [rsi+rdx] ; mm3 = p0..p8 = row -1
|
||||
pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
|
||||
|
||||
|
@ -190,7 +190,7 @@ nextrow_cv:
|
|||
; avoidable!!!.
|
||||
lea rdi, [rdi+rax] ;
|
||||
dec rcx ; decrement count
|
||||
jnz nextrow_cv ; next row
|
||||
jnz .nextrow_cv ; next row
|
||||
|
||||
pop rbx
|
||||
|
||||
|
@ -282,7 +282,7 @@ sym(vp8_bilinear_predict8x8_mmx):
|
|||
packuswb mm7, mm4 ;
|
||||
|
||||
add rsi, rdx ; next line
|
||||
next_row_8x8:
|
||||
.next_row_8x8:
|
||||
movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
|
||||
movq mm4, mm3 ; make a copy of current line
|
||||
|
||||
|
@ -349,7 +349,7 @@ next_row_8x8:
|
|||
add rdi, r8 ;dst_pitch
|
||||
%endif
|
||||
cmp rdi, rcx ;
|
||||
jne next_row_8x8
|
||||
jne .next_row_8x8
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
|
@ -437,7 +437,7 @@ sym(vp8_bilinear_predict8x4_mmx):
|
|||
packuswb mm7, mm4 ;
|
||||
|
||||
add rsi, rdx ; next line
|
||||
next_row_8x4:
|
||||
.next_row_8x4:
|
||||
movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
|
||||
movq mm4, mm3 ; make a copy of current line
|
||||
|
||||
|
@ -504,7 +504,7 @@ next_row_8x4:
|
|||
add rdi, r8
|
||||
%endif
|
||||
cmp rdi, rcx ;
|
||||
jne next_row_8x4
|
||||
jne .next_row_8x4
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
|
@ -579,7 +579,7 @@ sym(vp8_bilinear_predict4x4_mmx):
|
|||
packuswb mm7, mm0 ;
|
||||
|
||||
add rsi, rdx ; next line
|
||||
next_row_4x4:
|
||||
.next_row_4x4:
|
||||
movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
|
||||
punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
|
||||
|
||||
|
@ -622,7 +622,7 @@ next_row_4x4:
|
|||
%endif
|
||||
|
||||
cmp rdi, rcx ;
|
||||
jne next_row_4x4
|
||||
jne .next_row_4x4
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
|
|
|
@ -55,7 +55,7 @@ sym(vp8_filter_block1d8_h6_sse2):
|
|||
%endif
|
||||
pxor xmm0, xmm0 ; clear xmm0 for unpack
|
||||
|
||||
filter_block1d8_h6_rowloop:
|
||||
.filter_block1d8_h6_rowloop:
|
||||
movq xmm3, MMWORD PTR [rsi - 2]
|
||||
movq xmm1, MMWORD PTR [rsi + 6]
|
||||
|
||||
|
@ -124,7 +124,7 @@ filter_block1d8_h6_rowloop:
|
|||
%endif
|
||||
dec rcx
|
||||
|
||||
jnz filter_block1d8_h6_rowloop ; next row
|
||||
jnz .filter_block1d8_h6_rowloop ; next row
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
|
@ -176,7 +176,7 @@ sym(vp8_filter_block1d16_h6_sse2):
|
|||
|
||||
pxor xmm0, xmm0 ; clear xmm0 for unpack
|
||||
|
||||
filter_block1d16_h6_sse2_rowloop:
|
||||
.filter_block1d16_h6_sse2_rowloop:
|
||||
movq xmm3, MMWORD PTR [rsi - 2]
|
||||
movq xmm1, MMWORD PTR [rsi + 6]
|
||||
|
||||
|
@ -301,7 +301,7 @@ filter_block1d16_h6_sse2_rowloop:
|
|||
%endif
|
||||
|
||||
dec rcx
|
||||
jnz filter_block1d16_h6_sse2_rowloop ; next row
|
||||
jnz .filter_block1d16_h6_sse2_rowloop ; next row
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
|
@ -356,7 +356,7 @@ sym(vp8_filter_block1d8_v6_sse2):
|
|||
movsxd r8, dword ptr arg(2) ; dst_pitch
|
||||
%endif
|
||||
|
||||
vp8_filter_block1d8_v6_sse2_loop:
|
||||
.vp8_filter_block1d8_v6_sse2_loop:
|
||||
movdqa xmm1, XMMWORD PTR [rsi]
|
||||
pmullw xmm1, [rax]
|
||||
|
||||
|
@ -396,7 +396,7 @@ vp8_filter_block1d8_v6_sse2_loop:
|
|||
add rdi, r8
|
||||
%endif
|
||||
dec rcx ; decrement count
|
||||
jnz vp8_filter_block1d8_v6_sse2_loop ; next row
|
||||
jnz .vp8_filter_block1d8_v6_sse2_loop ; next row
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
|
@ -448,7 +448,7 @@ sym(vp8_filter_block1d16_v6_sse2):
|
|||
movsxd r8, dword ptr arg(2) ; dst_pitch
|
||||
%endif
|
||||
|
||||
vp8_filter_block1d16_v6_sse2_loop:
|
||||
.vp8_filter_block1d16_v6_sse2_loop:
|
||||
; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
|
||||
movdqa xmm1, XMMWORD PTR [rsi + rdx] ; line 2
|
||||
movdqa xmm2, XMMWORD PTR [rsi + rdx + 16]
|
||||
|
@ -511,7 +511,7 @@ vp8_filter_block1d16_v6_sse2_loop:
|
|||
add rdi, r8
|
||||
%endif
|
||||
dec rcx ; decrement count
|
||||
jnz vp8_filter_block1d16_v6_sse2_loop ; next row
|
||||
jnz .vp8_filter_block1d16_v6_sse2_loop ; next row
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
|
@ -556,7 +556,7 @@ sym(vp8_filter_block1d8_h6_only_sse2):
|
|||
%endif
|
||||
pxor xmm0, xmm0 ; clear xmm0 for unpack
|
||||
|
||||
filter_block1d8_h6_only_rowloop:
|
||||
.filter_block1d8_h6_only_rowloop:
|
||||
movq xmm3, MMWORD PTR [rsi - 2]
|
||||
movq xmm1, MMWORD PTR [rsi + 6]
|
||||
|
||||
|
@ -624,7 +624,7 @@ filter_block1d8_h6_only_rowloop:
|
|||
%endif
|
||||
dec rcx
|
||||
|
||||
jnz filter_block1d8_h6_only_rowloop ; next row
|
||||
jnz .filter_block1d8_h6_only_rowloop ; next row
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
|
@ -670,7 +670,7 @@ sym(vp8_filter_block1d16_h6_only_sse2):
|
|||
|
||||
pxor xmm0, xmm0 ; clear xmm0 for unpack
|
||||
|
||||
filter_block1d16_h6_only_sse2_rowloop:
|
||||
.filter_block1d16_h6_only_sse2_rowloop:
|
||||
movq xmm3, MMWORD PTR [rsi - 2]
|
||||
movq xmm1, MMWORD PTR [rsi + 6]
|
||||
|
||||
|
@ -789,7 +789,7 @@ filter_block1d16_h6_only_sse2_rowloop:
|
|||
%endif
|
||||
|
||||
dec rcx
|
||||
jnz filter_block1d16_h6_only_sse2_rowloop ; next row
|
||||
jnz .filter_block1d16_h6_only_sse2_rowloop ; next row
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
|
@ -837,7 +837,7 @@ sym(vp8_filter_block1d8_v6_only_sse2):
|
|||
movsxd r8, dword ptr arg(3) ; dst_pitch
|
||||
%endif
|
||||
|
||||
vp8_filter_block1d8_v6_only_sse2_loop:
|
||||
.vp8_filter_block1d8_v6_only_sse2_loop:
|
||||
movq xmm1, MMWORD PTR [rsi]
|
||||
movq xmm2, MMWORD PTR [rsi + rdx]
|
||||
movq xmm3, MMWORD PTR [rsi + rdx * 2]
|
||||
|
@ -883,7 +883,7 @@ vp8_filter_block1d8_v6_only_sse2_loop:
|
|||
add rdi, r8
|
||||
%endif
|
||||
dec rcx ; decrement count
|
||||
jnz vp8_filter_block1d8_v6_only_sse2_loop ; next row
|
||||
jnz .vp8_filter_block1d8_v6_only_sse2_loop ; next row
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
|
@ -924,7 +924,7 @@ sym(vp8_unpack_block1d16_h6_sse2):
|
|||
movsxd r8, dword ptr arg(4) ;output_width ; Pitch for Source
|
||||
%endif
|
||||
|
||||
unpack_block1d16_h6_sse2_rowloop:
|
||||
.unpack_block1d16_h6_sse2_rowloop:
|
||||
movq xmm1, MMWORD PTR [rsi] ; 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 -2
|
||||
movq xmm3, MMWORD PTR [rsi+8] ; load the next 8 bytes, from [rsi+8]
|
||||
|
||||
|
@ -941,7 +941,7 @@ unpack_block1d16_h6_sse2_rowloop:
|
|||
add rdi, r8
|
||||
%endif
|
||||
dec rcx
|
||||
jnz unpack_block1d16_h6_sse2_rowloop ; next row
|
||||
jnz .unpack_block1d16_h6_sse2_rowloop ; next row
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
|
@ -980,7 +980,7 @@ sym(vp8_bilinear_predict16x16_sse2):
|
|||
movsxd rax, dword ptr arg(2) ;xoffset
|
||||
|
||||
cmp rax, 0 ;skip first_pass filter if xoffset=0
|
||||
je b16x16_sp_only
|
||||
je .b16x16_sp_only
|
||||
|
||||
shl rax, 5
|
||||
add rax, rcx ;HFilter
|
||||
|
@ -995,7 +995,7 @@ sym(vp8_bilinear_predict16x16_sse2):
|
|||
movsxd rax, dword ptr arg(3) ;yoffset
|
||||
|
||||
cmp rax, 0 ;skip second_pass filter if yoffset=0
|
||||
je b16x16_fp_only
|
||||
je .b16x16_fp_only
|
||||
|
||||
shl rax, 5
|
||||
add rax, rcx ;VFilter
|
||||
|
@ -1041,7 +1041,7 @@ sym(vp8_bilinear_predict16x16_sse2):
|
|||
packuswb xmm7, xmm4
|
||||
|
||||
add rsi, rdx ; next line
|
||||
next_row:
|
||||
.next_row:
|
||||
movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
|
||||
movdqa xmm4, xmm3 ; make a copy of current line
|
||||
|
||||
|
@ -1104,11 +1104,11 @@ next_row:
|
|||
%endif
|
||||
|
||||
cmp rdi, rcx
|
||||
jne next_row
|
||||
jne .next_row
|
||||
|
||||
jmp done
|
||||
jmp .done
|
||||
|
||||
b16x16_sp_only:
|
||||
.b16x16_sp_only:
|
||||
movsxd rax, dword ptr arg(3) ;yoffset
|
||||
shl rax, 5
|
||||
add rax, rcx ;VFilter
|
||||
|
@ -1130,7 +1130,7 @@ b16x16_sp_only:
|
|||
movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
|
||||
|
||||
add rsi, rax ; next line
|
||||
next_row_spo:
|
||||
.next_row_spo:
|
||||
movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
|
||||
|
||||
movdqa xmm5, xmm7
|
||||
|
@ -1164,17 +1164,17 @@ next_row_spo:
|
|||
add rsi, rax ; next line
|
||||
add rdi, rdx ;dst_pitch
|
||||
cmp rdi, rcx
|
||||
jne next_row_spo
|
||||
jne .next_row_spo
|
||||
|
||||
jmp done
|
||||
jmp .done
|
||||
|
||||
b16x16_fp_only:
|
||||
.b16x16_fp_only:
|
||||
lea rcx, [rdi+rdx*8]
|
||||
lea rcx, [rcx+rdx*8]
|
||||
movsxd rax, dword ptr arg(1) ;src_pixels_per_line
|
||||
pxor xmm0, xmm0
|
||||
|
||||
next_row_fpo:
|
||||
.next_row_fpo:
|
||||
movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
|
||||
movdqa xmm4, xmm3 ; make a copy of current line
|
||||
|
||||
|
@ -1208,9 +1208,9 @@ next_row_fpo:
|
|||
add rsi, rax ; next line
|
||||
add rdi, rdx ; dst_pitch
|
||||
cmp rdi, rcx
|
||||
jne next_row_fpo
|
||||
jne .next_row_fpo
|
||||
|
||||
done:
|
||||
.done:
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
|
@ -1318,7 +1318,7 @@ sym(vp8_bilinear_predict8x8_sse2):
|
|||
|
||||
movdqa xmm7, xmm3
|
||||
add rsp, 16 ; next line
|
||||
next_row8x8:
|
||||
.next_row8x8:
|
||||
movdqa xmm3, XMMWORD PTR [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
|
||||
movdqa xmm4, xmm3 ; make a copy of current line
|
||||
psrldq xmm4, 1
|
||||
|
@ -1352,7 +1352,7 @@ next_row8x8:
|
|||
add rdi, rdx
|
||||
|
||||
cmp rdi, rcx
|
||||
jne next_row8x8
|
||||
jne .next_row8x8
|
||||
|
||||
;add rsp, 144
|
||||
pop rsp
|
||||
|
|
|
@ -70,7 +70,7 @@ sym(vp8_filter_block1d8_h6_ssse3):
|
|||
|
||||
sub rdi, rdx
|
||||
;xmm3 free
|
||||
filter_block1d8_h6_rowloop_ssse3:
|
||||
.filter_block1d8_h6_rowloop_ssse3:
|
||||
movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
|
||||
|
||||
movq xmm2, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
|
||||
|
@ -102,7 +102,7 @@ filter_block1d8_h6_rowloop_ssse3:
|
|||
packuswb xmm0, xmm0
|
||||
|
||||
movq MMWORD Ptr [rdi], xmm0
|
||||
jnz filter_block1d8_h6_rowloop_ssse3
|
||||
jnz .filter_block1d8_h6_rowloop_ssse3
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
|
@ -129,7 +129,7 @@ vp8_filter_block1d8_h4_ssse3:
|
|||
|
||||
sub rdi, rdx
|
||||
|
||||
filter_block1d8_h4_rowloop_ssse3:
|
||||
.filter_block1d8_h4_rowloop_ssse3:
|
||||
movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
|
||||
|
||||
movq xmm1, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
|
||||
|
@ -158,7 +158,7 @@ filter_block1d8_h4_rowloop_ssse3:
|
|||
|
||||
movq MMWORD Ptr [rdi], xmm0
|
||||
|
||||
jnz filter_block1d8_h4_rowloop_ssse3
|
||||
jnz .filter_block1d8_h4_rowloop_ssse3
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
|
@ -207,7 +207,7 @@ sym(vp8_filter_block1d16_h6_ssse3):
|
|||
movsxd rcx, dword ptr arg(4) ;output_height
|
||||
movsxd rdx, dword ptr arg(3) ;output_pitch
|
||||
|
||||
filter_block1d16_h6_rowloop_ssse3:
|
||||
.filter_block1d16_h6_rowloop_ssse3:
|
||||
movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
|
||||
|
||||
movq xmm3, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
|
||||
|
@ -264,7 +264,7 @@ filter_block1d16_h6_rowloop_ssse3:
|
|||
|
||||
lea rdi, [rdi + rdx]
|
||||
dec rcx
|
||||
jnz filter_block1d16_h6_rowloop_ssse3
|
||||
jnz .filter_block1d16_h6_rowloop_ssse3
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
|
@ -304,7 +304,7 @@ sym(vp8_filter_block1d4_h6_ssse3):
|
|||
movdqa xmm7, [GLOBAL(rd)]
|
||||
|
||||
cmp esi, DWORD PTR [rax]
|
||||
je vp8_filter_block1d4_h4_ssse3
|
||||
je .vp8_filter_block1d4_h4_ssse3
|
||||
|
||||
movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
|
||||
movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
|
||||
|
@ -318,7 +318,7 @@ sym(vp8_filter_block1d4_h6_ssse3):
|
|||
movsxd rdx, dword ptr arg(3) ;output_pitch
|
||||
|
||||
;xmm3 free
|
||||
filter_block1d4_h6_rowloop_ssse3:
|
||||
.filter_block1d4_h6_rowloop_ssse3:
|
||||
movdqu xmm0, XMMWORD PTR [rsi - 2]
|
||||
|
||||
movdqa xmm1, xmm0
|
||||
|
@ -346,7 +346,7 @@ filter_block1d4_h6_rowloop_ssse3:
|
|||
|
||||
add rdi, rdx
|
||||
dec rcx
|
||||
jnz filter_block1d4_h6_rowloop_ssse3
|
||||
jnz .filter_block1d4_h6_rowloop_ssse3
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
|
@ -356,7 +356,7 @@ filter_block1d4_h6_rowloop_ssse3:
|
|||
pop rbp
|
||||
ret
|
||||
|
||||
vp8_filter_block1d4_h4_ssse3:
|
||||
.vp8_filter_block1d4_h4_ssse3:
|
||||
movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
|
||||
movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
|
||||
movdqa xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
|
||||
|
@ -369,7 +369,7 @@ vp8_filter_block1d4_h4_ssse3:
|
|||
|
||||
movsxd rdx, dword ptr arg(3) ;output_pitch
|
||||
|
||||
filter_block1d4_h4_rowloop_ssse3:
|
||||
.filter_block1d4_h4_rowloop_ssse3:
|
||||
movdqu xmm1, XMMWORD PTR [rsi - 2]
|
||||
|
||||
movdqa xmm2, xmm1
|
||||
|
@ -391,7 +391,7 @@ filter_block1d4_h4_rowloop_ssse3:
|
|||
|
||||
add rdi, rdx
|
||||
dec rcx
|
||||
jnz filter_block1d4_h4_rowloop_ssse3
|
||||
jnz .filter_block1d4_h4_rowloop_ssse3
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
|
@ -432,7 +432,7 @@ sym(vp8_filter_block1d16_v6_ssse3):
|
|||
add rax, rdx
|
||||
|
||||
cmp esi, DWORD PTR [rax]
|
||||
je vp8_filter_block1d16_v4_ssse3
|
||||
je .vp8_filter_block1d16_v4_ssse3
|
||||
|
||||
movdqa xmm5, XMMWORD PTR [rax] ;k0_k5
|
||||
movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
|
||||
|
@ -450,7 +450,7 @@ sym(vp8_filter_block1d16_v6_ssse3):
|
|||
add rax, rdx
|
||||
|
||||
|
||||
vp8_filter_block1d16_v6_ssse3_loop:
|
||||
.vp8_filter_block1d16_v6_ssse3_loop:
|
||||
movq xmm1, MMWORD PTR [rsi] ;A
|
||||
movq xmm2, MMWORD PTR [rsi + rdx] ;B
|
||||
movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
|
||||
|
@ -508,7 +508,7 @@ vp8_filter_block1d16_v6_ssse3_loop:
|
|||
add rdi, r8
|
||||
%endif
|
||||
dec rcx
|
||||
jnz vp8_filter_block1d16_v6_ssse3_loop
|
||||
jnz .vp8_filter_block1d16_v6_ssse3_loop
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
|
@ -519,7 +519,7 @@ vp8_filter_block1d16_v6_ssse3_loop:
|
|||
pop rbp
|
||||
ret
|
||||
|
||||
vp8_filter_block1d16_v4_ssse3:
|
||||
.vp8_filter_block1d16_v4_ssse3:
|
||||
movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
|
||||
movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
|
||||
|
||||
|
@ -534,7 +534,7 @@ vp8_filter_block1d16_v4_ssse3:
|
|||
movsxd rcx, DWORD PTR arg(4) ;output_height
|
||||
add rax, rdx
|
||||
|
||||
vp8_filter_block1d16_v4_ssse3_loop:
|
||||
.vp8_filter_block1d16_v4_ssse3_loop:
|
||||
movq xmm2, MMWORD PTR [rsi + rdx] ;B
|
||||
movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
|
||||
movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
|
||||
|
@ -581,7 +581,7 @@ vp8_filter_block1d16_v4_ssse3_loop:
|
|||
add rdi, r8
|
||||
%endif
|
||||
dec rcx
|
||||
jnz vp8_filter_block1d16_v4_ssse3_loop
|
||||
jnz .vp8_filter_block1d16_v4_ssse3_loop
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
|
@ -627,7 +627,7 @@ sym(vp8_filter_block1d8_v6_ssse3):
|
|||
movsxd rcx, DWORD PTR arg(4) ;[output_height]
|
||||
|
||||
cmp esi, DWORD PTR [rax]
|
||||
je vp8_filter_block1d8_v4_ssse3
|
||||
je .vp8_filter_block1d8_v4_ssse3
|
||||
|
||||
movdqa xmm5, XMMWORD PTR [rax] ;k0_k5
|
||||
movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
|
||||
|
@ -638,7 +638,7 @@ sym(vp8_filter_block1d8_v6_ssse3):
|
|||
mov rax, rsi
|
||||
add rax, rdx
|
||||
|
||||
vp8_filter_block1d8_v6_ssse3_loop:
|
||||
.vp8_filter_block1d8_v6_ssse3_loop:
|
||||
movq xmm1, MMWORD PTR [rsi] ;A
|
||||
movq xmm2, MMWORD PTR [rsi + rdx] ;B
|
||||
movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
|
||||
|
@ -673,7 +673,7 @@ vp8_filter_block1d8_v6_ssse3_loop:
|
|||
add rdi, r8
|
||||
%endif
|
||||
dec rcx
|
||||
jnz vp8_filter_block1d8_v6_ssse3_loop
|
||||
jnz .vp8_filter_block1d8_v6_ssse3_loop
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
|
@ -684,7 +684,7 @@ vp8_filter_block1d8_v6_ssse3_loop:
|
|||
pop rbp
|
||||
ret
|
||||
|
||||
vp8_filter_block1d8_v4_ssse3:
|
||||
.vp8_filter_block1d8_v4_ssse3:
|
||||
movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
|
||||
movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
|
||||
movdqa xmm5, [GLOBAL(rd)]
|
||||
|
@ -694,7 +694,7 @@ vp8_filter_block1d8_v4_ssse3:
|
|||
mov rax, rsi
|
||||
add rax, rdx
|
||||
|
||||
vp8_filter_block1d8_v4_ssse3_loop:
|
||||
.vp8_filter_block1d8_v4_ssse3_loop:
|
||||
movq xmm2, MMWORD PTR [rsi + rdx] ;B
|
||||
movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
|
||||
movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
|
||||
|
@ -722,7 +722,7 @@ vp8_filter_block1d8_v4_ssse3_loop:
|
|||
add rdi, r8
|
||||
%endif
|
||||
dec rcx
|
||||
jnz vp8_filter_block1d8_v4_ssse3_loop
|
||||
jnz .vp8_filter_block1d8_v4_ssse3_loop
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
|
@ -766,7 +766,7 @@ sym(vp8_filter_block1d4_v6_ssse3):
|
|||
movsxd rcx, DWORD PTR arg(4) ;[output_height]
|
||||
|
||||
cmp esi, DWORD PTR [rax]
|
||||
je vp8_filter_block1d4_v4_ssse3
|
||||
je .vp8_filter_block1d4_v4_ssse3
|
||||
|
||||
movq mm5, MMWORD PTR [rax] ;k0_k5
|
||||
movq mm6, MMWORD PTR [rax+256] ;k2_k4
|
||||
|
@ -777,7 +777,7 @@ sym(vp8_filter_block1d4_v6_ssse3):
|
|||
mov rax, rsi
|
||||
add rax, rdx
|
||||
|
||||
vp8_filter_block1d4_v6_ssse3_loop:
|
||||
.vp8_filter_block1d4_v6_ssse3_loop:
|
||||
movd mm1, DWORD PTR [rsi] ;A
|
||||
movd mm2, DWORD PTR [rsi + rdx] ;B
|
||||
movd mm3, DWORD PTR [rsi + rdx * 2] ;C
|
||||
|
@ -813,7 +813,7 @@ vp8_filter_block1d4_v6_ssse3_loop:
|
|||
add rdi, r8
|
||||
%endif
|
||||
dec rcx
|
||||
jnz vp8_filter_block1d4_v6_ssse3_loop
|
||||
jnz .vp8_filter_block1d4_v6_ssse3_loop
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
|
@ -823,7 +823,7 @@ vp8_filter_block1d4_v6_ssse3_loop:
|
|||
pop rbp
|
||||
ret
|
||||
|
||||
vp8_filter_block1d4_v4_ssse3:
|
||||
.vp8_filter_block1d4_v4_ssse3:
|
||||
movq mm6, MMWORD PTR [rax+256] ;k2_k4
|
||||
movq mm7, MMWORD PTR [rax+128] ;k1_k3
|
||||
movq mm5, MMWORD PTR [GLOBAL(rd)]
|
||||
|
@ -833,7 +833,7 @@ vp8_filter_block1d4_v4_ssse3:
|
|||
mov rax, rsi
|
||||
add rax, rdx
|
||||
|
||||
vp8_filter_block1d4_v4_ssse3_loop:
|
||||
.vp8_filter_block1d4_v4_ssse3_loop:
|
||||
movd mm2, DWORD PTR [rsi + rdx] ;B
|
||||
movd mm3, DWORD PTR [rsi + rdx * 2] ;C
|
||||
movd mm4, DWORD PTR [rax + rdx * 2] ;D
|
||||
|
@ -861,7 +861,7 @@ vp8_filter_block1d4_v4_ssse3_loop:
|
|||
add rdi, r8
|
||||
%endif
|
||||
dec rcx
|
||||
jnz vp8_filter_block1d4_v4_ssse3_loop
|
||||
jnz .vp8_filter_block1d4_v4_ssse3_loop
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
|
@ -895,7 +895,7 @@ sym(vp8_bilinear_predict16x16_ssse3):
|
|||
movsxd rax, dword ptr arg(2) ; xoffset
|
||||
|
||||
cmp rax, 0 ; skip first_pass filter if xoffset=0
|
||||
je b16x16_sp_only
|
||||
je .b16x16_sp_only
|
||||
|
||||
shl rax, 4
|
||||
lea rax, [rax + rcx] ; HFilter
|
||||
|
@ -909,7 +909,7 @@ sym(vp8_bilinear_predict16x16_ssse3):
|
|||
movsxd rax, dword ptr arg(3) ; yoffset
|
||||
|
||||
cmp rax, 0 ; skip second_pass filter if yoffset=0
|
||||
je b16x16_fp_only
|
||||
je .b16x16_fp_only
|
||||
|
||||
shl rax, 4
|
||||
lea rax, [rax + rcx] ; VFilter
|
||||
|
@ -996,9 +996,9 @@ sym(vp8_bilinear_predict16x16_ssse3):
|
|||
cmp rdi, rcx
|
||||
jne .next_row
|
||||
|
||||
jmp done
|
||||
jmp .done
|
||||
|
||||
b16x16_sp_only:
|
||||
.b16x16_sp_only:
|
||||
movsxd rax, dword ptr arg(3) ; yoffset
|
||||
shl rax, 4
|
||||
lea rax, [rax + rcx] ; VFilter
|
||||
|
@ -1018,7 +1018,7 @@ b16x16_sp_only:
|
|||
movq xmm2, [rsi + 8] ; load row 0
|
||||
|
||||
lea rsi, [rsi + rax] ; next line
|
||||
.next_row:
|
||||
.next_row_sp:
|
||||
movq xmm3, [rsi] ; load row + 1
|
||||
movq xmm5, [rsi + 8] ; load row + 1
|
||||
|
||||
|
@ -1062,16 +1062,16 @@ b16x16_sp_only:
|
|||
lea rdi, [rdi + 2*rdx]
|
||||
|
||||
cmp rdi, rcx
|
||||
jne .next_row
|
||||
jne .next_row_sp
|
||||
|
||||
jmp done
|
||||
jmp .done
|
||||
|
||||
b16x16_fp_only:
|
||||
.b16x16_fp_only:
|
||||
lea rcx, [rdi+rdx*8]
|
||||
lea rcx, [rcx+rdx*8]
|
||||
movsxd rax, dword ptr arg(1) ; src_pixels_per_line
|
||||
|
||||
.next_row:
|
||||
.next_row_fp:
|
||||
movq xmm2, [rsi] ; 00 01 02 03 04 05 06 07
|
||||
movq xmm4, [rsi+1] ; 01 02 03 04 05 06 07 08
|
||||
|
||||
|
@ -1122,9 +1122,9 @@ b16x16_fp_only:
|
|||
|
||||
cmp rdi, rcx
|
||||
|
||||
jne .next_row
|
||||
jne .next_row_fp
|
||||
|
||||
done:
|
||||
.done:
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
|
@ -1191,7 +1191,7 @@ sym(vp8_bilinear_predict8x8_ssse3):
|
|||
|
||||
movsxd rax, dword ptr arg(2) ; xoffset
|
||||
cmp rax, 0 ; skip first_pass filter if xoffset=0
|
||||
je b8x8_sp_only
|
||||
je .b8x8_sp_only
|
||||
|
||||
shl rax, 4
|
||||
add rax, rcx ; HFilter
|
||||
|
@ -1203,7 +1203,7 @@ sym(vp8_bilinear_predict8x8_ssse3):
|
|||
|
||||
movsxd rax, dword ptr arg(3) ; yoffset
|
||||
cmp rax, 0 ; skip second_pass filter if yoffset=0
|
||||
je b8x8_fp_only
|
||||
je .b8x8_fp_only
|
||||
|
||||
shl rax, 4
|
||||
lea rax, [rax + rcx] ; VFilter
|
||||
|
@ -1260,9 +1260,9 @@ sym(vp8_bilinear_predict8x8_ssse3):
|
|||
cmp rdi, rcx
|
||||
jne .next_row
|
||||
|
||||
jmp done8x8
|
||||
jmp .done8x8
|
||||
|
||||
b8x8_sp_only:
|
||||
.b8x8_sp_only:
|
||||
movsxd rax, dword ptr arg(3) ; yoffset
|
||||
shl rax, 4
|
||||
lea rax, [rax + rcx] ; VFilter
|
||||
|
@ -1364,12 +1364,12 @@ b8x8_sp_only:
|
|||
movq [rdi+rdx], xmm1
|
||||
lea rsp, [rsp + 144]
|
||||
|
||||
jmp done8x8
|
||||
jmp .done8x8
|
||||
|
||||
b8x8_fp_only:
|
||||
.b8x8_fp_only:
|
||||
lea rcx, [rdi+rdx*8]
|
||||
|
||||
.next_row:
|
||||
.next_row_fp:
|
||||
movdqa xmm1, XMMWORD PTR [rsp]
|
||||
movdqa xmm3, XMMWORD PTR [rsp+16]
|
||||
|
||||
|
@ -1430,11 +1430,11 @@ b8x8_fp_only:
|
|||
lea rdi, [rdi + 2*rdx]
|
||||
cmp rdi, rcx
|
||||
|
||||
jne .next_row
|
||||
jne .next_row_fp
|
||||
|
||||
lea rsp, [rsp + 16]
|
||||
|
||||
done8x8:
|
||||
.done8x8:
|
||||
;add rsp, 144
|
||||
pop rsp
|
||||
; begin epilog
|
||||
|
|
|
@ -94,16 +94,15 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)
|
|||
#if !(CONFIG_REALTIME_ONLY)
|
||||
cpi->rtcd.temporal.apply = vp8_temporal_filter_apply_c;
|
||||
#endif
|
||||
#if CONFIG_INTERNAL_STATS
|
||||
cpi->rtcd.variance.ssimpf_8x8 = vp8_ssim_parms_8x8_c;
|
||||
cpi->rtcd.variance.ssimpf_16x16 = vp8_ssim_parms_16x16_c;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Pure C:
|
||||
vp8_yv12_copy_partial_frame_ptr = vp8_yv12_copy_partial_frame;
|
||||
|
||||
#if CONFIG_INTERNAL_STATS
|
||||
cpi->rtcd.variance.ssimpf_8x8 = ssim_parms_8x8_c;
|
||||
cpi->rtcd.variance.ssimpf = ssim_parms_c;
|
||||
#endif
|
||||
|
||||
#if ARCH_X86 || ARCH_X86_64
|
||||
vp8_arch_x86_encoder_init(cpi);
|
||||
#endif
|
||||
|
|
|
@ -9,18 +9,9 @@
|
|||
*/
|
||||
|
||||
|
||||
#include "vpx_scale/yv12config.h"
|
||||
#include "math.h"
|
||||
#include "onyx_int.h"
|
||||
|
||||
#if CONFIG_RUNTIME_CPU_DETECT
|
||||
#define IF_RTCD(x) (x)
|
||||
#else
|
||||
#define IF_RTCD(x) NULL
|
||||
#endif
|
||||
|
||||
|
||||
void ssim_parms_c
|
||||
void vp8_ssim_parms_16x16_c
|
||||
(
|
||||
unsigned char *s,
|
||||
int sp,
|
||||
|
@ -46,7 +37,7 @@ void ssim_parms_c
|
|||
}
|
||||
}
|
||||
}
|
||||
void ssim_parms_8x8_c
|
||||
void vp8_ssim_parms_8x8_c
|
||||
(
|
||||
unsigned char *s,
|
||||
int sp,
|
||||
|
@ -107,14 +98,14 @@ static double ssim_16x16(unsigned char *s,int sp, unsigned char *r,int rp,
|
|||
const vp8_variance_rtcd_vtable_t *rtcd)
|
||||
{
|
||||
unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
|
||||
rtcd->ssimpf(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
|
||||
SSIMPF_INVOKE(rtcd,16x16)(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
|
||||
return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 256);
|
||||
}
|
||||
static double ssim_8x8(unsigned char *s,int sp, unsigned char *r,int rp,
|
||||
const vp8_variance_rtcd_vtable_t *rtcd)
|
||||
{
|
||||
unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
|
||||
rtcd->ssimpf_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
|
||||
SSIMPF_INVOKE(rtcd,8x8)(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
|
||||
return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
|
||||
}
|
||||
|
||||
|
@ -134,7 +125,7 @@ long dssim(unsigned char *s,int sp, unsigned char *r,int rp,
|
|||
c1 = cc1*16;
|
||||
c2 = cc2*16;
|
||||
|
||||
rtcd->ssimpf(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
|
||||
SSIMPF_INVOKE(rtcd,16x16)(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
|
||||
ssim_n1 = (2*sum_s*sum_r+ c1);
|
||||
|
||||
ssim_n2 =((int64_t) 2*256*sum_sxr-(int64_t) 2*sum_s*sum_r+c2);
|
||||
|
|
|
@ -320,16 +320,16 @@ extern prototype_variance(vp8_variance_mse16x16);
|
|||
#endif
|
||||
extern prototype_get16x16prederror(vp8_variance_get4x4sse_cs);
|
||||
|
||||
#ifndef vp8_ssimpf
|
||||
#define vp8_ssimpf ssim_parms_c
|
||||
#endif
|
||||
extern prototype_ssimpf(vp8_ssimpf)
|
||||
|
||||
#ifndef vp8_ssimpf_8x8
|
||||
#define vp8_ssimpf_8x8 ssim_parms_8x8_c
|
||||
#define vp8_ssimpf_8x8 vp8_ssim_parms_8x8_c
|
||||
#endif
|
||||
extern prototype_ssimpf(vp8_ssimpf_8x8)
|
||||
|
||||
#ifndef vp8_ssimpf_16x16
|
||||
#define vp8_ssimpf_16x16 vp8_ssim_parms_16x16_c
|
||||
#endif
|
||||
extern prototype_ssimpf(vp8_ssimpf_16x16)
|
||||
|
||||
typedef prototype_sad(*vp8_sad_fn_t);
|
||||
typedef prototype_sad_multi_same_address(*vp8_sad_multi_fn_t);
|
||||
typedef prototype_sad_multi_same_address_1(*vp8_sad_multi1_fn_t);
|
||||
|
@ -394,7 +394,7 @@ typedef struct
|
|||
|
||||
#if CONFIG_INTERNAL_STATS
|
||||
vp8_ssimpf_fn_t ssimpf_8x8;
|
||||
vp8_ssimpf_fn_t ssimpf;
|
||||
vp8_ssimpf_fn_t ssimpf_16x16;
|
||||
#endif
|
||||
|
||||
} vp8_variance_rtcd_vtable_t;
|
||||
|
@ -417,8 +417,10 @@ typedef struct
|
|||
|
||||
#if CONFIG_RUNTIME_CPU_DETECT
|
||||
#define VARIANCE_INVOKE(ctx,fn) (ctx)->fn
|
||||
#define SSIMPF_INVOKE(ctx,fn) (ctx)->ssimpf_##fn
|
||||
#else
|
||||
#define VARIANCE_INVOKE(ctx,fn) vp8_variance_##fn
|
||||
#define SSIMPF_INVOKE(ctx,fn) vp8_ssimpf_##fn
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
|
@ -148,7 +148,7 @@ sym(vp8_mbblock_error_mmx_impl):
|
|||
pcmpeqw mm1, mm7
|
||||
mov rcx, 16
|
||||
|
||||
mberror_loop_mmx:
|
||||
.mberror_loop_mmx:
|
||||
movq mm3, [rsi]
|
||||
movq mm4, [rdi]
|
||||
|
||||
|
@ -186,7 +186,7 @@ mberror_loop_mmx:
|
|||
add rdi, 32
|
||||
sub rcx, 1
|
||||
|
||||
jnz mberror_loop_mmx
|
||||
jnz .mberror_loop_mmx
|
||||
|
||||
movq mm0, mm2
|
||||
psrlq mm2, 32
|
||||
|
@ -226,7 +226,7 @@ sym(vp8_mbblock_error_xmm_impl):
|
|||
pcmpeqw xmm5, xmm6
|
||||
mov rcx, 16
|
||||
|
||||
mberror_loop:
|
||||
.mberror_loop:
|
||||
movdqa xmm0, [rsi]
|
||||
movdqa xmm1, [rdi]
|
||||
|
||||
|
@ -249,7 +249,7 @@ mberror_loop:
|
|||
paddd xmm4, xmm2
|
||||
|
||||
paddd xmm4, xmm0
|
||||
jnz mberror_loop
|
||||
jnz .mberror_loop
|
||||
|
||||
movdqa xmm0, xmm4
|
||||
punpckldq xmm0, xmm6
|
||||
|
@ -289,7 +289,7 @@ sym(vp8_mbuverror_mmx_impl):
|
|||
mov rcx, 16
|
||||
pxor mm7, mm7
|
||||
|
||||
mbuverror_loop_mmx:
|
||||
.mbuverror_loop_mmx:
|
||||
|
||||
movq mm1, [rsi]
|
||||
movq mm2, [rdi]
|
||||
|
@ -313,7 +313,7 @@ mbuverror_loop_mmx:
|
|||
add rdi, 16
|
||||
|
||||
dec rcx
|
||||
jnz mbuverror_loop_mmx
|
||||
jnz .mbuverror_loop_mmx
|
||||
|
||||
movq mm0, mm7
|
||||
psrlq mm7, 32
|
||||
|
@ -346,7 +346,7 @@ sym(vp8_mbuverror_xmm_impl):
|
|||
mov rcx, 16
|
||||
pxor xmm3, xmm3
|
||||
|
||||
mbuverror_loop:
|
||||
.mbuverror_loop:
|
||||
|
||||
movdqa xmm1, [rsi]
|
||||
movdqa xmm2, [rdi]
|
||||
|
@ -360,7 +360,7 @@ mbuverror_loop:
|
|||
add rdi, 16
|
||||
|
||||
dec rcx
|
||||
jnz mbuverror_loop
|
||||
jnz .mbuverror_loop
|
||||
|
||||
pxor xmm0, xmm0
|
||||
movdqa xmm1, xmm3
|
||||
|
|
|
@ -137,17 +137,17 @@ sym(vp8_regular_quantize_b_sse2):
|
|||
; if (x >= zbin)
|
||||
sub cx, WORD PTR[rdx] ; x - zbin
|
||||
lea rdx, [rdx + 2] ; zbin_boost_ptr++
|
||||
jl rq_zigzag_loop_%1 ; x < zbin
|
||||
jl .rq_zigzag_loop_%1 ; x < zbin
|
||||
|
||||
movsx edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]
|
||||
|
||||
; downshift by quant_shift[rc]
|
||||
movsx cx, BYTE PTR[rax + %1] ; quant_shift_ptr[rc]
|
||||
sar edi, cl ; also sets Z bit
|
||||
je rq_zigzag_loop_%1 ; !y
|
||||
je .rq_zigzag_loop_%1 ; !y
|
||||
mov WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
|
||||
mov rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
|
||||
rq_zigzag_loop_%1:
|
||||
.rq_zigzag_loop_%1:
|
||||
%endmacro
|
||||
; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
|
||||
ZIGZAG_LOOP 0
|
||||
|
|
|
@ -140,21 +140,21 @@ sym(vp8_regular_quantize_b_sse4):
|
|||
; if (x >= zbin)
|
||||
sub cx, WORD PTR[rdx] ; x - zbin
|
||||
lea rdx, [rdx + 2] ; zbin_boost_ptr++
|
||||
jl rq_zigzag_loop_%1 ; x < zbin
|
||||
jl .rq_zigzag_loop_%1 ; x < zbin
|
||||
|
||||
pextrw edi, %3, %2 ; y
|
||||
|
||||
; downshift by quant_shift[rc]
|
||||
pextrb ecx, xmm5, %1 ; quant_shift[rc]
|
||||
sar edi, cl ; also sets Z bit
|
||||
je rq_zigzag_loop_%1 ; !y
|
||||
je .rq_zigzag_loop_%1 ; !y
|
||||
%if ABI_IS_32BIT
|
||||
mov WORD PTR[rsp + qcoeff + %1 *2], di
|
||||
%else
|
||||
pinsrw %5, edi, %2 ; qcoeff[rc]
|
||||
%endif
|
||||
mov rdx, rax ; reset to b->zrun_zbin_boost
|
||||
rq_zigzag_loop_%1:
|
||||
.rq_zigzag_loop_%1:
|
||||
%endmacro
|
||||
; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
|
||||
ZIGZAG_LOOP 0, 0, xmm2, xmm6, xmm4
|
||||
|
|
|
@ -43,7 +43,7 @@ sym(vp8_sad16x16_mmx):
|
|||
|
||||
pxor mm6, mm6
|
||||
|
||||
x16x16sad_mmx_loop:
|
||||
.x16x16sad_mmx_loop:
|
||||
|
||||
movq mm0, QWORD PTR [rsi]
|
||||
movq mm2, QWORD PTR [rsi+8]
|
||||
|
@ -83,7 +83,7 @@ x16x16sad_mmx_loop:
|
|||
paddw mm7, mm1
|
||||
|
||||
cmp rsi, rcx
|
||||
jne x16x16sad_mmx_loop
|
||||
jne .x16x16sad_mmx_loop
|
||||
|
||||
|
||||
movq mm0, mm7
|
||||
|
@ -135,7 +135,7 @@ sym(vp8_sad8x16_mmx):
|
|||
|
||||
pxor mm6, mm6
|
||||
|
||||
x8x16sad_mmx_loop:
|
||||
.x8x16sad_mmx_loop:
|
||||
|
||||
movq mm0, QWORD PTR [rsi]
|
||||
movq mm1, QWORD PTR [rdi]
|
||||
|
@ -158,7 +158,7 @@ x8x16sad_mmx_loop:
|
|||
paddw mm7, mm2
|
||||
cmp rsi, rcx
|
||||
|
||||
jne x8x16sad_mmx_loop
|
||||
jne .x8x16sad_mmx_loop
|
||||
|
||||
movq mm0, mm7
|
||||
punpcklwd mm0, mm6
|
||||
|
@ -205,7 +205,7 @@ sym(vp8_sad8x8_mmx):
|
|||
|
||||
pxor mm6, mm6
|
||||
|
||||
x8x8sad_mmx_loop:
|
||||
.x8x8sad_mmx_loop:
|
||||
|
||||
movq mm0, QWORD PTR [rsi]
|
||||
movq mm1, QWORD PTR [rdi]
|
||||
|
@ -228,7 +228,7 @@ x8x8sad_mmx_loop:
|
|||
paddw mm7, mm0
|
||||
cmp rsi, rcx
|
||||
|
||||
jne x8x8sad_mmx_loop
|
||||
jne .x8x8sad_mmx_loop
|
||||
|
||||
movq mm0, mm7
|
||||
punpcklwd mm0, mm6
|
||||
|
@ -364,7 +364,7 @@ sym(vp8_sad16x8_mmx):
|
|||
|
||||
pxor mm6, mm6
|
||||
|
||||
x16x8sad_mmx_loop:
|
||||
.x16x8sad_mmx_loop:
|
||||
|
||||
movq mm0, [rsi]
|
||||
movq mm1, [rdi]
|
||||
|
@ -404,7 +404,7 @@ x16x8sad_mmx_loop:
|
|||
paddw mm7, mm0
|
||||
|
||||
cmp rsi, rcx
|
||||
jne x16x8sad_mmx_loop
|
||||
jne .x16x8sad_mmx_loop
|
||||
|
||||
movq mm0, mm7
|
||||
punpcklwd mm0, mm6
|
||||
|
|
|
@ -37,7 +37,7 @@ sym(vp8_sad16x16_wmt):
|
|||
lea rcx, [rcx+rax*8]
|
||||
pxor xmm6, xmm6
|
||||
|
||||
x16x16sad_wmt_loop:
|
||||
.x16x16sad_wmt_loop:
|
||||
|
||||
movq xmm0, QWORD PTR [rsi]
|
||||
movq xmm2, QWORD PTR [rsi+8]
|
||||
|
@ -68,7 +68,7 @@ x16x16sad_wmt_loop:
|
|||
paddw xmm6, xmm4
|
||||
|
||||
cmp rsi, rcx
|
||||
jne x16x16sad_wmt_loop
|
||||
jne .x16x16sad_wmt_loop
|
||||
|
||||
movq xmm0, xmm6
|
||||
psrldq xmm6, 8
|
||||
|
@ -111,11 +111,11 @@ sym(vp8_sad8x16_wmt):
|
|||
lea rcx, [rcx+rbx*8]
|
||||
pxor mm7, mm7
|
||||
|
||||
x8x16sad_wmt_loop:
|
||||
.x8x16sad_wmt_loop:
|
||||
|
||||
movq rax, mm7
|
||||
cmp eax, arg(4)
|
||||
jg x8x16sad_wmt_early_exit
|
||||
jg .x8x16sad_wmt_early_exit
|
||||
|
||||
movq mm0, QWORD PTR [rsi]
|
||||
movq mm1, QWORD PTR [rdi]
|
||||
|
@ -133,11 +133,11 @@ x8x16sad_wmt_loop:
|
|||
paddw mm7, mm2
|
||||
|
||||
cmp rsi, rcx
|
||||
jne x8x16sad_wmt_loop
|
||||
jne .x8x16sad_wmt_loop
|
||||
|
||||
movq rax, mm7
|
||||
|
||||
x8x16sad_wmt_early_exit:
|
||||
.x8x16sad_wmt_early_exit:
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
|
@ -172,11 +172,11 @@ sym(vp8_sad8x8_wmt):
|
|||
lea rcx, [rsi+rbx*8]
|
||||
pxor mm7, mm7
|
||||
|
||||
x8x8sad_wmt_loop:
|
||||
.x8x8sad_wmt_loop:
|
||||
|
||||
movq rax, mm7
|
||||
cmp eax, arg(4)
|
||||
jg x8x8sad_wmt_early_exit
|
||||
jg .x8x8sad_wmt_early_exit
|
||||
|
||||
movq mm0, QWORD PTR [rsi]
|
||||
movq mm1, QWORD PTR [rdi]
|
||||
|
@ -188,10 +188,10 @@ x8x8sad_wmt_loop:
|
|||
paddw mm7, mm0
|
||||
|
||||
cmp rsi, rcx
|
||||
jne x8x8sad_wmt_loop
|
||||
jne .x8x8sad_wmt_loop
|
||||
|
||||
movq rax, mm7
|
||||
x8x8sad_wmt_early_exit:
|
||||
.x8x8sad_wmt_early_exit:
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
|
@ -281,11 +281,11 @@ sym(vp8_sad16x8_wmt):
|
|||
lea rcx, [rsi+rbx*8]
|
||||
pxor mm7, mm7
|
||||
|
||||
x16x8sad_wmt_loop:
|
||||
.x16x8sad_wmt_loop:
|
||||
|
||||
movq rax, mm7
|
||||
cmp eax, arg(4)
|
||||
jg x16x8sad_wmt_early_exit
|
||||
jg .x16x8sad_wmt_early_exit
|
||||
|
||||
movq mm0, QWORD PTR [rsi]
|
||||
movq mm2, QWORD PTR [rsi+8]
|
||||
|
@ -315,11 +315,11 @@ x16x8sad_wmt_loop:
|
|||
paddw mm7, mm4
|
||||
|
||||
cmp rsi, rcx
|
||||
jne x16x8sad_wmt_loop
|
||||
jne .x16x8sad_wmt_loop
|
||||
|
||||
movq rax, mm7
|
||||
|
||||
x16x8sad_wmt_early_exit:
|
||||
.x16x8sad_wmt_early_exit:
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
|
@ -352,7 +352,7 @@ sym(vp8_copy32xn_sse2):
|
|||
movsxd rdx, dword ptr arg(3) ;dst_stride
|
||||
movsxd rcx, dword ptr arg(4) ;height
|
||||
|
||||
block_copy_sse2_loopx4:
|
||||
.block_copy_sse2_loopx4:
|
||||
movdqu xmm0, XMMWORD PTR [rsi]
|
||||
movdqu xmm1, XMMWORD PTR [rsi + 16]
|
||||
movdqu xmm2, XMMWORD PTR [rsi + rax]
|
||||
|
@ -383,12 +383,12 @@ block_copy_sse2_loopx4:
|
|||
|
||||
sub rcx, 4
|
||||
cmp rcx, 4
|
||||
jge block_copy_sse2_loopx4
|
||||
jge .block_copy_sse2_loopx4
|
||||
|
||||
cmp rcx, 0
|
||||
je copy_is_done
|
||||
je .copy_is_done
|
||||
|
||||
block_copy_sse2_loop:
|
||||
.block_copy_sse2_loop:
|
||||
movdqu xmm0, XMMWORD PTR [rsi]
|
||||
movdqu xmm1, XMMWORD PTR [rsi + 16]
|
||||
lea rsi, [rsi+rax]
|
||||
|
@ -398,9 +398,9 @@ block_copy_sse2_loop:
|
|||
lea rdi, [rdi+rdx]
|
||||
|
||||
sub rcx, 1
|
||||
jne block_copy_sse2_loop
|
||||
jne .block_copy_sse2_loop
|
||||
|
||||
copy_is_done:
|
||||
.copy_is_done:
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
|
|
|
@ -647,7 +647,7 @@ sym(vp8_copy32xn_sse3):
|
|||
|
||||
STACK_FRAME_CREATE_X3
|
||||
|
||||
block_copy_sse3_loopx4:
|
||||
.block_copy_sse3_loopx4:
|
||||
lea end_ptr, [src_ptr+src_stride*2]
|
||||
|
||||
movdqu xmm0, XMMWORD PTR [src_ptr]
|
||||
|
@ -676,13 +676,13 @@ block_copy_sse3_loopx4:
|
|||
|
||||
sub height, 4
|
||||
cmp height, 4
|
||||
jge block_copy_sse3_loopx4
|
||||
jge .block_copy_sse3_loopx4
|
||||
|
||||
;Check to see if there are more rows that need to be copied.
|
||||
cmp height, 0
|
||||
je copy_is_done
|
||||
je .copy_is_done
|
||||
|
||||
block_copy_sse3_loop:
|
||||
.block_copy_sse3_loop:
|
||||
movdqu xmm0, XMMWORD PTR [src_ptr]
|
||||
movdqu xmm1, XMMWORD PTR [src_ptr + 16]
|
||||
lea src_ptr, [src_ptr+src_stride]
|
||||
|
@ -692,9 +692,9 @@ block_copy_sse3_loop:
|
|||
lea ref_ptr, [ref_ptr+ref_stride]
|
||||
|
||||
sub height, 1
|
||||
jne block_copy_sse3_loop
|
||||
jne .block_copy_sse3_loop
|
||||
|
||||
copy_is_done:
|
||||
.copy_is_done:
|
||||
STACK_FRAME_DESTROY_X3
|
||||
|
||||
;void vp8_sad16x16x4d_sse3(
|
||||
|
|
|
@ -169,30 +169,30 @@ sym(vp8_sad16x16x3_ssse3):
|
|||
mov rdx, 0xf
|
||||
and rdx, rdi
|
||||
|
||||
jmp vp8_sad16x16x3_ssse3_skiptable
|
||||
vp8_sad16x16x3_ssse3_jumptable:
|
||||
dd vp8_sad16x16x3_ssse3_aligned_by_0 - vp8_sad16x16x3_ssse3_do_jump
|
||||
dd vp8_sad16x16x3_ssse3_aligned_by_1 - vp8_sad16x16x3_ssse3_do_jump
|
||||
dd vp8_sad16x16x3_ssse3_aligned_by_2 - vp8_sad16x16x3_ssse3_do_jump
|
||||
dd vp8_sad16x16x3_ssse3_aligned_by_3 - vp8_sad16x16x3_ssse3_do_jump
|
||||
dd vp8_sad16x16x3_ssse3_aligned_by_4 - vp8_sad16x16x3_ssse3_do_jump
|
||||
dd vp8_sad16x16x3_ssse3_aligned_by_5 - vp8_sad16x16x3_ssse3_do_jump
|
||||
dd vp8_sad16x16x3_ssse3_aligned_by_6 - vp8_sad16x16x3_ssse3_do_jump
|
||||
dd vp8_sad16x16x3_ssse3_aligned_by_7 - vp8_sad16x16x3_ssse3_do_jump
|
||||
dd vp8_sad16x16x3_ssse3_aligned_by_8 - vp8_sad16x16x3_ssse3_do_jump
|
||||
dd vp8_sad16x16x3_ssse3_aligned_by_9 - vp8_sad16x16x3_ssse3_do_jump
|
||||
dd vp8_sad16x16x3_ssse3_aligned_by_10 - vp8_sad16x16x3_ssse3_do_jump
|
||||
dd vp8_sad16x16x3_ssse3_aligned_by_11 - vp8_sad16x16x3_ssse3_do_jump
|
||||
dd vp8_sad16x16x3_ssse3_aligned_by_12 - vp8_sad16x16x3_ssse3_do_jump
|
||||
dd vp8_sad16x16x3_ssse3_aligned_by_13 - vp8_sad16x16x3_ssse3_do_jump
|
||||
dd vp8_sad16x16x3_ssse3_aligned_by_14 - vp8_sad16x16x3_ssse3_do_jump
|
||||
dd vp8_sad16x16x3_ssse3_aligned_by_15 - vp8_sad16x16x3_ssse3_do_jump
|
||||
vp8_sad16x16x3_ssse3_skiptable:
|
||||
jmp .vp8_sad16x16x3_ssse3_skiptable
|
||||
.vp8_sad16x16x3_ssse3_jumptable:
|
||||
dd .vp8_sad16x16x3_ssse3_aligned_by_0 - .vp8_sad16x16x3_ssse3_do_jump
|
||||
dd .vp8_sad16x16x3_ssse3_aligned_by_1 - .vp8_sad16x16x3_ssse3_do_jump
|
||||
dd .vp8_sad16x16x3_ssse3_aligned_by_2 - .vp8_sad16x16x3_ssse3_do_jump
|
||||
dd .vp8_sad16x16x3_ssse3_aligned_by_3 - .vp8_sad16x16x3_ssse3_do_jump
|
||||
dd .vp8_sad16x16x3_ssse3_aligned_by_4 - .vp8_sad16x16x3_ssse3_do_jump
|
||||
dd .vp8_sad16x16x3_ssse3_aligned_by_5 - .vp8_sad16x16x3_ssse3_do_jump
|
||||
dd .vp8_sad16x16x3_ssse3_aligned_by_6 - .vp8_sad16x16x3_ssse3_do_jump
|
||||
dd .vp8_sad16x16x3_ssse3_aligned_by_7 - .vp8_sad16x16x3_ssse3_do_jump
|
||||
dd .vp8_sad16x16x3_ssse3_aligned_by_8 - .vp8_sad16x16x3_ssse3_do_jump
|
||||
dd .vp8_sad16x16x3_ssse3_aligned_by_9 - .vp8_sad16x16x3_ssse3_do_jump
|
||||
dd .vp8_sad16x16x3_ssse3_aligned_by_10 - .vp8_sad16x16x3_ssse3_do_jump
|
||||
dd .vp8_sad16x16x3_ssse3_aligned_by_11 - .vp8_sad16x16x3_ssse3_do_jump
|
||||
dd .vp8_sad16x16x3_ssse3_aligned_by_12 - .vp8_sad16x16x3_ssse3_do_jump
|
||||
dd .vp8_sad16x16x3_ssse3_aligned_by_13 - .vp8_sad16x16x3_ssse3_do_jump
|
||||
dd .vp8_sad16x16x3_ssse3_aligned_by_14 - .vp8_sad16x16x3_ssse3_do_jump
|
||||
dd .vp8_sad16x16x3_ssse3_aligned_by_15 - .vp8_sad16x16x3_ssse3_do_jump
|
||||
.vp8_sad16x16x3_ssse3_skiptable:
|
||||
|
||||
call vp8_sad16x16x3_ssse3_do_jump
|
||||
vp8_sad16x16x3_ssse3_do_jump:
|
||||
call .vp8_sad16x16x3_ssse3_do_jump
|
||||
.vp8_sad16x16x3_ssse3_do_jump:
|
||||
pop rcx ; get the address of do_jump
|
||||
mov rax, vp8_sad16x16x3_ssse3_jumptable - vp8_sad16x16x3_ssse3_do_jump
|
||||
mov rax, .vp8_sad16x16x3_ssse3_jumptable - .vp8_sad16x16x3_ssse3_do_jump
|
||||
add rax, rcx ; get the absolute address of vp8_sad16x16x3_ssse3_jumptable
|
||||
|
||||
movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
|
||||
|
@ -203,23 +203,23 @@ vp8_sad16x16x3_ssse3_do_jump:
|
|||
|
||||
jmp rcx
|
||||
|
||||
PROCESS_16X16X3_OFFSET 0, vp8_sad16x16x3_ssse3
|
||||
PROCESS_16X16X3_OFFSET 1, vp8_sad16x16x3_ssse3
|
||||
PROCESS_16X16X3_OFFSET 2, vp8_sad16x16x3_ssse3
|
||||
PROCESS_16X16X3_OFFSET 3, vp8_sad16x16x3_ssse3
|
||||
PROCESS_16X16X3_OFFSET 4, vp8_sad16x16x3_ssse3
|
||||
PROCESS_16X16X3_OFFSET 5, vp8_sad16x16x3_ssse3
|
||||
PROCESS_16X16X3_OFFSET 6, vp8_sad16x16x3_ssse3
|
||||
PROCESS_16X16X3_OFFSET 7, vp8_sad16x16x3_ssse3
|
||||
PROCESS_16X16X3_OFFSET 8, vp8_sad16x16x3_ssse3
|
||||
PROCESS_16X16X3_OFFSET 9, vp8_sad16x16x3_ssse3
|
||||
PROCESS_16X16X3_OFFSET 10, vp8_sad16x16x3_ssse3
|
||||
PROCESS_16X16X3_OFFSET 11, vp8_sad16x16x3_ssse3
|
||||
PROCESS_16X16X3_OFFSET 12, vp8_sad16x16x3_ssse3
|
||||
PROCESS_16X16X3_OFFSET 13, vp8_sad16x16x3_ssse3
|
||||
PROCESS_16X16X3_OFFSET 14, vp8_sad16x16x3_ssse3
|
||||
PROCESS_16X16X3_OFFSET 0, .vp8_sad16x16x3_ssse3
|
||||
PROCESS_16X16X3_OFFSET 1, .vp8_sad16x16x3_ssse3
|
||||
PROCESS_16X16X3_OFFSET 2, .vp8_sad16x16x3_ssse3
|
||||
PROCESS_16X16X3_OFFSET 3, .vp8_sad16x16x3_ssse3
|
||||
PROCESS_16X16X3_OFFSET 4, .vp8_sad16x16x3_ssse3
|
||||
PROCESS_16X16X3_OFFSET 5, .vp8_sad16x16x3_ssse3
|
||||
PROCESS_16X16X3_OFFSET 6, .vp8_sad16x16x3_ssse3
|
||||
PROCESS_16X16X3_OFFSET 7, .vp8_sad16x16x3_ssse3
|
||||
PROCESS_16X16X3_OFFSET 8, .vp8_sad16x16x3_ssse3
|
||||
PROCESS_16X16X3_OFFSET 9, .vp8_sad16x16x3_ssse3
|
||||
PROCESS_16X16X3_OFFSET 10, .vp8_sad16x16x3_ssse3
|
||||
PROCESS_16X16X3_OFFSET 11, .vp8_sad16x16x3_ssse3
|
||||
PROCESS_16X16X3_OFFSET 12, .vp8_sad16x16x3_ssse3
|
||||
PROCESS_16X16X3_OFFSET 13, .vp8_sad16x16x3_ssse3
|
||||
PROCESS_16X16X3_OFFSET 14, .vp8_sad16x16x3_ssse3
|
||||
|
||||
vp8_sad16x16x3_ssse3_aligned_by_15:
|
||||
.vp8_sad16x16x3_ssse3_aligned_by_15:
|
||||
PROCESS_16X2X3 1
|
||||
PROCESS_16X2X3 0
|
||||
PROCESS_16X2X3 0
|
||||
|
@ -229,7 +229,7 @@ vp8_sad16x16x3_ssse3_aligned_by_15:
|
|||
PROCESS_16X2X3 0
|
||||
PROCESS_16X2X3 0
|
||||
|
||||
vp8_sad16x16x3_ssse3_store_off:
|
||||
.vp8_sad16x16x3_ssse3_store_off:
|
||||
mov rdi, arg(4) ;Results
|
||||
|
||||
movq xmm0, xmm5
|
||||
|
@ -282,30 +282,30 @@ sym(vp8_sad16x8x3_ssse3):
|
|||
mov rdx, 0xf
|
||||
and rdx, rdi
|
||||
|
||||
jmp vp8_sad16x8x3_ssse3_skiptable
|
||||
vp8_sad16x8x3_ssse3_jumptable:
|
||||
dd vp8_sad16x8x3_ssse3_aligned_by_0 - vp8_sad16x8x3_ssse3_do_jump
|
||||
dd vp8_sad16x8x3_ssse3_aligned_by_1 - vp8_sad16x8x3_ssse3_do_jump
|
||||
dd vp8_sad16x8x3_ssse3_aligned_by_2 - vp8_sad16x8x3_ssse3_do_jump
|
||||
dd vp8_sad16x8x3_ssse3_aligned_by_3 - vp8_sad16x8x3_ssse3_do_jump
|
||||
dd vp8_sad16x8x3_ssse3_aligned_by_4 - vp8_sad16x8x3_ssse3_do_jump
|
||||
dd vp8_sad16x8x3_ssse3_aligned_by_5 - vp8_sad16x8x3_ssse3_do_jump
|
||||
dd vp8_sad16x8x3_ssse3_aligned_by_6 - vp8_sad16x8x3_ssse3_do_jump
|
||||
dd vp8_sad16x8x3_ssse3_aligned_by_7 - vp8_sad16x8x3_ssse3_do_jump
|
||||
dd vp8_sad16x8x3_ssse3_aligned_by_8 - vp8_sad16x8x3_ssse3_do_jump
|
||||
dd vp8_sad16x8x3_ssse3_aligned_by_9 - vp8_sad16x8x3_ssse3_do_jump
|
||||
dd vp8_sad16x8x3_ssse3_aligned_by_10 - vp8_sad16x8x3_ssse3_do_jump
|
||||
dd vp8_sad16x8x3_ssse3_aligned_by_11 - vp8_sad16x8x3_ssse3_do_jump
|
||||
dd vp8_sad16x8x3_ssse3_aligned_by_12 - vp8_sad16x8x3_ssse3_do_jump
|
||||
dd vp8_sad16x8x3_ssse3_aligned_by_13 - vp8_sad16x8x3_ssse3_do_jump
|
||||
dd vp8_sad16x8x3_ssse3_aligned_by_14 - vp8_sad16x8x3_ssse3_do_jump
|
||||
dd vp8_sad16x8x3_ssse3_aligned_by_15 - vp8_sad16x8x3_ssse3_do_jump
|
||||
vp8_sad16x8x3_ssse3_skiptable:
|
||||
jmp .vp8_sad16x8x3_ssse3_skiptable
|
||||
.vp8_sad16x8x3_ssse3_jumptable:
|
||||
dd .vp8_sad16x8x3_ssse3_aligned_by_0 - .vp8_sad16x8x3_ssse3_do_jump
|
||||
dd .vp8_sad16x8x3_ssse3_aligned_by_1 - .vp8_sad16x8x3_ssse3_do_jump
|
||||
dd .vp8_sad16x8x3_ssse3_aligned_by_2 - .vp8_sad16x8x3_ssse3_do_jump
|
||||
dd .vp8_sad16x8x3_ssse3_aligned_by_3 - .vp8_sad16x8x3_ssse3_do_jump
|
||||
dd .vp8_sad16x8x3_ssse3_aligned_by_4 - .vp8_sad16x8x3_ssse3_do_jump
|
||||
dd .vp8_sad16x8x3_ssse3_aligned_by_5 - .vp8_sad16x8x3_ssse3_do_jump
|
||||
dd .vp8_sad16x8x3_ssse3_aligned_by_6 - .vp8_sad16x8x3_ssse3_do_jump
|
||||
dd .vp8_sad16x8x3_ssse3_aligned_by_7 - .vp8_sad16x8x3_ssse3_do_jump
|
||||
dd .vp8_sad16x8x3_ssse3_aligned_by_8 - .vp8_sad16x8x3_ssse3_do_jump
|
||||
dd .vp8_sad16x8x3_ssse3_aligned_by_9 - .vp8_sad16x8x3_ssse3_do_jump
|
||||
dd .vp8_sad16x8x3_ssse3_aligned_by_10 - .vp8_sad16x8x3_ssse3_do_jump
|
||||
dd .vp8_sad16x8x3_ssse3_aligned_by_11 - .vp8_sad16x8x3_ssse3_do_jump
|
||||
dd .vp8_sad16x8x3_ssse3_aligned_by_12 - .vp8_sad16x8x3_ssse3_do_jump
|
||||
dd .vp8_sad16x8x3_ssse3_aligned_by_13 - .vp8_sad16x8x3_ssse3_do_jump
|
||||
dd .vp8_sad16x8x3_ssse3_aligned_by_14 - .vp8_sad16x8x3_ssse3_do_jump
|
||||
dd .vp8_sad16x8x3_ssse3_aligned_by_15 - .vp8_sad16x8x3_ssse3_do_jump
|
||||
.vp8_sad16x8x3_ssse3_skiptable:
|
||||
|
||||
call vp8_sad16x8x3_ssse3_do_jump
|
||||
vp8_sad16x8x3_ssse3_do_jump:
|
||||
call .vp8_sad16x8x3_ssse3_do_jump
|
||||
.vp8_sad16x8x3_ssse3_do_jump:
|
||||
pop rcx ; get the address of do_jump
|
||||
mov rax, vp8_sad16x8x3_ssse3_jumptable - vp8_sad16x8x3_ssse3_do_jump
|
||||
mov rax, .vp8_sad16x8x3_ssse3_jumptable - .vp8_sad16x8x3_ssse3_do_jump
|
||||
add rax, rcx ; get the absolute address of vp8_sad16x8x3_ssse3_jumptable
|
||||
|
||||
movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
|
||||
|
@ -316,30 +316,30 @@ vp8_sad16x8x3_ssse3_do_jump:
|
|||
|
||||
jmp rcx
|
||||
|
||||
PROCESS_16X8X3_OFFSET 0, vp8_sad16x8x3_ssse3
|
||||
PROCESS_16X8X3_OFFSET 1, vp8_sad16x8x3_ssse3
|
||||
PROCESS_16X8X3_OFFSET 2, vp8_sad16x8x3_ssse3
|
||||
PROCESS_16X8X3_OFFSET 3, vp8_sad16x8x3_ssse3
|
||||
PROCESS_16X8X3_OFFSET 4, vp8_sad16x8x3_ssse3
|
||||
PROCESS_16X8X3_OFFSET 5, vp8_sad16x8x3_ssse3
|
||||
PROCESS_16X8X3_OFFSET 6, vp8_sad16x8x3_ssse3
|
||||
PROCESS_16X8X3_OFFSET 7, vp8_sad16x8x3_ssse3
|
||||
PROCESS_16X8X3_OFFSET 8, vp8_sad16x8x3_ssse3
|
||||
PROCESS_16X8X3_OFFSET 9, vp8_sad16x8x3_ssse3
|
||||
PROCESS_16X8X3_OFFSET 10, vp8_sad16x8x3_ssse3
|
||||
PROCESS_16X8X3_OFFSET 11, vp8_sad16x8x3_ssse3
|
||||
PROCESS_16X8X3_OFFSET 12, vp8_sad16x8x3_ssse3
|
||||
PROCESS_16X8X3_OFFSET 13, vp8_sad16x8x3_ssse3
|
||||
PROCESS_16X8X3_OFFSET 14, vp8_sad16x8x3_ssse3
|
||||
PROCESS_16X8X3_OFFSET 0, .vp8_sad16x8x3_ssse3
|
||||
PROCESS_16X8X3_OFFSET 1, .vp8_sad16x8x3_ssse3
|
||||
PROCESS_16X8X3_OFFSET 2, .vp8_sad16x8x3_ssse3
|
||||
PROCESS_16X8X3_OFFSET 3, .vp8_sad16x8x3_ssse3
|
||||
PROCESS_16X8X3_OFFSET 4, .vp8_sad16x8x3_ssse3
|
||||
PROCESS_16X8X3_OFFSET 5, .vp8_sad16x8x3_ssse3
|
||||
PROCESS_16X8X3_OFFSET 6, .vp8_sad16x8x3_ssse3
|
||||
PROCESS_16X8X3_OFFSET 7, .vp8_sad16x8x3_ssse3
|
||||
PROCESS_16X8X3_OFFSET 8, .vp8_sad16x8x3_ssse3
|
||||
PROCESS_16X8X3_OFFSET 9, .vp8_sad16x8x3_ssse3
|
||||
PROCESS_16X8X3_OFFSET 10, .vp8_sad16x8x3_ssse3
|
||||
PROCESS_16X8X3_OFFSET 11, .vp8_sad16x8x3_ssse3
|
||||
PROCESS_16X8X3_OFFSET 12, .vp8_sad16x8x3_ssse3
|
||||
PROCESS_16X8X3_OFFSET 13, .vp8_sad16x8x3_ssse3
|
||||
PROCESS_16X8X3_OFFSET 14, .vp8_sad16x8x3_ssse3
|
||||
|
||||
vp8_sad16x8x3_ssse3_aligned_by_15:
|
||||
.vp8_sad16x8x3_ssse3_aligned_by_15:
|
||||
|
||||
PROCESS_16X2X3 1
|
||||
PROCESS_16X2X3 0
|
||||
PROCESS_16X2X3 0
|
||||
PROCESS_16X2X3 0
|
||||
|
||||
vp8_sad16x8x3_ssse3_store_off:
|
||||
.vp8_sad16x8x3_ssse3_store_off:
|
||||
mov rdi, arg(4) ;Results
|
||||
|
||||
movq xmm0, xmm5
|
||||
|
|
|
@ -44,7 +44,7 @@
|
|||
paddd %1, xmm1
|
||||
SUM_ACROSS_Q %1
|
||||
%endmacro
|
||||
;void ssim_parms_sse3(
|
||||
;void ssim_parms_sse2(
|
||||
; unsigned char *s,
|
||||
; int sp,
|
||||
; unsigned char *r,
|
||||
|
@ -61,8 +61,8 @@
|
|||
; or pavgb. At this point this is just meant to be a first pass for calculating
|
||||
; all the parms needed for 16x16 ssim so we can play with dssim as distortion
|
||||
; in mode selection code.
|
||||
global sym(vp8_ssim_parms_16x16_sse3)
|
||||
sym(vp8_ssim_parms_16x16_sse3):
|
||||
global sym(vp8_ssim_parms_16x16_sse2)
|
||||
sym(vp8_ssim_parms_16x16_sse2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 9
|
||||
|
@ -84,7 +84,7 @@ sym(vp8_ssim_parms_16x16_sse3):
|
|||
pxor xmm11,xmm11 ;sum_sxr
|
||||
|
||||
mov rdx, 16 ;row counter
|
||||
NextRow:
|
||||
.NextRow:
|
||||
|
||||
;grab source and reference pixels
|
||||
movdqu xmm5, [rsi]
|
||||
|
@ -107,7 +107,7 @@ NextRow:
|
|||
add rdi, rax ; next r row
|
||||
|
||||
dec rdx ; counter
|
||||
jnz NextRow
|
||||
jnz .NextRow
|
||||
|
||||
SUM_ACROSS_W xmm15
|
||||
SUM_ACROSS_W xmm14
|
||||
|
@ -134,7 +134,7 @@ NextRow:
|
|||
pop rbp
|
||||
ret
|
||||
|
||||
;void ssim_parms_sse3(
|
||||
;void ssim_parms_sse2(
|
||||
; unsigned char *s,
|
||||
; int sp,
|
||||
; unsigned char *r,
|
||||
|
@ -151,8 +151,8 @@ NextRow:
|
|||
; or pavgb. At this point this is just meant to be a first pass for calculating
|
||||
; all the parms needed for 16x16 ssim so we can play with dssim as distortion
|
||||
; in mode selection code.
|
||||
global sym(vp8_ssim_parms_8x8_sse3)
|
||||
sym(vp8_ssim_parms_8x8_sse3):
|
||||
global sym(vp8_ssim_parms_8x8_sse2)
|
||||
sym(vp8_ssim_parms_8x8_sse2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 9
|
||||
|
@ -174,7 +174,7 @@ sym(vp8_ssim_parms_8x8_sse3):
|
|||
pxor xmm11,xmm11 ;sum_sxr
|
||||
|
||||
mov rdx, 8 ;row counter
|
||||
NextRow2:
|
||||
.NextRow:
|
||||
|
||||
;grab source and reference pixels
|
||||
movq xmm3, [rsi]
|
||||
|
@ -188,7 +188,7 @@ NextRow2:
|
|||
add rdi, rax ; next r row
|
||||
|
||||
dec rdx ; counter
|
||||
jnz NextRow2
|
||||
jnz .NextRow
|
||||
|
||||
SUM_ACROSS_W xmm15
|
||||
SUM_ACROSS_W xmm14
|
||||
|
|
|
@ -93,7 +93,7 @@ sym(vp8_subtract_mby_mmx):
|
|||
mov rcx, 16
|
||||
pxor mm0, mm0
|
||||
|
||||
submby_loop:
|
||||
.submby_loop:
|
||||
|
||||
movq mm1, [rsi]
|
||||
movq mm3, [rax]
|
||||
|
@ -139,7 +139,7 @@ submby_loop:
|
|||
lea rsi, [rsi+rdx]
|
||||
|
||||
sub rcx, 1
|
||||
jnz submby_loop
|
||||
jnz .submby_loop
|
||||
|
||||
pop rdi
|
||||
pop rsi
|
||||
|
|
|
@ -91,7 +91,7 @@ sym(vp8_subtract_mby_sse2):
|
|||
|
||||
mov rcx, 8 ; do two lines at one time
|
||||
|
||||
submby_loop:
|
||||
.submby_loop:
|
||||
movdqa xmm0, XMMWORD PTR [rsi] ; src
|
||||
movdqa xmm1, XMMWORD PTR [rax] ; pred
|
||||
|
||||
|
@ -133,7 +133,7 @@ submby_loop:
|
|||
lea rsi, [rsi+rdx*2]
|
||||
|
||||
sub rcx, 1
|
||||
jnz submby_loop
|
||||
jnz .submby_loop
|
||||
|
||||
pop rdi
|
||||
pop rsi
|
||||
|
|
|
@ -71,26 +71,26 @@ sym(vp8_temporal_filter_apply_sse2):
|
|||
|
||||
lea rcx, [rdx + 16*16*1]
|
||||
cmp dword ptr [rsp + block_size], 8
|
||||
jne temporal_filter_apply_load_16
|
||||
jne .temporal_filter_apply_load_16
|
||||
lea rcx, [rdx + 8*8*1]
|
||||
|
||||
temporal_filter_apply_load_8:
|
||||
.temporal_filter_apply_load_8:
|
||||
movq xmm0, [rsi] ; first row
|
||||
lea rsi, [rsi + rbp] ; += stride
|
||||
punpcklbw xmm0, xmm7 ; src[ 0- 7]
|
||||
movq xmm1, [rsi] ; second row
|
||||
lea rsi, [rsi + rbp] ; += stride
|
||||
punpcklbw xmm1, xmm7 ; src[ 8-15]
|
||||
jmp temporal_filter_apply_load_finished
|
||||
jmp .temporal_filter_apply_load_finished
|
||||
|
||||
temporal_filter_apply_load_16:
|
||||
.temporal_filter_apply_load_16:
|
||||
movdqa xmm0, [rsi] ; src (frame1)
|
||||
lea rsi, [rsi + rbp] ; += stride
|
||||
movdqa xmm1, xmm0
|
||||
punpcklbw xmm0, xmm7 ; src[ 0- 7]
|
||||
punpckhbw xmm1, xmm7 ; src[ 8-15]
|
||||
|
||||
temporal_filter_apply_load_finished:
|
||||
.temporal_filter_apply_load_finished:
|
||||
movdqa xmm2, [rdx] ; predictor (frame2)
|
||||
movdqa xmm3, xmm2
|
||||
punpcklbw xmm2, xmm7 ; pred[ 0- 7]
|
||||
|
@ -176,13 +176,13 @@ temporal_filter_apply_load_finished:
|
|||
lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int))
|
||||
|
||||
cmp rdx, rcx
|
||||
je temporal_filter_apply_epilog
|
||||
je .temporal_filter_apply_epilog
|
||||
pxor xmm7, xmm7 ; zero for extraction
|
||||
cmp dword ptr [rsp + block_size], 16
|
||||
je temporal_filter_apply_load_16
|
||||
jmp temporal_filter_apply_load_8
|
||||
je .temporal_filter_apply_load_16
|
||||
jmp .temporal_filter_apply_load_8
|
||||
|
||||
temporal_filter_apply_epilog:
|
||||
.temporal_filter_apply_epilog:
|
||||
; begin epilog
|
||||
mov rbp, [rsp + rbp_backup]
|
||||
add rsp, stack_size
|
||||
|
|
|
@ -27,7 +27,7 @@ sym(vp8_get_mb_ss_mmx):
|
|||
mov rcx, 16
|
||||
pxor mm4, mm4
|
||||
|
||||
NEXTROW:
|
||||
.NEXTROW:
|
||||
movq mm0, [rax]
|
||||
movq mm1, [rax+8]
|
||||
movq mm2, [rax+16]
|
||||
|
@ -44,7 +44,7 @@ NEXTROW:
|
|||
|
||||
add rax, 32
|
||||
dec rcx
|
||||
ja NEXTROW
|
||||
ja .NEXTROW
|
||||
movq QWORD PTR [rsp], mm4
|
||||
|
||||
;return sum[0]+sum[1];
|
||||
|
@ -568,7 +568,7 @@ sym(vp8_filter_block2d_bil4x4_var_mmx):
|
|||
add rsi, r8
|
||||
%endif
|
||||
|
||||
filter_block2d_bil4x4_var_mmx_loop:
|
||||
.filter_block2d_bil4x4_var_mmx_loop:
|
||||
|
||||
movd mm1, [rsi] ;
|
||||
movd mm3, [rsi+1] ;
|
||||
|
@ -614,7 +614,7 @@ filter_block2d_bil4x4_var_mmx_loop:
|
|||
add rdi, r9
|
||||
%endif
|
||||
sub rcx, 1 ;
|
||||
jnz filter_block2d_bil4x4_var_mmx_loop ;
|
||||
jnz .filter_block2d_bil4x4_var_mmx_loop ;
|
||||
|
||||
|
||||
pxor mm3, mm3 ;
|
||||
|
@ -726,7 +726,7 @@ sym(vp8_filter_block2d_bil_var_mmx):
|
|||
add rsi, r8
|
||||
%endif
|
||||
|
||||
filter_block2d_bil_var_mmx_loop:
|
||||
.filter_block2d_bil_var_mmx_loop:
|
||||
|
||||
movq mm1, [rsi] ;
|
||||
movq mm3, [rsi+1] ;
|
||||
|
@ -807,7 +807,7 @@ filter_block2d_bil_var_mmx_loop:
|
|||
add rdi, r9
|
||||
%endif
|
||||
sub rcx, 1 ;
|
||||
jnz filter_block2d_bil_var_mmx_loop ;
|
||||
jnz .filter_block2d_bil_var_mmx_loop ;
|
||||
|
||||
|
||||
pxor mm3, mm3 ;
|
||||
|
|
|
@ -33,7 +33,7 @@ sym(vp8_get_mb_ss_sse2):
|
|||
mov rcx, 8
|
||||
pxor xmm4, xmm4
|
||||
|
||||
NEXTROW:
|
||||
.NEXTROW:
|
||||
movdqa xmm0, [rax]
|
||||
movdqa xmm1, [rax+16]
|
||||
movdqa xmm2, [rax+32]
|
||||
|
@ -50,7 +50,7 @@ NEXTROW:
|
|||
|
||||
add rax, 0x40
|
||||
dec rcx
|
||||
ja NEXTROW
|
||||
ja .NEXTROW
|
||||
|
||||
movdqa xmm3,xmm4
|
||||
psrldq xmm4,8
|
||||
|
@ -126,7 +126,7 @@ sym(vp8_get16x16var_sse2):
|
|||
pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
|
||||
mov rcx, 16
|
||||
|
||||
var16loop:
|
||||
.var16loop:
|
||||
movdqu xmm1, XMMWORD PTR [rsi]
|
||||
movdqu xmm2, XMMWORD PTR [rdi]
|
||||
|
||||
|
@ -160,7 +160,7 @@ var16loop:
|
|||
add rdi, rdx
|
||||
|
||||
sub rcx, 1
|
||||
jnz var16loop
|
||||
jnz .var16loop
|
||||
|
||||
|
||||
movdqa xmm1, xmm6
|
||||
|
|
|
@ -47,7 +47,7 @@ sym(vp8_filter_block2d_bil_var_ssse3):
|
|||
movsxd rax, dword ptr arg(5) ; xoffset
|
||||
|
||||
cmp rax, 0 ; skip first_pass filter if xoffset=0
|
||||
je filter_block2d_bil_var_ssse3_sp_only
|
||||
je .filter_block2d_bil_var_ssse3_sp_only
|
||||
|
||||
shl rax, 4 ; point to filter coeff with xoffset
|
||||
lea rax, [rax + rcx] ; HFilter
|
||||
|
@ -55,7 +55,7 @@ sym(vp8_filter_block2d_bil_var_ssse3):
|
|||
movsxd rdx, dword ptr arg(6) ; yoffset
|
||||
|
||||
cmp rdx, 0 ; skip second_pass filter if yoffset=0
|
||||
je filter_block2d_bil_var_ssse3_fp_only
|
||||
je .filter_block2d_bil_var_ssse3_fp_only
|
||||
|
||||
shl rdx, 4
|
||||
lea rdx, [rdx + rcx] ; VFilter
|
||||
|
@ -88,7 +88,7 @@ sym(vp8_filter_block2d_bil_var_ssse3):
|
|||
lea rsi, [rsi + r8]
|
||||
%endif
|
||||
|
||||
filter_block2d_bil_var_ssse3_loop:
|
||||
.filter_block2d_bil_var_ssse3_loop:
|
||||
movdqu xmm1, XMMWORD PTR [rsi]
|
||||
movdqu xmm2, XMMWORD PTR [rsi+1]
|
||||
movdqa xmm3, xmm1
|
||||
|
@ -142,15 +142,15 @@ filter_block2d_bil_var_ssse3_loop:
|
|||
%endif
|
||||
|
||||
sub rcx, 1
|
||||
jnz filter_block2d_bil_var_ssse3_loop
|
||||
jnz .filter_block2d_bil_var_ssse3_loop
|
||||
|
||||
jmp filter_block2d_bil_variance
|
||||
jmp .filter_block2d_bil_variance
|
||||
|
||||
filter_block2d_bil_var_ssse3_sp_only:
|
||||
.filter_block2d_bil_var_ssse3_sp_only:
|
||||
movsxd rdx, dword ptr arg(6) ; yoffset
|
||||
|
||||
cmp rdx, 0 ; Both xoffset =0 and yoffset=0
|
||||
je filter_block2d_bil_var_ssse3_full_pixel
|
||||
je .filter_block2d_bil_var_ssse3_full_pixel
|
||||
|
||||
shl rdx, 4
|
||||
lea rdx, [rdx + rcx] ; VFilter
|
||||
|
@ -169,7 +169,7 @@ filter_block2d_bil_var_ssse3_sp_only:
|
|||
|
||||
lea rsi, [rsi + rax]
|
||||
|
||||
filter_block2d_bil_sp_only_loop:
|
||||
.filter_block2d_bil_sp_only_loop:
|
||||
movdqu xmm3, XMMWORD PTR [rsi]
|
||||
movdqa xmm2, xmm1
|
||||
movdqa xmm0, xmm3
|
||||
|
@ -209,11 +209,11 @@ filter_block2d_bil_sp_only_loop:
|
|||
%endif
|
||||
|
||||
sub rcx, 1
|
||||
jnz filter_block2d_bil_sp_only_loop
|
||||
jnz .filter_block2d_bil_sp_only_loop
|
||||
|
||||
jmp filter_block2d_bil_variance
|
||||
jmp .filter_block2d_bil_variance
|
||||
|
||||
filter_block2d_bil_var_ssse3_full_pixel:
|
||||
.filter_block2d_bil_var_ssse3_full_pixel:
|
||||
mov rsi, arg(0) ;ref_ptr
|
||||
mov rdi, arg(2) ;src_ptr
|
||||
movsxd rcx, dword ptr arg(4) ;Height
|
||||
|
@ -221,7 +221,7 @@ filter_block2d_bil_var_ssse3_full_pixel:
|
|||
movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
|
||||
pxor xmm0, xmm0
|
||||
|
||||
filter_block2d_bil_full_pixel_loop:
|
||||
.filter_block2d_bil_full_pixel_loop:
|
||||
movq xmm1, QWORD PTR [rsi]
|
||||
punpcklbw xmm1, xmm0
|
||||
movq xmm2, QWORD PTR [rsi+8]
|
||||
|
@ -244,11 +244,11 @@ filter_block2d_bil_full_pixel_loop:
|
|||
lea rsi, [rsi + rax] ;ref_pixels_per_line
|
||||
lea rdi, [rdi + rdx] ;src_pixels_per_line
|
||||
sub rcx, 1
|
||||
jnz filter_block2d_bil_full_pixel_loop
|
||||
jnz .filter_block2d_bil_full_pixel_loop
|
||||
|
||||
jmp filter_block2d_bil_variance
|
||||
jmp .filter_block2d_bil_variance
|
||||
|
||||
filter_block2d_bil_var_ssse3_fp_only:
|
||||
.filter_block2d_bil_var_ssse3_fp_only:
|
||||
mov rsi, arg(0) ;ref_ptr
|
||||
mov rdi, arg(2) ;src_ptr
|
||||
movsxd rcx, dword ptr arg(4) ;Height
|
||||
|
@ -260,7 +260,7 @@ filter_block2d_bil_var_ssse3_fp_only:
|
|||
movsxd r9, dword ptr arg(3) ;src_pixels_per_line
|
||||
%endif
|
||||
|
||||
filter_block2d_bil_fp_only_loop:
|
||||
.filter_block2d_bil_fp_only_loop:
|
||||
movdqu xmm1, XMMWORD PTR [rsi]
|
||||
movdqu xmm2, XMMWORD PTR [rsi+1]
|
||||
movdqa xmm3, xmm1
|
||||
|
@ -298,11 +298,11 @@ filter_block2d_bil_fp_only_loop:
|
|||
%endif
|
||||
|
||||
sub rcx, 1
|
||||
jnz filter_block2d_bil_fp_only_loop
|
||||
jnz .filter_block2d_bil_fp_only_loop
|
||||
|
||||
jmp filter_block2d_bil_variance
|
||||
jmp .filter_block2d_bil_variance
|
||||
|
||||
filter_block2d_bil_variance:
|
||||
.filter_block2d_bil_variance:
|
||||
pxor xmm0, xmm0
|
||||
pxor xmm1, xmm1
|
||||
pxor xmm5, xmm5
|
||||
|
|
|
@ -140,6 +140,8 @@ extern prototype_getmbss(vp8_get_mb_ss_sse2);
|
|||
extern prototype_variance(vp8_mse16x16_wmt);
|
||||
extern prototype_variance2(vp8_get8x8var_sse2);
|
||||
extern prototype_variance2(vp8_get16x16var_sse2);
|
||||
extern prototype_ssimpf(vp8_ssim_parms_8x8_sse2)
|
||||
extern prototype_ssimpf(vp8_ssim_parms_16x16_sse2)
|
||||
|
||||
#if !CONFIG_RUNTIME_CPU_DETECT
|
||||
#undef vp8_variance_sad4x4
|
||||
|
@ -208,6 +210,14 @@ extern prototype_variance2(vp8_get16x16var_sse2);
|
|||
#undef vp8_variance_mse16x16
|
||||
#define vp8_variance_mse16x16 vp8_mse16x16_wmt
|
||||
|
||||
#if ARCH_X86_64
|
||||
#undef vp8_ssimpf_8x8
|
||||
#define vp8_ssimpf_8x8 vp8_ssim_parms_8x8_sse2
|
||||
|
||||
#undef vp8_ssimpf_16x16
|
||||
#define vp8_ssimpf_16x16 vp8_ssim_parms_16x16_sse2
|
||||
#endif
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
|
|
@ -111,29 +111,6 @@ void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)
|
|||
|
||||
#endif
|
||||
|
||||
#if HAVE_SSSE3
|
||||
#if CONFIG_INTERNAL_STATS
|
||||
#if ARCH_X86_64
|
||||
typedef void ssimpf
|
||||
(
|
||||
unsigned char *s,
|
||||
int sp,
|
||||
unsigned char *r,
|
||||
int rp,
|
||||
unsigned long *sum_s,
|
||||
unsigned long *sum_r,
|
||||
unsigned long *sum_sq_s,
|
||||
unsigned long *sum_sq_r,
|
||||
unsigned long *sum_sxr
|
||||
);
|
||||
|
||||
extern ssimpf vp8_ssim_parms_16x16_sse3;
|
||||
extern ssimpf vp8_ssim_parms_8x8_sse3;
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
|
||||
{
|
||||
#if CONFIG_RUNTIME_CPU_DETECT
|
||||
|
@ -245,6 +222,13 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
|
|||
|
||||
#if !(CONFIG_REALTIME_ONLY)
|
||||
cpi->rtcd.temporal.apply = vp8_temporal_filter_apply_sse2;
|
||||
#endif
|
||||
|
||||
#if CONFIG_INTERNAL_STATS
|
||||
#if ARCH_X86_64
|
||||
cpi->rtcd.variance.ssimpf_8x8 = vp8_ssim_parms_8x8_sse2;
|
||||
cpi->rtcd.variance.ssimpf_16x16 = vp8_ssim_parms_16x16_sse2;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
@ -280,14 +264,6 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
|
|||
cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_ssse3;
|
||||
|
||||
cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_ssse3;
|
||||
|
||||
#if CONFIG_INTERNAL_STATS
|
||||
#if ARCH_X86_64
|
||||
cpi->rtcd.variance.ssimpf_8x8 = vp8_ssim_parms_8x8_sse3;
|
||||
cpi->rtcd.variance.ssimpf = vp8_ssim_parms_16x16_sse3;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче