Bug 1865695 - Update dav1d to 3c41fa88ce0fee1fcd1cdfdf53ad8db9bcf3ad29 r=chunmin

Differential Revision: https://phabricator.services.mozilla.com/D194125
This commit is contained in:
Updatebot 2023-12-12 03:25:19 +00:00
Родитель 8aa051554c
Коммит dade619a29
5 изменённых файлов: 769 добавлений и 42 удалений

Просмотреть файл

@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: fd4ecc2fd870fa267e1995600dddf212c6e49300 (2023-10-19T17:00:20.000+02:00).
release: 3c41fa88ce0fee1fcd1cdfdf53ad8db9bcf3ad29 (2023-11-13T13:05:58.000+01:00).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: fd4ecc2fd870fa267e1995600dddf212c6e49300
revision: 3c41fa88ce0fee1fcd1cdfdf53ad8db9bcf3ad29
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/

Просмотреть файл

@ -1,2 +1,2 @@
/* auto-generated, do not edit */
#define DAV1D_VERSION "fd4ecc2fd870fa267e1995600dddf212c6e49300"
#define DAV1D_VERSION "3c41fa88ce0fee1fcd1cdfdf53ad8db9bcf3ad29"

1
third_party/dav1d/src/x86/ipred.h поставляемый
Просмотреть файл

@ -138,6 +138,7 @@ static ALWAYS_INLINE void intra_pred_dsp_init_x86(Dav1dIntraPredDSPContext *cons
init_angular_ipred_fn(HOR_PRED, ipred_h, avx512icl);
init_angular_ipred_fn(VERT_PRED, ipred_v, avx512icl);
init_angular_ipred_fn(Z1_PRED, ipred_z1, avx512icl);
init_angular_ipred_fn(Z2_PRED, ipred_z2, avx512icl);
init_angular_ipred_fn(Z3_PRED, ipred_z3, avx512icl);
#endif
init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx512icl);

56
third_party/dav1d/src/x86/ipred_avx2.asm поставляемый
Просмотреть файл

@ -2275,14 +2275,14 @@ ALIGN function_align
vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2]
punpckhqdq xm3, xm3 ; 34 44 44 44
pmaddubsw xm3, xm4
movd xm4, r6m ; max_width
pminsw xm4, xm15
vpbroadcastb xm4, xm4
vpbroadcastd xm4, r6m ; max_width
packssdw xm4, xm4
paddw xm0, xm2
paddw xm0, xm3
pmulhrsw xm0, xm13
psubb xm4, [base+pb_1to32]
packsswb xm4, xm4
psrlq xm1, 8
psubb xm4, [base+pb_1to32]
packuswb xm0, xm0
vpblendvb xm0, xm1, xm4
movd [rsp+65], xm0
@ -2324,14 +2324,14 @@ ALIGN function_align
vpbroadcastd m3, [base+z_filter_k-4+r3*4+12*2]
pshufb m2, m4
pmaddubsw m2, m3
movd xm4, r7m ; max_height
pminsw xm4, xm15
vpbroadcastb xm4, xm4
psubb xm4, [base+pb_16to1]
vpbroadcastd xm4, r7m ; max_height
packssdw xm4, xm4
paddw m1, m0
paddw m1, m2
pmulhrsw m1, m13
packsswb xm4, xm4
vextracti128 xm0, m1, 1
psubb xm4, [base+pb_16to1]
packuswb xm0, xm1
vpblendvb xm0, [rsp+48], xm4
mova [rsp+48], xm0
@ -2465,14 +2465,14 @@ ALIGN function_align
pmaddubsw xm2, xm4
vpbroadcastd xm4, [base+z_filter_k-4+r3*4+12*2]
pmaddubsw xm3, xm4
movd xm4, r6m ; max_width
pminuw xm4, xm15
vpbroadcastb xm4, xm4
vpbroadcastd xm4, r6m ; max_width
packssdw xm4, xm4
paddw xm0, xm2
paddw xm0, xm3
pmulhrsw xm0, xm13
psubb xm4, [base+pb_1to32]
packsswb xm4, xm4
psrldq xm1, 1
psubb xm4, [base+pb_1to32]
packuswb xm0, xm0
vpblendvb xm0, xm1, xm4
movq [rsp+65], xm0
@ -2530,14 +2530,14 @@ ALIGN function_align
vinserti128 m2, [rsp+43], 1
pshufb m0, m2, m0
pmaddubsw m0, m7
movd xm7, r7m ; max_height
vpbroadcastd m7, r7m ; max_height
pshufb m1, m2, m1
pmaddubsw m1, m8
pshufb m2, m4
pmaddubsw m2, m9
pminsw xm7, xm15
packssdw m7, m7
paddw m1, m0
vpbroadcastb m7, xm7
packsswb m7, m7
paddw m1, m2
pmulhrsw m1, m13
psubb m7, [base+pb_32to1]
@ -2679,14 +2679,14 @@ ALIGN function_align
shufps m2, m1, q2121 ; 12 23 34 45 56 67 78 89 89 9a ab bc cd de ef ff
pmaddubsw m2, m4
pmaddubsw m1, m5
movd xm4, r6m ; max_width
pminsw xm4, xm15
vpbroadcastb xm4, xm4
vpbroadcastd xm4, r6m ; max_width
packssdw xm4, xm4
paddw m0, m2
paddw m0, m1
pmulhrsw m0, m13
psubb xm4, [base+pb_1to32]
packsswb xm4, xm4
vextracti128 xm2, m0, 1
psubb xm4, [base+pb_1to32]
packuswb xm0, xm2
vpblendvb xm0, xm6, xm4
movu [rsp+65], xm0
@ -2703,9 +2703,9 @@ ALIGN function_align
vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1]
vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2]
.w16_filter_left:
movd xm6, r7m ; max_height
pminsw xm6, xm15
vpbroadcastb m6, xm6
vpbroadcastd m6, r7m ; max_height
packssdw m6, m6
packsswb m6, m6
cmp hd, 32
jl .w16_filter_left_h16
vpbroadcastd xm0, [base+pb_5]
@ -2916,9 +2916,9 @@ ALIGN function_align
vinserti128 m6, [base+z_filter_s+22], 1 ; 56 67 78 89 9a ab bc cd ab bc cd de ef ff ff ff
movu xm3, [tlq+ 6]
vinserti128 m3, [tlq+17], 1
movd xm0, r6m ; max_width
pminsw xm0, xm15
vpbroadcastb m10, xm0
vpbroadcastd m10, r6m ; max_width
packssdw m10, m10
packsswb m10, m10
.w32_filter_above:
pshufb m0, m1, m5
shufps m4, m5, m6, q1021 ; 12 23 34 45 56 67 78 89 67 78 89 9a ab bc cd de
@ -2974,20 +2974,20 @@ ALIGN function_align
paddw m0, m3
movu xm2, [tlq+36]
vinserti128 m2, [tlq+49], 1
vpbroadcastd m10, r6m ; max_width
pshufb m4, m2, m4
pmaddubsw m4, m7
pshufb m3, m2, m6
pmaddubsw m3, m8
pshufb m2, m5
pmaddubsw m2, m9
movd xm5, r6m ; max_width
pminsw xm5, xm15
vpbroadcastb m10, xm5
packssdw m10, m10
paddw m3, m4
paddw m2, m3
vpbroadcastd m3, [base+pb_32]
pmulhrsw m0, m13
pmulhrsw m2, m13
packsswb m10, m10
mova xm5, [base+z_filter_s]
vinserti128 m5, [base+z_filter_s+6], 1
psubb m3, m10, m3

748
third_party/dav1d/src/x86/ipred_avx512.asm поставляемый
Просмотреть файл

@ -139,11 +139,19 @@ z_ypos_off2: db 64, 65, 64, 65, 0, 0, 0, 0, 64, 65, 64, 65, 0, 0, 0, 0
db 65, 66, 65, 66, 1, 1, 1, 1, 65, 66, 65, 66, 1, 1, 1, 1
db 66, 67, 66, 67, 2, 2, 2, 2, 66, 67, 66, 67, 2, 2, 2, 2
db 67, 68, 67, 68, 3, 3, 3, 3, 67, 68, 67, 68, 3, 3, 3, 3
z_ypos_mul1: dw 1*512, 2*512, 3*512, 4*512, 5*512, 6*512, 7*512, 8*512
z_ypos_off3: db 1, 2, 1, 2, 1, 1, 1, 1, 3, 4, 3, 4, 1, 1, 1, 1
db 5, 6, 5, 6, 3, 3, 3, 3, 7, 8, 7, 8, 3, 3, 3, 3
db 9, 10, 9, 10, 5, 5, 5, 5, 11, 12, 11, 12, 5, 5, 5, 5
db 13, 14, 13, 14, 7, 7, 7, 7, 15, 16, 15, 16, 7, 7, 7, 7
z_ypos_mul1a: dw 1, 2, 3, 4, 5, 6, 7, 8, 17, 18, 19, 20, 21, 22, 23, 24
dw 33, 34, 35, 36, 37, 38, 39, 40, 49, 50, 51, 52, 53, 54, 55, 56
z_ypos_mul1b: dw 9, 10, 11, 12, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32
dw 41, 42, 43, 44, 45, 46, 47, 48, 57, 58, 59, 60, 61, 62, 63, 64
z_ypos_mul2a: dw 1*512, 2*512, 3*512, 4*512, 5*512, 6*512, 7*512, 8*512
dw 17*512, 18*512, 19*512, 20*512, 21*512, 22*512, 23*512, 24*512
dw 33*512, 34*512, 35*512, 36*512, 37*512, 38*512, 39*512, 40*512
dw 49*512, 50*512, 51*512, 52*512, 53*512, 54*512, 55*512, 56*512
z_ypos_mul2: dw 9*512, 10*512, 11*512, 12*512, 13*512, 14*512, 15*512, 16*512
z_ypos_mul2b: dw 9*512, 10*512, 11*512, 12*512, 13*512, 14*512, 15*512, 16*512
dw 25*512, 26*512, 27*512, 28*512, 29*512, 30*512, 31*512, 32*512
dw 41*512, 42*512, 43*512, 44*512, 45*512, 46*512, 47*512, 48*512
dw 57*512, 58*512, 59*512, 60*512, 61*512, 62*512, 63*512, 64*512
@ -165,6 +173,7 @@ pb_15: times 4 db 15
pb_16: times 4 db 16
pb_31: times 4 db 31
pb_63: times 4 db 63
pb_90: times 4 db 90
pb_128: times 4 db 128
pw_128: times 2 dw 128
pw_255: times 2 dw 255
@ -198,6 +207,7 @@ JMP_TABLE ipred_smooth_8bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_v_8bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_h_8bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_z1_8bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_z2_8bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_z3_8bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_dc_8bpc, avx512icl, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
@ -1824,6 +1834,722 @@ cglobal ipred_z1_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx
.w64_end:
RET
; ipred_z2_8bpc (avx512icl): 8-bit directional intra predictor that blends
; pixels fetched from the top edge with pixels fetched from the left edge.
; Declared through x86inc's cglobal: the first 3 arguments (dst, stride, tl)
; are loaded into registers; w, h and angle are fetched explicitly below
; (wm / hm / anglem). 9 general-purpose and 18 vector registers are used.
; This prologue derives dx/dy from the angle, loads the constants shared by
; all widths and dispatches through the per-width jump table.
; State on exit from the prologue:
;   m8  = left edge bytes (64 bytes at the original tl-64, permuted by pb_63to0)
;   m14 = z_frac_table, m15 = pw_512 (pmulhrsw by 512 == (x + 32) >> 6)
;   tlq = original tl + 1, dxd/dyd = negated table entries
cglobal ipred_z2_8bpc, 3, 9, 18, dst, stride, tl, w, h, angle, dx, _, dy
; w = tzcnt(width): index into the jump table.
tzcnt wd, wm
movifnidn angled, anglem
lea dxq, [dr_intra_derivative-90]
movzx dyd, angleb
; After this xor, bit 0x400 being set means "edge filter disabled"
; (see the `test angled, 0x400` checks at the top of every width path).
xor angled, 0x400
; r7 = table-90, dxq = table-90-angle; both offsets are made even below
; because the table entries are read as words.
mov r7, dxq
sub dxq, dyq
movifnidn hd, hm
and dyd, ~1
and dxq, ~1
movzx dyd, word [r7+dyq] ; angle - 90
; NOTE(review): `base` used in the addresses below is defined outside this
; view; presumably it is relative to r7 = z_filter_t0 - confirm.
lea r7, [z_filter_t0]
movzx dxd, word [dxq+270] ; 180 - angle
movsxd wq, [base+ipred_z2_8bpc_avx512icl_table+wq*4]
mova m8, [base+pb_63to0]
neg dyd
vpermb m8, m8, [tlq-64] ; left
lea wq, [base+ipred_z2_8bpc_avx512icl_table+wq]
mova m14, [base+z_frac_table]
; From here on [tlq] addresses the first top pixel (it is loaded as "top"
; by the width paths).
inc tlq
vpbroadcastd m15, [base+pw_512]
neg dxd
jmp wq
; ---- width 4 ----
; Optional edge preprocessing (upsample or smooth the top / left edge),
; followed by a loop that produces 8 rows of 4 pixels per iteration.
.w4:
movd xm7, [tlq]
vpbroadcastq m10, [base+z_xpos_off2a]
test angled, 0x400
jnz .w4_main ; !enable_intra_edge_filter
; (h + 2) << 6 is ANDed with the biased angle; any common bit means the
; top edge is not upsampled (conditions listed on the jnz).
lea r3d, [hq+2]
add angled, 1022
shl r3d, 6
test r3d, angled
jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
; xm2 = index clamp consumed by .upsample.
vpbroadcastd xm2, [base+pb_4]
sub angled, 1075 ; angle - 53
call .upsample_above
lea r3d, [hq+3]
; Upsampled top edge: switch the x offsets to pb_0to63+1 and interleave the
; interpolated samples (xm0) with the original top pixels.
vpbroadcastq m10, [pb_0to63+1]
punpcklbw xm7, xm0, xm7
call .filter_strength
jmp .w4_filter_left
.w4_upsample_left:
call .upsample_left
; Upsampled left edge: different y offsets (z_ypos_off3) and a per-iteration
; base step of 16 instead of 8 (m9 is added to m4 in the loop).
movsldup m16, [base+z_ypos_off3]
vpbroadcastd m9, [base+pb_16]
punpcklbw xm8, xm0, xm8
jmp .w4_main2
.w4_no_upsample_above:
lea r3d, [hq+3]
sub angled, 1112 ; angle - 90
; r3d = mask of matching filter-strength thresholds (0 = no filtering).
call .filter_strength
test r3d, r3d
jz .w4_no_filter_above
; xm5 = last valid top index (3) used as clamp by .filter_top_w16.
vpbroadcastd xm5, [base+pb_3]
call .filter_top_w16
.w4_no_filter_above:
lea r3d, [hq+2]
add angled, 973 ; angle + 883
shl r3d, 6
test r3d, angled
; NOTE(review): the jump is taken when the AND is zero, so the condition in
; the comment below reads like the fall-through (no upsample) case - confirm.
jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
; Left-edge strength: compare 90 - ym17 (ym17 = angle byte broadcast by
; .filter_strength) against the thresholds in ym16, restricted to the
; width/height matches already recorded in k2.
vpbroadcastd ym0, [base+pb_90]
psubb ym0, ym17
vpcmpgtb k2{k2}, ym0, ym16
kmovd r3d, k2
.w4_filter_left:
test r3d, r3d
jz .w4_main
; Number of set mask bits = filter strength expected by .filter_left_h16.
popcnt r3d, r3d
call .filter_left_h16
.w4_main:
movsldup m16, [base+z_ypos_off1]
vpbroadcastd m9, [base+pb_8]
.w4_main2:
vpbroadcastq m3, [base+z_ypos_mul1a]
vpbroadcastw m0, dyd
movsldup m1, [base+z_xpos_mul]
vpbroadcastw m5, dxd
; Put the 16 bytes at tlq-16 into the top 128-bit lane of both edge vectors.
; NOTE(review): presumably so that the top-left pixel is reachable through
; wrapped (negative) byte indices - confirm.
vinserti32x4 m7, [tlq-16], 3
vinserti32x4 m8, [tlq-16], 3
; m3 = column multipliers * dy (per-column y position).
pmullw m3, m0
vbroadcasti32x4 m2, [base+z_xpos_bc]
pmullw m1, m5 ; xpos0..3
psllw m5, 5 ; dx*8
; m4 = integer part of ypos, m3 = weight pair looked up in z_frac_table.
psraw m4, m3, 6
psrlw m3, 1
packsswb m4, m4
vpermw m3, m3, m14 ; 64-frac, frac
punpcklbw m4, m4
; r2 (tlq) is no longer needed as a pointer and becomes stride*3.
lea r2, [strideq*3]
paddb m4, m16 ; base, base+1
.w4_loop:
; Top prediction: byte indices from xpos (+ offsets in m10), weights from
; the fractional bits via z_frac_table.
pshufb m16, m1, m2
psrlw m0, m1, 3
paddb m16, m10
vpermw m0, m0, m14
vpmovw2m k1, m16 ; base_x < 0
vpermb m16, m16, m7
pmaddubsw m16, m0
; Left prediction, written only to the lanes selected by k1.
vpermb m0, m4, m8
pmaddubsw m16{k1}, m0, m3
; Round ((x + 32) >> 6) and narrow words to bytes: ym16 = 8 rows of 4 pixels.
pmulhrsw m16, m15
vpmovwb ym16, m16
movd [dstq+strideq*0], xm16
pextrd [dstq+strideq*1], xm16, 1
pextrd [dstq+strideq*2], xm16, 2
pextrd [dstq+r2 ], xm16, 3
; h -= 8; a negative result means only 4 rows were wanted (h == 4).
sub hd, 8
jl .w4_end
paddsw m1, m5
vextracti128 xm16, ym16, 1
lea dstq, [dstq+strideq*4]
paddb m4, m9
movd [dstq+strideq*0], xm16
pextrd [dstq+strideq*1], xm16, 1
pextrd [dstq+strideq*2], xm16, 2
pextrd [dstq+r2 ], xm16, 3
lea dstq, [dstq+strideq*4]
; Flags are still those of `sub hd, 8`: none of the SIMD, lea or store
; instructions in between modify EFLAGS.
jg .w4_loop
.w4_end:
RET
; Edge upsampling helpers (reached via `call`, return with `ret`).
;   .upsample_above: source = 16 bytes at tlq-1; also doubles dx.
;   .upsample_left:  source = xm8 shifted together with [tlq-16]; xm2 is set
;                    to a broadcast of h; also doubles dy.
; In (.upsample_above): xm2 = index clamp (pb_4 / pb_8 from the callers).
; Out: xm0 = interpolated samples packed to unsigned bytes; the callers
;      interleave them with the original edge using punpcklbw.
; Clobbers: xm1, xm2, xm3.
.upsample_above: ; w4/w8
mova xm0, [tlq-1]
xor angled, 0x7f ; 180 - angle
add dxd, dxd
jmp .upsample
.upsample_left: ; h4/h8
palignr xm0, xm8, [tlq-16], 15
vpbroadcastb xm2, hd
add dyd, dyd
.upsample:
; Two shuffled copies of the edge (the second one index-clamped by xm2) are
; multiplied by the pb_m4_36 coefficient pairs and summed.
pshufb xm1, xm0, [base+z_filter4_s1]
pminub xm2, [base+z_filter_s4]
vpbroadcastd xm3, [base+pb_m4_36]
pshufb xm0, xm2
pmaddubsw xm1, xm3
pmaddubsw xm0, xm3
paddw xm0, xm1
; pmulhrsw by pw_512 (xm15) rounds: (x + 32) >> 6.
pmulhrsw xm0, xm15
packuswb xm0, xm0
ret
; Edge-filter strength lookup (reached via `call`).
; In:  r3d    = size key set by the caller (h+3 / h+7 / h+15), low byte used
;      angled = biased angle; bits above the low byte select the
;               z_filter_t0 row (NOTE(review): presumably the is_sm flag)
; Out: r3d  = k1 = mask of thresholds exceeded by the angle byte
;             (0 = no filter; callers popcnt it to get the strength)
;      k2   = lanes where the size key matches z_filter_wh
;      ym16 = thresholds from z_filter_t0 (xm load, upper lanes zeroed)
;      ym17 = angle byte broadcast
;      m2   = dword at tlq-4 broadcast, m9 = pb_0to63
; The callers reuse k2, ym16 and ym17 for the left-edge strength.
.filter_strength:
vpbroadcastb ym16, r3d
mov r3d, angled
vpbroadcastd m2, [tlq-4]
vpbroadcastb ym17, angled
shr r3d, 8
vpcmpeqb k2, ym16, [base+z_filter_wh]
mova xm16, [base+z_filter_t0+r3*8]
vpcmpgtb k1{k2}, ym17, ym16
mova m9, [pb_0to63]
kmovd r3d, k1
ret
; ---- width 8 ----
; Same structure as .w4, but 4 rows of 8 pixels per iteration and a separate
; left-only loop once no pixel is taken from the top edge any more.
.w8:
movq xm7, [tlq]
vbroadcasti32x4 m10, [base+z_xpos_off2a]
test angled, 0x400
jnz .w8_main
; The low byte of (angle + 126) is replaced by h, so one unsigned compare
; against 8 evaluates all conditions listed on the branch.
lea r3d, [angleq+126]
mov r3b, hb
cmp r3d, 8
ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
; xm2 = index clamp consumed by .upsample.
vpbroadcastd xm2, [base+pb_8]
sub angled, 53 ; angle - 53
call .upsample_above
lea r3d, [hq+7]
; Upsampled top edge: x offsets from pb_0to63+1, interpolated samples (xm0)
; interleaved with the original top pixels.
vbroadcasti32x4 m10, [pb_0to63+1]
punpcklbw xm7, xm0, xm7
call .filter_strength
jmp .w8_filter_left
.w8_upsample_left:
call .upsample_left
; Upsampled left edge: z_ypos_off3 offsets and a base step of 8 instead of 4.
movshdup m16, [base+z_ypos_off3]
vpbroadcastd m9, [base+pb_8]
punpcklbw xm8, xm0, xm8
jmp .w8_main2
.w8_no_upsample_above:
lea r3d, [hq+7]
sub angled, 90 ; angle - 90
call .filter_strength
test r3d, r3d
jz .w8_no_filter_above
; xm5 = last valid top index (7) used as clamp by .filter_top_w16.
vpbroadcastd xm5, [base+pb_7]
call .filter_top_w16
.w8_no_filter_above:
; Same byte-replacement trick as above, this time biased by -51.
lea r3d, [angleq-51]
mov r3b, hb
cmp r3d, 8
jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm
; Left-edge strength: 90 - ym17 compared against the thresholds in ym16,
; restricted to the matches recorded in k2 by .filter_strength.
vpbroadcastd ym0, [base+pb_90]
psubb ym0, ym17
vpcmpgtb k2{k2}, ym0, ym16
kmovd r3d, k2
.w8_filter_left:
test r3d, r3d
jz .w8_main
; h == 32 needs more than 16 left pixels: use the 64-byte helper, which
; always applies the coefficients at z_filter_k+4*2.
cmp hd, 32
je .w8_filter_left_h32
popcnt r3d, r3d
call .filter_left_h16
jmp .w8_main
.w8_filter_left_h32:
call .filter_left_h64
.w8_main:
movshdup m16, [base+z_ypos_off2]
vpbroadcastd m9, [base+pb_4]
.w8_main2:
vbroadcasti32x4 m3, [base+z_ypos_mul1a]
vpbroadcastw m0, dyd
movshdup m1, [base+z_xpos_mul]
vpbroadcastw m5, dxd
; Put the 16 bytes at tlq-16 into the top 128-bit lane of both edge vectors.
vinserti32x4 m7, [tlq-16], 3
vinserti32x4 m8, [tlq-16], 3
pmullw m3, m0
vpbroadcastd m2, [base+pb_1]
pmullw m1, m5 ; xpos0..3
psllw m5, 4 ; dx*4
; m4 = integer part of ypos, m3 = weight pair looked up in z_frac_table.
psraw m4, m3, 6
psrlw m3, 1
packsswb m4, m4
vpermw m3, m3, m14 ; 64-frac, frac
; r3d = dx + (8<<6) (dx is negative here); it is advanced by 4*dx per
; iteration and selects the left-only loop once it drops below zero.
lea r3d, [dxq+(8<<6)]
paddsb m4, m16
shl dxd, 2
paddsb m0, m4, m2
; r2 (tlq) is no longer needed as a pointer and becomes stride*3.
lea r2, [strideq*3]
punpcklbw m4, m0 ; base, base+1
.w8_loop:
; Top prediction from m7, then left prediction merged into the lanes
; flagged by k1.
pshufb m16, m1, m2
psrlw m0, m1, 3
paddb m16, m10
vpermw m0, m0, m14
vpmovw2m k1, m16 ; base_x < 0
vpermb m16, m16, m7
pmaddubsw m16, m0
vpermb m0, m4, m8
pmaddubsw m16{k1}, m0, m3
; Round ((x + 32) >> 6) and narrow to bytes: ym16 = 4 rows of 8 pixels.
pmulhrsw m16, m15
vpmovwb ym16, m16
vextracti128 xm17, ym16, 1
movq [dstq+strideq*0], xm16
movhps [dstq+strideq*1], xm16
movq [dstq+strideq*2], xm17
movhps [dstq+r2 ], xm17
sub hd, 4
jz .w8_end
paddw m1, m5
lea dstq, [dstq+strideq*4]
paddb m4, m9
add r3d, dxd
jge .w8_loop
; Remaining rows only read the left edge: skip the top fetch and the mask.
.w8_leftonly_loop:
vpermb m16, m4, m8
pmaddubsw m16, m3
paddb m4, m9
pmulhrsw m16, m15
vpmovwb ym16, m16
vextracti128 xm17, ym16, 1
movq [dstq+strideq*0], xm16
movhps [dstq+strideq*1], xm16
movq [dstq+strideq*2], xm17
movhps [dstq+r2 ], xm17
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w8_leftonly_loop
.w8_end:
RET
; Smooth up to 16 top-edge pixels in place (reached via `call`).
; In:  r3d  = strength mask from .filter_strength (popcnt -> coefficient set)
;      xm5  = broadcast index of the last valid top pixel (3 / 7 / 15)
;      xm7  = top pixels, xm2 = dword at tlq-4 broadcast
;      xm9  = low 16 bytes of pb_0to63, xm15 = pw_512
; Out: xm7  = filtered pixels in the lanes with index < max_width, original
;             pixels elsewhere
; Clobbers: r3d, xm0, xm1, xm3-xm6, xm11-xm13, k1.
.filter_top_w16:
mova xm0, [base+z_filter_s1]
popcnt r3d, r3d
; Tap indices from z_filter_s4 / z_filter_s5 are clamped to the last valid
; pixel (xm5).
pminub xm4, xm5, [base+z_filter_s4]
vpermi2b xm0, xm7, xm2
pminub xm5, [base+z_filter_s5]
pshufb xm1, xm7, [base+z_filter_s2]
; Three coefficient groups for the selected strength (12 bytes apart).
vpbroadcastd xm11, [base+z_filter_k+(r3-1)*4+12*0]
pshufb xm3, xm7, [base+z_filter_s3]
vpbroadcastd xm12, [base+z_filter_k+(r3-1)*4+12*1]
pshufb xm4, xm7, xm4
vpbroadcastd xm13, [base+z_filter_k+(r3-1)*4+12*2]
pshufb xm5, xm7, xm5
pmaddubsw xm0, xm11
pmaddubsw xm1, xm11
pmaddubsw xm6, xm3, xm12
; NOTE(review): the inline w32/w64 paths read max_width as r6m; r7m here is
; presumably that same slot shifted by the return address pushed by `call`
; - confirm against the x86inc stack-argument layout.
vpbroadcastd xm12, r7m ; max_width
pmaddubsw xm3, xm13
pmaddubsw xm4, xm11
pmaddubsw xm5, xm11
; max_width is narrowed dword -> word -> byte with signed saturation so it
; can be compared against the byte indices in xm9.
packssdw xm12, xm12
paddw xm0, xm6
paddw xm1, xm3
paddw xm0, xm4
paddw xm1, xm5
packsswb xm12, xm12
; Round: (x + 32) >> 6.
pmulhrsw xm0, xm15
pmulhrsw xm1, xm15
vpcmpgtb k1, xm12, xm9 ; x < max_width
packuswb xm7{k1}, xm0, xm1
ret
; Smooth up to 16 left-edge pixels in place (reached via `call`).
; In:  r3d  = filter strength (callers popcnt the mask first, or pass 3)
;      hq   = block height; h-1 is the last valid index used as clamp
;      xm8  = left pixels, xm2 = dword at tlq-4 broadcast
;      xm9  = low 16 bytes of pb_0to63, xm15 = pw_512
; Out: xm8  = filtered pixels in the lanes with index < max_height,
;             original pixels elsewhere
; Clobbers: r5d, xm0, xm1, xm3-xm6, xm11-xm13, k1.
.filter_left_h16:
lea r5d, [hq-1]
mova xm0, [base+z_filter_s1]
vpbroadcastb xm5, r5d
vpermi2b xm0, xm8, xm2
; Tap indices from z_filter_s4 / z_filter_s5 are clamped to h-1.
pminub xm4, xm5, [base+z_filter_s4]
pshufb xm1, xm8, [base+z_filter_s2]
pminub xm5, [base+z_filter_s5]
pshufb xm3, xm8, [base+z_filter_s3]
; Three coefficient groups for the selected strength (12 bytes apart).
vpbroadcastd xm11, [base+z_filter_k+(r3-1)*4+12*0]
pshufb xm4, xm8, xm4
vpbroadcastd xm12, [base+z_filter_k+(r3-1)*4+12*1]
pshufb xm5, xm8, xm5
vpbroadcastd xm13, [base+z_filter_k+(r3-1)*4+12*2]
pmaddubsw xm0, xm11
pmaddubsw xm1, xm11
pmaddubsw xm6, xm3, xm12
; NOTE(review): r8m here is presumably the max_height slot shifted by the
; return address pushed by `call` (the top-edge helper uses r7m for
; max_width in the same way) - confirm.
vpbroadcastd xm12, r8m ; max_height
pmaddubsw xm3, xm13
pmaddubsw xm4, xm11
pmaddubsw xm5, xm11
; max_height is narrowed dword -> word -> byte with signed saturation so it
; can be compared against the byte indices in xm9.
packssdw xm12, xm12
paddw xm0, xm6
paddw xm1, xm3
paddw xm0, xm4
paddw xm1, xm5
packsswb xm12, xm12
; Round: (x + 32) >> 6.
pmulhrsw xm0, xm15
pmulhrsw xm1, xm15
vpcmpgtb k1, xm12, xm9 ; y < max_height
packuswb xm8{k1}, xm0, xm1
ret
; ---- width 16 ----
; No upsampling at this width: optional smoothing of both edges, then a loop
; that writes 4 rows of 16 pixels per iteration. Two result vectors
; (m16 / m17) are computed per iteration and packed together before storing.
.w16:
movu xm7, [tlq] ; top
test angled, 0x400
jnz .w16_main
lea r3d, [hq+15]
sub angled, 90
call .filter_strength
test r3d, r3d
jz .w16_no_filter_above
; xm5 = last valid top index (15) used as clamp by .filter_top_w16.
vpbroadcastd xm5, [base+pb_15]
call .filter_top_w16
.w16_no_filter_above:
; h > 16: the left edge is filtered with the 64-byte helper instead.
cmp hd, 16
jg .w16_filter_left_h64
; Left-edge strength: 90 - ym17 compared against the thresholds in ym16,
; restricted to the matches recorded in k2 by .filter_strength.
vpbroadcastd ym0, [base+pb_90]
psubb ym0, ym17
vpcmpgtb k2{k2}, ym0, ym16
kmovd r3d, k2
test r3d, r3d
jz .w16_main
popcnt r3d, r3d
call .filter_left_h16
jmp .w16_main
.w16_filter_left_h64:
call .filter_left_h64
.w16_main:
vbroadcasti32x4 m6, [base+z_ypos_mul1a] ; 1.. 8
vbroadcasti32x4 m5, [base+z_ypos_mul1b] ; 9..16
vpbroadcastw m0, dyd
; Put the 16 bytes at tlq-16 into the top 128-bit lane of the top vector.
vinserti32x4 m7, [tlq-16], 3
vpbroadcastd m2, [base+pb_1]
vpbroadcastw m12, dxd
movshdup m1, [base+z_xpos_mul]
; m6 / m5 = column multipliers * dy for columns 1..8 / 9..16.
pmullw m6, m0
vbroadcasti32x4 m3, [base+z_xpos_off2a]
pmullw m5, m0
vbroadcasti32x4 m4, [base+z_xpos_off2b]
pmullw m1, m12 ; xpos0 xpos1 xpos2 xpos3
vpbroadcastd m9, [base+pb_4]
psllw m12, 4 ; dx*4
movshdup m16, [base+z_ypos_off2]
; m10 / m11 = weight pairs from z_frac_table, m6 / m5 = integer parts.
psrlw m10, m6, 1
psrlw m11, m5, 1
vpermw m10, m10, m14 ; 64-frac, frac
psraw m6, 6
vpermw m11, m11, m14
psraw m5, 6
mov r5d, -(16<<6) ; 15 to avoid top, +1 to avoid topleft
packsswb m6, m5
; r3d is the running position that decides between the top-only, blended
; and left-only code paths; it advances by 4*dx per iteration.
mov r3d, 1<<6
paddsb m6, m16
sub r5d, dxd ; left-only threshold
paddsb m0, m6, m2
shl dxd, 2
punpcklbw m5, m6, m0 ; base, base+1
; r2 (tlq) is no longer needed as a pointer and becomes stride*3.
lea r2, [strideq*3]
punpckhbw m6, m0
.w16_loop:
; Top prediction for both column halves; k1 / k2 flag the lanes whose index
; is negative.
pshufb m17, m1, m2
psrlw m0, m1, 3
paddb m16, m3, m17
vpermw m0, m0, m14
paddb m17, m4
vpmovw2m k1, m16
vpermb m16, m16, m7
vpmovw2m k2, m17
vpermb m17, m17, m7
pmaddubsw m16, m0
pmaddubsw m17, m0
; While r3d stays non-negative the rows are taken from the top edge only.
add r3d, dxd
jge .w16_toponly
; Two-table permute over m8 (left) and m7, merged into the flagged lanes.
mova m0, m8
vpermt2b m0, m5, m7
pmaddubsw m16{k1}, m0, m10
mova m0, m8
vpermt2b m0, m6, m7
pmaddubsw m17{k2}, m0, m11
.w16_toponly:
; Round ((x + 32) >> 6) and pack: each 128-bit lane of m16 is one row.
pmulhrsw m16, m15
pmulhrsw m17, m15
packuswb m16, m17
mova [dstq+strideq*0], xm16
vextracti128 [dstq+strideq*1], ym16, 1
vextracti32x4 [dstq+strideq*2], m16, 2
vextracti32x4 [dstq+r2 ], m16, 3
sub hd, 4
jz .w16_end
paddw m1, m12
lea dstq, [dstq+strideq*4]
paddb m5, m9
paddb m6, m9
; Below the threshold every remaining pixel comes from the left edge.
cmp r3d, r5d
jge .w16_loop
.w16_leftonly_loop:
vpermb m16, m5, m8
vpermb m17, m6, m8
pmaddubsw m16, m10
pmaddubsw m17, m11
paddb m5, m9
paddb m6, m9
pmulhrsw m16, m15
pmulhrsw m17, m15
packuswb m16, m17
mova [dstq+strideq*0], xm16
vextracti128 [dstq+strideq*1], ym16, 1
vextracti32x4 [dstq+strideq*2], m16, 2
vextracti32x4 [dstq+r2 ], m16, 3
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w16_leftonly_loop
.w16_end:
RET
; ---- width 32 ----
; When edge filtering is enabled, the top edge is always smoothed inline with
; the coefficients at z_filter_k+4*2, and the left edge with the matching
; helper. The loop writes 2 rows of 32 pixels per iteration: the upper
; 256 bits of the result are the first row, the lower 256 bits the second.
.w32:
movu ym7, [tlq]
test angled, 0x400
jnz .w32_main
; Inline top-edge filter (same structure as .filter_top_w16, ymm-wide).
vpbroadcastd m2, [tlq-4]
mova ym0, [base+z_filter_s1]
vbroadcasti32x4 ym1, [base+z_filter_s2]
vbroadcasti32x4 ym3, [base+z_filter_s3]
vbroadcasti32x4 ym4, [base+z_filter_s4]
vpermi2b ym0, ym7, ym2 ; al bl
; Tap indices from z_filter_s5 are clamped to the last top pixel (31).
vpbroadcastd ym5, [base+pb_31]
pminub ym5, [base+z_filter_s5]
pshufb ym1, ym7, ym1 ; ah bh
vpbroadcastd ym11, [base+z_filter_k+4*2+12*0]
pshufb ym3, ym7, ym3 ; cl ch
vpbroadcastd ym12, [base+z_filter_k+4*2+12*1]
pshufb ym4, ym7, ym4 ; el dl
vpbroadcastd ym13, [base+z_filter_k+4*2+12*2]
vpermb ym5, ym5, ym7 ; eh dh
pmaddubsw ym0, ym11
pmaddubsw ym1, ym11
pmaddubsw ym6, ym3, ym12
; max_width, read directly as r6m (this code is not inside a `call`).
vpbroadcastd ym12, r6m
pmaddubsw ym3, ym13
pmaddubsw ym4, ym11
pmaddubsw ym5, ym11
mova m9, [pb_0to63]
; max_width is narrowed dword -> word -> byte with signed saturation.
packssdw ym12, ym12
paddw ym0, ym6
paddw ym1, ym3
paddw ym0, ym4
paddw ym1, ym5
packsswb ym12, ym12
pmulhrsw ym0, ym15
pmulhrsw ym1, ym15
vpcmpgtb k1, ym12, ym9 ; x < max_width
packuswb ym7{k1}, ym0, ym1
cmp hd, 16
jg .w32_filter_h64
; Fixed strength 3: (3-1)*4 selects the same z_filter_k+4*2 coefficients
; that were used for the top edge above.
mov r3d, 3
call .filter_left_h16
jmp .w32_main
.w32_filter_h64:
call .filter_left_h64
.w32_main:
vbroadcasti32x8 m6, [base+z_ypos_mul1a] ; 1.. 8
vbroadcasti32x8 m5, [base+z_ypos_mul1b] ; 9..16
vpbroadcastw m0, dyd
; Put the 16 bytes at tlq-16 into the top 128-bit lane of the top vector.
vinserti32x4 m7, [tlq-16], 3
; r2 (tlq) is not used as a pointer after this point.
rorx r2q, dxq, 62 ; dx << 2
vpbroadcastd m2, [base+pb_1]
vpbroadcastw m1, r2d
pmullw m6, m0
vbroadcasti32x8 m3, [base+z_xpos_off2a]
pmullw m5, m0
vbroadcasti32x8 m4, [base+z_xpos_off2b]
; A ymm-sized move zeroes bits 256..511 of m0, so the paddw below adds the
; extra step to the low half only; m12 = per-iteration step for both rows.
mova ym0, ym1
paddw m12, m1, m1
vpbroadcastd m9, [base+pb_2]
paddw m1, m0 ; xpos1 xpos0
; Same idea for the left base: +1 in the low half only.
mova ym0, ym2
psrlw m10, m6, 1
psrlw m11, m5, 1
vpermw m10, m10, m14 ; 64-frac, frac
psraw m6, 6
vpermw m11, m11, m14
psraw m5, 6
mov r5d, -(32<<6) ; 31 to avoid top, +1 to avoid topleft
packsswb m6, m5
; r3d is the running position that decides between the top-only, blended
; and left-only code paths; it advances by 2*dx per iteration.
mov r3d, 1<<6
paddsb m6, m0
sub r5d, dxd ; left-only threshold
paddsb m0, m6, m2
add dxd, dxd
punpcklbw m5, m6, m0 ; base, base+1
punpckhbw m6, m0
.w32_loop:
; Top prediction for both column halves; k1 / k2 flag the lanes whose index
; is negative.
pshufb m17, m1, m2
psrlw m0, m1, 3
paddb m16, m3, m17
vpermw m0, m0, m14
paddb m17, m4
vpmovw2m k1, m16
vpermb m16, m16, m7
vpmovw2m k2, m17
vpermb m17, m17, m7
pmaddubsw m16, m0
pmaddubsw m17, m0
; While r3d stays non-negative the rows are taken from the top edge only.
add r3d, dxd
jge .w32_toponly
; Two-table permute over m8 (left) and m7, merged into the flagged lanes.
mova m0, m8
vpermt2b m0, m5, m7
pmaddubsw m16{k1}, m0, m10
mova m0, m8
vpermt2b m0, m6, m7
pmaddubsw m17{k2}, m0, m11
.w32_toponly:
pmulhrsw m16, m15
pmulhrsw m17, m15
packuswb m16, m17
; Upper 256 bits = first row, lower 256 bits = second row.
vextracti32x8 [dstq+strideq*0], m16, 1
mova [dstq+strideq*1], ym16
sub hd, 2
jz .w32_end
paddw m1, m12
lea dstq, [dstq+strideq*2]
paddb m5, m9
paddb m6, m9
; Below the threshold every remaining pixel comes from the left edge.
cmp r3d, r5d
jge .w32_loop
.w32_leftonly_loop:
vpermb m16, m5, m8
vpermb m17, m6, m8
pmaddubsw m16, m10
pmaddubsw m17, m11
paddb m5, m9
paddb m6, m9
pmulhrsw m16, m15
pmulhrsw m17, m15
packuswb m16, m17
vextracti32x8 [dstq+strideq*0], m16, 1
mova [dstq+strideq*1], ym16
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w32_leftonly_loop
.w32_end:
RET
; Smooth up to 64 left-edge pixels in place (reached via `call`), always
; with the coefficients at z_filter_k+4*2.
; In:  hq  = block height; h-1 is the last valid index used as clamp
;      m8  = left pixels, m2 = dword at tlq-4 broadcast
;      m9  = pb_0to63, m15 = pw_512
; Out: m8  = filtered pixels in the lanes with index < max_height, original
;            pixels elsewhere
; Clobbers: r3d, m0, m1, m3-m6, m11-m13, k1.
.filter_left_h64:
mova m0, [base+z_filter_s1]
lea r3d, [hq-1]
vbroadcasti32x4 m4, [base+z_filter_s4]
vpbroadcastb m5, r3d
vbroadcasti32x4 m1, [base+z_filter_s2]
vbroadcasti32x4 m3, [base+z_filter_s3]
vpermi2b m0, m8, m2 ; al bl
; Tap indices from z_filter_s5 are clamped to h-1.
pminub m5, [base+z_filter_s5]
pshufb m1, m8, m1 ; ah bh
vpbroadcastd m11, [base+z_filter_k+4*2+12*0]
pshufb m3, m8, m3 ; cl ch
vpbroadcastd m12, [base+z_filter_k+4*2+12*1]
pshufb m4, m8, m4 ; el dl
vpbroadcastd m13, [base+z_filter_k+4*2+12*2]
vpermb m5, m5, m8 ; eh dh
pmaddubsw m0, m11
pmaddubsw m1, m11
pmaddubsw m6, m3, m12
; NOTE(review): r8m here is presumably the max_height slot shifted by the
; return address pushed by `call` - confirm against the x86inc
; stack-argument layout.
vpbroadcastd m12, r8m ; max_height
pmaddubsw m3, m13
pmaddubsw m4, m11
pmaddubsw m5, m11
; max_height is narrowed dword -> word -> byte with signed saturation so it
; can be compared against the byte indices in m9.
packssdw m12, m12
paddw m0, m6
paddw m1, m3
paddw m0, m4
paddw m1, m5
packsswb m12, m12
; Round: (x + 32) >> 6.
pmulhrsw m0, m15
pmulhrsw m1, m15
vpcmpgtb k1, m12, m9 ; y < max_height
packuswb m8{k1}, m0, m1
ret
; ---- width 64 ----
; When edge filtering is enabled, both edges are always smoothed with the
; coefficients at z_filter_k+4*2. The loop writes one row of 64 pixels per
; iteration. The full top row occupies m7, so the dword at tlq-4 is kept in
; m9 and used as the second table of the two-source permutes.
.w64:
movu m7, [tlq]
test angled, 0x400
jnz .w64_main
; Inline top-edge filter (same structure as .filter_top_w16, zmm-wide).
vpbroadcastd m2, [tlq-4]
mova m0, [base+z_filter_s1]
vbroadcasti32x4 m1, [base+z_filter_s2]
vbroadcasti32x4 m3, [base+z_filter_s3]
vbroadcasti32x4 m4, [base+z_filter_s4]
vpermi2b m0, m7, m2 ; al bl
; Tap indices from z_filter_s5 are clamped to the last top pixel (63).
vpbroadcastd m5, [base+pb_63]
pminub m5, [base+z_filter_s5]
pshufb m1, m7, m1 ; ah bh
vpbroadcastd m11, [base+z_filter_k+4*2+12*0]
pshufb m3, m7, m3 ; cl ch
vpbroadcastd m12, [base+z_filter_k+4*2+12*1]
pshufb m4, m7, m4 ; el dl
vpbroadcastd m13, [base+z_filter_k+4*2+12*2]
vpermb m5, m5, m7 ; eh dh
pmaddubsw m0, m11
pmaddubsw m1, m11
pmaddubsw m6, m3, m12
; max_width, read directly as r6m (this code is not inside a `call`).
vpbroadcastd m12, r6m
pmaddubsw m3, m13
pmaddubsw m4, m11
pmaddubsw m5, m11
mova m9, [pb_0to63]
; max_width is narrowed dword -> word -> byte with signed saturation.
packssdw m12, m12
paddw m0, m6
paddw m1, m3
paddw m0, m4
paddw m1, m5
packsswb m12, m12
pmulhrsw m0, m15
pmulhrsw m1, m15
vpcmpgtb k1, m12, m9 ; x < max_width
packuswb m7{k1}, m0, m1
call .filter_left_h64 ; always filter the full 64 pixels for simplicity
.w64_main:
vpbroadcastw m5, dyd
; m9 = dword at tlq-4 broadcast: second table for vpermi2b / vpermt2b below.
vpbroadcastd m9, [tlq-4]
; r2 (tlq) is not used as a pointer after this point.
rorx r2q, dxq, 62 ; dx << 2
pmullw m6, m5, [base+z_ypos_mul1a] ; can overflow, but it doesn't matter as such
pmullw m5, [base+z_ypos_mul1b] ; pixels aren't selected from the left edge
vpbroadcastw m1, r2d ; xpos
mova m3, [base+z_xpos_off2a]
mova m4, [base+z_xpos_off2b]
; m12 = per-row xpos step (one row per iteration).
mova m12, m1
vpbroadcastd m2, [base+pb_1]
; m10 / m11 = weight pairs from z_frac_table, m6 / m5 = integer parts.
psrlw m10, m6, 1
psrlw m11, m5, 1
vpermw m10, m10, m14 ; 64-frac, frac
psraw m6, 6
vpermw m11, m11, m14
psraw m5, 6
mov r5d, -(64<<6) ; 63 to avoid top, +1 to avoid topleft
packsswb m6, m5
; r3d is the running position that decides between the top-only, blended
; and left-only code paths; it advances by dx per row.
mov r3d, 1<<6
paddsb m0, m6, m2
sub r5d, dxd ; left-only threshold
punpcklbw m5, m6, m0 ; base, base+1
punpckhbw m6, m0
.w64_loop:
; Top prediction for both column halves via a two-table permute over m7
; and m9; k1 / k2 flag the lanes whose index is negative.
pshufb m17, m1, m2
psrlw m0, m1, 3
paddb m16, m3, m17
vpermw m0, m0, m14
paddb m17, m4
vpmovw2m k1, m16 ; base_x < 0
vpermi2b m16, m7, m9
vpmovw2m k2, m17
vpermi2b m17, m7, m9
pmaddubsw m16, m0
pmaddubsw m17, m0
; While r3d stays non-negative the row is taken from the top edge only.
add r3d, dxd
jge .w64_toponly
; Two-table permute over m8 (left) and m9, merged into the flagged lanes.
mova m0, m8
vpermt2b m0, m5, m9
pmaddubsw m16{k1}, m0, m10
mova m0, m8
vpermt2b m0, m6, m9
pmaddubsw m17{k2}, m0, m11
.w64_toponly:
; Round ((x + 32) >> 6) and pack to one 64-byte row.
pmulhrsw m16, m15
pmulhrsw m17, m15
packuswb m16, m17
mova [dstq], m16
dec hd
jz .w64_end
paddw m1, m12
add dstq, strideq
; The left base advances by 1 (m2 = pb_1) per row.
paddb m5, m2
paddb m6, m2
; Below the threshold every remaining pixel comes from the left edge.
cmp r3d, r5d
jge .w64_loop
.w64_leftonly_loop:
vpermb m16, m5, m8
vpermb m17, m6, m8
pmaddubsw m16, m10
pmaddubsw m17, m11
paddb m5, m2
paddb m6, m2
pmulhrsw m16, m15
pmulhrsw m17, m15
packuswb m16, m17
mova [dstq], m16
add dstq, strideq
dec hd
jg .w64_leftonly_loop
.w64_end:
RET
cglobal ipred_z3_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy
lea r7, [z_filter_t0]
tzcnt wd, wm
@ -1879,7 +2605,7 @@ cglobal ipred_z3_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy
vpbroadcastd m6, [base+pb_8]
.w4_main2:
vpbroadcastw m0, dyd
vpbroadcastq m2, [base+z_ypos_mul1] ; 1..4
vpbroadcastq m2, [base+z_ypos_mul2a] ; 1..4
pmulhuw m2, m0 ; ypos >> 1
lea r2, [strideq*3]
vpermw m3, m2, m14 ; 64-frac, frac
@ -1960,7 +2686,7 @@ cglobal ipred_z3_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy
vpbroadcastd m6, [base+pb_4]
.w8_main2:
vpbroadcastw m0, dyd
vbroadcasti32x4 m2, [base+z_ypos_mul1] ; 1..8
vbroadcasti32x4 m2, [base+z_ypos_mul2a] ; 1..8
pmulhuw m2, m0 ; ypos >> 1
lea r2, [strideq*3]
vpermw m3, m2, m14 ; 64-frac, frac
@ -2037,10 +2763,10 @@ cglobal ipred_z3_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy
valignq m11, m8, m7, 1
call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter
.w16_main:
vpbroadcastd m6, [base+pb_4]
vbroadcasti32x4 m3, [base+z_ypos_mul2a] ; 1.. 8
vbroadcasti32x4 m2, [base+z_ypos_mul2b] ; 9..15
vpbroadcastw m0, dyd
vbroadcasti32x4 m3, [base+z_ypos_mul1] ; 1.. 8
vbroadcasti32x4 m2, [base+z_ypos_mul2] ; 9..15
vpbroadcastd m6, [base+pb_4]
pmulhuw m3, m0 ; ypos >> 1
pmulhuw m2, m0
movshdup m0, [base+z_ypos_off2]
@ -2098,9 +2824,9 @@ cglobal ipred_z3_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy
valignq m11, m8, m7, 1
call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter
.w32_main:
vbroadcasti32x8 m3, [base+z_ypos_mul1] ; 1.. 8
vbroadcasti32x8 m3, [base+z_ypos_mul2a] ; 1.. 8
vbroadcasti32x8 m2, [base+z_ypos_mul2b] ; 9..15
vpbroadcastw m0, dyd
vbroadcasti32x8 m2, [base+z_ypos_mul2] ; 9..15
vpbroadcastd m1, [base+pb_1]
pmulhuw m3, m0 ; ypos >> 1
pmulhuw m2, m0
@ -2148,8 +2874,8 @@ cglobal ipred_z3_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy
call mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter
.w64_main:
vpbroadcastw m2, dyd
pmulhuw m3, m2, [base+z_ypos_mul1]
pmulhuw m2, [base+z_ypos_mul2]
pmulhuw m3, m2, [base+z_ypos_mul2a]
pmulhuw m2, [base+z_ypos_mul2b]
vpbroadcastd m6, [base+pb_1]
vpermw m4, m3, m14 ; 64-frac, frac
psrlw m3, 5