diff --git a/media/libdav1d/moz.yaml b/media/libdav1d/moz.yaml
index 15d6e60777ea..4fa15ad35ee3 100644
--- a/media/libdav1d/moz.yaml
+++ b/media/libdav1d/moz.yaml
@@ -20,11 +20,11 @@ origin:
 
   # Human-readable identifier for this version/release
   # Generally "version NNN", "tag SSS", "bookmark SSS"
-  release: e58afe4dd9057591882a01c31382c203e8a61c92 (2023-07-25T16:10:07.000+02:00).
+  release: fd4ecc2fd870fa267e1995600dddf212c6e49300 (2023-10-19T17:00:20.000+02:00).
 
   # Revision to pull in
   # Must be a long or short commit SHA (long preferred)
-  revision: e58afe4dd9057591882a01c31382c203e8a61c92
+  revision: fd4ecc2fd870fa267e1995600dddf212c6e49300
 
   # The package's license, where possible using the mnemonic from
   # https://spdx.org/licenses/
diff --git a/media/libdav1d/vcs_version.h b/media/libdav1d/vcs_version.h
index f6c4cdb76394..6d9754b53668 100644
--- a/media/libdav1d/vcs_version.h
+++ b/media/libdav1d/vcs_version.h
@@ -1,2 +1,2 @@
 /* auto-generated, do not edit */
-#define DAV1D_VERSION "e58afe4dd9057591882a01c31382c203e8a61c92"
+#define DAV1D_VERSION "fd4ecc2fd870fa267e1995600dddf212c6e49300"
diff --git a/third_party/dav1d/NEWS b/third_party/dav1d/NEWS
index b541a2747df5..54f8557328a5 100644
--- a/third_party/dav1d/NEWS
+++ b/third_party/dav1d/NEWS
@@ -1,3 +1,16 @@
+Changes for 1.3.0 'Tundra Peregrine Falcon (Calidus)':
+------------------------------------------------------
+
+1.3.0 is a medium release of dav1d, focusing on new APIs and memory usage reduction.
+
+- Reduce memory usage in numerous places
+- ABI break in Dav1dSequenceHeader, Dav1dFrameHeader, Dav1dContentLightLevel structures
+- new API function to check the API version: dav1d_version_api()
+- Rewrite of the SGR functions for ARM64 to be faster
+- NEON implementation of save_tmvs for ARM32 and ARM64
+- x86 palette DSP for pal_idx_finish function
+
+
 Changes for 1.2.1 'Arctic Peregrine Falcon':
 -------------------------------------------
 
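The NEWS entry above names the new dav1d_version_api() call without showing usage. A minimal sketch of how a caller might probe it is below; the function name comes from the changelog, but the (major << 16 | minor << 8 | patch) packing of the return value is an assumption here, not something this patch documents.

```c
/* Hedged sketch: probing the API version added in dav1d 1.3.0.
 * The bit layout of the returned value is an assumption. */
#include <stdio.h>
#include <dav1d/dav1d.h>

int main(void) {
    const unsigned api = dav1d_version_api();
    printf("dav1d %s, API version %u.%u.%u\n",
           dav1d_version(),   /* existing version-string getter */
           api >> 16,         /* assumed: major */
           (api >> 8) & 0xff, /* assumed: minor */
           api & 0xff);       /* assumed: patch */
    return 0;
}
```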
diff --git a/third_party/dav1d/meson.build b/third_party/dav1d/meson.build
index 1104aef9b98c..2b88f3c45d01 100644
--- a/third_party/dav1d/meson.build
+++ b/third_party/dav1d/meson.build
@@ -23,7 +23,7 @@
 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 project('dav1d', ['c'],
-    version: '1.2.1',
+    version: '1.3.0',
     default_options: ['c_std=c99',
                       'warning_level=2',
                       'buildtype=release',
diff --git a/third_party/dav1d/src/arm/64/ipred.S b/third_party/dav1d/src/arm/64/ipred.S
index 4f5cf5120e87..709238e2f854 100644
--- a/third_party/dav1d/src/arm/64/ipred.S
+++ b/third_party/dav1d/src/arm/64/ipred.S
@@ -3529,6 +3529,7 @@ function ipred_z3_fill_padding_neon, export=0
        br              x9
 
 2:
+       AARCH64_VALID_JUMP_TARGET
        st1             {v31.h}[0], [x0],  x1
        subs            w4,  w4,  #4
        st1             {v31.h}[0], [x13], x1
@@ -3547,6 +3548,7 @@ function ipred_z3_fill_padding_neon, export=0
        b               1b
 
 4:
+       AARCH64_VALID_JUMP_TARGET
        st1             {v31.s}[0], [x0],  x1
        subs            w4,  w4,  #4
        st1             {v31.s}[0], [x13], x1
@@ -3565,6 +3567,7 @@ function ipred_z3_fill_padding_neon, export=0
        b               1b
 
 8:
+       AARCH64_VALID_JUMP_TARGET
        st1             {v31.8b}, [x0],  x1
        subs            w4,  w4,  #4
        st1             {v31.8b}, [x13], x1
@@ -3585,6 +3588,7 @@ function ipred_z3_fill_padding_neon, export=0
 16:
 32:
 64:
+       AARCH64_VALID_JUMP_TARGET
        st1             {v31.16b}, [x0],  x1
        subs            w4,  w4,  #4
        st1             {v31.16b}, [x13], x1
diff --git a/third_party/dav1d/src/arm/64/ipred16.S b/third_party/dav1d/src/arm/64/ipred16.S
index ae5ce9f8da10..3f8cff986932 100644
--- a/third_party/dav1d/src/arm/64/ipred16.S
+++ b/third_party/dav1d/src/arm/64/ipred16.S
@@ -3664,6 +3664,7 @@ function ipred_z3_fill_padding_neon, export=0
        br              x9
 
 2:
+       AARCH64_VALID_JUMP_TARGET
        st1             {v31.s}[0], [x0],  x1
        subs            w4,  w4,  #4
        st1             {v31.s}[0], [x13], x1
@@ -3682,6 +3683,7 @@ function ipred_z3_fill_padding_neon, export=0
        b               1b
 
 4:
+       AARCH64_VALID_JUMP_TARGET
        st1             {v31.4h}, [x0],  x1
        subs            w4,  w4,  #4
        st1             {v31.4h}, [x13], x1
@@ -3703,6 +3705,7 @@ function ipred_z3_fill_padding_neon, export=0
 16:
 32:
 64:
+       AARCH64_VALID_JUMP_TARGET
        st1             {v31.8h}, [x0],  x1
        subs            w4,  w4,  #4
        st1             {v31.8h}, [x13], x1
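The hunks above add AARCH64_VALID_JUMP_TARGET at labels reached through the `br x9` jump tables, while the refmvs.S hunks that follow switch labels to AARCH64_VALID_CALL_TARGET for routines entered with `blr` and ending in `ret`. Under ARMv8.5 BTI an indirect `br` must land on a "bti j" pad and an indirect `blr` on a "bti c" pad, which is what these macros emit; hand-written assembly has to place them itself. A rough GNU-C analogy of the two indirect-branch shapes (not dav1d code; compilers plant the pads automatically with -mbranch-protection=bti):

```c
/* Toy sketch: computed goto targets get "bti j", function entries
 * (reached via indirect call) get "bti c" when built with BTI. */
#include <stdio.h>

static void w4(void) { puts("w4 path"); }
static void w8(void) { puts("w8 path"); }

int main(void) {
    /* indirect call (blr): targets need call landing pads */
    void (*tbl[2])(void) = { w4, w8 };
    tbl[1]();

    /* computed goto (br): targets need jump landing pads */
    static const void *jmp[2] = { &&case4, &&case8 };
    goto *jmp[0];
case4:
    puts("case4 path");
    return 0;
case8:
    puts("case8 path");
    return 0;
}
```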
diff --git a/third_party/dav1d/src/arm/64/refmvs.S b/third_party/dav1d/src/arm/64/refmvs.S
index b567e194404a..e905682f47cd 100644
--- a/third_party/dav1d/src/arm/64/refmvs.S
+++ b/third_party/dav1d/src/arm/64/refmvs.S
@@ -202,27 +202,27 @@ function save_tmvs_neon, export=1
        ret
 10:
-       AARCH64_VALID_JUMP_TARGET
+       AARCH64_VALID_CALL_TARGET
        add             x16, x3,  #4
        st1             {v0.s}[0], [x3]
        st1             {v0.b}[4], [x16]
        add             x3,  x3,  #5
        ret
 20:
-       AARCH64_VALID_JUMP_TARGET
+       AARCH64_VALID_CALL_TARGET
        add             x16, x3,  #8
        st1             {v0.d}[0], [x3]
        st1             {v0.h}[4], [x16]
        add             x3,  x3,  #2*5
        ret
 40:
-       AARCH64_VALID_JUMP_TARGET
+       AARCH64_VALID_CALL_TARGET
        st1             {v0.16b}, [x3]
        str             s1, [x3, #16]
        add             x3,  x3,  #4*5
        ret
 80:
-       AARCH64_VALID_JUMP_TARGET
+       AARCH64_VALID_CALL_TARGET
        // This writes 6 full entries plus 2 extra bytes
        st1             {v0.16b, v1.16b}, [x3]
        // Write the last few, overlapping with the first write.
@@ -230,7 +230,7 @@ function save_tmvs_neon, export=1
        add             x3,  x3,  #8*5
        ret
 160:
-       AARCH64_VALID_JUMP_TARGET
+       AARCH64_VALID_CALL_TARGET
        add             x16, x3,  #6*5
        add             x17, x3,  #12*5
        // This writes 6 full entries plus 2 extra bytes
diff --git a/third_party/dav1d/src/mem.c b/third_party/dav1d/src/mem.c
index 1888af361813..7e6eb4c066d1 100644
--- a/third_party/dav1d/src/mem.c
+++ b/third_party/dav1d/src/mem.c
@@ -303,7 +303,7 @@ COLD int dav1d_mem_pool_init(const enum AllocationType type,
             *ppool = pool;
             return 0;
         }
-        free(pool);
+        dav1d_free(pool);
     }
     *ppool = NULL;
     return DAV1D_ERR(ENOMEM);
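The mem.c fix pairs the release of `pool` with dav1d's own allocator instead of plain free(). When an allocator wraps malloc (for tracking, alignment, or similar), the pointer handed to the caller is generally not the pointer malloc returned, so releasing it with the system free() is undefined behavior. A toy illustration of the hazard (not the dav1d implementation; the header bookkeeping here is hypothetical):

```c
/* Sketch: why a wrapped allocation must go back through the
 * matching free. Hypothetical wrapper, not dav1d's mem.c. */
#include <stdlib.h>

typedef struct { size_t size; } alloc_hdr;

static void *wrapped_malloc(const size_t sz) {
    alloc_hdr *const h = malloc(sizeof(*h) + sz);
    if (!h) return NULL;
    h->size = sz;  /* bookkeeping lives in front of the user pointer */
    return h + 1;  /* caller sees memory just past the header */
}

static void wrapped_free(void *const p) {
    if (p) free((alloc_hdr *)p - 1); /* step back to the real block */
}

int main(void) {
    void *p = wrapped_malloc(64);
    /* free(p) here would hand a mid-block pointer to the system
     * allocator and corrupt the heap; wrapped_free(p) is required. */
    wrapped_free(p);
    return 0;
}
```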
diff --git a/third_party/dav1d/src/x86/filmgrain16_avx512.asm b/third_party/dav1d/src/x86/filmgrain16_avx512.asm
index 00dd6af59999..5cbebcef500f 100644
--- a/third_party/dav1d/src/x86/filmgrain16_avx512.asm
+++ b/third_party/dav1d/src/x86/filmgrain16_avx512.asm
@@ -29,11 +29,7 @@
 
 %if ARCH_X86_64
 
-SECTION_RODATA 64
-pb_0to63: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-          db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
-          db 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47
-          db 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63
+SECTION_RODATA 16
 scale_mask: db -1, -1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1
 scale_shift: dw 7, 7, 6, 6, 5, 5, 4, 4
 pw_27_17_17_27: dw 108, 68, 68, 108, 27, 17, 17, 27
@@ -53,6 +49,8 @@ uv_offset_mul: dd 256
                dd 1024
 pb_8_9_0_1: db 8, 9, 0, 1
 
+cextern pb_0to63
+
 SECTION .text
 
 INIT_ZMM avx512icl
@@ -382,7 +380,7 @@ cglobal fguv_32x32xn_i%1_16bpc, 6, 15, 22, dst, src, stride, fg_data, w, scaling
     packssdw        m4, m5, m5
     vpbroadcastd    m21, [base+scale_shift+r9*8+4]
 %if %2
-    mova            m12, [base+pb_0to63] ; pw_even
+    mova            m12, [pb_0to63] ; pw_even
     mov             r13d, 0x0101
     vpbroadcastq    m10, [base+pw_23_22+r9*8]
     kmovw           k3, r13d
diff --git a/third_party/dav1d/src/x86/ipred.h b/third_party/dav1d/src/x86/ipred.h
index 29e1d9605264..e290c8710ca3 100644
--- a/third_party/dav1d/src/x86/ipred.h
+++ b/third_party/dav1d/src/x86/ipred.h
@@ -137,6 +137,8 @@ static ALWAYS_INLINE void intra_pred_dsp_init_x86(Dav1dIntraPredDSPContext *cons
     init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, avx512icl);
     init_angular_ipred_fn(HOR_PRED,     ipred_h,       avx512icl);
     init_angular_ipred_fn(VERT_PRED,    ipred_v,       avx512icl);
+    init_angular_ipred_fn(Z1_PRED,      ipred_z1,      avx512icl);
+    init_angular_ipred_fn(Z3_PRED,      ipred_z3,      avx512icl);
 #endif
     init_angular_ipred_fn(PAETH_PRED,   ipred_paeth,   avx512icl);
     init_angular_ipred_fn(SMOOTH_PRED,  ipred_smooth,  avx512icl);
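The ipred.h hunk wires up the new AVX-512 Z1/Z3 directional predictors that the large ipred_avx512.asm diff below implements. As a reading aid, here is a hedged scalar model of the core z1 ("predict from the top edge") math those kernels vectorize; it is not dav1d's C reference, and edge filtering/upsampling are omitted. The 6-bit fixed-point step dx comes from the dr_intra_derivative table the asm loads, and the even frac values match the (64-frac, frac) pairs in z_frac_table:

```c
/* Sketch, not dav1d's implementation: scalar z1 directional prediction.
 * top points one past the top-left sample; dx is a 6.6 fixed-point step. */
#include <stdint.h>

void ipred_z1_scalar(uint8_t *dst, const int stride,
                     const uint8_t *const top,
                     const int w, const int h,
                     const int dx, const int max_base_x)
{
    for (int y = 0; y < h; y++, dst += stride) {
        const int xpos = (y + 1) * dx; /* 6-bit fractional position */
        for (int x = 0; x < w; x++) {
            const int base = (xpos >> 6) + x;
            if (base >= max_base_x) {  /* ran off the edge: replicate */
                dst[x] = top[max_base_x];
                continue;
            }
            const int frac = xpos & 0x3e; /* even, as in z_frac_table */
            /* two-tap linear interpolation with rounding */
            dst[x] = (top[base] * (64 - frac) + top[base + 1] * frac + 32) >> 6;
        }
    }
}
```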
diff --git a/third_party/dav1d/src/x86/ipred_avx512.asm b/third_party/dav1d/src/x86/ipred_avx512.asm
index 4aeb14e74220..7666e62be617 100644
--- a/third_party/dav1d/src/x86/ipred_avx512.asm
+++ b/third_party/dav1d/src/x86/ipred_avx512.asm
@@ -97,16 +97,88 @@ ipred_h_shuf: db 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4
               db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0
 pal_unpack: db 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
 pal_perm: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+pb_63to0: db 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48
+          db 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32
+          db 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
+          db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+z_frac_table: db 64, 0, 62, 2, 60, 4, 58, 6, 56, 8, 54, 10, 52, 12, 50, 14
+              db 48, 16, 46, 18, 44, 20, 42, 22, 40, 24, 38, 26, 36, 28, 34, 30
+              db 32, 32, 30, 34, 28, 36, 26, 38, 24, 40, 22, 42, 20, 44, 18, 46
+              db 16, 48, 14, 50, 12, 52, 10, 54, 8, 56, 6, 58, 4, 60, 2, 62
+z_filter_s1: db -1, -1, -1, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6
+             db 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22
+             db 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38
+             db 46, 47, 47, 48, 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54
+z_filter_s5: db 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15, 17, 16
+             db 26, 25, 27, 26, 28, 27, 29, 28, 30, 29, 31, 30, 32, 31, 33, 32
+             db 42, 41, 43, 42, 44, 43, 45, 44, 46, 45, 47, 46, 48, 47, 49, 48
+             db 58, 57, 59, 58, 60, 59, 61, 60, 62, 61, 63, 62, 64, 63, 65, 64
+z_filter_s3: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+z_filter_s2: db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+z_filter_s4: db 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8
+z_xpos_bc: db 17, 17, 17, 17, 33, 33, 33, 33, 9, 9, 9, 9, 9, 9, 9, 9
+z_filter4_s1: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7
+              db 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8
+z_xpos_off1a: db 64, 65, 65, 66, 66, 67, 67, 68, 68, 69, 69, 70, 70, 71, 71, 72
+z_xpos_off1b: db 72, 73, 73, 74, 74, 75, 75, 76, 76, 77, 77, 78, 78, 79, 79, 80
+z_xpos_off2a: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+              db 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24
+              db 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40
+              db 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54, 54, 55, 55, 56
+z_xpos_off2b: db 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16
+              db 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32
+              db 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48
+              db 56, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63, 64
+z_xpos_mul: dw 4, 4, 4, 4, 8, 8, 4, 4, 12, 12, 8, 8, 16, 16, 8, 8
+            dw 20, 20, 12, 12, 24, 24, 12, 12, 28, 28, 16, 16, 32, 32, 16, 16
+z_ypos_off1: db 64, 65, 64, 65, 64, 65, 64, 65, 65, 66, 65, 66, 66, 67, 66, 67
+             db 66, 67, 66, 67, 68, 69, 68, 69, 67, 68, 67, 68, 70, 71, 70, 71
+             db 68, 69, 68, 69, 72, 73, 72, 73, 69, 70, 69, 70, 74, 75, 74, 75
+             db 70, 71, 70, 71, 76, 77, 76, 77, 71, 72, 71, 72, 78, 79, 78, 79
+z_ypos_off2: db 64, 65, 64, 65, 0, 0, 0, 0, 64, 65, 64, 65, 0, 0, 0, 0
+             db 65, 66, 65, 66, 1, 1, 1, 1, 65, 66, 65, 66, 1, 1, 1, 1
+             db 66, 67, 66, 67, 2, 2, 2, 2, 66, 67, 66, 67, 2, 2, 2, 2
+             db 67, 68, 67, 68, 3, 3, 3, 3, 67, 68, 67, 68, 3, 3, 3, 3
+z_ypos_mul1: dw 1*512, 2*512, 3*512, 4*512, 5*512, 6*512, 7*512, 8*512
+             dw 17*512, 18*512, 19*512, 20*512, 21*512, 22*512, 23*512, 24*512
+             dw 33*512, 34*512, 35*512, 36*512, 37*512, 38*512, 39*512, 40*512
+             dw 49*512, 50*512, 51*512, 52*512, 53*512, 54*512, 55*512, 56*512
+z_ypos_mul2: dw 9*512, 10*512, 11*512, 12*512, 13*512, 14*512, 15*512, 16*512
+             dw 25*512, 26*512, 27*512, 28*512, 29*512, 30*512, 31*512, 32*512
+             dw 41*512, 42*512, 43*512, 44*512, 45*512, 46*512, 47*512, 48*512
+             dw 57*512, 58*512, 59*512, 60*512, 61*512, 62*512, 63*512, 64*512
+z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0
+z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0
+z3_upsample: db 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
+             db 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8
+z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
+             db 39, 39, 47, 47, 47, 79, 79, 79
+z_filter_k: db 0, 16, 0, 16, 0, 20, 0, 20, 8, 16, 8, 16
+            db 32, 0, 32, 0, 24, 0, 24, 0, 16, 0, 16, 0
+            db 0, 32, 0, 32, 0, 24, 0, 24, 0, 16, 0, 16
+pb_8_56_0_0: db 8, 56, 0, 0
+pb_m4_36: times 2 db -4, 36
 pb_127_m127: times 2 db 127, -127
+pb_8: times 4 db 8
+pb_15: times 4 db 15
+pb_16: times 4 db 16
+pb_31: times 4 db 31
+pb_63: times 4 db 63
 pb_128: times 4 db 128
 pw_128: times 2 dw 128
 pw_255: times 2 dw 255
+pw_512: times 2 dw 512
 
-%define pb_1 (ipred_h_shuf+24)
-%define pb_2 (ipred_h_shuf+20)
-%define pb_3 (ipred_h_shuf+16)
-%define pd_8 (filter_taps+128)
+%define pb_1  (ipred_h_shuf+24)
+%define pb_2  (ipred_h_shuf+20)
+%define pb_3  (ipred_h_shuf+16)
+%define pb_4  (smooth_shuf +48)
+%define pb_7  (ipred_h_shuf+ 0)
+%define pb_9  (z_xpos_bc + 8)
+%define pb_17 (z_xpos_bc + 0)
+%define pb_33 (z_xpos_bc + 4)
+%define pd_8  (filter_taps+128)
 
 %macro JMP_TABLE 3-*
     %xdefine %1_%2_table (%%table - 2*4)
@@ -125,10 +197,15 @@ JMP_TABLE ipred_paeth_8bpc, avx512icl, w4, w8, w16, w32, w64
 JMP_TABLE ipred_smooth_8bpc, avx512icl, w4, w8, w16, w32, w64
 JMP_TABLE ipred_smooth_v_8bpc, avx512icl, w4, w8, w16, w32, w64
 JMP_TABLE ipred_smooth_h_8bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z1_8bpc, avx512icl, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z3_8bpc, avx512icl, w4, w8, w16, w32, w64
 JMP_TABLE ipred_dc_8bpc, avx512icl, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
     s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
 JMP_TABLE ipred_dc_left_8bpc, avx512icl, h4, h8, h16, h32, h64
 
+cextern dr_intra_derivative
+cextern pb_0to63
+
 SECTION .text
 
 INIT_ZMM avx512icl
@@ -1200,6 +1277,896 @@ cglobal pal_pred_8bpc, 4, 7, 6, dst, stride, pal, idx, w, h, stride3
     jg .w64
     RET
 
+%if WIN64
+    DECLARE_REG_TMP 4
+%else
+    DECLARE_REG_TMP 8
+%endif
+
+cglobal ipred_z1_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx
+%define base r7-z_filter_t0
+    lea             r7, [z_filter_t0]
+    tzcnt           wd, wm
+    movifnidn       angled, anglem
+    lea             t0, [dr_intra_derivative]
+    movsxd          wq, [base+ipred_z1_8bpc_avx512icl_table+wq*4]
+    inc             tlq
+    mov             dxd, angled
+    and             dxd, 0x7e
+    add             angled, 165 ; ~90
+    movzx           dxd, word [t0+dxq]
+    lea             wq, [base+ipred_z1_8bpc_avx512icl_table+wq]
+    movifnidn       hd, hm
+    xor             angled, 0x4ff ; d = 90 - angle
+    mova            m14, [base+z_frac_table]
+    vpbroadcastd    m15, [base+pw_512]
+    jmp             wq
+.w4:
+    mova            m9, [pb_0to63]
+    pminud          m8, m9, [base+pb_7] {1to16}
+    vpbroadcastq    m7, [tlq]
+    pshufb          m7, m8
+    cmp             angleb, 40
+    jae             .w4_no_upsample
+    lea             r3d, [angleq-1024]
+    sar             r3d, 7
+    add             r3d, hd
+    jg              .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
+    pshufb          xmm0, xm7, [base+z_filter_s4]
+    mova            xmm1, [tlq-1]
+    pshufb          xmm1, [base+z_xpos_off2a]
+    vpbroadcastd    xmm2, [base+pb_m4_36]
+    vpbroadcastq    m4, [pb_0to63]
+    pmaddubsw       xmm0, xmm2
+    pmaddubsw       xmm1, xmm2
+    add             dxd, dxd
+    kxnorw          k1, k1, k1
+    paddw           xmm0, xmm1
+    pmulhrsw        xm0, xmm0, xm15
+    packuswb        xm0, xm0
+    punpcklbw       ym7{k1}, ym0
+    jmp             .w4_main2
+.w4_no_upsample:
+    test            angled, 0x400
+    jnz             .w4_main ; !enable_intra_edge_filter
+    lea             r3d, [hq+3]
+    vpbroadcastb    xm0, r3d
+    vpbroadcastb    xm1, angled
+    shr             angled, 8 ; is_sm << 1
+    vpcmpeqb        k1, xm0, [base+z_filter_wh]
+    vpcmpgtb        k1{k1}, xm1, [base+z_filter_t0+angleq*8]
+    kmovw           r5d, k1
+    test            r5d, r5d
+    jz              .w4_main
+    vbroadcasti32x4 ym0, [tlq-1]
+    pshufb          ym0, [base+z_filter4_s1]
+    popcnt          r5d, r5d ; filter_strength
+    pshufb          ym1, ym7, [z_filter_s4]
+    pshufb          ym7, [base+z_filter_s3]
+    vpbroadcastd    ym11, [base+z_filter_k+(r5-1)*4+12*0]
+    vpbroadcastd    ym12, [base+z_filter_k+(r5-1)*4+12*1]
+    pmaddubsw       ym0, ym11
+    pmaddubsw       ym1, ym11
+    pmaddubsw       ym7, ym12
+    paddw           ym0, ym1
+    paddw           ym7, ym0
+    pmulhrsw        ym7, ym15
+    cmp             hd, 4
+    je              .w4_filter_end
+    vpbroadcastd    m8, [base+pb_9]
+    pminub          m8, m9
+.w4_filter_end:
+    paddb           m8, m8
+    vpermb          m7, m8, m7
+.w4_main:
+    vpbroadcastq    m4, [base+z_xpos_off1a]
+.w4_main2:
+    movsldup        m2, [base+z_xpos_mul]
+    vpbroadcastw    m5, dxd
+    vbroadcasti32x4 m3, [base+z_xpos_bc]
+    lea             r2, [strideq*3]
+    pmullw          m2, m5 ; xpos
+    psllw           m5, 5  ; dx*8
+.w4_loop:
+    psrlw           m1, m2, 3
+    pshufb          m0, m2, m3
+    vpermw          m1, m1, m14 ; 64-frac, frac
+    paddsb          m0, m4      ; base, base+1
+    vpermb          m0, m0, m7  ; top[base], top[base+1]
+    paddsw          m2, m5      ; xpos += dx
+    pmaddubsw       m0, m1      ; v
+    pmulhrsw        m0, m15
+    packuswb        m0, m0
+    vextracti32x4   xm1, ym0, 1
+    movd            [dstq+strideq*0], xm0
+    pextrd          [dstq+strideq*1], xm0, 1
+    movd            [dstq+strideq*2], xm1
+    pextrd          [dstq+r2       ], xm1, 1
+    sub             hd, 8
+    jl              .w4_end
+    vextracti32x4   xm1, m0, 2 ; top[max_base_x]
+    lea             dstq, [dstq+strideq*4]
+    vextracti32x4   xm0, m0, 3
+    movd            [dstq+strideq*0], xm1
+    pextrd          [dstq+strideq*1], xm1, 1
+    movd            [dstq+strideq*2], xm0
+    pextrd          [dstq+r2       ], xm0, 1
+    lea             dstq, [dstq+strideq*4]
+    jg              .w4_loop
+.w4_end:
+    RET
+.w8_filter:
+    mova            ym0, [base+z_filter_s1]
+    popcnt          r5d, r5d
+    vbroadcasti32x4 ym1, [base+z_filter_s2]
+    vbroadcasti32x4 ym3, [base+z_filter_s3]
+    vbroadcasti32x4 ym4, [base+z_filter_s4]
+    vpermi2b        ym0, ym7, ym2 ; al bl
+    mova            ym5, [base+z_filter_s5]
+    pshufb          ym1, ym7, ym1 ; ah bh
+    vpbroadcastd    ym11, [base+z_filter_k+(r5-1)*4+12*0]
+    pshufb          ym3, ym7, ym3 ; cl ch
+    vpbroadcastd    ym12, [base+z_filter_k+(r5-1)*4+12*1]
+    pshufb          ym4, ym7, ym4 ; el dl
+    vpbroadcastd    ym13, [base+z_filter_k+(r5-1)*4+12*2]
+    vpermb          ym5, ym5, ym7 ; eh dh
+    pmaddubsw       ym0, ym11
+    pmaddubsw       ym1, ym11
+    pmaddubsw       ym2, ym3, ym12
+    pmaddubsw       ym3, ym13
+    pmaddubsw       ym4, ym11
+    pmaddubsw       ym5, ym11
+    paddw           ym0, ym2
+    paddw           ym1, ym3
+    paddw           ym0, ym4
+    paddw           ym1, ym5
+    pmulhrsw        ym0, ym15
+    pmulhrsw        ym1, ym15
+    packuswb        ym0, ym1
+    ret
+.w8:
+    lea             r3d, [angleq+216]
+    mov             r3b, hb
+    cmp             r3d, 8
+    ja              .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
+    lea             r3d, [hq-1]
+    mova            xm1, [base+z_filter_s4]
+    vpbroadcastb    xm2, r3d
+    mova            xm7, [tlq-1]
+    vinserti32x4    ym7, [tlq+7], 1
+    vbroadcasti32x4 ym0, [base+z_xpos_off1a]
+    vpbroadcastd    ym3, [base+pb_m4_36]
+    pminub          xm2, xm1
+    pshufb          ym0, ym7, ym0
+    vinserti32x4    ym1, xm2, 1
+    psrldq          ym7, 1
+    pshufb          ym1, ym7, ym1
+    pmaddubsw       ym0, ym3
+    pmaddubsw       ym1, ym3
+    vbroadcasti32x4 m8, [pb_0to63]
+    add             dxd, dxd
+    paddw           ym0, ym1
+    pmulhrsw        ym0, ym15
+    packuswb        ym0, ym0
+    punpcklbw       ym7, ym0
+    jmp             .w8_main2
+.w8_no_upsample:
+    lea             r3d, [hq+7]
+    mova            m9, [pb_0to63]
+    vpbroadcastb    ym0, r3d
+    and             r3d, 7
+    vbroadcasti32x4 m7, [tlq]
+    or              r3d, 8 ; imin(h+7, 15)
+    vpbroadcastb    m8, r3d
+    pminub          m8, m9
+    pshufb          m7, m8
+    test            angled, 0x400
+    jnz             .w8_main
+    vpbroadcastb    ym1, angled
+    shr             angled, 8
+    vpcmpeqb        k1, ym0, [base+z_filter_wh]
+    mova            xm0, [base+z_filter_t0+angleq*8]
+    vpcmpgtb        k1{k1}, ym1, ym0
+    kmovd           r5d, k1
+    test            r5d, r5d
+    jz              .w8_main
+    vpbroadcastd    ym2, [tlq-4]
+    call            .w8_filter
+    cmp             hd, 8
+    jle             .w8_filter_end
+    vpbroadcastd    m8, [base+pb_17]
+    add             r3d, 2
+    pminub          m8, m9
+.w8_filter_end:
+    vpermb          m7, m8, m0
+.w8_main:
+    vbroadcasti32x4 m8, [base+z_xpos_off1a]
+.w8_main2:
+    movsldup        m4, [base+z_xpos_mul]
+    vpbroadcastw    m9, dxd
+    shl             r3d, 6
+    vpbroadcastd    m5, [base+z_xpos_bc+8*0]
+    pmullw          m4, m9 ; xpos
+    vpbroadcastd    m6, [base+z_xpos_bc+8*1]
+    sub             r3d, dxd
+    shl             dxd, 3
+    psllw           m9, 5 ; dx*8
+    lea             r2, [strideq*3]
+.w8_loop:
+    psrlw           m3, m4, 3
+    pshufb          m0, m4, m5
+    pshufb          m1, m4, m6
+    vpermw          m3, m3, m14
+    paddsb          m0, m8
+    paddsb          m1, m8
+    vpermb          m0, m0, m7
+    vpermb          m1, m1, m7
+    paddsw          m4, m9
+    punpcklqdq      m2, m3, m3
+    pmaddubsw       m0, m2
+    punpckhqdq      m3, m3
+    pmaddubsw       m1, m3
+    pmulhrsw        m0, m15
+    pmulhrsw        m1, m15
+    packuswb        m0, m1
+    vextracti32x4   xm1, ym0, 1
+    movq            [dstq+strideq*0], xm0
+    movhps          [dstq+strideq*1], xm0
+    movq            [dstq+strideq*2], xm1
+    movhps          [dstq+r2       ], xm1
+    sub             hd, 8
+    jl              .w8_end
+    vextracti32x8   ym0, m0, 1
+    lea             dstq, [dstq+strideq*4]
+    vextracti32x4   xm1, ym0, 1
+    movq            [dstq+strideq*0], xm0
+    movhps          [dstq+strideq*1], xm0
+    movq            [dstq+strideq*2], xm1
+    movhps          [dstq+r2       ], xm1
+    jz              .w8_end
+    lea             dstq, [dstq+strideq*4]
+    sub             r3d, dxd
+    jg              .w8_loop
+    vextracti32x4   xm7, m7, 3
+.w8_end_loop:
+    movq            [dstq+strideq*0], xm7
+    movq            [dstq+strideq*1], xm7
+    movq            [dstq+strideq*2], xm7
+    movq            [dstq+r2       ], xm7
+    lea             dstq, [dstq+strideq*4]
+    sub             hd, 4
+    jg              .w8_end_loop
+.w8_end:
+    RET
+.w16_filter:
+    mova            m0, [base+z_filter_s1]
+    popcnt          r5d, r5d
+    vbroadcasti32x4 m1, [base+z_filter_s2]
+    vbroadcasti32x4 m3, [base+z_filter_s3]
+    vbroadcasti32x4 m4, [base+z_filter_s4]
+    vpermi2b        m0, m7, m2 ; al bl
+    mova            m5, [base+z_filter_s5]
+    pshufb          m1, m7, m1 ; ah bh
+    vpbroadcastd    m11, [base+z_filter_k+(r5-1)*4+12*0]
+    pshufb          m3, m7, m3 ; cl ch
+    vpbroadcastd    m12, [base+z_filter_k+(r5-1)*4+12*1]
+    pshufb          m4, m7, m4 ; el dl
+    vpbroadcastd    m13, [base+z_filter_k+(r5-1)*4+12*2]
+    vpermb          m5, m5, m7 ; eh dh
+    pmaddubsw       m0, m11
+    pmaddubsw       m1, m11
+    pmaddubsw       m2, m3, m12
+    pmaddubsw       m3, m13
+    pmaddubsw       m4, m11
+    pmaddubsw       m5, m11
+    paddw           m0, m2
+    paddw           m1, m3
+    paddw           m0, m4
+    paddw           m1, m5
+    pmulhrsw        m0, m15
+    pmulhrsw        m1, m15
+    packuswb        m0, m1
+    ret
+.w16:
+    lea             r3d, [hq+15]
+    mova            m9, [pb_0to63]
+    vpbroadcastb    ym0, r3d
+    and             r3d, 15
+    movu            ym7, [tlq]
+    or              r3d, 16 ; imin(h+15, 31)
+    vpbroadcastb    m8, r3d
+    pminub          m8, m9
+    vpermb          m7, m8, m7
+    test            angled, 0x400
+    jnz             .w16_main
+    vpbroadcastb    ym1, angled
+    shr             angled, 8
+    vpcmpeqb        k1, ym0, [base+z_filter_wh]
+    mova            xm0, [base+z_filter_t0+angleq*8]
+    vpcmpgtb        k1{k1}, ym1, ym0
+    kmovd           r5d, k1
+    test            r5d, r5d
+    jz              .w16_main
+    vpbroadcastd    m2, [tlq-4]
+    call            .w16_filter
+    cmp             hd, 16
+    jle             .w16_filter_end
+    vpbroadcastd    m8, [base+pb_33]
+    add             r3d, 2
+    pminub          m8, m9
+.w16_filter_end:
+    vpermb          m7, m8, m0
+.w16_main:
+    movshdup        m3, [base+z_xpos_mul]
+    vpbroadcastw    m8, dxd
+    shl             r3d, 6
+    vpbroadcastd    m4, [base+z_xpos_bc]
+    pmullw          m3, m8 ; xpos
+    vbroadcasti32x4 m5, [base+z_xpos_off1a]
+    sub             r3d, dxd
+    shl             dxd, 2
+    vbroadcasti32x4 m6, [base+z_xpos_off1b]
+    psllw           m8, 4 ; dx*4
+    lea             r2, [strideq*3]
+.w16_loop:
+    pshufb          m1, m3, m4
+    psrlw           m2, m3, 3
+    paddsb          m0, m1, m5
+    vpermw          m2, m2, m14
+    paddsb          m1, m6
+    vpermb          m0, m0, m7
+    vpermb          m1, m1, m7
+    paddsw          m3, m8
+    pmaddubsw       m0, m2
+    pmaddubsw       m1, m2
+    pmulhrsw        m0, m15
+    pmulhrsw        m1, m15
+    packuswb        m0, m1
+    mova            [dstq+strideq*0], xm0
+    vextracti32x4   [dstq+strideq*1], ym0, 1
+    vextracti32x4   [dstq+strideq*2], m0, 2
+    vextracti32x4   [dstq+r2       ], m0, 3
+    sub             hd, 4
+    jz              .w16_end
+    lea             dstq, [dstq+strideq*4]
+    sub             r3d, dxd
+    jg              .w16_loop
+    vextracti32x4   xm7, m7, 3
+.w16_end_loop:
+    mova            [dstq+strideq*0], xm7
+    mova            [dstq+strideq*1], xm7
+    mova            [dstq+strideq*2], xm7
+    mova            [dstq+r2       ], xm7
+    lea             dstq, [dstq+strideq*4]
+    sub             hd, 4
+    jg              .w16_end_loop
+.w16_end:
+    RET
+.w32_filter:
+    mova            m0, [base+z_filter_s1]
+    vbroadcasti32x4 m1, [base+z_filter_s2]
+    vbroadcasti32x4 m3, [base+z_filter_s3]
+    vbroadcasti32x4 m4, [base+z_filter_s4]
+    vpermi2b        m0, m7, m2 ; al bl
+    mova            m5, [base+z_filter_s5]
+    pshufb          m1, m7, m1 ; ah bh
+    vpbroadcastd    m11, [base+z_filter_k+4*2+12*0]
+    pshufb          m3, m7, m3 ; cl ch
+    vpbroadcastd    m12, [base+z_filter_k+4*2+12*1]
+    pshufb          m4, m7, m4 ; el dl
+    vpbroadcastd    m13, [base+z_filter_k+4*2+12*2]
+    vpermi2b        m5, m7, m8 ; eh dh
+    pmaddubsw       m0, m11
+    pmaddubsw       m1, m11
+    pmaddubsw       m2, m3, m12
+    pmaddubsw       m3, m13
+    pmaddubsw       m4, m11
+    pmaddubsw       m5, m11
+    paddw           m0, m2
+    paddw           m1, m3
+    paddw           m0, m4
+    paddw           m1, m5
+    pmulhrsw        m0, m15
+    pmulhrsw        m1, m15
+    packuswb        m7, m0, m1
+    ret
+.w32:
+    lea             r3d, [hq+31]
+    vpbroadcastb    m9, r3d
+    and             r3d, 31
+    pminub          m10, m9, [pb_0to63]
+    or              r3d, 32 ; imin(h+31, 63)
+    vpermb          m7, m10, [tlq]
+    vpbroadcastb    m8, [tlq+r3]
+    test            angled, 0x400 ; !enable_intra_edge_filter
+    jnz             .w32_main
+    vpbroadcastd    m2, [tlq-4]
+    call            .w32_filter
+    cmp             hd, 64
+    je              .w32_h64_filter_end
+    vpermb          m8, m9, m7
+    vpermb          m7, m10, m7
+    jmp             .w32_main
+.w32_h64_filter_end: ; edge case for 32x64
+    movd            xmm0, [tlq+r3-1]
+    movd            xmm1, [base+pb_8_56_0_0]
+    add             r3d, 2
+    pmaddubsw       xmm0, xmm1
+    vptestmw        k1, xmm1, xmm1 ; 0x01
+    pmulhrsw        xm0, xmm0, xm15
+    vmovdqu8        m8{k1}, m0
+.w32_main:
+    rorx            r2d, dxd, 30
+    vpbroadcastd    m4, [base+z_xpos_bc]
+    vpbroadcastw    m3, r2d
+    vbroadcasti32x8 m5, [base+z_xpos_off2a]
+    shl             r3d, 6
+    vbroadcasti32x8 m6, [base+z_xpos_off2b]
+    sub             r3d, dxd
+    paddw           m9, m3, m3
+    add             dxd, dxd
+    vinserti32x8    m3, ym9, 1
+.w32_loop:
+    pshufb          m1, m3, m4
+    psrlw           m2, m3, 3
+    paddsb          m0, m1, m5
+    vpermw          m2, m2, m14
+    paddsb          m1, m6
+    vpermi2b        m0, m7, m8
+    vpermi2b        m1, m7, m8
+    paddsw          m3, m9
+    pmaddubsw       m0, m2
+    pmaddubsw       m1, m2
+    pmulhrsw        m0, m15
+    pmulhrsw        m1, m15
+    packuswb        m0, m1
+    mova            [dstq+strideq*0], ym0
+    vextracti32x8   [dstq+strideq*1], m0, 1
+    sub             hd, 2
+    jz              .w32_end
+    lea             dstq, [dstq+strideq*2]
+    sub             r3d, dxd
+    jg              .w32_loop
+    punpckhqdq      ym8, ym8
+.w32_end_loop:
+    mova            [dstq+strideq*0], ym8
+    mova            [dstq+strideq*1], ym8
+    lea             dstq, [dstq+strideq*2]
+    sub             hd, 2
+    jg              .w32_end_loop
+.w32_end:
+    RET
+.w64_filter:
+    vbroadcasti32x4 m3, [base+z_filter_s2]
+    mova            m1, [base+z_filter_s1]
+    pshufb          m0, m3 ; al bl
+    vpermi2b        m1, m7, m2
+    vbroadcasti32x4 m4, [base+z_filter_s4]
+    pshufb          m6, m8, m4 ; el dl
+    pshufb          m9, m7, m4
+    pminub          m10, m13, [base+z_filter_s5]
+    pshufb          m2, m8, m3 ; ah bh
+    pshufb          m3, m7, m3
+    vbroadcasti32x4 m5, [base+z_filter_s3]
+    vpermb          m10, m10, m8 ; eh dh
+    pshufb          m11, m4
+    vpbroadcastd    m4, [base+z_filter_k+4*2+12*0]
+    pshufb          m8, m5 ; cl ch
+    pshufb          m7, m5
+    vpbroadcastd    m5, [base+z_filter_k+4*2+12*1]
+    REPX            {pmaddubsw x, m4}, m0, m1, m6, m9, m2, m3, m10, m11
+    pmaddubsw       m4, m8, m5
+    pmaddubsw       m5, m7, m5
+    paddw           m0, m6
+    vpbroadcastd    m6, [base+z_filter_k+4*2+12*2]
+    paddw           m1, m9
+    pmaddubsw       m7, m6
+    pmaddubsw       m8, m6
+    paddw           m2, m10
+    paddw           m3, m11
+    paddw           m0, m4
+    paddw           m1, m5
+    paddw           m2, m8
+    paddw           m3, m7
+    REPX            {pmulhrsw x, m15}, m0, m2, m1, m3
+    packuswb        m0, m2
+    packuswb        m7, m1, m3
+    vpermb          m8, m12, m0
+    ret
+.w64:
+    lea             r3d, [hq-1]
+    movu            m7, [tlq+64*0]
+    vpbroadcastb    m13, r3d
+    pminub          m12, m13, [pb_0to63]
+    or              r3d, 64
+    vpermb          m8, m12, [tlq+64*1]
+    test            angled, 0x400 ; !enable_intra_edge_filter
+    jnz             .w64_main
+    movu            m0, [tlq+56]
+    vpbroadcastd    m2, [tlq-4]
+    movu            m11, [tlq+8]
+    call            .w64_filter
+.w64_main:
+    rorx            r2d, dxd, 30
+    vpbroadcastd    m4, [base+z_xpos_bc]
+    vpbroadcastw    m3, r2d
+    mova            m5, [base+z_xpos_off2a]
+    shl             r3d, 6
+    mova            m6, [base+z_xpos_off2b]
+    sub             r3d, dxd
+    mova            m9, m3
+.w64_loop:
+    pshufb          m1, m3, m4
+    psrlw           m2, m3, 3
+    paddsb          m0, m1, m5
+    vpermw          m2, m2, m14
+    paddsb          m1, m6
+    vpermi2b        m0, m7, m8
+    vpermi2b        m1, m7, m8
+    paddsw          m3, m9
+    pmaddubsw       m0, m2
+    pmaddubsw       m1, m2
+    pmulhrsw        m0, m15
+    pmulhrsw        m1, m15
+    packuswb        m0, m1
+    mova            [dstq], m0
+    dec             hd
+    jz              .w64_end
+    add             dstq, strideq
+    sub             r3d, dxd
+    jg              .w64_loop
+    vpermb          m8, m13, m8
+.w64_end_loop:
+    mova            [dstq], m8
+    add             dstq, strideq
+    dec             hd
+    jg              .w64_end_loop
+.w64_end:
+    RET
+
+cglobal ipred_z3_8bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy
+    lea             r7, [z_filter_t0]
+    tzcnt           wd, wm
+    movifnidn       angled, anglem
+    lea             t0, [dr_intra_derivative+45*2-1]
+    movsxd          wq, [base+ipred_z3_8bpc_avx512icl_table+wq*4]
+    sub             angled, 180
+    mov             dyd, angled
+    neg             dyd
+    xor             angled, 0x400
+    or              dyq, ~0x7e
+    mova            m0, [base+pb_63to0]
+    movzx           dyd, word [t0+dyq]
+    lea             wq, [base+ipred_z3_8bpc_avx512icl_table+wq]
+    movifnidn       hd, hm
+    mova            m14, [base+z_frac_table]
+    shl             dyd, 6
+    vpbroadcastd    m15, [base+pw_512]
+    jmp             wq
+.w4:
+    cmp             angleb, 40
+    jae             .w4_no_upsample
+    lea             r3d, [angleq-1024]
+    sar             r3d, 7
+    add             r3d, hd
+    jg              .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
+    lea             r3d, [hq+4]
+    call            .upsample
+    movshdup        m1, [base+z_ypos_off1]
+    vpbroadcastd    m6, [base+pb_16]
+    jmp             .w4_main2
+.w4_no_upsample:
+    lea             r3d, [hq+3]
+    vpbroadcastb    m9, r3d
+    vpxord          m1, m9, [base+pb_63] {1to16} ; 63 - (h + 4)
+    pmaxub          m1, m0
+    vpermb          m7, m1, [tlq-64*1]
+    test            angled, 0x400 ; !enable_intra_edge_filter
+    jnz             .w4_main
+    vpbroadcastb    xm1, angled
+    shr             angled, 8
+    vpcmpeqb        k1, xm9, [base+z_filter_wh]
+    vpbroadcastd    m2, [tlq-3]
+    vpcmpgtb        k1{k1}, xm1, [base+z_filter_t0+angleq*8]
+    kmovw           r5d, k1
+    test            r5d, r5d
+    jz              .w4_main
+    pminub          m9, [pb_0to63]
+    call            mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w8_filter
+    vpermb          m7, m9, m0
+.w4_main:
+    movsldup        m1, [base+z_ypos_off1]
+    vpbroadcastd    m6, [base+pb_8]
+.w4_main2:
+    vpbroadcastw    m0, dyd
+    vpbroadcastq    m2, [base+z_ypos_mul1] ; 1..4
+    pmulhuw         m2, m0 ; ypos >> 1
+    lea             r2, [strideq*3]
+    vpermw          m3, m2, m14 ; 64-frac, frac
+    psrlw           m2, 5
+    packsswb        m2, m2
+    punpcklbw       m2, m2
+    paddsb          m2, m1 ; base, base+1
+.w4_loop:
+    vpermb          m0, m2, m7
+    pmaddubsw       m0, m3
+    paddsb          m2, m6
+    pmulhrsw        m0, m15
+    vpmovwb         ym0, m0
+    movd            [dstq+strideq*0], xm0
+    pextrd          [dstq+strideq*1], xm0, 1
+    pextrd          [dstq+strideq*2], xm0, 2
+    pextrd          [dstq+r2       ], xm0, 3
+    sub             hd, 8
+    jl              .w4_end
+    vextracti32x4   xm0, ym0, 1
+    lea             dstq, [dstq+strideq*4]
+    movd            [dstq+strideq*0], xm0
+    pextrd          [dstq+strideq*1], xm0, 1
+    pextrd          [dstq+strideq*2], xm0, 2
+    pextrd          [dstq+r2       ], xm0, 3
+    lea             dstq, [dstq+strideq*4]
+    jg              .w4_loop
+.w4_end:
+    RET
+.upsample:
+    xor             r3d, 31 ; 31 - (h + imin(w, h))
+    vbroadcasti32x4 ym0, [base+z_xpos_off2a]
+    vpbroadcastb    ym7, r3d
+    pmaxub          ym7, [base+z3_upsample]
+    vbroadcasti32x4 ym1, [base+z_filter_s4]
+    vpermb          ym7, ym7, [tlq-31]
+    vpbroadcastd    ym2, [base+pb_m4_36]
+    pshufb          ym0, ym7, ym0
+    psrldq          ym7, 1
+    pshufb          ym1, ym7, ym1
+    pmaddubsw       ym0, ym2
+    pmaddubsw       ym1, ym2
+    add             dyd, dyd
+    paddw           ym0, ym1
+    pmulhrsw        ym0, ym15
+    packuswb        ym0, ym0
+    punpcklbw       ym7, ym0
+    ret
+.w8:
+    lea             r3d, [angleq+216]
+    mov             r3b, hb
+    cmp             r3d, 8
+    ja              .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
+    lea             r3d, [hq*2]
+    call            .upsample
+    pshufd          m1, [base+z_ypos_off1], q0000
+    vpbroadcastd    m6, [base+pb_8]
+    jmp             .w8_main2
+.w8_no_upsample:
+    mov             r3d, 8
+    cmp             hd, 4
+    cmove           r3d, hd
+    lea             r3d, [r3+hq-1]
+    xor             r3d, 63 ; 63 - (h + imin(w, h))
+    vpbroadcastb    m1, r3d
+    pmaxub          m1, m0
+    vpermb          m7, m1, [tlq-64*1]
+    test            angled, 0x400 ; !enable_intra_edge_filter
+    jnz             .w8_main
+    lea             r3d, [hq+7]
+    call            .filter_strength
+    test            r5d, r5d
+    jz              .w8_main
+    call            mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w16_filter
+    vpermb          m7, m10, m0
+.w8_main:
+    movsldup        m1, [base+z_ypos_off2]
+    vpbroadcastd    m6, [base+pb_4]
+.w8_main2:
+    vpbroadcastw    m0, dyd
+    vbroadcasti32x4 m2, [base+z_ypos_mul1] ; 1..8
+    pmulhuw         m2, m0 ; ypos >> 1
+    lea             r2, [strideq*3]
+    vpermw          m3, m2, m14 ; 64-frac, frac
+    psrlw           m2, 5
+    packsswb        m2, m2
+    punpcklbw       m2, m2
+    paddsb          m2, m1 ; base, base+1
+.w8_loop:
+    vpermb          m0, m2, m7
+    pmaddubsw       m0, m3
+    paddsb          m2, m6
+    pmulhrsw        m0, m15
+    vpmovwb         ym0, m0
+    vextracti32x4   xm1, ym0, 1
+    movq            [dstq+strideq*0], xm0
+    movhps          [dstq+strideq*1], xm0
+    movq            [dstq+strideq*2], xm1
+    movhps          [dstq+r2       ], xm1
+    lea             dstq, [dstq+strideq*4]
+    sub             hd, 4
+    jg              .w8_loop
+    RET
+.filter_strength:
+    vpbroadcastd    m2, [tlq-3]
+.filter_strength2:
+    vpbroadcastb    m9, r3d
+    vpbroadcastb    ym1, angled
+    shr             angled, 8
+    vpcmpeqb        k1, ym9, [base+z_filter_wh]
+    mova            xm0, [base+z_filter_t0+angleq*8]
+    vpcmpgtb        k1{k1}, ym1, ym0
+    pminub          m10, m9, [pb_0to63]
+    kmovd           r5d, k1
+    ret
+.w16_load:
+    cmp             r3d, hd
+    cmovae          r3d, hd
+    add             r3d, hd
+    mova            m7, [tlq-64*1]
+    neg             r3d ; -(h + imin(w, h))
+    and             r3d, 63
+    vpbroadcastb    m1, r3d
+    pmaxub          m2, m0, m1
+    cmp             hd, 64
+    je              .w16_load_h64
+    vpermb          m8, m1, m7
+    vpermb          m7, m2, m7
+    ret
+.w16_load_h64:
+    vpermb          m7, m0, m7
+    vpermb          m8, m2, [tlq-64*2]
+    ret
+.w16:
+    mov             r3d, 16
+    call            .w16_load
+    test            angled, 0x400 ; !enable_intra_edge_filter
+    jnz             .w16_main
+    vpbroadcastd    m2, [tlq-3]
+    cmp             hd, 64
+    je              .w16_filter64
+    lea             r3d, [hq+15]
+    call            .filter_strength2
+    test            r5d, r5d
+    jz              .w16_main
+    call            mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w16_filter
+    pminub          m10, m9, [pb_0to63]
+    vpermb          m8, m9, m0
+    vpermb          m7, m10, m0
+    jmp             .w16_main
+.w16_filter64:
+    vpbroadcastd    m13, [base+pb_15]
+    valignq         m0, m8, m7, 7
+    pminub          m12, m13, [pb_0to63]
+    valignq         m11, m8, m7, 1
+    call            mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter
+.w16_main:
+    vpbroadcastd    m6, [base+pb_4]
+    vpbroadcastw    m0, dyd
+    vbroadcasti32x4 m3, [base+z_ypos_mul1] ; 1.. 8
+    vbroadcasti32x4 m2, [base+z_ypos_mul2] ; 9..15
+    pmulhuw         m3, m0 ; ypos >> 1
+    pmulhuw         m2, m0
+    movshdup        m0, [base+z_ypos_off2]
+    lea             r2, [strideq*3]
+    vpbroadcastd    m1, [base+pb_1]
+    vpermw          m4, m3, m14 ; 64-frac, frac
+    psrlw           m3, 5
+    vpermw          m5, m2, m14
+    psrlw           m2, 5
+    packsswb        m3, m2
+    paddsb          m3, m0
+    paddsb          m1, m3
+    punpcklbw       m2, m3, m1 ; base, base+1
+    punpckhbw       m3, m1
+.w16_loop:
+%macro Z3_PERM2 0
+    mova            m0, m7
+    vpermt2b        m0, m2, m8
+    mova            m1, m7
+    vpermt2b        m1, m3, m8
+    pmaddubsw       m0, m4
+    pmaddubsw       m1, m5
+    paddsb          m2, m6
+    paddsb          m3, m6
+    pmulhrsw        m0, m15
+    pmulhrsw        m1, m15
+    packuswb        m0, m1
+%endmacro
+    Z3_PERM2
+    mova            [dstq+strideq*0], xm0
+    vextracti32x4   [dstq+strideq*1], ym0, 1
+    vextracti32x4   [dstq+strideq*2], m0, 2
+    vextracti32x4   [dstq+r2       ], m0, 3
+    lea             dstq, [dstq+strideq*4]
+    sub             hd, 4
+    jg              .w16_loop
+    RET
+.w32:
+    mov             r3d, 32
+    call            .w16_load
+    test            angled, 0x400 ; !enable_intra_edge_filter
+    jnz             .w32_main
+    vpbroadcastd    m2, [tlq-3]
+    cmp             hd, 64
+    je              .w32_filter64
+    lea             r3d, [hq+31]
+    vpbroadcastb    m9, r3d
+    call            mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w32_filter
+    vpermb          m8, m9, m7
+    jmp             .w32_main
+.w32_filter64:
+    vpbroadcastd    m13, [base+pb_31]
+    valignq         m0, m8, m7, 7
+    pminub          m12, m13, [pb_0to63]
+    valignq         m11, m8, m7, 1
+    call            mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter
+.w32_main:
+    vbroadcasti32x8 m3, [base+z_ypos_mul1] ; 1.. 8
+    vpbroadcastw    m0, dyd
+    vbroadcasti32x8 m2, [base+z_ypos_mul2] ; 9..15
+    vpbroadcastd    m1, [base+pb_1]
+    pmulhuw         m3, m0 ; ypos >> 1
+    pmulhuw         m2, m0
+    vpbroadcastd    m6, [base+pb_2]
+    mova            ym0, ym1
+    vpermw          m4, m3, m14 ; 64-frac, frac
+    psrlw           m3, 5
+    vpermw          m5, m2, m14
+    psrlw           m2, 5
+    packsswb        m3, m2
+    paddsb          m3, m0
+    paddsb          m1, m3
+    punpcklbw       m2, m3, m1 ; base, base+1
+    punpckhbw       m3, m1
+.w32_loop:
+    Z3_PERM2
+    vextracti32x8   [dstq+strideq*0], m0, 1
+    mova            [dstq+strideq*1], ym0
+    lea             dstq, [dstq+strideq*2]
+    sub             hd, 2
+    jg              .w32_loop
+    RET
+.w64:
+    mova            m7, [tlq-64*1]
+    cmp             hd, 64
+    je              .w64_h64
+    lea             r3d, [hq*2-1]
+    xor             r3d, 63 ; -(h + imin(w, h)) & 63
+    vpbroadcastb    m1, r3d
+    pmaxub          m0, m1
+    vpermb          m8, m1, m7
+    jmp             .w64_filter
+.w64_h64:
+    vpermb          m8, m0, [tlq-64*2]
+.w64_filter:
+    vpermb          m7, m0, m7
+    test            angled, 0x400 ; !enable_intra_edge_filter
+    jnz             .w64_main
+    lea             r3d, [hq-1]
+    vpbroadcastd    m2, [tlq-3]
+    vpbroadcastb    m13, r3d
+    valignq         m0, m8, m7, 7
+    pminub          m12, m13, [pb_0to63]
+    valignq         m11, m8, m7, 1
+    call            mangle(private_prefix %+ _ipred_z1_8bpc_avx512icl).w64_filter
+.w64_main:
+    vpbroadcastw    m2, dyd
+    pmulhuw         m3, m2, [base+z_ypos_mul1]
+    pmulhuw         m2, [base+z_ypos_mul2]
+    vpbroadcastd    m6, [base+pb_1]
+    vpermw          m4, m3, m14 ; 64-frac, frac
+    psrlw           m3, 5
+    vpermw          m5, m2, m14
+    psrlw           m2, 5
+    packsswb        m3, m2
+    paddsb          m1, m3, m6
+    punpcklbw       m2, m3, m1 ; base, base+1
+    punpckhbw       m3, m1
+.w64_loop:
+    Z3_PERM2
+    mova            [dstq], m0
+    add             dstq, strideq
+    dec             hd
+    jg              .w64_loop
+    RET
+
 ; The ipred_filter code processes 4x2 blocks in the following order
 ; which increases parallelism compared to doing things row by row.
 ; Some redundant blocks are calculated for w > 4.
diff --git a/third_party/dav1d/src/x86/loopfilter_avx512.asm b/third_party/dav1d/src/x86/loopfilter_avx512.asm
index 0218b624d3c0..202a612aac42 100644
--- a/third_party/dav1d/src/x86/loopfilter_avx512.asm
+++ b/third_party/dav1d/src/x86/loopfilter_avx512.asm
@@ -41,6 +41,10 @@ hmulC: dd 0, 1, 2, 3, 16, 17, 18, 19, 32, 33, 34, 35, 48, 49, 50, 51
 hmulD: dd 0, 1, 16, 17, 32, 33, 48, 49
 hshuf4:db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
 
+shift1: dq 0x0204081020408000
+shift3: dq 0x0810204080000000
+shift4: dq 0x1020408000000000
+
 pb_1: times 4 db 1
 pb_2: times 4 db 2
 pb_3: times 4 db 3
@@ -49,9 +53,6 @@ pb_16: times 4 db 16
 pb_63: times 4 db 63
 pb_64: times 4 db 64
 pb_128: times 4 db 0x80
-pb_240: times 4 db 0xf0
-pb_248: times 4 db 0xf8
-pb_254: times 4 db 0xfe
 pb_2_1: times 2 db 2, 1
 pb_3_1: times 2 db 3, 1
 pb_7_1: times 2 db 7, 1
@@ -482,8 +483,7 @@ SECTION .text
     vpbroadcastb    m1, [lutq+136]
     pminub          m2, m1
    pmaxub          m2, m15 ; I
-    pand            m1, m0, [pb_240]{bcstd}
-    psrlq           m1, 4 ; H
+    gf2p8affineqb   m1, m0, [shift4]{bcstq}, 0 ; H
     paddd           m0, [pb_2]{bcstd}
     paddb           m0, m0
     paddb           m0, m2 ; E
@@ -534,8 +534,7 @@ SECTION .text
     ABSSUB          m10, m3, m6, m11 ; abs(p1-q1)
     ABSSUB          m11, m4, m5, m2  ; abs(p0-q0)
     paddusb         m11, m11
-    pand            m10, [pb_254]{bcstd}
-    psrlq           m10, 1
+    gf2p8affineqb   m10, m10, [shift1]{bcstq}, 0
    paddusb         m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1)
     vpcmpub         k3{k3}, m10, m0, 2 ; abs(p0-q0)*2+(abs(p1-q1)>>1) <= E
@@ -608,12 +607,8 @@ SECTION .text
     paddsb          m10{k3}{z}, m10, m11 ; f=iclip_diff(3*(q0-p0)+f)&fm
     paddsb          m8, m10, m15
     paddsb          m10, m0
-    pand            m8, [pb_248]{bcstd}
-    pand            m10, [pb_248]{bcstd}
-    psrlq           m8, 3
-    psrlq           m10, 3
-    pxor            m8, m12
-    pxor            m10, m12
+    gf2p8affineqb   m8, m8, [shift3]{bcstq}, 16
+    gf2p8affineqb   m10, m10, [shift3]{bcstq}, 16
     psubb           m8, m12 ; f2
     psubb           m10, m12 ; f1
     paddsb          m4, m8
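The loopfilter hunks above replace pand+psrlq pairs with single gf2p8affineqb instructions. GF2P8AFFINEQB multiplies every byte, viewed as a GF(2) bit vector, by an 8x8 bit matrix packed into a qword, then XORs the immediate; an identity matrix shifted by n rows yields a true per-byte shift with no cross-byte leakage (which is why the old code needed the pb_240/pb_248/pb_254 masks), and the imm8 of 16 additionally folds in the former `pxor` with the sign-bias constant. A hedged C-intrinsics sketch of the trick (not dav1d code; assumes a GFNI+AVX-512 build, e.g. -mgfni -mavx512f):

```c
/* Sketch: per-byte logical right shift via a GF(2) affine transform.
 * 0x1020408000000000 is the same matrix the diff names shift4. */
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    const __m512i x = _mm512_set1_epi8((char)0xf7);
    const __m512i m = _mm512_set1_epi64(0x1020408000000000LL);
    const __m512i r = _mm512_gf2p8affine_epi64_epi8(x, m, 0);
    uint8_t out[64];
    _mm512_storeu_si512(out, r);
    printf("0xf7 >> 4 = 0x%02x\n", out[0]); /* prints 0x0f */
    return 0;
}
```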
diff --git a/third_party/dav1d/src/x86/looprestoration16_avx2.asm b/third_party/dav1d/src/x86/looprestoration16_avx2.asm
index ef25c2847441..4cf8b905c260 100644
--- a/third_party/dav1d/src/x86/looprestoration16_avx2.asm
+++ b/third_party/dav1d/src/x86/looprestoration16_avx2.asm
@@ -32,15 +32,15 @@ SECTION_RODATA 32
 
 sgr_lshuf3: db 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
 sgr_lshuf5: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
+wiener_lshuf5: db 4, 5, 4, 5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+               db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+wiener_lshuf7: db 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10, 11, 12, 13, 14, 15
+               db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 wiener_shufA: db 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11
 wiener_shufB: db 6, 7, 4, 5, 8, 9, 6, 7, 10, 11, 8, 9, 12, 13, 10, 11
 wiener_shufC: db 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15
 wiener_shufD: db 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, -1, -1, 8, 9, -1, -1
 wiener_shufE: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
-wiener_lshuf5: db 4, 5, 4, 5, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-wiener_lshuf7: db 8, 9, 8, 9, 8, 9, 8, 9, 8, 9, 10, 11, 12, 13, 14, 15
-pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-          db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
 
 wiener_hshift: dw 4, 4, 1, 1
 wiener_vshift: dw 1024, 1024, 4096, 4096
@@ -62,6 +62,7 @@ pd_0xf00801c7: dd 0xf00801c7
 
 %define pw_256 sgr_lshuf5
 
+cextern pb_0to63
 cextern sgr_x_by_x_avx2
 
 SECTION .text
@@ -182,7 +183,7 @@ cglobal wiener_filter7_16bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
 .extend_right:
     movd            xm1, r10d
    vpbroadcastd    m0, [pb_6_7]
-    movu            m2, [pb_0to31]
+    mova            m2, [pb_0to63]
     vpbroadcastb    m1, xm1
     psubb           m0, m1
     pminub          m0, m2
@@ -406,9 +407,8 @@ cglobal wiener_filter5_16bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
     vpbroadcastd    m0, [base+wiener_hshift+t3*4]
     vpbroadcastd    m9, [base+wiener_round+t3*4]
     vpbroadcastd    m10, [base+wiener_vshift+t3*4]
-    movu            xm15, [wiener_lshuf5]
+    mova            m15, [wiener_lshuf5]
     pmullw          m11, m0
-    vinserti128     m15, [pb_0to31], 1
     pmullw          m12, m0
     test            edgeb, 4 ; LR_HAVE_TOP
     jz              .no_top
@@ -486,7 +486,7 @@ cglobal wiener_filter5_16bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
     vpbroadcastb    m2, xm2
     psubb           m0, m2
     psubb           m1, m2
-    movu            m2, [pb_0to31]
+    mova            m2, [pb_0to63]
     pminub          m0, m2
     pminub          m1, m2
     pshufb          m3, m0
diff --git a/third_party/dav1d/src/x86/looprestoration_avx2.asm b/third_party/dav1d/src/x86/looprestoration_avx2.asm
index a73cb218826a..778799742566 100644
--- a/third_party/dav1d/src/x86/looprestoration_avx2.asm
+++ b/third_party/dav1d/src/x86/looprestoration_avx2.asm
@@ -31,11 +31,11 @@
 SECTION_RODATA 32
 
 wiener_l_shuf: db 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-pb_0to31: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-          db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+               db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 wiener_shufA: db 1, 7, 2, 8, 3, 9, 4, 10, 5, 11, 6, 12, 7, 13, 8, 14
 wiener_shufB: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
 wiener_shufC: db 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12
+sgr_l_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
 sgr_r_ext: times 16 db 1
            times 16 db 9
@@ -64,7 +64,6 @@ pb_m5: times 4 db -5
 pb_3: times 4 db 3
 pw_5_6: dw 5, 6
 
-sgr_l_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
 sgr_shuf: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1
           db 9, -1, 10, -1, 11, -1, 12, -1
 
@@ -77,6 +76,8 @@ pd_m4096: dd -4096
 pd_0xf00801c7: dd 0xf00801c7
 pd_0xf00800a4: dd 0xf00800a4
 
+cextern pb_0to63
+
 SECTION .text
 
 DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers
@@ -192,7 +193,7 @@ cglobal wiener_filter7_8bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
     vpbroadcastd    m0, [pb_3]
     vpbroadcastd    m1, [pb_m5]
     vpbroadcastb    m2, xm2
-    movu            m3, [pb_0to31]
+    mova            m3, [pb_0to63]
     psubb           m0, m2
     psubb           m1, m2
     pminub          m0, m3
@@ -826,7 +827,7 @@ cglobal sgr_filter_5x5_8bpc, 4, 13, 16, 400*24+16, dst, stride, left, lpf, \
     mova            m0, [sgr_r_ext]
     vpbroadcastb    m2, xm2
     psubb           m0, m2
-    pminub          m0, [pb_0to31]
+    pminub          m0, [pb_0to63]
     pshufb          m5, m0
     ret
 .h: ; horizontal boxsum
diff --git a/third_party/dav1d/src/x86/pal.asm b/third_party/dav1d/src/x86/pal.asm
index 27187d11fae3..92075b9ba8ad 100644
--- a/third_party/dav1d/src/x86/pal.asm
+++ b/third_party/dav1d/src/x86/pal.asm
@@ -28,7 +28,7 @@
 
 SECTION_RODATA 64
 
-pb_0to63: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+const pb_0to63, db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 %if ARCH_X86_64
           db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
           db 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47
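A recurring theme in the x86 hunks above is deduplicating the 0..63 byte ramp: pal.asm now exports a single copy through x86inc's `const` macro, and filmgrain16_avx512.asm, ipred_avx512.asm and both looprestoration files import it with `cextern` instead of keeping private pb_0to31/pb_0to63 copies. This is the assembly analogue of defining a constant in one translation unit and declaring it extern everywhere else; a single-file C sketch of the pattern (not dav1d's build setup):

```c
/* Sketch: one definition, many extern declarations, so the 64-byte
 * ramp exists exactly once in .rodata. */
#include <stdio.h>

/* a shared header would carry this declaration (`cextern pb_0to63`) */
extern const unsigned char pb_0to63[64];

/* exactly one owning file defines it (pal.asm's `const` plays this role) */
const unsigned char pb_0to63[64] = {
     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
};

int main(void) {
    printf("pb_0to63[63] = %d\n", pb_0to63[63]);
    return 0;
}
```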