diff --git a/media/libdav1d/moz.yaml b/media/libdav1d/moz.yaml index ca5279ba8e31..d97fec14d981 100644 --- a/media/libdav1d/moz.yaml +++ b/media/libdav1d/moz.yaml @@ -20,11 +20,11 @@ origin: # Human-readable identifier for this version/release # Generally "version NNN", "tag SSS", "bookmark SSS" - release: 7d23ec4a042b2feb6c0d4c1b0618a87cb8c34dcb (2023-03-13T15:19:35.000+00:00). + release: 5aa3b38f9871859e14e55f18ab5e38318fe86305 (2023-04-08T11:47:31.000+00:00). # Revision to pull in # Must be a long or short commit SHA (long preferred) - revision: 7d23ec4a042b2feb6c0d4c1b0618a87cb8c34dcb + revision: 5aa3b38f9871859e14e55f18ab5e38318fe86305 # The package's license, where possible using the mnemonic from # https://spdx.org/licenses/ diff --git a/media/libdav1d/vcs_version.h b/media/libdav1d/vcs_version.h index e08cb9dc2ebd..c74f30d0be1f 100644 --- a/media/libdav1d/vcs_version.h +++ b/media/libdav1d/vcs_version.h @@ -1,2 +1,2 @@ /* auto-generated, do not edit */ -#define DAV1D_VERSION "7d23ec4a042b2feb6c0d4c1b0618a87cb8c34dcb" +#define DAV1D_VERSION "5aa3b38f9871859e14e55f18ab5e38318fe86305" diff --git a/third_party/dav1d/src/arm/64/ipred.S b/third_party/dav1d/src/arm/64/ipred.S index 248ca44a499c..dab67577e63c 100644 --- a/third_party/dav1d/src/arm/64/ipred.S +++ b/third_party/dav1d/src/arm/64/ipred.S @@ -1481,11 +1481,10 @@ function ipred_z1_filter_edge_8bpc_neon, export=1 sub x5, x5, w3, uxtw add x6, x2, w6, sxtw - ld1 {v2.16b, v3.16b}, [x5] // padding_mask + ld1 {v2.16b}, [x5] // padding_mask - ld1r {v4.16b}, [x6] - bit v0.16b, v4.16b, v2.16b // Pad v0-v1 - bit v1.16b, v4.16b, v3.16b + ld1r {v1.16b}, [x6] + bit v0.16b, v1.16b, v2.16b // Pad v0-v1 // Filter one block ext v2.16b, v0.16b, v1.16b, #1 @@ -1598,6 +1597,17 @@ L(fivetap): ret endfunc +// void ipred_pixel_set_8bpc_neon(pixel *out, const pixel px, +// const int n); +function ipred_pixel_set_8bpc_neon, export=1 + dup v0.16b, w1 +1: + subs w2, w2, #16 + st1 {v0.16b}, [x0], #16 + b.gt 1b + ret +endfunc + // void ipred_z1_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const top, // const int width, const int height, @@ -1671,10 +1681,10 @@ function ipred_z1_fill1_8bpc_neon, export=1 dup v7.8b, w11 ext v1.16b, v0.16b, v0.16b, #1 // top[base+1] ext v3.16b, v2.16b, v2.16b, #1 - umull v16.8h, v1.8b, v4.8b // top[base+1]*frac - umlal v16.8h, v0.8b, v6.8b // + top[base]*(64-frac) - umull v17.8h, v3.8b, v5.8b - umlal v17.8h, v2.8b, v7.8b + umull v16.8h, v0.8b, v6.8b // top[base]*(64-frac) + umlal v16.8h, v1.8b, v4.8b // + top[base+1]*frac + umull v17.8h, v2.8b, v7.8b + umlal v17.8h, v3.8b, v5.8b rshrn v16.8b, v16.8h, #6 rshrn v17.8b, v17.8h, #6 st1 {v16.8b}, [x0], x1 @@ -1724,14 +1734,14 @@ function ipred_z1_fill1_8bpc_neon, export=1 ext v16.16b, v0.16b, v1.16b, #1 // top[base+1] ext v17.16b, v2.16b, v3.16b, #1 subs w3, w3, #16 - umull v18.8h, v16.8b, v4.8b // top[base+1]*frac - umlal v18.8h, v0.8b, v6.8b // + top[base]*(64-frac) - umull2 v19.8h, v16.16b, v4.16b - umlal2 v19.8h, v0.16b, v6.16b - umull v20.8h, v17.8b, v5.8b - umlal v20.8h, v2.8b, v7.8b - umull2 v21.8h, v17.16b, v5.16b - umlal2 v21.8h, v2.16b, v7.16b + umull v18.8h, v0.8b, v6.8b // top[base]*(64-frac) + umlal v18.8h, v16.8b, v4.8b // + top[base+1]*frac + umull2 v19.8h, v0.16b, v6.16b + umlal2 v19.8h, v16.16b, v4.16b + umull v20.8h, v2.8b, v7.8b + umlal v20.8h, v17.8b, v5.8b + umull2 v21.8h, v2.16b, v7.16b + umlal2 v21.8h, v17.16b, v5.16b rshrn v16.8b, v18.8h, #6 rshrn2 v16.16b, v19.8h, #6 rshrn v17.8b, v20.8h, #6 @@ -1899,7 +1909,7 @@ function 
ipred_z3_fill1_8bpc_neon, export=1 ld1r {v31.16b}, [x10] // padding ld1 {v30.8h}, [x11] // increments mov w7, w5 - b.gt L(ipred_z3_fill1_large_w16) + b.gt L(ipred_z3_fill1_large_h16) br x8 40: @@ -1909,6 +1919,7 @@ function ipred_z3_fill1_8bpc_neon, export=1 mul v30.4h, v30.4h, v29.4h // {0,1,2,3,4,5,6,7}*dy movi v23.16b, #0x3e + // Worst case max_base_y is width+height-1, for w=4, h=16, <= 32 ld1 {v0.16b, v1.16b}, [x2] // left[] add v30.4h, v29.4h, v30.4h // ypos @@ -1958,7 +1969,8 @@ function ipred_z3_fill1_8bpc_neon, export=1 mul v30.8h, v30.8h, v29.8h // {0,1,2,3,4,5,6,7}*dy movi v23.16b, #0x3e - ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[] + // Worst case max_base_y is width+height-1, for w=8, h=32, <= 48 + ld1 {v0.16b, v1.16b, v2.16b}, [x2] // left[] add v30.8h, v29.8h, v30.8h // ypos movi v22.16b, #64 @@ -1974,12 +1986,12 @@ function ipred_z3_fill1_8bpc_neon, export=1 uqadd v28.8b, v26.8b, v21.8b // base + 2 sub v25.8b, v22.8b, v24.8b // 64 - frac - tbx v4.8b, {v0.16b, v1.16b, v2.16b, v3.16b}, v26.8b // left[base] + tbx v4.8b, {v0.16b, v1.16b, v2.16b}, v26.8b // left[base] 1: mov v5.8b, v31.8b mov v6.8b, v31.8b - tbx v5.8b, {v0.16b, v1.16b, v2.16b, v3.16b}, v27.8b // left[base+1] - tbx v6.8b, {v0.16b, v1.16b, v2.16b, v3.16b}, v28.8b // left[base+2] + tbx v5.8b, {v0.16b, v1.16b, v2.16b}, v27.8b // left[base+1] + tbx v6.8b, {v0.16b, v1.16b, v2.16b}, v28.8b // left[base+2] umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac) umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac @@ -2008,6 +2020,7 @@ function ipred_z3_fill1_8bpc_neon, export=1 mul v30.8h, v30.8h, v28.8h // {0,1,2,3,4,5,6,7}*dy movi v23.16b, #0x3e + // This is only executed if we've checked that max_base_y <= 64. ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[] add v28.8h, v28.8h, v30.8h // ypos @@ -2075,6 +2088,7 @@ function ipred_z3_fill1_8bpc_neon, export=1 sub x1, x1, w3, uxtw add v30.8h, v28.8h, v30.8h // ypos + // This is only executed if we've checked that max_base_y <= 64. ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[] movi v22.16b, #64 @@ -2146,7 +2160,7 @@ function ipred_z3_fill1_8bpc_neon, export=1 9: ret -L(ipred_z3_fill1_large_w16): +L(ipred_z3_fill1_large_h16): // Fallback case for max_base_y > 64; similar to the z1 // implementation. This does the filtering vertically, filling out // a 2x pixel column at a time. @@ -2358,7 +2372,7 @@ L(ipred_z3_fill_padding_wide): st1 {v31.16b}, [x0], #16 b.gt 2b subs w4, w4, #1 - add x0, x0, x1 + add x0, x0, x1 b.le 9f mov w3, w12 b 1b @@ -2367,16 +2381,11 @@ L(ipred_z3_fill_padding_wide): endfunc function ipred_z3_fill2_8bpc_neon, export=1 - adr x8, L(ipred_z3_fill1_tbl) + cmp w3, #8 add x10, x2, w6, uxtw // left[max_base_y] movrel x11, increments ld1r {v31.16b}, [x10] // padding ld1 {v30.8h}, [x11] // increments - mov w7, w5 - - cmp w3, #8 - add x10, x2, w6, uxtw // left[max_base_y] - ld1r {v31.16b}, [x10] // padding b.eq 80f 40: // w == 4 @@ -2385,6 +2394,8 @@ function ipred_z3_fill2_8bpc_neon, export=1 mul v30.4h, v30.4h, v29.4h // {0,1,2,3,4,5,6,7}*dy movi v23.16b, #0x3e + // Worst case max_base_y is 2*(width+height)-2, but width+height <= 16, + // so max_base_y <= 32. ld1 {v0.16b, v1.16b}, [x2] // left[] add v30.4h, v29.4h, v30.4h // ypos @@ -2434,6 +2445,8 @@ function ipred_z3_fill2_8bpc_neon, export=1 mul v30.8h, v30.8h, v29.8h // {0,1,2,3,4,5,6,7}*dy movi v23.16b, #0x3e + // Worst case max_base_y is 2*(width+height)-2, but width+height <= 16, + // so max_base_y <= 32. 
ld1 {v0.16b, v1.16b}, [x2] // left[] add v30.8h, v29.8h, v30.8h // ypos diff --git a/third_party/dav1d/src/arm/64/ipred16.S b/third_party/dav1d/src/arm/64/ipred16.S index 6a28661671e3..c48c48583ce3 100644 --- a/third_party/dav1d/src/arm/64/ipred16.S +++ b/third_party/dav1d/src/arm/64/ipred16.S @@ -1405,6 +1405,1033 @@ L(ipred_smooth_h_tbl): .hword L(ipred_smooth_h_tbl) - 40b endfunc +const padding_mask_buf + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +padding_mask: + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +endconst + +// void ipred_z1_upsample_edge_16bpc_neon(pixel *out, const int hsz, +// const pixel *const in, const int end, +// const int bitdepth_max); +function ipred_z1_upsample_edge_16bpc_neon, export=1 + dup v30.8h, w4 // bitdepth_max + movrel x4, padding_mask + ld1 {v0.8h, v1.8h}, [x2] // in[] + add x5, x2, w3, uxtw #1 // in[end] + sub x4, x4, w3, uxtw #1 + + ld1r {v2.8h}, [x5] // padding + ld1 {v3.8h, v4.8h}, [x4] // padding_mask + + movi v31.8h, #9 + + bit v0.16b, v2.16b, v3.16b // padded in[] + bit v1.16b, v2.16b, v4.16b + + ext v4.16b, v0.16b, v1.16b, #2 + ext v5.16b, v1.16b, v2.16b, #2 + ext v6.16b, v0.16b, v1.16b, #4 + ext v7.16b, v1.16b, v2.16b, #4 + ext v16.16b, v0.16b, v1.16b, #6 + ext v17.16b, v1.16b, v2.16b, #6 + + add v18.8h, v4.8h, v6.8h // in[i+1] + in[i+2] + add v19.8h, v5.8h, v7.8h + add v20.8h, v0.8h, v16.8h + add v21.8h, v1.8h, v17.8h + umull v22.4s, v18.4h, v31.4h // 9*(in[i+1] + in[i+2]) + umull2 v23.4s, v18.8h, v31.8h + umull v24.4s, v19.4h, v31.4h + umull2 v25.4s, v19.8h, v31.8h + usubw v22.4s, v22.4s, v20.4h + usubw2 v23.4s, v23.4s, v20.8h + usubw v24.4s, v24.4s, v21.4h + usubw2 v25.4s, v25.4s, v21.8h + + sqrshrun v16.4h, v22.4s, #4 + sqrshrun2 v16.8h, v23.4s, #4 + sqrshrun v17.4h, v24.4s, #4 + sqrshrun2 v17.8h, v25.4s, #4 + + smin v16.8h, v16.8h, v30.8h + smin v17.8h, v17.8h, v30.8h + + zip1 v0.8h, v4.8h, v16.8h + zip2 v1.8h, v4.8h, v16.8h + zip1 v2.8h, v5.8h, v17.8h + zip2 v3.8h, v5.8h, v17.8h + + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] + + ret +endfunc + +const edge_filter + .short 0, 4, 8, 0 + .short 0, 5, 6, 0 +// Leaving out the coeffs for strength=3 +// .byte 2, 4, 4, 0 +endconst + +// void ipred_z1_filter_edge_16bpc_neon(pixel *out, const int sz, +// const pixel *const in, const int end, +// const int strength); +function ipred_z1_filter_edge_16bpc_neon, export=1 + cmp w4, #3 + b.eq L(fivetap) // if (strength == 3) goto fivetap + + movrel x5, edge_filter, -6 + add x5, x5, w4, uxtw #3 // edge_filter + 2*((strength - 1)*4 + 1) + + ld1 {v31.s}[0], [x5] // kernel[1-2] + + ld1 {v0.8h}, [x2], #16 + + dup v30.8h, v31.h[0] + dup v31.8h, v31.h[1] +1: + // in[end], is the last valid pixel. We produce 16 pixels out by + // using 18 pixels in - the last pixel used is [17] of the ones + // read/buffered. 
+ cmp w3, #17 + ld1 {v1.8h, v2.8h}, [x2], #32 + b.lt 2f + ext v3.16b, v0.16b, v1.16b, #2 + ext v4.16b, v1.16b, v2.16b, #2 + ext v5.16b, v0.16b, v1.16b, #4 + ext v6.16b, v1.16b, v2.16b, #4 + mul v16.8h, v0.8h, v30.8h + mla v16.8h, v3.8h, v31.8h + mla v16.8h, v5.8h, v30.8h + mul v17.8h, v1.8h, v30.8h + mla v17.8h, v4.8h, v31.8h + mla v17.8h, v6.8h, v30.8h + subs w1, w1, #16 + mov v0.16b, v2.16b + urshr v16.8h, v16.8h, #4 + urshr v17.8h, v17.8h, #4 + sub w3, w3, #16 + st1 {v16.8h, v17.8h}, [x0], #32 + b.gt 1b + ret +2: + // Right padding + + // x2[w3-24] is the padding pixel (x2 points 24 pixels ahead) + movrel x5, padding_mask + sub w6, w3, #24 + sub x5, x5, w3, uxtw #1 + add x6, x2, w6, sxtw #1 + + ld1 {v3.8h, v4.8h}, [x5] // padding_mask + + ld1r {v2.8h}, [x6] + bit v0.16b, v2.16b, v3.16b // Pad v0-v1 + bit v1.16b, v2.16b, v4.16b + + // Filter one block + ext v3.16b, v0.16b, v1.16b, #2 + ext v4.16b, v1.16b, v2.16b, #2 + ext v5.16b, v0.16b, v1.16b, #4 + ext v6.16b, v1.16b, v2.16b, #4 + mul v16.8h, v0.8h, v30.8h + mla v16.8h, v3.8h, v31.8h + mla v16.8h, v5.8h, v30.8h + mul v17.8h, v1.8h, v30.8h + mla v17.8h, v4.8h, v31.8h + mla v17.8h, v6.8h, v30.8h + subs w1, w1, #16 + urshr v16.8h, v16.8h, #4 + urshr v17.8h, v17.8h, #4 + st1 {v16.8h, v17.8h}, [x0], #32 + b.le 9f +5: + // After one block, any remaining output would only be filtering + // padding - thus just store the padding. + subs w1, w1, #16 + st1 {v2.16b}, [x0], #16 + b.gt 5b +9: + ret + +L(fivetap): + sub x2, x2, #2 // topleft -= 1 pixel + movi v29.8h, #2 + ld1 {v0.8h}, [x2], #16 + movi v30.8h, #4 + movi v31.8h, #4 + ins v0.h[0], v0.h[1] +1: + // in[end+1], is the last valid pixel. We produce 16 pixels out by + // using 20 pixels in - the last pixel used is [19] of the ones + // read/buffered. + cmp w3, #18 + ld1 {v1.8h, v2.8h}, [x2], #32 + b.lt 2f // if (end + 1 < 19) + ext v3.16b, v0.16b, v1.16b, #2 + ext v4.16b, v1.16b, v2.16b, #2 + ext v5.16b, v0.16b, v1.16b, #4 + ext v6.16b, v1.16b, v2.16b, #4 + ext v16.16b, v0.16b, v1.16b, #6 + ext v17.16b, v1.16b, v2.16b, #6 + ext v18.16b, v0.16b, v1.16b, #8 + ext v19.16b, v1.16b, v2.16b, #8 + mul v20.8h, v0.8h, v29.8h + mla v20.8h, v3.8h, v30.8h + mla v20.8h, v5.8h, v31.8h + mla v20.8h, v16.8h, v30.8h + mla v20.8h, v18.8h, v29.8h + mul v21.8h, v1.8h, v29.8h + mla v21.8h, v4.8h, v30.8h + mla v21.8h, v6.8h, v31.8h + mla v21.8h, v17.8h, v30.8h + mla v21.8h, v19.8h, v29.8h + subs w1, w1, #16 + mov v0.16b, v2.16b + urshr v20.8h, v20.8h, #4 + urshr v21.8h, v21.8h, #4 + sub w3, w3, #16 + st1 {v20.8h, v21.8h}, [x0], #32 + b.gt 1b + ret +2: + // Right padding + + // x2[w3+1-24] is the padding pixel (x2 points 24 pixels ahead) + movrel x5, padding_mask, -2 + sub w6, w3, #23 + sub x5, x5, w3, uxtw #1 + add x6, x2, w6, sxtw #1 + + ld1 {v3.8h, v4.8h, v5.8h}, [x5] // padding_mask + + ld1r {v28.8h}, [x6] + bit v0.16b, v28.16b, v3.16b // Pad v0-v2 + bit v1.16b, v28.16b, v4.16b + bit v2.16b, v28.16b, v5.16b +4: + // Filter one block + ext v3.16b, v0.16b, v1.16b, #2 + ext v4.16b, v1.16b, v2.16b, #2 + ext v5.16b, v0.16b, v1.16b, #4 + ext v6.16b, v1.16b, v2.16b, #4 + ext v16.16b, v0.16b, v1.16b, #6 + ext v17.16b, v1.16b, v2.16b, #6 + ext v18.16b, v0.16b, v1.16b, #8 + ext v19.16b, v1.16b, v2.16b, #8 + mul v20.8h, v0.8h, v29.8h + mla v20.8h, v3.8h, v30.8h + mla v20.8h, v5.8h, v31.8h + mla v20.8h, v16.8h, v30.8h + mla v20.8h, v18.8h, v29.8h + mul v21.8h, v1.8h, v29.8h + mla v21.8h, v4.8h, v30.8h + mla v21.8h, v6.8h, v31.8h + mla v21.8h, v17.8h, v30.8h + mla v21.8h, v19.8h, v29.8h + subs w1, w1, #16 + mov v0.16b, 
v2.16b + mov v1.16b, v28.16b + mov v2.16b, v28.16b + urshr v20.8h, v20.8h, #4 + urshr v21.8h, v21.8h, #4 + sub w3, w3, #16 + st1 {v20.8h, v21.8h}, [x0], #32 + b.le 9f + // v0-v1[w3+1] is the last valid pixel; if (w3 + 1 > 0) we need to + // filter properly once more - aka (w3 >= 0). + cmp w3, #0 + b.ge 4b +5: + // When w3 <= 0, all remaining pixels in v0-v1 are equal to the + // last valid pixel - thus just output that without filtering. + subs w1, w1, #8 + st1 {v28.8h}, [x0], #16 + b.gt 5b +9: + ret +endfunc + +// void ipred_pixel_set_16bpc_neon(pixel *out, const pixel px, +// const int n); +function ipred_pixel_set_16bpc_neon, export=1 + dup v0.8h, w1 +1: + subs w2, w2, #8 + st1 {v0.8h}, [x0], #16 + b.gt 1b + ret +endfunc + +// void ipred_z1_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const top, +// const int width, const int height, +// const int dx, const int max_base_x); +function ipred_z1_fill1_16bpc_neon, export=1 + clz w9, w3 + adr x8, L(ipred_z1_fill1_tbl) + sub w9, w9, #25 + ldrh w9, [x8, w9, uxtw #1] + add x10, x2, w6, uxtw #1 // top[max_base_x] + sub x8, x8, w9, uxtw + ld1r {v31.8h}, [x10] // padding + mov w7, w5 + mov w15, #64 + br x8 +40: + AARCH64_VALID_JUMP_TARGET +4: + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge 49f + lsl w8, w8, #1 + lsl w10, w10, #1 + ldr q0, [x2, w8, uxtw] // top[base] + ldr q2, [x2, w10, uxtw] + dup v4.4h, w9 // frac + dup v5.4h, w11 + ext v1.16b, v0.16b, v0.16b, #2 // top[base+1] + ext v3.16b, v2.16b, v2.16b, #2 + sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base] + sub v7.4h, v3.4h, v2.4h + ushll v16.4s, v0.4h, #6 // top[base]*64 + ushll v17.4s, v2.4h, #6 + smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac + smlal v17.4s, v7.4h, v5.4h + rshrn v16.4h, v16.4s, #6 + rshrn v17.4h, v17.4s, #6 + st1 {v16.4h}, [x0], x1 + add w7, w7, w5 // xpos += dx + subs w4, w4, #2 + st1 {v17.4h}, [x0], x1 + b.gt 4b + ret + +49: + st1 {v31.4h}, [x0], x1 + subs w4, w4, #2 + st1 {v31.4h}, [x0], x1 + b.gt 49b + ret + +80: + AARCH64_VALID_JUMP_TARGET +8: + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge 89f + add x8, x2, w8, uxtw #1 + add x10, x2, w10, uxtw #1 + dup v4.8h, w9 // frac + dup v5.8h, w11 + ld1 {v0.8h}, [x8] // top[base] + ld1 {v2.8h}, [x10] + sub w9, w15, w9 // 64 - frac + sub w11, w15, w11 + ldr h1, [x8, #16] + ldr h3, [x10, #16] + dup v6.8h, w9 // 64 - frac + dup v7.8h, w11 + ext v1.16b, v0.16b, v1.16b, #2 // top[base+1] + ext v3.16b, v2.16b, v3.16b, #2 + umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac) + umlal v16.4s, v1.4h, v4.4h // + top[base+1]*frac + umull2 v17.4s, v0.8h, v6.8h + umlal2 v17.4s, v1.8h, v4.8h + umull v18.4s, v2.4h, v7.4h + umlal v18.4s, v3.4h, v5.4h + umull2 v19.4s, v2.8h, v7.8h + umlal2 v19.4s, v3.8h, v5.8h + rshrn v16.4h, v16.4s, #6 + rshrn2 v16.8h, v17.4s, #6 + rshrn v17.4h, v18.4s, #6 + rshrn2 v17.8h, v19.4s, #6 + st1 {v16.8h}, [x0], x1 + add w7, w7, w5 // xpos += dx + subs w4, w4, #2 + st1 {v17.8h}, [x0], x1 + b.gt 8b + ret + +89: + st1 {v31.8h}, [x0], x1 + subs w4, w4, #2 + st1 {v31.8h}, [x0], x1 + b.gt 89b + ret + +160: +320: +640: + AARCH64_VALID_JUMP_TARGET + + mov w12, w3 + + add x13, x0, x1 + lsl x1, x1, #1 + sub x1, x1, w3, uxtw #1 +1: + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= 
max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge 169f + add x8, x2, w8, uxtw #1 + add x10, x2, w10, uxtw #1 + dup v6.8h, w9 // frac + dup v7.8h, w11 + ld1 {v0.8h, v1.8h, v2.8h}, [x8], #48 // top[base] + ld1 {v3.8h, v4.8h, v5.8h}, [x10], #48 + sub w9, w15, w9 // 64 - frac + sub w11, w15, w11 + dup v16.8h, w9 // 64 - frac + dup v17.8h, w11 + add w7, w7, w5 // xpos += dx +2: + ext v18.16b, v0.16b, v1.16b, #2 // top[base+1] + ext v19.16b, v1.16b, v2.16b, #2 + ext v20.16b, v3.16b, v4.16b, #2 + ext v21.16b, v4.16b, v5.16b, #2 + subs w3, w3, #16 + umull v22.4s, v0.4h, v16.4h // top[base]*(64-frac) + umlal v22.4s, v18.4h, v6.4h // + top[base+1]*frac + umull2 v23.4s, v0.8h, v16.8h + umlal2 v23.4s, v18.8h, v6.8h + umull v24.4s, v1.4h, v16.4h + umlal v24.4s, v19.4h, v6.4h + umull2 v25.4s, v1.8h, v16.8h + umlal2 v25.4s, v19.8h, v6.8h + umull v26.4s, v3.4h, v17.4h + umlal v26.4s, v20.4h, v7.4h + umull2 v27.4s, v3.8h, v17.8h + umlal2 v27.4s, v20.8h, v7.8h + umull v28.4s, v4.4h, v17.4h + umlal v28.4s, v21.4h, v7.4h + umull2 v29.4s, v4.8h, v17.8h + umlal2 v29.4s, v21.8h, v7.8h + rshrn v22.4h, v22.4s, #6 + rshrn2 v22.8h, v23.4s, #6 + rshrn v23.4h, v24.4s, #6 + rshrn2 v23.8h, v25.4s, #6 + rshrn v24.4h, v26.4s, #6 + rshrn2 v24.8h, v27.4s, #6 + rshrn v25.4h, v28.4s, #6 + rshrn2 v25.8h, v29.4s, #6 + st1 {v22.8h, v23.8h}, [x0], #32 + st1 {v24.8h, v25.8h}, [x13], #32 + b.le 3f + mov v0.16b, v2.16b + ld1 {v1.8h, v2.8h}, [x8], #32 // top[base] + mov v3.16b, v5.16b + ld1 {v4.8h, v5.8h}, [x10], #32 + b 2b + +3: + subs w4, w4, #2 + b.le 9f + add x0, x0, x1 + add x13, x13, x1 + mov w3, w12 + b 1b +9: + ret + +169: + st1 {v31.8h}, [x0], #16 + subs w3, w3, #8 + st1 {v31.8h}, [x13], #16 + b.gt 169b + subs w4, w4, #2 + b.le 9b + add x0, x0, x1 + add x13, x13, x1 + mov w3, w12 + b 169b + +L(ipred_z1_fill1_tbl): + .hword L(ipred_z1_fill1_tbl) - 640b + .hword L(ipred_z1_fill1_tbl) - 320b + .hword L(ipred_z1_fill1_tbl) - 160b + .hword L(ipred_z1_fill1_tbl) - 80b + .hword L(ipred_z1_fill1_tbl) - 40b +endfunc + +function ipred_z1_fill2_16bpc_neon, export=1 + cmp w3, #8 + add x10, x2, w6, uxtw // top[max_base_x] + ld1r {v31.16b}, [x10] // padding + mov w7, w5 + mov w15, #64 + b.eq 8f + +4: // w == 4 + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge 49f + lsl w8, w8, #1 + lsl w10, w10, #1 + ldr q0, [x2, w8, uxtw] // top[base] + ldr q2, [x2, w10, uxtw] + dup v4.4h, w9 // frac + dup v5.4h, w11 + uzp2 v1.8h, v0.8h, v0.8h // top[base+1] + uzp1 v0.8h, v0.8h, v0.8h // top[base] + uzp2 v3.8h, v2.8h, v2.8h + uzp1 v2.8h, v2.8h, v2.8h + sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base] + sub v7.4h, v3.4h, v2.4h + ushll v16.4s, v0.4h, #6 // top[base]*64 + ushll v17.4s, v2.4h, #6 + smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac + smlal v17.4s, v7.4h, v5.4h + rshrn v16.4h, v16.4s, #6 + rshrn v17.4h, v17.4s, #6 + st1 {v16.4h}, [x0], x1 + add w7, w7, w5 // xpos += dx + subs w4, w4, #2 + st1 {v17.4h}, [x0], x1 + b.gt 4b + ret + +49: + st1 {v31.4h}, [x0], x1 + subs w4, w4, #2 + st1 {v31.4h}, [x0], x1 + b.gt 49b + ret + +8: // w == 8 + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge 89f + add x8, x2, w8, uxtw #1 + add x10, x2, w10, uxtw #1 + dup v4.8h, w9 // frac + dup v5.8h, w11 + ld1 {v0.8h, v1.8h}, [x8] // top[base] + ld1 {v2.8h, v3.8h}, [x10] + sub w9, w15, w9 
// 64 - frac + sub w11, w15, w11 + dup v6.8h, w9 // 64 - frac + dup v7.8h, w11 + uzp2 v20.8h, v0.8h, v1.8h // top[base+1] + uzp1 v0.8h, v0.8h, v1.8h // top[base] + uzp2 v21.8h, v2.8h, v3.8h + uzp1 v2.8h, v2.8h, v3.8h + umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac) + umlal v16.4s, v20.4h, v4.4h // + top[base+1]*frac + umull2 v17.4s, v0.8h, v6.8h + umlal2 v17.4s, v20.8h, v4.8h + umull v18.4s, v2.4h, v7.4h + umlal v18.4s, v21.4h, v5.4h + umull2 v19.4s, v2.8h, v7.8h + umlal2 v19.4s, v21.8h, v5.8h + rshrn v16.4h, v16.4s, #6 + rshrn2 v16.8h, v17.4s, #6 + rshrn v17.4h, v18.4s, #6 + rshrn2 v17.8h, v19.4s, #6 + st1 {v16.8h}, [x0], x1 + add w7, w7, w5 // xpos += dx + subs w4, w4, #2 + st1 {v17.8h}, [x0], x1 + b.gt 8b + ret + +89: + st1 {v31.8h}, [x0], x1 + subs w4, w4, #2 + st1 {v31.8h}, [x0], x1 + b.gt 89b + ret +endfunc + +// void ipred_reverse_16bpc_neon(pixel *dst, const pixel *const src, +// const int n); +function ipred_reverse_16bpc_neon, export=1 + sub x1, x1, #16 + add x3, x0, #8 + mov x4, #16 +1: + ld1 {v0.8h}, [x1] + subs w2, w2, #8 + rev64 v0.8h, v0.8h + sub x1, x1, #16 + st1 {v0.d}[1], [x0], x4 + st1 {v0.d}[0], [x3], x4 + b.gt 1b + ret +endfunc + +// void ipred_z3_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const left, +// const int width, const int height, +// const int dy, const int max_base_y); +function ipred_z3_fill1_16bpc_neon, export=1 + clz w9, w4 + adr x8, L(ipred_z3_fill1_tbl) + sub w9, w9, #25 + ldrh w9, [x8, w9, uxtw #1] + add x10, x2, w6, uxtw #1 // left[max_base_y] + sub x8, x8, w9, uxtw + ld1r {v31.8h}, [x10] // padding + mov w7, w5 + mov w15, #64 + add x13, x0, x1 + lsl x1, x1, #1 + br x8 + +40: + AARCH64_VALID_JUMP_TARGET +4: + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge ipred_z3_fill_padding_neon + lsl w8, w8, #1 + lsl w10, w10, #1 + ldr q0, [x2, w8, uxtw] // left[base] + ldr q2, [x2, w10, uxtw] + dup v4.8h, w9 // frac + dup v5.8h, w11 + ext v1.16b, v0.16b, v0.16b, #2 // left[base+1] + ext v3.16b, v2.16b, v2.16b, #2 + sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base] + sub v7.4h, v3.4h, v2.4h + ushll v16.4s, v0.4h, #6 // top[base]*64 + ushll v17.4s, v2.4h, #6 + smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac + smlal v17.4s, v7.4h, v5.4h + rshrn v16.4h, v16.4s, #6 + rshrn v17.4h, v17.4s, #6 + subs w3, w3, #2 + zip1 v18.8h, v16.8h, v17.8h + st1 {v18.s}[0], [x0], x1 + st1 {v18.s}[1], [x13], x1 + add w7, w7, w5 // xpos += dx + st1 {v18.s}[2], [x0] + st1 {v18.s}[3], [x13] + b.le 9f + sub x0, x0, x1 // ptr -= 4 * (2*stride) + sub x13, x13, x1 + add x0, x0, #4 + add x13, x13, #4 + b 4b +9: + ret + +80: + AARCH64_VALID_JUMP_TARGET +8: + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge ipred_z3_fill_padding_neon + add x8, x2, w8, uxtw #1 + add x10, x2, w10, uxtw #1 + dup v4.8h, w9 // frac + dup v5.8h, w11 + ld1 {v0.8h}, [x8] // left[base] + ld1 {v2.8h}, [x10] + sub w9, w15, w9 // 64 - frac + sub w11, w15, w11 + ldr h1, [x8, #16] + ldr h3, [x10, #16] + dup v6.8h, w9 // 64 - frac + dup v7.8h, w11 + ext v1.16b, v0.16b, v1.16b, #2 // left[base+1] + ext v3.16b, v2.16b, v3.16b, #2 + umull v16.4s, v0.4h, v6.4h // left[base]*(64-frac) + umlal v16.4s, v1.4h, v4.4h // + left[base+1]*frac + umull2 v17.4s, v0.8h, v6.8h + umlal2 v17.4s, v1.8h, v4.8h + umull v18.4s, v2.4h, v7.4h + umlal v18.4s, v3.4h, 
v5.4h + umull2 v19.4s, v2.8h, v7.8h + umlal2 v19.4s, v3.8h, v5.8h + rshrn v16.4h, v16.4s, #6 + rshrn2 v16.8h, v17.4s, #6 + rshrn v17.4h, v18.4s, #6 + rshrn2 v17.8h, v19.4s, #6 + subs w3, w3, #2 + zip1 v18.8h, v16.8h, v17.8h + zip2 v19.8h, v16.8h, v17.8h + add w7, w7, w5 // xpos += dx + st1 {v18.s}[0], [x0], x1 + st1 {v18.s}[1], [x13], x1 + st1 {v18.s}[2], [x0], x1 + st1 {v18.s}[3], [x13], x1 + st1 {v19.s}[0], [x0], x1 + st1 {v19.s}[1], [x13], x1 + st1 {v19.s}[2], [x0], x1 + st1 {v19.s}[3], [x13], x1 + b.le 9f + sub x0, x0, x1, lsl #2 // ptr -= 4 * (2*stride) + sub x13, x13, x1, lsl #2 + add x0, x0, #4 + add x13, x13, #4 + b 8b +9: + ret + +160: +320: +640: + AARCH64_VALID_JUMP_TARGET + mov w12, w4 +1: + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // ypos += dy + cmp w8, w6 // base >= max_base_y + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge ipred_z3_fill_padding_neon + add x8, x2, w8, uxtw #1 + add x10, x2, w10, uxtw #1 + dup v6.8h, w9 // frac + dup v7.8h, w11 + ld1 {v0.8h, v1.8h, v2.8h}, [x8], #48 // left[base] + ld1 {v3.8h, v4.8h, v5.8h}, [x10], #48 + sub w9, w15, w9 // 64 - frac + sub w11, w15, w11 + dup v16.8h, w9 // 64 - frac + dup v17.8h, w11 + add w7, w7, w5 // ypos += dy +2: + ext v18.16b, v0.16b, v1.16b, #2 // left[base+1] + ext v19.16b, v1.16b, v2.16b, #2 + ext v20.16b, v3.16b, v4.16b, #2 + ext v21.16b, v4.16b, v5.16b, #2 + subs w4, w4, #16 + umull v22.4s, v0.4h, v16.4h // left[base]*(64-frac) + umlal v22.4s, v18.4h, v6.4h // + left[base+1]*frac + umull2 v23.4s, v0.8h, v16.8h + umlal2 v23.4s, v18.8h, v6.8h + umull v24.4s, v1.4h, v16.4h + umlal v24.4s, v19.4h, v6.4h + umull2 v25.4s, v1.8h, v16.8h + umlal2 v25.4s, v19.8h, v6.8h + umull v26.4s, v3.4h, v17.4h + umlal v26.4s, v20.4h, v7.4h + umull2 v27.4s, v3.8h, v17.8h + umlal2 v27.4s, v20.8h, v7.8h + umull v28.4s, v4.4h, v17.4h + umlal v28.4s, v21.4h, v7.4h + umull2 v29.4s, v4.8h, v17.8h + umlal2 v29.4s, v21.8h, v7.8h + rshrn v22.4h, v22.4s, #6 + rshrn2 v22.8h, v23.4s, #6 + rshrn v23.4h, v24.4s, #6 + rshrn2 v23.8h, v25.4s, #6 + rshrn v24.4h, v26.4s, #6 + rshrn2 v24.8h, v27.4s, #6 + rshrn v25.4h, v28.4s, #6 + rshrn2 v25.8h, v29.4s, #6 + zip1 v18.8h, v22.8h, v24.8h + zip2 v19.8h, v22.8h, v24.8h + zip1 v20.8h, v23.8h, v25.8h + zip2 v21.8h, v23.8h, v25.8h + st1 {v18.s}[0], [x0], x1 + st1 {v18.s}[1], [x13], x1 + st1 {v18.s}[2], [x0], x1 + st1 {v18.s}[3], [x13], x1 + st1 {v19.s}[0], [x0], x1 + st1 {v19.s}[1], [x13], x1 + st1 {v19.s}[2], [x0], x1 + st1 {v19.s}[3], [x13], x1 + st1 {v20.s}[0], [x0], x1 + st1 {v20.s}[1], [x13], x1 + st1 {v20.s}[2], [x0], x1 + st1 {v20.s}[3], [x13], x1 + st1 {v21.s}[0], [x0], x1 + st1 {v21.s}[1], [x13], x1 + st1 {v21.s}[2], [x0], x1 + st1 {v21.s}[3], [x13], x1 + b.le 3f + mov v0.16b, v2.16b + ld1 {v1.8h, v2.8h}, [x8], #32 // left[base] + mov v3.16b, v5.16b + ld1 {v4.8h, v5.8h}, [x10], #32 + b 2b + +3: + subs w3, w3, #2 + b.le 9f + lsr x1, x1, #1 + msub x0, x1, x12, x0 // ptr -= h * stride + msub x13, x1, x12, x13 + lsl x1, x1, #1 + add x0, x0, #4 + add x13, x13, #4 + mov w4, w12 + b 1b +9: + ret + +L(ipred_z3_fill1_tbl): + .hword L(ipred_z3_fill1_tbl) - 640b + .hword L(ipred_z3_fill1_tbl) - 320b + .hword L(ipred_z3_fill1_tbl) - 160b + .hword L(ipred_z3_fill1_tbl) - 80b + .hword L(ipred_z3_fill1_tbl) - 40b +endfunc + +function ipred_z3_fill_padding_neon, export=0 + cmp w3, #8 + adr x8, L(ipred_z3_fill_padding_tbl) + b.gt L(ipred_z3_fill_padding_wide) + // w3 = remaining width, w4 = constant height + mov w12, w4 + +1: + // Fill a WxH rectangle with padding. 
W can be any number; + // this fills the exact width by filling in the largest + // power of two in the remaining width, and repeating. + clz w9, w3 + sub w9, w9, #25 + ldrh w9, [x8, w9, uxtw #1] + sub x9, x8, w9, uxtw + br x9 + +2: + st1 {v31.s}[0], [x0], x1 + subs w4, w4, #4 + st1 {v31.s}[0], [x13], x1 + st1 {v31.s}[0], [x0], x1 + st1 {v31.s}[0], [x13], x1 + b.gt 2b + subs w3, w3, #2 + lsr x1, x1, #1 + msub x0, x1, x12, x0 // ptr -= h * stride + msub x13, x1, x12, x13 + b.le 9f + lsl x1, x1, #1 + add x0, x0, #4 + add x13, x13, #4 + mov w4, w12 + b 1b + +4: + st1 {v31.4h}, [x0], x1 + subs w4, w4, #4 + st1 {v31.4h}, [x13], x1 + st1 {v31.4h}, [x0], x1 + st1 {v31.4h}, [x13], x1 + b.gt 4b + subs w3, w3, #4 + lsr x1, x1, #1 + msub x0, x1, x12, x0 // ptr -= h * stride + msub x13, x1, x12, x13 + b.le 9f + lsl x1, x1, #1 + add x0, x0, #8 + add x13, x13, #8 + mov w4, w12 + b 1b + +8: +16: +32: +64: + st1 {v31.8h}, [x0], x1 + subs w4, w4, #4 + st1 {v31.8h}, [x13], x1 + st1 {v31.8h}, [x0], x1 + st1 {v31.8h}, [x13], x1 + b.gt 4b + subs w3, w3, #8 + lsr x1, x1, #1 + msub x0, x1, x12, x0 // ptr -= h * stride + msub x13, x1, x12, x13 + b.le 9f + lsl x1, x1, #1 + add x0, x0, #16 + add x13, x13, #16 + mov w4, w12 + b 1b + +9: + ret + +L(ipred_z3_fill_padding_tbl): + .hword L(ipred_z3_fill_padding_tbl) - 64b + .hword L(ipred_z3_fill_padding_tbl) - 32b + .hword L(ipred_z3_fill_padding_tbl) - 16b + .hword L(ipred_z3_fill_padding_tbl) - 8b + .hword L(ipred_z3_fill_padding_tbl) - 4b + .hword L(ipred_z3_fill_padding_tbl) - 2b + +L(ipred_z3_fill_padding_wide): + // Fill a WxH rectangle with padding, with W > 8. + lsr x1, x1, #1 + mov w12, w3 + sub x1, x1, w3, uxtw #1 +1: + ands w5, w3, #7 + b.eq 2f + // If the width isn't aligned to 8, first do one 8 pixel write + // and align the start pointer. + sub w3, w3, w5 + st1 {v31.8h}, [x0] + add x0, x0, w5, uxtw #1 +2: + // Fill the rest of the line with aligned 8 pixel writes. 
+ subs w3, w3, #8 + st1 {v31.8h}, [x0], #16 + b.gt 2b + subs w4, w4, #1 + add x0, x0, x1 + b.le 9f + mov w3, w12 + b 1b +9: + ret +endfunc + +function ipred_z3_fill2_16bpc_neon, export=1 + cmp w4, #8 + add x10, x2, w6, uxtw // left[max_base_y] + ld1r {v31.16b}, [x10] // padding + mov w7, w5 + mov w15, #64 + add x13, x0, x1 + lsl x1, x1, #1 + b.eq 8f + +4: // h == 4 + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge ipred_z3_fill_padding_neon + lsl w8, w8, #1 + lsl w10, w10, #1 + ldr q0, [x2, w8, uxtw] // top[base] + ldr q2, [x2, w10, uxtw] + dup v4.4h, w9 // frac + dup v5.4h, w11 + uzp2 v1.8h, v0.8h, v0.8h // top[base+1] + uzp1 v0.8h, v0.8h, v0.8h // top[base] + uzp2 v3.8h, v2.8h, v2.8h + uzp1 v2.8h, v2.8h, v2.8h + sub v6.4h, v1.4h, v0.4h // top[base+1]-top[base] + sub v7.4h, v3.4h, v2.4h + ushll v16.4s, v0.4h, #6 // top[base]*64 + ushll v17.4s, v2.4h, #6 + smlal v16.4s, v6.4h, v4.4h // + top[base+1]*frac + smlal v17.4s, v7.4h, v5.4h + rshrn v16.4h, v16.4s, #6 + rshrn v17.4h, v17.4s, #6 + subs w3, w3, #2 + zip1 v18.8h, v16.8h, v17.8h + st1 {v18.s}[0], [x0], x1 + st1 {v18.s}[1], [x13], x1 + add w7, w7, w5 // xpos += dx + st1 {v18.s}[2], [x0] + st1 {v18.s}[3], [x13] + b.le 9f + sub x0, x0, x1 // ptr -= 4 * (2*stride) + sub x13, x13, x1 + add x0, x0, #4 + add x13, x13, #4 + b 4b +9: + ret + +8: // h == 8 + lsr w8, w7, #6 // base + and w9, w7, #0x3e // frac + add w7, w7, w5 // xpos += dx + cmp w8, w6 // base >= max_base_x + lsr w10, w7, #6 // base + and w11, w7, #0x3e // frac + b.ge ipred_z3_fill_padding_neon + add x8, x2, w8, uxtw #1 + add x10, x2, w10, uxtw #1 + dup v4.8h, w9 // frac + dup v5.8h, w11 + ld1 {v0.8h, v1.8h}, [x8] // top[base] + ld1 {v2.8h, v3.8h}, [x10] + sub w9, w15, w9 // 64 - frac + sub w11, w15, w11 + dup v6.8h, w9 // 64 - frac + dup v7.8h, w11 + uzp2 v20.8h, v0.8h, v1.8h // top[base+1] + uzp1 v0.8h, v0.8h, v1.8h // top[base] + uzp2 v21.8h, v2.8h, v3.8h + uzp1 v2.8h, v2.8h, v3.8h + umull v16.4s, v0.4h, v6.4h // top[base]*(64-frac) + umlal v16.4s, v20.4h, v4.4h // + top[base+1]*frac + umull2 v17.4s, v0.8h, v6.8h + umlal2 v17.4s, v20.8h, v4.8h + umull v18.4s, v2.4h, v7.4h + umlal v18.4s, v21.4h, v5.4h + umull2 v19.4s, v2.8h, v7.8h + umlal2 v19.4s, v21.8h, v5.8h + rshrn v16.4h, v16.4s, #6 + rshrn2 v16.8h, v17.4s, #6 + rshrn v17.4h, v18.4s, #6 + rshrn2 v17.8h, v19.4s, #6 + subs w3, w3, #2 + zip1 v18.8h, v16.8h, v17.8h + zip2 v19.8h, v16.8h, v17.8h + add w7, w7, w5 // xpos += dx + st1 {v18.s}[0], [x0], x1 + st1 {v18.s}[1], [x13], x1 + st1 {v18.s}[2], [x0], x1 + st1 {v18.s}[3], [x13], x1 + st1 {v19.s}[0], [x0], x1 + st1 {v19.s}[1], [x13], x1 + st1 {v19.s}[2], [x0], x1 + st1 {v19.s}[3], [x13], x1 + b.le 9f + sub x0, x0, x1, lsl #2 // ptr -= 4 * (2*stride) + sub x13, x13, x1, lsl #2 + add x0, x0, #4 + add x13, x13, #4 + b 8b +9: + ret +endfunc + + // void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride, // const pixel *const topleft, // const int width, const int height, const int filt_idx, diff --git a/third_party/dav1d/src/arm/ipred.h b/third_party/dav1d/src/arm/ipred.h index cedc849dce7f..e849d4998b1c 100644 --- a/third_party/dav1d/src/arm/ipred.h +++ b/third_party/dav1d/src/arm/ipred.h @@ -50,13 +50,15 @@ decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_444, neon)); decl_pal_pred_fn(BF(dav1d_pal_pred, neon)); -#if ARCH_AARCH64 && BITDEPTH == 8 +#if ARCH_AARCH64 void BF(dav1d_ipred_z1_upsample_edge, neon)(pixel *out, const int hsz, const pixel *const 
in, - const int end); + const int end HIGHBD_DECL_SUFFIX); void BF(dav1d_ipred_z1_filter_edge, neon)(pixel *out, const int sz, const pixel *const in, const int end, const int strength); +void BF(dav1d_ipred_pixel_set, neon)(pixel *out, const pixel px, + const int n); void BF(dav1d_ipred_z1_fill1, neon)(pixel *dst, ptrdiff_t stride, const pixel *const top, const int width, const int height, const int dx, @@ -76,14 +78,15 @@ static void ipred_z1_neon(pixel *dst, const ptrdiff_t stride, const int enable_intra_edge_filter = angle >> 10; angle &= 511; int dx = dav1d_dr_intra_derivative[angle >> 1]; - pixel top_out[64 + 64 + (64+15)*2]; + pixel top_out[64 + 64 + (64+15)*2 + 16]; int max_base_x; const int upsample_above = enable_intra_edge_filter ? get_upsample(width + height, 90 - angle, is_sm) : 0; if (upsample_above) { BF(dav1d_ipred_z1_upsample_edge, neon)(top_out, width + height, topleft_in, - width + imin(width, height)); + width + imin(width, height) + HIGHBD_TAIL_SUFFIX); max_base_x = 2 * (width + height) - 2; dx <<= 1; } else { @@ -102,7 +105,8 @@ static void ipred_z1_neon(pixel *dst, const ptrdiff_t stride, } const int base_inc = 1 + upsample_above; int pad_pixels = width + 15; // max(dx >> 6) == 15 - pixel_set(&top_out[max_base_x + 1], top_out[max_base_x], pad_pixels * base_inc); + BF(dav1d_ipred_pixel_set, neon)(&top_out[max_base_x + 1], + top_out[max_base_x], pad_pixels * base_inc); if (upsample_above) BF(dav1d_ipred_z1_fill2, neon)(dst, stride, top_out, width, height, dx, max_base_x); @@ -140,12 +144,13 @@ static void ipred_z3_neon(pixel *dst, const ptrdiff_t stride, const int upsample_left = enable_intra_edge_filter ? get_upsample(width + height, angle - 180, is_sm) : 0; if (upsample_left) { - flipped[0] = topleft_in[0]; + flipped[0] = topleft_in[0]; BF(dav1d_ipred_reverse, neon)(&flipped[1], &topleft_in[0], height + imax(width, height)); BF(dav1d_ipred_z1_upsample_edge, neon)(left_out, width + height, flipped, - height + imin(width, height)); + height + imin(width, height) + HIGHBD_TAIL_SUFFIX); max_base_y = 2 * (width + height) - 2; dy <<= 1; } else { @@ -172,7 +177,8 @@ static void ipred_z3_neon(pixel *dst, const ptrdiff_t stride, // the other implementation can read height + max(dy >> 6) past the end. 
int pad_pixels = imax(64 - max_base_y - 1, height + 15); - pixel_set(&left_out[max_base_y + 1], left_out[max_base_y], pad_pixels * base_inc); + BF(dav1d_ipred_pixel_set, neon)(&left_out[max_base_y + 1], + left_out[max_base_y], pad_pixels * base_inc); if (upsample_left) BF(dav1d_ipred_z3_fill2, neon)(dst, stride, left_out, width, height, dy, max_base_y); @@ -197,7 +203,7 @@ static ALWAYS_INLINE void intra_pred_dsp_init_arm(Dav1dIntraPredDSPContext *cons c->intra_pred[SMOOTH_PRED] = BF(dav1d_ipred_smooth, neon); c->intra_pred[SMOOTH_V_PRED] = BF(dav1d_ipred_smooth_v, neon); c->intra_pred[SMOOTH_H_PRED] = BF(dav1d_ipred_smooth_h, neon); -#if ARCH_AARCH64 && BITDEPTH == 8 +#if ARCH_AARCH64 c->intra_pred[Z1_PRED] = ipred_z1_neon; c->intra_pred[Z3_PRED] = ipred_z3_neon; #endif diff --git a/third_party/dav1d/src/obu.c b/third_party/dav1d/src/obu.c index b6c2b6990bc9..e08129aba553 100644 --- a/third_party/dav1d/src/obu.c +++ b/third_party/dav1d/src/obu.c @@ -1560,7 +1560,14 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa if (c->n_fc == 1) { dav1d_thread_picture_ref(&c->out, &c->refs[c->frame_hdr->existing_frame_idx].p); - dav1d_data_props_copy(&c->out.p.m, &in->m); + dav1d_picture_copy_props(&c->out.p, + c->content_light, c->content_light_ref, + c->mastering_display, c->mastering_display_ref, + c->itut_t35, c->itut_t35_ref, + &in->m); + // Must be removed from the context after being attached to the frame + dav1d_ref_dec(&c->itut_t35_ref); + c->itut_t35 = NULL; c->event_flags |= dav1d_picture_get_event_flags(&c->refs[c->frame_hdr->existing_frame_idx].p); } else { pthread_mutex_lock(&c->task_thread.lock); @@ -1606,7 +1613,15 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa dav1d_thread_picture_ref(out_delayed, &c->refs[c->frame_hdr->existing_frame_idx].p); out_delayed->visible = 1; - dav1d_data_props_copy(&out_delayed->p.m, &in->m); + dav1d_picture_copy_props(&out_delayed->p, + c->content_light, c->content_light_ref, + c->mastering_display, c->mastering_display_ref, + c->itut_t35, c->itut_t35_ref, + &in->m); + // Must be removed from the context after being attached to the frame + dav1d_ref_dec(&c->itut_t35_ref); + c->itut_t35 = NULL; + pthread_mutex_unlock(&c->task_thread.lock); } if (c->refs[c->frame_hdr->existing_frame_idx].p.p.frame_hdr->frame_type == DAV1D_FRAME_TYPE_KEY) { diff --git a/third_party/dav1d/src/picture.c b/third_party/dav1d/src/picture.c index 30e224cc2379..3e55d3f27373 100644 --- a/third_party/dav1d/src/picture.c +++ b/third_party/dav1d/src/picture.c @@ -142,9 +142,6 @@ static int picture_alloc_with_edges(Dav1dContext *const c, p->p.h = h; p->seq_hdr = seq_hdr; p->frame_hdr = frame_hdr; - p->content_light = content_light; - p->mastering_display = mastering_display; - p->itut_t35 = itut_t35; p->p.layout = seq_hdr->layout; p->p.bpc = bpc; dav1d_data_props_set_defaults(&p->m); @@ -194,21 +191,38 @@ static int picture_alloc_with_edges(Dav1dContext *const c, p->frame_hdr_ref = frame_hdr_ref; if (frame_hdr_ref) dav1d_ref_inc(frame_hdr_ref); - dav1d_data_props_copy(&p->m, props); + dav1d_picture_copy_props(p, content_light, content_light_ref, + mastering_display, mastering_display_ref, + itut_t35, itut_t35_ref, props); if (extra && extra_ptr) *extra_ptr = &pic_ctx->extra_ptr; + return 0; +} + +void dav1d_picture_copy_props(Dav1dPicture *const p, + Dav1dContentLightLevel *const content_light, Dav1dRef *const content_light_ref, + Dav1dMasteringDisplay *const mastering_display, Dav1dRef *const mastering_display_ref, + 
Dav1dITUTT35 *const itut_t35, Dav1dRef *const itut_t35_ref, + const Dav1dDataProps *const props) +{ + dav1d_data_props_copy(&p->m, props); + + dav1d_ref_dec(&p->content_light_ref); p->content_light_ref = content_light_ref; + p->content_light = content_light; if (content_light_ref) dav1d_ref_inc(content_light_ref); + dav1d_ref_dec(&p->mastering_display_ref); p->mastering_display_ref = mastering_display_ref; + p->mastering_display = mastering_display; if (mastering_display_ref) dav1d_ref_inc(mastering_display_ref); + dav1d_ref_dec(&p->itut_t35_ref); p->itut_t35_ref = itut_t35_ref; + p->itut_t35 = itut_t35; if (itut_t35_ref) dav1d_ref_inc(itut_t35_ref); - - return 0; } int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f, diff --git a/third_party/dav1d/src/picture.h b/third_party/dav1d/src/picture.h index 154c85a0c6a1..0c3a0ec5629b 100644 --- a/third_party/dav1d/src/picture.h +++ b/third_party/dav1d/src/picture.h @@ -101,6 +101,12 @@ int dav1d_default_picture_alloc(Dav1dPicture *p, void *cookie); void dav1d_default_picture_release(Dav1dPicture *p, void *cookie); void dav1d_picture_unref_internal(Dav1dPicture *p); +void dav1d_picture_copy_props(Dav1dPicture *p, + Dav1dContentLightLevel *content_light, Dav1dRef *content_light_ref, + Dav1dMasteringDisplay *mastering_display, Dav1dRef *mastering_display_ref, + Dav1dITUTT35 *itut_t35, Dav1dRef *itut_t35_ref, + const Dav1dDataProps *props); + /** * Get event flags from picture flags. */ diff --git a/third_party/dav1d/src/x86/ipred.h b/third_party/dav1d/src/x86/ipred.h index 3eb5e8b8ab66..415a4d8d621c 100644 --- a/third_party/dav1d/src/x86/ipred.h +++ b/third_party/dav1d/src/x86/ipred.h @@ -85,6 +85,7 @@ static ALWAYS_INLINE void intra_pred_dsp_init_x86(Dav1dIntraPredDSPContext *cons init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, ssse3); #if BITDEPTH == 8 init_angular_ipred_fn(Z1_PRED, ipred_z1, ssse3); + init_angular_ipred_fn(Z2_PRED, ipred_z2, ssse3); init_angular_ipred_fn(Z3_PRED, ipred_z3, ssse3); #endif init_angular_ipred_fn(FILTER_PRED, ipred_filter, ssse3); diff --git a/third_party/dav1d/src/x86/ipred_sse.asm b/third_party/dav1d/src/x86/ipred_sse.asm index 6ca4900bb7b4..67e90b79ae8f 100644 --- a/third_party/dav1d/src/x86/ipred_sse.asm +++ b/third_party/dav1d/src/x86/ipred_sse.asm @@ -81,6 +81,10 @@ z_filter_t_w16: db 15, 31, 7, 15, 31, 7, 3, 31, 3, 3, 3, 3, 3, 3, 0, z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7 db 7, 8, 8, 9, 9, 10, 10, 11 z_filter_k_tail: db 0, 64, 0, 64, 8, 56, 0, 64 +z2_h_shuf: db 7, 6, 15, 14, 6, 5, 14, 13, 5, 4, 13, 12, 4, 3, 12, 11 +z2_upsample: db 7, 6, 15, 14, 5, 4, 13, 12, 3, 2, 11, 10, 1, 0, 9, 8 +z2_dy_offset: dw 88*64, 88*64, 87*64, 87*64 +pw_m1to4: dw -1, -2, -3, -4 z_filter_k: times 4 db 0, 16 times 4 db 0, 20 times 4 db 8, 16 @@ -129,6 +133,7 @@ JMP_TABLE ipred_smooth_v, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_smooth_h, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_paeth, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_z1, ssse3, w4, w8, w16, w32, w64 +JMP_TABLE ipred_z2, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_z3, ssse3, h4, h8, h16, h32, h64 JMP_TABLE pal_pred, ssse3, w4, w8, w16, w32, w64 JMP_TABLE ipred_cfl, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \ @@ -1979,6 +1984,653 @@ ALIGN function_align mova [tlq+16*1], m1 ret +%if ARCH_X86_64 +cglobal ipred_z2_8bpc, 4, 12, 13, 16*16, dst, stride, tl, w, h, angle, dx, _, dy + %define base r7-$$ + %define maxwm r6m + %define maxhm r7m + lea r7, [$$] + mov hd, hm + mova m8, [base+pw_62] + mova m9, [base+pw_64] 
+ lea r9d, [wq-4] + mova m10, [base+pw_512] + shl r9d, 6 + mova m11, [base+z1_shuf_w4] + or r9d, hd + mova m12, [base+z2_h_shuf] +%else +cglobal ipred_z2_8bpc, 4, 7, 8, -16*20, dst, _, tl, w, h, angle, dx + %define base r1-$$ + %define m8 [base+pw_62] + %define m9 [base+pw_64] + %define m10 [base+pw_512] + %define m11 [rsp+16*16] + %define m12 [rsp+16*17] + %define r8 [rsp+16*6+4*1] + %define r9b byte [rsp+16*18+4*0] + %define r9d dword [rsp+16*18+4*0] + %define r10d dword [rsp+16*18+4*1] + %define r11d dword [rsp+16*18+4*2] + %define maxwm [rsp+16*18+4*3] + %define maxhm [rsp+16*19+4*0] + %define stridemp [rsp+16*19+4*1] + %define strideq r3 + %define dyd r4 + %define dyq r4 + mov stridemp, r1 + mov r1d, r6m + mov r4d, r7m + mov maxwm, r1d + mov maxhm, r4d + LEA r1, $$ + lea hd, [wq-4] + mova m0, [base+z1_shuf_w4] + shl hd, 6 + mova m1, [base+z2_h_shuf] + or hd, hm + mova m11, m0 + mov r9d, hd + mova m12, m1 +%endif + tzcnt wd, wd + movifnidn angled, anglem + movsxd wq, [base+ipred_z2_ssse3_table+wq*4] +%if ARCH_X86_64 + movzx dxd, angleb +%else + movzx dxd, byte anglem +%endif + xor angled, 0x400 + mova m0, [tlq-16*4] + mov dyd, dxd + mova m1, [tlq-16*3] + neg dxq + mova m2, [tlq-16*2] + and dyd, ~1 + mova m3, [tlq-16*1] + and dxq, ~1 + movd m4, [tlq] + movu m5, [tlq+16*0+1] + movu m6, [tlq+16*1+1] + movzx dyd, word [base+dr_intra_derivative+dyq-90] ; angle - 90 + movzx dxd, word [base+dr_intra_derivative+dxq+180] ; 180 - angle + mova [rsp+16*2], m0 + pxor m7, m7 + mova [rsp+16*3], m1 + pshufb m4, m7 + mova [rsp+16*4], m2 + lea wq, [base+ipred_z2_ssse3_table+wq] + mova [rsp+16*5], m3 + neg dxd + mova [rsp+16*6], m4 + or dyd, 4<<16 + mova [rsp+16*7], m4 + mova [rsp+16*8], m5 + mova [rsp+16*9], m6 + movq m0, [base+z_base_inc+2] + movsldup m1, [base+z2_dy_offset] + movq m2, [base+pw_256] ; 4<<6 + movq [rsp+16*14+8*0], m0 + movq [rsp+16*15+8*0], m1 + movq [rsp+16*15+8*1], m2 +%if ARCH_X86_64 + lea r10d, [dxq+(128<<6)] ; xpos +%else + mov [rsp+16*7+4*1], dyd + lea r4d, [dxq+(128<<6)] + mov r10d, r4d + movzx hd, r9b +%endif + mov r11d, (128-4)<<6 + jmp wq +.w4: + test angled, 0x400 + jnz .w4_main + movd m5, [tlq+4] + lea r3d, [hq+2] + add angled, 1022 + pshufb m5, m7 + shl r3d, 6 + movd [rsp+16*8+4], m5 + test r3d, angled + jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8) + call .upsample_above + sub angled, 1075 ; angle - 53 + lea r3d, [hq+3] + xor angled, 0x7f ; 180 - angle + movd m0, r3d + movd m6, angled + shr angled, 8 ; is_sm << 1 + pshufb m0, m7 + pshufb m6, m7 + pcmpeqb m0, [base+z_filter_wh4] + pand m6, m0 + pcmpgtb m6, [base+z_filter_t_w48+angleq*8] + jmp .w8_filter_left +.upsample_above: ; w4/w8 + movq m3, [rsp+gprsize+16*8-2] + movq m1, [rsp+gprsize+16*8-1] + movq m0, [rsp+gprsize+16*8+0] + movq m4, [rsp+gprsize+16*8+1] + movddup m5, [base+pb_36_m4] + punpcklbw m1, m3 + punpcklbw m2, m0, m4 + pmaddubsw m1, m5 + pmaddubsw m2, m5 +%if ARCH_X86_64 + mova m11, [base+pb_0to15] + lea r10d, [r10+dxq+(1<<6)] + mov r11d, (128-7)<<6 +%else + mova m3, [base+pb_0to15] + mov r3d, [rsp+gprsize+16*18+4*1] + mov dword [rsp+gprsize+16*18+4*2], (128-7)<<6 + lea r3d, [r3+dxq+(1<<6)] + mov [rsp+gprsize+16*18+4*1], r3d + mova [rsp+gprsize+16*16], m3 +%endif + add dxd, dxd + paddw m1, m2 + pmulhrsw m1, m10 + movq m2, [rsp+gprsize+16*14] + paddw m2, m2 + movq [rsp+gprsize+16*14], m2 + packuswb m1, m1 + punpcklbw m1, m0 + mova [rsp+gprsize+16*8], m1 + ret +.w4_no_upsample_above: + lea r3d, [hq+3] + mov [rsp], angled + sub angled, 1112 ; angle - 90 + movd m0, r3d + mov r3d, 90 + movd 
m1, angled + sub r3d, angled ; 180 - angle + shr angled, 8 ; is_sm << 1 + movu m3, [base+z_filter_wh4] + mova m4, [base+z_filter_t_w48+angleq*8] + call .w8_filter_top + mov angled, [rsp] + lea r3d, [hq+2] + sub angled, 139 + shl r3d, 6 + test r3d, angled + jnz .w8_filter_left ; angle <= 140 || h > 8 || (is_sm && h == 8) +.upsample_left: ; w4/w8 + neg hq + movd m0, [tlq+hq] + pshufb m0, m7 + movd [rsp+16*6+hq-4], m0 + movq m3, [rsp+16*5+7] + movq m0, [rsp+16*5+8] + movq m2, [rsp+16*5+9] + movq m4, [rsp+16*5+10] + movddup m5, [base+pb_36_m4] + punpcklbw m1, m0, m3 + punpcklbw m2, m4 + pmaddubsw m1, m5 + pmaddubsw m2, m5 + movshdup m3, [base+z2_dy_offset] +%if ARCH_X86_64 + mova m12, [base+z2_upsample] + add dyd, dyd +%else + mova m4, [base+z2_upsample] + shl dword [rsp+16*7+4*1], 1 + mova m12, m4 +%endif + paddw m1, m2 + pmulhrsw m1, m10 + movq [rsp+16*15], m3 + packuswb m1, m1 + punpcklbw m0, m1 + mova [rsp+16*5], m0 +.w4_main: + movd m6, dxd +%if ARCH_X86_64 + movd m3, dyd +%else + movd m3, [rsp+16*7+4*1] +%endif + movddup m0, [rsp+16*14+8*0] + pshufb m6, [base+pw_256] + paddw m7, m6, m6 + movq m5, [base+pw_m1to4] + pshuflw m4, m3, q0000 + punpcklqdq m6, m7 + pmullw m4, m5 + pshuflw m3, m3, q1111 + paddw m6, m0 + pshuflw m0, m4, q3333 + psubw m4, [rsp+16*15] + movq [rsp+16*6+8*1], m3 + movq [rsp+8*1], m0 ; dy*4 +%if ARCH_X86_64 + mov r8, dstq +%endif +.w4_loop0: +%if ARCH_X86_32 + mov r8, dstq +%endif + mova [rsp+16*12], m6 + mov r2d, r10d + movq [rsp+8*0], m4 + pand m0, m4, m8 + psraw m4, 6 + psubw m1, m9, m0 + psllw m0, 8 + por m0, m1 ; 64-frac_y, frac_y + movq [rsp+8*3], m0 + pabsw m4, m4 + movq [rsp+8*2], m4 + movzx hd, r9b +.w4_loop: + lea r3d, [r2+dxq] + shr r2d, 6 ; base_x0 + movq m0, [rsp+r2] + lea r2d, [r3+dxq] + shr r3d, 6 ; base_x1 + movhps m0, [rsp+r3] + lea r3d, [r2+dxq] + shr r2d, 6 ; base_x2 + movq m1, [rsp+r2] + lea r2d, [r3+dxq] + shr r3d, 6 ; base_x3 + movhps m1, [rsp+r3] + pand m2, m8, m6 + paddsw m5, m6, m7 + psubw m3, m9, m2 + psllw m2, 8 + pshufb m0, m11 + por m2, m3 + pmaddubsw m0, m2 + pand m2, m8, m5 + psubw m3, m9, m2 + psllw m2, 8 + pshufb m1, m11 + por m2, m3 + pmaddubsw m1, m2 + cmp r3d, 127 ; topleft + jge .w4_toponly + movzx r3d, byte [rsp+8*2+0] ; base_y0 + movq m3, [rsp+r3] + movzx r3d, byte [rsp+8*2+2] ; base_y1 + movhps m3, [rsp+r3] + movzx r3d, byte [rsp+8*2+4] ; base_y2 + movq m4, [rsp+r3] + movzx r3d, byte [rsp+8*2+6] ; base_y3 + movhps m4, [rsp+r3] + pshufb m3, m12 + pshufb m4, m12 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + movddup m4, [rsp+8*3] + pmaddubsw m2, m4 + pmaddubsw m3, m4 + psraw m6, 15 ; base_x < topleft + pand m2, m6 + pandn m6, m0 + por m0, m2, m6 + psraw m6, m5, 15 + pand m3, m6 + pandn m6, m1 + por m1, m3, m6 +.w4_toponly: + pmulhrsw m0, m10 + pmulhrsw m1, m10 + movifnidn strideq, stridemp + packuswb m0, m1 + movd [dstq+strideq*0], m0 + pshuflw m1, m0, q1032 + movd [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + punpckhqdq m0, m0 + movd [dstq+strideq*0], m0 + psrlq m0, 32 + movd [dstq+strideq*1], m0 + sub hd, 4 + jz .w4_end + movq m4, [rsp+8*2] + movq m3, [rsp+16*6+8*1] + paddw m6, m5, m7 ; xpos += dx + psubw m4, m3 + movq [rsp+8*2], m4 + lea dstq, [dstq+strideq*2] + cmp r2d, r11d + jge .w4_loop + movddup m5, [rsp+8*3] +.w4_leftonly_loop: + movzx r3d, byte [rsp+8*2+0] ; base_y0 + movq m1, [rsp+r3] + movzx r3d, byte [rsp+8*2+2] ; base_y1 + movhps m1, [rsp+r3] + movzx r3d, byte [rsp+8*2+4] ; base_y2 + movq m2, [rsp+r3] + movzx r3d, byte [rsp+8*2+6] ; base_y3 + movhps m2, [rsp+r3] + psubw m4, m3 + pshufb m1, m12 + pshufb m2, m12 + 
movq [rsp+8*2], m4 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + pmaddubsw m0, m5 + pmaddubsw m1, m5 + movifnidn strideq, stridemp + pmulhrsw m0, m10 + pmulhrsw m1, m10 + packuswb m0, m1 + movd [dstq+strideq*0], m0 + pshuflw m1, m0, q1032 + movd [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + punpckhqdq m0, m0 + movd [dstq+strideq*0], m0 + psrlq m0, 32 + movd [dstq+strideq*1], m0 + lea dstq, [dstq+strideq*2] + sub hd, 4 + jg .w4_leftonly_loop +.w4_end: + sub r9d, 1<<8 + jl .w4_ret + movq m4, [rsp+8*1] +%if ARCH_X86_64 + add r8, 4 + mov dstq, r8 +%else + mov dstq, r8 + add dstq, 4 +%endif + paddw m4, [rsp+8*0] ; base_y += 4*dy + movzx r3d, word [rsp+16*15+8*1] + add r10d, r3d + movddup m6, [rsp+16*15+8*1] + paddw m6, [rsp+16*12] ; base_x += (4 << upsample_above) + jmp .w4_loop0 +.w4_ret: + RET +.w8: + test angled, 0x400 + jnz .w4_main + movd m5, [tlq+8] + lea r3d, [angleq+126] + pshufb m5, m7 +%if ARCH_X86_64 + mov r3b, hb +%else + xor r3b, r3b + or r3d, hd +%endif + movd [rsp+16*8+8], m5 + cmp r3d, 8 + ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm + call .upsample_above + sub angled, 53 + lea r3d, [hq+7] + xor angled, 0x7f ; 180 - angle + movu m1, [base+z_filter_wh8] + movd m0, r3d + movd m6, angled + shr angled, 8 ; is_sm << 1 + psrldq m2, [base+z_filter_t_w48+angleq*8], 4 + pshufb m0, m7 + pshufb m6, m7 + pcmpeqb m0, m1 + pand m6, m0 + pcmpgtb m6, m2 +%if ARCH_X86_64 + movq [rsp+16*15+8*1], m10 ; 8<<6 +%else + movq m0, m10 + movq [rsp+16*15+8*1], m0 +%endif + jmp .w8_filter_left +.w8_no_upsample_above: + lea r3d, [hq+7] + mov [rsp], angled + sub angled, 90 + movd m0, r3d + mov r3d, 90 + movd m1, angled + sub r3d, angled ; 180 - angle + shr angled, 8 ; is_sm << 1 + movu m3, [base+z_filter_wh8] + psrldq m4, [base+z_filter_t_w48+angleq*8], 4 + call .w8_filter_top + mov r3d, [rsp] + sub r3d, 141 +%if ARCH_X86_64 + mov r3b, hb +%else + xor r3b, r3b + or r3d, hd +%endif + cmp r3d, 8 + jbe .upsample_left ; angle > 140 && h <= 8 && !is_sm +.w8_filter_left: + pmovmskb r5d, m6 + test r5d, r5d + jz .w4_main + imul r5d, 0x55555555 + mov r3, tlq + shr r5d, 30 + sub r5, 3 ; filter_strength-3 + jmp .filter_left +.w8_filter_top: + movd m6, r3d + REPX {pshufb x, m7}, m0, m1, m6 + pcmpeqb m0, m3 + pand m1, m0 + pand m6, m0 + pcmpgtb m1, m4 + pcmpgtb m6, m4 + pmovmskb r5d, m1 + test r5d, r5d + jz .w8_filter_top_end ; filter_strength == 0 + imul r5d, 0x55555555 + movq m0, [rsp+gprsize+16*8-2] + shr r5d, 30 + movq m1, [rsp+gprsize+16*8-1] + sub r5, 3 ; filter_strength-3 + movddup m7, [base+z_filter_k+8*2+r5*8+24*0] + punpcklbw m0, m1 + pmaddubsw m0, m7 + movq m1, [rsp+gprsize+16*8+0] + movq m2, [rsp+gprsize+16*8+1] + movddup m7, [base+z_filter_k+8*2+r5*8+24*1] + punpcklbw m1, m2 + pmaddubsw m1, m7 + movq m2, [rsp+gprsize+16*8+2] + movddup m7, [base+z_filter_k+8*2+r5*8+24*2] + punpcklbw m2, m2 + pmaddubsw m2, m7 + paddw m0, m1 + paddw m0, m2 +%if ARCH_X86_64 + mov r3d, r7m ; maxw, offset due to call +%else + mov r3d, [rsp+gprsize+16*18+4*3] +%endif + pmulhrsw m0, m10 + pmulhrsw m1, m10 + packuswb m0, m1 + movq [rsp+gprsize+16*8], m0 + cmp r3d, 8 + jge .w8_filter_top_end + movq m0, [tlq+r3+1] + movq [rsp+gprsize+r3+16*8], m0 +.w8_filter_top_end: + ret +.w16: + test angled, 0x400 + jnz .w4_main + lea r3d, [hq+15] + sub angled, 90 + movd m0, r3d + mov r3d, 90 + movd m1, angled + sub r3d, angled ; 180 - angle + shr angled, 8 ; is_sm << 1 + movd m6, r3d + REPX {pshufb x, m7}, m0, m1, m6 + movq m3, [base+z_filter_t_w16+angleq*4] + pcmpeqb m0, [base+z_filter_wh16] + pand m1, m0 + pand m6, m0 + pcmpgtb 
m1, m3 + pcmpgtb m6, m3 + pmovmskb r5d, m1 + mov r3, tlq + test r5d, r5d + jz .w16_filter_left ; filter_strength == 0 + imul r5d, 0x24924924 + pshufb m5, [base+z_filter_t_w16] ; tlq[16] + shr r5d, 30 + adc r5, -4 ; filter_strength-3 + movd [rsp+16*9], m5 + movddup m7, [base+z_filter_k+8*2+r5*8+24*0] + movu m1, [rsp+16*8-2] + movu m2, [rsp+16*8-1] + punpcklbw m0, m1, m2 + pmaddubsw m0, m7 + punpckhbw m1, m2 + pmaddubsw m1, m7 + movddup m7, [base+z_filter_k+8*2+r5*8+24*1] + mova m3, [rsp+16*8+0] + movu m4, [rsp+16*8+1] + punpcklbw m2, m3, m4 + pmaddubsw m2, m7 + punpckhbw m3, m4 + pmaddubsw m3, m7 + paddw m0, m2 + paddw m1, m3 + test r5d, r5d + jnz .w16_filter_end ; 3-tap + movddup m7, [base+z_filter_k+8*8] + movu m3, [rsp+16*8+2] + punpcklbw m2, m3, m3 + pmaddubsw m2, m7 + punpckhbw m3, m3 + pmaddubsw m3, m7 + paddw m0, m2 + paddw m1, m3 +.w16_filter_end: + mov r2d, maxwm + pmulhrsw m0, m10 + pmulhrsw m1, m10 + packuswb m0, m1 + mova [rsp+16*8], m0 + cmp r2d, 16 + jge .w16_filter_left + movu m0, [r3+r2+1] + movu [rsp+r2+16*8], m0 +.w16_filter_left: + pmovmskb r5d, m6 + test r5d, r5d + jz .w4_main + imul r5d, 0x24924924 + shr r5d, 30 + adc r5, -4 ; filter_strength-3 + jmp .filter_left +.w32: + test angled, 0x400 + jnz .w4_main + pshufb m6, [base+z_filter_t_w16] ; tlq[32] + mov r3, tlq + lea tlq, [rsp+16*9] + movd [tlq+16*1], m6 + xor r5d, r5d ; filter_strength = 3 + call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge + mova m0, [tlq+16*0] + mova m1, [tlq+16*1] + mov r2d, maxwm + mova [rsp+16*8], m0 + mova [rsp+16*9], m1 + cmp r2d, 32 + jge .filter_left + movu m0, [r3+r2+16*0+1] + movu m1, [r3+r2+16*1+1] + movu [rsp+r2+16*8], m0 + movu [rsp+r2+16*9], m1 + jmp .filter_left +.w64: + movu m0, [tlq+16*2+1] + movu m1, [tlq+16*3+1] + mova [rsp+16*10], m0 + mova [rsp+16*11], m1 + test angled, 0x400 + jnz .w4_main + pshufb m1, [base+z_filter_t_w16] ; tlq[64] + mov r3, tlq + lea tlq, [rsp+16*11] + movd [tlq+16*1], m1 + xor r5d, r5d ; filter_strength = 3 + call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge + sub tlq, 16*2 + call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge + mova m0, [tlq+16*0] + mova m1, [tlq+16*1] + mova m2, [tlq+16*2] + mova m3, [tlq+16*3] + mov r2d, maxwm + mova [rsp+16* 8], m0 + mova [rsp+16* 9], m1 + mova [rsp+16*10], m2 + mova [rsp+16*11], m3 + cmp r2d, 64 + jge .filter_left + movu m0, [r3+r2+16*0+1] + movu m1, [r3+r2+16*1+1] + movu [rsp+r2+16* 8], m0 + movu [rsp+r2+16* 9], m1 + cmp r2d, 32 + jge .filter_left + movu m0, [r3+r2+16*2+1] + movu m1, [r3+r2+16*3+1] + movu [rsp+r2+16*10], m0 + movu [rsp+r2+16*11], m1 +.filter_left: + neg hq + movd m0, [r3+hq] + pxor m1, m1 + pshufb m0, m1 + movd [rsp+16*6+hq-4], m0 + lea tlq, [rsp+16*5] + call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge + cmp hd, -32 + jge .filter_left_end + sub tlq, 16*2 + call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge + mova m0, [tlq+16*0] + mova m1, [tlq+16*1] + mova [rsp+16*2], m0 + mova [rsp+16*3], m1 +.filter_left_end: + mov r2d, maxhm + mova m0, [rsp+16*5] + mova m1, [rsp+16*6] + mova m2, [rsp+16*7] + neg r2 + mova [rsp+16*4], m0 + mova [rsp+16*5], m1 + mova [rsp+16*6], m2 + cmp r2d, hd + jle .w4_main + movu m0, [r3+r2-16*2] + movu m1, [r3+r2-16*1] + movu [rsp+r2+16*4], m0 + movu [rsp+r2+16*5], m1 + cmp r2d, -32 + jle .w4_main + movu m0, [r3+r2-16*4] + movu m1, [r3+r2-16*3] + movu [rsp+r2+16*2], m0 + movu [rsp+r2+16*3], m1 + jmp .w4_main + %if ARCH_X86_64 cglobal ipred_z3_8bpc, 4, 9, 11, 16*10, dst, stride, tl, w, h, angle, dy, _, org_w 
%define base r7-$$ diff --git a/third_party/dav1d/src/x86/itx.h b/third_party/dav1d/src/x86/itx.h index 28ee69aed89b..478eb6c6b65d 100644 --- a/third_party/dav1d/src/x86/itx.h +++ b/third_party/dav1d/src/x86/itx.h @@ -317,6 +317,9 @@ static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, cons assign_itx16_bpc_fn(R, 16, 8, 12, avx2); assign_itx12_bpc_fn( , 16, 16, 12, avx2); assign_itx2_bpc_fn (R, 32, 8, 12, avx2); + assign_itx_bpc_fn(R, 16, 32, identity_identity, IDTX, 12, avx2); + assign_itx_bpc_fn(R, 32, 16, identity_identity, IDTX, 12, avx2); + assign_itx_bpc_fn( , 32, 32, identity_identity, IDTX, 12, avx2); } #endif @@ -353,6 +356,7 @@ static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, cons assign_itx2_bpc_fn (R, 32, 8, 10, avx512icl); assign_itx2_bpc_fn (R, 32, 16, 10, avx512icl); assign_itx2_bpc_fn ( , 32, 32, 10, avx512icl); + assign_itx1_bpc_fn (R, 16, 64, 10, avx512icl); } #endif #endif diff --git a/third_party/dav1d/src/x86/itx16_avx2.asm b/third_party/dav1d/src/x86/itx16_avx2.asm index 811f711540f0..2315ec1e479a 100644 --- a/third_party/dav1d/src/x86/itx16_avx2.asm +++ b/third_party/dav1d/src/x86/itx16_avx2.asm @@ -3137,10 +3137,14 @@ INV_TXFM_8X16_FN identity, adst INV_TXFM_8X16_FN identity, flipadst INV_TXFM_8X16_FN identity, identity -%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16394] +%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384] pmulhrsw m%2, m%3, m%1 %if %0 == 4 ; if downshifting by 1 +%ifnum %4 pmulhrsw m%2, m%4 +%else ; without rounding + psraw m%2, 1 +%endif %else paddsw m%1, m%1 %endif @@ -6837,10 +6841,11 @@ ALIGN function_align ret cglobal inv_txfm_add_identity_identity_16x32_10bpc, 4, 7, 12, dst, stride, c, eob + vpbroadcastd m7, [pixel_10bpc_max] +.pass1: vpbroadcastd m8, [pw_2896x8] vpbroadcastd m9, [pw_1697x16] vpbroadcastd m11, [pw_8192] - vpbroadcastd m7, [pixel_10bpc_max] lea r6, [strideq*5] pxor m6, m6 paddw m10, m11, m11 ; pw_16384 @@ -6910,11 +6915,15 @@ ALIGN function_align punpckhqdq m1, m3, m2 jmp m(iidentity_8x8_internal_10bpc).write_2x8x2 +cglobal inv_txfm_add_identity_identity_16x32_12bpc, 4, 7, 12, dst, stride, c, eob + vpbroadcastd m7, [pixel_12bpc_max] + jmp m(inv_txfm_add_identity_identity_16x32_10bpc).pass1 + cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jz .dconly PROLOGUE 0, 8, 16, 32*40, dst, stride, c, eob - %undef cmp +%undef cmp vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] lea r6, [rsp+32*4] @@ -7136,10 +7145,11 @@ ALIGN function_align jmp m(idct_16x8_internal_10bpc).write_16x4 cglobal inv_txfm_add_identity_identity_32x16_10bpc, 4, 7, 11, dst, stride, c, eob + vpbroadcastd m7, [pixel_10bpc_max] +.pass1: vpbroadcastd m8, [pw_2896x8] vpbroadcastd m9, [pw_1697x16] - vpbroadcastd m10, [pw_2048] - vpbroadcastd m7, [pixel_10bpc_max] + vpbroadcastd m10, [pw_4096] lea r6, [strideq*5] pxor m6, m6 mov r5, dstq @@ -7187,16 +7197,20 @@ ALIGN function_align packssdw m3, [cq+64*7] REPX {pmulhrsw x, m8 }, m0, m1, m2, m3 REPX {paddsw x, x }, m0, m1, m2, m3 - REPX {IDTX16 x, 4, 9 }, 0, 1, 2, 3 + REPX {IDTX16 x, 4, 9, _ }, 0, 1, 2, 3 REPX {pmulhrsw x, m10}, m0, m1, m2, m3 REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2 +cglobal inv_txfm_add_identity_identity_32x16_12bpc, 4, 7, 11, dst, stride, c, eob + vpbroadcastd m7, [pixel_12bpc_max] + jmp m(inv_txfm_add_identity_identity_32x16_10bpc).pass1 + cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, 
eobd jz .dconly PROLOGUE 0, 8, 16, 32*83, dst, stride, c, eob - %undef cmp +%undef cmp vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] lea r6, [rsp+32*7] @@ -7364,9 +7378,10 @@ ALIGN function_align jmp m(idct_16x16_internal_8bpc).main cglobal inv_txfm_add_identity_identity_32x32_10bpc, 4, 8, 8, dst, stride, c, eob - %undef cmp - vpbroadcastd m5, [pw_8192] +%undef cmp vpbroadcastd m7, [pixel_10bpc_max] +.pass1: + vpbroadcastd m5, [pw_8192] pxor m6, m6 lea r6, [strideq*3] lea r5, [strideq*5] @@ -7432,6 +7447,10 @@ ALIGN function_align REPX {pmulhrsw x, m5}, m0, m1, m2, m3 jmp m(inv_txfm_add_identity_identity_8x32_10bpc).main_zero +cglobal inv_txfm_add_identity_identity_32x32_12bpc, 4, 8, 8, dst, stride, c, eob + vpbroadcastd m7, [pixel_12bpc_max] + jmp m(inv_txfm_add_identity_identity_32x32_10bpc).pass1 + %macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4]) %if %1 & 1 mova m%5, [r5-32*(51-%1)] ; idct16 out 0+n @@ -7472,7 +7491,7 @@ cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jz .dconly PROLOGUE 0, 10, 16, 32*98, dst, stride, c, eob - %undef cmp +%undef cmp vpbroadcastd m11, [pd_2048] vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] @@ -7814,7 +7833,7 @@ cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jz .dconly PROLOGUE 0, 11, 16, 32*134, dst, stride, c, eob - %undef cmp +%undef cmp vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] lea r6, [rsp+32*6] @@ -8043,7 +8062,7 @@ cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob RET .normal: PROLOGUE 0, 8, 16, 32*96, dst, stride, c, eob - %undef cmp +%undef cmp vpbroadcastd m11, [pd_2048] vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] @@ -8262,7 +8281,7 @@ cglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jz .dconly PROLOGUE 0, 8, 16, 32*163, dst, stride, c, eob - %undef cmp +%undef cmp vpbroadcastd m11, [pd_2048] vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] @@ -8411,7 +8430,7 @@ cglobal inv_txfm_add_dct_dct_64x64_10bpc, 4, 7, 0, dst, stride, c, eob test eobd, eobd jz .dconly PROLOGUE 0, 11, 16, 32*195, dst, stride, c, eob - %undef cmp +%undef cmp vpbroadcastd m11, [pd_2048] vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] diff --git a/third_party/dav1d/src/x86/itx16_avx512.asm b/third_party/dav1d/src/x86/itx16_avx512.asm index e3a3f74f4bfb..d973655462b8 100644 --- a/third_party/dav1d/src/x86/itx16_avx512.asm +++ b/third_party/dav1d/src/x86/itx16_avx512.asm @@ -174,6 +174,8 @@ cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf_fast2 cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast2 +cextern inv_txfm_add_dct_dct_16x64_8bpc_avx512icl.main_oddhalf +cextern inv_txfm_add_dct_dct_16x64_8bpc_avx512icl.main_oddhalf_fast SECTION .text @@ -3815,4 +3817,317 @@ cglobal inv_txfm_add_identity_identity_32x32_10bpc, 4, 7, 16, dst, stride, c, eo punpckhdq m8, m0 ; 6 7 ret +cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob + lea r5, [o_base] + test eobd, eobd + jz .dconly + + PROLOGUE 4, 7, 32, -8*mmsize, dst, stride, c, eob +%undef cmp + vpbroadcastd m12, [o(pd_2896)] + vpbroadcastd m13, [o(pd_2048)] + vpbroadcastd m14, [o(clip_18b_min)] + vpbroadcastd m15, [o(clip_18b_max)] + cmp eobd, 36 + jl .fast + call .pass1 + cmp eobd, 
151 + jge .full + lea r5, [o_base_8bpc] + + punpckhwd m22, m0, m0 + punpckhwd m23, m1, m1 + punpckhwd m24, m2, m2 + punpckhwd m25, m3, m3 + punpckhwd m26, m4, m4 + punpckhwd m27, m5, m5 + punpckhwd m28, m6, m6 + punpckhwd m29, m7, m7 + punpcklwd m21, m1, m1 + punpcklwd m14, m3, m3 + punpcklwd m18, m5, m5 + punpcklwd m15, m7, m7 + pxor m9, m9 + punpcklwd m9, m9, m0 + punpcklwd m8, m2, m2 + punpcklwd m7, m4, m4 + punpcklwd m1, m6, m6 + call m(idct_16x16_internal_8bpc).main_fast2 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 + mova [rsp+mmsize*0], m14 + mova [rsp+mmsize*1], m15 + mova [rsp+mmsize*2], m16 + mova [rsp+mmsize*3], m17 + mova [rsp+mmsize*4], m18 + mova [rsp+mmsize*5], m19 + mova [rsp+mmsize*6], m20 + mova [rsp+mmsize*7], m21 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast + + pxor m12, m12 + mov r3d, 64*3 +.zero_loop: + REPX {mova [cq+r3*8+128*x], m12}, 0, 1, 2, 3 + sub r3d, 64 + jge .zero_loop + + jmp .pass2_end +.full: + mova [cq+128*0], m0 + mova [cq+128*1], m1 + mova [cq+128*2], m2 + mova [cq+128*3], m3 + mova [cq+128*4], m4 + mova [cq+128*5], m5 + mova [cq+128*6], m6 + mova [cq+128*7], m7 + add cq, 64 + call .pass1 + sub cq, 64 + mova m22, [cq+128*0] ; 0 1 + mova m23, [cq+128*1] ; 2 3 + mova m24, [cq+128*2] ; 4 5 + mova m25, [cq+128*3] ; 6 7 + mova m26, [cq+128*4] ; 8 9 + mova m27, [cq+128*5] ; 10 11 + mova m28, [cq+128*6] ; 12 13 + mova m29, [cq+128*7] ; 14 15 + mova [cq+64* 8], m0 + mova [cq+64* 9], m1 + mova [cq+64*10], m2 + mova [cq+64*11], m3 + mova [cq+64*12], m4 + mova [cq+64*13], m5 + mova [cq+64*14], m6 + mova [cq+64*15], m7 + lea r5, [o_base_8bpc] + + punpcklwd m20, m1, m1 + punpcklwd m16, m3, m3 + punpcklwd m19, m5, m5 + punpcklwd m17, m7, m7 + punpcklwd m8, m24, m24 ; 4 + punpcklwd m5, m2, m2 ; 20 + punpcklwd m1, m28, m28 ; 12 + punpcklwd m7, m26, m26 ; 8 + punpcklwd m3, m4, m4 ; 24 + punpcklwd m4, m6, m6 ; 28 + pxor m9, m9 + punpcklwd m6, m9, m0 ; __ 16 + mova m0, m4 + punpcklwd m9, m9, m22 ; __ 0 + call m(idct_16x16_internal_8bpc).main_fast + punpcklwd m21, m23, m23 ; 2 + punpcklwd m15, m29, m29 ; 14 + punpcklwd m18, m27, m27 ; 10 + punpcklwd m14, m25, m25 ; 6 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + mova [rsp+mmsize*0], m14 + mova [rsp+mmsize*1], m15 + mova [rsp+mmsize*2], m16 + mova [rsp+mmsize*3], m17 + mova [rsp+mmsize*4], m18 + mova [rsp+mmsize*5], m19 + mova [rsp+mmsize*6], m20 + mova [rsp+mmsize*7], m21 + mova m21, [cq+64*15] + mova m14, [cq+64* 8] + mova m17, [cq+64*11] + mova m18, [cq+64*12] + mova m19, [cq+64*13] + mova m16, [cq+64*10] + mova m15, [cq+64* 9] + mova m20, [cq+64*14] + REPX {punpckhwd x, x}, m22, m21, m14, m29, m26, m17, m18, m25, \ + m24, m19, m16, m27, m28, m15, m20, m23 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf + + pxor m12, m12 + mov r3d, 32*7 +.full_zero_loop: + REPX {mova [cq+r3*8+64*x], m12}, 0, 1, 2, 3 + sub r3d, 32 + jge .full_zero_loop + + jmp .pass2_end +.fast: + mova ym0, [cq+128*0] + mova ym2, [cq+128*4] + movshdup m8, [o(permB)] + mova ym1, [cq+128*2] + mova ym3, [cq+128*6] + mova ym4, [cq+128*1] + mova ym5, [cq+128*3] + mova ym6, [cq+128*5] + mova ym7, [cq+128*7] + vpermt2q m0, m8, m2 ; 0 4 + vpermt2q m1, m8, m3 ; 2 6 + vpermt2q m4, m8, m5 ; 1 3 + vpermt2q m7, m8, m6 ; 7 5 + call m(idct_8x8_internal_10bpc).main_fast + call m(idct_16x8_internal_10bpc).main_fast + vpbroadcastd m11, [o(pd_2)] + call m(idct_8x16_internal_10bpc).main_end2 + mova m8, [o(idct8x32p)] + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + mova m6, [dup16_perm] + vpermb 
m0, m8, m0 + vpermb m2, m8, m2 + vprold m8, 16 + vpermb m1, m8, m1 + vpermb m3, m8, m3 + punpckldq m4, m0, m2 + punpckhdq m0, m2 + punpckldq m2, m1, m3 + punpckhdq m1, m3 + punpckldq m21, m4, m2 + punpckhdq m14, m4, m2 + punpckldq m18, m0, m1 + punpckhdq m15, m0, m1 + vpord m7, m6, [o(pb_32)] {1to16} + vpermb m22, m7, m21 ; 1 + pmovzxwd m9, ym21 ; 0 + vpermb m8, m6, m18 ; 4 + vpermb m24, m7, m18 ; 5 + vpermb m21, m6, m14 ; 2 + vpermb m23, m7, m14 ; 3 + vpermb m14, m6, m15 ; 6 + vpermb m25, m7, m15 ; 7 + lea r5, [o_base_8bpc] + pslld m9, 16 + + pxor m7, m7 + REPX {mova x, m7}, m1, m18, m15, m26, m27, m28, m29 + + call m(idct_16x16_internal_8bpc).main_fast2 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 + mova [rsp+mmsize*0], m14 + mova [rsp+mmsize*1], m15 + mova [rsp+mmsize*2], m16 + mova [rsp+mmsize*3], m17 + mova [rsp+mmsize*4], m18 + mova [rsp+mmsize*5], m19 + mova [rsp+mmsize*6], m20 + mova [rsp+mmsize*7], m21 + + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast + + pxor m12, m12 + REPX {mova [cq+128*x], ym12}, 0, 1, 2, 3, 4, 5, 6, 7 +.pass2_end: + movshdup m30, [permC] + vpbroadcastd m11, [pw_2048] + vpbroadcastd m13, [pixel_10bpc_max] + lea r6, [strideq*3] + psrlq m31, m30, 8 + vpermq m8, m30, m0 + vpermq m9, m31, m1 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m2 + vpermq m9, m31, m3 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m4 + vpermq m9, m31, m5 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m6 + vpermq m9, m31, m7 + call m(idct_16x8_internal_10bpc).write_16x4 + + mova m1, [rsp+mmsize*0] + mova m2, [rsp+mmsize*1] + mova m3, [rsp+mmsize*2] + mova m4, [rsp+mmsize*3] + mova m5, [rsp+mmsize*4] + mova m6, [rsp+mmsize*5] + mova m7, [rsp+mmsize*6] + mova m8, [rsp+mmsize*7] + + paddsw m0, m1, m21 + psubsw m21, m1, m21 + paddsw m1, m2, m20 + psubsw m20, m2, m20 + paddsw m2, m3, m19 + psubsw m19, m3, m19 + paddsw m3, m4, m18 + psubsw m18, m4, m18 + paddsw m4, m5, m17 + psubsw m17, m5, m17 + paddsw m5, m6, m16 + psubsw m16, m6, m16 + paddsw m6, m7, m15 + psubsw m15, m7, m15 + paddsw m7, m8, m14 + psubsw m14, m8, m14 + + vpermq m8, m30, m0 + vpermq m9, m31, m1 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m2 + vpermq m9, m31, m3 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m4 + vpermq m9, m31, m5 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m6 + vpermq m9, m31, m7 + call m(idct_16x8_internal_10bpc).write_16x4 + + vpermq m8, m30, m14 + vpermq m9, m31, m15 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m16 + vpermq m9, m31, m17 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m18 + vpermq m9, m31, m19 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m20 + vpermq m9, m31, m21 + call m(idct_16x8_internal_10bpc).write_16x4 + + vpermq m8, m30, m22 + vpermq m9, m31, m23 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m24 + vpermq m9, m31, m25 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m26 + vpermq m9, m31, m27 + call m(idct_16x8_internal_10bpc).write_16x4 + vpermq m8, m30, m28 + vpermq m9, m31, m29 + call m(idct_16x8_internal_10bpc).write_16x4 + RET +.pass1: + mova m0, [cq+128* 0] + mova m1, [cq+128* 2] + mova m2, [cq+128* 4] + mova m3, [cq+128* 6] + mova m4, [cq+128* 8] + mova m5, [cq+128*10] + mova m6, [cq+128*12] + mova m7, [cq+128*14] + call m(idct_8x16_internal_10bpc).main + mova m16, [cq+128* 1] + mova m17, [cq+128* 3] + mova m18, [cq+128* 5] + mova m19, [cq+128* 7] + mova 
m20, [cq+128* 9] + mova m21, [cq+128*11] + mova m22, [cq+128*13] + mova m23, [cq+128*15] + call m(idct_16x16_internal_10bpc).main + call m(idct_16x16_internal_10bpc).main_end + jmp m(idct_16x16_internal_10bpc).main_end3 +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd + or r3d, 64 + add r6d, 640 + sar r6d, 10 + jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly2 + %endif ; ARCH_X86_64 diff --git a/third_party/dav1d/src/x86/itx_avx512.asm b/third_party/dav1d/src/x86/itx_avx512.asm index 813d474011d7..31c60fdd457a 100644 --- a/third_party/dav1d/src/x86/itx_avx512.asm +++ b/third_party/dav1d/src/x86/itx_avx512.asm @@ -5143,7 +5143,7 @@ cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 7, 0, dst, stride, c, eob sar r6d, 8+2 jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3 ALIGN function_align -.main_oddhalf_fast: ; bottom three-quarters are zero +cglobal_label .main_oddhalf_fast ; bottom three-quarters are zero vpbroadcastd m8, [o(pw_101_4095x8)] vpbroadcastd m21, [o(pw_m1474_3822x8)] vpbroadcastd m14, [o(pw_897_3996x8)] @@ -5170,7 +5170,7 @@ ALIGN function_align mova m20, m15 jmp .main_oddhalf2 ALIGN function_align -.main_oddhalf: +cglobal_label .main_oddhalf vpbroadcastd m8, [o(pw_101_4095x8)] vpbroadcastd m9, [o(pw_m2824_2967x8)] vpbroadcastd m11, [o(pw_1660_3745x8)] diff --git a/third_party/dav1d/src/x86/refmvs.asm b/third_party/dav1d/src/x86/refmvs.asm index 9b0e0bf711d5..06f555db117b 100644 --- a/third_party/dav1d/src/x86/refmvs.asm +++ b/third_party/dav1d/src/x86/refmvs.asm @@ -57,6 +57,7 @@ save_pack0: db 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0 save_pack1: db 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2 db 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3 save_ref_shuf: db 0, -1, -1, -1, 1, -1, -1, -1, 8, -1, -1, -1, 9, -1, -1, -1 +cond_shuf512: db 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 3, 3 save_cond0: db 0x80, 0x81, 0x82, 0x83, 0x89, 0x84, 0x00, 0x00 save_cond1: db 0x84, 0x85, 0x86, 0x87, 0x88, 0x80, 0x00, 0x00 pb_128: times 16 db 128 @@ -74,6 +75,12 @@ save_tmvs_avx2_table: SAVE_TMVS_TABLE 2, 16, avx2 SAVE_TMVS_TABLE 5, 2, avx2 SAVE_TMVS_TABLE 7, 1, avx2 +save_tmvs_avx512icl_table: SAVE_TMVS_TABLE 2, 16, avx512icl + SAVE_TMVS_TABLE 4, 8, avx512icl + SAVE_TMVS_TABLE 4, 4, avx512icl + SAVE_TMVS_TABLE 5, 2, avx512icl + SAVE_TMVS_TABLE 7, 1, avx512icl + JMP_TABLE splat_mv_avx512icl, 1, 2, 4, 8, 16, 32 JMP_TABLE splat_mv_avx2, 1, 2, 4, 8, 16, 32 %endif @@ -170,8 +177,6 @@ cglobal save_tmvs, 6, 7, 8, rp, stride, rr, ref_sign, \ %define rpq r3 %define r10 r1 %define r10d r1 -%define r10w r1w -%define r10b r1b %define r11 r4 %define r11d r4 %endif @@ -486,6 +491,125 @@ cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4 jg .loop RET +INIT_ZMM avx512icl +; refmvs_temporal_block *rp, ptrdiff_t stride, +; refmvs_block **rr, uint8_t *ref_sign, +; int col_end8, int row_end8, int col_start8, int row_start8 +cglobal save_tmvs, 4, 15, 10, rp, stride, rr, ref_sign, \ + xend, yend, xstart, ystart +%define base r14-.write1 + lea r14, [.write1] + movifnidn xendd, xendm + movifnidn yendd, yendm + mov xstartd, xstartm + mov ystartd, ystartm + psllq m4, [ref_signq]{bcstq}, 8 + vpbroadcastq m3, [base+save_ref_shuf+8] + vbroadcasti32x4 m5, [base+cond_shuf512] + vbroadcasti32x4 m6, [base+save_cond0] + vpbroadcastd m7, [base+pb_128] + mova m8, [base+save_pack0] + movu xm9, [base+save_pack0+4] + lea r9d, [xendq*5] + lea xstartd, [xstartq*5] + sub yendd, ystartd + add ystartd, ystartd + lea strideq, [strideq*5] + sub xstartq, r9 + add xendd, r9d + add rpq, r9 + mov r10d, 0x1f + kmovb k2, r10d + DEFINE_ARGS rp, stride, 
rr, x, xend, h, xstart, ystart, b, cand +.loop_y: + and ystartd, 30 + mov xq, xstartq + mov bq, [rrq+ystartq*8] + add ystartd, 2 + lea bq, [bq+xendq*4] +.loop_x: + imul candq, xq, 0x9999 + sar candq, 16 ; x / 5 * 3 + movzx r10d, byte [bq+candq*8+22] ; cand_b->bs + movu xm0, [bq+candq*8+12] ; cand_b + movzx r11d, byte [base+save_tmvs_avx512icl_table+r10*2+0] + movzx r10d, byte [base+save_tmvs_avx512icl_table+r10*2+1] + add r10, r14 + add candq, r11 + jge .calc + movzx r11d, byte [bq+candq*8+22] + vinserti32x4 ym0, [bq+candq*8+12], 1 + movzx r12d, byte [base+save_tmvs_avx512icl_table+r11*2+0] + movzx r11d, byte [base+save_tmvs_avx512icl_table+r11*2+1] + add r11, r14 + add candq, r12 + jge .calc + movzx r12d, byte [bq+candq*8+22] + vinserti32x4 m0, [bq+candq*8+12], 2 + movzx r13d, byte [base+save_tmvs_avx512icl_table+r12*2+0] + movzx r12d, byte [base+save_tmvs_avx512icl_table+r12*2+1] + add r12, r14 + add candq, r13 + jge .calc + vinserti32x4 m0, [bq+candq*8+12], 3 + movzx r13d, byte [bq+candq*8+22] + movzx r13d, byte [base+save_tmvs_avx512icl_table+r13*2+1] + add r13, r14 +.calc: + pshufb m1, m0, m3 + pabsw m2, m0 + pshufb m1, m4, m1 ; ref > 0 && res_sign[ref - 1] + psrlw m2, 12 ; (abs(mv.x) | abs(mv.y)) < 4096 + psubd m2, m1 + pshufb m2, m5 ; c0 c1 c1 c0 + pand m2, m6 + punpckhqdq m1, m2, m2 + vpternlogd m1, m2, m7, 0x56 ; (c0shuf | c1shuf) ^ 0x80 + pshufb m2, m0, m1 + mova xm0, xm2 + call r10 + jge .next_line + vextracti32x4 xm0, m2, 1 + call r11 + jge .next_line + vextracti32x4 xm0, m2, 2 + call r12 + jge .next_line + vextracti32x4 xm0, m2, 3 + call r13 + jl .loop_x +.next_line: + add rpq, strideq + dec hd + jg .loop_y + RET +.write1: + vmovdqu8 [rpq+xq]{k2}, xm0 + add xq, 5*1 + ret +.write2: + pshufb xm0, xm8 + vmovdqu16 [rpq+xq]{k2}, xm0 + add xq, 5*2 + ret +.write4: + vpermb ym0, ym8, ym0 + vmovdqu32 [rpq+xq]{k2}, ym0 + add xq, 5*4 + ret +.write8: + vpermb m0, m8, m0 + vmovdqu64 [rpq+xq]{k2}, m0 + add xq, 5*8 + ret +.write16: + vpermb m1, m8, m0 + movu [rpq+xq+ 0], m1 + pshufb xm0, xm9 + movu [rpq+xq+64], xm0 + add xq, 5*16 + ret + INIT_ZMM avx512icl cglobal splat_mv, 4, 7, 3, rr, a, bx4, bw4, bh4 vbroadcasti32x4 m0, [aq] diff --git a/third_party/dav1d/src/x86/refmvs.h b/third_party/dav1d/src/x86/refmvs.h index b81e8b9bddae..9dafa78b1338 100644 --- a/third_party/dav1d/src/x86/refmvs.h +++ b/third_party/dav1d/src/x86/refmvs.h @@ -30,6 +30,7 @@ decl_save_tmvs_fn(dav1d_save_tmvs_ssse3); decl_save_tmvs_fn(dav1d_save_tmvs_avx2); +decl_save_tmvs_fn(dav1d_save_tmvs_avx512icl); decl_splat_mv_fn(dav1d_splat_mv_sse2); decl_splat_mv_fn(dav1d_splat_mv_avx2); @@ -54,6 +55,7 @@ static ALWAYS_INLINE void refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *const c) { if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; + c->save_tmvs = dav1d_save_tmvs_avx512icl; c->splat_mv = dav1d_splat_mv_avx512icl; #endif } diff --git a/third_party/dav1d/tests/checkasm/ipred.c b/third_party/dav1d/tests/checkasm/ipred.c index b12843bd6a06..3676b809b368 100644 --- a/third_party/dav1d/tests/checkasm/ipred.c +++ b/third_party/dav1d/tests/checkasm/ipred.c @@ -133,6 +133,7 @@ static void check_intra_pred(Dav1dIntraPredDSPContext *const c) { a & 0x1ff, a & 0x600, maxw, maxh); else if (mode == FILTER_PRED) fprintf(stderr, "filter_idx = %d\n", a & 0x1ff); + break; } bench_new(a_dst, stride, topleft, w, h, a, 128, 128 diff --git a/third_party/dav1d/tests/checkasm/refmvs.c b/third_party/dav1d/tests/checkasm/refmvs.c index 88a057fcf572..f21c81f85a48 100644 --- a/third_party/dav1d/tests/checkasm/refmvs.c +++ 
b/third_party/dav1d/tests/checkasm/refmvs.c
@@ -29,6 +29,16 @@
 #include 
+static inline int gen_mv(const int total_bits, int spel_bits) {
+    int bits = rnd() & ((1 << spel_bits) - 1);
+    do {
+        bits |= (rnd() & 1) << spel_bits;
+    } while (rnd() & 1 && ++spel_bits < total_bits);
+    // the do/while makes it relatively more likely to be close to zero (fpel)
+    // than far away
+    return rnd() & 1 ? -bits : bits;
+}
+
 static void check_save_tmvs(const Dav1dRefmvsDSPContext *const c) {
     refmvs_block *rr[31];
     refmvs_block r[31 * 256];
@@ -58,10 +68,10 @@
             while (j + ((dav1d_block_dimensions[bs][0] + 1) >> 1) > col_end8)
                 bs++;
             rr[i * 2][j * 2 + 1] = (refmvs_block) {
-                .mv.mv[0].x = -(rnd() & 1) * (rnd() & 8191),
-                .mv.mv[0].y = -(rnd() & 1) * (rnd() & 8191),
-                .mv.mv[1].x = -(rnd() & 1) * (rnd() & 8191),
-                .mv.mv[1].y = -(rnd() & 1) * (rnd() & 8191),
+                .mv.mv[0].x = gen_mv(14, 10),
+                .mv.mv[0].y = gen_mv(14, 10),
+                .mv.mv[1].x = gen_mv(14, 10),
+                .mv.mv[1].y = gen_mv(14, 10),
                 .ref.ref = { (rnd() % 9) - 1, (rnd() % 9) - 1 },
                 .bs = bs
             };
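
Reviewer note on the new AVX-512 save_tmvs kernel above: its vectorized selection mirrors a simple per-candidate test, as stated by the assembly's own comments ("ref > 0 && ref_sign[ref - 1]" and "(abs(mv.x) | abs(mv.y)) < 4096"). The scalar C sketch below only illustrates that test; the types mv_t, cand_t, tmv_t and the helper select_tmv are hypothetical stand-ins, not dav1d's actual structures or API.

#include <stdint.h>
#include <stdlib.h>

typedef struct { int16_t x, y; } mv_t;
typedef struct { mv_t mv[2]; int8_t ref[2]; } cand_t; /* simplified candidate block */
typedef struct { mv_t mv; int8_t ref; } tmv_t;        /* simplified temporal MV entry */

/* Scalar sketch of the per-candidate condition the SIMD code evaluates:
 * pick the first MV whose reference is a forward reference with its bit
 * set in ref_sign[] and whose components both fit below 4096. */
static tmv_t select_tmv(const cand_t *cand, const uint8_t *ref_sign)
{
    for (int n = 0; n < 2; n++) {
        const int ref = cand->ref[n];
        if (ref > 0 && ref_sign[ref - 1] &&
            (abs(cand->mv[n].x) | abs(cand->mv[n].y)) < 4096)
        {
            return (tmv_t) { .mv = cand->mv[n], .ref = (int8_t)ref };
        }
    }
    /* Fallback when neither candidate qualifies; what is stored in that case
     * is not shown here, so treat this as a placeholder. */
    return (tmv_t) { .mv = { 0, 0 }, .ref = 0 };
}

In the assembly this decision is made branchlessly: both candidates' conditions are packed, combined into a byte-shuffle control via the pshufb/vpternlogd sequence ("(c0shuf | c1shuf) ^ 0x80"), and a final pshufb extracts the chosen 5-byte temporal block layout directly from the candidate data.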