Bug 1827248 - Update dav1d to 5aa3b38f9871859e14e55f18ab5e38318fe86305 r=chunmin

Differential Revision: https://phabricator.services.mozilla.com/D175056
This commit is contained in:
Updatebot 2023-04-11 16:16:09 +00:00
Родитель 55175bdf25
Коммит 6dde2ed59f
18 изменённых файлов: 2280 добавлений и 71 удалений

Просмотреть файл

@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: 7d23ec4a042b2feb6c0d4c1b0618a87cb8c34dcb (2023-03-13T15:19:35.000+00:00).
release: 5aa3b38f9871859e14e55f18ab5e38318fe86305 (2023-04-08T11:47:31.000+00:00).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: 7d23ec4a042b2feb6c0d4c1b0618a87cb8c34dcb
revision: 5aa3b38f9871859e14e55f18ab5e38318fe86305
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/

Просмотреть файл

@ -1,2 +1,2 @@
/* auto-generated, do not edit */
#define DAV1D_VERSION "7d23ec4a042b2feb6c0d4c1b0618a87cb8c34dcb"
#define DAV1D_VERSION "5aa3b38f9871859e14e55f18ab5e38318fe86305"

71
third_party/dav1d/src/arm/64/ipred.S поставляемый
Просмотреть файл

@ -1481,11 +1481,10 @@ function ipred_z1_filter_edge_8bpc_neon, export=1
sub x5, x5, w3, uxtw
add x6, x2, w6, sxtw
ld1 {v2.16b, v3.16b}, [x5] // padding_mask
ld1 {v2.16b}, [x5] // padding_mask
ld1r {v4.16b}, [x6]
bit v0.16b, v4.16b, v2.16b // Pad v0-v1
bit v1.16b, v4.16b, v3.16b
ld1r {v1.16b}, [x6]
bit v0.16b, v1.16b, v2.16b // Pad v0-v1
// Filter one block
ext v2.16b, v0.16b, v1.16b, #1
@ -1598,6 +1597,17 @@ L(fivetap):
ret
endfunc
// void ipred_pixel_set_8bpc_neon(pixel *out, const pixel px,
// const int n);
//
// Fills out[] with copies of the byte px, one 16-byte vector per iteration.
// Per the prototype above: x0 = out, w1 = px, w2 = n.
//
// The store is executed before the loop condition is acted on, and the
// loop continues while the remaining count is still positive, so the
// number of bytes written is n rounded up to a multiple of 16 (and 16
// bytes even when n <= 0). The destination must have room for that.
function ipred_pixel_set_8bpc_neon, export=1
dup v0.16b, w1 // broadcast px to all 16 byte lanes
1:
subs w2, w2, #16 // remaining -= 16; sets the flags tested by b.gt
st1 {v0.16b}, [x0], #16 // store 16 bytes, post-increment out (flags untouched)
b.gt 1b // loop while remaining > 0
ret
endfunc
// void ipred_z1_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const top,
// const int width, const int height,
@ -1671,10 +1681,10 @@ function ipred_z1_fill1_8bpc_neon, export=1
dup v7.8b, w11
ext v1.16b, v0.16b, v0.16b, #1 // top[base+1]
ext v3.16b, v2.16b, v2.16b, #1
umull v16.8h, v1.8b, v4.8b // top[base+1]*frac
umlal v16.8h, v0.8b, v6.8b // + top[base]*(64-frac)
umull v17.8h, v3.8b, v5.8b
umlal v17.8h, v2.8b, v7.8b
umull v16.8h, v0.8b, v6.8b // top[base]*(64-frac)
umlal v16.8h, v1.8b, v4.8b // + top[base+1]*frac
umull v17.8h, v2.8b, v7.8b
umlal v17.8h, v3.8b, v5.8b
rshrn v16.8b, v16.8h, #6
rshrn v17.8b, v17.8h, #6
st1 {v16.8b}, [x0], x1
@ -1724,14 +1734,14 @@ function ipred_z1_fill1_8bpc_neon, export=1
ext v16.16b, v0.16b, v1.16b, #1 // top[base+1]
ext v17.16b, v2.16b, v3.16b, #1
subs w3, w3, #16
umull v18.8h, v16.8b, v4.8b // top[base+1]*frac
umlal v18.8h, v0.8b, v6.8b // + top[base]*(64-frac)
umull2 v19.8h, v16.16b, v4.16b
umlal2 v19.8h, v0.16b, v6.16b
umull v20.8h, v17.8b, v5.8b
umlal v20.8h, v2.8b, v7.8b
umull2 v21.8h, v17.16b, v5.16b
umlal2 v21.8h, v2.16b, v7.16b
umull v18.8h, v0.8b, v6.8b // top[base]*(64-frac)
umlal v18.8h, v16.8b, v4.8b // + top[base+1]*frac
umull2 v19.8h, v0.16b, v6.16b
umlal2 v19.8h, v16.16b, v4.16b
umull v20.8h, v2.8b, v7.8b
umlal v20.8h, v17.8b, v5.8b
umull2 v21.8h, v2.16b, v7.16b
umlal2 v21.8h, v17.16b, v5.16b
rshrn v16.8b, v18.8h, #6
rshrn2 v16.16b, v19.8h, #6
rshrn v17.8b, v20.8h, #6
@ -1899,7 +1909,7 @@ function ipred_z3_fill1_8bpc_neon, export=1
ld1r {v31.16b}, [x10] // padding
ld1 {v30.8h}, [x11] // increments
mov w7, w5
b.gt L(ipred_z3_fill1_large_w16)
b.gt L(ipred_z3_fill1_large_h16)
br x8
40:
@ -1909,6 +1919,7 @@ function ipred_z3_fill1_8bpc_neon, export=1
mul v30.4h, v30.4h, v29.4h // {0,1,2,3,4,5,6,7}*dy
movi v23.16b, #0x3e
// Worst case max_base_y is width+height-1, for w=4, h=16, <= 32
ld1 {v0.16b, v1.16b}, [x2] // left[]
add v30.4h, v29.4h, v30.4h // ypos
@ -1958,7 +1969,8 @@ function ipred_z3_fill1_8bpc_neon, export=1
mul v30.8h, v30.8h, v29.8h // {0,1,2,3,4,5,6,7}*dy
movi v23.16b, #0x3e
ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[]
// Worst case max_base_y is width+height-1, for w=8, h=32, <= 48
ld1 {v0.16b, v1.16b, v2.16b}, [x2] // left[]
add v30.8h, v29.8h, v30.8h // ypos
movi v22.16b, #64
@ -1974,12 +1986,12 @@ function ipred_z3_fill1_8bpc_neon, export=1
uqadd v28.8b, v26.8b, v21.8b // base + 2
sub v25.8b, v22.8b, v24.8b // 64 - frac
tbx v4.8b, {v0.16b, v1.16b, v2.16b, v3.16b}, v26.8b // left[base]
tbx v4.8b, {v0.16b, v1.16b, v2.16b}, v26.8b // left[base]
1:
mov v5.8b, v31.8b
mov v6.8b, v31.8b
tbx v5.8b, {v0.16b, v1.16b, v2.16b, v3.16b}, v27.8b // left[base+1]
tbx v6.8b, {v0.16b, v1.16b, v2.16b, v3.16b}, v28.8b // left[base+2]
tbx v5.8b, {v0.16b, v1.16b, v2.16b}, v27.8b // left[base+1]
tbx v6.8b, {v0.16b, v1.16b, v2.16b}, v28.8b // left[base+2]
umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac)
umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac
@ -2008,6 +2020,7 @@ function ipred_z3_fill1_8bpc_neon, export=1
mul v30.8h, v30.8h, v28.8h // {0,1,2,3,4,5,6,7}*dy
movi v23.16b, #0x3e
// This is only executed if we've checked that max_base_y <= 64.
ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[]
add v28.8h, v28.8h, v30.8h // ypos
@ -2075,6 +2088,7 @@ function ipred_z3_fill1_8bpc_neon, export=1
sub x1, x1, w3, uxtw
add v30.8h, v28.8h, v30.8h // ypos
// This is only executed if we've checked that max_base_y <= 64.
ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[]
movi v22.16b, #64
@ -2146,7 +2160,7 @@ function ipred_z3_fill1_8bpc_neon, export=1
9:
ret
L(ipred_z3_fill1_large_w16):
L(ipred_z3_fill1_large_h16):
// Fallback case for max_base_y > 64; similar to the z1
// implementation. This does the filtering vertically, filling out
// a 2x pixel column at a time.
@ -2358,7 +2372,7 @@ L(ipred_z3_fill_padding_wide):
st1 {v31.16b}, [x0], #16
b.gt 2b
subs w4, w4, #1
add x0, x0, x1
add x0, x0, x1
b.le 9f
mov w3, w12
b 1b
@ -2367,16 +2381,11 @@ L(ipred_z3_fill_padding_wide):
endfunc
function ipred_z3_fill2_8bpc_neon, export=1
adr x8, L(ipred_z3_fill1_tbl)
cmp w3, #8
add x10, x2, w6, uxtw // left[max_base_y]
movrel x11, increments
ld1r {v31.16b}, [x10] // padding
ld1 {v30.8h}, [x11] // increments
mov w7, w5
cmp w3, #8
add x10, x2, w6, uxtw // left[max_base_y]
ld1r {v31.16b}, [x10] // padding
b.eq 80f
40: // w == 4
@ -2385,6 +2394,8 @@ function ipred_z3_fill2_8bpc_neon, export=1
mul v30.4h, v30.4h, v29.4h // {0,1,2,3,4,5,6,7}*dy
movi v23.16b, #0x3e
// Worst case max_base_y is 2*(width+height)-2, but width+height <= 16,
// so max_base_y <= 32.
ld1 {v0.16b, v1.16b}, [x2] // left[]
add v30.4h, v29.4h, v30.4h // ypos
@ -2434,6 +2445,8 @@ function ipred_z3_fill2_8bpc_neon, export=1
mul v30.8h, v30.8h, v29.8h // {0,1,2,3,4,5,6,7}*dy
movi v23.16b, #0x3e
// Worst case max_base_y is 2*(width+height)-2, but width+height <= 16,
// so max_base_y <= 32.
ld1 {v0.16b, v1.16b}, [x2] // left[]
add v30.8h, v29.8h, v30.8h // ypos

1027
third_party/dav1d/src/arm/64/ipred16.S поставляемый

Разница между файлами не показана из-за своего большого размера Загрузить разницу

24
third_party/dav1d/src/arm/ipred.h поставляемый
Просмотреть файл

@ -50,13 +50,15 @@ decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_444, neon));
decl_pal_pred_fn(BF(dav1d_pal_pred, neon));
#if ARCH_AARCH64 && BITDEPTH == 8
#if ARCH_AARCH64
void BF(dav1d_ipred_z1_upsample_edge, neon)(pixel *out, const int hsz,
const pixel *const in,
const int end);
const int end HIGHBD_DECL_SUFFIX);
void BF(dav1d_ipred_z1_filter_edge, neon)(pixel *out, const int sz,
const pixel *const in,
const int end, const int strength);
void BF(dav1d_ipred_pixel_set, neon)(pixel *out, const pixel px,
const int n);
void BF(dav1d_ipred_z1_fill1, neon)(pixel *dst, ptrdiff_t stride,
const pixel *const top, const int width,
const int height, const int dx,
@ -76,14 +78,15 @@ static void ipred_z1_neon(pixel *dst, const ptrdiff_t stride,
const int enable_intra_edge_filter = angle >> 10;
angle &= 511;
int dx = dav1d_dr_intra_derivative[angle >> 1];
pixel top_out[64 + 64 + (64+15)*2];
pixel top_out[64 + 64 + (64+15)*2 + 16];
int max_base_x;
const int upsample_above = enable_intra_edge_filter ?
get_upsample(width + height, 90 - angle, is_sm) : 0;
if (upsample_above) {
BF(dav1d_ipred_z1_upsample_edge, neon)(top_out, width + height,
topleft_in,
width + imin(width, height));
width + imin(width, height)
HIGHBD_TAIL_SUFFIX);
max_base_x = 2 * (width + height) - 2;
dx <<= 1;
} else {
@ -102,7 +105,8 @@ static void ipred_z1_neon(pixel *dst, const ptrdiff_t stride,
}
const int base_inc = 1 + upsample_above;
int pad_pixels = width + 15; // max(dx >> 6) == 15
pixel_set(&top_out[max_base_x + 1], top_out[max_base_x], pad_pixels * base_inc);
BF(dav1d_ipred_pixel_set, neon)(&top_out[max_base_x + 1],
top_out[max_base_x], pad_pixels * base_inc);
if (upsample_above)
BF(dav1d_ipred_z1_fill2, neon)(dst, stride, top_out, width, height,
dx, max_base_x);
@ -140,12 +144,13 @@ static void ipred_z3_neon(pixel *dst, const ptrdiff_t stride,
const int upsample_left = enable_intra_edge_filter ?
get_upsample(width + height, angle - 180, is_sm) : 0;
if (upsample_left) {
flipped[0] = topleft_in[0];
flipped[0] = topleft_in[0];
BF(dav1d_ipred_reverse, neon)(&flipped[1], &topleft_in[0],
height + imax(width, height));
BF(dav1d_ipred_z1_upsample_edge, neon)(left_out, width + height,
flipped,
height + imin(width, height));
height + imin(width, height)
HIGHBD_TAIL_SUFFIX);
max_base_y = 2 * (width + height) - 2;
dy <<= 1;
} else {
@ -172,7 +177,8 @@ static void ipred_z3_neon(pixel *dst, const ptrdiff_t stride,
// the other implementation can read height + max(dy >> 6) past the end.
int pad_pixels = imax(64 - max_base_y - 1, height + 15);
pixel_set(&left_out[max_base_y + 1], left_out[max_base_y], pad_pixels * base_inc);
BF(dav1d_ipred_pixel_set, neon)(&left_out[max_base_y + 1],
left_out[max_base_y], pad_pixels * base_inc);
if (upsample_left)
BF(dav1d_ipred_z3_fill2, neon)(dst, stride, left_out, width, height,
dy, max_base_y);
@ -197,7 +203,7 @@ static ALWAYS_INLINE void intra_pred_dsp_init_arm(Dav1dIntraPredDSPContext *cons
c->intra_pred[SMOOTH_PRED] = BF(dav1d_ipred_smooth, neon);
c->intra_pred[SMOOTH_V_PRED] = BF(dav1d_ipred_smooth_v, neon);
c->intra_pred[SMOOTH_H_PRED] = BF(dav1d_ipred_smooth_h, neon);
#if ARCH_AARCH64 && BITDEPTH == 8
#if ARCH_AARCH64
c->intra_pred[Z1_PRED] = ipred_z1_neon;
c->intra_pred[Z3_PRED] = ipred_z3_neon;
#endif

19
third_party/dav1d/src/obu.c поставляемый
Просмотреть файл

@ -1560,7 +1560,14 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
if (c->n_fc == 1) {
dav1d_thread_picture_ref(&c->out,
&c->refs[c->frame_hdr->existing_frame_idx].p);
dav1d_data_props_copy(&c->out.p.m, &in->m);
dav1d_picture_copy_props(&c->out.p,
c->content_light, c->content_light_ref,
c->mastering_display, c->mastering_display_ref,
c->itut_t35, c->itut_t35_ref,
&in->m);
// Must be removed from the context after being attached to the frame
dav1d_ref_dec(&c->itut_t35_ref);
c->itut_t35 = NULL;
c->event_flags |= dav1d_picture_get_event_flags(&c->refs[c->frame_hdr->existing_frame_idx].p);
} else {
pthread_mutex_lock(&c->task_thread.lock);
@ -1606,7 +1613,15 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
dav1d_thread_picture_ref(out_delayed,
&c->refs[c->frame_hdr->existing_frame_idx].p);
out_delayed->visible = 1;
dav1d_data_props_copy(&out_delayed->p.m, &in->m);
dav1d_picture_copy_props(&out_delayed->p,
c->content_light, c->content_light_ref,
c->mastering_display, c->mastering_display_ref,
c->itut_t35, c->itut_t35_ref,
&in->m);
// Must be removed from the context after being attached to the frame
dav1d_ref_dec(&c->itut_t35_ref);
c->itut_t35 = NULL;
pthread_mutex_unlock(&c->task_thread.lock);
}
if (c->refs[c->frame_hdr->existing_frame_idx].p.p.frame_hdr->frame_type == DAV1D_FRAME_TYPE_KEY) {

26
third_party/dav1d/src/picture.c поставляемый
Просмотреть файл

@ -142,9 +142,6 @@ static int picture_alloc_with_edges(Dav1dContext *const c,
p->p.h = h;
p->seq_hdr = seq_hdr;
p->frame_hdr = frame_hdr;
p->content_light = content_light;
p->mastering_display = mastering_display;
p->itut_t35 = itut_t35;
p->p.layout = seq_hdr->layout;
p->p.bpc = bpc;
dav1d_data_props_set_defaults(&p->m);
@ -194,21 +191,38 @@ static int picture_alloc_with_edges(Dav1dContext *const c,
p->frame_hdr_ref = frame_hdr_ref;
if (frame_hdr_ref) dav1d_ref_inc(frame_hdr_ref);
dav1d_data_props_copy(&p->m, props);
dav1d_picture_copy_props(p, content_light, content_light_ref,
mastering_display, mastering_display_ref,
itut_t35, itut_t35_ref, props);
if (extra && extra_ptr)
*extra_ptr = &pic_ctx->extra_ptr;
return 0;
}
/*
 * Attach data properties and the optional metadata blocks to picture p.
 *
 * props is copied into p->m with dav1d_data_props_copy(). For each of the
 * three metadata kinds (content light level, mastering display, ITU-T T.35)
 * the reference currently stored in p is released with dav1d_ref_dec(), the
 * new pointer/reference pair is stored, and the new reference is incremented
 * when it is non-NULL. The caller keeps its own references; this function
 * only adds the ones owned by p.
 *
 * The function returns nothing: it is declared void, so it must not end with
 * a value-returning return statement.
 *
 * NOTE(review): the old reference is dropped before the new one is taken;
 * presumably callers never pass the very reference p already holds as its
 * last owner - confirm against dav1d_ref_dec() semantics.
 */
void dav1d_picture_copy_props(Dav1dPicture *const p,
                              Dav1dContentLightLevel *const content_light, Dav1dRef *const content_light_ref,
                              Dav1dMasteringDisplay *const mastering_display, Dav1dRef *const mastering_display_ref,
                              Dav1dITUTT35 *const itut_t35, Dav1dRef *const itut_t35_ref,
                              const Dav1dDataProps *const props)
{
    dav1d_data_props_copy(&p->m, props);

    /* content light level */
    dav1d_ref_dec(&p->content_light_ref);
    p->content_light_ref = content_light_ref;
    p->content_light = content_light;
    if (content_light_ref) dav1d_ref_inc(content_light_ref);

    /* mastering display */
    dav1d_ref_dec(&p->mastering_display_ref);
    p->mastering_display_ref = mastering_display_ref;
    p->mastering_display = mastering_display;
    if (mastering_display_ref) dav1d_ref_inc(mastering_display_ref);

    /* ITU-T T.35 payload */
    dav1d_ref_dec(&p->itut_t35_ref);
    p->itut_t35_ref = itut_t35_ref;
    p->itut_t35 = itut_t35;
    if (itut_t35_ref) dav1d_ref_inc(itut_t35_ref);
}
int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f,

6
third_party/dav1d/src/picture.h поставляемый
Просмотреть файл

@ -101,6 +101,12 @@ int dav1d_default_picture_alloc(Dav1dPicture *p, void *cookie);
void dav1d_default_picture_release(Dav1dPicture *p, void *cookie);
void dav1d_picture_unref_internal(Dav1dPicture *p);
void dav1d_picture_copy_props(Dav1dPicture *p,
Dav1dContentLightLevel *content_light, Dav1dRef *content_light_ref,
Dav1dMasteringDisplay *mastering_display, Dav1dRef *mastering_display_ref,
Dav1dITUTT35 *itut_t35, Dav1dRef *itut_t35_ref,
const Dav1dDataProps *props);
/**
* Get event flags from picture flags.
*/

1
third_party/dav1d/src/x86/ipred.h поставляемый
Просмотреть файл

@ -85,6 +85,7 @@ static ALWAYS_INLINE void intra_pred_dsp_init_x86(Dav1dIntraPredDSPContext *cons
init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, ssse3);
#if BITDEPTH == 8
init_angular_ipred_fn(Z1_PRED, ipred_z1, ssse3);
init_angular_ipred_fn(Z2_PRED, ipred_z2, ssse3);
init_angular_ipred_fn(Z3_PRED, ipred_z3, ssse3);
#endif
init_angular_ipred_fn(FILTER_PRED, ipred_filter, ssse3);

652
third_party/dav1d/src/x86/ipred_sse.asm поставляемый
Просмотреть файл

@ -81,6 +81,10 @@ z_filter_t_w16: db 15, 31, 7, 15, 31, 7, 3, 31, 3, 3, 3, 3, 3, 3, 0,
z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7
db 7, 8, 8, 9, 9, 10, 10, 11
z_filter_k_tail: db 0, 64, 0, 64, 8, 56, 0, 64
z2_h_shuf: db 7, 6, 15, 14, 6, 5, 14, 13, 5, 4, 13, 12, 4, 3, 12, 11
z2_upsample: db 7, 6, 15, 14, 5, 4, 13, 12, 3, 2, 11, 10, 1, 0, 9, 8
z2_dy_offset: dw 88*64, 88*64, 87*64, 87*64
pw_m1to4: dw -1, -2, -3, -4
z_filter_k: times 4 db 0, 16
times 4 db 0, 20
times 4 db 8, 16
@ -129,6 +133,7 @@ JMP_TABLE ipred_smooth_v, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_h, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_paeth, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_z1, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_z2, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_z3, ssse3, h4, h8, h16, h32, h64
JMP_TABLE pal_pred, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_cfl, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \
@ -1979,6 +1984,653 @@ ALIGN function_align
mova [tlq+16*1], m1
ret
%if ARCH_X86_64
cglobal ipred_z2_8bpc, 4, 12, 13, 16*16, dst, stride, tl, w, h, angle, dx, _, dy
%define base r7-$$
%define maxwm r6m
%define maxhm r7m
lea r7, [$$]
mov hd, hm
mova m8, [base+pw_62]
mova m9, [base+pw_64]
lea r9d, [wq-4]
mova m10, [base+pw_512]
shl r9d, 6
mova m11, [base+z1_shuf_w4]
or r9d, hd
mova m12, [base+z2_h_shuf]
%else
cglobal ipred_z2_8bpc, 4, 7, 8, -16*20, dst, _, tl, w, h, angle, dx
%define base r1-$$
%define m8 [base+pw_62]
%define m9 [base+pw_64]
%define m10 [base+pw_512]
%define m11 [rsp+16*16]
%define m12 [rsp+16*17]
%define r8 [rsp+16*6+4*1]
%define r9b byte [rsp+16*18+4*0]
%define r9d dword [rsp+16*18+4*0]
%define r10d dword [rsp+16*18+4*1]
%define r11d dword [rsp+16*18+4*2]
%define maxwm [rsp+16*18+4*3]
%define maxhm [rsp+16*19+4*0]
%define stridemp [rsp+16*19+4*1]
%define strideq r3
%define dyd r4
%define dyq r4
mov stridemp, r1
mov r1d, r6m
mov r4d, r7m
mov maxwm, r1d
mov maxhm, r4d
LEA r1, $$
lea hd, [wq-4]
mova m0, [base+z1_shuf_w4]
shl hd, 6
mova m1, [base+z2_h_shuf]
or hd, hm
mova m11, m0
mov r9d, hd
mova m12, m1
%endif
tzcnt wd, wd
movifnidn angled, anglem
movsxd wq, [base+ipred_z2_ssse3_table+wq*4]
%if ARCH_X86_64
movzx dxd, angleb
%else
movzx dxd, byte anglem
%endif
xor angled, 0x400
mova m0, [tlq-16*4]
mov dyd, dxd
mova m1, [tlq-16*3]
neg dxq
mova m2, [tlq-16*2]
and dyd, ~1
mova m3, [tlq-16*1]
and dxq, ~1
movd m4, [tlq]
movu m5, [tlq+16*0+1]
movu m6, [tlq+16*1+1]
movzx dyd, word [base+dr_intra_derivative+dyq-90] ; angle - 90
movzx dxd, word [base+dr_intra_derivative+dxq+180] ; 180 - angle
mova [rsp+16*2], m0
pxor m7, m7
mova [rsp+16*3], m1
pshufb m4, m7
mova [rsp+16*4], m2
lea wq, [base+ipred_z2_ssse3_table+wq]
mova [rsp+16*5], m3
neg dxd
mova [rsp+16*6], m4
or dyd, 4<<16
mova [rsp+16*7], m4
mova [rsp+16*8], m5
mova [rsp+16*9], m6
movq m0, [base+z_base_inc+2]
movsldup m1, [base+z2_dy_offset]
movq m2, [base+pw_256] ; 4<<6
movq [rsp+16*14+8*0], m0
movq [rsp+16*15+8*0], m1
movq [rsp+16*15+8*1], m2
%if ARCH_X86_64
lea r10d, [dxq+(128<<6)] ; xpos
%else
mov [rsp+16*7+4*1], dyd
lea r4d, [dxq+(128<<6)]
mov r10d, r4d
movzx hd, r9b
%endif
mov r11d, (128-4)<<6
jmp wq
.w4:
test angled, 0x400
jnz .w4_main
movd m5, [tlq+4]
lea r3d, [hq+2]
add angled, 1022
pshufb m5, m7
shl r3d, 6
movd [rsp+16*8+4], m5
test r3d, angled
jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
call .upsample_above
sub angled, 1075 ; angle - 53
lea r3d, [hq+3]
xor angled, 0x7f ; 180 - angle
movd m0, r3d
movd m6, angled
shr angled, 8 ; is_sm << 1
pshufb m0, m7
pshufb m6, m7
pcmpeqb m0, [base+z_filter_wh4]
pand m6, m0
pcmpgtb m6, [base+z_filter_t_w48+angleq*8]
jmp .w8_filter_left
; Local subroutine, entered with `call`: every rsp-relative address below
; carries an extra gprsize to skip the return address.
;
; Rewrites the 16 bytes at [rsp+16*8] of the caller's frame with an
; interleave of freshly filtered bytes and the original bytes, and adjusts
; the caller's stepping state to match:
;   - dx is doubled
;   - the four words at [rsp+16*14] are doubled
;   - r10d (commented as xpos where it is initialised) gets dx + (1<<6) added
;   - r11d is replaced by (128-7)<<6
;   - m11 is replaced by pb_0to15
; On x86-32 the last three live in stack slots ([rsp+16*18+4*1],
; [rsp+16*18+4*2] and [rsp+16*16]) and are updated there instead.
; In:  m10 (rounding multiplier for pmulhrsw), dxq/dxd.
; Clobbers: m0-m5, and r3d on x86-32.
.upsample_above: ; w4/w8
movq m3, [rsp+gprsize+16*8-2] ; 8 bytes starting 2 before the buffer
movq m1, [rsp+gprsize+16*8-1] ; ... 1 before
movq m0, [rsp+gprsize+16*8+0] ; ... at the buffer (kept as the "original" bytes)
movq m4, [rsp+gprsize+16*8+1] ; ... 1 after
movddup m5, [base+pb_36_m4] ; coefficient pairs (presumably 36, -4 per the name - TODO confirm)
punpcklbw m1, m3 ; byte pairs (p[i-1], p[i-2])
punpcklbw m2, m0, m4 ; byte pairs (p[i], p[i+1])
pmaddubsw m1, m5
pmaddubsw m2, m5
%if ARCH_X86_64
mova m11, [base+pb_0to15]
lea r10d, [r10+dxq+(1<<6)]
mov r11d, (128-7)<<6
%else
mova m3, [base+pb_0to15]
mov r3d, [rsp+gprsize+16*18+4*1] ; stack slot standing in for r10d
mov dword [rsp+gprsize+16*18+4*2], (128-7)<<6 ; stack slot standing in for r11d
lea r3d, [r3+dxq+(1<<6)]
mov [rsp+gprsize+16*18+4*1], r3d
mova [rsp+gprsize+16*16], m3 ; stack slot standing in for m11
%endif
add dxd, dxd ; dx *= 2
paddw m1, m2 ; sum of the four taps
pmulhrsw m1, m10 ; rounded scale-down by m10
movq m2, [rsp+gprsize+16*14]
paddw m2, m2 ; double the four words stored at [rsp+16*14]
movq [rsp+gprsize+16*14], m2
packuswb m1, m1 ; saturate the filtered words to bytes
punpcklbw m1, m0 ; interleave: filtered[i], original[i], ...
mova [rsp+gprsize+16*8], m1
ret
.w4_no_upsample_above:
lea r3d, [hq+3]
mov [rsp], angled
sub angled, 1112 ; angle - 90
movd m0, r3d
mov r3d, 90
movd m1, angled
sub r3d, angled ; 180 - angle
shr angled, 8 ; is_sm << 1
movu m3, [base+z_filter_wh4]
mova m4, [base+z_filter_t_w48+angleq*8]
call .w8_filter_top
mov angled, [rsp]
lea r3d, [hq+2]
sub angled, 139
shl r3d, 6
test r3d, angled
jnz .w8_filter_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
.upsample_left: ; w4/w8
neg hq
movd m0, [tlq+hq]
pshufb m0, m7
movd [rsp+16*6+hq-4], m0
movq m3, [rsp+16*5+7]
movq m0, [rsp+16*5+8]
movq m2, [rsp+16*5+9]
movq m4, [rsp+16*5+10]
movddup m5, [base+pb_36_m4]
punpcklbw m1, m0, m3
punpcklbw m2, m4
pmaddubsw m1, m5
pmaddubsw m2, m5
movshdup m3, [base+z2_dy_offset]
%if ARCH_X86_64
mova m12, [base+z2_upsample]
add dyd, dyd
%else
mova m4, [base+z2_upsample]
shl dword [rsp+16*7+4*1], 1
mova m12, m4
%endif
paddw m1, m2
pmulhrsw m1, m10
movq [rsp+16*15], m3
packuswb m1, m1
punpcklbw m0, m1
mova [rsp+16*5], m0
.w4_main:
movd m6, dxd
%if ARCH_X86_64
movd m3, dyd
%else
movd m3, [rsp+16*7+4*1]
%endif
movddup m0, [rsp+16*14+8*0]
pshufb m6, [base+pw_256]
paddw m7, m6, m6
movq m5, [base+pw_m1to4]
pshuflw m4, m3, q0000
punpcklqdq m6, m7
pmullw m4, m5
pshuflw m3, m3, q1111
paddw m6, m0
pshuflw m0, m4, q3333
psubw m4, [rsp+16*15]
movq [rsp+16*6+8*1], m3
movq [rsp+8*1], m0 ; dy*4
%if ARCH_X86_64
mov r8, dstq
%endif
.w4_loop0:
%if ARCH_X86_32
mov r8, dstq
%endif
mova [rsp+16*12], m6
mov r2d, r10d
movq [rsp+8*0], m4
pand m0, m4, m8
psraw m4, 6
psubw m1, m9, m0
psllw m0, 8
por m0, m1 ; 64-frac_y, frac_y
movq [rsp+8*3], m0
pabsw m4, m4
movq [rsp+8*2], m4
movzx hd, r9b
.w4_loop:
lea r3d, [r2+dxq]
shr r2d, 6 ; base_x0
movq m0, [rsp+r2]
lea r2d, [r3+dxq]
shr r3d, 6 ; base_x1
movhps m0, [rsp+r3]
lea r3d, [r2+dxq]
shr r2d, 6 ; base_x2
movq m1, [rsp+r2]
lea r2d, [r3+dxq]
shr r3d, 6 ; base_x3
movhps m1, [rsp+r3]
pand m2, m8, m6
paddsw m5, m6, m7
psubw m3, m9, m2
psllw m2, 8
pshufb m0, m11
por m2, m3
pmaddubsw m0, m2
pand m2, m8, m5
psubw m3, m9, m2
psllw m2, 8
pshufb m1, m11
por m2, m3
pmaddubsw m1, m2
cmp r3d, 127 ; topleft
jge .w4_toponly
movzx r3d, byte [rsp+8*2+0] ; base_y0
movq m3, [rsp+r3]
movzx r3d, byte [rsp+8*2+2] ; base_y1
movhps m3, [rsp+r3]
movzx r3d, byte [rsp+8*2+4] ; base_y2
movq m4, [rsp+r3]
movzx r3d, byte [rsp+8*2+6] ; base_y3
movhps m4, [rsp+r3]
pshufb m3, m12
pshufb m4, m12
punpckldq m2, m3, m4
punpckhdq m3, m4
movddup m4, [rsp+8*3]
pmaddubsw m2, m4
pmaddubsw m3, m4
psraw m6, 15 ; base_x < topleft
pand m2, m6
pandn m6, m0
por m0, m2, m6
psraw m6, m5, 15
pand m3, m6
pandn m6, m1
por m1, m3, m6
.w4_toponly:
pmulhrsw m0, m10
pmulhrsw m1, m10
movifnidn strideq, stridemp
packuswb m0, m1
movd [dstq+strideq*0], m0
pshuflw m1, m0, q1032
movd [dstq+strideq*1], m1
lea dstq, [dstq+strideq*2]
punpckhqdq m0, m0
movd [dstq+strideq*0], m0
psrlq m0, 32
movd [dstq+strideq*1], m0
sub hd, 4
jz .w4_end
movq m4, [rsp+8*2]
movq m3, [rsp+16*6+8*1]
paddw m6, m5, m7 ; xpos += dx
psubw m4, m3
movq [rsp+8*2], m4
lea dstq, [dstq+strideq*2]
cmp r2d, r11d
jge .w4_loop
movddup m5, [rsp+8*3]
.w4_leftonly_loop:
movzx r3d, byte [rsp+8*2+0] ; base_y0
movq m1, [rsp+r3]
movzx r3d, byte [rsp+8*2+2] ; base_y1
movhps m1, [rsp+r3]
movzx r3d, byte [rsp+8*2+4] ; base_y2
movq m2, [rsp+r3]
movzx r3d, byte [rsp+8*2+6] ; base_y3
movhps m2, [rsp+r3]
psubw m4, m3
pshufb m1, m12
pshufb m2, m12
movq [rsp+8*2], m4
punpckldq m0, m1, m2
punpckhdq m1, m2
pmaddubsw m0, m5
pmaddubsw m1, m5
movifnidn strideq, stridemp
pmulhrsw m0, m10
pmulhrsw m1, m10
packuswb m0, m1
movd [dstq+strideq*0], m0
pshuflw m1, m0, q1032
movd [dstq+strideq*1], m1
lea dstq, [dstq+strideq*2]
punpckhqdq m0, m0
movd [dstq+strideq*0], m0
psrlq m0, 32
movd [dstq+strideq*1], m0
lea dstq, [dstq+strideq*2]
sub hd, 4
jg .w4_leftonly_loop
.w4_end:
sub r9d, 1<<8
jl .w4_ret
movq m4, [rsp+8*1]
%if ARCH_X86_64
add r8, 4
mov dstq, r8
%else
mov dstq, r8
add dstq, 4
%endif
paddw m4, [rsp+8*0] ; base_y += 4*dy
movzx r3d, word [rsp+16*15+8*1]
add r10d, r3d
movddup m6, [rsp+16*15+8*1]
paddw m6, [rsp+16*12] ; base_x += (4 << upsample_above)
jmp .w4_loop0
.w4_ret:
RET
.w8:
test angled, 0x400
jnz .w4_main
movd m5, [tlq+8]
lea r3d, [angleq+126]
pshufb m5, m7
%if ARCH_X86_64
mov r3b, hb
%else
xor r3b, r3b
or r3d, hd
%endif
movd [rsp+16*8+8], m5
cmp r3d, 8
ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
call .upsample_above
sub angled, 53
lea r3d, [hq+7]
xor angled, 0x7f ; 180 - angle
movu m1, [base+z_filter_wh8]
movd m0, r3d
movd m6, angled
shr angled, 8 ; is_sm << 1
psrldq m2, [base+z_filter_t_w48+angleq*8], 4
pshufb m0, m7
pshufb m6, m7
pcmpeqb m0, m1
pand m6, m0
pcmpgtb m6, m2
%if ARCH_X86_64
movq [rsp+16*15+8*1], m10 ; 8<<6
%else
movq m0, m10
movq [rsp+16*15+8*1], m0
%endif
jmp .w8_filter_left
.w8_no_upsample_above:
lea r3d, [hq+7]
mov [rsp], angled
sub angled, 90
movd m0, r3d
mov r3d, 90
movd m1, angled
sub r3d, angled ; 180 - angle
shr angled, 8 ; is_sm << 1
movu m3, [base+z_filter_wh8]
psrldq m4, [base+z_filter_t_w48+angleq*8], 4
call .w8_filter_top
mov r3d, [rsp]
sub r3d, 141
%if ARCH_X86_64
mov r3b, hb
%else
xor r3b, r3b
or r3d, hd
%endif
cmp r3d, 8
jbe .upsample_left ; angle > 140 && h <= 8 && !is_sm
.w8_filter_left:
pmovmskb r5d, m6
test r5d, r5d
jz .w4_main
imul r5d, 0x55555555
mov r3, tlq
shr r5d, 30
sub r5, 3 ; filter_strength-3
jmp .filter_left
; Local subroutine, entered with `call` (hence the extra gprsize in the
; rsp-relative addresses). Decides whether the 8 bytes at [rsp+16*8] of the
; caller's frame need smoothing and, if so, filters them in place.
;
; In (as set up by the visible callers):
;   m0  - size key in the low byte, compared against the table row in m3
;   m1  - first angle value in the low byte (callers comment it "angle - 90")
;   r3d - second angle value (callers comment it "180 - angle")
;   m3  - row of z_filter_wh*, m4 - row of z_filter_t_w48 thresholds
;   m7  - pshufb control used to broadcast byte 0; all-zero from the
;         prologue's pxor on the paths that reach here
;   m10 - rounding multiplier for pmulhrsw
; Out:
;   m6  - comparison mask derived from r3d; .w8_filter_left later reads it
;         with pmovmskb to pick the left-edge filter
; Clobbers: m0-m2, r3d, r5, and m7 whenever the filter actually runs.
; NOTE(review): .upsample_left can run after this and uses m7 as a pshufb
; control; presumably the last coefficient row loaded into m7 is all-zero in
; every case that reaches it - verify against z_filter_k.
.w8_filter_top:
movd m6, r3d
REPX {pshufb x, m7}, m0, m1, m6 ; broadcast byte 0 of each register
pcmpeqb m0, m3 ; lanes whose table entry equals the size key
pand m1, m0
pand m6, m0
pcmpgtb m1, m4 ; threshold test for the value in m1
pcmpgtb m6, m4 ; threshold test for the value that came from r3d
pmovmskb r5d, m1
test r5d, r5d
jz .w8_filter_top_end ; filter_strength == 0
imul r5d, 0x55555555
movq m0, [rsp+gprsize+16*8-2]
shr r5d, 30
movq m1, [rsp+gprsize+16*8-1]
sub r5, 3 ; filter_strength-3
movddup m7, [base+z_filter_k+8*2+r5*8+24*0] ; coefficients for taps -2/-1
punpcklbw m0, m1
pmaddubsw m0, m7
movq m1, [rsp+gprsize+16*8+0]
movq m2, [rsp+gprsize+16*8+1]
movddup m7, [base+z_filter_k+8*2+r5*8+24*1] ; coefficients for taps 0/+1
punpcklbw m1, m2
pmaddubsw m1, m7
movq m2, [rsp+gprsize+16*8+2]
movddup m7, [base+z_filter_k+8*2+r5*8+24*2] ; coefficients for tap +2 (byte duplicated into both pair slots)
punpcklbw m2, m2
pmaddubsw m2, m7
paddw m0, m1
paddw m0, m2 ; m0 = sum over all taps
%if ARCH_X86_64
mov r3d, r7m ; maxw, offset due to call
%else
mov r3d, [rsp+gprsize+16*18+4*3] ; maxwm stack slot
%endif
pmulhrsw m0, m10
pmulhrsw m1, m10
packuswb m0, m1 ; only the low 8 bytes (from m0) are stored below
movq [rsp+gprsize+16*8], m0
cmp r3d, 8
jge .w8_filter_top_end
; maxw < 8: overwrite the buffer from index maxw onwards with 8 bytes read
; from the source edge at tlq+maxw+1
movq m0, [tlq+r3+1]
movq [rsp+gprsize+r3+16*8], m0
.w8_filter_top_end:
ret
.w16:
test angled, 0x400
jnz .w4_main
lea r3d, [hq+15]
sub angled, 90
movd m0, r3d
mov r3d, 90
movd m1, angled
sub r3d, angled ; 180 - angle
shr angled, 8 ; is_sm << 1
movd m6, r3d
REPX {pshufb x, m7}, m0, m1, m6
movq m3, [base+z_filter_t_w16+angleq*4]
pcmpeqb m0, [base+z_filter_wh16]
pand m1, m0
pand m6, m0
pcmpgtb m1, m3
pcmpgtb m6, m3
pmovmskb r5d, m1
mov r3, tlq
test r5d, r5d
jz .w16_filter_left ; filter_strength == 0
imul r5d, 0x24924924
pshufb m5, [base+z_filter_t_w16] ; tlq[16]
shr r5d, 30
adc r5, -4 ; filter_strength-3
movd [rsp+16*9], m5
movddup m7, [base+z_filter_k+8*2+r5*8+24*0]
movu m1, [rsp+16*8-2]
movu m2, [rsp+16*8-1]
punpcklbw m0, m1, m2
pmaddubsw m0, m7
punpckhbw m1, m2
pmaddubsw m1, m7
movddup m7, [base+z_filter_k+8*2+r5*8+24*1]
mova m3, [rsp+16*8+0]
movu m4, [rsp+16*8+1]
punpcklbw m2, m3, m4
pmaddubsw m2, m7
punpckhbw m3, m4
pmaddubsw m3, m7
paddw m0, m2
paddw m1, m3
test r5d, r5d
jnz .w16_filter_end ; 3-tap
movddup m7, [base+z_filter_k+8*8]
movu m3, [rsp+16*8+2]
punpcklbw m2, m3, m3
pmaddubsw m2, m7
punpckhbw m3, m3
pmaddubsw m3, m7
paddw m0, m2
paddw m1, m3
.w16_filter_end:
mov r2d, maxwm
pmulhrsw m0, m10
pmulhrsw m1, m10
packuswb m0, m1
mova [rsp+16*8], m0
cmp r2d, 16
jge .w16_filter_left
movu m0, [r3+r2+1]
movu [rsp+r2+16*8], m0
.w16_filter_left:
pmovmskb r5d, m6
test r5d, r5d
jz .w4_main
imul r5d, 0x24924924
shr r5d, 30
adc r5, -4 ; filter_strength-3
jmp .filter_left
.w32:
test angled, 0x400
jnz .w4_main
pshufb m6, [base+z_filter_t_w16] ; tlq[32]
mov r3, tlq
lea tlq, [rsp+16*9]
movd [tlq+16*1], m6
xor r5d, r5d ; filter_strength = 3
call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
mova m0, [tlq+16*0]
mova m1, [tlq+16*1]
mov r2d, maxwm
mova [rsp+16*8], m0
mova [rsp+16*9], m1
cmp r2d, 32
jge .filter_left
movu m0, [r3+r2+16*0+1]
movu m1, [r3+r2+16*1+1]
movu [rsp+r2+16*8], m0
movu [rsp+r2+16*9], m1
jmp .filter_left
.w64:
movu m0, [tlq+16*2+1]
movu m1, [tlq+16*3+1]
mova [rsp+16*10], m0
mova [rsp+16*11], m1
test angled, 0x400
jnz .w4_main
pshufb m1, [base+z_filter_t_w16] ; tlq[64]
mov r3, tlq
lea tlq, [rsp+16*11]
movd [tlq+16*1], m1
xor r5d, r5d ; filter_strength = 3
call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
sub tlq, 16*2
call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
mova m0, [tlq+16*0]
mova m1, [tlq+16*1]
mova m2, [tlq+16*2]
mova m3, [tlq+16*3]
mov r2d, maxwm
mova [rsp+16* 8], m0
mova [rsp+16* 9], m1
mova [rsp+16*10], m2
mova [rsp+16*11], m3
cmp r2d, 64
jge .filter_left
movu m0, [r3+r2+16*0+1]
movu m1, [r3+r2+16*1+1]
movu [rsp+r2+16* 8], m0
movu [rsp+r2+16* 9], m1
cmp r2d, 32
jge .filter_left
movu m0, [r3+r2+16*2+1]
movu m1, [r3+r2+16*3+1]
movu [rsp+r2+16*10], m0
movu [rsp+r2+16*11], m1
.filter_left:
neg hq
movd m0, [r3+hq]
pxor m1, m1
pshufb m0, m1
movd [rsp+16*6+hq-4], m0
lea tlq, [rsp+16*5]
call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
cmp hd, -32
jge .filter_left_end
sub tlq, 16*2
call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
mova m0, [tlq+16*0]
mova m1, [tlq+16*1]
mova [rsp+16*2], m0
mova [rsp+16*3], m1
.filter_left_end:
mov r2d, maxhm
mova m0, [rsp+16*5]
mova m1, [rsp+16*6]
mova m2, [rsp+16*7]
neg r2
mova [rsp+16*4], m0
mova [rsp+16*5], m1
mova [rsp+16*6], m2
cmp r2d, hd
jle .w4_main
movu m0, [r3+r2-16*2]
movu m1, [r3+r2-16*1]
movu [rsp+r2+16*4], m0
movu [rsp+r2+16*5], m1
cmp r2d, -32
jle .w4_main
movu m0, [r3+r2-16*4]
movu m1, [r3+r2-16*3]
movu [rsp+r2+16*2], m0
movu [rsp+r2+16*3], m1
jmp .w4_main
%if ARCH_X86_64
cglobal ipred_z3_8bpc, 4, 9, 11, 16*10, dst, stride, tl, w, h, angle, dy, _, org_w
%define base r7-$$

4
third_party/dav1d/src/x86/itx.h поставляемый
Просмотреть файл

@ -317,6 +317,9 @@ static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, cons
assign_itx16_bpc_fn(R, 16, 8, 12, avx2);
assign_itx12_bpc_fn( , 16, 16, 12, avx2);
assign_itx2_bpc_fn (R, 32, 8, 12, avx2);
assign_itx_bpc_fn(R, 16, 32, identity_identity, IDTX, 12, avx2);
assign_itx_bpc_fn(R, 32, 16, identity_identity, IDTX, 12, avx2);
assign_itx_bpc_fn( , 32, 32, identity_identity, IDTX, 12, avx2);
}
#endif
@ -353,6 +356,7 @@ static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, cons
assign_itx2_bpc_fn (R, 32, 8, 10, avx512icl);
assign_itx2_bpc_fn (R, 32, 16, 10, avx512icl);
assign_itx2_bpc_fn ( , 32, 32, 10, avx512icl);
assign_itx1_bpc_fn (R, 16, 64, 10, avx512icl);
}
#endif
#endif

47
third_party/dav1d/src/x86/itx16_avx2.asm поставляемый
Просмотреть файл

@ -3137,10 +3137,14 @@ INV_TXFM_8X16_FN identity, adst
INV_TXFM_8X16_FN identity, flipadst
INV_TXFM_8X16_FN identity, identity
%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16394]
%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384]
pmulhrsw m%2, m%3, m%1
%if %0 == 4 ; if downshifting by 1
%ifnum %4
pmulhrsw m%2, m%4
%else ; without rounding
psraw m%2, 1
%endif
%else
paddsw m%1, m%1
%endif
@ -6837,10 +6841,11 @@ ALIGN function_align
ret
cglobal inv_txfm_add_identity_identity_16x32_10bpc, 4, 7, 12, dst, stride, c, eob
vpbroadcastd m7, [pixel_10bpc_max]
.pass1:
vpbroadcastd m8, [pw_2896x8]
vpbroadcastd m9, [pw_1697x16]
vpbroadcastd m11, [pw_8192]
vpbroadcastd m7, [pixel_10bpc_max]
lea r6, [strideq*5]
pxor m6, m6
paddw m10, m11, m11 ; pw_16384
@ -6910,11 +6915,15 @@ ALIGN function_align
punpckhqdq m1, m3, m2
jmp m(iidentity_8x8_internal_10bpc).write_2x8x2
; 12 bpc entry point for the 16x32 identity/identity transform. It only
; broadcasts pixel_12bpc_max into m7 (presumably the clamp limit used when
; pixels are written - TODO confirm) and then continues in the 10 bpc
; function at its .pass1 label, which follows that function's own m7 load.
cglobal inv_txfm_add_identity_identity_16x32_12bpc, 4, 7, 12, dst, stride, c, eob
vpbroadcastd m7, [pixel_12bpc_max]
jmp m(inv_txfm_add_identity_identity_16x32_10bpc).pass1
cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob
test eobd, eobd
jz .dconly
PROLOGUE 0, 8, 16, 32*40, dst, stride, c, eob
%undef cmp
%undef cmp
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
lea r6, [rsp+32*4]
@ -7136,10 +7145,11 @@ ALIGN function_align
jmp m(idct_16x8_internal_10bpc).write_16x4
cglobal inv_txfm_add_identity_identity_32x16_10bpc, 4, 7, 11, dst, stride, c, eob
vpbroadcastd m7, [pixel_10bpc_max]
.pass1:
vpbroadcastd m8, [pw_2896x8]
vpbroadcastd m9, [pw_1697x16]
vpbroadcastd m10, [pw_2048]
vpbroadcastd m7, [pixel_10bpc_max]
vpbroadcastd m10, [pw_4096]
lea r6, [strideq*5]
pxor m6, m6
mov r5, dstq
@ -7187,16 +7197,20 @@ ALIGN function_align
packssdw m3, [cq+64*7]
REPX {pmulhrsw x, m8 }, m0, m1, m2, m3
REPX {paddsw x, x }, m0, m1, m2, m3
REPX {IDTX16 x, 4, 9 }, 0, 1, 2, 3
REPX {IDTX16 x, 4, 9, _ }, 0, 1, 2, 3
REPX {pmulhrsw x, m10}, m0, m1, m2, m3
REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2
; 12 bpc entry point for the 32x16 identity/identity transform. It only
; broadcasts pixel_12bpc_max into m7 (presumably the clamp limit used when
; pixels are written - TODO confirm) and then continues in the 10 bpc
; function at its .pass1 label, which follows that function's own m7 load.
cglobal inv_txfm_add_identity_identity_32x16_12bpc, 4, 7, 11, dst, stride, c, eob
vpbroadcastd m7, [pixel_12bpc_max]
jmp m(inv_txfm_add_identity_identity_32x16_10bpc).pass1
cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob
test eobd, eobd
jz .dconly
PROLOGUE 0, 8, 16, 32*83, dst, stride, c, eob
%undef cmp
%undef cmp
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
lea r6, [rsp+32*7]
@ -7364,9 +7378,10 @@ ALIGN function_align
jmp m(idct_16x16_internal_8bpc).main
cglobal inv_txfm_add_identity_identity_32x32_10bpc, 4, 8, 8, dst, stride, c, eob
%undef cmp
vpbroadcastd m5, [pw_8192]
%undef cmp
vpbroadcastd m7, [pixel_10bpc_max]
.pass1:
vpbroadcastd m5, [pw_8192]
pxor m6, m6
lea r6, [strideq*3]
lea r5, [strideq*5]
@ -7432,6 +7447,10 @@ ALIGN function_align
REPX {pmulhrsw x, m5}, m0, m1, m2, m3
jmp m(inv_txfm_add_identity_identity_8x32_10bpc).main_zero
; 12 bpc entry point for the 32x32 identity/identity transform.
; It only broadcasts pixel_12bpc_max into m7 and then jumps to the .pass1
; label of the 10 bpc function, so everything after that label is shared.
cglobal inv_txfm_add_identity_identity_32x32_12bpc, 4, 8, 8, dst, stride, c, eob
vpbroadcastd m7, [pixel_12bpc_max]
jmp m(inv_txfm_add_identity_identity_32x32_10bpc).pass1
%macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4])
%if %1 & 1
mova m%5, [r5-32*(51-%1)] ; idct16 out 0+n
@ -7472,7 +7491,7 @@ cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob
test eobd, eobd
jz .dconly
PROLOGUE 0, 10, 16, 32*98, dst, stride, c, eob
%undef cmp
%undef cmp
vpbroadcastd m11, [pd_2048]
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
@ -7814,7 +7833,7 @@ cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob
test eobd, eobd
jz .dconly
PROLOGUE 0, 11, 16, 32*134, dst, stride, c, eob
%undef cmp
%undef cmp
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
lea r6, [rsp+32*6]
@ -8043,7 +8062,7 @@ cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob
RET
.normal:
PROLOGUE 0, 8, 16, 32*96, dst, stride, c, eob
%undef cmp
%undef cmp
vpbroadcastd m11, [pd_2048]
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
@ -8262,7 +8281,7 @@ cglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst, stride, c, eob
test eobd, eobd
jz .dconly
PROLOGUE 0, 8, 16, 32*163, dst, stride, c, eob
%undef cmp
%undef cmp
vpbroadcastd m11, [pd_2048]
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]
@ -8411,7 +8430,7 @@ cglobal inv_txfm_add_dct_dct_64x64_10bpc, 4, 7, 0, dst, stride, c, eob
test eobd, eobd
jz .dconly
PROLOGUE 0, 11, 16, 32*195, dst, stride, c, eob
%undef cmp
%undef cmp
vpbroadcastd m11, [pd_2048]
vpbroadcastd m12, [clip_18b_min]
vpbroadcastd m13, [clip_18b_max]

315
third_party/dav1d/src/x86/itx16_avx512.asm поставляемый
Просмотреть файл

@ -174,6 +174,8 @@ cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf_fast2
cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf
cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast
cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast2
cextern inv_txfm_add_dct_dct_16x64_8bpc_avx512icl.main_oddhalf
cextern inv_txfm_add_dct_dct_16x64_8bpc_avx512icl.main_oddhalf_fast
SECTION .text
@ -3815,4 +3817,317 @@ cglobal inv_txfm_add_identity_identity_32x32_10bpc, 4, 7, 16, dst, stride, c, eo
punpckhdq m8, m0 ; 6 7
ret
; 16x64 DCT_DCT inverse transform + add, 10 bpc, AVX-512.
; Path selection by eob:
;   eob == 0   -> .dconly
;   eob <  36  -> .fast
;   eob <  151 -> one .pass1 call, then the "_fast" 8bpc helper routines
;   eob >= 151 -> .full (two .pass1 calls, then the full-size 8bpc helpers)
; Every path except .dconly converges on .pass2_end, which emits the result
; through repeated calls to idct_16x8_internal_10bpc.write_16x4.
cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob
lea r5, [o_base]
test eobd, eobd
jz .dconly
; NOTE(review): the -8*mmsize stack argument presumably provides the eight
; mmsize-sized slots addressed below as [rsp+mmsize*0..7] - confirm in x86inc.
PROLOGUE 4, 7, 32, -8*mmsize, dst, stride, c, eob
%undef cmp
vpbroadcastd m12, [o(pd_2896)]
vpbroadcastd m13, [o(pd_2048)]
vpbroadcastd m14, [o(clip_18b_min)]
vpbroadcastd m15, [o(clip_18b_max)]
cmp eobd, 36
jl .fast
call .pass1
cmp eobd, 151
jge .full
; --- 36 <= eob < 151: only m0-m7 from the single .pass1 call are used ---
; r5 is re-pointed at o_base_8bpc ahead of the calls into the 8bpc routines.
lea r5, [o_base_8bpc]
; punpck{l,h}wd x, y, y duplicates each selected 16-bit word of y into both
; halves of a dword.
punpckhwd m22, m0, m0
punpckhwd m23, m1, m1
punpckhwd m24, m2, m2
punpckhwd m25, m3, m3
punpckhwd m26, m4, m4
punpckhwd m27, m5, m5
punpckhwd m28, m6, m6
punpckhwd m29, m7, m7
punpcklwd m21, m1, m1
punpcklwd m14, m3, m3
punpcklwd m18, m5, m5
punpcklwd m15, m7, m7
; m9: zero in the low word and a word of m0 in the high word of each dword
pxor m9, m9
punpcklwd m9, m9, m0
punpcklwd m8, m2, m2
punpcklwd m7, m4, m4
punpcklwd m1, m6, m6
call m(idct_16x16_internal_8bpc).main_fast2
call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
; spill m14-m21 to the stack; .pass2_end reloads these eight slots
mova [rsp+mmsize*0], m14
mova [rsp+mmsize*1], m15
mova [rsp+mmsize*2], m16
mova [rsp+mmsize*3], m17
mova [rsp+mmsize*4], m18
mova [rsp+mmsize*5], m19
mova [rsp+mmsize*6], m20
mova [rsp+mmsize*7], m21
call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
; Zero the coefficient buffer: four iterations (r3*8 = 1536, 1024, 512, 0) of
; four stores each, i.e. one m-register store at every cq+128*k for k = 0..15
; (64 bytes each, assuming zmm-sized m registers).
pxor m12, m12
mov r3d, 64*3
.zero_loop:
REPX {mova [cq+r3*8+128*x], m12}, 0, 1, 2, 3
sub r3d, 64
jge .zero_loop
jmp .pass2_end
.full:
; --- eob >= 151 ---
; Park the first .pass1 result in cq+128*0..7, then run .pass1 again with cq
; advanced by 64 bytes so that it reads the other half of each 128-byte row.
mova [cq+128*0], m0
mova [cq+128*1], m1
mova [cq+128*2], m2
mova [cq+128*3], m3
mova [cq+128*4], m4
mova [cq+128*5], m5
mova [cq+128*6], m6
mova [cq+128*7], m7
add cq, 64
call .pass1
sub cq, 64
; reload the first result into m22-m29 ...
mova m22, [cq+128*0] ; 0 1
mova m23, [cq+128*1] ; 2 3
mova m24, [cq+128*2] ; 4 5
mova m25, [cq+128*3] ; 6 7
mova m26, [cq+128*4] ; 8 9
mova m27, [cq+128*5] ; 10 11
mova m28, [cq+128*6] ; 12 13
mova m29, [cq+128*7] ; 14 15
; ... and park the second result (m0-m7) in cq+64*8..15 for the reload below
mova [cq+64* 8], m0
mova [cq+64* 9], m1
mova [cq+64*10], m2
mova [cq+64*11], m3
mova [cq+64*12], m4
mova [cq+64*13], m5
mova [cq+64*14], m6
mova [cq+64*15], m7
; r5 is re-pointed at o_base_8bpc ahead of the calls into the 8bpc routines.
lea r5, [o_base_8bpc]
punpcklwd m20, m1, m1
punpcklwd m16, m3, m3
punpcklwd m19, m5, m5
punpcklwd m17, m7, m7
punpcklwd m8, m24, m24 ; 4
punpcklwd m5, m2, m2 ; 20
punpcklwd m1, m28, m28 ; 12
punpcklwd m7, m26, m26 ; 8
punpcklwd m3, m4, m4 ; 24
punpcklwd m4, m6, m6 ; 28
pxor m9, m9
punpcklwd m6, m9, m0 ; __ 16
mova m0, m4
punpcklwd m9, m9, m22 ; __ 0
call m(idct_16x16_internal_8bpc).main_fast
punpcklwd m21, m23, m23 ; 2
punpcklwd m15, m29, m29 ; 14
punpcklwd m18, m27, m27 ; 10
punpcklwd m14, m25, m25 ; 6
call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
; spill m14-m21 to the stack; .pass2_end reloads these eight slots
mova [rsp+mmsize*0], m14
mova [rsp+mmsize*1], m15
mova [rsp+mmsize*2], m16
mova [rsp+mmsize*3], m17
mova [rsp+mmsize*4], m18
mova [rsp+mmsize*5], m19
mova [rsp+mmsize*6], m20
mova [rsp+mmsize*7], m21
; reload the second .pass1 result that was parked in cq+64*8..15
mova m21, [cq+64*15]
mova m14, [cq+64* 8]
mova m17, [cq+64*11]
mova m18, [cq+64*12]
mova m19, [cq+64*13]
mova m16, [cq+64*10]
mova m15, [cq+64* 9]
mova m20, [cq+64*14]
REPX {punpckhwd x, x}, m22, m21, m14, m29, m26, m17, m18, m25, \
m24, m19, m16, m27, m28, m15, m20, m23
call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf
; Zero the coefficient buffer: eight iterations (r3*8 = 1792 down to 0 in
; steps of 256) of four stores at a 64-byte stride, i.e. 32 back-to-back
; m-register stores starting at cq (2048 bytes, assuming zmm-sized m registers).
pxor m12, m12
mov r3d, 32*7
.full_zero_loop:
REPX {mova [cq+r3*8+64*x], m12}, 0, 1, 2, 3
sub r3d, 32
jge .full_zero_loop
jmp .pass2_end
.fast:
; --- eob < 36: only 32 bytes (ym loads) are read from each of cq+128*0..7 ---
mova ym0, [cq+128*0]
mova ym2, [cq+128*4]
movshdup m8, [o(permB)]
mova ym1, [cq+128*2]
mova ym3, [cq+128*6]
mova ym4, [cq+128*1]
mova ym5, [cq+128*3]
mova ym6, [cq+128*5]
mova ym7, [cq+128*7]
vpermt2q m0, m8, m2 ; 0 4
vpermt2q m1, m8, m3 ; 2 6
vpermt2q m4, m8, m5 ; 1 3
vpermt2q m7, m8, m6 ; 7 5
call m(idct_8x8_internal_10bpc).main_fast
call m(idct_16x8_internal_10bpc).main_fast
vpbroadcastd m11, [o(pd_2)]
call m(idct_8x16_internal_10bpc).main_end2
mova m8, [o(idct8x32p)]
; pack the 32-bit results down to saturated 16-bit words
packssdw m0, m4
packssdw m1, m5
packssdw m2, m6
packssdw m3, m7
mova m6, [dup16_perm]
vpermb m0, m8, m0
vpermb m2, m8, m2
vprold m8, 16
vpermb m1, m8, m1
vpermb m3, m8, m3
punpckldq m4, m0, m2
punpckhdq m0, m2
punpckldq m2, m1, m3
punpckhdq m1, m3
punpckldq m21, m4, m2
punpckhdq m14, m4, m2
punpckldq m18, m0, m1
punpckhdq m15, m0, m1
; m7 = the dup16_perm byte indices with the pb_32 bits OR-ed in
vpord m7, m6, [o(pb_32)] {1to16}
vpermb m22, m7, m21 ; 1
pmovzxwd m9, ym21 ; 0
vpermb m8, m6, m18 ; 4
vpermb m24, m7, m18 ; 5
vpermb m21, m6, m14 ; 2
vpermb m23, m7, m14 ; 3
vpermb m14, m6, m15 ; 6
vpermb m25, m7, m15 ; 7
; r5 is re-pointed at o_base_8bpc ahead of the calls into the 8bpc routines.
lea r5, [o_base_8bpc]
; move the zero-extended words of m9 into the high half of each dword
pslld m9, 16
; these registers are cleared before the 8bpc helpers run
pxor m7, m7
REPX {mova x, m7}, m1, m18, m15, m26, m27, m28, m29
call m(idct_16x16_internal_8bpc).main_fast2
call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
; spill m14-m21 to the stack; .pass2_end reloads these eight slots
mova [rsp+mmsize*0], m14
mova [rsp+mmsize*1], m15
mova [rsp+mmsize*2], m16
mova [rsp+mmsize*3], m17
mova [rsp+mmsize*4], m18
mova [rsp+mmsize*5], m19
mova [rsp+mmsize*6], m20
mova [rsp+mmsize*7], m21
call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
; zero exactly the 32-byte pieces of cq that were loaded at the top of .fast
pxor m12, m12
REPX {mova [cq+128*x], ym12}, 0, 1, 2, 3, 4, 5, 6, 7
.pass2_end:
; Common output stage. m30/m31 are the qword permutation controls applied to
; each register pair before it goes to write_16x4.
; NOTE(review): m11 (pw_2048), m13 (pixel_10bpc_max) and r6 (stride*3) are
; presumably inputs of write_16x4, which is defined outside this chunk.
movshdup m30, [permC]
vpbroadcastd m11, [pw_2048]
vpbroadcastd m13, [pixel_10bpc_max]
lea r6, [strideq*3]
psrlq m31, m30, 8
vpermq m8, m30, m0
vpermq m9, m31, m1
call m(idct_16x8_internal_10bpc).write_16x4
vpermq m8, m30, m2
vpermq m9, m31, m3
call m(idct_16x8_internal_10bpc).write_16x4
vpermq m8, m30, m4
vpermq m9, m31, m5
call m(idct_16x8_internal_10bpc).write_16x4
vpermq m8, m30, m6
vpermq m9, m31, m7
call m(idct_16x8_internal_10bpc).write_16x4
; reload the eight vectors spilled to the stack before .pass2_end
mova m1, [rsp+mmsize*0]
mova m2, [rsp+mmsize*1]
mova m3, [rsp+mmsize*2]
mova m4, [rsp+mmsize*3]
mova m5, [rsp+mmsize*4]
mova m6, [rsp+mmsize*5]
mova m7, [rsp+mmsize*6]
mova m8, [rsp+mmsize*7]
; saturating butterflies: stack slot k is combined with m(21-k); the sums go
; to m0-m7 and the differences overwrite m21 down to m14
paddsw m0, m1, m21
psubsw m21, m1, m21
paddsw m1, m2, m20
psubsw m20, m2, m20
paddsw m2, m3, m19
psubsw m19, m3, m19
paddsw m3, m4, m18
psubsw m18, m4, m18
paddsw m4, m5, m17
psubsw m17, m5, m17
paddsw m5, m6, m16
psubsw m16, m6, m16
paddsw m6, m7, m15
psubsw m15, m7, m15
paddsw m7, m8, m14
psubsw m14, m8, m14
; write out m0-m7, then m14-m21, then m22-m29, one register pair per call
vpermq m8, m30, m0
vpermq m9, m31, m1
call m(idct_16x8_internal_10bpc).write_16x4
vpermq m8, m30, m2
vpermq m9, m31, m3
call m(idct_16x8_internal_10bpc).write_16x4
vpermq m8, m30, m4
vpermq m9, m31, m5
call m(idct_16x8_internal_10bpc).write_16x4
vpermq m8, m30, m6
vpermq m9, m31, m7
call m(idct_16x8_internal_10bpc).write_16x4
vpermq m8, m30, m14
vpermq m9, m31, m15
call m(idct_16x8_internal_10bpc).write_16x4
vpermq m8, m30, m16
vpermq m9, m31, m17
call m(idct_16x8_internal_10bpc).write_16x4
vpermq m8, m30, m18
vpermq m9, m31, m19
call m(idct_16x8_internal_10bpc).write_16x4
vpermq m8, m30, m20
vpermq m9, m31, m21
call m(idct_16x8_internal_10bpc).write_16x4
vpermq m8, m30, m22
vpermq m9, m31, m23
call m(idct_16x8_internal_10bpc).write_16x4
vpermq m8, m30, m24
vpermq m9, m31, m25
call m(idct_16x8_internal_10bpc).write_16x4
vpermq m8, m30, m26
vpermq m9, m31, m27
call m(idct_16x8_internal_10bpc).write_16x4
vpermq m8, m30, m28
vpermq m9, m31, m29
call m(idct_16x8_internal_10bpc).write_16x4
RET
.pass1:
; Local subroutine (reached via "call .pass1"). Loads the even-numbered
; 128-byte rows of cq into m0-m7 and the odd-numbered rows into m16-m23, with
; the 10bpc helper routines called in between and afterwards.
mova m0, [cq+128* 0]
mova m1, [cq+128* 2]
mova m2, [cq+128* 4]
mova m3, [cq+128* 6]
mova m4, [cq+128* 8]
mova m5, [cq+128*10]
mova m6, [cq+128*12]
mova m7, [cq+128*14]
call m(idct_8x16_internal_10bpc).main
mova m16, [cq+128* 1]
mova m17, [cq+128* 3]
mova m18, [cq+128* 5]
mova m19, [cq+128* 7]
mova m20, [cq+128* 9]
mova m21, [cq+128*11]
mova m22, [cq+128*13]
mova m23, [cq+128*15]
call m(idct_16x16_internal_10bpc).main
call m(idct_16x16_internal_10bpc).main_end
; tail jump: presumably main_end3 ends in ret, returning to .pass1's caller
jmp m(idct_16x16_internal_10bpc).main_end3
.dconly:
; Reached only when eobd == 0 (jz after "test eobd, eobd"), so the store of
; eobd below writes zero over the coefficient that was just read.
imul r6d, [cq], 181
mov [cq], eobd
; NOTE(review): r3 is presumably the eob register (4th named argument), in
; which case this leaves r3d = 64 - confirm what dconly2 expects.
or r3d, 64
; r6d = (coef * 181 + 640) >> 10
add r6d, 640
sar r6d, 10
jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly2
%endif ; ARCH_X86_64

4
third_party/dav1d/src/x86/itx_avx512.asm поставляемый
Просмотреть файл

@ -5143,7 +5143,7 @@ cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 7, 0, dst, stride, c, eob
sar r6d, 8+2
jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3
ALIGN function_align
.main_oddhalf_fast: ; bottom three-quarters are zero
cglobal_label .main_oddhalf_fast ; bottom three-quarters are zero
vpbroadcastd m8, [o(pw_101_4095x8)]
vpbroadcastd m21, [o(pw_m1474_3822x8)]
vpbroadcastd m14, [o(pw_897_3996x8)]
@ -5170,7 +5170,7 @@ ALIGN function_align
mova m20, m15
jmp .main_oddhalf2
ALIGN function_align
.main_oddhalf:
cglobal_label .main_oddhalf
vpbroadcastd m8, [o(pw_101_4095x8)]
vpbroadcastd m9, [o(pw_m2824_2967x8)]
vpbroadcastd m11, [o(pw_1660_3745x8)]

128
third_party/dav1d/src/x86/refmvs.asm поставляемый
Просмотреть файл

@ -57,6 +57,7 @@ save_pack0: db 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0
save_pack1: db 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2
db 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3
save_ref_shuf: db 0, -1, -1, -1, 1, -1, -1, -1, 8, -1, -1, -1, 9, -1, -1, -1
cond_shuf512: db 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 3, 3
save_cond0: db 0x80, 0x81, 0x82, 0x83, 0x89, 0x84, 0x00, 0x00
save_cond1: db 0x84, 0x85, 0x86, 0x87, 0x88, 0x80, 0x00, 0x00
pb_128: times 16 db 128
@ -74,6 +75,12 @@ save_tmvs_avx2_table: SAVE_TMVS_TABLE 2, 16, avx2
SAVE_TMVS_TABLE 5, 2, avx2
SAVE_TMVS_TABLE 7, 1, avx2
save_tmvs_avx512icl_table: SAVE_TMVS_TABLE 2, 16, avx512icl
SAVE_TMVS_TABLE 4, 8, avx512icl
SAVE_TMVS_TABLE 4, 4, avx512icl
SAVE_TMVS_TABLE 5, 2, avx512icl
SAVE_TMVS_TABLE 7, 1, avx512icl
JMP_TABLE splat_mv_avx512icl, 1, 2, 4, 8, 16, 32
JMP_TABLE splat_mv_avx2, 1, 2, 4, 8, 16, 32
%endif
@ -170,8 +177,6 @@ cglobal save_tmvs, 6, 7, 8, rp, stride, rr, ref_sign, \
%define rpq r3
%define r10 r1
%define r10d r1
%define r10w r1w
%define r10b r1b
%define r11 r4
%define r11d r4
%endif
@ -486,6 +491,125 @@ cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
jg .loop
RET
INIT_ZMM avx512icl
; AVX-512 version of save_tmvs.
; Per output row (.loop_y) the inner loop (.loop_x) gathers up to four
; candidate blocks into the four 128-bit lanes of m0, computes a per-candidate
; byte shuffle in .calc, and then calls one of the .write1 ... .write16 stubs
; per lane through r10-r13. Each stub ends with "add xq, 5*N / ret"; call and
; ret do not modify flags, so the jge/jl that follows each call tests the sign
; of the updated xq.
; refmvs_temporal_block *rp, ptrdiff_t stride,
; refmvs_block **rr, uint8_t *ref_sign,
; int col_end8, int row_end8, int col_start8, int row_start8
cglobal save_tmvs, 4, 15, 10, rp, stride, rr, ref_sign, \
xend, yend, xstart, ystart
; r14 holds the address of .write1; "base+sym" therefore addresses sym
; relative to r14, and the table bytes added to r14 below are stub addresses.
%define base r14-.write1
lea r14, [.write1]
movifnidn xendd, xendm
movifnidn yendd, yendm
mov xstartd, xstartm
mov ystartd, ystartm
; m4 = the qword at [ref_sign], broadcast and shifted left by one byte
psllq m4, [ref_signq]{bcstq}, 8
vpbroadcastq m3, [base+save_ref_shuf+8]
vbroadcasti32x4 m5, [base+cond_shuf512]
vbroadcasti32x4 m6, [base+save_cond0]
vpbroadcastd m7, [base+pb_128]
mova m8, [base+save_pack0]
movu xm9, [base+save_pack0+4]
lea r9d, [xendq*5] ; r9 = xend * 5
lea xstartd, [xstartq*5]
sub yendd, ystartd ; yend = row count (becomes h below)
add ystartd, ystartd ; ystart *= 2
lea strideq, [strideq*5]
sub xstartq, r9 ; xstart = 5 * (xstart - xend)
add xendd, r9d ; xend = xend * 6
add rpq, r9 ; rp += xend * 5
; k2 = 0x1f: the low five mask bits, so each masked store in the .writeN stubs
; writes five elements
mov r10d, 0x1f
kmovb k2, r10d
; From here on the registers are renamed (names presumably map positionally
; onto the cglobal list: ref_sign -> x, yend -> h, plus new b and cand).
DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
.loop_y:
and ystartd, 30
mov xq, xstartq
mov bq, [rrq+ystartq*8] ; b = rr[ystart]
add ystartd, 2
lea bq, [bq+xendq*4]
.loop_x:
imul candq, xq, 0x9999
sar candq, 16 ; x / 5 * 3
movzx r10d, byte [bq+candq*8+22] ; cand_b->bs
movu xm0, [bq+candq*8+12] ; cand_b
; table entry for this bs: byte 0 is added to cand, byte 1 + r14 is the
; address of the write stub for this candidate
movzx r11d, byte [base+save_tmvs_avx512icl_table+r10*2+0]
movzx r10d, byte [base+save_tmvs_avx512icl_table+r10*2+1]
add r10, r14
add candq, r11
jge .calc ; cand no longer negative: stop gathering
; second candidate -> lane 1, stub address in r11
movzx r11d, byte [bq+candq*8+22]
vinserti32x4 ym0, [bq+candq*8+12], 1
movzx r12d, byte [base+save_tmvs_avx512icl_table+r11*2+0]
movzx r11d, byte [base+save_tmvs_avx512icl_table+r11*2+1]
add r11, r14
add candq, r12
jge .calc
; third candidate -> lane 2, stub address in r12
movzx r12d, byte [bq+candq*8+22]
vinserti32x4 m0, [bq+candq*8+12], 2
movzx r13d, byte [base+save_tmvs_avx512icl_table+r12*2+0]
movzx r12d, byte [base+save_tmvs_avx512icl_table+r12*2+1]
add r12, r14
add candq, r13
jge .calc
; fourth candidate -> lane 3, stub address in r13
vinserti32x4 m0, [bq+candq*8+12], 3
movzx r13d, byte [bq+candq*8+22]
movzx r13d, byte [base+save_tmvs_avx512icl_table+r13*2+1]
add r13, r14
.calc:
pshufb m1, m0, m3
pabsw m2, m0
pshufb m1, m4, m1 ; ref > 0 && res_sign[ref - 1]
psrlw m2, 12 ; (abs(mv.x) | abs(mv.y)) < 4096
psubd m2, m1
pshufb m2, m5 ; c0 c1 c1 c0
pand m2, m6
punpckhqdq m1, m2, m2
vpternlogd m1, m2, m7, 0x56 ; (c0shuf | c1shuf) ^ 0x80
pshufb m2, m0, m1
; hand each 128-bit lane of m2 to its write stub through xm0; the flags
; tested after each call come from the stub's final "add xq, 5*N"
mova xm0, xm2
call r10
jge .next_line
vextracti32x4 xm0, m2, 1
call r11
jge .next_line
vextracti32x4 xm0, m2, 2
call r12
jge .next_line
vextracti32x4 xm0, m2, 3
call r13
jl .loop_x
.next_line:
add rpq, strideq
dec hd
jg .loop_y
RET
; Write stubs. Each stores N*5 bytes at [rpq+xq] and advances xq by N*5.
.write1:
; 5 bytes (byte-masked store)
vmovdqu8 [rpq+xq]{k2}, xm0
add xq, 5*1
ret
.write2:
; 10 bytes (5 masked words)
pshufb xm0, xm8
vmovdqu16 [rpq+xq]{k2}, xm0
add xq, 5*2
ret
.write4:
; 20 bytes (5 masked dwords)
vpermb ym0, ym8, ym0
vmovdqu32 [rpq+xq]{k2}, ym0
add xq, 5*4
ret
.write8:
; 40 bytes (5 masked qwords)
vpermb m0, m8, m0
vmovdqu64 [rpq+xq]{k2}, m0
add xq, 5*8
ret
.write16:
; 80 bytes: one full 64-byte store followed by one 16-byte store
vpermb m1, m8, m0
movu [rpq+xq+ 0], m1
pshufb xm0, xm9
movu [rpq+xq+64], xm0
add xq, 5*16
ret
INIT_ZMM avx512icl
cglobal splat_mv, 4, 7, 3, rr, a, bx4, bw4, bh4
vbroadcasti32x4 m0, [aq]

2
third_party/dav1d/src/x86/refmvs.h поставляемый
Просмотреть файл

@ -30,6 +30,7 @@
decl_save_tmvs_fn(dav1d_save_tmvs_ssse3);
decl_save_tmvs_fn(dav1d_save_tmvs_avx2);
decl_save_tmvs_fn(dav1d_save_tmvs_avx512icl);
decl_splat_mv_fn(dav1d_splat_mv_sse2);
decl_splat_mv_fn(dav1d_splat_mv_avx2);
@ -54,6 +55,7 @@ static ALWAYS_INLINE void refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *const c) {
if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
c->save_tmvs = dav1d_save_tmvs_avx512icl;
c->splat_mv = dav1d_splat_mv_avx512icl;
#endif
}

1
third_party/dav1d/tests/checkasm/ipred.c поставляемый
Просмотреть файл

@ -133,6 +133,7 @@ static void check_intra_pred(Dav1dIntraPredDSPContext *const c) {
a & 0x1ff, a & 0x600, maxw, maxh);
else if (mode == FILTER_PRED)
fprintf(stderr, "filter_idx = %d\n", a & 0x1ff);
break;
}
bench_new(a_dst, stride, topleft, w, h, a, 128, 128

18
third_party/dav1d/tests/checkasm/refmvs.c поставляемый
Просмотреть файл

@ -29,6 +29,16 @@
#include <stdio.h>
// Builds a random signed value. The low spel_bits bits are fully random; then,
// round by round, one more random bit is OR-ed in at position spel_bits. After
// each round a coin flip decides whether to continue, and continuing bumps
// spel_bits until it reaches total_bits. Wider magnitudes are therefore
// increasingly rare, which biases results towards small (near-fpel) values.
// A final coin flip selects the sign.
static inline int gen_mv(const int total_bits, int spel_bits) {
    int magnitude = rnd() & ((1 << spel_bits) - 1);
    for (;;) {
        magnitude |= (rnd() & 1) << spel_bits;
        if (!(rnd() & 1))
            break;
        if (++spel_bits >= total_bits)
            break;
    }
    if (rnd() & 1)
        return -magnitude;
    return magnitude;
}
static void check_save_tmvs(const Dav1dRefmvsDSPContext *const c) {
refmvs_block *rr[31];
refmvs_block r[31 * 256];
@ -58,10 +68,10 @@ static void check_save_tmvs(const Dav1dRefmvsDSPContext *const c) {
while (j + ((dav1d_block_dimensions[bs][0] + 1) >> 1) > col_end8)
bs++;
rr[i * 2][j * 2 + 1] = (refmvs_block) {
.mv.mv[0].x = -(rnd() & 1) * (rnd() & 8191),
.mv.mv[0].y = -(rnd() & 1) * (rnd() & 8191),
.mv.mv[1].x = -(rnd() & 1) * (rnd() & 8191),
.mv.mv[1].y = -(rnd() & 1) * (rnd() & 8191),
.mv.mv[0].x = gen_mv(14, 10),
.mv.mv[0].y = gen_mv(14, 10),
.mv.mv[1].x = gen_mv(14, 10),
.mv.mv[1].y = gen_mv(14, 10),
.ref.ref = { (rnd() % 9) - 1, (rnd() % 9) - 1 },
.bs = bs
};