зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1827248 - Update dav1d to 5aa3b38f9871859e14e55f18ab5e38318fe86305 r=chunmin
Differential Revision: https://phabricator.services.mozilla.com/D175056
This commit is contained in:
Родитель
55175bdf25
Коммит
6dde2ed59f
|
@ -20,11 +20,11 @@ origin:
|
|||
|
||||
# Human-readable identifier for this version/release
|
||||
# Generally "version NNN", "tag SSS", "bookmark SSS"
|
||||
release: 7d23ec4a042b2feb6c0d4c1b0618a87cb8c34dcb (2023-03-13T15:19:35.000+00:00).
|
||||
release: 5aa3b38f9871859e14e55f18ab5e38318fe86305 (2023-04-08T11:47:31.000+00:00).
|
||||
|
||||
# Revision to pull in
|
||||
# Must be a long or short commit SHA (long preferred)
|
||||
revision: 7d23ec4a042b2feb6c0d4c1b0618a87cb8c34dcb
|
||||
revision: 5aa3b38f9871859e14e55f18ab5e38318fe86305
|
||||
|
||||
# The package's license, where possible using the mnemonic from
|
||||
# https://spdx.org/licenses/
|
||||
|
|
|
@ -1,2 +1,2 @@
|
|||
/* auto-generated, do not edit */
|
||||
#define DAV1D_VERSION "7d23ec4a042b2feb6c0d4c1b0618a87cb8c34dcb"
|
||||
#define DAV1D_VERSION "5aa3b38f9871859e14e55f18ab5e38318fe86305"
|
||||
|
|
|
@ -1481,11 +1481,10 @@ function ipred_z1_filter_edge_8bpc_neon, export=1
|
|||
sub x5, x5, w3, uxtw
|
||||
add x6, x2, w6, sxtw
|
||||
|
||||
ld1 {v2.16b, v3.16b}, [x5] // padding_mask
|
||||
ld1 {v2.16b}, [x5] // padding_mask
|
||||
|
||||
ld1r {v4.16b}, [x6]
|
||||
bit v0.16b, v4.16b, v2.16b // Pad v0-v1
|
||||
bit v1.16b, v4.16b, v3.16b
|
||||
ld1r {v1.16b}, [x6]
|
||||
bit v0.16b, v1.16b, v2.16b // Pad v0-v1
|
||||
|
||||
// Filter one block
|
||||
ext v2.16b, v0.16b, v1.16b, #1
|
||||
|
@ -1598,6 +1597,17 @@ L(fivetap):
|
|||
ret
|
||||
endfunc
|
||||
|
||||
// void ipred_pixel_set_8bpc_neon(pixel *out, const pixel px,
|
||||
// const int n);
// Fills the buffer at out with the byte value px, 16 bytes per iteration.
// Register use below follows the prototype's argument order:
// x0 = store pointer, w1 = fill value, w2 = remaining count.
// The store executes before the count is tested, so at least 16 bytes are
// always written and the total written is n rounded up to a multiple of 16;
// the destination has to have room for that.
|
||||
function ipred_pixel_set_8bpc_neon, export=1
|
||||
dup v0.16b, w1 // replicate the low byte of w1 into all 16 lanes
|
||||
1:
|
||||
subs w2, w2, #16 // remaining -= 16, setting the flags tested by b.gt
|
||||
st1 {v0.16b}, [x0], #16 // store 16 bytes, post-increment x0 by 16
|
||||
b.gt 1b // loop while the remaining count is still > 0
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// void ipred_z1_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride,
|
||||
// const pixel *const top,
|
||||
// const int width, const int height,
|
||||
|
@ -1671,10 +1681,10 @@ function ipred_z1_fill1_8bpc_neon, export=1
|
|||
dup v7.8b, w11
|
||||
ext v1.16b, v0.16b, v0.16b, #1 // top[base+1]
|
||||
ext v3.16b, v2.16b, v2.16b, #1
|
||||
umull v16.8h, v1.8b, v4.8b // top[base+1]*frac
|
||||
umlal v16.8h, v0.8b, v6.8b // + top[base]*(64-frac)
|
||||
umull v17.8h, v3.8b, v5.8b
|
||||
umlal v17.8h, v2.8b, v7.8b
|
||||
umull v16.8h, v0.8b, v6.8b // top[base]*(64-frac)
|
||||
umlal v16.8h, v1.8b, v4.8b // + top[base+1]*frac
|
||||
umull v17.8h, v2.8b, v7.8b
|
||||
umlal v17.8h, v3.8b, v5.8b
|
||||
rshrn v16.8b, v16.8h, #6
|
||||
rshrn v17.8b, v17.8h, #6
|
||||
st1 {v16.8b}, [x0], x1
|
||||
|
@ -1724,14 +1734,14 @@ function ipred_z1_fill1_8bpc_neon, export=1
|
|||
ext v16.16b, v0.16b, v1.16b, #1 // top[base+1]
|
||||
ext v17.16b, v2.16b, v3.16b, #1
|
||||
subs w3, w3, #16
|
||||
umull v18.8h, v16.8b, v4.8b // top[base+1]*frac
|
||||
umlal v18.8h, v0.8b, v6.8b // + top[base]*(64-frac)
|
||||
umull2 v19.8h, v16.16b, v4.16b
|
||||
umlal2 v19.8h, v0.16b, v6.16b
|
||||
umull v20.8h, v17.8b, v5.8b
|
||||
umlal v20.8h, v2.8b, v7.8b
|
||||
umull2 v21.8h, v17.16b, v5.16b
|
||||
umlal2 v21.8h, v2.16b, v7.16b
|
||||
umull v18.8h, v0.8b, v6.8b // top[base]*(64-frac)
|
||||
umlal v18.8h, v16.8b, v4.8b // + top[base+1]*frac
|
||||
umull2 v19.8h, v0.16b, v6.16b
|
||||
umlal2 v19.8h, v16.16b, v4.16b
|
||||
umull v20.8h, v2.8b, v7.8b
|
||||
umlal v20.8h, v17.8b, v5.8b
|
||||
umull2 v21.8h, v2.16b, v7.16b
|
||||
umlal2 v21.8h, v17.16b, v5.16b
|
||||
rshrn v16.8b, v18.8h, #6
|
||||
rshrn2 v16.16b, v19.8h, #6
|
||||
rshrn v17.8b, v20.8h, #6
|
||||
|
@ -1899,7 +1909,7 @@ function ipred_z3_fill1_8bpc_neon, export=1
|
|||
ld1r {v31.16b}, [x10] // padding
|
||||
ld1 {v30.8h}, [x11] // increments
|
||||
mov w7, w5
|
||||
b.gt L(ipred_z3_fill1_large_w16)
|
||||
b.gt L(ipred_z3_fill1_large_h16)
|
||||
br x8
|
||||
|
||||
40:
|
||||
|
@ -1909,6 +1919,7 @@ function ipred_z3_fill1_8bpc_neon, export=1
|
|||
mul v30.4h, v30.4h, v29.4h // {0,1,2,3,4,5,6,7}*dy
|
||||
movi v23.16b, #0x3e
|
||||
|
||||
// Worst case max_base_y is width+height-1, for w=4, h=16, <= 32
|
||||
ld1 {v0.16b, v1.16b}, [x2] // left[]
|
||||
add v30.4h, v29.4h, v30.4h // ypos
|
||||
|
||||
|
@ -1958,7 +1969,8 @@ function ipred_z3_fill1_8bpc_neon, export=1
|
|||
mul v30.8h, v30.8h, v29.8h // {0,1,2,3,4,5,6,7}*dy
|
||||
movi v23.16b, #0x3e
|
||||
|
||||
ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[]
|
||||
// Worst case max_base_y is width+height-1, for w=8, h=32, <= 48
|
||||
ld1 {v0.16b, v1.16b, v2.16b}, [x2] // left[]
|
||||
add v30.8h, v29.8h, v30.8h // ypos
|
||||
|
||||
movi v22.16b, #64
|
||||
|
@ -1974,12 +1986,12 @@ function ipred_z3_fill1_8bpc_neon, export=1
|
|||
uqadd v28.8b, v26.8b, v21.8b // base + 2
|
||||
sub v25.8b, v22.8b, v24.8b // 64 - frac
|
||||
|
||||
tbx v4.8b, {v0.16b, v1.16b, v2.16b, v3.16b}, v26.8b // left[base]
|
||||
tbx v4.8b, {v0.16b, v1.16b, v2.16b}, v26.8b // left[base]
|
||||
1:
|
||||
mov v5.8b, v31.8b
|
||||
mov v6.8b, v31.8b
|
||||
tbx v5.8b, {v0.16b, v1.16b, v2.16b, v3.16b}, v27.8b // left[base+1]
|
||||
tbx v6.8b, {v0.16b, v1.16b, v2.16b, v3.16b}, v28.8b // left[base+2]
|
||||
tbx v5.8b, {v0.16b, v1.16b, v2.16b}, v27.8b // left[base+1]
|
||||
tbx v6.8b, {v0.16b, v1.16b, v2.16b}, v28.8b // left[base+2]
|
||||
|
||||
umull v16.8h, v4.8b, v25.8b // left[base]*(64-frac)
|
||||
umlal v16.8h, v5.8b, v24.8b // + left[base+1]*frac
|
||||
|
@ -2008,6 +2020,7 @@ function ipred_z3_fill1_8bpc_neon, export=1
|
|||
mul v30.8h, v30.8h, v28.8h // {0,1,2,3,4,5,6,7}*dy
|
||||
movi v23.16b, #0x3e
|
||||
|
||||
// This is only executed if we've checked that max_base_y <= 64.
|
||||
ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[]
|
||||
add v28.8h, v28.8h, v30.8h // ypos
|
||||
|
||||
|
@ -2075,6 +2088,7 @@ function ipred_z3_fill1_8bpc_neon, export=1
|
|||
sub x1, x1, w3, uxtw
|
||||
add v30.8h, v28.8h, v30.8h // ypos
|
||||
|
||||
// This is only executed if we've checked that max_base_y <= 64.
|
||||
ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[]
|
||||
|
||||
movi v22.16b, #64
|
||||
|
@ -2146,7 +2160,7 @@ function ipred_z3_fill1_8bpc_neon, export=1
|
|||
9:
|
||||
ret
|
||||
|
||||
L(ipred_z3_fill1_large_w16):
|
||||
L(ipred_z3_fill1_large_h16):
|
||||
// Fallback case for max_base_y > 64; similar to the z1
|
||||
// implementation. This does the filtering vertically, filling out
|
||||
// a 2x pixel column at a time.
|
||||
|
@ -2358,7 +2372,7 @@ L(ipred_z3_fill_padding_wide):
|
|||
st1 {v31.16b}, [x0], #16
|
||||
b.gt 2b
|
||||
subs w4, w4, #1
|
||||
add x0, x0, x1
|
||||
add x0, x0, x1
|
||||
b.le 9f
|
||||
mov w3, w12
|
||||
b 1b
|
||||
|
@ -2367,16 +2381,11 @@ L(ipred_z3_fill_padding_wide):
|
|||
endfunc
|
||||
|
||||
function ipred_z3_fill2_8bpc_neon, export=1
|
||||
adr x8, L(ipred_z3_fill1_tbl)
|
||||
cmp w3, #8
|
||||
add x10, x2, w6, uxtw // left[max_base_y]
|
||||
movrel x11, increments
|
||||
ld1r {v31.16b}, [x10] // padding
|
||||
ld1 {v30.8h}, [x11] // increments
|
||||
mov w7, w5
|
||||
|
||||
cmp w3, #8
|
||||
add x10, x2, w6, uxtw // left[max_base_y]
|
||||
ld1r {v31.16b}, [x10] // padding
|
||||
b.eq 80f
|
||||
|
||||
40: // w == 4
|
||||
|
@ -2385,6 +2394,8 @@ function ipred_z3_fill2_8bpc_neon, export=1
|
|||
mul v30.4h, v30.4h, v29.4h // {0,1,2,3,4,5,6,7}*dy
|
||||
movi v23.16b, #0x3e
|
||||
|
||||
// Worst case max_base_y is 2*(width+height)-2, but width+height <= 16,
|
||||
// so max_base_y <= 32.
|
||||
ld1 {v0.16b, v1.16b}, [x2] // left[]
|
||||
add v30.4h, v29.4h, v30.4h // ypos
|
||||
|
||||
|
@ -2434,6 +2445,8 @@ function ipred_z3_fill2_8bpc_neon, export=1
|
|||
mul v30.8h, v30.8h, v29.8h // {0,1,2,3,4,5,6,7}*dy
|
||||
movi v23.16b, #0x3e
|
||||
|
||||
// Worst case max_base_y is 2*(width+height)-2, but width+height <= 16,
|
||||
// so max_base_y <= 32.
|
||||
ld1 {v0.16b, v1.16b}, [x2] // left[]
|
||||
add v30.8h, v29.8h, v30.8h // ypos
|
||||
|
||||
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -50,13 +50,15 @@ decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_444, neon));
|
|||
|
||||
decl_pal_pred_fn(BF(dav1d_pal_pred, neon));
|
||||
|
||||
#if ARCH_AARCH64 && BITDEPTH == 8
|
||||
#if ARCH_AARCH64
|
||||
void BF(dav1d_ipred_z1_upsample_edge, neon)(pixel *out, const int hsz,
|
||||
const pixel *const in,
|
||||
const int end);
|
||||
const int end HIGHBD_DECL_SUFFIX);
|
||||
void BF(dav1d_ipred_z1_filter_edge, neon)(pixel *out, const int sz,
|
||||
const pixel *const in,
|
||||
const int end, const int strength);
|
||||
void BF(dav1d_ipred_pixel_set, neon)(pixel *out, const pixel px,
|
||||
const int n);
|
||||
void BF(dav1d_ipred_z1_fill1, neon)(pixel *dst, ptrdiff_t stride,
|
||||
const pixel *const top, const int width,
|
||||
const int height, const int dx,
|
||||
|
@ -76,14 +78,15 @@ static void ipred_z1_neon(pixel *dst, const ptrdiff_t stride,
|
|||
const int enable_intra_edge_filter = angle >> 10;
|
||||
angle &= 511;
|
||||
int dx = dav1d_dr_intra_derivative[angle >> 1];
|
||||
pixel top_out[64 + 64 + (64+15)*2];
|
||||
pixel top_out[64 + 64 + (64+15)*2 + 16];
|
||||
int max_base_x;
|
||||
const int upsample_above = enable_intra_edge_filter ?
|
||||
get_upsample(width + height, 90 - angle, is_sm) : 0;
|
||||
if (upsample_above) {
|
||||
BF(dav1d_ipred_z1_upsample_edge, neon)(top_out, width + height,
|
||||
topleft_in,
|
||||
width + imin(width, height));
|
||||
width + imin(width, height)
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
max_base_x = 2 * (width + height) - 2;
|
||||
dx <<= 1;
|
||||
} else {
|
||||
|
@ -102,7 +105,8 @@ static void ipred_z1_neon(pixel *dst, const ptrdiff_t stride,
|
|||
}
|
||||
const int base_inc = 1 + upsample_above;
|
||||
int pad_pixels = width + 15; // max(dx >> 6) == 15
|
||||
pixel_set(&top_out[max_base_x + 1], top_out[max_base_x], pad_pixels * base_inc);
|
||||
BF(dav1d_ipred_pixel_set, neon)(&top_out[max_base_x + 1],
|
||||
top_out[max_base_x], pad_pixels * base_inc);
|
||||
if (upsample_above)
|
||||
BF(dav1d_ipred_z1_fill2, neon)(dst, stride, top_out, width, height,
|
||||
dx, max_base_x);
|
||||
|
@ -140,12 +144,13 @@ static void ipred_z3_neon(pixel *dst, const ptrdiff_t stride,
|
|||
const int upsample_left = enable_intra_edge_filter ?
|
||||
get_upsample(width + height, angle - 180, is_sm) : 0;
|
||||
if (upsample_left) {
|
||||
flipped[0] = topleft_in[0];
|
||||
flipped[0] = topleft_in[0];
|
||||
BF(dav1d_ipred_reverse, neon)(&flipped[1], &topleft_in[0],
|
||||
height + imax(width, height));
|
||||
BF(dav1d_ipred_z1_upsample_edge, neon)(left_out, width + height,
|
||||
flipped,
|
||||
height + imin(width, height));
|
||||
height + imin(width, height)
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
max_base_y = 2 * (width + height) - 2;
|
||||
dy <<= 1;
|
||||
} else {
|
||||
|
@ -172,7 +177,8 @@ static void ipred_z3_neon(pixel *dst, const ptrdiff_t stride,
|
|||
// the other implementation can read height + max(dy >> 6) past the end.
|
||||
int pad_pixels = imax(64 - max_base_y - 1, height + 15);
|
||||
|
||||
pixel_set(&left_out[max_base_y + 1], left_out[max_base_y], pad_pixels * base_inc);
|
||||
BF(dav1d_ipred_pixel_set, neon)(&left_out[max_base_y + 1],
|
||||
left_out[max_base_y], pad_pixels * base_inc);
|
||||
if (upsample_left)
|
||||
BF(dav1d_ipred_z3_fill2, neon)(dst, stride, left_out, width, height,
|
||||
dy, max_base_y);
|
||||
|
@ -197,7 +203,7 @@ static ALWAYS_INLINE void intra_pred_dsp_init_arm(Dav1dIntraPredDSPContext *cons
|
|||
c->intra_pred[SMOOTH_PRED] = BF(dav1d_ipred_smooth, neon);
|
||||
c->intra_pred[SMOOTH_V_PRED] = BF(dav1d_ipred_smooth_v, neon);
|
||||
c->intra_pred[SMOOTH_H_PRED] = BF(dav1d_ipred_smooth_h, neon);
|
||||
#if ARCH_AARCH64 && BITDEPTH == 8
|
||||
#if ARCH_AARCH64
|
||||
c->intra_pred[Z1_PRED] = ipred_z1_neon;
|
||||
c->intra_pred[Z3_PRED] = ipred_z3_neon;
|
||||
#endif
|
||||
|
|
|
@ -1560,7 +1560,14 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
|
|||
if (c->n_fc == 1) {
|
||||
dav1d_thread_picture_ref(&c->out,
|
||||
&c->refs[c->frame_hdr->existing_frame_idx].p);
|
||||
dav1d_data_props_copy(&c->out.p.m, &in->m);
|
||||
dav1d_picture_copy_props(&c->out.p,
|
||||
c->content_light, c->content_light_ref,
|
||||
c->mastering_display, c->mastering_display_ref,
|
||||
c->itut_t35, c->itut_t35_ref,
|
||||
&in->m);
|
||||
// Must be removed from the context after being attached to the frame
|
||||
dav1d_ref_dec(&c->itut_t35_ref);
|
||||
c->itut_t35 = NULL;
|
||||
c->event_flags |= dav1d_picture_get_event_flags(&c->refs[c->frame_hdr->existing_frame_idx].p);
|
||||
} else {
|
||||
pthread_mutex_lock(&c->task_thread.lock);
|
||||
|
@ -1606,7 +1613,15 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
|
|||
dav1d_thread_picture_ref(out_delayed,
|
||||
&c->refs[c->frame_hdr->existing_frame_idx].p);
|
||||
out_delayed->visible = 1;
|
||||
dav1d_data_props_copy(&out_delayed->p.m, &in->m);
|
||||
dav1d_picture_copy_props(&out_delayed->p,
|
||||
c->content_light, c->content_light_ref,
|
||||
c->mastering_display, c->mastering_display_ref,
|
||||
c->itut_t35, c->itut_t35_ref,
|
||||
&in->m);
|
||||
// Must be removed from the context after being attached to the frame
|
||||
dav1d_ref_dec(&c->itut_t35_ref);
|
||||
c->itut_t35 = NULL;
|
||||
|
||||
pthread_mutex_unlock(&c->task_thread.lock);
|
||||
}
|
||||
if (c->refs[c->frame_hdr->existing_frame_idx].p.p.frame_hdr->frame_type == DAV1D_FRAME_TYPE_KEY) {
|
||||
|
|
|
@ -142,9 +142,6 @@ static int picture_alloc_with_edges(Dav1dContext *const c,
|
|||
p->p.h = h;
|
||||
p->seq_hdr = seq_hdr;
|
||||
p->frame_hdr = frame_hdr;
|
||||
p->content_light = content_light;
|
||||
p->mastering_display = mastering_display;
|
||||
p->itut_t35 = itut_t35;
|
||||
p->p.layout = seq_hdr->layout;
|
||||
p->p.bpc = bpc;
|
||||
dav1d_data_props_set_defaults(&p->m);
|
||||
|
@ -194,21 +191,38 @@ static int picture_alloc_with_edges(Dav1dContext *const c,
|
|||
p->frame_hdr_ref = frame_hdr_ref;
|
||||
if (frame_hdr_ref) dav1d_ref_inc(frame_hdr_ref);
|
||||
|
||||
dav1d_data_props_copy(&p->m, props);
|
||||
dav1d_picture_copy_props(p, content_light, content_light_ref,
|
||||
mastering_display, mastering_display_ref,
|
||||
itut_t35, itut_t35_ref, props);
|
||||
|
||||
if (extra && extra_ptr)
|
||||
*extra_ptr = &pic_ctx->extra_ptr;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void dav1d_picture_copy_props(Dav1dPicture *const p,
|
||||
Dav1dContentLightLevel *const content_light, Dav1dRef *const content_light_ref,
|
||||
Dav1dMasteringDisplay *const mastering_display, Dav1dRef *const mastering_display_ref,
|
||||
Dav1dITUTT35 *const itut_t35, Dav1dRef *const itut_t35_ref,
|
||||
const Dav1dDataProps *const props)
|
||||
{
|
||||
dav1d_data_props_copy(&p->m, props);
|
||||
|
||||
dav1d_ref_dec(&p->content_light_ref);
|
||||
p->content_light_ref = content_light_ref;
|
||||
p->content_light = content_light;
|
||||
if (content_light_ref) dav1d_ref_inc(content_light_ref);
|
||||
|
||||
dav1d_ref_dec(&p->mastering_display_ref);
|
||||
p->mastering_display_ref = mastering_display_ref;
|
||||
p->mastering_display = mastering_display;
|
||||
if (mastering_display_ref) dav1d_ref_inc(mastering_display_ref);
|
||||
|
||||
dav1d_ref_dec(&p->itut_t35_ref);
|
||||
p->itut_t35_ref = itut_t35_ref;
|
||||
p->itut_t35 = itut_t35;
|
||||
if (itut_t35_ref) dav1d_ref_inc(itut_t35_ref);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f,
|
||||
|
|
|
@ -101,6 +101,12 @@ int dav1d_default_picture_alloc(Dav1dPicture *p, void *cookie);
|
|||
void dav1d_default_picture_release(Dav1dPicture *p, void *cookie);
|
||||
void dav1d_picture_unref_internal(Dav1dPicture *p);
|
||||
|
||||
/**
 * Copy data properties and frame metadata onto a picture.
 *
 * Copies props into p->m and replaces the picture's content-light,
 * mastering-display and ITU-T T.35 payload pointers and references with the
 * ones given; a reference is taken on each non-NULL *_ref argument.
 */
void dav1d_picture_copy_props(Dav1dPicture *p,
|
||||
Dav1dContentLightLevel *content_light, Dav1dRef *content_light_ref,
|
||||
Dav1dMasteringDisplay *mastering_display, Dav1dRef *mastering_display_ref,
|
||||
Dav1dITUTT35 *itut_t35, Dav1dRef *itut_t35_ref,
|
||||
const Dav1dDataProps *props);
|
||||
|
||||
/**
|
||||
* Get event flags from picture flags.
|
||||
*/
|
||||
|
|
|
@ -85,6 +85,7 @@ static ALWAYS_INLINE void intra_pred_dsp_init_x86(Dav1dIntraPredDSPContext *cons
|
|||
init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, ssse3);
|
||||
#if BITDEPTH == 8
|
||||
init_angular_ipred_fn(Z1_PRED, ipred_z1, ssse3);
|
||||
init_angular_ipred_fn(Z2_PRED, ipred_z2, ssse3);
|
||||
init_angular_ipred_fn(Z3_PRED, ipred_z3, ssse3);
|
||||
#endif
|
||||
init_angular_ipred_fn(FILTER_PRED, ipred_filter, ssse3);
|
||||
|
|
|
@ -81,6 +81,10 @@ z_filter_t_w16: db 15, 31, 7, 15, 31, 7, 3, 31, 3, 3, 3, 3, 3, 3, 0,
|
|||
z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7
|
||||
db 7, 8, 8, 9, 9, 10, 10, 11
|
||||
z_filter_k_tail: db 0, 64, 0, 64, 8, 56, 0, 64
|
||||
z2_h_shuf: db 7, 6, 15, 14, 6, 5, 14, 13, 5, 4, 13, 12, 4, 3, 12, 11
|
||||
z2_upsample: db 7, 6, 15, 14, 5, 4, 13, 12, 3, 2, 11, 10, 1, 0, 9, 8
|
||||
z2_dy_offset: dw 88*64, 88*64, 87*64, 87*64
|
||||
pw_m1to4: dw -1, -2, -3, -4
|
||||
z_filter_k: times 4 db 0, 16
|
||||
times 4 db 0, 20
|
||||
times 4 db 8, 16
|
||||
|
@ -129,6 +133,7 @@ JMP_TABLE ipred_smooth_v, ssse3, w4, w8, w16, w32, w64
|
|||
JMP_TABLE ipred_smooth_h, ssse3, w4, w8, w16, w32, w64
|
||||
JMP_TABLE ipred_paeth, ssse3, w4, w8, w16, w32, w64
|
||||
JMP_TABLE ipred_z1, ssse3, w4, w8, w16, w32, w64
|
||||
JMP_TABLE ipred_z2, ssse3, w4, w8, w16, w32, w64
|
||||
JMP_TABLE ipred_z3, ssse3, h4, h8, h16, h32, h64
|
||||
JMP_TABLE pal_pred, ssse3, w4, w8, w16, w32, w64
|
||||
JMP_TABLE ipred_cfl, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \
|
||||
|
@ -1979,6 +1984,653 @@ ALIGN function_align
|
|||
mova [tlq+16*1], m1
|
||||
ret
|
||||
|
||||
%if ARCH_X86_64
|
||||
cglobal ipred_z2_8bpc, 4, 12, 13, 16*16, dst, stride, tl, w, h, angle, dx, _, dy
|
||||
%define base r7-$$
|
||||
%define maxwm r6m
|
||||
%define maxhm r7m
|
||||
lea r7, [$$]
|
||||
mov hd, hm
|
||||
mova m8, [base+pw_62]
|
||||
mova m9, [base+pw_64]
|
||||
lea r9d, [wq-4]
|
||||
mova m10, [base+pw_512]
|
||||
shl r9d, 6
|
||||
mova m11, [base+z1_shuf_w4]
|
||||
or r9d, hd
|
||||
mova m12, [base+z2_h_shuf]
|
||||
%else
|
||||
cglobal ipred_z2_8bpc, 4, 7, 8, -16*20, dst, _, tl, w, h, angle, dx
|
||||
%define base r1-$$
|
||||
%define m8 [base+pw_62]
|
||||
%define m9 [base+pw_64]
|
||||
%define m10 [base+pw_512]
|
||||
%define m11 [rsp+16*16]
|
||||
%define m12 [rsp+16*17]
|
||||
%define r8 [rsp+16*6+4*1]
|
||||
%define r9b byte [rsp+16*18+4*0]
|
||||
%define r9d dword [rsp+16*18+4*0]
|
||||
%define r10d dword [rsp+16*18+4*1]
|
||||
%define r11d dword [rsp+16*18+4*2]
|
||||
%define maxwm [rsp+16*18+4*3]
|
||||
%define maxhm [rsp+16*19+4*0]
|
||||
%define stridemp [rsp+16*19+4*1]
|
||||
%define strideq r3
|
||||
%define dyd r4
|
||||
%define dyq r4
|
||||
mov stridemp, r1
|
||||
mov r1d, r6m
|
||||
mov r4d, r7m
|
||||
mov maxwm, r1d
|
||||
mov maxhm, r4d
|
||||
LEA r1, $$
|
||||
lea hd, [wq-4]
|
||||
mova m0, [base+z1_shuf_w4]
|
||||
shl hd, 6
|
||||
mova m1, [base+z2_h_shuf]
|
||||
or hd, hm
|
||||
mova m11, m0
|
||||
mov r9d, hd
|
||||
mova m12, m1
|
||||
%endif
|
||||
tzcnt wd, wd
|
||||
movifnidn angled, anglem
|
||||
movsxd wq, [base+ipred_z2_ssse3_table+wq*4]
|
||||
%if ARCH_X86_64
|
||||
movzx dxd, angleb
|
||||
%else
|
||||
movzx dxd, byte anglem
|
||||
%endif
|
||||
xor angled, 0x400
|
||||
mova m0, [tlq-16*4]
|
||||
mov dyd, dxd
|
||||
mova m1, [tlq-16*3]
|
||||
neg dxq
|
||||
mova m2, [tlq-16*2]
|
||||
and dyd, ~1
|
||||
mova m3, [tlq-16*1]
|
||||
and dxq, ~1
|
||||
movd m4, [tlq]
|
||||
movu m5, [tlq+16*0+1]
|
||||
movu m6, [tlq+16*1+1]
|
||||
movzx dyd, word [base+dr_intra_derivative+dyq-90] ; angle - 90
|
||||
movzx dxd, word [base+dr_intra_derivative+dxq+180] ; 180 - angle
|
||||
mova [rsp+16*2], m0
|
||||
pxor m7, m7
|
||||
mova [rsp+16*3], m1
|
||||
pshufb m4, m7
|
||||
mova [rsp+16*4], m2
|
||||
lea wq, [base+ipred_z2_ssse3_table+wq]
|
||||
mova [rsp+16*5], m3
|
||||
neg dxd
|
||||
mova [rsp+16*6], m4
|
||||
or dyd, 4<<16
|
||||
mova [rsp+16*7], m4
|
||||
mova [rsp+16*8], m5
|
||||
mova [rsp+16*9], m6
|
||||
movq m0, [base+z_base_inc+2]
|
||||
movsldup m1, [base+z2_dy_offset]
|
||||
movq m2, [base+pw_256] ; 4<<6
|
||||
movq [rsp+16*14+8*0], m0
|
||||
movq [rsp+16*15+8*0], m1
|
||||
movq [rsp+16*15+8*1], m2
|
||||
%if ARCH_X86_64
|
||||
lea r10d, [dxq+(128<<6)] ; xpos
|
||||
%else
|
||||
mov [rsp+16*7+4*1], dyd
|
||||
lea r4d, [dxq+(128<<6)]
|
||||
mov r10d, r4d
|
||||
movzx hd, r9b
|
||||
%endif
|
||||
mov r11d, (128-4)<<6
|
||||
jmp wq
|
||||
.w4:
|
||||
test angled, 0x400
|
||||
jnz .w4_main
|
||||
movd m5, [tlq+4]
|
||||
lea r3d, [hq+2]
|
||||
add angled, 1022
|
||||
pshufb m5, m7
|
||||
shl r3d, 6
|
||||
movd [rsp+16*8+4], m5
|
||||
test r3d, angled
|
||||
jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
|
||||
call .upsample_above
|
||||
sub angled, 1075 ; angle - 53
|
||||
lea r3d, [hq+3]
|
||||
xor angled, 0x7f ; 180 - angle
|
||||
movd m0, r3d
|
||||
movd m6, angled
|
||||
shr angled, 8 ; is_sm << 1
|
||||
pshufb m0, m7
|
||||
pshufb m6, m7
|
||||
pcmpeqb m0, [base+z_filter_wh4]
|
||||
pand m6, m0
|
||||
pcmpgtb m6, [base+z_filter_t_w48+angleq*8]
|
||||
jmp .w8_filter_left
|
||||
.upsample_above: ; w4/w8
|
||||
movq m3, [rsp+gprsize+16*8-2]
|
||||
movq m1, [rsp+gprsize+16*8-1]
|
||||
movq m0, [rsp+gprsize+16*8+0]
|
||||
movq m4, [rsp+gprsize+16*8+1]
|
||||
movddup m5, [base+pb_36_m4]
|
||||
punpcklbw m1, m3
|
||||
punpcklbw m2, m0, m4
|
||||
pmaddubsw m1, m5
|
||||
pmaddubsw m2, m5
|
||||
%if ARCH_X86_64
|
||||
mova m11, [base+pb_0to15]
|
||||
lea r10d, [r10+dxq+(1<<6)]
|
||||
mov r11d, (128-7)<<6
|
||||
%else
|
||||
mova m3, [base+pb_0to15]
|
||||
mov r3d, [rsp+gprsize+16*18+4*1]
|
||||
mov dword [rsp+gprsize+16*18+4*2], (128-7)<<6
|
||||
lea r3d, [r3+dxq+(1<<6)]
|
||||
mov [rsp+gprsize+16*18+4*1], r3d
|
||||
mova [rsp+gprsize+16*16], m3
|
||||
%endif
|
||||
add dxd, dxd
|
||||
paddw m1, m2
|
||||
pmulhrsw m1, m10
|
||||
movq m2, [rsp+gprsize+16*14]
|
||||
paddw m2, m2
|
||||
movq [rsp+gprsize+16*14], m2
|
||||
packuswb m1, m1
|
||||
punpcklbw m1, m0
|
||||
mova [rsp+gprsize+16*8], m1
|
||||
ret
|
||||
.w4_no_upsample_above:
|
||||
lea r3d, [hq+3]
|
||||
mov [rsp], angled
|
||||
sub angled, 1112 ; angle - 90
|
||||
movd m0, r3d
|
||||
mov r3d, 90
|
||||
movd m1, angled
|
||||
sub r3d, angled ; 180 - angle
|
||||
shr angled, 8 ; is_sm << 1
|
||||
movu m3, [base+z_filter_wh4]
|
||||
mova m4, [base+z_filter_t_w48+angleq*8]
|
||||
call .w8_filter_top
|
||||
mov angled, [rsp]
|
||||
lea r3d, [hq+2]
|
||||
sub angled, 139
|
||||
shl r3d, 6
|
||||
test r3d, angled
|
||||
jnz .w8_filter_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
|
||||
.upsample_left: ; w4/w8
|
||||
neg hq
|
||||
movd m0, [tlq+hq]
|
||||
pshufb m0, m7
|
||||
movd [rsp+16*6+hq-4], m0
|
||||
movq m3, [rsp+16*5+7]
|
||||
movq m0, [rsp+16*5+8]
|
||||
movq m2, [rsp+16*5+9]
|
||||
movq m4, [rsp+16*5+10]
|
||||
movddup m5, [base+pb_36_m4]
|
||||
punpcklbw m1, m0, m3
|
||||
punpcklbw m2, m4
|
||||
pmaddubsw m1, m5
|
||||
pmaddubsw m2, m5
|
||||
movshdup m3, [base+z2_dy_offset]
|
||||
%if ARCH_X86_64
|
||||
mova m12, [base+z2_upsample]
|
||||
add dyd, dyd
|
||||
%else
|
||||
mova m4, [base+z2_upsample]
|
||||
shl dword [rsp+16*7+4*1], 1
|
||||
mova m12, m4
|
||||
%endif
|
||||
paddw m1, m2
|
||||
pmulhrsw m1, m10
|
||||
movq [rsp+16*15], m3
|
||||
packuswb m1, m1
|
||||
punpcklbw m0, m1
|
||||
mova [rsp+16*5], m0
|
||||
.w4_main:
|
||||
movd m6, dxd
|
||||
%if ARCH_X86_64
|
||||
movd m3, dyd
|
||||
%else
|
||||
movd m3, [rsp+16*7+4*1]
|
||||
%endif
|
||||
movddup m0, [rsp+16*14+8*0]
|
||||
pshufb m6, [base+pw_256]
|
||||
paddw m7, m6, m6
|
||||
movq m5, [base+pw_m1to4]
|
||||
pshuflw m4, m3, q0000
|
||||
punpcklqdq m6, m7
|
||||
pmullw m4, m5
|
||||
pshuflw m3, m3, q1111
|
||||
paddw m6, m0
|
||||
pshuflw m0, m4, q3333
|
||||
psubw m4, [rsp+16*15]
|
||||
movq [rsp+16*6+8*1], m3
|
||||
movq [rsp+8*1], m0 ; dy*4
|
||||
%if ARCH_X86_64
|
||||
mov r8, dstq
|
||||
%endif
|
||||
.w4_loop0:
|
||||
%if ARCH_X86_32
|
||||
mov r8, dstq
|
||||
%endif
|
||||
mova [rsp+16*12], m6
|
||||
mov r2d, r10d
|
||||
movq [rsp+8*0], m4
|
||||
pand m0, m4, m8
|
||||
psraw m4, 6
|
||||
psubw m1, m9, m0
|
||||
psllw m0, 8
|
||||
por m0, m1 ; 64-frac_y, frac_y
|
||||
movq [rsp+8*3], m0
|
||||
pabsw m4, m4
|
||||
movq [rsp+8*2], m4
|
||||
movzx hd, r9b
|
||||
.w4_loop:
|
||||
lea r3d, [r2+dxq]
|
||||
shr r2d, 6 ; base_x0
|
||||
movq m0, [rsp+r2]
|
||||
lea r2d, [r3+dxq]
|
||||
shr r3d, 6 ; base_x1
|
||||
movhps m0, [rsp+r3]
|
||||
lea r3d, [r2+dxq]
|
||||
shr r2d, 6 ; base_x2
|
||||
movq m1, [rsp+r2]
|
||||
lea r2d, [r3+dxq]
|
||||
shr r3d, 6 ; base_x3
|
||||
movhps m1, [rsp+r3]
|
||||
pand m2, m8, m6
|
||||
paddsw m5, m6, m7
|
||||
psubw m3, m9, m2
|
||||
psllw m2, 8
|
||||
pshufb m0, m11
|
||||
por m2, m3
|
||||
pmaddubsw m0, m2
|
||||
pand m2, m8, m5
|
||||
psubw m3, m9, m2
|
||||
psllw m2, 8
|
||||
pshufb m1, m11
|
||||
por m2, m3
|
||||
pmaddubsw m1, m2
|
||||
cmp r3d, 127 ; topleft
|
||||
jge .w4_toponly
|
||||
movzx r3d, byte [rsp+8*2+0] ; base_y0
|
||||
movq m3, [rsp+r3]
|
||||
movzx r3d, byte [rsp+8*2+2] ; base_y1
|
||||
movhps m3, [rsp+r3]
|
||||
movzx r3d, byte [rsp+8*2+4] ; base_y2
|
||||
movq m4, [rsp+r3]
|
||||
movzx r3d, byte [rsp+8*2+6] ; base_y3
|
||||
movhps m4, [rsp+r3]
|
||||
pshufb m3, m12
|
||||
pshufb m4, m12
|
||||
punpckldq m2, m3, m4
|
||||
punpckhdq m3, m4
|
||||
movddup m4, [rsp+8*3]
|
||||
pmaddubsw m2, m4
|
||||
pmaddubsw m3, m4
|
||||
psraw m6, 15 ; base_x < topleft
|
||||
pand m2, m6
|
||||
pandn m6, m0
|
||||
por m0, m2, m6
|
||||
psraw m6, m5, 15
|
||||
pand m3, m6
|
||||
pandn m6, m1
|
||||
por m1, m3, m6
|
||||
.w4_toponly:
|
||||
pmulhrsw m0, m10
|
||||
pmulhrsw m1, m10
|
||||
movifnidn strideq, stridemp
|
||||
packuswb m0, m1
|
||||
movd [dstq+strideq*0], m0
|
||||
pshuflw m1, m0, q1032
|
||||
movd [dstq+strideq*1], m1
|
||||
lea dstq, [dstq+strideq*2]
|
||||
punpckhqdq m0, m0
|
||||
movd [dstq+strideq*0], m0
|
||||
psrlq m0, 32
|
||||
movd [dstq+strideq*1], m0
|
||||
sub hd, 4
|
||||
jz .w4_end
|
||||
movq m4, [rsp+8*2]
|
||||
movq m3, [rsp+16*6+8*1]
|
||||
paddw m6, m5, m7 ; xpos += dx
|
||||
psubw m4, m3
|
||||
movq [rsp+8*2], m4
|
||||
lea dstq, [dstq+strideq*2]
|
||||
cmp r2d, r11d
|
||||
jge .w4_loop
|
||||
movddup m5, [rsp+8*3]
|
||||
.w4_leftonly_loop:
|
||||
movzx r3d, byte [rsp+8*2+0] ; base_y0
|
||||
movq m1, [rsp+r3]
|
||||
movzx r3d, byte [rsp+8*2+2] ; base_y1
|
||||
movhps m1, [rsp+r3]
|
||||
movzx r3d, byte [rsp+8*2+4] ; base_y2
|
||||
movq m2, [rsp+r3]
|
||||
movzx r3d, byte [rsp+8*2+6] ; base_y3
|
||||
movhps m2, [rsp+r3]
|
||||
psubw m4, m3
|
||||
pshufb m1, m12
|
||||
pshufb m2, m12
|
||||
movq [rsp+8*2], m4
|
||||
punpckldq m0, m1, m2
|
||||
punpckhdq m1, m2
|
||||
pmaddubsw m0, m5
|
||||
pmaddubsw m1, m5
|
||||
movifnidn strideq, stridemp
|
||||
pmulhrsw m0, m10
|
||||
pmulhrsw m1, m10
|
||||
packuswb m0, m1
|
||||
movd [dstq+strideq*0], m0
|
||||
pshuflw m1, m0, q1032
|
||||
movd [dstq+strideq*1], m1
|
||||
lea dstq, [dstq+strideq*2]
|
||||
punpckhqdq m0, m0
|
||||
movd [dstq+strideq*0], m0
|
||||
psrlq m0, 32
|
||||
movd [dstq+strideq*1], m0
|
||||
lea dstq, [dstq+strideq*2]
|
||||
sub hd, 4
|
||||
jg .w4_leftonly_loop
|
||||
.w4_end:
|
||||
sub r9d, 1<<8
|
||||
jl .w4_ret
|
||||
movq m4, [rsp+8*1]
|
||||
%if ARCH_X86_64
|
||||
add r8, 4
|
||||
mov dstq, r8
|
||||
%else
|
||||
mov dstq, r8
|
||||
add dstq, 4
|
||||
%endif
|
||||
paddw m4, [rsp+8*0] ; base_y += 4*dy
|
||||
movzx r3d, word [rsp+16*15+8*1]
|
||||
add r10d, r3d
|
||||
movddup m6, [rsp+16*15+8*1]
|
||||
paddw m6, [rsp+16*12] ; base_x += (4 << upsample_above)
|
||||
jmp .w4_loop0
|
||||
.w4_ret:
|
||||
RET
|
||||
.w8:
|
||||
test angled, 0x400
|
||||
jnz .w4_main
|
||||
movd m5, [tlq+8]
|
||||
lea r3d, [angleq+126]
|
||||
pshufb m5, m7
|
||||
%if ARCH_X86_64
|
||||
mov r3b, hb
|
||||
%else
|
||||
xor r3b, r3b
|
||||
or r3d, hd
|
||||
%endif
|
||||
movd [rsp+16*8+8], m5
|
||||
cmp r3d, 8
|
||||
ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
|
||||
call .upsample_above
|
||||
sub angled, 53
|
||||
lea r3d, [hq+7]
|
||||
xor angled, 0x7f ; 180 - angle
|
||||
movu m1, [base+z_filter_wh8]
|
||||
movd m0, r3d
|
||||
movd m6, angled
|
||||
shr angled, 8 ; is_sm << 1
|
||||
psrldq m2, [base+z_filter_t_w48+angleq*8], 4
|
||||
pshufb m0, m7
|
||||
pshufb m6, m7
|
||||
pcmpeqb m0, m1
|
||||
pand m6, m0
|
||||
pcmpgtb m6, m2
|
||||
%if ARCH_X86_64
|
||||
movq [rsp+16*15+8*1], m10 ; 8<<6
|
||||
%else
|
||||
movq m0, m10
|
||||
movq [rsp+16*15+8*1], m0
|
||||
%endif
|
||||
jmp .w8_filter_left
|
||||
.w8_no_upsample_above:
|
||||
lea r3d, [hq+7]
|
||||
mov [rsp], angled
|
||||
sub angled, 90
|
||||
movd m0, r3d
|
||||
mov r3d, 90
|
||||
movd m1, angled
|
||||
sub r3d, angled ; 180 - angle
|
||||
shr angled, 8 ; is_sm << 1
|
||||
movu m3, [base+z_filter_wh8]
|
||||
psrldq m4, [base+z_filter_t_w48+angleq*8], 4
|
||||
call .w8_filter_top
|
||||
mov r3d, [rsp]
|
||||
sub r3d, 141
|
||||
%if ARCH_X86_64
|
||||
mov r3b, hb
|
||||
%else
|
||||
xor r3b, r3b
|
||||
or r3d, hd
|
||||
%endif
|
||||
cmp r3d, 8
|
||||
jbe .upsample_left ; angle > 140 && h <= 8 && !is_sm
|
||||
.w8_filter_left:
|
||||
pmovmskb r5d, m6
|
||||
test r5d, r5d
|
||||
jz .w4_main
|
||||
imul r5d, 0x55555555
|
||||
mov r3, tlq
|
||||
shr r5d, 30
|
||||
sub r5, 3 ; filter_strength-3
|
||||
jmp .filter_left
|
||||
.w8_filter_top:
|
||||
movd m6, r3d
|
||||
REPX {pshufb x, m7}, m0, m1, m6
|
||||
pcmpeqb m0, m3
|
||||
pand m1, m0
|
||||
pand m6, m0
|
||||
pcmpgtb m1, m4
|
||||
pcmpgtb m6, m4
|
||||
pmovmskb r5d, m1
|
||||
test r5d, r5d
|
||||
jz .w8_filter_top_end ; filter_strength == 0
|
||||
imul r5d, 0x55555555
|
||||
movq m0, [rsp+gprsize+16*8-2]
|
||||
shr r5d, 30
|
||||
movq m1, [rsp+gprsize+16*8-1]
|
||||
sub r5, 3 ; filter_strength-3
|
||||
movddup m7, [base+z_filter_k+8*2+r5*8+24*0]
|
||||
punpcklbw m0, m1
|
||||
pmaddubsw m0, m7
|
||||
movq m1, [rsp+gprsize+16*8+0]
|
||||
movq m2, [rsp+gprsize+16*8+1]
|
||||
movddup m7, [base+z_filter_k+8*2+r5*8+24*1]
|
||||
punpcklbw m1, m2
|
||||
pmaddubsw m1, m7
|
||||
movq m2, [rsp+gprsize+16*8+2]
|
||||
movddup m7, [base+z_filter_k+8*2+r5*8+24*2]
|
||||
punpcklbw m2, m2
|
||||
pmaddubsw m2, m7
|
||||
paddw m0, m1
|
||||
paddw m0, m2
|
||||
%if ARCH_X86_64
|
||||
mov r3d, r7m ; maxw, offset due to call
|
||||
%else
|
||||
mov r3d, [rsp+gprsize+16*18+4*3]
|
||||
%endif
|
||||
pmulhrsw m0, m10
|
||||
pmulhrsw m1, m10
|
||||
packuswb m0, m1
|
||||
movq [rsp+gprsize+16*8], m0
|
||||
cmp r3d, 8
|
||||
jge .w8_filter_top_end
|
||||
movq m0, [tlq+r3+1]
|
||||
movq [rsp+gprsize+r3+16*8], m0
|
||||
.w8_filter_top_end:
|
||||
ret
|
||||
.w16:
|
||||
test angled, 0x400
|
||||
jnz .w4_main
|
||||
lea r3d, [hq+15]
|
||||
sub angled, 90
|
||||
movd m0, r3d
|
||||
mov r3d, 90
|
||||
movd m1, angled
|
||||
sub r3d, angled ; 180 - angle
|
||||
shr angled, 8 ; is_sm << 1
|
||||
movd m6, r3d
|
||||
REPX {pshufb x, m7}, m0, m1, m6
|
||||
movq m3, [base+z_filter_t_w16+angleq*4]
|
||||
pcmpeqb m0, [base+z_filter_wh16]
|
||||
pand m1, m0
|
||||
pand m6, m0
|
||||
pcmpgtb m1, m3
|
||||
pcmpgtb m6, m3
|
||||
pmovmskb r5d, m1
|
||||
mov r3, tlq
|
||||
test r5d, r5d
|
||||
jz .w16_filter_left ; filter_strength == 0
|
||||
imul r5d, 0x24924924
|
||||
pshufb m5, [base+z_filter_t_w16] ; tlq[16]
|
||||
shr r5d, 30
|
||||
adc r5, -4 ; filter_strength-3
|
||||
movd [rsp+16*9], m5
|
||||
movddup m7, [base+z_filter_k+8*2+r5*8+24*0]
|
||||
movu m1, [rsp+16*8-2]
|
||||
movu m2, [rsp+16*8-1]
|
||||
punpcklbw m0, m1, m2
|
||||
pmaddubsw m0, m7
|
||||
punpckhbw m1, m2
|
||||
pmaddubsw m1, m7
|
||||
movddup m7, [base+z_filter_k+8*2+r5*8+24*1]
|
||||
mova m3, [rsp+16*8+0]
|
||||
movu m4, [rsp+16*8+1]
|
||||
punpcklbw m2, m3, m4
|
||||
pmaddubsw m2, m7
|
||||
punpckhbw m3, m4
|
||||
pmaddubsw m3, m7
|
||||
paddw m0, m2
|
||||
paddw m1, m3
|
||||
test r5d, r5d
|
||||
jnz .w16_filter_end ; 3-tap
|
||||
movddup m7, [base+z_filter_k+8*8]
|
||||
movu m3, [rsp+16*8+2]
|
||||
punpcklbw m2, m3, m3
|
||||
pmaddubsw m2, m7
|
||||
punpckhbw m3, m3
|
||||
pmaddubsw m3, m7
|
||||
paddw m0, m2
|
||||
paddw m1, m3
|
||||
.w16_filter_end:
|
||||
mov r2d, maxwm
|
||||
pmulhrsw m0, m10
|
||||
pmulhrsw m1, m10
|
||||
packuswb m0, m1
|
||||
mova [rsp+16*8], m0
|
||||
cmp r2d, 16
|
||||
jge .w16_filter_left
|
||||
movu m0, [r3+r2+1]
|
||||
movu [rsp+r2+16*8], m0
|
||||
.w16_filter_left:
|
||||
pmovmskb r5d, m6
|
||||
test r5d, r5d
|
||||
jz .w4_main
|
||||
imul r5d, 0x24924924
|
||||
shr r5d, 30
|
||||
adc r5, -4 ; filter_strength-3
|
||||
jmp .filter_left
|
||||
.w32:
|
||||
test angled, 0x400
|
||||
jnz .w4_main
|
||||
pshufb m6, [base+z_filter_t_w16] ; tlq[32]
|
||||
mov r3, tlq
|
||||
lea tlq, [rsp+16*9]
|
||||
movd [tlq+16*1], m6
|
||||
xor r5d, r5d ; filter_strength = 3
|
||||
call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
|
||||
mova m0, [tlq+16*0]
|
||||
mova m1, [tlq+16*1]
|
||||
mov r2d, maxwm
|
||||
mova [rsp+16*8], m0
|
||||
mova [rsp+16*9], m1
|
||||
cmp r2d, 32
|
||||
jge .filter_left
|
||||
movu m0, [r3+r2+16*0+1]
|
||||
movu m1, [r3+r2+16*1+1]
|
||||
movu [rsp+r2+16*8], m0
|
||||
movu [rsp+r2+16*9], m1
|
||||
jmp .filter_left
|
||||
.w64:
|
||||
movu m0, [tlq+16*2+1]
|
||||
movu m1, [tlq+16*3+1]
|
||||
mova [rsp+16*10], m0
|
||||
mova [rsp+16*11], m1
|
||||
test angled, 0x400
|
||||
jnz .w4_main
|
||||
pshufb m1, [base+z_filter_t_w16] ; tlq[64]
|
||||
mov r3, tlq
|
||||
lea tlq, [rsp+16*11]
|
||||
movd [tlq+16*1], m1
|
||||
xor r5d, r5d ; filter_strength = 3
|
||||
call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
|
||||
sub tlq, 16*2
|
||||
call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
|
||||
mova m0, [tlq+16*0]
|
||||
mova m1, [tlq+16*1]
|
||||
mova m2, [tlq+16*2]
|
||||
mova m3, [tlq+16*3]
|
||||
mov r2d, maxwm
|
||||
mova [rsp+16* 8], m0
|
||||
mova [rsp+16* 9], m1
|
||||
mova [rsp+16*10], m2
|
||||
mova [rsp+16*11], m3
|
||||
cmp r2d, 64
|
||||
jge .filter_left
|
||||
movu m0, [r3+r2+16*0+1]
|
||||
movu m1, [r3+r2+16*1+1]
|
||||
movu [rsp+r2+16* 8], m0
|
||||
movu [rsp+r2+16* 9], m1
|
||||
cmp r2d, 32
|
||||
jge .filter_left
|
||||
movu m0, [r3+r2+16*2+1]
|
||||
movu m1, [r3+r2+16*3+1]
|
||||
movu [rsp+r2+16*10], m0
|
||||
movu [rsp+r2+16*11], m1
|
||||
.filter_left:
|
||||
neg hq
|
||||
movd m0, [r3+hq]
|
||||
pxor m1, m1
|
||||
pshufb m0, m1
|
||||
movd [rsp+16*6+hq-4], m0
|
||||
lea tlq, [rsp+16*5]
|
||||
call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
|
||||
cmp hd, -32
|
||||
jge .filter_left_end
|
||||
sub tlq, 16*2
|
||||
call mangle(private_prefix %+ _ipred_z1_8bpc_ssse3).filter_edge
|
||||
mova m0, [tlq+16*0]
|
||||
mova m1, [tlq+16*1]
|
||||
mova [rsp+16*2], m0
|
||||
mova [rsp+16*3], m1
|
||||
.filter_left_end:
|
||||
mov r2d, maxhm
|
||||
mova m0, [rsp+16*5]
|
||||
mova m1, [rsp+16*6]
|
||||
mova m2, [rsp+16*7]
|
||||
neg r2
|
||||
mova [rsp+16*4], m0
|
||||
mova [rsp+16*5], m1
|
||||
mova [rsp+16*6], m2
|
||||
cmp r2d, hd
|
||||
jle .w4_main
|
||||
movu m0, [r3+r2-16*2]
|
||||
movu m1, [r3+r2-16*1]
|
||||
movu [rsp+r2+16*4], m0
|
||||
movu [rsp+r2+16*5], m1
|
||||
cmp r2d, -32
|
||||
jle .w4_main
|
||||
movu m0, [r3+r2-16*4]
|
||||
movu m1, [r3+r2-16*3]
|
||||
movu [rsp+r2+16*2], m0
|
||||
movu [rsp+r2+16*3], m1
|
||||
jmp .w4_main
|
||||
|
||||
%if ARCH_X86_64
|
||||
cglobal ipred_z3_8bpc, 4, 9, 11, 16*10, dst, stride, tl, w, h, angle, dy, _, org_w
|
||||
%define base r7-$$
|
||||
|
|
|
@ -317,6 +317,9 @@ static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, cons
|
|||
assign_itx16_bpc_fn(R, 16, 8, 12, avx2);
|
||||
assign_itx12_bpc_fn( , 16, 16, 12, avx2);
|
||||
assign_itx2_bpc_fn (R, 32, 8, 12, avx2);
|
||||
assign_itx_bpc_fn(R, 16, 32, identity_identity, IDTX, 12, avx2);
|
||||
assign_itx_bpc_fn(R, 32, 16, identity_identity, IDTX, 12, avx2);
|
||||
assign_itx_bpc_fn( , 32, 32, identity_identity, IDTX, 12, avx2);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -353,6 +356,7 @@ static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, cons
|
|||
assign_itx2_bpc_fn (R, 32, 8, 10, avx512icl);
|
||||
assign_itx2_bpc_fn (R, 32, 16, 10, avx512icl);
|
||||
assign_itx2_bpc_fn ( , 32, 32, 10, avx512icl);
|
||||
assign_itx1_bpc_fn (R, 16, 64, 10, avx512icl);
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
|
|
@ -3137,10 +3137,14 @@ INV_TXFM_8X16_FN identity, adst
|
|||
INV_TXFM_8X16_FN identity, flipadst
|
||||
INV_TXFM_8X16_FN identity, identity
|
||||
|
||||
%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16394]
|
||||
%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384]
|
||||
pmulhrsw m%2, m%3, m%1
|
||||
%if %0 == 4 ; if downshifting by 1
|
||||
%ifnum %4
|
||||
pmulhrsw m%2, m%4
|
||||
%else ; without rounding
|
||||
psraw m%2, 1
|
||||
%endif
|
||||
%else
|
||||
paddsw m%1, m%1
|
||||
%endif
|
||||
|
@ -6837,10 +6841,11 @@ ALIGN function_align
|
|||
ret
|
||||
|
||||
cglobal inv_txfm_add_identity_identity_16x32_10bpc, 4, 7, 12, dst, stride, c, eob
|
||||
vpbroadcastd m7, [pixel_10bpc_max]
|
||||
.pass1:
|
||||
vpbroadcastd m8, [pw_2896x8]
|
||||
vpbroadcastd m9, [pw_1697x16]
|
||||
vpbroadcastd m11, [pw_8192]
|
||||
vpbroadcastd m7, [pixel_10bpc_max]
|
||||
lea r6, [strideq*5]
|
||||
pxor m6, m6
|
||||
paddw m10, m11, m11 ; pw_16384
|
||||
|
@ -6910,11 +6915,15 @@ ALIGN function_align
|
|||
punpckhqdq m1, m3, m2
|
||||
jmp m(iidentity_8x8_internal_10bpc).write_2x8x2
|
||||
|
||||
; 12 bpc entry point for the 16x32 identity/identity transform.
; It only loads pixel_12bpc_max into m7 and then continues at the .pass1 label
; of the 10 bpc function (which loads pixel_10bpc_max into m7 right before that
; label), so both bit depths share the same body.
cglobal inv_txfm_add_identity_identity_16x32_12bpc, 4, 7, 12, dst, stride, c, eob
|
||||
vpbroadcastd m7, [pixel_12bpc_max]
|
||||
jmp m(inv_txfm_add_identity_identity_16x32_10bpc).pass1
|
||||
|
||||
cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob
|
||||
test eobd, eobd
|
||||
jz .dconly
|
||||
PROLOGUE 0, 8, 16, 32*40, dst, stride, c, eob
|
||||
%undef cmp
|
||||
%undef cmp
|
||||
vpbroadcastd m12, [clip_18b_min]
|
||||
vpbroadcastd m13, [clip_18b_max]
|
||||
lea r6, [rsp+32*4]
|
||||
|
@ -7136,10 +7145,11 @@ ALIGN function_align
|
|||
jmp m(idct_16x8_internal_10bpc).write_16x4
|
||||
|
||||
cglobal inv_txfm_add_identity_identity_32x16_10bpc, 4, 7, 11, dst, stride, c, eob
|
||||
vpbroadcastd m7, [pixel_10bpc_max]
|
||||
.pass1:
|
||||
vpbroadcastd m8, [pw_2896x8]
|
||||
vpbroadcastd m9, [pw_1697x16]
|
||||
vpbroadcastd m10, [pw_2048]
|
||||
vpbroadcastd m7, [pixel_10bpc_max]
|
||||
vpbroadcastd m10, [pw_4096]
|
||||
lea r6, [strideq*5]
|
||||
pxor m6, m6
|
||||
mov r5, dstq
|
||||
|
@ -7187,16 +7197,20 @@ ALIGN function_align
|
|||
packssdw m3, [cq+64*7]
|
||||
REPX {pmulhrsw x, m8 }, m0, m1, m2, m3
|
||||
REPX {paddsw x, x }, m0, m1, m2, m3
|
||||
REPX {IDTX16 x, 4, 9 }, 0, 1, 2, 3
|
||||
REPX {IDTX16 x, 4, 9, _ }, 0, 1, 2, 3
|
||||
REPX {pmulhrsw x, m10}, m0, m1, m2, m3
|
||||
REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
|
||||
jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2
|
||||
|
||||
; 12 bpc entry point for the 32x16 identity/identity transform.
; It only loads pixel_12bpc_max into m7 and then continues at the .pass1 label
; of the 10 bpc function (which loads pixel_10bpc_max into m7 right before that
; label), so both bit depths share the same body.
cglobal inv_txfm_add_identity_identity_32x16_12bpc, 4, 7, 11, dst, stride, c, eob
|
||||
vpbroadcastd m7, [pixel_12bpc_max]
|
||||
jmp m(inv_txfm_add_identity_identity_32x16_10bpc).pass1
|
||||
|
||||
cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob
|
||||
test eobd, eobd
|
||||
jz .dconly
|
||||
PROLOGUE 0, 8, 16, 32*83, dst, stride, c, eob
|
||||
%undef cmp
|
||||
%undef cmp
|
||||
vpbroadcastd m12, [clip_18b_min]
|
||||
vpbroadcastd m13, [clip_18b_max]
|
||||
lea r6, [rsp+32*7]
|
||||
|
@ -7364,9 +7378,10 @@ ALIGN function_align
|
|||
jmp m(idct_16x16_internal_8bpc).main
|
||||
|
||||
cglobal inv_txfm_add_identity_identity_32x32_10bpc, 4, 8, 8, dst, stride, c, eob
|
||||
%undef cmp
|
||||
vpbroadcastd m5, [pw_8192]
|
||||
%undef cmp
|
||||
vpbroadcastd m7, [pixel_10bpc_max]
|
||||
.pass1:
|
||||
vpbroadcastd m5, [pw_8192]
|
||||
pxor m6, m6
|
||||
lea r6, [strideq*3]
|
||||
lea r5, [strideq*5]
|
||||
|
@ -7432,6 +7447,10 @@ ALIGN function_align
|
|||
REPX {pmulhrsw x, m5}, m0, m1, m2, m3
|
||||
jmp m(inv_txfm_add_identity_identity_8x32_10bpc).main_zero
|
||||
|
||||
; 12 bpc entry point for the 32x32 identity/identity transform.
; It only loads pixel_12bpc_max into m7 and then continues at the .pass1 label
; of the 10 bpc function (which loads pixel_10bpc_max into m7 right before that
; label), so both bit depths share the same body.
cglobal inv_txfm_add_identity_identity_32x32_12bpc, 4, 8, 8, dst, stride, c, eob
|
||||
vpbroadcastd m7, [pixel_12bpc_max]
|
||||
jmp m(inv_txfm_add_identity_identity_32x32_10bpc).pass1
|
||||
|
||||
%macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4])
|
||||
%if %1 & 1
|
||||
mova m%5, [r5-32*(51-%1)] ; idct16 out 0+n
|
||||
|
@ -7472,7 +7491,7 @@ cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob
|
|||
test eobd, eobd
|
||||
jz .dconly
|
||||
PROLOGUE 0, 10, 16, 32*98, dst, stride, c, eob
|
||||
%undef cmp
|
||||
%undef cmp
|
||||
vpbroadcastd m11, [pd_2048]
|
||||
vpbroadcastd m12, [clip_18b_min]
|
||||
vpbroadcastd m13, [clip_18b_max]
|
||||
|
@ -7814,7 +7833,7 @@ cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob
|
|||
test eobd, eobd
|
||||
jz .dconly
|
||||
PROLOGUE 0, 11, 16, 32*134, dst, stride, c, eob
|
||||
%undef cmp
|
||||
%undef cmp
|
||||
vpbroadcastd m12, [clip_18b_min]
|
||||
vpbroadcastd m13, [clip_18b_max]
|
||||
lea r6, [rsp+32*6]
|
||||
|
@ -8043,7 +8062,7 @@ cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob
|
|||
RET
|
||||
.normal:
|
||||
PROLOGUE 0, 8, 16, 32*96, dst, stride, c, eob
|
||||
%undef cmp
|
||||
%undef cmp
|
||||
vpbroadcastd m11, [pd_2048]
|
||||
vpbroadcastd m12, [clip_18b_min]
|
||||
vpbroadcastd m13, [clip_18b_max]
|
||||
|
@ -8262,7 +8281,7 @@ cglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst, stride, c, eob
|
|||
test eobd, eobd
|
||||
jz .dconly
|
||||
PROLOGUE 0, 8, 16, 32*163, dst, stride, c, eob
|
||||
%undef cmp
|
||||
%undef cmp
|
||||
vpbroadcastd m11, [pd_2048]
|
||||
vpbroadcastd m12, [clip_18b_min]
|
||||
vpbroadcastd m13, [clip_18b_max]
|
||||
|
@ -8411,7 +8430,7 @@ cglobal inv_txfm_add_dct_dct_64x64_10bpc, 4, 7, 0, dst, stride, c, eob
|
|||
test eobd, eobd
|
||||
jz .dconly
|
||||
PROLOGUE 0, 11, 16, 32*195, dst, stride, c, eob
|
||||
%undef cmp
|
||||
%undef cmp
|
||||
vpbroadcastd m11, [pd_2048]
|
||||
vpbroadcastd m12, [clip_18b_min]
|
||||
vpbroadcastd m13, [clip_18b_max]
|
||||
|
|
|
@ -174,6 +174,8 @@ cextern inv_txfm_add_dct_dct_32x16_8bpc_avx512icl.main_oddhalf_fast2
|
|||
cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf
|
||||
cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast
|
||||
cextern inv_txfm_add_dct_dct_32x32_8bpc_avx512icl.main_oddhalf_fast2
|
||||
cextern inv_txfm_add_dct_dct_16x64_8bpc_avx512icl.main_oddhalf
|
||||
cextern inv_txfm_add_dct_dct_16x64_8bpc_avx512icl.main_oddhalf_fast
|
||||
|
||||
SECTION .text
|
||||
|
||||
|
@ -3815,4 +3817,317 @@ cglobal inv_txfm_add_identity_identity_32x32_10bpc, 4, 7, 16, dst, stride, c, eo
|
|||
punpckhdq m8, m0 ; 6 7
|
||||
ret
|
||||
|
||||
; 16x64 inverse DCT_DCT + add for 10 bpc.
; Path selection on eob:
;   eob == 0         -> .dconly
;   eob <  36        -> .fast (no .pass1 call)
;   36 <= eob < 151  -> one .pass1 call, then the fall-through path below
;   eob >= 151       -> .full (two .pass1 calls, second one on cq+64)
; The second pass is delegated to the 8 bpc helpers (idct_16x16 main_fast*,
; 16x32 main_oddhalf_fast*, 16x64 main_oddhalf*). All three non-DC paths spill
; m14-m21 to the stack and converge on .pass2_end, which performs 16 calls to
; the idct_16x8 10 bpc write_16x4 helper.
cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob
|
||||
lea r5, [o_base]
|
||||
; eob == 0 -> .dconly
test eobd, eobd
|
||||
jz .dconly
|
||||

||||
; 8*mmsize bytes of stack: used below for the m14-m21 spill slots
PROLOGUE 4, 7, 32, -8*mmsize, dst, stride, c, eob
|
||||
%undef cmp
|
||||
vpbroadcastd m12, [o(pd_2896)]
|
||||
vpbroadcastd m13, [o(pd_2048)]
|
||||
vpbroadcastd m14, [o(clip_18b_min)]
|
||||
vpbroadcastd m15, [o(clip_18b_max)]
|
||||
; eob < 36 -> .fast
cmp eobd, 36
|
||||
jl .fast
|
||||
call .pass1
|
||||
; eob >= 151 -> .full (needs a second .pass1 call)
cmp eobd, 151
|
||||
jge .full
|
||||
; r5 is repointed to o_base_8bpc before the 8 bpc helpers are called
lea r5, [o_base_8bpc]
|
||||

||||
punpckhwd m22, m0, m0
|
||||
punpckhwd m23, m1, m1
|
||||
punpckhwd m24, m2, m2
|
||||
punpckhwd m25, m3, m3
|
||||
punpckhwd m26, m4, m4
|
||||
punpckhwd m27, m5, m5
|
||||
punpckhwd m28, m6, m6
|
||||
punpckhwd m29, m7, m7
|
||||
punpcklwd m21, m1, m1
|
||||
punpcklwd m14, m3, m3
|
||||
punpcklwd m18, m5, m5
|
||||
punpcklwd m15, m7, m7
|
||||
pxor m9, m9
|
||||
punpcklwd m9, m9, m0
|
||||
punpcklwd m8, m2, m2
|
||||
punpcklwd m7, m4, m4
|
||||
punpcklwd m1, m6, m6
|
||||
call m(idct_16x16_internal_8bpc).main_fast2
|
||||
call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
|
||||
; spill m14-m21; they are reloaded from these slots in .pass2_end
mova [rsp+mmsize*0], m14
|
||||
mova [rsp+mmsize*1], m15
|
||||
mova [rsp+mmsize*2], m16
|
||||
mova [rsp+mmsize*3], m17
|
||||
mova [rsp+mmsize*4], m18
|
||||
mova [rsp+mmsize*5], m19
|
||||
mova [rsp+mmsize*6], m20
|
||||
mova [rsp+mmsize*7], m21
|
||||
call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
|
||||

||||
; zero coefficients: r3 = 192, 128, 64, 0 -> 16 full-register stores at
; 128-byte intervals (offsets r3*8 + 128*x)
pxor m12, m12
|
||||
mov r3d, 64*3
|
||||
.zero_loop:
|
||||
REPX {mova [cq+r3*8+128*x], m12}, 0, 1, 2, 3
|
||||
sub r3d, 64
|
||||
jge .zero_loop
|
||||

||||
jmp .pass2_end
|
||||
.full:
|
||||
; park the first .pass1 output (m0-m7) in the coefficient buffer
mova [cq+128*0], m0
|
||||
mova [cq+128*1], m1
|
||||
mova [cq+128*2], m2
|
||||
mova [cq+128*3], m3
|
||||
mova [cq+128*4], m4
|
||||
mova [cq+128*5], m5
|
||||
mova [cq+128*6], m6
|
||||
mova [cq+128*7], m7
|
||||
; second .pass1 call reads from cq+64; cq is restored afterwards
add cq, 64
|
||||
call .pass1
|
||||
sub cq, 64
|
||||
; reload the first-pass output into m22-m29 before the stores below reuse
; part of the same area (cq+64*8 is the same address as cq+128*4)
mova m22, [cq+128*0] ; 0 1
|
||||
mova m23, [cq+128*1] ; 2 3
|
||||
mova m24, [cq+128*2] ; 4 5
|
||||
mova m25, [cq+128*3] ; 6 7
|
||||
mova m26, [cq+128*4] ; 8 9
|
||||
mova m27, [cq+128*5] ; 10 11
|
||||
mova m28, [cq+128*6] ; 12 13
|
||||
mova m29, [cq+128*7] ; 14 15
|
||||
; park the second .pass1 output; it is reloaded into m14-m21 further down
mova [cq+64* 8], m0
|
||||
mova [cq+64* 9], m1
|
||||
mova [cq+64*10], m2
|
||||
mova [cq+64*11], m3
|
||||
mova [cq+64*12], m4
|
||||
mova [cq+64*13], m5
|
||||
mova [cq+64*14], m6
|
||||
mova [cq+64*15], m7
|
||||
; r5 is repointed to o_base_8bpc before the 8 bpc helpers are called
lea r5, [o_base_8bpc]
|
||||

||||
punpcklwd m20, m1, m1
|
||||
punpcklwd m16, m3, m3
|
||||
punpcklwd m19, m5, m5
|
||||
punpcklwd m17, m7, m7
|
||||
punpcklwd m8, m24, m24 ; 4
|
||||
punpcklwd m5, m2, m2 ; 20
|
||||
punpcklwd m1, m28, m28 ; 12
|
||||
punpcklwd m7, m26, m26 ; 8
|
||||
punpcklwd m3, m4, m4 ; 24
|
||||
punpcklwd m4, m6, m6 ; 28
|
||||
pxor m9, m9
|
||||
punpcklwd m6, m9, m0 ; __ 16
|
||||
mova m0, m4
|
||||
punpcklwd m9, m9, m22 ; __ 0
|
||||
call m(idct_16x16_internal_8bpc).main_fast
|
||||
punpcklwd m21, m23, m23 ; 2
|
||||
punpcklwd m15, m29, m29 ; 14
|
||||
punpcklwd m18, m27, m27 ; 10
|
||||
punpcklwd m14, m25, m25 ; 6
|
||||
call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
|
||||
; spill m14-m21; they are reloaded from these slots in .pass2_end
mova [rsp+mmsize*0], m14
|
||||
mova [rsp+mmsize*1], m15
|
||||
mova [rsp+mmsize*2], m16
|
||||
mova [rsp+mmsize*3], m17
|
||||
mova [rsp+mmsize*4], m18
|
||||
mova [rsp+mmsize*5], m19
|
||||
mova [rsp+mmsize*6], m20
|
||||
mova [rsp+mmsize*7], m21
|
||||
; fetch the parked second-pass output (cq+64*8..15) into m14-m21
mova m21, [cq+64*15]
|
||||
mova m14, [cq+64* 8]
|
||||
mova m17, [cq+64*11]
|
||||
mova m18, [cq+64*12]
|
||||
mova m19, [cq+64*13]
|
||||
mova m16, [cq+64*10]
|
||||
mova m15, [cq+64* 9]
|
||||
mova m20, [cq+64*14]
|
||||
REPX {punpckhwd x, x}, m22, m21, m14, m29, m26, m17, m18, m25, \
|
||||
m24, m19, m16, m27, m28, m15, m20, m23
|
||||
call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf
|
||||

||||
; zero coefficients: r3 = 224, 192, ..., 0 -> 32 full-register stores at
; 64-byte intervals (offsets r3*8 + 64*x)
pxor m12, m12
|
||||
mov r3d, 32*7
|
||||
.full_zero_loop:
|
||||
REPX {mova [cq+r3*8+64*x], m12}, 0, 1, 2, 3
|
||||
sub r3d, 32
|
||||
jge .full_zero_loop
|
||||

||||
jmp .pass2_end
|
||||
.fast:
|
||||
; eob < 36: only a ymm-sized load from each of the 128-byte rows 0-7 is used
mova ym0, [cq+128*0]
|
||||
mova ym2, [cq+128*4]
|
||||
movshdup m8, [o(permB)]
|
||||
mova ym1, [cq+128*2]
|
||||
mova ym3, [cq+128*6]
|
||||
mova ym4, [cq+128*1]
|
||||
mova ym5, [cq+128*3]
|
||||
mova ym6, [cq+128*5]
|
||||
mova ym7, [cq+128*7]
|
||||
vpermt2q m0, m8, m2 ; 0 4
|
||||
vpermt2q m1, m8, m3 ; 2 6
|
||||
vpermt2q m4, m8, m5 ; 1 3
|
||||
vpermt2q m7, m8, m6 ; 7 5
|
||||
call m(idct_8x8_internal_10bpc).main_fast
|
||||
call m(idct_16x8_internal_10bpc).main_fast
|
||||
vpbroadcastd m11, [o(pd_2)]
|
||||
call m(idct_8x16_internal_10bpc).main_end2
|
||||
mova m8, [o(idct8x32p)]
|
||||
packssdw m0, m4
|
||||
packssdw m1, m5
|
||||
packssdw m2, m6
|
||||
packssdw m3, m7
|
||||
mova m6, [dup16_perm]
|
||||
vpermb m0, m8, m0
|
||||
vpermb m2, m8, m2
|
||||
vprold m8, 16
|
||||
vpermb m1, m8, m1
|
||||
vpermb m3, m8, m3
|
||||
punpckldq m4, m0, m2
|
||||
punpckhdq m0, m2
|
||||
punpckldq m2, m1, m3
|
||||
punpckhdq m1, m3
|
||||
punpckldq m21, m4, m2
|
||||
punpckhdq m14, m4, m2
|
||||
punpckldq m18, m0, m1
|
||||
punpckhdq m15, m0, m1
|
||||
vpord m7, m6, [o(pb_32)] {1to16}
|
||||
vpermb m22, m7, m21 ; 1
|
||||
pmovzxwd m9, ym21 ; 0
|
||||
vpermb m8, m6, m18 ; 4
|
||||
vpermb m24, m7, m18 ; 5
|
||||
vpermb m21, m6, m14 ; 2
|
||||
vpermb m23, m7, m14 ; 3
|
||||
vpermb m14, m6, m15 ; 6
|
||||
vpermb m25, m7, m15 ; 7
|
||||
; r5 is repointed to o_base_8bpc before the 8 bpc helpers are called
lea r5, [o_base_8bpc]
|
||||
pslld m9, 16
|
||||

||||
; inputs not produced by the fast first pass are set to zero
pxor m7, m7
|
||||
REPX {mova x, m7}, m1, m18, m15, m26, m27, m28, m29
|
||||

||||
call m(idct_16x16_internal_8bpc).main_fast2
|
||||
call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
|
||||
; spill m14-m21; they are reloaded from these slots in .pass2_end
mova [rsp+mmsize*0], m14
|
||||
mova [rsp+mmsize*1], m15
|
||||
mova [rsp+mmsize*2], m16
|
||||
mova [rsp+mmsize*3], m17
|
||||
mova [rsp+mmsize*4], m18
|
||||
mova [rsp+mmsize*5], m19
|
||||
mova [rsp+mmsize*6], m20
|
||||
mova [rsp+mmsize*7], m21
|
||||

||||
call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
|
||||

||||
; zero the same eight ymm-sized regions that were loaded at .fast
pxor m12, m12
|
||||
REPX {mova [cq+128*x], ym12}, 0, 1, 2, 3, 4, 5, 6, 7
|
||||
.pass2_end:
|
||||
; Common output stage. Sets m11 = pw_2048, m13 = pixel_10bpc_max,
; r6 = stride*3 and m30/m31 = qword permutations derived from permC, then
; feeds register pairs through m8/m9 into write_16x4.
; NOTE(review): presumably this is the register contract of
; idct_16x8_internal_10bpc.write_16x4 - confirm against its definition.
movshdup m30, [permC]
|
||||
vpbroadcastd m11, [pw_2048]
|
||||
vpbroadcastd m13, [pixel_10bpc_max]
|
||||
lea r6, [strideq*3]
|
||||
psrlq m31, m30, 8
|
||||
vpermq m8, m30, m0
|
||||
vpermq m9, m31, m1
|
||||
call m(idct_16x8_internal_10bpc).write_16x4
|
||||
vpermq m8, m30, m2
|
||||
vpermq m9, m31, m3
|
||||
call m(idct_16x8_internal_10bpc).write_16x4
|
||||
vpermq m8, m30, m4
|
||||
vpermq m9, m31, m5
|
||||
call m(idct_16x8_internal_10bpc).write_16x4
|
||||
vpermq m8, m30, m6
|
||||
vpermq m9, m31, m7
|
||||
call m(idct_16x8_internal_10bpc).write_16x4
|
||||

||||
; reload the eight values spilled from m14-m21 into m1-m8
mova m1, [rsp+mmsize*0]
|
||||
mova m2, [rsp+mmsize*1]
|
||||
mova m3, [rsp+mmsize*2]
|
||||
mova m4, [rsp+mmsize*3]
|
||||
mova m5, [rsp+mmsize*4]
|
||||
mova m6, [rsp+mmsize*5]
|
||||
mova m7, [rsp+mmsize*6]
|
||||
mova m8, [rsp+mmsize*7]
|
||||

||||
; saturating butterflies: spill[i] + m(21-i) -> m(i), spill[i] - m(21-i) -> m(21-i)
paddsw m0, m1, m21
|
||||
psubsw m21, m1, m21
|
||||
paddsw m1, m2, m20
|
||||
psubsw m20, m2, m20
|
||||
paddsw m2, m3, m19
|
||||
psubsw m19, m3, m19
|
||||
paddsw m3, m4, m18
|
||||
psubsw m18, m4, m18
|
||||
paddsw m4, m5, m17
|
||||
psubsw m17, m5, m17
|
||||
paddsw m5, m6, m16
|
||||
psubsw m16, m6, m16
|
||||
paddsw m6, m7, m15
|
||||
psubsw m15, m7, m15
|
||||
paddsw m7, m8, m14
|
||||
psubsw m14, m8, m14
|
||||

||||
; write the sums (m0-m7)
vpermq m8, m30, m0
|
||||
vpermq m9, m31, m1
|
||||
call m(idct_16x8_internal_10bpc).write_16x4
|
||||
vpermq m8, m30, m2
|
||||
vpermq m9, m31, m3
|
||||
call m(idct_16x8_internal_10bpc).write_16x4
|
||||
vpermq m8, m30, m4
|
||||
vpermq m9, m31, m5
|
||||
call m(idct_16x8_internal_10bpc).write_16x4
|
||||
vpermq m8, m30, m6
|
||||
vpermq m9, m31, m7
|
||||
call m(idct_16x8_internal_10bpc).write_16x4
|
||||

||||
; write the differences (m14-m21)
vpermq m8, m30, m14
|
||||
vpermq m9, m31, m15
|
||||
call m(idct_16x8_internal_10bpc).write_16x4
|
||||
vpermq m8, m30, m16
|
||||
vpermq m9, m31, m17
|
||||
call m(idct_16x8_internal_10bpc).write_16x4
|
||||
vpermq m8, m30, m18
|
||||
vpermq m9, m31, m19
|
||||
call m(idct_16x8_internal_10bpc).write_16x4
|
||||
vpermq m8, m30, m20
|
||||
vpermq m9, m31, m21
|
||||
call m(idct_16x8_internal_10bpc).write_16x4
|
||||

||||
; write m22-m29
vpermq m8, m30, m22
|
||||
vpermq m9, m31, m23
|
||||
call m(idct_16x8_internal_10bpc).write_16x4
|
||||
vpermq m8, m30, m24
|
||||
vpermq m9, m31, m25
|
||||
call m(idct_16x8_internal_10bpc).write_16x4
|
||||
vpermq m8, m30, m26
|
||||
vpermq m9, m31, m27
|
||||
call m(idct_16x8_internal_10bpc).write_16x4
|
||||
vpermq m8, m30, m28
|
||||
vpermq m9, m31, m29
|
||||
call m(idct_16x8_internal_10bpc).write_16x4
|
||||
RET
|
||||
; First-pass subroutine of the 16x64 10 bpc transform (reached via "call .pass1").
; Loads the even 128-byte rows of cq (0, 2, ..., 14) into m0-m7 and runs
; idct_8x16_internal_10bpc.main, then loads the odd rows (1, 3, ..., 15) into
; m16-m23 and runs idct_16x16_internal_10bpc.main and .main_end. It finishes
; with a tail jump to .main_end3, so that routine's ret returns to the caller
; of .pass1.
.pass1:
|
||||
mova m0, [cq+128* 0]
|
||||
mova m1, [cq+128* 2]
|
||||
mova m2, [cq+128* 4]
|
||||
mova m3, [cq+128* 6]
|
||||
mova m4, [cq+128* 8]
|
||||
mova m5, [cq+128*10]
|
||||
mova m6, [cq+128*12]
|
||||
mova m7, [cq+128*14]
|
||||
call m(idct_8x16_internal_10bpc).main
|
||||
mova m16, [cq+128* 1]
|
||||
mova m17, [cq+128* 3]
|
||||
mova m18, [cq+128* 5]
|
||||
mova m19, [cq+128* 7]
|
||||
mova m20, [cq+128* 9]
|
||||
mova m21, [cq+128*11]
|
||||
mova m22, [cq+128*13]
|
||||
mova m23, [cq+128*15]
|
||||
call m(idct_16x16_internal_10bpc).main
|
||||
call m(idct_16x16_internal_10bpc).main_end
|
||||
jmp m(idct_16x16_internal_10bpc).main_end3
|
||||
; eob == 0 path of the 16x64 10 bpc transform (taken before PROLOGUE runs).
; Computes r6d = ([cq] * 181 + 640) >> 10 and hands over to the shared
; 16x8 10 bpc .dconly2 code.
.dconly:
|
||||
imul r6d, [cq], 181
|
||||
; eobd is zero on this path, so this store clears the coefficient just read
mov [cq], eobd
|
||||
; eobd is r3d (4th cglobal argument) and is zero here, so r3d becomes 64
; NOTE(review): presumably the row count expected by .dconly2 - confirm there
or r3d, 64
|
||||
add r6d, 640
|
||||
sar r6d, 10
|
||||
jmp m(inv_txfm_add_dct_dct_16x8_10bpc).dconly2
|
||||
|
||||
%endif ; ARCH_X86_64
|
||||
|
|
|
@ -5143,7 +5143,7 @@ cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 7, 0, dst, stride, c, eob
|
|||
sar r6d, 8+2
|
||||
jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3
|
||||
ALIGN function_align
|
||||
.main_oddhalf_fast: ; bottom three-quarters are zero
|
||||
cglobal_label .main_oddhalf_fast ; bottom three-quarters are zero
|
||||
vpbroadcastd m8, [o(pw_101_4095x8)]
|
||||
vpbroadcastd m21, [o(pw_m1474_3822x8)]
|
||||
vpbroadcastd m14, [o(pw_897_3996x8)]
|
||||
|
@ -5170,7 +5170,7 @@ ALIGN function_align
|
|||
mova m20, m15
|
||||
jmp .main_oddhalf2
|
||||
ALIGN function_align
|
||||
.main_oddhalf:
|
||||
cglobal_label .main_oddhalf
|
||||
vpbroadcastd m8, [o(pw_101_4095x8)]
|
||||
vpbroadcastd m9, [o(pw_m2824_2967x8)]
|
||||
vpbroadcastd m11, [o(pw_1660_3745x8)]
|
||||
|
|
|
@ -57,6 +57,7 @@ save_pack0: db 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0
|
|||
save_pack1: db 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2
|
||||
db 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3
|
||||
save_ref_shuf: db 0, -1, -1, -1, 1, -1, -1, -1, 8, -1, -1, -1, 9, -1, -1, -1
|
||||
cond_shuf512: db 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 3, 3
|
||||
save_cond0: db 0x80, 0x81, 0x82, 0x83, 0x89, 0x84, 0x00, 0x00
|
||||
save_cond1: db 0x84, 0x85, 0x86, 0x87, 0x88, 0x80, 0x00, 0x00
|
||||
pb_128: times 16 db 128
|
||||
|
@ -74,6 +75,12 @@ save_tmvs_avx2_table: SAVE_TMVS_TABLE 2, 16, avx2
|
|||
SAVE_TMVS_TABLE 5, 2, avx2
|
||||
SAVE_TMVS_TABLE 7, 1, avx2
|
||||
|
||||
save_tmvs_avx512icl_table: SAVE_TMVS_TABLE 2, 16, avx512icl
|
||||
SAVE_TMVS_TABLE 4, 8, avx512icl
|
||||
SAVE_TMVS_TABLE 4, 4, avx512icl
|
||||
SAVE_TMVS_TABLE 5, 2, avx512icl
|
||||
SAVE_TMVS_TABLE 7, 1, avx512icl
|
||||
|
||||
JMP_TABLE splat_mv_avx512icl, 1, 2, 4, 8, 16, 32
|
||||
JMP_TABLE splat_mv_avx2, 1, 2, 4, 8, 16, 32
|
||||
%endif
|
||||
|
@ -170,8 +177,6 @@ cglobal save_tmvs, 6, 7, 8, rp, stride, rr, ref_sign, \
|
|||
%define rpq r3
|
||||
%define r10 r1
|
||||
%define r10d r1
|
||||
%define r10w r1w
|
||||
%define r10b r1b
|
||||
%define r11 r4
|
||||
%define r11d r4
|
||||
%endif
|
||||
|
@ -486,6 +491,125 @@ cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
|
|||
jg .loop
|
||||
RET
|
||||
|
||||
INIT_ZMM avx512icl
|
||||
; refmvs_temporal_block *rp, ptrdiff_t stride,
|
||||
; refmvs_block **rr, uint8_t *ref_sign,
|
||||
; int col_end8, int row_end8, int col_start8, int row_start8
|
||||
; avx512icl version of save_tmvs; the C-level parameter list is given in the
; comment directly above.
; Addressing scheme: x, xstart and stride are pre-multiplied by 5 and every
; .writeN helper advances x by 5*N, i.e. output entries are 5 bytes apart.
; rp is advanced by xend*5 and x runs from (xstart-xend)*5 up towards zero, so
; the flags left by a helper's final "add xq, 5*N" (ret does not change flags)
; tell the caller whether the row is finished (x >= 0).
; Up to four candidates are gathered per iteration, one per 128-bit lane of m0,
; processed together in .calc, then written lane by lane through the helper
; addresses collected in r10-r13.
cglobal save_tmvs, 4, 15, 10, rp, stride, rr, ref_sign, \
|
||||
xend, yend, xstart, ystart
|
||||
; r14 = address of .write1: base for data addressing and for the helper
; offsets read from save_tmvs_avx512icl_table
%define base r14-.write1
|
||||
lea r14, [.write1]
|
||||
movifnidn xendd, xendm
|
||||
movifnidn yendd, yendm
|
||||
mov xstartd, xstartm
|
||||
mov ystartd, ystartm
|
||||
psllq m4, [ref_signq]{bcstq}, 8
|
||||
vpbroadcastq m3, [base+save_ref_shuf+8]
|
||||
vbroadcasti32x4 m5, [base+cond_shuf512]
|
||||
vbroadcasti32x4 m6, [base+save_cond0]
|
||||
vpbroadcastd m7, [base+pb_128]
|
||||
mova m8, [base+save_pack0]
|
||||
movu xm9, [base+save_pack0+4]
|
||||
; r9 = xend*5, xstart = xstart*5
lea r9d, [xendq*5]
|
||||
lea xstartd, [xstartq*5]
|
||||
; yend becomes the row count (named h after DEFINE_ARGS); ystart is doubled
sub yendd, ystartd
|
||||
add ystartd, ystartd
|
||||
lea strideq, [strideq*5]
|
||||
; xstart = (xstart - xend)*5 (negative offset), xend = xend*6, rp += xend*5
sub xstartq, r9
|
||||
add xendd, r9d
|
||||
add rpq, r9
|
||||
; k2 = 0x1f: 5-element write mask, so the masked stores in .write1/2/4/8 cover
; 5 bytes / words / dwords / qwords = 5, 10, 20, 40 bytes
mov r10d, 0x1f
|
||||
kmovb k2, r10d
|
||||
DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
|
||||
.loop_y:
|
||||
; pick the row pointer rr[ystart & 30], then step ystart by 2 for the next row
and ystartd, 30
|
||||
mov xq, xstartq
|
||||
mov bq, [rrq+ystartq*8]
|
||||
add ystartd, 2
|
||||
; xend holds original xend*6 here, so b is advanced by original xend*24 bytes
lea bq, [bq+xendq*4]
|
||||
.loop_x:
|
||||
imul candq, xq, 0x9999
|
||||
sar candq, 16 ; x / 5 * 3
|
||||
movzx r10d, byte [bq+candq*8+22] ; cand_b->bs
|
||||
movu xm0, [bq+candq*8+12] ; cand_b
|
||||
; table entry for this bs: byte 0 is added to cand, byte 1 is the offset of
; the write helper relative to .write1 (r14)
movzx r11d, byte [base+save_tmvs_avx512icl_table+r10*2+0]
|
||||
movzx r10d, byte [base+save_tmvs_avx512icl_table+r10*2+1]
|
||||
add r10, r14
|
||||
add candq, r11
|
||||
; stop gathering once cand is no longer negative
jge .calc
|
||||
; second candidate -> lane 1, helper address in r11
movzx r11d, byte [bq+candq*8+22]
|
||||
vinserti32x4 ym0, [bq+candq*8+12], 1
|
||||
movzx r12d, byte [base+save_tmvs_avx512icl_table+r11*2+0]
|
||||
movzx r11d, byte [base+save_tmvs_avx512icl_table+r11*2+1]
|
||||
add r11, r14
|
||||
add candq, r12
|
||||
jge .calc
|
||||
; third candidate -> lane 2, helper address in r12
movzx r12d, byte [bq+candq*8+22]
|
||||
vinserti32x4 m0, [bq+candq*8+12], 2
|
||||
movzx r13d, byte [base+save_tmvs_avx512icl_table+r12*2+0]
|
||||
movzx r12d, byte [base+save_tmvs_avx512icl_table+r12*2+1]
|
||||
add r12, r14
|
||||
add candq, r13
|
||||
jge .calc
|
||||
; fourth candidate -> lane 3, helper address in r13 (cand is not advanced)
vinserti32x4 m0, [bq+candq*8+12], 3
|
||||
movzx r13d, byte [bq+candq*8+22]
|
||||
movzx r13d, byte [base+save_tmvs_avx512icl_table+r13*2+1]
|
||||
add r13, r14
|
||||
.calc:
|
||||
pshufb m1, m0, m3
|
||||
pabsw m2, m0
|
||||
pshufb m1, m4, m1 ; ref > 0 && res_sign[ref - 1]
|
||||
psrlw m2, 12 ; (abs(mv.x) | abs(mv.y)) < 4096
|
||||
psubd m2, m1
|
||||
pshufb m2, m5 ; c0 c1 c1 c0
|
||||
pand m2, m6
|
||||
punpckhqdq m1, m2, m2
|
||||
vpternlogd m1, m2, m7, 0x56 ; (c0shuf | c1shuf) ^ 0x80
|
||||
pshufb m2, m0, m1
|
||||
; write lane 0..3 in turn; each jge/jl tests the flags from the helper's
; "add xq, 5*N"
mova xm0, xm2
|
||||
call r10
|
||||
jge .next_line
|
||||
vextracti32x4 xm0, m2, 1
|
||||
call r11
|
||||
jge .next_line
|
||||
vextracti32x4 xm0, m2, 2
|
||||
call r12
|
||||
jge .next_line
|
||||
vextracti32x4 xm0, m2, 3
|
||||
call r13
|
||||
jl .loop_x
|
||||
.next_line:
|
||||
add rpq, strideq
|
||||
dec hd
|
||||
jg .loop_y
|
||||
RET
|
||||
; Write helpers: store N 5-byte entries at rp+x and advance x by 5*N.
.write1:
|
||||
vmovdqu8 [rpq+xq]{k2}, xm0
|
||||
add xq, 5*1
|
||||
ret
|
||||
.write2:
|
||||
pshufb xm0, xm8
|
||||
vmovdqu16 [rpq+xq]{k2}, xm0
|
||||
add xq, 5*2
|
||||
ret
|
||||
.write4:
|
||||
vpermb ym0, ym8, ym0
|
||||
vmovdqu32 [rpq+xq]{k2}, ym0
|
||||
add xq, 5*4
|
||||
ret
|
||||
.write8:
|
||||
vpermb m0, m8, m0
|
||||
vmovdqu64 [rpq+xq]{k2}, m0
|
||||
add xq, 5*8
|
||||
ret
|
||||
.write16:
|
||||
; 80 bytes, unmasked: one full m1 store at +0 plus one xmm store at +64
vpermb m1, m8, m0
|
||||
movu [rpq+xq+ 0], m1
|
||||
pshufb xm0, xm9
|
||||
movu [rpq+xq+64], xm0
|
||||
add xq, 5*16
|
||||
ret
|
||||
|
||||
INIT_ZMM avx512icl
|
||||
cglobal splat_mv, 4, 7, 3, rr, a, bx4, bw4, bh4
|
||||
vbroadcasti32x4 m0, [aq]
|
||||
|
|
|
@ -30,6 +30,7 @@
|
|||
|
||||
decl_save_tmvs_fn(dav1d_save_tmvs_ssse3);
|
||||
decl_save_tmvs_fn(dav1d_save_tmvs_avx2);
|
||||
decl_save_tmvs_fn(dav1d_save_tmvs_avx512icl);
|
||||
|
||||
decl_splat_mv_fn(dav1d_splat_mv_sse2);
|
||||
decl_splat_mv_fn(dav1d_splat_mv_avx2);
|
||||
|
@ -54,6 +55,7 @@ static ALWAYS_INLINE void refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *const c) {
|
|||
|
||||
if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
|
||||
|
||||
c->save_tmvs = dav1d_save_tmvs_avx512icl;
|
||||
c->splat_mv = dav1d_splat_mv_avx512icl;
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -133,6 +133,7 @@ static void check_intra_pred(Dav1dIntraPredDSPContext *const c) {
|
|||
a & 0x1ff, a & 0x600, maxw, maxh);
|
||||
else if (mode == FILTER_PRED)
|
||||
fprintf(stderr, "filter_idx = %d\n", a & 0x1ff);
|
||||
break;
|
||||
}
|
||||
|
||||
bench_new(a_dst, stride, topleft, w, h, a, 128, 128
|
||||
|
|
|
@ -29,6 +29,16 @@
|
|||
|
||||
#include <stdio.h>
|
||||
|
||||
/*
 * Generate one random signed value for use as a test motion-vector component.
 *
 * The low spel_bits bits are taken straight from rnd(). Further bits are then
 * OR-ed in one position at a time, starting at bit spel_bits; after each bit a
 * fresh rnd() bit decides whether to continue, and the loop stops once the
 * next position would reach total_bits. A final rnd() bit chooses the sign.
 * The magnitude therefore stays below 1 << total_bits.
 *
 * rnd() is defined elsewhere; the order of the rnd() calls (including the
 * short-circuit in the loop condition) determines the generated sequence.
 */
static inline int gen_mv(const int total_bits, int spel_bits) {
|
||||
int bits = rnd() & ((1 << spel_bits) - 1);
|
||||
do {
|
||||
bits |= (rnd() & 1) << spel_bits;
|
||||
} while (rnd() & 1 && ++spel_bits < total_bits);
|
||||
// the do/while makes it relatively more likely to be close to zero (fpel)
|
||||
// than far away
|
||||
return rnd() & 1 ? -bits : bits;
|
||||
}
|
||||
|
||||
static void check_save_tmvs(const Dav1dRefmvsDSPContext *const c) {
|
||||
refmvs_block *rr[31];
|
||||
refmvs_block r[31 * 256];
|
||||
|
@ -58,10 +68,10 @@ static void check_save_tmvs(const Dav1dRefmvsDSPContext *const c) {
|
|||
while (j + ((dav1d_block_dimensions[bs][0] + 1) >> 1) > col_end8)
|
||||
bs++;
|
||||
rr[i * 2][j * 2 + 1] = (refmvs_block) {
|
||||
.mv.mv[0].x = -(rnd() & 1) * (rnd() & 8191),
|
||||
.mv.mv[0].y = -(rnd() & 1) * (rnd() & 8191),
|
||||
.mv.mv[1].x = -(rnd() & 1) * (rnd() & 8191),
|
||||
.mv.mv[1].y = -(rnd() & 1) * (rnd() & 8191),
|
||||
.mv.mv[0].x = gen_mv(14, 10),
|
||||
.mv.mv[0].y = gen_mv(14, 10),
|
||||
.mv.mv[1].x = gen_mv(14, 10),
|
||||
.mv.mv[1].y = gen_mv(14, 10),
|
||||
.ref.ref = { (rnd() % 9) - 1, (rnd() % 9) - 1 },
|
||||
.bs = bs
|
||||
};
|
||||
|
|
Загрузка…
Ссылка в новой задаче