зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1588123 - Update libdav1d to head; r=TD-Linux
This updates libdav1d to commit 5595102721d3c298d7cee64e64878486a3b8bdad. Differential Revision: https://phabricator.services.mozilla.com/D50205 --HG-- rename : third_party/dav1d/snap/snapcraft.yaml => third_party/dav1d/package/snap/snapcraft.yaml extra : moz-landing-system : lando
This commit is contained in:
Родитель
0e8b885146
Коммит
8f20970320
|
@ -153,6 +153,7 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64':
|
|||
relative_path = '../../../third_party/dav1d/src/arm/'
|
||||
bitdepth_basenames = [
|
||||
'cdef_init_tmpl.c',
|
||||
'ipred_init_tmpl.c',
|
||||
'itx_init_tmpl.c',
|
||||
'loopfilter_init_tmpl.c',
|
||||
'looprestoration_init_tmpl.c',
|
||||
|
@ -191,6 +192,7 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64':
|
|||
if CONFIG['CPU_ARCH'] == 'aarch64':
|
||||
SOURCES += [
|
||||
'../../../third_party/dav1d/src/arm/64/cdef.S',
|
||||
'../../../third_party/dav1d/src/arm/64/ipred.S',
|
||||
'../../../third_party/dav1d/src/arm/64/itx.S',
|
||||
'../../../third_party/dav1d/src/arm/64/loopfilter.S',
|
||||
'../../../third_party/dav1d/src/arm/64/looprestoration.S',
|
||||
|
@ -199,6 +201,7 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64':
|
|||
]
|
||||
elif CONFIG['CPU_ARCH'] == 'arm':
|
||||
SOURCES += [
|
||||
'../../../third_party/dav1d/src/arm/32/cdef.S',
|
||||
'../../../third_party/dav1d/src/arm/32/looprestoration.S',
|
||||
'../../../third_party/dav1d/src/arm/32/mc.S',
|
||||
]
|
||||
|
|
|
@ -20,7 +20,7 @@ origin:
|
|||
|
||||
# Human-readable identifier for this version/release
|
||||
# Generally "version NNN", "tag SSS", "bookmark SSS"
|
||||
release: commit c0865f35c74bdcc71021630f64dca2db35d2bc8c (2019-09-19T12:07:23.000+02:00).
|
||||
release: commit 5595102721d3c298d7cee64e64878486a3b8bdad (2019-10-22T19:50:25.000+02:00).
|
||||
|
||||
# The package's license, where possible using the mnemonic from
|
||||
# https://spdx.org/licenses/
|
||||
|
|
|
@ -1,2 +1,2 @@
|
|||
/* auto-generated, do not edit */
|
||||
#define DAV1D_VERSION "0.4.0-49-gc0865f3"
|
||||
#define DAV1D_VERSION "0.5.0-6-g5595102"
|
||||
|
|
|
@ -178,7 +178,7 @@ build-debian-aarch64:
|
|||
- aarch64
|
||||
- debian
|
||||
script:
|
||||
- meson build --buildtype release --werror
|
||||
- meson build --buildtype debugoptimized --werror
|
||||
- ninja -C build
|
||||
- cd build && meson test -v
|
||||
|
||||
|
@ -219,7 +219,7 @@ build-debian-armv7:
|
|||
- armv7
|
||||
- debian
|
||||
script:
|
||||
- meson build --buildtype release --werror
|
||||
- meson build --buildtype debugoptimized --werror
|
||||
- ninja -C build
|
||||
- cd build && meson test -v
|
||||
|
||||
|
@ -241,7 +241,7 @@ build-ubuntu-snap:
|
|||
- debian
|
||||
- amd64
|
||||
script:
|
||||
- snapcraft snap
|
||||
- cd package/snap && snapcraft snap
|
||||
- |
|
||||
if [ "$CI_PROJECT_NAMESPACE" = "videolan" ]; then
|
||||
echo $SNAP_LOGIN | base64 --decode | snapcraft login --with -
|
||||
|
@ -251,7 +251,7 @@ build-ubuntu-snap:
|
|||
artifacts:
|
||||
name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG"
|
||||
paths:
|
||||
- dav1d_*.snap
|
||||
- package/snap/dav1d_*.snap
|
||||
expire_in: 1 week
|
||||
allow_failure: true
|
||||
|
||||
|
|
|
@ -1,3 +1,25 @@
|
|||
Changes for 0.5.0 'Asiatic Cheetah':
|
||||
----------------------------
|
||||
|
||||
0.5.0 is a medium release fixing regressions and minor issues,
|
||||
and improving speed significantly:
|
||||
- Export ITU T.35 metadata
|
||||
- Speed improvements on blend_ on ARM
|
||||
- Speed improvements on decode_coef and MSAC
|
||||
- NEON optimizations for blend*, w_mask_, ipred functions for ARM64
|
||||
- NEON optimizations for CDEF and warp on ARM32
|
||||
- SSE2 optimizations for MSAC hi_tok decoding
|
||||
- SSSE3 optimizations for deblocking loopfilters and warp_affine
|
||||
- AVX-2 optimizations for film grain and ipred_z2
|
||||
- SSE4 optimizations for warp_affine
|
||||
- VSX optimizations for wiener
|
||||
- Fix inverse transform overflows in x86 and NEON asm
|
||||
- Fix integer overflows with large frames
|
||||
- Improve film grain generation to match reference code
|
||||
- Improve compatibility with older binutils for ARM
|
||||
- More advanced Player example in tools
|
||||
|
||||
|
||||
Changes for 0.4.0 'Cheetah':
|
||||
----------------------------
|
||||
|
||||
|
@ -11,6 +33,7 @@ Changes for 0.4.0 'Cheetah':
|
|||
- NEON optimizations for blend functions on ARM
|
||||
- NEON optimizations for w_mask functions on ARM
|
||||
- NEON optimizations for inverse transforms on ARM64
|
||||
- VSX optimizations for CDEF filter
|
||||
- Improve handling of malloc failures
|
||||
- Simple Player example in tools
|
||||
|
||||
|
@ -38,7 +61,7 @@ Changes for 0.2.2 (0.3.0-rc) 'Antelope':
|
|||
- Large improvement on MSAC decoding with SSE, bringing 4-6% speed increase
|
||||
The impact is important on SSSE3, SSE4 and AVX-2 cpus
|
||||
- SSSE3 optimizations for all blocks size in itx
|
||||
- SSSE3 optimizations for ipred_paeth and ipref_cfl (420, 422 and 444)
|
||||
- SSSE3 optimizations for ipred_paeth and ipred_cfl (420, 422 and 444)
|
||||
- Speed improvements on CDEF for SSE4 CPUs
|
||||
- NEON optimizations for SGR and loop filter
|
||||
- Minor crashes, improvements and build changes
|
||||
|
|
|
@ -73,28 +73,15 @@ The [VideoLAN Code of Conduct](https://wiki.videolan.org/CoC) applies to this pr
|
|||
# Compile
|
||||
|
||||
1. Install [Meson](https://mesonbuild.com/) (0.47 or higher), [Ninja](https://ninja-build.org/), and, for x86\* targets, [nasm](https://nasm.us/) (2.13.02 or higher)
|
||||
2. Run `meson build --buildtype release`
|
||||
3. Build with `ninja -C build`
|
||||
2. Run `mkdir build && cd build` to create a build directory and enter it
|
||||
3. Run `meson ..` to configure meson, add `--default-library=static` if static linking is desired
|
||||
4. Run `ninja` to compile
|
||||
|
||||
# Run tests
|
||||
|
||||
1. During initial build dir setup or `meson configure` specify `-Denable_tests=true`
|
||||
2. In the build directory run `meson test` optionally with `-v` for more verbose output, especially useful
|
||||
for checkasm
|
||||
|
||||
# Run testdata based tests
|
||||
|
||||
1. Checkout the test data repository
|
||||
|
||||
```
|
||||
git clone https://code.videolan.org/videolan/dav1d-test-data.git tests/dav1d-test-data
|
||||
```
|
||||
2. During initial build dir setup or `meson configure` specify `-Denable_tests=true` and `-Dtestdata_tests=true`
|
||||
|
||||
```
|
||||
meson .test -Denable_tests=true -Dtestdata_tests=true
|
||||
```
|
||||
3. In the build directory run `meson test` optionally with `-v` for more verbose output
|
||||
1. In the root directory, run `git clone https://code.videolan.org/videolan/dav1d-test-data.git tests/dav1d-test-data` to fetch the test data repository
|
||||
2. During meson configuration, specify `-Dtestdata_tests=true`
|
||||
3. Run `meson test -v` after compiling
|
||||
|
||||
# Support
|
||||
|
||||
|
|
Двоичный файл не отображается.
До Ширина: | Высота: | Размер: 0 B После Ширина: | Высота: | Размер: 19 KiB |
|
@ -28,6 +28,7 @@
|
|||
#include "vcs_version.h"
|
||||
|
||||
#include <getopt.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
@ -48,6 +49,9 @@
|
|||
*/
|
||||
typedef struct {
|
||||
// Input file (the usage text documents this as "--input/-i $file");
// presumably filled in by the option parser -- the 'i' case is not visible here
const char *inputfile;
|
||||
// Set by --highquality: placebo_render() then uses pl_render_default_params
// instead of zero-initialized render params
int highquality;
|
||||
// Set by --untimed/-u: dp_rd_ctx_render() forces the wait time to 0 and
// so never delays before rendering
int untimed;
|
||||
// Set by --zerocopy/-z: main() installs the renderer's alloc_pic/release_pic
// callbacks as the decoder's picture allocator (when the renderer provides
// them) and placebo_upload_planes() uploads from the picture's pl_buf
int zerocopy;
|
||||
} Dav1dPlaySettings;
|
||||
|
||||
#define WINDOW_WIDTH 910
|
||||
|
@ -156,9 +160,13 @@ typedef struct rdr_info
|
|||
// Callback to destroy the renderer
|
||||
void (*destroy_renderer)(void *cookie);
|
||||
// Callback to the render function that renders a previously sent frame
|
||||
void (*render)(void *cookie);
|
||||
void (*render)(void *cookie, const Dav1dPlaySettings *settings);
|
||||
// Callback to the send frame function
|
||||
int (*update_frame)(void *cookie, Dav1dPicture *dav1d_pic);
|
||||
int (*update_frame)(void *cookie, Dav1dPicture *dav1d_pic,
|
||||
const Dav1dPlaySettings *settings);
|
||||
// Callback for alloc/release pictures (optional)
|
||||
int (*alloc_pic)(Dav1dPicture *pic, void *cookie);
|
||||
void (*release_pic)(Dav1dPicture *pic, void *cookie);
|
||||
} Dav1dPlayRenderInfo;
|
||||
|
||||
#ifdef HAVE_PLACEBO_VULKAN
|
||||
|
@ -325,7 +333,7 @@ static void placebo_renderer_destroy(void *cookie)
|
|||
pl_context_destroy(&(rd_priv_ctx->ctx));
|
||||
}
|
||||
|
||||
static void placebo_render(void *cookie)
|
||||
static void placebo_render(void *cookie, const Dav1dPlaySettings *settings)
|
||||
{
|
||||
Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
|
||||
assert(rd_priv_ctx != NULL);
|
||||
|
@ -358,8 +366,9 @@ static void placebo_render(void *cookie)
|
|||
.height = img->params.h,
|
||||
};
|
||||
|
||||
struct pl_render_params render_params = pl_render_default_params;
|
||||
//render_params.upscaler = &pl_filter_ewa_lanczos;
|
||||
struct pl_render_params render_params = {0};
|
||||
if (settings->highquality)
|
||||
render_params = pl_render_default_params;
|
||||
|
||||
struct pl_render_target target;
|
||||
pl_render_target_from_swapchain(&target, &frame);
|
||||
|
@ -385,7 +394,8 @@ static void placebo_render(void *cookie)
|
|||
SDL_UnlockMutex(rd_priv_ctx->lock);
|
||||
}
|
||||
|
||||
static int placebo_upload_planes(void *cookie, Dav1dPicture *dav1d_pic)
|
||||
static int placebo_upload_planes(void *cookie, Dav1dPicture *dav1d_pic,
|
||||
const Dav1dPlaySettings *settings)
|
||||
{
|
||||
Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
|
||||
assert(rd_priv_ctx != NULL);
|
||||
|
@ -413,7 +423,6 @@ static int placebo_upload_planes(void *cookie, Dav1dPicture *dav1d_pic)
|
|||
.height = height,
|
||||
.pixel_stride = 1,
|
||||
.row_stride = dav1d_pic->stride[0],
|
||||
.pixels = dav1d_pic->data[0],
|
||||
.component_size = {8},
|
||||
.component_map = {0},
|
||||
};
|
||||
|
@ -424,7 +433,6 @@ static int placebo_upload_planes(void *cookie, Dav1dPicture *dav1d_pic)
|
|||
.height = height/2,
|
||||
.pixel_stride = 1,
|
||||
.row_stride = dav1d_pic->stride[1],
|
||||
.pixels = dav1d_pic->data[1],
|
||||
.component_size = {8},
|
||||
.component_map = {1},
|
||||
};
|
||||
|
@ -435,11 +443,23 @@ static int placebo_upload_planes(void *cookie, Dav1dPicture *dav1d_pic)
|
|||
.height = height/2,
|
||||
.pixel_stride = 1,
|
||||
.row_stride = dav1d_pic->stride[1],
|
||||
.pixels = dav1d_pic->data[2],
|
||||
.component_size = {8},
|
||||
.component_map = {2},
|
||||
};
|
||||
|
||||
if (settings->zerocopy) {
|
||||
const struct pl_buf *buf = dav1d_pic->allocator_data;
|
||||
assert(buf);
|
||||
data_y.buf = data_u.buf = data_v.buf = buf;
|
||||
data_y.buf_offset = (uintptr_t) dav1d_pic->data[0] - (uintptr_t) buf->data;
|
||||
data_u.buf_offset = (uintptr_t) dav1d_pic->data[1] - (uintptr_t) buf->data;
|
||||
data_v.buf_offset = (uintptr_t) dav1d_pic->data[2] - (uintptr_t) buf->data;
|
||||
} else {
|
||||
data_y.pixels = dav1d_pic->data[0];
|
||||
data_u.pixels = dav1d_pic->data[1];
|
||||
data_v.pixels = dav1d_pic->data[2];
|
||||
}
|
||||
|
||||
bool ok = true;
|
||||
ok &= pl_upload_plane(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->y_plane), &(rd_priv_ctx->y_tex), &data_y);
|
||||
ok &= pl_upload_plane(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->u_plane), &(rd_priv_ctx->u_tex), &data_u);
|
||||
|
@ -456,11 +476,106 @@ static int placebo_upload_planes(void *cookie, Dav1dPicture *dav1d_pic)
|
|||
return !ok;
|
||||
}
|
||||
|
||||
// Round x up to the next multiple of align (align must be a power of 2)
|
||||
#define ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1))
|
||||
|
||||
// Picture allocation callback (registered as renderer_info.alloc_pic).
// Backs the three planes of `p` with one host-mapped pl_buf so that
// placebo_upload_planes() can upload straight from that buffer in zerocopy
// mode. On success the buffer is stored in p->allocator_data, the plane
// pointers in p->data[0..2] and the strides in p->stride[0..1].
// `cookie` is the renderer private context; its lock is held for the whole call.
// Returns 0 on success, or DAV1D_ERR(ENOMEM) if the required size exceeds
// gpu->limits.max_xfer_size or pl_buf_create() fails.
static int placebo_alloc_pic(Dav1dPicture *const p, void *cookie)
|
||||
{
|
||||
Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
|
||||
assert(rd_priv_ctx != NULL);
|
||||
SDL_LockMutex(rd_priv_ctx->lock);
|
||||
|
||||
const struct pl_gpu *gpu = rd_priv_ctx->vk->gpu;
|
||||
// Pessimistic default: every early exit through `err` reports ENOMEM
int ret = DAV1D_ERR(ENOMEM);
|
||||
|
||||
// Copied from dav1d_default_picture_alloc
|
||||
const int hbd = p->p.bpc > 8;
|
||||
const int aligned_w = ALIGN2(p->p.w, 128);
|
||||
const int aligned_h = ALIGN2(p->p.h, 128);
|
||||
const int has_chroma = p->p.layout != DAV1D_PIXEL_LAYOUT_I400;
|
||||
const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444;
|
||||
p->stride[0] = aligned_w << hbd;
|
||||
p->stride[1] = has_chroma ? (aligned_w >> ss_hor) << hbd : 0;
|
||||
|
||||
// Align strides up to multiples of the GPU performance hints
// NOTE(review): ALIGN2(x, 0) evaluates to 0, so a zero
// align_tex_xfer_stride would zero both strides -- presumably the limit is
// always a nonzero power of 2; confirm against libplacebo
|
||||
p->stride[0] = ALIGN2(p->stride[0], gpu->limits.align_tex_xfer_stride);
|
||||
p->stride[1] = ALIGN2(p->stride[1], gpu->limits.align_tex_xfer_stride);
|
||||
|
||||
// Aligning offsets to 4 also implicitly aligns to the texel size (1 or 2)
|
||||
size_t off_align = ALIGN2(gpu->limits.align_tex_xfer_offset, 4);
|
||||
const size_t y_sz = ALIGN2(p->stride[0] * aligned_h, off_align);
|
||||
const size_t uv_sz = ALIGN2(p->stride[1] * (aligned_h >> ss_ver), off_align);
|
||||
|
||||
// The extra DAV1D_PICTURE_ALIGNMENTs are to brute force plane alignment,
|
||||
// even in the case that the driver gives us insane alignments
|
||||
const size_t pic_size = y_sz + 2 * uv_sz;
|
||||
const size_t total_size = pic_size + DAV1D_PICTURE_ALIGNMENT * 4;
|
||||
|
||||
// Validate size limitations
|
||||
if (total_size > gpu->limits.max_xfer_size) {
|
||||
printf("alloc of %zu bytes exceeds limits\n", total_size);
|
||||
goto err;
|
||||
}
|
||||
|
||||
const struct pl_buf *buf = pl_buf_create(gpu, &(struct pl_buf_params) {
|
||||
.type = PL_BUF_TEX_TRANSFER,
|
||||
.host_mapped = true,
|
||||
.size = total_size,
|
||||
.memory_type = PL_BUF_MEM_HOST,
|
||||
.user_data = p,
|
||||
});
|
||||
|
||||
if (!buf) {
|
||||
printf("alloc of GPU mapped buffer failed\n");
|
||||
goto err;
|
||||
}
|
||||
|
||||
assert(buf->data);
|
||||
// Carve the Y, U and V planes out of the mapping, each start rounded up
// to DAV1D_PICTURE_ALIGNMENT
uintptr_t base = (uintptr_t) buf->data, data[3];
|
||||
data[0] = ALIGN2(base, DAV1D_PICTURE_ALIGNMENT);
|
||||
data[1] = ALIGN2(data[0] + y_sz, DAV1D_PICTURE_ALIGNMENT);
|
||||
data[2] = ALIGN2(data[1] + uv_sz, DAV1D_PICTURE_ALIGNMENT);
|
||||
|
||||
// Sanity check offset alignment for the sake of debugging
|
||||
if (data[0] - base != ALIGN2(data[0] - base, off_align) ||
|
||||
data[1] - base != ALIGN2(data[1] - base, off_align) ||
|
||||
data[2] - base != ALIGN2(data[2] - base, off_align))
|
||||
{
|
||||
printf("GPU buffer horribly misaligned, expect slowdown!\n");
|
||||
}
|
||||
|
||||
p->allocator_data = (void *) buf;
|
||||
p->data[0] = (void *) data[0];
|
||||
p->data[1] = (void *) data[1];
|
||||
p->data[2] = (void *) data[2];
|
||||
ret = 0;
|
||||
|
||||
// fall through
|
||||
err:
|
||||
SDL_UnlockMutex(rd_priv_ctx->lock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Picture release callback (registered as renderer_info.release_pic).
// Destroys the pl_buf that placebo_alloc_pic() stored in pic->allocator_data,
// holding the renderer lock while doing so. `cookie` is the renderer private
// context; both it and pic->allocator_data must be non-NULL.
static void placebo_release_pic(Dav1dPicture *pic, void *cookie)
|
||||
{
|
||||
Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
|
||||
assert(rd_priv_ctx != NULL);
|
||||
assert(pic->allocator_data);
|
||||
|
||||
SDL_LockMutex(rd_priv_ctx->lock);
|
||||
const struct pl_gpu *gpu = rd_priv_ctx->vk->gpu;
|
||||
// Passes the address of the stored pointer; presumably pl_buf_destroy()
// also clears it -- confirm against the libplacebo API
pl_buf_destroy(gpu, (const struct pl_buf **) &pic->allocator_data);
|
||||
SDL_UnlockMutex(rd_priv_ctx->lock);
|
||||
}
|
||||
|
||||
static const Dav1dPlayRenderInfo renderer_info = {
|
||||
.create_renderer = placebo_renderer_create,
|
||||
.destroy_renderer = placebo_renderer_destroy,
|
||||
.render = placebo_render,
|
||||
.update_frame = placebo_upload_planes
|
||||
.update_frame = placebo_upload_planes,
|
||||
.alloc_pic = placebo_alloc_pic,
|
||||
.release_pic = placebo_release_pic,
|
||||
};
|
||||
|
||||
#else
|
||||
|
@ -516,7 +631,7 @@ static void sdl_renderer_destroy(void *cookie)
|
|||
free(rd_priv_ctx);
|
||||
}
|
||||
|
||||
static void sdl_render(void *cookie)
|
||||
static void sdl_render(void *cookie, const Dav1dPlaySettings *settings)
|
||||
{
|
||||
Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
|
||||
assert(rd_priv_ctx != NULL);
|
||||
|
@ -536,7 +651,8 @@ static void sdl_render(void *cookie)
|
|||
SDL_UnlockMutex(rd_priv_ctx->lock);
|
||||
}
|
||||
|
||||
static int sdl_update_texture(void *cookie, Dav1dPicture *dav1d_pic)
|
||||
static int sdl_update_texture(void *cookie, Dav1dPicture *dav1d_pic,
|
||||
const Dav1dPlaySettings *settings)
|
||||
{
|
||||
Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
|
||||
assert(rd_priv_ctx != NULL);
|
||||
|
@ -647,8 +763,11 @@ static void dp_settings_print_usage(const char *const app,
|
|||
fprintf(stderr, "Usage: %s [options]\n\n", app);
|
||||
fprintf(stderr, "Supported options:\n"
|
||||
" --input/-i $file: input file\n"
|
||||
" --untimed/-u: ignore PTS, render as fast as possible\n"
|
||||
" --framethreads $num: number of frame threads (default: 1)\n"
|
||||
" --tilethreads $num: number of tile threads (default: 1)\n"
|
||||
" --highquality: enable high quality rendering\n"
|
||||
" --zerocopy/-z: enable zero copy upload path\n"
|
||||
" --version/-v: print version and exit\n");
|
||||
exit(1);
|
||||
}
|
||||
|
@ -672,19 +791,23 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
|
|||
Dav1dSettings *lib_settings = &rd_ctx->lib_settings;
|
||||
|
||||
// Short options
|
||||
static const char short_opts[] = "i:v";
|
||||
static const char short_opts[] = "i:vuz";
|
||||
|
||||
enum {
|
||||
ARG_FRAME_THREADS = 256,
|
||||
ARG_TILE_THREADS,
|
||||
ARG_HIGH_QUALITY,
|
||||
};
|
||||
|
||||
// Long options
|
||||
static const struct option long_opts[] = {
|
||||
{ "input", 1, NULL, 'i' },
|
||||
{ "version", 0, NULL, 'v' },
|
||||
{ "untimed", 0, NULL, 'u' },
|
||||
{ "framethreads", 1, NULL, ARG_FRAME_THREADS },
|
||||
{ "tilethreads", 1, NULL, ARG_TILE_THREADS },
|
||||
{ "highquality", 0, NULL, ARG_HIGH_QUALITY },
|
||||
{ "zerocopy", 0, NULL, 'z' },
|
||||
{ NULL, 0, NULL, 0 },
|
||||
};
|
||||
|
||||
|
@ -696,6 +819,21 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
|
|||
case 'v':
|
||||
fprintf(stderr, "%s\n", dav1d_version());
|
||||
exit(0);
|
||||
case 'u':
|
||||
settings->untimed = true;
|
||||
break;
|
||||
case ARG_HIGH_QUALITY:
|
||||
settings->highquality = true;
|
||||
#ifndef HAVE_PLACEBO_VULKAN
|
||||
fprintf(stderr, "warning: --highquality requires libplacebo\n");
|
||||
#endif
|
||||
break;
|
||||
case 'z':
|
||||
settings->zerocopy = true;
|
||||
#ifndef HAVE_PLACEBO_VULKAN
|
||||
fprintf(stderr, "warning: --zerocopy requires libplacebo\n");
|
||||
#endif
|
||||
break;
|
||||
case ARG_FRAME_THREADS:
|
||||
lib_settings->n_frame_threads =
|
||||
parse_unsigned(optarg, ARG_FRAME_THREADS, argv[0]);
|
||||
|
@ -811,7 +949,7 @@ static void dp_rd_ctx_post_event(Dav1dPlayRenderContext *rd_ctx, uint32_t code)
|
|||
static void dp_rd_ctx_update_with_dav1d_picture(Dav1dPlayRenderContext *rd_ctx,
|
||||
Dav1dPicture *dav1d_pic)
|
||||
{
|
||||
renderer_info.update_frame(rd_ctx->rd_priv, dav1d_pic);
|
||||
renderer_info.update_frame(rd_ctx->rd_priv, dav1d_pic, &rd_ctx->settings);
|
||||
rd_ctx->current_pts = dav1d_pic->m.timestamp;
|
||||
}
|
||||
|
||||
|
@ -853,16 +991,20 @@ static void dp_rd_ctx_render(Dav1dPlayRenderContext *rd_ctx)
|
|||
int32_t wait_time = (pts_diff * rd_ctx->timebase) * 1000 - ticks_diff;
|
||||
rd_ctx->last_pts = rd_ctx->current_pts;
|
||||
|
||||
// In untimed mode, simply don't wait
|
||||
if (rd_ctx->settings.untimed)
|
||||
wait_time = 0;
|
||||
|
||||
// This way of timing the playback is not accurate, as there is no guarantee
|
||||
// that SDL_Delay will wait for exactly the requested amount of time so in an
|
||||
// accurate player this would need to be done in a better way.
|
||||
if (wait_time >= 0) {
|
||||
if (wait_time > 0) {
|
||||
SDL_Delay(wait_time);
|
||||
} else if (wait_time < -10) { // Do not warn for minor time drifts
|
||||
fprintf(stderr, "Frame displayed %f seconds too late\n", wait_time/(float)1000);
|
||||
}
|
||||
|
||||
renderer_info.render(rd_ctx->rd_priv);
|
||||
renderer_info.render(rd_ctx->rd_priv, &rd_ctx->settings);
|
||||
|
||||
rd_ctx->last_ticks = SDL_GetTicks();
|
||||
}
|
||||
|
@ -1046,6 +1188,18 @@ int main(int argc, char **argv)
|
|||
// Parse and validate arguments
|
||||
dp_rd_ctx_parse_args(rd_ctx, argc, argv);
|
||||
|
||||
if (rd_ctx->settings.zerocopy) {
|
||||
if (renderer_info.alloc_pic) {
|
||||
rd_ctx->lib_settings.allocator = (Dav1dPicAllocator) {
|
||||
.cookie = rd_ctx->rd_priv,
|
||||
.alloc_picture_callback = renderer_info.alloc_pic,
|
||||
.release_picture_callback = renderer_info.release_pic,
|
||||
};
|
||||
} else {
|
||||
fprintf(stderr, "--zerocopy unsupported by compiled renderer\n");
|
||||
}
|
||||
}
|
||||
|
||||
// Start decoder thread
|
||||
decoder_thread = SDL_CreateThread(decoder_thread_main, "Decoder thread", rd_ctx);
|
||||
|
||||
|
|
|
@ -23,7 +23,7 @@
|
|||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
project('dav1d', ['c'],
|
||||
version: '0.4.0',
|
||||
version: '0.5.0',
|
||||
default_options: ['c_std=c99',
|
||||
'warning_level=2',
|
||||
'buildtype=release',
|
||||
|
|
|
@ -17,7 +17,7 @@ apps:
|
|||
parts:
|
||||
dav1d:
|
||||
plugin: meson
|
||||
source: .
|
||||
source: ../../
|
||||
build-packages: [ 'nasm' ]
|
||||
meson-parameters:
|
||||
- --prefix=/usr
|
|
@ -0,0 +1,660 @@
|
|||
/*
|
||||
* Copyright © 2018, VideoLAN and dav1d authors
|
||||
* Copyright © 2019, Martin Storsjo
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "src/arm/asm.S"
|
||||
#include "util.S"
|
||||
|
||||
// n1 = s0/d0
|
||||
// w1 = d0/q0
|
||||
// n2 = s4/d2
|
||||
// w2 = d2/q1
|
||||
.macro pad_top_bottom s1, s2, w, stride, n1, w1, n2, w2, align, ret
|
||||
tst r6, #1 // CDEF_HAVE_LEFT
|
||||
beq 2f
|
||||
// CDEF_HAVE_LEFT
|
||||
tst r6, #2 // CDEF_HAVE_RIGHT
|
||||
beq 1f
|
||||
// CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
|
||||
ldrh r12, [\s1, #-2]
|
||||
vldr \n1, [\s1]
|
||||
vdup.16 d4, r12
|
||||
ldrh r12, [\s1, #\w]
|
||||
vmov.16 d4[1], r12
|
||||
ldrh r12, [\s2, #-2]
|
||||
vldr \n2, [\s2]
|
||||
vmov.16 d4[2], r12
|
||||
ldrh r12, [\s2, #\w]
|
||||
vmovl.u8 q0, d0
|
||||
vmov.16 d4[3], r12
|
||||
vmovl.u8 q1, d2
|
||||
vmovl.u8 q2, d4
|
||||
vstr s8, [r0, #-4]
|
||||
vst1.16 {\w1}, [r0, :\align]
|
||||
vstr s9, [r0, #2*\w]
|
||||
add r0, r0, #2*\stride
|
||||
vstr s10, [r0, #-4]
|
||||
vst1.16 {\w2}, [r0, :\align]
|
||||
vstr s11, [r0, #2*\w]
|
||||
.if \ret
|
||||
pop {r4-r7,pc}
|
||||
.else
|
||||
add r0, r0, #2*\stride
|
||||
b 3f
|
||||
.endif
|
||||
|
||||
1:
|
||||
// CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
|
||||
ldrh r12, [\s1, #-2]
|
||||
vldr \n1, [\s1]
|
||||
vdup.16 d4, r12
|
||||
ldrh r12, [\s2, #-2]
|
||||
vldr \n2, [\s2]
|
||||
vmovl.u8 q0, d0
|
||||
vmov.16 d4[1], r12
|
||||
vmovl.u8 q1, d2
|
||||
vmovl.u8 q2, d4
|
||||
vstr s8, [r0, #-4]
|
||||
vst1.16 {\w1}, [r0, :\align]
|
||||
vstr s12, [r0, #2*\w]
|
||||
add r0, r0, #2*\stride
|
||||
vstr s9, [r0, #-4]
|
||||
vst1.16 {\w2}, [r0, :\align]
|
||||
vstr s12, [r0, #2*\w]
|
||||
.if \ret
|
||||
pop {r4-r7,pc}
|
||||
.else
|
||||
add r0, r0, #2*\stride
|
||||
b 3f
|
||||
.endif
|
||||
|
||||
2:
|
||||
// !CDEF_HAVE_LEFT
|
||||
tst r6, #2 // CDEF_HAVE_RIGHT
|
||||
beq 1f
|
||||
// !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
|
||||
vldr \n1, [\s1]
|
||||
ldrh r12, [\s1, #\w]
|
||||
vldr \n2, [\s2]
|
||||
vdup.16 d4, r12
|
||||
ldrh r12, [\s2, #\w]
|
||||
vmovl.u8 q0, d0
|
||||
vmov.16 d4[1], r12
|
||||
vmovl.u8 q1, d2
|
||||
vmovl.u8 q2, d4
|
||||
vstr s12, [r0, #-4]
|
||||
vst1.16 {\w1}, [r0, :\align]
|
||||
vstr s8, [r0, #2*\w]
|
||||
add r0, r0, #2*\stride
|
||||
vstr s12, [r0, #-4]
|
||||
vst1.16 {\w2}, [r0, :\align]
|
||||
vstr s9, [r0, #2*\w]
|
||||
.if \ret
|
||||
pop {r4-r7,pc}
|
||||
.else
|
||||
add r0, r0, #2*\stride
|
||||
b 3f
|
||||
.endif
|
||||
|
||||
1:
|
||||
// !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
|
||||
vldr \n1, [\s1]
|
||||
vldr \n2, [\s2]
|
||||
vmovl.u8 q0, d0
|
||||
vmovl.u8 q1, d2
|
||||
vstr s12, [r0, #-4]
|
||||
vst1.16 {\w1}, [r0, :\align]
|
||||
vstr s12, [r0, #2*\w]
|
||||
add r0, r0, #2*\stride
|
||||
vstr s12, [r0, #-4]
|
||||
vst1.16 {\w2}, [r0, :\align]
|
||||
vstr s12, [r0, #2*\w]
|
||||
.if \ret
|
||||
pop {r4-r7,pc}
|
||||
.else
|
||||
add r0, r0, #2*\stride
|
||||
.endif
|
||||
3:
|
||||
.endm
|
||||
|
||||
// Load one row of \w 8-bit pixels from [\src] into \dst and post-increment
// \src by \incr. For \w == 4 only lane 0 (32 bits) of \dst is written and
// \src is hinted as 32-bit aligned; otherwise the whole 64-bit register is
// loaded with a 64-bit alignment hint.
.macro load_n_incr dst, src, incr, w
|
||||
.if \w == 4
|
||||
vld1.32 {\dst\()[0]}, [\src, :32], \incr
|
||||
.else
|
||||
vld1.8 {\dst\()}, [\src, :64], \incr
|
||||
.endif
|
||||
.endm
|
||||
|
||||
// void dav1d_cdef_paddingX_neon(uint16_t *tmp, const pixel *src,
|
||||
// ptrdiff_t src_stride, const pixel (*left)[2],
|
||||
// /*const*/ pixel *const top[2], int h,
|
||||
// enum CdefEdgeFlags edges);
|
||||
|
||||
// n1 = s0/d0
|
||||
// w1 = d0/q0
|
||||
// n2 = s4/d2
|
||||
// w2 = d2/q1
|
||||
.macro padding_func w, stride, n1, w1, n2, w2, align
|
||||
function cdef_padding\w\()_neon, export=1
|
||||
push {r4-r7,lr}
|
||||
ldrd r4, r5, [sp, #20]
|
||||
ldr r6, [sp, #28]
|
||||
vmov.i16 q3, #0x8000
|
||||
tst r6, #4 // CDEF_HAVE_TOP
|
||||
bne 1f
|
||||
// !CDEF_HAVE_TOP
|
||||
sub r12, r0, #2*(2*\stride+2)
|
||||
vmov.i16 q2, #0x8000
|
||||
vst1.16 {q2,q3}, [r12]!
|
||||
.if \w == 8
|
||||
vst1.16 {q2,q3}, [r12]!
|
||||
.endif
|
||||
b 3f
|
||||
1:
|
||||
// CDEF_HAVE_TOP
|
||||
ldr r7, [r4]
|
||||
ldr lr, [r4, #4]
|
||||
sub r0, r0, #2*(2*\stride)
|
||||
pad_top_bottom r7, lr, \w, \stride, \n1, \w1, \n2, \w2, \align, 0
|
||||
|
||||
// Middle section
|
||||
3:
|
||||
tst r6, #1 // CDEF_HAVE_LEFT
|
||||
beq 2f
|
||||
// CDEF_HAVE_LEFT
|
||||
tst r6, #2 // CDEF_HAVE_RIGHT
|
||||
beq 1f
|
||||
// CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
|
||||
0:
|
||||
ldrh r12, [r3], #2
|
||||
vldr \n1, [r1]
|
||||
vdup.16 d2, r12
|
||||
ldrh r12, [r1, #\w]
|
||||
add r1, r1, r2
|
||||
subs r5, r5, #1
|
||||
vmov.16 d2[1], r12
|
||||
vmovl.u8 q0, d0
|
||||
vmovl.u8 q1, d2
|
||||
vstr s4, [r0, #-4]
|
||||
vst1.16 {\w1}, [r0, :\align]
|
||||
vstr s5, [r0, #2*\w]
|
||||
add r0, r0, #2*\stride
|
||||
bgt 0b
|
||||
b 3f
|
||||
1:
|
||||
// CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
|
||||
ldrh r12, [r3], #2
|
||||
load_n_incr d0, r1, r2, \w
|
||||
vdup.16 d2, r12
|
||||
subs r5, r5, #1
|
||||
vmovl.u8 q0, d0
|
||||
vmovl.u8 q1, d2
|
||||
vstr s4, [r0, #-4]
|
||||
vst1.16 {\w1}, [r0, :\align]
|
||||
vstr s12, [r0, #2*\w]
|
||||
add r0, r0, #2*\stride
|
||||
bgt 1b
|
||||
b 3f
|
||||
2:
|
||||
tst r6, #2 // CDEF_HAVE_RIGHT
|
||||
beq 1f
|
||||
// !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
|
||||
0:
|
||||
ldrh r12, [r1, #\w]
|
||||
load_n_incr d0, r1, r2, \w
|
||||
vdup.16 d2, r12
|
||||
subs r5, r5, #1
|
||||
vmovl.u8 q0, d0
|
||||
vmovl.u8 q1, d2
|
||||
vstr s12, [r0, #-4]
|
||||
vst1.16 {\w1}, [r0, :\align]
|
||||
vstr s4, [r0, #2*\w]
|
||||
add r0, r0, #2*\stride
|
||||
bgt 0b
|
||||
b 3f
|
||||
1:
|
||||
// !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
|
||||
load_n_incr d0, r1, r2, \w
|
||||
subs r5, r5, #1
|
||||
vmovl.u8 q0, d0
|
||||
vstr s12, [r0, #-4]
|
||||
vst1.16 {\w1}, [r0, :\align]
|
||||
vstr s12, [r0, #2*\w]
|
||||
add r0, r0, #2*\stride
|
||||
bgt 1b
|
||||
|
||||
3:
|
||||
tst r6, #8 // CDEF_HAVE_BOTTOM
|
||||
bne 1f
|
||||
// !CDEF_HAVE_BOTTOM
|
||||
sub r12, r0, #4
|
||||
vmov.i16 q2, #0x8000
|
||||
vst1.16 {q2,q3}, [r12]!
|
||||
.if \w == 8
|
||||
vst1.16 {q2,q3}, [r12]!
|
||||
.endif
|
||||
pop {r4-r7,pc}
|
||||
1:
|
||||
// CDEF_HAVE_BOTTOM
|
||||
add r7, r1, r2
|
||||
pad_top_bottom r1, r7, \w, \stride, \n1, \w1, \n2, \w2, \align, 1
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
padding_func 8, 16, d0, q0, d2, q1, 128
|
||||
padding_func 4, 8, s0, d0, s4, d2, 64
|
||||
|
||||
.macro dir_table w, stride
|
||||
const directions\w
|
||||
.byte -1 * \stride + 1, -2 * \stride + 2
|
||||
.byte 0 * \stride + 1, -1 * \stride + 2
|
||||
.byte 0 * \stride + 1, 0 * \stride + 2
|
||||
.byte 0 * \stride + 1, 1 * \stride + 2
|
||||
.byte 1 * \stride + 1, 2 * \stride + 2
|
||||
.byte 1 * \stride + 0, 2 * \stride + 1
|
||||
.byte 1 * \stride + 0, 2 * \stride + 0
|
||||
.byte 1 * \stride + 0, 2 * \stride - 1
|
||||
// Repeated, to avoid & 7
|
||||
.byte -1 * \stride + 1, -2 * \stride + 2
|
||||
.byte 0 * \stride + 1, -1 * \stride + 2
|
||||
.byte 0 * \stride + 1, 0 * \stride + 2
|
||||
.byte 0 * \stride + 1, 1 * \stride + 2
|
||||
.byte 1 * \stride + 1, 2 * \stride + 2
|
||||
.byte 1 * \stride + 0, 2 * \stride + 1
|
||||
endconst
|
||||
.endm
|
||||
|
||||
dir_table 8, 16
|
||||
dir_table 4, 8
|
||||
|
||||
const pri_taps
|
||||
.byte 4, 2, 3, 3
|
||||
endconst
|
||||
|
||||
// Load the two taps on either side of the current position:
//   p0 from r2 + 2*r9 into \d11/\d12, p1 from r2 - 2*r9 into \d21/\d22
// (r9 is an offset in 16-bit elements, hence the lsl #1).
// For \w == 8 each tap is one 8-element row loaded in a single vld1.
// For \w == 4 each tap is two 4-element rows, the second one 2*8 bytes
// further on.
// Overwrites r6 and r9: the caller must reload r9 before the next use.
.macro load_px d11, d12, d21, d22, w
|
||||
.if \w == 8
|
||||
add r6, r2, r9, lsl #1 // x + off
|
||||
sub r9, r2, r9, lsl #1 // x - off
|
||||
vld1.16 {\d11,\d12}, [r6] // p0
|
||||
vld1.16 {\d21,\d22}, [r9] // p1
|
||||
.else
|
||||
add r6, r2, r9, lsl #1 // x + off
|
||||
sub r9, r2, r9, lsl #1 // x - off
|
||||
vld1.16 {\d11}, [r6] // p0
|
||||
add r6, r6, #2*8 // += stride
|
||||
vld1.16 {\d21}, [r9] // p1
|
||||
add r9, r9, #2*8 // += stride
|
||||
vld1.16 {\d12}, [r6] // p0
|
||||
vld1.16 {\d22}, [r9] // p1
|
||||
.endif
|
||||
.endm
|
||||
.macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap
|
||||
cmp \threshold, #0
|
||||
vmin.u16 q2, q2, \s1
|
||||
vmax.s16 q3, q3, \s1
|
||||
vmin.u16 q2, q2, \s2
|
||||
vmax.s16 q3, q3, \s2
|
||||
|
||||
beq 3f
|
||||
vabd.u16 q8, q0, \s1 // abs(diff)
|
||||
vabd.u16 q11, q0, \s2 // abs(diff)
|
||||
vshl.u16 q9, q8, \shift // abs(diff) >> shift
|
||||
vshl.u16 q12, q11, \shift // abs(diff) >> shift
|
||||
vqsub.u16 q9, \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift))
|
||||
vqsub.u16 q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift))
|
||||
vsub.i16 q10, \s1, q0 // diff = p0 - px
|
||||
vsub.u16 q13, \s2, q0 // diff = p1 - px
|
||||
vneg.s16 q8, q9 // -clip
|
||||
vneg.s16 q11, q12 // -clip
|
||||
vmin.s16 q10, q10, q9 // imin(diff, clip)
|
||||
vmin.s16 q13, q13, q12 // imin(diff, clip)
|
||||
vdup.16 q9, \tap // taps[k]
|
||||
vmax.s16 q10, q10, q8 // constrain() = imax(imin(diff, clip), -clip)
|
||||
vmax.s16 q13, q13, q11 // constrain() = imax(imin(diff, clip), -clip)
|
||||
vmla.i16 q1, q10, q9 // sum += taps[k] * constrain()
|
||||
vmla.i16 q1, q13, q9 // sum += taps[k] * constrain()
|
||||
3:
|
||||
.endm
|
||||
|
||||
// void dav1d_cdef_filterX_neon(pixel *dst, ptrdiff_t dst_stride,
|
||||
// const uint16_t *tmp, int pri_strength,
|
||||
// int sec_strength, int dir, int damping, int h);
|
||||
// Emits cdef_filter\w\()_neon for block width \w (instantiated for 8 and 4).
// Register use, as set up below:
//   r0 = dst, r1 = dst_stride, r2 = tmp, r3 = pri_strength,
//   r4 = sec_strength, r5 = dir, r6 = damping, r7 = h (r4-r7 are loaded
//   from the stack), r8 = pointer into pri_taps, r5 becomes a pointer into
//   directions\w, lr = current secondary tap value.
//   q5/q4 = primary threshold / -shift, q7/q6 = secondary threshold / -shift.
// Each outer iteration (label 1) produces one 8-pixel row for \w == 8, or
// two 4-pixel rows otherwise.
.macro filter w
|
||||
function cdef_filter\w\()_neon, export=1
|
||||
push {r4-r9,lr}
|
||||
vpush {q4-q7}
|
||||
// 92 = 7 core registers (28 bytes) + q4-q7 (64 bytes) pushed above.
ldrd r4, r5, [sp, #92]
|
||||
ldrd r6, r7, [sp, #100]
|
||||
movrel_local r8, pri_taps
|
||||
and r9, r3, #1
|
||||
add r8, r8, r9, lsl #1 // pri_taps + 2*(pri_strength & 1)
|
||||
movrel_local r9, directions\w
|
||||
add r5, r9, r5, lsl #1 // directions\w + 2*dir
|
||||
vmov.u16 d17, #15
|
||||
vdup.16 d16, r6 // damping
|
||||
|
||||
|
||||
vdup.16 q5, r3 // threshold
|
||||
vdup.16 q7, r4 // threshold
|
||||
// Compute both shifts at once: lane 0 = primary, lane 1 = secondary.
vmov.16 d8[0], r3
|
||||
vmov.16 d8[1], r4
|
||||
vclz.i16 d8, d8 // clz(threshold)
|
||||
vsub.i16 d8, d17, d8 // ulog2(threshold)
|
||||
vqsub.u16 d8, d16, d8 // shift = imax(0, damping - ulog2(threshold))
|
||||
vneg.s16 d8, d8 // -shift
|
||||
// q6 must be written before q4, since q4 overlaps the source d8.
vdup.16 q6, d8[1]
|
||||
vdup.16 q4, d8[0]
|
||||
|
||||
|
||||
1:
|
||||
.if \w == 8
|
||||
vld1.16 {q0}, [r2, :128] // px
|
||||
.else
|
||||
add r12, r2, #2*8
|
||||
vld1.16 {d0}, [r2, :64] // px
|
||||
vld1.16 {d1}, [r12, :64] // px
|
||||
.endif
|
||||
|
||||
|
||||
vmov.u16 q1, #0 // sum
|
||||
vmov.u16 q2, q0 // min
|
||||
vmov.u16 q3, q0 // max
|
||||
|
||||
|
||||
// Instead of loading sec_taps 2, 1 from memory, just set it
|
||||
// to 2 initially and decrease for the second round.
|
||||
mov lr, #2 // sec_taps[0]
|
||||
|
||||
|
||||
2:
|
||||
ldrsb r9, [r5] // off1
|
||||
|
||||
|
||||
load_px d28, d29, d30, d31, \w
|
||||
|
||||
|
||||
add r5, r5, #4 // +2*2
|
||||
ldrsb r9, [r5] // off2
|
||||
|
||||
|
||||
ldrb r12, [r8] // *pri_taps
|
||||
|
||||
|
||||
handle_pixel q14, q15, r3, q5, q4, r12
|
||||
|
||||
|
||||
load_px d28, d29, d30, d31, \w
|
||||
|
||||
|
||||
add r5, r5, #8 // +2*4
|
||||
ldrsb r9, [r5] // off3
|
||||
|
||||
|
||||
handle_pixel q14, q15, r4, q7, q6, lr
|
||||
|
||||
|
||||
load_px d28, d29, d30, d31, \w
|
||||
|
||||
|
||||
handle_pixel q14, q15, r4, q7, q6, lr
|
||||
|
||||
|
||||
sub r5, r5, #11 // r5 -= 2*(2+4); r5 += 1;
|
||||
subs lr, lr, #1 // sec_tap-- (value)
|
||||
add r8, r8, #1 // pri_taps++ (pointer)
|
||||
bne 2b
|
||||
|
||||
|
||||
vshr.s16 q14, q1, #15 // -(sum < 0)
|
||||
vadd.i16 q1, q1, q14 // sum - (sum < 0)
|
||||
vrshr.s16 q1, q1, #4 // (8 + sum - (sum < 0)) >> 4
|
||||
vadd.i16 q0, q0, q1 // px + (8 + sum ...) >> 4
|
||||
vmin.s16 q0, q0, q3
|
||||
vmax.s16 q0, q0, q2 // iclip(px + .., min, max)
|
||||
vmovn.u16 d0, q0
|
||||
// The subs below sets the flags consumed by the final bgt; nothing in
// between modifies them.
.if \w == 8
|
||||
add r2, r2, #2*16 // tmp += tmp_stride
|
||||
subs r7, r7, #1 // h--
|
||||
vst1.8 {d0}, [r0, :64], r1
|
||||
.else
|
||||
vst1.32 {d0[0]}, [r0, :32], r1
|
||||
add r2, r2, #2*16 // tmp += 2*tmp_stride
|
||||
subs r7, r7, #2 // h -= 2
|
||||
vst1.32 {d0[1]}, [r0, :32], r1
|
||||
.endif
|
||||
|
||||
|
||||
// Reset pri_taps/sec_taps back to the original point
|
||||
sub r5, r5, #2
|
||||
sub r8, r8, #2
|
||||
|
||||
|
||||
bgt 1b
|
||||
vpop {q4-q7}
|
||||
pop {r4-r9,pc}
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
|
||||
filter 8
|
||||
filter 4
|
||||
|
||||
// 840 / n for n = 1..8, as 16-bit values. cdef_find_dir_neon loads the whole
// table with one 128-bit load and uses it to weight the squared sum_diag
// terms.
const div_table, align=4
|
||||
.short 840, 420, 280, 210, 168, 140, 120, 105
|
||||
endconst
|
||||
|
||||
// Twelve 16-bit weights loaded by cdef_find_dir_neon as three d registers
// and applied to the squared sum_alt terms. The final 0 gives the twelfth
// lane a zero weight.
const alt_fact, align=4
|
||||
.short 420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
|
||||
endconst
|
||||
|
||||
// int dav1d_cdef_find_dir_neon(const pixel *img, const ptrdiff_t stride,
|
||||
// unsigned *const var)
|
||||
// Finds the dominant direction of an 8x8 block of 8-bit pixels.
//   r0 = img, r1 = stride, r2 = var.
// Returns best_dir in r0 and stores
// (best_cost - cost[best_dir ^ 4]) >> 10 to *var.
// The eight 32-bit costs are kept in a 32-byte scratch array on the stack.
// The .irpc loop is unrolled once per row and accumulates the partial sums
// listed in the register comments below.
function cdef_find_dir_neon, export=1
|
||||
push {lr}
|
||||
vpush {q4-q7}
|
||||
sub sp, sp, #32 // cost
|
||||
// NOTE(review): r3 is not read before it is reassigned to 1 further down;
// this looks like a leftover.
mov r3, #8
|
||||
vmov.u16 q1, #0 // q0-q1 sum_diag[0]
|
||||
vmov.u16 q3, #0 // q2-q3 sum_diag[1]
|
||||
vmov.u16 q5, #0 // q4-q5 sum_hv[0-1]
|
||||
vmov.u16 q8, #0 // q6,d16 sum_alt[0]
|
||||
// q7,d17 sum_alt[1]
|
||||
vmov.u16 q9, #0 // q9,d22 sum_alt[2]
|
||||
vmov.u16 q11, #0
|
||||
vmov.u16 q10, #0 // q10,d23 sum_alt[3]
|
||||
|
||||
|
||||
|
||||
.irpc i, 01234567
|
||||
vld1.8 {d30}, [r0, :64], r1
|
||||
vmov.u8 d31, #128
|
||||
vsubl.u8 q15, d30, d31 // img[x] - 128
|
||||
vmov.u16 q14, #0
|
||||
|
||||
|
||||
.if \i == 0
|
||||
vmov q0, q15 // sum_diag[0]
|
||||
.else
|
||||
vext.8 q12, q14, q15, #(16-2*\i)
|
||||
vext.8 q13, q15, q14, #(16-2*\i)
|
||||
vadd.i16 q0, q0, q12 // sum_diag[0]
|
||||
vadd.i16 q1, q1, q13 // sum_diag[0]
|
||||
.endif
|
||||
vrev64.16 q13, q15
|
||||
vswp d26, d27 // [-x]
|
||||
.if \i == 0
|
||||
vmov q2, q13 // sum_diag[1]
|
||||
.else
|
||||
vext.8 q12, q14, q13, #(16-2*\i)
|
||||
vext.8 q13, q13, q14, #(16-2*\i)
|
||||
vadd.i16 q2, q2, q12 // sum_diag[1]
|
||||
vadd.i16 q3, q3, q13 // sum_diag[1]
|
||||
.endif
|
||||
|
||||
|
||||
vpadd.u16 d26, d30, d31 // [(x >> 1)]
|
||||
vmov.u16 d27, #0
|
||||
// d28 is the low half of q14, which is zero here, so the two pairwise adds
// below reduce the row to a single sum in d24[0].
vpadd.u16 d24, d26, d28
|
||||
vpadd.u16 d24, d24, d28 // [y]
|
||||
vmov.u16 r12, d24[0]
|
||||
vadd.i16 q5, q5, q15 // sum_hv[1]
|
||||
.if \i < 4
|
||||
vmov.16 d8[\i], r12 // sum_hv[0]
|
||||
.else
|
||||
vmov.16 d9[\i-4], r12 // sum_hv[0]
|
||||
.endif
|
||||
|
||||
|
||||
.if \i == 0
|
||||
vmov.u16 q6, q13 // sum_alt[0]
|
||||
.else
|
||||
vext.8 q12, q14, q13, #(16-2*\i)
|
||||
vext.8 q14, q13, q14, #(16-2*\i)
|
||||
vadd.i16 q6, q6, q12 // sum_alt[0]
|
||||
vadd.i16 d16, d16, d28 // sum_alt[0]
|
||||
.endif
|
||||
vrev64.16 d26, d26 // [-(x >> 1)]
|
||||
// q14 was overwritten by the vext above when \i != 0; re-zero it.
vmov.u16 q14, #0
|
||||
.if \i == 0
|
||||
vmov q7, q13 // sum_alt[1]
|
||||
.else
|
||||
vext.8 q12, q14, q13, #(16-2*\i)
|
||||
vext.8 q13, q13, q14, #(16-2*\i)
|
||||
vadd.i16 q7, q7, q12 // sum_alt[1]
|
||||
vadd.i16 d17, d17, d26 // sum_alt[1]
|
||||
.endif
|
||||
|
||||
|
||||
.if \i < 6
|
||||
vext.8 q12, q14, q15, #(16-2*(3-(\i/2)))
|
||||
vext.8 q13, q15, q14, #(16-2*(3-(\i/2)))
|
||||
vadd.i16 q9, q9, q12 // sum_alt[2]
|
||||
vadd.i16 d22, d22, d26 // sum_alt[2]
|
||||
.else
|
||||
vadd.i16 q9, q9, q15 // sum_alt[2]
|
||||
.endif
|
||||
.if \i == 0
|
||||
vmov q10, q15 // sum_alt[3]
|
||||
.elseif \i == 1
|
||||
vadd.i16 q10, q10, q15 // sum_alt[3]
|
||||
.else
|
||||
vext.8 q12, q14, q15, #(16-2*(\i/2))
|
||||
vext.8 q13, q15, q14, #(16-2*(\i/2))
|
||||
vadd.i16 q10, q10, q12 // sum_alt[3]
|
||||
vadd.i16 d23, d23, d26 // sum_alt[3]
|
||||
.endif
|
||||
.endr
|
||||
|
||||
|
||||
vmov.u32 q15, #105
|
||||
|
||||
|
||||
vmull.s16 q12, d8, d8 // sum_hv[0]*sum_hv[0]
|
||||
vmlal.s16 q12, d9, d9
|
||||
vmull.s16 q13, d10, d10 // sum_hv[1]*sum_hv[1]
|
||||
vmlal.s16 q13, d11, d11
|
||||
vadd.s32 d8, d24, d25
|
||||
vadd.s32 d9, d26, d27
|
||||
vpadd.s32 d8, d8, d9 // cost[2,6] (s16, s17)
|
||||
vmul.i32 d8, d8, d30 // cost[2,6] *= 105
|
||||
|
||||
|
||||
vrev64.16 q1, q1
|
||||
vrev64.16 q3, q3
|
||||
vext.8 q1, q1, q1, #10 // sum_diag[0][14-n]
|
||||
vext.8 q3, q3, q3, #10 // sum_diag[1][14-n]
|
||||
|
||||
|
||||
vstr s16, [sp, #2*4] // cost[2]
|
||||
vstr s17, [sp, #6*4] // cost[6]
|
||||
|
||||
|
||||
movrel_local r12, div_table
|
||||
vld1.16 {q14}, [r12, :128]
|
||||
|
||||
|
||||
vmull.s16 q5, d0, d0 // sum_diag[0]*sum_diag[0]
|
||||
vmull.s16 q12, d1, d1
|
||||
vmlal.s16 q5, d2, d2
|
||||
vmlal.s16 q12, d3, d3
|
||||
vmull.s16 q0, d4, d4 // sum_diag[1]*sum_diag[1]
|
||||
vmull.s16 q1, d5, d5
|
||||
vmlal.s16 q0, d6, d6
|
||||
vmlal.s16 q1, d7, d7
|
||||
vmovl.u16 q13, d28 // div_table
|
||||
vmovl.u16 q14, d29
|
||||
vmul.i32 q5, q5, q13 // cost[0]
|
||||
vmla.i32 q5, q12, q14
|
||||
vmul.i32 q0, q0, q13 // cost[4]
|
||||
vmla.i32 q0, q1, q14
|
||||
vadd.i32 d10, d10, d11
|
||||
vadd.i32 d0, d0, d1
|
||||
vpadd.i32 d0, d10, d0 // cost[0,4] = s0,s1
|
||||
|
||||
|
||||
movrel_local r12, alt_fact
|
||||
vld1.16 {d29, d30, d31}, [r12, :64] // div_table[2*m+1] + 105
|
||||
|
||||
|
||||
vstr s0, [sp, #0*4] // cost[0]
|
||||
vstr s1, [sp, #4*4] // cost[4]
|
||||
|
||||
|
||||
vmovl.u16 q13, d29 // div_table[2*m+1] + 105
|
||||
vmovl.u16 q14, d30
|
||||
vmovl.u16 q15, d31
|
||||
|
||||
|
||||
// Computes two weighted sums of squares, one over \s1-\s3 and one over
// \s4-\s6, using the weights in q13-q15, and writes the two 32-bit totals
// to \dest. Clobbers q1-q3, q5, q6 and q12.
.macro cost_alt dest, s1, s2, s3, s4, s5, s6
|
||||
vmull.s16 q1, \s1, \s1 // sum_alt[n]*sum_alt[n]
|
||||
vmull.s16 q2, \s2, \s2
|
||||
vmull.s16 q3, \s3, \s3
|
||||
vmull.s16 q5, \s4, \s4 // sum_alt[n]*sum_alt[n]
|
||||
vmull.s16 q12, \s5, \s5
|
||||
vmull.s16 q6, \s6, \s6 // q6 overlaps the first \s1-\s2 here
|
||||
vmul.i32 q1, q1, q13 // sum_alt[n]^2*fact
|
||||
vmla.i32 q1, q2, q14
|
||||
vmla.i32 q1, q3, q15
|
||||
vmul.i32 q5, q5, q13 // sum_alt[n]^2*fact
|
||||
vmla.i32 q5, q12, q14
|
||||
vmla.i32 q5, q6, q15
|
||||
vadd.i32 d2, d2, d3
|
||||
vadd.i32 d3, d10, d11
|
||||
vpadd.i32 \dest, d2, d3 // *cost_ptr
|
||||
.endm
|
||||
cost_alt d14, d12, d13, d16, d14, d15, d17 // cost[1], cost[3]
|
||||
cost_alt d15, d18, d19, d22, d20, d21, d23 // cost[5], cost[7]
|
||||
vstr s28, [sp, #1*4] // cost[1]
|
||||
vstr s29, [sp, #3*4] // cost[3]
|
||||
|
||||
|
||||
mov r0, #0 // best_dir
|
||||
vmov.32 r1, d0[0] // best_cost
|
||||
mov r3, #1 // n
|
||||
|
||||
|
||||
vstr s30, [sp, #5*4] // cost[5]
|
||||
vstr s31, [sp, #7*4] // cost[7]
|
||||
|
||||
|
||||
vmov.32 r12, d14[0]
|
||||
|
||||
|
||||
// Compares the cost already in r12 (and, when \s2 is given, the cost in
// \s2) against best_cost in r1, updating r0/r1 on a strictly greater
// value. \s3, when given, is preloaded into r12 for the next invocation.
// \s1 is not referenced in the body; the caller has already placed it in
// r12.
// NOTE(review): the comparisons are signed (gt); this assumes the costs
// never exceed 0x7fffffff - confirm.
.macro find_best s1, s2, s3
|
||||
.ifnb \s2
|
||||
vmov.32 lr, \s2
|
||||
.endif
|
||||
cmp r12, r1 // cost[n] > best_cost
|
||||
itt gt
|
||||
movgt r0, r3 // best_dir = n
|
||||
movgt r1, r12 // best_cost = cost[n]
|
||||
.ifnb \s2
|
||||
add r3, r3, #1 // n++
|
||||
cmp lr, r1 // cost[n] > best_cost
|
||||
vmov.32 r12, \s3
|
||||
itt gt
|
||||
movgt r0, r3 // best_dir = n
|
||||
movgt r1, lr // best_cost = cost[n]
|
||||
add r3, r3, #1 // n++
|
||||
.endif
|
||||
.endm
|
||||
find_best d14[0], d8[0], d14[1]
|
||||
find_best d14[1], d0[1], d15[0]
|
||||
find_best d15[0], d8[1], d15[1]
|
||||
find_best d15[1]
|
||||
|
||||
|
||||
eor r3, r0, #4 // best_dir ^4
|
||||
ldr r12, [sp, r3, lsl #2]
|
||||
sub r1, r1, r12 // best_cost - cost[best_dir ^ 4]
|
||||
lsr r1, r1, #10
|
||||
str r1, [r2] // *var
|
||||
|
||||
|
||||
add sp, sp, #32
|
||||
vpop {q4-q7}
|
||||
pop {pc}
|
||||
endfunc
|
|
@ -2971,3 +2971,206 @@ endfunc
|
|||
|
||||
filter_fn put, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, 10
|
||||
filter_fn prep, r0, r7, r1, r2, r3, r4, r5, r6, r8, r9, 6
|
||||
|
||||
// r12 = r11 + ((\src >> 10) << 3): address of the 8-byte filter row selected
// by the integer part (arithmetic shift right by 10) of \src, relative to
// the table base held in r11. Does not modify \src.
.macro load_filter_ptr src
|
||||
asr r12, \src, #10
|
||||
add r12, r11, r12, lsl #3
|
||||
.endm
|
||||
|
||||
// Loads the 8 filter bytes at the address in r12 (as computed by
// load_filter_ptr) into \dst, then advances \src by \inc.
.macro load_filter_coef dst, src, inc
|
||||
vld1.8 {\dst}, [r12, :64]
|
||||
add \src, \src, \inc
|
||||
.endm
|
||||
|
||||
// Convenience wrapper: compute the filter row address from \src, load the
// 8 coefficients into \dst and advance \src by \inc. Clobbers r12.
.macro load_filter_row dst, src, inc
|
||||
load_filter_ptr \src
|
||||
load_filter_coef \dst, \src, \inc
|
||||
.endm
|
||||
|
||||
// Horizontal pass of the warp filter for one row of 8 output pixels.
// In:  r2 = source row pointer (advanced by r3 on return), r3 = row stride,
//      r5 = filter position for the first pixel, r7 = per-pixel position
//      step, r8 = per-row position step, r11 = filter table base.
// Out: q5 = eight 16-bit results (32-bit sums rounded and narrowed by 3
//      bits); r5 = r5 + r8, i.e. the eight per-pixel steps are undone and
//      the per-row step is applied.
// Clobbers q0-q7 and r12. Address computation, coefficient loads and
// arithmetic for different pixels are deliberately interleaved.
function warp_filter_horz_neon
|
||||
load_filter_ptr r5 // filter 0
|
||||
vld1.16 {q7}, [r2], r3
|
||||
|
||||
|
||||
load_filter_coef d0, r5, r7 // filter 0
|
||||
vmovl.u8 q6, d14 // original pixels
|
||||
load_filter_row d2, r5, r7 // filter 1
|
||||
vmovl.u8 q7, d15 // original pixels
|
||||
load_filter_row d4, r5, r7 // filter 2
|
||||
vmovl.s8 q0, d0 // filter 0
|
||||
vext.8 q3, q6, q7, #2*1 // filter 1 pixels
|
||||
load_filter_ptr r5 // filter 3
|
||||
vmovl.s8 q1, d2 // filter 1
|
||||
vmul.i16 q5, q6, q0 // filter 0 output
|
||||
load_filter_coef d0, r5, r7 // filter 3
|
||||
vmovl.s8 q2, d4 // filter 2
|
||||
load_filter_ptr r5 // filter 4
|
||||
vext.8 q4, q6, q7, #2*2 // filter 2 pixels
|
||||
vmul.i16 q3, q3, q1 // filter 1 output
|
||||
load_filter_coef d2, r5, r7 // filter 4
|
||||
vmul.i16 q4, q4, q2 // filter 2 output
|
||||
vext.8 q2, q6, q7, #2*3 // filter 3 pixels
|
||||
vmovl.s8 q0, d0 // filter 3
|
||||
vpaddl.s16 q5, q5 // pixel 0 (4x32)
|
||||
vpaddl.s16 q3, q3 // pixel 1 (4x32)
|
||||
vmul.i16 q0, q2, q0 // filter 3 output
|
||||
load_filter_ptr r5 // filter 5
|
||||
vext.8 q2, q6, q7, #2*4 // filter 4 pixels
|
||||
vmovl.s8 q1, d2 // filter 4
|
||||
vpaddl.s16 q4, q4 // pixel 2 (4x32)
|
||||
vpadd.s32 d10, d10, d11 // pixel 0 (2x32)
|
||||
vpadd.s32 d11, d6, d7 // pixel 1 (2x32)
|
||||
load_filter_coef d6, r5, r7 // filter 5
|
||||
vmul.i16 q1, q2, q1 // filter 4 output
|
||||
vpadd.s32 d8, d8, d9 // pixel 2 (2x32)
|
||||
load_filter_ptr r5 // filter 6
|
||||
vpaddl.s16 q0, q0 // pixel 3 (4x32)
|
||||
vpadd.s32 d10, d10, d11 // pixel 0,1
|
||||
vext.8 q2, q6, q7, #2*5 // filter 5 pixels
|
||||
vmovl.s8 q3, d6 // filter 5
|
||||
vpaddl.s16 q1, q1 // pixel 4 (4x32)
|
||||
vpadd.s32 d9, d0, d1 // pixel 3 (2x32)
|
||||
load_filter_coef d0, r5, r7 // filter 6
|
||||
vmul.i16 q2, q2, q3 // filter 5 output
|
||||
vpadd.s32 d11, d8, d9 // pixel 2,3
|
||||
load_filter_ptr r5 // filter 7
|
||||
vpaddl.s16 q2, q2 // pixel 5 (4x32)
|
||||
vpadd.s32 d8, d2, d3 // pixel 4 (2x32)
|
||||
vext.8 q3, q6, q7, #2*6 // filter 6 pixels
|
||||
vmovl.s8 q0, d0 // filter 6
|
||||
vpadd.s32 d9, d4, d5 // pixel 5 (2x32)
|
||||
load_filter_coef d4, r5, r7 // filter 7
|
||||
vpadd.s32 d8, d8, d9 // pixel 4,5
|
||||
vext.8 q1, q6, q7, #2*7 // filter 7 pixels
|
||||
vmovl.s8 q2, d4 // filter 7
|
||||
vmul.i16 q3, q3, q0 // filter 6 output
|
||||
vmul.i16 q1, q1, q2 // filter 7 output
|
||||
// Undo the eight per-pixel increments applied by the coefficient loads.
sub r5, r5, r7, lsl #3
|
||||
vpaddl.s16 q3, q3 // pixel 6 (4x32)
|
||||
vpaddl.s16 q1, q1 // pixel 7 (4x32)
|
||||
vpadd.s32 d6, d6, d7 // pixel 6 (2x32)
|
||||
vpadd.s32 d2, d2, d3 // pixel 7 (2x32)
|
||||
vpadd.s32 d9, d6, d2 // pixel 6,7
|
||||
|
||||
|
||||
// Step the filter position to the start of the next row.
add r5, r5, r8
|
||||
|
||||
|
||||
// q5 holds the sums for pixels 0-3 and q4 those for pixels 4-7; narrow both
// into q5.
vrshrn.s32 d10, q5, #3
|
||||
vrshrn.s32 d11, q4, #3
|
||||
|
||||
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
// void dav1d_warp_affine_8x8_8bpc_neon(
|
||||
// pixel *dst, const ptrdiff_t dst_stride,
|
||||
// const pixel *src, const ptrdiff_t src_stride,
|
||||
// const int16_t *const abcd, int mx, int my)
|
||||
// Emits warp_affine_8x8\t\()_8bpc_neon. \shift is the final rounding
// right-shift of the vertical pass. With \t empty the result is narrowed to
// unsigned 8 bit and stored 8 bytes per row; with \t non-empty the 16-bit
// values are stored directly, 16 bytes per row.
// Register use: r0 = dst, r1 = dst_stride, r2 = src, r3 = src_stride,
// r4 = abcd (later abcd[3]), r5 = mx, r6 = my, r7 = abcd[0], r8 = abcd[1],
// r9 = abcd[2], r10 = row counter, r11 = filter table base.
// q8-q15 hold a sliding window of eight horizontally filtered rows.
.macro warp t, shift
|
||||
function warp_affine_8x8\t\()_8bpc_neon, export=1
|
||||
push {r4-r11,lr}
|
||||
vpush {q4-q7}
|
||||
// 100 = 9 core registers (36 bytes) + q4-q7 (64 bytes) pushed above.
ldrd r4, r5, [sp, #100]
|
||||
ldr r6, [sp, #108]
|
||||
// Load the four 16-bit abcd entries as two words and sign-extend each half.
ldrd r8, r9, [r4]
|
||||
sxth r7, r8
|
||||
asr r8, r8, #16
|
||||
asr r4, r9, #16
|
||||
sxth r9, r9
|
||||
mov r10, #8
|
||||
// src -= 3*src_stride + 3
sub r2, r2, r3, lsl #1
|
||||
sub r2, r2, r3
|
||||
sub r2, r2, #3
|
||||
movrel r11, X(mc_warp_filter), 64*8
|
||||
.ifnb \t
|
||||
// NOTE(review): presumably dst_stride is passed in 16-bit units for this
// variant and is converted to bytes here - confirm against the caller.
lsl r1, r1, #1
|
||||
.endif
|
||||
// 512 is half of 1 << 10, so the asr #10 in load_filter_ptr rounds to
// nearest.
add r5, r5, #512
|
||||
add r6, r6, #512
|
||||
|
||||
|
||||
// Prime the window with the first seven filtered rows.
bl warp_filter_horz_neon
|
||||
vmov q8, q5
|
||||
bl warp_filter_horz_neon
|
||||
vmov q9, q5
|
||||
bl warp_filter_horz_neon
|
||||
vmov q10, q5
|
||||
bl warp_filter_horz_neon
|
||||
vmov q11, q5
|
||||
bl warp_filter_horz_neon
|
||||
vmov q12, q5
|
||||
bl warp_filter_horz_neon
|
||||
vmov q13, q5
|
||||
bl warp_filter_horz_neon
|
||||
vmov q14, q5
|
||||
|
||||
|
||||
1:
|
||||
bl warp_filter_horz_neon
|
||||
vmov q15, q5
|
||||
|
||||
|
||||
// Load one 8-tap vertical filter per output column, stepping r6 by r9.
load_filter_row d8, r6, r9
|
||||
load_filter_row d9, r6, r9
|
||||
load_filter_row d10, r6, r9
|
||||
load_filter_row d11, r6, r9
|
||||
load_filter_row d12, r6, r9
|
||||
load_filter_row d13, r6, r9
|
||||
load_filter_row d14, r6, r9
|
||||
load_filter_row d15, r6, r9
|
||||
// After the transpose each d register holds one tap for all eight columns.
transpose_8x8b q4, q5, q6, q7, d8, d9, d10, d11, d12, d13, d14, d15
|
||||
// The widening below is ordered so that each source d register is read
// before the q register overlapping it is written.
vmovl.s8 q1, d8
|
||||
vmovl.s8 q2, d9
|
||||
vmovl.s8 q3, d10
|
||||
vmovl.s8 q4, d11
|
||||
vmovl.s8 q5, d12
|
||||
vmovl.s8 q6, d13
|
||||
|
||||
|
||||
// Undo the eight per-column increments applied to r6 above.
sub r6, r6, r9, lsl #3
|
||||
|
||||
|
||||
// This ordering of vmull/vmlal is highly beneficial for
|
||||
// Cortex A8/A9/A53 here, but harmful for Cortex A7.
|
||||
vmull.s16 q0, d16, d2
|
||||
vmlal.s16 q0, d18, d4
|
||||
vmlal.s16 q0, d20, d6
|
||||
vmlal.s16 q0, d22, d8
|
||||
vmlal.s16 q0, d24, d10
|
||||
vmlal.s16 q0, d26, d12
|
||||
vmull.s16 q1, d17, d3
|
||||
vmlal.s16 q1, d19, d5
|
||||
vmlal.s16 q1, d21, d7
|
||||
vmlal.s16 q1, d23, d9
|
||||
vmlal.s16 q1, d25, d11
|
||||
vmlal.s16 q1, d27, d13
|
||||
|
||||
|
||||
vmovl.s8 q2, d14
|
||||
vmovl.s8 q3, d15
|
||||
|
||||
|
||||
vmlal.s16 q0, d28, d4
|
||||
vmlal.s16 q0, d30, d6
|
||||
vmlal.s16 q1, d29, d5
|
||||
vmlal.s16 q1, d31, d7
|
||||
|
||||
|
||||
// Slide the row window up by one, interleaved with the narrowing.
vmov q8, q9
|
||||
vmov q9, q10
|
||||
vqrshrn.s32 d0, q0, #\shift
|
||||
vmov q10, q11
|
||||
vqrshrn.s32 d1, q1, #\shift
|
||||
vmov q11, q12
|
||||
vmov q12, q13
|
||||
.ifb \t
|
||||
vqmovun.s16 d0, q0
|
||||
.endif
|
||||
vmov q13, q14
|
||||
vmov q14, q15
|
||||
// Sets the flags for the bgt below; the stores and the add do not change
// them.
subs r10, r10, #1
|
||||
.ifnb \t
|
||||
vst1.16 {q0}, [r0, :128], r1
|
||||
.else
|
||||
vst1.8 {d0}, [r0, :64], r1
|
||||
.endif
|
||||
|
||||
|
||||
// Step the vertical filter position to the next output row.
add r6, r6, r4
|
||||
bgt 1b
|
||||
|
||||
|
||||
vpop {q4-q7}
|
||||
pop {r4-r11,pc}
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
|
||||
warp , 11
|
||||
warp t, 7
|
||||
|
|
|
@ -32,6 +32,20 @@
|
|||
#include "config.h"
|
||||
#include "src/arm/asm.S"
|
||||
|
||||
// Load the address of \val + \offset into \rd.
// With PIC defined, a PC-relative displacement is stored in an inline
// literal word (branched over) and added to pc. The "8 - 4 * CONFIG_THUMB"
// term accounts for pc reading 8 bytes ahead of the add in ARM mode and 4
// bytes ahead in Thumb mode.
// Otherwise the absolute address is built with a movw/movt pair.
.macro movrel_local rd, val, offset=0
|
||||
#if defined(PIC)
|
||||
ldr \rd, 1f
|
||||
b 2f
|
||||
1:
|
||||
.word \val + \offset - (2f + 8 - 4 * CONFIG_THUMB)
|
||||
2:
|
||||
add \rd, \rd, pc
|
||||
#else
|
||||
movw \rd, #:lower16:\val+\offset
|
||||
movt \rd, #:upper16:\val+\offset
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro movrel rd, val, offset=0
|
||||
#if defined(PIC) && defined(__APPLE__)
|
||||
ldr \rd, 1f
|
||||
|
@ -50,17 +64,24 @@
|
|||
.indirect_symbol \val
|
||||
.word 0
|
||||
.text
|
||||
#elif defined(PIC)
|
||||
ldr \rd, 1f
|
||||
b 2f
|
||||
1:
|
||||
.word \val + \offset - (2f + 8 - 4 * CONFIG_THUMB)
|
||||
2:
|
||||
add \rd, \rd, pc
|
||||
#else
|
||||
movw \rd, #:lower16:\val+\offset
|
||||
movt \rd, #:upper16:\val+\offset
|
||||
movrel_local \rd, \val, \offset
|
||||
#endif
|
||||
.endm
|
||||
|
||||
// In-place transpose of an 8x8 byte matrix held in the eight d registers
// \r0-\r7, done as three rounds of vtrn at 32-, 16- and 8-bit granularity.
// \q0-\q3 are expected to be the q registers aliasing (\r0,\r1), (\r2,\r3),
// (\r4,\r5) and (\r6,\r7); the caller in the warp code passes q4-q7 with
// d8-d15.
.macro transpose_8x8b q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7
|
||||
vtrn.32 \q0, \q2
|
||||
vtrn.32 \q1, \q3
|
||||
|
||||
|
||||
vtrn.16 \r0, \r2
|
||||
vtrn.16 \r1, \r3
|
||||
vtrn.16 \r4, \r6
|
||||
vtrn.16 \r5, \r7
|
||||
|
||||
|
||||
vtrn.8 \r0, \r1
|
||||
vtrn.8 \r2, \r3
|
||||
vtrn.8 \r4, \r5
|
||||
vtrn.8 \r6, \r7
|
||||
.endm
|
||||
|
||||
#endif /* DAV1D_SRC_ARM_32_UTIL_S */
|
||||
|
|
|
@ -129,6 +129,14 @@
|
|||
3:
|
||||
.endm
|
||||
|
||||
.macro load_n_incr dst, src, incr, w
|
||||
.if \w == 4
|
||||
ld1 {\dst\().s}[0], [\src], \incr
|
||||
.else
|
||||
ld1 {\dst\().8b}, [\src], \incr
|
||||
.endif
|
||||
.endm
|
||||
|
||||
// void dav1d_cdef_paddingX_neon(uint16_t *tmp, const pixel *src,
|
||||
// ptrdiff_t src_stride, const pixel (*left)[2],
|
||||
// /*const*/ pixel *const top[2], int h,
|
||||
|
@ -163,9 +171,8 @@ function cdef_padding\w\()_neon, export=1
|
|||
// CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
|
||||
0:
|
||||
ld1 {v0.h}[0], [x3], #2
|
||||
ldr \rn\()1, [x1]
|
||||
ldr h2, [x1, #\w]
|
||||
add x1, x1, x2
|
||||
load_n_incr v1, x1, x2, \w
|
||||
subs w5, w5, #1
|
||||
uxtl v0.8h, v0.8b
|
||||
uxtl v1.8h, v1.8b
|
||||
|
@ -179,11 +186,7 @@ function cdef_padding\w\()_neon, export=1
|
|||
1:
|
||||
// CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
|
||||
ld1 {v0.h}[0], [x3], #2
|
||||
.if \w == 8
|
||||
ld1 {v1.8b}, [x1], x2
|
||||
.else
|
||||
ld1 {v1.s}[0], [x1], x2
|
||||
.endif
|
||||
load_n_incr v1, x1, x2, \w
|
||||
subs w5, w5, #1
|
||||
uxtl v0.8h, v0.8b
|
||||
uxtl v1.8h, v1.8b
|
||||
|
@ -198,9 +201,8 @@ function cdef_padding\w\()_neon, export=1
|
|||
b.eq 1f
|
||||
// !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
|
||||
0:
|
||||
ldr \rn\()0, [x1]
|
||||
ldr h1, [x1, #\w]
|
||||
add x1, x1, x2
|
||||
load_n_incr v0, x1, x2, \w
|
||||
subs w5, w5, #1
|
||||
uxtl v0.8h, v0.8b
|
||||
uxtl v1.8h, v1.8b
|
||||
|
@ -212,11 +214,7 @@ function cdef_padding\w\()_neon, export=1
|
|||
b 3f
|
||||
1:
|
||||
// !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
|
||||
.if \w == 8
|
||||
ld1 {v0.8b}, [x1], x2
|
||||
.else
|
||||
ld1 {v0.s}[0], [x1], x2
|
||||
.endif
|
||||
load_n_incr v0, x1, x2, \w
|
||||
subs w5, w5, #1
|
||||
uxtl v0.8h, v0.8b
|
||||
str s31, [x0]
|
||||
|
@ -299,17 +297,17 @@ endconst
|
|||
uabd v20.8h, v0.8h, \s2\().8h // abs(diff)
|
||||
ushl v17.8h, v16.8h, \shift // abs(diff) >> shift
|
||||
ushl v21.8h, v20.8h, \shift // abs(diff) >> shift
|
||||
uqsub v17.8h, \thresh_vec, v17.8h // imax(0, threshold - (abs(diff) >> shift))
|
||||
uqsub v21.8h, \thresh_vec, v21.8h // imax(0, threshold - (abs(diff) >> shift))
|
||||
cmhi v18.8h, v0.8h, \s1\().8h // px > p0
|
||||
cmhi v22.8h, v0.8h, \s2\().8h // px > p1
|
||||
umin v17.8h, v17.8h, v16.8h // imin(abs(diff), imax())
|
||||
umin v21.8h, v21.8h, v20.8h // imin(abs(diff), imax())
|
||||
uqsub v17.8h, \thresh_vec, v17.8h // clip = imax(0, threshold - (abs(diff) >> shift))
|
||||
uqsub v21.8h, \thresh_vec, v21.8h // clip = imax(0, threshold - (abs(diff) >> shift))
|
||||
sub v18.8h, \s1\().8h, v0.8h // diff = p0 - px
|
||||
sub v22.8h, \s2\().8h, v0.8h // diff = p1 - px
|
||||
neg v16.8h, v17.8h // -clip
|
||||
neg v20.8h, v21.8h // -clip
|
||||
smin v18.8h, v18.8h, v17.8h // imin(diff, clip)
|
||||
smin v22.8h, v22.8h, v21.8h // imin(diff, clip)
|
||||
dup v19.8h, \tap // taps[k]
|
||||
neg v16.8h, v17.8h // -imin()
|
||||
neg v20.8h, v21.8h // -imin()
|
||||
bsl v18.16b, v16.16b, v17.16b // constrain() = apply_sign()
|
||||
bsl v22.16b, v20.16b, v21.16b // constrain() = apply_sign()
|
||||
smax v18.8h, v18.8h, v16.8h // constrain() = imax(imin(diff, clip), -clip)
|
||||
smax v22.8h, v22.8h, v20.8h // constrain() = imax(imin(diff, clip), -clip)
|
||||
mla v1.8h, v18.8h, v19.8h // sum += taps[k] * constrain()
|
||||
mla v1.8h, v22.8h, v19.8h // sum += taps[k] * constrain()
|
||||
3:
|
||||
|
@ -325,19 +323,18 @@ function cdef_filter\w\()_neon, export=1
|
|||
add x8, x8, w9, uxtw #1
|
||||
movrel x9, directions\w
|
||||
add x5, x9, w5, uxtw #1
|
||||
movi v30.8h, #15
|
||||
dup v28.8h, w6 // damping
|
||||
movi v30.4h, #15
|
||||
dup v28.4h, w6 // damping
|
||||
|
||||
dup v25.8h, w3 // threshold
|
||||
dup v27.8h, w4 // threshold
|
||||
clz v24.8h, v25.8h // clz(threshold)
|
||||
clz v26.8h, v27.8h // clz(threshold)
|
||||
sub v24.8h, v30.8h, v24.8h // ulog2(threshold)
|
||||
sub v26.8h, v30.8h, v26.8h // ulog2(threshold)
|
||||
uqsub v24.8h, v28.8h, v24.8h // shift = imax(0, damping - ulog2(threshold))
|
||||
uqsub v26.8h, v28.8h, v26.8h // shift = imax(0, damping - ulog2(threshold))
|
||||
neg v24.8h, v24.8h // -shift
|
||||
neg v26.8h, v26.8h // -shift
|
||||
trn1 v24.4h, v25.4h, v27.4h
|
||||
clz v24.4h, v24.4h // clz(threshold)
|
||||
sub v24.4h, v30.4h, v24.4h // ulog2(threshold)
|
||||
uqsub v24.4h, v28.4h, v24.4h // shift = imax(0, damping - ulog2(threshold))
|
||||
neg v24.4h, v24.4h // -shift
|
||||
dup v26.8h, v24.h[1]
|
||||
dup v24.8h, v24.h[0]
|
||||
|
||||
1:
|
||||
.if \w == 8
|
||||
|
@ -467,15 +464,15 @@ function cdef_find_dir_neon, export=1
|
|||
ext v24.16b, v30.16b, v29.16b, #(16-2*\i)
|
||||
ext v25.16b, v29.16b, v30.16b, #(16-2*\i)
|
||||
add v6.8h, v6.8h, v22.8h // sum_alt[0]
|
||||
add v7.8h, v7.8h, v23.8h // sum_alt[0]
|
||||
add v7.4h, v7.4h, v23.4h // sum_alt[0]
|
||||
add v16.8h, v16.8h, v24.8h // sum_alt[1]
|
||||
add v17.8h, v17.8h, v25.8h // sum_alt[1]
|
||||
add v17.4h, v17.4h, v25.4h // sum_alt[1]
|
||||
.endif
|
||||
.if \i < 6
|
||||
ext v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2)))
|
||||
ext v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2)))
|
||||
add v18.8h, v18.8h, v22.8h // sum_alt[2]
|
||||
add v19.8h, v19.8h, v23.8h // sum_alt[2]
|
||||
add v19.4h, v19.4h, v23.4h // sum_alt[2]
|
||||
.else
|
||||
add v18.8h, v18.8h, v26.8h // sum_alt[2]
|
||||
.endif
|
||||
|
@ -487,7 +484,7 @@ function cdef_find_dir_neon, export=1
|
|||
ext v24.16b, v30.16b, v26.16b, #(16-2*(\i/2))
|
||||
ext v25.16b, v26.16b, v30.16b, #(16-2*(\i/2))
|
||||
add v20.8h, v20.8h, v24.8h // sum_alt[3]
|
||||
add v21.8h, v21.8h, v25.8h // sum_alt[3]
|
||||
add v21.4h, v21.4h, v25.4h // sum_alt[3]
|
||||
.endif
|
||||
.endr
|
||||
|
||||
|
@ -504,10 +501,8 @@ function cdef_find_dir_neon, export=1
|
|||
|
||||
rev64 v1.8h, v1.8h
|
||||
rev64 v3.8h, v3.8h
|
||||
ext v1.16b, v1.16b, v1.16b, #8 // sum_diag[0][15-n]
|
||||
ext v3.16b, v3.16b, v3.16b, #8 // sum_diag[1][15-n]
|
||||
ext v1.16b, v1.16b, v1.16b, #2 // sum_diag[0][14-n]
|
||||
ext v3.16b, v3.16b, v3.16b, #2 // sum_diag[1][14-n]
|
||||
ext v1.16b, v1.16b, v1.16b, #10 // sum_diag[0][14-n]
|
||||
ext v3.16b, v3.16b, v3.16b, #10 // sum_diag[1][14-n]
|
||||
|
||||
str s4, [sp, #2*4] // cost[2]
|
||||
str s5, [sp, #6*4] // cost[6]
|
||||
|
@ -559,16 +554,17 @@ function cdef_find_dir_neon, export=1
|
|||
addv \d2, v25.4s // *cost_ptr
|
||||
.endm
|
||||
cost_alt s6, s16, v6, v7, v16, v17 // cost[1], cost[3]
|
||||
cost_alt s18, s20, v18, v19, v20, v21 // cost[5], cost[7]
|
||||
str s6, [sp, #1*4] // cost[1]
|
||||
str s16, [sp, #3*4] // cost[3]
|
||||
cost_alt s18, s20, v18, v19, v20, v21 // cost[5], cost[7]
|
||||
str s18, [sp, #5*4] // cost[5]
|
||||
str s20, [sp, #7*4] // cost[7]
|
||||
|
||||
mov w0, #0 // best_dir
|
||||
mov w1, v0.s[0] // best_cost
|
||||
mov w3, #1 // n
|
||||
|
||||
str s18, [sp, #5*4] // cost[5]
|
||||
str s20, [sp, #7*4] // cost[7]
|
||||
|
||||
mov w4, v6.s[0]
|
||||
|
||||
.macro find_best s1, s2, s3
|
||||
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -148,27 +148,6 @@ endconst
|
|||
.endif
|
||||
.endm
|
||||
|
||||
.macro saddl_sz d0, d1, s0, s1, sz
|
||||
saddl \d0\().4s, \s0\().4h, \s1\().4h
|
||||
.ifc \sz, .8h
|
||||
saddl2 \d1\().4s, \s0\().8h, \s1\().8h
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro ssubl_sz d0, d1, s0, s1, sz
|
||||
ssubl \d0\().4s, \s0\().4h, \s1\().4h
|
||||
.ifc \sz, .8h
|
||||
ssubl2 \d1\().4s, \s0\().8h, \s1\().8h
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro mul_4s_sz d0, d1, s0, s1, c, sz
|
||||
mul \d0\().4s, \s0\().4s, \c
|
||||
.ifc \sz, .8h
|
||||
mul \d1\().4s, \s1\().4s, \c
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro scale_input sz, c, r0, r1, r2 r3, r4, r5, r6, r7
|
||||
sqrdmulh \r0\sz, \r0\sz, \c
|
||||
sqrdmulh \r1\sz, \r1\sz, \c
|
||||
|
@ -489,18 +468,18 @@ endfunc
|
|||
.endm
|
||||
|
||||
.macro idct_4 r0, r1, r2, r3, sz
|
||||
add v2\sz, \r0\sz, \r2\sz
|
||||
sub v3\sz, \r0\sz, \r2\sz
|
||||
smull_smlal v6, v7, \r1, \r3, v0.h[3], v0.h[2], \sz
|
||||
smull_smlsl v4, v5, \r1, \r3, v0.h[2], v0.h[3], \sz
|
||||
sqrdmulh v2\sz, v2\sz, v0.h[1]
|
||||
sqrdmulh v3\sz, v3\sz, v0.h[1]
|
||||
smull_smlal v2, v3, \r0, \r2, v0.h[0], v0.h[0], \sz
|
||||
rshrn_sz v6, v6, v7, #12, \sz
|
||||
rshrn_sz v4, v4, v5, #12, \sz
|
||||
rshrn_sz v7, v4, v5, #12, \sz
|
||||
smull_smlsl v4, v5, \r0, \r2, v0.h[0], v0.h[0], \sz
|
||||
rshrn_sz v2, v2, v3, #12, \sz
|
||||
rshrn_sz v3, v4, v5, #12, \sz
|
||||
sqadd \r0\sz, v2\sz, v6\sz
|
||||
sqsub \r3\sz, v2\sz, v6\sz
|
||||
sqadd \r1\sz, v3\sz, v4\sz
|
||||
sqsub \r2\sz, v3\sz, v4\sz
|
||||
sqadd \r1\sz, v3\sz, v7\sz
|
||||
sqsub \r2\sz, v3\sz, v7\sz
|
||||
.endm
|
||||
|
||||
function inv_dct_4x4_neon
|
||||
|
@ -780,11 +759,10 @@ def_fn_4x4 identity, flipadst
|
|||
sqadd v3\sz, \r7\sz, \r5\sz // t7
|
||||
sqsub \r3\sz, \r7\sz, \r5\sz // t6a
|
||||
|
||||
sub \r5\sz, \r3\sz, \r1\sz // -> t5
|
||||
add \r7\sz, \r3\sz, \r1\sz // -> t6
|
||||
|
||||
sqrdmulh v4\sz, \r5\sz, v0.h[1] // t5
|
||||
sqrdmulh v5\sz, \r7\sz, v0.h[1] // t6
|
||||
smull_smlsl v4, v5, \r3, \r1, v0.h[0], v0.h[0], \sz // -> t5
|
||||
smull_smlal v6, v7, \r3, \r1, v0.h[0], v0.h[0], \sz // -> t6
|
||||
rshrn_sz v4, v4, v5, #12, \sz // t5
|
||||
rshrn_sz v5, v6, v7, #12, \sz // t6
|
||||
|
||||
sqsub \r7\sz, \r0\sz, v3\sz // out7
|
||||
sqadd \r0\sz, \r0\sz, v3\sz // out0
|
||||
|
@ -865,22 +843,14 @@ endfunc
|
|||
sqsub v5\sz, v5\sz, v19\sz // t7
|
||||
sqneg \o1\()\sz, \o1\()\sz // out1
|
||||
|
||||
movi v0.4s, #2896>>4
|
||||
|
||||
saddl_sz v18, v19, v2, v4, \sz // -> out3 (v19 or v20)
|
||||
ssubl_sz v6, v7, v2, v4, \sz // -> out4 (v20 or v19)
|
||||
ssubl_sz v20, v21, v3, v5, \sz // -> out5 (v21 or v18)
|
||||
saddl_sz v4, v5, v3, v5, \sz // -> out2 (v18 or v21)
|
||||
|
||||
mul_4s_sz v18, v19, v18, v19, v0.s[0], \sz
|
||||
mul_4s_sz v6, v7, v6, v7, v0.s[0], \sz
|
||||
mul_4s_sz v20, v21, v20, v21, v0.s[0], \sz
|
||||
mul_4s_sz v4, v5, v4, v5, v0.s[0], \sz
|
||||
|
||||
rshrn_sz v2, v18, v19, #8, \sz // out3
|
||||
rshrn_sz v3, v20, v21, #8, \sz // out5
|
||||
rshrn_sz \o2, v4, v5, #8, \sz // out2 (v18 or v21)
|
||||
rshrn_sz \o4, v6, v7, #8, \sz // out4 (v20 or v19)
|
||||
smull_smlal v18, v19, v2, v4, v1.h[0], v1.h[0], \sz // -> out3 (v19 or v20)
|
||||
smull_smlsl v6, v7, v2, v4, v1.h[0], v1.h[0], \sz // -> out4 (v20 or v19)
|
||||
smull_smlsl v20, v21, v3, v5, v1.h[0], v1.h[0], \sz // -> out5 (v21 or v18)
|
||||
rshrn_sz v2, v18, v19, #12, \sz // out3
|
||||
smull_smlal v18, v19, v3, v5, v1.h[0], v1.h[0], \sz // -> out2 (v18 or v21)
|
||||
rshrn_sz v3, v20, v21, #12, \sz // out5
|
||||
rshrn_sz \o2, v18, v19, #12, \sz // out2 (v18 or v21)
|
||||
rshrn_sz \o4, v6, v7, #12, \sz // out4 (v20 or v19)
|
||||
|
||||
sqneg \o3\()\sz, v2\sz // out3
|
||||
sqneg \o5\()\sz, v3\sz // out5
|
||||
|
@ -1127,14 +1097,15 @@ def_fns_48 8, 4
|
|||
sqsub v25\sz, v27\sz, v29\sz // t13
|
||||
sqadd v27\sz, v27\sz, v29\sz // t14
|
||||
|
||||
sub v23\sz, v3\sz, v2\sz // -> t11
|
||||
add v29\sz, v3\sz, v2\sz // -> t12
|
||||
sub v6\sz, v25\sz, v21\sz // -> t10a
|
||||
add v7\sz, v25\sz, v21\sz // -> t13a
|
||||
sqrdmulh v2\sz, v23\sz, v0.h[1] // t11
|
||||
sqrdmulh v3\sz, v29\sz, v0.h[1] // t12
|
||||
sqrdmulh v4\sz, v6\sz, v0.h[1] // t10a
|
||||
sqrdmulh v5\sz, v7\sz, v0.h[1] // t13a
|
||||
smull_smlsl v4, v5, v3, v2, v0.h[0], v0.h[0], \sz // -> t11
|
||||
smull_smlal v6, v7, v3, v2, v0.h[0], v0.h[0], \sz // -> t12
|
||||
smull_smlsl v2, v3, v25, v21, v0.h[0], v0.h[0], \sz // -> t10a
|
||||
|
||||
rshrn_sz v4, v4, v5, #12, \sz // t11
|
||||
rshrn_sz v5, v6, v7, #12, \sz // t12
|
||||
smull_smlal v6, v7, v25, v21, v0.h[0], v0.h[0], \sz // -> t10a
|
||||
rshrn_sz v2, v2, v3, #12, \sz // t10a
|
||||
rshrn_sz v3, v6, v7, #12, \sz // t13a
|
||||
|
||||
sqadd v6\sz, v16\sz, v31\sz // out0
|
||||
sqsub v31\sz, v16\sz, v31\sz // out15
|
||||
|
@ -1143,18 +1114,18 @@ def_fns_48 8, 4
|
|||
sqsub v7\sz, v30\sz, v17\sz // out8
|
||||
sqadd v17\sz, v18\sz, v27\sz // out1
|
||||
sqsub v30\sz, v18\sz, v27\sz // out14
|
||||
sqadd v18\sz, v20\sz, v5\sz // out2
|
||||
sqsub v29\sz, v20\sz, v5\sz // out13
|
||||
sqadd v5\sz, v28\sz, v19\sz // out6
|
||||
sqadd v18\sz, v20\sz, v3\sz // out2
|
||||
sqsub v29\sz, v20\sz, v3\sz // out13
|
||||
sqadd v3\sz, v28\sz, v19\sz // out6
|
||||
sqsub v25\sz, v28\sz, v19\sz // out9
|
||||
sqadd v19\sz, v22\sz, v3\sz // out3
|
||||
sqsub v28\sz, v22\sz, v3\sz // out12
|
||||
sqadd v20\sz, v24\sz, v2\sz // out4
|
||||
sqsub v27\sz, v24\sz, v2\sz // out11
|
||||
sqadd v21\sz, v26\sz, v4\sz // out5
|
||||
sqsub v26\sz, v26\sz, v4\sz // out10
|
||||
sqadd v19\sz, v22\sz, v5\sz // out3
|
||||
sqsub v28\sz, v22\sz, v5\sz // out12
|
||||
sqadd v20\sz, v24\sz, v4\sz // out4
|
||||
sqsub v27\sz, v24\sz, v4\sz // out11
|
||||
sqadd v21\sz, v26\sz, v2\sz // out5
|
||||
sqsub v26\sz, v26\sz, v2\sz // out10
|
||||
mov v24\szb, v7\szb
|
||||
mov v22\szb, v5\szb
|
||||
mov v22\szb, v3\szb
|
||||
.endm
|
||||
|
||||
function inv_dct_8x16_neon
|
||||
|
@ -1310,37 +1281,25 @@ endfunc
|
|||
sqsub v23\sz, v25\sz, v23\sz // t7
|
||||
sqneg \o3\sz, \o3\sz // out3
|
||||
|
||||
movi v0.4s, #2896>>4
|
||||
smull_smlsl v24, v25, v2, v21, v0.h[0], v0.h[0], \sz // -> out8 (v24 or v23)
|
||||
smull_smlal v4, v5, v2, v21, v0.h[0], v0.h[0], \sz // -> out7 (v23 or v24)
|
||||
smull_smlal v6, v7, v26, v3, v0.h[0], v0.h[0], \sz // -> out5 (v21 or v26)
|
||||
|
||||
ssubl_sz v24, v25, v2, v21, \sz // -> out8 (v24 or v23)
|
||||
saddl_sz v4, v5, v2, v21, \sz // -> out7 (v23 or v24)
|
||||
saddl_sz v6, v7, v26, v3, \sz // -> out5 (v21 or v26)
|
||||
ssubl_sz v2, v3, v26, v3, \sz // -> out10 (v26 or v21)
|
||||
rshrn_sz v24, v24, v25, #12, \sz // out8
|
||||
rshrn_sz v4, v4, v5, #12, \sz // out7
|
||||
rshrn_sz v5, v6, v7, #12, \sz // out5
|
||||
smull_smlsl v6, v7, v26, v3, v0.h[0], v0.h[0], \sz // -> out10 (v26 or v21)
|
||||
smull_smlal v2, v3, v22, v23, v0.h[0], v0.h[0], \sz // -> out4 (v20 or v27)
|
||||
rshrn_sz v26, v6, v7, #12, \sz // out10
|
||||
|
||||
mul_4s_sz v24, v25, v24, v25, v0.s[0], \sz
|
||||
mul_4s_sz v4, v5, v4, v5, v0.s[0], \sz
|
||||
mul_4s_sz v6, v7, v6, v7, v0.s[0], \sz
|
||||
mul_4s_sz v2, v3, v2, v3, v0.s[0], \sz
|
||||
smull_smlsl v6, v7, v22, v23, v0.h[0], v0.h[0], \sz // -> out11 (v27 or v20)
|
||||
smull_smlal v22, v23, v27, v20, v0.h[0], v0.h[0], \sz // -> out6 (v22 or v25)
|
||||
smull_smlsl v21, v25, v27, v20, v0.h[0], v0.h[0], \sz // -> out9 (v25 or v22)
|
||||
|
||||
rshrn_sz v24, v24, v25, #8, \sz // out8
|
||||
rshrn_sz v4, v4, v5, #8, \sz // out7
|
||||
rshrn_sz v5, v6, v7, #8, \sz // out5
|
||||
rshrn_sz v26, v2, v3, #8, \sz // out10
|
||||
|
||||
saddl_sz v2, v3, v22, v23, \sz // -> out4 (v20 or v27)
|
||||
ssubl_sz v6, v7, v22, v23, \sz // -> out11 (v27 or v20)
|
||||
saddl_sz v22, v23, v27, v20, \sz // -> out6 (v22 or v25)
|
||||
ssubl_sz v21, v25, v27, v20, \sz // -> out9 (v25 or v22)
|
||||
|
||||
mul_4s_sz v2, v3, v2, v3, v0.s[0], \sz
|
||||
mul_4s_sz v6, v7, v6, v7, v0.s[0], \sz
|
||||
mul_4s_sz v22, v23, v22, v23, v0.s[0], \sz
|
||||
mul_4s_sz v21, v25, v21, v25, v0.s[0], \sz
|
||||
|
||||
rshrn_sz \o4, v2, v3, #8, \sz // out4
|
||||
rshrn_sz v6, v6, v7, #8, \sz // out11
|
||||
rshrn_sz v7, v21, v25, #8, \sz // out9
|
||||
rshrn_sz \o6, v22, v23, #8, \sz // out6
|
||||
rshrn_sz \o4, v2, v3, #12, \sz // out4
|
||||
rshrn_sz v6, v6, v7, #12, \sz // out11
|
||||
rshrn_sz v7, v21, v25, #12, \sz // out9
|
||||
rshrn_sz \o6, v22, v23, #12, \sz // out6
|
||||
|
||||
.ifc \o8, v23
|
||||
mov \o8\szb, v24\szb
|
||||
|
@ -1915,22 +1874,26 @@ function inv_dct32_odd_8x16_neon
|
|||
sqsub v24.8h, v24.8h, v19.8h // t27a
|
||||
mov v19.16b, v4.16b // out19
|
||||
|
||||
sub v20.8h, v24.8h, v26.8h // -> t20
|
||||
add v4.8h, v24.8h, v26.8h // -> t27
|
||||
sub v5.8h, v25.8h, v27.8h // -> t21a
|
||||
add v26.8h, v25.8h, v27.8h // -> t26a
|
||||
sqrdmulh v20.8h, v20.8h, v0.h[1] // t20 = out20
|
||||
sqrdmulh v27.8h, v4.8h, v0.h[1] // t27 = out27
|
||||
sub v22.8h, v21.8h, v23.8h // -> t22
|
||||
add v25.8h, v21.8h, v23.8h // -> t25
|
||||
sqrdmulh v21.8h, v5.8h, v0.h[1] // t21a = out21
|
||||
sqrdmulh v26.8h, v26.8h, v0.h[1] // t26a = out26
|
||||
sub v23.8h, v3.8h, v2.8h // -> t23a
|
||||
add v24.8h, v3.8h, v2.8h // -> t24a
|
||||
sqrdmulh v22.8h, v22.8h, v0.h[1] // t22 = out22
|
||||
sqrdmulh v25.8h, v25.8h, v0.h[1] // t25 = out25
|
||||
sqrdmulh v23.8h, v23.8h, v0.h[1] // t23a = out23
|
||||
sqrdmulh v24.8h, v24.8h, v0.h[1] // t24a = out24
|
||||
smull_smlsl v4, v5, v24, v26, v0.h[0], v0.h[0], .8h // -> t20
|
||||
smull_smlal v6, v7, v24, v26, v0.h[0], v0.h[0], .8h // -> t27
|
||||
rshrn_sz v20, v4, v5, #12, .8h // t20
|
||||
rshrn_sz v22, v6, v7, #12, .8h // t27
|
||||
|
||||
smull_smlal v4, v5, v25, v27, v0.h[0], v0.h[0], .8h // -> t26a
|
||||
smull_smlsl v6, v7, v25, v27, v0.h[0], v0.h[0], .8h // -> t21a
|
||||
mov v27.16b, v22.16b // t27
|
||||
rshrn_sz v26, v4, v5, #12, .8h // t26a
|
||||
|
||||
smull_smlsl v24, v25, v21, v23, v0.h[0], v0.h[0], .8h // -> t22
|
||||
smull_smlal v4, v5, v21, v23, v0.h[0], v0.h[0], .8h // -> t25
|
||||
rshrn_sz v21, v6, v7, #12, .8h // t21a
|
||||
rshrn_sz v22, v24, v25, #12, .8h // t22
|
||||
rshrn_sz v25, v4, v5, #12, .8h // t25
|
||||
|
||||
smull_smlsl v4, v5, v3, v2, v0.h[0], v0.h[0], .8h // -> t23a
|
||||
smull_smlal v6, v7, v3, v2, v0.h[0], v0.h[0], .8h // -> t24a
|
||||
rshrn_sz v23, v4, v5, #12, .8h // t23a
|
||||
rshrn_sz v24, v6, v7, #12, .8h // t24a
|
||||
|
||||
ret
|
||||
endfunc
|
||||
|
|
|
@ -2975,7 +2975,9 @@ function warp_filter_horz_neon
|
|||
ld1 {v16.8b, v17.8b}, [x2], x3
|
||||
|
||||
load_filter_row d0, w12, w7
|
||||
uxtl v16.8h, v16.8b
|
||||
load_filter_row d1, w12, w7
|
||||
uxtl v17.8h, v17.8b
|
||||
load_filter_row d2, w12, w7
|
||||
sxtl v0.8h, v0.8b
|
||||
load_filter_row d3, w12, w7
|
||||
|
@ -2988,16 +2990,12 @@ function warp_filter_horz_neon
|
|||
sxtl v4.8h, v4.8b
|
||||
load_filter_row d7, w12, w7
|
||||
sxtl v5.8h, v5.8b
|
||||
sxtl v6.8h, v6.8b
|
||||
sxtl v7.8h, v7.8b
|
||||
|
||||
uxtl v16.8h, v16.8b
|
||||
uxtl v17.8h, v17.8b
|
||||
|
||||
ext v18.16b, v16.16b, v17.16b, #2*1
|
||||
mul v23.8h, v16.8h, v0.8h
|
||||
sxtl v6.8h, v6.8b
|
||||
ext v19.16b, v16.16b, v17.16b, #2*2
|
||||
mul v18.8h, v18.8h, v1.8h
|
||||
sxtl v7.8h, v7.8b
|
||||
ext v20.16b, v16.16b, v17.16b, #2*3
|
||||
mul v19.8h, v19.8h, v2.8h
|
||||
ext v21.16b, v16.16b, v17.16b, #2*4
|
||||
|
@ -3009,28 +3007,20 @@ function warp_filter_horz_neon
|
|||
saddlp v19.4s, v19.8h
|
||||
mul v22.8h, v22.8h, v5.8h
|
||||
saddlp v20.4s, v20.8h
|
||||
addv s23, v23.4s
|
||||
saddlp v21.4s, v21.8h
|
||||
addv s18, v18.4s
|
||||
saddlp v22.4s, v22.8h
|
||||
addv s19, v19.4s
|
||||
trn1 v18.2s, v23.2s, v18.2s
|
||||
addv s20, v20.4s
|
||||
addp v18.4s, v23.4s, v18.4s
|
||||
ext v23.16b, v16.16b, v17.16b, #2*6
|
||||
trn1 v19.2s, v19.2s, v20.2s
|
||||
addv s21, v21.4s
|
||||
addp v19.4s, v19.4s, v20.4s
|
||||
mul v23.8h, v23.8h, v6.8h
|
||||
ext v20.16b, v16.16b, v17.16b, #2*7
|
||||
addv s22, v22.4s
|
||||
mul v20.8h, v20.8h, v7.8h
|
||||
saddlp v23.4s, v23.8h
|
||||
trn1 v21.2s, v21.2s, v22.2s
|
||||
addp v21.4s, v21.4s, v22.4s
|
||||
saddlp v20.4s, v20.8h
|
||||
addv s23, v23.4s
|
||||
addv s20, v20.4s
|
||||
trn1 v20.2s, v23.2s, v20.2s
|
||||
trn1 v18.2d, v18.2d, v19.2d
|
||||
trn1 v20.2d, v21.2d, v20.2d
|
||||
addp v20.4s, v23.4s, v20.4s
|
||||
addp v18.4s, v18.4s, v19.4s
|
||||
addp v20.4s, v21.4s, v20.4s
|
||||
|
||||
add w5, w5, w8
|
||||
|
||||
|
@ -3047,14 +3037,10 @@ endfunc
|
|||
.macro warp t, shift
|
||||
function warp_affine_8x8\t\()_8bpc_neon, export=1
|
||||
ldr x4, [x4]
|
||||
ubfx x7, x4, #0, #16
|
||||
ubfx x8, x4, #16, #16
|
||||
ubfx x9, x4, #32, #16
|
||||
ubfx x4, x4, #48, #16
|
||||
sxth w7, w7
|
||||
sxth w8, w8
|
||||
sxth w9, w9
|
||||
sxth w4, w4
|
||||
sbfx x7, x4, #0, #16
|
||||
sbfx x8, x4, #16, #16
|
||||
sbfx x9, x4, #32, #16
|
||||
sbfx x4, x4, #48, #16
|
||||
mov w10, #8
|
||||
sub x2, x2, x3, lsl #1
|
||||
sub x2, x2, x3
|
||||
|
|
|
@ -27,7 +27,7 @@
|
|||
#include "src/cpu.h"
|
||||
#include "src/cdef.h"
|
||||
|
||||
#if BITDEPTH == 8 && ARCH_AARCH64
|
||||
#if BITDEPTH == 8
|
||||
decl_cdef_dir_fn(dav1d_cdef_find_dir_neon);
|
||||
|
||||
void dav1d_cdef_padding4_neon(uint16_t *tmp, const pixel *src,
|
||||
|
@ -58,8 +58,8 @@ cdef_filter_##w##x##h##_neon(pixel *dst, \
|
|||
const int damping, \
|
||||
const enum CdefEdgeFlags edges) \
|
||||
{ \
|
||||
ALIGN_STK_16(uint16_t, tmp_buf, 12*tmp_stride,); \
|
||||
uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2; \
|
||||
ALIGN_STK_16(uint16_t, tmp_buf, 12*tmp_stride + 8,); \
|
||||
uint16_t *tmp = tmp_buf + 2 * tmp_stride + 8; \
|
||||
dav1d_cdef_padding##w##_neon(tmp, dst, stride, left, top, h, edges); \
|
||||
dav1d_cdef_filter##w##_neon(dst, stride, tmp, pri_strength, \
|
||||
sec_strength, dir, damping, h); \
|
||||
|
@ -76,7 +76,7 @@ COLD void bitfn(dav1d_cdef_dsp_init_arm)(Dav1dCdefDSPContext *const c) {
|
|||
|
||||
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
|
||||
|
||||
#if BITDEPTH == 8 && ARCH_AARCH64
|
||||
#if BITDEPTH == 8
|
||||
c->dir = dav1d_cdef_find_dir_neon;
|
||||
c->fb[0] = cdef_filter_8x8_neon;
|
||||
c->fb[1] = cdef_filter_4x8_neon;
|
||||
|
|
|
@ -0,0 +1,80 @@
|
|||
/*
|
||||
* Copyright © 2018, VideoLAN and dav1d authors
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "src/cpu.h"
|
||||
#include "src/ipred.h"
|
||||
|
||||
// Prototypes for the NEON routines that dav1d_intra_pred_dsp_init_arm() below
// stores into the Dav1dIntraPredDSPContext. The decl_*_fn() macros are
// presumably provided by "src/ipred.h" (included above) -- TODO confirm.
// NOTE(review): these declarations are unconditional, while the assignments
// that use them are guarded by `BITDEPTH == 8 && ARCH_AARCH64`.

// Angular-style intra predictors (installed into c->intra_pred[]).
decl_angular_ipred_fn(dav1d_ipred_dc_neon);
|
||||
decl_angular_ipred_fn(dav1d_ipred_dc_128_neon);
|
||||
decl_angular_ipred_fn(dav1d_ipred_dc_top_neon);
|
||||
decl_angular_ipred_fn(dav1d_ipred_dc_left_neon);
|
||||
decl_angular_ipred_fn(dav1d_ipred_h_neon);
|
||||
decl_angular_ipred_fn(dav1d_ipred_v_neon);
|
||||
decl_angular_ipred_fn(dav1d_ipred_paeth_neon);
|
||||
decl_angular_ipred_fn(dav1d_ipred_smooth_neon);
|
||||
decl_angular_ipred_fn(dav1d_ipred_smooth_v_neon);
|
||||
decl_angular_ipred_fn(dav1d_ipred_smooth_h_neon);
|
||||
decl_angular_ipred_fn(dav1d_ipred_filter_neon);
|
||||
|
||||
// CfL predictors (installed into c->cfl_pred[]).
decl_cfl_pred_fn(dav1d_ipred_cfl_neon);
|
||||
decl_cfl_pred_fn(dav1d_ipred_cfl_128_neon);
|
||||
decl_cfl_pred_fn(dav1d_ipred_cfl_top_neon);
|
||||
decl_cfl_pred_fn(dav1d_ipred_cfl_left_neon);
|
||||
|
||||
// CfL AC extraction for the 4:2:0 and 4:2:2 layouts (installed into c->cfl_ac[]).
decl_cfl_ac_fn(dav1d_ipred_cfl_ac_420_neon);
|
||||
decl_cfl_ac_fn(dav1d_ipred_cfl_ac_422_neon);
|
||||
|
||||
// Palette predictor (installed into c->pal_pred).
decl_pal_pred_fn(dav1d_pal_pred_neon);
|
||||
|
||||
/*
 * Installs the ARM NEON intra-prediction routines into the DSP context `c`.
 *
 * `c` is only written to, never read. If dav1d_get_cpu_flags() does not
 * report DAV1D_ARM_CPU_FLAG_NEON the function returns without modifying `c`.
 * The assignments are compiled in only for `BITDEPTH == 8 && ARCH_AARCH64`;
 * in every other configuration the function leaves `c` unchanged.
 *
 * Only the entries listed below are replaced; any other member of `c`
 * keeps whatever value the caller stored before calling this function.
 */
COLD void bitfn(dav1d_intra_pred_dsp_init_arm)(Dav1dIntraPredDSPContext *const c) {
|
||||
const unsigned flags = dav1d_get_cpu_flags();
|
||||
|
||||
// Without NEON none of the routines below may be used.
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
|
||||
|
||||
#if BITDEPTH == 8 && ARCH_AARCH64
|
||||
// Intra predictors, indexed by prediction mode.
c->intra_pred[DC_PRED] = dav1d_ipred_dc_neon;
|
||||
c->intra_pred[DC_128_PRED] = dav1d_ipred_dc_128_neon;
|
||||
c->intra_pred[TOP_DC_PRED] = dav1d_ipred_dc_top_neon;
|
||||
c->intra_pred[LEFT_DC_PRED] = dav1d_ipred_dc_left_neon;
|
||||
c->intra_pred[HOR_PRED] = dav1d_ipred_h_neon;
|
||||
c->intra_pred[VERT_PRED] = dav1d_ipred_v_neon;
|
||||
c->intra_pred[PAETH_PRED] = dav1d_ipred_paeth_neon;
|
||||
c->intra_pred[SMOOTH_PRED] = dav1d_ipred_smooth_neon;
|
||||
c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_neon;
|
||||
c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_neon;
|
||||
c->intra_pred[FILTER_PRED] = dav1d_ipred_filter_neon;
|
||||
|
||||
// CfL predictors, indexed by the four DC prediction modes.
c->cfl_pred[DC_PRED] = dav1d_ipred_cfl_neon;
|
||||
c->cfl_pred[DC_128_PRED] = dav1d_ipred_cfl_128_neon;
|
||||
c->cfl_pred[TOP_DC_PRED] = dav1d_ipred_cfl_top_neon;
|
||||
c->cfl_pred[LEFT_DC_PRED] = dav1d_ipred_cfl_left_neon;
|
||||
|
||||
// cfl_ac[] is indexed by (pixel layout - 1); only the I420 and I422
// slots are assigned here.
c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_ipred_cfl_ac_420_neon;
|
||||
c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_ipred_cfl_ac_422_neon;
|
||||
|
||||
c->pal_pred = dav1d_pal_pred_neon;
|
||||
#endif
|
||||
}
|
|
@ -107,9 +107,7 @@ void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
|
|||
c->w_mask[0] = dav1d_w_mask_444_8bpc_neon;
|
||||
c->w_mask[1] = dav1d_w_mask_422_8bpc_neon;
|
||||
c->w_mask[2] = dav1d_w_mask_420_8bpc_neon;
|
||||
#if ARCH_AARCH64
|
||||
c->warp8x8 = dav1d_warp_affine_8x8_8bpc_neon;
|
||||
c->warp8x8t = dav1d_warp_affine_8x8t_8bpc_neon;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -33,6 +33,12 @@
|
|||
|
||||
#include "src/cdef_apply.h"
|
||||
|
||||
|
||||
// Bit flags telling backup2x8() which parts of its work to perform.
// dav1d_cdef_brow() builds the value arithmetically as
// `!!y_lvl + (!!uv_lvl << 1)`, so the enumerators must keep the values
// 1 and 2 exactly.
enum Backup2x8Flags {
|
||||
// Copy the 2x8 column of plane 0.
BACKUP_2X8_Y = 1 << 0,
|
||||
// Continue past the plane-0 copy; when this bit is clear backup2x8()
// returns early (presumably skipping the chroma planes -- TODO confirm).
BACKUP_2X8_UV = 1 << 1,
|
||||
};
|
||||
|
||||
static void backup2lines(pixel *const dst[3][2],
|
||||
/*const*/ pixel *const src[3],
|
||||
const ptrdiff_t src_stride[2], int y_off, int w,
|
||||
|
@ -56,13 +62,18 @@ static void backup2lines(pixel *const dst[3][2],
|
|||
static void backup2x8(pixel dst[3][8][2],
|
||||
/*const*/ pixel *const src[3],
|
||||
const ptrdiff_t src_stride[2], int x_off,
|
||||
const enum Dav1dPixelLayout layout)
|
||||
const enum Dav1dPixelLayout layout,
|
||||
const enum Backup2x8Flags flag)
|
||||
{
|
||||
ptrdiff_t y_off = 0;
|
||||
for (int y = 0; y < 8; y++, y_off += PXSTRIDE(src_stride[0]))
|
||||
pixel_copy(dst[0][y], &src[0][y_off + x_off - 2], 2);
|
||||
if (flag & BACKUP_2X8_Y) {
|
||||
for (int y = 0; y < 8; y++, y_off += PXSTRIDE(src_stride[0]))
|
||||
pixel_copy(dst[0][y], &src[0][y_off + x_off - 2], 2);
|
||||
}
|
||||
|
||||
if (layout == DAV1D_PIXEL_LAYOUT_I400 || !(flag & BACKUP_2X8_UV))
|
||||
return;
|
||||
|
||||
if (layout == DAV1D_PIXEL_LAYOUT_I400) return;
|
||||
const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
|
||||
|
||||
|
@ -98,13 +109,9 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
|
|||
const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
|
||||
|
||||
// FIXME a design improvement that could be made here is to keep a set of
|
||||
// flags for each block position on whether the block was filtered; if not,
|
||||
// the backup of pre-filter data is empty, and the restore is therefore
|
||||
// unnecessary as well.
|
||||
|
||||
for (int bit = 0, by = by_start; by < by_end; by += 2, edges |= CDEF_HAVE_TOP) {
|
||||
const int tf = f->lf.top_pre_cdef_toggle;
|
||||
const int by_idx = by & 30;
|
||||
if (by + 2 >= f->bh) edges &= ~CDEF_HAVE_BOTTOM;
|
||||
|
||||
if (edges & CDEF_HAVE_BOTTOM) {
|
||||
|
@ -117,6 +124,7 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
|
|||
pixel *iptrs[3] = { ptrs[0], ptrs[1], ptrs[2] };
|
||||
edges &= ~CDEF_HAVE_LEFT;
|
||||
edges |= CDEF_HAVE_RIGHT;
|
||||
enum Backup2x8Flags prev_flag = 0;
|
||||
for (int sbx = 0, last_skip = 1; sbx < sb64w; sbx++, edges |= CDEF_HAVE_LEFT) {
|
||||
const int sb128x = sbx >>1;
|
||||
const int sb64_idx = ((by & sbsz) >> 3) + (sbx & 1);
|
||||
|
@ -131,6 +139,8 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
|
|||
|
||||
const int y_lvl = f->frame_hdr->cdef.y_strength[cdef_idx];
|
||||
const int uv_lvl = f->frame_hdr->cdef.uv_strength[cdef_idx];
|
||||
const enum Backup2x8Flags flag = !!y_lvl + (!!uv_lvl << 1);
|
||||
|
||||
pixel *bptrs[3] = { iptrs[0], iptrs[1], iptrs[2] };
|
||||
for (int bx = sbx * sbsz; bx < imin((sbx + 1) * sbsz, f->bw);
|
||||
bx += 2, edges |= CDEF_HAVE_LEFT)
|
||||
|
@ -140,22 +150,23 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
|
|||
// check if this 8x8 block had any coded coefficients; if not,
|
||||
// go to the next block
|
||||
const unsigned bx_mask = 3U << (bx & 14);
|
||||
const int by_idx = by & 30, bx_idx = (bx & 16) >> 4;
|
||||
const int bx_idx = (bx & 16) >> 4;
|
||||
if (!((lflvl[sb128x].noskip_mask[by_idx + 0][bx_idx] |
|
||||
lflvl[sb128x].noskip_mask[by_idx + 1][bx_idx]) & bx_mask))
|
||||
{
|
||||
last_skip = 1;
|
||||
goto next_b;
|
||||
}
|
||||
|
||||
if (last_skip && edges & CDEF_HAVE_LEFT) {
|
||||
const int do_left = last_skip ? flag : (prev_flag ^ flag) & flag;
|
||||
prev_flag = flag;
|
||||
if (do_left && edges & CDEF_HAVE_LEFT) {
|
||||
// we didn't backup the prefilter data because it wasn't
|
||||
// there, so do it here instead
|
||||
backup2x8(lr_bak[bit], bptrs, f->cur.stride, 0, layout);
|
||||
backup2x8(lr_bak[bit], bptrs, f->cur.stride, 0, layout, do_left);
|
||||
}
|
||||
if (edges & CDEF_HAVE_RIGHT) {
|
||||
// backup pre-filter data for next iteration
|
||||
backup2x8(lr_bak[!bit], bptrs, f->cur.stride, 8, layout);
|
||||
backup2x8(lr_bak[!bit], bptrs, f->cur.stride, 8, layout, flag);
|
||||
}
|
||||
|
||||
// the actual filter
|
||||
|
|
|
@ -1176,14 +1176,18 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
f->bd_fn.recon_b_intra(t, bs, intra_edge_flags, b);
|
||||
}
|
||||
|
||||
dav1d_create_lf_mask_intra(t->lf_mask, f->lf.level, f->b4_stride,
|
||||
f->frame_hdr, (const uint8_t (*)[8][2])
|
||||
&ts->lflvl[b->seg_id][0][0][0],
|
||||
t->bx, t->by, f->w4, f->h4, bs,
|
||||
b->tx, b->uvtx, f->cur.p.layout,
|
||||
&t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4],
|
||||
has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
|
||||
has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
|
||||
if (f->frame_hdr->loopfilter.level_y[0] ||
|
||||
f->frame_hdr->loopfilter.level_y[1])
|
||||
{
|
||||
dav1d_create_lf_mask_intra(t->lf_mask, f->lf.level, f->b4_stride,
|
||||
(const uint8_t (*)[8][2])
|
||||
&ts->lflvl[b->seg_id][0][0][0],
|
||||
t->bx, t->by, f->w4, f->h4, bs,
|
||||
b->tx, b->uvtx, f->cur.p.layout,
|
||||
&t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4],
|
||||
has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
|
||||
has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
|
||||
}
|
||||
|
||||
// update contexts
|
||||
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
|
||||
|
@ -1859,17 +1863,21 @@ static int decode_b(Dav1dTileContext *const t,
|
|||
if (f->bd_fn.recon_b_inter(t, bs, b)) return -1;
|
||||
}
|
||||
|
||||
const int is_globalmv =
|
||||
b->inter_mode == (is_comp ? GLOBALMV_GLOBALMV : GLOBALMV);
|
||||
const uint8_t (*const lf_lvls)[8][2] = (const uint8_t (*)[8][2])
|
||||
&ts->lflvl[b->seg_id][0][b->ref[0] + 1][!is_globalmv];
|
||||
dav1d_create_lf_mask_inter(t->lf_mask, f->lf.level, f->b4_stride,
|
||||
f->frame_hdr, lf_lvls, t->bx, t->by,
|
||||
f->w4, f->h4, b->skip, bs, b->tx_split,
|
||||
b->uvtx, f->cur.p.layout,
|
||||
&t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4],
|
||||
has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
|
||||
has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
|
||||
if (f->frame_hdr->loopfilter.level_y[0] ||
|
||||
f->frame_hdr->loopfilter.level_y[1])
|
||||
{
|
||||
const int is_globalmv =
|
||||
b->inter_mode == (is_comp ? GLOBALMV_GLOBALMV : GLOBALMV);
|
||||
const uint8_t (*const lf_lvls)[8][2] = (const uint8_t (*)[8][2])
|
||||
&ts->lflvl[b->seg_id][0][b->ref[0] + 1][!is_globalmv];
|
||||
dav1d_create_lf_mask_inter(t->lf_mask, f->lf.level, f->b4_stride,
|
||||
lf_lvls, t->bx, t->by, f->w4, f->h4,
|
||||
b->skip, bs, b->tx_split, b->uvtx,
|
||||
f->cur.p.layout,
|
||||
&t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4],
|
||||
has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
|
||||
has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
|
||||
}
|
||||
|
||||
// context updates
|
||||
if (is_comp) {
|
||||
|
@ -2339,7 +2347,7 @@ static void setup_tile(Dav1dTileState *const ts,
|
|||
((ts->tiling.col_start & 16) >> 4);
|
||||
}
|
||||
for (int p = 0; p < 3; p++) {
|
||||
if (f->frame_hdr->restoration.type[p] == DAV1D_RESTORATION_NONE)
|
||||
if (!((f->lf.restore_planes >> p) & 1U))
|
||||
continue;
|
||||
|
||||
if (f->frame_hdr->super_res.enabled) {
|
||||
|
@ -2503,7 +2511,7 @@ int dav1d_decode_tile_sbrow(Dav1dTileContext *const t) {
|
|||
}
|
||||
// Restoration filter
|
||||
for (int p = 0; p < 3; p++) {
|
||||
if (f->frame_hdr->restoration.type[p] == DAV1D_RESTORATION_NONE)
|
||||
if (!((f->lf.restore_planes >> p) & 1U))
|
||||
continue;
|
||||
|
||||
const int ss_ver = p && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
|
@ -2817,6 +2825,10 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
|
|||
}
|
||||
f->lf.lr_mask_sz = lr_mask_sz;
|
||||
}
|
||||
f->lf.restore_planes =
|
||||
((f->frame_hdr->restoration.type[0] != DAV1D_RESTORATION_NONE) << 0) +
|
||||
((f->frame_hdr->restoration.type[1] != DAV1D_RESTORATION_NONE) << 1) +
|
||||
((f->frame_hdr->restoration.type[2] != DAV1D_RESTORATION_NONE) << 2);
|
||||
if (f->frame_hdr->loopfilter.sharpness != f->lf.last_sharpness) {
|
||||
dav1d_calc_eih(&f->lf.lim_lut, f->frame_hdr->loopfilter.sharpness);
|
||||
f->lf.last_sharpness = f->frame_hdr->loopfilter.sharpness;
|
||||
|
|
|
@ -1126,6 +1126,7 @@ INIT_XMM
|
|||
%xdefine %%tmp %%f %+ 0
|
||||
%ifnum %%tmp
|
||||
RESET_MM_PERMUTATION
|
||||
AVX512_MM_PERMUTATION
|
||||
%assign %%i 0
|
||||
%rep num_mmregs
|
||||
%xdefine %%tmp %%f %+ %%i
|
||||
|
|
|
@ -42,9 +42,12 @@ static void generate_scaling(const int bitdepth,
|
|||
const uint8_t points[][2], const int num,
|
||||
uint8_t scaling[SCALING_SIZE])
|
||||
{
|
||||
#if BITDEPTH == 8
|
||||
const int shift_x = 0;
|
||||
#else
|
||||
const int shift_x = bitdepth - 8;
|
||||
#endif
|
||||
const int scaling_size = 1 << bitdepth;
|
||||
const int pad = 1 << shift_x;
|
||||
|
||||
// Fill up the preceding entries with the initial value
|
||||
for (int i = 0; i < points[0][0] << shift_x; i++)
|
||||
|
@ -69,9 +72,8 @@ static void generate_scaling(const int bitdepth,
|
|||
for (int i = points[num - 1][0] << shift_x; i < scaling_size; i++)
|
||||
scaling[i] = points[num - 1][1];
|
||||
|
||||
if (pad <= 1) return;
|
||||
|
||||
const int rnd = pad >> 1;
|
||||
#if BITDEPTH != 8
|
||||
const int pad = 1 << shift_x, rnd = pad >> 1;
|
||||
for (int i = 0; i < num - 1; i++) {
|
||||
const int bx = points[i][0] << shift_x;
|
||||
const int ex = points[i+1][0] << shift_x;
|
||||
|
@ -83,6 +85,7 @@ static void generate_scaling(const int bitdepth,
|
|||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifndef UNIT_TEST
|
||||
|
|
|
@ -51,7 +51,7 @@ typedef decl_generate_grain_y_fn(*generate_grain_y_fn);
|
|||
#define decl_generate_grain_uv_fn(name) \
|
||||
void (name)(entry buf[][GRAIN_WIDTH], \
|
||||
const entry buf_y[][GRAIN_WIDTH], \
|
||||
const Dav1dFilmGrainData *const data, const int uv HIGHBD_DECL_SUFFIX)
|
||||
const Dav1dFilmGrainData *const data, const intptr_t uv HIGHBD_DECL_SUFFIX)
|
||||
typedef decl_generate_grain_uv_fn(*generate_grain_uv_fn);
|
||||
|
||||
#define decl_fgy_32x32xn_fn(name) \
|
||||
|
|
|
@ -88,7 +88,7 @@ static void generate_grain_y_c(entry buf[][GRAIN_WIDTH],
|
|||
static NOINLINE void
|
||||
generate_grain_uv_c(entry buf[][GRAIN_WIDTH],
|
||||
const entry buf_y[][GRAIN_WIDTH],
|
||||
const Dav1dFilmGrainData *const data, const int uv,
|
||||
const Dav1dFilmGrainData *const data, const intptr_t uv,
|
||||
const int subx, const int suby HIGHBD_DECL_SUFFIX)
|
||||
{
|
||||
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
|
||||
|
@ -156,8 +156,8 @@ gnuv_ss_fn(444, 0, 0);
|
|||
// samples from the correct block of a grain LUT, while taking into account the
|
||||
// offsets provided by the offsets cache
|
||||
static inline entry sample_lut(const entry grain_lut[][GRAIN_WIDTH],
|
||||
int offsets[2][2], int subx, int suby,
|
||||
int bx, int by, int x, int y)
|
||||
const int offsets[2][2], const int subx, const int suby,
|
||||
const int bx, const int by, const int x, const int y)
|
||||
{
|
||||
const int randval = offsets[bx][by];
|
||||
const int offx = 3 + (2 >> subx) * (3 + (randval >> 4));
|
||||
|
|
|
@ -228,6 +228,7 @@ struct Dav1dFrameContext {
|
|||
int tile_row; // for carry-over at tile row edges
|
||||
pixel *p[3], *sr_p[3];
|
||||
Av1Filter *mask_ptr, *prev_mask_ptr;
|
||||
int restore_planes; // enum LrRestorePlanes
|
||||
} lf;
|
||||
|
||||
// threading (refer to tc[] for per-thread things)
|
||||
|
|
|
@ -89,6 +89,7 @@ typedef struct Dav1dIntraPredDSPContext {
|
|||
} Dav1dIntraPredDSPContext;
|
||||
|
||||
bitfn_decls(void dav1d_intra_pred_dsp_init, Dav1dIntraPredDSPContext *c);
|
||||
bitfn_decls(void dav1d_intra_pred_dsp_init_arm, Dav1dIntraPredDSPContext *c);
|
||||
bitfn_decls(void dav1d_intra_pred_dsp_init_x86, Dav1dIntraPredDSPContext *c);
|
||||
|
||||
#endif /* DAV1D_SRC_IPRED_H */
|
||||
|
|
|
@ -324,44 +324,37 @@ static void ipred_smooth_h_c(pixel *dst, const ptrdiff_t stride,
|
|||
}
|
||||
}
|
||||
|
||||
static int get_filter_strength(const unsigned blk_wh, const unsigned d,
|
||||
const int type)
|
||||
{
|
||||
int strength = 0;
|
||||
|
||||
if (type == 0) {
|
||||
if (blk_wh <= 8) {
|
||||
if (d >= 56) strength = 1;
|
||||
} else if (blk_wh <= 12) {
|
||||
if (d >= 40) strength = 1;
|
||||
} else if (blk_wh <= 16) {
|
||||
if (d >= 40) strength = 1;
|
||||
} else if (blk_wh <= 24) {
|
||||
if (d >= 8) strength = 1;
|
||||
if (d >= 16) strength = 2;
|
||||
if (d >= 32) strength = 3;
|
||||
} else if (blk_wh <= 32) {
|
||||
if (d >= 1) strength = 1;
|
||||
if (d >= 4) strength = 2;
|
||||
if (d >= 32) strength = 3;
|
||||
static int get_filter_strength(const int wh, const int angle, const int is_sm) {
|
||||
if (is_sm) {
|
||||
if (wh <= 8) {
|
||||
if (angle >= 64) return 2;
|
||||
if (angle >= 40) return 1;
|
||||
} else if (wh <= 16) {
|
||||
if (angle >= 48) return 2;
|
||||
if (angle >= 20) return 1;
|
||||
} else if (wh <= 24) {
|
||||
if (angle >= 4) return 3;
|
||||
} else {
|
||||
if (d >= 1) strength = 3;
|
||||
return 3;
|
||||
}
|
||||
} else {
|
||||
if (blk_wh <= 8) {
|
||||
if (d >= 40) strength = 1;
|
||||
if (d >= 64) strength = 2;
|
||||
} else if (blk_wh <= 16) {
|
||||
if (d >= 20) strength = 1;
|
||||
if (d >= 48) strength = 2;
|
||||
} else if (blk_wh <= 24) {
|
||||
if (d >= 4) strength = 3;
|
||||
if (wh <= 8) {
|
||||
if (angle >= 56) return 1;
|
||||
} else if (wh <= 16) {
|
||||
if (angle >= 40) return 1;
|
||||
} else if (wh <= 24) {
|
||||
if (angle >= 32) return 3;
|
||||
if (angle >= 16) return 2;
|
||||
if (angle >= 8) return 1;
|
||||
} else if (wh <= 32) {
|
||||
if (angle >= 32) return 3;
|
||||
if (angle >= 4) return 2;
|
||||
return 1;
|
||||
} else {
|
||||
if (d >= 1) strength = 3;
|
||||
return 3;
|
||||
}
|
||||
}
|
||||
|
||||
return strength;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void filter_edge(pixel *const out, const int sz,
|
||||
|
@ -451,12 +444,12 @@ static void ipred_z1_c(pixel *dst, const ptrdiff_t stride,
|
|||
for (int y = 0, xpos = dx; y < height;
|
||||
y++, dst += PXSTRIDE(stride), xpos += dx)
|
||||
{
|
||||
const int frac = (xpos >> 1) & 0x1F;
|
||||
const int frac = xpos & 0x3E;
|
||||
|
||||
for (int x = 0, base = xpos >> 6; x < width; x++, base += base_inc) {
|
||||
if (base < max_base_x) {
|
||||
const int v = top[base] * (32 - frac) + top[base + 1] * frac;
|
||||
dst[x] = iclip_pixel((v + 16) >> 5);
|
||||
const int v = top[base] * (64 - frac) + top[base + 1] * frac;
|
||||
dst[x] = (v + 32) >> 6;
|
||||
} else {
|
||||
pixel_set(&dst[x], top[max_base_x], width - x);
|
||||
break;
|
||||
|
@ -518,30 +511,29 @@ static void ipred_z2_c(pixel *dst, const ptrdiff_t stride,
|
|||
}
|
||||
*topleft = *topleft_in;
|
||||
|
||||
const int min_base_x = -(1 + upsample_above);
|
||||
const int base_inc_x = 1 + upsample_above;
|
||||
const pixel *const left = &topleft[-(1 + upsample_left)];
|
||||
const pixel *const top = &topleft[1 + upsample_above];
|
||||
for (int y = 0, xpos = -dx; y < height;
|
||||
for (int y = 0, xpos = ((1 + upsample_above) << 6) - dx; y < height;
|
||||
y++, xpos -= dx, dst += PXSTRIDE(stride))
|
||||
{
|
||||
int base_x = xpos >> 6;
|
||||
const int frac_x = (xpos >> 1) & 0x1F;
|
||||
const int frac_x = xpos & 0x3E;
|
||||
|
||||
for (int x = 0, ypos = (y << (6 + upsample_left)) - dy; x < width;
|
||||
x++, base_x += base_inc_x, ypos -= dy)
|
||||
{
|
||||
int v;
|
||||
|
||||
if (base_x >= min_base_x) {
|
||||
v = top[base_x] * (32 - frac_x) + top[base_x + 1] * frac_x;
|
||||
if (base_x >= 0) {
|
||||
v = topleft[base_x] * (64 - frac_x) +
|
||||
topleft[base_x + 1] * frac_x;
|
||||
} else {
|
||||
const int base_y = ypos >> 6;
|
||||
assert(base_y >= -(1 + upsample_left));
|
||||
const int frac_y = (ypos >> 1) & 0x1F;
|
||||
v = left[-base_y] * (32 - frac_y) + left[-(base_y + 1)] * frac_y;
|
||||
const int frac_y = ypos & 0x3E;
|
||||
v = left[-base_y] * (64 - frac_y) +
|
||||
left[-(base_y + 1)] * frac_y;
|
||||
}
|
||||
dst[x] = iclip_pixel((v + 16) >> 5);
|
||||
dst[x] = (v + 32) >> 6;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -588,13 +580,13 @@ static void ipred_z3_c(pixel *dst, const ptrdiff_t stride,
|
|||
}
|
||||
const int base_inc = 1 + upsample_left;
|
||||
for (int x = 0, ypos = dy; x < width; x++, ypos += dy) {
|
||||
const int frac = (ypos >> 1) & 0x1F;
|
||||
const int frac = ypos & 0x3E;
|
||||
|
||||
for (int y = 0, base = ypos >> 6; y < height; y++, base += base_inc) {
|
||||
if (base < max_base_y) {
|
||||
const int v = left[-base] * (32 - frac) +
|
||||
const int v = left[-base] * (64 - frac) +
|
||||
left[-(base + 1)] * frac;
|
||||
dst[y * PXSTRIDE(stride) + x] = iclip_pixel((v + 16) >> 5);
|
||||
dst[y * PXSTRIDE(stride) + x] = (v + 32) >> 6;
|
||||
} else {
|
||||
do {
|
||||
dst[y * PXSTRIDE(stride) + x] = left[-max_base_y];
|
||||
|
@ -605,6 +597,22 @@ static void ipred_z3_c(pixel *dst, const ptrdiff_t stride,
|
|||
}
|
||||
}
|
||||
|
||||
// FILTER(flt_ptr, p0..p6) expands to the 7-tap weighted sum
// tap0*p0 + tap1*p1 + ... + tap6*p6, with the taps read from `flt_ptr` at
// fixed offsets. FLT_INCR is the amount ipred_filter_c() advances `flt_ptr`
// by for each output pixel.
// The two variants differ only in where taps 1, 3 and 5 are read from and in
// the per-pixel stride; presumably the tap table is stored in a different
// layout on x86 builds -- TODO confirm against the table definition.
// NOTE(review): the expansion is not wrapped in parentheses and the arguments
// are not parenthesized, so it is only safe where it forms a complete
// expression, as in `int acc = FILTER(...)`.
#if ARCH_X86
|
||||
// x86 variant: taps at offsets 0, 1, 16, 17, 32, 33, 48; stride 2.
#define FILTER(flt_ptr, p0, p1, p2, p3, p4, p5, p6) \
|
||||
flt_ptr[ 0] * p0 + flt_ptr[ 1] * p1 + \
|
||||
flt_ptr[16] * p2 + flt_ptr[17] * p3 + \
|
||||
flt_ptr[32] * p4 + flt_ptr[33] * p5 + \
|
||||
flt_ptr[48] * p6
|
||||
#define FLT_INCR 2
|
||||
#else
|
||||
// All other targets: taps at offsets 0, 8, 16, 24, 32, 40, 48; stride 1.
#define FILTER(flt_ptr, p0, p1, p2, p3, p4, p5, p6) \
|
||||
flt_ptr[ 0] * p0 + flt_ptr[ 8] * p1 + \
|
||||
flt_ptr[16] * p2 + flt_ptr[24] * p3 + \
|
||||
flt_ptr[32] * p4 + flt_ptr[40] * p5 + \
|
||||
flt_ptr[48] * p6
|
||||
#define FLT_INCR 1
|
||||
#endif
|
||||
|
||||
/* Up to 32x32 only */
|
||||
static void ipred_filter_c(pixel *dst, const ptrdiff_t stride,
|
||||
const pixel *const topleft_in,
|
||||
|
@ -633,11 +641,8 @@ static void ipred_filter_c(pixel *dst, const ptrdiff_t stride,
|
|||
const int8_t *flt_ptr = filter;
|
||||
|
||||
for (int yy = 0; yy < 2; yy++) {
|
||||
for (int xx = 0; xx < 4; xx++, flt_ptr += 2) {
|
||||
int acc = flt_ptr[ 0] * p0 + flt_ptr[ 1] * p1 +
|
||||
flt_ptr[16] * p2 + flt_ptr[17] * p3 +
|
||||
flt_ptr[32] * p4 + flt_ptr[33] * p5 +
|
||||
flt_ptr[48] * p6;
|
||||
for (int xx = 0; xx < 4; xx++, flt_ptr += FLT_INCR) {
|
||||
int acc = FILTER(flt_ptr, p0, p1, p2, p3, p4, p5, p6);
|
||||
ptr[xx] = iclip_pixel((acc + 8) >> 4);
|
||||
}
|
||||
ptr += PXSTRIDE(stride);
|
||||
|
@ -751,7 +756,11 @@ COLD void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) {
|
|||
|
||||
c->pal_pred = pal_pred_c;
|
||||
|
||||
#if HAVE_ASM && ARCH_X86
|
||||
#if HAVE_ASM
|
||||
#if ARCH_AARCH64 || ARCH_ARM
|
||||
bitfn(dav1d_intra_pred_dsp_init_arm)(c);
|
||||
#elif ARCH_X86
|
||||
bitfn(dav1d_intra_pred_dsp_init_x86)(c);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -286,7 +286,6 @@ static inline void mask_edges_chroma(uint16_t (*const masks)[32][2][2],
|
|||
void dav1d_create_lf_mask_intra(Av1Filter *const lflvl,
|
||||
uint8_t (*const level_cache)[4],
|
||||
const ptrdiff_t b4_stride,
|
||||
const Dav1dFrameHeader *const hdr,
|
||||
const uint8_t (*filter_level)[8][2],
|
||||
const int bx, const int by,
|
||||
const int iw, const int ih,
|
||||
|
@ -297,9 +296,6 @@ void dav1d_create_lf_mask_intra(Av1Filter *const lflvl,
|
|||
uint8_t *const ay, uint8_t *const ly,
|
||||
uint8_t *const auv, uint8_t *const luv)
|
||||
{
|
||||
if (!hdr->loopfilter.level_y[0] && !hdr->loopfilter.level_y[1])
|
||||
return;
|
||||
|
||||
const uint8_t *const b_dim = dav1d_block_dimensions[bs];
|
||||
const int bw4 = imin(iw - bx, b_dim[0]);
|
||||
const int bh4 = imin(ih - by, b_dim[1]);
|
||||
|
@ -350,7 +346,6 @@ void dav1d_create_lf_mask_intra(Av1Filter *const lflvl,
|
|||
void dav1d_create_lf_mask_inter(Av1Filter *const lflvl,
|
||||
uint8_t (*const level_cache)[4],
|
||||
const ptrdiff_t b4_stride,
|
||||
const Dav1dFrameHeader *const hdr,
|
||||
const uint8_t (*filter_level)[8][2],
|
||||
const int bx, const int by,
|
||||
const int iw, const int ih,
|
||||
|
@ -361,9 +356,6 @@ void dav1d_create_lf_mask_inter(Av1Filter *const lflvl,
|
|||
uint8_t *const ay, uint8_t *const ly,
|
||||
uint8_t *const auv, uint8_t *const luv)
|
||||
{
|
||||
if (!hdr->loopfilter.level_y[0] && !hdr->loopfilter.level_y[1])
|
||||
return;
|
||||
|
||||
const uint8_t *const b_dim = dav1d_block_dimensions[bs];
|
||||
const int bw4 = imin(iw - bx, b_dim[0]);
|
||||
const int bh4 = imin(ih - by, b_dim[1]);
|
||||
|
|
|
@ -63,7 +63,6 @@ typedef struct Av1Restoration {
|
|||
|
||||
void dav1d_create_lf_mask_intra(Av1Filter *lflvl, uint8_t (*level_cache)[4],
|
||||
const ptrdiff_t b4_stride,
|
||||
const Dav1dFrameHeader *hdr,
|
||||
const uint8_t (*level)[8][2], int bx, int by,
|
||||
int iw, int ih, enum BlockSize bs,
|
||||
enum RectTxfmSize ytx, enum RectTxfmSize uvtx,
|
||||
|
@ -71,7 +70,6 @@ void dav1d_create_lf_mask_intra(Av1Filter *lflvl, uint8_t (*level_cache)[4],
|
|||
uint8_t *ly, uint8_t *auv, uint8_t *luv);
|
||||
void dav1d_create_lf_mask_inter(Av1Filter *lflvl, uint8_t (*level_cache)[4],
|
||||
const ptrdiff_t b4_stride,
|
||||
const Dav1dFrameHeader *hdr,
|
||||
const uint8_t (*level)[8][2], int bx, int by,
|
||||
int iw, int ih, int skip_inter,
|
||||
enum BlockSize bs, const uint16_t *tx_mask,
|
||||
|
|
|
@ -75,5 +75,6 @@ typedef struct Dav1dLoopRestorationDSPContext {
|
|||
bitfn_decls(void dav1d_loop_restoration_dsp_init, Dav1dLoopRestorationDSPContext *c);
|
||||
bitfn_decls(void dav1d_loop_restoration_dsp_init_arm, Dav1dLoopRestorationDSPContext *c);
|
||||
bitfn_decls(void dav1d_loop_restoration_dsp_init_x86, Dav1dLoopRestorationDSPContext *c);
|
||||
bitfn_decls(void dav1d_loop_restoration_dsp_init_ppc, Dav1dLoopRestorationDSPContext *c);
|
||||
|
||||
#endif /* DAV1D_SRC_LOOPRESTORATION_H */
|
||||
|
|
|
@ -580,6 +580,8 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext
|
|||
#if HAVE_ASM
|
||||
#if ARCH_AARCH64 || ARCH_ARM
|
||||
bitfn(dav1d_loop_restoration_dsp_init_arm)(c);
|
||||
#elif ARCH_PPC64LE
|
||||
bitfn(dav1d_loop_restoration_dsp_init_ppc)(c);
|
||||
#elif ARCH_X86
|
||||
bitfn(dav1d_loop_restoration_dsp_init_x86)(c);
|
||||
#endif
|
||||
|
|
|
@ -112,10 +112,7 @@ void bytefn(dav1d_lr_copy_lpf)(Dav1dFrameContext *const f,
|
|||
const ptrdiff_t lr_stride = ((f->sr_cur.p.p.w + 31) & ~31) * sizeof(pixel);
|
||||
|
||||
// TODO Also check block level restore type to reduce copying.
|
||||
const int restore_planes =
|
||||
((f->frame_hdr->restoration.type[0] != DAV1D_RESTORATION_NONE) << 0) +
|
||||
((f->frame_hdr->restoration.type[1] != DAV1D_RESTORATION_NONE) << 1) +
|
||||
((f->frame_hdr->restoration.type[2] != DAV1D_RESTORATION_NONE) << 2);
|
||||
const int restore_planes = f->lf.restore_planes;
|
||||
|
||||
if (restore_planes & LR_RESTORE_Y) {
|
||||
const int h = f->cur.p.h;
|
||||
|
@ -180,12 +177,8 @@ static void lr_stripe(const Dav1dFrameContext *const f, pixel *p,
|
|||
}
|
||||
|
||||
while (y + stripe_h <= row_h) {
|
||||
// TODO Look into getting rid of this if
|
||||
if (y + stripe_h == row_h) {
|
||||
edges &= ~LR_HAVE_BOTTOM;
|
||||
} else {
|
||||
edges |= LR_HAVE_BOTTOM;
|
||||
}
|
||||
// Change HAVE_BOTTOM bit in edges to (y + stripe_h != row_h)
|
||||
edges ^= (-(y + stripe_h != row_h) ^ edges) & LR_HAVE_BOTTOM;
|
||||
if (lr->type == DAV1D_RESTORATION_WIENER) {
|
||||
dsp->lr.wiener(p, p_stride, left, lpf, lpf_stride, unit_w, stripe_h,
|
||||
filterh, filterv, edges HIGHBD_CALL_SUFFIX);
|
||||
|
@ -239,8 +232,7 @@ static void lr_sbrow(const Dav1dFrameContext *const f, pixel *p, const int y,
|
|||
const int shift_hor = 7 - ss_hor;
|
||||
|
||||
pixel pre_lr_border[2][128 + 8 /* maximum sbrow height is 128 + 8 rows offset */][4];
|
||||
|
||||
int unit_w = unit_size, bit = 0;
|
||||
const Av1RestorationUnit *lr[2];
|
||||
|
||||
enum LrEdgeFlags edges = (y > 0 ? LR_HAVE_TOP : 0) | LR_HAVE_RIGHT |
|
||||
(row_h < h ? LR_HAVE_BOTTOM : 0);
|
||||
|
@ -251,26 +243,27 @@ static void lr_sbrow(const Dav1dFrameContext *const f, pixel *p, const int y,
|
|||
aligned_unit_pos <<= ss_ver;
|
||||
const int sb_idx = (aligned_unit_pos >> 7) * f->sr_sb128w;
|
||||
const int unit_idx = ((aligned_unit_pos >> 6) & 1) << 1;
|
||||
for (int x = 0; x < w; x += unit_w, edges |= LR_HAVE_LEFT, bit ^= 1) {
|
||||
if (x + max_unit_size > w) {
|
||||
unit_w = w - x;
|
||||
edges &= ~LR_HAVE_RIGHT;
|
||||
}
|
||||
|
||||
// Based on the position of the restoration unit, find the corresponding
|
||||
// AV1Filter unit.
|
||||
const int u_idx = unit_idx + ((x >> (shift_hor - 1)) & 1);
|
||||
const Av1RestorationUnit *const lr =
|
||||
&f->lf.lr_mask[sb_idx + (x >> shift_hor)].lr[plane][u_idx];
|
||||
|
||||
// FIXME Don't backup if the next restoration unit is RESTORE_NONE
|
||||
if (edges & LR_HAVE_RIGHT) {
|
||||
backup4xU(pre_lr_border[bit], p + unit_w - 4, p_stride, row_h - y);
|
||||
}
|
||||
if (lr->type != DAV1D_RESTORATION_NONE) {
|
||||
lr_stripe(f, p, pre_lr_border[!bit], x, y, plane, unit_w, row_h, lr, edges);
|
||||
}
|
||||
p += unit_w;
|
||||
lr[0] = &f->lf.lr_mask[sb_idx].lr[plane][unit_idx];
|
||||
int restore = lr[0]->type != DAV1D_RESTORATION_NONE;
|
||||
int x = 0, bit = 0;
|
||||
for (; x + max_unit_size <= w; p += unit_size, edges |= LR_HAVE_LEFT, bit ^= 1) {
|
||||
const int next_x = x + unit_size;
|
||||
const int next_u_idx = unit_idx + ((next_x >> (shift_hor - 1)) & 1);
|
||||
lr[!bit] =
|
||||
&f->lf.lr_mask[sb_idx + (next_x >> shift_hor)].lr[plane][next_u_idx];
|
||||
const int restore_next = lr[!bit]->type != DAV1D_RESTORATION_NONE;
|
||||
if (restore_next)
|
||||
backup4xU(pre_lr_border[bit], p + unit_size - 4, p_stride, row_h - y);
|
||||
if (restore)
|
||||
lr_stripe(f, p, pre_lr_border[!bit], x, y, plane, unit_size, row_h,
|
||||
lr[bit], edges);
|
||||
x = next_x;
|
||||
restore = restore_next;
|
||||
}
|
||||
if (restore) {
|
||||
edges &= ~LR_HAVE_RIGHT;
|
||||
const int unit_w = w - x;
|
||||
lr_stripe(f, p, pre_lr_border[!bit], x, y, plane, unit_w, row_h, lr[bit], edges);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -279,11 +272,7 @@ void bytefn(dav1d_lr_sbrow)(Dav1dFrameContext *const f, pixel *const dst[3],
|
|||
{
|
||||
const int offset_y = 8 * !!sby;
|
||||
const ptrdiff_t *const dst_stride = f->sr_cur.p.stride;
|
||||
|
||||
const int restore_planes =
|
||||
((f->frame_hdr->restoration.type[0] != DAV1D_RESTORATION_NONE) << 0) +
|
||||
((f->frame_hdr->restoration.type[1] != DAV1D_RESTORATION_NONE) << 1) +
|
||||
((f->frame_hdr->restoration.type[2] != DAV1D_RESTORATION_NONE) << 2);
|
||||
const int restore_planes = f->lf.restore_planes;
|
||||
|
||||
if (restore_planes & LR_RESTORE_Y) {
|
||||
const int h = f->sr_cur.p.p.h;
|
||||
|
|
|
@ -93,6 +93,7 @@ if is_asm_enabled
|
|||
)
|
||||
libdav1d_tmpl_sources += files(
|
||||
'arm/cdef_init_tmpl.c',
|
||||
'arm/ipred_init_tmpl.c',
|
||||
'arm/itx_init_tmpl.c',
|
||||
'arm/loopfilter_init_tmpl.c',
|
||||
'arm/looprestoration_init_tmpl.c',
|
||||
|
@ -101,6 +102,7 @@ if is_asm_enabled
|
|||
if host_machine.cpu_family() == 'aarch64'
|
||||
libdav1d_sources += files(
|
||||
'arm/64/cdef.S',
|
||||
'arm/64/ipred.S',
|
||||
'arm/64/itx.S',
|
||||
'arm/64/loopfilter.S',
|
||||
'arm/64/looprestoration.S',
|
||||
|
@ -109,6 +111,7 @@ if is_asm_enabled
|
|||
)
|
||||
elif host_machine.cpu_family().startswith('arm')
|
||||
libdav1d_sources += files(
|
||||
'arm/32/cdef.S',
|
||||
'arm/32/looprestoration.S',
|
||||
'arm/32/mc.S',
|
||||
)
|
||||
|
@ -167,6 +170,7 @@ if is_asm_enabled
|
|||
)
|
||||
libdav1d_arch_tmpl_sources += files(
|
||||
'ppc/cdef_init_tmpl.c',
|
||||
'ppc/looprestoration_init_tmpl.c',
|
||||
)
|
||||
endif
|
||||
endif
|
||||
|
|
|
@ -1098,6 +1098,8 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
|
|||
const int num_uv_pos = num_y_pos + !!fgd->num_y_points;
|
||||
for (int i = 0; i < num_uv_pos; i++)
|
||||
fgd->ar_coeffs_uv[pl][i] = dav1d_get_bits(gb, 8) - 128;
|
||||
if (!fgd->num_y_points)
|
||||
fgd->ar_coeffs_uv[pl][num_uv_pos] = 0;
|
||||
}
|
||||
fgd->ar_coeff_shift = dav1d_get_bits(gb, 2) + 6;
|
||||
fgd->grain_scale_shift = dav1d_get_bits(gb, 2);
|
||||
|
|
|
@ -0,0 +1,350 @@
|
|||
/*
|
||||
* Copyright © 2019, VideoLAN and dav1d authors
|
||||
* Copyright © 2019, Michail Alvanos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "common/intops.h"
|
||||
#include "src/ppc/types.h"
|
||||
#include "src/cpu.h"
|
||||
#include "src/looprestoration.h"
|
||||
|
||||
#if BITDEPTH == 8
|
||||
|
||||
#define REST_UNIT_STRIDE (400)
|
||||
|
||||
/* Clamp every 32-bit lane of v into the range [minv, maxv]. */
static inline i32x4 iclip_vec(i32x4 v, const i32x4 minv, const i32x4 maxv) {
    /* Raise lanes below minv first, then lower lanes above maxv. */
    const i32x4 floored = vec_max(minv, v);
    return vec_min(maxv, floored);
}
|
||||
|
||||
/*
 * Accumulate one horizontal tap.
 *
 * The 16 bytes of v are widened into two vectors of eight 16-bit values
 * (u8h_to_u16 / u8l_to_u16); each is multiplied by f and added into
 * ssum1 and ssum2 respectively with vec_madd. ssum1 and ssum2 are
 * updated in place, so they must be assignable vector variables.
 */
#define APPLY_FILTER_H(v, f, ssum1, ssum2) do {            \
    i16x8 widened_hi = (i16x8) u8h_to_u16(v);              \
    i16x8 widened_lo = (i16x8) u8l_to_u16(v);              \
    ssum1 = vec_madd(widened_hi, f, ssum1);                \
    ssum2 = vec_madd(widened_lo, f, ssum2);                \
} while (0)
|
||||
|
||||
static void wiener_filter_h_vsx(int32_t *hor_ptr,
|
||||
uint8_t *tmp_ptr,
|
||||
const int16_t filterh[7],
|
||||
const int w, const int h)
|
||||
{
|
||||
static const i32x4 zerov = vec_splats(0);
|
||||
static const i32x4 seven_vec = vec_splats(7);
|
||||
static const i32x4 bitdepth_added_vec = vec_splats(1 << 14);
|
||||
static const i32x4 round_bits_vec = vec_splats(3);
|
||||
static const i32x4 rounding_off_vec = vec_splats(1<<2);
|
||||
static const i32x4 clip_limit_v = vec_splats((1 << 13) - 1);
|
||||
|
||||
i16x8 filterhvall = vec_vsx_ld(0, filterh);
|
||||
i16x8 filterhv0 = vec_splat( filterhvall, 0);
|
||||
i16x8 filterhv1 = vec_splat( filterhvall, 1);
|
||||
i16x8 filterhv2 = vec_splat( filterhvall, 2);
|
||||
i16x8 filterhv3 = vec_splat( filterhvall, 3);
|
||||
i16x8 filterhv4 = vec_splat( filterhvall, 4);
|
||||
i16x8 filterhv5 = vec_splat( filterhvall, 5);
|
||||
i16x8 filterhv6 = vec_splat( filterhvall, 6);
|
||||
|
||||
for (int j = 0; j < h + 6; j++) {
|
||||
for (int i = 0; i < w; i+=16) {
|
||||
i32x4 sum1 = bitdepth_added_vec;
|
||||
i32x4 sum2 = bitdepth_added_vec;
|
||||
i32x4 sum3 = bitdepth_added_vec;
|
||||
i32x4 sum4 = bitdepth_added_vec;
|
||||
|
||||
u8x16 tmp_v0 = vec_ld(0, &tmp_ptr[i]);
|
||||
u8x16 tmp_v7 = vec_ld(0, &tmp_ptr[i+16]);
|
||||
|
||||
u8x16 tmp_v1 = vec_sld( tmp_v7, tmp_v0, 15);
|
||||
u8x16 tmp_v2 = vec_sld( tmp_v7, tmp_v0, 14);
|
||||
u8x16 tmp_v3 = vec_sld( tmp_v7, tmp_v0, 13);
|
||||
u8x16 tmp_v4 = vec_sld( tmp_v7, tmp_v0, 12);
|
||||
u8x16 tmp_v5 = vec_sld( tmp_v7, tmp_v0, 11);
|
||||
u8x16 tmp_v6 = vec_sld( tmp_v7, tmp_v0, 10);
|
||||
|
||||
u16x8 tmp_u16_high = u8h_to_u16(tmp_v3);
|
||||
u16x8 tmp_u16_low = u8l_to_u16(tmp_v3);
|
||||
|
||||
i32x4 tmp_expanded1 = i16h_to_i32(tmp_u16_high);
|
||||
i32x4 tmp_expanded2 = i16l_to_i32(tmp_u16_high);
|
||||
i32x4 tmp_expanded3 = i16h_to_i32(tmp_u16_low);
|
||||
i32x4 tmp_expanded4 = i16l_to_i32(tmp_u16_low);
|
||||
|
||||
i16x8 ssum1 = (i16x8) zerov;
|
||||
i16x8 ssum2 = (i16x8) zerov;
|
||||
|
||||
APPLY_FILTER_H(tmp_v0, filterhv0, ssum1, ssum2);
|
||||
APPLY_FILTER_H(tmp_v1, filterhv1, ssum1, ssum2);
|
||||
APPLY_FILTER_H(tmp_v2, filterhv2, ssum1, ssum2);
|
||||
APPLY_FILTER_H(tmp_v3, filterhv3, ssum1, ssum2);
|
||||
APPLY_FILTER_H(tmp_v4, filterhv4, ssum1, ssum2);
|
||||
APPLY_FILTER_H(tmp_v5, filterhv5, ssum1, ssum2);
|
||||
APPLY_FILTER_H(tmp_v6, filterhv6, ssum1, ssum2);
|
||||
|
||||
sum1 += i16h_to_i32(ssum1) + (tmp_expanded1 << seven_vec);
|
||||
sum2 += i16l_to_i32(ssum1) + (tmp_expanded2 << seven_vec);
|
||||
sum3 += i16h_to_i32(ssum2) + (tmp_expanded3 << seven_vec);
|
||||
sum4 += i16l_to_i32(ssum2) + (tmp_expanded4 << seven_vec);
|
||||
|
||||
sum1 = (sum1 + rounding_off_vec) >> round_bits_vec;
|
||||
sum2 = (sum2 + rounding_off_vec) >> round_bits_vec;
|
||||
sum3 = (sum3 + rounding_off_vec) >> round_bits_vec;
|
||||
sum4 = (sum4 + rounding_off_vec) >> round_bits_vec;
|
||||
|
||||
sum1 = iclip_vec(sum1, zerov, clip_limit_v);
|
||||
sum2 = iclip_vec(sum2, zerov, clip_limit_v);
|
||||
sum3 = iclip_vec(sum3, zerov, clip_limit_v);
|
||||
sum4 = iclip_vec(sum4, zerov, clip_limit_v);
|
||||
|
||||
vec_st(sum1, 0, &hor_ptr[i]);
|
||||
vec_st(sum2, 16, &hor_ptr[i]);
|
||||
vec_st(sum3, 32, &hor_ptr[i]);
|
||||
vec_st(sum4, 48, &hor_ptr[i]);
|
||||
}
|
||||
tmp_ptr += REST_UNIT_STRIDE;
|
||||
hor_ptr += REST_UNIT_STRIDE;
|
||||
}
|
||||
}
|
||||
|
||||
/* Clamp every 16-bit lane of v into the 8-bit pixel range [0, 255]. */
static inline i16x8 iclip_u8_vec(i16x8 v) {
    static const i16x8 lo = vec_splats((int16_t)0);
    static const i16x8 hi = vec_splats((int16_t)255);
    return vec_min(hi, vec_max(lo, v));
}
|
||||
|
||||
/*
 * Accumulate one vertical tap for 16 columns.
 *
 * Loads four vectors of four 32-bit intermediates from row (j + index) of
 * `hor`, starting at column i, multiplies them by f and adds the products
 * to sum1..sum4.
 *
 * The identifiers hor, i, j and sum1..sum4 are NOT parameters: they are
 * taken from the scope in which the macro is expanded.
 *
 * `index` and `f` are parenthesized so that expression arguments keep their
 * intended meaning inside the surrounding arithmetic.
 */
#define APPLY_FILTER_V(index, f) do { \
    i32x4 v1 = vec_ld( 0, &hor[(j + (index)) * REST_UNIT_STRIDE + i]); \
    i32x4 v2 = vec_ld(16, &hor[(j + (index)) * REST_UNIT_STRIDE + i]); \
    i32x4 v3 = vec_ld(32, &hor[(j + (index)) * REST_UNIT_STRIDE + i]); \
    i32x4 v4 = vec_ld(48, &hor[(j + (index)) * REST_UNIT_STRIDE + i]); \
    sum1 = sum1 + v1 * (f); \
    sum2 = sum2 + v2 * (f); \
    sum3 = sum3 + v3 * (f); \
    sum4 = sum4 + v4 * (f); \
} while (0)
|
||||
|
||||
/*
 * Compute 16 output pixels of the vertical Wiener pass.
 *
 * sumpixelv: u8x16 lvalue that receives the 16 packed result bytes
 * hor:       buffer of 32-bit horizontal intermediates
 *
 * Steps: start from -round_offset_vec, accumulate the seven taps over rows
 * j..j+6 (APPLY_FILTER_V), add row j+3 shifted left by 7 plus the rounding
 * term, shift right by round_bits_vec, pack to 16 bits, clamp to [0, 255]
 * and pack to bytes.
 *
 * Taken from the expansion scope (not parameters): i, j, filterv0..filterv6,
 * round_offset_vec, rounding_off_vec, round_bits_vec and seven_vec. Note that
 * APPLY_FILTER_V also reads the identifier `hor` from that scope, so the
 * buffer passed here has to be a variable named `hor`.
 *
 * The result is stored through the `sumpixelv` parameter; previously the
 * macro ignored it and wrote to a variable literally named `sum_pixel`.
 */
#define LOAD_AND_APPLY_FILTER_V(sumpixelv, hor) do { \
    i32x4 v_1 = (i32x4) vec_ld( 0, &(hor)[(j + 3) * REST_UNIT_STRIDE + i]); \
    i32x4 v_2 = (i32x4) vec_ld(16, &(hor)[(j + 3) * REST_UNIT_STRIDE + i]); \
    i32x4 v_3 = (i32x4) vec_ld(32, &(hor)[(j + 3) * REST_UNIT_STRIDE + i]); \
    i32x4 v_4 = (i32x4) vec_ld(48, &(hor)[(j + 3) * REST_UNIT_STRIDE + i]); \
    i32x4 sum1 = -round_offset_vec; \
    i32x4 sum2 = -round_offset_vec; \
    i32x4 sum3 = -round_offset_vec; \
    i32x4 sum4 = -round_offset_vec; \
    APPLY_FILTER_V(0, filterv0); \
    APPLY_FILTER_V(1, filterv1); \
    APPLY_FILTER_V(2, filterv2); \
    APPLY_FILTER_V(3, filterv3); \
    APPLY_FILTER_V(4, filterv4); \
    APPLY_FILTER_V(5, filterv5); \
    APPLY_FILTER_V(6, filterv6); \
    sum1 = (v_1 << seven_vec) + sum1 + rounding_off_vec; \
    sum2 = (v_2 << seven_vec) + sum2 + rounding_off_vec; \
    sum3 = (v_3 << seven_vec) + sum3 + rounding_off_vec; \
    sum4 = (v_4 << seven_vec) + sum4 + rounding_off_vec; \
    sum1 = sum1 >> round_bits_vec; \
    sum2 = sum2 >> round_bits_vec; \
    sum3 = sum3 >> round_bits_vec; \
    sum4 = sum4 >> round_bits_vec; \
    i16x8 sum_short_packed_1 = (i16x8) vec_pack( sum1, sum2 ); \
    i16x8 sum_short_packed_2 = (i16x8) vec_pack( sum3, sum4 ); \
    sum_short_packed_1 = iclip_u8_vec(sum_short_packed_1); \
    sum_short_packed_2 = iclip_u8_vec(sum_short_packed_2); \
    (sumpixelv) = (u8x16) vec_pack(sum_short_packed_1, sum_short_packed_2 ); \
} while (0)
|
||||
|
||||
static inline void wiener_filter_v_vsx(uint8_t *p,
|
||||
const ptrdiff_t p_stride,
|
||||
const int32_t *hor,
|
||||
const int16_t filterv[7],
|
||||
const int w, const int h)
|
||||
{
|
||||
static const i32x4 round_bits_vec = vec_splats(11);
|
||||
static const i32x4 rounding_off_vec = vec_splats(1 << 10);
|
||||
static const i32x4 round_offset_vec = vec_splats(1 << 18);
|
||||
static const i32x4 seven_vec = vec_splats(7);
|
||||
|
||||
i32x4 filterv0 = vec_splats((int32_t) filterv[0]);
|
||||
i32x4 filterv1 = vec_splats((int32_t) filterv[1]);
|
||||
i32x4 filterv2 = vec_splats((int32_t) filterv[2]);
|
||||
i32x4 filterv3 = vec_splats((int32_t) filterv[3]);
|
||||
i32x4 filterv4 = vec_splats((int32_t) filterv[4]);
|
||||
i32x4 filterv5 = vec_splats((int32_t) filterv[5]);
|
||||
i32x4 filterv6 = vec_splats((int32_t) filterv[6]);
|
||||
|
||||
for (int j = 0; j < h; j++) {
|
||||
for (int i = 0; i <(w-w%16); i += 16) {
|
||||
u8x16 sum_pixel;
|
||||
LOAD_AND_APPLY_FILTER_V(sum_pixel, hor);
|
||||
vec_vsx_st(sum_pixel, 0, &p[j * PXSTRIDE(p_stride) + i]);
|
||||
}
|
||||
// remaining loop
|
||||
if (w & 0xf){
|
||||
int i=w-w%16;
|
||||
ALIGN_STK_16(uint8_t, tmp_out, 16,);
|
||||
u8x16 sum_pixel;
|
||||
|
||||
LOAD_AND_APPLY_FILTER_V(sum_pixel, hor);
|
||||
vec_vsx_st(sum_pixel, 0, tmp_out);
|
||||
|
||||
for (int k=0; i<w; i++, k++) {
|
||||
p[j * PXSTRIDE(p_stride) + i] = tmp_out[k];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static inline void padding(uint8_t *dst, const uint8_t *p,
|
||||
const ptrdiff_t p_stride, const uint8_t (*left)[4],
|
||||
const uint8_t *lpf, const ptrdiff_t lpf_stride,
|
||||
int unit_w, const int stripe_h,
|
||||
const enum LrEdgeFlags edges)
|
||||
{
|
||||
const int have_left = !!(edges & LR_HAVE_LEFT);
|
||||
const int have_right = !!(edges & LR_HAVE_RIGHT);
|
||||
|
||||
// Copy more pixels if we don't have to pad them
|
||||
unit_w += 3 * have_left + 3 * have_right;
|
||||
uint8_t *dst_l = dst + 3 * !have_left;
|
||||
p -= 3 * have_left;
|
||||
lpf -= 3 * have_left;
|
||||
|
||||
if (edges & LR_HAVE_TOP) {
|
||||
// Copy previous loop filtered rows
|
||||
const uint8_t *const above_1 = lpf;
|
||||
const uint8_t *const above_2 = above_1 + PXSTRIDE(lpf_stride);
|
||||
pixel_copy(dst_l, above_1, unit_w);
|
||||
pixel_copy(dst_l + REST_UNIT_STRIDE, above_1, unit_w);
|
||||
pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, above_2, unit_w);
|
||||
} else {
|
||||
// Pad with first row
|
||||
pixel_copy(dst_l, p, unit_w);
|
||||
pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w);
|
||||
pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w);
|
||||
if (have_left) {
|
||||
pixel_copy(dst_l, &left[0][1], 3);
|
||||
pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3);
|
||||
pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3);
|
||||
}
|
||||
}
|
||||
|
||||
uint8_t *dst_tl = dst_l + 3 * REST_UNIT_STRIDE;
|
||||
if (edges & LR_HAVE_BOTTOM) {
|
||||
// Copy next loop filtered rows
|
||||
const uint8_t *const below_1 = lpf + 6 * PXSTRIDE(lpf_stride);
|
||||
const uint8_t *const below_2 = below_1 + PXSTRIDE(lpf_stride);
|
||||
pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, below_1, unit_w);
|
||||
pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, below_2, unit_w);
|
||||
pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, below_2, unit_w);
|
||||
} else {
|
||||
// Pad with last row
|
||||
const uint8_t *const src = p + (stripe_h - 1) * PXSTRIDE(p_stride);
|
||||
pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w);
|
||||
pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w);
|
||||
pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w);
|
||||
if (have_left) {
|
||||
pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
|
||||
pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
|
||||
pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
|
||||
}
|
||||
}
|
||||
|
||||
// Inner UNIT_WxSTRIPE_H
|
||||
for (int j = 0; j < stripe_h; j++) {
|
||||
pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left);
|
||||
dst_tl += REST_UNIT_STRIDE;
|
||||
p += PXSTRIDE(p_stride);
|
||||
}
|
||||
|
||||
if (!have_right) {
|
||||
uint8_t *pad = dst_l + unit_w;
|
||||
uint8_t *row_last = &dst_l[unit_w - 1];
|
||||
// Pad 3x(STRIPE_H+6) with last column
|
||||
for (int j = 0; j < stripe_h + 6; j++) {
|
||||
pixel_set(pad, *row_last, 3);
|
||||
pad += REST_UNIT_STRIDE;
|
||||
row_last += REST_UNIT_STRIDE;
|
||||
}
|
||||
}
|
||||
|
||||
if (!have_left) {
|
||||
// Pad 3x(STRIPE_H+6) with first column
|
||||
for (int j = 0; j < stripe_h + 6; j++) {
|
||||
pixel_set(dst, *dst_l, 3);
|
||||
dst += REST_UNIT_STRIDE;
|
||||
dst_l += REST_UNIT_STRIDE;
|
||||
}
|
||||
} else {
|
||||
dst += 3 * REST_UNIT_STRIDE;
|
||||
for (int j = 0; j < stripe_h; j++) {
|
||||
pixel_copy(dst, &left[j][1], 3);
|
||||
dst += REST_UNIT_STRIDE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// FIXME Could split into luma and chroma specific functions,
|
||||
// (since first and last tops are always 0 for chroma)
|
||||
// FIXME Could implement a version that requires less temporary memory
|
||||
// (should be possible to implement with only 6 rows of temp storage)
|
||||
static void wiener_filter_vsx(uint8_t *p, const ptrdiff_t p_stride,
|
||||
const uint8_t (*const left)[4],
|
||||
const uint8_t *lpf,
|
||||
const ptrdiff_t lpf_stride,
|
||||
const int w, const int h,
|
||||
const int16_t filterh[7],
|
||||
const int16_t filterv[7],
|
||||
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
|
||||
{
|
||||
// Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
|
||||
// of padding above and below
|
||||
ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
|
||||
padding(tmp, p, p_stride, left, lpf, lpf_stride, w, h, edges);
|
||||
ALIGN_STK_16(int32_t, hor, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE + 64,);
|
||||
|
||||
wiener_filter_h_vsx(hor, tmp, filterh, w, h);
|
||||
wiener_filter_v_vsx(p, p_stride, hor, filterv, w, h);
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
COLD void bitfn(dav1d_loop_restoration_dsp_init_ppc)
|
||||
(Dav1dLoopRestorationDSPContext *const c)
|
||||
{
|
||||
const unsigned flags = dav1d_get_cpu_flags();
|
||||
|
||||
if (!(flags & DAV1D_PPC_CPU_FLAG_VSX)) return;
|
||||
|
||||
#if BITDEPTH == 8
|
||||
c->wiener = wiener_filter_vsx;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -47,6 +47,8 @@
|
|||
#define u8h_to_u16(v) ((u16x8) vec_mergeh((u8x16) v, vec_splat_u8(0)))
|
||||
#define u8l_to_u16(v) ((u16x8) vec_mergel((u8x16) v, vec_splat_u8(0)))
|
||||
#define u16h_to_i32(v) ((i32x4) vec_mergeh((u16x8) v, vec_splat_u16(0)))
|
||||
#define i16h_to_i32(v) ((i32x4) vec_unpackh((i16x8)v))
|
||||
#define u16l_to_i32(v) ((i32x4) vec_mergel((u16x8) v, vec_splat_u16(0)))
|
||||
#define i16l_to_i32(v) ((i32x4) vec_unpackl((i16x8)v))
|
||||
|
||||
#endif /* DAV1D_SRC_PPC_TYPES_H */
|
||||
|
|
|
@ -1971,7 +1971,7 @@ void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
|
|||
start_of_tile_row);
|
||||
}
|
||||
|
||||
if (f->seq_hdr->restoration) {
|
||||
if (f->lf.restore_planes) {
|
||||
// Store loop filtered pixels required by loop restoration
|
||||
bytefn(dav1d_lr_copy_lpf)(f, f->lf.p, sby);
|
||||
}
|
||||
|
@ -2010,7 +2010,7 @@ void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
|
|||
f->resize_start[!!pl] HIGHBD_CALL_SUFFIX);
|
||||
}
|
||||
}
|
||||
if (f->seq_hdr->restoration) {
|
||||
if (f->lf.restore_planes) {
|
||||
bytefn(dav1d_lr_sbrow)(f, f->lf.sr_p, sby);
|
||||
}
|
||||
|
||||
|
|
|
@ -716,52 +716,65 @@ const uint16_t dav1d_dr_intra_derivative[44] = {
|
|||
3 // 87, 177, 267
|
||||
};
|
||||
|
||||
/*
 * F(idx, f0..f6) expands to seven designated initializers that scatter the
 * values f0..f6 belonging to position idx across an array.
 *
 * With ARCH_X86 the values are stored in pairs: (f0, f1) at 2*idx,
 * (f2, f3) at 2*idx + 16, (f4, f5) at 2*idx + 32 and f6 at 2*idx + 48.
 * Otherwise value k is stored at idx + 8*k.
 *
 * All parameters are parenthesized so that expression arguments (for
 * example `1 << 1` as idx) still select the intended elements.
 */
#if ARCH_X86
#define F(idx, f0, f1, f2, f3, f4, f5, f6) \
    [2*(idx)+0]  = (f0), [2*(idx)+1]  = (f1), \
    [2*(idx)+16] = (f2), [2*(idx)+17] = (f3), \
    [2*(idx)+32] = (f4), [2*(idx)+33] = (f5), \
    [2*(idx)+48] = (f6)
#else
#define F(idx, f0, f1, f2, f3, f4, f5, f6) \
    [1*(idx)+0]  = (f0), [1*(idx)+8]  = (f1), \
    [1*(idx)+16] = (f2), [1*(idx)+24] = (f3), \
    [1*(idx)+32] = (f4), [1*(idx)+40] = (f5), \
    [1*(idx)+48] = (f6)
#endif
|
||||
const int8_t ALIGN(dav1d_filter_intra_taps[5][64], 16) = {
|
||||
{
|
||||
-6, 10, -5, 2, -3, 1, -3, 1,
|
||||
-4, 6, -3, 2, -3, 2, -3, 1,
|
||||
0, 0, 10, 0, 1, 10, 1, 2,
|
||||
0, 0, 6, 0, 2, 6, 2, 2,
|
||||
0, 12, 0, 9, 0, 7, 10, 5,
|
||||
0, 2, 0, 2, 0, 2, 6, 3,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
12, 0, 9, 0, 7, 0, 5, 0
|
||||
F( 0, -6, 10, 0, 0, 0, 12, 0 ),
|
||||
F( 1, -5, 2, 10, 0, 0, 9, 0 ),
|
||||
F( 2, -3, 1, 1, 10, 0, 7, 0 ),
|
||||
F( 3, -3, 1, 1, 2, 10, 5, 0 ),
|
||||
F( 4, -4, 6, 0, 0, 0, 2, 12 ),
|
||||
F( 5, -3, 2, 6, 0, 0, 2, 9 ),
|
||||
F( 6, -3, 2, 2, 6, 0, 2, 7 ),
|
||||
F( 7, -3, 1, 2, 2, 6, 3, 5 ),
|
||||
}, {
|
||||
-10, 16, -6, 0, -4, 0, -2, 0,
|
||||
-10, 16, -6, 0, -4, 0, -2, 0,
|
||||
0, 0, 16, 0, 0, 16, 0, 0,
|
||||
0, 0, 16, 0, 0, 16, 0, 0,
|
||||
0, 10, 0, 6, 0, 4, 16, 2,
|
||||
0, 0, 0, 0, 0, 0, 16, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
10, 0, 6, 0, 4, 0, 2, 0
|
||||
F( 0, -10, 16, 0, 0, 0, 10, 0 ),
|
||||
F( 1, -6, 0, 16, 0, 0, 6, 0 ),
|
||||
F( 2, -4, 0, 0, 16, 0, 4, 0 ),
|
||||
F( 3, -2, 0, 0, 0, 16, 2, 0 ),
|
||||
F( 4, -10, 16, 0, 0, 0, 0, 10 ),
|
||||
F( 5, -6, 0, 16, 0, 0, 0, 6 ),
|
||||
F( 6, -4, 0, 0, 16, 0, 0, 4 ),
|
||||
F( 7, -2, 0, 0, 0, 16, 0, 2 ),
|
||||
}, {
|
||||
-8, 8, -8, 0, -8, 0, -8, 0,
|
||||
-4, 4, -4, 0, -4, 0, -4, 0,
|
||||
0, 0, 8, 0, 0, 8, 0, 0,
|
||||
0, 0, 4, 0, 0, 4, 0, 0,
|
||||
0, 16, 0, 16, 0, 16, 8, 16,
|
||||
0, 0, 0, 0, 0, 0, 4, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
16, 0, 16, 0, 16, 0, 16, 0
|
||||
F( 0, -8, 8, 0, 0, 0, 16, 0 ),
|
||||
F( 1, -8, 0, 8, 0, 0, 16, 0 ),
|
||||
F( 2, -8, 0, 0, 8, 0, 16, 0 ),
|
||||
F( 3, -8, 0, 0, 0, 8, 16, 0 ),
|
||||
F( 4, -4, 4, 0, 0, 0, 0, 16 ),
|
||||
F( 5, -4, 0, 4, 0, 0, 0, 16 ),
|
||||
F( 6, -4, 0, 0, 4, 0, 0, 16 ),
|
||||
F( 7, -4, 0, 0, 0, 4, 0, 16 ),
|
||||
}, {
|
||||
-2, 8, -1, 3, -1, 2, 0, 1,
|
||||
-1, 4, -1, 3, -1, 2, -1, 2,
|
||||
0, 0, 8, 0, 3, 8, 2, 3,
|
||||
0, 0, 4, 0, 3, 4, 2, 3,
|
||||
0, 10, 0, 6, 0, 4, 8, 2,
|
||||
0, 3, 0, 4, 0, 4, 4, 3,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
10, 0, 6, 0, 4, 0, 3, 0
|
||||
F( 0, -2, 8, 0, 0, 0, 10, 0 ),
|
||||
F( 1, -1, 3, 8, 0, 0, 6, 0 ),
|
||||
F( 2, -1, 2, 3, 8, 0, 4, 0 ),
|
||||
F( 3, 0, 1, 2, 3, 8, 2, 0 ),
|
||||
F( 4, -1, 4, 0, 0, 0, 3, 10 ),
|
||||
F( 5, -1, 3, 4, 0, 0, 4, 6 ),
|
||||
F( 6, -1, 2, 3, 4, 0, 4, 4 ),
|
||||
F( 7, -1, 2, 2, 3, 4, 3, 3 ),
|
||||
}, {
|
||||
-12, 14, -10, 0, -9, 0, -8, 0,
|
||||
-10, 12, -9, 1, -8, 0, -7, 0,
|
||||
0, 0, 14, 0, 0, 14, 0, 0,
|
||||
0, 0, 12, 0, 0, 12, 0, 1,
|
||||
0, 14, 0, 12, 0, 11, 14, 10,
|
||||
0, 0, 0, 0, 0, 1, 12, 1,
|
||||
0, 0, 0, 0, 0, 0, 0, 0,
|
||||
14, 0, 12, 0, 11, 0, 9, 0
|
||||
F( 0, -12, 14, 0, 0, 0, 14, 0 ),
|
||||
F( 1, -10, 0, 14, 0, 0, 12, 0 ),
|
||||
F( 2, -9, 0, 0, 14, 0, 11, 0 ),
|
||||
F( 3, -8, 0, 0, 0, 14, 10, 0 ),
|
||||
F( 4, -10, 12, 0, 0, 0, 0, 14 ),
|
||||
F( 5, -9, 1, 12, 0, 0, 0, 12 ),
|
||||
F( 6, -8, 0, 0, 12, 0, 1, 11 ),
|
||||
F( 7, -7, 0, 0, 1, 12, 1, 9 ),
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -31,14 +31,17 @@
|
|||
decl_cdef_fn(dav1d_cdef_filter_8x8_avx2);
|
||||
decl_cdef_fn(dav1d_cdef_filter_8x8_sse4);
|
||||
decl_cdef_fn(dav1d_cdef_filter_8x8_ssse3);
|
||||
decl_cdef_fn(dav1d_cdef_filter_8x8_sse2);
|
||||
|
||||
decl_cdef_fn(dav1d_cdef_filter_4x8_avx2);
|
||||
decl_cdef_fn(dav1d_cdef_filter_4x8_sse4);
|
||||
decl_cdef_fn(dav1d_cdef_filter_4x8_ssse3);
|
||||
decl_cdef_fn(dav1d_cdef_filter_4x8_sse2);
|
||||
|
||||
decl_cdef_fn(dav1d_cdef_filter_4x4_avx2);
|
||||
decl_cdef_fn(dav1d_cdef_filter_4x4_sse4);
|
||||
decl_cdef_fn(dav1d_cdef_filter_4x4_ssse3);
|
||||
decl_cdef_fn(dav1d_cdef_filter_4x4_sse2);
|
||||
|
||||
decl_cdef_dir_fn(dav1d_cdef_dir_avx2);
|
||||
decl_cdef_dir_fn(dav1d_cdef_dir_sse4);
|
||||
|
@ -47,6 +50,14 @@ decl_cdef_dir_fn(dav1d_cdef_dir_ssse3);
|
|||
COLD void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
|
||||
const unsigned flags = dav1d_get_cpu_flags();
|
||||
|
||||
if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
|
||||
|
||||
#if BITDEPTH == 8
|
||||
c->fb[0] = dav1d_cdef_filter_8x8_sse2;
|
||||
c->fb[1] = dav1d_cdef_filter_4x8_sse2;
|
||||
c->fb[2] = dav1d_cdef_filter_4x4_sse2;
|
||||
#endif
|
||||
|
||||
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
|
||||
|
||||
#if BITDEPTH == 8
|
||||
|
|
|
@ -32,6 +32,7 @@ SECTION_RODATA 16
|
|||
pb_0: times 16 db 0
|
||||
pb_0xFF: times 16 db 0xFF
|
||||
%endif
|
||||
pw_8: times 8 dw 8
|
||||
pw_128: times 8 dw 128
|
||||
pw_256: times 8 dw 256
|
||||
pw_2048: times 8 dw 2048
|
||||
|
@ -118,6 +119,36 @@ SECTION .text
|
|||
%endif
|
||||
%endmacro
|
||||
|
||||
; PMOVZXBW dst, src [, half]
; Moves 4 bytes (half == 1, movd) or 8 bytes (default, movq) from src into
; dst and then interleaves those bytes with the low bytes of m15
; (punpcklbw), producing one 16-bit word per source byte.
; NOTE(review): presumably m15 holds zero at every use so the bytes end up
; zero-extended -- confirm at the call sites.
%macro PMOVZXBW 2-3 0 ; %3 = half
|
||||
%if %3 == 1
|
||||
movd %1, %2
|
||||
%else
|
||||
movq %1, %2
|
||||
%endif
|
||||
punpcklbw %1, m15
|
||||
%endmacro
|
||||
|
||||
; PSHUFB_0 reg, mask
; With SSSE3 available this is a plain `pshufb reg, mask`.
; Without SSSE3 the mask operand is ignored and the lowest byte of reg is
; replicated instead: punpcklbw duplicates it into the low word, pshuflw
; with q0000 copies that word over the low four words, and punpcklqdq copies
; the low quadword into the high one.
; NOTE(review): both paths give the same result only when mask is all
; zero -- presumably what every caller passes; confirm.
%macro PSHUFB_0 2
|
||||
%if cpuflag(ssse3)
|
||||
pshufb %1, %2
|
||||
%else
|
||||
punpcklbw %1, %1
|
||||
pshuflw %1, %1, q0000
|
||||
punpcklqdq %1, %1
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
; LOAD_SEC_TAP
; Loads 4 bytes from [secq+kq] and runs them through PSHUFB_0.
; On x86-64 the value is loaded into m3 and m15 is used as the shuffle mask.
; Otherwise the value is loaded into m2 and m3 is cleared (pxor) to serve as
; the mask, so m3 is clobbered on that path.
%macro LOAD_SEC_TAP 0
|
||||
%if ARCH_X86_64
|
||||
movd m3, [secq+kq]
|
||||
PSHUFB_0 m3, m15
|
||||
%else
|
||||
movd m2, [secq+kq] ; sec_taps
|
||||
pxor m3, m3
|
||||
PSHUFB_0 m2, m3
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro ACCUMULATE_TAP 7 ; tap_offset, shift, shift_mask, strength, mul_tap, w, stride
|
||||
; load p0/p1
|
||||
movsx offq, byte [dirq+kq+%1] ; off1
|
||||
|
@ -153,13 +184,13 @@ SECTION .text
|
|||
pmaxsw m7, m10 ; max after p1
|
||||
pminsw m8, m6 ; min after p1
|
||||
%else
|
||||
pcmpeqw m3, m5, OUT_OF_BOUNDS_MEM
|
||||
pandn m3, m5
|
||||
pmaxsw m7, m3 ; max after p0
|
||||
pcmpeqw m9, m5, OUT_OF_BOUNDS_MEM
|
||||
pandn m9, m5
|
||||
pmaxsw m7, m9 ; max after p0
|
||||
pminsw m8, m5 ; min after p0
|
||||
pcmpeqw m3, m6, OUT_OF_BOUNDS_MEM
|
||||
pandn m3, m6
|
||||
pmaxsw m7, m3 ; max after p1
|
||||
pcmpeqw m9, m6, OUT_OF_BOUNDS_MEM
|
||||
pandn m9, m6
|
||||
pmaxsw m7, m9 ; max after p1
|
||||
pminsw m8, m6 ; min after p1
|
||||
%endif
|
||||
%endif
|
||||
|
@ -168,13 +199,24 @@ SECTION .text
|
|||
psubw m5, m4 ; diff_p0(p0 - px)
|
||||
psubw m6, m4 ; diff_p1(p1 - px)
|
||||
packsswb m5, m6 ; convert pixel diff to 8-bit
|
||||
%if ARCH_X86_64 && cpuflag(sse4)
|
||||
%if cpuflag(ssse3)
|
||||
%if ARCH_X86_64 && cpuflag(sse4)
|
||||
pshufb m5, m14 ; group diffs p0 and p1 into pairs
|
||||
%else
|
||||
%else
|
||||
pshufb m5, [PIC_sym(shufb_lohi)]
|
||||
%endif
|
||||
%endif
|
||||
pabsb m6, m5
|
||||
psignb m9, %5, m5
|
||||
%else
|
||||
movlhps m6, m5
|
||||
punpckhbw m6, m5
|
||||
pxor m5, m5
|
||||
pcmpgtb m5, m6
|
||||
paddb m6, m5
|
||||
pxor m6, m5
|
||||
paddb m9, %5, m5
|
||||
pxor m9, m5
|
||||
%endif
|
||||
%if ARCH_X86_64
|
||||
psrlw m10, m6, %2 ; emulate 8-bit shift
|
||||
pand m10, %3
|
||||
|
@ -186,17 +228,18 @@ SECTION .text
|
|||
pxor m5, [PIC_sym(pb_0xFF)]
|
||||
%endif
|
||||
pminub m5, m6 ; constrain(diff_p)
|
||||
%if cpuflag(ssse3)
|
||||
pmaddubsw m5, m9 ; constrain(diff_p) * taps
|
||||
paddw m13, m5
|
||||
%endmacro
|
||||
|
||||
%macro PMOVZXBW 2-3 0 ; %3 = half
|
||||
%if %3 == 1
|
||||
movd %1, %2
|
||||
%else
|
||||
movq %1, %2
|
||||
psrlw m2, m5, 8
|
||||
psraw m6, m9, 8
|
||||
psllw m5, 8
|
||||
psllw m9, 8
|
||||
pmullw m2, m6
|
||||
pmulhw m5, m9
|
||||
paddw m5, m2
|
||||
%endif
|
||||
punpcklbw %1, m15
|
||||
paddw m13, m5
|
||||
%endmacro
|
||||
|
||||
%macro LOAD_BODY 4 ; dst, src, block_width, tmp_stride
|
||||
|
@ -610,8 +653,8 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*%3, \
|
|||
%endif
|
||||
movd m2, [tableq+pridmpq]
|
||||
movd m3, [tableq+secdmpq]
|
||||
pshufb m2, m15 ; pri_shift_mask
|
||||
pshufb m3, m15 ; sec_shift_mask
|
||||
PSHUFB_0 m2, m15 ; pri_shift_mask
|
||||
PSHUFB_0 m3, m15 ; sec_shift_mask
|
||||
%if ARCH_X86_64
|
||||
SWAP m2, m11
|
||||
SWAP m3, m12
|
||||
|
@ -630,13 +673,15 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*%3, \
|
|||
movd m0, prid
|
||||
movd m1, secd
|
||||
%if ARCH_X86_64
|
||||
pshufb m0, m15
|
||||
pshufb m1, m15
|
||||
PSHUFB_0 m0, m15
|
||||
PSHUFB_0 m1, m15
|
||||
%else
|
||||
mova m2, m15
|
||||
%if cpuflag(ssse3)
|
||||
pxor m2, m2
|
||||
%endif
|
||||
mova m3, [PIC_sym(pb_0xFF)]
|
||||
pshufb m0, m2
|
||||
pshufb m1, m2
|
||||
PSHUFB_0 m0, m2
|
||||
PSHUFB_0 m1, m2
|
||||
pxor m0, m3
|
||||
pxor m1, m3
|
||||
mova [esp+0x20], m0
|
||||
|
@ -687,36 +732,44 @@ cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*%3, \
|
|||
mova m7, m4 ; max
|
||||
mova m8, m4 ; min
|
||||
.k_loop:
|
||||
%if ARCH_X86_64
|
||||
movd m2, [priq+kq] ; pri_taps
|
||||
movd m3, [secq+kq] ; sec_taps
|
||||
pshufb m2, m15
|
||||
pshufb m3, m15
|
||||
%if ARCH_X86_64
|
||||
PSHUFB_0 m2, m15
|
||||
%if cpuflag(ssse3)
|
||||
LOAD_SEC_TAP ; sec_taps
|
||||
%endif
|
||||
ACCUMULATE_TAP 0*2, [rsp+ 0], m11, m0, m2, %1, %3
|
||||
%if notcpuflag(ssse3)
|
||||
LOAD_SEC_TAP ; sec_taps
|
||||
%endif
|
||||
ACCUMULATE_TAP 2*2, [rsp+16], m12, m1, m3, %1, %3
|
||||
ACCUMULATE_TAP 6*2, [rsp+16], m12, m1, m3, %1, %3
|
||||
%else
|
||||
movd m2, [priq+kq] ; pri_taps
|
||||
pshufb m2, m15
|
||||
%if cpuflag(ssse3)
|
||||
pxor m3, m3
|
||||
%endif
|
||||
PSHUFB_0 m2, m3
|
||||
ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x10], [esp+0x20], m2, %1, %3
|
||||
|
||||
movd m2, [secq+kq] ; sec_taps
|
||||
pshufb m2, m15
|
||||
LOAD_SEC_TAP ; sec_taps
|
||||
ACCUMULATE_TAP 2*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, %3
|
||||
%if notcpuflag(ssse3)
|
||||
LOAD_SEC_TAP ; sec_taps
|
||||
%endif
|
||||
ACCUMULATE_TAP 6*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, %3
|
||||
%endif
|
||||
|
||||
dec kq
|
||||
jge .k_loop
|
||||
|
||||
%if cpuflag(sse4)
|
||||
pcmpgtw m6, m15, m13
|
||||
%else
|
||||
pxor m6, m6
|
||||
pcmpgtw m6, m13
|
||||
%endif
|
||||
paddw m13, m6
|
||||
%if cpuflag(ssse3)
|
||||
pmulhrsw m13, [PIC_sym(pw_2048)]
|
||||
%else
|
||||
paddw m13, [PIC_sym(pw_8)]
|
||||
psraw m13, 4
|
||||
%endif
|
||||
paddw m4, m13
|
||||
pminsw m4, m7
|
||||
pmaxsw m4, m8
|
||||
|
@ -1352,3 +1405,8 @@ CDEF_FILTER 8, 8, 32
|
|||
CDEF_FILTER 4, 8, 32
|
||||
CDEF_FILTER 4, 4, 32
|
||||
CDEF_DIR
|
||||
|
||||
INIT_XMM sse2
|
||||
CDEF_FILTER 8, 8, 32
|
||||
CDEF_FILTER 4, 8, 32
|
||||
CDEF_FILTER 4, 4, 32
|
||||
|
|
|
@ -32,6 +32,8 @@ pw_1024: times 16 dw 1024
|
|||
pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
|
||||
rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
|
||||
byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
|
||||
pw_seed_xor: times 2 dw 0xb524
|
||||
times 2 dw 0x49d8
|
||||
pd_m65536: dd ~0xffff
|
||||
pb_23_22: times 2 db 23, 22
|
||||
pb_1: times 4 db 1
|
||||
|
@ -55,6 +57,7 @@ pb_27_17_17_27: db 27, 17, 17, 27
|
|||
%endmacro
|
||||
|
||||
JMP_TABLE generate_grain_y_avx2, 0, 1, 2, 3
|
||||
JMP_TABLE generate_grain_uv_420_avx2, 0, 1, 2, 3
|
||||
|
||||
struc FGData
|
||||
.seed: resd 1
|
||||
|
@ -409,6 +412,443 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data
|
|||
jg .y_loop_ar3
|
||||
RET
|
||||
|
||||
; generate_grain_uv_420 (AVX2), args: buf, bufy, fg_data, uv.
; Stage 1 (this part): fill 38 rows x 44 bytes of buf (row pitch 82 bytes)
; with values gathered from gaussian_sequence, indexed by a seed register
; that is advanced four seeds at a time.
; Stage 2: jump through generate_grain_uv_420_avx2_table to the .ar0-.ar3
; handler selected by FGData.ar_coeff_lag.
INIT_XMM avx2
|
||||
cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
|
||||
lea r4, [pb_mask]
|
||||
; all constants below are addressed relative to pb_mask through r4
%define base r4-pb_mask
|
||||
movq xm1, [base+rnd_next_upperbit_mask]
|
||||
movq xm4, [base+mul_bits]
|
||||
movq xm7, [base+hmul_bits]
|
||||
mov r5d, [fg_dataq+FGData.grain_scale_shift]
|
||||
; xm8 = entry of the round table selected by grain_scale_shift (used as pmulhrsw factor)
vpbroadcastw xm8, [base+round+r5*2]
|
||||
mova xm5, [base+pb_mask]
|
||||
vpbroadcastw xm0, [fg_dataq+FGData.seed]
|
||||
; uv selects one of the two 4-byte groups of pw_seed_xor (0xb524 / 0x49d8)
vpbroadcastw xm9, [base+pw_seed_xor+uvq*4]
|
||||
pxor xm0, xm9
|
||||
; xm9 = ~0xffff in every dword: copied into the gather mask and reused
; with pandn to keep only the low 16 bits of each gathered dword
vpbroadcastd xm9, [base+pd_m65536]
|
||||
lea r6, [gaussian_sequence]
|
||||
; r7d = row counter (38 rows)
mov r7d, 38
|
||||
; bufq is biased by +44 so the column index r5 can run from -44 up to 0
add bufq, 44
|
||||
.loop_y:
|
||||
mov r5, -44
|
||||
.loop_x:
|
||||
pand xm2, xm0, xm1
|
||||
psrlw xm3, xm2, 10
|
||||
por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set
|
||||
pmullw xm2, xm4 ; bits 0x0f00 are set
|
||||
pshufb xm2, xm5, xm2 ; set 15th bit for next 4 seeds
|
||||
psllq xm6, xm2, 30
|
||||
por xm2, xm6
|
||||
psllq xm6, xm2, 15
|
||||
por xm2, xm6 ; aggregate each bit into next seed's high bit
|
||||
pmulhuw xm3, xm0, xm7
|
||||
por xm2, xm3 ; 4 next output seeds
|
||||
; word 3 (the last of the 4 new seeds) is replicated as the state for the next iteration
pshuflw xm0, xm2, q3333
|
||||
; seed >> 5 is the table index; the *2 scale suggests 16-bit table entries
psrlw xm2, 5
|
||||
pmovzxwd xm3, xm2
|
||||
; the gather instruction overwrites its mask operand, so work on a copy of xm9
mova xm6, xm9
|
||||
vpgatherdd xm2, [r6+xm3*2], xm6
|
||||
pandn xm2, xm9, xm2
|
||||
packusdw xm2, xm2
|
||||
pmulhrsw xm2, xm8
|
||||
packsswb xm2, xm2
|
||||
; store 4 grain bytes
movd [bufq+r5], xm2
|
||||
add r5, 4
|
||||
jl .loop_x
|
||||
add bufq, 82
|
||||
dec r7d
|
||||
jg .loop_y
|
||||
|
||||
; auto-regression code
|
||||
; the table holds 32-bit offsets relative to the table itself; add the
; table address back to form the jump target
movsxd r5, [fg_dataq+FGData.ar_coeff_lag]
|
||||
movsxd r5, [base+generate_grain_uv_420_avx2_table+r5*4]
|
||||
lea r5, [r5+base+generate_grain_uv_420_avx2_table]
|
||||
jmp r5
|
||||
|
||||
; .ar0: handler reached through the jump table for ar_coeff_lag == 0.
; No neighbouring chroma samples are read here. For each of 35 rows, 38
; bytes of buf (32 + 6) get a luma-derived term added with signed
; saturation: the luma term is the sum of a 2x2 block of bufy (two
; horizontally adjacent bytes in each of two rows 82 bytes apart), scaled
; by hmul_bits+4, multiplied by the first ar_coeffs_uv entry for this
; plane, and scaled again by the hmul_bits entry selected by ar_coeff_shift.
.ar0:
|
||||
INIT_YMM avx2
|
||||
DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
|
||||
; the code indexes ar_coeffs_uv with uv*25
imul uvd, 25
|
||||
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
|
||||
movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
|
||||
movd xm3, [base+hmul_bits+shiftq*2]
|
||||
DEFINE_ARGS buf, bufy, h
|
||||
pmovsxbw xm4, xm4
|
||||
; m7 = all-ones bytes: pmaddubsw against it adds adjacent byte pairs
vpbroadcastd m7, [pb_1]
|
||||
vpbroadcastw m6, [hmul_bits+4]
|
||||
; only word 0 (the first coefficient) is broadcast and used
vpbroadcastw m4, xm4
|
||||
vpbroadcastw m3, xm3
|
||||
; bufq was left 44+82*38 bytes past the start of buf by the noise loop;
; this rewinds it to row 3, column 3 (offset 82*3+3), matching bufy below
sub bufq, 82*38+82-(82*3+41)
|
||||
add bufyq, 3+82*3
|
||||
mov hd, 35
|
||||
.y_loop_ar0:
|
||||
; first 32 pixels
|
||||
; bytes 0-15 / 32-47 go to the two lanes of m8/m9 and bytes 16-31 / 48-63
; to m10/m11, so that the lane-wise packsswb below yields results in order
movu xm8, [bufyq]
|
||||
movu xm9, [bufyq+82]
|
||||
movu xm10, [bufyq+16]
|
||||
movu xm11, [bufyq+82+16]
|
||||
vinserti128 m8, [bufyq+32], 1
|
||||
vinserti128 m9, [bufyq+82+32], 1
|
||||
vinserti128 m10, [bufyq+48], 1
|
||||
vinserti128 m11, [bufyq+82+48], 1
|
||||
pmaddubsw m8, m7, m8
|
||||
pmaddubsw m9, m7, m9
|
||||
pmaddubsw m10, m7, m10
|
||||
pmaddubsw m11, m7, m11
|
||||
paddw m8, m9
|
||||
paddw m10, m11
|
||||
pmulhrsw m8, m6
|
||||
pmulhrsw m10, m6
|
||||
pmullw m8, m4
|
||||
pmullw m10, m4
|
||||
pmulhrsw m8, m3
|
||||
pmulhrsw m10, m3
|
||||
packsswb m8, m10
|
||||
movu m0, [bufq]
|
||||
; interleave grain bytes with luma-term bytes, then add each pair
punpckhbw m1, m0, m8
|
||||
punpcklbw m0, m8
|
||||
pmaddubsw m1, m7, m1
|
||||
pmaddubsw m0, m7, m0
|
||||
packsswb m0, m1
|
||||
movu [bufq], m0
|
||||
|
||||
; last 6 pixels
|
||||
movu xm8, [bufyq+32*2]
|
||||
movu xm9, [bufyq+32*2+82]
|
||||
pmaddubsw xm8, xm7, xm8
|
||||
pmaddubsw xm9, xm7, xm9
|
||||
paddw xm8, xm9
|
||||
pmulhrsw xm8, xm6
|
||||
pmullw xm8, xm4
|
||||
pmulhrsw xm8, xm3
|
||||
packsswb xm8, xm8
|
||||
movq xm0, [bufq+32]
|
||||
punpcklbw xm8, xm0
|
||||
pmaddubsw xm8, xm7, xm8
|
||||
packsswb xm8, xm8
|
||||
; word 3 (bytes 6-7) keeps the original buf contents, so only 6 bytes change
vpblendw xm0, xm8, xm0, 1000b
|
||||
movq [bufq+32], xm0
|
||||
|
||||
; one chroma row consumes two luma rows
add bufq, 82
|
||||
add bufyq, 82*2
|
||||
dec hd
|
||||
jg .y_loop_ar0
|
||||
RET
|
||||
|
||||
; .ar1: handler reached through the jump table for ar_coeff_lag == 1.
; Per output byte: weighted sum of top-left, top, top-right (previous row
; of buf) and the averaged luma sample, plus the left neighbour times cf3,
; plus a rounding value; shifted right by ar_coeff_shift, added to the
; existing grain byte and clamped to [-128, 127].
; The row-above/luma part is computed for 4 pixels at once in SIMD; the
; left-neighbour part is serial and done in general-purpose registers.
.ar1:
|
||||
INIT_XMM avx2
|
||||
DEFINE_ARGS buf, bufy, fg_data, uv, val3, cf3, min, max, x, shift
|
||||
imul uvd, 25
|
||||
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
|
||||
; coefficient 3 is kept in a scalar register (it multiplies val3, the left pixel)
movsx cf3d, byte [fg_dataq+FGData.ar_coeffs_uv+uvq+3]
|
||||
; bytes 0-2 = coefficients 0-2; byte 3 is overwritten with coefficient 4
movd xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq]
|
||||
pinsrb xm4, [fg_dataq+FGData.ar_coeffs_uv+uvq+4], 3
|
||||
DEFINE_ARGS buf, bufy, h, val0, val3, cf3, min, max, x, shift
|
||||
pmovsxbw xm4, xm4
|
||||
; xm5 = (cf2, cf4) pairs, xm4 = (cf0, cf1) pairs, matching the punpcklwd
; operand order used in the x loop
pshufd xm5, xm4, q1111
|
||||
pshufd xm4, xm4, q0000
|
||||
pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd
|
||||
vpbroadcastd xm7, [pb_1]
|
||||
vpbroadcastw xm6, [hmul_bits+4]
|
||||
vpbroadcastd xm3, xm3
|
||||
; bufq was left 44+82*38 bytes past the start of buf; move it to row 3,
; column 41 so that x = -38..-1 addresses columns 3..40
sub bufq, 82*38+44-(82*3+41)
|
||||
; bufy is indexed with x*2, so 79 + 2*(-38) = column 3 for the first pixel
add bufyq, 79+82*3
|
||||
mov hd, 35
|
||||
mov mind, -128
|
||||
mov maxd, 127
|
||||
.y_loop_ar1:
|
||||
mov xq, -38
|
||||
; val3 starts as the byte immediately left of the first pixel in the row
movsx val3d, byte [bufq+xq-1]
|
||||
.x_loop_ar1:
|
||||
pmovsxbw xm0, [bufq+xq-82-1] ; top/left
|
||||
movq xm8, [bufyq+xq*2]
|
||||
movq xm9, [bufyq+xq*2+82]
|
||||
psrldq xm2, xm0, 2 ; top
|
||||
psrldq xm1, xm0, 4 ; top/right
|
||||
; 2x2 luma sum (adjacent bytes in two rows), scaled by hmul_bits+4
pmaddubsw xm8, xm7, xm8
|
||||
pmaddubsw xm9, xm7, xm9
|
||||
paddw xm8, xm9
|
||||
pmulhrsw xm8, xm6
|
||||
punpcklwd xm0, xm2
|
||||
punpcklwd xm1, xm8
|
||||
pmaddwd xm0, xm4
|
||||
pmaddwd xm1, xm5
|
||||
paddd xm0, xm1
|
||||
paddd xm0, xm3
|
||||
.x_loop_ar1_inner:
|
||||
; take the next precomputed dword sum and shift the vector down
movd val0d, xm0
|
||||
psrldq xm0, 4
|
||||
imul val3d, cf3d
|
||||
add val3d, val0d
|
||||
sarx val3d, val3d, shiftd
|
||||
movsx val0d, byte [bufq+xq]
|
||||
add val3d, val0d
|
||||
cmp val3d, maxd
|
||||
cmovg val3d, maxd
|
||||
cmp val3d, mind
|
||||
cmovl val3d, mind
|
||||
mov byte [bufq+xq], val3b
|
||||
; keep val3d in-place as left for next x iteration
|
||||
inc xq
|
||||
jz .x_loop_ar1_end
|
||||
; recompute the SIMD sums whenever x reaches a multiple of 4
test xq, 3
|
||||
jnz .x_loop_ar1_inner
|
||||
jmp .x_loop_ar1
|
||||
|
||||
.x_loop_ar1_end:
|
||||
add bufq, 82
|
||||
add bufyq, 82*2
|
||||
dec hd
|
||||
jg .y_loop_ar1
|
||||
RET
|
||||
|
||||
; .ar2: handler reached through the jump table for ar_coeff_lag == 2.
; For each group of up to 4 pixels, the contributions of the two rows
; above (x = -2..+2 each) and of the averaged luma sample are summed in
; SIMD into xm2; the inner loop then adds the two left neighbours of the
; current row one pixel at a time and writes the result byte back to buf.
; Register layout after setup (word pairs, repeated across the register):
;   xm8=(cf0,cf1) xm9=(cf2,cf3) xm10=(cf4,cf5) xm11=(cf6,cf7)
;   xm12=(cf8,cf9) xm13=(cf10,cf11) xm14=(cf12,0)
.ar2:
|
||||
DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
|
||||
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
|
||||
imul uvd, 25
|
||||
; pmulhrsw factor selected by ar_coeff_shift, applied in the inner loop
movd xm15, [base+hmul_bits-10+shiftq*2]
|
||||
pmovsxbw xm8, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-7
|
||||
pmovsxbw xm9, [fg_dataq+FGData.ar_coeffs_uv+uvq+8] ; cf8-12
|
||||
DEFINE_ARGS buf, bufy, h, x
|
||||
pshufd xm12, xm9, q0000
|
||||
pshufd xm13, xm9, q1111
|
||||
pshufd xm14, xm9, q2222
|
||||
pxor xm10, xm10
|
||||
; zero the odd words of xm14 so that only cf12 remains in each pair
vpblendw xm14, xm10, 10101010b
|
||||
pshufd xm11, xm8, q3333
|
||||
pshufd xm10, xm8, q2222
|
||||
pshufd xm9, xm8, q1111
|
||||
pshufd xm8, xm8, q0000
|
||||
; same pointer setup as the lag-1 path: row 3, x = -38..-1 maps to columns 3..40
sub bufq, 82*38+44-(82*3+41)
|
||||
add bufyq, 79+82*3
|
||||
mov hd, 35
|
||||
.y_loop_ar2:
|
||||
mov xq, -38
|
||||
|
||||
.x_loop_ar2:
|
||||
pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5]
|
||||
pmovsxbw xm1, [bufq+xq-82*1-2] ; y=-1,x=[-2,+5]
|
||||
psrldq xm2, xm0, 2 ; y=-2,x=[-1,+5]
|
||||
psrldq xm3, xm1, 2 ; y=-1,x=[-1,+5]
|
||||
psrldq xm4, xm1, 4 ; y=-1,x=[+0,+5]
|
||||
punpcklwd xm2, xm0, xm2
|
||||
punpcklwd xm3, xm4
|
||||
pmaddwd xm2, xm8
|
||||
pmaddwd xm3, xm11
|
||||
paddd xm2, xm3
|
||||
|
||||
psrldq xm4, xm0, 4 ; y=-2,x=[+0,+5]
|
||||
psrldq xm5, xm0, 6 ; y=-2,x=[+1,+5]
|
||||
psrldq xm6, xm0, 8 ; y=-2,x=[+2,+5]
|
||||
punpcklwd xm4, xm5
|
||||
punpcklwd xm6, xm1
|
||||
psrldq xm7, xm1, 6 ; y=-1,x=[+1,+5]
|
||||
psrldq xm1, xm1, 8 ; y=-1,x=[+2,+5]
|
||||
punpcklwd xm7, xm1
|
||||
pmaddwd xm4, xm9
|
||||
pmaddwd xm6, xm10
|
||||
pmaddwd xm7, xm12
|
||||
paddd xm4, xm6
|
||||
paddd xm2, xm7
|
||||
paddd xm2, xm4
|
||||
|
||||
; luma term: 2x2 sum of bufy scaled by hmul_bits+4, multiplied by cf12
; (xm4 is reused as a scratch register for both constants)
vpbroadcastd xm4, [base+pb_1]
|
||||
movq xm6, [bufyq+xq*2]
|
||||
movq xm7, [bufyq+xq*2+82]
|
||||
pmaddubsw xm6, xm4, xm6
|
||||
pmaddubsw xm7, xm4, xm7
|
||||
vpbroadcastw xm4, [base+hmul_bits+4]
|
||||
paddw xm6, xm7
|
||||
pmulhrsw xm6, xm4
|
||||
pxor xm7, xm7
|
||||
punpcklwd xm6, xm7
|
||||
pmaddwd xm6, xm14
|
||||
paddd xm2, xm6
|
||||
|
||||
movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5]
|
||||
.x_loop_ar2_inner:
|
||||
pmovsxbw xm0, xm0
|
||||
; dword 0 = x[-2]*cf10 + x[-1]*cf11 (left neighbours of the current row)
pmaddwd xm3, xm0, xm13
|
||||
paddd xm3, xm2
|
||||
psrldq xm2, 4 ; shift top to next pixel
|
||||
psrad xm3, 5
|
||||
packssdw xm3, xm3
|
||||
pmulhrsw xm3, xm15
|
||||
pslldq xm3, 2
|
||||
psrldq xm0, 2
|
||||
paddw xm3, xm0
|
||||
; word 1 of xm0 becomes the updated current pixel; the row window has
; already been advanced by one pixel via the psrldq above
vpblendw xm0, xm3, 00000010b
|
||||
packsswb xm0, xm0
|
||||
pextrb [bufq+xq], xm0, 1
|
||||
inc xq
|
||||
jz .x_loop_ar2_end
|
||||
; recompute the SIMD sums whenever x reaches a multiple of 4
test xq, 3
|
||||
jnz .x_loop_ar2_inner
|
||||
jmp .x_loop_ar2
|
||||
|
||||
.x_loop_ar2_end:
|
||||
add bufq, 82
|
||||
add bufyq, 82*2
|
||||
dec hd
|
||||
jg .y_loop_ar2
|
||||
RET
|
||||
|
||||
; .ar3: handler reached through the jump table for ar_coeff_lag == 3.
; Same structure as the lag-2 path but with three rows above (x = -3..+3
; each) and three left neighbours. There are too many coefficient pairs to
; keep in registers, so they are spilled to 12 x 16 bytes of stack:
;   [rsp+ 0*16..3*16]  = (cf0,cf1) (cf2,cf3) (cf4,cf5) (cf6,cf7)
;   [rsp+ 4*16..7*16]  = (cf8,cf9) ... (cf14,cf15)
;   [rsp+ 8*16..9*16]  = (cf16,cf17) (cf18,cf19)
;   [rsp+10*16]        = (cf20, cf24) with cf24 being the luma coefficient
;   [rsp+11*16]        = cf21, cf22, cf23, then a round_vals entry in word 3
.ar3:
|
||||
DEFINE_ARGS buf, bufy, fg_data, uv, unused, shift
|
||||
; extra stack for this path only; the stack_size symbols are bumped to
; match (presumably so the RET macro releases it again - confirm against x86inc)
SUB rsp, 16*12
|
||||
%assign stack_size_padded (stack_size_padded+16*12)
|
||||
%assign stack_size (stack_size+16*12)
|
||||
mov shiftd, [fg_dataq+FGData.ar_coeff_shift]
|
||||
imul uvd, 25
|
||||
; pmulhrsw factor selected by ar_coeff_shift, applied in the inner loop
movd xm14, [base+hmul_bits-10+shiftq*2]
|
||||
pmovsxbw xm0, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 0] ; cf0-7
|
||||
pmovsxbw xm1, [fg_dataq+FGData.ar_coeffs_uv+uvq+ 8] ; cf8-15
|
||||
pmovsxbw xm2, [fg_dataq+FGData.ar_coeffs_uv+uvq+16] ; cf16-23
|
||||
pmovsxbw xm5, [fg_dataq+FGData.ar_coeffs_uv+uvq+24] ; cf24 [luma]
|
||||
pshufd xm9, xm0, q1111
|
||||
pshufd xm10, xm0, q2222
|
||||
pshufd xm11, xm0, q3333
|
||||
pshufd xm0, xm0, q0000
|
||||
pshufd xm6, xm1, q1111
|
||||
pshufd xm7, xm1, q2222
|
||||
pshufd xm8, xm1, q3333
|
||||
pshufd xm1, xm1, q0000
|
||||
pshufd xm3, xm2, q1111
|
||||
pshufd xm4, xm2, q2222
|
||||
vpbroadcastw xm5, xm5
|
||||
vpblendw xm4, xm5, 10101010b ; interleave luma cf
|
||||
; xm5 = cf21, cf22, cf23 in words 0-2; word 3 is then replaced by the rounding value
psrldq xm5, xm2, 10
|
||||
pshufd xm2, xm2, q0000
|
||||
pinsrw xm5, [base+round_vals+shiftq*2-10], 3
|
||||
mova [rsp+ 0*16], xm0
|
||||
mova [rsp+ 1*16], xm9
|
||||
mova [rsp+ 2*16], xm10
|
||||
mova [rsp+ 3*16], xm11
|
||||
mova [rsp+ 4*16], xm1
|
||||
mova [rsp+ 5*16], xm6
|
||||
mova [rsp+ 6*16], xm7
|
||||
mova [rsp+ 7*16], xm8
|
||||
mova [rsp+ 8*16], xm2
|
||||
mova [rsp+ 9*16], xm3
|
||||
mova [rsp+10*16], xm4
|
||||
mova [rsp+11*16], xm5
|
||||
; xm13 / xm15 stay live across the loops: byte-pair adder and luma scale
vpbroadcastd xm13, [base+pb_1]
|
||||
vpbroadcastw xm15, [base+hmul_bits+4]
|
||||
DEFINE_ARGS buf, bufy, h, x
|
||||
; same pointer setup as the lag-1/lag-2 paths: row 3, x = -38..-1 maps to columns 3..40
sub bufq, 82*38+44-(82*3+41)
|
||||
add bufyq, 79+82*3
|
||||
mov hd, 35
|
||||
.y_loop_ar3:
|
||||
mov xq, -38
|
||||
|
||||
.x_loop_ar3:
|
||||
movu xm0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12]
|
||||
movu xm1, [bufq+xq-82*2-3] ; y=-2,x=[-3,+12]
|
||||
movu xm2, [bufq+xq-82*1-3] ; y=-1,x=[-3,+12]
|
||||
; sign-extend bytes to words by interleaving with a (0 > x) compare mask;
; high halves go to xm3/xm4/xm5, low halves stay in xm0/xm1/xm2
pxor xm3, xm3
|
||||
pcmpgtb xm6, xm3, xm2
|
||||
pcmpgtb xm5, xm3, xm1
|
||||
pcmpgtb xm4, xm3, xm0
|
||||
punpckhbw xm3, xm0, xm4
|
||||
punpcklbw xm0, xm4
|
||||
punpckhbw xm4, xm1, xm5
|
||||
punpcklbw xm1, xm5
|
||||
punpckhbw xm5, xm2, xm6
|
||||
punpcklbw xm2, xm6
|
||||
|
||||
; row y=-3 (plus the first sample of y=-2) against cf0-cf7
psrldq xm6, xm0, 2
|
||||
psrldq xm7, xm0, 4
|
||||
psrldq xm8, xm0, 6
|
||||
psrldq xm9, xm0, 8
|
||||
palignr xm10, xm3, xm0, 10
|
||||
palignr xm11, xm3, xm0, 12
|
||||
|
||||
punpcklwd xm0, xm6
|
||||
punpcklwd xm7, xm8
|
||||
punpcklwd xm9, xm10
|
||||
punpcklwd xm11, xm1
|
||||
pmaddwd xm0, [rsp+ 0*16]
|
||||
pmaddwd xm7, [rsp+ 1*16]
|
||||
pmaddwd xm9, [rsp+ 2*16]
|
||||
pmaddwd xm11, [rsp+ 3*16]
|
||||
paddd xm0, xm7
|
||||
paddd xm9, xm11
|
||||
paddd xm0, xm9
|
||||
|
||||
; remainder of row y=-2 and the first two samples of y=-1 against cf8-cf15
psrldq xm6, xm1, 2
|
||||
psrldq xm7, xm1, 4
|
||||
psrldq xm8, xm1, 6
|
||||
psrldq xm9, xm1, 8
|
||||
palignr xm10, xm4, xm1, 10
|
||||
palignr xm11, xm4, xm1, 12
|
||||
psrldq xm12, xm2, 2
|
||||
|
||||
punpcklwd xm6, xm7
|
||||
punpcklwd xm8, xm9
|
||||
punpcklwd xm10, xm11
|
||||
punpcklwd xm12, xm2, xm12
|
||||
pmaddwd xm6, [rsp+ 4*16]
|
||||
pmaddwd xm8, [rsp+ 5*16]
|
||||
pmaddwd xm10, [rsp+ 6*16]
|
||||
pmaddwd xm12, [rsp+ 7*16]
|
||||
paddd xm6, xm8
|
||||
paddd xm10, xm12
|
||||
paddd xm6, xm10
|
||||
paddd xm0, xm6
|
||||
|
||||
; remainder of row y=-1 against cf16-cf20; the last sample is paired with luma
psrldq xm6, xm2, 4
|
||||
psrldq xm7, xm2, 6
|
||||
psrldq xm8, xm2, 8
|
||||
palignr xm9, xm5, xm2, 10
|
||||
palignr xm5, xm5, xm2, 12
|
||||
|
||||
; luma term: 2x2 sum of bufy scaled by hmul_bits+4
movq xm1, [bufyq+xq*2]
|
||||
movq xm2, [bufyq+xq*2+82]
|
||||
pmaddubsw xm1, xm13, xm1
|
||||
pmaddubsw xm2, xm13, xm2
|
||||
paddw xm1, xm2
|
||||
vpbroadcastw xm3, xm15
|
||||
pmulhrsw xm1, xm3
|
||||
|
||||
punpcklwd xm6, xm7
|
||||
punpcklwd xm8, xm9
|
||||
punpcklwd xm5, xm1
|
||||
pmaddwd xm6, [rsp+ 8*16]
|
||||
pmaddwd xm8, [rsp+ 9*16]
|
||||
pmaddwd xm5, [rsp+10*16]
|
||||
paddd xm0, xm6
|
||||
paddd xm8, xm5
|
||||
paddd xm0, xm8
|
||||
|
||||
movq xm1, [bufq+xq-3] ; y=0,x=[-3,+4]
|
||||
.x_loop_ar3_inner:
|
||||
pmovsxbw xm1, xm1
|
||||
pmaddwd xm2, xm1, [rsp+16*11]
|
||||
pshufd xm3, xm2, q1111
|
||||
paddd xm2, xm3 ; left+cur
|
||||
paddd xm2, xm0 ; add top
|
||||
; advance the precomputed top sums to the next pixel
psrldq xm0, 4
|
||||
psrad xm2, 5
|
||||
packssdw xm2, xm2
|
||||
pmulhrsw xm2, xm14
|
||||
; move the result into word 3 (the current-pixel slot of xm1) and store its byte
pslldq xm2, 6
|
||||
vpblendw xm1, xm2, 1000b
|
||||
packsswb xm1, xm1
|
||||
pextrb [bufq+xq], xm1, 3
|
||||
; slide the current-row window one pixel to the right
psrldq xm1, 1
|
||||
inc xq
|
||||
jz .x_loop_ar3_end
|
||||
; recompute the SIMD sums whenever x reaches a multiple of 4
test xq, 3
|
||||
jnz .x_loop_ar3_inner
|
||||
jmp .x_loop_ar3
|
||||
|
||||
.x_loop_ar3_end:
|
||||
add bufq, 82
|
||||
add bufyq, 82*2
|
||||
dec hd
|
||||
jg .y_loop_ar3
|
||||
RET
|
||||
|
||||
INIT_YMM avx2
|
||||
cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut
|
||||
pcmpeqw m10, m10
|
||||
|
|
|
@ -29,6 +29,7 @@
|
|||
#include "src/film_grain.h"
|
||||
|
||||
decl_generate_grain_y_fn(dav1d_generate_grain_y_avx2);
|
||||
decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_avx2);
|
||||
decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_avx2);
|
||||
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_avx2);
|
||||
|
||||
|
@ -39,6 +40,7 @@ COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c
|
|||
|
||||
#if BITDEPTH == 8 && ARCH_X86_64
|
||||
c->generate_grain_y = dav1d_generate_grain_y_avx2;
|
||||
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_generate_grain_uv_420_avx2;
|
||||
c->fgy_32x32xn = dav1d_fgy_32x32xn_avx2;
|
||||
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_avx2;
|
||||
#endif
|
||||
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -39,6 +39,7 @@ decl_angular_ipred_fn(dav1d_ipred_smooth_avx2);
|
|||
decl_angular_ipred_fn(dav1d_ipred_smooth_v_avx2);
|
||||
decl_angular_ipred_fn(dav1d_ipred_smooth_h_avx2);
|
||||
decl_angular_ipred_fn(dav1d_ipred_z1_avx2);
|
||||
decl_angular_ipred_fn(dav1d_ipred_z2_avx2);
|
||||
decl_angular_ipred_fn(dav1d_ipred_z3_avx2);
|
||||
decl_angular_ipred_fn(dav1d_ipred_filter_avx2);
|
||||
|
||||
|
@ -119,6 +120,7 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c
|
|||
c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_avx2;
|
||||
c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_avx2;
|
||||
c->intra_pred[Z1_PRED] = dav1d_ipred_z1_avx2;
|
||||
c->intra_pred[Z2_PRED] = dav1d_ipred_z2_avx2;
|
||||
c->intra_pred[Z3_PRED] = dav1d_ipred_z3_avx2;
|
||||
c->intra_pred[FILTER_PRED] = dav1d_ipred_filter_avx2;
|
||||
|
||||
|
|
|
@ -50,7 +50,6 @@ pw_2482_3344: dw 2482, 3344
|
|||
pw_m3344_3344: dw -3344, 3344
|
||||
pw_m3803_3344: dw -3803, 3344
|
||||
pw_m3803_m6688: dw -3803, -6688
|
||||
COEF_PAIR 2896, 2896
|
||||
pw_2896_m2896: dw 2896, -2896
|
||||
|
||||
pw_5: times 2 dw 5
|
||||
|
@ -63,6 +62,7 @@ pw_5793x4: times 2 dw 5793*4
|
|||
|
||||
pd_2048: dd 2048
|
||||
|
||||
COEF_PAIR 2896, 2896
|
||||
COEF_PAIR 1567, 3784
|
||||
COEF_PAIR 3784, 1567
|
||||
COEF_PAIR 201, 4091
|
||||
|
@ -194,7 +194,7 @@ SECTION .text
|
|||
|
||||
; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
|
||||
; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
|
||||
%macro ITX_MULSUB_2W 7 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2]
|
||||
%macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2
|
||||
punpckhwd m%3, m%2, m%1
|
||||
punpcklwd m%2, m%1
|
||||
%if %7 < 32
|
||||
|
@ -222,20 +222,20 @@ SECTION .text
|
|||
paddd m%2, m%5
|
||||
psrad m%3, 12
|
||||
psrad m%2, 12
|
||||
%if %0 == 8
|
||||
packssdw m%8, m%2, m%3
|
||||
%else
|
||||
packssdw m%2, m%3
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048
|
||||
ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784 ; t2, t3
|
||||
vpbroadcastd m%6, [o(pw_2896x8)]
|
||||
paddw m%5, m%1, m%3
|
||||
psubw m%1, m%3
|
||||
pmulhrsw m%1, m%6 ; t1
|
||||
pmulhrsw m%5, m%6 ; t0
|
||||
ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784, %5 ; t2, t3
|
||||
ITX_MULSUB_2W %1, %3, %4, %6, %7, 2896, 2896, %4 ; t1, t0
|
||||
psubsw m%3, m%1, m%2
|
||||
paddsw m%2, m%1
|
||||
paddsw m%1, m%5, m%4
|
||||
psubsw m%4, m%5, m%4
|
||||
paddsw m%1, m%4, m%5
|
||||
psubsw m%4, m%5
|
||||
%endmacro
|
||||
|
||||
%macro IDCT8_1D 11 ; src[1-8], tmp[1-2], pd_2048
|
||||
|
@ -246,27 +246,20 @@ SECTION .text
|
|||
psubsw m%2, m%6 ; t5a
|
||||
paddsw m%10, m%8, m%4 ; t7
|
||||
psubsw m%8, m%4 ; t6a
|
||||
vpbroadcastd m%4, [o(pw_2896x8)]
|
||||
psubw m%6, m%1, m%5
|
||||
paddw m%1, m%5
|
||||
psubw m%5, m%8, m%2
|
||||
paddw m%8, m%2
|
||||
pmulhrsw m%1, m%4 ; t0
|
||||
pmulhrsw m%6, m%4 ; t1
|
||||
pmulhrsw m%8, m%4 ; t6
|
||||
pmulhrsw m%5, m%4 ; t5
|
||||
psubsw m%4, m%1, m%7 ; dct4 out3
|
||||
paddsw m%1, m%7 ; dct4 out0
|
||||
paddsw m%7, m%6, m%3 ; dct4 out1
|
||||
psubsw m%6, m%3 ; dct4 out2
|
||||
paddsw m%2, m%7, m%8 ; out1
|
||||
psubsw m%7, m%8 ; out6
|
||||
ITX_MULSUB_2W %1, %5, %4, %6, %11, 2896, 2896 ; t1, t0
|
||||
ITX_MULSUB_2W %8, %2, %4, %6, %11, 2896, 2896 ; t5, t6
|
||||
psubsw m%6, m%1, m%3 ; dct4 out2
|
||||
paddsw m%3, m%1 ; dct4 out1
|
||||
paddsw m%1, m%5, m%7 ; dct4 out0
|
||||
psubsw m%5, m%7 ; dct4 out3
|
||||
psubsw m%7, m%3, m%2 ; out6
|
||||
paddsw m%2, m%3 ; out1
|
||||
paddsw m%3, m%6, m%8 ; out2
|
||||
psubsw m%6, m%8 ; out5
|
||||
psubsw m%8, m%1, m%10 ; out7
|
||||
paddsw m%1, m%10 ; out0
|
||||
paddsw m%3, m%6, m%5 ; out2
|
||||
psubsw m%6, m%5 ; out5
|
||||
psubsw m%5, m%4, m%9 ; out4
|
||||
paddsw m%4, m%9 ; out3
|
||||
paddsw m%4, m%5, m%9 ; out3
|
||||
psubsw m%5, m%9 ; out4
|
||||
%endmacro
|
||||
|
||||
; in1 = %1, in3 = %2, in5 = %3, in7 = %4
|
||||
|
@ -286,20 +279,16 @@ SECTION .text
|
|||
paddsw m%1, m%5 ; t8
|
||||
ITX_MULSUB_2W %2, %7, %5, %10, %11, 1567, 3784 ; t9a, t14a
|
||||
ITX_MULSUB_2W %9, %4, %5, %10, %11, m3784, 1567 ; t10a, t13a
|
||||
vpbroadcastd m%10, [o(pw_2896x8)]
|
||||
psubsw m%5, m%2, m%9 ; t10
|
||||
paddsw m%2, m%9 ; t9
|
||||
psubsw m%9, m%1, m%3 ; t11a
|
||||
psubsw m%5, m%1, m%3 ; t11a
|
||||
paddsw m%1, m%3 ; t8a
|
||||
psubsw m%3, m%7, m%4 ; t13
|
||||
paddsw m%7, m%4 ; t14
|
||||
psubsw m%4, m%8, m%6 ; t12a
|
||||
paddsw m%8, m%6 ; t15a
|
||||
paddw m%6, m%3, m%5 ; t13a
|
||||
psubw m%3, m%5 ; t10a
|
||||
paddw m%5, m%4, m%9 ; t12
|
||||
psubw m%4, m%9 ; t11
|
||||
REPX {pmulhrsw x, m%10}, m%6, m%3, m%5, m%4
|
||||
psubsw m%6, m%2, m%9 ; t10
|
||||
paddsw m%2, m%9 ; t9
|
||||
ITX_MULSUB_2W %3, %6, %9, %10, %11, 2896, 2896 ; t10a, t13a
|
||||
ITX_MULSUB_2W %4, %5, %9, %10, %11, 2896, 2896 ; t11, t12
|
||||
%endmacro
|
||||
|
||||
%macro WRAP_XMM 1+
|
||||
|
@ -446,21 +435,14 @@ ALIGN function_align
|
|||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro IDCT4_1D_PACKED 0-1 ; pw_2896x8
|
||||
%macro IDCT4_1D_PACKED 0
|
||||
vpbroadcastd m4, [o(pd_2048)]
|
||||
punpckhwd m2, m1, m0
|
||||
psubw m3, m0, m1
|
||||
paddw m0, m1
|
||||
punpcklqdq m0, m3
|
||||
ITX_MUL2X_PACK 2, 1, 3, 4, 1567, 3784
|
||||
%if %0 == 1
|
||||
pmulhrsw m0, m%1
|
||||
%else
|
||||
vpbroadcastd m4, [o(pw_2896x8)]
|
||||
pmulhrsw m0, m4 ; t0 t1
|
||||
%endif
|
||||
psubsw m1, m0, m2 ; out3 out2
|
||||
paddsw m0, m2 ; out0 out1
|
||||
punpcklwd m1, m0
|
||||
ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784
|
||||
ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896
|
||||
paddsw m0, m1, m2 ; out0 out1
|
||||
psubsw m1, m2 ; out3 out2
|
||||
%endmacro
|
||||
|
||||
%macro IADST4_1D_PACKED 0
|
||||
|
@ -683,30 +665,30 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
|
|||
vpbroadcastd m6, [o(pd_2048)]
|
||||
punpckhwd m5, m3, m0 ; in7 in1
|
||||
punpckhwd m4, m1, m2 ; in3 in5
|
||||
punpcklwd m3, m1 ; in2 in6
|
||||
psubw m1, m0, m2
|
||||
paddw m0, m2
|
||||
punpcklqdq m0, m1 ; in0+in4 in0-in4
|
||||
ITX_MUL2X_PACK 5, 1, 2, 6, 799, 4017, 1 ; t4a t7a
|
||||
ITX_MUL2X_PACK 4, 1, 2, 6, 3406, 2276, 1 ; t5a t6a
|
||||
ITX_MUL2X_PACK 3, 1, 2, 6, 1567, 3784 ; t3 t2
|
||||
vpbroadcastd m6, [o(pw_2896x8)]
|
||||
psubsw m2, m5, m4 ; t4 t7
|
||||
paddsw m5, m4 ; t5a t6a
|
||||
pshufd m4, m2, q1032
|
||||
psubw m1, m2, m4
|
||||
paddw m4, m2
|
||||
vpblendd m4, m4, m1, 0xcc
|
||||
pmulhrsw m0, m6 ; t0 t1
|
||||
pmulhrsw m4, m6 ; t6 t5
|
||||
psubsw m1, m0, m3 ; tmp3 tmp2
|
||||
paddsw m0, m3 ; tmp0 tmp1
|
||||
shufps m2, m5, m4, q1032 ; t7 t6
|
||||
vpblendd m5, m5, m4, 0xcc ; t4 t5
|
||||
psubsw m3, m0, m2 ; out7 out6
|
||||
paddsw m0, m2 ; out0 out1
|
||||
psubsw m2, m1, m5 ; out4 out5
|
||||
paddsw m1, m5 ; out3 out2
|
||||
punpcklwd m3, m1 ; in6 in2
|
||||
punpcklwd m2, m0 ; in4 in0
|
||||
ITX_MUL2X_PACK 5, 0, 1, 6, 799, 4017, 3 ; t4a t7a
|
||||
ITX_MUL2X_PACK 4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a
|
||||
ITX_MUL2X_PACK 3, 0, 1, 6, 1567, 3784 ; t3 t2
|
||||
psubsw m0, m5, m4 ; t5a t6a (interleaved)
|
||||
paddsw m4, m5 ; t4 t7 (interleaved)
|
||||
ITX_MUL2X_PACK 2, 1, 5, 6, 2896, 2896 ; t0 t1
|
||||
vpbroadcastd m1, [o(pw_m2896_2896)]
|
||||
ITX_MUL2X_PACK 0, 1, _, 6, 1, 5, 4 ; t6 t5
|
||||
%if mmsize > 16
|
||||
vbroadcasti128 m1, [o(deint_shuf)]
|
||||
pshufb m4, m1
|
||||
%else
|
||||
pshufb m4, [o(deint_shuf)]
|
||||
%endif
|
||||
psubsw m1, m2, m3 ; tmp3 tmp2
|
||||
paddsw m3, m2 ; tmp0 tmp1
|
||||
shufps m2, m4, m0, q1032 ; t7 t6
|
||||
vpblendd m4, m0, 0xcc ; t4 t5
|
||||
paddsw m0, m3, m2 ; out0 out1
|
||||
psubsw m3, m2 ; out7 out6
|
||||
psubsw m2, m1, m4 ; out4 out5
|
||||
paddsw m1, m4 ; out3 out2
|
||||
%endmacro
|
||||
|
||||
%macro IADST8_1D_PACKED 1 ; pass
|
||||
|
@ -797,10 +779,10 @@ INV_TXFM_4X8_FN dct, flipadst
|
|||
cglobal idct_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
|
||||
vpermq m0, [cq+32*0], q3120
|
||||
vpermq m1, [cq+32*1], q3120
|
||||
vpbroadcastd m5, [o(pw_2896x8)]
|
||||
pmulhrsw m0, m5
|
||||
pmulhrsw m1, m5
|
||||
IDCT4_1D_PACKED 5
|
||||
vpbroadcastd m2, [o(pw_2896x8)]
|
||||
pmulhrsw m0, m2
|
||||
pmulhrsw m1, m2
|
||||
IDCT4_1D_PACKED
|
||||
vbroadcasti128 m2, [o(deint_shuf)]
|
||||
shufps m3, m0, m1, q1331
|
||||
shufps m0, m0, m1, q0220
|
||||
|
@ -1011,9 +993,7 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
|
|||
vpbroadcastd m10, [o(pd_2048)]
|
||||
.main2:
|
||||
punpckhwd m8, m7, m0 ; dct16 in15 in1
|
||||
paddw m9, m0, m4
|
||||
psubw m0, m4
|
||||
punpcklqdq m9, m0 ; dct4 in0+in2 in0-in2
|
||||
punpcklwd m9, m4, m0 ; dct4 in2 in0
|
||||
punpckhwd m0, m3, m4 ; dct16 in7 in9
|
||||
punpcklwd m7, m1 ; dct8 in7 in1
|
||||
punpckhwd m1, m6 ; dct16 in3 in13
|
||||
|
@ -1024,47 +1004,44 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
|
|||
ITX_MUL2X_PACK 0, 2, 4, 10, 3166, 2598, 3 ; t9a t14a
|
||||
ITX_MUL2X_PACK 1, 2, 4, 10, 3920, 1189, 3 ; t11a t12a
|
||||
ITX_MUL2X_PACK 5, 2, 4, 10, 1931, 3612, 3 ; t10a t13a
|
||||
ITX_MUL2X_PACK 7, 2, 4, 10, 799, 4017, 1 ; t4a t7a
|
||||
ITX_MUL2X_PACK 3, 2, 4, 10, 3406, 2276, 1 ; t5a t6a
|
||||
ITX_MUL2X_PACK 7, 2, 4, 10, 799, 4017, 3 ; t4a t7a
|
||||
ITX_MUL2X_PACK 3, 2, 4, 10, 3406, 2276, 3 ; t5a t6a
|
||||
ITX_MUL2X_PACK 6, 2, 4, 10, 1567, 3784 ; t3 t2
|
||||
psubsw m2, m8, m0 ; t9 t14
|
||||
paddsw m8, m0 ; t8 t15
|
||||
psubsw m0, m1, m5 ; t10 t13
|
||||
paddsw m1, m5 ; t11 t12
|
||||
%if mmsize > 16
|
||||
vbroadcasti128 m5, [o(deint_shuf)]
|
||||
%else
|
||||
mova m5, [o(deint_shuf)]
|
||||
%endif
|
||||
pshufb m8, m5
|
||||
pshufb m1, m5
|
||||
vpbroadcastd m5, [o(pw_m3784_1567)] ; reuse pw_1567_3784
|
||||
ITX_MUL2X_PACK 2, 4, _, 10, 4, 5, 4 ; t9a t14a
|
||||
ITX_MUL2X_PACK 2, 4, _, 10, 4, 5, 6 ; t9a t14a
|
||||
vpbroadcastd m4, [o(pw_m1567_m3784)] ; reuse pw_m3784_1567
|
||||
ITX_MUL2X_PACK 0, 5, _, 10, 5, 4, 4 ; t10a t13a
|
||||
psubsw m5, m7, m3 ; t5a t6a
|
||||
paddsw m7, m3 ; t4 t7
|
||||
ITX_MUL2X_PACK 0, 5, _, 10, 5, 4, 6 ; t10a t13a
|
||||
psubsw m4, m8, m1 ; t11a t12a
|
||||
paddsw m8, m1 ; t8a t15a
|
||||
paddsw m1, m2, m0 ; t9 t14
|
||||
psubsw m1, m7, m3 ; t5a t6a
|
||||
paddsw m7, m3 ; t4 t7
|
||||
paddsw m3, m2, m0 ; t9 t14
|
||||
psubsw m2, m0 ; t10 t13
|
||||
punpckhqdq m0, m8, m1 ; t15a t14
|
||||
punpcklqdq m8, m1 ; t8a t9
|
||||
pshufd m3, m5, q1032
|
||||
psubw m1, m5, m3
|
||||
paddw m3, m5
|
||||
vpblendd m3, m3, m1, 0xcc ; t6 t5
|
||||
vpbroadcastd m1, [o(pw_2896x8)]
|
||||
punpckhqdq m5, m4, m2 ; t12a t13
|
||||
punpcklqdq m2, m4, m2 ; t11a t10
|
||||
psubw m4, m5, m2
|
||||
paddw m5, m2
|
||||
pmulhrsw m9, m1 ; t0 t1
|
||||
pmulhrsw m3, m1 ; t6 t5
|
||||
pmulhrsw m4, m1 ; t11 t10a
|
||||
pmulhrsw m5, m1 ; t12 t13a
|
||||
shufps m2, m7, m3, q1032 ; t7 t6
|
||||
vpblendd m7, m7, m3, 0xcc ; t4 t5
|
||||
%if mmsize > 16
|
||||
vbroadcasti128 m0, [o(deint_shuf)]
|
||||
%else
|
||||
mova m0, [o(deint_shuf)]
|
||||
%endif
|
||||
pshufb m8, m0
|
||||
pshufb m7, m0
|
||||
pshufb m3, m0
|
||||
ITX_MUL2X_PACK 9, 0, 5, 10, 2896, 2896 ; t0 t1
|
||||
vpbroadcastd m0, [o(pw_m2896_2896)]
|
||||
ITX_MUL2X_PACK 4, 5, _, 10, 5, 0, 4 ; t11 t12
|
||||
vpbroadcastd m5, [o(pw_2896_2896)]
|
||||
ITX_MUL2X_PACK 1, 0, _, 10, 0, 5, 4 ; t6 t5
|
||||
vpbroadcastd m0, [o(pw_m2896_2896)]
|
||||
ITX_MUL2X_PACK 2, 0, _, 10, 0, 5, 4, ; t13a t10a
|
||||
punpckhqdq m0, m8, m3 ; t15a t14
|
||||
punpcklqdq m8, m3 ; t8a t9
|
||||
shufps m5, m4, m2, q1032 ; t12 t13a
|
||||
vpblendd m4, m2, 0xcc ; t11 t10a
|
||||
shufps m2, m7, m1, q1032 ; t7 t6
|
||||
vpblendd m7, m1, 0xcc ; t4 t5
|
||||
psubsw m1, m9, m6 ; dct4 out3 out2
|
||||
paddsw m9, m6 ; dct4 out0 out1
|
||||
psubsw m3, m9, m2 ; dct8 out7 out6
|
||||
|
@ -3699,12 +3676,11 @@ ALIGN function_align
|
|||
paddsw m6, m11 ; t17 t30
|
||||
psubsw m11, m0, m14 ; t21 t26
|
||||
paddsw m0, m14 ; t22 t25
|
||||
ITX_MUL2X_PACK 15, 12, 14, 10, 1567, 3784, 1 ; t18a t29a
|
||||
ITX_MUL2X_PACK 13, 12, 14, 10, 1567, 3784, 1 ; t19 t28
|
||||
ITX_MUL2X_PACK 9, 12, 14, 10, m3784, 1567, 1 ; t20 t27
|
||||
ITX_MUL2X_PACK 11, 12, 14, 10, m3784, 1567, 1 ; t21a t26a
|
||||
ITX_MUL2X_PACK 15, 12, 14, 10, 1567, 3784, 3 ; t18a t29a
|
||||
ITX_MUL2X_PACK 13, 12, 14, 10, 1567, 3784, 3 ; t19 t28
|
||||
ITX_MUL2X_PACK 9, 12, 14, 10, m3784, 1567, 3 ; t20 t27
|
||||
ITX_MUL2X_PACK 11, 12, 14, 10, m3784, 1567, 3 ; t21a t26a
|
||||
vbroadcasti128 m12, [o(deint_shuf)]
|
||||
REPX {pshufb x, m12}, m0, m1, m6, m8
|
||||
psubsw m14, m1, m8 ; t23 t24
|
||||
paddsw m1, m8 ; t16 t31
|
||||
psubsw m8, m6, m0 ; t22a t25a
|
||||
|
@ -3713,16 +3689,18 @@ ALIGN function_align
|
|||
paddsw m15, m11 ; t18 t29
|
||||
psubsw m11, m13, m9 ; t20a t27a
|
||||
paddsw m13, m9 ; t19a t28a
|
||||
vpbroadcastd m12, [o(pw_2896x8)]
|
||||
punpcklqdq m9, m11, m0 ; t20a t21
|
||||
punpckhqdq m11, m0 ; t27a t26
|
||||
punpcklqdq m0, m14, m8 ; t23 t22a
|
||||
punpckhqdq m14, m8 ; t24 t25a
|
||||
psubw m8, m11, m9 ; t20 t21a
|
||||
paddw m11, m9 ; t27 t26a
|
||||
psubw m9, m14, m0 ; t23a t22
|
||||
paddw m14, m0 ; t24a t25
|
||||
REPX {pmulhrsw x, m12}, m8, m9, m14, m11
|
||||
REPX {pshufb x, m12}, m1, m6, m15, m13
|
||||
ITX_MUL2X_PACK 14, 9, 12, 10, 2896, 2896 ; t24a t23a
|
||||
vpbroadcastd m9, [o(pw_m2896_2896)]
|
||||
ITX_MUL2X_PACK 8, 12, _, 10, 12, 9, 4 ; t22 t25
|
||||
vpbroadcastd m12, [o(pw_2896_2896)]
|
||||
ITX_MUL2X_PACK 0, 12, _, 10, 12, 9, 4 ; t21a t26a
|
||||
vpbroadcastd m12, [o(pw_2896_2896)]
|
||||
ITX_MUL2X_PACK 11, 9, _, 10, 9, 12, 4 ; t27 t20
|
||||
shufps m9, m14, m8, q1032 ; t23a t22
|
||||
vpblendd m14, m8, 0xcc ; t24a t25
|
||||
shufps m8, m11, m0, q1032 ; t20 t21a
|
||||
vpblendd m11, m0, 0xcc ; t27 t26a
|
||||
punpcklqdq m0, m1, m6 ; t16 t17a
|
||||
punpckhqdq m1, m6 ; t31 t30a
|
||||
psubsw m10, m5, m8 ; out20 out21
|
||||
|
@ -4327,33 +4305,29 @@ ALIGN function_align
|
|||
mova m5, [rsp+gprsize+32*0] ; t22
|
||||
mova m6, [rsp+gprsize+32*1] ; t23
|
||||
mova m3, [rsp+gprsize+32*2] ; t24a
|
||||
vpbroadcastd m8, [o(pw_2896x8)]
|
||||
psubsw m1, m14, m5 ; t22a
|
||||
paddsw m14, m5 ; t17a
|
||||
psubsw m5, m0, m6 ; t23
|
||||
paddsw m0, m6 ; t16
|
||||
psubsw m6, m4, m3 ; t24
|
||||
paddsw m4, m3 ; t31
|
||||
vpbroadcastd m8, [o(pw_m2896_2896)]
|
||||
vpbroadcastd m3, [o(pw_2896_2896)]
|
||||
mova [tmp1q-32*4], m0
|
||||
mova [tmp1q-32*3], m14
|
||||
mova [tmp2q+32*3], m4
|
||||
psubw m3, m13, m9 ; t20
|
||||
paddw m13, m9 ; t27
|
||||
psubw m9, m2, m10 ; t21a
|
||||
paddw m2, m10 ; t26a
|
||||
psubw m10, m7, m1 ; t22
|
||||
paddw m7, m1 ; t25
|
||||
psubw m1, m6, m5 ; t23a
|
||||
paddw m6, m5 ; t24a
|
||||
REPX {pmulhrsw x, m8}, m3, m13, m9, m2, m10, m7, m1, m6
|
||||
mova [tmp1q+32*0], m3
|
||||
mova [tmp1q+32*1], m9
|
||||
mova [tmp1q+32*2], m10
|
||||
mova [tmp1q+32*3], m1
|
||||
mova [tmp2q-32*4], m6
|
||||
mova [tmp2q-32*3], m7
|
||||
mova [tmp2q-32*2], m2
|
||||
mova [tmp2q-32*1], m13
|
||||
ITX_MULSUB_2W 13, 9, 0, 4, 15, 3, 8 ; t20, t27
|
||||
ITX_MULSUB_2W 2, 10, 0, 4, 15, 3, 8 ; t21a, t26a
|
||||
ITX_MULSUB_2W 7, 1, 0, 4, 15, 3, 8 ; t22, t25
|
||||
ITX_MULSUB_2W 6, 5, 0, 4, 15, 3, 8 ; t23a, t24a
|
||||
mova [tmp1q+32*0], m13
|
||||
mova [tmp1q+32*1], m2
|
||||
mova [tmp1q+32*2], m7
|
||||
mova [tmp1q+32*3], m6
|
||||
mova [tmp2q-32*4], m5
|
||||
mova [tmp2q-32*3], m1
|
||||
mova [tmp2q-32*2], m10
|
||||
mova [tmp2q-32*1], m9
|
||||
ret
|
||||
ALIGN function_align
|
||||
.transpose_2x8x8_round:
|
||||
|
@ -5237,11 +5211,10 @@ ALIGN function_align
|
|||
sub rax, o_idct64_offset + 8
|
||||
vpbroadcastd m11, [o(pw_1567_3784)]
|
||||
vpbroadcastd m12, [o(pw_m3784_1567)]
|
||||
vpbroadcastd m13, [o(pw_m1567_m3784)]
|
||||
vpbroadcastd m14, [o(pw_2896x8)]
|
||||
vpbroadcastd m13, [o(pw_2896_2896)]
|
||||
vpbroadcastd m14, [o(pw_m2896_2896)]
|
||||
.main_part2_pass1_loop:
|
||||
call .main_part2_internal
|
||||
REPX {pmulhrsw x, m14}, m1, m2, m4, m3
|
||||
IDCT64_PART2_END 0, 7, 0, 6, 9, 10
|
||||
IDCT64_PART2_END 7, 8, 5, 0, 6, 7
|
||||
IDCT64_PART2_END 8, 2, 1, 0, 6, 7
|
||||
|
@ -5251,53 +5224,51 @@ ALIGN function_align
|
|||
ret
|
||||
.main_part2_internal:
|
||||
mova m0, [tmp1q-32*12] ; t32a
|
||||
mova m1, [tmp2q-32*13] ; t39a
|
||||
mova m2, [tmp1q-32* 4] ; t40a
|
||||
mova m6, [tmp2q-32*13] ; t39a
|
||||
mova m1, [tmp1q-32* 4] ; t40a
|
||||
mova m5, [tmp2q+32* 3] ; t55a
|
||||
add tmp1q, 32
|
||||
sub tmp2q, 32
|
||||
mova m4, [tmp1q+32* 3] ; t48a
|
||||
mova m3, [tmp2q-32* 4] ; t47a
|
||||
mova m6, [tmp1q+32*11] ; t56a
|
||||
mova m2, [tmp1q+32* 3] ; t48a
|
||||
mova m4, [tmp2q-32* 4] ; t47a
|
||||
mova m3, [tmp1q+32*11] ; t56a
|
||||
mova m7, [tmp2q+32*12] ; t63a
|
||||
psubsw m8, m0, m1 ; t39
|
||||
paddsw m0, m1 ; t32
|
||||
psubsw m1, m3, m2 ; t40
|
||||
paddsw m3, m2 ; t47
|
||||
psubsw m2, m4, m5 ; t55
|
||||
paddsw m4, m5 ; t48
|
||||
psubsw m5, m7, m6 ; t56
|
||||
paddsw m7, m6 ; t63
|
||||
ITX_MULSUB_2W 5, 8, 6, 9, 15, 11, 12 ; t39a, t56a
|
||||
ITX_MULSUB_2W 2, 1, 6, 9, 15, 12, 13 ; t40a, t55a
|
||||
psubsw m6, m0, m3 ; t47a
|
||||
paddsw m0, m3 ; t32a
|
||||
psubsw m3, m7, m4 ; t48a
|
||||
paddsw m7, m4 ; t63a
|
||||
psubsw m4, m5, m2 ; t40
|
||||
paddsw m5, m2 ; t39
|
||||
psubsw m2, m8, m1 ; t55
|
||||
paddsw m8, m1 ; t56
|
||||
psubw m1, m2, m4 ; t40a
|
||||
paddw m2, m4 ; t55a
|
||||
psubw m4, m3, m6 ; t47
|
||||
paddw m3, m6 ; t48
|
||||
psubsw m8, m0, m6 ; t39
|
||||
paddsw m0, m6 ; t32
|
||||
psubsw m6, m4, m1 ; t40
|
||||
paddsw m4, m1 ; t47
|
||||
psubsw m1, m2, m5 ; t55
|
||||
paddsw m2, m5 ; t48
|
||||
psubsw m5, m7, m3 ; t56
|
||||
paddsw m7, m3 ; t63
|
||||
ITX_MULSUB_2W 5, 8, 3, 9, 15, 11, 12 ; t39a, t56a
|
||||
vpbroadcastd m9, [o(pw_m1567_m3784)]
|
||||
ITX_MULSUB_2W 1, 6, 3, 9, 15, 12, 9 ; t40a, t55a
|
||||
psubsw m3, m0, m4 ; t47a
|
||||
paddsw m0, m4 ; t32a
|
||||
psubsw m4, m7, m2 ; t48a
|
||||
paddsw m7, m2 ; t63a
|
||||
psubsw m2, m5, m1 ; t40
|
||||
paddsw m5, m1 ; t39
|
||||
psubsw m1, m8, m6 ; t55
|
||||
paddsw m8, m6 ; t56
|
||||
ITX_MULSUB_2W 4, 3, 6, 9, 15, 13, 14 ; t47, t48
|
||||
ITX_MULSUB_2W 1, 2, 6, 9, 15, 13, 14 ; t40a, t55a
|
||||
ret
|
||||
.main_part2_pass2:
|
||||
sub rax, o_idct64_offset + 8
|
||||
vpbroadcastd m11, [o(pw_1567_3784)]
|
||||
vpbroadcastd m12, [o(pw_m3784_1567)]
|
||||
vpbroadcastd m13, [o(pw_m1567_m3784)]
|
||||
vpbroadcastd m14, [o(pw_2048)]
|
||||
vpbroadcastd m13, [o(pw_2896_2896)]
|
||||
lea r9, [strideq*5] ; stride*5
|
||||
lea r3, [r9+strideq*1] ; stride*6
|
||||
lea r7, [r9+strideq*2] ; stride*7
|
||||
lea r8, [r3+strideq*2] ; stride*8
|
||||
lea r2, [dstq+r7]
|
||||
.main_part2_pass2_loop:
|
||||
vpbroadcastd m14, [o(pw_m2896_2896)]
|
||||
call .main_part2_internal
|
||||
vpbroadcastd m10, [o(pw_2896x8)]
|
||||
REPX {pmulhrsw x, m10}, m1, m2, m4, m3
|
||||
vpbroadcastd m14, [o(pw_2048)]
|
||||
IDCT64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*4, r7*8
|
||||
IDCT64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*4, r7*8
|
||||
IDCT64_PART2_END 8, 2, 1, 0, 6, 7, strideq*8, r8*2, r9*8, r3*8
|
||||
|
|
|
@ -202,7 +202,7 @@ SECTION .text
|
|||
ret
|
||||
%endmacro
|
||||
|
||||
; flags: 1 = swap, 2: coef_regs
|
||||
; flags: 1 = swap, 2: coef_regs, 4: no_pack
|
||||
%macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags
|
||||
%if %6 & 2
|
||||
pmaddwd m%2, m%4, m%1
|
||||
|
@ -218,24 +218,17 @@ SECTION .text
|
|||
paddd m%1, m%3
|
||||
psrad m%2, 12
|
||||
psrad m%1, 12
|
||||
%if %6 & 4 == 0
|
||||
packssdw m%1, m%2
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro IDCT4_1D_PACKED 0-1 ;pw_2896x8
|
||||
punpckhwd m2, m0, m1 ;unpacked in1 in3
|
||||
psubw m3, m0, m1
|
||||
paddw m0, m1
|
||||
punpcklqdq m0, m3 ;high: in0-in2 ;low: in0+in2
|
||||
|
||||
mova m3, [o(pd_2048)]
|
||||
punpckhwd m2, m0, m1 ;unpacked in1 in3
|
||||
punpcklwd m0, m1 ;unpacked in0 in2
|
||||
ITX_MUL2X_PACK 2, 1, 3, 1567, 3784
|
||||
|
||||
%if %0 == 1
|
||||
pmulhrsw m0, m%1
|
||||
%else
|
||||
pmulhrsw m0, [o(pw_2896x8)] ;high: t1 ;low: t0
|
||||
%endif
|
||||
|
||||
ITX_MUL2X_PACK 0, 1, 3, 2896, 2896
|
||||
psubsw m1, m0, m2 ;high: out2 ;low: out3
|
||||
paddsw m0, m2 ;high: out1 ;low: out0
|
||||
%endmacro
|
||||
|
@ -499,79 +492,81 @@ cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, coeff
|
|||
|
||||
%macro IDCT8_1D_PACKED 0
|
||||
mova m6, [o(pd_2048)]
|
||||
punpckhwd m5, m0, m3 ;unpacked in1 in7
|
||||
punpckhwd m4, m2, m1 ;unpacked in5 in3
|
||||
punpckhwd m4, m0, m3 ;unpacked in1 in7
|
||||
punpcklwd m0, m2 ;unpacked in0 in4
|
||||
punpckhwd m2, m1 ;unpacked in5 in3
|
||||
punpcklwd m1, m3 ;unpacked in2 in6
|
||||
psubw m3, m0, m2
|
||||
paddw m0, m2
|
||||
punpcklqdq m0, m3 ;low: in0+in4 high: in0-in4
|
||||
ITX_MUL2X_PACK 5, 2, 6, 799, 4017, 1 ;low: t4a high: t7a
|
||||
ITX_MUL2X_PACK 4, 2, 6, 3406, 2276, 1 ;low: t5a high: t6a
|
||||
ITX_MUL2X_PACK 1, 2, 6, 1567, 3784 ;low: t3 high: t2
|
||||
mova m6, [o(pw_2896x8)]
|
||||
psubsw m2, m5, m4 ;low: t5a high: t6a
|
||||
paddsw m5, m4 ;low: t4 high: t7
|
||||
punpckhqdq m4, m2, m2 ;low: t6a high: t6a
|
||||
psubw m3, m4, m2 ;low: t6a - t5a
|
||||
paddw m4, m2 ;low: t6a + t5a
|
||||
punpcklqdq m4, m3 ;low: t6a + t5a high: t6a - t5a
|
||||
pmulhrsw m0, m6 ;low: t0 high: t1
|
||||
pmulhrsw m4, m6 ;low: t6 high: t5
|
||||
shufps m2, m5, m4, q1032 ;low: t7 high: t6
|
||||
shufps m5, m4, q3210 ;low: t4 high: t5
|
||||
psubsw m4, m0, m1 ;low: tmp3 high: tmp2
|
||||
ITX_MUL2X_PACK 4, 3, 6, 799, 4017 ;low: t7a high: t4a
|
||||
ITX_MUL2X_PACK 2, 3, 6, 3406, 2276 ;low: t6a high: t5a
|
||||
ITX_MUL2X_PACK 1, 3, 6, 1567, 3784 ;low: t3 high: t2
|
||||
psubsw m3, m4, m2 ;low: t6a high: t5a
|
||||
paddsw m4, m2 ;low: t7 high: t4
|
||||
pshufb m3, [o(deint_shuf1)]
|
||||
ITX_MUL2X_PACK 0, 2, 6, 2896, 2896 ;low: t0 high: t1
|
||||
ITX_MUL2X_PACK 3, 2, 6, 2896, 2896 ;low: t6 high: t5
|
||||
psubsw m2, m0, m1 ;low: tmp3 high: tmp2
|
||||
paddsw m0, m1 ;low: tmp0 high: tmp1
|
||||
psubsw m3, m0, m2 ;low: out7 high: out6
|
||||
paddsw m0, m2 ;low: out0 high: out1
|
||||
psubsw m2, m4, m5 ;low: out4 high: out5
|
||||
paddsw m1, m4, m5 ;low: out3 high: out2
|
||||
punpcklqdq m1, m4, m3 ;low: t7 high: t6
|
||||
punpckhqdq m4, m3 ;low: t4 high: t5
|
||||
psubsw m3, m0, m1 ;low: out7 high: out6
|
||||
paddsw m0, m1 ;low: out0 high: out1
|
||||
paddsw m1, m2, m4 ;low: out3 high: out2
|
||||
psubsw m2, m4 ;low: out4 high: out5
|
||||
%endmacro
|
||||
|
||||
;dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
|
||||
;dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
|
||||
%macro ITX_MULSUB_2W 7 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2]
|
||||
punpckhwd m%3, m%1, m%2
|
||||
%macro ITX_MULSUB_2W 7-8 0 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2_in_tmp1
|
||||
punpckhwd m%4, m%1, m%2
|
||||
punpcklwd m%1, m%2
|
||||
%if %7 < 8
|
||||
pmaddwd m%2, m%7, m%1
|
||||
pmaddwd m%4, m%7, m%3
|
||||
pmaddwd m%3, m%7, m%4
|
||||
%else
|
||||
mova m%2, [o(pw_%7_%6)]
|
||||
pmaddwd m%4, m%3, m%2
|
||||
%if %8
|
||||
pmaddwd m%3, m%1, m%2
|
||||
pmaddwd m%2, m%4
|
||||
%else
|
||||
pmaddwd m%3, m%4, m%2
|
||||
pmaddwd m%2, m%1
|
||||
%endif
|
||||
paddd m%4, m%5
|
||||
paddd m%2, m%5
|
||||
psrad m%4, 12
|
||||
psrad m%2, 12
|
||||
packssdw m%2, m%4 ;dst2
|
||||
%if %7 < 8
|
||||
pmaddwd m%3, m%6
|
||||
pmaddwd m%1, m%6
|
||||
%else
|
||||
mova m%4, [o(pw_%6_m%7)]
|
||||
pmaddwd m%3, m%4
|
||||
pmaddwd m%1, m%4
|
||||
%endif
|
||||
paddd m%3, m%5
|
||||
paddd m%1, m%5
|
||||
paddd m%2, m%5
|
||||
psrad m%3, 12
|
||||
psrad m%2, 12
|
||||
%if %8
|
||||
packssdw m%3, m%2
|
||||
%else
|
||||
packssdw m%2, m%3 ;dst2
|
||||
%endif
|
||||
%if %7 < 8
|
||||
pmaddwd m%4, m%6
|
||||
pmaddwd m%1, m%6
|
||||
%elif %8
|
||||
mova m%2, [o(pw_%6_m%7)]
|
||||
pmaddwd m%4, m%2
|
||||
pmaddwd m%1, m%2
|
||||
%else
|
||||
mova m%3, [o(pw_%6_m%7)]
|
||||
pmaddwd m%4, m%3
|
||||
pmaddwd m%1, m%3
|
||||
%endif
|
||||
paddd m%4, m%5
|
||||
paddd m%1, m%5
|
||||
psrad m%4, 12
|
||||
psrad m%1, 12
|
||||
packssdw m%1, m%3 ;dst1
|
||||
packssdw m%1, m%4 ;dst1
|
||||
%endmacro
|
||||
|
||||
%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048
|
||||
ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784 ;t2, t3
|
||||
mova m%6, [o(pw_2896x8)]
|
||||
paddw m%5, m%1, m%3
|
||||
psubw m%1, m%3
|
||||
pmulhrsw m%1, m%6 ;t1
|
||||
pmulhrsw m%5, m%6 ;t0
|
||||
psubsw m%3, m%1, m%2 ;out2
|
||||
paddsw m%2, m%1 ;out1
|
||||
paddsw m%1, m%5, m%4 ;out0
|
||||
psubsw m%5, m%4 ;out3
|
||||
mova m%4, m%5
|
||||
ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784, 1 ;t2, t3
|
||||
ITX_MULSUB_2W %1, %3, %4, %6, %7, 2896, 2896, 1 ;t1, t0
|
||||
psubsw m%3, m%1, m%2 ;out2
|
||||
paddsw m%2, m%1 ;out1
|
||||
paddsw m%1, m%5, m%4 ;out0
|
||||
psubsw m%4, m%5 ;out3
|
||||
%endmacro
|
||||
|
||||
%macro WRITE_4X8 4 ;row[1-4]
|
||||
|
@ -1286,17 +1281,13 @@ cglobal iidentity_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
|||
%endmacro
|
||||
|
||||
%macro IDCT8_1D_ODDHALF 7 ; src[1-4], tmp[1-2], pd_2048
|
||||
ITX_MULSUB_2W %1, %4, %5, %6, %7, 799, 4017 ;t4a, t7a
|
||||
ITX_MULSUB_2W %3, %2, %5, %6, %7, 3406, 2276 ;t5a, t6a
|
||||
psubsw m%5, m%1, m%3 ;t5a
|
||||
paddsw m%1, m%3 ;t4
|
||||
psubsw m%6, m%4, m%2 ;t6a
|
||||
paddsw m%4, m%2 ;t7
|
||||
mova m%3, [o(pw_2896x8)]
|
||||
psubw m%2, m%6, m%5 ;t6a - t5a
|
||||
paddw m%6, m%5 ;t6a + t5a
|
||||
pmulhrsw m%2, m%3 ;t5
|
||||
pmulhrsw m%3, m%6 ;t6
|
||||
ITX_MULSUB_2W %1, %4, %5, %6, %7, 799, 4017 ;t4a, t7a
|
||||
ITX_MULSUB_2W %3, %2, %5, %6, %7, 3406, 2276, 1 ;t5a, t6a
|
||||
psubsw m%2, m%4, m%5 ;t6a
|
||||
paddsw m%4, m%5 ;t7
|
||||
psubsw m%5, m%1, m%3 ;t5a
|
||||
paddsw m%1, m%3 ;t4
|
||||
ITX_MULSUB_2W %2, %5, %3, %6, %7, 2896, 2896, 1 ;t5, t6
|
||||
%endmacro
|
||||
|
||||
INV_TXFM_8X8_FN dct, dct, 0
|
||||
|
@ -2063,37 +2054,34 @@ cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
|||
%macro IDCT16_1D_PACKED_ODDHALF 7 ;src[1-4], tmp[1-3]
|
||||
punpckhwd m%5, m%4, m%1 ;packed in13 in3
|
||||
punpcklwd m%1, m%4 ;packed in1 in15
|
||||
punpcklwd m%6, m%3, m%2 ;packed in9 in7
|
||||
punpcklwd m%4, m%3, m%2 ;packed in9 in7
|
||||
punpckhwd m%2, m%3 ;packed in5 in11
|
||||
|
||||
mova m%7, [o(pd_2048)]
|
||||
ITX_MUL2X_PACK %1, %4, %7, 401, 4076, 1 ;low: t8a high: t15a
|
||||
ITX_MUL2X_PACK %6, %4, %7, 3166, 2598, 1 ;low: t9a high: t14a
|
||||
ITX_MUL2X_PACK %2, %4, %7, 1931, 3612, 1 ;low: t10a high: t13a
|
||||
ITX_MUL2X_PACK %5, %4, %7, 3920, 1189, 1 ;low: t11a high: t12a
|
||||
psubsw m%4, m%1, m%6 ;low: t9 high: t14
|
||||
paddsw m%1, m%6 ;low: t8 high: t15
|
||||
ITX_MUL2X_PACK %1, %6, %7, 401, 4076, 1 ;low: t8a high: t15a
|
||||
ITX_MUL2X_PACK %4, %6, %7, 3166, 2598, 1 ;low: t9a high: t14a
|
||||
ITX_MUL2X_PACK %2, %6, %7, 1931, 3612, 1 ;low: t10a high: t13a
|
||||
ITX_MUL2X_PACK %5, %6, %7, 3920, 1189, 1 ;low: t11a high: t12a
|
||||
psubsw m%6, m%1, m%4 ;low: t9 high: t14
|
||||
paddsw m%1, m%4 ;low: t8 high: t15
|
||||
psubsw m%3, m%5, m%2 ;low: t10 high: t13
|
||||
paddsw m%2, m%5 ;low: t11 high: t12
|
||||
punpcklqdq m%5, m%4, m%3 ;low: t9 high: t10
|
||||
punpckhqdq m%4, m%3 ;low: t14 high: t13
|
||||
punpcklwd m%6, m%4, m%5 ;packed t14 t9
|
||||
punpckhwd m%5, m%4 ;packed t10 t13
|
||||
paddsw m%5, m%2 ;low: t11 high: t12
|
||||
mova m%2, [o(deint_shuf2)]
|
||||
pshufb m%6, m%2
|
||||
pshufb m%3, [o(deint_shuf1)]
|
||||
pxor m%4, m%4
|
||||
psubw m%4, m%5 ;packed -t10 -t13
|
||||
psubw m%4, m%3 ;packed -t10 -t13
|
||||
ITX_MUL2X_PACK %6, %3, %7, 1567, 3784, 1 ;low: t9a high: t14a
|
||||
ITX_MUL2X_PACK %4, %3, %7, 3784, 1567 ;low: t10a high: t13a
|
||||
psubsw m%3, m%1, m%2 ;low: t11a high: t12a
|
||||
paddsw m%1, m%2 ;low: t8a high: t15a
|
||||
psubsw m%3, m%1, m%5 ;low: t11a high: t12a
|
||||
paddsw m%1, m%5 ;low: t8a high: t15a
|
||||
psubsw m%5, m%6, m%4 ;low: t10 high: t13
|
||||
paddsw m%6, m%4 ;low: t9 high: t14
|
||||
mova m%7, [o(pw_2896x8)]
|
||||
punpckhqdq m%4, m%3, m%5 ;low: t12a high: t13
|
||||
punpcklqdq m%3, m%5 ;low: t11a high: t10
|
||||
psubw m%2, m%4, m%3
|
||||
paddw m%3, m%4
|
||||
pmulhrsw m%2, m%7 ;low: t11 high: t10a
|
||||
pmulhrsw m%3, m%7 ;low: t12 high: t13a
|
||||
pshufb m%3, m%2
|
||||
pshufb m%5, m%2
|
||||
ITX_MUL2X_PACK %3, %2, %7, 2896, 2896, 4 ;t12, t11
|
||||
ITX_MUL2X_PACK %5, %4, %7, 2896, 2896, 4 ;t13a, t10a
|
||||
packssdw m%2, m%4 ;low: t11 high: t10a
|
||||
packssdw m%3, m%5 ;low: t12 high: t13a
|
||||
punpckhqdq m%4, m%1, m%6 ;low: t15a high: t14
|
||||
punpcklqdq m%1, m%6 ;low: t8a high: t9
|
||||
%endmacro
|
||||
|
@ -2918,19 +2906,14 @@ ALIGN function_align
|
|||
mova m0, [rsp+gprsize*2+16*1]
|
||||
mova m2, [rsp+gprsize*2+16*2]
|
||||
mova [rsp+gprsize*2+16*1], m4
|
||||
psubsw m4, m0, m3 ;t13
|
||||
psubsw m5, m0, m3 ;t13
|
||||
paddsw m0, m3 ;t14
|
||||
psubsw m3, m2, m1 ;t12a
|
||||
mova m3, [o(pd_2048)]
|
||||
psubsw m4, m2, m1 ;t12a
|
||||
paddsw m1, m2 ;t15a
|
||||
mova m5, [o(pw_2896x8)]
|
||||
psubw m2, m4, m7 ;t13-t10
|
||||
paddw m7, m4 ;t13+t10
|
||||
psubw m4, m3, m6 ;t12a-t11a
|
||||
paddw m6, m3 ;t12a+t11a
|
||||
pmulhrsw m7, m5 ;t13a
|
||||
pmulhrsw m4, m5 ;t11
|
||||
pmulhrsw m6, m5 ;t12
|
||||
pmulhrsw m5, m2 ;t10a
|
||||
mova [rsp+gprsize*2+16*2], m1
|
||||
ITX_MULSUB_2W 5, 7, 1, 2, 3, 2896, 2896 ;t10a, t13a
|
||||
ITX_MULSUB_2W 4, 6, 1, 2, 3, 2896, 2896 ;t11, t12
|
||||
mova m3, [rsp+gprsize*2+16*8]
|
||||
psubsw m2, m3, m5 ;out10
|
||||
paddsw m3, m5 ;out5
|
||||
|
@ -2950,6 +2933,7 @@ ALIGN function_align
|
|||
mova [rsp+gprsize*2+16*5], m6
|
||||
psubsw m6, m7, m0 ;out14
|
||||
paddsw m7, m0 ;out1
|
||||
mova m1, [rsp+gprsize*2+16*2]
|
||||
mova m0, [rsp+gprsize*2+16*3]
|
||||
mova [rsp+gprsize*2+16*4], m7
|
||||
psubsw m7, m0, m1 ;out15
|
||||
|
@ -4211,35 +4195,30 @@ ALIGN function_align
|
|||
psubsw m5, m3, m2 ;t28a
|
||||
paddsw m3, m2 ;t31a
|
||||
ITX_MULSUB_2W 5, 4, 1, 2, 7, 1567, 3784 ;t19, t28
|
||||
|
||||
mova m2, [rsp+gprsize*2+16*15] ;tmp12
|
||||
psubsw m1, m5, m6 ;t20a
|
||||
paddsw m5, m6 ;t19a
|
||||
psubsw m6, m2, m5 ;out19
|
||||
paddsw m2, m5 ;out12
|
||||
mova m5, [rsp+gprsize*2+16*30] ;t27
|
||||
mova [rsp+gprsize*2+16*22], m6 ;out19
|
||||
mova [rsp+gprsize*2+16*15], m2 ;out12
|
||||
mova m5, [rsp+gprsize*2+16*30] ;t27
|
||||
psubsw m6, m4, m5 ;t27a
|
||||
paddsw m4, m5 ;t28a
|
||||
ITX_MULSUB_2W 6, 1, 2, 5, 7, 2896, 2896 ;t20, t27
|
||||
mova m2, [rsp+gprsize*2+16*6 ] ;tmp3
|
||||
mova m7, [o(pw_2896x8)]
|
||||
psubw m5, m6, m1 ;t27a - t20a
|
||||
paddw m6, m1 ;t27a + t20a
|
||||
psubsw m1, m2, m4 ;out28
|
||||
psubsw m5, m2, m4 ;out28
|
||||
paddsw m2, m4 ;out3
|
||||
pmulhrsw m5, m7 ;t20
|
||||
pmulhrsw m6, m7 ;t27
|
||||
mova m4, [rsp+gprsize*2+16*14] ;tmp11
|
||||
mova [rsp+gprsize*2+16*31], m1 ;out28
|
||||
mova [rsp+gprsize*2+16*31], m5 ;out28
|
||||
mova [rsp+gprsize*2+16*6 ], m2 ;out3
|
||||
psubsw m1, m4, m5 ;out20
|
||||
paddsw m4, m5 ;out11
|
||||
psubsw m5, m4, m6 ;out20
|
||||
paddsw m4, m6 ;out11
|
||||
mova m2, [rsp+gprsize*2+16*7 ] ;tmp4
|
||||
mova [rsp+gprsize*2+16*23], m1 ;out20
|
||||
mova [rsp+gprsize*2+16*23], m5 ;out20
|
||||
mova [rsp+gprsize*2+16*14], m4 ;out11
|
||||
psubsw m5, m2, m6 ;out27
|
||||
paddsw m2, m6 ;out4
|
||||
psubsw m5, m2, m1 ;out27
|
||||
paddsw m2, m1 ;out4
|
||||
mova m1, [rsp+gprsize*2+16*26] ;t23a
|
||||
mova m4, [rsp+gprsize*2+16*27] ;t24a
|
||||
mova [rsp+gprsize*2+16*30], m5 ;out27
|
||||
|
@ -4248,27 +4227,24 @@ ALIGN function_align
|
|||
paddsw m0, m1 ;t16
|
||||
psubsw m2, m3, m4 ;t24
|
||||
paddsw m3, m4 ;t31
|
||||
ITX_MULSUB_2W 2, 5, 4, 6, 7, 2896, 2896 ;t23a, t24a
|
||||
mova m6, [rsp+gprsize*2+16*18] ;tmp15
|
||||
psubw m1, m2, m5 ;t24 - t23
|
||||
paddw m2, m5 ;t24 + t23
|
||||
psubsw m4, m6, m0 ;out16
|
||||
paddsw m6, m0 ;out15
|
||||
pmulhrsw m1, m7 ;t23a
|
||||
pmulhrsw m2, m7 ;t24a
|
||||
mova m0, [rsp+gprsize*2+16*3 ] ;tmp0
|
||||
mova m5, [rsp+gprsize*2+16*11] ;tmp8
|
||||
mova m1, [rsp+gprsize*2+16*11] ;tmp8
|
||||
mova [rsp+gprsize*2+16*18], m6 ;out15
|
||||
mova [rsp+gprsize*2+16*19], m4 ;out16
|
||||
psubsw m6, m0, m3 ;out31
|
||||
paddsw m0, m3 ;out0
|
||||
psubsw m4, m5, m1 ;out23
|
||||
paddsw m5, m1 ;out8
|
||||
psubsw m4, m1, m2 ;out23
|
||||
paddsw m1, m2 ;out8
|
||||
mova m3, [rsp+gprsize*2+16*10] ;tmp7
|
||||
mova [rsp+gprsize*2+16*34], m6 ;out31
|
||||
mova [rsp+gprsize*2+16*11], m5 ;out8
|
||||
mova [rsp+gprsize*2+16*11], m1 ;out8
|
||||
mova [rsp+gprsize*2+16*26], m4 ;out23
|
||||
paddsw m6, m3, m2 ;out7
|
||||
psubsw m3, m2 ;out24
|
||||
paddsw m6, m3, m5 ;out7
|
||||
psubsw m3, m5 ;out24
|
||||
mova m1, [rsp+gprsize*2+16*20] ;t17
|
||||
mova m5, [rsp+gprsize*2+16*25] ;t22
|
||||
mova m2, [rsp+gprsize*2+16*17] ;tmp14
|
||||
|
@ -4283,23 +4259,20 @@ ALIGN function_align
|
|||
mova [rsp+gprsize*2+16*20], m3 ;out17
|
||||
psubsw m2, m1, m5 ;t25a
|
||||
paddsw m1, m5 ;t30a
|
||||
psubw m3, m2, m4 ;t25a - t22a
|
||||
paddw m2, m4 ;t25a + t22a
|
||||
ITX_MULSUB_2W 2, 4, 3, 5, 7, 2896, 2896 ;t22, t25
|
||||
mova m5, [rsp+gprsize*2+16*4 ] ;tmp1
|
||||
pmulhrsw m3, m7 ;t22
|
||||
pmulhrsw m2, m7 ;t25
|
||||
psubsw m4, m5, m1 ;out30
|
||||
psubsw m3, m5, m1 ;out30
|
||||
paddsw m5, m1 ;out1
|
||||
mova m1, [rsp+gprsize*2+16*12] ;tmp9
|
||||
mova [rsp+gprsize*2+16*33], m4 ;out30
|
||||
mova [rsp+gprsize*2+16*33], m3 ;out30
|
||||
mova [rsp+gprsize*2+16*4 ], m5 ;out1
|
||||
psubsw m4, m1, m3 ;out22
|
||||
paddsw m1, m3 ;out9
|
||||
psubsw m3, m1, m2 ;out22
|
||||
paddsw m1, m2 ;out9
|
||||
mova m5, [rsp+gprsize*2+16*9 ] ;tmp6
|
||||
mova [rsp+gprsize*2+16*25], m4 ;out22
|
||||
mova [rsp+gprsize*2+16*25], m3 ;out22
|
||||
mova [rsp+gprsize*2+16*12], m1 ;out9
|
||||
psubsw m3, m5, m2 ;out25
|
||||
paddsw m5, m2 ;out6
|
||||
psubsw m3, m5, m4 ;out25
|
||||
paddsw m5, m4 ;out6
|
||||
mova m4, [rsp+gprsize*2+16*21] ;t18a
|
||||
mova m1, [rsp+gprsize*2+16*24] ;t21a
|
||||
mova m2, [rsp+gprsize*2+16*16] ;tmp13
|
||||
|
@ -4315,17 +4288,14 @@ ALIGN function_align
|
|||
mova [rsp+gprsize*2+16*16], m2 ;out13
|
||||
psubsw m5, m3, m1 ;t26
|
||||
paddsw m3, m1 ;t29
|
||||
ITX_MULSUB_2W 5, 4, 1, 2, 7, 2896, 2896 ;t21a, t26a
|
||||
mova m2, [rsp+gprsize*2+16*5 ] ;tmp2
|
||||
psubw m1, m5, m4 ;t26 - t21
|
||||
paddw m4, m5 ;t26 + t21
|
||||
psubsw m5, m2, m3 ;out29
|
||||
psubsw m1, m2, m3 ;out29
|
||||
paddsw m2, m3 ;out2
|
||||
pmulhrsw m1, m7 ;t21a
|
||||
pmulhrsw m4, m7 ;t26a
|
||||
mova m3, [rsp+gprsize*2+16*13] ;tmp10
|
||||
mova [rsp+gprsize*2+16*32], m5 ;out29
|
||||
psubsw m7, m3, m1 ;out21
|
||||
paddsw m3, m1 ;out10
|
||||
mova [rsp+gprsize*2+16*32], m1 ;out29
|
||||
psubsw m7, m3, m5 ;out21
|
||||
paddsw m3, m5 ;out10
|
||||
mova m5, [rsp+gprsize*2+16*8 ] ;tmp5
|
||||
mova [rsp+gprsize*2+16*24], m7 ;out21
|
||||
mova [rsp+gprsize*2+16*13], m3 ;out10
|
||||
|
@ -6010,262 +5980,237 @@ ALIGN function_align
|
|||
psubw m5, m6, m3
|
||||
ITX_MULSUB_2W 5, 4, 2, 3, 7, 1567, 3784 ;t43, t52
|
||||
|
||||
mova m7, [o(pw_2896x8)]
|
||||
mova m2, [rsp+gprsize*2+16*38] ;t35a
|
||||
mova m3, [rsp+gprsize*2+16*31] ;tmp[28]
|
||||
psubsw m6, m2, m0 ;t44
|
||||
paddsw m2, m0 ;t35
|
||||
psubsw m0, m3, m2 ;out35
|
||||
paddsw m2, m3 ;out28
|
||||
mova m3, [rsp+gprsize*2+16*63] ;t60a
|
||||
mova [rsp+gprsize*2+16*38], m0 ;out35
|
||||
mova [rsp+gprsize*2+16*31], m2 ;out28
|
||||
mova m3, [rsp+gprsize*2+16*63] ;t60a
|
||||
mova m2, [rsp+gprsize*2+16*6 ] ;tmp[3]
|
||||
psubsw m0, m3, m1 ;t51
|
||||
paddsw m3, m1 ;t60
|
||||
psubw m1, m0, m6 ;t44a
|
||||
paddw m0, m6 ;t51a
|
||||
psubsw m6, m2, m3 ;out60
|
||||
ITX_MULSUB_2W 0, 6, 1, 2, 7, 2896, 2896 ;t44a, t51a
|
||||
mova m2, [rsp+gprsize*2+16*6 ] ;tmp[3]
|
||||
psubsw m1, m2, m3 ;out60
|
||||
paddsw m2, m3 ;out3
|
||||
pmulhrsw m1, m7 ;t44a
|
||||
pmulhrsw m0, m7 ;t51a
|
||||
mova m3, [rsp+gprsize*2+16*22] ;tmp[19]
|
||||
mova [rsp+gprsize*2+16*63], m6 ;out60
|
||||
mova [rsp+gprsize*2+16*63], m1 ;out60
|
||||
mova [rsp+gprsize*2+16*6 ], m2 ;out3
|
||||
psubsw m6, m3, m1 ;out44
|
||||
paddsw m3, m1 ;out19
|
||||
psubsw m1, m3, m0 ;out44
|
||||
paddsw m3, m0 ;out19
|
||||
mova m2, [rsp+gprsize*2+16*15] ;tmp[12]
|
||||
mova [rsp+gprsize*2+16*47], m6 ;out44
|
||||
mova [rsp+gprsize*2+16*22], m3 ;out19
|
||||
psubsw m1, m2, m0 ;out51
|
||||
paddsw m2, m0 ;out12
|
||||
mova [rsp+gprsize*2+16*54], m1 ;out51
|
||||
mova [rsp+gprsize*2+16*15], m2 ;out12
|
||||
|
||||
mova m0, [rsp+gprsize*2+16*39] ;t36
|
||||
mova [rsp+gprsize*2+16*47], m1 ;out44
|
||||
mova [rsp+gprsize*2+16*22], m3 ;out19
|
||||
mova m1, [rsp+gprsize*2+16*62] ;t59
|
||||
psubsw m3, m2, m6 ;out51
|
||||
paddsw m2, m6 ;out12
|
||||
mova [rsp+gprsize*2+16*54], m3 ;out51
|
||||
mova [rsp+gprsize*2+16*15], m2 ;out12
|
||||
psubsw m2, m0, m5 ;t43a
|
||||
paddsw m0, m5 ;t36a
|
||||
mova m5, [rsp+gprsize*2+16*30] ;tmp[27]
|
||||
psubsw m3, m1, m4 ;t52a
|
||||
paddsw m1, m4 ;t59a
|
||||
psubw m5, m3, m2 ;t43
|
||||
paddw m3, m2 ;t52
|
||||
mova m2, [rsp+gprsize*2+16*30] ;tmp[27]
|
||||
ITX_MULSUB_2W 3, 2, 4, 6, 7, 2896, 2896 ;t43, t52
|
||||
mova m4, [rsp+gprsize*2+16*7 ] ;tmp[4 ]
|
||||
pmulhrsw m5, m7 ;t43
|
||||
pmulhrsw m3, m7 ;t52
|
||||
psubsw m6, m2, m0 ;out36
|
||||
paddsw m2, m0 ;out27
|
||||
psubsw m6, m5, m0 ;out36
|
||||
paddsw m5, m0 ;out27
|
||||
psubsw m0, m4, m1 ;out59
|
||||
paddsw m4, m1 ;out4
|
||||
mova [rsp+gprsize*2+16*39], m6 ;out36
|
||||
mova [rsp+gprsize*2+16*30], m2 ;out27
|
||||
mova [rsp+gprsize*2+16*30], m5 ;out27
|
||||
mova [rsp+gprsize*2+16*62], m0 ;out59
|
||||
mova [rsp+gprsize*2+16*7 ], m4 ;out4
|
||||
mova m0, [rsp+gprsize*2+16*23] ;tmp[20]
|
||||
mova m2, [rsp+gprsize*2+16*14] ;tmp[11]
|
||||
psubsw m4, m0, m5 ;out43
|
||||
paddsw m0, m5 ;out20
|
||||
psubsw m6, m2, m3 ;out52
|
||||
paddsw m2, m3 ;out11
|
||||
mova m5, [rsp+gprsize*2+16*14] ;tmp[11]
|
||||
psubsw m4, m0, m3 ;out43
|
||||
paddsw m0, m3 ;out20
|
||||
psubsw m6, m5, m2 ;out52
|
||||
paddsw m5, m2 ;out11
|
||||
mova [rsp+gprsize*2+16*46], m4 ;out43
|
||||
mova [rsp+gprsize*2+16*23], m0 ;out20
|
||||
mova [rsp+gprsize*2+16*55], m6 ;out52
|
||||
mova [rsp+gprsize*2+16*14], m2 ;out11
|
||||
mova [rsp+gprsize*2+16*14], m5 ;out11
|
||||
|
||||
mova m0, [rsp+gprsize*2+16*40] ;t37a
|
||||
mova m2, [rsp+gprsize*2+16*45] ;t42a
|
||||
mova m5, [rsp+gprsize*2+16*45] ;t42a
|
||||
mova m3, [rsp+gprsize*2+16*56] ;t53a
|
||||
mova m1, [rsp+gprsize*2+16*61] ;t58a
|
||||
psubsw m4, m0, m2 ;t42
|
||||
paddsw m0, m2 ;t37
|
||||
mova m2, [rsp+gprsize*2+16*29] ;tmp[26]
|
||||
psubsw m4, m0, m5 ;t42
|
||||
paddsw m0, m5 ;t37
|
||||
psubsw m5, m1, m3 ;t53
|
||||
paddsw m1, m3 ;t58
|
||||
psubw m6, m5, m4 ;t42a
|
||||
paddw m5, m4 ;t53a
|
||||
mova m2, [rsp+gprsize*2+16*29] ;tmp[26]
|
||||
ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t42a, t53a
|
||||
mova m3, [rsp+gprsize*2+16*8 ] ;tmp[5 ]
|
||||
pmulhrsw m6, m7 ;t42a
|
||||
pmulhrsw m5, m7 ;t53a
|
||||
psubsw m4, m2, m0 ;out37
|
||||
psubsw m6, m2, m0 ;out37
|
||||
paddsw m2, m0 ;out26
|
||||
psubsw m0, m3, m1 ;out58
|
||||
paddsw m3, m1 ;out5
|
||||
mova [rsp+gprsize*2+16*40], m4 ;out37
|
||||
mova [rsp+gprsize*2+16*40], m6 ;out37
|
||||
mova [rsp+gprsize*2+16*29], m2 ;out26
|
||||
mova [rsp+gprsize*2+16*61], m0 ;out58
|
||||
mova [rsp+gprsize*2+16*8 ], m3 ;out5
|
||||
mova m0, [rsp+gprsize*2+16*24] ;tmp[21]
|
||||
mova m1, [rsp+gprsize*2+16*13] ;tmp[10]
|
||||
psubsw m2, m0, m6 ;out42
|
||||
paddsw m0, m6 ;out21
|
||||
psubsw m3, m1, m5 ;out53
|
||||
paddsw m1, m5 ;out10
|
||||
psubsw m2, m0, m5 ;out42
|
||||
paddsw m0, m5 ;out21
|
||||
psubsw m3, m1, m4 ;out53
|
||||
paddsw m1, m4 ;out10
|
||||
mova [rsp+gprsize*2+16*45], m2 ;out42
|
||||
mova [rsp+gprsize*2+16*24], m0 ;out21
|
||||
mova [rsp+gprsize*2+16*56], m3 ;out53
|
||||
mova [rsp+gprsize*2+16*13], m1 ;out10
|
||||
|
||||
mova m0, [rsp+gprsize*2+16*41] ;t38
|
||||
mova m2, [rsp+gprsize*2+16*44] ;t41
|
||||
mova m5, [rsp+gprsize*2+16*44] ;t41
|
||||
mova m3, [rsp+gprsize*2+16*57] ;t54
|
||||
mova m1, [rsp+gprsize*2+16*60] ;t57
|
||||
psubsw m4, m0, m2 ;t41a
|
||||
paddsw m0, m2 ;t38a
|
||||
mova m2, [rsp+gprsize*2+16*28] ;tmp[25]
|
||||
psubsw m4, m0, m5 ;t41a
|
||||
paddsw m0, m5 ;t38a
|
||||
psubsw m5, m1, m3 ;t54a
|
||||
paddsw m1, m3 ;t57a
|
||||
psubw m6, m5, m4 ;t41
|
||||
paddw m5, m4 ;t54
|
||||
mova m2, [rsp+gprsize*2+16*28] ;tmp[25]
|
||||
ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t41a, t54a
|
||||
mova m3, [rsp+gprsize*2+16*9 ] ;tmp[6 ]
|
||||
pmulhrsw m6, m7 ;t41a
|
||||
pmulhrsw m5, m7 ;t54a
|
||||
psubsw m4, m2, m0 ;out38
|
||||
psubsw m6, m2, m0 ;out38
|
||||
paddsw m2, m0 ;out25
|
||||
psubsw m0, m3, m1 ;out57
|
||||
paddsw m3, m1 ;out6
|
||||
mova [rsp+gprsize*2+16*41], m4 ;out38
|
||||
mova [rsp+gprsize*2+16*41], m6 ;out38
|
||||
mova [rsp+gprsize*2+16*28], m2 ;out25
|
||||
mova [rsp+gprsize*2+16*60], m0 ;out57
|
||||
mova [rsp+gprsize*2+16*9 ], m3 ;out6
|
||||
mova m0, [rsp+gprsize*2+16*25] ;tmp[22]
|
||||
mova m1, [rsp+gprsize*2+16*12] ;tmp[9 ]
|
||||
psubsw m2, m0, m6 ;out41
|
||||
paddsw m0, m6 ;out22
|
||||
psubsw m3, m1, m5 ;out54
|
||||
paddsw m1, m5 ;out9
|
||||
psubsw m2, m0, m5 ;out41
|
||||
paddsw m0, m5 ;out22
|
||||
psubsw m3, m1, m4 ;out54
|
||||
paddsw m1, m4 ;out9
|
||||
mova [rsp+gprsize*2+16*44], m2 ;out41
|
||||
mova [rsp+gprsize*2+16*25], m0 ;out22
|
||||
mova [rsp+gprsize*2+16*57], m3 ;out54
|
||||
mova [rsp+gprsize*2+16*12], m1 ;out9
|
||||
|
||||
mova m0, [rsp+gprsize*2+16*42] ;t39a
|
||||
mova m2, [rsp+gprsize*2+16*43] ;t40a
|
||||
mova m5, [rsp+gprsize*2+16*43] ;t40a
|
||||
mova m3, [rsp+gprsize*2+16*58] ;t55a
|
||||
mova m1, [rsp+gprsize*2+16*59] ;t56a
|
||||
psubsw m4, m0, m2 ;t40
|
||||
paddsw m0, m2 ;t39
|
||||
mova m2, [rsp+gprsize*2+16*27] ;tmp[24]
|
||||
psubsw m4, m0, m5 ;t40
|
||||
paddsw m0, m5 ;t39
|
||||
psubsw m5, m1, m3 ;t55
|
||||
paddsw m1, m3 ;t56
|
||||
psubw m6, m5, m4 ;t40a
|
||||
paddw m5, m4 ;t55a
|
||||
mova m2, [rsp+gprsize*2+16*27] ;tmp[24]
|
||||
ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t40a, t55a
|
||||
mova m3, [rsp+gprsize*2+16*10] ;tmp[7 ]
|
||||
pmulhrsw m6, m7 ;t40a
|
||||
pmulhrsw m5, m7 ;t55a
|
||||
psubsw m4, m2, m0 ;out39
|
||||
psubsw m6, m2, m0 ;out39
|
||||
paddsw m2, m0 ;out24
|
||||
psubsw m0, m3, m1 ;out56
|
||||
paddsw m3, m1 ;out7
|
||||
mova [rsp+gprsize*2+16*42], m4 ;out39
|
||||
mova [rsp+gprsize*2+16*42], m6 ;out39
|
||||
mova [rsp+gprsize*2+16*27], m2 ;out24
|
||||
mova [rsp+gprsize*2+16*59], m0 ;out56
|
||||
mova [rsp+gprsize*2+16*10], m3 ;out7
|
||||
mova m0, [rsp+gprsize*2+16*26] ;tmp[23]
|
||||
mova m1, [rsp+gprsize*2+16*11] ;tmp[8 ]
|
||||
psubsw m2, m0, m6 ;out40
|
||||
paddsw m0, m6 ;out23
|
||||
psubsw m3, m1, m5 ;out55
|
||||
paddsw m1, m5 ;out8
|
||||
psubsw m2, m0, m5 ;out40
|
||||
paddsw m0, m5 ;out23
|
||||
psubsw m3, m1, m4 ;out55
|
||||
paddsw m1, m4 ;out8
|
||||
mova [rsp+gprsize*2+16*43], m2 ;out40
|
||||
mova [rsp+gprsize*2+16*26], m0 ;out23
|
||||
mova [rsp+gprsize*2+16*58], m3 ;out55
|
||||
mova [rsp+gprsize*2+16*11], m1 ;out8
|
||||
|
||||
mova m0, [rsp+gprsize*2+16*37] ;t34
|
||||
mova m2, [rsp+gprsize*2+16*48] ;t45
|
||||
mova m5, [rsp+gprsize*2+16*48] ;t45
|
||||
mova m3, [rsp+gprsize*2+16*53] ;t50
|
||||
mova m1, [rsp+gprsize*2+16*64] ;t61
|
||||
psubsw m4, m0, m2 ;t45a
|
||||
paddsw m0, m2 ;t34a
|
||||
mova m2, [rsp+gprsize*2+16*32] ;tmp[29]
|
||||
psubsw m4, m0, m5 ;t45a
|
||||
paddsw m0, m5 ;t34a
|
||||
psubsw m5, m1, m3 ;t50a
|
||||
paddsw m1, m3 ;t61a
|
||||
psubw m6, m5, m4 ;t45
|
||||
paddw m5, m4 ;t50
|
||||
mova m2, [rsp+gprsize*2+16*32] ;tmp[29]
|
||||
ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t45, t50
|
||||
mova m3, [rsp+gprsize*2+16*5 ] ;tmp[2 ]
|
||||
pmulhrsw m6, m7 ;t45
|
||||
pmulhrsw m5, m7 ;t50
|
||||
psubsw m4, m2, m0 ;out34
|
||||
psubsw m6, m2, m0 ;out34
|
||||
paddsw m2, m0 ;out29
|
||||
psubsw m0, m3, m1 ;out61
|
||||
paddsw m3, m1 ;out2
|
||||
mova [rsp+gprsize*2+16*37], m4 ;out34
|
||||
mova [rsp+gprsize*2+16*37], m6 ;out34
|
||||
mova [rsp+gprsize*2+16*32], m2 ;out29
|
||||
mova [rsp+gprsize*2+16*64], m0 ;out61
|
||||
mova [rsp+gprsize*2+16*5 ], m3 ;out2
|
||||
mova m0, [rsp+gprsize*2+16*21] ;tmp[18]
|
||||
mova m1, [rsp+gprsize*2+16*16] ;tmp[13]
|
||||
psubsw m2, m0, m6 ;out45
|
||||
paddsw m0, m6 ;out18
|
||||
psubsw m3, m1, m5 ;out50
|
||||
paddsw m1, m5 ;out13
|
||||
psubsw m2, m0, m5 ;out45
|
||||
paddsw m0, m5 ;out18
|
||||
psubsw m3, m1, m4 ;out50
|
||||
paddsw m1, m4 ;out13
|
||||
mova [rsp+gprsize*2+16*48], m2 ;out45
|
||||
mova [rsp+gprsize*2+16*21], m0 ;out18
|
||||
mova [rsp+gprsize*2+16*53], m3 ;out50
|
||||
mova [rsp+gprsize*2+16*16], m1 ;out13
|
||||
|
||||
mova m0, [rsp+gprsize*2+16*36] ;t33a
|
||||
mova m2, [rsp+gprsize*2+16*49] ;t46a
|
||||
mova m5, [rsp+gprsize*2+16*49] ;t46a
|
||||
mova m3, [rsp+gprsize*2+16*52] ;t49a
|
||||
mova m1, [rsp+gprsize*2+16*65] ;t62a
|
||||
psubsw m4, m0, m2 ;t46
|
||||
paddsw m0, m2 ;t33
|
||||
mova m2, [rsp+gprsize*2+16*33] ;tmp[30]
|
||||
psubsw m4, m0, m5 ;t46
|
||||
paddsw m0, m5 ;t33
|
||||
psubsw m5, m1, m3 ;t49
|
||||
paddsw m1, m3 ;t62
|
||||
psubw m6, m5, m4 ;t46a
|
||||
paddw m5, m4 ;t49a
|
||||
mova m2, [rsp+gprsize*2+16*33] ;tmp[30]
|
||||
ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t46a, t49a
|
||||
mova m3, [rsp+gprsize*2+16*4 ] ;tmp[1 ]
|
||||
pmulhrsw m6, m7 ;t46a
|
||||
pmulhrsw m5, m7 ;t49a
|
||||
psubsw m4, m2, m0 ;out33
|
||||
psubsw m6, m2, m0 ;out33
|
||||
paddsw m2, m0 ;out30
|
||||
psubsw m0, m3, m1 ;out62
|
||||
paddsw m3, m1 ;out1
|
||||
mova [rsp+gprsize*2+16*36], m4 ;out33
|
||||
mova [rsp+gprsize*2+16*36], m6 ;out33
|
||||
mova [rsp+gprsize*2+16*33], m2 ;out30
|
||||
mova [rsp+gprsize*2+16*65], m0 ;out62
|
||||
mova [rsp+gprsize*2+16*4 ], m3 ;out1
|
||||
mova m0, [rsp+gprsize*2+16*20] ;tmp[17]
|
||||
mova m1, [rsp+gprsize*2+16*17] ;tmp[14]
|
||||
psubsw m2, m0, m6 ;out46
|
||||
paddsw m0, m6 ;out17
|
||||
psubsw m3, m1, m5 ;out49
|
||||
paddsw m1, m5 ;out14
|
||||
psubsw m2, m0, m5 ;out46
|
||||
paddsw m0, m5 ;out17
|
||||
psubsw m3, m1, m4 ;out49
|
||||
paddsw m1, m4 ;out14
|
||||
mova [rsp+gprsize*2+16*49], m2 ;out46
|
||||
mova [rsp+gprsize*2+16*20], m0 ;out17
|
||||
mova [rsp+gprsize*2+16*52], m3 ;out49
|
||||
mova [rsp+gprsize*2+16*17], m1 ;out14
|
||||
|
||||
mova m0, [rsp+gprsize*2+16*35] ;t32
|
||||
mova m2, [rsp+gprsize*2+16*50] ;t47
|
||||
mova m5, [rsp+gprsize*2+16*50] ;t47
|
||||
mova m3, [rsp+gprsize*2+16*51] ;t48
|
||||
mova m1, [rsp+gprsize*2+16*66] ;t63
|
||||
psubsw m4, m0, m2 ;t47a
|
||||
paddsw m0, m2 ;t32a
|
||||
mova m2, [rsp+gprsize*2+16*34] ;tmp[31]
|
||||
psubsw m4, m0, m5 ;t47a
|
||||
paddsw m0, m5 ;t32a
|
||||
psubsw m5, m1, m3 ;t48a
|
||||
paddsw m1, m3 ;t63a
|
||||
psubw m6, m5, m4 ;t47
|
||||
paddw m5, m4 ;t48
|
||||
mova m2, [rsp+gprsize*2+16*34] ;tmp[31]
|
||||
ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t47, t48
|
||||
mova m3, [rsp+gprsize*2+16*3 ] ;tmp[0 ]
|
||||
pmulhrsw m6, m7 ;t47
|
||||
pmulhrsw m5, m7 ;t48
|
||||
psubsw m4, m2, m0 ;out32
|
||||
psubsw m6, m2, m0 ;out32
|
||||
paddsw m2, m0 ;out31
|
||||
psubsw m0, m3, m1 ;out63
|
||||
paddsw m3, m1 ;out0
|
||||
mova [rsp+gprsize*2+16*35], m4 ;out32
|
||||
mova [rsp+gprsize*2+16*35], m6 ;out32
|
||||
mova [rsp+gprsize*2+16*34], m2 ;out31
|
||||
mova [rsp+gprsize*2+16*66], m0 ;out63
|
||||
mova [rsp+gprsize*2+16*3 ], m3 ;out0
|
||||
mova m0, [rsp+gprsize*2+16*19] ;tmp[16]
|
||||
mova m1, [rsp+gprsize*2+16*18] ;tmp[15]
|
||||
psubsw m2, m0, m6 ;out47
|
||||
paddsw m0, m6 ;out16
|
||||
psubsw m3, m1, m5 ;out48
|
||||
paddsw m1, m5 ;out15
|
||||
psubsw m2, m0, m5 ;out47
|
||||
paddsw m0, m5 ;out16
|
||||
psubsw m3, m1, m4 ;out48
|
||||
paddsw m1, m4 ;out15
|
||||
mova [rsp+gprsize*2+16*50], m2 ;out47
|
||||
mova [rsp+gprsize*2+16*19], m0 ;out16
|
||||
mova [rsp+gprsize*2+16*51], m3 ;out48
|
||||
|
@ -6273,7 +6218,6 @@ ALIGN function_align
|
|||
ret
|
||||
|
||||
|
||||
|
||||
cglobal inv_txfm_add_dct_dct_64x16, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx2
|
||||
%if ARCH_X86_32
|
||||
LEA r5, $$
|
||||
|
|
|
@ -88,7 +88,11 @@ decl_blend_dir_fn(dav1d_blend_h_avx2);
|
|||
decl_blend_dir_fn(dav1d_blend_h_ssse3);
|
||||
|
||||
/* Declarations of the warp-affine 8x8 and 8x8t routines, one per SIMD
 * variant (avx2, sse4, ssse3). dav1d_mc_dsp_init_x86() stores the ssse3 and
 * sse4 versions into c->warp8x8 / c->warp8x8t depending on the CPU flags. */
decl_warp8x8_fn(dav1d_warp_affine_8x8_avx2);
|
||||
decl_warp8x8_fn(dav1d_warp_affine_8x8_sse4);
|
||||
decl_warp8x8_fn(dav1d_warp_affine_8x8_ssse3);
|
||||
decl_warp8x8t_fn(dav1d_warp_affine_8x8t_avx2);
|
||||
decl_warp8x8t_fn(dav1d_warp_affine_8x8t_sse4);
|
||||
decl_warp8x8t_fn(dav1d_warp_affine_8x8t_ssse3);
|
||||
|
||||
decl_emu_edge_fn(dav1d_emu_edge_avx2);
|
||||
decl_emu_edge_fn(dav1d_emu_edge_ssse3);
|
||||
|
@ -134,9 +138,21 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
|
|||
c->blend = dav1d_blend_ssse3;
|
||||
c->blend_v = dav1d_blend_v_ssse3;
|
||||
c->blend_h = dav1d_blend_h_ssse3;
|
||||
|
||||
c->warp8x8 = dav1d_warp_affine_8x8_ssse3;
|
||||
c->warp8x8t = dav1d_warp_affine_8x8t_ssse3;
|
||||
|
||||
c->emu_edge = dav1d_emu_edge_ssse3;
|
||||
#endif
|
||||
|
||||
if(!(flags & DAV1D_X86_CPU_FLAG_SSE41))
|
||||
return;
|
||||
|
||||
#if BITDEPTH == 8
|
||||
c->warp8x8 = dav1d_warp_affine_8x8_sse4;
|
||||
c->warp8x8t = dav1d_warp_affine_8x8t_sse4;
|
||||
#endif
|
||||
|
||||
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2))
|
||||
return;
|
||||
|
||||
|
|
|
@ -44,6 +44,10 @@ obmc_masks: db 0, 0, 0, 0
|
|||
db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9
|
||||
db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2
|
||||
|
||||
; Four 16-byte index tables for the warp 8x8 code. Each table is four groups
; of four byte indices stepping by 2 (even, odd, odd, even+2 starting points).
; shufC, shufB and shufD are shufA with every index offset by 2, 4 and 6.
; NOTE(review): presumably used as pshufb control masks - confirm at use site.
warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8
|
||||
warp_8x8_shufB: db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12
|
||||
warp_8x8_shufC: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10
|
||||
warp_8x8_shufD: db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14
|
||||
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
|
||||
subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
|
||||
db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14
|
||||
|
@ -53,17 +57,18 @@ subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
|
|||
bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11
|
||||
bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
|
||||
|
||||
pb_64: times 16 db 64
|
||||
pw_8: times 8 dw 8
|
||||
pw_26: times 8 dw 26
|
||||
pw_34: times 8 dw 34
|
||||
pw_512: times 8 dw 512
|
||||
pw_1024: times 8 dw 1024
|
||||
pw_2048: times 8 dw 2048
|
||||
pw_6903: times 8 dw 6903
|
||||
pw_8192: times 8 dw 8192
|
||||
pd_32: times 4 dd 32
|
||||
pd_512: times 4 dd 512
|
||||
pb_64: times 16 db 64
|
||||
pw_8: times 8 dw 8
|
||||
pw_26: times 8 dw 26
|
||||
pw_34: times 8 dw 34
|
||||
pw_512: times 8 dw 512
|
||||
pw_1024: times 8 dw 1024
|
||||
pw_2048: times 8 dw 2048
|
||||
pw_6903: times 8 dw 6903
|
||||
pw_8192: times 8 dw 8192
|
||||
pd_32: times 4 dd 32
|
||||
pd_512: times 4 dd 512
|
||||
pd_32768: times 4 dd 32768
|
||||
|
||||
pw_258: times 2 dw 258
|
||||
|
||||
|
@ -146,6 +151,8 @@ HV_JMP_TABLE prep, bilin, ssse3, 7, 4, 8, 16, 32, 64, 128
|
|||
|
||||
%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
|
||||
|
||||
cextern mc_warp_filter
|
||||
|
||||
SECTION .text
|
||||
|
||||
INIT_XMM ssse3
|
||||
|
@ -3302,6 +3309,580 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
|
|||
jg .hv_w8_loop0
|
||||
RET
|
||||
|
||||
; Support macros for the warp_affine functions below.
; In the x86-32 build of WARP_AFFINE_8X8, alpha/delta are both %defined to r0
; and beta/gamma to r1, so each pair is swapped through its own memory slot
; (alpham/betam/deltam/gammam). mx and my are swapped through mxm/mym in the
; same macros.
%if ARCH_X86_32
|
||||
; Store the alpha/beta registers to their memory slots.
%macro SAVE_ALPHA_BETA 0
|
||||
mov alpham, alphad
|
||||
mov betam, betad
|
||||
%endmacro
|
||||
|
||||
; Store the delta/gamma registers to their memory slots.
%macro SAVE_DELTA_GAMMA 0
|
||||
mov deltam, deltad
|
||||
mov gammam, gammad
|
||||
%endmacro
|
||||
|
||||
; Switch to the horizontal-pass register set: save my, then reload alpha,
; beta and mx from memory.
%macro LOAD_ALPHA_BETA_MX 0
|
||||
mov mym, myd
|
||||
mov alphad, alpham
|
||||
mov betad, betam
|
||||
mov mxd, mxm
|
||||
%endmacro
|
||||
|
||||
; Switch to the vertical-pass register set: save mx, then reload delta,
; gamma and my from memory.
%macro LOAD_DELTA_GAMMA_MY 0
|
||||
mov mxm, mxd
|
||||
mov deltad, deltam
|
||||
mov gammad, gammam
|
||||
mov myd, mym
|
||||
%endmacro
|
||||
|
||||
; Data symbols are addressed relative to PIC_reg, which .main loads with the
; address of $$ (`LEA PIC_reg, $$`); PIC_sym() applies the symbol's offset
; from $$ to that register.
%define PIC_reg r2
|
||||
%define PIC_base_offset $$
|
||||
%define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
|
||||
; x86-64: the SAVE_* macros expand to nothing and PIC_sym() is the symbol
; itself. The LOAD_* macros are not defined here; they are only invoked from
; x86-32-only branches.
%else
|
||||
%define SAVE_ALPHA_BETA
|
||||
%define SAVE_DELTA_GAMMA
|
||||
%define PIC_sym(sym) sym
|
||||
%endif
|
||||
|
||||
; copy_args (x86-32 only) is the number of extra stack bytes reserved by the
; warp_affine cglobal declarations (-0x130-copy_args) and also the switch that
; makes RELOC_ARGS copy the incoming arguments into local slots. It is 8*4
; when STACK_ALIGNMENT < required_stack_alignment and 0 otherwise.
; NOTE(review): presumably the copies are needed because the original argument
; slots are awkward to reach once the stack has been realigned - confirm
; against x86inc.
%if ARCH_X86_32
|
||||
%if STACK_ALIGNMENT < required_stack_alignment
|
||||
%assign copy_args 8*4
|
||||
%else
|
||||
%assign copy_args 0
|
||||
%endif
|
||||
%endif
|
||||
|
||||
; When copy_args is non-zero, copy the incoming arguments r0m-r3m, r5m and r6m
; into the local slots dstm, dsm, srcm, ssm, mxm and mym. r4m is not copied.
; Clobbers r0-r3 and r5 (r0 is reused as scratch for the r6m -> mym copy).
; Expands to nothing when copy_args is 0.
%macro RELOC_ARGS 0
|
||||
%if copy_args
|
||||
mov r0, r0m
|
||||
mov r1, r1m
|
||||
mov r2, r2m
|
||||
mov r3, r3m
|
||||
mov r5, r5m
|
||||
mov dstm, r0
|
||||
mov dsm, r1
|
||||
mov srcm, r2
|
||||
mov ssm, r3
|
||||
mov mxm, r5
|
||||
mov r0, r6m
|
||||
mov mym, r0
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
; Replace the high word of every dword in %1 with the high word of the
; corresponding dword in %2.
;   sse4:      a single pblendw with mask 0xAA (odd words taken from %2).
;   otherwise: %2 is masked with m10 and OR-ed into %1. This path clobbers %2,
;              expects m10 to hold the blend mask (callers load it from
;              `blendmask`, built as all-ones shifted left by 16 per dword) and
;              expects the high words of %1 to be zero already (callers
;              produce %1 with psrld ..., 16).
%macro BLENDHWDW 2 ; blend high words from dwords, src1, src2
|
||||
%if cpuflag(sse4)
|
||||
pblendw %1, %2, 0xAA
|
||||
%else
|
||||
pand %2, m10
|
||||
por %1, %2
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
; Vertical filter step of the warp.
;   %1, %2  - destinations for the two dword accumulator vectors
;             (filters a-d go to %1, filters e-h go to %2)
;   %3-%10  - source operands, multiplied with the unpacked filter
;             coefficients via pmaddwd; %3/%5/%7/%9 feed %1 and
;             %4/%6/%8/%10 feed %2
; Eight 8-byte filters (a-h) are fetched from the table at filterq, indexed by
; (my + k*delta) >> 10 for k = 0..7. The final lea adds gamma to my+3*delta;
; since .main already subtracted delta*3 from gamma, the net effect is the
; "my += gamma" noted below.
; Uses tmp1/tmp2 as scratch and clobbers m0-m3 plus m8/m9/m14/m15 (or their
; x86-32 aliases).
%macro WARP_V 10 ; dst0, dst1, 0, 2, 4, 6, 1, 3, 5, 7
|
||||
; Can be done using gathers, but that's terribly slow on many CPU:s
|
||||
; x86-32 has only m0-m7: alias the high register names onto m4-m7 and zero
; m11 (= m7) locally, since no zero register is kept live across the call.
%if ARCH_X86_32
|
||||
%define m8 m4
|
||||
%define m9 m5
|
||||
%define m14 m6
|
||||
%define m15 m7
|
||||
%define m11 m7
|
||||
pxor m11, m11
|
||||
%endif
|
||||
lea tmp1d, [myq+deltaq*4]
|
||||
lea tmp2d, [myq+deltaq*1]
|
||||
shr myd, 10
|
||||
shr tmp1d, 10
|
||||
movq m2, [filterq+myq *8] ; a
|
||||
movq m8, [filterq+tmp1q*8] ; e
|
||||
lea tmp1d, [tmp2q+deltaq*4]
|
||||
lea myd, [tmp2q+deltaq*1]
|
||||
shr tmp2d, 10
|
||||
shr tmp1d, 10
|
||||
movq m3, [filterq+tmp2q*8] ; b
|
||||
movq m0, [filterq+tmp1q*8] ; f
|
||||
punpcklwd m2, m3
|
||||
punpcklwd m8, m0
|
||||
lea tmp1d, [myq+deltaq*4]
|
||||
lea tmp2d, [myq+deltaq*1]
|
||||
shr myd, 10
|
||||
shr tmp1d, 10
|
||||
movq m0, [filterq+myq *8] ; c
|
||||
movq m9, [filterq+tmp1q*8] ; g
|
||||
lea tmp1d, [tmp2q+deltaq*4]
|
||||
lea myd, [tmp2q+gammaq] ; my += gamma
|
||||
shr tmp2d, 10
|
||||
shr tmp1d, 10
|
||||
movq m3, [filterq+tmp2q*8] ; d
|
||||
movq m1, [filterq+tmp1q*8] ; h
|
||||
punpcklwd m0, m3
|
||||
punpcklwd m9, m1
|
||||
punpckldq m1, m2, m0
|
||||
punpckhdq m2, m0
|
||||
; Interleaving with the zero register m11 as the low byte places each
; coefficient in the high byte of a word, hence the "<< 8" in the notes.
punpcklbw m0, m11, m1 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8
|
||||
punpckhbw m3, m11, m1 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8
|
||||
punpcklbw m1, m11, m2 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8
|
||||
punpckhbw m14, m11, m2 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8
|
||||
pmaddwd m0, %3
|
||||
pmaddwd m3, %5
|
||||
pmaddwd m1, %7
|
||||
pmaddwd m14, %9
|
||||
paddd m0, m3
|
||||
paddd m1, m14
|
||||
paddd m0, m1
|
||||
; First accumulator vector (filters a-d) is complete.
mova %1, m0
|
||||
; NOTE(review): SWAP is x86inc's register-renaming directive; presumably it
; is used here so the second half does not overwrite the register the caller
; knows as m14, and is undone at the end of the macro - confirm.
%if ARCH_X86_64
|
||||
SWAP m3, m14
|
||||
%endif
|
||||
punpckldq m0, m8, m9
|
||||
punpckhdq m8, m9
|
||||
punpcklbw m1, m11, m0 ; e0 e2 f0 f2 g0 g2 h0 h2 << 8
|
||||
punpckhbw m14, m11, m0 ; e4 e6 f4 f6 g4 g6 h4 h6 << 8
|
||||
punpcklbw m2, m11, m8 ; e1 e3 f1 f3 g1 g3 h1 h3 << 8
|
||||
punpckhbw m15, m11, m8 ; e5 e7 f5 f7 g5 g7 h5 h7 << 8
|
||||
pmaddwd m1, %4
|
||||
pmaddwd m14, %6
|
||||
pmaddwd m2, %8
|
||||
pmaddwd m15, %10
|
||||
paddd m1, m14
|
||||
paddd m2, m15
|
||||
paddd m1, m2
|
||||
; Second accumulator vector (filters e-h) is complete.
mova %2, m1
|
||||
%if ARCH_X86_64
|
||||
SWAP m14, m3
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
; Loop counter shared by warp_affine_8x8 and warp_affine_8x8t (set to 4 in
; .main, decremented once per pair of output rows).
;   x86-64:                    the r4d register
;   x86-32, copy_args == 0:    the r4m argument slot
;   x86-32, copy_args != 0:    a dedicated slot at [esp+stack_size-4*7]
%if ARCH_X86_64
|
||||
%define counterd r4d
|
||||
%else
|
||||
%if copy_args == 0
|
||||
%define counterd dword r4m
|
||||
%else
|
||||
%define counterd dword [esp+stack_size-4*7]
|
||||
%endif
|
||||
%endif
|
||||
|
||||
; Emits warp_affine_8x8t for the instruction set selected by the preceding
; INIT_XMM. It reuses the .main, .main2 and .end labels of the matching
; warp_affine_8x8 function and differs only in how results are stored: the
; four dword accumulator vectors are shifted right by 13, packed to signed
; words, rounded with pmulhrsw against pw_8192 and written as two 16-byte rows
; to tmp. tsq is scaled by 2 to reach the next row, so ts is presumably a
; stride counted in 16-bit elements (NOTE(review): confirm against the C
; prototype).
%macro WARP_AFFINE_8X8T 0
|
||||
%if ARCH_X86_64
|
||||
cglobal warp_affine_8x8t, 6, 14, 16, 0x90, tmp, ts
|
||||
%else
|
||||
cglobal warp_affine_8x8t, 0, 7, 16, -0x130-copy_args, tmp, ts
|
||||
; With copy_args, tmp/ts use the same local slots that warp_affine_8x8
; defines for dst/ds.
%if copy_args
|
||||
%define tmpm [esp+stack_size-4*1]
|
||||
%define tsm [esp+stack_size-4*2]
|
||||
%endif
|
||||
%endif
|
||||
; One-time setup and first rows; also sets counterd to 4.
call mangle(private_prefix %+ _warp_affine_8x8_%+cpuname).main
|
||||
.loop:
|
||||
; x86-32: the accumulators were left in stack slots 0xC0-0xF0 by WARP_V
; (addressed there as esp+gprsize+0xC0.. from inside the call).
%if ARCH_X86_32
|
||||
%define m12 m4
|
||||
%define m13 m5
|
||||
%define m14 m6
|
||||
%define m15 m7
|
||||
mova m12, [esp+0xC0]
|
||||
mova m13, [esp+0xD0]
|
||||
mova m14, [esp+0xE0]
|
||||
mova m15, [esp+0xF0]
|
||||
%endif
|
||||
psrad m12, 13
|
||||
psrad m13, 13
|
||||
psrad m14, 13
|
||||
psrad m15, 13
|
||||
packssdw m12, m13
|
||||
packssdw m14, m15
|
||||
mova m13, [PIC_sym(pw_8192)]
|
||||
pmulhrsw m12, m13 ; (x + (1 << 6)) >> 7
|
||||
pmulhrsw m14, m13
|
||||
; Two output rows of eight words each.
mova [tmpq+tsq*0], m12
|
||||
mova [tmpq+tsq*2], m14
|
||||
dec counterd
|
||||
; Done after four iterations: leave through warp_affine_8x8's epilogue.
jz mangle(private_prefix %+ _warp_affine_8x8_%+cpuname).end
|
||||
; x86-32: save the output pointer and reload r0/r1 from the slots at 0x100 and
; 0x104 (the alpham/betam slots of warp_affine_8x8) before the next .h calls.
%if ARCH_X86_32
|
||||
mov tmpm, tmpd
|
||||
mov r0, [esp+0x100]
|
||||
mov r1, [esp+0x104]
|
||||
%endif
|
||||
call mangle(private_prefix %+ _warp_affine_8x8_%+cpuname).main2
|
||||
; Advance by two rows.
lea tmpq, [tmpq+tsq*4]
|
||||
jmp .loop
|
||||
%endmacro
|
||||
|
||||
; Emits warp_affine_8x8 for the instruction set selected by the preceding
; INIT_XMM. Layout of the emitted code:
;   entry   - calls .main once, then enters the store loop at .start
;   .start  - converts the four dword accumulator vectors produced by WARP_V
;             into 16 bytes and stores them as two 8-byte rows of dst;
;             repeats via .loop until counterd (4) runs out, i.e. 8 rows
;   .main   - one-time setup (reads alpha/beta/delta/gamma from abcd, biases
;             mx/my, rewinds src) and the first seven .h calls, then falls
;             through into .main2
;   .main2  - two more .h calls and two WARP_V invocations, then rotates the
;             buffered rows for the next iteration
;   .h      - horizontal filtering of one source row, results in m0/m1
; The .main, .main2 and .end labels are also used by WARP_AFFINE_8X8T.
%macro WARP_AFFINE_8X8 0
|
||||
%if ARCH_X86_64
|
||||
cglobal warp_affine_8x8, 6, 14, 16, 0x90, \
|
||||
dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
|
||||
filter, tmp1, delta, my, gamma
|
||||
%else
|
||||
cglobal warp_affine_8x8, 0, 7, 16, -0x130-copy_args, \
|
||||
dst, ds, src, ss, abcd, mx, tmp2, alpha, beta, \
|
||||
filter, tmp1, delta, my, gamma
|
||||
; x86-32: the names past tmp2 are mapped by hand. alpha/delta share r0 and
; beta/gamma share r1, each value with its own spill slot; the q and d forms
; are the same 32-bit register.
%define alphaq r0
|
||||
%define alphad r0
|
||||
%define alpham [esp+gprsize+0x100]
|
||||
%define betaq r1
|
||||
%define betad r1
|
||||
%define betam [esp+gprsize+0x104]
|
||||
%define deltaq r0
|
||||
%define deltad r0
|
||||
%define deltam [esp+gprsize+0x108]
|
||||
%define gammaq r1
|
||||
%define gammad r1
|
||||
%define gammam [esp+gprsize+0x10C]
|
||||
%define filterq r3
|
||||
%define tmp1q r4
|
||||
%define tmp1d r4
|
||||
%define tmp1m [esp+gprsize+0x110]
|
||||
%define myq r5
|
||||
%define myd r5
|
||||
%define mym r6m
|
||||
; With copy_args the arguments live in local slots filled by RELOC_ARGS
; (this also overrides the mym definition above).
%if copy_args
|
||||
%define dstm [esp+stack_size-4*1]
|
||||
%define dsm [esp+stack_size-4*2]
|
||||
%define srcm [esp+stack_size-4*3]
|
||||
%define ssm [esp+stack_size-4*4]
|
||||
%define mxm [esp+stack_size-4*5]
|
||||
%define mym [esp+stack_size-4*6]
|
||||
%endif
|
||||
%endif
|
||||
call .main
|
||||
jmp .start
|
||||
.loop:
|
||||
; x86-32: save dst and reload alpha/beta (slots alpham/betam as seen from
; outside the call) because .main2 starts with a horizontal pass.
%if ARCH_X86_32
|
||||
mov dstm, dstd
|
||||
mov alphad, [esp+0x100]
|
||||
mov betad, [esp+0x104]
|
||||
%endif
|
||||
call .main2
|
||||
; Advance dst by the two rows stored in the previous iteration.
lea dstq, [dstq+dsq*2]
|
||||
.start:
|
||||
; m10 = pw_8192 for the pmulhrsw rounding below; on x86-32 m10 is defined as
; the memory operand itself instead of a register.
%if cpuflag(ssse3)
|
||||
%if ARCH_X86_64
|
||||
mova m10, [PIC_sym(pw_8192)]
|
||||
%else
|
||||
%define m10 [PIC_sym(pw_8192)]
|
||||
%endif
|
||||
%endif
|
||||
; x86-32: fetch the first accumulator pair from the slots written by WARP_V.
%if ARCH_X86_32
|
||||
%define m12 m5
|
||||
%define m13 m6
|
||||
mova m12, [esp+0xC0]
|
||||
mova m13, [esp+0xD0]
|
||||
%endif
|
||||
; sse4 path: shift by 18, pack with unsigned saturation, then pavgw against
; zero. Other path: shift by 17, pack with signed saturation, then pmulhrsw
; with pw_8192.
%if cpuflag(sse4)
|
||||
%if ARCH_X86_32
|
||||
%define m11 m4
|
||||
pxor m11, m11
|
||||
%endif
|
||||
psrad m12, 18
|
||||
psrad m13, 18
|
||||
packusdw m12, m13
|
||||
pavgw m12, m11 ; (x + (1 << 10)) >> 11
|
||||
%else
|
||||
psrad m12, 17
|
||||
psrad m13, 17
|
||||
packssdw m12, m13
|
||||
pmulhrsw m12, m10 ; (x + (1 << 10)) >> 11
|
||||
%endif
|
||||
; Same conversion for the second accumulator pair.
%if ARCH_X86_32
|
||||
%define m14 m6
|
||||
%define m15 m7
|
||||
mova m14, [esp+0xE0]
|
||||
mova m15, [esp+0xF0]
|
||||
%endif
|
||||
%if cpuflag(sse4)
|
||||
psrad m14, 18
|
||||
psrad m15, 18
|
||||
packusdw m14, m15
|
||||
pavgw m14, m11 ; (x + (1 << 10)) >> 11
|
||||
%else
|
||||
psrad m14, 17
|
||||
psrad m15, 17
|
||||
packssdw m14, m15
|
||||
pmulhrsw m14, m10 ; (x + (1 << 10)) >> 11
|
||||
%endif
|
||||
; Pack to bytes with unsigned saturation and store two rows of eight pixels.
packuswb m12, m14
|
||||
movq [dstq+dsq*0], m12
|
||||
movhps [dstq+dsq*1], m12
|
||||
dec counterd
|
||||
jg .loop
|
||||
.end:
|
||||
RET
|
||||
ALIGN function_align
|
||||
.main:
|
||||
; NOTE(review): the stack_offset/stack_size adjustments presumably account for
; the return address pushed by `call .main`, so that x86inc's argument and
; stack macros keep resolving to the right addresses - confirm.
%assign stack_offset stack_offset+gprsize
|
||||
%if ARCH_X86_32
|
||||
%assign stack_size stack_size+4
|
||||
%if copy_args
|
||||
%assign stack_offset stack_offset-4
|
||||
%endif
|
||||
RELOC_ARGS
|
||||
LEA PIC_reg, $$
|
||||
%define PIC_mem [esp+gprsize+0x114]
|
||||
mov abcdd, abcdm
|
||||
%if copy_args == 0
|
||||
mov ssd, ssm
|
||||
mov mxd, mxm
|
||||
%endif
|
||||
; PIC_reg is spilled while srcd is in use and restored further down
; (src is the third cglobal argument, so presumably it is r2 as well).
mov PIC_mem, PIC_reg
|
||||
mov srcd, srcm
|
||||
%endif
|
||||
; abcd is read as four signed 16-bit values: [0]=alpha, [1]=beta, [2]=delta,
; [3]=gamma.
movsx deltad, word [abcdq+2*2]
|
||||
movsx gammad, word [abcdq+2*3]
|
||||
lea tmp1d, [deltaq*3]
|
||||
sub gammad, tmp1d ; gamma -= delta*3
|
||||
SAVE_DELTA_GAMMA
|
||||
; x86-32: reload the abcd pointer before reading alpha/beta.
%if ARCH_X86_32
|
||||
mov abcdd, abcdm
|
||||
%endif
|
||||
movsx alphad, word [abcdq+2*0]
|
||||
movsx betad, word [abcdq+2*1]
|
||||
lea tmp1q, [ssq*3+3]
|
||||
add mxd, 512+(64<<10)
|
||||
lea tmp2d, [alphaq*3]
|
||||
sub srcq, tmp1q ; src -= src_stride*3 + 3
|
||||
%if ARCH_X86_32
|
||||
mov srcm, srcd
|
||||
mov PIC_reg, PIC_mem
|
||||
%endif
|
||||
sub betad, tmp2d ; beta -= alpha*3
|
||||
lea filterq, [PIC_sym(mc_warp_filter)]
|
||||
; x86-64 keeps my in a register and a zero vector in m11 for the whole call.
%if ARCH_X86_64
|
||||
mov myd, r6m
|
||||
pxor m11, m11
|
||||
%endif
|
||||
; Seven horizontal passes follow. After each one, the high words of the new
; m0/m1 results are blended into the shifted results of an earlier row, so
; every dword ends up holding words from two different rows.
call .h
|
||||
psrld m2, m0, 16
|
||||
psrld m3, m1, 16
|
||||
%if ARCH_X86_32
|
||||
mova [esp+gprsize+0x10], m3
|
||||
%endif
|
||||
call .h
|
||||
psrld m4, m0, 16
|
||||
psrld m5, m1, 16
|
||||
%if ARCH_X86_32
|
||||
mova [esp+gprsize+0x20], m4
|
||||
mova [esp+gprsize+0x30], m5
|
||||
%endif
|
||||
call .h
|
||||
%if ARCH_X86_64
|
||||
%define blendmask [rsp+gprsize+0x80]
|
||||
%else
|
||||
mova m3, [esp+gprsize+0x10]
|
||||
%define blendmask [esp+gprsize+0x120]
|
||||
%define m10 m7
|
||||
%endif
|
||||
; Build the BLENDHWDW mask (all-ones shifted left by 16 in every dword) and
; keep a copy in memory, since m10 is reused between the blends.
pcmpeqd m10, m10
|
||||
pslld m10, 16
|
||||
mova blendmask, m10
|
||||
BLENDHWDW m2, m0 ; 0
|
||||
BLENDHWDW m3, m1 ; 2
|
||||
mova [rsp+gprsize+0x00], m2
|
||||
mova [rsp+gprsize+0x10], m3
|
||||
call .h
|
||||
%if ARCH_X86_32
|
||||
mova m4, [esp+gprsize+0x20]
|
||||
mova m5, [esp+gprsize+0x30]
|
||||
%endif
|
||||
mova m10, blendmask
|
||||
BLENDHWDW m4, m0 ; 1
|
||||
BLENDHWDW m5, m1 ; 3
|
||||
mova [rsp+gprsize+0x20], m4
|
||||
mova [rsp+gprsize+0x30], m5
|
||||
call .h
|
||||
%if ARCH_X86_32
|
||||
mova m3, [esp+gprsize+0x10]
|
||||
%define m10 m5
|
||||
%endif
|
||||
psrld m6, m2, 16
|
||||
psrld m7, m3, 16
|
||||
mova m10, blendmask
|
||||
BLENDHWDW m6, m0 ; 2
|
||||
BLENDHWDW m7, m1 ; 4
|
||||
mova [rsp+gprsize+0x40], m6
|
||||
mova [rsp+gprsize+0x50], m7
|
||||
call .h
|
||||
%if ARCH_X86_32
|
||||
mova m4, [esp+gprsize+0x20]
|
||||
mova m5, [esp+gprsize+0x30]
|
||||
%endif
|
||||
psrld m2, m4, 16
|
||||
psrld m3, m5, 16
|
||||
mova m10, blendmask
|
||||
BLENDHWDW m2, m0 ; 3
|
||||
BLENDHWDW m3, m1 ; 5
|
||||
mova [rsp+gprsize+0x60], m2
|
||||
mova [rsp+gprsize+0x70], m3
|
||||
call .h
|
||||
%if ARCH_X86_32
|
||||
mova m6, [esp+gprsize+0x40]
|
||||
mova m7, [esp+gprsize+0x50]
|
||||
%define m10 m7
|
||||
%endif
|
||||
psrld m4, m6, 16
|
||||
psrld m5, m7, 16
|
||||
mova m10, blendmask
|
||||
BLENDHWDW m4, m0 ; 4
|
||||
BLENDHWDW m5, m1 ; 6
|
||||
; Apply to my the same bias that was added to mx above; x86-64 keeps the row
; pairs in registers, x86-32 spills them to 0x80/0x90.
%if ARCH_X86_64
|
||||
add myd, 512+(64<<10)
|
||||
mova m6, m2
|
||||
mova m7, m3
|
||||
%else
|
||||
mova [esp+gprsize+0x80], m4
|
||||
mova [esp+gprsize+0x90], m5
|
||||
add dword mym, 512+(64<<10)
|
||||
%endif
|
||||
; Four store iterations of two rows each.
mov counterd, 4
|
||||
SAVE_ALPHA_BETA
|
||||
; Per-iteration work: two horizontal passes, each followed by one WARP_V that
; yields one output row's accumulators (m12/m13 then m14/m15, or stack slots
; 0xC0-0xF0 on x86-32).
.main2:
|
||||
call .h
|
||||
%if ARCH_X86_32
|
||||
mova m6, [esp+gprsize+0x60]
|
||||
mova m7, [esp+gprsize+0x70]
|
||||
%define m10 m5
|
||||
%endif
|
||||
psrld m6, 16
|
||||
psrld m7, 16
|
||||
mova m10, blendmask
|
||||
BLENDHWDW m6, m0 ; 5
|
||||
BLENDHWDW m7, m1 ; 7
|
||||
%if ARCH_X86_64
|
||||
WARP_V m12, m13, [rsp+gprsize+0x00], [rsp+gprsize+0x10], \
|
||||
m4, m5, \
|
||||
[rsp+gprsize+0x20], [rsp+gprsize+0x30], \
|
||||
m6, m7
|
||||
%else
|
||||
mova [esp+gprsize+0xA0], m6
|
||||
mova [esp+gprsize+0xB0], m7
|
||||
LOAD_DELTA_GAMMA_MY
|
||||
WARP_V [esp+gprsize+0xC0], [esp+gprsize+0xD0], \
|
||||
[esp+gprsize+0x00], [esp+gprsize+0x10], \
|
||||
[esp+gprsize+0x80], [esp+gprsize+0x90], \
|
||||
[esp+gprsize+0x20], [esp+gprsize+0x30], \
|
||||
[esp+gprsize+0xA0], [esp+gprsize+0xB0]
|
||||
LOAD_ALPHA_BETA_MX
|
||||
%endif
|
||||
call .h
|
||||
; Rotate the buffered row pairs: 0x40/0x50 move down to 0x00/0x10 and the
; current m4/m5 pair takes their place.
mova m2, [rsp+gprsize+0x40]
|
||||
mova m3, [rsp+gprsize+0x50]
|
||||
%if ARCH_X86_32
|
||||
mova m4, [rsp+gprsize+0x80]
|
||||
mova m5, [rsp+gprsize+0x90]
|
||||
%define m10 m7
|
||||
%endif
|
||||
mova [rsp+gprsize+0x00], m2
|
||||
mova [rsp+gprsize+0x10], m3
|
||||
mova [rsp+gprsize+0x40], m4
|
||||
mova [rsp+gprsize+0x50], m5
|
||||
psrld m4, 16
|
||||
psrld m5, 16
|
||||
mova m10, blendmask
|
||||
BLENDHWDW m4, m0 ; 6
|
||||
BLENDHWDW m5, m1 ; 8
|
||||
%if ARCH_X86_64
|
||||
WARP_V m14, m15, [rsp+gprsize+0x20], [rsp+gprsize+0x30], \
|
||||
m6, m7, \
|
||||
[rsp+gprsize+0x00], [rsp+gprsize+0x10], \
|
||||
m4, m5
|
||||
%else
|
||||
mova [esp+gprsize+0x80], m4
|
||||
mova [esp+gprsize+0x90], m5
|
||||
LOAD_DELTA_GAMMA_MY
|
||||
WARP_V [esp+gprsize+0xE0], [esp+gprsize+0xF0], \
|
||||
[esp+gprsize+0x20], [esp+gprsize+0x30], \
|
||||
[esp+gprsize+0xA0], [esp+gprsize+0xB0], \
|
||||
[esp+gprsize+0x00], [esp+gprsize+0x10], \
|
||||
[esp+gprsize+0x80], [esp+gprsize+0x90]
|
||||
; Save my and reload dst, ds and mx for the caller's store loop.
mov mym, myd
|
||||
mov dstd, dstm
|
||||
mov dsd, dsm
|
||||
mov mxd, mxm
|
||||
%endif
|
||||
; Second half of the rotation: 0x60/0x70 move down to 0x20/0x30 and the
; m6/m7 pair takes their place.
mova m2, [rsp+gprsize+0x60]
|
||||
mova m3, [rsp+gprsize+0x70]
|
||||
%if ARCH_X86_32
|
||||
mova m6, [esp+gprsize+0xA0]
|
||||
mova m7, [esp+gprsize+0xB0]
|
||||
%endif
|
||||
mova [rsp+gprsize+0x20], m2
|
||||
mova [rsp+gprsize+0x30], m3
|
||||
mova [rsp+gprsize+0x60], m6
|
||||
mova [rsp+gprsize+0x70], m7
|
||||
ret
|
||||
ALIGN function_align
|
||||
; Horizontal pass for one source row. Loads 16 bytes at src and advances src
; by the stride. Fetches eight 8-byte filters from the table at filterq,
; indexed by (mx + k*alpha) >> 10 for k = 0..7. The final lea adds beta to
; mx+3*alpha; since .main already subtracted alpha*3 from beta, the net effect
; is the "mx += beta" noted below. Results are returned in m0 and m1;
; tmp1/tmp2 are scratch.
.h:
|
||||
%if ARCH_X86_32
|
||||
%define m8 m3
|
||||
%define m9 m4
|
||||
%define m10 m5
|
||||
%define m14 m6
|
||||
%define m15 m7
|
||||
%endif
|
||||
lea tmp1d, [mxq+alphaq*4]
|
||||
lea tmp2d, [mxq+alphaq*1]
|
||||
; x86-32: PIC_reg is spilled while srcd holds the source pointer and restored
; right after the load. NOTE(review): the extra gprsize in PIC_mem and the
; stack_offset/stack_size bumps presumably account for the second return
; address on the stack (.h is called from inside .main) - confirm.
%if ARCH_X86_32
|
||||
%assign stack_offset stack_offset+4
|
||||
%assign stack_size stack_size+4
|
||||
%define PIC_mem [esp+gprsize*2+0x114]
|
||||
mov PIC_mem, PIC_reg
|
||||
mov srcd, srcm
|
||||
%endif
|
||||
movu m10, [srcq]
|
||||
%if ARCH_X86_32
|
||||
add srcd, ssm
|
||||
mov srcm, srcd
|
||||
mov PIC_reg, PIC_mem
|
||||
%else
|
||||
add srcq, ssq
|
||||
%endif
|
||||
shr mxd, 10
|
||||
shr tmp1d, 10
|
||||
movq m1, [filterq+mxq *8] ; 0 X
|
||||
movq m8, [filterq+tmp1q*8] ; 4 X
|
||||
lea tmp1d, [tmp2q+alphaq*4]
|
||||
lea mxd, [tmp2q+alphaq*1]
|
||||
shr tmp2d, 10
|
||||
shr tmp1d, 10
|
||||
movhps m1, [filterq+tmp2q*8] ; 0 1
|
||||
movhps m8, [filterq+tmp1q*8] ; 4 5
|
||||
lea tmp1d, [mxq+alphaq*4]
|
||||
lea tmp2d, [mxq+alphaq*1]
|
||||
shr mxd, 10
|
||||
shr tmp1d, 10
|
||||
movq m14, [filterq+mxq *8] ; 2 X
|
||||
movq m9, [filterq+tmp1q*8] ; 6 X
|
||||
lea tmp1d, [tmp2q+alphaq*4]
|
||||
lea mxd, [tmp2q+betaq] ; mx += beta
|
||||
shr tmp2d, 10
|
||||
shr tmp1d, 10
|
||||
movhps m14, [filterq+tmp2q*8] ; 2 3
|
||||
movhps m9, [filterq+tmp1q*8] ; 6 7
|
||||
; Arrange the source bytes with the warp_8x8_shufA-D patterns so each filter
; pair lines up with its input window, multiply-accumulate with pmaddubsw,
; then sum adjacent words with phaddw.
pshufb m0, m10, [PIC_sym(warp_8x8_shufA)]
|
||||
pmaddubsw m0, m1
|
||||
pshufb m1, m10, [PIC_sym(warp_8x8_shufB)]
|
||||
pmaddubsw m1, m8
|
||||
pshufb m15, m10, [PIC_sym(warp_8x8_shufC)]
|
||||
pmaddubsw m15, m14
|
||||
pshufb m10, m10, [PIC_sym(warp_8x8_shufD)]
|
||||
pmaddubsw m10, m9
|
||||
mova m14, [PIC_sym(pw_8192)]
|
||||
mova m9, [PIC_sym(pd_32768)]
|
||||
phaddw m0, m15
|
||||
phaddw m1, m10
|
||||
pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13
|
||||
pmaddwd m1, m14
|
||||
paddd m0, m9 ; rounded 14-bit result in upper 16 bits of dword
|
||||
paddd m1, m9
|
||||
ret
|
||||
%endmacro
|
||||
|
||||
; Instantiate both warp functions once per instruction set. WARP_AFFINE_8X8T
; refers to the .main/.main2/.end labels of the warp_affine_8x8 function with
; the same cpuname suffix, so the two macros are always expanded as a pair
; under the same INIT_XMM.
INIT_XMM sse4
|
||||
WARP_AFFINE_8X8
|
||||
WARP_AFFINE_8X8T
|
||||
|
||||
INIT_XMM ssse3
|
||||
WARP_AFFINE_8X8
|
||||
WARP_AFFINE_8X8T
|
||||
|
||||
%if WIN64
|
||||
DECLARE_REG_TMP 6, 4
|
||||
%else
|
||||
|
|
|
@ -142,6 +142,7 @@ static struct {
|
|||
unsigned int seed;
|
||||
int bench_c;
|
||||
int verbose;
|
||||
int function_listing;
|
||||
} state;
|
||||
|
||||
/* float compare support code */
|
||||
|
@ -365,6 +366,14 @@ static void print_benchs(const CheckasmFunc *const f) {
|
|||
}
|
||||
#endif
|
||||
|
||||
/* Print the name of every function in the tree rooted at f, one per line,
 * visiting child[0], then the node itself, then child[1]. A NULL subtree
 * prints nothing. */
static void print_functions(const CheckasmFunc *const f) {
|
||||
    if (!f)
|
||||
        return;
|
||||
|
||||
    const CheckasmFunc *const left = f->child[0];
|
||||
    const CheckasmFunc *const right = f->child[1];
|
||||
|
||||
    print_functions(left);
|
||||
    printf("%s\n", f->name);
|
||||
    print_functions(right);
|
||||
}
|
||||
|
||||
#define is_digit(x) ((x) >= '0' && (x) <= '9')
|
||||
|
||||
/* ASCIIbetical sort except preserving natural order for numbers */
|
||||
|
@ -515,7 +524,8 @@ int main(int argc, char *argv[]) {
|
|||
"Options:\n"
|
||||
" --test=<test_name> Test only <test_name>\n"
|
||||
" --bench=<pattern> Test and benchmark the functions matching <pattern>\n"
|
||||
" --list List the available tests\n"
|
||||
" --list-functions List available functions\n"
|
||||
" --list-tests List available tests\n"
|
||||
" --bench-c Benchmark the C-only functions\n"
|
||||
" --verbose -v Print failures verbosely\n");
|
||||
return 0;
|
||||
|
@ -534,11 +544,11 @@ int main(int argc, char *argv[]) {
|
|||
state.bench_pattern = "";
|
||||
} else if (!strncmp(argv[1], "--test=", 7)) {
|
||||
state.test_name = argv[1] + 7;
|
||||
} else if (!strcmp(argv[1], "--list")) {
|
||||
fprintf(stderr, "checkasm: available tests [");
|
||||
for (int i = 0; tests[i].func; i++)
|
||||
fprintf(stderr, "%s%s", i ? ", ": "", tests[i].name);
|
||||
fprintf(stderr, "]\n");
|
||||
} else if (!strcmp(argv[1], "--list-functions")) {
|
||||
state.function_listing = 1;
|
||||
} else if (!strcmp(argv[1], "--list-tests")) {
|
||||
for (int i = 0; tests[i].name; i++)
|
||||
printf("%s\n", tests[i].name);
|
||||
return 0;
|
||||
} else if (!strcmp(argv[1], "--verbose") || !strcmp(argv[1], "-v")) {
|
||||
state.verbose = 1;
|
||||
|
@ -553,24 +563,28 @@ int main(int argc, char *argv[]) {
|
|||
fprintf(stderr, "checkasm: using random seed %u\n", state.seed);
|
||||
|
||||
check_cpu_flag(NULL, 0);
|
||||
for (int i = 0; cpus[i].flag; i++)
|
||||
check_cpu_flag(cpus[i].name, cpus[i].flag);
|
||||
|
||||
if (!state.num_checked) {
|
||||
fprintf(stderr, "checkasm: no tests to perform\n");
|
||||
} else if (state.num_failed) {
|
||||
fprintf(stderr, "checkasm: %d of %d tests have failed\n",
|
||||
state.num_failed, state.num_checked);
|
||||
ret = 1;
|
||||
if (state.function_listing) {
|
||||
print_functions(state.funcs);
|
||||
} else {
|
||||
fprintf(stderr, "checkasm: all %d tests passed\n", state.num_checked);
|
||||
for (int i = 0; cpus[i].flag; i++)
|
||||
check_cpu_flag(cpus[i].name, cpus[i].flag);
|
||||
|
||||
if (!state.num_checked) {
|
||||
fprintf(stderr, "checkasm: no tests to perform\n");
|
||||
} else if (state.num_failed) {
|
||||
fprintf(stderr, "checkasm: %d of %d tests have failed\n",
|
||||
state.num_failed, state.num_checked);
|
||||
ret = 1;
|
||||
} else {
|
||||
fprintf(stderr, "checkasm: all %d tests passed\n", state.num_checked);
|
||||
#ifdef readtime
|
||||
if (state.bench_pattern) {
|
||||
state.nop_time = measure_nop_time();
|
||||
printf("nop: %d.%d\n", state.nop_time/10, state.nop_time%10);
|
||||
print_benchs(state.funcs);
|
||||
}
|
||||
if (state.bench_pattern) {
|
||||
state.nop_time = measure_nop_time();
|
||||
printf("nop: %d.%d\n", state.nop_time/10, state.nop_time%10);
|
||||
print_benchs(state.funcs);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
destroy_func_tree(state.funcs);
|
||||
|
@ -592,6 +606,10 @@ void *checkasm_check_func(void *const func, const char *const name, ...) {
|
|||
return NULL;
|
||||
|
||||
state.current_func = get_func(&state.funcs, name_buf);
|
||||
|
||||
if (state.function_listing) /* Save function names without running tests */
|
||||
return NULL;
|
||||
|
||||
state.funcs->color = 1;
|
||||
CheckasmFuncVersion *v = &state.current_func->versions;
|
||||
void *ref = func;
|
||||
|
|
|
@ -34,6 +34,12 @@
|
|||
#define UNIT_TEST 1
|
||||
#include "src/fg_apply_tmpl.c"
|
||||
|
||||
/* Chroma subsampling names used when building test names, indexed by
 * (Dav1dPixelLayout value - 1); callers index it with layout_idx where
 * layout == layout_idx + 1. Each entry is three characters plus the NUL. */
static const char ss_name[][4] = {
|
||||
[DAV1D_PIXEL_LAYOUT_I420 - 1] = "420",
|
||||
[DAV1D_PIXEL_LAYOUT_I422 - 1] = "422",
|
||||
[DAV1D_PIXEL_LAYOUT_I444 - 1] = "444",
|
||||
};
|
||||
|
||||
static void check_gen_grny(const Dav1dFilmGrainDSPContext *const dsp) {
|
||||
entry grain_lut_c[GRAIN_HEIGHT][GRAIN_WIDTH];
|
||||
entry grain_lut_a[GRAIN_HEIGHT + 1][GRAIN_WIDTH];
|
||||
|
@ -72,6 +78,64 @@ static void check_gen_grny(const Dav1dFilmGrainDSPContext *const dsp) {
|
|||
report("gen_grain_y");
|
||||
}
|
||||
|
||||
/* Checkasm test for dsp->generate_grain_uv[]. For each of the three layouts
 * and each auto-regression lag 0..3 it fills a Dav1dFilmGrainData with random
 * parameters, builds a luma grain LUT with dsp->generate_grain_y, runs the
 * reference and the tested implementation on separate output LUTs and fails
 * if the compared region differs. The order of the rnd() calls determines
 * the generated test vectors. */
static void check_gen_grnuv(const Dav1dFilmGrainDSPContext *const dsp) {
|
||||
/* Luma LUT shared by both calls, plus one output LUT for the reference (_c)
 * and one for the tested (_a) implementation. */
entry grain_lut_y[GRAIN_HEIGHT + 1][GRAIN_WIDTH];
|
||||
entry grain_lut_c[GRAIN_HEIGHT][GRAIN_WIDTH];
|
||||
entry grain_lut_a[GRAIN_HEIGHT + 1][GRAIN_WIDTH];
|
||||
|
||||
declare_func(void, entry grain_lut[][GRAIN_WIDTH],
|
||||
const entry grain_lut_y[][GRAIN_WIDTH],
|
||||
const Dav1dFilmGrainData *data, intptr_t uv HIGHBD_DECL_SUFFIX);
|
||||
|
||||
for (int layout_idx = 0; layout_idx < 3; layout_idx++) {
|
||||
const enum Dav1dPixelLayout layout = layout_idx + 1;
|
||||
/* Horizontal subsampling for everything but 4:4:4, vertical only for 4:2:0. */
const int ss_x = layout != DAV1D_PIXEL_LAYOUT_I444;
|
||||
const int ss_y = layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
|
||||
/* i is the auto-regression lag (ar_coeff_lag) under test. */
for (int i = 0; i < 4; i++) {
|
||||
if (check_func(dsp->generate_grain_uv[layout_idx],
|
||||
"gen_grain_uv_ar%d_%dbpc_%s",
|
||||
i, BITDEPTH, ss_name[layout_idx]))
|
||||
{
|
||||
/* Only the fields assigned below are initialized. */
Dav1dFilmGrainData fg_data;
|
||||
fg_data.seed = rnd() & 0xFFFF;
|
||||
|
||||
#if BITDEPTH == 16
|
||||
/* Randomly pick a 10-bit or 12-bit maximum; presumably consumed through
 * HIGHBD_TAIL_SUFFIX in the calls below. */
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
|
||||
#endif
|
||||
|
||||
fg_data.num_y_points = rnd() & 1;
|
||||
fg_data.grain_scale_shift = rnd() & 3;
|
||||
fg_data.ar_coeff_shift = (rnd() & 3) + 6;
|
||||
fg_data.ar_coeff_lag = i;
|
||||
/* Number of luma AR coefficients for this lag: 2 * lag * (lag + 1). */
const int num_y_pos = 2 * fg_data.ar_coeff_lag * (fg_data.ar_coeff_lag + 1);
|
||||
for (int n = 0; n < num_y_pos; n++)
|
||||
fg_data.ar_coeffs_y[n] = (rnd() & 0xff) - 128;
|
||||
dsp->generate_grain_y(grain_lut_y, &fg_data HIGHBD_TAIL_SUFFIX);
|
||||
|
||||
/* Pick one of the two chroma planes; its coefficient list has one extra
 * entry when num_y_points is non-zero. */
const int uv = rnd() & 1;
|
||||
const int num_uv_pos = num_y_pos + !!fg_data.num_y_points;
|
||||
for (int n = 0; n < num_uv_pos; n++)
|
||||
fg_data.ar_coeffs_uv[uv][n] = (rnd() & 0xff) - 128;
|
||||
/* Without luma points, zero the entry that follows the random coefficients
 * instead of randomizing it. */
if (!fg_data.num_y_points)
|
||||
fg_data.ar_coeffs_uv[uv][num_uv_pos] = 0;
|
||||
/* Pre-fill both outputs with the same pattern before the two calls. */
memset(grain_lut_c, 0xff, sizeof(grain_lut_c));
|
||||
memset(grain_lut_a, 0xff, sizeof(grain_lut_a));
|
||||
call_ref(grain_lut_c, grain_lut_y, &fg_data, uv HIGHBD_TAIL_SUFFIX);
|
||||
call_new(grain_lut_a, grain_lut_y, &fg_data, uv HIGHBD_TAIL_SUFFIX);
|
||||
/* Compare only the leading w entries of each row: 44 columns when
 * horizontally subsampled (else GRAIN_WIDTH), over 38 rows when vertically
 * subsampled (else GRAIN_HEIGHT). */
int diff = 0, w = ss_x ? 44 : GRAIN_WIDTH;
|
||||
for (int y = 0; y < (ss_y ? 38 : GRAIN_HEIGHT); y++)
|
||||
diff |= memcmp(grain_lut_a[y], grain_lut_c[y], w * sizeof(entry));
|
||||
if (diff) fail();
|
||||
|
||||
bench_new(grain_lut_a, grain_lut_y, &fg_data, uv HIGHBD_TAIL_SUFFIX);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
report("gen_grain_uv");
|
||||
}
|
||||
|
||||
static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
|
||||
ALIGN_STK_32(pixel, c_dst, 128 * 32,);
|
||||
ALIGN_STK_32(pixel, a_dst, 128 * 32,);
|
||||
|
@ -157,11 +221,6 @@ static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
|
|||
int is_identity HIGHBD_DECL_SUFFIX);
|
||||
|
||||
for (int layout_idx = 0; layout_idx < 3; layout_idx++) {
|
||||
const char ss_name[][4] = {
|
||||
[DAV1D_PIXEL_LAYOUT_I420 - 1] = "420",
|
||||
[DAV1D_PIXEL_LAYOUT_I422 - 1] = "422",
|
||||
[DAV1D_PIXEL_LAYOUT_I444 - 1] = "444",
|
||||
};
|
||||
const enum Dav1dPixelLayout layout = layout_idx + 1;
|
||||
const int ss_x = layout != DAV1D_PIXEL_LAYOUT_I444;
|
||||
const int ss_y = layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
|
@ -264,6 +323,7 @@ void bitfn(checkasm_check_filmgrain)(void) {
|
|||
bitfn(dav1d_film_grain_dsp_init)(&c);
|
||||
|
||||
check_gen_grny(&c);
|
||||
check_gen_grnuv(&c);
|
||||
check_fgy_sbrow(&c);
|
||||
check_fguv_sbrow(&c);
|
||||
}
|
||||
|
|
|
@ -29,6 +29,8 @@
|
|||
#include "src/ipred.h"
|
||||
#include "src/levels.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
static const char *const intra_pred_mode_names[N_IMPL_INTRA_PRED_MODES] = {
|
||||
[DC_PRED] = "dc",
|
||||
[DC_128_PRED] = "dc_128",
|
||||
|
@ -83,11 +85,16 @@ static void check_intra_pred(Dav1dIntraPredDSPContext *const c) {
|
|||
{
|
||||
const ptrdiff_t stride = w * sizeof(pixel);
|
||||
|
||||
int a = 0;
|
||||
if (mode >= Z1_PRED && mode <= Z3_PRED) /* angle */
|
||||
int a = 0, maxw = 0, maxh = 0;
|
||||
if (mode >= Z1_PRED && mode <= Z3_PRED) { /* angle */
|
||||
a = (90 * (mode - Z1_PRED) + z_angles[rnd() % 27]) |
|
||||
(rnd() & 0x600);
|
||||
else if (mode == FILTER_PRED) /* filter_idx */
|
||||
if (mode == Z2_PRED) {
|
||||
maxw = rnd(), maxh = rnd();
|
||||
maxw = 1 + (maxw & (maxw & 4096 ? 4095 : w - 1));
|
||||
maxh = 1 + (maxh & (maxh & 4096 ? 4095 : h - 1));
|
||||
}
|
||||
} else if (mode == FILTER_PRED) /* filter_idx */
|
||||
a = (rnd() % 5) | (rnd() & ~511);
|
||||
|
||||
#if BITDEPTH == 16
|
||||
|
@ -99,13 +106,23 @@ static void check_intra_pred(Dav1dIntraPredDSPContext *const c) {
|
|||
for (int i = -h * 2; i <= w * 2; i++)
|
||||
topleft[i] = rnd() & bitdepth_max;
|
||||
|
||||
const int maxw = 1 + (rnd() % 128), maxh = 1 + (rnd() % 128);
|
||||
call_ref(c_dst, stride, topleft, w, h, a, maxw, maxh
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
call_new(a_dst, stride, topleft, w, h, a, maxw, maxh
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
checkasm_check_pixel(c_dst, stride, a_dst, stride,
|
||||
w, h, "dst");
|
||||
if (checkasm_check_pixel(c_dst, stride, a_dst, stride,
|
||||
w, h, "dst"))
|
||||
{
|
||||
if (mode == Z1_PRED || mode == Z3_PRED)
|
||||
fprintf(stderr, "angle = %d (0x%03x)\n",
|
||||
a & 0x1ff, a & 0x600);
|
||||
else if (mode == Z2_PRED)
|
||||
fprintf(stderr, "angle = %d (0x%03x), "
|
||||
"max_width = %d, max_height = %d\n",
|
||||
a & 0x1ff, a & 0x600, maxw, maxh);
|
||||
else if (mode == FILTER_PRED)
|
||||
fprintf(stderr, "filter_idx = %d\n", a & 0x1ff);
|
||||
}
|
||||
|
||||
bench_new(a_dst, stride, topleft, w, h, a, 128, 128
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
|
|
|
@ -192,7 +192,7 @@ static const EnumParseTable cpu_mask_tbl[] = {
|
|||
#if ARCH_AARCH64 || ARCH_ARM
|
||||
{ "neon", DAV1D_ARM_CPU_FLAG_NEON },
|
||||
#elif ARCH_X86
|
||||
{ "sse2", X86_CPU_MASK_SSE },
|
||||
{ "sse2", X86_CPU_MASK_SSE2 },
|
||||
{ "ssse3", X86_CPU_MASK_SSSE3 },
|
||||
{ "sse41", X86_CPU_MASK_SSE41 },
|
||||
{ "avx2", X86_CPU_MASK_AVX2 },
|
||||
|
|
Загрузка…
Ссылка в новой задаче