;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION .text
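
; convolve_fn builds the "copy" and "avg" (no-op filter) convolve kernels,
; optionally in high bit-depth form. As a hedged sketch, each instance
; corresponds to a C prototype of roughly this shape (names mirror the
; cglobal argument list below; the exact aom_dsp header may differ):
;
;   void aom_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride,
;                               uint8_t *dst, ptrdiff_t dst_stride,
;                               const int16_t *fx, int fxs,
;                               const int16_t *fy, int fys,
;                               int w, int h);
;
; The filter arguments fx/fxs/fy/fys are accepted for signature
; compatibility but never read: these kernels only copy or average rows.
; The highbd variants operate on 16-bit samples and take a trailing
; bit-depth argument (bd).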

%macro convolve_fn 1-2
%ifidn %1, avg
; avg uses four extra scratch registers (m4-m7) in the narrow w4/w8 loops.
%define AUX_XMM_REGS 4
%else
%define AUX_XMM_REGS 0
%endif
%ifidn %2, highbd
; pavgw: rounded average of packed 16-bit samples (pavgb for 8-bit below).
%define pavg pavgw
cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
                                              dst, dst_stride, \
                                              fx, fxs, fy, fys, w, h, bd
%else
%define pavg pavgb
cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
                                           dst, dst_stride, \
                                           fx, fxs, fy, fys, w, h
%endif
  mov r4d, dword wm
%ifidn %2, highbd
  ; High bit-depth buffers arrive pre-shifted right by one (libaom's
  ; CONVERT_TO_BYTEPTR convention), so shifting left restores the real
  ; uint16_t addresses; width and strides move from pixels to bytes.
  shl r4d, 1
  shl srcq, 1
  shl src_strideq, 1
  shl dstq, 1
  shl dst_strideq, 1
%else
  cmp r4d, 4
  je .w4
%endif
  cmp r4d, 8
  je .w8
  cmp r4d, 16
  je .w16
  cmp r4d, 32
  je .w32
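
; From here on r4d holds the row size in bytes, so in highbd each .wN case
; serves a block of pixel width N/2. Widths larger than the last explicit
; compare fall through to the widest loop of the active configuration.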
%if CONFIG_AV1 && CONFIG_EXT_PARTITION
  cmp r4d, 64
  je .w64
%ifidn %2, highbd
  cmp r4d, 128
  je .w128

; 256-byte rows (128 highbd pixels), handled as four 64-byte quads.
.w256:
  mov r4d, dword hm
.loop256:
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+32]
  movu m3, [srcq+48]
%ifidn %1, avg
  pavg m0, [dstq]
  pavg m1, [dstq+16]
  pavg m2, [dstq+32]
  pavg m3, [dstq+48]
%endif
  mova [dstq ], m0
  mova [dstq+16], m1
  mova [dstq+32], m2
  mova [dstq+48], m3
  movu m0, [srcq+64]
  movu m1, [srcq+80]
  movu m2, [srcq+96]
  movu m3, [srcq+112]
%ifidn %1, avg
  pavg m0, [dstq+64]
  pavg m1, [dstq+80]
  pavg m2, [dstq+96]
  pavg m3, [dstq+112]
%endif
  mova [dstq+64], m0
  mova [dstq+80], m1
  mova [dstq+96], m2
  mova [dstq+112], m3
  movu m0, [srcq+128]
  movu m1, [srcq+128+16]
  movu m2, [srcq+128+32]
  movu m3, [srcq+128+48]
%ifidn %1, avg
  pavg m0, [dstq+128]
  pavg m1, [dstq+128+16]
  pavg m2, [dstq+128+32]
  pavg m3, [dstq+128+48]
%endif
  mova [dstq+128 ], m0
  mova [dstq+128+16], m1
  mova [dstq+128+32], m2
  mova [dstq+128+48], m3
  movu m0, [srcq+128+64]
  movu m1, [srcq+128+80]
  movu m2, [srcq+128+96]
  movu m3, [srcq+128+112]
  ; Advance srcq now; the final quad is already loaded into registers.
  add srcq, src_strideq
%ifidn %1, avg
  pavg m0, [dstq+128+64]
  pavg m1, [dstq+128+80]
  pavg m2, [dstq+128+96]
  pavg m3, [dstq+128+112]
%endif
  mova [dstq+128+64], m0
  mova [dstq+128+80], m1
  mova [dstq+128+96], m2
  mova [dstq+128+112], m3
  add dstq, dst_strideq
  sub r4d, 1
  jnz .loop256
  RET
%endif

; 128-byte rows: 128 pixels at 8-bit, or 64 highbd pixels.
.w128:
  mov r4d, dword hm
.loop128:
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+32]
  movu m3, [srcq+48]
%ifidn %1, avg
  pavg m0, [dstq]
  pavg m1, [dstq+16]
  pavg m2, [dstq+32]
  pavg m3, [dstq+48]
%endif
  mova [dstq ], m0
  mova [dstq+16], m1
  mova [dstq+32], m2
  mova [dstq+48], m3
  movu m0, [srcq+64]
  movu m1, [srcq+80]
  movu m2, [srcq+96]
  movu m3, [srcq+112]
  add srcq, src_strideq
%ifidn %1, avg
  pavg m0, [dstq+64]
  pavg m1, [dstq+80]
  pavg m2, [dstq+96]
  pavg m3, [dstq+112]
%endif
  mova [dstq+64], m0
  mova [dstq+80], m1
  mova [dstq+96], m2
  mova [dstq+112], m3
  add dstq, dst_strideq
  sub r4d, 1
  jnz .loop128
  RET

%else ; CONFIG_AV1 && CONFIG_EXT_PARTITION

%ifidn %2, highbd
  cmp r4d, 64
  je .w64

  ; Fall-through: highbd w == 64, i.e. 128-byte rows.
  mov r4d, dword hm
.loop128:
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+32]
  movu m3, [srcq+48]
%ifidn %1, avg
  pavg m0, [dstq]
  pavg m1, [dstq+16]
  pavg m2, [dstq+32]
  pavg m3, [dstq+48]
%endif
  mova [dstq ], m0
  mova [dstq+16], m1
  mova [dstq+32], m2
  mova [dstq+48], m3
  movu m0, [srcq+64]
  movu m1, [srcq+80]
  movu m2, [srcq+96]
  movu m3, [srcq+112]
  add srcq, src_strideq
%ifidn %1, avg
  pavg m0, [dstq+64]
  pavg m1, [dstq+80]
  pavg m2, [dstq+96]
  pavg m3, [dstq+112]
%endif
  mova [dstq+64], m0
  mova [dstq+80], m1
  mova [dstq+96], m2
  mova [dstq+112], m3
  add dstq, dst_strideq
  sub r4d, 1
  jnz .loop128
  RET
%endif
%endif ; CONFIG_AV1 && CONFIG_EXT_PARTITION

; 64-byte rows: 64 pixels at 8-bit, or 32 highbd pixels; one row per pass.
.w64:
  mov r4d, dword hm
.loop64:
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+32]
  movu m3, [srcq+48]
  add srcq, src_strideq
%ifidn %1, avg
  pavg m0, [dstq]
  pavg m1, [dstq+16]
  pavg m2, [dstq+32]
  pavg m3, [dstq+48]
%endif
  mova [dstq ], m0
  mova [dstq+16], m1
  mova [dstq+32], m2
  mova [dstq+48], m3
  add dstq, dst_strideq
  sub r4d, 1
  jnz .loop64
  RET

; 32-byte rows, two rows per pass.
.w32:
  mov r4d, dword hm
.loop32:
  movu m0, [srcq]
  movu m1, [srcq+16]
  movu m2, [srcq+src_strideq]
  movu m3, [srcq+src_strideq+16]
  lea srcq, [srcq+src_strideq*2]
%ifidn %1, avg
  pavg m0, [dstq]
  pavg m1, [dstq +16]
  pavg m2, [dstq+dst_strideq]
  pavg m3, [dstq+dst_strideq+16]
%endif
  mova [dstq ], m0
  mova [dstq +16], m1
  mova [dstq+dst_strideq ], m2
  mova [dstq+dst_strideq+16], m3
  lea dstq, [dstq+dst_strideq*2]
  sub r4d, 2
  jnz .loop32
  RET

; 16-byte rows, four rows per pass; r5q/r6q hold 3*stride for the fourth row.
.w16:
  mov r4d, dword hm
  lea r5q, [src_strideq*3]
  lea r6q, [dst_strideq*3]
.loop16:
  movu m0, [srcq]
  movu m1, [srcq+src_strideq]
  movu m2, [srcq+src_strideq*2]
  movu m3, [srcq+r5q]
  lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  pavg m0, [dstq]
  pavg m1, [dstq+dst_strideq]
  pavg m2, [dstq+dst_strideq*2]
  pavg m3, [dstq+r6q]
%endif
  mova [dstq ], m0
  mova [dstq+dst_strideq ], m1
  mova [dstq+dst_strideq*2], m2
  mova [dstq+r6q ], m3
  lea dstq, [dstq+dst_strideq*4]
  sub r4d, 4
  jnz .loop16
  RET

; 8-byte rows, four rows per pass; movh moves the low quadword. For avg the
; dst rows go through the m4-m7 scratch registers, since pavg cannot take an
; 8-byte memory operand.
.w8:
  mov r4d, dword hm
  lea r5q, [src_strideq*3]
  lea r6q, [dst_strideq*3]
.loop8:
  movh m0, [srcq]
  movh m1, [srcq+src_strideq]
  movh m2, [srcq+src_strideq*2]
  movh m3, [srcq+r5q]
  lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  movh m4, [dstq]
  movh m5, [dstq+dst_strideq]
  movh m6, [dstq+dst_strideq*2]
  movh m7, [dstq+r6q]
  pavg m0, m4
  pavg m1, m5
  pavg m2, m6
  pavg m3, m7
%endif
  movh [dstq ], m0
  movh [dstq+dst_strideq ], m1
  movh [dstq+dst_strideq*2], m2
  movh [dstq+r6q ], m3
  lea dstq, [dstq+dst_strideq*4]
  sub r4d, 4
  jnz .loop8
  RET

%ifnidn %2, highbd
; 4-byte rows via movd, four rows per pass. 8-bit only: the highbd w == 4
; case is dispatched to .w8 above.
.w4:
  mov r4d, dword hm
  lea r5q, [src_strideq*3]
  lea r6q, [dst_strideq*3]
.loop4:
  movd m0, [srcq]
  movd m1, [srcq+src_strideq]
  movd m2, [srcq+src_strideq*2]
  movd m3, [srcq+r5q]
  lea srcq, [srcq+src_strideq*4]
%ifidn %1, avg
  movd m4, [dstq]
  movd m5, [dstq+dst_strideq]
  movd m6, [dstq+dst_strideq*2]
  movd m7, [dstq+r6q]
  pavg m0, m4
  pavg m1, m5
  pavg m2, m6
  pavg m3, m7
%endif
  movd [dstq ], m0
  movd [dstq+dst_strideq ], m1
  movd [dstq+dst_strideq*2], m2
  movd [dstq+r6q ], m3
  lea dstq, [dstq+dst_strideq*4]
  sub r4d, 4
  jnz .loop4
  RET
%endif
%endmacro

INIT_XMM sse2
convolve_fn copy
convolve_fn avg
%if CONFIG_AOM_HIGHBITDEPTH
convolve_fn copy, highbd
convolve_fn avg, highbd
%endif
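
; With x86inc's usual name mangling these instantiations should emit symbols
; along the lines of aom_convolve_copy_sse2, aom_convolve_avg_sse2, and (when
; CONFIG_AOM_HIGHBITDEPTH is set) aom_highbd_convolve_copy_sse2 and
; aom_highbd_convolve_avg_sse2; the exact prefix depends on how x86inc is
; configured for this tree.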