1490 строки
43 KiB
NASM
1490 строки
43 KiB
NASM
;
|
|
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
|
|
;
|
|
; This source code is subject to the terms of the BSD 2 Clause License and
|
|
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
|
|
; was not distributed with this source code in the LICENSE file, you can
|
|
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
|
|
; Media Patent License 1.0 was not distributed with this source code in the
|
|
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
|
|
;
|
|
|
|
;
|
|
|
|
%include "third_party/x86inc/x86inc.asm"
|
|
|
|
SECTION_RODATA
|
|
; Eight words of 8: the rounding term added before the ">> 4" that ends every
; bilinear filter step in this file (used as filter_rnd).
pw_8:                 dw 8, 8, 8, 8, 8, 8, 8, 8
|
|
; Word-sized bilinear taps for the SSE2 (pmullw) code path.
; Entry i (i = 0..7) occupies 32 bytes: eight copies of the first tap
; (16 - 2*i) followed by eight copies of the second tap (2*i). The code indexes
; this table with (offset << 5), see filter_idx_shift in SUBPEL_VARIANCE.
bilin_filter_m_sse2:  times 8 dw 16               ; offset 0: taps (16, 0)
                      times 8 dw 0
                      times 8 dw 14               ; offset 1: taps (14, 2)
                      times 8 dw 2
                      times 8 dw 12               ; offset 2: taps (12, 4)
                      times 8 dw 4
                      times 8 dw 10               ; offset 3: taps (10, 6)
                      times 8 dw 6
                      times 8 dw 8                ; offset 4: taps (8, 8)
                      times 8 dw 8
                      times 8 dw 6                ; offset 5: taps (6, 10)
                      times 8 dw 10
                      times 8 dw 4                ; offset 6: taps (4, 12)
                      times 8 dw 12
                      times 8 dw 2                ; offset 7: taps (2, 14)
                      times 8 dw 14
|
|
|
|
; Byte-sized bilinear taps for the SSSE3 (pmaddubsw) code path.
; Entry i (i = 0..7) occupies 16 bytes: eight interleaved (16 - 2*i, 2*i) byte
; pairs, matching the byte interleave of the two input pixels. The code indexes
; this table with (offset << 4), see filter_idx_shift in SUBPEL_VARIANCE.
bilin_filter_m_ssse3: times 8 db 16, 0            ; offset 0
                      times 8 db 14, 2            ; offset 1
                      times 8 db 12, 4            ; offset 2
                      times 8 db 10, 6            ; offset 3
                      times 8 db 8, 8             ; offset 4
                      times 8 db 6, 10            ; offset 5
                      times 8 db 4, 12            ; offset 6
                      times 8 db 2, 14            ; offset 7
|
|
|
|
SECTION .text
|
|
|
|
; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
|
|
; int x_offset, int y_offset,
|
|
; const uint8_t *dst, ptrdiff_t dst_stride,
|
|
; int height, unsigned int *sse);
|
|
;
|
|
; This function returns the sum of errors (SE) and stores the sum of squared
; errors (SSE) through the sse pointer.
|
|
|
|
%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
; Accumulate two vectors of word-sized pixel differences.
;   %1/%2 : first  source / destination vector (words)
;   %3/%4 : second source / destination vector (words)
;   %5    : running sum of differences, kept as words
;   %6    : running sum of squared differences, kept as dwords
; %1 and %3 are destroyed: each first becomes the difference, then the
; pairwise-added squares of that difference.
  psubw      %3, %4                   ; diff2 = src2 - dst2
  psubw      %1, %2                   ; diff1 = src1 - dst1
  paddw      %5, %3                   ; sum += diff2
  pmaddwd    %3, %3                   ; diff2 squared, adjacent pairs added
  paddw      %5, %1                   ; sum += diff1
  pmaddwd    %1, %1                   ; diff1 squared, adjacent pairs added
  paddd      %6, %3                   ; sse += squares of diff2
  paddd      %6, %1                   ; sse += squares of diff1
%endmacro
|
|
|
|
%macro STORE_AND_RET 1
; Reduce the accumulators and return. %1 is the block width W.
; On entry: m6 = word sums, m7 = dword squared sums, m5 = all zero.
; Stores the total of m7 through the sse pointer and returns the
; sign-extended total of m6 in eax/rax. m3, m4 and m5 are clobbered.
%if %1 > 4
  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
  ; We have to sign-extend it before adding the words within the register
  ; and outputting to a dword.
  pcmpgtw    m5, m6                   ; mask for 0 > x
  movhlps    m3, m7                   ; upper two sse dwords
  punpcklwd  m4, m6, m5               ; low four sums, word->dword
  punpckhwd  m6, m5                   ; sign-extend m6 word->dword
  paddd      m7, m3                   ; sse: 4 dwords -> 2
  paddd      m6, m4                   ; sum: 8 dwords -> 4
  pshufd     m3, m7, 0x1              ; bring sse dword 1 down to lane 0
  movhlps    m4, m6                   ; upper two sum dwords
  paddd      m7, m3                   ; sse total in lane 0
  paddd      m6, m4                   ; sum: 4 dwords -> 2
  mov        r1, ssem                 ; r1 = unsigned int *sse
  pshufd     m4, m6, 0x1              ; bring sum dword 1 down to lane 0
  movd       [r1], m7                 ; store sse
  paddd      m6, m4                   ; sum total in lane 0
  movd       raxd, m6                 ; store sum as return value
%else ; 4xh
  pshuflw    m4, m6, 0xe              ; sum words 2,3 -> positions 0,1
  pshuflw    m3, m7, 0xe              ; sse dword 1 -> position 0
  paddw      m6, m4                   ; sum: 4 words -> 2
  paddd      m7, m3                   ; sse total in lane 0
  pcmpgtw    m5, m6                   ; mask for 0 > x
  mov        r1, ssem                 ; r1 = unsigned int *sse
  punpcklwd  m6, m5                   ; sign-extend m6 word->dword
  movd       [r1], m7                 ; store sse
  pshuflw    m4, m6, 0xe              ; sum dword 1 -> position 0
  paddd      m6, m4                   ; sum total in lane 0
  movd       raxd, m6                 ; store sum as return value
%endif
  RET
%endmacro
|
|
|
|
%macro INC_SRC_BY_SRC_STRIDE 0
; Advance srcq by one source row.
; On x86-32 PIC builds the stride is read from its stack slot, because the
; bilinear x/y path of SUBPEL_VARIANCE reuses the src_stride register as a
; temporary (tempq) in that configuration.
%if ARCH_X86=1 && CONFIG_PIC=1
  add        srcq, src_stridemp
%else
  add        srcq, src_strideq
%endif
%endmacro
|
|
|
|
%macro SUBPEL_VARIANCE 1-2 0 ; W
; Emits one sub-pixel variance function.
;   %1 = block width W (asserted <= 16 below)
;   %2 = 1 to emit the "avg" variant that also averages with a second
;        predictor (sec); defaults to 0.
; Register roles used by every path: m5 = zero, m6 = word sums,
; m7 = dword squared sums.

; Pick the tap table that matches the instruction set and the shift that
; converts a filter index into a byte offset into that table.
%if cpuflag(ssse3)
%define bilin_filter_m bilin_filter_m_ssse3
%define filter_idx_shift 4
%else
%define bilin_filter_m bilin_filter_m_sse2
%define filter_idx_shift 5
%endif
; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
; 11, not 13, if the registers are ordered correctly. May make a minor speed
; difference on Win64

%ifdef PIC ; 64bit PIC
  ; The table address is materialised at run time in the sse register
  ; (bilin_filter = sseq); STORE_AND_RET reloads the sse pointer from ssem.
%if %2 == 1 ; avg
cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
                                    x_offset, y_offset, \
                                    dst, dst_stride, \
                                    sec, sec_stride, height, sse
%define sec_str sec_strideq
%else
cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
                                y_offset, dst, dst_stride, height, sse
%endif
%define block_height heightd
%define bilin_filter sseq
%else
%if ARCH_X86=1 && CONFIG_PIC=1
  ; 32-bit PIC: the addresses of the tap table and of pw_8 are computed once
  ; and parked in the g_bilin_filter / g_pw_8 stack slots.
%if %2 == 1 ; avg
cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                    x_offset, y_offset, \
                                    dst, dst_stride, \
                                    sec, sec_stride, \
                                    height, sse, g_bilin_filter, g_pw_8
%define block_height dword heightm
%define sec_str sec_stridemp

  ; Store bilin_filter and pw_8 location in stack
%if GET_GOT_DEFINED == 1
  GET_GOT    eax
  add        esp, 4                   ; restore esp
%endif

  lea        ecx, [GLOBAL(bilin_filter_m)]
  mov        g_bilin_filterm, ecx

  lea        ecx, [GLOBAL(pw_8)]
  mov        g_pw_8m, ecx

  LOAD_IF_USED 0, 1                   ; load eax, ecx back
%else
cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
                                y_offset, dst, dst_stride, height, sse, \
                                g_bilin_filter, g_pw_8
%define block_height heightd

  ; Store bilin_filter and pw_8 location in stack
%if GET_GOT_DEFINED == 1
  GET_GOT    eax
  add        esp, 4                   ; restore esp
%endif

  lea        ecx, [GLOBAL(bilin_filter_m)]
  mov        g_bilin_filterm, ecx

  lea        ecx, [GLOBAL(pw_8)]
  mov        g_pw_8m, ecx

  LOAD_IF_USED 0, 1                   ; load eax, ecx back
%endif
%else
  ; Non-PIC: the table is addressed directly by its symbol.
%if %2 == 1 ; avg
cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
                                    7 + 2 * ARCH_X86_64, 13, src, src_stride, \
                                    x_offset, y_offset, \
                                    dst, dst_stride, \
                                    sec, sec_stride, \
                                    height, sse
%if ARCH_X86_64
%define block_height heightd
%define sec_str sec_strideq
%else
%define block_height dword heightm
%define sec_str sec_stridemp
%endif
%else
cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
                                y_offset, dst, dst_stride, height, sse
%define block_height heightd
%endif

%define bilin_filter bilin_filter_m
%endif
%endif

; movx loads one row of narrow blocks: 4 bytes when W == 4, otherwise movh.
%if %1 == 4
%define movx movd
%else
%define movx movh
%endif

  ASSERT     %1 <= 16                 ; m6 overflows if w > 16
  pxor       m6, m6                   ; sum
  pxor       m7, m7                   ; sse
  ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
  ; could perhaps use it for something more productive then
  pxor       m5, m5                   ; dedicated zero register
%if %1 < 16
  ; Narrow blocks handle two rows per loop iteration, so halve the row count
  ; and (avg only) double the stride applied to the second predictor.
  sar        block_height, 1
%if %2 == 1 ; avg
  shl        sec_str, 1
%endif
%endif

  ; Dispatch on the two offsets: 0, 4 (half-pel) or any other value (bilinear).
  ; FIXME(rbultje) replace by jumptable?
  test       x_offsetd, x_offsetd
  jnz        .x_nonzero
  ; x_offset == 0
  test       y_offsetd, y_offsetd
  jnz        .x_zero_y_nonzero
|
|
|
|
; x_offset == 0 && y_offset == 0
; No filtering: source pixels are compared with dst directly (after the
; optional average with sec).
.x_zero_y_zero_loop:
%if %1 == 16
  ; One 16-pixel row per iteration; m0/m2 = src low/high, m1/m3 = dst low/high.
  movu       m0, [srcq]
  mova       m1, [dstq]
%if %2 == 1 ; avg
  pavgb      m0, [secq]
  punpckhbw  m3, m1, m5
  punpcklbw  m1, m5
%endif
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5

%if %2 == 0 ; !avg
  punpckhbw  m3, m1, m5
  punpcklbw  m1, m5
%endif
  SUM_SSE    m0, m1, m2, m3, m6, m7

  add        srcq, src_strideq
  add        dstq, dst_strideq
%else ; %1 < 16
  ; Two rows per iteration; m0/m2 = src row 0/1, m1/m3 = dst row 0/1.
  movx       m0, [srcq]
%if %2 == 1 ; avg
  ; avg: pack both source rows into m0 so one pavgb covers them.
%if %1 > 4
  movhps     m0, [srcq+src_strideq]
%else ; 4xh
  movx       m1, [srcq+src_strideq]
  punpckldq  m0, m1
%endif
%else ; !avg
  movx       m2, [srcq+src_strideq]
%endif

  movx       m1, [dstq]
  movx       m3, [dstq+dst_strideq]

%if %2 == 1 ; avg
  ; Both rows of sec are read as one contiguous chunk starting at secq.
%if %1 > 4
  pavgb      m0, [secq]
%else
  movh       m2, [secq]
  pavgb      m0, m2
%endif
  punpcklbw  m3, m5
  punpcklbw  m1, m5
%if %1 > 4
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%else ; 4xh
  punpcklbw  m0, m5
  movhlps    m2, m0                   ; second row's words -> low half of m2
%endif
%else ; !avg
  punpcklbw  m0, m5
  punpcklbw  m2, m5
  punpcklbw  m3, m5
  punpcklbw  m1, m5
%endif
  SUM_SSE    m0, m1, m2, m3, m6, m7

  lea        srcq, [srcq+src_strideq*2]
  lea        dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add        secq, sec_str
%endif
  dec        block_height
  jg         .x_zero_y_zero_loop
  STORE_AND_RET %1
|
|
|
|
.x_zero_y_nonzero:
  cmp        y_offsetd, 4
  jne        .x_zero_y_nonhalf

  ; x_offset == 0 && y_offset == 0.5
  ; Vertical half-pel: each output row is pavgb of two consecutive src rows.
.x_zero_y_half_loop:
%if %1 == 16
  movu       m0, [srcq]
  movu       m4, [srcq+src_strideq]
  mova       m1, [dstq]
  pavgb      m0, m4                   ; average of this row and the next
  punpckhbw  m3, m1, m5
%if %2 == 1 ; avg
  pavgb      m0, [secq]
%endif
  punpcklbw  m1, m5
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
  SUM_SSE    m0, m1, m2, m3, m6, m7

  add        srcq, src_strideq
  add        dstq, dst_strideq
%else ; %1 < 16
  movx       m0, [srcq]
  movx       m2, [srcq+src_strideq]
%if %2 == 1 ; avg
  ; Build m0 = (row0, row1) and m2 = (row1, row2) so a single pavgb yields
  ; both vertically averaged rows side by side.
%if %1 > 4
  movhps     m2, [srcq+src_strideq*2]
%else ; 4xh
  movx       m1, [srcq+src_strideq*2]
  punpckldq  m2, m1
%endif
  movx       m1, [dstq]
%if %1 > 4
  movlhps    m0, m2
%else ; 4xh
  punpckldq  m0, m2
%endif
  movx       m3, [dstq+dst_strideq]
  pavgb      m0, m2
  punpcklbw  m1, m5
%if %1 > 4
  pavgb      m0, [secq]
  punpcklbw  m3, m5
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%else ; 4xh
  movh       m4, [secq]
  pavgb      m0, m4
  punpcklbw  m3, m5
  punpcklbw  m0, m5
  movhlps    m2, m0                   ; second row's words -> low half of m2
%endif
%else ; !avg
  movx       m4, [srcq+src_strideq*2]
  movx       m1, [dstq]
  pavgb      m0, m2                   ; row 0 output
  movx       m3, [dstq+dst_strideq]
  pavgb      m2, m4                   ; row 1 output
  punpcklbw  m0, m5
  punpcklbw  m2, m5
  punpcklbw  m3, m5
  punpcklbw  m1, m5
%endif
  SUM_SSE    m0, m1, m2, m3, m6, m7

  lea        srcq, [srcq+src_strideq*2]
  lea        dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add        secq, sec_str
%endif
  dec        block_height
  jg         .x_zero_y_half_loop
  STORE_AND_RET %1
|
|
|
|
.x_zero_y_nonhalf:
  ; x_offset == 0 && y_offset == bilin interpolation
  ; Vertical bilinear filter: out = (a*row + b*next_row + 8) >> 4, with the
  ; taps (a, b) taken from the table entry selected by y_offset.
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl        y_offsetd, filter_idx_shift ; filter index -> byte offset in table
%if ARCH_X86_64 && %1 > 4
  ; Taps and rounding constant live in registers m8-m10.
  mova       m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova       m9, [bilin_filter+y_offsetq+16]
%endif
  mova       m10, [pw_8]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32 or mmx
  ; Taps and rounding constant are used as memory operands.
%if ARCH_X86=1 && CONFIG_PIC=1
  ; x_offset == 0, reuse x_offset reg
%define tempq x_offsetq
  add        y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov        tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add        y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

.x_zero_y_other_loop:
%if %1 == 16
  movu       m0, [srcq]
  movu       m4, [srcq+src_strideq]
  mova       m1, [dstq]
%if cpuflag(ssse3)
  ; Interleave the two rows bytewise and let pmaddubsw apply both taps.
  punpckhbw  m2, m0, m4
  punpcklbw  m0, m4
  pmaddubsw  m2, filter_y_a
  pmaddubsw  m0, filter_y_a
  paddw      m2, filter_rnd
  paddw      m0, filter_rnd
%else
  punpckhbw  m2, m0, m5
  punpckhbw  m3, m4, m5
  punpcklbw  m0, m5
  punpcklbw  m4, m5
  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
  ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
  ; slightly faster because of pmullw latency. It would also cut our rodata
  ; tables in half for this function, and save 1-2 registers on x86-64.
  pmullw     m2, filter_y_a
  pmullw     m3, filter_y_b
  paddw      m2, filter_rnd
  pmullw     m0, filter_y_a
  pmullw     m4, filter_y_b
  paddw      m0, filter_rnd
  paddw      m2, m3
  paddw      m0, m4
%endif
  psraw      m2, 4
  psraw      m0, 4
%if %2 == 1 ; avg
  ; Pack back to bytes for the average with sec, then widen again.
  ; FIXME(rbultje) pipeline
  packuswb   m0, m2
  pavgb      m0, [secq]
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%endif
  punpckhbw  m3, m1, m5
  punpcklbw  m1, m5
  SUM_SSE    m0, m1, m2, m3, m6, m7

  add        srcq, src_strideq
  add        dstq, dst_strideq
%else ; %1 < 16
  ; Two output rows per iteration from three consecutive source rows.
  movx       m0, [srcq]
  movx       m2, [srcq+src_strideq]
  movx       m4, [srcq+src_strideq*2]
  movx       m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
  movx       m1, [dstq]
  punpcklbw  m0, m2
  punpcklbw  m2, m4
  pmaddubsw  m0, filter_y_a
  pmaddubsw  m2, filter_y_a
  punpcklbw  m3, m5
  paddw      m2, filter_rnd
  paddw      m0, filter_rnd
%else
  punpcklbw  m0, m5
  punpcklbw  m2, m5
  punpcklbw  m4, m5
  pmullw     m0, filter_y_a
  pmullw     m1, m2, filter_y_b
  punpcklbw  m3, m5
  paddw      m0, filter_rnd
  pmullw     m2, filter_y_a
  pmullw     m4, filter_y_b
  paddw      m0, m1
  paddw      m2, filter_rnd
  movx       m1, [dstq]
  paddw      m2, m4
%endif
  psraw      m0, 4
  psraw      m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
%if %1 == 4
  movlhps    m0, m2
%endif
  packuswb   m0, m2
%if %1 > 4
  pavgb      m0, [secq]
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%else ; 4xh
  movh       m2, [secq]
  pavgb      m0, m2
  punpcklbw  m0, m5
  movhlps    m2, m0
%endif
%endif
  punpcklbw  m1, m5
  SUM_SSE    m0, m1, m2, m3, m6, m7

  lea        srcq, [srcq+src_strideq*2]
  lea        dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add        secq, sec_str
%endif
  dec        block_height
  jg         .x_zero_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET %1
|
|
|
|
.x_nonzero:
  cmp        x_offsetd, 4
  jne        .x_nonhalf
  ; x_offset == 0.5
  test       y_offsetd, y_offsetd
  jnz        .x_half_y_nonzero

  ; x_offset == 0.5 && y_offset == 0
  ; Horizontal half-pel: each output pixel is pavgb of src[x] and src[x+1].
.x_half_y_zero_loop:
%if %1 == 16
  movu       m0, [srcq]
  movu       m4, [srcq+1]
  mova       m1, [dstq]
  pavgb      m0, m4
  punpckhbw  m3, m1, m5
%if %2 == 1 ; avg
  pavgb      m0, [secq]
%endif
  punpcklbw  m1, m5
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
  SUM_SSE    m0, m1, m2, m3, m6, m7

  add        srcq, src_strideq
  add        dstq, dst_strideq
%else ; %1 < 16
  movx       m0, [srcq]
  movx       m4, [srcq+1]
%if %2 == 1 ; avg
  ; Pack both rows (and both shifted rows) so one pavgb filters two rows.
%if %1 > 4
  movhps     m0, [srcq+src_strideq]
  movhps     m4, [srcq+src_strideq+1]
%else ; 4xh
  movx       m1, [srcq+src_strideq]
  punpckldq  m0, m1
  movx       m2, [srcq+src_strideq+1]
  punpckldq  m4, m2
%endif
  movx       m1, [dstq]
  movx       m3, [dstq+dst_strideq]
  pavgb      m0, m4
  punpcklbw  m3, m5
%if %1 > 4
  pavgb      m0, [secq]
  punpcklbw  m1, m5
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%else ; 4xh
  movh       m2, [secq]
  pavgb      m0, m2
  punpcklbw  m1, m5
  punpcklbw  m0, m5
  movhlps    m2, m0                   ; second row's words -> low half of m2
%endif
%else ; !avg
  movx       m2, [srcq+src_strideq]
  movx       m1, [dstq]
  pavgb      m0, m4                   ; row 0 output
  movx       m4, [srcq+src_strideq+1]
  movx       m3, [dstq+dst_strideq]
  pavgb      m2, m4                   ; row 1 output
  punpcklbw  m0, m5
  punpcklbw  m2, m5
  punpcklbw  m3, m5
  punpcklbw  m1, m5
%endif
  SUM_SSE    m0, m1, m2, m3, m6, m7

  lea        srcq, [srcq+src_strideq*2]
  lea        dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add        secq, sec_str
%endif
  dec        block_height
  jg         .x_half_y_zero_loop
  STORE_AND_RET %1
|
|
|
|
.x_half_y_nonzero:
  cmp        y_offsetd, 4
  jne        .x_half_y_nonhalf

  ; x_offset == 0.5 && y_offset == 0.5
  ; Half-pel in both directions: rows are first averaged horizontally, then
  ; consecutive filtered rows are averaged vertically. The last horizontally
  ; filtered row is carried across iterations in m0 (refilled from m4).
%if %1 == 16
  ; Prime m0 with the horizontally filtered first row.
  movu       m0, [srcq]
  movu       m3, [srcq+1]
  add        srcq, src_strideq
  pavgb      m0, m3
.x_half_y_half_loop:
  movu       m4, [srcq]
  movu       m3, [srcq+1]
  mova       m1, [dstq]
  pavgb      m4, m3                   ; horizontal filter of the next row
  punpckhbw  m3, m1, m5
  pavgb      m0, m4                   ; vertical average with previous row
%if %2 == 1 ; avg
  punpcklbw  m1, m5
  pavgb      m0, [secq]
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%else
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
  punpcklbw  m1, m5
%endif
  SUM_SSE    m0, m1, m2, m3, m6, m7
  mova       m0, m4                   ; carry filtered row to next iteration

  add        srcq, src_strideq
  add        dstq, dst_strideq
%else ; %1 < 16
  ; Prime m0 with the horizontally filtered first row.
  movx       m0, [srcq]
  movx       m3, [srcq+1]
  add        srcq, src_strideq
  pavgb      m0, m3
.x_half_y_half_loop:
  movx       m2, [srcq]
  movx       m3, [srcq+1]
%if %2 == 1 ; avg
%if %1 > 4
  movhps     m2, [srcq+src_strideq]
  movhps     m3, [srcq+src_strideq+1]
%else
  movx       m1, [srcq+src_strideq]
  punpckldq  m2, m1
  movx       m1, [srcq+src_strideq+1]
  punpckldq  m3, m1
%endif
  pavgb      m2, m3                   ; horizontal filter of the next two rows
  ; m0 = (previous row, next row 1); m4 keeps next row 2 as the carry.
%if %1 > 4
  movlhps    m0, m2
  movhlps    m4, m2
%else ; 4xh
  punpckldq  m0, m2
  pshuflw    m4, m2, 0xe
%endif
  movx       m1, [dstq]
  pavgb      m0, m2                   ; vertical average for both rows
  movx       m3, [dstq+dst_strideq]
%if %1 > 4
  pavgb      m0, [secq]
%else
  movh       m2, [secq]
  pavgb      m0, m2
%endif
  punpcklbw  m3, m5
  punpcklbw  m1, m5
%if %1 > 4
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%else
  punpcklbw  m0, m5
  movhlps    m2, m0
%endif
%else ; !avg
  movx       m4, [srcq+src_strideq]
  movx       m1, [srcq+src_strideq+1]
  pavgb      m2, m3                   ; horizontal filter, next row 1
  pavgb      m4, m1                   ; horizontal filter, next row 2
  pavgb      m0, m2                   ; vertical average -> output row 0
  pavgb      m2, m4                   ; vertical average -> output row 1
  movx       m1, [dstq]
  movx       m3, [dstq+dst_strideq]
  punpcklbw  m0, m5
  punpcklbw  m2, m5
  punpcklbw  m3, m5
  punpcklbw  m1, m5
%endif
  SUM_SSE    m0, m1, m2, m3, m6, m7
  mova       m0, m4                   ; carry filtered row to next iteration

  lea        srcq, [srcq+src_strideq*2]
  lea        dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add        secq, sec_str
%endif
  dec        block_height
  jg         .x_half_y_half_loop
  STORE_AND_RET %1
|
|
|
|
.x_half_y_nonhalf:
  ; x_offset == 0.5 && y_offset == bilin interpolation
  ; Rows are averaged horizontally with pavgb, then consecutive filtered rows
  ; are combined with the vertical taps: (a*prev + b*next + 8) >> 4.
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl        y_offsetd, filter_idx_shift ; filter index -> byte offset in table
%if ARCH_X86_64 && %1 > 4
  mova       m8, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova       m9, [bilin_filter+y_offsetq+16]
%endif
  mova       m10, [pw_8]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ;x86_32
%if ARCH_X86=1 && CONFIG_PIC=1
  ; x_offset == 0.5. We can reuse x_offset reg
%define tempq x_offsetq
  add        y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov        tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add        y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

%if %1 == 16
  ; Prime m0 with the horizontally filtered first row (bytes).
  movu       m0, [srcq]
  movu       m3, [srcq+1]
  add        srcq, src_strideq
  pavgb      m0, m3
.x_half_y_other_loop:
  movu       m4, [srcq]
  movu       m2, [srcq+1]
  mova       m1, [dstq]
  pavgb      m4, m2                   ; horizontal filter of the next row
%if cpuflag(ssse3)
  punpckhbw  m2, m0, m4
  punpcklbw  m0, m4
  pmaddubsw  m2, filter_y_a
  pmaddubsw  m0, filter_y_a
  paddw      m2, filter_rnd
  paddw      m0, filter_rnd
  psraw      m2, 4
%else
  punpckhbw  m2, m0, m5
  punpckhbw  m3, m4, m5
  pmullw     m2, filter_y_a
  pmullw     m3, filter_y_b
  paddw      m2, filter_rnd
  punpcklbw  m0, m5
  paddw      m2, m3
  punpcklbw  m3, m4, m5
  pmullw     m0, filter_y_a
  pmullw     m3, filter_y_b
  paddw      m0, filter_rnd
  psraw      m2, 4
  paddw      m0, m3
%endif
  punpckhbw  m3, m1, m5
  psraw      m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb   m0, m2
  pavgb      m0, [secq]
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%endif
  punpcklbw  m1, m5
  SUM_SSE    m0, m1, m2, m3, m6, m7
  mova       m0, m4                   ; carry filtered row to next iteration

  add        srcq, src_strideq
  add        dstq, dst_strideq
%else ; %1 < 16
  ; Prime m0 with the horizontally filtered first row. The SSE2 path keeps
  ; the carried row as words, the SSSE3 path keeps it as bytes.
  movx       m0, [srcq]
  movx       m3, [srcq+1]
  add        srcq, src_strideq
  pavgb      m0, m3
%if notcpuflag(ssse3)
  punpcklbw  m0, m5
%endif
.x_half_y_other_loop:
  movx       m2, [srcq]
  movx       m1, [srcq+1]
  movx       m4, [srcq+src_strideq]
  movx       m3, [srcq+src_strideq+1]
  pavgb      m2, m1                   ; horizontal filter, next row 1
  pavgb      m4, m3                   ; horizontal filter, next row 2
  movx       m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
  movx       m1, [dstq]
  punpcklbw  m0, m2
  punpcklbw  m2, m4
  pmaddubsw  m0, filter_y_a
  pmaddubsw  m2, filter_y_a
  punpcklbw  m3, m5
  paddw      m0, filter_rnd
  paddw      m2, filter_rnd
%else
  punpcklbw  m2, m5
  punpcklbw  m4, m5
  pmullw     m0, filter_y_a
  pmullw     m1, m2, filter_y_b
  punpcklbw  m3, m5
  paddw      m0, filter_rnd
  pmullw     m2, filter_y_a
  paddw      m0, m1
  pmullw     m1, m4, filter_y_b
  paddw      m2, filter_rnd
  paddw      m2, m1
  movx       m1, [dstq]
%endif
  psraw      m0, 4
  psraw      m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
%if %1 == 4
  movlhps    m0, m2
%endif
  packuswb   m0, m2
%if %1 > 4
  pavgb      m0, [secq]
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%else
  movh       m2, [secq]
  pavgb      m0, m2
  punpcklbw  m0, m5
  movhlps    m2, m0
%endif
%endif
  punpcklbw  m1, m5
  SUM_SSE    m0, m1, m2, m3, m6, m7
  mova       m0, m4                   ; carry filtered row to next iteration

  lea        srcq, [srcq+src_strideq*2]
  lea        dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add        secq, sec_str
%endif
  dec        block_height
  jg         .x_half_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET %1
|
|
|
|
.x_nonhalf:
  test       y_offsetd, y_offsetd
  jnz        .x_nonhalf_y_nonzero

  ; x_offset == bilin interpolation && y_offset == 0
  ; Horizontal bilinear filter: out = (a*src[x] + b*src[x+1] + 8) >> 4, with
  ; the taps (a, b) taken from the table entry selected by x_offset.
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl        x_offsetd, filter_idx_shift ; filter index -> byte offset in table
%if ARCH_X86_64 && %1 > 4
  mova       m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova       m9, [bilin_filter+x_offsetq+16]
%endif
  mova       m10, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
  ; y_offset == 0. We can reuse y_offset reg.
%define tempq y_offsetq
  add        x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov        tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add        x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

.x_other_y_zero_loop:
%if %1 == 16
  movu       m0, [srcq]
  movu       m4, [srcq+1]
  mova       m1, [dstq]
%if cpuflag(ssse3)
  ; Interleave src[x] and src[x+1] bytewise; pmaddubsw applies both taps.
  punpckhbw  m2, m0, m4
  punpcklbw  m0, m4
  pmaddubsw  m2, filter_x_a
  pmaddubsw  m0, filter_x_a
  paddw      m2, filter_rnd
  paddw      m0, filter_rnd
%else
  punpckhbw  m2, m0, m5
  punpckhbw  m3, m4, m5
  punpcklbw  m0, m5
  punpcklbw  m4, m5
  pmullw     m2, filter_x_a
  pmullw     m3, filter_x_b
  paddw      m2, filter_rnd
  pmullw     m0, filter_x_a
  pmullw     m4, filter_x_b
  paddw      m0, filter_rnd
  paddw      m2, m3
  paddw      m0, m4
%endif
  psraw      m2, 4
  psraw      m0, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb   m0, m2
  pavgb      m0, [secq]
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%endif
  punpckhbw  m3, m1, m5
  punpcklbw  m1, m5
  SUM_SSE    m0, m1, m2, m3, m6, m7

  add        srcq, src_strideq
  add        dstq, dst_strideq
%else ; %1 < 16
  ; Two rows per iteration: m0/m1 = row 0 and its +1 shift, m2/m4 = row 1.
  movx       m0, [srcq]
  movx       m1, [srcq+1]
  movx       m2, [srcq+src_strideq]
  movx       m4, [srcq+src_strideq+1]
  movx       m3, [dstq+dst_strideq]
%if cpuflag(ssse3)
  punpcklbw  m0, m1
  movx       m1, [dstq]
  punpcklbw  m2, m4
  pmaddubsw  m0, filter_x_a
  pmaddubsw  m2, filter_x_a
  punpcklbw  m3, m5
  paddw      m0, filter_rnd
  paddw      m2, filter_rnd
%else
  punpcklbw  m0, m5
  punpcklbw  m1, m5
  punpcklbw  m2, m5
  punpcklbw  m4, m5
  pmullw     m0, filter_x_a
  pmullw     m1, filter_x_b
  punpcklbw  m3, m5
  paddw      m0, filter_rnd
  pmullw     m2, filter_x_a
  pmullw     m4, filter_x_b
  paddw      m0, m1
  paddw      m2, filter_rnd
  movx       m1, [dstq]
  paddw      m2, m4
%endif
  psraw      m0, 4
  psraw      m2, 4
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
%if %1 == 4
  movlhps    m0, m2
%endif
  packuswb   m0, m2
%if %1 > 4
  pavgb      m0, [secq]
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%else
  movh       m2, [secq]
  pavgb      m0, m2
  punpcklbw  m0, m5
  movhlps    m2, m0
%endif
%endif
  punpcklbw  m1, m5
  SUM_SSE    m0, m1, m2, m3, m6, m7

  lea        srcq, [srcq+src_strideq*2]
  lea        dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add        secq, sec_str
%endif
  dec        block_height
  jg         .x_other_y_zero_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET %1
|
|
|
|
.x_nonhalf_y_nonzero:
  cmp        y_offsetd, 4
  jne        .x_nonhalf_y_nonhalf

  ; x_offset == bilin interpolation && y_offset == 0.5
  ; Rows are filtered horizontally with the x taps, then consecutive filtered
  ; rows are averaged. The last filtered row is carried in m0 (from m4).
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl        x_offsetd, filter_idx_shift ; filter index -> byte offset in table
%if ARCH_X86_64 && %1 > 4
  mova       m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova       m9, [bilin_filter+x_offsetq+16]
%endif
  mova       m10, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
  ; y_offset == 0.5. We can reuse y_offset reg.
%define tempq y_offsetq
  add        x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov        tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add        x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

%if %1 == 16
  ; Prime m0 with the horizontally filtered first row, packed to bytes.
  movu       m0, [srcq]
  movu       m1, [srcq+1]
%if cpuflag(ssse3)
  punpckhbw  m2, m0, m1
  punpcklbw  m0, m1
  pmaddubsw  m2, filter_x_a
  pmaddubsw  m0, filter_x_a
  paddw      m2, filter_rnd
  paddw      m0, filter_rnd
%else
  punpckhbw  m2, m0, m5
  punpckhbw  m3, m1, m5
  punpcklbw  m0, m5
  punpcklbw  m1, m5
  pmullw     m0, filter_x_a
  pmullw     m1, filter_x_b
  paddw      m0, filter_rnd
  pmullw     m2, filter_x_a
  pmullw     m3, filter_x_b
  paddw      m2, filter_rnd
  paddw      m0, m1
  paddw      m2, m3
%endif
  psraw      m0, 4
  psraw      m2, 4
  add        srcq, src_strideq
  packuswb   m0, m2
.x_other_y_half_loop:
  movu       m4, [srcq]
  movu       m3, [srcq+1]
%if cpuflag(ssse3)
  mova       m1, [dstq]
  punpckhbw  m2, m4, m3
  punpcklbw  m4, m3
  pmaddubsw  m2, filter_x_a
  pmaddubsw  m4, filter_x_a
  paddw      m2, filter_rnd
  paddw      m4, filter_rnd
  psraw      m2, 4
  psraw      m4, 4
  packuswb   m4, m2
  pavgb      m0, m4                   ; vertical average with previous row
  punpckhbw  m3, m1, m5
  punpcklbw  m1, m5
%else
  punpckhbw  m2, m4, m5
  punpckhbw  m1, m3, m5
  punpcklbw  m4, m5
  punpcklbw  m3, m5
  pmullw     m4, filter_x_a
  pmullw     m3, filter_x_b
  paddw      m4, filter_rnd
  pmullw     m2, filter_x_a
  pmullw     m1, filter_x_b
  paddw      m2, filter_rnd
  paddw      m4, m3
  paddw      m2, m1
  mova       m1, [dstq]
  psraw      m4, 4
  psraw      m2, 4
  punpckhbw  m3, m1, m5
  ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
  ; have a 1-register shortage to be able to store the backup of the bilin
  ; filtered second line as words as cache for the next line. Packing into
  ; a byte costs 1 pack and 2 unpacks, but saves a register.
  packuswb   m4, m2
  punpcklbw  m1, m5
  pavgb      m0, m4                   ; vertical average with previous row
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  pavgb      m0, [secq]
%endif
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
  SUM_SSE    m0, m1, m2, m3, m6, m7
  mova       m0, m4                   ; carry filtered row to next iteration

  add        srcq, src_strideq
  add        dstq, dst_strideq
%else ; %1 < 16
  ; Prime m0 with the horizontally filtered first row, kept as words.
  movx       m0, [srcq]
  movx       m1, [srcq+1]
%if cpuflag(ssse3)
  punpcklbw  m0, m1
  pmaddubsw  m0, filter_x_a
  paddw      m0, filter_rnd
%else
  punpcklbw  m0, m5
  punpcklbw  m1, m5
  pmullw     m0, filter_x_a
  pmullw     m1, filter_x_b
  paddw      m0, filter_rnd
  paddw      m0, m1
%endif
  add        srcq, src_strideq
  psraw      m0, 4
.x_other_y_half_loop:
  movx       m2, [srcq]
  movx       m1, [srcq+1]
  movx       m4, [srcq+src_strideq]
  movx       m3, [srcq+src_strideq+1]
%if cpuflag(ssse3)
  punpcklbw  m2, m1
  punpcklbw  m4, m3
  pmaddubsw  m2, filter_x_a
  pmaddubsw  m4, filter_x_a
  movx       m1, [dstq]
  movx       m3, [dstq+dst_strideq]
  paddw      m2, filter_rnd
  paddw      m4, filter_rnd
%else
  punpcklbw  m2, m5
  punpcklbw  m1, m5
  punpcklbw  m4, m5
  punpcklbw  m3, m5
  pmullw     m2, filter_x_a
  pmullw     m1, filter_x_b
  paddw      m2, filter_rnd
  pmullw     m4, filter_x_a
  pmullw     m3, filter_x_b
  paddw      m4, filter_rnd
  paddw      m2, m1
  movx       m1, [dstq]
  paddw      m4, m3
  movx       m3, [dstq+dst_strideq]
%endif
  psraw      m2, 4
  psraw      m4, 4
  pavgw      m0, m2                   ; output row 0: previous vs next row 1
  pavgw      m2, m4                   ; output row 1: next row 1 vs next row 2
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline - also consider going to bytes here
%if %1 == 4
  movlhps    m0, m2
%endif
  packuswb   m0, m2
%if %1 > 4
  pavgb      m0, [secq]
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%else
  movh       m2, [secq]
  pavgb      m0, m2
  punpcklbw  m0, m5
  movhlps    m2, m0
%endif
%endif
  punpcklbw  m3, m5
  punpcklbw  m1, m5
  SUM_SSE    m0, m1, m2, m3, m6, m7
  mova       m0, m4                   ; carry filtered row to next iteration

  lea        srcq, [srcq+src_strideq*2]
  lea        dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add        secq, sec_str
%endif
  dec        block_height
  jg         .x_other_y_half_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET %1
|
|
|
|
.x_nonhalf_y_nonhalf:
  ; Bilinear in both directions: rows are filtered horizontally with the x
  ; taps, then consecutive filtered rows are combined with the y taps. Both
  ; stages compute (a*in1 + b*in2 + 8) >> 4.
%ifdef PIC
  lea        bilin_filter, [bilin_filter_m]
%endif
  shl        x_offsetd, filter_idx_shift ; filter indices -> byte offsets
  shl        y_offsetd, filter_idx_shift
%if ARCH_X86_64 && %1 > 4
  mova       m8, [bilin_filter+x_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova       m9, [bilin_filter+x_offsetq+16]
%endif
  mova       m10, [bilin_filter+y_offsetq]
%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
  mova       m11, [bilin_filter+y_offsetq+16]
%endif
  mova       m12, [pw_8]
%define filter_x_a m8
%define filter_x_b m9
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
%else ; x86-32
%if ARCH_X86=1 && CONFIG_PIC=1
  ; In this case, there is NO unused register. Used src_stride register. Later,
  ; src_stride has to be loaded from stack when it is needed.
%define tempq src_strideq
  mov        tempq, g_bilin_filterm
  add        x_offsetq, tempq
  add        y_offsetq, tempq
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]

  mov        tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add        x_offsetq, bilin_filter
  add        y_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
%endif

  ; x_offset == bilin interpolation && y_offset == bilin interpolation
%if %1 == 16
  ; Prime m0 with the horizontally filtered first row, packed to bytes.
  movu       m0, [srcq]
  movu       m1, [srcq+1]
%if cpuflag(ssse3)
  punpckhbw  m2, m0, m1
  punpcklbw  m0, m1
  pmaddubsw  m2, filter_x_a
  pmaddubsw  m0, filter_x_a
  paddw      m2, filter_rnd
  paddw      m0, filter_rnd
%else
  punpckhbw  m2, m0, m5
  punpckhbw  m3, m1, m5
  punpcklbw  m0, m5
  punpcklbw  m1, m5
  pmullw     m0, filter_x_a
  pmullw     m1, filter_x_b
  paddw      m0, filter_rnd
  pmullw     m2, filter_x_a
  pmullw     m3, filter_x_b
  paddw      m2, filter_rnd
  paddw      m0, m1
  paddw      m2, m3
%endif
  psraw      m0, 4
  psraw      m2, 4

  INC_SRC_BY_SRC_STRIDE

  packuswb   m0, m2
.x_other_y_other_loop:
%if cpuflag(ssse3)
  movu       m4, [srcq]
  movu       m3, [srcq+1]
  mova       m1, [dstq]
  ; Horizontal stage on the next row; result packed to bytes in m4.
  punpckhbw  m2, m4, m3
  punpcklbw  m4, m3
  pmaddubsw  m2, filter_x_a
  pmaddubsw  m4, filter_x_a
  punpckhbw  m3, m1, m5
  paddw      m2, filter_rnd
  paddw      m4, filter_rnd
  psraw      m2, 4
  psraw      m4, 4
  packuswb   m4, m2
  ; Vertical stage between the carried row (m0) and the new row (m4).
  punpckhbw  m2, m0, m4
  punpcklbw  m0, m4
  pmaddubsw  m2, filter_y_a
  pmaddubsw  m0, filter_y_a
  punpcklbw  m1, m5
  paddw      m2, filter_rnd
  paddw      m0, filter_rnd
  psraw      m2, 4
  psraw      m0, 4
%else
  movu       m3, [srcq]
  movu       m4, [srcq+1]
  ; Horizontal stage on the next row: low half in m3, high half in m1.
  punpckhbw  m1, m3, m5
  punpckhbw  m2, m4, m5
  punpcklbw  m3, m5
  punpcklbw  m4, m5
  pmullw     m3, filter_x_a
  pmullw     m4, filter_x_b
  paddw      m3, filter_rnd
  pmullw     m1, filter_x_a
  pmullw     m2, filter_x_b
  paddw      m1, filter_rnd
  paddw      m3, m4
  paddw      m1, m2
  psraw      m3, 4
  psraw      m1, 4
  packuswb   m4, m3, m1               ; byte copy kept as carry for next row
  ; Vertical stage between the carried row (m0) and the new row (m3/m1).
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
  pmullw     m2, filter_y_a
  pmullw     m1, filter_y_b
  paddw      m2, filter_rnd
  pmullw     m0, filter_y_a
  pmullw     m3, filter_y_b
  paddw      m2, m1
  mova       m1, [dstq]
  paddw      m0, filter_rnd
  psraw      m2, 4
  paddw      m0, m3
  punpckhbw  m3, m1, m5
  psraw      m0, 4
  punpcklbw  m1, m5
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
  packuswb   m0, m2
  pavgb      m0, [secq]
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%endif
  SUM_SSE    m0, m1, m2, m3, m6, m7
  mova       m0, m4                   ; carry filtered row to next iteration

  INC_SRC_BY_SRC_STRIDE
  add        dstq, dst_strideq
%else ; %1 < 16
  ; Prime m0 with the horizontally filtered first row: words on the SSE2
  ; path, packed bytes on the SSSE3 path.
  movx       m0, [srcq]
  movx       m1, [srcq+1]
%if cpuflag(ssse3)
  punpcklbw  m0, m1
  pmaddubsw  m0, filter_x_a
  paddw      m0, filter_rnd
%else
  punpcklbw  m0, m5
  punpcklbw  m1, m5
  pmullw     m0, filter_x_a
  pmullw     m1, filter_x_b
  paddw      m0, filter_rnd
  paddw      m0, m1
%endif
  psraw      m0, 4
%if cpuflag(ssse3)
  packuswb   m0, m0
%endif

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  ; Load the next two source rows, stepping srcq between them.
  movx       m2, [srcq]
  movx       m1, [srcq+1]

  INC_SRC_BY_SRC_STRIDE
  movx       m4, [srcq]
  movx       m3, [srcq+1]

%if cpuflag(ssse3)
  ; Horizontal stage on both rows, packed to bytes.
  punpcklbw  m2, m1
  punpcklbw  m4, m3
  pmaddubsw  m2, filter_x_a
  pmaddubsw  m4, filter_x_a
  movx       m3, [dstq+dst_strideq]
  movx       m1, [dstq]
  paddw      m2, filter_rnd
  paddw      m4, filter_rnd
  psraw      m2, 4
  psraw      m4, 4
  packuswb   m2, m2
  packuswb   m4, m4
  ; Vertical stage: (carried, row 1) -> m0 and (row 1, row 2) -> m2.
  punpcklbw  m0, m2
  punpcklbw  m2, m4
  pmaddubsw  m0, filter_y_a
  pmaddubsw  m2, filter_y_a
  punpcklbw  m3, m5
  paddw      m0, filter_rnd
  paddw      m2, filter_rnd
  psraw      m0, 4
  psraw      m2, 4
  punpcklbw  m1, m5
%else
  ; Horizontal stage on both rows, kept as words.
  punpcklbw  m2, m5
  punpcklbw  m1, m5
  punpcklbw  m4, m5
  punpcklbw  m3, m5
  pmullw     m2, filter_x_a
  pmullw     m1, filter_x_b
  paddw      m2, filter_rnd
  pmullw     m4, filter_x_a
  pmullw     m3, filter_x_b
  paddw      m4, filter_rnd
  paddw      m2, m1
  paddw      m4, m3
  psraw      m2, 4
  psraw      m4, 4
  ; Vertical stage: (carried, row 1) -> m0 and (row 1, row 2) -> m2.
  pmullw     m0, filter_y_a
  pmullw     m3, m2, filter_y_b
  paddw      m0, filter_rnd
  pmullw     m2, filter_y_a
  pmullw     m1, m4, filter_y_b
  paddw      m2, filter_rnd
  paddw      m0, m3
  movx       m3, [dstq+dst_strideq]
  paddw      m2, m1
  movx       m1, [dstq]
  psraw      m0, 4
  psraw      m2, 4
  punpcklbw  m3, m5
  punpcklbw  m1, m5
%endif
%if %2 == 1 ; avg
  ; FIXME(rbultje) pipeline
%if %1 == 4
  movlhps    m0, m2
%endif
  packuswb   m0, m2
%if %1 > 4
  pavgb      m0, [secq]
  punpckhbw  m2, m0, m5
  punpcklbw  m0, m5
%else
  movh       m2, [secq]
  pavgb      m0, m2
  punpcklbw  m0, m5
  movhlps    m2, m0
%endif
%endif
  SUM_SSE    m0, m1, m2, m3, m6, m7
  mova       m0, m4                   ; carry filtered row to next iteration

  INC_SRC_BY_SRC_STRIDE
  lea        dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
  add        secq, sec_str
%endif
  dec        block_height
  jg         .x_other_y_other_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
%undef movx
  STORE_AND_RET %1
%endmacro
|
|
|
|
; FIXME(rbultje) the non-bilinear versions (i.e. x=0,4&&y=0,4) are identical
|
|
; between the ssse3 and non-ssse3 version. It may make sense to merge their
|
|
; code in the sense that the ssse3 version would jump to the appropriate
|
|
; location in the sse/2 version, rather than duplicating that code in the
|
|
; binary.
|
|
|
|
; Plain variants (no second predictor) for widths 4, 8 and 16.
INIT_XMM sse2
SUBPEL_VARIANCE  4
SUBPEL_VARIANCE  8
SUBPEL_VARIANCE 16

INIT_XMM ssse3
SUBPEL_VARIANCE  4
SUBPEL_VARIANCE  8
SUBPEL_VARIANCE 16

; "avg" variants (second macro argument = 1) for widths 4, 8 and 16.
INIT_XMM sse2
SUBPEL_VARIANCE  4, 1
SUBPEL_VARIANCE  8, 1
SUBPEL_VARIANCE 16, 1

INIT_XMM ssse3
SUBPEL_VARIANCE  4, 1
SUBPEL_VARIANCE  8, 1
SUBPEL_VARIANCE 16, 1
|