Get optimized inv_txfm funcs to work with HBD build

This commit pulls in several upstream libvpx commits that make a few
assembly-optimized inverse transform functions work in builds
configured with --enable-vpx-highbitdepth.

Change-Id: Ibc1f18de196c1401a60faf851e5bee18ed616e69
Yaowu Xu 2016-02-01 16:46:53 -08:00
Parent c95eb4c776
Commit bacba875d9
3 changed files with 118 additions and 12 deletions
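Background for the diffs below, as a minimal illustrative sketch rather than libvpx source: with --enable-vpx-highbitdepth, transform coefficients (tran_low_t) are stored as 32-bit values, while the SSE2/SSSE3 inverse transform kernels operate on 16-bit lanes. The added mova/packssdw pairs narrow each pair of 4-coefficient loads to eight 16-bit lanes with signed saturation, which is also why the per-row input strides double (add r3, 32 instead of 16, and inputq advances by 32 * 32 instead of 16 * 32). The C sketch below shows the scalar equivalent of one such load-and-pack; the helper name is illustrative only.

/* Illustrative sketch only, not libvpx source: scalar equivalent of
 * "mova m0, [inputq + 0]" followed by "packssdw m0, [inputq + 16]". */
#include <stdint.h>

#ifndef CONFIG_VPX_HIGHBITDEPTH
#define CONFIG_VPX_HIGHBITDEPTH 1 /* assume a high-bitdepth build for this example */
#endif

#if CONFIG_VPX_HIGHBITDEPTH
typedef int32_t tran_low_t; /* HBD builds store coefficients as 32-bit values */
#else
typedef int16_t tran_low_t; /* non-HBD builds already use 16-bit coefficients */
#endif

/* Read eight coefficients and narrow them to 16 bits with signed
 * saturation, matching what packssdw does across one 8-lane register. */
static void load_and_pack_row(const tran_low_t *input, int16_t out[8]) {
  for (int i = 0; i < 8; ++i) {
    int32_t v = (int32_t)input[i];
    if (v > INT16_MAX) v = INT16_MAX;
    if (v < INT16_MIN) v = INT16_MIN;
    out[i] = (int16_t)v;
  }
}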

View file

@@ -699,7 +699,7 @@ if (vpx_config("CONFIG_VPX_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_iwht4x4_1_add/;
add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_iwht4x4_16_add/;
specialize qw/vpx_iwht4x4_16_add/, "$sse2_x86inc";
add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vpx_highbd_idct4x4_1_add/;
@@ -762,7 +762,7 @@ if (vpx_config("CONFIG_VPX_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_1_add/;
add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vpx_highbd_idct4x4_16_add/;
@@ -785,10 +785,10 @@ if (vpx_config("CONFIG_VPX_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_idct4x4_1_add sse2/;
add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct8x8_64_add sse2/;
specialize qw/vpx_idct8x8_64_add sse2/, "$ssse3_x86_64_x86inc";
add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct8x8_12_add sse2/;
specialize qw/vpx_idct8x8_12_add sse2/, "$ssse3_x86_64_x86inc";
add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct8x8_1_add sse2/;
@@ -803,14 +803,15 @@ if (vpx_config("CONFIG_VPX_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_idct16x16_1_add sse2/;
add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_1024_add sse2/;
specialize qw/vpx_idct32x32_1024_add sse2/, "$ssse3_x86_64_x86inc";
add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_135_add sse2/;
specialize qw/vpx_idct32x32_135_add sse2/, "$ssse3_x86_64_x86inc";
# Need to add 135 eob idct32x32 implementations.
$vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_34_add sse2/;
specialize qw/vpx_idct32x32_34_add sse2/, "$ssse3_x86_64_x86inc";
add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_1_add sse2/;

View file

@@ -220,7 +220,24 @@ cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
mova m12, [pw_11585x2]
lea r3, [2 * strideq]
%if CONFIG_VPX_HIGHBITDEPTH
mova m0, [inputq + 0]
packssdw m0, [inputq + 16]
mova m1, [inputq + 32]
packssdw m1, [inputq + 48]
mova m2, [inputq + 64]
packssdw m2, [inputq + 80]
mova m3, [inputq + 96]
packssdw m3, [inputq + 112]
mova m4, [inputq + 128]
packssdw m4, [inputq + 144]
mova m5, [inputq + 160]
packssdw m5, [inputq + 176]
mova m6, [inputq + 192]
packssdw m6, [inputq + 208]
mova m7, [inputq + 224]
packssdw m7, [inputq + 240]
%else
mova m0, [inputq + 0]
mova m1, [inputq + 16]
mova m2, [inputq + 32]
@@ -229,7 +246,7 @@ cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
mova m5, [inputq + 80]
mova m6, [inputq + 96]
mova m7, [inputq + 112]
%endif
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
IDCT8_1D
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
@@ -254,10 +271,21 @@ cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
lea r3, [2 * strideq]
%if CONFIG_VPX_HIGHBITDEPTH
mova m0, [inputq + 0]
packssdw m0, [inputq + 16]
mova m1, [inputq + 32]
packssdw m1, [inputq + 48]
mova m2, [inputq + 64]
packssdw m2, [inputq + 80]
mova m3, [inputq + 96]
packssdw m3, [inputq + 112]
%else
mova m0, [inputq + 0]
mova m1, [inputq + 16]
mova m2, [inputq + 32]
mova m3, [inputq + 48]
%endif
punpcklwd m0, m1
punpcklwd m2, m3
@@ -765,6 +793,24 @@ idct32x32_34:
lea r4, [rsp + transposed_in]
idct32x32_34_transpose:
%if CONFIG_VPX_HIGHBITDEPTH
mova m0, [r3 + 0]
packssdw m0, [r3 + 16]
mova m1, [r3 + 32 * 4]
packssdw m1, [r3 + 32 * 4 + 16]
mova m2, [r3 + 32 * 8]
packssdw m2, [r3 + 32 * 8 + 16]
mova m3, [r3 + 32 * 12]
packssdw m3, [r3 + 32 * 12 + 16]
mova m4, [r3 + 32 * 16]
packssdw m4, [r3 + 32 * 16 + 16]
mova m5, [r3 + 32 * 20]
packssdw m5, [r3 + 32 * 20 + 16]
mova m6, [r3 + 32 * 24]
packssdw m6, [r3 + 32 * 24 + 16]
mova m7, [r3 + 32 * 28]
packssdw m7, [r3 + 32 * 28 + 16]
%else
mova m0, [r3 + 0]
mova m1, [r3 + 16 * 4]
mova m2, [r3 + 16 * 8]
@@ -773,6 +819,7 @@ idct32x32_34_transpose:
mova m5, [r3 + 16 * 20]
mova m6, [r3 + 16 * 24]
mova m7, [r3 + 16 * 28]
%endif
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
@@ -1176,6 +1223,24 @@ idct32x32_135:
mov r7, 2
idct32x32_135_transpose:
%if CONFIG_VPX_HIGHBITDEPTH
mova m0, [r3 + 0]
packssdw m0, [r3 + 16]
mova m1, [r3 + 32 * 4]
packssdw m1, [r3 + 32 * 4 + 16]
mova m2, [r3 + 32 * 8]
packssdw m2, [r3 + 32 * 8 + 16]
mova m3, [r3 + 32 * 12]
packssdw m3, [r3 + 32 * 12 + 16]
mova m4, [r3 + 32 * 16]
packssdw m4, [r3 + 32 * 16 + 16]
mova m5, [r3 + 32 * 20]
packssdw m5, [r3 + 32 * 20 + 16]
mova m6, [r3 + 32 * 24]
packssdw m6, [r3 + 32 * 24 + 16]
mova m7, [r3 + 32 * 28]
packssdw m7, [r3 + 32 * 28 + 16]
%else
mova m0, [r3 + 0]
mova m1, [r3 + 16 * 4]
mova m2, [r3 + 16 * 8]
@@ -1184,7 +1249,7 @@ idct32x32_135_transpose:
mova m5, [r3 + 16 * 20]
mova m6, [r3 + 16 * 24]
mova m7, [r3 + 16 * 28]
%endif
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
mova [r4 + 0], m0
@@ -1196,14 +1261,22 @@ idct32x32_135_transpose:
mova [r4 + 16 * 6], m6
mova [r4 + 16 * 7], m7
%if CONFIG_VPX_HIGHBITDEPTH
add r3, 32
%else
add r3, 16
%endif
add r4, 16 * 8
dec r7
jne idct32x32_135_transpose
IDCT32X32_135 16*0, 16*32, 16*64, 16*96
lea stp, [stp + 16 * 8]
%if CONFIG_VPX_HIGHBITDEPTH
lea inputq, [inputq + 32 * 32]
%else
lea inputq, [inputq + 16 * 32]
%endif
dec r6
jnz idct32x32_135
@@ -1614,6 +1687,24 @@ idct32x32_1024:
mov r7, 4
idct32x32_1024_transpose:
%if CONFIG_VPX_HIGHBITDEPTH
mova m0, [r3 + 0]
packssdw m0, [r3 + 16]
mova m1, [r3 + 32 * 4]
packssdw m1, [r3 + 32 * 4 + 16]
mova m2, [r3 + 32 * 8]
packssdw m2, [r3 + 32 * 8 + 16]
mova m3, [r3 + 32 * 12]
packssdw m3, [r3 + 32 * 12 + 16]
mova m4, [r3 + 32 * 16]
packssdw m4, [r3 + 32 * 16 + 16]
mova m5, [r3 + 32 * 20]
packssdw m5, [r3 + 32 * 20 + 16]
mova m6, [r3 + 32 * 24]
packssdw m6, [r3 + 32 * 24 + 16]
mova m7, [r3 + 32 * 28]
packssdw m7, [r3 + 32 * 28 + 16]
%else
mova m0, [r3 + 0]
mova m1, [r3 + 16 * 4]
mova m2, [r3 + 16 * 8]
@@ -1622,6 +1713,7 @@ idct32x32_1024_transpose:
mova m5, [r3 + 16 * 20]
mova m6, [r3 + 16 * 24]
mova m7, [r3 + 16 * 28]
%endif
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
@@ -1633,8 +1725,11 @@ idct32x32_1024_transpose:
mova [r4 + 16 * 5], m5
mova [r4 + 16 * 6], m6
mova [r4 + 16 * 7], m7
%if CONFIG_VPX_HIGHBITDEPTH
add r3, 32
%else
add r3, 16
%endif
add r4, 16 * 8
dec r7
jne idct32x32_1024_transpose
@@ -1642,7 +1737,11 @@ idct32x32_1024_transpose:
IDCT32X32_1024 16*0, 16*32, 16*64, 16*96
lea stp, [stp + 16 * 8]
%if CONFIG_VPX_HIGHBITDEPTH
lea inputq, [inputq + 32 * 32]
%else
lea inputq, [inputq + 16 * 32]
%endif
dec r6
jnz idct32x32_1024

View file

@@ -82,9 +82,15 @@ SECTION .text
INIT_XMM sse2
cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
%if CONFIG_VPX_HIGHBITDEPTH
mova m0, [inputq + 0]
packssdw m0, [inputq + 16]
mova m1, [inputq + 32]
packssdw m1, [inputq + 48]
%else
mova m0, [inputq + 0]
mova m1, [inputq + 16]
%endif
psraw m0, 2
psraw m1, 2