VP9: Add ssse3 version of vpx_idct32x32_135_add()

Change-Id: I9a780131efaad28cf1ad233ae64c5c319a329727
This commit is contained in:
Scott LaVarnway 2015-12-02 04:50:46 -08:00
Родитель 2669e05949
Коммит f0b0b1fe62
4 изменённых файлов: 511 добавлений и 3 удалений

Просмотреть файл

@ -174,6 +174,9 @@ void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
else if (eob <= 34)
// non-zero coeff only in upper-left 8x8
vpx_idct32x32_34_add(input, dest, stride);
else if (eob <= 135)
// non-zero coeff only in upper-left 16x16
vpx_idct32x32_135_add(input, dest, stride);
else
vpx_idct32x32_1024_add(input, dest, stride);
}

Просмотреть файл

@ -1194,6 +1194,33 @@ void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
}
}
// 32x32 inverse DCT followed by reconstruction (add the residual to dest and
// clip), for blocks whose non-zero coefficients all sit in the upper-left
// 16x16 corner. The dispatcher selects this variant for eob <= 135.
void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  tran_low_t out[32 * 32] = {0};
  tran_low_t temp_in[32], temp_out[32];
  int row, col, k;

  // Row pass: only the first 16 rows can hold non-zero coefficients, so the
  // remaining 16 rows of the intermediate buffer keep their zero
  // initialisation.
  for (row = 0; row < 16; ++row)
    idct32_c(input + row * 32, out + row * 32);

  // Column pass: gather one column of the intermediate buffer, transform it,
  // then round (>> 6 with rounding) and accumulate into the destination.
  for (col = 0; col < 32; ++col) {
    for (k = 0; k < 32; ++k)
      temp_in[k] = out[k * 32 + col];

    idct32_c(temp_in, temp_out);

    for (k = 0; k < 32; ++k) {
      uint8_t *const px = &dest[k * stride + col];
      *px = clip_pixel_add(*px, ROUND_POWER_OF_TWO(temp_out[k], 6));
    }
  }
}
void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
int stride) {
tran_low_t out[32 * 32] = {0};

Просмотреть файл

@ -754,6 +754,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_1024_add/;
add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_135_add/;
add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_34_add/;
@ -802,6 +805,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_1024_add sse2/;
add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_135_add sse2/;
$vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_34_add sse2/;
@ -853,6 +860,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_1024_add/;
add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_135_add/;
add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_34_add/;
@ -892,6 +902,14 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_1024_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_135_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
# Need to add 135 eob idct32x32 implementations.
$vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
$vpx_idct32x32_135_add_neon=vpx_idct32x32_1024_add_neon;
$vpx_idct32x32_135_add_dspr2=vpx_idct32x32_1024_add_dspr2;
$vpx_idct32x32_135_add_msa=vpx_idct32x32_1024_add_msa;
add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_34_add sse2 neon_asm dspr2 msa/, "$ssse3_x86_64_x86inc";
# Need to add 34 eob idct32x32 neon implementation.

Просмотреть файл

@ -18,9 +18,13 @@ SECTION_RODATA
pw_11585x2: times 8 dw 23170
pw_m2404x2: times 8 dw -2404*2
pw_m4756x2: times 8 dw -4756*2
pw_m5520x2: times 8 dw -5520*2
pw_m2404x2: times 8 dw -2404*2
pw_m4756x2: times 8 dw -4756*2
pw_m5520x2: times 8 dw -5520*2
pw_m8423x2: times 8 dw -8423*2
pw_m9102x2: times 8 dw -9102*2
pw_m10394x2: times 8 dw -10394*2
pw_m11003x2: times 8 dw -11003*2
pw_16364x2: times 8 dw 16364*2
pw_16305x2: times 8 dw 16305*2
@ -29,6 +33,18 @@ pw_16069x2: times 8 dw 16069*2
pw_15893x2: times 8 dw 15893*2
pw_15679x2: times 8 dw 15679*2
pw_15426x2: times 8 dw 15426*2
pw_15137x2: times 8 dw 15137*2
pw_14811x2: times 8 dw 14811*2
pw_14449x2: times 8 dw 14449*2
pw_14053x2: times 8 dw 14053*2
pw_13623x2: times 8 dw 13623*2
pw_13160x2: times 8 dw 13160*2
pw_12665x2: times 8 dw 12665*2
pw_12140x2: times 8 dw 12140*2
pw__9760x2: times 8 dw 9760*2
pw__7723x2: times 8 dw 7723*2
pw__7005x2: times 8 dw 7005*2
pw__6270x2: times 8 dw 6270*2
pw__3981x2: times 8 dw 3981*2
pw__3196x2: times 8 dw 3196*2
pw__1606x2: times 8 dw 1606*2
@ -793,6 +809,450 @@ idct32x32_34_transpose_2:
RET
;-----------------------------------------------------------------------------
; IDCT32X32_135 off_0_7, off_8_15, off_16_23, off_24_31
;
; One 1-D 32-point inverse DCT over a set of vectors, specialised for the case
; where only the first 16 inputs are used.
;
; Inputs : the 16 vectors at [rsp + transposed_in + 16 * k], k = 0..15.
;          Inputs 16..31 are never read.
; Outputs: 32 vectors written relative to the stp base register:
;            param 1 is the byte offset added for idx0  .. idx7
;            param 2 is the byte offset added for idx8  .. idx15
;            param 3 is the byte offset added for idx16 .. idx23
;            param 4 is the byte offset added for idx24 .. idx31
;          The same slots are also used as spill space while the macro runs.
; Registers:
;   m8       handed to every BUTTERFLY_4X / BUTTERFLY_4Xmm as the fifth
;            argument; the visible caller loads [pd_8192] into it, so it is
;            presumably the rounding constant -- confirm in the macro source.
;   m9, m10  handed to SUM_SUB / BUTTERFLY_* as scratch registers.
;   m0-m7, m11-m15 carry data and are all overwritten.
; NOTE(review): idx0..idx31, stp, transposed_in, SUM_SUB and the BUTTERFLY
; macros are defined outside this view.
;
; Structure: the odd-indexed inputs feed blocks A (stp 16-19, 28-31) and
; B (stp 20-27), inputs 2/6/10/14 feed block C (stp 8-15) and inputs
; 0/4/8/12 feed block D (stp 0-7). Block D ends with the final butterflies
; that combine all four groups.
;-----------------------------------------------------------------------------
%macro IDCT32X32_135 4
; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; Each input is copied once and the two copies are scaled by a constant pair.
mova m1, [rsp + transposed_in + 16 * 1]
mova m11, m1
pmulhrsw m1, [pw___804x2] ; stp1_16
pmulhrsw m11, [pw_16364x2] ; stp2_31
mova m7, [rsp + transposed_in + 16 * 7]
mova m12, m7
pmulhrsw m7, [pw_15426x2] ; stp1_28
pmulhrsw m12, [pw_m5520x2] ; stp2_19
mova m3, [rsp + transposed_in + 16 * 9]
mova m4, m3
pmulhrsw m3, [pw__7005x2] ; stp1_18
pmulhrsw m4, [pw_14811x2] ; stp2_29
mova m0, [rsp + transposed_in + 16 * 15]
mova m2, m0
pmulhrsw m0, [pw_12140x2] ; stp1_30
pmulhrsw m2, [pw_m11003x2] ; stp2_17
; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 1, 2, 9 ; stp2_16, stp2_17
SUM_SUB 12, 3, 9 ; stp2_19, stp2_18
SUM_SUB 7, 4, 9 ; stp2_28, stp2_29
SUM_SUB 11, 0, 9 ; stp2_31, stp2_30
; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30
BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18
; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 1, 12, 9 ; stp2_16, stp2_19
SUM_SUB 0, 3, 9 ; stp2_17, stp2_18
SUM_SUB 11, 7, 9 ; stp2_31, stp2_28
SUM_SUB 2, 4, 9 ; stp2_30, stp2_29
; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29
BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28
; Spill block A (16-19, 28-31); block B reloads these slots in its stage 6.
mova [stp + %3 + idx16], m1
mova [stp + %3 + idx17], m0
mova [stp + %3 + idx18], m4
mova [stp + %3 + idx19], m7
mova [stp + %4 + idx28], m12
mova [stp + %4 + idx29], m3
mova [stp + %4 + idx30], m2
mova [stp + %4 + idx31], m11
; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m2, [rsp + transposed_in + 16 * 3]
mova m3, m2
pmulhrsw m3, [pw_m2404x2] ; stp1_23
pmulhrsw m2, [pw_16207x2] ; stp2_24
mova m5, [rsp + transposed_in + 16 * 5]
mova m6, m5
pmulhrsw m5, [pw__3981x2] ; stp1_20
pmulhrsw m6, [pw_15893x2] ; stp2_27
mova m14, [rsp + transposed_in + 16 * 11]
mova m13, m14
pmulhrsw m13, [pw_m8423x2] ; stp1_21
pmulhrsw m14, [pw_14053x2] ; stp2_26
mova m0, [rsp + transposed_in + 16 * 13]
mova m1, m0
pmulhrsw m0, [pw__9760x2] ; stp1_22
pmulhrsw m1, [pw_13160x2] ; stp2_25
; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 5, 13, 9 ; stp2_20, stp2_21
SUM_SUB 3, 0, 9 ; stp2_23, stp2_22
SUM_SUB 2, 1, 9 ; stp2_24, stp2_25
SUM_SUB 6, 14, 9 ; stp2_27, stp2_26
; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26
BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22
; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 3, 5, 9 ; stp2_23, stp2_20
SUM_SUB 0, 14, 9 ; stp2_22, stp2_21
SUM_SUB 2, 6, 9 ; stp2_24, stp2_27
SUM_SUB 1, 13, 9 ; stp2_25, stp2_26
; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20
BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21
; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; Reload block A's 16-19, combine with 20-23, and store the sums back.
mova m4, [stp + %3 + idx16]
mova m7, [stp + %3 + idx17]
mova m11, [stp + %3 + idx18]
mova m12, [stp + %3 + idx19]
SUM_SUB 4, 3, 9 ; stp2_16, stp2_23
SUM_SUB 7, 0, 9 ; stp2_17, stp2_22
SUM_SUB 11, 14, 9 ; stp2_18, stp2_21
SUM_SUB 12, 5, 9 ; stp2_19, stp2_20
mova [stp + %3 + idx16], m4
mova [stp + %3 + idx17], m7
mova [stp + %3 + idx18], m11
mova [stp + %3 + idx19], m12
; Same for block A's 28-31 against 24-27.
mova m4, [stp + %4 + idx28]
mova m7, [stp + %4 + idx29]
mova m11, [stp + %4 + idx30]
mova m12, [stp + %4 + idx31]
SUM_SUB 4, 6, 9 ; stp2_28, stp2_27
SUM_SUB 7, 13, 9 ; stp2_29, stp2_26
SUM_SUB 11, 1, 9 ; stp2_30, stp2_25
SUM_SUB 12, 2, 9 ; stp2_31, stp2_24
mova [stp + %4 + idx28], m4
mova [stp + %4 + idx29], m7
mova [stp + %4 + idx30], m11
mova [stp + %4 + idx31], m12
; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; The SUM_SUB + pmulhrsw form below is disabled; the BUTTERFLY_4X + SWAP form
; in the else branch is what gets assembled.
%if 0 ; overflow occurs in SUM_SUB when using test streams
mova m10, [pw_11585x2]
SUM_SUB 6, 5, 9
pmulhrsw m6, m10 ; stp1_27
pmulhrsw m5, m10 ; stp1_20
SUM_SUB 13, 14, 9
pmulhrsw m13, m10 ; stp1_26
pmulhrsw m14, m10 ; stp1_21
SUM_SUB 1, 0, 9
pmulhrsw m1, m10 ; stp1_25
pmulhrsw m0, m10 ; stp1_22
SUM_SUB 2, 3, 9
pmulhrsw m2, m10 ; stp1_24
pmulhrsw m3, m10 ; stp1_23
%else
BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27
SWAP 6, 5
BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26
SWAP 13, 14
BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25
SWAP 1, 0
BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24
SWAP 2, 3
%endif
; Spill block B (20-27) until the final stage.
mova [stp + %3 + idx20], m5
mova [stp + %3 + idx21], m14
mova [stp + %3 + idx22], m0
mova [stp + %3 + idx23], m3
mova [stp + %4 + idx24], m2
mova [stp + %4 + idx25], m1
mova [stp + %4 + idx26], m13
mova [stp + %4 + idx27], m6
; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;
; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m0, [rsp + transposed_in + 16 * 2]
mova m1, m0
pmulhrsw m0, [pw__1606x2] ; stp1_8
pmulhrsw m1, [pw_16305x2] ; stp2_15
mova m6, [rsp + transposed_in + 16 * 6]
mova m7, m6
pmulhrsw m7, [pw_m4756x2] ; stp2_11
pmulhrsw m6, [pw_15679x2] ; stp1_12
mova m4, [rsp + transposed_in + 16 * 10]
mova m5, m4
pmulhrsw m4, [pw__7723x2] ; stp1_10
pmulhrsw m5, [pw_14449x2] ; stp2_13
mova m2, [rsp + transposed_in + 16 * 14]
mova m3, m2
pmulhrsw m3, [pw_m10394x2] ; stp1_9
pmulhrsw m2, [pw_12665x2] ; stp2_14
; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 0, 3, 9 ; stp1_8, stp1_9
SUM_SUB 7, 4, 9 ; stp1_11, stp1_10
SUM_SUB 6, 5, 9 ; stp1_12, stp1_13
SUM_SUB 1, 2, 9 ; stp1_15, stp1_14
; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14
BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10
; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 0, 7, 9 ; stp1_8, stp1_11
SUM_SUB 2, 4, 9 ; stp1_9, stp1_10
SUM_SUB 1, 6, 9 ; stp1_15, stp1_12
SUM_SUB 3, 5, 9 ; stp1_14, stp1_13
; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
%if 0 ; overflow occurs in SUM_SUB when using test streams
mova m10, [pw_11585x2]
SUM_SUB 5, 4, 9
pmulhrsw m5, m10 ; stp1_13
pmulhrsw m4, m10 ; stp1_10
SUM_SUB 6, 7, 9
pmulhrsw m6, m10 ; stp1_12
pmulhrsw m7, m10 ; stp1_11
%else
BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13
SWAP 5, 4
BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12
SWAP 6, 7
%endif
; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; Spill block C (8-15); block D reloads it for the final butterflies.
mova [stp + %2 + idx8], m0
mova [stp + %2 + idx9], m2
mova [stp + %2 + idx10], m4
mova [stp + %2 + idx11], m7
mova [stp + %2 + idx12], m6
mova [stp + %2 + idx13], m5
mova [stp + %2 + idx14], m3
mova [stp + %2 + idx15], m1
; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;
; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;
; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m11, [rsp + transposed_in + 16 * 4]
mova m12, m11
pmulhrsw m11, [pw__3196x2] ; stp1_4
pmulhrsw m12, [pw_16069x2] ; stp1_7
mova m13, [rsp + transposed_in + 16 * 12]
mova m14, m13
pmulhrsw m13, [pw_13623x2] ; stp1_6
pmulhrsw m14, [pw_m9102x2] ; stp1_5
; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m0, [rsp + transposed_in + 16 * 0]
mova m2, [rsp + transposed_in + 16 * 8]
pmulhrsw m0, [pw_11585x2] ; stp1_1
mova m3, m2
pmulhrsw m2, [pw__6270x2] ; stp1_2
pmulhrsw m3, [pw_15137x2] ; stp1_3
SUM_SUB 11, 14, 9 ; stp1_4, stp1_5
SUM_SUB 12, 13, 9 ; stp1_7, stp1_6
; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
%if 0 ; overflow occurs in SUM_SUB when using test streams
mova m10, [pw_11585x2]
SUM_SUB 13, 14, 9
pmulhrsw m13, m10 ; stp1_6
pmulhrsw m14, m10 ; stp1_5
%else
BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6
SWAP 13, 14
%endif
; Input 16 is not read by this variant, so stp1_0 is a copy of stp1_1.
mova m1, m0 ; stp1_0 = stp1_1
SUM_SUB 0, 3, 9 ; stp1_0, stp1_3
SUM_SUB 1, 2, 9 ; stp1_1, stp1_2
; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SUM_SUB 0, 12, 9 ; stp1_0, stp1_7
SUM_SUB 1, 13, 9 ; stp1_1, stp1_6
SUM_SUB 2, 14, 9 ; stp1_2, stp1_5
SUM_SUB 3, 11, 9 ; stp1_3, stp1_4
; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m4, [stp + %2 + idx12]
mova m5, [stp + %2 + idx13]
mova m6, [stp + %2 + idx14]
mova m7, [stp + %2 + idx15]
SUM_SUB 0, 7, 9 ; stp1_0, stp1_15
SUM_SUB 1, 6, 9 ; stp1_1, stp1_14
SUM_SUB 2, 5, 9 ; stp1_2, stp1_13
SUM_SUB 3, 4, 9 ; stp1_3, stp1_12
; 0-3, 28-31 final stage
mova m10, [stp + %4 + idx31]
mova m15, [stp + %4 + idx30]
SUM_SUB 0, 10, 9 ; stp1_0, stp1_31
SUM_SUB 1, 15, 9 ; stp1_1, stp1_30
mova [stp + %1 + idx0], m0
mova [stp + %1 + idx1], m1
mova [stp + %4 + idx31], m10
mova [stp + %4 + idx30], m15
mova m0, [stp + %4 + idx29]
mova m1, [stp + %4 + idx28]
SUM_SUB 2, 0, 9 ; stp1_2, stp1_29
SUM_SUB 3, 1, 9 ; stp1_3, stp1_28
mova [stp + %1 + idx2], m2
mova [stp + %1 + idx3], m3
mova [stp + %4 + idx29], m0
mova [stp + %4 + idx28], m1
; 12-15, 16-19 final stage
mova m0, [stp + %3 + idx16]
mova m1, [stp + %3 + idx17]
mova m2, [stp + %3 + idx18]
mova m3, [stp + %3 + idx19]
SUM_SUB 7, 0, 9 ; stp1_15, stp1_16
SUM_SUB 6, 1, 9 ; stp1_14, stp1_17
SUM_SUB 5, 2, 9 ; stp1_13, stp1_18
SUM_SUB 4, 3, 9 ; stp1_12, stp1_19
mova [stp + %2 + idx12], m4
mova [stp + %2 + idx13], m5
mova [stp + %2 + idx14], m6
mova [stp + %2 + idx15], m7
mova [stp + %3 + idx16], m0
mova [stp + %3 + idx17], m1
mova [stp + %3 + idx18], m2
mova [stp + %3 + idx19], m3
; Combine 4-7 (still in m11-m14) with the spilled 8-11.
mova m4, [stp + %2 + idx8]
mova m5, [stp + %2 + idx9]
mova m6, [stp + %2 + idx10]
mova m7, [stp + %2 + idx11]
SUM_SUB 11, 7, 9 ; stp1_4, stp1_11
SUM_SUB 14, 6, 9 ; stp1_5, stp1_10
SUM_SUB 13, 5, 9 ; stp1_6, stp1_9
SUM_SUB 12, 4, 9 ; stp1_7, stp1_8
; 4-7, 24-27 final stage
mova m3, [stp + %4 + idx24]
mova m2, [stp + %4 + idx25]
mova m1, [stp + %4 + idx26]
mova m0, [stp + %4 + idx27]
SUM_SUB 12, 3, 9 ; stp1_7, stp1_24
SUM_SUB 13, 2, 9 ; stp1_6, stp1_25
SUM_SUB 14, 1, 9 ; stp1_5, stp1_26
SUM_SUB 11, 0, 9 ; stp1_4, stp1_27
mova [stp + %4 + idx24], m3
mova [stp + %4 + idx25], m2
mova [stp + %4 + idx26], m1
mova [stp + %4 + idx27], m0
mova [stp + %1 + idx4], m11
mova [stp + %1 + idx5], m14
mova [stp + %1 + idx6], m13
mova [stp + %1 + idx7], m12
; 8-11, 20-23 final stage
mova m0, [stp + %3 + idx20]
mova m1, [stp + %3 + idx21]
mova m2, [stp + %3 + idx22]
mova m3, [stp + %3 + idx23]
SUM_SUB 7, 0, 9 ; stp1_11, stp1_20
SUM_SUB 6, 1, 9 ; stp1_10, stp1_21
SUM_SUB 5, 2, 9 ; stp1_9, stp1_22
SUM_SUB 4, 3, 9 ; stp1_8, stp1_23
mova [stp + %2 + idx8], m4
mova [stp + %2 + idx9], m5
mova [stp + %2 + idx10], m6
mova [stp + %2 + idx11], m7
mova [stp + %3 + idx20], m0
mova [stp + %3 + idx21], m1
mova [stp + %3 + idx22], m2
mova [stp + %3 + idx23], m3
%endmacro
;-----------------------------------------------------------------------------
; idct32x32_135_add(input, output, stride), SSSE3
;
; Two-pass 32x32 inverse transform built on the IDCT32X32_135 macro, followed
; by RECON_AND_STORE. Declared through x86inc's cglobal with 3 arguments,
; 11 general-purpose registers, 16 XMM registers and i32x32_size bytes of
; stack scratch (argument meaning per x86inc's cglobal convention).
;
; Register roles in this function:
;   m8      loaded once with [pd_8192]; consumed inside IDCT32X32_135
;   r6      outer loop counter (2 iterations in pass 1, 4 in pass 2)
;   r7      inner counter: two 8x8 transposes per outer iteration
;   r3, r4  transpose source / destination pointers
;   r9      pass-2 source pointer into the pass-1 results
;   stp     base pointer for the macro's outputs (defined outside this view)
;
; NOTE(review): pass_one_start, pass_two_start, transposed_in, i32x32_size,
; TRANSPOSE8X8 and RECON_AND_STORE are defined outside this view.
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal idct32x32_135_add, 3, 11, 16, i32x32_size, input, output, stride
mova m8, [pd_8192]
mov r6, 2
lea stp, [rsp + pass_one_start]
; ---- Pass 1: two iterations; inputq advances 16 * 32 bytes per iteration ---
idct32x32_135:
mov r3, inputq
lea r4, [rsp + transposed_in]
mov r7, 2
; Transpose two adjacent 8x8 tiles into the transposed_in scratch area.
; The eight source vectors are 16 * 4 bytes apart (presumably one row of 32
; 16-bit coefficients -- confirm the element size); the second tile starts
; 16 bytes further along.
idct32x32_135_transpose:
mova m0, [r3 + 0]
mova m1, [r3 + 16 * 4]
mova m2, [r3 + 16 * 8]
mova m3, [r3 + 16 * 12]
mova m4, [r3 + 16 * 16]
mova m5, [r3 + 16 * 20]
mova m6, [r3 + 16 * 24]
mova m7, [r3 + 16 * 28]
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
mova [r4 + 0], m0
mova [r4 + 16 * 1], m1
mova [r4 + 16 * 2], m2
mova [r4 + 16 * 3], m3
mova [r4 + 16 * 4], m4
mova [r4 + 16 * 5], m5
mova [r4 + 16 * 6], m6
mova [r4 + 16 * 7], m7
add r3, 16
add r4, 16 * 8
dec r7
jne idct32x32_135_transpose
; Pass-1 outputs: the four 8-vector groups land 16 * 32 bytes apart, and stp
; then moves on by 16 * 8 for the next iteration.
IDCT32X32_135 16*0, 16*32, 16*64, 16*96
lea stp, [stp + 16 * 8]
lea inputq, [inputq + 16 * 32]
dec r6
jnz idct32x32_135
; ---- Pass 2: four iterations over the pass-1 results ----------------------
; NOTE(review): stp is reset to pass_one_start here while RECON_AND_STORE
; below is given pass_two_start; this presumably relies on both symbols
; naming the same offset -- confirm against their definitions.
mov r6, 4
lea stp, [rsp + pass_one_start]
lea r9, [rsp + pass_one_start]
idct32x32_135_2:
lea r4, [rsp + transposed_in]
mov r3, r9
mov r7, 2
; Transpose two 8x8 tiles, each read from eight consecutive vectors at r3.
idct32x32_135_transpose_2:
mova m0, [r3 + 0]
mova m1, [r3 + 16 * 1]
mova m2, [r3 + 16 * 2]
mova m3, [r3 + 16 * 3]
mova m4, [r3 + 16 * 4]
mova m5, [r3 + 16 * 5]
mova m6, [r3 + 16 * 6]
mova m7, [r3 + 16 * 7]
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
mova [r4 + 0], m0
mova [r4 + 16 * 1], m1
mova [r4 + 16 * 2], m2
mova [r4 + 16 * 3], m3
mova [r4 + 16 * 4], m4
mova [r4 + 16 * 5], m5
mova [r4 + 16 * 6], m6
mova [r4 + 16 * 7], m7
add r3, 16 * 8
add r4, 16 * 8
dec r7
jne idct32x32_135_transpose_2
; Pass-2 outputs: the four groups are contiguous (16 * 8 bytes apart); stp and
; r9 both step by 16 * 32 per iteration. The inputs were already copied to
; transposed_in, so writing through stp does not disturb them.
IDCT32X32_135 16*0, 16*8, 16*16, 16*24
lea stp, [stp + 16 * 32]
add r9, 16 * 32
dec r6
jnz idct32x32_135_2
; Reconstruction from the pass-2 buffer (macro defined outside this view).
RECON_AND_STORE pass_two_start
RET
%macro IDCT32X32_1024 4
; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mova m1, [rsp + transposed_in + 16 * 1]