mips msa vp9: updated convolve horiz, vert, hv, copy, avg modules

Updated the sources to match the improved version of the common MSA macros.
Enabled the respective convolve MSA hooks and tests.
Overall, this just upgrades the code with styling changes.

Change-Id: If5ad6ef8ea7ca47feed6d2fc9f34f0f0e8b6694d
This commit is contained in:
Parag Salasakar 2015-06-02 10:29:56 +05:30
Родитель cf1c0ebc3a
Коммит ebf7466cd8
9 изменённых файлов: 1176 добавлений и 2005 удалений

Просмотреть файл

@ -1815,8 +1815,7 @@ INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest, ::testing::Values(
make_tuple(64, 64, &convolve8_dspr2))); make_tuple(64, 64, &convolve8_dspr2)));
#endif #endif
#if 0 // HAVE_MSA #if HAVE_MSA
// TODO(parag): enable when function hooks are added
const ConvolveFunctions convolve8_msa( const ConvolveFunctions convolve8_msa(
vp9_convolve_copy_msa, vp9_convolve_avg_msa, vp9_convolve_copy_msa, vp9_convolve_avg_msa,
vp9_convolve8_horiz_msa, vp9_convolve8_avg_horiz_c, vp9_convolve8_horiz_msa, vp9_convolve8_avg_horiz_c,

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -26,93 +26,68 @@ static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
int32_t height) { int32_t height) {
uint32_t loop_cnt; uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
v16i8 filt_horiz0, filt_horiz1, filt_horiz2, filt_horiz3; v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
v16u8 mask0, mask1, mask2, mask3; v16u8 mask0, mask1, mask2, mask3, out;
v8i16 filt_horiz; v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
v8i16 horiz_out0, horiz_out1, horiz_out2, horiz_out3, horiz_out4; v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4;
v8i16 horiz_out5, horiz_out6, horiz_out7, horiz_out8, horiz_out9; v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
v8i16 tmp0, tmp1, out0, out1, out2, out3, out4;
v8i16 filt, filt_vert0, filt_vert1, filt_vert2, filt_vert3;
mask0 = LOAD_UB(&mc_filt_mask_arr[16]);
mask0 = LD_UB(&mc_filt_mask_arr[16]);
src -= (3 + 3 * src_stride); src -= (3 + 3 * src_stride);
/* rearranging filter */ /* rearranging filter */
filt_horiz = LOAD_SH(filter_horiz); filt = LD_SH(filter_horiz);
filt_horiz0 = (v16i8)__msa_splati_h(filt_horiz, 0); SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
filt_horiz1 = (v16i8)__msa_splati_h(filt_horiz, 1);
filt_horiz2 = (v16i8)__msa_splati_h(filt_horiz, 2);
filt_horiz3 = (v16i8)__msa_splati_h(filt_horiz, 3);
mask1 = mask0 + 2; mask1 = mask0 + 2;
mask2 = mask0 + 4; mask2 = mask0 + 4;
mask3 = mask0 + 6; mask3 = mask0 + 6;
LOAD_7VECS_SB(src, src_stride, src0, src1, src2, src3, src4, src5, src6); LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
src += (7 * src_stride); src += (7 * src_stride);
XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6, hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
src0, src1, src2, src3, src4, src5, src6, 128); filt_hz1, filt_hz2, filt_hz3);
hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
horiz_out0 = HORIZ_8TAP_FILT_2VECS(src0, src1, mask0, mask1, mask2, mask3, filt = LD_SH(filter_vert);
filt_horiz0, filt_horiz1, filt_horiz2, SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
filt_horiz3);
horiz_out2 = HORIZ_8TAP_FILT_2VECS(src2, src3, mask0, mask1, mask2, mask3,
filt_horiz0, filt_horiz1, filt_horiz2,
filt_horiz3);
horiz_out4 = HORIZ_8TAP_FILT_2VECS(src4, src5, mask0, mask1, mask2, mask3,
filt_horiz0, filt_horiz1, filt_horiz2,
filt_horiz3);
horiz_out5 = HORIZ_8TAP_FILT_2VECS(src5, src6, mask0, mask1, mask2, mask3,
filt_horiz0, filt_horiz1, filt_horiz2,
filt_horiz3);
horiz_out1 = (v8i16)__msa_sldi_b((v16i8)horiz_out2, (v16i8)horiz_out0, 8);
horiz_out3 = (v8i16)__msa_sldi_b((v16i8)horiz_out4, (v16i8)horiz_out2, 8);
filt = LOAD_SH(filter_vert); ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
filt_vert0 = __msa_splati_h(filt, 0); out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
filt_vert1 = __msa_splati_h(filt, 1);
filt_vert2 = __msa_splati_h(filt, 2);
filt_vert3 = __msa_splati_h(filt, 3);
out0 = (v8i16)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
out1 = (v8i16)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
out2 = (v8i16)__msa_ilvev_b((v16i8)horiz_out5, (v16i8)horiz_out4);
for (loop_cnt = (height >> 2); loop_cnt--;) { for (loop_cnt = (height >> 2); loop_cnt--;) {
LOAD_4VECS_SB(src, src_stride, src7, src8, src9, src10); LD_SB4(src, src_stride, src7, src8, src9, src10);
XORI_B4_128_SB(src7, src8, src9, src10);
src += (4 * src_stride); src += (4 * src_stride);
XORI_B_4VECS_SB(src7, src8, src9, src10, src7, src8, src9, src10, 128); hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
filt_hz0, filt_hz1, filt_hz2, filt_hz3);
hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
filt_vt2, filt_vt3);
horiz_out7 = HORIZ_8TAP_FILT_2VECS(src7, src8, mask0, mask1, mask2, mask3, hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
filt_horiz0, filt_horiz1, filt_horiz2, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
filt_horiz3); hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
horiz_out6 = (v8i16)__msa_sldi_b((v16i8)horiz_out7, (v16i8)horiz_out5, 8); out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1,
out3 = (v8i16)__msa_ilvev_b((v16i8)horiz_out7, (v16i8)horiz_out6); filt_vt2, filt_vt3);
SRARI_H2_SH(tmp0, tmp1, FILTER_BITS);
tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vert0, filt_vert1, SAT_SH2_SH(tmp0, tmp1, 7);
filt_vert2, filt_vert3); out = PCKEV_XORI128_UB(tmp0, tmp1);
ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
horiz_out9 = HORIZ_8TAP_FILT_2VECS(src9, src10, mask0, mask1, mask2, mask3,
filt_horiz0, filt_horiz1, filt_horiz2,
filt_horiz3);
horiz_out8 = (v8i16)__msa_sldi_b((v16i8)horiz_out9, (v16i8)horiz_out7, 8);
out4 = (v8i16)__msa_ilvev_b((v16i8)horiz_out9, (v16i8)horiz_out8);
tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vert0, filt_vert1,
filt_vert2, filt_vert3);
tmp0 = SRARI_SATURATE_SIGNED_H(tmp0, FILTER_BITS, 7);
tmp1 = SRARI_SATURATE_SIGNED_H(tmp1, FILTER_BITS, 7);
PCKEV_2B_XORI128_STORE_4_BYTES_4(tmp0, tmp1, dst, dst_stride);
dst += (4 * dst_stride); dst += (4 * dst_stride);
horiz_out5 = horiz_out9; hz_out5 = hz_out9;
out0 = out2; out0 = out2;
out1 = out3; out1 = out3;
out2 = out4; out2 = out4;
@ -125,108 +100,87 @@ static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,
int32_t height) { int32_t height) {
uint32_t loop_cnt; uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
v16i8 filt_horiz0, filt_horiz1, filt_horiz2, filt_horiz3; v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
v8i16 filt_horiz, filt, filt_vert0, filt_vert1, filt_vert2, filt_vert3; v16u8 mask0, mask1, mask2, mask3, vec0, vec1;
v16u8 mask0, mask1, mask2, mask3; v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
v8i16 horiz_out0, horiz_out1, horiz_out2, horiz_out3; v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
v8i16 horiz_out4, horiz_out5, horiz_out6, horiz_out7; v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
v8i16 horiz_out8, horiz_out9, horiz_out10;
v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9; v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
v8i16 tmp0, tmp1, tmp2, tmp3;
mask0 = LOAD_UB(&mc_filt_mask_arr[0]);
mask0 = LD_UB(&mc_filt_mask_arr[0]);
src -= (3 + 3 * src_stride); src -= (3 + 3 * src_stride);
/* rearranging filter */ /* rearranging filter */
filt_horiz = LOAD_SH(filter_horiz); filt = LD_SH(filter_horiz);
filt_horiz0 = (v16i8)__msa_splati_h(filt_horiz, 0); SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
filt_horiz1 = (v16i8)__msa_splati_h(filt_horiz, 1);
filt_horiz2 = (v16i8)__msa_splati_h(filt_horiz, 2);
filt_horiz3 = (v16i8)__msa_splati_h(filt_horiz, 3);
mask1 = mask0 + 2; mask1 = mask0 + 2;
mask2 = mask0 + 4; mask2 = mask0 + 4;
mask3 = mask0 + 6; mask3 = mask0 + 6;
LOAD_7VECS_SB(src, src_stride, src0, src1, src2, src3, src4, src5, src6); LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
src += (7 * src_stride); src += (7 * src_stride);
XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6, XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
src0, src1, src2, src3, src4, src5, src6, 128); hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
filt_hz1, filt_hz2, filt_hz3);
horiz_out0 = HORIZ_8TAP_FILT(src0, mask0, mask1, mask2, mask3, filt_horiz0, filt = LD_SH(filter_vert);
filt_horiz1, filt_horiz2, filt_horiz3); SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
horiz_out1 = HORIZ_8TAP_FILT(src1, mask0, mask1, mask2, mask3, filt_horiz0,
filt_horiz1, filt_horiz2, filt_horiz3);
horiz_out2 = HORIZ_8TAP_FILT(src2, mask0, mask1, mask2, mask3, filt_horiz0,
filt_horiz1, filt_horiz2, filt_horiz3);
horiz_out3 = HORIZ_8TAP_FILT(src3, mask0, mask1, mask2, mask3, filt_horiz0,
filt_horiz1, filt_horiz2, filt_horiz3);
horiz_out4 = HORIZ_8TAP_FILT(src4, mask0, mask1, mask2, mask3, filt_horiz0,
filt_horiz1, filt_horiz2, filt_horiz3);
horiz_out5 = HORIZ_8TAP_FILT(src5, mask0, mask1, mask2, mask3, filt_horiz0,
filt_horiz1, filt_horiz2, filt_horiz3);
horiz_out6 = HORIZ_8TAP_FILT(src6, mask0, mask1, mask2, mask3, filt_horiz0,
filt_horiz1, filt_horiz2, filt_horiz3);
filt = LOAD_SH(filter_vert); ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
filt_vert0 = __msa_splati_h(filt, 0); ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
filt_vert1 = __msa_splati_h(filt, 1); ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
filt_vert2 = __msa_splati_h(filt, 2);
filt_vert3 = __msa_splati_h(filt, 3);
out0 = (v8i16)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
out1 = (v8i16)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
out2 = (v8i16)__msa_ilvev_b((v16i8)horiz_out5, (v16i8)horiz_out4);
out4 = (v8i16)__msa_ilvev_b((v16i8)horiz_out2, (v16i8)horiz_out1);
out5 = (v8i16)__msa_ilvev_b((v16i8)horiz_out4, (v16i8)horiz_out3);
out6 = (v8i16)__msa_ilvev_b((v16i8)horiz_out6, (v16i8)horiz_out5);
for (loop_cnt = (height >> 2); loop_cnt--;) { for (loop_cnt = (height >> 2); loop_cnt--;) {
LOAD_4VECS_SB(src, src_stride, src7, src8, src9, src10); LD_SB4(src, src_stride, src7, src8, src9, src10);
src += (4 * src_stride); src += (4 * src_stride);
XORI_B_4VECS_SB(src7, src8, src9, src10, src7, src8, src9, src10, 128); XORI_B4_128_SB(src7, src8, src9, src10);
horiz_out7 = HORIZ_8TAP_FILT(src7, mask0, mask1, mask2, mask3, filt_horiz0, hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
filt_horiz1, filt_horiz2, filt_horiz3); filt_hz0, filt_hz1, filt_hz2, filt_hz3);
out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
filt_vt2, filt_vt3);
out3 = (v8i16)__msa_ilvev_b((v16i8)horiz_out7, (v16i8)horiz_out6); hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vert0, filt_vert1, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
filt_vert2, filt_vert3); out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
tmp0 = SRARI_SATURATE_SIGNED_H(tmp0, FILTER_BITS, 7); tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
filt_vt2, filt_vt3);
horiz_out8 = HORIZ_8TAP_FILT(src8, mask0, mask1, mask2, mask3, filt_horiz0, hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
filt_horiz1, filt_horiz2, filt_horiz3); filt_hz0, filt_hz1, filt_hz2, filt_hz3);
out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
filt_vt2, filt_vt3);
out7 = (v8i16)__msa_ilvev_b((v16i8)horiz_out8, (v16i8)horiz_out7); hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vert0, filt_vert1, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
filt_vert2, filt_vert3); out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
tmp1 = SRARI_SATURATE_SIGNED_H(tmp1, FILTER_BITS, 7); tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
filt_vt2, filt_vt3);
horiz_out9 = HORIZ_8TAP_FILT(src9, mask0, mask1, mask2, mask3, filt_horiz0, SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
filt_horiz1, filt_horiz2, filt_horiz3); SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
out8 = (v8i16)__msa_ilvev_b((v16i8)horiz_out9, (v16i8)horiz_out8); vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vert0, filt_vert1, ST8x4_UB(vec0, vec1, dst, dst_stride);
filt_vert2, filt_vert3);
tmp2 = SRARI_SATURATE_SIGNED_H(tmp2, FILTER_BITS, 7);
horiz_out10 = HORIZ_8TAP_FILT(src10, mask0, mask1, mask2, mask3,
filt_horiz0, filt_horiz1, filt_horiz2,
filt_horiz3);
out9 = (v8i16)__msa_ilvev_b((v16i8)horiz_out10, (v16i8)horiz_out9);
tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vert0, filt_vert1,
filt_vert2, filt_vert3);
tmp3 = SRARI_SATURATE_SIGNED_H(tmp3, FILTER_BITS, 7);
PCKEV_B_4_XORI128_STORE_8_BYTES_4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
dst += (4 * dst_stride); dst += (4 * dst_stride);
horiz_out6 = horiz_out10; hz_out6 = hz_out10;
out0 = out2; out0 = out2;
out1 = out3; out1 = out3;
out2 = out8; out2 = out8;
@ -279,175 +233,89 @@ static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride, uint8_t *dst, int32_t dst_stride,
int8_t *filter_horiz, int8_t *filter_horiz,
int8_t *filter_vert) { int8_t *filter_vert) {
uint32_t out0, out1, out2, out3;
v16i8 src0, src1, src2, src3, src4, mask; v16i8 src0, src1, src2, src3, src4, mask;
v16u8 res0, res1, horiz_vec; v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
v16u8 filt_vert, filt_horiz, vec0, vec1; v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
v8u16 filt, tmp0, tmp1;
v8u16 horiz_out0, horiz_out1, horiz_out2, horiz_out3, horiz_out4;
mask = LOAD_SB(&mc_filt_mask_arr[16]); mask = LD_SB(&mc_filt_mask_arr[16]);
/* rearranging filter */ /* rearranging filter */
filt = LOAD_UH(filter_horiz); filt = LD_UH(filter_horiz);
filt_horiz = (v16u8)__msa_splati_h((v8i16)filt, 0); filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
filt = LOAD_UH(filter_vert); filt = LD_UH(filter_vert);
filt_vert = (v16u8)__msa_splati_h((v8i16)filt, 0); filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
LOAD_5VECS_SB(src, src_stride, src0, src1, src2, src3, src4); LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src0); ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz); DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
SAT_UH2_UH(tmp0, tmp1, 7);
horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src2); PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
horiz_out2 = __msa_dotp_u_h(horiz_vec, filt_horiz); ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_out2, FILTER_BITS, 7);
horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4);
horiz_out4 = __msa_dotp_u_h(horiz_vec, filt_horiz);
horiz_out4 = SRARI_SATURATE_UNSIGNED_H(horiz_out4, FILTER_BITS, 7);
horiz_out1 = (v8u16)__msa_sldi_b((v16i8)horiz_out2, (v16i8)horiz_out0, 8);
horiz_out3 = (v8u16)__msa_pckod_d((v2i64)horiz_out4, (v2i64)horiz_out2);
vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
vec1 = (v16u8)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
tmp0 = __msa_dotp_u_h(vec0, filt_vert);
tmp1 = __msa_dotp_u_h(vec1, filt_vert);
tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
res0 = (v16u8)__msa_pckev_b((v16i8)tmp0, (v16i8)tmp0);
res1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp1);
out0 = __msa_copy_u_w((v4i32)res0, 0);
out1 = __msa_copy_u_w((v4i32)res0, 1);
out2 = __msa_copy_u_w((v4i32)res1, 0);
out3 = __msa_copy_u_w((v4i32)res1, 1);
STORE_WORD(dst, out0);
dst += dst_stride;
STORE_WORD(dst, out1);
dst += dst_stride;
STORE_WORD(dst, out2);
dst += dst_stride;
STORE_WORD(dst, out3);
} }
static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride, static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride, uint8_t *dst, int32_t dst_stride,
int8_t *filter_horiz, int8_t *filter_horiz,
int8_t *filter_vert) { int8_t *filter_vert) {
uint32_t out0, out1, out2, out3;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
v16u8 filt_horiz, filt_vert, horiz_vec;
v16u8 vec0, vec1, vec2, vec3;
v8u16 horiz_out0, horiz_out1, horiz_out2, horiz_out3;
v8u16 vec4, vec5, vec6, vec7, filt;
v8u16 horiz_out4, horiz_out5, horiz_out6, horiz_out7, horiz_out8;
v16i8 res0, res1, res2, res3; v16i8 res0, res1, res2, res3;
v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
mask = LOAD_SB(&mc_filt_mask_arr[16]); mask = LD_SB(&mc_filt_mask_arr[16]);
/* rearranging filter */ /* rearranging filter */
filt = LOAD_UH(filter_horiz); filt = LD_UH(filter_horiz);
filt_horiz = (v16u8)__msa_splati_h((v8i16)filt, 0); filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
filt = LOAD_UH(filter_vert); filt = LD_UH(filter_vert);
filt_vert = (v16u8)__msa_splati_h((v8i16)filt, 0); filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
LOAD_8VECS_SB(src, src_stride, LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
src0, src1, src2, src3, src4, src5, src6, src7);
src += (8 * src_stride); src += (8 * src_stride);
src8 = LOAD_SB(src); src8 = LD_SB(src);
horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src0); hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz); hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7); hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
hz_out3, hz_out5, 8);
hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src2); ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
horiz_out2 = __msa_dotp_u_h(horiz_vec, filt_horiz); ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_out2, FILTER_BITS, 7); DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
vec4, vec5, vec6, vec7);
horiz_vec = (v16u8)__msa_vshf_b(mask, src5, src4); SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
horiz_out4 = __msa_dotp_u_h(horiz_vec, filt_horiz); SAT_UH4_UH(vec4, vec5, vec6, vec7, 7);
horiz_out4 = SRARI_SATURATE_UNSIGNED_H(horiz_out4, FILTER_BITS, 7); PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
res2, res3);
horiz_vec = (v16u8)__msa_vshf_b(mask, src7, src6); ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
horiz_out6 = __msa_dotp_u_h(horiz_vec, filt_horiz); dst += (4 * dst_stride);
horiz_out6 = SRARI_SATURATE_UNSIGNED_H(horiz_out6, FILTER_BITS, 7); ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
horiz_vec = (v16u8)__msa_vshf_b(mask, src8, src8);
horiz_out8 = __msa_dotp_u_h(horiz_vec, filt_horiz);
horiz_out8 = SRARI_SATURATE_UNSIGNED_H(horiz_out8, FILTER_BITS, 7);
horiz_out1 = (v8u16)__msa_sldi_b((v16i8)horiz_out2, (v16i8)horiz_out0, 8);
horiz_out3 = (v8u16)__msa_sldi_b((v16i8)horiz_out4, (v16i8)horiz_out2, 8);
horiz_out5 = (v8u16)__msa_sldi_b((v16i8)horiz_out6, (v16i8)horiz_out4, 8);
horiz_out7 = (v8u16)__msa_pckod_d((v2i64)horiz_out8, (v2i64)horiz_out6);
vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
vec1 = (v16u8)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
vec2 = (v16u8)__msa_ilvev_b((v16i8)horiz_out5, (v16i8)horiz_out4);
vec3 = (v16u8)__msa_ilvev_b((v16i8)horiz_out7, (v16i8)horiz_out6);
vec4 = __msa_dotp_u_h(vec0, filt_vert);
vec5 = __msa_dotp_u_h(vec1, filt_vert);
vec6 = __msa_dotp_u_h(vec2, filt_vert);
vec7 = __msa_dotp_u_h(vec3, filt_vert);
vec4 = SRARI_SATURATE_UNSIGNED_H(vec4, FILTER_BITS, 7);
vec5 = SRARI_SATURATE_UNSIGNED_H(vec5, FILTER_BITS, 7);
vec6 = SRARI_SATURATE_UNSIGNED_H(vec6, FILTER_BITS, 7);
vec7 = SRARI_SATURATE_UNSIGNED_H(vec7, FILTER_BITS, 7);
res0 = __msa_pckev_b((v16i8)vec4, (v16i8)vec4);
res1 = __msa_pckev_b((v16i8)vec5, (v16i8)vec5);
res2 = __msa_pckev_b((v16i8)vec6, (v16i8)vec6);
res3 = __msa_pckev_b((v16i8)vec7, (v16i8)vec7);
out0 = __msa_copy_u_w((v4i32)res0, 0);
out1 = __msa_copy_u_w((v4i32)res0, 1);
out2 = __msa_copy_u_w((v4i32)res1, 0);
out3 = __msa_copy_u_w((v4i32)res1, 1);
STORE_WORD(dst, out0);
dst += dst_stride;
STORE_WORD(dst, out1);
dst += dst_stride;
STORE_WORD(dst, out2);
dst += dst_stride;
STORE_WORD(dst, out3);
dst += dst_stride;
out0 = __msa_copy_u_w((v4i32)res2, 0);
out1 = __msa_copy_u_w((v4i32)res2, 1);
out2 = __msa_copy_u_w((v4i32)res3, 0);
out3 = __msa_copy_u_w((v4i32)res3, 1);
STORE_WORD(dst, out0);
dst += dst_stride;
STORE_WORD(dst, out1);
dst += dst_stride;
STORE_WORD(dst, out2);
dst += dst_stride;
STORE_WORD(dst, out3);
} }
static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride, static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride, uint8_t *dst, int32_t dst_stride,
int8_t *filter_horiz, int8_t *filter_horiz, int8_t *filter_vert,
int8_t *filter_vert,
int32_t height) { int32_t height) {
if (4 == height) { if (4 == height) {
common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
filter_horiz, filter_vert); filter_vert);
} else if (8 == height) { } else if (8 == height) {
common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, filter_horiz,
filter_horiz, filter_vert); filter_vert);
} }
} }
@ -455,63 +323,43 @@ static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride, uint8_t *dst, int32_t dst_stride,
int8_t *filter_horiz, int8_t *filter_horiz,
int8_t *filter_vert) { int8_t *filter_vert) {
v16i8 src0, src1, src2, src3, src4, mask; v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
v16u8 filt_horiz, filt_vert, horiz_vec; v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
v16u8 vec0, vec1, vec2, vec3; v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
v8u16 horiz_out0, horiz_out1;
v8u16 tmp0, tmp1, tmp2, tmp3;
v8i16 filt; v8i16 filt;
mask = LOAD_SB(&mc_filt_mask_arr[0]); mask = LD_SB(&mc_filt_mask_arr[0]);
/* rearranging filter */ /* rearranging filter */
filt = LOAD_SH(filter_horiz); filt = LD_SH(filter_horiz);
filt_horiz = (v16u8)__msa_splati_h(filt, 0); filt_hz = (v16u8)__msa_splati_h(filt, 0);
filt = LOAD_SH(filter_vert); filt = LD_SH(filter_vert);
filt_vert = (v16u8)__msa_splati_h(filt, 0); filt_vt = (v16u8)__msa_splati_h(filt, 0);
LOAD_5VECS_SB(src, src_stride, src0, src1, src2, src3, src4); LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
src += (5 * src_stride);
horiz_vec = (v16u8)__msa_vshf_b(mask, src0, src0); hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz); hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7); vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
tmp0 = __msa_dotp_u_h(vec0, filt_vt);
horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1); hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz); vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7); tmp1 = __msa_dotp_u_h(vec1, filt_vt);
vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0); hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
tmp0 = __msa_dotp_u_h(vec0, filt_vert); vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
tmp2 = __msa_dotp_u_h(vec2, filt_vt);
horiz_vec = (v16u8)__msa_vshf_b(mask, src2, src2); hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz); vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7); tmp3 = __msa_dotp_u_h(vec3, filt_vt);
vec1 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
tmp1 = __msa_dotp_u_h(vec1, filt_vert); SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src3); ST8x4_UB(out0, out1, dst, dst_stride);
horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7);
vec2 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
tmp2 = __msa_dotp_u_h(vec2, filt_vert);
horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4);
horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
vec3 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
tmp3 = __msa_dotp_u_h(vec3, filt_vert);
tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
PCKEV_B_STORE_8_BYTES_4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
} }
static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src,
@ -522,106 +370,76 @@ static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src,
int8_t *filter_vert, int8_t *filter_vert,
int32_t height) { int32_t height) {
uint32_t loop_cnt; uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, mask; v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
v16u8 filt_horiz, filt_vert, vec0, horiz_vec; v16u8 filt_hz, filt_vt, vec0;
v8u16 horiz_out0, horiz_out1; v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
v8u16 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
v8i16 filt; v8i16 filt;
mask = LOAD_SB(&mc_filt_mask_arr[0]); mask = LD_SB(&mc_filt_mask_arr[0]);
/* rearranging filter */ /* rearranging filter */
filt = LOAD_SH(filter_horiz); filt = LD_SH(filter_horiz);
filt_horiz = (v16u8)__msa_splati_h(filt, 0); filt_hz = (v16u8)__msa_splati_h(filt, 0);
filt = LOAD_SH(filter_vert); filt = LD_SH(filter_vert);
filt_vert = (v16u8)__msa_splati_h(filt, 0); filt_vt = (v16u8)__msa_splati_h(filt, 0);
src0 = LOAD_SB(src); src0 = LD_SB(src);
src += src_stride; src += src_stride;
horiz_vec = (v16u8)__msa_vshf_b(mask, src0, src0); hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
for (loop_cnt = (height >> 3); loop_cnt--;) { for (loop_cnt = (height >> 3); loop_cnt--;) {
LOAD_4VECS_SB(src, src_stride, src1, src2, src3, src4); LD_SB4(src, src_stride, src1, src2, src3, src4);
src += (4 * src_stride); src += (4 * src_stride);
horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1); hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz); vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7); tmp1 = __msa_dotp_u_h(vec0, filt_vt);
vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0); hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
tmp1 = __msa_dotp_u_h(vec0, filt_vert); vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
tmp2 = __msa_dotp_u_h(vec0, filt_vt);
horiz_vec = (v16u8)__msa_vshf_b(mask, src2, src2); SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz); SAT_UH2_UH(tmp1, tmp2, 7);
horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1); hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
tmp2 = (v8u16)__msa_dotp_u_h(vec0, filt_vert); vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
tmp3 = __msa_dotp_u_h(vec0, filt_vt);
tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7); hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7); LD_SB4(src, src_stride, src1, src2, src3, src4);
horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src3);
horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7);
vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
tmp3 = __msa_dotp_u_h(vec0, filt_vert);
horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4);
horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
LOAD_4VECS_SB(src, src_stride, src1, src2, src3, src4);
src += (4 * src_stride); src += (4 * src_stride);
vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
tmp4 = __msa_dotp_u_h(vec0, filt_vt);
vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1); SRARI_H2_UH(tmp3, tmp4, FILTER_BITS);
tmp4 = __msa_dotp_u_h(vec0, filt_vert); SAT_UH2_UH(tmp3, tmp4, 7);
PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7); ST8x4_UB(out0, out1, dst, dst_stride);
tmp4 = SRARI_SATURATE_UNSIGNED_H(tmp4, FILTER_BITS, 7);
PCKEV_B_STORE_8_BYTES_4(tmp1, tmp2, tmp3, tmp4, dst, dst_stride);
dst += (4 * dst_stride); dst += (4 * dst_stride);
horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1); hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz); vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7); tmp5 = __msa_dotp_u_h(vec0, filt_vt);
vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0); hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
tmp5 = __msa_dotp_u_h(vec0, filt_vert); vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
tmp6 = __msa_dotp_u_h(vec0, filt_vt);
horiz_vec = (v16u8)__msa_vshf_b(mask, src2, src2); hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz); vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7); tmp7 = __msa_dotp_u_h(vec0, filt_vt);
vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1); hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
tmp6 = __msa_dotp_u_h(vec0, filt_vert); vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
tmp8 = __msa_dotp_u_h(vec0, filt_vt);
horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src3); SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS);
horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz); SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7);
horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7); PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
tmp7 = __msa_dotp_u_h(vec0, filt_vert);
horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4);
horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
tmp8 = __msa_dotp_u_h(vec0, filt_vert);
tmp5 = SRARI_SATURATE_UNSIGNED_H(tmp5, FILTER_BITS, 7);
tmp6 = SRARI_SATURATE_UNSIGNED_H(tmp6, FILTER_BITS, 7);
tmp7 = SRARI_SATURATE_UNSIGNED_H(tmp7, FILTER_BITS, 7);
tmp8 = SRARI_SATURATE_UNSIGNED_H(tmp8, FILTER_BITS, 7);
PCKEV_B_STORE_8_BYTES_4(tmp5, tmp6, tmp7, tmp8, dst, dst_stride);
dst += (4 * dst_stride); dst += (4 * dst_stride);
} }
} }
@ -645,108 +463,64 @@ static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride,
int32_t height) { int32_t height) {
uint32_t loop_cnt; uint32_t loop_cnt;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
v16u8 filt_horiz, filt_vert, vec0, horiz_vec; v16u8 filt_hz, filt_vt, vec0, vec1;
v8u16 horiz_vec0, horiz_vec1, tmp1, tmp2; v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
v8u16 horiz_out0, horiz_out1, horiz_out2, horiz_out3;
v8i16 filt; v8i16 filt;
mask = LOAD_SB(&mc_filt_mask_arr[0]); mask = LD_SB(&mc_filt_mask_arr[0]);
/* rearranging filter */ /* rearranging filter */
filt = LOAD_SH(filter_horiz); filt = LD_SH(filter_horiz);
filt_horiz = (v16u8)__msa_splati_h(filt, 0); filt_hz = (v16u8)__msa_splati_h(filt, 0);
filt = LOAD_SH(filter_vert); filt = LD_SH(filter_vert);
filt_vert = (v16u8)__msa_splati_h(filt, 0); filt_vt = (v16u8)__msa_splati_h(filt, 0);
src0 = LOAD_SB(src);
src1 = LOAD_SB(src + 8);
horiz_vec = (v16u8)__msa_vshf_b(mask, src0, src0);
horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7);
horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1);
horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7);
LD_SB2(src, 8, src0, src1);
src += src_stride; src += src_stride;
hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
for (loop_cnt = (height >> 2); loop_cnt--;) { for (loop_cnt = (height >> 2); loop_cnt--;) {
LOAD_4VECS_SB(src, src_stride, src0, src2, src4, src6); LD_SB4(src, src_stride, src0, src2, src4, src6);
LOAD_4VECS_SB(src + 8, src_stride, src1, src3, src5, src7); LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
src += (4 * src_stride); src += (4 * src_stride);
horiz_vec = (v16u8)__msa_vshf_b(mask, src0, src0); hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz); hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7); ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1); SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz); SAT_UH2_UH(tmp1, tmp2, 7);
horiz_out3 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7); PCKEV_ST_SB(tmp1, tmp2, dst);
vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
tmp1 = __msa_dotp_u_h(vec0, filt_vert);
vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
tmp2 = __msa_dotp_u_h(vec0, filt_vert);
tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
PCKEV_B_STORE_VEC(tmp2, tmp1, dst);
dst += dst_stride; dst += dst_stride;
horiz_vec = (v16u8)__msa_vshf_b(mask, src2, src2); hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz); hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7); ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src3); SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz); SAT_UH2_UH(tmp1, tmp2, 7);
horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7); PCKEV_ST_SB(tmp1, tmp2, dst);
vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
tmp1 = __msa_dotp_u_h(vec0, filt_vert);
vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out2, (v16i8)horiz_out3);
tmp2 = __msa_dotp_u_h(vec0, filt_vert);
tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
PCKEV_B_STORE_VEC(tmp2, tmp1, dst);
dst += dst_stride; dst += dst_stride;
horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4); hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz); hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7); ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
horiz_vec = (v16u8)__msa_vshf_b(mask, src5, src5); SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz); SAT_UH2_UH(tmp1, tmp2, 7);
horiz_out3 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7); PCKEV_ST_SB(tmp1, tmp2, dst);
vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
tmp1 = __msa_dotp_u_h(vec0, filt_vert);
vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
tmp2 = __msa_dotp_u_h(vec0, filt_vert);
tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
PCKEV_B_STORE_VEC(tmp2, tmp1, dst);
dst += dst_stride; dst += dst_stride;
horiz_vec = (v16u8)__msa_vshf_b(mask, src6, src6); hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz); hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7); ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
horiz_vec = (v16u8)__msa_vshf_b(mask, src7, src7); SRARI_H2_UH(tmp1, tmp2, FILTER_BITS);
horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz); SAT_UH2_UH(tmp1, tmp2, 7);
horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7); PCKEV_ST_SB(tmp1, tmp2, dst);
vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
tmp1 = __msa_dotp_u_h(vec0, filt_vert);
vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out2, (v16i8)horiz_out3);
tmp2 = __msa_dotp_u_h(vec0, filt_vert);
tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
PCKEV_B_STORE_VEC(tmp2, tmp1, dst);
dst += dst_stride; dst += dst_stride;
} }
} }

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -19,46 +19,35 @@ static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
if (0 == (height % 4)) { if (0 == (height % 4)) {
for (cnt = (height / 4); cnt--;) { for (cnt = (height / 4); cnt--;) {
LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3); LD_UB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride); src += (4 * src_stride);
LOAD_4VECS_UB(dst, dst_stride, dst0, dst1, dst2, dst3); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
dst0 = __msa_aver_u_b(src0, dst0); AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
dst1 = __msa_aver_u_b(src1, dst1); dst0, dst1, dst2, dst3);
dst2 = __msa_aver_u_b(src2, dst2);
dst3 = __msa_aver_u_b(src3, dst3);
out0 = __msa_copy_u_w((v4i32)dst0, 0); out0 = __msa_copy_u_w((v4i32)dst0, 0);
out1 = __msa_copy_u_w((v4i32)dst1, 0); out1 = __msa_copy_u_w((v4i32)dst1, 0);
out2 = __msa_copy_u_w((v4i32)dst2, 0); out2 = __msa_copy_u_w((v4i32)dst2, 0);
out3 = __msa_copy_u_w((v4i32)dst3, 0); out3 = __msa_copy_u_w((v4i32)dst3, 0);
SW4(out0, out1, out2, out3, dst, dst_stride);
STORE_WORD(dst, out0); dst += (4 * dst_stride);
dst += dst_stride;
STORE_WORD(dst, out1);
dst += dst_stride;
STORE_WORD(dst, out2);
dst += dst_stride;
STORE_WORD(dst, out3);
dst += dst_stride;
} }
} else if (0 == (height % 2)) { } else if (0 == (height % 2)) {
for (cnt = (height / 2); cnt--;) { for (cnt = (height / 2); cnt--;) {
LOAD_2VECS_UB(src, src_stride, src0, src1); LD_UB2(src, src_stride, src0, src1);
src += (2 * src_stride); src += (2 * src_stride);
LOAD_2VECS_UB(dst, dst_stride, dst0, dst1); LD_UB2(dst, dst_stride, dst0, dst1);
dst0 = __msa_aver_u_b(src0, dst0); AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
dst1 = __msa_aver_u_b(src1, dst1);
out0 = __msa_copy_u_w((v4i32)dst0, 0); out0 = __msa_copy_u_w((v4i32)dst0, 0);
out1 = __msa_copy_u_w((v4i32)dst1, 0); out1 = __msa_copy_u_w((v4i32)dst1, 0);
SW(out0, dst);
STORE_WORD(dst, out0);
dst += dst_stride; dst += dst_stride;
STORE_WORD(dst, out1); SW(out1, dst);
dst += dst_stride; dst += dst_stride;
} }
} }
@ -72,29 +61,19 @@ static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
v16u8 dst0, dst1, dst2, dst3; v16u8 dst0, dst1, dst2, dst3;
for (cnt = (height / 4); cnt--;) { for (cnt = (height / 4); cnt--;) {
LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3); LD_UB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride); src += (4 * src_stride);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
LOAD_4VECS_UB(dst, dst_stride, dst0, dst1, dst2, dst3); AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
dst0, dst1, dst2, dst3);
dst0 = __msa_aver_u_b(src0, dst0);
dst1 = __msa_aver_u_b(src1, dst1);
dst2 = __msa_aver_u_b(src2, dst2);
dst3 = __msa_aver_u_b(src3, dst3);
out0 = __msa_copy_u_d((v2i64)dst0, 0); out0 = __msa_copy_u_d((v2i64)dst0, 0);
out1 = __msa_copy_u_d((v2i64)dst1, 0); out1 = __msa_copy_u_d((v2i64)dst1, 0);
out2 = __msa_copy_u_d((v2i64)dst2, 0); out2 = __msa_copy_u_d((v2i64)dst2, 0);
out3 = __msa_copy_u_d((v2i64)dst3, 0); out3 = __msa_copy_u_d((v2i64)dst3, 0);
SD4(out0, out1, out2, out3, dst, dst_stride);
STORE_DWORD(dst, out0); dst += (4 * dst_stride);
dst += dst_stride;
STORE_DWORD(dst, out1);
dst += dst_stride;
STORE_DWORD(dst, out2);
dst += dst_stride;
STORE_DWORD(dst, out3);
dst += dst_stride;
} }
} }
@ -105,24 +84,15 @@ static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
for (cnt = (height / 8); cnt--;) { for (cnt = (height / 8); cnt--;) {
LOAD_8VECS_UB(src, src_stride, LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
src0, src1, src2, src3, src4, src5, src6, src7);
src += (8 * src_stride); src += (8 * src_stride);
LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
LOAD_8VECS_UB(dst, dst_stride, AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); dst0, dst1, dst2, dst3);
AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
dst0 = __msa_aver_u_b(src0, dst0); dst4, dst5, dst6, dst7);
dst1 = __msa_aver_u_b(src1, dst1); ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
dst2 = __msa_aver_u_b(src2, dst2);
dst3 = __msa_aver_u_b(src3, dst3);
dst4 = __msa_aver_u_b(src4, dst4);
dst5 = __msa_aver_u_b(src5, dst5);
dst6 = __msa_aver_u_b(src6, dst6);
dst7 = __msa_aver_u_b(src7, dst7);
STORE_8VECS_UB(dst, dst_stride,
dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
dst += (8 * dst_stride); dst += (8 * dst_stride);
} }
} }
@ -137,99 +107,34 @@ static void avg_width32_msa(const uint8_t *src, int32_t src_stride,
v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
for (cnt = (height / 8); cnt--;) { for (cnt = (height / 8); cnt--;) {
src0 = LOAD_UB(src); LD_UB4(src, src_stride, src0, src2, src4, src6);
src1 = LOAD_UB(src + 16); LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
src += src_stride; src += (4 * src_stride);
src2 = LOAD_UB(src); LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6);
src3 = LOAD_UB(src + 16); LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7);
src += src_stride; dst_dup += (4 * dst_stride);
src4 = LOAD_UB(src); LD_UB4(src, src_stride, src8, src10, src12, src14);
src5 = LOAD_UB(src + 16); LD_UB4(src + 16, src_stride, src9, src11, src13, src15);
src += src_stride; src += (4 * src_stride);
src6 = LOAD_UB(src); LD_UB4(dst_dup, dst_stride, dst8, dst10, dst12, dst14);
src7 = LOAD_UB(src + 16); LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15);
src += src_stride; dst_dup += (4 * dst_stride);
dst0 = LOAD_UB(dst_dup); AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
dst1 = LOAD_UB(dst_dup + 16); dst0, dst1, dst2, dst3);
dst_dup += dst_stride; AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
dst2 = LOAD_UB(dst_dup); dst4, dst5, dst6, dst7);
dst3 = LOAD_UB(dst_dup + 16); AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
dst_dup += dst_stride; dst8, dst9, dst10, dst11);
dst4 = LOAD_UB(dst_dup); AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
dst5 = LOAD_UB(dst_dup + 16); dst12, dst13, dst14, dst15);
dst_dup += dst_stride;
dst6 = LOAD_UB(dst_dup);
dst7 = LOAD_UB(dst_dup + 16);
dst_dup += dst_stride;
src8 = LOAD_UB(src); ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride);
src9 = LOAD_UB(src + 16); ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride);
src += src_stride; dst += (4 * dst_stride);
src10 = LOAD_UB(src); ST_UB4(dst8, dst10, dst12, dst14, dst, dst_stride);
src11 = LOAD_UB(src + 16); ST_UB4(dst9, dst11, dst13, dst15, dst + 16, dst_stride);
src += src_stride; dst += (4 * dst_stride);
src12 = LOAD_UB(src);
src13 = LOAD_UB(src + 16);
src += src_stride;
src14 = LOAD_UB(src);
src15 = LOAD_UB(src + 16);
src += src_stride;
dst8 = LOAD_UB(dst_dup);
dst9 = LOAD_UB(dst_dup + 16);
dst_dup += dst_stride;
dst10 = LOAD_UB(dst_dup);
dst11 = LOAD_UB(dst_dup + 16);
dst_dup += dst_stride;
dst12 = LOAD_UB(dst_dup);
dst13 = LOAD_UB(dst_dup + 16);
dst_dup += dst_stride;
dst14 = LOAD_UB(dst_dup);
dst15 = LOAD_UB(dst_dup + 16);
dst_dup += dst_stride;
dst0 = __msa_aver_u_b(src0, dst0);
dst1 = __msa_aver_u_b(src1, dst1);
dst2 = __msa_aver_u_b(src2, dst2);
dst3 = __msa_aver_u_b(src3, dst3);
dst4 = __msa_aver_u_b(src4, dst4);
dst5 = __msa_aver_u_b(src5, dst5);
dst6 = __msa_aver_u_b(src6, dst6);
dst7 = __msa_aver_u_b(src7, dst7);
dst8 = __msa_aver_u_b(src8, dst8);
dst9 = __msa_aver_u_b(src9, dst9);
dst10 = __msa_aver_u_b(src10, dst10);
dst11 = __msa_aver_u_b(src11, dst11);
dst12 = __msa_aver_u_b(src12, dst12);
dst13 = __msa_aver_u_b(src13, dst13);
dst14 = __msa_aver_u_b(src14, dst14);
dst15 = __msa_aver_u_b(src15, dst15);
STORE_UB(dst0, dst);
STORE_UB(dst1, dst + 16);
dst += dst_stride;
STORE_UB(dst2, dst);
STORE_UB(dst3, dst + 16);
dst += dst_stride;
STORE_UB(dst4, dst);
STORE_UB(dst5, dst + 16);
dst += dst_stride;
STORE_UB(dst6, dst);
STORE_UB(dst7, dst + 16);
dst += dst_stride;
STORE_UB(dst8, dst);
STORE_UB(dst9, dst + 16);
dst += dst_stride;
STORE_UB(dst10, dst);
STORE_UB(dst11, dst + 16);
dst += dst_stride;
STORE_UB(dst12, dst);
STORE_UB(dst13, dst + 16);
dst += dst_stride;
STORE_UB(dst14, dst);
STORE_UB(dst15, dst + 16);
dst += dst_stride;
} }
} }
@ -243,48 +148,40 @@ static void avg_width64_msa(const uint8_t *src, int32_t src_stride,
v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
for (cnt = (height / 4); cnt--;) { for (cnt = (height / 4); cnt--;) {
LOAD_4VECS_UB(src, 16, src0, src1, src2, src3); LD_UB4(src, 16, src0, src1, src2, src3);
src += src_stride; src += src_stride;
LOAD_4VECS_UB(src, 16, src4, src5, src6, src7); LD_UB4(src, 16, src4, src5, src6, src7);
src += src_stride; src += src_stride;
LOAD_4VECS_UB(src, 16, src8, src9, src10, src11); LD_UB4(src, 16, src8, src9, src10, src11);
src += src_stride; src += src_stride;
LOAD_4VECS_UB(src, 16, src12, src13, src14, src15); LD_UB4(src, 16, src12, src13, src14, src15);
src += src_stride; src += src_stride;
LOAD_4VECS_UB(dst_dup, 16, dst0, dst1, dst2, dst3); LD_UB4(dst_dup, 16, dst0, dst1, dst2, dst3);
dst_dup += dst_stride; dst_dup += dst_stride;
LOAD_4VECS_UB(dst_dup, 16, dst4, dst5, dst6, dst7); LD_UB4(dst_dup, 16, dst4, dst5, dst6, dst7);
dst_dup += dst_stride; dst_dup += dst_stride;
LOAD_4VECS_UB(dst_dup, 16, dst8, dst9, dst10, dst11); LD_UB4(dst_dup, 16, dst8, dst9, dst10, dst11);
dst_dup += dst_stride; dst_dup += dst_stride;
LOAD_4VECS_UB(dst_dup, 16, dst12, dst13, dst14, dst15); LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15);
dst_dup += dst_stride; dst_dup += dst_stride;
dst0 = __msa_aver_u_b(src0, dst0); AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
dst1 = __msa_aver_u_b(src1, dst1); dst0, dst1, dst2, dst3);
dst2 = __msa_aver_u_b(src2, dst2); AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
dst3 = __msa_aver_u_b(src3, dst3); dst4, dst5, dst6, dst7);
dst4 = __msa_aver_u_b(src4, dst4); AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
dst5 = __msa_aver_u_b(src5, dst5); dst8, dst9, dst10, dst11);
dst6 = __msa_aver_u_b(src6, dst6); AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
dst7 = __msa_aver_u_b(src7, dst7); dst12, dst13, dst14, dst15);
dst8 = __msa_aver_u_b(src8, dst8);
dst9 = __msa_aver_u_b(src9, dst9);
dst10 = __msa_aver_u_b(src10, dst10);
dst11 = __msa_aver_u_b(src11, dst11);
dst12 = __msa_aver_u_b(src12, dst12);
dst13 = __msa_aver_u_b(src13, dst13);
dst14 = __msa_aver_u_b(src14, dst14);
dst15 = __msa_aver_u_b(src15, dst15);
STORE_4VECS_UB(dst, 16, dst0, dst1, dst2, dst3); ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
dst += dst_stride; dst += dst_stride;
STORE_4VECS_UB(dst, 16, dst4, dst5, dst6, dst7); ST_UB4(dst4, dst5, dst6, dst7, dst, 16);
dst += dst_stride; dst += dst_stride;
STORE_4VECS_UB(dst, 16, dst8, dst9, dst10, dst11); ST_UB4(dst8, dst9, dst10, dst11, dst, 16);
dst += dst_stride; dst += dst_stride;
STORE_4VECS_UB(dst, 16, dst12, dst13, dst14, dst15); ST_UB4(dst12, dst13, dst14, dst15, dst, 16);
dst += dst_stride; dst += dst_stride;
} }
} }

Просмотреть файл

@ -12,16 +12,14 @@
#include "vp9/common/mips/msa/vp9_macros_msa.h" #include "vp9/common/mips/msa/vp9_macros_msa.h"
static void copy_width8_msa(const uint8_t *src, int32_t src_stride, static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride, uint8_t *dst, int32_t dst_stride, int32_t height) {
int32_t height) {
int32_t cnt; int32_t cnt;
uint64_t out0, out1, out2, out3, out4, out5, out6, out7; uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
if (0 == height % 12) { if (0 == height % 12) {
for (cnt = (height / 12); cnt--;) { for (cnt = (height / 12); cnt--;) {
LOAD_8VECS_UB(src, src_stride, LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
src0, src1, src2, src3, src4, src5, src6, src7);
src += (8 * src_stride); src += (8 * src_stride);
out0 = __msa_copy_u_d((v2i64)src0, 0); out0 = __msa_copy_u_d((v2i64)src0, 0);
@ -33,44 +31,24 @@ static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
out6 = __msa_copy_u_d((v2i64)src6, 0); out6 = __msa_copy_u_d((v2i64)src6, 0);
out7 = __msa_copy_u_d((v2i64)src7, 0); out7 = __msa_copy_u_d((v2i64)src7, 0);
STORE_DWORD(dst, out0); SD4(out0, out1, out2, out3, dst, dst_stride);
dst += dst_stride; dst += (4 * dst_stride);
STORE_DWORD(dst, out1); SD4(out4, out5, out6, out7, dst, dst_stride);
dst += dst_stride; dst += (4 * dst_stride);
STORE_DWORD(dst, out2);
dst += dst_stride;
STORE_DWORD(dst, out3);
dst += dst_stride;
STORE_DWORD(dst, out4);
dst += dst_stride;
STORE_DWORD(dst, out5);
dst += dst_stride;
STORE_DWORD(dst, out6);
dst += dst_stride;
STORE_DWORD(dst, out7);
dst += dst_stride;
LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3); LD_UB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride); src += (4 * src_stride);
out0 = __msa_copy_u_d((v2i64)src0, 0); out0 = __msa_copy_u_d((v2i64)src0, 0);
out1 = __msa_copy_u_d((v2i64)src1, 0); out1 = __msa_copy_u_d((v2i64)src1, 0);
out2 = __msa_copy_u_d((v2i64)src2, 0); out2 = __msa_copy_u_d((v2i64)src2, 0);
out3 = __msa_copy_u_d((v2i64)src3, 0); out3 = __msa_copy_u_d((v2i64)src3, 0);
SD4(out0, out1, out2, out3, dst, dst_stride);
STORE_DWORD(dst, out0); dst += (4 * dst_stride);
dst += dst_stride;
STORE_DWORD(dst, out1);
dst += dst_stride;
STORE_DWORD(dst, out2);
dst += dst_stride;
STORE_DWORD(dst, out3);
dst += dst_stride;
} }
} else if (0 == height % 8) { } else if (0 == height % 8) {
for (cnt = height >> 3; cnt--;) { for (cnt = height >> 3; cnt--;) {
LOAD_8VECS_UB(src, src_stride, LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
src0, src1, src2, src3, src4, src5, src6, src7);
src += (8 * src_stride); src += (8 * src_stride);
out0 = __msa_copy_u_d((v2i64)src0, 0); out0 = __msa_copy_u_d((v2i64)src0, 0);
@ -82,53 +60,33 @@ static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
out6 = __msa_copy_u_d((v2i64)src6, 0); out6 = __msa_copy_u_d((v2i64)src6, 0);
out7 = __msa_copy_u_d((v2i64)src7, 0); out7 = __msa_copy_u_d((v2i64)src7, 0);
STORE_DWORD(dst, out0); SD4(out0, out1, out2, out3, dst, dst_stride);
dst += dst_stride; dst += (4 * dst_stride);
STORE_DWORD(dst, out1); SD4(out4, out5, out6, out7, dst, dst_stride);
dst += dst_stride; dst += (4 * dst_stride);
STORE_DWORD(dst, out2);
dst += dst_stride;
STORE_DWORD(dst, out3);
dst += dst_stride;
STORE_DWORD(dst, out4);
dst += dst_stride;
STORE_DWORD(dst, out5);
dst += dst_stride;
STORE_DWORD(dst, out6);
dst += dst_stride;
STORE_DWORD(dst, out7);
dst += dst_stride;
} }
} else if (0 == height % 4) { } else if (0 == height % 4) {
for (cnt = (height / 4); cnt--;) { for (cnt = (height / 4); cnt--;) {
LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3); LD_UB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride); src += (4 * src_stride);
out0 = __msa_copy_u_d((v2i64)src0, 0); out0 = __msa_copy_u_d((v2i64)src0, 0);
out1 = __msa_copy_u_d((v2i64)src1, 0); out1 = __msa_copy_u_d((v2i64)src1, 0);
out2 = __msa_copy_u_d((v2i64)src2, 0); out2 = __msa_copy_u_d((v2i64)src2, 0);
out3 = __msa_copy_u_d((v2i64)src3, 0); out3 = __msa_copy_u_d((v2i64)src3, 0);
STORE_DWORD(dst, out0); SD4(out0, out1, out2, out3, dst, dst_stride);
dst += dst_stride; dst += (4 * dst_stride);
STORE_DWORD(dst, out1);
dst += dst_stride;
STORE_DWORD(dst, out2);
dst += dst_stride;
STORE_DWORD(dst, out3);
dst += dst_stride;
} }
} else if (0 == height % 2) { } else if (0 == height % 2) {
for (cnt = (height / 2); cnt--;) { for (cnt = (height / 2); cnt--;) {
LOAD_2VECS_UB(src, src_stride, src0, src1); LD_UB2(src, src_stride, src0, src1);
src += (2 * src_stride); src += (2 * src_stride);
out0 = __msa_copy_u_d((v2i64)src0, 0); out0 = __msa_copy_u_d((v2i64)src0, 0);
out1 = __msa_copy_u_d((v2i64)src1, 0); out1 = __msa_copy_u_d((v2i64)src1, 0);
STORE_DWORD(dst, out0); SD(out0, dst);
dst += dst_stride; dst += dst_stride;
STORE_DWORD(dst, out1); SD(out1, dst);
dst += dst_stride; dst += dst_stride;
} }
} }
@ -147,12 +105,12 @@ static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
dst_tmp = dst; dst_tmp = dst;
for (loop_cnt = (height >> 3); loop_cnt--;) { for (loop_cnt = (height >> 3); loop_cnt--;) {
LOAD_8VECS_UB(src_tmp, src_stride, LD_UB8(src_tmp, src_stride,
src0, src1, src2, src3, src4, src5, src6, src7); src0, src1, src2, src3, src4, src5, src6, src7);
src_tmp += (8 * src_stride); src_tmp += (8 * src_stride);
STORE_8VECS_UB(dst_tmp, dst_stride, ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
src0, src1, src2, src3, src4, src5, src6, src7); dst_tmp, dst_stride);
dst_tmp += (8 * dst_stride); dst_tmp += (8 * dst_stride);
} }
@ -162,90 +120,79 @@ static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
} }
static void copy_width16_msa(const uint8_t *src, int32_t src_stride, static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride, uint8_t *dst, int32_t dst_stride, int32_t height) {
int32_t height) {
int32_t cnt; int32_t cnt;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
if (0 == height % 12) { if (0 == height % 12) {
for (cnt = (height / 12); cnt--;) { for (cnt = (height / 12); cnt--;) {
LOAD_8VECS_UB(src, src_stride, LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
src0, src1, src2, src3, src4, src5, src6, src7);
src += (8 * src_stride); src += (8 * src_stride);
ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
STORE_8VECS_UB(dst, dst_stride,
src0, src1, src2, src3, src4, src5, src6, src7);
dst += (8 * dst_stride); dst += (8 * dst_stride);
LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3); LD_UB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride); src += (4 * src_stride);
ST_UB4(src0, src1, src2, src3, dst, dst_stride);
STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3);
dst += (4 * dst_stride); dst += (4 * dst_stride);
} }
} else if (0 == height % 8) { } else if (0 == height % 8) {
copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16); copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
} else if (0 == height % 4) { } else if (0 == height % 4) {
for (cnt = (height >> 2); cnt--;) { for (cnt = (height >> 2); cnt--;) {
LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3); LD_UB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride); src += (4 * src_stride);
STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3); ST_UB4(src0, src1, src2, src3, dst, dst_stride);
dst += (4 * dst_stride); dst += (4 * dst_stride);
} }
} }
} }
static void copy_width32_msa(const uint8_t *src, int32_t src_stride, static void copy_width32_msa(const uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride, uint8_t *dst, int32_t dst_stride, int32_t height) {
int32_t height) {
int32_t cnt; int32_t cnt;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
if (0 == height % 12) { if (0 == height % 12) {
for (cnt = (height / 12); cnt--;) { for (cnt = (height / 12); cnt--;) {
LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3); LD_UB4(src, src_stride, src0, src1, src2, src3);
LOAD_4VECS_UB(src + 16, src_stride, src4, src5, src6, src7); LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
src += (4 * src_stride); src += (4 * src_stride);
ST_UB4(src0, src1, src2, src3, dst, dst_stride);
STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3); ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
STORE_4VECS_UB(dst + 16, dst_stride, src4, src5, src6, src7);
dst += (4 * dst_stride); dst += (4 * dst_stride);
LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3); LD_UB4(src, src_stride, src0, src1, src2, src3);
LOAD_4VECS_UB(src + 16, src_stride, src4, src5, src6, src7); LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
src += (4 * src_stride); src += (4 * src_stride);
ST_UB4(src0, src1, src2, src3, dst, dst_stride);
STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3); ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
STORE_4VECS_UB(dst + 16, dst_stride, src4, src5, src6, src7);
dst += (4 * dst_stride); dst += (4 * dst_stride);
LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3); LD_UB4(src, src_stride, src0, src1, src2, src3);
LOAD_4VECS_UB(src + 16, src_stride, src4, src5, src6, src7); LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
src += (4 * src_stride); src += (4 * src_stride);
ST_UB4(src0, src1, src2, src3, dst, dst_stride);
STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3); ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
STORE_4VECS_UB(dst + 16, dst_stride, src4, src5, src6, src7);
dst += (4 * dst_stride); dst += (4 * dst_stride);
} }
} else if (0 == height % 8) { } else if (0 == height % 8) {
copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32); copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32);
} else if (0 == height % 4) { } else if (0 == height % 4) {
for (cnt = (height >> 2); cnt--;) { for (cnt = (height >> 2); cnt--;) {
LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3); LD_UB4(src, src_stride, src0, src1, src2, src3);
LOAD_4VECS_UB(src + 16, src_stride, src4, src5, src6, src7); LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
src += (4 * src_stride); src += (4 * src_stride);
ST_UB4(src0, src1, src2, src3, dst, dst_stride);
STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3); ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
STORE_4VECS_UB(dst + 16, dst_stride, src4, src5, src6, src7);
dst += (4 * dst_stride); dst += (4 * dst_stride);
} }
} }
} }
static void copy_width64_msa(const uint8_t *src, int32_t src_stride, static void copy_width64_msa(const uint8_t *src, int32_t src_stride,
uint8_t *dst, int32_t dst_stride, uint8_t *dst, int32_t dst_stride, int32_t height) {
int32_t height) {
copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64); copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64);
} }
@ -264,8 +211,8 @@ void vp9_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride,
uint32_t cnt, tmp; uint32_t cnt, tmp;
/* 1 word storage */ /* 1 word storage */
for (cnt = h; cnt--;) { for (cnt = h; cnt--;) {
tmp = LOAD_WORD(src); tmp = LW(src);
STORE_WORD(dst, tmp); SW(tmp, dst);
src += src_stride; src += src_stride;
dst += dst_stride; dst += dst_stride;
} }

Просмотреть файл

@ -16,142 +16,104 @@
extern const uint8_t mc_filt_mask_arr[16 * 3]; extern const uint8_t mc_filt_mask_arr[16 * 3];
#define HORIZ_8TAP_FILT(src, mask0, mask1, mask2, mask3, \
filt_h0, filt_h1, filt_h2, filt_h3) ({ \
v8i16 vec0, vec1, vec2, vec3, horiz_out; \
\
vec0 = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src), (v16i8)(src)); \
vec0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)(filt_h0)); \
vec1 = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src), (v16i8)(src)); \
vec0 = __msa_dpadd_s_h(vec0, (v16i8)(filt_h1), (v16i8)vec1); \
vec2 = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src), (v16i8)(src)); \
vec2 = __msa_dotp_s_h((v16i8)vec2, (v16i8)(filt_h2)); \
vec3 = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src), (v16i8)(src)); \
vec2 = __msa_dpadd_s_h(vec2, (v16i8)(filt_h3), (v16i8)vec3); \
vec0 = __msa_adds_s_h(vec0, vec2); \
horiz_out = SRARI_SATURATE_SIGNED_H(vec0, FILTER_BITS, 7); \
\
horiz_out; \
})
#define HORIZ_8TAP_FILT_2VECS(src0, src1, mask0, mask1, mask2, mask3, \
filt_h0, filt_h1, filt_h2, filt_h3) ({ \
v8i16 vec0, vec1, vec2, vec3, horiz_out; \
\
vec0 = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src1), (v16i8)(src0)); \
vec0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)(filt_h0)); \
vec1 = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src1), (v16i8)(src0)); \
vec0 = __msa_dpadd_s_h(vec0, (v16i8)(filt_h1), (v16i8)vec1); \
vec2 = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src1), (v16i8)(src0)); \
vec2 = __msa_dotp_s_h((v16i8)vec2, (v16i8)(filt_h2)); \
vec3 = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src1), (v16i8)(src0)); \
vec2 = __msa_dpadd_s_h(vec2, ((v16i8)filt_h3), (v16i8)vec3); \
vec0 = __msa_adds_s_h(vec0, vec2); \
horiz_out = (v8i16)SRARI_SATURATE_SIGNED_H(vec0, FILTER_BITS, 7); \
\
horiz_out; \
})
#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, \ #define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, \
filt0, filt1, filt2, filt3) ({ \ filt0, filt1, filt2, filt3) ({ \
v8i16 tmp0, tmp1; \ v8i16 tmp0, tmp1; \
\ \
tmp0 = __msa_dotp_s_h((v16i8)(vec0), (v16i8)(filt0)); \ tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \
tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)(vec1), (v16i8)(filt1)); \ tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)vec1, (v16i8)filt1); \
tmp1 = __msa_dotp_s_h((v16i8)(vec2), (v16i8)(filt2)); \ tmp1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \
tmp1 = __msa_dpadd_s_h(tmp1, (v16i8)(vec3), ((v16i8)filt3)); \ tmp1 = __msa_dpadd_s_h(tmp1, (v16i8)vec3, (v16i8)filt3); \
tmp0 = __msa_adds_s_h(tmp0, tmp1); \ tmp0 = __msa_adds_s_h(tmp0, tmp1); \
\ \
tmp0; \ tmp0; \
}) })
#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, \
filt_h0, filt_h1, filt_h2, filt_h3) ({ \
v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \
v8i16 hz_out_m; \
\
VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, \
vec0_m, vec1_m, vec2_m, vec3_m); \
hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m, \
filt_h0, filt_h1, filt_h2, filt_h3); \
\
hz_out_m = __msa_srari_h(hz_out_m, FILTER_BITS); \
hz_out_m = __msa_sat_s_h(hz_out_m, 7); \
\
hz_out_m; \
})
/*
 * HORIZ_8TAP_4WID_4VECS_FILT(src0..src3, mask0..mask3, filt0..filt3,
 *                            out0, out1)
 *
 * Plain brace-block macro (not a statement expression): results are written
 * to the caller-supplied lvalues out0 and out1.
 *
 * The sources are consumed as two pairs, (src0, src1) and (src2, src3).
 * For each pair and each mask a shuffled vector is built, then:
 *   res0_m/res1_m = dotp with filt0, accumulated (dpadd) with filt1
 *   res2_m/res3_m = dotp with filt2, accumulated (dpadd) with filt3
 *   out0 = saturating add of res0_m and res2_m
 *   out1 = saturating add of res1_m and res3_m
 * No rounding shift or saturation to 8 bits is applied inside this macro.
 *
 * The lines below interleave two renderings of the body: one written with
 * raw __msa_* intrinsics and one written with the helper macros VSHF_B2_SB,
 * DOTP_SB2_SH, DPADD_SB2_SH and ADDS_SH2_SH (defined outside this chunk).
 * NOTE(review): in the raw-intrinsic rendering the filt1 operand of the two
 * __msa_dpadd_s_h calls is passed as (filt1) without the (v16i8) cast that
 * the other filter operands receive.
 */
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \ #define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
mask0, mask1, mask2, mask3, \ mask0, mask1, mask2, mask3, \
filt0, filt1, filt2, filt3, \ filt0, filt1, filt2, filt3, \
out0, out1) { \ out0, out1) { \
v8i16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
v8i16 res0_m, res1_m, res2_m, res3_m; \ v8i16 res0_m, res1_m, res2_m, res3_m; \
\ \
vec0_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src1), (v16i8)(src0)); \ VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
vec1_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src3), (v16i8)(src2)); \ DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m); \
\ VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
res0_m = __msa_dotp_s_h((v16i8)vec0_m, (v16i8)(filt0)); \ DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \
res1_m = __msa_dotp_s_h((v16i8)vec1_m, (v16i8)(filt0)); \ VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \
\ DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \
vec2_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src1), (v16i8)(src0)); \ VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \
vec3_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src3), (v16i8)(src2)); \ DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \
\ ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \
res0_m = __msa_dpadd_s_h(res0_m, (filt1), (v16i8)vec2_m); \
res1_m = __msa_dpadd_s_h(res1_m, (filt1), (v16i8)vec3_m); \
\
vec4_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src1), (v16i8)(src0)); \
vec5_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src3), (v16i8)(src2)); \
\
res2_m = __msa_dotp_s_h((v16i8)(filt2), (v16i8)vec4_m); \
res3_m = __msa_dotp_s_h((v16i8)(filt2), (v16i8)vec5_m); \
\
vec6_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src1), (v16i8)(src0)); \
vec7_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src3), (v16i8)(src2)); \
\
res2_m = __msa_dpadd_s_h(res2_m, (v16i8)(filt3), (v16i8)vec6_m); \
res3_m = __msa_dpadd_s_h(res3_m, (v16i8)(filt3), (v16i8)vec7_m); \
\
out0 = __msa_adds_s_h(res0_m, res2_m); \
out1 = __msa_adds_s_h(res1_m, res3_m); \
} }
/*
 * HORIZ_8TAP_8WID_4VECS_FILT(src0..src3, mask0..mask3, filt0..filt3,
 *                            out0, out1, out2, out3)
 *
 * Plain brace-block macro: results are written to the caller-supplied
 * lvalues out0..out3, one per source vector.
 *
 * Unlike the 4-wide variant, each source is shuffled against itself
 * (src0 with src0, src1 with src1, ...). Operation order in the body:
 *   mask0 shuffles -> dotp with filt0            -> res0_m..res3_m
 *   mask2 shuffles -> dotp with filt2            -> res4_m..res7_m
 *   mask1 shuffles -> dpadd with filt1 into         res0_m..res3_m
 *   mask3 shuffles -> dpadd with filt3 into         res4_m..res7_m
 *   outN = saturating add of resN_m and res(N+4)_m
 * No rounding shift or saturation to 8 bits is applied inside this macro.
 *
 * The helper-macro rendering of the body (VSHF_B2_SB, DOTP_SB4_SH,
 * DPADD_SB4_SH, ADDS_SH4_SH - all defined outside this chunk) ends at the
 * lone closing brace below. Lines that also show raw __msa_vshf_b calls are
 * the opening part of a second, intrinsic-based rendering of the same macro.
 */
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \ #define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
mask0, mask1, mask2, mask3, \ mask0, mask1, mask2, mask3, \
filt0, filt1, filt2, filt3, \ filt0, filt1, filt2, filt3, \
out0, out1, out2, out3) { \ out0, out1, out2, out3) { \
v8i16 vec0_m, vec1_m, vec2_m, vec3_m; \ v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
v8i16 vec4_m, vec5_m, vec6_m, vec7_m; \ v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \
v8i16 res0_m, res1_m, res2_m, res3_m; \
v8i16 res4_m, res5_m, res6_m, res7_m; \
\ \
vec0_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src0), (v16i8)(src0)); \ VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
vec1_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src1), (v16i8)(src1)); \ VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
vec2_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src2), (v16i8)(src2)); \ DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
vec3_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src3), (v16i8)(src3)); \ res0_m, res1_m, res2_m, res3_m); \
VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \
VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \
DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \
res4_m, res5_m, res6_m, res7_m); \
VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \
VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \
DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
res0_m, res1_m, res2_m, res3_m); \
VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \
VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \
DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
res4_m, res5_m, res6_m, res7_m); \
ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \
res7_m, out0, out1, out2, out3); \
}
/*
 * PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst)
 *
 * Brace-block macro with no result value; its effect is the store to pdst.
 *   1. tmp_m = PCKEV_XORI128_UB(in1, in0)   (note the swapped argument order)
 *   2. tmp_m = __msa_aver_u_b(tmp_m, dst)   unsigned byte average with dst
 *   3. ST_UB(tmp_m, pdst)                   store the 16-byte vector
 *
 * NOTE(review): PCKEV_XORI128_UB and ST_UB are defined outside this chunk;
 * presumably the former packs the even bytes of the two inputs and flips the
 * sign bit (xor with 128) - confirm against its definition.
 * The __msa_dotp_s_h statements that share lines with this macro body come
 * from a different definition (a raw-intrinsic rendering of
 * HORIZ_8TAP_8WID_4VECS_FILT) and are not part of this macro.
 */
#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) { \
v16u8 tmp_m; \
\ \
res0_m = __msa_dotp_s_h((v16i8)vec0_m, (v16i8)(filt0)); \ tmp_m = PCKEV_XORI128_UB(in1, in0); \
res1_m = __msa_dotp_s_h((v16i8)vec1_m, (v16i8)(filt0)); \ tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \
res2_m = __msa_dotp_s_h((v16i8)vec2_m, (v16i8)(filt0)); \ ST_UB(tmp_m, (pdst)); \
res3_m = __msa_dotp_s_h((v16i8)vec3_m, (v16i8)(filt0)); \ }
/*
 * PCKEV_AVG_ST_UB(in0, in1, dst, pdst)
 *
 * Brace-block macro with no result value; its effect is the store to pdst.
 *   1. tmp_m = __msa_pckev_b(in0, in1)      pack even-indexed bytes of the
 *                                           two inputs into one vector
 *   2. tmp_m = __msa_aver_u_b(tmp_m, dst)   unsigned byte average with dst
 *   3. ST_UB(tmp_m, pdst)                   store the 16-byte vector
 *
 * Same shape as PCKEV_XORI128_AVG_ST_UB but packs directly with the
 * intrinsic, so no xor-with-128 step appears here.
 * in0, in1 and dst are cast without surrounding parentheses, so callers
 * should pass plain identifiers.
 * The __msa_vshf_b statements that share lines with this macro body come
 * from a different definition (a raw-intrinsic rendering of
 * HORIZ_8TAP_8WID_4VECS_FILT) and are not part of this macro.
 */
#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst) { \
v16u8 tmp_m; \
\ \
vec0_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src0), (v16i8)(src0)); \ tmp_m = (v16u8)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
vec1_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src1), (v16i8)(src1)); \ tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \
vec2_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src2), (v16i8)(src2)); \ ST_UB(tmp_m, (pdst)); \
vec3_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src3), (v16i8)(src3)); \ }
/*
 * PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3,
 *                    pdst, stride)
 *
 * Brace-block macro with no result value; its effect is the ST8x4_UB store.
 * pdst is evaluated once and cached in the local pdst_m (uint8_t *).
 *   1. PCKEV_B2_UB(in2, in1, in4, in3, ...)       -> tmp0_m, tmp1_m
 *   2. PCKEV_D2_UB(dst1, dst0, dst3, dst2, ...)   -> tmp2_m, tmp3_m
 *   3. AVER_UB2_UB averages tmp0_m with tmp2_m and tmp1_m with tmp3_m,
 *      writing back into tmp0_m and tmp1_m
 *   4. ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride) stores the result
 *
 * NOTE(review): PCKEV_B2_UB, PCKEV_D2_UB, AVER_UB2_UB and ST8x4_UB are
 * defined outside this chunk; presumably they pack bytes / doublewords,
 * average unsigned bytes, and store four 8-byte rows at the given stride -
 * confirm against their definitions.
 *
 * This macro body is the four statements after the declarations plus the
 * final closing brace. Every statement below that uses raw __msa_* intrinsics
 * (res*_m, vec*_m, out0..out3) belongs to a different definition, a
 * raw-intrinsic rendering of HORIZ_8TAP_8WID_4VECS_FILT, whose text runs
 * alongside and past this one.
 */
#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, \
pdst, stride) { \
v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
uint8_t *pdst_m = (uint8_t *)(pdst); \
\ \
res4_m = __msa_dotp_s_h((v16i8)vec0_m, (v16i8)(filt2)); \ PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \
res5_m = __msa_dotp_s_h((v16i8)vec1_m, (v16i8)(filt2)); \ PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \
res6_m = __msa_dotp_s_h((v16i8)vec2_m, (v16i8)(filt2)); \ AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \
res7_m = __msa_dotp_s_h((v16i8)vec3_m, (v16i8)(filt2)); \ ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
\
vec4_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src0), (v16i8)(src0)); \
vec5_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src1), (v16i8)(src1)); \
vec6_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src2), (v16i8)(src2)); \
vec7_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src3), (v16i8)(src3)); \
\
res0_m = __msa_dpadd_s_h(res0_m, (v16i8)(filt1), (v16i8)vec4_m); \
res1_m = __msa_dpadd_s_h(res1_m, (v16i8)(filt1), (v16i8)vec5_m); \
res2_m = __msa_dpadd_s_h(res2_m, (v16i8)(filt1), (v16i8)vec6_m); \
res3_m = __msa_dpadd_s_h(res3_m, (v16i8)(filt1), (v16i8)vec7_m); \
\
vec4_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src0), (v16i8)(src0)); \
vec5_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src1), (v16i8)(src1)); \
vec6_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src2), (v16i8)(src2)); \
vec7_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src3), (v16i8)(src3)); \
\
res4_m = __msa_dpadd_s_h(res4_m, (v16i8)(filt3), (v16i8)vec4_m); \
res5_m = __msa_dpadd_s_h(res5_m, (v16i8)(filt3), (v16i8)vec5_m); \
res6_m = __msa_dpadd_s_h(res6_m, (v16i8)(filt3), (v16i8)vec6_m); \
res7_m = __msa_dpadd_s_h(res7_m, (v16i8)(filt3), (v16i8)vec7_m); \
\
out0 = __msa_adds_s_h(res0_m, res4_m); \
out1 = __msa_adds_s_h(res1_m, res5_m); \
out2 = __msa_adds_s_h(res2_m, res6_m); \
out3 = __msa_adds_s_h(res3_m, res7_m); \
} }
#endif /* VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_ */ #endif /* VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_ */

Просмотреть файл

@ -286,19 +286,19 @@ specialize qw/vp9_filter_by_weight8x8 sse2/;
# Sub Pixel Filters # Sub Pixel Filters
# #
add_proto qw/void vp9_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; add_proto qw/void vp9_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
specialize qw/vp9_convolve_copy neon dspr2/, "$sse2_x86inc"; specialize qw/vp9_convolve_copy neon dspr2 msa/, "$sse2_x86inc";
add_proto qw/void vp9_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; add_proto qw/void vp9_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
specialize qw/vp9_convolve_avg neon dspr2/, "$sse2_x86inc"; specialize qw/vp9_convolve_avg neon dspr2 msa/, "$sse2_x86inc";
add_proto qw/void vp9_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; add_proto qw/void vp9_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
specialize qw/vp9_convolve8 sse2 ssse3 neon dspr2/, "$avx2_ssse3"; specialize qw/vp9_convolve8 sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
add_proto qw/void vp9_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; add_proto qw/void vp9_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
specialize qw/vp9_convolve8_horiz sse2 ssse3 neon dspr2/, "$avx2_ssse3"; specialize qw/vp9_convolve8_horiz sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
add_proto qw/void vp9_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; add_proto qw/void vp9_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
specialize qw/vp9_convolve8_vert sse2 ssse3 neon dspr2/, "$avx2_ssse3"; specialize qw/vp9_convolve8_vert sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
add_proto qw/void vp9_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; add_proto qw/void vp9_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
specialize qw/vp9_convolve8_avg sse2 ssse3 neon dspr2/; specialize qw/vp9_convolve8_avg sse2 ssse3 neon dspr2/;

Просмотреть файл

@ -132,6 +132,12 @@ VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mblpf_vert_loopfilter_ds
# common (msa) # common (msa)
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_macros_msa.h VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_macros_msa.h
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_horiz_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_vert_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve_avg_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve_copy_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve_msa.h
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct32x32_msa.c VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct32x32_msa.c