From fbac961b4763f276e97cfee217e327f91bb9c456 Mon Sep 17 00:00:00 2001 From: Parag Salasakar Date: Thu, 11 Jun 2015 09:11:04 +0530 Subject: [PATCH] mips msa vp9 filter by weight optimization filter by weight - average improvement ~2x-3x Change-Id: I4832033335d339cdafdce697f07ce3e643920057 --- vp9/common/mips/msa/vp9_macros_msa.h | 34 +++++++ vp9/common/mips/msa/vp9_mfqe_msa.c | 137 +++++++++++++++++++++++++++ vp9/common/vp9_rtcd_defs.pl | 4 +- vp9/vp9_common.mk | 4 + 4 files changed, 177 insertions(+), 2 deletions(-) create mode 100644 vp9/common/mips/msa/vp9_mfqe_msa.c diff --git a/vp9/common/mips/msa/vp9_macros_msa.h b/vp9/common/mips/msa/vp9_macros_msa.h index f1217d5ab..2043e13b3 100644 --- a/vp9/common/mips/msa/vp9_macros_msa.h +++ b/vp9/common/mips/msa/vp9_macros_msa.h @@ -244,6 +244,22 @@ out3 = LW((psrc) + 3 * stride); \ } +/* Description : Load double words with stride + Arguments : Inputs - psrc (source pointer to load from) + - stride + Outputs - out0, out1 + Details : Loads double word in 'out0' from (psrc) + Loads double word in 'out1' from (psrc + stride) +*/ +#define LD2(psrc, stride, out0, out1) { \ + out0 = LD((psrc)); \ + out1 = LD((psrc) + stride); \ +} +#define LD4(psrc, stride, out0, out1, out2, out3) { /* four double words via two LD2 calls: at psrc and at psrc + 2 * stride */ \ + LD2((psrc), stride, out0, out1); \ + LD2((psrc) + 2 * stride, stride, out2, out3); \ +} + /* Description : Store 4 words with stride Arguments : Inputs - in0, in1, in2, in3, pdst, stride Details : Stores word from 'in0' to (pdst) @@ -482,6 +498,24 @@ SD(out0_m, pdst); \ } +/* Description : Store as 8x2 byte block to destination memory from input vector + Arguments : Inputs - in, pdst, stride + Details : Index 0 double word element from input vector 'in' is copied + and stored to destination memory at (pdst) + Index 1 double word element from input vector 'in' is copied + and stored to destination memory at (pdst + stride) +*/ +#define ST8x2_UB(in, pdst, stride) { \ + uint64_t out0_m, out1_m; \ + uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \ + \ 
+ out0_m = __msa_copy_u_d((v2i64)in, 0); \ + out1_m = __msa_copy_u_d((v2i64)in, 1); \ + \ + SD(out0_m, pblk_8x2_m); \ + SD(out1_m, pblk_8x2_m + stride); \ +} + /* Description : Store as 8x4 byte block to destination memory from input vectors Arguments : Inputs - in0, in1, pdst, stride diff --git a/vp9/common/mips/msa/vp9_mfqe_msa.c b/vp9/common/mips/msa/vp9_mfqe_msa.c new file mode 100644 index 000000000..64cb9a818 --- /dev/null +++ b/vp9/common/mips/msa/vp9_mfqe_msa.c @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/mips/msa/vp9_macros_msa.h" + /* Weighted blend of src into dst over 8 rows of 8 bytes, written back through dst_ptr. Per element: src * src_weight + dst * dst_weight, then SRARI_H2_SH by MFQE_PRECISION (presumably a rounding right shift; that macro is not part of this patch). */ +static void filter_by_weight8x8_msa(const uint8_t *src_ptr, int32_t src_stride, + uint8_t *dst_ptr, int32_t dst_stride, + int32_t src_weight) { + int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight; /* the two weights always sum to 1 << MFQE_PRECISION */ + int32_t row; + uint64_t src0_d, src1_d, dst0_d, dst1_d; + v16i8 src0 = { 0 }; + v16i8 src1 = { 0 }; + v16i8 dst0 = { 0 }; + v16i8 dst1 = { 0 }; + v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l; + + src_wt = __msa_fill_h(src_weight); /* replicate each scalar weight into every halfword lane */ + dst_wt = __msa_fill_h(dst_weight); + + for (row = 2; row--;) { /* 2 iterations, 4 rows each = 8 rows */ + LD2(src_ptr, src_stride, src0_d, src1_d); + src_ptr += (2 * src_stride); + LD2(dst_ptr, dst_stride, dst0_d, dst1_d); /* dst_ptr is not advanced by the loads; it moves only after each store below */ + INSERT_D2_SB(src0_d, src1_d, src0); /* presumably places the two 8-byte rows into the two halves of the vector (macro not defined in this patch) */ + INSERT_D2_SB(dst0_d, dst1_d, dst0); + + LD2(src_ptr, src_stride, src0_d, src1_d); + src_ptr += (2 * src_stride); + LD2((dst_ptr + 2 * dst_stride), dst_stride, dst0_d, dst1_d); /* rows 2-3 of the current 4-row group */ + INSERT_D2_SB(src0_d, src1_d, src1); + INSERT_D2_SB(dst0_d, dst1_d, dst1); + + UNPCK_UB_SH(src0, src_r, src_l); + 
UNPCK_UB_SH(dst0, dst_r, dst_l); /* presumably widens the 16 unsigned bytes into two v8i16 halves (macro not defined in this patch) */ + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); /* presumably a rounding right shift of both vectors (macro not defined in this patch) */ + dst0 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r); /* pack the even-indexed byte of every halfword back into one 16-byte vector */ + ST8x2_UB(dst0, dst_ptr, dst_stride); /* rows 0-1 of this group */ + dst_ptr += (2 * dst_stride); + + UNPCK_UB_SH(src1, src_r, src_l); + UNPCK_UB_SH(dst1, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + dst1 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r); + ST8x2_UB(dst1, dst_ptr, dst_stride); /* rows 2-3 of this group */ + dst_ptr += (2 * dst_stride); + } +} + /* Same weighted blend as the 8x8 routine, over 16 rows handled as one v16i8 vector per row (LD_SB4 and PCKEV_ST_SB are not defined in this patch; presumably a 4-row vector load and a pack-and-store of 16 bytes). */ +static void filter_by_weight16x16_msa(const uint8_t *src_ptr, + int32_t src_stride, + uint8_t *dst_ptr, + int32_t dst_stride, + int32_t src_weight) { + int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight; /* the two weights always sum to 1 << MFQE_PRECISION */ + int32_t row; + v16i8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; + v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l; + + src_wt = __msa_fill_h(src_weight); + dst_wt = __msa_fill_h(dst_weight); + + for (row = 4; row--;) { /* 4 iterations, 4 rows each = 16 rows */ + LD_SB4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LD_SB4(dst_ptr, dst_stride, dst0, dst1, dst2, dst3); /* dst_ptr stays put here; it advances one row after each PCKEV_ST_SB below */ + + UNPCK_UB_SH(src0, src_r, src_l); + UNPCK_UB_SH(dst0, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr); /* row 0 of this group */ + dst_ptr += dst_stride; + + UNPCK_UB_SH(src1, src_r, src_l); + UNPCK_UB_SH(dst1, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr); /* row 1 of this group */ + dst_ptr += dst_stride; + + UNPCK_UB_SH(src2, src_r, src_l); + UNPCK_UB_SH(dst2, 
dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr); /* row 2 of this group */ + dst_ptr += dst_stride; + + UNPCK_UB_SH(src3, src_r, src_l); + UNPCK_UB_SH(dst3, dst_r, dst_l); + res_h_r = (src_r * src_wt); + res_h_r += (dst_r * dst_wt); + res_h_l = (src_l * src_wt); + res_h_l += (dst_l * dst_wt); + SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION); + PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr); /* row 3 of this group */ + dst_ptr += dst_stride; + } +} + /* Non-static 8x8 entry point; forwards every argument unchanged to the static implementation. This patch adds msa to the specialize list of vp9_filter_by_weight8x8 in vp9_rtcd_defs.pl. */ +void vp9_filter_by_weight8x8_msa(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + int src_weight) { + filter_by_weight8x8_msa(src, src_stride, dst, dst_stride, src_weight); +} + /* Non-static 16x16 entry point; forwards every argument unchanged to the static implementation. This patch adds msa to the specialize list of vp9_filter_by_weight16x16 in vp9_rtcd_defs.pl. */ +void vp9_filter_by_weight16x16_msa(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + int src_weight) { + filter_by_weight16x16_msa(src, src_stride, dst, dst_stride, src_weight); +} diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 503512686..2f262a6f1 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -276,10 +276,10 @@ specialize qw/vp9_plane_add_noise sse2/; $vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt; add_proto qw/void vp9_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight"; -specialize qw/vp9_filter_by_weight16x16 sse2/; +specialize qw/vp9_filter_by_weight16x16 sse2 msa/; add_proto qw/void vp9_filter_by_weight8x8/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight"; -specialize qw/vp9_filter_by_weight8x8 sse2/; +specialize qw/vp9_filter_by_weight8x8 sse2 msa/; } # diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index b01fdd186..6f091eefb 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -152,6 +152,10 @@ VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_8_msa.c VP9_COMMON_SRCS-$(HAVE_MSA) += 
common/mips/msa/vp9_loopfilter_16_msa.c VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_loopfilter_msa.h +ifeq ($(CONFIG_VP9_POSTPROC),yes) +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_mfqe_msa.c +endif + VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.h ifeq ($(ARCH_X86_64), yes)