VP8 for ARMv8 by using NEON intrinsics 04

Add dequant_idct_neon.c
- vp8_dequant_idct_add_neon

vpxdec  --summary --noblit ../videos/tears_of_steel_1080p.webm
Before => After, 13.25 => 13.22 (fps)

Change-Id: Id48f39e1da58dd3d8d37658e94989411997f4f7c
Signed-off-by: James Yu <james.yu@linaro.org>
This commit is contained in:
James Yu 2013-12-17 19:06:31 +08:00
Родитель d749ab6221
Коммит 28b2f82f97
3 изменённых файлов: 143 добавлений и 132 удалений

Просмотреть файл

@ -1,131 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_dequant_idct_add_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp8_dequant_idct_add_neon(short *input, short *dq,
; unsigned char *dest, int stride)
; r0 short *input,
; r1 short *dq,
; r2 unsigned char *dest
; r3 int stride
|vp8_dequant_idct_add_neon| PROC
vld1.16 {q3, q4}, [r0]
vld1.16 {q5, q6}, [r1]
add r1, r2, r3 ; r1 = dest + stride
lsl r3, #1 ; 2x stride
vld1.32 {d14[0]}, [r2], r3
vld1.32 {d14[1]}, [r1], r3
vld1.32 {d15[0]}, [r2]
vld1.32 {d15[1]}, [r1]
adr r12, cospi8sqrt2minus1 ; pointer to the first constant
vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon
vmul.i16 q2, q4, q6
;|short_idct4x4llm_neon| PROC
vld1.16 {d0}, [r12]
vswp d3, d4 ;q2(vp[4] vp[12])
vqdmulh.s16 q3, q2, d0[2]
vqdmulh.s16 q4, q2, d0[0]
vqadd.s16 d12, d2, d3 ;a1
vqsub.s16 d13, d2, d3 ;b1
vshr.s16 q3, q3, #1
vshr.s16 q4, q4, #1
vqadd.s16 q3, q3, q2
vqadd.s16 q4, q4, q2
vqsub.s16 d10, d6, d9 ;c1
vqadd.s16 d11, d7, d8 ;d1
vqadd.s16 d2, d12, d11
vqadd.s16 d3, d13, d10
vqsub.s16 d4, d13, d10
vqsub.s16 d5, d12, d11
vtrn.32 d2, d4
vtrn.32 d3, d5
vtrn.16 d2, d3
vtrn.16 d4, d5
; memset(input, 0, 32) -- 32bytes
vmov.i16 q14, #0
vswp d3, d4
vqdmulh.s16 q3, q2, d0[2]
vqdmulh.s16 q4, q2, d0[0]
vqadd.s16 d12, d2, d3 ;a1
vqsub.s16 d13, d2, d3 ;b1
vmov q15, q14
vshr.s16 q3, q3, #1
vshr.s16 q4, q4, #1
vqadd.s16 q3, q3, q2
vqadd.s16 q4, q4, q2
vqsub.s16 d10, d6, d9 ;c1
vqadd.s16 d11, d7, d8 ;d1
vqadd.s16 d2, d12, d11
vqadd.s16 d3, d13, d10
vqsub.s16 d4, d13, d10
vqsub.s16 d5, d12, d11
vst1.16 {q14, q15}, [r0]
vrshr.s16 d2, d2, #3
vrshr.s16 d3, d3, #3
vrshr.s16 d4, d4, #3
vrshr.s16 d5, d5, #3
vtrn.32 d2, d4
vtrn.32 d3, d5
vtrn.16 d2, d3
vtrn.16 d4, d5
vaddw.u8 q1, q1, d14
vaddw.u8 q2, q2, d15
sub r2, r2, r3
sub r1, r1, r3
vqmovun.s16 d0, q1
vqmovun.s16 d1, q2
vst1.32 {d0[0]}, [r2], r3
vst1.32 {d0[1]}, [r1], r3
vst1.32 {d1[0]}, [r2]
vst1.32 {d1[1]}, [r1]
bx lr
ENDP ; |vp8_dequant_idct_add_neon|
; Constant Pool
cospi8sqrt2minus1 DCD 0x4e7b4e7b
sinpi8sqrt2 DCD 0x8a8c8a8c
END

Просмотреть файл

@ -0,0 +1,142 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
static const int16_t cospi8sqrt2minus1 = 20091;
static const int16_t sinpi8sqrt2 = 35468;
void vp8_dequant_idct_add_neon(
int16_t *input,
int16_t *dq,
unsigned char *dst,
int stride) {
unsigned char *dst0;
int32x2_t d14, d15;
int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
int16x8_t q1, q2, q3, q4, q5, q6;
int16x8_t qEmpty = vdupq_n_s16(0);
int32x2x2_t d2tmp0, d2tmp1;
int16x4x2_t d2tmp2, d2tmp3;
d14 = d15 = vdup_n_s32(0);
// load input
q3 = vld1q_s16(input);
vst1q_s16(input, qEmpty);
input += 8;
q4 = vld1q_s16(input);
vst1q_s16(input, qEmpty);
// load dq
q5 = vld1q_s16(dq);
dq += 8;
q6 = vld1q_s16(dq);
// load src from dst
dst0 = dst;
d14 = vld1_lane_s32((const int32_t *)dst0, d14, 0);
dst0 += stride;
d14 = vld1_lane_s32((const int32_t *)dst0, d14, 1);
dst0 += stride;
d15 = vld1_lane_s32((const int32_t *)dst0, d15, 0);
dst0 += stride;
d15 = vld1_lane_s32((const int32_t *)dst0, d15, 1);
q1 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q3),
vreinterpretq_u16_s16(q5)));
q2 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q4),
vreinterpretq_u16_s16(q6)));
d12 = vqadd_s16(vget_low_s16(q1), vget_low_s16(q2));
d13 = vqsub_s16(vget_low_s16(q1), vget_low_s16(q2));
q2 = vcombine_s16(vget_high_s16(q1), vget_high_s16(q2));
q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
q3 = vshrq_n_s16(q3, 1);
q4 = vshrq_n_s16(q4, 1);
q3 = vqaddq_s16(q3, q2);
q4 = vqaddq_s16(q4, q2);
d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));
d2 = vqadd_s16(d12, d11);
d3 = vqadd_s16(d13, d10);
d4 = vqsub_s16(d13, d10);
d5 = vqsub_s16(d12, d11);
d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
vreinterpret_s16_s32(d2tmp1.val[0]));
d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
vreinterpret_s16_s32(d2tmp1.val[1]));
// loop 2
q2 = vcombine_s16(d2tmp2.val[1], d2tmp3.val[1]);
q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]);
d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]);
q3 = vshrq_n_s16(q3, 1);
q4 = vshrq_n_s16(q4, 1);
q3 = vqaddq_s16(q3, q2);
q4 = vqaddq_s16(q4, q2);
d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));
d2 = vqadd_s16(d12, d11);
d3 = vqadd_s16(d13, d10);
d4 = vqsub_s16(d13, d10);
d5 = vqsub_s16(d12, d11);
d2 = vrshr_n_s16(d2, 3);
d3 = vrshr_n_s16(d3, 3);
d4 = vrshr_n_s16(d4, 3);
d5 = vrshr_n_s16(d5, 3);
d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
vreinterpret_s16_s32(d2tmp1.val[0]));
d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
vreinterpret_s16_s32(d2tmp1.val[1]));
q1 = vcombine_s16(d2tmp2.val[0], d2tmp2.val[1]);
q2 = vcombine_s16(d2tmp3.val[0], d2tmp3.val[1]);
q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1),
vreinterpret_u8_s32(d14)));
q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2),
vreinterpret_u8_s32(d15)));
d14 = vreinterpret_s32_u8(vqmovun_s16(q1));
d15 = vreinterpret_s32_u8(vqmovun_s16(q2));
dst0 = dst;
vst1_lane_s32((int32_t *)dst0, d14, 0);
dst0 += stride;
vst1_lane_s32((int32_t *)dst0, d14, 1);
dst0 += stride;
vst1_lane_s32((int32_t *)dst0, d15, 0);
dst0 += stride;
vst1_lane_s32((int32_t *)dst0, d15, 1);
return;
}

Просмотреть файл

@ -173,7 +173,6 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict8x8_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict16x16_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/save_reg_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequant_idct_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_full_2x_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_0_2x_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequantizeb_neon$(ASM)
@ -187,6 +186,7 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance16x16s_neon
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/copymem_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dc_only_idct_add_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequant_idct_neon.c
$(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.sh))