From 61f0c090dff65135c1828a7c407f51fe21405926 Mon Sep 17 00:00:00 2001
From: Tero Rintaluoma
Date: Mon, 9 May 2011 10:09:41 +0300
Subject: [PATCH] neon fast quantize block pair

vp8_fast_quantize_b_pair_neon function added to quantize two adjacent
blocks at the same time to improve performance.
 - Additional 3-6% speedup compared to neon optimized fast quantizer
   (Tanya VGA@30fps, 1Mbps stream, cpu-used=-5..-16)

Change-Id: I3fcbf141e5d05e9118c38ca37310458afbabaa4e
---
 vp8/encoder/arm/arm_csystemdependent.c      |   4 +-
 vp8/encoder/arm/neon/fastquantizeb_neon.asm | 139 +++++++++++++++++++-
 vp8/encoder/arm/quantize_arm.c              |  62 +++++++++
 vp8/encoder/arm/quantize_arm.h              |  17 +++
 vp8/encoder/block.h                         |   1 +
 vp8/encoder/encodeframe.c                   |  11 +-
 vp8/encoder/ethreading.c                    |   1 +
 vp8/encoder/generic/csystemdependent.c      |   4 +-
 vp8/encoder/onyx_if.c                       |  10 +-
 vp8/encoder/quantize.c                      |  23 +++-
 vp8/encoder/quantize.h                      |  35 ++++-
 vp8/vp8cx_arm.mk                            |   1 +
 12 files changed, 289 insertions(+), 19 deletions(-)
 create mode 100644 vp8/encoder/arm/quantize_arm.c

diff --git a/vp8/encoder/arm/arm_csystemdependent.c b/vp8/encoder/arm/arm_csystemdependent.c
index af2a5df98..db079d5ed 100644
--- a/vp8/encoder/arm/arm_csystemdependent.c
+++ b/vp8/encoder/arm/arm_csystemdependent.c
@@ -121,8 +121,10 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
         cpi->rtcd.encodemb.submby = vp8_subtract_mby_neon;
         cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_neon;
 
-        /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;*/
+        /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;
+        cpi->rtcd.quantize.quantb_pair = vp8_regular_quantize_b_pair;*/
         cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_neon;
+        cpi->rtcd.quantize.fastquantb_pair = vp8_fast_quantize_b_pair_neon;
     }
 #endif

diff --git a/vp8/encoder/arm/neon/fastquantizeb_neon.asm b/vp8/encoder/arm/neon/fastquantizeb_neon.asm
index 3dd92b12e..dcf3c5090 100644
--- a/vp8/encoder/arm/neon/fastquantizeb_neon.asm
+++ b/vp8/encoder/arm/neon/fastquantizeb_neon.asm
@@ -10,6 +10,7 @@
 
     EXPORT |vp8_fast_quantize_b_neon|
+    EXPORT |vp8_fast_quantize_b_pair_neon|
 
     INCLUDE asm_enc_offsets.asm
 
@@ -19,6 +20,138 @@
 
     AREA ||.text||, CODE, READONLY, ALIGN=4
 
+;vp8_fast_quantize_b_pair_neon(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2);
+|vp8_fast_quantize_b_pair_neon| PROC
+
+    stmfd           sp!, {r4-r9}
+    vstmdb          sp!, {q4-q7}
+
+    ldr             r4, [r0, #vp8_block_coeff]
+    ldr             r5, [r0, #vp8_block_quant_fast]
+    ldr             r6, [r0, #vp8_block_round]
+
+    vld1.16         {q0, q1}, [r4@128]  ; load z
+
+    ldr             r7, [r2, #vp8_blockd_qcoeff]
+
+    vabs.s16        q4, q0              ; calculate x = abs(z)
+    vabs.s16        q5, q1
+
+    ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
+    vshr.s16        q2, q0, #15         ; sz
+    vshr.s16        q3, q1, #15
+
+    vld1.s16        {q6, q7}, [r6@128]  ; load round_ptr [0-15]
+    vld1.s16        {q8, q9}, [r5@128]  ; load quant_ptr [0-15]
+
+    ldr             r4, [r1, #vp8_block_coeff]
+
+    vadd.s16        q4, q6              ; x + Round
+    vadd.s16        q5, q7
+
+    vld1.16         {q0, q1}, [r4@128]  ; load z2
+
+    vqdmulh.s16     q4, q8              ; y = ((Round+abs(z)) * Quant) >> 16
+    vqdmulh.s16     q5, q9
+
+    vabs.s16        q10, q0             ; calculate x2 = abs(z2)
+    vabs.s16        q11, q1
+    vshr.s16        q12, q0, #15        ; sz2
+    vshr.s16        q13, q1, #15
+
+    ;modify data to have its original sign
+    veor.s16        q4, q2              ; y^sz
+    veor.s16        q5, q3
+
+    vadd.s16        q10, q6             ; x2 + Round
+    vadd.s16        q11, q7
+
+    ldr             r8, [r2, #vp8_blockd_dequant]
+
+    vqdmulh.s16     q10, q8             ; y2 = ((Round+abs(z2)) * Quant) >> 16
+    vqdmulh.s16     q11, q9
+
+    vshr.s16        q4, #1              ; right shift 1 after vqdmulh
+    vshr.s16        q5, #1
+
+    vld1.s16        {q6, q7}, [r8@128]  ; load dequant_ptr[i]
+
+    vsub.s16        q4, q2              ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
+    vsub.s16        q5, q3
+
+    vshr.s16        q10, #1             ; right shift 1 after vqdmulh
+    vshr.s16        q11, #1
+
+    ldr             r9, [r2, #vp8_blockd_dqcoeff]
+
+    veor.s16        q10, q12            ; y2^sz2
+    veor.s16        q11, q13
+
+    vst1.s16        {q4, q5}, [r7]      ; store: qcoeff = x1
+
+    vsub.s16        q10, q12            ; x2=(y2^sz2)-sz2 = (y2^sz2)-(-1) (2's complement)
+    vsub.s16        q11, q13
+
+    ldr             r6, [r3, #vp8_blockd_qcoeff]
+
+    vmul.s16        q2, q6, q4          ; x * Dequant
+    vmul.s16        q3, q7, q5
+
+    ldr             r0, _inv_zig_zag_   ; load ptr of inverse zigzag table
+
+    vceq.s16        q8, q8              ; set q8 to all 1
+
+    vst1.s16        {q10, q11}, [r6]    ; store: qcoeff = x2
+
+    vmul.s16        q12, q6, q10        ; x2 * Dequant
+    vmul.s16        q13, q7, q11
+
+    vld1.16         {q6, q7}, [r0@128]  ; load inverse scan order
+
+    vtst.16         q14, q4, q8         ; now find eob
+    vtst.16         q15, q5, q8         ; non-zero element is set to all 1
+
+    vst1.s16        {q2, q3}, [r9]      ; store dqcoeff = x * Dequant
+
+    ldr             r7, [r3, #vp8_blockd_dqcoeff]
+
+    vand            q0, q6, q14         ; get all valid numbers from scan array
+    vand            q1, q7, q15
+
+    vst1.s16        {q12, q13}, [r7]    ; store dqcoeff = x2 * Dequant
+
+    vtst.16         q2, q10, q8         ; now find eob
+    vtst.16         q3, q11, q8         ; non-zero element is set to all 1
+
+    vmax.u16        q0, q0, q1          ; find maximum value in q0, q1
+
+    vand            q10, q6, q2         ; get all valid numbers from scan array
+    vand            q11, q7, q3
+    vmax.u16        q10, q10, q11       ; find maximum value in q10, q11
+
+    vmax.u16        d0, d0, d1
+    vmax.u16        d20, d20, d21
+    vmovl.u16       q0, d0
+    vmovl.u16       q10, d20
+
+    vmax.u32        d0, d0, d1
+    vmax.u32        d20, d20, d21
+    vpmax.u32       d0, d0, d0
+    vpmax.u32       d20, d20, d20
+
+    add             r4, r2, #vp8_blockd_eob
+    add             r5, r3, #vp8_blockd_eob
+
+    vst1.32         {d0[0]}, [r4@32]
+    vst1.32         {d20[0]}, [r5@32]
+
+    vldmia          sp!, {q4-q7}
+    ldmfd           sp!, {r4-r9}
+    bx              lr
+
+    ENDP
 
 ;void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
 |vp8_fast_quantize_b_neon| PROC
@@ -97,10 +230,8 @@
 
     vst1.s16        {q2, q3}, [r7@128]  ; store dqcoeff = x * Dequant
 
-    vmov.32         r0, d0[0]           ; this instruction takes 1+13 cycles
-                                        ; if we have vfp, we could use
-                                        ; vstr s0, [r1, #vp8_blockd_eob]
-    str             r0, [r1, #vp8_blockd_eob]
+    add             r4, r1, #vp8_blockd_eob
+    vst1.32         {d0[0]}, [r4@32]
 
     ldmfd           sp!, {r4-r7}
     bx              lr

diff --git a/vp8/encoder/arm/quantize_arm.c b/vp8/encoder/arm/quantize_arm.c
new file mode 100644
index 000000000..52d84013e
--- /dev/null
+++ b/vp8/encoder/arm/quantize_arm.c
@@ -0,0 +1,62 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <math.h>
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp8/encoder/quantize.h"
+#include "vp8/common/entropy.h"
+
+
+#if HAVE_ARMV7
+
+/* The vp8_quantize_mbX functions here differ from the corresponding ones in
+ * quantize.c only by using the quantize_b_pair function pointer instead of
+ * the regular quantize_b function pointer. */
+void vp8_quantize_mby_neon(MACROBLOCK *x)
+{
+    int i;
+    int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
+        && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
+
+    for (i = 0; i < 16; i+=2)
+        x->quantize_b_pair(&x->block[i], &x->block[i+1],
+                           &x->e_mbd.block[i], &x->e_mbd.block[i+1]);
+
+    if(has_2nd_order)
+        x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
+}
+
+void vp8_quantize_mb_neon(MACROBLOCK *x)
+{
+    int i;
+    int has_2nd_order=(x->e_mbd.mode_info_context->mbmi.mode != B_PRED
+        && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
+
+    for (i = 0; i < 24; i+=2)
+        x->quantize_b_pair(&x->block[i], &x->block[i+1],
+                           &x->e_mbd.block[i], &x->e_mbd.block[i+1]);
+
+    if (has_2nd_order)
+        x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
+}
+
+
+void vp8_quantize_mbuv_neon(MACROBLOCK *x)
+{
+    int i;
+
+    for (i = 16; i < 24; i+=2)
+        x->quantize_b_pair(&x->block[i], &x->block[i+1],
+                           &x->e_mbd.block[i], &x->e_mbd.block[i+1]);
+}
+
+#endif /* HAVE_ARMV7 */
diff --git a/vp8/encoder/arm/quantize_arm.h b/vp8/encoder/arm/quantize_arm.h
index af4187ac1..7d2088d2d 100644
--- a/vp8/encoder/arm/quantize_arm.h
+++ b/vp8/encoder/arm/quantize_arm.h
@@ -16,8 +16,10 @@
 
 extern prototype_quantize_block(vp8_fast_quantize_b_armv6);
 
+#if !CONFIG_RUNTIME_CPU_DETECT
 #undef vp8_quantize_fastquantb
 #define vp8_quantize_fastquantb vp8_fast_quantize_b_armv6
+#endif
 
 #endif /* HAVE_ARMV6 */
 
@@ -25,10 +27,25 @@ extern prototype_quantize_block(vp8_fast_quantize_b_armv6);
 
 #if HAVE_ARMV7
 extern prototype_quantize_block(vp8_fast_quantize_b_neon);
+extern prototype_quantize_block_pair(vp8_fast_quantize_b_pair_neon);
 
+#if !CONFIG_RUNTIME_CPU_DETECT
 #undef vp8_quantize_fastquantb
 #define vp8_quantize_fastquantb vp8_fast_quantize_b_neon
 
+#undef vp8_quantize_fastquantb_pair
+#define vp8_quantize_fastquantb_pair vp8_fast_quantize_b_pair_neon
+
+#undef vp8_quantize_mb
+#define vp8_quantize_mb vp8_quantize_mb_neon
+
+#undef vp8_quantize_mbuv
+#define vp8_quantize_mbuv vp8_quantize_mbuv_neon
+
+#undef vp8_quantize_mby
+#define vp8_quantize_mby vp8_quantize_mby_neon
+#endif
+
 #endif /* HAVE_ARMV7 */
 
 #endif
diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h
index fbdc89e87..fabd82a06 100644
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -119,6 +119,7 @@ typedef struct
     void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
     void (*short_walsh4x4)(short *input, short *output, int pitch);
     void (*quantize_b)(BLOCK *b, BLOCKD *d);
+    void (*quantize_b_pair)(BLOCK *b1, BLOCK *b2, BLOCKD *d0, BLOCKD *d1);
 
 } MACROBLOCK;

diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index 1bb026048..98d6232b0 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -1142,8 +1142,10 @@ int vp8cx_encode_inter_macroblock
 
     /* Are we using the fast quantizer for the mode selection? */
     if(cpi->sf.use_fastquant_for_pick)
     {
-        cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
-                                             fastquantb);
+        cpi->mb.quantize_b      = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
+                                                  fastquantb);
+        cpi->mb.quantize_b_pair = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
+                                                  fastquantb_pair);
 
         /* the fast quantizer does not use zbin_extra, so
          * do not recalculate */
@@ -1155,7 +1157,10 @@ int vp8cx_encode_inter_macroblock
         /* switch back to the regular quantizer for the encode */
         if (cpi->sf.improved_quant)
         {
-            cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize, quantb);
+            cpi->mb.quantize_b      = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
+                                                      quantb);
+            cpi->mb.quantize_b_pair = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
+                                                      quantb_pair);
         }
 
         /* restore cpi->zbin_mode_boost_enabled */
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index 1d92f20af..565e4f22e 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -328,6 +328,7 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
     z->vp8_short_fdct8x4 = x->vp8_short_fdct8x4;
     z->short_walsh4x4 = x->short_walsh4x4;
     z->quantize_b = x->quantize_b;
+    z->quantize_b_pair = x->quantize_b_pair;
     z->optimize = x->optimize;
 
     /*
diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c
index 928f559f3..9af3f183a 100644
--- a/vp8/encoder/generic/csystemdependent.c
+++ b/vp8/encoder/generic/csystemdependent.c
@@ -17,8 +17,6 @@
 void vp8_arch_x86_encoder_init(VP8_COMP *cpi);
 void vp8_arch_arm_encoder_init(VP8_COMP *cpi);
 
-extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
-
 void (*vp8_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
 extern void vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
 
@@ -88,7 +86,9 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)
     cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_c;
 
     cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;
+    cpi->rtcd.quantize.quantb_pair = vp8_regular_quantize_b_pair;
     cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_c;
+    cpi->rtcd.quantize.fastquantb_pair = vp8_fast_quantize_b_pair_c;
     cpi->rtcd.search.full_search = vp8_full_search_sad;
     cpi->rtcd.search.refining_search = vp8_refining_search_sad;
     cpi->rtcd.search.diamond_search = vp8_diamond_search_sad;
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 18ffa02a8..2bdd46d4b 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -1200,11 +1200,17 @@ void vp8_set_speed_features(VP8_COMP *cpi)
 
     if (cpi->sf.improved_quant)
     {
-        cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize, quantb);
+        cpi->mb.quantize_b      = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
+                                                  quantb);
+        cpi->mb.quantize_b_pair = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
+                                                  quantb_pair);
     }
     else
     {
-        cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize, fastquantb);
+        cpi->mb.quantize_b      = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
+                                                  fastquantb);
+        cpi->mb.quantize_b_pair = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
+                                                  fastquantb_pair);
     }
 
     if (cpi->sf.improved_quant != last_improved_quant)
         vp8cx_init_quantizer(cpi);
diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c
index 49e8e1b9b..503d24123 100644
--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c
@@ -269,7 +269,7 @@ void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d)
 
 #endif
 
-void vp8_quantize_mby(MACROBLOCK *x)
+void vp8_quantize_mby_c(MACROBLOCK *x)
 {
     int i;
     int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
@@ -282,7 +282,7 @@ void vp8_quantize_mby(MACROBLOCK *x)
         x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
 }
 
-void vp8_quantize_mb(MACROBLOCK *x)
+void vp8_quantize_mb_c(MACROBLOCK *x)
 {
     int i;
     int has_2nd_order=(x->e_mbd.mode_info_context->mbmi.mode != B_PRED
@@ -293,7 +293,7 @@ void vp8_quantize_mb(MACROBLOCK *x)
 }
 
 
-void vp8_quantize_mbuv(MACROBLOCK *x)
+void vp8_quantize_mbuv_c(MACROBLOCK *x)
 {
     int i;
 
@@ -301,6 +301,22 @@ void vp8_quantize_mbuv(MACROBLOCK *x)
         x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
 }
 
+/* The quantize_b_pair function pointer in the MACROBLOCK structure is set to
+ * one of these two C functions if the corresponding optimized routine is not
+ * available. The NEON optimized version currently implements only the fast
+ * quantization for a pair of blocks. */
+void vp8_regular_quantize_b_pair(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
+{
+    vp8_regular_quantize_b(b1, d1);
+    vp8_regular_quantize_b(b2, d2);
+}
+
+void vp8_fast_quantize_b_pair_c(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
+{
+    vp8_fast_quantize_b_c(b1, d1);
+    vp8_fast_quantize_b_c(b2, d2);
+}
+
 static const int qrounding_factors[129] =
 {
@@ -715,3 +731,4 @@ void vp8_set_quantizer(struct VP8_COMP *cpi, int Q)
 
     vp8cx_init_quantizer(cpi);
 }
+
diff --git a/vp8/encoder/quantize.h b/vp8/encoder/quantize.h
index d9a041071..f1f0156d8 100644
--- a/vp8/encoder/quantize.h
+++ b/vp8/encoder/quantize.h
@@ -17,6 +17,11 @@
 #define prototype_quantize_block(sym) \
     void (sym)(BLOCK *b,BLOCKD *d)
 
+#define prototype_quantize_block_pair(sym) \
+    void (sym)(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
+
+#define prototype_quantize_mb(sym) \
+    void (sym)(MACROBLOCK *x)
+
 #if ARCH_X86 || ARCH_X86_64
 #include "x86/quantize_x86.h"
 #endif
@@ -31,17 +36,43 @@
 #endif
 extern prototype_quantize_block(vp8_quantize_quantb);
 
+#ifndef vp8_quantize_quantb_pair
+#define vp8_quantize_quantb_pair vp8_regular_quantize_b_pair
+#endif
+extern prototype_quantize_block_pair(vp8_quantize_quantb_pair);
+
 #ifndef vp8_quantize_fastquantb
 #define vp8_quantize_fastquantb vp8_fast_quantize_b_c
 #endif
 extern prototype_quantize_block(vp8_quantize_fastquantb);
 
+#ifndef vp8_quantize_fastquantb_pair
+#define vp8_quantize_fastquantb_pair vp8_fast_quantize_b_pair_c
+#endif
+extern prototype_quantize_block_pair(vp8_quantize_fastquantb_pair);
+
 typedef struct
 {
     prototype_quantize_block(*quantb);
+    prototype_quantize_block_pair(*quantb_pair);
     prototype_quantize_block(*fastquantb);
+    prototype_quantize_block_pair(*fastquantb_pair);
 } vp8_quantize_rtcd_vtable_t;
 
+#ifndef vp8_quantize_mb
+#define vp8_quantize_mb vp8_quantize_mb_c
+#endif
+extern prototype_quantize_mb(vp8_quantize_mb);
+
+#ifndef vp8_quantize_mbuv
+#define vp8_quantize_mbuv vp8_quantize_mbuv_c
+#endif
+extern prototype_quantize_mb(vp8_quantize_mbuv);
+
+#ifndef vp8_quantize_mby
+#define vp8_quantize_mby vp8_quantize_mby_c
+#endif
+extern prototype_quantize_mb(vp8_quantize_mby);
+
 #if CONFIG_RUNTIME_CPU_DETECT
 #define QUANTIZE_INVOKE(ctx,fn) (ctx)->fn
@@ -51,10 +82,6 @@ typedef struct
 
 extern void vp8_strict_quantize_b(BLOCK *b,BLOCKD *d);
 
-extern void vp8_quantize_mb(MACROBLOCK *x);
-extern void vp8_quantize_mbuv(MACROBLOCK *x);
-extern void vp8_quantize_mby(MACROBLOCK *x);
-
 struct VP8_COMP;
 extern void vp8_set_quantizer(struct VP8_COMP *cpi, int Q);
 extern void vp8cx_frame_init_quantizer(struct VP8_COMP *cpi);
diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk
index 03d42d215..165dada2b 100644
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -15,6 +15,7 @@
 
 # encoder
 VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/arm_csystemdependent.c
+VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/quantize_arm.c
 VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/picklpf_arm.c
 VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/dct_arm.c
 VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/variance_arm.c
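
The following C sketch is illustrative only and is not part of the patch:
the function and parameter names are invented here, and the in-tree C
reference for this math is vp8_fast_quantize_b_c() in vp8/encoder/quantize.c.
It shows the per-block arithmetic that vp8_fast_quantize_b_pair_neon
implements; the NEON routine interleaves this computation for two blocks so
that the second block's work hides the first block's instruction latencies.
Note that vqdmulh.s16 returns the doubled high half of the product, which is
why the assembly follows it with a right shift by 1 to obtain a plain
(a * b) >> 16.

    /* Sketch only -- see vp8_fast_quantize_b_c() for the real code. */
    static const short inv_zig_zag_sketch[16] =
    {
         1,  2,  6,  7,
         3,  5,  8, 13,
         4,  9, 12, 14,
        10, 11, 15, 16
    };

    static void fast_quantize_block_sketch(const short *z,       /* b->coeff      */
                                           const short *round,   /* b->round      */
                                           const short *quant,   /* b->quant_fast */
                                           const short *dequant, /* d->dequant    */
                                           short *qcoeff, short *dqcoeff, int *eob)
    {
        int i;
        int last = 0;                 /* 1-based zig-zag position of last nonzero */

        for (i = 0; i < 16; i++)
        {
            short sz = z[i] >> 15;    /* sign mask: all 0s or all 1s (vshr.s16)   */
            short x  = (z[i] ^ sz) - sz;                /* abs(z), as vabs.s16    */
            short y  = (short)(((x + round[i]) * quant[i]) >> 16);  /* quantize   */

            qcoeff[i]  = (y ^ sz) - sz;                 /* restore the sign       */
            dqcoeff[i] = qcoeff[i] * dequant[i];        /* dequantize             */

            if (qcoeff[i] != 0 && inv_zig_zag_sketch[i] > last)
                last = inv_zig_zag_sketch[i];  /* eob via the vtst/vand/vmax steps */
        }
        *eob = last;
    }

Viewed this way, vp8_fast_quantize_b_pair_neon(b1, b2, d1, d2) produces
essentially the same qcoeff, dqcoeff and eob values as running the sketch once
per block, which is exactly what the C fallback vp8_fast_quantize_b_pair_c()
in quantize.c does with two calls to vp8_fast_quantize_b_c().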