vp8_fast_quantize_b_pair_neon function added to quantize
two adjacent blocks at the same time to improve performance.
 - Additional 3-6% speedup compared to neon optimized fast
   quantizer (Tanya VGA@30fps, 1Mbps stream, cpu-used=-5..-16)

Change-Id: I3fcbf141e5d05e9118c38ca37310458afbabaa4e
This commit is contained in:
Tero Rintaluoma 2011-05-09 10:09:41 +03:00
Родитель 9e4f76c154
Коммит 61f0c090df
12 изменённых файлов: 289 добавлений и 19 удалений

Просмотреть файл

@ -121,8 +121,10 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
cpi->rtcd.encodemb.submby = vp8_subtract_mby_neon;
cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_neon;
/*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;*/
/*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;
cpi->rtcd.quantize.quantb_pair = vp8_regular_quantize_b_pair;*/
cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_neon;
cpi->rtcd.quantize.fastquantb_pair = vp8_fast_quantize_b_pair_neon;
}
#endif

Просмотреть файл

@ -10,6 +10,7 @@
EXPORT |vp8_fast_quantize_b_neon|
EXPORT |vp8_fast_quantize_b_pair_neon|
INCLUDE asm_enc_offsets.asm
@ -19,6 +20,138 @@
AREA ||.text||, CODE, READONLY, ALIGN=4
;vp8_fast_quantize_b_pair_neon(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2);
;
; Fast-quantizes two 16-coefficient blocks in one call, interleaving the
; instructions of both blocks.
;   r0 = b1, r1 = b2 : source blocks (coeff, round and quant_fast are read)
;   r2 = d1, r3 = d2 : destination blocks (qcoeff, dqcoeff and eob are written)
; For each block the routine computes, per 16-bit coefficient z:
;   x       = abs(z) + round
;   y       = (x * quant) >> 16        (vqdmulh followed by a shift right by 1)
;   qcoeff  = y with the sign of z restored
;   dqcoeff = qcoeff * dequant
; and stores into eob the largest inverse zig-zag table entry found at a
; position whose qcoeff is non-zero (0 if every qcoeff is zero).
;
; NOTE(review): round and quant_fast are loaded from b1 only and dequant from
; d1 only; the same vectors are reused for b2/d2. This presumes both blocks of
; a pair share identical quantizer tables - confirm against the callers.
; r4-r9 and q4-q7 are saved on entry and restored before returning.
|vp8_fast_quantize_b_pair_neon| PROC
stmfd sp!, {r4-r9} ; save the core registers used as scratch below
vstmdb sp!, {q4-q7} ; save q4-q7, which are overwritten below
ldr r4, [r0, #vp8_block_coeff] ; r4 = b1 coefficient pointer
ldr r5, [r0, #vp8_block_quant_fast] ; r5 = b1 fast quantizer table pointer
ldr r6, [r0, #vp8_block_round] ; r6 = b1 rounding table pointer
vld1.16 {q0, q1}, [r4@128] ; load z (16 coefficients of block 1)
ldr r7, [r2, #vp8_blockd_qcoeff] ; r7 = d1 qcoeff output pointer
vabs.s16 q4, q0 ; calculate x = abs(z)
vabs.s16 q5, q1
; arithmetic shift right by 15 to get the sign mask: all 0 if z is positive,
; all 1 if z is negative
vshr.s16 q2, q0, #15 ; sz
vshr.s16 q3, q1, #15
vld1.s16 {q6, q7}, [r6@128] ; load round_ptr [0-15]
vld1.s16 {q8, q9}, [r5@128] ; load quant_ptr [0-15]
ldr r4, [r1, #vp8_block_coeff] ; r4 = b2 coefficient pointer
vadd.s16 q4, q6 ; x + Round
vadd.s16 q5, q7
vld1.16 {q0, q1}, [r4@128] ; load z2 (16 coefficients of block 2)
vqdmulh.s16 q4, q8 ; 2*y, where y = ((Round+abs(z)) * Quant) >> 16
vqdmulh.s16 q5, q9
vabs.s16 q10, q0 ; calculate x2 = abs(z2)
vabs.s16 q11, q1
vshr.s16 q12, q0, #15 ; sz2
vshr.s16 q13, q1, #15
; start restoring the original sign of block 1
veor.s16 q4, q2 ; (2*y)^sz
veor.s16 q5, q3
vadd.s16 q10, q6 ; x2 + Round (same rounding vector as block 1)
vadd.s16 q11, q7
ldr r8, [r2, #vp8_blockd_dequant] ; r8 = d1 dequant table pointer
vqdmulh.s16 q10, q8 ; 2*y2, where y2 = ((Round+abs(z2)) * Quant) >> 16
vqdmulh.s16 q11, q9
; undo the doubling done by vqdmulh; this arithmetic shift comes after the
; sign XOR for block 1, which gives the same result as shifting first
vshr.s16 q4, #1 ; y^sz
vshr.s16 q5, #1
vld1.s16 {q6, q7}, [r8@128] ; load dequant_ptr[0-15] (q6/q7 no longer hold Round)
vsub.s16 q4, q2 ; x1 = (y^sz)-sz: negates y when sz is all 1 (2's complement)
vsub.s16 q5, q3
vshr.s16 q10, #1 ; right shift 1 after vqdmulh, giving y2
vshr.s16 q11, #1
ldr r9, [r2, #vp8_blockd_dqcoeff] ; r9 = d1 dqcoeff output pointer
veor.s16 q10, q12 ; y2^sz2
veor.s16 q11, q13
vst1.s16 {q4, q5}, [r7] ; store: d1 qcoeff = x1
vsub.s16 q10, q12 ; x2 = (y2^sz2)-sz2: negates y2 when sz2 is all 1
vsub.s16 q11, q13
ldr r6, [r3, #vp8_blockd_qcoeff] ; r6 = d2 qcoeff output pointer
vmul.s16 q2, q6, q4 ; x1 * Dequant
vmul.s16 q3, q7, q5
ldr r0, _inv_zig_zag_ ; load ptr of inverse zigzag table (b1 pointer no longer needed)
vceq.s16 q8, q8 ; set q8 to all 1 (q8 no longer holds Quant)
vst1.s16 {q10, q11}, [r6] ; store: d2 qcoeff = x2
vmul.s16 q12, q6, q10 ; x2 * Dequant (dequant vector loaded from d1)
vmul.s16 q13, q7, q11
vld1.16 {q6, q7}, [r0@128] ; load inverse scan order
vtst.16 q14, q4, q8 ; block 1: now find eob
vtst.16 q15, q5, q8 ; lanes with a non-zero qcoeff are set to all 1
vst1.s16 {q2, q3}, [r9] ; store: d1 dqcoeff = x1 * Dequant
ldr r7, [r3, #vp8_blockd_dqcoeff] ; r7 = d2 dqcoeff output pointer
vand q0, q6, q14 ; keep scan-order values only where qcoeff is non-zero
vand q1, q7, q15
vst1.s16 {q12, q13}, [r7] ; store: d2 dqcoeff = x2 * Dequant
vtst.16 q2, q10, q8 ; block 2: now find eob
vtst.16 q3, q11, q8 ; lanes with a non-zero qcoeff are set to all 1
vmax.u16 q0, q0, q1 ; block 1: lane-wise maximum of q0, q1 (16 -> 8 lanes)
vand q10, q6, q2 ; keep scan-order values only where qcoeff is non-zero
vand q11, q7, q3
vmax.u16 q10, q10, q11 ; block 2: lane-wise maximum of q10, q11 (16 -> 8 lanes)
vmax.u16 d0, d0, d1 ; 8 -> 4 lanes
vmax.u16 d20, d20, d21
vmovl.u16 q0, d0 ; widen the 4 remaining lanes to 32 bits
vmovl.u16 q10, d20
vmax.u32 d0, d0, d1 ; 4 -> 2 lanes
vmax.u32 d20, d20, d21
vpmax.u32 d0, d0, d0 ; pairwise maximum: d0[0] = eob of block 1
vpmax.u32 d20, d20, d20 ; pairwise maximum: d20[0] = eob of block 2
add r4, r2, #vp8_blockd_eob ; r4 = address of the eob field in d1
add r5, r3, #vp8_blockd_eob ; r5 = address of the eob field in d2
vst1.32 {d0[0]}, [r4@32] ; store eob directly from NEON as a 32-bit value
vst1.32 {d20[0]}, [r5@32]
vldmia sp!, {q4-q7} ; restore the saved NEON registers
ldmfd sp!, {r4-r9} ; restore the saved core registers
bx lr
ENDP
;void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
|vp8_fast_quantize_b_neon| PROC
@ -97,10 +230,8 @@
vst1.s16 {q2, q3}, [r7@128] ; store dqcoeff = x * Dequant
vmov.32 r0, d0[0] ; this instruction takes 1+13 cycles
; if we have vfp, we could use
; vstr s0, [r1, #vp8_blockd_eob]
str r0, [r1, #vp8_blockd_eob]
add r4, r1, #vp8_blockd_eob
vst1.32 {d0[0]}, [r4@32]
ldmfd sp!, {r4-r7}
bx lr

Просмотреть файл

@ -0,0 +1,62 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <math.h>
#include "vpx_mem/vpx_mem.h"
#include "vp8/encoder/quantize.h"
#include "vp8/common/entropy.h"
#if HAVE_ARMV7
/* The vp8_quantize_mbX functions here differ from the corresponding ones in
 * quantize.c only in that they use the quantize_b_pair function pointer
 * instead of the regular quantize_b function pointer. */
/* Quantize blocks 0..15 of macroblock x two at a time through the
 * quantize_b_pair hook. Block 24 is then quantized with the single-block
 * quantize_b hook, unless the macroblock mode is B_PRED or SPLITMV. */
void vp8_quantize_mby_neon(MACROBLOCK *x)
{
    /* The mode is sampled before any block is quantized. */
    const int mode = x->e_mbd.mode_info_context->mbmi.mode;
    const int needs_2nd_order = !(mode == B_PRED || mode == SPLITMV);
    int first;

    for (first = 0; first < 16; first += 2)
    {
        const int second = first + 1;

        x->quantize_b_pair(&x->block[first], &x->block[second],
                           &x->e_mbd.block[first], &x->e_mbd.block[second]);
    }

    if (needs_2nd_order)
    {
        x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
    }
}
/* Quantize blocks 0..23 of macroblock x in adjacent pairs through the
 * quantize_b_pair hook. The block that follows (index 24) is then quantized
 * with the single-block quantize_b hook, unless the macroblock mode is
 * B_PRED or SPLITMV. */
void vp8_quantize_mb_neon(MACROBLOCK *x)
{
    /* The mode is sampled before any block is quantized. */
    const int mode = x->e_mbd.mode_info_context->mbmi.mode;
    const int needs_2nd_order = !(mode == B_PRED || mode == SPLITMV);
    int blk = 0;

    while (blk < 24)
    {
        x->quantize_b_pair(&x->block[blk], &x->block[blk + 1],
                           &x->e_mbd.block[blk], &x->e_mbd.block[blk + 1]);
        blk += 2;
    }

    /* blk is 24 once the loop has finished. */
    if (needs_2nd_order)
    {
        x->quantize_b(&x->block[blk], &x->e_mbd.block[blk]);
    }
}
/* Quantize blocks 16..23 of macroblock x in adjacent pairs through the
 * quantize_b_pair hook. */
void vp8_quantize_mbuv_neon(MACROBLOCK *x)
{
    int first = 16;

    while (first < 24)
    {
        const int second = first + 1;

        x->quantize_b_pair(&x->block[first], &x->block[second],
                           &x->e_mbd.block[first], &x->e_mbd.block[second]);
        first += 2;
    }
}
#endif /* HAVE_ARMV7 */

Просмотреть файл

@ -16,8 +16,10 @@
extern prototype_quantize_block(vp8_fast_quantize_b_armv6);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_quantize_fastquantb
#define vp8_quantize_fastquantb vp8_fast_quantize_b_armv6
#endif
#endif /* HAVE_ARMV6 */
@ -25,10 +27,25 @@ extern prototype_quantize_block(vp8_fast_quantize_b_armv6);
#if HAVE_ARMV7
extern prototype_quantize_block(vp8_fast_quantize_b_neon);
extern prototype_quantize_block_pair(vp8_fast_quantize_b_pair_neon);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_quantize_fastquantb
#define vp8_quantize_fastquantb vp8_fast_quantize_b_neon
#undef vp8_quantize_fastquantb_pair
#define vp8_quantize_fastquantb_pair vp8_fast_quantize_b_pair_neon
#undef vp8_quantize_mb
#define vp8_quantize_mb vp8_quantize_mb_neon
#undef vp8_quantize_mbuv
#define vp8_quantize_mbuv vp8_quantize_mbuv_neon
#undef vp8_quantize_mby
#define vp8_quantize_mby vp8_quantize_mby_neon
#endif
#endif /* HAVE_ARMV7 */
#endif

Просмотреть файл

@ -119,6 +119,7 @@ typedef struct
void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
void (*short_walsh4x4)(short *input, short *output, int pitch);
void (*quantize_b)(BLOCK *b, BLOCKD *d);
void (*quantize_b_pair)(BLOCK *b1, BLOCK *b2, BLOCKD *d0, BLOCKD *d1);
} MACROBLOCK;

Просмотреть файл

@ -1142,8 +1142,10 @@ int vp8cx_encode_inter_macroblock
/* Are we using the fast quantizer for the mode selection? */
if(cpi->sf.use_fastquant_for_pick)
{
cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
fastquantb);
cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
fastquantb);
cpi->mb.quantize_b_pair = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
fastquantb_pair);
/* the fast quantizer does not use zbin_extra, so
* do not recalculate */
@ -1155,7 +1157,10 @@ int vp8cx_encode_inter_macroblock
/* switch back to the regular quantizer for the encode */
if (cpi->sf.improved_quant)
{
cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize, quantb);
cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
quantb);
cpi->mb.quantize_b_pair = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
quantb_pair);
}
/* restore cpi->zbin_mode_boost_enabled */

Просмотреть файл

@ -328,6 +328,7 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
z->vp8_short_fdct8x4 = x->vp8_short_fdct8x4;
z->short_walsh4x4 = x->short_walsh4x4;
z->quantize_b = x->quantize_b;
z->quantize_b_pair = x->quantize_b_pair;
z->optimize = x->optimize;
/*

Просмотреть файл

@ -17,8 +17,6 @@
void vp8_arch_x86_encoder_init(VP8_COMP *cpi);
void vp8_arch_arm_encoder_init(VP8_COMP *cpi);
extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
void (*vp8_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
extern void vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
@ -88,7 +86,9 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)
cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_c;
cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;
cpi->rtcd.quantize.quantb_pair = vp8_regular_quantize_b_pair;
cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_c;
cpi->rtcd.quantize.fastquantb_pair = vp8_fast_quantize_b_pair_c;
cpi->rtcd.search.full_search = vp8_full_search_sad;
cpi->rtcd.search.refining_search = vp8_refining_search_sad;
cpi->rtcd.search.diamond_search = vp8_diamond_search_sad;

Просмотреть файл

@ -1200,11 +1200,17 @@ void vp8_set_speed_features(VP8_COMP *cpi)
if (cpi->sf.improved_quant)
{
cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize, quantb);
cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
quantb);
cpi->mb.quantize_b_pair = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
quantb_pair);
}
else
{
cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize, fastquantb);
cpi->mb.quantize_b = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
fastquantb);
cpi->mb.quantize_b_pair = QUANTIZE_INVOKE(&cpi->rtcd.quantize,
fastquantb_pair);
}
if (cpi->sf.improved_quant != last_improved_quant)
vp8cx_init_quantizer(cpi);

Просмотреть файл

@ -269,7 +269,7 @@ void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d)
#endif
void vp8_quantize_mby(MACROBLOCK *x)
void vp8_quantize_mby_c(MACROBLOCK *x)
{
int i;
int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
@ -282,7 +282,7 @@ void vp8_quantize_mby(MACROBLOCK *x)
x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
}
void vp8_quantize_mb(MACROBLOCK *x)
void vp8_quantize_mb_c(MACROBLOCK *x)
{
int i;
int has_2nd_order=(x->e_mbd.mode_info_context->mbmi.mode != B_PRED
@ -293,7 +293,7 @@ void vp8_quantize_mb(MACROBLOCK *x)
}
void vp8_quantize_mbuv(MACROBLOCK *x)
void vp8_quantize_mbuv_c(MACROBLOCK *x)
{
int i;
@ -301,6 +301,22 @@ void vp8_quantize_mbuv(MACROBLOCK *x)
x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
}
/* The quantize_b_pair function pointer in the MACROBLOCK structure is set to
 * one of these two C functions if a corresponding optimized routine is not
 * available. The NEON-optimized version currently implements the fast
 * quantization for a pair of blocks. */
/* Pair wrapper for the regular quantizer: quantizes b1 into d1 and then b2
 * into d2, one block after the other, with vp8_regular_quantize_b. */
void vp8_regular_quantize_b_pair(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
{
    /* first block of the pair */
    vp8_regular_quantize_b(b1, d1);

    /* second block of the pair */
    vp8_regular_quantize_b(b2, d2);
}
/* Pair wrapper for the C fast quantizer: quantizes b1 into d1 and then b2
 * into d2, one block after the other, with vp8_fast_quantize_b_c. */
void vp8_fast_quantize_b_pair_c(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
{
    /* first block of the pair */
    vp8_fast_quantize_b_c(b1, d1);

    /* second block of the pair */
    vp8_fast_quantize_b_c(b2, d2);
}
static const int qrounding_factors[129] =
{
@ -715,3 +731,4 @@ void vp8_set_quantizer(struct VP8_COMP *cpi, int Q)
vp8cx_init_quantizer(cpi);
}

Просмотреть файл

@ -17,6 +17,11 @@
#define prototype_quantize_block(sym) \
void (sym)(BLOCK *b,BLOCKD *d)
#define prototype_quantize_block_pair(sym) \
void (sym)(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
#define prototype_quantize_mb(sym) \
void (sym)(MACROBLOCK *x)
#if ARCH_X86 || ARCH_X86_64
#include "x86/quantize_x86.h"
@ -31,17 +36,43 @@
#endif
extern prototype_quantize_block(vp8_quantize_quantb);
#ifndef vp8_quantize_quantb_pair
#define vp8_quantize_quantb_pair vp8_regular_quantize_b_pair
#endif
extern prototype_quantize_block_pair(vp8_quantize_quantb_pair);
#ifndef vp8_quantize_fastquantb
#define vp8_quantize_fastquantb vp8_fast_quantize_b_c
#endif
extern prototype_quantize_block(vp8_quantize_fastquantb);
#ifndef vp8_quantize_fastquantb_pair
#define vp8_quantize_fastquantb_pair vp8_fast_quantize_b_pair_c
#endif
extern prototype_quantize_block_pair(vp8_quantize_fastquantb_pair);
typedef struct
{
prototype_quantize_block(*quantb);
prototype_quantize_block_pair(*quantb_pair);
prototype_quantize_block(*fastquantb);
prototype_quantize_block_pair(*fastquantb_pair);
} vp8_quantize_rtcd_vtable_t;
#ifndef vp8_quantize_mb
#define vp8_quantize_mb vp8_quantize_mb_c
#endif
extern prototype_quantize_mb(vp8_quantize_mb);
#ifndef vp8_quantize_mbuv
#define vp8_quantize_mbuv vp8_quantize_mbuv_c
#endif
extern prototype_quantize_mb(vp8_quantize_mbuv);
#ifndef vp8_quantize_mby
#define vp8_quantize_mby vp8_quantize_mby_c
#endif
extern prototype_quantize_mb(vp8_quantize_mby);
#if CONFIG_RUNTIME_CPU_DETECT
#define QUANTIZE_INVOKE(ctx,fn) (ctx)->fn
@ -51,10 +82,6 @@ typedef struct
extern void vp8_strict_quantize_b(BLOCK *b,BLOCKD *d);
extern void vp8_quantize_mb(MACROBLOCK *x);
extern void vp8_quantize_mbuv(MACROBLOCK *x);
extern void vp8_quantize_mby(MACROBLOCK *x);
struct VP8_COMP;
extern void vp8_set_quantizer(struct VP8_COMP *cpi, int Q);
extern void vp8cx_frame_init_quantizer(struct VP8_COMP *cpi);

Просмотреть файл

@ -15,6 +15,7 @@
# encoder
VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/arm_csystemdependent.c
VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/quantize_arm.c
VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/picklpf_arm.c
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/dct_arm.c
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/variance_arm.c