Merge "vp8 quantization -> intrinsics"

This commit is contained in:
Johann 2014-11-04 16:28:46 -08:00 коммит произвёл Gerrit Code Review
Родитель 9f9e30d7bf 7ae75c3d52
Коммит 412eaaf090
4 изменённых файлов: 208 добавлений и 264 удалений

Просмотреть файл

@ -454,16 +454,14 @@ add_proto qw/void vp8_regular_quantize_b/, "struct block *, struct blockd *";
specialize qw/vp8_regular_quantize_b sse2 sse4_1/;
add_proto qw/void vp8_fast_quantize_b/, "struct block *, struct blockd *";
specialize qw/vp8_fast_quantize_b sse2 ssse3 media neon_asm/;
specialize qw/vp8_fast_quantize_b sse2 ssse3 media neon/;
$vp8_fast_quantize_b_media=vp8_fast_quantize_b_armv6;
$vp8_fast_quantize_b_neon_asm=vp8_fast_quantize_b_neon;
add_proto qw/void vp8_regular_quantize_b_pair/, "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2";
# no asm yet
add_proto qw/void vp8_fast_quantize_b_pair/, "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2";
specialize qw/vp8_fast_quantize_b_pair neon_asm/;
$vp8_fast_quantize_b_pair_neon_asm=vp8_fast_quantize_b_pair_neon;
specialize qw/vp8_fast_quantize_b_pair neon/;
add_proto qw/void vp8_quantize_mb/, "struct macroblock *";
specialize qw/vp8_quantize_mb neon/;

Просмотреть файл

@ -1,258 +0,0 @@
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_fast_quantize_b_neon|
EXPORT |vp8_fast_quantize_b_pair_neon|
INCLUDE vp8_asm_enc_offsets.asm
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=4
;vp8_fast_quantize_b_pair_neon(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2);
|vp8_fast_quantize_b_pair_neon| PROC
stmfd sp!, {r4-r9}
vstmdb sp!, {q4-q7}
ldr r4, [r0, #vp8_block_coeff]
ldr r5, [r0, #vp8_block_quant_fast]
ldr r6, [r0, #vp8_block_round]
vld1.16 {q0, q1}, [r4@128] ; load z
ldr r7, [r2, #vp8_blockd_qcoeff]
vabs.s16 q4, q0 ; calculate x = abs(z)
vabs.s16 q5, q1
;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
vshr.s16 q2, q0, #15 ; sz
vshr.s16 q3, q1, #15
vld1.s16 {q6, q7}, [r6@128] ; load round_ptr [0-15]
vld1.s16 {q8, q9}, [r5@128] ; load quant_ptr [0-15]
ldr r4, [r1, #vp8_block_coeff]
vadd.s16 q4, q6 ; x + Round
vadd.s16 q5, q7
vld1.16 {q0, q1}, [r4@128] ; load z2
vqdmulh.s16 q4, q8 ; y = ((Round+abs(z)) * Quant) >> 16
vqdmulh.s16 q5, q9
vabs.s16 q10, q0 ; calculate x2 = abs(z_2)
vabs.s16 q11, q1
vshr.s16 q12, q0, #15 ; sz2
vshr.s16 q13, q1, #15
;modify data to have its original sign
veor.s16 q4, q2 ; y^sz
veor.s16 q5, q3
vadd.s16 q10, q6 ; x2 + Round
vadd.s16 q11, q7
ldr r8, [r2, #vp8_blockd_dequant]
vqdmulh.s16 q10, q8 ; y2 = ((Round+abs(z)) * Quant) >> 16
vqdmulh.s16 q11, q9
vshr.s16 q4, #1 ; right shift 1 after vqdmulh
vshr.s16 q5, #1
vld1.s16 {q6, q7}, [r8@128] ;load dequant_ptr[i]
vsub.s16 q4, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
vsub.s16 q5, q3
vshr.s16 q10, #1 ; right shift 1 after vqdmulh
vshr.s16 q11, #1
ldr r9, [r2, #vp8_blockd_dqcoeff]
veor.s16 q10, q12 ; y2^sz2
veor.s16 q11, q13
vst1.s16 {q4, q5}, [r7] ; store: qcoeff = x1
vsub.s16 q10, q12 ; x2=(y^sz)-sz = (y^sz)-(-1) (2's complement)
vsub.s16 q11, q13
ldr r6, [r3, #vp8_blockd_qcoeff]
vmul.s16 q2, q6, q4 ; x * Dequant
vmul.s16 q3, q7, q5
adr r0, inv_zig_zag ; load ptr of inverse zigzag table
vceq.s16 q8, q8 ; set q8 to all 1
vst1.s16 {q10, q11}, [r6] ; store: qcoeff = x2
vmul.s16 q12, q6, q10 ; x2 * Dequant
vmul.s16 q13, q7, q11
vld1.16 {q6, q7}, [r0@128] ; load inverse scan order
vtst.16 q14, q4, q8 ; now find eob
vtst.16 q15, q5, q8 ; non-zero element is set to all 1
vst1.s16 {q2, q3}, [r9] ; store dqcoeff = x * Dequant
ldr r7, [r3, #vp8_blockd_dqcoeff]
vand q0, q6, q14 ; get all valid numbers from scan array
vand q1, q7, q15
vst1.s16 {q12, q13}, [r7] ; store dqcoeff = x * Dequant
vtst.16 q2, q10, q8 ; now find eob
vtst.16 q3, q11, q8 ; non-zero element is set to all 1
vmax.u16 q0, q0, q1 ; find maximum value in q0, q1
vand q10, q6, q2 ; get all valid numbers from scan array
vand q11, q7, q3
vmax.u16 q10, q10, q11 ; find maximum value in q10, q11
vmax.u16 d0, d0, d1
vmax.u16 d20, d20, d21
vmovl.u16 q0, d0
vmovl.u16 q10, d20
vmax.u32 d0, d0, d1
vmax.u32 d20, d20, d21
vpmax.u32 d0, d0, d0
vpmax.u32 d20, d20, d20
ldr r4, [r2, #vp8_blockd_eob]
ldr r5, [r3, #vp8_blockd_eob]
vst1.8 {d0[0]}, [r4] ; store eob
vst1.8 {d20[0]}, [r5] ; store eob
vldmia sp!, {q4-q7}
ldmfd sp!, {r4-r9}
bx lr
ENDP
;void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
|vp8_fast_quantize_b_neon| PROC
stmfd sp!, {r4-r7}
ldr r3, [r0, #vp8_block_coeff]
ldr r4, [r0, #vp8_block_quant_fast]
ldr r5, [r0, #vp8_block_round]
vld1.16 {q0, q1}, [r3@128] ; load z
vorr.s16 q14, q0, q1 ; check if all zero (step 1)
ldr r6, [r1, #vp8_blockd_qcoeff]
ldr r7, [r1, #vp8_blockd_dqcoeff]
vorr.s16 d28, d28, d29 ; check if all zero (step 2)
vabs.s16 q12, q0 ; calculate x = abs(z)
vabs.s16 q13, q1
;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
vshr.s16 q2, q0, #15 ; sz
vmov r2, r3, d28 ; check if all zero (step 3)
vshr.s16 q3, q1, #15
vld1.s16 {q14, q15}, [r5@128]; load round_ptr [0-15]
vld1.s16 {q8, q9}, [r4@128] ; load quant_ptr [0-15]
vadd.s16 q12, q14 ; x + Round
vadd.s16 q13, q15
adr r0, inv_zig_zag ; load ptr of inverse zigzag table
vqdmulh.s16 q12, q8 ; y = ((Round+abs(z)) * Quant) >> 16
vqdmulh.s16 q13, q9
vld1.16 {q10, q11}, [r0@128]; load inverse scan order
vceq.s16 q8, q8 ; set q8 to all 1
ldr r4, [r1, #vp8_blockd_dequant]
vshr.s16 q12, #1 ; right shift 1 after vqdmulh
vshr.s16 q13, #1
ldr r5, [r1, #vp8_blockd_eob]
orr r2, r2, r3 ; check if all zero (step 4)
cmp r2, #0 ; check if all zero (step 5)
beq zero_output ; check if all zero (step 6)
;modify data to have its original sign
veor.s16 q12, q2 ; y^sz
veor.s16 q13, q3
vsub.s16 q12, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
vsub.s16 q13, q3
vld1.s16 {q2, q3}, [r4@128] ; load dequant_ptr[i]
vtst.16 q14, q12, q8 ; now find eob
vtst.16 q15, q13, q8 ; non-zero element is set to all 1
vst1.s16 {q12, q13}, [r6@128]; store: qcoeff = x1
vand q10, q10, q14 ; get all valid numbers from scan array
vand q11, q11, q15
vmax.u16 q0, q10, q11 ; find maximum value in q0, q1
vmax.u16 d0, d0, d1
vmovl.u16 q0, d0
vmul.s16 q2, q12 ; x * Dequant
vmul.s16 q3, q13
vmax.u32 d0, d0, d1
vpmax.u32 d0, d0, d0
vst1.s16 {q2, q3}, [r7@128] ; store dqcoeff = x * Dequant
vst1.8 {d0[0]}, [r5] ; store eob
ldmfd sp!, {r4-r7}
bx lr
zero_output
strb r2, [r5] ; store eob
vst1.s16 {q0, q1}, [r6@128] ; qcoeff = 0
vst1.s16 {q0, q1}, [r7@128] ; dqcoeff = 0
ldmfd sp!, {r4-r7}
bx lr
ENDP
; default inverse zigzag table is defined in vp8/common/entropy.c
ALIGN 16 ; enable use of @128 bit aligned loads
inv_zig_zag
DCW 0x0001, 0x0002, 0x0006, 0x0007
DCW 0x0003, 0x0005, 0x0008, 0x000d
DCW 0x0004, 0x0009, 0x000c, 0x000e
DCW 0x000a, 0x000b, 0x000f, 0x0010
END

Просмотреть файл

@ -0,0 +1,205 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "vp8/encoder/block.h"
#include "vpx_mem/vpx_mem.h"
static const uint16_t inv_zig_zag[16] = {
0x0001, 0x0002, 0x0006, 0x0007,
0x0003, 0x0005, 0x0008, 0x000d,
0x0004, 0x0009, 0x000c, 0x000e,
0x000a, 0x000b, 0x000f, 0x0010
};
void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
const int16x8_t one_q = vdupq_n_s16(0xff),
z0 = vld1q_s16(b->coeff),
z1 = vld1q_s16(b->coeff + 8),
round0 = vld1q_s16(b->round),
round1 = vld1q_s16(b->round + 8),
quant0 = vld1q_s16(b->quant_fast),
quant1 = vld1q_s16(b->quant_fast + 8),
dequant0 = vld1q_s16(d->dequant),
dequant1 = vld1q_s16(d->dequant + 8);
const uint16x8_t zig_zag0 = vld1q_u16(inv_zig_zag),
zig_zag1 = vld1q_u16(inv_zig_zag + 8);
int16x8_t x0, x1, sz0, sz1, y0, y1;
uint16x8_t eob0, eob1;
uint16x4_t eob_d16;
uint32x2_t eob_d32;
uint32x4_t eob_q32;
/* sign of z: z >> 15 */
sz0 = vshrq_n_s16(z0, 15);
sz1 = vshrq_n_s16(z1, 15);
/* x = abs(z) */
x0 = vabsq_s16(z0);
x1 = vabsq_s16(z1);
/* x += round */
x0 = vaddq_s16(x0, round0);
x1 = vaddq_s16(x1, round1);
/* y = 2 * (x * quant) >> 16 */
y0 = vqdmulhq_s16(x0, quant0);
y1 = vqdmulhq_s16(x1, quant1);
/* Compensate for doubling in vqdmulhq */
y0 = vshrq_n_s16(y0, 1);
y1 = vshrq_n_s16(y1, 1);
/* Restore sign bit */
y0 = veorq_s16(y0, sz0);
y1 = veorq_s16(y1, sz1);
x0 = vsubq_s16(y0, sz0);
x1 = vsubq_s16(y1, sz1);
/* find non-zero elements */
eob0 = vtstq_s16(x0, one_q);
eob1 = vtstq_s16(x1, one_q);
/* mask zig zag */
eob0 = vandq_u16(eob0, zig_zag0);
eob1 = vandq_u16(eob1, zig_zag1);
/* select the largest value */
eob0 = vmaxq_u16(eob0, eob1);
eob_d16 = vmax_u16(vget_low_u16(eob0), vget_high_u16(eob0));
eob_q32 = vmovl_u16(eob_d16);
eob_d32 = vmax_u32(vget_low_u32(eob_q32), vget_high_u32(eob_q32));
eob_d32 = vpmax_u32(eob_d32, eob_d32);
/* qcoeff = x */
vst1q_s16(d->qcoeff, x0);
vst1q_s16(d->qcoeff + 8, x1);
/* dqcoeff = x * dequant */
vst1q_s16(d->dqcoeff, vmulq_s16(dequant0, x0));
vst1q_s16(d->dqcoeff + 8, vmulq_s16(dequant1, x1));
vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0);
}
void vp8_fast_quantize_b_pair_neon(BLOCK *b0, BLOCK *b1,
BLOCKD *d0, BLOCKD *d1) {
const int16x8_t one_q = vdupq_n_s16(0xff),
b0_z0 = vld1q_s16(b0->coeff),
b0_z1 = vld1q_s16(b0->coeff + 8),
b0_round0 = vld1q_s16(b0->round),
b0_round1 = vld1q_s16(b0->round + 8),
b0_quant0 = vld1q_s16(b0->quant_fast),
b0_quant1 = vld1q_s16(b0->quant_fast + 8),
d0_dequant0 = vld1q_s16(d0->dequant),
d0_dequant1 = vld1q_s16(d0->dequant + 8),
b1_z0 = vld1q_s16(b1->coeff),
b1_z1 = vld1q_s16(b1->coeff + 8),
b1_round0 = vld1q_s16(b1->round),
b1_round1 = vld1q_s16(b1->round + 8),
b1_quant0 = vld1q_s16(b1->quant_fast),
b1_quant1 = vld1q_s16(b1->quant_fast + 8),
d1_dequant0 = vld1q_s16(d1->dequant),
d1_dequant1 = vld1q_s16(d1->dequant + 8);
const uint16x8_t zig_zag0 = vld1q_u16(inv_zig_zag),
zig_zag1 = vld1q_u16(inv_zig_zag + 8);
int16x8_t b0_x0, b0_x1, b0_sz0, b0_sz1, b0_y0, b0_y1,
b1_x0, b1_x1, b1_sz0, b1_sz1, b1_y0, b1_y1;
uint16x8_t b0_eob0, b0_eob1,
b1_eob0, b1_eob1;
uint16x4_t b0_eob_d16, b1_eob_d16;
uint32x2_t b0_eob_d32, b1_eob_d32;
uint32x4_t b0_eob_q32, b1_eob_q32;
/* sign of z: z >> 15 */
b0_sz0 = vshrq_n_s16(b0_z0, 15);
b0_sz1 = vshrq_n_s16(b0_z1, 15);
b1_sz0 = vshrq_n_s16(b1_z0, 15);
b1_sz1 = vshrq_n_s16(b1_z1, 15);
/* x = abs(z) */
b0_x0 = vabsq_s16(b0_z0);
b0_x1 = vabsq_s16(b0_z1);
b1_x0 = vabsq_s16(b1_z0);
b1_x1 = vabsq_s16(b1_z1);
/* x += round */
b0_x0 = vaddq_s16(b0_x0, b0_round0);
b0_x1 = vaddq_s16(b0_x1, b0_round1);
b1_x0 = vaddq_s16(b1_x0, b1_round0);
b1_x1 = vaddq_s16(b1_x1, b1_round1);
/* y = 2 * (x * quant) >> 16 */
b0_y0 = vqdmulhq_s16(b0_x0, b0_quant0);
b0_y1 = vqdmulhq_s16(b0_x1, b0_quant1);
b1_y0 = vqdmulhq_s16(b1_x0, b1_quant0);
b1_y1 = vqdmulhq_s16(b1_x1, b1_quant1);
/* Compensate for doubling in vqdmulhq */
b0_y0 = vshrq_n_s16(b0_y0, 1);
b0_y1 = vshrq_n_s16(b0_y1, 1);
b1_y0 = vshrq_n_s16(b1_y0, 1);
b1_y1 = vshrq_n_s16(b1_y1, 1);
/* Restore sign bit */
b0_y0 = veorq_s16(b0_y0, b0_sz0);
b0_y1 = veorq_s16(b0_y1, b0_sz1);
b0_x0 = vsubq_s16(b0_y0, b0_sz0);
b0_x1 = vsubq_s16(b0_y1, b0_sz1);
b1_y0 = veorq_s16(b1_y0, b1_sz0);
b1_y1 = veorq_s16(b1_y1, b1_sz1);
b1_x0 = vsubq_s16(b1_y0, b1_sz0);
b1_x1 = vsubq_s16(b1_y1, b1_sz1);
/* find non-zero elements */
b0_eob0 = vtstq_s16(b0_x0, one_q);
b0_eob1 = vtstq_s16(b0_x1, one_q);
b1_eob0 = vtstq_s16(b1_x0, one_q);
b1_eob1 = vtstq_s16(b1_x1, one_q);
/* mask zig zag */
b0_eob0 = vandq_u16(b0_eob0, zig_zag0);
b0_eob1 = vandq_u16(b0_eob1, zig_zag1);
b1_eob0 = vandq_u16(b1_eob0, zig_zag0);
b1_eob1 = vandq_u16(b1_eob1, zig_zag1);
/* select the largest value */
b0_eob0 = vmaxq_u16(b0_eob0, b0_eob1);
b0_eob_d16 = vmax_u16(vget_low_u16(b0_eob0),
vget_high_u16(b0_eob0));
b0_eob_q32 = vmovl_u16(b0_eob_d16);
b0_eob_d32 = vmax_u32(vget_low_u32(b0_eob_q32),
vget_high_u32(b0_eob_q32));
b0_eob_d32 = vpmax_u32(b0_eob_d32, b0_eob_d32);
b1_eob0 = vmaxq_u16(b1_eob0, b1_eob1);
b1_eob_d16 = vmax_u16(vget_low_u16(b1_eob0),
vget_high_u16(b1_eob0));
b1_eob_q32 = vmovl_u16(b1_eob_d16);
b1_eob_d32 = vmax_u32(vget_low_u32(b1_eob_q32),
vget_high_u32(b1_eob_q32));
b1_eob_d32 = vpmax_u32(b1_eob_d32, b1_eob_d32);
/* qcoeff = x */
vst1q_s16(d0->qcoeff, b0_x0);
vst1q_s16(d0->qcoeff + 8, b0_x1);
vst1q_s16(d1->qcoeff, b1_x0);
vst1q_s16(d1->qcoeff + 8, b1_x1);
/* dqcoeff = x * dequant */
vst1q_s16(d0->dqcoeff, vmulq_s16(d0_dequant0, b0_x0));
vst1q_s16(d0->dqcoeff + 8, vmulq_s16(d0_dequant1, b0_x1));
vst1q_s16(d1->dqcoeff, vmulq_s16(d1_dequant0, b1_x0));
vst1q_s16(d1->dqcoeff + 8, vmulq_s16(d1_dequant1, b1_x1));
vst1_lane_s8((int8_t *)d0->eob, vreinterpret_s8_u32(b0_eob_d32), 0);
vst1_lane_s8((int8_t *)d1->eob, vreinterpret_s8_u32(b1_eob_d32), 0);
return;
}

Просмотреть файл

@ -35,9 +35,8 @@ VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/walsh_v6$(ASM)
#File list for neon
# encoder
VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/fastquantizeb_neon$(ASM)
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/denoising_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/fastquantizeb_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/shortfdct_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/subtract_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_mse16x16_neon.c