Adds armv6 optimized variance calculation

Adds the vp8_sub_pixel_variance16x16_armv6 function to the encoder. Integrates
the ARMv6 optimized bilinear interpolations from vp8/common/arm/armv6 and adds
a new assembly file for the variance16x16 calculation.

 - vp8_filter_block2d_bil_first_pass_armv6  (integrated)
 - vp8_filter_block2d_bil_second_pass_armv6 (integrated)
 - vp8_variance16x16_armv6                  (new)
 - bilinearfilter_arm.h                     (new)

Change-Id: I18a8331ce7d031ceedd6cd415ecacb0c8f3392db
Parent: e5aaac24bb
Commit: cb14764fab
@@ -12,26 +12,7 @@
 #include <math.h>
 #include "filter.h"
 #include "subpixel.h"
+#include "arm/bilinearfilter_arm.h"
-
-extern void vp8_filter_block2d_bil_first_pass_armv6
-(
-    unsigned char  *src_ptr,
-    unsigned short *dst_ptr,
-    unsigned int    src_pitch,
-    unsigned int    height,
-    unsigned int    width,
-    const short    *vp8_filter
-);
-
-extern void vp8_filter_block2d_bil_second_pass_armv6
-(
-    unsigned short *src_ptr,
-    unsigned char  *dst_ptr,
-    int             dst_pitch,
-    unsigned int    height,
-    unsigned int    width,
-    const short    *vp8_filter
-);
-
 void vp8_filter_block2d_bil_armv6
 (
@@ -0,0 +1,35 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef BILINEARFILTER_ARM_H
+#define BILINEARFILTER_ARM_H
+
+extern void vp8_filter_block2d_bil_first_pass_armv6
+(
+    const unsigned char  *src_ptr,
+    unsigned short       *dst_ptr,
+    unsigned int          src_pitch,
+    unsigned int          height,
+    unsigned int          width,
+    const short          *vp8_filter
+);
+
+extern void vp8_filter_block2d_bil_second_pass_armv6
+(
+    const unsigned short *src_ptr,
+    unsigned char        *dst_ptr,
+    int                   dst_pitch,
+    unsigned int          height,
+    unsigned int          width,
+    const short          *vp8_filter
+);
+
+#endif /* BILINEARFILTER_ARM_H */
@@ -38,14 +38,14 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
         /*cpi->rtcd.variance.var4x4 = vp8_variance4x4_c;
         cpi->rtcd.variance.var8x8 = vp8_variance8x8_c;
         cpi->rtcd.variance.var8x16 = vp8_variance8x16_c;
-        cpi->rtcd.variance.var16x8 = vp8_variance16x8_c;
-        cpi->rtcd.variance.var16x16 = vp8_variance16x16_c;*/
+        cpi->rtcd.variance.var16x8 = vp8_variance16x8_c;*/
+        cpi->rtcd.variance.var16x16 = vp8_variance16x16_armv6;

         /*cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_c;
         cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_c;
         cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_c;
-        cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_c;
-        cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_c;*/
+        cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_c;*/
+        cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_armv6;

         /*cpi->rtcd.variance.mse16x16 = vp8_mse16x16_c;
         cpi->rtcd.variance.getmbss = vp8_get_mb_ss_c;*/
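For context: the assignments above only take effect in builds with runtime CPU detection, where the encoder fills a table of function pointers at init time and all variance calls go through that table. Below is a minimal, self-contained illustration of that dispatch pattern; the struct and names are invented for the sketch and are not vp8's actual structures.

#include <stdio.h>

/* Minimal stand-in for cpi->rtcd.variance: a table of function pointers
 * filled in once at init time with the best routine for the target CPU. */
typedef unsigned int (*variance_fn_t)(const unsigned char *src, int src_stride,
                                      const unsigned char *ref, int ref_stride,
                                      unsigned int *sse);

struct variance_hooks
{
    variance_fn_t var16x16;
};

/* Placeholder implementation; an ARMv6 build would install
 * vp8_variance16x16_armv6 here instead (see the scalar sketch further down). */
static unsigned int var16x16_stub(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse)
{
    (void)src; (void)src_stride; (void)ref; (void)ref_stride;
    *sse = 0;
    return 0;
}

int main(void)
{
    unsigned char src[16 * 16] = {0}, ref[16 * 16] = {0};
    unsigned int sse;
    struct variance_hooks hooks;

    hooks.var16x16 = var16x16_stub;                     /* done once at encoder init */
    printf("%u\n", hooks.var16x16(src, 16, ref, 16, &sse));  /* call site */
    return 0;
}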
@@ -0,0 +1,147 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS. All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_variance16x16_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int recon_stride
+; stack unsigned int *sse
+|vp8_variance16x16_armv6| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     r8, #0              ; initialize sum = 0
+    mov     r11, #0             ; initialize sse = 0
+
+loop
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0x0]      ; load 4 src pixels
+    ldr     r5, [r2, #0x0]      ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #0x4]      ; load 4 src pixels
+    ldr     r5, [r2, #0x4]      ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #0x8]      ; load 4 src pixels
+    ldr     r5, [r2, #0x8]      ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #0xc]      ; load 4 src pixels
+    ldr     r5, [r2, #0xc]      ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+
+    subs    r12, r12, #1
+
+    bne     loop
+
+    ; return stuff
+    ldr     r6, [sp, #0x28]     ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, ASR #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+    END
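To make the register-level flow above easier to follow, here is a scalar C sketch of the same computation (not part of the commit): per-pixel differences are accumulated into a signed sum (r8 in the assembly) and a sum of squares (r11), and the routine returns sse - sum*sum/256, exactly as the final "return (sse - ((sum * sum) >> 8))" comment states. The int64_t product is a small liberty taken here to avoid 32-bit overflow in the sketch; the assembly forms the product in 32 bits.

#include <stdio.h>
#include <stdint.h>

/* Scalar equivalent of vp8_variance16x16_armv6, for illustration only.
 * The assembly processes each row as four groups of 4 pixels, using
 * usub8/sel/usad8 for the sums and uxtb16/smlad for the squares. */
static unsigned int variance16x16_ref(const unsigned char *src, int src_stride,
                                      const unsigned char *ref, int ref_stride,
                                      unsigned int *sse)
{
    int sum = 0;            /* signed sum of differences (r8)    */
    unsigned int sq = 0;    /* sum of squared differences (r11)  */
    int r, c;

    for (r = 0; r < 16; r++)
    {
        for (c = 0; c < 16; c++)
        {
            int diff = src[c] - ref[c];
            sum += diff;
            sq  += (unsigned int)(diff * diff);
        }
        src += src_stride;
        ref += ref_stride;
    }

    *sse = sq;
    /* a 16x16 block has 256 pixels, hence the shift by 8 */
    return sq - (unsigned int)(((int64_t)sum * sum) >> 8);
}

int main(void)
{
    unsigned char src[16 * 16], ref[16 * 16];
    unsigned int sse, var;
    int i;

    for (i = 0; i < 16 * 16; i++)
    {
        src[i] = (unsigned char)(i & 0xff);
        ref[i] = (unsigned char)((i * 3) & 0xff);
    }

    var = variance16x16_ref(src, 16, ref, 16, &sse);
    printf("variance = %u, sse = %u\n", var, sse);
    return 0;
}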
@@ -10,6 +10,40 @@

 #include "vpx_config.h"
 #include "variance.h"
+#include "filter.h"
+#include "arm/bilinearfilter_arm.h"
+
+#if HAVE_ARMV6
+
+unsigned int vp8_sub_pixel_variance16x16_armv6
+(
+    const unsigned char *src_ptr,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    const unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    unsigned short first_pass[36*16];
+    unsigned char  second_pass[20*16];
+    const short *HFilter, *VFilter;
+
+    HFilter = vp8_bilinear_filters[xoffset];
+    VFilter = vp8_bilinear_filters[yoffset];
+
+    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
+                                            src_pixels_per_line,
+                                            17, 16, HFilter);
+    vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
+                                             16, 16, 16, VFilter);
+
+    return vp8_variance16x16_armv6(second_pass, 16, dst_ptr,
+                                   dst_pixels_per_line, sse);
+}
+
+#endif
+
 #if HAVE_ARMV7
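A note on the buffers above: the first pass filters horizontally and, for a 16x16 block, emits 17 rows of 16 intermediate values (height + 1 rows, because the vertical second pass needs one extra row below the block). The sketch below shows the shape of the two passes in scalar C; the rounding constant 64 and the shift by 7 follow VP8's generic bilinear filter and are stated here as assumptions rather than taken from this diff.

/* Rough scalar model of the two ARMv6 filter passes used above
 * (illustration only; the real routines are the armv6 assembly). */
void bil_first_pass_sketch(const unsigned char *src, unsigned short *dst,
                           unsigned int src_pitch, unsigned int height,
                           unsigned int width, const short *filter)
{
    unsigned int i, j;

    for (i = 0; i < height; i++)     /* 17 rows for a 16x16 block */
    {
        for (j = 0; j < width; j++)  /* horizontal bilinear tap pair */
            dst[j] = (unsigned short)((src[j] * filter[0] +
                                       src[j + 1] * filter[1] + 64) >> 7);
        src += src_pitch;
        dst += width;                /* intermediate rows are stored densely */
    }
}

void bil_second_pass_sketch(const unsigned short *src, unsigned char *dst,
                            int dst_pitch, unsigned int height,
                            unsigned int width, const short *filter)
{
    unsigned int i, j;

    for (i = 0; i < height; i++)     /* 16 output rows */
    {
        for (j = 0; j < width; j++)  /* vertical bilinear tap pair */
            dst[j] = (unsigned char)((src[j] * filter[0] +
                                      src[j + width] * filter[1] + 64) >> 7);
        src += width;
        dst += dst_pitch;
    }
}

With these shapes, the call sequence above reads as: filter 17x16 horizontally into first_pass, filter that down to 16x16 vertically into second_pass, then measure the variance of second_pass against the reference block.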
@@ -12,6 +12,23 @@
 #ifndef VARIANCE_ARM_H
 #define VARIANCE_ARM_H

+#if HAVE_ARMV6
+
+extern prototype_variance(vp8_variance16x16_armv6);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_armv6);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef vp8_variance_subpixvar16x16
+#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_armv6
+
+#undef vp8_variance_var16x16
+#define vp8_variance_var16x16 vp8_variance16x16_armv6
+
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
+
+#endif /* HAVE_ARMV6 */
+
 #if HAVE_ARMV7
 extern prototype_sad(vp8_sad4x4_neon);
 extern prototype_sad(vp8_sad8x8_neon);
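For readers unfamiliar with the rtcd headers: prototype_variance and prototype_subpixvariance are declaration macros from the encoder's variance.h, and the #undef/#define pairs statically bind the generic hook names to the ARMv6 routines when runtime CPU detection is compiled out. Their expansion is roughly the following; the parameter names are illustrative and not copied from this change.

/* Assumed shape of the declaration macros referenced above. */
#define prototype_variance(sym)                               \
    unsigned int (sym)(const unsigned char *src_ptr,          \
                       int source_stride,                     \
                       const unsigned char *ref_ptr,          \
                       int ref_stride,                        \
                       unsigned int *sse)

#define prototype_subpixvariance(sym)                         \
    unsigned int (sym)(const unsigned char *src_ptr,          \
                       int source_stride,                     \
                       int xoffset,                           \
                       int yoffset,                           \
                       const unsigned char *ref_ptr,          \
                       int ref_stride,                        \
                       unsigned int *sse)

/* So the first extern line above declares, in effect:
 * unsigned int vp8_variance16x16_armv6(const unsigned char *, int,
 *                                      const unsigned char *, int,
 *                                      unsigned int *);
 */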
@@ -116,6 +116,7 @@ VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/arm_systemdependent.c

 # common (c)
 VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/bilinearfilter_arm.c
+VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/bilinearfilter_arm.h
 VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/filter_arm.c
 VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/loopfilter_arm.c
 VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/reconintra_arm.c
@@ -17,9 +17,10 @@ VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/arm_csystemdependent.c
 VP8_CX_SRCS-$(ARCH_ARM) += encoder/asm_enc_offsets.c

 VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/encodemb_arm.c
-VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/variance_arm.c
 VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/quantize_arm.c
 VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/picklpf_arm.c
+VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/variance_arm.c
+VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/variance_arm.h
 VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/boolhuff_arm.c

 VP8_CX_SRCS_REMOVE-$(HAVE_ARMV5TE) += encoder/boolhuff.c

@@ -33,6 +34,7 @@ VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/armv5te/vp8_packtokens_partitions_ar

 #File list for armv6
 # encoder
+VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/walsh_v6$(ASM)

 #File list for neon