Merge "Update armv6 loopfilter to new interface"

This commit is contained in:
Johann 2011-07-13 04:09:55 -07:00 коммит произвёл Code Review
Родитель 1a219c22b1 c231b0175d
Коммит 8f910594bd
5 изменённых файлов: 70 добавлений и 87 удалений

Просмотреть файл

@ -54,9 +54,11 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_armv6; rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_armv6;
rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_armv6; rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_armv6;
rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_armv6; rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_armv6;
rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_armv6; rtcd->loopfilter.simple_mb_v =
vp8_loop_filter_simple_vertical_edge_armv6;
rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_armv6; rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_armv6;
rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_armv6; rtcd->loopfilter.simple_mb_h =
vp8_loop_filter_simple_horizontal_edge_armv6;
rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_armv6; rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_armv6;
rtcd->recon.copy16x16 = vp8_copy_mem16x16_v6; rtcd->recon.copy16x16 = vp8_copy_mem16x16_v6;

Просмотреть файл

@ -53,14 +53,11 @@ count RN r5
;r0 unsigned char *src_ptr, ;r0 unsigned char *src_ptr,
;r1 int src_pixel_step, ;r1 int src_pixel_step,
;r2 const char *flimit, ;r2 const char *blimit,
;r3 const char *limit, ;r3 const char *limit,
;stack const char *thresh, ;stack const char *thresh,
;stack int count ;stack int count
;Note: All 16 elements in flimit are equal. So, in the code, only one load is needed
;for flimit. Same way applies to limit and thresh.
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_loop_filter_horizontal_edge_armv6| PROC |vp8_loop_filter_horizontal_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
@ -72,14 +69,18 @@ count RN r5
sub sp, sp, #16 ; create temp buffer sub sp, sp, #16 ; create temp buffer
ldr r9, [src], pstep ; p3 ldr r9, [src], pstep ; p3
ldr r4, [r2], #4 ; flimit ldrb r4, [r2] ; blimit
ldr r10, [src], pstep ; p2 ldr r10, [src], pstep ; p2
ldr r2, [r3], #4 ; limit ldrb r2, [r3] ; limit
ldr r11, [src], pstep ; p1 ldr r11, [src], pstep ; p1
uadd8 r4, r4, r4 ; flimit * 2 orr r4, r4, r4, lsl #8
ldr r3, [r6], #4 ; thresh ldrb r3, [r6] ; thresh
orr r2, r2, r2, lsl #8
mov count, count, lsl #1 ; 4-in-parallel mov count, count, lsl #1 ; 4-in-parallel
uadd8 r4, r4, r2 ; flimit * 2 + limit orr r4, r4, r4, lsl #16
orr r3, r3, r3, lsl #8
orr r2, r2, r2, lsl #16
orr r3, r3, r3, lsl #16
|Hnext8| |Hnext8|
; vp8_filter_mask() function ; vp8_filter_mask() function
@ -275,14 +276,18 @@ count RN r5
sub sp, sp, #16 ; create temp buffer sub sp, sp, #16 ; create temp buffer
ldr r9, [src], pstep ; p3 ldr r9, [src], pstep ; p3
ldr r4, [r2], #4 ; flimit ldrb r4, [r2] ; blimit
ldr r10, [src], pstep ; p2 ldr r10, [src], pstep ; p2
ldr r2, [r3], #4 ; limit ldrb r2, [r3] ; limit
ldr r11, [src], pstep ; p1 ldr r11, [src], pstep ; p1
uadd8 r4, r4, r4 ; flimit * 2 orr r4, r4, r4, lsl #8
ldr r3, [r6], #4 ; thresh ldrb r3, [r6] ; thresh
orr r2, r2, r2, lsl #8
mov count, count, lsl #1 ; 4-in-parallel mov count, count, lsl #1 ; 4-in-parallel
uadd8 r4, r4, r2 ; flimit * 2 + limit orr r4, r4, r4, lsl #16
orr r3, r3, r3, lsl #8
orr r2, r2, r2, lsl #16
orr r3, r3, r3, lsl #16
|MBHnext8| |MBHnext8|
@ -584,15 +589,19 @@ count RN r5
sub sp, sp, #16 ; create temp buffer sub sp, sp, #16 ; create temp buffer
ldr r6, [src], pstep ; load source data ldr r6, [src], pstep ; load source data
ldr r4, [r2], #4 ; flimit ldrb r4, [r2] ; blimit
ldr r7, [src], pstep ldr r7, [src], pstep
ldr r2, [r3], #4 ; limit ldrb r2, [r3] ; limit
ldr r8, [src], pstep ldr r8, [src], pstep
uadd8 r4, r4, r4 ; flimit * 2 orr r4, r4, r4, lsl #8
ldr r3, [r12], #4 ; thresh ldrb r3, [r12] ; thresh
orr r2, r2, r2, lsl #8
ldr lr, [src], pstep ldr lr, [src], pstep
mov count, count, lsl #1 ; 4-in-parallel mov count, count, lsl #1 ; 4-in-parallel
uadd8 r4, r4, r2 ; flimit * 2 + limit orr r4, r4, r4, lsl #16
orr r3, r3, r3, lsl #8
orr r2, r2, r2, lsl #16
orr r3, r3, r3, lsl #16
|Vnext8| |Vnext8|
@ -855,18 +864,22 @@ count RN r5
sub sp, sp, #16 ; create temp buffer sub sp, sp, #16 ; create temp buffer
ldr r6, [src], pstep ; load source data ldr r6, [src], pstep ; load source data
ldr r4, [r2], #4 ; flimit ldrb r4, [r2] ; blimit
pld [src, #23] pld [src, #23]
ldr r7, [src], pstep ldr r7, [src], pstep
ldr r2, [r3], #4 ; limit ldrb r2, [r3] ; limit
pld [src, #23] pld [src, #23]
ldr r8, [src], pstep ldr r8, [src], pstep
uadd8 r4, r4, r4 ; flimit * 2 orr r4, r4, r4, lsl #8
ldr r3, [r12], #4 ; thresh ldrb r3, [r12] ; thresh
orr r2, r2, r2, lsl #8
pld [src, #23] pld [src, #23]
ldr lr, [src], pstep ldr lr, [src], pstep
mov count, count, lsl #1 ; 4-in-parallel mov count, count, lsl #1 ; 4-in-parallel
uadd8 r4, r4, r2 ; flimit * 2 + limit orr r4, r4, r4, lsl #16
orr r3, r3, r3, lsl #8
orr r2, r2, r2, lsl #16
orr r3, r3, r3, lsl #16
|MBVnext8| |MBVnext8|
; vp8_filter_mask() function ; vp8_filter_mask() function
@ -906,6 +919,7 @@ count RN r5
str lr, [sp, #8] str lr, [sp, #8]
ldr lr, [src], pstep ldr lr, [src], pstep
TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
ldr lr, [sp, #8] ; load back (f)limit accumulator ldr lr, [sp, #8] ; load back (f)limit accumulator
@ -954,6 +968,7 @@ count RN r5
beq mbvskip_filter ; skip filtering beq mbvskip_filter ; skip filtering
;vp8_hevmask() function ;vp8_hevmask() function
;calculate high edge variance ;calculate high edge variance
@ -1121,6 +1136,7 @@ count RN r5
smlabb r8, r6, lr, r7 smlabb r8, r6, lr, r7
smlatb r6, r6, lr, r7 smlatb r6, r6, lr, r7
smlabb r9, r10, lr, r7 smlabb r9, r10, lr, r7
smlatb r10, r10, lr, r7 smlatb r10, r10, lr, r7
ssat r8, #8, r8, asr #7 ssat r8, #8, r8, asr #7
ssat r6, #8, r6, asr #7 ssat r6, #8, r6, asr #7

Просмотреть файл

@ -45,35 +45,28 @@
MEND MEND
src RN r0 src RN r0
pstep RN r1 pstep RN r1
;r0 unsigned char *src_ptr, ;r0 unsigned char *src_ptr,
;r1 int src_pixel_step, ;r1 int src_pixel_step,
;r2 const char *flimit, ;r2 const char *blimit
;r3 const char *limit,
;stack const char *thresh,
;stack int count
; All 16 elements in flimit are equal. So, in the code, only one load is needed
; for flimit. Same applies to limit. thresh is not used in simple loopfilter
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_loop_filter_simple_horizontal_edge_armv6| PROC |vp8_loop_filter_simple_horizontal_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
stmdb sp!, {r4 - r11, lr} stmdb sp!, {r4 - r11, lr}
ldr r12, [r3] ; limit ldrb r12, [r2] ; blimit
ldr r3, [src, -pstep, lsl #1] ; p1 ldr r3, [src, -pstep, lsl #1] ; p1
ldr r4, [src, -pstep] ; p0 ldr r4, [src, -pstep] ; p0
ldr r5, [src] ; q0 ldr r5, [src] ; q0
ldr r6, [src, pstep] ; q1 ldr r6, [src, pstep] ; q1
ldr r7, [r2] ; flimit orr r12, r12, r12, lsl #8 ; blimit
ldr r2, c0x80808080 ldr r2, c0x80808080
ldr r9, [sp, #40] ; count for 8-in-parallel orr r12, r12, r12, lsl #16 ; blimit
uadd8 r7, r7, r7 ; flimit * 2 mov r9, #4 ; fixed loop count of 4. we're doing 4 at a time
mov r9, r9, lsl #1 ; double the count. we're doing 4 at a time
uadd8 r12, r7, r12 ; flimit * 2 + limit
mov lr, #0 ; need 0 in a couple places mov lr, #0 ; need 0 in a couple places
|simple_hnext8| |simple_hnext8|
@ -148,34 +141,32 @@ pstep RN r1
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
stmdb sp!, {r4 - r11, lr} stmdb sp!, {r4 - r11, lr}
ldr r12, [r2] ; r12: flimit ldrb r12, [r2] ; r12: blimit
ldr r2, c0x80808080 ldr r2, c0x80808080
ldr r7, [r3] ; limit orr r12, r12, r12, lsl #8
; load source data to r7, r8, r9, r10 ; load source data to r7, r8, r9, r10
ldrh r3, [src, #-2] ldrh r3, [src, #-2]
pld [src, #23] ; preload for next block pld [src, #23] ; preload for next block
ldrh r4, [src], pstep ldrh r4, [src], pstep
uadd8 r12, r12, r12 ; flimit * 2 orr r12, r12, r12, lsl #16
ldrh r5, [src, #-2] ldrh r5, [src, #-2]
pld [src, #23] pld [src, #23]
ldrh r6, [src], pstep ldrh r6, [src], pstep
uadd8 r12, r12, r7 ; flimit * 2 + limit
pkhbt r7, r3, r4, lsl #16 pkhbt r7, r3, r4, lsl #16
ldrh r3, [src, #-2] ldrh r3, [src, #-2]
pld [src, #23] pld [src, #23]
ldrh r4, [src], pstep ldrh r4, [src], pstep
ldr r11, [sp, #40] ; count (r11) for 8-in-parallel
pkhbt r8, r5, r6, lsl #16 pkhbt r8, r5, r6, lsl #16
ldrh r5, [src, #-2] ldrh r5, [src, #-2]
pld [src, #23] pld [src, #23]
ldrh r6, [src], pstep ldrh r6, [src], pstep
mov r11, r11, lsl #1 ; 4-in-parallel mov r11, #4 ; fixed loop count of 4. we're doing 4 at a time
|simple_vnext8| |simple_vnext8|
; vp8_simple_filter_mask() function ; vp8_simple_filter_mask() function

Просмотреть файл

@ -18,8 +18,6 @@ extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6);
extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6); extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6);
extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6); extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6);
extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6); extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6);
extern prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6);
extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6);
#endif #endif
#if HAVE_ARMV7 #if HAVE_ARMV7
@ -55,15 +53,6 @@ void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig
vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
} }
void vp8_loop_filter_mbhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
}
/* Vertical MB Filtering */ /* Vertical MB Filtering */
void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi) int y_stride, int uv_stride, loop_filter_info *lfi)
@ -77,15 +66,6 @@ void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig
vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
} }
void vp8_loop_filter_mbvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
vp8_loop_filter_simple_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
}
/* Horizontal B Filtering */ /* Horizontal B Filtering */
void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi) int y_stride, int uv_stride, loop_filter_info *lfi)
@ -101,15 +81,12 @@ void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsign
vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
} }
void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride,
int y_stride, int uv_stride, loop_filter_info *lfi) const unsigned char *blimit)
{ {
(void) u_ptr; vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, blimit);
(void) v_ptr; vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, blimit);
(void) uv_stride; vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, blimit);
vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
} }
/* Vertical B Filtering */ /* Vertical B Filtering */
@ -127,15 +104,12 @@ void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsign
vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
} }
void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride,
int y_stride, int uv_stride, loop_filter_info *lfi) const unsigned char *blimit)
{ {
(void) u_ptr; vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit);
(void) v_ptr; vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit);
(void) uv_stride; vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit);
vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
} }
#endif #endif

Просмотреть файл

@ -19,10 +19,10 @@ extern prototype_loopfilter_block(vp8_loop_filter_mbv_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_bv_armv6); extern prototype_loopfilter_block(vp8_loop_filter_bv_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_mbh_armv6); extern prototype_loopfilter_block(vp8_loop_filter_mbh_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_bh_armv6); extern prototype_loopfilter_block(vp8_loop_filter_bh_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_mbvs_armv6); extern prototype_simple_loopfilter(vp8_loop_filter_bvs_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_bvs_armv6); extern prototype_simple_loopfilter(vp8_loop_filter_bhs_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_mbhs_armv6); extern prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_bhs_armv6); extern prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6);
#if !CONFIG_RUNTIME_CPU_DETECT #if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_lf_normal_mb_v #undef vp8_lf_normal_mb_v
@ -38,13 +38,13 @@ extern prototype_loopfilter_block(vp8_loop_filter_bhs_armv6);
#define vp8_lf_normal_b_h vp8_loop_filter_bh_armv6 #define vp8_lf_normal_b_h vp8_loop_filter_bh_armv6
#undef vp8_lf_simple_mb_v #undef vp8_lf_simple_mb_v
#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_armv6 #define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_armv6
#undef vp8_lf_simple_b_v #undef vp8_lf_simple_b_v
#define vp8_lf_simple_b_v vp8_loop_filter_bvs_armv6 #define vp8_lf_simple_b_v vp8_loop_filter_bvs_armv6
#undef vp8_lf_simple_mb_h #undef vp8_lf_simple_mb_h
#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_armv6 #define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_armv6
#undef vp8_lf_simple_b_h #undef vp8_lf_simple_b_h
#define vp8_lf_simple_b_h vp8_loop_filter_bhs_armv6 #define vp8_lf_simple_b_h vp8_loop_filter_bhs_armv6