diff --git a/vp8/common/arm/arm_systemdependent.c b/vp8/common/arm/arm_systemdependent.c index bd5c0759d..8896cf03f 100644 --- a/vp8/common/arm/arm_systemdependent.c +++ b/vp8/common/arm/arm_systemdependent.c @@ -51,9 +51,11 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx) rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_armv6; rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_armv6; rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_armv6; - rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_armv6; + rtcd->loopfilter.simple_mb_v = + vp8_loop_filter_simple_vertical_edge_armv6; rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_armv6; - rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_armv6; + rtcd->loopfilter.simple_mb_h = + vp8_loop_filter_simple_horizontal_edge_armv6; rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_armv6; rtcd->recon.copy16x16 = vp8_copy_mem16x16_v6; diff --git a/vp8/common/arm/armv6/loopfilter_v6.asm b/vp8/common/arm/armv6/loopfilter_v6.asm index c7441b055..1cbbbcdef 100644 --- a/vp8/common/arm/armv6/loopfilter_v6.asm +++ b/vp8/common/arm/armv6/loopfilter_v6.asm @@ -53,14 +53,11 @@ count RN r5 ;r0 unsigned char *src_ptr, ;r1 int src_pixel_step, -;r2 const char *flimit, +;r2 const char *blimit, ;r3 const char *limit, ;stack const char *thresh, ;stack int count -;Note: All 16 elements in flimit are equal. So, in the code, only one load is needed -;for flimit. Same way applies to limit and thresh. 
- ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- |vp8_loop_filter_horizontal_edge_armv6| PROC ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- @@ -72,14 +69,18 @@ count RN r5 sub sp, sp, #16 ; create temp buffer ldr r9, [src], pstep ; p3 - ldr r4, [r2], #4 ; flimit + ldrb r4, [r2] ; blimit ldr r10, [src], pstep ; p2 - ldr r2, [r3], #4 ; limit + ldrb r2, [r3] ; limit ldr r11, [src], pstep ; p1 - uadd8 r4, r4, r4 ; flimit * 2 - ldr r3, [r6], #4 ; thresh + orr r4, r4, r4, lsl #8 + ldrb r3, [r6] ; thresh + orr r2, r2, r2, lsl #8 mov count, count, lsl #1 ; 4-in-parallel - uadd8 r4, r4, r2 ; flimit * 2 + limit + orr r4, r4, r4, lsl #16 + orr r3, r3, r3, lsl #8 + orr r2, r2, r2, lsl #16 + orr r3, r3, r3, lsl #16 |Hnext8| ; vp8_filter_mask() function @@ -275,14 +276,18 @@ count RN r5 sub sp, sp, #16 ; create temp buffer ldr r9, [src], pstep ; p3 - ldr r4, [r2], #4 ; flimit + ldrb r4, [r2] ; blimit ldr r10, [src], pstep ; p2 - ldr r2, [r3], #4 ; limit + ldrb r2, [r3] ; limit ldr r11, [src], pstep ; p1 - uadd8 r4, r4, r4 ; flimit * 2 - ldr r3, [r6], #4 ; thresh + orr r4, r4, r4, lsl #8 + ldrb r3, [r6] ; thresh + orr r2, r2, r2, lsl #8 mov count, count, lsl #1 ; 4-in-parallel - uadd8 r4, r4, r2 ; flimit * 2 + limit + orr r4, r4, r4, lsl #16 + orr r3, r3, r3, lsl #8 + orr r2, r2, r2, lsl #16 + orr r3, r3, r3, lsl #16 |MBHnext8| @@ -584,15 +589,19 @@ count RN r5 sub sp, sp, #16 ; create temp buffer ldr r6, [src], pstep ; load source data - ldr r4, [r2], #4 ; flimit + ldrb r4, [r2] ; blimit ldr r7, [src], pstep - ldr r2, [r3], #4 ; limit + ldrb r2, [r3] ; limit ldr r8, [src], pstep - uadd8 r4, r4, r4 ; flimit * 2 - ldr r3, [r12], #4 ; thresh + orr r4, r4, r4, lsl #8 + ldrb r3, [r12] ; thresh + orr r2, r2, r2, lsl #8 ldr lr, [src], pstep mov count, count, lsl #1 ; 4-in-parallel - uadd8 r4, r4, r2 ; flimit * 2 + limit + orr r4, r4, r4, lsl #16 + orr r3, r3, r3, lsl #8 + orr r2, r2, r2, lsl #16 + orr r3, r3, r3, lsl #16 
|Vnext8| @@ -855,18 +864,22 @@ count RN r5 sub sp, sp, #16 ; create temp buffer ldr r6, [src], pstep ; load source data - ldr r4, [r2], #4 ; flimit + ldrb r4, [r2] ; blimit pld [src, #23] ldr r7, [src], pstep - ldr r2, [r3], #4 ; limit + ldrb r2, [r3] ; limit pld [src, #23] ldr r8, [src], pstep - uadd8 r4, r4, r4 ; flimit * 2 - ldr r3, [r12], #4 ; thresh + orr r4, r4, r4, lsl #8 + ldrb r3, [r12] ; thresh + orr r2, r2, r2, lsl #8 pld [src, #23] ldr lr, [src], pstep mov count, count, lsl #1 ; 4-in-parallel - uadd8 r4, r4, r2 ; flimit * 2 + limit + orr r4, r4, r4, lsl #16 + orr r3, r3, r3, lsl #8 + orr r2, r2, r2, lsl #16 + orr r3, r3, r3, lsl #16 |MBVnext8| ; vp8_filter_mask() function @@ -906,6 +919,7 @@ count RN r5 str lr, [sp, #8] ldr lr, [src], pstep + TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 ldr lr, [sp, #8] ; load back (f)limit accumulator @@ -954,6 +968,7 @@ count RN r5 beq mbvskip_filter ; skip filtering + ;vp8_hevmask() function ;calculate high edge variance @@ -1121,6 +1136,7 @@ count RN r5 smlabb r8, r6, lr, r7 smlatb r6, r6, lr, r7 smlabb r9, r10, lr, r7 + smlatb r10, r10, lr, r7 ssat r8, #8, r8, asr #7 ssat r6, #8, r6, asr #7 diff --git a/vp8/common/arm/armv6/simpleloopfilter_v6.asm b/vp8/common/arm/armv6/simpleloopfilter_v6.asm index 40a71f49d..5e00cf01b 100644 --- a/vp8/common/arm/armv6/simpleloopfilter_v6.asm +++ b/vp8/common/arm/armv6/simpleloopfilter_v6.asm @@ -45,35 +45,28 @@ MEND + src RN r0 pstep RN r1 ;r0 unsigned char *src_ptr, ;r1 int src_pixel_step, -;r2 const char *flimit, -;r3 const char *limit, -;stack const char *thresh, -;stack int count - -; All 16 elements in flimit are equal. So, in the code, only one load is needed -; for flimit. Same applies to limit. 
thresh is not used in simple looopfilter +;r2 const char *blimit ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- |vp8_loop_filter_simple_horizontal_edge_armv6| PROC ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- stmdb sp!, {r4 - r11, lr} - ldr r12, [r3] ; limit + ldrb r12, [r2] ; blimit ldr r3, [src, -pstep, lsl #1] ; p1 ldr r4, [src, -pstep] ; p0 ldr r5, [src] ; q0 ldr r6, [src, pstep] ; q1 - ldr r7, [r2] ; flimit + orr r12, r12, r12, lsl #8 ; blimit ldr r2, c0x80808080 - ldr r9, [sp, #40] ; count for 8-in-parallel - uadd8 r7, r7, r7 ; flimit * 2 - mov r9, r9, lsl #1 ; double the count. we're doing 4 at a time - uadd8 r12, r7, r12 ; flimit * 2 + limit + orr r12, r12, r12, lsl #16 ; blimit + mov r9, #4 ; count is fixed at 4. we're doing 4 at a time mov lr, #0 ; need 0 in a couple places |simple_hnext8| @@ -148,34 +141,32 @@ pstep RN r1 ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- stmdb sp!, {r4 - r11, lr} - ldr r12, [r2] ; r12: flimit + ldrb r12, [r2] ; r12: blimit ldr r2, c0x80808080 - ldr r7, [r3] ; limit + orr r12, r12, r12, lsl #8 ; load soure data to r7, r8, r9, r10 ldrh r3, [src, #-2] pld [src, #23] ; preload for next block ldrh r4, [src], pstep - uadd8 r12, r12, r12 ; flimit * 2 + orr r12, r12, r12, lsl #16 ldrh r5, [src, #-2] pld [src, #23] ldrh r6, [src], pstep - uadd8 r12, r12, r7 ; flimit * 2 + limit pkhbt r7, r3, r4, lsl #16 ldrh r3, [src, #-2] pld [src, #23] ldrh r4, [src], pstep - ldr r11, [sp, #40] ; count (r11) for 8-in-parallel pkhbt r8, r5, r6, lsl #16 ldrh r5, [src, #-2] pld [src, #23] ldrh r6, [src], pstep - mov r11, r11, lsl #1 ; 4-in-parallel + mov r11, #4 ; count is fixed at 4. 
we're doing 4 at a time |simple_vnext8| ; vp8_simple_filter_mask() function diff --git a/vp8/common/arm/loopfilter_arm.c b/vp8/common/arm/loopfilter_arm.c index 1ec2b7484..c841d455a 100644 --- a/vp8/common/arm/loopfilter_arm.c +++ b/vp8/common/arm/loopfilter_arm.c @@ -18,8 +18,6 @@ extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6); extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6); extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6); extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6); -extern prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6); -extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6); #endif #if HAVE_ARMV7 @@ -55,15 +53,6 @@ void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); } -void vp8_loop_filter_mbhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); -} - /* Vertical MB Filtering */ void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) @@ -77,15 +66,6 @@ void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); } -void vp8_loop_filter_mbvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); -} - /* Horizontal B Filtering */ void 
vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) @@ -101,15 +81,12 @@ void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsign vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); } -void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, blimit); + vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, blimit); } /* Vertical B Filtering */ @@ -127,15 +104,12 @@ void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsign vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); } -void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) +void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { - (void) u_ptr; - (void) v_ptr; - (void) uv_stride; - vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, 
lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit); } #endif diff --git a/vp8/common/arm/loopfilter_arm.h b/vp8/common/arm/loopfilter_arm.h index 27159b59f..390a547b0 100644 --- a/vp8/common/arm/loopfilter_arm.h +++ b/vp8/common/arm/loopfilter_arm.h @@ -19,10 +19,10 @@ extern prototype_loopfilter_block(vp8_loop_filter_mbv_armv6); extern prototype_loopfilter_block(vp8_loop_filter_bv_armv6); extern prototype_loopfilter_block(vp8_loop_filter_mbh_armv6); extern prototype_loopfilter_block(vp8_loop_filter_bh_armv6); -extern prototype_loopfilter_block(vp8_loop_filter_mbvs_armv6); -extern prototype_loopfilter_block(vp8_loop_filter_bvs_armv6); -extern prototype_loopfilter_block(vp8_loop_filter_mbhs_armv6); -extern prototype_loopfilter_block(vp8_loop_filter_bhs_armv6); +extern prototype_simple_loopfilter(vp8_loop_filter_bvs_armv6); +extern prototype_simple_loopfilter(vp8_loop_filter_bhs_armv6); +extern prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6); +extern prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6); #if !CONFIG_RUNTIME_CPU_DETECT #undef vp8_lf_normal_mb_v @@ -38,13 +38,13 @@ extern prototype_loopfilter_block(vp8_loop_filter_bhs_armv6); #define vp8_lf_normal_b_h vp8_loop_filter_bh_armv6 #undef vp8_lf_simple_mb_v -#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_armv6 +#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_armv6 #undef vp8_lf_simple_b_v #define vp8_lf_simple_b_v vp8_loop_filter_bvs_armv6 #undef vp8_lf_simple_mb_h -#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_armv6 +#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_armv6 #undef vp8_lf_simple_b_h #define 
vp8_lf_simple_b_h vp8_loop_filter_bhs_armv6