Remove ARM optimizations from VP9
Change-Id: I9f0ae635fb9a95c4aa1529c177ccb07e2b76970b
This commit is contained in:
Родитель
0d793ccfb6
Коммит
34591b54dd
1
libs.mk
1
libs.mk
|
@ -109,7 +109,6 @@ ifeq ($(CONFIG_VP9_ENCODER),yes)
|
|||
CODEC_SRCS-yes += $(addprefix $(VP9_PREFIX),$(call enabled,VP9_CX_SRCS))
|
||||
CODEC_EXPORTS-yes += $(addprefix $(VP9_PREFIX),$(VP9_CX_EXPORTS))
|
||||
CODEC_SRCS-yes += $(VP9_PREFIX)vp9cx.mk vpx/vp8.h vpx/vp8cx.h
|
||||
CODEC_SRCS-$(ARCH_ARM) += $(VP9_PREFIX)vp98cx_arm.mk
|
||||
INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h
|
||||
INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP9_PREFIX)/%
|
||||
CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h
|
||||
|
|
24
vp8/vp8dx.mk
24
vp8/vp8dx.mk
|
@ -18,30 +18,6 @@ VP8_DX_SRCS_REMOVE-no += $(VP8_COMMON_SRCS_REMOVE-no)
|
|||
|
||||
VP8_DX_SRCS-yes += vp8_dx_iface.c
|
||||
|
||||
# common
|
||||
#define ARM
|
||||
#define DISABLE_THREAD
|
||||
|
||||
#INCLUDES += algo/vpx_common/vpx_mem/include
|
||||
#INCLUDES += common
|
||||
#INCLUDES += common
|
||||
#INCLUDES += common
|
||||
#INCLUDES += common
|
||||
#INCLUDES += decoder
|
||||
|
||||
|
||||
|
||||
# decoder
|
||||
#define ARM
|
||||
#define DISABLE_THREAD
|
||||
|
||||
#INCLUDES += algo/vpx_common/vpx_mem/include
|
||||
#INCLUDES += common
|
||||
#INCLUDES += common
|
||||
#INCLUDES += common
|
||||
#INCLUDES += common
|
||||
#INCLUDES += decoder
|
||||
|
||||
VP8_DX_SRCS-yes += decoder/asm_dec_offsets.c
|
||||
VP8_DX_SRCS-yes += decoder/dboolhuff.c
|
||||
VP8_DX_SRCS-yes += decoder/decodemv.c
|
||||
|
|
|
@ -1,237 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp9_filter_block2d_bil_first_pass_armv6|
|
||||
EXPORT |vp9_filter_block2d_bil_second_pass_armv6|
|
||||
|
||||
AREA |.text|, CODE, READONLY ; name this block of code
|
||||
|
||||
;-------------------------------------
|
||||
; r0 unsigned char *src_ptr,
|
||||
; r1 unsigned short *dst_ptr,
|
||||
; r2 unsigned int src_pitch,
|
||||
; r3 unsigned int height,
|
||||
; stack unsigned int width,
|
||||
; stack const short *vp9_filter
|
||||
;-------------------------------------
|
||||
; The output is transposed stroed in output array to make it easy for second pass filtering.
|
||||
|vp9_filter_block2d_bil_first_pass_armv6| PROC
|
||||
stmdb sp!, {r4 - r11, lr}
|
||||
|
||||
ldr r11, [sp, #40] ; vp9_filter address
|
||||
ldr r4, [sp, #36] ; width
|
||||
|
||||
mov r12, r3 ; outer-loop counter
|
||||
|
||||
add r7, r2, r4 ; preload next row
|
||||
pld [r0, r7]
|
||||
|
||||
sub r2, r2, r4 ; src increment for height loop
|
||||
|
||||
ldr r5, [r11] ; load up filter coefficients
|
||||
|
||||
mov r3, r3, lsl #1 ; height*2
|
||||
add r3, r3, #2 ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1)
|
||||
|
||||
mov r11, r1 ; save dst_ptr for each row
|
||||
|
||||
cmp r5, #128 ; if filter coef = 128, then skip the filter
|
||||
beq bil_null_1st_filter
|
||||
|
||||
|bil_height_loop_1st_v6|
|
||||
ldrb r6, [r0] ; load source data
|
||||
ldrb r7, [r0, #1]
|
||||
ldrb r8, [r0, #2]
|
||||
mov lr, r4, lsr #2 ; 4-in-parellel loop counter
|
||||
|
||||
|bil_width_loop_1st_v6|
|
||||
ldrb r9, [r0, #3]
|
||||
ldrb r10, [r0, #4]
|
||||
|
||||
pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0]
|
||||
pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1]
|
||||
|
||||
smuad r6, r6, r5 ; apply the filter
|
||||
pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2]
|
||||
smuad r7, r7, r5
|
||||
pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3]
|
||||
|
||||
smuad r8, r8, r5
|
||||
smuad r9, r9, r5
|
||||
|
||||
add r0, r0, #4
|
||||
subs lr, lr, #1
|
||||
|
||||
add r6, r6, #0x40 ; round_shift_and_clamp
|
||||
add r7, r7, #0x40
|
||||
usat r6, #16, r6, asr #7
|
||||
usat r7, #16, r7, asr #7
|
||||
|
||||
strh r6, [r1], r3 ; result is transposed and stored
|
||||
|
||||
add r8, r8, #0x40 ; round_shift_and_clamp
|
||||
strh r7, [r1], r3
|
||||
add r9, r9, #0x40
|
||||
usat r8, #16, r8, asr #7
|
||||
usat r9, #16, r9, asr #7
|
||||
|
||||
strh r8, [r1], r3 ; result is transposed and stored
|
||||
|
||||
ldrneb r6, [r0] ; load source data
|
||||
strh r9, [r1], r3
|
||||
|
||||
ldrneb r7, [r0, #1]
|
||||
ldrneb r8, [r0, #2]
|
||||
|
||||
bne bil_width_loop_1st_v6
|
||||
|
||||
add r0, r0, r2 ; move to next input row
|
||||
subs r12, r12, #1
|
||||
|
||||
add r9, r2, r4, lsl #1 ; adding back block width
|
||||
pld [r0, r9] ; preload next row
|
||||
|
||||
add r11, r11, #2 ; move over to next column
|
||||
mov r1, r11
|
||||
|
||||
bne bil_height_loop_1st_v6
|
||||
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
|
||||
|bil_null_1st_filter|
|
||||
|bil_height_loop_null_1st|
|
||||
mov lr, r4, lsr #2 ; loop counter
|
||||
|
||||
|bil_width_loop_null_1st|
|
||||
ldrb r6, [r0] ; load data
|
||||
ldrb r7, [r0, #1]
|
||||
ldrb r8, [r0, #2]
|
||||
ldrb r9, [r0, #3]
|
||||
|
||||
strh r6, [r1], r3 ; store it to immediate buffer
|
||||
add r0, r0, #4
|
||||
strh r7, [r1], r3
|
||||
subs lr, lr, #1
|
||||
strh r8, [r1], r3
|
||||
strh r9, [r1], r3
|
||||
|
||||
bne bil_width_loop_null_1st
|
||||
|
||||
subs r12, r12, #1
|
||||
add r0, r0, r2 ; move to next input line
|
||||
add r11, r11, #2 ; move over to next column
|
||||
mov r1, r11
|
||||
|
||||
bne bil_height_loop_null_1st
|
||||
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
|
||||
ENDP ; |vp9_filter_block2d_bil_first_pass_armv6|
|
||||
|
||||
|
||||
;---------------------------------
|
||||
; r0 unsigned short *src_ptr,
|
||||
; r1 unsigned char *dst_ptr,
|
||||
; r2 int dst_pitch,
|
||||
; r3 unsigned int height,
|
||||
; stack unsigned int width,
|
||||
; stack const short *vp9_filter
|
||||
;---------------------------------
|
||||
|vp9_filter_block2d_bil_second_pass_armv6| PROC
|
||||
stmdb sp!, {r4 - r11, lr}
|
||||
|
||||
ldr r11, [sp, #40] ; vp9_filter address
|
||||
ldr r4, [sp, #36] ; width
|
||||
|
||||
ldr r5, [r11] ; load up filter coefficients
|
||||
mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix
|
||||
mov r11, r1
|
||||
|
||||
cmp r5, #128 ; if filter coef = 128, then skip the filter
|
||||
beq bil_null_2nd_filter
|
||||
|
||||
|bil_height_loop_2nd|
|
||||
ldr r6, [r0] ; load the data
|
||||
ldr r8, [r0, #4]
|
||||
ldrh r10, [r0, #8]
|
||||
mov lr, r3, lsr #2 ; loop counter
|
||||
|
||||
|bil_width_loop_2nd|
|
||||
pkhtb r7, r6, r8 ; src[1] | src[2]
|
||||
pkhtb r9, r8, r10 ; src[3] | src[4]
|
||||
|
||||
smuad r6, r6, r5 ; apply filter
|
||||
smuad r8, r8, r5 ; apply filter
|
||||
|
||||
subs lr, lr, #1
|
||||
|
||||
smuadx r7, r7, r5 ; apply filter
|
||||
smuadx r9, r9, r5 ; apply filter
|
||||
|
||||
add r0, r0, #8
|
||||
|
||||
add r6, r6, #0x40 ; round_shift_and_clamp
|
||||
add r7, r7, #0x40
|
||||
usat r6, #8, r6, asr #7
|
||||
usat r7, #8, r7, asr #7
|
||||
strb r6, [r1], r2 ; the result is transposed back and stored
|
||||
|
||||
add r8, r8, #0x40 ; round_shift_and_clamp
|
||||
strb r7, [r1], r2
|
||||
add r9, r9, #0x40
|
||||
usat r8, #8, r8, asr #7
|
||||
usat r9, #8, r9, asr #7
|
||||
strb r8, [r1], r2 ; the result is transposed back and stored
|
||||
|
||||
ldrne r6, [r0] ; load data
|
||||
strb r9, [r1], r2
|
||||
ldrne r8, [r0, #4]
|
||||
ldrneh r10, [r0, #8]
|
||||
|
||||
bne bil_width_loop_2nd
|
||||
|
||||
subs r12, r12, #1
|
||||
add r0, r0, #4 ; update src for next row
|
||||
add r11, r11, #1
|
||||
mov r1, r11
|
||||
|
||||
bne bil_height_loop_2nd
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
|
||||
|bil_null_2nd_filter|
|
||||
|bil_height_loop_null_2nd|
|
||||
mov lr, r3, lsr #2
|
||||
|
||||
|bil_width_loop_null_2nd|
|
||||
ldr r6, [r0], #4 ; load data
|
||||
subs lr, lr, #1
|
||||
ldr r8, [r0], #4
|
||||
|
||||
strb r6, [r1], r2 ; store data
|
||||
mov r7, r6, lsr #16
|
||||
strb r7, [r1], r2
|
||||
mov r9, r8, lsr #16
|
||||
strb r8, [r1], r2
|
||||
strb r9, [r1], r2
|
||||
|
||||
bne bil_width_loop_null_2nd
|
||||
|
||||
subs r12, r12, #1
|
||||
add r0, r0, #4
|
||||
add r11, r11, #1
|
||||
mov r1, r11
|
||||
|
||||
bne bil_height_loop_null_2nd
|
||||
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
ENDP ; |vp9_filter_block2d_second_pass_armv6|
|
||||
|
||||
END
|
|
@ -1,186 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp9_copy_mem16x16_v6|
|
||||
; ARM
|
||||
; REQUIRE8
|
||||
; PRESERVE8
|
||||
|
||||
AREA Block, CODE, READONLY ; name this block of code
|
||||
;void copy_mem16x16_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
|vp9_copy_mem16x16_v6| PROC
|
||||
stmdb sp!, {r4 - r7}
|
||||
;push {r4-r7}
|
||||
|
||||
;preload
|
||||
pld [r0, #31] ; preload for next 16x16 block
|
||||
|
||||
ands r4, r0, #15
|
||||
beq copy_mem16x16_fast
|
||||
|
||||
ands r4, r0, #7
|
||||
beq copy_mem16x16_8
|
||||
|
||||
ands r4, r0, #3
|
||||
beq copy_mem16x16_4
|
||||
|
||||
;copy one byte each time
|
||||
ldrb r4, [r0]
|
||||
ldrb r5, [r0, #1]
|
||||
ldrb r6, [r0, #2]
|
||||
ldrb r7, [r0, #3]
|
||||
|
||||
mov r12, #16
|
||||
|
||||
copy_mem16x16_1_loop
|
||||
strb r4, [r2]
|
||||
strb r5, [r2, #1]
|
||||
strb r6, [r2, #2]
|
||||
strb r7, [r2, #3]
|
||||
|
||||
ldrb r4, [r0, #4]
|
||||
ldrb r5, [r0, #5]
|
||||
ldrb r6, [r0, #6]
|
||||
ldrb r7, [r0, #7]
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
strb r4, [r2, #4]
|
||||
strb r5, [r2, #5]
|
||||
strb r6, [r2, #6]
|
||||
strb r7, [r2, #7]
|
||||
|
||||
ldrb r4, [r0, #8]
|
||||
ldrb r5, [r0, #9]
|
||||
ldrb r6, [r0, #10]
|
||||
ldrb r7, [r0, #11]
|
||||
|
||||
strb r4, [r2, #8]
|
||||
strb r5, [r2, #9]
|
||||
strb r6, [r2, #10]
|
||||
strb r7, [r2, #11]
|
||||
|
||||
ldrb r4, [r0, #12]
|
||||
ldrb r5, [r0, #13]
|
||||
ldrb r6, [r0, #14]
|
||||
ldrb r7, [r0, #15]
|
||||
|
||||
add r0, r0, r1
|
||||
|
||||
strb r4, [r2, #12]
|
||||
strb r5, [r2, #13]
|
||||
strb r6, [r2, #14]
|
||||
strb r7, [r2, #15]
|
||||
|
||||
add r2, r2, r3
|
||||
|
||||
ldrneb r4, [r0]
|
||||
ldrneb r5, [r0, #1]
|
||||
ldrneb r6, [r0, #2]
|
||||
ldrneb r7, [r0, #3]
|
||||
|
||||
pld [r0, #31] ; preload for next 16x16 block
|
||||
|
||||
bne copy_mem16x16_1_loop
|
||||
|
||||
ldmia sp!, {r4 - r7}
|
||||
;pop {r4-r7}
|
||||
mov pc, lr
|
||||
|
||||
;copy 4 bytes each time
|
||||
copy_mem16x16_4
|
||||
ldr r4, [r0]
|
||||
ldr r5, [r0, #4]
|
||||
ldr r6, [r0, #8]
|
||||
ldr r7, [r0, #12]
|
||||
|
||||
mov r12, #16
|
||||
|
||||
copy_mem16x16_4_loop
|
||||
subs r12, r12, #1
|
||||
add r0, r0, r1
|
||||
|
||||
str r4, [r2]
|
||||
str r5, [r2, #4]
|
||||
str r6, [r2, #8]
|
||||
str r7, [r2, #12]
|
||||
|
||||
add r2, r2, r3
|
||||
|
||||
ldrne r4, [r0]
|
||||
ldrne r5, [r0, #4]
|
||||
ldrne r6, [r0, #8]
|
||||
ldrne r7, [r0, #12]
|
||||
|
||||
pld [r0, #31] ; preload for next 16x16 block
|
||||
|
||||
bne copy_mem16x16_4_loop
|
||||
|
||||
ldmia sp!, {r4 - r7}
|
||||
;pop {r4-r7}
|
||||
mov pc, lr
|
||||
|
||||
;copy 8 bytes each time
|
||||
copy_mem16x16_8
|
||||
sub r1, r1, #16
|
||||
sub r3, r3, #16
|
||||
|
||||
mov r12, #16
|
||||
|
||||
copy_mem16x16_8_loop
|
||||
ldmia r0!, {r4-r5}
|
||||
;ldm r0, {r4-r5}
|
||||
ldmia r0!, {r6-r7}
|
||||
|
||||
add r0, r0, r1
|
||||
|
||||
stmia r2!, {r4-r5}
|
||||
subs r12, r12, #1
|
||||
;stm r2, {r4-r5}
|
||||
stmia r2!, {r6-r7}
|
||||
|
||||
add r2, r2, r3
|
||||
|
||||
pld [r0, #31] ; preload for next 16x16 block
|
||||
bne copy_mem16x16_8_loop
|
||||
|
||||
ldmia sp!, {r4 - r7}
|
||||
;pop {r4-r7}
|
||||
mov pc, lr
|
||||
|
||||
;copy 16 bytes each time
|
||||
copy_mem16x16_fast
|
||||
;sub r1, r1, #16
|
||||
;sub r3, r3, #16
|
||||
|
||||
mov r12, #16
|
||||
|
||||
copy_mem16x16_fast_loop
|
||||
ldmia r0, {r4-r7}
|
||||
;ldm r0, {r4-r7}
|
||||
add r0, r0, r1
|
||||
|
||||
subs r12, r12, #1
|
||||
stmia r2, {r4-r7}
|
||||
;stm r2, {r4-r7}
|
||||
add r2, r2, r3
|
||||
|
||||
pld [r0, #31] ; preload for next 16x16 block
|
||||
bne copy_mem16x16_fast_loop
|
||||
|
||||
ldmia sp!, {r4 - r7}
|
||||
;pop {r4-r7}
|
||||
mov pc, lr
|
||||
|
||||
ENDP ; |vp9_copy_mem16x16_v6|
|
||||
|
||||
END
|
|
@ -1,128 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp9_copy_mem8x4_v6|
|
||||
; ARM
|
||||
; REQUIRE8
|
||||
; PRESERVE8
|
||||
|
||||
AREA Block, CODE, READONLY ; name this block of code
|
||||
;void vp9_copy_mem8x4_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
|vp9_copy_mem8x4_v6| PROC
|
||||
;push {r4-r5}
|
||||
stmdb sp!, {r4-r5}
|
||||
|
||||
;preload
|
||||
pld [r0]
|
||||
pld [r0, r1]
|
||||
pld [r0, r1, lsl #1]
|
||||
|
||||
ands r4, r0, #7
|
||||
beq copy_mem8x4_fast
|
||||
|
||||
ands r4, r0, #3
|
||||
beq copy_mem8x4_4
|
||||
|
||||
;copy 1 byte each time
|
||||
ldrb r4, [r0]
|
||||
ldrb r5, [r0, #1]
|
||||
|
||||
mov r12, #4
|
||||
|
||||
copy_mem8x4_1_loop
|
||||
strb r4, [r2]
|
||||
strb r5, [r2, #1]
|
||||
|
||||
ldrb r4, [r0, #2]
|
||||
ldrb r5, [r0, #3]
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
strb r4, [r2, #2]
|
||||
strb r5, [r2, #3]
|
||||
|
||||
ldrb r4, [r0, #4]
|
||||
ldrb r5, [r0, #5]
|
||||
|
||||
strb r4, [r2, #4]
|
||||
strb r5, [r2, #5]
|
||||
|
||||
ldrb r4, [r0, #6]
|
||||
ldrb r5, [r0, #7]
|
||||
|
||||
add r0, r0, r1
|
||||
|
||||
strb r4, [r2, #6]
|
||||
strb r5, [r2, #7]
|
||||
|
||||
add r2, r2, r3
|
||||
|
||||
ldrneb r4, [r0]
|
||||
ldrneb r5, [r0, #1]
|
||||
|
||||
bne copy_mem8x4_1_loop
|
||||
|
||||
ldmia sp!, {r4 - r5}
|
||||
;pop {r4-r5}
|
||||
mov pc, lr
|
||||
|
||||
;copy 4 bytes each time
|
||||
copy_mem8x4_4
|
||||
ldr r4, [r0]
|
||||
ldr r5, [r0, #4]
|
||||
|
||||
mov r12, #4
|
||||
|
||||
copy_mem8x4_4_loop
|
||||
subs r12, r12, #1
|
||||
add r0, r0, r1
|
||||
|
||||
str r4, [r2]
|
||||
str r5, [r2, #4]
|
||||
|
||||
add r2, r2, r3
|
||||
|
||||
ldrne r4, [r0]
|
||||
ldrne r5, [r0, #4]
|
||||
|
||||
bne copy_mem8x4_4_loop
|
||||
|
||||
ldmia sp!, {r4-r5}
|
||||
;pop {r4-r5}
|
||||
mov pc, lr
|
||||
|
||||
;copy 8 bytes each time
|
||||
copy_mem8x4_fast
|
||||
;sub r1, r1, #8
|
||||
;sub r3, r3, #8
|
||||
|
||||
mov r12, #4
|
||||
|
||||
copy_mem8x4_fast_loop
|
||||
ldmia r0, {r4-r5}
|
||||
;ldm r0, {r4-r5}
|
||||
add r0, r0, r1
|
||||
|
||||
subs r12, r12, #1
|
||||
stmia r2, {r4-r5}
|
||||
;stm r2, {r4-r5}
|
||||
add r2, r2, r3
|
||||
|
||||
bne copy_mem8x4_fast_loop
|
||||
|
||||
ldmia sp!, {r4-r5}
|
||||
;pop {r4-r5}
|
||||
mov pc, lr
|
||||
|
||||
ENDP ; |vp9_copy_mem8x4_v6|
|
||||
|
||||
END
|
|
@ -1,128 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp9_copy_mem8x8_v6|
|
||||
; ARM
|
||||
; REQUIRE8
|
||||
; PRESERVE8
|
||||
|
||||
AREA Block, CODE, READONLY ; name this block of code
|
||||
;void copy_mem8x8_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
|vp9_copy_mem8x8_v6| PROC
|
||||
;push {r4-r5}
|
||||
stmdb sp!, {r4-r5}
|
||||
|
||||
;preload
|
||||
pld [r0]
|
||||
pld [r0, r1]
|
||||
pld [r0, r1, lsl #1]
|
||||
|
||||
ands r4, r0, #7
|
||||
beq copy_mem8x8_fast
|
||||
|
||||
ands r4, r0, #3
|
||||
beq copy_mem8x8_4
|
||||
|
||||
;copy 1 byte each time
|
||||
ldrb r4, [r0]
|
||||
ldrb r5, [r0, #1]
|
||||
|
||||
mov r12, #8
|
||||
|
||||
copy_mem8x8_1_loop
|
||||
strb r4, [r2]
|
||||
strb r5, [r2, #1]
|
||||
|
||||
ldrb r4, [r0, #2]
|
||||
ldrb r5, [r0, #3]
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
strb r4, [r2, #2]
|
||||
strb r5, [r2, #3]
|
||||
|
||||
ldrb r4, [r0, #4]
|
||||
ldrb r5, [r0, #5]
|
||||
|
||||
strb r4, [r2, #4]
|
||||
strb r5, [r2, #5]
|
||||
|
||||
ldrb r4, [r0, #6]
|
||||
ldrb r5, [r0, #7]
|
||||
|
||||
add r0, r0, r1
|
||||
|
||||
strb r4, [r2, #6]
|
||||
strb r5, [r2, #7]
|
||||
|
||||
add r2, r2, r3
|
||||
|
||||
ldrneb r4, [r0]
|
||||
ldrneb r5, [r0, #1]
|
||||
|
||||
bne copy_mem8x8_1_loop
|
||||
|
||||
ldmia sp!, {r4 - r5}
|
||||
;pop {r4-r5}
|
||||
mov pc, lr
|
||||
|
||||
;copy 4 bytes each time
|
||||
copy_mem8x8_4
|
||||
ldr r4, [r0]
|
||||
ldr r5, [r0, #4]
|
||||
|
||||
mov r12, #8
|
||||
|
||||
copy_mem8x8_4_loop
|
||||
subs r12, r12, #1
|
||||
add r0, r0, r1
|
||||
|
||||
str r4, [r2]
|
||||
str r5, [r2, #4]
|
||||
|
||||
add r2, r2, r3
|
||||
|
||||
ldrne r4, [r0]
|
||||
ldrne r5, [r0, #4]
|
||||
|
||||
bne copy_mem8x8_4_loop
|
||||
|
||||
ldmia sp!, {r4 - r5}
|
||||
;pop {r4-r5}
|
||||
mov pc, lr
|
||||
|
||||
;copy 8 bytes each time
|
||||
copy_mem8x8_fast
|
||||
;sub r1, r1, #8
|
||||
;sub r3, r3, #8
|
||||
|
||||
mov r12, #8
|
||||
|
||||
copy_mem8x8_fast_loop
|
||||
ldmia r0, {r4-r5}
|
||||
;ldm r0, {r4-r5}
|
||||
add r0, r0, r1
|
||||
|
||||
subs r12, r12, #1
|
||||
stmia r2, {r4-r5}
|
||||
;stm r2, {r4-r5}
|
||||
add r2, r2, r3
|
||||
|
||||
bne copy_mem8x8_fast_loop
|
||||
|
||||
ldmia sp!, {r4-r5}
|
||||
;pop {r4-r5}
|
||||
mov pc, lr
|
||||
|
||||
ENDP ; |vp9_copy_mem8x8_v6|
|
||||
|
||||
END
|
|
@ -1,67 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license and patent
|
||||
; grant that can be found in the LICENSE file in the root of the source
|
||||
; tree. All contributing project authors may be found in the AUTHORS
|
||||
; file in the root of the source tree.
|
||||
;
|
||||
|
||||
EXPORT |vp8_dc_only_idct_add_v6|
|
||||
|
||||
AREA |.text|, CODE, READONLY
|
||||
|
||||
;void vp8_dc_only_idct_add_v6(short input_dc, unsigned char *pred_ptr,
|
||||
; unsigned char *dst_ptr, int pitch, int stride)
|
||||
; r0 input_dc
|
||||
; r1 pred_ptr
|
||||
; r2 dest_ptr
|
||||
; r3 pitch
|
||||
; sp stride
|
||||
|
||||
|vp8_dc_only_idct_add_v6| PROC
|
||||
stmdb sp!, {r4 - r7, lr}
|
||||
|
||||
add r0, r0, #4 ; input_dc += 4
|
||||
ldr r12, c0x0000FFFF
|
||||
ldr r4, [r1], r3
|
||||
ldr r6, [r1], r3
|
||||
and r0, r12, r0, asr #3 ; input_dc >> 3 + mask
|
||||
ldr lr, [sp, #20]
|
||||
orr r0, r0, r0, lsl #16 ; a1 | a1
|
||||
|
||||
uxtab16 r5, r0, r4 ; a1+2 | a1+0
|
||||
uxtab16 r4, r0, r4, ror #8 ; a1+3 | a1+1
|
||||
uxtab16 r7, r0, r6
|
||||
uxtab16 r6, r0, r6, ror #8
|
||||
usat16 r5, #8, r5
|
||||
usat16 r4, #8, r4
|
||||
usat16 r7, #8, r7
|
||||
usat16 r6, #8, r6
|
||||
orr r5, r5, r4, lsl #8
|
||||
orr r7, r7, r6, lsl #8
|
||||
ldr r4, [r1], r3
|
||||
ldr r6, [r1]
|
||||
str r5, [r2], lr
|
||||
str r7, [r2], lr
|
||||
|
||||
uxtab16 r5, r0, r4
|
||||
uxtab16 r4, r0, r4, ror #8
|
||||
uxtab16 r7, r0, r6
|
||||
uxtab16 r6, r0, r6, ror #8
|
||||
usat16 r5, #8, r5
|
||||
usat16 r4, #8, r4
|
||||
usat16 r7, #8, r7
|
||||
usat16 r6, #8, r6
|
||||
orr r5, r5, r4, lsl #8
|
||||
orr r7, r7, r6, lsl #8
|
||||
str r5, [r2], lr
|
||||
str r7, [r2]
|
||||
|
||||
ldmia sp!, {r4 - r7, pc}
|
||||
|
||||
ENDP ; |vp8_dc_only_idct_add_v6|
|
||||
|
||||
; Constant Pool
|
||||
c0x0000FFFF DCD 0x0000FFFF
|
||||
END
|
|
@ -1,624 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp9_filter_block2d_first_pass_armv6|
|
||||
EXPORT |vp9_filter_block2d_first_pass_16x16_armv6|
|
||||
EXPORT |vp9_filter_block2d_first_pass_8x8_armv6|
|
||||
EXPORT |vp9_filter_block2d_second_pass_armv6|
|
||||
EXPORT |vp9_filter4_block2d_second_pass_armv6|
|
||||
EXPORT |vp9_filter_block2d_first_pass_only_armv6|
|
||||
EXPORT |vp9_filter_block2d_second_pass_only_armv6|
|
||||
|
||||
AREA |.text|, CODE, READONLY ; name this block of code
|
||||
;-------------------------------------
|
||||
; r0 unsigned char *src_ptr
|
||||
; r1 short *output_ptr
|
||||
; r2 unsigned int src_pixels_per_line
|
||||
; r3 unsigned int output_width
|
||||
; stack unsigned int output_height
|
||||
; stack const short *vp9_filter
|
||||
;-------------------------------------
|
||||
; vp9_filter the input and put in the output array. Apply the 6 tap FIR filter with
|
||||
; the output being a 2 byte value and the intput being a 1 byte value.
|
||||
|vp9_filter_block2d_first_pass_armv6| PROC
|
||||
stmdb sp!, {r4 - r11, lr}
|
||||
|
||||
ldr r11, [sp, #40] ; vp9_filter address
|
||||
ldr r7, [sp, #36] ; output height
|
||||
|
||||
sub r2, r2, r3 ; inside loop increments input array,
|
||||
; so the height loop only needs to add
|
||||
; r2 - width to the input pointer
|
||||
|
||||
mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
|
||||
add r12, r3, #16 ; square off the output
|
||||
sub sp, sp, #4
|
||||
|
||||
ldr r4, [r11] ; load up packed filter coefficients
|
||||
ldr r5, [r11, #4]
|
||||
ldr r6, [r11, #8]
|
||||
|
||||
str r1, [sp] ; push destination to stack
|
||||
mov r7, r7, lsl #16 ; height is top part of counter
|
||||
|
||||
; six tap filter
|
||||
|height_loop_1st_6|
|
||||
ldrb r8, [r0, #-2] ; load source data
|
||||
ldrb r9, [r0, #-1]
|
||||
ldrb r10, [r0], #2
|
||||
orr r7, r7, r3, lsr #2 ; construct loop counter
|
||||
|
||||
|width_loop_1st_6|
|
||||
ldrb r11, [r0, #-1]
|
||||
|
||||
pkhbt lr, r8, r9, lsl #16 ; r9 | r8
|
||||
pkhbt r8, r9, r10, lsl #16 ; r10 | r9
|
||||
|
||||
ldrb r9, [r0]
|
||||
|
||||
smuad lr, lr, r4 ; apply the filter
|
||||
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
|
||||
smuad r8, r8, r4
|
||||
pkhbt r11, r11, r9, lsl #16 ; r9 | r11
|
||||
|
||||
smlad lr, r10, r5, lr
|
||||
ldrb r10, [r0, #1]
|
||||
smlad r8, r11, r5, r8
|
||||
ldrb r11, [r0, #2]
|
||||
|
||||
sub r7, r7, #1
|
||||
|
||||
pkhbt r9, r9, r10, lsl #16 ; r10 | r9
|
||||
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
|
||||
|
||||
smlad lr, r9, r6, lr
|
||||
smlad r11, r10, r6, r8
|
||||
|
||||
ands r10, r7, #0xff ; test loop counter
|
||||
|
||||
add lr, lr, #0x40 ; round_shift_and_clamp
|
||||
ldrneb r8, [r0, #-2] ; load data for next loop
|
||||
usat lr, #8, lr, asr #7
|
||||
add r11, r11, #0x40
|
||||
ldrneb r9, [r0, #-1]
|
||||
usat r11, #8, r11, asr #7
|
||||
|
||||
strh lr, [r1], r12 ; result is transposed and stored, which
|
||||
; will make second pass filtering easier.
|
||||
ldrneb r10, [r0], #2
|
||||
strh r11, [r1], r12
|
||||
|
||||
bne width_loop_1st_6
|
||||
|
||||
ldr r1, [sp] ; load and update dst address
|
||||
subs r7, r7, #0x10000
|
||||
add r0, r0, r2 ; move to next input line
|
||||
|
||||
add r1, r1, #2 ; move over to next column
|
||||
str r1, [sp]
|
||||
|
||||
bne height_loop_1st_6
|
||||
|
||||
add sp, sp, #4
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
; --------------------------
|
||||
; 16x16 version
|
||||
; -----------------------------
|
||||
|vp9_filter_block2d_first_pass_16x16_armv6| PROC
|
||||
stmdb sp!, {r4 - r11, lr}
|
||||
|
||||
ldr r11, [sp, #40] ; vp9_filter address
|
||||
ldr r7, [sp, #36] ; output height
|
||||
|
||||
add r4, r2, #18 ; preload next low
|
||||
pld [r0, r4]
|
||||
|
||||
sub r2, r2, r3 ; inside loop increments input array,
|
||||
; so the height loop only needs to add
|
||||
; r2 - width to the input pointer
|
||||
|
||||
mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
|
||||
add r12, r3, #16 ; square off the output
|
||||
sub sp, sp, #4
|
||||
|
||||
ldr r4, [r11] ; load up packed filter coefficients
|
||||
ldr r5, [r11, #4]
|
||||
ldr r6, [r11, #8]
|
||||
|
||||
str r1, [sp] ; push destination to stack
|
||||
mov r7, r7, lsl #16 ; height is top part of counter
|
||||
|
||||
; six tap filter
|
||||
|height_loop_1st_16_6|
|
||||
ldrb r8, [r0, #-2] ; load source data
|
||||
ldrb r9, [r0, #-1]
|
||||
ldrb r10, [r0], #2
|
||||
orr r7, r7, r3, lsr #2 ; construct loop counter
|
||||
|
||||
|width_loop_1st_16_6|
|
||||
ldrb r11, [r0, #-1]
|
||||
|
||||
pkhbt lr, r8, r9, lsl #16 ; r9 | r8
|
||||
pkhbt r8, r9, r10, lsl #16 ; r10 | r9
|
||||
|
||||
ldrb r9, [r0]
|
||||
|
||||
smuad lr, lr, r4 ; apply the filter
|
||||
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
|
||||
smuad r8, r8, r4
|
||||
pkhbt r11, r11, r9, lsl #16 ; r9 | r11
|
||||
|
||||
smlad lr, r10, r5, lr
|
||||
ldrb r10, [r0, #1]
|
||||
smlad r8, r11, r5, r8
|
||||
ldrb r11, [r0, #2]
|
||||
|
||||
sub r7, r7, #1
|
||||
|
||||
pkhbt r9, r9, r10, lsl #16 ; r10 | r9
|
||||
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
|
||||
|
||||
smlad lr, r9, r6, lr
|
||||
smlad r11, r10, r6, r8
|
||||
|
||||
ands r10, r7, #0xff ; test loop counter
|
||||
|
||||
add lr, lr, #0x40 ; round_shift_and_clamp
|
||||
ldrneb r8, [r0, #-2] ; load data for next loop
|
||||
usat lr, #8, lr, asr #7
|
||||
add r11, r11, #0x40
|
||||
ldrneb r9, [r0, #-1]
|
||||
usat r11, #8, r11, asr #7
|
||||
|
||||
strh lr, [r1], r12 ; result is transposed and stored, which
|
||||
; will make second pass filtering easier.
|
||||
ldrneb r10, [r0], #2
|
||||
strh r11, [r1], r12
|
||||
|
||||
bne width_loop_1st_16_6
|
||||
|
||||
ldr r1, [sp] ; load and update dst address
|
||||
subs r7, r7, #0x10000
|
||||
add r0, r0, r2 ; move to next input line
|
||||
|
||||
add r11, r2, #34 ; adding back block width(=16)
|
||||
pld [r0, r11] ; preload next low
|
||||
|
||||
add r1, r1, #2 ; move over to next column
|
||||
str r1, [sp]
|
||||
|
||||
bne height_loop_1st_16_6
|
||||
|
||||
add sp, sp, #4
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
; --------------------------
|
||||
; 8x8 version
|
||||
; -----------------------------
|
||||
|vp9_filter_block2d_first_pass_8x8_armv6| PROC
|
||||
stmdb sp!, {r4 - r11, lr}
|
||||
|
||||
ldr r11, [sp, #40] ; vp9_filter address
|
||||
ldr r7, [sp, #36] ; output height
|
||||
|
||||
add r4, r2, #10 ; preload next low
|
||||
pld [r0, r4]
|
||||
|
||||
sub r2, r2, r3 ; inside loop increments input array,
|
||||
; so the height loop only needs to add
|
||||
; r2 - width to the input pointer
|
||||
|
||||
mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
|
||||
add r12, r3, #16 ; square off the output
|
||||
sub sp, sp, #4
|
||||
|
||||
ldr r4, [r11] ; load up packed filter coefficients
|
||||
ldr r5, [r11, #4]
|
||||
ldr r6, [r11, #8]
|
||||
|
||||
str r1, [sp] ; push destination to stack
|
||||
mov r7, r7, lsl #16 ; height is top part of counter
|
||||
|
||||
; six tap filter
|
||||
|height_loop_1st_8_6|
|
||||
ldrb r8, [r0, #-2] ; load source data
|
||||
ldrb r9, [r0, #-1]
|
||||
ldrb r10, [r0], #2
|
||||
orr r7, r7, r3, lsr #2 ; construct loop counter
|
||||
|
||||
|width_loop_1st_8_6|
|
||||
ldrb r11, [r0, #-1]
|
||||
|
||||
pkhbt lr, r8, r9, lsl #16 ; r9 | r8
|
||||
pkhbt r8, r9, r10, lsl #16 ; r10 | r9
|
||||
|
||||
ldrb r9, [r0]
|
||||
|
||||
smuad lr, lr, r4 ; apply the filter
|
||||
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
|
||||
smuad r8, r8, r4
|
||||
pkhbt r11, r11, r9, lsl #16 ; r9 | r11
|
||||
|
||||
smlad lr, r10, r5, lr
|
||||
ldrb r10, [r0, #1]
|
||||
smlad r8, r11, r5, r8
|
||||
ldrb r11, [r0, #2]
|
||||
|
||||
sub r7, r7, #1
|
||||
|
||||
pkhbt r9, r9, r10, lsl #16 ; r10 | r9
|
||||
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
|
||||
|
||||
smlad lr, r9, r6, lr
|
||||
smlad r11, r10, r6, r8
|
||||
|
||||
ands r10, r7, #0xff ; test loop counter
|
||||
|
||||
add lr, lr, #0x40 ; round_shift_and_clamp
|
||||
ldrneb r8, [r0, #-2] ; load data for next loop
|
||||
usat lr, #8, lr, asr #7
|
||||
add r11, r11, #0x40
|
||||
ldrneb r9, [r0, #-1]
|
||||
usat r11, #8, r11, asr #7
|
||||
|
||||
strh lr, [r1], r12 ; result is transposed and stored, which
|
||||
; will make second pass filtering easier.
|
||||
ldrneb r10, [r0], #2
|
||||
strh r11, [r1], r12
|
||||
|
||||
bne width_loop_1st_8_6
|
||||
|
||||
ldr r1, [sp] ; load and update dst address
|
||||
subs r7, r7, #0x10000
|
||||
add r0, r0, r2 ; move to next input line
|
||||
|
||||
add r11, r2, #18 ; adding back block width(=8)
|
||||
pld [r0, r11] ; preload next low
|
||||
|
||||
add r1, r1, #2 ; move over to next column
|
||||
str r1, [sp]
|
||||
|
||||
bne height_loop_1st_8_6
|
||||
|
||||
add sp, sp, #4
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
;---------------------------------
|
||||
; r0 short *src_ptr,
|
||||
; r1 unsigned char *output_ptr,
|
||||
; r2 unsigned int output_pitch,
|
||||
; r3 unsigned int cnt,
|
||||
; stack const short *vp9_filter
|
||||
;---------------------------------
|
||||
|vp9_filter_block2d_second_pass_armv6| PROC
|
||||
stmdb sp!, {r4 - r11, lr}
|
||||
|
||||
ldr r11, [sp, #36] ; vp9_filter address
|
||||
sub sp, sp, #4
|
||||
mov r7, r3, lsl #16 ; height is top part of counter
|
||||
str r1, [sp] ; push destination to stack
|
||||
|
||||
ldr r4, [r11] ; load up packed filter coefficients
|
||||
ldr r5, [r11, #4]
|
||||
ldr r6, [r11, #8]
|
||||
|
||||
pkhbt r12, r5, r4 ; pack the filter differently
|
||||
pkhbt r11, r6, r5
|
||||
|
||||
sub r0, r0, #4 ; offset input buffer
|
||||
|
||||
|height_loop_2nd|
|
||||
ldr r8, [r0] ; load the data
|
||||
ldr r9, [r0, #4]
|
||||
orr r7, r7, r3, lsr #1 ; loop counter
|
||||
|
||||
|width_loop_2nd|
|
||||
smuad lr, r4, r8 ; apply filter
|
||||
sub r7, r7, #1
|
||||
smulbt r8, r4, r8
|
||||
|
||||
ldr r10, [r0, #8]
|
||||
|
||||
smlad lr, r5, r9, lr
|
||||
smladx r8, r12, r9, r8
|
||||
|
||||
ldrh r9, [r0, #12]
|
||||
|
||||
smlad lr, r6, r10, lr
|
||||
smladx r8, r11, r10, r8
|
||||
|
||||
add r0, r0, #4
|
||||
smlatb r10, r6, r9, r8
|
||||
|
||||
add lr, lr, #0x40 ; round_shift_and_clamp
|
||||
ands r8, r7, #0xff
|
||||
usat lr, #8, lr, asr #7
|
||||
add r10, r10, #0x40
|
||||
strb lr, [r1], r2 ; the result is transposed back and stored
|
||||
usat r10, #8, r10, asr #7
|
||||
|
||||
ldrne r8, [r0] ; load data for next loop
|
||||
ldrne r9, [r0, #4]
|
||||
strb r10, [r1], r2
|
||||
|
||||
bne width_loop_2nd
|
||||
|
||||
ldr r1, [sp] ; update dst for next loop
|
||||
subs r7, r7, #0x10000
|
||||
add r0, r0, #16 ; updata src for next loop
|
||||
add r1, r1, #1
|
||||
str r1, [sp]
|
||||
|
||||
bne height_loop_2nd
|
||||
|
||||
add sp, sp, #4
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
;---------------------------------
|
||||
; r0 short *src_ptr,
|
||||
; r1 unsigned char *output_ptr,
|
||||
; r2 unsigned int output_pitch,
|
||||
; r3 unsigned int cnt,
|
||||
; stack const short *vp9_filter
|
||||
;---------------------------------
|
||||
|vp9_filter4_block2d_second_pass_armv6| PROC
|
||||
stmdb sp!, {r4 - r11, lr}
|
||||
|
||||
ldr r11, [sp, #36] ; vp9_filter address
|
||||
mov r7, r3, lsl #16 ; height is top part of counter
|
||||
|
||||
ldr r4, [r11] ; load up packed filter coefficients
|
||||
add lr, r1, r3 ; save final destination pointer
|
||||
ldr r5, [r11, #4]
|
||||
ldr r6, [r11, #8]
|
||||
|
||||
pkhbt r12, r5, r4 ; pack the filter differently
|
||||
pkhbt r11, r6, r5
|
||||
mov r4, #0x40 ; rounding factor (for smlad{x})
|
||||
|
||||
|height_loop_2nd_4|
|
||||
ldrd r8, [r0, #-4] ; load the data
|
||||
orr r7, r7, r3, lsr #1 ; loop counter
|
||||
|
||||
|width_loop_2nd_4|
|
||||
ldr r10, [r0, #4]!
|
||||
smladx r6, r9, r12, r4 ; apply filter
|
||||
pkhbt r8, r9, r8
|
||||
smlad r5, r8, r12, r4
|
||||
pkhbt r8, r10, r9
|
||||
smladx r6, r10, r11, r6
|
||||
sub r7, r7, #1
|
||||
smlad r5, r8, r11, r5
|
||||
|
||||
mov r8, r9 ; shift the data for the next loop
|
||||
mov r9, r10
|
||||
|
||||
usat r6, #8, r6, asr #7 ; shift and clamp
|
||||
usat r5, #8, r5, asr #7
|
||||
|
||||
strb r5, [r1], r2 ; the result is transposed back and stored
|
||||
tst r7, #0xff
|
||||
strb r6, [r1], r2
|
||||
|
||||
bne width_loop_2nd_4
|
||||
|
||||
subs r7, r7, #0x10000
|
||||
add r0, r0, #16 ; update src for next loop
|
||||
sub r1, lr, r7, lsr #16 ; update dst for next loop
|
||||
|
||||
bne height_loop_2nd_4
|
||||
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
;------------------------------------
|
||||
; r0 unsigned char *src_ptr
|
||||
; r1 unsigned char *output_ptr,
|
||||
; r2 unsigned int src_pixels_per_line
|
||||
; r3 unsigned int cnt,
|
||||
; stack unsigned int output_pitch,
|
||||
; stack const short *vp9_filter
|
||||
;------------------------------------
|
||||
|vp9_filter_block2d_first_pass_only_armv6| PROC
|
||||
stmdb sp!, {r4 - r11, lr}
|
||||
|
||||
add r7, r2, r3 ; preload next low
|
||||
add r7, r7, #2
|
||||
pld [r0, r7]
|
||||
|
||||
ldr r4, [sp, #36] ; output pitch
|
||||
ldr r11, [sp, #40] ; HFilter address
|
||||
sub sp, sp, #8
|
||||
|
||||
mov r7, r3
|
||||
sub r2, r2, r3 ; inside loop increments input array,
|
||||
; so the height loop only needs to add
|
||||
; r2 - width to the input pointer
|
||||
|
||||
sub r4, r4, r3
|
||||
str r4, [sp] ; save modified output pitch
|
||||
str r2, [sp, #4]
|
||||
|
||||
mov r2, #0x40
|
||||
|
||||
ldr r4, [r11] ; load up packed filter coefficients
|
||||
ldr r5, [r11, #4]
|
||||
ldr r6, [r11, #8]
|
||||
|
||||
; six tap filter
|
||||
|height_loop_1st_only_6|
|
||||
ldrb r8, [r0, #-2] ; load data
|
||||
ldrb r9, [r0, #-1]
|
||||
ldrb r10, [r0], #2
|
||||
|
||||
mov r12, r3, lsr #1 ; loop counter
|
||||
|
||||
|width_loop_1st_only_6|
|
||||
ldrb r11, [r0, #-1]
|
||||
|
||||
pkhbt lr, r8, r9, lsl #16 ; r9 | r8
|
||||
pkhbt r8, r9, r10, lsl #16 ; r10 | r9
|
||||
|
||||
ldrb r9, [r0]
|
||||
|
||||
;; smuad lr, lr, r4
|
||||
smlad lr, lr, r4, r2
|
||||
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
|
||||
;; smuad r8, r8, r4
|
||||
smlad r8, r8, r4, r2
|
||||
pkhbt r11, r11, r9, lsl #16 ; r9 | r11
|
||||
|
||||
smlad lr, r10, r5, lr
|
||||
ldrb r10, [r0, #1]
|
||||
smlad r8, r11, r5, r8
|
||||
ldrb r11, [r0, #2]
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
pkhbt r9, r9, r10, lsl #16 ; r10 | r9
|
||||
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
|
||||
|
||||
smlad lr, r9, r6, lr
|
||||
smlad r10, r10, r6, r8
|
||||
|
||||
;; add lr, lr, #0x40 ; round_shift_and_clamp
|
||||
ldrneb r8, [r0, #-2] ; load data for next loop
|
||||
usat lr, #8, lr, asr #7
|
||||
;; add r10, r10, #0x40
|
||||
strb lr, [r1], #1 ; store the result
|
||||
usat r10, #8, r10, asr #7
|
||||
|
||||
ldrneb r9, [r0, #-1]
|
||||
strb r10, [r1], #1
|
||||
ldrneb r10, [r0], #2
|
||||
|
||||
bne width_loop_1st_only_6
|
||||
|
||||
ldr lr, [sp] ; load back output pitch
|
||||
ldr r12, [sp, #4] ; load back output pitch
|
||||
subs r7, r7, #1
|
||||
add r0, r0, r12 ; updata src for next loop
|
||||
|
||||
add r11, r12, r3 ; preload next low
|
||||
add r11, r11, #2
|
||||
pld [r0, r11]
|
||||
|
||||
add r1, r1, lr ; update dst for next loop
|
||||
|
||||
bne height_loop_1st_only_6
|
||||
|
||||
add sp, sp, #8
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
ENDP ; |vp9_filter_block2d_first_pass_only_armv6|
|
||||
|
||||
|
||||
;------------------------------------
|
||||
; r0 unsigned char *src_ptr,
|
||||
; r1 unsigned char *output_ptr,
|
||||
; r2 unsigned int src_pixels_per_line
|
||||
; r3 unsigned int cnt,
|
||||
; stack unsigned int output_pitch,
|
||||
; stack const short *vp9_filter
|
||||
;------------------------------------
|
||||
|vp9_filter_block2d_second_pass_only_armv6| PROC
|
||||
stmdb sp!, {r4 - r11, lr}
|
||||
|
||||
ldr r11, [sp, #40] ; VFilter address
|
||||
ldr r12, [sp, #36] ; output pitch
|
||||
|
||||
mov r7, r3, lsl #16 ; height is top part of counter
|
||||
sub r0, r0, r2, lsl #1 ; need 6 elements for filtering, 2 before, 3 after
|
||||
|
||||
sub sp, sp, #8
|
||||
|
||||
ldr r4, [r11] ; load up packed filter coefficients
|
||||
ldr r5, [r11, #4]
|
||||
ldr r6, [r11, #8]
|
||||
|
||||
str r0, [sp] ; save r0 to stack
|
||||
str r1, [sp, #4] ; save dst to stack
|
||||
|
||||
; six tap filter
|
||||
|width_loop_2nd_only_6|
|
||||
ldrb r8, [r0], r2 ; load data
|
||||
orr r7, r7, r3 ; loop counter
|
||||
ldrb r9, [r0], r2
|
||||
ldrb r10, [r0], r2
|
||||
|
||||
|height_loop_2nd_only_6|
|
||||
; filter first column in this inner loop, than, move to next colum.
|
||||
ldrb r11, [r0], r2
|
||||
|
||||
pkhbt lr, r8, r9, lsl #16 ; r9 | r8
|
||||
pkhbt r8, r9, r10, lsl #16 ; r10 | r9
|
||||
|
||||
ldrb r9, [r0], r2
|
||||
|
||||
smuad lr, lr, r4
|
||||
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
|
||||
smuad r8, r8, r4
|
||||
pkhbt r11, r11, r9, lsl #16 ; r9 | r11
|
||||
|
||||
smlad lr, r10, r5, lr
|
||||
ldrb r10, [r0], r2
|
||||
smlad r8, r11, r5, r8
|
||||
ldrb r11, [r0]
|
||||
|
||||
sub r7, r7, #2
|
||||
sub r0, r0, r2, lsl #2
|
||||
|
||||
pkhbt r9, r9, r10, lsl #16 ; r10 | r9
|
||||
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
|
||||
|
||||
smlad lr, r9, r6, lr
|
||||
smlad r10, r10, r6, r8
|
||||
|
||||
ands r9, r7, #0xff
|
||||
|
||||
add lr, lr, #0x40 ; round_shift_and_clamp
|
||||
ldrneb r8, [r0], r2 ; load data for next loop
|
||||
usat lr, #8, lr, asr #7
|
||||
add r10, r10, #0x40
|
||||
strb lr, [r1], r12 ; store the result for the column
|
||||
usat r10, #8, r10, asr #7
|
||||
|
||||
ldrneb r9, [r0], r2
|
||||
strb r10, [r1], r12
|
||||
ldrneb r10, [r0], r2
|
||||
|
||||
bne height_loop_2nd_only_6
|
||||
|
||||
ldr r0, [sp]
|
||||
ldr r1, [sp, #4]
|
||||
subs r7, r7, #0x10000
|
||||
add r0, r0, #1 ; move to filter next column
|
||||
str r0, [sp]
|
||||
add r1, r1, #1
|
||||
str r1, [sp, #4]
|
||||
|
||||
bne width_loop_2nd_only_6
|
||||
|
||||
add sp, sp, #8
|
||||
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
ENDP ; |vp9_filter_block2d_second_pass_only_armv6|
|
||||
|
||||
END
|
|
@ -1,345 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
; r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r14
|
||||
EXPORT |vp8_short_idct4x4llm_1_v6|
|
||||
EXPORT |vp8_short_idct4x4llm_v6|
|
||||
EXPORT |vp8_short_idct4x4llm_v6_scott|
|
||||
EXPORT |vp8_short_idct4x4llm_v6_dual|
|
||||
|
||||
AREA |.text|, CODE, READONLY
|
||||
|
||||
;********************************************************************************
|
||||
;* void short_idct4x4llm_1_v6(INT16 * input, INT16 * output, INT32 pitch)
|
||||
;* r0 INT16 * input
|
||||
;* r1 INT16 * output
|
||||
;* r2 INT32 pitch
|
||||
;* bench: 3/5
|
||||
;********************************************************************************
|
||||
|
||||
|vp8_short_idct4x4llm_1_v6| PROC ; cycles in out pit
|
||||
;
|
||||
ldrsh r0, [r0] ; load input[0] 1, r0 un 2
|
||||
add r0, r0, #4 ; 1 +4
|
||||
stmdb sp!, {r4, r5, lr} ; make room for wide writes 1 backup
|
||||
mov r0, r0, asr #3 ; (input[0] + 4) >> 3 1, r0 req`d ^1 >> 3
|
||||
pkhbt r4, r0, r0, lsl #16 ; pack r0 into r4 1, r0 req`d ^1 pack
|
||||
mov r5, r4 ; expand expand
|
||||
|
||||
strd r4, [r1], r2 ; *output = r0, post inc 1
|
||||
strd r4, [r1], r2 ; 1
|
||||
strd r4, [r1], r2 ; 1
|
||||
strd r4, [r1] ; 1
|
||||
;
|
||||
ldmia sp!, {r4, r5, pc} ; replace vars, return restore
|
||||
ENDP ; |vp8_short_idct4x4llm_1_v6|
|
||||
;********************************************************************************
|
||||
;********************************************************************************
|
||||
;********************************************************************************
|
||||
|
||||
;********************************************************************************
|
||||
;* void short_idct4x4llm_v6(INT16 * input, INT16 * output, INT32 pitch)
|
||||
;* r0 INT16 * input
|
||||
;* r1 INT16 * output
|
||||
;* r2 INT32 pitch
|
||||
;* bench:
|
||||
;********************************************************************************
|
||||
|
||||
|vp8_short_idct4x4llm_v6| PROC ; cycles in out pit
|
||||
;
|
||||
stmdb sp!, {r4-r11, lr} ; backup registers 1 backup
|
||||
;
|
||||
mov r4, #0x00004E00 ; 1 cst
|
||||
orr r4, r4, #0x0000007B ; cospi8sqrt2minus1
|
||||
mov r5, #0x00008A00 ; 1 cst
|
||||
orr r5, r5, #0x0000008C ; sinpi8sqrt2
|
||||
;
|
||||
mov r6, #4 ; i=4 1 i
|
||||
loop1 ;
|
||||
ldrsh r12, [r0, #8] ; input[4] 1, r12 unavail 2 [4]
|
||||
ldrsh r3, [r0, #24] ; input[12] 1, r3 unavail 2 [12]
|
||||
ldrsh r8, [r0, #16] ; input[8] 1, r8 unavail 2 [8]
|
||||
ldrsh r7, [r0], #0x2 ; input[0] 1, r7 unavail 2 ++ [0]
|
||||
smulwb r10, r5, r12 ; ([4] * sinpi8sqrt2) >> 16 1, r10 un 2, r12/r5 ^1 t1
|
||||
smulwb r11, r4, r3 ; ([12] * cospi8sqrt2minus1) >> 16 1, r11 un 2, r3/r4 ^1 t2
|
||||
add r9, r7, r8 ; a1 = [0] + [8] 1 a1
|
||||
sub r7, r7, r8 ; b1 = [0] - [8] 1 b1
|
||||
add r11, r3, r11 ; temp2 1
|
||||
rsb r11, r11, r10 ; c1 = temp1 - temp2 1 c1
|
||||
smulwb r3, r5, r3 ; ([12] * sinpi8sqrt2) >> 16 1, r3 un 2, r3/r5 ^ 1 t2
|
||||
smulwb r10, r4, r12 ; ([4] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r12/r4 ^1 t1
|
||||
add r8, r7, r11 ; b1 + c1 1 b+c
|
||||
strh r8, [r1, r2] ; out[pitch] = b1+c1 1
|
||||
sub r7, r7, r11 ; b1 - c1 1 b-c
|
||||
add r10, r12, r10 ; temp1 1
|
||||
add r3, r10, r3 ; d1 = temp1 + temp2 1 d1
|
||||
add r10, r9, r3 ; a1 + d1 1 a+d
|
||||
sub r3, r9, r3 ; a1 - d1 1 a-d
|
||||
add r8, r2, r2 ; pitch * 2 1 p*2
|
||||
strh r7, [r1, r8] ; out[pitch*2] = b1-c1 1
|
||||
add r7, r2, r2, lsl #1 ; pitch * 3 1 p*3
|
||||
strh r3, [r1, r7] ; out[pitch*3] = a1-d1 1
|
||||
subs r6, r6, #1 ; i-- 1 --
|
||||
strh r10, [r1], #0x2 ; out[0] = a1+d1 1 ++
|
||||
bne loop1 ; if i>0, continue
|
||||
;
|
||||
sub r1, r1, #8 ; set up out for next loop 1 -4
|
||||
; for this iteration, input=prev output
|
||||
mov r6, #4 ; i=4 1 i
|
||||
; b returnfull
|
||||
loop2 ;
|
||||
ldrsh r11, [r1, #2] ; input[1] 1, r11 un 2 [1]
|
||||
ldrsh r8, [r1, #6] ; input[3] 1, r8 un 2 [3]
|
||||
ldrsh r3, [r1, #4] ; input[2] 1, r3 un 2 [2]
|
||||
ldrsh r0, [r1] ; input[0] 1, r0 un 2 [0]
|
||||
smulwb r9, r5, r11 ; ([1] * sinpi8sqrt2) >> 16 1, r9 un 2, r5/r11 ^1 t1
|
||||
smulwb r10, r4, r8 ; ([3] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r8 ^1 t2
|
||||
add r7, r0, r3 ; a1 = [0] + [2] 1 a1
|
||||
sub r0, r0, r3 ; b1 = [0] - [2] 1 b1
|
||||
add r10, r8, r10 ; temp2 1
|
||||
rsb r9, r10, r9 ; c1 = temp1 - temp2 1 c1
|
||||
smulwb r8, r5, r8 ; ([3] * sinpi8sqrt2) >> 16 1, r8 un 2, r5/r8 ^1 t2
|
||||
smulwb r10, r4, r11 ; ([1] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r11 ^1 t1
|
||||
add r3, r0, r9 ; b1+c1 1 b+c
|
||||
add r3, r3, #4 ; b1+c1+4 1 +4
|
||||
add r10, r11, r10 ; temp1 1
|
||||
mov r3, r3, asr #3 ; b1+c1+4 >> 3 1, r3 ^1 >>3
|
||||
strh r3, [r1, #2] ; out[1] = b1+c1 1
|
||||
add r10, r10, r8 ; d1 = temp1 + temp2 1 d1
|
||||
add r3, r7, r10 ; a1+d1 1 a+d
|
||||
add r3, r3, #4 ; a1+d1+4 1 +4
|
||||
sub r7, r7, r10 ; a1-d1 1 a-d
|
||||
add r7, r7, #4 ; a1-d1+4 1 +4
|
||||
mov r3, r3, asr #3 ; a1+d1+4 >> 3 1, r3 ^1 >>3
|
||||
mov r7, r7, asr #3 ; a1-d1+4 >> 3 1, r7 ^1 >>3
|
||||
strh r7, [r1, #6] ; out[3] = a1-d1 1
|
||||
sub r0, r0, r9 ; b1-c1 1 b-c
|
||||
add r0, r0, #4 ; b1-c1+4 1 +4
|
||||
subs r6, r6, #1 ; i-- 1 --
|
||||
mov r0, r0, asr #3 ; b1-c1+4 >> 3 1, r0 ^1 >>3
|
||||
strh r0, [r1, #4] ; out[2] = b1-c1 1
|
||||
strh r3, [r1], r2 ; out[0] = a1+d1 1
|
||||
; add r1, r1, r2 ; out += pitch 1 ++
|
||||
bne loop2 ; if i>0, continue
|
||||
returnfull ;
|
||||
ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
|
||||
ENDP
|
||||
|
||||
;********************************************************************************
|
||||
;********************************************************************************
|
||||
;********************************************************************************
|
||||
|
||||
;********************************************************************************
|
||||
;* void short_idct4x4llm_v6_scott(INT16 * input, INT16 * output, INT32 pitch)
|
||||
;* r0 INT16 * input
|
||||
;* r1 INT16 * output
|
||||
;* r2 INT32 pitch
|
||||
;* bench:
|
||||
;********************************************************************************
|
||||
|
||||
|vp8_short_idct4x4llm_v6_scott| PROC ; cycles in out pit
|
||||
; mov r0, #0 ;
|
||||
; ldr r0, [r0] ;
|
||||
stmdb sp!, {r4 - r11, lr} ; backup registers 1 backup
|
||||
;
|
||||
mov r3, #0x00004E00 ; cos
|
||||
orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
|
||||
mov r4, #0x00008A00 ; sin
|
||||
orr r4, r4, #0x0000008C ; sinpi8sqrt2
|
||||
;
|
||||
mov r5, #0x2 ; i i
|
||||
;
|
||||
short_idct4x4llm_v6_scott_loop1 ;
|
||||
ldr r10, [r0, #(4*2)] ; i5 | i4 5,4
|
||||
ldr r11, [r0, #(12*2)] ; i13 | i12 13,12
|
||||
;
|
||||
smulwb r6, r4, r10 ; ((ip[4] * sinpi8sqrt2) >> 16) lt1
|
||||
smulwb r7, r3, r11 ; ((ip[12] * cospi8sqrt2minus1) >> 16) lt2
|
||||
;
|
||||
smulwb r12, r3, r10 ; ((ip[4] * cospi8sqrt2misu1) >> 16) l2t2
|
||||
smulwb r14, r4, r11 ; ((ip[12] * sinpi8sqrt2) >> 16) l2t1
|
||||
;
|
||||
add r6, r6, r7 ; partial c1 lt1-lt2
|
||||
add r12, r12, r14 ; partial d1 l2t2+l2t1
|
||||
;
|
||||
smulwt r14, r4, r10 ; ((ip[5] * sinpi8sqrt2) >> 16) ht1
|
||||
smulwt r7, r3, r11 ; ((ip[13] * cospi8sqrt2minus1) >> 16) ht2
|
||||
;
|
||||
smulwt r8, r3, r10 ; ((ip[5] * cospi8sqrt2minus1) >> 16) h2t1
|
||||
smulwt r9, r4, r11 ; ((ip[13] * sinpi8sqrt2) >> 16) h2t2
|
||||
;
|
||||
add r7, r14, r7 ; partial c1_2 ht1+ht2
|
||||
sub r8, r8, r9 ; partial d1_2 h2t1-h2t2
|
||||
;
|
||||
pkhbt r6, r6, r7, lsl #16 ; partial c1_2 | partial c1_1 pack
|
||||
pkhbt r12, r12, r8, lsl #16 ; partial d1_2 | partial d1_1 pack
|
||||
;
|
||||
usub16 r6, r6, r10 ; c1_2 | c1_1 c
|
||||
uadd16 r12, r12, r11 ; d1_2 | d1_1 d
|
||||
;
|
||||
ldr r10, [r0, #0] ; i1 | i0 1,0
|
||||
ldr r11, [r0, #(8*2)] ; i9 | i10 9,10
|
||||
;
|
||||
;;;;;; add r0, r0, #0x4 ; +4
|
||||
;;;;;; add r1, r1, #0x4 ; +4
|
||||
;
|
||||
uadd16 r8, r10, r11 ; i1 + i9 | i0 + i8 aka a1 a
|
||||
usub16 r9, r10, r11 ; i1 - i9 | i0 - i8 aka b1 b
|
||||
;
|
||||
uadd16 r7, r8, r12 ; a1 + d1 pair a+d
|
||||
usub16 r14, r8, r12 ; a1 - d1 pair a-d
|
||||
;
|
||||
str r7, [r1] ; op[0] = a1 + d1
|
||||
str r14, [r1, r2] ; op[pitch*3] = a1 - d1
|
||||
;
|
||||
add r0, r0, #0x4 ; op[pitch] = b1 + c1 ++
|
||||
add r1, r1, #0x4 ; op[pitch*2] = b1 - c1 ++
|
||||
;
|
||||
subs r5, r5, #0x1 ; --
|
||||
bne short_idct4x4llm_v6_scott_loop1 ;
|
||||
;
|
||||
sub r1, r1, #16 ; reset output ptr
|
||||
mov r5, #0x4 ;
|
||||
mov r0, r1 ; input = output
|
||||
;
|
||||
short_idct4x4llm_v6_scott_loop2 ;
|
||||
;
|
||||
subs r5, r5, #0x1 ;
|
||||
bne short_idct4x4llm_v6_scott_loop2 ;
|
||||
;
|
||||
ldmia sp!, {r4 - r11, pc} ;
|
||||
ENDP ;
|
||||
;
|
||||
;********************************************************************************
|
||||
;********************************************************************************
|
||||
;********************************************************************************
|
||||
|
||||
;********************************************************************************
|
||||
;* void short_idct4x4llm_v6_dual(INT16 * input, INT16 * output, INT32 pitch)
|
||||
;* r0 INT16 * input
|
||||
;* r1 INT16 * output
|
||||
;* r2 INT32 pitch
|
||||
;* bench:
|
||||
;********************************************************************************
|
||||
|
||||
|vp8_short_idct4x4llm_v6_dual| PROC ; cycles in out pit
|
||||
;
|
||||
stmdb sp!, {r4-r11, lr} ; backup registers 1 backup
|
||||
mov r3, #0x00004E00 ; cos
|
||||
orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
|
||||
mov r4, #0x00008A00 ; sin
|
||||
orr r4, r4, #0x0000008C ; sinpi8sqrt2
|
||||
mov r5, #0x2 ; i=2 i
|
||||
loop1_dual
|
||||
ldr r6, [r0, #(4*2)] ; i5 | i4 5|4
|
||||
ldr r12, [r0, #(12*2)] ; i13 | i12 13|12
|
||||
ldr r14, [r0, #(8*2)] ; i9 | i8 9|8
|
||||
|
||||
smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
|
||||
smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 4c
|
||||
smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
|
||||
smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 4s
|
||||
pkhbt r7, r7, r9, lsl #16 ; 5c | 4c
|
||||
smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 13c
|
||||
pkhbt r8, r8, r10, lsl #16 ; 5s | 4s
|
||||
uadd16 r6, r6, r7 ; 5c+5 | 4c+4
|
||||
smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 13s
|
||||
smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 12c
|
||||
smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 12s
|
||||
subs r5, r5, #0x1 ; i-- --
|
||||
pkhbt r9, r9, r11, lsl #16 ; 13c | 12c
|
||||
ldr r11, [r0], #0x4 ; i1 | i0 ++ 1|0
|
||||
pkhbt r10, r10, r7, lsl #16 ; 13s | 12s
|
||||
uadd16 r7, r12, r9 ; 13c+13 | 12c+12
|
||||
usub16 r7, r8, r7 ; c c
|
||||
uadd16 r6, r6, r10 ; d d
|
||||
uadd16 r10, r11, r14 ; a a
|
||||
usub16 r8, r11, r14 ; b b
|
||||
uadd16 r9, r10, r6 ; a+d a+d
|
||||
usub16 r10, r10, r6 ; a-d a-d
|
||||
uadd16 r6, r8, r7 ; b+c b+c
|
||||
usub16 r7, r8, r7 ; b-c b-c
|
||||
str r6, [r1, r2] ; o5 | o4
|
||||
add r6, r2, r2 ; pitch * 2 p2
|
||||
str r7, [r1, r6] ; o9 | o8
|
||||
add r6, r6, r2 ; pitch * 3 p3
|
||||
str r10, [r1, r6] ; o13 | o12
|
||||
str r9, [r1], #0x4 ; o1 | o0 ++
|
||||
bne loop1_dual ;
|
||||
mov r5, #0x2 ; i=2 i
|
||||
sub r0, r1, #8 ; reset input/output i/o
|
||||
loop2_dual
|
||||
ldr r6, [r0, r2] ; i5 | i4 5|4
|
||||
ldr r1, [r0] ; i1 | i0 1|0
|
||||
ldr r12, [r0, #0x4] ; i3 | i2 3|2
|
||||
add r14, r2, #0x4 ; pitch + 2 p+2
|
||||
ldr r14, [r0, r14] ; i7 | i6 7|6
|
||||
smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
|
||||
smulwt r7, r3, r1 ; (ip[1] * cospi8sqrt2minus1) >> 16 1c
|
||||
smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
|
||||
smulwt r8, r4, r1 ; (ip[1] * sinpi8sqrt2) >> 16 1s
|
||||
pkhbt r11, r6, r1, lsl #16 ; i0 | i4 0|4
|
||||
pkhbt r7, r9, r7, lsl #16 ; 1c | 5c
|
||||
pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 © tc1
|
||||
pkhtb r1, r1, r6, asr #16 ; i1 | i5 1|5
|
||||
uadd16 r1, r7, r1 ; 1c+1 | 5c+5 = temp2 (d) td2
|
||||
pkhbt r9, r14, r12, lsl #16 ; i2 | i6 2|6
|
||||
uadd16 r10, r11, r9 ; a a
|
||||
usub16 r9, r11, r9 ; b b
|
||||
pkhtb r6, r12, r14, asr #16 ; i3 | i7 3|7
|
||||
subs r5, r5, #0x1 ; i-- --
|
||||
smulwt r7, r3, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16 3c
|
||||
smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 3s
|
||||
smulwb r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 7c
|
||||
smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 7s
|
||||
|
||||
pkhbt r7, r12, r7, lsl #16 ; 3c | 7c
|
||||
pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 (d) td1
|
||||
uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 (c) tc2
|
||||
usub16 r12, r8, r6 ; c (o1 | o5) c
|
||||
uadd16 r6, r11, r1 ; d (o3 | o7) d
|
||||
uadd16 r7, r10, r6 ; a+d a+d
|
||||
mov r8, #0x4 ; set up 4's 4
|
||||
orr r8, r8, #0x40000 ; 4|4
|
||||
usub16 r6, r10, r6 ; a-d a-d
|
||||
uadd16 r6, r6, r8 ; a-d+4 3|7
|
||||
uadd16 r7, r7, r8 ; a+d+4 0|4
|
||||
uadd16 r10, r9, r12 ; b+c b+c
|
||||
usub16 r1, r9, r12 ; b-c b-c
|
||||
uadd16 r10, r10, r8 ; b+c+4 1|5
|
||||
uadd16 r1, r1, r8 ; b-c+4 2|6
|
||||
mov r8, r10, asr #19 ; o1 >> 3
|
||||
strh r8, [r0, #2] ; o1
|
||||
mov r8, r1, asr #19 ; o2 >> 3
|
||||
strh r8, [r0, #4] ; o2
|
||||
mov r8, r6, asr #19 ; o3 >> 3
|
||||
strh r8, [r0, #6] ; o3
|
||||
mov r8, r7, asr #19 ; o0 >> 3
|
||||
strh r8, [r0], r2 ; o0 +p
|
||||
sxth r10, r10 ;
|
||||
mov r8, r10, asr #3 ; o5 >> 3
|
||||
strh r8, [r0, #2] ; o5
|
||||
sxth r1, r1 ;
|
||||
mov r8, r1, asr #3 ; o6 >> 3
|
||||
strh r8, [r0, #4] ; o6
|
||||
sxth r6, r6 ;
|
||||
mov r8, r6, asr #3 ; o7 >> 3
|
||||
strh r8, [r0, #6] ; o7
|
||||
sxth r7, r7 ;
|
||||
mov r8, r7, asr #3 ; o4 >> 3
|
||||
strh r8, [r0], r2 ; o4 +p
|
||||
;;;;; subs r5, r5, #0x1 ; i-- --
|
||||
bne loop2_dual ;
|
||||
;
|
||||
ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
|
||||
ENDP
|
||||
|
||||
END
|
|
@ -1,152 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
EXPORT |vp8_short_inv_walsh4x4_v6|
|
||||
EXPORT |vp8_short_inv_walsh4x4_1_v6|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA |.text|, CODE, READONLY ; name this block of code
|
||||
|
||||
;short vp8_short_inv_walsh4x4_v6(short *input, short *output)
|
||||
|vp8_short_inv_walsh4x4_v6| PROC
|
||||
|
||||
stmdb sp!, {r4 - r11, lr}
|
||||
|
||||
ldr r2, [r0], #4 ; [1 | 0]
|
||||
ldr r3, [r0], #4 ; [3 | 2]
|
||||
ldr r4, [r0], #4 ; [5 | 4]
|
||||
ldr r5, [r0], #4 ; [7 | 6]
|
||||
ldr r6, [r0], #4 ; [9 | 8]
|
||||
ldr r7, [r0], #4 ; [11 | 10]
|
||||
ldr r8, [r0], #4 ; [13 | 12]
|
||||
ldr r9, [r0] ; [15 | 14]
|
||||
|
||||
qadd16 r10, r2, r8 ; a1 [1+13 | 0+12]
|
||||
qadd16 r11, r4, r6 ; b1 [5+9 | 4+8]
|
||||
qsub16 r12, r4, r6 ; c1 [5-9 | 4-8]
|
||||
qsub16 lr, r2, r8 ; d1 [1-13 | 0-12]
|
||||
|
||||
qadd16 r2, r10, r11 ; a1 + b1 [1 | 0]
|
||||
qadd16 r4, r12, lr ; c1 + d1 [5 | 4]
|
||||
qsub16 r6, r10, r11 ; a1 - b1 [9 | 8]
|
||||
qsub16 r8, lr, r12 ; d1 - c1 [13 | 12]
|
||||
|
||||
qadd16 r10, r3, r9 ; a1 [3+15 | 2+14]
|
||||
qadd16 r11, r5, r7 ; b1 [7+11 | 6+10]
|
||||
qsub16 r12, r5, r7 ; c1 [7-11 | 6-10]
|
||||
qsub16 lr, r3, r9 ; d1 [3-15 | 2-14]
|
||||
|
||||
qadd16 r3, r10, r11 ; a1 + b1 [3 | 2]
|
||||
qadd16 r5, r12, lr ; c1 + d1 [7 | 6]
|
||||
qsub16 r7, r10, r11 ; a1 - b1 [11 | 10]
|
||||
qsub16 r9, lr, r12 ; d1 - c1 [15 | 14]
|
||||
|
||||
; first transform complete
|
||||
|
||||
qsubaddx r10, r2, r3 ; [c1|a1] [1-2 | 0+3]
|
||||
qaddsubx r11, r2, r3 ; [b1|d1] [1+2 | 0-3]
|
||||
qsubaddx r12, r4, r5 ; [c1|a1] [5-6 | 4+7]
|
||||
qaddsubx lr, r4, r5 ; [b1|d1] [5+6 | 4-7]
|
||||
|
||||
qaddsubx r2, r10, r11 ; [b2|c2] [c1+d1 | a1-b1]
|
||||
qaddsubx r3, r11, r10 ; [a2|d2] [b1+a1 | d1-c1]
|
||||
ldr r10, c0x00030003
|
||||
qaddsubx r4, r12, lr ; [b2|c2] [c1+d1 | a1-b1]
|
||||
qaddsubx r5, lr, r12 ; [a2|d2] [b1+a1 | d1-c1]
|
||||
|
||||
qadd16 r2, r2, r10 ; [b2+3|c2+3]
|
||||
qadd16 r3, r3, r10 ; [a2+3|d2+3]
|
||||
qadd16 r4, r4, r10 ; [b2+3|c2+3]
|
||||
qadd16 r5, r5, r10 ; [a2+3|d2+3]
|
||||
|
||||
asr r12, r2, #3 ; [1 | x]
|
||||
pkhtb r12, r12, r3, asr #19; [1 | 0]
|
||||
lsl lr, r3, #16 ; [~3 | x]
|
||||
lsl r2, r2, #16 ; [~2 | x]
|
||||
asr lr, lr, #3 ; [3 | x]
|
||||
pkhtb lr, lr, r2, asr #19 ; [3 | 2]
|
||||
|
||||
asr r2, r4, #3 ; [5 | x]
|
||||
pkhtb r2, r2, r5, asr #19 ; [5 | 4]
|
||||
lsl r3, r5, #16 ; [~7 | x]
|
||||
lsl r4, r4, #16 ; [~6 | x]
|
||||
asr r3, r3, #3 ; [7 | x]
|
||||
pkhtb r3, r3, r4, asr #19 ; [7 | 6]
|
||||
|
||||
str r12, [r1], #4
|
||||
str lr, [r1], #4
|
||||
str r2, [r1], #4
|
||||
str r3, [r1], #4
|
||||
|
||||
qsubaddx r2, r6, r7 ; [c1|a1] [9-10 | 8+11]
|
||||
qaddsubx r3, r6, r7 ; [b1|d1] [9+10 | 8-11]
|
||||
qsubaddx r4, r8, r9 ; [c1|a1] [13-14 | 12+15]
|
||||
qaddsubx r5, r8, r9 ; [b1|d1] [13+14 | 12-15]
|
||||
|
||||
qaddsubx r6, r2, r3 ; [b2|c2] [c1+d1 | a1-b1]
|
||||
qaddsubx r7, r3, r2 ; [a2|d2] [b1+a1 | d1-c1]
|
||||
qaddsubx r8, r4, r5 ; [b2|c2] [c1+d1 | a1-b1]
|
||||
qaddsubx r9, r5, r4 ; [a2|d2] [b1+a1 | d1-c1]
|
||||
|
||||
qadd16 r6, r6, r10 ; [b2+3|c2+3]
|
||||
qadd16 r7, r7, r10 ; [a2+3|d2+3]
|
||||
qadd16 r8, r8, r10 ; [b2+3|c2+3]
|
||||
qadd16 r9, r9, r10 ; [a2+3|d2+3]
|
||||
|
||||
asr r2, r6, #3 ; [9 | x]
|
||||
pkhtb r2, r2, r7, asr #19 ; [9 | 8]
|
||||
lsl r3, r7, #16 ; [~11| x]
|
||||
lsl r4, r6, #16 ; [~10| x]
|
||||
asr r3, r3, #3 ; [11 | x]
|
||||
pkhtb r3, r3, r4, asr #19 ; [11 | 10]
|
||||
|
||||
asr r4, r8, #3 ; [13 | x]
|
||||
pkhtb r4, r4, r9, asr #19 ; [13 | 12]
|
||||
lsl r5, r9, #16 ; [~15| x]
|
||||
lsl r6, r8, #16 ; [~14| x]
|
||||
asr r5, r5, #3 ; [15 | x]
|
||||
pkhtb r5, r5, r6, asr #19 ; [15 | 14]
|
||||
|
||||
str r2, [r1], #4
|
||||
str r3, [r1], #4
|
||||
str r4, [r1], #4
|
||||
str r5, [r1]
|
||||
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
ENDP ; |vp8_short_inv_walsh4x4_v6|
|
||||
|
||||
|
||||
;short vp8_short_inv_walsh4x4_1_v6(short *input, short *output)
|
||||
|vp8_short_inv_walsh4x4_1_v6| PROC
|
||||
|
||||
ldrsh r2, [r0] ; [0]
|
||||
add r2, r2, #3 ; [0] + 3
|
||||
asr r2, r2, #3 ; a1 ([0]+3) >> 3
|
||||
lsl r2, r2, #16 ; [a1 | x]
|
||||
orr r2, r2, r2, lsr #16 ; [a1 | a1]
|
||||
|
||||
str r2, [r1], #4
|
||||
str r2, [r1], #4
|
||||
str r2, [r1], #4
|
||||
str r2, [r1], #4
|
||||
str r2, [r1], #4
|
||||
str r2, [r1], #4
|
||||
str r2, [r1], #4
|
||||
str r2, [r1]
|
||||
|
||||
bx lr
|
||||
ENDP ; |vp8_short_inv_walsh4x4_1_v6|
|
||||
|
||||
; Constant Pool
|
||||
c0x00030003 DCD 0x00030003
|
||||
END
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -1,281 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_recon_b_armv6|
|
||||
EXPORT |vp8_recon2b_armv6|
|
||||
EXPORT |vp8_recon4b_armv6|
|
||||
|
||||
AREA |.text|, CODE, READONLY ; name this block of code
|
||||
prd RN r0
|
||||
dif RN r1
|
||||
dst RN r2
|
||||
stride RN r3
|
||||
|
||||
;void recon_b(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride)
|
||||
; R0 char* pred_ptr
|
||||
; R1 short * dif_ptr
|
||||
; R2 char * dst_ptr
|
||||
; R3 int stride
|
||||
|
||||
; Description:
|
||||
; Loop through the block adding the Pred and Diff together. Clamp and then
|
||||
; store back into the Dst.
|
||||
|
||||
; Restrictions :
|
||||
; all buffers are expected to be 4 byte aligned coming in and
|
||||
; going out.
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
;
|
||||
;
|
||||
;
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
|vp8_recon_b_armv6| PROC
|
||||
stmdb sp!, {r4 - r9, lr}
|
||||
|
||||
;0, 1, 2, 3
|
||||
ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
|
||||
ldr r6, [dif, #0] ; 1 | 0
|
||||
ldr r7, [dif, #4] ; 3 | 2
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16 ; 2 | 0
|
||||
pkhtb r9, r7, r6, asr #16 ; 3 | 1
|
||||
|
||||
uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
|
||||
uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
|
||||
|
||||
usat16 r8, #8, r8
|
||||
usat16 r9, #8, r9
|
||||
add dif, dif, #32
|
||||
orr r8, r8, r9, lsl #8
|
||||
|
||||
str r8, [dst], stride
|
||||
|
||||
;0, 1, 2, 3
|
||||
ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
|
||||
;; ldr r6, [dif, #8] ; 1 | 0
|
||||
;; ldr r7, [dif, #12] ; 3 | 2
|
||||
ldr r6, [dif, #0] ; 1 | 0
|
||||
ldr r7, [dif, #4] ; 3 | 2
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16 ; 2 | 0
|
||||
pkhtb r9, r7, r6, asr #16 ; 3 | 1
|
||||
|
||||
uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
|
||||
uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
|
||||
|
||||
usat16 r8, #8, r8
|
||||
usat16 r9, #8, r9
|
||||
add dif, dif, #32
|
||||
orr r8, r8, r9, lsl #8
|
||||
|
||||
str r8, [dst], stride
|
||||
|
||||
;0, 1, 2, 3
|
||||
ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
|
||||
;; ldr r6, [dif, #16] ; 1 | 0
|
||||
;; ldr r7, [dif, #20] ; 3 | 2
|
||||
ldr r6, [dif, #0] ; 1 | 0
|
||||
ldr r7, [dif, #4] ; 3 | 2
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16 ; 2 | 0
|
||||
pkhtb r9, r7, r6, asr #16 ; 3 | 1
|
||||
|
||||
uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
|
||||
uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
|
||||
|
||||
usat16 r8, #8, r8
|
||||
usat16 r9, #8, r9
|
||||
add dif, dif, #32
|
||||
orr r8, r8, r9, lsl #8
|
||||
|
||||
str r8, [dst], stride
|
||||
|
||||
;0, 1, 2, 3
|
||||
ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
|
||||
;; ldr r6, [dif, #24] ; 1 | 0
|
||||
;; ldr r7, [dif, #28] ; 3 | 2
|
||||
ldr r6, [dif, #0] ; 1 | 0
|
||||
ldr r7, [dif, #4] ; 3 | 2
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16 ; 2 | 0
|
||||
pkhtb r9, r7, r6, asr #16 ; 3 | 1
|
||||
|
||||
uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
|
||||
uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
|
||||
|
||||
usat16 r8, #8, r8
|
||||
usat16 r9, #8, r9
|
||||
orr r8, r8, r9, lsl #8
|
||||
|
||||
str r8, [dst], stride
|
||||
|
||||
ldmia sp!, {r4 - r9, pc}
|
||||
|
||||
ENDP ; |recon_b|
|
||||
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
;
|
||||
;
|
||||
;
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
; R0 char *pred_ptr
|
||||
; R1 short *dif_ptr
|
||||
; R2 char *dst_ptr
|
||||
; R3 int stride
|
||||
|vp8_recon4b_armv6| PROC
|
||||
stmdb sp!, {r4 - r9, lr}
|
||||
|
||||
mov lr, #4
|
||||
|
||||
recon4b_loop
|
||||
;0, 1, 2, 3
|
||||
ldr r4, [prd], #4 ; 3 | 2 | 1 | 0
|
||||
ldr r6, [dif, #0] ; 1 | 0
|
||||
ldr r7, [dif, #4] ; 3 | 2
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16 ; 2 | 0
|
||||
pkhtb r9, r7, r6, asr #16 ; 3 | 1
|
||||
|
||||
uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
|
||||
uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
|
||||
|
||||
usat16 r8, #8, r8
|
||||
usat16 r9, #8, r9
|
||||
orr r8, r8, r9, lsl #8
|
||||
|
||||
str r8, [dst]
|
||||
|
||||
;4, 5, 6, 7
|
||||
ldr r4, [prd], #4
|
||||
;; ldr r6, [dif, #32]
|
||||
;; ldr r7, [dif, #36]
|
||||
ldr r6, [dif, #8]
|
||||
ldr r7, [dif, #12]
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16
|
||||
pkhtb r9, r7, r6, asr #16
|
||||
|
||||
uxtab16 r8, r8, r4
|
||||
uxtab16 r9, r9, r4, ror #8
|
||||
usat16 r8, #8, r8
|
||||
usat16 r9, #8, r9
|
||||
orr r8, r8, r9, lsl #8
|
||||
|
||||
str r8, [dst, #4]
|
||||
|
||||
;8, 9, 10, 11
|
||||
ldr r4, [prd], #4
|
||||
;; ldr r6, [dif, #64]
|
||||
;; ldr r7, [dif, #68]
|
||||
ldr r6, [dif, #16]
|
||||
ldr r7, [dif, #20]
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16
|
||||
pkhtb r9, r7, r6, asr #16
|
||||
|
||||
uxtab16 r8, r8, r4
|
||||
uxtab16 r9, r9, r4, ror #8
|
||||
usat16 r8, #8, r8
|
||||
usat16 r9, #8, r9
|
||||
orr r8, r8, r9, lsl #8
|
||||
|
||||
str r8, [dst, #8]
|
||||
|
||||
;12, 13, 14, 15
|
||||
ldr r4, [prd], #4
|
||||
;; ldr r6, [dif, #96]
|
||||
;; ldr r7, [dif, #100]
|
||||
ldr r6, [dif, #24]
|
||||
ldr r7, [dif, #28]
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16
|
||||
pkhtb r9, r7, r6, asr #16
|
||||
|
||||
uxtab16 r8, r8, r4
|
||||
uxtab16 r9, r9, r4, ror #8
|
||||
usat16 r8, #8, r8
|
||||
usat16 r9, #8, r9
|
||||
orr r8, r8, r9, lsl #8
|
||||
|
||||
str r8, [dst, #12]
|
||||
|
||||
add dst, dst, stride
|
||||
;; add dif, dif, #8
|
||||
add dif, dif, #32
|
||||
|
||||
subs lr, lr, #1
|
||||
bne recon4b_loop
|
||||
|
||||
ldmia sp!, {r4 - r9, pc}
|
||||
|
||||
ENDP ; |Recon4B|
|
||||
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
;
|
||||
;
|
||||
;
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
; R0 char *pred_ptr
|
||||
; R1 short *dif_ptr
|
||||
; R2 char *dst_ptr
|
||||
; R3 int stride
|
||||
|vp8_recon2b_armv6| PROC
|
||||
stmdb sp!, {r4 - r9, lr}
|
||||
|
||||
mov lr, #4
|
||||
|
||||
recon2b_loop
|
||||
;0, 1, 2, 3
|
||||
ldr r4, [prd], #4
|
||||
ldr r6, [dif, #0]
|
||||
ldr r7, [dif, #4]
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16
|
||||
pkhtb r9, r7, r6, asr #16
|
||||
|
||||
uxtab16 r8, r8, r4
|
||||
uxtab16 r9, r9, r4, ror #8
|
||||
usat16 r8, #8, r8
|
||||
usat16 r9, #8, r9
|
||||
orr r8, r8, r9, lsl #8
|
||||
|
||||
str r8, [dst]
|
||||
|
||||
;4, 5, 6, 7
|
||||
ldr r4, [prd], #4
|
||||
;; ldr r6, [dif, #32]
|
||||
;; ldr r7, [dif, #36]
|
||||
ldr r6, [dif, #8]
|
||||
ldr r7, [dif, #12]
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16
|
||||
pkhtb r9, r7, r6, asr #16
|
||||
|
||||
uxtab16 r8, r8, r4
|
||||
uxtab16 r9, r9, r4, ror #8
|
||||
usat16 r8, #8, r8
|
||||
usat16 r9, #8, r9
|
||||
orr r8, r8, r9, lsl #8
|
||||
|
||||
str r8, [dst, #4]
|
||||
|
||||
add dst, dst, stride
|
||||
;; add dif, dif, #8
|
||||
add dif, dif, #16
|
||||
|
||||
subs lr, lr, #1
|
||||
bne recon2b_loop
|
||||
|
||||
ldmia sp!, {r4 - r9, pc}
|
||||
|
||||
ENDP ; |Recon2B|
|
||||
|
||||
END
|
|
@ -1,286 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp9_loop_filter_simple_horizontal_edge_armv6|
|
||||
EXPORT |vp9_loop_filter_simple_vertical_edge_armv6|
|
||||
|
||||
AREA |.text|, CODE, READONLY ; name this block of code
|
||||
|
||||
MACRO
|
||||
TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
|
||||
; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
|
||||
; a0: 03 02 01 00
|
||||
; a1: 13 12 11 10
|
||||
; a2: 23 22 21 20
|
||||
; a3: 33 32 31 30
|
||||
; b3 b2 b1 b0
|
||||
|
||||
uxtb16 $b1, $a1 ; xx 12 xx 10
|
||||
uxtb16 $b0, $a0 ; xx 02 xx 00
|
||||
uxtb16 $b3, $a3 ; xx 32 xx 30
|
||||
uxtb16 $b2, $a2 ; xx 22 xx 20
|
||||
orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00
|
||||
orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20
|
||||
|
||||
uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11
|
||||
uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31
|
||||
uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01
|
||||
uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21
|
||||
orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01
|
||||
orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21
|
||||
|
||||
pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1
|
||||
pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3
|
||||
|
||||
pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0
|
||||
pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2
|
||||
MEND
|
||||
|
||||
|
||||
|
||||
src RN r0
|
||||
pstep RN r1
|
||||
|
||||
;r0 unsigned char *src_ptr,
|
||||
;r1 int src_pixel_step,
|
||||
;r2 const char *blimit
|
||||
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|
||||
|vp9_loop_filter_simple_horizontal_edge_armv6| PROC
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|
||||
stmdb sp!, {r4 - r11, lr}
|
||||
|
||||
ldrb r12, [r2] ; blimit
|
||||
ldr r3, [src, -pstep, lsl #1] ; p1
|
||||
ldr r4, [src, -pstep] ; p0
|
||||
ldr r5, [src] ; q0
|
||||
ldr r6, [src, pstep] ; q1
|
||||
orr r12, r12, r12, lsl #8 ; blimit
|
||||
ldr r2, c0x80808080
|
||||
orr r12, r12, r12, lsl #16 ; blimit
|
||||
mov r9, #4 ; double the count. we're doing 4 at a time
|
||||
mov lr, #0 ; need 0 in a couple places
|
||||
|
||||
|simple_hnext8|
|
||||
; vp8_simple_filter_mask()
|
||||
|
||||
uqsub8 r7, r3, r6 ; p1 - q1
|
||||
uqsub8 r8, r6, r3 ; q1 - p1
|
||||
uqsub8 r10, r4, r5 ; p0 - q0
|
||||
uqsub8 r11, r5, r4 ; q0 - p0
|
||||
orr r8, r8, r7 ; abs(p1 - q1)
|
||||
orr r10, r10, r11 ; abs(p0 - q0)
|
||||
uqadd8 r10, r10, r10 ; abs(p0 - q0) * 2
|
||||
uhadd8 r8, r8, lr ; abs(p1 - q2) >> 1
|
||||
uqadd8 r10, r10, r8 ; abs(p0 - q0)*2 + abs(p1 - q1)/2
|
||||
mvn r8, #0
|
||||
usub8 r10, r12, r10 ; compare to flimit. usub8 sets GE flags
|
||||
sel r10, r8, lr ; filter mask: F or 0
|
||||
cmp r10, #0
|
||||
beq simple_hskip_filter ; skip filtering if all masks are 0x00
|
||||
|
||||
;vp8_simple_filter()
|
||||
|
||||
eor r3, r3, r2 ; p1 offset to convert to a signed value
|
||||
eor r6, r6, r2 ; q1 offset to convert to a signed value
|
||||
eor r4, r4, r2 ; p0 offset to convert to a signed value
|
||||
eor r5, r5, r2 ; q0 offset to convert to a signed value
|
||||
|
||||
qsub8 r3, r3, r6 ; vp9_filter = p1 - q1
|
||||
qsub8 r6, r5, r4 ; q0 - p0
|
||||
qadd8 r3, r3, r6 ; += q0 - p0
|
||||
ldr r7, c0x04040404
|
||||
qadd8 r3, r3, r6 ; += q0 - p0
|
||||
ldr r8, c0x03030303
|
||||
qadd8 r3, r3, r6 ; vp9_filter = p1-q1 + 3*(q0-p0))
|
||||
;STALL
|
||||
and r3, r3, r10 ; vp9_filter &= mask
|
||||
|
||||
qadd8 r7 , r3 , r7 ; Filter1 = vp9_filter + 4
|
||||
qadd8 r8 , r3 , r8 ; Filter2 = vp9_filter + 3
|
||||
|
||||
shadd8 r7 , r7 , lr
|
||||
shadd8 r8 , r8 , lr
|
||||
shadd8 r7 , r7 , lr
|
||||
shadd8 r8 , r8 , lr
|
||||
shadd8 r7 , r7 , lr ; Filter1 >>= 3
|
||||
shadd8 r8 , r8 , lr ; Filter2 >>= 3
|
||||
|
||||
qsub8 r5 ,r5, r7 ; u = q0 - Filter1
|
||||
qadd8 r4, r4, r8 ; u = p0 + Filter2
|
||||
eor r5, r5, r2 ; *oq0 = u^0x80
|
||||
str r5, [src] ; store oq0 result
|
||||
eor r4, r4, r2 ; *op0 = u^0x80
|
||||
str r4, [src, -pstep] ; store op0 result
|
||||
|
||||
|simple_hskip_filter|
|
||||
subs r9, r9, #1
|
||||
addne src, src, #4 ; next row
|
||||
|
||||
ldrne r3, [src, -pstep, lsl #1] ; p1
|
||||
ldrne r4, [src, -pstep] ; p0
|
||||
ldrne r5, [src] ; q0
|
||||
ldrne r6, [src, pstep] ; q1
|
||||
|
||||
bne simple_hnext8
|
||||
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
ENDP ; |vp9_loop_filter_simple_horizontal_edge_armv6|
|
||||
|
||||
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|
||||
|vp9_loop_filter_simple_vertical_edge_armv6| PROC
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|
||||
stmdb sp!, {r4 - r11, lr}
|
||||
|
||||
ldrb r12, [r2] ; r12: blimit
|
||||
ldr r2, c0x80808080
|
||||
orr r12, r12, r12, lsl #8
|
||||
|
||||
; load soure data to r7, r8, r9, r10
|
||||
ldrh r3, [src, #-2]
|
||||
pld [src, #23] ; preload for next block
|
||||
ldrh r4, [src], pstep
|
||||
orr r12, r12, r12, lsl #16
|
||||
|
||||
ldrh r5, [src, #-2]
|
||||
pld [src, #23]
|
||||
ldrh r6, [src], pstep
|
||||
|
||||
pkhbt r7, r3, r4, lsl #16
|
||||
|
||||
ldrh r3, [src, #-2]
|
||||
pld [src, #23]
|
||||
ldrh r4, [src], pstep
|
||||
|
||||
pkhbt r8, r5, r6, lsl #16
|
||||
|
||||
ldrh r5, [src, #-2]
|
||||
pld [src, #23]
|
||||
ldrh r6, [src], pstep
|
||||
mov r11, #4 ; double the count. we're doing 4 at a time
|
||||
|
||||
|simple_vnext8|
|
||||
; vp8_simple_filter_mask() function
|
||||
pkhbt r9, r3, r4, lsl #16
|
||||
pkhbt r10, r5, r6, lsl #16
|
||||
|
||||
;transpose r7, r8, r9, r10 to r3, r4, r5, r6
|
||||
TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6
|
||||
|
||||
uqsub8 r7, r3, r6 ; p1 - q1
|
||||
uqsub8 r8, r6, r3 ; q1 - p1
|
||||
uqsub8 r9, r4, r5 ; p0 - q0
|
||||
uqsub8 r10, r5, r4 ; q0 - p0
|
||||
orr r7, r7, r8 ; abs(p1 - q1)
|
||||
orr r9, r9, r10 ; abs(p0 - q0)
|
||||
mov r8, #0
|
||||
uqadd8 r9, r9, r9 ; abs(p0 - q0) * 2
|
||||
uhadd8 r7, r7, r8 ; abs(p1 - q1) / 2
|
||||
uqadd8 r7, r7, r9 ; abs(p0 - q0)*2 + abs(p1 - q1)/2
|
||||
mvn r10, #0 ; r10 == -1
|
||||
|
||||
usub8 r7, r12, r7 ; compare to flimit
|
||||
sel lr, r10, r8 ; filter mask
|
||||
|
||||
cmp lr, #0
|
||||
beq simple_vskip_filter ; skip filtering
|
||||
|
||||
;vp8_simple_filter() function
|
||||
eor r3, r3, r2 ; p1 offset to convert to a signed value
|
||||
eor r6, r6, r2 ; q1 offset to convert to a signed value
|
||||
eor r4, r4, r2 ; p0 offset to convert to a signed value
|
||||
eor r5, r5, r2 ; q0 offset to convert to a signed value
|
||||
|
||||
qsub8 r3, r3, r6 ; vp9_filter = p1 - q1
|
||||
qsub8 r6, r5, r4 ; q0 - p0
|
||||
|
||||
qadd8 r3, r3, r6 ; vp9_filter += q0 - p0
|
||||
ldr r9, c0x03030303 ; r9 = 3
|
||||
|
||||
qadd8 r3, r3, r6 ; vp9_filter += q0 - p0
|
||||
ldr r7, c0x04040404
|
||||
|
||||
qadd8 r3, r3, r6 ; vp9_filter = p1-q1 + 3*(q0-p0))
|
||||
;STALL
|
||||
and r3, r3, lr ; vp9_filter &= mask
|
||||
|
||||
qadd8 r9 , r3 , r9 ; Filter2 = vp9_filter + 3
|
||||
qadd8 r3 , r3 , r7 ; Filter1 = vp9_filter + 4
|
||||
|
||||
shadd8 r9 , r9 , r8
|
||||
shadd8 r3 , r3 , r8
|
||||
shadd8 r9 , r9 , r8
|
||||
shadd8 r3 , r3 , r8
|
||||
shadd8 r9 , r9 , r8 ; Filter2 >>= 3
|
||||
shadd8 r3 , r3 , r8 ; Filter1 >>= 3
|
||||
|
||||
;calculate output
|
||||
sub src, src, pstep, lsl #2
|
||||
|
||||
qadd8 r4, r4, r9 ; u = p0 + Filter2
|
||||
qsub8 r5, r5, r3 ; u = q0 - Filter1
|
||||
eor r4, r4, r2 ; *op0 = u^0x80
|
||||
eor r5, r5, r2 ; *oq0 = u^0x80
|
||||
|
||||
strb r4, [src, #-1] ; store the result
|
||||
mov r4, r4, lsr #8
|
||||
strb r5, [src], pstep
|
||||
mov r5, r5, lsr #8
|
||||
|
||||
strb r4, [src, #-1]
|
||||
mov r4, r4, lsr #8
|
||||
strb r5, [src], pstep
|
||||
mov r5, r5, lsr #8
|
||||
|
||||
strb r4, [src, #-1]
|
||||
mov r4, r4, lsr #8
|
||||
strb r5, [src], pstep
|
||||
mov r5, r5, lsr #8
|
||||
|
||||
strb r4, [src, #-1]
|
||||
strb r5, [src], pstep
|
||||
|
||||
|simple_vskip_filter|
|
||||
subs r11, r11, #1
|
||||
|
||||
; load soure data to r7, r8, r9, r10
|
||||
ldrneh r3, [src, #-2]
|
||||
pld [src, #23] ; preload for next block
|
||||
ldrneh r4, [src], pstep
|
||||
|
||||
ldrneh r5, [src, #-2]
|
||||
pld [src, #23]
|
||||
ldrneh r6, [src], pstep
|
||||
|
||||
pkhbt r7, r3, r4, lsl #16
|
||||
|
||||
ldrneh r3, [src, #-2]
|
||||
pld [src, #23]
|
||||
ldrneh r4, [src], pstep
|
||||
|
||||
pkhbt r8, r5, r6, lsl #16
|
||||
|
||||
ldrneh r5, [src, #-2]
|
||||
pld [src, #23]
|
||||
ldrneh r6, [src], pstep
|
||||
|
||||
bne simple_vnext8
|
||||
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
ENDP ; |vp9_loop_filter_simple_vertical_edge_armv6|
|
||||
|
||||
; Constant Pool
|
||||
c0x80808080 DCD 0x80808080
|
||||
c0x03030303 DCD 0x03030303
|
||||
c0x04040404 DCD 0x04040404
|
||||
|
||||
END
|
|
@ -1,273 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_sixtap_predict8x4_armv6|
|
||||
|
||||
AREA |.text|, CODE, READONLY ; name this block of code
|
||||
;-------------------------------------
|
||||
; r0 unsigned char *src_ptr,
|
||||
; r1 int src_pixels_per_line,
|
||||
; r2 int xoffset,
|
||||
; r3 int yoffset,
|
||||
; stack unsigned char *dst_ptr,
|
||||
; stack int dst_pitch
|
||||
;-------------------------------------
|
||||
;note: In first pass, store the result in transpose(8linesx9columns) on stack. Temporary stack size is 184.
|
||||
;Line width is 20 that is 9 short data plus 2 to make it 4bytes aligned. In second pass, load data from stack,
|
||||
;and the result is stored in transpose.
|
||||
|vp8_sixtap_predict8x4_armv6| PROC
|
||||
stmdb sp!, {r4 - r11, lr}
|
||||
str r3, [sp, #-184]! ;reserve space on stack for temporary storage, store yoffset
|
||||
|
||||
cmp r2, #0 ;skip first_pass filter if xoffset=0
|
||||
add lr, sp, #4 ;point to temporary buffer
|
||||
beq skip_firstpass_filter
|
||||
|
||||
;first-pass filter
|
||||
adr r12, filter8_coeff
|
||||
sub r0, r0, r1, lsl #1
|
||||
|
||||
add r3, r1, #10 ; preload next low
|
||||
pld [r0, r3]
|
||||
|
||||
add r2, r12, r2, lsl #4 ;calculate filter location
|
||||
add r0, r0, #3 ;adjust src only for loading convinience
|
||||
|
||||
ldr r3, [r2] ; load up packed filter coefficients
|
||||
ldr r4, [r2, #4]
|
||||
ldr r5, [r2, #8]
|
||||
|
||||
mov r2, #0x90000 ; height=9 is top part of counter
|
||||
|
||||
sub r1, r1, #8
|
||||
|
||||
|first_pass_hloop_v6|
|
||||
ldrb r6, [r0, #-5] ; load source data
|
||||
ldrb r7, [r0, #-4]
|
||||
ldrb r8, [r0, #-3]
|
||||
ldrb r9, [r0, #-2]
|
||||
ldrb r10, [r0, #-1]
|
||||
|
||||
orr r2, r2, #0x4 ; construct loop counter. width=8=4x2
|
||||
|
||||
pkhbt r6, r6, r7, lsl #16 ; r7 | r6
|
||||
pkhbt r7, r7, r8, lsl #16 ; r8 | r7
|
||||
|
||||
pkhbt r8, r8, r9, lsl #16 ; r9 | r8
|
||||
pkhbt r9, r9, r10, lsl #16 ; r10 | r9
|
||||
|
||||
|first_pass_wloop_v6|
|
||||
smuad r11, r6, r3 ; vp9_filter[0], vp9_filter[1]
|
||||
smuad r12, r7, r3
|
||||
|
||||
ldrb r6, [r0], #1
|
||||
|
||||
smlad r11, r8, r4, r11 ; vp9_filter[2], vp9_filter[3]
|
||||
ldrb r7, [r0], #1
|
||||
smlad r12, r9, r4, r12
|
||||
|
||||
pkhbt r10, r10, r6, lsl #16 ; r10 | r9
|
||||
pkhbt r6, r6, r7, lsl #16 ; r11 | r10
|
||||
smlad r11, r10, r5, r11 ; vp9_filter[4], vp9_filter[5]
|
||||
smlad r12, r6, r5, r12
|
||||
|
||||
sub r2, r2, #1
|
||||
|
||||
add r11, r11, #0x40 ; round_shift_and_clamp
|
||||
tst r2, #0xff ; test loop counter
|
||||
usat r11, #8, r11, asr #7
|
||||
add r12, r12, #0x40
|
||||
strh r11, [lr], #20 ; result is transposed and stored, which
|
||||
usat r12, #8, r12, asr #7
|
||||
|
||||
strh r12, [lr], #20
|
||||
|
||||
movne r11, r6
|
||||
movne r12, r7
|
||||
|
||||
movne r6, r8
|
||||
movne r7, r9
|
||||
movne r8, r10
|
||||
movne r9, r11
|
||||
movne r10, r12
|
||||
|
||||
bne first_pass_wloop_v6
|
||||
|
||||
;;add r9, ppl, #30 ; attempt to load 2 adjacent cache lines
|
||||
;;IF ARCHITECTURE=6
|
||||
;pld [src, ppl]
|
||||
;;pld [src, r9]
|
||||
;;ENDIF
|
||||
|
||||
subs r2, r2, #0x10000
|
||||
|
||||
sub lr, lr, #158
|
||||
|
||||
add r0, r0, r1 ; move to next input line
|
||||
|
||||
add r11, r1, #18 ; preload next low. adding back block width(=8), which is subtracted earlier
|
||||
pld [r0, r11]
|
||||
|
||||
bne first_pass_hloop_v6
|
||||
|
||||
;second pass filter
|
||||
secondpass_filter
|
||||
ldr r3, [sp], #4 ; load back yoffset
|
||||
ldr r0, [sp, #216] ; load dst address from stack 180+36
|
||||
ldr r1, [sp, #220] ; load dst stride from stack 180+40
|
||||
|
||||
cmp r3, #0
|
||||
beq skip_secondpass_filter
|
||||
|
||||
adr r12, filter8_coeff
|
||||
add lr, r12, r3, lsl #4 ;calculate filter location
|
||||
|
||||
mov r2, #0x00080000
|
||||
|
||||
ldr r3, [lr] ; load up packed filter coefficients
|
||||
ldr r4, [lr, #4]
|
||||
ldr r5, [lr, #8]
|
||||
|
||||
pkhbt r12, r4, r3 ; pack the filter differently
|
||||
pkhbt r11, r5, r4
|
||||
|
||||
second_pass_hloop_v6
|
||||
ldr r6, [sp] ; load the data
|
||||
ldr r7, [sp, #4]
|
||||
|
||||
orr r2, r2, #2 ; loop counter
|
||||
|
||||
second_pass_wloop_v6
|
||||
smuad lr, r3, r6 ; apply filter
|
||||
smulbt r10, r3, r6
|
||||
|
||||
ldr r8, [sp, #8]
|
||||
|
||||
smlad lr, r4, r7, lr
|
||||
smladx r10, r12, r7, r10
|
||||
|
||||
ldrh r9, [sp, #12]
|
||||
|
||||
smlad lr, r5, r8, lr
|
||||
smladx r10, r11, r8, r10
|
||||
|
||||
add sp, sp, #4
|
||||
smlatb r10, r5, r9, r10
|
||||
|
||||
sub r2, r2, #1
|
||||
|
||||
add lr, lr, #0x40 ; round_shift_and_clamp
|
||||
tst r2, #0xff
|
||||
usat lr, #8, lr, asr #7
|
||||
add r10, r10, #0x40
|
||||
strb lr, [r0], r1 ; the result is transposed back and stored
|
||||
usat r10, #8, r10, asr #7
|
||||
|
||||
strb r10, [r0],r1
|
||||
|
||||
movne r6, r7
|
||||
movne r7, r8
|
||||
|
||||
bne second_pass_wloop_v6
|
||||
|
||||
subs r2, r2, #0x10000
|
||||
add sp, sp, #12 ; updata src for next loop (20-8)
|
||||
sub r0, r0, r1, lsl #2
|
||||
add r0, r0, #1
|
||||
|
||||
bne second_pass_hloop_v6
|
||||
|
||||
add sp, sp, #20
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
|
||||
;--------------------
|
||||
skip_firstpass_filter
|
||||
sub r0, r0, r1, lsl #1
|
||||
sub r1, r1, #8
|
||||
mov r2, #9
|
||||
|
||||
skip_firstpass_hloop
|
||||
ldrb r4, [r0], #1 ; load data
|
||||
subs r2, r2, #1
|
||||
ldrb r5, [r0], #1
|
||||
strh r4, [lr], #20 ; store it to immediate buffer
|
||||
ldrb r6, [r0], #1 ; load data
|
||||
strh r5, [lr], #20
|
||||
ldrb r7, [r0], #1
|
||||
strh r6, [lr], #20
|
||||
ldrb r8, [r0], #1
|
||||
strh r7, [lr], #20
|
||||
ldrb r9, [r0], #1
|
||||
strh r8, [lr], #20
|
||||
ldrb r10, [r0], #1
|
||||
strh r9, [lr], #20
|
||||
ldrb r11, [r0], #1
|
||||
strh r10, [lr], #20
|
||||
add r0, r0, r1 ; move to next input line
|
||||
strh r11, [lr], #20
|
||||
|
||||
sub lr, lr, #158 ; move over to next column
|
||||
bne skip_firstpass_hloop
|
||||
|
||||
b secondpass_filter
|
||||
|
||||
;--------------------
|
||||
skip_secondpass_filter
|
||||
mov r2, #8
|
||||
add sp, sp, #4 ;start from src[0] instead of src[-2]
|
||||
|
||||
skip_secondpass_hloop
|
||||
ldr r6, [sp], #4
|
||||
subs r2, r2, #1
|
||||
ldr r8, [sp], #4
|
||||
|
||||
mov r7, r6, lsr #16 ; unpack
|
||||
strb r6, [r0], r1
|
||||
mov r9, r8, lsr #16
|
||||
strb r7, [r0], r1
|
||||
add sp, sp, #12 ; 20-8
|
||||
strb r8, [r0], r1
|
||||
strb r9, [r0], r1
|
||||
|
||||
sub r0, r0, r1, lsl #2
|
||||
add r0, r0, #1
|
||||
|
||||
bne skip_secondpass_hloop
|
||||
|
||||
add sp, sp, #16 ; 180 - (160 +4)
|
||||
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
;-----------------
|
||||
;One word each is reserved. Label filter_coeff can be used to access the data.
|
||||
;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
|
||||
filter8_coeff
|
||||
DCD 0x00000000, 0x00000080, 0x00000000, 0x00000000
|
||||
DCD 0xfffa0000, 0x000c007b, 0x0000ffff, 0x00000000
|
||||
DCD 0xfff50002, 0x0024006c, 0x0001fff8, 0x00000000
|
||||
DCD 0xfff70000, 0x0032005d, 0x0000fffa, 0x00000000
|
||||
DCD 0xfff00003, 0x004d004d, 0x0003fff0, 0x00000000
|
||||
DCD 0xfffa0000, 0x005d0032, 0x0000fff7, 0x00000000
|
||||
DCD 0xfff80001, 0x006c0024, 0x0002fff5, 0x00000000
|
||||
DCD 0xffff0000, 0x007b000c, 0x0000fffa, 0x00000000
|
||||
|
||||
;DCD 0, 0, 128, 0, 0, 0
|
||||
;DCD 0, -6, 123, 12, -1, 0
|
||||
;DCD 2, -11, 108, 36, -8, 1
|
||||
;DCD 0, -9, 93, 50, -6, 0
|
||||
;DCD 3, -16, 77, 77, -16, 3
|
||||
;DCD 0, -6, 50, 93, -9, 0
|
||||
;DCD 1, -8, 36, 108, -11, 2
|
||||
;DCD 0, -1, 12, 123, -6, 0
|
||||
|
||||
END
|
|
@ -1,357 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_bilinear_predict16x16_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
; r0 unsigned char *src_ptr,
|
||||
; r1 int src_pixels_per_line,
|
||||
; r2 int xoffset,
|
||||
; r3 int yoffset,
|
||||
; r4 unsigned char *dst_ptr,
|
||||
; stack(r5) int dst_pitch
|
||||
|
||||
|vp8_bilinear_predict16x16_neon| PROC
|
||||
push {r4-r5, lr}
|
||||
|
||||
adr r12, bifilter16_coeff
|
||||
ldr r4, [sp, #12] ;load parameters from stack
|
||||
ldr r5, [sp, #16] ;load parameters from stack
|
||||
|
||||
cmp r2, #0 ;skip first_pass filter if xoffset=0
|
||||
beq secondpass_bfilter16x16_only
|
||||
|
||||
add r2, r12, r2, lsl #3 ;calculate filter location
|
||||
|
||||
cmp r3, #0 ;skip second_pass filter if yoffset=0
|
||||
|
||||
vld1.s32 {d31}, [r2] ;load first_pass filter
|
||||
|
||||
beq firstpass_bfilter16x16_only
|
||||
|
||||
sub sp, sp, #272 ;reserve space on stack for temporary storage
|
||||
vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
|
||||
mov lr, sp
|
||||
vld1.u8 {d5, d6, d7}, [r0], r1
|
||||
|
||||
mov r2, #3 ;loop counter
|
||||
vld1.u8 {d8, d9, d10}, [r0], r1
|
||||
|
||||
vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
|
||||
vld1.u8 {d11, d12, d13}, [r0], r1
|
||||
|
||||
vdup.8 d1, d31[4]
|
||||
|
||||
;First Pass: output_height lines x output_width columns (17x16)
|
||||
filt_blk2d_fp16x16_loop_neon
|
||||
pld [r0]
|
||||
pld [r0, r1]
|
||||
pld [r0, r1, lsl #1]
|
||||
|
||||
vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp9_filter[0])
|
||||
vmull.u8 q8, d3, d0
|
||||
vmull.u8 q9, d5, d0
|
||||
vmull.u8 q10, d6, d0
|
||||
vmull.u8 q11, d8, d0
|
||||
vmull.u8 q12, d9, d0
|
||||
vmull.u8 q13, d11, d0
|
||||
vmull.u8 q14, d12, d0
|
||||
|
||||
vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
|
||||
vext.8 d5, d5, d6, #1
|
||||
vext.8 d8, d8, d9, #1
|
||||
vext.8 d11, d11, d12, #1
|
||||
|
||||
vmlal.u8 q7, d2, d1 ;(src_ptr[0] * vp9_filter[1])
|
||||
vmlal.u8 q9, d5, d1
|
||||
vmlal.u8 q11, d8, d1
|
||||
vmlal.u8 q13, d11, d1
|
||||
|
||||
vext.8 d3, d3, d4, #1
|
||||
vext.8 d6, d6, d7, #1
|
||||
vext.8 d9, d9, d10, #1
|
||||
vext.8 d12, d12, d13, #1
|
||||
|
||||
vmlal.u8 q8, d3, d1 ;(src_ptr[0] * vp9_filter[1])
|
||||
vmlal.u8 q10, d6, d1
|
||||
vmlal.u8 q12, d9, d1
|
||||
vmlal.u8 q14, d12, d1
|
||||
|
||||
subs r2, r2, #1
|
||||
|
||||
vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrn.u16 d15, q8, #7
|
||||
vqrshrn.u16 d16, q9, #7
|
||||
vqrshrn.u16 d17, q10, #7
|
||||
vqrshrn.u16 d18, q11, #7
|
||||
vqrshrn.u16 d19, q12, #7
|
||||
vqrshrn.u16 d20, q13, #7
|
||||
|
||||
vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
|
||||
vqrshrn.u16 d21, q14, #7
|
||||
vld1.u8 {d5, d6, d7}, [r0], r1
|
||||
|
||||
vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result
|
||||
vld1.u8 {d8, d9, d10}, [r0], r1
|
||||
vst1.u8 {d18, d19, d20, d21}, [lr]!
|
||||
vld1.u8 {d11, d12, d13}, [r0], r1
|
||||
|
||||
bne filt_blk2d_fp16x16_loop_neon
|
||||
|
||||
;First-pass filtering for rest 5 lines
|
||||
vld1.u8 {d14, d15, d16}, [r0], r1
|
||||
|
||||
vmull.u8 q9, d2, d0 ;(src_ptr[0] * vp9_filter[0])
|
||||
vmull.u8 q10, d3, d0
|
||||
vmull.u8 q11, d5, d0
|
||||
vmull.u8 q12, d6, d0
|
||||
vmull.u8 q13, d8, d0
|
||||
vmull.u8 q14, d9, d0
|
||||
|
||||
vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
|
||||
vext.8 d5, d5, d6, #1
|
||||
vext.8 d8, d8, d9, #1
|
||||
|
||||
vmlal.u8 q9, d2, d1 ;(src_ptr[0] * vp9_filter[1])
|
||||
vmlal.u8 q11, d5, d1
|
||||
vmlal.u8 q13, d8, d1
|
||||
|
||||
vext.8 d3, d3, d4, #1
|
||||
vext.8 d6, d6, d7, #1
|
||||
vext.8 d9, d9, d10, #1
|
||||
|
||||
vmlal.u8 q10, d3, d1 ;(src_ptr[0] * vp9_filter[1])
|
||||
vmlal.u8 q12, d6, d1
|
||||
vmlal.u8 q14, d9, d1
|
||||
|
||||
vmull.u8 q1, d11, d0
|
||||
vmull.u8 q2, d12, d0
|
||||
vmull.u8 q3, d14, d0
|
||||
vmull.u8 q4, d15, d0
|
||||
|
||||
vext.8 d11, d11, d12, #1 ;construct src_ptr[1]
|
||||
vext.8 d14, d14, d15, #1
|
||||
|
||||
vmlal.u8 q1, d11, d1 ;(src_ptr[0] * vp9_filter[1])
|
||||
vmlal.u8 q3, d14, d1
|
||||
|
||||
vext.8 d12, d12, d13, #1
|
||||
vext.8 d15, d15, d16, #1
|
||||
|
||||
vmlal.u8 q2, d12, d1 ;(src_ptr[0] * vp9_filter[1])
|
||||
vmlal.u8 q4, d15, d1
|
||||
|
||||
vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8
|
||||
vqrshrn.u16 d11, q10, #7
|
||||
vqrshrn.u16 d12, q11, #7
|
||||
vqrshrn.u16 d13, q12, #7
|
||||
vqrshrn.u16 d14, q13, #7
|
||||
vqrshrn.u16 d15, q14, #7
|
||||
vqrshrn.u16 d16, q1, #7
|
||||
vqrshrn.u16 d17, q2, #7
|
||||
vqrshrn.u16 d18, q3, #7
|
||||
vqrshrn.u16 d19, q4, #7
|
||||
|
||||
vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result
|
||||
vst1.u8 {d14, d15, d16, d17}, [lr]!
|
||||
vst1.u8 {d18, d19}, [lr]!
|
||||
|
||||
;Second pass: 16x16
|
||||
;secondpass_filter
|
||||
add r3, r12, r3, lsl #3
|
||||
sub lr, lr, #272
|
||||
|
||||
vld1.u32 {d31}, [r3] ;load second_pass filter
|
||||
|
||||
vld1.u8 {d22, d23}, [lr]! ;load src data
|
||||
|
||||
vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
|
||||
vdup.8 d1, d31[4]
|
||||
mov r12, #4 ;loop counter
|
||||
|
||||
filt_blk2d_sp16x16_loop_neon
|
||||
vld1.u8 {d24, d25}, [lr]!
|
||||
vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp9_filter[0])
|
||||
vld1.u8 {d26, d27}, [lr]!
|
||||
vmull.u8 q2, d23, d0
|
||||
vld1.u8 {d28, d29}, [lr]!
|
||||
vmull.u8 q3, d24, d0
|
||||
vld1.u8 {d30, d31}, [lr]!
|
||||
|
||||
vmull.u8 q4, d25, d0
|
||||
vmull.u8 q5, d26, d0
|
||||
vmull.u8 q6, d27, d0
|
||||
vmull.u8 q7, d28, d0
|
||||
vmull.u8 q8, d29, d0
|
||||
|
||||
vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp9_filter[1])
|
||||
vmlal.u8 q2, d25, d1
|
||||
vmlal.u8 q3, d26, d1
|
||||
vmlal.u8 q4, d27, d1
|
||||
vmlal.u8 q5, d28, d1
|
||||
vmlal.u8 q6, d29, d1
|
||||
vmlal.u8 q7, d30, d1
|
||||
vmlal.u8 q8, d31, d1
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
|
||||
vqrshrn.u16 d3, q2, #7
|
||||
vqrshrn.u16 d4, q3, #7
|
||||
vqrshrn.u16 d5, q4, #7
|
||||
vqrshrn.u16 d6, q5, #7
|
||||
vqrshrn.u16 d7, q6, #7
|
||||
vqrshrn.u16 d8, q7, #7
|
||||
vqrshrn.u16 d9, q8, #7
|
||||
|
||||
vst1.u8 {d2, d3}, [r4], r5 ;store result
|
||||
vst1.u8 {d4, d5}, [r4], r5
|
||||
vst1.u8 {d6, d7}, [r4], r5
|
||||
vmov q11, q15
|
||||
vst1.u8 {d8, d9}, [r4], r5
|
||||
|
||||
bne filt_blk2d_sp16x16_loop_neon
|
||||
|
||||
add sp, sp, #272
|
||||
|
||||
pop {r4-r5,pc}
|
||||
|
||||
;--------------------
|
||||
firstpass_bfilter16x16_only
|
||||
mov r2, #4 ;loop counter
|
||||
vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
|
||||
vdup.8 d1, d31[4]
|
||||
|
||||
;First Pass: output_height lines x output_width columns (16x16)
|
||||
filt_blk2d_fpo16x16_loop_neon
|
||||
vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
|
||||
vld1.u8 {d5, d6, d7}, [r0], r1
|
||||
vld1.u8 {d8, d9, d10}, [r0], r1
|
||||
vld1.u8 {d11, d12, d13}, [r0], r1
|
||||
|
||||
pld [r0]
|
||||
pld [r0, r1]
|
||||
pld [r0, r1, lsl #1]
|
||||
|
||||
vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp9_filter[0])
|
||||
vmull.u8 q8, d3, d0
|
||||
vmull.u8 q9, d5, d0
|
||||
vmull.u8 q10, d6, d0
|
||||
vmull.u8 q11, d8, d0
|
||||
vmull.u8 q12, d9, d0
|
||||
vmull.u8 q13, d11, d0
|
||||
vmull.u8 q14, d12, d0
|
||||
|
||||
vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
|
||||
vext.8 d5, d5, d6, #1
|
||||
vext.8 d8, d8, d9, #1
|
||||
vext.8 d11, d11, d12, #1
|
||||
|
||||
vmlal.u8 q7, d2, d1 ;(src_ptr[0] * vp9_filter[1])
|
||||
vmlal.u8 q9, d5, d1
|
||||
vmlal.u8 q11, d8, d1
|
||||
vmlal.u8 q13, d11, d1
|
||||
|
||||
vext.8 d3, d3, d4, #1
|
||||
vext.8 d6, d6, d7, #1
|
||||
vext.8 d9, d9, d10, #1
|
||||
vext.8 d12, d12, d13, #1
|
||||
|
||||
vmlal.u8 q8, d3, d1 ;(src_ptr[0] * vp9_filter[1])
|
||||
vmlal.u8 q10, d6, d1
|
||||
vmlal.u8 q12, d9, d1
|
||||
vmlal.u8 q14, d12, d1
|
||||
|
||||
subs r2, r2, #1
|
||||
|
||||
vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrn.u16 d15, q8, #7
|
||||
vqrshrn.u16 d16, q9, #7
|
||||
vqrshrn.u16 d17, q10, #7
|
||||
vqrshrn.u16 d18, q11, #7
|
||||
vqrshrn.u16 d19, q12, #7
|
||||
vqrshrn.u16 d20, q13, #7
|
||||
vst1.u8 {d14, d15}, [r4], r5 ;store result
|
||||
vqrshrn.u16 d21, q14, #7
|
||||
|
||||
vst1.u8 {d16, d17}, [r4], r5
|
||||
vst1.u8 {d18, d19}, [r4], r5
|
||||
vst1.u8 {d20, d21}, [r4], r5
|
||||
|
||||
bne filt_blk2d_fpo16x16_loop_neon
|
||||
pop {r4-r5,pc}
|
||||
|
||||
;---------------------
|
||||
secondpass_bfilter16x16_only
|
||||
;Second pass: 16x16
|
||||
;secondpass_filter
|
||||
add r3, r12, r3, lsl #3
|
||||
mov r12, #4 ;loop counter
|
||||
vld1.u32 {d31}, [r3] ;load second_pass filter
|
||||
vld1.u8 {d22, d23}, [r0], r1 ;load src data
|
||||
|
||||
vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
|
||||
vdup.8 d1, d31[4]
|
||||
|
||||
filt_blk2d_spo16x16_loop_neon
|
||||
vld1.u8 {d24, d25}, [r0], r1
|
||||
vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp9_filter[0])
|
||||
vld1.u8 {d26, d27}, [r0], r1
|
||||
vmull.u8 q2, d23, d0
|
||||
vld1.u8 {d28, d29}, [r0], r1
|
||||
vmull.u8 q3, d24, d0
|
||||
vld1.u8 {d30, d31}, [r0], r1
|
||||
|
||||
vmull.u8 q4, d25, d0
|
||||
vmull.u8 q5, d26, d0
|
||||
vmull.u8 q6, d27, d0
|
||||
vmull.u8 q7, d28, d0
|
||||
vmull.u8 q8, d29, d0
|
||||
|
||||
vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp9_filter[1])
|
||||
vmlal.u8 q2, d25, d1
|
||||
vmlal.u8 q3, d26, d1
|
||||
vmlal.u8 q4, d27, d1
|
||||
vmlal.u8 q5, d28, d1
|
||||
vmlal.u8 q6, d29, d1
|
||||
vmlal.u8 q7, d30, d1
|
||||
vmlal.u8 q8, d31, d1
|
||||
|
||||
vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
|
||||
vqrshrn.u16 d3, q2, #7
|
||||
vqrshrn.u16 d4, q3, #7
|
||||
vqrshrn.u16 d5, q4, #7
|
||||
vqrshrn.u16 d6, q5, #7
|
||||
vqrshrn.u16 d7, q6, #7
|
||||
vqrshrn.u16 d8, q7, #7
|
||||
vqrshrn.u16 d9, q8, #7
|
||||
|
||||
vst1.u8 {d2, d3}, [r4], r5 ;store result
|
||||
subs r12, r12, #1
|
||||
vst1.u8 {d4, d5}, [r4], r5
|
||||
vmov q11, q15
|
||||
vst1.u8 {d6, d7}, [r4], r5
|
||||
vst1.u8 {d8, d9}, [r4], r5
|
||||
|
||||
bne filt_blk2d_spo16x16_loop_neon
|
||||
pop {r4-r5,pc}
|
||||
|
||||
ENDP
|
||||
|
||||
;-----------------
|
||||
|
||||
bifilter16_coeff
|
||||
DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
|
||||
|
||||
END
|
|
@ -1,130 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_bilinear_predict4x4_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
; r0 unsigned char *src_ptr,
|
||||
; r1 int src_pixels_per_line,
|
||||
; r2 int xoffset,
|
||||
; r3 int yoffset,
|
||||
; r4 unsigned char *dst_ptr,
|
||||
; stack(lr) int dst_pitch
|
||||
|
||||
|vp8_bilinear_predict4x4_neon| PROC
|
||||
push {r4, lr}
|
||||
|
||||
adr r12, bifilter4_coeff
|
||||
ldr r4, [sp, #8] ;load parameters from stack
|
||||
ldr lr, [sp, #12] ;load parameters from stack
|
||||
|
||||
cmp r2, #0 ;skip first_pass filter if xoffset=0
|
||||
beq skip_firstpass_filter
|
||||
|
||||
;First pass: output_height lines x output_width columns (5x4)
|
||||
vld1.u8 {d2}, [r0], r1 ;load src data
|
||||
add r2, r12, r2, lsl #3 ;calculate Hfilter location (2coeffsx4bytes=8bytes)
|
||||
|
||||
vld1.u8 {d3}, [r0], r1
|
||||
vld1.u32 {d31}, [r2] ;first_pass filter
|
||||
|
||||
vld1.u8 {d4}, [r0], r1
|
||||
vdup.8 d0, d31[0] ;first_pass filter (d0-d1)
|
||||
vld1.u8 {d5}, [r0], r1
|
||||
vdup.8 d1, d31[4]
|
||||
vld1.u8 {d6}, [r0], r1
|
||||
|
||||
vshr.u64 q4, q1, #8 ;construct src_ptr[1]
|
||||
vshr.u64 q5, q2, #8
|
||||
vshr.u64 d12, d6, #8
|
||||
|
||||
vzip.32 d2, d3 ;put 2-line data in 1 register (src_ptr[0])
|
||||
vzip.32 d4, d5
|
||||
vzip.32 d8, d9 ;put 2-line data in 1 register (src_ptr[1])
|
||||
vzip.32 d10, d11
|
||||
|
||||
vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp9_filter[0])
|
||||
vmull.u8 q8, d4, d0
|
||||
vmull.u8 q9, d6, d0
|
||||
|
||||
vmlal.u8 q7, d8, d1 ;(src_ptr[1] * vp9_filter[1])
|
||||
vmlal.u8 q8, d10, d1
|
||||
vmlal.u8 q9, d12, d1
|
||||
|
||||
vqrshrn.u16 d28, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrn.u16 d29, q8, #7
|
||||
vqrshrn.u16 d30, q9, #7
|
||||
|
||||
;Second pass: 4x4
|
||||
secondpass_filter
|
||||
cmp r3, #0 ;skip second_pass filter if yoffset=0
|
||||
beq skip_secondpass_filter
|
||||
|
||||
add r3, r12, r3, lsl #3 ;calculate Vfilter location
|
||||
vld1.u32 {d31}, [r3] ;load second_pass filter
|
||||
|
||||
vdup.8 d0, d31[0] ;second_pass filter parameters (d0-d5)
|
||||
vdup.8 d1, d31[4]
|
||||
|
||||
vmull.u8 q1, d28, d0
|
||||
vmull.u8 q2, d29, d0
|
||||
|
||||
vext.8 d26, d28, d29, #4 ;construct src_ptr[pixel_step]
|
||||
vext.8 d27, d29, d30, #4
|
||||
|
||||
vmlal.u8 q1, d26, d1
|
||||
vmlal.u8 q2, d27, d1
|
||||
|
||||
add r0, r4, lr
|
||||
add r1, r0, lr
|
||||
add r2, r1, lr
|
||||
|
||||
vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
|
||||
vqrshrn.u16 d3, q2, #7
|
||||
|
||||
vst1.32 {d2[0]}, [r4] ;store result
|
||||
vst1.32 {d2[1]}, [r0]
|
||||
vst1.32 {d3[0]}, [r1]
|
||||
vst1.32 {d3[1]}, [r2]
|
||||
|
||||
pop {r4, pc}
|
||||
|
||||
;--------------------
|
||||
skip_firstpass_filter
|
||||
|
||||
vld1.32 {d28[0]}, [r0], r1 ;load src data
|
||||
vld1.32 {d28[1]}, [r0], r1
|
||||
vld1.32 {d29[0]}, [r0], r1
|
||||
vld1.32 {d29[1]}, [r0], r1
|
||||
vld1.32 {d30[0]}, [r0], r1
|
||||
|
||||
b secondpass_filter
|
||||
|
||||
;---------------------
|
||||
skip_secondpass_filter
|
||||
vst1.32 {d28[0]}, [r4], lr ;store result
|
||||
vst1.32 {d28[1]}, [r4], lr
|
||||
vst1.32 {d29[0]}, [r4], lr
|
||||
vst1.32 {d29[1]}, [r4], lr
|
||||
|
||||
pop {r4, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
;-----------------
|
||||
|
||||
bifilter4_coeff
|
||||
DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
|
||||
|
||||
END
|
|
@ -1,135 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_bilinear_predict8x4_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
; r0 unsigned char *src_ptr,
|
||||
; r1 int src_pixels_per_line,
|
||||
; r2 int xoffset,
|
||||
; r3 int yoffset,
|
||||
; r4 unsigned char *dst_ptr,
|
||||
; stack(lr) int dst_pitch
|
||||
|
||||
|vp8_bilinear_predict8x4_neon| PROC
|
||||
push {r4, lr}
|
||||
|
||||
adr r12, bifilter8x4_coeff
|
||||
ldr r4, [sp, #8] ;load parameters from stack
|
||||
ldr lr, [sp, #12] ;load parameters from stack
|
||||
|
||||
cmp r2, #0 ;skip first_pass filter if xoffset=0
|
||||
beq skip_firstpass_filter
|
||||
|
||||
;First pass: output_height lines x output_width columns (5x8)
|
||||
add r2, r12, r2, lsl #3 ;calculate filter location
|
||||
|
||||
vld1.u8 {q1}, [r0], r1 ;load src data
|
||||
vld1.u32 {d31}, [r2] ;load first_pass filter
|
||||
vld1.u8 {q2}, [r0], r1
|
||||
vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
|
||||
vld1.u8 {q3}, [r0], r1
|
||||
vdup.8 d1, d31[4]
|
||||
vld1.u8 {q4}, [r0], r1
|
||||
|
||||
vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp9_filter[0])
|
||||
vld1.u8 {q5}, [r0], r1
|
||||
vmull.u8 q7, d4, d0
|
||||
vmull.u8 q8, d6, d0
|
||||
vmull.u8 q9, d8, d0
|
||||
vmull.u8 q10, d10, d0
|
||||
|
||||
vext.8 d3, d2, d3, #1 ;construct src_ptr[-1]
|
||||
vext.8 d5, d4, d5, #1
|
||||
vext.8 d7, d6, d7, #1
|
||||
vext.8 d9, d8, d9, #1
|
||||
vext.8 d11, d10, d11, #1
|
||||
|
||||
vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp9_filter[1])
|
||||
vmlal.u8 q7, d5, d1
|
||||
vmlal.u8 q8, d7, d1
|
||||
vmlal.u8 q9, d9, d1
|
||||
vmlal.u8 q10, d11, d1
|
||||
|
||||
vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
|
||||
vqrshrn.u16 d23, q7, #7
|
||||
vqrshrn.u16 d24, q8, #7
|
||||
vqrshrn.u16 d25, q9, #7
|
||||
vqrshrn.u16 d26, q10, #7
|
||||
|
||||
;Second pass: 4x8
|
||||
secondpass_filter
|
||||
cmp r3, #0 ;skip second_pass filter if yoffset=0
|
||||
beq skip_secondpass_filter
|
||||
|
||||
add r3, r12, r3, lsl #3
|
||||
add r0, r4, lr
|
||||
|
||||
vld1.u32 {d31}, [r3] ;load second_pass filter
|
||||
add r1, r0, lr
|
||||
|
||||
vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
|
||||
vdup.8 d1, d31[4]
|
||||
|
||||
vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp9_filter[0])
|
||||
vmull.u8 q2, d23, d0
|
||||
vmull.u8 q3, d24, d0
|
||||
vmull.u8 q4, d25, d0
|
||||
|
||||
vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp9_filter[1])
|
||||
vmlal.u8 q2, d24, d1
|
||||
vmlal.u8 q3, d25, d1
|
||||
vmlal.u8 q4, d26, d1
|
||||
|
||||
add r2, r1, lr
|
||||
|
||||
vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
|
||||
vqrshrn.u16 d3, q2, #7
|
||||
vqrshrn.u16 d4, q3, #7
|
||||
vqrshrn.u16 d5, q4, #7
|
||||
|
||||
vst1.u8 {d2}, [r4] ;store result
|
||||
vst1.u8 {d3}, [r0]
|
||||
vst1.u8 {d4}, [r1]
|
||||
vst1.u8 {d5}, [r2]
|
||||
|
||||
pop {r4, pc}
|
||||
|
||||
;--------------------
|
||||
skip_firstpass_filter
|
||||
vld1.u8 {d22}, [r0], r1 ;load src data
|
||||
vld1.u8 {d23}, [r0], r1
|
||||
vld1.u8 {d24}, [r0], r1
|
||||
vld1.u8 {d25}, [r0], r1
|
||||
vld1.u8 {d26}, [r0], r1
|
||||
|
||||
b secondpass_filter
|
||||
|
||||
;---------------------
|
||||
skip_secondpass_filter
|
||||
vst1.u8 {d22}, [r4], lr ;store result
|
||||
vst1.u8 {d23}, [r4], lr
|
||||
vst1.u8 {d24}, [r4], lr
|
||||
vst1.u8 {d25}, [r4], lr
|
||||
|
||||
pop {r4, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
;-----------------
|
||||
|
||||
bifilter8x4_coeff
|
||||
DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
|
||||
|
||||
END
|
|
@ -1,183 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_bilinear_predict8x8_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
; r0 unsigned char *src_ptr,
|
||||
; r1 int src_pixels_per_line,
|
||||
; r2 int xoffset,
|
||||
; r3 int yoffset,
|
||||
; r4 unsigned char *dst_ptr,
|
||||
; stack(lr) int dst_pitch
|
||||
|
||||
|vp8_bilinear_predict8x8_neon| PROC
|
||||
push {r4, lr}
|
||||
|
||||
adr r12, bifilter8_coeff
|
||||
ldr r4, [sp, #8] ;load parameters from stack
|
||||
ldr lr, [sp, #12] ;load parameters from stack
|
||||
|
||||
cmp r2, #0 ;skip first_pass filter if xoffset=0
|
||||
beq skip_firstpass_filter
|
||||
|
||||
;First pass: output_height lines x output_width columns (9x8)
|
||||
add r2, r12, r2, lsl #3 ;calculate filter location
|
||||
|
||||
vld1.u8 {q1}, [r0], r1 ;load src data
|
||||
vld1.u32 {d31}, [r2] ;load first_pass filter
|
||||
vld1.u8 {q2}, [r0], r1
|
||||
vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
|
||||
vld1.u8 {q3}, [r0], r1
|
||||
vdup.8 d1, d31[4]
|
||||
vld1.u8 {q4}, [r0], r1
|
||||
|
||||
vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp9_filter[0])
|
||||
vmull.u8 q7, d4, d0
|
||||
vmull.u8 q8, d6, d0
|
||||
vmull.u8 q9, d8, d0
|
||||
|
||||
vext.8 d3, d2, d3, #1 ;construct src_ptr[-1]
|
||||
vext.8 d5, d4, d5, #1
|
||||
vext.8 d7, d6, d7, #1
|
||||
vext.8 d9, d8, d9, #1
|
||||
|
||||
vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp9_filter[1])
|
||||
vmlal.u8 q7, d5, d1
|
||||
vmlal.u8 q8, d7, d1
|
||||
vmlal.u8 q9, d9, d1
|
||||
|
||||
vld1.u8 {q1}, [r0], r1 ;load src data
|
||||
vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
|
||||
vld1.u8 {q2}, [r0], r1
|
||||
vqrshrn.u16 d23, q7, #7
|
||||
vld1.u8 {q3}, [r0], r1
|
||||
vqrshrn.u16 d24, q8, #7
|
||||
vld1.u8 {q4}, [r0], r1
|
||||
vqrshrn.u16 d25, q9, #7
|
||||
|
||||
;first_pass filtering on the rest 5-line data
|
||||
vld1.u8 {q5}, [r0], r1
|
||||
|
||||
vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp9_filter[0])
|
||||
vmull.u8 q7, d4, d0
|
||||
vmull.u8 q8, d6, d0
|
||||
vmull.u8 q9, d8, d0
|
||||
vmull.u8 q10, d10, d0
|
||||
|
||||
vext.8 d3, d2, d3, #1 ;construct src_ptr[-1]
|
||||
vext.8 d5, d4, d5, #1
|
||||
vext.8 d7, d6, d7, #1
|
||||
vext.8 d9, d8, d9, #1
|
||||
vext.8 d11, d10, d11, #1
|
||||
|
||||
vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp9_filter[1])
|
||||
vmlal.u8 q7, d5, d1
|
||||
vmlal.u8 q8, d7, d1
|
||||
vmlal.u8 q9, d9, d1
|
||||
vmlal.u8 q10, d11, d1
|
||||
|
||||
vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8
|
||||
vqrshrn.u16 d27, q7, #7
|
||||
vqrshrn.u16 d28, q8, #7
|
||||
vqrshrn.u16 d29, q9, #7
|
||||
vqrshrn.u16 d30, q10, #7
|
||||
|
||||
;Second pass: 8x8
|
||||
secondpass_filter
|
||||
cmp r3, #0 ;skip second_pass filter if yoffset=0
|
||||
beq skip_secondpass_filter
|
||||
|
||||
add r3, r12, r3, lsl #3
|
||||
add r0, r4, lr
|
||||
|
||||
vld1.u32 {d31}, [r3] ;load second_pass filter
|
||||
add r1, r0, lr
|
||||
|
||||
vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
|
||||
vdup.8 d1, d31[4]
|
||||
|
||||
vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp9_filter[0])
|
||||
vmull.u8 q2, d23, d0
|
||||
vmull.u8 q3, d24, d0
|
||||
vmull.u8 q4, d25, d0
|
||||
vmull.u8 q5, d26, d0
|
||||
vmull.u8 q6, d27, d0
|
||||
vmull.u8 q7, d28, d0
|
||||
vmull.u8 q8, d29, d0
|
||||
|
||||
vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp9_filter[1])
|
||||
vmlal.u8 q2, d24, d1
|
||||
vmlal.u8 q3, d25, d1
|
||||
vmlal.u8 q4, d26, d1
|
||||
vmlal.u8 q5, d27, d1
|
||||
vmlal.u8 q6, d28, d1
|
||||
vmlal.u8 q7, d29, d1
|
||||
vmlal.u8 q8, d30, d1
|
||||
|
||||
vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
|
||||
vqrshrn.u16 d3, q2, #7
|
||||
vqrshrn.u16 d4, q3, #7
|
||||
vqrshrn.u16 d5, q4, #7
|
||||
vqrshrn.u16 d6, q5, #7
|
||||
vqrshrn.u16 d7, q6, #7
|
||||
vqrshrn.u16 d8, q7, #7
|
||||
vqrshrn.u16 d9, q8, #7
|
||||
|
||||
vst1.u8 {d2}, [r4] ;store result
|
||||
vst1.u8 {d3}, [r0]
|
||||
vst1.u8 {d4}, [r1], lr
|
||||
vst1.u8 {d5}, [r1], lr
|
||||
vst1.u8 {d6}, [r1], lr
|
||||
vst1.u8 {d7}, [r1], lr
|
||||
vst1.u8 {d8}, [r1], lr
|
||||
vst1.u8 {d9}, [r1], lr
|
||||
|
||||
pop {r4, pc}
|
||||
|
||||
;--------------------
|
||||
skip_firstpass_filter
|
||||
vld1.u8 {d22}, [r0], r1 ;load src data
|
||||
vld1.u8 {d23}, [r0], r1
|
||||
vld1.u8 {d24}, [r0], r1
|
||||
vld1.u8 {d25}, [r0], r1
|
||||
vld1.u8 {d26}, [r0], r1
|
||||
vld1.u8 {d27}, [r0], r1
|
||||
vld1.u8 {d28}, [r0], r1
|
||||
vld1.u8 {d29}, [r0], r1
|
||||
vld1.u8 {d30}, [r0], r1
|
||||
|
||||
b secondpass_filter
|
||||
|
||||
;---------------------
|
||||
skip_secondpass_filter
|
||||
vst1.u8 {d22}, [r4], lr ;store result
|
||||
vst1.u8 {d23}, [r4], lr
|
||||
vst1.u8 {d24}, [r4], lr
|
||||
vst1.u8 {d25}, [r4], lr
|
||||
vst1.u8 {d26}, [r4], lr
|
||||
vst1.u8 {d27}, [r4], lr
|
||||
vst1.u8 {d28}, [r4], lr
|
||||
vst1.u8 {d29}, [r4], lr
|
||||
|
||||
pop {r4, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
;-----------------
|
||||
|
||||
bifilter8_coeff
|
||||
DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
|
||||
|
||||
END
|
|
@ -1,584 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_build_intra_predictors_mby_neon_func|
|
||||
EXPORT |vp8_build_intra_predictors_mby_s_neon_func|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
; r0 unsigned char *y_buffer
|
||||
; r1 unsigned char *ypred_ptr
|
||||
; r2 int y_stride
|
||||
; r3 int mode
|
||||
; stack int Up
|
||||
; stack int Left
|
||||
|
||||
|vp8_build_intra_predictors_mby_neon_func| PROC
|
||||
push {r4-r8, lr}
|
||||
|
||||
cmp r3, #0
|
||||
beq case_dc_pred
|
||||
cmp r3, #1
|
||||
beq case_v_pred
|
||||
cmp r3, #2
|
||||
beq case_h_pred
|
||||
cmp r3, #3
|
||||
beq case_tm_pred
|
||||
|
||||
case_dc_pred
|
||||
ldr r4, [sp, #24] ; Up
|
||||
ldr r5, [sp, #28] ; Left
|
||||
|
||||
; Default the DC average to 128
|
||||
mov r12, #128
|
||||
vdup.u8 q0, r12
|
||||
|
||||
; Zero out running sum
|
||||
mov r12, #0
|
||||
|
||||
; compute shift and jump
|
||||
adds r7, r4, r5
|
||||
beq skip_dc_pred_up_left
|
||||
|
||||
; Load above row, if it exists
|
||||
cmp r4, #0
|
||||
beq skip_dc_pred_up
|
||||
|
||||
sub r6, r0, r2
|
||||
vld1.8 {q1}, [r6]
|
||||
vpaddl.u8 q2, q1
|
||||
vpaddl.u16 q3, q2
|
||||
vpaddl.u32 q4, q3
|
||||
|
||||
vmov.32 r4, d8[0]
|
||||
vmov.32 r6, d9[0]
|
||||
|
||||
add r12, r4, r6
|
||||
|
||||
; Move back to interger registers
|
||||
|
||||
skip_dc_pred_up
|
||||
|
||||
cmp r5, #0
|
||||
beq skip_dc_pred_left
|
||||
|
||||
sub r0, r0, #1
|
||||
|
||||
; Load left row, if it exists
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0], r2
|
||||
|
||||
add r12, r12, r3
|
||||
add r12, r12, r4
|
||||
add r12, r12, r5
|
||||
add r12, r12, r6
|
||||
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0], r2
|
||||
|
||||
add r12, r12, r3
|
||||
add r12, r12, r4
|
||||
add r12, r12, r5
|
||||
add r12, r12, r6
|
||||
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0], r2
|
||||
|
||||
add r12, r12, r3
|
||||
add r12, r12, r4
|
||||
add r12, r12, r5
|
||||
add r12, r12, r6
|
||||
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0]
|
||||
|
||||
add r12, r12, r3
|
||||
add r12, r12, r4
|
||||
add r12, r12, r5
|
||||
add r12, r12, r6
|
||||
|
||||
skip_dc_pred_left
|
||||
add r7, r7, #3 ; Shift
|
||||
sub r4, r7, #1
|
||||
mov r5, #1
|
||||
add r12, r12, r5, lsl r4
|
||||
mov r5, r12, lsr r7 ; expected_dc
|
||||
|
||||
vdup.u8 q0, r5
|
||||
|
||||
skip_dc_pred_up_left
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
|
||||
pop {r4-r8,pc}
|
||||
case_v_pred
|
||||
; Copy down above row
|
||||
sub r6, r0, r2
|
||||
vld1.8 {q0}, [r6]
|
||||
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
pop {r4-r8,pc}
|
||||
|
||||
case_h_pred
|
||||
; Load 4x yleft_col
|
||||
sub r0, r0, #1
|
||||
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0], r2
|
||||
vdup.u8 q0, r3
|
||||
vdup.u8 q1, r4
|
||||
vdup.u8 q2, r5
|
||||
vdup.u8 q3, r6
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q1}, [r1]!
|
||||
vst1.u8 {q2}, [r1]!
|
||||
vst1.u8 {q3}, [r1]!
|
||||
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0], r2
|
||||
vdup.u8 q0, r3
|
||||
vdup.u8 q1, r4
|
||||
vdup.u8 q2, r5
|
||||
vdup.u8 q3, r6
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q1}, [r1]!
|
||||
vst1.u8 {q2}, [r1]!
|
||||
vst1.u8 {q3}, [r1]!
|
||||
|
||||
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0], r2
|
||||
vdup.u8 q0, r3
|
||||
vdup.u8 q1, r4
|
||||
vdup.u8 q2, r5
|
||||
vdup.u8 q3, r6
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q1}, [r1]!
|
||||
vst1.u8 {q2}, [r1]!
|
||||
vst1.u8 {q3}, [r1]!
|
||||
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0], r2
|
||||
vdup.u8 q0, r3
|
||||
vdup.u8 q1, r4
|
||||
vdup.u8 q2, r5
|
||||
vdup.u8 q3, r6
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q1}, [r1]!
|
||||
vst1.u8 {q2}, [r1]!
|
||||
vst1.u8 {q3}, [r1]!
|
||||
|
||||
pop {r4-r8,pc}
|
||||
|
||||
case_tm_pred
|
||||
; Load yabove_row
|
||||
sub r3, r0, r2
|
||||
vld1.8 {q8}, [r3]
|
||||
|
||||
; Load ytop_left
|
||||
sub r3, r3, #1
|
||||
ldrb r7, [r3]
|
||||
|
||||
vdup.u16 q7, r7
|
||||
|
||||
; Compute yabove_row - ytop_left
|
||||
mov r3, #1
|
||||
vdup.u8 q0, r3
|
||||
|
||||
vmull.u8 q4, d16, d0
|
||||
vmull.u8 q5, d17, d0
|
||||
|
||||
vsub.s16 q4, q4, q7
|
||||
vsub.s16 q5, q5, q7
|
||||
|
||||
; Load 4x yleft_col
|
||||
sub r0, r0, #1
|
||||
mov r12, #4
|
||||
|
||||
case_tm_pred_loop
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0], r2
|
||||
vdup.u16 q0, r3
|
||||
vdup.u16 q1, r4
|
||||
vdup.u16 q2, r5
|
||||
vdup.u16 q3, r6
|
||||
|
||||
vqadd.s16 q8, q0, q4
|
||||
vqadd.s16 q9, q0, q5
|
||||
|
||||
vqadd.s16 q10, q1, q4
|
||||
vqadd.s16 q11, q1, q5
|
||||
|
||||
vqadd.s16 q12, q2, q4
|
||||
vqadd.s16 q13, q2, q5
|
||||
|
||||
vqadd.s16 q14, q3, q4
|
||||
vqadd.s16 q15, q3, q5
|
||||
|
||||
vqshrun.s16 d0, q8, #0
|
||||
vqshrun.s16 d1, q9, #0
|
||||
|
||||
vqshrun.s16 d2, q10, #0
|
||||
vqshrun.s16 d3, q11, #0
|
||||
|
||||
vqshrun.s16 d4, q12, #0
|
||||
vqshrun.s16 d5, q13, #0
|
||||
|
||||
vqshrun.s16 d6, q14, #0
|
||||
vqshrun.s16 d7, q15, #0
|
||||
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q1}, [r1]!
|
||||
vst1.u8 {q2}, [r1]!
|
||||
vst1.u8 {q3}, [r1]!
|
||||
|
||||
subs r12, r12, #1
|
||||
bne case_tm_pred_loop
|
||||
|
||||
pop {r4-r8,pc}
|
||||
|
||||
ENDP
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; r0 unsigned char *y_buffer
|
||||
; r1 unsigned char *ypred_ptr
|
||||
; r2 int y_stride
|
||||
; r3 int mode
|
||||
; stack int Up
|
||||
; stack int Left
|
||||
|
||||
|vp8_build_intra_predictors_mby_s_neon_func| PROC
|
||||
push {r4-r8, lr}
|
||||
|
||||
mov r1, r0 ; unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor;
|
||||
|
||||
cmp r3, #0
|
||||
beq case_dc_pred_s
|
||||
cmp r3, #1
|
||||
beq case_v_pred_s
|
||||
cmp r3, #2
|
||||
beq case_h_pred_s
|
||||
cmp r3, #3
|
||||
beq case_tm_pred_s
|
||||
|
||||
case_dc_pred_s
|
||||
ldr r4, [sp, #24] ; Up
|
||||
ldr r5, [sp, #28] ; Left
|
||||
|
||||
; Default the DC average to 128
|
||||
mov r12, #128
|
||||
vdup.u8 q0, r12
|
||||
|
||||
; Zero out running sum
|
||||
mov r12, #0
|
||||
|
||||
; compute shift and jump
|
||||
adds r7, r4, r5
|
||||
beq skip_dc_pred_up_left_s
|
||||
|
||||
; Load above row, if it exists
|
||||
cmp r4, #0
|
||||
beq skip_dc_pred_up_s
|
||||
|
||||
sub r6, r0, r2
|
||||
vld1.8 {q1}, [r6]
|
||||
vpaddl.u8 q2, q1
|
||||
vpaddl.u16 q3, q2
|
||||
vpaddl.u32 q4, q3
|
||||
|
||||
vmov.32 r4, d8[0]
|
||||
vmov.32 r6, d9[0]
|
||||
|
||||
add r12, r4, r6
|
||||
|
||||
; Move back to interger registers
|
||||
|
||||
skip_dc_pred_up_s
|
||||
|
||||
cmp r5, #0
|
||||
beq skip_dc_pred_left_s
|
||||
|
||||
sub r0, r0, #1
|
||||
|
||||
; Load left row, if it exists
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0], r2
|
||||
|
||||
add r12, r12, r3
|
||||
add r12, r12, r4
|
||||
add r12, r12, r5
|
||||
add r12, r12, r6
|
||||
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0], r2
|
||||
|
||||
add r12, r12, r3
|
||||
add r12, r12, r4
|
||||
add r12, r12, r5
|
||||
add r12, r12, r6
|
||||
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0], r2
|
||||
|
||||
add r12, r12, r3
|
||||
add r12, r12, r4
|
||||
add r12, r12, r5
|
||||
add r12, r12, r6
|
||||
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0]
|
||||
|
||||
add r12, r12, r3
|
||||
add r12, r12, r4
|
||||
add r12, r12, r5
|
||||
add r12, r12, r6
|
||||
|
||||
skip_dc_pred_left_s
|
||||
add r7, r7, #3 ; Shift
|
||||
sub r4, r7, #1
|
||||
mov r5, #1
|
||||
add r12, r12, r5, lsl r4
|
||||
mov r5, r12, lsr r7 ; expected_dc
|
||||
|
||||
vdup.u8 q0, r5
|
||||
|
||||
skip_dc_pred_up_left_s
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
|
||||
pop {r4-r8,pc}
|
||||
case_v_pred_s
|
||||
; Copy down above row
|
||||
sub r6, r0, r2
|
||||
vld1.8 {q0}, [r6]
|
||||
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
pop {r4-r8,pc}
|
||||
|
||||
case_h_pred_s
|
||||
; Load 4x yleft_col
|
||||
sub r0, r0, #1
|
||||
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0], r2
|
||||
vdup.u8 q0, r3
|
||||
vdup.u8 q1, r4
|
||||
vdup.u8 q2, r5
|
||||
vdup.u8 q3, r6
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q1}, [r1], r2
|
||||
vst1.u8 {q2}, [r1], r2
|
||||
vst1.u8 {q3}, [r1], r2
|
||||
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0], r2
|
||||
vdup.u8 q0, r3
|
||||
vdup.u8 q1, r4
|
||||
vdup.u8 q2, r5
|
||||
vdup.u8 q3, r6
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q1}, [r1], r2
|
||||
vst1.u8 {q2}, [r1], r2
|
||||
vst1.u8 {q3}, [r1], r2
|
||||
|
||||
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0], r2
|
||||
vdup.u8 q0, r3
|
||||
vdup.u8 q1, r4
|
||||
vdup.u8 q2, r5
|
||||
vdup.u8 q3, r6
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q1}, [r1], r2
|
||||
vst1.u8 {q2}, [r1], r2
|
||||
vst1.u8 {q3}, [r1], r2
|
||||
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0], r2
|
||||
vdup.u8 q0, r3
|
||||
vdup.u8 q1, r4
|
||||
vdup.u8 q2, r5
|
||||
vdup.u8 q3, r6
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q1}, [r1], r2
|
||||
vst1.u8 {q2}, [r1], r2
|
||||
vst1.u8 {q3}, [r1], r2
|
||||
|
||||
pop {r4-r8,pc}
|
||||
|
||||
case_tm_pred_s
|
||||
; Load yabove_row
|
||||
sub r3, r0, r2
|
||||
vld1.8 {q8}, [r3]
|
||||
|
||||
; Load ytop_left
|
||||
sub r3, r3, #1
|
||||
ldrb r7, [r3]
|
||||
|
||||
vdup.u16 q7, r7
|
||||
|
||||
; Compute yabove_row - ytop_left
|
||||
mov r3, #1
|
||||
vdup.u8 q0, r3
|
||||
|
||||
vmull.u8 q4, d16, d0
|
||||
vmull.u8 q5, d17, d0
|
||||
|
||||
vsub.s16 q4, q4, q7
|
||||
vsub.s16 q5, q5, q7
|
||||
|
||||
; Load 4x yleft_col
|
||||
sub r0, r0, #1
|
||||
mov r12, #4
|
||||
|
||||
case_tm_pred_loop_s
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0], r2
|
||||
vdup.u16 q0, r3
|
||||
vdup.u16 q1, r4
|
||||
vdup.u16 q2, r5
|
||||
vdup.u16 q3, r6
|
||||
|
||||
vqadd.s16 q8, q0, q4
|
||||
vqadd.s16 q9, q0, q5
|
||||
|
||||
vqadd.s16 q10, q1, q4
|
||||
vqadd.s16 q11, q1, q5
|
||||
|
||||
vqadd.s16 q12, q2, q4
|
||||
vqadd.s16 q13, q2, q5
|
||||
|
||||
vqadd.s16 q14, q3, q4
|
||||
vqadd.s16 q15, q3, q5
|
||||
|
||||
vqshrun.s16 d0, q8, #0
|
||||
vqshrun.s16 d1, q9, #0
|
||||
|
||||
vqshrun.s16 d2, q10, #0
|
||||
vqshrun.s16 d3, q11, #0
|
||||
|
||||
vqshrun.s16 d4, q12, #0
|
||||
vqshrun.s16 d5, q13, #0
|
||||
|
||||
vqshrun.s16 d6, q14, #0
|
||||
vqshrun.s16 d7, q15, #0
|
||||
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q1}, [r1], r2
|
||||
vst1.u8 {q2}, [r1], r2
|
||||
vst1.u8 {q3}, [r1], r2
|
||||
|
||||
subs r12, r12, #1
|
||||
bne case_tm_pred_loop_s
|
||||
|
||||
pop {r4-r8,pc}
|
||||
|
||||
ENDP
|
||||
|
||||
|
||||
END
|
|
@ -1,59 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp9_copy_mem16x16_neon|
|
||||
; ARM
|
||||
; REQUIRE8
|
||||
; PRESERVE8
|
||||
|
||||
AREA Block, CODE, READONLY ; name this block of code
|
||||
;void copy_mem16x16_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
|vp9_copy_mem16x16_neon| PROC
|
||||
|
||||
vld1.u8 {q0}, [r0], r1
|
||||
vld1.u8 {q1}, [r0], r1
|
||||
vld1.u8 {q2}, [r0], r1
|
||||
vst1.u8 {q0}, [r2], r3
|
||||
vld1.u8 {q3}, [r0], r1
|
||||
vst1.u8 {q1}, [r2], r3
|
||||
vld1.u8 {q4}, [r0], r1
|
||||
vst1.u8 {q2}, [r2], r3
|
||||
vld1.u8 {q5}, [r0], r1
|
||||
vst1.u8 {q3}, [r2], r3
|
||||
vld1.u8 {q6}, [r0], r1
|
||||
vst1.u8 {q4}, [r2], r3
|
||||
vld1.u8 {q7}, [r0], r1
|
||||
vst1.u8 {q5}, [r2], r3
|
||||
vld1.u8 {q8}, [r0], r1
|
||||
vst1.u8 {q6}, [r2], r3
|
||||
vld1.u8 {q9}, [r0], r1
|
||||
vst1.u8 {q7}, [r2], r3
|
||||
vld1.u8 {q10}, [r0], r1
|
||||
vst1.u8 {q8}, [r2], r3
|
||||
vld1.u8 {q11}, [r0], r1
|
||||
vst1.u8 {q9}, [r2], r3
|
||||
vld1.u8 {q12}, [r0], r1
|
||||
vst1.u8 {q10}, [r2], r3
|
||||
vld1.u8 {q13}, [r0], r1
|
||||
vst1.u8 {q11}, [r2], r3
|
||||
vld1.u8 {q14}, [r0], r1
|
||||
vst1.u8 {q12}, [r2], r3
|
||||
vld1.u8 {q15}, [r0], r1
|
||||
vst1.u8 {q13}, [r2], r3
|
||||
vst1.u8 {q14}, [r2], r3
|
||||
vst1.u8 {q15}, [r2], r3
|
||||
|
||||
mov pc, lr
|
||||
|
||||
ENDP ; |vp9_copy_mem16x16_neon|
|
||||
|
||||
END
|
|
@ -1,34 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp9_copy_mem8x4_neon|
|
||||
; ARM
|
||||
; REQUIRE8
|
||||
; PRESERVE8
|
||||
|
||||
AREA Block, CODE, READONLY ; name this block of code
|
||||
;void copy_mem8x4_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
|vp9_copy_mem8x4_neon| PROC
|
||||
vld1.u8 {d0}, [r0], r1
|
||||
vld1.u8 {d1}, [r0], r1
|
||||
vst1.u8 {d0}, [r2], r3
|
||||
vld1.u8 {d2}, [r0], r1
|
||||
vst1.u8 {d1}, [r2], r3
|
||||
vld1.u8 {d3}, [r0], r1
|
||||
vst1.u8 {d2}, [r2], r3
|
||||
vst1.u8 {d3}, [r2], r3
|
||||
|
||||
mov pc, lr
|
||||
|
||||
ENDP ; |vp9_copy_mem8x4_neon|
|
||||
|
||||
END
|
|
@ -1,43 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp9_copy_mem8x8_neon|
|
||||
; ARM
|
||||
; REQUIRE8
|
||||
; PRESERVE8
|
||||
|
||||
AREA Block, CODE, READONLY ; name this block of code
|
||||
;void copy_mem8x8_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
|vp9_copy_mem8x8_neon| PROC
|
||||
|
||||
vld1.u8 {d0}, [r0], r1
|
||||
vld1.u8 {d1}, [r0], r1
|
||||
vst1.u8 {d0}, [r2], r3
|
||||
vld1.u8 {d2}, [r0], r1
|
||||
vst1.u8 {d1}, [r2], r3
|
||||
vld1.u8 {d3}, [r0], r1
|
||||
vst1.u8 {d2}, [r2], r3
|
||||
vld1.u8 {d4}, [r0], r1
|
||||
vst1.u8 {d3}, [r2], r3
|
||||
vld1.u8 {d5}, [r0], r1
|
||||
vst1.u8 {d4}, [r2], r3
|
||||
vld1.u8 {d6}, [r0], r1
|
||||
vst1.u8 {d5}, [r2], r3
|
||||
vld1.u8 {d7}, [r0], r1
|
||||
vst1.u8 {d6}, [r2], r3
|
||||
vst1.u8 {d7}, [r2], r3
|
||||
|
||||
mov pc, lr
|
||||
|
||||
ENDP ; |vp9_copy_mem8x8_neon|
|
||||
|
||||
END
|
|
@ -1,49 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license and patent
|
||||
; grant that can be found in the LICENSE file in the root of the source
|
||||
; tree. All contributing project authors may be found in the AUTHORS
|
||||
; file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_dc_only_idct_add_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
;void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr,
|
||||
; unsigned char *dst_ptr, int pitch, int stride)
|
||||
; r0 input_dc
|
||||
; r1 pred_ptr
|
||||
; r2 dst_ptr
|
||||
; r3 pitch
|
||||
; sp stride
|
||||
|vp8_dc_only_idct_add_neon| PROC
|
||||
add r0, r0, #4
|
||||
asr r0, r0, #3
|
||||
ldr r12, [sp]
|
||||
vdup.16 q0, r0
|
||||
|
||||
vld1.32 {d2[0]}, [r1], r3
|
||||
vld1.32 {d2[1]}, [r1], r3
|
||||
vld1.32 {d4[0]}, [r1], r3
|
||||
vld1.32 {d4[1]}, [r1]
|
||||
|
||||
vaddw.u8 q1, q0, d2
|
||||
vaddw.u8 q2, q0, d4
|
||||
|
||||
vqmovun.s16 d2, q1
|
||||
vqmovun.s16 d4, q2
|
||||
|
||||
vst1.32 {d2[0]}, [r2], r12
|
||||
vst1.32 {d2[1]}, [r2], r12
|
||||
vst1.32 {d4[0]}, [r2], r12
|
||||
vst1.32 {d4[1]}, [r2]
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
END
|
|
@ -1,80 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
EXPORT |vp8_short_inv_walsh4x4_neon|
|
||||
EXPORT |vp8_short_inv_walsh4x4_1_neon|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA |.text|, CODE, READONLY ; name this block of code
|
||||
|
||||
;short vp8_short_inv_walsh4x4_neon(short *input, short *output)
|
||||
|vp8_short_inv_walsh4x4_neon| PROC
|
||||
|
||||
; read in all four lines of values: d0->d3
|
||||
vld1.i16 {q0-q1}, [r0@128]
|
||||
|
||||
; first for loop
|
||||
vadd.s16 d4, d0, d3 ;a = [0] + [12]
|
||||
vadd.s16 d6, d1, d2 ;b = [4] + [8]
|
||||
vsub.s16 d5, d0, d3 ;d = [0] - [12]
|
||||
vsub.s16 d7, d1, d2 ;c = [4] - [8]
|
||||
|
||||
vadd.s16 q0, q2, q3 ; a+b d+c
|
||||
vsub.s16 q1, q2, q3 ; a-b d-c
|
||||
|
||||
vtrn.32 d0, d2 ;d0: 0 1 8 9
|
||||
;d2: 2 3 10 11
|
||||
vtrn.32 d1, d3 ;d1: 4 5 12 13
|
||||
;d3: 6 7 14 15
|
||||
|
||||
vtrn.16 d0, d1 ;d0: 0 4 8 12
|
||||
;d1: 1 5 9 13
|
||||
vtrn.16 d2, d3 ;d2: 2 6 10 14
|
||||
;d3: 3 7 11 15
|
||||
|
||||
; second for loop
|
||||
|
||||
vadd.s16 d4, d0, d3 ;a = [0] + [3]
|
||||
vadd.s16 d6, d1, d2 ;b = [1] + [2]
|
||||
vsub.s16 d5, d0, d3 ;d = [0] - [3]
|
||||
vsub.s16 d7, d1, d2 ;c = [1] - [2]
|
||||
|
||||
vmov.i16 q8, #3
|
||||
|
||||
vadd.s16 q0, q2, q3 ; a+b d+c
|
||||
vsub.s16 q1, q2, q3 ; a-b d-c
|
||||
|
||||
vadd.i16 q0, q0, q8 ;e/f += 3
|
||||
vadd.i16 q1, q1, q8 ;g/h += 3
|
||||
|
||||
vshr.s16 q0, q0, #3 ;e/f >> 3
|
||||
vshr.s16 q1, q1, #3 ;g/h >> 3
|
||||
|
||||
vst4.i16 {d0,d1,d2,d3}, [r1@128]
|
||||
|
||||
bx lr
|
||||
ENDP ; |vp8_short_inv_walsh4x4_neon|
|
||||
|
||||
|
||||
;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output)
|
||||
|vp8_short_inv_walsh4x4_1_neon| PROC
|
||||
ldrsh r2, [r0] ; load input[0]
|
||||
add r3, r2, #3 ; add 3
|
||||
add r2, r1, #16 ; base for last 8 output
|
||||
asr r0, r3, #3 ; right shift 3
|
||||
vdup.16 q0, r0 ; load and duplicate
|
||||
vst1.16 {q0}, [r1@128] ; write back 8
|
||||
vst1.16 {q0}, [r2@128] ; write back last 8
|
||||
bx lr
|
||||
ENDP ; |vp8_short_inv_walsh4x4_1_neon|
|
||||
|
||||
END
|
|
@ -1,397 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp9_loop_filter_horizontal_edge_y_neon|
|
||||
EXPORT |vp9_loop_filter_horizontal_edge_uv_neon|
|
||||
EXPORT |vp9_loop_filter_vertical_edge_y_neon|
|
||||
EXPORT |vp9_loop_filter_vertical_edge_uv_neon|
|
||||
ARM
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; r0 unsigned char *src
|
||||
; r1 int pitch
|
||||
; r2 unsigned char blimit
|
||||
; r3 unsigned char limit
|
||||
; sp unsigned char thresh,
|
||||
|vp9_loop_filter_horizontal_edge_y_neon| PROC
|
||||
push {lr}
|
||||
vdup.u8 q0, r2 ; duplicate blimit
|
||||
vdup.u8 q1, r3 ; duplicate limit
|
||||
sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines
|
||||
ldr r3, [sp, #4] ; load thresh
|
||||
add r12, r2, r1
|
||||
add r1, r1, r1
|
||||
|
||||
vdup.u8 q2, r3 ; duplicate thresh
|
||||
|
||||
vld1.u8 {q3}, [r2@128], r1 ; p3
|
||||
vld1.u8 {q4}, [r12@128], r1 ; p2
|
||||
vld1.u8 {q5}, [r2@128], r1 ; p1
|
||||
vld1.u8 {q6}, [r12@128], r1 ; p0
|
||||
vld1.u8 {q7}, [r2@128], r1 ; q0
|
||||
vld1.u8 {q8}, [r12@128], r1 ; q1
|
||||
vld1.u8 {q9}, [r2@128] ; q2
|
||||
vld1.u8 {q10}, [r12@128] ; q3
|
||||
|
||||
sub r2, r2, r1, lsl #1
|
||||
sub r12, r12, r1, lsl #1
|
||||
|
||||
bl vp9_loop_filter_neon
|
||||
|
||||
vst1.u8 {q5}, [r2@128], r1 ; store op1
|
||||
vst1.u8 {q6}, [r12@128], r1 ; store op0
|
||||
vst1.u8 {q7}, [r2@128], r1 ; store oq0
|
||||
vst1.u8 {q8}, [r12@128], r1 ; store oq1
|
||||
|
||||
pop {pc}
|
||||
ENDP ; |vp9_loop_filter_horizontal_edge_y_neon|
|
||||
|
||||
|
||||
; r0 unsigned char *u,
|
||||
; r1 int pitch,
|
||||
; r2 unsigned char blimit
|
||||
; r3 unsigned char limit
|
||||
; sp unsigned char thresh,
|
||||
; sp+4 unsigned char *v
|
||||
|vp9_loop_filter_horizontal_edge_uv_neon| PROC
|
||||
push {lr}
|
||||
vdup.u8 q0, r2 ; duplicate blimit
|
||||
vdup.u8 q1, r3 ; duplicate limit
|
||||
ldr r12, [sp, #4] ; load thresh
|
||||
ldr r2, [sp, #8] ; load v ptr
|
||||
vdup.u8 q2, r12 ; duplicate thresh
|
||||
|
||||
sub r3, r0, r1, lsl #2 ; move u pointer down by 4 lines
|
||||
sub r12, r2, r1, lsl #2 ; move v pointer down by 4 lines
|
||||
|
||||
vld1.u8 {d6}, [r3@64], r1 ; p3
|
||||
vld1.u8 {d7}, [r12@64], r1 ; p3
|
||||
vld1.u8 {d8}, [r3@64], r1 ; p2
|
||||
vld1.u8 {d9}, [r12@64], r1 ; p2
|
||||
vld1.u8 {d10}, [r3@64], r1 ; p1
|
||||
vld1.u8 {d11}, [r12@64], r1 ; p1
|
||||
vld1.u8 {d12}, [r3@64], r1 ; p0
|
||||
vld1.u8 {d13}, [r12@64], r1 ; p0
|
||||
vld1.u8 {d14}, [r3@64], r1 ; q0
|
||||
vld1.u8 {d15}, [r12@64], r1 ; q0
|
||||
vld1.u8 {d16}, [r3@64], r1 ; q1
|
||||
vld1.u8 {d17}, [r12@64], r1 ; q1
|
||||
vld1.u8 {d18}, [r3@64], r1 ; q2
|
||||
vld1.u8 {d19}, [r12@64], r1 ; q2
|
||||
vld1.u8 {d20}, [r3@64] ; q3
|
||||
vld1.u8 {d21}, [r12@64] ; q3
|
||||
|
||||
bl vp9_loop_filter_neon
|
||||
|
||||
sub r0, r0, r1, lsl #1
|
||||
sub r2, r2, r1, lsl #1
|
||||
|
||||
vst1.u8 {d10}, [r0@64], r1 ; store u op1
|
||||
vst1.u8 {d11}, [r2@64], r1 ; store v op1
|
||||
vst1.u8 {d12}, [r0@64], r1 ; store u op0
|
||||
vst1.u8 {d13}, [r2@64], r1 ; store v op0
|
||||
vst1.u8 {d14}, [r0@64], r1 ; store u oq0
|
||||
vst1.u8 {d15}, [r2@64], r1 ; store v oq0
|
||||
vst1.u8 {d16}, [r0@64] ; store u oq1
|
||||
vst1.u8 {d17}, [r2@64] ; store v oq1
|
||||
|
||||
pop {pc}
|
||||
ENDP ; |vp9_loop_filter_horizontal_edge_uv_neon|
|
||||
|
||||
; void vp9_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
|
||||
; const signed char *flimit,
|
||||
; const signed char *limit,
|
||||
; const signed char *thresh,
|
||||
; int count)
|
||||
; r0 unsigned char *src
|
||||
; r1 int pitch
|
||||
; r2 unsigned char blimit
|
||||
; r3 unsigned char limit
|
||||
; sp unsigned char thresh,
|
||||
|
||||
|vp9_loop_filter_vertical_edge_y_neon| PROC
|
||||
push {lr}
|
||||
vdup.u8 q0, r2 ; duplicate blimit
|
||||
vdup.u8 q1, r3 ; duplicate limit
|
||||
sub r2, r0, #4 ; src ptr down by 4 columns
|
||||
add r1, r1, r1
|
||||
ldr r3, [sp, #4] ; load thresh
|
||||
add r12, r2, r1, asr #1
|
||||
|
||||
vld1.u8 {d6}, [r2], r1
|
||||
vld1.u8 {d8}, [r12], r1
|
||||
vld1.u8 {d10}, [r2], r1
|
||||
vld1.u8 {d12}, [r12], r1
|
||||
vld1.u8 {d14}, [r2], r1
|
||||
vld1.u8 {d16}, [r12], r1
|
||||
vld1.u8 {d18}, [r2], r1
|
||||
vld1.u8 {d20}, [r12], r1
|
||||
|
||||
vld1.u8 {d7}, [r2], r1 ; load second 8-line src data
|
||||
vld1.u8 {d9}, [r12], r1
|
||||
vld1.u8 {d11}, [r2], r1
|
||||
vld1.u8 {d13}, [r12], r1
|
||||
vld1.u8 {d15}, [r2], r1
|
||||
vld1.u8 {d17}, [r12], r1
|
||||
vld1.u8 {d19}, [r2]
|
||||
vld1.u8 {d21}, [r12]
|
||||
|
||||
;transpose to 8x16 matrix
|
||||
vtrn.32 q3, q7
|
||||
vtrn.32 q4, q8
|
||||
vtrn.32 q5, q9
|
||||
vtrn.32 q6, q10
|
||||
|
||||
vdup.u8 q2, r3 ; duplicate thresh
|
||||
|
||||
vtrn.16 q3, q5
|
||||
vtrn.16 q4, q6
|
||||
vtrn.16 q7, q9
|
||||
vtrn.16 q8, q10
|
||||
|
||||
vtrn.8 q3, q4
|
||||
vtrn.8 q5, q6
|
||||
vtrn.8 q7, q8
|
||||
vtrn.8 q9, q10
|
||||
|
||||
bl vp9_loop_filter_neon
|
||||
|
||||
vswp d12, d11
|
||||
vswp d16, d13
|
||||
|
||||
sub r0, r0, #2 ; dst ptr
|
||||
|
||||
vswp d14, d12
|
||||
vswp d16, d15
|
||||
|
||||
add r12, r0, r1, asr #1
|
||||
|
||||
;store op1, op0, oq0, oq1
|
||||
vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
|
||||
vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r12], r1
|
||||
vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
|
||||
vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r12], r1
|
||||
vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
|
||||
vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r12], r1
|
||||
vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
|
||||
vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r12], r1
|
||||
|
||||
vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r0], r1
|
||||
vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r12], r1
|
||||
vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r0], r1
|
||||
vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r12], r1
|
||||
vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r0], r1
|
||||
vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r12], r1
|
||||
vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0]
|
||||
vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r12]
|
||||
|
||||
pop {pc}
|
||||
ENDP ; |vp9_loop_filter_vertical_edge_y_neon|
|
||||
|
||||
; void vp9_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch
|
||||
; const signed char *flimit,
|
||||
; const signed char *limit,
|
||||
; const signed char *thresh,
|
||||
; unsigned char *v)
|
||||
; r0 unsigned char *u,
|
||||
; r1 int pitch,
|
||||
; r2 unsigned char blimit
|
||||
; r3 unsigned char limit
|
||||
; sp unsigned char thresh,
|
||||
; sp+4 unsigned char *v
|
||||
|vp9_loop_filter_vertical_edge_uv_neon| PROC
|
||||
push {lr}
|
||||
vdup.u8 q0, r2 ; duplicate blimit
|
||||
sub r12, r0, #4 ; move u pointer down by 4 columns
|
||||
ldr r2, [sp, #8] ; load v ptr
|
||||
vdup.u8 q1, r3 ; duplicate limit
|
||||
sub r3, r2, #4 ; move v pointer down by 4 columns
|
||||
|
||||
vld1.u8 {d6}, [r12], r1 ;load u data
|
||||
vld1.u8 {d7}, [r3], r1 ;load v data
|
||||
vld1.u8 {d8}, [r12], r1
|
||||
vld1.u8 {d9}, [r3], r1
|
||||
vld1.u8 {d10}, [r12], r1
|
||||
vld1.u8 {d11}, [r3], r1
|
||||
vld1.u8 {d12}, [r12], r1
|
||||
vld1.u8 {d13}, [r3], r1
|
||||
vld1.u8 {d14}, [r12], r1
|
||||
vld1.u8 {d15}, [r3], r1
|
||||
vld1.u8 {d16}, [r12], r1
|
||||
vld1.u8 {d17}, [r3], r1
|
||||
vld1.u8 {d18}, [r12], r1
|
||||
vld1.u8 {d19}, [r3], r1
|
||||
vld1.u8 {d20}, [r12]
|
||||
vld1.u8 {d21}, [r3]
|
||||
|
||||
ldr r12, [sp, #4] ; load thresh
|
||||
|
||||
;transpose to 8x16 matrix
|
||||
vtrn.32 q3, q7
|
||||
vtrn.32 q4, q8
|
||||
vtrn.32 q5, q9
|
||||
vtrn.32 q6, q10
|
||||
|
||||
vdup.u8 q2, r12 ; duplicate thresh
|
||||
|
||||
vtrn.16 q3, q5
|
||||
vtrn.16 q4, q6
|
||||
vtrn.16 q7, q9
|
||||
vtrn.16 q8, q10
|
||||
|
||||
vtrn.8 q3, q4
|
||||
vtrn.8 q5, q6
|
||||
vtrn.8 q7, q8
|
||||
vtrn.8 q9, q10
|
||||
|
||||
bl vp9_loop_filter_neon
|
||||
|
||||
vswp d12, d11
|
||||
vswp d16, d13
|
||||
vswp d14, d12
|
||||
vswp d16, d15
|
||||
|
||||
sub r0, r0, #2
|
||||
sub r2, r2, #2
|
||||
|
||||
;store op1, op0, oq0, oq1
|
||||
vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
|
||||
vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2], r1
|
||||
vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
|
||||
vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r2], r1
|
||||
vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
|
||||
vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r2], r1
|
||||
vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
|
||||
vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r2], r1
|
||||
vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
|
||||
vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r2], r1
|
||||
vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
|
||||
vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r2], r1
|
||||
vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
|
||||
vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r2], r1
|
||||
vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0]
|
||||
vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2]
|
||||
|
||||
pop {pc}
|
||||
ENDP ; |vp9_loop_filter_vertical_edge_uv_neon|
|
||||
|
||||
; void vp9_loop_filter_neon();
|
||||
; This is a helper function for the loopfilters. The invidual functions do the
|
||||
; necessary load, transpose (if necessary) and store.
|
||||
|
||||
; r0-r3 PRESERVE
|
||||
; q0 flimit
|
||||
; q1 limit
|
||||
; q2 thresh
|
||||
; q3 p3
|
||||
; q4 p2
|
||||
; q5 p1
|
||||
; q6 p0
|
||||
; q7 q0
|
||||
; q8 q1
|
||||
; q9 q2
|
||||
; q10 q3
|
||||
|vp9_loop_filter_neon| PROC
|
||||
|
||||
; vp9_filter_mask
|
||||
vabd.u8 q11, q3, q4 ; abs(p3 - p2)
|
||||
vabd.u8 q12, q4, q5 ; abs(p2 - p1)
|
||||
vabd.u8 q13, q5, q6 ; abs(p1 - p0)
|
||||
vabd.u8 q14, q8, q7 ; abs(q1 - q0)
|
||||
vabd.u8 q3, q9, q8 ; abs(q2 - q1)
|
||||
vabd.u8 q4, q10, q9 ; abs(q3 - q2)
|
||||
|
||||
vmax.u8 q11, q11, q12
|
||||
vmax.u8 q12, q13, q14
|
||||
vmax.u8 q3, q3, q4
|
||||
vmax.u8 q15, q11, q12
|
||||
|
||||
vabd.u8 q9, q6, q7 ; abs(p0 - q0)
|
||||
|
||||
; vp8_hevmask
|
||||
vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
|
||||
vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
|
||||
vmax.u8 q15, q15, q3
|
||||
|
||||
vmov.u8 q10, #0x80 ; 0x80
|
||||
|
||||
vabd.u8 q2, q5, q8 ; a = abs(p1 - q1)
|
||||
vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2
|
||||
|
||||
vcge.u8 q15, q1, q15
|
||||
|
||||
; vp9_filter() function
|
||||
; convert to signed
|
||||
veor q7, q7, q10 ; qs0
|
||||
vshr.u8 q2, q2, #1 ; a = a / 2
|
||||
veor q6, q6, q10 ; ps0
|
||||
|
||||
veor q5, q5, q10 ; ps1
|
||||
vqadd.u8 q9, q9, q2 ; a = b + a
|
||||
|
||||
veor q8, q8, q10 ; qs1
|
||||
|
||||
vmov.u8 q10, #3 ; #3
|
||||
|
||||
vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
|
||||
vsubl.s8 q11, d15, d13
|
||||
|
||||
vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1
|
||||
|
||||
vmovl.u8 q4, d20
|
||||
|
||||
vqsub.s8 q1, q5, q8 ; vp9_filter = clamp(ps1-qs1)
|
||||
vorr q14, q13, q14 ; vp8_hevmask
|
||||
|
||||
vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0)
|
||||
vmul.i16 q11, q11, q4
|
||||
|
||||
vand q1, q1, q14 ; vp9_filter &= hev
|
||||
vand q15, q15, q9 ; vp9_filter_mask
|
||||
|
||||
vaddw.s8 q2, q2, d2
|
||||
vaddw.s8 q11, q11, d3
|
||||
|
||||
vmov.u8 q9, #4 ; #4
|
||||
|
||||
; vp9_filter = clamp(vp9_filter + 3 * ( qs0 - ps0))
|
||||
vqmovn.s16 d2, q2
|
||||
vqmovn.s16 d3, q11
|
||||
vand q1, q1, q15 ; vp9_filter &= mask
|
||||
|
||||
vqadd.s8 q2, q1, q10 ; Filter2 = clamp(vp9_filter+3)
|
||||
vqadd.s8 q1, q1, q9 ; Filter1 = clamp(vp9_filter+4)
|
||||
vshr.s8 q2, q2, #3 ; Filter2 >>= 3
|
||||
vshr.s8 q1, q1, #3 ; Filter1 >>= 3
|
||||
|
||||
|
||||
vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + Filter2)
|
||||
vqsub.s8 q10, q7, q1 ; u = clamp(qs0 - Filter1)
|
||||
|
||||
; outer tap adjustments: ++vp9_filter >> 1
|
||||
vrshr.s8 q1, q1, #1
|
||||
vbic q1, q1, q14 ; vp9_filter &= ~hev
|
||||
vmov.u8 q0, #0x80 ; 0x80
|
||||
vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + vp9_filter)
|
||||
vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - vp9_filter)
|
||||
|
||||
veor q6, q11, q0 ; *op0 = u^0x80
|
||||
veor q7, q10, q0 ; *oq0 = u^0x80
|
||||
veor q5, q13, q0 ; *op1 = u^0x80
|
||||
veor q8, q12, q0 ; *oq1 = u^0x80
|
||||
|
||||
bx lr
|
||||
ENDP ; |vp9_loop_filter_horizontal_edge_y_neon|
|
||||
|
||||
;-----------------
|
||||
|
||||
END
|
|
@ -1,117 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
;EXPORT |vp9_loop_filter_simple_horizontal_edge_neon|
|
||||
EXPORT |vp9_loop_filter_bhs_neon|
|
||||
EXPORT |vp9_loop_filter_mbhs_neon|
|
||||
ARM
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; r0 unsigned char *s, PRESERVE
|
||||
; r1 int p, PRESERVE
|
||||
; q1 limit, PRESERVE
|
||||
|
||||
|vp9_loop_filter_simple_horizontal_edge_neon| PROC
|
||||
|
||||
sub r3, r0, r1, lsl #1 ; move src pointer down by 2 lines
|
||||
|
||||
vld1.u8 {q7}, [r0@128], r1 ; q0
|
||||
vld1.u8 {q5}, [r3@128], r1 ; p0
|
||||
vld1.u8 {q8}, [r0@128] ; q1
|
||||
vld1.u8 {q6}, [r3@128] ; p1
|
||||
|
||||
vabd.u8 q15, q6, q7 ; abs(p0 - q0)
|
||||
vabd.u8 q14, q5, q8 ; abs(p1 - q1)
|
||||
|
||||
vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2
|
||||
vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2
|
||||
vmov.u8 q0, #0x80 ; 0x80
|
||||
vmov.s16 q13, #3
|
||||
vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
|
||||
|
||||
veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value
|
||||
veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value
|
||||
veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value
|
||||
veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value
|
||||
|
||||
vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > limit)*-1
|
||||
|
||||
vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
|
||||
vsubl.s8 q3, d15, d13
|
||||
|
||||
vqsub.s8 q4, q5, q8 ; q4: vp9_filter = vp9_signed_char_clamp(ps1-qs1)
|
||||
|
||||
vmul.s16 q2, q2, q13 ; 3 * ( qs0 - ps0)
|
||||
vmul.s16 q3, q3, q13
|
||||
|
||||
vmov.u8 q10, #0x03 ; 0x03
|
||||
vmov.u8 q9, #0x04 ; 0x04
|
||||
|
||||
vaddw.s8 q2, q2, d8 ; vp9_filter + 3 * ( qs0 - ps0)
|
||||
vaddw.s8 q3, q3, d9
|
||||
|
||||
vqmovn.s16 d8, q2 ; vp9_filter = vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
|
||||
vqmovn.s16 d9, q3
|
||||
|
||||
vand q14, q4, q15 ; vp9_filter &= mask
|
||||
|
||||
vqadd.s8 q2, q14, q10 ; Filter2 = vp9_signed_char_clamp(vp9_filter+3)
|
||||
vqadd.s8 q3, q14, q9 ; Filter1 = vp9_signed_char_clamp(vp9_filter+4)
|
||||
vshr.s8 q2, q2, #3 ; Filter2 >>= 3
|
||||
vshr.s8 q4, q3, #3 ; Filter1 >>= 3
|
||||
|
||||
sub r0, r0, r1
|
||||
|
||||
;calculate output
|
||||
vqadd.s8 q11, q6, q2 ; u = vp9_signed_char_clamp(ps0 + Filter2)
|
||||
vqsub.s8 q10, q7, q4 ; u = vp9_signed_char_clamp(qs0 - Filter1)
|
||||
|
||||
veor q6, q11, q0 ; *op0 = u^0x80
|
||||
veor q7, q10, q0 ; *oq0 = u^0x80
|
||||
|
||||
vst1.u8 {q6}, [r3@128] ; store op0
|
||||
vst1.u8 {q7}, [r0@128] ; store oq0
|
||||
|
||||
bx lr
|
||||
ENDP ; |vp9_loop_filter_simple_horizontal_edge_neon|
|
||||
|
||||
; r0 unsigned char *y
|
||||
; r1 int ystride
|
||||
; r2 const unsigned char *blimit
|
||||
|
||||
|vp9_loop_filter_bhs_neon| PROC
|
||||
push {r4, lr}
|
||||
ldrb r3, [r2] ; load blim from mem
|
||||
vdup.s8 q1, r3 ; duplicate blim
|
||||
|
||||
add r0, r0, r1, lsl #2 ; src = y_ptr + 4 * y_stride
|
||||
bl vp9_loop_filter_simple_horizontal_edge_neon
|
||||
; vp9_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1
|
||||
add r0, r0, r1, lsl #2 ; src = y_ptr + 8* y_stride
|
||||
bl vp9_loop_filter_simple_horizontal_edge_neon
|
||||
add r0, r0, r1, lsl #2 ; src = y_ptr + 12 * y_stride
|
||||
pop {r4, lr}
|
||||
b vp9_loop_filter_simple_horizontal_edge_neon
|
||||
ENDP ;|vp9_loop_filter_bhs_neon|
|
||||
|
||||
; r0 unsigned char *y
|
||||
; r1 int ystride
|
||||
; r2 const unsigned char *blimit
|
||||
|
||||
|vp9_loop_filter_mbhs_neon| PROC
|
||||
ldrb r3, [r2] ; load blim from mem
|
||||
vdup.s8 q1, r3 ; duplicate mblim
|
||||
b vp9_loop_filter_simple_horizontal_edge_neon
|
||||
ENDP ;|vp9_loop_filter_bhs_neon|
|
||||
|
||||
END
|
|
@ -1,154 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
;EXPORT |vp9_loop_filter_simple_vertical_edge_neon|
|
||||
EXPORT |vp9_loop_filter_bvs_neon|
|
||||
EXPORT |vp9_loop_filter_mbvs_neon|
|
||||
ARM
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; r0 unsigned char *s, PRESERVE
|
||||
; r1 int p, PRESERVE
|
||||
; q1 limit, PRESERVE
|
||||
|
||||
|vp9_loop_filter_simple_vertical_edge_neon| PROC
|
||||
sub r0, r0, #2 ; move src pointer down by 2 columns
|
||||
add r12, r1, r1
|
||||
add r3, r0, r1
|
||||
|
||||
vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r12
|
||||
vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r3], r12
|
||||
vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r12
|
||||
vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r3], r12
|
||||
vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r12
|
||||
vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r3], r12
|
||||
vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r12
|
||||
vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r3], r12
|
||||
|
||||
vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r12
|
||||
vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r3], r12
|
||||
vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r12
|
||||
vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r3], r12
|
||||
vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r12
|
||||
vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r3], r12
|
||||
vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r12
|
||||
vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r3]
|
||||
|
||||
vswp d7, d10
|
||||
vswp d12, d9
|
||||
|
||||
;vp9_filter_mask() function
|
||||
;vp8_hevmask() function
|
||||
sub r0, r0, r1, lsl #4
|
||||
vabd.u8 q15, q5, q4 ; abs(p0 - q0)
|
||||
vabd.u8 q14, q3, q6 ; abs(p1 - q1)
|
||||
|
||||
vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2
|
||||
vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2
|
||||
vmov.u8 q0, #0x80 ; 0x80
|
||||
vmov.s16 q11, #3
|
||||
vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
|
||||
|
||||
veor q4, q4, q0 ; qs0: q0 offset to convert to a signed value
|
||||
veor q5, q5, q0 ; ps0: p0 offset to convert to a signed value
|
||||
veor q3, q3, q0 ; ps1: p1 offset to convert to a signed value
|
||||
veor q6, q6, q0 ; qs1: q1 offset to convert to a signed value
|
||||
|
||||
vcge.u8 q15, q1, q15 ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
|
||||
|
||||
vsubl.s8 q2, d8, d10 ; ( qs0 - ps0)
|
||||
vsubl.s8 q13, d9, d11
|
||||
|
||||
vqsub.s8 q14, q3, q6 ; vp9_filter = vp9_signed_char_clamp(ps1-qs1)
|
||||
|
||||
vmul.s16 q2, q2, q11 ; 3 * ( qs0 - ps0)
|
||||
vmul.s16 q13, q13, q11
|
||||
|
||||
vmov.u8 q11, #0x03 ; 0x03
|
||||
vmov.u8 q12, #0x04 ; 0x04
|
||||
|
||||
vaddw.s8 q2, q2, d28 ; vp9_filter + 3 * ( qs0 - ps0)
|
||||
vaddw.s8 q13, q13, d29
|
||||
|
||||
vqmovn.s16 d28, q2 ; vp9_filter = vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
|
||||
vqmovn.s16 d29, q13
|
||||
|
||||
add r0, r0, #1
|
||||
add r3, r0, r1
|
||||
|
||||
vand q14, q14, q15 ; vp9_filter &= mask
|
||||
|
||||
vqadd.s8 q2, q14, q11 ; Filter2 = vp9_signed_char_clamp(vp9_filter+3)
|
||||
vqadd.s8 q3, q14, q12 ; Filter1 = vp9_signed_char_clamp(vp9_filter+4)
|
||||
vshr.s8 q2, q2, #3 ; Filter2 >>= 3
|
||||
vshr.s8 q14, q3, #3 ; Filter1 >>= 3
|
||||
|
||||
;calculate output
|
||||
vqadd.s8 q11, q5, q2 ; u = vp9_signed_char_clamp(ps0 + Filter2)
|
||||
vqsub.s8 q10, q4, q14 ; u = vp9_signed_char_clamp(qs0 - Filter1)
|
||||
|
||||
veor q6, q11, q0 ; *op0 = u^0x80
|
||||
veor q7, q10, q0 ; *oq0 = u^0x80
|
||||
add r12, r1, r1
|
||||
vswp d13, d14
|
||||
|
||||
;store op1, op0, oq0, oq1
|
||||
vst2.8 {d12[0], d13[0]}, [r0], r12
|
||||
vst2.8 {d12[1], d13[1]}, [r3], r12
|
||||
vst2.8 {d12[2], d13[2]}, [r0], r12
|
||||
vst2.8 {d12[3], d13[3]}, [r3], r12
|
||||
vst2.8 {d12[4], d13[4]}, [r0], r12
|
||||
vst2.8 {d12[5], d13[5]}, [r3], r12
|
||||
vst2.8 {d12[6], d13[6]}, [r0], r12
|
||||
vst2.8 {d12[7], d13[7]}, [r3], r12
|
||||
vst2.8 {d14[0], d15[0]}, [r0], r12
|
||||
vst2.8 {d14[1], d15[1]}, [r3], r12
|
||||
vst2.8 {d14[2], d15[2]}, [r0], r12
|
||||
vst2.8 {d14[3], d15[3]}, [r3], r12
|
||||
vst2.8 {d14[4], d15[4]}, [r0], r12
|
||||
vst2.8 {d14[5], d15[5]}, [r3], r12
|
||||
vst2.8 {d14[6], d15[6]}, [r0], r12
|
||||
vst2.8 {d14[7], d15[7]}, [r3]
|
||||
|
||||
bx lr
|
||||
ENDP ; |vp9_loop_filter_simple_vertical_edge_neon|
|
||||
|
||||
; r0 unsigned char *y
|
||||
; r1 int ystride
|
||||
; r2 const unsigned char *blimit
|
||||
|
||||
|vp9_loop_filter_bvs_neon| PROC
|
||||
push {r4, lr}
|
||||
ldrb r3, [r2] ; load blim from mem
|
||||
mov r4, r0
|
||||
add r0, r0, #4
|
||||
vdup.s8 q1, r3 ; duplicate blim
|
||||
bl vp9_loop_filter_simple_vertical_edge_neon
|
||||
; vp9_loop_filter_simple_vertical_edge_neon preserves r1 and q1
|
||||
add r0, r4, #8
|
||||
bl vp9_loop_filter_simple_vertical_edge_neon
|
||||
add r0, r4, #12
|
||||
pop {r4, lr}
|
||||
b vp9_loop_filter_simple_vertical_edge_neon
|
||||
ENDP ;|vp9_loop_filter_bvs_neon|
|
||||
|
||||
; r0 unsigned char *y
|
||||
; r1 int ystride
|
||||
; r2 const unsigned char *blimit
|
||||
|
||||
|vp9_loop_filter_mbvs_neon| PROC
|
||||
ldrb r3, [r2] ; load mblim from mem
|
||||
vdup.s8 q1, r3 ; duplicate mblim
|
||||
b vp9_loop_filter_simple_vertical_edge_neon
|
||||
ENDP ;|vp9_loop_filter_bvs_neon|
|
||||
END
|
|
@ -1,469 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_mbloop_filter_horizontal_edge_y_neon|
|
||||
EXPORT |vp8_mbloop_filter_horizontal_edge_uv_neon|
|
||||
EXPORT |vp8_mbloop_filter_vertical_edge_y_neon|
|
||||
EXPORT |vp8_mbloop_filter_vertical_edge_uv_neon|
|
||||
ARM
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
|
||||
; const unsigned char *blimit,
|
||||
; const unsigned char *limit,
|
||||
; const unsigned char *thresh)
|
||||
; r0 unsigned char *src,
|
||||
; r1 int pitch,
|
||||
; r2 unsigned char blimit
|
||||
; r3 unsigned char limit
|
||||
; sp unsigned char thresh,
|
||||
|vp8_mbloop_filter_horizontal_edge_y_neon| PROC
|
||||
push {lr}
|
||||
add r1, r1, r1 ; double stride
|
||||
ldr r12, [sp, #4] ; load thresh
|
||||
sub r0, r0, r1, lsl #1 ; move src pointer down by 4 lines
|
||||
vdup.u8 q2, r12 ; thresh
|
||||
add r12, r0, r1, lsr #1 ; move src pointer up by 1 line
|
||||
|
||||
vld1.u8 {q3}, [r0@128], r1 ; p3
|
||||
vld1.u8 {q4}, [r12@128], r1 ; p2
|
||||
vld1.u8 {q5}, [r0@128], r1 ; p1
|
||||
vld1.u8 {q6}, [r12@128], r1 ; p0
|
||||
vld1.u8 {q7}, [r0@128], r1 ; q0
|
||||
vld1.u8 {q8}, [r12@128], r1 ; q1
|
||||
vld1.u8 {q9}, [r0@128], r1 ; q2
|
||||
vld1.u8 {q10}, [r12@128], r1 ; q3
|
||||
|
||||
bl vp8_mbloop_filter_neon
|
||||
|
||||
sub r12, r12, r1, lsl #2
|
||||
add r0, r12, r1, lsr #1
|
||||
|
||||
vst1.u8 {q4}, [r12@128],r1 ; store op2
|
||||
vst1.u8 {q5}, [r0@128],r1 ; store op1
|
||||
vst1.u8 {q6}, [r12@128], r1 ; store op0
|
||||
vst1.u8 {q7}, [r0@128],r1 ; store oq0
|
||||
vst1.u8 {q8}, [r12@128] ; store oq1
|
||||
vst1.u8 {q9}, [r0@128] ; store oq2
|
||||
|
||||
pop {pc}
|
||||
ENDP ; |vp8_mbloop_filter_horizontal_edge_y_neon|
|
||||
|
||||
; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
|
||||
; const unsigned char *blimit,
|
||||
; const unsigned char *limit,
|
||||
; const unsigned char *thresh,
|
||||
; unsigned char *v)
|
||||
; r0 unsigned char *u,
|
||||
; r1 int pitch,
|
||||
; r2 unsigned char blimit
|
||||
; r3 unsigned char limit
|
||||
; sp unsigned char thresh,
|
||||
; sp+4 unsigned char *v
|
||||
|
||||
|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
|
||||
push {lr}
|
||||
ldr r12, [sp, #4] ; load thresh
|
||||
sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines
|
||||
vdup.u8 q2, r12 ; thresh
|
||||
ldr r12, [sp, #8] ; load v ptr
|
||||
sub r12, r12, r1, lsl #2 ; move v pointer down by 4 lines
|
||||
|
||||
vld1.u8 {d6}, [r0@64], r1 ; p3
|
||||
vld1.u8 {d7}, [r12@64], r1 ; p3
|
||||
vld1.u8 {d8}, [r0@64], r1 ; p2
|
||||
vld1.u8 {d9}, [r12@64], r1 ; p2
|
||||
vld1.u8 {d10}, [r0@64], r1 ; p1
|
||||
vld1.u8 {d11}, [r12@64], r1 ; p1
|
||||
vld1.u8 {d12}, [r0@64], r1 ; p0
|
||||
vld1.u8 {d13}, [r12@64], r1 ; p0
|
||||
vld1.u8 {d14}, [r0@64], r1 ; q0
|
||||
vld1.u8 {d15}, [r12@64], r1 ; q0
|
||||
vld1.u8 {d16}, [r0@64], r1 ; q1
|
||||
vld1.u8 {d17}, [r12@64], r1 ; q1
|
||||
vld1.u8 {d18}, [r0@64], r1 ; q2
|
||||
vld1.u8 {d19}, [r12@64], r1 ; q2
|
||||
vld1.u8 {d20}, [r0@64], r1 ; q3
|
||||
vld1.u8 {d21}, [r12@64], r1 ; q3
|
||||
|
||||
bl vp8_mbloop_filter_neon
|
||||
|
||||
sub r0, r0, r1, lsl #3
|
||||
sub r12, r12, r1, lsl #3
|
||||
|
||||
add r0, r0, r1
|
||||
add r12, r12, r1
|
||||
|
||||
vst1.u8 {d8}, [r0@64], r1 ; store u op2
|
||||
vst1.u8 {d9}, [r12@64], r1 ; store v op2
|
||||
vst1.u8 {d10}, [r0@64], r1 ; store u op1
|
||||
vst1.u8 {d11}, [r12@64], r1 ; store v op1
|
||||
vst1.u8 {d12}, [r0@64], r1 ; store u op0
|
||||
vst1.u8 {d13}, [r12@64], r1 ; store v op0
|
||||
vst1.u8 {d14}, [r0@64], r1 ; store u oq0
|
||||
vst1.u8 {d15}, [r12@64], r1 ; store v oq0
|
||||
vst1.u8 {d16}, [r0@64], r1 ; store u oq1
|
||||
vst1.u8 {d17}, [r12@64], r1 ; store v oq1
|
||||
vst1.u8 {d18}, [r0@64], r1 ; store u oq2
|
||||
vst1.u8 {d19}, [r12@64], r1 ; store v oq2
|
||||
|
||||
pop {pc}
|
||||
ENDP ; |vp8_mbloop_filter_horizontal_edge_uv_neon|
|
||||
|
||||
; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
|
||||
; const unsigned char *blimit,
|
||||
; const unsigned char *limit,
|
||||
; const unsigned char *thresh)
|
||||
; r0 unsigned char *src,
|
||||
; r1 int pitch,
|
||||
; r2 unsigned char blimit
|
||||
; r3 unsigned char limit
|
||||
; sp unsigned char thresh,
|
||||
|vp8_mbloop_filter_vertical_edge_y_neon| PROC
|
||||
push {lr}
|
||||
ldr r12, [sp, #4] ; load thresh
|
||||
sub r0, r0, #4 ; move src pointer down by 4 columns
|
||||
vdup.s8 q2, r12 ; thresh
|
||||
add r12, r0, r1, lsl #3 ; move src pointer down by 8 lines
|
||||
|
||||
vld1.u8 {d6}, [r0], r1 ; load first 8-line src data
|
||||
vld1.u8 {d7}, [r12], r1 ; load second 8-line src data
|
||||
vld1.u8 {d8}, [r0], r1
|
||||
vld1.u8 {d9}, [r12], r1
|
||||
vld1.u8 {d10}, [r0], r1
|
||||
vld1.u8 {d11}, [r12], r1
|
||||
vld1.u8 {d12}, [r0], r1
|
||||
vld1.u8 {d13}, [r12], r1
|
||||
vld1.u8 {d14}, [r0], r1
|
||||
vld1.u8 {d15}, [r12], r1
|
||||
vld1.u8 {d16}, [r0], r1
|
||||
vld1.u8 {d17}, [r12], r1
|
||||
vld1.u8 {d18}, [r0], r1
|
||||
vld1.u8 {d19}, [r12], r1
|
||||
vld1.u8 {d20}, [r0], r1
|
||||
vld1.u8 {d21}, [r12], r1
|
||||
|
||||
;transpose to 8x16 matrix
|
||||
vtrn.32 q3, q7
|
||||
vtrn.32 q4, q8
|
||||
vtrn.32 q5, q9
|
||||
vtrn.32 q6, q10
|
||||
|
||||
vtrn.16 q3, q5
|
||||
vtrn.16 q4, q6
|
||||
vtrn.16 q7, q9
|
||||
vtrn.16 q8, q10
|
||||
|
||||
vtrn.8 q3, q4
|
||||
vtrn.8 q5, q6
|
||||
vtrn.8 q7, q8
|
||||
vtrn.8 q9, q10
|
||||
|
||||
sub r0, r0, r1, lsl #3
|
||||
|
||||
bl vp8_mbloop_filter_neon
|
||||
|
||||
sub r12, r12, r1, lsl #3
|
||||
|
||||
;transpose to 16x8 matrix
|
||||
vtrn.32 q3, q7
|
||||
vtrn.32 q4, q8
|
||||
vtrn.32 q5, q9
|
||||
vtrn.32 q6, q10
|
||||
|
||||
vtrn.16 q3, q5
|
||||
vtrn.16 q4, q6
|
||||
vtrn.16 q7, q9
|
||||
vtrn.16 q8, q10
|
||||
|
||||
vtrn.8 q3, q4
|
||||
vtrn.8 q5, q6
|
||||
vtrn.8 q7, q8
|
||||
vtrn.8 q9, q10
|
||||
|
||||
;store op2, op1, op0, oq0, oq1, oq2
|
||||
vst1.8 {d6}, [r0], r1
|
||||
vst1.8 {d7}, [r12], r1
|
||||
vst1.8 {d8}, [r0], r1
|
||||
vst1.8 {d9}, [r12], r1
|
||||
vst1.8 {d10}, [r0], r1
|
||||
vst1.8 {d11}, [r12], r1
|
||||
vst1.8 {d12}, [r0], r1
|
||||
vst1.8 {d13}, [r12], r1
|
||||
vst1.8 {d14}, [r0], r1
|
||||
vst1.8 {d15}, [r12], r1
|
||||
vst1.8 {d16}, [r0], r1
|
||||
vst1.8 {d17}, [r12], r1
|
||||
vst1.8 {d18}, [r0], r1
|
||||
vst1.8 {d19}, [r12], r1
|
||||
vst1.8 {d20}, [r0]
|
||||
vst1.8 {d21}, [r12]
|
||||
|
||||
pop {pc}
|
||||
ENDP ; |vp8_mbloop_filter_vertical_edge_y_neon|
|
||||
|
||||
; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
|
||||
; const unsigned char *blimit,
|
||||
; const unsigned char *limit,
|
||||
; const unsigned char *thresh,
|
||||
; unsigned char *v)
|
||||
; r0 unsigned char *u,
|
||||
; r1 int pitch,
|
||||
; r2 const signed char *flimit,
|
||||
; r3 const signed char *limit,
|
||||
; sp const signed char *thresh,
|
||||
; sp+4 unsigned char *v
|
||||
|vp8_mbloop_filter_vertical_edge_uv_neon| PROC
|
||||
push {lr}
|
||||
ldr r12, [sp, #4] ; load thresh
|
||||
sub r0, r0, #4 ; move u pointer down by 4 columns
|
||||
vdup.u8 q2, r12 ; thresh
|
||||
ldr r12, [sp, #8] ; load v ptr
|
||||
sub r12, r12, #4 ; move v pointer down by 4 columns
|
||||
|
||||
vld1.u8 {d6}, [r0], r1 ;load u data
|
||||
vld1.u8 {d7}, [r12], r1 ;load v data
|
||||
vld1.u8 {d8}, [r0], r1
|
||||
vld1.u8 {d9}, [r12], r1
|
||||
vld1.u8 {d10}, [r0], r1
|
||||
vld1.u8 {d11}, [r12], r1
|
||||
vld1.u8 {d12}, [r0], r1
|
||||
vld1.u8 {d13}, [r12], r1
|
||||
vld1.u8 {d14}, [r0], r1
|
||||
vld1.u8 {d15}, [r12], r1
|
||||
vld1.u8 {d16}, [r0], r1
|
||||
vld1.u8 {d17}, [r12], r1
|
||||
vld1.u8 {d18}, [r0], r1
|
||||
vld1.u8 {d19}, [r12], r1
|
||||
vld1.u8 {d20}, [r0], r1
|
||||
vld1.u8 {d21}, [r12], r1
|
||||
|
||||
;transpose to 8x16 matrix
|
||||
vtrn.32 q3, q7
|
||||
vtrn.32 q4, q8
|
||||
vtrn.32 q5, q9
|
||||
vtrn.32 q6, q10
|
||||
|
||||
vtrn.16 q3, q5
|
||||
vtrn.16 q4, q6
|
||||
vtrn.16 q7, q9
|
||||
vtrn.16 q8, q10
|
||||
|
||||
vtrn.8 q3, q4
|
||||
vtrn.8 q5, q6
|
||||
vtrn.8 q7, q8
|
||||
vtrn.8 q9, q10
|
||||
|
||||
sub r0, r0, r1, lsl #3
|
||||
|
||||
bl vp8_mbloop_filter_neon
|
||||
|
||||
sub r12, r12, r1, lsl #3
|
||||
|
||||
;transpose to 16x8 matrix
|
||||
vtrn.32 q3, q7
|
||||
vtrn.32 q4, q8
|
||||
vtrn.32 q5, q9
|
||||
vtrn.32 q6, q10
|
||||
|
||||
vtrn.16 q3, q5
|
||||
vtrn.16 q4, q6
|
||||
vtrn.16 q7, q9
|
||||
vtrn.16 q8, q10
|
||||
|
||||
vtrn.8 q3, q4
|
||||
vtrn.8 q5, q6
|
||||
vtrn.8 q7, q8
|
||||
vtrn.8 q9, q10
|
||||
|
||||
;store op2, op1, op0, oq0, oq1, oq2
|
||||
vst1.8 {d6}, [r0], r1
|
||||
vst1.8 {d7}, [r12], r1
|
||||
vst1.8 {d8}, [r0], r1
|
||||
vst1.8 {d9}, [r12], r1
|
||||
vst1.8 {d10}, [r0], r1
|
||||
vst1.8 {d11}, [r12], r1
|
||||
vst1.8 {d12}, [r0], r1
|
||||
vst1.8 {d13}, [r12], r1
|
||||
vst1.8 {d14}, [r0], r1
|
||||
vst1.8 {d15}, [r12], r1
|
||||
vst1.8 {d16}, [r0], r1
|
||||
vst1.8 {d17}, [r12], r1
|
||||
vst1.8 {d18}, [r0], r1
|
||||
vst1.8 {d19}, [r12], r1
|
||||
vst1.8 {d20}, [r0]
|
||||
vst1.8 {d21}, [r12]
|
||||
|
||||
pop {pc}
|
||||
ENDP ; |vp8_mbloop_filter_vertical_edge_uv_neon|
|
||||
|
||||
; void vp8_mbloop_filter_neon()
|
||||
; This is a helper function for the macroblock loopfilters. The individual
|
||||
; functions do the necessary load, transpose (if necessary), preserve (if
|
||||
; necessary) and store.
|
||||
|
||||
; r0,r1 PRESERVE
|
||||
; r2 mblimit
|
||||
; r3 limit
|
||||
|
||||
; q2 thresh
|
||||
; q3 p3 PRESERVE
|
||||
; q4 p2
|
||||
; q5 p1
|
||||
; q6 p0
|
||||
; q7 q0
|
||||
; q8 q1
|
||||
; q9 q2
|
||||
; q10 q3 PRESERVE
|
||||
|
||||
|vp8_mbloop_filter_neon| PROC
|
||||
|
||||
; vp9_filter_mask
|
||||
vabd.u8 q11, q3, q4 ; abs(p3 - p2)
|
||||
vabd.u8 q12, q4, q5 ; abs(p2 - p1)
|
||||
vabd.u8 q13, q5, q6 ; abs(p1 - p0)
|
||||
vabd.u8 q14, q8, q7 ; abs(q1 - q0)
|
||||
vabd.u8 q1, q9, q8 ; abs(q2 - q1)
|
||||
vabd.u8 q0, q10, q9 ; abs(q3 - q2)
|
||||
|
||||
vmax.u8 q11, q11, q12
|
||||
vmax.u8 q12, q13, q14
|
||||
vmax.u8 q1, q1, q0
|
||||
vmax.u8 q15, q11, q12
|
||||
|
||||
vabd.u8 q12, q6, q7 ; abs(p0 - q0)
|
||||
|
||||
; vp8_hevmask
|
||||
vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh) * -1
|
||||
vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh) * -1
|
||||
vmax.u8 q15, q15, q1
|
||||
|
||||
vdup.u8 q1, r3 ; limit
|
||||
vdup.u8 q2, r2 ; mblimit
|
||||
|
||||
vmov.u8 q0, #0x80 ; 0x80
|
||||
|
||||
vcge.u8 q15, q1, q15
|
||||
|
||||
vabd.u8 q1, q5, q8 ; a = abs(p1 - q1)
|
||||
vqadd.u8 q12, q12, q12 ; b = abs(p0 - q0) * 2
|
||||
vmov.u16 q11, #3 ; #3
|
||||
|
||||
; vp9_filter
|
||||
; convert to signed
|
||||
veor q7, q7, q0 ; qs0
|
||||
vshr.u8 q1, q1, #1 ; a = a / 2
|
||||
veor q6, q6, q0 ; ps0
|
||||
veor q5, q5, q0 ; ps1
|
||||
|
||||
vqadd.u8 q12, q12, q1 ; a = b + a
|
||||
|
||||
veor q8, q8, q0 ; qs1
|
||||
veor q4, q4, q0 ; ps2
|
||||
veor q9, q9, q0 ; qs2
|
||||
|
||||
vorr q14, q13, q14 ; vp8_hevmask
|
||||
|
||||
vcge.u8 q12, q2, q12 ; (a > flimit * 2 + limit) * -1
|
||||
|
||||
vsubl.s8 q2, d14, d12 ; qs0 - ps0
|
||||
vsubl.s8 q13, d15, d13
|
||||
|
||||
vqsub.s8 q1, q5, q8 ; vp9_filter = clamp(ps1-qs1)
|
||||
|
||||
vmul.i16 q2, q2, q11 ; 3 * ( qs0 - ps0)
|
||||
|
||||
vand q15, q15, q12 ; vp9_filter_mask
|
||||
|
||||
vmul.i16 q13, q13, q11
|
||||
|
||||
vmov.u8 q12, #3 ; #3
|
||||
|
||||
vaddw.s8 q2, q2, d2 ; vp9_filter + 3 * ( qs0 - ps0)
|
||||
vaddw.s8 q13, q13, d3
|
||||
|
||||
vmov.u8 q11, #4 ; #4
|
||||
|
||||
; vp9_filter = clamp(vp9_filter + 3 * ( qs0 - ps0))
|
||||
vqmovn.s16 d2, q2
|
||||
vqmovn.s16 d3, q13
|
||||
|
||||
vand q1, q1, q15 ; vp9_filter &= mask
|
||||
|
||||
vmov.u16 q15, #63 ; #63
|
||||
|
||||
vand q13, q1, q14 ; Filter2 &= hev
|
||||
|
||||
vqadd.s8 q2, q13, q11 ; Filter1 = clamp(Filter2+4)
|
||||
vqadd.s8 q13, q13, q12 ; Filter2 = clamp(Filter2+3)
|
||||
|
||||
vmov q0, q15
|
||||
|
||||
vshr.s8 q2, q2, #3 ; Filter1 >>= 3
|
||||
vshr.s8 q13, q13, #3 ; Filter2 >>= 3
|
||||
|
||||
vmov q11, q15
|
||||
vmov q12, q15
|
||||
|
||||
vqsub.s8 q7, q7, q2 ; qs0 = clamp(qs0 - Filter1)
|
||||
|
||||
vqadd.s8 q6, q6, q13 ; ps0 = clamp(ps0 + Filter2)
|
||||
|
||||
vbic q1, q1, q14 ; vp9_filter &= ~hev
|
||||
|
||||
; roughly 1/7th difference across boundary
|
||||
; roughly 2/7th difference across boundary
|
||||
; roughly 3/7th difference across boundary
|
||||
|
||||
vmov.u8 d5, #9 ; #9
|
||||
vmov.u8 d4, #18 ; #18
|
||||
|
||||
vmov q13, q15
|
||||
vmov q14, q15
|
||||
|
||||
vmlal.s8 q0, d2, d5 ; 63 + Filter2 * 9
|
||||
vmlal.s8 q11, d3, d5
|
||||
vmov.u8 d5, #27 ; #27
|
||||
vmlal.s8 q12, d2, d4 ; 63 + Filter2 * 18
|
||||
vmlal.s8 q13, d3, d4
|
||||
vmlal.s8 q14, d2, d5 ; 63 + Filter2 * 27
|
||||
vmlal.s8 q15, d3, d5
|
||||
|
||||
vqshrn.s16 d0, q0, #7 ; u = clamp((63 + Filter2 * 9)>>7)
|
||||
vqshrn.s16 d1, q11, #7
|
||||
vqshrn.s16 d24, q12, #7 ; u = clamp((63 + Filter2 * 18)>>7)
|
||||
vqshrn.s16 d25, q13, #7
|
||||
vqshrn.s16 d28, q14, #7 ; u = clamp((63 + Filter2 * 27)>>7)
|
||||
vqshrn.s16 d29, q15, #7
|
||||
|
||||
vmov.u8 q1, #0x80 ; 0x80
|
||||
|
||||
vqsub.s8 q11, q9, q0 ; s = clamp(qs2 - u)
|
||||
vqadd.s8 q0, q4, q0 ; s = clamp(ps2 + u)
|
||||
vqsub.s8 q13, q8, q12 ; s = clamp(qs1 - u)
|
||||
vqadd.s8 q12, q5, q12 ; s = clamp(ps1 + u)
|
||||
vqsub.s8 q15, q7, q14 ; s = clamp(qs0 - u)
|
||||
vqadd.s8 q14, q6, q14 ; s = clamp(ps0 + u)
|
||||
|
||||
veor q9, q11, q1 ; *oq2 = s^0x80
|
||||
veor q4, q0, q1 ; *op2 = s^0x80
|
||||
veor q8, q13, q1 ; *oq1 = s^0x80
|
||||
veor q5, q12, q1 ; *op2 = s^0x80
|
||||
veor q7, q15, q1 ; *oq0 = s^0x80
|
||||
veor q6, q14, q1 ; *op0 = s^0x80
|
||||
|
||||
bx lr
|
||||
ENDP ; |vp8_mbloop_filter_neon|
|
||||
|
||||
;-----------------
|
||||
|
||||
END
|
|
@ -1,131 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_recon16x16mb_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; r0 unsigned char *pred_ptr,
|
||||
; r1 short *diff_ptr,
|
||||
; r2 unsigned char *dst_ptr,
|
||||
; r3 int ystride,
|
||||
; stack unsigned char *udst_ptr,
|
||||
; stack unsigned char *vdst_ptr
|
||||
|
||||
|vp8_recon16x16mb_neon| PROC
|
||||
mov r12, #4 ;loop counter for Y loop
|
||||
|
||||
recon16x16mb_loop_y
|
||||
vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr
|
||||
vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr
|
||||
vld1.u8 {q14, q15}, [r0]!
|
||||
vld1.16 {q10, q11}, [r1]!
|
||||
|
||||
vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits
|
||||
vmovl.u8 q1, d25
|
||||
vmovl.u8 q2, d26
|
||||
vmovl.u8 q3, d27
|
||||
vmovl.u8 q4, d28
|
||||
vmovl.u8 q5, d29
|
||||
vmovl.u8 q6, d30
|
||||
vld1.16 {q12, q13}, [r1]!
|
||||
vmovl.u8 q7, d31
|
||||
vld1.16 {q14, q15}, [r1]!
|
||||
|
||||
pld [r0]
|
||||
pld [r1]
|
||||
pld [r1, #64]
|
||||
|
||||
vadd.s16 q0, q0, q8 ;add Diff data and Pred data together
|
||||
vadd.s16 q1, q1, q9
|
||||
vadd.s16 q2, q2, q10
|
||||
vadd.s16 q3, q3, q11
|
||||
vadd.s16 q4, q4, q12
|
||||
vadd.s16 q5, q5, q13
|
||||
vadd.s16 q6, q6, q14
|
||||
vadd.s16 q7, q7, q15
|
||||
|
||||
vqmovun.s16 d0, q0 ;CLAMP() saturation
|
||||
vqmovun.s16 d1, q1
|
||||
vqmovun.s16 d2, q2
|
||||
vqmovun.s16 d3, q3
|
||||
vqmovun.s16 d4, q4
|
||||
vqmovun.s16 d5, q5
|
||||
vst1.u8 {q0}, [r2], r3 ;store result
|
||||
vqmovun.s16 d6, q6
|
||||
vst1.u8 {q1}, [r2], r3
|
||||
vqmovun.s16 d7, q7
|
||||
vst1.u8 {q2}, [r2], r3
|
||||
subs r12, r12, #1
|
||||
|
||||
moveq r12, #2 ;loop counter for UV loop
|
||||
|
||||
vst1.u8 {q3}, [r2], r3
|
||||
bne recon16x16mb_loop_y
|
||||
|
||||
mov r3, r3, lsr #1 ;uv_stride = ystride>>1
|
||||
ldr r2, [sp] ;load upred_ptr
|
||||
|
||||
recon16x16mb_loop_uv
|
||||
vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr
|
||||
vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr
|
||||
vld1.u8 {q14, q15}, [r0]!
|
||||
vld1.16 {q10, q11}, [r1]!
|
||||
|
||||
vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits
|
||||
vmovl.u8 q1, d25
|
||||
vmovl.u8 q2, d26
|
||||
vmovl.u8 q3, d27
|
||||
vmovl.u8 q4, d28
|
||||
vmovl.u8 q5, d29
|
||||
vmovl.u8 q6, d30
|
||||
vld1.16 {q12, q13}, [r1]!
|
||||
vmovl.u8 q7, d31
|
||||
vld1.16 {q14, q15}, [r1]!
|
||||
|
||||
vadd.s16 q0, q0, q8 ;add Diff data and Pred data together
|
||||
vadd.s16 q1, q1, q9
|
||||
vadd.s16 q2, q2, q10
|
||||
vadd.s16 q3, q3, q11
|
||||
vadd.s16 q4, q4, q12
|
||||
vadd.s16 q5, q5, q13
|
||||
vadd.s16 q6, q6, q14
|
||||
|
||||
vqmovun.s16 d0, q0 ;CLAMP() saturation
|
||||
vadd.s16 q7, q7, q15
|
||||
vqmovun.s16 d1, q1
|
||||
vqmovun.s16 d2, q2
|
||||
vqmovun.s16 d3, q3
|
||||
vst1.u8 {d0}, [r2], r3 ;store result
|
||||
vqmovun.s16 d4, q4
|
||||
vst1.u8 {d1}, [r2], r3
|
||||
vqmovun.s16 d5, q5
|
||||
vst1.u8 {d2}, [r2], r3
|
||||
vqmovun.s16 d6, q6
|
||||
vst1.u8 {d3}, [r2], r3
|
||||
vqmovun.s16 d7, q7
|
||||
vst1.u8 {d4}, [r2], r3
|
||||
subs r12, r12, #1
|
||||
|
||||
vst1.u8 {d5}, [r2], r3
|
||||
vst1.u8 {d6}, [r2], r3
|
||||
vst1.u8 {d7}, [r2], r3
|
||||
|
||||
ldrne r2, [sp, #4] ;load vpred_ptr
|
||||
bne recon16x16mb_loop_uv
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
END
|
|
@ -1,54 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_recon2b_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; r0 unsigned char *pred_ptr,
|
||||
; r1 short *diff_ptr,
|
||||
; r2 unsigned char *dst_ptr,
|
||||
; r3 int stride
|
||||
|
||||
|vp8_recon2b_neon| PROC
|
||||
vld1.u8 {q8, q9}, [r0] ;load data from pred_ptr
|
||||
vld1.16 {q4, q5}, [r1]! ;load data from diff_ptr
|
||||
|
||||
vmovl.u8 q0, d16 ;modify Pred data from 8 bits to 16 bits
|
||||
vld1.16 {q6, q7}, [r1]!
|
||||
vmovl.u8 q1, d17
|
||||
vmovl.u8 q2, d18
|
||||
vmovl.u8 q3, d19
|
||||
|
||||
vadd.s16 q0, q0, q4 ;add Diff data and Pred data together
|
||||
vadd.s16 q1, q1, q5
|
||||
vadd.s16 q2, q2, q6
|
||||
vadd.s16 q3, q3, q7
|
||||
|
||||
vqmovun.s16 d0, q0 ;CLAMP() saturation
|
||||
vqmovun.s16 d1, q1
|
||||
vqmovun.s16 d2, q2
|
||||
vqmovun.s16 d3, q3
|
||||
add r0, r2, r3
|
||||
|
||||
vst1.u8 {d0}, [r2] ;store result
|
||||
vst1.u8 {d1}, [r0], r3
|
||||
add r2, r0, r3
|
||||
vst1.u8 {d2}, [r0]
|
||||
vst1.u8 {d3}, [r2], r3
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
END
|
|
@ -1,69 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_recon4b_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; r0 unsigned char *pred_ptr,
|
||||
; r1 short *diff_ptr,
|
||||
; r2 unsigned char *dst_ptr,
|
||||
; r3 int stride
|
||||
|
||||
|vp8_recon4b_neon| PROC
|
||||
vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr
|
||||
vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr
|
||||
vld1.u8 {q14, q15}, [r0]
|
||||
vld1.16 {q10, q11}, [r1]!
|
||||
|
||||
vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits
|
||||
vmovl.u8 q1, d25
|
||||
vmovl.u8 q2, d26
|
||||
vmovl.u8 q3, d27
|
||||
vmovl.u8 q4, d28
|
||||
vmovl.u8 q5, d29
|
||||
vmovl.u8 q6, d30
|
||||
vld1.16 {q12, q13}, [r1]!
|
||||
vmovl.u8 q7, d31
|
||||
vld1.16 {q14, q15}, [r1]
|
||||
|
||||
vadd.s16 q0, q0, q8 ;add Diff data and Pred data together
|
||||
vadd.s16 q1, q1, q9
|
||||
vadd.s16 q2, q2, q10
|
||||
vadd.s16 q3, q3, q11
|
||||
vadd.s16 q4, q4, q12
|
||||
vadd.s16 q5, q5, q13
|
||||
vadd.s16 q6, q6, q14
|
||||
vadd.s16 q7, q7, q15
|
||||
|
||||
vqmovun.s16 d0, q0 ;CLAMP() saturation
|
||||
vqmovun.s16 d1, q1
|
||||
vqmovun.s16 d2, q2
|
||||
vqmovun.s16 d3, q3
|
||||
vqmovun.s16 d4, q4
|
||||
vqmovun.s16 d5, q5
|
||||
vqmovun.s16 d6, q6
|
||||
vqmovun.s16 d7, q7
|
||||
add r0, r2, r3
|
||||
|
||||
vst1.u8 {q0}, [r2] ;store result
|
||||
vst1.u8 {q1}, [r0], r3
|
||||
add r2, r0, r3
|
||||
vst1.u8 {q2}, [r0]
|
||||
vst1.u8 {q3}, [r2], r3
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
END
|
|
@ -1,29 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
|
||||
#include "vpx_ports/config.h"
|
||||
#include "vp9/common/recon.h"
|
||||
#include "vp9/common/vp9_blockd.h"
|
||||
|
||||
extern void vp8_recon16x16mb_neon(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int ystride, unsigned char *udst_ptr, unsigned char *vdst_ptr);
|
||||
|
||||
void vp8_recon_mb_neon(MACROBLOCKD *xd) {
|
||||
unsigned char *pred_ptr = &xd->predictor[0];
|
||||
short *diff_ptr = &xd->diff[0];
|
||||
unsigned char *dst_ptr = xd->dst.y_buffer;
|
||||
unsigned char *udst_ptr = xd->dst.u_buffer;
|
||||
unsigned char *vdst_ptr = xd->dst.v_buffer;
|
||||
int ystride = xd->dst.y_stride;
|
||||
/*int uv_stride = xd->dst.uv_stride;*/
|
||||
|
||||
vp8_recon16x16mb_neon(pred_ptr, diff_ptr, dst_ptr, ystride,
|
||||
udst_ptr, vdst_ptr);
|
||||
}
|
|
@ -1,61 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_recon_b_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; r0 unsigned char *pred_ptr,
|
||||
; r1 short *diff_ptr,
|
||||
; r2 unsigned char *dst_ptr,
|
||||
; r3 int stride
|
||||
|
||||
|vp8_recon_b_neon| PROC
|
||||
mov r12, #16
|
||||
|
||||
vld1.u8 {d28}, [r0], r12 ;load 4 data/line from pred_ptr
|
||||
vld1.16 {q10, q11}, [r1]! ;load data from diff_ptr
|
||||
vld1.u8 {d29}, [r0], r12
|
||||
vld1.16 {q11, q12}, [r1]!
|
||||
vld1.u8 {d30}, [r0], r12
|
||||
vld1.16 {q12, q13}, [r1]!
|
||||
vld1.u8 {d31}, [r0], r12
|
||||
vld1.16 {q13}, [r1]
|
||||
|
||||
vmovl.u8 q0, d28 ;modify Pred data from 8 bits to 16 bits
|
||||
vmovl.u8 q1, d29 ;Pred data in d0, d2, d4, d6
|
||||
vmovl.u8 q2, d30
|
||||
vmovl.u8 q3, d31
|
||||
|
||||
vadd.s16 d0, d0, d20 ;add Diff data and Pred data together
|
||||
vadd.s16 d2, d2, d22
|
||||
vadd.s16 d4, d4, d24
|
||||
vadd.s16 d6, d6, d26
|
||||
|
||||
vqmovun.s16 d0, q0 ;CLAMP() saturation
|
||||
vqmovun.s16 d1, q1
|
||||
vqmovun.s16 d2, q2
|
||||
vqmovun.s16 d3, q3
|
||||
add r1, r2, r3
|
||||
|
||||
vst1.32 {d0[0]}, [r2] ;store result
|
||||
vst1.32 {d1[0]}, [r1], r3
|
||||
add r2, r1, r3
|
||||
vst1.32 {d2[0]}, [r1]
|
||||
vst1.32 {d3[0]}, [r2], r3
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
END
|
|
@ -1,36 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp9_push_neon|
|
||||
EXPORT |vp9_pop_neon|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
|vp9_push_neon| PROC
|
||||
vst1.i64 {d8, d9, d10, d11}, [r0]!
|
||||
vst1.i64 {d12, d13, d14, d15}, [r0]!
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
|vp9_pop_neon| PROC
|
||||
vld1.i64 {d8, d9, d10, d11}, [r0]!
|
||||
vld1.i64 {d12, d13, d14, d15}, [r0]!
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
END
|
||||
|
|
@ -1,67 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_short_idct4x4llm_1_neon|
|
||||
EXPORT |vp8_dc_only_idct_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
|
||||
; r0 short *input;
|
||||
; r1 short *output;
|
||||
; r2 int pitch;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|vp8_short_idct4x4llm_1_neon| PROC
|
||||
vld1.16 {d0[]}, [r0] ;load input[0]
|
||||
|
||||
add r3, r1, r2
|
||||
add r12, r3, r2
|
||||
|
||||
vrshr.s16 d0, d0, #3
|
||||
|
||||
add r0, r12, r2
|
||||
|
||||
vst1.16 {d0}, [r1]
|
||||
vst1.16 {d0}, [r3]
|
||||
vst1.16 {d0}, [r12]
|
||||
vst1.16 {d0}, [r0]
|
||||
|
||||
bx lr
|
||||
ENDP
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
|
||||
; r0 short input_dc;
|
||||
; r1 short *output;
|
||||
; r2 int pitch;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|vp8_dc_only_idct_neon| PROC
|
||||
vdup.16 d0, r0
|
||||
|
||||
add r3, r1, r2
|
||||
add r12, r3, r2
|
||||
|
||||
vrshr.s16 d0, d0, #3
|
||||
|
||||
add r0, r12, r2
|
||||
|
||||
vst1.16 {d0}, [r1]
|
||||
vst1.16 {d0}, [r3]
|
||||
vst1.16 {d0}, [r12]
|
||||
vst1.16 {d0}, [r0]
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
END
|
|
@ -1,122 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_short_idct4x4llm_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
;*************************************************************
|
||||
;void vp8_short_idct4x4llm_c(short *input, short *output, int pitch)
|
||||
;r0 short * input
|
||||
;r1 short * output
|
||||
;r2 int pitch
|
||||
;*************************************************************
|
||||
;static const int cospi8sqrt2minus1=20091;
|
||||
;static const int sinpi8sqrt2 =35468;
|
||||
;static const int rounding = 0;
|
||||
;Optimization note: The resulted data from dequantization are signed 13-bit data that is
|
||||
;in the range of [-4096, 4095]. This allows to use "vqdmulh"(neon) instruction since
|
||||
;it won't go out of range (13+16+1=30bits<32bits). This instruction gives the high half
|
||||
;result of the multiplication that is needed in IDCT.
|
||||
|
||||
|vp8_short_idct4x4llm_neon| PROC
|
||||
adr r12, idct_coeff
|
||||
vld1.16 {q1, q2}, [r0]
|
||||
vld1.16 {d0}, [r12]
|
||||
|
||||
vswp d3, d4 ;q2(vp[4] vp[12])
|
||||
|
||||
vqdmulh.s16 q3, q2, d0[2]
|
||||
vqdmulh.s16 q4, q2, d0[0]
|
||||
|
||||
vqadd.s16 d12, d2, d3 ;a1
|
||||
vqsub.s16 d13, d2, d3 ;b1
|
||||
|
||||
vshr.s16 q3, q3, #1
|
||||
vshr.s16 q4, q4, #1
|
||||
|
||||
vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negtive number)
|
||||
vqadd.s16 q4, q4, q2
|
||||
|
||||
;d6 - c1:temp1
|
||||
;d7 - d1:temp2
|
||||
;d8 - d1:temp1
|
||||
;d9 - c1:temp2
|
||||
|
||||
vqsub.s16 d10, d6, d9 ;c1
|
||||
vqadd.s16 d11, d7, d8 ;d1
|
||||
|
||||
vqadd.s16 d2, d12, d11
|
||||
vqadd.s16 d3, d13, d10
|
||||
vqsub.s16 d4, d13, d10
|
||||
vqsub.s16 d5, d12, d11
|
||||
|
||||
vtrn.32 d2, d4
|
||||
vtrn.32 d3, d5
|
||||
vtrn.16 d2, d3
|
||||
vtrn.16 d4, d5
|
||||
|
||||
vswp d3, d4
|
||||
|
||||
vqdmulh.s16 q3, q2, d0[2]
|
||||
vqdmulh.s16 q4, q2, d0[0]
|
||||
|
||||
vqadd.s16 d12, d2, d3 ;a1
|
||||
vqsub.s16 d13, d2, d3 ;b1
|
||||
|
||||
vshr.s16 q3, q3, #1
|
||||
vshr.s16 q4, q4, #1
|
||||
|
||||
vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negtive number)
|
||||
vqadd.s16 q4, q4, q2
|
||||
|
||||
vqsub.s16 d10, d6, d9 ;c1
|
||||
vqadd.s16 d11, d7, d8 ;d1
|
||||
|
||||
vqadd.s16 d2, d12, d11
|
||||
vqadd.s16 d3, d13, d10
|
||||
vqsub.s16 d4, d13, d10
|
||||
vqsub.s16 d5, d12, d11
|
||||
|
||||
vrshr.s16 d2, d2, #3
|
||||
vrshr.s16 d3, d3, #3
|
||||
vrshr.s16 d4, d4, #3
|
||||
vrshr.s16 d5, d5, #3
|
||||
|
||||
add r3, r1, r2
|
||||
add r12, r3, r2
|
||||
add r0, r12, r2
|
||||
|
||||
vtrn.32 d2, d4
|
||||
vtrn.32 d3, d5
|
||||
vtrn.16 d2, d3
|
||||
vtrn.16 d4, d5
|
||||
|
||||
vst1.16 {d2}, [r1]
|
||||
vst1.16 {d3}, [r3]
|
||||
vst1.16 {d4}, [r12]
|
||||
vst1.16 {d5}, [r0]
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
;-----------------
|
||||
|
||||
idct_coeff
|
||||
DCD 0x4e7b4e7b, 0x8a8c8a8c
|
||||
|
||||
;20091, 20091, 35468, 35468
|
||||
|
||||
END
|
|
@ -1,490 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_sixtap_predict16x16_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
filter16_coeff
|
||||
DCD 0, 0, 128, 0, 0, 0, 0, 0
|
||||
DCD 0, -6, 123, 12, -1, 0, 0, 0
|
||||
DCD 2, -11, 108, 36, -8, 1, 0, 0
|
||||
DCD 0, -9, 93, 50, -6, 0, 0, 0
|
||||
DCD 3, -16, 77, 77, -16, 3, 0, 0
|
||||
DCD 0, -6, 50, 93, -9, 0, 0, 0
|
||||
DCD 1, -8, 36, 108, -11, 2, 0, 0
|
||||
DCD 0, -1, 12, 123, -6, 0, 0, 0
|
||||
|
||||
; r0 unsigned char *src_ptr,
|
||||
; r1 int src_pixels_per_line,
|
||||
; r2 int xoffset,
|
||||
; r3 int yoffset,
|
||||
; r4 unsigned char *dst_ptr,
|
||||
; stack(r5) int dst_pitch
|
||||
|
||||
;Note: To take advantage of 8-bit mulplication instruction in NEON. First apply abs() to
|
||||
; filter coeffs to make them u8. Then, use vmlsl for negtive coeffs. After multiplication,
|
||||
; the result can be negtive. So, I treat the result as s16. But, since it is also possible
|
||||
; that the result can be a large positive number (> 2^15-1), which could be confused as a
|
||||
; negtive number. To avoid that error, apply filter coeffs in the order of 0, 1, 4 ,5 ,2,
|
||||
; which ensures that the result stays in s16 range. Finally, saturated add the result by
|
||||
; applying 3rd filter coeff. Same applys to other filter functions.
|
||||
|
||||
|vp8_sixtap_predict16x16_neon| PROC
|
||||
push {r4-r5, lr}
|
||||
|
||||
adr r12, filter16_coeff
|
||||
ldr r4, [sp, #12] ;load parameters from stack
|
||||
ldr r5, [sp, #16] ;load parameters from stack
|
||||
|
||||
cmp r2, #0 ;skip first_pass filter if xoffset=0
|
||||
beq secondpass_filter16x16_only
|
||||
|
||||
add r2, r12, r2, lsl #5 ;calculate filter location
|
||||
|
||||
cmp r3, #0 ;skip second_pass filter if yoffset=0
|
||||
|
||||
vld1.s32 {q14, q15}, [r2] ;load first_pass filter
|
||||
|
||||
beq firstpass_filter16x16_only
|
||||
|
||||
sub sp, sp, #336 ;reserve space on stack for temporary storage
|
||||
mov lr, sp
|
||||
|
||||
vabs.s32 q12, q14
|
||||
vabs.s32 q13, q15
|
||||
|
||||
mov r2, #7 ;loop counter
|
||||
sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
|
||||
sub r0, r0, r1, lsl #1
|
||||
|
||||
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
|
||||
vdup.8 d1, d24[4]
|
||||
vdup.8 d2, d25[0]
|
||||
vdup.8 d3, d25[4]
|
||||
vdup.8 d4, d26[0]
|
||||
vdup.8 d5, d26[4]
|
||||
|
||||
;First Pass: output_height lines x output_width columns (21x16)
|
||||
filt_blk2d_fp16x16_loop_neon
|
||||
vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data
|
||||
vld1.u8 {d9, d10, d11}, [r0], r1
|
||||
vld1.u8 {d12, d13, d14}, [r0], r1
|
||||
|
||||
pld [r0]
|
||||
pld [r0, r1]
|
||||
pld [r0, r1, lsl #1]
|
||||
|
||||
vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp9_filter[0])
|
||||
vmull.u8 q9, d7, d0
|
||||
vmull.u8 q10, d9, d0
|
||||
vmull.u8 q11, d10, d0
|
||||
vmull.u8 q12, d12, d0
|
||||
vmull.u8 q13, d13, d0
|
||||
|
||||
vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
|
||||
vext.8 d29, d9, d10, #1
|
||||
vext.8 d30, d12, d13, #1
|
||||
|
||||
vmlsl.u8 q8, d28, d1 ;-(src_ptr[-1] * vp9_filter[1])
|
||||
vmlsl.u8 q10, d29, d1
|
||||
vmlsl.u8 q12, d30, d1
|
||||
|
||||
vext.8 d28, d7, d8, #1
|
||||
vext.8 d29, d10, d11, #1
|
||||
vext.8 d30, d13, d14, #1
|
||||
|
||||
vmlsl.u8 q9, d28, d1 ;-(src_ptr[-1] * vp9_filter[1])
|
||||
vmlsl.u8 q11, d29, d1
|
||||
vmlsl.u8 q13, d30, d1
|
||||
|
||||
vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
|
||||
vext.8 d29, d9, d10, #4
|
||||
vext.8 d30, d12, d13, #4
|
||||
|
||||
vmlsl.u8 q8, d28, d4 ;-(src_ptr[2] * vp9_filter[4])
|
||||
vmlsl.u8 q10, d29, d4
|
||||
vmlsl.u8 q12, d30, d4
|
||||
|
||||
vext.8 d28, d7, d8, #4
|
||||
vext.8 d29, d10, d11, #4
|
||||
vext.8 d30, d13, d14, #4
|
||||
|
||||
vmlsl.u8 q9, d28, d4 ;-(src_ptr[2] * vp9_filter[4])
|
||||
vmlsl.u8 q11, d29, d4
|
||||
vmlsl.u8 q13, d30, d4
|
||||
|
||||
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
|
||||
vext.8 d29, d9, d10, #5
|
||||
vext.8 d30, d12, d13, #5
|
||||
|
||||
vmlal.u8 q8, d28, d5 ;(src_ptr[3] * vp9_filter[5])
|
||||
vmlal.u8 q10, d29, d5
|
||||
vmlal.u8 q12, d30, d5
|
||||
|
||||
vext.8 d28, d7, d8, #5
|
||||
vext.8 d29, d10, d11, #5
|
||||
vext.8 d30, d13, d14, #5
|
||||
|
||||
vmlal.u8 q9, d28, d5 ;(src_ptr[3] * vp9_filter[5])
|
||||
vmlal.u8 q11, d29, d5
|
||||
vmlal.u8 q13, d30, d5
|
||||
|
||||
vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
|
||||
vext.8 d29, d9, d10, #2
|
||||
vext.8 d30, d12, d13, #2
|
||||
|
||||
vmlal.u8 q8, d28, d2 ;(src_ptr[0] * vp9_filter[2])
|
||||
vmlal.u8 q10, d29, d2
|
||||
vmlal.u8 q12, d30, d2
|
||||
|
||||
vext.8 d28, d7, d8, #2
|
||||
vext.8 d29, d10, d11, #2
|
||||
vext.8 d30, d13, d14, #2
|
||||
|
||||
vmlal.u8 q9, d28, d2 ;(src_ptr[0] * vp9_filter[2])
|
||||
vmlal.u8 q11, d29, d2
|
||||
vmlal.u8 q13, d30, d2
|
||||
|
||||
vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
|
||||
vext.8 d29, d9, d10, #3
|
||||
vext.8 d30, d12, d13, #3
|
||||
|
||||
vext.8 d15, d7, d8, #3
|
||||
vext.8 d31, d10, d11, #3
|
||||
vext.8 d6, d13, d14, #3
|
||||
|
||||
vmull.u8 q4, d28, d3 ;(src_ptr[1] * vp9_filter[3])
|
||||
vmull.u8 q5, d29, d3
|
||||
vmull.u8 q6, d30, d3
|
||||
|
||||
vqadd.s16 q8, q4 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q10, q5
|
||||
vqadd.s16 q12, q6
|
||||
|
||||
vmull.u8 q6, d15, d3 ;(src_ptr[1] * vp9_filter[3])
|
||||
vmull.u8 q7, d31, d3
|
||||
vmull.u8 q3, d6, d3
|
||||
|
||||
subs r2, r2, #1
|
||||
|
||||
vqadd.s16 q9, q6
|
||||
vqadd.s16 q11, q7
|
||||
vqadd.s16 q13, q3
|
||||
|
||||
vqrshrun.s16 d6, q8, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d7, q9, #7
|
||||
vqrshrun.s16 d8, q10, #7
|
||||
vqrshrun.s16 d9, q11, #7
|
||||
vqrshrun.s16 d10, q12, #7
|
||||
vqrshrun.s16 d11, q13, #7
|
||||
|
||||
vst1.u8 {d6, d7, d8}, [lr]! ;store result
|
||||
vst1.u8 {d9, d10, d11}, [lr]!
|
||||
|
||||
bne filt_blk2d_fp16x16_loop_neon
|
||||
|
||||
;Second pass: 16x16
|
||||
;secondpass_filter - do first 8-columns and then second 8-columns
|
||||
add r3, r12, r3, lsl #5
|
||||
sub lr, lr, #336
|
||||
|
||||
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
|
||||
mov r3, #2 ;loop counter
|
||||
|
||||
vabs.s32 q7, q5
|
||||
vabs.s32 q8, q6
|
||||
|
||||
mov r2, #16
|
||||
|
||||
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
|
||||
vdup.8 d1, d14[4]
|
||||
vdup.8 d2, d15[0]
|
||||
vdup.8 d3, d15[4]
|
||||
vdup.8 d4, d16[0]
|
||||
vdup.8 d5, d16[4]
|
||||
|
||||
filt_blk2d_sp16x16_outloop_neon
|
||||
vld1.u8 {d18}, [lr], r2 ;load src data
|
||||
vld1.u8 {d19}, [lr], r2
|
||||
vld1.u8 {d20}, [lr], r2
|
||||
vld1.u8 {d21}, [lr], r2
|
||||
mov r12, #4 ;loop counter
|
||||
vld1.u8 {d22}, [lr], r2
|
||||
|
||||
secondpass_inner_loop_neon
|
||||
vld1.u8 {d23}, [lr], r2 ;load src data
|
||||
vld1.u8 {d24}, [lr], r2
|
||||
vld1.u8 {d25}, [lr], r2
|
||||
vld1.u8 {d26}, [lr], r2
|
||||
|
||||
vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp9_filter[0])
|
||||
vmull.u8 q4, d19, d0
|
||||
vmull.u8 q5, d20, d0
|
||||
vmull.u8 q6, d21, d0
|
||||
|
||||
vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp9_filter[1])
|
||||
vmlsl.u8 q4, d20, d1
|
||||
vmlsl.u8 q5, d21, d1
|
||||
vmlsl.u8 q6, d22, d1
|
||||
|
||||
vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp9_filter[4])
|
||||
vmlsl.u8 q4, d23, d4
|
||||
vmlsl.u8 q5, d24, d4
|
||||
vmlsl.u8 q6, d25, d4
|
||||
|
||||
vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp9_filter[2])
|
||||
vmlal.u8 q4, d21, d2
|
||||
vmlal.u8 q5, d22, d2
|
||||
vmlal.u8 q6, d23, d2
|
||||
|
||||
vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp9_filter[5])
|
||||
vmlal.u8 q4, d24, d5
|
||||
vmlal.u8 q5, d25, d5
|
||||
vmlal.u8 q6, d26, d5
|
||||
|
||||
vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp9_filter[3])
|
||||
vmull.u8 q8, d22, d3
|
||||
vmull.u8 q9, d23, d3
|
||||
vmull.u8 q10, d24, d3
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q8, q4
|
||||
vqadd.s16 q9, q5
|
||||
vqadd.s16 q10, q6
|
||||
|
||||
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d7, q8, #7
|
||||
vqrshrun.s16 d8, q9, #7
|
||||
vqrshrun.s16 d9, q10, #7
|
||||
|
||||
vst1.u8 {d6}, [r4], r5 ;store result
|
||||
vmov q9, q11
|
||||
vst1.u8 {d7}, [r4], r5
|
||||
vmov q10, q12
|
||||
vst1.u8 {d8}, [r4], r5
|
||||
vmov d22, d26
|
||||
vst1.u8 {d9}, [r4], r5
|
||||
|
||||
bne secondpass_inner_loop_neon
|
||||
|
||||
subs r3, r3, #1
|
||||
sub lr, lr, #336
|
||||
add lr, lr, #8
|
||||
|
||||
sub r4, r4, r5, lsl #4
|
||||
add r4, r4, #8
|
||||
|
||||
bne filt_blk2d_sp16x16_outloop_neon
|
||||
|
||||
add sp, sp, #336
|
||||
pop {r4-r5,pc}
|
||||
|
||||
;--------------------
|
||||
firstpass_filter16x16_only
|
||||
vabs.s32 q12, q14
|
||||
vabs.s32 q13, q15
|
||||
|
||||
mov r2, #8 ;loop counter
|
||||
sub r0, r0, #2 ;move srcptr back to (column-2)
|
||||
|
||||
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
|
||||
vdup.8 d1, d24[4]
|
||||
vdup.8 d2, d25[0]
|
||||
vdup.8 d3, d25[4]
|
||||
vdup.8 d4, d26[0]
|
||||
vdup.8 d5, d26[4]
|
||||
|
||||
;First Pass: output_height lines x output_width columns (16x16)
|
||||
filt_blk2d_fpo16x16_loop_neon
|
||||
vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data
|
||||
vld1.u8 {d9, d10, d11}, [r0], r1
|
||||
|
||||
pld [r0]
|
||||
pld [r0, r1]
|
||||
|
||||
vmull.u8 q6, d6, d0 ;(src_ptr[-2] * vp9_filter[0])
|
||||
vmull.u8 q7, d7, d0
|
||||
vmull.u8 q8, d9, d0
|
||||
vmull.u8 q9, d10, d0
|
||||
|
||||
vext.8 d20, d6, d7, #1 ;construct src_ptr[-1]
|
||||
vext.8 d21, d9, d10, #1
|
||||
vext.8 d22, d7, d8, #1
|
||||
vext.8 d23, d10, d11, #1
|
||||
vext.8 d24, d6, d7, #4 ;construct src_ptr[2]
|
||||
vext.8 d25, d9, d10, #4
|
||||
vext.8 d26, d7, d8, #4
|
||||
vext.8 d27, d10, d11, #4
|
||||
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
|
||||
vext.8 d29, d9, d10, #5
|
||||
|
||||
vmlsl.u8 q6, d20, d1 ;-(src_ptr[-1] * vp9_filter[1])
|
||||
vmlsl.u8 q8, d21, d1
|
||||
vmlsl.u8 q7, d22, d1 ;-(src_ptr[-1] * vp9_filter[1])
|
||||
vmlsl.u8 q9, d23, d1
|
||||
vmlsl.u8 q6, d24, d4 ;-(src_ptr[2] * vp9_filter[4])
|
||||
vmlsl.u8 q8, d25, d4
|
||||
vmlsl.u8 q7, d26, d4 ;-(src_ptr[2] * vp9_filter[4])
|
||||
vmlsl.u8 q9, d27, d4
|
||||
vmlal.u8 q6, d28, d5 ;(src_ptr[3] * vp9_filter[5])
|
||||
vmlal.u8 q8, d29, d5
|
||||
|
||||
vext.8 d20, d7, d8, #5
|
||||
vext.8 d21, d10, d11, #5
|
||||
vext.8 d22, d6, d7, #2 ;construct src_ptr[0]
|
||||
vext.8 d23, d9, d10, #2
|
||||
vext.8 d24, d7, d8, #2
|
||||
vext.8 d25, d10, d11, #2
|
||||
|
||||
vext.8 d26, d6, d7, #3 ;construct src_ptr[1]
|
||||
vext.8 d27, d9, d10, #3
|
||||
vext.8 d28, d7, d8, #3
|
||||
vext.8 d29, d10, d11, #3
|
||||
|
||||
vmlal.u8 q7, d20, d5 ;(src_ptr[3] * vp9_filter[5])
|
||||
vmlal.u8 q9, d21, d5
|
||||
vmlal.u8 q6, d22, d2 ;(src_ptr[0] * vp9_filter[2])
|
||||
vmlal.u8 q8, d23, d2
|
||||
vmlal.u8 q7, d24, d2 ;(src_ptr[0] * vp9_filter[2])
|
||||
vmlal.u8 q9, d25, d2
|
||||
|
||||
vmull.u8 q10, d26, d3 ;(src_ptr[1] * vp9_filter[3])
|
||||
vmull.u8 q11, d27, d3
|
||||
vmull.u8 q12, d28, d3 ;(src_ptr[1] * vp9_filter[3])
|
||||
vmull.u8 q15, d29, d3
|
||||
|
||||
vqadd.s16 q6, q10 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q8, q11
|
||||
vqadd.s16 q7, q12
|
||||
vqadd.s16 q9, q15
|
||||
|
||||
subs r2, r2, #1
|
||||
|
||||
vqrshrun.s16 d6, q6, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d7, q7, #7
|
||||
vqrshrun.s16 d8, q8, #7
|
||||
vqrshrun.s16 d9, q9, #7
|
||||
|
||||
vst1.u8 {q3}, [r4], r5 ;store result
|
||||
vst1.u8 {q4}, [r4], r5
|
||||
|
||||
bne filt_blk2d_fpo16x16_loop_neon
|
||||
|
||||
pop {r4-r5,pc}
|
||||
|
||||
;--------------------
|
||||
secondpass_filter16x16_only
|
||||
;Second pass: 16x16
|
||||
add r3, r12, r3, lsl #5
|
||||
sub r0, r0, r1, lsl #1
|
||||
|
||||
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
|
||||
mov r3, #2 ;loop counter
|
||||
|
||||
vabs.s32 q7, q5
|
||||
vabs.s32 q8, q6
|
||||
|
||||
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
|
||||
vdup.8 d1, d14[4]
|
||||
vdup.8 d2, d15[0]
|
||||
vdup.8 d3, d15[4]
|
||||
vdup.8 d4, d16[0]
|
||||
vdup.8 d5, d16[4]
|
||||
|
||||
filt_blk2d_spo16x16_outloop_neon
|
||||
vld1.u8 {d18}, [r0], r1 ;load src data
|
||||
vld1.u8 {d19}, [r0], r1
|
||||
vld1.u8 {d20}, [r0], r1
|
||||
vld1.u8 {d21}, [r0], r1
|
||||
mov r12, #4 ;loop counter
|
||||
vld1.u8 {d22}, [r0], r1
|
||||
|
||||
secondpass_only_inner_loop_neon
|
||||
vld1.u8 {d23}, [r0], r1 ;load src data
|
||||
vld1.u8 {d24}, [r0], r1
|
||||
vld1.u8 {d25}, [r0], r1
|
||||
vld1.u8 {d26}, [r0], r1
|
||||
|
||||
vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp9_filter[0])
|
||||
vmull.u8 q4, d19, d0
|
||||
vmull.u8 q5, d20, d0
|
||||
vmull.u8 q6, d21, d0
|
||||
|
||||
vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp9_filter[1])
|
||||
vmlsl.u8 q4, d20, d1
|
||||
vmlsl.u8 q5, d21, d1
|
||||
vmlsl.u8 q6, d22, d1
|
||||
|
||||
vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp9_filter[4])
|
||||
vmlsl.u8 q4, d23, d4
|
||||
vmlsl.u8 q5, d24, d4
|
||||
vmlsl.u8 q6, d25, d4
|
||||
|
||||
vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp9_filter[2])
|
||||
vmlal.u8 q4, d21, d2
|
||||
vmlal.u8 q5, d22, d2
|
||||
vmlal.u8 q6, d23, d2
|
||||
|
||||
vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp9_filter[5])
|
||||
vmlal.u8 q4, d24, d5
|
||||
vmlal.u8 q5, d25, d5
|
||||
vmlal.u8 q6, d26, d5
|
||||
|
||||
vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp9_filter[3])
|
||||
vmull.u8 q8, d22, d3
|
||||
vmull.u8 q9, d23, d3
|
||||
vmull.u8 q10, d24, d3
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q8, q4
|
||||
vqadd.s16 q9, q5
|
||||
vqadd.s16 q10, q6
|
||||
|
||||
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d7, q8, #7
|
||||
vqrshrun.s16 d8, q9, #7
|
||||
vqrshrun.s16 d9, q10, #7
|
||||
|
||||
vst1.u8 {d6}, [r4], r5 ;store result
|
||||
vmov q9, q11
|
||||
vst1.u8 {d7}, [r4], r5
|
||||
vmov q10, q12
|
||||
vst1.u8 {d8}, [r4], r5
|
||||
vmov d22, d26
|
||||
vst1.u8 {d9}, [r4], r5
|
||||
|
||||
bne secondpass_only_inner_loop_neon
|
||||
|
||||
subs r3, r3, #1
|
||||
sub r0, r0, r1, lsl #4
|
||||
sub r0, r0, r1, lsl #2
|
||||
sub r0, r0, r1
|
||||
add r0, r0, #8
|
||||
|
||||
sub r4, r4, r5, lsl #4
|
||||
add r4, r4, #8
|
||||
|
||||
bne filt_blk2d_spo16x16_outloop_neon
|
||||
|
||||
pop {r4-r5,pc}
|
||||
|
||||
ENDP
|
||||
|
||||
;-----------------
|
||||
END
|
|
@ -1,422 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_sixtap_predict_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
filter4_coeff
|
||||
DCD 0, 0, 128, 0, 0, 0, 0, 0
|
||||
DCD 0, -6, 123, 12, -1, 0, 0, 0
|
||||
DCD 2, -11, 108, 36, -8, 1, 0, 0
|
||||
DCD 0, -9, 93, 50, -6, 0, 0, 0
|
||||
DCD 3, -16, 77, 77, -16, 3, 0, 0
|
||||
DCD 0, -6, 50, 93, -9, 0, 0, 0
|
||||
DCD 1, -8, 36, 108, -11, 2, 0, 0
|
||||
DCD 0, -1, 12, 123, -6, 0, 0, 0
|
||||
|
||||
; r0 unsigned char *src_ptr,
|
||||
; r1 int src_pixels_per_line,
|
||||
; r2 int xoffset,
|
||||
; r3 int yoffset,
|
||||
; stack(r4) unsigned char *dst_ptr,
|
||||
; stack(lr) int dst_pitch
|
||||
|
||||
|vp8_sixtap_predict_neon| PROC
|
||||
push {r4, lr}
|
||||
|
||||
adr r12, filter4_coeff
|
||||
ldr r4, [sp, #8] ;load parameters from stack
|
||||
ldr lr, [sp, #12] ;load parameters from stack
|
||||
|
||||
cmp r2, #0 ;skip first_pass filter if xoffset=0
|
||||
beq secondpass_filter4x4_only
|
||||
|
||||
add r2, r12, r2, lsl #5 ;calculate filter location
|
||||
|
||||
cmp r3, #0 ;skip second_pass filter if yoffset=0
|
||||
vld1.s32 {q14, q15}, [r2] ;load first_pass filter
|
||||
|
||||
beq firstpass_filter4x4_only
|
||||
|
||||
vabs.s32 q12, q14 ;get abs(filer_parameters)
|
||||
vabs.s32 q13, q15
|
||||
|
||||
sub r0, r0, #2 ;go back 2 columns of src data
|
||||
sub r0, r0, r1, lsl #1 ;go back 2 lines of src data
|
||||
|
||||
;First pass: output_height lines x output_width columns (9x4)
|
||||
vld1.u8 {q3}, [r0], r1 ;load first 4-line src data
|
||||
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
|
||||
vld1.u8 {q4}, [r0], r1
|
||||
vdup.8 d1, d24[4]
|
||||
vld1.u8 {q5}, [r0], r1
|
||||
vdup.8 d2, d25[0]
|
||||
vld1.u8 {q6}, [r0], r1
|
||||
vdup.8 d3, d25[4]
|
||||
vdup.8 d4, d26[0]
|
||||
vdup.8 d5, d26[4]
|
||||
|
||||
pld [r0]
|
||||
pld [r0, r1]
|
||||
pld [r0, r1, lsl #1]
|
||||
|
||||
vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
|
||||
vext.8 d19, d8, d9, #5
|
||||
vext.8 d20, d10, d11, #5
|
||||
vext.8 d21, d12, d13, #5
|
||||
|
||||
vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
|
||||
vswp d11, d12
|
||||
|
||||
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
|
||||
vzip.32 d20, d21
|
||||
vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp9_filter[5])
|
||||
vmull.u8 q8, d20, d5
|
||||
|
||||
vmov q4, q3 ;keep original src data in q4 q6
|
||||
vmov q6, q5
|
||||
|
||||
vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
|
||||
vzip.32 d10, d11
|
||||
vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
|
||||
vshr.u64 q10, q6, #8
|
||||
vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp9_filter[0])
|
||||
vmlal.u8 q8, d10, d0
|
||||
|
||||
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
|
||||
vzip.32 d20, d21
|
||||
vshr.u64 q3, q4, #32 ;construct src_ptr[2]
|
||||
vshr.u64 q5, q6, #32
|
||||
vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp9_filter[1])
|
||||
vmlsl.u8 q8, d20, d1
|
||||
|
||||
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
|
||||
vzip.32 d10, d11
|
||||
vshr.u64 q9, q4, #16 ;construct src_ptr[0]
|
||||
vshr.u64 q10, q6, #16
|
||||
vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp9_filter[4])
|
||||
vmlsl.u8 q8, d10, d4
|
||||
|
||||
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
|
||||
vzip.32 d20, d21
|
||||
vshr.u64 q3, q4, #24 ;construct src_ptr[1]
|
||||
vshr.u64 q5, q6, #24
|
||||
vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp9_filter[2])
|
||||
vmlal.u8 q8, d20, d2
|
||||
|
||||
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
|
||||
vzip.32 d10, d11
|
||||
vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp9_filter[3])
|
||||
vmull.u8 q10, d10, d3
|
||||
|
||||
vld1.u8 {q3}, [r0], r1 ;load rest 5-line src data
|
||||
vld1.u8 {q4}, [r0], r1
|
||||
|
||||
vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q8, q10
|
||||
|
||||
vld1.u8 {q5}, [r0], r1
|
||||
vld1.u8 {q6}, [r0], r1
|
||||
|
||||
vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d28, q8, #7
|
||||
|
||||
;First Pass on rest 5-line data
|
||||
vld1.u8 {q11}, [r0], r1
|
||||
|
||||
vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
|
||||
vext.8 d19, d8, d9, #5
|
||||
vext.8 d20, d10, d11, #5
|
||||
vext.8 d21, d12, d13, #5
|
||||
|
||||
vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
|
||||
vswp d11, d12
|
||||
|
||||
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
|
||||
vzip.32 d20, d21
|
||||
vext.8 d31, d22, d23, #5 ;construct src_ptr[3]
|
||||
vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp9_filter[5])
|
||||
vmull.u8 q8, d20, d5
|
||||
vmull.u8 q12, d31, d5 ;(src_ptr[3] * vp9_filter[5])
|
||||
|
||||
vmov q4, q3 ;keep original src data in q4 q6
|
||||
vmov q6, q5
|
||||
|
||||
vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
|
||||
vzip.32 d10, d11
|
||||
vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
|
||||
vshr.u64 q10, q6, #8
|
||||
|
||||
vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp9_filter[0])
|
||||
vmlal.u8 q8, d10, d0
|
||||
vmlal.u8 q12, d22, d0 ;(src_ptr[-2] * vp9_filter[0])
|
||||
|
||||
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
|
||||
vzip.32 d20, d21
|
||||
vshr.u64 q3, q4, #32 ;construct src_ptr[2]
|
||||
vshr.u64 q5, q6, #32
|
||||
vext.8 d31, d22, d23, #1 ;construct src_ptr[-1]
|
||||
|
||||
vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp9_filter[1])
|
||||
vmlsl.u8 q8, d20, d1
|
||||
vmlsl.u8 q12, d31, d1 ;-(src_ptr[-1] * vp9_filter[1])
|
||||
|
||||
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
|
||||
vzip.32 d10, d11
|
||||
vshr.u64 q9, q4, #16 ;construct src_ptr[0]
|
||||
vshr.u64 q10, q6, #16
|
||||
vext.8 d31, d22, d23, #4 ;construct src_ptr[2]
|
||||
|
||||
vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp9_filter[4])
|
||||
vmlsl.u8 q8, d10, d4
|
||||
vmlsl.u8 q12, d31, d4 ;-(src_ptr[2] * vp9_filter[4])
|
||||
|
||||
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
|
||||
vzip.32 d20, d21
|
||||
vshr.u64 q3, q4, #24 ;construct src_ptr[1]
|
||||
vshr.u64 q5, q6, #24
|
||||
vext.8 d31, d22, d23, #2 ;construct src_ptr[0]
|
||||
|
||||
vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp9_filter[2])
|
||||
vmlal.u8 q8, d20, d2
|
||||
vmlal.u8 q12, d31, d2 ;(src_ptr[0] * vp9_filter[2])
|
||||
|
||||
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
|
||||
vzip.32 d10, d11
|
||||
vext.8 d31, d22, d23, #3 ;construct src_ptr[1]
|
||||
vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp9_filter[3])
|
||||
vmull.u8 q10, d10, d3
|
||||
vmull.u8 q11, d31, d3 ;(src_ptr[1] * vp9_filter[3])
|
||||
|
||||
add r3, r12, r3, lsl #5
|
||||
|
||||
vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q8, q10
|
||||
vqadd.s16 q12, q11
|
||||
|
||||
vext.8 d23, d27, d28, #4
|
||||
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
|
||||
|
||||
vqrshrun.s16 d29, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d30, q8, #7
|
||||
vqrshrun.s16 d31, q12, #7
|
||||
|
||||
;Second pass: 4x4
|
||||
vabs.s32 q7, q5
|
||||
vabs.s32 q8, q6
|
||||
|
||||
vext.8 d24, d28, d29, #4
|
||||
vext.8 d25, d29, d30, #4
|
||||
vext.8 d26, d30, d31, #4
|
||||
|
||||
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
|
||||
vdup.8 d1, d14[4]
|
||||
vdup.8 d2, d15[0]
|
||||
vdup.8 d3, d15[4]
|
||||
vdup.8 d4, d16[0]
|
||||
vdup.8 d5, d16[4]
|
||||
|
||||
vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp9_filter[0])
|
||||
vmull.u8 q4, d28, d0
|
||||
|
||||
vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp9_filter[5])
|
||||
vmull.u8 q6, d26, d5
|
||||
|
||||
vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp9_filter[4])
|
||||
vmlsl.u8 q4, d30, d4
|
||||
|
||||
vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp9_filter[1])
|
||||
vmlsl.u8 q6, d24, d1
|
||||
|
||||
vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp9_filter[2])
|
||||
vmlal.u8 q4, d29, d2
|
||||
|
||||
vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp9_filter[3])
|
||||
vmlal.u8 q6, d25, d3
|
||||
|
||||
add r0, r4, lr
|
||||
add r1, r0, lr
|
||||
add r2, r1, lr
|
||||
|
||||
vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q6, q4
|
||||
|
||||
vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d4, q6, #7
|
||||
|
||||
vst1.32 {d3[0]}, [r4] ;store result
|
||||
vst1.32 {d3[1]}, [r0]
|
||||
vst1.32 {d4[0]}, [r1]
|
||||
vst1.32 {d4[1]}, [r2]
|
||||
|
||||
pop {r4, pc}
|
||||
|
||||
|
||||
;---------------------
|
||||
firstpass_filter4x4_only
|
||||
vabs.s32 q12, q14 ;get abs(filer_parameters)
|
||||
vabs.s32 q13, q15
|
||||
|
||||
sub r0, r0, #2 ;go back 2 columns of src data
|
||||
|
||||
;First pass: output_height lines x output_width columns (4x4)
|
||||
vld1.u8 {q3}, [r0], r1 ;load first 4-line src data
|
||||
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
|
||||
vld1.u8 {q4}, [r0], r1
|
||||
vdup.8 d1, d24[4]
|
||||
vld1.u8 {q5}, [r0], r1
|
||||
vdup.8 d2, d25[0]
|
||||
vld1.u8 {q6}, [r0], r1
|
||||
|
||||
vdup.8 d3, d25[4]
|
||||
vdup.8 d4, d26[0]
|
||||
vdup.8 d5, d26[4]
|
||||
|
||||
vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
|
||||
vext.8 d19, d8, d9, #5
|
||||
vext.8 d20, d10, d11, #5
|
||||
vext.8 d21, d12, d13, #5
|
||||
|
||||
vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
|
||||
vswp d11, d12
|
||||
|
||||
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
|
||||
vzip.32 d20, d21
|
||||
vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp9_filter[5])
|
||||
vmull.u8 q8, d20, d5
|
||||
|
||||
vmov q4, q3 ;keep original src data in q4 q6
|
||||
vmov q6, q5
|
||||
|
||||
vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
|
||||
vzip.32 d10, d11
|
||||
vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
|
||||
vshr.u64 q10, q6, #8
|
||||
vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp9_filter[0])
|
||||
vmlal.u8 q8, d10, d0
|
||||
|
||||
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
|
||||
vzip.32 d20, d21
|
||||
vshr.u64 q3, q4, #32 ;construct src_ptr[2]
|
||||
vshr.u64 q5, q6, #32
|
||||
vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp9_filter[1])
|
||||
vmlsl.u8 q8, d20, d1
|
||||
|
||||
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
|
||||
vzip.32 d10, d11
|
||||
vshr.u64 q9, q4, #16 ;construct src_ptr[0]
|
||||
vshr.u64 q10, q6, #16
|
||||
vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp9_filter[4])
|
||||
vmlsl.u8 q8, d10, d4
|
||||
|
||||
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
|
||||
vzip.32 d20, d21
|
||||
vshr.u64 q3, q4, #24 ;construct src_ptr[1]
|
||||
vshr.u64 q5, q6, #24
|
||||
vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp9_filter[2])
|
||||
vmlal.u8 q8, d20, d2
|
||||
|
||||
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
|
||||
vzip.32 d10, d11
|
||||
vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp9_filter[3])
|
||||
vmull.u8 q10, d10, d3
|
||||
|
||||
add r0, r4, lr
|
||||
add r1, r0, lr
|
||||
add r2, r1, lr
|
||||
|
||||
vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q8, q10
|
||||
|
||||
vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d28, q8, #7
|
||||
|
||||
vst1.32 {d27[0]}, [r4] ;store result
|
||||
vst1.32 {d27[1]}, [r0]
|
||||
vst1.32 {d28[0]}, [r1]
|
||||
vst1.32 {d28[1]}, [r2]
|
||||
|
||||
pop {r4, pc}
|
||||
|
||||
|
||||
;---------------------
|
||||
secondpass_filter4x4_only
|
||||
sub r0, r0, r1, lsl #1
|
||||
add r3, r12, r3, lsl #5
|
||||
|
||||
vld1.32 {d27[0]}, [r0], r1 ;load src data
|
||||
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
|
||||
vld1.32 {d27[1]}, [r0], r1
|
||||
vabs.s32 q7, q5
|
||||
vld1.32 {d28[0]}, [r0], r1
|
||||
vabs.s32 q8, q6
|
||||
vld1.32 {d28[1]}, [r0], r1
|
||||
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
|
||||
vld1.32 {d29[0]}, [r0], r1
|
||||
vdup.8 d1, d14[4]
|
||||
vld1.32 {d29[1]}, [r0], r1
|
||||
vdup.8 d2, d15[0]
|
||||
vld1.32 {d30[0]}, [r0], r1
|
||||
vdup.8 d3, d15[4]
|
||||
vld1.32 {d30[1]}, [r0], r1
|
||||
vdup.8 d4, d16[0]
|
||||
vld1.32 {d31[0]}, [r0], r1
|
||||
vdup.8 d5, d16[4]
|
||||
|
||||
vext.8 d23, d27, d28, #4
|
||||
vext.8 d24, d28, d29, #4
|
||||
vext.8 d25, d29, d30, #4
|
||||
vext.8 d26, d30, d31, #4
|
||||
|
||||
vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp9_filter[0])
|
||||
vmull.u8 q4, d28, d0
|
||||
|
||||
vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp9_filter[5])
|
||||
vmull.u8 q6, d26, d5
|
||||
|
||||
vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp9_filter[4])
|
||||
vmlsl.u8 q4, d30, d4
|
||||
|
||||
vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp9_filter[1])
|
||||
vmlsl.u8 q6, d24, d1
|
||||
|
||||
vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp9_filter[2])
|
||||
vmlal.u8 q4, d29, d2
|
||||
|
||||
vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp9_filter[3])
|
||||
vmlal.u8 q6, d25, d3
|
||||
|
||||
add r0, r4, lr
|
||||
add r1, r0, lr
|
||||
add r2, r1, lr
|
||||
|
||||
vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q6, q4
|
||||
|
||||
vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d4, q6, #7
|
||||
|
||||
vst1.32 {d3[0]}, [r4] ;store result
|
||||
vst1.32 {d3[1]}, [r0]
|
||||
vst1.32 {d4[0]}, [r1]
|
||||
vst1.32 {d4[1]}, [r2]
|
||||
|
||||
pop {r4, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
;-----------------
|
||||
|
||||
END
|
|
@ -1,473 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_sixtap_predict8x4_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
filter8_coeff
|
||||
DCD 0, 0, 128, 0, 0, 0, 0, 0
|
||||
DCD 0, -6, 123, 12, -1, 0, 0, 0
|
||||
DCD 2, -11, 108, 36, -8, 1, 0, 0
|
||||
DCD 0, -9, 93, 50, -6, 0, 0, 0
|
||||
DCD 3, -16, 77, 77, -16, 3, 0, 0
|
||||
DCD 0, -6, 50, 93, -9, 0, 0, 0
|
||||
DCD 1, -8, 36, 108, -11, 2, 0, 0
|
||||
DCD 0, -1, 12, 123, -6, 0, 0, 0
|
||||
|
||||
; r0 unsigned char *src_ptr,
|
||||
; r1 int src_pixels_per_line,
|
||||
; r2 int xoffset,
|
||||
; r3 int yoffset,
|
||||
; r4 unsigned char *dst_ptr,
|
||||
; stack(r5) int dst_pitch
|
||||
|
||||
|vp8_sixtap_predict8x4_neon| PROC
|
||||
push {r4-r5, lr}
|
||||
|
||||
adr r12, filter8_coeff
|
||||
ldr r4, [sp, #12] ;load parameters from stack
|
||||
ldr r5, [sp, #16] ;load parameters from stack
|
||||
|
||||
cmp r2, #0 ;skip first_pass filter if xoffset=0
|
||||
beq secondpass_filter8x4_only
|
||||
|
||||
add r2, r12, r2, lsl #5 ;calculate filter location
|
||||
|
||||
cmp r3, #0 ;skip second_pass filter if yoffset=0
|
||||
|
||||
vld1.s32 {q14, q15}, [r2] ;load first_pass filter
|
||||
|
||||
beq firstpass_filter8x4_only
|
||||
|
||||
sub sp, sp, #32 ;reserve space on stack for temporary storage
|
||||
vabs.s32 q12, q14
|
||||
vabs.s32 q13, q15
|
||||
|
||||
sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
|
||||
mov lr, sp
|
||||
sub r0, r0, r1, lsl #1
|
||||
|
||||
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
|
||||
vdup.8 d1, d24[4]
|
||||
vdup.8 d2, d25[0]
|
||||
|
||||
;First pass: output_height lines x output_width columns (9x8)
|
||||
vld1.u8 {q3}, [r0], r1 ;load src data
|
||||
vdup.8 d3, d25[4]
|
||||
vld1.u8 {q4}, [r0], r1
|
||||
vdup.8 d4, d26[0]
|
||||
vld1.u8 {q5}, [r0], r1
|
||||
vdup.8 d5, d26[4]
|
||||
vld1.u8 {q6}, [r0], r1
|
||||
|
||||
pld [r0]
|
||||
pld [r0, r1]
|
||||
pld [r0, r1, lsl #1]
|
||||
|
||||
vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp9_filter[0])
|
||||
vmull.u8 q8, d8, d0
|
||||
vmull.u8 q9, d10, d0
|
||||
vmull.u8 q10, d12, d0
|
||||
|
||||
vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
|
||||
vext.8 d29, d8, d9, #1
|
||||
vext.8 d30, d10, d11, #1
|
||||
vext.8 d31, d12, d13, #1
|
||||
|
||||
vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp9_filter[1])
|
||||
vmlsl.u8 q8, d29, d1
|
||||
vmlsl.u8 q9, d30, d1
|
||||
vmlsl.u8 q10, d31, d1
|
||||
|
||||
vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
|
||||
vext.8 d29, d8, d9, #4
|
||||
vext.8 d30, d10, d11, #4
|
||||
vext.8 d31, d12, d13, #4
|
||||
|
||||
vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp9_filter[4])
|
||||
vmlsl.u8 q8, d29, d4
|
||||
vmlsl.u8 q9, d30, d4
|
||||
vmlsl.u8 q10, d31, d4
|
||||
|
||||
vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
|
||||
vext.8 d29, d8, d9, #2
|
||||
vext.8 d30, d10, d11, #2
|
||||
vext.8 d31, d12, d13, #2
|
||||
|
||||
vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp9_filter[2])
|
||||
vmlal.u8 q8, d29, d2
|
||||
vmlal.u8 q9, d30, d2
|
||||
vmlal.u8 q10, d31, d2
|
||||
|
||||
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
|
||||
vext.8 d29, d8, d9, #5
|
||||
vext.8 d30, d10, d11, #5
|
||||
vext.8 d31, d12, d13, #5
|
||||
|
||||
vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp9_filter[5])
|
||||
vmlal.u8 q8, d29, d5
|
||||
vmlal.u8 q9, d30, d5
|
||||
vmlal.u8 q10, d31, d5
|
||||
|
||||
vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
|
||||
vext.8 d29, d8, d9, #3
|
||||
vext.8 d30, d10, d11, #3
|
||||
vext.8 d31, d12, d13, #3
|
||||
|
||||
vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp9_filter[3])
|
||||
vmull.u8 q4, d29, d3
|
||||
vmull.u8 q5, d30, d3
|
||||
vmull.u8 q6, d31, d3
|
||||
|
||||
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q8, q4
|
||||
vqadd.s16 q9, q5
|
||||
vqadd.s16 q10, q6
|
||||
|
||||
vld1.u8 {q3}, [r0], r1 ;load src data
|
||||
|
||||
vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d23, q8, #7
|
||||
vqrshrun.s16 d24, q9, #7
|
||||
vqrshrun.s16 d25, q10, #7
|
||||
|
||||
vld1.u8 {q4}, [r0], r1
|
||||
vst1.u8 {d22}, [lr]! ;store result
|
||||
vld1.u8 {q5}, [r0], r1
|
||||
vst1.u8 {d23}, [lr]!
|
||||
vld1.u8 {q6}, [r0], r1
|
||||
vst1.u8 {d24}, [lr]!
|
||||
vld1.u8 {q7}, [r0], r1
|
||||
vst1.u8 {d25}, [lr]!
|
||||
|
||||
;first_pass filtering on the rest 5-line data
|
||||
vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp9_filter[0])
|
||||
vmull.u8 q9, d8, d0
|
||||
vmull.u8 q10, d10, d0
|
||||
vmull.u8 q11, d12, d0
|
||||
vmull.u8 q12, d14, d0
|
||||
|
||||
vext.8 d27, d6, d7, #1 ;construct src_ptr[-1]
|
||||
vext.8 d28, d8, d9, #1
|
||||
vext.8 d29, d10, d11, #1
|
||||
vext.8 d30, d12, d13, #1
|
||||
vext.8 d31, d14, d15, #1
|
||||
|
||||
vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp9_filter[1])
|
||||
vmlsl.u8 q9, d28, d1
|
||||
vmlsl.u8 q10, d29, d1
|
||||
vmlsl.u8 q11, d30, d1
|
||||
vmlsl.u8 q12, d31, d1
|
||||
|
||||
vext.8 d27, d6, d7, #4 ;construct src_ptr[2]
|
||||
vext.8 d28, d8, d9, #4
|
||||
vext.8 d29, d10, d11, #4
|
||||
vext.8 d30, d12, d13, #4
|
||||
vext.8 d31, d14, d15, #4
|
||||
|
||||
vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp9_filter[4])
|
||||
vmlsl.u8 q9, d28, d4
|
||||
vmlsl.u8 q10, d29, d4
|
||||
vmlsl.u8 q11, d30, d4
|
||||
vmlsl.u8 q12, d31, d4
|
||||
|
||||
vext.8 d27, d6, d7, #2 ;construct src_ptr[0]
|
||||
vext.8 d28, d8, d9, #2
|
||||
vext.8 d29, d10, d11, #2
|
||||
vext.8 d30, d12, d13, #2
|
||||
vext.8 d31, d14, d15, #2
|
||||
|
||||
vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp9_filter[2])
|
||||
vmlal.u8 q9, d28, d2
|
||||
vmlal.u8 q10, d29, d2
|
||||
vmlal.u8 q11, d30, d2
|
||||
vmlal.u8 q12, d31, d2
|
||||
|
||||
vext.8 d27, d6, d7, #5 ;construct src_ptr[3]
|
||||
vext.8 d28, d8, d9, #5
|
||||
vext.8 d29, d10, d11, #5
|
||||
vext.8 d30, d12, d13, #5
|
||||
vext.8 d31, d14, d15, #5
|
||||
|
||||
vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp9_filter[5])
|
||||
vmlal.u8 q9, d28, d5
|
||||
vmlal.u8 q10, d29, d5
|
||||
vmlal.u8 q11, d30, d5
|
||||
vmlal.u8 q12, d31, d5
|
||||
|
||||
vext.8 d27, d6, d7, #3 ;construct src_ptr[1]
|
||||
vext.8 d28, d8, d9, #3
|
||||
vext.8 d29, d10, d11, #3
|
||||
vext.8 d30, d12, d13, #3
|
||||
vext.8 d31, d14, d15, #3
|
||||
|
||||
vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp9_filter[3])
|
||||
vmull.u8 q4, d28, d3
|
||||
vmull.u8 q5, d29, d3
|
||||
vmull.u8 q6, d30, d3
|
||||
vmull.u8 q7, d31, d3
|
||||
|
||||
vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q9, q4
|
||||
vqadd.s16 q10, q5
|
||||
vqadd.s16 q11, q6
|
||||
vqadd.s16 q12, q7
|
||||
|
||||
vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d27, q9, #7
|
||||
vqrshrun.s16 d28, q10, #7
|
||||
vqrshrun.s16 d29, q11, #7 ;load intermediate data from stack
|
||||
vqrshrun.s16 d30, q12, #7
|
||||
|
||||
;Second pass: 8x4
|
||||
;secondpass_filter
|
||||
add r3, r12, r3, lsl #5
|
||||
sub lr, lr, #32
|
||||
|
||||
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
|
||||
vld1.u8 {q11}, [lr]!
|
||||
|
||||
vabs.s32 q7, q5
|
||||
vabs.s32 q8, q6
|
||||
|
||||
vld1.u8 {q12}, [lr]!
|
||||
|
||||
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
|
||||
vdup.8 d1, d14[4]
|
||||
vdup.8 d2, d15[0]
|
||||
vdup.8 d3, d15[4]
|
||||
vdup.8 d4, d16[0]
|
||||
vdup.8 d5, d16[4]
|
||||
|
||||
vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp9_filter[0])
|
||||
vmull.u8 q4, d23, d0
|
||||
vmull.u8 q5, d24, d0
|
||||
vmull.u8 q6, d25, d0
|
||||
|
||||
vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp9_filter[1])
|
||||
vmlsl.u8 q4, d24, d1
|
||||
vmlsl.u8 q5, d25, d1
|
||||
vmlsl.u8 q6, d26, d1
|
||||
|
||||
vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp9_filter[4])
|
||||
vmlsl.u8 q4, d27, d4
|
||||
vmlsl.u8 q5, d28, d4
|
||||
vmlsl.u8 q6, d29, d4
|
||||
|
||||
vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp9_filter[2])
|
||||
vmlal.u8 q4, d25, d2
|
||||
vmlal.u8 q5, d26, d2
|
||||
vmlal.u8 q6, d27, d2
|
||||
|
||||
vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp9_filter[5])
|
||||
vmlal.u8 q4, d28, d5
|
||||
vmlal.u8 q5, d29, d5
|
||||
vmlal.u8 q6, d30, d5
|
||||
|
||||
vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp9_filter[3])
|
||||
vmull.u8 q8, d26, d3
|
||||
vmull.u8 q9, d27, d3
|
||||
vmull.u8 q10, d28, d3
|
||||
|
||||
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q8, q4
|
||||
vqadd.s16 q9, q5
|
||||
vqadd.s16 q10, q6
|
||||
|
||||
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d7, q8, #7
|
||||
vqrshrun.s16 d8, q9, #7
|
||||
vqrshrun.s16 d9, q10, #7
|
||||
|
||||
vst1.u8 {d6}, [r4], r5 ;store result
|
||||
vst1.u8 {d7}, [r4], r5
|
||||
vst1.u8 {d8}, [r4], r5
|
||||
vst1.u8 {d9}, [r4], r5
|
||||
|
||||
add sp, sp, #32
|
||||
pop {r4-r5,pc}
|
||||
|
||||
;--------------------
|
||||
firstpass_filter8x4_only
|
||||
vabs.s32 q12, q14
|
||||
vabs.s32 q13, q15
|
||||
|
||||
sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
|
||||
vld1.u8 {q3}, [r0], r1 ;load src data
|
||||
|
||||
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
|
||||
vld1.u8 {q4}, [r0], r1
|
||||
vdup.8 d1, d24[4]
|
||||
vld1.u8 {q5}, [r0], r1
|
||||
vdup.8 d2, d25[0]
|
||||
vld1.u8 {q6}, [r0], r1
|
||||
vdup.8 d3, d25[4]
|
||||
vdup.8 d4, d26[0]
|
||||
vdup.8 d5, d26[4]
|
||||
|
||||
;First pass: output_height lines x output_width columns (4x8)
|
||||
pld [r0]
|
||||
pld [r0, r1]
|
||||
pld [r0, r1, lsl #1]
|
||||
|
||||
vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp9_filter[0])
|
||||
vmull.u8 q8, d8, d0
|
||||
vmull.u8 q9, d10, d0
|
||||
vmull.u8 q10, d12, d0
|
||||
|
||||
vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
|
||||
vext.8 d29, d8, d9, #1
|
||||
vext.8 d30, d10, d11, #1
|
||||
vext.8 d31, d12, d13, #1
|
||||
|
||||
vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp9_filter[1])
|
||||
vmlsl.u8 q8, d29, d1
|
||||
vmlsl.u8 q9, d30, d1
|
||||
vmlsl.u8 q10, d31, d1
|
||||
|
||||
vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
|
||||
vext.8 d29, d8, d9, #4
|
||||
vext.8 d30, d10, d11, #4
|
||||
vext.8 d31, d12, d13, #4
|
||||
|
||||
vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp9_filter[4])
|
||||
vmlsl.u8 q8, d29, d4
|
||||
vmlsl.u8 q9, d30, d4
|
||||
vmlsl.u8 q10, d31, d4
|
||||
|
||||
vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
|
||||
vext.8 d29, d8, d9, #2
|
||||
vext.8 d30, d10, d11, #2
|
||||
vext.8 d31, d12, d13, #2
|
||||
|
||||
vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp9_filter[2])
|
||||
vmlal.u8 q8, d29, d2
|
||||
vmlal.u8 q9, d30, d2
|
||||
vmlal.u8 q10, d31, d2
|
||||
|
||||
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
|
||||
vext.8 d29, d8, d9, #5
|
||||
vext.8 d30, d10, d11, #5
|
||||
vext.8 d31, d12, d13, #5
|
||||
|
||||
vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp9_filter[5])
|
||||
vmlal.u8 q8, d29, d5
|
||||
vmlal.u8 q9, d30, d5
|
||||
vmlal.u8 q10, d31, d5
|
||||
|
||||
vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
|
||||
vext.8 d29, d8, d9, #3
|
||||
vext.8 d30, d10, d11, #3
|
||||
vext.8 d31, d12, d13, #3
|
||||
|
||||
vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp9_filter[3])
|
||||
vmull.u8 q4, d29, d3
|
||||
vmull.u8 q5, d30, d3
|
||||
vmull.u8 q6, d31, d3
|
||||
|
||||
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q8, q4
|
||||
vqadd.s16 q9, q5
|
||||
vqadd.s16 q10, q6
|
||||
|
||||
vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d23, q8, #7
|
||||
vqrshrun.s16 d24, q9, #7
|
||||
vqrshrun.s16 d25, q10, #7
|
||||
|
||||
vst1.u8 {d22}, [r4], r5 ;store result
|
||||
vst1.u8 {d23}, [r4], r5
|
||||
vst1.u8 {d24}, [r4], r5
|
||||
vst1.u8 {d25}, [r4], r5
|
||||
|
||||
pop {r4-r5,pc}
|
||||
|
||||
;---------------------
|
||||
secondpass_filter8x4_only
|
||||
;Second pass: 8x4
|
||||
add r3, r12, r3, lsl #5
|
||||
sub r0, r0, r1, lsl #1
|
||||
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
|
||||
vabs.s32 q7, q5
|
||||
vabs.s32 q8, q6
|
||||
|
||||
vld1.u8 {d22}, [r0], r1
|
||||
vld1.u8 {d23}, [r0], r1
|
||||
vld1.u8 {d24}, [r0], r1
|
||||
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
|
||||
vld1.u8 {d25}, [r0], r1
|
||||
vdup.8 d1, d14[4]
|
||||
vld1.u8 {d26}, [r0], r1
|
||||
vdup.8 d2, d15[0]
|
||||
vld1.u8 {d27}, [r0], r1
|
||||
vdup.8 d3, d15[4]
|
||||
vld1.u8 {d28}, [r0], r1
|
||||
vdup.8 d4, d16[0]
|
||||
vld1.u8 {d29}, [r0], r1
|
||||
vdup.8 d5, d16[4]
|
||||
vld1.u8 {d30}, [r0], r1
|
||||
|
||||
vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp9_filter[0])
|
||||
vmull.u8 q4, d23, d0
|
||||
vmull.u8 q5, d24, d0
|
||||
vmull.u8 q6, d25, d0
|
||||
|
||||
vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp9_filter[1])
|
||||
vmlsl.u8 q4, d24, d1
|
||||
vmlsl.u8 q5, d25, d1
|
||||
vmlsl.u8 q6, d26, d1
|
||||
|
||||
vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp9_filter[4])
|
||||
vmlsl.u8 q4, d27, d4
|
||||
vmlsl.u8 q5, d28, d4
|
||||
vmlsl.u8 q6, d29, d4
|
||||
|
||||
vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp9_filter[2])
|
||||
vmlal.u8 q4, d25, d2
|
||||
vmlal.u8 q5, d26, d2
|
||||
vmlal.u8 q6, d27, d2
|
||||
|
||||
vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp9_filter[5])
|
||||
vmlal.u8 q4, d28, d5
|
||||
vmlal.u8 q5, d29, d5
|
||||
vmlal.u8 q6, d30, d5
|
||||
|
||||
vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp9_filter[3])
|
||||
vmull.u8 q8, d26, d3
|
||||
vmull.u8 q9, d27, d3
|
||||
vmull.u8 q10, d28, d3
|
||||
|
||||
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q8, q4
|
||||
vqadd.s16 q9, q5
|
||||
vqadd.s16 q10, q6
|
||||
|
||||
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d7, q8, #7
|
||||
vqrshrun.s16 d8, q9, #7
|
||||
vqrshrun.s16 d9, q10, #7
|
||||
|
||||
vst1.u8 {d6}, [r4], r5 ;store result
|
||||
vst1.u8 {d7}, [r4], r5
|
||||
vst1.u8 {d8}, [r4], r5
|
||||
vst1.u8 {d9}, [r4], r5
|
||||
|
||||
pop {r4-r5,pc}
|
||||
|
||||
ENDP
|
||||
|
||||
;-----------------
|
||||
|
||||
END
|
|
@ -1,524 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_sixtap_predict8x8_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
filter8_coeff
|
||||
DCD 0, 0, 128, 0, 0, 0, 0, 0
|
||||
DCD 0, -6, 123, 12, -1, 0, 0, 0
|
||||
DCD 2, -11, 108, 36, -8, 1, 0, 0
|
||||
DCD 0, -9, 93, 50, -6, 0, 0, 0
|
||||
DCD 3, -16, 77, 77, -16, 3, 0, 0
|
||||
DCD 0, -6, 50, 93, -9, 0, 0, 0
|
||||
DCD 1, -8, 36, 108, -11, 2, 0, 0
|
||||
DCD 0, -1, 12, 123, -6, 0, 0, 0
|
||||
|
||||
; r0 unsigned char *src_ptr,
|
||||
; r1 int src_pixels_per_line,
|
||||
; r2 int xoffset,
|
||||
; r3 int yoffset,
|
||||
; stack(r4) unsigned char *dst_ptr,
|
||||
; stack(r5) int dst_pitch
|
||||
|
||||
|vp8_sixtap_predict8x8_neon| PROC
|
||||
push {r4-r5, lr}
|
||||
|
||||
adr r12, filter8_coeff
|
||||
|
||||
ldr r4, [sp, #12] ;load parameters from stack
|
||||
ldr r5, [sp, #16] ;load parameters from stack
|
||||
|
||||
cmp r2, #0 ;skip first_pass filter if xoffset=0
|
||||
beq secondpass_filter8x8_only
|
||||
|
||||
add r2, r12, r2, lsl #5 ;calculate filter location
|
||||
|
||||
cmp r3, #0 ;skip second_pass filter if yoffset=0
|
||||
|
||||
vld1.s32 {q14, q15}, [r2] ;load first_pass filter
|
||||
|
||||
beq firstpass_filter8x8_only
|
||||
|
||||
sub sp, sp, #64 ;reserve space on stack for temporary storage
|
||||
mov lr, sp
|
||||
|
||||
vabs.s32 q12, q14
|
||||
vabs.s32 q13, q15
|
||||
|
||||
mov r2, #2 ;loop counter
|
||||
sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
|
||||
sub r0, r0, r1, lsl #1
|
||||
|
||||
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
|
||||
vdup.8 d1, d24[4]
|
||||
vdup.8 d2, d25[0]
|
||||
|
||||
;First pass: output_height lines x output_width columns (13x8)
|
||||
vld1.u8 {q3}, [r0], r1 ;load src data
|
||||
vdup.8 d3, d25[4]
|
||||
vld1.u8 {q4}, [r0], r1
|
||||
vdup.8 d4, d26[0]
|
||||
vld1.u8 {q5}, [r0], r1
|
||||
vdup.8 d5, d26[4]
|
||||
vld1.u8 {q6}, [r0], r1
|
||||
|
||||
filt_blk2d_fp8x8_loop_neon
|
||||
pld [r0]
|
||||
pld [r0, r1]
|
||||
pld [r0, r1, lsl #1]
|
||||
|
||||
vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp9_filter[0])
|
||||
vmull.u8 q8, d8, d0
|
||||
vmull.u8 q9, d10, d0
|
||||
vmull.u8 q10, d12, d0
|
||||
|
||||
vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
|
||||
vext.8 d29, d8, d9, #1
|
||||
vext.8 d30, d10, d11, #1
|
||||
vext.8 d31, d12, d13, #1
|
||||
|
||||
vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp9_filter[1])
|
||||
vmlsl.u8 q8, d29, d1
|
||||
vmlsl.u8 q9, d30, d1
|
||||
vmlsl.u8 q10, d31, d1
|
||||
|
||||
vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
|
||||
vext.8 d29, d8, d9, #4
|
||||
vext.8 d30, d10, d11, #4
|
||||
vext.8 d31, d12, d13, #4
|
||||
|
||||
vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp9_filter[4])
|
||||
vmlsl.u8 q8, d29, d4
|
||||
vmlsl.u8 q9, d30, d4
|
||||
vmlsl.u8 q10, d31, d4
|
||||
|
||||
vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
|
||||
vext.8 d29, d8, d9, #2
|
||||
vext.8 d30, d10, d11, #2
|
||||
vext.8 d31, d12, d13, #2
|
||||
|
||||
vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp9_filter[2])
|
||||
vmlal.u8 q8, d29, d2
|
||||
vmlal.u8 q9, d30, d2
|
||||
vmlal.u8 q10, d31, d2
|
||||
|
||||
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
|
||||
vext.8 d29, d8, d9, #5
|
||||
vext.8 d30, d10, d11, #5
|
||||
vext.8 d31, d12, d13, #5
|
||||
|
||||
vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp9_filter[5])
|
||||
vmlal.u8 q8, d29, d5
|
||||
vmlal.u8 q9, d30, d5
|
||||
vmlal.u8 q10, d31, d5
|
||||
|
||||
vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
|
||||
vext.8 d29, d8, d9, #3
|
||||
vext.8 d30, d10, d11, #3
|
||||
vext.8 d31, d12, d13, #3
|
||||
|
||||
vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp9_filter[3])
|
||||
vmull.u8 q4, d29, d3
|
||||
vmull.u8 q5, d30, d3
|
||||
vmull.u8 q6, d31, d3
|
||||
|
||||
subs r2, r2, #1
|
||||
|
||||
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q8, q4
|
||||
vqadd.s16 q9, q5
|
||||
vqadd.s16 q10, q6
|
||||
|
||||
vld1.u8 {q3}, [r0], r1 ;load src data
|
||||
|
||||
vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d23, q8, #7
|
||||
vqrshrun.s16 d24, q9, #7
|
||||
vqrshrun.s16 d25, q10, #7
|
||||
|
||||
vst1.u8 {d22}, [lr]! ;store result
|
||||
vld1.u8 {q4}, [r0], r1
|
||||
vst1.u8 {d23}, [lr]!
|
||||
vld1.u8 {q5}, [r0], r1
|
||||
vst1.u8 {d24}, [lr]!
|
||||
vld1.u8 {q6}, [r0], r1
|
||||
vst1.u8 {d25}, [lr]!
|
||||
|
||||
bne filt_blk2d_fp8x8_loop_neon
|
||||
|
||||
;first_pass filtering on the rest 5-line data
|
||||
;vld1.u8 {q3}, [r0], r1 ;load src data
|
||||
;vld1.u8 {q4}, [r0], r1
|
||||
;vld1.u8 {q5}, [r0], r1
|
||||
;vld1.u8 {q6}, [r0], r1
|
||||
vld1.u8 {q7}, [r0], r1
|
||||
|
||||
vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp9_filter[0])
|
||||
vmull.u8 q9, d8, d0
|
||||
vmull.u8 q10, d10, d0
|
||||
vmull.u8 q11, d12, d0
|
||||
vmull.u8 q12, d14, d0
|
||||
|
||||
vext.8 d27, d6, d7, #1 ;construct src_ptr[-1]
|
||||
vext.8 d28, d8, d9, #1
|
||||
vext.8 d29, d10, d11, #1
|
||||
vext.8 d30, d12, d13, #1
|
||||
vext.8 d31, d14, d15, #1
|
||||
|
||||
vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp9_filter[1])
|
||||
vmlsl.u8 q9, d28, d1
|
||||
vmlsl.u8 q10, d29, d1
|
||||
vmlsl.u8 q11, d30, d1
|
||||
vmlsl.u8 q12, d31, d1
|
||||
|
||||
vext.8 d27, d6, d7, #4 ;construct src_ptr[2]
|
||||
vext.8 d28, d8, d9, #4
|
||||
vext.8 d29, d10, d11, #4
|
||||
vext.8 d30, d12, d13, #4
|
||||
vext.8 d31, d14, d15, #4
|
||||
|
||||
vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp9_filter[4])
|
||||
vmlsl.u8 q9, d28, d4
|
||||
vmlsl.u8 q10, d29, d4
|
||||
vmlsl.u8 q11, d30, d4
|
||||
vmlsl.u8 q12, d31, d4
|
||||
|
||||
vext.8 d27, d6, d7, #2 ;construct src_ptr[0]
|
||||
vext.8 d28, d8, d9, #2
|
||||
vext.8 d29, d10, d11, #2
|
||||
vext.8 d30, d12, d13, #2
|
||||
vext.8 d31, d14, d15, #2
|
||||
|
||||
vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp9_filter[2])
|
||||
vmlal.u8 q9, d28, d2
|
||||
vmlal.u8 q10, d29, d2
|
||||
vmlal.u8 q11, d30, d2
|
||||
vmlal.u8 q12, d31, d2
|
||||
|
||||
vext.8 d27, d6, d7, #5 ;construct src_ptr[3]
|
||||
vext.8 d28, d8, d9, #5
|
||||
vext.8 d29, d10, d11, #5
|
||||
vext.8 d30, d12, d13, #5
|
||||
vext.8 d31, d14, d15, #5
|
||||
|
||||
vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp9_filter[5])
|
||||
vmlal.u8 q9, d28, d5
|
||||
vmlal.u8 q10, d29, d5
|
||||
vmlal.u8 q11, d30, d5
|
||||
vmlal.u8 q12, d31, d5
|
||||
|
||||
vext.8 d27, d6, d7, #3 ;construct src_ptr[1]
|
||||
vext.8 d28, d8, d9, #3
|
||||
vext.8 d29, d10, d11, #3
|
||||
vext.8 d30, d12, d13, #3
|
||||
vext.8 d31, d14, d15, #3
|
||||
|
||||
vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp9_filter[3])
|
||||
vmull.u8 q4, d28, d3
|
||||
vmull.u8 q5, d29, d3
|
||||
vmull.u8 q6, d30, d3
|
||||
vmull.u8 q7, d31, d3
|
||||
|
||||
vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q9, q4
|
||||
vqadd.s16 q10, q5
|
||||
vqadd.s16 q11, q6
|
||||
vqadd.s16 q12, q7
|
||||
|
||||
add r3, r12, r3, lsl #5
|
||||
|
||||
vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8
|
||||
sub lr, lr, #64
|
||||
vqrshrun.s16 d27, q9, #7
|
||||
vld1.u8 {q9}, [lr]! ;load intermediate data from stack
|
||||
vqrshrun.s16 d28, q10, #7
|
||||
vld1.u8 {q10}, [lr]!
|
||||
|
||||
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
|
||||
|
||||
vqrshrun.s16 d29, q11, #7
|
||||
vld1.u8 {q11}, [lr]!
|
||||
|
||||
vabs.s32 q7, q5
|
||||
vabs.s32 q8, q6
|
||||
|
||||
vqrshrun.s16 d30, q12, #7
|
||||
vld1.u8 {q12}, [lr]!
|
||||
|
||||
;Second pass: 8x8
|
||||
mov r3, #2 ;loop counter
|
||||
|
||||
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
|
||||
vdup.8 d1, d14[4]
|
||||
vdup.8 d2, d15[0]
|
||||
vdup.8 d3, d15[4]
|
||||
vdup.8 d4, d16[0]
|
||||
vdup.8 d5, d16[4]
|
||||
|
||||
filt_blk2d_sp8x8_loop_neon
|
||||
vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp9_filter[0])
|
||||
vmull.u8 q4, d19, d0
|
||||
vmull.u8 q5, d20, d0
|
||||
vmull.u8 q6, d21, d0
|
||||
|
||||
vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp9_filter[1])
|
||||
vmlsl.u8 q4, d20, d1
|
||||
vmlsl.u8 q5, d21, d1
|
||||
vmlsl.u8 q6, d22, d1
|
||||
|
||||
vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp9_filter[4])
|
||||
vmlsl.u8 q4, d23, d4
|
||||
vmlsl.u8 q5, d24, d4
|
||||
vmlsl.u8 q6, d25, d4
|
||||
|
||||
vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp9_filter[2])
|
||||
vmlal.u8 q4, d21, d2
|
||||
vmlal.u8 q5, d22, d2
|
||||
vmlal.u8 q6, d23, d2
|
||||
|
||||
vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp9_filter[5])
|
||||
vmlal.u8 q4, d24, d5
|
||||
vmlal.u8 q5, d25, d5
|
||||
vmlal.u8 q6, d26, d5
|
||||
|
||||
vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp9_filter[3])
|
||||
vmull.u8 q8, d22, d3
|
||||
vmull.u8 q9, d23, d3
|
||||
vmull.u8 q10, d24, d3
|
||||
|
||||
subs r3, r3, #1
|
||||
|
||||
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q8, q4
|
||||
vqadd.s16 q9, q5
|
||||
vqadd.s16 q10, q6
|
||||
|
||||
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d7, q8, #7
|
||||
vqrshrun.s16 d8, q9, #7
|
||||
vqrshrun.s16 d9, q10, #7
|
||||
|
||||
vmov q9, q11
|
||||
vst1.u8 {d6}, [r4], r5 ;store result
|
||||
vmov q10, q12
|
||||
vst1.u8 {d7}, [r4], r5
|
||||
vmov q11, q13
|
||||
vst1.u8 {d8}, [r4], r5
|
||||
vmov q12, q14
|
||||
vst1.u8 {d9}, [r4], r5
|
||||
vmov d26, d30
|
||||
|
||||
bne filt_blk2d_sp8x8_loop_neon
|
||||
|
||||
add sp, sp, #64
|
||||
pop {r4-r5,pc}
|
||||
|
||||
;---------------------
|
||||
firstpass_filter8x8_only
|
||||
;add r2, r12, r2, lsl #5 ;calculate filter location
|
||||
;vld1.s32 {q14, q15}, [r2] ;load first_pass filter
|
||||
vabs.s32 q12, q14
|
||||
vabs.s32 q13, q15
|
||||
|
||||
mov r2, #2 ;loop counter
|
||||
sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
|
||||
|
||||
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
|
||||
vdup.8 d1, d24[4]
|
||||
vdup.8 d2, d25[0]
|
||||
vdup.8 d3, d25[4]
|
||||
vdup.8 d4, d26[0]
|
||||
vdup.8 d5, d26[4]
|
||||
|
||||
;First pass: output_height lines x output_width columns (8x8)
|
||||
filt_blk2d_fpo8x8_loop_neon
|
||||
vld1.u8 {q3}, [r0], r1 ;load src data
|
||||
vld1.u8 {q4}, [r0], r1
|
||||
vld1.u8 {q5}, [r0], r1
|
||||
vld1.u8 {q6}, [r0], r1
|
||||
|
||||
pld [r0]
|
||||
pld [r0, r1]
|
||||
pld [r0, r1, lsl #1]
|
||||
|
||||
vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp9_filter[0])
|
||||
vmull.u8 q8, d8, d0
|
||||
vmull.u8 q9, d10, d0
|
||||
vmull.u8 q10, d12, d0
|
||||
|
||||
vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
|
||||
vext.8 d29, d8, d9, #1
|
||||
vext.8 d30, d10, d11, #1
|
||||
vext.8 d31, d12, d13, #1
|
||||
|
||||
vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp9_filter[1])
|
||||
vmlsl.u8 q8, d29, d1
|
||||
vmlsl.u8 q9, d30, d1
|
||||
vmlsl.u8 q10, d31, d1
|
||||
|
||||
vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
|
||||
vext.8 d29, d8, d9, #4
|
||||
vext.8 d30, d10, d11, #4
|
||||
vext.8 d31, d12, d13, #4
|
||||
|
||||
vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp9_filter[4])
|
||||
vmlsl.u8 q8, d29, d4
|
||||
vmlsl.u8 q9, d30, d4
|
||||
vmlsl.u8 q10, d31, d4
|
||||
|
||||
vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
|
||||
vext.8 d29, d8, d9, #2
|
||||
vext.8 d30, d10, d11, #2
|
||||
vext.8 d31, d12, d13, #2
|
||||
|
||||
vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp9_filter[2])
|
||||
vmlal.u8 q8, d29, d2
|
||||
vmlal.u8 q9, d30, d2
|
||||
vmlal.u8 q10, d31, d2
|
||||
|
||||
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
|
||||
vext.8 d29, d8, d9, #5
|
||||
vext.8 d30, d10, d11, #5
|
||||
vext.8 d31, d12, d13, #5
|
||||
|
||||
vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp9_filter[5])
|
||||
vmlal.u8 q8, d29, d5
|
||||
vmlal.u8 q9, d30, d5
|
||||
vmlal.u8 q10, d31, d5
|
||||
|
||||
vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
|
||||
vext.8 d29, d8, d9, #3
|
||||
vext.8 d30, d10, d11, #3
|
||||
vext.8 d31, d12, d13, #3
|
||||
|
||||
vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp9_filter[3])
|
||||
vmull.u8 q4, d29, d3
|
||||
vmull.u8 q5, d30, d3
|
||||
vmull.u8 q6, d31, d3
|
||||
;
|
||||
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q8, q4
|
||||
vqadd.s16 q9, q5
|
||||
vqadd.s16 q10, q6
|
||||
|
||||
subs r2, r2, #1
|
||||
|
||||
vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d23, q8, #7
|
||||
vqrshrun.s16 d24, q9, #7
|
||||
vqrshrun.s16 d25, q10, #7
|
||||
|
||||
vst1.u8 {d22}, [r4], r5 ;store result
|
||||
vst1.u8 {d23}, [r4], r5
|
||||
vst1.u8 {d24}, [r4], r5
|
||||
vst1.u8 {d25}, [r4], r5
|
||||
|
||||
bne filt_blk2d_fpo8x8_loop_neon
|
||||
|
||||
pop {r4-r5,pc}
|
||||
|
||||
;---------------------
|
||||
secondpass_filter8x8_only
|
||||
sub r0, r0, r1, lsl #1
|
||||
add r3, r12, r3, lsl #5
|
||||
|
||||
vld1.u8 {d18}, [r0], r1 ;load src data
|
||||
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
|
||||
vld1.u8 {d19}, [r0], r1
|
||||
vabs.s32 q7, q5
|
||||
vld1.u8 {d20}, [r0], r1
|
||||
vabs.s32 q8, q6
|
||||
vld1.u8 {d21}, [r0], r1
|
||||
mov r3, #2 ;loop counter
|
||||
vld1.u8 {d22}, [r0], r1
|
||||
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
|
||||
vld1.u8 {d23}, [r0], r1
|
||||
vdup.8 d1, d14[4]
|
||||
vld1.u8 {d24}, [r0], r1
|
||||
vdup.8 d2, d15[0]
|
||||
vld1.u8 {d25}, [r0], r1
|
||||
vdup.8 d3, d15[4]
|
||||
vld1.u8 {d26}, [r0], r1
|
||||
vdup.8 d4, d16[0]
|
||||
vld1.u8 {d27}, [r0], r1
|
||||
vdup.8 d5, d16[4]
|
||||
vld1.u8 {d28}, [r0], r1
|
||||
vld1.u8 {d29}, [r0], r1
|
||||
vld1.u8 {d30}, [r0], r1
|
||||
|
||||
;Second pass: 8x8
|
||||
filt_blk2d_spo8x8_loop_neon
|
||||
vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp9_filter[0])
|
||||
vmull.u8 q4, d19, d0
|
||||
vmull.u8 q5, d20, d0
|
||||
vmull.u8 q6, d21, d0
|
||||
|
||||
vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp9_filter[1])
|
||||
vmlsl.u8 q4, d20, d1
|
||||
vmlsl.u8 q5, d21, d1
|
||||
vmlsl.u8 q6, d22, d1
|
||||
|
||||
vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp9_filter[4])
|
||||
vmlsl.u8 q4, d23, d4
|
||||
vmlsl.u8 q5, d24, d4
|
||||
vmlsl.u8 q6, d25, d4
|
||||
|
||||
vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp9_filter[2])
|
||||
vmlal.u8 q4, d21, d2
|
||||
vmlal.u8 q5, d22, d2
|
||||
vmlal.u8 q6, d23, d2
|
||||
|
||||
vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp9_filter[5])
|
||||
vmlal.u8 q4, d24, d5
|
||||
vmlal.u8 q5, d25, d5
|
||||
vmlal.u8 q6, d26, d5
|
||||
|
||||
vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp9_filter[3])
|
||||
vmull.u8 q8, d22, d3
|
||||
vmull.u8 q9, d23, d3
|
||||
vmull.u8 q10, d24, d3
|
||||
|
||||
subs r3, r3, #1
|
||||
|
||||
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q8, q4
|
||||
vqadd.s16 q9, q5
|
||||
vqadd.s16 q10, q6
|
||||
|
||||
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d7, q8, #7
|
||||
vqrshrun.s16 d8, q9, #7
|
||||
vqrshrun.s16 d9, q10, #7
|
||||
|
||||
vmov q9, q11
|
||||
vst1.u8 {d6}, [r4], r5 ;store result
|
||||
vmov q10, q12
|
||||
vst1.u8 {d7}, [r4], r5
|
||||
vmov q11, q13
|
||||
vst1.u8 {d8}, [r4], r5
|
||||
vmov q12, q14
|
||||
vst1.u8 {d9}, [r4], r5
|
||||
vmov d26, d30
|
||||
|
||||
bne filt_blk2d_spo8x8_loop_neon
|
||||
|
||||
pop {r4-r5,pc}
|
||||
|
||||
ENDP
|
||||
|
||||
;-----------------
|
||||
|
||||
END
|
|
@ -1,91 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
|
||||
#include "vpx_ports/config.h"
|
||||
#include "vpx_ports/arm.h"
|
||||
#include "vp9/common/vp9_pragmas.h"
|
||||
#include "vp9/common/vp9_subpixel.h"
|
||||
#include "vp9/common/vp9_loopfilter.h"
|
||||
#include "vp9/common/recon.h"
|
||||
#include "vp9/common/vp9_onyxc_int.h"
|
||||
|
||||
void vp9_arch_arm_common_init(VP9_COMMON *ctx) {
|
||||
#if CONFIG_RUNTIME_CPU_DETECT
|
||||
VP9_COMMON_RTCD *rtcd = &ctx->rtcd;
|
||||
int flags = arm_cpu_caps();
|
||||
rtcd->flags = flags;
|
||||
|
||||
/* Override default functions with fastest ones for this CPU. */
|
||||
#if HAVE_ARMV5TE
|
||||
if (flags & HAS_EDSP) {
|
||||
}
|
||||
#endif
|
||||
|
||||
// The commented functions need to be re-written for vpx.
|
||||
#if HAVE_ARMV6
|
||||
if (flags & HAS_MEDIA) {
|
||||
rtcd->subpix.sixtap16x16 = vp9_sixtap_predict16x16_armv6;
|
||||
rtcd->subpix.sixtap8x8 = vp9_sixtap_predict8x8_armv6;
|
||||
rtcd->subpix.sixtap8x4 = vp9_sixtap_predict8x4_armv6;
|
||||
rtcd->subpix.sixtap4x4 = vp9_sixtap_predict_armv6;
|
||||
|
||||
rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_armv6;
|
||||
rtcd->subpix.bilinear8x8 = vp9_bilinear_predict8x8_armv6;
|
||||
rtcd->subpix.bilinear8x4 = vp9_bilinear_predict8x4_armv6;
|
||||
rtcd->subpix.bilinear4x4 = vp9_bilinear_predict4x4_armv6;
|
||||
|
||||
// rtcd->idct.idct1 = vp9_short_idct4x4llm_1_v6;
|
||||
// rtcd->idct.idct16 = vp9_short_idct4x4llm_v6_dual;
|
||||
// rtcd->idct.iwalsh1 = vp9_short_inv_walsh4x4_1_v6;
|
||||
// rtcd->idct.iwalsh16 = vp9_short_inv_walsh4x4_v6;
|
||||
|
||||
rtcd->recon.copy16x16 = vp9_copy_mem16x16_v6;
|
||||
rtcd->recon.copy8x8 = vp9_copy_mem8x8_v6;
|
||||
rtcd->recon.copy8x4 = vp9_copy_mem8x4_v6;
|
||||
rtcd->recon.recon = vp9_recon_b_armv6;
|
||||
rtcd->recon.recon2 = vp9_recon2b_armv6;
|
||||
rtcd->recon.recon4 = vp9_recon4b_armv6;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if HAVE_ARMV7
|
||||
if (flags & HAS_NEON) {
|
||||
rtcd->subpix.sixtap16x16 = vp9_sixtap_predict16x16_neon;
|
||||
rtcd->subpix.sixtap8x8 = vp9_sixtap_predict8x8_neon;
|
||||
rtcd->subpix.sixtap8x4 = vp9_sixtap_predict8x4_neon;
|
||||
rtcd->subpix.sixtap4x4 = vp9_sixtap_predict_neon;
|
||||
|
||||
rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_neon;
|
||||
rtcd->subpix.bilinear8x8 = vp9_bilinear_predict8x8_neon;
|
||||
rtcd->subpix.bilinear8x4 = vp9_bilinear_predict8x4_neon;
|
||||
rtcd->subpix.bilinear4x4 = vp9_bilinear_predict4x4_neon;
|
||||
|
||||
// rtcd->idct.idct1 = vp9_short_idct4x4llm_1_neon;
|
||||
// rtcd->idct.idct16 = vp9_short_idct4x4llm_neon;
|
||||
// rtcd->idct.iwalsh1 = vp9_short_inv_walsh4x4_1_neon;
|
||||
// rtcd->idct.iwalsh16 = vp9_short_inv_walsh4x4_neon;
|
||||
|
||||
rtcd->recon.copy16x16 = vp9_copy_mem16x16_neon;
|
||||
rtcd->recon.copy8x8 = vp9_copy_mem8x8_neon;
|
||||
rtcd->recon.copy8x4 = vp9_copy_mem8x4_neon;
|
||||
rtcd->recon.recon = vp9_recon_b_neon;
|
||||
rtcd->recon.recon2 = vp9_recon2b_neon;
|
||||
rtcd->recon.recon4 = vp9_recon4b_neon;
|
||||
rtcd->recon.recon_mb = vp9_recon_mb_neon;
|
||||
rtcd->recon.build_intra_predictors_mby =
|
||||
vp9_build_intra_predictors_mby_neon;
|
||||
rtcd->recon.build_intra_predictors_mby_s =
|
||||
vp9_build_intra_predictors_mby_s_neon;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
|
@ -1,108 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
|
||||
#include <math.h>
|
||||
#include "vp9/common/vp9_filter.h"
|
||||
#include "vp9/common/vp9_subpixel.h"
|
||||
#include "vp9/common/arm/vp9_bilinearfilter_arm.h"
|
||||
|
||||
void vp9_filter_block2d_bil_armv6
|
||||
(
|
||||
unsigned char *src_ptr,
|
||||
unsigned char *dst_ptr,
|
||||
unsigned int src_pitch,
|
||||
unsigned int dst_pitch,
|
||||
const short *HFilter,
|
||||
const short *VFilter,
|
||||
int Width,
|
||||
int Height
|
||||
) {
|
||||
unsigned short FData[36 * 16]; /* Temp data buffer used in filtering */
|
||||
|
||||
/* First filter 1-D horizontally... */
|
||||
vp9_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
|
||||
|
||||
/* then 1-D vertically... */
|
||||
vp9_filter_block2d_bil_second_pass_armv6(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
|
||||
}
|
||||
|
||||
|
||||
void vp9_bilinear_predict4x4_armv6
|
||||
(
|
||||
unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
unsigned char *dst_ptr,
|
||||
int dst_pitch
|
||||
) {
|
||||
const short *HFilter;
|
||||
const short *VFilter;
|
||||
|
||||
HFilter = vp8_bilinear_filters[xoffset];
|
||||
VFilter = vp8_bilinear_filters[yoffset];
|
||||
|
||||
vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
|
||||
}
|
||||
|
||||
void vp9_bilinear_predict8x8_armv6
|
||||
(
|
||||
unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
unsigned char *dst_ptr,
|
||||
int dst_pitch
|
||||
) {
|
||||
const short *HFilter;
|
||||
const short *VFilter;
|
||||
|
||||
HFilter = vp8_bilinear_filters[xoffset];
|
||||
VFilter = vp8_bilinear_filters[yoffset];
|
||||
|
||||
vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
|
||||
}
|
||||
|
||||
void vp9_bilinear_predict8x4_armv6
|
||||
(
|
||||
unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
unsigned char *dst_ptr,
|
||||
int dst_pitch
|
||||
) {
|
||||
const short *HFilter;
|
||||
const short *VFilter;
|
||||
|
||||
HFilter = vp8_bilinear_filters[xoffset];
|
||||
VFilter = vp8_bilinear_filters[yoffset];
|
||||
|
||||
vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
|
||||
}
|
||||
|
||||
void vp9_bilinear_predict16x16_armv6
|
||||
(
|
||||
unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
unsigned char *dst_ptr,
|
||||
int dst_pitch
|
||||
) {
|
||||
const short *HFilter;
|
||||
const short *VFilter;
|
||||
|
||||
HFilter = vp8_bilinear_filters[xoffset];
|
||||
VFilter = vp8_bilinear_filters[yoffset];
|
||||
|
||||
vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
|
||||
}
|
|
@ -1,35 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2011 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef VP9_COMMON_ARM_VP9_BILINEARFILTER_ARM_H_
|
||||
#define VP9_COMMON_ARM_VP9_BILINEARFILTER_ARM_H_
|
||||
|
||||
extern void vp9_filter_block2d_bil_first_pass_armv6
|
||||
(
|
||||
const unsigned char *src_ptr,
|
||||
unsigned short *dst_ptr,
|
||||
unsigned int src_pitch,
|
||||
unsigned int height,
|
||||
unsigned int width,
|
||||
const short *vp9_filter
|
||||
);
|
||||
|
||||
extern void vp9_filter_block2d_bil_second_pass_armv6
|
||||
(
|
||||
const unsigned short *src_ptr,
|
||||
unsigned char *dst_ptr,
|
||||
int dst_pitch,
|
||||
unsigned int height,
|
||||
unsigned int width,
|
||||
const short *vp9_filter
|
||||
);
|
||||
|
||||
#endif /* BILINEARFILTER_ARM_H */
|
|
@ -1,198 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
|
||||
#include "vpx_ports/config.h"
|
||||
#include <math.h>
|
||||
#include "vp9/common/vp9_filter.h"
|
||||
#include "vp9/common/vp9_subpixel.h"
|
||||
#include "vpx_ports/mem.h"
|
||||
|
||||
extern void vp9_filter_block2d_first_pass_armv6
|
||||
(
|
||||
unsigned char *src_ptr,
|
||||
short *output_ptr,
|
||||
unsigned int src_pixels_per_line,
|
||||
unsigned int output_width,
|
||||
unsigned int output_height,
|
||||
const short *vp9_filter
|
||||
);
|
||||
|
||||
// 8x8
|
||||
extern void vp9_filter_block2d_first_pass_8x8_armv6
|
||||
(
|
||||
unsigned char *src_ptr,
|
||||
short *output_ptr,
|
||||
unsigned int src_pixels_per_line,
|
||||
unsigned int output_width,
|
||||
unsigned int output_height,
|
||||
const short *vp9_filter
|
||||
);
|
||||
|
||||
// 16x16
|
||||
extern void vp9_filter_block2d_first_pass_16x16_armv6
|
||||
(
|
||||
unsigned char *src_ptr,
|
||||
short *output_ptr,
|
||||
unsigned int src_pixels_per_line,
|
||||
unsigned int output_width,
|
||||
unsigned int output_height,
|
||||
const short *vp9_filter
|
||||
);
|
||||
|
||||
extern void vp9_filter_block2d_second_pass_armv6
|
||||
(
|
||||
short *src_ptr,
|
||||
unsigned char *output_ptr,
|
||||
unsigned int output_pitch,
|
||||
unsigned int cnt,
|
||||
const short *vp9_filter
|
||||
);
|
||||
|
||||
extern void vp9_filter4_block2d_second_pass_armv6
|
||||
(
|
||||
short *src_ptr,
|
||||
unsigned char *output_ptr,
|
||||
unsigned int output_pitch,
|
||||
unsigned int cnt,
|
||||
const short *vp9_filter
|
||||
);
|
||||
|
||||
extern void vp9_filter_block2d_first_pass_only_armv6
|
||||
(
|
||||
unsigned char *src_ptr,
|
||||
unsigned char *output_ptr,
|
||||
unsigned int src_pixels_per_line,
|
||||
unsigned int cnt,
|
||||
unsigned int output_pitch,
|
||||
const short *vp9_filter
|
||||
);
|
||||
|
||||
|
||||
extern void vp9_filter_block2d_second_pass_only_armv6
|
||||
(
|
||||
unsigned char *src_ptr,
|
||||
unsigned char *output_ptr,
|
||||
unsigned int src_pixels_per_line,
|
||||
unsigned int cnt,
|
||||
unsigned int output_pitch,
|
||||
const short *vp9_filter
|
||||
);
|
||||
|
||||
#if HAVE_ARMV6
|
||||
void vp9_sixtap_predict_armv6
|
||||
(
|
||||
unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
unsigned char *dst_ptr,
|
||||
int dst_pitch
|
||||
) {
|
||||
const short *HFilter;
|
||||
const short *VFilter;
|
||||
DECLARE_ALIGNED_ARRAY(4, short, FData, 12 * 4); /* Temp data buffer used in filtering */
|
||||
|
||||
|
||||
HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
|
||||
VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
|
||||
|
||||
/* Vfilter is null. First pass only */
|
||||
if (xoffset && !yoffset) {
|
||||
/*vp9_filter_block2d_first_pass_armv6 ( src_ptr, FData+2, src_pixels_per_line, 4, 4, HFilter );
|
||||
vp9_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, VFilter );*/
|
||||
|
||||
vp9_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, HFilter);
|
||||
}
|
||||
/* Hfilter is null. Second pass only */
|
||||
else if (!xoffset && yoffset) {
|
||||
vp9_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, VFilter);
|
||||
} else {
|
||||
/* Vfilter is a 4 tap filter */
|
||||
if (yoffset & 0x1) {
|
||||
vp9_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 4, 7, HFilter);
|
||||
vp9_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
|
||||
}
|
||||
/* Vfilter is 6 tap filter */
|
||||
else {
|
||||
vp9_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 4, 9, HFilter);
|
||||
vp9_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void vp9_sixtap_predict8x8_armv6
|
||||
(
|
||||
unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
unsigned char *dst_ptr,
|
||||
int dst_pitch
|
||||
) {
|
||||
const short *HFilter;
|
||||
const short *VFilter;
|
||||
DECLARE_ALIGNED_ARRAY(4, short, FData, 16 * 8); /* Temp data buffer used in filtering */
|
||||
|
||||
HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
|
||||
VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
|
||||
|
||||
if (xoffset && !yoffset) {
|
||||
vp9_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter);
|
||||
}
|
||||
/* Hfilter is null. Second pass only */
|
||||
else if (!xoffset && yoffset) {
|
||||
vp9_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter);
|
||||
} else {
|
||||
if (yoffset & 0x1) {
|
||||
vp9_filter_block2d_first_pass_8x8_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 8, 11, HFilter);
|
||||
vp9_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
|
||||
} else {
|
||||
vp9_filter_block2d_first_pass_8x8_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, 13, HFilter);
|
||||
vp9_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void vp9_sixtap_predict16x16_armv6
|
||||
(
|
||||
unsigned char *src_ptr,
|
||||
int src_pixels_per_line,
|
||||
int xoffset,
|
||||
int yoffset,
|
||||
unsigned char *dst_ptr,
|
||||
int dst_pitch
|
||||
) {
|
||||
const short *HFilter;
|
||||
const short *VFilter;
|
||||
DECLARE_ALIGNED_ARRAY(4, short, FData, 24 * 16); /* Temp data buffer used in filtering */
|
||||
|
||||
HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
|
||||
VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
|
||||
|
||||
if (xoffset && !yoffset) {
|
||||
vp9_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, HFilter);
|
||||
}
|
||||
/* Hfilter is null. Second pass only */
|
||||
else if (!xoffset && yoffset) {
|
||||
vp9_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, VFilter);
|
||||
} else {
|
||||
if (yoffset & 0x1) {
|
||||
vp9_filter_block2d_first_pass_16x16_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 16, 19, HFilter);
|
||||
vp9_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
|
||||
} else {
|
||||
vp9_filter_block2d_first_pass_16x16_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, 21, HFilter);
|
||||
vp9_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
#endif
|
|
@ -1,65 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef VP9_COMMON_ARM_VP9_IDCT_ARM_H_
|
||||
#define VP9_COMMON_ARM_VP9_IDCT_ARM_H_
|
||||
|
||||
#if HAVE_ARMV6
|
||||
extern prototype_idct(vp9_short_idct4x4llm_1_v6);
|
||||
extern prototype_idct(vp9_short_idct4x4llm_v6_dual);
|
||||
extern prototype_idct_scalar_add(vp9_dc_only_idct_add_v6);
|
||||
extern prototype_second_order(vp9_short_inv_walsh4x4_1_v6);
|
||||
extern prototype_second_order(vp9_short_inv_walsh4x4_v6);
|
||||
|
||||
#if !CONFIG_RUNTIME_CPU_DETECT
|
||||
#undef vp9_idct_idct1
|
||||
#define vp9_idct_idct1 vp9_short_idct4x4llm_1_v6
|
||||
|
||||
#undef vp9_idct_idct16
|
||||
#define vp9_idct_idct16 vp9_short_idct4x4llm_v6_dual
|
||||
|
||||
#undef vp9_idct_idct1_scalar_add
|
||||
#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_v6
|
||||
|
||||
#undef vp8_idct_iwalsh1
|
||||
#define vp8_idct_iwalsh1 vp9_short_inv_walsh4x4_1_v6
|
||||
|
||||
#undef vp8_idct_iwalsh16
|
||||
#define vp8_idct_iwalsh16 vp9_short_inv_walsh4x4_v6
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if HAVE_ARMV7
|
||||
extern prototype_idct(vp9_short_idct4x4llm_1_neon);
|
||||
extern prototype_idct(vp9_short_idct4x4llm_neon);
|
||||
extern prototype_idct_scalar_add(vp9_dc_only_idct_add_neon);
|
||||
extern prototype_second_order(vp9_short_inv_walsh4x4_1_neon);
|
||||
extern prototype_second_order(vp9_short_inv_walsh4x4_neon);
|
||||
|
||||
#if !CONFIG_RUNTIME_CPU_DETECT
|
||||
#undef vp9_idct_idct1
|
||||
#define vp9_idct_idct1 vp9_short_idct4x4llm_1_neon
|
||||
|
||||
#undef vp9_idct_idct16
|
||||
#define vp9_idct_idct16 vp9_short_idct4x4llm_neon
|
||||
|
||||
#undef vp9_idct_idct1_scalar_add
|
||||
#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_neon
|
||||
|
||||
#undef vp8_idct_iwalsh1
|
||||
#define vp8_idct_iwalsh1 vp9_short_inv_walsh4x4_1_neon
|
||||
|
||||
#undef vp8_idct_iwalsh16
|
||||
#define vp8_idct_iwalsh16 vp9_short_inv_walsh4x4_neon
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif
|
|
@ -1,166 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
|
||||
#include "vpx_config.h"
|
||||
#include "vp9/common/vp9_loopfilter.h"
|
||||
#include "vp9/common/vp9_onyxc_int.h"
|
||||
|
||||
#if HAVE_ARMV6
|
||||
extern prototype_loopfilter(vp9_loop_filter_horizontal_edge_armv6);
|
||||
extern prototype_loopfilter(vp9_loop_filter_vertical_edge_armv6);
|
||||
extern prototype_loopfilter(vp9_mbloop_filter_horizontal_edge_armv6);
|
||||
extern prototype_loopfilter(vp9_mbloop_filter_vertical_edge_armv6);
|
||||
#endif
|
||||
|
||||
#if HAVE_ARMV7
|
||||
typedef void loopfilter_y_neon(unsigned char *src, int pitch,
|
||||
unsigned char blimit, unsigned char limit, unsigned char thresh);
|
||||
typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
|
||||
unsigned char blimit, unsigned char limit, unsigned char thresh,
|
||||
unsigned char *v);
|
||||
|
||||
extern loopfilter_y_neon vp9_loop_filter_horizontal_edge_y_neon;
|
||||
extern loopfilter_y_neon vp9_loop_filter_vertical_edge_y_neon;
|
||||
extern loopfilter_y_neon vp9_mbloop_filter_horizontal_edge_y_neon;
|
||||
extern loopfilter_y_neon vp9_mbloop_filter_vertical_edge_y_neon;
|
||||
|
||||
extern loopfilter_uv_neon vp9_loop_filter_horizontal_edge_uv_neon;
|
||||
extern loopfilter_uv_neon vp9_loop_filter_vertical_edge_uv_neon;
|
||||
extern loopfilter_uv_neon vp9_mbloop_filter_horizontal_edge_uv_neon;
|
||||
extern loopfilter_uv_neon vp9_mbloop_filter_vertical_edge_uv_neon;
|
||||
#endif
|
||||
|
||||
#if HAVE_ARMV6
|
||||
/*ARMV6 loopfilter functions*/
|
||||
/* Horizontal MB filtering */
|
||||
void vp9_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
|
||||
int y_stride, int uv_stride, loop_filter_info *lfi) {
|
||||
vp9_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
|
||||
|
||||
if (u_ptr)
|
||||
vp9_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
|
||||
|
||||
if (v_ptr)
|
||||
vp9_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
|
||||
}
|
||||
|
||||
/* Vertical MB Filtering */
|
||||
void vp9_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
|
||||
int y_stride, int uv_stride, loop_filter_info *lfi) {
|
||||
vp9_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
|
||||
|
||||
if (u_ptr)
|
||||
vp9_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
|
||||
|
||||
if (v_ptr)
|
||||
vp9_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
|
||||
}
|
||||
|
||||
/* Horizontal B Filtering */
|
||||
void vp9_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
|
||||
int y_stride, int uv_stride, loop_filter_info *lfi) {
|
||||
vp9_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
|
||||
vp9_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
|
||||
vp9_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
|
||||
|
||||
if (u_ptr)
|
||||
vp9_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
|
||||
|
||||
if (v_ptr)
|
||||
vp9_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
|
||||
}
|
||||
|
||||
void vp9_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride,
|
||||
const unsigned char *blimit) {
|
||||
vp9_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, blimit);
|
||||
vp9_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, blimit);
|
||||
vp9_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, blimit);
|
||||
}
|
||||
|
||||
/* Vertical B Filtering */
|
||||
void vp9_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
|
||||
int y_stride, int uv_stride, loop_filter_info *lfi) {
|
||||
vp9_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
|
||||
vp9_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
|
||||
vp9_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
|
||||
|
||||
if (u_ptr)
|
||||
vp9_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
|
||||
|
||||
if (v_ptr)
|
||||
vp9_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
|
||||
}
|
||||
|
||||
void vp9_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride,
|
||||
const unsigned char *blimit) {
|
||||
vp9_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit);
|
||||
vp9_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit);
|
||||
vp9_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if HAVE_ARMV7
|
||||
/* NEON loopfilter functions */
|
||||
/* Horizontal MB filtering */
|
||||
void vp9_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
|
||||
int y_stride, int uv_stride, loop_filter_info *lfi) {
|
||||
unsigned char mblim = *lfi->mblim;
|
||||
unsigned char lim = *lfi->lim;
|
||||
unsigned char hev_thr = *lfi->hev_thr;
|
||||
vp9_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
|
||||
|
||||
if (u_ptr)
|
||||
vp9_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
|
||||
}
|
||||
|
||||
/* Vertical MB Filtering */
|
||||
void vp9_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
|
||||
int y_stride, int uv_stride, loop_filter_info *lfi) {
|
||||
unsigned char mblim = *lfi->mblim;
|
||||
unsigned char lim = *lfi->lim;
|
||||
unsigned char hev_thr = *lfi->hev_thr;
|
||||
|
||||
vp9_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
|
||||
|
||||
if (u_ptr)
|
||||
vp9_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
|
||||
}
|
||||
|
||||
/* Horizontal B Filtering */
|
||||
void vp9_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
|
||||
int y_stride, int uv_stride, loop_filter_info *lfi) {
|
||||
unsigned char blim = *lfi->blim;
|
||||
unsigned char lim = *lfi->lim;
|
||||
unsigned char hev_thr = *lfi->hev_thr;
|
||||
|
||||
vp9_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim, lim, hev_thr);
|
||||
vp9_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim, lim, hev_thr);
|
||||
vp9_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim, lim, hev_thr);
|
||||
|
||||
if (u_ptr)
|
||||
vp9_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, blim, lim, hev_thr, v_ptr + 4 * uv_stride);
|
||||
}
|
||||
|
||||
/* Vertical B Filtering */
|
||||
void vp9_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
|
||||
int y_stride, int uv_stride, loop_filter_info *lfi) {
|
||||
unsigned char blim = *lfi->blim;
|
||||
unsigned char lim = *lfi->lim;
|
||||
unsigned char hev_thr = *lfi->hev_thr;
|
||||
|
||||
vp9_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, blim, lim, hev_thr);
|
||||
vp9_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, blim, lim, hev_thr);
|
||||
vp9_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, blim, lim, hev_thr);
|
||||
|
||||
if (u_ptr)
|
||||
vp9_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, hev_thr, v_ptr + 4);
|
||||
}
|
||||
#endif
|
|
@ -1,41 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef VP9_COMMON_ARM_VP9_LOOPFILTER_ARM_H_
|
||||
#define VP9_COMMON_ARM_VP9_LOOPFILTER_ARM_H_
|
||||
|
||||
#include "vpx_config.h"
|
||||
|
||||
#if HAVE_ARMV6
|
||||
extern prototype_loopfilter_block(vp9_loop_filter_mbv_armv6);
|
||||
extern prototype_loopfilter_block(vp9_loop_filter_bv_armv6);
|
||||
extern prototype_loopfilter_block(vp9_loop_filter_mbh_armv6);
|
||||
extern prototype_loopfilter_block(vp9_loop_filter_bh_armv6);
|
||||
extern prototype_simple_loopfilter(vp9_loop_filter_bvs_armv6);
|
||||
extern prototype_simple_loopfilter(vp9_loop_filter_bhs_armv6);
|
||||
extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_armv6);
|
||||
extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_armv6);
|
||||
|
||||
#endif /* HAVE_ARMV6 */
|
||||
|
||||
#if HAVE_ARMV7
|
||||
extern prototype_loopfilter_block(vp9_loop_filter_mbv_neon);
|
||||
extern prototype_loopfilter_block(vp9_loop_filter_bv_neon);
|
||||
extern prototype_loopfilter_block(vp9_loop_filter_mbh_neon);
|
||||
extern prototype_loopfilter_block(vp9_loop_filter_bh_neon);
|
||||
extern prototype_simple_loopfilter(vp9_loop_filter_mbvs_neon);
|
||||
extern prototype_simple_loopfilter(vp9_loop_filter_bvs_neon);
|
||||
extern prototype_simple_loopfilter(vp9_loop_filter_mbhs_neon);
|
||||
extern prototype_simple_loopfilter(vp9_loop_filter_bhs_neon);
|
||||
|
||||
#endif /* HAVE_ARMV7 */
|
||||
|
||||
#endif /* LOOPFILTER_ARM_H */
|
|
@ -1,90 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef VP9_COMMON_ARM_VP9_RECON_ARM_H_
|
||||
#define VP9_COMMON_ARM_VP9_RECON_ARM_H_
|
||||
|
||||
#if HAVE_ARMV6
|
||||
extern prototype_recon_block(vp9_recon_b_armv6);
|
||||
extern prototype_recon_block(vp9_recon2b_armv6);
|
||||
extern prototype_recon_block(vp9_recon4b_armv6);
|
||||
|
||||
extern prototype_copy_block(vp9_copy_mem8x8_v6);
|
||||
extern prototype_copy_block(vp9_copy_mem8x4_v6);
|
||||
extern prototype_copy_block(vp9_copy_mem16x16_v6);
|
||||
|
||||
#if !CONFIG_RUNTIME_CPU_DETECT
|
||||
#undef vp8_recon_recon
|
||||
#define vp8_recon_recon vp9_recon_b_armv6
|
||||
|
||||
#undef vp8_recon_recon2
|
||||
#define vp8_recon_recon2 vp9_recon2b_armv6
|
||||
|
||||
#undef vp8_recon_recon4
|
||||
#define vp8_recon_recon4 vp9_recon4b_armv6
|
||||
|
||||
#undef vp8_recon_copy8x8
|
||||
#define vp8_recon_copy8x8 vp9_copy_mem8x8_v6
|
||||
|
||||
#undef vp8_recon_copy8x4
|
||||
#define vp8_recon_copy8x4 vp9_copy_mem8x4_v6
|
||||
|
||||
#undef vp8_recon_copy16x16
|
||||
#define vp8_recon_copy16x16 vp9_copy_mem16x16_v6
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if HAVE_ARMV7
|
||||
extern prototype_recon_block(vp9_recon_b_neon);
|
||||
extern prototype_recon_block(vp9_recon2b_neon);
|
||||
extern prototype_recon_block(vp9_recon4b_neon);
|
||||
|
||||
extern prototype_copy_block(vp9_copy_mem8x8_neon);
|
||||
extern prototype_copy_block(vp9_copy_mem8x4_neon);
|
||||
extern prototype_copy_block(vp9_copy_mem16x16_neon);
|
||||
|
||||
extern prototype_recon_macroblock(vp9_recon_mb_neon);
|
||||
|
||||
extern prototype_build_intra_predictors(vp9_build_intra_predictors_mby_neon);
|
||||
extern prototype_build_intra_predictors(vp9_build_intra_predictors_mby_s_neon);
|
||||
|
||||
#if !CONFIG_RUNTIME_CPU_DETECT
|
||||
#undef vp8_recon_recon
|
||||
#define vp8_recon_recon vp9_recon_b_neon
|
||||
|
||||
#undef vp8_recon_recon2
|
||||
#define vp8_recon_recon2 vp9_recon2b_neon
|
||||
|
||||
#undef vp8_recon_recon4
|
||||
#define vp8_recon_recon4 vp9_recon4b_neon
|
||||
|
||||
#undef vp8_recon_copy8x8
|
||||
#define vp8_recon_copy8x8 vp9_copy_mem8x8_neon
|
||||
|
||||
#undef vp8_recon_copy8x4
|
||||
#define vp8_recon_copy8x4 vp9_copy_mem8x4_neon
|
||||
|
||||
#undef vp8_recon_copy16x16
|
||||
#define vp8_recon_copy16x16 vp9_copy_mem16x16_neon
|
||||
|
||||
#undef vp8_recon_recon_mb
|
||||
#define vp8_recon_recon_mb vp9_recon_mb_neon
|
||||
|
||||
#undef vp9_recon_build_intra_predictors_mby
|
||||
#define vp9_recon_build_intra_predictors_mby vp9_build_intra_predictors_mby_neon
|
||||
|
||||
#undef vp9_recon_build_intra_predictors_mby_s
|
||||
#define vp9_recon_build_intra_predictors_mby_s vp9_build_intra_predictors_mby_s_neon
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif
|
|
@ -1,62 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
|
||||
#include "vpx_ports/config.h"
|
||||
#include "vp9/common/vp9_blockd.h"
|
||||
#include "vp9/common/vp9_reconintra.h"
|
||||
#include "vpx_mem/vpx_mem.h"
|
||||
#include "vp9/common/recon.h"
|
||||
|
||||
#if HAVE_ARMV7
|
||||
extern void vp9_build_intra_predictors_mby_neon_func(
|
||||
unsigned char *y_buffer,
|
||||
unsigned char *ypred_ptr,
|
||||
int y_stride,
|
||||
int mode,
|
||||
int Up,
|
||||
int Left);
|
||||
|
||||
void vp9_build_intra_predictors_mby_neon(MACROBLOCKD *xd) {
|
||||
unsigned char *y_buffer = xd->dst.y_buffer;
|
||||
unsigned char *ypred_ptr = xd->predictor;
|
||||
int y_stride = xd->dst.y_stride;
|
||||
int mode = xd->mode_info_context->mbmi.mode;
|
||||
int Up = xd->up_available;
|
||||
int Left = xd->left_available;
|
||||
|
||||
vp9_build_intra_predictors_mby_neon_func(y_buffer, ypred_ptr,
|
||||
y_stride, mode, Up, Left);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#if HAVE_ARMV7
|
||||
extern void vp9_build_intra_predictors_mby_s_neon_func(
|
||||
unsigned char *y_buffer,
|
||||
unsigned char *ypred_ptr,
|
||||
int y_stride,
|
||||
int mode,
|
||||
int Up,
|
||||
int Left);
|
||||
|
||||
void vp9_build_intra_predictors_mby_s_neon(MACROBLOCKD *xd) {
|
||||
unsigned char *y_buffer = xd->dst.y_buffer;
|
||||
unsigned char *ypred_ptr = xd->predictor;
|
||||
int y_stride = xd->dst.y_stride;
|
||||
int mode = xd->mode_info_context->mbmi.mode;
|
||||
int Up = xd->up_available;
|
||||
int Left = xd->left_available;
|
||||
|
||||
vp9_build_intra_predictors_mby_s_neon_func(y_buffer, ypred_ptr,
|
||||
y_stride, mode, Up, Left);
|
||||
}
|
||||
|
||||
#endif
|
|
@ -1,89 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef VP9_COMMON_ARM_VP9_SUBPIXEL_ARM_H_
|
||||
#define VP9_COMMON_ARM_VP9_SUBPIXEL_ARM_H_
|
||||
|
||||
#if HAVE_ARMV6
|
||||
extern prototype_subpixel_predict(vp9_sixtap_predict16x16_armv6);
|
||||
extern prototype_subpixel_predict(vp9_sixtap_predict8x8_armv6);
|
||||
extern prototype_subpixel_predict(vp9_sixtap_predict8x4_armv6);
|
||||
extern prototype_subpixel_predict(vp9_sixtap_predict_armv6);
|
||||
extern prototype_subpixel_predict(vp9_bilinear_predict16x16_armv6);
|
||||
extern prototype_subpixel_predict(vp9_bilinear_predict8x8_armv6);
|
||||
extern prototype_subpixel_predict(vp9_bilinear_predict8x4_armv6);
|
||||
extern prototype_subpixel_predict(vp9_bilinear_predict4x4_armv6);
|
||||
|
||||
#if !CONFIG_RUNTIME_CPU_DETECT
|
||||
#undef vp9_subpix_sixtap16x16
|
||||
#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_armv6
|
||||
|
||||
#undef vp9_subpix_sixtap8x8
|
||||
#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_armv6
|
||||
|
||||
#undef vp9_subpix_sixtap8x4
|
||||
#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_armv6
|
||||
|
||||
#undef vp9_subpix_sixtap4x4
|
||||
#define vp9_subpix_sixtap4x4 vp9_sixtap_predict_armv6
|
||||
|
||||
#undef vp9_subpix_bilinear16x16
|
||||
#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_armv6
|
||||
|
||||
#undef vp9_subpix_bilinear8x8
|
||||
#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_armv6
|
||||
|
||||
#undef vp9_subpix_bilinear8x4
|
||||
#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_armv6
|
||||
|
||||
#undef vp9_subpix_bilinear4x4
|
||||
#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_armv6
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if HAVE_ARMV7
|
||||
extern prototype_subpixel_predict(vp9_sixtap_predict16x16_neon);
|
||||
extern prototype_subpixel_predict(vp9_sixtap_predict8x8_neon);
|
||||
extern prototype_subpixel_predict(vp9_sixtap_predict8x4_neon);
|
||||
extern prototype_subpixel_predict(vp9_sixtap_predict_neon);
|
||||
extern prototype_subpixel_predict(vp9_bilinear_predict16x16_neon);
|
||||
extern prototype_subpixel_predict(vp9_bilinear_predict8x8_neon);
|
||||
extern prototype_subpixel_predict(vp9_bilinear_predict8x4_neon);
|
||||
extern prototype_subpixel_predict(vp9_bilinear_predict4x4_neon);
|
||||
|
||||
#if !CONFIG_RUNTIME_CPU_DETECT
|
||||
#undef vp9_subpix_sixtap16x16
|
||||
#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_neon
|
||||
|
||||
#undef vp9_subpix_sixtap8x8
|
||||
#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_neon
|
||||
|
||||
#undef vp9_subpix_sixtap8x4
|
||||
#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_neon
|
||||
|
||||
#undef vp9_subpix_sixtap4x4
|
||||
#define vp9_subpix_sixtap4x4 vp9_sixtap_predict_neon
|
||||
|
||||
#undef vp9_subpix_bilinear16x16
|
||||
#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_neon
|
||||
|
||||
#undef vp9_subpix_bilinear8x8
|
||||
#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_neon
|
||||
|
||||
#undef vp9_subpix_bilinear8x4
|
||||
#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_neon
|
||||
|
||||
#undef vp9_subpix_bilinear4x4
|
||||
#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_neon
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif
|
|
@ -12,29 +12,10 @@
|
|||
#include "vpx_config.h"
|
||||
#include "vpx/vpx_codec.h"
|
||||
#include "vpx_ports/asm_offsets.h"
|
||||
#include "vpx_scale/yv12config.h"
|
||||
|
||||
BEGIN
|
||||
|
||||
/* vpx_scale */
|
||||
DEFINE(yv12_buffer_config_y_width, offsetof(YV12_BUFFER_CONFIG, y_width));
|
||||
DEFINE(yv12_buffer_config_y_height, offsetof(YV12_BUFFER_CONFIG, y_height));
|
||||
DEFINE(yv12_buffer_config_y_stride, offsetof(YV12_BUFFER_CONFIG, y_stride));
|
||||
DEFINE(yv12_buffer_config_uv_width, offsetof(YV12_BUFFER_CONFIG, uv_width));
|
||||
DEFINE(yv12_buffer_config_uv_height, offsetof(YV12_BUFFER_CONFIG, uv_height));
|
||||
DEFINE(yv12_buffer_config_uv_stride, offsetof(YV12_BUFFER_CONFIG, uv_stride));
|
||||
DEFINE(yv12_buffer_config_y_buffer, offsetof(YV12_BUFFER_CONFIG, y_buffer));
|
||||
DEFINE(yv12_buffer_config_u_buffer, offsetof(YV12_BUFFER_CONFIG, u_buffer));
|
||||
DEFINE(yv12_buffer_config_v_buffer, offsetof(YV12_BUFFER_CONFIG, v_buffer));
|
||||
DEFINE(yv12_buffer_config_border, offsetof(YV12_BUFFER_CONFIG, border));
|
||||
DEFINE(VP9BORDERINPIXELS_VAL, VP9BORDERINPIXELS);
|
||||
|
||||
END
|
||||
|
||||
/* add asserts for any offset that is not supported by assembly code */
|
||||
/* add asserts for any size that is not supported by assembly code */
|
||||
|
||||
#if HAVE_ARMV7
|
||||
/* vp8_yv12_extend_frame_borders_neon makes several assumptions based on this */
|
||||
ct_assert(VP9BORDERINPIXELS_VAL, VP9BORDERINPIXELS == 32)
|
||||
#endif
|
||||
|
|
|
@ -22,11 +22,7 @@ typedef enum {
|
|||
SIMPLE_LOOPFILTER = 1
|
||||
} LOOPFILTERTYPE;
|
||||
|
||||
#if ARCH_ARM
|
||||
#define SIMD_WIDTH 1
|
||||
#else
|
||||
#define SIMD_WIDTH 16
|
||||
#endif
|
||||
|
||||
/* Need to align this structure so when it is declared and
|
||||
* passed it can be loaded into vector registers.
|
||||
|
@ -67,10 +63,6 @@ struct loop_filter_info {
|
|||
#include "x86/vp9_loopfilter_x86.h"
|
||||
#endif
|
||||
|
||||
#if ARCH_ARM
|
||||
#include "arm/vp9_loopfilter_arm.h"
|
||||
#endif
|
||||
|
||||
typedef void loop_filter_uvfunction(unsigned char *u, /* source pointer */
|
||||
int p, /* pitch */
|
||||
const unsigned char *blimit,
|
||||
|
|
|
@ -79,13 +79,11 @@ specialize vp9_dequant_idct_add_uv_block mmx
|
|||
# RECON
|
||||
#
|
||||
prototype void vp9_copy_mem16x16 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
|
||||
specialize vp9_copy_mem16x16 mmx sse2 media neon dspr2
|
||||
vp9_copy_mem16x16_media=vp9_copy_mem16x16_v6
|
||||
specialize vp9_copy_mem16x16 mmx sse2 dspr2
|
||||
vp9_copy_mem16x16_dspr2=vp9_copy_mem16x16_dspr2
|
||||
|
||||
prototype void vp9_copy_mem8x8 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
|
||||
specialize vp9_copy_mem8x8 mmx media neon dspr2
|
||||
vp9_copy_mem8x8_media=vp9_copy_mem8x8_v6
|
||||
specialize vp9_copy_mem8x8 mmx dspr2
|
||||
vp9_copy_mem8x8_dspr2=vp9_copy_mem8x8_dspr2
|
||||
|
||||
prototype void vp9_copy_mem8x4 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
|
||||
|
@ -98,8 +96,7 @@ prototype void vp9_avg_mem8x8 "unsigned char *src, int src_pitch, unsigned char
|
|||
specialize vp9_avg_mem8x8
|
||||
|
||||
prototype void vp9_copy_mem8x4 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
|
||||
specialize vp9_copy_mem8x4 mmx media neon dspr2
|
||||
vp9_copy_mem8x4_media=vp9_copy_mem8x4_v6
|
||||
specialize vp9_copy_mem8x4 mmx dspr2
|
||||
vp9_copy_mem8x4_dspr2=vp9_copy_mem8x4_dspr2
|
||||
|
||||
prototype void vp9_recon_b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
|
||||
|
@ -193,36 +190,28 @@ prototype void vp9_loop_filter_bh8x8 "unsigned char *y, unsigned char *u, unsign
|
|||
specialize vp9_loop_filter_bh8x8 sse2
|
||||
|
||||
prototype void vp9_loop_filter_simple_mbv "unsigned char *y, int ystride, const unsigned char *blimit"
|
||||
specialize vp9_loop_filter_simple_mbv mmx sse2 media neon
|
||||
specialize vp9_loop_filter_simple_mbv mmx sse2
|
||||
vp9_loop_filter_simple_mbv_c=vp9_loop_filter_simple_vertical_edge_c
|
||||
vp9_loop_filter_simple_mbv_mmx=vp9_loop_filter_simple_vertical_edge_mmx
|
||||
vp9_loop_filter_simple_mbv_sse2=vp9_loop_filter_simple_vertical_edge_sse2
|
||||
vp9_loop_filter_simple_mbv_media=vp9_loop_filter_simple_vertical_edge_armv6
|
||||
vp9_loop_filter_simple_mbv_neon=vp9_loop_filter_mbvs_neon
|
||||
|
||||
prototype void vp9_loop_filter_simple_mbh "unsigned char *y, int ystride, const unsigned char *blimit"
|
||||
specialize vp9_loop_filter_simple_mbh mmx sse2 media neon
|
||||
specialize vp9_loop_filter_simple_mbh mmx sse2
|
||||
vp9_loop_filter_simple_mbh_c=vp9_loop_filter_simple_horizontal_edge_c
|
||||
vp9_loop_filter_simple_mbh_mmx=vp9_loop_filter_simple_horizontal_edge_mmx
|
||||
vp9_loop_filter_simple_mbh_sse2=vp9_loop_filter_simple_horizontal_edge_sse2
|
||||
vp9_loop_filter_simple_mbh_media=vp9_loop_filter_simple_horizontal_edge_armv6
|
||||
vp9_loop_filter_simple_mbh_neon=vp9_loop_filter_mbhs_neon
|
||||
|
||||
prototype void vp9_loop_filter_simple_bv "unsigned char *y, int ystride, const unsigned char *blimit"
|
||||
specialize vp9_loop_filter_simple_bv mmx sse2 media neon
|
||||
specialize vp9_loop_filter_simple_bv mmx sse2
|
||||
vp9_loop_filter_simple_bv_c=vp9_loop_filter_bvs_c
|
||||
vp9_loop_filter_simple_bv_mmx=vp9_loop_filter_bvs_mmx
|
||||
vp9_loop_filter_simple_bv_sse2=vp9_loop_filter_bvs_sse2
|
||||
vp9_loop_filter_simple_bv_media=vp9_loop_filter_bvs_armv6
|
||||
vp9_loop_filter_simple_bv_neon=vp9_loop_filter_bvs_neon
|
||||
|
||||
prototype void vp9_loop_filter_simple_bh "unsigned char *y, int ystride, const unsigned char *blimit"
|
||||
specialize vp9_loop_filter_simple_bh mmx sse2 media neon
|
||||
specialize vp9_loop_filter_simple_bh mmx sse2
|
||||
vp9_loop_filter_simple_bh_c=vp9_loop_filter_bhs_c
|
||||
vp9_loop_filter_simple_bh_mmx=vp9_loop_filter_bhs_mmx
|
||||
vp9_loop_filter_simple_bh_sse2=vp9_loop_filter_bhs_sse2
|
||||
vp9_loop_filter_simple_bh_media=vp9_loop_filter_bhs_armv6
|
||||
vp9_loop_filter_simple_bh_neon=vp9_loop_filter_bhs_neon
|
||||
|
||||
#
|
||||
# post proc
|
||||
|
@ -683,7 +672,7 @@ prototype void vp9_temporal_filter_apply "unsigned char *frame1, unsigned int st
|
|||
specialize vp9_temporal_filter_apply sse2
|
||||
|
||||
prototype void vp9_yv12_copy_partial_frame "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int fraction"
|
||||
specialize vp9_yv12_copy_partial_frame neon
|
||||
specialize vp9_yv12_copy_partial_frame
|
||||
|
||||
|
||||
fi
|
||||
|
@ -716,11 +705,11 @@ if [ "$CONFIG_SPATIAL_RESAMPLING" = "yes" ]; then
|
|||
fi
|
||||
|
||||
prototype void vp8_yv12_extend_frame_borders "struct yv12_buffer_config *ybf"
|
||||
specialize vp8_yv12_extend_frame_borders neon
|
||||
specialize vp8_yv12_extend_frame_borders
|
||||
|
||||
prototype void vp8_yv12_copy_frame "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc"
|
||||
specialize vp8_yv12_copy_frame neon
|
||||
specialize vp8_yv12_copy_frame
|
||||
|
||||
prototype void vp8_yv12_copy_y "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc"
|
||||
specialize vp8_yv12_copy_y neon
|
||||
specialize vp8_yv12_copy_y
|
||||
|
||||
|
|
|
@ -1,218 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license and patent
|
||||
; grant that can be found in the LICENSE file in the root of the source
|
||||
; tree. All contributing project authors may be found in the AUTHORS
|
||||
; file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_dequant_dc_idct_add_v6|
|
||||
|
||||
AREA |.text|, CODE, READONLY
|
||||
|
||||
;void vp8_dequant_dc_idct_v6(short *input, short *dq, unsigned char *pred,
|
||||
; unsigned char *dest, int pitch, int stride, int Dc)
|
||||
; r0 = input
|
||||
; r1 = dq
|
||||
; r2 = pred
|
||||
; r3 = dest
|
||||
; sp + 36 = pitch ; +4 = 40
|
||||
; sp + 40 = stride ; +4 = 44
|
||||
; sp + 44 = Dc ; +4 = 48
|
||||
|
||||
|
||||
|vp8_dequant_dc_idct_add_v6| PROC
|
||||
stmdb sp!, {r4-r11, lr}
|
||||
|
||||
ldr r6, [sp, #44]
|
||||
|
||||
ldr r4, [r0] ;input
|
||||
ldr r5, [r1], #4 ;dq
|
||||
|
||||
sub sp, sp, #4
|
||||
str r3, [sp]
|
||||
|
||||
smultt r7, r4, r5
|
||||
|
||||
ldr r4, [r0, #4] ;input
|
||||
ldr r5, [r1], #4 ;dq
|
||||
|
||||
strh r6, [r0], #2
|
||||
strh r7, [r0], #2
|
||||
|
||||
smulbb r6, r4, r5
|
||||
smultt r7, r4, r5
|
||||
|
||||
ldr r4, [r0, #4] ;input
|
||||
ldr r5, [r1], #4 ;dq
|
||||
|
||||
strh r6, [r0], #2
|
||||
strh r7, [r0], #2
|
||||
|
||||
mov r12, #3
|
||||
|
||||
vp8_dequant_dc_add_loop
|
||||
smulbb r6, r4, r5
|
||||
smultt r7, r4, r5
|
||||
|
||||
ldr r4, [r0, #4] ;input
|
||||
ldr r5, [r1], #4 ;dq
|
||||
|
||||
strh r6, [r0], #2
|
||||
strh r7, [r0], #2
|
||||
|
||||
smulbb r6, r4, r5
|
||||
smultt r7, r4, r5
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
ldrne r4, [r0, #4]
|
||||
ldrne r5, [r1], #4
|
||||
|
||||
strh r6, [r0], #2
|
||||
strh r7, [r0], #2
|
||||
|
||||
bne vp8_dequant_dc_add_loop
|
||||
|
||||
sub r0, r0, #32
|
||||
mov r1, r0
|
||||
|
||||
; short_idct4x4llm_v6_dual
|
||||
ldr r3, cospi8sqrt2minus1
|
||||
ldr r4, sinpi8sqrt2
|
||||
ldr r6, [r0, #8]
|
||||
mov r5, #2
|
||||
vp8_dequant_dc_idct_loop1_v6
|
||||
ldr r12, [r0, #24]
|
||||
ldr r14, [r0, #16]
|
||||
smulwt r9, r3, r6
|
||||
smulwb r7, r3, r6
|
||||
smulwt r10, r4, r6
|
||||
smulwb r8, r4, r6
|
||||
pkhbt r7, r7, r9, lsl #16
|
||||
smulwt r11, r3, r12
|
||||
pkhbt r8, r8, r10, lsl #16
|
||||
uadd16 r6, r6, r7
|
||||
smulwt r7, r4, r12
|
||||
smulwb r9, r3, r12
|
||||
smulwb r10, r4, r12
|
||||
subs r5, r5, #1
|
||||
pkhbt r9, r9, r11, lsl #16
|
||||
ldr r11, [r0], #4
|
||||
pkhbt r10, r10, r7, lsl #16
|
||||
uadd16 r7, r12, r9
|
||||
usub16 r7, r8, r7
|
||||
uadd16 r6, r6, r10
|
||||
uadd16 r10, r11, r14
|
||||
usub16 r8, r11, r14
|
||||
uadd16 r9, r10, r6
|
||||
usub16 r10, r10, r6
|
||||
uadd16 r6, r8, r7
|
||||
usub16 r7, r8, r7
|
||||
str r6, [r1, #8]
|
||||
ldrne r6, [r0, #8]
|
||||
str r7, [r1, #16]
|
||||
str r10, [r1, #24]
|
||||
str r9, [r1], #4
|
||||
bne vp8_dequant_dc_idct_loop1_v6
|
||||
|
||||
mov r5, #2
|
||||
sub r0, r1, #8
|
||||
vp8_dequant_dc_idct_loop2_v6
|
||||
ldr r6, [r0], #4
|
||||
ldr r7, [r0], #4
|
||||
ldr r8, [r0], #4
|
||||
ldr r9, [r0], #4
|
||||
smulwt r1, r3, r6
|
||||
smulwt r12, r4, r6
|
||||
smulwt lr, r3, r8
|
||||
smulwt r10, r4, r8
|
||||
pkhbt r11, r8, r6, lsl #16
|
||||
pkhbt r1, lr, r1, lsl #16
|
||||
pkhbt r12, r10, r12, lsl #16
|
||||
pkhtb r6, r6, r8, asr #16
|
||||
uadd16 r6, r1, r6
|
||||
pkhbt lr, r9, r7, lsl #16
|
||||
uadd16 r10, r11, lr
|
||||
usub16 lr, r11, lr
|
||||
pkhtb r8, r7, r9, asr #16
|
||||
subs r5, r5, #1
|
||||
smulwt r1, r3, r8
|
||||
smulwb r7, r3, r8
|
||||
smulwt r11, r4, r8
|
||||
smulwb r9, r4, r8
|
||||
pkhbt r1, r7, r1, lsl #16
|
||||
uadd16 r8, r1, r8
|
||||
pkhbt r11, r9, r11, lsl #16
|
||||
usub16 r1, r12, r8
|
||||
uadd16 r8, r11, r6
|
||||
ldr r9, c0x00040004
|
||||
ldr r12, [sp, #40]
|
||||
uadd16 r6, r10, r8
|
||||
usub16 r7, r10, r8
|
||||
uadd16 r7, r7, r9
|
||||
uadd16 r6, r6, r9
|
||||
uadd16 r10, r14, r1
|
||||
usub16 r1, r14, r1
|
||||
uadd16 r10, r10, r9
|
||||
uadd16 r1, r1, r9
|
||||
ldr r11, [r2], r12
|
||||
mov r8, r7, asr #3
|
||||
pkhtb r9, r8, r10, asr #19
|
||||
mov r8, r1, asr #3
|
||||
pkhtb r8, r8, r6, asr #19
|
||||
uxtb16 lr, r11, ror #8
|
||||
qadd16 r9, r9, lr
|
||||
uxtb16 lr, r11
|
||||
qadd16 r8, r8, lr
|
||||
usat16 r9, #8, r9
|
||||
usat16 r8, #8, r8
|
||||
orr r9, r8, r9, lsl #8
|
||||
ldr r11, [r2], r12
|
||||
ldr lr, [sp]
|
||||
ldr r12, [sp, #44]
|
||||
mov r7, r7, lsl #16
|
||||
mov r1, r1, lsl #16
|
||||
mov r10, r10, lsl #16
|
||||
mov r6, r6, lsl #16
|
||||
mov r7, r7, asr #3
|
||||
pkhtb r7, r7, r10, asr #19
|
||||
mov r1, r1, asr #3
|
||||
pkhtb r1, r1, r6, asr #19
|
||||
uxtb16 r8, r11, ror #8
|
||||
qadd16 r7, r7, r8
|
||||
uxtb16 r8, r11
|
||||
qadd16 r1, r1, r8
|
||||
usat16 r7, #8, r7
|
||||
usat16 r1, #8, r1
|
||||
orr r1, r1, r7, lsl #8
|
||||
str r9, [lr], r12
|
||||
str r1, [lr], r12
|
||||
str lr, [sp]
|
||||
bne vp8_dequant_dc_idct_loop2_v6
|
||||
|
||||
; vpx_memset
|
||||
sub r0, r0, #32
|
||||
add sp, sp, #4
|
||||
|
||||
mov r12, #0
|
||||
str r12, [r0]
|
||||
str r12, [r0, #4]
|
||||
str r12, [r0, #8]
|
||||
str r12, [r0, #12]
|
||||
str r12, [r0, #16]
|
||||
str r12, [r0, #20]
|
||||
str r12, [r0, #24]
|
||||
str r12, [r0, #28]
|
||||
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
ENDP ; |vp8_dequant_dc_idct_add_v6|
|
||||
|
||||
; Constant Pool
|
||||
cospi8sqrt2minus1 DCD 0x00004E7B
|
||||
sinpi8sqrt2 DCD 0x00008A8C
|
||||
c0x00040004 DCD 0x00040004
|
||||
|
||||
END
|
|
@ -1,196 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license and patent
|
||||
; grant that can be found in the LICENSE file in the root of the source
|
||||
; tree. All contributing project authors may be found in the AUTHORS
|
||||
; file in the root of the source tree.
|
||||
;
|
||||
|
||||
EXPORT |vp8_dequant_idct_add_v6|
|
||||
|
||||
AREA |.text|, CODE, READONLY
|
||||
;void vp8_dequant_idct_v6(short *input, short *dq, unsigned char *pred,
|
||||
; unsigned char *dest, int pitch, int stride)
|
||||
; r0 = input
|
||||
; r1 = dq
|
||||
; r2 = pred
|
||||
; r3 = dest
|
||||
; sp + 36 = pitch ; +4 = 40
|
||||
; sp + 40 = stride ; +4 = 44
|
||||
|
||||
|
||||
|vp8_dequant_idct_add_v6| PROC
|
||||
stmdb sp!, {r4-r11, lr}
|
||||
|
||||
ldr r4, [r0] ;input
|
||||
ldr r5, [r1], #4 ;dq
|
||||
|
||||
sub sp, sp, #4
|
||||
str r3, [sp]
|
||||
|
||||
mov r12, #4
|
||||
|
||||
vp8_dequant_add_loop
|
||||
smulbb r6, r4, r5
|
||||
smultt r7, r4, r5
|
||||
|
||||
ldr r4, [r0, #4] ;input
|
||||
ldr r5, [r1], #4 ;dq
|
||||
|
||||
strh r6, [r0], #2
|
||||
strh r7, [r0], #2
|
||||
|
||||
smulbb r6, r4, r5
|
||||
smultt r7, r4, r5
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
ldrne r4, [r0, #4]
|
||||
ldrne r5, [r1], #4
|
||||
|
||||
strh r6, [r0], #2
|
||||
strh r7, [r0], #2
|
||||
|
||||
bne vp8_dequant_add_loop
|
||||
|
||||
sub r0, r0, #32
|
||||
mov r1, r0
|
||||
|
||||
; short_idct4x4llm_v6_dual
|
||||
ldr r3, cospi8sqrt2minus1
|
||||
ldr r4, sinpi8sqrt2
|
||||
ldr r6, [r0, #8]
|
||||
mov r5, #2
|
||||
vp8_dequant_idct_loop1_v6
|
||||
ldr r12, [r0, #24]
|
||||
ldr r14, [r0, #16]
|
||||
smulwt r9, r3, r6
|
||||
smulwb r7, r3, r6
|
||||
smulwt r10, r4, r6
|
||||
smulwb r8, r4, r6
|
||||
pkhbt r7, r7, r9, lsl #16
|
||||
smulwt r11, r3, r12
|
||||
pkhbt r8, r8, r10, lsl #16
|
||||
uadd16 r6, r6, r7
|
||||
smulwt r7, r4, r12
|
||||
smulwb r9, r3, r12
|
||||
smulwb r10, r4, r12
|
||||
subs r5, r5, #1
|
||||
pkhbt r9, r9, r11, lsl #16
|
||||
ldr r11, [r0], #4
|
||||
pkhbt r10, r10, r7, lsl #16
|
||||
uadd16 r7, r12, r9
|
||||
usub16 r7, r8, r7
|
||||
uadd16 r6, r6, r10
|
||||
uadd16 r10, r11, r14
|
||||
usub16 r8, r11, r14
|
||||
uadd16 r9, r10, r6
|
||||
usub16 r10, r10, r6
|
||||
uadd16 r6, r8, r7
|
||||
usub16 r7, r8, r7
|
||||
str r6, [r1, #8]
|
||||
ldrne r6, [r0, #8]
|
||||
str r7, [r1, #16]
|
||||
str r10, [r1, #24]
|
||||
str r9, [r1], #4
|
||||
bne vp8_dequant_idct_loop1_v6
|
||||
|
||||
mov r5, #2
|
||||
sub r0, r1, #8
|
||||
vp8_dequant_idct_loop2_v6
|
||||
ldr r6, [r0], #4
|
||||
ldr r7, [r0], #4
|
||||
ldr r8, [r0], #4
|
||||
ldr r9, [r0], #4
|
||||
smulwt r1, r3, r6
|
||||
smulwt r12, r4, r6
|
||||
smulwt lr, r3, r8
|
||||
smulwt r10, r4, r8
|
||||
pkhbt r11, r8, r6, lsl #16
|
||||
pkhbt r1, lr, r1, lsl #16
|
||||
pkhbt r12, r10, r12, lsl #16
|
||||
pkhtb r6, r6, r8, asr #16
|
||||
uadd16 r6, r1, r6
|
||||
pkhbt lr, r9, r7, lsl #16
|
||||
uadd16 r10, r11, lr
|
||||
usub16 lr, r11, lr
|
||||
pkhtb r8, r7, r9, asr #16
|
||||
subs r5, r5, #1
|
||||
smulwt r1, r3, r8
|
||||
smulwb r7, r3, r8
|
||||
smulwt r11, r4, r8
|
||||
smulwb r9, r4, r8
|
||||
pkhbt r1, r7, r1, lsl #16
|
||||
uadd16 r8, r1, r8
|
||||
pkhbt r11, r9, r11, lsl #16
|
||||
usub16 r1, r12, r8
|
||||
uadd16 r8, r11, r6
|
||||
ldr r9, c0x00040004
|
||||
ldr r12, [sp, #40]
|
||||
uadd16 r6, r10, r8
|
||||
usub16 r7, r10, r8
|
||||
uadd16 r7, r7, r9
|
||||
uadd16 r6, r6, r9
|
||||
uadd16 r10, r14, r1
|
||||
usub16 r1, r14, r1
|
||||
uadd16 r10, r10, r9
|
||||
uadd16 r1, r1, r9
|
||||
ldr r11, [r2], r12
|
||||
mov r8, r7, asr #3
|
||||
pkhtb r9, r8, r10, asr #19
|
||||
mov r8, r1, asr #3
|
||||
pkhtb r8, r8, r6, asr #19
|
||||
uxtb16 lr, r11, ror #8
|
||||
qadd16 r9, r9, lr
|
||||
uxtb16 lr, r11
|
||||
qadd16 r8, r8, lr
|
||||
usat16 r9, #8, r9
|
||||
usat16 r8, #8, r8
|
||||
orr r9, r8, r9, lsl #8
|
||||
ldr r11, [r2], r12
|
||||
ldr lr, [sp]
|
||||
ldr r12, [sp, #44]
|
||||
mov r7, r7, lsl #16
|
||||
mov r1, r1, lsl #16
|
||||
mov r10, r10, lsl #16
|
||||
mov r6, r6, lsl #16
|
||||
mov r7, r7, asr #3
|
||||
pkhtb r7, r7, r10, asr #19
|
||||
mov r1, r1, asr #3
|
||||
pkhtb r1, r1, r6, asr #19
|
||||
uxtb16 r8, r11, ror #8
|
||||
qadd16 r7, r7, r8
|
||||
uxtb16 r8, r11
|
||||
qadd16 r1, r1, r8
|
||||
usat16 r7, #8, r7
|
||||
usat16 r1, #8, r1
|
||||
orr r1, r1, r7, lsl #8
|
||||
str r9, [lr], r12
|
||||
str r1, [lr], r12
|
||||
str lr, [sp]
|
||||
bne vp8_dequant_idct_loop2_v6
|
||||
|
||||
; vpx_memset
|
||||
sub r0, r0, #32
|
||||
add sp, sp, #4
|
||||
|
||||
mov r12, #0
|
||||
str r12, [r0]
|
||||
str r12, [r0, #4]
|
||||
str r12, [r0, #8]
|
||||
str r12, [r0, #12]
|
||||
str r12, [r0, #16]
|
||||
str r12, [r0, #20]
|
||||
str r12, [r0, #24]
|
||||
str r12, [r0, #28]
|
||||
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
ENDP ; |vp8_dequant_idct_add_v6|
|
||||
|
||||
; Constant Pool
|
||||
cospi8sqrt2minus1 DCD 0x00004E7B
|
||||
sinpi8sqrt2 DCD 0x00008A8C
|
||||
c0x00040004 DCD 0x00040004
|
||||
|
||||
END
|
|
@ -1,69 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_dequantize_b_loop_v6|
|
||||
|
||||
AREA |.text|, CODE, READONLY ; name this block of code
|
||||
;-------------------------------
|
||||
;void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
|
||||
; r0 short *Q,
|
||||
; r1 short *DQC
|
||||
; r2 short *DQ
|
||||
|vp8_dequantize_b_loop_v6| PROC
|
||||
stmdb sp!, {r4-r9, lr}
|
||||
|
||||
ldr r3, [r0] ;load Q
|
||||
ldr r4, [r1] ;load DQC
|
||||
ldr r5, [r0, #4]
|
||||
ldr r6, [r1, #4]
|
||||
|
||||
mov r12, #2 ;loop counter
|
||||
|
||||
dequant_loop
|
||||
smulbb r7, r3, r4 ;multiply
|
||||
smultt r8, r3, r4
|
||||
smulbb r9, r5, r6
|
||||
smultt lr, r5, r6
|
||||
|
||||
ldr r3, [r0, #8]
|
||||
ldr r4, [r1, #8]
|
||||
ldr r5, [r0, #12]
|
||||
ldr r6, [r1, #12]
|
||||
|
||||
strh r7, [r2], #2 ;store result
|
||||
smulbb r7, r3, r4 ;multiply
|
||||
strh r8, [r2], #2
|
||||
smultt r8, r3, r4
|
||||
strh r9, [r2], #2
|
||||
smulbb r9, r5, r6
|
||||
strh lr, [r2], #2
|
||||
smultt lr, r5, r6
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
add r0, r0, #16
|
||||
add r1, r1, #16
|
||||
|
||||
ldrne r3, [r0]
|
||||
strh r7, [r2], #2 ;store result
|
||||
ldrne r4, [r1]
|
||||
strh r8, [r2], #2
|
||||
ldrne r5, [r0, #4]
|
||||
strh r9, [r2], #2
|
||||
ldrne r6, [r1, #4]
|
||||
strh lr, [r2], #2
|
||||
|
||||
bne dequant_loop
|
||||
|
||||
ldmia sp!, {r4-r9, pc}
|
||||
ENDP ;|vp8_dequantize_b_loop_v6|
|
||||
|
||||
END
|
|
@ -1,137 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "vpx_ports/config.h"
|
||||
#include "vp9/common/vp9_blockd.h"
|
||||
#include "vp9/decoder/vp9_dequantize.h"
|
||||
|
||||
void vp8_dequant_dc_idct_add_y_block_v6(short *q, short *dq,
|
||||
unsigned char *pre,
|
||||
unsigned char *dst, int stride,
|
||||
unsigned short *eobs, short *dc) {
|
||||
int i;
|
||||
|
||||
for (i = 0; i < 4; i++) {
|
||||
if (eobs[0] > 1)
|
||||
vp8_dequant_dc_idct_add_v6(q, dq, pre, dst, 16, stride, dc[0]);
|
||||
else
|
||||
vp8_dc_only_idct_add_v6(dc[0], pre, dst, 16, stride);
|
||||
|
||||
if (eobs[1] > 1)
|
||||
vp8_dequant_dc_idct_add_v6(q + 16, dq, pre + 4, dst + 4, 16, stride, dc[1]);
|
||||
else
|
||||
vp8_dc_only_idct_add_v6(dc[1], pre + 4, dst + 4, 16, stride);
|
||||
|
||||
if (eobs[2] > 1)
|
||||
vp8_dequant_dc_idct_add_v6(q + 32, dq, pre + 8, dst + 8, 16, stride, dc[2]);
|
||||
else
|
||||
vp8_dc_only_idct_add_v6(dc[2], pre + 8, dst + 8, 16, stride);
|
||||
|
||||
if (eobs[3] > 1)
|
||||
vp8_dequant_dc_idct_add_v6(q + 48, dq, pre + 12, dst + 12, 16, stride, dc[3]);
|
||||
else
|
||||
vp8_dc_only_idct_add_v6(dc[3], pre + 12, dst + 12, 16, stride);
|
||||
|
||||
q += 64;
|
||||
dc += 4;
|
||||
pre += 64;
|
||||
dst += 4 * stride;
|
||||
eobs += 4;
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_dequant_idct_add_y_block_v6(short *q, short *dq, unsigned char *pre,
|
||||
unsigned char *dst, int stride,
|
||||
unsigned short *eobs) {
|
||||
int i;
|
||||
|
||||
for (i = 0; i < 4; i++) {
|
||||
if (eobs[0] > 1)
|
||||
vp8_dequant_idct_add_v6(q, dq, pre, dst, 16, stride);
|
||||
else {
|
||||
vp8_dc_only_idct_add_v6(q[0]*dq[0], pre, dst, 16, stride);
|
||||
((int *)q)[0] = 0;
|
||||
}
|
||||
|
||||
if (eobs[1] > 1)
|
||||
vp8_dequant_idct_add_v6(q + 16, dq, pre + 4, dst + 4, 16, stride);
|
||||
else {
|
||||
vp8_dc_only_idct_add_v6(q[16]*dq[0], pre + 4, dst + 4, 16, stride);
|
||||
((int *)(q + 16))[0] = 0;
|
||||
}
|
||||
|
||||
if (eobs[2] > 1)
|
||||
vp8_dequant_idct_add_v6(q + 32, dq, pre + 8, dst + 8, 16, stride);
|
||||
else {
|
||||
vp8_dc_only_idct_add_v6(q[32]*dq[0], pre + 8, dst + 8, 16, stride);
|
||||
((int *)(q + 32))[0] = 0;
|
||||
}
|
||||
|
||||
if (eobs[3] > 1)
|
||||
vp8_dequant_idct_add_v6(q + 48, dq, pre + 12, dst + 12, 16, stride);
|
||||
else {
|
||||
vp8_dc_only_idct_add_v6(q[48]*dq[0], pre + 12, dst + 12, 16, stride);
|
||||
((int *)(q + 48))[0] = 0;
|
||||
}
|
||||
|
||||
q += 64;
|
||||
pre += 64;
|
||||
dst += 4 * stride;
|
||||
eobs += 4;
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_dequant_idct_add_uv_block_v6(short *q, short *dq, unsigned char *pre,
|
||||
unsigned char *dstu, unsigned char *dstv,
|
||||
int stride, unsigned short *eobs) {
|
||||
int i;
|
||||
|
||||
for (i = 0; i < 2; i++) {
|
||||
if (eobs[0] > 1)
|
||||
vp8_dequant_idct_add_v6(q, dq, pre, dstu, 8, stride);
|
||||
else {
|
||||
vp8_dc_only_idct_add_v6(q[0]*dq[0], pre, dstu, 8, stride);
|
||||
((int *)q)[0] = 0;
|
||||
}
|
||||
|
||||
if (eobs[1] > 1)
|
||||
vp8_dequant_idct_add_v6(q + 16, dq, pre + 4, dstu + 4, 8, stride);
|
||||
else {
|
||||
vp8_dc_only_idct_add_v6(q[16]*dq[0], pre + 4, dstu + 4, 8, stride);
|
||||
((int *)(q + 16))[0] = 0;
|
||||
}
|
||||
|
||||
q += 32;
|
||||
pre += 32;
|
||||
dstu += 4 * stride;
|
||||
eobs += 2;
|
||||
}
|
||||
|
||||
for (i = 0; i < 2; i++) {
|
||||
if (eobs[0] > 1)
|
||||
vp8_dequant_idct_add_v6(q, dq, pre, dstv, 8, stride);
|
||||
else {
|
||||
vp8_dc_only_idct_add_v6(q[0]*dq[0], pre, dstv, 8, stride);
|
||||
((int *)q)[0] = 0;
|
||||
}
|
||||
|
||||
if (eobs[1] > 1)
|
||||
vp8_dequant_idct_add_v6(q + 16, dq, pre + 4, dstv + 4, 8, stride);
|
||||
else {
|
||||
vp8_dc_only_idct_add_v6(q[16]*dq[0], pre + 4, dstv + 4, 8, stride);
|
||||
((int *)(q + 16))[0] = 0;
|
||||
}
|
||||
|
||||
q += 32;
|
||||
pre += 32;
|
||||
dstv += 4 * stride;
|
||||
eobs += 2;
|
||||
}
|
||||
}
|
|
@ -1,129 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_dequant_idct_add_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
;void vp8_dequant_idct_neon(short *input, short *dq, unsigned char *pred,
|
||||
; unsigned char *dest, int pitch, int stride)
|
||||
; r0 short *input,
|
||||
; r1 short *dq,
|
||||
; r2 unsigned char *pred
|
||||
; r3 unsigned char *dest
|
||||
; sp int pitch
|
||||
; sp+4 int stride
|
||||
|
||||
|vp8_dequant_idct_add_neon| PROC
|
||||
vld1.16 {q3, q4}, [r0]
|
||||
vld1.16 {q5, q6}, [r1]
|
||||
ldr r1, [sp] ; pitch
|
||||
vld1.32 {d14[0]}, [r2], r1
|
||||
vld1.32 {d14[1]}, [r2], r1
|
||||
vld1.32 {d15[0]}, [r2], r1
|
||||
vld1.32 {d15[1]}, [r2]
|
||||
|
||||
ldr r1, [sp, #4] ; stride
|
||||
|
||||
adr r12, cospi8sqrt2minus1 ; pointer to the first constant
|
||||
|
||||
vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon
|
||||
vmul.i16 q2, q4, q6
|
||||
|
||||
;|short_idct4x4llm_neon| PROC
|
||||
vld1.16 {d0}, [r12]
|
||||
vswp d3, d4 ;q2(vp[4] vp[12])
|
||||
|
||||
vqdmulh.s16 q3, q2, d0[2]
|
||||
vqdmulh.s16 q4, q2, d0[0]
|
||||
|
||||
vqadd.s16 d12, d2, d3 ;a1
|
||||
vqsub.s16 d13, d2, d3 ;b1
|
||||
|
||||
vshr.s16 q3, q3, #1
|
||||
vshr.s16 q4, q4, #1
|
||||
|
||||
vqadd.s16 q3, q3, q2
|
||||
vqadd.s16 q4, q4, q2
|
||||
|
||||
vqsub.s16 d10, d6, d9 ;c1
|
||||
vqadd.s16 d11, d7, d8 ;d1
|
||||
|
||||
vqadd.s16 d2, d12, d11
|
||||
vqadd.s16 d3, d13, d10
|
||||
vqsub.s16 d4, d13, d10
|
||||
vqsub.s16 d5, d12, d11
|
||||
|
||||
vtrn.32 d2, d4
|
||||
vtrn.32 d3, d5
|
||||
vtrn.16 d2, d3
|
||||
vtrn.16 d4, d5
|
||||
|
||||
; memset(input, 0, 32) -- 32bytes
|
||||
vmov.i16 q14, #0
|
||||
|
||||
vswp d3, d4
|
||||
vqdmulh.s16 q3, q2, d0[2]
|
||||
vqdmulh.s16 q4, q2, d0[0]
|
||||
|
||||
vqadd.s16 d12, d2, d3 ;a1
|
||||
vqsub.s16 d13, d2, d3 ;b1
|
||||
|
||||
vmov q15, q14
|
||||
|
||||
vshr.s16 q3, q3, #1
|
||||
vshr.s16 q4, q4, #1
|
||||
|
||||
vqadd.s16 q3, q3, q2
|
||||
vqadd.s16 q4, q4, q2
|
||||
|
||||
vqsub.s16 d10, d6, d9 ;c1
|
||||
vqadd.s16 d11, d7, d8 ;d1
|
||||
|
||||
vqadd.s16 d2, d12, d11
|
||||
vqadd.s16 d3, d13, d10
|
||||
vqsub.s16 d4, d13, d10
|
||||
vqsub.s16 d5, d12, d11
|
||||
|
||||
vst1.16 {q14, q15}, [r0]
|
||||
|
||||
vrshr.s16 d2, d2, #3
|
||||
vrshr.s16 d3, d3, #3
|
||||
vrshr.s16 d4, d4, #3
|
||||
vrshr.s16 d5, d5, #3
|
||||
|
||||
vtrn.32 d2, d4
|
||||
vtrn.32 d3, d5
|
||||
vtrn.16 d2, d3
|
||||
vtrn.16 d4, d5
|
||||
|
||||
vaddw.u8 q1, q1, d14
|
||||
vaddw.u8 q2, q2, d15
|
||||
|
||||
vqmovun.s16 d0, q1
|
||||
vqmovun.s16 d1, q2
|
||||
|
||||
vst1.32 {d0[0]}, [r3], r1
|
||||
vst1.32 {d0[1]}, [r3], r1
|
||||
vst1.32 {d1[0]}, [r3], r1
|
||||
vst1.32 {d1[1]}, [r3]
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP ; |vp8_dequant_idct_add_neon|
|
||||
|
||||
; Constant Pool
|
||||
cospi8sqrt2minus1 DCD 0x4e7b4e7b
|
||||
sinpi8sqrt2 DCD 0x8a8c8a8c
|
||||
|
||||
END
|
|
@ -1,34 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_dequantize_b_loop_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
; r0 short *Q,
|
||||
; r1 short *DQC
|
||||
; r2 short *DQ
|
||||
|vp8_dequantize_b_loop_neon| PROC
|
||||
vld1.16 {q0, q1}, [r0]
|
||||
vld1.16 {q2, q3}, [r1]
|
||||
|
||||
vmul.i16 q4, q0, q2
|
||||
vmul.i16 q5, q1, q3
|
||||
|
||||
vst1.16 {q4, q5}, [r2]
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
END
|
|
@ -1,113 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "vpx_ports/config.h"
|
||||
#include "vp9/common/vp9_blockd.h"
|
||||
#include "vp9/decoder/vp9_dequantize.h"
|
||||
|
||||
/* place these declarations here because we don't want to maintain them
|
||||
* outside of this scope
|
||||
*/
|
||||
void idct_dequant_dc_full_2x_neon
|
||||
(short *input, short *dq, unsigned char *pre, unsigned char *dst,
|
||||
int stride, short *dc);
|
||||
void idct_dequant_dc_0_2x_neon
|
||||
(short *dc, unsigned char *pre, unsigned char *dst, int stride);
|
||||
void idct_dequant_full_2x_neon
|
||||
(short *q, short *dq, unsigned char *pre, unsigned char *dst,
|
||||
int pitch, int stride);
|
||||
void idct_dequant_0_2x_neon
|
||||
(short *q, short dq, unsigned char *pre, int pitch,
|
||||
unsigned char *dst, int stride);
|
||||
|
||||
void vp8_dequant_dc_idct_add_y_block_neon(short *q, short *dq,
|
||||
unsigned char *pre,
|
||||
unsigned char *dst, int stride,
|
||||
unsigned short *eobs, short *dc) {
|
||||
int i;
|
||||
|
||||
for (i = 0; i < 4; i++) {
|
||||
if (((short *)eobs)[0] & 0xfefe)
|
||||
idct_dequant_dc_full_2x_neon(q, dq, pre, dst, stride, dc);
|
||||
else
|
||||
idct_dequant_dc_0_2x_neon(dc, pre, dst, stride);
|
||||
|
||||
if (((short *)eobs)[1] & 0xfefe)
|
||||
idct_dequant_dc_full_2x_neon(q + 32, dq, pre + 8, dst + 8, stride, dc + 2);
|
||||
else
|
||||
idct_dequant_dc_0_2x_neon(dc + 2, pre + 8, dst + 8, stride);
|
||||
|
||||
q += 64;
|
||||
dc += 4;
|
||||
pre += 64;
|
||||
dst += 4 * stride;
|
||||
eobs += 4;
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *pre,
|
||||
unsigned char *dst, int stride,
|
||||
unsigned short *eobs) {
|
||||
int i;
|
||||
|
||||
for (i = 0; i < 4; i++) {
|
||||
if (((short *)eobs)[0] & 0xfefe)
|
||||
idct_dequant_full_2x_neon(q, dq, pre, dst, 16, stride);
|
||||
else
|
||||
idct_dequant_0_2x_neon(q, dq[0], pre, 16, dst, stride);
|
||||
|
||||
if (((short *)eobs)[1] & 0xfefe)
|
||||
idct_dequant_full_2x_neon(q + 32, dq, pre + 8, dst + 8, 16, stride);
|
||||
else
|
||||
idct_dequant_0_2x_neon(q + 32, dq[0], pre + 8, 16, dst + 8, stride);
|
||||
|
||||
q += 64;
|
||||
pre += 64;
|
||||
dst += 4 * stride;
|
||||
eobs += 4;
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq,
|
||||
unsigned char *pre,
|
||||
unsigned char *dstu,
|
||||
unsigned char *dstv, int stride,
|
||||
unsigned short *eobs) {
|
||||
if (((short *)eobs)[0] & 0xfefe)
|
||||
idct_dequant_full_2x_neon(q, dq, pre, dstu, 8, stride);
|
||||
else
|
||||
idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstu, stride);
|
||||
|
||||
q += 32;
|
||||
pre += 32;
|
||||
dstu += 4 * stride;
|
||||
|
||||
if (((short *)eobs)[1] & 0xfefe)
|
||||
idct_dequant_full_2x_neon(q, dq, pre, dstu, 8, stride);
|
||||
else
|
||||
idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstu, stride);
|
||||
|
||||
q += 32;
|
||||
pre += 32;
|
||||
|
||||
if (((short *)eobs)[2] & 0xfefe)
|
||||
idct_dequant_full_2x_neon(q, dq, pre, dstv, 8, stride);
|
||||
else
|
||||
idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstv, stride);
|
||||
|
||||
q += 32;
|
||||
pre += 32;
|
||||
dstv += 4 * stride;
|
||||
|
||||
if (((short *)eobs)[3] & 0xfefe)
|
||||
idct_dequant_full_2x_neon(q, dq, pre, dstv, 8, stride);
|
||||
else
|
||||
idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstv, stride);
|
||||
}
|
|
@ -1,79 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license and patent
|
||||
; grant that can be found in the LICENSE file in the root of the source
|
||||
; tree. All contributing project authors may be found in the AUTHORS
|
||||
; file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |idct_dequant_0_2x_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
;void idct_dequant_0_2x_neon(short *q, short dq, unsigned char *pre,
|
||||
; int pitch, unsigned char *dst, int stride);
|
||||
; r0 *q
|
||||
; r1 dq
|
||||
; r2 *pre
|
||||
; r3 pitch
|
||||
; sp *dst
|
||||
; sp+4 stride
|
||||
|idct_dequant_0_2x_neon| PROC
|
||||
add r12, r2, #4
|
||||
vld1.32 {d2[0]}, [r2], r3
|
||||
vld1.32 {d2[1]}, [r2], r3
|
||||
vld1.32 {d4[0]}, [r2], r3
|
||||
vld1.32 {d4[1]}, [r2]
|
||||
vld1.32 {d8[0]}, [r12], r3
|
||||
vld1.32 {d8[1]}, [r12], r3
|
||||
vld1.32 {d10[0]}, [r12], r3
|
||||
vld1.32 {d10[1]}, [r12]
|
||||
|
||||
ldrh r12, [r0] ; lo q
|
||||
ldrh r2, [r0, #32] ; hi q
|
||||
mov r3, #0
|
||||
strh r3, [r0]
|
||||
strh r3, [r0, #32]
|
||||
|
||||
sxth r12, r12 ; lo
|
||||
mul r0, r12, r1
|
||||
add r0, r0, #4
|
||||
asr r0, r0, #3
|
||||
vdup.16 q0, r0
|
||||
sxth r2, r2 ; hi
|
||||
mul r0, r2, r1
|
||||
add r0, r0, #4
|
||||
asr r0, r0, #3
|
||||
vdup.16 q3, r0
|
||||
|
||||
vaddw.u8 q1, q0, d2 ; lo
|
||||
vaddw.u8 q2, q0, d4
|
||||
vaddw.u8 q4, q3, d8 ; hi
|
||||
vaddw.u8 q5, q3, d10
|
||||
|
||||
ldr r2, [sp] ; dst
|
||||
ldr r3, [sp, #4] ; stride
|
||||
|
||||
vqmovun.s16 d2, q1 ; lo
|
||||
vqmovun.s16 d4, q2
|
||||
vqmovun.s16 d8, q4 ; hi
|
||||
vqmovun.s16 d10, q5
|
||||
|
||||
add r0, r2, #4
|
||||
vst1.32 {d2[0]}, [r2], r3 ; lo
|
||||
vst1.32 {d2[1]}, [r2], r3
|
||||
vst1.32 {d4[0]}, [r2], r3
|
||||
vst1.32 {d4[1]}, [r2]
|
||||
vst1.32 {d8[0]}, [r0], r3 ; hi
|
||||
vst1.32 {d8[1]}, [r0], r3
|
||||
vst1.32 {d10[0]}, [r0], r3
|
||||
vst1.32 {d10[1]}, [r0]
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP ; |idct_dequant_0_2x_neon|
|
||||
END
|
|
@ -1,69 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The Webm project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license and patent
|
||||
; grant that can be found in the LICENSE file in the root of the source
|
||||
; tree. All contributing project authors may be found in the AUTHORS
|
||||
; file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |idct_dequant_dc_0_2x_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
;void idct_dequant_dc_0_2x_neon(short *dc, unsigned char *pre,
|
||||
; unsigned char *dst, int stride);
|
||||
; r0 *dc
|
||||
; r1 *pre
|
||||
; r2 *dst
|
||||
; r3 stride
|
||||
|idct_dequant_dc_0_2x_neon| PROC
|
||||
ldr r0, [r0] ; *dc
|
||||
mov r12, #16
|
||||
|
||||
vld1.32 {d2[0]}, [r1], r12 ; lo
|
||||
vld1.32 {d2[1]}, [r1], r12
|
||||
vld1.32 {d4[0]}, [r1], r12
|
||||
vld1.32 {d4[1]}, [r1]
|
||||
sub r1, r1, #44
|
||||
vld1.32 {d8[0]}, [r1], r12 ; hi
|
||||
vld1.32 {d8[1]}, [r1], r12
|
||||
vld1.32 {d10[0]}, [r1], r12
|
||||
vld1.32 {d10[1]}, [r1]
|
||||
|
||||
sxth r1, r0 ; lo *dc
|
||||
add r1, r1, #4
|
||||
asr r1, r1, #3
|
||||
vdup.16 q0, r1
|
||||
sxth r0, r0, ror #16 ; hi *dc
|
||||
add r0, r0, #4
|
||||
asr r0, r0, #3
|
||||
vdup.16 q3, r0
|
||||
|
||||
vaddw.u8 q1, q0, d2 ; lo
|
||||
vaddw.u8 q2, q0, d4
|
||||
vaddw.u8 q4, q3, d8 ; hi
|
||||
vaddw.u8 q5, q3, d10
|
||||
|
||||
vqmovun.s16 d2, q1 ; lo
|
||||
vqmovun.s16 d4, q2
|
||||
vqmovun.s16 d8, q4 ; hi
|
||||
vqmovun.s16 d10, q5
|
||||
|
||||
add r0, r2, #4
|
||||
vst1.32 {d2[0]}, [r2], r3 ; lo
|
||||
vst1.32 {d2[1]}, [r2], r3
|
||||
vst1.32 {d4[0]}, [r2], r3
|
||||
vst1.32 {d4[1]}, [r2]
|
||||
vst1.32 {d8[0]}, [r0], r3 ; hi
|
||||
vst1.32 {d8[1]}, [r0], r3
|
||||
vst1.32 {d10[0]}, [r0], r3
|
||||
vst1.32 {d10[1]}, [r0]
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP ;|idct_dequant_dc_0_2x_neon|
|
||||
END
|
|
@ -1,205 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The Webm project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |idct_dequant_dc_full_2x_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
;void idct_dequant_dc_full_2x_neon(short *q, short *dq, unsigned char *pre,
|
||||
; unsigned char *dst, int stride, short *dc);
|
||||
; r0 *q,
|
||||
; r1 *dq,
|
||||
; r2 *pre
|
||||
; r3 *dst
|
||||
; sp stride
|
||||
; sp+4 *dc
|
||||
|idct_dequant_dc_full_2x_neon| PROC
|
||||
vld1.16 {q0, q1}, [r1] ; dq (same l/r)
|
||||
vld1.16 {q2, q3}, [r0] ; l q
|
||||
mov r1, #16 ; pitch
|
||||
add r0, r0, #32
|
||||
vld1.16 {q4, q5}, [r0] ; r q
|
||||
add r12, r2, #4
|
||||
; interleave the predictors
|
||||
vld1.32 {d28[0]}, [r2], r1 ; l pre
|
||||
vld1.32 {d28[1]}, [r12], r1 ; r pre
|
||||
vld1.32 {d29[0]}, [r2], r1
|
||||
vld1.32 {d29[1]}, [r12], r1
|
||||
vld1.32 {d30[0]}, [r2], r1
|
||||
vld1.32 {d30[1]}, [r12], r1
|
||||
vld1.32 {d31[0]}, [r2]
|
||||
ldr r1, [sp, #4]
|
||||
vld1.32 {d31[1]}, [r12]
|
||||
|
||||
adr r2, cospi8sqrt2minus1 ; pointer to the first constant
|
||||
|
||||
ldrh r12, [r1], #2 ; lo *dc
|
||||
ldrh r1, [r1] ; hi *dc
|
||||
|
||||
; dequant: q[i] = q[i] * dq[i]
|
||||
vmul.i16 q2, q2, q0
|
||||
vmul.i16 q3, q3, q1
|
||||
vmul.i16 q4, q4, q0
|
||||
vmul.i16 q5, q5, q1
|
||||
|
||||
; move dc up to neon and overwrite first element
|
||||
vmov.16 d4[0], r12
|
||||
vmov.16 d8[0], r1
|
||||
|
||||
vld1.16 {d0}, [r2]
|
||||
|
||||
; q2: l0r0 q3: l8r8
|
||||
; q4: l4r4 q5: l12r12
|
||||
vswp d5, d8
|
||||
vswp d7, d10
|
||||
|
||||
; _CONSTANTS_ * 4,12 >> 16
|
||||
; q6: 4 * sinpi : c1/temp1
|
||||
; q7: 12 * sinpi : d1/temp2
|
||||
; q8: 4 * cospi
|
||||
; q9: 12 * cospi
|
||||
vqdmulh.s16 q6, q4, d0[2] ; sinpi8sqrt2
|
||||
vqdmulh.s16 q7, q5, d0[2]
|
||||
vqdmulh.s16 q8, q4, d0[0] ; cospi8sqrt2minus1
|
||||
vqdmulh.s16 q9, q5, d0[0]
|
||||
|
||||
vqadd.s16 q10, q2, q3 ; a1 = 0 + 8
|
||||
vqsub.s16 q11, q2, q3 ; b1 = 0 - 8
|
||||
|
||||
; vqdmulh only accepts signed values. this was a problem because
|
||||
; our constant had the high bit set, and was treated as a negative value.
|
||||
; vqdmulh also doubles the value before it shifts by 16. we need to
|
||||
; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
|
||||
; so we can shift the constant without losing precision. this avoids
|
||||
; shift again afterward, but also avoids the sign issue. win win!
|
||||
; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
|
||||
; pre-shift it
|
||||
vshr.s16 q8, q8, #1
|
||||
vshr.s16 q9, q9, #1
|
||||
|
||||
; q4: 4 + 4 * cospi : d1/temp1
|
||||
; q5: 12 + 12 * cospi : c1/temp2
|
||||
vqadd.s16 q4, q4, q8
|
||||
vqadd.s16 q5, q5, q9
|
||||
|
||||
; c1 = temp1 - temp2
|
||||
; d1 = temp1 + temp2
|
||||
vqsub.s16 q2, q6, q5
|
||||
vqadd.s16 q3, q4, q7
|
||||
|
||||
; [0]: a1+d1
|
||||
; [1]: b1+c1
|
||||
; [2]: b1-c1
|
||||
; [3]: a1-d1
|
||||
vqadd.s16 q4, q10, q3
|
||||
vqadd.s16 q5, q11, q2
|
||||
vqsub.s16 q6, q11, q2
|
||||
vqsub.s16 q7, q10, q3
|
||||
|
||||
; rotate
|
||||
vtrn.32 q4, q6
|
||||
vtrn.32 q5, q7
|
||||
vtrn.16 q4, q5
|
||||
vtrn.16 q6, q7
|
||||
; idct loop 2
|
||||
; q4: l 0, 4, 8,12 r 0, 4, 8,12
|
||||
; q5: l 1, 5, 9,13 r 1, 5, 9,13
|
||||
; q6: l 2, 6,10,14 r 2, 6,10,14
|
||||
; q7: l 3, 7,11,15 r 3, 7,11,15
|
||||
|
||||
; q8: 1 * sinpi : c1/temp1
|
||||
; q9: 3 * sinpi : d1/temp2
|
||||
; q10: 1 * cospi
|
||||
; q11: 3 * cospi
|
||||
vqdmulh.s16 q8, q5, d0[2] ; sinpi8sqrt2
|
||||
vqdmulh.s16 q9, q7, d0[2]
|
||||
vqdmulh.s16 q10, q5, d0[0] ; cospi8sqrt2minus1
|
||||
vqdmulh.s16 q11, q7, d0[0]
|
||||
|
||||
vqadd.s16 q2, q4, q6 ; a1 = 0 + 2
|
||||
vqsub.s16 q3, q4, q6 ; b1 = 0 - 2
|
||||
|
||||
; see note on shifting above
|
||||
vshr.s16 q10, q10, #1
|
||||
vshr.s16 q11, q11, #1
|
||||
|
||||
; q10: 1 + 1 * cospi : d1/temp1
|
||||
; q11: 3 + 3 * cospi : c1/temp2
|
||||
vqadd.s16 q10, q5, q10
|
||||
vqadd.s16 q11, q7, q11
|
||||
|
||||
; q8: c1 = temp1 - temp2
|
||||
; q9: d1 = temp1 + temp2
|
||||
vqsub.s16 q8, q8, q11
|
||||
vqadd.s16 q9, q10, q9
|
||||
|
||||
; a1+d1
|
||||
; b1+c1
|
||||
; b1-c1
|
||||
; a1-d1
|
||||
vqadd.s16 q4, q2, q9
|
||||
vqadd.s16 q5, q3, q8
|
||||
vqsub.s16 q6, q3, q8
|
||||
vqsub.s16 q7, q2, q9
|
||||
|
||||
; +4 >> 3 (rounding)
|
||||
vrshr.s16 q4, q4, #3 ; lo
|
||||
vrshr.s16 q5, q5, #3
|
||||
vrshr.s16 q6, q6, #3 ; hi
|
||||
vrshr.s16 q7, q7, #3
|
||||
|
||||
vtrn.32 q4, q6
|
||||
vtrn.32 q5, q7
|
||||
vtrn.16 q4, q5
|
||||
vtrn.16 q6, q7
|
||||
|
||||
; adding pre
|
||||
; input is still packed. pre was read interleaved
|
||||
vaddw.u8 q4, q4, d28
|
||||
vaddw.u8 q5, q5, d29
|
||||
vaddw.u8 q6, q6, d30
|
||||
vaddw.u8 q7, q7, d31
|
||||
|
||||
vmov.i16 q14, #0
|
||||
vmov q15, q14
|
||||
vst1.16 {q14, q15}, [r0] ; write over high input
|
||||
sub r0, r0, #32
|
||||
vst1.16 {q14, q15}, [r0] ; write over low input
|
||||
|
||||
;saturate and narrow
|
||||
vqmovun.s16 d0, q4 ; lo
|
||||
vqmovun.s16 d1, q5
|
||||
vqmovun.s16 d2, q6 ; hi
|
||||
vqmovun.s16 d3, q7
|
||||
|
||||
ldr r1, [sp] ; stride
|
||||
add r2, r3, #4 ; hi
|
||||
vst1.32 {d0[0]}, [r3], r1 ; lo
|
||||
vst1.32 {d0[1]}, [r2], r1 ; hi
|
||||
vst1.32 {d1[0]}, [r3], r1
|
||||
vst1.32 {d1[1]}, [r2], r1
|
||||
vst1.32 {d2[0]}, [r3], r1
|
||||
vst1.32 {d2[1]}, [r2], r1
|
||||
vst1.32 {d3[0]}, [r3]
|
||||
vst1.32 {d3[1]}, [r2]
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP ; |idct_dequant_dc_full_2x_neon|
|
||||
|
||||
; Constant Pool
|
||||
cospi8sqrt2minus1 DCD 0x4e7b
|
||||
; because the lowest bit in 0x8a8c is 0, we can pre-shift this
|
||||
sinpi8sqrt2 DCD 0x4546
|
||||
|
||||
END
|
|
@ -1,197 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The Webm project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |idct_dequant_full_2x_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
;void idct_dequant_full_2x_neon(short *q, short *dq, unsigned char *pre,
|
||||
; unsigned char *dst, int pitch, int stride);
|
||||
; r0 *q,
|
||||
; r1 *dq,
|
||||
; r2 *pre
|
||||
; r3 *dst
|
||||
; sp pitch
|
||||
; sp+4 stride
|
||||
|idct_dequant_full_2x_neon| PROC
|
||||
vld1.16 {q0, q1}, [r1] ; dq (same l/r)
|
||||
vld1.16 {q2, q3}, [r0] ; l q
|
||||
ldr r1, [sp] ; pitch
|
||||
add r0, r0, #32
|
||||
vld1.16 {q4, q5}, [r0] ; r q
|
||||
add r12, r2, #4
|
||||
; interleave the predictors
|
||||
vld1.32 {d28[0]}, [r2], r1 ; l pre
|
||||
vld1.32 {d28[1]}, [r12], r1 ; r pre
|
||||
vld1.32 {d29[0]}, [r2], r1
|
||||
vld1.32 {d29[1]}, [r12], r1
|
||||
vld1.32 {d30[0]}, [r2], r1
|
||||
vld1.32 {d30[1]}, [r12], r1
|
||||
vld1.32 {d31[0]}, [r2]
|
||||
vld1.32 {d31[1]}, [r12]
|
||||
|
||||
adr r2, cospi8sqrt2minus1 ; pointer to the first constant
|
||||
|
||||
; dequant: q[i] = q[i] * dq[i]
|
||||
vmul.i16 q2, q2, q0
|
||||
vmul.i16 q3, q3, q1
|
||||
vmul.i16 q4, q4, q0
|
||||
vmul.i16 q5, q5, q1
|
||||
|
||||
vld1.16 {d0}, [r2]
|
||||
|
||||
; q2: l0r0 q3: l8r8
|
||||
; q4: l4r4 q5: l12r12
|
||||
vswp d5, d8
|
||||
vswp d7, d10
|
||||
|
||||
; _CONSTANTS_ * 4,12 >> 16
|
||||
; q6: 4 * sinpi : c1/temp1
|
||||
; q7: 12 * sinpi : d1/temp2
|
||||
; q8: 4 * cospi
|
||||
; q9: 12 * cospi
|
||||
vqdmulh.s16 q6, q4, d0[2] ; sinpi8sqrt2
|
||||
vqdmulh.s16 q7, q5, d0[2]
|
||||
vqdmulh.s16 q8, q4, d0[0] ; cospi8sqrt2minus1
|
||||
vqdmulh.s16 q9, q5, d0[0]
|
||||
|
||||
vqadd.s16 q10, q2, q3 ; a1 = 0 + 8
|
||||
vqsub.s16 q11, q2, q3 ; b1 = 0 - 8
|
||||
|
||||
; vqdmulh only accepts signed values. this was a problem because
|
||||
; our constant had the high bit set, and was treated as a negative value.
|
||||
; vqdmulh also doubles the value before it shifts by 16. we need to
|
||||
; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
|
||||
; so we can shift the constant without losing precision. this avoids
|
||||
; shift again afterward, but also avoids the sign issue. win win!
|
||||
; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
|
||||
; pre-shift it
|
||||
vshr.s16 q8, q8, #1
|
||||
vshr.s16 q9, q9, #1
|
||||
|
||||
; q4: 4 + 4 * cospi : d1/temp1
|
||||
; q5: 12 + 12 * cospi : c1/temp2
|
||||
vqadd.s16 q4, q4, q8
|
||||
vqadd.s16 q5, q5, q9
|
||||
|
||||
; c1 = temp1 - temp2
|
||||
; d1 = temp1 + temp2
|
||||
vqsub.s16 q2, q6, q5
|
||||
vqadd.s16 q3, q4, q7
|
||||
|
||||
; [0]: a1+d1
|
||||
; [1]: b1+c1
|
||||
; [2]: b1-c1
|
||||
; [3]: a1-d1
|
||||
vqadd.s16 q4, q10, q3
|
||||
vqadd.s16 q5, q11, q2
|
||||
vqsub.s16 q6, q11, q2
|
||||
vqsub.s16 q7, q10, q3
|
||||
|
||||
; rotate
|
||||
vtrn.32 q4, q6
|
||||
vtrn.32 q5, q7
|
||||
vtrn.16 q4, q5
|
||||
vtrn.16 q6, q7
|
||||
; idct loop 2
|
||||
; q4: l 0, 4, 8,12 r 0, 4, 8,12
|
||||
; q5: l 1, 5, 9,13 r 1, 5, 9,13
|
||||
; q6: l 2, 6,10,14 r 2, 6,10,14
|
||||
; q7: l 3, 7,11,15 r 3, 7,11,15
|
||||
|
||||
; q8: 1 * sinpi : c1/temp1
|
||||
; q9: 3 * sinpi : d1/temp2
|
||||
; q10: 1 * cospi
|
||||
; q11: 3 * cospi
|
||||
vqdmulh.s16 q8, q5, d0[2] ; sinpi8sqrt2
|
||||
vqdmulh.s16 q9, q7, d0[2]
|
||||
vqdmulh.s16 q10, q5, d0[0] ; cospi8sqrt2minus1
|
||||
vqdmulh.s16 q11, q7, d0[0]
|
||||
|
||||
vqadd.s16 q2, q4, q6 ; a1 = 0 + 2
|
||||
vqsub.s16 q3, q4, q6 ; b1 = 0 - 2
|
||||
|
||||
; see note on shifting above
|
||||
vshr.s16 q10, q10, #1
|
||||
vshr.s16 q11, q11, #1
|
||||
|
||||
; q10: 1 + 1 * cospi : d1/temp1
|
||||
; q11: 3 + 3 * cospi : c1/temp2
|
||||
vqadd.s16 q10, q5, q10
|
||||
vqadd.s16 q11, q7, q11
|
||||
|
||||
; q8: c1 = temp1 - temp2
|
||||
; q9: d1 = temp1 + temp2
|
||||
vqsub.s16 q8, q8, q11
|
||||
vqadd.s16 q9, q10, q9
|
||||
|
||||
; a1+d1
|
||||
; b1+c1
|
||||
; b1-c1
|
||||
; a1-d1
|
||||
vqadd.s16 q4, q2, q9
|
||||
vqadd.s16 q5, q3, q8
|
||||
vqsub.s16 q6, q3, q8
|
||||
vqsub.s16 q7, q2, q9
|
||||
|
||||
; +4 >> 3 (rounding)
|
||||
vrshr.s16 q4, q4, #3 ; lo
|
||||
vrshr.s16 q5, q5, #3
|
||||
vrshr.s16 q6, q6, #3 ; hi
|
||||
vrshr.s16 q7, q7, #3
|
||||
|
||||
vtrn.32 q4, q6
|
||||
vtrn.32 q5, q7
|
||||
vtrn.16 q4, q5
|
||||
vtrn.16 q6, q7
|
||||
|
||||
; adding pre
|
||||
; input is still packed. pre was read interleaved
|
||||
vaddw.u8 q4, q4, d28
|
||||
vaddw.u8 q5, q5, d29
|
||||
vaddw.u8 q6, q6, d30
|
||||
vaddw.u8 q7, q7, d31
|
||||
|
||||
vmov.i16 q14, #0
|
||||
vmov q15, q14
|
||||
vst1.16 {q14, q15}, [r0] ; write over high input
|
||||
sub r0, r0, #32
|
||||
vst1.16 {q14, q15}, [r0] ; write over low input
|
||||
|
||||
;saturate and narrow
|
||||
vqmovun.s16 d0, q4 ; lo
|
||||
vqmovun.s16 d1, q5
|
||||
vqmovun.s16 d2, q6 ; hi
|
||||
vqmovun.s16 d3, q7
|
||||
|
||||
ldr r1, [sp, #4] ; stride
|
||||
add r2, r3, #4 ; hi
|
||||
vst1.32 {d0[0]}, [r3], r1 ; lo
|
||||
vst1.32 {d0[1]}, [r2], r1 ; hi
|
||||
vst1.32 {d1[0]}, [r3], r1
|
||||
vst1.32 {d1[1]}, [r2], r1
|
||||
vst1.32 {d2[0]}, [r3], r1
|
||||
vst1.32 {d2[1]}, [r2], r1
|
||||
vst1.32 {d3[0]}, [r3]
|
||||
vst1.32 {d3[1]}, [r2]
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP ; |idct_dequant_full_2x_neon|
|
||||
|
||||
; Constant Pool
|
||||
cospi8sqrt2minus1 DCD 0x4e7b
|
||||
; because the lowest bit in 0x8a8c is 0, we can pre-shift this
|
||||
sinpi8sqrt2 DCD 0x4546
|
||||
|
||||
END
|
|
@ -1,44 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
|
||||
#include "vpx_ports/config.h"
|
||||
#include "vp9/decoder/vp9_dequantize.h"
|
||||
#include "vp9/common/vp9_blockd.h"
|
||||
#include "vpx_mem/vpx_mem.h"
|
||||
|
||||
#if HAVE_ARMV7
|
||||
extern void vp9_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ);
|
||||
#endif
|
||||
|
||||
#if HAVE_ARMV6
|
||||
extern void vp9_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
|
||||
#endif
|
||||
|
||||
#if HAVE_ARMV7
|
||||
|
||||
void vp9_dequantize_b_neon(BLOCKD *d) {
|
||||
short *DQ = d->dqcoeff;
|
||||
short *Q = d->qcoeff;
|
||||
short *DQC = d->dequant;
|
||||
|
||||
vp9_dequantize_b_loop_neon(Q, DQC, DQ);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if HAVE_ARMV6
|
||||
void vp9_dequantize_b_v6(BLOCKD *d) {
|
||||
short *DQ = d->dqcoeff;
|
||||
short *Q = d->qcoeff;
|
||||
short *DQC = d->dequant;
|
||||
|
||||
vp9_dequantize_b_loop_v6(Q, DQC, DQ);
|
||||
}
|
||||
#endif
|
|
@ -28,9 +28,6 @@
|
|||
#include "vpx_ports/vpx_timer.h"
|
||||
#include "vp9/decoder/vp9_decodframe.h"
|
||||
#include "vp9/decoder/vp9_detokenize.h"
|
||||
#if ARCH_ARM
|
||||
#include "vpx_ports/arm.h"
|
||||
#endif
|
||||
|
||||
static int get_free_fb(VP9_COMMON *cm);
|
||||
static void ref_cnt_fb(int *buf, int *idx, int new_idx);
|
||||
|
@ -235,11 +232,6 @@ vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR ptr, VP9_REFFRAME ref_frame_flag,
|
|||
return pbi->common.error.error_code;
|
||||
}
|
||||
|
||||
/*For ARM NEON, d8-d15 are callee-saved registers, and need to be saved by us.*/
|
||||
#if HAVE_ARMV7
|
||||
extern void vp9_push_neon(int64_t *store);
|
||||
extern void vp9_pop_neon(int64_t *store);
|
||||
#endif
|
||||
|
||||
static int get_free_fb(VP9_COMMON *cm) {
|
||||
int i;
|
||||
|
@ -317,9 +309,6 @@ static int swap_frame_buffers(VP9_COMMON *cm) {
|
|||
int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size,
|
||||
const unsigned char **psource,
|
||||
int64_t time_stamp) {
|
||||
#if HAVE_ARMV7
|
||||
int64_t dx_store_reg[8];
|
||||
#endif
|
||||
VP9D_COMP *pbi = (VP9D_COMP *) ptr;
|
||||
VP9_COMMON *cm = &pbi->common;
|
||||
const unsigned char *source = *psource;
|
||||
|
@ -346,26 +335,9 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size,
|
|||
cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
|
||||
}
|
||||
|
||||
#if HAVE_ARMV7
|
||||
#if CONFIG_RUNTIME_CPU_DETECT
|
||||
if (cm->rtcd.flags & HAS_NEON)
|
||||
#endif
|
||||
{
|
||||
vp9_push_neon(dx_store_reg);
|
||||
}
|
||||
#endif
|
||||
|
||||
cm->new_fb_idx = get_free_fb(cm);
|
||||
|
||||
if (setjmp(pbi->common.error.jmp)) {
|
||||
#if HAVE_ARMV7
|
||||
#if CONFIG_RUNTIME_CPU_DETECT
|
||||
if (cm->rtcd.flags & HAS_NEON)
|
||||
#endif
|
||||
{
|
||||
vp9_pop_neon(dx_store_reg);
|
||||
}
|
||||
#endif
|
||||
pbi->common.error.setjmp = 0;
|
||||
|
||||
/* We do not know if the missing frame(s) was supposed to update
|
||||
|
@ -384,14 +356,6 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size,
|
|||
retcode = vp9_decode_frame(pbi, psource);
|
||||
|
||||
if (retcode < 0) {
|
||||
#if HAVE_ARMV7
|
||||
#if CONFIG_RUNTIME_CPU_DETECT
|
||||
if (cm->rtcd.flags & HAS_NEON)
|
||||
#endif
|
||||
{
|
||||
vp9_pop_neon(dx_store_reg);
|
||||
}
|
||||
#endif
|
||||
pbi->common.error.error_code = VPX_CODEC_ERROR;
|
||||
pbi->common.error.setjmp = 0;
|
||||
if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
|
||||
|
@ -401,14 +365,6 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size,
|
|||
|
||||
{
|
||||
if (swap_frame_buffers(cm)) {
|
||||
#if HAVE_ARMV7
|
||||
#if CONFIG_RUNTIME_CPU_DETECT
|
||||
if (cm->rtcd.flags & HAS_NEON)
|
||||
#endif
|
||||
{
|
||||
vp9_pop_neon(dx_store_reg);
|
||||
}
|
||||
#endif
|
||||
pbi->common.error.error_code = VPX_CODEC_ERROR;
|
||||
pbi->common.error.setjmp = 0;
|
||||
return -1;
|
||||
|
@ -455,14 +411,6 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size,
|
|||
pbi->last_time_stamp = time_stamp;
|
||||
pbi->source_sz = 0;
|
||||
|
||||
#if HAVE_ARMV7
|
||||
#if CONFIG_RUNTIME_CPU_DETECT
|
||||
if (cm->rtcd.flags & HAS_NEON)
|
||||
#endif
|
||||
{
|
||||
vp9_pop_neon(dx_store_reg);
|
||||
}
|
||||
#endif
|
||||
pbi->common.error.setjmp = 0;
|
||||
return retcode;
|
||||
}
|
||||
|
|
|
@ -1,286 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_start_encode|
|
||||
EXPORT |vp9_encode_bool|
|
||||
EXPORT |vp8_stop_encode|
|
||||
EXPORT |vp8_encode_value|
|
||||
|
||||
INCLUDE vp9_asm_enc_offsets.asm
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA |.text|, CODE, READONLY
|
||||
|
||||
; r0 BOOL_CODER *br
|
||||
; r1 unsigned char *source
|
||||
|
||||
|vp8_start_encode| PROC
|
||||
mov r12, #0
|
||||
mov r3, #255
|
||||
mvn r2, #23
|
||||
str r12, [r0, #vp9_writer_lowvalue]
|
||||
str r3, [r0, #vp9_writer_range]
|
||||
str r12, [r0, #vp9_writer_value]
|
||||
str r2, [r0, #vp9_writer_count]
|
||||
str r12, [r0, #vp9_writer_pos]
|
||||
str r1, [r0, #vp9_writer_buffer]
|
||||
bx lr
|
||||
ENDP
|
||||
|
||||
; r0 BOOL_CODER *br
|
||||
; r1 int bit
|
||||
; r2 int probability
|
||||
|vp9_encode_bool| PROC
|
||||
push {r4-r9, lr}
|
||||
|
||||
mov r4, r2
|
||||
|
||||
ldr r2, [r0, #vp9_writer_lowvalue]
|
||||
ldr r5, [r0, #vp9_writer_range]
|
||||
ldr r3, [r0, #vp9_writer_count]
|
||||
|
||||
sub r7, r5, #1 ; range-1
|
||||
|
||||
cmp r1, #0
|
||||
mul r6, r4, r7 ; ((range-1) * probability)
|
||||
|
||||
mov r7, #1
|
||||
add r4, r7, r6, lsr #8 ; 1 + (((range-1) * probability) >> 8)
|
||||
|
||||
addne r2, r2, r4 ; if (bit) lowvalue += split
|
||||
subne r4, r5, r4 ; if (bit) range = range-split
|
||||
|
||||
; Counting the leading zeros is used to normalize range.
|
||||
clz r6, r4
|
||||
sub r6, r6, #24 ; shift
|
||||
|
||||
; Flag is set on the sum of count. This flag is used later
|
||||
; to determine if count >= 0
|
||||
adds r3, r3, r6 ; count += shift
|
||||
lsl r5, r4, r6 ; range <<= shift
|
||||
bmi token_count_lt_zero ; if(count >= 0)
|
||||
|
||||
sub r6, r6, r3 ; offset = shift - count
|
||||
sub r4, r6, #1 ; offset-1
|
||||
lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
|
||||
bpl token_high_bit_not_set
|
||||
|
||||
ldr r4, [r0, #vp9_writer_pos] ; x
|
||||
sub r4, r4, #1 ; x = w->pos-1
|
||||
b token_zero_while_start
|
||||
token_zero_while_loop
|
||||
mov r9, #0
|
||||
strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0
|
||||
sub r4, r4, #1 ; x--
|
||||
token_zero_while_start
|
||||
cmp r4, #0
|
||||
ldrge r7, [r0, #vp9_writer_buffer]
|
||||
ldrb r1, [r7, r4]
|
||||
cmpge r1, #0xff
|
||||
beq token_zero_while_loop
|
||||
|
||||
ldr r7, [r0, #vp9_writer_buffer]
|
||||
ldrb r9, [r7, r4] ; w->buffer[x]
|
||||
add r9, r9, #1
|
||||
strb r9, [r7, r4] ; w->buffer[x] + 1
|
||||
token_high_bit_not_set
|
||||
rsb r4, r6, #24 ; 24-offset
|
||||
ldr r9, [r0, #vp9_writer_buffer]
|
||||
lsr r7, r2, r4 ; lowvalue >> (24-offset)
|
||||
ldr r4, [r0, #vp9_writer_pos] ; w->pos
|
||||
lsl r2, r2, r6 ; lowvalue <<= offset
|
||||
mov r6, r3 ; shift = count
|
||||
add r1, r4, #1 ; w->pos++
|
||||
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
|
||||
str r1, [r0, #vp9_writer_pos]
|
||||
sub r3, r3, #8 ; count -= 8
|
||||
strb r7, [r9, r4] ; w->buffer[w->pos++]
|
||||
|
||||
token_count_lt_zero
|
||||
lsl r2, r2, r6 ; lowvalue <<= shift
|
||||
|
||||
str r2, [r0, #vp9_writer_lowvalue]
|
||||
str r5, [r0, #vp9_writer_range]
|
||||
str r3, [r0, #vp9_writer_count]
|
||||
pop {r4-r9, pc}
|
||||
ENDP
|
||||
|
||||
; r0 BOOL_CODER *br
|
||||
|vp8_stop_encode| PROC
|
||||
push {r4-r10, lr}
|
||||
|
||||
ldr r2, [r0, #vp9_writer_lowvalue]
|
||||
ldr r5, [r0, #vp9_writer_range]
|
||||
ldr r3, [r0, #vp9_writer_count]
|
||||
|
||||
mov r10, #32
|
||||
|
||||
stop_encode_loop
|
||||
sub r7, r5, #1 ; range-1
|
||||
|
||||
mov r4, r7, lsl #7 ; ((range-1) * 128)
|
||||
|
||||
mov r7, #1
|
||||
add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8)
|
||||
|
||||
; Counting the leading zeros is used to normalize range.
|
||||
clz r6, r4
|
||||
sub r6, r6, #24 ; shift
|
||||
|
||||
; Flag is set on the sum of count. This flag is used later
|
||||
; to determine if count >= 0
|
||||
adds r3, r3, r6 ; count += shift
|
||||
lsl r5, r4, r6 ; range <<= shift
|
||||
bmi token_count_lt_zero_se ; if(count >= 0)
|
||||
|
||||
sub r6, r6, r3 ; offset = shift - count
|
||||
sub r4, r6, #1 ; offset-1
|
||||
lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
|
||||
bpl token_high_bit_not_set_se
|
||||
|
||||
ldr r4, [r0, #vp9_writer_pos] ; x
|
||||
sub r4, r4, #1 ; x = w->pos-1
|
||||
b token_zero_while_start_se
|
||||
token_zero_while_loop_se
|
||||
mov r9, #0
|
||||
strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0
|
||||
sub r4, r4, #1 ; x--
|
||||
token_zero_while_start_se
|
||||
cmp r4, #0
|
||||
ldrge r7, [r0, #vp9_writer_buffer]
|
||||
ldrb r1, [r7, r4]
|
||||
cmpge r1, #0xff
|
||||
beq token_zero_while_loop_se
|
||||
|
||||
ldr r7, [r0, #vp9_writer_buffer]
|
||||
ldrb r9, [r7, r4] ; w->buffer[x]
|
||||
add r9, r9, #1
|
||||
strb r9, [r7, r4] ; w->buffer[x] + 1
|
||||
token_high_bit_not_set_se
|
||||
rsb r4, r6, #24 ; 24-offset
|
||||
ldr r9, [r0, #vp9_writer_buffer]
|
||||
lsr r7, r2, r4 ; lowvalue >> (24-offset)
|
||||
ldr r4, [r0, #vp9_writer_pos] ; w->pos
|
||||
lsl r2, r2, r6 ; lowvalue <<= offset
|
||||
mov r6, r3 ; shift = count
|
||||
add r1, r4, #1 ; w->pos++
|
||||
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
|
||||
str r1, [r0, #vp9_writer_pos]
|
||||
sub r3, r3, #8 ; count -= 8
|
||||
strb r7, [r9, r4] ; w->buffer[w->pos++]
|
||||
|
||||
token_count_lt_zero_se
|
||||
lsl r2, r2, r6 ; lowvalue <<= shift
|
||||
|
||||
subs r10, r10, #1
|
||||
bne stop_encode_loop
|
||||
|
||||
str r2, [r0, #vp9_writer_lowvalue]
|
||||
str r5, [r0, #vp9_writer_range]
|
||||
str r3, [r0, #vp9_writer_count]
|
||||
pop {r4-r10, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
; r0 BOOL_CODER *br
|
||||
; r1 int data
|
||||
; r2 int bits
|
||||
|vp8_encode_value| PROC
|
||||
push {r4-r11, lr}
|
||||
|
||||
mov r10, r2
|
||||
|
||||
ldr r2, [r0, #vp9_writer_lowvalue]
|
||||
ldr r5, [r0, #vp9_writer_range]
|
||||
ldr r3, [r0, #vp9_writer_count]
|
||||
|
||||
rsb r4, r10, #32 ; 32-n
|
||||
|
||||
; v is kept in r1 during the token pack loop
|
||||
lsl r1, r1, r4 ; r1 = v << 32 - n
|
||||
|
||||
encode_value_loop
|
||||
sub r7, r5, #1 ; range-1
|
||||
|
||||
; Decisions are made based on the bit value shifted
|
||||
; off of v, so set a flag here based on this.
|
||||
; This value is refered to as "bb"
|
||||
lsls r1, r1, #1 ; bit = v >> n
|
||||
mov r4, r7, lsl #7 ; ((range-1) * 128)
|
||||
|
||||
mov r7, #1
|
||||
add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8)
|
||||
|
||||
addcs r2, r2, r4 ; if (bit) lowvalue += split
|
||||
subcs r4, r5, r4 ; if (bit) range = range-split
|
||||
|
||||
; Counting the leading zeros is used to normalize range.
|
||||
clz r6, r4
|
||||
sub r6, r6, #24 ; shift
|
||||
|
||||
; Flag is set on the sum of count. This flag is used later
|
||||
; to determine if count >= 0
|
||||
adds r3, r3, r6 ; count += shift
|
||||
lsl r5, r4, r6 ; range <<= shift
|
||||
bmi token_count_lt_zero_ev ; if(count >= 0)
|
||||
|
||||
sub r6, r6, r3 ; offset = shift - count
|
||||
sub r4, r6, #1 ; offset-1
|
||||
lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
|
||||
bpl token_high_bit_not_set_ev
|
||||
|
||||
ldr r4, [r0, #vp9_writer_pos] ; x
|
||||
sub r4, r4, #1 ; x = w->pos-1
|
||||
b token_zero_while_start_ev
|
||||
token_zero_while_loop_ev
|
||||
mov r9, #0
|
||||
strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0
|
||||
sub r4, r4, #1 ; x--
|
||||
token_zero_while_start_ev
|
||||
cmp r4, #0
|
||||
ldrge r7, [r0, #vp9_writer_buffer]
|
||||
ldrb r11, [r7, r4]
|
||||
cmpge r11, #0xff
|
||||
beq token_zero_while_loop_ev
|
||||
|
||||
ldr r7, [r0, #vp9_writer_buffer]
|
||||
ldrb r9, [r7, r4] ; w->buffer[x]
|
||||
add r9, r9, #1
|
||||
strb r9, [r7, r4] ; w->buffer[x] + 1
|
||||
token_high_bit_not_set_ev
|
||||
rsb r4, r6, #24 ; 24-offset
|
||||
ldr r9, [r0, #vp9_writer_buffer]
|
||||
lsr r7, r2, r4 ; lowvalue >> (24-offset)
|
||||
ldr r4, [r0, #vp9_writer_pos] ; w->pos
|
||||
lsl r2, r2, r6 ; lowvalue <<= offset
|
||||
mov r6, r3 ; shift = count
|
||||
add r11, r4, #1 ; w->pos++
|
||||
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
|
||||
str r11, [r0, #vp9_writer_pos]
|
||||
sub r3, r3, #8 ; count -= 8
|
||||
strb r7, [r9, r4] ; w->buffer[w->pos++]
|
||||
|
||||
token_count_lt_zero_ev
|
||||
lsl r2, r2, r6 ; lowvalue <<= shift
|
||||
|
||||
subs r10, r10, #1
|
||||
bne encode_value_loop
|
||||
|
||||
str r2, [r0, #vp9_writer_lowvalue]
|
||||
str r5, [r0, #vp9_writer_range]
|
||||
str r3, [r0, #vp9_writer_count]
|
||||
pop {r4-r11, pc}
|
||||
ENDP
|
||||
|
||||
END
|
|
@ -1,291 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8cx_pack_tokens_armv5|
|
||||
|
||||
INCLUDE vp9_asm_enc_offsets.asm
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA |.text|, CODE, READONLY
|
||||
|
||||
; r0 vp9_writer *w
|
||||
; r1 const TOKENEXTRA *p
|
||||
; r2 int xcount
|
||||
; r3 vp8_coef_encodings
|
||||
; s0 vp8_extra_bits
|
||||
; s1 vp8_coef_tree
|
||||
|vp8cx_pack_tokens_armv5| PROC
|
||||
push {r4-r11, lr}
|
||||
|
||||
; Add size of xcount * sizeof (TOKENEXTRA) to get stop
|
||||
; sizeof (TOKENEXTRA) is 8
|
||||
sub sp, sp, #12
|
||||
add r2, r1, r2, lsl #3 ; stop = p + xcount*sizeof(TOKENEXTRA)
|
||||
str r2, [sp, #0]
|
||||
str r3, [sp, #8] ; save vp8_coef_encodings
|
||||
ldr r2, [r0, #vp9_writer_lowvalue]
|
||||
ldr r5, [r0, #vp9_writer_range]
|
||||
ldr r3, [r0, #vp9_writer_count]
|
||||
b check_p_lt_stop
|
||||
|
||||
while_p_lt_stop
|
||||
ldrb r6, [r1, #tokenextra_token] ; t
|
||||
ldr r4, [sp, #8] ; vp8_coef_encodings
|
||||
mov lr, #0
|
||||
add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
|
||||
ldr r9, [r1, #tokenextra_context_tree] ; pp
|
||||
|
||||
ldrb r7, [r1, #tokenextra_skip_eob_node]
|
||||
|
||||
ldr r6, [r4, #vp9_token_value] ; v
|
||||
ldr r8, [r4, #vp9_token_len] ; n
|
||||
|
||||
; vp8 specific skip_eob_node
|
||||
cmp r7, #0
|
||||
movne lr, #2 ; i = 2
|
||||
subne r8, r8, #1 ; --n
|
||||
|
||||
rsb r4, r8, #32 ; 32-n
|
||||
ldr r10, [sp, #52] ; vp8_coef_tree
|
||||
|
||||
; v is kept in r12 during the token pack loop
|
||||
lsl r12, r6, r4 ; r12 = v << 32 - n
|
||||
|
||||
; loop start
|
||||
token_loop
|
||||
ldrb r4, [r9, lr, asr #1] ; pp [i>>1]
|
||||
sub r7, r5, #1 ; range-1
|
||||
|
||||
; Decisions are made based on the bit value shifted
|
||||
; off of v, so set a flag here based on this.
|
||||
; This value is refered to as "bb"
|
||||
lsls r12, r12, #1 ; bb = v >> n
|
||||
mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
|
||||
|
||||
; bb can only be 0 or 1. So only execute this statement
|
||||
; if bb == 1, otherwise it will act like i + 0
|
||||
addcs lr, lr, #1 ; i + bb
|
||||
|
||||
mov r7, #1
|
||||
ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
|
||||
add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
|
||||
|
||||
addcs r2, r2, r4 ; if (bb) lowvalue += split
|
||||
subcs r4, r5, r4 ; if (bb) range = range-split
|
||||
|
||||
; Counting the leading zeros is used to normalize range.
|
||||
clz r6, r4
|
||||
sub r6, r6, #24 ; shift
|
||||
|
||||
; Flag is set on the sum of count. This flag is used later
|
||||
; to determine if count >= 0
|
||||
adds r3, r3, r6 ; count += shift
|
||||
lsl r5, r4, r6 ; range <<= shift
|
||||
bmi token_count_lt_zero ; if(count >= 0)
|
||||
|
||||
sub r6, r6, r3 ; offset = shift - count
|
||||
sub r4, r6, #1 ; offset-1
|
||||
lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
|
||||
bpl token_high_bit_not_set
|
||||
|
||||
ldr r4, [r0, #vp9_writer_pos] ; x
|
||||
sub r4, r4, #1 ; x = w->pos-1
|
||||
b token_zero_while_start
|
||||
token_zero_while_loop
|
||||
mov r10, #0
|
||||
strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
|
||||
sub r4, r4, #1 ; x--
|
||||
token_zero_while_start
|
||||
cmp r4, #0
|
||||
ldrge r7, [r0, #vp9_writer_buffer]
|
||||
ldrb r11, [r7, r4]
|
||||
cmpge r11, #0xff
|
||||
beq token_zero_while_loop
|
||||
|
||||
ldr r7, [r0, #vp9_writer_buffer]
|
||||
ldrb r10, [r7, r4] ; w->buffer[x]
|
||||
add r10, r10, #1
|
||||
strb r10, [r7, r4] ; w->buffer[x] + 1
|
||||
token_high_bit_not_set
|
||||
rsb r4, r6, #24 ; 24-offset
|
||||
ldr r10, [r0, #vp9_writer_buffer]
|
||||
lsr r7, r2, r4 ; lowvalue >> (24-offset)
|
||||
ldr r4, [r0, #vp9_writer_pos] ; w->pos
|
||||
lsl r2, r2, r6 ; lowvalue <<= offset
|
||||
mov r6, r3 ; shift = count
|
||||
add r11, r4, #1 ; w->pos++
|
||||
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
|
||||
str r11, [r0, #vp9_writer_pos]
|
||||
sub r3, r3, #8 ; count -= 8
|
||||
strb r7, [r10, r4] ; w->buffer[w->pos++]
|
||||
|
||||
; r10 is used earlier in the loop, but r10 is used as
|
||||
; temp variable here. So after r10 is used, reload
|
||||
; vp8_coef_tree_dcd into r10
|
||||
ldr r10, [sp, #52] ; vp8_coef_tree
|
||||
|
||||
token_count_lt_zero
|
||||
lsl r2, r2, r6 ; lowvalue <<= shift
|
||||
|
||||
subs r8, r8, #1 ; --n
|
||||
bne token_loop
|
||||
|
||||
ldrb r6, [r1, #tokenextra_token] ; t
|
||||
ldr r7, [sp, #48] ; vp8_extra_bits
|
||||
; Add t * sizeof (vp9_extra_bit_struct) to get the desired
|
||||
; element. Here vp9_extra_bit_struct == 16
|
||||
add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
|
||||
|
||||
ldr r4, [r12, #vp9_extra_bit_struct_base_val]
|
||||
cmp r4, #0
|
||||
beq skip_extra_bits
|
||||
|
||||
; if( b->base_val)
|
||||
ldr r8, [r12, #vp9_extra_bit_struct_len] ; L
|
||||
ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
|
||||
cmp r8, #0 ; if( L)
|
||||
beq no_extra_bits
|
||||
|
||||
ldr r9, [r12, #vp9_extra_bit_struct_prob]
|
||||
asr r7, lr, #1 ; v=e>>1
|
||||
|
||||
ldr r10, [r12, #vp9_extra_bit_struct_tree]
|
||||
str r10, [sp, #4] ; b->tree
|
||||
|
||||
rsb r4, r8, #32
|
||||
lsl r12, r7, r4
|
||||
|
||||
mov lr, #0 ; i = 0
|
||||
|
||||
extra_bits_loop
|
||||
ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
|
||||
sub r7, r5, #1 ; range-1
|
||||
lsls r12, r12, #1 ; v >> n
|
||||
mul r6, r4, r7 ; (range-1) * pp[i>>1]
|
||||
addcs lr, lr, #1 ; i + bb
|
||||
|
||||
mov r7, #1
|
||||
ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
|
||||
add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
|
||||
|
||||
addcs r2, r2, r4 ; if (bb) lowvalue += split
|
||||
subcs r4, r5, r4 ; if (bb) range = range-split
|
||||
|
||||
clz r6, r4
|
||||
sub r6, r6, #24
|
||||
|
||||
adds r3, r3, r6 ; count += shift
|
||||
lsl r5, r4, r6 ; range <<= shift
|
||||
bmi extra_count_lt_zero ; if(count >= 0)
|
||||
|
||||
sub r6, r6, r3 ; offset= shift - count
|
||||
sub r4, r6, #1 ; offset-1
|
||||
lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
|
||||
bpl extra_high_bit_not_set
|
||||
|
||||
ldr r4, [r0, #vp9_writer_pos] ; x
|
||||
sub r4, r4, #1 ; x = w->pos - 1
|
||||
b extra_zero_while_start
|
||||
extra_zero_while_loop
|
||||
mov r10, #0
|
||||
strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
|
||||
sub r4, r4, #1 ; x--
|
||||
extra_zero_while_start
|
||||
cmp r4, #0
|
||||
ldrge r7, [r0, #vp9_writer_buffer]
|
||||
ldrb r11, [r7, r4]
|
||||
cmpge r11, #0xff
|
||||
beq extra_zero_while_loop
|
||||
|
||||
ldr r7, [r0, #vp9_writer_buffer]
|
||||
ldrb r10, [r7, r4]
|
||||
add r10, r10, #1
|
||||
strb r10, [r7, r4]
|
||||
extra_high_bit_not_set
|
||||
rsb r4, r6, #24 ; 24-offset
|
||||
ldr r10, [r0, #vp9_writer_buffer]
|
||||
lsr r7, r2, r4 ; lowvalue >> (24-offset)
|
||||
ldr r4, [r0, #vp9_writer_pos]
|
||||
lsl r2, r2, r6 ; lowvalue <<= offset
|
||||
mov r6, r3 ; shift = count
|
||||
add r11, r4, #1 ; w->pos++
|
||||
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
|
||||
str r11, [r0, #vp9_writer_pos]
|
||||
sub r3, r3, #8 ; count -= 8
|
||||
strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
|
||||
ldr r10, [sp, #4] ; b->tree
|
||||
extra_count_lt_zero
|
||||
lsl r2, r2, r6
|
||||
|
||||
subs r8, r8, #1 ; --n
|
||||
bne extra_bits_loop ; while (n)
|
||||
|
||||
no_extra_bits
|
||||
ldr lr, [r1, #4] ; e = p->Extra
|
||||
add r4, r5, #1 ; range + 1
|
||||
tst lr, #1
|
||||
lsr r4, r4, #1 ; split = (range + 1) >> 1
|
||||
addne r2, r2, r4 ; lowvalue += split
|
||||
subne r4, r5, r4 ; range = range-split
|
||||
tst r2, #0x80000000 ; lowvalue & 0x80000000
|
||||
lsl r5, r4, #1 ; range <<= 1
|
||||
beq end_high_bit_not_set
|
||||
|
||||
ldr r4, [r0, #vp9_writer_pos]
|
||||
mov r7, #0
|
||||
sub r4, r4, #1
|
||||
b end_zero_while_start
|
||||
end_zero_while_loop
|
||||
strb r7, [r6, r4]
|
||||
sub r4, r4, #1 ; x--
|
||||
end_zero_while_start
|
||||
cmp r4, #0
|
||||
ldrge r6, [r0, #vp9_writer_buffer]
|
||||
ldrb r12, [r6, r4]
|
||||
cmpge r12, #0xff
|
||||
beq end_zero_while_loop
|
||||
|
||||
ldr r6, [r0, #vp9_writer_buffer]
|
||||
ldrb r7, [r6, r4]
|
||||
add r7, r7, #1
|
||||
strb r7, [r6, r4]
|
||||
end_high_bit_not_set
|
||||
adds r3, r3, #1 ; ++count
|
||||
lsl r2, r2, #1 ; lowvalue <<= 1
|
||||
bne end_count_zero
|
||||
|
||||
ldr r4, [r0, #vp9_writer_pos]
|
||||
mvn r3, #7
|
||||
ldr r7, [r0, #vp9_writer_buffer]
|
||||
lsr r6, r2, #24 ; lowvalue >> 24
|
||||
add r12, r4, #1 ; w->pos++
|
||||
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
|
||||
str r12, [r0, #0x10]
|
||||
strb r6, [r7, r4]
|
||||
end_count_zero
|
||||
skip_extra_bits
|
||||
add r1, r1, #TOKENEXTRA_SZ ; ++p
|
||||
check_p_lt_stop
|
||||
ldr r4, [sp, #0] ; stop
|
||||
cmp r1, r4 ; while( p < stop)
|
||||
bcc while_p_lt_stop
|
||||
|
||||
str r2, [r0, #vp9_writer_lowvalue]
|
||||
str r5, [r0, #vp9_writer_range]
|
||||
str r3, [r0, #vp9_writer_count]
|
||||
add sp, sp, #12
|
||||
pop {r4-r11, pc}
|
||||
ENDP
|
||||
|
||||
END
|
|
@ -1,327 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8cx_pack_mb_row_tokens_armv5|
|
||||
|
||||
INCLUDE vp9_asm_enc_offsets.asm
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA |.text|, CODE, READONLY
|
||||
|
||||
; r0 VP8_COMP *cpi
|
||||
; r1 vp9_writer *w
|
||||
; r2 vp8_coef_encodings
|
||||
; r3 vp8_extra_bits
|
||||
; s0 vp8_coef_tree
|
||||
|
||||
|vp8cx_pack_mb_row_tokens_armv5| PROC
|
||||
push {r4-r11, lr}
|
||||
sub sp, sp, #24
|
||||
|
||||
; Compute address of cpi->common.mb_rows
|
||||
ldr r4, _VP8_COMP_common_
|
||||
ldr r6, _VP8_COMMON_MBrows_
|
||||
add r4, r0, r4
|
||||
|
||||
ldr r5, [r4, r6] ; load up mb_rows
|
||||
|
||||
str r2, [sp, #20] ; save vp8_coef_encodings
|
||||
str r5, [sp, #12] ; save mb_rows
|
||||
str r3, [sp, #8] ; save vp8_extra_bits
|
||||
|
||||
ldr r4, _VP8_COMP_tplist_
|
||||
add r4, r0, r4
|
||||
ldr r7, [r4, #0] ; dereference cpi->tp_list
|
||||
|
||||
mov r0, r1 ; keep same as other loops
|
||||
|
||||
ldr r2, [r0, #vp9_writer_lowvalue]
|
||||
ldr r5, [r0, #vp9_writer_range]
|
||||
ldr r3, [r0, #vp9_writer_count]
|
||||
|
||||
mb_row_loop
|
||||
|
||||
ldr r1, [r7, #tokenlist_start]
|
||||
ldr r9, [r7, #tokenlist_stop]
|
||||
str r9, [sp, #0] ; save stop for later comparison
|
||||
str r7, [sp, #16] ; tokenlist address for next time
|
||||
|
||||
b check_p_lt_stop
|
||||
|
||||
; actuall work gets done here!
|
||||
|
||||
while_p_lt_stop
|
||||
ldrb r6, [r1, #tokenextra_token] ; t
|
||||
ldr r4, [sp, #20] ; vp8_coef_encodings
|
||||
mov lr, #0
|
||||
add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
|
||||
ldr r9, [r1, #tokenextra_context_tree] ; pp
|
||||
|
||||
ldrb r7, [r1, #tokenextra_skip_eob_node]
|
||||
|
||||
ldr r6, [r4, #vp9_token_value] ; v
|
||||
ldr r8, [r4, #vp9_token_len] ; n
|
||||
|
||||
; vp8 specific skip_eob_node
|
||||
cmp r7, #0
|
||||
movne lr, #2 ; i = 2
|
||||
subne r8, r8, #1 ; --n
|
||||
|
||||
rsb r4, r8, #32 ; 32-n
|
||||
ldr r10, [sp, #60] ; vp8_coef_tree
|
||||
|
||||
; v is kept in r12 during the token pack loop
|
||||
lsl r12, r6, r4 ; r12 = v << 32 - n
|
||||
|
||||
; loop start
|
||||
token_loop
|
||||
ldrb r4, [r9, lr, asr #1] ; pp [i>>1]
|
||||
sub r7, r5, #1 ; range-1
|
||||
|
||||
; Decisions are made based on the bit value shifted
|
||||
; off of v, so set a flag here based on this.
|
||||
; This value is refered to as "bb"
|
||||
lsls r12, r12, #1 ; bb = v >> n
|
||||
mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
|
||||
|
||||
; bb can only be 0 or 1. So only execute this statement
|
||||
; if bb == 1, otherwise it will act like i + 0
|
||||
addcs lr, lr, #1 ; i + bb
|
||||
|
||||
mov r7, #1
|
||||
ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
|
||||
add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
|
||||
|
||||
addcs r2, r2, r4 ; if (bb) lowvalue += split
|
||||
subcs r4, r5, r4 ; if (bb) range = range-split
|
||||
|
||||
; Counting the leading zeros is used to normalize range.
|
||||
clz r6, r4
|
||||
sub r6, r6, #24 ; shift
|
||||
|
||||
; Flag is set on the sum of count. This flag is used later
|
||||
; to determine if count >= 0
|
||||
adds r3, r3, r6 ; count += shift
|
||||
lsl r5, r4, r6 ; range <<= shift
|
||||
bmi token_count_lt_zero ; if(count >= 0)
|
||||
|
||||
sub r6, r6, r3 ; offset = shift - count
|
||||
sub r4, r6, #1 ; offset-1
|
||||
lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
|
||||
bpl token_high_bit_not_set
|
||||
|
||||
ldr r4, [r0, #vp9_writer_pos] ; x
|
||||
sub r4, r4, #1 ; x = w->pos-1
|
||||
b token_zero_while_start
|
||||
token_zero_while_loop
|
||||
mov r10, #0
|
||||
strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
|
||||
sub r4, r4, #1 ; x--
|
||||
token_zero_while_start
|
||||
cmp r4, #0
|
||||
ldrge r7, [r0, #vp9_writer_buffer]
|
||||
ldrb r11, [r7, r4]
|
||||
cmpge r11, #0xff
|
||||
beq token_zero_while_loop
|
||||
|
||||
ldr r7, [r0, #vp9_writer_buffer]
|
||||
ldrb r10, [r7, r4] ; w->buffer[x]
|
||||
add r10, r10, #1
|
||||
strb r10, [r7, r4] ; w->buffer[x] + 1
|
||||
token_high_bit_not_set
|
||||
rsb r4, r6, #24 ; 24-offset
|
||||
ldr r10, [r0, #vp9_writer_buffer]
|
||||
lsr r7, r2, r4 ; lowvalue >> (24-offset)
|
||||
ldr r4, [r0, #vp9_writer_pos] ; w->pos
|
||||
lsl r2, r2, r6 ; lowvalue <<= offset
|
||||
mov r6, r3 ; shift = count
|
||||
add r11, r4, #1 ; w->pos++
|
||||
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
|
||||
str r11, [r0, #vp9_writer_pos]
|
||||
sub r3, r3, #8 ; count -= 8
|
||||
strb r7, [r10, r4] ; w->buffer[w->pos++]
|
||||
|
||||
; r10 is used earlier in the loop, but r10 is used as
|
||||
; temp variable here. So after r10 is used, reload
|
||||
; vp8_coef_tree_dcd into r10
|
||||
ldr r10, [sp, #60] ; vp8_coef_tree
|
||||
|
||||
token_count_lt_zero
|
||||
lsl r2, r2, r6 ; lowvalue <<= shift
|
||||
|
||||
subs r8, r8, #1 ; --n
|
||||
bne token_loop
|
||||
|
||||
ldrb r6, [r1, #tokenextra_token] ; t
|
||||
ldr r7, [sp, #8] ; vp8_extra_bits
|
||||
; Add t * sizeof (vp9_extra_bit_struct) to get the desired
|
||||
; element. Here vp9_extra_bit_struct == 16
|
||||
add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
|
||||
|
||||
ldr r4, [r12, #vp9_extra_bit_struct_base_val]
|
||||
cmp r4, #0
|
||||
beq skip_extra_bits
|
||||
|
||||
; if( b->base_val)
|
||||
ldr r8, [r12, #vp9_extra_bit_struct_len] ; L
|
||||
ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
|
||||
cmp r8, #0 ; if( L)
|
||||
beq no_extra_bits
|
||||
|
||||
ldr r9, [r12, #vp9_extra_bit_struct_prob]
|
||||
asr r7, lr, #1 ; v=e>>1
|
||||
|
||||
ldr r10, [r12, #vp9_extra_bit_struct_tree]
|
||||
str r10, [sp, #4] ; b->tree
|
||||
|
||||
rsb r4, r8, #32
|
||||
lsl r12, r7, r4
|
||||
|
||||
mov lr, #0 ; i = 0
|
||||
|
||||
extra_bits_loop
|
||||
ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
|
||||
sub r7, r5, #1 ; range-1
|
||||
lsls r12, r12, #1 ; v >> n
|
||||
mul r6, r4, r7 ; (range-1) * pp[i>>1]
|
||||
addcs lr, lr, #1 ; i + bb
|
||||
|
||||
mov r7, #1
|
||||
ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
|
||||
add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
|
||||
|
||||
addcs r2, r2, r4 ; if (bb) lowvalue += split
|
||||
subcs r4, r5, r4 ; if (bb) range = range-split
|
||||
|
||||
clz r6, r4
|
||||
sub r6, r6, #24
|
||||
|
||||
adds r3, r3, r6 ; count += shift
|
||||
lsl r5, r4, r6 ; range <<= shift
|
||||
bmi extra_count_lt_zero ; if(count >= 0)
|
||||
|
||||
sub r6, r6, r3 ; offset= shift - count
|
||||
sub r4, r6, #1 ; offset-1
|
||||
lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
|
||||
bpl extra_high_bit_not_set
|
||||
|
||||
ldr r4, [r0, #vp9_writer_pos] ; x
|
||||
sub r4, r4, #1 ; x = w->pos - 1
|
||||
b extra_zero_while_start
|
||||
extra_zero_while_loop
|
||||
mov r10, #0
|
||||
strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
|
||||
sub r4, r4, #1 ; x--
|
||||
extra_zero_while_start
|
||||
cmp r4, #0
|
||||
ldrge r7, [r0, #vp9_writer_buffer]
|
||||
ldrb r11, [r7, r4]
|
||||
cmpge r11, #0xff
|
||||
beq extra_zero_while_loop
|
||||
|
||||
ldr r7, [r0, #vp9_writer_buffer]
|
||||
ldrb r10, [r7, r4]
|
||||
add r10, r10, #1
|
||||
strb r10, [r7, r4]
|
||||
extra_high_bit_not_set
|
||||
rsb r4, r6, #24 ; 24-offset
|
||||
ldr r10, [r0, #vp9_writer_buffer]
|
||||
lsr r7, r2, r4 ; lowvalue >> (24-offset)
|
||||
ldr r4, [r0, #vp9_writer_pos]
|
||||
lsl r2, r2, r6 ; lowvalue <<= offset
|
||||
mov r6, r3 ; shift = count
|
||||
add r11, r4, #1 ; w->pos++
|
||||
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
|
||||
str r11, [r0, #vp9_writer_pos]
|
||||
sub r3, r3, #8 ; count -= 8
|
||||
strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
|
||||
ldr r10, [sp, #4] ; b->tree
|
||||
extra_count_lt_zero
|
||||
lsl r2, r2, r6
|
||||
|
||||
subs r8, r8, #1 ; --n
|
||||
bne extra_bits_loop ; while (n)
|
||||
|
||||
no_extra_bits
|
||||
ldr lr, [r1, #4] ; e = p->Extra
|
||||
add r4, r5, #1 ; range + 1
|
||||
tst lr, #1
|
||||
lsr r4, r4, #1 ; split = (range + 1) >> 1
|
||||
addne r2, r2, r4 ; lowvalue += split
|
||||
subne r4, r5, r4 ; range = range-split
|
||||
tst r2, #0x80000000 ; lowvalue & 0x80000000
|
||||
lsl r5, r4, #1 ; range <<= 1
|
||||
beq end_high_bit_not_set
|
||||
|
||||
ldr r4, [r0, #vp9_writer_pos]
|
||||
mov r7, #0
|
||||
sub r4, r4, #1
|
||||
b end_zero_while_start
|
||||
end_zero_while_loop
|
||||
strb r7, [r6, r4]
|
||||
sub r4, r4, #1 ; x--
|
||||
end_zero_while_start
|
||||
cmp r4, #0
|
||||
ldrge r6, [r0, #vp9_writer_buffer]
|
||||
ldrb r12, [r6, r4]
|
||||
cmpge r12, #0xff
|
||||
beq end_zero_while_loop
|
||||
|
||||
ldr r6, [r0, #vp9_writer_buffer]
|
||||
ldrb r7, [r6, r4]
|
||||
add r7, r7, #1
|
||||
strb r7, [r6, r4]
|
||||
end_high_bit_not_set
|
||||
adds r3, r3, #1 ; ++count
|
||||
lsl r2, r2, #1 ; lowvalue <<= 1
|
||||
bne end_count_zero
|
||||
|
||||
ldr r4, [r0, #vp9_writer_pos]
|
||||
mvn r3, #7
|
||||
ldr r7, [r0, #vp9_writer_buffer]
|
||||
lsr r6, r2, #24 ; lowvalue >> 24
|
||||
add r12, r4, #1 ; w->pos++
|
||||
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
|
||||
str r12, [r0, #0x10]
|
||||
strb r6, [r7, r4]
|
||||
end_count_zero
|
||||
skip_extra_bits
|
||||
add r1, r1, #TOKENEXTRA_SZ ; ++p
|
||||
check_p_lt_stop
|
||||
ldr r4, [sp, #0] ; stop
|
||||
cmp r1, r4 ; while( p < stop)
|
||||
bcc while_p_lt_stop
|
||||
|
||||
ldr r6, [sp, #12] ; mb_rows
|
||||
ldr r7, [sp, #16] ; tokenlist address
|
||||
subs r6, r6, #1
|
||||
add r7, r7, #TOKENLIST_SZ ; next element in the array
|
||||
str r6, [sp, #12]
|
||||
bne mb_row_loop
|
||||
|
||||
str r2, [r0, #vp9_writer_lowvalue]
|
||||
str r5, [r0, #vp9_writer_range]
|
||||
str r3, [r0, #vp9_writer_count]
|
||||
add sp, sp, #24
|
||||
pop {r4-r11, pc}
|
||||
ENDP
|
||||
|
||||
_VP8_COMP_common_
|
||||
DCD vp8_comp_common
|
||||
_VP8_COMMON_MBrows_
|
||||
DCD vp8_common_mb_rows
|
||||
_VP8_COMP_tplist_
|
||||
DCD vp8_comp_tplist
|
||||
|
||||
END
|
|
@ -1,465 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8cx_pack_tokens_into_partitions_armv5|
|
||||
|
||||
INCLUDE vp9_asm_enc_offsets.asm
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA |.text|, CODE, READONLY
|
||||
|
||||
; r0 VP8_COMP *cpi
|
||||
; r1 unsigned char *cx_data
|
||||
; r2 int num_part
|
||||
; r3 *size
|
||||
; s0 vp8_coef_encodings
|
||||
; s1 vp8_extra_bits,
|
||||
; s2 const vp9_tree_index *,
|
||||
|
||||
|vp8cx_pack_tokens_into_partitions_armv5| PROC
|
||||
push {r4-r11, lr}
|
||||
sub sp, sp, #44
|
||||
|
||||
; Compute address of cpi->common.mb_rows
|
||||
ldr r4, _VP8_COMP_common_
|
||||
ldr r6, _VP8_COMMON_MBrows_
|
||||
add r4, r0, r4
|
||||
|
||||
ldr r5, [r4, r6] ; load up mb_rows
|
||||
|
||||
str r5, [sp, #36] ; save mb_rows
|
||||
str r1, [sp, #24] ; save cx_data
|
||||
str r2, [sp, #20] ; save num_part
|
||||
str r3, [sp, #8] ; save *size
|
||||
|
||||
; *size = 3*(num_part -1 );
|
||||
sub r2, r2, #1 ; num_part - 1
|
||||
add r2, r2, r2, lsl #1 ; 3*(num_part - 1)
|
||||
str r2, [r3]
|
||||
|
||||
add r2, r2, r1 ; cx_data + *size
|
||||
str r2, [sp, #40] ; ptr
|
||||
|
||||
ldr r4, _VP8_COMP_tplist_
|
||||
add r4, r0, r4
|
||||
ldr r7, [r4, #0] ; dereference cpi->tp_list
|
||||
str r7, [sp, #32] ; store start of cpi->tp_list
|
||||
|
||||
ldr r11, _VP8_COMP_bc2_ ; load up vp9_writer out of cpi
|
||||
add r0, r0, r11
|
||||
|
||||
mov r11, #0
|
||||
str r11, [sp, #28] ; i
|
||||
|
||||
numparts_loop
|
||||
ldr r10, [sp, #40] ; ptr
|
||||
ldr r5, [sp, #36] ; move mb_rows to the counting section
|
||||
sub r5, r5, r11 ; move start point with each partition
|
||||
; mb_rows starts at i
|
||||
str r5, [sp, #12]
|
||||
|
||||
; Reset all of the VP8 Writer data for each partition that
|
||||
; is processed.
|
||||
; start_encode
|
||||
mov r2, #0 ; vp9_writer_lowvalue
|
||||
mov r5, #255 ; vp9_writer_range
|
||||
mvn r3, #23 ; vp9_writer_count
|
||||
|
||||
str r2, [r0, #vp9_writer_value]
|
||||
str r2, [r0, #vp9_writer_pos]
|
||||
str r10, [r0, #vp9_writer_buffer]
|
||||
|
||||
mb_row_loop
|
||||
|
||||
ldr r1, [r7, #tokenlist_start]
|
||||
ldr r9, [r7, #tokenlist_stop]
|
||||
str r9, [sp, #0] ; save stop for later comparison
|
||||
str r7, [sp, #16] ; tokenlist address for next time
|
||||
|
||||
b check_p_lt_stop
|
||||
|
||||
; actual work gets done here!
|
||||
|
||||
while_p_lt_stop
|
||||
ldrb r6, [r1, #tokenextra_token] ; t
|
||||
ldr r4, [sp, #80] ; vp8_coef_encodings
|
||||
mov lr, #0
|
||||
add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
|
||||
ldr r9, [r1, #tokenextra_context_tree] ; pp
|
||||
|
||||
ldrb r7, [r1, #tokenextra_skip_eob_node]
|
||||
|
||||
ldr r6, [r4, #vp9_token_value] ; v
|
||||
ldr r8, [r4, #vp9_token_len] ; n
|
||||
|
||||
; vp8 specific skip_eob_node
|
||||
cmp r7, #0
|
||||
movne lr, #2 ; i = 2
|
||||
subne r8, r8, #1 ; --n
|
||||
|
||||
rsb r4, r8, #32 ; 32-n
|
||||
ldr r10, [sp, #88] ; vp8_coef_tree
|
||||
|
||||
; v is kept in r12 during the token pack loop
|
||||
lsl r12, r6, r4 ; r12 = v << 32 - n
|
||||
|
||||
; loop start
|
||||
token_loop
|
||||
ldrb r4, [r9, lr, asr #1] ; pp [i>>1]
|
||||
sub r7, r5, #1 ; range-1
|
||||
|
||||
; Decisions are made based on the bit value shifted
|
||||
; off of v, so set a flag here based on this.
|
||||
; This value is refered to as "bb"
|
||||
lsls r12, r12, #1 ; bb = v >> n
|
||||
mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
|
||||
|
||||
; bb can only be 0 or 1. So only execute this statement
|
||||
; if bb == 1, otherwise it will act like i + 0
|
||||
addcs lr, lr, #1 ; i + bb
|
||||
|
||||
mov r7, #1
|
||||
ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
|
||||
add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
|
||||
|
||||
addcs r2, r2, r4 ; if (bb) lowvalue += split
|
||||
subcs r4, r5, r4 ; if (bb) range = range-split
|
||||
|
||||
; Counting the leading zeros is used to normalize range.
|
||||
clz r6, r4
|
||||
sub r6, r6, #24 ; shift
|
||||
|
||||
; Flag is set on the sum of count. This flag is used later
|
||||
; to determine if count >= 0
|
||||
adds r3, r3, r6 ; count += shift
|
||||
lsl r5, r4, r6 ; range <<= shift
|
||||
bmi token_count_lt_zero ; if(count >= 0)
|
||||
|
||||
sub r6, r6, r3 ; offset = shift - count
|
||||
sub r4, r6, #1 ; offset-1
|
||||
lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
|
||||
bpl token_high_bit_not_set
|
||||
|
||||
ldr r4, [r0, #vp9_writer_pos] ; x
|
||||
sub r4, r4, #1 ; x = w->pos-1
|
||||
b token_zero_while_start
|
||||
token_zero_while_loop
|
||||
mov r10, #0
|
||||
strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
|
||||
sub r4, r4, #1 ; x--
|
||||
token_zero_while_start
|
||||
cmp r4, #0
|
||||
ldrge r7, [r0, #vp9_writer_buffer]
|
||||
ldrb r11, [r7, r4]
|
||||
cmpge r11, #0xff
|
||||
beq token_zero_while_loop
|
||||
|
||||
ldr r7, [r0, #vp9_writer_buffer]
|
||||
ldrb r10, [r7, r4] ; w->buffer[x]
|
||||
add r10, r10, #1
|
||||
strb r10, [r7, r4] ; w->buffer[x] + 1
|
||||
token_high_bit_not_set
|
||||
rsb r4, r6, #24 ; 24-offset
|
||||
ldr r10, [r0, #vp9_writer_buffer]
|
||||
lsr r7, r2, r4 ; lowvalue >> (24-offset)
|
||||
ldr r4, [r0, #vp9_writer_pos] ; w->pos
|
||||
lsl r2, r2, r6 ; lowvalue <<= offset
|
||||
mov r6, r3 ; shift = count
|
||||
add r11, r4, #1 ; w->pos++
|
||||
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
|
||||
str r11, [r0, #vp9_writer_pos]
|
||||
sub r3, r3, #8 ; count -= 8
|
||||
strb r7, [r10, r4] ; w->buffer[w->pos++]
|
||||
|
||||
; r10 is used earlier in the loop, but r10 is used as
|
||||
; temp variable here. So after r10 is used, reload
|
||||
; vp8_coef_tree_dcd into r10
|
||||
ldr r10, [sp, #88] ; vp8_coef_tree
|
||||
|
||||
token_count_lt_zero
|
||||
lsl r2, r2, r6 ; lowvalue <<= shift
|
||||
|
||||
subs r8, r8, #1 ; --n
|
||||
bne token_loop
|
||||
|
||||
ldrb r6, [r1, #tokenextra_token] ; t
|
||||
ldr r7, [sp, #84] ; vp8_extra_bits
|
||||
; Add t * sizeof (vp9_extra_bit_struct) to get the desired
|
||||
; element. Here vp9_extra_bit_struct == 16
|
||||
add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
|
||||
|
||||
ldr r4, [r12, #vp9_extra_bit_struct_base_val]
|
||||
cmp r4, #0
|
||||
beq skip_extra_bits
|
||||
|
||||
; if( b->base_val)
|
||||
ldr r8, [r12, #vp9_extra_bit_struct_len] ; L
|
||||
ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
|
||||
cmp r8, #0 ; if( L)
|
||||
beq no_extra_bits
|
||||
|
||||
ldr r9, [r12, #vp9_extra_bit_struct_prob]
|
||||
asr r7, lr, #1 ; v=e>>1
|
||||
|
||||
ldr r10, [r12, #vp9_extra_bit_struct_tree]
|
||||
str r10, [sp, #4] ; b->tree
|
||||
|
||||
rsb r4, r8, #32
|
||||
lsl r12, r7, r4
|
||||
|
||||
mov lr, #0 ; i = 0
|
||||
|
||||
extra_bits_loop
|
||||
ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
|
||||
sub r7, r5, #1 ; range-1
|
||||
lsls r12, r12, #1 ; v >> n
|
||||
mul r6, r4, r7 ; (range-1) * pp[i>>1]
|
||||
addcs lr, lr, #1 ; i + bb
|
||||
|
||||
mov r7, #1
|
||||
ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
|
||||
add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
|
||||
|
||||
addcs r2, r2, r4 ; if (bb) lowvalue += split
|
||||
subcs r4, r5, r4 ; if (bb) range = range-split
|
||||
|
||||
clz r6, r4
|
||||
sub r6, r6, #24
|
||||
|
||||
adds r3, r3, r6 ; count += shift
|
||||
lsl r5, r4, r6 ; range <<= shift
|
||||
bmi extra_count_lt_zero ; if(count >= 0)
|
||||
|
||||
sub r6, r6, r3 ; offset= shift - count
|
||||
sub r4, r6, #1 ; offset-1
|
||||
lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
|
||||
bpl extra_high_bit_not_set
|
||||
|
||||
ldr r4, [r0, #vp9_writer_pos] ; x
|
||||
sub r4, r4, #1 ; x = w->pos - 1
|
||||
b extra_zero_while_start
|
||||
extra_zero_while_loop
|
||||
mov r10, #0
|
||||
strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
|
||||
sub r4, r4, #1 ; x--
|
||||
extra_zero_while_start
|
||||
cmp r4, #0
|
||||
ldrge r7, [r0, #vp9_writer_buffer]
|
||||
ldrb r11, [r7, r4]
|
||||
cmpge r11, #0xff
|
||||
beq extra_zero_while_loop
|
||||
|
||||
ldr r7, [r0, #vp9_writer_buffer]
|
||||
ldrb r10, [r7, r4]
|
||||
add r10, r10, #1
|
||||
strb r10, [r7, r4]
|
||||
extra_high_bit_not_set
|
||||
rsb r4, r6, #24 ; 24-offset
|
||||
ldr r10, [r0, #vp9_writer_buffer]
|
||||
lsr r7, r2, r4 ; lowvalue >> (24-offset)
|
||||
ldr r4, [r0, #vp9_writer_pos]
|
||||
lsl r2, r2, r6 ; lowvalue <<= offset
|
||||
mov r6, r3 ; shift = count
|
||||
add r11, r4, #1 ; w->pos++
|
||||
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
|
||||
str r11, [r0, #vp9_writer_pos]
|
||||
sub r3, r3, #8 ; count -= 8
|
||||
strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
|
||||
ldr r10, [sp, #4] ; b->tree
|
||||
extra_count_lt_zero
|
||||
lsl r2, r2, r6
|
||||
|
||||
subs r8, r8, #1 ; --n
|
||||
bne extra_bits_loop ; while (n)
|
||||
|
||||
no_extra_bits
|
||||
ldr lr, [r1, #4] ; e = p->Extra
|
||||
add r4, r5, #1 ; range + 1
|
||||
tst lr, #1
|
||||
lsr r4, r4, #1 ; split = (range + 1) >> 1
|
||||
addne r2, r2, r4 ; lowvalue += split
|
||||
subne r4, r5, r4 ; range = range-split
|
||||
tst r2, #0x80000000 ; lowvalue & 0x80000000
|
||||
lsl r5, r4, #1 ; range <<= 1
|
||||
beq end_high_bit_not_set
|
||||
|
||||
ldr r4, [r0, #vp9_writer_pos]
|
||||
mov r7, #0
|
||||
sub r4, r4, #1
|
||||
b end_zero_while_start
|
||||
end_zero_while_loop
|
||||
strb r7, [r6, r4]
|
||||
sub r4, r4, #1 ; x--
|
||||
end_zero_while_start
|
||||
cmp r4, #0
|
||||
ldrge r6, [r0, #vp9_writer_buffer]
|
||||
ldrb r12, [r6, r4]
|
||||
cmpge r12, #0xff
|
||||
beq end_zero_while_loop
|
||||
|
||||
ldr r6, [r0, #vp9_writer_buffer]
|
||||
ldrb r7, [r6, r4]
|
||||
add r7, r7, #1
|
||||
strb r7, [r6, r4]
|
||||
end_high_bit_not_set
|
||||
adds r3, r3, #1 ; ++count
|
||||
lsl r2, r2, #1 ; lowvalue <<= 1
|
||||
bne end_count_zero
|
||||
|
||||
ldr r4, [r0, #vp9_writer_pos]
|
||||
mvn r3, #7
|
||||
ldr r7, [r0, #vp9_writer_buffer]
|
||||
lsr r6, r2, #24 ; lowvalue >> 24
|
||||
add r12, r4, #1 ; w->pos++
|
||||
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
|
||||
str r12, [r0, #0x10]
|
||||
strb r6, [r7, r4]
|
||||
end_count_zero
|
||||
skip_extra_bits
|
||||
add r1, r1, #TOKENEXTRA_SZ ; ++p
|
||||
check_p_lt_stop
|
||||
ldr r4, [sp, #0] ; stop
|
||||
cmp r1, r4 ; while( p < stop)
|
||||
bcc while_p_lt_stop
|
||||
|
||||
ldr r10, [sp, #20] ; num_parts
|
||||
mov r1, #TOKENLIST_SZ
|
||||
mul r1, r10, r1
|
||||
|
||||
ldr r6, [sp, #12] ; mb_rows
|
||||
ldr r7, [sp, #16] ; tokenlist address
|
||||
subs r6, r6, r10
|
||||
add r7, r7, r1 ; next element in the array
|
||||
str r6, [sp, #12]
|
||||
bgt mb_row_loop
|
||||
|
||||
mov r12, #32
|
||||
|
||||
stop_encode_loop
|
||||
sub r7, r5, #1 ; range-1
|
||||
|
||||
mov r4, r7, lsl #7 ; ((range-1) * 128)
|
||||
|
||||
mov r7, #1
|
||||
add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8)
|
||||
|
||||
; Counting the leading zeros is used to normalize range.
|
||||
clz r6, r4
|
||||
sub r6, r6, #24 ; shift
|
||||
|
||||
; Flag is set on the sum of count. This flag is used later
|
||||
; to determine if count >= 0
|
||||
adds r3, r3, r6 ; count += shift
|
||||
lsl r5, r4, r6 ; range <<= shift
|
||||
bmi token_count_lt_zero_se ; if(count >= 0)
|
||||
|
||||
sub r6, r6, r3 ; offset = shift - count
|
||||
sub r4, r6, #1 ; offset-1
|
||||
lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
|
||||
bpl token_high_bit_not_set_se
|
||||
|
||||
ldr r4, [r0, #vp9_writer_pos] ; x
|
||||
sub r4, r4, #1 ; x = w->pos-1
|
||||
b token_zero_while_start_se
|
||||
token_zero_while_loop_se
|
||||
mov r10, #0
|
||||
strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
|
||||
sub r4, r4, #1 ; x--
|
||||
token_zero_while_start_se
|
||||
cmp r4, #0
|
||||
ldrge r7, [r0, #vp9_writer_buffer]
|
||||
ldrb r11, [r7, r4]
|
||||
cmpge r11, #0xff
|
||||
beq token_zero_while_loop_se
|
||||
|
||||
ldr r7, [r0, #vp9_writer_buffer]
|
||||
ldrb r10, [r7, r4] ; w->buffer[x]
|
||||
add r10, r10, #1
|
||||
strb r10, [r7, r4] ; w->buffer[x] + 1
|
||||
token_high_bit_not_set_se
|
||||
rsb r4, r6, #24 ; 24-offset
|
||||
ldr r10, [r0, #vp9_writer_buffer]
|
||||
lsr r7, r2, r4 ; lowvalue >> (24-offset)
|
||||
ldr r4, [r0, #vp9_writer_pos] ; w->pos
|
||||
lsl r2, r2, r6 ; lowvalue <<= offset
|
||||
mov r6, r3 ; shift = count
|
||||
add r11, r4, #1 ; w->pos++
|
||||
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
|
||||
str r11, [r0, #vp9_writer_pos]
|
||||
sub r3, r3, #8 ; count -= 8
|
||||
strb r7, [r10, r4] ; w->buffer[w->pos++]
|
||||
|
||||
token_count_lt_zero_se
|
||||
lsl r2, r2, r6 ; lowvalue <<= shift
|
||||
|
||||
subs r12, r12, #1
|
||||
bne stop_encode_loop
|
||||
|
||||
ldr r10, [sp, #8] ; *size
|
||||
ldr r11, [r10]
|
||||
ldr r4, [r0, #vp9_writer_pos] ; w->pos
|
||||
add r11, r11, r4 ; *size += w->pos
|
||||
str r11, [r10]
|
||||
|
||||
ldr r9, [sp, #20] ; num_parts
|
||||
sub r9, r9, #1
|
||||
ldr r10, [sp, #28] ; i
|
||||
cmp r10, r9 ; if(i<(num_part - 1))
|
||||
bge skip_write_partition
|
||||
|
||||
ldr r12, [sp, #40] ; ptr
|
||||
add r12, r12, r4 ; ptr += w->pos
|
||||
str r12, [sp, #40]
|
||||
|
||||
ldr r9, [sp, #24] ; cx_data
|
||||
mov r8, r4, asr #8
|
||||
strb r4, [r9, #0]
|
||||
strb r8, [r9, #1]
|
||||
mov r4, r4, asr #16
|
||||
strb r4, [r9, #2]
|
||||
|
||||
add r9, r9, #3 ; cx_data += 3
|
||||
str r9, [sp, #24]
|
||||
|
||||
skip_write_partition
|
||||
|
||||
ldr r11, [sp, #28] ; i
|
||||
ldr r10, [sp, #20] ; num_parts
|
||||
|
||||
add r11, r11, #1 ; i++
|
||||
str r11, [sp, #28]
|
||||
|
||||
ldr r7, [sp, #32] ; cpi->tp_list[i]
|
||||
mov r1, #TOKENLIST_SZ
|
||||
add r7, r7, r1 ; next element in cpi->tp_list
|
||||
str r7, [sp, #32] ; cpi->tp_list[i+1]
|
||||
|
||||
cmp r10, r11
|
||||
bgt numparts_loop
|
||||
|
||||
|
||||
add sp, sp, #44
|
||||
pop {r4-r11, pc}
|
||||
ENDP
|
||||
|
||||
_VP8_COMP_common_
|
||||
DCD vp8_comp_common
|
||||
_VP8_COMMON_MBrows_
|
||||
DCD vp8_common_mb_rows
|
||||
_VP8_COMP_tplist_
|
||||
DCD vp8_comp_tplist
|
||||
_VP8_COMP_bc2_
|
||||
DCD vp8_comp_bc2
|
||||
|
||||
END
|
|
@ -1,223 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_fast_quantize_b_armv6|
|
||||
|
||||
INCLUDE vp9_asm_enc_offsets.asm
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; r0 BLOCK *b
|
||||
; r1 BLOCKD *d
|
||||
|vp8_fast_quantize_b_armv6| PROC
|
||||
stmfd sp!, {r1, r4-r11, lr}
|
||||
|
||||
ldr r3, [r0, #vp8_block_coeff] ; coeff
|
||||
ldr r4, [r0, #vp8_block_quant_fast] ; quant_fast
|
||||
ldr r5, [r0, #vp8_block_round] ; round
|
||||
ldr r6, [r1, #vp8_blockd_qcoeff] ; qcoeff
|
||||
ldr r7, [r1, #vp8_blockd_dqcoeff] ; dqcoeff
|
||||
ldr r8, [r1, #vp8_blockd_dequant] ; dequant
|
||||
|
||||
ldr r2, loop_count ; loop_count=0x1000000. 'lsls' instruction
|
||||
; is used to update the counter so that
|
||||
; it can be used to mark nonzero
|
||||
; quantized coefficient pairs.
|
||||
|
||||
mov r1, #0 ; flags for quantized coeffs
|
||||
|
||||
; PART 1: quantization and dequantization loop
|
||||
loop
|
||||
ldr r9, [r3], #4 ; [z1 | z0]
|
||||
ldr r10, [r5], #4 ; [r1 | r0]
|
||||
ldr r11, [r4], #4 ; [q1 | q0]
|
||||
|
||||
ssat16 lr, #1, r9 ; [sz1 | sz0]
|
||||
eor r9, r9, lr ; [z1 ^ sz1 | z0 ^ sz0]
|
||||
ssub16 r9, r9, lr ; x = (z ^ sz) - sz
|
||||
sadd16 r9, r9, r10 ; [x1+r1 | x0+r0]
|
||||
|
||||
ldr r12, [r3], #4 ; [z3 | z2]
|
||||
|
||||
smulbb r0, r9, r11 ; [(x0+r0)*q0]
|
||||
smultt r9, r9, r11 ; [(x1+r1)*q1]
|
||||
|
||||
ldr r10, [r5], #4 ; [r3 | r2]
|
||||
|
||||
ssat16 r11, #1, r12 ; [sz3 | sz2]
|
||||
eor r12, r12, r11 ; [z3 ^ sz3 | z2 ^ sz2]
|
||||
pkhtb r0, r9, r0, asr #16 ; [y1 | y0]
|
||||
ldr r9, [r4], #4 ; [q3 | q2]
|
||||
ssub16 r12, r12, r11 ; x = (z ^ sz) - sz
|
||||
|
||||
sadd16 r12, r12, r10 ; [x3+r3 | x2+r2]
|
||||
|
||||
eor r0, r0, lr ; [(y1 ^ sz1) | (y0 ^ sz0)]
|
||||
|
||||
smulbb r10, r12, r9 ; [(x2+r2)*q2]
|
||||
smultt r12, r12, r9 ; [(x3+r3)*q3]
|
||||
|
||||
ssub16 r0, r0, lr ; x = (y ^ sz) - sz
|
||||
|
||||
cmp r0, #0 ; check if zero
|
||||
orrne r1, r1, r2, lsr #24 ; add flag for nonzero coeffs
|
||||
|
||||
str r0, [r6], #4 ; *qcoeff++ = x
|
||||
ldr r9, [r8], #4 ; [dq1 | dq0]
|
||||
|
||||
pkhtb r10, r12, r10, asr #16 ; [y3 | y2]
|
||||
eor r10, r10, r11 ; [(y3 ^ sz3) | (y2 ^ sz2)]
|
||||
ssub16 r10, r10, r11 ; x = (y ^ sz) - sz
|
||||
|
||||
cmp r10, #0 ; check if zero
|
||||
orrne r1, r1, r2, lsr #23 ; add flag for nonzero coeffs
|
||||
|
||||
str r10, [r6], #4 ; *qcoeff++ = x
|
||||
ldr r11, [r8], #4 ; [dq3 | dq2]
|
||||
|
||||
smulbb r12, r0, r9 ; [x0*dq0]
|
||||
smultt r0, r0, r9 ; [x1*dq1]
|
||||
|
||||
smulbb r9, r10, r11 ; [x2*dq2]
|
||||
smultt r10, r10, r11 ; [x3*dq3]
|
||||
|
||||
lsls r2, r2, #2 ; update loop counter
|
||||
strh r12, [r7, #0] ; dqcoeff[0] = [x0*dq0]
|
||||
strh r0, [r7, #2] ; dqcoeff[1] = [x1*dq1]
|
||||
strh r9, [r7, #4] ; dqcoeff[2] = [x2*dq2]
|
||||
strh r10, [r7, #6] ; dqcoeff[3] = [x3*dq3]
|
||||
add r7, r7, #8 ; dqcoeff += 8
|
||||
bne loop
|
||||
|
||||
; PART 2: check position for eob...
|
||||
mov lr, #0 ; init eob
|
||||
cmp r1, #0 ; coeffs after quantization?
|
||||
ldr r11, [sp, #0] ; restore BLOCKD pointer
|
||||
beq end ; skip eob calculations if all zero
|
||||
|
||||
ldr r0, [r11, #vp8_blockd_qcoeff]
|
||||
|
||||
; check shortcut for nonzero qcoeffs
|
||||
tst r1, #0x80
|
||||
bne quant_coeff_15_14
|
||||
tst r1, #0x20
|
||||
bne quant_coeff_13_11
|
||||
tst r1, #0x8
|
||||
bne quant_coeff_12_7
|
||||
tst r1, #0x40
|
||||
bne quant_coeff_10_9
|
||||
tst r1, #0x10
|
||||
bne quant_coeff_8_3
|
||||
tst r1, #0x2
|
||||
bne quant_coeff_6_5
|
||||
tst r1, #0x4
|
||||
bne quant_coeff_4_2
|
||||
b quant_coeff_1_0
|
||||
|
||||
quant_coeff_15_14
|
||||
ldrh r2, [r0, #30] ; rc=15, i=15
|
||||
mov lr, #16
|
||||
cmp r2, #0
|
||||
bne end
|
||||
|
||||
ldrh r3, [r0, #28] ; rc=14, i=14
|
||||
mov lr, #15
|
||||
cmp r3, #0
|
||||
bne end
|
||||
|
||||
quant_coeff_13_11
|
||||
ldrh r2, [r0, #22] ; rc=11, i=13
|
||||
mov lr, #14
|
||||
cmp r2, #0
|
||||
bne end
|
||||
|
||||
quant_coeff_12_7
|
||||
ldrh r3, [r0, #14] ; rc=7, i=12
|
||||
mov lr, #13
|
||||
cmp r3, #0
|
||||
bne end
|
||||
|
||||
ldrh r2, [r0, #20] ; rc=10, i=11
|
||||
mov lr, #12
|
||||
cmp r2, #0
|
||||
bne end
|
||||
|
||||
quant_coeff_10_9
|
||||
ldrh r3, [r0, #26] ; rc=13, i=10
|
||||
mov lr, #11
|
||||
cmp r3, #0
|
||||
bne end
|
||||
|
||||
ldrh r2, [r0, #24] ; rc=12, i=9
|
||||
mov lr, #10
|
||||
cmp r2, #0
|
||||
bne end
|
||||
|
||||
quant_coeff_8_3
|
||||
ldrh r3, [r0, #18] ; rc=9, i=8
|
||||
mov lr, #9
|
||||
cmp r3, #0
|
||||
bne end
|
||||
|
||||
ldrh r2, [r0, #12] ; rc=6, i=7
|
||||
mov lr, #8
|
||||
cmp r2, #0
|
||||
bne end
|
||||
|
||||
quant_coeff_6_5
|
||||
ldrh r3, [r0, #6] ; rc=3, i=6
|
||||
mov lr, #7
|
||||
cmp r3, #0
|
||||
bne end
|
||||
|
||||
ldrh r2, [r0, #4] ; rc=2, i=5
|
||||
mov lr, #6
|
||||
cmp r2, #0
|
||||
bne end
|
||||
|
||||
quant_coeff_4_2
|
||||
ldrh r3, [r0, #10] ; rc=5, i=4
|
||||
mov lr, #5
|
||||
cmp r3, #0
|
||||
bne end
|
||||
|
||||
ldrh r2, [r0, #16] ; rc=8, i=3
|
||||
mov lr, #4
|
||||
cmp r2, #0
|
||||
bne end
|
||||
|
||||
ldrh r3, [r0, #8] ; rc=4, i=2
|
||||
mov lr, #3
|
||||
cmp r3, #0
|
||||
bne end
|
||||
|
||||
quant_coeff_1_0
|
||||
ldrh r2, [r0, #2] ; rc=1, i=1
|
||||
mov lr, #2
|
||||
cmp r2, #0
|
||||
bne end
|
||||
|
||||
mov lr, #1 ; rc=0, i=0
|
||||
|
||||
end
|
||||
str lr, [r11, #vp8_blockd_eob]
|
||||
ldmfd sp!, {r1, r4-r11, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
loop_count
|
||||
DCD 0x1000000
|
||||
|
||||
END
|
|
@ -1,138 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_mse16x16_armv6|
|
||||
|
||||
ARM
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; r0 unsigned char *src_ptr
|
||||
; r1 int source_stride
|
||||
; r2 unsigned char *ref_ptr
|
||||
; r3 int recon_stride
|
||||
; stack unsigned int *sse
|
||||
;
|
||||
;note: Based on vp9_variance16x16_armv6. In this function, sum is never used.
|
||||
; So, we can remove this part of calculation.
|
||||
|
||||
|vp8_mse16x16_armv6| PROC
|
||||
|
||||
push {r4-r9, lr}
|
||||
|
||||
pld [r0, r1, lsl #0]
|
||||
pld [r2, r3, lsl #0]
|
||||
|
||||
mov r12, #16 ; set loop counter to 16 (=block height)
|
||||
mov r4, #0 ; initialize sse = 0
|
||||
|
||||
loop
|
||||
; 1st 4 pixels
|
||||
ldr r5, [r0, #0x0] ; load 4 src pixels
|
||||
ldr r6, [r2, #0x0] ; load 4 ref pixels
|
||||
|
||||
mov lr, #0 ; constant zero
|
||||
|
||||
usub8 r8, r5, r6 ; calculate difference
|
||||
pld [r0, r1, lsl #1]
|
||||
sel r7, r8, lr ; select bytes with positive difference
|
||||
usub8 r9, r6, r5 ; calculate difference with reversed operands
|
||||
pld [r2, r3, lsl #1]
|
||||
sel r8, r9, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r5, r7, lr ; calculate sum of positive differences
|
||||
usad8 r6, r8, lr ; calculate sum of negative differences
|
||||
orr r8, r8, r7 ; differences of all 4 pixels
|
||||
|
||||
ldr r5, [r0, #0x4] ; load 4 src pixels
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r6, r8 ; byte (two pixels) to halfwords
|
||||
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
|
||||
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
|
||||
|
||||
; 2nd 4 pixels
|
||||
ldr r6, [r2, #0x4] ; load 4 ref pixels
|
||||
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
usub8 r8, r5, r6 ; calculate difference
|
||||
sel r7, r8, lr ; select bytes with positive difference
|
||||
usub8 r9, r6, r5 ; calculate difference with reversed operands
|
||||
sel r8, r9, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r5, r7, lr ; calculate sum of positive differences
|
||||
usad8 r6, r8, lr ; calculate sum of negative differences
|
||||
orr r8, r8, r7 ; differences of all 4 pixels
|
||||
ldr r5, [r0, #0x8] ; load 4 src pixels
|
||||
; calculate sse
|
||||
uxtb16 r6, r8 ; byte (two pixels) to halfwords
|
||||
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
|
||||
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
|
||||
|
||||
; 3rd 4 pixels
|
||||
ldr r6, [r2, #0x8] ; load 4 ref pixels
|
||||
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
usub8 r8, r5, r6 ; calculate difference
|
||||
sel r7, r8, lr ; select bytes with positive difference
|
||||
usub8 r9, r6, r5 ; calculate difference with reversed operands
|
||||
sel r8, r9, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r5, r7, lr ; calculate sum of positive differences
|
||||
usad8 r6, r8, lr ; calculate sum of negative differences
|
||||
orr r8, r8, r7 ; differences of all 4 pixels
|
||||
|
||||
ldr r5, [r0, #0xc] ; load 4 src pixels
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r6, r8 ; byte (two pixels) to halfwords
|
||||
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
|
||||
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
|
||||
|
||||
; 4th 4 pixels
|
||||
ldr r6, [r2, #0xc] ; load 4 ref pixels
|
||||
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
usub8 r8, r5, r6 ; calculate difference
|
||||
add r0, r0, r1 ; set src_ptr to next row
|
||||
sel r7, r8, lr ; select bytes with positive difference
|
||||
usub8 r9, r6, r5 ; calculate difference with reversed operands
|
||||
add r2, r2, r3 ; set dst_ptr to next row
|
||||
sel r8, r9, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r5, r7, lr ; calculate sum of positive differences
|
||||
usad8 r6, r8, lr ; calculate sum of negative differences
|
||||
orr r8, r8, r7 ; differences of all 4 pixels
|
||||
|
||||
subs r12, r12, #1 ; next row
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r6, r8 ; byte (two pixels) to halfwords
|
||||
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
|
||||
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
|
||||
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
bne loop
|
||||
|
||||
; return stuff
|
||||
ldr r1, [sp, #28] ; get address of sse
|
||||
mov r0, r4 ; return sse
|
||||
str r4, [r1] ; store sse
|
||||
|
||||
pop {r4-r9, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
END
|
|
@ -1,95 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_sad16x16_armv6|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; r0 const unsigned char *src_ptr
|
||||
; r1 int src_stride
|
||||
; r2 const unsigned char *ref_ptr
|
||||
; r3 int ref_stride
|
||||
; stack max_sad (not used)
|
||||
|vp8_sad16x16_armv6| PROC
|
||||
stmfd sp!, {r4-r12, lr}
|
||||
|
||||
pld [r0, r1, lsl #0]
|
||||
pld [r2, r3, lsl #0]
|
||||
pld [r0, r1, lsl #1]
|
||||
pld [r2, r3, lsl #1]
|
||||
|
||||
mov r4, #0 ; sad = 0;
|
||||
mov r5, #8 ; loop count
|
||||
|
||||
loop
|
||||
; 1st row
|
||||
ldr r6, [r0, #0x0] ; load 4 src pixels (1A)
|
||||
ldr r8, [r2, #0x0] ; load 4 ref pixels (1A)
|
||||
ldr r7, [r0, #0x4] ; load 4 src pixels (1A)
|
||||
ldr r9, [r2, #0x4] ; load 4 ref pixels (1A)
|
||||
ldr r10, [r0, #0x8] ; load 4 src pixels (1B)
|
||||
ldr r11, [r0, #0xC] ; load 4 src pixels (1B)
|
||||
|
||||
usada8 r4, r8, r6, r4 ; calculate sad for 4 pixels
|
||||
usad8 r8, r7, r9 ; calculate sad for 4 pixels
|
||||
|
||||
ldr r12, [r2, #0x8] ; load 4 ref pixels (1B)
|
||||
ldr lr, [r2, #0xC] ; load 4 ref pixels (1B)
|
||||
|
||||
add r0, r0, r1 ; set src pointer to next row
|
||||
add r2, r2, r3 ; set dst pointer to next row
|
||||
|
||||
pld [r0, r1, lsl #1]
|
||||
pld [r2, r3, lsl #1]
|
||||
|
||||
usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels
|
||||
usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels
|
||||
|
||||
ldr r6, [r0, #0x0] ; load 4 src pixels (2A)
|
||||
ldr r7, [r0, #0x4] ; load 4 src pixels (2A)
|
||||
add r4, r4, r8 ; add partial sad values
|
||||
|
||||
; 2nd row
|
||||
ldr r8, [r2, #0x0] ; load 4 ref pixels (2A)
|
||||
ldr r9, [r2, #0x4] ; load 4 ref pixels (2A)
|
||||
ldr r10, [r0, #0x8] ; load 4 src pixels (2B)
|
||||
ldr r11, [r0, #0xC] ; load 4 src pixels (2B)
|
||||
|
||||
usada8 r4, r6, r8, r4 ; calculate sad for 4 pixels
|
||||
usad8 r8, r7, r9 ; calculate sad for 4 pixels
|
||||
|
||||
ldr r12, [r2, #0x8] ; load 4 ref pixels (2B)
|
||||
ldr lr, [r2, #0xC] ; load 4 ref pixels (2B)
|
||||
|
||||
add r0, r0, r1 ; set src pointer to next row
|
||||
add r2, r2, r3 ; set dst pointer to next row
|
||||
|
||||
usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels
|
||||
usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels
|
||||
|
||||
pld [r0, r1, lsl #1]
|
||||
pld [r2, r3, lsl #1]
|
||||
|
||||
subs r5, r5, #1 ; decrement loop counter
|
||||
add r4, r4, r8 ; add partial sad values
|
||||
|
||||
bne loop
|
||||
|
||||
mov r0, r4 ; return sad
|
||||
ldmfd sp!, {r4-r12, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
END
|
|
@ -1,262 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
EXPORT |vp8_short_fdct4x4_armv6|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA |.text|, CODE, READONLY
|
||||
; void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
|
||||
|vp8_short_fdct4x4_armv6| PROC
|
||||
|
||||
stmfd sp!, {r4 - r12, lr}
|
||||
|
||||
; PART 1
|
||||
|
||||
; coeffs 0-3
|
||||
ldrd r4, r5, [r0] ; [i1 | i0] [i3 | i2]
|
||||
|
||||
ldr r10, c7500
|
||||
ldr r11, c14500
|
||||
ldr r12, c0x22a453a0 ; [2217*4 | 5352*4]
|
||||
ldr lr, c0x00080008
|
||||
ror r5, r5, #16 ; [i2 | i3]
|
||||
|
||||
qadd16 r6, r4, r5 ; [i1+i2 | i0+i3] = [b1 | a1] without shift
|
||||
qsub16 r7, r4, r5 ; [i1-i2 | i0-i3] = [c1 | d1] without shift
|
||||
|
||||
add r0, r0, r2 ; update input pointer
|
||||
|
||||
qadd16 r7, r7, r7 ; 2*[c1|d1] --> we can use smlad and smlsd
|
||||
; with 2217*4 and 5352*4 without losing the
|
||||
; sign bit (overflow)
|
||||
|
||||
smuad r4, r6, lr ; o0 = (i1+i2)*8 + (i0+i3)*8
|
||||
smusd r5, r6, lr ; o2 = (i1+i2)*8 - (i0+i3)*8
|
||||
|
||||
smlad r6, r7, r12, r11 ; o1 = (c1 * 2217 + d1 * 5352 + 14500)
|
||||
smlsdx r7, r7, r12, r10 ; o3 = (d1 * 2217 - c1 * 5352 + 7500)
|
||||
|
||||
ldrd r8, r9, [r0] ; [i5 | i4] [i7 | i6]
|
||||
|
||||
pkhbt r3, r4, r6, lsl #4 ; [o1 | o0], keep in register for PART 2
|
||||
pkhbt r6, r5, r7, lsl #4 ; [o3 | o2]
|
||||
|
||||
str r6, [r1, #4]
|
||||
|
||||
; coeffs 4-7
|
||||
ror r9, r9, #16 ; [i6 | i7]
|
||||
|
||||
qadd16 r6, r8, r9 ; [i5+i6 | i4+i7] = [b1 | a1] without shift
|
||||
qsub16 r7, r8, r9 ; [i5-i6 | i4-i7] = [c1 | d1] without shift
|
||||
|
||||
add r0, r0, r2 ; update input pointer
|
||||
|
||||
qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd
|
||||
; with 2217*4 and 5352*4 without losing the
|
||||
; sign bit (overflow)
|
||||
|
||||
smuad r9, r6, lr ; o4 = (i5+i6)*8 + (i4+i7)*8
|
||||
smusd r8, r6, lr ; o6 = (i5+i6)*8 - (i4+i7)*8
|
||||
|
||||
smlad r6, r7, r12, r11 ; o5 = (c1 * 2217 + d1 * 5352 + 14500)
|
||||
smlsdx r7, r7, r12, r10 ; o7 = (d1 * 2217 - c1 * 5352 + 7500)
|
||||
|
||||
ldrd r4, r5, [r0] ; [i9 | i8] [i11 | i10]
|
||||
|
||||
pkhbt r9, r9, r6, lsl #4 ; [o5 | o4], keep in register for PART 2
|
||||
pkhbt r6, r8, r7, lsl #4 ; [o7 | o6]
|
||||
|
||||
str r6, [r1, #12]
|
||||
|
||||
; coeffs 8-11
|
||||
ror r5, r5, #16 ; [i10 | i11]
|
||||
|
||||
qadd16 r6, r4, r5 ; [i9+i10 | i8+i11]=[b1 | a1] without shift
|
||||
qsub16 r7, r4, r5 ; [i9-i10 | i8-i11]=[c1 | d1] without shift
|
||||
|
||||
add r0, r0, r2 ; update input pointer
|
||||
|
||||
qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd
|
||||
; with 2217*4 and 5352*4 without losing the
|
||||
; sign bit (overflow)
|
||||
|
||||
smuad r2, r6, lr ; o8 = (i9+i10)*8 + (i8+i11)*8
|
||||
smusd r8, r6, lr ; o10 = (i9+i10)*8 - (i8+i11)*8
|
||||
|
||||
smlad r6, r7, r12, r11 ; o9 = (c1 * 2217 + d1 * 5352 + 14500)
|
||||
smlsdx r7, r7, r12, r10 ; o11 = (d1 * 2217 - c1 * 5352 + 7500)
|
||||
|
||||
ldrd r4, r5, [r0] ; [i13 | i12] [i15 | i14]
|
||||
|
||||
pkhbt r2, r2, r6, lsl #4 ; [o9 | o8], keep in register for PART 2
|
||||
pkhbt r6, r8, r7, lsl #4 ; [o11 | o10]
|
||||
|
||||
str r6, [r1, #20]
|
||||
|
||||
; coeffs 12-15
|
||||
ror r5, r5, #16 ; [i14 | i15]
|
||||
|
||||
qadd16 r6, r4, r5 ; [i13+i14 | i12+i15]=[b1|a1] without shift
|
||||
qsub16 r7, r4, r5 ; [i13-i14 | i12-i15]=[c1|d1] without shift
|
||||
|
||||
qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd
|
||||
; with 2217*4 and 5352*4 without losing the
|
||||
; sign bit (overflow)
|
||||
|
||||
smuad r4, r6, lr ; o12 = (i13+i14)*8 + (i12+i15)*8
|
||||
smusd r5, r6, lr ; o14 = (i13+i14)*8 - (i12+i15)*8
|
||||
|
||||
smlad r6, r7, r12, r11 ; o13 = (c1 * 2217 + d1 * 5352 + 14500)
|
||||
smlsdx r7, r7, r12, r10 ; o15 = (d1 * 2217 - c1 * 5352 + 7500)
|
||||
|
||||
pkhbt r0, r4, r6, lsl #4 ; [o13 | o12], keep in register for PART 2
|
||||
pkhbt r6, r5, r7, lsl #4 ; [o15 | o14]
|
||||
|
||||
str r6, [r1, #28]
|
||||
|
||||
|
||||
; PART 2 -------------------------------------------------
|
||||
ldr r11, c12000
|
||||
ldr r10, c51000
|
||||
ldr lr, c0x00070007
|
||||
|
||||
qadd16 r4, r3, r0 ; a1 = [i1+i13 | i0+i12]
|
||||
qadd16 r5, r9, r2 ; b1 = [i5+i9 | i4+i8]
|
||||
qsub16 r6, r9, r2 ; c1 = [i5-i9 | i4-i8]
|
||||
qsub16 r7, r3, r0 ; d1 = [i1-i13 | i0-i12]
|
||||
|
||||
qadd16 r4, r4, lr ; a1 + 7
|
||||
|
||||
add r0, r11, #0x10000 ; add (d!=0)
|
||||
|
||||
qadd16 r2, r4, r5 ; a1 + b1 + 7
|
||||
qsub16 r3, r4, r5 ; a1 - b1 + 7
|
||||
|
||||
ldr r12, c0x08a914e8 ; [2217 | 5352]
|
||||
|
||||
lsl r8, r2, #16 ; prepare bottom halfword for scaling
|
||||
asr r2, r2, #4 ; scale top halfword
|
||||
lsl r9, r3, #16 ; prepare bottom halfword for scaling
|
||||
asr r3, r3, #4 ; scale top halfword
|
||||
pkhtb r4, r2, r8, asr #20 ; pack and scale bottom halfword
|
||||
pkhtb r5, r3, r9, asr #20 ; pack and scale bottom halfword
|
||||
|
||||
smulbt r2, r6, r12 ; [ ------ | c1*2217]
|
||||
str r4, [r1, #0] ; [ o1 | o0]
|
||||
smultt r3, r6, r12 ; [c1*2217 | ------ ]
|
||||
str r5, [r1, #16] ; [ o9 | o8]
|
||||
|
||||
smlabb r8, r7, r12, r2 ; [ ------ | d1*5352]
|
||||
smlatb r9, r7, r12, r3 ; [d1*5352 | ------ ]
|
||||
|
||||
smulbb r2, r6, r12 ; [ ------ | c1*5352]
|
||||
smultb r3, r6, r12 ; [c1*5352 | ------ ]
|
||||
|
||||
lsls r6, r7, #16 ; d1 != 0 ?
|
||||
addeq r8, r8, r11 ; c1_b*2217+d1_b*5352+12000 + (d==0)
|
||||
addne r8, r8, r0 ; c1_b*2217+d1_b*5352+12000 + (d!=0)
|
||||
asrs r6, r7, #16
|
||||
addeq r9, r9, r11 ; c1_t*2217+d1_t*5352+12000 + (d==0)
|
||||
addne r9, r9, r0 ; c1_t*2217+d1_t*5352+12000 + (d!=0)
|
||||
|
||||
smlabt r4, r7, r12, r10 ; [ ------ | d1*2217] + 51000
|
||||
smlatt r5, r7, r12, r10 ; [d1*2217 | ------ ] + 51000
|
||||
|
||||
pkhtb r9, r9, r8, asr #16
|
||||
|
||||
sub r4, r4, r2
|
||||
sub r5, r5, r3
|
||||
|
||||
ldr r3, [r1, #4] ; [i3 | i2]
|
||||
|
||||
pkhtb r5, r5, r4, asr #16 ; [o13|o12]
|
||||
|
||||
str r9, [r1, #8] ; [o5 | 04]
|
||||
|
||||
ldr r9, [r1, #12] ; [i7 | i6]
|
||||
ldr r8, [r1, #28] ; [i15|i14]
|
||||
ldr r2, [r1, #20] ; [i11|i10]
|
||||
str r5, [r1, #24] ; [o13|o12]
|
||||
|
||||
qadd16 r4, r3, r8 ; a1 = [i3+i15 | i2+i14]
|
||||
qadd16 r5, r9, r2 ; b1 = [i7+i11 | i6+i10]
|
||||
|
||||
qadd16 r4, r4, lr ; a1 + 7
|
||||
|
||||
qsub16 r6, r9, r2 ; c1 = [i7-i11 | i6-i10]
|
||||
qadd16 r2, r4, r5 ; a1 + b1 + 7
|
||||
qsub16 r7, r3, r8 ; d1 = [i3-i15 | i2-i14]
|
||||
qsub16 r3, r4, r5 ; a1 - b1 + 7
|
||||
|
||||
lsl r8, r2, #16 ; prepare bottom halfword for scaling
|
||||
asr r2, r2, #4 ; scale top halfword
|
||||
lsl r9, r3, #16 ; prepare bottom halfword for scaling
|
||||
asr r3, r3, #4 ; scale top halfword
|
||||
pkhtb r4, r2, r8, asr #20 ; pack and scale bottom halfword
|
||||
pkhtb r5, r3, r9, asr #20 ; pack and scale bottom halfword
|
||||
|
||||
smulbt r2, r6, r12 ; [ ------ | c1*2217]
|
||||
str r4, [r1, #4] ; [ o3 | o2]
|
||||
smultt r3, r6, r12 ; [c1*2217 | ------ ]
|
||||
str r5, [r1, #20] ; [ o11 | o10]
|
||||
|
||||
smlabb r8, r7, r12, r2 ; [ ------ | d1*5352]
|
||||
smlatb r9, r7, r12, r3 ; [d1*5352 | ------ ]
|
||||
|
||||
smulbb r2, r6, r12 ; [ ------ | c1*5352]
|
||||
smultb r3, r6, r12 ; [c1*5352 | ------ ]
|
||||
|
||||
lsls r6, r7, #16 ; d1 != 0 ?
|
||||
addeq r8, r8, r11 ; c1_b*2217+d1_b*5352+12000 + (d==0)
|
||||
addne r8, r8, r0 ; c1_b*2217+d1_b*5352+12000 + (d!=0)
|
||||
|
||||
asrs r6, r7, #16
|
||||
addeq r9, r9, r11 ; c1_t*2217+d1_t*5352+12000 + (d==0)
|
||||
addne r9, r9, r0 ; c1_t*2217+d1_t*5352+12000 + (d!=0)
|
||||
|
||||
smlabt r4, r7, r12, r10 ; [ ------ | d1*2217] + 51000
|
||||
smlatt r5, r7, r12, r10 ; [d1*2217 | ------ ] + 51000
|
||||
|
||||
pkhtb r9, r9, r8, asr #16
|
||||
|
||||
sub r4, r4, r2
|
||||
sub r5, r5, r3
|
||||
|
||||
str r9, [r1, #12] ; [o7 | o6]
|
||||
pkhtb r5, r5, r4, asr #16 ; [o15|o14]
|
||||
|
||||
str r5, [r1, #28] ; [o15|o14]
|
||||
|
||||
ldmfd sp!, {r4 - r12, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
; Used constants
|
||||
c7500
|
||||
DCD 7500
|
||||
c14500
|
||||
DCD 14500
|
||||
c0x22a453a0
|
||||
DCD 0x22a453a0
|
||||
c0x00080008
|
||||
DCD 0x00080008
|
||||
c12000
|
||||
DCD 12000
|
||||
c51000
|
||||
DCD 51000
|
||||
c0x00070007
|
||||
DCD 0x00070007
|
||||
c0x08a914e8
|
||||
DCD 0x08a914e8
|
||||
|
||||
END
|
|
@ -1,264 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_subtract_mby_armv6|
|
||||
EXPORT |vp8_subtract_mbuv_armv6|
|
||||
EXPORT |vp8_subtract_b_armv6|
|
||||
|
||||
INCLUDE vp9_asm_enc_offsets.asm
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; r0 BLOCK *be
|
||||
; r1 BLOCKD *bd
|
||||
; r2 int pitch
|
||||
|vp8_subtract_b_armv6| PROC
|
||||
|
||||
stmfd sp!, {r4-r9}
|
||||
|
||||
ldr r4, [r0, #vp8_block_base_src]
|
||||
ldr r5, [r0, #vp8_block_src]
|
||||
ldr r6, [r0, #vp8_block_src_diff]
|
||||
|
||||
ldr r3, [r4]
|
||||
ldr r7, [r0, #vp8_block_src_stride]
|
||||
add r3, r3, r5 ; src = *base_src + src
|
||||
ldr r8, [r1, #vp8_blockd_predictor]
|
||||
|
||||
mov r9, #4 ; loop count
|
||||
|
||||
loop_block
|
||||
|
||||
ldr r0, [r3], r7 ; src
|
||||
ldr r1, [r8], r2 ; pred
|
||||
|
||||
uxtb16 r4, r0 ; [s2 | s0]
|
||||
uxtb16 r5, r1 ; [p2 | p0]
|
||||
uxtb16 r0, r0, ror #8 ; [s3 | s1]
|
||||
uxtb16 r1, r1, ror #8 ; [p3 | p1]
|
||||
|
||||
usub16 r4, r4, r5 ; [d2 | d0]
|
||||
usub16 r5, r0, r1 ; [d3 | d1]
|
||||
|
||||
subs r9, r9, #1 ; decrement loop counter
|
||||
|
||||
pkhbt r0, r4, r5, lsl #16 ; [d1 | d0]
|
||||
pkhtb r1, r5, r4, asr #16 ; [d3 | d2]
|
||||
|
||||
str r0, [r6, #0] ; diff
|
||||
str r1, [r6, #4] ; diff
|
||||
|
||||
add r6, r6, r2, lsl #1 ; update diff pointer
|
||||
bne loop_block
|
||||
|
||||
ldmfd sp!, {r4-r9}
|
||||
mov pc, lr
|
||||
|
||||
ENDP
|
||||
|
||||
|
||||
; r0 short *diff
|
||||
; r1 unsigned char *usrc
|
||||
; r2 unsigned char *vsrc
|
||||
; r3 unsigned char *pred
|
||||
; stack int stride
|
||||
|vp8_subtract_mbuv_armv6| PROC
|
||||
|
||||
stmfd sp!, {r4-r12, lr}
|
||||
|
||||
add r0, r0, #512 ; set *diff point to Cb
|
||||
add r3, r3, #256 ; set *pred point to Cb
|
||||
|
||||
mov r4, #8 ; loop count
|
||||
ldr r5, [sp, #40] ; stride
|
||||
|
||||
; Subtract U block
|
||||
loop_u
|
||||
ldr r6, [r1] ; src (A)
|
||||
ldr r7, [r3], #4 ; pred (A)
|
||||
|
||||
uxtb16 r8, r6 ; [s2 | s0] (A)
|
||||
uxtb16 r9, r7 ; [p2 | p0] (A)
|
||||
uxtb16 r10, r6, ror #8 ; [s3 | s1] (A)
|
||||
uxtb16 r11, r7, ror #8 ; [p3 | p1] (A)
|
||||
|
||||
usub16 r6, r8, r9 ; [d2 | d0] (A)
|
||||
usub16 r7, r10, r11 ; [d3 | d1] (A)
|
||||
|
||||
ldr r10, [r1, #4] ; src (B)
|
||||
ldr r11, [r3], #4 ; pred (B)
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
|
||||
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
|
||||
|
||||
str r8, [r0], #4 ; diff (A)
|
||||
uxtb16 r8, r10 ; [s2 | s0] (B)
|
||||
str r9, [r0], #4 ; diff (A)
|
||||
|
||||
uxtb16 r9, r11 ; [p2 | p0] (B)
|
||||
uxtb16 r10, r10, ror #8 ; [s3 | s1] (B)
|
||||
uxtb16 r11, r11, ror #8 ; [p3 | p1] (B)
|
||||
|
||||
usub16 r6, r8, r9 ; [d2 | d0] (B)
|
||||
usub16 r7, r10, r11 ; [d3 | d1] (B)
|
||||
|
||||
add r1, r1, r5 ; update usrc pointer
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
|
||||
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
|
||||
|
||||
str r8, [r0], #4 ; diff (B)
|
||||
subs r4, r4, #1 ; update loop counter
|
||||
str r9, [r0], #4 ; diff (B)
|
||||
|
||||
bne loop_u
|
||||
|
||||
mov r4, #8 ; loop count
|
||||
|
||||
; Subtract V block
|
||||
loop_v
|
||||
ldr r6, [r2] ; src (A)
|
||||
ldr r7, [r3], #4 ; pred (A)
|
||||
|
||||
uxtb16 r8, r6 ; [s2 | s0] (A)
|
||||
uxtb16 r9, r7 ; [p2 | p0] (A)
|
||||
uxtb16 r10, r6, ror #8 ; [s3 | s1] (A)
|
||||
uxtb16 r11, r7, ror #8 ; [p3 | p1] (A)
|
||||
|
||||
usub16 r6, r8, r9 ; [d2 | d0] (A)
|
||||
usub16 r7, r10, r11 ; [d3 | d1] (A)
|
||||
|
||||
ldr r10, [r2, #4] ; src (B)
|
||||
ldr r11, [r3], #4 ; pred (B)
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
|
||||
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
|
||||
|
||||
str r8, [r0], #4 ; diff (A)
|
||||
uxtb16 r8, r10 ; [s2 | s0] (B)
|
||||
str r9, [r0], #4 ; diff (A)
|
||||
|
||||
uxtb16 r9, r11 ; [p2 | p0] (B)
|
||||
uxtb16 r10, r10, ror #8 ; [s3 | s1] (B)
|
||||
uxtb16 r11, r11, ror #8 ; [p3 | p1] (B)
|
||||
|
||||
usub16 r6, r8, r9 ; [d2 | d0] (B)
|
||||
usub16 r7, r10, r11 ; [d3 | d1] (B)
|
||||
|
||||
add r2, r2, r5 ; update vsrc pointer
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
|
||||
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
|
||||
|
||||
str r8, [r0], #4 ; diff (B)
|
||||
subs r4, r4, #1 ; update loop counter
|
||||
str r9, [r0], #4 ; diff (B)
|
||||
|
||||
bne loop_v
|
||||
|
||||
ldmfd sp!, {r4-r12, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
|
||||
; r0 short *diff
|
||||
; r1 unsigned char *src
|
||||
; r2 unsigned char *pred
|
||||
; r3 int stride
|
||||
|vp8_subtract_mby_armv6| PROC
|
||||
|
||||
stmfd sp!, {r4-r11}
|
||||
|
||||
mov r4, #16
|
||||
loop
|
||||
ldr r6, [r1] ; src (A)
|
||||
ldr r7, [r2], #4 ; pred (A)
|
||||
|
||||
uxtb16 r8, r6 ; [s2 | s0] (A)
|
||||
uxtb16 r9, r7 ; [p2 | p0] (A)
|
||||
uxtb16 r10, r6, ror #8 ; [s3 | s1] (A)
|
||||
uxtb16 r11, r7, ror #8 ; [p3 | p1] (A)
|
||||
|
||||
usub16 r6, r8, r9 ; [d2 | d0] (A)
|
||||
usub16 r7, r10, r11 ; [d3 | d1] (A)
|
||||
|
||||
ldr r10, [r1, #4] ; src (B)
|
||||
ldr r11, [r2], #4 ; pred (B)
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
|
||||
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
|
||||
|
||||
str r8, [r0], #4 ; diff (A)
|
||||
uxtb16 r8, r10 ; [s2 | s0] (B)
|
||||
str r9, [r0], #4 ; diff (A)
|
||||
|
||||
uxtb16 r9, r11 ; [p2 | p0] (B)
|
||||
uxtb16 r10, r10, ror #8 ; [s3 | s1] (B)
|
||||
uxtb16 r11, r11, ror #8 ; [p3 | p1] (B)
|
||||
|
||||
usub16 r6, r8, r9 ; [d2 | d0] (B)
|
||||
usub16 r7, r10, r11 ; [d3 | d1] (B)
|
||||
|
||||
ldr r10, [r1, #8] ; src (C)
|
||||
ldr r11, [r2], #4 ; pred (C)
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
|
||||
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
|
||||
|
||||
str r8, [r0], #4 ; diff (B)
|
||||
uxtb16 r8, r10 ; [s2 | s0] (C)
|
||||
str r9, [r0], #4 ; diff (B)
|
||||
|
||||
uxtb16 r9, r11 ; [p2 | p0] (C)
|
||||
uxtb16 r10, r10, ror #8 ; [s3 | s1] (C)
|
||||
uxtb16 r11, r11, ror #8 ; [p3 | p1] (C)
|
||||
|
||||
usub16 r6, r8, r9 ; [d2 | d0] (C)
|
||||
usub16 r7, r10, r11 ; [d3 | d1] (C)
|
||||
|
||||
ldr r10, [r1, #12] ; src (D)
|
||||
ldr r11, [r2], #4 ; pred (D)
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (C)
|
||||
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (C)
|
||||
|
||||
str r8, [r0], #4 ; diff (C)
|
||||
uxtb16 r8, r10 ; [s2 | s0] (D)
|
||||
str r9, [r0], #4 ; diff (C)
|
||||
|
||||
uxtb16 r9, r11 ; [p2 | p0] (D)
|
||||
uxtb16 r10, r10, ror #8 ; [s3 | s1] (D)
|
||||
uxtb16 r11, r11, ror #8 ; [p3 | p1] (D)
|
||||
|
||||
usub16 r6, r8, r9 ; [d2 | d0] (D)
|
||||
usub16 r7, r10, r11 ; [d3 | d1] (D)
|
||||
|
||||
add r1, r1, r3 ; update src pointer
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (D)
|
||||
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (D)
|
||||
|
||||
str r8, [r0], #4 ; diff (D)
|
||||
subs r4, r4, #1 ; update loop counter
|
||||
str r9, [r0], #4 ; diff (D)
|
||||
|
||||
bne loop
|
||||
|
||||
ldmfd sp!, {r4-r11}
|
||||
mov pc, lr
|
||||
|
||||
ENDP
|
||||
|
||||
END
|
|
@ -1,153 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp9_variance16x16_armv6|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; r0 unsigned char *src_ptr
|
||||
; r1 int source_stride
|
||||
; r2 unsigned char *ref_ptr
|
||||
; r3 int recon_stride
|
||||
; stack unsigned int *sse
|
||||
|vp9_variance16x16_armv6| PROC
|
||||
|
||||
stmfd sp!, {r4-r12, lr}
|
||||
|
||||
pld [r0, r1, lsl #0]
|
||||
pld [r2, r3, lsl #0]
|
||||
|
||||
mov r8, #0 ; initialize sum = 0
|
||||
mov r11, #0 ; initialize sse = 0
|
||||
mov r12, #16 ; set loop counter to 16 (=block height)
|
||||
|
||||
loop
|
||||
; 1st 4 pixels
|
||||
ldr r4, [r0, #0] ; load 4 src pixels
|
||||
ldr r5, [r2, #0] ; load 4 ref pixels
|
||||
|
||||
mov lr, #0 ; constant zero
|
||||
|
||||
usub8 r6, r4, r5 ; calculate difference
|
||||
pld [r0, r1, lsl #1]
|
||||
sel r7, r6, lr ; select bytes with positive difference
|
||||
usub8 r9, r5, r4 ; calculate difference with reversed operands
|
||||
pld [r2, r3, lsl #1]
|
||||
sel r6, r9, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
; calculate total sum
|
||||
adds r8, r8, r4 ; add positive differences to sum
|
||||
subs r8, r8, r5 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
|
||||
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||
|
||||
; 2nd 4 pixels
|
||||
ldr r4, [r0, #4] ; load 4 src pixels
|
||||
ldr r5, [r2, #4] ; load 4 ref pixels
|
||||
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
usub8 r6, r4, r5 ; calculate difference
|
||||
sel r7, r6, lr ; select bytes with positive difference
|
||||
usub8 r9, r5, r4 ; calculate difference with reversed operands
|
||||
sel r6, r9, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
|
||||
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||
|
||||
; 3rd 4 pixels
|
||||
ldr r4, [r0, #8] ; load 4 src pixels
|
||||
ldr r5, [r2, #8] ; load 4 ref pixels
|
||||
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
usub8 r6, r4, r5 ; calculate difference
|
||||
sel r7, r6, lr ; select bytes with positive difference
|
||||
usub8 r9, r5, r4 ; calculate difference with reversed operands
|
||||
sel r6, r9, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
|
||||
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||
|
||||
; 4th 4 pixels
|
||||
ldr r4, [r0, #12] ; load 4 src pixels
|
||||
ldr r5, [r2, #12] ; load 4 ref pixels
|
||||
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
usub8 r6, r4, r5 ; calculate difference
|
||||
add r0, r0, r1 ; set src_ptr to next row
|
||||
sel r7, r6, lr ; select bytes with positive difference
|
||||
usub8 r9, r5, r4 ; calculate difference with reversed operands
|
||||
add r2, r2, r3 ; set dst_ptr to next row
|
||||
sel r6, r9, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
|
||||
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
bne loop
|
||||
|
||||
; return stuff
|
||||
ldr r6, [sp, #40] ; get address of sse
|
||||
mul r0, r8, r8 ; sum * sum
|
||||
str r11, [r6] ; store sse
|
||||
sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
|
||||
|
||||
ldmfd sp!, {r4-r12, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
END
|
|
@ -1,101 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp9_variance8x8_armv6|
|
||||
|
||||
ARM
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; r0 unsigned char *src_ptr
|
||||
; r1 int source_stride
|
||||
; r2 unsigned char *ref_ptr
|
||||
; r3 int recon_stride
|
||||
; stack unsigned int *sse
|
||||
|vp9_variance8x8_armv6| PROC
|
||||
|
||||
push {r4-r10, lr}
|
||||
|
||||
pld [r0, r1, lsl #0]
|
||||
pld [r2, r3, lsl #0]
|
||||
|
||||
mov r12, #8 ; set loop counter to 8 (=block height)
|
||||
mov r4, #0 ; initialize sum = 0
|
||||
mov r5, #0 ; initialize sse = 0
|
||||
|
||||
loop
|
||||
; 1st 4 pixels
|
||||
ldr r6, [r0, #0x0] ; load 4 src pixels
|
||||
ldr r7, [r2, #0x0] ; load 4 ref pixels
|
||||
|
||||
mov lr, #0 ; constant zero
|
||||
|
||||
usub8 r8, r6, r7 ; calculate difference
|
||||
pld [r0, r1, lsl #1]
|
||||
sel r10, r8, lr ; select bytes with positive difference
|
||||
usub8 r9, r7, r6 ; calculate difference with reversed operands
|
||||
pld [r2, r3, lsl #1]
|
||||
sel r8, r9, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r6, r10, lr ; calculate sum of positive differences
|
||||
usad8 r7, r8, lr ; calculate sum of negative differences
|
||||
orr r8, r8, r10 ; differences of all 4 pixels
|
||||
; calculate total sum
|
||||
add r4, r4, r6 ; add positive differences to sum
|
||||
sub r4, r4, r7 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r7, r8 ; byte (two pixels) to halfwords
|
||||
uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
|
||||
smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
|
||||
|
||||
; 2nd 4 pixels
|
||||
ldr r6, [r0, #0x4] ; load 4 src pixels
|
||||
ldr r7, [r2, #0x4] ; load 4 ref pixels
|
||||
smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
usub8 r8, r6, r7 ; calculate difference
|
||||
add r0, r0, r1 ; set src_ptr to next row
|
||||
sel r10, r8, lr ; select bytes with positive difference
|
||||
usub8 r9, r7, r6 ; calculate difference with reversed operands
|
||||
add r2, r2, r3 ; set dst_ptr to next row
|
||||
sel r8, r9, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r6, r10, lr ; calculate sum of positive differences
|
||||
usad8 r7, r8, lr ; calculate sum of negative differences
|
||||
orr r8, r8, r10 ; differences of all 4 pixels
|
||||
|
||||
; calculate total sum
|
||||
add r4, r4, r6 ; add positive differences to sum
|
||||
sub r4, r4, r7 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r7, r8 ; byte (two pixels) to halfwords
|
||||
uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
|
||||
smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
|
||||
subs r12, r12, #1 ; next row
|
||||
smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
bne loop
|
||||
|
||||
; return stuff
|
||||
ldr r8, [sp, #32] ; get address of sse
|
||||
mul r1, r4, r4 ; sum * sum
|
||||
str r5, [r8] ; store sse
|
||||
sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6))
|
||||
|
||||
pop {r4-r10, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
END
|
|
@ -1,181 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp9_variance_halfpixvar16x16_h_armv6|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; r0 unsigned char *src_ptr
|
||||
; r1 int source_stride
|
||||
; r2 unsigned char *ref_ptr
|
||||
; r3 int recon_stride
|
||||
; stack unsigned int *sse
|
||||
|vp9_variance_halfpixvar16x16_h_armv6| PROC
|
||||
|
||||
stmfd sp!, {r4-r12, lr}
|
||||
|
||||
pld [r0, r1, lsl #0]
|
||||
pld [r2, r3, lsl #0]
|
||||
|
||||
mov r8, #0 ; initialize sum = 0
|
||||
ldr r10, c80808080
|
||||
mov r11, #0 ; initialize sse = 0
|
||||
mov r12, #16 ; set loop counter to 16 (=block height)
|
||||
mov lr, #0 ; constant zero
|
||||
loop
|
||||
; 1st 4 pixels
|
||||
ldr r4, [r0, #0] ; load 4 src pixels
|
||||
ldr r6, [r0, #1] ; load 4 src pixels with 1 byte offset
|
||||
ldr r5, [r2, #0] ; load 4 ref pixels
|
||||
|
||||
; bilinear interpolation
|
||||
mvn r6, r6
|
||||
uhsub8 r4, r4, r6
|
||||
eor r4, r4, r10
|
||||
|
||||
usub8 r6, r4, r5 ; calculate difference
|
||||
pld [r0, r1, lsl #1]
|
||||
sel r7, r6, lr ; select bytes with positive difference
|
||||
usub8 r6, r5, r4 ; calculate difference with reversed operands
|
||||
pld [r2, r3, lsl #1]
|
||||
sel r6, r6, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
; calculate total sum
|
||||
adds r8, r8, r4 ; add positive differences to sum
|
||||
subs r8, r8, r5 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
|
||||
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||
|
||||
; 2nd 4 pixels
|
||||
ldr r4, [r0, #4] ; load 4 src pixels
|
||||
ldr r6, [r0, #5] ; load 4 src pixels with 1 byte offset
|
||||
ldr r5, [r2, #4] ; load 4 ref pixels
|
||||
|
||||
; bilinear interpolation
|
||||
mvn r6, r6
|
||||
uhsub8 r4, r4, r6
|
||||
eor r4, r4, r10
|
||||
|
||||
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
usub8 r6, r4, r5 ; calculate difference
|
||||
sel r7, r6, lr ; select bytes with positive difference
|
||||
usub8 r6, r5, r4 ; calculate difference with reversed operands
|
||||
sel r6, r6, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
|
||||
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||
|
||||
; 3rd 4 pixels
|
||||
ldr r4, [r0, #8] ; load 4 src pixels
|
||||
ldr r6, [r0, #9] ; load 4 src pixels with 1 byte offset
|
||||
ldr r5, [r2, #8] ; load 4 ref pixels
|
||||
|
||||
; bilinear interpolation
|
||||
mvn r6, r6
|
||||
uhsub8 r4, r4, r6
|
||||
eor r4, r4, r10
|
||||
|
||||
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
usub8 r6, r4, r5 ; calculate difference
|
||||
sel r7, r6, lr ; select bytes with positive difference
|
||||
usub8 r6, r5, r4 ; calculate difference with reversed operands
|
||||
sel r6, r6, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
|
||||
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||
|
||||
; 4th 4 pixels
|
||||
ldr r4, [r0, #12] ; load 4 src pixels
|
||||
ldr r6, [r0, #13] ; load 4 src pixels with 1 byte offset
|
||||
ldr r5, [r2, #12] ; load 4 ref pixels
|
||||
|
||||
; bilinear interpolation
|
||||
mvn r6, r6
|
||||
uhsub8 r4, r4, r6
|
||||
eor r4, r4, r10
|
||||
|
||||
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
usub8 r6, r4, r5 ; calculate difference
|
||||
add r0, r0, r1 ; set src_ptr to next row
|
||||
sel r7, r6, lr ; select bytes with positive difference
|
||||
usub8 r6, r5, r4 ; calculate difference with reversed operands
|
||||
add r2, r2, r3 ; set dst_ptr to next row
|
||||
sel r6, r6, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
|
||||
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
bne loop
|
||||
|
||||
; return stuff
|
||||
ldr r6, [sp, #40] ; get address of sse
|
||||
mul r0, r8, r8 ; sum * sum
|
||||
str r11, [r6] ; store sse
|
||||
sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
|
||||
|
||||
ldmfd sp!, {r4-r12, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
c80808080
|
||||
DCD 0x80808080
|
||||
|
||||
END
|
|
@ -1,222 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp9_variance_halfpixvar16x16_hv_armv6|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; r0 unsigned char *src_ptr
|
||||
; r1 int source_stride
|
||||
; r2 unsigned char *ref_ptr
|
||||
; r3 int recon_stride
|
||||
; stack unsigned int *sse
|
||||
|vp9_variance_halfpixvar16x16_hv_armv6| PROC
|
||||
|
||||
stmfd sp!, {r4-r12, lr}
|
||||
|
||||
pld [r0, r1, lsl #0]
|
||||
pld [r2, r3, lsl #0]
|
||||
|
||||
mov r8, #0 ; initialize sum = 0
|
||||
ldr r10, c80808080
|
||||
mov r11, #0 ; initialize sse = 0
|
||||
mov r12, #16 ; set loop counter to 16 (=block height)
|
||||
mov lr, #0 ; constant zero
|
||||
loop
|
||||
add r9, r0, r1 ; pointer to pixels on the next row
|
||||
; 1st 4 pixels
|
||||
ldr r4, [r0, #0] ; load source pixels a, row N
|
||||
ldr r6, [r0, #1] ; load source pixels b, row N
|
||||
ldr r5, [r9, #0] ; load source pixels c, row N+1
|
||||
ldr r7, [r9, #1] ; load source pixels d, row N+1
|
||||
|
||||
; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
|
||||
mvn r6, r6
|
||||
uhsub8 r4, r4, r6
|
||||
eor r4, r4, r10
|
||||
; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
|
||||
mvn r7, r7
|
||||
uhsub8 r5, r5, r7
|
||||
eor r5, r5, r10
|
||||
; z = (x + y + 1) >> 1, interpolate half pixel values vertically
|
||||
mvn r5, r5
|
||||
uhsub8 r4, r4, r5
|
||||
ldr r5, [r2, #0] ; load 4 ref pixels
|
||||
eor r4, r4, r10
|
||||
|
||||
usub8 r6, r4, r5 ; calculate difference
|
||||
pld [r0, r1, lsl #1]
|
||||
sel r7, r6, lr ; select bytes with positive difference
|
||||
usub8 r6, r5, r4 ; calculate difference with reversed operands
|
||||
pld [r2, r3, lsl #1]
|
||||
sel r6, r6, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
; calculate total sum
|
||||
adds r8, r8, r4 ; add positive differences to sum
|
||||
subs r8, r8, r5 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
|
||||
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||
|
||||
; 2nd 4 pixels
|
||||
ldr r4, [r0, #4] ; load source pixels a, row N
|
||||
ldr r6, [r0, #5] ; load source pixels b, row N
|
||||
ldr r5, [r9, #4] ; load source pixels c, row N+1
|
||||
|
||||
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
ldr r7, [r9, #5] ; load source pixels d, row N+1
|
||||
|
||||
; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
|
||||
mvn r6, r6
|
||||
uhsub8 r4, r4, r6
|
||||
eor r4, r4, r10
|
||||
; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
|
||||
mvn r7, r7
|
||||
uhsub8 r5, r5, r7
|
||||
eor r5, r5, r10
|
||||
; z = (x + y + 1) >> 1, interpolate half pixel values vertically
|
||||
mvn r5, r5
|
||||
uhsub8 r4, r4, r5
|
||||
ldr r5, [r2, #4] ; load 4 ref pixels
|
||||
eor r4, r4, r10
|
||||
|
||||
usub8 r6, r4, r5 ; calculate difference
|
||||
sel r7, r6, lr ; select bytes with positive difference
|
||||
usub8 r6, r5, r4 ; calculate difference with reversed operands
|
||||
sel r6, r6, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
|
||||
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||
|
||||
; 3rd 4 pixels
|
||||
ldr r4, [r0, #8] ; load source pixels a, row N
|
||||
ldr r6, [r0, #9] ; load source pixels b, row N
|
||||
ldr r5, [r9, #8] ; load source pixels c, row N+1
|
||||
|
||||
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
ldr r7, [r9, #9] ; load source pixels d, row N+1
|
||||
|
||||
; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
|
||||
mvn r6, r6
|
||||
uhsub8 r4, r4, r6
|
||||
eor r4, r4, r10
|
||||
; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
|
||||
mvn r7, r7
|
||||
uhsub8 r5, r5, r7
|
||||
eor r5, r5, r10
|
||||
; z = (x + y + 1) >> 1, interpolate half pixel values vertically
|
||||
mvn r5, r5
|
||||
uhsub8 r4, r4, r5
|
||||
ldr r5, [r2, #8] ; load 4 ref pixels
|
||||
eor r4, r4, r10
|
||||
|
||||
usub8 r6, r4, r5 ; calculate difference
|
||||
sel r7, r6, lr ; select bytes with positive difference
|
||||
usub8 r6, r5, r4 ; calculate difference with reversed operands
|
||||
sel r6, r6, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
|
||||
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||
|
||||
; 4th 4 pixels
|
||||
ldr r4, [r0, #12] ; load source pixels a, row N
|
||||
ldr r6, [r0, #13] ; load source pixels b, row N
|
||||
ldr r5, [r9, #12] ; load source pixels c, row N+1
|
||||
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
|
||||
ldr r7, [r9, #13] ; load source pixels d, row N+1
|
||||
|
||||
; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
|
||||
mvn r6, r6
|
||||
uhsub8 r4, r4, r6
|
||||
eor r4, r4, r10
|
||||
; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
|
||||
mvn r7, r7
|
||||
uhsub8 r5, r5, r7
|
||||
eor r5, r5, r10
|
||||
; z = (x + y + 1) >> 1, interpolate half pixel values vertically
|
||||
mvn r5, r5
|
||||
uhsub8 r4, r4, r5
|
||||
ldr r5, [r2, #12] ; load 4 ref pixels
|
||||
eor r4, r4, r10
|
||||
|
||||
usub8 r6, r4, r5 ; calculate difference
|
||||
add r0, r0, r1 ; set src_ptr to next row
|
||||
sel r7, r6, lr ; select bytes with positive difference
|
||||
usub8 r6, r5, r4 ; calculate difference with reversed operands
|
||||
add r2, r2, r3 ; set dst_ptr to next row
|
||||
sel r6, r6, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
|
||||
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||
subs r12, r12, #1
|
||||
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
bne loop
|
||||
|
||||
; return stuff
|
||||
ldr r6, [sp, #40] ; get address of sse
|
||||
mul r0, r8, r8 ; sum * sum
|
||||
str r11, [r6] ; store sse
|
||||
sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
|
||||
|
||||
ldmfd sp!, {r4-r12, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
c80808080
|
||||
DCD 0x80808080
|
||||
|
||||
END
|
|
@ -1,183 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp9_variance_halfpixvar16x16_v_armv6|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; r0 unsigned char *src_ptr
|
||||
; r1 int source_stride
|
||||
; r2 unsigned char *ref_ptr
|
||||
; r3 int recon_stride
|
||||
; stack unsigned int *sse
|
||||
|vp9_variance_halfpixvar16x16_v_armv6| PROC
|
||||
|
||||
stmfd sp!, {r4-r12, lr}
|
||||
|
||||
pld [r0, r1, lsl #0]
|
||||
pld [r2, r3, lsl #0]
|
||||
|
||||
mov r8, #0 ; initialize sum = 0
|
||||
ldr r10, c80808080
|
||||
mov r11, #0 ; initialize sse = 0
|
||||
mov r12, #16 ; set loop counter to 16 (=block height)
|
||||
mov lr, #0 ; constant zero
|
||||
loop
|
||||
add r9, r0, r1 ; set src pointer to next row
|
||||
; 1st 4 pixels
|
||||
ldr r4, [r0, #0] ; load 4 src pixels
|
||||
ldr r6, [r9, #0] ; load 4 src pixels from next row
|
||||
ldr r5, [r2, #0] ; load 4 ref pixels
|
||||
|
||||
; bilinear interpolation
|
||||
mvn r6, r6
|
||||
uhsub8 r4, r4, r6
|
||||
eor r4, r4, r10
|
||||
|
||||
usub8 r6, r4, r5 ; calculate difference
|
||||
pld [r0, r1, lsl #1]
|
||||
sel r7, r6, lr ; select bytes with positive difference
|
||||
usub8 r6, r5, r4 ; calculate difference with reversed operands
|
||||
pld [r2, r3, lsl #1]
|
||||
sel r6, r6, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
; calculate total sum
|
||||
adds r8, r8, r4 ; add positive differences to sum
|
||||
subs r8, r8, r5 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
|
||||
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||
|
||||
; 2nd 4 pixels
|
||||
ldr r4, [r0, #4] ; load 4 src pixels
|
||||
ldr r6, [r9, #4] ; load 4 src pixels from next row
|
||||
ldr r5, [r2, #4] ; load 4 ref pixels
|
||||
|
||||
; bilinear interpolation
|
||||
mvn r6, r6
|
||||
uhsub8 r4, r4, r6
|
||||
eor r4, r4, r10
|
||||
|
||||
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
usub8 r6, r4, r5 ; calculate difference
|
||||
sel r7, r6, lr ; select bytes with positive difference
|
||||
usub8 r6, r5, r4 ; calculate difference with reversed operands
|
||||
sel r6, r6, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
|
||||
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||
|
||||
; 3rd 4 pixels
|
||||
ldr r4, [r0, #8] ; load 4 src pixels
|
||||
ldr r6, [r9, #8] ; load 4 src pixels from next row
|
||||
ldr r5, [r2, #8] ; load 4 ref pixels
|
||||
|
||||
; bilinear interpolation
|
||||
mvn r6, r6
|
||||
uhsub8 r4, r4, r6
|
||||
eor r4, r4, r10
|
||||
|
||||
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
usub8 r6, r4, r5 ; calculate difference
|
||||
sel r7, r6, lr ; select bytes with positive difference
|
||||
usub8 r6, r5, r4 ; calculate difference with reversed operands
|
||||
sel r6, r6, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
|
||||
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||
|
||||
; 4th 4 pixels
|
||||
ldr r4, [r0, #12] ; load 4 src pixels
|
||||
ldr r6, [r9, #12] ; load 4 src pixels from next row
|
||||
ldr r5, [r2, #12] ; load 4 ref pixels
|
||||
|
||||
; bilinear interpolation
|
||||
mvn r6, r6
|
||||
uhsub8 r4, r4, r6
|
||||
eor r4, r4, r10
|
||||
|
||||
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
usub8 r6, r4, r5 ; calculate difference
|
||||
add r0, r0, r1 ; set src_ptr to next row
|
||||
sel r7, r6, lr ; select bytes with positive difference
|
||||
usub8 r6, r5, r4 ; calculate difference with reversed operands
|
||||
add r2, r2, r3 ; set dst_ptr to next row
|
||||
sel r6, r6, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
|
||||
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
bne loop
|
||||
|
||||
; return stuff
|
||||
ldr r6, [sp, #40] ; get address of sse
|
||||
mul r0, r8, r8 ; sum * sum
|
||||
str r11, [r6] ; store sse
|
||||
sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
|
||||
|
||||
ldmfd sp!, {r4-r12, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
c80808080
|
||||
DCD 0x80808080
|
||||
|
||||
END
|
|
@ -1,212 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
EXPORT |vp8_short_walsh4x4_armv6|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA |.text|, CODE, READONLY ; name this block of code
|
||||
|
||||
;short vp8_short_walsh4x4_armv6(short *input, short *output, int pitch)
|
||||
; r0 short *input,
|
||||
; r1 short *output,
|
||||
; r2 int pitch
|
||||
|vp8_short_walsh4x4_armv6| PROC
|
||||
|
||||
stmdb sp!, {r4 - r11, lr}
|
||||
|
||||
ldrd r4, r5, [r0], r2
|
||||
ldr lr, c00040004
|
||||
ldrd r6, r7, [r0], r2
|
||||
|
||||
; 0-3
|
||||
qadd16 r3, r4, r5 ; [d1|a1] [1+3 | 0+2]
|
||||
qsub16 r4, r4, r5 ; [c1|b1] [1-3 | 0-2]
|
||||
|
||||
ldrd r8, r9, [r0], r2
|
||||
; 4-7
|
||||
qadd16 r5, r6, r7 ; [d1|a1] [5+7 | 4+6]
|
||||
qsub16 r6, r6, r7 ; [c1|b1] [5-7 | 4-6]
|
||||
|
||||
ldrd r10, r11, [r0]
|
||||
; 8-11
|
||||
qadd16 r7, r8, r9 ; [d1|a1] [9+11 | 8+10]
|
||||
qsub16 r8, r8, r9 ; [c1|b1] [9-11 | 8-10]
|
||||
|
||||
; 12-15
|
||||
qadd16 r9, r10, r11 ; [d1|a1] [13+15 | 12+14]
|
||||
qsub16 r10, r10, r11 ; [c1|b1] [13-15 | 12-14]
|
||||
|
||||
|
||||
lsls r2, r3, #16
|
||||
smuad r11, r3, lr ; A0 = a1<<2 + d1<<2
|
||||
addne r11, r11, #1 ; A0 += (a1!=0)
|
||||
|
||||
lsls r2, r7, #16
|
||||
smuad r12, r7, lr ; C0 = a1<<2 + d1<<2
|
||||
addne r12, r12, #1 ; C0 += (a1!=0)
|
||||
|
||||
add r0, r11, r12 ; a1_0 = A0 + C0
|
||||
sub r11, r11, r12 ; b1_0 = A0 - C0
|
||||
|
||||
lsls r2, r5, #16
|
||||
smuad r12, r5, lr ; B0 = a1<<2 + d1<<2
|
||||
addne r12, r12, #1 ; B0 += (a1!=0)
|
||||
|
||||
lsls r2, r9, #16
|
||||
smuad r2, r9, lr ; D0 = a1<<2 + d1<<2
|
||||
addne r2, r2, #1 ; D0 += (a1!=0)
|
||||
|
||||
add lr, r12, r2 ; d1_0 = B0 + D0
|
||||
sub r12, r12, r2 ; c1_0 = B0 - D0
|
||||
|
||||
; op[0,4,8,12]
|
||||
adds r2, r0, lr ; a2 = a1_0 + d1_0
|
||||
addmi r2, r2, #1 ; += a2 < 0
|
||||
add r2, r2, #3 ; += 3
|
||||
subs r0, r0, lr ; d2 = a1_0 - d1_0
|
||||
mov r2, r2, asr #3 ; >> 3
|
||||
strh r2, [r1] ; op[0]
|
||||
|
||||
addmi r0, r0, #1 ; += a2 < 0
|
||||
add r0, r0, #3 ; += 3
|
||||
ldr lr, c00040004
|
||||
mov r0, r0, asr #3 ; >> 3
|
||||
strh r0, [r1, #24] ; op[12]
|
||||
|
||||
adds r2, r11, r12 ; b2 = b1_0 + c1_0
|
||||
addmi r2, r2, #1 ; += a2 < 0
|
||||
add r2, r2, #3 ; += 3
|
||||
subs r0, r11, r12 ; c2 = b1_0 - c1_0
|
||||
mov r2, r2, asr #3 ; >> 3
|
||||
strh r2, [r1, #8] ; op[4]
|
||||
|
||||
addmi r0, r0, #1 ; += a2 < 0
|
||||
add r0, r0, #3 ; += 3
|
||||
smusd r3, r3, lr ; A3 = a1<<2 - d1<<2
|
||||
smusd r7, r7, lr ; C3 = a1<<2 - d1<<2
|
||||
mov r0, r0, asr #3 ; >> 3
|
||||
strh r0, [r1, #16] ; op[8]
|
||||
|
||||
|
||||
; op[3,7,11,15]
|
||||
add r0, r3, r7 ; a1_3 = A3 + C3
|
||||
sub r3, r3, r7 ; b1_3 = A3 - C3
|
||||
|
||||
smusd r5, r5, lr ; B3 = a1<<2 - d1<<2
|
||||
smusd r9, r9, lr ; D3 = a1<<2 - d1<<2
|
||||
add r7, r5, r9 ; d1_3 = B3 + D3
|
||||
sub r5, r5, r9 ; c1_3 = B3 - D3
|
||||
|
||||
adds r2, r0, r7 ; a2 = a1_3 + d1_3
|
||||
addmi r2, r2, #1 ; += a2 < 0
|
||||
add r2, r2, #3 ; += 3
|
||||
adds r9, r3, r5 ; b2 = b1_3 + c1_3
|
||||
mov r2, r2, asr #3 ; >> 3
|
||||
strh r2, [r1, #6] ; op[3]
|
||||
|
||||
addmi r9, r9, #1 ; += a2 < 0
|
||||
add r9, r9, #3 ; += 3
|
||||
subs r2, r3, r5 ; c2 = b1_3 - c1_3
|
||||
mov r9, r9, asr #3 ; >> 3
|
||||
strh r9, [r1, #14] ; op[7]
|
||||
|
||||
addmi r2, r2, #1 ; += a2 < 0
|
||||
add r2, r2, #3 ; += 3
|
||||
subs r9, r0, r7 ; d2 = a1_3 - d1_3
|
||||
mov r2, r2, asr #3 ; >> 3
|
||||
strh r2, [r1, #22] ; op[11]
|
||||
|
||||
addmi r9, r9, #1 ; += a2 < 0
|
||||
add r9, r9, #3 ; += 3
|
||||
smuad r3, r4, lr ; A1 = b1<<2 + c1<<2
|
||||
smuad r5, r8, lr ; C1 = b1<<2 + c1<<2
|
||||
mov r9, r9, asr #3 ; >> 3
|
||||
strh r9, [r1, #30] ; op[15]
|
||||
|
||||
; op[1,5,9,13]
|
||||
add r0, r3, r5 ; a1_1 = A1 + C1
|
||||
sub r3, r3, r5 ; b1_1 = A1 - C1
|
||||
|
||||
smuad r7, r6, lr ; B1 = b1<<2 + c1<<2
|
||||
smuad r9, r10, lr ; D1 = b1<<2 + c1<<2
|
||||
add r5, r7, r9 ; d1_1 = B1 + D1
|
||||
sub r7, r7, r9 ; c1_1 = B1 - D1
|
||||
|
||||
adds r2, r0, r5 ; a2 = a1_1 + d1_1
|
||||
addmi r2, r2, #1 ; += a2 < 0
|
||||
add r2, r2, #3 ; += 3
|
||||
adds r9, r3, r7 ; b2 = b1_1 + c1_1
|
||||
mov r2, r2, asr #3 ; >> 3
|
||||
strh r2, [r1, #2] ; op[1]
|
||||
|
||||
addmi r9, r9, #1 ; += a2 < 0
|
||||
add r9, r9, #3 ; += 3
|
||||
subs r2, r3, r7 ; c2 = b1_1 - c1_1
|
||||
mov r9, r9, asr #3 ; >> 3
|
||||
strh r9, [r1, #10] ; op[5]
|
||||
|
||||
addmi r2, r2, #1 ; += a2 < 0
|
||||
add r2, r2, #3 ; += 3
|
||||
subs r9, r0, r5 ; d2 = a1_1 - d1_1
|
||||
mov r2, r2, asr #3 ; >> 3
|
||||
strh r2, [r1, #18] ; op[9]
|
||||
|
||||
addmi r9, r9, #1 ; += a2 < 0
|
||||
add r9, r9, #3 ; += 3
|
||||
smusd r4, r4, lr ; A2 = b1<<2 - c1<<2
|
||||
smusd r8, r8, lr ; C2 = b1<<2 - c1<<2
|
||||
mov r9, r9, asr #3 ; >> 3
|
||||
strh r9, [r1, #26] ; op[13]
|
||||
|
||||
|
||||
; op[2,6,10,14]
|
||||
add r11, r4, r8 ; a1_2 = A2 + C2
|
||||
sub r12, r4, r8 ; b1_2 = A2 - C2
|
||||
|
||||
smusd r6, r6, lr ; B2 = b1<<2 - c1<<2
|
||||
smusd r10, r10, lr ; D2 = b1<<2 - c1<<2
|
||||
add r4, r6, r10 ; d1_2 = B2 + D2
|
||||
sub r8, r6, r10 ; c1_2 = B2 - D2
|
||||
|
||||
adds r2, r11, r4 ; a2 = a1_2 + d1_2
|
||||
addmi r2, r2, #1 ; += a2 < 0
|
||||
add r2, r2, #3 ; += 3
|
||||
adds r9, r12, r8 ; b2 = b1_2 + c1_2
|
||||
mov r2, r2, asr #3 ; >> 3
|
||||
strh r2, [r1, #4] ; op[2]
|
||||
|
||||
addmi r9, r9, #1 ; += a2 < 0
|
||||
add r9, r9, #3 ; += 3
|
||||
subs r2, r12, r8 ; c2 = b1_2 - c1_2
|
||||
mov r9, r9, asr #3 ; >> 3
|
||||
strh r9, [r1, #12] ; op[6]
|
||||
|
||||
addmi r2, r2, #1 ; += a2 < 0
|
||||
add r2, r2, #3 ; += 3
|
||||
subs r9, r11, r4 ; d2 = a1_2 - d1_2
|
||||
mov r2, r2, asr #3 ; >> 3
|
||||
strh r2, [r1, #20] ; op[10]
|
||||
|
||||
addmi r9, r9, #1 ; += a2 < 0
|
||||
add r9, r9, #3 ; += 3
|
||||
mov r9, r9, asr #3 ; >> 3
|
||||
strh r9, [r1, #28] ; op[14]
|
||||
|
||||
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
ENDP ; |vp8_short_walsh4x4_armv6|
|
||||
|
||||
c00040004
|
||||
DCD 0x00040004
|
||||
|
||||
END
|
|
@ -1,261 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_fast_quantize_b_neon|
|
||||
EXPORT |vp8_fast_quantize_b_pair_neon|
|
||||
|
||||
INCLUDE vp9_asm_enc_offsets.asm
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=4
|
||||
|
||||
;vp8_fast_quantize_b_pair_neon(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2);
|
||||
|vp8_fast_quantize_b_pair_neon| PROC
|
||||
|
||||
stmfd sp!, {r4-r9}
|
||||
vstmdb sp!, {q4-q7}
|
||||
|
||||
ldr r4, [r0, #vp8_block_coeff]
|
||||
ldr r5, [r0, #vp8_block_quant_fast]
|
||||
ldr r6, [r0, #vp8_block_round]
|
||||
|
||||
vld1.16 {q0, q1}, [r4@128] ; load z
|
||||
|
||||
ldr r7, [r2, #vp8_blockd_qcoeff]
|
||||
|
||||
vabs.s16 q4, q0 ; calculate x = abs(z)
|
||||
vabs.s16 q5, q1
|
||||
|
||||
;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
|
||||
vshr.s16 q2, q0, #15 ; sz
|
||||
vshr.s16 q3, q1, #15
|
||||
|
||||
vld1.s16 {q6, q7}, [r6@128] ; load round_ptr [0-15]
|
||||
vld1.s16 {q8, q9}, [r5@128] ; load quant_ptr [0-15]
|
||||
|
||||
ldr r4, [r1, #vp8_block_coeff]
|
||||
|
||||
vadd.s16 q4, q6 ; x + Round
|
||||
vadd.s16 q5, q7
|
||||
|
||||
vld1.16 {q0, q1}, [r4@128] ; load z2
|
||||
|
||||
vqdmulh.s16 q4, q8 ; y = ((Round+abs(z)) * Quant) >> 16
|
||||
vqdmulh.s16 q5, q9
|
||||
|
||||
vabs.s16 q10, q0 ; calculate x2 = abs(z_2)
|
||||
vabs.s16 q11, q1
|
||||
vshr.s16 q12, q0, #15 ; sz2
|
||||
vshr.s16 q13, q1, #15
|
||||
|
||||
;modify data to have its original sign
|
||||
veor.s16 q4, q2 ; y^sz
|
||||
veor.s16 q5, q3
|
||||
|
||||
vadd.s16 q10, q6 ; x2 + Round
|
||||
vadd.s16 q11, q7
|
||||
|
||||
ldr r8, [r2, #vp8_blockd_dequant]
|
||||
|
||||
vqdmulh.s16 q10, q8 ; y2 = ((Round+abs(z)) * Quant) >> 16
|
||||
vqdmulh.s16 q11, q9
|
||||
|
||||
vshr.s16 q4, #1 ; right shift 1 after vqdmulh
|
||||
vshr.s16 q5, #1
|
||||
|
||||
vld1.s16 {q6, q7}, [r8@128] ;load dequant_ptr[i]
|
||||
|
||||
vsub.s16 q4, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
|
||||
vsub.s16 q5, q3
|
||||
|
||||
vshr.s16 q10, #1 ; right shift 1 after vqdmulh
|
||||
vshr.s16 q11, #1
|
||||
|
||||
ldr r9, [r2, #vp8_blockd_dqcoeff]
|
||||
|
||||
veor.s16 q10, q12 ; y2^sz2
|
||||
veor.s16 q11, q13
|
||||
|
||||
vst1.s16 {q4, q5}, [r7] ; store: qcoeff = x1
|
||||
|
||||
|
||||
vsub.s16 q10, q12 ; x2=(y^sz)-sz = (y^sz)-(-1) (2's complement)
|
||||
vsub.s16 q11, q13
|
||||
|
||||
ldr r6, [r3, #vp8_blockd_qcoeff]
|
||||
|
||||
vmul.s16 q2, q6, q4 ; x * Dequant
|
||||
vmul.s16 q3, q7, q5
|
||||
|
||||
ldr r0, _inv_zig_zag_ ; load ptr of inverse zigzag table
|
||||
|
||||
vceq.s16 q8, q8 ; set q8 to all 1
|
||||
|
||||
vst1.s16 {q10, q11}, [r6] ; store: qcoeff = x2
|
||||
|
||||
vmul.s16 q12, q6, q10 ; x2 * Dequant
|
||||
vmul.s16 q13, q7, q11
|
||||
|
||||
vld1.16 {q6, q7}, [r0@128] ; load inverse scan order
|
||||
|
||||
vtst.16 q14, q4, q8 ; now find eob
|
||||
vtst.16 q15, q5, q8 ; non-zero element is set to all 1
|
||||
|
||||
vst1.s16 {q2, q3}, [r9] ; store dqcoeff = x * Dequant
|
||||
|
||||
ldr r7, [r3, #vp8_blockd_dqcoeff]
|
||||
|
||||
vand q0, q6, q14 ; get all valid numbers from scan array
|
||||
vand q1, q7, q15
|
||||
|
||||
vst1.s16 {q12, q13}, [r7] ; store dqcoeff = x * Dequant
|
||||
|
||||
vtst.16 q2, q10, q8 ; now find eob
|
||||
vtst.16 q3, q11, q8 ; non-zero element is set to all 1
|
||||
|
||||
vmax.u16 q0, q0, q1 ; find maximum value in q0, q1
|
||||
|
||||
vand q10, q6, q2 ; get all valid numbers from scan array
|
||||
vand q11, q7, q3
|
||||
vmax.u16 q10, q10, q11 ; find maximum value in q10, q11
|
||||
|
||||
vmax.u16 d0, d0, d1
|
||||
vmax.u16 d20, d20, d21
|
||||
vmovl.u16 q0, d0
|
||||
vmovl.u16 q10, d20
|
||||
|
||||
|
||||
vmax.u32 d0, d0, d1
|
||||
vmax.u32 d20, d20, d21
|
||||
vpmax.u32 d0, d0, d0
|
||||
vpmax.u32 d20, d20, d20
|
||||
|
||||
add r4, r2, #vp8_blockd_eob
|
||||
add r5, r3, #vp8_blockd_eob
|
||||
|
||||
vst1.32 {d0[0]}, [r4@32]
|
||||
vst1.32 {d20[0]}, [r5@32]
|
||||
|
||||
vldmia sp!, {q4-q7}
|
||||
ldmfd sp!, {r4-r9}
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
;void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
|
||||
|vp8_fast_quantize_b_neon| PROC
|
||||
|
||||
stmfd sp!, {r4-r7}
|
||||
|
||||
ldr r3, [r0, #vp8_block_coeff]
|
||||
ldr r4, [r0, #vp8_block_quant_fast]
|
||||
ldr r5, [r0, #vp8_block_round]
|
||||
|
||||
vld1.16 {q0, q1}, [r3@128] ; load z
|
||||
vorr.s16 q14, q0, q1 ; check if all zero (step 1)
|
||||
ldr r6, [r1, #vp8_blockd_qcoeff]
|
||||
ldr r7, [r1, #vp8_blockd_dqcoeff]
|
||||
vorr.s16 d28, d28, d29 ; check if all zero (step 2)
|
||||
|
||||
vabs.s16 q12, q0 ; calculate x = abs(z)
|
||||
vabs.s16 q13, q1
|
||||
|
||||
;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
|
||||
vshr.s16 q2, q0, #15 ; sz
|
||||
vmov r2, r3, d28 ; check if all zero (step 3)
|
||||
vshr.s16 q3, q1, #15
|
||||
|
||||
vld1.s16 {q14, q15}, [r5@128]; load round_ptr [0-15]
|
||||
vld1.s16 {q8, q9}, [r4@128] ; load quant_ptr [0-15]
|
||||
|
||||
vadd.s16 q12, q14 ; x + Round
|
||||
vadd.s16 q13, q15
|
||||
|
||||
ldr r0, _inv_zig_zag_ ; load ptr of inverse zigzag table
|
||||
|
||||
vqdmulh.s16 q12, q8 ; y = ((Round+abs(z)) * Quant) >> 16
|
||||
vqdmulh.s16 q13, q9
|
||||
|
||||
vld1.16 {q10, q11}, [r0@128]; load inverse scan order
|
||||
|
||||
vceq.s16 q8, q8 ; set q8 to all 1
|
||||
|
||||
ldr r4, [r1, #vp8_blockd_dequant]
|
||||
|
||||
vshr.s16 q12, #1 ; right shift 1 after vqdmulh
|
||||
vshr.s16 q13, #1
|
||||
|
||||
orr r2, r2, r3 ; check if all zero (step 4)
|
||||
cmp r2, #0 ; check if all zero (step 5)
|
||||
beq zero_output ; check if all zero (step 6)
|
||||
|
||||
;modify data to have its original sign
|
||||
veor.s16 q12, q2 ; y^sz
|
||||
veor.s16 q13, q3
|
||||
|
||||
vsub.s16 q12, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
|
||||
vsub.s16 q13, q3
|
||||
|
||||
vld1.s16 {q2, q3}, [r4@128] ; load dequant_ptr[i]
|
||||
|
||||
vtst.16 q14, q12, q8 ; now find eob
|
||||
vtst.16 q15, q13, q8 ; non-zero element is set to all 1
|
||||
|
||||
vst1.s16 {q12, q13}, [r6@128]; store: qcoeff = x1
|
||||
|
||||
vand q10, q10, q14 ; get all valid numbers from scan array
|
||||
vand q11, q11, q15
|
||||
|
||||
|
||||
vmax.u16 q0, q10, q11 ; find maximum value in q0, q1
|
||||
vmax.u16 d0, d0, d1
|
||||
vmovl.u16 q0, d0
|
||||
|
||||
vmul.s16 q2, q12 ; x * Dequant
|
||||
vmul.s16 q3, q13
|
||||
|
||||
vmax.u32 d0, d0, d1
|
||||
vpmax.u32 d0, d0, d0
|
||||
|
||||
vst1.s16 {q2, q3}, [r7@128] ; store dqcoeff = x * Dequant
|
||||
|
||||
add r4, r1, #vp8_blockd_eob
|
||||
vst1.32 {d0[0]}, [r4@32]
|
||||
|
||||
ldmfd sp!, {r4-r7}
|
||||
bx lr
|
||||
|
||||
zero_output
|
||||
str r2, [r1, #vp8_blockd_eob]
|
||||
vst1.s16 {q0, q1}, [r6@128] ; qcoeff = 0
|
||||
vst1.s16 {q0, q1}, [r7@128] ; dqcoeff = 0
|
||||
|
||||
ldmfd sp!, {r4-r7}
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
; default inverse zigzag table is defined in vp9/common/vp9_entropy.c
|
||||
_inv_zig_zag_
|
||||
DCD inv_zig_zag
|
||||
|
||||
ALIGN 16 ; enable use of @128 bit aligned loads
|
||||
inv_zig_zag
|
||||
DCW 0x0001, 0x0002, 0x0006, 0x0007
|
||||
DCW 0x0003, 0x0005, 0x0008, 0x000d
|
||||
DCW 0x0004, 0x0009, 0x000c, 0x000e
|
||||
DCW 0x000a, 0x000b, 0x000f, 0x0010
|
||||
|
||||
END
|
||||
|
|
@ -1,68 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_memcpy_neon|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
;=========================================
|
||||
;void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz);
|
||||
;-----------------------------------------------------------------------
; void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr,
;                      int sz);
; In:  r0 = dst_ptr, r1 = src_ptr, r2 = sz (bytes)
; Copies sz bytes: first in 256-byte NEON chunks (sz >> 8 iterations),
; then the remainder (sz & 0xff) in 16-byte chunks.
; NOTE(review): the tail loop decrements by 16 and exits only when the
; counter reaches exactly 0, so it assumes (sz & 0xff) is a multiple of
; 16 -- confirm at call sites.
; Clobbers: r3, r12, q0-q15, flags.
;-----------------------------------------------------------------------
|vp8_memcpy_neon| PROC
    ;pld [r1] ;preload pred data
    ;pld [r1, #128]
    ;pld [r1, #256]
    ;pld [r1, #384]

    mov r12, r2, lsr #8 ;copy 256 bytes data at one time

memcpy_neon_loop
    vld1.8 {q0, q1}, [r1]! ;load src data
    subs r12, r12, #1               ; decrement chunk counter; flags feed bne below
    vld1.8 {q2, q3}, [r1]!
    vst1.8 {q0, q1}, [r0]! ;copy to dst_ptr
    vld1.8 {q4, q5}, [r1]!
    vst1.8 {q2, q3}, [r0]!
    vld1.8 {q6, q7}, [r1]!
    vst1.8 {q4, q5}, [r0]!
    vld1.8 {q8, q9}, [r1]!
    vst1.8 {q6, q7}, [r0]!
    vld1.8 {q10, q11}, [r1]!
    vst1.8 {q8, q9}, [r0]!
    vld1.8 {q12, q13}, [r1]!
    vst1.8 {q10, q11}, [r0]!
    vld1.8 {q14, q15}, [r1]!
    vst1.8 {q12, q13}, [r0]!
    vst1.8 {q14, q15}, [r0]!

    ;pld [r1] ;preload pred data -- need to adjust for real device
    ;pld [r1, #128]
    ;pld [r1, #256]
    ;pld [r1, #384]

    bne memcpy_neon_loop

    ands r3, r2, #0xff ;extra copy   ; remainder byte count; also sets Z flag
    beq done_copy_neon_loop

extra_copy_neon_loop
    vld1.8 {q0}, [r1]! ;load src data
    subs r3, r3, #16
    vst1.8 {q0}, [r0]!
    bne extra_copy_neon_loop

done_copy_neon_loop
    bx lr
    ENDP
|
||||
|
||||
END
|
|
@ -1,116 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_mse16x16_neon|
|
||||
EXPORT |vp8_get4x4sse_cs_neon|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
;============================
|
||||
; r0 unsigned char *src_ptr
|
||||
; r1 int source_stride
|
||||
; r2 unsigned char *ref_ptr
|
||||
; r3 int recon_stride
|
||||
; stack unsigned int *sse
|
||||
;note: in this function, sum is never used. So, we can remove this part of calculation
|
||||
;from vp9_variance().
|
||||
|
||||
;-----------------------------------------------------------------------
; unsigned int vp8_mse16x16_neon(unsigned char *src_ptr, int source_stride,
;                                unsigned char *ref_ptr, int recon_stride,
;                                unsigned int *sse);
; In:   r0 = src_ptr, r1 = source_stride, r2 = ref_ptr, r3 = recon_stride,
;       [sp] = sse (output pointer)
; Out:  r0 = sum of squared differences over the 16x16 block; the same
;       value is also stored to *sse.
; The sum term of a full variance is never computed here (see note in
; the original source: vp9_variance() does not need it for mse).
; Clobbers: r12, q0-q3, q7-q14, flags.
;-----------------------------------------------------------------------
|vp8_mse16x16_neon| PROC
    vmov.i8 q7, #0 ;q7, q8, q9, q10 - sse
    vmov.i8 q8, #0
    vmov.i8 q9, #0
    vmov.i8 q10, #0

    mov r12, #8                     ; 8 iterations x 2 rows = 16 rows

mse16x16_neon_loop
    vld1.8 {q0}, [r0], r1 ;Load up source and reference
    vld1.8 {q2}, [r2], r3
    vld1.8 {q1}, [r0], r1
    vld1.8 {q3}, [r2], r3

    vsubl.u8 q11, d0, d4            ; widen to s16 differences
    vsubl.u8 q12, d1, d5
    vsubl.u8 q13, d2, d6
    vsubl.u8 q14, d3, d7

    vmlal.s16 q7, d22, d22          ; accumulate diff^2 into s32 lanes
    vmlal.s16 q8, d23, d23

    subs r12, r12, #1

    vmlal.s16 q9, d24, d24
    vmlal.s16 q10, d25, d25
    vmlal.s16 q7, d26, d26
    vmlal.s16 q8, d27, d27
    vmlal.s16 q9, d28, d28
    vmlal.s16 q10, d29, d29

    bne mse16x16_neon_loop

    vadd.u32 q7, q7, q8             ; horizontal reduction of the four
    vadd.u32 q9, q9, q10            ; accumulators down to one scalar

    ldr r12, [sp] ;load *sse from stack

    vadd.u32 q10, q7, q9
    vpaddl.u32 q1, q10
    vadd.u64 d0, d2, d3

    vst1.32 {d0[0]}, [r12]          ; *sse = result
    vmov.32 r0, d0[0]               ; also return it in r0

    bx lr

    ENDP
|
||||
|
||||
|
||||
;=============================
|
||||
; r0 unsigned char *src_ptr,
|
||||
; r1 int source_stride,
|
||||
; r2 unsigned char *ref_ptr,
|
||||
; r3 int recon_stride
|
||||
;-----------------------------------------------------------------------
; unsigned int vp8_get4x4sse_cs_neon(unsigned char *src_ptr,
;                                    int source_stride,
;                                    unsigned char *ref_ptr,
;                                    int recon_stride);
; In:   r0 = src_ptr, r1 = source_stride, r2 = ref_ptr, r3 = recon_stride
; Out:  r0 = sum of squared differences.
; NOTE(review): each vld1.8 {dN} reads 8 bytes per row and all 8 lanes
; are squared and accumulated, so this computes the SSE of a 4x8 region,
; not strictly 4x4 -- confirm callers only compare like-for-like values.
; Clobbers: q0-q3, q7-q14, flags.
;-----------------------------------------------------------------------
|vp8_get4x4sse_cs_neon| PROC
    vld1.8 {d0}, [r0], r1 ;Load up source and reference
    vld1.8 {d4}, [r2], r3
    vld1.8 {d1}, [r0], r1
    vld1.8 {d5}, [r2], r3
    vld1.8 {d2}, [r0], r1
    vld1.8 {d6}, [r2], r3
    vld1.8 {d3}, [r0], r1
    vld1.8 {d7}, [r2], r3

    vsubl.u8 q11, d0, d4            ; widen to s16 differences per row
    vsubl.u8 q12, d1, d5
    vsubl.u8 q13, d2, d6
    vsubl.u8 q14, d3, d7

    vmull.s16 q7, d22, d22          ; square the differences
    vmull.s16 q8, d24, d24
    vmull.s16 q9, d26, d26
    vmull.s16 q10, d28, d28

    vadd.u32 q7, q7, q8             ; reduce the four accumulators
    vadd.u32 q9, q9, q10
    vadd.u32 q9, q7, q9

    vpaddl.u32 q1, q9
    vadd.u64 d0, d2, d3

    vmov.32 r0, d0[0]               ; scalar result
    bx lr

    ENDP
|
||||
|
||||
END
|
|
@ -1,48 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
|
||||
#include "vp9/common/vp9_onyxc_int.h"
|
||||
#include "vp9/encoder/vp9_onyx_int.h"
|
||||
#include "vp9/encoder/vp9_quantize.h"
|
||||
#include "vpx_mem/vpx_mem.h"
|
||||
#include "vpx_scale/vpxscale.h"
|
||||
#include "vp9/common/vp9_alloccommon.h"
|
||||
|
||||
extern void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz);
|
||||
|
||||
|
||||
/* Copy a horizontal band from the middle of the luma plane of src_ybc
 * into the same position of dst_ybc, using the NEON block copy.
 *
 * Fraction selects the band height: linestocopy = (y_height >> (Fraction
 * + 4)) rounded up to at least 1, then multiplied by 16 (i.e. a multiple
 * of 16 rows). The band starts 8 rows above the vertical midpoint
 * ((y_height >> 5) * 16 - 8 rows from the top).
 *
 * NOTE(review): 'border' is read but never used.
 * NOTE(review): ystride * (linestocopy + 16) bytes are copied, i.e. 16
 * rows more than 'linestocopy' -- presumably intentional overlap, but
 * confirm against the generic C version of this routine.
 */
void
vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction) {
  unsigned char *src_y, *dst_y;
  int yheight;      /* luma plane height */
  int ystride;      /* luma plane stride in bytes */
  int border;       /* frame border size (unused here) */
  int yoffset;      /* byte offset of the band start within the plane */
  int linestocopy;  /* number of 16-row units to copy, in rows */

  border = src_ybc->border;
  yheight = src_ybc->y_height;
  ystride = src_ybc->y_stride;

  linestocopy = (yheight >> (Fraction + 4));

  /* always copy at least one 16-row unit */
  if (linestocopy < 1)
    linestocopy = 1;

  linestocopy <<= 4;

  /* start 8 rows above the (16-aligned) vertical midpoint */
  yoffset = ystride * ((yheight >> 5) * 16 - 8);
  src_y = src_ybc->y_buffer + yoffset;
  dst_y = dst_ybc->y_buffer + yoffset;

  // vpx_memcpy (dst_y, src_y, ystride * (linestocopy +16));
  vp8_memcpy_neon((unsigned char *)dst_y, (unsigned char *)src_y, (int)(ystride * (linestocopy + 16)));
}
|
|
@ -1,207 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_sad16x16_neon|
|
||||
EXPORT |vp8_sad16x8_neon|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; r0 unsigned char *src_ptr
|
||||
; r1 int src_stride
|
||||
; r2 unsigned char *ref_ptr
|
||||
; r3 int ref_stride
|
||||
;-----------------------------------------------------------------------
; unsigned int vp8_sad16x16_neon(unsigned char *src_ptr, int src_stride,
;                                unsigned char *ref_ptr, int ref_stride);
; In:   r0 = src_ptr, r1 = src_stride, r2 = ref_ptr, r3 = ref_stride
; Out:  r0 = sum of absolute differences over the 16x16 block.
; Fully unrolled: four groups of four rows each; q12/q13 hold the two
; running u16 SAD accumulators (vabdl starts them, vabal accumulates).
; Clobbers: q0-q7, q12, q13.
;-----------------------------------------------------------------------
|vp8_sad16x16_neon| PROC
    ;; rows 0-3
    vld1.8 {q0}, [r0], r1
    vld1.8 {q4}, [r2], r3

    vld1.8 {q1}, [r0], r1
    vld1.8 {q5}, [r2], r3

    vabdl.u8 q12, d0, d8            ; initialize accumulators with row 0
    vabdl.u8 q13, d1, d9

    vld1.8 {q2}, [r0], r1
    vld1.8 {q6}, [r2], r3

    vabal.u8 q12, d2, d10
    vabal.u8 q13, d3, d11

    vld1.8 {q3}, [r0], r1
    vld1.8 {q7}, [r2], r3

    vabal.u8 q12, d4, d12
    vabal.u8 q13, d5, d13

    ;; rows 4-7
    vld1.8 {q0}, [r0], r1
    vld1.8 {q4}, [r2], r3

    vabal.u8 q12, d6, d14
    vabal.u8 q13, d7, d15

    vld1.8 {q1}, [r0], r1
    vld1.8 {q5}, [r2], r3

    vabal.u8 q12, d0, d8
    vabal.u8 q13, d1, d9

    vld1.8 {q2}, [r0], r1
    vld1.8 {q6}, [r2], r3

    vabal.u8 q12, d2, d10
    vabal.u8 q13, d3, d11

    vld1.8 {q3}, [r0], r1
    vld1.8 {q7}, [r2], r3

    vabal.u8 q12, d4, d12
    vabal.u8 q13, d5, d13

    ;; rows 8-11
    vld1.8 {q0}, [r0], r1
    vld1.8 {q4}, [r2], r3

    vabal.u8 q12, d6, d14
    vabal.u8 q13, d7, d15

    vld1.8 {q1}, [r0], r1
    vld1.8 {q5}, [r2], r3

    vabal.u8 q12, d0, d8
    vabal.u8 q13, d1, d9

    vld1.8 {q2}, [r0], r1
    vld1.8 {q6}, [r2], r3

    vabal.u8 q12, d2, d10
    vabal.u8 q13, d3, d11

    vld1.8 {q3}, [r0], r1
    vld1.8 {q7}, [r2], r3

    vabal.u8 q12, d4, d12
    vabal.u8 q13, d5, d13

    ;; rows 12-15
    vld1.8 {q0}, [r0], r1
    vld1.8 {q4}, [r2], r3

    vabal.u8 q12, d6, d14
    vabal.u8 q13, d7, d15

    vld1.8 {q1}, [r0], r1
    vld1.8 {q5}, [r2], r3

    vabal.u8 q12, d0, d8
    vabal.u8 q13, d1, d9

    vld1.8 {q2}, [r0], r1
    vld1.8 {q6}, [r2], r3

    vabal.u8 q12, d2, d10
    vabal.u8 q13, d3, d11

    vld1.8 {q3}, [r0]               ; last row: no stride advance needed
    vld1.8 {q7}, [r2]

    vabal.u8 q12, d4, d12
    vabal.u8 q13, d5, d13

    vabal.u8 q12, d6, d14
    vabal.u8 q13, d7, d15

    vadd.u16 q0, q12, q13           ; reduce the two accumulators

    vpaddl.u16 q1, q0               ; pairwise widen-and-add down to u64
    vpaddl.u32 q0, q1

    vadd.u32 d0, d0, d1

    vmov.32 r0, d0[0]               ; scalar SAD result

    bx lr

    ENDP
|
||||
|
||||
;==============================
|
||||
;unsigned int vp8_sad16x8_c(
|
||||
; unsigned char *src_ptr,
|
||||
; int src_stride,
|
||||
; unsigned char *ref_ptr,
|
||||
; int ref_stride)
|
||||
;-----------------------------------------------------------------------
; unsigned int vp8_sad16x8_neon(unsigned char *src_ptr, int src_stride,
;                               unsigned char *ref_ptr, int ref_stride);
; In:   r0 = src_ptr, r1 = src_stride, r2 = ref_ptr, r3 = ref_stride
; Out:  r0 = sum of absolute differences over the 16x8 block.
; Fully unrolled over 8 rows; q12/q13 are the running u16 accumulators.
; Clobbers: q0-q7, q12, q13.
;-----------------------------------------------------------------------
|vp8_sad16x8_neon| PROC
    vld1.8 {q0}, [r0], r1
    vld1.8 {q4}, [r2], r3

    vld1.8 {q1}, [r0], r1
    vld1.8 {q5}, [r2], r3

    vabdl.u8 q12, d0, d8            ; initialize accumulators with row 0
    vabdl.u8 q13, d1, d9

    vld1.8 {q2}, [r0], r1
    vld1.8 {q6}, [r2], r3

    vabal.u8 q12, d2, d10
    vabal.u8 q13, d3, d11

    vld1.8 {q3}, [r0], r1
    vld1.8 {q7}, [r2], r3

    vabal.u8 q12, d4, d12
    vabal.u8 q13, d5, d13

    vld1.8 {q0}, [r0], r1
    vld1.8 {q4}, [r2], r3

    vabal.u8 q12, d6, d14
    vabal.u8 q13, d7, d15

    vld1.8 {q1}, [r0], r1
    vld1.8 {q5}, [r2], r3

    vabal.u8 q12, d0, d8
    vabal.u8 q13, d1, d9

    vld1.8 {q2}, [r0], r1
    vld1.8 {q6}, [r2], r3

    vabal.u8 q12, d2, d10
    vabal.u8 q13, d3, d11

    vld1.8 {q3}, [r0], r1
    vld1.8 {q7}, [r2], r3

    vabal.u8 q12, d4, d12
    vabal.u8 q13, d5, d13

    vabal.u8 q12, d6, d14
    vabal.u8 q13, d7, d15

    vadd.u16 q0, q12, q13           ; reduce the two accumulators

    vpaddl.u16 q1, q0               ; pairwise widen-and-add down to u64
    vpaddl.u32 q0, q1

    vadd.u32 d0, d0, d1

    vmov.32 r0, d0[0]               ; scalar SAD result

    bx lr

    ENDP
|
||||
|
||||
END
|
|
@ -1,209 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_sad8x8_neon|
|
||||
EXPORT |vp8_sad8x16_neon|
|
||||
EXPORT |vp8_sad4x4_neon|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
; unsigned int vp8_sad8x8_c(
|
||||
; unsigned char *src_ptr,
|
||||
; int src_stride,
|
||||
; unsigned char *ref_ptr,
|
||||
; int ref_stride)
|
||||
|
||||
;-----------------------------------------------------------------------
; unsigned int vp8_sad8x8_neon(unsigned char *src_ptr, int src_stride,
;                              unsigned char *ref_ptr, int ref_stride);
; In:   r0 = src_ptr, r1 = src_stride, r2 = ref_ptr, r3 = ref_stride
; Out:  r0 = sum of absolute differences over the 8x8 block.
; Fully unrolled over 8 rows; q12 is the single u16 accumulator
; (8 bytes per row fit one d register).
; Clobbers: d0-d14 (even d regs for src, d8-d14 for ref), q12.
;-----------------------------------------------------------------------
|vp8_sad8x8_neon| PROC
    vld1.8 {d0}, [r0], r1
    vld1.8 {d8}, [r2], r3

    vld1.8 {d2}, [r0], r1
    vld1.8 {d10}, [r2], r3

    vabdl.u8 q12, d0, d8            ; initialize accumulator with row 0

    vld1.8 {d4}, [r0], r1
    vld1.8 {d12}, [r2], r3

    vabal.u8 q12, d2, d10

    vld1.8 {d6}, [r0], r1
    vld1.8 {d14}, [r2], r3

    vabal.u8 q12, d4, d12

    vld1.8 {d0}, [r0], r1
    vld1.8 {d8}, [r2], r3

    vabal.u8 q12, d6, d14

    vld1.8 {d2}, [r0], r1
    vld1.8 {d10}, [r2], r3

    vabal.u8 q12, d0, d8

    vld1.8 {d4}, [r0], r1
    vld1.8 {d12}, [r2], r3

    vabal.u8 q12, d2, d10

    vld1.8 {d6}, [r0], r1
    vld1.8 {d14}, [r2], r3

    vabal.u8 q12, d4, d12
    vabal.u8 q12, d6, d14

    vpaddl.u16 q1, q12              ; pairwise widen-and-add to a scalar
    vpaddl.u32 q0, q1
    vadd.u32 d0, d0, d1

    vmov.32 r0, d0[0]               ; scalar SAD result

    bx lr

    ENDP
|
||||
|
||||
;============================
|
||||
;unsigned int vp8_sad8x16_c(
|
||||
; unsigned char *src_ptr,
|
||||
; int src_stride,
|
||||
; unsigned char *ref_ptr,
|
||||
; int ref_stride)
|
||||
|
||||
;-----------------------------------------------------------------------
; unsigned int vp8_sad8x16_neon(unsigned char *src_ptr, int src_stride,
;                               unsigned char *ref_ptr, int ref_stride);
; In:   r0 = src_ptr, r1 = src_stride, r2 = ref_ptr, r3 = ref_stride
; Out:  r0 = sum of absolute differences over the 8x16 block.
; Fully unrolled over 16 rows; q12 is the single u16 accumulator.
; Clobbers: d0-d14 (even d regs for src, d8-d14 for ref), q12.
;-----------------------------------------------------------------------
|vp8_sad8x16_neon| PROC
    vld1.8 {d0}, [r0], r1
    vld1.8 {d8}, [r2], r3

    vld1.8 {d2}, [r0], r1
    vld1.8 {d10}, [r2], r3

    vabdl.u8 q12, d0, d8            ; initialize accumulator with row 0

    vld1.8 {d4}, [r0], r1
    vld1.8 {d12}, [r2], r3

    vabal.u8 q12, d2, d10

    vld1.8 {d6}, [r0], r1
    vld1.8 {d14}, [r2], r3

    vabal.u8 q12, d4, d12

    vld1.8 {d0}, [r0], r1
    vld1.8 {d8}, [r2], r3

    vabal.u8 q12, d6, d14

    vld1.8 {d2}, [r0], r1
    vld1.8 {d10}, [r2], r3

    vabal.u8 q12, d0, d8

    vld1.8 {d4}, [r0], r1
    vld1.8 {d12}, [r2], r3

    vabal.u8 q12, d2, d10

    vld1.8 {d6}, [r0], r1
    vld1.8 {d14}, [r2], r3

    vabal.u8 q12, d4, d12

    vld1.8 {d0}, [r0], r1
    vld1.8 {d8}, [r2], r3

    vabal.u8 q12, d6, d14

    vld1.8 {d2}, [r0], r1
    vld1.8 {d10}, [r2], r3

    vabal.u8 q12, d0, d8

    vld1.8 {d4}, [r0], r1
    vld1.8 {d12}, [r2], r3

    vabal.u8 q12, d2, d10

    vld1.8 {d6}, [r0], r1
    vld1.8 {d14}, [r2], r3

    vabal.u8 q12, d4, d12

    vld1.8 {d0}, [r0], r1
    vld1.8 {d8}, [r2], r3

    vabal.u8 q12, d6, d14

    vld1.8 {d2}, [r0], r1
    vld1.8 {d10}, [r2], r3

    vabal.u8 q12, d0, d8

    vld1.8 {d4}, [r0], r1
    vld1.8 {d12}, [r2], r3

    vabal.u8 q12, d2, d10

    vld1.8 {d6}, [r0], r1
    vld1.8 {d14}, [r2], r3

    vabal.u8 q12, d4, d12
    vabal.u8 q12, d6, d14

    vpaddl.u16 q1, q12              ; pairwise widen-and-add to a scalar
    vpaddl.u32 q0, q1
    vadd.u32 d0, d0, d1

    vmov.32 r0, d0[0]               ; scalar SAD result

    bx lr

    ENDP
|
||||
|
||||
;===========================
|
||||
;unsigned int vp8_sad4x4_c(
|
||||
; unsigned char *src_ptr,
|
||||
; int src_stride,
|
||||
; unsigned char *ref_ptr,
|
||||
; int ref_stride)
|
||||
|
||||
;-----------------------------------------------------------------------
; unsigned int vp8_sad4x4_neon(unsigned char *src_ptr, int src_stride,
;                              unsigned char *ref_ptr, int ref_stride);
; In:   r0 = src_ptr, r1 = src_stride, r2 = ref_ptr, r3 = ref_stride
; Out:  r0 = sum of absolute differences over the 4x4 block.
; Each vld1.8 {dN} reads 8 bytes, but the final reduction uses only d24
; (the low half of q12), i.e. only the first 4 columns contribute.
; Clobbers: d0-d14 (even d regs for src, d8-d14 for ref), q12.
;-----------------------------------------------------------------------
|vp8_sad4x4_neon| PROC
    vld1.8 {d0}, [r0], r1
    vld1.8 {d8}, [r2], r3

    vld1.8 {d2}, [r0], r1
    vld1.8 {d10}, [r2], r3

    vabdl.u8 q12, d0, d8            ; initialize accumulator with row 0

    vld1.8 {d4}, [r0], r1
    vld1.8 {d12}, [r2], r3

    vabal.u8 q12, d2, d10

    vld1.8 {d6}, [r0], r1
    vld1.8 {d14}, [r2], r3

    vabal.u8 q12, d4, d12
    vabal.u8 q12, d6, d14

    vpaddl.u16 d1, d24              ; reduce only the 4 low lanes
    vpaddl.u32 d0, d1
    vmov.32 r0, d0[0]               ; scalar SAD result

    bx lr

    ENDP
|
||||
|
||||
END
|
|
@ -1,221 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_short_fdct4x4_neon|
|
||||
EXPORT |vp8_short_fdct8x4_neon|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=4
|
||||
|
||||
|
||||
    ALIGN 16 ; enable use of @128 bit aligned loads
; Shared constants for the forward DCT: the two 16-bit multipliers
; (5352, 2217) followed by the four 32-bit rounding constants used by
; part one (14500, 7500) and part two (12000, 51000).
coeff
    DCW 5352, 5352, 5352, 5352
    DCW 2217, 2217, 2217, 2217
    DCD 14500, 14500, 14500, 14500
    DCD 7500, 7500, 7500, 7500
    DCD 12000, 12000, 12000, 12000
    DCD 51000, 51000, 51000, 51000

;-----------------------------------------------------------------------
;void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
; In:   r0 = input (4x4 s16 block, rows 'pitch' bytes apart),
;       r1 = output (16 s16 coefficients), r2 = pitch
; 4x4 forward DCT: part one transforms rows, part two transforms the
; resulting columns (after an in-register transpose each time).
; Clobbers: r12, q0-q3, q8-q13.
;-----------------------------------------------------------------------
|vp8_short_fdct4x4_neon| PROC

    ; Part one
    vld1.16 {d0}, [r0@64], r2
    adr r12, coeff
    vld1.16 {d1}, [r0@64], r2
    vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217
    vld1.16 {d2}, [r0@64], r2
    vld1.32 {q9, q10}, [r12@128]! ; q9=14500, q10=7500
    vld1.16 {d3}, [r0@64], r2

    ; transpose d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
    vtrn.32 d0, d2
    vtrn.32 d1, d3
    vld1.32 {q11,q12}, [r12@128] ; q11=12000, q12=51000
    vtrn.16 d0, d1
    vtrn.16 d2, d3

    vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[3]
    vadd.s16 d5, d1, d2 ; b1 = ip[1] + ip[2]
    vsub.s16 d6, d1, d2 ; c1 = ip[1] - ip[2]
    vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[3]

    vshl.s16 q2, q2, #3 ; (a1, b1) << 3
    vshl.s16 q3, q3, #3 ; (c1, d1) << 3

    vadd.s16 d0, d4, d5 ; op[0] = a1 + b1
    vsub.s16 d2, d4, d5 ; op[2] = a1 - b1

    vmlal.s16 q9, d7, d16 ; d1*5352 + 14500
    vmlal.s16 q10, d7, d17 ; d1*2217 + 7500
    vmlal.s16 q9, d6, d17 ; c1*2217 + d1*5352 + 14500
    vmlsl.s16 q10, d6, d16 ; d1*2217 - c1*5352 + 7500

    vshrn.s32 d1, q9, #12 ; op[1] = (c1*2217 + d1*5352 + 14500)>>12
    vshrn.s32 d3, q10, #12 ; op[3] = (d1*2217 - c1*5352 + 7500)>>12

    ; Part two

    ; transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
    vtrn.32 d0, d2
    vtrn.32 d1, d3
    vtrn.16 d0, d1
    vtrn.16 d2, d3

    vmov.s16 d26, #7                ; rounding bias for the >>4 below

    vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[12]
    vadd.s16 d5, d1, d2 ; b1 = ip[4] + ip[8]
    vsub.s16 d6, d1, d2 ; c1 = ip[4] - ip[8]
    vadd.s16 d4, d4, d26 ; a1 + 7
    vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[12]

    vadd.s16 d0, d4, d5 ; op[0] = a1 + b1 + 7
    vsub.s16 d2, d4, d5 ; op[8] = a1 - b1 + 7

    vmlal.s16 q11, d7, d16 ; d1*5352 + 12000
    vmlal.s16 q12, d7, d17 ; d1*2217 + 51000

    vceq.s16 d4, d7, #0             ; mask: lanes where d1 == 0

    vshr.s16 d0, d0, #4
    vshr.s16 d2, d2, #4

    vmlal.s16 q11, d6, d17 ; c1*2217 + d1*5352 + 12000
    vmlsl.s16 q12, d6, d16 ; d1*2217 - c1*5352 + 51000

    vmvn.s16 d4, d4                 ; invert: lanes where d1 != 0
    vshrn.s32 d1, q11, #16 ; op[4] = (c1*2217 + d1*5352 + 12000)>>16
    vsub.s16 d1, d1, d4 ; op[4] += (d1!=0)
    vshrn.s32 d3, q12, #16 ; op[12]= (d1*2217 - c1*5352 + 51000)>>16

    vst1.16 {q0, q1}, [r1@128]

    bx lr

    ENDP
|
||||
|
||||
;void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
|
||||
;-----------------------------------------------------------------------
;void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
; In:   r0 = input (8x4 s16 region, rows 'pitch' bytes apart),
;       r1 = output (two 4x4 coefficient blocks A then B), r2 = pitch
; Same transform as vp8_short_fdct4x4_neon, but processes two 4x4
; blocks (A = left half, B = right half of each q register) at once.
; Uses the 'coeff' constant table defined above.
; Clobbers: r12, q0-q3, q8-q15.
;-----------------------------------------------------------------------
|vp8_short_fdct8x4_neon| PROC

    ; Part one

    vld1.16 {q0}, [r0@128], r2
    adr r12, coeff
    vld1.16 {q1}, [r0@128], r2
    vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217
    vld1.16 {q2}, [r0@128], r2
    vld1.32 {q9, q10}, [r12@128]! ; q9=14500, q10=7500
    vld1.16 {q3}, [r0@128], r2

    ; transpose q0=ip[0], q1=ip[1], q2=ip[2], q3=ip[3]
    vtrn.32 q0, q2 ; [A0|B0]
    vtrn.32 q1, q3 ; [A1|B1]
    vtrn.16 q0, q1 ; [A2|B2]
    vtrn.16 q2, q3 ; [A3|B3]

    vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[3]
    vadd.s16 q12, q1, q2 ; b1 = ip[1] + ip[2]
    vsub.s16 q13, q1, q2 ; c1 = ip[1] - ip[2]
    vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[3]

    vshl.s16 q11, q11, #3 ; a1 << 3
    vshl.s16 q12, q12, #3 ; b1 << 3
    vshl.s16 q13, q13, #3 ; c1 << 3
    vshl.s16 q14, q14, #3 ; d1 << 3

    vadd.s16 q0, q11, q12 ; [A0 | B0] = a1 + b1
    vsub.s16 q2, q11, q12 ; [A2 | B2] = a1 - b1

    vmov.s16 q11, q9 ; 14500         ; second copies of the rounding
    vmov.s16 q12, q10 ; 7500         ; constants for the B block

    vmlal.s16 q9, d28, d16 ; A[1] = d1*5352 + 14500
    vmlal.s16 q10, d28, d17 ; A[3] = d1*2217 + 7500
    vmlal.s16 q11, d29, d16 ; B[1] = d1*5352 + 14500
    vmlal.s16 q12, d29, d17 ; B[3] = d1*2217 + 7500

    vmlal.s16 q9, d26, d17 ; A[1] = c1*2217 + d1*5352 + 14500
    vmlsl.s16 q10, d26, d16 ; A[3] = d1*2217 - c1*5352 + 7500
    vmlal.s16 q11, d27, d17 ; B[1] = c1*2217 + d1*5352 + 14500
    vmlsl.s16 q12, d27, d16 ; B[3] = d1*2217 - c1*5352 + 7500

    vshrn.s32 d2, q9, #12 ; A[1] = (c1*2217 + d1*5352 + 14500)>>12
    vshrn.s32 d6, q10, #12 ; A[3] = (d1*2217 - c1*5352 + 7500)>>12
    vshrn.s32 d3, q11, #12 ; B[1] = (c1*2217 + d1*5352 + 14500)>>12
    vshrn.s32 d7, q12, #12 ; B[3] = (d1*2217 - c1*5352 + 7500)>>12

    ; Part two
    vld1.32 {q9,q10}, [r12@128] ; q9=12000, q10=51000

    ; transpose q0=ip[0], q1=ip[4], q2=ip[8], q3=ip[12]
    vtrn.32 q0, q2 ; q0=[A0 | B0]
    vtrn.32 q1, q3 ; q1=[A4 | B4]
    vtrn.16 q0, q1 ; q2=[A8 | B8]
    vtrn.16 q2, q3 ; q3=[A12|B12]

    vmov.s16 q15, #7                ; rounding bias for the >>4 below

    vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[12]
    vadd.s16 q12, q1, q2 ; b1 = ip[4] + ip[8]
    vadd.s16 q11, q11, q15 ; a1 + 7
    vsub.s16 q13, q1, q2 ; c1 = ip[4] - ip[8]
    vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[12]

    vadd.s16 q0, q11, q12 ; a1 + b1 + 7
    vsub.s16 q1, q11, q12 ; a1 - b1 + 7

    vmov.s16 q11, q9 ; 12000         ; second copies of the rounding
    vmov.s16 q12, q10 ; 51000        ; constants for the B block

    vshr.s16 d0, d0, #4 ; A[0] = (a1 + b1 + 7)>>4
    vshr.s16 d4, d1, #4 ; B[0] = (a1 + b1 + 7)>>4
    vshr.s16 d2, d2, #4 ; A[8] = (a1 + b1 + 7)>>4
    vshr.s16 d6, d3, #4 ; B[8] = (a1 + b1 + 7)>>4

    vmlal.s16 q9, d28, d16 ; A[4] = d1*5352 + 12000
    vmlal.s16 q10, d28, d17 ; A[12] = d1*2217 + 51000
    vmlal.s16 q11, d29, d16 ; B[4] = d1*5352 + 12000
    vmlal.s16 q12, d29, d17 ; B[12] = d1*2217 + 51000

    vceq.s16 q14, q14, #0           ; mask: lanes where d1 == 0

    vmlal.s16 q9, d26, d17 ; A[4] = c1*2217 + d1*5352 + 12000
    vmlsl.s16 q10, d26, d16 ; A[12] = d1*2217 - c1*5352 + 51000
    vmlal.s16 q11, d27, d17 ; B[4] = c1*2217 + d1*5352 + 12000
    vmlsl.s16 q12, d27, d16 ; B[12] = d1*2217 - c1*5352 + 51000

    vmvn.s16 q14, q14               ; invert: lanes where d1 != 0

    vshrn.s32 d1, q9, #16 ; A[4] = (c1*2217 + d1*5352 + 12000)>>16
    vshrn.s32 d3, q10, #16 ; A[12]= (d1*2217 - c1*5352 + 51000)>>16
    vsub.s16 d1, d1, d28 ; A[4] += (d1!=0)

    vshrn.s32 d5, q11, #16 ; B[4] = (c1*2217 + d1*5352 + 12000)>>16
    vshrn.s32 d7, q12, #16 ; B[12]= (d1*2217 - c1*5352 + 51000)>>16
    vsub.s16 d5, d5, d29 ; B[4] += (d1!=0)

    vst1.16 {q0, q1}, [r1@128]! ; block A
    vst1.16 {q2, q3}, [r1@128]! ; block B

    bx lr

    ENDP
|
||||
|
||||
END
|
||||
|
|
@ -1,103 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_short_walsh4x4_neon|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
;void vp8_short_walsh4x4_neon(short *input, short *output, int pitch)
|
||||
; r0 short *input,
|
||||
; r1 short *output,
|
||||
; r2 int pitch
|
||||
;-----------------------------------------------------------------------
;void vp8_short_walsh4x4_neon(short *input, short *output, int pitch)
; In:   r0 = input (4x4 s16 block, rows 'pitch' bytes apart),
;       r1 = output (16 s16 coefficients), r2 = pitch
; 4x4 Walsh-Hadamard transform: a row pass in s16, then a column pass
; widened to s32 with conditional +1 / +3 rounding before >>3.
; Clobbers: q0-q3, q8-q11, q15.
;-----------------------------------------------------------------------
|vp8_short_walsh4x4_neon| PROC

    vld1.16 {d0}, [r0@64], r2 ; load input
    vld1.16 {d1}, [r0@64], r2
    vld1.16 {d2}, [r0@64], r2
    vld1.16 {d3}, [r0@64]

    ;First for-loop
    ;transpose d0, d1, d2, d3. Then, d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
    vtrn.32 d0, d2
    vtrn.32 d1, d3

    vmov.s32 q15, #3 ; add 3 to all values

    vtrn.16 d0, d1
    vtrn.16 d2, d3

    vadd.s16 d4, d0, d2 ; ip[0] + ip[2]
    vadd.s16 d5, d1, d3 ; ip[1] + ip[3]
    vsub.s16 d6, d1, d3 ; ip[1] - ip[3]
    vsub.s16 d7, d0, d2 ; ip[0] - ip[2]

    vshl.s16 d4, d4, #2 ; a1 = (ip[0] + ip[2]) << 2
    vshl.s16 d5, d5, #2 ; d1 = (ip[1] + ip[3]) << 2
    vshl.s16 d6, d6, #2 ; c1 = (ip[1] - ip[3]) << 2
    vceq.s16 d16, d4, #0 ; a1 == 0
    vshl.s16 d7, d7, #2 ; b1 = (ip[0] - ip[2]) << 2

    vadd.s16 d0, d4, d5 ; a1 + d1
    vmvn d16, d16 ; a1 != 0
    vsub.s16 d3, d4, d5 ; op[3] = a1 - d1
    vadd.s16 d1, d7, d6 ; op[1] = b1 + c1
    vsub.s16 d2, d7, d6 ; op[2] = b1 - c1
    vsub.s16 d0, d0, d16 ; op[0] = a1 + d1 + (a1 != 0)

    ;Second for-loop
    ;transpose d0, d1, d2, d3, Then, d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
    vtrn.32 d1, d3
    vtrn.32 d0, d2
    vtrn.16 d2, d3
    vtrn.16 d0, d1

    vaddl.s16 q8, d0, d2 ; a1 = ip[0]+ip[8]
    vaddl.s16 q9, d1, d3 ; d1 = ip[4]+ip[12]
    vsubl.s16 q10, d1, d3 ; c1 = ip[4]-ip[12]
    vsubl.s16 q11, d0, d2 ; b1 = ip[0]-ip[8]

    vadd.s32 q0, q8, q9 ; a2 = a1 + d1
    vadd.s32 q1, q11, q10 ; b2 = b1 + c1
    vsub.s32 q2, q11, q10 ; c2 = b1 - c1
    vsub.s32 q3, q8, q9 ; d2 = a1 - d1

    vclt.s32 q8, q0, #0             ; -1 in lanes that are negative
    vclt.s32 q9, q1, #0
    vclt.s32 q10, q2, #0
    vclt.s32 q11, q3, #0

    ; subtract -1 (or 0)
    vsub.s32 q0, q0, q8 ; a2 += a2 < 0
    vsub.s32 q1, q1, q9 ; b2 += b2 < 0
    vsub.s32 q2, q2, q10 ; c2 += c2 < 0
    vsub.s32 q3, q3, q11 ; d2 += d2 < 0

    vadd.s32 q8, q0, q15 ; a2 + 3
    vadd.s32 q9, q1, q15 ; b2 + 3
    vadd.s32 q10, q2, q15 ; c2 + 3
    vadd.s32 q11, q3, q15 ; d2 + 3

    ; vrshrn? would add 1 << 3-1 = 2
    vshrn.s32 d0, q8, #3            ; narrow back to s16 after >>3
    vshrn.s32 d1, q9, #3
    vshrn.s32 d2, q10, #3
    vshrn.s32 d3, q11, #3

    vst1.16 {q0, q1}, [r1@128]

    bx lr

    ENDP
|
||||
|
||||
END
|
|
@ -1,425 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp9_sub_pixel_variance16x16_neon_func|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
; r0 unsigned char *src_ptr,
|
||||
; r1 int src_pixels_per_line,
|
||||
; r2 int xoffset,
|
||||
; r3 int yoffset,
|
||||
; stack(r4) unsigned char *dst_ptr,
|
||||
; stack(r5) int dst_pixels_per_line,
|
||||
; stack(r6) unsigned int *sse
|
||||
;note: most of the code is copied from bilinear_predict16x16_neon and vp9_variance16x16_neon.
|
||||
|
||||
|vp9_sub_pixel_variance16x16_neon_func| PROC
|
||||
push {r4-r6, lr}
|
||||
|
||||
ldr r12, _BilinearTaps_coeff_
|
||||
ldr r4, [sp, #16] ;load *dst_ptr from stack
|
||||
ldr r5, [sp, #20] ;load dst_pixels_per_line from stack
|
||||
ldr r6, [sp, #24] ;load *sse from stack
|
||||
|
||||
cmp r2, #0 ;skip first_pass filter if xoffset=0
|
||||
beq secondpass_bfilter16x16_only
|
||||
|
||||
add r2, r12, r2, lsl #3 ;calculate filter location
|
||||
|
||||
cmp r3, #0 ;skip second_pass filter if yoffset=0
|
||||
|
||||
vld1.s32 {d31}, [r2] ;load first_pass filter
|
||||
|
||||
beq firstpass_bfilter16x16_only
|
||||
|
||||
sub sp, sp, #272 ;reserve space on stack for temporary storage
|
||||
vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
|
||||
mov lr, sp
|
||||
vld1.u8 {d5, d6, d7}, [r0], r1
|
||||
|
||||
mov r2, #3 ;loop counter
|
||||
vld1.u8 {d8, d9, d10}, [r0], r1
|
||||
|
||||
vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
|
||||
vld1.u8 {d11, d12, d13}, [r0], r1
|
||||
|
||||
vdup.8 d1, d31[4]
|
||||
|
||||
;First Pass: output_height lines x output_width columns (17x16)
|
||||
vp8e_filt_blk2d_fp16x16_loop_neon
|
||||
pld [r0]
|
||||
pld [r0, r1]
|
||||
pld [r0, r1, lsl #1]
|
||||
|
||||
vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0])
|
||||
vmull.u8 q8, d3, d0
|
||||
vmull.u8 q9, d5, d0
|
||||
vmull.u8 q10, d6, d0
|
||||
vmull.u8 q11, d8, d0
|
||||
vmull.u8 q12, d9, d0
|
||||
vmull.u8 q13, d11, d0
|
||||
vmull.u8 q14, d12, d0
|
||||
|
||||
vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
|
||||
vext.8 d5, d5, d6, #1
|
||||
vext.8 d8, d8, d9, #1
|
||||
vext.8 d11, d11, d12, #1
|
||||
|
||||
vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1])
|
||||
vmlal.u8 q9, d5, d1
|
||||
vmlal.u8 q11, d8, d1
|
||||
vmlal.u8 q13, d11, d1
|
||||
|
||||
vext.8 d3, d3, d4, #1
|
||||
vext.8 d6, d6, d7, #1
|
||||
vext.8 d9, d9, d10, #1
|
||||
vext.8 d12, d12, d13, #1
|
||||
|
||||
vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1])
|
||||
vmlal.u8 q10, d6, d1
|
||||
vmlal.u8 q12, d9, d1
|
||||
vmlal.u8 q14, d12, d1
|
||||
|
||||
subs r2, r2, #1
|
||||
|
||||
vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrn.u16 d15, q8, #7
|
||||
vqrshrn.u16 d16, q9, #7
|
||||
vqrshrn.u16 d17, q10, #7
|
||||
vqrshrn.u16 d18, q11, #7
|
||||
vqrshrn.u16 d19, q12, #7
|
||||
vqrshrn.u16 d20, q13, #7
|
||||
|
||||
vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
|
||||
vqrshrn.u16 d21, q14, #7
|
||||
vld1.u8 {d5, d6, d7}, [r0], r1
|
||||
|
||||
vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result
|
||||
vld1.u8 {d8, d9, d10}, [r0], r1
|
||||
vst1.u8 {d18, d19, d20, d21}, [lr]!
|
||||
vld1.u8 {d11, d12, d13}, [r0], r1
|
||||
|
||||
bne vp8e_filt_blk2d_fp16x16_loop_neon
|
||||
|
||||
;First-pass filtering for rest 5 lines
|
||||
vld1.u8 {d14, d15, d16}, [r0], r1
|
||||
|
||||
vmull.u8 q9, d2, d0 ;(src_ptr[0] * Filter[0])
|
||||
vmull.u8 q10, d3, d0
|
||||
vmull.u8 q11, d5, d0
|
||||
vmull.u8 q12, d6, d0
|
||||
vmull.u8 q13, d8, d0
|
||||
vmull.u8 q14, d9, d0
|
||||
|
||||
vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
|
||||
vext.8 d5, d5, d6, #1
|
||||
vext.8 d8, d8, d9, #1
|
||||
|
||||
vmlal.u8 q9, d2, d1 ;(src_ptr[0] * Filter[1])
|
||||
vmlal.u8 q11, d5, d1
|
||||
vmlal.u8 q13, d8, d1
|
||||
|
||||
vext.8 d3, d3, d4, #1
|
||||
vext.8 d6, d6, d7, #1
|
||||
vext.8 d9, d9, d10, #1
|
||||
|
||||
vmlal.u8 q10, d3, d1 ;(src_ptr[0] * Filter[1])
|
||||
vmlal.u8 q12, d6, d1
|
||||
vmlal.u8 q14, d9, d1
|
||||
|
||||
vmull.u8 q1, d11, d0
|
||||
vmull.u8 q2, d12, d0
|
||||
vmull.u8 q3, d14, d0
|
||||
vmull.u8 q4, d15, d0
|
||||
|
||||
vext.8 d11, d11, d12, #1 ;construct src_ptr[1]
|
||||
vext.8 d14, d14, d15, #1
|
||||
|
||||
vmlal.u8 q1, d11, d1 ;(src_ptr[0] * Filter[1])
|
||||
vmlal.u8 q3, d14, d1
|
||||
|
||||
vext.8 d12, d12, d13, #1
|
||||
vext.8 d15, d15, d16, #1
|
||||
|
||||
vmlal.u8 q2, d12, d1 ;(src_ptr[0] * Filter[1])
|
||||
vmlal.u8 q4, d15, d1
|
||||
|
||||
vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8
|
||||
vqrshrn.u16 d11, q10, #7
|
||||
vqrshrn.u16 d12, q11, #7
|
||||
vqrshrn.u16 d13, q12, #7
|
||||
vqrshrn.u16 d14, q13, #7
|
||||
vqrshrn.u16 d15, q14, #7
|
||||
vqrshrn.u16 d16, q1, #7
|
||||
vqrshrn.u16 d17, q2, #7
|
||||
vqrshrn.u16 d18, q3, #7
|
||||
vqrshrn.u16 d19, q4, #7
|
||||
|
||||
vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result
|
||||
vst1.u8 {d14, d15, d16, d17}, [lr]!
|
||||
vst1.u8 {d18, d19}, [lr]!
|
||||
|
||||
;Second pass: 16x16
|
||||
;secondpass_filter
|
||||
add r3, r12, r3, lsl #3
|
||||
sub lr, lr, #272
|
||||
|
||||
vld1.u32 {d31}, [r3] ;load second_pass filter
|
||||
|
||||
sub sp, sp, #256
|
||||
mov r3, sp
|
||||
|
||||
vld1.u8 {d22, d23}, [lr]! ;load src data
|
||||
|
||||
vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
|
||||
vdup.8 d1, d31[4]
|
||||
mov r12, #4 ;loop counter
|
||||
|
||||
vp8e_filt_blk2d_sp16x16_loop_neon
|
||||
vld1.u8 {d24, d25}, [lr]!
|
||||
vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
|
||||
vld1.u8 {d26, d27}, [lr]!
|
||||
vmull.u8 q2, d23, d0
|
||||
vld1.u8 {d28, d29}, [lr]!
|
||||
vmull.u8 q3, d24, d0
|
||||
vld1.u8 {d30, d31}, [lr]!
|
||||
|
||||
vmull.u8 q4, d25, d0
|
||||
vmull.u8 q5, d26, d0
|
||||
vmull.u8 q6, d27, d0
|
||||
vmull.u8 q7, d28, d0
|
||||
vmull.u8 q8, d29, d0
|
||||
|
||||
vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1])
|
||||
vmlal.u8 q2, d25, d1
|
||||
vmlal.u8 q3, d26, d1
|
||||
vmlal.u8 q4, d27, d1
|
||||
vmlal.u8 q5, d28, d1
|
||||
vmlal.u8 q6, d29, d1
|
||||
vmlal.u8 q7, d30, d1
|
||||
vmlal.u8 q8, d31, d1
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
|
||||
vqrshrn.u16 d3, q2, #7
|
||||
vqrshrn.u16 d4, q3, #7
|
||||
vqrshrn.u16 d5, q4, #7
|
||||
vqrshrn.u16 d6, q5, #7
|
||||
vqrshrn.u16 d7, q6, #7
|
||||
vqrshrn.u16 d8, q7, #7
|
||||
vqrshrn.u16 d9, q8, #7
|
||||
|
||||
vst1.u8 {d2, d3}, [r3]! ;store result
|
||||
vst1.u8 {d4, d5}, [r3]!
|
||||
vst1.u8 {d6, d7}, [r3]!
|
||||
vmov q11, q15
|
||||
vst1.u8 {d8, d9}, [r3]!
|
||||
|
||||
bne vp8e_filt_blk2d_sp16x16_loop_neon
|
||||
|
||||
b sub_pixel_variance16x16_neon
|
||||
|
||||
;--------------------
|
||||
firstpass_bfilter16x16_only
|
||||
mov r2, #4 ;loop counter
|
||||
sub sp, sp, #528 ;reserve space on stack for temporary storage
|
||||
vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
|
||||
vdup.8 d1, d31[4]
|
||||
mov r3, sp
|
||||
|
||||
;First Pass: output_height lines x output_width columns (16x16)
|
||||
vp8e_filt_blk2d_fpo16x16_loop_neon
|
||||
vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
|
||||
vld1.u8 {d5, d6, d7}, [r0], r1
|
||||
vld1.u8 {d8, d9, d10}, [r0], r1
|
||||
vld1.u8 {d11, d12, d13}, [r0], r1
|
||||
|
||||
pld [r0]
|
||||
pld [r0, r1]
|
||||
pld [r0, r1, lsl #1]
|
||||
|
||||
vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0])
|
||||
vmull.u8 q8, d3, d0
|
||||
vmull.u8 q9, d5, d0
|
||||
vmull.u8 q10, d6, d0
|
||||
vmull.u8 q11, d8, d0
|
||||
vmull.u8 q12, d9, d0
|
||||
vmull.u8 q13, d11, d0
|
||||
vmull.u8 q14, d12, d0
|
||||
|
||||
vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
|
||||
vext.8 d5, d5, d6, #1
|
||||
vext.8 d8, d8, d9, #1
|
||||
vext.8 d11, d11, d12, #1
|
||||
|
||||
vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1])
|
||||
vmlal.u8 q9, d5, d1
|
||||
vmlal.u8 q11, d8, d1
|
||||
vmlal.u8 q13, d11, d1
|
||||
|
||||
vext.8 d3, d3, d4, #1
|
||||
vext.8 d6, d6, d7, #1
|
||||
vext.8 d9, d9, d10, #1
|
||||
vext.8 d12, d12, d13, #1
|
||||
|
||||
vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1])
|
||||
vmlal.u8 q10, d6, d1
|
||||
vmlal.u8 q12, d9, d1
|
||||
vmlal.u8 q14, d12, d1
|
||||
|
||||
subs r2, r2, #1
|
||||
|
||||
vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrn.u16 d15, q8, #7
|
||||
vqrshrn.u16 d16, q9, #7
|
||||
vqrshrn.u16 d17, q10, #7
|
||||
vqrshrn.u16 d18, q11, #7
|
||||
vqrshrn.u16 d19, q12, #7
|
||||
vqrshrn.u16 d20, q13, #7
|
||||
vst1.u8 {d14, d15}, [r3]! ;store result
|
||||
vqrshrn.u16 d21, q14, #7
|
||||
|
||||
vst1.u8 {d16, d17}, [r3]!
|
||||
vst1.u8 {d18, d19}, [r3]!
|
||||
vst1.u8 {d20, d21}, [r3]!
|
||||
|
||||
bne vp8e_filt_blk2d_fpo16x16_loop_neon
|
||||
|
||||
b sub_pixel_variance16x16_neon
|
||||
|
||||
;---------------------
|
||||
secondpass_bfilter16x16_only
|
||||
;Second pass: 16x16
|
||||
;secondpass_filter
|
||||
sub sp, sp, #528 ;reserve space on stack for temporary storage
|
||||
add r3, r12, r3, lsl #3
|
||||
mov r12, #4 ;loop counter
|
||||
vld1.u32 {d31}, [r3] ;load second_pass filter
|
||||
vld1.u8 {d22, d23}, [r0], r1 ;load src data
|
||||
mov r3, sp
|
||||
|
||||
vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
|
||||
vdup.8 d1, d31[4]
|
||||
|
||||
vp8e_filt_blk2d_spo16x16_loop_neon
|
||||
vld1.u8 {d24, d25}, [r0], r1
|
||||
vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
|
||||
vld1.u8 {d26, d27}, [r0], r1
|
||||
vmull.u8 q2, d23, d0
|
||||
vld1.u8 {d28, d29}, [r0], r1
|
||||
vmull.u8 q3, d24, d0
|
||||
vld1.u8 {d30, d31}, [r0], r1
|
||||
|
||||
vmull.u8 q4, d25, d0
|
||||
vmull.u8 q5, d26, d0
|
||||
vmull.u8 q6, d27, d0
|
||||
vmull.u8 q7, d28, d0
|
||||
vmull.u8 q8, d29, d0
|
||||
|
||||
vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1])
|
||||
vmlal.u8 q2, d25, d1
|
||||
vmlal.u8 q3, d26, d1
|
||||
vmlal.u8 q4, d27, d1
|
||||
vmlal.u8 q5, d28, d1
|
||||
vmlal.u8 q6, d29, d1
|
||||
vmlal.u8 q7, d30, d1
|
||||
vmlal.u8 q8, d31, d1
|
||||
|
||||
vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
|
||||
vqrshrn.u16 d3, q2, #7
|
||||
vqrshrn.u16 d4, q3, #7
|
||||
vqrshrn.u16 d5, q4, #7
|
||||
vqrshrn.u16 d6, q5, #7
|
||||
vqrshrn.u16 d7, q6, #7
|
||||
vqrshrn.u16 d8, q7, #7
|
||||
vqrshrn.u16 d9, q8, #7
|
||||
|
||||
vst1.u8 {d2, d3}, [r3]! ;store result
|
||||
subs r12, r12, #1
|
||||
vst1.u8 {d4, d5}, [r3]!
|
||||
vmov q11, q15
|
||||
vst1.u8 {d6, d7}, [r3]!
|
||||
vst1.u8 {d8, d9}, [r3]!
|
||||
|
||||
bne vp8e_filt_blk2d_spo16x16_loop_neon
|
||||
|
||||
b sub_pixel_variance16x16_neon
|
||||
|
||||
;----------------------------
|
||||
;variance16x16
|
||||
sub_pixel_variance16x16_neon
|
||||
vmov.i8 q8, #0 ;q8 - sum
|
||||
vmov.i8 q9, #0 ;q9, q10 - sse
|
||||
vmov.i8 q10, #0
|
||||
|
||||
sub r3, r3, #256
|
||||
mov r12, #8
|
||||
|
||||
sub_pixel_variance16x16_neon_loop
|
||||
vld1.8 {q0}, [r3]! ;Load up source and reference
|
||||
vld1.8 {q2}, [r4], r5
|
||||
vld1.8 {q1}, [r3]!
|
||||
vld1.8 {q3}, [r4], r5
|
||||
|
||||
vsubl.u8 q11, d0, d4 ;diff
|
||||
vsubl.u8 q12, d1, d5
|
||||
vsubl.u8 q13, d2, d6
|
||||
vsubl.u8 q14, d3, d7
|
||||
|
||||
vpadal.s16 q8, q11 ;sum
|
||||
vmlal.s16 q9, d22, d22 ;sse
|
||||
vmlal.s16 q10, d23, d23
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
vpadal.s16 q8, q12
|
||||
vmlal.s16 q9, d24, d24
|
||||
vmlal.s16 q10, d25, d25
|
||||
vpadal.s16 q8, q13
|
||||
vmlal.s16 q9, d26, d26
|
||||
vmlal.s16 q10, d27, d27
|
||||
vpadal.s16 q8, q14
|
||||
vmlal.s16 q9, d28, d28
|
||||
vmlal.s16 q10, d29, d29
|
||||
|
||||
bne sub_pixel_variance16x16_neon_loop
|
||||
|
||||
vadd.u32 q10, q9, q10 ;accumulate sse
|
||||
vpaddl.s32 q0, q8 ;accumulate sum
|
||||
|
||||
vpaddl.u32 q1, q10
|
||||
vadd.s64 d0, d0, d1
|
||||
vadd.u64 d1, d2, d3
|
||||
|
||||
vmull.s32 q5, d0, d0
|
||||
vst1.32 {d1[0]}, [r6] ;store sse
|
||||
vshr.s32 d10, d10, #8
|
||||
vsub.s32 d0, d1, d10
|
||||
|
||||
add sp, sp, #528
|
||||
vmov.32 r0, d0[0] ;return
|
||||
|
||||
pop {r4-r6,pc}
|
||||
|
||||
ENDP
|
||||
|
||||
;-----------------
|
||||
|
||||
_BilinearTaps_coeff_
|
||||
DCD bilinear_taps_coeff
|
||||
bilinear_taps_coeff
|
||||
DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
|
||||
|
||||
END
|
|
@ -1,572 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp9_variance_halfpixvar16x16_h_neon|
|
||||
EXPORT |vp9_variance_halfpixvar16x16_v_neon|
|
||||
EXPORT |vp9_variance_halfpixvar16x16_hv_neon|
|
||||
EXPORT |vp9_sub_pixel_variance16x16s_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
;================================================
|
||||
;unsigned int vp9_variance_halfpixvar16x16_h_neon
|
||||
;(
|
||||
; unsigned char *src_ptr, r0
|
||||
; int src_pixels_per_line, r1
|
||||
; unsigned char *dst_ptr, r2
|
||||
; int dst_pixels_per_line, r3
|
||||
; unsigned int *sse
|
||||
;);
|
||||
;================================================
|
||||
|vp9_variance_halfpixvar16x16_h_neon| PROC
|
||||
push {lr}
|
||||
|
||||
mov r12, #4 ;loop counter
|
||||
ldr lr, [sp, #4] ;load *sse from stack
|
||||
vmov.i8 q8, #0 ;q8 - sum
|
||||
vmov.i8 q9, #0 ;q9, q10 - sse
|
||||
vmov.i8 q10, #0
|
||||
|
||||
;First Pass: output_height lines x output_width columns (16x16)
|
||||
vp8_filt_fpo16x16s_4_0_loop_neon
|
||||
vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
|
||||
vld1.8 {q11}, [r2], r3
|
||||
vld1.u8 {d4, d5, d6, d7}, [r0], r1
|
||||
vld1.8 {q12}, [r2], r3
|
||||
vld1.u8 {d8, d9, d10, d11}, [r0], r1
|
||||
vld1.8 {q13}, [r2], r3
|
||||
vld1.u8 {d12, d13, d14, d15}, [r0], r1
|
||||
|
||||
;pld [r0]
|
||||
;pld [r0, r1]
|
||||
;pld [r0, r1, lsl #1]
|
||||
|
||||
vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
|
||||
vext.8 q3, q2, q3, #1
|
||||
vext.8 q5, q4, q5, #1
|
||||
vext.8 q7, q6, q7, #1
|
||||
|
||||
vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
|
||||
vld1.8 {q14}, [r2], r3
|
||||
vrhadd.u8 q1, q2, q3
|
||||
vrhadd.u8 q2, q4, q5
|
||||
vrhadd.u8 q3, q6, q7
|
||||
|
||||
vsubl.u8 q4, d0, d22 ;diff
|
||||
vsubl.u8 q5, d1, d23
|
||||
vsubl.u8 q6, d2, d24
|
||||
vsubl.u8 q7, d3, d25
|
||||
vsubl.u8 q0, d4, d26
|
||||
vsubl.u8 q1, d5, d27
|
||||
vsubl.u8 q2, d6, d28
|
||||
vsubl.u8 q3, d7, d29
|
||||
|
||||
vpadal.s16 q8, q4 ;sum
|
||||
vmlal.s16 q9, d8, d8 ;sse
|
||||
vmlal.s16 q10, d9, d9
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
vpadal.s16 q8, q5
|
||||
vmlal.s16 q9, d10, d10
|
||||
vmlal.s16 q10, d11, d11
|
||||
vpadal.s16 q8, q6
|
||||
vmlal.s16 q9, d12, d12
|
||||
vmlal.s16 q10, d13, d13
|
||||
vpadal.s16 q8, q7
|
||||
vmlal.s16 q9, d14, d14
|
||||
vmlal.s16 q10, d15, d15
|
||||
|
||||
vpadal.s16 q8, q0 ;sum
|
||||
vmlal.s16 q9, d0, d0 ;sse
|
||||
vmlal.s16 q10, d1, d1
|
||||
vpadal.s16 q8, q1
|
||||
vmlal.s16 q9, d2, d2
|
||||
vmlal.s16 q10, d3, d3
|
||||
vpadal.s16 q8, q2
|
||||
vmlal.s16 q9, d4, d4
|
||||
vmlal.s16 q10, d5, d5
|
||||
vpadal.s16 q8, q3
|
||||
vmlal.s16 q9, d6, d6
|
||||
vmlal.s16 q10, d7, d7
|
||||
|
||||
bne vp8_filt_fpo16x16s_4_0_loop_neon
|
||||
|
||||
vadd.u32 q10, q9, q10 ;accumulate sse
|
||||
vpaddl.s32 q0, q8 ;accumulate sum
|
||||
|
||||
vpaddl.u32 q1, q10
|
||||
vadd.s64 d0, d0, d1
|
||||
vadd.u64 d1, d2, d3
|
||||
|
||||
vmull.s32 q5, d0, d0
|
||||
vst1.32 {d1[0]}, [lr] ;store sse
|
||||
vshr.s32 d10, d10, #8
|
||||
vsub.s32 d0, d1, d10
|
||||
|
||||
vmov.32 r0, d0[0] ;return
|
||||
pop {pc}
|
||||
ENDP
|
||||
|
||||
;================================================
|
||||
;unsigned int vp9_variance_halfpixvar16x16_v_neon
|
||||
;(
|
||||
; unsigned char *src_ptr, r0
|
||||
; int src_pixels_per_line, r1
|
||||
; unsigned char *dst_ptr, r2
|
||||
; int dst_pixels_per_line, r3
|
||||
; unsigned int *sse
|
||||
;);
|
||||
;================================================
|
||||
|vp9_variance_halfpixvar16x16_v_neon| PROC
|
||||
push {lr}
|
||||
|
||||
mov r12, #4 ;loop counter
|
||||
|
||||
vld1.u8 {q0}, [r0], r1 ;load src data
|
||||
ldr lr, [sp, #4] ;load *sse from stack
|
||||
|
||||
vmov.i8 q8, #0 ;q8 - sum
|
||||
vmov.i8 q9, #0 ;q9, q10 - sse
|
||||
vmov.i8 q10, #0
|
||||
|
||||
vp8_filt_spo16x16s_0_4_loop_neon
|
||||
vld1.u8 {q2}, [r0], r1
|
||||
vld1.8 {q1}, [r2], r3
|
||||
vld1.u8 {q4}, [r0], r1
|
||||
vld1.8 {q3}, [r2], r3
|
||||
vld1.u8 {q6}, [r0], r1
|
||||
vld1.8 {q5}, [r2], r3
|
||||
vld1.u8 {q15}, [r0], r1
|
||||
|
||||
vrhadd.u8 q0, q0, q2
|
||||
vld1.8 {q7}, [r2], r3
|
||||
vrhadd.u8 q2, q2, q4
|
||||
vrhadd.u8 q4, q4, q6
|
||||
vrhadd.u8 q6, q6, q15
|
||||
|
||||
vsubl.u8 q11, d0, d2 ;diff
|
||||
vsubl.u8 q12, d1, d3
|
||||
vsubl.u8 q13, d4, d6
|
||||
vsubl.u8 q14, d5, d7
|
||||
vsubl.u8 q0, d8, d10
|
||||
vsubl.u8 q1, d9, d11
|
||||
vsubl.u8 q2, d12, d14
|
||||
vsubl.u8 q3, d13, d15
|
||||
|
||||
vpadal.s16 q8, q11 ;sum
|
||||
vmlal.s16 q9, d22, d22 ;sse
|
||||
vmlal.s16 q10, d23, d23
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
vpadal.s16 q8, q12
|
||||
vmlal.s16 q9, d24, d24
|
||||
vmlal.s16 q10, d25, d25
|
||||
vpadal.s16 q8, q13
|
||||
vmlal.s16 q9, d26, d26
|
||||
vmlal.s16 q10, d27, d27
|
||||
vpadal.s16 q8, q14
|
||||
vmlal.s16 q9, d28, d28
|
||||
vmlal.s16 q10, d29, d29
|
||||
|
||||
vpadal.s16 q8, q0 ;sum
|
||||
vmlal.s16 q9, d0, d0 ;sse
|
||||
vmlal.s16 q10, d1, d1
|
||||
vpadal.s16 q8, q1
|
||||
vmlal.s16 q9, d2, d2
|
||||
vmlal.s16 q10, d3, d3
|
||||
vpadal.s16 q8, q2
|
||||
vmlal.s16 q9, d4, d4
|
||||
vmlal.s16 q10, d5, d5
|
||||
|
||||
vmov q0, q15
|
||||
|
||||
vpadal.s16 q8, q3
|
||||
vmlal.s16 q9, d6, d6
|
||||
vmlal.s16 q10, d7, d7
|
||||
|
||||
bne vp8_filt_spo16x16s_0_4_loop_neon
|
||||
|
||||
vadd.u32 q10, q9, q10 ;accumulate sse
|
||||
vpaddl.s32 q0, q8 ;accumulate sum
|
||||
|
||||
vpaddl.u32 q1, q10
|
||||
vadd.s64 d0, d0, d1
|
||||
vadd.u64 d1, d2, d3
|
||||
|
||||
vmull.s32 q5, d0, d0
|
||||
vst1.32 {d1[0]}, [lr] ;store sse
|
||||
vshr.s32 d10, d10, #8
|
||||
vsub.s32 d0, d1, d10
|
||||
|
||||
vmov.32 r0, d0[0] ;return
|
||||
pop {pc}
|
||||
ENDP
|
||||
|
||||
;================================================
|
||||
;unsigned int vp9_variance_halfpixvar16x16_hv_neon
|
||||
;(
|
||||
; unsigned char *src_ptr, r0
|
||||
; int src_pixels_per_line, r1
|
||||
; unsigned char *dst_ptr, r2
|
||||
; int dst_pixels_per_line, r3
|
||||
; unsigned int *sse
|
||||
;);
|
||||
;================================================
|
||||
|vp9_variance_halfpixvar16x16_hv_neon| PROC
|
||||
push {lr}
|
||||
|
||||
vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
|
||||
|
||||
ldr lr, [sp, #4] ;load *sse from stack
|
||||
vmov.i8 q13, #0 ;q8 - sum
|
||||
vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
|
||||
|
||||
vmov.i8 q14, #0 ;q9, q10 - sse
|
||||
vmov.i8 q15, #0
|
||||
|
||||
mov r12, #4 ;loop counter
|
||||
vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
|
||||
|
||||
;First Pass: output_height lines x output_width columns (17x16)
|
||||
vp8_filt16x16s_4_4_loop_neon
|
||||
vld1.u8 {d4, d5, d6, d7}, [r0], r1
|
||||
vld1.u8 {d8, d9, d10, d11}, [r0], r1
|
||||
vld1.u8 {d12, d13, d14, d15}, [r0], r1
|
||||
vld1.u8 {d16, d17, d18, d19}, [r0], r1
|
||||
|
||||
;pld [r0]
|
||||
;pld [r0, r1]
|
||||
;pld [r0, r1, lsl #1]
|
||||
|
||||
vext.8 q3, q2, q3, #1 ;construct src_ptr[1]
|
||||
vext.8 q5, q4, q5, #1
|
||||
vext.8 q7, q6, q7, #1
|
||||
vext.8 q9, q8, q9, #1
|
||||
|
||||
vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
|
||||
vrhadd.u8 q2, q4, q5
|
||||
vrhadd.u8 q3, q6, q7
|
||||
vrhadd.u8 q4, q8, q9
|
||||
|
||||
vld1.8 {q5}, [r2], r3
|
||||
vrhadd.u8 q0, q0, q1
|
||||
vld1.8 {q6}, [r2], r3
|
||||
vrhadd.u8 q1, q1, q2
|
||||
vld1.8 {q7}, [r2], r3
|
||||
vrhadd.u8 q2, q2, q3
|
||||
vld1.8 {q8}, [r2], r3
|
||||
vrhadd.u8 q3, q3, q4
|
||||
|
||||
vsubl.u8 q9, d0, d10 ;diff
|
||||
vsubl.u8 q10, d1, d11
|
||||
vsubl.u8 q11, d2, d12
|
||||
vsubl.u8 q12, d3, d13
|
||||
|
||||
vsubl.u8 q0, d4, d14 ;diff
|
||||
vsubl.u8 q1, d5, d15
|
||||
vsubl.u8 q5, d6, d16
|
||||
vsubl.u8 q6, d7, d17
|
||||
|
||||
vpadal.s16 q13, q9 ;sum
|
||||
vmlal.s16 q14, d18, d18 ;sse
|
||||
vmlal.s16 q15, d19, d19
|
||||
|
||||
vpadal.s16 q13, q10 ;sum
|
||||
vmlal.s16 q14, d20, d20 ;sse
|
||||
vmlal.s16 q15, d21, d21
|
||||
|
||||
vpadal.s16 q13, q11 ;sum
|
||||
vmlal.s16 q14, d22, d22 ;sse
|
||||
vmlal.s16 q15, d23, d23
|
||||
|
||||
vpadal.s16 q13, q12 ;sum
|
||||
vmlal.s16 q14, d24, d24 ;sse
|
||||
vmlal.s16 q15, d25, d25
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
vpadal.s16 q13, q0 ;sum
|
||||
vmlal.s16 q14, d0, d0 ;sse
|
||||
vmlal.s16 q15, d1, d1
|
||||
|
||||
vpadal.s16 q13, q1 ;sum
|
||||
vmlal.s16 q14, d2, d2 ;sse
|
||||
vmlal.s16 q15, d3, d3
|
||||
|
||||
vpadal.s16 q13, q5 ;sum
|
||||
vmlal.s16 q14, d10, d10 ;sse
|
||||
vmlal.s16 q15, d11, d11
|
||||
|
||||
vmov q0, q4
|
||||
|
||||
vpadal.s16 q13, q6 ;sum
|
||||
vmlal.s16 q14, d12, d12 ;sse
|
||||
vmlal.s16 q15, d13, d13
|
||||
|
||||
bne vp8_filt16x16s_4_4_loop_neon
|
||||
|
||||
vadd.u32 q15, q14, q15 ;accumulate sse
|
||||
vpaddl.s32 q0, q13 ;accumulate sum
|
||||
|
||||
vpaddl.u32 q1, q15
|
||||
vadd.s64 d0, d0, d1
|
||||
vadd.u64 d1, d2, d3
|
||||
|
||||
vmull.s32 q5, d0, d0
|
||||
vst1.32 {d1[0]}, [lr] ;store sse
|
||||
vshr.s32 d10, d10, #8
|
||||
vsub.s32 d0, d1, d10
|
||||
|
||||
vmov.32 r0, d0[0] ;return
|
||||
pop {pc}
|
||||
ENDP
|
||||
|
||||
;==============================
|
||||
; r0 unsigned char *src_ptr,
|
||||
; r1 int src_pixels_per_line,
|
||||
; r2 int xoffset,
|
||||
; r3 int yoffset,
|
||||
; stack unsigned char *dst_ptr,
|
||||
; stack int dst_pixels_per_line,
|
||||
; stack unsigned int *sse
|
||||
;note: in vp8_find_best_half_pixel_step()(called when 8<Speed<15), and first call of vp8_find_best_sub_pixel_step()
|
||||
;(called when speed<=8). xoffset/yoffset can only be 4 or 0, which means either by pass the filter,
|
||||
;or filter coeff is {64, 64}. This simplified program only works in this situation.
|
||||
;note: It happens that both xoffset and yoffset are zero. This can be handled in c code later.
|
||||
|
||||
|vp9_sub_pixel_variance16x16s_neon| PROC
|
||||
push {r4, lr}
|
||||
|
||||
ldr r4, [sp, #8] ;load *dst_ptr from stack
|
||||
ldr r12, [sp, #12] ;load dst_pixels_per_line from stack
|
||||
ldr lr, [sp, #16] ;load *sse from stack
|
||||
|
||||
cmp r2, #0 ;skip first_pass filter if xoffset=0
|
||||
beq secondpass_bfilter16x16s_only
|
||||
|
||||
cmp r3, #0 ;skip second_pass filter if yoffset=0
|
||||
beq firstpass_bfilter16x16s_only
|
||||
|
||||
vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
|
||||
sub sp, sp, #256 ;reserve space on stack for temporary storage
|
||||
vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
|
||||
mov r3, sp
|
||||
mov r2, #4 ;loop counter
|
||||
vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
|
||||
|
||||
;First Pass: output_height lines x output_width columns (17x16)
|
||||
vp8e_filt_blk2d_fp16x16s_loop_neon
|
||||
vld1.u8 {d4, d5, d6, d7}, [r0], r1
|
||||
vld1.u8 {d8, d9, d10, d11}, [r0], r1
|
||||
vld1.u8 {d12, d13, d14, d15}, [r0], r1
|
||||
vld1.u8 {d16, d17, d18, d19}, [r0], r1
|
||||
|
||||
;pld [r0]
|
||||
;pld [r0, r1]
|
||||
;pld [r0, r1, lsl #1]
|
||||
|
||||
vext.8 q3, q2, q3, #1 ;construct src_ptr[1]
|
||||
vext.8 q5, q4, q5, #1
|
||||
vext.8 q7, q6, q7, #1
|
||||
vext.8 q9, q8, q9, #1
|
||||
|
||||
vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
|
||||
vrhadd.u8 q2, q4, q5
|
||||
vrhadd.u8 q3, q6, q7
|
||||
vrhadd.u8 q4, q8, q9
|
||||
|
||||
vrhadd.u8 q0, q0, q1
|
||||
vrhadd.u8 q1, q1, q2
|
||||
vrhadd.u8 q2, q2, q3
|
||||
vrhadd.u8 q3, q3, q4
|
||||
|
||||
subs r2, r2, #1
|
||||
vst1.u8 {d0, d1 ,d2, d3}, [r3]! ;store result
|
||||
vmov q0, q4
|
||||
vst1.u8 {d4, d5, d6, d7}, [r3]!
|
||||
|
||||
bne vp8e_filt_blk2d_fp16x16s_loop_neon
|
||||
|
||||
b sub_pixel_variance16x16s_neon
|
||||
|
||||
;--------------------
|
||||
firstpass_bfilter16x16s_only
|
||||
mov r2, #2 ;loop counter
|
||||
sub sp, sp, #256 ;reserve space on stack for temporary storage
|
||||
mov r3, sp
|
||||
|
||||
;First Pass: output_height lines x output_width columns (16x16)
|
||||
vp8e_filt_blk2d_fpo16x16s_loop_neon
|
||||
vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
|
||||
vld1.u8 {d4, d5, d6, d7}, [r0], r1
|
||||
vld1.u8 {d8, d9, d10, d11}, [r0], r1
|
||||
vld1.u8 {d12, d13, d14, d15}, [r0], r1
|
||||
|
||||
;pld [r0]
|
||||
;pld [r0, r1]
|
||||
;pld [r0, r1, lsl #1]
|
||||
|
||||
vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
|
||||
vld1.u8 {d16, d17, d18, d19}, [r0], r1
|
||||
vext.8 q3, q2, q3, #1
|
||||
vld1.u8 {d20, d21, d22, d23}, [r0], r1
|
||||
vext.8 q5, q4, q5, #1
|
||||
vld1.u8 {d24, d25, d26, d27}, [r0], r1
|
||||
vext.8 q7, q6, q7, #1
|
||||
vld1.u8 {d28, d29, d30, d31}, [r0], r1
|
||||
vext.8 q9, q8, q9, #1
|
||||
vext.8 q11, q10, q11, #1
|
||||
vext.8 q13, q12, q13, #1
|
||||
vext.8 q15, q14, q15, #1
|
||||
|
||||
vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
|
||||
vrhadd.u8 q1, q2, q3
|
||||
vrhadd.u8 q2, q4, q5
|
||||
vrhadd.u8 q3, q6, q7
|
||||
vrhadd.u8 q4, q8, q9
|
||||
vrhadd.u8 q5, q10, q11
|
||||
vrhadd.u8 q6, q12, q13
|
||||
vrhadd.u8 q7, q14, q15
|
||||
|
||||
subs r2, r2, #1
|
||||
|
||||
vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result
|
||||
vst1.u8 {d4, d5, d6, d7}, [r3]!
|
||||
vst1.u8 {d8, d9, d10, d11}, [r3]!
|
||||
vst1.u8 {d12, d13, d14, d15}, [r3]!
|
||||
|
||||
bne vp8e_filt_blk2d_fpo16x16s_loop_neon
|
||||
|
||||
b sub_pixel_variance16x16s_neon
|
||||
|
||||
;---------------------
|
||||
secondpass_bfilter16x16s_only
|
||||
sub sp, sp, #256 ;reserve space on stack for temporary storage
|
||||
|
||||
mov r2, #2 ;loop counter
|
||||
vld1.u8 {d0, d1}, [r0], r1 ;load src data
|
||||
mov r3, sp
|
||||
|
||||
vp8e_filt_blk2d_spo16x16s_loop_neon
|
||||
vld1.u8 {d2, d3}, [r0], r1
|
||||
vld1.u8 {d4, d5}, [r0], r1
|
||||
vld1.u8 {d6, d7}, [r0], r1
|
||||
vld1.u8 {d8, d9}, [r0], r1
|
||||
|
||||
vrhadd.u8 q0, q0, q1
|
||||
vld1.u8 {d10, d11}, [r0], r1
|
||||
vrhadd.u8 q1, q1, q2
|
||||
vld1.u8 {d12, d13}, [r0], r1
|
||||
vrhadd.u8 q2, q2, q3
|
||||
vld1.u8 {d14, d15}, [r0], r1
|
||||
vrhadd.u8 q3, q3, q4
|
||||
vld1.u8 {d16, d17}, [r0], r1
|
||||
vrhadd.u8 q4, q4, q5
|
||||
vrhadd.u8 q5, q5, q6
|
||||
vrhadd.u8 q6, q6, q7
|
||||
vrhadd.u8 q7, q7, q8
|
||||
|
||||
subs r2, r2, #1
|
||||
|
||||
vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result
|
||||
vmov q0, q8
|
||||
vst1.u8 {d4, d5, d6, d7}, [r3]!
|
||||
vst1.u8 {d8, d9, d10, d11}, [r3]! ;store result
|
||||
vst1.u8 {d12, d13, d14, d15}, [r3]!
|
||||
|
||||
bne vp8e_filt_blk2d_spo16x16s_loop_neon
|
||||
|
||||
b sub_pixel_variance16x16s_neon
|
||||
|
||||
;----------------------------
|
||||
;variance16x16
|
||||
sub_pixel_variance16x16s_neon
|
||||
vmov.i8 q8, #0 ;q8 - sum
|
||||
vmov.i8 q9, #0 ;q9, q10 - sse
|
||||
vmov.i8 q10, #0
|
||||
|
||||
sub r3, r3, #256
|
||||
mov r2, #4
|
||||
|
||||
sub_pixel_variance16x16s_neon_loop
|
||||
vld1.8 {q0}, [r3]! ;Load up source and reference
|
||||
vld1.8 {q1}, [r4], r12
|
||||
vld1.8 {q2}, [r3]!
|
||||
vld1.8 {q3}, [r4], r12
|
||||
vld1.8 {q4}, [r3]!
|
||||
vld1.8 {q5}, [r4], r12
|
||||
vld1.8 {q6}, [r3]!
|
||||
vld1.8 {q7}, [r4], r12
|
||||
|
||||
vsubl.u8 q11, d0, d2 ;diff
|
||||
vsubl.u8 q12, d1, d3
|
||||
vsubl.u8 q13, d4, d6
|
||||
vsubl.u8 q14, d5, d7
|
||||
vsubl.u8 q0, d8, d10
|
||||
vsubl.u8 q1, d9, d11
|
||||
vsubl.u8 q2, d12, d14
|
||||
vsubl.u8 q3, d13, d15
|
||||
|
||||
vpadal.s16 q8, q11 ;sum
|
||||
vmlal.s16 q9, d22, d22 ;sse
|
||||
vmlal.s16 q10, d23, d23
|
||||
|
||||
subs r2, r2, #1
|
||||
|
||||
vpadal.s16 q8, q12
|
||||
vmlal.s16 q9, d24, d24
|
||||
vmlal.s16 q10, d25, d25
|
||||
vpadal.s16 q8, q13
|
||||
vmlal.s16 q9, d26, d26
|
||||
vmlal.s16 q10, d27, d27
|
||||
vpadal.s16 q8, q14
|
||||
vmlal.s16 q9, d28, d28
|
||||
vmlal.s16 q10, d29, d29
|
||||
|
||||
vpadal.s16 q8, q0 ;sum
|
||||
vmlal.s16 q9, d0, d0 ;sse
|
||||
vmlal.s16 q10, d1, d1
|
||||
vpadal.s16 q8, q1
|
||||
vmlal.s16 q9, d2, d2
|
||||
vmlal.s16 q10, d3, d3
|
||||
vpadal.s16 q8, q2
|
||||
vmlal.s16 q9, d4, d4
|
||||
vmlal.s16 q10, d5, d5
|
||||
vpadal.s16 q8, q3
|
||||
vmlal.s16 q9, d6, d6
|
||||
vmlal.s16 q10, d7, d7
|
||||
|
||||
bne sub_pixel_variance16x16s_neon_loop
|
||||
|
||||
vadd.u32 q10, q9, q10 ;accumulate sse
|
||||
vpaddl.s32 q0, q8 ;accumulate sum
|
||||
|
||||
vpaddl.u32 q1, q10
|
||||
vadd.s64 d0, d0, d1
|
||||
vadd.u64 d1, d2, d3
|
||||
|
||||
vmull.s32 q5, d0, d0
|
||||
vst1.32 {d1[0]}, [lr] ;store sse
|
||||
vshr.s32 d10, d10, #8
|
||||
vsub.s32 d0, d1, d10
|
||||
|
||||
add sp, sp, #256
|
||||
vmov.32 r0, d0[0] ;return
|
||||
|
||||
pop {r4, pc}
|
||||
ENDP
|
||||
|
||||
END
|
|
@ -1,224 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp9_sub_pixel_variance8x8_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
; r0 unsigned char *src_ptr,
|
||||
; r1 int src_pixels_per_line,
|
||||
; r2 int xoffset,
|
||||
; r3 int yoffset,
|
||||
; stack(r4) unsigned char *dst_ptr,
|
||||
; stack(r5) int dst_pixels_per_line,
|
||||
; stack(r6) unsigned int *sse
|
||||
;note: most of the code is copied from bilinear_predict8x8_neon and vp9_variance8x8_neon.
|
||||
|
||||
|vp9_sub_pixel_variance8x8_neon| PROC
|
||||
push {r4-r5, lr}
|
||||
|
||||
ldr r12, _BilinearTaps_coeff_
|
||||
ldr r4, [sp, #12] ;load *dst_ptr from stack
|
||||
ldr r5, [sp, #16] ;load dst_pixels_per_line from stack
|
||||
ldr lr, [sp, #20] ;load *sse from stack
|
||||
|
||||
cmp r2, #0 ;skip first_pass filter if xoffset=0
|
||||
beq skip_firstpass_filter
|
||||
|
||||
;First pass: output_height lines x output_width columns (9x8)
|
||||
add r2, r12, r2, lsl #3 ;calculate filter location
|
||||
|
||||
vld1.u8 {q1}, [r0], r1 ;load src data
|
||||
vld1.u32 {d31}, [r2] ;load first_pass filter
|
||||
vld1.u8 {q2}, [r0], r1
|
||||
vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
|
||||
vld1.u8 {q3}, [r0], r1
|
||||
vdup.8 d1, d31[4]
|
||||
vld1.u8 {q4}, [r0], r1
|
||||
|
||||
vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
|
||||
vmull.u8 q7, d4, d0
|
||||
vmull.u8 q8, d6, d0
|
||||
vmull.u8 q9, d8, d0
|
||||
|
||||
vext.8 d3, d2, d3, #1 ;construct src_ptr[-1]
|
||||
vext.8 d5, d4, d5, #1
|
||||
vext.8 d7, d6, d7, #1
|
||||
vext.8 d9, d8, d9, #1
|
||||
|
||||
vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
|
||||
vmlal.u8 q7, d5, d1
|
||||
vmlal.u8 q8, d7, d1
|
||||
vmlal.u8 q9, d9, d1
|
||||
|
||||
vld1.u8 {q1}, [r0], r1 ;load src data
|
||||
vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
|
||||
vld1.u8 {q2}, [r0], r1
|
||||
vqrshrn.u16 d23, q7, #7
|
||||
vld1.u8 {q3}, [r0], r1
|
||||
vqrshrn.u16 d24, q8, #7
|
||||
vld1.u8 {q4}, [r0], r1
|
||||
vqrshrn.u16 d25, q9, #7
|
||||
|
||||
;first_pass filtering on the rest 5-line data
|
||||
vld1.u8 {q5}, [r0], r1
|
||||
|
||||
vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
|
||||
vmull.u8 q7, d4, d0
|
||||
vmull.u8 q8, d6, d0
|
||||
vmull.u8 q9, d8, d0
|
||||
vmull.u8 q10, d10, d0
|
||||
|
||||
vext.8 d3, d2, d3, #1 ;construct src_ptr[-1]
|
||||
vext.8 d5, d4, d5, #1
|
||||
vext.8 d7, d6, d7, #1
|
||||
vext.8 d9, d8, d9, #1
|
||||
vext.8 d11, d10, d11, #1
|
||||
|
||||
vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
|
||||
vmlal.u8 q7, d5, d1
|
||||
vmlal.u8 q8, d7, d1
|
||||
vmlal.u8 q9, d9, d1
|
||||
vmlal.u8 q10, d11, d1
|
||||
|
||||
vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8
|
||||
vqrshrn.u16 d27, q7, #7
|
||||
vqrshrn.u16 d28, q8, #7
|
||||
vqrshrn.u16 d29, q9, #7
|
||||
vqrshrn.u16 d30, q10, #7
|
||||
|
||||
;Second pass: 8x8
|
||||
secondpass_filter
|
||||
cmp r3, #0 ;skip second_pass filter if yoffset=0
|
||||
;skip_secondpass_filter
|
||||
beq sub_pixel_variance8x8_neon
|
||||
|
||||
add r3, r12, r3, lsl #3
|
||||
|
||||
vld1.u32 {d31}, [r3] ;load second_pass filter
|
||||
|
||||
vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
|
||||
vdup.8 d1, d31[4]
|
||||
|
||||
vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
|
||||
vmull.u8 q2, d23, d0
|
||||
vmull.u8 q3, d24, d0
|
||||
vmull.u8 q4, d25, d0
|
||||
vmull.u8 q5, d26, d0
|
||||
vmull.u8 q6, d27, d0
|
||||
vmull.u8 q7, d28, d0
|
||||
vmull.u8 q8, d29, d0
|
||||
|
||||
vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * Filter[1])
|
||||
vmlal.u8 q2, d24, d1
|
||||
vmlal.u8 q3, d25, d1
|
||||
vmlal.u8 q4, d26, d1
|
||||
vmlal.u8 q5, d27, d1
|
||||
vmlal.u8 q6, d28, d1
|
||||
vmlal.u8 q7, d29, d1
|
||||
vmlal.u8 q8, d30, d1
|
||||
|
||||
vqrshrn.u16 d22, q1, #7 ;shift/round/saturate to u8
|
||||
vqrshrn.u16 d23, q2, #7
|
||||
vqrshrn.u16 d24, q3, #7
|
||||
vqrshrn.u16 d25, q4, #7
|
||||
vqrshrn.u16 d26, q5, #7
|
||||
vqrshrn.u16 d27, q6, #7
|
||||
vqrshrn.u16 d28, q7, #7
|
||||
vqrshrn.u16 d29, q8, #7
|
||||
|
||||
b sub_pixel_variance8x8_neon
|
||||
|
||||
;--------------------
|
||||
skip_firstpass_filter
|
||||
vld1.u8 {d22}, [r0], r1 ;load src data
|
||||
vld1.u8 {d23}, [r0], r1
|
||||
vld1.u8 {d24}, [r0], r1
|
||||
vld1.u8 {d25}, [r0], r1
|
||||
vld1.u8 {d26}, [r0], r1
|
||||
vld1.u8 {d27}, [r0], r1
|
||||
vld1.u8 {d28}, [r0], r1
|
||||
vld1.u8 {d29}, [r0], r1
|
||||
vld1.u8 {d30}, [r0], r1
|
||||
|
||||
b secondpass_filter
|
||||
|
||||
;----------------------
|
||||
;vp9_variance8x8_neon
|
||||
sub_pixel_variance8x8_neon
|
||||
vmov.i8 q8, #0 ;q8 - sum
|
||||
vmov.i8 q9, #0 ;q9, q10 - sse
|
||||
vmov.i8 q10, #0
|
||||
|
||||
mov r12, #2
|
||||
|
||||
sub_pixel_variance8x8_neon_loop
|
||||
vld1.8 {d0}, [r4], r5 ;load dst data
|
||||
subs r12, r12, #1
|
||||
vld1.8 {d1}, [r4], r5
|
||||
vld1.8 {d2}, [r4], r5
|
||||
vsubl.u8 q4, d22, d0 ;calculate diff
|
||||
vld1.8 {d3}, [r4], r5
|
||||
|
||||
vsubl.u8 q5, d23, d1
|
||||
vsubl.u8 q6, d24, d2
|
||||
|
||||
vpadal.s16 q8, q4 ;sum
|
||||
vmlal.s16 q9, d8, d8 ;sse
|
||||
vmlal.s16 q10, d9, d9
|
||||
|
||||
vsubl.u8 q7, d25, d3
|
||||
|
||||
vpadal.s16 q8, q5
|
||||
vmlal.s16 q9, d10, d10
|
||||
vmlal.s16 q10, d11, d11
|
||||
|
||||
vmov q11, q13
|
||||
|
||||
vpadal.s16 q8, q6
|
||||
vmlal.s16 q9, d12, d12
|
||||
vmlal.s16 q10, d13, d13
|
||||
|
||||
vmov q12, q14
|
||||
|
||||
vpadal.s16 q8, q7
|
||||
vmlal.s16 q9, d14, d14
|
||||
vmlal.s16 q10, d15, d15
|
||||
|
||||
bne sub_pixel_variance8x8_neon_loop
|
||||
|
||||
vadd.u32 q10, q9, q10 ;accumulate sse
|
||||
vpaddl.s32 q0, q8 ;accumulate sum
|
||||
|
||||
vpaddl.u32 q1, q10
|
||||
vadd.s64 d0, d0, d1
|
||||
vadd.u64 d1, d2, d3
|
||||
|
||||
vmull.s32 q5, d0, d0
|
||||
vst1.32 {d1[0]}, [lr] ;store sse
|
||||
vshr.s32 d10, d10, #6
|
||||
vsub.s32 d0, d1, d10
|
||||
|
||||
vmov.32 r0, d0[0] ;return
|
||||
pop {r4-r5, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
;-----------------
|
||||
|
||||
_BilinearTaps_coeff_
|
||||
DCD bilinear_taps_coeff
|
||||
bilinear_taps_coeff
|
||||
DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
|
||||
|
||||
END
|
|
@ -1,185 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
EXPORT |vp8_subtract_b_neon|
|
||||
EXPORT |vp8_subtract_mby_neon|
|
||||
EXPORT |vp8_subtract_mbuv_neon|
|
||||
|
||||
INCLUDE vp9_asm_enc_offsets.asm
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
;void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch)
|
||||
|vp8_subtract_b_neon| PROC
|
||||
|
||||
stmfd sp!, {r4-r7}
|
||||
|
||||
ldr r3, [r0, #vp8_block_base_src]
|
||||
ldr r4, [r0, #vp8_block_src]
|
||||
ldr r5, [r0, #vp8_block_src_diff]
|
||||
ldr r3, [r3]
|
||||
ldr r6, [r0, #vp8_block_src_stride]
|
||||
add r3, r3, r4 ; src = *base_src + src
|
||||
ldr r7, [r1, #vp8_blockd_predictor]
|
||||
|
||||
vld1.8 {d0}, [r3], r6 ;load src
|
||||
vld1.8 {d1}, [r7], r2 ;load pred
|
||||
vld1.8 {d2}, [r3], r6
|
||||
vld1.8 {d3}, [r7], r2
|
||||
vld1.8 {d4}, [r3], r6
|
||||
vld1.8 {d5}, [r7], r2
|
||||
vld1.8 {d6}, [r3], r6
|
||||
vld1.8 {d7}, [r7], r2
|
||||
|
||||
vsubl.u8 q10, d0, d1
|
||||
vsubl.u8 q11, d2, d3
|
||||
vsubl.u8 q12, d4, d5
|
||||
vsubl.u8 q13, d6, d7
|
||||
|
||||
mov r2, r2, lsl #1
|
||||
|
||||
vst1.16 {d20}, [r5], r2 ;store diff
|
||||
vst1.16 {d22}, [r5], r2
|
||||
vst1.16 {d24}, [r5], r2
|
||||
vst1.16 {d26}, [r5], r2
|
||||
|
||||
ldmfd sp!, {r4-r7}
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
|
||||
;==========================================
|
||||
;void vp8_subtract_mby_neon(short *diff, unsigned char *src, unsigned char *pred, int stride)
|
||||
|vp8_subtract_mby_neon| PROC
|
||||
mov r12, #4
|
||||
|
||||
subtract_mby_loop
|
||||
vld1.8 {q0}, [r1], r3 ;load src
|
||||
vld1.8 {q1}, [r2]! ;load pred
|
||||
vld1.8 {q2}, [r1], r3
|
||||
vld1.8 {q3}, [r2]!
|
||||
vld1.8 {q4}, [r1], r3
|
||||
vld1.8 {q5}, [r2]!
|
||||
vld1.8 {q6}, [r1], r3
|
||||
vld1.8 {q7}, [r2]!
|
||||
|
||||
vsubl.u8 q8, d0, d2
|
||||
vsubl.u8 q9, d1, d3
|
||||
vsubl.u8 q10, d4, d6
|
||||
vsubl.u8 q11, d5, d7
|
||||
vsubl.u8 q12, d8, d10
|
||||
vsubl.u8 q13, d9, d11
|
||||
vsubl.u8 q14, d12, d14
|
||||
vsubl.u8 q15, d13, d15
|
||||
|
||||
vst1.16 {q8}, [r0]! ;store diff
|
||||
vst1.16 {q9}, [r0]!
|
||||
vst1.16 {q10}, [r0]!
|
||||
vst1.16 {q11}, [r0]!
|
||||
vst1.16 {q12}, [r0]!
|
||||
vst1.16 {q13}, [r0]!
|
||||
vst1.16 {q14}, [r0]!
|
||||
vst1.16 {q15}, [r0]!
|
||||
|
||||
subs r12, r12, #1
|
||||
bne subtract_mby_loop
|
||||
|
||||
bx lr
|
||||
ENDP
|
||||
|
||||
;=================================
|
||||
;void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
|
||||
|vp8_subtract_mbuv_neon| PROC
|
||||
ldr r12, [sp]
|
||||
|
||||
;u
|
||||
add r0, r0, #512 ; short *udiff = diff + 256;
|
||||
add r3, r3, #256 ; unsigned char *upred = pred + 256;
|
||||
|
||||
vld1.8 {d0}, [r1], r12 ;load src
|
||||
vld1.8 {d1}, [r3]! ;load pred
|
||||
vld1.8 {d2}, [r1], r12
|
||||
vld1.8 {d3}, [r3]!
|
||||
vld1.8 {d4}, [r1], r12
|
||||
vld1.8 {d5}, [r3]!
|
||||
vld1.8 {d6}, [r1], r12
|
||||
vld1.8 {d7}, [r3]!
|
||||
vld1.8 {d8}, [r1], r12
|
||||
vld1.8 {d9}, [r3]!
|
||||
vld1.8 {d10}, [r1], r12
|
||||
vld1.8 {d11}, [r3]!
|
||||
vld1.8 {d12}, [r1], r12
|
||||
vld1.8 {d13}, [r3]!
|
||||
vld1.8 {d14}, [r1], r12
|
||||
vld1.8 {d15}, [r3]!
|
||||
|
||||
vsubl.u8 q8, d0, d1
|
||||
vsubl.u8 q9, d2, d3
|
||||
vsubl.u8 q10, d4, d5
|
||||
vsubl.u8 q11, d6, d7
|
||||
vsubl.u8 q12, d8, d9
|
||||
vsubl.u8 q13, d10, d11
|
||||
vsubl.u8 q14, d12, d13
|
||||
vsubl.u8 q15, d14, d15
|
||||
|
||||
vst1.16 {q8}, [r0]! ;store diff
|
||||
vst1.16 {q9}, [r0]!
|
||||
vst1.16 {q10}, [r0]!
|
||||
vst1.16 {q11}, [r0]!
|
||||
vst1.16 {q12}, [r0]!
|
||||
vst1.16 {q13}, [r0]!
|
||||
vst1.16 {q14}, [r0]!
|
||||
vst1.16 {q15}, [r0]!
|
||||
|
||||
;v
|
||||
vld1.8 {d0}, [r2], r12 ;load src
|
||||
vld1.8 {d1}, [r3]! ;load pred
|
||||
vld1.8 {d2}, [r2], r12
|
||||
vld1.8 {d3}, [r3]!
|
||||
vld1.8 {d4}, [r2], r12
|
||||
vld1.8 {d5}, [r3]!
|
||||
vld1.8 {d6}, [r2], r12
|
||||
vld1.8 {d7}, [r3]!
|
||||
vld1.8 {d8}, [r2], r12
|
||||
vld1.8 {d9}, [r3]!
|
||||
vld1.8 {d10}, [r2], r12
|
||||
vld1.8 {d11}, [r3]!
|
||||
vld1.8 {d12}, [r2], r12
|
||||
vld1.8 {d13}, [r3]!
|
||||
vld1.8 {d14}, [r2], r12
|
||||
vld1.8 {d15}, [r3]!
|
||||
|
||||
vsubl.u8 q8, d0, d1
|
||||
vsubl.u8 q9, d2, d3
|
||||
vsubl.u8 q10, d4, d5
|
||||
vsubl.u8 q11, d6, d7
|
||||
vsubl.u8 q12, d8, d9
|
||||
vsubl.u8 q13, d10, d11
|
||||
vsubl.u8 q14, d12, d13
|
||||
vsubl.u8 q15, d14, d15
|
||||
|
||||
vst1.16 {q8}, [r0]! ;store diff
|
||||
vst1.16 {q9}, [r0]!
|
||||
vst1.16 {q10}, [r0]!
|
||||
vst1.16 {q11}, [r0]!
|
||||
vst1.16 {q12}, [r0]!
|
||||
vst1.16 {q13}, [r0]!
|
||||
vst1.16 {q14}, [r0]!
|
||||
vst1.16 {q15}, [r0]!
|
||||
|
||||
bx lr
|
||||
ENDP
|
||||
|
||||
END
|
|
@ -1,276 +0,0 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp9_variance16x16_neon|
|
||||
EXPORT |vp9_variance16x8_neon|
|
||||
EXPORT |vp9_variance8x16_neon|
|
||||
EXPORT |vp9_variance8x8_neon|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; r0 unsigned char *src_ptr
|
||||
; r1 int source_stride
|
||||
; r2 unsigned char *ref_ptr
|
||||
; r3 int recon_stride
|
||||
; stack unsigned int *sse
|
||||
|vp9_variance16x16_neon| PROC
|
||||
vmov.i8 q8, #0 ;q8 - sum
|
||||
vmov.i8 q9, #0 ;q9, q10 - sse
|
||||
vmov.i8 q10, #0
|
||||
|
||||
mov r12, #8
|
||||
|
||||
variance16x16_neon_loop
|
||||
vld1.8 {q0}, [r0], r1 ;Load up source and reference
|
||||
vld1.8 {q2}, [r2], r3
|
||||
vld1.8 {q1}, [r0], r1
|
||||
vld1.8 {q3}, [r2], r3
|
||||
|
||||
vsubl.u8 q11, d0, d4 ;calculate diff
|
||||
vsubl.u8 q12, d1, d5
|
||||
vsubl.u8 q13, d2, d6
|
||||
vsubl.u8 q14, d3, d7
|
||||
|
||||
;VPADAL adds adjacent pairs of elements of a vector, and accumulates
|
||||
;the results into the elements of the destination vector. The explanation
|
||||
;in ARM guide is wrong.
|
||||
vpadal.s16 q8, q11 ;calculate sum
|
||||
vmlal.s16 q9, d22, d22 ;calculate sse
|
||||
vmlal.s16 q10, d23, d23
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
vpadal.s16 q8, q12
|
||||
vmlal.s16 q9, d24, d24
|
||||
vmlal.s16 q10, d25, d25
|
||||
vpadal.s16 q8, q13
|
||||
vmlal.s16 q9, d26, d26
|
||||
vmlal.s16 q10, d27, d27
|
||||
vpadal.s16 q8, q14
|
||||
vmlal.s16 q9, d28, d28
|
||||
vmlal.s16 q10, d29, d29
|
||||
|
||||
bne variance16x16_neon_loop
|
||||
|
||||
vadd.u32 q10, q9, q10 ;accumulate sse
|
||||
vpaddl.s32 q0, q8 ;accumulate sum
|
||||
|
||||
ldr r12, [sp] ;load *sse from stack
|
||||
|
||||
vpaddl.u32 q1, q10
|
||||
vadd.s64 d0, d0, d1
|
||||
vadd.u64 d1, d2, d3
|
||||
|
||||
;vmov.32 r0, d0[0] ;this instruction costs a lot
|
||||
;vmov.32 r1, d1[0]
|
||||
;mul r0, r0, r0
|
||||
;str r1, [r12]
|
||||
;sub r0, r1, r0, asr #8
|
||||
|
||||
;sum is in [-255x256, 255x256]. sumxsum is 32-bit. Shift to right should
|
||||
;have sign-bit exension, which is vshr.s. Have to use s32 to make it right.
|
||||
vmull.s32 q5, d0, d0
|
||||
vst1.32 {d1[0]}, [r12] ;store sse
|
||||
vshr.s32 d10, d10, #8
|
||||
vsub.s32 d0, d1, d10
|
||||
|
||||
vmov.32 r0, d0[0] ;return
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
;================================
|
||||
;unsigned int vp9_variance16x8_c(
|
||||
; unsigned char *src_ptr,
|
||||
; int source_stride,
|
||||
; unsigned char *ref_ptr,
|
||||
; int recon_stride,
|
||||
; unsigned int *sse)
|
||||
|vp9_variance16x8_neon| PROC
|
||||
vmov.i8 q8, #0 ;q8 - sum
|
||||
vmov.i8 q9, #0 ;q9, q10 - sse
|
||||
vmov.i8 q10, #0
|
||||
|
||||
mov r12, #4
|
||||
|
||||
variance16x8_neon_loop
|
||||
vld1.8 {q0}, [r0], r1 ;Load up source and reference
|
||||
vld1.8 {q2}, [r2], r3
|
||||
vld1.8 {q1}, [r0], r1
|
||||
vld1.8 {q3}, [r2], r3
|
||||
|
||||
vsubl.u8 q11, d0, d4 ;calculate diff
|
||||
vsubl.u8 q12, d1, d5
|
||||
vsubl.u8 q13, d2, d6
|
||||
vsubl.u8 q14, d3, d7
|
||||
|
||||
vpadal.s16 q8, q11 ;calculate sum
|
||||
vmlal.s16 q9, d22, d22 ;calculate sse
|
||||
vmlal.s16 q10, d23, d23
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
vpadal.s16 q8, q12
|
||||
vmlal.s16 q9, d24, d24
|
||||
vmlal.s16 q10, d25, d25
|
||||
vpadal.s16 q8, q13
|
||||
vmlal.s16 q9, d26, d26
|
||||
vmlal.s16 q10, d27, d27
|
||||
vpadal.s16 q8, q14
|
||||
vmlal.s16 q9, d28, d28
|
||||
vmlal.s16 q10, d29, d29
|
||||
|
||||
bne variance16x8_neon_loop
|
||||
|
||||
vadd.u32 q10, q9, q10 ;accumulate sse
|
||||
vpaddl.s32 q0, q8 ;accumulate sum
|
||||
|
||||
ldr r12, [sp] ;load *sse from stack
|
||||
|
||||
vpaddl.u32 q1, q10
|
||||
vadd.s64 d0, d0, d1
|
||||
vadd.u64 d1, d2, d3
|
||||
|
||||
vmull.s32 q5, d0, d0
|
||||
vst1.32 {d1[0]}, [r12] ;store sse
|
||||
vshr.s32 d10, d10, #7
|
||||
vsub.s32 d0, d1, d10
|
||||
|
||||
vmov.32 r0, d0[0] ;return
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
;=================================
|
||||
;unsigned int vp9_variance8x16_c(
|
||||
; unsigned char *src_ptr,
|
||||
; int source_stride,
|
||||
; unsigned char *ref_ptr,
|
||||
; int recon_stride,
|
||||
; unsigned int *sse)
|
||||
|
||||
|vp9_variance8x16_neon| PROC
|
||||
vmov.i8 q8, #0 ;q8 - sum
|
||||
vmov.i8 q9, #0 ;q9, q10 - sse
|
||||
vmov.i8 q10, #0
|
||||
|
||||
mov r12, #8
|
||||
|
||||
variance8x16_neon_loop
|
||||
vld1.8 {d0}, [r0], r1 ;Load up source and reference
|
||||
vld1.8 {d4}, [r2], r3
|
||||
vld1.8 {d2}, [r0], r1
|
||||
vld1.8 {d6}, [r2], r3
|
||||
|
||||
vsubl.u8 q11, d0, d4 ;calculate diff
|
||||
vsubl.u8 q12, d2, d6
|
||||
|
||||
vpadal.s16 q8, q11 ;calculate sum
|
||||
vmlal.s16 q9, d22, d22 ;calculate sse
|
||||
vmlal.s16 q10, d23, d23
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
vpadal.s16 q8, q12
|
||||
vmlal.s16 q9, d24, d24
|
||||
vmlal.s16 q10, d25, d25
|
||||
|
||||
bne variance8x16_neon_loop
|
||||
|
||||
vadd.u32 q10, q9, q10 ;accumulate sse
|
||||
vpaddl.s32 q0, q8 ;accumulate sum
|
||||
|
||||
ldr r12, [sp] ;load *sse from stack
|
||||
|
||||
vpaddl.u32 q1, q10
|
||||
vadd.s64 d0, d0, d1
|
||||
vadd.u64 d1, d2, d3
|
||||
|
||||
vmull.s32 q5, d0, d0
|
||||
vst1.32 {d1[0]}, [r12] ;store sse
|
||||
vshr.s32 d10, d10, #7
|
||||
vsub.s32 d0, d1, d10
|
||||
|
||||
vmov.32 r0, d0[0] ;return
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
;==================================
|
||||
; r0 unsigned char *src_ptr
|
||||
; r1 int source_stride
|
||||
; r2 unsigned char *ref_ptr
|
||||
; r3 int recon_stride
|
||||
; stack unsigned int *sse
|
||||
|vp9_variance8x8_neon| PROC
|
||||
vmov.i8 q8, #0 ;q8 - sum
|
||||
vmov.i8 q9, #0 ;q9, q10 - sse
|
||||
vmov.i8 q10, #0
|
||||
|
||||
mov r12, #2
|
||||
|
||||
variance8x8_neon_loop
|
||||
vld1.8 {d0}, [r0], r1 ;Load up source and reference
|
||||
vld1.8 {d4}, [r2], r3
|
||||
vld1.8 {d1}, [r0], r1
|
||||
vld1.8 {d5}, [r2], r3
|
||||
vld1.8 {d2}, [r0], r1
|
||||
vld1.8 {d6}, [r2], r3
|
||||
vld1.8 {d3}, [r0], r1
|
||||
vld1.8 {d7}, [r2], r3
|
||||
|
||||
vsubl.u8 q11, d0, d4 ;calculate diff
|
||||
vsubl.u8 q12, d1, d5
|
||||
vsubl.u8 q13, d2, d6
|
||||
vsubl.u8 q14, d3, d7
|
||||
|
||||
vpadal.s16 q8, q11 ;calculate sum
|
||||
vmlal.s16 q9, d22, d22 ;calculate sse
|
||||
vmlal.s16 q10, d23, d23
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
vpadal.s16 q8, q12
|
||||
vmlal.s16 q9, d24, d24
|
||||
vmlal.s16 q10, d25, d25
|
||||
vpadal.s16 q8, q13
|
||||
vmlal.s16 q9, d26, d26
|
||||
vmlal.s16 q10, d27, d27
|
||||
vpadal.s16 q8, q14
|
||||
vmlal.s16 q9, d28, d28
|
||||
vmlal.s16 q10, d29, d29
|
||||
|
||||
bne variance8x8_neon_loop
|
||||
|
||||
vadd.u32 q10, q9, q10 ;accumulate sse
|
||||
vpaddl.s32 q0, q8 ;accumulate sum
|
||||
|
||||
ldr r12, [sp] ;load *sse from stack
|
||||
|
||||
vpaddl.u32 q1, q10
|
||||
vadd.s64 d0, d0, d1
|
||||
vadd.u64 d1, d2, d3
|
||||
|
||||
vmull.s32 q5, d0, d0
|
||||
vst1.32 {d1[0]}, [r12] ;store sse
|
||||
vshr.s32 d10, d10, #6
|
||||
vsub.s32 d0, d1, d10
|
||||
|
||||
vmov.32 r0, d0[0] ;return
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
END
|
|
@ -1,129 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
|
||||
#include "vpx_ports/config.h"
|
||||
#include "vpx_ports/arm.h"
|
||||
#include "vp9/encoder/vp9_variance.h"
|
||||
#include "vp9/encoder/vp9_onyx_int.h"
|
||||
|
||||
extern void (*vp9_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
|
||||
extern void vp9_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
|
||||
extern void vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
|
||||
|
||||
void vp9_arch_arm_encoder_init(VP9_COMP *cpi) {
|
||||
#if CONFIG_RUNTIME_CPU_DETECT
|
||||
int flags = cpi->common.rtcd.flags;
|
||||
|
||||
#if HAVE_ARMV5TE
|
||||
if (flags & HAS_EDSP) {
|
||||
}
|
||||
#endif
|
||||
|
||||
#if HAVE_ARMV6
|
||||
if (flags & HAS_MEDIA) {
|
||||
cpi->rtcd.variance.sad16x16 = vp9_sad16x16_armv6;
|
||||
/*cpi->rtcd.variance.sad16x8 = vp9_sad16x8_c;
|
||||
cpi->rtcd.variance.sad8x16 = vp9_sad8x16_c;
|
||||
cpi->rtcd.variance.sad8x8 = vp9_sad8x8_c;
|
||||
cpi->rtcd.variance.sad4x4 = vp9_sad4x4_c;*/
|
||||
|
||||
/*cpi->rtcd.variance.var4x4 = vp9_variance4x4_c;*/
|
||||
cpi->rtcd.variance.var8x8 = vp9_variance8x8_armv6;
|
||||
/*cpi->rtcd.variance.var8x16 = vp9_variance8x16_c;
|
||||
cpi->rtcd.variance.var16x8 = vp9_variance16x8_c;*/
|
||||
cpi->rtcd.variance.var16x16 = vp9_variance16x16_armv6;
|
||||
|
||||
/*cpi->rtcd.variance.subpixvar4x4 = vp9_sub_pixel_variance4x4_c;*/
|
||||
cpi->rtcd.variance.subpixvar8x8 = vp9_sub_pixel_variance8x8_armv6;
|
||||
/*cpi->rtcd.variance.subpixvar8x16 = vp9_sub_pixel_variance8x16_c;
|
||||
cpi->rtcd.variance.subpixvar16x8 = vp9_sub_pixel_variance16x8_c;*/
|
||||
cpi->rtcd.variance.subpixvar16x16 = vp9_sub_pixel_variance16x16_armv6;
|
||||
cpi->rtcd.variance.halfpixvar16x16_h = vp9_variance_halfpixvar16x16_h_armv6;
|
||||
cpi->rtcd.variance.halfpixvar16x16_v = vp9_variance_halfpixvar16x16_v_armv6;
|
||||
cpi->rtcd.variance.halfpixvar16x16_hv = vp9_variance_halfpixvar16x16_hv_armv6;
|
||||
|
||||
cpi->rtcd.variance.mse16x16 = vp9_mse16x16_armv6;
|
||||
/*cpi->rtcd.variance.getmbss = vp9_get_mb_ss_c;*/
|
||||
|
||||
cpi->rtcd.fdct.short4x4 = vp9_short_fdct4x4_armv6;
|
||||
cpi->rtcd.fdct.short8x4 = vp9_short_fdct8x4_armv6;
|
||||
cpi->rtcd.fdct.fast4x4 = vp9_short_fdct4x4_armv6;
|
||||
cpi->rtcd.fdct.fast8x4 = vp9_short_fdct8x4_armv6;
|
||||
cpi->rtcd.fdct.walsh_short4x4 = vp9_short_walsh4x4_armv6;
|
||||
|
||||
/*cpi->rtcd.encodemb.berr = vp9_block_error_c;
|
||||
cpi->rtcd.encodemb.mberr = vp9_mbblock_error_c;
|
||||
cpi->rtcd.encodemb.mbuverr = vp9_mbuverror_c;*/
|
||||
cpi->rtcd.encodemb.subb = vp9_subtract_b_armv6;
|
||||
cpi->rtcd.encodemb.submby = vp9_subtract_mby_armv6;
|
||||
cpi->rtcd.encodemb.submbuv = vp9_subtract_mbuv_armv6;
|
||||
|
||||
/*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;*/
|
||||
cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_armv6;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if HAVE_ARMV7
|
||||
if (flags & HAS_NEON) {
|
||||
cpi->rtcd.variance.sad16x16 = vp9_sad16x16_neon;
|
||||
cpi->rtcd.variance.sad16x8 = vp9_sad16x8_neon;
|
||||
cpi->rtcd.variance.sad8x16 = vp9_sad8x16_neon;
|
||||
cpi->rtcd.variance.sad8x8 = vp9_sad8x8_neon;
|
||||
cpi->rtcd.variance.sad4x4 = vp9_sad4x4_neon;
|
||||
|
||||
/*cpi->rtcd.variance.var4x4 = vp9_variance4x4_c;*/
|
||||
cpi->rtcd.variance.var8x8 = vp9_variance8x8_neon;
|
||||
cpi->rtcd.variance.var8x16 = vp9_variance8x16_neon;
|
||||
cpi->rtcd.variance.var16x8 = vp9_variance16x8_neon;
|
||||
cpi->rtcd.variance.var16x16 = vp9_variance16x16_neon;
|
||||
|
||||
/*cpi->rtcd.variance.subpixvar4x4 = vp9_sub_pixel_variance4x4_c;*/
|
||||
cpi->rtcd.variance.subpixvar8x8 = vp9_sub_pixel_variance8x8_neon;
|
||||
/*cpi->rtcd.variance.subpixvar8x16 = vp9_sub_pixel_variance8x16_c;
|
||||
cpi->rtcd.variance.subpixvar16x8 = vp9_sub_pixel_variance16x8_c;*/
|
||||
cpi->rtcd.variance.subpixvar16x16 = vp9_sub_pixel_variance16x16_neon;
|
||||
cpi->rtcd.variance.halfpixvar16x16_h = vp9_variance_halfpixvar16x16_h_neon;
|
||||
cpi->rtcd.variance.halfpixvar16x16_v = vp9_variance_halfpixvar16x16_v_neon;
|
||||
cpi->rtcd.variance.halfpixvar16x16_hv = vp9_variance_halfpixvar16x16_hv_neon;
|
||||
|
||||
cpi->rtcd.variance.mse16x16 = vp9_mse16x16_neon;
|
||||
/*cpi->rtcd.variance.getmbss = vp9_get_mb_ss_c;*/
|
||||
|
||||
cpi->rtcd.fdct.short4x4 = vp9_short_fdct4x4_neon;
|
||||
cpi->rtcd.fdct.short8x4 = vp9_short_fdct8x4_neon;
|
||||
cpi->rtcd.fdct.fast4x4 = vp9_short_fdct4x4_neon;
|
||||
cpi->rtcd.fdct.fast8x4 = vp9_short_fdct8x4_neon;
|
||||
cpi->rtcd.fdct.walsh_short4x4 = vp9_short_walsh4x4_neon;
|
||||
|
||||
/*cpi->rtcd.encodemb.berr = vp9_block_error_c;
|
||||
cpi->rtcd.encodemb.mberr = vp9_mbblock_error_c;
|
||||
cpi->rtcd.encodemb.mbuverr = vp9_mbuverror_c;*/
|
||||
cpi->rtcd.encodemb.subb = vp9_subtract_b_neon;
|
||||
cpi->rtcd.encodemb.submby = vp9_subtract_mby_neon;
|
||||
cpi->rtcd.encodemb.submbuv = vp9_subtract_mbuv_neon;
|
||||
|
||||
/*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;
|
||||
cpi->rtcd.quantize.quantb_pair = vp8_regular_quantize_b_pair;*/
|
||||
cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_neon;
|
||||
cpi->rtcd.quantize.fastquantb_pair = vp8_fast_quantize_b_pair_neon;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if HAVE_ARMV7
|
||||
#if CONFIG_RUNTIME_CPU_DETECT
|
||||
if (flags & HAS_NEON)
|
||||
#endif
|
||||
{
|
||||
vp9_yv12_copy_partial_frame_ptr = vpxyv12_copy_partial_frame_neon;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
}
|
|
@ -1,33 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
|
||||
#include "vp9/encoder/vp9_boolhuff.h"
|
||||
#include "vp9/common/vp9_blockd.h"
|
||||
|
||||
const unsigned int vp9_prob_cost[256] = {
|
||||
2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046,
|
||||
1023, 1000, 979, 959, 940, 922, 905, 889, 873, 858, 843, 829, 816, 803, 790, 778,
|
||||
767, 755, 744, 733, 723, 713, 703, 693, 684, 675, 666, 657, 649, 641, 633, 625,
|
||||
617, 609, 602, 594, 587, 580, 573, 567, 560, 553, 547, 541, 534, 528, 522, 516,
|
||||
511, 505, 499, 494, 488, 483, 477, 472, 467, 462, 457, 452, 447, 442, 437, 433,
|
||||
428, 424, 419, 415, 410, 406, 401, 397, 393, 389, 385, 381, 377, 373, 369, 365,
|
||||
361, 357, 353, 349, 346, 342, 338, 335, 331, 328, 324, 321, 317, 314, 311, 307,
|
||||
304, 301, 297, 294, 291, 288, 285, 281, 278, 275, 272, 269, 266, 263, 260, 257,
|
||||
255, 252, 249, 246, 243, 240, 238, 235, 232, 229, 227, 224, 221, 219, 216, 214,
|
||||
211, 208, 206, 203, 201, 198, 196, 194, 191, 189, 186, 184, 181, 179, 177, 174,
|
||||
172, 170, 168, 165, 163, 161, 159, 156, 154, 152, 150, 148, 145, 143, 141, 139,
|
||||
137, 135, 133, 131, 129, 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107,
|
||||
105, 103, 101, 99, 97, 95, 93, 92, 90, 88, 86, 84, 82, 81, 79, 77,
|
||||
75, 73, 72, 70, 68, 66, 65, 63, 61, 60, 58, 56, 55, 53, 51, 50,
|
||||
48, 46, 45, 43, 41, 40, 38, 37, 35, 33, 32, 30, 29, 27, 25, 24,
|
||||
22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6, 4, 3, 1, 1
|
||||
};
|
||||
|
|
@ -1,21 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2011 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "vpx_config.h"
|
||||
#include "./vp9_rtcd.h"
|
||||
|
||||
#if HAVE_ARMV6
|
||||
|
||||
void vp9_short_fdct8x4_armv6(short *input, short *output, int pitch) {
|
||||
vp9_short_fdct4x4_armv6(input, output, pitch);
|
||||
vp9_short_fdct4x4_armv6(input + 4, output + 16, pitch);
|
||||
}
|
||||
|
||||
#endif /* HAVE_ARMV6 */
|
|
@ -1,65 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef VP9_ENCODER_ARM_VP9_DCT_ARM_H_
|
||||
#define VP9_ENCODER_ARM_VP9_DCT_ARM_H_
|
||||
|
||||
#if HAVE_ARMV6
|
||||
extern prototype_fdct(vp9_short_walsh4x4_armv6);
|
||||
extern prototype_fdct(vp9_short_fdct4x4_armv6);
|
||||
extern prototype_fdct(vp9_short_fdct8x4_armv6);
|
||||
|
||||
#if !CONFIG_RUNTIME_CPU_DETECT
|
||||
#undef vp8_fdct_walsh_short4x4
|
||||
#define vp8_fdct_walsh_short4x4 vp9_short_walsh4x4_armv6
|
||||
|
||||
#undef vp8_fdct_short4x4
|
||||
#define vp8_fdct_short4x4 vp9_short_fdct4x4_armv6
|
||||
|
||||
#undef vp8_fdct_short8x4
|
||||
#define vp8_fdct_short8x4 vp9_short_fdct8x4_armv6
|
||||
|
||||
#undef vp8_fdct_fast4x4
|
||||
#define vp8_fdct_fast4x4 vp9_short_fdct4x4_armv6
|
||||
|
||||
#undef vp8_fdct_fast8x4
|
||||
#define vp8_fdct_fast8x4 vp9_short_fdct8x4_armv6
|
||||
#endif
|
||||
|
||||
#endif /* HAVE_ARMV6 */
|
||||
|
||||
#if HAVE_ARMV7
|
||||
extern prototype_fdct(vp9_short_fdct4x4_neon);
|
||||
extern prototype_fdct(vp9_short_fdct8x4_neon);
|
||||
extern prototype_fdct(vp8_fast_fdct4x4_neon);
|
||||
extern prototype_fdct(vp8_fast_fdct8x4_neon);
|
||||
extern prototype_fdct(vp9_short_walsh4x4_neon);
|
||||
|
||||
#if !CONFIG_RUNTIME_CPU_DETECT
|
||||
#undef vp8_fdct_short4x4
|
||||
#define vp8_fdct_short4x4 vp9_short_fdct4x4_neon
|
||||
|
||||
#undef vp8_fdct_short8x4
|
||||
#define vp8_fdct_short8x4 vp9_short_fdct8x4_neon
|
||||
|
||||
#undef vp8_fdct_fast4x4
|
||||
#define vp8_fdct_fast4x4 vp9_short_fdct4x4_neon
|
||||
|
||||
#undef vp8_fdct_fast8x4
|
||||
#define vp8_fdct_fast8x4 vp9_short_fdct8x4_neon
|
||||
|
||||
#undef vp8_fdct_walsh_short4x4
|
||||
#define vp8_fdct_walsh_short4x4 vp9_short_walsh4x4_neon
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
|
@ -1,64 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef VP9_ENCODER_ARM_VP9_ENCODEMB_ARM_H_
|
||||
#define VP9_ENCODER_ARM_VP9_ENCODEMB_ARM_H_
|
||||
|
||||
#if HAVE_ARMV6
|
||||
extern prototype_subb(vp9_subtract_b_armv6);
|
||||
extern prototype_submby(vp9_subtract_mby_armv6);
|
||||
extern prototype_submbuv(vp9_subtract_mbuv_armv6);
|
||||
|
||||
#if !CONFIG_RUNTIME_CPU_DETECT
|
||||
#undef vp8_encodemb_subb
|
||||
#define vp8_encodemb_subb vp9_subtract_b_armv6
|
||||
|
||||
#undef vp8_encodemb_submby
|
||||
#define vp8_encodemb_submby vp9_subtract_mby_armv6
|
||||
|
||||
#undef vp8_encodemb_submbuv
|
||||
#define vp8_encodemb_submbuv vp9_subtract_mbuv_armv6
|
||||
#endif
|
||||
|
||||
#endif /* HAVE_ARMV6 */
|
||||
|
||||
#if HAVE_ARMV7
|
||||
// extern prototype_berr(vp9_block_error_c);
|
||||
// extern prototype_mberr(vp9_mbblock_error_c);
|
||||
// extern prototype_mbuverr(vp9_mbuverror_c);
|
||||
|
||||
extern prototype_subb(vp9_subtract_b_neon);
|
||||
extern prototype_submby(vp9_subtract_mby_neon);
|
||||
extern prototype_submbuv(vp9_subtract_mbuv_neon);
|
||||
|
||||
// #undef vp8_encodemb_berr
|
||||
// #define vp8_encodemb_berr vp9_block_error_c
|
||||
|
||||
// #undef vp8_encodemb_mberr
|
||||
// #define vp8_encodemb_mberr vp9_mbblock_error_c
|
||||
|
||||
// #undef vp8_encodemb_mbuverr
|
||||
// #define vp8_encodemb_mbuverr vp9_mbuverror_c
|
||||
|
||||
#if !CONFIG_RUNTIME_CPU_DETECT
|
||||
#undef vp8_encodemb_subb
|
||||
#define vp8_encodemb_subb vp9_subtract_b_neon
|
||||
|
||||
#undef vp8_encodemb_submby
|
||||
#define vp8_encodemb_submby vp9_subtract_mby_neon
|
||||
|
||||
#undef vp8_encodemb_submbuv
|
||||
#define vp8_encodemb_submbuv vp9_subtract_mbuv_neon
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
|
@ -1,57 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
|
||||
#include <math.h>
|
||||
#include "vpx_mem/vpx_mem.h"
|
||||
|
||||
#include "vp9/encoder/vp9_quantize.h"
|
||||
#include "vp9/common/vp9_entropy.h"
|
||||
|
||||
|
||||
#if HAVE_ARMV7
|
||||
|
||||
/* vp8_quantize_mbX functions here differs from corresponding ones in
|
||||
* vp9_quantize.c only by using quantize_b_pair function pointer instead of
|
||||
* the regular quantize_b function pointer */
|
||||
void vp8_quantize_mby_neon(MACROBLOCK *x) {
|
||||
int i;
|
||||
int has_2nd_order = get_2nd_order_usage(xd);
|
||||
|
||||
for (i = 0; i < 16; i += 2)
|
||||
x->quantize_b_pair(&x->block[i], &x->block[i + 1],
|
||||
&x->e_mbd.block[i], &x->e_mbd.block[i + 1]);
|
||||
|
||||
if (has_2nd_order)
|
||||
x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
|
||||
}
|
||||
|
||||
void vp8_quantize_mb_neon(MACROBLOCK *x) {
|
||||
int i;
|
||||
int has_2nd_order = get_2nd_order_usage(xd);
|
||||
|
||||
for (i = 0; i < 24; i += 2)
|
||||
x->quantize_b_pair(&x->block[i], &x->block[i + 1],
|
||||
&x->e_mbd.block[i], &x->e_mbd.block[i + 1]);
|
||||
|
||||
if (has_2nd_order)
|
||||
x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
|
||||
}
|
||||
|
||||
|
||||
void vp8_quantize_mbuv_neon(MACROBLOCK *x) {
|
||||
int i;
|
||||
|
||||
for (i = 16; i < 24; i += 2)
|
||||
x->quantize_b_pair(&x->block[i], &x->block[i + 1],
|
||||
&x->e_mbd.block[i], &x->e_mbd.block[i + 1]);
|
||||
}
|
||||
|
||||
#endif /* HAVE_ARMV7 */
|
Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше
Загрузка…
Ссылка в новой задаче