Remove ARM optimizations from VP9

Change-Id: I9f0ae635fb9a95c4aa1529c177ccb07e2b76970b
This commit is contained in:
Johann 2012-12-02 14:14:00 -08:00
Parent 0d793ccfb6
Commit 34591b54dd
112 changed files with 15 additions and 19132 deletions

View File

@@ -109,7 +109,6 @@ ifeq ($(CONFIG_VP9_ENCODER),yes)
CODEC_SRCS-yes += $(addprefix $(VP9_PREFIX),$(call enabled,VP9_CX_SRCS))
CODEC_EXPORTS-yes += $(addprefix $(VP9_PREFIX),$(VP9_CX_EXPORTS))
CODEC_SRCS-yes += $(VP9_PREFIX)vp9cx.mk vpx/vp8.h vpx/vp8cx.h
CODEC_SRCS-$(ARCH_ARM) += $(VP9_PREFIX)vp9cx_arm.mk
INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h
INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP9_PREFIX)/%
CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h

View File

@@ -18,30 +18,6 @@ VP8_DX_SRCS_REMOVE-no += $(VP8_COMMON_SRCS_REMOVE-no)
VP8_DX_SRCS-yes += vp8_dx_iface.c
# common
#define ARM
#define DISABLE_THREAD
#INCLUDES += algo/vpx_common/vpx_mem/include
#INCLUDES += common
#INCLUDES += common
#INCLUDES += common
#INCLUDES += common
#INCLUDES += decoder
# decoder
#define ARM
#define DISABLE_THREAD
#INCLUDES += algo/vpx_common/vpx_mem/include
#INCLUDES += common
#INCLUDES += common
#INCLUDES += common
#INCLUDES += common
#INCLUDES += decoder
VP8_DX_SRCS-yes += decoder/asm_dec_offsets.c
VP8_DX_SRCS-yes += decoder/dboolhuff.c
VP8_DX_SRCS-yes += decoder/decodemv.c

View File

@@ -1,237 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp9_filter_block2d_bil_first_pass_armv6|
EXPORT |vp9_filter_block2d_bil_second_pass_armv6|
AREA |.text|, CODE, READONLY ; name this block of code
;-------------------------------------
; r0 unsigned char *src_ptr,
; r1 unsigned short *dst_ptr,
; r2 unsigned int src_pitch,
; r3 unsigned int height,
; stack unsigned int width,
; stack const short *vp9_filter
;-------------------------------------
; The output is transposed and stored in the output array to make second-pass filtering easy.
|vp9_filter_block2d_bil_first_pass_armv6| PROC
stmdb sp!, {r4 - r11, lr}
ldr r11, [sp, #40] ; vp9_filter address
ldr r4, [sp, #36] ; width
mov r12, r3 ; outer-loop counter
add r7, r2, r4 ; preload next row
pld [r0, r7]
sub r2, r2, r4 ; src increment for height loop
ldr r5, [r11] ; load up filter coefficients
mov r3, r3, lsl #1 ; height*2
add r3, r3, #2 ; plus 2 to make output buffer 4-byte aligned since height is actually (height+1)
mov r11, r1 ; save dst_ptr for each row
cmp r5, #128 ; if filter coef = 128, then skip the filter
beq bil_null_1st_filter
|bil_height_loop_1st_v6|
ldrb r6, [r0] ; load source data
ldrb r7, [r0, #1]
ldrb r8, [r0, #2]
mov lr, r4, lsr #2 ; 4-in-parallel loop counter
|bil_width_loop_1st_v6|
ldrb r9, [r0, #3]
ldrb r10, [r0, #4]
pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0]
pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1]
smuad r6, r6, r5 ; apply the filter
pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2]
smuad r7, r7, r5
pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3]
smuad r8, r8, r5
smuad r9, r9, r5
add r0, r0, #4
subs lr, lr, #1
add r6, r6, #0x40 ; round_shift_and_clamp
add r7, r7, #0x40
usat r6, #16, r6, asr #7
usat r7, #16, r7, asr #7
strh r6, [r1], r3 ; result is transposed and stored
add r8, r8, #0x40 ; round_shift_and_clamp
strh r7, [r1], r3
add r9, r9, #0x40
usat r8, #16, r8, asr #7
usat r9, #16, r9, asr #7
strh r8, [r1], r3 ; result is transposed and stored
ldrneb r6, [r0] ; load source data
strh r9, [r1], r3
ldrneb r7, [r0, #1]
ldrneb r8, [r0, #2]
bne bil_width_loop_1st_v6
add r0, r0, r2 ; move to next input row
subs r12, r12, #1
add r9, r2, r4, lsl #1 ; adding back block width
pld [r0, r9] ; preload next row
add r11, r11, #2 ; move over to next column
mov r1, r11
bne bil_height_loop_1st_v6
ldmia sp!, {r4 - r11, pc}
|bil_null_1st_filter|
|bil_height_loop_null_1st|
mov lr, r4, lsr #2 ; loop counter
|bil_width_loop_null_1st|
ldrb r6, [r0] ; load data
ldrb r7, [r0, #1]
ldrb r8, [r0, #2]
ldrb r9, [r0, #3]
strh r6, [r1], r3 ; store it to immediate buffer
add r0, r0, #4
strh r7, [r1], r3
subs lr, lr, #1
strh r8, [r1], r3
strh r9, [r1], r3
bne bil_width_loop_null_1st
subs r12, r12, #1
add r0, r0, r2 ; move to next input line
add r11, r11, #2 ; move over to next column
mov r1, r11
bne bil_height_loop_null_1st
ldmia sp!, {r4 - r11, pc}
ENDP ; |vp9_filter_block2d_bil_first_pass_armv6|
;---------------------------------
; r0 unsigned short *src_ptr,
; r1 unsigned char *dst_ptr,
; r2 int dst_pitch,
; r3 unsigned int height,
; stack unsigned int width,
; stack const short *vp9_filter
;---------------------------------
|vp9_filter_block2d_bil_second_pass_armv6| PROC
stmdb sp!, {r4 - r11, lr}
ldr r11, [sp, #40] ; vp9_filter address
ldr r4, [sp, #36] ; width
ldr r5, [r11] ; load up filter coefficients
mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix
mov r11, r1
cmp r5, #128 ; if filter coef = 128, then skip the filter
beq bil_null_2nd_filter
|bil_height_loop_2nd|
ldr r6, [r0] ; load the data
ldr r8, [r0, #4]
ldrh r10, [r0, #8]
mov lr, r3, lsr #2 ; loop counter
|bil_width_loop_2nd|
pkhtb r7, r6, r8 ; src[1] | src[2]
pkhtb r9, r8, r10 ; src[3] | src[4]
smuad r6, r6, r5 ; apply filter
smuad r8, r8, r5 ; apply filter
subs lr, lr, #1
smuadx r7, r7, r5 ; apply filter
smuadx r9, r9, r5 ; apply filter
add r0, r0, #8
add r6, r6, #0x40 ; round_shift_and_clamp
add r7, r7, #0x40
usat r6, #8, r6, asr #7
usat r7, #8, r7, asr #7
strb r6, [r1], r2 ; the result is transposed back and stored
add r8, r8, #0x40 ; round_shift_and_clamp
strb r7, [r1], r2
add r9, r9, #0x40
usat r8, #8, r8, asr #7
usat r9, #8, r9, asr #7
strb r8, [r1], r2 ; the result is transposed back and stored
ldrne r6, [r0] ; load data
strb r9, [r1], r2
ldrne r8, [r0, #4]
ldrneh r10, [r0, #8]
bne bil_width_loop_2nd
subs r12, r12, #1
add r0, r0, #4 ; update src for next row
add r11, r11, #1
mov r1, r11
bne bil_height_loop_2nd
ldmia sp!, {r4 - r11, pc}
|bil_null_2nd_filter|
|bil_height_loop_null_2nd|
mov lr, r3, lsr #2
|bil_width_loop_null_2nd|
ldr r6, [r0], #4 ; load data
subs lr, lr, #1
ldr r8, [r0], #4
strb r6, [r1], r2 ; store data
mov r7, r6, lsr #16
strb r7, [r1], r2
mov r9, r8, lsr #16
strb r8, [r1], r2
strb r9, [r1], r2
bne bil_width_loop_null_2nd
subs r12, r12, #1
add r0, r0, #4
add r11, r11, #1
mov r1, r11
bne bil_height_loop_null_2nd
ldmia sp!, {r4 - r11, pc}
ENDP ; |vp9_filter_block2d_bil_second_pass_armv6|
END
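
A minimal C sketch of the two-pass bilinear filter this file implemented (an editor's reading of the assembly above, not libvpx reference code; helper names are assumptions): the first pass applies a 2-tap horizontal filter and stores 16-bit results transposed, so the vertical second pass also walks adjacent memory and transposes the block back while clamping to 8 bits.

#include <stdint.h>

/* round-shift-and-clamp of the second pass: "add #0x40; usat #8, asr #7" */
static uint8_t rsc8(int v) {
    v = (v + 64) >> 7;
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* First pass: horizontal 2-tap over height+1 rows (the vertical pass
 * needs one extra row), 16-bit output stored transposed. */
static void bil_first_pass(const uint8_t *src, int src_pitch, uint16_t *tmp,
                           int height, int width,
                           const int16_t *hf) {  /* hf[0] + hf[1] == 128 */
    for (int r = 0; r < height + 1; ++r)
        for (int c = 0; c < width; ++c)
            tmp[c * (height + 1) + r] =
                (uint16_t)((src[r * src_pitch + c] * hf[0] +
                            src[r * src_pitch + c + 1] * hf[1] + 64) >> 7);
}

/* Second pass: vertical 2-tap, which is horizontal on the transposed
 * intermediate; results are transposed back into the destination. */
static void bil_second_pass(const uint16_t *tmp, uint8_t *dst, int dst_pitch,
                            int height, int width, const int16_t *vf) {
    for (int c = 0; c < width; ++c)
        for (int r = 0; r < height; ++r)
            dst[r * dst_pitch + c] =
                rsc8(tmp[c * (height + 1) + r] * vf[0] +
                     tmp[c * (height + 1) + r + 1] * vf[1]);
}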

View File

@@ -1,186 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp9_copy_mem16x16_v6|
; ARM
; REQUIRE8
; PRESERVE8
AREA Block, CODE, READONLY ; name this block of code
;void copy_mem16x16_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|vp9_copy_mem16x16_v6| PROC
stmdb sp!, {r4 - r7}
;push {r4-r7}
;preload
pld [r0, #31] ; preload for next 16x16 block
ands r4, r0, #15
beq copy_mem16x16_fast
ands r4, r0, #7
beq copy_mem16x16_8
ands r4, r0, #3
beq copy_mem16x16_4
;copy one byte each time
ldrb r4, [r0]
ldrb r5, [r0, #1]
ldrb r6, [r0, #2]
ldrb r7, [r0, #3]
mov r12, #16
copy_mem16x16_1_loop
strb r4, [r2]
strb r5, [r2, #1]
strb r6, [r2, #2]
strb r7, [r2, #3]
ldrb r4, [r0, #4]
ldrb r5, [r0, #5]
ldrb r6, [r0, #6]
ldrb r7, [r0, #7]
subs r12, r12, #1
strb r4, [r2, #4]
strb r5, [r2, #5]
strb r6, [r2, #6]
strb r7, [r2, #7]
ldrb r4, [r0, #8]
ldrb r5, [r0, #9]
ldrb r6, [r0, #10]
ldrb r7, [r0, #11]
strb r4, [r2, #8]
strb r5, [r2, #9]
strb r6, [r2, #10]
strb r7, [r2, #11]
ldrb r4, [r0, #12]
ldrb r5, [r0, #13]
ldrb r6, [r0, #14]
ldrb r7, [r0, #15]
add r0, r0, r1
strb r4, [r2, #12]
strb r5, [r2, #13]
strb r6, [r2, #14]
strb r7, [r2, #15]
add r2, r2, r3
ldrneb r4, [r0]
ldrneb r5, [r0, #1]
ldrneb r6, [r0, #2]
ldrneb r7, [r0, #3]
pld [r0, #31] ; preload for next 16x16 block
bne copy_mem16x16_1_loop
ldmia sp!, {r4 - r7}
;pop {r4-r7}
mov pc, lr
;copy 4 bytes each time
copy_mem16x16_4
ldr r4, [r0]
ldr r5, [r0, #4]
ldr r6, [r0, #8]
ldr r7, [r0, #12]
mov r12, #16
copy_mem16x16_4_loop
subs r12, r12, #1
add r0, r0, r1
str r4, [r2]
str r5, [r2, #4]
str r6, [r2, #8]
str r7, [r2, #12]
add r2, r2, r3
ldrne r4, [r0]
ldrne r5, [r0, #4]
ldrne r6, [r0, #8]
ldrne r7, [r0, #12]
pld [r0, #31] ; preload for next 16x16 block
bne copy_mem16x16_4_loop
ldmia sp!, {r4 - r7}
;pop {r4-r7}
mov pc, lr
;copy 8 bytes each time
copy_mem16x16_8
sub r1, r1, #16
sub r3, r3, #16
mov r12, #16
copy_mem16x16_8_loop
ldmia r0!, {r4-r5}
;ldm r0, {r4-r5}
ldmia r0!, {r6-r7}
add r0, r0, r1
stmia r2!, {r4-r5}
subs r12, r12, #1
;stm r2, {r4-r5}
stmia r2!, {r6-r7}
add r2, r2, r3
pld [r0, #31] ; preload for next 16x16 block
bne copy_mem16x16_8_loop
ldmia sp!, {r4 - r7}
;pop {r4-r7}
mov pc, lr
;copy 16 bytes each time
copy_mem16x16_fast
;sub r1, r1, #16
;sub r3, r3, #16
mov r12, #16
copy_mem16x16_fast_loop
ldmia r0, {r4-r7}
;ldm r0, {r4-r7}
add r0, r0, r1
subs r12, r12, #1
stmia r2, {r4-r7}
;stm r2, {r4-r7}
add r2, r2, r3
pld [r0, #31] ; preload for next 16x16 block
bne copy_mem16x16_fast_loop
ldmia sp!, {r4 - r7}
;pop {r4-r7}
mov pc, lr
ENDP ; |vp9_copy_mem16x16_v6|
END
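
The body of this copy (and of the 8x8/8x4 variants in the next two files) is an alignment dispatch: test the source address once, then run the widest transfer that alignment allows. A hedged C sketch of the idea, with memcpy standing in for the unrolled ldm/ldr/ldrb loops:

#include <stdint.h>
#include <string.h>

static void copy_mem16x16(const uint8_t *src, int src_stride,
                          uint8_t *dst, int dst_stride) {
    /* mirrors the ands/beq dispatch at the top of the assembly */
    int unit = ((uintptr_t)src & 15) == 0 ? 16
             : ((uintptr_t)src & 7)  == 0 ? 8
             : ((uintptr_t)src & 3)  == 0 ? 4 : 1;
    for (int r = 0; r < 16; ++r, src += src_stride, dst += dst_stride)
        for (int c = 0; c < 16; c += unit)
            memcpy(dst + c, src + c, (size_t)unit);
}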

View File

@@ -1,128 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp9_copy_mem8x4_v6|
; ARM
; REQUIRE8
; PRESERVE8
AREA Block, CODE, READONLY ; name this block of code
;void vp9_copy_mem8x4_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|vp9_copy_mem8x4_v6| PROC
;push {r4-r5}
stmdb sp!, {r4-r5}
;preload
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
ands r4, r0, #7
beq copy_mem8x4_fast
ands r4, r0, #3
beq copy_mem8x4_4
;copy 1 byte each time
ldrb r4, [r0]
ldrb r5, [r0, #1]
mov r12, #4
copy_mem8x4_1_loop
strb r4, [r2]
strb r5, [r2, #1]
ldrb r4, [r0, #2]
ldrb r5, [r0, #3]
subs r12, r12, #1
strb r4, [r2, #2]
strb r5, [r2, #3]
ldrb r4, [r0, #4]
ldrb r5, [r0, #5]
strb r4, [r2, #4]
strb r5, [r2, #5]
ldrb r4, [r0, #6]
ldrb r5, [r0, #7]
add r0, r0, r1
strb r4, [r2, #6]
strb r5, [r2, #7]
add r2, r2, r3
ldrneb r4, [r0]
ldrneb r5, [r0, #1]
bne copy_mem8x4_1_loop
ldmia sp!, {r4 - r5}
;pop {r4-r5}
mov pc, lr
;copy 4 bytes each time
copy_mem8x4_4
ldr r4, [r0]
ldr r5, [r0, #4]
mov r12, #4
copy_mem8x4_4_loop
subs r12, r12, #1
add r0, r0, r1
str r4, [r2]
str r5, [r2, #4]
add r2, r2, r3
ldrne r4, [r0]
ldrne r5, [r0, #4]
bne copy_mem8x4_4_loop
ldmia sp!, {r4-r5}
;pop {r4-r5}
mov pc, lr
;copy 8 bytes each time
copy_mem8x4_fast
;sub r1, r1, #8
;sub r3, r3, #8
mov r12, #4
copy_mem8x4_fast_loop
ldmia r0, {r4-r5}
;ldm r0, {r4-r5}
add r0, r0, r1
subs r12, r12, #1
stmia r2, {r4-r5}
;stm r2, {r4-r5}
add r2, r2, r3
bne copy_mem8x4_fast_loop
ldmia sp!, {r4-r5}
;pop {r4-r5}
mov pc, lr
ENDP ; |vp9_copy_mem8x4_v6|
END

View File

@@ -1,128 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp9_copy_mem8x8_v6|
; ARM
; REQUIRE8
; PRESERVE8
AREA Block, CODE, READONLY ; name this block of code
;void copy_mem8x8_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|vp9_copy_mem8x8_v6| PROC
;push {r4-r5}
stmdb sp!, {r4-r5}
;preload
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
ands r4, r0, #7
beq copy_mem8x8_fast
ands r4, r0, #3
beq copy_mem8x8_4
;copy 1 byte each time
ldrb r4, [r0]
ldrb r5, [r0, #1]
mov r12, #8
copy_mem8x8_1_loop
strb r4, [r2]
strb r5, [r2, #1]
ldrb r4, [r0, #2]
ldrb r5, [r0, #3]
subs r12, r12, #1
strb r4, [r2, #2]
strb r5, [r2, #3]
ldrb r4, [r0, #4]
ldrb r5, [r0, #5]
strb r4, [r2, #4]
strb r5, [r2, #5]
ldrb r4, [r0, #6]
ldrb r5, [r0, #7]
add r0, r0, r1
strb r4, [r2, #6]
strb r5, [r2, #7]
add r2, r2, r3
ldrneb r4, [r0]
ldrneb r5, [r0, #1]
bne copy_mem8x8_1_loop
ldmia sp!, {r4 - r5}
;pop {r4-r5}
mov pc, lr
;copy 4 bytes each time
copy_mem8x8_4
ldr r4, [r0]
ldr r5, [r0, #4]
mov r12, #8
copy_mem8x8_4_loop
subs r12, r12, #1
add r0, r0, r1
str r4, [r2]
str r5, [r2, #4]
add r2, r2, r3
ldrne r4, [r0]
ldrne r5, [r0, #4]
bne copy_mem8x8_4_loop
ldmia sp!, {r4 - r5}
;pop {r4-r5}
mov pc, lr
;copy 8 bytes each time
copy_mem8x8_fast
;sub r1, r1, #8
;sub r3, r3, #8
mov r12, #8
copy_mem8x8_fast_loop
ldmia r0, {r4-r5}
;ldm r0, {r4-r5}
add r0, r0, r1
subs r12, r12, #1
stmia r2, {r4-r5}
;stm r2, {r4-r5}
add r2, r2, r3
bne copy_mem8x8_fast_loop
ldmia sp!, {r4-r5}
;pop {r4-r5}
mov pc, lr
ENDP ; |vp9_copy_mem8x8_v6|
END

View File

@@ -1,67 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
EXPORT |vp8_dc_only_idct_add_v6|
AREA |.text|, CODE, READONLY
;void vp8_dc_only_idct_add_v6(short input_dc, unsigned char *pred_ptr,
; unsigned char *dst_ptr, int pitch, int stride)
; r0 input_dc
; r1 pred_ptr
; r2 dest_ptr
; r3 pitch
; sp stride
|vp8_dc_only_idct_add_v6| PROC
stmdb sp!, {r4 - r7, lr}
add r0, r0, #4 ; input_dc += 4
ldr r12, c0x0000FFFF
ldr r4, [r1], r3
ldr r6, [r1], r3
and r0, r12, r0, asr #3 ; input_dc >> 3 + mask
ldr lr, [sp, #20]
orr r0, r0, r0, lsl #16 ; a1 | a1
uxtab16 r5, r0, r4 ; a1+2 | a1+0
uxtab16 r4, r0, r4, ror #8 ; a1+3 | a1+1
uxtab16 r7, r0, r6
uxtab16 r6, r0, r6, ror #8
usat16 r5, #8, r5
usat16 r4, #8, r4
usat16 r7, #8, r7
usat16 r6, #8, r6
orr r5, r5, r4, lsl #8
orr r7, r7, r6, lsl #8
ldr r4, [r1], r3
ldr r6, [r1]
str r5, [r2], lr
str r7, [r2], lr
uxtab16 r5, r0, r4
uxtab16 r4, r0, r4, ror #8
uxtab16 r7, r0, r6
uxtab16 r6, r0, r6, ror #8
usat16 r5, #8, r5
usat16 r4, #8, r4
usat16 r7, #8, r7
usat16 r6, #8, r6
orr r5, r5, r4, lsl #8
orr r7, r7, r6, lsl #8
str r5, [r2], lr
str r7, [r2]
ldmia sp!, {r4 - r7, pc}
ENDP ; |vp8_dc_only_idct_add_v6|
; Constant Pool
c0x0000FFFF DCD 0x0000FFFF
END
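
With only a DC coefficient present, the whole inverse transform collapses to a single bias added to every predicted pixel; the uxtab16/usat16 pairs above apply it four bytes at a time. A scalar sketch (function name assumed):

#include <stdint.h>

static void dc_only_idct_add(int16_t input_dc, const uint8_t *pred,
                             uint8_t *dst, int pitch, int stride) {
    int a1 = (input_dc + 4) >> 3;   /* "input_dc += 4", then ">> 3" */
    for (int r = 0; r < 4; ++r) {
        for (int c = 0; c < 4; ++c) {
            int v = a1 + pred[c];
            dst[c] = v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
        }
        pred += pitch;
        dst += stride;
    }
}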

View File

@@ -1,624 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp9_filter_block2d_first_pass_armv6|
EXPORT |vp9_filter_block2d_first_pass_16x16_armv6|
EXPORT |vp9_filter_block2d_first_pass_8x8_armv6|
EXPORT |vp9_filter_block2d_second_pass_armv6|
EXPORT |vp9_filter4_block2d_second_pass_armv6|
EXPORT |vp9_filter_block2d_first_pass_only_armv6|
EXPORT |vp9_filter_block2d_second_pass_only_armv6|
AREA |.text|, CODE, READONLY ; name this block of code
;-------------------------------------
; r0 unsigned char *src_ptr
; r1 short *output_ptr
; r2 unsigned int src_pixels_per_line
; r3 unsigned int output_width
; stack unsigned int output_height
; stack const short *vp9_filter
;-------------------------------------
; Apply vp9_filter to the input and put the result in the output array. Apply the 6-tap FIR filter with
; the output being a 2-byte value and the input being a 1-byte value.
|vp9_filter_block2d_first_pass_armv6| PROC
stmdb sp!, {r4 - r11, lr}
ldr r11, [sp, #40] ; vp9_filter address
ldr r7, [sp, #36] ; output height
sub r2, r2, r3 ; inside loop increments input array,
; so the height loop only needs to add
; r2 - width to the input pointer
mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
add r12, r3, #16 ; square off the output
sub sp, sp, #4
ldr r4, [r11] ; load up packed filter coefficients
ldr r5, [r11, #4]
ldr r6, [r11, #8]
str r1, [sp] ; push destination to stack
mov r7, r7, lsl #16 ; height is top part of counter
; six tap filter
|height_loop_1st_6|
ldrb r8, [r0, #-2] ; load source data
ldrb r9, [r0, #-1]
ldrb r10, [r0], #2
orr r7, r7, r3, lsr #2 ; construct loop counter
|width_loop_1st_6|
ldrb r11, [r0, #-1]
pkhbt lr, r8, r9, lsl #16 ; r9 | r8
pkhbt r8, r9, r10, lsl #16 ; r10 | r9
ldrb r9, [r0]
smuad lr, lr, r4 ; apply the filter
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
smuad r8, r8, r4
pkhbt r11, r11, r9, lsl #16 ; r9 | r11
smlad lr, r10, r5, lr
ldrb r10, [r0, #1]
smlad r8, r11, r5, r8
ldrb r11, [r0, #2]
sub r7, r7, #1
pkhbt r9, r9, r10, lsl #16 ; r10 | r9
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
smlad lr, r9, r6, lr
smlad r11, r10, r6, r8
ands r10, r7, #0xff ; test loop counter
add lr, lr, #0x40 ; round_shift_and_clamp
ldrneb r8, [r0, #-2] ; load data for next loop
usat lr, #8, lr, asr #7
add r11, r11, #0x40
ldrneb r9, [r0, #-1]
usat r11, #8, r11, asr #7
strh lr, [r1], r12 ; result is transposed and stored, which
; will make second pass filtering easier.
ldrneb r10, [r0], #2
strh r11, [r1], r12
bne width_loop_1st_6
ldr r1, [sp] ; load and update dst address
subs r7, r7, #0x10000
add r0, r0, r2 ; move to next input line
add r1, r1, #2 ; move over to next column
str r1, [sp]
bne height_loop_1st_6
add sp, sp, #4
ldmia sp!, {r4 - r11, pc}
ENDP
; --------------------------
; 16x16 version
; -----------------------------
|vp9_filter_block2d_first_pass_16x16_armv6| PROC
stmdb sp!, {r4 - r11, lr}
ldr r11, [sp, #40] ; vp9_filter address
ldr r7, [sp, #36] ; output height
add r4, r2, #18 ; preload next row
pld [r0, r4]
sub r2, r2, r3 ; inside loop increments input array,
; so the height loop only needs to add
; r2 - width to the input pointer
mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
add r12, r3, #16 ; square off the output
sub sp, sp, #4
ldr r4, [r11] ; load up packed filter coefficients
ldr r5, [r11, #4]
ldr r6, [r11, #8]
str r1, [sp] ; push destination to stack
mov r7, r7, lsl #16 ; height is top part of counter
; six tap filter
|height_loop_1st_16_6|
ldrb r8, [r0, #-2] ; load source data
ldrb r9, [r0, #-1]
ldrb r10, [r0], #2
orr r7, r7, r3, lsr #2 ; construct loop counter
|width_loop_1st_16_6|
ldrb r11, [r0, #-1]
pkhbt lr, r8, r9, lsl #16 ; r9 | r8
pkhbt r8, r9, r10, lsl #16 ; r10 | r9
ldrb r9, [r0]
smuad lr, lr, r4 ; apply the filter
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
smuad r8, r8, r4
pkhbt r11, r11, r9, lsl #16 ; r9 | r11
smlad lr, r10, r5, lr
ldrb r10, [r0, #1]
smlad r8, r11, r5, r8
ldrb r11, [r0, #2]
sub r7, r7, #1
pkhbt r9, r9, r10, lsl #16 ; r10 | r9
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
smlad lr, r9, r6, lr
smlad r11, r10, r6, r8
ands r10, r7, #0xff ; test loop counter
add lr, lr, #0x40 ; round_shift_and_clamp
ldrneb r8, [r0, #-2] ; load data for next loop
usat lr, #8, lr, asr #7
add r11, r11, #0x40
ldrneb r9, [r0, #-1]
usat r11, #8, r11, asr #7
strh lr, [r1], r12 ; result is transposed and stored, which
; will make second pass filtering easier.
ldrneb r10, [r0], #2
strh r11, [r1], r12
bne width_loop_1st_16_6
ldr r1, [sp] ; load and update dst address
subs r7, r7, #0x10000
add r0, r0, r2 ; move to next input line
add r11, r2, #34 ; adding back block width(=16)
pld [r0, r11] ; preload next row
add r1, r1, #2 ; move over to next column
str r1, [sp]
bne height_loop_1st_16_6
add sp, sp, #4
ldmia sp!, {r4 - r11, pc}
ENDP
; --------------------------
; 8x8 version
; -----------------------------
|vp9_filter_block2d_first_pass_8x8_armv6| PROC
stmdb sp!, {r4 - r11, lr}
ldr r11, [sp, #40] ; vp9_filter address
ldr r7, [sp, #36] ; output height
add r4, r2, #10 ; preload next row
pld [r0, r4]
sub r2, r2, r3 ; inside loop increments input array,
; so the height loop only needs to add
; r2 - width to the input pointer
mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
add r12, r3, #16 ; square off the output
sub sp, sp, #4
ldr r4, [r11] ; load up packed filter coefficients
ldr r5, [r11, #4]
ldr r6, [r11, #8]
str r1, [sp] ; push destination to stack
mov r7, r7, lsl #16 ; height is top part of counter
; six tap filter
|height_loop_1st_8_6|
ldrb r8, [r0, #-2] ; load source data
ldrb r9, [r0, #-1]
ldrb r10, [r0], #2
orr r7, r7, r3, lsr #2 ; construct loop counter
|width_loop_1st_8_6|
ldrb r11, [r0, #-1]
pkhbt lr, r8, r9, lsl #16 ; r9 | r8
pkhbt r8, r9, r10, lsl #16 ; r10 | r9
ldrb r9, [r0]
smuad lr, lr, r4 ; apply the filter
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
smuad r8, r8, r4
pkhbt r11, r11, r9, lsl #16 ; r9 | r11
smlad lr, r10, r5, lr
ldrb r10, [r0, #1]
smlad r8, r11, r5, r8
ldrb r11, [r0, #2]
sub r7, r7, #1
pkhbt r9, r9, r10, lsl #16 ; r10 | r9
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
smlad lr, r9, r6, lr
smlad r11, r10, r6, r8
ands r10, r7, #0xff ; test loop counter
add lr, lr, #0x40 ; round_shift_and_clamp
ldrneb r8, [r0, #-2] ; load data for next loop
usat lr, #8, lr, asr #7
add r11, r11, #0x40
ldrneb r9, [r0, #-1]
usat r11, #8, r11, asr #7
strh lr, [r1], r12 ; result is transposed and stored, which
; will make second pass filtering easier.
ldrneb r10, [r0], #2
strh r11, [r1], r12
bne width_loop_1st_8_6
ldr r1, [sp] ; load and update dst address
subs r7, r7, #0x10000
add r0, r0, r2 ; move to next input line
add r11, r2, #18 ; adding back block width(=8)
pld [r0, r11] ; preload next row
add r1, r1, #2 ; move over to next column
str r1, [sp]
bne height_loop_1st_8_6
add sp, sp, #4
ldmia sp!, {r4 - r11, pc}
ENDP
;---------------------------------
; r0 short *src_ptr,
; r1 unsigned char *output_ptr,
; r2 unsigned int output_pitch,
; r3 unsigned int cnt,
; stack const short *vp9_filter
;---------------------------------
|vp9_filter_block2d_second_pass_armv6| PROC
stmdb sp!, {r4 - r11, lr}
ldr r11, [sp, #36] ; vp9_filter address
sub sp, sp, #4
mov r7, r3, lsl #16 ; height is top part of counter
str r1, [sp] ; push destination to stack
ldr r4, [r11] ; load up packed filter coefficients
ldr r5, [r11, #4]
ldr r6, [r11, #8]
pkhbt r12, r5, r4 ; pack the filter differently
pkhbt r11, r6, r5
sub r0, r0, #4 ; offset input buffer
|height_loop_2nd|
ldr r8, [r0] ; load the data
ldr r9, [r0, #4]
orr r7, r7, r3, lsr #1 ; loop counter
|width_loop_2nd|
smuad lr, r4, r8 ; apply filter
sub r7, r7, #1
smulbt r8, r4, r8
ldr r10, [r0, #8]
smlad lr, r5, r9, lr
smladx r8, r12, r9, r8
ldrh r9, [r0, #12]
smlad lr, r6, r10, lr
smladx r8, r11, r10, r8
add r0, r0, #4
smlatb r10, r6, r9, r8
add lr, lr, #0x40 ; round_shift_and_clamp
ands r8, r7, #0xff
usat lr, #8, lr, asr #7
add r10, r10, #0x40
strb lr, [r1], r2 ; the result is transposed back and stored
usat r10, #8, r10, asr #7
ldrne r8, [r0] ; load data for next loop
ldrne r9, [r0, #4]
strb r10, [r1], r2
bne width_loop_2nd
ldr r1, [sp] ; update dst for next loop
subs r7, r7, #0x10000
add r0, r0, #16 ; update src for next loop
add r1, r1, #1
str r1, [sp]
bne height_loop_2nd
add sp, sp, #4
ldmia sp!, {r4 - r11, pc}
ENDP
;---------------------------------
; r0 short *src_ptr,
; r1 unsigned char *output_ptr,
; r2 unsigned int output_pitch,
; r3 unsigned int cnt,
; stack const short *vp9_filter
;---------------------------------
|vp9_filter4_block2d_second_pass_armv6| PROC
stmdb sp!, {r4 - r11, lr}
ldr r11, [sp, #36] ; vp9_filter address
mov r7, r3, lsl #16 ; height is top part of counter
ldr r4, [r11] ; load up packed filter coefficients
add lr, r1, r3 ; save final destination pointer
ldr r5, [r11, #4]
ldr r6, [r11, #8]
pkhbt r12, r5, r4 ; pack the filter differently
pkhbt r11, r6, r5
mov r4, #0x40 ; rounding factor (for smlad{x})
|height_loop_2nd_4|
ldrd r8, [r0, #-4] ; load the data
orr r7, r7, r3, lsr #1 ; loop counter
|width_loop_2nd_4|
ldr r10, [r0, #4]!
smladx r6, r9, r12, r4 ; apply filter
pkhbt r8, r9, r8
smlad r5, r8, r12, r4
pkhbt r8, r10, r9
smladx r6, r10, r11, r6
sub r7, r7, #1
smlad r5, r8, r11, r5
mov r8, r9 ; shift the data for the next loop
mov r9, r10
usat r6, #8, r6, asr #7 ; shift and clamp
usat r5, #8, r5, asr #7
strb r5, [r1], r2 ; the result is transposed back and stored
tst r7, #0xff
strb r6, [r1], r2
bne width_loop_2nd_4
subs r7, r7, #0x10000
add r0, r0, #16 ; update src for next loop
sub r1, lr, r7, lsr #16 ; update dst for next loop
bne height_loop_2nd_4
ldmia sp!, {r4 - r11, pc}
ENDP
;------------------------------------
; r0 unsigned char *src_ptr
; r1 unsigned char *output_ptr,
; r2 unsigned int src_pixels_per_line
; r3 unsigned int cnt,
; stack unsigned int output_pitch,
; stack const short *vp9_filter
;------------------------------------
|vp9_filter_block2d_first_pass_only_armv6| PROC
stmdb sp!, {r4 - r11, lr}
add r7, r2, r3 ; preload next row
add r7, r7, #2
pld [r0, r7]
ldr r4, [sp, #36] ; output pitch
ldr r11, [sp, #40] ; HFilter address
sub sp, sp, #8
mov r7, r3
sub r2, r2, r3 ; inside loop increments input array,
; so the height loop only needs to add
; r2 - width to the input pointer
sub r4, r4, r3
str r4, [sp] ; save modified output pitch
str r2, [sp, #4]
mov r2, #0x40
ldr r4, [r11] ; load up packed filter coefficients
ldr r5, [r11, #4]
ldr r6, [r11, #8]
; six tap filter
|height_loop_1st_only_6|
ldrb r8, [r0, #-2] ; load data
ldrb r9, [r0, #-1]
ldrb r10, [r0], #2
mov r12, r3, lsr #1 ; loop counter
|width_loop_1st_only_6|
ldrb r11, [r0, #-1]
pkhbt lr, r8, r9, lsl #16 ; r9 | r8
pkhbt r8, r9, r10, lsl #16 ; r10 | r9
ldrb r9, [r0]
;; smuad lr, lr, r4
smlad lr, lr, r4, r2
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
;; smuad r8, r8, r4
smlad r8, r8, r4, r2
pkhbt r11, r11, r9, lsl #16 ; r9 | r11
smlad lr, r10, r5, lr
ldrb r10, [r0, #1]
smlad r8, r11, r5, r8
ldrb r11, [r0, #2]
subs r12, r12, #1
pkhbt r9, r9, r10, lsl #16 ; r10 | r9
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
smlad lr, r9, r6, lr
smlad r10, r10, r6, r8
;; add lr, lr, #0x40 ; round_shift_and_clamp
ldrneb r8, [r0, #-2] ; load data for next loop
usat lr, #8, lr, asr #7
;; add r10, r10, #0x40
strb lr, [r1], #1 ; store the result
usat r10, #8, r10, asr #7
ldrneb r9, [r0, #-1]
strb r10, [r1], #1
ldrneb r10, [r0], #2
bne width_loop_1st_only_6
ldr lr, [sp] ; load back output pitch
ldr r12, [sp, #4] ; load back modified src pitch
subs r7, r7, #1
add r0, r0, r12 ; update src for next loop
add r11, r12, r3 ; preload next row
add r11, r11, #2
pld [r0, r11]
add r1, r1, lr ; update dst for next loop
bne height_loop_1st_only_6
add sp, sp, #8
ldmia sp!, {r4 - r11, pc}
ENDP ; |vp9_filter_block2d_first_pass_only_armv6|
;------------------------------------
; r0 unsigned char *src_ptr,
; r1 unsigned char *output_ptr,
; r2 unsigned int src_pixels_per_line
; r3 unsigned int cnt,
; stack unsigned int output_pitch,
; stack const short *vp9_filter
;------------------------------------
|vp9_filter_block2d_second_pass_only_armv6| PROC
stmdb sp!, {r4 - r11, lr}
ldr r11, [sp, #40] ; VFilter address
ldr r12, [sp, #36] ; output pitch
mov r7, r3, lsl #16 ; height is top part of counter
sub r0, r0, r2, lsl #1 ; need 6 elements for filtering, 2 before, 3 after
sub sp, sp, #8
ldr r4, [r11] ; load up packed filter coefficients
ldr r5, [r11, #4]
ldr r6, [r11, #8]
str r0, [sp] ; save r0 to stack
str r1, [sp, #4] ; save dst to stack
; six tap filter
|width_loop_2nd_only_6|
ldrb r8, [r0], r2 ; load data
orr r7, r7, r3 ; loop counter
ldrb r9, [r0], r2
ldrb r10, [r0], r2
|height_loop_2nd_only_6|
; filter first column in this inner loop, then move to the next column.
ldrb r11, [r0], r2
pkhbt lr, r8, r9, lsl #16 ; r9 | r8
pkhbt r8, r9, r10, lsl #16 ; r10 | r9
ldrb r9, [r0], r2
smuad lr, lr, r4
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
smuad r8, r8, r4
pkhbt r11, r11, r9, lsl #16 ; r9 | r11
smlad lr, r10, r5, lr
ldrb r10, [r0], r2
smlad r8, r11, r5, r8
ldrb r11, [r0]
sub r7, r7, #2
sub r0, r0, r2, lsl #2
pkhbt r9, r9, r10, lsl #16 ; r10 | r9
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
smlad lr, r9, r6, lr
smlad r10, r10, r6, r8
ands r9, r7, #0xff
add lr, lr, #0x40 ; round_shift_and_clamp
ldrneb r8, [r0], r2 ; load data for next loop
usat lr, #8, lr, asr #7
add r10, r10, #0x40
strb lr, [r1], r12 ; store the result for the column
usat r10, #8, r10, asr #7
ldrneb r9, [r0], r2
strb r10, [r1], r12
ldrneb r10, [r0], r2
bne height_loop_2nd_only_6
ldr r0, [sp]
ldr r1, [sp, #4]
subs r7, r7, #0x10000
add r0, r0, #1 ; move to filter next column
str r0, [sp]
add r1, r1, #1
str r1, [sp, #4]
bne width_loop_2nd_only_6
add sp, sp, #8
ldmia sp!, {r4 - r11, pc}
ENDP ; |vp9_filter_block2d_second_pass_only_armv6|
END
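
Every first-pass variant above computes the same per-pixel six-tap FIR; they differ only in loop structure, preload distance, and whether results land in a transposed 16-bit intermediate or go straight to 8-bit output. A minimal scalar sketch of one output pixel (not libvpx reference code; the function name is assumed):

#include <stdint.h>

/* Six-tap FIR over src[-2..3], then the round-shift-and-clamp the
 * assembly spells as "add #0x40; usat #8, asr #7". */
static uint8_t sixtap_pixel(const uint8_t *src, const int16_t f[6]) {
    int sum = 0;
    for (int t = 0; t < 6; ++t)
        sum += src[t - 2] * f[t];
    sum = (sum + 64) >> 7;
    return sum < 0 ? 0 : sum > 255 ? 255 : (uint8_t)sum;
}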

View File

@@ -1,345 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
; r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r14
EXPORT |vp8_short_idct4x4llm_1_v6|
EXPORT |vp8_short_idct4x4llm_v6|
EXPORT |vp8_short_idct4x4llm_v6_scott|
EXPORT |vp8_short_idct4x4llm_v6_dual|
AREA |.text|, CODE, READONLY
;********************************************************************************
;* void short_idct4x4llm_1_v6(INT16 * input, INT16 * output, INT32 pitch)
;* r0 INT16 * input
;* r1 INT16 * output
;* r2 INT32 pitch
;* bench: 3/5
;********************************************************************************
|vp8_short_idct4x4llm_1_v6| PROC ; cycles in out pit
;
ldrsh r0, [r0] ; load input[0] 1, r0 un 2
add r0, r0, #4 ; 1 +4
stmdb sp!, {r4, r5, lr} ; make room for wide writes 1 backup
mov r0, r0, asr #3 ; (input[0] + 4) >> 3 1, r0 req`d ^1 >> 3
pkhbt r4, r0, r0, lsl #16 ; pack r0 into r4 1, r0 req`d ^1 pack
mov r5, r4 ; expand expand
strd r4, [r1], r2 ; *output = r0, post inc 1
strd r4, [r1], r2 ; 1
strd r4, [r1], r2 ; 1
strd r4, [r1] ; 1
;
ldmia sp!, {r4, r5, pc} ; replace vars, return restore
ENDP ; |vp8_short_idct4x4llm_1_v6|
;********************************************************************************
;********************************************************************************
;********************************************************************************
;********************************************************************************
;* void short_idct4x4llm_v6(INT16 * input, INT16 * output, INT32 pitch)
;* r0 INT16 * input
;* r1 INT16 * output
;* r2 INT32 pitch
;* bench:
;********************************************************************************
|vp8_short_idct4x4llm_v6| PROC ; cycles in out pit
;
stmdb sp!, {r4-r11, lr} ; backup registers 1 backup
;
mov r4, #0x00004E00 ; 1 cst
orr r4, r4, #0x0000007B ; cospi8sqrt2minus1
mov r5, #0x00008A00 ; 1 cst
orr r5, r5, #0x0000008C ; sinpi8sqrt2
;
mov r6, #4 ; i=4 1 i
loop1 ;
ldrsh r12, [r0, #8] ; input[4] 1, r12 unavail 2 [4]
ldrsh r3, [r0, #24] ; input[12] 1, r3 unavail 2 [12]
ldrsh r8, [r0, #16] ; input[8] 1, r8 unavail 2 [8]
ldrsh r7, [r0], #0x2 ; input[0] 1, r7 unavail 2 ++ [0]
smulwb r10, r5, r12 ; ([4] * sinpi8sqrt2) >> 16 1, r10 un 2, r12/r5 ^1 t1
smulwb r11, r4, r3 ; ([12] * cospi8sqrt2minus1) >> 16 1, r11 un 2, r3/r4 ^1 t2
add r9, r7, r8 ; a1 = [0] + [8] 1 a1
sub r7, r7, r8 ; b1 = [0] - [8] 1 b1
add r11, r3, r11 ; temp2 1
rsb r11, r11, r10 ; c1 = temp1 - temp2 1 c1
smulwb r3, r5, r3 ; ([12] * sinpi8sqrt2) >> 16 1, r3 un 2, r3/r5 ^ 1 t2
smulwb r10, r4, r12 ; ([4] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r12/r4 ^1 t1
add r8, r7, r11 ; b1 + c1 1 b+c
strh r8, [r1, r2] ; out[pitch] = b1+c1 1
sub r7, r7, r11 ; b1 - c1 1 b-c
add r10, r12, r10 ; temp1 1
add r3, r10, r3 ; d1 = temp1 + temp2 1 d1
add r10, r9, r3 ; a1 + d1 1 a+d
sub r3, r9, r3 ; a1 - d1 1 a-d
add r8, r2, r2 ; pitch * 2 1 p*2
strh r7, [r1, r8] ; out[pitch*2] = b1-c1 1
add r7, r2, r2, lsl #1 ; pitch * 3 1 p*3
strh r3, [r1, r7] ; out[pitch*3] = a1-d1 1
subs r6, r6, #1 ; i-- 1 --
strh r10, [r1], #0x2 ; out[0] = a1+d1 1 ++
bne loop1 ; if i>0, continue
;
sub r1, r1, #8 ; set up out for next loop 1 -4
; for this iteration, input=prev output
mov r6, #4 ; i=4 1 i
; b returnfull
loop2 ;
ldrsh r11, [r1, #2] ; input[1] 1, r11 un 2 [1]
ldrsh r8, [r1, #6] ; input[3] 1, r8 un 2 [3]
ldrsh r3, [r1, #4] ; input[2] 1, r3 un 2 [2]
ldrsh r0, [r1] ; input[0] 1, r0 un 2 [0]
smulwb r9, r5, r11 ; ([1] * sinpi8sqrt2) >> 16 1, r9 un 2, r5/r11 ^1 t1
smulwb r10, r4, r8 ; ([3] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r8 ^1 t2
add r7, r0, r3 ; a1 = [0] + [2] 1 a1
sub r0, r0, r3 ; b1 = [0] - [2] 1 b1
add r10, r8, r10 ; temp2 1
rsb r9, r10, r9 ; c1 = temp1 - temp2 1 c1
smulwb r8, r5, r8 ; ([3] * sinpi8sqrt2) >> 16 1, r8 un 2, r5/r8 ^1 t2
smulwb r10, r4, r11 ; ([1] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r11 ^1 t1
add r3, r0, r9 ; b1+c1 1 b+c
add r3, r3, #4 ; b1+c1+4 1 +4
add r10, r11, r10 ; temp1 1
mov r3, r3, asr #3 ; b1+c1+4 >> 3 1, r3 ^1 >>3
strh r3, [r1, #2] ; out[1] = b1+c1 1
add r10, r10, r8 ; d1 = temp1 + temp2 1 d1
add r3, r7, r10 ; a1+d1 1 a+d
add r3, r3, #4 ; a1+d1+4 1 +4
sub r7, r7, r10 ; a1-d1 1 a-d
add r7, r7, #4 ; a1-d1+4 1 +4
mov r3, r3, asr #3 ; a1+d1+4 >> 3 1, r3 ^1 >>3
mov r7, r7, asr #3 ; a1-d1+4 >> 3 1, r7 ^1 >>3
strh r7, [r1, #6] ; out[3] = a1-d1 1
sub r0, r0, r9 ; b1-c1 1 b-c
add r0, r0, #4 ; b1-c1+4 1 +4
subs r6, r6, #1 ; i-- 1 --
mov r0, r0, asr #3 ; b1-c1+4 >> 3 1, r0 ^1 >>3
strh r0, [r1, #4] ; out[2] = b1-c1 1
strh r3, [r1], r2 ; out[0] = a1+d1 1
; add r1, r1, r2 ; out += pitch 1 ++
bne loop2 ; if i>0, continue
returnfull ;
ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
ENDP
;********************************************************************************
;********************************************************************************
;********************************************************************************
;********************************************************************************
;* void short_idct4x4llm_v6_scott(INT16 * input, INT16 * output, INT32 pitch)
;* r0 INT16 * input
;* r1 INT16 * output
;* r2 INT32 pitch
;* bench:
;********************************************************************************
|vp8_short_idct4x4llm_v6_scott| PROC ; cycles in out pit
; mov r0, #0 ;
; ldr r0, [r0] ;
stmdb sp!, {r4 - r11, lr} ; backup registers 1 backup
;
mov r3, #0x00004E00 ; cos
orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
mov r4, #0x00008A00 ; sin
orr r4, r4, #0x0000008C ; sinpi8sqrt2
;
mov r5, #0x2 ; i i
;
short_idct4x4llm_v6_scott_loop1 ;
ldr r10, [r0, #(4*2)] ; i5 | i4 5,4
ldr r11, [r0, #(12*2)] ; i13 | i12 13,12
;
smulwb r6, r4, r10 ; ((ip[4] * sinpi8sqrt2) >> 16) lt1
smulwb r7, r3, r11 ; ((ip[12] * cospi8sqrt2minus1) >> 16) lt2
;
smulwb r12, r3, r10 ; ((ip[4] * cospi8sqrt2minus1) >> 16) l2t2
smulwb r14, r4, r11 ; ((ip[12] * sinpi8sqrt2) >> 16) l2t1
;
add r6, r6, r7 ; partial c1 lt1-lt2
add r12, r12, r14 ; partial d1 l2t2+l2t1
;
smulwt r14, r4, r10 ; ((ip[5] * sinpi8sqrt2) >> 16) ht1
smulwt r7, r3, r11 ; ((ip[13] * cospi8sqrt2minus1) >> 16) ht2
;
smulwt r8, r3, r10 ; ((ip[5] * cospi8sqrt2minus1) >> 16) h2t1
smulwt r9, r4, r11 ; ((ip[13] * sinpi8sqrt2) >> 16) h2t2
;
add r7, r14, r7 ; partial c1_2 ht1+ht2
sub r8, r8, r9 ; partial d1_2 h2t1-h2t2
;
pkhbt r6, r6, r7, lsl #16 ; partial c1_2 | partial c1_1 pack
pkhbt r12, r12, r8, lsl #16 ; partial d1_2 | partial d1_1 pack
;
usub16 r6, r6, r10 ; c1_2 | c1_1 c
uadd16 r12, r12, r11 ; d1_2 | d1_1 d
;
ldr r10, [r0, #0] ; i1 | i0 1,0
ldr r11, [r0, #(8*2)] ; i9 | i10 9,10
;
;;;;;; add r0, r0, #0x4 ; +4
;;;;;; add r1, r1, #0x4 ; +4
;
uadd16 r8, r10, r11 ; i1 + i9 | i0 + i8 aka a1 a
usub16 r9, r10, r11 ; i1 - i9 | i0 - i8 aka b1 b
;
uadd16 r7, r8, r12 ; a1 + d1 pair a+d
usub16 r14, r8, r12 ; a1 - d1 pair a-d
;
str r7, [r1] ; op[0] = a1 + d1
str r14, [r1, r2] ; op[pitch*3] = a1 - d1
;
add r0, r0, #0x4 ; op[pitch] = b1 + c1 ++
add r1, r1, #0x4 ; op[pitch*2] = b1 - c1 ++
;
subs r5, r5, #0x1 ; --
bne short_idct4x4llm_v6_scott_loop1 ;
;
sub r1, r1, #16 ; reset output ptr
mov r5, #0x4 ;
mov r0, r1 ; input = output
;
short_idct4x4llm_v6_scott_loop2 ;
;
subs r5, r5, #0x1 ;
bne short_idct4x4llm_v6_scott_loop2 ;
;
ldmia sp!, {r4 - r11, pc} ;
ENDP ;
;
;********************************************************************************
;********************************************************************************
;********************************************************************************
;********************************************************************************
;* void short_idct4x4llm_v6_dual(INT16 * input, INT16 * output, INT32 pitch)
;* r0 INT16 * input
;* r1 INT16 * output
;* r2 INT32 pitch
;* bench:
;********************************************************************************
|vp8_short_idct4x4llm_v6_dual| PROC ; cycles in out pit
;
stmdb sp!, {r4-r11, lr} ; backup registers 1 backup
mov r3, #0x00004E00 ; cos
orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
mov r4, #0x00008A00 ; sin
orr r4, r4, #0x0000008C ; sinpi8sqrt2
mov r5, #0x2 ; i=2 i
loop1_dual
ldr r6, [r0, #(4*2)] ; i5 | i4 5|4
ldr r12, [r0, #(12*2)] ; i13 | i12 13|12
ldr r14, [r0, #(8*2)] ; i9 | i8 9|8
smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 4c
smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 4s
pkhbt r7, r7, r9, lsl #16 ; 5c | 4c
smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 13c
pkhbt r8, r8, r10, lsl #16 ; 5s | 4s
uadd16 r6, r6, r7 ; 5c+5 | 4c+4
smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 13s
smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 12c
smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 12s
subs r5, r5, #0x1 ; i-- --
pkhbt r9, r9, r11, lsl #16 ; 13c | 12c
ldr r11, [r0], #0x4 ; i1 | i0 ++ 1|0
pkhbt r10, r10, r7, lsl #16 ; 13s | 12s
uadd16 r7, r12, r9 ; 13c+13 | 12c+12
usub16 r7, r8, r7 ; c c
uadd16 r6, r6, r10 ; d d
uadd16 r10, r11, r14 ; a a
usub16 r8, r11, r14 ; b b
uadd16 r9, r10, r6 ; a+d a+d
usub16 r10, r10, r6 ; a-d a-d
uadd16 r6, r8, r7 ; b+c b+c
usub16 r7, r8, r7 ; b-c b-c
str r6, [r1, r2] ; o5 | o4
add r6, r2, r2 ; pitch * 2 p2
str r7, [r1, r6] ; o9 | o8
add r6, r6, r2 ; pitch * 3 p3
str r10, [r1, r6] ; o13 | o12
str r9, [r1], #0x4 ; o1 | o0 ++
bne loop1_dual ;
mov r5, #0x2 ; i=2 i
sub r0, r1, #8 ; reset input/output i/o
loop2_dual
ldr r6, [r0, r2] ; i5 | i4 5|4
ldr r1, [r0] ; i1 | i0 1|0
ldr r12, [r0, #0x4] ; i3 | i2 3|2
add r14, r2, #0x4 ; pitch + 2 p+2
ldr r14, [r0, r14] ; i7 | i6 7|6
smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
smulwt r7, r3, r1 ; (ip[1] * cospi8sqrt2minus1) >> 16 1c
smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
smulwt r8, r4, r1 ; (ip[1] * sinpi8sqrt2) >> 16 1s
pkhbt r11, r6, r1, lsl #16 ; i0 | i4 0|4
pkhbt r7, r9, r7, lsl #16 ; 1c | 5c
pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 (c) tc1
pkhtb r1, r1, r6, asr #16 ; i1 | i5 1|5
uadd16 r1, r7, r1 ; 1c+1 | 5c+5 = temp2 (d) td2
pkhbt r9, r14, r12, lsl #16 ; i2 | i6 2|6
uadd16 r10, r11, r9 ; a a
usub16 r9, r11, r9 ; b b
pkhtb r6, r12, r14, asr #16 ; i3 | i7 3|7
subs r5, r5, #0x1 ; i-- --
smulwt r7, r3, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16 3c
smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 3s
smulwb r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 7c
smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 7s
pkhbt r7, r12, r7, lsl #16 ; 3c | 7c
pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 (d) td1
uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 (c) tc2
usub16 r12, r8, r6 ; c (o1 | o5) c
uadd16 r6, r11, r1 ; d (o3 | o7) d
uadd16 r7, r10, r6 ; a+d a+d
mov r8, #0x4 ; set up 4's 4
orr r8, r8, #0x40000 ; 4|4
usub16 r6, r10, r6 ; a-d a-d
uadd16 r6, r6, r8 ; a-d+4 3|7
uadd16 r7, r7, r8 ; a+d+4 0|4
uadd16 r10, r9, r12 ; b+c b+c
usub16 r1, r9, r12 ; b-c b-c
uadd16 r10, r10, r8 ; b+c+4 1|5
uadd16 r1, r1, r8 ; b-c+4 2|6
mov r8, r10, asr #19 ; o1 >> 3
strh r8, [r0, #2] ; o1
mov r8, r1, asr #19 ; o2 >> 3
strh r8, [r0, #4] ; o2
mov r8, r6, asr #19 ; o3 >> 3
strh r8, [r0, #6] ; o3
mov r8, r7, asr #19 ; o0 >> 3
strh r8, [r0], r2 ; o0 +p
sxth r10, r10 ;
mov r8, r10, asr #3 ; o5 >> 3
strh r8, [r0, #2] ; o5
sxth r1, r1 ;
mov r8, r1, asr #3 ; o6 >> 3
strh r8, [r0, #4] ; o6
sxth r6, r6 ;
mov r8, r6, asr #3 ; o7 >> 3
strh r8, [r0, #6] ; o7
sxth r7, r7 ;
mov r8, r7, asr #3 ; o4 >> 3
strh r8, [r0], r2 ; o4 +p
;;;;; subs r5, r5, #0x1 ; i-- --
bne loop2_dual ;
;
ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
ENDP
END
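
All four variants implement the same LLM 4x4 inverse DCT built from the constants loaded at the top: cospi8sqrt2minus1 = 0x4E7B (20091) and sinpi8sqrt2 = 0x8A8C (35468), each applied as (x * k) >> 16. A scalar sketch following the a1/b1/c1/d1 names in the comments (a sketch, with pitch taken in int16_t units for simplicity):

#include <stdint.h>

enum { cospi8sqrt2minus1 = 20091, sinpi8sqrt2 = 35468 };

static void short_idct4x4llm(const int16_t *in, int16_t *out, int pitch) {
    int16_t tmp[16];
    for (int i = 0; i < 4; ++i) {   /* columns */
        int a1 = in[i] + in[i + 8];
        int b1 = in[i] - in[i + 8];
        int c1 = ((in[i + 4] * sinpi8sqrt2) >> 16) -
                 (in[i + 12] + ((in[i + 12] * cospi8sqrt2minus1) >> 16));
        int d1 = (in[i + 4] + ((in[i + 4] * cospi8sqrt2minus1) >> 16)) +
                 ((in[i + 12] * sinpi8sqrt2) >> 16);
        tmp[i]      = (int16_t)(a1 + d1);
        tmp[i + 4]  = (int16_t)(b1 + c1);
        tmp[i + 8]  = (int16_t)(b1 - c1);
        tmp[i + 12] = (int16_t)(a1 - d1);
    }
    for (int i = 0; i < 4; ++i) {   /* rows, with the final (+4) >> 3 */
        const int16_t *ip = tmp + 4 * i;
        int a1 = ip[0] + ip[2];
        int b1 = ip[0] - ip[2];
        int c1 = ((ip[1] * sinpi8sqrt2) >> 16) -
                 (ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16));
        int d1 = (ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16)) +
                 ((ip[3] * sinpi8sqrt2) >> 16);
        out[pitch * i + 0] = (int16_t)((a1 + d1 + 4) >> 3);
        out[pitch * i + 1] = (int16_t)((b1 + c1 + 4) >> 3);
        out[pitch * i + 2] = (int16_t)((b1 - c1 + 4) >> 3);
        out[pitch * i + 3] = (int16_t)((a1 - d1 + 4) >> 3);
    }
}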

View File

@@ -1,152 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_short_inv_walsh4x4_v6|
EXPORT |vp8_short_inv_walsh4x4_1_v6|
ARM
REQUIRE8
PRESERVE8
AREA |.text|, CODE, READONLY ; name this block of code
;short vp8_short_inv_walsh4x4_v6(short *input, short *output)
|vp8_short_inv_walsh4x4_v6| PROC
stmdb sp!, {r4 - r11, lr}
ldr r2, [r0], #4 ; [1 | 0]
ldr r3, [r0], #4 ; [3 | 2]
ldr r4, [r0], #4 ; [5 | 4]
ldr r5, [r0], #4 ; [7 | 6]
ldr r6, [r0], #4 ; [9 | 8]
ldr r7, [r0], #4 ; [11 | 10]
ldr r8, [r0], #4 ; [13 | 12]
ldr r9, [r0] ; [15 | 14]
qadd16 r10, r2, r8 ; a1 [1+13 | 0+12]
qadd16 r11, r4, r6 ; b1 [5+9 | 4+8]
qsub16 r12, r4, r6 ; c1 [5-9 | 4-8]
qsub16 lr, r2, r8 ; d1 [1-13 | 0-12]
qadd16 r2, r10, r11 ; a1 + b1 [1 | 0]
qadd16 r4, r12, lr ; c1 + d1 [5 | 4]
qsub16 r6, r10, r11 ; a1 - b1 [9 | 8]
qsub16 r8, lr, r12 ; d1 - c1 [13 | 12]
qadd16 r10, r3, r9 ; a1 [3+15 | 2+14]
qadd16 r11, r5, r7 ; b1 [7+11 | 6+10]
qsub16 r12, r5, r7 ; c1 [7-11 | 6-10]
qsub16 lr, r3, r9 ; d1 [3-15 | 2-14]
qadd16 r3, r10, r11 ; a1 + b1 [3 | 2]
qadd16 r5, r12, lr ; c1 + d1 [7 | 6]
qsub16 r7, r10, r11 ; a1 - b1 [11 | 10]
qsub16 r9, lr, r12 ; d1 - c1 [15 | 14]
; first transform complete
qsubaddx r10, r2, r3 ; [c1|a1] [1-2 | 0+3]
qaddsubx r11, r2, r3 ; [b1|d1] [1+2 | 0-3]
qsubaddx r12, r4, r5 ; [c1|a1] [5-6 | 4+7]
qaddsubx lr, r4, r5 ; [b1|d1] [5+6 | 4-7]
qaddsubx r2, r10, r11 ; [b2|c2] [c1+d1 | a1-b1]
qaddsubx r3, r11, r10 ; [a2|d2] [b1+a1 | d1-c1]
ldr r10, c0x00030003
qaddsubx r4, r12, lr ; [b2|c2] [c1+d1 | a1-b1]
qaddsubx r5, lr, r12 ; [a2|d2] [b1+a1 | d1-c1]
qadd16 r2, r2, r10 ; [b2+3|c2+3]
qadd16 r3, r3, r10 ; [a2+3|d2+3]
qadd16 r4, r4, r10 ; [b2+3|c2+3]
qadd16 r5, r5, r10 ; [a2+3|d2+3]
asr r12, r2, #3 ; [1 | x]
pkhtb r12, r12, r3, asr #19; [1 | 0]
lsl lr, r3, #16 ; [~3 | x]
lsl r2, r2, #16 ; [~2 | x]
asr lr, lr, #3 ; [3 | x]
pkhtb lr, lr, r2, asr #19 ; [3 | 2]
asr r2, r4, #3 ; [5 | x]
pkhtb r2, r2, r5, asr #19 ; [5 | 4]
lsl r3, r5, #16 ; [~7 | x]
lsl r4, r4, #16 ; [~6 | x]
asr r3, r3, #3 ; [7 | x]
pkhtb r3, r3, r4, asr #19 ; [7 | 6]
str r12, [r1], #4
str lr, [r1], #4
str r2, [r1], #4
str r3, [r1], #4
qsubaddx r2, r6, r7 ; [c1|a1] [9-10 | 8+11]
qaddsubx r3, r6, r7 ; [b1|d1] [9+10 | 8-11]
qsubaddx r4, r8, r9 ; [c1|a1] [13-14 | 12+15]
qaddsubx r5, r8, r9 ; [b1|d1] [13+14 | 12-15]
qaddsubx r6, r2, r3 ; [b2|c2] [c1+d1 | a1-b1]
qaddsubx r7, r3, r2 ; [a2|d2] [b1+a1 | d1-c1]
qaddsubx r8, r4, r5 ; [b2|c2] [c1+d1 | a1-b1]
qaddsubx r9, r5, r4 ; [a2|d2] [b1+a1 | d1-c1]
qadd16 r6, r6, r10 ; [b2+3|c2+3]
qadd16 r7, r7, r10 ; [a2+3|d2+3]
qadd16 r8, r8, r10 ; [b2+3|c2+3]
qadd16 r9, r9, r10 ; [a2+3|d2+3]
asr r2, r6, #3 ; [9 | x]
pkhtb r2, r2, r7, asr #19 ; [9 | 8]
lsl r3, r7, #16 ; [~11| x]
lsl r4, r6, #16 ; [~10| x]
asr r3, r3, #3 ; [11 | x]
pkhtb r3, r3, r4, asr #19 ; [11 | 10]
asr r4, r8, #3 ; [13 | x]
pkhtb r4, r4, r9, asr #19 ; [13 | 12]
lsl r5, r9, #16 ; [~15| x]
lsl r6, r8, #16 ; [~14| x]
asr r5, r5, #3 ; [15 | x]
pkhtb r5, r5, r6, asr #19 ; [15 | 14]
str r2, [r1], #4
str r3, [r1], #4
str r4, [r1], #4
str r5, [r1]
ldmia sp!, {r4 - r11, pc}
ENDP ; |vp8_short_inv_walsh4x4_v6|
;short vp8_short_inv_walsh4x4_1_v6(short *input, short *output)
|vp8_short_inv_walsh4x4_1_v6| PROC
ldrsh r2, [r0] ; [0]
add r2, r2, #3 ; [0] + 3
asr r2, r2, #3 ; a1 ([0]+3) >> 3
lsl r2, r2, #16 ; [a1 | x]
orr r2, r2, r2, lsr #16 ; [a1 | a1]
str r2, [r1], #4
str r2, [r1], #4
str r2, [r1], #4
str r2, [r1], #4
str r2, [r1], #4
str r2, [r1], #4
str r2, [r1], #4
str r2, [r1]
bx lr
ENDP ; |vp8_short_inv_walsh4x4_1_v6|
; Constant Pool
c0x00030003 DCD 0x00030003
END
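
The inverse Walsh-Hadamard above is two 4-point butterfly passes with a final (x + 3) >> 3, executed two coefficients per register via qadd16/qsub16. A hedged scalar sketch of the same math:

#include <stdint.h>

static void short_inv_walsh4x4(const int16_t *ip, int16_t *op) {
    int16_t tmp[16];
    for (int i = 0; i < 4; ++i) {   /* columns */
        int a1 = ip[i] + ip[i + 12];
        int b1 = ip[i + 4] + ip[i + 8];
        int c1 = ip[i + 4] - ip[i + 8];
        int d1 = ip[i] - ip[i + 12];
        tmp[i]      = (int16_t)(a1 + b1);
        tmp[i + 4]  = (int16_t)(c1 + d1);
        tmp[i + 8]  = (int16_t)(a1 - b1);
        tmp[i + 12] = (int16_t)(d1 - c1);
    }
    for (int i = 0; i < 4; ++i) {   /* rows, rounded by (+3) >> 3 */
        const int16_t *r = tmp + 4 * i;
        int a1 = r[0] + r[3];
        int b1 = r[1] + r[2];
        int c1 = r[1] - r[2];
        int d1 = r[0] - r[3];
        op[4 * i + 0] = (int16_t)((a1 + b1 + 3) >> 3);
        op[4 * i + 1] = (int16_t)((c1 + d1 + 3) >> 3);
        op[4 * i + 2] = (int16_t)((a1 - b1 + 3) >> 3);
        op[4 * i + 3] = (int16_t)((d1 - c1 + 3) >> 3);
    }
}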

File diff not shown because it is too large.

View File

@@ -1,281 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_recon_b_armv6|
EXPORT |vp8_recon2b_armv6|
EXPORT |vp8_recon4b_armv6|
AREA |.text|, CODE, READONLY ; name this block of code
prd RN r0
dif RN r1
dst RN r2
stride RN r3
;void recon_b(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride)
; R0 char* pred_ptr
; R1 short * dif_ptr
; R2 char * dst_ptr
; R3 int stride
; Description:
; Loop through the block adding the Pred and Diff together. Clamp and then
; store back into the Dst.
; Restrictions :
; all buffers are expected to be 4 byte aligned coming in and
; going out.
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
;
;
;
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|vp8_recon_b_armv6| PROC
stmdb sp!, {r4 - r9, lr}
;0, 1, 2, 3
ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
ldr r6, [dif, #0] ; 1 | 0
ldr r7, [dif, #4] ; 3 | 2
pkhbt r8, r6, r7, lsl #16 ; 2 | 0
pkhtb r9, r7, r6, asr #16 ; 3 | 1
uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
usat16 r8, #8, r8
usat16 r9, #8, r9
add dif, dif, #32
orr r8, r8, r9, lsl #8
str r8, [dst], stride
;0, 1, 2, 3
ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
;; ldr r6, [dif, #8] ; 1 | 0
;; ldr r7, [dif, #12] ; 3 | 2
ldr r6, [dif, #0] ; 1 | 0
ldr r7, [dif, #4] ; 3 | 2
pkhbt r8, r6, r7, lsl #16 ; 2 | 0
pkhtb r9, r7, r6, asr #16 ; 3 | 1
uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
usat16 r8, #8, r8
usat16 r9, #8, r9
add dif, dif, #32
orr r8, r8, r9, lsl #8
str r8, [dst], stride
;0, 1, 2, 3
ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
;; ldr r6, [dif, #16] ; 1 | 0
;; ldr r7, [dif, #20] ; 3 | 2
ldr r6, [dif, #0] ; 1 | 0
ldr r7, [dif, #4] ; 3 | 2
pkhbt r8, r6, r7, lsl #16 ; 2 | 0
pkhtb r9, r7, r6, asr #16 ; 3 | 1
uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
usat16 r8, #8, r8
usat16 r9, #8, r9
add dif, dif, #32
orr r8, r8, r9, lsl #8
str r8, [dst], stride
;0, 1, 2, 3
ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
;; ldr r6, [dif, #24] ; 1 | 0
;; ldr r7, [dif, #28] ; 3 | 2
ldr r6, [dif, #0] ; 1 | 0
ldr r7, [dif, #4] ; 3 | 2
pkhbt r8, r6, r7, lsl #16 ; 2 | 0
pkhtb r9, r7, r6, asr #16 ; 3 | 1
uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
usat16 r8, #8, r8
usat16 r9, #8, r9
orr r8, r8, r9, lsl #8
str r8, [dst], stride
ldmia sp!, {r4 - r9, pc}
ENDP ; |recon_b|
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
;
;
;
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
; R0 char *pred_ptr
; R1 short *dif_ptr
; R2 char *dst_ptr
; R3 int stride
|vp8_recon4b_armv6| PROC
stmdb sp!, {r4 - r9, lr}
mov lr, #4
recon4b_loop
;0, 1, 2, 3
ldr r4, [prd], #4 ; 3 | 2 | 1 | 0
ldr r6, [dif, #0] ; 1 | 0
ldr r7, [dif, #4] ; 3 | 2
pkhbt r8, r6, r7, lsl #16 ; 2 | 0
pkhtb r9, r7, r6, asr #16 ; 3 | 1
uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
usat16 r8, #8, r8
usat16 r9, #8, r9
orr r8, r8, r9, lsl #8
str r8, [dst]
;4, 5, 6, 7
ldr r4, [prd], #4
;; ldr r6, [dif, #32]
;; ldr r7, [dif, #36]
ldr r6, [dif, #8]
ldr r7, [dif, #12]
pkhbt r8, r6, r7, lsl #16
pkhtb r9, r7, r6, asr #16
uxtab16 r8, r8, r4
uxtab16 r9, r9, r4, ror #8
usat16 r8, #8, r8
usat16 r9, #8, r9
orr r8, r8, r9, lsl #8
str r8, [dst, #4]
;8, 9, 10, 11
ldr r4, [prd], #4
;; ldr r6, [dif, #64]
;; ldr r7, [dif, #68]
ldr r6, [dif, #16]
ldr r7, [dif, #20]
pkhbt r8, r6, r7, lsl #16
pkhtb r9, r7, r6, asr #16
uxtab16 r8, r8, r4
uxtab16 r9, r9, r4, ror #8
usat16 r8, #8, r8
usat16 r9, #8, r9
orr r8, r8, r9, lsl #8
str r8, [dst, #8]
;12, 13, 14, 15
ldr r4, [prd], #4
;; ldr r6, [dif, #96]
;; ldr r7, [dif, #100]
ldr r6, [dif, #24]
ldr r7, [dif, #28]
pkhbt r8, r6, r7, lsl #16
pkhtb r9, r7, r6, asr #16
uxtab16 r8, r8, r4
uxtab16 r9, r9, r4, ror #8
usat16 r8, #8, r8
usat16 r9, #8, r9
orr r8, r8, r9, lsl #8
str r8, [dst, #12]
add dst, dst, stride
;; add dif, dif, #8
add dif, dif, #32
subs lr, lr, #1
bne recon4b_loop
ldmia sp!, {r4 - r9, pc}
ENDP ; |Recon4B|
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
;
;
;
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
; R0 char *pred_ptr
; R1 short *dif_ptr
; R2 char *dst_ptr
; R3 int stride
|vp8_recon2b_armv6| PROC
stmdb sp!, {r4 - r9, lr}
mov lr, #4
recon2b_loop
;0, 1, 2, 3
ldr r4, [prd], #4
ldr r6, [dif, #0]
ldr r7, [dif, #4]
pkhbt r8, r6, r7, lsl #16
pkhtb r9, r7, r6, asr #16
uxtab16 r8, r8, r4
uxtab16 r9, r9, r4, ror #8
usat16 r8, #8, r8
usat16 r9, #8, r9
orr r8, r8, r9, lsl #8
str r8, [dst]
;4, 5, 6, 7
ldr r4, [prd], #4
;; ldr r6, [dif, #32]
;; ldr r7, [dif, #36]
ldr r6, [dif, #8]
ldr r7, [dif, #12]
pkhbt r8, r6, r7, lsl #16
pkhtb r9, r7, r6, asr #16
uxtab16 r8, r8, r4
uxtab16 r9, r9, r4, ror #8
usat16 r8, #8, r8
usat16 r9, #8, r9
orr r8, r8, r9, lsl #8
str r8, [dst, #4]
add dst, dst, stride
;; add dif, dif, #8
add dif, dif, #16
subs lr, lr, #1
bne recon2b_loop
ldmia sp!, {r4 - r9, pc}
ENDP ; |Recon2B|
END
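
Each variant above is the same inner operation at a different width: widen the 8-bit prediction, add the 16-bit residual, saturate, and repack; uxtab16 does the widening add and usat16 the clamp, four pixels per register pair. A scalar sketch of recon_b under the pitches the assembly uses (prediction rows 16 bytes apart, diff rows 16 shorts apart):

#include <stdint.h>

static uint8_t clamp255(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

static void recon_b(const uint8_t *pred, const int16_t *diff,
                    uint8_t *dst, int stride) {
    for (int r = 0; r < 4; ++r) {
        for (int c = 0; c < 4; ++c)
            dst[c] = clamp255(pred[c] + diff[c]);
        pred += 16;   /* "ldr r4, [prd], #16": prediction pitch is 16 */
        diff += 16;   /* "add dif, dif, #32": 32 bytes = 16 shorts   */
        dst  += stride;
    }
}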

View File

@@ -1,286 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp9_loop_filter_simple_horizontal_edge_armv6|
EXPORT |vp9_loop_filter_simple_vertical_edge_armv6|
AREA |.text|, CODE, READONLY ; name this block of code
MACRO
TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
; a0: 03 02 01 00
; a1: 13 12 11 10
; a2: 23 22 21 20
; a3: 33 32 31 30
; b3 b2 b1 b0
uxtb16 $b1, $a1 ; xx 12 xx 10
uxtb16 $b0, $a0 ; xx 02 xx 00
uxtb16 $b3, $a3 ; xx 32 xx 30
uxtb16 $b2, $a2 ; xx 22 xx 20
orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00
orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20
uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11
uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31
uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01
uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21
orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01
orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21
pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1
pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3
pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0
pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2
MEND
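
In scalar terms, TRANSPOSE_MATRIX turns four registers that each hold one row of four bytes into four registers that each hold one column, which lets the vertical-edge filter below reuse the horizontal filter logic:

#include <stdint.h>

/* Scalar view of TRANSPOSE_MATRIX: in[r] is row r (word $aR, little-endian),
 * out[c] is column c (word $bC). */
static void transpose4x4(const uint8_t in[4][4], uint8_t out[4][4]) {
    for (int r = 0; r < 4; ++r)
        for (int c = 0; c < 4; ++c)
            out[c][r] = in[r][c];
}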
src RN r0
pstep RN r1
;r0 unsigned char *src_ptr,
;r1 int src_pixel_step,
;r2 const char *blimit
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp9_loop_filter_simple_horizontal_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
stmdb sp!, {r4 - r11, lr}
ldrb r12, [r2] ; blimit
ldr r3, [src, -pstep, lsl #1] ; p1
ldr r4, [src, -pstep] ; p0
ldr r5, [src] ; q0
ldr r6, [src, pstep] ; q1
orr r12, r12, r12, lsl #8 ; blimit
ldr r2, c0x80808080
orr r12, r12, r12, lsl #16 ; blimit
mov r9, #4 ; double the count. we're doing 4 at a time
mov lr, #0 ; need 0 in a couple places
|simple_hnext8|
; vp8_simple_filter_mask()
uqsub8 r7, r3, r6 ; p1 - q1
uqsub8 r8, r6, r3 ; q1 - p1
uqsub8 r10, r4, r5 ; p0 - q0
uqsub8 r11, r5, r4 ; q0 - p0
orr r8, r8, r7 ; abs(p1 - q1)
orr r10, r10, r11 ; abs(p0 - q0)
uqadd8 r10, r10, r10 ; abs(p0 - q0) * 2
uhadd8 r8, r8, lr ; abs(p1 - q1) >> 1
uqadd8 r10, r10, r8 ; abs(p0 - q0)*2 + abs(p1 - q1)/2
mvn r8, #0
usub8 r10, r12, r10 ; compare to flimit. usub8 sets GE flags
sel r10, r8, lr ; filter mask: F or 0
cmp r10, #0
beq simple_hskip_filter ; skip filtering if all masks are 0x00
;vp8_simple_filter()
eor r3, r3, r2 ; p1 offset to convert to a signed value
eor r6, r6, r2 ; q1 offset to convert to a signed value
eor r4, r4, r2 ; p0 offset to convert to a signed value
eor r5, r5, r2 ; q0 offset to convert to a signed value
qsub8 r3, r3, r6 ; vp9_filter = p1 - q1
qsub8 r6, r5, r4 ; q0 - p0
qadd8 r3, r3, r6 ; += q0 - p0
ldr r7, c0x04040404
qadd8 r3, r3, r6 ; += q0 - p0
ldr r8, c0x03030303
qadd8 r3, r3, r6 ; vp9_filter = p1-q1 + 3*(q0-p0))
;STALL
and r3, r3, r10 ; vp9_filter &= mask
qadd8 r7 , r3 , r7 ; Filter1 = vp9_filter + 4
qadd8 r8 , r3 , r8 ; Filter2 = vp9_filter + 3
shadd8 r7 , r7 , lr
shadd8 r8 , r8 , lr
shadd8 r7 , r7 , lr
shadd8 r8 , r8 , lr
shadd8 r7 , r7 , lr ; Filter1 >>= 3
shadd8 r8 , r8 , lr ; Filter2 >>= 3
qsub8 r5 ,r5, r7 ; u = q0 - Filter1
qadd8 r4, r4, r8 ; u = p0 + Filter2
eor r5, r5, r2 ; *oq0 = u^0x80
str r5, [src] ; store oq0 result
eor r4, r4, r2 ; *op0 = u^0x80
str r4, [src, -pstep] ; store op0 result
|simple_hskip_filter|
subs r9, r9, #1
addne src, src, #4 ; next row
ldrne r3, [src, -pstep, lsl #1] ; p1
ldrne r4, [src, -pstep] ; p0
ldrne r5, [src] ; q0
ldrne r6, [src, pstep] ; q1
bne simple_hnext8
ldmia sp!, {r4 - r11, pc}
ENDP ; |vp9_loop_filter_simple_horizontal_edge_armv6|
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp9_loop_filter_simple_vertical_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
stmdb sp!, {r4 - r11, lr}
ldrb r12, [r2] ; r12: blimit
ldr r2, c0x80808080
orr r12, r12, r12, lsl #8
; load source data to r7, r8, r9, r10
ldrh r3, [src, #-2]
pld [src, #23] ; preload for next block
ldrh r4, [src], pstep
orr r12, r12, r12, lsl #16
ldrh r5, [src, #-2]
pld [src, #23]
ldrh r6, [src], pstep
pkhbt r7, r3, r4, lsl #16
ldrh r3, [src, #-2]
pld [src, #23]
ldrh r4, [src], pstep
pkhbt r8, r5, r6, lsl #16
ldrh r5, [src, #-2]
pld [src, #23]
ldrh r6, [src], pstep
mov r11, #4 ; double the count. we're doing 4 at a time
|simple_vnext8|
; vp8_simple_filter_mask() function
pkhbt r9, r3, r4, lsl #16
pkhbt r10, r5, r6, lsl #16
;transpose r7, r8, r9, r10 to r3, r4, r5, r6
TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6
uqsub8 r7, r3, r6 ; p1 - q1
uqsub8 r8, r6, r3 ; q1 - p1
uqsub8 r9, r4, r5 ; p0 - q0
uqsub8 r10, r5, r4 ; q0 - p0
orr r7, r7, r8 ; abs(p1 - q1)
orr r9, r9, r10 ; abs(p0 - q0)
mov r8, #0
uqadd8 r9, r9, r9 ; abs(p0 - q0) * 2
uhadd8 r7, r7, r8 ; abs(p1 - q1) / 2
uqadd8 r7, r7, r9 ; abs(p0 - q0)*2 + abs(p1 - q1)/2
mvn r10, #0 ; r10 == -1
usub8 r7, r12, r7 ; compare to flimit
sel lr, r10, r8 ; filter mask
cmp lr, #0
beq simple_vskip_filter ; skip filtering
;vp8_simple_filter() function
eor r3, r3, r2 ; p1 offset to convert to a signed value
eor r6, r6, r2 ; q1 offset to convert to a signed value
eor r4, r4, r2 ; p0 offset to convert to a signed value
eor r5, r5, r2 ; q0 offset to convert to a signed value
qsub8 r3, r3, r6 ; vp9_filter = p1 - q1
qsub8 r6, r5, r4 ; q0 - p0
qadd8 r3, r3, r6 ; vp9_filter += q0 - p0
ldr r9, c0x03030303 ; r9 = 3
qadd8 r3, r3, r6 ; vp9_filter += q0 - p0
ldr r7, c0x04040404
qadd8 r3, r3, r6 ; vp9_filter = p1-q1 + 3*(q0-p0))
;STALL
and r3, r3, lr ; vp9_filter &= mask
qadd8 r9 , r3 , r9 ; Filter2 = vp9_filter + 3
qadd8 r3 , r3 , r7 ; Filter1 = vp9_filter + 4
shadd8 r9 , r9 , r8
shadd8 r3 , r3 , r8
shadd8 r9 , r9 , r8
shadd8 r3 , r3 , r8
shadd8 r9 , r9 , r8 ; Filter2 >>= 3
shadd8 r3 , r3 , r8 ; Filter1 >>= 3
;calculate output
sub src, src, pstep, lsl #2
qadd8 r4, r4, r9 ; u = p0 + Filter2
qsub8 r5, r5, r3 ; u = q0 - Filter1
eor r4, r4, r2 ; *op0 = u^0x80
eor r5, r5, r2 ; *oq0 = u^0x80
strb r4, [src, #-1] ; store the result
mov r4, r4, lsr #8
strb r5, [src], pstep
mov r5, r5, lsr #8
strb r4, [src, #-1]
mov r4, r4, lsr #8
strb r5, [src], pstep
mov r5, r5, lsr #8
strb r4, [src, #-1]
mov r4, r4, lsr #8
strb r5, [src], pstep
mov r5, r5, lsr #8
strb r4, [src, #-1]
strb r5, [src], pstep
|simple_vskip_filter|
subs r11, r11, #1
; load source data to r7, r8, r9, r10
ldrneh r3, [src, #-2]
pld [src, #23] ; preload for next block
ldrneh r4, [src], pstep
ldrneh r5, [src, #-2]
pld [src, #23]
ldrneh r6, [src], pstep
pkhbt r7, r3, r4, lsl #16
ldrneh r3, [src, #-2]
pld [src, #23]
ldrneh r4, [src], pstep
pkhbt r8, r5, r6, lsl #16
ldrneh r5, [src, #-2]
pld [src, #23]
ldrneh r6, [src], pstep
bne simple_vnext8
ldmia sp!, {r4 - r11, pc}
ENDP ; |vp9_loop_filter_simple_vertical_edge_armv6|
; Constant Pool
c0x80808080 DCD 0x80808080
c0x03030303 DCD 0x03030303
c0x04040404 DCD 0x04040404
END
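For reference, the per-pixel arithmetic that both simple-edge routines above vectorize reads more easily in scalar C. The sketch below follows the comments in the assembly (offset by 0x80 into signed range, filter = p1-q1 + 3*(q0-p0), then the Filter1/Filter2 rounding); the names are illustrative, not the libvpx reference implementation, and the caller is assumed to apply it only where the mask test passes.

#include <stdlib.h>

/* Saturate to signed 8 bits, as qadd8/qsub8 do per byte lane. */
static signed char sclamp8(int v) {
  return (signed char)(v < -128 ? -128 : v > 127 ? 127 : v);
}

/* Mask test from vp8_simple_filter_mask(): filter only weak edges. */
static int simple_mask(int p1, int p0, int q0, int q1, int blimit) {
  return 2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= blimit;
}

static void simple_filter_px(unsigned char *p0p, unsigned char *q0p,
                             unsigned char p1, unsigned char q1) {
  signed char sp1 = (signed char)(p1 ^ 0x80);       /* convert to signed */
  signed char sq1 = (signed char)(q1 ^ 0x80);
  signed char sp0 = (signed char)(*p0p ^ 0x80);
  signed char sq0 = (signed char)(*q0p ^ 0x80);
  signed char f = sclamp8(sp1 - sq1);
  f = sclamp8(f + 3 * (sq0 - sp0));                 /* the three qadd8 steps */
  signed char f1 = sclamp8(f + 4) >> 3;             /* Filter1 */
  signed char f2 = sclamp8(f + 3) >> 3;             /* Filter2 */
  *q0p = (unsigned char)(sclamp8(sq0 - f1) ^ 0x80); /* *oq0 = u^0x80 */
  *p0p = (unsigned char)(sclamp8(sp0 + f2) ^ 0x80); /* *op0 = u^0x80 */
}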

View file

@ -1,273 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_sixtap_predict8x4_armv6|
AREA |.text|, CODE, READONLY ; name this block of code
;-------------------------------------
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; stack unsigned char *dst_ptr,
; stack int dst_pitch
;-------------------------------------
;note: In the first pass, the result is stored transposed (8 lines x 9 columns) on the stack. The temporary stack size is 184.
;Each line is 20 bytes wide: 9 short values plus 2 bytes of padding for 4-byte alignment. In the second pass, data is loaded
;from the stack, and the result is stored back transposed.
|vp8_sixtap_predict8x4_armv6| PROC
stmdb sp!, {r4 - r11, lr}
str r3, [sp, #-184]! ;reserve space on stack for temporary storage, store yoffset
cmp r2, #0 ;skip first_pass filter if xoffset=0
add lr, sp, #4 ;point to temporary buffer
beq skip_firstpass_filter
;first-pass filter
adr r12, filter8_coeff
sub r0, r0, r1, lsl #1
add r3, r1, #10 ; preload next row
pld [r0, r3]
add r2, r12, r2, lsl #4 ;calculate filter location
add r0, r0, #3 ;adjust src only for loading convenience
ldr r3, [r2] ; load up packed filter coefficients
ldr r4, [r2, #4]
ldr r5, [r2, #8]
mov r2, #0x90000 ; height=9 is top part of counter
sub r1, r1, #8
|first_pass_hloop_v6|
ldrb r6, [r0, #-5] ; load source data
ldrb r7, [r0, #-4]
ldrb r8, [r0, #-3]
ldrb r9, [r0, #-2]
ldrb r10, [r0, #-1]
orr r2, r2, #0x4 ; construct loop counter. width=8=4x2
pkhbt r6, r6, r7, lsl #16 ; r7 | r6
pkhbt r7, r7, r8, lsl #16 ; r8 | r7
pkhbt r8, r8, r9, lsl #16 ; r9 | r8
pkhbt r9, r9, r10, lsl #16 ; r10 | r9
|first_pass_wloop_v6|
smuad r11, r6, r3 ; vp9_filter[0], vp9_filter[1]
smuad r12, r7, r3
ldrb r6, [r0], #1
smlad r11, r8, r4, r11 ; vp9_filter[2], vp9_filter[3]
ldrb r7, [r0], #1
smlad r12, r9, r4, r12
pkhbt r10, r10, r6, lsl #16 ; r10 | r9
pkhbt r6, r6, r7, lsl #16 ; r11 | r10
smlad r11, r10, r5, r11 ; vp9_filter[4], vp9_filter[5]
smlad r12, r6, r5, r12
sub r2, r2, #1
add r11, r11, #0x40 ; round_shift_and_clamp
tst r2, #0xff ; test loop counter
usat r11, #8, r11, asr #7
add r12, r12, #0x40
strh r11, [lr], #20 ; result is transposed and stored
usat r12, #8, r12, asr #7
strh r12, [lr], #20
movne r11, r6
movne r12, r7
movne r6, r8
movne r7, r9
movne r8, r10
movne r9, r11
movne r10, r12
bne first_pass_wloop_v6
;;add r9, ppl, #30 ; attempt to load 2 adjacent cache lines
;;IF ARCHITECTURE=6
;pld [src, ppl]
;;pld [src, r9]
;;ENDIF
subs r2, r2, #0x10000
sub lr, lr, #158
add r0, r0, r1 ; move to next input line
add r11, r1, #18 ; preload next row; add back the block width (=8), which was subtracted earlier
pld [r0, r11]
bne first_pass_hloop_v6
;second pass filter
secondpass_filter
ldr r3, [sp], #4 ; load back yoffset
ldr r0, [sp, #216] ; load dst address from stack 180+36
ldr r1, [sp, #220] ; load dst stride from stack 180+40
cmp r3, #0
beq skip_secondpass_filter
adr r12, filter8_coeff
add lr, r12, r3, lsl #4 ;calculate filter location
mov r2, #0x00080000
ldr r3, [lr] ; load up packed filter coefficients
ldr r4, [lr, #4]
ldr r5, [lr, #8]
pkhbt r12, r4, r3 ; pack the filter differently
pkhbt r11, r5, r4
second_pass_hloop_v6
ldr r6, [sp] ; load the data
ldr r7, [sp, #4]
orr r2, r2, #2 ; loop counter
second_pass_wloop_v6
smuad lr, r3, r6 ; apply filter
smulbt r10, r3, r6
ldr r8, [sp, #8]
smlad lr, r4, r7, lr
smladx r10, r12, r7, r10
ldrh r9, [sp, #12]
smlad lr, r5, r8, lr
smladx r10, r11, r8, r10
add sp, sp, #4
smlatb r10, r5, r9, r10
sub r2, r2, #1
add lr, lr, #0x40 ; round_shift_and_clamp
tst r2, #0xff
usat lr, #8, lr, asr #7
add r10, r10, #0x40
strb lr, [r0], r1 ; the result is transposed back and stored
usat r10, #8, r10, asr #7
strb r10, [r0],r1
movne r6, r7
movne r7, r8
bne second_pass_wloop_v6
subs r2, r2, #0x10000
add sp, sp, #12 ; update src for next loop (20-8)
sub r0, r0, r1, lsl #2
add r0, r0, #1
bne second_pass_hloop_v6
add sp, sp, #20
ldmia sp!, {r4 - r11, pc}
;--------------------
skip_firstpass_filter
sub r0, r0, r1, lsl #1
sub r1, r1, #8
mov r2, #9
skip_firstpass_hloop
ldrb r4, [r0], #1 ; load data
subs r2, r2, #1
ldrb r5, [r0], #1
strh r4, [lr], #20 ; store it to immediate buffer
ldrb r6, [r0], #1 ; load data
strh r5, [lr], #20
ldrb r7, [r0], #1
strh r6, [lr], #20
ldrb r8, [r0], #1
strh r7, [lr], #20
ldrb r9, [r0], #1
strh r8, [lr], #20
ldrb r10, [r0], #1
strh r9, [lr], #20
ldrb r11, [r0], #1
strh r10, [lr], #20
add r0, r0, r1 ; move to next input line
strh r11, [lr], #20
sub lr, lr, #158 ; move over to next column
bne skip_firstpass_hloop
b secondpass_filter
;--------------------
skip_secondpass_filter
mov r2, #8
add sp, sp, #4 ;start from src[0] instead of src[-2]
skip_secondpass_hloop
ldr r6, [sp], #4
subs r2, r2, #1
ldr r8, [sp], #4
mov r7, r6, lsr #16 ; unpack
strb r6, [r0], r1
mov r9, r8, lsr #16
strb r7, [r0], r1
add sp, sp, #12 ; 20-8
strb r8, [r0], r1
strb r9, [r0], r1
sub r0, r0, r1, lsl #2
add r0, r0, #1
bne skip_secondpass_hloop
add sp, sp, #16 ; 180 - (160 +4)
ldmia sp!, {r4 - r11, pc}
ENDP
;-----------------
;One word each is reserved. Label filter8_coeff can be used to access the data.
;Data address: filter8_coeff, filter8_coeff+4, filter8_coeff+8 ...
filter8_coeff
DCD 0x00000000, 0x00000080, 0x00000000, 0x00000000
DCD 0xfffa0000, 0x000c007b, 0x0000ffff, 0x00000000
DCD 0xfff50002, 0x0024006c, 0x0001fff8, 0x00000000
DCD 0xfff70000, 0x0032005d, 0x0000fffa, 0x00000000
DCD 0xfff00003, 0x004d004d, 0x0003fff0, 0x00000000
DCD 0xfffa0000, 0x005d0032, 0x0000fff7, 0x00000000
DCD 0xfff80001, 0x006c0024, 0x0002fff5, 0x00000000
DCD 0xffff0000, 0x007b000c, 0x0000fffa, 0x00000000
;DCD 0, 0, 128, 0, 0, 0
;DCD 0, -6, 123, 12, -1, 0
;DCD 2, -11, 108, 36, -8, 1
;DCD 0, -9, 93, 50, -6, 0
;DCD 3, -16, 77, 77, -16, 3
;DCD 0, -6, 50, 93, -9, 0
;DCD 1, -8, 36, 108, -11, 2
;DCD 0, -1, 12, 123, -6, 0
END
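The two-pass scheme the routine above implements is easier to follow in scalar C. A sketch under stated assumptions: the transposed intermediate layout in the assembly is purely a performance device, so the version below keeps row-major order; the helper names are hypothetical, not the libvpx API; and the tap values are the unpacked six-tap coefficients shown in the commented-out table at the end of the file.

/* Clamp to unsigned 8 bits, as usat does. */
static unsigned char uclamp8(int v) {
  return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
}

/* Apply one 6-tap filter centered on s[0]; taps span s[-2*step..+3*step].
 * 64 is the rounding constant before the >> 7, as in the assembly. */
static unsigned char apply_6tap(const unsigned char *s, int step,
                                const short *f) {
  int k, sum = 0;
  for (k = 0; k < 6; ++k)
    sum += f[k] * s[(k - 2) * step];
  return uclamp8((sum + 64) >> 7);        /* round_shift_and_clamp */
}

/* Two-pass 8x4 prediction: 9 horizontally filtered rows (height + 5)
 * feed the 4 vertically filtered output rows. */
static void sixtap_predict8x4(const unsigned char *src, int src_stride,
                              const short *hfilter, const short *vfilter,
                              unsigned char *dst, int dst_pitch) {
  unsigned char tmp[9][8];
  int r, c;
  for (r = 0; r < 9; ++r)                 /* first pass: horizontal */
    for (c = 0; c < 8; ++c)
      tmp[r][c] = apply_6tap(src + (r - 2) * src_stride + c, 1, hfilter);
  for (r = 0; r < 4; ++r)                 /* second pass: vertical */
    for (c = 0; c < 8; ++c)
      dst[r * dst_pitch + c] = apply_6tap(&tmp[r + 2][c], 8, vfilter);
}

The skip_firstpass/skip_secondpass branches in the assembly correspond to the degenerate filter {0, 0, 128, 0, 0, 0}, for which a pass reduces to a copy.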

View file

@ -1,357 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_bilinear_predict16x16_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; r4 unsigned char *dst_ptr,
; stack(r5) int dst_pitch
|vp8_bilinear_predict16x16_neon| PROC
push {r4-r5, lr}
adr r12, bifilter16_coeff
ldr r4, [sp, #12] ;load parameters from stack
ldr r5, [sp, #16] ;load parameters from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_bfilter16x16_only
add r2, r12, r2, lsl #3 ;calculate filter location
cmp r3, #0 ;skip second_pass filter if yoffset=0
vld1.s32 {d31}, [r2] ;load first_pass filter
beq firstpass_bfilter16x16_only
sub sp, sp, #272 ;reserve space on stack for temporary storage
vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
mov lr, sp
vld1.u8 {d5, d6, d7}, [r0], r1
mov r2, #3 ;loop counter
vld1.u8 {d8, d9, d10}, [r0], r1
vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
vld1.u8 {d11, d12, d13}, [r0], r1
vdup.8 d1, d31[4]
;First Pass: output_height lines x output_width columns (17x16)
filt_blk2d_fp16x16_loop_neon
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp9_filter[0])
vmull.u8 q8, d3, d0
vmull.u8 q9, d5, d0
vmull.u8 q10, d6, d0
vmull.u8 q11, d8, d0
vmull.u8 q12, d9, d0
vmull.u8 q13, d11, d0
vmull.u8 q14, d12, d0
vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
vext.8 d5, d5, d6, #1
vext.8 d8, d8, d9, #1
vext.8 d11, d11, d12, #1
vmlal.u8 q7, d2, d1 ;(src_ptr[0] * vp9_filter[1])
vmlal.u8 q9, d5, d1
vmlal.u8 q11, d8, d1
vmlal.u8 q13, d11, d1
vext.8 d3, d3, d4, #1
vext.8 d6, d6, d7, #1
vext.8 d9, d9, d10, #1
vext.8 d12, d12, d13, #1
vmlal.u8 q8, d3, d1 ;(src_ptr[0] * vp9_filter[1])
vmlal.u8 q10, d6, d1
vmlal.u8 q12, d9, d1
vmlal.u8 q14, d12, d1
subs r2, r2, #1
vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
vqrshrn.u16 d15, q8, #7
vqrshrn.u16 d16, q9, #7
vqrshrn.u16 d17, q10, #7
vqrshrn.u16 d18, q11, #7
vqrshrn.u16 d19, q12, #7
vqrshrn.u16 d20, q13, #7
vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
vqrshrn.u16 d21, q14, #7
vld1.u8 {d5, d6, d7}, [r0], r1
vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result
vld1.u8 {d8, d9, d10}, [r0], r1
vst1.u8 {d18, d19, d20, d21}, [lr]!
vld1.u8 {d11, d12, d13}, [r0], r1
bne filt_blk2d_fp16x16_loop_neon
;First-pass filtering for the remaining 5 lines
vld1.u8 {d14, d15, d16}, [r0], r1
vmull.u8 q9, d2, d0 ;(src_ptr[0] * vp9_filter[0])
vmull.u8 q10, d3, d0
vmull.u8 q11, d5, d0
vmull.u8 q12, d6, d0
vmull.u8 q13, d8, d0
vmull.u8 q14, d9, d0
vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
vext.8 d5, d5, d6, #1
vext.8 d8, d8, d9, #1
vmlal.u8 q9, d2, d1 ;(src_ptr[0] * vp9_filter[1])
vmlal.u8 q11, d5, d1
vmlal.u8 q13, d8, d1
vext.8 d3, d3, d4, #1
vext.8 d6, d6, d7, #1
vext.8 d9, d9, d10, #1
vmlal.u8 q10, d3, d1 ;(src_ptr[0] * vp9_filter[1])
vmlal.u8 q12, d6, d1
vmlal.u8 q14, d9, d1
vmull.u8 q1, d11, d0
vmull.u8 q2, d12, d0
vmull.u8 q3, d14, d0
vmull.u8 q4, d15, d0
vext.8 d11, d11, d12, #1 ;construct src_ptr[1]
vext.8 d14, d14, d15, #1
vmlal.u8 q1, d11, d1 ;(src_ptr[0] * vp9_filter[1])
vmlal.u8 q3, d14, d1
vext.8 d12, d12, d13, #1
vext.8 d15, d15, d16, #1
vmlal.u8 q2, d12, d1 ;(src_ptr[0] * vp9_filter[1])
vmlal.u8 q4, d15, d1
vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8
vqrshrn.u16 d11, q10, #7
vqrshrn.u16 d12, q11, #7
vqrshrn.u16 d13, q12, #7
vqrshrn.u16 d14, q13, #7
vqrshrn.u16 d15, q14, #7
vqrshrn.u16 d16, q1, #7
vqrshrn.u16 d17, q2, #7
vqrshrn.u16 d18, q3, #7
vqrshrn.u16 d19, q4, #7
vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result
vst1.u8 {d14, d15, d16, d17}, [lr]!
vst1.u8 {d18, d19}, [lr]!
;Second pass: 16x16
;secondpass_filter
add r3, r12, r3, lsl #3
sub lr, lr, #272
vld1.u32 {d31}, [r3] ;load second_pass filter
vld1.u8 {d22, d23}, [lr]! ;load src data
vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
vdup.8 d1, d31[4]
mov r12, #4 ;loop counter
filt_blk2d_sp16x16_loop_neon
vld1.u8 {d24, d25}, [lr]!
vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp9_filter[0])
vld1.u8 {d26, d27}, [lr]!
vmull.u8 q2, d23, d0
vld1.u8 {d28, d29}, [lr]!
vmull.u8 q3, d24, d0
vld1.u8 {d30, d31}, [lr]!
vmull.u8 q4, d25, d0
vmull.u8 q5, d26, d0
vmull.u8 q6, d27, d0
vmull.u8 q7, d28, d0
vmull.u8 q8, d29, d0
vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp9_filter[1])
vmlal.u8 q2, d25, d1
vmlal.u8 q3, d26, d1
vmlal.u8 q4, d27, d1
vmlal.u8 q5, d28, d1
vmlal.u8 q6, d29, d1
vmlal.u8 q7, d30, d1
vmlal.u8 q8, d31, d1
subs r12, r12, #1
vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
vqrshrn.u16 d3, q2, #7
vqrshrn.u16 d4, q3, #7
vqrshrn.u16 d5, q4, #7
vqrshrn.u16 d6, q5, #7
vqrshrn.u16 d7, q6, #7
vqrshrn.u16 d8, q7, #7
vqrshrn.u16 d9, q8, #7
vst1.u8 {d2, d3}, [r4], r5 ;store result
vst1.u8 {d4, d5}, [r4], r5
vst1.u8 {d6, d7}, [r4], r5
vmov q11, q15
vst1.u8 {d8, d9}, [r4], r5
bne filt_blk2d_sp16x16_loop_neon
add sp, sp, #272
pop {r4-r5,pc}
;--------------------
firstpass_bfilter16x16_only
mov r2, #4 ;loop counter
vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
vdup.8 d1, d31[4]
;First Pass: output_height lines x output_width columns (16x16)
filt_blk2d_fpo16x16_loop_neon
vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
vld1.u8 {d5, d6, d7}, [r0], r1
vld1.u8 {d8, d9, d10}, [r0], r1
vld1.u8 {d11, d12, d13}, [r0], r1
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp9_filter[0])
vmull.u8 q8, d3, d0
vmull.u8 q9, d5, d0
vmull.u8 q10, d6, d0
vmull.u8 q11, d8, d0
vmull.u8 q12, d9, d0
vmull.u8 q13, d11, d0
vmull.u8 q14, d12, d0
vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
vext.8 d5, d5, d6, #1
vext.8 d8, d8, d9, #1
vext.8 d11, d11, d12, #1
vmlal.u8 q7, d2, d1 ;(src_ptr[0] * vp9_filter[1])
vmlal.u8 q9, d5, d1
vmlal.u8 q11, d8, d1
vmlal.u8 q13, d11, d1
vext.8 d3, d3, d4, #1
vext.8 d6, d6, d7, #1
vext.8 d9, d9, d10, #1
vext.8 d12, d12, d13, #1
vmlal.u8 q8, d3, d1 ;(src_ptr[0] * vp9_filter[1])
vmlal.u8 q10, d6, d1
vmlal.u8 q12, d9, d1
vmlal.u8 q14, d12, d1
subs r2, r2, #1
vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
vqrshrn.u16 d15, q8, #7
vqrshrn.u16 d16, q9, #7
vqrshrn.u16 d17, q10, #7
vqrshrn.u16 d18, q11, #7
vqrshrn.u16 d19, q12, #7
vqrshrn.u16 d20, q13, #7
vst1.u8 {d14, d15}, [r4], r5 ;store result
vqrshrn.u16 d21, q14, #7
vst1.u8 {d16, d17}, [r4], r5
vst1.u8 {d18, d19}, [r4], r5
vst1.u8 {d20, d21}, [r4], r5
bne filt_blk2d_fpo16x16_loop_neon
pop {r4-r5,pc}
;---------------------
secondpass_bfilter16x16_only
;Second pass: 16x16
;secondpass_filter
add r3, r12, r3, lsl #3
mov r12, #4 ;loop counter
vld1.u32 {d31}, [r3] ;load second_pass filter
vld1.u8 {d22, d23}, [r0], r1 ;load src data
vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
vdup.8 d1, d31[4]
filt_blk2d_spo16x16_loop_neon
vld1.u8 {d24, d25}, [r0], r1
vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp9_filter[0])
vld1.u8 {d26, d27}, [r0], r1
vmull.u8 q2, d23, d0
vld1.u8 {d28, d29}, [r0], r1
vmull.u8 q3, d24, d0
vld1.u8 {d30, d31}, [r0], r1
vmull.u8 q4, d25, d0
vmull.u8 q5, d26, d0
vmull.u8 q6, d27, d0
vmull.u8 q7, d28, d0
vmull.u8 q8, d29, d0
vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp9_filter[1])
vmlal.u8 q2, d25, d1
vmlal.u8 q3, d26, d1
vmlal.u8 q4, d27, d1
vmlal.u8 q5, d28, d1
vmlal.u8 q6, d29, d1
vmlal.u8 q7, d30, d1
vmlal.u8 q8, d31, d1
vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
vqrshrn.u16 d3, q2, #7
vqrshrn.u16 d4, q3, #7
vqrshrn.u16 d5, q4, #7
vqrshrn.u16 d6, q5, #7
vqrshrn.u16 d7, q6, #7
vqrshrn.u16 d8, q7, #7
vqrshrn.u16 d9, q8, #7
vst1.u8 {d2, d3}, [r4], r5 ;store result
subs r12, r12, #1
vst1.u8 {d4, d5}, [r4], r5
vmov q11, q15
vst1.u8 {d6, d7}, [r4], r5
vst1.u8 {d8, d9}, [r4], r5
bne filt_blk2d_spo16x16_loop_neon
pop {r4-r5,pc}
ENDP
;-----------------
bifilter16_coeff
DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
END
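All four bilinear predictors in this commit (16x16 above, then the 4x4, 8x4 and 8x8 variants below) implement the same separable two-tap scheme, shown once here as a scalar sketch. The coefficient table is the one in the assembly (bifilter16_coeff); the function name and signature are illustrative, not the libvpx API. Note that 17 * 16 = 272 matches the temporary stack space reserved above.

/* Separable bilinear prediction: (h + 1) horizontally filtered rows,
 * then h vertically filtered output rows. w, h <= 16. */
static void bilinear_predict(const unsigned char *src, int src_stride,
                             int xoffset, int yoffset,
                             unsigned char *dst, int dst_pitch,
                             int w, int h) {
  static const int coeff[8][2] = {
    {128, 0}, {112, 16}, {96, 32}, {80, 48},
    {64, 64}, {48, 80}, {32, 96}, {16, 112}
  };
  const int *hf = coeff[xoffset], *vf = coeff[yoffset];
  unsigned char tmp[17 * 16];               /* 272 bytes, as on the stack */
  int r, c;
  for (r = 0; r < h + 1; ++r)               /* first pass: horizontal */
    for (c = 0; c < w; ++c)
      tmp[r * w + c] = (unsigned char)
          ((src[r * src_stride + c] * hf[0] +
            src[r * src_stride + c + 1] * hf[1] + 64) >> 7); /* vqrshrn #7 */
  for (r = 0; r < h; ++r)                    /* second pass: vertical */
    for (c = 0; c < w; ++c)
      dst[r * dst_pitch + c] = (unsigned char)
          ((tmp[r * w + c] * vf[0] +
            tmp[(r + 1) * w + c] * vf[1] + 64) >> 7);
}

When xoffset or yoffset is 0, the corresponding pass has taps {128, 0} and degenerates to an exact copy, which is what the firstpass-only and secondpass-only branches above special-case.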

View file

@ -1,130 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_bilinear_predict4x4_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; r4 unsigned char *dst_ptr,
; stack(lr) int dst_pitch
|vp8_bilinear_predict4x4_neon| PROC
push {r4, lr}
adr r12, bifilter4_coeff
ldr r4, [sp, #8] ;load parameters from stack
ldr lr, [sp, #12] ;load parameters from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq skip_firstpass_filter
;First pass: output_height lines x output_width columns (5x4)
vld1.u8 {d2}, [r0], r1 ;load src data
add r2, r12, r2, lsl #3 ;calculate Hfilter location (2 coeffs x 4 bytes = 8 bytes)
vld1.u8 {d3}, [r0], r1
vld1.u32 {d31}, [r2] ;first_pass filter
vld1.u8 {d4}, [r0], r1
vdup.8 d0, d31[0] ;first_pass filter (d0-d1)
vld1.u8 {d5}, [r0], r1
vdup.8 d1, d31[4]
vld1.u8 {d6}, [r0], r1
vshr.u64 q4, q1, #8 ;construct src_ptr[1]
vshr.u64 q5, q2, #8
vshr.u64 d12, d6, #8
vzip.32 d2, d3 ;put 2-line data in 1 register (src_ptr[0])
vzip.32 d4, d5
vzip.32 d8, d9 ;put 2-line data in 1 register (src_ptr[1])
vzip.32 d10, d11
vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp9_filter[0])
vmull.u8 q8, d4, d0
vmull.u8 q9, d6, d0
vmlal.u8 q7, d8, d1 ;(src_ptr[1] * vp9_filter[1])
vmlal.u8 q8, d10, d1
vmlal.u8 q9, d12, d1
vqrshrn.u16 d28, q7, #7 ;shift/round/saturate to u8
vqrshrn.u16 d29, q8, #7
vqrshrn.u16 d30, q9, #7
;Second pass: 4x4
secondpass_filter
cmp r3, #0 ;skip second_pass filter if yoffset=0
beq skip_secondpass_filter
add r3, r12, r3, lsl #3 ;calculate Vfilter location
vld1.u32 {d31}, [r3] ;load second_pass filter
vdup.8 d0, d31[0] ;second_pass filter parameters (d0-d5)
vdup.8 d1, d31[4]
vmull.u8 q1, d28, d0
vmull.u8 q2, d29, d0
vext.8 d26, d28, d29, #4 ;construct src_ptr[pixel_step]
vext.8 d27, d29, d30, #4
vmlal.u8 q1, d26, d1
vmlal.u8 q2, d27, d1
add r0, r4, lr
add r1, r0, lr
add r2, r1, lr
vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
vqrshrn.u16 d3, q2, #7
vst1.32 {d2[0]}, [r4] ;store result
vst1.32 {d2[1]}, [r0]
vst1.32 {d3[0]}, [r1]
vst1.32 {d3[1]}, [r2]
pop {r4, pc}
;--------------------
skip_firstpass_filter
vld1.32 {d28[0]}, [r0], r1 ;load src data
vld1.32 {d28[1]}, [r0], r1
vld1.32 {d29[0]}, [r0], r1
vld1.32 {d29[1]}, [r0], r1
vld1.32 {d30[0]}, [r0], r1
b secondpass_filter
;---------------------
skip_secondpass_filter
vst1.32 {d28[0]}, [r4], lr ;store result
vst1.32 {d28[1]}, [r4], lr
vst1.32 {d29[0]}, [r4], lr
vst1.32 {d29[1]}, [r4], lr
pop {r4, pc}
ENDP
;-----------------
bifilter4_coeff
DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
END

View file

@ -1,135 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_bilinear_predict8x4_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; r4 unsigned char *dst_ptr,
; stack(lr) int dst_pitch
|vp8_bilinear_predict8x4_neon| PROC
push {r4, lr}
adr r12, bifilter8x4_coeff
ldr r4, [sp, #8] ;load parameters from stack
ldr lr, [sp, #12] ;load parameters from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq skip_firstpass_filter
;First pass: output_height lines x output_width columns (5x8)
add r2, r12, r2, lsl #3 ;calculate filter location
vld1.u8 {q1}, [r0], r1 ;load src data
vld1.u32 {d31}, [r2] ;load first_pass filter
vld1.u8 {q2}, [r0], r1
vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
vld1.u8 {q3}, [r0], r1
vdup.8 d1, d31[4]
vld1.u8 {q4}, [r0], r1
vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp9_filter[0])
vld1.u8 {q5}, [r0], r1
vmull.u8 q7, d4, d0
vmull.u8 q8, d6, d0
vmull.u8 q9, d8, d0
vmull.u8 q10, d10, d0
vext.8 d3, d2, d3, #1 ;construct src_ptr[1]
vext.8 d5, d4, d5, #1
vext.8 d7, d6, d7, #1
vext.8 d9, d8, d9, #1
vext.8 d11, d10, d11, #1
vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp9_filter[1])
vmlal.u8 q7, d5, d1
vmlal.u8 q8, d7, d1
vmlal.u8 q9, d9, d1
vmlal.u8 q10, d11, d1
vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
vqrshrn.u16 d23, q7, #7
vqrshrn.u16 d24, q8, #7
vqrshrn.u16 d25, q9, #7
vqrshrn.u16 d26, q10, #7
;Second pass: 4x8
secondpass_filter
cmp r3, #0 ;skip second_pass filter if yoffset=0
beq skip_secondpass_filter
add r3, r12, r3, lsl #3
add r0, r4, lr
vld1.u32 {d31}, [r3] ;load second_pass filter
add r1, r0, lr
vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
vdup.8 d1, d31[4]
vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp9_filter[0])
vmull.u8 q2, d23, d0
vmull.u8 q3, d24, d0
vmull.u8 q4, d25, d0
vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp9_filter[1])
vmlal.u8 q2, d24, d1
vmlal.u8 q3, d25, d1
vmlal.u8 q4, d26, d1
add r2, r1, lr
vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
vqrshrn.u16 d3, q2, #7
vqrshrn.u16 d4, q3, #7
vqrshrn.u16 d5, q4, #7
vst1.u8 {d2}, [r4] ;store result
vst1.u8 {d3}, [r0]
vst1.u8 {d4}, [r1]
vst1.u8 {d5}, [r2]
pop {r4, pc}
;--------------------
skip_firstpass_filter
vld1.u8 {d22}, [r0], r1 ;load src data
vld1.u8 {d23}, [r0], r1
vld1.u8 {d24}, [r0], r1
vld1.u8 {d25}, [r0], r1
vld1.u8 {d26}, [r0], r1
b secondpass_filter
;---------------------
skip_secondpass_filter
vst1.u8 {d22}, [r4], lr ;store result
vst1.u8 {d23}, [r4], lr
vst1.u8 {d24}, [r4], lr
vst1.u8 {d25}, [r4], lr
pop {r4, pc}
ENDP
;-----------------
bifilter8x4_coeff
DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
END

View file

@ -1,183 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_bilinear_predict8x8_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; r4 unsigned char *dst_ptr,
; stack(lr) int dst_pitch
|vp8_bilinear_predict8x8_neon| PROC
push {r4, lr}
adr r12, bifilter8_coeff
ldr r4, [sp, #8] ;load parameters from stack
ldr lr, [sp, #12] ;load parameters from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq skip_firstpass_filter
;First pass: output_height lines x output_width columns (9x8)
add r2, r12, r2, lsl #3 ;calculate filter location
vld1.u8 {q1}, [r0], r1 ;load src data
vld1.u32 {d31}, [r2] ;load first_pass filter
vld1.u8 {q2}, [r0], r1
vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
vld1.u8 {q3}, [r0], r1
vdup.8 d1, d31[4]
vld1.u8 {q4}, [r0], r1
vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp9_filter[0])
vmull.u8 q7, d4, d0
vmull.u8 q8, d6, d0
vmull.u8 q9, d8, d0
vext.8 d3, d2, d3, #1 ;construct src_ptr[1]
vext.8 d5, d4, d5, #1
vext.8 d7, d6, d7, #1
vext.8 d9, d8, d9, #1
vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp9_filter[1])
vmlal.u8 q7, d5, d1
vmlal.u8 q8, d7, d1
vmlal.u8 q9, d9, d1
vld1.u8 {q1}, [r0], r1 ;load src data
vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
vld1.u8 {q2}, [r0], r1
vqrshrn.u16 d23, q7, #7
vld1.u8 {q3}, [r0], r1
vqrshrn.u16 d24, q8, #7
vld1.u8 {q4}, [r0], r1
vqrshrn.u16 d25, q9, #7
;first_pass filtering on the remaining 5 lines
vld1.u8 {q5}, [r0], r1
vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp9_filter[0])
vmull.u8 q7, d4, d0
vmull.u8 q8, d6, d0
vmull.u8 q9, d8, d0
vmull.u8 q10, d10, d0
vext.8 d3, d2, d3, #1 ;construct src_ptr[1]
vext.8 d5, d4, d5, #1
vext.8 d7, d6, d7, #1
vext.8 d9, d8, d9, #1
vext.8 d11, d10, d11, #1
vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp9_filter[1])
vmlal.u8 q7, d5, d1
vmlal.u8 q8, d7, d1
vmlal.u8 q9, d9, d1
vmlal.u8 q10, d11, d1
vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8
vqrshrn.u16 d27, q7, #7
vqrshrn.u16 d28, q8, #7
vqrshrn.u16 d29, q9, #7
vqrshrn.u16 d30, q10, #7
;Second pass: 8x8
secondpass_filter
cmp r3, #0 ;skip second_pass filter if yoffset=0
beq skip_secondpass_filter
add r3, r12, r3, lsl #3
add r0, r4, lr
vld1.u32 {d31}, [r3] ;load second_pass filter
add r1, r0, lr
vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
vdup.8 d1, d31[4]
vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp9_filter[0])
vmull.u8 q2, d23, d0
vmull.u8 q3, d24, d0
vmull.u8 q4, d25, d0
vmull.u8 q5, d26, d0
vmull.u8 q6, d27, d0
vmull.u8 q7, d28, d0
vmull.u8 q8, d29, d0
vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp9_filter[1])
vmlal.u8 q2, d24, d1
vmlal.u8 q3, d25, d1
vmlal.u8 q4, d26, d1
vmlal.u8 q5, d27, d1
vmlal.u8 q6, d28, d1
vmlal.u8 q7, d29, d1
vmlal.u8 q8, d30, d1
vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
vqrshrn.u16 d3, q2, #7
vqrshrn.u16 d4, q3, #7
vqrshrn.u16 d5, q4, #7
vqrshrn.u16 d6, q5, #7
vqrshrn.u16 d7, q6, #7
vqrshrn.u16 d8, q7, #7
vqrshrn.u16 d9, q8, #7
vst1.u8 {d2}, [r4] ;store result
vst1.u8 {d3}, [r0]
vst1.u8 {d4}, [r1], lr
vst1.u8 {d5}, [r1], lr
vst1.u8 {d6}, [r1], lr
vst1.u8 {d7}, [r1], lr
vst1.u8 {d8}, [r1], lr
vst1.u8 {d9}, [r1], lr
pop {r4, pc}
;--------------------
skip_firstpass_filter
vld1.u8 {d22}, [r0], r1 ;load src data
vld1.u8 {d23}, [r0], r1
vld1.u8 {d24}, [r0], r1
vld1.u8 {d25}, [r0], r1
vld1.u8 {d26}, [r0], r1
vld1.u8 {d27}, [r0], r1
vld1.u8 {d28}, [r0], r1
vld1.u8 {d29}, [r0], r1
vld1.u8 {d30}, [r0], r1
b secondpass_filter
;---------------------
skip_secondpass_filter
vst1.u8 {d22}, [r4], lr ;store result
vst1.u8 {d23}, [r4], lr
vst1.u8 {d24}, [r4], lr
vst1.u8 {d25}, [r4], lr
vst1.u8 {d26}, [r4], lr
vst1.u8 {d27}, [r4], lr
vst1.u8 {d28}, [r4], lr
vst1.u8 {d29}, [r4], lr
pop {r4, pc}
ENDP
;-----------------
bifilter8_coeff
DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
END

View file

@ -1,584 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_build_intra_predictors_mby_neon_func|
EXPORT |vp8_build_intra_predictors_mby_s_neon_func|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *y_buffer
; r1 unsigned char *ypred_ptr
; r2 int y_stride
; r3 int mode
; stack int Up
; stack int Left
|vp8_build_intra_predictors_mby_neon_func| PROC
push {r4-r8, lr}
cmp r3, #0
beq case_dc_pred
cmp r3, #1
beq case_v_pred
cmp r3, #2
beq case_h_pred
cmp r3, #3
beq case_tm_pred
case_dc_pred
ldr r4, [sp, #24] ; Up
ldr r5, [sp, #28] ; Left
; Default the DC average to 128
mov r12, #128
vdup.u8 q0, r12
; Zero out running sum
mov r12, #0
; compute shift and jump
adds r7, r4, r5
beq skip_dc_pred_up_left
; Load above row, if it exists
cmp r4, #0
beq skip_dc_pred_up
sub r6, r0, r2
vld1.8 {q1}, [r6]
vpaddl.u8 q2, q1
vpaddl.u16 q3, q2
vpaddl.u32 q4, q3
vmov.32 r4, d8[0]
vmov.32 r6, d9[0]
add r12, r4, r6
; Move back to integer registers
skip_dc_pred_up
cmp r5, #0
beq skip_dc_pred_left
sub r0, r0, #1
; Load left row, if it exists
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
add r12, r12, r3
add r12, r12, r4
add r12, r12, r5
add r12, r12, r6
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
add r12, r12, r3
add r12, r12, r4
add r12, r12, r5
add r12, r12, r6
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
add r12, r12, r3
add r12, r12, r4
add r12, r12, r5
add r12, r12, r6
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0]
add r12, r12, r3
add r12, r12, r4
add r12, r12, r5
add r12, r12, r6
skip_dc_pred_left
add r7, r7, #3 ; Shift
sub r4, r7, #1
mov r5, #1
add r12, r12, r5, lsl r4
mov r5, r12, lsr r7 ; expected_dc
vdup.u8 q0, r5
skip_dc_pred_up_left
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
pop {r4-r8,pc}
case_v_pred
; Copy down above row
sub r6, r0, r2
vld1.8 {q0}, [r6]
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
pop {r4-r8,pc}
case_h_pred
; Load 4x yleft_col
sub r0, r0, #1
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u8 q0, r3
vdup.u8 q1, r4
vdup.u8 q2, r5
vdup.u8 q3, r6
vst1.u8 {q0}, [r1]!
vst1.u8 {q1}, [r1]!
vst1.u8 {q2}, [r1]!
vst1.u8 {q3}, [r1]!
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u8 q0, r3
vdup.u8 q1, r4
vdup.u8 q2, r5
vdup.u8 q3, r6
vst1.u8 {q0}, [r1]!
vst1.u8 {q1}, [r1]!
vst1.u8 {q2}, [r1]!
vst1.u8 {q3}, [r1]!
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u8 q0, r3
vdup.u8 q1, r4
vdup.u8 q2, r5
vdup.u8 q3, r6
vst1.u8 {q0}, [r1]!
vst1.u8 {q1}, [r1]!
vst1.u8 {q2}, [r1]!
vst1.u8 {q3}, [r1]!
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u8 q0, r3
vdup.u8 q1, r4
vdup.u8 q2, r5
vdup.u8 q3, r6
vst1.u8 {q0}, [r1]!
vst1.u8 {q1}, [r1]!
vst1.u8 {q2}, [r1]!
vst1.u8 {q3}, [r1]!
pop {r4-r8,pc}
case_tm_pred
; Load yabove_row
sub r3, r0, r2
vld1.8 {q8}, [r3]
; Load ytop_left
sub r3, r3, #1
ldrb r7, [r3]
vdup.u16 q7, r7
; Compute yabove_row - ytop_left
mov r3, #1
vdup.u8 q0, r3
vmull.u8 q4, d16, d0
vmull.u8 q5, d17, d0
vsub.s16 q4, q4, q7
vsub.s16 q5, q5, q7
; Load 4x yleft_col
sub r0, r0, #1
mov r12, #4
case_tm_pred_loop
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u16 q0, r3
vdup.u16 q1, r4
vdup.u16 q2, r5
vdup.u16 q3, r6
vqadd.s16 q8, q0, q4
vqadd.s16 q9, q0, q5
vqadd.s16 q10, q1, q4
vqadd.s16 q11, q1, q5
vqadd.s16 q12, q2, q4
vqadd.s16 q13, q2, q5
vqadd.s16 q14, q3, q4
vqadd.s16 q15, q3, q5
vqshrun.s16 d0, q8, #0
vqshrun.s16 d1, q9, #0
vqshrun.s16 d2, q10, #0
vqshrun.s16 d3, q11, #0
vqshrun.s16 d4, q12, #0
vqshrun.s16 d5, q13, #0
vqshrun.s16 d6, q14, #0
vqshrun.s16 d7, q15, #0
vst1.u8 {q0}, [r1]!
vst1.u8 {q1}, [r1]!
vst1.u8 {q2}, [r1]!
vst1.u8 {q3}, [r1]!
subs r12, r12, #1
bne case_tm_pred_loop
pop {r4-r8,pc}
ENDP
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; r0 unsigned char *y_buffer
; r1 unsigned char *ypred_ptr
; r2 int y_stride
; r3 int mode
; stack int Up
; stack int Left
|vp8_build_intra_predictors_mby_s_neon_func| PROC
push {r4-r8, lr}
mov r1, r0 ; unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor;
cmp r3, #0
beq case_dc_pred_s
cmp r3, #1
beq case_v_pred_s
cmp r3, #2
beq case_h_pred_s
cmp r3, #3
beq case_tm_pred_s
case_dc_pred_s
ldr r4, [sp, #24] ; Up
ldr r5, [sp, #28] ; Left
; Default the DC average to 128
mov r12, #128
vdup.u8 q0, r12
; Zero out running sum
mov r12, #0
; compute shift and jump
adds r7, r4, r5
beq skip_dc_pred_up_left_s
; Load above row, if it exists
cmp r4, #0
beq skip_dc_pred_up_s
sub r6, r0, r2
vld1.8 {q1}, [r6]
vpaddl.u8 q2, q1
vpaddl.u16 q3, q2
vpaddl.u32 q4, q3
vmov.32 r4, d8[0]
vmov.32 r6, d9[0]
add r12, r4, r6
; Move back to integer registers
skip_dc_pred_up_s
cmp r5, #0
beq skip_dc_pred_left_s
sub r0, r0, #1
; Load left row, if it exists
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
add r12, r12, r3
add r12, r12, r4
add r12, r12, r5
add r12, r12, r6
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
add r12, r12, r3
add r12, r12, r4
add r12, r12, r5
add r12, r12, r6
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
add r12, r12, r3
add r12, r12, r4
add r12, r12, r5
add r12, r12, r6
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0]
add r12, r12, r3
add r12, r12, r4
add r12, r12, r5
add r12, r12, r6
skip_dc_pred_left_s
add r7, r7, #3 ; Shift
sub r4, r7, #1
mov r5, #1
add r12, r12, r5, lsl r4
mov r5, r12, lsr r7 ; expected_dc
vdup.u8 q0, r5
skip_dc_pred_up_left_s
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
pop {r4-r8,pc}
case_v_pred_s
; Copy down above row
sub r6, r0, r2
vld1.8 {q0}, [r6]
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
pop {r4-r8,pc}
case_h_pred_s
; Load 4x yleft_col
sub r0, r0, #1
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u8 q0, r3
vdup.u8 q1, r4
vdup.u8 q2, r5
vdup.u8 q3, r6
vst1.u8 {q0}, [r1], r2
vst1.u8 {q1}, [r1], r2
vst1.u8 {q2}, [r1], r2
vst1.u8 {q3}, [r1], r2
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u8 q0, r3
vdup.u8 q1, r4
vdup.u8 q2, r5
vdup.u8 q3, r6
vst1.u8 {q0}, [r1], r2
vst1.u8 {q1}, [r1], r2
vst1.u8 {q2}, [r1], r2
vst1.u8 {q3}, [r1], r2
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u8 q0, r3
vdup.u8 q1, r4
vdup.u8 q2, r5
vdup.u8 q3, r6
vst1.u8 {q0}, [r1], r2
vst1.u8 {q1}, [r1], r2
vst1.u8 {q2}, [r1], r2
vst1.u8 {q3}, [r1], r2
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u8 q0, r3
vdup.u8 q1, r4
vdup.u8 q2, r5
vdup.u8 q3, r6
vst1.u8 {q0}, [r1], r2
vst1.u8 {q1}, [r1], r2
vst1.u8 {q2}, [r1], r2
vst1.u8 {q3}, [r1], r2
pop {r4-r8,pc}
case_tm_pred_s
; Load yabove_row
sub r3, r0, r2
vld1.8 {q8}, [r3]
; Load ytop_left
sub r3, r3, #1
ldrb r7, [r3]
vdup.u16 q7, r7
; Compute yabove_row - ytop_left
mov r3, #1
vdup.u8 q0, r3
vmull.u8 q4, d16, d0
vmull.u8 q5, d17, d0
vsub.s16 q4, q4, q7
vsub.s16 q5, q5, q7
; Load 4x yleft_col
sub r0, r0, #1
mov r12, #4
case_tm_pred_loop_s
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u16 q0, r3
vdup.u16 q1, r4
vdup.u16 q2, r5
vdup.u16 q3, r6
vqadd.s16 q8, q0, q4
vqadd.s16 q9, q0, q5
vqadd.s16 q10, q1, q4
vqadd.s16 q11, q1, q5
vqadd.s16 q12, q2, q4
vqadd.s16 q13, q2, q5
vqadd.s16 q14, q3, q4
vqadd.s16 q15, q3, q5
vqshrun.s16 d0, q8, #0
vqshrun.s16 d1, q9, #0
vqshrun.s16 d2, q10, #0
vqshrun.s16 d3, q11, #0
vqshrun.s16 d4, q12, #0
vqshrun.s16 d5, q13, #0
vqshrun.s16 d6, q14, #0
vqshrun.s16 d7, q15, #0
vst1.u8 {q0}, [r1], r2
vst1.u8 {q1}, [r1], r2
vst1.u8 {q2}, [r1], r2
vst1.u8 {q3}, [r1], r2
subs r12, r12, #1
bne case_tm_pred_loop_s
pop {r4-r8,pc}
ENDP
END
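The four mode branches above (and the _s variants, which differ only in writing through the destination stride) compute standard VP8-style 16x16 intra predictions. A scalar sketch with hypothetical names; up and left are the availability flags passed on the stack, and the DC rounding matches the shift = up + left + 3 logic in the assembly.

#include <string.h>

static unsigned char uclip8(int v) {
  return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
}

static void build_intra_predictors_mby(const unsigned char *y, int stride,
                                       unsigned char *pred, int mode,
                                       int up, int left) {
  const unsigned char *above = y - stride;
  int r, c;
  if (mode == 0) {                      /* DC_PRED */
    int expected_dc = 128;              /* default when no neighbors */
    if (up || left) {
      int shift = up + left + 3, sum = 0;
      if (up)
        for (c = 0; c < 16; ++c) sum += above[c];
      if (left)
        for (r = 0; r < 16; ++r) sum += y[r * stride - 1];
      expected_dc = (sum + (1 << (shift - 1))) >> shift;
    }
    memset(pred, expected_dc, 16 * 16);
  } else if (mode == 1) {               /* V_PRED: copy the above row down */
    for (r = 0; r < 16; ++r) memcpy(pred + r * 16, above, 16);
  } else if (mode == 2) {               /* H_PRED: replicate the left column */
    for (r = 0; r < 16; ++r) memset(pred + r * 16, y[r * stride - 1], 16);
  } else {                              /* TM_PRED: left + above - top_left */
    int top_left = above[-1];
    for (r = 0; r < 16; ++r)
      for (c = 0; c < 16; ++c)
        pred[r * 16 + c] = uclip8(y[r * stride - 1] + above[c] - top_left);
  }
}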

View file

@ -1,59 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp9_copy_mem16x16_neon|
; ARM
; REQUIRE8
; PRESERVE8
AREA Block, CODE, READONLY ; name this block of code
;void copy_mem16x16_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|vp9_copy_mem16x16_neon| PROC
vld1.u8 {q0}, [r0], r1
vld1.u8 {q1}, [r0], r1
vld1.u8 {q2}, [r0], r1
vst1.u8 {q0}, [r2], r3
vld1.u8 {q3}, [r0], r1
vst1.u8 {q1}, [r2], r3
vld1.u8 {q4}, [r0], r1
vst1.u8 {q2}, [r2], r3
vld1.u8 {q5}, [r0], r1
vst1.u8 {q3}, [r2], r3
vld1.u8 {q6}, [r0], r1
vst1.u8 {q4}, [r2], r3
vld1.u8 {q7}, [r0], r1
vst1.u8 {q5}, [r2], r3
vld1.u8 {q8}, [r0], r1
vst1.u8 {q6}, [r2], r3
vld1.u8 {q9}, [r0], r1
vst1.u8 {q7}, [r2], r3
vld1.u8 {q10}, [r0], r1
vst1.u8 {q8}, [r2], r3
vld1.u8 {q11}, [r0], r1
vst1.u8 {q9}, [r2], r3
vld1.u8 {q12}, [r0], r1
vst1.u8 {q10}, [r2], r3
vld1.u8 {q13}, [r0], r1
vst1.u8 {q11}, [r2], r3
vld1.u8 {q14}, [r0], r1
vst1.u8 {q12}, [r2], r3
vld1.u8 {q15}, [r0], r1
vst1.u8 {q13}, [r2], r3
vst1.u8 {q14}, [r2], r3
vst1.u8 {q15}, [r2], r3
mov pc, lr
ENDP ; |vp9_copy_mem16x16_neon|
END
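This routine and the 8x4/8x8 variants that follow are unrolled block copies; the scalar equivalent is a two-line loop, shown once here (hypothetical name, not the libvpx API).

#include <string.h>

/* w x h block copy: 16x16 here, 8x4 and 8x8 below. */
static void copy_mem(const unsigned char *src, int src_stride,
                     unsigned char *dst, int dst_stride, int w, int h) {
  int r;
  for (r = 0; r < h; ++r)
    memcpy(dst + r * dst_stride, src + r * src_stride, w);
}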

View file

@ -1,34 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp9_copy_mem8x4_neon|
; ARM
; REQUIRE8
; PRESERVE8
AREA Block, CODE, READONLY ; name this block of code
;void copy_mem8x4_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|vp9_copy_mem8x4_neon| PROC
vld1.u8 {d0}, [r0], r1
vld1.u8 {d1}, [r0], r1
vst1.u8 {d0}, [r2], r3
vld1.u8 {d2}, [r0], r1
vst1.u8 {d1}, [r2], r3
vld1.u8 {d3}, [r0], r1
vst1.u8 {d2}, [r2], r3
vst1.u8 {d3}, [r2], r3
mov pc, lr
ENDP ; |vp9_copy_mem8x4_neon|
END

View file

@ -1,43 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp9_copy_mem8x8_neon|
; ARM
; REQUIRE8
; PRESERVE8
AREA Block, CODE, READONLY ; name this block of code
;void copy_mem8x8_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|vp9_copy_mem8x8_neon| PROC
vld1.u8 {d0}, [r0], r1
vld1.u8 {d1}, [r0], r1
vst1.u8 {d0}, [r2], r3
vld1.u8 {d2}, [r0], r1
vst1.u8 {d1}, [r2], r3
vld1.u8 {d3}, [r0], r1
vst1.u8 {d2}, [r2], r3
vld1.u8 {d4}, [r0], r1
vst1.u8 {d3}, [r2], r3
vld1.u8 {d5}, [r0], r1
vst1.u8 {d4}, [r2], r3
vld1.u8 {d6}, [r0], r1
vst1.u8 {d5}, [r2], r3
vld1.u8 {d7}, [r0], r1
vst1.u8 {d6}, [r2], r3
vst1.u8 {d7}, [r2], r3
mov pc, lr
ENDP ; |vp9_copy_mem8x8_neon|
END

View file

@ -1,49 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
EXPORT |vp8_dc_only_idct_add_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr,
; unsigned char *dst_ptr, int pitch, int stride)
; r0 input_dc
; r1 pred_ptr
; r2 dst_ptr
; r3 pitch
; sp stride
|vp8_dc_only_idct_add_neon| PROC
add r0, r0, #4
asr r0, r0, #3
ldr r12, [sp]
vdup.16 q0, r0
vld1.32 {d2[0]}, [r1], r3
vld1.32 {d2[1]}, [r1], r3
vld1.32 {d4[0]}, [r1], r3
vld1.32 {d4[1]}, [r1]
vaddw.u8 q1, q0, d2
vaddw.u8 q2, q0, d4
vqmovun.s16 d2, q1
vqmovun.s16 d4, q2
vst1.32 {d2[0]}, [r2], r12
vst1.32 {d2[1]}, [r2], r12
vst1.32 {d4[0]}, [r2], r12
vst1.32 {d4[1]}, [r2]
bx lr
ENDP
END
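Scalar form of the routine above: the DC coefficient is rounded and shifted exactly as the add/asr pair does, then added to every pixel of the 4x4 prediction with unsigned saturation (the vqmovun step). A sketch only:

static void dc_only_idct_add(short input_dc, const unsigned char *pred_ptr,
                             unsigned char *dst_ptr, int pitch, int stride) {
  int dc = (input_dc + 4) >> 3;          /* add r0, r0, #4 ; asr r0, r0, #3 */
  int r, c;
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c) {
      int v = pred_ptr[r * pitch + c] + dc;
      dst_ptr[r * stride + c] =
          (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
    }
}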

View file

@ -1,80 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_short_inv_walsh4x4_neon|
EXPORT |vp8_short_inv_walsh4x4_1_neon|
ARM
REQUIRE8
PRESERVE8
AREA |.text|, CODE, READONLY ; name this block of code
;short vp8_short_inv_walsh4x4_neon(short *input, short *output)
|vp8_short_inv_walsh4x4_neon| PROC
; read in all four lines of values: d0->d3
vld1.i16 {q0-q1}, [r0@128]
; first for loop
vadd.s16 d4, d0, d3 ;a = [0] + [12]
vadd.s16 d6, d1, d2 ;b = [4] + [8]
vsub.s16 d5, d0, d3 ;d = [0] - [12]
vsub.s16 d7, d1, d2 ;c = [4] - [8]
vadd.s16 q0, q2, q3 ; a+b d+c
vsub.s16 q1, q2, q3 ; a-b d-c
vtrn.32 d0, d2 ;d0: 0 1 8 9
;d2: 2 3 10 11
vtrn.32 d1, d3 ;d1: 4 5 12 13
;d3: 6 7 14 15
vtrn.16 d0, d1 ;d0: 0 4 8 12
;d1: 1 5 9 13
vtrn.16 d2, d3 ;d2: 2 6 10 14
;d3: 3 7 11 15
; second for loop
vadd.s16 d4, d0, d3 ;a = [0] + [3]
vadd.s16 d6, d1, d2 ;b = [1] + [2]
vsub.s16 d5, d0, d3 ;d = [0] - [3]
vsub.s16 d7, d1, d2 ;c = [1] - [2]
vmov.i16 q8, #3
vadd.s16 q0, q2, q3 ; a+b d+c
vsub.s16 q1, q2, q3 ; a-b d-c
vadd.i16 q0, q0, q8 ;e/f += 3
vadd.i16 q1, q1, q8 ;g/h += 3
vshr.s16 q0, q0, #3 ;e/f >> 3
vshr.s16 q1, q1, #3 ;g/h >> 3
vst4.i16 {d0,d1,d2,d3}, [r1@128]
bx lr
ENDP ; |vp8_short_inv_walsh4x4_neon|
;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output)
|vp8_short_inv_walsh4x4_1_neon| PROC
ldrsh r2, [r0] ; load input[0]
add r3, r2, #3 ; add 3
add r2, r1, #16 ; base for last 8 output
asr r0, r3, #3 ; right shift 3
vdup.16 q0, r0 ; load and duplicate
vst1.16 {q0}, [r1@128] ; write back 8
vst1.16 {q0}, [r2@128] ; write back last 8
bx lr
ENDP ; |vp8_short_inv_walsh4x4_1_neon|
END
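The comments above already name the intermediate values; in scalar C the whole transform is two butterfly passes, with (x + 3) >> 3 rounding on the second. A sketch, not the libvpx reference:

static void short_inv_walsh4x4(const short *input, short *output) {
  short tmp[16];
  int i;
  for (i = 0; i < 4; ++i) {             /* first pass: down the columns */
    int a = input[i] + input[12 + i];   /* a = [0] + [12] */
    int b = input[4 + i] + input[8 + i];
    int d = input[i] - input[12 + i];
    int c = input[4 + i] - input[8 + i];
    tmp[i]      = (short)(a + b);
    tmp[4 + i]  = (short)(d + c);
    tmp[8 + i]  = (short)(a - b);
    tmp[12 + i] = (short)(d - c);
  }
  for (i = 0; i < 4; ++i) {             /* second pass: across the rows */
    const short *ip = tmp + 4 * i;
    short *op = output + 4 * i;
    int a = ip[0] + ip[3], b = ip[1] + ip[2];
    int d = ip[0] - ip[3], c = ip[1] - ip[2];
    op[0] = (short)((a + b + 3) >> 3);
    op[1] = (short)((d + c + 3) >> 3);
    op[2] = (short)((a - b + 3) >> 3);
    op[3] = (short)((d - c + 3) >> 3);
  }
}

The _1 variant above is the DC-only shortcut: it fills all 16 outputs with (input[0] + 3) >> 3.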

View file

@ -1,397 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp9_loop_filter_horizontal_edge_y_neon|
EXPORT |vp9_loop_filter_horizontal_edge_uv_neon|
EXPORT |vp9_loop_filter_vertical_edge_y_neon|
EXPORT |vp9_loop_filter_vertical_edge_uv_neon|
ARM
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src
; r1 int pitch
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
|vp9_loop_filter_horizontal_edge_y_neon| PROC
push {lr}
vdup.u8 q0, r2 ; duplicate blimit
vdup.u8 q1, r3 ; duplicate limit
sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines
ldr r3, [sp, #4] ; load thresh
add r12, r2, r1
add r1, r1, r1
vdup.u8 q2, r3 ; duplicate thresh
vld1.u8 {q3}, [r2@128], r1 ; p3
vld1.u8 {q4}, [r12@128], r1 ; p2
vld1.u8 {q5}, [r2@128], r1 ; p1
vld1.u8 {q6}, [r12@128], r1 ; p0
vld1.u8 {q7}, [r2@128], r1 ; q0
vld1.u8 {q8}, [r12@128], r1 ; q1
vld1.u8 {q9}, [r2@128] ; q2
vld1.u8 {q10}, [r12@128] ; q3
sub r2, r2, r1, lsl #1
sub r12, r12, r1, lsl #1
bl vp9_loop_filter_neon
vst1.u8 {q5}, [r2@128], r1 ; store op1
vst1.u8 {q6}, [r12@128], r1 ; store op0
vst1.u8 {q7}, [r2@128], r1 ; store oq0
vst1.u8 {q8}, [r12@128], r1 ; store oq1
pop {pc}
ENDP ; |vp9_loop_filter_horizontal_edge_y_neon|
; r0 unsigned char *u,
; r1 int pitch,
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
; sp+4 unsigned char *v
|vp9_loop_filter_horizontal_edge_uv_neon| PROC
push {lr}
vdup.u8 q0, r2 ; duplicate blimit
vdup.u8 q1, r3 ; duplicate limit
ldr r12, [sp, #4] ; load thresh
ldr r2, [sp, #8] ; load v ptr
vdup.u8 q2, r12 ; duplicate thresh
sub r3, r0, r1, lsl #2 ; move u pointer down by 4 lines
sub r12, r2, r1, lsl #2 ; move v pointer down by 4 lines
vld1.u8 {d6}, [r3@64], r1 ; p3
vld1.u8 {d7}, [r12@64], r1 ; p3
vld1.u8 {d8}, [r3@64], r1 ; p2
vld1.u8 {d9}, [r12@64], r1 ; p2
vld1.u8 {d10}, [r3@64], r1 ; p1
vld1.u8 {d11}, [r12@64], r1 ; p1
vld1.u8 {d12}, [r3@64], r1 ; p0
vld1.u8 {d13}, [r12@64], r1 ; p0
vld1.u8 {d14}, [r3@64], r1 ; q0
vld1.u8 {d15}, [r12@64], r1 ; q0
vld1.u8 {d16}, [r3@64], r1 ; q1
vld1.u8 {d17}, [r12@64], r1 ; q1
vld1.u8 {d18}, [r3@64], r1 ; q2
vld1.u8 {d19}, [r12@64], r1 ; q2
vld1.u8 {d20}, [r3@64] ; q3
vld1.u8 {d21}, [r12@64] ; q3
bl vp9_loop_filter_neon
sub r0, r0, r1, lsl #1
sub r2, r2, r1, lsl #1
vst1.u8 {d10}, [r0@64], r1 ; store u op1
vst1.u8 {d11}, [r2@64], r1 ; store v op1
vst1.u8 {d12}, [r0@64], r1 ; store u op0
vst1.u8 {d13}, [r2@64], r1 ; store v op0
vst1.u8 {d14}, [r0@64], r1 ; store u oq0
vst1.u8 {d15}, [r2@64], r1 ; store v oq0
vst1.u8 {d16}, [r0@64] ; store u oq1
vst1.u8 {d17}, [r2@64] ; store v oq1
pop {pc}
ENDP ; |vp9_loop_filter_horizontal_edge_uv_neon|
; void vp9_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
; const signed char *flimit,
; const signed char *limit,
; const signed char *thresh,
; int count)
; r0 unsigned char *src
; r1 int pitch
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
|vp9_loop_filter_vertical_edge_y_neon| PROC
push {lr}
vdup.u8 q0, r2 ; duplicate blimit
vdup.u8 q1, r3 ; duplicate limit
sub r2, r0, #4 ; src ptr down by 4 columns
add r1, r1, r1
ldr r3, [sp, #4] ; load thresh
add r12, r2, r1, asr #1
vld1.u8 {d6}, [r2], r1
vld1.u8 {d8}, [r12], r1
vld1.u8 {d10}, [r2], r1
vld1.u8 {d12}, [r12], r1
vld1.u8 {d14}, [r2], r1
vld1.u8 {d16}, [r12], r1
vld1.u8 {d18}, [r2], r1
vld1.u8 {d20}, [r12], r1
vld1.u8 {d7}, [r2], r1 ; load second 8-line src data
vld1.u8 {d9}, [r12], r1
vld1.u8 {d11}, [r2], r1
vld1.u8 {d13}, [r12], r1
vld1.u8 {d15}, [r2], r1
vld1.u8 {d17}, [r12], r1
vld1.u8 {d19}, [r2]
vld1.u8 {d21}, [r12]
;transpose to 8x16 matrix
vtrn.32 q3, q7
vtrn.32 q4, q8
vtrn.32 q5, q9
vtrn.32 q6, q10
vdup.u8 q2, r3 ; duplicate thresh
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
vtrn.16 q8, q10
vtrn.8 q3, q4
vtrn.8 q5, q6
vtrn.8 q7, q8
vtrn.8 q9, q10
bl vp9_loop_filter_neon
vswp d12, d11
vswp d16, d13
sub r0, r0, #2 ; dst ptr
vswp d14, d12
vswp d16, d15
add r12, r0, r1, asr #1
;store op1, op0, oq0, oq1
vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r12], r1
vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r12], r1
vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r12], r1
vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r12], r1
vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r0], r1
vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r12], r1
vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r0], r1
vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r12], r1
vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r0], r1
vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r12], r1
vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0]
vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r12]
pop {pc}
ENDP ; |vp9_loop_filter_vertical_edge_y_neon|
; void vp9_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch
; const signed char *flimit,
; const signed char *limit,
; const signed char *thresh,
; unsigned char *v)
; r0 unsigned char *u,
; r1 int pitch,
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
; sp+4 unsigned char *v
|vp9_loop_filter_vertical_edge_uv_neon| PROC
push {lr}
vdup.u8 q0, r2 ; duplicate blimit
sub r12, r0, #4 ; move u pointer down by 4 columns
ldr r2, [sp, #8] ; load v ptr
vdup.u8 q1, r3 ; duplicate limit
sub r3, r2, #4 ; move v pointer down by 4 columns
vld1.u8 {d6}, [r12], r1 ;load u data
vld1.u8 {d7}, [r3], r1 ;load v data
vld1.u8 {d8}, [r12], r1
vld1.u8 {d9}, [r3], r1
vld1.u8 {d10}, [r12], r1
vld1.u8 {d11}, [r3], r1
vld1.u8 {d12}, [r12], r1
vld1.u8 {d13}, [r3], r1
vld1.u8 {d14}, [r12], r1
vld1.u8 {d15}, [r3], r1
vld1.u8 {d16}, [r12], r1
vld1.u8 {d17}, [r3], r1
vld1.u8 {d18}, [r12], r1
vld1.u8 {d19}, [r3], r1
vld1.u8 {d20}, [r12]
vld1.u8 {d21}, [r3]
ldr r12, [sp, #4] ; load thresh
;transpose to 8x16 matrix
vtrn.32 q3, q7
vtrn.32 q4, q8
vtrn.32 q5, q9
vtrn.32 q6, q10
vdup.u8 q2, r12 ; duplicate thresh
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
vtrn.16 q8, q10
vtrn.8 q3, q4
vtrn.8 q5, q6
vtrn.8 q7, q8
vtrn.8 q9, q10
bl vp9_loop_filter_neon
vswp d12, d11
vswp d16, d13
vswp d14, d12
vswp d16, d15
sub r0, r0, #2
sub r2, r2, #2
;store op1, op0, oq0, oq1
vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2], r1
vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r2], r1
vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r2], r1
vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r2], r1
vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r2], r1
vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r2], r1
vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r2], r1
vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0]
vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2]
pop {pc}
ENDP ; |vp9_loop_filter_vertical_edge_uv_neon|
; void vp9_loop_filter_neon();
; This is a helper function for the loopfilters. The individual functions do the
; necessary load, transpose (if necessary) and store.
; r0-r3 PRESERVE
; q0 flimit
; q1 limit
; q2 thresh
; q3 p3
; q4 p2
; q5 p1
; q6 p0
; q7 q0
; q8 q1
; q9 q2
; q10 q3
|vp9_loop_filter_neon| PROC
; vp9_filter_mask
vabd.u8 q11, q3, q4 ; abs(p3 - p2)
vabd.u8 q12, q4, q5 ; abs(p2 - p1)
vabd.u8 q13, q5, q6 ; abs(p1 - p0)
vabd.u8 q14, q8, q7 ; abs(q1 - q0)
vabd.u8 q3, q9, q8 ; abs(q2 - q1)
vabd.u8 q4, q10, q9 ; abs(q3 - q2)
vmax.u8 q11, q11, q12
vmax.u8 q12, q13, q14
vmax.u8 q3, q3, q4
vmax.u8 q15, q11, q12
vabd.u8 q9, q6, q7 ; abs(p0 - q0)
; vp8_hevmask
vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
vmax.u8 q15, q15, q3
vmov.u8 q10, #0x80 ; 0x80
vabd.u8 q2, q5, q8 ; a = abs(p1 - q1)
vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2
vcge.u8 q15, q1, q15
; vp9_filter() function
; convert to signed
veor q7, q7, q10 ; qs0
vshr.u8 q2, q2, #1 ; a = a / 2
veor q6, q6, q10 ; ps0
veor q5, q5, q10 ; ps1
vqadd.u8 q9, q9, q2 ; a = b + a
veor q8, q8, q10 ; qs1
vmov.u8 q10, #3 ; #3
vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
vsubl.s8 q11, d15, d13
vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1
vmovl.u8 q4, d20
vqsub.s8 q1, q5, q8 ; vp9_filter = clamp(ps1-qs1)
vorr q14, q13, q14 ; vp8_hevmask
vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0)
vmul.i16 q11, q11, q4
vand q1, q1, q14 ; vp9_filter &= hev
vand q15, q15, q9 ; vp9_filter_mask
vaddw.s8 q2, q2, d2
vaddw.s8 q11, q11, d3
vmov.u8 q9, #4 ; #4
; vp9_filter = clamp(vp9_filter + 3 * ( qs0 - ps0))
vqmovn.s16 d2, q2
vqmovn.s16 d3, q11
vand q1, q1, q15 ; vp9_filter &= mask
vqadd.s8 q2, q1, q10 ; Filter2 = clamp(vp9_filter+3)
vqadd.s8 q1, q1, q9 ; Filter1 = clamp(vp9_filter+4)
vshr.s8 q2, q2, #3 ; Filter2 >>= 3
vshr.s8 q1, q1, #3 ; Filter1 >>= 3
vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + Filter2)
vqsub.s8 q10, q7, q1 ; u = clamp(qs0 - Filter1)
; outer tap adjustments: ++vp9_filter >> 1
vrshr.s8 q1, q1, #1
vbic q1, q1, q14 ; vp9_filter &= ~hev
vmov.u8 q0, #0x80 ; 0x80
vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + vp9_filter)
vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - vp9_filter)
veor q6, q11, q0 ; *op0 = u^0x80
veor q7, q10, q0 ; *oq0 = u^0x80
veor q5, q13, q0 ; *op1 = u^0x80
veor q8, q12, q0 ; *oq1 = u^0x80
bx lr
ENDP ; |vp9_loop_filter_neon|
;-----------------
END
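Per lane, the helper above computes a filter mask (every neighbor difference within limit, and abs(p0-q0)*2 + abs(p1-q1)/2 within blimit), a high-edge-variance mask (hev: abs(p1-p0) or abs(q1-q0) above thresh), and then the standard four-tap adjustment. A scalar sketch of the adjustment step, with hypothetical names and sclamp as signed 8-bit saturation:

static signed char sclamp(int v) {
  return (signed char)(v < -128 ? -128 : v > 127 ? 127 : v);
}

static void loop_filter_px(unsigned char *p1p, unsigned char *p0p,
                           unsigned char *q0p, unsigned char *q1p,
                           int mask, int hev) {
  signed char p1 = (signed char)(*p1p ^ 0x80), p0 = (signed char)(*p0p ^ 0x80);
  signed char q0 = (signed char)(*q0p ^ 0x80), q1 = (signed char)(*q1p ^ 0x80);
  signed char f, f1, f2;
  if (!mask) return;                     /* vp9_filter_mask failed */
  f = hev ? sclamp(p1 - q1) : 0;         /* inner term only on hev edges */
  f = sclamp(f + 3 * (q0 - p0));
  f1 = sclamp(f + 4) >> 3;               /* Filter1 */
  f2 = sclamp(f + 3) >> 3;               /* Filter2 */
  *q0p = (unsigned char)(sclamp(q0 - f1) ^ 0x80);
  *p0p = (unsigned char)(sclamp(p0 + f2) ^ 0x80);
  if (!hev) {                            /* outer taps: ++vp9_filter >> 1 */
    f = (signed char)((f1 + 1) >> 1);    /* vrshr.s8 #1 */
    *q1p = (unsigned char)(sclamp(q1 - f) ^ 0x80);
    *p1p = (unsigned char)(sclamp(p1 + f) ^ 0x80);
  }
}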

View file

@ -1,117 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
;EXPORT |vp9_loop_filter_simple_horizontal_edge_neon|
EXPORT |vp9_loop_filter_bhs_neon|
EXPORT |vp9_loop_filter_mbhs_neon|
ARM
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *s, PRESERVE
; r1 int p, PRESERVE
; q1 limit, PRESERVE
|vp9_loop_filter_simple_horizontal_edge_neon| PROC
sub r3, r0, r1, lsl #1 ; move src pointer down by 2 lines
vld1.u8 {q7}, [r0@128], r1 ; q0
vld1.u8 {q5}, [r3@128], r1 ; p0
vld1.u8 {q8}, [r0@128] ; q1
vld1.u8 {q6}, [r3@128] ; p1
vabd.u8 q15, q6, q7 ; abs(p0 - q0)
vabd.u8 q14, q5, q8 ; abs(p1 - q1)
vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2
vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2
vmov.u8 q0, #0x80 ; 0x80
vmov.s16 q13, #3
vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value
veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value
veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value
veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value
vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > limit)*-1
vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
vsubl.s8 q3, d15, d13
vqsub.s8 q4, q5, q8 ; q4: vp9_filter = vp9_signed_char_clamp(ps1-qs1)
vmul.s16 q2, q2, q13 ; 3 * ( qs0 - ps0)
vmul.s16 q3, q3, q13
vmov.u8 q10, #0x03 ; 0x03
vmov.u8 q9, #0x04 ; 0x04
vaddw.s8 q2, q2, d8 ; vp9_filter + 3 * ( qs0 - ps0)
vaddw.s8 q3, q3, d9
vqmovn.s16 d8, q2 ; vp9_filter = vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
vqmovn.s16 d9, q3
vand q14, q4, q15 ; vp9_filter &= mask
vqadd.s8 q2, q14, q10 ; Filter2 = vp9_signed_char_clamp(vp9_filter+3)
vqadd.s8 q3, q14, q9 ; Filter1 = vp9_signed_char_clamp(vp9_filter+4)
vshr.s8 q2, q2, #3 ; Filter2 >>= 3
vshr.s8 q4, q3, #3 ; Filter1 >>= 3
sub r0, r0, r1
;calculate output
vqadd.s8 q11, q6, q2 ; u = vp9_signed_char_clamp(ps0 + Filter2)
vqsub.s8 q10, q7, q4 ; u = vp9_signed_char_clamp(qs0 - Filter1)
veor q6, q11, q0 ; *op0 = u^0x80
veor q7, q10, q0 ; *oq0 = u^0x80
vst1.u8 {q6}, [r3@128] ; store op0
vst1.u8 {q7}, [r0@128] ; store oq0
bx lr
ENDP ; |vp9_loop_filter_simple_horizontal_edge_neon|
; r0 unsigned char *y
; r1 int ystride
; r2 const unsigned char *blimit
|vp9_loop_filter_bhs_neon| PROC
push {r4, lr}
ldrb r3, [r2] ; load blim from mem
vdup.s8 q1, r3 ; duplicate blim
add r0, r0, r1, lsl #2 ; src = y_ptr + 4 * y_stride
bl vp9_loop_filter_simple_horizontal_edge_neon
; vp9_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1
add r0, r0, r1, lsl #2 ; src = y_ptr + 8* y_stride
bl vp9_loop_filter_simple_horizontal_edge_neon
add r0, r0, r1, lsl #2 ; src = y_ptr + 12 * y_stride
pop {r4, lr}
b vp9_loop_filter_simple_horizontal_edge_neon
ENDP ;|vp9_loop_filter_bhs_neon|
; r0 unsigned char *y
; r1 int ystride
; r2 const unsigned char *blimit
|vp9_loop_filter_mbhs_neon| PROC
ldrb r3, [r2] ; load blim from mem
vdup.s8 q1, r3 ; duplicate mblim
b vp9_loop_filter_simple_horizontal_edge_neon
ENDP ;|vp9_loop_filter_mbhs_neon|
END
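The two wrappers above simply re-aim the simple horizontal filter at the three inner block edges of a macroblock (bhs) or at a single edge with the macroblock threshold (mbhs). In outline, assuming a scalar simple_horizontal_edge() like the sketch earlier in this commit (the declaration below is hypothetical):

void simple_horizontal_edge(unsigned char *s, int pitch, unsigned char blimit);

/* Inner block edges: rows 4, 8 and 12 of the macroblock. */
static void loop_filter_bhs(unsigned char *y, int ystride,
                            const unsigned char *blimit) {
  simple_horizontal_edge(y + 4 * ystride, ystride, *blimit);
  simple_horizontal_edge(y + 8 * ystride, ystride, *blimit);
  simple_horizontal_edge(y + 12 * ystride, ystride, *blimit);
}

/* Macroblock edge: a single call with the (larger) mblim threshold. */
static void loop_filter_mbhs(unsigned char *y, int ystride,
                             const unsigned char *mblimit) {
  simple_horizontal_edge(y, ystride, *mblimit);
}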

View file

@ -1,154 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
;EXPORT |vp9_loop_filter_simple_vertical_edge_neon|
EXPORT |vp9_loop_filter_bvs_neon|
EXPORT |vp9_loop_filter_mbvs_neon|
ARM
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *s, PRESERVE
; r1 int p, PRESERVE
; q1 limit, PRESERVE
|vp9_loop_filter_simple_vertical_edge_neon| PROC
sub r0, r0, #2 ; move src pointer down by 2 columns
add r12, r1, r1
add r3, r0, r1
vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r12
vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r3], r12
vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r12
vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r3], r12
vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r12
vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r3], r12
vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r12
vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r3], r12
vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r12
vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r3], r12
vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r12
vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r3], r12
vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r12
vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r3], r12
vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r12
vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r3]
vswp d7, d10
vswp d12, d9
;vp9_filter_mask() function
;vp8_hevmask() function
sub r0, r0, r1, lsl #4
vabd.u8 q15, q5, q4 ; abs(p0 - q0)
vabd.u8 q14, q3, q6 ; abs(p1 - q1)
vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2
vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2
vmov.u8 q0, #0x80 ; 0x80
vmov.s16 q11, #3
vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
veor q4, q4, q0 ; qs0: q0 offset to convert to a signed value
veor q5, q5, q0 ; ps0: p0 offset to convert to a signed value
veor q3, q3, q0 ; ps1: p1 offset to convert to a signed value
veor q6, q6, q0 ; qs1: q1 offset to convert to a signed value
vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
vsubl.s8 q2, d8, d10 ; ( qs0 - ps0)
vsubl.s8 q13, d9, d11
vqsub.s8 q14, q3, q6 ; vp9_filter = vp9_signed_char_clamp(ps1-qs1)
vmul.s16 q2, q2, q11 ; 3 * ( qs0 - ps0)
vmul.s16 q13, q13, q11
vmov.u8 q11, #0x03 ; 0x03
vmov.u8 q12, #0x04 ; 0x04
vaddw.s8 q2, q2, d28 ; vp9_filter + 3 * ( qs0 - ps0)
vaddw.s8 q13, q13, d29
vqmovn.s16 d28, q2 ; vp9_filter = vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0))
vqmovn.s16 d29, q13
add r0, r0, #1
add r3, r0, r1
vand q14, q14, q15 ; vp9_filter &= mask
vqadd.s8 q2, q14, q11 ; Filter2 = vp9_signed_char_clamp(vp9_filter+3)
vqadd.s8 q3, q14, q12 ; Filter1 = vp9_signed_char_clamp(vp9_filter+4)
vshr.s8 q2, q2, #3 ; Filter2 >>= 3
vshr.s8 q14, q3, #3 ; Filter1 >>= 3
;calculate output
vqadd.s8 q11, q5, q2 ; u = vp9_signed_char_clamp(ps0 + Filter2)
vqsub.s8 q10, q4, q14 ; u = vp9_signed_char_clamp(qs0 - Filter1)
veor q6, q11, q0 ; *op0 = u^0x80
veor q7, q10, q0 ; *oq0 = u^0x80
add r12, r1, r1
vswp d13, d14
;store op0, oq0 (interleaved pairs)
vst2.8 {d12[0], d13[0]}, [r0], r12
vst2.8 {d12[1], d13[1]}, [r3], r12
vst2.8 {d12[2], d13[2]}, [r0], r12
vst2.8 {d12[3], d13[3]}, [r3], r12
vst2.8 {d12[4], d13[4]}, [r0], r12
vst2.8 {d12[5], d13[5]}, [r3], r12
vst2.8 {d12[6], d13[6]}, [r0], r12
vst2.8 {d12[7], d13[7]}, [r3], r12
vst2.8 {d14[0], d15[0]}, [r0], r12
vst2.8 {d14[1], d15[1]}, [r3], r12
vst2.8 {d14[2], d15[2]}, [r0], r12
vst2.8 {d14[3], d15[3]}, [r3], r12
vst2.8 {d14[4], d15[4]}, [r0], r12
vst2.8 {d14[5], d15[5]}, [r3], r12
vst2.8 {d14[6], d15[6]}, [r0], r12
vst2.8 {d14[7], d15[7]}, [r3]
bx lr
ENDP ; |vp9_loop_filter_simple_vertical_edge_neon|
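
For reference, a scalar sketch of the per-pixel arithmetic this simple filter performs. The mask argument stands in for the blimit test computed at the top of the routine (0 or -1 per pixel), pixels are assumed to be already biased to signed via ^0x80, and the helper names are illustrative rather than libvpx API:

#include <stdint.h>

static int8_t s8_clamp(int v) { return (int8_t)(v < -128 ? -128 : v > 127 ? 127 : v); }

/* Only p0/q0 move; the correction is derived from p1/q1 and 3 * (q0 - p0). */
static void simple_filter(int8_t *p1, int8_t *p0, int8_t *q0, int8_t *q1,
                          int mask) {
  int w = s8_clamp(s8_clamp(*p1 - *q1) + 3 * (*q0 - *p0)) & mask;
  int f1 = s8_clamp(w + 4) >> 3;  /* Filter1 */
  int f2 = s8_clamp(w + 3) >> 3;  /* Filter2 */
  *q0 = s8_clamp(*q0 - f1);
  *p0 = s8_clamp(*p0 + f2);
}
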
; r0 unsigned char *y
; r1 int ystride
; r2 const unsigned char *blimit
|vp9_loop_filter_bvs_neon| PROC
push {r4, lr}
ldrb r3, [r2] ; load blim from mem
mov r4, r0
add r0, r0, #4
vdup.s8 q1, r3 ; duplicate blim
bl vp9_loop_filter_simple_vertical_edge_neon
; vp9_loop_filter_simple_vertical_edge_neon preserves r1 and q1
add r0, r4, #8
bl vp9_loop_filter_simple_vertical_edge_neon
add r0, r4, #12
pop {r4, lr}
b vp9_loop_filter_simple_vertical_edge_neon
ENDP ;|vp9_loop_filter_bvs_neon|
; r0 unsigned char *y
; r1 int ystride
; r2 const unsigned char *blimit
|vp9_loop_filter_mbvs_neon| PROC
ldrb r3, [r2] ; load mblim from mem
vdup.s8 q1, r3 ; duplicate mblim
b vp9_loop_filter_simple_vertical_edge_neon
ENDP ;|vp9_loop_filter_mbvs_neon|
END

View file

@ -1,469 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_mbloop_filter_horizontal_edge_y_neon|
EXPORT |vp8_mbloop_filter_horizontal_edge_uv_neon|
EXPORT |vp8_mbloop_filter_vertical_edge_y_neon|
EXPORT |vp8_mbloop_filter_vertical_edge_uv_neon|
ARM
AREA ||.text||, CODE, READONLY, ALIGN=2
; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
; const unsigned char *blimit,
; const unsigned char *limit,
; const unsigned char *thresh)
; r0 unsigned char *src,
; r1 int pitch,
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
|vp8_mbloop_filter_horizontal_edge_y_neon| PROC
push {lr}
add r1, r1, r1 ; double stride
ldr r12, [sp, #4] ; load thresh
sub r0, r0, r1, lsl #1 ; move src pointer down by 4 lines
vdup.u8 q2, r12 ; thresh
add r12, r0, r1, lsr #1 ; move src pointer up by 1 line
vld1.u8 {q3}, [r0@128], r1 ; p3
vld1.u8 {q4}, [r12@128], r1 ; p2
vld1.u8 {q5}, [r0@128], r1 ; p1
vld1.u8 {q6}, [r12@128], r1 ; p0
vld1.u8 {q7}, [r0@128], r1 ; q0
vld1.u8 {q8}, [r12@128], r1 ; q1
vld1.u8 {q9}, [r0@128], r1 ; q2
vld1.u8 {q10}, [r12@128], r1 ; q3
bl vp8_mbloop_filter_neon
sub r12, r12, r1, lsl #2
add r0, r12, r1, lsr #1
vst1.u8 {q4}, [r12@128],r1 ; store op2
vst1.u8 {q5}, [r0@128],r1 ; store op1
vst1.u8 {q6}, [r12@128], r1 ; store op0
vst1.u8 {q7}, [r0@128],r1 ; store oq0
vst1.u8 {q8}, [r12@128] ; store oq1
vst1.u8 {q9}, [r0@128] ; store oq2
pop {pc}
ENDP ; |vp8_mbloop_filter_horizontal_edge_y_neon|
; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
; const unsigned char *blimit,
; const unsigned char *limit,
; const unsigned char *thresh,
; unsigned char *v)
; r0 unsigned char *u,
; r1 int pitch,
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
; sp+4 unsigned char *v
|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
push {lr}
ldr r12, [sp, #4] ; load thresh
sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines
vdup.u8 q2, r12 ; thresh
ldr r12, [sp, #8] ; load v ptr
sub r12, r12, r1, lsl #2 ; move v pointer down by 4 lines
vld1.u8 {d6}, [r0@64], r1 ; p3
vld1.u8 {d7}, [r12@64], r1 ; p3
vld1.u8 {d8}, [r0@64], r1 ; p2
vld1.u8 {d9}, [r12@64], r1 ; p2
vld1.u8 {d10}, [r0@64], r1 ; p1
vld1.u8 {d11}, [r12@64], r1 ; p1
vld1.u8 {d12}, [r0@64], r1 ; p0
vld1.u8 {d13}, [r12@64], r1 ; p0
vld1.u8 {d14}, [r0@64], r1 ; q0
vld1.u8 {d15}, [r12@64], r1 ; q0
vld1.u8 {d16}, [r0@64], r1 ; q1
vld1.u8 {d17}, [r12@64], r1 ; q1
vld1.u8 {d18}, [r0@64], r1 ; q2
vld1.u8 {d19}, [r12@64], r1 ; q2
vld1.u8 {d20}, [r0@64], r1 ; q3
vld1.u8 {d21}, [r12@64], r1 ; q3
bl vp8_mbloop_filter_neon
sub r0, r0, r1, lsl #3
sub r12, r12, r1, lsl #3
add r0, r0, r1
add r12, r12, r1
vst1.u8 {d8}, [r0@64], r1 ; store u op2
vst1.u8 {d9}, [r12@64], r1 ; store v op2
vst1.u8 {d10}, [r0@64], r1 ; store u op1
vst1.u8 {d11}, [r12@64], r1 ; store v op1
vst1.u8 {d12}, [r0@64], r1 ; store u op0
vst1.u8 {d13}, [r12@64], r1 ; store v op0
vst1.u8 {d14}, [r0@64], r1 ; store u oq0
vst1.u8 {d15}, [r12@64], r1 ; store v oq0
vst1.u8 {d16}, [r0@64], r1 ; store u oq1
vst1.u8 {d17}, [r12@64], r1 ; store v oq1
vst1.u8 {d18}, [r0@64], r1 ; store u oq2
vst1.u8 {d19}, [r12@64], r1 ; store v oq2
pop {pc}
ENDP ; |vp8_mbloop_filter_horizontal_edge_uv_neon|
; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
; const unsigned char *blimit,
; const unsigned char *limit,
; const unsigned char *thresh)
; r0 unsigned char *src,
; r1 int pitch,
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
|vp8_mbloop_filter_vertical_edge_y_neon| PROC
push {lr}
ldr r12, [sp, #4] ; load thresh
sub r0, r0, #4 ; move src pointer down by 4 columns
vdup.s8 q2, r12 ; thresh
add r12, r0, r1, lsl #3 ; move src pointer down by 8 lines
vld1.u8 {d6}, [r0], r1 ; load first 8-line src data
vld1.u8 {d7}, [r12], r1 ; load second 8-line src data
vld1.u8 {d8}, [r0], r1
vld1.u8 {d9}, [r12], r1
vld1.u8 {d10}, [r0], r1
vld1.u8 {d11}, [r12], r1
vld1.u8 {d12}, [r0], r1
vld1.u8 {d13}, [r12], r1
vld1.u8 {d14}, [r0], r1
vld1.u8 {d15}, [r12], r1
vld1.u8 {d16}, [r0], r1
vld1.u8 {d17}, [r12], r1
vld1.u8 {d18}, [r0], r1
vld1.u8 {d19}, [r12], r1
vld1.u8 {d20}, [r0], r1
vld1.u8 {d21}, [r12], r1
;transpose to 8x16 matrix
vtrn.32 q3, q7
vtrn.32 q4, q8
vtrn.32 q5, q9
vtrn.32 q6, q10
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
vtrn.16 q8, q10
vtrn.8 q3, q4
vtrn.8 q5, q6
vtrn.8 q7, q8
vtrn.8 q9, q10
sub r0, r0, r1, lsl #3
bl vp8_mbloop_filter_neon
sub r12, r12, r1, lsl #3
;transpose to 16x8 matrix
vtrn.32 q3, q7
vtrn.32 q4, q8
vtrn.32 q5, q9
vtrn.32 q6, q10
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
vtrn.16 q8, q10
vtrn.8 q3, q4
vtrn.8 q5, q6
vtrn.8 q7, q8
vtrn.8 q9, q10
;store op2, op1, op0, oq0, oq1, oq2
vst1.8 {d6}, [r0], r1
vst1.8 {d7}, [r12], r1
vst1.8 {d8}, [r0], r1
vst1.8 {d9}, [r12], r1
vst1.8 {d10}, [r0], r1
vst1.8 {d11}, [r12], r1
vst1.8 {d12}, [r0], r1
vst1.8 {d13}, [r12], r1
vst1.8 {d14}, [r0], r1
vst1.8 {d15}, [r12], r1
vst1.8 {d16}, [r0], r1
vst1.8 {d17}, [r12], r1
vst1.8 {d18}, [r0], r1
vst1.8 {d19}, [r12], r1
vst1.8 {d20}, [r0]
vst1.8 {d21}, [r12]
pop {pc}
ENDP ; |vp8_mbloop_filter_vertical_edge_y_neon|
; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
; const unsigned char *blimit,
; const unsigned char *limit,
; const unsigned char *thresh,
; unsigned char *v)
; r0 unsigned char *u,
; r1 int pitch,
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
; sp+4 unsigned char *v
|vp8_mbloop_filter_vertical_edge_uv_neon| PROC
push {lr}
ldr r12, [sp, #4] ; load thresh
sub r0, r0, #4 ; move u pointer down by 4 columns
vdup.u8 q2, r12 ; thresh
ldr r12, [sp, #8] ; load v ptr
sub r12, r12, #4 ; move v pointer down by 4 columns
vld1.u8 {d6}, [r0], r1 ;load u data
vld1.u8 {d7}, [r12], r1 ;load v data
vld1.u8 {d8}, [r0], r1
vld1.u8 {d9}, [r12], r1
vld1.u8 {d10}, [r0], r1
vld1.u8 {d11}, [r12], r1
vld1.u8 {d12}, [r0], r1
vld1.u8 {d13}, [r12], r1
vld1.u8 {d14}, [r0], r1
vld1.u8 {d15}, [r12], r1
vld1.u8 {d16}, [r0], r1
vld1.u8 {d17}, [r12], r1
vld1.u8 {d18}, [r0], r1
vld1.u8 {d19}, [r12], r1
vld1.u8 {d20}, [r0], r1
vld1.u8 {d21}, [r12], r1
;transpose to 8x16 matrix
vtrn.32 q3, q7
vtrn.32 q4, q8
vtrn.32 q5, q9
vtrn.32 q6, q10
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
vtrn.16 q8, q10
vtrn.8 q3, q4
vtrn.8 q5, q6
vtrn.8 q7, q8
vtrn.8 q9, q10
sub r0, r0, r1, lsl #3
bl vp8_mbloop_filter_neon
sub r12, r12, r1, lsl #3
;transpose to 16x8 matrix
vtrn.32 q3, q7
vtrn.32 q4, q8
vtrn.32 q5, q9
vtrn.32 q6, q10
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
vtrn.16 q8, q10
vtrn.8 q3, q4
vtrn.8 q5, q6
vtrn.8 q7, q8
vtrn.8 q9, q10
;store op2, op1, op0, oq0, oq1, oq2
vst1.8 {d6}, [r0], r1
vst1.8 {d7}, [r12], r1
vst1.8 {d8}, [r0], r1
vst1.8 {d9}, [r12], r1
vst1.8 {d10}, [r0], r1
vst1.8 {d11}, [r12], r1
vst1.8 {d12}, [r0], r1
vst1.8 {d13}, [r12], r1
vst1.8 {d14}, [r0], r1
vst1.8 {d15}, [r12], r1
vst1.8 {d16}, [r0], r1
vst1.8 {d17}, [r12], r1
vst1.8 {d18}, [r0], r1
vst1.8 {d19}, [r12], r1
vst1.8 {d20}, [r0]
vst1.8 {d21}, [r12]
pop {pc}
ENDP ; |vp8_mbloop_filter_vertical_edge_uv_neon|
; void vp8_mbloop_filter_neon()
; This is a helper function for the macroblock loopfilters. The individual
; functions do the necessary load, transpose (if necessary), preserve (if
; necessary) and store.
; r0,r1 PRESERVE
; r2 mblimit
; r3 limit
; q2 thresh
; q3 p3 PRESERVE
; q4 p2
; q5 p1
; q6 p0
; q7 q0
; q8 q1
; q9 q2
; q10 q3 PRESERVE
|vp8_mbloop_filter_neon| PROC
; vp9_filter_mask
vabd.u8 q11, q3, q4 ; abs(p3 - p2)
vabd.u8 q12, q4, q5 ; abs(p2 - p1)
vabd.u8 q13, q5, q6 ; abs(p1 - p0)
vabd.u8 q14, q8, q7 ; abs(q1 - q0)
vabd.u8 q1, q9, q8 ; abs(q2 - q1)
vabd.u8 q0, q10, q9 ; abs(q3 - q2)
vmax.u8 q11, q11, q12
vmax.u8 q12, q13, q14
vmax.u8 q1, q1, q0
vmax.u8 q15, q11, q12
vabd.u8 q12, q6, q7 ; abs(p0 - q0)
; vp8_hevmask
vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh) * -1
vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh) * -1
vmax.u8 q15, q15, q1
vdup.u8 q1, r3 ; limit
vdup.u8 q2, r2 ; mblimit
vmov.u8 q0, #0x80 ; 0x80
vcge.u8 q15, q1, q15
vabd.u8 q1, q5, q8 ; a = abs(p1 - q1)
vqadd.u8 q12, q12, q12 ; b = abs(p0 - q0) * 2
vmov.u16 q11, #3 ; #3
; vp9_filter
; convert to signed
veor q7, q7, q0 ; qs0
vshr.u8 q1, q1, #1 ; a = a / 2
veor q6, q6, q0 ; ps0
veor q5, q5, q0 ; ps1
vqadd.u8 q12, q12, q1 ; a = b + a
veor q8, q8, q0 ; qs1
veor q4, q4, q0 ; ps2
veor q9, q9, q0 ; qs2
vorr q14, q13, q14 ; vp8_hevmask
vcge.u8 q12, q2, q12 ; (a > flimit * 2 + limit) * -1
vsubl.s8 q2, d14, d12 ; qs0 - ps0
vsubl.s8 q13, d15, d13
vqsub.s8 q1, q5, q8 ; vp9_filter = clamp(ps1-qs1)
vmul.i16 q2, q2, q11 ; 3 * ( qs0 - ps0)
vand q15, q15, q12 ; vp9_filter_mask
vmul.i16 q13, q13, q11
vmov.u8 q12, #3 ; #3
vaddw.s8 q2, q2, d2 ; vp9_filter + 3 * ( qs0 - ps0)
vaddw.s8 q13, q13, d3
vmov.u8 q11, #4 ; #4
; vp9_filter = clamp(vp9_filter + 3 * ( qs0 - ps0))
vqmovn.s16 d2, q2
vqmovn.s16 d3, q13
vand q1, q1, q15 ; vp9_filter &= mask
vmov.u16 q15, #63 ; #63
vand q13, q1, q14 ; Filter2 &= hev
vqadd.s8 q2, q13, q11 ; Filter1 = clamp(Filter2+4)
vqadd.s8 q13, q13, q12 ; Filter2 = clamp(Filter2+3)
vmov q0, q15
vshr.s8 q2, q2, #3 ; Filter1 >>= 3
vshr.s8 q13, q13, #3 ; Filter2 >>= 3
vmov q11, q15
vmov q12, q15
vqsub.s8 q7, q7, q2 ; qs0 = clamp(qs0 - Filter1)
vqadd.s8 q6, q6, q13 ; ps0 = clamp(ps0 + Filter2)
vbic q1, q1, q14 ; vp9_filter &= ~hev
; roughly 1/7th difference across boundary
; roughly 2/7th difference across boundary
; roughly 3/7th difference across boundary
vmov.u8 d5, #9 ; #9
vmov.u8 d4, #18 ; #18
vmov q13, q15
vmov q14, q15
vmlal.s8 q0, d2, d5 ; 63 + Filter2 * 9
vmlal.s8 q11, d3, d5
vmov.u8 d5, #27 ; #27
vmlal.s8 q12, d2, d4 ; 63 + Filter2 * 18
vmlal.s8 q13, d3, d4
vmlal.s8 q14, d2, d5 ; 63 + Filter2 * 27
vmlal.s8 q15, d3, d5
vqshrn.s16 d0, q0, #7 ; u = clamp((63 + Filter2 * 9)>>7)
vqshrn.s16 d1, q11, #7
vqshrn.s16 d24, q12, #7 ; u = clamp((63 + Filter2 * 18)>>7)
vqshrn.s16 d25, q13, #7
vqshrn.s16 d28, q14, #7 ; u = clamp((63 + Filter2 * 27)>>7)
vqshrn.s16 d29, q15, #7
vmov.u8 q1, #0x80 ; 0x80
vqsub.s8 q11, q9, q0 ; s = clamp(qs2 - u)
vqadd.s8 q0, q4, q0 ; s = clamp(ps2 + u)
vqsub.s8 q13, q8, q12 ; s = clamp(qs1 - u)
vqadd.s8 q12, q5, q12 ; s = clamp(ps1 + u)
vqsub.s8 q15, q7, q14 ; s = clamp(qs0 - u)
vqadd.s8 q14, q6, q14 ; s = clamp(ps0 + u)
veor q9, q11, q1 ; *oq2 = s^0x80
veor q4, q0, q1 ; *op2 = s^0x80
veor q8, q13, q1 ; *oq1 = s^0x80
veor q5, q12, q1 ; *op1 = s^0x80
veor q7, q15, q1 ; *oq0 = s^0x80
veor q6, q14, q1 ; *op0 = s^0x80
bx lr
ENDP ; |vp8_mbloop_filter_neon|
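
A rough scalar model of what this helper computes per pixel column, assuming mask and hev have already been derived exactly as above (as 0/-1 per-pixel masks) and that pixels are already biased to signed via ^0x80. The names are illustrative, not libvpx API:

#include <stdint.h>

static int8_t s8_clamp(int v) { return (int8_t)(v < -128 ? -128 : v > 127 ? 127 : v); }

static void mb_filter_column(int8_t *p2, int8_t *p1, int8_t *p0,
                             int8_t *q0, int8_t *q1, int8_t *q2,
                             int mask, int hev) {
  int w = s8_clamp(s8_clamp(*p1 - *q1) + 3 * (*q0 - *p0)) & mask;

  int f = w & hev;                                /* high-edge-variance part */
  *q0 = s8_clamp(*q0 - (s8_clamp(f + 4) >> 3));   /* Filter1 */
  *p0 = s8_clamp(*p0 + (s8_clamp(f + 3) >> 3));   /* Filter2 */

  w &= ~hev;                    /* remaining taps taper toward the interior */
  int u = s8_clamp((63 + w * 27) >> 7);
  *q0 = s8_clamp(*q0 - u); *p0 = s8_clamp(*p0 + u);
  u = s8_clamp((63 + w * 18) >> 7);
  *q1 = s8_clamp(*q1 - u); *p1 = s8_clamp(*p1 + u);
  u = s8_clamp((63 + w * 9) >> 7);
  *q2 = s8_clamp(*q2 - u); *p2 = s8_clamp(*p2 + u);
}
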
;-----------------
END

View file

@ -1,131 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_recon16x16mb_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *pred_ptr,
; r1 short *diff_ptr,
; r2 unsigned char *dst_ptr,
; r3 int ystride,
; stack unsigned char *udst_ptr,
; stack unsigned char *vdst_ptr
|vp8_recon16x16mb_neon| PROC
mov r12, #4 ;loop counter for Y loop
recon16x16mb_loop_y
vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr
vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr
vld1.u8 {q14, q15}, [r0]!
vld1.16 {q10, q11}, [r1]!
vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits
vmovl.u8 q1, d25
vmovl.u8 q2, d26
vmovl.u8 q3, d27
vmovl.u8 q4, d28
vmovl.u8 q5, d29
vmovl.u8 q6, d30
vld1.16 {q12, q13}, [r1]!
vmovl.u8 q7, d31
vld1.16 {q14, q15}, [r1]!
pld [r0]
pld [r1]
pld [r1, #64]
vadd.s16 q0, q0, q8 ;add Diff data and Pred data together
vadd.s16 q1, q1, q9
vadd.s16 q2, q2, q10
vadd.s16 q3, q3, q11
vadd.s16 q4, q4, q12
vadd.s16 q5, q5, q13
vadd.s16 q6, q6, q14
vadd.s16 q7, q7, q15
vqmovun.s16 d0, q0 ;CLAMP() saturation
vqmovun.s16 d1, q1
vqmovun.s16 d2, q2
vqmovun.s16 d3, q3
vqmovun.s16 d4, q4
vqmovun.s16 d5, q5
vst1.u8 {q0}, [r2], r3 ;store result
vqmovun.s16 d6, q6
vst1.u8 {q1}, [r2], r3
vqmovun.s16 d7, q7
vst1.u8 {q2}, [r2], r3
subs r12, r12, #1
moveq r12, #2 ;loop counter for UV loop
vst1.u8 {q3}, [r2], r3
bne recon16x16mb_loop_y
mov r3, r3, lsr #1 ;uv_stride = ystride>>1
ldr r2, [sp] ;load upred_ptr
recon16x16mb_loop_uv
vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr
vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr
vld1.u8 {q14, q15}, [r0]!
vld1.16 {q10, q11}, [r1]!
vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits
vmovl.u8 q1, d25
vmovl.u8 q2, d26
vmovl.u8 q3, d27
vmovl.u8 q4, d28
vmovl.u8 q5, d29
vmovl.u8 q6, d30
vld1.16 {q12, q13}, [r1]!
vmovl.u8 q7, d31
vld1.16 {q14, q15}, [r1]!
vadd.s16 q0, q0, q8 ;add Diff data and Pred data together
vadd.s16 q1, q1, q9
vadd.s16 q2, q2, q10
vadd.s16 q3, q3, q11
vadd.s16 q4, q4, q12
vadd.s16 q5, q5, q13
vadd.s16 q6, q6, q14
vqmovun.s16 d0, q0 ;CLAMP() saturation
vadd.s16 q7, q7, q15
vqmovun.s16 d1, q1
vqmovun.s16 d2, q2
vqmovun.s16 d3, q3
vst1.u8 {d0}, [r2], r3 ;store result
vqmovun.s16 d4, q4
vst1.u8 {d1}, [r2], r3
vqmovun.s16 d5, q5
vst1.u8 {d2}, [r2], r3
vqmovun.s16 d6, q6
vst1.u8 {d3}, [r2], r3
vqmovun.s16 d7, q7
vst1.u8 {d4}, [r2], r3
subs r12, r12, #1
vst1.u8 {d5}, [r2], r3
vst1.u8 {d6}, [r2], r3
vst1.u8 {d7}, [r2], r3
ldrne r2, [sp, #4] ;load vpred_ptr
bne recon16x16mb_loop_uv
bx lr
ENDP
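
Per pixel, all of these recon routines reduce to widening the 8-bit prediction, adding the 16-bit residual, and saturating back to [0, 255] (the vqmovun.s16 step). A minimal scalar sketch, assuming packed pred/diff block buffers and a strided destination as in the asm above; for the 16x16 macroblock case the Y plane uses ystride and the two 8x8 chroma planes use ystride >> 1:

#include <stdint.h>

static uint8_t u8_clamp(int v) { return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); }

static void recon_block(const uint8_t *pred, const int16_t *diff,
                        uint8_t *dst, int stride, int w, int h) {
  for (int r = 0; r < h; ++r) {
    for (int c = 0; c < w; ++c)
      dst[c] = u8_clamp(pred[c] + diff[c]);  /* add residual, saturate */
    pred += w;      /* pred/diff are packed w x h buffers */
    diff += w;
    dst += stride;  /* dst lives in the frame, hence the stride */
  }
}
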
END

View file

@ -1,54 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_recon2b_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *pred_ptr,
; r1 short *diff_ptr,
; r2 unsigned char *dst_ptr,
; r3 int stride
|vp8_recon2b_neon| PROC
vld1.u8 {q8, q9}, [r0] ;load data from pred_ptr
vld1.16 {q4, q5}, [r1]! ;load data from diff_ptr
vmovl.u8 q0, d16 ;modify Pred data from 8 bits to 16 bits
vld1.16 {q6, q7}, [r1]!
vmovl.u8 q1, d17
vmovl.u8 q2, d18
vmovl.u8 q3, d19
vadd.s16 q0, q0, q4 ;add Diff data and Pred data together
vadd.s16 q1, q1, q5
vadd.s16 q2, q2, q6
vadd.s16 q3, q3, q7
vqmovun.s16 d0, q0 ;CLAMP() saturation
vqmovun.s16 d1, q1
vqmovun.s16 d2, q2
vqmovun.s16 d3, q3
add r0, r2, r3
vst1.u8 {d0}, [r2] ;store result
vst1.u8 {d1}, [r0], r3
add r2, r0, r3
vst1.u8 {d2}, [r0]
vst1.u8 {d3}, [r2], r3
bx lr
ENDP
END

View file

@ -1,69 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_recon4b_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *pred_ptr,
; r1 short *diff_ptr,
; r2 unsigned char *dst_ptr,
; r3 int stride
|vp8_recon4b_neon| PROC
vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr
vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr
vld1.u8 {q14, q15}, [r0]
vld1.16 {q10, q11}, [r1]!
vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits
vmovl.u8 q1, d25
vmovl.u8 q2, d26
vmovl.u8 q3, d27
vmovl.u8 q4, d28
vmovl.u8 q5, d29
vmovl.u8 q6, d30
vld1.16 {q12, q13}, [r1]!
vmovl.u8 q7, d31
vld1.16 {q14, q15}, [r1]
vadd.s16 q0, q0, q8 ;add Diff data and Pred data together
vadd.s16 q1, q1, q9
vadd.s16 q2, q2, q10
vadd.s16 q3, q3, q11
vadd.s16 q4, q4, q12
vadd.s16 q5, q5, q13
vadd.s16 q6, q6, q14
vadd.s16 q7, q7, q15
vqmovun.s16 d0, q0 ;CLAMP() saturation
vqmovun.s16 d1, q1
vqmovun.s16 d2, q2
vqmovun.s16 d3, q3
vqmovun.s16 d4, q4
vqmovun.s16 d5, q5
vqmovun.s16 d6, q6
vqmovun.s16 d7, q7
add r0, r2, r3
vst1.u8 {q0}, [r2] ;store result
vst1.u8 {q1}, [r0], r3
add r2, r0, r3
vst1.u8 {q2}, [r0]
vst1.u8 {q3}, [r2], r3
bx lr
ENDP
END

View file

@ -1,29 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_ports/config.h"
#include "vp9/common/recon.h"
#include "vp9/common/vp9_blockd.h"
extern void vp8_recon16x16mb_neon(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int ystride, unsigned char *udst_ptr, unsigned char *vdst_ptr);
void vp8_recon_mb_neon(MACROBLOCKD *xd) {
unsigned char *pred_ptr = &xd->predictor[0];
short *diff_ptr = &xd->diff[0];
unsigned char *dst_ptr = xd->dst.y_buffer;
unsigned char *udst_ptr = xd->dst.u_buffer;
unsigned char *vdst_ptr = xd->dst.v_buffer;
int ystride = xd->dst.y_stride;
/*int uv_stride = xd->dst.uv_stride;*/
vp8_recon16x16mb_neon(pred_ptr, diff_ptr, dst_ptr, ystride,
udst_ptr, vdst_ptr);
}

View file

@ -1,61 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_recon_b_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *pred_ptr,
; r1 short *diff_ptr,
; r2 unsigned char *dst_ptr,
; r3 int stride
|vp8_recon_b_neon| PROC
mov r12, #16
vld1.u8 {d28}, [r0], r12 ;load 4 data/line from pred_ptr
vld1.16 {q10, q11}, [r1]! ;load data from diff_ptr
vld1.u8 {d29}, [r0], r12
vld1.16 {q11, q12}, [r1]!
vld1.u8 {d30}, [r0], r12
vld1.16 {q12, q13}, [r1]!
vld1.u8 {d31}, [r0], r12
vld1.16 {q13}, [r1]
vmovl.u8 q0, d28 ;modify Pred data from 8 bits to 16 bits
vmovl.u8 q1, d29 ;Pred data in d0, d2, d4, d6
vmovl.u8 q2, d30
vmovl.u8 q3, d31
vadd.s16 d0, d0, d20 ;add Diff data and Pred data together
vadd.s16 d2, d2, d22
vadd.s16 d4, d4, d24
vadd.s16 d6, d6, d26
vqmovun.s16 d0, q0 ;CLAMP() saturation
vqmovun.s16 d1, q1
vqmovun.s16 d2, q2
vqmovun.s16 d3, q3
add r1, r2, r3
vst1.32 {d0[0]}, [r2] ;store result
vst1.32 {d1[0]}, [r1], r3
add r2, r1, r3
vst1.32 {d2[0]}, [r1]
vst1.32 {d3[0]}, [r2], r3
bx lr
ENDP
END

View file

@ -1,36 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp9_push_neon|
EXPORT |vp9_pop_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
|vp9_push_neon| PROC
vst1.i64 {d8, d9, d10, d11}, [r0]!
vst1.i64 {d12, d13, d14, d15}, [r0]!
bx lr
ENDP
|vp9_pop_neon| PROC
vld1.i64 {d8, d9, d10, d11}, [r0]!
vld1.i64 {d12, d13, d14, d15}, [r0]!
bx lr
ENDP
END

View file

@ -1,67 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_short_idct4x4llm_1_neon|
EXPORT |vp8_dc_only_idct_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
; r0 short *input;
; r1 short *output;
; r2 int pitch;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|vp8_short_idct4x4llm_1_neon| PROC
vld1.16 {d0[]}, [r0] ;load input[0]
add r3, r1, r2
add r12, r3, r2
vrshr.s16 d0, d0, #3
add r0, r12, r2
vst1.16 {d0}, [r1]
vst1.16 {d0}, [r3]
vst1.16 {d0}, [r12]
vst1.16 {d0}, [r0]
bx lr
ENDP
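
In scalar terms this DC-only path is a single rounded shift replicated across the 4x4 block: vrshr.s16 #3 computes (x + 4) >> 3. A sketch, assuming pitch is in bytes, as the pointer arithmetic above implies:

#include <stdint.h>

static void idct4x4_dc_only(int16_t dc, int16_t *output, int pitch) {
  int16_t v = (int16_t)((dc + 4) >> 3);       /* rounding shift by 3 */
  int stride = pitch / (int)sizeof(int16_t);  /* pitch is in bytes */
  for (int r = 0; r < 4; ++r)
    for (int c = 0; c < 4; ++c)
      output[r * stride + c] = v;
}
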
;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
; r0 short input_dc;
; r1 short *output;
; r2 int pitch;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|vp8_dc_only_idct_neon| PROC
vdup.16 d0, r0
add r3, r1, r2
add r12, r3, r2
vrshr.s16 d0, d0, #3
add r0, r12, r2
vst1.16 {d0}, [r1]
vst1.16 {d0}, [r3]
vst1.16 {d0}, [r12]
vst1.16 {d0}, [r0]
bx lr
ENDP
END

View file

@ -1,122 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_short_idct4x4llm_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;*************************************************************
;void vp8_short_idct4x4llm_c(short *input, short *output, int pitch)
;r0 short * input
;r1 short * output
;r2 int pitch
;*************************************************************
;static const int cospi8sqrt2minus1=20091;
;static const int sinpi8sqrt2 =35468;
;static const int rounding = 0;
;Optimization note: The resulting data from dequantization are signed 13-bit values in
;the range [-4096, 4095]. This allows the "vqdmulh" (NEON) instruction to be used, since
;the doubled product cannot overflow (13+16+1 = 30 bits < 32 bits). The instruction
;returns the high half of the multiplication result, which is exactly what the IDCT needs.
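
In scalar form, vqdmulh.s16 returns the high half of the doubled product; a sketch (the saturating case cannot arise here given the 13-bit inputs):

#include <stdint.h>

/* vqdmulh.s16: (2*a*b) >> 16 with saturation; for 13-bit inputs the doubled
   product needs at most 13+16+1 = 30 bits, so it always fits in 32 bits. */
static int16_t vqdmulh_s16(int16_t a, int16_t b) {
  if (a == INT16_MIN && b == INT16_MIN) return INT16_MAX;  /* only saturating case */
  return (int16_t)((2 * (int32_t)a * (int32_t)b) >> 16);   /* arithmetic shift assumed */
}

Since 35468 does not fit in a signed 16-bit lane, it is stored as 35468 - 65536 = -30068 (0x8a8c); vqdmulh by that value yields (x*35468 >> 15) - 2x, which is why the code below shifts right by one and adds the input back to recover x*35468 >> 16.
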
|vp8_short_idct4x4llm_neon| PROC
adr r12, idct_coeff
vld1.16 {q1, q2}, [r0]
vld1.16 {d0}, [r12]
vswp d3, d4 ;q2(vp[4] vp[12])
vqdmulh.s16 q3, q2, d0[2]
vqdmulh.s16 q4, q2, d0[0]
vqadd.s16 d12, d2, d3 ;a1
vqsub.s16 d13, d2, d3 ;b1
vshr.s16 q3, q3, #1
vshr.s16 q4, q4, #1
vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negative number)
vqadd.s16 q4, q4, q2
;d6 - c1:temp1
;d7 - d1:temp2
;d8 - d1:temp1
;d9 - c1:temp2
vqsub.s16 d10, d6, d9 ;c1
vqadd.s16 d11, d7, d8 ;d1
vqadd.s16 d2, d12, d11
vqadd.s16 d3, d13, d10
vqsub.s16 d4, d13, d10
vqsub.s16 d5, d12, d11
vtrn.32 d2, d4
vtrn.32 d3, d5
vtrn.16 d2, d3
vtrn.16 d4, d5
vswp d3, d4
vqdmulh.s16 q3, q2, d0[2]
vqdmulh.s16 q4, q2, d0[0]
vqadd.s16 d12, d2, d3 ;a1
vqsub.s16 d13, d2, d3 ;b1
vshr.s16 q3, q3, #1
vshr.s16 q4, q4, #1
vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negative number)
vqadd.s16 q4, q4, q2
vqsub.s16 d10, d6, d9 ;c1
vqadd.s16 d11, d7, d8 ;d1
vqadd.s16 d2, d12, d11
vqadd.s16 d3, d13, d10
vqsub.s16 d4, d13, d10
vqsub.s16 d5, d12, d11
vrshr.s16 d2, d2, #3
vrshr.s16 d3, d3, #3
vrshr.s16 d4, d4, #3
vrshr.s16 d5, d5, #3
add r3, r1, r2
add r12, r3, r2
add r0, r12, r2
vtrn.32 d2, d4
vtrn.32 d3, d5
vtrn.16 d2, d3
vtrn.16 d4, d5
vst1.16 {d2}, [r1]
vst1.16 {d3}, [r3]
vst1.16 {d4}, [r12]
vst1.16 {d5}, [r0]
bx lr
ENDP
;-----------------
idct_coeff
DCD 0x4e7b4e7b, 0x8a8c8a8c
;20091, 20091, 35468, 35468
END

View file

@ -1,490 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_sixtap_predict16x16_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
filter16_coeff
DCD 0, 0, 128, 0, 0, 0, 0, 0
DCD 0, -6, 123, 12, -1, 0, 0, 0
DCD 2, -11, 108, 36, -8, 1, 0, 0
DCD 0, -9, 93, 50, -6, 0, 0, 0
DCD 3, -16, 77, 77, -16, 3, 0, 0
DCD 0, -6, 50, 93, -9, 0, 0, 0
DCD 1, -8, 36, 108, -11, 2, 0, 0
DCD 0, -1, 12, 123, -6, 0, 0, 0
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; r4 unsigned char *dst_ptr,
; stack(r5) int dst_pitch
;Note: To take advantage of the 8-bit multiplication instruction in NEON, first apply
; abs() to the filter coeffs to make them u8, then use vmlsl for the negative coeffs.
; After multiplication the result can be negative, so it is treated as s16. However, the
; result can also be a large positive number (> 2^15-1), which would be misread as a
; negative value. To avoid that error, apply the filter coeffs in the order 0, 1, 4, 5, 2,
; which ensures that the intermediate sum stays in s16 range, then add in the 3rd filter
; coeff with a saturating add. The same applies to the other filter functions.
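
The ordering trick this note describes, modeled in scalar C for one output pixel (illustrative only; the names are not from libvpx):

#include <stdint.h>

static int16_t s16_sat_add(int a, int b) {
  int s = a + b;
  return (int16_t)(s > 32767 ? 32767 : s < -32768 ? -32768 : s);
}

static uint8_t sixtap_pixel(const uint8_t *src, const int16_t *f) {
  /* f holds the signed 6-tap coefficients for src[-2..3]; the partial sum
     over taps 0, 1, 4, 5, 2 stays within s16 for every row of the table. */
  int acc = src[-2] * f[0] + src[-1] * f[1] +
            src[2]  * f[4] + src[3]  * f[5] +
            src[0]  * f[2];
  int sum = s16_sat_add(acc, src[1] * f[3]);  /* largest tap folded in last */
  int v = (sum + 64) >> 7;                    /* round */
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);  /* vqrshrun.s16 #7 */
}
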
|vp8_sixtap_predict16x16_neon| PROC
push {r4-r5, lr}
adr r12, filter16_coeff
ldr r4, [sp, #12] ;load parameters from stack
ldr r5, [sp, #16] ;load parameters from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_filter16x16_only
add r2, r12, r2, lsl #5 ;calculate filter location
cmp r3, #0 ;skip second_pass filter if yoffset=0
vld1.s32 {q14, q15}, [r2] ;load first_pass filter
beq firstpass_filter16x16_only
sub sp, sp, #336 ;reserve space on stack for temporary storage
mov lr, sp
vabs.s32 q12, q14
vabs.s32 q13, q15
mov r2, #7 ;loop counter
sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
sub r0, r0, r1, lsl #1
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vdup.8 d1, d24[4]
vdup.8 d2, d25[0]
vdup.8 d3, d25[4]
vdup.8 d4, d26[0]
vdup.8 d5, d26[4]
;First Pass: output_height lines x output_width columns (21x16)
filt_blk2d_fp16x16_loop_neon
vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data
vld1.u8 {d9, d10, d11}, [r0], r1
vld1.u8 {d12, d13, d14}, [r0], r1
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp9_filter[0])
vmull.u8 q9, d7, d0
vmull.u8 q10, d9, d0
vmull.u8 q11, d10, d0
vmull.u8 q12, d12, d0
vmull.u8 q13, d13, d0
vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d29, d9, d10, #1
vext.8 d30, d12, d13, #1
vmlsl.u8 q8, d28, d1 ;-(src_ptr[-1] * vp9_filter[1])
vmlsl.u8 q10, d29, d1
vmlsl.u8 q12, d30, d1
vext.8 d28, d7, d8, #1
vext.8 d29, d10, d11, #1
vext.8 d30, d13, d14, #1
vmlsl.u8 q9, d28, d1 ;-(src_ptr[-1] * vp9_filter[1])
vmlsl.u8 q11, d29, d1
vmlsl.u8 q13, d30, d1
vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
vext.8 d29, d9, d10, #4
vext.8 d30, d12, d13, #4
vmlsl.u8 q8, d28, d4 ;-(src_ptr[2] * vp9_filter[4])
vmlsl.u8 q10, d29, d4
vmlsl.u8 q12, d30, d4
vext.8 d28, d7, d8, #4
vext.8 d29, d10, d11, #4
vext.8 d30, d13, d14, #4
vmlsl.u8 q9, d28, d4 ;-(src_ptr[2] * vp9_filter[4])
vmlsl.u8 q11, d29, d4
vmlsl.u8 q13, d30, d4
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
vext.8 d29, d9, d10, #5
vext.8 d30, d12, d13, #5
vmlal.u8 q8, d28, d5 ;(src_ptr[3] * vp9_filter[5])
vmlal.u8 q10, d29, d5
vmlal.u8 q12, d30, d5
vext.8 d28, d7, d8, #5
vext.8 d29, d10, d11, #5
vext.8 d30, d13, d14, #5
vmlal.u8 q9, d28, d5 ;(src_ptr[3] * vp9_filter[5])
vmlal.u8 q11, d29, d5
vmlal.u8 q13, d30, d5
vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
vext.8 d29, d9, d10, #2
vext.8 d30, d12, d13, #2
vmlal.u8 q8, d28, d2 ;(src_ptr[0] * vp9_filter[2])
vmlal.u8 q10, d29, d2
vmlal.u8 q12, d30, d2
vext.8 d28, d7, d8, #2
vext.8 d29, d10, d11, #2
vext.8 d30, d13, d14, #2
vmlal.u8 q9, d28, d2 ;(src_ptr[0] * vp9_filter[2])
vmlal.u8 q11, d29, d2
vmlal.u8 q13, d30, d2
vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
vext.8 d29, d9, d10, #3
vext.8 d30, d12, d13, #3
vext.8 d15, d7, d8, #3
vext.8 d31, d10, d11, #3
vext.8 d6, d13, d14, #3
vmull.u8 q4, d28, d3 ;(src_ptr[1] * vp9_filter[3])
vmull.u8 q5, d29, d3
vmull.u8 q6, d30, d3
vqadd.s16 q8, q4 ;sum of all (src_data*filter_parameters)
vqadd.s16 q10, q5
vqadd.s16 q12, q6
vmull.u8 q6, d15, d3 ;(src_ptr[1] * vp9_filter[3])
vmull.u8 q7, d31, d3
vmull.u8 q3, d6, d3
subs r2, r2, #1
vqadd.s16 q9, q6
vqadd.s16 q11, q7
vqadd.s16 q13, q3
vqrshrun.s16 d6, q8, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q9, #7
vqrshrun.s16 d8, q10, #7
vqrshrun.s16 d9, q11, #7
vqrshrun.s16 d10, q12, #7
vqrshrun.s16 d11, q13, #7
vst1.u8 {d6, d7, d8}, [lr]! ;store result
vst1.u8 {d9, d10, d11}, [lr]!
bne filt_blk2d_fp16x16_loop_neon
;Second pass: 16x16
;secondpass_filter - do first 8-columns and then second 8-columns
add r3, r12, r3, lsl #5
sub lr, lr, #336
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
mov r3, #2 ;loop counter
vabs.s32 q7, q5
vabs.s32 q8, q6
mov r2, #16
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vdup.8 d1, d14[4]
vdup.8 d2, d15[0]
vdup.8 d3, d15[4]
vdup.8 d4, d16[0]
vdup.8 d5, d16[4]
filt_blk2d_sp16x16_outloop_neon
vld1.u8 {d18}, [lr], r2 ;load src data
vld1.u8 {d19}, [lr], r2
vld1.u8 {d20}, [lr], r2
vld1.u8 {d21}, [lr], r2
mov r12, #4 ;loop counter
vld1.u8 {d22}, [lr], r2
secondpass_inner_loop_neon
vld1.u8 {d23}, [lr], r2 ;load src data
vld1.u8 {d24}, [lr], r2
vld1.u8 {d25}, [lr], r2
vld1.u8 {d26}, [lr], r2
vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp9_filter[0])
vmull.u8 q4, d19, d0
vmull.u8 q5, d20, d0
vmull.u8 q6, d21, d0
vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp9_filter[1])
vmlsl.u8 q4, d20, d1
vmlsl.u8 q5, d21, d1
vmlsl.u8 q6, d22, d1
vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp9_filter[4])
vmlsl.u8 q4, d23, d4
vmlsl.u8 q5, d24, d4
vmlsl.u8 q6, d25, d4
vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp9_filter[2])
vmlal.u8 q4, d21, d2
vmlal.u8 q5, d22, d2
vmlal.u8 q6, d23, d2
vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp9_filter[5])
vmlal.u8 q4, d24, d5
vmlal.u8 q5, d25, d5
vmlal.u8 q6, d26, d5
vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp9_filter[3])
vmull.u8 q8, d22, d3
vmull.u8 q9, d23, d3
vmull.u8 q10, d24, d3
subs r12, r12, #1
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q8, #7
vqrshrun.s16 d8, q9, #7
vqrshrun.s16 d9, q10, #7
vst1.u8 {d6}, [r4], r5 ;store result
vmov q9, q11
vst1.u8 {d7}, [r4], r5
vmov q10, q12
vst1.u8 {d8}, [r4], r5
vmov d22, d26
vst1.u8 {d9}, [r4], r5
bne secondpass_inner_loop_neon
subs r3, r3, #1
sub lr, lr, #336
add lr, lr, #8
sub r4, r4, r5, lsl #4
add r4, r4, #8
bne filt_blk2d_sp16x16_outloop_neon
add sp, sp, #336
pop {r4-r5,pc}
;--------------------
firstpass_filter16x16_only
vabs.s32 q12, q14
vabs.s32 q13, q15
mov r2, #8 ;loop counter
sub r0, r0, #2 ;move srcptr back to (column-2)
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vdup.8 d1, d24[4]
vdup.8 d2, d25[0]
vdup.8 d3, d25[4]
vdup.8 d4, d26[0]
vdup.8 d5, d26[4]
;First Pass: output_height lines x output_width columns (16x16)
filt_blk2d_fpo16x16_loop_neon
vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data
vld1.u8 {d9, d10, d11}, [r0], r1
pld [r0]
pld [r0, r1]
vmull.u8 q6, d6, d0 ;(src_ptr[-2] * vp9_filter[0])
vmull.u8 q7, d7, d0
vmull.u8 q8, d9, d0
vmull.u8 q9, d10, d0
vext.8 d20, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d21, d9, d10, #1
vext.8 d22, d7, d8, #1
vext.8 d23, d10, d11, #1
vext.8 d24, d6, d7, #4 ;construct src_ptr[2]
vext.8 d25, d9, d10, #4
vext.8 d26, d7, d8, #4
vext.8 d27, d10, d11, #4
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
vext.8 d29, d9, d10, #5
vmlsl.u8 q6, d20, d1 ;-(src_ptr[-1] * vp9_filter[1])
vmlsl.u8 q8, d21, d1
vmlsl.u8 q7, d22, d1 ;-(src_ptr[-1] * vp9_filter[1])
vmlsl.u8 q9, d23, d1
vmlsl.u8 q6, d24, d4 ;-(src_ptr[2] * vp9_filter[4])
vmlsl.u8 q8, d25, d4
vmlsl.u8 q7, d26, d4 ;-(src_ptr[2] * vp9_filter[4])
vmlsl.u8 q9, d27, d4
vmlal.u8 q6, d28, d5 ;(src_ptr[3] * vp9_filter[5])
vmlal.u8 q8, d29, d5
vext.8 d20, d7, d8, #5
vext.8 d21, d10, d11, #5
vext.8 d22, d6, d7, #2 ;construct src_ptr[0]
vext.8 d23, d9, d10, #2
vext.8 d24, d7, d8, #2
vext.8 d25, d10, d11, #2
vext.8 d26, d6, d7, #3 ;construct src_ptr[1]
vext.8 d27, d9, d10, #3
vext.8 d28, d7, d8, #3
vext.8 d29, d10, d11, #3
vmlal.u8 q7, d20, d5 ;(src_ptr[3] * vp9_filter[5])
vmlal.u8 q9, d21, d5
vmlal.u8 q6, d22, d2 ;(src_ptr[0] * vp9_filter[2])
vmlal.u8 q8, d23, d2
vmlal.u8 q7, d24, d2 ;(src_ptr[0] * vp9_filter[2])
vmlal.u8 q9, d25, d2
vmull.u8 q10, d26, d3 ;(src_ptr[1] * vp9_filter[3])
vmull.u8 q11, d27, d3
vmull.u8 q12, d28, d3 ;(src_ptr[1] * vp9_filter[3])
vmull.u8 q15, d29, d3
vqadd.s16 q6, q10 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q11
vqadd.s16 q7, q12
vqadd.s16 q9, q15
subs r2, r2, #1
vqrshrun.s16 d6, q6, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q7, #7
vqrshrun.s16 d8, q8, #7
vqrshrun.s16 d9, q9, #7
vst1.u8 {q3}, [r4], r5 ;store result
vst1.u8 {q4}, [r4], r5
bne filt_blk2d_fpo16x16_loop_neon
pop {r4-r5,pc}
;--------------------
secondpass_filter16x16_only
;Second pass: 16x16
add r3, r12, r3, lsl #5
sub r0, r0, r1, lsl #1
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
mov r3, #2 ;loop counter
vabs.s32 q7, q5
vabs.s32 q8, q6
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vdup.8 d1, d14[4]
vdup.8 d2, d15[0]
vdup.8 d3, d15[4]
vdup.8 d4, d16[0]
vdup.8 d5, d16[4]
filt_blk2d_spo16x16_outloop_neon
vld1.u8 {d18}, [r0], r1 ;load src data
vld1.u8 {d19}, [r0], r1
vld1.u8 {d20}, [r0], r1
vld1.u8 {d21}, [r0], r1
mov r12, #4 ;loop counter
vld1.u8 {d22}, [r0], r1
secondpass_only_inner_loop_neon
vld1.u8 {d23}, [r0], r1 ;load src data
vld1.u8 {d24}, [r0], r1
vld1.u8 {d25}, [r0], r1
vld1.u8 {d26}, [r0], r1
vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp9_filter[0])
vmull.u8 q4, d19, d0
vmull.u8 q5, d20, d0
vmull.u8 q6, d21, d0
vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp9_filter[1])
vmlsl.u8 q4, d20, d1
vmlsl.u8 q5, d21, d1
vmlsl.u8 q6, d22, d1
vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp9_filter[4])
vmlsl.u8 q4, d23, d4
vmlsl.u8 q5, d24, d4
vmlsl.u8 q6, d25, d4
vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp9_filter[2])
vmlal.u8 q4, d21, d2
vmlal.u8 q5, d22, d2
vmlal.u8 q6, d23, d2
vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp9_filter[5])
vmlal.u8 q4, d24, d5
vmlal.u8 q5, d25, d5
vmlal.u8 q6, d26, d5
vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp9_filter[3])
vmull.u8 q8, d22, d3
vmull.u8 q9, d23, d3
vmull.u8 q10, d24, d3
subs r12, r12, #1
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q8, #7
vqrshrun.s16 d8, q9, #7
vqrshrun.s16 d9, q10, #7
vst1.u8 {d6}, [r4], r5 ;store result
vmov q9, q11
vst1.u8 {d7}, [r4], r5
vmov q10, q12
vst1.u8 {d8}, [r4], r5
vmov d22, d26
vst1.u8 {d9}, [r4], r5
bne secondpass_only_inner_loop_neon
subs r3, r3, #1
sub r0, r0, r1, lsl #4
sub r0, r0, r1, lsl #2
sub r0, r0, r1
add r0, r0, #8
sub r4, r4, r5, lsl #4
add r4, r4, #8
bne filt_blk2d_spo16x16_outloop_neon
pop {r4-r5,pc}
ENDP
;-----------------
END

Просмотреть файл

@ -1,422 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_sixtap_predict_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
filter4_coeff
DCD 0, 0, 128, 0, 0, 0, 0, 0
DCD 0, -6, 123, 12, -1, 0, 0, 0
DCD 2, -11, 108, 36, -8, 1, 0, 0
DCD 0, -9, 93, 50, -6, 0, 0, 0
DCD 3, -16, 77, 77, -16, 3, 0, 0
DCD 0, -6, 50, 93, -9, 0, 0, 0
DCD 1, -8, 36, 108, -11, 2, 0, 0
DCD 0, -1, 12, 123, -6, 0, 0, 0
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; stack(r4) unsigned char *dst_ptr,
; stack(lr) int dst_pitch
|vp8_sixtap_predict_neon| PROC
push {r4, lr}
adr r12, filter4_coeff
ldr r4, [sp, #8] ;load parameters from stack
ldr lr, [sp, #12] ;load parameters from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_filter4x4_only
add r2, r12, r2, lsl #5 ;calculate filter location
cmp r3, #0 ;skip second_pass filter if yoffset=0
vld1.s32 {q14, q15}, [r2] ;load first_pass filter
beq firstpass_filter4x4_only
vabs.s32 q12, q14 ;get abs(filter_parameters)
vabs.s32 q13, q15
sub r0, r0, #2 ;go back 2 columns of src data
sub r0, r0, r1, lsl #1 ;go back 2 lines of src data
;First pass: output_height lines x output_width columns (9x4)
vld1.u8 {q3}, [r0], r1 ;load first 4-line src data
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vld1.u8 {q4}, [r0], r1
vdup.8 d1, d24[4]
vld1.u8 {q5}, [r0], r1
vdup.8 d2, d25[0]
vld1.u8 {q6}, [r0], r1
vdup.8 d3, d25[4]
vdup.8 d4, d26[0]
vdup.8 d5, d26[4]
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
vext.8 d19, d8, d9, #5
vext.8 d20, d10, d11, #5
vext.8 d21, d12, d13, #5
vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
vswp d11, d12
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
vzip.32 d20, d21
vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp9_filter[5])
vmull.u8 q8, d20, d5
vmov q4, q3 ;keep original src data in q4 q6
vmov q6, q5
vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
vzip.32 d10, d11
vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
vshr.u64 q10, q6, #8
vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp9_filter[0])
vmlal.u8 q8, d10, d0
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
vzip.32 d20, d21
vshr.u64 q3, q4, #32 ;construct src_ptr[2]
vshr.u64 q5, q6, #32
vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp9_filter[1])
vmlsl.u8 q8, d20, d1
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
vzip.32 d10, d11
vshr.u64 q9, q4, #16 ;construct src_ptr[0]
vshr.u64 q10, q6, #16
vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp9_filter[4])
vmlsl.u8 q8, d10, d4
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
vzip.32 d20, d21
vshr.u64 q3, q4, #24 ;construct src_ptr[1]
vshr.u64 q5, q6, #24
vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp9_filter[2])
vmlal.u8 q8, d20, d2
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
vzip.32 d10, d11
vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp9_filter[3])
vmull.u8 q10, d10, d3
vld1.u8 {q3}, [r0], r1 ;load the remaining 5 lines of src data
vld1.u8 {q4}, [r0], r1
vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q10
vld1.u8 {q5}, [r0], r1
vld1.u8 {q6}, [r0], r1
vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d28, q8, #7
;First pass on the remaining 5 lines of data
vld1.u8 {q11}, [r0], r1
vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
vext.8 d19, d8, d9, #5
vext.8 d20, d10, d11, #5
vext.8 d21, d12, d13, #5
vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
vswp d11, d12
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
vzip.32 d20, d21
vext.8 d31, d22, d23, #5 ;construct src_ptr[3]
vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp9_filter[5])
vmull.u8 q8, d20, d5
vmull.u8 q12, d31, d5 ;(src_ptr[3] * vp9_filter[5])
vmov q4, q3 ;keep original src data in q4 q6
vmov q6, q5
vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
vzip.32 d10, d11
vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
vshr.u64 q10, q6, #8
vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp9_filter[0])
vmlal.u8 q8, d10, d0
vmlal.u8 q12, d22, d0 ;(src_ptr[-2] * vp9_filter[0])
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
vzip.32 d20, d21
vshr.u64 q3, q4, #32 ;construct src_ptr[2]
vshr.u64 q5, q6, #32
vext.8 d31, d22, d23, #1 ;construct src_ptr[-1]
vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp9_filter[1])
vmlsl.u8 q8, d20, d1
vmlsl.u8 q12, d31, d1 ;-(src_ptr[-1] * vp9_filter[1])
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
vzip.32 d10, d11
vshr.u64 q9, q4, #16 ;construct src_ptr[0]
vshr.u64 q10, q6, #16
vext.8 d31, d22, d23, #4 ;construct src_ptr[2]
vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp9_filter[4])
vmlsl.u8 q8, d10, d4
vmlsl.u8 q12, d31, d4 ;-(src_ptr[2] * vp9_filter[4])
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
vzip.32 d20, d21
vshr.u64 q3, q4, #24 ;construct src_ptr[1]
vshr.u64 q5, q6, #24
vext.8 d31, d22, d23, #2 ;construct src_ptr[0]
vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp9_filter[2])
vmlal.u8 q8, d20, d2
vmlal.u8 q12, d31, d2 ;(src_ptr[0] * vp9_filter[2])
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
vzip.32 d10, d11
vext.8 d31, d22, d23, #3 ;construct src_ptr[1]
vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp9_filter[3])
vmull.u8 q10, d10, d3
vmull.u8 q11, d31, d3 ;(src_ptr[1] * vp9_filter[3])
add r3, r12, r3, lsl #5
vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q10
vqadd.s16 q12, q11
vext.8 d23, d27, d28, #4
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
vqrshrun.s16 d29, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d30, q8, #7
vqrshrun.s16 d31, q12, #7
;Second pass: 4x4
vabs.s32 q7, q5
vabs.s32 q8, q6
vext.8 d24, d28, d29, #4
vext.8 d25, d29, d30, #4
vext.8 d26, d30, d31, #4
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vdup.8 d1, d14[4]
vdup.8 d2, d15[0]
vdup.8 d3, d15[4]
vdup.8 d4, d16[0]
vdup.8 d5, d16[4]
vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp9_filter[0])
vmull.u8 q4, d28, d0
vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp9_filter[5])
vmull.u8 q6, d26, d5
vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp9_filter[4])
vmlsl.u8 q4, d30, d4
vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp9_filter[1])
vmlsl.u8 q6, d24, d1
vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp9_filter[2])
vmlal.u8 q4, d29, d2
vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp9_filter[3])
vmlal.u8 q6, d25, d3
add r0, r4, lr
add r1, r0, lr
add r2, r1, lr
vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q6, q4
vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8
vqrshrun.s16 d4, q6, #7
vst1.32 {d3[0]}, [r4] ;store result
vst1.32 {d3[1]}, [r0]
vst1.32 {d4[0]}, [r1]
vst1.32 {d4[1]}, [r2]
pop {r4, pc}
;---------------------
firstpass_filter4x4_only
vabs.s32 q12, q14 ;get abs(filter_parameters)
vabs.s32 q13, q15
sub r0, r0, #2 ;go back 2 columns of src data
;First pass: output_height lines x output_width columns (4x4)
vld1.u8 {q3}, [r0], r1 ;load first 4-line src data
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vld1.u8 {q4}, [r0], r1
vdup.8 d1, d24[4]
vld1.u8 {q5}, [r0], r1
vdup.8 d2, d25[0]
vld1.u8 {q6}, [r0], r1
vdup.8 d3, d25[4]
vdup.8 d4, d26[0]
vdup.8 d5, d26[4]
vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
vext.8 d19, d8, d9, #5
vext.8 d20, d10, d11, #5
vext.8 d21, d12, d13, #5
vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
vswp d11, d12
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
vzip.32 d20, d21
vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp9_filter[5])
vmull.u8 q8, d20, d5
vmov q4, q3 ;keep original src data in q4 q6
vmov q6, q5
vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
vzip.32 d10, d11
vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
vshr.u64 q10, q6, #8
vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp9_filter[0])
vmlal.u8 q8, d10, d0
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
vzip.32 d20, d21
vshr.u64 q3, q4, #32 ;construct src_ptr[2]
vshr.u64 q5, q6, #32
vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp9_filter[1])
vmlsl.u8 q8, d20, d1
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
vzip.32 d10, d11
vshr.u64 q9, q4, #16 ;construct src_ptr[0]
vshr.u64 q10, q6, #16
vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp9_filter[4])
vmlsl.u8 q8, d10, d4
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
vzip.32 d20, d21
vshr.u64 q3, q4, #24 ;construct src_ptr[1]
vshr.u64 q5, q6, #24
vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp9_filter[2])
vmlal.u8 q8, d20, d2
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
vzip.32 d10, d11
vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp9_filter[3])
vmull.u8 q10, d10, d3
add r0, r4, lr
add r1, r0, lr
add r2, r1, lr
vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q10
vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d28, q8, #7
vst1.32 {d27[0]}, [r4] ;store result
vst1.32 {d27[1]}, [r0]
vst1.32 {d28[0]}, [r1]
vst1.32 {d28[1]}, [r2]
pop {r4, pc}
;---------------------
secondpass_filter4x4_only
sub r0, r0, r1, lsl #1
add r3, r12, r3, lsl #5
vld1.32 {d27[0]}, [r0], r1 ;load src data
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
vld1.32 {d27[1]}, [r0], r1
vabs.s32 q7, q5
vld1.32 {d28[0]}, [r0], r1
vabs.s32 q8, q6
vld1.32 {d28[1]}, [r0], r1
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vld1.32 {d29[0]}, [r0], r1
vdup.8 d1, d14[4]
vld1.32 {d29[1]}, [r0], r1
vdup.8 d2, d15[0]
vld1.32 {d30[0]}, [r0], r1
vdup.8 d3, d15[4]
vld1.32 {d30[1]}, [r0], r1
vdup.8 d4, d16[0]
vld1.32 {d31[0]}, [r0], r1
vdup.8 d5, d16[4]
vext.8 d23, d27, d28, #4
vext.8 d24, d28, d29, #4
vext.8 d25, d29, d30, #4
vext.8 d26, d30, d31, #4
vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp9_filter[0])
vmull.u8 q4, d28, d0
vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp9_filter[5])
vmull.u8 q6, d26, d5
vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp9_filter[4])
vmlsl.u8 q4, d30, d4
vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp9_filter[1])
vmlsl.u8 q6, d24, d1
vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp9_filter[2])
vmlal.u8 q4, d29, d2
vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp9_filter[3])
vmlal.u8 q6, d25, d3
add r0, r4, lr
add r1, r0, lr
add r2, r1, lr
vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q6, q4
vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8
vqrshrun.s16 d4, q6, #7
vst1.32 {d3[0]}, [r4] ;store result
vst1.32 {d3[1]}, [r0]
vst1.32 {d4[0]}, [r1]
vst1.32 {d4[1]}, [r2]
pop {r4, pc}
ENDP
;-----------------
END

View file

@ -1,473 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_sixtap_predict8x4_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
filter8_coeff
DCD 0, 0, 128, 0, 0, 0, 0, 0
DCD 0, -6, 123, 12, -1, 0, 0, 0
DCD 2, -11, 108, 36, -8, 1, 0, 0
DCD 0, -9, 93, 50, -6, 0, 0, 0
DCD 3, -16, 77, 77, -16, 3, 0, 0
DCD 0, -6, 50, 93, -9, 0, 0, 0
DCD 1, -8, 36, 108, -11, 2, 0, 0
DCD 0, -1, 12, 123, -6, 0, 0, 0
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; r4 unsigned char *dst_ptr,
; stack(r5) int dst_pitch
|vp8_sixtap_predict8x4_neon| PROC
push {r4-r5, lr}
adr r12, filter8_coeff
ldr r4, [sp, #12] ;load parameters from stack
ldr r5, [sp, #16] ;load parameters from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_filter8x4_only
add r2, r12, r2, lsl #5 ;calculate filter location
cmp r3, #0 ;skip second_pass filter if yoffset=0
vld1.s32 {q14, q15}, [r2] ;load first_pass filter
beq firstpass_filter8x4_only
sub sp, sp, #32 ;reserve space on stack for temporary storage
vabs.s32 q12, q14
vabs.s32 q13, q15
sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
mov lr, sp
sub r0, r0, r1, lsl #1
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vdup.8 d1, d24[4]
vdup.8 d2, d25[0]
;First pass: output_height lines x output_width columns (9x8)
vld1.u8 {q3}, [r0], r1 ;load src data
vdup.8 d3, d25[4]
vld1.u8 {q4}, [r0], r1
vdup.8 d4, d26[0]
vld1.u8 {q5}, [r0], r1
vdup.8 d5, d26[4]
vld1.u8 {q6}, [r0], r1
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp9_filter[0])
vmull.u8 q8, d8, d0
vmull.u8 q9, d10, d0
vmull.u8 q10, d12, d0
vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d29, d8, d9, #1
vext.8 d30, d10, d11, #1
vext.8 d31, d12, d13, #1
vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp9_filter[1])
vmlsl.u8 q8, d29, d1
vmlsl.u8 q9, d30, d1
vmlsl.u8 q10, d31, d1
vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
vext.8 d29, d8, d9, #4
vext.8 d30, d10, d11, #4
vext.8 d31, d12, d13, #4
vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp9_filter[4])
vmlsl.u8 q8, d29, d4
vmlsl.u8 q9, d30, d4
vmlsl.u8 q10, d31, d4
vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
vext.8 d29, d8, d9, #2
vext.8 d30, d10, d11, #2
vext.8 d31, d12, d13, #2
vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp9_filter[2])
vmlal.u8 q8, d29, d2
vmlal.u8 q9, d30, d2
vmlal.u8 q10, d31, d2
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
vext.8 d29, d8, d9, #5
vext.8 d30, d10, d11, #5
vext.8 d31, d12, d13, #5
vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp9_filter[5])
vmlal.u8 q8, d29, d5
vmlal.u8 q9, d30, d5
vmlal.u8 q10, d31, d5
vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
vext.8 d29, d8, d9, #3
vext.8 d30, d10, d11, #3
vext.8 d31, d12, d13, #3
vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp9_filter[3])
vmull.u8 q4, d29, d3
vmull.u8 q5, d30, d3
vmull.u8 q6, d31, d3
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vld1.u8 {q3}, [r0], r1 ;load src data
vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d23, q8, #7
vqrshrun.s16 d24, q9, #7
vqrshrun.s16 d25, q10, #7
vld1.u8 {q4}, [r0], r1
vst1.u8 {d22}, [lr]! ;store result
vld1.u8 {q5}, [r0], r1
vst1.u8 {d23}, [lr]!
vld1.u8 {q6}, [r0], r1
vst1.u8 {d24}, [lr]!
vld1.u8 {q7}, [r0], r1
vst1.u8 {d25}, [lr]!
;first_pass filtering on the remaining 5 lines of data
vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp9_filter[0])
vmull.u8 q9, d8, d0
vmull.u8 q10, d10, d0
vmull.u8 q11, d12, d0
vmull.u8 q12, d14, d0
vext.8 d27, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d28, d8, d9, #1
vext.8 d29, d10, d11, #1
vext.8 d30, d12, d13, #1
vext.8 d31, d14, d15, #1
vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp9_filter[1])
vmlsl.u8 q9, d28, d1
vmlsl.u8 q10, d29, d1
vmlsl.u8 q11, d30, d1
vmlsl.u8 q12, d31, d1
vext.8 d27, d6, d7, #4 ;construct src_ptr[2]
vext.8 d28, d8, d9, #4
vext.8 d29, d10, d11, #4
vext.8 d30, d12, d13, #4
vext.8 d31, d14, d15, #4
vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp9_filter[4])
vmlsl.u8 q9, d28, d4
vmlsl.u8 q10, d29, d4
vmlsl.u8 q11, d30, d4
vmlsl.u8 q12, d31, d4
vext.8 d27, d6, d7, #2 ;construct src_ptr[0]
vext.8 d28, d8, d9, #2
vext.8 d29, d10, d11, #2
vext.8 d30, d12, d13, #2
vext.8 d31, d14, d15, #2
vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp9_filter[2])
vmlal.u8 q9, d28, d2
vmlal.u8 q10, d29, d2
vmlal.u8 q11, d30, d2
vmlal.u8 q12, d31, d2
vext.8 d27, d6, d7, #5 ;construct src_ptr[3]
vext.8 d28, d8, d9, #5
vext.8 d29, d10, d11, #5
vext.8 d30, d12, d13, #5
vext.8 d31, d14, d15, #5
vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp9_filter[5])
vmlal.u8 q9, d28, d5
vmlal.u8 q10, d29, d5
vmlal.u8 q11, d30, d5
vmlal.u8 q12, d31, d5
vext.8 d27, d6, d7, #3 ;construct src_ptr[1]
vext.8 d28, d8, d9, #3
vext.8 d29, d10, d11, #3
vext.8 d30, d12, d13, #3
vext.8 d31, d14, d15, #3
vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp9_filter[3])
vmull.u8 q4, d28, d3
vmull.u8 q5, d29, d3
vmull.u8 q6, d30, d3
vmull.u8 q7, d31, d3
vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q9, q4
vqadd.s16 q10, q5
vqadd.s16 q11, q6
vqadd.s16 q12, q7
vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8
vqrshrun.s16 d27, q9, #7
vqrshrun.s16 d28, q10, #7
vqrshrun.s16 d29, q11, #7 ;load intermediate data from stack
vqrshrun.s16 d30, q12, #7
;Second pass: 8x4
;secondpass_filter
add r3, r12, r3, lsl #5
sub lr, lr, #32
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
vld1.u8 {q11}, [lr]!
vabs.s32 q7, q5
vabs.s32 q8, q6
vld1.u8 {q12}, [lr]!
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vdup.8 d1, d14[4]
vdup.8 d2, d15[0]
vdup.8 d3, d15[4]
vdup.8 d4, d16[0]
vdup.8 d5, d16[4]
vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp9_filter[0])
vmull.u8 q4, d23, d0
vmull.u8 q5, d24, d0
vmull.u8 q6, d25, d0
vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp9_filter[1])
vmlsl.u8 q4, d24, d1
vmlsl.u8 q5, d25, d1
vmlsl.u8 q6, d26, d1
vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp9_filter[4])
vmlsl.u8 q4, d27, d4
vmlsl.u8 q5, d28, d4
vmlsl.u8 q6, d29, d4
vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp9_filter[2])
vmlal.u8 q4, d25, d2
vmlal.u8 q5, d26, d2
vmlal.u8 q6, d27, d2
vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp9_filter[5])
vmlal.u8 q4, d28, d5
vmlal.u8 q5, d29, d5
vmlal.u8 q6, d30, d5
vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp9_filter[3])
vmull.u8 q8, d26, d3
vmull.u8 q9, d27, d3
vmull.u8 q10, d28, d3
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q8, #7
vqrshrun.s16 d8, q9, #7
vqrshrun.s16 d9, q10, #7
vst1.u8 {d6}, [r4], r5 ;store result
vst1.u8 {d7}, [r4], r5
vst1.u8 {d8}, [r4], r5
vst1.u8 {d9}, [r4], r5
add sp, sp, #32
pop {r4-r5,pc}
;--------------------
firstpass_filter8x4_only
vabs.s32 q12, q14
vabs.s32 q13, q15
sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
vld1.u8 {q3}, [r0], r1 ;load src data
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vld1.u8 {q4}, [r0], r1
vdup.8 d1, d24[4]
vld1.u8 {q5}, [r0], r1
vdup.8 d2, d25[0]
vld1.u8 {q6}, [r0], r1
vdup.8 d3, d25[4]
vdup.8 d4, d26[0]
vdup.8 d5, d26[4]
;First pass: output_height lines x output_width columns (4x8)
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp9_filter[0])
vmull.u8 q8, d8, d0
vmull.u8 q9, d10, d0
vmull.u8 q10, d12, d0
vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d29, d8, d9, #1
vext.8 d30, d10, d11, #1
vext.8 d31, d12, d13, #1
vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp9_filter[1])
vmlsl.u8 q8, d29, d1
vmlsl.u8 q9, d30, d1
vmlsl.u8 q10, d31, d1
vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
vext.8 d29, d8, d9, #4
vext.8 d30, d10, d11, #4
vext.8 d31, d12, d13, #4
vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp9_filter[4])
vmlsl.u8 q8, d29, d4
vmlsl.u8 q9, d30, d4
vmlsl.u8 q10, d31, d4
vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
vext.8 d29, d8, d9, #2
vext.8 d30, d10, d11, #2
vext.8 d31, d12, d13, #2
vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp9_filter[2])
vmlal.u8 q8, d29, d2
vmlal.u8 q9, d30, d2
vmlal.u8 q10, d31, d2
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
vext.8 d29, d8, d9, #5
vext.8 d30, d10, d11, #5
vext.8 d31, d12, d13, #5
vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp9_filter[5])
vmlal.u8 q8, d29, d5
vmlal.u8 q9, d30, d5
vmlal.u8 q10, d31, d5
vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
vext.8 d29, d8, d9, #3
vext.8 d30, d10, d11, #3
vext.8 d31, d12, d13, #3
vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp9_filter[3])
vmull.u8 q4, d29, d3
vmull.u8 q5, d30, d3
vmull.u8 q6, d31, d3
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d23, q8, #7
vqrshrun.s16 d24, q9, #7
vqrshrun.s16 d25, q10, #7
vst1.u8 {d22}, [r4], r5 ;store result
vst1.u8 {d23}, [r4], r5
vst1.u8 {d24}, [r4], r5
vst1.u8 {d25}, [r4], r5
pop {r4-r5,pc}
;---------------------
secondpass_filter8x4_only
;Second pass: 8x4
add r3, r12, r3, lsl #5
sub r0, r0, r1, lsl #1
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
vabs.s32 q7, q5
vabs.s32 q8, q6
vld1.u8 {d22}, [r0], r1
vld1.u8 {d23}, [r0], r1
vld1.u8 {d24}, [r0], r1
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vld1.u8 {d25}, [r0], r1
vdup.8 d1, d14[4]
vld1.u8 {d26}, [r0], r1
vdup.8 d2, d15[0]
vld1.u8 {d27}, [r0], r1
vdup.8 d3, d15[4]
vld1.u8 {d28}, [r0], r1
vdup.8 d4, d16[0]
vld1.u8 {d29}, [r0], r1
vdup.8 d5, d16[4]
vld1.u8 {d30}, [r0], r1
vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp9_filter[0])
vmull.u8 q4, d23, d0
vmull.u8 q5, d24, d0
vmull.u8 q6, d25, d0
vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp9_filter[1])
vmlsl.u8 q4, d24, d1
vmlsl.u8 q5, d25, d1
vmlsl.u8 q6, d26, d1
vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp9_filter[4])
vmlsl.u8 q4, d27, d4
vmlsl.u8 q5, d28, d4
vmlsl.u8 q6, d29, d4
vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp9_filter[2])
vmlal.u8 q4, d25, d2
vmlal.u8 q5, d26, d2
vmlal.u8 q6, d27, d2
vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp9_filter[5])
vmlal.u8 q4, d28, d5
vmlal.u8 q5, d29, d5
vmlal.u8 q6, d30, d5
vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp9_filter[3])
vmull.u8 q8, d26, d3
vmull.u8 q9, d27, d3
vmull.u8 q10, d28, d3
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q8, #7
vqrshrun.s16 d8, q9, #7
vqrshrun.s16 d9, q10, #7
vst1.u8 {d6}, [r4], r5 ;store result
vst1.u8 {d7}, [r4], r5
vst1.u8 {d8}, [r4], r5
vst1.u8 {d9}, [r4], r5
pop {r4-r5,pc}
ENDP
;-----------------
END
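For reference, a minimal scalar sketch of what the two-pass six-tap predictors being deleted compute: the vmull/vmlsl/vmlal sequences apply the absolute tap values with a fixed +,-,+,+,-,+ sign pattern, and vqrshrun.s16 #7 is a (sum + 64) >> 7 round-and-saturate. Names and layout below are illustrative only, using signed taps and the u8 intermediate that the NEON path keeps:

/* Illustrative scalar model of the deleted two-pass six-tap filter.
 * Assumes signed taps (each row summing to 128); not library code. */
static unsigned char clamp8(int v) {
  return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void sixtap_2d_sketch(const unsigned char *src, int src_stride,
                             unsigned char *dst, int dst_stride,
                             const short *hf, const short *vf,
                             int w, int h) {
  unsigned char tmp[21 * 16];  /* (h + 5) rows of w, enough for 16x16 */
  int r, c;
  /* First pass: filter horizontally, starting two rows above the block,
   * producing the h + 5 rows the vertical filter will consume. */
  for (r = 0; r < h + 5; r++)
    for (c = 0; c < w; c++) {
      const unsigned char *s = src + (r - 2) * src_stride + c;
      int sum = s[-2] * hf[0] + s[-1] * hf[1] + s[0] * hf[2] +
                s[1] * hf[3] + s[2] * hf[4] + s[3] * hf[5];
      tmp[r * w + c] = clamp8((sum + 64) >> 7);  /* vqrshrun.s16 #7 */
    }
  /* Second pass: filter vertically; tmp row r + 2 holds source row r. */
  for (r = 0; r < h; r++)
    for (c = 0; c < w; c++) {
      const unsigned char *t = tmp + (r + 2) * w + c;
      int sum = t[-2 * w] * vf[0] + t[-w] * vf[1] + t[0] * vf[2] +
                t[w] * vf[3] + t[2 * w] * vf[4] + t[3 * w] * vf[5];
      dst[r * dst_stride + c] = clamp8((sum + 64) >> 7);
    }
}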


@ -1,524 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_sixtap_predict8x8_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
filter8_coeff
DCD 0, 0, 128, 0, 0, 0, 0, 0
DCD 0, -6, 123, 12, -1, 0, 0, 0
DCD 2, -11, 108, 36, -8, 1, 0, 0
DCD 0, -9, 93, 50, -6, 0, 0, 0
DCD 3, -16, 77, 77, -16, 3, 0, 0
DCD 0, -6, 50, 93, -9, 0, 0, 0
DCD 1, -8, 36, 108, -11, 2, 0, 0
DCD 0, -1, 12, 123, -6, 0, 0, 0
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; stack(r4) unsigned char *dst_ptr,
; stack(r5) int dst_pitch
|vp8_sixtap_predict8x8_neon| PROC
push {r4-r5, lr}
adr r12, filter8_coeff
ldr r4, [sp, #12] ;load parameters from stack
ldr r5, [sp, #16] ;load parameters from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_filter8x8_only
add r2, r12, r2, lsl #5 ;calculate filter location
cmp r3, #0 ;skip second_pass filter if yoffset=0
vld1.s32 {q14, q15}, [r2] ;load first_pass filter
beq firstpass_filter8x8_only
sub sp, sp, #64 ;reserve space on stack for temporary storage
mov lr, sp
vabs.s32 q12, q14
vabs.s32 q13, q15
mov r2, #2 ;loop counter
sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
sub r0, r0, r1, lsl #1
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vdup.8 d1, d24[4]
vdup.8 d2, d25[0]
;First pass: output_height lines x output_width columns (13x8)
vld1.u8 {q3}, [r0], r1 ;load src data
vdup.8 d3, d25[4]
vld1.u8 {q4}, [r0], r1
vdup.8 d4, d26[0]
vld1.u8 {q5}, [r0], r1
vdup.8 d5, d26[4]
vld1.u8 {q6}, [r0], r1
filt_blk2d_fp8x8_loop_neon
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp9_filter[0])
vmull.u8 q8, d8, d0
vmull.u8 q9, d10, d0
vmull.u8 q10, d12, d0
vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d29, d8, d9, #1
vext.8 d30, d10, d11, #1
vext.8 d31, d12, d13, #1
vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp9_filter[1])
vmlsl.u8 q8, d29, d1
vmlsl.u8 q9, d30, d1
vmlsl.u8 q10, d31, d1
vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
vext.8 d29, d8, d9, #4
vext.8 d30, d10, d11, #4
vext.8 d31, d12, d13, #4
vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp9_filter[4])
vmlsl.u8 q8, d29, d4
vmlsl.u8 q9, d30, d4
vmlsl.u8 q10, d31, d4
vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
vext.8 d29, d8, d9, #2
vext.8 d30, d10, d11, #2
vext.8 d31, d12, d13, #2
vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp9_filter[2])
vmlal.u8 q8, d29, d2
vmlal.u8 q9, d30, d2
vmlal.u8 q10, d31, d2
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
vext.8 d29, d8, d9, #5
vext.8 d30, d10, d11, #5
vext.8 d31, d12, d13, #5
vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp9_filter[5])
vmlal.u8 q8, d29, d5
vmlal.u8 q9, d30, d5
vmlal.u8 q10, d31, d5
vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
vext.8 d29, d8, d9, #3
vext.8 d30, d10, d11, #3
vext.8 d31, d12, d13, #3
vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp9_filter[3])
vmull.u8 q4, d29, d3
vmull.u8 q5, d30, d3
vmull.u8 q6, d31, d3
subs r2, r2, #1
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vld1.u8 {q3}, [r0], r1 ;load src data
vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d23, q8, #7
vqrshrun.s16 d24, q9, #7
vqrshrun.s16 d25, q10, #7
vst1.u8 {d22}, [lr]! ;store result
vld1.u8 {q4}, [r0], r1
vst1.u8 {d23}, [lr]!
vld1.u8 {q5}, [r0], r1
vst1.u8 {d24}, [lr]!
vld1.u8 {q6}, [r0], r1
vst1.u8 {d25}, [lr]!
bne filt_blk2d_fp8x8_loop_neon
;first_pass filtering on the remaining 5 lines of data
;vld1.u8 {q3}, [r0], r1 ;load src data
;vld1.u8 {q4}, [r0], r1
;vld1.u8 {q5}, [r0], r1
;vld1.u8 {q6}, [r0], r1
vld1.u8 {q7}, [r0], r1
vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp9_filter[0])
vmull.u8 q9, d8, d0
vmull.u8 q10, d10, d0
vmull.u8 q11, d12, d0
vmull.u8 q12, d14, d0
vext.8 d27, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d28, d8, d9, #1
vext.8 d29, d10, d11, #1
vext.8 d30, d12, d13, #1
vext.8 d31, d14, d15, #1
vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp9_filter[1])
vmlsl.u8 q9, d28, d1
vmlsl.u8 q10, d29, d1
vmlsl.u8 q11, d30, d1
vmlsl.u8 q12, d31, d1
vext.8 d27, d6, d7, #4 ;construct src_ptr[2]
vext.8 d28, d8, d9, #4
vext.8 d29, d10, d11, #4
vext.8 d30, d12, d13, #4
vext.8 d31, d14, d15, #4
vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp9_filter[4])
vmlsl.u8 q9, d28, d4
vmlsl.u8 q10, d29, d4
vmlsl.u8 q11, d30, d4
vmlsl.u8 q12, d31, d4
vext.8 d27, d6, d7, #2 ;construct src_ptr[0]
vext.8 d28, d8, d9, #2
vext.8 d29, d10, d11, #2
vext.8 d30, d12, d13, #2
vext.8 d31, d14, d15, #2
vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp9_filter[2])
vmlal.u8 q9, d28, d2
vmlal.u8 q10, d29, d2
vmlal.u8 q11, d30, d2
vmlal.u8 q12, d31, d2
vext.8 d27, d6, d7, #5 ;construct src_ptr[3]
vext.8 d28, d8, d9, #5
vext.8 d29, d10, d11, #5
vext.8 d30, d12, d13, #5
vext.8 d31, d14, d15, #5
vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp9_filter[5])
vmlal.u8 q9, d28, d5
vmlal.u8 q10, d29, d5
vmlal.u8 q11, d30, d5
vmlal.u8 q12, d31, d5
vext.8 d27, d6, d7, #3 ;construct src_ptr[1]
vext.8 d28, d8, d9, #3
vext.8 d29, d10, d11, #3
vext.8 d30, d12, d13, #3
vext.8 d31, d14, d15, #3
vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp9_filter[3])
vmull.u8 q4, d28, d3
vmull.u8 q5, d29, d3
vmull.u8 q6, d30, d3
vmull.u8 q7, d31, d3
vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q9, q4
vqadd.s16 q10, q5
vqadd.s16 q11, q6
vqadd.s16 q12, q7
add r3, r12, r3, lsl #5
vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8
sub lr, lr, #64
vqrshrun.s16 d27, q9, #7
vld1.u8 {q9}, [lr]! ;load intermediate data from stack
vqrshrun.s16 d28, q10, #7
vld1.u8 {q10}, [lr]!
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
vqrshrun.s16 d29, q11, #7
vld1.u8 {q11}, [lr]!
vabs.s32 q7, q5
vabs.s32 q8, q6
vqrshrun.s16 d30, q12, #7
vld1.u8 {q12}, [lr]!
;Second pass: 8x8
mov r3, #2 ;loop counter
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vdup.8 d1, d14[4]
vdup.8 d2, d15[0]
vdup.8 d3, d15[4]
vdup.8 d4, d16[0]
vdup.8 d5, d16[4]
filt_blk2d_sp8x8_loop_neon
vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp9_filter[0])
vmull.u8 q4, d19, d0
vmull.u8 q5, d20, d0
vmull.u8 q6, d21, d0
vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp9_filter[1])
vmlsl.u8 q4, d20, d1
vmlsl.u8 q5, d21, d1
vmlsl.u8 q6, d22, d1
vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp9_filter[4])
vmlsl.u8 q4, d23, d4
vmlsl.u8 q5, d24, d4
vmlsl.u8 q6, d25, d4
vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp9_filter[2])
vmlal.u8 q4, d21, d2
vmlal.u8 q5, d22, d2
vmlal.u8 q6, d23, d2
vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp9_filter[5])
vmlal.u8 q4, d24, d5
vmlal.u8 q5, d25, d5
vmlal.u8 q6, d26, d5
vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp9_filter[3])
vmull.u8 q8, d22, d3
vmull.u8 q9, d23, d3
vmull.u8 q10, d24, d3
subs r3, r3, #1
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q8, #7
vqrshrun.s16 d8, q9, #7
vqrshrun.s16 d9, q10, #7
vmov q9, q11
vst1.u8 {d6}, [r4], r5 ;store result
vmov q10, q12
vst1.u8 {d7}, [r4], r5
vmov q11, q13
vst1.u8 {d8}, [r4], r5
vmov q12, q14
vst1.u8 {d9}, [r4], r5
vmov d26, d30
bne filt_blk2d_sp8x8_loop_neon
add sp, sp, #64
pop {r4-r5,pc}
;---------------------
firstpass_filter8x8_only
;add r2, r12, r2, lsl #5 ;calculate filter location
;vld1.s32 {q14, q15}, [r2] ;load first_pass filter
vabs.s32 q12, q14
vabs.s32 q13, q15
mov r2, #2 ;loop counter
sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vdup.8 d1, d24[4]
vdup.8 d2, d25[0]
vdup.8 d3, d25[4]
vdup.8 d4, d26[0]
vdup.8 d5, d26[4]
;First pass: output_height lines x output_width columns (8x8)
filt_blk2d_fpo8x8_loop_neon
vld1.u8 {q3}, [r0], r1 ;load src data
vld1.u8 {q4}, [r0], r1
vld1.u8 {q5}, [r0], r1
vld1.u8 {q6}, [r0], r1
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp9_filter[0])
vmull.u8 q8, d8, d0
vmull.u8 q9, d10, d0
vmull.u8 q10, d12, d0
vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d29, d8, d9, #1
vext.8 d30, d10, d11, #1
vext.8 d31, d12, d13, #1
vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp9_filter[1])
vmlsl.u8 q8, d29, d1
vmlsl.u8 q9, d30, d1
vmlsl.u8 q10, d31, d1
vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
vext.8 d29, d8, d9, #4
vext.8 d30, d10, d11, #4
vext.8 d31, d12, d13, #4
vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp9_filter[4])
vmlsl.u8 q8, d29, d4
vmlsl.u8 q9, d30, d4
vmlsl.u8 q10, d31, d4
vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
vext.8 d29, d8, d9, #2
vext.8 d30, d10, d11, #2
vext.8 d31, d12, d13, #2
vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp9_filter[2])
vmlal.u8 q8, d29, d2
vmlal.u8 q9, d30, d2
vmlal.u8 q10, d31, d2
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
vext.8 d29, d8, d9, #5
vext.8 d30, d10, d11, #5
vext.8 d31, d12, d13, #5
vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp9_filter[5])
vmlal.u8 q8, d29, d5
vmlal.u8 q9, d30, d5
vmlal.u8 q10, d31, d5
vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
vext.8 d29, d8, d9, #3
vext.8 d30, d10, d11, #3
vext.8 d31, d12, d13, #3
vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp9_filter[3])
vmull.u8 q4, d29, d3
vmull.u8 q5, d30, d3
vmull.u8 q6, d31, d3
;
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
subs r2, r2, #1
vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d23, q8, #7
vqrshrun.s16 d24, q9, #7
vqrshrun.s16 d25, q10, #7
vst1.u8 {d22}, [r4], r5 ;store result
vst1.u8 {d23}, [r4], r5
vst1.u8 {d24}, [r4], r5
vst1.u8 {d25}, [r4], r5
bne filt_blk2d_fpo8x8_loop_neon
pop {r4-r5,pc}
;---------------------
secondpass_filter8x8_only
sub r0, r0, r1, lsl #1
add r3, r12, r3, lsl #5
vld1.u8 {d18}, [r0], r1 ;load src data
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
vld1.u8 {d19}, [r0], r1
vabs.s32 q7, q5
vld1.u8 {d20}, [r0], r1
vabs.s32 q8, q6
vld1.u8 {d21}, [r0], r1
mov r3, #2 ;loop counter
vld1.u8 {d22}, [r0], r1
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vld1.u8 {d23}, [r0], r1
vdup.8 d1, d14[4]
vld1.u8 {d24}, [r0], r1
vdup.8 d2, d15[0]
vld1.u8 {d25}, [r0], r1
vdup.8 d3, d15[4]
vld1.u8 {d26}, [r0], r1
vdup.8 d4, d16[0]
vld1.u8 {d27}, [r0], r1
vdup.8 d5, d16[4]
vld1.u8 {d28}, [r0], r1
vld1.u8 {d29}, [r0], r1
vld1.u8 {d30}, [r0], r1
;Second pass: 8x8
filt_blk2d_spo8x8_loop_neon
vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp9_filter[0])
vmull.u8 q4, d19, d0
vmull.u8 q5, d20, d0
vmull.u8 q6, d21, d0
vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp9_filter[1])
vmlsl.u8 q4, d20, d1
vmlsl.u8 q5, d21, d1
vmlsl.u8 q6, d22, d1
vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp9_filter[4])
vmlsl.u8 q4, d23, d4
vmlsl.u8 q5, d24, d4
vmlsl.u8 q6, d25, d4
vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp9_filter[2])
vmlal.u8 q4, d21, d2
vmlal.u8 q5, d22, d2
vmlal.u8 q6, d23, d2
vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp9_filter[5])
vmlal.u8 q4, d24, d5
vmlal.u8 q5, d25, d5
vmlal.u8 q6, d26, d5
vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp9_filter[3])
vmull.u8 q8, d22, d3
vmull.u8 q9, d23, d3
vmull.u8 q10, d24, d3
subs r3, r3, #1
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q8, #7
vqrshrun.s16 d8, q9, #7
vqrshrun.s16 d9, q10, #7
vmov q9, q11
vst1.u8 {d6}, [r4], r5 ;store result
vmov q10, q12
vst1.u8 {d7}, [r4], r5
vmov q11, q13
vst1.u8 {d8}, [r4], r5
vmov q12, q14
vst1.u8 {d9}, [r4], r5
vmov d26, d30
bne filt_blk2d_spo8x8_loop_neon
pop {r4-r5,pc}
ENDP
;-----------------
END
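Each filter8_coeff row above sums to 128 (unit gain in Q7), which is what the (sum + 64) >> 7 narrowing relies on; e.g. for xoffset 2: 2 - 11 + 108 + 36 - 8 + 1 = 128. The add r2, r12, r2, lsl #5 step indexes the table in 32-byte (eight-word) rows. A hypothetical spot-check of that invariant:

#include <assert.h>

/* Six-tap rows from the table above (last two words of each row are
 * padding). */
static const int filter8[8][6] = {
  { 0,   0, 128,   0,   0, 0 }, { 0,  -6, 123,  12,  -1, 0 },
  { 2, -11, 108,  36,  -8, 1 }, { 0,  -9,  93,  50,  -6, 0 },
  { 3, -16,  77,  77, -16, 3 }, { 0,  -6,  50,  93,  -9, 0 },
  { 1,  -8,  36, 108, -11, 2 }, { 0,  -1,  12, 123,  -6, 0 },
};

static void check_filter_gain(void) {
  int i, j, sum;
  for (i = 0; i < 8; i++) {
    for (sum = 0, j = 0; j < 6; j++)
      sum += filter8[i][j];
    assert(sum == 128);  /* every subpel position has unit DC gain */
  }
}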


@ -1,91 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_ports/config.h"
#include "vpx_ports/arm.h"
#include "vp9/common/vp9_pragmas.h"
#include "vp9/common/vp9_subpixel.h"
#include "vp9/common/vp9_loopfilter.h"
#include "vp9/common/recon.h"
#include "vp9/common/vp9_onyxc_int.h"
void vp9_arch_arm_common_init(VP9_COMMON *ctx) {
#if CONFIG_RUNTIME_CPU_DETECT
VP9_COMMON_RTCD *rtcd = &ctx->rtcd;
int flags = arm_cpu_caps();
rtcd->flags = flags;
/* Override default functions with fastest ones for this CPU. */
#if HAVE_ARMV5TE
if (flags & HAS_EDSP) {
}
#endif
// The commented functions need to be re-written for vpx.
#if HAVE_ARMV6
if (flags & HAS_MEDIA) {
rtcd->subpix.sixtap16x16 = vp9_sixtap_predict16x16_armv6;
rtcd->subpix.sixtap8x8 = vp9_sixtap_predict8x8_armv6;
rtcd->subpix.sixtap8x4 = vp9_sixtap_predict8x4_armv6;
rtcd->subpix.sixtap4x4 = vp9_sixtap_predict_armv6;
rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_armv6;
rtcd->subpix.bilinear8x8 = vp9_bilinear_predict8x8_armv6;
rtcd->subpix.bilinear8x4 = vp9_bilinear_predict8x4_armv6;
rtcd->subpix.bilinear4x4 = vp9_bilinear_predict4x4_armv6;
// rtcd->idct.idct1 = vp9_short_idct4x4llm_1_v6;
// rtcd->idct.idct16 = vp9_short_idct4x4llm_v6_dual;
// rtcd->idct.iwalsh1 = vp9_short_inv_walsh4x4_1_v6;
// rtcd->idct.iwalsh16 = vp9_short_inv_walsh4x4_v6;
rtcd->recon.copy16x16 = vp9_copy_mem16x16_v6;
rtcd->recon.copy8x8 = vp9_copy_mem8x8_v6;
rtcd->recon.copy8x4 = vp9_copy_mem8x4_v6;
rtcd->recon.recon = vp9_recon_b_armv6;
rtcd->recon.recon2 = vp9_recon2b_armv6;
rtcd->recon.recon4 = vp9_recon4b_armv6;
}
#endif
#if HAVE_ARMV7
if (flags & HAS_NEON) {
rtcd->subpix.sixtap16x16 = vp9_sixtap_predict16x16_neon;
rtcd->subpix.sixtap8x8 = vp9_sixtap_predict8x8_neon;
rtcd->subpix.sixtap8x4 = vp9_sixtap_predict8x4_neon;
rtcd->subpix.sixtap4x4 = vp9_sixtap_predict_neon;
rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_neon;
rtcd->subpix.bilinear8x8 = vp9_bilinear_predict8x8_neon;
rtcd->subpix.bilinear8x4 = vp9_bilinear_predict8x4_neon;
rtcd->subpix.bilinear4x4 = vp9_bilinear_predict4x4_neon;
// rtcd->idct.idct1 = vp9_short_idct4x4llm_1_neon;
// rtcd->idct.idct16 = vp9_short_idct4x4llm_neon;
// rtcd->idct.iwalsh1 = vp9_short_inv_walsh4x4_1_neon;
// rtcd->idct.iwalsh16 = vp9_short_inv_walsh4x4_neon;
rtcd->recon.copy16x16 = vp9_copy_mem16x16_neon;
rtcd->recon.copy8x8 = vp9_copy_mem8x8_neon;
rtcd->recon.copy8x4 = vp9_copy_mem8x4_neon;
rtcd->recon.recon = vp9_recon_b_neon;
rtcd->recon.recon2 = vp9_recon2b_neon;
rtcd->recon.recon4 = vp9_recon4b_neon;
rtcd->recon.recon_mb = vp9_recon_mb_neon;
rtcd->recon.build_intra_predictors_mby =
vp9_build_intra_predictors_mby_neon;
rtcd->recon.build_intra_predictors_mby_s =
vp9_build_intra_predictors_mby_s_neon;
}
#endif
#endif
}
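What this deleted file implemented is plain function-pointer dispatch keyed on the arm_cpu_caps() flags; a minimal sketch of the pattern, with the copy functions invented for illustration:

#include "vpx_ports/arm.h"  /* arm_cpu_caps(), HAS_MEDIA, HAS_NEON */

/* Sketch of the runtime-dispatch pattern above: the table is seeded
 * with portable C defaults, and faster variants overwrite entries when
 * the CPU advertises the feature. Names are illustrative. */
typedef void (*copy16x16_fn)(unsigned char *src, int src_pitch,
                             unsigned char *dst, int dst_pitch);
extern void copy16x16_c(unsigned char *, int, unsigned char *, int);
extern void copy16x16_v6(unsigned char *, int, unsigned char *, int);
extern void copy16x16_neon(unsigned char *, int, unsigned char *, int);

struct rtcd_sketch { copy16x16_fn copy16x16; };

static void rtcd_init_sketch(struct rtcd_sketch *t, int flags) {
  t->copy16x16 = copy16x16_c;                        /* safe default */
  if (flags & HAS_MEDIA) t->copy16x16 = copy16x16_v6;
  if (flags & HAS_NEON)  t->copy16x16 = copy16x16_neon;
}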


@ -1,108 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <math.h>
#include "vp9/common/vp9_filter.h"
#include "vp9/common/vp9_subpixel.h"
#include "vp9/common/arm/vp9_bilinearfilter_arm.h"
void vp9_filter_block2d_bil_armv6
(
unsigned char *src_ptr,
unsigned char *dst_ptr,
unsigned int src_pitch,
unsigned int dst_pitch,
const short *HFilter,
const short *VFilter,
int Width,
int Height
) {
unsigned short FData[36 * 16]; /* Temp data buffer used in filtering */
/* First filter 1-D horizontally... */
vp9_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
/* then 1-D vertically... */
vp9_filter_block2d_bil_second_pass_armv6(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
}
void vp9_bilinear_predict4x4_armv6
(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch
) {
const short *HFilter;
const short *VFilter;
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
}
void vp9_bilinear_predict8x8_armv6
(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch
) {
const short *HFilter;
const short *VFilter;
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
}
void vp9_bilinear_predict8x4_armv6
(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch
) {
const short *HFilter;
const short *VFilter;
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
}
void vp9_bilinear_predict16x16_armv6
(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch
) {
const short *HFilter;
const short *VFilter;
HFilter = vp8_bilinear_filters[xoffset];
VFilter = vp8_bilinear_filters[yoffset];
vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
}
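The wrappers above reduce every block size to one helper; in scalar terms the two-pass bilinear filter is roughly the following (the armv6 first pass actually stores its output transposed, which this sketch ignores):

/* Scalar sketch of vp9_filter_block2d_bil_armv6: a 2-tap horizontal
 * pass over Height + 1 rows, then a 2-tap vertical pass. Both taps are
 * non-negative and sum to 128 (Q7), so no clamp is needed. */
static void bil_2d_sketch(const unsigned char *src, unsigned int src_pitch,
                          unsigned char *dst, unsigned int dst_pitch,
                          const short *hf, const short *vf, int w, int h) {
  unsigned short tmp[17 * 16];  /* (h + 1) rows, w <= 16 */
  int r, c;
  for (r = 0; r < h + 1; r++)
    for (c = 0; c < w; c++) {
      const unsigned char *s = src + r * src_pitch + c;
      tmp[r * w + c] = (unsigned short)((s[0] * hf[0] + s[1] * hf[1] + 64) >> 7);
    }
  for (r = 0; r < h; r++)
    for (c = 0; c < w; c++) {
      const unsigned short *t = tmp + r * w + c;
      dst[r * dst_pitch + c] =
          (unsigned char)((t[0] * vf[0] + t[w] * vf[1] + 64) >> 7);
    }
}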


@ -1,35 +0,0 @@
/*
* Copyright (c) 2011 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_ARM_VP9_BILINEARFILTER_ARM_H_
#define VP9_COMMON_ARM_VP9_BILINEARFILTER_ARM_H_
extern void vp9_filter_block2d_bil_first_pass_armv6
(
const unsigned char *src_ptr,
unsigned short *dst_ptr,
unsigned int src_pitch,
unsigned int height,
unsigned int width,
const short *vp9_filter
);
extern void vp9_filter_block2d_bil_second_pass_armv6
(
const unsigned short *src_ptr,
unsigned char *dst_ptr,
int dst_pitch,
unsigned int height,
unsigned int width,
const short *vp9_filter
);
#endif /* VP9_COMMON_ARM_VP9_BILINEARFILTER_ARM_H_ */


@ -1,198 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_ports/config.h"
#include <math.h>
#include "vp9/common/vp9_filter.h"
#include "vp9/common/vp9_subpixel.h"
#include "vpx_ports/mem.h"
extern void vp9_filter_block2d_first_pass_armv6
(
unsigned char *src_ptr,
short *output_ptr,
unsigned int src_pixels_per_line,
unsigned int output_width,
unsigned int output_height,
const short *vp9_filter
);
// 8x8
extern void vp9_filter_block2d_first_pass_8x8_armv6
(
unsigned char *src_ptr,
short *output_ptr,
unsigned int src_pixels_per_line,
unsigned int output_width,
unsigned int output_height,
const short *vp9_filter
);
// 16x16
extern void vp9_filter_block2d_first_pass_16x16_armv6
(
unsigned char *src_ptr,
short *output_ptr,
unsigned int src_pixels_per_line,
unsigned int output_width,
unsigned int output_height,
const short *vp9_filter
);
extern void vp9_filter_block2d_second_pass_armv6
(
short *src_ptr,
unsigned char *output_ptr,
unsigned int output_pitch,
unsigned int cnt,
const short *vp9_filter
);
extern void vp9_filter4_block2d_second_pass_armv6
(
short *src_ptr,
unsigned char *output_ptr,
unsigned int output_pitch,
unsigned int cnt,
const short *vp9_filter
);
extern void vp9_filter_block2d_first_pass_only_armv6
(
unsigned char *src_ptr,
unsigned char *output_ptr,
unsigned int src_pixels_per_line,
unsigned int cnt,
unsigned int output_pitch,
const short *vp9_filter
);
extern void vp9_filter_block2d_second_pass_only_armv6
(
unsigned char *src_ptr,
unsigned char *output_ptr,
unsigned int src_pixels_per_line,
unsigned int cnt,
unsigned int output_pitch,
const short *vp9_filter
);
#if HAVE_ARMV6
void vp9_sixtap_predict_armv6
(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch
) {
const short *HFilter;
const short *VFilter;
DECLARE_ALIGNED_ARRAY(4, short, FData, 12 * 4); /* Temp data buffer used in filtering */
HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
/* Vfilter is null. First pass only */
if (xoffset && !yoffset) {
/*vp9_filter_block2d_first_pass_armv6 ( src_ptr, FData+2, src_pixels_per_line, 4, 4, HFilter );
vp9_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, VFilter );*/
vp9_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, HFilter);
}
/* Hfilter is null. Second pass only */
else if (!xoffset && yoffset) {
vp9_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, VFilter);
} else {
/* Vfilter is a 4 tap filter */
if (yoffset & 0x1) {
vp9_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 4, 7, HFilter);
vp9_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
}
/* Vfilter is 6 tap filter */
else {
vp9_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 4, 9, HFilter);
vp9_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
}
}
}
void vp9_sixtap_predict8x8_armv6
(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch
) {
const short *HFilter;
const short *VFilter;
DECLARE_ALIGNED_ARRAY(4, short, FData, 16 * 8); /* Temp data buffer used in filtering */
HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
if (xoffset && !yoffset) {
vp9_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter);
}
/* Hfilter is null. Second pass only */
else if (!xoffset && yoffset) {
vp9_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter);
} else {
if (yoffset & 0x1) {
vp9_filter_block2d_first_pass_8x8_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 8, 11, HFilter);
vp9_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
} else {
vp9_filter_block2d_first_pass_8x8_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, 13, HFilter);
vp9_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
}
}
}
void vp9_sixtap_predict16x16_armv6
(
unsigned char *src_ptr,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_ptr,
int dst_pitch
) {
const short *HFilter;
const short *VFilter;
DECLARE_ALIGNED_ARRAY(4, short, FData, 24 * 16); /* Temp data buffer used in filtering */
HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
if (xoffset && !yoffset) {
vp9_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, HFilter);
}
/* Hfilter is null. Second pass only */
else if (!xoffset && yoffset) {
vp9_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, VFilter);
} else {
if (yoffset & 0x1) {
vp9_filter_block2d_first_pass_16x16_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 16, 19, HFilter);
vp9_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
} else {
vp9_filter_block2d_first_pass_16x16_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, 21, HFilter);
vp9_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
}
}
}
#endif
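The first-pass heights hard-coded above (7/9 for 4x4, 11/13 for 8x8, 19/21 for 16x16) are just the block height plus taps minus one: the vertical pass reads N rows per output row with N - 1 rows of overlap, and the 4-tap variant starts one row higher (src - pitch) than the 6-tap one (src - 2 * pitch).

/* Sanity relation behind the magic heights above. */
static int first_pass_rows(int block_h, int vtaps) {
  return block_h + vtaps - 1;  /* 4+6-1=9, 8+6-1=13, 16+6-1=21;
                                  with 4 taps: 7, 11, 19 */
}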


@ -1,65 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_ARM_VP9_IDCT_ARM_H_
#define VP9_COMMON_ARM_VP9_IDCT_ARM_H_
#if HAVE_ARMV6
extern prototype_idct(vp9_short_idct4x4llm_1_v6);
extern prototype_idct(vp9_short_idct4x4llm_v6_dual);
extern prototype_idct_scalar_add(vp9_dc_only_idct_add_v6);
extern prototype_second_order(vp9_short_inv_walsh4x4_1_v6);
extern prototype_second_order(vp9_short_inv_walsh4x4_v6);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp9_idct_idct1
#define vp9_idct_idct1 vp9_short_idct4x4llm_1_v6
#undef vp9_idct_idct16
#define vp9_idct_idct16 vp9_short_idct4x4llm_v6_dual
#undef vp9_idct_idct1_scalar_add
#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_v6
#undef vp8_idct_iwalsh1
#define vp8_idct_iwalsh1 vp9_short_inv_walsh4x4_1_v6
#undef vp8_idct_iwalsh16
#define vp8_idct_iwalsh16 vp9_short_inv_walsh4x4_v6
#endif
#endif
#if HAVE_ARMV7
extern prototype_idct(vp9_short_idct4x4llm_1_neon);
extern prototype_idct(vp9_short_idct4x4llm_neon);
extern prototype_idct_scalar_add(vp9_dc_only_idct_add_neon);
extern prototype_second_order(vp9_short_inv_walsh4x4_1_neon);
extern prototype_second_order(vp9_short_inv_walsh4x4_neon);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp9_idct_idct1
#define vp9_idct_idct1 vp9_short_idct4x4llm_1_neon
#undef vp9_idct_idct16
#define vp9_idct_idct16 vp9_short_idct4x4llm_neon
#undef vp9_idct_idct1_scalar_add
#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_neon
#undef vp8_idct_iwalsh1
#define vp8_idct_iwalsh1 vp9_short_inv_walsh4x4_1_neon
#undef vp8_idct_iwalsh16
#define vp8_idct_iwalsh16 vp9_short_inv_walsh4x4_neon
#endif
#endif
#endif


@ -1,166 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_config.h"
#include "vp9/common/vp9_loopfilter.h"
#include "vp9/common/vp9_onyxc_int.h"
#if HAVE_ARMV6
extern prototype_loopfilter(vp9_loop_filter_horizontal_edge_armv6);
extern prototype_loopfilter(vp9_loop_filter_vertical_edge_armv6);
extern prototype_loopfilter(vp9_mbloop_filter_horizontal_edge_armv6);
extern prototype_loopfilter(vp9_mbloop_filter_vertical_edge_armv6);
#endif
#if HAVE_ARMV7
typedef void loopfilter_y_neon(unsigned char *src, int pitch,
unsigned char blimit, unsigned char limit, unsigned char thresh);
typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
unsigned char blimit, unsigned char limit, unsigned char thresh,
unsigned char *v);
extern loopfilter_y_neon vp9_loop_filter_horizontal_edge_y_neon;
extern loopfilter_y_neon vp9_loop_filter_vertical_edge_y_neon;
extern loopfilter_y_neon vp9_mbloop_filter_horizontal_edge_y_neon;
extern loopfilter_y_neon vp9_mbloop_filter_vertical_edge_y_neon;
extern loopfilter_uv_neon vp9_loop_filter_horizontal_edge_uv_neon;
extern loopfilter_uv_neon vp9_loop_filter_vertical_edge_uv_neon;
extern loopfilter_uv_neon vp9_mbloop_filter_horizontal_edge_uv_neon;
extern loopfilter_uv_neon vp9_mbloop_filter_vertical_edge_uv_neon;
#endif
#if HAVE_ARMV6
/*ARMV6 loopfilter functions*/
/* Horizontal MB filtering */
void vp9_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi) {
vp9_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp9_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp9_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
}
/* Vertical MB Filtering */
void vp9_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi) {
vp9_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp9_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp9_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
}
/* Horizontal B Filtering */
void vp9_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi) {
vp9_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp9_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp9_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp9_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp9_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
}
void vp9_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride,
const unsigned char *blimit) {
vp9_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, blimit);
vp9_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, blimit);
vp9_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, blimit);
}
/* Vertical B Filtering */
void vp9_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi) {
vp9_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp9_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp9_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp9_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp9_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
}
void vp9_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride,
const unsigned char *blimit) {
vp9_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit);
vp9_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit);
vp9_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit);
}
#endif
#if HAVE_ARMV7
/* NEON loopfilter functions */
/* Horizontal MB filtering */
void vp9_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi) {
unsigned char mblim = *lfi->mblim;
unsigned char lim = *lfi->lim;
unsigned char hev_thr = *lfi->hev_thr;
vp9_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
if (u_ptr)
vp9_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
}
/* Vertical MB Filtering */
void vp9_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi) {
unsigned char mblim = *lfi->mblim;
unsigned char lim = *lfi->lim;
unsigned char hev_thr = *lfi->hev_thr;
vp9_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
if (u_ptr)
vp9_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
}
/* Horizontal B Filtering */
void vp9_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi) {
unsigned char blim = *lfi->blim;
unsigned char lim = *lfi->lim;
unsigned char hev_thr = *lfi->hev_thr;
vp9_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim, lim, hev_thr);
vp9_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim, lim, hev_thr);
vp9_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim, lim, hev_thr);
if (u_ptr)
vp9_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, blim, lim, hev_thr, v_ptr + 4 * uv_stride);
}
/* Vertical B Filtering */
void vp9_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi) {
unsigned char blim = *lfi->blim;
unsigned char lim = *lfi->lim;
unsigned char hev_thr = *lfi->hev_thr;
vp9_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, blim, lim, hev_thr);
vp9_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, blim, lim, hev_thr);
vp9_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, blim, lim, hev_thr);
if (u_ptr)
vp9_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, hev_thr, v_ptr + 4);
}
#endif
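A note on the deleted wrappers above: the inner ("b") edges of a macroblock sit 4, 8 and 12 pixels in (row offsets for the horizontal filters, column offsets for the vertical ones). The trailing 2/1 argument to the armv6 helpers is the edge width in 8-pixel units (16-wide luma vs 8-wide chroma), while the NEON uv variants take the v plane as a final pointer and filter both chroma planes in a single call.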


@ -1,41 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_ARM_VP9_LOOPFILTER_ARM_H_
#define VP9_COMMON_ARM_VP9_LOOPFILTER_ARM_H_
#include "vpx_config.h"
#if HAVE_ARMV6
extern prototype_loopfilter_block(vp9_loop_filter_mbv_armv6);
extern prototype_loopfilter_block(vp9_loop_filter_bv_armv6);
extern prototype_loopfilter_block(vp9_loop_filter_mbh_armv6);
extern prototype_loopfilter_block(vp9_loop_filter_bh_armv6);
extern prototype_simple_loopfilter(vp9_loop_filter_bvs_armv6);
extern prototype_simple_loopfilter(vp9_loop_filter_bhs_armv6);
extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_armv6);
extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_armv6);
#endif /* HAVE_ARMV6 */
#if HAVE_ARMV7
extern prototype_loopfilter_block(vp9_loop_filter_mbv_neon);
extern prototype_loopfilter_block(vp9_loop_filter_bv_neon);
extern prototype_loopfilter_block(vp9_loop_filter_mbh_neon);
extern prototype_loopfilter_block(vp9_loop_filter_bh_neon);
extern prototype_simple_loopfilter(vp9_loop_filter_mbvs_neon);
extern prototype_simple_loopfilter(vp9_loop_filter_bvs_neon);
extern prototype_simple_loopfilter(vp9_loop_filter_mbhs_neon);
extern prototype_simple_loopfilter(vp9_loop_filter_bhs_neon);
#endif /* HAVE_ARMV7 */
#endif /* VP9_COMMON_ARM_VP9_LOOPFILTER_ARM_H_ */


@ -1,90 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_ARM_VP9_RECON_ARM_H_
#define VP9_COMMON_ARM_VP9_RECON_ARM_H_
#if HAVE_ARMV6
extern prototype_recon_block(vp9_recon_b_armv6);
extern prototype_recon_block(vp9_recon2b_armv6);
extern prototype_recon_block(vp9_recon4b_armv6);
extern prototype_copy_block(vp9_copy_mem8x8_v6);
extern prototype_copy_block(vp9_copy_mem8x4_v6);
extern prototype_copy_block(vp9_copy_mem16x16_v6);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_recon_recon
#define vp8_recon_recon vp9_recon_b_armv6
#undef vp8_recon_recon2
#define vp8_recon_recon2 vp9_recon2b_armv6
#undef vp8_recon_recon4
#define vp8_recon_recon4 vp9_recon4b_armv6
#undef vp8_recon_copy8x8
#define vp8_recon_copy8x8 vp9_copy_mem8x8_v6
#undef vp8_recon_copy8x4
#define vp8_recon_copy8x4 vp9_copy_mem8x4_v6
#undef vp8_recon_copy16x16
#define vp8_recon_copy16x16 vp9_copy_mem16x16_v6
#endif
#endif
#if HAVE_ARMV7
extern prototype_recon_block(vp9_recon_b_neon);
extern prototype_recon_block(vp9_recon2b_neon);
extern prototype_recon_block(vp9_recon4b_neon);
extern prototype_copy_block(vp9_copy_mem8x8_neon);
extern prototype_copy_block(vp9_copy_mem8x4_neon);
extern prototype_copy_block(vp9_copy_mem16x16_neon);
extern prototype_recon_macroblock(vp9_recon_mb_neon);
extern prototype_build_intra_predictors(vp9_build_intra_predictors_mby_neon);
extern prototype_build_intra_predictors(vp9_build_intra_predictors_mby_s_neon);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_recon_recon
#define vp8_recon_recon vp9_recon_b_neon
#undef vp8_recon_recon2
#define vp8_recon_recon2 vp9_recon2b_neon
#undef vp8_recon_recon4
#define vp8_recon_recon4 vp9_recon4b_neon
#undef vp8_recon_copy8x8
#define vp8_recon_copy8x8 vp9_copy_mem8x8_neon
#undef vp8_recon_copy8x4
#define vp8_recon_copy8x4 vp9_copy_mem8x4_neon
#undef vp8_recon_copy16x16
#define vp8_recon_copy16x16 vp9_copy_mem16x16_neon
#undef vp8_recon_recon_mb
#define vp8_recon_recon_mb vp9_recon_mb_neon
#undef vp9_recon_build_intra_predictors_mby
#define vp9_recon_build_intra_predictors_mby vp9_build_intra_predictors_mby_neon
#undef vp9_recon_build_intra_predictors_mby_s
#define vp9_recon_build_intra_predictors_mby_s vp9_build_intra_predictors_mby_s_neon
#endif
#endif
#endif


@ -1,62 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_ports/config.h"
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_reconintra.h"
#include "vpx_mem/vpx_mem.h"
#include "vp9/common/recon.h"
#if HAVE_ARMV7
extern void vp9_build_intra_predictors_mby_neon_func(
unsigned char *y_buffer,
unsigned char *ypred_ptr,
int y_stride,
int mode,
int Up,
int Left);
void vp9_build_intra_predictors_mby_neon(MACROBLOCKD *xd) {
unsigned char *y_buffer = xd->dst.y_buffer;
unsigned char *ypred_ptr = xd->predictor;
int y_stride = xd->dst.y_stride;
int mode = xd->mode_info_context->mbmi.mode;
int Up = xd->up_available;
int Left = xd->left_available;
vp9_build_intra_predictors_mby_neon_func(y_buffer, ypred_ptr,
y_stride, mode, Up, Left);
}
#endif
#if HAVE_ARMV7
extern void vp9_build_intra_predictors_mby_s_neon_func(
unsigned char *y_buffer,
unsigned char *ypred_ptr,
int y_stride,
int mode,
int Up,
int Left);
void vp9_build_intra_predictors_mby_s_neon(MACROBLOCKD *xd) {
unsigned char *y_buffer = xd->dst.y_buffer;
unsigned char *ypred_ptr = xd->predictor;
int y_stride = xd->dst.y_stride;
int mode = xd->mode_info_context->mbmi.mode;
int Up = xd->up_available;
int Left = xd->left_available;
vp9_build_intra_predictors_mby_s_neon_func(y_buffer, ypred_ptr,
y_stride, mode, Up, Left);
}
#endif


@ -1,89 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_ARM_VP9_SUBPIXEL_ARM_H_
#define VP9_COMMON_ARM_VP9_SUBPIXEL_ARM_H_
#if HAVE_ARMV6
extern prototype_subpixel_predict(vp9_sixtap_predict16x16_armv6);
extern prototype_subpixel_predict(vp9_sixtap_predict8x8_armv6);
extern prototype_subpixel_predict(vp9_sixtap_predict8x4_armv6);
extern prototype_subpixel_predict(vp9_sixtap_predict_armv6);
extern prototype_subpixel_predict(vp9_bilinear_predict16x16_armv6);
extern prototype_subpixel_predict(vp9_bilinear_predict8x8_armv6);
extern prototype_subpixel_predict(vp9_bilinear_predict8x4_armv6);
extern prototype_subpixel_predict(vp9_bilinear_predict4x4_armv6);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp9_subpix_sixtap16x16
#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_armv6
#undef vp9_subpix_sixtap8x8
#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_armv6
#undef vp9_subpix_sixtap8x4
#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_armv6
#undef vp9_subpix_sixtap4x4
#define vp9_subpix_sixtap4x4 vp9_sixtap_predict_armv6
#undef vp9_subpix_bilinear16x16
#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_armv6
#undef vp9_subpix_bilinear8x8
#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_armv6
#undef vp9_subpix_bilinear8x4
#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_armv6
#undef vp9_subpix_bilinear4x4
#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_armv6
#endif
#endif
#if HAVE_ARMV7
extern prototype_subpixel_predict(vp9_sixtap_predict16x16_neon);
extern prototype_subpixel_predict(vp9_sixtap_predict8x8_neon);
extern prototype_subpixel_predict(vp9_sixtap_predict8x4_neon);
extern prototype_subpixel_predict(vp9_sixtap_predict_neon);
extern prototype_subpixel_predict(vp9_bilinear_predict16x16_neon);
extern prototype_subpixel_predict(vp9_bilinear_predict8x8_neon);
extern prototype_subpixel_predict(vp9_bilinear_predict8x4_neon);
extern prototype_subpixel_predict(vp9_bilinear_predict4x4_neon);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp9_subpix_sixtap16x16
#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_neon
#undef vp9_subpix_sixtap8x8
#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_neon
#undef vp9_subpix_sixtap8x4
#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_neon
#undef vp9_subpix_sixtap4x4
#define vp9_subpix_sixtap4x4 vp9_sixtap_predict_neon
#undef vp9_subpix_bilinear16x16
#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_neon
#undef vp9_subpix_bilinear8x8
#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_neon
#undef vp9_subpix_bilinear8x4
#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_neon
#undef vp9_subpix_bilinear4x4
#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_neon
#endif
#endif
#endif


@ -12,29 +12,10 @@
#include "vpx_config.h"
#include "vpx/vpx_codec.h"
#include "vpx_ports/asm_offsets.h"
#include "vpx_scale/yv12config.h"
BEGIN
/* vpx_scale */
DEFINE(yv12_buffer_config_y_width, offsetof(YV12_BUFFER_CONFIG, y_width));
DEFINE(yv12_buffer_config_y_height, offsetof(YV12_BUFFER_CONFIG, y_height));
DEFINE(yv12_buffer_config_y_stride, offsetof(YV12_BUFFER_CONFIG, y_stride));
DEFINE(yv12_buffer_config_uv_width, offsetof(YV12_BUFFER_CONFIG, uv_width));
DEFINE(yv12_buffer_config_uv_height, offsetof(YV12_BUFFER_CONFIG, uv_height));
DEFINE(yv12_buffer_config_uv_stride, offsetof(YV12_BUFFER_CONFIG, uv_stride));
DEFINE(yv12_buffer_config_y_buffer, offsetof(YV12_BUFFER_CONFIG, y_buffer));
DEFINE(yv12_buffer_config_u_buffer, offsetof(YV12_BUFFER_CONFIG, u_buffer));
DEFINE(yv12_buffer_config_v_buffer, offsetof(YV12_BUFFER_CONFIG, v_buffer));
DEFINE(yv12_buffer_config_border, offsetof(YV12_BUFFER_CONFIG, border));
DEFINE(VP9BORDERINPIXELS_VAL, VP9BORDERINPIXELS);
END
/* add asserts for any offset that is not supported by assembly code */
/* add asserts for any size that is not supported by assembly code */
#if HAVE_ARMV7
/* vp8_yv12_extend_frame_borders_neon makes several assumptions based on this */
ct_assert(VP9BORDERINPIXELS_VAL, VP9BORDERINPIXELS == 32)
#endif
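The DEFINE()/BEGIN/END block above is the usual offsetof-extraction trick: the build compiles this file and scrapes the named constants into an assembler include, so hand-written asm can index C structs symbolically, and ct_assert() turns a violated layout assumption into a compile-time error rather than silent corruption in asm that cannot see the C struct.

/* Illustrative shape of what the extraction yields for the assembler
 * (the offset value here is hypothetical):
 *     yv12_buffer_config_y_stride EQU 8  ; offsetof(YV12_BUFFER_CONFIG, y_stride)
 * which asm then consumes as:
 *     ldr r2, [r0, #yv12_buffer_config_y_stride]
 */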


@ -22,11 +22,7 @@ typedef enum {
SIMPLE_LOOPFILTER = 1
} LOOPFILTERTYPE;
#if ARCH_ARM
#define SIMD_WIDTH 1
#else
#define SIMD_WIDTH 16
#endif
/* Need to align this structure so when it is declared and
* passed it can be loaded into vector registers.
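SIMD_WIDTH sets how many copies of each threshold the filter-limit arrays hold: the x86 loopfilters load a full 16-byte vector of the replicated threshold, while the ARM code reads a single byte and broadcasts it in registers (see the *lfi->mblim dereferences removed above), so a width of 1 suffices. Roughly, with field names illustrative:

struct loop_filter_thresholds_sketch {
  unsigned char mblim[SIMD_WIDTH];   /* one threshold, replicated */
  unsigned char blim[SIMD_WIDTH];
  unsigned char lim[SIMD_WIDTH];
  unsigned char hev_thr[SIMD_WIDTH];
};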
@ -67,10 +63,6 @@ struct loop_filter_info {
#include "x86/vp9_loopfilter_x86.h"
#endif
#if ARCH_ARM
#include "arm/vp9_loopfilter_arm.h"
#endif
typedef void loop_filter_uvfunction(unsigned char *u, /* source pointer */
int p, /* pitch */
const unsigned char *blimit,


@ -79,13 +79,11 @@ specialize vp9_dequant_idct_add_uv_block mmx
# RECON
#
prototype void vp9_copy_mem16x16 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
specialize vp9_copy_mem16x16 mmx sse2 media neon dspr2
vp9_copy_mem16x16_media=vp9_copy_mem16x16_v6
specialize vp9_copy_mem16x16 mmx sse2 dspr2
vp9_copy_mem16x16_dspr2=vp9_copy_mem16x16_dspr2
prototype void vp9_copy_mem8x8 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
specialize vp9_copy_mem8x8 mmx media neon dspr2
vp9_copy_mem8x8_media=vp9_copy_mem8x8_v6
specialize vp9_copy_mem8x8 mmx dspr2
vp9_copy_mem8x8_dspr2=vp9_copy_mem8x8_dspr2
prototype void vp9_copy_mem8x4 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
@ -98,8 +96,7 @@ prototype void vp9_avg_mem8x8 "unsigned char *src, int src_pitch, unsigned char
specialize vp9_avg_mem8x8
prototype void vp9_copy_mem8x4 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
specialize vp9_copy_mem8x4 mmx media neon dspr2
vp9_copy_mem8x4_media=vp9_copy_mem8x4_v6
specialize vp9_copy_mem8x4 mmx dspr2
vp9_copy_mem8x4_dspr2=vp9_copy_mem8x4_dspr2
prototype void vp9_recon_b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
@ -193,36 +190,28 @@ prototype void vp9_loop_filter_bh8x8 "unsigned char *y, unsigned char *u, unsign
specialize vp9_loop_filter_bh8x8 sse2
prototype void vp9_loop_filter_simple_mbv "unsigned char *y, int ystride, const unsigned char *blimit"
specialize vp9_loop_filter_simple_mbv mmx sse2 media neon
specialize vp9_loop_filter_simple_mbv mmx sse2
vp9_loop_filter_simple_mbv_c=vp9_loop_filter_simple_vertical_edge_c
vp9_loop_filter_simple_mbv_mmx=vp9_loop_filter_simple_vertical_edge_mmx
vp9_loop_filter_simple_mbv_sse2=vp9_loop_filter_simple_vertical_edge_sse2
vp9_loop_filter_simple_mbv_media=vp9_loop_filter_simple_vertical_edge_armv6
vp9_loop_filter_simple_mbv_neon=vp9_loop_filter_mbvs_neon
prototype void vp9_loop_filter_simple_mbh "unsigned char *y, int ystride, const unsigned char *blimit"
specialize vp9_loop_filter_simple_mbh mmx sse2 media neon
specialize vp9_loop_filter_simple_mbh mmx sse2
vp9_loop_filter_simple_mbh_c=vp9_loop_filter_simple_horizontal_edge_c
vp9_loop_filter_simple_mbh_mmx=vp9_loop_filter_simple_horizontal_edge_mmx
vp9_loop_filter_simple_mbh_sse2=vp9_loop_filter_simple_horizontal_edge_sse2
vp9_loop_filter_simple_mbh_media=vp9_loop_filter_simple_horizontal_edge_armv6
vp9_loop_filter_simple_mbh_neon=vp9_loop_filter_mbhs_neon
prototype void vp9_loop_filter_simple_bv "unsigned char *y, int ystride, const unsigned char *blimit"
specialize vp9_loop_filter_simple_bv mmx sse2 media neon
specialize vp9_loop_filter_simple_bv mmx sse2
vp9_loop_filter_simple_bv_c=vp9_loop_filter_bvs_c
vp9_loop_filter_simple_bv_mmx=vp9_loop_filter_bvs_mmx
vp9_loop_filter_simple_bv_sse2=vp9_loop_filter_bvs_sse2
vp9_loop_filter_simple_bv_media=vp9_loop_filter_bvs_armv6
vp9_loop_filter_simple_bv_neon=vp9_loop_filter_bvs_neon
prototype void vp9_loop_filter_simple_bh "unsigned char *y, int ystride, const unsigned char *blimit"
specialize vp9_loop_filter_simple_bh mmx sse2 media neon
specialize vp9_loop_filter_simple_bh mmx sse2
vp9_loop_filter_simple_bh_c=vp9_loop_filter_bhs_c
vp9_loop_filter_simple_bh_mmx=vp9_loop_filter_bhs_mmx
vp9_loop_filter_simple_bh_sse2=vp9_loop_filter_bhs_sse2
vp9_loop_filter_simple_bh_media=vp9_loop_filter_bhs_armv6
vp9_loop_filter_simple_bh_neon=vp9_loop_filter_bhs_neon
#
# post proc
@ -683,7 +672,7 @@ prototype void vp9_temporal_filter_apply "unsigned char *frame1, unsigned int st
specialize vp9_temporal_filter_apply sse2
prototype void vp9_yv12_copy_partial_frame "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int fraction"
specialize vp9_yv12_copy_partial_frame neon
specialize vp9_yv12_copy_partial_frame
fi
@ -716,11 +705,11 @@ if [ "$CONFIG_SPATIAL_RESAMPLING" = "yes" ]; then
fi
prototype void vp8_yv12_extend_frame_borders "struct yv12_buffer_config *ybf"
specialize vp8_yv12_extend_frame_borders neon
specialize vp8_yv12_extend_frame_borders
prototype void vp8_yv12_copy_frame "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc"
specialize vp8_yv12_copy_frame neon
specialize vp8_yv12_copy_frame
prototype void vp8_yv12_copy_y "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc"
specialize vp8_yv12_copy_y neon
specialize vp8_yv12_copy_y
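For context: prototype declares the C signature and specialize lists the per-ISA variants the rtcd generator may select, so dropping media/neon from a specialize line is what unplugs the ARM symbols from dispatch. The generated header comes out roughly like this (illustrative, for the new vp9_copy_mem16x16 line above):

/* Hypothetical rtcd generator output for
 *   specialize vp9_copy_mem16x16 mmx sse2 dspr2 */
void vp9_copy_mem16x16_c(unsigned char *src, int src_pitch,
                         unsigned char *dst, int dst_pitch);
void vp9_copy_mem16x16_mmx(unsigned char *src, int src_pitch,
                           unsigned char *dst, int dst_pitch);
void vp9_copy_mem16x16_sse2(unsigned char *src, int src_pitch,
                            unsigned char *dst, int dst_pitch);
RTCD_EXTERN void (*vp9_copy_mem16x16)(unsigned char *src, int src_pitch,
                                      unsigned char *dst, int dst_pitch);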


@ -1,218 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
EXPORT |vp8_dequant_dc_idct_add_v6|
AREA |.text|, CODE, READONLY
;void vp8_dequant_dc_idct_add_v6(short *input, short *dq, unsigned char *pred,
; unsigned char *dest, int pitch, int stride, int Dc)
; r0 = input
; r1 = dq
; r2 = pred
; r3 = dest
; sp + 36 = pitch ; +4 = 40
; sp + 40 = stride ; +4 = 44
; sp + 44 = Dc ; +4 = 48
|vp8_dequant_dc_idct_add_v6| PROC
stmdb sp!, {r4-r11, lr}
ldr r6, [sp, #44]
ldr r4, [r0] ;input
ldr r5, [r1], #4 ;dq
sub sp, sp, #4
str r3, [sp]
smultt r7, r4, r5
ldr r4, [r0, #4] ;input
ldr r5, [r1], #4 ;dq
strh r6, [r0], #2
strh r7, [r0], #2
smulbb r6, r4, r5
smultt r7, r4, r5
ldr r4, [r0, #4] ;input
ldr r5, [r1], #4 ;dq
strh r6, [r0], #2
strh r7, [r0], #2
mov r12, #3
vp8_dequant_dc_add_loop
smulbb r6, r4, r5
smultt r7, r4, r5
ldr r4, [r0, #4] ;input
ldr r5, [r1], #4 ;dq
strh r6, [r0], #2
strh r7, [r0], #2
smulbb r6, r4, r5
smultt r7, r4, r5
subs r12, r12, #1
ldrne r4, [r0, #4]
ldrne r5, [r1], #4
strh r6, [r0], #2
strh r7, [r0], #2
bne vp8_dequant_dc_add_loop
sub r0, r0, #32
mov r1, r0
; short_idct4x4llm_v6_dual
ldr r3, cospi8sqrt2minus1
ldr r4, sinpi8sqrt2
ldr r6, [r0, #8]
mov r5, #2
vp8_dequant_dc_idct_loop1_v6
ldr r12, [r0, #24]
ldr r14, [r0, #16]
smulwt r9, r3, r6
smulwb r7, r3, r6
smulwt r10, r4, r6
smulwb r8, r4, r6
pkhbt r7, r7, r9, lsl #16
smulwt r11, r3, r12
pkhbt r8, r8, r10, lsl #16
uadd16 r6, r6, r7
smulwt r7, r4, r12
smulwb r9, r3, r12
smulwb r10, r4, r12
subs r5, r5, #1
pkhbt r9, r9, r11, lsl #16
ldr r11, [r0], #4
pkhbt r10, r10, r7, lsl #16
uadd16 r7, r12, r9
usub16 r7, r8, r7
uadd16 r6, r6, r10
uadd16 r10, r11, r14
usub16 r8, r11, r14
uadd16 r9, r10, r6
usub16 r10, r10, r6
uadd16 r6, r8, r7
usub16 r7, r8, r7
str r6, [r1, #8]
ldrne r6, [r0, #8]
str r7, [r1, #16]
str r10, [r1, #24]
str r9, [r1], #4
bne vp8_dequant_dc_idct_loop1_v6
mov r5, #2
sub r0, r1, #8
vp8_dequant_dc_idct_loop2_v6
ldr r6, [r0], #4
ldr r7, [r0], #4
ldr r8, [r0], #4
ldr r9, [r0], #4
smulwt r1, r3, r6
smulwt r12, r4, r6
smulwt lr, r3, r8
smulwt r10, r4, r8
pkhbt r11, r8, r6, lsl #16
pkhbt r1, lr, r1, lsl #16
pkhbt r12, r10, r12, lsl #16
pkhtb r6, r6, r8, asr #16
uadd16 r6, r1, r6
pkhbt lr, r9, r7, lsl #16
uadd16 r10, r11, lr
usub16 lr, r11, lr
pkhtb r8, r7, r9, asr #16
subs r5, r5, #1
smulwt r1, r3, r8
smulwb r7, r3, r8
smulwt r11, r4, r8
smulwb r9, r4, r8
pkhbt r1, r7, r1, lsl #16
uadd16 r8, r1, r8
pkhbt r11, r9, r11, lsl #16
usub16 r1, r12, r8
uadd16 r8, r11, r6
ldr r9, c0x00040004
ldr r12, [sp, #40]
uadd16 r6, r10, r8
usub16 r7, r10, r8
uadd16 r7, r7, r9
uadd16 r6, r6, r9
uadd16 r10, r14, r1
usub16 r1, r14, r1
uadd16 r10, r10, r9
uadd16 r1, r1, r9
ldr r11, [r2], r12
mov r8, r7, asr #3
pkhtb r9, r8, r10, asr #19
mov r8, r1, asr #3
pkhtb r8, r8, r6, asr #19
uxtb16 lr, r11, ror #8
qadd16 r9, r9, lr
uxtb16 lr, r11
qadd16 r8, r8, lr
usat16 r9, #8, r9
usat16 r8, #8, r8
orr r9, r8, r9, lsl #8
ldr r11, [r2], r12
ldr lr, [sp]
ldr r12, [sp, #44]
mov r7, r7, lsl #16
mov r1, r1, lsl #16
mov r10, r10, lsl #16
mov r6, r6, lsl #16
mov r7, r7, asr #3
pkhtb r7, r7, r10, asr #19
mov r1, r1, asr #3
pkhtb r1, r1, r6, asr #19
uxtb16 r8, r11, ror #8
qadd16 r7, r7, r8
uxtb16 r8, r11
qadd16 r1, r1, r8
usat16 r7, #8, r7
usat16 r1, #8, r1
orr r1, r1, r7, lsl #8
str r9, [lr], r12
str r1, [lr], r12
str lr, [sp]
bne vp8_dequant_dc_idct_loop2_v6
; vpx_memset
sub r0, r0, #32
add sp, sp, #4
mov r12, #0
str r12, [r0]
str r12, [r0, #4]
str r12, [r0, #8]
str r12, [r0, #12]
str r12, [r0, #16]
str r12, [r0, #20]
str r12, [r0, #24]
str r12, [r0, #28]
ldmia sp!, {r4 - r11, pc}
ENDP ; |vp8_dequant_dc_idct_add_v6|
; Constant Pool
cospi8sqrt2minus1 DCD 0x00004E7B
sinpi8sqrt2 DCD 0x00008A8C
c0x00040004 DCD 0x00040004
END
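The constant pool above is the standard VP8/VP9 4x4 IDCT multiplier pair in Q16 fixed point: 0x4E7B = 20091 ~ (cos(pi/8)*sqrt(2) - 1)*65536 and 0x8A8C = 35468 ~ sin(pi/8)*sqrt(2)*65536, with c0x00040004 feeding the final (x + 4) >> 3 rounding two 16-bit lanes at a time. In scalar C the two multipliers are:

/* Scalar form of the smulw*/uadd16 sequences above (Q16 multipliers). */
static int mul_c1(int x) { return x + ((x * 20091) >> 16); }  /* x * cos(pi/8) * sqrt(2) */
static int mul_s1(int x) { return (x * 35468) >> 16; }        /* x * sin(pi/8) * sqrt(2) */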


@ -1,196 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
EXPORT |vp8_dequant_idct_add_v6|
AREA |.text|, CODE, READONLY
;void vp8_dequant_idct_add_v6(short *input, short *dq, unsigned char *pred,
; unsigned char *dest, int pitch, int stride)
; r0 = input
; r1 = dq
; r2 = pred
; r3 = dest
; sp + 36 = pitch ; +4 = 40
; sp + 40 = stride ; +4 = 44
|vp8_dequant_idct_add_v6| PROC
stmdb sp!, {r4-r11, lr}
ldr r4, [r0] ;input
ldr r5, [r1], #4 ;dq
sub sp, sp, #4
str r3, [sp]
mov r12, #4
vp8_dequant_add_loop
smulbb r6, r4, r5
smultt r7, r4, r5
ldr r4, [r0, #4] ;input
ldr r5, [r1], #4 ;dq
strh r6, [r0], #2
strh r7, [r0], #2
smulbb r6, r4, r5
smultt r7, r4, r5
subs r12, r12, #1
ldrne r4, [r0, #4]
ldrne r5, [r1], #4
strh r6, [r0], #2
strh r7, [r0], #2
bne vp8_dequant_add_loop
sub r0, r0, #32
mov r1, r0
; short_idct4x4llm_v6_dual
ldr r3, cospi8sqrt2minus1
ldr r4, sinpi8sqrt2
ldr r6, [r0, #8]
mov r5, #2
vp8_dequant_idct_loop1_v6
ldr r12, [r0, #24]
ldr r14, [r0, #16]
smulwt r9, r3, r6
smulwb r7, r3, r6
smulwt r10, r4, r6
smulwb r8, r4, r6
pkhbt r7, r7, r9, lsl #16
smulwt r11, r3, r12
pkhbt r8, r8, r10, lsl #16
uadd16 r6, r6, r7
smulwt r7, r4, r12
smulwb r9, r3, r12
smulwb r10, r4, r12
subs r5, r5, #1
pkhbt r9, r9, r11, lsl #16
ldr r11, [r0], #4
pkhbt r10, r10, r7, lsl #16
uadd16 r7, r12, r9
usub16 r7, r8, r7
uadd16 r6, r6, r10
uadd16 r10, r11, r14
usub16 r8, r11, r14
uadd16 r9, r10, r6
usub16 r10, r10, r6
uadd16 r6, r8, r7
usub16 r7, r8, r7
str r6, [r1, #8]
ldrne r6, [r0, #8]
str r7, [r1, #16]
str r10, [r1, #24]
str r9, [r1], #4
bne vp8_dequant_idct_loop1_v6
mov r5, #2
sub r0, r1, #8
vp8_dequant_idct_loop2_v6
ldr r6, [r0], #4
ldr r7, [r0], #4
ldr r8, [r0], #4
ldr r9, [r0], #4
smulwt r1, r3, r6
smulwt r12, r4, r6
smulwt lr, r3, r8
smulwt r10, r4, r8
pkhbt r11, r8, r6, lsl #16
pkhbt r1, lr, r1, lsl #16
pkhbt r12, r10, r12, lsl #16
pkhtb r6, r6, r8, asr #16
uadd16 r6, r1, r6
pkhbt lr, r9, r7, lsl #16
uadd16 r10, r11, lr
usub16 lr, r11, lr
pkhtb r8, r7, r9, asr #16
subs r5, r5, #1
smulwt r1, r3, r8
smulwb r7, r3, r8
smulwt r11, r4, r8
smulwb r9, r4, r8
pkhbt r1, r7, r1, lsl #16
uadd16 r8, r1, r8
pkhbt r11, r9, r11, lsl #16
usub16 r1, r12, r8
uadd16 r8, r11, r6
ldr r9, c0x00040004
ldr r12, [sp, #40]
uadd16 r6, r10, r8
usub16 r7, r10, r8
uadd16 r7, r7, r9
uadd16 r6, r6, r9
uadd16 r10, r14, r1
usub16 r1, r14, r1
uadd16 r10, r10, r9
uadd16 r1, r1, r9
ldr r11, [r2], r12
mov r8, r7, asr #3
pkhtb r9, r8, r10, asr #19
mov r8, r1, asr #3
pkhtb r8, r8, r6, asr #19
uxtb16 lr, r11, ror #8
qadd16 r9, r9, lr
uxtb16 lr, r11
qadd16 r8, r8, lr
usat16 r9, #8, r9
usat16 r8, #8, r8
orr r9, r8, r9, lsl #8
ldr r11, [r2], r12
ldr lr, [sp]
ldr r12, [sp, #44]
mov r7, r7, lsl #16
mov r1, r1, lsl #16
mov r10, r10, lsl #16
mov r6, r6, lsl #16
mov r7, r7, asr #3
pkhtb r7, r7, r10, asr #19
mov r1, r1, asr #3
pkhtb r1, r1, r6, asr #19
uxtb16 r8, r11, ror #8
qadd16 r7, r7, r8
uxtb16 r8, r11
qadd16 r1, r1, r8
usat16 r7, #8, r7
usat16 r1, #8, r1
orr r1, r1, r7, lsl #8
str r9, [lr], r12
str r1, [lr], r12
str lr, [sp]
bne vp8_dequant_idct_loop2_v6
; vpx_memset
sub r0, r0, #32
add sp, sp, #4
mov r12, #0
str r12, [r0]
str r12, [r0, #4]
str r12, [r0, #8]
str r12, [r0, #12]
str r12, [r0, #16]
str r12, [r0, #20]
str r12, [r0, #24]
str r12, [r0, #28]
ldmia sp!, {r4 - r11, pc}
ENDP ; |vp8_dequant_idct_add_v6|
; Constant Pool
cospi8sqrt2minus1 DCD 0x00004E7B
sinpi8sqrt2 DCD 0x00008A8C
c0x00040004 DCD 0x00040004
END


@ -1,69 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_dequantize_b_loop_v6|
AREA |.text|, CODE, READONLY ; name this block of code
;-------------------------------
;void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
; r0 short *Q,
; r1 short *DQC
; r2 short *DQ
|vp8_dequantize_b_loop_v6| PROC
stmdb sp!, {r4-r9, lr}
ldr r3, [r0] ;load Q
ldr r4, [r1] ;load DQC
ldr r5, [r0, #4]
ldr r6, [r1, #4]
mov r12, #2 ;loop counter
dequant_loop
smulbb r7, r3, r4 ;multiply
smultt r8, r3, r4
smulbb r9, r5, r6
smultt lr, r5, r6
ldr r3, [r0, #8]
ldr r4, [r1, #8]
ldr r5, [r0, #12]
ldr r6, [r1, #12]
strh r7, [r2], #2 ;store result
smulbb r7, r3, r4 ;multiply
strh r8, [r2], #2
smultt r8, r3, r4
strh r9, [r2], #2
smulbb r9, r5, r6
strh lr, [r2], #2
smultt lr, r5, r6
subs r12, r12, #1
add r0, r0, #16
add r1, r1, #16
ldrne r3, [r0]
strh r7, [r2], #2 ;store result
ldrne r4, [r1]
strh r8, [r2], #2
ldrne r5, [r0, #4]
strh r9, [r2], #2
ldrne r6, [r1, #4]
strh lr, [r2], #2
bne dequant_loop
ldmia sp!, {r4-r9, pc}
ENDP ;|vp8_dequantize_b_loop_v6|
END
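
Stripped of the software pipelining, the SMULBB/SMULTT sequence above is just sixteen independent 16-bit multiplies. A scalar C equivalent of what it computes (a sketch, not the project's reference implementation):

/* DQ[i] = Q[i] * DQC[i] for one 4x4 block of coefficients */
static void dequantize_b_loop_sketch(const short *Q, const short *DQC,
                                     short *DQ) {
  int i;
  for (i = 0; i < 16; i++)
    DQ[i] = (short)(Q[i] * DQC[i]);
}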


@ -1,137 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_ports/config.h"
#include "vp9/common/vp9_blockd.h"
#include "vp9/decoder/vp9_dequantize.h"
void vp8_dequant_dc_idct_add_y_block_v6(short *q, short *dq,
unsigned char *pre,
unsigned char *dst, int stride,
unsigned short *eobs, short *dc) {
int i;
for (i = 0; i < 4; i++) {
if (eobs[0] > 1)
vp8_dequant_dc_idct_add_v6(q, dq, pre, dst, 16, stride, dc[0]);
else
vp8_dc_only_idct_add_v6(dc[0], pre, dst, 16, stride);
if (eobs[1] > 1)
vp8_dequant_dc_idct_add_v6(q + 16, dq, pre + 4, dst + 4, 16, stride, dc[1]);
else
vp8_dc_only_idct_add_v6(dc[1], pre + 4, dst + 4, 16, stride);
if (eobs[2] > 1)
vp8_dequant_dc_idct_add_v6(q + 32, dq, pre + 8, dst + 8, 16, stride, dc[2]);
else
vp8_dc_only_idct_add_v6(dc[2], pre + 8, dst + 8, 16, stride);
if (eobs[3] > 1)
vp8_dequant_dc_idct_add_v6(q + 48, dq, pre + 12, dst + 12, 16, stride, dc[3]);
else
vp8_dc_only_idct_add_v6(dc[3], pre + 12, dst + 12, 16, stride);
q += 64;
dc += 4;
pre += 64;
dst += 4 * stride;
eobs += 4;
}
}
void vp8_dequant_idct_add_y_block_v6(short *q, short *dq, unsigned char *pre,
unsigned char *dst, int stride,
unsigned short *eobs) {
int i;
for (i = 0; i < 4; i++) {
if (eobs[0] > 1)
vp8_dequant_idct_add_v6(q, dq, pre, dst, 16, stride);
else {
vp8_dc_only_idct_add_v6(q[0]*dq[0], pre, dst, 16, stride);
((int *)q)[0] = 0;
}
if (eobs[1] > 1)
vp8_dequant_idct_add_v6(q + 16, dq, pre + 4, dst + 4, 16, stride);
else {
vp8_dc_only_idct_add_v6(q[16]*dq[0], pre + 4, dst + 4, 16, stride);
((int *)(q + 16))[0] = 0;
}
if (eobs[2] > 1)
vp8_dequant_idct_add_v6(q + 32, dq, pre + 8, dst + 8, 16, stride);
else {
vp8_dc_only_idct_add_v6(q[32]*dq[0], pre + 8, dst + 8, 16, stride);
((int *)(q + 32))[0] = 0;
}
if (eobs[3] > 1)
vp8_dequant_idct_add_v6(q + 48, dq, pre + 12, dst + 12, 16, stride);
else {
vp8_dc_only_idct_add_v6(q[48]*dq[0], pre + 12, dst + 12, 16, stride);
((int *)(q + 48))[0] = 0;
}
q += 64;
pre += 64;
dst += 4 * stride;
eobs += 4;
}
}
void vp8_dequant_idct_add_uv_block_v6(short *q, short *dq, unsigned char *pre,
unsigned char *dstu, unsigned char *dstv,
int stride, unsigned short *eobs) {
int i;
for (i = 0; i < 2; i++) {
if (eobs[0] > 1)
vp8_dequant_idct_add_v6(q, dq, pre, dstu, 8, stride);
else {
vp8_dc_only_idct_add_v6(q[0]*dq[0], pre, dstu, 8, stride);
((int *)q)[0] = 0;
}
if (eobs[1] > 1)
vp8_dequant_idct_add_v6(q + 16, dq, pre + 4, dstu + 4, 8, stride);
else {
vp8_dc_only_idct_add_v6(q[16]*dq[0], pre + 4, dstu + 4, 8, stride);
((int *)(q + 16))[0] = 0;
}
q += 32;
pre += 32;
dstu += 4 * stride;
eobs += 2;
}
for (i = 0; i < 2; i++) {
if (eobs[0] > 1)
vp8_dequant_idct_add_v6(q, dq, pre, dstv, 8, stride);
else {
vp8_dc_only_idct_add_v6(q[0]*dq[0], pre, dstv, 8, stride);
((int *)q)[0] = 0;
}
if (eobs[1] > 1)
vp8_dequant_idct_add_v6(q + 16, dq, pre + 4, dstv + 4, 8, stride);
else {
vp8_dc_only_idct_add_v6(q[16]*dq[0], pre + 4, dstv + 4, 8, stride);
((int *)(q + 16))[0] = 0;
}
q += 32;
pre += 32;
dstv += 4 * stride;
eobs += 2;
}
}
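
The eobs[i] > 1 tests above choose between the full inverse transform and a DC-only shortcut: eob is the count of coded coefficients, so at 0 or 1 only the DC term can be nonzero and the whole 4x4 transform degenerates to adding one rounded constant. A hedged sketch of that fallback (helper names are illustrative):

static unsigned char clamp_u8_sketch(int v) {
  return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
}

/* dc-only path: every output pixel gets the same rounded DC term */
static void dc_only_idct_add_sketch(short dc, const unsigned char *pred,
                                    unsigned char *dst,
                                    int pitch, int stride) {
  int r, c;
  int add = (dc + 4) >> 3;   /* same rounding the full IDCT applies */
  for (r = 0; r < 4; r++)
    for (c = 0; c < 4; c++)
      dst[r * stride + c] = clamp_u8_sketch(pred[r * pitch + c] + add);
}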


@ -1,129 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_dequant_idct_add_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp8_dequant_idct_add_neon(short *input, short *dq, unsigned char *pred,
; unsigned char *dest, int pitch, int stride)
; r0 short *input,
; r1 short *dq,
; r2 unsigned char *pred
; r3 unsigned char *dest
; sp int pitch
; sp+4 int stride
|vp8_dequant_idct_add_neon| PROC
vld1.16 {q3, q4}, [r0]
vld1.16 {q5, q6}, [r1]
ldr r1, [sp] ; pitch
vld1.32 {d14[0]}, [r2], r1
vld1.32 {d14[1]}, [r2], r1
vld1.32 {d15[0]}, [r2], r1
vld1.32 {d15[1]}, [r2]
ldr r1, [sp, #4] ; stride
adr r12, cospi8sqrt2minus1 ; pointer to the first constant
vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon
vmul.i16 q2, q4, q6
;|short_idct4x4llm_neon| PROC
vld1.16 {d0}, [r12]
vswp d3, d4 ;q2(vp[4] vp[12])
vqdmulh.s16 q3, q2, d0[2]
vqdmulh.s16 q4, q2, d0[0]
vqadd.s16 d12, d2, d3 ;a1
vqsub.s16 d13, d2, d3 ;b1
vshr.s16 q3, q3, #1
vshr.s16 q4, q4, #1
vqadd.s16 q3, q3, q2
vqadd.s16 q4, q4, q2
vqsub.s16 d10, d6, d9 ;c1
vqadd.s16 d11, d7, d8 ;d1
vqadd.s16 d2, d12, d11
vqadd.s16 d3, d13, d10
vqsub.s16 d4, d13, d10
vqsub.s16 d5, d12, d11
vtrn.32 d2, d4
vtrn.32 d3, d5
vtrn.16 d2, d3
vtrn.16 d4, d5
; memset(input, 0, 32) -- 32 bytes
vmov.i16 q14, #0
vswp d3, d4
vqdmulh.s16 q3, q2, d0[2]
vqdmulh.s16 q4, q2, d0[0]
vqadd.s16 d12, d2, d3 ;a1
vqsub.s16 d13, d2, d3 ;b1
vmov q15, q14
vshr.s16 q3, q3, #1
vshr.s16 q4, q4, #1
vqadd.s16 q3, q3, q2
vqadd.s16 q4, q4, q2
vqsub.s16 d10, d6, d9 ;c1
vqadd.s16 d11, d7, d8 ;d1
vqadd.s16 d2, d12, d11
vqadd.s16 d3, d13, d10
vqsub.s16 d4, d13, d10
vqsub.s16 d5, d12, d11
vst1.16 {q14, q15}, [r0]
vrshr.s16 d2, d2, #3
vrshr.s16 d3, d3, #3
vrshr.s16 d4, d4, #3
vrshr.s16 d5, d5, #3
vtrn.32 d2, d4
vtrn.32 d3, d5
vtrn.16 d2, d3
vtrn.16 d4, d5
vaddw.u8 q1, q1, d14
vaddw.u8 q2, q2, d15
vqmovun.s16 d0, q1
vqmovun.s16 d1, q2
vst1.32 {d0[0]}, [r3], r1
vst1.32 {d0[1]}, [r3], r1
vst1.32 {d1[0]}, [r3], r1
vst1.32 {d1[1]}, [r3]
bx lr
ENDP ; |vp8_dequant_idct_add_neon|
; Constant Pool
cospi8sqrt2minus1 DCD 0x4e7b4e7b
sinpi8sqrt2 DCD 0x8a8c8a8c
END


@ -1,34 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_dequantize_b_loop_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 short *Q,
; r1 short *DQC
; r2 short *DQ
|vp8_dequantize_b_loop_neon| PROC
vld1.16 {q0, q1}, [r0]
vld1.16 {q2, q3}, [r1]
vmul.i16 q4, q0, q2
vmul.i16 q5, q1, q3
vst1.16 {q4, q5}, [r2]
bx lr
ENDP
END
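
The two vmul.i16 instructions above do the same sixteen multiplies as the v6 loop, eight lanes at a time. An intrinsics sketch of the equivalent, assuming arm_neon.h (this is not the code the project shipped):

#include <arm_neon.h>

/* DQ[i] = Q[i] * DQC[i], eight 16-bit lanes per instruction */
static void dequantize_b_loop_neon_sketch(const short *Q, const short *DQC,
                                          short *DQ) {
  int16x8_t q_lo = vld1q_s16(Q);        /* coefficients 0..7  */
  int16x8_t q_hi = vld1q_s16(Q + 8);    /* coefficients 8..15 */
  int16x8_t c_lo = vld1q_s16(DQC);
  int16x8_t c_hi = vld1q_s16(DQC + 8);
  vst1q_s16(DQ,     vmulq_s16(q_lo, c_lo));
  vst1q_s16(DQ + 8, vmulq_s16(q_hi, c_hi));
}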

Просмотреть файл

@ -1,113 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_ports/config.h"
#include "vp9/common/vp9_blockd.h"
#include "vp9/decoder/vp9_dequantize.h"
/* place these declarations here because we don't want to maintain them
* outside of this scope
*/
void idct_dequant_dc_full_2x_neon
(short *input, short *dq, unsigned char *pre, unsigned char *dst,
int stride, short *dc);
void idct_dequant_dc_0_2x_neon
(short *dc, unsigned char *pre, unsigned char *dst, int stride);
void idct_dequant_full_2x_neon
(short *q, short *dq, unsigned char *pre, unsigned char *dst,
int pitch, int stride);
void idct_dequant_0_2x_neon
(short *q, short dq, unsigned char *pre, int pitch,
unsigned char *dst, int stride);
void vp8_dequant_dc_idct_add_y_block_neon(short *q, short *dq,
unsigned char *pre,
unsigned char *dst, int stride,
unsigned short *eobs, short *dc) {
int i;
for (i = 0; i < 4; i++) {
if (((short *)eobs)[0] & 0xfefe)
idct_dequant_dc_full_2x_neon(q, dq, pre, dst, stride, dc);
else
idct_dequant_dc_0_2x_neon(dc, pre, dst, stride);
if (((short *)eobs)[1] & 0xfefe)
idct_dequant_dc_full_2x_neon(q + 32, dq, pre + 8, dst + 8, stride, dc + 2);
else
idct_dequant_dc_0_2x_neon(dc + 2, pre + 8, dst + 8, stride);
q += 64;
dc += 4;
pre += 64;
dst += 4 * stride;
eobs += 4;
}
}
void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *pre,
unsigned char *dst, int stride,
unsigned short *eobs) {
int i;
for (i = 0; i < 4; i++) {
if (((short *)eobs)[0] & 0xfefe)
idct_dequant_full_2x_neon(q, dq, pre, dst, 16, stride);
else
idct_dequant_0_2x_neon(q, dq[0], pre, 16, dst, stride);
if (((short *)eobs)[1] & 0xfefe)
idct_dequant_full_2x_neon(q + 32, dq, pre + 8, dst + 8, 16, stride);
else
idct_dequant_0_2x_neon(q + 32, dq[0], pre + 8, 16, dst + 8, stride);
q += 64;
pre += 64;
dst += 4 * stride;
eobs += 4;
}
}
void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq,
unsigned char *pre,
unsigned char *dstu,
unsigned char *dstv, int stride,
unsigned short *eobs) {
if (((short *)eobs)[0] & 0xfefe)
idct_dequant_full_2x_neon(q, dq, pre, dstu, 8, stride);
else
idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstu, stride);
q += 32;
pre += 32;
dstu += 4 * stride;
if (((short *)eobs)[1] & 0xfefe)
idct_dequant_full_2x_neon(q, dq, pre, dstu, 8, stride);
else
idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstu, stride);
q += 32;
pre += 32;
if (((short *)eobs)[2] & 0xfefe)
idct_dequant_full_2x_neon(q, dq, pre, dstv, 8, stride);
else
idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstv, stride);
q += 32;
pre += 32;
dstv += 4 * stride;
if (((short *)eobs)[3] & 0xfefe)
idct_dequant_full_2x_neon(q, dq, pre, dstv, 8, stride);
else
idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstv, stride);
}
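
The ((short *)eobs)[n] & 0xfefe tests deserve a note: each NEON routine here covers two 4x4 blocks per call, and the test appears to date from eob counts being stored one byte per block, so a single 16-bit load picks up both counts at once and masking with 0xfefe is nonzero exactly when either count exceeds 1 (eob is at most 16, so bit 0 alone means eob == 1). A sketch of the idea under that byte-packing assumption:

/* nonzero iff either of two byte-sized eob counts is greater than 1 */
static int either_eob_gt_1_sketch(const unsigned char *eob_pair) {
  unsigned int packed = eob_pair[0] | ((unsigned int)eob_pair[1] << 8);
  return (packed & 0xfefe) != 0;
}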


@ -1,79 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
EXPORT |idct_dequant_0_2x_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void idct_dequant_0_2x_neon(short *q, short dq, unsigned char *pre,
; int pitch, unsigned char *dst, int stride);
; r0 *q
; r1 dq
; r2 *pre
; r3 pitch
; sp *dst
; sp+4 stride
|idct_dequant_0_2x_neon| PROC
add r12, r2, #4
vld1.32 {d2[0]}, [r2], r3
vld1.32 {d2[1]}, [r2], r3
vld1.32 {d4[0]}, [r2], r3
vld1.32 {d4[1]}, [r2]
vld1.32 {d8[0]}, [r12], r3
vld1.32 {d8[1]}, [r12], r3
vld1.32 {d10[0]}, [r12], r3
vld1.32 {d10[1]}, [r12]
ldrh r12, [r0] ; lo q
ldrh r2, [r0, #32] ; hi q
mov r3, #0
strh r3, [r0]
strh r3, [r0, #32]
sxth r12, r12 ; lo
mul r0, r12, r1
add r0, r0, #4
asr r0, r0, #3
vdup.16 q0, r0
sxth r2, r2 ; hi
mul r0, r2, r1
add r0, r0, #4
asr r0, r0, #3
vdup.16 q3, r0
vaddw.u8 q1, q0, d2 ; lo
vaddw.u8 q2, q0, d4
vaddw.u8 q4, q3, d8 ; hi
vaddw.u8 q5, q3, d10
ldr r2, [sp] ; dst
ldr r3, [sp, #4] ; stride
vqmovun.s16 d2, q1 ; lo
vqmovun.s16 d4, q2
vqmovun.s16 d8, q4 ; hi
vqmovun.s16 d10, q5
add r0, r2, #4
vst1.32 {d2[0]}, [r2], r3 ; lo
vst1.32 {d2[1]}, [r2], r3
vst1.32 {d4[0]}, [r2], r3
vst1.32 {d4[1]}, [r2]
vst1.32 {d8[0]}, [r0], r3 ; hi
vst1.32 {d8[1]}, [r0], r3
vst1.32 {d10[0]}, [r0], r3
vst1.32 {d10[1]}, [r0]
bx lr
ENDP ; |idct_dequant_0_2x_neon|
END


@ -1,69 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
EXPORT |idct_dequant_dc_0_2x_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void idct_dequant_dc_0_2x_neon(short *dc, unsigned char *pre,
; unsigned char *dst, int stride);
; r0 *dc
; r1 *pre
; r2 *dst
; r3 stride
|idct_dequant_dc_0_2x_neon| PROC
ldr r0, [r0] ; *dc
mov r12, #16
vld1.32 {d2[0]}, [r1], r12 ; lo
vld1.32 {d2[1]}, [r1], r12
vld1.32 {d4[0]}, [r1], r12
vld1.32 {d4[1]}, [r1]
sub r1, r1, #44
vld1.32 {d8[0]}, [r1], r12 ; hi
vld1.32 {d8[1]}, [r1], r12
vld1.32 {d10[0]}, [r1], r12
vld1.32 {d10[1]}, [r1]
sxth r1, r0 ; lo *dc
add r1, r1, #4
asr r1, r1, #3
vdup.16 q0, r1
sxth r0, r0, ror #16 ; hi *dc
add r0, r0, #4
asr r0, r0, #3
vdup.16 q3, r0
vaddw.u8 q1, q0, d2 ; lo
vaddw.u8 q2, q0, d4
vaddw.u8 q4, q3, d8 ; hi
vaddw.u8 q5, q3, d10
vqmovun.s16 d2, q1 ; lo
vqmovun.s16 d4, q2
vqmovun.s16 d8, q4 ; hi
vqmovun.s16 d10, q5
add r0, r2, #4
vst1.32 {d2[0]}, [r2], r3 ; lo
vst1.32 {d2[1]}, [r2], r3
vst1.32 {d4[0]}, [r2], r3
vst1.32 {d4[1]}, [r2]
vst1.32 {d8[0]}, [r0], r3 ; hi
vst1.32 {d8[1]}, [r0], r3
vst1.32 {d10[0]}, [r0], r3
vst1.32 {d10[1]}, [r0]
bx lr
ENDP ;|idct_dequant_dc_0_2x_neon|
END


@ -1,205 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |idct_dequant_dc_full_2x_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void idct_dequant_dc_full_2x_neon(short *q, short *dq, unsigned char *pre,
; unsigned char *dst, int stride, short *dc);
; r0 *q,
; r1 *dq,
; r2 *pre
; r3 *dst
; sp stride
; sp+4 *dc
|idct_dequant_dc_full_2x_neon| PROC
vld1.16 {q0, q1}, [r1] ; dq (same l/r)
vld1.16 {q2, q3}, [r0] ; l q
mov r1, #16 ; pitch
add r0, r0, #32
vld1.16 {q4, q5}, [r0] ; r q
add r12, r2, #4
; interleave the predictors
vld1.32 {d28[0]}, [r2], r1 ; l pre
vld1.32 {d28[1]}, [r12], r1 ; r pre
vld1.32 {d29[0]}, [r2], r1
vld1.32 {d29[1]}, [r12], r1
vld1.32 {d30[0]}, [r2], r1
vld1.32 {d30[1]}, [r12], r1
vld1.32 {d31[0]}, [r2]
ldr r1, [sp, #4]
vld1.32 {d31[1]}, [r12]
adr r2, cospi8sqrt2minus1 ; pointer to the first constant
ldrh r12, [r1], #2 ; lo *dc
ldrh r1, [r1] ; hi *dc
; dequant: q[i] = q[i] * dq[i]
vmul.i16 q2, q2, q0
vmul.i16 q3, q3, q1
vmul.i16 q4, q4, q0
vmul.i16 q5, q5, q1
; move dc up to neon and overwrite first element
vmov.16 d4[0], r12
vmov.16 d8[0], r1
vld1.16 {d0}, [r2]
; q2: l0r0 q3: l8r8
; q4: l4r4 q5: l12r12
vswp d5, d8
vswp d7, d10
; _CONSTANTS_ * 4,12 >> 16
; q6: 4 * sinpi : c1/temp1
; q7: 12 * sinpi : d1/temp2
; q8: 4 * cospi
; q9: 12 * cospi
vqdmulh.s16 q6, q4, d0[2] ; sinpi8sqrt2
vqdmulh.s16 q7, q5, d0[2]
vqdmulh.s16 q8, q4, d0[0] ; cospi8sqrt2minus1
vqdmulh.s16 q9, q5, d0[0]
vqadd.s16 q10, q2, q3 ; a1 = 0 + 8
vqsub.s16 q11, q2, q3 ; b1 = 0 - 8
; vqdmulh only accepts signed values. this was a problem because
; our constant had the high bit set, and was treated as a negative value.
; vqdmulh also doubles the value before it shifts by 16. we need to
; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
; so we can pre-shift the constant without losing precision. this avoids
; having to shift again afterward, and also avoids the sign issue. win win!
; for cospi8sqrt2minus1 the lowest bit is 1, so we would lose precision
; if we pre-shifted it
vshr.s16 q8, q8, #1
vshr.s16 q9, q9, #1
; q4: 4 + 4 * cospi : d1/temp1
; q5: 12 + 12 * cospi : c1/temp2
vqadd.s16 q4, q4, q8
vqadd.s16 q5, q5, q9
; c1 = temp1 - temp2
; d1 = temp1 + temp2
vqsub.s16 q2, q6, q5
vqadd.s16 q3, q4, q7
; [0]: a1+d1
; [1]: b1+c1
; [2]: b1-c1
; [3]: a1-d1
vqadd.s16 q4, q10, q3
vqadd.s16 q5, q11, q2
vqsub.s16 q6, q11, q2
vqsub.s16 q7, q10, q3
; rotate
vtrn.32 q4, q6
vtrn.32 q5, q7
vtrn.16 q4, q5
vtrn.16 q6, q7
; idct loop 2
; q4: l 0, 4, 8,12 r 0, 4, 8,12
; q5: l 1, 5, 9,13 r 1, 5, 9,13
; q6: l 2, 6,10,14 r 2, 6,10,14
; q7: l 3, 7,11,15 r 3, 7,11,15
; q8: 1 * sinpi : c1/temp1
; q9: 3 * sinpi : d1/temp2
; q10: 1 * cospi
; q11: 3 * cospi
vqdmulh.s16 q8, q5, d0[2] ; sinpi8sqrt2
vqdmulh.s16 q9, q7, d0[2]
vqdmulh.s16 q10, q5, d0[0] ; cospi8sqrt2minus1
vqdmulh.s16 q11, q7, d0[0]
vqadd.s16 q2, q4, q6 ; a1 = 0 + 2
vqsub.s16 q3, q4, q6 ; b1 = 0 - 2
; see note on shifting above
vshr.s16 q10, q10, #1
vshr.s16 q11, q11, #1
; q10: 1 + 1 * cospi : d1/temp1
; q11: 3 + 3 * cospi : c1/temp2
vqadd.s16 q10, q5, q10
vqadd.s16 q11, q7, q11
; q8: c1 = temp1 - temp2
; q9: d1 = temp1 + temp2
vqsub.s16 q8, q8, q11
vqadd.s16 q9, q10, q9
; a1+d1
; b1+c1
; b1-c1
; a1-d1
vqadd.s16 q4, q2, q9
vqadd.s16 q5, q3, q8
vqsub.s16 q6, q3, q8
vqsub.s16 q7, q2, q9
; +4 >> 3 (rounding)
vrshr.s16 q4, q4, #3 ; lo
vrshr.s16 q5, q5, #3
vrshr.s16 q6, q6, #3 ; hi
vrshr.s16 q7, q7, #3
vtrn.32 q4, q6
vtrn.32 q5, q7
vtrn.16 q4, q5
vtrn.16 q6, q7
; adding pre
; input is still packed. pre was read interleaved
vaddw.u8 q4, q4, d28
vaddw.u8 q5, q5, d29
vaddw.u8 q6, q6, d30
vaddw.u8 q7, q7, d31
vmov.i16 q14, #0
vmov q15, q14
vst1.16 {q14, q15}, [r0] ; write over high input
sub r0, r0, #32
vst1.16 {q14, q15}, [r0] ; write over low input
;saturate and narrow
vqmovun.s16 d0, q4 ; lo
vqmovun.s16 d1, q5
vqmovun.s16 d2, q6 ; hi
vqmovun.s16 d3, q7
ldr r1, [sp] ; stride
add r2, r3, #4 ; hi
vst1.32 {d0[0]}, [r3], r1 ; lo
vst1.32 {d0[1]}, [r2], r1 ; hi
vst1.32 {d1[0]}, [r3], r1
vst1.32 {d1[1]}, [r2], r1
vst1.32 {d2[0]}, [r3], r1
vst1.32 {d2[1]}, [r2], r1
vst1.32 {d3[0]}, [r3]
vst1.32 {d3[1]}, [r2]
bx lr
ENDP ; |idct_dequant_dc_full_2x_neon|
; Constant Pool
cospi8sqrt2minus1 DCD 0x4e7b
; because the lowest bit in 0x8a8c is 0, we can pre-shift this
sinpi8sqrt2 DCD 0x4546
END
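
The vqdmulh comments above are worth a worked example. vqdmulh.s16 computes (2 * a * b) >> 16 per lane with saturation, so storing sinpi8sqrt2 pre-shifted as 0x4546 = 0x8a8c >> 1 makes the doubled multiply reproduce (a * 0x8a8c) >> 16 exactly while keeping the 16-bit constant positive. The check below demonstrates the identity; it is a hypothetical harness, assuming arithmetic right shift of negative int32_t:

#include <assert.h>
#include <stdint.h>

/* per-lane behaviour of vqdmulh.s16 (saturation never triggers here,
   since |2 * a * 0x4546| < 2^31 for every 16-bit a) */
static int16_t vqdmulh_lane(int16_t a, int16_t b) {
  return (int16_t)(((int32_t)a * b * 2) >> 16);
}

int main(void) {
  int a;
  for (a = -32768; a <= 32767; a++) {
    int32_t want = ((int32_t)a * 0x8a8c) >> 16;      /* intended product   */
    int32_t got  = vqdmulh_lane((int16_t)a, 0x4546); /* pre-shifted form   */
    assert(got == want);
  }
  return 0;
}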


@ -1,197 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |idct_dequant_full_2x_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void idct_dequant_full_2x_neon(short *q, short *dq, unsigned char *pre,
; unsigned char *dst, int pitch, int stride);
; r0 *q,
; r1 *dq,
; r2 *pre
; r3 *dst
; sp pitch
; sp+4 stride
|idct_dequant_full_2x_neon| PROC
vld1.16 {q0, q1}, [r1] ; dq (same l/r)
vld1.16 {q2, q3}, [r0] ; l q
ldr r1, [sp] ; pitch
add r0, r0, #32
vld1.16 {q4, q5}, [r0] ; r q
add r12, r2, #4
; interleave the predictors
vld1.32 {d28[0]}, [r2], r1 ; l pre
vld1.32 {d28[1]}, [r12], r1 ; r pre
vld1.32 {d29[0]}, [r2], r1
vld1.32 {d29[1]}, [r12], r1
vld1.32 {d30[0]}, [r2], r1
vld1.32 {d30[1]}, [r12], r1
vld1.32 {d31[0]}, [r2]
vld1.32 {d31[1]}, [r12]
adr r2, cospi8sqrt2minus1 ; pointer to the first constant
; dequant: q[i] = q[i] * dq[i]
vmul.i16 q2, q2, q0
vmul.i16 q3, q3, q1
vmul.i16 q4, q4, q0
vmul.i16 q5, q5, q1
vld1.16 {d0}, [r2]
; q2: l0r0 q3: l8r8
; q4: l4r4 q5: l12r12
vswp d5, d8
vswp d7, d10
; _CONSTANTS_ * 4,12 >> 16
; q6: 4 * sinpi : c1/temp1
; q7: 12 * sinpi : d1/temp2
; q8: 4 * cospi
; q9: 12 * cospi
vqdmulh.s16 q6, q4, d0[2] ; sinpi8sqrt2
vqdmulh.s16 q7, q5, d0[2]
vqdmulh.s16 q8, q4, d0[0] ; cospi8sqrt2minus1
vqdmulh.s16 q9, q5, d0[0]
vqadd.s16 q10, q2, q3 ; a1 = 0 + 8
vqsub.s16 q11, q2, q3 ; b1 = 0 - 8
; vqdmulh only accepts signed values. this was a problem because
; our constant had the high bit set, and was treated as a negative value.
; vqdmulh also doubles the value before it shifts by 16. we need to
; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
; so we can pre-shift the constant without losing precision. this avoids
; having to shift again afterward, and also avoids the sign issue. win win!
; for cospi8sqrt2minus1 the lowest bit is 1, so we would lose precision
; if we pre-shifted it
vshr.s16 q8, q8, #1
vshr.s16 q9, q9, #1
; q4: 4 + 4 * cospi : d1/temp1
; q5: 12 + 12 * cospi : c1/temp2
vqadd.s16 q4, q4, q8
vqadd.s16 q5, q5, q9
; c1 = temp1 - temp2
; d1 = temp1 + temp2
vqsub.s16 q2, q6, q5
vqadd.s16 q3, q4, q7
; [0]: a1+d1
; [1]: b1+c1
; [2]: b1-c1
; [3]: a1-d1
vqadd.s16 q4, q10, q3
vqadd.s16 q5, q11, q2
vqsub.s16 q6, q11, q2
vqsub.s16 q7, q10, q3
; rotate
vtrn.32 q4, q6
vtrn.32 q5, q7
vtrn.16 q4, q5
vtrn.16 q6, q7
; idct loop 2
; q4: l 0, 4, 8,12 r 0, 4, 8,12
; q5: l 1, 5, 9,13 r 1, 5, 9,13
; q6: l 2, 6,10,14 r 2, 6,10,14
; q7: l 3, 7,11,15 r 3, 7,11,15
; q8: 1 * sinpi : c1/temp1
; q9: 3 * sinpi : d1/temp2
; q10: 1 * cospi
; q11: 3 * cospi
vqdmulh.s16 q8, q5, d0[2] ; sinpi8sqrt2
vqdmulh.s16 q9, q7, d0[2]
vqdmulh.s16 q10, q5, d0[0] ; cospi8sqrt2minus1
vqdmulh.s16 q11, q7, d0[0]
vqadd.s16 q2, q4, q6 ; a1 = 0 + 2
vqsub.s16 q3, q4, q6 ; b1 = 0 - 2
; see note on shifting above
vshr.s16 q10, q10, #1
vshr.s16 q11, q11, #1
; q10: 1 + 1 * cospi : d1/temp1
; q11: 3 + 3 * cospi : c1/temp2
vqadd.s16 q10, q5, q10
vqadd.s16 q11, q7, q11
; q8: c1 = temp1 - temp2
; q9: d1 = temp1 + temp2
vqsub.s16 q8, q8, q11
vqadd.s16 q9, q10, q9
; a1+d1
; b1+c1
; b1-c1
; a1-d1
vqadd.s16 q4, q2, q9
vqadd.s16 q5, q3, q8
vqsub.s16 q6, q3, q8
vqsub.s16 q7, q2, q9
; +4 >> 3 (rounding)
vrshr.s16 q4, q4, #3 ; lo
vrshr.s16 q5, q5, #3
vrshr.s16 q6, q6, #3 ; hi
vrshr.s16 q7, q7, #3
vtrn.32 q4, q6
vtrn.32 q5, q7
vtrn.16 q4, q5
vtrn.16 q6, q7
; adding pre
; input is still packed. pre was read interleaved
vaddw.u8 q4, q4, d28
vaddw.u8 q5, q5, d29
vaddw.u8 q6, q6, d30
vaddw.u8 q7, q7, d31
vmov.i16 q14, #0
vmov q15, q14
vst1.16 {q14, q15}, [r0] ; write over high input
sub r0, r0, #32
vst1.16 {q14, q15}, [r0] ; write over low input
;saturate and narrow
vqmovun.s16 d0, q4 ; lo
vqmovun.s16 d1, q5
vqmovun.s16 d2, q6 ; hi
vqmovun.s16 d3, q7
ldr r1, [sp, #4] ; stride
add r2, r3, #4 ; hi
vst1.32 {d0[0]}, [r3], r1 ; lo
vst1.32 {d0[1]}, [r2], r1 ; hi
vst1.32 {d1[0]}, [r3], r1
vst1.32 {d1[1]}, [r2], r1
vst1.32 {d2[0]}, [r3], r1
vst1.32 {d2[1]}, [r2], r1
vst1.32 {d3[0]}, [r3]
vst1.32 {d3[1]}, [r2]
bx lr
ENDP ; |idct_dequant_full_2x_neon|
; Constant Pool
cospi8sqrt2minus1 DCD 0x4e7b
; because the lowest bit in 0x8a8c is 0, we can pre-shift this
sinpi8sqrt2 DCD 0x4546
END


@ -1,44 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_ports/config.h"
#include "vp9/decoder/vp9_dequantize.h"
#include "vp9/common/vp9_blockd.h"
#include "vpx_mem/vpx_mem.h"
#if HAVE_ARMV7
extern void vp9_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ);
#endif
#if HAVE_ARMV6
extern void vp9_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
#endif
#if HAVE_ARMV7
void vp9_dequantize_b_neon(BLOCKD *d) {
short *DQ = d->dqcoeff;
short *Q = d->qcoeff;
short *DQC = d->dequant;
vp9_dequantize_b_loop_neon(Q, DQC, DQ);
}
#endif
#if HAVE_ARMV6
void vp9_dequantize_b_v6(BLOCKD *d) {
short *DQ = d->dqcoeff;
short *Q = d->qcoeff;
short *DQC = d->dequant;
vp9_dequantize_b_loop_v6(Q, DQC, DQ);
}
#endif


@ -28,9 +28,6 @@
#include "vpx_ports/vpx_timer.h"
#include "vp9/decoder/vp9_decodframe.h"
#include "vp9/decoder/vp9_detokenize.h"
#if ARCH_ARM
#include "vpx_ports/arm.h"
#endif
static int get_free_fb(VP9_COMMON *cm);
static void ref_cnt_fb(int *buf, int *idx, int new_idx);
@ -235,11 +232,6 @@ vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR ptr, VP9_REFFRAME ref_frame_flag,
return pbi->common.error.error_code;
}
/*For ARM NEON, d8-d15 are callee-saved registers, and need to be saved by us.*/
#if HAVE_ARMV7
extern void vp9_push_neon(int64_t *store);
extern void vp9_pop_neon(int64_t *store);
#endif
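
For context, the declarations removed here paired with the call sites deleted further down: under the ARM procedure call standard d8-d15 are callee-saved, so the decoder manually preserved them around NEON usage and restored them on every exit path, including the setjmp error paths. Reassembled from the deleted lines (not verbatim), the pattern was roughly:

  int64_t dx_store_reg[8];           /* room for d8..d15 */

#if HAVE_ARMV7
#if CONFIG_RUNTIME_CPU_DETECT
  if (cm->rtcd.flags & HAS_NEON)
#endif
    vp9_push_neon(dx_store_reg);     /* save the NEON callee-saved regs */
#endif

  /* ... decode the frame ... */

#if HAVE_ARMV7
#if CONFIG_RUNTIME_CPU_DETECT
  if (cm->rtcd.flags & HAS_NEON)
#endif
    vp9_pop_neon(dx_store_reg);      /* restore on every return path */
#endif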
static int get_free_fb(VP9_COMMON *cm) {
int i;
@ -317,9 +309,6 @@ static int swap_frame_buffers(VP9_COMMON *cm) {
int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size,
const unsigned char **psource,
int64_t time_stamp) {
#if HAVE_ARMV7
int64_t dx_store_reg[8];
#endif
VP9D_COMP *pbi = (VP9D_COMP *) ptr;
VP9_COMMON *cm = &pbi->common;
const unsigned char *source = *psource;
@ -346,26 +335,9 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size,
cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
}
#if HAVE_ARMV7
#if CONFIG_RUNTIME_CPU_DETECT
if (cm->rtcd.flags & HAS_NEON)
#endif
{
vp9_push_neon(dx_store_reg);
}
#endif
cm->new_fb_idx = get_free_fb(cm);
if (setjmp(pbi->common.error.jmp)) {
#if HAVE_ARMV7
#if CONFIG_RUNTIME_CPU_DETECT
if (cm->rtcd.flags & HAS_NEON)
#endif
{
vp9_pop_neon(dx_store_reg);
}
#endif
pbi->common.error.setjmp = 0;
/* We do not know if the missing frame(s) was supposed to update
@ -384,14 +356,6 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size,
retcode = vp9_decode_frame(pbi, psource);
if (retcode < 0) {
#if HAVE_ARMV7
#if CONFIG_RUNTIME_CPU_DETECT
if (cm->rtcd.flags & HAS_NEON)
#endif
{
vp9_pop_neon(dx_store_reg);
}
#endif
pbi->common.error.error_code = VPX_CODEC_ERROR;
pbi->common.error.setjmp = 0;
if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
@ -401,14 +365,6 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size,
{
if (swap_frame_buffers(cm)) {
#if HAVE_ARMV7
#if CONFIG_RUNTIME_CPU_DETECT
if (cm->rtcd.flags & HAS_NEON)
#endif
{
vp9_pop_neon(dx_store_reg);
}
#endif
pbi->common.error.error_code = VPX_CODEC_ERROR;
pbi->common.error.setjmp = 0;
return -1;
@ -455,14 +411,6 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size,
pbi->last_time_stamp = time_stamp;
pbi->source_sz = 0;
#if HAVE_ARMV7
#if CONFIG_RUNTIME_CPU_DETECT
if (cm->rtcd.flags & HAS_NEON)
#endif
{
vp9_pop_neon(dx_store_reg);
}
#endif
pbi->common.error.setjmp = 0;
return retcode;
}


@ -1,286 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_start_encode|
EXPORT |vp9_encode_bool|
EXPORT |vp8_stop_encode|
EXPORT |vp8_encode_value|
INCLUDE vp9_asm_enc_offsets.asm
ARM
REQUIRE8
PRESERVE8
AREA |.text|, CODE, READONLY
; r0 BOOL_CODER *br
; r1 unsigned char *source
|vp8_start_encode| PROC
mov r12, #0
mov r3, #255
mvn r2, #23
str r12, [r0, #vp9_writer_lowvalue]
str r3, [r0, #vp9_writer_range]
str r12, [r0, #vp9_writer_value]
str r2, [r0, #vp9_writer_count]
str r12, [r0, #vp9_writer_pos]
str r1, [r0, #vp9_writer_buffer]
bx lr
ENDP
; r0 BOOL_CODER *br
; r1 int bit
; r2 int probability
|vp9_encode_bool| PROC
push {r4-r9, lr}
mov r4, r2
ldr r2, [r0, #vp9_writer_lowvalue]
ldr r5, [r0, #vp9_writer_range]
ldr r3, [r0, #vp9_writer_count]
sub r7, r5, #1 ; range-1
cmp r1, #0
mul r6, r4, r7 ; ((range-1) * probability)
mov r7, #1
add r4, r7, r6, lsr #8 ; 1 + (((range-1) * probability) >> 8)
addne r2, r2, r4 ; if (bit) lowvalue += split
subne r4, r5, r4 ; if (bit) range = range-split
; Counting the leading zeros is used to normalize range.
clz r6, r4
sub r6, r6, #24 ; shift
; The addition sets the sign flag, which is used later
; to determine if count >= 0
adds r3, r3, r6 ; count += shift
lsl r5, r4, r6 ; range <<= shift
bmi token_count_lt_zero ; if(count >= 0)
sub r6, r6, r3 ; offset = shift - count
sub r4, r6, #1 ; offset-1
lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
bpl token_high_bit_not_set
ldr r4, [r0, #vp9_writer_pos] ; x
sub r4, r4, #1 ; x = w->pos-1
b token_zero_while_start
token_zero_while_loop
mov r9, #0
strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0
sub r4, r4, #1 ; x--
token_zero_while_start
cmp r4, #0
ldrge r7, [r0, #vp9_writer_buffer]
ldrb r1, [r7, r4]
cmpge r1, #0xff
beq token_zero_while_loop
ldr r7, [r0, #vp9_writer_buffer]
ldrb r9, [r7, r4] ; w->buffer[x]
add r9, r9, #1
strb r9, [r7, r4] ; w->buffer[x] + 1
token_high_bit_not_set
rsb r4, r6, #24 ; 24-offset
ldr r9, [r0, #vp9_writer_buffer]
lsr r7, r2, r4 ; lowvalue >> (24-offset)
ldr r4, [r0, #vp9_writer_pos] ; w->pos
lsl r2, r2, r6 ; lowvalue <<= offset
mov r6, r3 ; shift = count
add r1, r4, #1 ; w->pos++
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
str r1, [r0, #vp9_writer_pos]
sub r3, r3, #8 ; count -= 8
strb r7, [r9, r4] ; w->buffer[w->pos++]
token_count_lt_zero
lsl r2, r2, r6 ; lowvalue <<= shift
str r2, [r0, #vp9_writer_lowvalue]
str r5, [r0, #vp9_writer_range]
str r3, [r0, #vp9_writer_count]
pop {r4-r9, pc}
ENDP
; r0 BOOL_CODER *br
|vp8_stop_encode| PROC
push {r4-r10, lr}
ldr r2, [r0, #vp9_writer_lowvalue]
ldr r5, [r0, #vp9_writer_range]
ldr r3, [r0, #vp9_writer_count]
mov r10, #32
stop_encode_loop
sub r7, r5, #1 ; range-1
mov r4, r7, lsl #7 ; ((range-1) * 128)
mov r7, #1
add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8)
; Counting the leading zeros is used to normalize range.
clz r6, r4
sub r6, r6, #24 ; shift
; The addition sets the sign flag, which is used later
; to determine if count >= 0
adds r3, r3, r6 ; count += shift
lsl r5, r4, r6 ; range <<= shift
bmi token_count_lt_zero_se ; if(count >= 0)
sub r6, r6, r3 ; offset = shift - count
sub r4, r6, #1 ; offset-1
lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
bpl token_high_bit_not_set_se
ldr r4, [r0, #vp9_writer_pos] ; x
sub r4, r4, #1 ; x = w->pos-1
b token_zero_while_start_se
token_zero_while_loop_se
mov r9, #0
strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0
sub r4, r4, #1 ; x--
token_zero_while_start_se
cmp r4, #0
ldrge r7, [r0, #vp9_writer_buffer]
ldrb r1, [r7, r4]
cmpge r1, #0xff
beq token_zero_while_loop_se
ldr r7, [r0, #vp9_writer_buffer]
ldrb r9, [r7, r4] ; w->buffer[x]
add r9, r9, #1
strb r9, [r7, r4] ; w->buffer[x] + 1
token_high_bit_not_set_se
rsb r4, r6, #24 ; 24-offset
ldr r9, [r0, #vp9_writer_buffer]
lsr r7, r2, r4 ; lowvalue >> (24-offset)
ldr r4, [r0, #vp9_writer_pos] ; w->pos
lsl r2, r2, r6 ; lowvalue <<= offset
mov r6, r3 ; shift = count
add r1, r4, #1 ; w->pos++
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
str r1, [r0, #vp9_writer_pos]
sub r3, r3, #8 ; count -= 8
strb r7, [r9, r4] ; w->buffer[w->pos++]
token_count_lt_zero_se
lsl r2, r2, r6 ; lowvalue <<= shift
subs r10, r10, #1
bne stop_encode_loop
str r2, [r0, #vp9_writer_lowvalue]
str r5, [r0, #vp9_writer_range]
str r3, [r0, #vp9_writer_count]
pop {r4-r10, pc}
ENDP
; r0 BOOL_CODER *br
; r1 int data
; r2 int bits
|vp8_encode_value| PROC
push {r4-r11, lr}
mov r10, r2
ldr r2, [r0, #vp9_writer_lowvalue]
ldr r5, [r0, #vp9_writer_range]
ldr r3, [r0, #vp9_writer_count]
rsb r4, r10, #32 ; 32-n
; v is kept in r1 during the token pack loop
lsl r1, r1, r4 ; r1 = v << 32 - n
encode_value_loop
sub r7, r5, #1 ; range-1
; Decisions are made based on the bit value shifted
; off of v, so set a flag here based on this.
; This value is referred to as "bb"
lsls r1, r1, #1 ; bit = v >> n
mov r4, r7, lsl #7 ; ((range-1) * 128)
mov r7, #1
add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8)
addcs r2, r2, r4 ; if (bit) lowvalue += split
subcs r4, r5, r4 ; if (bit) range = range-split
; Counting the leading zeros is used to normalize range.
clz r6, r4
sub r6, r6, #24 ; shift
; The addition sets the sign flag, which is used later
; to determine if count >= 0
adds r3, r3, r6 ; count += shift
lsl r5, r4, r6 ; range <<= shift
bmi token_count_lt_zero_ev ; if(count >= 0)
sub r6, r6, r3 ; offset = shift - count
sub r4, r6, #1 ; offset-1
lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
bpl token_high_bit_not_set_ev
ldr r4, [r0, #vp9_writer_pos] ; x
sub r4, r4, #1 ; x = w->pos-1
b token_zero_while_start_ev
token_zero_while_loop_ev
mov r9, #0
strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0
sub r4, r4, #1 ; x--
token_zero_while_start_ev
cmp r4, #0
ldrge r7, [r0, #vp9_writer_buffer]
ldrb r11, [r7, r4]
cmpge r11, #0xff
beq token_zero_while_loop_ev
ldr r7, [r0, #vp9_writer_buffer]
ldrb r9, [r7, r4] ; w->buffer[x]
add r9, r9, #1
strb r9, [r7, r4] ; w->buffer[x] + 1
token_high_bit_not_set_ev
rsb r4, r6, #24 ; 24-offset
ldr r9, [r0, #vp9_writer_buffer]
lsr r7, r2, r4 ; lowvalue >> (24-offset)
ldr r4, [r0, #vp9_writer_pos] ; w->pos
lsl r2, r2, r6 ; lowvalue <<= offset
mov r6, r3 ; shift = count
add r11, r4, #1 ; w->pos++
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
str r11, [r0, #vp9_writer_pos]
sub r3, r3, #8 ; count -= 8
strb r7, [r9, r4] ; w->buffer[w->pos++]
token_count_lt_zero_ev
lsl r2, r2, r6 ; lowvalue <<= shift
subs r10, r10, #1
bne encode_value_loop
str r2, [r0, #vp9_writer_lowvalue]
str r5, [r0, #vp9_writer_range]
str r3, [r0, #vp9_writer_count]
pop {r4-r11, pc}
ENDP
END
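
All three routines above are hand-scheduled forms of one scalar algorithm, and their comments (the split computation, clz-based renormalization, carry propagation through 0xff bytes) map directly onto this C sketch. Field names follow the vp9_writer offsets used above; GCC's __builtin_clz is assumed for the leading-zero count:

typedef struct {
  unsigned int lowvalue;
  unsigned int range;
  int count;
  unsigned int pos;
  unsigned char *buffer;
} BOOL_CODER_SKETCH;

static void encode_bool_sketch(BOOL_CODER_SKETCH *w, int bit,
                               int probability) {
  unsigned int split = 1 + (((w->range - 1) * probability) >> 8);
  int shift;

  if (bit) {
    w->lowvalue += split;   /* take the upper part of the interval */
    w->range    -= split;
  } else {
    w->range = split;       /* take the lower part */
  }

  shift = __builtin_clz(w->range) - 24;  /* renormalize range to 128..255 */
  w->range <<= shift;
  w->count += shift;

  if (w->count >= 0) {      /* a whole output byte is ready */
    int offset = shift - w->count;

    if ((w->lowvalue << (offset - 1)) & 0x80000000) {
      /* propagate the carry through any run of 0xff bytes; like the
         assembly, this assumes a non-0xff byte exists below pos */
      int x = (int)w->pos - 1;
      while (x >= 0 && w->buffer[x] == 0xff)
        w->buffer[x--] = 0;
      w->buffer[x] += 1;
    }

    w->buffer[w->pos++] = (unsigned char)(w->lowvalue >> (24 - offset));
    w->lowvalue <<= offset;
    shift = w->count;
    w->lowvalue &= 0xffffff;
    w->count -= 8;
  }

  w->lowvalue <<= shift;
}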


@ -1,291 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8cx_pack_tokens_armv5|
INCLUDE vp9_asm_enc_offsets.asm
ARM
REQUIRE8
PRESERVE8
AREA |.text|, CODE, READONLY
; r0 vp9_writer *w
; r1 const TOKENEXTRA *p
; r2 int xcount
; r3 vp8_coef_encodings
; s0 vp8_extra_bits
; s1 vp8_coef_tree
|vp8cx_pack_tokens_armv5| PROC
push {r4-r11, lr}
; Add xcount * sizeof (TOKENEXTRA) to p to get stop
; sizeof (TOKENEXTRA) is 8
sub sp, sp, #12
add r2, r1, r2, lsl #3 ; stop = p + xcount*sizeof(TOKENEXTRA)
str r2, [sp, #0]
str r3, [sp, #8] ; save vp8_coef_encodings
ldr r2, [r0, #vp9_writer_lowvalue]
ldr r5, [r0, #vp9_writer_range]
ldr r3, [r0, #vp9_writer_count]
b check_p_lt_stop
while_p_lt_stop
ldrb r6, [r1, #tokenextra_token] ; t
ldr r4, [sp, #8] ; vp8_coef_encodings
mov lr, #0
add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
ldr r9, [r1, #tokenextra_context_tree] ; pp
ldrb r7, [r1, #tokenextra_skip_eob_node]
ldr r6, [r4, #vp9_token_value] ; v
ldr r8, [r4, #vp9_token_len] ; n
; vp8 specific skip_eob_node
cmp r7, #0
movne lr, #2 ; i = 2
subne r8, r8, #1 ; --n
rsb r4, r8, #32 ; 32-n
ldr r10, [sp, #52] ; vp8_coef_tree
; v is kept in r12 during the token pack loop
lsl r12, r6, r4 ; r12 = v << 32 - n
; loop start
token_loop
ldrb r4, [r9, lr, asr #1] ; pp [i>>1]
sub r7, r5, #1 ; range-1
; Decisions are made based on the bit value shifted
; off of v, so set a flag here based on this.
; This value is referred to as "bb"
lsls r12, r12, #1 ; bb = v >> n
mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
; bb can only be 0 or 1. So only execute this statement
; if bb == 1, otherwise it will act like i + 0
addcs lr, lr, #1 ; i + bb
mov r7, #1
ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
addcs r2, r2, r4 ; if (bb) lowvalue += split
subcs r4, r5, r4 ; if (bb) range = range-split
; Counting the leading zeros is used to normalize range.
clz r6, r4
sub r6, r6, #24 ; shift
; The addition sets the sign flag, which is used later
; to determine if count >= 0
adds r3, r3, r6 ; count += shift
lsl r5, r4, r6 ; range <<= shift
bmi token_count_lt_zero ; if(count >= 0)
sub r6, r6, r3 ; offset = shift - count
sub r4, r6, #1 ; offset-1
lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
bpl token_high_bit_not_set
ldr r4, [r0, #vp9_writer_pos] ; x
sub r4, r4, #1 ; x = w->pos-1
b token_zero_while_start
token_zero_while_loop
mov r10, #0
strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
sub r4, r4, #1 ; x--
token_zero_while_start
cmp r4, #0
ldrge r7, [r0, #vp9_writer_buffer]
ldrb r11, [r7, r4]
cmpge r11, #0xff
beq token_zero_while_loop
ldr r7, [r0, #vp9_writer_buffer]
ldrb r10, [r7, r4] ; w->buffer[x]
add r10, r10, #1
strb r10, [r7, r4] ; w->buffer[x] + 1
token_high_bit_not_set
rsb r4, r6, #24 ; 24-offset
ldr r10, [r0, #vp9_writer_buffer]
lsr r7, r2, r4 ; lowvalue >> (24-offset)
ldr r4, [r0, #vp9_writer_pos] ; w->pos
lsl r2, r2, r6 ; lowvalue <<= offset
mov r6, r3 ; shift = count
add r11, r4, #1 ; w->pos++
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
str r11, [r0, #vp9_writer_pos]
sub r3, r3, #8 ; count -= 8
strb r7, [r10, r4] ; w->buffer[w->pos++]
; r10 was holding vp8_coef_tree but is needed as a
; temp variable here, so after it is used, reload
; vp8_coef_tree into r10
ldr r10, [sp, #52] ; vp8_coef_tree
token_count_lt_zero
lsl r2, r2, r6 ; lowvalue <<= shift
subs r8, r8, #1 ; --n
bne token_loop
ldrb r6, [r1, #tokenextra_token] ; t
ldr r7, [sp, #48] ; vp8_extra_bits
; Add t * sizeof (vp9_extra_bit_struct) to get the desired
; element. Here sizeof (vp9_extra_bit_struct) == 16
add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
ldr r4, [r12, #vp9_extra_bit_struct_base_val]
cmp r4, #0
beq skip_extra_bits
; if( b->base_val)
ldr r8, [r12, #vp9_extra_bit_struct_len] ; L
ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
cmp r8, #0 ; if( L)
beq no_extra_bits
ldr r9, [r12, #vp9_extra_bit_struct_prob]
asr r7, lr, #1 ; v=e>>1
ldr r10, [r12, #vp9_extra_bit_struct_tree]
str r10, [sp, #4] ; b->tree
rsb r4, r8, #32
lsl r12, r7, r4
mov lr, #0 ; i = 0
extra_bits_loop
ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
sub r7, r5, #1 ; range-1
lsls r12, r12, #1 ; v >> n
mul r6, r4, r7 ; (range-1) * pp[i>>1]
addcs lr, lr, #1 ; i + bb
mov r7, #1
ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
addcs r2, r2, r4 ; if (bb) lowvalue += split
subcs r4, r5, r4 ; if (bb) range = range-split
clz r6, r4
sub r6, r6, #24
adds r3, r3, r6 ; count += shift
lsl r5, r4, r6 ; range <<= shift
bmi extra_count_lt_zero ; if(count >= 0)
sub r6, r6, r3 ; offset= shift - count
sub r4, r6, #1 ; offset-1
lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
bpl extra_high_bit_not_set
ldr r4, [r0, #vp9_writer_pos] ; x
sub r4, r4, #1 ; x = w->pos - 1
b extra_zero_while_start
extra_zero_while_loop
mov r10, #0
strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
sub r4, r4, #1 ; x--
extra_zero_while_start
cmp r4, #0
ldrge r7, [r0, #vp9_writer_buffer]
ldrb r11, [r7, r4]
cmpge r11, #0xff
beq extra_zero_while_loop
ldr r7, [r0, #vp9_writer_buffer]
ldrb r10, [r7, r4]
add r10, r10, #1
strb r10, [r7, r4]
extra_high_bit_not_set
rsb r4, r6, #24 ; 24-offset
ldr r10, [r0, #vp9_writer_buffer]
lsr r7, r2, r4 ; lowvalue >> (24-offset)
ldr r4, [r0, #vp9_writer_pos]
lsl r2, r2, r6 ; lowvalue <<= offset
mov r6, r3 ; shift = count
add r11, r4, #1 ; w->pos++
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
str r11, [r0, #vp9_writer_pos]
sub r3, r3, #8 ; count -= 8
strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
ldr r10, [sp, #4] ; b->tree
extra_count_lt_zero
lsl r2, r2, r6
subs r8, r8, #1 ; --n
bne extra_bits_loop ; while (n)
no_extra_bits
ldr lr, [r1, #4] ; e = p->Extra
add r4, r5, #1 ; range + 1
tst lr, #1
lsr r4, r4, #1 ; split = (range + 1) >> 1
addne r2, r2, r4 ; lowvalue += split
subne r4, r5, r4 ; range = range-split
tst r2, #0x80000000 ; lowvalue & 0x80000000
lsl r5, r4, #1 ; range <<= 1
beq end_high_bit_not_set
ldr r4, [r0, #vp9_writer_pos]
mov r7, #0
sub r4, r4, #1
b end_zero_while_start
end_zero_while_loop
strb r7, [r6, r4]
sub r4, r4, #1 ; x--
end_zero_while_start
cmp r4, #0
ldrge r6, [r0, #vp9_writer_buffer]
ldrb r12, [r6, r4]
cmpge r12, #0xff
beq end_zero_while_loop
ldr r6, [r0, #vp9_writer_buffer]
ldrb r7, [r6, r4]
add r7, r7, #1
strb r7, [r6, r4]
end_high_bit_not_set
adds r3, r3, #1 ; ++count
lsl r2, r2, #1 ; lowvalue <<= 1
bne end_count_zero
ldr r4, [r0, #vp9_writer_pos]
mvn r3, #7
ldr r7, [r0, #vp9_writer_buffer]
lsr r6, r2, #24 ; lowvalue >> 24
add r12, r4, #1 ; w->pos++
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
str r12, [r0, #0x10]
strb r6, [r7, r4]
end_count_zero
skip_extra_bits
add r1, r1, #TOKENEXTRA_SZ ; ++p
check_p_lt_stop
ldr r4, [sp, #0] ; stop
cmp r1, r4 ; while( p < stop)
bcc while_p_lt_stop
str r2, [r0, #vp9_writer_lowvalue]
str r5, [r0, #vp9_writer_range]
str r3, [r0, #vp9_writer_count]
add sp, sp, #12
pop {r4-r11, pc}
ENDP
END
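
Underneath the scheduling, the token loop above is a binary-tree walk: each TOKENEXTRA supplies a value v of n bits plus a probability context, and each bit of v is fed to the bool encoder while an index steps through vp8_coef_tree. A scalar sketch, reusing encode_bool_sketch and BOOL_CODER_SKETCH from the earlier sketch; parameter names are illustrative:

/* pack one token: send the n bits of v MSB-first, walking the tree */
static void pack_one_token_sketch(BOOL_CODER_SKETCH *w,
                                  const signed char *tree, /* vp8_coef_tree */
                                  const unsigned char *pp, /* probabilities */
                                  unsigned int v, int n,
                                  int skip_eob_node) {
  int i = 0;

  if (skip_eob_node) {  /* start below the EOB decision, one bit fewer */
    i = 2;
    --n;
  }

  do {
    int bb = (v >> --n) & 1;                 /* next bit of the value */
    encode_bool_sketch(w, bb, pp[i >> 1]);   /* probability at node i */
    i = tree[i + bb];                        /* step to the child node */
  } while (n);
}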


@ -1,327 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8cx_pack_mb_row_tokens_armv5|
INCLUDE vp9_asm_enc_offsets.asm
ARM
REQUIRE8
PRESERVE8
AREA |.text|, CODE, READONLY
; r0 VP8_COMP *cpi
; r1 vp9_writer *w
; r2 vp8_coef_encodings
; r3 vp8_extra_bits
; s0 vp8_coef_tree
|vp8cx_pack_mb_row_tokens_armv5| PROC
push {r4-r11, lr}
sub sp, sp, #24
; Compute address of cpi->common.mb_rows
ldr r4, _VP8_COMP_common_
ldr r6, _VP8_COMMON_MBrows_
add r4, r0, r4
ldr r5, [r4, r6] ; load up mb_rows
str r2, [sp, #20] ; save vp8_coef_encodings
str r5, [sp, #12] ; save mb_rows
str r3, [sp, #8] ; save vp8_extra_bits
ldr r4, _VP8_COMP_tplist_
add r4, r0, r4
ldr r7, [r4, #0] ; dereference cpi->tp_list
mov r0, r1 ; keep same as other loops
ldr r2, [r0, #vp9_writer_lowvalue]
ldr r5, [r0, #vp9_writer_range]
ldr r3, [r0, #vp9_writer_count]
mb_row_loop
ldr r1, [r7, #tokenlist_start]
ldr r9, [r7, #tokenlist_stop]
str r9, [sp, #0] ; save stop for later comparison
str r7, [sp, #16] ; tokenlist address for next time
b check_p_lt_stop
; actual work gets done here!
while_p_lt_stop
ldrb r6, [r1, #tokenextra_token] ; t
ldr r4, [sp, #20] ; vp8_coef_encodings
mov lr, #0
add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
ldr r9, [r1, #tokenextra_context_tree] ; pp
ldrb r7, [r1, #tokenextra_skip_eob_node]
ldr r6, [r4, #vp9_token_value] ; v
ldr r8, [r4, #vp9_token_len] ; n
; vp8 specific skip_eob_node
cmp r7, #0
movne lr, #2 ; i = 2
subne r8, r8, #1 ; --n
rsb r4, r8, #32 ; 32-n
ldr r10, [sp, #60] ; vp8_coef_tree
; v is kept in r12 during the token pack loop
lsl r12, r6, r4 ; r12 = v << 32 - n
; loop start
token_loop
ldrb r4, [r9, lr, asr #1] ; pp [i>>1]
sub r7, r5, #1 ; range-1
; Decisions are made based on the bit value shifted
; off of v, so set a flag here based on this.
; This value is referred to as "bb"
lsls r12, r12, #1 ; bb = v >> n
mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
; bb can only be 0 or 1. So only execute this statement
; if bb == 1, otherwise it will act like i + 0
addcs lr, lr, #1 ; i + bb
mov r7, #1
ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
addcs r2, r2, r4 ; if (bb) lowvalue += split
subcs r4, r5, r4 ; if (bb) range = range-split
; Counting the leading zeros is used to normalize range.
clz r6, r4
sub r6, r6, #24 ; shift
; The addition sets the sign flag, which is used later
; to determine if count >= 0
adds r3, r3, r6 ; count += shift
lsl r5, r4, r6 ; range <<= shift
bmi token_count_lt_zero ; if(count >= 0)
sub r6, r6, r3 ; offset = shift - count
sub r4, r6, #1 ; offset-1
lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
bpl token_high_bit_not_set
ldr r4, [r0, #vp9_writer_pos] ; x
sub r4, r4, #1 ; x = w->pos-1
b token_zero_while_start
token_zero_while_loop
mov r10, #0
strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
sub r4, r4, #1 ; x--
token_zero_while_start
cmp r4, #0
ldrge r7, [r0, #vp9_writer_buffer]
ldrb r11, [r7, r4]
cmpge r11, #0xff
beq token_zero_while_loop
ldr r7, [r0, #vp9_writer_buffer]
ldrb r10, [r7, r4] ; w->buffer[x]
add r10, r10, #1
strb r10, [r7, r4] ; w->buffer[x] + 1
token_high_bit_not_set
rsb r4, r6, #24 ; 24-offset
ldr r10, [r0, #vp9_writer_buffer]
lsr r7, r2, r4 ; lowvalue >> (24-offset)
ldr r4, [r0, #vp9_writer_pos] ; w->pos
lsl r2, r2, r6 ; lowvalue <<= offset
mov r6, r3 ; shift = count
add r11, r4, #1 ; w->pos++
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
str r11, [r0, #vp9_writer_pos]
sub r3, r3, #8 ; count -= 8
strb r7, [r10, r4] ; w->buffer[w->pos++]
; r10 was holding vp8_coef_tree but is needed as a
; temp variable here, so after it is used, reload
; vp8_coef_tree into r10
ldr r10, [sp, #60] ; vp8_coef_tree
token_count_lt_zero
lsl r2, r2, r6 ; lowvalue <<= shift
subs r8, r8, #1 ; --n
bne token_loop
ldrb r6, [r1, #tokenextra_token] ; t
ldr r7, [sp, #8] ; vp8_extra_bits
; Add t * sizeof (vp9_extra_bit_struct) to get the desired
; element. Here sizeof (vp9_extra_bit_struct) == 16
add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
ldr r4, [r12, #vp9_extra_bit_struct_base_val]
cmp r4, #0
beq skip_extra_bits
; if( b->base_val)
ldr r8, [r12, #vp9_extra_bit_struct_len] ; L
ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
cmp r8, #0 ; if( L)
beq no_extra_bits
ldr r9, [r12, #vp9_extra_bit_struct_prob]
asr r7, lr, #1 ; v=e>>1
ldr r10, [r12, #vp9_extra_bit_struct_tree]
str r10, [sp, #4] ; b->tree
rsb r4, r8, #32
lsl r12, r7, r4
mov lr, #0 ; i = 0
extra_bits_loop
ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
sub r7, r5, #1 ; range-1
lsls r12, r12, #1 ; v >> n
mul r6, r4, r7 ; (range-1) * pp[i>>1]
addcs lr, lr, #1 ; i + bb
mov r7, #1
ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
addcs r2, r2, r4 ; if (bb) lowvalue += split
subcs r4, r5, r4 ; if (bb) range = range-split
clz r6, r4
sub r6, r6, #24
adds r3, r3, r6 ; count += shift
lsl r5, r4, r6 ; range <<= shift
bmi extra_count_lt_zero ; if(count >= 0)
sub r6, r6, r3 ; offset= shift - count
sub r4, r6, #1 ; offset-1
lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
bpl extra_high_bit_not_set
ldr r4, [r0, #vp9_writer_pos] ; x
sub r4, r4, #1 ; x = w->pos - 1
b extra_zero_while_start
extra_zero_while_loop
mov r10, #0
strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
sub r4, r4, #1 ; x--
extra_zero_while_start
cmp r4, #0
ldrge r7, [r0, #vp9_writer_buffer]
ldrb r11, [r7, r4]
cmpge r11, #0xff
beq extra_zero_while_loop
ldr r7, [r0, #vp9_writer_buffer]
ldrb r10, [r7, r4]
add r10, r10, #1
strb r10, [r7, r4]
extra_high_bit_not_set
rsb r4, r6, #24 ; 24-offset
ldr r10, [r0, #vp9_writer_buffer]
lsr r7, r2, r4 ; lowvalue >> (24-offset)
ldr r4, [r0, #vp9_writer_pos]
lsl r2, r2, r6 ; lowvalue <<= offset
mov r6, r3 ; shift = count
add r11, r4, #1 ; w->pos++
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
str r11, [r0, #vp9_writer_pos]
sub r3, r3, #8 ; count -= 8
strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
ldr r10, [sp, #4] ; b->tree
extra_count_lt_zero
lsl r2, r2, r6
subs r8, r8, #1 ; --n
bne extra_bits_loop ; while (n)
no_extra_bits
ldr lr, [r1, #4] ; e = p->Extra
add r4, r5, #1 ; range + 1
tst lr, #1
lsr r4, r4, #1 ; split = (range + 1) >> 1
addne r2, r2, r4 ; lowvalue += split
subne r4, r5, r4 ; range = range-split
tst r2, #0x80000000 ; lowvalue & 0x80000000
lsl r5, r4, #1 ; range <<= 1
beq end_high_bit_not_set
ldr r4, [r0, #vp9_writer_pos]
mov r7, #0
sub r4, r4, #1
b end_zero_while_start
end_zero_while_loop
strb r7, [r6, r4]
sub r4, r4, #1 ; x--
end_zero_while_start
cmp r4, #0
ldrge r6, [r0, #vp9_writer_buffer]
ldrb r12, [r6, r4]
cmpge r12, #0xff
beq end_zero_while_loop
ldr r6, [r0, #vp9_writer_buffer]
ldrb r7, [r6, r4]
add r7, r7, #1
strb r7, [r6, r4]
end_high_bit_not_set
adds r3, r3, #1 ; ++count
lsl r2, r2, #1 ; lowvalue <<= 1
bne end_count_zero
ldr r4, [r0, #vp9_writer_pos]
mvn r3, #7
ldr r7, [r0, #vp9_writer_buffer]
lsr r6, r2, #24 ; lowvalue >> 24
add r12, r4, #1 ; w->pos++
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
str r12, [r0, #0x10]
strb r6, [r7, r4]
end_count_zero
skip_extra_bits
add r1, r1, #TOKENEXTRA_SZ ; ++p
check_p_lt_stop
ldr r4, [sp, #0] ; stop
cmp r1, r4 ; while( p < stop)
bcc while_p_lt_stop
ldr r6, [sp, #12] ; mb_rows
ldr r7, [sp, #16] ; tokenlist address
subs r6, r6, #1
add r7, r7, #TOKENLIST_SZ ; next element in the array
str r6, [sp, #12]
bne mb_row_loop
str r2, [r0, #vp9_writer_lowvalue]
str r5, [r0, #vp9_writer_range]
str r3, [r0, #vp9_writer_count]
add sp, sp, #24
pop {r4-r11, pc}
ENDP
_VP8_COMP_common_
DCD vp8_comp_common
_VP8_COMMON_MBrows_
DCD vp8_common_mb_rows
_VP8_COMP_tplist_
DCD vp8_comp_tplist
END
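
The outer structure above, minus the register allocation: one TOKENLIST per macroblock row in cpi->tp_list, each holding start and stop pointers into the TOKENEXTRA stream, packed row by row. A sketch with the layout the assembly assumes; the 8-byte TOKENEXTRA is reconstructed from the field offsets used above, so treat it as illustrative:

typedef struct {
  const unsigned char *context_tree;  /* probability context */
  short extra;                        /* extra-bits payload  */
  unsigned char token;
  unsigned char skip_eob_node;
} TOKENEXTRA_SKETCH;                  /* 8 bytes on 32-bit ARM */

typedef struct {
  TOKENEXTRA_SKETCH *start;
  TOKENEXTRA_SKETCH *stop;
} TOKENLIST_SKETCH;

static void pack_mb_row_tokens_sketch(BOOL_CODER_SKETCH *w,
                                      const TOKENLIST_SKETCH *tplist,
                                      int mb_rows) {
  int row;
  for (row = 0; row < mb_rows; row++) {
    const TOKENEXTRA_SKETCH *p;
    for (p = tplist[row].start; p < tplist[row].stop; p++) {
      /* pack_one_token_sketch(w, ...) as in the earlier sketch */
      (void)w;
    }
  }
}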


@ -1,465 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8cx_pack_tokens_into_partitions_armv5|
INCLUDE vp9_asm_enc_offsets.asm
ARM
REQUIRE8
PRESERVE8
AREA |.text|, CODE, READONLY
; r0 VP8_COMP *cpi
; r1 unsigned char *cx_data
; r2 int num_part
; r3 *size
; s0 vp8_coef_encodings
; s1 vp8_extra_bits,
; s2 const vp9_tree_index *,
|vp8cx_pack_tokens_into_partitions_armv5| PROC
push {r4-r11, lr}
sub sp, sp, #44
; Compute address of cpi->common.mb_rows
ldr r4, _VP8_COMP_common_
ldr r6, _VP8_COMMON_MBrows_
add r4, r0, r4
ldr r5, [r4, r6] ; load up mb_rows
str r5, [sp, #36] ; save mb_rows
str r1, [sp, #24] ; save cx_data
str r2, [sp, #20] ; save num_part
str r3, [sp, #8] ; save *size
; *size = 3*(num_part -1 );
sub r2, r2, #1 ; num_part - 1
add r2, r2, r2, lsl #1 ; 3*(num_part - 1)
str r2, [r3]
add r2, r2, r1 ; cx_data + *size
str r2, [sp, #40] ; ptr
ldr r4, _VP8_COMP_tplist_
add r4, r0, r4
ldr r7, [r4, #0] ; dereference cpi->tp_list
str r7, [sp, #32] ; store start of cpi->tp_list
ldr r11, _VP8_COMP_bc2_ ; load up vp9_writer out of cpi
add r0, r0, r11
mov r11, #0
str r11, [sp, #28] ; i
numparts_loop
ldr r10, [sp, #40] ; ptr
ldr r5, [sp, #36] ; move mb_rows to the counting section
sub r5, r5, r11 ; move start point with each partition
; mb_rows starts at i
str r5, [sp, #12]
; Reset all of the VP8 Writer data for each partition that
; is processed.
; start_encode
mov r2, #0 ; vp9_writer_lowvalue
mov r5, #255 ; vp9_writer_range
mvn r3, #23 ; vp9_writer_count
str r2, [r0, #vp9_writer_value]
str r2, [r0, #vp9_writer_pos]
str r10, [r0, #vp9_writer_buffer]
mb_row_loop
ldr r1, [r7, #tokenlist_start]
ldr r9, [r7, #tokenlist_stop]
str r9, [sp, #0] ; save stop for later comparison
str r7, [sp, #16] ; tokenlist address for next time
b check_p_lt_stop
; actual work gets done here!
while_p_lt_stop
ldrb r6, [r1, #tokenextra_token] ; t
ldr r4, [sp, #80] ; vp8_coef_encodings
mov lr, #0
add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
ldr r9, [r1, #tokenextra_context_tree] ; pp
ldrb r7, [r1, #tokenextra_skip_eob_node]
ldr r6, [r4, #vp9_token_value] ; v
ldr r8, [r4, #vp9_token_len] ; n
; vp8 specific skip_eob_node
cmp r7, #0
movne lr, #2 ; i = 2
subne r8, r8, #1 ; --n
rsb r4, r8, #32 ; 32-n
ldr r10, [sp, #88] ; vp8_coef_tree
; v is kept in r12 during the token pack loop
lsl r12, r6, r4 ; r12 = v << 32 - n
; loop start
token_loop
ldrb r4, [r9, lr, asr #1] ; pp [i>>1]
sub r7, r5, #1 ; range-1
; Decisions are made based on the bit value shifted
; off of v, so set a flag here based on this.
; This value is referred to as "bb"
lsls r12, r12, #1 ; bb = v >> n
mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
; bb can only be 0 or 1. So only execute this statement
; if bb == 1, otherwise it will act like i + 0
addcs lr, lr, #1 ; i + bb
mov r7, #1
ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
addcs r2, r2, r4 ; if (bb) lowvalue += split
subcs r4, r5, r4 ; if (bb) range = range-split
; Counting the leading zeros is used to normalize range.
clz r6, r4
sub r6, r6, #24 ; shift
; The addition sets the sign flag, which is used later
; to determine if count >= 0
adds r3, r3, r6 ; count += shift
lsl r5, r4, r6 ; range <<= shift
bmi token_count_lt_zero ; if(count >= 0)
sub r6, r6, r3 ; offset = shift - count
sub r4, r6, #1 ; offset-1
lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
bpl token_high_bit_not_set
ldr r4, [r0, #vp9_writer_pos] ; x
sub r4, r4, #1 ; x = w->pos-1
b token_zero_while_start
token_zero_while_loop
mov r10, #0
strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
sub r4, r4, #1 ; x--
token_zero_while_start
cmp r4, #0
ldrge r7, [r0, #vp9_writer_buffer]
ldrb r11, [r7, r4]
cmpge r11, #0xff
beq token_zero_while_loop
ldr r7, [r0, #vp9_writer_buffer]
ldrb r10, [r7, r4] ; w->buffer[x]
add r10, r10, #1
strb r10, [r7, r4] ; w->buffer[x] + 1
token_high_bit_not_set
rsb r4, r6, #24 ; 24-offset
ldr r10, [r0, #vp9_writer_buffer]
lsr r7, r2, r4 ; lowvalue >> (24-offset)
ldr r4, [r0, #vp9_writer_pos] ; w->pos
lsl r2, r2, r6 ; lowvalue <<= offset
mov r6, r3 ; shift = count
add r11, r4, #1 ; w->pos++
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
str r11, [r0, #vp9_writer_pos]
sub r3, r3, #8 ; count -= 8
strb r7, [r10, r4] ; w->buffer[w->pos++]
; r10 was holding vp8_coef_tree but is needed as a
; temp variable here, so after it is used, reload
; vp8_coef_tree into r10
ldr r10, [sp, #88] ; vp8_coef_tree
token_count_lt_zero
lsl r2, r2, r6 ; lowvalue <<= shift
subs r8, r8, #1 ; --n
bne token_loop
ldrb r6, [r1, #tokenextra_token] ; t
ldr r7, [sp, #84] ; vp8_extra_bits
; Add t * sizeof (vp9_extra_bit_struct) to get the desired
; element. Here sizeof (vp9_extra_bit_struct) == 16
add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
ldr r4, [r12, #vp9_extra_bit_struct_base_val]
cmp r4, #0
beq skip_extra_bits
; if( b->base_val)
ldr r8, [r12, #vp9_extra_bit_struct_len] ; L
ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
cmp r8, #0 ; if( L)
beq no_extra_bits
ldr r9, [r12, #vp9_extra_bit_struct_prob]
asr r7, lr, #1 ; v=e>>1
ldr r10, [r12, #vp9_extra_bit_struct_tree]
str r10, [sp, #4] ; b->tree
rsb r4, r8, #32
lsl r12, r7, r4
mov lr, #0 ; i = 0
extra_bits_loop
ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
sub r7, r5, #1 ; range-1
lsls r12, r12, #1 ; v >> n
mul r6, r4, r7 ; (range-1) * pp[i>>1]
addcs lr, lr, #1 ; i + bb
mov r7, #1
ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
addcs r2, r2, r4 ; if (bb) lowvalue += split
subcs r4, r5, r4 ; if (bb) range = range-split
clz r6, r4
sub r6, r6, #24
adds r3, r3, r6 ; count += shift
lsl r5, r4, r6 ; range <<= shift
bmi extra_count_lt_zero ; if(count >= 0)
sub r6, r6, r3 ; offset= shift - count
sub r4, r6, #1 ; offset-1
lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
bpl extra_high_bit_not_set
ldr r4, [r0, #vp9_writer_pos] ; x
sub r4, r4, #1 ; x = w->pos - 1
b extra_zero_while_start
extra_zero_while_loop
mov r10, #0
strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
sub r4, r4, #1 ; x--
extra_zero_while_start
cmp r4, #0
ldrge r7, [r0, #vp9_writer_buffer]
ldrb r11, [r7, r4]
cmpge r11, #0xff
beq extra_zero_while_loop
ldr r7, [r0, #vp9_writer_buffer]
ldrb r10, [r7, r4]
add r10, r10, #1
strb r10, [r7, r4]
extra_high_bit_not_set
rsb r4, r6, #24 ; 24-offset
ldr r10, [r0, #vp9_writer_buffer]
lsr r7, r2, r4 ; lowvalue >> (24-offset)
ldr r4, [r0, #vp9_writer_pos]
lsl r2, r2, r6 ; lowvalue <<= offset
mov r6, r3 ; shift = count
add r11, r4, #1 ; w->pos++
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
str r11, [r0, #vp9_writer_pos]
sub r3, r3, #8 ; count -= 8
strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
ldr r10, [sp, #4] ; b->tree
extra_count_lt_zero
lsl r2, r2, r6
subs r8, r8, #1 ; --n
bne extra_bits_loop ; while (n)
no_extra_bits
ldr lr, [r1, #4] ; e = p->Extra
add r4, r5, #1 ; range + 1
tst lr, #1
lsr r4, r4, #1 ; split = (range + 1) >> 1
addne r2, r2, r4 ; lowvalue += split
subne r4, r5, r4 ; range = range-split
tst r2, #0x80000000 ; lowvalue & 0x80000000
lsl r5, r4, #1 ; range <<= 1
beq end_high_bit_not_set
ldr r4, [r0, #vp9_writer_pos]
mov r7, #0
sub r4, r4, #1
b end_zero_while_start
end_zero_while_loop
strb r7, [r6, r4]
sub r4, r4, #1 ; x--
end_zero_while_start
cmp r4, #0
ldrge r6, [r0, #vp9_writer_buffer]
ldrb r12, [r6, r4]
cmpge r12, #0xff
beq end_zero_while_loop
ldr r6, [r0, #vp9_writer_buffer]
ldrb r7, [r6, r4]
add r7, r7, #1
strb r7, [r6, r4]
end_high_bit_not_set
adds r3, r3, #1 ; ++count
lsl r2, r2, #1 ; lowvalue <<= 1
bne end_count_zero
ldr r4, [r0, #vp9_writer_pos]
mvn r3, #7
ldr r7, [r0, #vp9_writer_buffer]
lsr r6, r2, #24 ; lowvalue >> 24
add r12, r4, #1 ; w->pos++
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
str r12, [r0, #0x10]
strb r6, [r7, r4]
end_count_zero
skip_extra_bits
add r1, r1, #TOKENEXTRA_SZ ; ++p
check_p_lt_stop
ldr r4, [sp, #0] ; stop
cmp r1, r4 ; while( p < stop)
bcc while_p_lt_stop
ldr r10, [sp, #20] ; num_parts
mov r1, #TOKENLIST_SZ
mul r1, r10, r1
ldr r6, [sp, #12] ; mb_rows
ldr r7, [sp, #16] ; tokenlist address
subs r6, r6, r10
add r7, r7, r1 ; next element in the array
str r6, [sp, #12]
bgt mb_row_loop
mov r12, #32
stop_encode_loop
sub r7, r5, #1 ; range-1
mov r4, r7, lsl #7 ; ((range-1) * 128)
mov r7, #1
add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8)
; Counting the leading zeros is used to normalize range.
clz r6, r4
sub r6, r6, #24 ; shift
; The flags are set by the 'adds' below; they are used later
; to determine whether count >= 0
adds r3, r3, r6 ; count += shift
lsl r5, r4, r6 ; range <<= shift
bmi token_count_lt_zero_se ; if(count >= 0)
sub r6, r6, r3 ; offset = shift - count
sub r4, r6, #1 ; offset-1
lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
bpl token_high_bit_not_set_se
ldr r4, [r0, #vp9_writer_pos] ; x
sub r4, r4, #1 ; x = w->pos-1
b token_zero_while_start_se
token_zero_while_loop_se
mov r10, #0
strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
sub r4, r4, #1 ; x--
token_zero_while_start_se
cmp r4, #0
ldrge r7, [r0, #vp9_writer_buffer]
ldrb r11, [r7, r4]
cmpge r11, #0xff
beq token_zero_while_loop_se
ldr r7, [r0, #vp9_writer_buffer]
ldrb r10, [r7, r4] ; w->buffer[x]
add r10, r10, #1
strb r10, [r7, r4] ; w->buffer[x] + 1
token_high_bit_not_set_se
rsb r4, r6, #24 ; 24-offset
ldr r10, [r0, #vp9_writer_buffer]
lsr r7, r2, r4 ; lowvalue >> (24-offset)
ldr r4, [r0, #vp9_writer_pos] ; w->pos
lsl r2, r2, r6 ; lowvalue <<= offset
mov r6, r3 ; shift = count
add r11, r4, #1 ; w->pos++
bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
str r11, [r0, #vp9_writer_pos]
sub r3, r3, #8 ; count -= 8
strb r7, [r10, r4] ; w->buffer[w->pos++]
token_count_lt_zero_se
lsl r2, r2, r6 ; lowvalue <<= shift
subs r12, r12, #1
bne stop_encode_loop
ldr r10, [sp, #8] ; *size
ldr r11, [r10]
ldr r4, [r0, #vp9_writer_pos] ; w->pos
add r11, r11, r4 ; *size += w->pos
str r11, [r10]
ldr r9, [sp, #20] ; num_parts
sub r9, r9, #1
ldr r10, [sp, #28] ; i
cmp r10, r9 ; if(i<(num_part - 1))
bge skip_write_partition
ldr r12, [sp, #40] ; ptr
add r12, r12, r4 ; ptr += w->pos
str r12, [sp, #40]
ldr r9, [sp, #24] ; cx_data
mov r8, r4, asr #8
strb r4, [r9, #0]
strb r8, [r9, #1]
mov r4, r4, asr #16
strb r4, [r9, #2]
add r9, r9, #3 ; cx_data += 3
str r9, [sp, #24]
skip_write_partition
ldr r11, [sp, #28] ; i
ldr r10, [sp, #20] ; num_parts
add r11, r11, #1 ; i++
str r11, [sp, #28]
ldr r7, [sp, #32] ; cpi->tp_list[i]
mov r1, #TOKENLIST_SZ
add r7, r7, r1 ; next element in cpi->tp_list
str r7, [sp, #32] ; cpi->tp_list[i+1]
cmp r10, r11
bgt numparts_loop
add sp, sp, #44
pop {r4-r11, pc}
ENDP
_VP8_COMP_common_
DCD vp8_comp_common
_VP8_COMMON_MBrows_
DCD vp8_common_mb_rows
_VP8_COMP_tplist_
DCD vp8_comp_tplist
_VP8_COMP_bc2_
DCD vp8_comp_bc2
END
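For reference, the normalize/flush blocks above (the clz, the 0xff carry walk, and the byte store) implement the boolean encoder's renormalization step. A minimal C sketch, assuming a writer struct whose pos/buffer fields mirror the vp9_writer offsets used above; as in the assembly, the carry walk assumes the first buffered byte is never 0xff:

typedef struct {
    unsigned int pos;
    unsigned char *buffer;
} writer;                                  /* stand-in for vp9_writer */

/* Flush a completed byte of lowvalue after the range has been
 * renormalized by `shift` bits, propagating a pending carry through
 * any run of 0xff bytes already written. */
static void renorm_flush(writer *w, unsigned int *lowvalue,
                         int *count, int shift)
{
    if (*count >= 0) {
        int offset = shift - *count;
        if ((*lowvalue << (offset - 1)) & 0x80000000) {
            int x = (int)w->pos - 1;
            while (x >= 0 && w->buffer[x] == 0xff)
                w->buffer[x--] = 0;        /* zero the 0xff run */
            w->buffer[x] += 1;             /* add the carry */
        }
        w->buffer[w->pos++] = (unsigned char)(*lowvalue >> (24 - offset));
        *lowvalue <<= offset;
        shift = *count;
        *lowvalue &= 0xffffff;
        *count -= 8;
    }
    *lowvalue <<= shift;
}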

View file

@@ -1,223 +0,0 @@
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_fast_quantize_b_armv6|
INCLUDE vp9_asm_enc_offsets.asm
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 BLOCK *b
; r1 BLOCKD *d
|vp8_fast_quantize_b_armv6| PROC
stmfd sp!, {r1, r4-r11, lr}
ldr r3, [r0, #vp8_block_coeff] ; coeff
ldr r4, [r0, #vp8_block_quant_fast] ; quant_fast
ldr r5, [r0, #vp8_block_round] ; round
ldr r6, [r1, #vp8_blockd_qcoeff] ; qcoeff
ldr r7, [r1, #vp8_blockd_dqcoeff] ; dqcoeff
ldr r8, [r1, #vp8_blockd_dequant] ; dequant
ldr r2, loop_count ; loop_count=0x1000000. 'lsls' instruction
; is used to update the counter so that
; it can be used to mark nonzero
; quantized coefficient pairs.
mov r1, #0 ; flags for quantized coeffs
; PART 1: quantization and dequantization loop
loop
ldr r9, [r3], #4 ; [z1 | z0]
ldr r10, [r5], #4 ; [r1 | r0]
ldr r11, [r4], #4 ; [q1 | q0]
ssat16 lr, #1, r9 ; [sz1 | sz0]
eor r9, r9, lr ; [z1 ^ sz1 | z0 ^ sz0]
ssub16 r9, r9, lr ; x = (z ^ sz) - sz
sadd16 r9, r9, r10 ; [x1+r1 | x0+r0]
ldr r12, [r3], #4 ; [z3 | z2]
smulbb r0, r9, r11 ; [(x0+r0)*q0]
smultt r9, r9, r11 ; [(x1+r1)*q1]
ldr r10, [r5], #4 ; [r3 | r2]
ssat16 r11, #1, r12 ; [sz3 | sz2]
eor r12, r12, r11 ; [z3 ^ sz3 | z2 ^ sz2]
pkhtb r0, r9, r0, asr #16 ; [y1 | y0]
ldr r9, [r4], #4 ; [q3 | q2]
ssub16 r12, r12, r11 ; x = (z ^ sz) - sz
sadd16 r12, r12, r10 ; [x3+r3 | x2+r2]
eor r0, r0, lr ; [(y1 ^ sz1) | (y0 ^ sz0)]
smulbb r10, r12, r9 ; [(x2+r2)*q2]
smultt r12, r12, r9 ; [(x3+r3)*q3]
ssub16 r0, r0, lr ; x = (y ^ sz) - sz
cmp r0, #0 ; check if zero
orrne r1, r1, r2, lsr #24 ; add flag for nonzero coeffs
str r0, [r6], #4 ; *qcoeff++ = x
ldr r9, [r8], #4 ; [dq1 | dq0]
pkhtb r10, r12, r10, asr #16 ; [y3 | y2]
eor r10, r10, r11 ; [(y3 ^ sz3) | (y2 ^ sz2)]
ssub16 r10, r10, r11 ; x = (y ^ sz) - sz
cmp r10, #0 ; check if zero
orrne r1, r1, r2, lsr #23 ; add flag for nonzero coeffs
str r10, [r6], #4 ; *qcoeff++ = x
ldr r11, [r8], #4 ; [dq3 | dq2]
smulbb r12, r0, r9 ; [x0*dq0]
smultt r0, r0, r9 ; [x1*dq1]
smulbb r9, r10, r11 ; [x2*dq2]
smultt r10, r10, r11 ; [x3*dq3]
lsls r2, r2, #2 ; update loop counter
strh r12, [r7, #0] ; dqcoeff[0] = [x0*dq0]
strh r0, [r7, #2] ; dqcoeff[1] = [x1*dq1]
strh r9, [r7, #4] ; dqcoeff[2] = [x2*dq2]
strh r10, [r7, #6] ; dqcoeff[3] = [x3*dq3]
add r7, r7, #8 ; dqcoeff += 8
bne loop
; PART 2: check position for eob...
mov lr, #0 ; init eob
cmp r1, #0 ; coeffs after quantization?
ldr r11, [sp, #0] ; restore BLOCKD pointer
beq end ; skip eob calculations if all zero
ldr r0, [r11, #vp8_blockd_qcoeff]
; check shortcut for nonzero qcoeffs
tst r1, #0x80
bne quant_coeff_15_14
tst r1, #0x20
bne quant_coeff_13_11
tst r1, #0x8
bne quant_coeff_12_7
tst r1, #0x40
bne quant_coeff_10_9
tst r1, #0x10
bne quant_coeff_8_3
tst r1, #0x2
bne quant_coeff_6_5
tst r1, #0x4
bne quant_coeff_4_2
b quant_coeff_1_0
quant_coeff_15_14
ldrh r2, [r0, #30] ; rc=15, i=15
mov lr, #16
cmp r2, #0
bne end
ldrh r3, [r0, #28] ; rc=14, i=14
mov lr, #15
cmp r3, #0
bne end
quant_coeff_13_11
ldrh r2, [r0, #22] ; rc=11, i=13
mov lr, #14
cmp r2, #0
bne end
quant_coeff_12_7
ldrh r3, [r0, #14] ; rc=7, i=12
mov lr, #13
cmp r3, #0
bne end
ldrh r2, [r0, #20] ; rc=10, i=11
mov lr, #12
cmp r2, #0
bne end
quant_coeff_10_9
ldrh r3, [r0, #26] ; rc=13, i=10
mov lr, #11
cmp r3, #0
bne end
ldrh r2, [r0, #24] ; rc=12, i=9
mov lr, #10
cmp r2, #0
bne end
quant_coeff_8_3
ldrh r3, [r0, #18] ; rc=9, i=8
mov lr, #9
cmp r3, #0
bne end
ldrh r2, [r0, #12] ; rc=6, i=7
mov lr, #8
cmp r2, #0
bne end
quant_coeff_6_5
ldrh r3, [r0, #6] ; rc=3, i=6
mov lr, #7
cmp r3, #0
bne end
ldrh r2, [r0, #4] ; rc=2, i=5
mov lr, #6
cmp r2, #0
bne end
quant_coeff_4_2
ldrh r3, [r0, #10] ; rc=5, i=4
mov lr, #5
cmp r3, #0
bne end
ldrh r2, [r0, #16] ; rc=8, i=3
mov lr, #4
cmp r2, #0
bne end
ldrh r3, [r0, #8] ; rc=4, i=2
mov lr, #3
cmp r3, #0
bne end
quant_coeff_1_0
ldrh r2, [r0, #2] ; rc=1, i=1
mov lr, #2
cmp r2, #0
bne end
mov lr, #1 ; rc=0, i=0
end
str lr, [r11, #vp8_blockd_eob]
ldmfd sp!, {r1, r4-r11, pc}
ENDP
loop_count
DCD 0x1000000
END
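Stripped of the SIMD packing, the quantizer above computes the following per coefficient; a scalar sketch, with parameter names assumed and the zig-zag scan table taken as given (the NEON pair version further down implements the same arithmetic):

/* Fast quantization of one 4x4 block (16 coefficients).  eob ends up
 * one past the last nonzero qcoeff in zig-zag scan order, matching the
 * reverse-order eob search in PART 2 above. */
static void fast_quantize_b(const short *coeff, const short *round,
                            const short *quant, const short *dequant,
                            const int *zigzag, short *qcoeff,
                            short *dqcoeff, int *eob)
{
    int last = -1;
    for (int i = 0; i < 16; i++) {
        int rc = zigzag[i];
        int z  = coeff[rc];
        int sz = z >> 31;                /* all ones if z < 0 */
        int x  = (z ^ sz) - sz;          /* x = abs(z) */
        int y  = ((x + round[rc]) * quant[rc]) >> 16;
        x = (y ^ sz) - sz;               /* restore the sign */
        qcoeff[rc]  = (short)x;
        dqcoeff[rc] = (short)(x * dequant[rc]);
        if (x)
            last = i;
    }
    *eob = last + 1;
}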

View file

@@ -1,138 +0,0 @@
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_mse16x16_armv6|
ARM
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
;
;note: Based on vp9_variance16x16_armv6. In this function the sum is never
; used, so that part of the calculation is omitted.
|vp8_mse16x16_armv6| PROC
push {r4-r9, lr}
pld [r0, r1, lsl #0]
pld [r2, r3, lsl #0]
mov r12, #16 ; set loop counter to 16 (=block height)
mov r4, #0 ; initialize sse = 0
loop
; 1st 4 pixels
ldr r5, [r0, #0x0] ; load 4 src pixels
ldr r6, [r2, #0x0] ; load 4 ref pixels
mov lr, #0 ; constant zero
usub8 r8, r5, r6 ; calculate difference
pld [r0, r1, lsl #1]
sel r7, r8, lr ; select bytes with positive difference
usub8 r9, r6, r5 ; calculate difference with reversed operands
pld [r2, r3, lsl #1]
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r5, r7, lr ; calculate sum of positive differences
usad8 r6, r8, lr ; calculate sum of negative differences
orr r8, r8, r7 ; differences of all 4 pixels
ldr r5, [r0, #0x4] ; load 4 src pixels
; calculate sse
uxtb16 r6, r8 ; byte (two pixels) to halfwords
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
; 2nd 4 pixels
ldr r6, [r2, #0x4] ; load 4 ref pixels
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
usub8 r8, r5, r6 ; calculate difference
sel r7, r8, lr ; select bytes with positive difference
usub8 r9, r6, r5 ; calculate difference with reversed operands
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r5, r7, lr ; calculate sum of positive differences
usad8 r6, r8, lr ; calculate sum of negative differences
orr r8, r8, r7 ; differences of all 4 pixels
ldr r5, [r0, #0x8] ; load 4 src pixels
; calculate sse
uxtb16 r6, r8 ; byte (two pixels) to halfwords
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
; 3rd 4 pixels
ldr r6, [r2, #0x8] ; load 4 ref pixels
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
usub8 r8, r5, r6 ; calculate difference
sel r7, r8, lr ; select bytes with positive difference
usub8 r9, r6, r5 ; calculate difference with reversed operands
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r5, r7, lr ; calculate sum of positive differences
usad8 r6, r8, lr ; calculate sum of negative differences
orr r8, r8, r7 ; differences of all 4 pixels
ldr r5, [r0, #0xc] ; load 4 src pixels
; calculate sse
uxtb16 r6, r8 ; byte (two pixels) to halfwords
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
; 4th 4 pixels
ldr r6, [r2, #0xc] ; load 4 ref pixels
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
usub8 r8, r5, r6 ; calculate difference
add r0, r0, r1 ; set src_ptr to next row
sel r7, r8, lr ; select bytes with positive difference
usub8 r9, r6, r5 ; calculate difference with reversed operands
add r2, r2, r3 ; set dst_ptr to next row
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r5, r7, lr ; calculate sum of positive differences
usad8 r6, r8, lr ; calculate sum of negative differences
orr r8, r8, r7 ; differences of all 4 pixels
subs r12, r12, #1 ; next row
; calculate sse
uxtb16 r6, r8 ; byte (two pixels) to halfwords
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
bne loop
; return stuff
ldr r1, [sp, #28] ; get address of sse
mov r0, r4 ; return sse
str r4, [r1] ; store sse
pop {r4-r9, pc}
ENDP
END
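The scalar equivalent is a plain sum of squared differences over the 16x16 block; a sketch with assumed parameter names:

static unsigned int mse16x16(const unsigned char *src, int src_stride,
                             const unsigned char *ref, int ref_stride,
                             unsigned int *sse)
{
    unsigned int s = 0;
    for (int i = 0; i < 16; i++) {
        for (int j = 0; j < 16; j++) {
            int d = src[j] - ref[j];     /* per-pixel difference */
            s += (unsigned int)(d * d);
        }
        src += src_stride;
        ref += ref_stride;
    }
    *sse = s;                            /* mse returns sse directly */
    return s;
}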

View file

@@ -1,95 +0,0 @@
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_sad16x16_armv6|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 const unsigned char *src_ptr
; r1 int src_stride
; r2 const unsigned char *ref_ptr
; r3 int ref_stride
; stack max_sad (not used)
|vp8_sad16x16_armv6| PROC
stmfd sp!, {r4-r12, lr}
pld [r0, r1, lsl #0]
pld [r2, r3, lsl #0]
pld [r0, r1, lsl #1]
pld [r2, r3, lsl #1]
mov r4, #0 ; sad = 0;
mov r5, #8 ; loop count
loop
; 1st row
ldr r6, [r0, #0x0] ; load 4 src pixels (1A)
ldr r8, [r2, #0x0] ; load 4 ref pixels (1A)
ldr r7, [r0, #0x4] ; load 4 src pixels (1A)
ldr r9, [r2, #0x4] ; load 4 ref pixels (1A)
ldr r10, [r0, #0x8] ; load 4 src pixels (1B)
ldr r11, [r0, #0xC] ; load 4 src pixels (1B)
usada8 r4, r8, r6, r4 ; calculate sad for 4 pixels
usad8 r8, r7, r9 ; calculate sad for 4 pixels
ldr r12, [r2, #0x8] ; load 4 ref pixels (1B)
ldr lr, [r2, #0xC] ; load 4 ref pixels (1B)
add r0, r0, r1 ; set src pointer to next row
add r2, r2, r3 ; set dst pointer to next row
pld [r0, r1, lsl #1]
pld [r2, r3, lsl #1]
usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels
usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels
ldr r6, [r0, #0x0] ; load 4 src pixels (2A)
ldr r7, [r0, #0x4] ; load 4 src pixels (2A)
add r4, r4, r8 ; add partial sad values
; 2nd row
ldr r8, [r2, #0x0] ; load 4 ref pixels (2A)
ldr r9, [r2, #0x4] ; load 4 ref pixels (2A)
ldr r10, [r0, #0x8] ; load 4 src pixels (2B)
ldr r11, [r0, #0xC] ; load 4 src pixels (2B)
usada8 r4, r6, r8, r4 ; calculate sad for 4 pixels
usad8 r8, r7, r9 ; calculate sad for 4 pixels
ldr r12, [r2, #0x8] ; load 4 ref pixels (2B)
ldr lr, [r2, #0xC] ; load 4 ref pixels (2B)
add r0, r0, r1 ; set src pointer to next row
add r2, r2, r3 ; set dst pointer to next row
usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels
usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels
pld [r0, r1, lsl #1]
pld [r2, r3, lsl #1]
subs r5, r5, #1 ; decrement loop counter
add r4, r4, r8 ; add partial sad values
bne loop
mov r0, r4 ; return sad
ldmfd sp!, {r4-r12, pc}
ENDP
END
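The usada8/usad8 pairs above accumulate an ordinary sum of absolute differences, two rows per loop iteration. A scalar sketch, with parameter names assumed:

#include <stdlib.h>

static unsigned int sad16x16(const unsigned char *src, int src_stride,
                             const unsigned char *ref, int ref_stride)
{
    unsigned int sad = 0;
    for (int i = 0; i < 16; i++) {
        for (int j = 0; j < 16; j++)
            sad += (unsigned int)abs(src[j] - ref[j]);
        src += src_stride;
        ref += ref_stride;
    }
    return sad;
}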

View file

@@ -1,262 +0,0 @@
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_short_fdct4x4_armv6|
ARM
REQUIRE8
PRESERVE8
AREA |.text|, CODE, READONLY
; void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
|vp8_short_fdct4x4_armv6| PROC
stmfd sp!, {r4 - r12, lr}
; PART 1
; coeffs 0-3
ldrd r4, r5, [r0] ; [i1 | i0] [i3 | i2]
ldr r10, c7500
ldr r11, c14500
ldr r12, c0x22a453a0 ; [2217*4 | 5352*4]
ldr lr, c0x00080008
ror r5, r5, #16 ; [i2 | i3]
qadd16 r6, r4, r5 ; [i1+i2 | i0+i3] = [b1 | a1] without shift
qsub16 r7, r4, r5 ; [i1-i2 | i0-i3] = [c1 | d1] without shift
add r0, r0, r2 ; update input pointer
qadd16 r7, r7, r7 ; 2*[c1|d1] --> we can use smlad and smlsd
; with 2217*4 and 5352*4 without losing the
; sign bit (overflow)
smuad r4, r6, lr ; o0 = (i1+i2)*8 + (i0+i3)*8
smusd r5, r6, lr ; o2 = (i1+i2)*8 - (i0+i3)*8
smlad r6, r7, r12, r11 ; o1 = (c1 * 2217 + d1 * 5352 + 14500)
smlsdx r7, r7, r12, r10 ; o3 = (d1 * 2217 - c1 * 5352 + 7500)
ldrd r8, r9, [r0] ; [i5 | i4] [i7 | i6]
pkhbt r3, r4, r6, lsl #4 ; [o1 | o0], keep in register for PART 2
pkhbt r6, r5, r7, lsl #4 ; [o3 | o2]
str r6, [r1, #4]
; coeffs 4-7
ror r9, r9, #16 ; [i6 | i7]
qadd16 r6, r8, r9 ; [i5+i6 | i4+i7] = [b1 | a1] without shift
qsub16 r7, r8, r9 ; [i5-i6 | i4-i7] = [c1 | d1] without shift
add r0, r0, r2 ; update input pointer
qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd
; with 2217*4 and 5352*4 without losing the
; sign bit (overflow)
smuad r9, r6, lr ; o4 = (i5+i6)*8 + (i4+i7)*8
smusd r8, r6, lr ; o6 = (i5+i6)*8 - (i4+i7)*8
smlad r6, r7, r12, r11 ; o5 = (c1 * 2217 + d1 * 5352 + 14500)
smlsdx r7, r7, r12, r10 ; o7 = (d1 * 2217 - c1 * 5352 + 7500)
ldrd r4, r5, [r0] ; [i9 | i8] [i11 | i10]
pkhbt r9, r9, r6, lsl #4 ; [o5 | o4], keep in register for PART 2
pkhbt r6, r8, r7, lsl #4 ; [o7 | o6]
str r6, [r1, #12]
; coeffs 8-11
ror r5, r5, #16 ; [i10 | i11]
qadd16 r6, r4, r5 ; [i9+i10 | i8+i11]=[b1 | a1] without shift
qsub16 r7, r4, r5 ; [i9-i10 | i8-i11]=[c1 | d1] without shift
add r0, r0, r2 ; update input pointer
qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd
; with 2217*4 and 5352*4 without losing the
; sign bit (overflow)
smuad r2, r6, lr ; o8 = (i9+i10)*8 + (i8+i11)*8
smusd r8, r6, lr ; o10 = (i9+i10)*8 - (i8+i11)*8
smlad r6, r7, r12, r11 ; o9 = (c1 * 2217 + d1 * 5352 + 14500)
smlsdx r7, r7, r12, r10 ; o11 = (d1 * 2217 - c1 * 5352 + 7500)
ldrd r4, r5, [r0] ; [i13 | i12] [i15 | i14]
pkhbt r2, r2, r6, lsl #4 ; [o9 | o8], keep in register for PART 2
pkhbt r6, r8, r7, lsl #4 ; [o11 | o10]
str r6, [r1, #20]
; coeffs 12-15
ror r5, r5, #16 ; [i14 | i15]
qadd16 r6, r4, r5 ; [i13+i14 | i12+i15]=[b1|a1] without shift
qsub16 r7, r4, r5 ; [i13-i14 | i12-i15]=[c1|d1] without shift
qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd
; with 2217*4 and 5352*4 without losing the
; sign bit (overflow)
smuad r4, r6, lr ; o12 = (i13+i14)*8 + (i12+i15)*8
smusd r5, r6, lr ; o14 = (i13+i14)*8 - (i12+i15)*8
smlad r6, r7, r12, r11 ; o13 = (c1 * 2217 + d1 * 5352 + 14500)
smlsdx r7, r7, r12, r10 ; o15 = (d1 * 2217 - c1 * 5352 + 7500)
pkhbt r0, r4, r6, lsl #4 ; [o13 | o12], keep in register for PART 2
pkhbt r6, r5, r7, lsl #4 ; [o15 | o14]
str r6, [r1, #28]
; PART 2 -------------------------------------------------
ldr r11, c12000
ldr r10, c51000
ldr lr, c0x00070007
qadd16 r4, r3, r0 ; a1 = [i1+i13 | i0+i12]
qadd16 r5, r9, r2 ; b1 = [i5+i9 | i4+i8]
qsub16 r6, r9, r2 ; c1 = [i5-i9 | i4-i8]
qsub16 r7, r3, r0 ; d1 = [i1-i13 | i0-i12]
qadd16 r4, r4, lr ; a1 + 7
add r0, r11, #0x10000 ; add (d!=0)
qadd16 r2, r4, r5 ; a1 + b1 + 7
qsub16 r3, r4, r5 ; a1 - b1 + 7
ldr r12, c0x08a914e8 ; [2217 | 5352]
lsl r8, r2, #16 ; prepare bottom halfword for scaling
asr r2, r2, #4 ; scale top halfword
lsl r9, r3, #16 ; prepare bottom halfword for scaling
asr r3, r3, #4 ; scale top halfword
pkhtb r4, r2, r8, asr #20 ; pack and scale bottom halfword
pkhtb r5, r3, r9, asr #20 ; pack and scale bottom halfword
smulbt r2, r6, r12 ; [ ------ | c1*2217]
str r4, [r1, #0] ; [ o1 | o0]
smultt r3, r6, r12 ; [c1*2217 | ------ ]
str r5, [r1, #16] ; [ o9 | o8]
smlabb r8, r7, r12, r2 ; [ ------ | d1*5352]
smlatb r9, r7, r12, r3 ; [d1*5352 | ------ ]
smulbb r2, r6, r12 ; [ ------ | c1*5352]
smultb r3, r6, r12 ; [c1*5352 | ------ ]
lsls r6, r7, #16 ; d1 != 0 ?
addeq r8, r8, r11 ; c1_b*2217+d1_b*5352+12000 + (d==0)
addne r8, r8, r0 ; c1_b*2217+d1_b*5352+12000 + (d!=0)
asrs r6, r7, #16
addeq r9, r9, r11 ; c1_t*2217+d1_t*5352+12000 + (d==0)
addne r9, r9, r0 ; c1_t*2217+d1_t*5352+12000 + (d!=0)
smlabt r4, r7, r12, r10 ; [ ------ | d1*2217] + 51000
smlatt r5, r7, r12, r10 ; [d1*2217 | ------ ] + 51000
pkhtb r9, r9, r8, asr #16
sub r4, r4, r2
sub r5, r5, r3
ldr r3, [r1, #4] ; [i3 | i2]
pkhtb r5, r5, r4, asr #16 ; [o13|o12]
str r9, [r1, #8] ; [o5 | o4]
ldr r9, [r1, #12] ; [i7 | i6]
ldr r8, [r1, #28] ; [i15|i14]
ldr r2, [r1, #20] ; [i11|i10]
str r5, [r1, #24] ; [o13|o12]
qadd16 r4, r3, r8 ; a1 = [i3+i15 | i2+i14]
qadd16 r5, r9, r2 ; b1 = [i7+i11 | i6+i10]
qadd16 r4, r4, lr ; a1 + 7
qsub16 r6, r9, r2 ; c1 = [i7-i11 | i6-i10]
qadd16 r2, r4, r5 ; a1 + b1 + 7
qsub16 r7, r3, r8 ; d1 = [i3-i15 | i2-i14]
qsub16 r3, r4, r5 ; a1 - b1 + 7
lsl r8, r2, #16 ; prepare bottom halfword for scaling
asr r2, r2, #4 ; scale top halfword
lsl r9, r3, #16 ; prepare bottom halfword for scaling
asr r3, r3, #4 ; scale top halfword
pkhtb r4, r2, r8, asr #20 ; pack and scale bottom halfword
pkhtb r5, r3, r9, asr #20 ; pack and scale bottom halfword
smulbt r2, r6, r12 ; [ ------ | c1*2217]
str r4, [r1, #4] ; [ o3 | o2]
smultt r3, r6, r12 ; [c1*2217 | ------ ]
str r5, [r1, #20] ; [ o11 | o10]
smlabb r8, r7, r12, r2 ; [ ------ | d1*5352]
smlatb r9, r7, r12, r3 ; [d1*5352 | ------ ]
smulbb r2, r6, r12 ; [ ------ | c1*5352]
smultb r3, r6, r12 ; [c1*5352 | ------ ]
lsls r6, r7, #16 ; d1 != 0 ?
addeq r8, r8, r11 ; c1_b*2217+d1_b*5352+12000 + (d==0)
addne r8, r8, r0 ; c1_b*2217+d1_b*5352+12000 + (d!=0)
asrs r6, r7, #16
addeq r9, r9, r11 ; c1_t*2217+d1_t*5352+12000 + (d==0)
addne r9, r9, r0 ; c1_t*2217+d1_t*5352+12000 + (d!=0)
smlabt r4, r7, r12, r10 ; [ ------ | d1*2217] + 51000
smlatt r5, r7, r12, r10 ; [d1*2217 | ------ ] + 51000
pkhtb r9, r9, r8, asr #16
sub r4, r4, r2
sub r5, r5, r3
str r9, [r1, #12] ; [o7 | o6]
pkhtb r5, r5, r4, asr #16 ; [o15|o14]
str r5, [r1, #28] ; [o15|o14]
ldmfd sp!, {r4 - r12, pc}
ENDP
; Used constants
c7500
DCD 7500
c14500
DCD 14500
c0x22a453a0
DCD 0x22a453a0
c0x00080008
DCD 0x00080008
c12000
DCD 12000
c51000
DCD 51000
c0x00070007
DCD 0x00070007
c0x08a914e8
DCD 0x08a914e8
END
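Reconstructed from the constants above (the 2217/5352 multipliers, the 14500/7500 and 12000/51000 rounders, and the (d1 != 0) correction), the transform corresponds roughly to the C below; treat it as a sketch rather than the canonical source. It relies on arithmetic right shift of negative values, as the assembly does:

void short_fdct4x4(short *input, short *output, int pitch)
{
    short *ip = input, *op = output;

    for (int i = 0; i < 4; i++) {        /* PART 1: rows */
        int a1 = (ip[0] + ip[3]) * 8;
        int b1 = (ip[1] + ip[2]) * 8;
        int c1 = (ip[1] - ip[2]) * 8;
        int d1 = (ip[0] - ip[3]) * 8;

        op[0] = (short)(a1 + b1);
        op[2] = (short)(a1 - b1);
        op[1] = (short)((c1 * 2217 + d1 * 5352 + 14500) >> 12);
        op[3] = (short)((d1 * 2217 - c1 * 5352 + 7500) >> 12);

        ip += pitch / 2;                 /* pitch is in bytes */
        op += 4;
    }

    ip = output;
    op = output;
    for (int i = 0; i < 4; i++) {        /* PART 2: columns */
        int a1 = ip[0] + ip[12];
        int b1 = ip[4] + ip[8];
        int c1 = ip[4] - ip[8];
        int d1 = ip[0] - ip[12];

        op[0]  = (short)((a1 + b1 + 7) >> 4);
        op[8]  = (short)((a1 - b1 + 7) >> 4);
        op[4]  = (short)(((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0));
        op[12] = (short)((d1 * 2217 - c1 * 5352 + 51000) >> 16);

        ip++;
        op++;
    }
}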

View file

@@ -1,264 +0,0 @@
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_subtract_mby_armv6|
EXPORT |vp8_subtract_mbuv_armv6|
EXPORT |vp8_subtract_b_armv6|
INCLUDE vp9_asm_enc_offsets.asm
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 BLOCK *be
; r1 BLOCKD *bd
; r2 int pitch
|vp8_subtract_b_armv6| PROC
stmfd sp!, {r4-r9}
ldr r4, [r0, #vp8_block_base_src]
ldr r5, [r0, #vp8_block_src]
ldr r6, [r0, #vp8_block_src_diff]
ldr r3, [r4]
ldr r7, [r0, #vp8_block_src_stride]
add r3, r3, r5 ; src = *base_src + src
ldr r8, [r1, #vp8_blockd_predictor]
mov r9, #4 ; loop count
loop_block
ldr r0, [r3], r7 ; src
ldr r1, [r8], r2 ; pred
uxtb16 r4, r0 ; [s2 | s0]
uxtb16 r5, r1 ; [p2 | p0]
uxtb16 r0, r0, ror #8 ; [s3 | s1]
uxtb16 r1, r1, ror #8 ; [p3 | p1]
usub16 r4, r4, r5 ; [d2 | d0]
usub16 r5, r0, r1 ; [d3 | d1]
subs r9, r9, #1 ; decrement loop counter
pkhbt r0, r4, r5, lsl #16 ; [d1 | d0]
pkhtb r1, r5, r4, asr #16 ; [d3 | d2]
str r0, [r6, #0] ; diff
str r1, [r6, #4] ; diff
add r6, r6, r2, lsl #1 ; update diff pointer
bne loop_block
ldmfd sp!, {r4-r9}
mov pc, lr
ENDP
; r0 short *diff
; r1 unsigned char *usrc
; r2 unsigned char *vsrc
; r3 unsigned char *pred
; stack int stride
|vp8_subtract_mbuv_armv6| PROC
stmfd sp!, {r4-r12, lr}
add r0, r0, #512 ; set *diff point to Cb
add r3, r3, #256 ; set *pred point to Cb
mov r4, #8 ; loop count
ldr r5, [sp, #40] ; stride
; Subtract U block
loop_u
ldr r6, [r1] ; src (A)
ldr r7, [r3], #4 ; pred (A)
uxtb16 r8, r6 ; [s2 | s0] (A)
uxtb16 r9, r7 ; [p2 | p0] (A)
uxtb16 r10, r6, ror #8 ; [s3 | s1] (A)
uxtb16 r11, r7, ror #8 ; [p3 | p1] (A)
usub16 r6, r8, r9 ; [d2 | d0] (A)
usub16 r7, r10, r11 ; [d3 | d1] (A)
ldr r10, [r1, #4] ; src (B)
ldr r11, [r3], #4 ; pred (B)
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
str r8, [r0], #4 ; diff (A)
uxtb16 r8, r10 ; [s2 | s0] (B)
str r9, [r0], #4 ; diff (A)
uxtb16 r9, r11 ; [p2 | p0] (B)
uxtb16 r10, r10, ror #8 ; [s3 | s1] (B)
uxtb16 r11, r11, ror #8 ; [p3 | p1] (B)
usub16 r6, r8, r9 ; [d2 | d0] (B)
usub16 r7, r10, r11 ; [d3 | d1] (B)
add r1, r1, r5 ; update usrc pointer
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
str r8, [r0], #4 ; diff (B)
subs r4, r4, #1 ; update loop counter
str r9, [r0], #4 ; diff (B)
bne loop_u
mov r4, #8 ; loop count
; Subtract V block
loop_v
ldr r6, [r2] ; src (A)
ldr r7, [r3], #4 ; pred (A)
uxtb16 r8, r6 ; [s2 | s0] (A)
uxtb16 r9, r7 ; [p2 | p0] (A)
uxtb16 r10, r6, ror #8 ; [s3 | s1] (A)
uxtb16 r11, r7, ror #8 ; [p3 | p1] (A)
usub16 r6, r8, r9 ; [d2 | d0] (A)
usub16 r7, r10, r11 ; [d3 | d1] (A)
ldr r10, [r2, #4] ; src (B)
ldr r11, [r3], #4 ; pred (B)
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
str r8, [r0], #4 ; diff (A)
uxtb16 r8, r10 ; [s2 | s0] (B)
str r9, [r0], #4 ; diff (A)
uxtb16 r9, r11 ; [p2 | p0] (B)
uxtb16 r10, r10, ror #8 ; [s3 | s1] (B)
uxtb16 r11, r11, ror #8 ; [p3 | p1] (B)
usub16 r6, r8, r9 ; [d2 | d0] (B)
usub16 r7, r10, r11 ; [d3 | d1] (B)
add r2, r2, r5 ; update vsrc pointer
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
str r8, [r0], #4 ; diff (B)
subs r4, r4, #1 ; update loop counter
str r9, [r0], #4 ; diff (B)
bne loop_v
ldmfd sp!, {r4-r12, pc}
ENDP
; r0 short *diff
; r1 unsigned char *src
; r2 unsigned char *pred
; r3 int stride
|vp8_subtract_mby_armv6| PROC
stmfd sp!, {r4-r11}
mov r4, #16
loop
ldr r6, [r1] ; src (A)
ldr r7, [r2], #4 ; pred (A)
uxtb16 r8, r6 ; [s2 | s0] (A)
uxtb16 r9, r7 ; [p2 | p0] (A)
uxtb16 r10, r6, ror #8 ; [s3 | s1] (A)
uxtb16 r11, r7, ror #8 ; [p3 | p1] (A)
usub16 r6, r8, r9 ; [d2 | d0] (A)
usub16 r7, r10, r11 ; [d3 | d1] (A)
ldr r10, [r1, #4] ; src (B)
ldr r11, [r2], #4 ; pred (B)
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
str r8, [r0], #4 ; diff (A)
uxtb16 r8, r10 ; [s2 | s0] (B)
str r9, [r0], #4 ; diff (A)
uxtb16 r9, r11 ; [p2 | p0] (B)
uxtb16 r10, r10, ror #8 ; [s3 | s1] (B)
uxtb16 r11, r11, ror #8 ; [p3 | p1] (B)
usub16 r6, r8, r9 ; [d2 | d0] (B)
usub16 r7, r10, r11 ; [d3 | d1] (B)
ldr r10, [r1, #8] ; src (C)
ldr r11, [r2], #4 ; pred (C)
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
str r8, [r0], #4 ; diff (B)
uxtb16 r8, r10 ; [s2 | s0] (C)
str r9, [r0], #4 ; diff (B)
uxtb16 r9, r11 ; [p2 | p0] (C)
uxtb16 r10, r10, ror #8 ; [s3 | s1] (C)
uxtb16 r11, r11, ror #8 ; [p3 | p1] (C)
usub16 r6, r8, r9 ; [d2 | d0] (C)
usub16 r7, r10, r11 ; [d3 | d1] (C)
ldr r10, [r1, #12] ; src (D)
ldr r11, [r2], #4 ; pred (D)
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (C)
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (C)
str r8, [r0], #4 ; diff (C)
uxtb16 r8, r10 ; [s2 | s0] (D)
str r9, [r0], #4 ; diff (C)
uxtb16 r9, r11 ; [p2 | p0] (D)
uxtb16 r10, r10, ror #8 ; [s3 | s1] (D)
uxtb16 r11, r11, ror #8 ; [p3 | p1] (D)
usub16 r6, r8, r9 ; [d2 | d0] (D)
usub16 r7, r10, r11 ; [d3 | d1] (D)
add r1, r1, r3 ; update src pointer
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (D)
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (D)
str r8, [r0], #4 ; diff (D)
subs r4, r4, #1 ; update loop counter
str r9, [r0], #4 ; diff (D)
bne loop
ldmfd sp!, {r4-r11}
mov pc, lr
ENDP
END
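All three routines are the same strided byte subtraction producing 16-bit residuals; the per-block case in scalar form, with parameter names assumed:

/* diff = src - pred for one 4x4 block; pitch is the prediction/diff
 * stride in elements, matching the `add r6, r6, r2, lsl #1` above. */
static void subtract_b(short *diff, const unsigned char *src, int src_stride,
                       const unsigned char *pred, int pitch)
{
    for (int r = 0; r < 4; r++) {
        for (int c = 0; c < 4; c++)
            diff[c] = (short)(src[c] - pred[c]);
        diff += pitch;
        src  += src_stride;
        pred += pitch;
    }
}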

View file

@@ -1,153 +0,0 @@
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp9_variance16x16_armv6|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|vp9_variance16x16_armv6| PROC
stmfd sp!, {r4-r12, lr}
pld [r0, r1, lsl #0]
pld [r2, r3, lsl #0]
mov r8, #0 ; initialize sum = 0
mov r11, #0 ; initialize sse = 0
mov r12, #16 ; set loop counter to 16 (=block height)
loop
; 1st 4 pixels
ldr r4, [r0, #0] ; load 4 src pixels
ldr r5, [r2, #0] ; load 4 ref pixels
mov lr, #0 ; constant zero
usub8 r6, r4, r5 ; calculate difference
pld [r0, r1, lsl #1]
sel r7, r6, lr ; select bytes with positive difference
usub8 r9, r5, r4 ; calculate difference with reversed operands
pld [r2, r3, lsl #1]
sel r6, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
subs r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 2nd 4 pixels
ldr r4, [r0, #4] ; load 4 src pixels
ldr r5, [r2, #4] ; load 4 ref pixels
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
sel r7, r6, lr ; select bytes with positive difference
usub8 r9, r5, r4 ; calculate difference with reversed operands
sel r6, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 3rd 4 pixels
ldr r4, [r0, #8] ; load 4 src pixels
ldr r5, [r2, #8] ; load 4 ref pixels
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
sel r7, r6, lr ; select bytes with positive difference
usub8 r9, r5, r4 ; calculate difference with reversed operands
sel r6, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 4th 4 pixels
ldr r4, [r0, #12] ; load 4 src pixels
ldr r5, [r2, #12] ; load 4 ref pixels
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
add r0, r0, r1 ; set src_ptr to next row
sel r7, r6, lr ; select bytes with positive difference
usub8 r9, r5, r4 ; calculate difference with reversed operands
add r2, r2, r3 ; set dst_ptr to next row
sel r6, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
subs r12, r12, #1
bne loop
; return stuff
ldr r6, [sp, #40] ; get address of sse
mul r0, r8, r8 ; sum * sum
str r11, [r6] ; store sse
sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
ldmfd sp!, {r4-r12, pc}
ENDP
END
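A scalar sketch of the same computation; the final >> 8 divides by the 256 pixels of a 16x16 block, and the 8x8 version below differs only in using >> 6 (64 pixels). Parameter names are assumed:

static unsigned int variance16x16(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse)
{
    int sum = 0;
    unsigned int s = 0;
    for (int i = 0; i < 16; i++) {
        for (int j = 0; j < 16; j++) {
            int d = src[j] - ref[j];
            sum += d;                    /* signed sum of differences */
            s += (unsigned int)(d * d);  /* sum of squared differences */
        }
        src += src_stride;
        ref += ref_stride;
    }
    *sse = s;
    return s - (unsigned int)((sum * sum) >> 8);  /* 16x16 = 256 pixels */
}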

View file

@@ -1,101 +0,0 @@
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp9_variance8x8_armv6|
ARM
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|vp9_variance8x8_armv6| PROC
push {r4-r10, lr}
pld [r0, r1, lsl #0]
pld [r2, r3, lsl #0]
mov r12, #8 ; set loop counter to 8 (=block height)
mov r4, #0 ; initialize sum = 0
mov r5, #0 ; initialize sse = 0
loop
; 1st 4 pixels
ldr r6, [r0, #0x0] ; load 4 src pixels
ldr r7, [r2, #0x0] ; load 4 ref pixels
mov lr, #0 ; constant zero
usub8 r8, r6, r7 ; calculate difference
pld [r0, r1, lsl #1]
sel r10, r8, lr ; select bytes with positive difference
usub8 r9, r7, r6 ; calculate difference with reversed operands
pld [r2, r3, lsl #1]
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r6, r10, lr ; calculate sum of positive differences
usad8 r7, r8, lr ; calculate sum of negative differences
orr r8, r8, r10 ; differences of all 4 pixels
; calculate total sum
add r4, r4, r6 ; add positive differences to sum
sub r4, r4, r7 ; subtract negative differences from sum
; calculate sse
uxtb16 r7, r8 ; byte (two pixels) to halfwords
uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
; 2nd 4 pixels
ldr r6, [r0, #0x4] ; load 4 src pixels
ldr r7, [r2, #0x4] ; load 4 ref pixels
smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
usub8 r8, r6, r7 ; calculate difference
add r0, r0, r1 ; set src_ptr to next row
sel r10, r8, lr ; select bytes with positive difference
usub8 r9, r7, r6 ; calculate difference with reversed operands
add r2, r2, r3 ; set dst_ptr to next row
sel r8, r9, lr ; select bytes with negative difference
; calculate partial sums
usad8 r6, r10, lr ; calculate sum of positive differences
usad8 r7, r8, lr ; calculate sum of negative differences
orr r8, r8, r10 ; differences of all 4 pixels
; calculate total sum
add r4, r4, r6 ; add positive differences to sum
sub r4, r4, r7 ; subtract negative differences from sum
; calculate sse
uxtb16 r7, r8 ; byte (two pixels) to halfwords
uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
subs r12, r12, #1 ; next row
smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
bne loop
; return stuff
ldr r8, [sp, #32] ; get address of sse
mul r1, r4, r4 ; sum * sum
str r5, [r8] ; store sse
sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6))
pop {r4-r10, pc}
ENDP
END

View file

@@ -1,181 +0,0 @@
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp9_variance_halfpixvar16x16_h_armv6|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|vp9_variance_halfpixvar16x16_h_armv6| PROC
stmfd sp!, {r4-r12, lr}
pld [r0, r1, lsl #0]
pld [r2, r3, lsl #0]
mov r8, #0 ; initialize sum = 0
ldr r10, c80808080
mov r11, #0 ; initialize sse = 0
mov r12, #16 ; set loop counter to 16 (=block height)
mov lr, #0 ; constant zero
loop
; 1st 4 pixels
ldr r4, [r0, #0] ; load 4 src pixels
ldr r6, [r0, #1] ; load 4 src pixels with 1 byte offset
ldr r5, [r2, #0] ; load 4 ref pixels
; bilinear interpolation
mvn r6, r6
uhsub8 r4, r4, r6
eor r4, r4, r10
usub8 r6, r4, r5 ; calculate difference
pld [r0, r1, lsl #1]
sel r7, r6, lr ; select bytes with positive difference
usub8 r6, r5, r4 ; calculate difference with reversed operands
pld [r2, r3, lsl #1]
sel r6, r6, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
subs r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 2nd 4 pixels
ldr r4, [r0, #4] ; load 4 src pixels
ldr r6, [r0, #5] ; load 4 src pixels with 1 byte offset
ldr r5, [r2, #4] ; load 4 ref pixels
; bilinear interpolation
mvn r6, r6
uhsub8 r4, r4, r6
eor r4, r4, r10
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
sel r7, r6, lr ; select bytes with positive difference
usub8 r6, r5, r4 ; calculate difference with reversed operands
sel r6, r6, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 3rd 4 pixels
ldr r4, [r0, #8] ; load 4 src pixels
ldr r6, [r0, #9] ; load 4 src pixels with 1 byte offset
ldr r5, [r2, #8] ; load 4 ref pixels
; bilinear interpolation
mvn r6, r6
uhsub8 r4, r4, r6
eor r4, r4, r10
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
sel r7, r6, lr ; select bytes with positive difference
usub8 r6, r5, r4 ; calculate difference with reversed operands
sel r6, r6, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 4th 4 pixels
ldr r4, [r0, #12] ; load 4 src pixels
ldr r6, [r0, #13] ; load 4 src pixels with 1 byte offset
ldr r5, [r2, #12] ; load 4 ref pixels
; bilinear interpolation
mvn r6, r6
uhsub8 r4, r4, r6
eor r4, r4, r10
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
add r0, r0, r1 ; set src_ptr to next row
sel r7, r6, lr ; select bytes with positive difference
usub8 r6, r5, r4 ; calculate difference with reversed operands
add r2, r2, r3 ; set dst_ptr to next row
sel r6, r6, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
subs r12, r12, #1
bne loop
; return stuff
ldr r6, [sp, #40] ; get address of sse
mul r0, r8, r8 ; sum * sum
str r11, [r6] ; store sse
sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
ldmfd sp!, {r4-r12, pc}
ENDP
c80808080
DCD 0x80808080
END
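The mvn/uhsub8/eor sequence above computes a rounded byte average without widening to 9 bits: (a + b + 1) >> 1 equals (((a - (255 - b)) >> 1) + 128) mod 256, applied to four packed bytes at once. A scalar model of one byte lane, relying on arithmetic right shift of negatives as on ARM (the hv and v variants below reuse the same trick vertically and in both directions):

#include <stdint.h>

static uint8_t halfpel_avg(uint8_t a, uint8_t b)
{
    /* uhsub8 byte lane: halving subtract of a and ~b == 255 - b */
    int8_t halved = (int8_t)(((int)a - (int)(uint8_t)~b) >> 1);
    /* eor with 0x80808080: add 128 mod 256, giving (a + b + 1) >> 1 */
    return (uint8_t)(halved ^ 0x80);
}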

View file

@@ -1,222 +0,0 @@
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp9_variance_halfpixvar16x16_hv_armv6|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|vp9_variance_halfpixvar16x16_hv_armv6| PROC
stmfd sp!, {r4-r12, lr}
pld [r0, r1, lsl #0]
pld [r2, r3, lsl #0]
mov r8, #0 ; initialize sum = 0
ldr r10, c80808080
mov r11, #0 ; initialize sse = 0
mov r12, #16 ; set loop counter to 16 (=block height)
mov lr, #0 ; constant zero
loop
add r9, r0, r1 ; pointer to pixels on the next row
; 1st 4 pixels
ldr r4, [r0, #0] ; load source pixels a, row N
ldr r6, [r0, #1] ; load source pixels b, row N
ldr r5, [r9, #0] ; load source pixels c, row N+1
ldr r7, [r9, #1] ; load source pixels d, row N+1
; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
mvn r6, r6
uhsub8 r4, r4, r6
eor r4, r4, r10
; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
mvn r7, r7
uhsub8 r5, r5, r7
eor r5, r5, r10
; z = (x + y + 1) >> 1, interpolate half pixel values vertically
mvn r5, r5
uhsub8 r4, r4, r5
ldr r5, [r2, #0] ; load 4 ref pixels
eor r4, r4, r10
usub8 r6, r4, r5 ; calculate difference
pld [r0, r1, lsl #1]
sel r7, r6, lr ; select bytes with positive difference
usub8 r6, r5, r4 ; calculate difference with reversed operands
pld [r2, r3, lsl #1]
sel r6, r6, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
subs r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 2nd 4 pixels
ldr r4, [r0, #4] ; load source pixels a, row N
ldr r6, [r0, #5] ; load source pixels b, row N
ldr r5, [r9, #4] ; load source pixels c, row N+1
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
ldr r7, [r9, #5] ; load source pixels d, row N+1
; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
mvn r6, r6
uhsub8 r4, r4, r6
eor r4, r4, r10
; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
mvn r7, r7
uhsub8 r5, r5, r7
eor r5, r5, r10
; z = (x + y + 1) >> 1, interpolate half pixel values vertically
mvn r5, r5
uhsub8 r4, r4, r5
ldr r5, [r2, #4] ; load 4 ref pixels
eor r4, r4, r10
usub8 r6, r4, r5 ; calculate difference
sel r7, r6, lr ; select bytes with positive difference
usub8 r6, r5, r4 ; calculate difference with reversed operands
sel r6, r6, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 3rd 4 pixels
ldr r4, [r0, #8] ; load source pixels a, row N
ldr r6, [r0, #9] ; load source pixels b, row N
ldr r5, [r9, #8] ; load source pixels c, row N+1
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
ldr r7, [r9, #9] ; load source pixels d, row N+1
; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
mvn r6, r6
uhsub8 r4, r4, r6
eor r4, r4, r10
; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
mvn r7, r7
uhsub8 r5, r5, r7
eor r5, r5, r10
; z = (x + y + 1) >> 1, interpolate half pixel values vertically
mvn r5, r5
uhsub8 r4, r4, r5
ldr r5, [r2, #8] ; load 4 ref pixels
eor r4, r4, r10
usub8 r6, r4, r5 ; calculate difference
sel r7, r6, lr ; select bytes with positive difference
usub8 r6, r5, r4 ; calculate difference with reversed operands
sel r6, r6, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 4th 4 pixels
ldr r4, [r0, #12] ; load source pixels a, row N
ldr r6, [r0, #13] ; load source pixels b, row N
ldr r5, [r9, #12] ; load source pixels c, row N+1
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
ldr r7, [r9, #13] ; load source pixels d, row N+1
; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
mvn r6, r6
uhsub8 r4, r4, r6
eor r4, r4, r10
; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
mvn r7, r7
uhsub8 r5, r5, r7
eor r5, r5, r10
; z = (x + y + 1) >> 1, interpolate half pixel values vertically
mvn r5, r5
uhsub8 r4, r4, r5
ldr r5, [r2, #12] ; load 4 ref pixels
eor r4, r4, r10
usub8 r6, r4, r5 ; calculate difference
add r0, r0, r1 ; set src_ptr to next row
sel r7, r6, lr ; select bytes with positive difference
usub8 r6, r5, r4 ; calculate difference with reversed operands
add r2, r2, r3 ; set dst_ptr to next row
sel r6, r6, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
subs r12, r12, #1
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
bne loop
; return stuff
ldr r6, [sp, #40] ; get address of sse
mul r0, r8, r8 ; sum * sum
str r11, [r6] ; store sse
sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
ldmfd sp!, {r4-r12, pc}
ENDP
c80808080
DCD 0x80808080
END

View file

@@ -1,183 +0,0 @@
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp9_variance_halfpixvar16x16_v_armv6|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|vp9_variance_halfpixvar16x16_v_armv6| PROC
stmfd sp!, {r4-r12, lr}
pld [r0, r1, lsl #0]
pld [r2, r3, lsl #0]
mov r8, #0 ; initialize sum = 0
ldr r10, c80808080
mov r11, #0 ; initialize sse = 0
mov r12, #16 ; set loop counter to 16 (=block height)
mov lr, #0 ; constant zero
loop
add r9, r0, r1 ; set src pointer to next row
; 1st 4 pixels
ldr r4, [r0, #0] ; load 4 src pixels
ldr r6, [r9, #0] ; load 4 src pixels from next row
ldr r5, [r2, #0] ; load 4 ref pixels
; bilinear interpolation
mvn r6, r6
uhsub8 r4, r4, r6
eor r4, r4, r10
usub8 r6, r4, r5 ; calculate difference
pld [r0, r1, lsl #1]
sel r7, r6, lr ; select bytes with positive difference
usub8 r6, r5, r4 ; calculate difference with reversed operands
pld [r2, r3, lsl #1]
sel r6, r6, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
subs r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 2nd 4 pixels
ldr r4, [r0, #4] ; load 4 src pixels
ldr r6, [r9, #4] ; load 4 src pixels from next row
ldr r5, [r2, #4] ; load 4 ref pixels
; bilinear interpolation
mvn r6, r6
uhsub8 r4, r4, r6
eor r4, r4, r10
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
sel r7, r6, lr ; select bytes with positive difference
usub8 r6, r5, r4 ; calculate difference with reversed operands
sel r6, r6, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 3rd 4 pixels
ldr r4, [r0, #8] ; load 4 src pixels
ldr r6, [r9, #8] ; load 4 src pixels from next row
ldr r5, [r2, #8] ; load 4 ref pixels
; bilinear interpolation
mvn r6, r6
uhsub8 r4, r4, r6
eor r4, r4, r10
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
sel r7, r6, lr ; select bytes with positive difference
usub8 r6, r5, r4 ; calculate difference with reversed operands
sel r6, r6, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
; 4th 4 pixels
ldr r4, [r0, #12] ; load 4 src pixels
ldr r6, [r9, #12] ; load 4 src pixels from next row
ldr r5, [r2, #12] ; load 4 ref pixels
; bilinear interpolation
mvn r6, r6
uhsub8 r4, r4, r6
eor r4, r4, r10
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
usub8 r6, r4, r5 ; calculate difference
add r0, r0, r1 ; set src_ptr to next row
sel r7, r6, lr ; select bytes with positive difference
usub8 r6, r5, r4 ; calculate difference with reversed operands
add r2, r2, r3 ; set dst_ptr to next row
sel r6, r6, lr ; select bytes with negative difference
; calculate partial sums
usad8 r4, r7, lr ; calculate sum of positive differences
usad8 r5, r6, lr ; calculate sum of negative differences
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
subs r12, r12, #1
bne loop
; return stuff
ldr r6, [sp, #40] ; get address of sse
mul r0, r8, r8 ; sum * sum
str r11, [r6] ; store sse
sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8))
ldmfd sp!, {r4-r12, pc}
ENDP
c80808080
DCD 0x80808080
END

View file

@@ -1,212 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_short_walsh4x4_armv6|
ARM
REQUIRE8
PRESERVE8
AREA |.text|, CODE, READONLY ; name this block of code
;short vp8_short_walsh4x4_armv6(short *input, short *output, int pitch)
; r0 short *input,
; r1 short *output,
; r2 int pitch
|vp8_short_walsh4x4_armv6| PROC
stmdb sp!, {r4 - r11, lr}
ldrd r4, r5, [r0], r2
ldr lr, c00040004
ldrd r6, r7, [r0], r2
; 0-3
qadd16 r3, r4, r5 ; [d1|a1] [1+3 | 0+2]
qsub16 r4, r4, r5 ; [c1|b1] [1-3 | 0-2]
ldrd r8, r9, [r0], r2
; 4-7
qadd16 r5, r6, r7 ; [d1|a1] [5+7 | 4+6]
qsub16 r6, r6, r7 ; [c1|b1] [5-7 | 4-6]
ldrd r10, r11, [r0]
; 8-11
qadd16 r7, r8, r9 ; [d1|a1] [9+11 | 8+10]
qsub16 r8, r8, r9 ; [c1|b1] [9-11 | 8-10]
; 12-15
qadd16 r9, r10, r11 ; [d1|a1] [13+15 | 12+14]
qsub16 r10, r10, r11 ; [c1|b1] [13-15 | 12-14]
lsls r2, r3, #16
smuad r11, r3, lr ; A0 = a1<<2 + d1<<2
addne r11, r11, #1 ; A0 += (a1!=0)
lsls r2, r7, #16
smuad r12, r7, lr ; C0 = a1<<2 + d1<<2
addne r12, r12, #1 ; C0 += (a1!=0)
add r0, r11, r12 ; a1_0 = A0 + C0
sub r11, r11, r12 ; b1_0 = A0 - C0
lsls r2, r5, #16
smuad r12, r5, lr ; B0 = a1<<2 + d1<<2
addne r12, r12, #1 ; B0 += (a1!=0)
lsls r2, r9, #16
smuad r2, r9, lr ; D0 = a1<<2 + d1<<2
addne r2, r2, #1 ; D0 += (a1!=0)
add lr, r12, r2 ; d1_0 = B0 + D0
sub r12, r12, r2 ; c1_0 = B0 - D0
; op[0,4,8,12]
adds r2, r0, lr ; a2 = a1_0 + d1_0
addmi r2, r2, #1 ; += a2 < 0
add r2, r2, #3 ; += 3
subs r0, r0, lr ; d2 = a1_0 - d1_0
mov r2, r2, asr #3 ; >> 3
strh r2, [r1] ; op[0]
addmi r0, r0, #1 ; += a2 < 0
add r0, r0, #3 ; += 3
ldr lr, c00040004
mov r0, r0, asr #3 ; >> 3
strh r0, [r1, #24] ; op[12]
adds r2, r11, r12 ; b2 = b1_0 + c1_0
addmi r2, r2, #1 ; += a2 < 0
add r2, r2, #3 ; += 3
subs r0, r11, r12 ; c2 = b1_0 - c1_0
mov r2, r2, asr #3 ; >> 3
strh r2, [r1, #8] ; op[4]
addmi r0, r0, #1 ; += a2 < 0
add r0, r0, #3 ; += 3
smusd r3, r3, lr ; A3 = a1<<2 - d1<<2
smusd r7, r7, lr ; C3 = a1<<2 - d1<<2
mov r0, r0, asr #3 ; >> 3
strh r0, [r1, #16] ; op[8]
; op[3,7,11,15]
add r0, r3, r7 ; a1_3 = A3 + C3
sub r3, r3, r7 ; b1_3 = A3 - C3
smusd r5, r5, lr ; B3 = a1<<2 - d1<<2
smusd r9, r9, lr ; D3 = a1<<2 - d1<<2
add r7, r5, r9 ; d1_3 = B3 + D3
sub r5, r5, r9 ; c1_3 = B3 - D3
adds r2, r0, r7 ; a2 = a1_3 + d1_3
addmi r2, r2, #1 ; += a2 < 0
add r2, r2, #3 ; += 3
adds r9, r3, r5 ; b2 = b1_3 + c1_3
mov r2, r2, asr #3 ; >> 3
strh r2, [r1, #6] ; op[3]
addmi r9, r9, #1 ; += a2 < 0
add r9, r9, #3 ; += 3
subs r2, r3, r5 ; c2 = b1_3 - c1_3
mov r9, r9, asr #3 ; >> 3
strh r9, [r1, #14] ; op[7]
addmi r2, r2, #1 ; += a2 < 0
add r2, r2, #3 ; += 3
subs r9, r0, r7 ; d2 = a1_3 - d1_3
mov r2, r2, asr #3 ; >> 3
strh r2, [r1, #22] ; op[11]
addmi r9, r9, #1 ; += a2 < 0
add r9, r9, #3 ; += 3
smuad r3, r4, lr ; A1 = b1<<2 + c1<<2
smuad r5, r8, lr ; C1 = b1<<2 + c1<<2
mov r9, r9, asr #3 ; >> 3
strh r9, [r1, #30] ; op[15]
; op[1,5,9,13]
add r0, r3, r5 ; a1_1 = A1 + C1
sub r3, r3, r5 ; b1_1 = A1 - C1
smuad r7, r6, lr ; B1 = b1<<2 + c1<<2
smuad r9, r10, lr ; D1 = b1<<2 + c1<<2
add r5, r7, r9 ; d1_1 = B1 + D1
sub r7, r7, r9 ; c1_1 = B1 - D1
adds r2, r0, r5 ; a2 = a1_1 + d1_1
addmi r2, r2, #1 ; += a2 < 0
add r2, r2, #3 ; += 3
adds r9, r3, r7 ; b2 = b1_1 + c1_1
mov r2, r2, asr #3 ; >> 3
strh r2, [r1, #2] ; op[1]
addmi r9, r9, #1 ; += a2 < 0
add r9, r9, #3 ; += 3
subs r2, r3, r7 ; c2 = b1_1 - c1_1
mov r9, r9, asr #3 ; >> 3
strh r9, [r1, #10] ; op[5]
addmi r2, r2, #1 ; += a2 < 0
add r2, r2, #3 ; += 3
subs r9, r0, r5 ; d2 = a1_1 - d1_1
mov r2, r2, asr #3 ; >> 3
strh r2, [r1, #18] ; op[9]
addmi r9, r9, #1 ; += a2 < 0
add r9, r9, #3 ; += 3
smusd r4, r4, lr ; A2 = b1<<2 - c1<<2
smusd r8, r8, lr ; C2 = b1<<2 - c1<<2
mov r9, r9, asr #3 ; >> 3
strh r9, [r1, #26] ; op[13]
; op[2,6,10,14]
add r11, r4, r8 ; a1_2 = A2 + C2
sub r12, r4, r8 ; b1_2 = A2 - C2
smusd r6, r6, lr ; B2 = b1<<2 - c1<<2
smusd r10, r10, lr ; D2 = b1<<2 - c1<<2
add r4, r6, r10 ; d1_2 = B2 + D2
sub r8, r6, r10 ; c1_2 = B2 - D2
adds r2, r11, r4 ; a2 = a1_2 + d1_2
addmi r2, r2, #1 ; += a2 < 0
add r2, r2, #3 ; += 3
adds r9, r12, r8 ; b2 = b1_2 + c1_2
mov r2, r2, asr #3 ; >> 3
strh r2, [r1, #4] ; op[2]
addmi r9, r9, #1 ; += b2 < 0
add r9, r9, #3 ; += 3
subs r2, r12, r8 ; c2 = b1_2 - c1_2
mov r9, r9, asr #3 ; >> 3
strh r9, [r1, #12] ; op[6]
addmi r2, r2, #1 ; += c2 < 0
add r2, r2, #3 ; += 3
subs r9, r11, r4 ; d2 = a1_2 - d1_2
mov r2, r2, asr #3 ; >> 3
strh r2, [r1, #20] ; op[10]
addmi r9, r9, #1 ; += d2 < 0
add r9, r9, #3 ; += 3
mov r9, r9, asr #3 ; >> 3
strh r9, [r1, #28] ; op[14]
ldmia sp!, {r4 - r11, pc}
ENDP ; |vp8_short_walsh4x4_armv6|
c00040004
DCD 0x00040004
END
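For reference, the rounding this routine performs can be sketched in C; the helper names below are illustrative, not from the source.
/* Stage 1, per input column: scale a1 and d1 by 4 (smuad with 0x00040004)
   and add 1 when a1 is nonzero (the lsls/addne pair). */
static int walsh_stage1(int a1, int d1) {
  return (a1 << 2) + (d1 << 2) + (a1 != 0);
}
/* Stage 2, per output coefficient: add 1 for negative values (addmi),
   add 3, then arithmetic shift right by 3. */
static short walsh_round(int v) {
  if (v < 0) v += 1;
  return (short)((v + 3) >> 3);
}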

View file

@ -1,261 +0,0 @@
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_fast_quantize_b_neon|
EXPORT |vp8_fast_quantize_b_pair_neon|
INCLUDE vp9_asm_enc_offsets.asm
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=4
;vp8_fast_quantize_b_pair_neon(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2);
|vp8_fast_quantize_b_pair_neon| PROC
stmfd sp!, {r4-r9}
vstmdb sp!, {q4-q7}
ldr r4, [r0, #vp8_block_coeff]
ldr r5, [r0, #vp8_block_quant_fast]
ldr r6, [r0, #vp8_block_round]
vld1.16 {q0, q1}, [r4@128] ; load z
ldr r7, [r2, #vp8_blockd_qcoeff]
vabs.s16 q4, q0 ; calculate x = abs(z)
vabs.s16 q5, q1
;right shift by 15 to get the sign: all 0s if positive, all 1s if negative
vshr.s16 q2, q0, #15 ; sz
vshr.s16 q3, q1, #15
vld1.s16 {q6, q7}, [r6@128] ; load round_ptr [0-15]
vld1.s16 {q8, q9}, [r5@128] ; load quant_ptr [0-15]
ldr r4, [r1, #vp8_block_coeff]
vadd.s16 q4, q6 ; x + Round
vadd.s16 q5, q7
vld1.16 {q0, q1}, [r4@128] ; load z2
vqdmulh.s16 q4, q8 ; y = ((Round+abs(z)) * Quant) >> 16
vqdmulh.s16 q5, q9
vabs.s16 q10, q0 ; calculate x2 = abs(z2)
vabs.s16 q11, q1
vshr.s16 q12, q0, #15 ; sz2
vshr.s16 q13, q1, #15
;modify data to have its original sign
veor.s16 q4, q2 ; y^sz
veor.s16 q5, q3
vadd.s16 q10, q6 ; x2 + Round
vadd.s16 q11, q7
ldr r8, [r2, #vp8_blockd_dequant]
vqdmulh.s16 q10, q8 ; y2 = ((Round+abs(z2)) * Quant) >> 16
vqdmulh.s16 q11, q9
vshr.s16 q4, #1 ; right shift 1 after vqdmulh
vshr.s16 q5, #1
vld1.s16 {q6, q7}, [r8@128] ;load dequant_ptr[i]
vsub.s16 q4, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
vsub.s16 q5, q3
vshr.s16 q10, #1 ; right shift 1 after vqdmulh
vshr.s16 q11, #1
ldr r9, [r2, #vp8_blockd_dqcoeff]
veor.s16 q10, q12 ; y2^sz2
veor.s16 q11, q13
vst1.s16 {q4, q5}, [r7] ; store: qcoeff = x1
vsub.s16 q10, q12 ; x2=(y2^sz2)-sz2 = (y2^sz2)-(-1) (2's complement)
vsub.s16 q11, q13
ldr r6, [r3, #vp8_blockd_qcoeff]
vmul.s16 q2, q6, q4 ; x * Dequant
vmul.s16 q3, q7, q5
ldr r0, _inv_zig_zag_ ; load ptr of inverse zigzag table
vceq.s16 q8, q8 ; set q8 to all 1
vst1.s16 {q10, q11}, [r6] ; store: qcoeff = x2
vmul.s16 q12, q6, q10 ; x2 * Dequant
vmul.s16 q13, q7, q11
vld1.16 {q6, q7}, [r0@128] ; load inverse scan order
vtst.16 q14, q4, q8 ; now find eob
vtst.16 q15, q5, q8 ; non-zero element is set to all 1
vst1.s16 {q2, q3}, [r9] ; store dqcoeff = x * Dequant
ldr r7, [r3, #vp8_blockd_dqcoeff]
vand q0, q6, q14 ; get all valid numbers from scan array
vand q1, q7, q15
vst1.s16 {q12, q13}, [r7] ; store dqcoeff = x2 * Dequant
vtst.16 q2, q10, q8 ; now find eob
vtst.16 q3, q11, q8 ; non-zero element is set to all 1
vmax.u16 q0, q0, q1 ; find maximum value in q0, q1
vand q10, q6, q2 ; get all valid numbers from scan array
vand q11, q7, q3
vmax.u16 q10, q10, q11 ; find maximum value in q10, q11
vmax.u16 d0, d0, d1
vmax.u16 d20, d20, d21
vmovl.u16 q0, d0
vmovl.u16 q10, d20
vmax.u32 d0, d0, d1
vmax.u32 d20, d20, d21
vpmax.u32 d0, d0, d0
vpmax.u32 d20, d20, d20
add r4, r2, #vp8_blockd_eob
add r5, r3, #vp8_blockd_eob
vst1.32 {d0[0]}, [r4@32]
vst1.32 {d20[0]}, [r5@32]
vldmia sp!, {q4-q7}
ldmfd sp!, {r4-r9}
bx lr
ENDP
;void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
|vp8_fast_quantize_b_neon| PROC
stmfd sp!, {r4-r7}
ldr r3, [r0, #vp8_block_coeff]
ldr r4, [r0, #vp8_block_quant_fast]
ldr r5, [r0, #vp8_block_round]
vld1.16 {q0, q1}, [r3@128] ; load z
vorr.s16 q14, q0, q1 ; check if all zero (step 1)
ldr r6, [r1, #vp8_blockd_qcoeff]
ldr r7, [r1, #vp8_blockd_dqcoeff]
vorr.s16 d28, d28, d29 ; check if all zero (step 2)
vabs.s16 q12, q0 ; calculate x = abs(z)
vabs.s16 q13, q1
;right shift by 15 to get the sign: all 0s if positive, all 1s if negative
vshr.s16 q2, q0, #15 ; sz
vmov r2, r3, d28 ; check if all zero (step 3)
vshr.s16 q3, q1, #15
vld1.s16 {q14, q15}, [r5@128]; load round_ptr [0-15]
vld1.s16 {q8, q9}, [r4@128] ; load quant_ptr [0-15]
vadd.s16 q12, q14 ; x + Round
vadd.s16 q13, q15
ldr r0, _inv_zig_zag_ ; load ptr of inverse zigzag table
vqdmulh.s16 q12, q8 ; y = ((Round+abs(z)) * Quant) >> 16
vqdmulh.s16 q13, q9
vld1.16 {q10, q11}, [r0@128]; load inverse scan order
vceq.s16 q8, q8 ; set q8 to all 1
ldr r4, [r1, #vp8_blockd_dequant]
vshr.s16 q12, #1 ; right shift 1 after vqdmulh
vshr.s16 q13, #1
orr r2, r2, r3 ; check if all zero (step 4)
cmp r2, #0 ; check if all zero (step 5)
beq zero_output ; check if all zero (step 6)
;modify data to have its original sign
veor.s16 q12, q2 ; y^sz
veor.s16 q13, q3
vsub.s16 q12, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
vsub.s16 q13, q3
vld1.s16 {q2, q3}, [r4@128] ; load dequant_ptr[i]
vtst.16 q14, q12, q8 ; now find eob
vtst.16 q15, q13, q8 ; non-zero element is set to all 1
vst1.s16 {q12, q13}, [r6@128]; store: qcoeff = x1
vand q10, q10, q14 ; get all valid numbers from scan array
vand q11, q11, q15
vmax.u16 q0, q10, q11 ; find maximum value in q0, q1
vmax.u16 d0, d0, d1
vmovl.u16 q0, d0
vmul.s16 q2, q12 ; x * Dequant
vmul.s16 q3, q13
vmax.u32 d0, d0, d1
vpmax.u32 d0, d0, d0
vst1.s16 {q2, q3}, [r7@128] ; store dqcoeff = x * Dequant
add r4, r1, #vp8_blockd_eob
vst1.32 {d0[0]}, [r4@32]
ldmfd sp!, {r4-r7}
bx lr
zero_output
str r2, [r1, #vp8_blockd_eob]
vst1.s16 {q0, q1}, [r6@128] ; qcoeff = 0
vst1.s16 {q0, q1}, [r7@128] ; dqcoeff = 0
ldmfd sp!, {r4-r7}
bx lr
ENDP
; default inverse zigzag table is defined in vp9/common/vp9_entropy.c
_inv_zig_zag_
DCD inv_zig_zag
ALIGN 16 ; enable use of @128 bit aligned loads
inv_zig_zag
DCW 0x0001, 0x0002, 0x0006, 0x0007
DCW 0x0003, 0x0005, 0x0008, 0x000d
DCW 0x0004, 0x0009, 0x000c, 0x000e
DCW 0x000a, 0x000b, 0x000f, 0x0010
END
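A hedged scalar sketch of the quantizer math above, following the instruction comments; the array parameters stand in for the BLOCK/BLOCKD fields and the names are illustrative.
#include <stdlib.h>
/* Returns the 1-based end-of-block position, as the NEON code stores it. */
static int fast_quantize_b_ref(const short *coeff, const short *round,
                               const short *quant_fast, const short *dequant,
                               const short *inv_zig_zag,
                               short *qcoeff, short *dqcoeff) {
  int i, eob = 0;
  for (i = 0; i < 16; i++) {
    int z = coeff[i];
    int sz = z >> 31;                               /* all 1s if negative */
    int x = abs(z);
    int y = ((x + round[i]) * quant_fast[i]) >> 16; /* vqdmulh + vshr #1  */
    int x1 = (y ^ sz) - sz;                         /* restore the sign   */
    qcoeff[i] = (short)x1;
    dqcoeff[i] = (short)(x1 * dequant[i]);
    if (x1 && inv_zig_zag[i] > eob)
      eob = inv_zig_zag[i];                         /* last nonzero, in scan order */
  }
  return eob;
}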

View file

@ -1,68 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_memcpy_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;=========================================
;void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz);
|vp8_memcpy_neon| PROC
;pld [r1] ;preload pred data
;pld [r1, #128]
;pld [r1, #256]
;pld [r1, #384]
mov r12, r2, lsr #8 ;copy 256 bytes of data at a time
memcpy_neon_loop
vld1.8 {q0, q1}, [r1]! ;load src data
subs r12, r12, #1
vld1.8 {q2, q3}, [r1]!
vst1.8 {q0, q1}, [r0]! ;copy to dst_ptr
vld1.8 {q4, q5}, [r1]!
vst1.8 {q2, q3}, [r0]!
vld1.8 {q6, q7}, [r1]!
vst1.8 {q4, q5}, [r0]!
vld1.8 {q8, q9}, [r1]!
vst1.8 {q6, q7}, [r0]!
vld1.8 {q10, q11}, [r1]!
vst1.8 {q8, q9}, [r0]!
vld1.8 {q12, q13}, [r1]!
vst1.8 {q10, q11}, [r0]!
vld1.8 {q14, q15}, [r1]!
vst1.8 {q12, q13}, [r0]!
vst1.8 {q14, q15}, [r0]!
;pld [r1] ;preload pred data -- need to adjust for real device
;pld [r1, #128]
;pld [r1, #256]
;pld [r1, #384]
bne memcpy_neon_loop
ands r3, r2, #0xff ;extra copy
beq done_copy_neon_loop
extra_copy_neon_loop
vld1.8 {q0}, [r1]! ;load src data
subs r3, r3, #16
vst1.8 {q0}, [r0]!
bne extra_copy_neon_loop
done_copy_neon_loop
bx lr
ENDP
END
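In scalar terms the copy pattern is the following sketch, which, like the NEON tail loop, assumes sz is a multiple of 16.
#include <string.h>
static void memcpy_neon_ref(unsigned char *dst, const unsigned char *src, int sz) {
  int i = 0;
  for (; i + 256 <= sz; i += 256)   /* main loop: 256 bytes per iteration */
    memcpy(dst + i, src + i, 256);
  for (; i < sz; i += 16)           /* extra copy: 16 bytes per iteration */
    memcpy(dst + i, src + i, 16);
}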

View file

@ -1,116 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_mse16x16_neon|
EXPORT |vp8_get4x4sse_cs_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;============================
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
;note: sum is never used in this function, so that part of the calculation has been
;removed relative to vp9_variance().
|vp8_mse16x16_neon| PROC
vmov.i8 q7, #0 ;q7, q8, q9, q10 - sse
vmov.i8 q8, #0
vmov.i8 q9, #0
vmov.i8 q10, #0
mov r12, #8
mse16x16_neon_loop
vld1.8 {q0}, [r0], r1 ;Load up source and reference
vld1.8 {q2}, [r2], r3
vld1.8 {q1}, [r0], r1
vld1.8 {q3}, [r2], r3
vsubl.u8 q11, d0, d4
vsubl.u8 q12, d1, d5
vsubl.u8 q13, d2, d6
vsubl.u8 q14, d3, d7
vmlal.s16 q7, d22, d22
vmlal.s16 q8, d23, d23
subs r12, r12, #1
vmlal.s16 q9, d24, d24
vmlal.s16 q10, d25, d25
vmlal.s16 q7, d26, d26
vmlal.s16 q8, d27, d27
vmlal.s16 q9, d28, d28
vmlal.s16 q10, d29, d29
bne mse16x16_neon_loop
vadd.u32 q7, q7, q8
vadd.u32 q9, q9, q10
ldr r12, [sp] ;load *sse from stack
vadd.u32 q10, q7, q9
vpaddl.u32 q1, q10
vadd.u64 d0, d2, d3
vst1.32 {d0[0]}, [r12]
vmov.32 r0, d0[0]
bx lr
ENDP
;=============================
; r0 unsigned char *src_ptr,
; r1 int source_stride,
; r2 unsigned char *ref_ptr,
; r3 int recon_stride
|vp8_get4x4sse_cs_neon| PROC
vld1.8 {d0}, [r0], r1 ;Load up source and reference
vld1.8 {d4}, [r2], r3
vld1.8 {d1}, [r0], r1
vld1.8 {d5}, [r2], r3
vld1.8 {d2}, [r0], r1
vld1.8 {d6}, [r2], r3
vld1.8 {d3}, [r0], r1
vld1.8 {d7}, [r2], r3
vsubl.u8 q11, d0, d4
vsubl.u8 q12, d1, d5
vsubl.u8 q13, d2, d6
vsubl.u8 q14, d3, d7
vmull.s16 q7, d22, d22
vmull.s16 q8, d24, d24
vmull.s16 q9, d26, d26
vmull.s16 q10, d28, d28
vadd.u32 q7, q7, q8
vadd.u32 q9, q9, q10
vadd.u32 q9, q7, q9
vpaddl.u32 q1, q9
vadd.u64 d0, d2, d3
vmov.32 r0, d0[0]
bx lr
ENDP
END
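A scalar reference for the 16x16 MSE computed above (the sum term of a full variance is deliberately skipped):
static unsigned int mse16x16_ref(const unsigned char *src, int src_stride,
                                 const unsigned char *ref, int ref_stride,
                                 unsigned int *sse) {
  unsigned int total = 0;
  int r, c;
  for (r = 0; r < 16; ++r)
    for (c = 0; c < 16; ++c) {
      int diff = src[r * src_stride + c] - ref[r * ref_stride + c];
      total += (unsigned int)(diff * diff);
    }
  *sse = total;
  return total;   /* the NEON routine returns the same value it stores */
}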

View file

@ -1,48 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/encoder/vp9_onyx_int.h"
#include "vp9/encoder/vp9_quantize.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_scale/vpxscale.h"
#include "vp9/common/vp9_alloccommon.h"
extern void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz);
void
vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction) {
unsigned char *src_y, *dst_y;
int yheight;
int ystride;
int border;
int yoffset;
int linestocopy;
border = src_ybc->border;
yheight = src_ybc->y_height;
ystride = src_ybc->y_stride;
linestocopy = (yheight >> (Fraction + 4));
if (linestocopy < 1)
linestocopy = 1;
linestocopy <<= 4;
yoffset = ystride * ((yheight >> 5) * 16 - 8);
src_y = src_ybc->y_buffer + yoffset;
dst_y = dst_ybc->y_buffer + yoffset;
// vpx_memcpy (dst_y, src_y, ystride * (linestocopy +16));
vp8_memcpy_neon((unsigned char *)dst_y, (unsigned char *)src_y, (int)(ystride * (linestocopy + 16)));
}
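A worked example of the line-count arithmetic above, with illustrative values:
/* yheight = 480, Fraction = 1:
   linestocopy = 480 >> (1 + 4) = 15, then << 4 -> 240 lines
   yoffset     = ystride * ((480 >> 5) * 16 - 8) = ystride * 232
   so the copy starts 8 rows above the vertical midpoint and spans
   linestocopy + 16 rows. */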

View file

@ -1,207 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_sad16x16_neon|
EXPORT |vp8_sad16x8_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr
; r1 int src_stride
; r2 unsigned char *ref_ptr
; r3 int ref_stride
|vp8_sad16x16_neon| PROC
;;
vld1.8 {q0}, [r0], r1
vld1.8 {q4}, [r2], r3
vld1.8 {q1}, [r0], r1
vld1.8 {q5}, [r2], r3
vabdl.u8 q12, d0, d8
vabdl.u8 q13, d1, d9
vld1.8 {q2}, [r0], r1
vld1.8 {q6}, [r2], r3
vabal.u8 q12, d2, d10
vabal.u8 q13, d3, d11
vld1.8 {q3}, [r0], r1
vld1.8 {q7}, [r2], r3
vabal.u8 q12, d4, d12
vabal.u8 q13, d5, d13
;;
vld1.8 {q0}, [r0], r1
vld1.8 {q4}, [r2], r3
vabal.u8 q12, d6, d14
vabal.u8 q13, d7, d15
vld1.8 {q1}, [r0], r1
vld1.8 {q5}, [r2], r3
vabal.u8 q12, d0, d8
vabal.u8 q13, d1, d9
vld1.8 {q2}, [r0], r1
vld1.8 {q6}, [r2], r3
vabal.u8 q12, d2, d10
vabal.u8 q13, d3, d11
vld1.8 {q3}, [r0], r1
vld1.8 {q7}, [r2], r3
vabal.u8 q12, d4, d12
vabal.u8 q13, d5, d13
;;
vld1.8 {q0}, [r0], r1
vld1.8 {q4}, [r2], r3
vabal.u8 q12, d6, d14
vabal.u8 q13, d7, d15
vld1.8 {q1}, [r0], r1
vld1.8 {q5}, [r2], r3
vabal.u8 q12, d0, d8
vabal.u8 q13, d1, d9
vld1.8 {q2}, [r0], r1
vld1.8 {q6}, [r2], r3
vabal.u8 q12, d2, d10
vabal.u8 q13, d3, d11
vld1.8 {q3}, [r0], r1
vld1.8 {q7}, [r2], r3
vabal.u8 q12, d4, d12
vabal.u8 q13, d5, d13
;;
vld1.8 {q0}, [r0], r1
vld1.8 {q4}, [r2], r3
vabal.u8 q12, d6, d14
vabal.u8 q13, d7, d15
vld1.8 {q1}, [r0], r1
vld1.8 {q5}, [r2], r3
vabal.u8 q12, d0, d8
vabal.u8 q13, d1, d9
vld1.8 {q2}, [r0], r1
vld1.8 {q6}, [r2], r3
vabal.u8 q12, d2, d10
vabal.u8 q13, d3, d11
vld1.8 {q3}, [r0]
vld1.8 {q7}, [r2]
vabal.u8 q12, d4, d12
vabal.u8 q13, d5, d13
vabal.u8 q12, d6, d14
vabal.u8 q13, d7, d15
vadd.u16 q0, q12, q13
vpaddl.u16 q1, q0
vpaddl.u32 q0, q1
vadd.u32 d0, d0, d1
vmov.32 r0, d0[0]
bx lr
ENDP
;==============================
;unsigned int vp8_sad16x8_c(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
|vp8_sad16x8_neon| PROC
vld1.8 {q0}, [r0], r1
vld1.8 {q4}, [r2], r3
vld1.8 {q1}, [r0], r1
vld1.8 {q5}, [r2], r3
vabdl.u8 q12, d0, d8
vabdl.u8 q13, d1, d9
vld1.8 {q2}, [r0], r1
vld1.8 {q6}, [r2], r3
vabal.u8 q12, d2, d10
vabal.u8 q13, d3, d11
vld1.8 {q3}, [r0], r1
vld1.8 {q7}, [r2], r3
vabal.u8 q12, d4, d12
vabal.u8 q13, d5, d13
vld1.8 {q0}, [r0], r1
vld1.8 {q4}, [r2], r3
vabal.u8 q12, d6, d14
vabal.u8 q13, d7, d15
vld1.8 {q1}, [r0], r1
vld1.8 {q5}, [r2], r3
vabal.u8 q12, d0, d8
vabal.u8 q13, d1, d9
vld1.8 {q2}, [r0], r1
vld1.8 {q6}, [r2], r3
vabal.u8 q12, d2, d10
vabal.u8 q13, d3, d11
vld1.8 {q3}, [r0], r1
vld1.8 {q7}, [r2], r3
vabal.u8 q12, d4, d12
vabal.u8 q13, d5, d13
vabal.u8 q12, d6, d14
vabal.u8 q13, d7, d15
vadd.u16 q0, q12, q13
vpaddl.u16 q1, q0
vpaddl.u32 q0, q1
vadd.u32 d0, d0, d1
vmov.32 r0, d0[0]
bx lr
ENDP
END
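The vabdl/vabal chains above accumulate absolute differences; a scalar reference for any block size:
#include <stdlib.h>
static unsigned int sad_ref(const unsigned char *src, int src_stride,
                            const unsigned char *ref, int ref_stride,
                            int width, int height) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < height; ++r)
    for (c = 0; c < width; ++c)
      sad += (unsigned int)abs(src[r * src_stride + c] - ref[r * ref_stride + c]);
  return sad;
}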

View file

@ -1,209 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_sad8x8_neon|
EXPORT |vp8_sad8x16_neon|
EXPORT |vp8_sad4x4_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; unsigned int vp8_sad8x8_c(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
|vp8_sad8x8_neon| PROC
vld1.8 {d0}, [r0], r1
vld1.8 {d8}, [r2], r3
vld1.8 {d2}, [r0], r1
vld1.8 {d10}, [r2], r3
vabdl.u8 q12, d0, d8
vld1.8 {d4}, [r0], r1
vld1.8 {d12}, [r2], r3
vabal.u8 q12, d2, d10
vld1.8 {d6}, [r0], r1
vld1.8 {d14}, [r2], r3
vabal.u8 q12, d4, d12
vld1.8 {d0}, [r0], r1
vld1.8 {d8}, [r2], r3
vabal.u8 q12, d6, d14
vld1.8 {d2}, [r0], r1
vld1.8 {d10}, [r2], r3
vabal.u8 q12, d0, d8
vld1.8 {d4}, [r0], r1
vld1.8 {d12}, [r2], r3
vabal.u8 q12, d2, d10
vld1.8 {d6}, [r0], r1
vld1.8 {d14}, [r2], r3
vabal.u8 q12, d4, d12
vabal.u8 q12, d6, d14
vpaddl.u16 q1, q12
vpaddl.u32 q0, q1
vadd.u32 d0, d0, d1
vmov.32 r0, d0[0]
bx lr
ENDP
;============================
;unsigned int vp8_sad8x16_c(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
|vp8_sad8x16_neon| PROC
vld1.8 {d0}, [r0], r1
vld1.8 {d8}, [r2], r3
vld1.8 {d2}, [r0], r1
vld1.8 {d10}, [r2], r3
vabdl.u8 q12, d0, d8
vld1.8 {d4}, [r0], r1
vld1.8 {d12}, [r2], r3
vabal.u8 q12, d2, d10
vld1.8 {d6}, [r0], r1
vld1.8 {d14}, [r2], r3
vabal.u8 q12, d4, d12
vld1.8 {d0}, [r0], r1
vld1.8 {d8}, [r2], r3
vabal.u8 q12, d6, d14
vld1.8 {d2}, [r0], r1
vld1.8 {d10}, [r2], r3
vabal.u8 q12, d0, d8
vld1.8 {d4}, [r0], r1
vld1.8 {d12}, [r2], r3
vabal.u8 q12, d2, d10
vld1.8 {d6}, [r0], r1
vld1.8 {d14}, [r2], r3
vabal.u8 q12, d4, d12
vld1.8 {d0}, [r0], r1
vld1.8 {d8}, [r2], r3
vabal.u8 q12, d6, d14
vld1.8 {d2}, [r0], r1
vld1.8 {d10}, [r2], r3
vabal.u8 q12, d0, d8
vld1.8 {d4}, [r0], r1
vld1.8 {d12}, [r2], r3
vabal.u8 q12, d2, d10
vld1.8 {d6}, [r0], r1
vld1.8 {d14}, [r2], r3
vabal.u8 q12, d4, d12
vld1.8 {d0}, [r0], r1
vld1.8 {d8}, [r2], r3
vabal.u8 q12, d6, d14
vld1.8 {d2}, [r0], r1
vld1.8 {d10}, [r2], r3
vabal.u8 q12, d0, d8
vld1.8 {d4}, [r0], r1
vld1.8 {d12}, [r2], r3
vabal.u8 q12, d2, d10
vld1.8 {d6}, [r0], r1
vld1.8 {d14}, [r2], r3
vabal.u8 q12, d4, d12
vabal.u8 q12, d6, d14
vpaddl.u16 q1, q12
vpaddl.u32 q0, q1
vadd.u32 d0, d0, d1
vmov.32 r0, d0[0]
bx lr
ENDP
;===========================
;unsigned int vp8_sad4x4_c(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
|vp8_sad4x4_neon| PROC
vld1.8 {d0}, [r0], r1
vld1.8 {d8}, [r2], r3
vld1.8 {d2}, [r0], r1
vld1.8 {d10}, [r2], r3
vabdl.u8 q12, d0, d8
vld1.8 {d4}, [r0], r1
vld1.8 {d12}, [r2], r3
vabal.u8 q12, d2, d10
vld1.8 {d6}, [r0], r1
vld1.8 {d14}, [r2], r3
vabal.u8 q12, d4, d12
vabal.u8 q12, d6, d14
vpaddl.u16 d1, d24
vpaddl.u32 d0, d1
vmov.32 r0, d0[0]
bx lr
ENDP
END

View file

@ -1,221 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_short_fdct4x4_neon|
EXPORT |vp8_short_fdct8x4_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=4
ALIGN 16 ; enable use of @128 bit aligned loads
coeff
DCW 5352, 5352, 5352, 5352
DCW 2217, 2217, 2217, 2217
DCD 14500, 14500, 14500, 14500
DCD 7500, 7500, 7500, 7500
DCD 12000, 12000, 12000, 12000
DCD 51000, 51000, 51000, 51000
;void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
|vp8_short_fdct4x4_neon| PROC
; Part one
vld1.16 {d0}, [r0@64], r2
adr r12, coeff
vld1.16 {d1}, [r0@64], r2
vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217
vld1.16 {d2}, [r0@64], r2
vld1.32 {q9, q10}, [r12@128]! ; q9=14500, q10=7500
vld1.16 {d3}, [r0@64], r2
; transpose d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
vtrn.32 d0, d2
vtrn.32 d1, d3
vld1.32 {q11,q12}, [r12@128] ; q11=12000, q12=51000
vtrn.16 d0, d1
vtrn.16 d2, d3
vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[3]
vadd.s16 d5, d1, d2 ; b1 = ip[1] + ip[2]
vsub.s16 d6, d1, d2 ; c1 = ip[1] - ip[2]
vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[3]
vshl.s16 q2, q2, #3 ; (a1, b1) << 3
vshl.s16 q3, q3, #3 ; (c1, d1) << 3
vadd.s16 d0, d4, d5 ; op[0] = a1 + b1
vsub.s16 d2, d4, d5 ; op[2] = a1 - b1
vmlal.s16 q9, d7, d16 ; d1*5352 + 14500
vmlal.s16 q10, d7, d17 ; d1*2217 + 7500
vmlal.s16 q9, d6, d17 ; c1*2217 + d1*5352 + 14500
vmlsl.s16 q10, d6, d16 ; d1*2217 - c1*5352 + 7500
vshrn.s32 d1, q9, #12 ; op[1] = (c1*2217 + d1*5352 + 14500)>>12
vshrn.s32 d3, q10, #12 ; op[3] = (d1*2217 - c1*5352 + 7500)>>12
; Part two
; transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
vtrn.32 d0, d2
vtrn.32 d1, d3
vtrn.16 d0, d1
vtrn.16 d2, d3
vmov.s16 d26, #7
vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[12]
vadd.s16 d5, d1, d2 ; b1 = ip[4] + ip[8]
vsub.s16 d6, d1, d2 ; c1 = ip[4] - ip[8]
vadd.s16 d4, d4, d26 ; a1 + 7
vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[12]
vadd.s16 d0, d4, d5 ; op[0] = a1 + b1 + 7
vsub.s16 d2, d4, d5 ; op[8] = a1 - b1 + 7
vmlal.s16 q11, d7, d16 ; d1*5352 + 12000
vmlal.s16 q12, d7, d17 ; d1*2217 + 51000
vceq.s16 d4, d7, #0
vshr.s16 d0, d0, #4
vshr.s16 d2, d2, #4
vmlal.s16 q11, d6, d17 ; c1*2217 + d1*5352 + 12000
vmlsl.s16 q12, d6, d16 ; d1*2217 - c1*5352 + 51000
vmvn.s16 d4, d4
vshrn.s32 d1, q11, #16 ; op[4] = (c1*2217 + d1*5352 + 12000)>>16
vsub.s16 d1, d1, d4 ; op[4] += (d1!=0)
vshrn.s32 d3, q12, #16 ; op[12]= (d1*2217 - c1*5352 + 51000)>>16
vst1.16 {q0, q1}, [r1@128]
bx lr
ENDP
;void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
|vp8_short_fdct8x4_neon| PROC
; Part one
vld1.16 {q0}, [r0@128], r2
adr r12, coeff
vld1.16 {q1}, [r0@128], r2
vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217
vld1.16 {q2}, [r0@128], r2
vld1.32 {q9, q10}, [r12@128]! ; q9=14500, q10=7500
vld1.16 {q3}, [r0@128], r2
; transpose q0=ip[0], q1=ip[1], q2=ip[2], q3=ip[3]
vtrn.32 q0, q2 ; [A0|B0]
vtrn.32 q1, q3 ; [A1|B1]
vtrn.16 q0, q1 ; [A2|B2]
vtrn.16 q2, q3 ; [A3|B3]
vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[3]
vadd.s16 q12, q1, q2 ; b1 = ip[1] + ip[2]
vsub.s16 q13, q1, q2 ; c1 = ip[1] - ip[2]
vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[3]
vshl.s16 q11, q11, #3 ; a1 << 3
vshl.s16 q12, q12, #3 ; b1 << 3
vshl.s16 q13, q13, #3 ; c1 << 3
vshl.s16 q14, q14, #3 ; d1 << 3
vadd.s16 q0, q11, q12 ; [A0 | B0] = a1 + b1
vsub.s16 q2, q11, q12 ; [A2 | B2] = a1 - b1
vmov.s16 q11, q9 ; 14500
vmov.s16 q12, q10 ; 7500
vmlal.s16 q9, d28, d16 ; A[1] = d1*5352 + 14500
vmlal.s16 q10, d28, d17 ; A[3] = d1*2217 + 7500
vmlal.s16 q11, d29, d16 ; B[1] = d1*5352 + 14500
vmlal.s16 q12, d29, d17 ; B[3] = d1*2217 + 7500
vmlal.s16 q9, d26, d17 ; A[1] = c1*2217 + d1*5352 + 14500
vmlsl.s16 q10, d26, d16 ; A[3] = d1*2217 - c1*5352 + 7500
vmlal.s16 q11, d27, d17 ; B[1] = c1*2217 + d1*5352 + 14500
vmlsl.s16 q12, d27, d16 ; B[3] = d1*2217 - c1*5352 + 7500
vshrn.s32 d2, q9, #12 ; A[1] = (c1*2217 + d1*5352 + 14500)>>12
vshrn.s32 d6, q10, #12 ; A[3] = (d1*2217 - c1*5352 + 7500)>>12
vshrn.s32 d3, q11, #12 ; B[1] = (c1*2217 + d1*5352 + 14500)>>12
vshrn.s32 d7, q12, #12 ; B[3] = (d1*2217 - c1*5352 + 7500)>>12
; Part two
vld1.32 {q9,q10}, [r12@128] ; q9=12000, q10=51000
; transpose q0=ip[0], q1=ip[4], q2=ip[8], q3=ip[12]
vtrn.32 q0, q2 ; q0=[A0 | B0]
vtrn.32 q1, q3 ; q1=[A4 | B4]
vtrn.16 q0, q1 ; q2=[A8 | B8]
vtrn.16 q2, q3 ; q3=[A12|B12]
vmov.s16 q15, #7
vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[12]
vadd.s16 q12, q1, q2 ; b1 = ip[4] + ip[8]
vadd.s16 q11, q11, q15 ; a1 + 7
vsub.s16 q13, q1, q2 ; c1 = ip[4] - ip[8]
vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[12]
vadd.s16 q0, q11, q12 ; a1 + b1 + 7
vsub.s16 q1, q11, q12 ; a1 - b1 + 7
vmov.s16 q11, q9 ; 12000
vmov.s16 q12, q10 ; 51000
vshr.s16 d0, d0, #4 ; A[0] = (a1 + b1 + 7)>>4
vshr.s16 d4, d1, #4 ; B[0] = (a1 + b1 + 7)>>4
vshr.s16 d2, d2, #4 ; A[8] = (a1 - b1 + 7)>>4
vshr.s16 d6, d3, #4 ; B[8] = (a1 - b1 + 7)>>4
vmlal.s16 q9, d28, d16 ; A[4] = d1*5352 + 12000
vmlal.s16 q10, d28, d17 ; A[12] = d1*2217 + 51000
vmlal.s16 q11, d29, d16 ; B[4] = d1*5352 + 12000
vmlal.s16 q12, d29, d17 ; B[12] = d1*2217 + 51000
vceq.s16 q14, q14, #0
vmlal.s16 q9, d26, d17 ; A[4] = c1*2217 + d1*5352 + 12000
vmlsl.s16 q10, d26, d16 ; A[12] = d1*2217 - c1*5352 + 51000
vmlal.s16 q11, d27, d17 ; B[4] = c1*2217 + d1*5352 + 12000
vmlsl.s16 q12, d27, d16 ; B[12] = d1*2217 - c1*5352 + 51000
vmvn.s16 q14, q14
vshrn.s32 d1, q9, #16 ; A[4] = (c1*2217 + d1*5352 + 12000)>>16
vshrn.s32 d3, q10, #16 ; A[12]= (d1*2217 - c1*5352 + 51000)>>16
vsub.s16 d1, d1, d28 ; A[4] += (d1!=0)
vshrn.s32 d5, q11, #16 ; B[4] = (c1*2217 + d1*5352 + 12000)>>16
vshrn.s32 d7, q12, #16 ; B[12]= (d1*2217 - c1*5352 + 51000)>>16
vsub.s16 d5, d5, d29 ; B[4] += (d1!=0)
vst1.16 {q0, q1}, [r1@128]! ; block A
vst1.16 {q2, q3}, [r1@128]! ; block B
bx lr
ENDP
END
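Assembled from the instruction comments above, the first-pass 1-D transform applied to each row is, as a scalar sketch:
static void fdct4_pass1(const short *ip, short *op) {
  int a1 = (ip[0] + ip[3]) << 3;
  int b1 = (ip[1] + ip[2]) << 3;
  int c1 = (ip[1] - ip[2]) << 3;
  int d1 = (ip[0] - ip[3]) << 3;
  op[0] = (short)(a1 + b1);
  op[2] = (short)(a1 - b1);
  op[1] = (short)((c1 * 2217 + d1 * 5352 + 14500) >> 12);
  op[3] = (short)((d1 * 2217 - c1 * 5352 + 7500) >> 12);
}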

View file

@ -1,103 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_short_walsh4x4_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp8_short_walsh4x4_neon(short *input, short *output, int pitch)
; r0 short *input,
; r1 short *output,
; r2 int pitch
|vp8_short_walsh4x4_neon| PROC
vld1.16 {d0}, [r0@64], r2 ; load input
vld1.16 {d1}, [r0@64], r2
vld1.16 {d2}, [r0@64], r2
vld1.16 {d3}, [r0@64]
;First for-loop
;transpose d0, d1, d2, d3. Then, d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
vtrn.32 d0, d2
vtrn.32 d1, d3
vmov.s32 q15, #3 ; add 3 to all values
vtrn.16 d0, d1
vtrn.16 d2, d3
vadd.s16 d4, d0, d2 ; ip[0] + ip[2]
vadd.s16 d5, d1, d3 ; ip[1] + ip[3]
vsub.s16 d6, d1, d3 ; ip[1] - ip[3]
vsub.s16 d7, d0, d2 ; ip[0] - ip[2]
vshl.s16 d4, d4, #2 ; a1 = (ip[0] + ip[2]) << 2
vshl.s16 d5, d5, #2 ; d1 = (ip[1] + ip[3]) << 2
vshl.s16 d6, d6, #2 ; c1 = (ip[1] - ip[3]) << 2
vceq.s16 d16, d4, #0 ; a1 == 0
vshl.s16 d7, d7, #2 ; b1 = (ip[0] - ip[2]) << 2
vadd.s16 d0, d4, d5 ; a1 + d1
vmvn d16, d16 ; a1 != 0
vsub.s16 d3, d4, d5 ; op[3] = a1 - d1
vadd.s16 d1, d7, d6 ; op[1] = b1 + c1
vsub.s16 d2, d7, d6 ; op[2] = b1 - c1
vsub.s16 d0, d0, d16 ; op[0] = a1 + d1 + (a1 != 0)
;Second for-loop
;transpose d0, d1, d2, d3, Then, d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
vtrn.32 d1, d3
vtrn.32 d0, d2
vtrn.16 d2, d3
vtrn.16 d0, d1
vaddl.s16 q8, d0, d2 ; a1 = ip[0]+ip[8]
vaddl.s16 q9, d1, d3 ; d1 = ip[4]+ip[12]
vsubl.s16 q10, d1, d3 ; c1 = ip[4]-ip[12]
vsubl.s16 q11, d0, d2 ; b1 = ip[0]-ip[8]
vadd.s32 q0, q8, q9 ; a2 = a1 + d1
vadd.s32 q1, q11, q10 ; b2 = b1 + c1
vsub.s32 q2, q11, q10 ; c2 = b1 - c1
vsub.s32 q3, q8, q9 ; d2 = a1 - d1
vclt.s32 q8, q0, #0
vclt.s32 q9, q1, #0
vclt.s32 q10, q2, #0
vclt.s32 q11, q3, #0
; subtract -1 (or 0)
vsub.s32 q0, q0, q8 ; a2 += a2 < 0
vsub.s32 q1, q1, q9 ; b2 += b2 < 0
vsub.s32 q2, q2, q10 ; c2 += c2 < 0
vsub.s32 q3, q3, q11 ; d2 += d2 < 0
vadd.s32 q8, q0, q15 ; a2 + 3
vadd.s32 q9, q1, q15 ; b2 + 3
vadd.s32 q10, q2, q15 ; c2 + 3
vadd.s32 q11, q3, q15 ; d2 + 3
; vrshrn? would add 1 << (3-1) = 4, not the +3 bias used above
vshrn.s32 d0, q8, #3
vshrn.s32 d1, q9, #3
vshrn.s32 d2, q10, #3
vshrn.s32 d3, q11, #3
vst1.16 {q0, q1}, [r1@128]
bx lr
ENDP
END

View file

@ -1,425 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp9_sub_pixel_variance16x16_neon_func|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; stack(r4) unsigned char *dst_ptr,
; stack(r5) int dst_pixels_per_line,
; stack(r6) unsigned int *sse
;note: most of the code is copied from bilinear_predict16x16_neon and vp9_variance16x16_neon.
|vp9_sub_pixel_variance16x16_neon_func| PROC
push {r4-r6, lr}
ldr r12, _BilinearTaps_coeff_
ldr r4, [sp, #16] ;load *dst_ptr from stack
ldr r5, [sp, #20] ;load dst_pixels_per_line from stack
ldr r6, [sp, #24] ;load *sse from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_bfilter16x16_only
add r2, r12, r2, lsl #3 ;calculate filter location
cmp r3, #0 ;skip second_pass filter if yoffset=0
vld1.s32 {d31}, [r2] ;load first_pass filter
beq firstpass_bfilter16x16_only
sub sp, sp, #272 ;reserve space on stack for temporary storage
vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
mov lr, sp
vld1.u8 {d5, d6, d7}, [r0], r1
mov r2, #3 ;loop counter
vld1.u8 {d8, d9, d10}, [r0], r1
vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
vld1.u8 {d11, d12, d13}, [r0], r1
vdup.8 d1, d31[4]
;First Pass: output_height lines x output_width columns (17x16)
vp8e_filt_blk2d_fp16x16_loop_neon
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0])
vmull.u8 q8, d3, d0
vmull.u8 q9, d5, d0
vmull.u8 q10, d6, d0
vmull.u8 q11, d8, d0
vmull.u8 q12, d9, d0
vmull.u8 q13, d11, d0
vmull.u8 q14, d12, d0
vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
vext.8 d5, d5, d6, #1
vext.8 d8, d8, d9, #1
vext.8 d11, d11, d12, #1
vmlal.u8 q7, d2, d1 ;(src_ptr[1] * Filter[1])
vmlal.u8 q9, d5, d1
vmlal.u8 q11, d8, d1
vmlal.u8 q13, d11, d1
vext.8 d3, d3, d4, #1
vext.8 d6, d6, d7, #1
vext.8 d9, d9, d10, #1
vext.8 d12, d12, d13, #1
vmlal.u8 q8, d3, d1 ;(src_ptr[1] * Filter[1])
vmlal.u8 q10, d6, d1
vmlal.u8 q12, d9, d1
vmlal.u8 q14, d12, d1
subs r2, r2, #1
vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
vqrshrn.u16 d15, q8, #7
vqrshrn.u16 d16, q9, #7
vqrshrn.u16 d17, q10, #7
vqrshrn.u16 d18, q11, #7
vqrshrn.u16 d19, q12, #7
vqrshrn.u16 d20, q13, #7
vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
vqrshrn.u16 d21, q14, #7
vld1.u8 {d5, d6, d7}, [r0], r1
vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result
vld1.u8 {d8, d9, d10}, [r0], r1
vst1.u8 {d18, d19, d20, d21}, [lr]!
vld1.u8 {d11, d12, d13}, [r0], r1
bne vp8e_filt_blk2d_fp16x16_loop_neon
;First-pass filtering for the remaining 5 lines
vld1.u8 {d14, d15, d16}, [r0], r1
vmull.u8 q9, d2, d0 ;(src_ptr[0] * Filter[0])
vmull.u8 q10, d3, d0
vmull.u8 q11, d5, d0
vmull.u8 q12, d6, d0
vmull.u8 q13, d8, d0
vmull.u8 q14, d9, d0
vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
vext.8 d5, d5, d6, #1
vext.8 d8, d8, d9, #1
vmlal.u8 q9, d2, d1 ;(src_ptr[1] * Filter[1])
vmlal.u8 q11, d5, d1
vmlal.u8 q13, d8, d1
vext.8 d3, d3, d4, #1
vext.8 d6, d6, d7, #1
vext.8 d9, d9, d10, #1
vmlal.u8 q10, d3, d1 ;(src_ptr[1] * Filter[1])
vmlal.u8 q12, d6, d1
vmlal.u8 q14, d9, d1
vmull.u8 q1, d11, d0
vmull.u8 q2, d12, d0
vmull.u8 q3, d14, d0
vmull.u8 q4, d15, d0
vext.8 d11, d11, d12, #1 ;construct src_ptr[1]
vext.8 d14, d14, d15, #1
vmlal.u8 q1, d11, d1 ;(src_ptr[1] * Filter[1])
vmlal.u8 q3, d14, d1
vext.8 d12, d12, d13, #1
vext.8 d15, d15, d16, #1
vmlal.u8 q2, d12, d1 ;(src_ptr[1] * Filter[1])
vmlal.u8 q4, d15, d1
vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8
vqrshrn.u16 d11, q10, #7
vqrshrn.u16 d12, q11, #7
vqrshrn.u16 d13, q12, #7
vqrshrn.u16 d14, q13, #7
vqrshrn.u16 d15, q14, #7
vqrshrn.u16 d16, q1, #7
vqrshrn.u16 d17, q2, #7
vqrshrn.u16 d18, q3, #7
vqrshrn.u16 d19, q4, #7
vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result
vst1.u8 {d14, d15, d16, d17}, [lr]!
vst1.u8 {d18, d19}, [lr]!
;Second pass: 16x16
;secondpass_filter
add r3, r12, r3, lsl #3
sub lr, lr, #272
vld1.u32 {d31}, [r3] ;load second_pass filter
sub sp, sp, #256
mov r3, sp
vld1.u8 {d22, d23}, [lr]! ;load src data
vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
vdup.8 d1, d31[4]
mov r12, #4 ;loop counter
vp8e_filt_blk2d_sp16x16_loop_neon
vld1.u8 {d24, d25}, [lr]!
vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
vld1.u8 {d26, d27}, [lr]!
vmull.u8 q2, d23, d0
vld1.u8 {d28, d29}, [lr]!
vmull.u8 q3, d24, d0
vld1.u8 {d30, d31}, [lr]!
vmull.u8 q4, d25, d0
vmull.u8 q5, d26, d0
vmull.u8 q6, d27, d0
vmull.u8 q7, d28, d0
vmull.u8 q8, d29, d0
vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1])
vmlal.u8 q2, d25, d1
vmlal.u8 q3, d26, d1
vmlal.u8 q4, d27, d1
vmlal.u8 q5, d28, d1
vmlal.u8 q6, d29, d1
vmlal.u8 q7, d30, d1
vmlal.u8 q8, d31, d1
subs r12, r12, #1
vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
vqrshrn.u16 d3, q2, #7
vqrshrn.u16 d4, q3, #7
vqrshrn.u16 d5, q4, #7
vqrshrn.u16 d6, q5, #7
vqrshrn.u16 d7, q6, #7
vqrshrn.u16 d8, q7, #7
vqrshrn.u16 d9, q8, #7
vst1.u8 {d2, d3}, [r3]! ;store result
vst1.u8 {d4, d5}, [r3]!
vst1.u8 {d6, d7}, [r3]!
vmov q11, q15
vst1.u8 {d8, d9}, [r3]!
bne vp8e_filt_blk2d_sp16x16_loop_neon
b sub_pixel_variance16x16_neon
;--------------------
firstpass_bfilter16x16_only
mov r2, #4 ;loop counter
sub sp, sp, #528 ;reserve space on stack for temporary storage
vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
vdup.8 d1, d31[4]
mov r3, sp
;First Pass: output_height lines x output_width columns (16x16)
vp8e_filt_blk2d_fpo16x16_loop_neon
vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
vld1.u8 {d5, d6, d7}, [r0], r1
vld1.u8 {d8, d9, d10}, [r0], r1
vld1.u8 {d11, d12, d13}, [r0], r1
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0])
vmull.u8 q8, d3, d0
vmull.u8 q9, d5, d0
vmull.u8 q10, d6, d0
vmull.u8 q11, d8, d0
vmull.u8 q12, d9, d0
vmull.u8 q13, d11, d0
vmull.u8 q14, d12, d0
vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
vext.8 d5, d5, d6, #1
vext.8 d8, d8, d9, #1
vext.8 d11, d11, d12, #1
vmlal.u8 q7, d2, d1 ;(src_ptr[1] * Filter[1])
vmlal.u8 q9, d5, d1
vmlal.u8 q11, d8, d1
vmlal.u8 q13, d11, d1
vext.8 d3, d3, d4, #1
vext.8 d6, d6, d7, #1
vext.8 d9, d9, d10, #1
vext.8 d12, d12, d13, #1
vmlal.u8 q8, d3, d1 ;(src_ptr[1] * Filter[1])
vmlal.u8 q10, d6, d1
vmlal.u8 q12, d9, d1
vmlal.u8 q14, d12, d1
subs r2, r2, #1
vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
vqrshrn.u16 d15, q8, #7
vqrshrn.u16 d16, q9, #7
vqrshrn.u16 d17, q10, #7
vqrshrn.u16 d18, q11, #7
vqrshrn.u16 d19, q12, #7
vqrshrn.u16 d20, q13, #7
vst1.u8 {d14, d15}, [r3]! ;store result
vqrshrn.u16 d21, q14, #7
vst1.u8 {d16, d17}, [r3]!
vst1.u8 {d18, d19}, [r3]!
vst1.u8 {d20, d21}, [r3]!
bne vp8e_filt_blk2d_fpo16x16_loop_neon
b sub_pixel_variance16x16_neon
;---------------------
secondpass_bfilter16x16_only
;Second pass: 16x16
;secondpass_filter
sub sp, sp, #528 ;reserve space on stack for temporary storage
add r3, r12, r3, lsl #3
mov r12, #4 ;loop counter
vld1.u32 {d31}, [r3] ;load second_pass filter
vld1.u8 {d22, d23}, [r0], r1 ;load src data
mov r3, sp
vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
vdup.8 d1, d31[4]
vp8e_filt_blk2d_spo16x16_loop_neon
vld1.u8 {d24, d25}, [r0], r1
vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
vld1.u8 {d26, d27}, [r0], r1
vmull.u8 q2, d23, d0
vld1.u8 {d28, d29}, [r0], r1
vmull.u8 q3, d24, d0
vld1.u8 {d30, d31}, [r0], r1
vmull.u8 q4, d25, d0
vmull.u8 q5, d26, d0
vmull.u8 q6, d27, d0
vmull.u8 q7, d28, d0
vmull.u8 q8, d29, d0
vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1])
vmlal.u8 q2, d25, d1
vmlal.u8 q3, d26, d1
vmlal.u8 q4, d27, d1
vmlal.u8 q5, d28, d1
vmlal.u8 q6, d29, d1
vmlal.u8 q7, d30, d1
vmlal.u8 q8, d31, d1
vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
vqrshrn.u16 d3, q2, #7
vqrshrn.u16 d4, q3, #7
vqrshrn.u16 d5, q4, #7
vqrshrn.u16 d6, q5, #7
vqrshrn.u16 d7, q6, #7
vqrshrn.u16 d8, q7, #7
vqrshrn.u16 d9, q8, #7
vst1.u8 {d2, d3}, [r3]! ;store result
subs r12, r12, #1
vst1.u8 {d4, d5}, [r3]!
vmov q11, q15
vst1.u8 {d6, d7}, [r3]!
vst1.u8 {d8, d9}, [r3]!
bne vp8e_filt_blk2d_spo16x16_loop_neon
b sub_pixel_variance16x16_neon
;----------------------------
;variance16x16
sub_pixel_variance16x16_neon
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - sse
vmov.i8 q10, #0
sub r3, r3, #256
mov r12, #8
sub_pixel_variance16x16_neon_loop
vld1.8 {q0}, [r3]! ;Load up source and reference
vld1.8 {q2}, [r4], r5
vld1.8 {q1}, [r3]!
vld1.8 {q3}, [r4], r5
vsubl.u8 q11, d0, d4 ;diff
vsubl.u8 q12, d1, d5
vsubl.u8 q13, d2, d6
vsubl.u8 q14, d3, d7
vpadal.s16 q8, q11 ;sum
vmlal.s16 q9, d22, d22 ;sse
vmlal.s16 q10, d23, d23
subs r12, r12, #1
vpadal.s16 q8, q12
vmlal.s16 q9, d24, d24
vmlal.s16 q10, d25, d25
vpadal.s16 q8, q13
vmlal.s16 q9, d26, d26
vmlal.s16 q10, d27, d27
vpadal.s16 q8, q14
vmlal.s16 q9, d28, d28
vmlal.s16 q10, d29, d29
bne sub_pixel_variance16x16_neon_loop
vadd.u32 q10, q9, q10 ;accumulate sse
vpaddl.s32 q0, q8 ;accumulate sum
vpaddl.u32 q1, q10
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [r6] ;store sse
vshr.s32 d10, d10, #8
vsub.s32 d0, d1, d10
add sp, sp, #528
vmov.32 r0, d0[0] ;return
pop {r4-r6,pc}
ENDP
;-----------------
_BilinearTaps_coeff_
DCD bilinear_taps_coeff
bilinear_taps_coeff
DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
END
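Each bilinear pass above multiplies neighbouring pixels by a tap pair from bilinear_taps_coeff (every pair sums to 128) and narrows with a rounding shift (vqrshrn #7); in scalar form:
static unsigned char bilinear_tap(unsigned char s0, unsigned char s1,
                                  int f0, int f1) {   /* f0 + f1 == 128 */
  return (unsigned char)((s0 * f0 + s1 * f1 + 64) >> 7);
}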

View file

@ -1,572 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp9_variance_halfpixvar16x16_h_neon|
EXPORT |vp9_variance_halfpixvar16x16_v_neon|
EXPORT |vp9_variance_halfpixvar16x16_hv_neon|
EXPORT |vp9_sub_pixel_variance16x16s_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;================================================
;unsigned int vp9_variance_halfpixvar16x16_h_neon
;(
; unsigned char *src_ptr, r0
; int src_pixels_per_line, r1
; unsigned char *dst_ptr, r2
; int dst_pixels_per_line, r3
; unsigned int *sse
;);
;================================================
|vp9_variance_halfpixvar16x16_h_neon| PROC
push {lr}
mov r12, #4 ;loop counter
ldr lr, [sp, #4] ;load *sse from stack
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - sse
vmov.i8 q10, #0
;First Pass: output_height lines x output_width columns (16x16)
vp8_filt_fpo16x16s_4_0_loop_neon
vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
vld1.8 {q11}, [r2], r3
vld1.u8 {d4, d5, d6, d7}, [r0], r1
vld1.8 {q12}, [r2], r3
vld1.u8 {d8, d9, d10, d11}, [r0], r1
vld1.8 {q13}, [r2], r3
vld1.u8 {d12, d13, d14, d15}, [r0], r1
;pld [r0]
;pld [r0, r1]
;pld [r0, r1, lsl #1]
vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
vext.8 q3, q2, q3, #1
vext.8 q5, q4, q5, #1
vext.8 q7, q6, q7, #1
vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
vld1.8 {q14}, [r2], r3
vrhadd.u8 q1, q2, q3
vrhadd.u8 q2, q4, q5
vrhadd.u8 q3, q6, q7
vsubl.u8 q4, d0, d22 ;diff
vsubl.u8 q5, d1, d23
vsubl.u8 q6, d2, d24
vsubl.u8 q7, d3, d25
vsubl.u8 q0, d4, d26
vsubl.u8 q1, d5, d27
vsubl.u8 q2, d6, d28
vsubl.u8 q3, d7, d29
vpadal.s16 q8, q4 ;sum
vmlal.s16 q9, d8, d8 ;sse
vmlal.s16 q10, d9, d9
subs r12, r12, #1
vpadal.s16 q8, q5
vmlal.s16 q9, d10, d10
vmlal.s16 q10, d11, d11
vpadal.s16 q8, q6
vmlal.s16 q9, d12, d12
vmlal.s16 q10, d13, d13
vpadal.s16 q8, q7
vmlal.s16 q9, d14, d14
vmlal.s16 q10, d15, d15
vpadal.s16 q8, q0 ;sum
vmlal.s16 q9, d0, d0 ;sse
vmlal.s16 q10, d1, d1
vpadal.s16 q8, q1
vmlal.s16 q9, d2, d2
vmlal.s16 q10, d3, d3
vpadal.s16 q8, q2
vmlal.s16 q9, d4, d4
vmlal.s16 q10, d5, d5
vpadal.s16 q8, q3
vmlal.s16 q9, d6, d6
vmlal.s16 q10, d7, d7
bne vp8_filt_fpo16x16s_4_0_loop_neon
vadd.u32 q10, q9, q10 ;accumulate sse
vpaddl.s32 q0, q8 ;accumulate sum
vpaddl.u32 q1, q10
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [lr] ;store sse
vshr.s32 d10, d10, #8
vsub.s32 d0, d1, d10
vmov.32 r0, d0[0] ;return
pop {pc}
ENDP
;================================================
;unsigned int vp9_variance_halfpixvar16x16_v_neon
;(
; unsigned char *src_ptr, r0
; int src_pixels_per_line, r1
; unsigned char *dst_ptr, r2
; int dst_pixels_per_line, r3
; unsigned int *sse
;);
;================================================
|vp9_variance_halfpixvar16x16_v_neon| PROC
push {lr}
mov r12, #4 ;loop counter
vld1.u8 {q0}, [r0], r1 ;load src data
ldr lr, [sp, #4] ;load *sse from stack
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - sse
vmov.i8 q10, #0
vp8_filt_spo16x16s_0_4_loop_neon
vld1.u8 {q2}, [r0], r1
vld1.8 {q1}, [r2], r3
vld1.u8 {q4}, [r0], r1
vld1.8 {q3}, [r2], r3
vld1.u8 {q6}, [r0], r1
vld1.8 {q5}, [r2], r3
vld1.u8 {q15}, [r0], r1
vrhadd.u8 q0, q0, q2
vld1.8 {q7}, [r2], r3
vrhadd.u8 q2, q2, q4
vrhadd.u8 q4, q4, q6
vrhadd.u8 q6, q6, q15
vsubl.u8 q11, d0, d2 ;diff
vsubl.u8 q12, d1, d3
vsubl.u8 q13, d4, d6
vsubl.u8 q14, d5, d7
vsubl.u8 q0, d8, d10
vsubl.u8 q1, d9, d11
vsubl.u8 q2, d12, d14
vsubl.u8 q3, d13, d15
vpadal.s16 q8, q11 ;sum
vmlal.s16 q9, d22, d22 ;sse
vmlal.s16 q10, d23, d23
subs r12, r12, #1
vpadal.s16 q8, q12
vmlal.s16 q9, d24, d24
vmlal.s16 q10, d25, d25
vpadal.s16 q8, q13
vmlal.s16 q9, d26, d26
vmlal.s16 q10, d27, d27
vpadal.s16 q8, q14
vmlal.s16 q9, d28, d28
vmlal.s16 q10, d29, d29
vpadal.s16 q8, q0 ;sum
vmlal.s16 q9, d0, d0 ;sse
vmlal.s16 q10, d1, d1
vpadal.s16 q8, q1
vmlal.s16 q9, d2, d2
vmlal.s16 q10, d3, d3
vpadal.s16 q8, q2
vmlal.s16 q9, d4, d4
vmlal.s16 q10, d5, d5
vmov q0, q15
vpadal.s16 q8, q3
vmlal.s16 q9, d6, d6
vmlal.s16 q10, d7, d7
bne vp8_filt_spo16x16s_0_4_loop_neon
vadd.u32 q10, q9, q10 ;accumulate sse
vpaddl.s32 q0, q8 ;accumulate sum
vpaddl.u32 q1, q10
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [lr] ;store sse
vshr.s32 d10, d10, #8
vsub.s32 d0, d1, d10
vmov.32 r0, d0[0] ;return
pop {pc}
ENDP
;================================================
;unsigned int vp9_variance_halfpixvar16x16_hv_neon
;(
; unsigned char *src_ptr, r0
; int src_pixels_per_line, r1
; unsigned char *dst_ptr, r2
; int dst_pixels_per_line, r3
; unsigned int *sse
;);
;================================================
|vp9_variance_halfpixvar16x16_hv_neon| PROC
push {lr}
vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
ldr lr, [sp, #4] ;load *sse from stack
vmov.i8 q13, #0 ;q8 - sum
vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
vmov.i8 q14, #0 ;q9, q10 - sse
vmov.i8 q15, #0
mov r12, #4 ;loop counter
vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
;First Pass: output_height lines x output_width columns (17x16)
vp8_filt16x16s_4_4_loop_neon
vld1.u8 {d4, d5, d6, d7}, [r0], r1
vld1.u8 {d8, d9, d10, d11}, [r0], r1
vld1.u8 {d12, d13, d14, d15}, [r0], r1
vld1.u8 {d16, d17, d18, d19}, [r0], r1
;pld [r0]
;pld [r0, r1]
;pld [r0, r1, lsl #1]
vext.8 q3, q2, q3, #1 ;construct src_ptr[1]
vext.8 q5, q4, q5, #1
vext.8 q7, q6, q7, #1
vext.8 q9, q8, q9, #1
vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
vrhadd.u8 q2, q4, q5
vrhadd.u8 q3, q6, q7
vrhadd.u8 q4, q8, q9
vld1.8 {q5}, [r2], r3
vrhadd.u8 q0, q0, q1
vld1.8 {q6}, [r2], r3
vrhadd.u8 q1, q1, q2
vld1.8 {q7}, [r2], r3
vrhadd.u8 q2, q2, q3
vld1.8 {q8}, [r2], r3
vrhadd.u8 q3, q3, q4
vsubl.u8 q9, d0, d10 ;diff
vsubl.u8 q10, d1, d11
vsubl.u8 q11, d2, d12
vsubl.u8 q12, d3, d13
vsubl.u8 q0, d4, d14 ;diff
vsubl.u8 q1, d5, d15
vsubl.u8 q5, d6, d16
vsubl.u8 q6, d7, d17
vpadal.s16 q13, q9 ;sum
vmlal.s16 q14, d18, d18 ;sse
vmlal.s16 q15, d19, d19
vpadal.s16 q13, q10 ;sum
vmlal.s16 q14, d20, d20 ;sse
vmlal.s16 q15, d21, d21
vpadal.s16 q13, q11 ;sum
vmlal.s16 q14, d22, d22 ;sse
vmlal.s16 q15, d23, d23
vpadal.s16 q13, q12 ;sum
vmlal.s16 q14, d24, d24 ;sse
vmlal.s16 q15, d25, d25
subs r12, r12, #1
vpadal.s16 q13, q0 ;sum
vmlal.s16 q14, d0, d0 ;sse
vmlal.s16 q15, d1, d1
vpadal.s16 q13, q1 ;sum
vmlal.s16 q14, d2, d2 ;sse
vmlal.s16 q15, d3, d3
vpadal.s16 q13, q5 ;sum
vmlal.s16 q14, d10, d10 ;sse
vmlal.s16 q15, d11, d11
vmov q0, q4
vpadal.s16 q13, q6 ;sum
vmlal.s16 q14, d12, d12 ;sse
vmlal.s16 q15, d13, d13
bne vp8_filt16x16s_4_4_loop_neon
vadd.u32 q15, q14, q15 ;accumulate sse
vpaddl.s32 q0, q13 ;accumulate sum
vpaddl.u32 q1, q15
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [lr] ;store sse
vshr.s32 d10, d10, #8
vsub.s32 d0, d1, d10
vmov.32 r0, d0[0] ;return
pop {pc}
ENDP
;==============================
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; stack unsigned char *dst_ptr,
; stack int dst_pixels_per_line,
; stack unsigned int *sse
;note: in vp8_find_best_half_pixel_step() (called when 8 < Speed < 15) and in the first call of
;vp8_find_best_sub_pixel_step() (called when Speed <= 8), xoffset/yoffset can only be 4 or 0, which
;means the filter is either bypassed or its coefficients are {64, 64}. This simplified routine only
;works in that situation.
;note: it can happen that both xoffset and yoffset are zero; that case can be handled in C code later.
|vp9_sub_pixel_variance16x16s_neon| PROC
push {r4, lr}
ldr r4, [sp, #8] ;load *dst_ptr from stack
ldr r12, [sp, #12] ;load dst_pixels_per_line from stack
ldr lr, [sp, #16] ;load *sse from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_bfilter16x16s_only
cmp r3, #0 ;skip second_pass filter if yoffset=0
beq firstpass_bfilter16x16s_only
vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
sub sp, sp, #256 ;reserve space on stack for temporary storage
vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
mov r3, sp
mov r2, #4 ;loop counter
vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
;First Pass: output_height lines x output_width columns (17x16)
vp8e_filt_blk2d_fp16x16s_loop_neon
vld1.u8 {d4, d5, d6, d7}, [r0], r1
vld1.u8 {d8, d9, d10, d11}, [r0], r1
vld1.u8 {d12, d13, d14, d15}, [r0], r1
vld1.u8 {d16, d17, d18, d19}, [r0], r1
;pld [r0]
;pld [r0, r1]
;pld [r0, r1, lsl #1]
vext.8 q3, q2, q3, #1 ;construct src_ptr[1]
vext.8 q5, q4, q5, #1
vext.8 q7, q6, q7, #1
vext.8 q9, q8, q9, #1
vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
vrhadd.u8 q2, q4, q5
vrhadd.u8 q3, q6, q7
vrhadd.u8 q4, q8, q9
vrhadd.u8 q0, q0, q1
vrhadd.u8 q1, q1, q2
vrhadd.u8 q2, q2, q3
vrhadd.u8 q3, q3, q4
subs r2, r2, #1
vst1.u8 {d0, d1 ,d2, d3}, [r3]! ;store result
vmov q0, q4
vst1.u8 {d4, d5, d6, d7}, [r3]!
bne vp8e_filt_blk2d_fp16x16s_loop_neon
b sub_pixel_variance16x16s_neon
;--------------------
firstpass_bfilter16x16s_only
mov r2, #2 ;loop counter
sub sp, sp, #256 ;reserve space on stack for temporary storage
mov r3, sp
;First Pass: output_height lines x output_width columns (16x16)
vp8e_filt_blk2d_fpo16x16s_loop_neon
vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
vld1.u8 {d4, d5, d6, d7}, [r0], r1
vld1.u8 {d8, d9, d10, d11}, [r0], r1
vld1.u8 {d12, d13, d14, d15}, [r0], r1
;pld [r0]
;pld [r0, r1]
;pld [r0, r1, lsl #1]
vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
vld1.u8 {d16, d17, d18, d19}, [r0], r1
vext.8 q3, q2, q3, #1
vld1.u8 {d20, d21, d22, d23}, [r0], r1
vext.8 q5, q4, q5, #1
vld1.u8 {d24, d25, d26, d27}, [r0], r1
vext.8 q7, q6, q7, #1
vld1.u8 {d28, d29, d30, d31}, [r0], r1
vext.8 q9, q8, q9, #1
vext.8 q11, q10, q11, #1
vext.8 q13, q12, q13, #1
vext.8 q15, q14, q15, #1
vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
vrhadd.u8 q1, q2, q3
vrhadd.u8 q2, q4, q5
vrhadd.u8 q3, q6, q7
vrhadd.u8 q4, q8, q9
vrhadd.u8 q5, q10, q11
vrhadd.u8 q6, q12, q13
vrhadd.u8 q7, q14, q15
subs r2, r2, #1
vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result
vst1.u8 {d4, d5, d6, d7}, [r3]!
vst1.u8 {d8, d9, d10, d11}, [r3]!
vst1.u8 {d12, d13, d14, d15}, [r3]!
bne vp8e_filt_blk2d_fpo16x16s_loop_neon
b sub_pixel_variance16x16s_neon
;---------------------
secondpass_bfilter16x16s_only
sub sp, sp, #256 ;reserve space on stack for temporary storage
mov r2, #2 ;loop counter
vld1.u8 {d0, d1}, [r0], r1 ;load src data
mov r3, sp
vp8e_filt_blk2d_spo16x16s_loop_neon
vld1.u8 {d2, d3}, [r0], r1
vld1.u8 {d4, d5}, [r0], r1
vld1.u8 {d6, d7}, [r0], r1
vld1.u8 {d8, d9}, [r0], r1
vrhadd.u8 q0, q0, q1
vld1.u8 {d10, d11}, [r0], r1
vrhadd.u8 q1, q1, q2
vld1.u8 {d12, d13}, [r0], r1
vrhadd.u8 q2, q2, q3
vld1.u8 {d14, d15}, [r0], r1
vrhadd.u8 q3, q3, q4
vld1.u8 {d16, d17}, [r0], r1
vrhadd.u8 q4, q4, q5
vrhadd.u8 q5, q5, q6
vrhadd.u8 q6, q6, q7
vrhadd.u8 q7, q7, q8
subs r2, r2, #1
vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result
vmov q0, q8
vst1.u8 {d4, d5, d6, d7}, [r3]!
vst1.u8 {d8, d9, d10, d11}, [r3]! ;store result
vst1.u8 {d12, d13, d14, d15}, [r3]!
bne vp8e_filt_blk2d_spo16x16s_loop_neon
b sub_pixel_variance16x16s_neon
;----------------------------
;variance16x16
sub_pixel_variance16x16s_neon
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - sse
vmov.i8 q10, #0
sub r3, r3, #256
mov r2, #4
sub_pixel_variance16x16s_neon_loop
vld1.8 {q0}, [r3]! ;Load up source and reference
vld1.8 {q1}, [r4], r12
vld1.8 {q2}, [r3]!
vld1.8 {q3}, [r4], r12
vld1.8 {q4}, [r3]!
vld1.8 {q5}, [r4], r12
vld1.8 {q6}, [r3]!
vld1.8 {q7}, [r4], r12
vsubl.u8 q11, d0, d2 ;diff
vsubl.u8 q12, d1, d3
vsubl.u8 q13, d4, d6
vsubl.u8 q14, d5, d7
vsubl.u8 q0, d8, d10
vsubl.u8 q1, d9, d11
vsubl.u8 q2, d12, d14
vsubl.u8 q3, d13, d15
vpadal.s16 q8, q11 ;sum
vmlal.s16 q9, d22, d22 ;sse
vmlal.s16 q10, d23, d23
subs r2, r2, #1
vpadal.s16 q8, q12
vmlal.s16 q9, d24, d24
vmlal.s16 q10, d25, d25
vpadal.s16 q8, q13
vmlal.s16 q9, d26, d26
vmlal.s16 q10, d27, d27
vpadal.s16 q8, q14
vmlal.s16 q9, d28, d28
vmlal.s16 q10, d29, d29
vpadal.s16 q8, q0 ;sum
vmlal.s16 q9, d0, d0 ;sse
vmlal.s16 q10, d1, d1
vpadal.s16 q8, q1
vmlal.s16 q9, d2, d2
vmlal.s16 q10, d3, d3
vpadal.s16 q8, q2
vmlal.s16 q9, d4, d4
vmlal.s16 q10, d5, d5
vpadal.s16 q8, q3
vmlal.s16 q9, d6, d6
vmlal.s16 q10, d7, d7
bne sub_pixel_variance16x16s_neon_loop
vadd.u32 q10, q9, q10 ;accumulate sse
vpaddl.s32 q0, q8 ;accumulate sum
vpaddl.u32 q1, q10
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [lr] ;store sse
vshr.s32 d10, d10, #8
vsub.s32 d0, d1, d10
add sp, sp, #256
vmov.32 r0, d0[0] ;return
pop {r4, pc}
ENDP
END
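The half-pixel samples above are formed with vrhadd.u8, the rounded average of two pixels:
static unsigned char half_pixel(unsigned char a, unsigned char b) {
  return (unsigned char)((a + b + 1) >> 1);   /* vrhadd.u8 */
}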

View file

@ -1,224 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp9_sub_pixel_variance8x8_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; stack(r4) unsigned char *dst_ptr,
; stack(r5) int dst_pixels_per_line,
; stack(r6) unsigned int *sse
;note: most of the code is copied from bilinear_predict8x8_neon and vp9_variance8x8_neon.
|vp9_sub_pixel_variance8x8_neon| PROC
push {r4-r5, lr}
ldr r12, _BilinearTaps_coeff_
ldr r4, [sp, #12] ;load *dst_ptr from stack
ldr r5, [sp, #16] ;load dst_pixels_per_line from stack
ldr lr, [sp, #20] ;load *sse from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq skip_firstpass_filter
;First pass: output_height lines x output_width columns (9x8)
add r2, r12, r2, lsl #3 ;calculate filter location
vld1.u8 {q1}, [r0], r1 ;load src data
vld1.u32 {d31}, [r2] ;load first_pass filter
vld1.u8 {q2}, [r0], r1
vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
vld1.u8 {q3}, [r0], r1
vdup.8 d1, d31[4]
vld1.u8 {q4}, [r0], r1
vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
vmull.u8 q7, d4, d0
vmull.u8 q8, d6, d0
vmull.u8 q9, d8, d0
vext.8 d3, d2, d3, #1 ;construct src_ptr[1]
vext.8 d5, d4, d5, #1
vext.8 d7, d6, d7, #1
vext.8 d9, d8, d9, #1
vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
vmlal.u8 q7, d5, d1
vmlal.u8 q8, d7, d1
vmlal.u8 q9, d9, d1
vld1.u8 {q1}, [r0], r1 ;load src data
vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
vld1.u8 {q2}, [r0], r1
vqrshrn.u16 d23, q7, #7
vld1.u8 {q3}, [r0], r1
vqrshrn.u16 d24, q8, #7
vld1.u8 {q4}, [r0], r1
vqrshrn.u16 d25, q9, #7
;first_pass filtering on the remaining 5 lines of data
vld1.u8 {q5}, [r0], r1
vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
vmull.u8 q7, d4, d0
vmull.u8 q8, d6, d0
vmull.u8 q9, d8, d0
vmull.u8 q10, d10, d0
vext.8 d3, d2, d3, #1 ;construct src_ptr[1]
vext.8 d5, d4, d5, #1
vext.8 d7, d6, d7, #1
vext.8 d9, d8, d9, #1
vext.8 d11, d10, d11, #1
vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
vmlal.u8 q7, d5, d1
vmlal.u8 q8, d7, d1
vmlal.u8 q9, d9, d1
vmlal.u8 q10, d11, d1
vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8
vqrshrn.u16 d27, q7, #7
vqrshrn.u16 d28, q8, #7
vqrshrn.u16 d29, q9, #7
vqrshrn.u16 d30, q10, #7
;Second pass: 8x8
secondpass_filter
cmp r3, #0 ;skip second_pass filter if yoffset=0
;skip_secondpass_filter
beq sub_pixel_variance8x8_neon
add r3, r12, r3, lsl #3
vld1.u32 {d31}, [r3] ;load second_pass filter
vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
vdup.8 d1, d31[4]
vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
vmull.u8 q2, d23, d0
vmull.u8 q3, d24, d0
vmull.u8 q4, d25, d0
vmull.u8 q5, d26, d0
vmull.u8 q6, d27, d0
vmull.u8 q7, d28, d0
vmull.u8 q8, d29, d0
vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * Filter[1])
vmlal.u8 q2, d24, d1
vmlal.u8 q3, d25, d1
vmlal.u8 q4, d26, d1
vmlal.u8 q5, d27, d1
vmlal.u8 q6, d28, d1
vmlal.u8 q7, d29, d1
vmlal.u8 q8, d30, d1
vqrshrn.u16 d22, q1, #7 ;shift/round/saturate to u8
vqrshrn.u16 d23, q2, #7
vqrshrn.u16 d24, q3, #7
vqrshrn.u16 d25, q4, #7
vqrshrn.u16 d26, q5, #7
vqrshrn.u16 d27, q6, #7
vqrshrn.u16 d28, q7, #7
vqrshrn.u16 d29, q8, #7
b sub_pixel_variance8x8_neon
;--------------------
skip_firstpass_filter
vld1.u8 {d22}, [r0], r1 ;load src data
vld1.u8 {d23}, [r0], r1
vld1.u8 {d24}, [r0], r1
vld1.u8 {d25}, [r0], r1
vld1.u8 {d26}, [r0], r1
vld1.u8 {d27}, [r0], r1
vld1.u8 {d28}, [r0], r1
vld1.u8 {d29}, [r0], r1
vld1.u8 {d30}, [r0], r1
b secondpass_filter
;----------------------
;vp9_variance8x8_neon
sub_pixel_variance8x8_neon
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - sse
vmov.i8 q10, #0
mov r12, #2
sub_pixel_variance8x8_neon_loop
vld1.8 {d0}, [r4], r5 ;load dst data
subs r12, r12, #1
vld1.8 {d1}, [r4], r5
vld1.8 {d2}, [r4], r5
vsubl.u8 q4, d22, d0 ;calculate diff
vld1.8 {d3}, [r4], r5
vsubl.u8 q5, d23, d1
vsubl.u8 q6, d24, d2
vpadal.s16 q8, q4 ;sum
vmlal.s16 q9, d8, d8 ;sse
vmlal.s16 q10, d9, d9
vsubl.u8 q7, d25, d3
vpadal.s16 q8, q5
vmlal.s16 q9, d10, d10
vmlal.s16 q10, d11, d11
vmov q11, q13
vpadal.s16 q8, q6
vmlal.s16 q9, d12, d12
vmlal.s16 q10, d13, d13
vmov q12, q14
vpadal.s16 q8, q7
vmlal.s16 q9, d14, d14
vmlal.s16 q10, d15, d15
bne sub_pixel_variance8x8_neon_loop
vadd.u32 q10, q9, q10 ;accumulate sse
vpaddl.s32 q0, q8 ;accumulate sum
vpaddl.u32 q1, q10
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [lr] ;store sse
vshr.s32 d10, d10, #6
vsub.s32 d0, d1, d10
vmov.32 r0, d0[0] ;return
pop {r4-r5, pc}
ENDP
;-----------------
_BilinearTaps_coeff_
DCD bilinear_taps_coeff
bilinear_taps_coeff
DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
END

View file

@ -1,185 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_subtract_b_neon|
EXPORT |vp8_subtract_mby_neon|
EXPORT |vp8_subtract_mbuv_neon|
INCLUDE vp9_asm_enc_offsets.asm
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch)
|vp8_subtract_b_neon| PROC
stmfd sp!, {r4-r7}
ldr r3, [r0, #vp8_block_base_src]
ldr r4, [r0, #vp8_block_src]
ldr r5, [r0, #vp8_block_src_diff]
ldr r3, [r3]
ldr r6, [r0, #vp8_block_src_stride]
add r3, r3, r4 ; src = *base_src + src
ldr r7, [r1, #vp8_blockd_predictor]
vld1.8 {d0}, [r3], r6 ;load src
vld1.8 {d1}, [r7], r2 ;load pred
vld1.8 {d2}, [r3], r6
vld1.8 {d3}, [r7], r2
vld1.8 {d4}, [r3], r6
vld1.8 {d5}, [r7], r2
vld1.8 {d6}, [r3], r6
vld1.8 {d7}, [r7], r2
vsubl.u8 q10, d0, d1
vsubl.u8 q11, d2, d3
vsubl.u8 q12, d4, d5
vsubl.u8 q13, d6, d7
mov r2, r2, lsl #1
vst1.16 {d20}, [r5], r2 ;store diff
vst1.16 {d22}, [r5], r2
vst1.16 {d24}, [r5], r2
vst1.16 {d26}, [r5], r2
ldmfd sp!, {r4-r7}
bx lr
ENDP
;==========================================
;void vp8_subtract_mby_neon(short *diff, unsigned char *src, unsigned char *pred, int stride)
|vp8_subtract_mby_neon| PROC
mov r12, #4
subtract_mby_loop
vld1.8 {q0}, [r1], r3 ;load src
vld1.8 {q1}, [r2]! ;load pred
vld1.8 {q2}, [r1], r3
vld1.8 {q3}, [r2]!
vld1.8 {q4}, [r1], r3
vld1.8 {q5}, [r2]!
vld1.8 {q6}, [r1], r3
vld1.8 {q7}, [r2]!
vsubl.u8 q8, d0, d2
vsubl.u8 q9, d1, d3
vsubl.u8 q10, d4, d6
vsubl.u8 q11, d5, d7
vsubl.u8 q12, d8, d10
vsubl.u8 q13, d9, d11
vsubl.u8 q14, d12, d14
vsubl.u8 q15, d13, d15
vst1.16 {q8}, [r0]! ;store diff
vst1.16 {q9}, [r0]!
vst1.16 {q10}, [r0]!
vst1.16 {q11}, [r0]!
vst1.16 {q12}, [r0]!
vst1.16 {q13}, [r0]!
vst1.16 {q14}, [r0]!
vst1.16 {q15}, [r0]!
subs r12, r12, #1
bne subtract_mby_loop
bx lr
ENDP
;=================================
;void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
|vp8_subtract_mbuv_neon| PROC
ldr r12, [sp]
;u
add r0, r0, #512 ; short *udiff = diff + 256;
add r3, r3, #256 ; unsigned char *upred = pred + 256;
vld1.8 {d0}, [r1], r12 ;load src
vld1.8 {d1}, [r3]! ;load pred
vld1.8 {d2}, [r1], r12
vld1.8 {d3}, [r3]!
vld1.8 {d4}, [r1], r12
vld1.8 {d5}, [r3]!
vld1.8 {d6}, [r1], r12
vld1.8 {d7}, [r3]!
vld1.8 {d8}, [r1], r12
vld1.8 {d9}, [r3]!
vld1.8 {d10}, [r1], r12
vld1.8 {d11}, [r3]!
vld1.8 {d12}, [r1], r12
vld1.8 {d13}, [r3]!
vld1.8 {d14}, [r1], r12
vld1.8 {d15}, [r3]!
vsubl.u8 q8, d0, d1
vsubl.u8 q9, d2, d3
vsubl.u8 q10, d4, d5
vsubl.u8 q11, d6, d7
vsubl.u8 q12, d8, d9
vsubl.u8 q13, d10, d11
vsubl.u8 q14, d12, d13
vsubl.u8 q15, d14, d15
vst1.16 {q8}, [r0]! ;store diff
vst1.16 {q9}, [r0]!
vst1.16 {q10}, [r0]!
vst1.16 {q11}, [r0]!
vst1.16 {q12}, [r0]!
vst1.16 {q13}, [r0]!
vst1.16 {q14}, [r0]!
vst1.16 {q15}, [r0]!
;v
vld1.8 {d0}, [r2], r12 ;load src
vld1.8 {d1}, [r3]! ;load pred
vld1.8 {d2}, [r2], r12
vld1.8 {d3}, [r3]!
vld1.8 {d4}, [r2], r12
vld1.8 {d5}, [r3]!
vld1.8 {d6}, [r2], r12
vld1.8 {d7}, [r3]!
vld1.8 {d8}, [r2], r12
vld1.8 {d9}, [r3]!
vld1.8 {d10}, [r2], r12
vld1.8 {d11}, [r3]!
vld1.8 {d12}, [r2], r12
vld1.8 {d13}, [r3]!
vld1.8 {d14}, [r2], r12
vld1.8 {d15}, [r3]!
vsubl.u8 q8, d0, d1
vsubl.u8 q9, d2, d3
vsubl.u8 q10, d4, d5
vsubl.u8 q11, d6, d7
vsubl.u8 q12, d8, d9
vsubl.u8 q13, d10, d11
vsubl.u8 q14, d12, d13
vsubl.u8 q15, d14, d15
vst1.16 {q8}, [r0]! ;store diff
vst1.16 {q9}, [r0]!
vst1.16 {q10}, [r0]!
vst1.16 {q11}, [r0]!
vst1.16 {q12}, [r0]!
vst1.16 {q13}, [r0]!
vst1.16 {q14}, [r0]!
vst1.16 {q15}, [r0]!
bx lr
ENDP
END
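In scalar C, the luma path above amounts to the following (a sketch written against the prototype in the comment above, not the project's reference implementation):

/* Compute the 16x16 luma residual: diff = src - pred, row by row.
 * The prediction buffer is contiguous (16 bytes per row), matching
 * the post-incremented loads above. */
void subtract_mby_sketch(short *diff, unsigned char *src,
                         unsigned char *pred, int stride) {
  int r, c;
  for (r = 0; r < 16; r++) {
    for (c = 0; c < 16; c++)
      diff[c] = (short)(src[c] - pred[c]);
    diff += 16;
    pred += 16;
    src += stride;
  }
}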

View file

@ -1,276 +0,0 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp9_variance16x16_neon|
EXPORT |vp9_variance16x8_neon|
EXPORT |vp9_variance8x16_neon|
EXPORT |vp9_variance8x8_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|vp9_variance16x16_neon| PROC
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - sse
vmov.i8 q10, #0
mov r12, #8
variance16x16_neon_loop
vld1.8 {q0}, [r0], r1 ;Load up source and reference
vld1.8 {q2}, [r2], r3
vld1.8 {q1}, [r0], r1
vld1.8 {q3}, [r2], r3
vsubl.u8 q11, d0, d4 ;calculate diff
vsubl.u8 q12, d1, d5
vsubl.u8 q13, d2, d6
vsubl.u8 q14, d3, d7
;VPADAL adds adjacent pairs of elements of a vector and accumulates
;the results into the elements of the destination vector. The
;explanation in the ARM guide is wrong.
vpadal.s16 q8, q11 ;calculate sum
vmlal.s16 q9, d22, d22 ;calculate sse
vmlal.s16 q10, d23, d23
subs r12, r12, #1
vpadal.s16 q8, q12
vmlal.s16 q9, d24, d24
vmlal.s16 q10, d25, d25
vpadal.s16 q8, q13
vmlal.s16 q9, d26, d26
vmlal.s16 q10, d27, d27
vpadal.s16 q8, q14
vmlal.s16 q9, d28, d28
vmlal.s16 q10, d29, d29
bne variance16x16_neon_loop
vadd.u32 q10, q9, q10 ;accumulate sse
vpaddl.s32 q0, q8 ;accumulate sum
ldr r12, [sp] ;load *sse from stack
vpaddl.u32 q1, q10
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3
;vmov.32 r0, d0[0] ;this instruction costs a lot
;vmov.32 r1, d1[0]
;mul r0, r0, r0
;str r1, [r12]
;sub r0, r1, r0, asr #8
;sum is in [-255x256, 255x256]. sum*sum is 32-bit. The right shift must
;sign-extend, which is what vshr.s does. Have to use s32 to make it right.
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [r12] ;store sse
vshr.s32 d10, d10, #8
vsub.s32 d0, d1, d10
vmov.32 r0, d0[0] ;return
bx lr
ENDP
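In C terms, the epilogue above computes the following (a sketch of the arithmetic only, with illustrative names; not the project's reference code):

#include <stdint.h>

/* Final step of a 16x16 variance: sse and sum were accumulated over
 * the 256 pixels, hence the shift by 8 (sum * sum / 256). */
unsigned int variance16x16_result(int sum, unsigned int sse,
                                  unsigned int *sse_out) {
  *sse_out = sse;
  return sse - (unsigned int)(((int64_t)sum * sum) >> 8);
}

The 16x8 and 8x16 variants below shift by 7 (128 pixels) and the 8x8 variant by 6 (64 pixels).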
;================================
;unsigned int vp9_variance16x8_c(
; unsigned char *src_ptr,
; int source_stride,
; unsigned char *ref_ptr,
; int recon_stride,
; unsigned int *sse)
|vp9_variance16x8_neon| PROC
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - sse
vmov.i8 q10, #0
mov r12, #4
variance16x8_neon_loop
vld1.8 {q0}, [r0], r1 ;Load up source and reference
vld1.8 {q2}, [r2], r3
vld1.8 {q1}, [r0], r1
vld1.8 {q3}, [r2], r3
vsubl.u8 q11, d0, d4 ;calculate diff
vsubl.u8 q12, d1, d5
vsubl.u8 q13, d2, d6
vsubl.u8 q14, d3, d7
vpadal.s16 q8, q11 ;calculate sum
vmlal.s16 q9, d22, d22 ;calculate sse
vmlal.s16 q10, d23, d23
subs r12, r12, #1
vpadal.s16 q8, q12
vmlal.s16 q9, d24, d24
vmlal.s16 q10, d25, d25
vpadal.s16 q8, q13
vmlal.s16 q9, d26, d26
vmlal.s16 q10, d27, d27
vpadal.s16 q8, q14
vmlal.s16 q9, d28, d28
vmlal.s16 q10, d29, d29
bne variance16x8_neon_loop
vadd.u32 q10, q9, q10 ;accumulate sse
vpaddl.s32 q0, q8 ;accumulate sum
ldr r12, [sp] ;load *sse from stack
vpaddl.u32 q1, q10
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [r12] ;store sse
vshr.s32 d10, d10, #7
vsub.s32 d0, d1, d10
vmov.32 r0, d0[0] ;return
bx lr
ENDP
;=================================
;unsigned int vp9_variance8x16_c(
; unsigned char *src_ptr,
; int source_stride,
; unsigned char *ref_ptr,
; int recon_stride,
; unsigned int *sse)
|vp9_variance8x16_neon| PROC
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - sse
vmov.i8 q10, #0
mov r12, #8
variance8x16_neon_loop
vld1.8 {d0}, [r0], r1 ;Load up source and reference
vld1.8 {d4}, [r2], r3
vld1.8 {d2}, [r0], r1
vld1.8 {d6}, [r2], r3
vsubl.u8 q11, d0, d4 ;calculate diff
vsubl.u8 q12, d2, d6
vpadal.s16 q8, q11 ;calculate sum
vmlal.s16 q9, d22, d22 ;calculate sse
vmlal.s16 q10, d23, d23
subs r12, r12, #1
vpadal.s16 q8, q12
vmlal.s16 q9, d24, d24
vmlal.s16 q10, d25, d25
bne variance8x16_neon_loop
vadd.u32 q10, q9, q10 ;accumulate sse
vpaddl.s32 q0, q8 ;accumulate sum
ldr r12, [sp] ;load *sse from stack
vpaddl.u32 q1, q10
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [r12] ;store sse
vshr.s32 d10, d10, #7
vsub.s32 d0, d1, d10
vmov.32 r0, d0[0] ;return
bx lr
ENDP
;==================================
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|vp9_variance8x8_neon| PROC
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - sse
vmov.i8 q10, #0
mov r12, #2
variance8x8_neon_loop
vld1.8 {d0}, [r0], r1 ;Load up source and reference
vld1.8 {d4}, [r2], r3
vld1.8 {d1}, [r0], r1
vld1.8 {d5}, [r2], r3
vld1.8 {d2}, [r0], r1
vld1.8 {d6}, [r2], r3
vld1.8 {d3}, [r0], r1
vld1.8 {d7}, [r2], r3
vsubl.u8 q11, d0, d4 ;calculate diff
vsubl.u8 q12, d1, d5
vsubl.u8 q13, d2, d6
vsubl.u8 q14, d3, d7
vpadal.s16 q8, q11 ;calculate sum
vmlal.s16 q9, d22, d22 ;calculate sse
vmlal.s16 q10, d23, d23
subs r12, r12, #1
vpadal.s16 q8, q12
vmlal.s16 q9, d24, d24
vmlal.s16 q10, d25, d25
vpadal.s16 q8, q13
vmlal.s16 q9, d26, d26
vmlal.s16 q10, d27, d27
vpadal.s16 q8, q14
vmlal.s16 q9, d28, d28
vmlal.s16 q10, d29, d29
bne variance8x8_neon_loop
vadd.u32 q10, q9, q10 ;accumulate sse
vpaddl.s32 q0, q8 ;accumulate sum
ldr r12, [sp] ;load *sse from stack
vpaddl.u32 q1, q10
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [r12] ;store sse
vshr.s32 d10, d10, #6
vsub.s32 d0, d1, d10
vmov.32 r0, d0[0] ;return
bx lr
ENDP
END

View file

@ -1,129 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_ports/config.h"
#include "vpx_ports/arm.h"
#include "vp9/encoder/vp9_variance.h"
#include "vp9/encoder/vp9_onyx_int.h"
extern void (*vp9_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
extern void vp9_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
extern void vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
void vp9_arch_arm_encoder_init(VP9_COMP *cpi) {
#if CONFIG_RUNTIME_CPU_DETECT
int flags = cpi->common.rtcd.flags;
#if HAVE_ARMV5TE
if (flags & HAS_EDSP) {
}
#endif
#if HAVE_ARMV6
if (flags & HAS_MEDIA) {
cpi->rtcd.variance.sad16x16 = vp9_sad16x16_armv6;
/*cpi->rtcd.variance.sad16x8 = vp9_sad16x8_c;
cpi->rtcd.variance.sad8x16 = vp9_sad8x16_c;
cpi->rtcd.variance.sad8x8 = vp9_sad8x8_c;
cpi->rtcd.variance.sad4x4 = vp9_sad4x4_c;*/
/*cpi->rtcd.variance.var4x4 = vp9_variance4x4_c;*/
cpi->rtcd.variance.var8x8 = vp9_variance8x8_armv6;
/*cpi->rtcd.variance.var8x16 = vp9_variance8x16_c;
cpi->rtcd.variance.var16x8 = vp9_variance16x8_c;*/
cpi->rtcd.variance.var16x16 = vp9_variance16x16_armv6;
/*cpi->rtcd.variance.subpixvar4x4 = vp9_sub_pixel_variance4x4_c;*/
cpi->rtcd.variance.subpixvar8x8 = vp9_sub_pixel_variance8x8_armv6;
/*cpi->rtcd.variance.subpixvar8x16 = vp9_sub_pixel_variance8x16_c;
cpi->rtcd.variance.subpixvar16x8 = vp9_sub_pixel_variance16x8_c;*/
cpi->rtcd.variance.subpixvar16x16 = vp9_sub_pixel_variance16x16_armv6;
cpi->rtcd.variance.halfpixvar16x16_h = vp9_variance_halfpixvar16x16_h_armv6;
cpi->rtcd.variance.halfpixvar16x16_v = vp9_variance_halfpixvar16x16_v_armv6;
cpi->rtcd.variance.halfpixvar16x16_hv = vp9_variance_halfpixvar16x16_hv_armv6;
cpi->rtcd.variance.mse16x16 = vp9_mse16x16_armv6;
/*cpi->rtcd.variance.getmbss = vp9_get_mb_ss_c;*/
cpi->rtcd.fdct.short4x4 = vp9_short_fdct4x4_armv6;
cpi->rtcd.fdct.short8x4 = vp9_short_fdct8x4_armv6;
cpi->rtcd.fdct.fast4x4 = vp9_short_fdct4x4_armv6;
cpi->rtcd.fdct.fast8x4 = vp9_short_fdct8x4_armv6;
cpi->rtcd.fdct.walsh_short4x4 = vp9_short_walsh4x4_armv6;
/*cpi->rtcd.encodemb.berr = vp9_block_error_c;
cpi->rtcd.encodemb.mberr = vp9_mbblock_error_c;
cpi->rtcd.encodemb.mbuverr = vp9_mbuverror_c;*/
cpi->rtcd.encodemb.subb = vp9_subtract_b_armv6;
cpi->rtcd.encodemb.submby = vp9_subtract_mby_armv6;
cpi->rtcd.encodemb.submbuv = vp9_subtract_mbuv_armv6;
/*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;*/
cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_armv6;
}
#endif
#if HAVE_ARMV7
if (flags & HAS_NEON) {
cpi->rtcd.variance.sad16x16 = vp9_sad16x16_neon;
cpi->rtcd.variance.sad16x8 = vp9_sad16x8_neon;
cpi->rtcd.variance.sad8x16 = vp9_sad8x16_neon;
cpi->rtcd.variance.sad8x8 = vp9_sad8x8_neon;
cpi->rtcd.variance.sad4x4 = vp9_sad4x4_neon;
/*cpi->rtcd.variance.var4x4 = vp9_variance4x4_c;*/
cpi->rtcd.variance.var8x8 = vp9_variance8x8_neon;
cpi->rtcd.variance.var8x16 = vp9_variance8x16_neon;
cpi->rtcd.variance.var16x8 = vp9_variance16x8_neon;
cpi->rtcd.variance.var16x16 = vp9_variance16x16_neon;
/*cpi->rtcd.variance.subpixvar4x4 = vp9_sub_pixel_variance4x4_c;*/
cpi->rtcd.variance.subpixvar8x8 = vp9_sub_pixel_variance8x8_neon;
/*cpi->rtcd.variance.subpixvar8x16 = vp9_sub_pixel_variance8x16_c;
cpi->rtcd.variance.subpixvar16x8 = vp9_sub_pixel_variance16x8_c;*/
cpi->rtcd.variance.subpixvar16x16 = vp9_sub_pixel_variance16x16_neon;
cpi->rtcd.variance.halfpixvar16x16_h = vp9_variance_halfpixvar16x16_h_neon;
cpi->rtcd.variance.halfpixvar16x16_v = vp9_variance_halfpixvar16x16_v_neon;
cpi->rtcd.variance.halfpixvar16x16_hv = vp9_variance_halfpixvar16x16_hv_neon;
cpi->rtcd.variance.mse16x16 = vp9_mse16x16_neon;
/*cpi->rtcd.variance.getmbss = vp9_get_mb_ss_c;*/
cpi->rtcd.fdct.short4x4 = vp9_short_fdct4x4_neon;
cpi->rtcd.fdct.short8x4 = vp9_short_fdct8x4_neon;
cpi->rtcd.fdct.fast4x4 = vp9_short_fdct4x4_neon;
cpi->rtcd.fdct.fast8x4 = vp9_short_fdct8x4_neon;
cpi->rtcd.fdct.walsh_short4x4 = vp9_short_walsh4x4_neon;
/*cpi->rtcd.encodemb.berr = vp9_block_error_c;
cpi->rtcd.encodemb.mberr = vp9_mbblock_error_c;
cpi->rtcd.encodemb.mbuverr = vp9_mbuverror_c;*/
cpi->rtcd.encodemb.subb = vp9_subtract_b_neon;
cpi->rtcd.encodemb.submby = vp9_subtract_mby_neon;
cpi->rtcd.encodemb.submbuv = vp9_subtract_mbuv_neon;
/*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;
cpi->rtcd.quantize.quantb_pair = vp8_regular_quantize_b_pair;*/
cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_neon;
cpi->rtcd.quantize.fastquantb_pair = vp8_fast_quantize_b_pair_neon;
}
#endif
#if HAVE_ARMV7
#if CONFIG_RUNTIME_CPU_DETECT
if (flags & HAS_NEON)
#endif
{
vp9_yv12_copy_partial_frame_ptr = vpxyv12_copy_partial_frame_neon;
}
#endif
#endif
}

View file

@ -1,33 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vp9/encoder/vp9_boolhuff.h"
#include "vp9/common/vp9_blockd.h"
const unsigned int vp9_prob_cost[256] = {
2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046,
1023, 1000, 979, 959, 940, 922, 905, 889, 873, 858, 843, 829, 816, 803, 790, 778,
767, 755, 744, 733, 723, 713, 703, 693, 684, 675, 666, 657, 649, 641, 633, 625,
617, 609, 602, 594, 587, 580, 573, 567, 560, 553, 547, 541, 534, 528, 522, 516,
511, 505, 499, 494, 488, 483, 477, 472, 467, 462, 457, 452, 447, 442, 437, 433,
428, 424, 419, 415, 410, 406, 401, 397, 393, 389, 385, 381, 377, 373, 369, 365,
361, 357, 353, 349, 346, 342, 338, 335, 331, 328, 324, 321, 317, 314, 311, 307,
304, 301, 297, 294, 291, 288, 285, 281, 278, 275, 272, 269, 266, 263, 260, 257,
255, 252, 249, 246, 243, 240, 238, 235, 232, 229, 227, 224, 221, 219, 216, 214,
211, 208, 206, 203, 201, 198, 196, 194, 191, 189, 186, 184, 181, 179, 177, 174,
172, 170, 168, 165, 163, 161, 159, 156, 154, 152, 150, 148, 145, 143, 141, 139,
137, 135, 133, 131, 129, 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107,
105, 103, 101, 99, 97, 95, 93, 92, 90, 88, 86, 84, 82, 81, 79, 77,
75, 73, 72, 70, 68, 66, 65, 63, 61, 60, 58, 56, 55, 53, 51, 50,
48, 46, 45, 43, 41, 40, 38, 37, 35, 33, 32, 30, 29, 27, 25, 24,
22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6, 4, 3, 1, 1
};
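The entries are scaled negative log-probabilities: entry p costs roughly 256 * log2(256 / p), the information content of a symbol with probability p/256 in 1/256-bit units. A sketch of a generator for such a table (the committed values differ from this by about one unit due to rounding):

#include <math.h>

/* Fill a 256-entry prob-cost table: cost(p) ~ -log2(p / 256) * 256.
 * p = 0 is clamped to p = 1 so entry 0 stays finite. */
static void build_prob_cost(unsigned int table[256]) {
  int p;
  for (p = 0; p < 256; p++) {
    int q = p ? p : 1;
    table[p] = (unsigned int)(-log2(q / 256.0) * 256.0);
  }
}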

View file

@ -1,21 +0,0 @@
/*
* Copyright (c) 2011 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_config.h"
#include "./vp9_rtcd.h"
#if HAVE_ARMV6
void vp9_short_fdct8x4_armv6(short *input, short *output, int pitch) {
  /* An 8x4 block is two 4x4 blocks side by side: transform the left
   * half, then the right half (input + 4), whose 16 coefficients land
   * in the second output block (output + 16). */
  vp9_short_fdct4x4_armv6(input, output, pitch);
  vp9_short_fdct4x4_armv6(input + 4, output + 16, pitch);
}
#endif /* HAVE_ARMV6 */

View file

@ -1,65 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_ENCODER_ARM_VP9_DCT_ARM_H_
#define VP9_ENCODER_ARM_VP9_DCT_ARM_H_
#if HAVE_ARMV6
extern prototype_fdct(vp9_short_walsh4x4_armv6);
extern prototype_fdct(vp9_short_fdct4x4_armv6);
extern prototype_fdct(vp9_short_fdct8x4_armv6);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_fdct_walsh_short4x4
#define vp8_fdct_walsh_short4x4 vp9_short_walsh4x4_armv6
#undef vp8_fdct_short4x4
#define vp8_fdct_short4x4 vp9_short_fdct4x4_armv6
#undef vp8_fdct_short8x4
#define vp8_fdct_short8x4 vp9_short_fdct8x4_armv6
#undef vp8_fdct_fast4x4
#define vp8_fdct_fast4x4 vp9_short_fdct4x4_armv6
#undef vp8_fdct_fast8x4
#define vp8_fdct_fast8x4 vp9_short_fdct8x4_armv6
#endif
#endif /* HAVE_ARMV6 */
#if HAVE_ARMV7
extern prototype_fdct(vp9_short_fdct4x4_neon);
extern prototype_fdct(vp9_short_fdct8x4_neon);
extern prototype_fdct(vp8_fast_fdct4x4_neon);
extern prototype_fdct(vp8_fast_fdct8x4_neon);
extern prototype_fdct(vp9_short_walsh4x4_neon);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_fdct_short4x4
#define vp8_fdct_short4x4 vp9_short_fdct4x4_neon
#undef vp8_fdct_short8x4
#define vp8_fdct_short8x4 vp9_short_fdct8x4_neon
#undef vp8_fdct_fast4x4
#define vp8_fdct_fast4x4 vp9_short_fdct4x4_neon
#undef vp8_fdct_fast8x4
#define vp8_fdct_fast8x4 vp9_short_fdct8x4_neon
#undef vp8_fdct_walsh_short4x4
#define vp8_fdct_walsh_short4x4 vp9_short_walsh4x4_neon
#endif
#endif
#endif
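When CONFIG_RUNTIME_CPU_DETECT is off, the #undef/#define pairs above rebind the generic dispatch names to the ARM implementations at compile time, so no function-pointer patching happens at run time. Schematically (a simplified sketch with illustrative names, not the project's actual macros):

/* Generic and platform implementations share one dispatch name. */
void my_op_c(short *input, short *output, int pitch);
void my_op_neon(short *input, short *output, int pitch);

#define my_op my_op_c              /* default binding */

#if HAVE_ARMV7 && !CONFIG_RUNTIME_CPU_DETECT
#undef my_op
#define my_op my_op_neon           /* rebound at compile time */
#endif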

View file

@ -1,64 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_ENCODER_ARM_VP9_ENCODEMB_ARM_H_
#define VP9_ENCODER_ARM_VP9_ENCODEMB_ARM_H_
#if HAVE_ARMV6
extern prototype_subb(vp9_subtract_b_armv6);
extern prototype_submby(vp9_subtract_mby_armv6);
extern prototype_submbuv(vp9_subtract_mbuv_armv6);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_encodemb_subb
#define vp8_encodemb_subb vp9_subtract_b_armv6
#undef vp8_encodemb_submby
#define vp8_encodemb_submby vp9_subtract_mby_armv6
#undef vp8_encodemb_submbuv
#define vp8_encodemb_submbuv vp9_subtract_mbuv_armv6
#endif
#endif /* HAVE_ARMV6 */
#if HAVE_ARMV7
// extern prototype_berr(vp9_block_error_c);
// extern prototype_mberr(vp9_mbblock_error_c);
// extern prototype_mbuverr(vp9_mbuverror_c);
extern prototype_subb(vp9_subtract_b_neon);
extern prototype_submby(vp9_subtract_mby_neon);
extern prototype_submbuv(vp9_subtract_mbuv_neon);
// #undef vp8_encodemb_berr
// #define vp8_encodemb_berr vp9_block_error_c
// #undef vp8_encodemb_mberr
// #define vp8_encodemb_mberr vp9_mbblock_error_c
// #undef vp8_encodemb_mbuverr
// #define vp8_encodemb_mbuverr vp9_mbuverror_c
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_encodemb_subb
#define vp8_encodemb_subb vp9_subtract_b_neon
#undef vp8_encodemb_submby
#define vp8_encodemb_submby vp9_subtract_mby_neon
#undef vp8_encodemb_submbuv
#define vp8_encodemb_submbuv vp9_subtract_mbuv_neon
#endif
#endif
#endif

View file

@ -1,57 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <math.h>
#include "vpx_mem/vpx_mem.h"
#include "vp9/encoder/vp9_quantize.h"
#include "vp9/common/vp9_entropy.h"
#if HAVE_ARMV7
/* The vp8_quantize_mbX functions here differ from the corresponding ones
 * in vp9_quantize.c only by using the quantize_b_pair function pointer
 * instead of the regular quantize_b function pointer. */
void vp8_quantize_mby_neon(MACROBLOCK *x) {
int i;
int has_2nd_order = get_2nd_order_usage(&x->e_mbd);
for (i = 0; i < 16; i += 2)
x->quantize_b_pair(&x->block[i], &x->block[i + 1],
&x->e_mbd.block[i], &x->e_mbd.block[i + 1]);
if (has_2nd_order)
x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
}
void vp8_quantize_mb_neon(MACROBLOCK *x) {
int i;
int has_2nd_order = get_2nd_order_usage(&x->e_mbd);
for (i = 0; i < 24; i += 2)
x->quantize_b_pair(&x->block[i], &x->block[i + 1],
&x->e_mbd.block[i], &x->e_mbd.block[i + 1]);
if (has_2nd_order)
  x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
}
void vp8_quantize_mbuv_neon(MACROBLOCK *x) {
int i;
for (i = 16; i < 24; i += 2)
x->quantize_b_pair(&x->block[i], &x->block[i + 1],
&x->e_mbd.block[i], &x->e_mbd.block[i + 1]);
}
#endif /* HAVE_ARMV7 */
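The generic counterparts that the comment above refers to walk the same block ranges one block at a time through the scalar quantize_b pointer; for example (a sketch under that description, assuming the MACROBLOCK definitions included above, not code from vp9_quantize.c):

void quantize_mbuv_scalar_sketch(MACROBLOCK *x) {
  int i;
  /* Chroma blocks occupy indices 16..23; quantize them singly. */
  for (i = 16; i < 24; i++)
    x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
}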

Some files were not shown because too many files changed in this diff.