From 34591b54dda4a25f42aa5add71b625b2600f6d6a Mon Sep 17 00:00:00 2001 From: Johann Date: Sun, 2 Dec 2012 14:14:00 -0800 Subject: [PATCH] Remove ARM optimizations from VP9 Change-Id: I9f0ae635fb9a95c4aa1529c177ccb07e2b76970b --- libs.mk | 1 - vp8/vp8dx.mk | 24 - .../arm/armv6/vp9_bilinearfilter_v6.asm | 237 --- vp9/common/arm/armv6/vp9_copymem16x16_v6.asm | 186 --- vp9/common/arm/armv6/vp9_copymem8x4_v6.asm | 128 -- vp9/common/arm/armv6/vp9_copymem8x8_v6.asm | 128 -- .../arm/armv6/vp9_dc_only_idct_add_v6.asm | 67 - vp9/common/arm/armv6/vp9_filter_v6.asm | 624 -------- vp9/common/arm/armv6/vp9_idct_v6.asm | 345 ----- vp9/common/arm/armv6/vp9_iwalsh_v6.asm | 152 -- vp9/common/arm/armv6/vp9_loopfilter_v6.asm | 1282 ----------------- vp9/common/arm/armv6/vp9_recon_v6.asm | 281 ---- .../arm/armv6/vp9_simpleloopfilter_v6.asm | 286 ---- .../arm/armv6/vp9_sixtappredict8x4_v6.asm | 273 ---- .../neon/vp9_bilinearpredict16x16_neon.asm | 357 ----- .../arm/neon/vp9_bilinearpredict4x4_neon.asm | 130 -- .../arm/neon/vp9_bilinearpredict8x4_neon.asm | 135 -- .../arm/neon/vp9_bilinearpredict8x8_neon.asm | 183 --- .../neon/vp9_buildintrapredictorsmby_neon.asm | 584 -------- vp9/common/arm/neon/vp9_copymem16x16_neon.asm | 59 - vp9/common/arm/neon/vp9_copymem8x4_neon.asm | 34 - vp9/common/arm/neon/vp9_copymem8x8_neon.asm | 43 - .../arm/neon/vp9_dc_only_idct_add_neon.asm | 49 - vp9/common/arm/neon/vp9_iwalsh_neon.asm | 80 - vp9/common/arm/neon/vp9_loopfilter_neon.asm | 397 ----- ...p9_loopfiltersimplehorizontaledge_neon.asm | 117 -- .../vp9_loopfiltersimpleverticaledge_neon.asm | 154 -- vp9/common/arm/neon/vp9_mbloopfilter_neon.asm | 469 ------ vp9/common/arm/neon/vp9_recon16x16mb_neon.asm | 131 -- vp9/common/arm/neon/vp9_recon2b_neon.asm | 54 - vp9/common/arm/neon/vp9_recon4b_neon.asm | 69 - vp9/common/arm/neon/vp9_recon_neon.c | 29 - vp9/common/arm/neon/vp9_reconb_neon.asm | 61 - vp9/common/arm/neon/vp9_save_neon_reg.asm | 36 - .../arm/neon/vp9_shortidct4x4llm_1_neon.asm | 67 - .../arm/neon/vp9_shortidct4x4llm_neon.asm | 122 -- .../arm/neon/vp9_sixtappredict16x16_neon.asm | 490 ------- .../arm/neon/vp9_sixtappredict4x4_neon.asm | 422 ------ .../arm/neon/vp9_sixtappredict8x4_neon.asm | 473 ------ .../arm/neon/vp9_sixtappredict8x8_neon.asm | 524 ------- vp9/common/arm/vp9_arm_systemdependent.c | 91 -- vp9/common/arm/vp9_bilinearfilter_arm.c | 108 -- vp9/common/arm/vp9_bilinearfilter_arm.h | 35 - vp9/common/arm/vp9_filter_arm.c | 198 --- vp9/common/arm/vp9_idct_arm.h | 65 - vp9/common/arm/vp9_loopfilter_arm.c | 166 --- vp9/common/arm/vp9_loopfilter_arm.h | 41 - vp9/common/arm/vp9_recon_arm.h | 90 -- vp9/common/arm/vp9_reconintra_arm.c | 62 - vp9/common/arm/vp9_subpixel_arm.h | 89 -- vp9/common/vp9_asm_com_offsets.c | 19 - vp9/common/vp9_loopfilter.h | 8 - vp9/common/vp9_rtcd_defs.sh | 33 +- .../arm/armv6/vp9_dequant_dc_idct_v6.asm | 218 --- vp9/decoder/arm/armv6/vp9_dequant_idct_v6.asm | 196 --- vp9/decoder/arm/armv6/vp9_dequantize_v6.asm | 69 - vp9/decoder/arm/armv6/vp9_idct_blk_v6.c | 137 -- .../arm/neon/vp9_dequant_idct_neon.asm | 129 -- vp9/decoder/arm/neon/vp9_dequantizeb_neon.asm | 34 - vp9/decoder/arm/neon/vp9_idct_blk_neon.c | 113 -- .../arm/neon/vp9_idct_dequant_0_2x_neon.asm | 79 - .../neon/vp9_idct_dequant_dc_0_2x_neon.asm | 69 - .../neon/vp9_idct_dequant_dc_full_2x_neon.asm | 205 --- .../neon/vp9_idct_dequant_full_2x_neon.asm | 197 --- vp9/decoder/arm/vp9_dequantize_arm.c | 44 - vp9/decoder/vp9_onyxd_if.c | 52 - .../arm/armv5te/vp9_boolhuff_armv5te.asm | 286 ---- .../arm/armv5te/vp9_packtokens_armv5.asm | 291 ---- .../armv5te/vp9_packtokens_mbrow_armv5.asm | 327 ----- .../vp9_packtokens_partitions_armv5.asm | 465 ------ .../arm/armv6/vp9_fast_quantize_b_armv6.asm | 223 --- vp9/encoder/arm/armv6/vp9_mse16x16_armv6.asm | 138 -- vp9/encoder/arm/armv6/vp9_sad16x16_armv6.asm | 95 -- .../arm/armv6/vp9_short_fdct4x4_armv6.asm | 262 ---- vp9/encoder/arm/armv6/vp9_subtract_armv6.asm | 264 ---- .../arm/armv6/vp9_variance16x16_armv6.asm | 153 -- .../arm/armv6/vp9_variance8x8_armv6.asm | 101 -- .../vp9_variance_halfpixvar16x16_h_armv6.asm | 181 --- .../vp9_variance_halfpixvar16x16_hv_armv6.asm | 222 --- .../vp9_variance_halfpixvar16x16_v_armv6.asm | 183 --- vp9/encoder/arm/armv6/vp9_walsh_v6.asm | 212 --- .../arm/neon/vp9_fastquantizeb_neon.asm | 261 ---- vp9/encoder/arm/neon/vp9_memcpy_neon.asm | 68 - vp9/encoder/arm/neon/vp9_mse16x16_neon.asm | 116 -- vp9/encoder/arm/neon/vp9_picklpf_arm.c | 48 - vp9/encoder/arm/neon/vp9_sad16_neon.asm | 207 --- vp9/encoder/arm/neon/vp9_sad8_neon.asm | 209 --- vp9/encoder/arm/neon/vp9_shortfdct_neon.asm | 221 --- .../arm/neon/vp9_shortwalsh4x4_neon.asm | 103 -- .../neon/vp9_subpixelvariance16x16_neon.asm | 425 ------ .../neon/vp9_subpixelvariance16x16s_neon.asm | 572 -------- .../arm/neon/vp9_subpixelvariance8x8_neon.asm | 224 --- vp9/encoder/arm/neon/vp9_subtract_neon.asm | 185 --- vp9/encoder/arm/neon/vp9_variance_neon.asm | 276 ---- vp9/encoder/arm/vp9_arm_csystemdependent.c | 129 -- vp9/encoder/arm/vp9_boolhuff_arm.c | 33 - vp9/encoder/arm/vp9_dct_arm.c | 21 - vp9/encoder/arm/vp9_dct_arm.h | 65 - vp9/encoder/arm/vp9_encodemb_arm.h | 64 - vp9/encoder/arm/vp9_quantize_arm.c | 57 - vp9/encoder/arm/vp9_quantize_arm.h | 52 - vp9/encoder/arm/vp9_variance_arm.c | 112 -- vp9/encoder/arm/vp9_variance_arm.h | 132 -- vp9/encoder/vp9_asm_enc_offsets.c | 8 - vp9/encoder/vp9_onyx_if.c | 68 - vp9/encoder/vp9_picklpf.c | 76 +- vp9/encoder/vp9_quantize.h | 4 - vp9/vp9_common.mk | 54 - vp9/vp9cx.mk | 4 - vp9/vp9cx_arm.mk | 63 - vp9/vp9dx.mk | 28 - vp9/vp9dx_arm.mk | 29 - 112 files changed, 15 insertions(+), 19132 deletions(-) delete mode 100644 vp9/common/arm/armv6/vp9_bilinearfilter_v6.asm delete mode 100644 vp9/common/arm/armv6/vp9_copymem16x16_v6.asm delete mode 100644 vp9/common/arm/armv6/vp9_copymem8x4_v6.asm delete mode 100644 vp9/common/arm/armv6/vp9_copymem8x8_v6.asm delete mode 100644 vp9/common/arm/armv6/vp9_dc_only_idct_add_v6.asm delete mode 100644 vp9/common/arm/armv6/vp9_filter_v6.asm delete mode 100644 vp9/common/arm/armv6/vp9_idct_v6.asm delete mode 100644 vp9/common/arm/armv6/vp9_iwalsh_v6.asm delete mode 100644 vp9/common/arm/armv6/vp9_loopfilter_v6.asm delete mode 100644 vp9/common/arm/armv6/vp9_recon_v6.asm delete mode 100644 vp9/common/arm/armv6/vp9_simpleloopfilter_v6.asm delete mode 100644 vp9/common/arm/armv6/vp9_sixtappredict8x4_v6.asm delete mode 100644 vp9/common/arm/neon/vp9_bilinearpredict16x16_neon.asm delete mode 100644 vp9/common/arm/neon/vp9_bilinearpredict4x4_neon.asm delete mode 100644 vp9/common/arm/neon/vp9_bilinearpredict8x4_neon.asm delete mode 100644 vp9/common/arm/neon/vp9_bilinearpredict8x8_neon.asm delete mode 100644 vp9/common/arm/neon/vp9_buildintrapredictorsmby_neon.asm delete mode 100644 vp9/common/arm/neon/vp9_copymem16x16_neon.asm delete mode 100644 vp9/common/arm/neon/vp9_copymem8x4_neon.asm delete mode 100644 vp9/common/arm/neon/vp9_copymem8x8_neon.asm delete mode 100644 vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm delete mode 100644 vp9/common/arm/neon/vp9_iwalsh_neon.asm delete mode 100644 vp9/common/arm/neon/vp9_loopfilter_neon.asm delete mode 100644 vp9/common/arm/neon/vp9_loopfiltersimplehorizontaledge_neon.asm delete mode 100644 vp9/common/arm/neon/vp9_loopfiltersimpleverticaledge_neon.asm delete mode 100644 vp9/common/arm/neon/vp9_mbloopfilter_neon.asm delete mode 100644 vp9/common/arm/neon/vp9_recon16x16mb_neon.asm delete mode 100644 vp9/common/arm/neon/vp9_recon2b_neon.asm delete mode 100644 vp9/common/arm/neon/vp9_recon4b_neon.asm delete mode 100644 vp9/common/arm/neon/vp9_recon_neon.c delete mode 100644 vp9/common/arm/neon/vp9_reconb_neon.asm delete mode 100644 vp9/common/arm/neon/vp9_save_neon_reg.asm delete mode 100644 vp9/common/arm/neon/vp9_shortidct4x4llm_1_neon.asm delete mode 100644 vp9/common/arm/neon/vp9_shortidct4x4llm_neon.asm delete mode 100644 vp9/common/arm/neon/vp9_sixtappredict16x16_neon.asm delete mode 100644 vp9/common/arm/neon/vp9_sixtappredict4x4_neon.asm delete mode 100644 vp9/common/arm/neon/vp9_sixtappredict8x4_neon.asm delete mode 100644 vp9/common/arm/neon/vp9_sixtappredict8x8_neon.asm delete mode 100644 vp9/common/arm/vp9_arm_systemdependent.c delete mode 100644 vp9/common/arm/vp9_bilinearfilter_arm.c delete mode 100644 vp9/common/arm/vp9_bilinearfilter_arm.h delete mode 100644 vp9/common/arm/vp9_filter_arm.c delete mode 100644 vp9/common/arm/vp9_idct_arm.h delete mode 100644 vp9/common/arm/vp9_loopfilter_arm.c delete mode 100644 vp9/common/arm/vp9_loopfilter_arm.h delete mode 100644 vp9/common/arm/vp9_recon_arm.h delete mode 100644 vp9/common/arm/vp9_reconintra_arm.c delete mode 100644 vp9/common/arm/vp9_subpixel_arm.h delete mode 100644 vp9/decoder/arm/armv6/vp9_dequant_dc_idct_v6.asm delete mode 100644 vp9/decoder/arm/armv6/vp9_dequant_idct_v6.asm delete mode 100644 vp9/decoder/arm/armv6/vp9_dequantize_v6.asm delete mode 100644 vp9/decoder/arm/armv6/vp9_idct_blk_v6.c delete mode 100644 vp9/decoder/arm/neon/vp9_dequant_idct_neon.asm delete mode 100644 vp9/decoder/arm/neon/vp9_dequantizeb_neon.asm delete mode 100644 vp9/decoder/arm/neon/vp9_idct_blk_neon.c delete mode 100644 vp9/decoder/arm/neon/vp9_idct_dequant_0_2x_neon.asm delete mode 100644 vp9/decoder/arm/neon/vp9_idct_dequant_dc_0_2x_neon.asm delete mode 100644 vp9/decoder/arm/neon/vp9_idct_dequant_dc_full_2x_neon.asm delete mode 100644 vp9/decoder/arm/neon/vp9_idct_dequant_full_2x_neon.asm delete mode 100644 vp9/decoder/arm/vp9_dequantize_arm.c delete mode 100644 vp9/encoder/arm/armv5te/vp9_boolhuff_armv5te.asm delete mode 100644 vp9/encoder/arm/armv5te/vp9_packtokens_armv5.asm delete mode 100644 vp9/encoder/arm/armv5te/vp9_packtokens_mbrow_armv5.asm delete mode 100644 vp9/encoder/arm/armv5te/vp9_packtokens_partitions_armv5.asm delete mode 100644 vp9/encoder/arm/armv6/vp9_fast_quantize_b_armv6.asm delete mode 100644 vp9/encoder/arm/armv6/vp9_mse16x16_armv6.asm delete mode 100644 vp9/encoder/arm/armv6/vp9_sad16x16_armv6.asm delete mode 100644 vp9/encoder/arm/armv6/vp9_short_fdct4x4_armv6.asm delete mode 100644 vp9/encoder/arm/armv6/vp9_subtract_armv6.asm delete mode 100644 vp9/encoder/arm/armv6/vp9_variance16x16_armv6.asm delete mode 100644 vp9/encoder/arm/armv6/vp9_variance8x8_armv6.asm delete mode 100644 vp9/encoder/arm/armv6/vp9_variance_halfpixvar16x16_h_armv6.asm delete mode 100644 vp9/encoder/arm/armv6/vp9_variance_halfpixvar16x16_hv_armv6.asm delete mode 100644 vp9/encoder/arm/armv6/vp9_variance_halfpixvar16x16_v_armv6.asm delete mode 100644 vp9/encoder/arm/armv6/vp9_walsh_v6.asm delete mode 100644 vp9/encoder/arm/neon/vp9_fastquantizeb_neon.asm delete mode 100644 vp9/encoder/arm/neon/vp9_memcpy_neon.asm delete mode 100644 vp9/encoder/arm/neon/vp9_mse16x16_neon.asm delete mode 100644 vp9/encoder/arm/neon/vp9_picklpf_arm.c delete mode 100644 vp9/encoder/arm/neon/vp9_sad16_neon.asm delete mode 100644 vp9/encoder/arm/neon/vp9_sad8_neon.asm delete mode 100644 vp9/encoder/arm/neon/vp9_shortfdct_neon.asm delete mode 100644 vp9/encoder/arm/neon/vp9_shortwalsh4x4_neon.asm delete mode 100644 vp9/encoder/arm/neon/vp9_subpixelvariance16x16_neon.asm delete mode 100644 vp9/encoder/arm/neon/vp9_subpixelvariance16x16s_neon.asm delete mode 100644 vp9/encoder/arm/neon/vp9_subpixelvariance8x8_neon.asm delete mode 100644 vp9/encoder/arm/neon/vp9_subtract_neon.asm delete mode 100644 vp9/encoder/arm/neon/vp9_variance_neon.asm delete mode 100644 vp9/encoder/arm/vp9_arm_csystemdependent.c delete mode 100644 vp9/encoder/arm/vp9_boolhuff_arm.c delete mode 100644 vp9/encoder/arm/vp9_dct_arm.c delete mode 100644 vp9/encoder/arm/vp9_dct_arm.h delete mode 100644 vp9/encoder/arm/vp9_encodemb_arm.h delete mode 100644 vp9/encoder/arm/vp9_quantize_arm.c delete mode 100644 vp9/encoder/arm/vp9_quantize_arm.h delete mode 100644 vp9/encoder/arm/vp9_variance_arm.c delete mode 100644 vp9/encoder/arm/vp9_variance_arm.h delete mode 100644 vp9/vp9cx_arm.mk delete mode 100644 vp9/vp9dx_arm.mk diff --git a/libs.mk b/libs.mk index c54522159..bd1cd54e3 100644 --- a/libs.mk +++ b/libs.mk @@ -109,7 +109,6 @@ ifeq ($(CONFIG_VP9_ENCODER),yes) CODEC_SRCS-yes += $(addprefix $(VP9_PREFIX),$(call enabled,VP9_CX_SRCS)) CODEC_EXPORTS-yes += $(addprefix $(VP9_PREFIX),$(VP9_CX_EXPORTS)) CODEC_SRCS-yes += $(VP9_PREFIX)vp9cx.mk vpx/vp8.h vpx/vp8cx.h - CODEC_SRCS-$(ARCH_ARM) += $(VP9_PREFIX)vp98cx_arm.mk INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP9_PREFIX)/% CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h diff --git a/vp8/vp8dx.mk b/vp8/vp8dx.mk index 5753e04e5..b5b90d37d 100644 --- a/vp8/vp8dx.mk +++ b/vp8/vp8dx.mk @@ -18,30 +18,6 @@ VP8_DX_SRCS_REMOVE-no += $(VP8_COMMON_SRCS_REMOVE-no) VP8_DX_SRCS-yes += vp8_dx_iface.c -# common -#define ARM -#define DISABLE_THREAD - -#INCLUDES += algo/vpx_common/vpx_mem/include -#INCLUDES += common -#INCLUDES += common -#INCLUDES += common -#INCLUDES += common -#INCLUDES += decoder - - - -# decoder -#define ARM -#define DISABLE_THREAD - -#INCLUDES += algo/vpx_common/vpx_mem/include -#INCLUDES += common -#INCLUDES += common -#INCLUDES += common -#INCLUDES += common -#INCLUDES += decoder - VP8_DX_SRCS-yes += decoder/asm_dec_offsets.c VP8_DX_SRCS-yes += decoder/dboolhuff.c VP8_DX_SRCS-yes += decoder/decodemv.c diff --git a/vp9/common/arm/armv6/vp9_bilinearfilter_v6.asm b/vp9/common/arm/armv6/vp9_bilinearfilter_v6.asm deleted file mode 100644 index 36e391e2b..000000000 --- a/vp9/common/arm/armv6/vp9_bilinearfilter_v6.asm +++ /dev/null @@ -1,237 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp9_filter_block2d_bil_first_pass_armv6| - EXPORT |vp9_filter_block2d_bil_second_pass_armv6| - - AREA |.text|, CODE, READONLY ; name this block of code - -;------------------------------------- -; r0 unsigned char *src_ptr, -; r1 unsigned short *dst_ptr, -; r2 unsigned int src_pitch, -; r3 unsigned int height, -; stack unsigned int width, -; stack const short *vp9_filter -;------------------------------------- -; The output is transposed stroed in output array to make it easy for second pass filtering. -|vp9_filter_block2d_bil_first_pass_armv6| PROC - stmdb sp!, {r4 - r11, lr} - - ldr r11, [sp, #40] ; vp9_filter address - ldr r4, [sp, #36] ; width - - mov r12, r3 ; outer-loop counter - - add r7, r2, r4 ; preload next row - pld [r0, r7] - - sub r2, r2, r4 ; src increment for height loop - - ldr r5, [r11] ; load up filter coefficients - - mov r3, r3, lsl #1 ; height*2 - add r3, r3, #2 ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1) - - mov r11, r1 ; save dst_ptr for each row - - cmp r5, #128 ; if filter coef = 128, then skip the filter - beq bil_null_1st_filter - -|bil_height_loop_1st_v6| - ldrb r6, [r0] ; load source data - ldrb r7, [r0, #1] - ldrb r8, [r0, #2] - mov lr, r4, lsr #2 ; 4-in-parellel loop counter - -|bil_width_loop_1st_v6| - ldrb r9, [r0, #3] - ldrb r10, [r0, #4] - - pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0] - pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1] - - smuad r6, r6, r5 ; apply the filter - pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2] - smuad r7, r7, r5 - pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3] - - smuad r8, r8, r5 - smuad r9, r9, r5 - - add r0, r0, #4 - subs lr, lr, #1 - - add r6, r6, #0x40 ; round_shift_and_clamp - add r7, r7, #0x40 - usat r6, #16, r6, asr #7 - usat r7, #16, r7, asr #7 - - strh r6, [r1], r3 ; result is transposed and stored - - add r8, r8, #0x40 ; round_shift_and_clamp - strh r7, [r1], r3 - add r9, r9, #0x40 - usat r8, #16, r8, asr #7 - usat r9, #16, r9, asr #7 - - strh r8, [r1], r3 ; result is transposed and stored - - ldrneb r6, [r0] ; load source data - strh r9, [r1], r3 - - ldrneb r7, [r0, #1] - ldrneb r8, [r0, #2] - - bne bil_width_loop_1st_v6 - - add r0, r0, r2 ; move to next input row - subs r12, r12, #1 - - add r9, r2, r4, lsl #1 ; adding back block width - pld [r0, r9] ; preload next row - - add r11, r11, #2 ; move over to next column - mov r1, r11 - - bne bil_height_loop_1st_v6 - - ldmia sp!, {r4 - r11, pc} - -|bil_null_1st_filter| -|bil_height_loop_null_1st| - mov lr, r4, lsr #2 ; loop counter - -|bil_width_loop_null_1st| - ldrb r6, [r0] ; load data - ldrb r7, [r0, #1] - ldrb r8, [r0, #2] - ldrb r9, [r0, #3] - - strh r6, [r1], r3 ; store it to immediate buffer - add r0, r0, #4 - strh r7, [r1], r3 - subs lr, lr, #1 - strh r8, [r1], r3 - strh r9, [r1], r3 - - bne bil_width_loop_null_1st - - subs r12, r12, #1 - add r0, r0, r2 ; move to next input line - add r11, r11, #2 ; move over to next column - mov r1, r11 - - bne bil_height_loop_null_1st - - ldmia sp!, {r4 - r11, pc} - - ENDP ; |vp9_filter_block2d_bil_first_pass_armv6| - - -;--------------------------------- -; r0 unsigned short *src_ptr, -; r1 unsigned char *dst_ptr, -; r2 int dst_pitch, -; r3 unsigned int height, -; stack unsigned int width, -; stack const short *vp9_filter -;--------------------------------- -|vp9_filter_block2d_bil_second_pass_armv6| PROC - stmdb sp!, {r4 - r11, lr} - - ldr r11, [sp, #40] ; vp9_filter address - ldr r4, [sp, #36] ; width - - ldr r5, [r11] ; load up filter coefficients - mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix - mov r11, r1 - - cmp r5, #128 ; if filter coef = 128, then skip the filter - beq bil_null_2nd_filter - -|bil_height_loop_2nd| - ldr r6, [r0] ; load the data - ldr r8, [r0, #4] - ldrh r10, [r0, #8] - mov lr, r3, lsr #2 ; loop counter - -|bil_width_loop_2nd| - pkhtb r7, r6, r8 ; src[1] | src[2] - pkhtb r9, r8, r10 ; src[3] | src[4] - - smuad r6, r6, r5 ; apply filter - smuad r8, r8, r5 ; apply filter - - subs lr, lr, #1 - - smuadx r7, r7, r5 ; apply filter - smuadx r9, r9, r5 ; apply filter - - add r0, r0, #8 - - add r6, r6, #0x40 ; round_shift_and_clamp - add r7, r7, #0x40 - usat r6, #8, r6, asr #7 - usat r7, #8, r7, asr #7 - strb r6, [r1], r2 ; the result is transposed back and stored - - add r8, r8, #0x40 ; round_shift_and_clamp - strb r7, [r1], r2 - add r9, r9, #0x40 - usat r8, #8, r8, asr #7 - usat r9, #8, r9, asr #7 - strb r8, [r1], r2 ; the result is transposed back and stored - - ldrne r6, [r0] ; load data - strb r9, [r1], r2 - ldrne r8, [r0, #4] - ldrneh r10, [r0, #8] - - bne bil_width_loop_2nd - - subs r12, r12, #1 - add r0, r0, #4 ; update src for next row - add r11, r11, #1 - mov r1, r11 - - bne bil_height_loop_2nd - ldmia sp!, {r4 - r11, pc} - -|bil_null_2nd_filter| -|bil_height_loop_null_2nd| - mov lr, r3, lsr #2 - -|bil_width_loop_null_2nd| - ldr r6, [r0], #4 ; load data - subs lr, lr, #1 - ldr r8, [r0], #4 - - strb r6, [r1], r2 ; store data - mov r7, r6, lsr #16 - strb r7, [r1], r2 - mov r9, r8, lsr #16 - strb r8, [r1], r2 - strb r9, [r1], r2 - - bne bil_width_loop_null_2nd - - subs r12, r12, #1 - add r0, r0, #4 - add r11, r11, #1 - mov r1, r11 - - bne bil_height_loop_null_2nd - - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp9_filter_block2d_second_pass_armv6| - - END diff --git a/vp9/common/arm/armv6/vp9_copymem16x16_v6.asm b/vp9/common/arm/armv6/vp9_copymem16x16_v6.asm deleted file mode 100644 index 44c3c492f..000000000 --- a/vp9/common/arm/armv6/vp9_copymem16x16_v6.asm +++ /dev/null @@ -1,186 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp9_copy_mem16x16_v6| - ; ARM - ; REQUIRE8 - ; PRESERVE8 - - AREA Block, CODE, READONLY ; name this block of code -;void copy_mem16x16_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -|vp9_copy_mem16x16_v6| PROC - stmdb sp!, {r4 - r7} - ;push {r4-r7} - - ;preload - pld [r0, #31] ; preload for next 16x16 block - - ands r4, r0, #15 - beq copy_mem16x16_fast - - ands r4, r0, #7 - beq copy_mem16x16_8 - - ands r4, r0, #3 - beq copy_mem16x16_4 - - ;copy one byte each time - ldrb r4, [r0] - ldrb r5, [r0, #1] - ldrb r6, [r0, #2] - ldrb r7, [r0, #3] - - mov r12, #16 - -copy_mem16x16_1_loop - strb r4, [r2] - strb r5, [r2, #1] - strb r6, [r2, #2] - strb r7, [r2, #3] - - ldrb r4, [r0, #4] - ldrb r5, [r0, #5] - ldrb r6, [r0, #6] - ldrb r7, [r0, #7] - - subs r12, r12, #1 - - strb r4, [r2, #4] - strb r5, [r2, #5] - strb r6, [r2, #6] - strb r7, [r2, #7] - - ldrb r4, [r0, #8] - ldrb r5, [r0, #9] - ldrb r6, [r0, #10] - ldrb r7, [r0, #11] - - strb r4, [r2, #8] - strb r5, [r2, #9] - strb r6, [r2, #10] - strb r7, [r2, #11] - - ldrb r4, [r0, #12] - ldrb r5, [r0, #13] - ldrb r6, [r0, #14] - ldrb r7, [r0, #15] - - add r0, r0, r1 - - strb r4, [r2, #12] - strb r5, [r2, #13] - strb r6, [r2, #14] - strb r7, [r2, #15] - - add r2, r2, r3 - - ldrneb r4, [r0] - ldrneb r5, [r0, #1] - ldrneb r6, [r0, #2] - ldrneb r7, [r0, #3] - - pld [r0, #31] ; preload for next 16x16 block - - bne copy_mem16x16_1_loop - - ldmia sp!, {r4 - r7} - ;pop {r4-r7} - mov pc, lr - -;copy 4 bytes each time -copy_mem16x16_4 - ldr r4, [r0] - ldr r5, [r0, #4] - ldr r6, [r0, #8] - ldr r7, [r0, #12] - - mov r12, #16 - -copy_mem16x16_4_loop - subs r12, r12, #1 - add r0, r0, r1 - - str r4, [r2] - str r5, [r2, #4] - str r6, [r2, #8] - str r7, [r2, #12] - - add r2, r2, r3 - - ldrne r4, [r0] - ldrne r5, [r0, #4] - ldrne r6, [r0, #8] - ldrne r7, [r0, #12] - - pld [r0, #31] ; preload for next 16x16 block - - bne copy_mem16x16_4_loop - - ldmia sp!, {r4 - r7} - ;pop {r4-r7} - mov pc, lr - -;copy 8 bytes each time -copy_mem16x16_8 - sub r1, r1, #16 - sub r3, r3, #16 - - mov r12, #16 - -copy_mem16x16_8_loop - ldmia r0!, {r4-r5} - ;ldm r0, {r4-r5} - ldmia r0!, {r6-r7} - - add r0, r0, r1 - - stmia r2!, {r4-r5} - subs r12, r12, #1 - ;stm r2, {r4-r5} - stmia r2!, {r6-r7} - - add r2, r2, r3 - - pld [r0, #31] ; preload for next 16x16 block - bne copy_mem16x16_8_loop - - ldmia sp!, {r4 - r7} - ;pop {r4-r7} - mov pc, lr - -;copy 16 bytes each time -copy_mem16x16_fast - ;sub r1, r1, #16 - ;sub r3, r3, #16 - - mov r12, #16 - -copy_mem16x16_fast_loop - ldmia r0, {r4-r7} - ;ldm r0, {r4-r7} - add r0, r0, r1 - - subs r12, r12, #1 - stmia r2, {r4-r7} - ;stm r2, {r4-r7} - add r2, r2, r3 - - pld [r0, #31] ; preload for next 16x16 block - bne copy_mem16x16_fast_loop - - ldmia sp!, {r4 - r7} - ;pop {r4-r7} - mov pc, lr - - ENDP ; |vp9_copy_mem16x16_v6| - - END diff --git a/vp9/common/arm/armv6/vp9_copymem8x4_v6.asm b/vp9/common/arm/armv6/vp9_copymem8x4_v6.asm deleted file mode 100644 index 45b904367..000000000 --- a/vp9/common/arm/armv6/vp9_copymem8x4_v6.asm +++ /dev/null @@ -1,128 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp9_copy_mem8x4_v6| - ; ARM - ; REQUIRE8 - ; PRESERVE8 - - AREA Block, CODE, READONLY ; name this block of code -;void vp9_copy_mem8x4_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -|vp9_copy_mem8x4_v6| PROC - ;push {r4-r5} - stmdb sp!, {r4-r5} - - ;preload - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - ands r4, r0, #7 - beq copy_mem8x4_fast - - ands r4, r0, #3 - beq copy_mem8x4_4 - - ;copy 1 byte each time - ldrb r4, [r0] - ldrb r5, [r0, #1] - - mov r12, #4 - -copy_mem8x4_1_loop - strb r4, [r2] - strb r5, [r2, #1] - - ldrb r4, [r0, #2] - ldrb r5, [r0, #3] - - subs r12, r12, #1 - - strb r4, [r2, #2] - strb r5, [r2, #3] - - ldrb r4, [r0, #4] - ldrb r5, [r0, #5] - - strb r4, [r2, #4] - strb r5, [r2, #5] - - ldrb r4, [r0, #6] - ldrb r5, [r0, #7] - - add r0, r0, r1 - - strb r4, [r2, #6] - strb r5, [r2, #7] - - add r2, r2, r3 - - ldrneb r4, [r0] - ldrneb r5, [r0, #1] - - bne copy_mem8x4_1_loop - - ldmia sp!, {r4 - r5} - ;pop {r4-r5} - mov pc, lr - -;copy 4 bytes each time -copy_mem8x4_4 - ldr r4, [r0] - ldr r5, [r0, #4] - - mov r12, #4 - -copy_mem8x4_4_loop - subs r12, r12, #1 - add r0, r0, r1 - - str r4, [r2] - str r5, [r2, #4] - - add r2, r2, r3 - - ldrne r4, [r0] - ldrne r5, [r0, #4] - - bne copy_mem8x4_4_loop - - ldmia sp!, {r4-r5} - ;pop {r4-r5} - mov pc, lr - -;copy 8 bytes each time -copy_mem8x4_fast - ;sub r1, r1, #8 - ;sub r3, r3, #8 - - mov r12, #4 - -copy_mem8x4_fast_loop - ldmia r0, {r4-r5} - ;ldm r0, {r4-r5} - add r0, r0, r1 - - subs r12, r12, #1 - stmia r2, {r4-r5} - ;stm r2, {r4-r5} - add r2, r2, r3 - - bne copy_mem8x4_fast_loop - - ldmia sp!, {r4-r5} - ;pop {r4-r5} - mov pc, lr - - ENDP ; |vp9_copy_mem8x4_v6| - - END diff --git a/vp9/common/arm/armv6/vp9_copymem8x8_v6.asm b/vp9/common/arm/armv6/vp9_copymem8x8_v6.asm deleted file mode 100644 index 0dd971bfe..000000000 --- a/vp9/common/arm/armv6/vp9_copymem8x8_v6.asm +++ /dev/null @@ -1,128 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp9_copy_mem8x8_v6| - ; ARM - ; REQUIRE8 - ; PRESERVE8 - - AREA Block, CODE, READONLY ; name this block of code -;void copy_mem8x8_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -|vp9_copy_mem8x8_v6| PROC - ;push {r4-r5} - stmdb sp!, {r4-r5} - - ;preload - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - ands r4, r0, #7 - beq copy_mem8x8_fast - - ands r4, r0, #3 - beq copy_mem8x8_4 - - ;copy 1 byte each time - ldrb r4, [r0] - ldrb r5, [r0, #1] - - mov r12, #8 - -copy_mem8x8_1_loop - strb r4, [r2] - strb r5, [r2, #1] - - ldrb r4, [r0, #2] - ldrb r5, [r0, #3] - - subs r12, r12, #1 - - strb r4, [r2, #2] - strb r5, [r2, #3] - - ldrb r4, [r0, #4] - ldrb r5, [r0, #5] - - strb r4, [r2, #4] - strb r5, [r2, #5] - - ldrb r4, [r0, #6] - ldrb r5, [r0, #7] - - add r0, r0, r1 - - strb r4, [r2, #6] - strb r5, [r2, #7] - - add r2, r2, r3 - - ldrneb r4, [r0] - ldrneb r5, [r0, #1] - - bne copy_mem8x8_1_loop - - ldmia sp!, {r4 - r5} - ;pop {r4-r5} - mov pc, lr - -;copy 4 bytes each time -copy_mem8x8_4 - ldr r4, [r0] - ldr r5, [r0, #4] - - mov r12, #8 - -copy_mem8x8_4_loop - subs r12, r12, #1 - add r0, r0, r1 - - str r4, [r2] - str r5, [r2, #4] - - add r2, r2, r3 - - ldrne r4, [r0] - ldrne r5, [r0, #4] - - bne copy_mem8x8_4_loop - - ldmia sp!, {r4 - r5} - ;pop {r4-r5} - mov pc, lr - -;copy 8 bytes each time -copy_mem8x8_fast - ;sub r1, r1, #8 - ;sub r3, r3, #8 - - mov r12, #8 - -copy_mem8x8_fast_loop - ldmia r0, {r4-r5} - ;ldm r0, {r4-r5} - add r0, r0, r1 - - subs r12, r12, #1 - stmia r2, {r4-r5} - ;stm r2, {r4-r5} - add r2, r2, r3 - - bne copy_mem8x8_fast_loop - - ldmia sp!, {r4-r5} - ;pop {r4-r5} - mov pc, lr - - ENDP ; |vp9_copy_mem8x8_v6| - - END diff --git a/vp9/common/arm/armv6/vp9_dc_only_idct_add_v6.asm b/vp9/common/arm/armv6/vp9_dc_only_idct_add_v6.asm deleted file mode 100644 index e0660e9fd..000000000 --- a/vp9/common/arm/armv6/vp9_dc_only_idct_add_v6.asm +++ /dev/null @@ -1,67 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. -; - - EXPORT |vp8_dc_only_idct_add_v6| - - AREA |.text|, CODE, READONLY - -;void vp8_dc_only_idct_add_v6(short input_dc, unsigned char *pred_ptr, -; unsigned char *dst_ptr, int pitch, int stride) -; r0 input_dc -; r1 pred_ptr -; r2 dest_ptr -; r3 pitch -; sp stride - -|vp8_dc_only_idct_add_v6| PROC - stmdb sp!, {r4 - r7, lr} - - add r0, r0, #4 ; input_dc += 4 - ldr r12, c0x0000FFFF - ldr r4, [r1], r3 - ldr r6, [r1], r3 - and r0, r12, r0, asr #3 ; input_dc >> 3 + mask - ldr lr, [sp, #20] - orr r0, r0, r0, lsl #16 ; a1 | a1 - - uxtab16 r5, r0, r4 ; a1+2 | a1+0 - uxtab16 r4, r0, r4, ror #8 ; a1+3 | a1+1 - uxtab16 r7, r0, r6 - uxtab16 r6, r0, r6, ror #8 - usat16 r5, #8, r5 - usat16 r4, #8, r4 - usat16 r7, #8, r7 - usat16 r6, #8, r6 - orr r5, r5, r4, lsl #8 - orr r7, r7, r6, lsl #8 - ldr r4, [r1], r3 - ldr r6, [r1] - str r5, [r2], lr - str r7, [r2], lr - - uxtab16 r5, r0, r4 - uxtab16 r4, r0, r4, ror #8 - uxtab16 r7, r0, r6 - uxtab16 r6, r0, r6, ror #8 - usat16 r5, #8, r5 - usat16 r4, #8, r4 - usat16 r7, #8, r7 - usat16 r6, #8, r6 - orr r5, r5, r4, lsl #8 - orr r7, r7, r6, lsl #8 - str r5, [r2], lr - str r7, [r2] - - ldmia sp!, {r4 - r7, pc} - - ENDP ; |vp8_dc_only_idct_add_v6| - -; Constant Pool -c0x0000FFFF DCD 0x0000FFFF - END diff --git a/vp9/common/arm/armv6/vp9_filter_v6.asm b/vp9/common/arm/armv6/vp9_filter_v6.asm deleted file mode 100644 index 16b321e37..000000000 --- a/vp9/common/arm/armv6/vp9_filter_v6.asm +++ /dev/null @@ -1,624 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp9_filter_block2d_first_pass_armv6| - EXPORT |vp9_filter_block2d_first_pass_16x16_armv6| - EXPORT |vp9_filter_block2d_first_pass_8x8_armv6| - EXPORT |vp9_filter_block2d_second_pass_armv6| - EXPORT |vp9_filter4_block2d_second_pass_armv6| - EXPORT |vp9_filter_block2d_first_pass_only_armv6| - EXPORT |vp9_filter_block2d_second_pass_only_armv6| - - AREA |.text|, CODE, READONLY ; name this block of code -;------------------------------------- -; r0 unsigned char *src_ptr -; r1 short *output_ptr -; r2 unsigned int src_pixels_per_line -; r3 unsigned int output_width -; stack unsigned int output_height -; stack const short *vp9_filter -;------------------------------------- -; vp9_filter the input and put in the output array. Apply the 6 tap FIR filter with -; the output being a 2 byte value and the intput being a 1 byte value. -|vp9_filter_block2d_first_pass_armv6| PROC - stmdb sp!, {r4 - r11, lr} - - ldr r11, [sp, #40] ; vp9_filter address - ldr r7, [sp, #36] ; output height - - sub r2, r2, r3 ; inside loop increments input array, - ; so the height loop only needs to add - ; r2 - width to the input pointer - - mov r3, r3, lsl #1 ; multiply width by 2 because using shorts - add r12, r3, #16 ; square off the output - sub sp, sp, #4 - - ldr r4, [r11] ; load up packed filter coefficients - ldr r5, [r11, #4] - ldr r6, [r11, #8] - - str r1, [sp] ; push destination to stack - mov r7, r7, lsl #16 ; height is top part of counter - -; six tap filter -|height_loop_1st_6| - ldrb r8, [r0, #-2] ; load source data - ldrb r9, [r0, #-1] - ldrb r10, [r0], #2 - orr r7, r7, r3, lsr #2 ; construct loop counter - -|width_loop_1st_6| - ldrb r11, [r0, #-1] - - pkhbt lr, r8, r9, lsl #16 ; r9 | r8 - pkhbt r8, r9, r10, lsl #16 ; r10 | r9 - - ldrb r9, [r0] - - smuad lr, lr, r4 ; apply the filter - pkhbt r10, r10, r11, lsl #16 ; r11 | r10 - smuad r8, r8, r4 - pkhbt r11, r11, r9, lsl #16 ; r9 | r11 - - smlad lr, r10, r5, lr - ldrb r10, [r0, #1] - smlad r8, r11, r5, r8 - ldrb r11, [r0, #2] - - sub r7, r7, #1 - - pkhbt r9, r9, r10, lsl #16 ; r10 | r9 - pkhbt r10, r10, r11, lsl #16 ; r11 | r10 - - smlad lr, r9, r6, lr - smlad r11, r10, r6, r8 - - ands r10, r7, #0xff ; test loop counter - - add lr, lr, #0x40 ; round_shift_and_clamp - ldrneb r8, [r0, #-2] ; load data for next loop - usat lr, #8, lr, asr #7 - add r11, r11, #0x40 - ldrneb r9, [r0, #-1] - usat r11, #8, r11, asr #7 - - strh lr, [r1], r12 ; result is transposed and stored, which - ; will make second pass filtering easier. - ldrneb r10, [r0], #2 - strh r11, [r1], r12 - - bne width_loop_1st_6 - - ldr r1, [sp] ; load and update dst address - subs r7, r7, #0x10000 - add r0, r0, r2 ; move to next input line - - add r1, r1, #2 ; move over to next column - str r1, [sp] - - bne height_loop_1st_6 - - add sp, sp, #4 - ldmia sp!, {r4 - r11, pc} - - ENDP - -; -------------------------- -; 16x16 version -; ----------------------------- -|vp9_filter_block2d_first_pass_16x16_armv6| PROC - stmdb sp!, {r4 - r11, lr} - - ldr r11, [sp, #40] ; vp9_filter address - ldr r7, [sp, #36] ; output height - - add r4, r2, #18 ; preload next low - pld [r0, r4] - - sub r2, r2, r3 ; inside loop increments input array, - ; so the height loop only needs to add - ; r2 - width to the input pointer - - mov r3, r3, lsl #1 ; multiply width by 2 because using shorts - add r12, r3, #16 ; square off the output - sub sp, sp, #4 - - ldr r4, [r11] ; load up packed filter coefficients - ldr r5, [r11, #4] - ldr r6, [r11, #8] - - str r1, [sp] ; push destination to stack - mov r7, r7, lsl #16 ; height is top part of counter - -; six tap filter -|height_loop_1st_16_6| - ldrb r8, [r0, #-2] ; load source data - ldrb r9, [r0, #-1] - ldrb r10, [r0], #2 - orr r7, r7, r3, lsr #2 ; construct loop counter - -|width_loop_1st_16_6| - ldrb r11, [r0, #-1] - - pkhbt lr, r8, r9, lsl #16 ; r9 | r8 - pkhbt r8, r9, r10, lsl #16 ; r10 | r9 - - ldrb r9, [r0] - - smuad lr, lr, r4 ; apply the filter - pkhbt r10, r10, r11, lsl #16 ; r11 | r10 - smuad r8, r8, r4 - pkhbt r11, r11, r9, lsl #16 ; r9 | r11 - - smlad lr, r10, r5, lr - ldrb r10, [r0, #1] - smlad r8, r11, r5, r8 - ldrb r11, [r0, #2] - - sub r7, r7, #1 - - pkhbt r9, r9, r10, lsl #16 ; r10 | r9 - pkhbt r10, r10, r11, lsl #16 ; r11 | r10 - - smlad lr, r9, r6, lr - smlad r11, r10, r6, r8 - - ands r10, r7, #0xff ; test loop counter - - add lr, lr, #0x40 ; round_shift_and_clamp - ldrneb r8, [r0, #-2] ; load data for next loop - usat lr, #8, lr, asr #7 - add r11, r11, #0x40 - ldrneb r9, [r0, #-1] - usat r11, #8, r11, asr #7 - - strh lr, [r1], r12 ; result is transposed and stored, which - ; will make second pass filtering easier. - ldrneb r10, [r0], #2 - strh r11, [r1], r12 - - bne width_loop_1st_16_6 - - ldr r1, [sp] ; load and update dst address - subs r7, r7, #0x10000 - add r0, r0, r2 ; move to next input line - - add r11, r2, #34 ; adding back block width(=16) - pld [r0, r11] ; preload next low - - add r1, r1, #2 ; move over to next column - str r1, [sp] - - bne height_loop_1st_16_6 - - add sp, sp, #4 - ldmia sp!, {r4 - r11, pc} - - ENDP - -; -------------------------- -; 8x8 version -; ----------------------------- -|vp9_filter_block2d_first_pass_8x8_armv6| PROC - stmdb sp!, {r4 - r11, lr} - - ldr r11, [sp, #40] ; vp9_filter address - ldr r7, [sp, #36] ; output height - - add r4, r2, #10 ; preload next low - pld [r0, r4] - - sub r2, r2, r3 ; inside loop increments input array, - ; so the height loop only needs to add - ; r2 - width to the input pointer - - mov r3, r3, lsl #1 ; multiply width by 2 because using shorts - add r12, r3, #16 ; square off the output - sub sp, sp, #4 - - ldr r4, [r11] ; load up packed filter coefficients - ldr r5, [r11, #4] - ldr r6, [r11, #8] - - str r1, [sp] ; push destination to stack - mov r7, r7, lsl #16 ; height is top part of counter - -; six tap filter -|height_loop_1st_8_6| - ldrb r8, [r0, #-2] ; load source data - ldrb r9, [r0, #-1] - ldrb r10, [r0], #2 - orr r7, r7, r3, lsr #2 ; construct loop counter - -|width_loop_1st_8_6| - ldrb r11, [r0, #-1] - - pkhbt lr, r8, r9, lsl #16 ; r9 | r8 - pkhbt r8, r9, r10, lsl #16 ; r10 | r9 - - ldrb r9, [r0] - - smuad lr, lr, r4 ; apply the filter - pkhbt r10, r10, r11, lsl #16 ; r11 | r10 - smuad r8, r8, r4 - pkhbt r11, r11, r9, lsl #16 ; r9 | r11 - - smlad lr, r10, r5, lr - ldrb r10, [r0, #1] - smlad r8, r11, r5, r8 - ldrb r11, [r0, #2] - - sub r7, r7, #1 - - pkhbt r9, r9, r10, lsl #16 ; r10 | r9 - pkhbt r10, r10, r11, lsl #16 ; r11 | r10 - - smlad lr, r9, r6, lr - smlad r11, r10, r6, r8 - - ands r10, r7, #0xff ; test loop counter - - add lr, lr, #0x40 ; round_shift_and_clamp - ldrneb r8, [r0, #-2] ; load data for next loop - usat lr, #8, lr, asr #7 - add r11, r11, #0x40 - ldrneb r9, [r0, #-1] - usat r11, #8, r11, asr #7 - - strh lr, [r1], r12 ; result is transposed and stored, which - ; will make second pass filtering easier. - ldrneb r10, [r0], #2 - strh r11, [r1], r12 - - bne width_loop_1st_8_6 - - ldr r1, [sp] ; load and update dst address - subs r7, r7, #0x10000 - add r0, r0, r2 ; move to next input line - - add r11, r2, #18 ; adding back block width(=8) - pld [r0, r11] ; preload next low - - add r1, r1, #2 ; move over to next column - str r1, [sp] - - bne height_loop_1st_8_6 - - add sp, sp, #4 - ldmia sp!, {r4 - r11, pc} - - ENDP - -;--------------------------------- -; r0 short *src_ptr, -; r1 unsigned char *output_ptr, -; r2 unsigned int output_pitch, -; r3 unsigned int cnt, -; stack const short *vp9_filter -;--------------------------------- -|vp9_filter_block2d_second_pass_armv6| PROC - stmdb sp!, {r4 - r11, lr} - - ldr r11, [sp, #36] ; vp9_filter address - sub sp, sp, #4 - mov r7, r3, lsl #16 ; height is top part of counter - str r1, [sp] ; push destination to stack - - ldr r4, [r11] ; load up packed filter coefficients - ldr r5, [r11, #4] - ldr r6, [r11, #8] - - pkhbt r12, r5, r4 ; pack the filter differently - pkhbt r11, r6, r5 - - sub r0, r0, #4 ; offset input buffer - -|height_loop_2nd| - ldr r8, [r0] ; load the data - ldr r9, [r0, #4] - orr r7, r7, r3, lsr #1 ; loop counter - -|width_loop_2nd| - smuad lr, r4, r8 ; apply filter - sub r7, r7, #1 - smulbt r8, r4, r8 - - ldr r10, [r0, #8] - - smlad lr, r5, r9, lr - smladx r8, r12, r9, r8 - - ldrh r9, [r0, #12] - - smlad lr, r6, r10, lr - smladx r8, r11, r10, r8 - - add r0, r0, #4 - smlatb r10, r6, r9, r8 - - add lr, lr, #0x40 ; round_shift_and_clamp - ands r8, r7, #0xff - usat lr, #8, lr, asr #7 - add r10, r10, #0x40 - strb lr, [r1], r2 ; the result is transposed back and stored - usat r10, #8, r10, asr #7 - - ldrne r8, [r0] ; load data for next loop - ldrne r9, [r0, #4] - strb r10, [r1], r2 - - bne width_loop_2nd - - ldr r1, [sp] ; update dst for next loop - subs r7, r7, #0x10000 - add r0, r0, #16 ; updata src for next loop - add r1, r1, #1 - str r1, [sp] - - bne height_loop_2nd - - add sp, sp, #4 - ldmia sp!, {r4 - r11, pc} - - ENDP - -;--------------------------------- -; r0 short *src_ptr, -; r1 unsigned char *output_ptr, -; r2 unsigned int output_pitch, -; r3 unsigned int cnt, -; stack const short *vp9_filter -;--------------------------------- -|vp9_filter4_block2d_second_pass_armv6| PROC - stmdb sp!, {r4 - r11, lr} - - ldr r11, [sp, #36] ; vp9_filter address - mov r7, r3, lsl #16 ; height is top part of counter - - ldr r4, [r11] ; load up packed filter coefficients - add lr, r1, r3 ; save final destination pointer - ldr r5, [r11, #4] - ldr r6, [r11, #8] - - pkhbt r12, r5, r4 ; pack the filter differently - pkhbt r11, r6, r5 - mov r4, #0x40 ; rounding factor (for smlad{x}) - -|height_loop_2nd_4| - ldrd r8, [r0, #-4] ; load the data - orr r7, r7, r3, lsr #1 ; loop counter - -|width_loop_2nd_4| - ldr r10, [r0, #4]! - smladx r6, r9, r12, r4 ; apply filter - pkhbt r8, r9, r8 - smlad r5, r8, r12, r4 - pkhbt r8, r10, r9 - smladx r6, r10, r11, r6 - sub r7, r7, #1 - smlad r5, r8, r11, r5 - - mov r8, r9 ; shift the data for the next loop - mov r9, r10 - - usat r6, #8, r6, asr #7 ; shift and clamp - usat r5, #8, r5, asr #7 - - strb r5, [r1], r2 ; the result is transposed back and stored - tst r7, #0xff - strb r6, [r1], r2 - - bne width_loop_2nd_4 - - subs r7, r7, #0x10000 - add r0, r0, #16 ; update src for next loop - sub r1, lr, r7, lsr #16 ; update dst for next loop - - bne height_loop_2nd_4 - - ldmia sp!, {r4 - r11, pc} - - ENDP - -;------------------------------------ -; r0 unsigned char *src_ptr -; r1 unsigned char *output_ptr, -; r2 unsigned int src_pixels_per_line -; r3 unsigned int cnt, -; stack unsigned int output_pitch, -; stack const short *vp9_filter -;------------------------------------ -|vp9_filter_block2d_first_pass_only_armv6| PROC - stmdb sp!, {r4 - r11, lr} - - add r7, r2, r3 ; preload next low - add r7, r7, #2 - pld [r0, r7] - - ldr r4, [sp, #36] ; output pitch - ldr r11, [sp, #40] ; HFilter address - sub sp, sp, #8 - - mov r7, r3 - sub r2, r2, r3 ; inside loop increments input array, - ; so the height loop only needs to add - ; r2 - width to the input pointer - - sub r4, r4, r3 - str r4, [sp] ; save modified output pitch - str r2, [sp, #4] - - mov r2, #0x40 - - ldr r4, [r11] ; load up packed filter coefficients - ldr r5, [r11, #4] - ldr r6, [r11, #8] - -; six tap filter -|height_loop_1st_only_6| - ldrb r8, [r0, #-2] ; load data - ldrb r9, [r0, #-1] - ldrb r10, [r0], #2 - - mov r12, r3, lsr #1 ; loop counter - -|width_loop_1st_only_6| - ldrb r11, [r0, #-1] - - pkhbt lr, r8, r9, lsl #16 ; r9 | r8 - pkhbt r8, r9, r10, lsl #16 ; r10 | r9 - - ldrb r9, [r0] - -;; smuad lr, lr, r4 - smlad lr, lr, r4, r2 - pkhbt r10, r10, r11, lsl #16 ; r11 | r10 -;; smuad r8, r8, r4 - smlad r8, r8, r4, r2 - pkhbt r11, r11, r9, lsl #16 ; r9 | r11 - - smlad lr, r10, r5, lr - ldrb r10, [r0, #1] - smlad r8, r11, r5, r8 - ldrb r11, [r0, #2] - - subs r12, r12, #1 - - pkhbt r9, r9, r10, lsl #16 ; r10 | r9 - pkhbt r10, r10, r11, lsl #16 ; r11 | r10 - - smlad lr, r9, r6, lr - smlad r10, r10, r6, r8 - -;; add lr, lr, #0x40 ; round_shift_and_clamp - ldrneb r8, [r0, #-2] ; load data for next loop - usat lr, #8, lr, asr #7 -;; add r10, r10, #0x40 - strb lr, [r1], #1 ; store the result - usat r10, #8, r10, asr #7 - - ldrneb r9, [r0, #-1] - strb r10, [r1], #1 - ldrneb r10, [r0], #2 - - bne width_loop_1st_only_6 - - ldr lr, [sp] ; load back output pitch - ldr r12, [sp, #4] ; load back output pitch - subs r7, r7, #1 - add r0, r0, r12 ; updata src for next loop - - add r11, r12, r3 ; preload next low - add r11, r11, #2 - pld [r0, r11] - - add r1, r1, lr ; update dst for next loop - - bne height_loop_1st_only_6 - - add sp, sp, #8 - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp9_filter_block2d_first_pass_only_armv6| - - -;------------------------------------ -; r0 unsigned char *src_ptr, -; r1 unsigned char *output_ptr, -; r2 unsigned int src_pixels_per_line -; r3 unsigned int cnt, -; stack unsigned int output_pitch, -; stack const short *vp9_filter -;------------------------------------ -|vp9_filter_block2d_second_pass_only_armv6| PROC - stmdb sp!, {r4 - r11, lr} - - ldr r11, [sp, #40] ; VFilter address - ldr r12, [sp, #36] ; output pitch - - mov r7, r3, lsl #16 ; height is top part of counter - sub r0, r0, r2, lsl #1 ; need 6 elements for filtering, 2 before, 3 after - - sub sp, sp, #8 - - ldr r4, [r11] ; load up packed filter coefficients - ldr r5, [r11, #4] - ldr r6, [r11, #8] - - str r0, [sp] ; save r0 to stack - str r1, [sp, #4] ; save dst to stack - -; six tap filter -|width_loop_2nd_only_6| - ldrb r8, [r0], r2 ; load data - orr r7, r7, r3 ; loop counter - ldrb r9, [r0], r2 - ldrb r10, [r0], r2 - -|height_loop_2nd_only_6| - ; filter first column in this inner loop, than, move to next colum. - ldrb r11, [r0], r2 - - pkhbt lr, r8, r9, lsl #16 ; r9 | r8 - pkhbt r8, r9, r10, lsl #16 ; r10 | r9 - - ldrb r9, [r0], r2 - - smuad lr, lr, r4 - pkhbt r10, r10, r11, lsl #16 ; r11 | r10 - smuad r8, r8, r4 - pkhbt r11, r11, r9, lsl #16 ; r9 | r11 - - smlad lr, r10, r5, lr - ldrb r10, [r0], r2 - smlad r8, r11, r5, r8 - ldrb r11, [r0] - - sub r7, r7, #2 - sub r0, r0, r2, lsl #2 - - pkhbt r9, r9, r10, lsl #16 ; r10 | r9 - pkhbt r10, r10, r11, lsl #16 ; r11 | r10 - - smlad lr, r9, r6, lr - smlad r10, r10, r6, r8 - - ands r9, r7, #0xff - - add lr, lr, #0x40 ; round_shift_and_clamp - ldrneb r8, [r0], r2 ; load data for next loop - usat lr, #8, lr, asr #7 - add r10, r10, #0x40 - strb lr, [r1], r12 ; store the result for the column - usat r10, #8, r10, asr #7 - - ldrneb r9, [r0], r2 - strb r10, [r1], r12 - ldrneb r10, [r0], r2 - - bne height_loop_2nd_only_6 - - ldr r0, [sp] - ldr r1, [sp, #4] - subs r7, r7, #0x10000 - add r0, r0, #1 ; move to filter next column - str r0, [sp] - add r1, r1, #1 - str r1, [sp, #4] - - bne width_loop_2nd_only_6 - - add sp, sp, #8 - - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp9_filter_block2d_second_pass_only_armv6| - - END diff --git a/vp9/common/arm/armv6/vp9_idct_v6.asm b/vp9/common/arm/armv6/vp9_idct_v6.asm deleted file mode 100644 index 27215afcd..000000000 --- a/vp9/common/arm/armv6/vp9_idct_v6.asm +++ /dev/null @@ -1,345 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -; r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r14 - EXPORT |vp8_short_idct4x4llm_1_v6| - EXPORT |vp8_short_idct4x4llm_v6| - EXPORT |vp8_short_idct4x4llm_v6_scott| - EXPORT |vp8_short_idct4x4llm_v6_dual| - - AREA |.text|, CODE, READONLY - -;******************************************************************************** -;* void short_idct4x4llm_1_v6(INT16 * input, INT16 * output, INT32 pitch) -;* r0 INT16 * input -;* r1 INT16 * output -;* r2 INT32 pitch -;* bench: 3/5 -;******************************************************************************** - -|vp8_short_idct4x4llm_1_v6| PROC ; cycles in out pit - ; - ldrsh r0, [r0] ; load input[0] 1, r0 un 2 - add r0, r0, #4 ; 1 +4 - stmdb sp!, {r4, r5, lr} ; make room for wide writes 1 backup - mov r0, r0, asr #3 ; (input[0] + 4) >> 3 1, r0 req`d ^1 >> 3 - pkhbt r4, r0, r0, lsl #16 ; pack r0 into r4 1, r0 req`d ^1 pack - mov r5, r4 ; expand expand - - strd r4, [r1], r2 ; *output = r0, post inc 1 - strd r4, [r1], r2 ; 1 - strd r4, [r1], r2 ; 1 - strd r4, [r1] ; 1 - ; - ldmia sp!, {r4, r5, pc} ; replace vars, return restore - ENDP ; |vp8_short_idct4x4llm_1_v6| -;******************************************************************************** -;******************************************************************************** -;******************************************************************************** - -;******************************************************************************** -;* void short_idct4x4llm_v6(INT16 * input, INT16 * output, INT32 pitch) -;* r0 INT16 * input -;* r1 INT16 * output -;* r2 INT32 pitch -;* bench: -;******************************************************************************** - -|vp8_short_idct4x4llm_v6| PROC ; cycles in out pit - ; - stmdb sp!, {r4-r11, lr} ; backup registers 1 backup - ; - mov r4, #0x00004E00 ; 1 cst - orr r4, r4, #0x0000007B ; cospi8sqrt2minus1 - mov r5, #0x00008A00 ; 1 cst - orr r5, r5, #0x0000008C ; sinpi8sqrt2 - ; - mov r6, #4 ; i=4 1 i -loop1 ; - ldrsh r12, [r0, #8] ; input[4] 1, r12 unavail 2 [4] - ldrsh r3, [r0, #24] ; input[12] 1, r3 unavail 2 [12] - ldrsh r8, [r0, #16] ; input[8] 1, r8 unavail 2 [8] - ldrsh r7, [r0], #0x2 ; input[0] 1, r7 unavail 2 ++ [0] - smulwb r10, r5, r12 ; ([4] * sinpi8sqrt2) >> 16 1, r10 un 2, r12/r5 ^1 t1 - smulwb r11, r4, r3 ; ([12] * cospi8sqrt2minus1) >> 16 1, r11 un 2, r3/r4 ^1 t2 - add r9, r7, r8 ; a1 = [0] + [8] 1 a1 - sub r7, r7, r8 ; b1 = [0] - [8] 1 b1 - add r11, r3, r11 ; temp2 1 - rsb r11, r11, r10 ; c1 = temp1 - temp2 1 c1 - smulwb r3, r5, r3 ; ([12] * sinpi8sqrt2) >> 16 1, r3 un 2, r3/r5 ^ 1 t2 - smulwb r10, r4, r12 ; ([4] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r12/r4 ^1 t1 - add r8, r7, r11 ; b1 + c1 1 b+c - strh r8, [r1, r2] ; out[pitch] = b1+c1 1 - sub r7, r7, r11 ; b1 - c1 1 b-c - add r10, r12, r10 ; temp1 1 - add r3, r10, r3 ; d1 = temp1 + temp2 1 d1 - add r10, r9, r3 ; a1 + d1 1 a+d - sub r3, r9, r3 ; a1 - d1 1 a-d - add r8, r2, r2 ; pitch * 2 1 p*2 - strh r7, [r1, r8] ; out[pitch*2] = b1-c1 1 - add r7, r2, r2, lsl #1 ; pitch * 3 1 p*3 - strh r3, [r1, r7] ; out[pitch*3] = a1-d1 1 - subs r6, r6, #1 ; i-- 1 -- - strh r10, [r1], #0x2 ; out[0] = a1+d1 1 ++ - bne loop1 ; if i>0, continue - ; - sub r1, r1, #8 ; set up out for next loop 1 -4 - ; for this iteration, input=prev output - mov r6, #4 ; i=4 1 i -; b returnfull -loop2 ; - ldrsh r11, [r1, #2] ; input[1] 1, r11 un 2 [1] - ldrsh r8, [r1, #6] ; input[3] 1, r8 un 2 [3] - ldrsh r3, [r1, #4] ; input[2] 1, r3 un 2 [2] - ldrsh r0, [r1] ; input[0] 1, r0 un 2 [0] - smulwb r9, r5, r11 ; ([1] * sinpi8sqrt2) >> 16 1, r9 un 2, r5/r11 ^1 t1 - smulwb r10, r4, r8 ; ([3] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r8 ^1 t2 - add r7, r0, r3 ; a1 = [0] + [2] 1 a1 - sub r0, r0, r3 ; b1 = [0] - [2] 1 b1 - add r10, r8, r10 ; temp2 1 - rsb r9, r10, r9 ; c1 = temp1 - temp2 1 c1 - smulwb r8, r5, r8 ; ([3] * sinpi8sqrt2) >> 16 1, r8 un 2, r5/r8 ^1 t2 - smulwb r10, r4, r11 ; ([1] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r11 ^1 t1 - add r3, r0, r9 ; b1+c1 1 b+c - add r3, r3, #4 ; b1+c1+4 1 +4 - add r10, r11, r10 ; temp1 1 - mov r3, r3, asr #3 ; b1+c1+4 >> 3 1, r3 ^1 >>3 - strh r3, [r1, #2] ; out[1] = b1+c1 1 - add r10, r10, r8 ; d1 = temp1 + temp2 1 d1 - add r3, r7, r10 ; a1+d1 1 a+d - add r3, r3, #4 ; a1+d1+4 1 +4 - sub r7, r7, r10 ; a1-d1 1 a-d - add r7, r7, #4 ; a1-d1+4 1 +4 - mov r3, r3, asr #3 ; a1+d1+4 >> 3 1, r3 ^1 >>3 - mov r7, r7, asr #3 ; a1-d1+4 >> 3 1, r7 ^1 >>3 - strh r7, [r1, #6] ; out[3] = a1-d1 1 - sub r0, r0, r9 ; b1-c1 1 b-c - add r0, r0, #4 ; b1-c1+4 1 +4 - subs r6, r6, #1 ; i-- 1 -- - mov r0, r0, asr #3 ; b1-c1+4 >> 3 1, r0 ^1 >>3 - strh r0, [r1, #4] ; out[2] = b1-c1 1 - strh r3, [r1], r2 ; out[0] = a1+d1 1 -; add r1, r1, r2 ; out += pitch 1 ++ - bne loop2 ; if i>0, continue -returnfull ; - ldmia sp!, {r4 - r11, pc} ; replace vars, return restore - ENDP - -;******************************************************************************** -;******************************************************************************** -;******************************************************************************** - -;******************************************************************************** -;* void short_idct4x4llm_v6_scott(INT16 * input, INT16 * output, INT32 pitch) -;* r0 INT16 * input -;* r1 INT16 * output -;* r2 INT32 pitch -;* bench: -;******************************************************************************** - -|vp8_short_idct4x4llm_v6_scott| PROC ; cycles in out pit -; mov r0, #0 ; -; ldr r0, [r0] ; - stmdb sp!, {r4 - r11, lr} ; backup registers 1 backup - ; - mov r3, #0x00004E00 ; cos - orr r3, r3, #0x0000007B ; cospi8sqrt2minus1 - mov r4, #0x00008A00 ; sin - orr r4, r4, #0x0000008C ; sinpi8sqrt2 - ; - mov r5, #0x2 ; i i - ; -short_idct4x4llm_v6_scott_loop1 ; - ldr r10, [r0, #(4*2)] ; i5 | i4 5,4 - ldr r11, [r0, #(12*2)] ; i13 | i12 13,12 - ; - smulwb r6, r4, r10 ; ((ip[4] * sinpi8sqrt2) >> 16) lt1 - smulwb r7, r3, r11 ; ((ip[12] * cospi8sqrt2minus1) >> 16) lt2 - ; - smulwb r12, r3, r10 ; ((ip[4] * cospi8sqrt2misu1) >> 16) l2t2 - smulwb r14, r4, r11 ; ((ip[12] * sinpi8sqrt2) >> 16) l2t1 - ; - add r6, r6, r7 ; partial c1 lt1-lt2 - add r12, r12, r14 ; partial d1 l2t2+l2t1 - ; - smulwt r14, r4, r10 ; ((ip[5] * sinpi8sqrt2) >> 16) ht1 - smulwt r7, r3, r11 ; ((ip[13] * cospi8sqrt2minus1) >> 16) ht2 - ; - smulwt r8, r3, r10 ; ((ip[5] * cospi8sqrt2minus1) >> 16) h2t1 - smulwt r9, r4, r11 ; ((ip[13] * sinpi8sqrt2) >> 16) h2t2 - ; - add r7, r14, r7 ; partial c1_2 ht1+ht2 - sub r8, r8, r9 ; partial d1_2 h2t1-h2t2 - ; - pkhbt r6, r6, r7, lsl #16 ; partial c1_2 | partial c1_1 pack - pkhbt r12, r12, r8, lsl #16 ; partial d1_2 | partial d1_1 pack - ; - usub16 r6, r6, r10 ; c1_2 | c1_1 c - uadd16 r12, r12, r11 ; d1_2 | d1_1 d - ; - ldr r10, [r0, #0] ; i1 | i0 1,0 - ldr r11, [r0, #(8*2)] ; i9 | i10 9,10 - ; -;;;;;; add r0, r0, #0x4 ; +4 -;;;;;; add r1, r1, #0x4 ; +4 - ; - uadd16 r8, r10, r11 ; i1 + i9 | i0 + i8 aka a1 a - usub16 r9, r10, r11 ; i1 - i9 | i0 - i8 aka b1 b - ; - uadd16 r7, r8, r12 ; a1 + d1 pair a+d - usub16 r14, r8, r12 ; a1 - d1 pair a-d - ; - str r7, [r1] ; op[0] = a1 + d1 - str r14, [r1, r2] ; op[pitch*3] = a1 - d1 - ; - add r0, r0, #0x4 ; op[pitch] = b1 + c1 ++ - add r1, r1, #0x4 ; op[pitch*2] = b1 - c1 ++ - ; - subs r5, r5, #0x1 ; -- - bne short_idct4x4llm_v6_scott_loop1 ; - ; - sub r1, r1, #16 ; reset output ptr - mov r5, #0x4 ; - mov r0, r1 ; input = output - ; -short_idct4x4llm_v6_scott_loop2 ; - ; - subs r5, r5, #0x1 ; - bne short_idct4x4llm_v6_scott_loop2 ; - ; - ldmia sp!, {r4 - r11, pc} ; - ENDP ; - ; -;******************************************************************************** -;******************************************************************************** -;******************************************************************************** - -;******************************************************************************** -;* void short_idct4x4llm_v6_dual(INT16 * input, INT16 * output, INT32 pitch) -;* r0 INT16 * input -;* r1 INT16 * output -;* r2 INT32 pitch -;* bench: -;******************************************************************************** - -|vp8_short_idct4x4llm_v6_dual| PROC ; cycles in out pit - ; - stmdb sp!, {r4-r11, lr} ; backup registers 1 backup - mov r3, #0x00004E00 ; cos - orr r3, r3, #0x0000007B ; cospi8sqrt2minus1 - mov r4, #0x00008A00 ; sin - orr r4, r4, #0x0000008C ; sinpi8sqrt2 - mov r5, #0x2 ; i=2 i -loop1_dual - ldr r6, [r0, #(4*2)] ; i5 | i4 5|4 - ldr r12, [r0, #(12*2)] ; i13 | i12 13|12 - ldr r14, [r0, #(8*2)] ; i9 | i8 9|8 - - smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c - smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 4c - smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s - smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 4s - pkhbt r7, r7, r9, lsl #16 ; 5c | 4c - smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 13c - pkhbt r8, r8, r10, lsl #16 ; 5s | 4s - uadd16 r6, r6, r7 ; 5c+5 | 4c+4 - smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 13s - smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 12c - smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 12s - subs r5, r5, #0x1 ; i-- -- - pkhbt r9, r9, r11, lsl #16 ; 13c | 12c - ldr r11, [r0], #0x4 ; i1 | i0 ++ 1|0 - pkhbt r10, r10, r7, lsl #16 ; 13s | 12s - uadd16 r7, r12, r9 ; 13c+13 | 12c+12 - usub16 r7, r8, r7 ; c c - uadd16 r6, r6, r10 ; d d - uadd16 r10, r11, r14 ; a a - usub16 r8, r11, r14 ; b b - uadd16 r9, r10, r6 ; a+d a+d - usub16 r10, r10, r6 ; a-d a-d - uadd16 r6, r8, r7 ; b+c b+c - usub16 r7, r8, r7 ; b-c b-c - str r6, [r1, r2] ; o5 | o4 - add r6, r2, r2 ; pitch * 2 p2 - str r7, [r1, r6] ; o9 | o8 - add r6, r6, r2 ; pitch * 3 p3 - str r10, [r1, r6] ; o13 | o12 - str r9, [r1], #0x4 ; o1 | o0 ++ - bne loop1_dual ; - mov r5, #0x2 ; i=2 i - sub r0, r1, #8 ; reset input/output i/o -loop2_dual - ldr r6, [r0, r2] ; i5 | i4 5|4 - ldr r1, [r0] ; i1 | i0 1|0 - ldr r12, [r0, #0x4] ; i3 | i2 3|2 - add r14, r2, #0x4 ; pitch + 2 p+2 - ldr r14, [r0, r14] ; i7 | i6 7|6 - smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c - smulwt r7, r3, r1 ; (ip[1] * cospi8sqrt2minus1) >> 16 1c - smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s - smulwt r8, r4, r1 ; (ip[1] * sinpi8sqrt2) >> 16 1s - pkhbt r11, r6, r1, lsl #16 ; i0 | i4 0|4 - pkhbt r7, r9, r7, lsl #16 ; 1c | 5c - pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 © tc1 - pkhtb r1, r1, r6, asr #16 ; i1 | i5 1|5 - uadd16 r1, r7, r1 ; 1c+1 | 5c+5 = temp2 (d) td2 - pkhbt r9, r14, r12, lsl #16 ; i2 | i6 2|6 - uadd16 r10, r11, r9 ; a a - usub16 r9, r11, r9 ; b b - pkhtb r6, r12, r14, asr #16 ; i3 | i7 3|7 - subs r5, r5, #0x1 ; i-- -- - smulwt r7, r3, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16 3c - smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 3s - smulwb r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 7c - smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 7s - - pkhbt r7, r12, r7, lsl #16 ; 3c | 7c - pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 (d) td1 - uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 (c) tc2 - usub16 r12, r8, r6 ; c (o1 | o5) c - uadd16 r6, r11, r1 ; d (o3 | o7) d - uadd16 r7, r10, r6 ; a+d a+d - mov r8, #0x4 ; set up 4's 4 - orr r8, r8, #0x40000 ; 4|4 - usub16 r6, r10, r6 ; a-d a-d - uadd16 r6, r6, r8 ; a-d+4 3|7 - uadd16 r7, r7, r8 ; a+d+4 0|4 - uadd16 r10, r9, r12 ; b+c b+c - usub16 r1, r9, r12 ; b-c b-c - uadd16 r10, r10, r8 ; b+c+4 1|5 - uadd16 r1, r1, r8 ; b-c+4 2|6 - mov r8, r10, asr #19 ; o1 >> 3 - strh r8, [r0, #2] ; o1 - mov r8, r1, asr #19 ; o2 >> 3 - strh r8, [r0, #4] ; o2 - mov r8, r6, asr #19 ; o3 >> 3 - strh r8, [r0, #6] ; o3 - mov r8, r7, asr #19 ; o0 >> 3 - strh r8, [r0], r2 ; o0 +p - sxth r10, r10 ; - mov r8, r10, asr #3 ; o5 >> 3 - strh r8, [r0, #2] ; o5 - sxth r1, r1 ; - mov r8, r1, asr #3 ; o6 >> 3 - strh r8, [r0, #4] ; o6 - sxth r6, r6 ; - mov r8, r6, asr #3 ; o7 >> 3 - strh r8, [r0, #6] ; o7 - sxth r7, r7 ; - mov r8, r7, asr #3 ; o4 >> 3 - strh r8, [r0], r2 ; o4 +p -;;;;; subs r5, r5, #0x1 ; i-- -- - bne loop2_dual ; - ; - ldmia sp!, {r4 - r11, pc} ; replace vars, return restore - ENDP - - END diff --git a/vp9/common/arm/armv6/vp9_iwalsh_v6.asm b/vp9/common/arm/armv6/vp9_iwalsh_v6.asm deleted file mode 100644 index 463bff0f5..000000000 --- a/vp9/common/arm/armv6/vp9_iwalsh_v6.asm +++ /dev/null @@ -1,152 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - EXPORT |vp8_short_inv_walsh4x4_v6| - EXPORT |vp8_short_inv_walsh4x4_1_v6| - - ARM - REQUIRE8 - PRESERVE8 - - AREA |.text|, CODE, READONLY ; name this block of code - -;short vp8_short_inv_walsh4x4_v6(short *input, short *output) -|vp8_short_inv_walsh4x4_v6| PROC - - stmdb sp!, {r4 - r11, lr} - - ldr r2, [r0], #4 ; [1 | 0] - ldr r3, [r0], #4 ; [3 | 2] - ldr r4, [r0], #4 ; [5 | 4] - ldr r5, [r0], #4 ; [7 | 6] - ldr r6, [r0], #4 ; [9 | 8] - ldr r7, [r0], #4 ; [11 | 10] - ldr r8, [r0], #4 ; [13 | 12] - ldr r9, [r0] ; [15 | 14] - - qadd16 r10, r2, r8 ; a1 [1+13 | 0+12] - qadd16 r11, r4, r6 ; b1 [5+9 | 4+8] - qsub16 r12, r4, r6 ; c1 [5-9 | 4-8] - qsub16 lr, r2, r8 ; d1 [1-13 | 0-12] - - qadd16 r2, r10, r11 ; a1 + b1 [1 | 0] - qadd16 r4, r12, lr ; c1 + d1 [5 | 4] - qsub16 r6, r10, r11 ; a1 - b1 [9 | 8] - qsub16 r8, lr, r12 ; d1 - c1 [13 | 12] - - qadd16 r10, r3, r9 ; a1 [3+15 | 2+14] - qadd16 r11, r5, r7 ; b1 [7+11 | 6+10] - qsub16 r12, r5, r7 ; c1 [7-11 | 6-10] - qsub16 lr, r3, r9 ; d1 [3-15 | 2-14] - - qadd16 r3, r10, r11 ; a1 + b1 [3 | 2] - qadd16 r5, r12, lr ; c1 + d1 [7 | 6] - qsub16 r7, r10, r11 ; a1 - b1 [11 | 10] - qsub16 r9, lr, r12 ; d1 - c1 [15 | 14] - - ; first transform complete - - qsubaddx r10, r2, r3 ; [c1|a1] [1-2 | 0+3] - qaddsubx r11, r2, r3 ; [b1|d1] [1+2 | 0-3] - qsubaddx r12, r4, r5 ; [c1|a1] [5-6 | 4+7] - qaddsubx lr, r4, r5 ; [b1|d1] [5+6 | 4-7] - - qaddsubx r2, r10, r11 ; [b2|c2] [c1+d1 | a1-b1] - qaddsubx r3, r11, r10 ; [a2|d2] [b1+a1 | d1-c1] - ldr r10, c0x00030003 - qaddsubx r4, r12, lr ; [b2|c2] [c1+d1 | a1-b1] - qaddsubx r5, lr, r12 ; [a2|d2] [b1+a1 | d1-c1] - - qadd16 r2, r2, r10 ; [b2+3|c2+3] - qadd16 r3, r3, r10 ; [a2+3|d2+3] - qadd16 r4, r4, r10 ; [b2+3|c2+3] - qadd16 r5, r5, r10 ; [a2+3|d2+3] - - asr r12, r2, #3 ; [1 | x] - pkhtb r12, r12, r3, asr #19; [1 | 0] - lsl lr, r3, #16 ; [~3 | x] - lsl r2, r2, #16 ; [~2 | x] - asr lr, lr, #3 ; [3 | x] - pkhtb lr, lr, r2, asr #19 ; [3 | 2] - - asr r2, r4, #3 ; [5 | x] - pkhtb r2, r2, r5, asr #19 ; [5 | 4] - lsl r3, r5, #16 ; [~7 | x] - lsl r4, r4, #16 ; [~6 | x] - asr r3, r3, #3 ; [7 | x] - pkhtb r3, r3, r4, asr #19 ; [7 | 6] - - str r12, [r1], #4 - str lr, [r1], #4 - str r2, [r1], #4 - str r3, [r1], #4 - - qsubaddx r2, r6, r7 ; [c1|a1] [9-10 | 8+11] - qaddsubx r3, r6, r7 ; [b1|d1] [9+10 | 8-11] - qsubaddx r4, r8, r9 ; [c1|a1] [13-14 | 12+15] - qaddsubx r5, r8, r9 ; [b1|d1] [13+14 | 12-15] - - qaddsubx r6, r2, r3 ; [b2|c2] [c1+d1 | a1-b1] - qaddsubx r7, r3, r2 ; [a2|d2] [b1+a1 | d1-c1] - qaddsubx r8, r4, r5 ; [b2|c2] [c1+d1 | a1-b1] - qaddsubx r9, r5, r4 ; [a2|d2] [b1+a1 | d1-c1] - - qadd16 r6, r6, r10 ; [b2+3|c2+3] - qadd16 r7, r7, r10 ; [a2+3|d2+3] - qadd16 r8, r8, r10 ; [b2+3|c2+3] - qadd16 r9, r9, r10 ; [a2+3|d2+3] - - asr r2, r6, #3 ; [9 | x] - pkhtb r2, r2, r7, asr #19 ; [9 | 8] - lsl r3, r7, #16 ; [~11| x] - lsl r4, r6, #16 ; [~10| x] - asr r3, r3, #3 ; [11 | x] - pkhtb r3, r3, r4, asr #19 ; [11 | 10] - - asr r4, r8, #3 ; [13 | x] - pkhtb r4, r4, r9, asr #19 ; [13 | 12] - lsl r5, r9, #16 ; [~15| x] - lsl r6, r8, #16 ; [~14| x] - asr r5, r5, #3 ; [15 | x] - pkhtb r5, r5, r6, asr #19 ; [15 | 14] - - str r2, [r1], #4 - str r3, [r1], #4 - str r4, [r1], #4 - str r5, [r1] - - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp8_short_inv_walsh4x4_v6| - - -;short vp8_short_inv_walsh4x4_1_v6(short *input, short *output) -|vp8_short_inv_walsh4x4_1_v6| PROC - - ldrsh r2, [r0] ; [0] - add r2, r2, #3 ; [0] + 3 - asr r2, r2, #3 ; a1 ([0]+3) >> 3 - lsl r2, r2, #16 ; [a1 | x] - orr r2, r2, r2, lsr #16 ; [a1 | a1] - - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1] - - bx lr - ENDP ; |vp8_short_inv_walsh4x4_1_v6| - -; Constant Pool -c0x00030003 DCD 0x00030003 - END diff --git a/vp9/common/arm/armv6/vp9_loopfilter_v6.asm b/vp9/common/arm/armv6/vp9_loopfilter_v6.asm deleted file mode 100644 index 37b54a39c..000000000 --- a/vp9/common/arm/armv6/vp9_loopfilter_v6.asm +++ /dev/null @@ -1,1282 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp9_loop_filter_horizontal_edge_armv6| - EXPORT |vp9_mbloop_filter_horizontal_edge_armv6| - EXPORT |vp9_loop_filter_vertical_edge_armv6| - EXPORT |vp9_mbloop_filter_vertical_edge_armv6| - - AREA |.text|, CODE, READONLY ; name this block of code - - MACRO - TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3 - ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3 - ; a0: 03 02 01 00 - ; a1: 13 12 11 10 - ; a2: 23 22 21 20 - ; a3: 33 32 31 30 - ; b3 b2 b1 b0 - - uxtb16 $b1, $a1 ; xx 12 xx 10 - uxtb16 $b0, $a0 ; xx 02 xx 00 - uxtb16 $b3, $a3 ; xx 32 xx 30 - uxtb16 $b2, $a2 ; xx 22 xx 20 - orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00 - orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20 - - uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11 - uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31 - uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01 - uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21 - orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01 - orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21 - - pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1 - pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3 - - pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0 - pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2 - MEND - - -src RN r0 -pstep RN r1 -count RN r5 - -;r0 unsigned char *src_ptr, -;r1 int src_pixel_step, -;r2 const char *blimit, -;r3 const char *limit, -;stack const char *thresh, -;stack int count - -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- -|vp9_loop_filter_horizontal_edge_armv6| PROC -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- - stmdb sp!, {r4 - r11, lr} - - sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines - ldr count, [sp, #40] ; count for 8-in-parallel - ldr r6, [sp, #36] ; load thresh address - sub sp, sp, #16 ; create temp buffer - - ldr r9, [src], pstep ; p3 - ldrb r4, [r2] ; blimit - ldr r10, [src], pstep ; p2 - ldrb r2, [r3] ; limit - ldr r11, [src], pstep ; p1 - orr r4, r4, r4, lsl #8 - ldrb r3, [r6] ; thresh - orr r2, r2, r2, lsl #8 - mov count, count, lsl #1 ; 4-in-parallel - orr r4, r4, r4, lsl #16 - orr r3, r3, r3, lsl #8 - orr r2, r2, r2, lsl #16 - orr r3, r3, r3, lsl #16 - -|Hnext8| - ; vp9_filter_mask() function - ; calculate breakout conditions - ldr r12, [src], pstep ; p0 - - uqsub8 r6, r9, r10 ; p3 - p2 - uqsub8 r7, r10, r9 ; p2 - p3 - uqsub8 r8, r10, r11 ; p2 - p1 - uqsub8 r10, r11, r10 ; p1 - p2 - - orr r6, r6, r7 ; abs (p3-p2) - orr r8, r8, r10 ; abs (p2-p1) - uqsub8 lr, r6, r2 ; compare to limit. lr: vp9_filter_mask - uqsub8 r8, r8, r2 ; compare to limit - uqsub8 r6, r11, r12 ; p1 - p0 - orr lr, lr, r8 - uqsub8 r7, r12, r11 ; p0 - p1 - ldr r9, [src], pstep ; q0 - ldr r10, [src], pstep ; q1 - orr r6, r6, r7 ; abs (p1-p0) - uqsub8 r7, r6, r2 ; compare to limit - uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later - orr lr, lr, r7 - - uqsub8 r6, r11, r10 ; p1 - q1 - uqsub8 r7, r10, r11 ; q1 - p1 - uqsub8 r11, r12, r9 ; p0 - q0 - uqsub8 r12, r9, r12 ; q0 - p0 - orr r6, r6, r7 ; abs (p1-q1) - ldr r7, c0x7F7F7F7F - orr r12, r11, r12 ; abs (p0-q0) - ldr r11, [src], pstep ; q2 - uqadd8 r12, r12, r12 ; abs (p0-q0) * 2 - and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2 - uqsub8 r7, r9, r10 ; q0 - q1 - uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 - uqsub8 r6, r10, r9 ; q1 - q0 - uqsub8 r12, r12, r4 ; compare to flimit - uqsub8 r9, r11, r10 ; q2 - q1 - - orr lr, lr, r12 - - ldr r12, [src], pstep ; q3 - uqsub8 r10, r10, r11 ; q1 - q2 - orr r6, r7, r6 ; abs (q1-q0) - orr r10, r9, r10 ; abs (q2-q1) - uqsub8 r7, r6, r2 ; compare to limit - uqsub8 r10, r10, r2 ; compare to limit - uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later - orr lr, lr, r7 - orr lr, lr, r10 - - uqsub8 r10, r12, r11 ; q3 - q2 - uqsub8 r9, r11, r12 ; q2 - q3 - - mvn r11, #0 ; r11 == -1 - - orr r10, r10, r9 ; abs (q3-q2) - uqsub8 r10, r10, r2 ; compare to limit - - mov r12, #0 - orr lr, lr, r10 - sub src, src, pstep, lsl #2 - - usub8 lr, r12, lr ; use usub8 instead of ssub8 - sel lr, r11, r12 ; filter mask: lr - - cmp lr, #0 - beq hskip_filter ; skip filtering - - sub src, src, pstep, lsl #1 ; move src pointer down by 6 lines - - ;vp8_hevmask() function - ;calculate high edge variance - orr r10, r6, r8 ; calculate vp8_hevmask - - ldr r7, [src], pstep ; p1 - - usub8 r10, r12, r10 ; use usub8 instead of ssub8 - sel r6, r12, r11 ; obtain vp8_hevmask: r6 - - ;vp9_filter() function - ldr r8, [src], pstep ; p0 - ldr r12, c0x80808080 - ldr r9, [src], pstep ; q0 - ldr r10, [src], pstep ; q1 - - eor r7, r7, r12 ; p1 offset to convert to a signed value - eor r8, r8, r12 ; p0 offset to convert to a signed value - eor r9, r9, r12 ; q0 offset to convert to a signed value - eor r10, r10, r12 ; q1 offset to convert to a signed value - - str r9, [sp] ; store qs0 temporarily - str r8, [sp, #4] ; store ps0 temporarily - str r10, [sp, #8] ; store qs1 temporarily - str r7, [sp, #12] ; store ps1 temporarily - - qsub8 r7, r7, r10 ; vp9_signed_char_clamp(ps1-qs1) - qsub8 r8, r9, r8 ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0)) - - and r7, r7, r6 ; vp9_filter (r7) &= hev - - qadd8 r7, r7, r8 - ldr r9, c0x03030303 ; r9 = 3 --modified for vp8 - - qadd8 r7, r7, r8 - ldr r10, c0x04040404 - - qadd8 r7, r7, r8 - and r7, r7, lr ; vp9_filter &= mask; - - ;modify code for vp8 -- Filter1 = vp9_filter (r7) - qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp9_signed_char_clamp(vp9_filter+3) - qadd8 r7 , r7 , r10 ; vp9_filter = vp9_signed_char_clamp(vp9_filter+4) - - mov r9, #0 - shadd8 r8 , r8 , r9 ; Filter2 >>= 3 - shadd8 r7 , r7 , r9 ; vp9_filter >>= 3 - shadd8 r8 , r8 , r9 - shadd8 r7 , r7 , r9 - shadd8 lr , r8 , r9 ; lr: Filter2 - shadd8 r7 , r7 , r9 ; r7: filter - - ;usub8 lr, r8, r10 ; s = (s==4)*-1 - ;sel lr, r11, r9 - ;usub8 r8, r10, r8 - ;sel r8, r11, r9 - ;and r8, r8, lr ; -1 for each element that equals 4 - - ;calculate output - ;qadd8 lr, r8, r7 ; u = vp9_signed_char_clamp(s + vp9_filter) - - ldr r8, [sp] ; load qs0 - ldr r9, [sp, #4] ; load ps0 - - ldr r10, c0x01010101 - - qsub8 r8 ,r8, r7 ; u = vp9_signed_char_clamp(qs0 - vp9_filter) - qadd8 r9, r9, lr ; u = vp9_signed_char_clamp(ps0 + Filter2) - - ;end of modification for vp8 - - mov lr, #0 - sadd8 r7, r7 , r10 ; vp9_filter += 1 - shadd8 r7, r7, lr ; vp9_filter >>= 1 - - ldr r11, [sp, #12] ; load ps1 - ldr r10, [sp, #8] ; load qs1 - - bic r7, r7, r6 ; vp9_filter &= ~hev - sub src, src, pstep, lsl #2 - - qadd8 r11, r11, r7 ; u = vp9_signed_char_clamp(ps1 + vp9_filter) - qsub8 r10, r10,r7 ; u = vp9_signed_char_clamp(qs1 - vp9_filter) - - eor r11, r11, r12 ; *op1 = u^0x80 - str r11, [src], pstep ; store op1 - eor r9, r9, r12 ; *op0 = u^0x80 - str r9, [src], pstep ; store op0 result - eor r8, r8, r12 ; *oq0 = u^0x80 - str r8, [src], pstep ; store oq0 result - eor r10, r10, r12 ; *oq1 = u^0x80 - str r10, [src], pstep ; store oq1 - - sub src, src, pstep, lsl #1 - -|hskip_filter| - add src, src, #4 - sub src, src, pstep, lsl #2 - - subs count, count, #1 - - ldrne r9, [src], pstep ; p3 - ldrne r10, [src], pstep ; p2 - ldrne r11, [src], pstep ; p1 - - bne Hnext8 - - add sp, sp, #16 - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp9_loop_filter_horizontal_edge_armv6| - - -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- -|vp8_mbloop_filter_horizontal_edge_armv6| PROC -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- - stmdb sp!, {r4 - r11, lr} - - sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines - ldr count, [sp, #40] ; count for 8-in-parallel - ldr r6, [sp, #36] ; load thresh address - sub sp, sp, #16 ; create temp buffer - - ldr r9, [src], pstep ; p3 - ldrb r4, [r2] ; blimit - ldr r10, [src], pstep ; p2 - ldrb r2, [r3] ; limit - ldr r11, [src], pstep ; p1 - orr r4, r4, r4, lsl #8 - ldrb r3, [r6] ; thresh - orr r2, r2, r2, lsl #8 - mov count, count, lsl #1 ; 4-in-parallel - orr r4, r4, r4, lsl #16 - orr r3, r3, r3, lsl #8 - orr r2, r2, r2, lsl #16 - orr r3, r3, r3, lsl #16 - -|MBHnext8| - - ; vp9_filter_mask() function - ; calculate breakout conditions - ldr r12, [src], pstep ; p0 - - uqsub8 r6, r9, r10 ; p3 - p2 - uqsub8 r7, r10, r9 ; p2 - p3 - uqsub8 r8, r10, r11 ; p2 - p1 - uqsub8 r10, r11, r10 ; p1 - p2 - - orr r6, r6, r7 ; abs (p3-p2) - orr r8, r8, r10 ; abs (p2-p1) - uqsub8 lr, r6, r2 ; compare to limit. lr: vp9_filter_mask - uqsub8 r8, r8, r2 ; compare to limit - - uqsub8 r6, r11, r12 ; p1 - p0 - orr lr, lr, r8 - uqsub8 r7, r12, r11 ; p0 - p1 - ldr r9, [src], pstep ; q0 - ldr r10, [src], pstep ; q1 - orr r6, r6, r7 ; abs (p1-p0) - uqsub8 r7, r6, r2 ; compare to limit - uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later - orr lr, lr, r7 - - uqsub8 r6, r11, r10 ; p1 - q1 - uqsub8 r7, r10, r11 ; q1 - p1 - uqsub8 r11, r12, r9 ; p0 - q0 - uqsub8 r12, r9, r12 ; q0 - p0 - orr r6, r6, r7 ; abs (p1-q1) - ldr r7, c0x7F7F7F7F - orr r12, r11, r12 ; abs (p0-q0) - ldr r11, [src], pstep ; q2 - uqadd8 r12, r12, r12 ; abs (p0-q0) * 2 - and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2 - uqsub8 r7, r9, r10 ; q0 - q1 - uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 - uqsub8 r6, r10, r9 ; q1 - q0 - uqsub8 r12, r12, r4 ; compare to flimit - uqsub8 r9, r11, r10 ; q2 - q1 - - orr lr, lr, r12 - - ldr r12, [src], pstep ; q3 - - uqsub8 r10, r10, r11 ; q1 - q2 - orr r6, r7, r6 ; abs (q1-q0) - orr r10, r9, r10 ; abs (q2-q1) - uqsub8 r7, r6, r2 ; compare to limit - uqsub8 r10, r10, r2 ; compare to limit - uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later - orr lr, lr, r7 - orr lr, lr, r10 - - uqsub8 r10, r12, r11 ; q3 - q2 - uqsub8 r9, r11, r12 ; q2 - q3 - - mvn r11, #0 ; r11 == -1 - - orr r10, r10, r9 ; abs (q3-q2) - uqsub8 r10, r10, r2 ; compare to limit - - mov r12, #0 - - orr lr, lr, r10 - - usub8 lr, r12, lr ; use usub8 instead of ssub8 - sel lr, r11, r12 ; filter mask: lr - - cmp lr, #0 - beq mbhskip_filter ; skip filtering - - ;vp8_hevmask() function - ;calculate high edge variance - sub src, src, pstep, lsl #2 ; move src pointer down by 6 lines - sub src, src, pstep, lsl #1 - - orr r10, r6, r8 - ldr r7, [src], pstep ; p1 - - usub8 r10, r12, r10 - sel r6, r12, r11 ; hev mask: r6 - - ;vp8_mbfilter() function - ;p2, q2 are only needed at the end. Don't need to load them in now. - ldr r8, [src], pstep ; p0 - ldr r12, c0x80808080 - ldr r9, [src], pstep ; q0 - ldr r10, [src] ; q1 - - eor r7, r7, r12 ; ps1 - eor r8, r8, r12 ; ps0 - eor r9, r9, r12 ; qs0 - eor r10, r10, r12 ; qs1 - - qsub8 r12, r9, r8 ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0)) - str r7, [sp, #12] ; store ps1 temporarily - qsub8 r7, r7, r10 ; vp9_signed_char_clamp(ps1-qs1) - str r10, [sp, #8] ; store qs1 temporarily - qadd8 r7, r7, r12 - str r9, [sp] ; store qs0 temporarily - qadd8 r7, r7, r12 - str r8, [sp, #4] ; store ps0 temporarily - qadd8 r7, r7, r12 ; vp9_filter: r7 - - ldr r10, c0x03030303 ; r10 = 3 --modified for vp8 - ldr r9, c0x04040404 - - and r7, r7, lr ; vp9_filter &= mask (lr is free) - - mov r12, r7 ; Filter2: r12 - and r12, r12, r6 ; Filter2 &= hev - - ;modify code for vp8 - ;save bottom 3 bits so that we round one side +4 and the other +3 - qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp9_signed_char_clamp(Filter2+4) - qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp9_signed_char_clamp(Filter2+3) - - mov r10, #0 - shadd8 r8 , r8 , r10 ; Filter1 >>= 3 - shadd8 r12 , r12 , r10 ; Filter2 >>= 3 - shadd8 r8 , r8 , r10 - shadd8 r12 , r12 , r10 - shadd8 r8 , r8 , r10 ; r8: Filter1 - shadd8 r12 , r12 , r10 ; r12: Filter2 - - ldr r9, [sp] ; load qs0 - ldr r11, [sp, #4] ; load ps0 - - qsub8 r9 , r9, r8 ; qs0 = vp9_signed_char_clamp(qs0 - Filter1) - qadd8 r11, r11, r12 ; ps0 = vp9_signed_char_clamp(ps0 + Filter2) - - ;save bottom 3 bits so that we round one side +4 and the other +3 - ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8) - ;qadd8 r12 , r12 , r9 ; Filter2 = vp9_signed_char_clamp(Filter2+4) - ;mov r10, #0 - ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3 - ;usub8 lr, r8, r9 ; s = (s==4)*-1 - ;sel lr, r11, r10 - ;shadd8 r12 , r12 , r10 - ;usub8 r8, r9, r8 - ;sel r8, r11, r10 - ;ldr r9, [sp] ; load qs0 - ;ldr r11, [sp, #4] ; load ps0 - ;shadd8 r12 , r12 , r10 - ;and r8, r8, lr ; -1 for each element that equals 4 - ;qadd8 r10, r8, r12 ; u = vp9_signed_char_clamp(s + Filter2) - ;qsub8 r9 , r9, r12 ; qs0 = vp9_signed_char_clamp(qs0 - Filter2) - ;qadd8 r11, r11, r10 ; ps0 = vp9_signed_char_clamp(ps0 + u) - - ;end of modification for vp8 - - bic r12, r7, r6 ; vp9_filter &= ~hev ( r6 is free) - ;mov r12, r7 - - ;roughly 3/7th difference across boundary - mov lr, #0x1b ; 27 - mov r7, #0x3f ; 63 - - sxtb16 r6, r12 - sxtb16 r10, r12, ror #8 - smlabb r8, r6, lr, r7 - smlatb r6, r6, lr, r7 - smlabb r7, r10, lr, r7 - smultb r10, r10, lr - ssat r8, #8, r8, asr #7 - ssat r6, #8, r6, asr #7 - add r10, r10, #63 - ssat r7, #8, r7, asr #7 - ssat r10, #8, r10, asr #7 - - ldr lr, c0x80808080 - - pkhbt r6, r8, r6, lsl #16 - pkhbt r10, r7, r10, lsl #16 - uxtb16 r6, r6 - uxtb16 r10, r10 - - sub src, src, pstep - - orr r10, r6, r10, lsl #8 ; u = vp9_signed_char_clamp((63 + Filter2 * 27)>>7) - - qsub8 r8, r9, r10 ; s = vp9_signed_char_clamp(qs0 - u) - qadd8 r10, r11, r10 ; s = vp9_signed_char_clamp(ps0 + u) - eor r8, r8, lr ; *oq0 = s^0x80 - str r8, [src] ; store *oq0 - sub src, src, pstep - eor r10, r10, lr ; *op0 = s^0x80 - str r10, [src] ; store *op0 - - ;roughly 2/7th difference across boundary - mov lr, #0x12 ; 18 - mov r7, #0x3f ; 63 - - sxtb16 r6, r12 - sxtb16 r10, r12, ror #8 - smlabb r8, r6, lr, r7 - smlatb r6, r6, lr, r7 - smlabb r9, r10, lr, r7 - smlatb r10, r10, lr, r7 - ssat r8, #8, r8, asr #7 - ssat r6, #8, r6, asr #7 - ssat r9, #8, r9, asr #7 - ssat r10, #8, r10, asr #7 - - ldr lr, c0x80808080 - - pkhbt r6, r8, r6, lsl #16 - pkhbt r10, r9, r10, lsl #16 - - ldr r9, [sp, #8] ; load qs1 - ldr r11, [sp, #12] ; load ps1 - - uxtb16 r6, r6 - uxtb16 r10, r10 - - sub src, src, pstep - - orr r10, r6, r10, lsl #8 ; u = vp9_signed_char_clamp((63 + Filter2 * 18)>>7) - - qadd8 r11, r11, r10 ; s = vp9_signed_char_clamp(ps1 + u) - qsub8 r8, r9, r10 ; s = vp9_signed_char_clamp(qs1 - u) - eor r11, r11, lr ; *op1 = s^0x80 - str r11, [src], pstep ; store *op1 - eor r8, r8, lr ; *oq1 = s^0x80 - add src, src, pstep, lsl #1 - - mov r7, #0x3f ; 63 - - str r8, [src], pstep ; store *oq1 - - ;roughly 1/7th difference across boundary - mov lr, #0x9 ; 9 - ldr r9, [src] ; load q2 - - sxtb16 r6, r12 - sxtb16 r10, r12, ror #8 - smlabb r8, r6, lr, r7 - smlatb r6, r6, lr, r7 - smlabb r12, r10, lr, r7 - smlatb r10, r10, lr, r7 - ssat r8, #8, r8, asr #7 - ssat r6, #8, r6, asr #7 - ssat r12, #8, r12, asr #7 - ssat r10, #8, r10, asr #7 - - sub src, src, pstep, lsl #2 - - pkhbt r6, r8, r6, lsl #16 - pkhbt r10, r12, r10, lsl #16 - - sub src, src, pstep - ldr lr, c0x80808080 - - ldr r11, [src] ; load p2 - - uxtb16 r6, r6 - uxtb16 r10, r10 - - eor r9, r9, lr - eor r11, r11, lr - - orr r10, r6, r10, lsl #8 ; u = vp9_signed_char_clamp((63 + Filter2 * 9)>>7) - - qadd8 r8, r11, r10 ; s = vp9_signed_char_clamp(ps2 + u) - qsub8 r10, r9, r10 ; s = vp9_signed_char_clamp(qs2 - u) - eor r8, r8, lr ; *op2 = s^0x80 - str r8, [src], pstep, lsl #2 ; store *op2 - add src, src, pstep - eor r10, r10, lr ; *oq2 = s^0x80 - str r10, [src], pstep, lsl #1 ; store *oq2 - -|mbhskip_filter| - add src, src, #4 - sub src, src, pstep, lsl #3 - subs count, count, #1 - - ldrne r9, [src], pstep ; p3 - ldrne r10, [src], pstep ; p2 - ldrne r11, [src], pstep ; p1 - - bne MBHnext8 - - add sp, sp, #16 - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp8_mbloop_filter_horizontal_edge_armv6| - - -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- -|vp9_loop_filter_vertical_edge_armv6| PROC -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- - stmdb sp!, {r4 - r11, lr} - - sub src, src, #4 ; move src pointer down by 4 - ldr count, [sp, #40] ; count for 8-in-parallel - ldr r12, [sp, #36] ; load thresh address - sub sp, sp, #16 ; create temp buffer - - ldr r6, [src], pstep ; load source data - ldrb r4, [r2] ; blimit - ldr r7, [src], pstep - ldrb r2, [r3] ; limit - ldr r8, [src], pstep - orr r4, r4, r4, lsl #8 - ldrb r3, [r12] ; thresh - orr r2, r2, r2, lsl #8 - ldr lr, [src], pstep - mov count, count, lsl #1 ; 4-in-parallel - orr r4, r4, r4, lsl #16 - orr r3, r3, r3, lsl #8 - orr r2, r2, r2, lsl #16 - orr r3, r3, r3, lsl #16 - -|Vnext8| - - ; vp9_filter_mask() function - ; calculate breakout conditions - ; transpose the source data for 4-in-parallel operation - TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 - - uqsub8 r7, r9, r10 ; p3 - p2 - uqsub8 r8, r10, r9 ; p2 - p3 - uqsub8 r9, r10, r11 ; p2 - p1 - uqsub8 r10, r11, r10 ; p1 - p2 - orr r7, r7, r8 ; abs (p3-p2) - orr r10, r9, r10 ; abs (p2-p1) - uqsub8 lr, r7, r2 ; compare to limit. lr: vp9_filter_mask - uqsub8 r10, r10, r2 ; compare to limit - - sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines - - orr lr, lr, r10 - - uqsub8 r6, r11, r12 ; p1 - p0 - uqsub8 r7, r12, r11 ; p0 - p1 - add src, src, #4 ; move src pointer up by 4 - orr r6, r6, r7 ; abs (p1-p0) - str r11, [sp, #12] ; save p1 - uqsub8 r10, r6, r2 ; compare to limit - uqsub8 r11, r6, r3 ; compare to thresh - orr lr, lr, r10 - - ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now - ; transpose the source data for 4-in-parallel operation - ldr r6, [src], pstep ; load source data - str r11, [sp] ; push r11 to stack - ldr r7, [src], pstep - str r12, [sp, #4] ; save current reg before load q0 - q3 data - ldr r8, [src], pstep - str lr, [sp, #8] - ldr lr, [src], pstep - - TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 - - ldr lr, [sp, #8] ; load back (f)limit accumulator - - uqsub8 r6, r12, r11 ; q3 - q2 - uqsub8 r7, r11, r12 ; q2 - q3 - uqsub8 r12, r11, r10 ; q2 - q1 - uqsub8 r11, r10, r11 ; q1 - q2 - orr r6, r6, r7 ; abs (q3-q2) - orr r7, r12, r11 ; abs (q2-q1) - uqsub8 r6, r6, r2 ; compare to limit - uqsub8 r7, r7, r2 ; compare to limit - ldr r11, [sp, #4] ; load back p0 - ldr r12, [sp, #12] ; load back p1 - orr lr, lr, r6 - orr lr, lr, r7 - - uqsub8 r6, r11, r9 ; p0 - q0 - uqsub8 r7, r9, r11 ; q0 - p0 - uqsub8 r8, r12, r10 ; p1 - q1 - uqsub8 r11, r10, r12 ; q1 - p1 - orr r6, r6, r7 ; abs (p0-q0) - ldr r7, c0x7F7F7F7F - orr r8, r8, r11 ; abs (p1-q1) - uqadd8 r6, r6, r6 ; abs (p0-q0) * 2 - and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2 - uqsub8 r11, r10, r9 ; q1 - q0 - uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 - uqsub8 r12, r9, r10 ; q0 - q1 - uqsub8 r6, r6, r4 ; compare to flimit - - orr r9, r11, r12 ; abs (q1-q0) - uqsub8 r8, r9, r2 ; compare to limit - uqsub8 r10, r9, r3 ; compare to thresh - orr lr, lr, r6 - orr lr, lr, r8 - - mvn r11, #0 ; r11 == -1 - mov r12, #0 - - usub8 lr, r12, lr - ldr r9, [sp] ; load the compared result - sel lr, r11, r12 ; filter mask: lr - - cmp lr, #0 - beq vskip_filter ; skip filtering - - ;vp8_hevmask() function - ;calculate high edge variance - - sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines - - orr r9, r9, r10 - - ldrh r7, [src, #-2] - ldrh r8, [src], pstep - - usub8 r9, r12, r9 - sel r6, r12, r11 ; hev mask: r6 - - ;vp9_filter() function - ; load soure data to r6, r11, r12, lr - ldrh r9, [src, #-2] - ldrh r10, [src], pstep - - pkhbt r12, r7, r8, lsl #16 - - ldrh r7, [src, #-2] - ldrh r8, [src], pstep - - pkhbt r11, r9, r10, lsl #16 - - ldrh r9, [src, #-2] - ldrh r10, [src], pstep - - ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first - str r6, [sp] - str lr, [sp, #4] - - pkhbt r6, r7, r8, lsl #16 - pkhbt lr, r9, r10, lsl #16 - - ;transpose r12, r11, r6, lr to r7, r8, r9, r10 - TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10 - - ;load back hev_mask r6 and filter_mask lr - ldr r12, c0x80808080 - ldr r6, [sp] - ldr lr, [sp, #4] - - eor r7, r7, r12 ; p1 offset to convert to a signed value - eor r8, r8, r12 ; p0 offset to convert to a signed value - eor r9, r9, r12 ; q0 offset to convert to a signed value - eor r10, r10, r12 ; q1 offset to convert to a signed value - - str r9, [sp] ; store qs0 temporarily - str r8, [sp, #4] ; store ps0 temporarily - str r10, [sp, #8] ; store qs1 temporarily - str r7, [sp, #12] ; store ps1 temporarily - - qsub8 r7, r7, r10 ; vp9_signed_char_clamp(ps1-qs1) - qsub8 r8, r9, r8 ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0)) - - and r7, r7, r6 ; vp9_filter (r7) &= hev (r7 : filter) - - qadd8 r7, r7, r8 - ldr r9, c0x03030303 ; r9 = 3 --modified for vp8 - - qadd8 r7, r7, r8 - ldr r10, c0x04040404 - - qadd8 r7, r7, r8 - ;mvn r11, #0 ; r11 == -1 - - and r7, r7, lr ; vp9_filter &= mask - - ;modify code for vp8 -- Filter1 = vp9_filter (r7) - qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp9_signed_char_clamp(vp9_filter+3) - qadd8 r7 , r7 , r10 ; vp9_filter = vp9_signed_char_clamp(vp9_filter+4) - - mov r9, #0 - shadd8 r8 , r8 , r9 ; Filter2 >>= 3 - shadd8 r7 , r7 , r9 ; vp9_filter >>= 3 - shadd8 r8 , r8 , r9 - shadd8 r7 , r7 , r9 - shadd8 lr , r8 , r9 ; lr: filter2 - shadd8 r7 , r7 , r9 ; r7: filter - - ;usub8 lr, r8, r10 ; s = (s==4)*-1 - ;sel lr, r11, r9 - ;usub8 r8, r10, r8 - ;sel r8, r11, r9 - ;and r8, r8, lr ; -1 for each element that equals 4 -- r8: s - - ;calculate output - ;qadd8 lr, r8, r7 ; u = vp9_signed_char_clamp(s + vp9_filter) - - ldr r8, [sp] ; load qs0 - ldr r9, [sp, #4] ; load ps0 - - ldr r10, c0x01010101 - - qsub8 r8, r8, r7 ; u = vp9_signed_char_clamp(qs0 - vp9_filter) - qadd8 r9, r9, lr ; u = vp9_signed_char_clamp(ps0 + Filter2) - ;end of modification for vp8 - - eor r8, r8, r12 - eor r9, r9, r12 - - mov lr, #0 - - sadd8 r7, r7, r10 - shadd8 r7, r7, lr - - ldr r10, [sp, #8] ; load qs1 - ldr r11, [sp, #12] ; load ps1 - - bic r7, r7, r6 ; r7: vp9_filter - - qsub8 r10 , r10, r7 ; u = vp9_signed_char_clamp(qs1 - vp9_filter) - qadd8 r11, r11, r7 ; u = vp9_signed_char_clamp(ps1 + vp9_filter) - eor r10, r10, r12 - eor r11, r11, r12 - - sub src, src, pstep, lsl #2 - - ;we can use TRANSPOSE_MATRIX macro to transpose output - input: q1, q0, p0, p1 - ;output is b0, b1, b2, b3 - ;b0: 03 02 01 00 - ;b1: 13 12 11 10 - ;b2: 23 22 21 20 - ;b3: 33 32 31 30 - ; p1 p0 q0 q1 - ; (a3 a2 a1 a0) - TRANSPOSE_MATRIX r11, r9, r8, r10, r6, r7, r12, lr - - strh r6, [src, #-2] ; store the result - mov r6, r6, lsr #16 - strh r6, [src], pstep - - strh r7, [src, #-2] - mov r7, r7, lsr #16 - strh r7, [src], pstep - - strh r12, [src, #-2] - mov r12, r12, lsr #16 - strh r12, [src], pstep - - strh lr, [src, #-2] - mov lr, lr, lsr #16 - strh lr, [src], pstep - -|vskip_filter| - sub src, src, #4 - subs count, count, #1 - - ldrne r6, [src], pstep ; load source data - ldrne r7, [src], pstep - ldrne r8, [src], pstep - ldrne lr, [src], pstep - - bne Vnext8 - - add sp, sp, #16 - - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp9_loop_filter_vertical_edge_armv6| - - - -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- -|vp8_mbloop_filter_vertical_edge_armv6| PROC -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- - stmdb sp!, {r4 - r11, lr} - - sub src, src, #4 ; move src pointer down by 4 - ldr count, [sp, #40] ; count for 8-in-parallel - ldr r12, [sp, #36] ; load thresh address - pld [src, #23] ; preload for next block - sub sp, sp, #16 ; create temp buffer - - ldr r6, [src], pstep ; load source data - ldrb r4, [r2] ; blimit - pld [src, #23] - ldr r7, [src], pstep - ldrb r2, [r3] ; limit - pld [src, #23] - ldr r8, [src], pstep - orr r4, r4, r4, lsl #8 - ldrb r3, [r12] ; thresh - orr r2, r2, r2, lsl #8 - pld [src, #23] - ldr lr, [src], pstep - mov count, count, lsl #1 ; 4-in-parallel - orr r4, r4, r4, lsl #16 - orr r3, r3, r3, lsl #8 - orr r2, r2, r2, lsl #16 - orr r3, r3, r3, lsl #16 - -|MBVnext8| - ; vp9_filter_mask() function - ; calculate breakout conditions - ; transpose the source data for 4-in-parallel operation - TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 - - uqsub8 r7, r9, r10 ; p3 - p2 - uqsub8 r8, r10, r9 ; p2 - p3 - uqsub8 r9, r10, r11 ; p2 - p1 - uqsub8 r10, r11, r10 ; p1 - p2 - orr r7, r7, r8 ; abs (p3-p2) - orr r10, r9, r10 ; abs (p2-p1) - uqsub8 lr, r7, r2 ; compare to limit. lr: vp9_filter_mask - uqsub8 r10, r10, r2 ; compare to limit - - sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines - - orr lr, lr, r10 - - uqsub8 r6, r11, r12 ; p1 - p0 - uqsub8 r7, r12, r11 ; p0 - p1 - add src, src, #4 ; move src pointer up by 4 - orr r6, r6, r7 ; abs (p1-p0) - str r11, [sp, #12] ; save p1 - uqsub8 r10, r6, r2 ; compare to limit - uqsub8 r11, r6, r3 ; compare to thresh - orr lr, lr, r10 - - ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now - ; transpose the source data for 4-in-parallel operation - ldr r6, [src], pstep ; load source data - str r11, [sp] ; push r11 to stack - ldr r7, [src], pstep - str r12, [sp, #4] ; save current reg before load q0 - q3 data - ldr r8, [src], pstep - str lr, [sp, #8] - ldr lr, [src], pstep - - - TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 - - ldr lr, [sp, #8] ; load back (f)limit accumulator - - uqsub8 r6, r12, r11 ; q3 - q2 - uqsub8 r7, r11, r12 ; q2 - q3 - uqsub8 r12, r11, r10 ; q2 - q1 - uqsub8 r11, r10, r11 ; q1 - q2 - orr r6, r6, r7 ; abs (q3-q2) - orr r7, r12, r11 ; abs (q2-q1) - uqsub8 r6, r6, r2 ; compare to limit - uqsub8 r7, r7, r2 ; compare to limit - ldr r11, [sp, #4] ; load back p0 - ldr r12, [sp, #12] ; load back p1 - orr lr, lr, r6 - orr lr, lr, r7 - - uqsub8 r6, r11, r9 ; p0 - q0 - uqsub8 r7, r9, r11 ; q0 - p0 - uqsub8 r8, r12, r10 ; p1 - q1 - uqsub8 r11, r10, r12 ; q1 - p1 - orr r6, r6, r7 ; abs (p0-q0) - ldr r7, c0x7F7F7F7F - orr r8, r8, r11 ; abs (p1-q1) - uqadd8 r6, r6, r6 ; abs (p0-q0) * 2 - and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2 - uqsub8 r11, r10, r9 ; q1 - q0 - uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 - uqsub8 r12, r9, r10 ; q0 - q1 - uqsub8 r6, r6, r4 ; compare to flimit - - orr r9, r11, r12 ; abs (q1-q0) - uqsub8 r8, r9, r2 ; compare to limit - uqsub8 r10, r9, r3 ; compare to thresh - orr lr, lr, r6 - orr lr, lr, r8 - - mvn r11, #0 ; r11 == -1 - mov r12, #0 - - usub8 lr, r12, lr - ldr r9, [sp] ; load the compared result - sel lr, r11, r12 ; filter mask: lr - - cmp lr, #0 - beq mbvskip_filter ; skip filtering - - - - ;vp8_hevmask() function - ;calculate high edge variance - - sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines - - orr r9, r9, r10 - - ldrh r7, [src, #-2] - ldrh r8, [src], pstep - - usub8 r9, r12, r9 - sel r6, r12, r11 ; hev mask: r6 - - - ; vp8_mbfilter() function - ; p2, q2 are only needed at the end. Don't need to load them in now. - ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first - ; load soure data to r6, r11, r12, lr - ldrh r9, [src, #-2] - ldrh r10, [src], pstep - - pkhbt r12, r7, r8, lsl #16 - - ldrh r7, [src, #-2] - ldrh r8, [src], pstep - - pkhbt r11, r9, r10, lsl #16 - - ldrh r9, [src, #-2] - ldrh r10, [src], pstep - - str r6, [sp] ; save r6 - str lr, [sp, #4] ; save lr - - pkhbt r6, r7, r8, lsl #16 - pkhbt lr, r9, r10, lsl #16 - - ;transpose r12, r11, r6, lr to p1, p0, q0, q1 - TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10 - - ;load back hev_mask r6 and filter_mask lr - ldr r12, c0x80808080 - ldr r6, [sp] - ldr lr, [sp, #4] - - eor r7, r7, r12 ; ps1 - eor r8, r8, r12 ; ps0 - eor r9, r9, r12 ; qs0 - eor r10, r10, r12 ; qs1 - - qsub8 r12, r9, r8 ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0)) - str r7, [sp, #12] ; store ps1 temporarily - qsub8 r7, r7, r10 ; vp9_signed_char_clamp(ps1-qs1) - str r10, [sp, #8] ; store qs1 temporarily - qadd8 r7, r7, r12 - str r9, [sp] ; store qs0 temporarily - qadd8 r7, r7, r12 - str r8, [sp, #4] ; store ps0 temporarily - qadd8 r7, r7, r12 ; vp9_filter: r7 - - ldr r10, c0x03030303 ; r10 = 3 --modified for vp8 - ldr r9, c0x04040404 - ;mvn r11, #0 ; r11 == -1 - - and r7, r7, lr ; vp9_filter &= mask (lr is free) - - mov r12, r7 ; Filter2: r12 - and r12, r12, r6 ; Filter2 &= hev - - ;modify code for vp8 - ;save bottom 3 bits so that we round one side +4 and the other +3 - qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp9_signed_char_clamp(Filter2+4) - qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp9_signed_char_clamp(Filter2+3) - - mov r10, #0 - shadd8 r8 , r8 , r10 ; Filter1 >>= 3 - shadd8 r12 , r12 , r10 ; Filter2 >>= 3 - shadd8 r8 , r8 , r10 - shadd8 r12 , r12 , r10 - shadd8 r8 , r8 , r10 ; r8: Filter1 - shadd8 r12 , r12 , r10 ; r12: Filter2 - - ldr r9, [sp] ; load qs0 - ldr r11, [sp, #4] ; load ps0 - - qsub8 r9 , r9, r8 ; qs0 = vp9_signed_char_clamp(qs0 - Filter1) - qadd8 r11, r11, r12 ; ps0 = vp9_signed_char_clamp(ps0 + Filter2) - - ;save bottom 3 bits so that we round one side +4 and the other +3 - ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8) - ;qadd8 r12 , r12 , r9 ; Filter2 = vp9_signed_char_clamp(Filter2+4) - ;mov r10, #0 - ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3 - ;usub8 lr, r8, r9 ; s = (s==4)*-1 - ;sel lr, r11, r10 - ;shadd8 r12 , r12 , r10 - ;usub8 r8, r9, r8 - ;sel r8, r11, r10 - ;ldr r9, [sp] ; load qs0 - ;ldr r11, [sp, #4] ; load ps0 - ;shadd8 r12 , r12 , r10 - ;and r8, r8, lr ; -1 for each element that equals 4 - ;qadd8 r10, r8, r12 ; u = vp9_signed_char_clamp(s + Filter2) - ;qsub8 r9 , r9, r12 ; qs0 = vp9_signed_char_clamp(qs0 - Filter2) - ;qadd8 r11, r11, r10 ; ps0 = vp9_signed_char_clamp(ps0 + u) - - ;end of modification for vp8 - - bic r12, r7, r6 ;vp9_filter &= ~hev ( r6 is free) - ;mov r12, r7 - - ;roughly 3/7th difference across boundary - mov lr, #0x1b ; 27 - mov r7, #0x3f ; 63 - - sxtb16 r6, r12 - sxtb16 r10, r12, ror #8 - smlabb r8, r6, lr, r7 - smlatb r6, r6, lr, r7 - smlabb r7, r10, lr, r7 - smultb r10, r10, lr - ssat r8, #8, r8, asr #7 - ssat r6, #8, r6, asr #7 - add r10, r10, #63 - ssat r7, #8, r7, asr #7 - ssat r10, #8, r10, asr #7 - - ldr lr, c0x80808080 - - pkhbt r6, r8, r6, lsl #16 - pkhbt r10, r7, r10, lsl #16 - uxtb16 r6, r6 - uxtb16 r10, r10 - - sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines - - orr r10, r6, r10, lsl #8 ; u = vp9_signed_char_clamp((63 + Filter2 * 27)>>7) - - qsub8 r8, r9, r10 ; s = vp9_signed_char_clamp(qs0 - u) - qadd8 r10, r11, r10 ; s = vp9_signed_char_clamp(ps0 + u) - eor r8, r8, lr ; *oq0 = s^0x80 - eor r10, r10, lr ; *op0 = s^0x80 - - strb r10, [src, #-1] ; store op0 result - strb r8, [src], pstep ; store oq0 result - mov r10, r10, lsr #8 - mov r8, r8, lsr #8 - strb r10, [src, #-1] - strb r8, [src], pstep - mov r10, r10, lsr #8 - mov r8, r8, lsr #8 - strb r10, [src, #-1] - strb r8, [src], pstep - mov r10, r10, lsr #8 - mov r8, r8, lsr #8 - strb r10, [src, #-1] - strb r8, [src], pstep - - ;roughly 2/7th difference across boundary - mov lr, #0x12 ; 18 - mov r7, #0x3f ; 63 - - sxtb16 r6, r12 - sxtb16 r10, r12, ror #8 - smlabb r8, r6, lr, r7 - smlatb r6, r6, lr, r7 - smlabb r9, r10, lr, r7 - - smlatb r10, r10, lr, r7 - ssat r8, #8, r8, asr #7 - ssat r6, #8, r6, asr #7 - ssat r9, #8, r9, asr #7 - ssat r10, #8, r10, asr #7 - - sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines - - pkhbt r6, r8, r6, lsl #16 - pkhbt r10, r9, r10, lsl #16 - - ldr r9, [sp, #8] ; load qs1 - ldr r11, [sp, #12] ; load ps1 - ldr lr, c0x80808080 - - uxtb16 r6, r6 - uxtb16 r10, r10 - - add src, src, #2 - - orr r10, r6, r10, lsl #8 ; u = vp9_signed_char_clamp((63 + Filter2 * 18)>>7) - - qsub8 r8, r9, r10 ; s = vp9_signed_char_clamp(qs1 - u) - qadd8 r10, r11, r10 ; s = vp9_signed_char_clamp(ps1 + u) - eor r8, r8, lr ; *oq1 = s^0x80 - eor r10, r10, lr ; *op1 = s^0x80 - - ldrb r11, [src, #-5] ; load p2 for 1/7th difference across boundary - strb r10, [src, #-4] ; store op1 - strb r8, [src, #-1] ; store oq1 - ldrb r9, [src], pstep ; load q2 for 1/7th difference across boundary - - mov r10, r10, lsr #8 - mov r8, r8, lsr #8 - - ldrb r6, [src, #-5] - strb r10, [src, #-4] - strb r8, [src, #-1] - ldrb r7, [src], pstep - - mov r10, r10, lsr #8 - mov r8, r8, lsr #8 - orr r11, r11, r6, lsl #8 - orr r9, r9, r7, lsl #8 - - ldrb r6, [src, #-5] - strb r10, [src, #-4] - strb r8, [src, #-1] - ldrb r7, [src], pstep - - mov r10, r10, lsr #8 - mov r8, r8, lsr #8 - orr r11, r11, r6, lsl #16 - orr r9, r9, r7, lsl #16 - - ldrb r6, [src, #-5] - strb r10, [src, #-4] - strb r8, [src, #-1] - ldrb r7, [src], pstep - orr r11, r11, r6, lsl #24 - orr r9, r9, r7, lsl #24 - - ;roughly 1/7th difference across boundary - eor r9, r9, lr - eor r11, r11, lr - - mov lr, #0x9 ; 9 - mov r7, #0x3f ; 63 - - sxtb16 r6, r12 - sxtb16 r10, r12, ror #8 - smlabb r8, r6, lr, r7 - smlatb r6, r6, lr, r7 - smlabb r12, r10, lr, r7 - smlatb r10, r10, lr, r7 - ssat r8, #8, r8, asr #7 - ssat r6, #8, r6, asr #7 - ssat r12, #8, r12, asr #7 - ssat r10, #8, r10, asr #7 - - sub src, src, pstep, lsl #2 - - pkhbt r6, r8, r6, lsl #16 - pkhbt r10, r12, r10, lsl #16 - - uxtb16 r6, r6 - uxtb16 r10, r10 - - ldr lr, c0x80808080 - - orr r10, r6, r10, lsl #8 ; u = vp9_signed_char_clamp((63 + Filter2 * 9)>>7) - - qadd8 r8, r11, r10 ; s = vp9_signed_char_clamp(ps2 + u) - qsub8 r10, r9, r10 ; s = vp9_signed_char_clamp(qs2 - u) - eor r8, r8, lr ; *op2 = s^0x80 - eor r10, r10, lr ; *oq2 = s^0x80 - - strb r8, [src, #-5] ; store *op2 - strb r10, [src], pstep ; store *oq2 - mov r8, r8, lsr #8 - mov r10, r10, lsr #8 - strb r8, [src, #-5] - strb r10, [src], pstep - mov r8, r8, lsr #8 - mov r10, r10, lsr #8 - strb r8, [src, #-5] - strb r10, [src], pstep - mov r8, r8, lsr #8 - mov r10, r10, lsr #8 - strb r8, [src, #-5] - strb r10, [src], pstep - - ;adjust src pointer for next loop - sub src, src, #2 - -|mbvskip_filter| - sub src, src, #4 - subs count, count, #1 - - pld [src, #23] ; preload for next block - ldrne r6, [src], pstep ; load source data - pld [src, #23] - ldrne r7, [src], pstep - pld [src, #23] - ldrne r8, [src], pstep - pld [src, #23] - ldrne lr, [src], pstep - - bne MBVnext8 - - add sp, sp, #16 - - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp8_mbloop_filter_vertical_edge_armv6| - -; Constant Pool -c0x80808080 DCD 0x80808080 -c0x03030303 DCD 0x03030303 -c0x04040404 DCD 0x04040404 -c0x01010101 DCD 0x01010101 -c0x7F7F7F7F DCD 0x7F7F7F7F - - END diff --git a/vp9/common/arm/armv6/vp9_recon_v6.asm b/vp9/common/arm/armv6/vp9_recon_v6.asm deleted file mode 100644 index 99c7bcf2d..000000000 --- a/vp9/common/arm/armv6/vp9_recon_v6.asm +++ /dev/null @@ -1,281 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_recon_b_armv6| - EXPORT |vp8_recon2b_armv6| - EXPORT |vp8_recon4b_armv6| - - AREA |.text|, CODE, READONLY ; name this block of code -prd RN r0 -dif RN r1 -dst RN r2 -stride RN r3 - -;void recon_b(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride) -; R0 char* pred_ptr -; R1 short * dif_ptr -; R2 char * dst_ptr -; R3 int stride - -; Description: -; Loop through the block adding the Pred and Diff together. Clamp and then -; store back into the Dst. - -; Restrictions : -; all buffers are expected to be 4 byte aligned coming in and -; going out. -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -; -; -; -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -|vp8_recon_b_armv6| PROC - stmdb sp!, {r4 - r9, lr} - - ;0, 1, 2, 3 - ldr r4, [prd], #16 ; 3 | 2 | 1 | 0 - ldr r6, [dif, #0] ; 1 | 0 - ldr r7, [dif, #4] ; 3 | 2 - - pkhbt r8, r6, r7, lsl #16 ; 2 | 0 - pkhtb r9, r7, r6, asr #16 ; 3 | 1 - - uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0 - uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1 - - usat16 r8, #8, r8 - usat16 r9, #8, r9 - add dif, dif, #32 - orr r8, r8, r9, lsl #8 - - str r8, [dst], stride - - ;0, 1, 2, 3 - ldr r4, [prd], #16 ; 3 | 2 | 1 | 0 -;; ldr r6, [dif, #8] ; 1 | 0 -;; ldr r7, [dif, #12] ; 3 | 2 - ldr r6, [dif, #0] ; 1 | 0 - ldr r7, [dif, #4] ; 3 | 2 - - pkhbt r8, r6, r7, lsl #16 ; 2 | 0 - pkhtb r9, r7, r6, asr #16 ; 3 | 1 - - uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0 - uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1 - - usat16 r8, #8, r8 - usat16 r9, #8, r9 - add dif, dif, #32 - orr r8, r8, r9, lsl #8 - - str r8, [dst], stride - - ;0, 1, 2, 3 - ldr r4, [prd], #16 ; 3 | 2 | 1 | 0 -;; ldr r6, [dif, #16] ; 1 | 0 -;; ldr r7, [dif, #20] ; 3 | 2 - ldr r6, [dif, #0] ; 1 | 0 - ldr r7, [dif, #4] ; 3 | 2 - - pkhbt r8, r6, r7, lsl #16 ; 2 | 0 - pkhtb r9, r7, r6, asr #16 ; 3 | 1 - - uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0 - uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1 - - usat16 r8, #8, r8 - usat16 r9, #8, r9 - add dif, dif, #32 - orr r8, r8, r9, lsl #8 - - str r8, [dst], stride - - ;0, 1, 2, 3 - ldr r4, [prd], #16 ; 3 | 2 | 1 | 0 -;; ldr r6, [dif, #24] ; 1 | 0 -;; ldr r7, [dif, #28] ; 3 | 2 - ldr r6, [dif, #0] ; 1 | 0 - ldr r7, [dif, #4] ; 3 | 2 - - pkhbt r8, r6, r7, lsl #16 ; 2 | 0 - pkhtb r9, r7, r6, asr #16 ; 3 | 1 - - uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0 - uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1 - - usat16 r8, #8, r8 - usat16 r9, #8, r9 - orr r8, r8, r9, lsl #8 - - str r8, [dst], stride - - ldmia sp!, {r4 - r9, pc} - - ENDP ; |recon_b| - -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -; -; -; -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -; R0 char *pred_ptr -; R1 short *dif_ptr -; R2 char *dst_ptr -; R3 int stride -|vp8_recon4b_armv6| PROC - stmdb sp!, {r4 - r9, lr} - - mov lr, #4 - -recon4b_loop - ;0, 1, 2, 3 - ldr r4, [prd], #4 ; 3 | 2 | 1 | 0 - ldr r6, [dif, #0] ; 1 | 0 - ldr r7, [dif, #4] ; 3 | 2 - - pkhbt r8, r6, r7, lsl #16 ; 2 | 0 - pkhtb r9, r7, r6, asr #16 ; 3 | 1 - - uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0 - uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1 - - usat16 r8, #8, r8 - usat16 r9, #8, r9 - orr r8, r8, r9, lsl #8 - - str r8, [dst] - - ;4, 5, 6, 7 - ldr r4, [prd], #4 -;; ldr r6, [dif, #32] -;; ldr r7, [dif, #36] - ldr r6, [dif, #8] - ldr r7, [dif, #12] - - pkhbt r8, r6, r7, lsl #16 - pkhtb r9, r7, r6, asr #16 - - uxtab16 r8, r8, r4 - uxtab16 r9, r9, r4, ror #8 - usat16 r8, #8, r8 - usat16 r9, #8, r9 - orr r8, r8, r9, lsl #8 - - str r8, [dst, #4] - - ;8, 9, 10, 11 - ldr r4, [prd], #4 -;; ldr r6, [dif, #64] -;; ldr r7, [dif, #68] - ldr r6, [dif, #16] - ldr r7, [dif, #20] - - pkhbt r8, r6, r7, lsl #16 - pkhtb r9, r7, r6, asr #16 - - uxtab16 r8, r8, r4 - uxtab16 r9, r9, r4, ror #8 - usat16 r8, #8, r8 - usat16 r9, #8, r9 - orr r8, r8, r9, lsl #8 - - str r8, [dst, #8] - - ;12, 13, 14, 15 - ldr r4, [prd], #4 -;; ldr r6, [dif, #96] -;; ldr r7, [dif, #100] - ldr r6, [dif, #24] - ldr r7, [dif, #28] - - pkhbt r8, r6, r7, lsl #16 - pkhtb r9, r7, r6, asr #16 - - uxtab16 r8, r8, r4 - uxtab16 r9, r9, r4, ror #8 - usat16 r8, #8, r8 - usat16 r9, #8, r9 - orr r8, r8, r9, lsl #8 - - str r8, [dst, #12] - - add dst, dst, stride -;; add dif, dif, #8 - add dif, dif, #32 - - subs lr, lr, #1 - bne recon4b_loop - - ldmia sp!, {r4 - r9, pc} - - ENDP ; |Recon4B| - -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -; -; -; -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -; R0 char *pred_ptr -; R1 short *dif_ptr -; R2 char *dst_ptr -; R3 int stride -|vp8_recon2b_armv6| PROC - stmdb sp!, {r4 - r9, lr} - - mov lr, #4 - -recon2b_loop - ;0, 1, 2, 3 - ldr r4, [prd], #4 - ldr r6, [dif, #0] - ldr r7, [dif, #4] - - pkhbt r8, r6, r7, lsl #16 - pkhtb r9, r7, r6, asr #16 - - uxtab16 r8, r8, r4 - uxtab16 r9, r9, r4, ror #8 - usat16 r8, #8, r8 - usat16 r9, #8, r9 - orr r8, r8, r9, lsl #8 - - str r8, [dst] - - ;4, 5, 6, 7 - ldr r4, [prd], #4 -;; ldr r6, [dif, #32] -;; ldr r7, [dif, #36] - ldr r6, [dif, #8] - ldr r7, [dif, #12] - - pkhbt r8, r6, r7, lsl #16 - pkhtb r9, r7, r6, asr #16 - - uxtab16 r8, r8, r4 - uxtab16 r9, r9, r4, ror #8 - usat16 r8, #8, r8 - usat16 r9, #8, r9 - orr r8, r8, r9, lsl #8 - - str r8, [dst, #4] - - add dst, dst, stride -;; add dif, dif, #8 - add dif, dif, #16 - - subs lr, lr, #1 - bne recon2b_loop - - ldmia sp!, {r4 - r9, pc} - - ENDP ; |Recon2B| - - END diff --git a/vp9/common/arm/armv6/vp9_simpleloopfilter_v6.asm b/vp9/common/arm/armv6/vp9_simpleloopfilter_v6.asm deleted file mode 100644 index 8306912be..000000000 --- a/vp9/common/arm/armv6/vp9_simpleloopfilter_v6.asm +++ /dev/null @@ -1,286 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp9_loop_filter_simple_horizontal_edge_armv6| - EXPORT |vp9_loop_filter_simple_vertical_edge_armv6| - - AREA |.text|, CODE, READONLY ; name this block of code - - MACRO - TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3 - ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3 - ; a0: 03 02 01 00 - ; a1: 13 12 11 10 - ; a2: 23 22 21 20 - ; a3: 33 32 31 30 - ; b3 b2 b1 b0 - - uxtb16 $b1, $a1 ; xx 12 xx 10 - uxtb16 $b0, $a0 ; xx 02 xx 00 - uxtb16 $b3, $a3 ; xx 32 xx 30 - uxtb16 $b2, $a2 ; xx 22 xx 20 - orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00 - orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20 - - uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11 - uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31 - uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01 - uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21 - orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01 - orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21 - - pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1 - pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3 - - pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0 - pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2 - MEND - - - -src RN r0 -pstep RN r1 - -;r0 unsigned char *src_ptr, -;r1 int src_pixel_step, -;r2 const char *blimit - -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- -|vp9_loop_filter_simple_horizontal_edge_armv6| PROC -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- - stmdb sp!, {r4 - r11, lr} - - ldrb r12, [r2] ; blimit - ldr r3, [src, -pstep, lsl #1] ; p1 - ldr r4, [src, -pstep] ; p0 - ldr r5, [src] ; q0 - ldr r6, [src, pstep] ; q1 - orr r12, r12, r12, lsl #8 ; blimit - ldr r2, c0x80808080 - orr r12, r12, r12, lsl #16 ; blimit - mov r9, #4 ; double the count. we're doing 4 at a time - mov lr, #0 ; need 0 in a couple places - -|simple_hnext8| - ; vp8_simple_filter_mask() - - uqsub8 r7, r3, r6 ; p1 - q1 - uqsub8 r8, r6, r3 ; q1 - p1 - uqsub8 r10, r4, r5 ; p0 - q0 - uqsub8 r11, r5, r4 ; q0 - p0 - orr r8, r8, r7 ; abs(p1 - q1) - orr r10, r10, r11 ; abs(p0 - q0) - uqadd8 r10, r10, r10 ; abs(p0 - q0) * 2 - uhadd8 r8, r8, lr ; abs(p1 - q2) >> 1 - uqadd8 r10, r10, r8 ; abs(p0 - q0)*2 + abs(p1 - q1)/2 - mvn r8, #0 - usub8 r10, r12, r10 ; compare to flimit. usub8 sets GE flags - sel r10, r8, lr ; filter mask: F or 0 - cmp r10, #0 - beq simple_hskip_filter ; skip filtering if all masks are 0x00 - - ;vp8_simple_filter() - - eor r3, r3, r2 ; p1 offset to convert to a signed value - eor r6, r6, r2 ; q1 offset to convert to a signed value - eor r4, r4, r2 ; p0 offset to convert to a signed value - eor r5, r5, r2 ; q0 offset to convert to a signed value - - qsub8 r3, r3, r6 ; vp9_filter = p1 - q1 - qsub8 r6, r5, r4 ; q0 - p0 - qadd8 r3, r3, r6 ; += q0 - p0 - ldr r7, c0x04040404 - qadd8 r3, r3, r6 ; += q0 - p0 - ldr r8, c0x03030303 - qadd8 r3, r3, r6 ; vp9_filter = p1-q1 + 3*(q0-p0)) - ;STALL - and r3, r3, r10 ; vp9_filter &= mask - - qadd8 r7 , r3 , r7 ; Filter1 = vp9_filter + 4 - qadd8 r8 , r3 , r8 ; Filter2 = vp9_filter + 3 - - shadd8 r7 , r7 , lr - shadd8 r8 , r8 , lr - shadd8 r7 , r7 , lr - shadd8 r8 , r8 , lr - shadd8 r7 , r7 , lr ; Filter1 >>= 3 - shadd8 r8 , r8 , lr ; Filter2 >>= 3 - - qsub8 r5 ,r5, r7 ; u = q0 - Filter1 - qadd8 r4, r4, r8 ; u = p0 + Filter2 - eor r5, r5, r2 ; *oq0 = u^0x80 - str r5, [src] ; store oq0 result - eor r4, r4, r2 ; *op0 = u^0x80 - str r4, [src, -pstep] ; store op0 result - -|simple_hskip_filter| - subs r9, r9, #1 - addne src, src, #4 ; next row - - ldrne r3, [src, -pstep, lsl #1] ; p1 - ldrne r4, [src, -pstep] ; p0 - ldrne r5, [src] ; q0 - ldrne r6, [src, pstep] ; q1 - - bne simple_hnext8 - - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp9_loop_filter_simple_horizontal_edge_armv6| - - -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- -|vp9_loop_filter_simple_vertical_edge_armv6| PROC -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- - stmdb sp!, {r4 - r11, lr} - - ldrb r12, [r2] ; r12: blimit - ldr r2, c0x80808080 - orr r12, r12, r12, lsl #8 - - ; load soure data to r7, r8, r9, r10 - ldrh r3, [src, #-2] - pld [src, #23] ; preload for next block - ldrh r4, [src], pstep - orr r12, r12, r12, lsl #16 - - ldrh r5, [src, #-2] - pld [src, #23] - ldrh r6, [src], pstep - - pkhbt r7, r3, r4, lsl #16 - - ldrh r3, [src, #-2] - pld [src, #23] - ldrh r4, [src], pstep - - pkhbt r8, r5, r6, lsl #16 - - ldrh r5, [src, #-2] - pld [src, #23] - ldrh r6, [src], pstep - mov r11, #4 ; double the count. we're doing 4 at a time - -|simple_vnext8| - ; vp8_simple_filter_mask() function - pkhbt r9, r3, r4, lsl #16 - pkhbt r10, r5, r6, lsl #16 - - ;transpose r7, r8, r9, r10 to r3, r4, r5, r6 - TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6 - - uqsub8 r7, r3, r6 ; p1 - q1 - uqsub8 r8, r6, r3 ; q1 - p1 - uqsub8 r9, r4, r5 ; p0 - q0 - uqsub8 r10, r5, r4 ; q0 - p0 - orr r7, r7, r8 ; abs(p1 - q1) - orr r9, r9, r10 ; abs(p0 - q0) - mov r8, #0 - uqadd8 r9, r9, r9 ; abs(p0 - q0) * 2 - uhadd8 r7, r7, r8 ; abs(p1 - q1) / 2 - uqadd8 r7, r7, r9 ; abs(p0 - q0)*2 + abs(p1 - q1)/2 - mvn r10, #0 ; r10 == -1 - - usub8 r7, r12, r7 ; compare to flimit - sel lr, r10, r8 ; filter mask - - cmp lr, #0 - beq simple_vskip_filter ; skip filtering - - ;vp8_simple_filter() function - eor r3, r3, r2 ; p1 offset to convert to a signed value - eor r6, r6, r2 ; q1 offset to convert to a signed value - eor r4, r4, r2 ; p0 offset to convert to a signed value - eor r5, r5, r2 ; q0 offset to convert to a signed value - - qsub8 r3, r3, r6 ; vp9_filter = p1 - q1 - qsub8 r6, r5, r4 ; q0 - p0 - - qadd8 r3, r3, r6 ; vp9_filter += q0 - p0 - ldr r9, c0x03030303 ; r9 = 3 - - qadd8 r3, r3, r6 ; vp9_filter += q0 - p0 - ldr r7, c0x04040404 - - qadd8 r3, r3, r6 ; vp9_filter = p1-q1 + 3*(q0-p0)) - ;STALL - and r3, r3, lr ; vp9_filter &= mask - - qadd8 r9 , r3 , r9 ; Filter2 = vp9_filter + 3 - qadd8 r3 , r3 , r7 ; Filter1 = vp9_filter + 4 - - shadd8 r9 , r9 , r8 - shadd8 r3 , r3 , r8 - shadd8 r9 , r9 , r8 - shadd8 r3 , r3 , r8 - shadd8 r9 , r9 , r8 ; Filter2 >>= 3 - shadd8 r3 , r3 , r8 ; Filter1 >>= 3 - - ;calculate output - sub src, src, pstep, lsl #2 - - qadd8 r4, r4, r9 ; u = p0 + Filter2 - qsub8 r5, r5, r3 ; u = q0 - Filter1 - eor r4, r4, r2 ; *op0 = u^0x80 - eor r5, r5, r2 ; *oq0 = u^0x80 - - strb r4, [src, #-1] ; store the result - mov r4, r4, lsr #8 - strb r5, [src], pstep - mov r5, r5, lsr #8 - - strb r4, [src, #-1] - mov r4, r4, lsr #8 - strb r5, [src], pstep - mov r5, r5, lsr #8 - - strb r4, [src, #-1] - mov r4, r4, lsr #8 - strb r5, [src], pstep - mov r5, r5, lsr #8 - - strb r4, [src, #-1] - strb r5, [src], pstep - -|simple_vskip_filter| - subs r11, r11, #1 - - ; load soure data to r7, r8, r9, r10 - ldrneh r3, [src, #-2] - pld [src, #23] ; preload for next block - ldrneh r4, [src], pstep - - ldrneh r5, [src, #-2] - pld [src, #23] - ldrneh r6, [src], pstep - - pkhbt r7, r3, r4, lsl #16 - - ldrneh r3, [src, #-2] - pld [src, #23] - ldrneh r4, [src], pstep - - pkhbt r8, r5, r6, lsl #16 - - ldrneh r5, [src, #-2] - pld [src, #23] - ldrneh r6, [src], pstep - - bne simple_vnext8 - - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp9_loop_filter_simple_vertical_edge_armv6| - -; Constant Pool -c0x80808080 DCD 0x80808080 -c0x03030303 DCD 0x03030303 -c0x04040404 DCD 0x04040404 - - END diff --git a/vp9/common/arm/armv6/vp9_sixtappredict8x4_v6.asm b/vp9/common/arm/armv6/vp9_sixtappredict8x4_v6.asm deleted file mode 100644 index 5bf94e090..000000000 --- a/vp9/common/arm/armv6/vp9_sixtappredict8x4_v6.asm +++ /dev/null @@ -1,273 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_sixtap_predict8x4_armv6| - - AREA |.text|, CODE, READONLY ; name this block of code -;------------------------------------- -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; stack unsigned char *dst_ptr, -; stack int dst_pitch -;------------------------------------- -;note: In first pass, store the result in transpose(8linesx9columns) on stack. Temporary stack size is 184. -;Line width is 20 that is 9 short data plus 2 to make it 4bytes aligned. In second pass, load data from stack, -;and the result is stored in transpose. -|vp8_sixtap_predict8x4_armv6| PROC - stmdb sp!, {r4 - r11, lr} - str r3, [sp, #-184]! ;reserve space on stack for temporary storage, store yoffset - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - add lr, sp, #4 ;point to temporary buffer - beq skip_firstpass_filter - -;first-pass filter - adr r12, filter8_coeff - sub r0, r0, r1, lsl #1 - - add r3, r1, #10 ; preload next low - pld [r0, r3] - - add r2, r12, r2, lsl #4 ;calculate filter location - add r0, r0, #3 ;adjust src only for loading convinience - - ldr r3, [r2] ; load up packed filter coefficients - ldr r4, [r2, #4] - ldr r5, [r2, #8] - - mov r2, #0x90000 ; height=9 is top part of counter - - sub r1, r1, #8 - -|first_pass_hloop_v6| - ldrb r6, [r0, #-5] ; load source data - ldrb r7, [r0, #-4] - ldrb r8, [r0, #-3] - ldrb r9, [r0, #-2] - ldrb r10, [r0, #-1] - - orr r2, r2, #0x4 ; construct loop counter. width=8=4x2 - - pkhbt r6, r6, r7, lsl #16 ; r7 | r6 - pkhbt r7, r7, r8, lsl #16 ; r8 | r7 - - pkhbt r8, r8, r9, lsl #16 ; r9 | r8 - pkhbt r9, r9, r10, lsl #16 ; r10 | r9 - -|first_pass_wloop_v6| - smuad r11, r6, r3 ; vp9_filter[0], vp9_filter[1] - smuad r12, r7, r3 - - ldrb r6, [r0], #1 - - smlad r11, r8, r4, r11 ; vp9_filter[2], vp9_filter[3] - ldrb r7, [r0], #1 - smlad r12, r9, r4, r12 - - pkhbt r10, r10, r6, lsl #16 ; r10 | r9 - pkhbt r6, r6, r7, lsl #16 ; r11 | r10 - smlad r11, r10, r5, r11 ; vp9_filter[4], vp9_filter[5] - smlad r12, r6, r5, r12 - - sub r2, r2, #1 - - add r11, r11, #0x40 ; round_shift_and_clamp - tst r2, #0xff ; test loop counter - usat r11, #8, r11, asr #7 - add r12, r12, #0x40 - strh r11, [lr], #20 ; result is transposed and stored, which - usat r12, #8, r12, asr #7 - - strh r12, [lr], #20 - - movne r11, r6 - movne r12, r7 - - movne r6, r8 - movne r7, r9 - movne r8, r10 - movne r9, r11 - movne r10, r12 - - bne first_pass_wloop_v6 - - ;;add r9, ppl, #30 ; attempt to load 2 adjacent cache lines - ;;IF ARCHITECTURE=6 - ;pld [src, ppl] - ;;pld [src, r9] - ;;ENDIF - - subs r2, r2, #0x10000 - - sub lr, lr, #158 - - add r0, r0, r1 ; move to next input line - - add r11, r1, #18 ; preload next low. adding back block width(=8), which is subtracted earlier - pld [r0, r11] - - bne first_pass_hloop_v6 - -;second pass filter -secondpass_filter - ldr r3, [sp], #4 ; load back yoffset - ldr r0, [sp, #216] ; load dst address from stack 180+36 - ldr r1, [sp, #220] ; load dst stride from stack 180+40 - - cmp r3, #0 - beq skip_secondpass_filter - - adr r12, filter8_coeff - add lr, r12, r3, lsl #4 ;calculate filter location - - mov r2, #0x00080000 - - ldr r3, [lr] ; load up packed filter coefficients - ldr r4, [lr, #4] - ldr r5, [lr, #8] - - pkhbt r12, r4, r3 ; pack the filter differently - pkhbt r11, r5, r4 - -second_pass_hloop_v6 - ldr r6, [sp] ; load the data - ldr r7, [sp, #4] - - orr r2, r2, #2 ; loop counter - -second_pass_wloop_v6 - smuad lr, r3, r6 ; apply filter - smulbt r10, r3, r6 - - ldr r8, [sp, #8] - - smlad lr, r4, r7, lr - smladx r10, r12, r7, r10 - - ldrh r9, [sp, #12] - - smlad lr, r5, r8, lr - smladx r10, r11, r8, r10 - - add sp, sp, #4 - smlatb r10, r5, r9, r10 - - sub r2, r2, #1 - - add lr, lr, #0x40 ; round_shift_and_clamp - tst r2, #0xff - usat lr, #8, lr, asr #7 - add r10, r10, #0x40 - strb lr, [r0], r1 ; the result is transposed back and stored - usat r10, #8, r10, asr #7 - - strb r10, [r0],r1 - - movne r6, r7 - movne r7, r8 - - bne second_pass_wloop_v6 - - subs r2, r2, #0x10000 - add sp, sp, #12 ; updata src for next loop (20-8) - sub r0, r0, r1, lsl #2 - add r0, r0, #1 - - bne second_pass_hloop_v6 - - add sp, sp, #20 - ldmia sp!, {r4 - r11, pc} - -;-------------------- -skip_firstpass_filter - sub r0, r0, r1, lsl #1 - sub r1, r1, #8 - mov r2, #9 - -skip_firstpass_hloop - ldrb r4, [r0], #1 ; load data - subs r2, r2, #1 - ldrb r5, [r0], #1 - strh r4, [lr], #20 ; store it to immediate buffer - ldrb r6, [r0], #1 ; load data - strh r5, [lr], #20 - ldrb r7, [r0], #1 - strh r6, [lr], #20 - ldrb r8, [r0], #1 - strh r7, [lr], #20 - ldrb r9, [r0], #1 - strh r8, [lr], #20 - ldrb r10, [r0], #1 - strh r9, [lr], #20 - ldrb r11, [r0], #1 - strh r10, [lr], #20 - add r0, r0, r1 ; move to next input line - strh r11, [lr], #20 - - sub lr, lr, #158 ; move over to next column - bne skip_firstpass_hloop - - b secondpass_filter - -;-------------------- -skip_secondpass_filter - mov r2, #8 - add sp, sp, #4 ;start from src[0] instead of src[-2] - -skip_secondpass_hloop - ldr r6, [sp], #4 - subs r2, r2, #1 - ldr r8, [sp], #4 - - mov r7, r6, lsr #16 ; unpack - strb r6, [r0], r1 - mov r9, r8, lsr #16 - strb r7, [r0], r1 - add sp, sp, #12 ; 20-8 - strb r8, [r0], r1 - strb r9, [r0], r1 - - sub r0, r0, r1, lsl #2 - add r0, r0, #1 - - bne skip_secondpass_hloop - - add sp, sp, #16 ; 180 - (160 +4) - - ldmia sp!, {r4 - r11, pc} - - ENDP - -;----------------- -;One word each is reserved. Label filter_coeff can be used to access the data. -;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... -filter8_coeff - DCD 0x00000000, 0x00000080, 0x00000000, 0x00000000 - DCD 0xfffa0000, 0x000c007b, 0x0000ffff, 0x00000000 - DCD 0xfff50002, 0x0024006c, 0x0001fff8, 0x00000000 - DCD 0xfff70000, 0x0032005d, 0x0000fffa, 0x00000000 - DCD 0xfff00003, 0x004d004d, 0x0003fff0, 0x00000000 - DCD 0xfffa0000, 0x005d0032, 0x0000fff7, 0x00000000 - DCD 0xfff80001, 0x006c0024, 0x0002fff5, 0x00000000 - DCD 0xffff0000, 0x007b000c, 0x0000fffa, 0x00000000 - - ;DCD 0, 0, 128, 0, 0, 0 - ;DCD 0, -6, 123, 12, -1, 0 - ;DCD 2, -11, 108, 36, -8, 1 - ;DCD 0, -9, 93, 50, -6, 0 - ;DCD 3, -16, 77, 77, -16, 3 - ;DCD 0, -6, 50, 93, -9, 0 - ;DCD 1, -8, 36, 108, -11, 2 - ;DCD 0, -1, 12, 123, -6, 0 - - END diff --git a/vp9/common/arm/neon/vp9_bilinearpredict16x16_neon.asm b/vp9/common/arm/neon/vp9_bilinearpredict16x16_neon.asm deleted file mode 100644 index 2528be7c3..000000000 --- a/vp9/common/arm/neon/vp9_bilinearpredict16x16_neon.asm +++ /dev/null @@ -1,357 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_bilinear_predict16x16_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; r4 unsigned char *dst_ptr, -; stack(r5) int dst_pitch - -|vp8_bilinear_predict16x16_neon| PROC - push {r4-r5, lr} - - adr r12, bifilter16_coeff - ldr r4, [sp, #12] ;load parameters from stack - ldr r5, [sp, #16] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq secondpass_bfilter16x16_only - - add r2, r12, r2, lsl #3 ;calculate filter location - - cmp r3, #0 ;skip second_pass filter if yoffset=0 - - vld1.s32 {d31}, [r2] ;load first_pass filter - - beq firstpass_bfilter16x16_only - - sub sp, sp, #272 ;reserve space on stack for temporary storage - vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data - mov lr, sp - vld1.u8 {d5, d6, d7}, [r0], r1 - - mov r2, #3 ;loop counter - vld1.u8 {d8, d9, d10}, [r0], r1 - - vdup.8 d0, d31[0] ;first_pass filter (d0 d1) - vld1.u8 {d11, d12, d13}, [r0], r1 - - vdup.8 d1, d31[4] - -;First Pass: output_height lines x output_width columns (17x16) -filt_blk2d_fp16x16_loop_neon - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp9_filter[0]) - vmull.u8 q8, d3, d0 - vmull.u8 q9, d5, d0 - vmull.u8 q10, d6, d0 - vmull.u8 q11, d8, d0 - vmull.u8 q12, d9, d0 - vmull.u8 q13, d11, d0 - vmull.u8 q14, d12, d0 - - vext.8 d2, d2, d3, #1 ;construct src_ptr[1] - vext.8 d5, d5, d6, #1 - vext.8 d8, d8, d9, #1 - vext.8 d11, d11, d12, #1 - - vmlal.u8 q7, d2, d1 ;(src_ptr[0] * vp9_filter[1]) - vmlal.u8 q9, d5, d1 - vmlal.u8 q11, d8, d1 - vmlal.u8 q13, d11, d1 - - vext.8 d3, d3, d4, #1 - vext.8 d6, d6, d7, #1 - vext.8 d9, d9, d10, #1 - vext.8 d12, d12, d13, #1 - - vmlal.u8 q8, d3, d1 ;(src_ptr[0] * vp9_filter[1]) - vmlal.u8 q10, d6, d1 - vmlal.u8 q12, d9, d1 - vmlal.u8 q14, d12, d1 - - subs r2, r2, #1 - - vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d15, q8, #7 - vqrshrn.u16 d16, q9, #7 - vqrshrn.u16 d17, q10, #7 - vqrshrn.u16 d18, q11, #7 - vqrshrn.u16 d19, q12, #7 - vqrshrn.u16 d20, q13, #7 - - vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data - vqrshrn.u16 d21, q14, #7 - vld1.u8 {d5, d6, d7}, [r0], r1 - - vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result - vld1.u8 {d8, d9, d10}, [r0], r1 - vst1.u8 {d18, d19, d20, d21}, [lr]! - vld1.u8 {d11, d12, d13}, [r0], r1 - - bne filt_blk2d_fp16x16_loop_neon - -;First-pass filtering for rest 5 lines - vld1.u8 {d14, d15, d16}, [r0], r1 - - vmull.u8 q9, d2, d0 ;(src_ptr[0] * vp9_filter[0]) - vmull.u8 q10, d3, d0 - vmull.u8 q11, d5, d0 - vmull.u8 q12, d6, d0 - vmull.u8 q13, d8, d0 - vmull.u8 q14, d9, d0 - - vext.8 d2, d2, d3, #1 ;construct src_ptr[1] - vext.8 d5, d5, d6, #1 - vext.8 d8, d8, d9, #1 - - vmlal.u8 q9, d2, d1 ;(src_ptr[0] * vp9_filter[1]) - vmlal.u8 q11, d5, d1 - vmlal.u8 q13, d8, d1 - - vext.8 d3, d3, d4, #1 - vext.8 d6, d6, d7, #1 - vext.8 d9, d9, d10, #1 - - vmlal.u8 q10, d3, d1 ;(src_ptr[0] * vp9_filter[1]) - vmlal.u8 q12, d6, d1 - vmlal.u8 q14, d9, d1 - - vmull.u8 q1, d11, d0 - vmull.u8 q2, d12, d0 - vmull.u8 q3, d14, d0 - vmull.u8 q4, d15, d0 - - vext.8 d11, d11, d12, #1 ;construct src_ptr[1] - vext.8 d14, d14, d15, #1 - - vmlal.u8 q1, d11, d1 ;(src_ptr[0] * vp9_filter[1]) - vmlal.u8 q3, d14, d1 - - vext.8 d12, d12, d13, #1 - vext.8 d15, d15, d16, #1 - - vmlal.u8 q2, d12, d1 ;(src_ptr[0] * vp9_filter[1]) - vmlal.u8 q4, d15, d1 - - vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d11, q10, #7 - vqrshrn.u16 d12, q11, #7 - vqrshrn.u16 d13, q12, #7 - vqrshrn.u16 d14, q13, #7 - vqrshrn.u16 d15, q14, #7 - vqrshrn.u16 d16, q1, #7 - vqrshrn.u16 d17, q2, #7 - vqrshrn.u16 d18, q3, #7 - vqrshrn.u16 d19, q4, #7 - - vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result - vst1.u8 {d14, d15, d16, d17}, [lr]! - vst1.u8 {d18, d19}, [lr]! - -;Second pass: 16x16 -;secondpass_filter - add r3, r12, r3, lsl #3 - sub lr, lr, #272 - - vld1.u32 {d31}, [r3] ;load second_pass filter - - vld1.u8 {d22, d23}, [lr]! ;load src data - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) - vdup.8 d1, d31[4] - mov r12, #4 ;loop counter - -filt_blk2d_sp16x16_loop_neon - vld1.u8 {d24, d25}, [lr]! - vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp9_filter[0]) - vld1.u8 {d26, d27}, [lr]! - vmull.u8 q2, d23, d0 - vld1.u8 {d28, d29}, [lr]! - vmull.u8 q3, d24, d0 - vld1.u8 {d30, d31}, [lr]! - - vmull.u8 q4, d25, d0 - vmull.u8 q5, d26, d0 - vmull.u8 q6, d27, d0 - vmull.u8 q7, d28, d0 - vmull.u8 q8, d29, d0 - - vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp9_filter[1]) - vmlal.u8 q2, d25, d1 - vmlal.u8 q3, d26, d1 - vmlal.u8 q4, d27, d1 - vmlal.u8 q5, d28, d1 - vmlal.u8 q6, d29, d1 - vmlal.u8 q7, d30, d1 - vmlal.u8 q8, d31, d1 - - subs r12, r12, #1 - - vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d3, q2, #7 - vqrshrn.u16 d4, q3, #7 - vqrshrn.u16 d5, q4, #7 - vqrshrn.u16 d6, q5, #7 - vqrshrn.u16 d7, q6, #7 - vqrshrn.u16 d8, q7, #7 - vqrshrn.u16 d9, q8, #7 - - vst1.u8 {d2, d3}, [r4], r5 ;store result - vst1.u8 {d4, d5}, [r4], r5 - vst1.u8 {d6, d7}, [r4], r5 - vmov q11, q15 - vst1.u8 {d8, d9}, [r4], r5 - - bne filt_blk2d_sp16x16_loop_neon - - add sp, sp, #272 - - pop {r4-r5,pc} - -;-------------------- -firstpass_bfilter16x16_only - mov r2, #4 ;loop counter - vdup.8 d0, d31[0] ;first_pass filter (d0 d1) - vdup.8 d1, d31[4] - -;First Pass: output_height lines x output_width columns (16x16) -filt_blk2d_fpo16x16_loop_neon - vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data - vld1.u8 {d5, d6, d7}, [r0], r1 - vld1.u8 {d8, d9, d10}, [r0], r1 - vld1.u8 {d11, d12, d13}, [r0], r1 - - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp9_filter[0]) - vmull.u8 q8, d3, d0 - vmull.u8 q9, d5, d0 - vmull.u8 q10, d6, d0 - vmull.u8 q11, d8, d0 - vmull.u8 q12, d9, d0 - vmull.u8 q13, d11, d0 - vmull.u8 q14, d12, d0 - - vext.8 d2, d2, d3, #1 ;construct src_ptr[1] - vext.8 d5, d5, d6, #1 - vext.8 d8, d8, d9, #1 - vext.8 d11, d11, d12, #1 - - vmlal.u8 q7, d2, d1 ;(src_ptr[0] * vp9_filter[1]) - vmlal.u8 q9, d5, d1 - vmlal.u8 q11, d8, d1 - vmlal.u8 q13, d11, d1 - - vext.8 d3, d3, d4, #1 - vext.8 d6, d6, d7, #1 - vext.8 d9, d9, d10, #1 - vext.8 d12, d12, d13, #1 - - vmlal.u8 q8, d3, d1 ;(src_ptr[0] * vp9_filter[1]) - vmlal.u8 q10, d6, d1 - vmlal.u8 q12, d9, d1 - vmlal.u8 q14, d12, d1 - - subs r2, r2, #1 - - vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d15, q8, #7 - vqrshrn.u16 d16, q9, #7 - vqrshrn.u16 d17, q10, #7 - vqrshrn.u16 d18, q11, #7 - vqrshrn.u16 d19, q12, #7 - vqrshrn.u16 d20, q13, #7 - vst1.u8 {d14, d15}, [r4], r5 ;store result - vqrshrn.u16 d21, q14, #7 - - vst1.u8 {d16, d17}, [r4], r5 - vst1.u8 {d18, d19}, [r4], r5 - vst1.u8 {d20, d21}, [r4], r5 - - bne filt_blk2d_fpo16x16_loop_neon - pop {r4-r5,pc} - -;--------------------- -secondpass_bfilter16x16_only -;Second pass: 16x16 -;secondpass_filter - add r3, r12, r3, lsl #3 - mov r12, #4 ;loop counter - vld1.u32 {d31}, [r3] ;load second_pass filter - vld1.u8 {d22, d23}, [r0], r1 ;load src data - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) - vdup.8 d1, d31[4] - -filt_blk2d_spo16x16_loop_neon - vld1.u8 {d24, d25}, [r0], r1 - vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp9_filter[0]) - vld1.u8 {d26, d27}, [r0], r1 - vmull.u8 q2, d23, d0 - vld1.u8 {d28, d29}, [r0], r1 - vmull.u8 q3, d24, d0 - vld1.u8 {d30, d31}, [r0], r1 - - vmull.u8 q4, d25, d0 - vmull.u8 q5, d26, d0 - vmull.u8 q6, d27, d0 - vmull.u8 q7, d28, d0 - vmull.u8 q8, d29, d0 - - vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp9_filter[1]) - vmlal.u8 q2, d25, d1 - vmlal.u8 q3, d26, d1 - vmlal.u8 q4, d27, d1 - vmlal.u8 q5, d28, d1 - vmlal.u8 q6, d29, d1 - vmlal.u8 q7, d30, d1 - vmlal.u8 q8, d31, d1 - - vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d3, q2, #7 - vqrshrn.u16 d4, q3, #7 - vqrshrn.u16 d5, q4, #7 - vqrshrn.u16 d6, q5, #7 - vqrshrn.u16 d7, q6, #7 - vqrshrn.u16 d8, q7, #7 - vqrshrn.u16 d9, q8, #7 - - vst1.u8 {d2, d3}, [r4], r5 ;store result - subs r12, r12, #1 - vst1.u8 {d4, d5}, [r4], r5 - vmov q11, q15 - vst1.u8 {d6, d7}, [r4], r5 - vst1.u8 {d8, d9}, [r4], r5 - - bne filt_blk2d_spo16x16_loop_neon - pop {r4-r5,pc} - - ENDP - -;----------------- - -bifilter16_coeff - DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 - - END diff --git a/vp9/common/arm/neon/vp9_bilinearpredict4x4_neon.asm b/vp9/common/arm/neon/vp9_bilinearpredict4x4_neon.asm deleted file mode 100644 index 01eedf8e9..000000000 --- a/vp9/common/arm/neon/vp9_bilinearpredict4x4_neon.asm +++ /dev/null @@ -1,130 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_bilinear_predict4x4_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; r4 unsigned char *dst_ptr, -; stack(lr) int dst_pitch - -|vp8_bilinear_predict4x4_neon| PROC - push {r4, lr} - - adr r12, bifilter4_coeff - ldr r4, [sp, #8] ;load parameters from stack - ldr lr, [sp, #12] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq skip_firstpass_filter - -;First pass: output_height lines x output_width columns (5x4) - vld1.u8 {d2}, [r0], r1 ;load src data - add r2, r12, r2, lsl #3 ;calculate Hfilter location (2coeffsx4bytes=8bytes) - - vld1.u8 {d3}, [r0], r1 - vld1.u32 {d31}, [r2] ;first_pass filter - - vld1.u8 {d4}, [r0], r1 - vdup.8 d0, d31[0] ;first_pass filter (d0-d1) - vld1.u8 {d5}, [r0], r1 - vdup.8 d1, d31[4] - vld1.u8 {d6}, [r0], r1 - - vshr.u64 q4, q1, #8 ;construct src_ptr[1] - vshr.u64 q5, q2, #8 - vshr.u64 d12, d6, #8 - - vzip.32 d2, d3 ;put 2-line data in 1 register (src_ptr[0]) - vzip.32 d4, d5 - vzip.32 d8, d9 ;put 2-line data in 1 register (src_ptr[1]) - vzip.32 d10, d11 - - vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp9_filter[0]) - vmull.u8 q8, d4, d0 - vmull.u8 q9, d6, d0 - - vmlal.u8 q7, d8, d1 ;(src_ptr[1] * vp9_filter[1]) - vmlal.u8 q8, d10, d1 - vmlal.u8 q9, d12, d1 - - vqrshrn.u16 d28, q7, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d29, q8, #7 - vqrshrn.u16 d30, q9, #7 - -;Second pass: 4x4 -secondpass_filter - cmp r3, #0 ;skip second_pass filter if yoffset=0 - beq skip_secondpass_filter - - add r3, r12, r3, lsl #3 ;calculate Vfilter location - vld1.u32 {d31}, [r3] ;load second_pass filter - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0-d5) - vdup.8 d1, d31[4] - - vmull.u8 q1, d28, d0 - vmull.u8 q2, d29, d0 - - vext.8 d26, d28, d29, #4 ;construct src_ptr[pixel_step] - vext.8 d27, d29, d30, #4 - - vmlal.u8 q1, d26, d1 - vmlal.u8 q2, d27, d1 - - add r0, r4, lr - add r1, r0, lr - add r2, r1, lr - - vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d3, q2, #7 - - vst1.32 {d2[0]}, [r4] ;store result - vst1.32 {d2[1]}, [r0] - vst1.32 {d3[0]}, [r1] - vst1.32 {d3[1]}, [r2] - - pop {r4, pc} - -;-------------------- -skip_firstpass_filter - - vld1.32 {d28[0]}, [r0], r1 ;load src data - vld1.32 {d28[1]}, [r0], r1 - vld1.32 {d29[0]}, [r0], r1 - vld1.32 {d29[1]}, [r0], r1 - vld1.32 {d30[0]}, [r0], r1 - - b secondpass_filter - -;--------------------- -skip_secondpass_filter - vst1.32 {d28[0]}, [r4], lr ;store result - vst1.32 {d28[1]}, [r4], lr - vst1.32 {d29[0]}, [r4], lr - vst1.32 {d29[1]}, [r4], lr - - pop {r4, pc} - - ENDP - -;----------------- - -bifilter4_coeff - DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 - - END diff --git a/vp9/common/arm/neon/vp9_bilinearpredict8x4_neon.asm b/vp9/common/arm/neon/vp9_bilinearpredict8x4_neon.asm deleted file mode 100644 index 8f49345ff..000000000 --- a/vp9/common/arm/neon/vp9_bilinearpredict8x4_neon.asm +++ /dev/null @@ -1,135 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_bilinear_predict8x4_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; r4 unsigned char *dst_ptr, -; stack(lr) int dst_pitch - -|vp8_bilinear_predict8x4_neon| PROC - push {r4, lr} - - adr r12, bifilter8x4_coeff - ldr r4, [sp, #8] ;load parameters from stack - ldr lr, [sp, #12] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq skip_firstpass_filter - -;First pass: output_height lines x output_width columns (5x8) - add r2, r12, r2, lsl #3 ;calculate filter location - - vld1.u8 {q1}, [r0], r1 ;load src data - vld1.u32 {d31}, [r2] ;load first_pass filter - vld1.u8 {q2}, [r0], r1 - vdup.8 d0, d31[0] ;first_pass filter (d0 d1) - vld1.u8 {q3}, [r0], r1 - vdup.8 d1, d31[4] - vld1.u8 {q4}, [r0], r1 - - vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp9_filter[0]) - vld1.u8 {q5}, [r0], r1 - vmull.u8 q7, d4, d0 - vmull.u8 q8, d6, d0 - vmull.u8 q9, d8, d0 - vmull.u8 q10, d10, d0 - - vext.8 d3, d2, d3, #1 ;construct src_ptr[-1] - vext.8 d5, d4, d5, #1 - vext.8 d7, d6, d7, #1 - vext.8 d9, d8, d9, #1 - vext.8 d11, d10, d11, #1 - - vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp9_filter[1]) - vmlal.u8 q7, d5, d1 - vmlal.u8 q8, d7, d1 - vmlal.u8 q9, d9, d1 - vmlal.u8 q10, d11, d1 - - vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d23, q7, #7 - vqrshrn.u16 d24, q8, #7 - vqrshrn.u16 d25, q9, #7 - vqrshrn.u16 d26, q10, #7 - -;Second pass: 4x8 -secondpass_filter - cmp r3, #0 ;skip second_pass filter if yoffset=0 - beq skip_secondpass_filter - - add r3, r12, r3, lsl #3 - add r0, r4, lr - - vld1.u32 {d31}, [r3] ;load second_pass filter - add r1, r0, lr - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) - vdup.8 d1, d31[4] - - vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp9_filter[0]) - vmull.u8 q2, d23, d0 - vmull.u8 q3, d24, d0 - vmull.u8 q4, d25, d0 - - vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp9_filter[1]) - vmlal.u8 q2, d24, d1 - vmlal.u8 q3, d25, d1 - vmlal.u8 q4, d26, d1 - - add r2, r1, lr - - vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d3, q2, #7 - vqrshrn.u16 d4, q3, #7 - vqrshrn.u16 d5, q4, #7 - - vst1.u8 {d2}, [r4] ;store result - vst1.u8 {d3}, [r0] - vst1.u8 {d4}, [r1] - vst1.u8 {d5}, [r2] - - pop {r4, pc} - -;-------------------- -skip_firstpass_filter - vld1.u8 {d22}, [r0], r1 ;load src data - vld1.u8 {d23}, [r0], r1 - vld1.u8 {d24}, [r0], r1 - vld1.u8 {d25}, [r0], r1 - vld1.u8 {d26}, [r0], r1 - - b secondpass_filter - -;--------------------- -skip_secondpass_filter - vst1.u8 {d22}, [r4], lr ;store result - vst1.u8 {d23}, [r4], lr - vst1.u8 {d24}, [r4], lr - vst1.u8 {d25}, [r4], lr - - pop {r4, pc} - - ENDP - -;----------------- - -bifilter8x4_coeff - DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 - - END diff --git a/vp9/common/arm/neon/vp9_bilinearpredict8x8_neon.asm b/vp9/common/arm/neon/vp9_bilinearpredict8x8_neon.asm deleted file mode 100644 index 6967f1950..000000000 --- a/vp9/common/arm/neon/vp9_bilinearpredict8x8_neon.asm +++ /dev/null @@ -1,183 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_bilinear_predict8x8_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; r4 unsigned char *dst_ptr, -; stack(lr) int dst_pitch - -|vp8_bilinear_predict8x8_neon| PROC - push {r4, lr} - - adr r12, bifilter8_coeff - ldr r4, [sp, #8] ;load parameters from stack - ldr lr, [sp, #12] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq skip_firstpass_filter - -;First pass: output_height lines x output_width columns (9x8) - add r2, r12, r2, lsl #3 ;calculate filter location - - vld1.u8 {q1}, [r0], r1 ;load src data - vld1.u32 {d31}, [r2] ;load first_pass filter - vld1.u8 {q2}, [r0], r1 - vdup.8 d0, d31[0] ;first_pass filter (d0 d1) - vld1.u8 {q3}, [r0], r1 - vdup.8 d1, d31[4] - vld1.u8 {q4}, [r0], r1 - - vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp9_filter[0]) - vmull.u8 q7, d4, d0 - vmull.u8 q8, d6, d0 - vmull.u8 q9, d8, d0 - - vext.8 d3, d2, d3, #1 ;construct src_ptr[-1] - vext.8 d5, d4, d5, #1 - vext.8 d7, d6, d7, #1 - vext.8 d9, d8, d9, #1 - - vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp9_filter[1]) - vmlal.u8 q7, d5, d1 - vmlal.u8 q8, d7, d1 - vmlal.u8 q9, d9, d1 - - vld1.u8 {q1}, [r0], r1 ;load src data - vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8 - vld1.u8 {q2}, [r0], r1 - vqrshrn.u16 d23, q7, #7 - vld1.u8 {q3}, [r0], r1 - vqrshrn.u16 d24, q8, #7 - vld1.u8 {q4}, [r0], r1 - vqrshrn.u16 d25, q9, #7 - - ;first_pass filtering on the rest 5-line data - vld1.u8 {q5}, [r0], r1 - - vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp9_filter[0]) - vmull.u8 q7, d4, d0 - vmull.u8 q8, d6, d0 - vmull.u8 q9, d8, d0 - vmull.u8 q10, d10, d0 - - vext.8 d3, d2, d3, #1 ;construct src_ptr[-1] - vext.8 d5, d4, d5, #1 - vext.8 d7, d6, d7, #1 - vext.8 d9, d8, d9, #1 - vext.8 d11, d10, d11, #1 - - vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp9_filter[1]) - vmlal.u8 q7, d5, d1 - vmlal.u8 q8, d7, d1 - vmlal.u8 q9, d9, d1 - vmlal.u8 q10, d11, d1 - - vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d27, q7, #7 - vqrshrn.u16 d28, q8, #7 - vqrshrn.u16 d29, q9, #7 - vqrshrn.u16 d30, q10, #7 - -;Second pass: 8x8 -secondpass_filter - cmp r3, #0 ;skip second_pass filter if yoffset=0 - beq skip_secondpass_filter - - add r3, r12, r3, lsl #3 - add r0, r4, lr - - vld1.u32 {d31}, [r3] ;load second_pass filter - add r1, r0, lr - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) - vdup.8 d1, d31[4] - - vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp9_filter[0]) - vmull.u8 q2, d23, d0 - vmull.u8 q3, d24, d0 - vmull.u8 q4, d25, d0 - vmull.u8 q5, d26, d0 - vmull.u8 q6, d27, d0 - vmull.u8 q7, d28, d0 - vmull.u8 q8, d29, d0 - - vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp9_filter[1]) - vmlal.u8 q2, d24, d1 - vmlal.u8 q3, d25, d1 - vmlal.u8 q4, d26, d1 - vmlal.u8 q5, d27, d1 - vmlal.u8 q6, d28, d1 - vmlal.u8 q7, d29, d1 - vmlal.u8 q8, d30, d1 - - vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d3, q2, #7 - vqrshrn.u16 d4, q3, #7 - vqrshrn.u16 d5, q4, #7 - vqrshrn.u16 d6, q5, #7 - vqrshrn.u16 d7, q6, #7 - vqrshrn.u16 d8, q7, #7 - vqrshrn.u16 d9, q8, #7 - - vst1.u8 {d2}, [r4] ;store result - vst1.u8 {d3}, [r0] - vst1.u8 {d4}, [r1], lr - vst1.u8 {d5}, [r1], lr - vst1.u8 {d6}, [r1], lr - vst1.u8 {d7}, [r1], lr - vst1.u8 {d8}, [r1], lr - vst1.u8 {d9}, [r1], lr - - pop {r4, pc} - -;-------------------- -skip_firstpass_filter - vld1.u8 {d22}, [r0], r1 ;load src data - vld1.u8 {d23}, [r0], r1 - vld1.u8 {d24}, [r0], r1 - vld1.u8 {d25}, [r0], r1 - vld1.u8 {d26}, [r0], r1 - vld1.u8 {d27}, [r0], r1 - vld1.u8 {d28}, [r0], r1 - vld1.u8 {d29}, [r0], r1 - vld1.u8 {d30}, [r0], r1 - - b secondpass_filter - -;--------------------- -skip_secondpass_filter - vst1.u8 {d22}, [r4], lr ;store result - vst1.u8 {d23}, [r4], lr - vst1.u8 {d24}, [r4], lr - vst1.u8 {d25}, [r4], lr - vst1.u8 {d26}, [r4], lr - vst1.u8 {d27}, [r4], lr - vst1.u8 {d28}, [r4], lr - vst1.u8 {d29}, [r4], lr - - pop {r4, pc} - - ENDP - -;----------------- - -bifilter8_coeff - DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 - - END diff --git a/vp9/common/arm/neon/vp9_buildintrapredictorsmby_neon.asm b/vp9/common/arm/neon/vp9_buildintrapredictorsmby_neon.asm deleted file mode 100644 index e3ea91fe6..000000000 --- a/vp9/common/arm/neon/vp9_buildintrapredictorsmby_neon.asm +++ /dev/null @@ -1,584 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_build_intra_predictors_mby_neon_func| - EXPORT |vp8_build_intra_predictors_mby_s_neon_func| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; r0 unsigned char *y_buffer -; r1 unsigned char *ypred_ptr -; r2 int y_stride -; r3 int mode -; stack int Up -; stack int Left - -|vp8_build_intra_predictors_mby_neon_func| PROC - push {r4-r8, lr} - - cmp r3, #0 - beq case_dc_pred - cmp r3, #1 - beq case_v_pred - cmp r3, #2 - beq case_h_pred - cmp r3, #3 - beq case_tm_pred - -case_dc_pred - ldr r4, [sp, #24] ; Up - ldr r5, [sp, #28] ; Left - - ; Default the DC average to 128 - mov r12, #128 - vdup.u8 q0, r12 - - ; Zero out running sum - mov r12, #0 - - ; compute shift and jump - adds r7, r4, r5 - beq skip_dc_pred_up_left - - ; Load above row, if it exists - cmp r4, #0 - beq skip_dc_pred_up - - sub r6, r0, r2 - vld1.8 {q1}, [r6] - vpaddl.u8 q2, q1 - vpaddl.u16 q3, q2 - vpaddl.u32 q4, q3 - - vmov.32 r4, d8[0] - vmov.32 r6, d9[0] - - add r12, r4, r6 - - ; Move back to interger registers - -skip_dc_pred_up - - cmp r5, #0 - beq skip_dc_pred_left - - sub r0, r0, #1 - - ; Load left row, if it exists - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0] - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - -skip_dc_pred_left - add r7, r7, #3 ; Shift - sub r4, r7, #1 - mov r5, #1 - add r12, r12, r5, lsl r4 - mov r5, r12, lsr r7 ; expected_dc - - vdup.u8 q0, r5 - -skip_dc_pred_up_left - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - - pop {r4-r8,pc} -case_v_pred - ; Copy down above row - sub r6, r0, r2 - vld1.8 {q0}, [r6] - - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - pop {r4-r8,pc} - -case_h_pred - ; Load 4x yleft_col - sub r0, r0, #1 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1]! - vst1.u8 {q1}, [r1]! - vst1.u8 {q2}, [r1]! - vst1.u8 {q3}, [r1]! - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1]! - vst1.u8 {q1}, [r1]! - vst1.u8 {q2}, [r1]! - vst1.u8 {q3}, [r1]! - - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1]! - vst1.u8 {q1}, [r1]! - vst1.u8 {q2}, [r1]! - vst1.u8 {q3}, [r1]! - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1]! - vst1.u8 {q1}, [r1]! - vst1.u8 {q2}, [r1]! - vst1.u8 {q3}, [r1]! - - pop {r4-r8,pc} - -case_tm_pred - ; Load yabove_row - sub r3, r0, r2 - vld1.8 {q8}, [r3] - - ; Load ytop_left - sub r3, r3, #1 - ldrb r7, [r3] - - vdup.u16 q7, r7 - - ; Compute yabove_row - ytop_left - mov r3, #1 - vdup.u8 q0, r3 - - vmull.u8 q4, d16, d0 - vmull.u8 q5, d17, d0 - - vsub.s16 q4, q4, q7 - vsub.s16 q5, q5, q7 - - ; Load 4x yleft_col - sub r0, r0, #1 - mov r12, #4 - -case_tm_pred_loop - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u16 q0, r3 - vdup.u16 q1, r4 - vdup.u16 q2, r5 - vdup.u16 q3, r6 - - vqadd.s16 q8, q0, q4 - vqadd.s16 q9, q0, q5 - - vqadd.s16 q10, q1, q4 - vqadd.s16 q11, q1, q5 - - vqadd.s16 q12, q2, q4 - vqadd.s16 q13, q2, q5 - - vqadd.s16 q14, q3, q4 - vqadd.s16 q15, q3, q5 - - vqshrun.s16 d0, q8, #0 - vqshrun.s16 d1, q9, #0 - - vqshrun.s16 d2, q10, #0 - vqshrun.s16 d3, q11, #0 - - vqshrun.s16 d4, q12, #0 - vqshrun.s16 d5, q13, #0 - - vqshrun.s16 d6, q14, #0 - vqshrun.s16 d7, q15, #0 - - vst1.u8 {q0}, [r1]! - vst1.u8 {q1}, [r1]! - vst1.u8 {q2}, [r1]! - vst1.u8 {q3}, [r1]! - - subs r12, r12, #1 - bne case_tm_pred_loop - - pop {r4-r8,pc} - - ENDP - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; r0 unsigned char *y_buffer -; r1 unsigned char *ypred_ptr -; r2 int y_stride -; r3 int mode -; stack int Up -; stack int Left - -|vp8_build_intra_predictors_mby_s_neon_func| PROC - push {r4-r8, lr} - - mov r1, r0 ; unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor; - - cmp r3, #0 - beq case_dc_pred_s - cmp r3, #1 - beq case_v_pred_s - cmp r3, #2 - beq case_h_pred_s - cmp r3, #3 - beq case_tm_pred_s - -case_dc_pred_s - ldr r4, [sp, #24] ; Up - ldr r5, [sp, #28] ; Left - - ; Default the DC average to 128 - mov r12, #128 - vdup.u8 q0, r12 - - ; Zero out running sum - mov r12, #0 - - ; compute shift and jump - adds r7, r4, r5 - beq skip_dc_pred_up_left_s - - ; Load above row, if it exists - cmp r4, #0 - beq skip_dc_pred_up_s - - sub r6, r0, r2 - vld1.8 {q1}, [r6] - vpaddl.u8 q2, q1 - vpaddl.u16 q3, q2 - vpaddl.u32 q4, q3 - - vmov.32 r4, d8[0] - vmov.32 r6, d9[0] - - add r12, r4, r6 - - ; Move back to interger registers - -skip_dc_pred_up_s - - cmp r5, #0 - beq skip_dc_pred_left_s - - sub r0, r0, #1 - - ; Load left row, if it exists - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0] - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - -skip_dc_pred_left_s - add r7, r7, #3 ; Shift - sub r4, r7, #1 - mov r5, #1 - add r12, r12, r5, lsl r4 - mov r5, r12, lsr r7 ; expected_dc - - vdup.u8 q0, r5 - -skip_dc_pred_up_left_s - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - - pop {r4-r8,pc} -case_v_pred_s - ; Copy down above row - sub r6, r0, r2 - vld1.8 {q0}, [r6] - - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - pop {r4-r8,pc} - -case_h_pred_s - ; Load 4x yleft_col - sub r0, r0, #1 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q1}, [r1], r2 - vst1.u8 {q2}, [r1], r2 - vst1.u8 {q3}, [r1], r2 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q1}, [r1], r2 - vst1.u8 {q2}, [r1], r2 - vst1.u8 {q3}, [r1], r2 - - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q1}, [r1], r2 - vst1.u8 {q2}, [r1], r2 - vst1.u8 {q3}, [r1], r2 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q1}, [r1], r2 - vst1.u8 {q2}, [r1], r2 - vst1.u8 {q3}, [r1], r2 - - pop {r4-r8,pc} - -case_tm_pred_s - ; Load yabove_row - sub r3, r0, r2 - vld1.8 {q8}, [r3] - - ; Load ytop_left - sub r3, r3, #1 - ldrb r7, [r3] - - vdup.u16 q7, r7 - - ; Compute yabove_row - ytop_left - mov r3, #1 - vdup.u8 q0, r3 - - vmull.u8 q4, d16, d0 - vmull.u8 q5, d17, d0 - - vsub.s16 q4, q4, q7 - vsub.s16 q5, q5, q7 - - ; Load 4x yleft_col - sub r0, r0, #1 - mov r12, #4 - -case_tm_pred_loop_s - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u16 q0, r3 - vdup.u16 q1, r4 - vdup.u16 q2, r5 - vdup.u16 q3, r6 - - vqadd.s16 q8, q0, q4 - vqadd.s16 q9, q0, q5 - - vqadd.s16 q10, q1, q4 - vqadd.s16 q11, q1, q5 - - vqadd.s16 q12, q2, q4 - vqadd.s16 q13, q2, q5 - - vqadd.s16 q14, q3, q4 - vqadd.s16 q15, q3, q5 - - vqshrun.s16 d0, q8, #0 - vqshrun.s16 d1, q9, #0 - - vqshrun.s16 d2, q10, #0 - vqshrun.s16 d3, q11, #0 - - vqshrun.s16 d4, q12, #0 - vqshrun.s16 d5, q13, #0 - - vqshrun.s16 d6, q14, #0 - vqshrun.s16 d7, q15, #0 - - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q1}, [r1], r2 - vst1.u8 {q2}, [r1], r2 - vst1.u8 {q3}, [r1], r2 - - subs r12, r12, #1 - bne case_tm_pred_loop_s - - pop {r4-r8,pc} - - ENDP - - - END diff --git a/vp9/common/arm/neon/vp9_copymem16x16_neon.asm b/vp9/common/arm/neon/vp9_copymem16x16_neon.asm deleted file mode 100644 index bff8156d9..000000000 --- a/vp9/common/arm/neon/vp9_copymem16x16_neon.asm +++ /dev/null @@ -1,59 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp9_copy_mem16x16_neon| - ; ARM - ; REQUIRE8 - ; PRESERVE8 - - AREA Block, CODE, READONLY ; name this block of code -;void copy_mem16x16_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -|vp9_copy_mem16x16_neon| PROC - - vld1.u8 {q0}, [r0], r1 - vld1.u8 {q1}, [r0], r1 - vld1.u8 {q2}, [r0], r1 - vst1.u8 {q0}, [r2], r3 - vld1.u8 {q3}, [r0], r1 - vst1.u8 {q1}, [r2], r3 - vld1.u8 {q4}, [r0], r1 - vst1.u8 {q2}, [r2], r3 - vld1.u8 {q5}, [r0], r1 - vst1.u8 {q3}, [r2], r3 - vld1.u8 {q6}, [r0], r1 - vst1.u8 {q4}, [r2], r3 - vld1.u8 {q7}, [r0], r1 - vst1.u8 {q5}, [r2], r3 - vld1.u8 {q8}, [r0], r1 - vst1.u8 {q6}, [r2], r3 - vld1.u8 {q9}, [r0], r1 - vst1.u8 {q7}, [r2], r3 - vld1.u8 {q10}, [r0], r1 - vst1.u8 {q8}, [r2], r3 - vld1.u8 {q11}, [r0], r1 - vst1.u8 {q9}, [r2], r3 - vld1.u8 {q12}, [r0], r1 - vst1.u8 {q10}, [r2], r3 - vld1.u8 {q13}, [r0], r1 - vst1.u8 {q11}, [r2], r3 - vld1.u8 {q14}, [r0], r1 - vst1.u8 {q12}, [r2], r3 - vld1.u8 {q15}, [r0], r1 - vst1.u8 {q13}, [r2], r3 - vst1.u8 {q14}, [r2], r3 - vst1.u8 {q15}, [r2], r3 - - mov pc, lr - - ENDP ; |vp9_copy_mem16x16_neon| - - END diff --git a/vp9/common/arm/neon/vp9_copymem8x4_neon.asm b/vp9/common/arm/neon/vp9_copymem8x4_neon.asm deleted file mode 100644 index ffd2df8e1..000000000 --- a/vp9/common/arm/neon/vp9_copymem8x4_neon.asm +++ /dev/null @@ -1,34 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp9_copy_mem8x4_neon| - ; ARM - ; REQUIRE8 - ; PRESERVE8 - - AREA Block, CODE, READONLY ; name this block of code -;void copy_mem8x4_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -|vp9_copy_mem8x4_neon| PROC - vld1.u8 {d0}, [r0], r1 - vld1.u8 {d1}, [r0], r1 - vst1.u8 {d0}, [r2], r3 - vld1.u8 {d2}, [r0], r1 - vst1.u8 {d1}, [r2], r3 - vld1.u8 {d3}, [r0], r1 - vst1.u8 {d2}, [r2], r3 - vst1.u8 {d3}, [r2], r3 - - mov pc, lr - - ENDP ; |vp9_copy_mem8x4_neon| - - END diff --git a/vp9/common/arm/neon/vp9_copymem8x8_neon.asm b/vp9/common/arm/neon/vp9_copymem8x8_neon.asm deleted file mode 100644 index 2d394c043..000000000 --- a/vp9/common/arm/neon/vp9_copymem8x8_neon.asm +++ /dev/null @@ -1,43 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp9_copy_mem8x8_neon| - ; ARM - ; REQUIRE8 - ; PRESERVE8 - - AREA Block, CODE, READONLY ; name this block of code -;void copy_mem8x8_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -|vp9_copy_mem8x8_neon| PROC - - vld1.u8 {d0}, [r0], r1 - vld1.u8 {d1}, [r0], r1 - vst1.u8 {d0}, [r2], r3 - vld1.u8 {d2}, [r0], r1 - vst1.u8 {d1}, [r2], r3 - vld1.u8 {d3}, [r0], r1 - vst1.u8 {d2}, [r2], r3 - vld1.u8 {d4}, [r0], r1 - vst1.u8 {d3}, [r2], r3 - vld1.u8 {d5}, [r0], r1 - vst1.u8 {d4}, [r2], r3 - vld1.u8 {d6}, [r0], r1 - vst1.u8 {d5}, [r2], r3 - vld1.u8 {d7}, [r0], r1 - vst1.u8 {d6}, [r2], r3 - vst1.u8 {d7}, [r2], r3 - - mov pc, lr - - ENDP ; |vp9_copy_mem8x8_neon| - - END diff --git a/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm b/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm deleted file mode 100644 index 49ba05fb0..000000000 --- a/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm +++ /dev/null @@ -1,49 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. -; - - - EXPORT |vp8_dc_only_idct_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr, -; unsigned char *dst_ptr, int pitch, int stride) -; r0 input_dc -; r1 pred_ptr -; r2 dst_ptr -; r3 pitch -; sp stride -|vp8_dc_only_idct_add_neon| PROC - add r0, r0, #4 - asr r0, r0, #3 - ldr r12, [sp] - vdup.16 q0, r0 - - vld1.32 {d2[0]}, [r1], r3 - vld1.32 {d2[1]}, [r1], r3 - vld1.32 {d4[0]}, [r1], r3 - vld1.32 {d4[1]}, [r1] - - vaddw.u8 q1, q0, d2 - vaddw.u8 q2, q0, d4 - - vqmovun.s16 d2, q1 - vqmovun.s16 d4, q2 - - vst1.32 {d2[0]}, [r2], r12 - vst1.32 {d2[1]}, [r2], r12 - vst1.32 {d4[0]}, [r2], r12 - vst1.32 {d4[1]}, [r2] - - bx lr - - ENDP - END diff --git a/vp9/common/arm/neon/vp9_iwalsh_neon.asm b/vp9/common/arm/neon/vp9_iwalsh_neon.asm deleted file mode 100644 index 01c79d937..000000000 --- a/vp9/common/arm/neon/vp9_iwalsh_neon.asm +++ /dev/null @@ -1,80 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - EXPORT |vp8_short_inv_walsh4x4_neon| - EXPORT |vp8_short_inv_walsh4x4_1_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA |.text|, CODE, READONLY ; name this block of code - -;short vp8_short_inv_walsh4x4_neon(short *input, short *output) -|vp8_short_inv_walsh4x4_neon| PROC - - ; read in all four lines of values: d0->d3 - vld1.i16 {q0-q1}, [r0@128] - - ; first for loop - vadd.s16 d4, d0, d3 ;a = [0] + [12] - vadd.s16 d6, d1, d2 ;b = [4] + [8] - vsub.s16 d5, d0, d3 ;d = [0] - [12] - vsub.s16 d7, d1, d2 ;c = [4] - [8] - - vadd.s16 q0, q2, q3 ; a+b d+c - vsub.s16 q1, q2, q3 ; a-b d-c - - vtrn.32 d0, d2 ;d0: 0 1 8 9 - ;d2: 2 3 10 11 - vtrn.32 d1, d3 ;d1: 4 5 12 13 - ;d3: 6 7 14 15 - - vtrn.16 d0, d1 ;d0: 0 4 8 12 - ;d1: 1 5 9 13 - vtrn.16 d2, d3 ;d2: 2 6 10 14 - ;d3: 3 7 11 15 - - ; second for loop - - vadd.s16 d4, d0, d3 ;a = [0] + [3] - vadd.s16 d6, d1, d2 ;b = [1] + [2] - vsub.s16 d5, d0, d3 ;d = [0] - [3] - vsub.s16 d7, d1, d2 ;c = [1] - [2] - - vmov.i16 q8, #3 - - vadd.s16 q0, q2, q3 ; a+b d+c - vsub.s16 q1, q2, q3 ; a-b d-c - - vadd.i16 q0, q0, q8 ;e/f += 3 - vadd.i16 q1, q1, q8 ;g/h += 3 - - vshr.s16 q0, q0, #3 ;e/f >> 3 - vshr.s16 q1, q1, #3 ;g/h >> 3 - - vst4.i16 {d0,d1,d2,d3}, [r1@128] - - bx lr - ENDP ; |vp8_short_inv_walsh4x4_neon| - - -;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output) -|vp8_short_inv_walsh4x4_1_neon| PROC - ldrsh r2, [r0] ; load input[0] - add r3, r2, #3 ; add 3 - add r2, r1, #16 ; base for last 8 output - asr r0, r3, #3 ; right shift 3 - vdup.16 q0, r0 ; load and duplicate - vst1.16 {q0}, [r1@128] ; write back 8 - vst1.16 {q0}, [r2@128] ; write back last 8 - bx lr - ENDP ; |vp8_short_inv_walsh4x4_1_neon| - - END diff --git a/vp9/common/arm/neon/vp9_loopfilter_neon.asm b/vp9/common/arm/neon/vp9_loopfilter_neon.asm deleted file mode 100644 index bc6616734..000000000 --- a/vp9/common/arm/neon/vp9_loopfilter_neon.asm +++ /dev/null @@ -1,397 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp9_loop_filter_horizontal_edge_y_neon| - EXPORT |vp9_loop_filter_horizontal_edge_uv_neon| - EXPORT |vp9_loop_filter_vertical_edge_y_neon| - EXPORT |vp9_loop_filter_vertical_edge_uv_neon| - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src -; r1 int pitch -; r2 unsigned char blimit -; r3 unsigned char limit -; sp unsigned char thresh, -|vp9_loop_filter_horizontal_edge_y_neon| PROC - push {lr} - vdup.u8 q0, r2 ; duplicate blimit - vdup.u8 q1, r3 ; duplicate limit - sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines - ldr r3, [sp, #4] ; load thresh - add r12, r2, r1 - add r1, r1, r1 - - vdup.u8 q2, r3 ; duplicate thresh - - vld1.u8 {q3}, [r2@128], r1 ; p3 - vld1.u8 {q4}, [r12@128], r1 ; p2 - vld1.u8 {q5}, [r2@128], r1 ; p1 - vld1.u8 {q6}, [r12@128], r1 ; p0 - vld1.u8 {q7}, [r2@128], r1 ; q0 - vld1.u8 {q8}, [r12@128], r1 ; q1 - vld1.u8 {q9}, [r2@128] ; q2 - vld1.u8 {q10}, [r12@128] ; q3 - - sub r2, r2, r1, lsl #1 - sub r12, r12, r1, lsl #1 - - bl vp9_loop_filter_neon - - vst1.u8 {q5}, [r2@128], r1 ; store op1 - vst1.u8 {q6}, [r12@128], r1 ; store op0 - vst1.u8 {q7}, [r2@128], r1 ; store oq0 - vst1.u8 {q8}, [r12@128], r1 ; store oq1 - - pop {pc} - ENDP ; |vp9_loop_filter_horizontal_edge_y_neon| - - -; r0 unsigned char *u, -; r1 int pitch, -; r2 unsigned char blimit -; r3 unsigned char limit -; sp unsigned char thresh, -; sp+4 unsigned char *v -|vp9_loop_filter_horizontal_edge_uv_neon| PROC - push {lr} - vdup.u8 q0, r2 ; duplicate blimit - vdup.u8 q1, r3 ; duplicate limit - ldr r12, [sp, #4] ; load thresh - ldr r2, [sp, #8] ; load v ptr - vdup.u8 q2, r12 ; duplicate thresh - - sub r3, r0, r1, lsl #2 ; move u pointer down by 4 lines - sub r12, r2, r1, lsl #2 ; move v pointer down by 4 lines - - vld1.u8 {d6}, [r3@64], r1 ; p3 - vld1.u8 {d7}, [r12@64], r1 ; p3 - vld1.u8 {d8}, [r3@64], r1 ; p2 - vld1.u8 {d9}, [r12@64], r1 ; p2 - vld1.u8 {d10}, [r3@64], r1 ; p1 - vld1.u8 {d11}, [r12@64], r1 ; p1 - vld1.u8 {d12}, [r3@64], r1 ; p0 - vld1.u8 {d13}, [r12@64], r1 ; p0 - vld1.u8 {d14}, [r3@64], r1 ; q0 - vld1.u8 {d15}, [r12@64], r1 ; q0 - vld1.u8 {d16}, [r3@64], r1 ; q1 - vld1.u8 {d17}, [r12@64], r1 ; q1 - vld1.u8 {d18}, [r3@64], r1 ; q2 - vld1.u8 {d19}, [r12@64], r1 ; q2 - vld1.u8 {d20}, [r3@64] ; q3 - vld1.u8 {d21}, [r12@64] ; q3 - - bl vp9_loop_filter_neon - - sub r0, r0, r1, lsl #1 - sub r2, r2, r1, lsl #1 - - vst1.u8 {d10}, [r0@64], r1 ; store u op1 - vst1.u8 {d11}, [r2@64], r1 ; store v op1 - vst1.u8 {d12}, [r0@64], r1 ; store u op0 - vst1.u8 {d13}, [r2@64], r1 ; store v op0 - vst1.u8 {d14}, [r0@64], r1 ; store u oq0 - vst1.u8 {d15}, [r2@64], r1 ; store v oq0 - vst1.u8 {d16}, [r0@64] ; store u oq1 - vst1.u8 {d17}, [r2@64] ; store v oq1 - - pop {pc} - ENDP ; |vp9_loop_filter_horizontal_edge_uv_neon| - -; void vp9_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch, -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, -; int count) -; r0 unsigned char *src -; r1 int pitch -; r2 unsigned char blimit -; r3 unsigned char limit -; sp unsigned char thresh, - -|vp9_loop_filter_vertical_edge_y_neon| PROC - push {lr} - vdup.u8 q0, r2 ; duplicate blimit - vdup.u8 q1, r3 ; duplicate limit - sub r2, r0, #4 ; src ptr down by 4 columns - add r1, r1, r1 - ldr r3, [sp, #4] ; load thresh - add r12, r2, r1, asr #1 - - vld1.u8 {d6}, [r2], r1 - vld1.u8 {d8}, [r12], r1 - vld1.u8 {d10}, [r2], r1 - vld1.u8 {d12}, [r12], r1 - vld1.u8 {d14}, [r2], r1 - vld1.u8 {d16}, [r12], r1 - vld1.u8 {d18}, [r2], r1 - vld1.u8 {d20}, [r12], r1 - - vld1.u8 {d7}, [r2], r1 ; load second 8-line src data - vld1.u8 {d9}, [r12], r1 - vld1.u8 {d11}, [r2], r1 - vld1.u8 {d13}, [r12], r1 - vld1.u8 {d15}, [r2], r1 - vld1.u8 {d17}, [r12], r1 - vld1.u8 {d19}, [r2] - vld1.u8 {d21}, [r12] - - ;transpose to 8x16 matrix - vtrn.32 q3, q7 - vtrn.32 q4, q8 - vtrn.32 q5, q9 - vtrn.32 q6, q10 - - vdup.u8 q2, r3 ; duplicate thresh - - vtrn.16 q3, q5 - vtrn.16 q4, q6 - vtrn.16 q7, q9 - vtrn.16 q8, q10 - - vtrn.8 q3, q4 - vtrn.8 q5, q6 - vtrn.8 q7, q8 - vtrn.8 q9, q10 - - bl vp9_loop_filter_neon - - vswp d12, d11 - vswp d16, d13 - - sub r0, r0, #2 ; dst ptr - - vswp d14, d12 - vswp d16, d15 - - add r12, r0, r1, asr #1 - - ;store op1, op0, oq0, oq1 - vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 - vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r12], r1 - vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1 - vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r12], r1 - vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1 - vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r12], r1 - vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1 - vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r12], r1 - - vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r0], r1 - vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r12], r1 - vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r0], r1 - vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r12], r1 - vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r0], r1 - vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r12], r1 - vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0] - vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r12] - - pop {pc} - ENDP ; |vp9_loop_filter_vertical_edge_y_neon| - -; void vp9_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, -; unsigned char *v) -; r0 unsigned char *u, -; r1 int pitch, -; r2 unsigned char blimit -; r3 unsigned char limit -; sp unsigned char thresh, -; sp+4 unsigned char *v -|vp9_loop_filter_vertical_edge_uv_neon| PROC - push {lr} - vdup.u8 q0, r2 ; duplicate blimit - sub r12, r0, #4 ; move u pointer down by 4 columns - ldr r2, [sp, #8] ; load v ptr - vdup.u8 q1, r3 ; duplicate limit - sub r3, r2, #4 ; move v pointer down by 4 columns - - vld1.u8 {d6}, [r12], r1 ;load u data - vld1.u8 {d7}, [r3], r1 ;load v data - vld1.u8 {d8}, [r12], r1 - vld1.u8 {d9}, [r3], r1 - vld1.u8 {d10}, [r12], r1 - vld1.u8 {d11}, [r3], r1 - vld1.u8 {d12}, [r12], r1 - vld1.u8 {d13}, [r3], r1 - vld1.u8 {d14}, [r12], r1 - vld1.u8 {d15}, [r3], r1 - vld1.u8 {d16}, [r12], r1 - vld1.u8 {d17}, [r3], r1 - vld1.u8 {d18}, [r12], r1 - vld1.u8 {d19}, [r3], r1 - vld1.u8 {d20}, [r12] - vld1.u8 {d21}, [r3] - - ldr r12, [sp, #4] ; load thresh - - ;transpose to 8x16 matrix - vtrn.32 q3, q7 - vtrn.32 q4, q8 - vtrn.32 q5, q9 - vtrn.32 q6, q10 - - vdup.u8 q2, r12 ; duplicate thresh - - vtrn.16 q3, q5 - vtrn.16 q4, q6 - vtrn.16 q7, q9 - vtrn.16 q8, q10 - - vtrn.8 q3, q4 - vtrn.8 q5, q6 - vtrn.8 q7, q8 - vtrn.8 q9, q10 - - bl vp9_loop_filter_neon - - vswp d12, d11 - vswp d16, d13 - vswp d14, d12 - vswp d16, d15 - - sub r0, r0, #2 - sub r2, r2, #2 - - ;store op1, op0, oq0, oq1 - vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 - vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2], r1 - vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1 - vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r2], r1 - vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1 - vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r2], r1 - vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1 - vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r2], r1 - vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1 - vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r2], r1 - vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1 - vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r2], r1 - vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1 - vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r2], r1 - vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0] - vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2] - - pop {pc} - ENDP ; |vp9_loop_filter_vertical_edge_uv_neon| - -; void vp9_loop_filter_neon(); -; This is a helper function for the loopfilters. The invidual functions do the -; necessary load, transpose (if necessary) and store. - -; r0-r3 PRESERVE -; q0 flimit -; q1 limit -; q2 thresh -; q3 p3 -; q4 p2 -; q5 p1 -; q6 p0 -; q7 q0 -; q8 q1 -; q9 q2 -; q10 q3 -|vp9_loop_filter_neon| PROC - - ; vp9_filter_mask - vabd.u8 q11, q3, q4 ; abs(p3 - p2) - vabd.u8 q12, q4, q5 ; abs(p2 - p1) - vabd.u8 q13, q5, q6 ; abs(p1 - p0) - vabd.u8 q14, q8, q7 ; abs(q1 - q0) - vabd.u8 q3, q9, q8 ; abs(q2 - q1) - vabd.u8 q4, q10, q9 ; abs(q3 - q2) - - vmax.u8 q11, q11, q12 - vmax.u8 q12, q13, q14 - vmax.u8 q3, q3, q4 - vmax.u8 q15, q11, q12 - - vabd.u8 q9, q6, q7 ; abs(p0 - q0) - - ; vp8_hevmask - vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 - vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 - vmax.u8 q15, q15, q3 - - vmov.u8 q10, #0x80 ; 0x80 - - vabd.u8 q2, q5, q8 ; a = abs(p1 - q1) - vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2 - - vcge.u8 q15, q1, q15 - - ; vp9_filter() function - ; convert to signed - veor q7, q7, q10 ; qs0 - vshr.u8 q2, q2, #1 ; a = a / 2 - veor q6, q6, q10 ; ps0 - - veor q5, q5, q10 ; ps1 - vqadd.u8 q9, q9, q2 ; a = b + a - - veor q8, q8, q10 ; qs1 - - vmov.u8 q10, #3 ; #3 - - vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) - vsubl.s8 q11, d15, d13 - - vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1 - - vmovl.u8 q4, d20 - - vqsub.s8 q1, q5, q8 ; vp9_filter = clamp(ps1-qs1) - vorr q14, q13, q14 ; vp8_hevmask - - vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0) - vmul.i16 q11, q11, q4 - - vand q1, q1, q14 ; vp9_filter &= hev - vand q15, q15, q9 ; vp9_filter_mask - - vaddw.s8 q2, q2, d2 - vaddw.s8 q11, q11, d3 - - vmov.u8 q9, #4 ; #4 - - ; vp9_filter = clamp(vp9_filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d2, q2 - vqmovn.s16 d3, q11 - vand q1, q1, q15 ; vp9_filter &= mask - - vqadd.s8 q2, q1, q10 ; Filter2 = clamp(vp9_filter+3) - vqadd.s8 q1, q1, q9 ; Filter1 = clamp(vp9_filter+4) - vshr.s8 q2, q2, #3 ; Filter2 >>= 3 - vshr.s8 q1, q1, #3 ; Filter1 >>= 3 - - - vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + Filter2) - vqsub.s8 q10, q7, q1 ; u = clamp(qs0 - Filter1) - - ; outer tap adjustments: ++vp9_filter >> 1 - vrshr.s8 q1, q1, #1 - vbic q1, q1, q14 ; vp9_filter &= ~hev - vmov.u8 q0, #0x80 ; 0x80 - vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + vp9_filter) - vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - vp9_filter) - - veor q6, q11, q0 ; *op0 = u^0x80 - veor q7, q10, q0 ; *oq0 = u^0x80 - veor q5, q13, q0 ; *op1 = u^0x80 - veor q8, q12, q0 ; *oq1 = u^0x80 - - bx lr - ENDP ; |vp9_loop_filter_horizontal_edge_y_neon| - -;----------------- - - END diff --git a/vp9/common/arm/neon/vp9_loopfiltersimplehorizontaledge_neon.asm b/vp9/common/arm/neon/vp9_loopfiltersimplehorizontaledge_neon.asm deleted file mode 100644 index eb07ce0d5..000000000 --- a/vp9/common/arm/neon/vp9_loopfiltersimplehorizontaledge_neon.asm +++ /dev/null @@ -1,117 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - ;EXPORT |vp9_loop_filter_simple_horizontal_edge_neon| - EXPORT |vp9_loop_filter_bhs_neon| - EXPORT |vp9_loop_filter_mbhs_neon| - ARM - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *s, PRESERVE -; r1 int p, PRESERVE -; q1 limit, PRESERVE - -|vp9_loop_filter_simple_horizontal_edge_neon| PROC - - sub r3, r0, r1, lsl #1 ; move src pointer down by 2 lines - - vld1.u8 {q7}, [r0@128], r1 ; q0 - vld1.u8 {q5}, [r3@128], r1 ; p0 - vld1.u8 {q8}, [r0@128] ; q1 - vld1.u8 {q6}, [r3@128] ; p1 - - vabd.u8 q15, q6, q7 ; abs(p0 - q0) - vabd.u8 q14, q5, q8 ; abs(p1 - q1) - - vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2 - vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2 - vmov.u8 q0, #0x80 ; 0x80 - vmov.s16 q13, #3 - vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 - - veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value - veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value - veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value - veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value - - vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > limit)*-1 - - vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) - vsubl.s8 q3, d15, d13 - - vqsub.s8 q4, q5, q8 ; q4: vp9_filter = vp9_signed_char_clamp(ps1-qs1) - - vmul.s16 q2, q2, q13 ; 3 * ( qs0 - ps0) - vmul.s16 q3, q3, q13 - - vmov.u8 q10, #0x03 ; 0x03 - vmov.u8 q9, #0x04 ; 0x04 - - vaddw.s8 q2, q2, d8 ; vp9_filter + 3 * ( qs0 - ps0) - vaddw.s8 q3, q3, d9 - - vqmovn.s16 d8, q2 ; vp9_filter = vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d9, q3 - - vand q14, q4, q15 ; vp9_filter &= mask - - vqadd.s8 q2, q14, q10 ; Filter2 = vp9_signed_char_clamp(vp9_filter+3) - vqadd.s8 q3, q14, q9 ; Filter1 = vp9_signed_char_clamp(vp9_filter+4) - vshr.s8 q2, q2, #3 ; Filter2 >>= 3 - vshr.s8 q4, q3, #3 ; Filter1 >>= 3 - - sub r0, r0, r1 - - ;calculate output - vqadd.s8 q11, q6, q2 ; u = vp9_signed_char_clamp(ps0 + Filter2) - vqsub.s8 q10, q7, q4 ; u = vp9_signed_char_clamp(qs0 - Filter1) - - veor q6, q11, q0 ; *op0 = u^0x80 - veor q7, q10, q0 ; *oq0 = u^0x80 - - vst1.u8 {q6}, [r3@128] ; store op0 - vst1.u8 {q7}, [r0@128] ; store oq0 - - bx lr - ENDP ; |vp9_loop_filter_simple_horizontal_edge_neon| - -; r0 unsigned char *y -; r1 int ystride -; r2 const unsigned char *blimit - -|vp9_loop_filter_bhs_neon| PROC - push {r4, lr} - ldrb r3, [r2] ; load blim from mem - vdup.s8 q1, r3 ; duplicate blim - - add r0, r0, r1, lsl #2 ; src = y_ptr + 4 * y_stride - bl vp9_loop_filter_simple_horizontal_edge_neon - ; vp9_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1 - add r0, r0, r1, lsl #2 ; src = y_ptr + 8* y_stride - bl vp9_loop_filter_simple_horizontal_edge_neon - add r0, r0, r1, lsl #2 ; src = y_ptr + 12 * y_stride - pop {r4, lr} - b vp9_loop_filter_simple_horizontal_edge_neon - ENDP ;|vp9_loop_filter_bhs_neon| - -; r0 unsigned char *y -; r1 int ystride -; r2 const unsigned char *blimit - -|vp9_loop_filter_mbhs_neon| PROC - ldrb r3, [r2] ; load blim from mem - vdup.s8 q1, r3 ; duplicate mblim - b vp9_loop_filter_simple_horizontal_edge_neon - ENDP ;|vp9_loop_filter_bhs_neon| - - END diff --git a/vp9/common/arm/neon/vp9_loopfiltersimpleverticaledge_neon.asm b/vp9/common/arm/neon/vp9_loopfiltersimpleverticaledge_neon.asm deleted file mode 100644 index d5cf8c2b5..000000000 --- a/vp9/common/arm/neon/vp9_loopfiltersimpleverticaledge_neon.asm +++ /dev/null @@ -1,154 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - ;EXPORT |vp9_loop_filter_simple_vertical_edge_neon| - EXPORT |vp9_loop_filter_bvs_neon| - EXPORT |vp9_loop_filter_mbvs_neon| - ARM - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *s, PRESERVE -; r1 int p, PRESERVE -; q1 limit, PRESERVE - -|vp9_loop_filter_simple_vertical_edge_neon| PROC - sub r0, r0, #2 ; move src pointer down by 2 columns - add r12, r1, r1 - add r3, r0, r1 - - vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r12 - vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r3], r12 - vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r12 - vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r3], r12 - vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r12 - vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r3], r12 - vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r12 - vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r3], r12 - - vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r12 - vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r3], r12 - vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r12 - vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r3], r12 - vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r12 - vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r3], r12 - vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r12 - vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r3] - - vswp d7, d10 - vswp d12, d9 - - ;vp9_filter_mask() function - ;vp8_hevmask() function - sub r0, r0, r1, lsl #4 - vabd.u8 q15, q5, q4 ; abs(p0 - q0) - vabd.u8 q14, q3, q6 ; abs(p1 - q1) - - vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2 - vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2 - vmov.u8 q0, #0x80 ; 0x80 - vmov.s16 q11, #3 - vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 - - veor q4, q4, q0 ; qs0: q0 offset to convert to a signed value - veor q5, q5, q0 ; ps0: p0 offset to convert to a signed value - veor q3, q3, q0 ; ps1: p1 offset to convert to a signed value - veor q6, q6, q0 ; qs1: q1 offset to convert to a signed value - - vcge.u8 q15, q1, q15 ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1 - - vsubl.s8 q2, d8, d10 ; ( qs0 - ps0) - vsubl.s8 q13, d9, d11 - - vqsub.s8 q14, q3, q6 ; vp9_filter = vp9_signed_char_clamp(ps1-qs1) - - vmul.s16 q2, q2, q11 ; 3 * ( qs0 - ps0) - vmul.s16 q13, q13, q11 - - vmov.u8 q11, #0x03 ; 0x03 - vmov.u8 q12, #0x04 ; 0x04 - - vaddw.s8 q2, q2, d28 ; vp9_filter + 3 * ( qs0 - ps0) - vaddw.s8 q13, q13, d29 - - vqmovn.s16 d28, q2 ; vp9_filter = vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d29, q13 - - add r0, r0, #1 - add r3, r0, r1 - - vand q14, q14, q15 ; vp9_filter &= mask - - vqadd.s8 q2, q14, q11 ; Filter2 = vp9_signed_char_clamp(vp9_filter+3) - vqadd.s8 q3, q14, q12 ; Filter1 = vp9_signed_char_clamp(vp9_filter+4) - vshr.s8 q2, q2, #3 ; Filter2 >>= 3 - vshr.s8 q14, q3, #3 ; Filter1 >>= 3 - - ;calculate output - vqadd.s8 q11, q5, q2 ; u = vp9_signed_char_clamp(ps0 + Filter2) - vqsub.s8 q10, q4, q14 ; u = vp9_signed_char_clamp(qs0 - Filter1) - - veor q6, q11, q0 ; *op0 = u^0x80 - veor q7, q10, q0 ; *oq0 = u^0x80 - add r12, r1, r1 - vswp d13, d14 - - ;store op1, op0, oq0, oq1 - vst2.8 {d12[0], d13[0]}, [r0], r12 - vst2.8 {d12[1], d13[1]}, [r3], r12 - vst2.8 {d12[2], d13[2]}, [r0], r12 - vst2.8 {d12[3], d13[3]}, [r3], r12 - vst2.8 {d12[4], d13[4]}, [r0], r12 - vst2.8 {d12[5], d13[5]}, [r3], r12 - vst2.8 {d12[6], d13[6]}, [r0], r12 - vst2.8 {d12[7], d13[7]}, [r3], r12 - vst2.8 {d14[0], d15[0]}, [r0], r12 - vst2.8 {d14[1], d15[1]}, [r3], r12 - vst2.8 {d14[2], d15[2]}, [r0], r12 - vst2.8 {d14[3], d15[3]}, [r3], r12 - vst2.8 {d14[4], d15[4]}, [r0], r12 - vst2.8 {d14[5], d15[5]}, [r3], r12 - vst2.8 {d14[6], d15[6]}, [r0], r12 - vst2.8 {d14[7], d15[7]}, [r3] - - bx lr - ENDP ; |vp9_loop_filter_simple_vertical_edge_neon| - -; r0 unsigned char *y -; r1 int ystride -; r2 const unsigned char *blimit - -|vp9_loop_filter_bvs_neon| PROC - push {r4, lr} - ldrb r3, [r2] ; load blim from mem - mov r4, r0 - add r0, r0, #4 - vdup.s8 q1, r3 ; duplicate blim - bl vp9_loop_filter_simple_vertical_edge_neon - ; vp9_loop_filter_simple_vertical_edge_neon preserves r1 and q1 - add r0, r4, #8 - bl vp9_loop_filter_simple_vertical_edge_neon - add r0, r4, #12 - pop {r4, lr} - b vp9_loop_filter_simple_vertical_edge_neon - ENDP ;|vp9_loop_filter_bvs_neon| - -; r0 unsigned char *y -; r1 int ystride -; r2 const unsigned char *blimit - -|vp9_loop_filter_mbvs_neon| PROC - ldrb r3, [r2] ; load mblim from mem - vdup.s8 q1, r3 ; duplicate mblim - b vp9_loop_filter_simple_vertical_edge_neon - ENDP ;|vp9_loop_filter_bvs_neon| - END diff --git a/vp9/common/arm/neon/vp9_mbloopfilter_neon.asm b/vp9/common/arm/neon/vp9_mbloopfilter_neon.asm deleted file mode 100644 index 19b67f47d..000000000 --- a/vp9/common/arm/neon/vp9_mbloopfilter_neon.asm +++ /dev/null @@ -1,469 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_mbloop_filter_horizontal_edge_y_neon| - EXPORT |vp8_mbloop_filter_horizontal_edge_uv_neon| - EXPORT |vp8_mbloop_filter_vertical_edge_y_neon| - EXPORT |vp8_mbloop_filter_vertical_edge_uv_neon| - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch, -; const unsigned char *blimit, -; const unsigned char *limit, -; const unsigned char *thresh) -; r0 unsigned char *src, -; r1 int pitch, -; r2 unsigned char blimit -; r3 unsigned char limit -; sp unsigned char thresh, -|vp8_mbloop_filter_horizontal_edge_y_neon| PROC - push {lr} - add r1, r1, r1 ; double stride - ldr r12, [sp, #4] ; load thresh - sub r0, r0, r1, lsl #1 ; move src pointer down by 4 lines - vdup.u8 q2, r12 ; thresh - add r12, r0, r1, lsr #1 ; move src pointer up by 1 line - - vld1.u8 {q3}, [r0@128], r1 ; p3 - vld1.u8 {q4}, [r12@128], r1 ; p2 - vld1.u8 {q5}, [r0@128], r1 ; p1 - vld1.u8 {q6}, [r12@128], r1 ; p0 - vld1.u8 {q7}, [r0@128], r1 ; q0 - vld1.u8 {q8}, [r12@128], r1 ; q1 - vld1.u8 {q9}, [r0@128], r1 ; q2 - vld1.u8 {q10}, [r12@128], r1 ; q3 - - bl vp8_mbloop_filter_neon - - sub r12, r12, r1, lsl #2 - add r0, r12, r1, lsr #1 - - vst1.u8 {q4}, [r12@128],r1 ; store op2 - vst1.u8 {q5}, [r0@128],r1 ; store op1 - vst1.u8 {q6}, [r12@128], r1 ; store op0 - vst1.u8 {q7}, [r0@128],r1 ; store oq0 - vst1.u8 {q8}, [r12@128] ; store oq1 - vst1.u8 {q9}, [r0@128] ; store oq2 - - pop {pc} - ENDP ; |vp8_mbloop_filter_horizontal_edge_y_neon| - -; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch, -; const unsigned char *blimit, -; const unsigned char *limit, -; const unsigned char *thresh, -; unsigned char *v) -; r0 unsigned char *u, -; r1 int pitch, -; r2 unsigned char blimit -; r3 unsigned char limit -; sp unsigned char thresh, -; sp+4 unsigned char *v - -|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC - push {lr} - ldr r12, [sp, #4] ; load thresh - sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines - vdup.u8 q2, r12 ; thresh - ldr r12, [sp, #8] ; load v ptr - sub r12, r12, r1, lsl #2 ; move v pointer down by 4 lines - - vld1.u8 {d6}, [r0@64], r1 ; p3 - vld1.u8 {d7}, [r12@64], r1 ; p3 - vld1.u8 {d8}, [r0@64], r1 ; p2 - vld1.u8 {d9}, [r12@64], r1 ; p2 - vld1.u8 {d10}, [r0@64], r1 ; p1 - vld1.u8 {d11}, [r12@64], r1 ; p1 - vld1.u8 {d12}, [r0@64], r1 ; p0 - vld1.u8 {d13}, [r12@64], r1 ; p0 - vld1.u8 {d14}, [r0@64], r1 ; q0 - vld1.u8 {d15}, [r12@64], r1 ; q0 - vld1.u8 {d16}, [r0@64], r1 ; q1 - vld1.u8 {d17}, [r12@64], r1 ; q1 - vld1.u8 {d18}, [r0@64], r1 ; q2 - vld1.u8 {d19}, [r12@64], r1 ; q2 - vld1.u8 {d20}, [r0@64], r1 ; q3 - vld1.u8 {d21}, [r12@64], r1 ; q3 - - bl vp8_mbloop_filter_neon - - sub r0, r0, r1, lsl #3 - sub r12, r12, r1, lsl #3 - - add r0, r0, r1 - add r12, r12, r1 - - vst1.u8 {d8}, [r0@64], r1 ; store u op2 - vst1.u8 {d9}, [r12@64], r1 ; store v op2 - vst1.u8 {d10}, [r0@64], r1 ; store u op1 - vst1.u8 {d11}, [r12@64], r1 ; store v op1 - vst1.u8 {d12}, [r0@64], r1 ; store u op0 - vst1.u8 {d13}, [r12@64], r1 ; store v op0 - vst1.u8 {d14}, [r0@64], r1 ; store u oq0 - vst1.u8 {d15}, [r12@64], r1 ; store v oq0 - vst1.u8 {d16}, [r0@64], r1 ; store u oq1 - vst1.u8 {d17}, [r12@64], r1 ; store v oq1 - vst1.u8 {d18}, [r0@64], r1 ; store u oq2 - vst1.u8 {d19}, [r12@64], r1 ; store v oq2 - - pop {pc} - ENDP ; |vp8_mbloop_filter_horizontal_edge_uv_neon| - -; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch, -; const unsigned char *blimit, -; const unsigned char *limit, -; const unsigned char *thresh) -; r0 unsigned char *src, -; r1 int pitch, -; r2 unsigned char blimit -; r3 unsigned char limit -; sp unsigned char thresh, -|vp8_mbloop_filter_vertical_edge_y_neon| PROC - push {lr} - ldr r12, [sp, #4] ; load thresh - sub r0, r0, #4 ; move src pointer down by 4 columns - vdup.s8 q2, r12 ; thresh - add r12, r0, r1, lsl #3 ; move src pointer down by 8 lines - - vld1.u8 {d6}, [r0], r1 ; load first 8-line src data - vld1.u8 {d7}, [r12], r1 ; load second 8-line src data - vld1.u8 {d8}, [r0], r1 - vld1.u8 {d9}, [r12], r1 - vld1.u8 {d10}, [r0], r1 - vld1.u8 {d11}, [r12], r1 - vld1.u8 {d12}, [r0], r1 - vld1.u8 {d13}, [r12], r1 - vld1.u8 {d14}, [r0], r1 - vld1.u8 {d15}, [r12], r1 - vld1.u8 {d16}, [r0], r1 - vld1.u8 {d17}, [r12], r1 - vld1.u8 {d18}, [r0], r1 - vld1.u8 {d19}, [r12], r1 - vld1.u8 {d20}, [r0], r1 - vld1.u8 {d21}, [r12], r1 - - ;transpose to 8x16 matrix - vtrn.32 q3, q7 - vtrn.32 q4, q8 - vtrn.32 q5, q9 - vtrn.32 q6, q10 - - vtrn.16 q3, q5 - vtrn.16 q4, q6 - vtrn.16 q7, q9 - vtrn.16 q8, q10 - - vtrn.8 q3, q4 - vtrn.8 q5, q6 - vtrn.8 q7, q8 - vtrn.8 q9, q10 - - sub r0, r0, r1, lsl #3 - - bl vp8_mbloop_filter_neon - - sub r12, r12, r1, lsl #3 - - ;transpose to 16x8 matrix - vtrn.32 q3, q7 - vtrn.32 q4, q8 - vtrn.32 q5, q9 - vtrn.32 q6, q10 - - vtrn.16 q3, q5 - vtrn.16 q4, q6 - vtrn.16 q7, q9 - vtrn.16 q8, q10 - - vtrn.8 q3, q4 - vtrn.8 q5, q6 - vtrn.8 q7, q8 - vtrn.8 q9, q10 - - ;store op2, op1, op0, oq0, oq1, oq2 - vst1.8 {d6}, [r0], r1 - vst1.8 {d7}, [r12], r1 - vst1.8 {d8}, [r0], r1 - vst1.8 {d9}, [r12], r1 - vst1.8 {d10}, [r0], r1 - vst1.8 {d11}, [r12], r1 - vst1.8 {d12}, [r0], r1 - vst1.8 {d13}, [r12], r1 - vst1.8 {d14}, [r0], r1 - vst1.8 {d15}, [r12], r1 - vst1.8 {d16}, [r0], r1 - vst1.8 {d17}, [r12], r1 - vst1.8 {d18}, [r0], r1 - vst1.8 {d19}, [r12], r1 - vst1.8 {d20}, [r0] - vst1.8 {d21}, [r12] - - pop {pc} - ENDP ; |vp8_mbloop_filter_vertical_edge_y_neon| - -; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch, -; const unsigned char *blimit, -; const unsigned char *limit, -; const unsigned char *thresh, -; unsigned char *v) -; r0 unsigned char *u, -; r1 int pitch, -; r2 const signed char *flimit, -; r3 const signed char *limit, -; sp const signed char *thresh, -; sp+4 unsigned char *v -|vp8_mbloop_filter_vertical_edge_uv_neon| PROC - push {lr} - ldr r12, [sp, #4] ; load thresh - sub r0, r0, #4 ; move u pointer down by 4 columns - vdup.u8 q2, r12 ; thresh - ldr r12, [sp, #8] ; load v ptr - sub r12, r12, #4 ; move v pointer down by 4 columns - - vld1.u8 {d6}, [r0], r1 ;load u data - vld1.u8 {d7}, [r12], r1 ;load v data - vld1.u8 {d8}, [r0], r1 - vld1.u8 {d9}, [r12], r1 - vld1.u8 {d10}, [r0], r1 - vld1.u8 {d11}, [r12], r1 - vld1.u8 {d12}, [r0], r1 - vld1.u8 {d13}, [r12], r1 - vld1.u8 {d14}, [r0], r1 - vld1.u8 {d15}, [r12], r1 - vld1.u8 {d16}, [r0], r1 - vld1.u8 {d17}, [r12], r1 - vld1.u8 {d18}, [r0], r1 - vld1.u8 {d19}, [r12], r1 - vld1.u8 {d20}, [r0], r1 - vld1.u8 {d21}, [r12], r1 - - ;transpose to 8x16 matrix - vtrn.32 q3, q7 - vtrn.32 q4, q8 - vtrn.32 q5, q9 - vtrn.32 q6, q10 - - vtrn.16 q3, q5 - vtrn.16 q4, q6 - vtrn.16 q7, q9 - vtrn.16 q8, q10 - - vtrn.8 q3, q4 - vtrn.8 q5, q6 - vtrn.8 q7, q8 - vtrn.8 q9, q10 - - sub r0, r0, r1, lsl #3 - - bl vp8_mbloop_filter_neon - - sub r12, r12, r1, lsl #3 - - ;transpose to 16x8 matrix - vtrn.32 q3, q7 - vtrn.32 q4, q8 - vtrn.32 q5, q9 - vtrn.32 q6, q10 - - vtrn.16 q3, q5 - vtrn.16 q4, q6 - vtrn.16 q7, q9 - vtrn.16 q8, q10 - - vtrn.8 q3, q4 - vtrn.8 q5, q6 - vtrn.8 q7, q8 - vtrn.8 q9, q10 - - ;store op2, op1, op0, oq0, oq1, oq2 - vst1.8 {d6}, [r0], r1 - vst1.8 {d7}, [r12], r1 - vst1.8 {d8}, [r0], r1 - vst1.8 {d9}, [r12], r1 - vst1.8 {d10}, [r0], r1 - vst1.8 {d11}, [r12], r1 - vst1.8 {d12}, [r0], r1 - vst1.8 {d13}, [r12], r1 - vst1.8 {d14}, [r0], r1 - vst1.8 {d15}, [r12], r1 - vst1.8 {d16}, [r0], r1 - vst1.8 {d17}, [r12], r1 - vst1.8 {d18}, [r0], r1 - vst1.8 {d19}, [r12], r1 - vst1.8 {d20}, [r0] - vst1.8 {d21}, [r12] - - pop {pc} - ENDP ; |vp8_mbloop_filter_vertical_edge_uv_neon| - -; void vp8_mbloop_filter_neon() -; This is a helper function for the macroblock loopfilters. The individual -; functions do the necessary load, transpose (if necessary), preserve (if -; necessary) and store. - -; r0,r1 PRESERVE -; r2 mblimit -; r3 limit - -; q2 thresh -; q3 p3 PRESERVE -; q4 p2 -; q5 p1 -; q6 p0 -; q7 q0 -; q8 q1 -; q9 q2 -; q10 q3 PRESERVE - -|vp8_mbloop_filter_neon| PROC - - ; vp9_filter_mask - vabd.u8 q11, q3, q4 ; abs(p3 - p2) - vabd.u8 q12, q4, q5 ; abs(p2 - p1) - vabd.u8 q13, q5, q6 ; abs(p1 - p0) - vabd.u8 q14, q8, q7 ; abs(q1 - q0) - vabd.u8 q1, q9, q8 ; abs(q2 - q1) - vabd.u8 q0, q10, q9 ; abs(q3 - q2) - - vmax.u8 q11, q11, q12 - vmax.u8 q12, q13, q14 - vmax.u8 q1, q1, q0 - vmax.u8 q15, q11, q12 - - vabd.u8 q12, q6, q7 ; abs(p0 - q0) - - ; vp8_hevmask - vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh) * -1 - vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh) * -1 - vmax.u8 q15, q15, q1 - - vdup.u8 q1, r3 ; limit - vdup.u8 q2, r2 ; mblimit - - vmov.u8 q0, #0x80 ; 0x80 - - vcge.u8 q15, q1, q15 - - vabd.u8 q1, q5, q8 ; a = abs(p1 - q1) - vqadd.u8 q12, q12, q12 ; b = abs(p0 - q0) * 2 - vmov.u16 q11, #3 ; #3 - - ; vp9_filter - ; convert to signed - veor q7, q7, q0 ; qs0 - vshr.u8 q1, q1, #1 ; a = a / 2 - veor q6, q6, q0 ; ps0 - veor q5, q5, q0 ; ps1 - - vqadd.u8 q12, q12, q1 ; a = b + a - - veor q8, q8, q0 ; qs1 - veor q4, q4, q0 ; ps2 - veor q9, q9, q0 ; qs2 - - vorr q14, q13, q14 ; vp8_hevmask - - vcge.u8 q12, q2, q12 ; (a > flimit * 2 + limit) * -1 - - vsubl.s8 q2, d14, d12 ; qs0 - ps0 - vsubl.s8 q13, d15, d13 - - vqsub.s8 q1, q5, q8 ; vp9_filter = clamp(ps1-qs1) - - vmul.i16 q2, q2, q11 ; 3 * ( qs0 - ps0) - - vand q15, q15, q12 ; vp9_filter_mask - - vmul.i16 q13, q13, q11 - - vmov.u8 q12, #3 ; #3 - - vaddw.s8 q2, q2, d2 ; vp9_filter + 3 * ( qs0 - ps0) - vaddw.s8 q13, q13, d3 - - vmov.u8 q11, #4 ; #4 - - ; vp9_filter = clamp(vp9_filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d2, q2 - vqmovn.s16 d3, q13 - - vand q1, q1, q15 ; vp9_filter &= mask - - vmov.u16 q15, #63 ; #63 - - vand q13, q1, q14 ; Filter2 &= hev - - vqadd.s8 q2, q13, q11 ; Filter1 = clamp(Filter2+4) - vqadd.s8 q13, q13, q12 ; Filter2 = clamp(Filter2+3) - - vmov q0, q15 - - vshr.s8 q2, q2, #3 ; Filter1 >>= 3 - vshr.s8 q13, q13, #3 ; Filter2 >>= 3 - - vmov q11, q15 - vmov q12, q15 - - vqsub.s8 q7, q7, q2 ; qs0 = clamp(qs0 - Filter1) - - vqadd.s8 q6, q6, q13 ; ps0 = clamp(ps0 + Filter2) - - vbic q1, q1, q14 ; vp9_filter &= ~hev - - ; roughly 1/7th difference across boundary - ; roughly 2/7th difference across boundary - ; roughly 3/7th difference across boundary - - vmov.u8 d5, #9 ; #9 - vmov.u8 d4, #18 ; #18 - - vmov q13, q15 - vmov q14, q15 - - vmlal.s8 q0, d2, d5 ; 63 + Filter2 * 9 - vmlal.s8 q11, d3, d5 - vmov.u8 d5, #27 ; #27 - vmlal.s8 q12, d2, d4 ; 63 + Filter2 * 18 - vmlal.s8 q13, d3, d4 - vmlal.s8 q14, d2, d5 ; 63 + Filter2 * 27 - vmlal.s8 q15, d3, d5 - - vqshrn.s16 d0, q0, #7 ; u = clamp((63 + Filter2 * 9)>>7) - vqshrn.s16 d1, q11, #7 - vqshrn.s16 d24, q12, #7 ; u = clamp((63 + Filter2 * 18)>>7) - vqshrn.s16 d25, q13, #7 - vqshrn.s16 d28, q14, #7 ; u = clamp((63 + Filter2 * 27)>>7) - vqshrn.s16 d29, q15, #7 - - vmov.u8 q1, #0x80 ; 0x80 - - vqsub.s8 q11, q9, q0 ; s = clamp(qs2 - u) - vqadd.s8 q0, q4, q0 ; s = clamp(ps2 + u) - vqsub.s8 q13, q8, q12 ; s = clamp(qs1 - u) - vqadd.s8 q12, q5, q12 ; s = clamp(ps1 + u) - vqsub.s8 q15, q7, q14 ; s = clamp(qs0 - u) - vqadd.s8 q14, q6, q14 ; s = clamp(ps0 + u) - - veor q9, q11, q1 ; *oq2 = s^0x80 - veor q4, q0, q1 ; *op2 = s^0x80 - veor q8, q13, q1 ; *oq1 = s^0x80 - veor q5, q12, q1 ; *op2 = s^0x80 - veor q7, q15, q1 ; *oq0 = s^0x80 - veor q6, q14, q1 ; *op0 = s^0x80 - - bx lr - ENDP ; |vp8_mbloop_filter_neon| - -;----------------- - - END diff --git a/vp9/common/arm/neon/vp9_recon16x16mb_neon.asm b/vp9/common/arm/neon/vp9_recon16x16mb_neon.asm deleted file mode 100644 index 3f1a30f48..000000000 --- a/vp9/common/arm/neon/vp9_recon16x16mb_neon.asm +++ /dev/null @@ -1,131 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_recon16x16mb_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *pred_ptr, -; r1 short *diff_ptr, -; r2 unsigned char *dst_ptr, -; r3 int ystride, -; stack unsigned char *udst_ptr, -; stack unsigned char *vdst_ptr - -|vp8_recon16x16mb_neon| PROC - mov r12, #4 ;loop counter for Y loop - -recon16x16mb_loop_y - vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr - vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr - vld1.u8 {q14, q15}, [r0]! - vld1.16 {q10, q11}, [r1]! - - vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits - vmovl.u8 q1, d25 - vmovl.u8 q2, d26 - vmovl.u8 q3, d27 - vmovl.u8 q4, d28 - vmovl.u8 q5, d29 - vmovl.u8 q6, d30 - vld1.16 {q12, q13}, [r1]! - vmovl.u8 q7, d31 - vld1.16 {q14, q15}, [r1]! - - pld [r0] - pld [r1] - pld [r1, #64] - - vadd.s16 q0, q0, q8 ;add Diff data and Pred data together - vadd.s16 q1, q1, q9 - vadd.s16 q2, q2, q10 - vadd.s16 q3, q3, q11 - vadd.s16 q4, q4, q12 - vadd.s16 q5, q5, q13 - vadd.s16 q6, q6, q14 - vadd.s16 q7, q7, q15 - - vqmovun.s16 d0, q0 ;CLAMP() saturation - vqmovun.s16 d1, q1 - vqmovun.s16 d2, q2 - vqmovun.s16 d3, q3 - vqmovun.s16 d4, q4 - vqmovun.s16 d5, q5 - vst1.u8 {q0}, [r2], r3 ;store result - vqmovun.s16 d6, q6 - vst1.u8 {q1}, [r2], r3 - vqmovun.s16 d7, q7 - vst1.u8 {q2}, [r2], r3 - subs r12, r12, #1 - - moveq r12, #2 ;loop counter for UV loop - - vst1.u8 {q3}, [r2], r3 - bne recon16x16mb_loop_y - - mov r3, r3, lsr #1 ;uv_stride = ystride>>1 - ldr r2, [sp] ;load upred_ptr - -recon16x16mb_loop_uv - vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr - vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr - vld1.u8 {q14, q15}, [r0]! - vld1.16 {q10, q11}, [r1]! - - vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits - vmovl.u8 q1, d25 - vmovl.u8 q2, d26 - vmovl.u8 q3, d27 - vmovl.u8 q4, d28 - vmovl.u8 q5, d29 - vmovl.u8 q6, d30 - vld1.16 {q12, q13}, [r1]! - vmovl.u8 q7, d31 - vld1.16 {q14, q15}, [r1]! - - vadd.s16 q0, q0, q8 ;add Diff data and Pred data together - vadd.s16 q1, q1, q9 - vadd.s16 q2, q2, q10 - vadd.s16 q3, q3, q11 - vadd.s16 q4, q4, q12 - vadd.s16 q5, q5, q13 - vadd.s16 q6, q6, q14 - - vqmovun.s16 d0, q0 ;CLAMP() saturation - vadd.s16 q7, q7, q15 - vqmovun.s16 d1, q1 - vqmovun.s16 d2, q2 - vqmovun.s16 d3, q3 - vst1.u8 {d0}, [r2], r3 ;store result - vqmovun.s16 d4, q4 - vst1.u8 {d1}, [r2], r3 - vqmovun.s16 d5, q5 - vst1.u8 {d2}, [r2], r3 - vqmovun.s16 d6, q6 - vst1.u8 {d3}, [r2], r3 - vqmovun.s16 d7, q7 - vst1.u8 {d4}, [r2], r3 - subs r12, r12, #1 - - vst1.u8 {d5}, [r2], r3 - vst1.u8 {d6}, [r2], r3 - vst1.u8 {d7}, [r2], r3 - - ldrne r2, [sp, #4] ;load vpred_ptr - bne recon16x16mb_loop_uv - - bx lr - - ENDP - END diff --git a/vp9/common/arm/neon/vp9_recon2b_neon.asm b/vp9/common/arm/neon/vp9_recon2b_neon.asm deleted file mode 100644 index 99b251c91..000000000 --- a/vp9/common/arm/neon/vp9_recon2b_neon.asm +++ /dev/null @@ -1,54 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_recon2b_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *pred_ptr, -; r1 short *diff_ptr, -; r2 unsigned char *dst_ptr, -; r3 int stride - -|vp8_recon2b_neon| PROC - vld1.u8 {q8, q9}, [r0] ;load data from pred_ptr - vld1.16 {q4, q5}, [r1]! ;load data from diff_ptr - - vmovl.u8 q0, d16 ;modify Pred data from 8 bits to 16 bits - vld1.16 {q6, q7}, [r1]! - vmovl.u8 q1, d17 - vmovl.u8 q2, d18 - vmovl.u8 q3, d19 - - vadd.s16 q0, q0, q4 ;add Diff data and Pred data together - vadd.s16 q1, q1, q5 - vadd.s16 q2, q2, q6 - vadd.s16 q3, q3, q7 - - vqmovun.s16 d0, q0 ;CLAMP() saturation - vqmovun.s16 d1, q1 - vqmovun.s16 d2, q2 - vqmovun.s16 d3, q3 - add r0, r2, r3 - - vst1.u8 {d0}, [r2] ;store result - vst1.u8 {d1}, [r0], r3 - add r2, r0, r3 - vst1.u8 {d2}, [r0] - vst1.u8 {d3}, [r2], r3 - - bx lr - - ENDP - END diff --git a/vp9/common/arm/neon/vp9_recon4b_neon.asm b/vp9/common/arm/neon/vp9_recon4b_neon.asm deleted file mode 100644 index 991727746..000000000 --- a/vp9/common/arm/neon/vp9_recon4b_neon.asm +++ /dev/null @@ -1,69 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_recon4b_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *pred_ptr, -; r1 short *diff_ptr, -; r2 unsigned char *dst_ptr, -; r3 int stride - -|vp8_recon4b_neon| PROC - vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr - vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr - vld1.u8 {q14, q15}, [r0] - vld1.16 {q10, q11}, [r1]! - - vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits - vmovl.u8 q1, d25 - vmovl.u8 q2, d26 - vmovl.u8 q3, d27 - vmovl.u8 q4, d28 - vmovl.u8 q5, d29 - vmovl.u8 q6, d30 - vld1.16 {q12, q13}, [r1]! - vmovl.u8 q7, d31 - vld1.16 {q14, q15}, [r1] - - vadd.s16 q0, q0, q8 ;add Diff data and Pred data together - vadd.s16 q1, q1, q9 - vadd.s16 q2, q2, q10 - vadd.s16 q3, q3, q11 - vadd.s16 q4, q4, q12 - vadd.s16 q5, q5, q13 - vadd.s16 q6, q6, q14 - vadd.s16 q7, q7, q15 - - vqmovun.s16 d0, q0 ;CLAMP() saturation - vqmovun.s16 d1, q1 - vqmovun.s16 d2, q2 - vqmovun.s16 d3, q3 - vqmovun.s16 d4, q4 - vqmovun.s16 d5, q5 - vqmovun.s16 d6, q6 - vqmovun.s16 d7, q7 - add r0, r2, r3 - - vst1.u8 {q0}, [r2] ;store result - vst1.u8 {q1}, [r0], r3 - add r2, r0, r3 - vst1.u8 {q2}, [r0] - vst1.u8 {q3}, [r2], r3 - - bx lr - - ENDP - END diff --git a/vp9/common/arm/neon/vp9_recon_neon.c b/vp9/common/arm/neon/vp9_recon_neon.c deleted file mode 100644 index 1bf7a29bd..000000000 --- a/vp9/common/arm/neon/vp9_recon_neon.c +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vpx_ports/config.h" -#include "vp9/common/recon.h" -#include "vp9/common/vp9_blockd.h" - -extern void vp8_recon16x16mb_neon(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int ystride, unsigned char *udst_ptr, unsigned char *vdst_ptr); - -void vp8_recon_mb_neon(MACROBLOCKD *xd) { - unsigned char *pred_ptr = &xd->predictor[0]; - short *diff_ptr = &xd->diff[0]; - unsigned char *dst_ptr = xd->dst.y_buffer; - unsigned char *udst_ptr = xd->dst.u_buffer; - unsigned char *vdst_ptr = xd->dst.v_buffer; - int ystride = xd->dst.y_stride; - /*int uv_stride = xd->dst.uv_stride;*/ - - vp8_recon16x16mb_neon(pred_ptr, diff_ptr, dst_ptr, ystride, - udst_ptr, vdst_ptr); -} diff --git a/vp9/common/arm/neon/vp9_reconb_neon.asm b/vp9/common/arm/neon/vp9_reconb_neon.asm deleted file mode 100644 index 288c0ef01..000000000 --- a/vp9/common/arm/neon/vp9_reconb_neon.asm +++ /dev/null @@ -1,61 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_recon_b_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *pred_ptr, -; r1 short *diff_ptr, -; r2 unsigned char *dst_ptr, -; r3 int stride - -|vp8_recon_b_neon| PROC - mov r12, #16 - - vld1.u8 {d28}, [r0], r12 ;load 4 data/line from pred_ptr - vld1.16 {q10, q11}, [r1]! ;load data from diff_ptr - vld1.u8 {d29}, [r0], r12 - vld1.16 {q11, q12}, [r1]! - vld1.u8 {d30}, [r0], r12 - vld1.16 {q12, q13}, [r1]! - vld1.u8 {d31}, [r0], r12 - vld1.16 {q13}, [r1] - - vmovl.u8 q0, d28 ;modify Pred data from 8 bits to 16 bits - vmovl.u8 q1, d29 ;Pred data in d0, d2, d4, d6 - vmovl.u8 q2, d30 - vmovl.u8 q3, d31 - - vadd.s16 d0, d0, d20 ;add Diff data and Pred data together - vadd.s16 d2, d2, d22 - vadd.s16 d4, d4, d24 - vadd.s16 d6, d6, d26 - - vqmovun.s16 d0, q0 ;CLAMP() saturation - vqmovun.s16 d1, q1 - vqmovun.s16 d2, q2 - vqmovun.s16 d3, q3 - add r1, r2, r3 - - vst1.32 {d0[0]}, [r2] ;store result - vst1.32 {d1[0]}, [r1], r3 - add r2, r1, r3 - vst1.32 {d2[0]}, [r1] - vst1.32 {d3[0]}, [r2], r3 - - bx lr - - ENDP - END diff --git a/vp9/common/arm/neon/vp9_save_neon_reg.asm b/vp9/common/arm/neon/vp9_save_neon_reg.asm deleted file mode 100644 index 71c3e7077..000000000 --- a/vp9/common/arm/neon/vp9_save_neon_reg.asm +++ /dev/null @@ -1,36 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp9_push_neon| - EXPORT |vp9_pop_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -|vp9_push_neon| PROC - vst1.i64 {d8, d9, d10, d11}, [r0]! - vst1.i64 {d12, d13, d14, d15}, [r0]! - bx lr - - ENDP - -|vp9_pop_neon| PROC - vld1.i64 {d8, d9, d10, d11}, [r0]! - vld1.i64 {d12, d13, d14, d15}, [r0]! - bx lr - - ENDP - - END - diff --git a/vp9/common/arm/neon/vp9_shortidct4x4llm_1_neon.asm b/vp9/common/arm/neon/vp9_shortidct4x4llm_1_neon.asm deleted file mode 100644 index d7bdbae75..000000000 --- a/vp9/common/arm/neon/vp9_shortidct4x4llm_1_neon.asm +++ /dev/null @@ -1,67 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_short_idct4x4llm_1_neon| - EXPORT |vp8_dc_only_idct_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch); -; r0 short *input; -; r1 short *output; -; r2 int pitch; -;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -|vp8_short_idct4x4llm_1_neon| PROC - vld1.16 {d0[]}, [r0] ;load input[0] - - add r3, r1, r2 - add r12, r3, r2 - - vrshr.s16 d0, d0, #3 - - add r0, r12, r2 - - vst1.16 {d0}, [r1] - vst1.16 {d0}, [r3] - vst1.16 {d0}, [r12] - vst1.16 {d0}, [r0] - - bx lr - ENDP - -;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void vp8_dc_only_idct_c(short input_dc, short *output, int pitch); -; r0 short input_dc; -; r1 short *output; -; r2 int pitch; -;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -|vp8_dc_only_idct_neon| PROC - vdup.16 d0, r0 - - add r3, r1, r2 - add r12, r3, r2 - - vrshr.s16 d0, d0, #3 - - add r0, r12, r2 - - vst1.16 {d0}, [r1] - vst1.16 {d0}, [r3] - vst1.16 {d0}, [r12] - vst1.16 {d0}, [r0] - - bx lr - - ENDP - END diff --git a/vp9/common/arm/neon/vp9_shortidct4x4llm_neon.asm b/vp9/common/arm/neon/vp9_shortidct4x4llm_neon.asm deleted file mode 100644 index b74c31521..000000000 --- a/vp9/common/arm/neon/vp9_shortidct4x4llm_neon.asm +++ /dev/null @@ -1,122 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_short_idct4x4llm_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;************************************************************* -;void vp8_short_idct4x4llm_c(short *input, short *output, int pitch) -;r0 short * input -;r1 short * output -;r2 int pitch -;************************************************************* -;static const int cospi8sqrt2minus1=20091; -;static const int sinpi8sqrt2 =35468; -;static const int rounding = 0; -;Optimization note: The resulted data from dequantization are signed 13-bit data that is -;in the range of [-4096, 4095]. This allows to use "vqdmulh"(neon) instruction since -;it won't go out of range (13+16+1=30bits<32bits). This instruction gives the high half -;result of the multiplication that is needed in IDCT. - -|vp8_short_idct4x4llm_neon| PROC - adr r12, idct_coeff - vld1.16 {q1, q2}, [r0] - vld1.16 {d0}, [r12] - - vswp d3, d4 ;q2(vp[4] vp[12]) - - vqdmulh.s16 q3, q2, d0[2] - vqdmulh.s16 q4, q2, d0[0] - - vqadd.s16 d12, d2, d3 ;a1 - vqsub.s16 d13, d2, d3 ;b1 - - vshr.s16 q3, q3, #1 - vshr.s16 q4, q4, #1 - - vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negtive number) - vqadd.s16 q4, q4, q2 - - ;d6 - c1:temp1 - ;d7 - d1:temp2 - ;d8 - d1:temp1 - ;d9 - c1:temp2 - - vqsub.s16 d10, d6, d9 ;c1 - vqadd.s16 d11, d7, d8 ;d1 - - vqadd.s16 d2, d12, d11 - vqadd.s16 d3, d13, d10 - vqsub.s16 d4, d13, d10 - vqsub.s16 d5, d12, d11 - - vtrn.32 d2, d4 - vtrn.32 d3, d5 - vtrn.16 d2, d3 - vtrn.16 d4, d5 - - vswp d3, d4 - - vqdmulh.s16 q3, q2, d0[2] - vqdmulh.s16 q4, q2, d0[0] - - vqadd.s16 d12, d2, d3 ;a1 - vqsub.s16 d13, d2, d3 ;b1 - - vshr.s16 q3, q3, #1 - vshr.s16 q4, q4, #1 - - vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negtive number) - vqadd.s16 q4, q4, q2 - - vqsub.s16 d10, d6, d9 ;c1 - vqadd.s16 d11, d7, d8 ;d1 - - vqadd.s16 d2, d12, d11 - vqadd.s16 d3, d13, d10 - vqsub.s16 d4, d13, d10 - vqsub.s16 d5, d12, d11 - - vrshr.s16 d2, d2, #3 - vrshr.s16 d3, d3, #3 - vrshr.s16 d4, d4, #3 - vrshr.s16 d5, d5, #3 - - add r3, r1, r2 - add r12, r3, r2 - add r0, r12, r2 - - vtrn.32 d2, d4 - vtrn.32 d3, d5 - vtrn.16 d2, d3 - vtrn.16 d4, d5 - - vst1.16 {d2}, [r1] - vst1.16 {d3}, [r3] - vst1.16 {d4}, [r12] - vst1.16 {d5}, [r0] - - bx lr - - ENDP - -;----------------- - -idct_coeff - DCD 0x4e7b4e7b, 0x8a8c8a8c - -;20091, 20091, 35468, 35468 - - END diff --git a/vp9/common/arm/neon/vp9_sixtappredict16x16_neon.asm b/vp9/common/arm/neon/vp9_sixtappredict16x16_neon.asm deleted file mode 100644 index 5e83f49f5..000000000 --- a/vp9/common/arm/neon/vp9_sixtappredict16x16_neon.asm +++ /dev/null @@ -1,490 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_sixtap_predict16x16_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -filter16_coeff - DCD 0, 0, 128, 0, 0, 0, 0, 0 - DCD 0, -6, 123, 12, -1, 0, 0, 0 - DCD 2, -11, 108, 36, -8, 1, 0, 0 - DCD 0, -9, 93, 50, -6, 0, 0, 0 - DCD 3, -16, 77, 77, -16, 3, 0, 0 - DCD 0, -6, 50, 93, -9, 0, 0, 0 - DCD 1, -8, 36, 108, -11, 2, 0, 0 - DCD 0, -1, 12, 123, -6, 0, 0, 0 - -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; r4 unsigned char *dst_ptr, -; stack(r5) int dst_pitch - -;Note: To take advantage of 8-bit mulplication instruction in NEON. First apply abs() to -; filter coeffs to make them u8. Then, use vmlsl for negtive coeffs. After multiplication, -; the result can be negtive. So, I treat the result as s16. But, since it is also possible -; that the result can be a large positive number (> 2^15-1), which could be confused as a -; negtive number. To avoid that error, apply filter coeffs in the order of 0, 1, 4 ,5 ,2, -; which ensures that the result stays in s16 range. Finally, saturated add the result by -; applying 3rd filter coeff. Same applys to other filter functions. - -|vp8_sixtap_predict16x16_neon| PROC - push {r4-r5, lr} - - adr r12, filter16_coeff - ldr r4, [sp, #12] ;load parameters from stack - ldr r5, [sp, #16] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq secondpass_filter16x16_only - - add r2, r12, r2, lsl #5 ;calculate filter location - - cmp r3, #0 ;skip second_pass filter if yoffset=0 - - vld1.s32 {q14, q15}, [r2] ;load first_pass filter - - beq firstpass_filter16x16_only - - sub sp, sp, #336 ;reserve space on stack for temporary storage - mov lr, sp - - vabs.s32 q12, q14 - vabs.s32 q13, q15 - - mov r2, #7 ;loop counter - sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) - sub r0, r0, r1, lsl #1 - - vdup.8 d0, d24[0] ;first_pass filter (d0-d5) - vdup.8 d1, d24[4] - vdup.8 d2, d25[0] - vdup.8 d3, d25[4] - vdup.8 d4, d26[0] - vdup.8 d5, d26[4] - -;First Pass: output_height lines x output_width columns (21x16) -filt_blk2d_fp16x16_loop_neon - vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data - vld1.u8 {d9, d10, d11}, [r0], r1 - vld1.u8 {d12, d13, d14}, [r0], r1 - - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp9_filter[0]) - vmull.u8 q9, d7, d0 - vmull.u8 q10, d9, d0 - vmull.u8 q11, d10, d0 - vmull.u8 q12, d12, d0 - vmull.u8 q13, d13, d0 - - vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] - vext.8 d29, d9, d10, #1 - vext.8 d30, d12, d13, #1 - - vmlsl.u8 q8, d28, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q10, d29, d1 - vmlsl.u8 q12, d30, d1 - - vext.8 d28, d7, d8, #1 - vext.8 d29, d10, d11, #1 - vext.8 d30, d13, d14, #1 - - vmlsl.u8 q9, d28, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q11, d29, d1 - vmlsl.u8 q13, d30, d1 - - vext.8 d28, d6, d7, #4 ;construct src_ptr[2] - vext.8 d29, d9, d10, #4 - vext.8 d30, d12, d13, #4 - - vmlsl.u8 q8, d28, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q10, d29, d4 - vmlsl.u8 q12, d30, d4 - - vext.8 d28, d7, d8, #4 - vext.8 d29, d10, d11, #4 - vext.8 d30, d13, d14, #4 - - vmlsl.u8 q9, d28, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q11, d29, d4 - vmlsl.u8 q13, d30, d4 - - vext.8 d28, d6, d7, #5 ;construct src_ptr[3] - vext.8 d29, d9, d10, #5 - vext.8 d30, d12, d13, #5 - - vmlal.u8 q8, d28, d5 ;(src_ptr[3] * vp9_filter[5]) - vmlal.u8 q10, d29, d5 - vmlal.u8 q12, d30, d5 - - vext.8 d28, d7, d8, #5 - vext.8 d29, d10, d11, #5 - vext.8 d30, d13, d14, #5 - - vmlal.u8 q9, d28, d5 ;(src_ptr[3] * vp9_filter[5]) - vmlal.u8 q11, d29, d5 - vmlal.u8 q13, d30, d5 - - vext.8 d28, d6, d7, #2 ;construct src_ptr[0] - vext.8 d29, d9, d10, #2 - vext.8 d30, d12, d13, #2 - - vmlal.u8 q8, d28, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q10, d29, d2 - vmlal.u8 q12, d30, d2 - - vext.8 d28, d7, d8, #2 - vext.8 d29, d10, d11, #2 - vext.8 d30, d13, d14, #2 - - vmlal.u8 q9, d28, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q11, d29, d2 - vmlal.u8 q13, d30, d2 - - vext.8 d28, d6, d7, #3 ;construct src_ptr[1] - vext.8 d29, d9, d10, #3 - vext.8 d30, d12, d13, #3 - - vext.8 d15, d7, d8, #3 - vext.8 d31, d10, d11, #3 - vext.8 d6, d13, d14, #3 - - vmull.u8 q4, d28, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q5, d29, d3 - vmull.u8 q6, d30, d3 - - vqadd.s16 q8, q4 ;sum of all (src_data*filter_parameters) - vqadd.s16 q10, q5 - vqadd.s16 q12, q6 - - vmull.u8 q6, d15, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q7, d31, d3 - vmull.u8 q3, d6, d3 - - subs r2, r2, #1 - - vqadd.s16 q9, q6 - vqadd.s16 q11, q7 - vqadd.s16 q13, q3 - - vqrshrun.s16 d6, q8, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d7, q9, #7 - vqrshrun.s16 d8, q10, #7 - vqrshrun.s16 d9, q11, #7 - vqrshrun.s16 d10, q12, #7 - vqrshrun.s16 d11, q13, #7 - - vst1.u8 {d6, d7, d8}, [lr]! ;store result - vst1.u8 {d9, d10, d11}, [lr]! - - bne filt_blk2d_fp16x16_loop_neon - -;Second pass: 16x16 -;secondpass_filter - do first 8-columns and then second 8-columns - add r3, r12, r3, lsl #5 - sub lr, lr, #336 - - vld1.s32 {q5, q6}, [r3] ;load second_pass filter - mov r3, #2 ;loop counter - - vabs.s32 q7, q5 - vabs.s32 q8, q6 - - mov r2, #16 - - vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) - vdup.8 d1, d14[4] - vdup.8 d2, d15[0] - vdup.8 d3, d15[4] - vdup.8 d4, d16[0] - vdup.8 d5, d16[4] - -filt_blk2d_sp16x16_outloop_neon - vld1.u8 {d18}, [lr], r2 ;load src data - vld1.u8 {d19}, [lr], r2 - vld1.u8 {d20}, [lr], r2 - vld1.u8 {d21}, [lr], r2 - mov r12, #4 ;loop counter - vld1.u8 {d22}, [lr], r2 - -secondpass_inner_loop_neon - vld1.u8 {d23}, [lr], r2 ;load src data - vld1.u8 {d24}, [lr], r2 - vld1.u8 {d25}, [lr], r2 - vld1.u8 {d26}, [lr], r2 - - vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp9_filter[0]) - vmull.u8 q4, d19, d0 - vmull.u8 q5, d20, d0 - vmull.u8 q6, d21, d0 - - vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q4, d20, d1 - vmlsl.u8 q5, d21, d1 - vmlsl.u8 q6, d22, d1 - - vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q4, d23, d4 - vmlsl.u8 q5, d24, d4 - vmlsl.u8 q6, d25, d4 - - vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q4, d21, d2 - vmlal.u8 q5, d22, d2 - vmlal.u8 q6, d23, d2 - - vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp9_filter[5]) - vmlal.u8 q4, d24, d5 - vmlal.u8 q5, d25, d5 - vmlal.u8 q6, d26, d5 - - vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q8, d22, d3 - vmull.u8 q9, d23, d3 - vmull.u8 q10, d24, d3 - - subs r12, r12, #1 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d7, q8, #7 - vqrshrun.s16 d8, q9, #7 - vqrshrun.s16 d9, q10, #7 - - vst1.u8 {d6}, [r4], r5 ;store result - vmov q9, q11 - vst1.u8 {d7}, [r4], r5 - vmov q10, q12 - vst1.u8 {d8}, [r4], r5 - vmov d22, d26 - vst1.u8 {d9}, [r4], r5 - - bne secondpass_inner_loop_neon - - subs r3, r3, #1 - sub lr, lr, #336 - add lr, lr, #8 - - sub r4, r4, r5, lsl #4 - add r4, r4, #8 - - bne filt_blk2d_sp16x16_outloop_neon - - add sp, sp, #336 - pop {r4-r5,pc} - -;-------------------- -firstpass_filter16x16_only - vabs.s32 q12, q14 - vabs.s32 q13, q15 - - mov r2, #8 ;loop counter - sub r0, r0, #2 ;move srcptr back to (column-2) - - vdup.8 d0, d24[0] ;first_pass filter (d0-d5) - vdup.8 d1, d24[4] - vdup.8 d2, d25[0] - vdup.8 d3, d25[4] - vdup.8 d4, d26[0] - vdup.8 d5, d26[4] - -;First Pass: output_height lines x output_width columns (16x16) -filt_blk2d_fpo16x16_loop_neon - vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data - vld1.u8 {d9, d10, d11}, [r0], r1 - - pld [r0] - pld [r0, r1] - - vmull.u8 q6, d6, d0 ;(src_ptr[-2] * vp9_filter[0]) - vmull.u8 q7, d7, d0 - vmull.u8 q8, d9, d0 - vmull.u8 q9, d10, d0 - - vext.8 d20, d6, d7, #1 ;construct src_ptr[-1] - vext.8 d21, d9, d10, #1 - vext.8 d22, d7, d8, #1 - vext.8 d23, d10, d11, #1 - vext.8 d24, d6, d7, #4 ;construct src_ptr[2] - vext.8 d25, d9, d10, #4 - vext.8 d26, d7, d8, #4 - vext.8 d27, d10, d11, #4 - vext.8 d28, d6, d7, #5 ;construct src_ptr[3] - vext.8 d29, d9, d10, #5 - - vmlsl.u8 q6, d20, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q8, d21, d1 - vmlsl.u8 q7, d22, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q9, d23, d1 - vmlsl.u8 q6, d24, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q8, d25, d4 - vmlsl.u8 q7, d26, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q9, d27, d4 - vmlal.u8 q6, d28, d5 ;(src_ptr[3] * vp9_filter[5]) - vmlal.u8 q8, d29, d5 - - vext.8 d20, d7, d8, #5 - vext.8 d21, d10, d11, #5 - vext.8 d22, d6, d7, #2 ;construct src_ptr[0] - vext.8 d23, d9, d10, #2 - vext.8 d24, d7, d8, #2 - vext.8 d25, d10, d11, #2 - - vext.8 d26, d6, d7, #3 ;construct src_ptr[1] - vext.8 d27, d9, d10, #3 - vext.8 d28, d7, d8, #3 - vext.8 d29, d10, d11, #3 - - vmlal.u8 q7, d20, d5 ;(src_ptr[3] * vp9_filter[5]) - vmlal.u8 q9, d21, d5 - vmlal.u8 q6, d22, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q8, d23, d2 - vmlal.u8 q7, d24, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q9, d25, d2 - - vmull.u8 q10, d26, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q11, d27, d3 - vmull.u8 q12, d28, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q15, d29, d3 - - vqadd.s16 q6, q10 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q11 - vqadd.s16 q7, q12 - vqadd.s16 q9, q15 - - subs r2, r2, #1 - - vqrshrun.s16 d6, q6, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d7, q7, #7 - vqrshrun.s16 d8, q8, #7 - vqrshrun.s16 d9, q9, #7 - - vst1.u8 {q3}, [r4], r5 ;store result - vst1.u8 {q4}, [r4], r5 - - bne filt_blk2d_fpo16x16_loop_neon - - pop {r4-r5,pc} - -;-------------------- -secondpass_filter16x16_only -;Second pass: 16x16 - add r3, r12, r3, lsl #5 - sub r0, r0, r1, lsl #1 - - vld1.s32 {q5, q6}, [r3] ;load second_pass filter - mov r3, #2 ;loop counter - - vabs.s32 q7, q5 - vabs.s32 q8, q6 - - vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) - vdup.8 d1, d14[4] - vdup.8 d2, d15[0] - vdup.8 d3, d15[4] - vdup.8 d4, d16[0] - vdup.8 d5, d16[4] - -filt_blk2d_spo16x16_outloop_neon - vld1.u8 {d18}, [r0], r1 ;load src data - vld1.u8 {d19}, [r0], r1 - vld1.u8 {d20}, [r0], r1 - vld1.u8 {d21}, [r0], r1 - mov r12, #4 ;loop counter - vld1.u8 {d22}, [r0], r1 - -secondpass_only_inner_loop_neon - vld1.u8 {d23}, [r0], r1 ;load src data - vld1.u8 {d24}, [r0], r1 - vld1.u8 {d25}, [r0], r1 - vld1.u8 {d26}, [r0], r1 - - vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp9_filter[0]) - vmull.u8 q4, d19, d0 - vmull.u8 q5, d20, d0 - vmull.u8 q6, d21, d0 - - vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q4, d20, d1 - vmlsl.u8 q5, d21, d1 - vmlsl.u8 q6, d22, d1 - - vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q4, d23, d4 - vmlsl.u8 q5, d24, d4 - vmlsl.u8 q6, d25, d4 - - vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q4, d21, d2 - vmlal.u8 q5, d22, d2 - vmlal.u8 q6, d23, d2 - - vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp9_filter[5]) - vmlal.u8 q4, d24, d5 - vmlal.u8 q5, d25, d5 - vmlal.u8 q6, d26, d5 - - vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q8, d22, d3 - vmull.u8 q9, d23, d3 - vmull.u8 q10, d24, d3 - - subs r12, r12, #1 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d7, q8, #7 - vqrshrun.s16 d8, q9, #7 - vqrshrun.s16 d9, q10, #7 - - vst1.u8 {d6}, [r4], r5 ;store result - vmov q9, q11 - vst1.u8 {d7}, [r4], r5 - vmov q10, q12 - vst1.u8 {d8}, [r4], r5 - vmov d22, d26 - vst1.u8 {d9}, [r4], r5 - - bne secondpass_only_inner_loop_neon - - subs r3, r3, #1 - sub r0, r0, r1, lsl #4 - sub r0, r0, r1, lsl #2 - sub r0, r0, r1 - add r0, r0, #8 - - sub r4, r4, r5, lsl #4 - add r4, r4, #8 - - bne filt_blk2d_spo16x16_outloop_neon - - pop {r4-r5,pc} - - ENDP - -;----------------- - END diff --git a/vp9/common/arm/neon/vp9_sixtappredict4x4_neon.asm b/vp9/common/arm/neon/vp9_sixtappredict4x4_neon.asm deleted file mode 100644 index 5966b642f..000000000 --- a/vp9/common/arm/neon/vp9_sixtappredict4x4_neon.asm +++ /dev/null @@ -1,422 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_sixtap_predict_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -filter4_coeff - DCD 0, 0, 128, 0, 0, 0, 0, 0 - DCD 0, -6, 123, 12, -1, 0, 0, 0 - DCD 2, -11, 108, 36, -8, 1, 0, 0 - DCD 0, -9, 93, 50, -6, 0, 0, 0 - DCD 3, -16, 77, 77, -16, 3, 0, 0 - DCD 0, -6, 50, 93, -9, 0, 0, 0 - DCD 1, -8, 36, 108, -11, 2, 0, 0 - DCD 0, -1, 12, 123, -6, 0, 0, 0 - -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; stack(r4) unsigned char *dst_ptr, -; stack(lr) int dst_pitch - -|vp8_sixtap_predict_neon| PROC - push {r4, lr} - - adr r12, filter4_coeff - ldr r4, [sp, #8] ;load parameters from stack - ldr lr, [sp, #12] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq secondpass_filter4x4_only - - add r2, r12, r2, lsl #5 ;calculate filter location - - cmp r3, #0 ;skip second_pass filter if yoffset=0 - vld1.s32 {q14, q15}, [r2] ;load first_pass filter - - beq firstpass_filter4x4_only - - vabs.s32 q12, q14 ;get abs(filer_parameters) - vabs.s32 q13, q15 - - sub r0, r0, #2 ;go back 2 columns of src data - sub r0, r0, r1, lsl #1 ;go back 2 lines of src data - -;First pass: output_height lines x output_width columns (9x4) - vld1.u8 {q3}, [r0], r1 ;load first 4-line src data - vdup.8 d0, d24[0] ;first_pass filter (d0-d5) - vld1.u8 {q4}, [r0], r1 - vdup.8 d1, d24[4] - vld1.u8 {q5}, [r0], r1 - vdup.8 d2, d25[0] - vld1.u8 {q6}, [r0], r1 - vdup.8 d3, d25[4] - vdup.8 d4, d26[0] - vdup.8 d5, d26[4] - - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vext.8 d18, d6, d7, #5 ;construct src_ptr[3] - vext.8 d19, d8, d9, #5 - vext.8 d20, d10, d11, #5 - vext.8 d21, d12, d13, #5 - - vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done - vswp d11, d12 - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3]) - vzip.32 d20, d21 - vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp9_filter[5]) - vmull.u8 q8, d20, d5 - - vmov q4, q3 ;keep original src data in q4 q6 - vmov q6, q5 - - vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together - vzip.32 d10, d11 - vshr.u64 q9, q4, #8 ;construct src_ptr[-1] - vshr.u64 q10, q6, #8 - vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp9_filter[0]) - vmlal.u8 q8, d10, d0 - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1]) - vzip.32 d20, d21 - vshr.u64 q3, q4, #32 ;construct src_ptr[2] - vshr.u64 q5, q6, #32 - vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q8, d20, d1 - - vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2]) - vzip.32 d10, d11 - vshr.u64 q9, q4, #16 ;construct src_ptr[0] - vshr.u64 q10, q6, #16 - vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q8, d10, d4 - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0]) - vzip.32 d20, d21 - vshr.u64 q3, q4, #24 ;construct src_ptr[1] - vshr.u64 q5, q6, #24 - vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q8, d20, d2 - - vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1]) - vzip.32 d10, d11 - vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q10, d10, d3 - - vld1.u8 {q3}, [r0], r1 ;load rest 5-line src data - vld1.u8 {q4}, [r0], r1 - - vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q10 - - vld1.u8 {q5}, [r0], r1 - vld1.u8 {q6}, [r0], r1 - - vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d28, q8, #7 - - ;First Pass on rest 5-line data - vld1.u8 {q11}, [r0], r1 - - vext.8 d18, d6, d7, #5 ;construct src_ptr[3] - vext.8 d19, d8, d9, #5 - vext.8 d20, d10, d11, #5 - vext.8 d21, d12, d13, #5 - - vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done - vswp d11, d12 - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3]) - vzip.32 d20, d21 - vext.8 d31, d22, d23, #5 ;construct src_ptr[3] - vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp9_filter[5]) - vmull.u8 q8, d20, d5 - vmull.u8 q12, d31, d5 ;(src_ptr[3] * vp9_filter[5]) - - vmov q4, q3 ;keep original src data in q4 q6 - vmov q6, q5 - - vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together - vzip.32 d10, d11 - vshr.u64 q9, q4, #8 ;construct src_ptr[-1] - vshr.u64 q10, q6, #8 - - vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp9_filter[0]) - vmlal.u8 q8, d10, d0 - vmlal.u8 q12, d22, d0 ;(src_ptr[-2] * vp9_filter[0]) - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1]) - vzip.32 d20, d21 - vshr.u64 q3, q4, #32 ;construct src_ptr[2] - vshr.u64 q5, q6, #32 - vext.8 d31, d22, d23, #1 ;construct src_ptr[-1] - - vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q8, d20, d1 - vmlsl.u8 q12, d31, d1 ;-(src_ptr[-1] * vp9_filter[1]) - - vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2]) - vzip.32 d10, d11 - vshr.u64 q9, q4, #16 ;construct src_ptr[0] - vshr.u64 q10, q6, #16 - vext.8 d31, d22, d23, #4 ;construct src_ptr[2] - - vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q8, d10, d4 - vmlsl.u8 q12, d31, d4 ;-(src_ptr[2] * vp9_filter[4]) - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0]) - vzip.32 d20, d21 - vshr.u64 q3, q4, #24 ;construct src_ptr[1] - vshr.u64 q5, q6, #24 - vext.8 d31, d22, d23, #2 ;construct src_ptr[0] - - vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q8, d20, d2 - vmlal.u8 q12, d31, d2 ;(src_ptr[0] * vp9_filter[2]) - - vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1]) - vzip.32 d10, d11 - vext.8 d31, d22, d23, #3 ;construct src_ptr[1] - vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q10, d10, d3 - vmull.u8 q11, d31, d3 ;(src_ptr[1] * vp9_filter[3]) - - add r3, r12, r3, lsl #5 - - vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q10 - vqadd.s16 q12, q11 - - vext.8 d23, d27, d28, #4 - vld1.s32 {q5, q6}, [r3] ;load second_pass filter - - vqrshrun.s16 d29, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d30, q8, #7 - vqrshrun.s16 d31, q12, #7 - -;Second pass: 4x4 - vabs.s32 q7, q5 - vabs.s32 q8, q6 - - vext.8 d24, d28, d29, #4 - vext.8 d25, d29, d30, #4 - vext.8 d26, d30, d31, #4 - - vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) - vdup.8 d1, d14[4] - vdup.8 d2, d15[0] - vdup.8 d3, d15[4] - vdup.8 d4, d16[0] - vdup.8 d5, d16[4] - - vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp9_filter[0]) - vmull.u8 q4, d28, d0 - - vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp9_filter[5]) - vmull.u8 q6, d26, d5 - - vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q4, d30, d4 - - vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q6, d24, d1 - - vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q4, d29, d2 - - vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp9_filter[3]) - vmlal.u8 q6, d25, d3 - - add r0, r4, lr - add r1, r0, lr - add r2, r1, lr - - vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q6, q4 - - vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d4, q6, #7 - - vst1.32 {d3[0]}, [r4] ;store result - vst1.32 {d3[1]}, [r0] - vst1.32 {d4[0]}, [r1] - vst1.32 {d4[1]}, [r2] - - pop {r4, pc} - - -;--------------------- -firstpass_filter4x4_only - vabs.s32 q12, q14 ;get abs(filer_parameters) - vabs.s32 q13, q15 - - sub r0, r0, #2 ;go back 2 columns of src data - -;First pass: output_height lines x output_width columns (4x4) - vld1.u8 {q3}, [r0], r1 ;load first 4-line src data - vdup.8 d0, d24[0] ;first_pass filter (d0-d5) - vld1.u8 {q4}, [r0], r1 - vdup.8 d1, d24[4] - vld1.u8 {q5}, [r0], r1 - vdup.8 d2, d25[0] - vld1.u8 {q6}, [r0], r1 - - vdup.8 d3, d25[4] - vdup.8 d4, d26[0] - vdup.8 d5, d26[4] - - vext.8 d18, d6, d7, #5 ;construct src_ptr[3] - vext.8 d19, d8, d9, #5 - vext.8 d20, d10, d11, #5 - vext.8 d21, d12, d13, #5 - - vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done - vswp d11, d12 - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3]) - vzip.32 d20, d21 - vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp9_filter[5]) - vmull.u8 q8, d20, d5 - - vmov q4, q3 ;keep original src data in q4 q6 - vmov q6, q5 - - vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together - vzip.32 d10, d11 - vshr.u64 q9, q4, #8 ;construct src_ptr[-1] - vshr.u64 q10, q6, #8 - vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp9_filter[0]) - vmlal.u8 q8, d10, d0 - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1]) - vzip.32 d20, d21 - vshr.u64 q3, q4, #32 ;construct src_ptr[2] - vshr.u64 q5, q6, #32 - vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q8, d20, d1 - - vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2]) - vzip.32 d10, d11 - vshr.u64 q9, q4, #16 ;construct src_ptr[0] - vshr.u64 q10, q6, #16 - vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q8, d10, d4 - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0]) - vzip.32 d20, d21 - vshr.u64 q3, q4, #24 ;construct src_ptr[1] - vshr.u64 q5, q6, #24 - vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q8, d20, d2 - - vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1]) - vzip.32 d10, d11 - vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q10, d10, d3 - - add r0, r4, lr - add r1, r0, lr - add r2, r1, lr - - vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q10 - - vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d28, q8, #7 - - vst1.32 {d27[0]}, [r4] ;store result - vst1.32 {d27[1]}, [r0] - vst1.32 {d28[0]}, [r1] - vst1.32 {d28[1]}, [r2] - - pop {r4, pc} - - -;--------------------- -secondpass_filter4x4_only - sub r0, r0, r1, lsl #1 - add r3, r12, r3, lsl #5 - - vld1.32 {d27[0]}, [r0], r1 ;load src data - vld1.s32 {q5, q6}, [r3] ;load second_pass filter - vld1.32 {d27[1]}, [r0], r1 - vabs.s32 q7, q5 - vld1.32 {d28[0]}, [r0], r1 - vabs.s32 q8, q6 - vld1.32 {d28[1]}, [r0], r1 - vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) - vld1.32 {d29[0]}, [r0], r1 - vdup.8 d1, d14[4] - vld1.32 {d29[1]}, [r0], r1 - vdup.8 d2, d15[0] - vld1.32 {d30[0]}, [r0], r1 - vdup.8 d3, d15[4] - vld1.32 {d30[1]}, [r0], r1 - vdup.8 d4, d16[0] - vld1.32 {d31[0]}, [r0], r1 - vdup.8 d5, d16[4] - - vext.8 d23, d27, d28, #4 - vext.8 d24, d28, d29, #4 - vext.8 d25, d29, d30, #4 - vext.8 d26, d30, d31, #4 - - vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp9_filter[0]) - vmull.u8 q4, d28, d0 - - vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp9_filter[5]) - vmull.u8 q6, d26, d5 - - vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q4, d30, d4 - - vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q6, d24, d1 - - vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q4, d29, d2 - - vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp9_filter[3]) - vmlal.u8 q6, d25, d3 - - add r0, r4, lr - add r1, r0, lr - add r2, r1, lr - - vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q6, q4 - - vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d4, q6, #7 - - vst1.32 {d3[0]}, [r4] ;store result - vst1.32 {d3[1]}, [r0] - vst1.32 {d4[0]}, [r1] - vst1.32 {d4[1]}, [r2] - - pop {r4, pc} - - ENDP - -;----------------- - - END diff --git a/vp9/common/arm/neon/vp9_sixtappredict8x4_neon.asm b/vp9/common/arm/neon/vp9_sixtappredict8x4_neon.asm deleted file mode 100644 index 9ce1e3bbd..000000000 --- a/vp9/common/arm/neon/vp9_sixtappredict8x4_neon.asm +++ /dev/null @@ -1,473 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_sixtap_predict8x4_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -filter8_coeff - DCD 0, 0, 128, 0, 0, 0, 0, 0 - DCD 0, -6, 123, 12, -1, 0, 0, 0 - DCD 2, -11, 108, 36, -8, 1, 0, 0 - DCD 0, -9, 93, 50, -6, 0, 0, 0 - DCD 3, -16, 77, 77, -16, 3, 0, 0 - DCD 0, -6, 50, 93, -9, 0, 0, 0 - DCD 1, -8, 36, 108, -11, 2, 0, 0 - DCD 0, -1, 12, 123, -6, 0, 0, 0 - -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; r4 unsigned char *dst_ptr, -; stack(r5) int dst_pitch - -|vp8_sixtap_predict8x4_neon| PROC - push {r4-r5, lr} - - adr r12, filter8_coeff - ldr r4, [sp, #12] ;load parameters from stack - ldr r5, [sp, #16] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq secondpass_filter8x4_only - - add r2, r12, r2, lsl #5 ;calculate filter location - - cmp r3, #0 ;skip second_pass filter if yoffset=0 - - vld1.s32 {q14, q15}, [r2] ;load first_pass filter - - beq firstpass_filter8x4_only - - sub sp, sp, #32 ;reserve space on stack for temporary storage - vabs.s32 q12, q14 - vabs.s32 q13, q15 - - sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) - mov lr, sp - sub r0, r0, r1, lsl #1 - - vdup.8 d0, d24[0] ;first_pass filter (d0-d5) - vdup.8 d1, d24[4] - vdup.8 d2, d25[0] - -;First pass: output_height lines x output_width columns (9x8) - vld1.u8 {q3}, [r0], r1 ;load src data - vdup.8 d3, d25[4] - vld1.u8 {q4}, [r0], r1 - vdup.8 d4, d26[0] - vld1.u8 {q5}, [r0], r1 - vdup.8 d5, d26[4] - vld1.u8 {q6}, [r0], r1 - - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp9_filter[0]) - vmull.u8 q8, d8, d0 - vmull.u8 q9, d10, d0 - vmull.u8 q10, d12, d0 - - vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] - vext.8 d29, d8, d9, #1 - vext.8 d30, d10, d11, #1 - vext.8 d31, d12, d13, #1 - - vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q8, d29, d1 - vmlsl.u8 q9, d30, d1 - vmlsl.u8 q10, d31, d1 - - vext.8 d28, d6, d7, #4 ;construct src_ptr[2] - vext.8 d29, d8, d9, #4 - vext.8 d30, d10, d11, #4 - vext.8 d31, d12, d13, #4 - - vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q8, d29, d4 - vmlsl.u8 q9, d30, d4 - vmlsl.u8 q10, d31, d4 - - vext.8 d28, d6, d7, #2 ;construct src_ptr[0] - vext.8 d29, d8, d9, #2 - vext.8 d30, d10, d11, #2 - vext.8 d31, d12, d13, #2 - - vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q8, d29, d2 - vmlal.u8 q9, d30, d2 - vmlal.u8 q10, d31, d2 - - vext.8 d28, d6, d7, #5 ;construct src_ptr[3] - vext.8 d29, d8, d9, #5 - vext.8 d30, d10, d11, #5 - vext.8 d31, d12, d13, #5 - - vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp9_filter[5]) - vmlal.u8 q8, d29, d5 - vmlal.u8 q9, d30, d5 - vmlal.u8 q10, d31, d5 - - vext.8 d28, d6, d7, #3 ;construct src_ptr[1] - vext.8 d29, d8, d9, #3 - vext.8 d30, d10, d11, #3 - vext.8 d31, d12, d13, #3 - - vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q4, d29, d3 - vmull.u8 q5, d30, d3 - vmull.u8 q6, d31, d3 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vld1.u8 {q3}, [r0], r1 ;load src data - - vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d23, q8, #7 - vqrshrun.s16 d24, q9, #7 - vqrshrun.s16 d25, q10, #7 - - vld1.u8 {q4}, [r0], r1 - vst1.u8 {d22}, [lr]! ;store result - vld1.u8 {q5}, [r0], r1 - vst1.u8 {d23}, [lr]! - vld1.u8 {q6}, [r0], r1 - vst1.u8 {d24}, [lr]! - vld1.u8 {q7}, [r0], r1 - vst1.u8 {d25}, [lr]! - - ;first_pass filtering on the rest 5-line data - vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp9_filter[0]) - vmull.u8 q9, d8, d0 - vmull.u8 q10, d10, d0 - vmull.u8 q11, d12, d0 - vmull.u8 q12, d14, d0 - - vext.8 d27, d6, d7, #1 ;construct src_ptr[-1] - vext.8 d28, d8, d9, #1 - vext.8 d29, d10, d11, #1 - vext.8 d30, d12, d13, #1 - vext.8 d31, d14, d15, #1 - - vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q9, d28, d1 - vmlsl.u8 q10, d29, d1 - vmlsl.u8 q11, d30, d1 - vmlsl.u8 q12, d31, d1 - - vext.8 d27, d6, d7, #4 ;construct src_ptr[2] - vext.8 d28, d8, d9, #4 - vext.8 d29, d10, d11, #4 - vext.8 d30, d12, d13, #4 - vext.8 d31, d14, d15, #4 - - vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q9, d28, d4 - vmlsl.u8 q10, d29, d4 - vmlsl.u8 q11, d30, d4 - vmlsl.u8 q12, d31, d4 - - vext.8 d27, d6, d7, #2 ;construct src_ptr[0] - vext.8 d28, d8, d9, #2 - vext.8 d29, d10, d11, #2 - vext.8 d30, d12, d13, #2 - vext.8 d31, d14, d15, #2 - - vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q9, d28, d2 - vmlal.u8 q10, d29, d2 - vmlal.u8 q11, d30, d2 - vmlal.u8 q12, d31, d2 - - vext.8 d27, d6, d7, #5 ;construct src_ptr[3] - vext.8 d28, d8, d9, #5 - vext.8 d29, d10, d11, #5 - vext.8 d30, d12, d13, #5 - vext.8 d31, d14, d15, #5 - - vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp9_filter[5]) - vmlal.u8 q9, d28, d5 - vmlal.u8 q10, d29, d5 - vmlal.u8 q11, d30, d5 - vmlal.u8 q12, d31, d5 - - vext.8 d27, d6, d7, #3 ;construct src_ptr[1] - vext.8 d28, d8, d9, #3 - vext.8 d29, d10, d11, #3 - vext.8 d30, d12, d13, #3 - vext.8 d31, d14, d15, #3 - - vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q4, d28, d3 - vmull.u8 q5, d29, d3 - vmull.u8 q6, d30, d3 - vmull.u8 q7, d31, d3 - - vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q9, q4 - vqadd.s16 q10, q5 - vqadd.s16 q11, q6 - vqadd.s16 q12, q7 - - vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d27, q9, #7 - vqrshrun.s16 d28, q10, #7 - vqrshrun.s16 d29, q11, #7 ;load intermediate data from stack - vqrshrun.s16 d30, q12, #7 - -;Second pass: 8x4 -;secondpass_filter - add r3, r12, r3, lsl #5 - sub lr, lr, #32 - - vld1.s32 {q5, q6}, [r3] ;load second_pass filter - vld1.u8 {q11}, [lr]! - - vabs.s32 q7, q5 - vabs.s32 q8, q6 - - vld1.u8 {q12}, [lr]! - - vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) - vdup.8 d1, d14[4] - vdup.8 d2, d15[0] - vdup.8 d3, d15[4] - vdup.8 d4, d16[0] - vdup.8 d5, d16[4] - - vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp9_filter[0]) - vmull.u8 q4, d23, d0 - vmull.u8 q5, d24, d0 - vmull.u8 q6, d25, d0 - - vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q4, d24, d1 - vmlsl.u8 q5, d25, d1 - vmlsl.u8 q6, d26, d1 - - vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q4, d27, d4 - vmlsl.u8 q5, d28, d4 - vmlsl.u8 q6, d29, d4 - - vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q4, d25, d2 - vmlal.u8 q5, d26, d2 - vmlal.u8 q6, d27, d2 - - vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp9_filter[5]) - vmlal.u8 q4, d28, d5 - vmlal.u8 q5, d29, d5 - vmlal.u8 q6, d30, d5 - - vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q8, d26, d3 - vmull.u8 q9, d27, d3 - vmull.u8 q10, d28, d3 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d7, q8, #7 - vqrshrun.s16 d8, q9, #7 - vqrshrun.s16 d9, q10, #7 - - vst1.u8 {d6}, [r4], r5 ;store result - vst1.u8 {d7}, [r4], r5 - vst1.u8 {d8}, [r4], r5 - vst1.u8 {d9}, [r4], r5 - - add sp, sp, #32 - pop {r4-r5,pc} - -;-------------------- -firstpass_filter8x4_only - vabs.s32 q12, q14 - vabs.s32 q13, q15 - - sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) - vld1.u8 {q3}, [r0], r1 ;load src data - - vdup.8 d0, d24[0] ;first_pass filter (d0-d5) - vld1.u8 {q4}, [r0], r1 - vdup.8 d1, d24[4] - vld1.u8 {q5}, [r0], r1 - vdup.8 d2, d25[0] - vld1.u8 {q6}, [r0], r1 - vdup.8 d3, d25[4] - vdup.8 d4, d26[0] - vdup.8 d5, d26[4] - -;First pass: output_height lines x output_width columns (4x8) - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp9_filter[0]) - vmull.u8 q8, d8, d0 - vmull.u8 q9, d10, d0 - vmull.u8 q10, d12, d0 - - vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] - vext.8 d29, d8, d9, #1 - vext.8 d30, d10, d11, #1 - vext.8 d31, d12, d13, #1 - - vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q8, d29, d1 - vmlsl.u8 q9, d30, d1 - vmlsl.u8 q10, d31, d1 - - vext.8 d28, d6, d7, #4 ;construct src_ptr[2] - vext.8 d29, d8, d9, #4 - vext.8 d30, d10, d11, #4 - vext.8 d31, d12, d13, #4 - - vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q8, d29, d4 - vmlsl.u8 q9, d30, d4 - vmlsl.u8 q10, d31, d4 - - vext.8 d28, d6, d7, #2 ;construct src_ptr[0] - vext.8 d29, d8, d9, #2 - vext.8 d30, d10, d11, #2 - vext.8 d31, d12, d13, #2 - - vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q8, d29, d2 - vmlal.u8 q9, d30, d2 - vmlal.u8 q10, d31, d2 - - vext.8 d28, d6, d7, #5 ;construct src_ptr[3] - vext.8 d29, d8, d9, #5 - vext.8 d30, d10, d11, #5 - vext.8 d31, d12, d13, #5 - - vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp9_filter[5]) - vmlal.u8 q8, d29, d5 - vmlal.u8 q9, d30, d5 - vmlal.u8 q10, d31, d5 - - vext.8 d28, d6, d7, #3 ;construct src_ptr[1] - vext.8 d29, d8, d9, #3 - vext.8 d30, d10, d11, #3 - vext.8 d31, d12, d13, #3 - - vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q4, d29, d3 - vmull.u8 q5, d30, d3 - vmull.u8 q6, d31, d3 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d23, q8, #7 - vqrshrun.s16 d24, q9, #7 - vqrshrun.s16 d25, q10, #7 - - vst1.u8 {d22}, [r4], r5 ;store result - vst1.u8 {d23}, [r4], r5 - vst1.u8 {d24}, [r4], r5 - vst1.u8 {d25}, [r4], r5 - - pop {r4-r5,pc} - -;--------------------- -secondpass_filter8x4_only -;Second pass: 8x4 - add r3, r12, r3, lsl #5 - sub r0, r0, r1, lsl #1 - vld1.s32 {q5, q6}, [r3] ;load second_pass filter - vabs.s32 q7, q5 - vabs.s32 q8, q6 - - vld1.u8 {d22}, [r0], r1 - vld1.u8 {d23}, [r0], r1 - vld1.u8 {d24}, [r0], r1 - vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) - vld1.u8 {d25}, [r0], r1 - vdup.8 d1, d14[4] - vld1.u8 {d26}, [r0], r1 - vdup.8 d2, d15[0] - vld1.u8 {d27}, [r0], r1 - vdup.8 d3, d15[4] - vld1.u8 {d28}, [r0], r1 - vdup.8 d4, d16[0] - vld1.u8 {d29}, [r0], r1 - vdup.8 d5, d16[4] - vld1.u8 {d30}, [r0], r1 - - vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp9_filter[0]) - vmull.u8 q4, d23, d0 - vmull.u8 q5, d24, d0 - vmull.u8 q6, d25, d0 - - vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q4, d24, d1 - vmlsl.u8 q5, d25, d1 - vmlsl.u8 q6, d26, d1 - - vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q4, d27, d4 - vmlsl.u8 q5, d28, d4 - vmlsl.u8 q6, d29, d4 - - vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q4, d25, d2 - vmlal.u8 q5, d26, d2 - vmlal.u8 q6, d27, d2 - - vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp9_filter[5]) - vmlal.u8 q4, d28, d5 - vmlal.u8 q5, d29, d5 - vmlal.u8 q6, d30, d5 - - vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q8, d26, d3 - vmull.u8 q9, d27, d3 - vmull.u8 q10, d28, d3 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d7, q8, #7 - vqrshrun.s16 d8, q9, #7 - vqrshrun.s16 d9, q10, #7 - - vst1.u8 {d6}, [r4], r5 ;store result - vst1.u8 {d7}, [r4], r5 - vst1.u8 {d8}, [r4], r5 - vst1.u8 {d9}, [r4], r5 - - pop {r4-r5,pc} - - ENDP - -;----------------- - - END diff --git a/vp9/common/arm/neon/vp9_sixtappredict8x8_neon.asm b/vp9/common/arm/neon/vp9_sixtappredict8x8_neon.asm deleted file mode 100644 index 5ff16616d..000000000 --- a/vp9/common/arm/neon/vp9_sixtappredict8x8_neon.asm +++ /dev/null @@ -1,524 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_sixtap_predict8x8_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -filter8_coeff - DCD 0, 0, 128, 0, 0, 0, 0, 0 - DCD 0, -6, 123, 12, -1, 0, 0, 0 - DCD 2, -11, 108, 36, -8, 1, 0, 0 - DCD 0, -9, 93, 50, -6, 0, 0, 0 - DCD 3, -16, 77, 77, -16, 3, 0, 0 - DCD 0, -6, 50, 93, -9, 0, 0, 0 - DCD 1, -8, 36, 108, -11, 2, 0, 0 - DCD 0, -1, 12, 123, -6, 0, 0, 0 - -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; stack(r4) unsigned char *dst_ptr, -; stack(r5) int dst_pitch - -|vp8_sixtap_predict8x8_neon| PROC - push {r4-r5, lr} - - adr r12, filter8_coeff - - ldr r4, [sp, #12] ;load parameters from stack - ldr r5, [sp, #16] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq secondpass_filter8x8_only - - add r2, r12, r2, lsl #5 ;calculate filter location - - cmp r3, #0 ;skip second_pass filter if yoffset=0 - - vld1.s32 {q14, q15}, [r2] ;load first_pass filter - - beq firstpass_filter8x8_only - - sub sp, sp, #64 ;reserve space on stack for temporary storage - mov lr, sp - - vabs.s32 q12, q14 - vabs.s32 q13, q15 - - mov r2, #2 ;loop counter - sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) - sub r0, r0, r1, lsl #1 - - vdup.8 d0, d24[0] ;first_pass filter (d0-d5) - vdup.8 d1, d24[4] - vdup.8 d2, d25[0] - -;First pass: output_height lines x output_width columns (13x8) - vld1.u8 {q3}, [r0], r1 ;load src data - vdup.8 d3, d25[4] - vld1.u8 {q4}, [r0], r1 - vdup.8 d4, d26[0] - vld1.u8 {q5}, [r0], r1 - vdup.8 d5, d26[4] - vld1.u8 {q6}, [r0], r1 - -filt_blk2d_fp8x8_loop_neon - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp9_filter[0]) - vmull.u8 q8, d8, d0 - vmull.u8 q9, d10, d0 - vmull.u8 q10, d12, d0 - - vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] - vext.8 d29, d8, d9, #1 - vext.8 d30, d10, d11, #1 - vext.8 d31, d12, d13, #1 - - vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q8, d29, d1 - vmlsl.u8 q9, d30, d1 - vmlsl.u8 q10, d31, d1 - - vext.8 d28, d6, d7, #4 ;construct src_ptr[2] - vext.8 d29, d8, d9, #4 - vext.8 d30, d10, d11, #4 - vext.8 d31, d12, d13, #4 - - vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q8, d29, d4 - vmlsl.u8 q9, d30, d4 - vmlsl.u8 q10, d31, d4 - - vext.8 d28, d6, d7, #2 ;construct src_ptr[0] - vext.8 d29, d8, d9, #2 - vext.8 d30, d10, d11, #2 - vext.8 d31, d12, d13, #2 - - vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q8, d29, d2 - vmlal.u8 q9, d30, d2 - vmlal.u8 q10, d31, d2 - - vext.8 d28, d6, d7, #5 ;construct src_ptr[3] - vext.8 d29, d8, d9, #5 - vext.8 d30, d10, d11, #5 - vext.8 d31, d12, d13, #5 - - vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp9_filter[5]) - vmlal.u8 q8, d29, d5 - vmlal.u8 q9, d30, d5 - vmlal.u8 q10, d31, d5 - - vext.8 d28, d6, d7, #3 ;construct src_ptr[1] - vext.8 d29, d8, d9, #3 - vext.8 d30, d10, d11, #3 - vext.8 d31, d12, d13, #3 - - vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q4, d29, d3 - vmull.u8 q5, d30, d3 - vmull.u8 q6, d31, d3 - - subs r2, r2, #1 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vld1.u8 {q3}, [r0], r1 ;load src data - - vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d23, q8, #7 - vqrshrun.s16 d24, q9, #7 - vqrshrun.s16 d25, q10, #7 - - vst1.u8 {d22}, [lr]! ;store result - vld1.u8 {q4}, [r0], r1 - vst1.u8 {d23}, [lr]! - vld1.u8 {q5}, [r0], r1 - vst1.u8 {d24}, [lr]! - vld1.u8 {q6}, [r0], r1 - vst1.u8 {d25}, [lr]! - - bne filt_blk2d_fp8x8_loop_neon - - ;first_pass filtering on the rest 5-line data - ;vld1.u8 {q3}, [r0], r1 ;load src data - ;vld1.u8 {q4}, [r0], r1 - ;vld1.u8 {q5}, [r0], r1 - ;vld1.u8 {q6}, [r0], r1 - vld1.u8 {q7}, [r0], r1 - - vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp9_filter[0]) - vmull.u8 q9, d8, d0 - vmull.u8 q10, d10, d0 - vmull.u8 q11, d12, d0 - vmull.u8 q12, d14, d0 - - vext.8 d27, d6, d7, #1 ;construct src_ptr[-1] - vext.8 d28, d8, d9, #1 - vext.8 d29, d10, d11, #1 - vext.8 d30, d12, d13, #1 - vext.8 d31, d14, d15, #1 - - vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q9, d28, d1 - vmlsl.u8 q10, d29, d1 - vmlsl.u8 q11, d30, d1 - vmlsl.u8 q12, d31, d1 - - vext.8 d27, d6, d7, #4 ;construct src_ptr[2] - vext.8 d28, d8, d9, #4 - vext.8 d29, d10, d11, #4 - vext.8 d30, d12, d13, #4 - vext.8 d31, d14, d15, #4 - - vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q9, d28, d4 - vmlsl.u8 q10, d29, d4 - vmlsl.u8 q11, d30, d4 - vmlsl.u8 q12, d31, d4 - - vext.8 d27, d6, d7, #2 ;construct src_ptr[0] - vext.8 d28, d8, d9, #2 - vext.8 d29, d10, d11, #2 - vext.8 d30, d12, d13, #2 - vext.8 d31, d14, d15, #2 - - vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q9, d28, d2 - vmlal.u8 q10, d29, d2 - vmlal.u8 q11, d30, d2 - vmlal.u8 q12, d31, d2 - - vext.8 d27, d6, d7, #5 ;construct src_ptr[3] - vext.8 d28, d8, d9, #5 - vext.8 d29, d10, d11, #5 - vext.8 d30, d12, d13, #5 - vext.8 d31, d14, d15, #5 - - vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp9_filter[5]) - vmlal.u8 q9, d28, d5 - vmlal.u8 q10, d29, d5 - vmlal.u8 q11, d30, d5 - vmlal.u8 q12, d31, d5 - - vext.8 d27, d6, d7, #3 ;construct src_ptr[1] - vext.8 d28, d8, d9, #3 - vext.8 d29, d10, d11, #3 - vext.8 d30, d12, d13, #3 - vext.8 d31, d14, d15, #3 - - vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q4, d28, d3 - vmull.u8 q5, d29, d3 - vmull.u8 q6, d30, d3 - vmull.u8 q7, d31, d3 - - vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q9, q4 - vqadd.s16 q10, q5 - vqadd.s16 q11, q6 - vqadd.s16 q12, q7 - - add r3, r12, r3, lsl #5 - - vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8 - sub lr, lr, #64 - vqrshrun.s16 d27, q9, #7 - vld1.u8 {q9}, [lr]! ;load intermediate data from stack - vqrshrun.s16 d28, q10, #7 - vld1.u8 {q10}, [lr]! - - vld1.s32 {q5, q6}, [r3] ;load second_pass filter - - vqrshrun.s16 d29, q11, #7 - vld1.u8 {q11}, [lr]! - - vabs.s32 q7, q5 - vabs.s32 q8, q6 - - vqrshrun.s16 d30, q12, #7 - vld1.u8 {q12}, [lr]! - -;Second pass: 8x8 - mov r3, #2 ;loop counter - - vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) - vdup.8 d1, d14[4] - vdup.8 d2, d15[0] - vdup.8 d3, d15[4] - vdup.8 d4, d16[0] - vdup.8 d5, d16[4] - -filt_blk2d_sp8x8_loop_neon - vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp9_filter[0]) - vmull.u8 q4, d19, d0 - vmull.u8 q5, d20, d0 - vmull.u8 q6, d21, d0 - - vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q4, d20, d1 - vmlsl.u8 q5, d21, d1 - vmlsl.u8 q6, d22, d1 - - vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q4, d23, d4 - vmlsl.u8 q5, d24, d4 - vmlsl.u8 q6, d25, d4 - - vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q4, d21, d2 - vmlal.u8 q5, d22, d2 - vmlal.u8 q6, d23, d2 - - vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp9_filter[5]) - vmlal.u8 q4, d24, d5 - vmlal.u8 q5, d25, d5 - vmlal.u8 q6, d26, d5 - - vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q8, d22, d3 - vmull.u8 q9, d23, d3 - vmull.u8 q10, d24, d3 - - subs r3, r3, #1 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d7, q8, #7 - vqrshrun.s16 d8, q9, #7 - vqrshrun.s16 d9, q10, #7 - - vmov q9, q11 - vst1.u8 {d6}, [r4], r5 ;store result - vmov q10, q12 - vst1.u8 {d7}, [r4], r5 - vmov q11, q13 - vst1.u8 {d8}, [r4], r5 - vmov q12, q14 - vst1.u8 {d9}, [r4], r5 - vmov d26, d30 - - bne filt_blk2d_sp8x8_loop_neon - - add sp, sp, #64 - pop {r4-r5,pc} - -;--------------------- -firstpass_filter8x8_only - ;add r2, r12, r2, lsl #5 ;calculate filter location - ;vld1.s32 {q14, q15}, [r2] ;load first_pass filter - vabs.s32 q12, q14 - vabs.s32 q13, q15 - - mov r2, #2 ;loop counter - sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) - - vdup.8 d0, d24[0] ;first_pass filter (d0-d5) - vdup.8 d1, d24[4] - vdup.8 d2, d25[0] - vdup.8 d3, d25[4] - vdup.8 d4, d26[0] - vdup.8 d5, d26[4] - -;First pass: output_height lines x output_width columns (8x8) -filt_blk2d_fpo8x8_loop_neon - vld1.u8 {q3}, [r0], r1 ;load src data - vld1.u8 {q4}, [r0], r1 - vld1.u8 {q5}, [r0], r1 - vld1.u8 {q6}, [r0], r1 - - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp9_filter[0]) - vmull.u8 q8, d8, d0 - vmull.u8 q9, d10, d0 - vmull.u8 q10, d12, d0 - - vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] - vext.8 d29, d8, d9, #1 - vext.8 d30, d10, d11, #1 - vext.8 d31, d12, d13, #1 - - vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q8, d29, d1 - vmlsl.u8 q9, d30, d1 - vmlsl.u8 q10, d31, d1 - - vext.8 d28, d6, d7, #4 ;construct src_ptr[2] - vext.8 d29, d8, d9, #4 - vext.8 d30, d10, d11, #4 - vext.8 d31, d12, d13, #4 - - vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q8, d29, d4 - vmlsl.u8 q9, d30, d4 - vmlsl.u8 q10, d31, d4 - - vext.8 d28, d6, d7, #2 ;construct src_ptr[0] - vext.8 d29, d8, d9, #2 - vext.8 d30, d10, d11, #2 - vext.8 d31, d12, d13, #2 - - vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q8, d29, d2 - vmlal.u8 q9, d30, d2 - vmlal.u8 q10, d31, d2 - - vext.8 d28, d6, d7, #5 ;construct src_ptr[3] - vext.8 d29, d8, d9, #5 - vext.8 d30, d10, d11, #5 - vext.8 d31, d12, d13, #5 - - vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp9_filter[5]) - vmlal.u8 q8, d29, d5 - vmlal.u8 q9, d30, d5 - vmlal.u8 q10, d31, d5 - - vext.8 d28, d6, d7, #3 ;construct src_ptr[1] - vext.8 d29, d8, d9, #3 - vext.8 d30, d10, d11, #3 - vext.8 d31, d12, d13, #3 - - vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q4, d29, d3 - vmull.u8 q5, d30, d3 - vmull.u8 q6, d31, d3 - ; - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - subs r2, r2, #1 - - vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d23, q8, #7 - vqrshrun.s16 d24, q9, #7 - vqrshrun.s16 d25, q10, #7 - - vst1.u8 {d22}, [r4], r5 ;store result - vst1.u8 {d23}, [r4], r5 - vst1.u8 {d24}, [r4], r5 - vst1.u8 {d25}, [r4], r5 - - bne filt_blk2d_fpo8x8_loop_neon - - pop {r4-r5,pc} - -;--------------------- -secondpass_filter8x8_only - sub r0, r0, r1, lsl #1 - add r3, r12, r3, lsl #5 - - vld1.u8 {d18}, [r0], r1 ;load src data - vld1.s32 {q5, q6}, [r3] ;load second_pass filter - vld1.u8 {d19}, [r0], r1 - vabs.s32 q7, q5 - vld1.u8 {d20}, [r0], r1 - vabs.s32 q8, q6 - vld1.u8 {d21}, [r0], r1 - mov r3, #2 ;loop counter - vld1.u8 {d22}, [r0], r1 - vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) - vld1.u8 {d23}, [r0], r1 - vdup.8 d1, d14[4] - vld1.u8 {d24}, [r0], r1 - vdup.8 d2, d15[0] - vld1.u8 {d25}, [r0], r1 - vdup.8 d3, d15[4] - vld1.u8 {d26}, [r0], r1 - vdup.8 d4, d16[0] - vld1.u8 {d27}, [r0], r1 - vdup.8 d5, d16[4] - vld1.u8 {d28}, [r0], r1 - vld1.u8 {d29}, [r0], r1 - vld1.u8 {d30}, [r0], r1 - -;Second pass: 8x8 -filt_blk2d_spo8x8_loop_neon - vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp9_filter[0]) - vmull.u8 q4, d19, d0 - vmull.u8 q5, d20, d0 - vmull.u8 q6, d21, d0 - - vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q4, d20, d1 - vmlsl.u8 q5, d21, d1 - vmlsl.u8 q6, d22, d1 - - vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q4, d23, d4 - vmlsl.u8 q5, d24, d4 - vmlsl.u8 q6, d25, d4 - - vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q4, d21, d2 - vmlal.u8 q5, d22, d2 - vmlal.u8 q6, d23, d2 - - vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp9_filter[5]) - vmlal.u8 q4, d24, d5 - vmlal.u8 q5, d25, d5 - vmlal.u8 q6, d26, d5 - - vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q8, d22, d3 - vmull.u8 q9, d23, d3 - vmull.u8 q10, d24, d3 - - subs r3, r3, #1 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d7, q8, #7 - vqrshrun.s16 d8, q9, #7 - vqrshrun.s16 d9, q10, #7 - - vmov q9, q11 - vst1.u8 {d6}, [r4], r5 ;store result - vmov q10, q12 - vst1.u8 {d7}, [r4], r5 - vmov q11, q13 - vst1.u8 {d8}, [r4], r5 - vmov q12, q14 - vst1.u8 {d9}, [r4], r5 - vmov d26, d30 - - bne filt_blk2d_spo8x8_loop_neon - - pop {r4-r5,pc} - - ENDP - -;----------------- - - END diff --git a/vp9/common/arm/vp9_arm_systemdependent.c b/vp9/common/arm/vp9_arm_systemdependent.c deleted file mode 100644 index a6319a4c5..000000000 --- a/vp9/common/arm/vp9_arm_systemdependent.c +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vpx_ports/config.h" -#include "vpx_ports/arm.h" -#include "vp9/common/vp9_pragmas.h" -#include "vp9/common/vp9_subpixel.h" -#include "vp9/common/vp9_loopfilter.h" -#include "vp9/common/recon.h" -#include "vp9/common/vp9_onyxc_int.h" - -void vp9_arch_arm_common_init(VP9_COMMON *ctx) { -#if CONFIG_RUNTIME_CPU_DETECT - VP9_COMMON_RTCD *rtcd = &ctx->rtcd; - int flags = arm_cpu_caps(); - rtcd->flags = flags; - - /* Override default functions with fastest ones for this CPU. */ -#if HAVE_ARMV5TE - if (flags & HAS_EDSP) { - } -#endif - -// The commented functions need to be re-written for vpx. -#if HAVE_ARMV6 - if (flags & HAS_MEDIA) { - rtcd->subpix.sixtap16x16 = vp9_sixtap_predict16x16_armv6; - rtcd->subpix.sixtap8x8 = vp9_sixtap_predict8x8_armv6; - rtcd->subpix.sixtap8x4 = vp9_sixtap_predict8x4_armv6; - rtcd->subpix.sixtap4x4 = vp9_sixtap_predict_armv6; - - rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_armv6; - rtcd->subpix.bilinear8x8 = vp9_bilinear_predict8x8_armv6; - rtcd->subpix.bilinear8x4 = vp9_bilinear_predict8x4_armv6; - rtcd->subpix.bilinear4x4 = vp9_bilinear_predict4x4_armv6; - - // rtcd->idct.idct1 = vp9_short_idct4x4llm_1_v6; - // rtcd->idct.idct16 = vp9_short_idct4x4llm_v6_dual; - // rtcd->idct.iwalsh1 = vp9_short_inv_walsh4x4_1_v6; - // rtcd->idct.iwalsh16 = vp9_short_inv_walsh4x4_v6; - - rtcd->recon.copy16x16 = vp9_copy_mem16x16_v6; - rtcd->recon.copy8x8 = vp9_copy_mem8x8_v6; - rtcd->recon.copy8x4 = vp9_copy_mem8x4_v6; - rtcd->recon.recon = vp9_recon_b_armv6; - rtcd->recon.recon2 = vp9_recon2b_armv6; - rtcd->recon.recon4 = vp9_recon4b_armv6; - } -#endif - -#if HAVE_ARMV7 - if (flags & HAS_NEON) { - rtcd->subpix.sixtap16x16 = vp9_sixtap_predict16x16_neon; - rtcd->subpix.sixtap8x8 = vp9_sixtap_predict8x8_neon; - rtcd->subpix.sixtap8x4 = vp9_sixtap_predict8x4_neon; - rtcd->subpix.sixtap4x4 = vp9_sixtap_predict_neon; - - rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_neon; - rtcd->subpix.bilinear8x8 = vp9_bilinear_predict8x8_neon; - rtcd->subpix.bilinear8x4 = vp9_bilinear_predict8x4_neon; - rtcd->subpix.bilinear4x4 = vp9_bilinear_predict4x4_neon; - - // rtcd->idct.idct1 = vp9_short_idct4x4llm_1_neon; - // rtcd->idct.idct16 = vp9_short_idct4x4llm_neon; - // rtcd->idct.iwalsh1 = vp9_short_inv_walsh4x4_1_neon; - // rtcd->idct.iwalsh16 = vp9_short_inv_walsh4x4_neon; - - rtcd->recon.copy16x16 = vp9_copy_mem16x16_neon; - rtcd->recon.copy8x8 = vp9_copy_mem8x8_neon; - rtcd->recon.copy8x4 = vp9_copy_mem8x4_neon; - rtcd->recon.recon = vp9_recon_b_neon; - rtcd->recon.recon2 = vp9_recon2b_neon; - rtcd->recon.recon4 = vp9_recon4b_neon; - rtcd->recon.recon_mb = vp9_recon_mb_neon; - rtcd->recon.build_intra_predictors_mby = - vp9_build_intra_predictors_mby_neon; - rtcd->recon.build_intra_predictors_mby_s = - vp9_build_intra_predictors_mby_s_neon; - } -#endif - -#endif -} diff --git a/vp9/common/arm/vp9_bilinearfilter_arm.c b/vp9/common/arm/vp9_bilinearfilter_arm.c deleted file mode 100644 index 678173141..000000000 --- a/vp9/common/arm/vp9_bilinearfilter_arm.c +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include -#include "vp9/common/vp9_filter.h" -#include "vp9/common/vp9_subpixel.h" -#include "vp9/common/arm/vp9_bilinearfilter_arm.h" - -void vp9_filter_block2d_bil_armv6 -( - unsigned char *src_ptr, - unsigned char *dst_ptr, - unsigned int src_pitch, - unsigned int dst_pitch, - const short *HFilter, - const short *VFilter, - int Width, - int Height -) { - unsigned short FData[36 * 16]; /* Temp data buffer used in filtering */ - - /* First filter 1-D horizontally... */ - vp9_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pitch, Height + 1, Width, HFilter); - - /* then 1-D vertically... */ - vp9_filter_block2d_bil_second_pass_armv6(FData, dst_ptr, dst_pitch, Height, Width, VFilter); -} - - -void vp9_bilinear_predict4x4_armv6 -( - unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch -) { - const short *HFilter; - const short *VFilter; - - HFilter = vp8_bilinear_filters[xoffset]; - VFilter = vp8_bilinear_filters[yoffset]; - - vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4); -} - -void vp9_bilinear_predict8x8_armv6 -( - unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch -) { - const short *HFilter; - const short *VFilter; - - HFilter = vp8_bilinear_filters[xoffset]; - VFilter = vp8_bilinear_filters[yoffset]; - - vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8); -} - -void vp9_bilinear_predict8x4_armv6 -( - unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch -) { - const short *HFilter; - const short *VFilter; - - HFilter = vp8_bilinear_filters[xoffset]; - VFilter = vp8_bilinear_filters[yoffset]; - - vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4); -} - -void vp9_bilinear_predict16x16_armv6 -( - unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch -) { - const short *HFilter; - const short *VFilter; - - HFilter = vp8_bilinear_filters[xoffset]; - VFilter = vp8_bilinear_filters[yoffset]; - - vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16); -} diff --git a/vp9/common/arm/vp9_bilinearfilter_arm.h b/vp9/common/arm/vp9_bilinearfilter_arm.h deleted file mode 100644 index 422691e44..000000000 --- a/vp9/common/arm/vp9_bilinearfilter_arm.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2011 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef VP9_COMMON_ARM_VP9_BILINEARFILTER_ARM_H_ -#define VP9_COMMON_ARM_VP9_BILINEARFILTER_ARM_H_ - -extern void vp9_filter_block2d_bil_first_pass_armv6 -( - const unsigned char *src_ptr, - unsigned short *dst_ptr, - unsigned int src_pitch, - unsigned int height, - unsigned int width, - const short *vp9_filter -); - -extern void vp9_filter_block2d_bil_second_pass_armv6 -( - const unsigned short *src_ptr, - unsigned char *dst_ptr, - int dst_pitch, - unsigned int height, - unsigned int width, - const short *vp9_filter -); - -#endif /* BILINEARFILTER_ARM_H */ diff --git a/vp9/common/arm/vp9_filter_arm.c b/vp9/common/arm/vp9_filter_arm.c deleted file mode 100644 index f55273c33..000000000 --- a/vp9/common/arm/vp9_filter_arm.c +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vpx_ports/config.h" -#include -#include "vp9/common/vp9_filter.h" -#include "vp9/common/vp9_subpixel.h" -#include "vpx_ports/mem.h" - -extern void vp9_filter_block2d_first_pass_armv6 -( - unsigned char *src_ptr, - short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int output_width, - unsigned int output_height, - const short *vp9_filter -); - -// 8x8 -extern void vp9_filter_block2d_first_pass_8x8_armv6 -( - unsigned char *src_ptr, - short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int output_width, - unsigned int output_height, - const short *vp9_filter -); - -// 16x16 -extern void vp9_filter_block2d_first_pass_16x16_armv6 -( - unsigned char *src_ptr, - short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int output_width, - unsigned int output_height, - const short *vp9_filter -); - -extern void vp9_filter_block2d_second_pass_armv6 -( - short *src_ptr, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int cnt, - const short *vp9_filter -); - -extern void vp9_filter4_block2d_second_pass_armv6 -( - short *src_ptr, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int cnt, - const short *vp9_filter -); - -extern void vp9_filter_block2d_first_pass_only_armv6 -( - unsigned char *src_ptr, - unsigned char *output_ptr, - unsigned int src_pixels_per_line, - unsigned int cnt, - unsigned int output_pitch, - const short *vp9_filter -); - - -extern void vp9_filter_block2d_second_pass_only_armv6 -( - unsigned char *src_ptr, - unsigned char *output_ptr, - unsigned int src_pixels_per_line, - unsigned int cnt, - unsigned int output_pitch, - const short *vp9_filter -); - -#if HAVE_ARMV6 -void vp9_sixtap_predict_armv6 -( - unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch -) { - const short *HFilter; - const short *VFilter; - DECLARE_ALIGNED_ARRAY(4, short, FData, 12 * 4); /* Temp data buffer used in filtering */ - - - HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */ - VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */ - - /* Vfilter is null. First pass only */ - if (xoffset && !yoffset) { - /*vp9_filter_block2d_first_pass_armv6 ( src_ptr, FData+2, src_pixels_per_line, 4, 4, HFilter ); - vp9_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, VFilter );*/ - - vp9_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, HFilter); - } - /* Hfilter is null. Second pass only */ - else if (!xoffset && yoffset) { - vp9_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, VFilter); - } else { - /* Vfilter is a 4 tap filter */ - if (yoffset & 0x1) { - vp9_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 4, 7, HFilter); - vp9_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter); - } - /* Vfilter is 6 tap filter */ - else { - vp9_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 4, 9, HFilter); - vp9_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter); - } - } -} - -void vp9_sixtap_predict8x8_armv6 -( - unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch -) { - const short *HFilter; - const short *VFilter; - DECLARE_ALIGNED_ARRAY(4, short, FData, 16 * 8); /* Temp data buffer used in filtering */ - - HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */ - VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */ - - if (xoffset && !yoffset) { - vp9_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter); - } - /* Hfilter is null. Second pass only */ - else if (!xoffset && yoffset) { - vp9_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter); - } else { - if (yoffset & 0x1) { - vp9_filter_block2d_first_pass_8x8_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 8, 11, HFilter); - vp9_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter); - } else { - vp9_filter_block2d_first_pass_8x8_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, 13, HFilter); - vp9_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter); - } - } -} - - -void vp9_sixtap_predict16x16_armv6 -( - unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch -) { - const short *HFilter; - const short *VFilter; - DECLARE_ALIGNED_ARRAY(4, short, FData, 24 * 16); /* Temp data buffer used in filtering */ - - HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */ - VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */ - - if (xoffset && !yoffset) { - vp9_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, HFilter); - } - /* Hfilter is null. Second pass only */ - else if (!xoffset && yoffset) { - vp9_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, VFilter); - } else { - if (yoffset & 0x1) { - vp9_filter_block2d_first_pass_16x16_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 16, 19, HFilter); - vp9_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter); - } else { - vp9_filter_block2d_first_pass_16x16_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, 21, HFilter); - vp9_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter); - } - } - -} -#endif diff --git a/vp9/common/arm/vp9_idct_arm.h b/vp9/common/arm/vp9_idct_arm.h deleted file mode 100644 index 8112ab913..000000000 --- a/vp9/common/arm/vp9_idct_arm.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef VP9_COMMON_ARM_VP9_IDCT_ARM_H_ -#define VP9_COMMON_ARM_VP9_IDCT_ARM_H_ - -#if HAVE_ARMV6 -extern prototype_idct(vp9_short_idct4x4llm_1_v6); -extern prototype_idct(vp9_short_idct4x4llm_v6_dual); -extern prototype_idct_scalar_add(vp9_dc_only_idct_add_v6); -extern prototype_second_order(vp9_short_inv_walsh4x4_1_v6); -extern prototype_second_order(vp9_short_inv_walsh4x4_v6); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp9_idct_idct1 -#define vp9_idct_idct1 vp9_short_idct4x4llm_1_v6 - -#undef vp9_idct_idct16 -#define vp9_idct_idct16 vp9_short_idct4x4llm_v6_dual - -#undef vp9_idct_idct1_scalar_add -#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_v6 - -#undef vp8_idct_iwalsh1 -#define vp8_idct_iwalsh1 vp9_short_inv_walsh4x4_1_v6 - -#undef vp8_idct_iwalsh16 -#define vp8_idct_iwalsh16 vp9_short_inv_walsh4x4_v6 -#endif -#endif - -#if HAVE_ARMV7 -extern prototype_idct(vp9_short_idct4x4llm_1_neon); -extern prototype_idct(vp9_short_idct4x4llm_neon); -extern prototype_idct_scalar_add(vp9_dc_only_idct_add_neon); -extern prototype_second_order(vp9_short_inv_walsh4x4_1_neon); -extern prototype_second_order(vp9_short_inv_walsh4x4_neon); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp9_idct_idct1 -#define vp9_idct_idct1 vp9_short_idct4x4llm_1_neon - -#undef vp9_idct_idct16 -#define vp9_idct_idct16 vp9_short_idct4x4llm_neon - -#undef vp9_idct_idct1_scalar_add -#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_neon - -#undef vp8_idct_iwalsh1 -#define vp8_idct_iwalsh1 vp9_short_inv_walsh4x4_1_neon - -#undef vp8_idct_iwalsh16 -#define vp8_idct_iwalsh16 vp9_short_inv_walsh4x4_neon -#endif -#endif - -#endif diff --git a/vp9/common/arm/vp9_loopfilter_arm.c b/vp9/common/arm/vp9_loopfilter_arm.c deleted file mode 100644 index b61f1a86b..000000000 --- a/vp9/common/arm/vp9_loopfilter_arm.c +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vpx_config.h" -#include "vp9/common/vp9_loopfilter.h" -#include "vp9/common/vp9_onyxc_int.h" - -#if HAVE_ARMV6 -extern prototype_loopfilter(vp9_loop_filter_horizontal_edge_armv6); -extern prototype_loopfilter(vp9_loop_filter_vertical_edge_armv6); -extern prototype_loopfilter(vp9_mbloop_filter_horizontal_edge_armv6); -extern prototype_loopfilter(vp9_mbloop_filter_vertical_edge_armv6); -#endif - -#if HAVE_ARMV7 -typedef void loopfilter_y_neon(unsigned char *src, int pitch, - unsigned char blimit, unsigned char limit, unsigned char thresh); -typedef void loopfilter_uv_neon(unsigned char *u, int pitch, - unsigned char blimit, unsigned char limit, unsigned char thresh, - unsigned char *v); - -extern loopfilter_y_neon vp9_loop_filter_horizontal_edge_y_neon; -extern loopfilter_y_neon vp9_loop_filter_vertical_edge_y_neon; -extern loopfilter_y_neon vp9_mbloop_filter_horizontal_edge_y_neon; -extern loopfilter_y_neon vp9_mbloop_filter_vertical_edge_y_neon; - -extern loopfilter_uv_neon vp9_loop_filter_horizontal_edge_uv_neon; -extern loopfilter_uv_neon vp9_loop_filter_vertical_edge_uv_neon; -extern loopfilter_uv_neon vp9_mbloop_filter_horizontal_edge_uv_neon; -extern loopfilter_uv_neon vp9_mbloop_filter_vertical_edge_uv_neon; -#endif - -#if HAVE_ARMV6 -/*ARMV6 loopfilter functions*/ -/* Horizontal MB filtering */ -void vp9_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) { - vp9_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); - - if (u_ptr) - vp9_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); - - if (v_ptr) - vp9_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); -} - -/* Vertical MB Filtering */ -void vp9_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) { - vp9_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); - - if (u_ptr) - vp9_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); - - if (v_ptr) - vp9_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); -} - -/* Horizontal B Filtering */ -void vp9_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) { - vp9_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp9_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp9_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); - - if (u_ptr) - vp9_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); - - if (v_ptr) - vp9_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); -} - -void vp9_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride, - const unsigned char *blimit) { - vp9_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, blimit); - vp9_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, blimit); - vp9_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, blimit); -} - -/* Vertical B Filtering */ -void vp9_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) { - vp9_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp9_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp9_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); - - if (u_ptr) - vp9_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); - - if (v_ptr) - vp9_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); -} - -void vp9_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride, - const unsigned char *blimit) { - vp9_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit); - vp9_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit); - vp9_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit); -} -#endif - -#if HAVE_ARMV7 -/* NEON loopfilter functions */ -/* Horizontal MB filtering */ -void vp9_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) { - unsigned char mblim = *lfi->mblim; - unsigned char lim = *lfi->lim; - unsigned char hev_thr = *lfi->hev_thr; - vp9_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr); - - if (u_ptr) - vp9_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr); -} - -/* Vertical MB Filtering */ -void vp9_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) { - unsigned char mblim = *lfi->mblim; - unsigned char lim = *lfi->lim; - unsigned char hev_thr = *lfi->hev_thr; - - vp9_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr); - - if (u_ptr) - vp9_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr); -} - -/* Horizontal B Filtering */ -void vp9_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) { - unsigned char blim = *lfi->blim; - unsigned char lim = *lfi->lim; - unsigned char hev_thr = *lfi->hev_thr; - - vp9_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim, lim, hev_thr); - vp9_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim, lim, hev_thr); - vp9_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim, lim, hev_thr); - - if (u_ptr) - vp9_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, blim, lim, hev_thr, v_ptr + 4 * uv_stride); -} - -/* Vertical B Filtering */ -void vp9_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) { - unsigned char blim = *lfi->blim; - unsigned char lim = *lfi->lim; - unsigned char hev_thr = *lfi->hev_thr; - - vp9_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, blim, lim, hev_thr); - vp9_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, blim, lim, hev_thr); - vp9_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, blim, lim, hev_thr); - - if (u_ptr) - vp9_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, hev_thr, v_ptr + 4); -} -#endif diff --git a/vp9/common/arm/vp9_loopfilter_arm.h b/vp9/common/arm/vp9_loopfilter_arm.h deleted file mode 100644 index 4f12ff31e..000000000 --- a/vp9/common/arm/vp9_loopfilter_arm.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef VP9_COMMON_ARM_VP9_LOOPFILTER_ARM_H_ -#define VP9_COMMON_ARM_VP9_LOOPFILTER_ARM_H_ - -#include "vpx_config.h" - -#if HAVE_ARMV6 -extern prototype_loopfilter_block(vp9_loop_filter_mbv_armv6); -extern prototype_loopfilter_block(vp9_loop_filter_bv_armv6); -extern prototype_loopfilter_block(vp9_loop_filter_mbh_armv6); -extern prototype_loopfilter_block(vp9_loop_filter_bh_armv6); -extern prototype_simple_loopfilter(vp9_loop_filter_bvs_armv6); -extern prototype_simple_loopfilter(vp9_loop_filter_bhs_armv6); -extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_armv6); -extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_armv6); - -#endif /* HAVE_ARMV6 */ - -#if HAVE_ARMV7 -extern prototype_loopfilter_block(vp9_loop_filter_mbv_neon); -extern prototype_loopfilter_block(vp9_loop_filter_bv_neon); -extern prototype_loopfilter_block(vp9_loop_filter_mbh_neon); -extern prototype_loopfilter_block(vp9_loop_filter_bh_neon); -extern prototype_simple_loopfilter(vp9_loop_filter_mbvs_neon); -extern prototype_simple_loopfilter(vp9_loop_filter_bvs_neon); -extern prototype_simple_loopfilter(vp9_loop_filter_mbhs_neon); -extern prototype_simple_loopfilter(vp9_loop_filter_bhs_neon); - -#endif /* HAVE_ARMV7 */ - -#endif /* LOOPFILTER_ARM_H */ diff --git a/vp9/common/arm/vp9_recon_arm.h b/vp9/common/arm/vp9_recon_arm.h deleted file mode 100644 index 788385272..000000000 --- a/vp9/common/arm/vp9_recon_arm.h +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef VP9_COMMON_ARM_VP9_RECON_ARM_H_ -#define VP9_COMMON_ARM_VP9_RECON_ARM_H_ - -#if HAVE_ARMV6 -extern prototype_recon_block(vp9_recon_b_armv6); -extern prototype_recon_block(vp9_recon2b_armv6); -extern prototype_recon_block(vp9_recon4b_armv6); - -extern prototype_copy_block(vp9_copy_mem8x8_v6); -extern prototype_copy_block(vp9_copy_mem8x4_v6); -extern prototype_copy_block(vp9_copy_mem16x16_v6); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp8_recon_recon -#define vp8_recon_recon vp9_recon_b_armv6 - -#undef vp8_recon_recon2 -#define vp8_recon_recon2 vp9_recon2b_armv6 - -#undef vp8_recon_recon4 -#define vp8_recon_recon4 vp9_recon4b_armv6 - -#undef vp8_recon_copy8x8 -#define vp8_recon_copy8x8 vp9_copy_mem8x8_v6 - -#undef vp8_recon_copy8x4 -#define vp8_recon_copy8x4 vp9_copy_mem8x4_v6 - -#undef vp8_recon_copy16x16 -#define vp8_recon_copy16x16 vp9_copy_mem16x16_v6 -#endif -#endif - -#if HAVE_ARMV7 -extern prototype_recon_block(vp9_recon_b_neon); -extern prototype_recon_block(vp9_recon2b_neon); -extern prototype_recon_block(vp9_recon4b_neon); - -extern prototype_copy_block(vp9_copy_mem8x8_neon); -extern prototype_copy_block(vp9_copy_mem8x4_neon); -extern prototype_copy_block(vp9_copy_mem16x16_neon); - -extern prototype_recon_macroblock(vp9_recon_mb_neon); - -extern prototype_build_intra_predictors(vp9_build_intra_predictors_mby_neon); -extern prototype_build_intra_predictors(vp9_build_intra_predictors_mby_s_neon); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp8_recon_recon -#define vp8_recon_recon vp9_recon_b_neon - -#undef vp8_recon_recon2 -#define vp8_recon_recon2 vp9_recon2b_neon - -#undef vp8_recon_recon4 -#define vp8_recon_recon4 vp9_recon4b_neon - -#undef vp8_recon_copy8x8 -#define vp8_recon_copy8x8 vp9_copy_mem8x8_neon - -#undef vp8_recon_copy8x4 -#define vp8_recon_copy8x4 vp9_copy_mem8x4_neon - -#undef vp8_recon_copy16x16 -#define vp8_recon_copy16x16 vp9_copy_mem16x16_neon - -#undef vp8_recon_recon_mb -#define vp8_recon_recon_mb vp9_recon_mb_neon - -#undef vp9_recon_build_intra_predictors_mby -#define vp9_recon_build_intra_predictors_mby vp9_build_intra_predictors_mby_neon - -#undef vp9_recon_build_intra_predictors_mby_s -#define vp9_recon_build_intra_predictors_mby_s vp9_build_intra_predictors_mby_s_neon - -#endif -#endif - -#endif diff --git a/vp9/common/arm/vp9_reconintra_arm.c b/vp9/common/arm/vp9_reconintra_arm.c deleted file mode 100644 index 5720828c7..000000000 --- a/vp9/common/arm/vp9_reconintra_arm.c +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vpx_ports/config.h" -#include "vp9/common/vp9_blockd.h" -#include "vp9/common/vp9_reconintra.h" -#include "vpx_mem/vpx_mem.h" -#include "vp9/common/recon.h" - -#if HAVE_ARMV7 -extern void vp9_build_intra_predictors_mby_neon_func( - unsigned char *y_buffer, - unsigned char *ypred_ptr, - int y_stride, - int mode, - int Up, - int Left); - -void vp9_build_intra_predictors_mby_neon(MACROBLOCKD *xd) { - unsigned char *y_buffer = xd->dst.y_buffer; - unsigned char *ypred_ptr = xd->predictor; - int y_stride = xd->dst.y_stride; - int mode = xd->mode_info_context->mbmi.mode; - int Up = xd->up_available; - int Left = xd->left_available; - - vp9_build_intra_predictors_mby_neon_func(y_buffer, ypred_ptr, - y_stride, mode, Up, Left); -} -#endif - - -#if HAVE_ARMV7 -extern void vp9_build_intra_predictors_mby_s_neon_func( - unsigned char *y_buffer, - unsigned char *ypred_ptr, - int y_stride, - int mode, - int Up, - int Left); - -void vp9_build_intra_predictors_mby_s_neon(MACROBLOCKD *xd) { - unsigned char *y_buffer = xd->dst.y_buffer; - unsigned char *ypred_ptr = xd->predictor; - int y_stride = xd->dst.y_stride; - int mode = xd->mode_info_context->mbmi.mode; - int Up = xd->up_available; - int Left = xd->left_available; - - vp9_build_intra_predictors_mby_s_neon_func(y_buffer, ypred_ptr, - y_stride, mode, Up, Left); -} - -#endif diff --git a/vp9/common/arm/vp9_subpixel_arm.h b/vp9/common/arm/vp9_subpixel_arm.h deleted file mode 100644 index efc7c1a5d..000000000 --- a/vp9/common/arm/vp9_subpixel_arm.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef VP9_COMMON_ARM_VP9_SUBPIXEL_ARM_H_ -#define VP9_COMMON_ARM_VP9_SUBPIXEL_ARM_H_ - -#if HAVE_ARMV6 -extern prototype_subpixel_predict(vp9_sixtap_predict16x16_armv6); -extern prototype_subpixel_predict(vp9_sixtap_predict8x8_armv6); -extern prototype_subpixel_predict(vp9_sixtap_predict8x4_armv6); -extern prototype_subpixel_predict(vp9_sixtap_predict_armv6); -extern prototype_subpixel_predict(vp9_bilinear_predict16x16_armv6); -extern prototype_subpixel_predict(vp9_bilinear_predict8x8_armv6); -extern prototype_subpixel_predict(vp9_bilinear_predict8x4_armv6); -extern prototype_subpixel_predict(vp9_bilinear_predict4x4_armv6); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp9_subpix_sixtap16x16 -#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_armv6 - -#undef vp9_subpix_sixtap8x8 -#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_armv6 - -#undef vp9_subpix_sixtap8x4 -#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_armv6 - -#undef vp9_subpix_sixtap4x4 -#define vp9_subpix_sixtap4x4 vp9_sixtap_predict_armv6 - -#undef vp9_subpix_bilinear16x16 -#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_armv6 - -#undef vp9_subpix_bilinear8x8 -#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_armv6 - -#undef vp9_subpix_bilinear8x4 -#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_armv6 - -#undef vp9_subpix_bilinear4x4 -#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_armv6 -#endif -#endif - -#if HAVE_ARMV7 -extern prototype_subpixel_predict(vp9_sixtap_predict16x16_neon); -extern prototype_subpixel_predict(vp9_sixtap_predict8x8_neon); -extern prototype_subpixel_predict(vp9_sixtap_predict8x4_neon); -extern prototype_subpixel_predict(vp9_sixtap_predict_neon); -extern prototype_subpixel_predict(vp9_bilinear_predict16x16_neon); -extern prototype_subpixel_predict(vp9_bilinear_predict8x8_neon); -extern prototype_subpixel_predict(vp9_bilinear_predict8x4_neon); -extern prototype_subpixel_predict(vp9_bilinear_predict4x4_neon); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp9_subpix_sixtap16x16 -#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_neon - -#undef vp9_subpix_sixtap8x8 -#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_neon - -#undef vp9_subpix_sixtap8x4 -#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_neon - -#undef vp9_subpix_sixtap4x4 -#define vp9_subpix_sixtap4x4 vp9_sixtap_predict_neon - -#undef vp9_subpix_bilinear16x16 -#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_neon - -#undef vp9_subpix_bilinear8x8 -#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_neon - -#undef vp9_subpix_bilinear8x4 -#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_neon - -#undef vp9_subpix_bilinear4x4 -#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_neon -#endif -#endif - -#endif diff --git a/vp9/common/vp9_asm_com_offsets.c b/vp9/common/vp9_asm_com_offsets.c index 07d3e333a..94ccb6ebd 100644 --- a/vp9/common/vp9_asm_com_offsets.c +++ b/vp9/common/vp9_asm_com_offsets.c @@ -12,29 +12,10 @@ #include "vpx_config.h" #include "vpx/vpx_codec.h" #include "vpx_ports/asm_offsets.h" -#include "vpx_scale/yv12config.h" BEGIN -/* vpx_scale */ -DEFINE(yv12_buffer_config_y_width, offsetof(YV12_BUFFER_CONFIG, y_width)); -DEFINE(yv12_buffer_config_y_height, offsetof(YV12_BUFFER_CONFIG, y_height)); -DEFINE(yv12_buffer_config_y_stride, offsetof(YV12_BUFFER_CONFIG, y_stride)); -DEFINE(yv12_buffer_config_uv_width, offsetof(YV12_BUFFER_CONFIG, uv_width)); -DEFINE(yv12_buffer_config_uv_height, offsetof(YV12_BUFFER_CONFIG, uv_height)); -DEFINE(yv12_buffer_config_uv_stride, offsetof(YV12_BUFFER_CONFIG, uv_stride)); -DEFINE(yv12_buffer_config_y_buffer, offsetof(YV12_BUFFER_CONFIG, y_buffer)); -DEFINE(yv12_buffer_config_u_buffer, offsetof(YV12_BUFFER_CONFIG, u_buffer)); -DEFINE(yv12_buffer_config_v_buffer, offsetof(YV12_BUFFER_CONFIG, v_buffer)); -DEFINE(yv12_buffer_config_border, offsetof(YV12_BUFFER_CONFIG, border)); -DEFINE(VP9BORDERINPIXELS_VAL, VP9BORDERINPIXELS); - END /* add asserts for any offset that is not supported by assembly code */ /* add asserts for any size that is not supported by assembly code */ - -#if HAVE_ARMV7 -/* vp8_yv12_extend_frame_borders_neon makes several assumptions based on this */ -ct_assert(VP9BORDERINPIXELS_VAL, VP9BORDERINPIXELS == 32) -#endif diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h index 4e8fa78e2..23df2d86d 100644 --- a/vp9/common/vp9_loopfilter.h +++ b/vp9/common/vp9_loopfilter.h @@ -22,11 +22,7 @@ typedef enum { SIMPLE_LOOPFILTER = 1 } LOOPFILTERTYPE; -#if ARCH_ARM -#define SIMD_WIDTH 1 -#else #define SIMD_WIDTH 16 -#endif /* Need to align this structure so when it is declared and * passed it can be loaded into vector registers. @@ -67,10 +63,6 @@ struct loop_filter_info { #include "x86/vp9_loopfilter_x86.h" #endif -#if ARCH_ARM -#include "arm/vp9_loopfilter_arm.h" -#endif - typedef void loop_filter_uvfunction(unsigned char *u, /* source pointer */ int p, /* pitch */ const unsigned char *blimit, diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 1c0ce16e1..ddc64886d 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -79,13 +79,11 @@ specialize vp9_dequant_idct_add_uv_block mmx # RECON # prototype void vp9_copy_mem16x16 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch" -specialize vp9_copy_mem16x16 mmx sse2 media neon dspr2 -vp9_copy_mem16x16_media=vp9_copy_mem16x16_v6 +specialize vp9_copy_mem16x16 mmx sse2 dspr2 vp9_copy_mem16x16_dspr2=vp9_copy_mem16x16_dspr2 prototype void vp9_copy_mem8x8 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch" -specialize vp9_copy_mem8x8 mmx media neon dspr2 -vp9_copy_mem8x8_media=vp9_copy_mem8x8_v6 +specialize vp9_copy_mem8x8 mmx dspr2 vp9_copy_mem8x8_dspr2=vp9_copy_mem8x8_dspr2 prototype void vp9_copy_mem8x4 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch" @@ -98,8 +96,7 @@ prototype void vp9_avg_mem8x8 "unsigned char *src, int src_pitch, unsigned char specialize vp9_avg_mem8x8 prototype void vp9_copy_mem8x4 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch" -specialize vp9_copy_mem8x4 mmx media neon dspr2 -vp9_copy_mem8x4_media=vp9_copy_mem8x4_v6 +specialize vp9_copy_mem8x4 mmx dspr2 vp9_copy_mem8x4_dspr2=vp9_copy_mem8x4_dspr2 prototype void vp9_recon_b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride" @@ -193,36 +190,28 @@ prototype void vp9_loop_filter_bh8x8 "unsigned char *y, unsigned char *u, unsign specialize vp9_loop_filter_bh8x8 sse2 prototype void vp9_loop_filter_simple_mbv "unsigned char *y, int ystride, const unsigned char *blimit" -specialize vp9_loop_filter_simple_mbv mmx sse2 media neon +specialize vp9_loop_filter_simple_mbv mmx sse2 vp9_loop_filter_simple_mbv_c=vp9_loop_filter_simple_vertical_edge_c vp9_loop_filter_simple_mbv_mmx=vp9_loop_filter_simple_vertical_edge_mmx vp9_loop_filter_simple_mbv_sse2=vp9_loop_filter_simple_vertical_edge_sse2 -vp9_loop_filter_simple_mbv_media=vp9_loop_filter_simple_vertical_edge_armv6 -vp9_loop_filter_simple_mbv_neon=vp9_loop_filter_mbvs_neon prototype void vp9_loop_filter_simple_mbh "unsigned char *y, int ystride, const unsigned char *blimit" -specialize vp9_loop_filter_simple_mbh mmx sse2 media neon +specialize vp9_loop_filter_simple_mbh mmx sse2 vp9_loop_filter_simple_mbh_c=vp9_loop_filter_simple_horizontal_edge_c vp9_loop_filter_simple_mbh_mmx=vp9_loop_filter_simple_horizontal_edge_mmx vp9_loop_filter_simple_mbh_sse2=vp9_loop_filter_simple_horizontal_edge_sse2 -vp9_loop_filter_simple_mbh_media=vp9_loop_filter_simple_horizontal_edge_armv6 -vp9_loop_filter_simple_mbh_neon=vp9_loop_filter_mbhs_neon prototype void vp9_loop_filter_simple_bv "unsigned char *y, int ystride, const unsigned char *blimit" -specialize vp9_loop_filter_simple_bv mmx sse2 media neon +specialize vp9_loop_filter_simple_bv mmx sse2 vp9_loop_filter_simple_bv_c=vp9_loop_filter_bvs_c vp9_loop_filter_simple_bv_mmx=vp9_loop_filter_bvs_mmx vp9_loop_filter_simple_bv_sse2=vp9_loop_filter_bvs_sse2 -vp9_loop_filter_simple_bv_media=vp9_loop_filter_bvs_armv6 -vp9_loop_filter_simple_bv_neon=vp9_loop_filter_bvs_neon prototype void vp9_loop_filter_simple_bh "unsigned char *y, int ystride, const unsigned char *blimit" -specialize vp9_loop_filter_simple_bh mmx sse2 media neon +specialize vp9_loop_filter_simple_bh mmx sse2 vp9_loop_filter_simple_bh_c=vp9_loop_filter_bhs_c vp9_loop_filter_simple_bh_mmx=vp9_loop_filter_bhs_mmx vp9_loop_filter_simple_bh_sse2=vp9_loop_filter_bhs_sse2 -vp9_loop_filter_simple_bh_media=vp9_loop_filter_bhs_armv6 -vp9_loop_filter_simple_bh_neon=vp9_loop_filter_bhs_neon # # post proc @@ -683,7 +672,7 @@ prototype void vp9_temporal_filter_apply "unsigned char *frame1, unsigned int st specialize vp9_temporal_filter_apply sse2 prototype void vp9_yv12_copy_partial_frame "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int fraction" -specialize vp9_yv12_copy_partial_frame neon +specialize vp9_yv12_copy_partial_frame fi @@ -716,11 +705,11 @@ if [ "$CONFIG_SPATIAL_RESAMPLING" = "yes" ]; then fi prototype void vp8_yv12_extend_frame_borders "struct yv12_buffer_config *ybf" -specialize vp8_yv12_extend_frame_borders neon +specialize vp8_yv12_extend_frame_borders prototype void vp8_yv12_copy_frame "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc" -specialize vp8_yv12_copy_frame neon +specialize vp8_yv12_copy_frame prototype void vp8_yv12_copy_y "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc" -specialize vp8_yv12_copy_y neon +specialize vp8_yv12_copy_y diff --git a/vp9/decoder/arm/armv6/vp9_dequant_dc_idct_v6.asm b/vp9/decoder/arm/armv6/vp9_dequant_dc_idct_v6.asm deleted file mode 100644 index 6bebda24f..000000000 --- a/vp9/decoder/arm/armv6/vp9_dequant_dc_idct_v6.asm +++ /dev/null @@ -1,218 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. -; - - - EXPORT |vp8_dequant_dc_idct_add_v6| - - AREA |.text|, CODE, READONLY - -;void vp8_dequant_dc_idct_v6(short *input, short *dq, unsigned char *pred, -; unsigned char *dest, int pitch, int stride, int Dc) -; r0 = input -; r1 = dq -; r2 = pred -; r3 = dest -; sp + 36 = pitch ; +4 = 40 -; sp + 40 = stride ; +4 = 44 -; sp + 44 = Dc ; +4 = 48 - - -|vp8_dequant_dc_idct_add_v6| PROC - stmdb sp!, {r4-r11, lr} - - ldr r6, [sp, #44] - - ldr r4, [r0] ;input - ldr r5, [r1], #4 ;dq - - sub sp, sp, #4 - str r3, [sp] - - smultt r7, r4, r5 - - ldr r4, [r0, #4] ;input - ldr r5, [r1], #4 ;dq - - strh r6, [r0], #2 - strh r7, [r0], #2 - - smulbb r6, r4, r5 - smultt r7, r4, r5 - - ldr r4, [r0, #4] ;input - ldr r5, [r1], #4 ;dq - - strh r6, [r0], #2 - strh r7, [r0], #2 - - mov r12, #3 - -vp8_dequant_dc_add_loop - smulbb r6, r4, r5 - smultt r7, r4, r5 - - ldr r4, [r0, #4] ;input - ldr r5, [r1], #4 ;dq - - strh r6, [r0], #2 - strh r7, [r0], #2 - - smulbb r6, r4, r5 - smultt r7, r4, r5 - - subs r12, r12, #1 - - ldrne r4, [r0, #4] - ldrne r5, [r1], #4 - - strh r6, [r0], #2 - strh r7, [r0], #2 - - bne vp8_dequant_dc_add_loop - - sub r0, r0, #32 - mov r1, r0 - -; short_idct4x4llm_v6_dual - ldr r3, cospi8sqrt2minus1 - ldr r4, sinpi8sqrt2 - ldr r6, [r0, #8] - mov r5, #2 -vp8_dequant_dc_idct_loop1_v6 - ldr r12, [r0, #24] - ldr r14, [r0, #16] - smulwt r9, r3, r6 - smulwb r7, r3, r6 - smulwt r10, r4, r6 - smulwb r8, r4, r6 - pkhbt r7, r7, r9, lsl #16 - smulwt r11, r3, r12 - pkhbt r8, r8, r10, lsl #16 - uadd16 r6, r6, r7 - smulwt r7, r4, r12 - smulwb r9, r3, r12 - smulwb r10, r4, r12 - subs r5, r5, #1 - pkhbt r9, r9, r11, lsl #16 - ldr r11, [r0], #4 - pkhbt r10, r10, r7, lsl #16 - uadd16 r7, r12, r9 - usub16 r7, r8, r7 - uadd16 r6, r6, r10 - uadd16 r10, r11, r14 - usub16 r8, r11, r14 - uadd16 r9, r10, r6 - usub16 r10, r10, r6 - uadd16 r6, r8, r7 - usub16 r7, r8, r7 - str r6, [r1, #8] - ldrne r6, [r0, #8] - str r7, [r1, #16] - str r10, [r1, #24] - str r9, [r1], #4 - bne vp8_dequant_dc_idct_loop1_v6 - - mov r5, #2 - sub r0, r1, #8 -vp8_dequant_dc_idct_loop2_v6 - ldr r6, [r0], #4 - ldr r7, [r0], #4 - ldr r8, [r0], #4 - ldr r9, [r0], #4 - smulwt r1, r3, r6 - smulwt r12, r4, r6 - smulwt lr, r3, r8 - smulwt r10, r4, r8 - pkhbt r11, r8, r6, lsl #16 - pkhbt r1, lr, r1, lsl #16 - pkhbt r12, r10, r12, lsl #16 - pkhtb r6, r6, r8, asr #16 - uadd16 r6, r1, r6 - pkhbt lr, r9, r7, lsl #16 - uadd16 r10, r11, lr - usub16 lr, r11, lr - pkhtb r8, r7, r9, asr #16 - subs r5, r5, #1 - smulwt r1, r3, r8 - smulwb r7, r3, r8 - smulwt r11, r4, r8 - smulwb r9, r4, r8 - pkhbt r1, r7, r1, lsl #16 - uadd16 r8, r1, r8 - pkhbt r11, r9, r11, lsl #16 - usub16 r1, r12, r8 - uadd16 r8, r11, r6 - ldr r9, c0x00040004 - ldr r12, [sp, #40] - uadd16 r6, r10, r8 - usub16 r7, r10, r8 - uadd16 r7, r7, r9 - uadd16 r6, r6, r9 - uadd16 r10, r14, r1 - usub16 r1, r14, r1 - uadd16 r10, r10, r9 - uadd16 r1, r1, r9 - ldr r11, [r2], r12 - mov r8, r7, asr #3 - pkhtb r9, r8, r10, asr #19 - mov r8, r1, asr #3 - pkhtb r8, r8, r6, asr #19 - uxtb16 lr, r11, ror #8 - qadd16 r9, r9, lr - uxtb16 lr, r11 - qadd16 r8, r8, lr - usat16 r9, #8, r9 - usat16 r8, #8, r8 - orr r9, r8, r9, lsl #8 - ldr r11, [r2], r12 - ldr lr, [sp] - ldr r12, [sp, #44] - mov r7, r7, lsl #16 - mov r1, r1, lsl #16 - mov r10, r10, lsl #16 - mov r6, r6, lsl #16 - mov r7, r7, asr #3 - pkhtb r7, r7, r10, asr #19 - mov r1, r1, asr #3 - pkhtb r1, r1, r6, asr #19 - uxtb16 r8, r11, ror #8 - qadd16 r7, r7, r8 - uxtb16 r8, r11 - qadd16 r1, r1, r8 - usat16 r7, #8, r7 - usat16 r1, #8, r1 - orr r1, r1, r7, lsl #8 - str r9, [lr], r12 - str r1, [lr], r12 - str lr, [sp] - bne vp8_dequant_dc_idct_loop2_v6 - -; vpx_memset - sub r0, r0, #32 - add sp, sp, #4 - - mov r12, #0 - str r12, [r0] - str r12, [r0, #4] - str r12, [r0, #8] - str r12, [r0, #12] - str r12, [r0, #16] - str r12, [r0, #20] - str r12, [r0, #24] - str r12, [r0, #28] - - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp8_dequant_dc_idct_add_v6| - -; Constant Pool -cospi8sqrt2minus1 DCD 0x00004E7B -sinpi8sqrt2 DCD 0x00008A8C -c0x00040004 DCD 0x00040004 - - END diff --git a/vp9/decoder/arm/armv6/vp9_dequant_idct_v6.asm b/vp9/decoder/arm/armv6/vp9_dequant_idct_v6.asm deleted file mode 100644 index 47b671ca6..000000000 --- a/vp9/decoder/arm/armv6/vp9_dequant_idct_v6.asm +++ /dev/null @@ -1,196 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. -; - - EXPORT |vp8_dequant_idct_add_v6| - - AREA |.text|, CODE, READONLY -;void vp8_dequant_idct_v6(short *input, short *dq, unsigned char *pred, -; unsigned char *dest, int pitch, int stride) -; r0 = input -; r1 = dq -; r2 = pred -; r3 = dest -; sp + 36 = pitch ; +4 = 40 -; sp + 40 = stride ; +4 = 44 - - -|vp8_dequant_idct_add_v6| PROC - stmdb sp!, {r4-r11, lr} - - ldr r4, [r0] ;input - ldr r5, [r1], #4 ;dq - - sub sp, sp, #4 - str r3, [sp] - - mov r12, #4 - -vp8_dequant_add_loop - smulbb r6, r4, r5 - smultt r7, r4, r5 - - ldr r4, [r0, #4] ;input - ldr r5, [r1], #4 ;dq - - strh r6, [r0], #2 - strh r7, [r0], #2 - - smulbb r6, r4, r5 - smultt r7, r4, r5 - - subs r12, r12, #1 - - ldrne r4, [r0, #4] - ldrne r5, [r1], #4 - - strh r6, [r0], #2 - strh r7, [r0], #2 - - bne vp8_dequant_add_loop - - sub r0, r0, #32 - mov r1, r0 - -; short_idct4x4llm_v6_dual - ldr r3, cospi8sqrt2minus1 - ldr r4, sinpi8sqrt2 - ldr r6, [r0, #8] - mov r5, #2 -vp8_dequant_idct_loop1_v6 - ldr r12, [r0, #24] - ldr r14, [r0, #16] - smulwt r9, r3, r6 - smulwb r7, r3, r6 - smulwt r10, r4, r6 - smulwb r8, r4, r6 - pkhbt r7, r7, r9, lsl #16 - smulwt r11, r3, r12 - pkhbt r8, r8, r10, lsl #16 - uadd16 r6, r6, r7 - smulwt r7, r4, r12 - smulwb r9, r3, r12 - smulwb r10, r4, r12 - subs r5, r5, #1 - pkhbt r9, r9, r11, lsl #16 - ldr r11, [r0], #4 - pkhbt r10, r10, r7, lsl #16 - uadd16 r7, r12, r9 - usub16 r7, r8, r7 - uadd16 r6, r6, r10 - uadd16 r10, r11, r14 - usub16 r8, r11, r14 - uadd16 r9, r10, r6 - usub16 r10, r10, r6 - uadd16 r6, r8, r7 - usub16 r7, r8, r7 - str r6, [r1, #8] - ldrne r6, [r0, #8] - str r7, [r1, #16] - str r10, [r1, #24] - str r9, [r1], #4 - bne vp8_dequant_idct_loop1_v6 - - mov r5, #2 - sub r0, r1, #8 -vp8_dequant_idct_loop2_v6 - ldr r6, [r0], #4 - ldr r7, [r0], #4 - ldr r8, [r0], #4 - ldr r9, [r0], #4 - smulwt r1, r3, r6 - smulwt r12, r4, r6 - smulwt lr, r3, r8 - smulwt r10, r4, r8 - pkhbt r11, r8, r6, lsl #16 - pkhbt r1, lr, r1, lsl #16 - pkhbt r12, r10, r12, lsl #16 - pkhtb r6, r6, r8, asr #16 - uadd16 r6, r1, r6 - pkhbt lr, r9, r7, lsl #16 - uadd16 r10, r11, lr - usub16 lr, r11, lr - pkhtb r8, r7, r9, asr #16 - subs r5, r5, #1 - smulwt r1, r3, r8 - smulwb r7, r3, r8 - smulwt r11, r4, r8 - smulwb r9, r4, r8 - pkhbt r1, r7, r1, lsl #16 - uadd16 r8, r1, r8 - pkhbt r11, r9, r11, lsl #16 - usub16 r1, r12, r8 - uadd16 r8, r11, r6 - ldr r9, c0x00040004 - ldr r12, [sp, #40] - uadd16 r6, r10, r8 - usub16 r7, r10, r8 - uadd16 r7, r7, r9 - uadd16 r6, r6, r9 - uadd16 r10, r14, r1 - usub16 r1, r14, r1 - uadd16 r10, r10, r9 - uadd16 r1, r1, r9 - ldr r11, [r2], r12 - mov r8, r7, asr #3 - pkhtb r9, r8, r10, asr #19 - mov r8, r1, asr #3 - pkhtb r8, r8, r6, asr #19 - uxtb16 lr, r11, ror #8 - qadd16 r9, r9, lr - uxtb16 lr, r11 - qadd16 r8, r8, lr - usat16 r9, #8, r9 - usat16 r8, #8, r8 - orr r9, r8, r9, lsl #8 - ldr r11, [r2], r12 - ldr lr, [sp] - ldr r12, [sp, #44] - mov r7, r7, lsl #16 - mov r1, r1, lsl #16 - mov r10, r10, lsl #16 - mov r6, r6, lsl #16 - mov r7, r7, asr #3 - pkhtb r7, r7, r10, asr #19 - mov r1, r1, asr #3 - pkhtb r1, r1, r6, asr #19 - uxtb16 r8, r11, ror #8 - qadd16 r7, r7, r8 - uxtb16 r8, r11 - qadd16 r1, r1, r8 - usat16 r7, #8, r7 - usat16 r1, #8, r1 - orr r1, r1, r7, lsl #8 - str r9, [lr], r12 - str r1, [lr], r12 - str lr, [sp] - bne vp8_dequant_idct_loop2_v6 - -; vpx_memset - sub r0, r0, #32 - add sp, sp, #4 - - mov r12, #0 - str r12, [r0] - str r12, [r0, #4] - str r12, [r0, #8] - str r12, [r0, #12] - str r12, [r0, #16] - str r12, [r0, #20] - str r12, [r0, #24] - str r12, [r0, #28] - - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp8_dequant_idct_add_v6| - -; Constant Pool -cospi8sqrt2minus1 DCD 0x00004E7B -sinpi8sqrt2 DCD 0x00008A8C -c0x00040004 DCD 0x00040004 - - END diff --git a/vp9/decoder/arm/armv6/vp9_dequantize_v6.asm b/vp9/decoder/arm/armv6/vp9_dequantize_v6.asm deleted file mode 100644 index 72f7e0ee5..000000000 --- a/vp9/decoder/arm/armv6/vp9_dequantize_v6.asm +++ /dev/null @@ -1,69 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_dequantize_b_loop_v6| - - AREA |.text|, CODE, READONLY ; name this block of code -;------------------------------- -;void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ); -; r0 short *Q, -; r1 short *DQC -; r2 short *DQ -|vp8_dequantize_b_loop_v6| PROC - stmdb sp!, {r4-r9, lr} - - ldr r3, [r0] ;load Q - ldr r4, [r1] ;load DQC - ldr r5, [r0, #4] - ldr r6, [r1, #4] - - mov r12, #2 ;loop counter - -dequant_loop - smulbb r7, r3, r4 ;multiply - smultt r8, r3, r4 - smulbb r9, r5, r6 - smultt lr, r5, r6 - - ldr r3, [r0, #8] - ldr r4, [r1, #8] - ldr r5, [r0, #12] - ldr r6, [r1, #12] - - strh r7, [r2], #2 ;store result - smulbb r7, r3, r4 ;multiply - strh r8, [r2], #2 - smultt r8, r3, r4 - strh r9, [r2], #2 - smulbb r9, r5, r6 - strh lr, [r2], #2 - smultt lr, r5, r6 - - subs r12, r12, #1 - - add r0, r0, #16 - add r1, r1, #16 - - ldrne r3, [r0] - strh r7, [r2], #2 ;store result - ldrne r4, [r1] - strh r8, [r2], #2 - ldrne r5, [r0, #4] - strh r9, [r2], #2 - ldrne r6, [r1, #4] - strh lr, [r2], #2 - - bne dequant_loop - - ldmia sp!, {r4-r9, pc} - ENDP ;|vp8_dequantize_b_loop_v6| - - END diff --git a/vp9/decoder/arm/armv6/vp9_idct_blk_v6.c b/vp9/decoder/arm/armv6/vp9_idct_blk_v6.c deleted file mode 100644 index d4fa4b52f..000000000 --- a/vp9/decoder/arm/armv6/vp9_idct_blk_v6.c +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vpx_ports/config.h" -#include "vp9/common/vp9_blockd.h" -#include "vp9/decoder/vp9_dequantize.h" - -void vp8_dequant_dc_idct_add_y_block_v6(short *q, short *dq, - unsigned char *pre, - unsigned char *dst, int stride, - unsigned short *eobs, short *dc) { - int i; - - for (i = 0; i < 4; i++) { - if (eobs[0] > 1) - vp8_dequant_dc_idct_add_v6(q, dq, pre, dst, 16, stride, dc[0]); - else - vp8_dc_only_idct_add_v6(dc[0], pre, dst, 16, stride); - - if (eobs[1] > 1) - vp8_dequant_dc_idct_add_v6(q + 16, dq, pre + 4, dst + 4, 16, stride, dc[1]); - else - vp8_dc_only_idct_add_v6(dc[1], pre + 4, dst + 4, 16, stride); - - if (eobs[2] > 1) - vp8_dequant_dc_idct_add_v6(q + 32, dq, pre + 8, dst + 8, 16, stride, dc[2]); - else - vp8_dc_only_idct_add_v6(dc[2], pre + 8, dst + 8, 16, stride); - - if (eobs[3] > 1) - vp8_dequant_dc_idct_add_v6(q + 48, dq, pre + 12, dst + 12, 16, stride, dc[3]); - else - vp8_dc_only_idct_add_v6(dc[3], pre + 12, dst + 12, 16, stride); - - q += 64; - dc += 4; - pre += 64; - dst += 4 * stride; - eobs += 4; - } -} - -void vp8_dequant_idct_add_y_block_v6(short *q, short *dq, unsigned char *pre, - unsigned char *dst, int stride, - unsigned short *eobs) { - int i; - - for (i = 0; i < 4; i++) { - if (eobs[0] > 1) - vp8_dequant_idct_add_v6(q, dq, pre, dst, 16, stride); - else { - vp8_dc_only_idct_add_v6(q[0]*dq[0], pre, dst, 16, stride); - ((int *)q)[0] = 0; - } - - if (eobs[1] > 1) - vp8_dequant_idct_add_v6(q + 16, dq, pre + 4, dst + 4, 16, stride); - else { - vp8_dc_only_idct_add_v6(q[16]*dq[0], pre + 4, dst + 4, 16, stride); - ((int *)(q + 16))[0] = 0; - } - - if (eobs[2] > 1) - vp8_dequant_idct_add_v6(q + 32, dq, pre + 8, dst + 8, 16, stride); - else { - vp8_dc_only_idct_add_v6(q[32]*dq[0], pre + 8, dst + 8, 16, stride); - ((int *)(q + 32))[0] = 0; - } - - if (eobs[3] > 1) - vp8_dequant_idct_add_v6(q + 48, dq, pre + 12, dst + 12, 16, stride); - else { - vp8_dc_only_idct_add_v6(q[48]*dq[0], pre + 12, dst + 12, 16, stride); - ((int *)(q + 48))[0] = 0; - } - - q += 64; - pre += 64; - dst += 4 * stride; - eobs += 4; - } -} - -void vp8_dequant_idct_add_uv_block_v6(short *q, short *dq, unsigned char *pre, - unsigned char *dstu, unsigned char *dstv, - int stride, unsigned short *eobs) { - int i; - - for (i = 0; i < 2; i++) { - if (eobs[0] > 1) - vp8_dequant_idct_add_v6(q, dq, pre, dstu, 8, stride); - else { - vp8_dc_only_idct_add_v6(q[0]*dq[0], pre, dstu, 8, stride); - ((int *)q)[0] = 0; - } - - if (eobs[1] > 1) - vp8_dequant_idct_add_v6(q + 16, dq, pre + 4, dstu + 4, 8, stride); - else { - vp8_dc_only_idct_add_v6(q[16]*dq[0], pre + 4, dstu + 4, 8, stride); - ((int *)(q + 16))[0] = 0; - } - - q += 32; - pre += 32; - dstu += 4 * stride; - eobs += 2; - } - - for (i = 0; i < 2; i++) { - if (eobs[0] > 1) - vp8_dequant_idct_add_v6(q, dq, pre, dstv, 8, stride); - else { - vp8_dc_only_idct_add_v6(q[0]*dq[0], pre, dstv, 8, stride); - ((int *)q)[0] = 0; - } - - if (eobs[1] > 1) - vp8_dequant_idct_add_v6(q + 16, dq, pre + 4, dstv + 4, 8, stride); - else { - vp8_dc_only_idct_add_v6(q[16]*dq[0], pre + 4, dstv + 4, 8, stride); - ((int *)(q + 16))[0] = 0; - } - - q += 32; - pre += 32; - dstv += 4 * stride; - eobs += 2; - } -} diff --git a/vp9/decoder/arm/neon/vp9_dequant_idct_neon.asm b/vp9/decoder/arm/neon/vp9_dequant_idct_neon.asm deleted file mode 100644 index 4bf661857..000000000 --- a/vp9/decoder/arm/neon/vp9_dequant_idct_neon.asm +++ /dev/null @@ -1,129 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_dequant_idct_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vp8_dequant_idct_neon(short *input, short *dq, unsigned char *pred, -; unsigned char *dest, int pitch, int stride) -; r0 short *input, -; r1 short *dq, -; r2 unsigned char *pred -; r3 unsigned char *dest -; sp int pitch -; sp+4 int stride - -|vp8_dequant_idct_add_neon| PROC - vld1.16 {q3, q4}, [r0] - vld1.16 {q5, q6}, [r1] - ldr r1, [sp] ; pitch - vld1.32 {d14[0]}, [r2], r1 - vld1.32 {d14[1]}, [r2], r1 - vld1.32 {d15[0]}, [r2], r1 - vld1.32 {d15[1]}, [r2] - - ldr r1, [sp, #4] ; stride - - adr r12, cospi8sqrt2minus1 ; pointer to the first constant - - vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon - vmul.i16 q2, q4, q6 - -;|short_idct4x4llm_neon| PROC - vld1.16 {d0}, [r12] - vswp d3, d4 ;q2(vp[4] vp[12]) - - vqdmulh.s16 q3, q2, d0[2] - vqdmulh.s16 q4, q2, d0[0] - - vqadd.s16 d12, d2, d3 ;a1 - vqsub.s16 d13, d2, d3 ;b1 - - vshr.s16 q3, q3, #1 - vshr.s16 q4, q4, #1 - - vqadd.s16 q3, q3, q2 - vqadd.s16 q4, q4, q2 - - vqsub.s16 d10, d6, d9 ;c1 - vqadd.s16 d11, d7, d8 ;d1 - - vqadd.s16 d2, d12, d11 - vqadd.s16 d3, d13, d10 - vqsub.s16 d4, d13, d10 - vqsub.s16 d5, d12, d11 - - vtrn.32 d2, d4 - vtrn.32 d3, d5 - vtrn.16 d2, d3 - vtrn.16 d4, d5 - -; memset(input, 0, 32) -- 32bytes - vmov.i16 q14, #0 - - vswp d3, d4 - vqdmulh.s16 q3, q2, d0[2] - vqdmulh.s16 q4, q2, d0[0] - - vqadd.s16 d12, d2, d3 ;a1 - vqsub.s16 d13, d2, d3 ;b1 - - vmov q15, q14 - - vshr.s16 q3, q3, #1 - vshr.s16 q4, q4, #1 - - vqadd.s16 q3, q3, q2 - vqadd.s16 q4, q4, q2 - - vqsub.s16 d10, d6, d9 ;c1 - vqadd.s16 d11, d7, d8 ;d1 - - vqadd.s16 d2, d12, d11 - vqadd.s16 d3, d13, d10 - vqsub.s16 d4, d13, d10 - vqsub.s16 d5, d12, d11 - - vst1.16 {q14, q15}, [r0] - - vrshr.s16 d2, d2, #3 - vrshr.s16 d3, d3, #3 - vrshr.s16 d4, d4, #3 - vrshr.s16 d5, d5, #3 - - vtrn.32 d2, d4 - vtrn.32 d3, d5 - vtrn.16 d2, d3 - vtrn.16 d4, d5 - - vaddw.u8 q1, q1, d14 - vaddw.u8 q2, q2, d15 - - vqmovun.s16 d0, q1 - vqmovun.s16 d1, q2 - - vst1.32 {d0[0]}, [r3], r1 - vst1.32 {d0[1]}, [r3], r1 - vst1.32 {d1[0]}, [r3], r1 - vst1.32 {d1[1]}, [r3] - - bx lr - - ENDP ; |vp8_dequant_idct_add_neon| - -; Constant Pool -cospi8sqrt2minus1 DCD 0x4e7b4e7b -sinpi8sqrt2 DCD 0x8a8c8a8c - - END diff --git a/vp9/decoder/arm/neon/vp9_dequantizeb_neon.asm b/vp9/decoder/arm/neon/vp9_dequantizeb_neon.asm deleted file mode 100644 index c8e0c31f2..000000000 --- a/vp9/decoder/arm/neon/vp9_dequantizeb_neon.asm +++ /dev/null @@ -1,34 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_dequantize_b_loop_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; r0 short *Q, -; r1 short *DQC -; r2 short *DQ -|vp8_dequantize_b_loop_neon| PROC - vld1.16 {q0, q1}, [r0] - vld1.16 {q2, q3}, [r1] - - vmul.i16 q4, q0, q2 - vmul.i16 q5, q1, q3 - - vst1.16 {q4, q5}, [r2] - - bx lr - - ENDP - - END diff --git a/vp9/decoder/arm/neon/vp9_idct_blk_neon.c b/vp9/decoder/arm/neon/vp9_idct_blk_neon.c deleted file mode 100644 index f2620d972..000000000 --- a/vp9/decoder/arm/neon/vp9_idct_blk_neon.c +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vpx_ports/config.h" -#include "vp9/common/vp9_blockd.h" -#include "vp9/decoder/vp9_dequantize.h" - -/* place these declarations here because we don't want to maintain them - * outside of this scope - */ -void idct_dequant_dc_full_2x_neon -(short *input, short *dq, unsigned char *pre, unsigned char *dst, - int stride, short *dc); -void idct_dequant_dc_0_2x_neon -(short *dc, unsigned char *pre, unsigned char *dst, int stride); -void idct_dequant_full_2x_neon -(short *q, short *dq, unsigned char *pre, unsigned char *dst, - int pitch, int stride); -void idct_dequant_0_2x_neon -(short *q, short dq, unsigned char *pre, int pitch, - unsigned char *dst, int stride); - -void vp8_dequant_dc_idct_add_y_block_neon(short *q, short *dq, - unsigned char *pre, - unsigned char *dst, int stride, - unsigned short *eobs, short *dc) { - int i; - - for (i = 0; i < 4; i++) { - if (((short *)eobs)[0] & 0xfefe) - idct_dequant_dc_full_2x_neon(q, dq, pre, dst, stride, dc); - else - idct_dequant_dc_0_2x_neon(dc, pre, dst, stride); - - if (((short *)eobs)[1] & 0xfefe) - idct_dequant_dc_full_2x_neon(q + 32, dq, pre + 8, dst + 8, stride, dc + 2); - else - idct_dequant_dc_0_2x_neon(dc + 2, pre + 8, dst + 8, stride); - - q += 64; - dc += 4; - pre += 64; - dst += 4 * stride; - eobs += 4; - } -} - -void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *pre, - unsigned char *dst, int stride, - unsigned short *eobs) { - int i; - - for (i = 0; i < 4; i++) { - if (((short *)eobs)[0] & 0xfefe) - idct_dequant_full_2x_neon(q, dq, pre, dst, 16, stride); - else - idct_dequant_0_2x_neon(q, dq[0], pre, 16, dst, stride); - - if (((short *)eobs)[1] & 0xfefe) - idct_dequant_full_2x_neon(q + 32, dq, pre + 8, dst + 8, 16, stride); - else - idct_dequant_0_2x_neon(q + 32, dq[0], pre + 8, 16, dst + 8, stride); - - q += 64; - pre += 64; - dst += 4 * stride; - eobs += 4; - } -} - -void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq, - unsigned char *pre, - unsigned char *dstu, - unsigned char *dstv, int stride, - unsigned short *eobs) { - if (((short *)eobs)[0] & 0xfefe) - idct_dequant_full_2x_neon(q, dq, pre, dstu, 8, stride); - else - idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstu, stride); - - q += 32; - pre += 32; - dstu += 4 * stride; - - if (((short *)eobs)[1] & 0xfefe) - idct_dequant_full_2x_neon(q, dq, pre, dstu, 8, stride); - else - idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstu, stride); - - q += 32; - pre += 32; - - if (((short *)eobs)[2] & 0xfefe) - idct_dequant_full_2x_neon(q, dq, pre, dstv, 8, stride); - else - idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstv, stride); - - q += 32; - pre += 32; - dstv += 4 * stride; - - if (((short *)eobs)[3] & 0xfefe) - idct_dequant_full_2x_neon(q, dq, pre, dstv, 8, stride); - else - idct_dequant_0_2x_neon(q, dq[0], pre, 8, dstv, stride); -} diff --git a/vp9/decoder/arm/neon/vp9_idct_dequant_0_2x_neon.asm b/vp9/decoder/arm/neon/vp9_idct_dequant_0_2x_neon.asm deleted file mode 100644 index 456f8e1d4..000000000 --- a/vp9/decoder/arm/neon/vp9_idct_dequant_0_2x_neon.asm +++ /dev/null @@ -1,79 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. -; - - - EXPORT |idct_dequant_0_2x_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;void idct_dequant_0_2x_neon(short *q, short dq, unsigned char *pre, -; int pitch, unsigned char *dst, int stride); -; r0 *q -; r1 dq -; r2 *pre -; r3 pitch -; sp *dst -; sp+4 stride -|idct_dequant_0_2x_neon| PROC - add r12, r2, #4 - vld1.32 {d2[0]}, [r2], r3 - vld1.32 {d2[1]}, [r2], r3 - vld1.32 {d4[0]}, [r2], r3 - vld1.32 {d4[1]}, [r2] - vld1.32 {d8[0]}, [r12], r3 - vld1.32 {d8[1]}, [r12], r3 - vld1.32 {d10[0]}, [r12], r3 - vld1.32 {d10[1]}, [r12] - - ldrh r12, [r0] ; lo q - ldrh r2, [r0, #32] ; hi q - mov r3, #0 - strh r3, [r0] - strh r3, [r0, #32] - - sxth r12, r12 ; lo - mul r0, r12, r1 - add r0, r0, #4 - asr r0, r0, #3 - vdup.16 q0, r0 - sxth r2, r2 ; hi - mul r0, r2, r1 - add r0, r0, #4 - asr r0, r0, #3 - vdup.16 q3, r0 - - vaddw.u8 q1, q0, d2 ; lo - vaddw.u8 q2, q0, d4 - vaddw.u8 q4, q3, d8 ; hi - vaddw.u8 q5, q3, d10 - - ldr r2, [sp] ; dst - ldr r3, [sp, #4] ; stride - - vqmovun.s16 d2, q1 ; lo - vqmovun.s16 d4, q2 - vqmovun.s16 d8, q4 ; hi - vqmovun.s16 d10, q5 - - add r0, r2, #4 - vst1.32 {d2[0]}, [r2], r3 ; lo - vst1.32 {d2[1]}, [r2], r3 - vst1.32 {d4[0]}, [r2], r3 - vst1.32 {d4[1]}, [r2] - vst1.32 {d8[0]}, [r0], r3 ; hi - vst1.32 {d8[1]}, [r0], r3 - vst1.32 {d10[0]}, [r0], r3 - vst1.32 {d10[1]}, [r0] - - bx lr - - ENDP ; |idct_dequant_0_2x_neon| - END diff --git a/vp9/decoder/arm/neon/vp9_idct_dequant_dc_0_2x_neon.asm b/vp9/decoder/arm/neon/vp9_idct_dequant_dc_0_2x_neon.asm deleted file mode 100644 index 0dc036acb..000000000 --- a/vp9/decoder/arm/neon/vp9_idct_dequant_dc_0_2x_neon.asm +++ /dev/null @@ -1,69 +0,0 @@ -; -; Copyright (c) 2010 The Webm project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. -; - - - EXPORT |idct_dequant_dc_0_2x_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;void idct_dequant_dc_0_2x_neon(short *dc, unsigned char *pre, -; unsigned char *dst, int stride); -; r0 *dc -; r1 *pre -; r2 *dst -; r3 stride -|idct_dequant_dc_0_2x_neon| PROC - ldr r0, [r0] ; *dc - mov r12, #16 - - vld1.32 {d2[0]}, [r1], r12 ; lo - vld1.32 {d2[1]}, [r1], r12 - vld1.32 {d4[0]}, [r1], r12 - vld1.32 {d4[1]}, [r1] - sub r1, r1, #44 - vld1.32 {d8[0]}, [r1], r12 ; hi - vld1.32 {d8[1]}, [r1], r12 - vld1.32 {d10[0]}, [r1], r12 - vld1.32 {d10[1]}, [r1] - - sxth r1, r0 ; lo *dc - add r1, r1, #4 - asr r1, r1, #3 - vdup.16 q0, r1 - sxth r0, r0, ror #16 ; hi *dc - add r0, r0, #4 - asr r0, r0, #3 - vdup.16 q3, r0 - - vaddw.u8 q1, q0, d2 ; lo - vaddw.u8 q2, q0, d4 - vaddw.u8 q4, q3, d8 ; hi - vaddw.u8 q5, q3, d10 - - vqmovun.s16 d2, q1 ; lo - vqmovun.s16 d4, q2 - vqmovun.s16 d8, q4 ; hi - vqmovun.s16 d10, q5 - - add r0, r2, #4 - vst1.32 {d2[0]}, [r2], r3 ; lo - vst1.32 {d2[1]}, [r2], r3 - vst1.32 {d4[0]}, [r2], r3 - vst1.32 {d4[1]}, [r2] - vst1.32 {d8[0]}, [r0], r3 ; hi - vst1.32 {d8[1]}, [r0], r3 - vst1.32 {d10[0]}, [r0], r3 - vst1.32 {d10[1]}, [r0] - - bx lr - - ENDP ;|idct_dequant_dc_0_2x_neon| - END diff --git a/vp9/decoder/arm/neon/vp9_idct_dequant_dc_full_2x_neon.asm b/vp9/decoder/arm/neon/vp9_idct_dequant_dc_full_2x_neon.asm deleted file mode 100644 index 61fa66075..000000000 --- a/vp9/decoder/arm/neon/vp9_idct_dequant_dc_full_2x_neon.asm +++ /dev/null @@ -1,205 +0,0 @@ -; -; Copyright (c) 2010 The Webm project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |idct_dequant_dc_full_2x_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;void idct_dequant_dc_full_2x_neon(short *q, short *dq, unsigned char *pre, -; unsigned char *dst, int stride, short *dc); -; r0 *q, -; r1 *dq, -; r2 *pre -; r3 *dst -; sp stride -; sp+4 *dc -|idct_dequant_dc_full_2x_neon| PROC - vld1.16 {q0, q1}, [r1] ; dq (same l/r) - vld1.16 {q2, q3}, [r0] ; l q - mov r1, #16 ; pitch - add r0, r0, #32 - vld1.16 {q4, q5}, [r0] ; r q - add r12, r2, #4 - ; interleave the predictors - vld1.32 {d28[0]}, [r2], r1 ; l pre - vld1.32 {d28[1]}, [r12], r1 ; r pre - vld1.32 {d29[0]}, [r2], r1 - vld1.32 {d29[1]}, [r12], r1 - vld1.32 {d30[0]}, [r2], r1 - vld1.32 {d30[1]}, [r12], r1 - vld1.32 {d31[0]}, [r2] - ldr r1, [sp, #4] - vld1.32 {d31[1]}, [r12] - - adr r2, cospi8sqrt2minus1 ; pointer to the first constant - - ldrh r12, [r1], #2 ; lo *dc - ldrh r1, [r1] ; hi *dc - - ; dequant: q[i] = q[i] * dq[i] - vmul.i16 q2, q2, q0 - vmul.i16 q3, q3, q1 - vmul.i16 q4, q4, q0 - vmul.i16 q5, q5, q1 - - ; move dc up to neon and overwrite first element - vmov.16 d4[0], r12 - vmov.16 d8[0], r1 - - vld1.16 {d0}, [r2] - - ; q2: l0r0 q3: l8r8 - ; q4: l4r4 q5: l12r12 - vswp d5, d8 - vswp d7, d10 - - ; _CONSTANTS_ * 4,12 >> 16 - ; q6: 4 * sinpi : c1/temp1 - ; q7: 12 * sinpi : d1/temp2 - ; q8: 4 * cospi - ; q9: 12 * cospi - vqdmulh.s16 q6, q4, d0[2] ; sinpi8sqrt2 - vqdmulh.s16 q7, q5, d0[2] - vqdmulh.s16 q8, q4, d0[0] ; cospi8sqrt2minus1 - vqdmulh.s16 q9, q5, d0[0] - - vqadd.s16 q10, q2, q3 ; a1 = 0 + 8 - vqsub.s16 q11, q2, q3 ; b1 = 0 - 8 - - ; vqdmulh only accepts signed values. this was a problem because - ; our constant had the high bit set, and was treated as a negative value. - ; vqdmulh also doubles the value before it shifts by 16. we need to - ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0, - ; so we can shift the constant without losing precision. this avoids - ; shift again afterward, but also avoids the sign issue. win win! - ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we - ; pre-shift it - vshr.s16 q8, q8, #1 - vshr.s16 q9, q9, #1 - - ; q4: 4 + 4 * cospi : d1/temp1 - ; q5: 12 + 12 * cospi : c1/temp2 - vqadd.s16 q4, q4, q8 - vqadd.s16 q5, q5, q9 - - ; c1 = temp1 - temp2 - ; d1 = temp1 + temp2 - vqsub.s16 q2, q6, q5 - vqadd.s16 q3, q4, q7 - - ; [0]: a1+d1 - ; [1]: b1+c1 - ; [2]: b1-c1 - ; [3]: a1-d1 - vqadd.s16 q4, q10, q3 - vqadd.s16 q5, q11, q2 - vqsub.s16 q6, q11, q2 - vqsub.s16 q7, q10, q3 - - ; rotate - vtrn.32 q4, q6 - vtrn.32 q5, q7 - vtrn.16 q4, q5 - vtrn.16 q6, q7 - ; idct loop 2 - ; q4: l 0, 4, 8,12 r 0, 4, 8,12 - ; q5: l 1, 5, 9,13 r 1, 5, 9,13 - ; q6: l 2, 6,10,14 r 2, 6,10,14 - ; q7: l 3, 7,11,15 r 3, 7,11,15 - - ; q8: 1 * sinpi : c1/temp1 - ; q9: 3 * sinpi : d1/temp2 - ; q10: 1 * cospi - ; q11: 3 * cospi - vqdmulh.s16 q8, q5, d0[2] ; sinpi8sqrt2 - vqdmulh.s16 q9, q7, d0[2] - vqdmulh.s16 q10, q5, d0[0] ; cospi8sqrt2minus1 - vqdmulh.s16 q11, q7, d0[0] - - vqadd.s16 q2, q4, q6 ; a1 = 0 + 2 - vqsub.s16 q3, q4, q6 ; b1 = 0 - 2 - - ; see note on shifting above - vshr.s16 q10, q10, #1 - vshr.s16 q11, q11, #1 - - ; q10: 1 + 1 * cospi : d1/temp1 - ; q11: 3 + 3 * cospi : c1/temp2 - vqadd.s16 q10, q5, q10 - vqadd.s16 q11, q7, q11 - - ; q8: c1 = temp1 - temp2 - ; q9: d1 = temp1 + temp2 - vqsub.s16 q8, q8, q11 - vqadd.s16 q9, q10, q9 - - ; a1+d1 - ; b1+c1 - ; b1-c1 - ; a1-d1 - vqadd.s16 q4, q2, q9 - vqadd.s16 q5, q3, q8 - vqsub.s16 q6, q3, q8 - vqsub.s16 q7, q2, q9 - - ; +4 >> 3 (rounding) - vrshr.s16 q4, q4, #3 ; lo - vrshr.s16 q5, q5, #3 - vrshr.s16 q6, q6, #3 ; hi - vrshr.s16 q7, q7, #3 - - vtrn.32 q4, q6 - vtrn.32 q5, q7 - vtrn.16 q4, q5 - vtrn.16 q6, q7 - - ; adding pre - ; input is still packed. pre was read interleaved - vaddw.u8 q4, q4, d28 - vaddw.u8 q5, q5, d29 - vaddw.u8 q6, q6, d30 - vaddw.u8 q7, q7, d31 - - vmov.i16 q14, #0 - vmov q15, q14 - vst1.16 {q14, q15}, [r0] ; write over high input - sub r0, r0, #32 - vst1.16 {q14, q15}, [r0] ; write over low input - - ;saturate and narrow - vqmovun.s16 d0, q4 ; lo - vqmovun.s16 d1, q5 - vqmovun.s16 d2, q6 ; hi - vqmovun.s16 d3, q7 - - ldr r1, [sp] ; stride - add r2, r3, #4 ; hi - vst1.32 {d0[0]}, [r3], r1 ; lo - vst1.32 {d0[1]}, [r2], r1 ; hi - vst1.32 {d1[0]}, [r3], r1 - vst1.32 {d1[1]}, [r2], r1 - vst1.32 {d2[0]}, [r3], r1 - vst1.32 {d2[1]}, [r2], r1 - vst1.32 {d3[0]}, [r3] - vst1.32 {d3[1]}, [r2] - - bx lr - - ENDP ; |idct_dequant_dc_full_2x_neon| - -; Constant Pool -cospi8sqrt2minus1 DCD 0x4e7b -; because the lowest bit in 0x8a8c is 0, we can pre-shift this -sinpi8sqrt2 DCD 0x4546 - - END diff --git a/vp9/decoder/arm/neon/vp9_idct_dequant_full_2x_neon.asm b/vp9/decoder/arm/neon/vp9_idct_dequant_full_2x_neon.asm deleted file mode 100644 index 772ec4685..000000000 --- a/vp9/decoder/arm/neon/vp9_idct_dequant_full_2x_neon.asm +++ /dev/null @@ -1,197 +0,0 @@ -; -; Copyright (c) 2010 The Webm project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |idct_dequant_full_2x_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;void idct_dequant_full_2x_neon(short *q, short *dq, unsigned char *pre, -; unsigned char *dst, int pitch, int stride); -; r0 *q, -; r1 *dq, -; r2 *pre -; r3 *dst -; sp pitch -; sp+4 stride -|idct_dequant_full_2x_neon| PROC - vld1.16 {q0, q1}, [r1] ; dq (same l/r) - vld1.16 {q2, q3}, [r0] ; l q - ldr r1, [sp] ; pitch - add r0, r0, #32 - vld1.16 {q4, q5}, [r0] ; r q - add r12, r2, #4 - ; interleave the predictors - vld1.32 {d28[0]}, [r2], r1 ; l pre - vld1.32 {d28[1]}, [r12], r1 ; r pre - vld1.32 {d29[0]}, [r2], r1 - vld1.32 {d29[1]}, [r12], r1 - vld1.32 {d30[0]}, [r2], r1 - vld1.32 {d30[1]}, [r12], r1 - vld1.32 {d31[0]}, [r2] - vld1.32 {d31[1]}, [r12] - - adr r2, cospi8sqrt2minus1 ; pointer to the first constant - - ; dequant: q[i] = q[i] * dq[i] - vmul.i16 q2, q2, q0 - vmul.i16 q3, q3, q1 - vmul.i16 q4, q4, q0 - vmul.i16 q5, q5, q1 - - vld1.16 {d0}, [r2] - - ; q2: l0r0 q3: l8r8 - ; q4: l4r4 q5: l12r12 - vswp d5, d8 - vswp d7, d10 - - ; _CONSTANTS_ * 4,12 >> 16 - ; q6: 4 * sinpi : c1/temp1 - ; q7: 12 * sinpi : d1/temp2 - ; q8: 4 * cospi - ; q9: 12 * cospi - vqdmulh.s16 q6, q4, d0[2] ; sinpi8sqrt2 - vqdmulh.s16 q7, q5, d0[2] - vqdmulh.s16 q8, q4, d0[0] ; cospi8sqrt2minus1 - vqdmulh.s16 q9, q5, d0[0] - - vqadd.s16 q10, q2, q3 ; a1 = 0 + 8 - vqsub.s16 q11, q2, q3 ; b1 = 0 - 8 - - ; vqdmulh only accepts signed values. this was a problem because - ; our constant had the high bit set, and was treated as a negative value. - ; vqdmulh also doubles the value before it shifts by 16. we need to - ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0, - ; so we can shift the constant without losing precision. this avoids - ; shift again afterward, but also avoids the sign issue. win win! - ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we - ; pre-shift it - vshr.s16 q8, q8, #1 - vshr.s16 q9, q9, #1 - - ; q4: 4 + 4 * cospi : d1/temp1 - ; q5: 12 + 12 * cospi : c1/temp2 - vqadd.s16 q4, q4, q8 - vqadd.s16 q5, q5, q9 - - ; c1 = temp1 - temp2 - ; d1 = temp1 + temp2 - vqsub.s16 q2, q6, q5 - vqadd.s16 q3, q4, q7 - - ; [0]: a1+d1 - ; [1]: b1+c1 - ; [2]: b1-c1 - ; [3]: a1-d1 - vqadd.s16 q4, q10, q3 - vqadd.s16 q5, q11, q2 - vqsub.s16 q6, q11, q2 - vqsub.s16 q7, q10, q3 - - ; rotate - vtrn.32 q4, q6 - vtrn.32 q5, q7 - vtrn.16 q4, q5 - vtrn.16 q6, q7 - ; idct loop 2 - ; q4: l 0, 4, 8,12 r 0, 4, 8,12 - ; q5: l 1, 5, 9,13 r 1, 5, 9,13 - ; q6: l 2, 6,10,14 r 2, 6,10,14 - ; q7: l 3, 7,11,15 r 3, 7,11,15 - - ; q8: 1 * sinpi : c1/temp1 - ; q9: 3 * sinpi : d1/temp2 - ; q10: 1 * cospi - ; q11: 3 * cospi - vqdmulh.s16 q8, q5, d0[2] ; sinpi8sqrt2 - vqdmulh.s16 q9, q7, d0[2] - vqdmulh.s16 q10, q5, d0[0] ; cospi8sqrt2minus1 - vqdmulh.s16 q11, q7, d0[0] - - vqadd.s16 q2, q4, q6 ; a1 = 0 + 2 - vqsub.s16 q3, q4, q6 ; b1 = 0 - 2 - - ; see note on shifting above - vshr.s16 q10, q10, #1 - vshr.s16 q11, q11, #1 - - ; q10: 1 + 1 * cospi : d1/temp1 - ; q11: 3 + 3 * cospi : c1/temp2 - vqadd.s16 q10, q5, q10 - vqadd.s16 q11, q7, q11 - - ; q8: c1 = temp1 - temp2 - ; q9: d1 = temp1 + temp2 - vqsub.s16 q8, q8, q11 - vqadd.s16 q9, q10, q9 - - ; a1+d1 - ; b1+c1 - ; b1-c1 - ; a1-d1 - vqadd.s16 q4, q2, q9 - vqadd.s16 q5, q3, q8 - vqsub.s16 q6, q3, q8 - vqsub.s16 q7, q2, q9 - - ; +4 >> 3 (rounding) - vrshr.s16 q4, q4, #3 ; lo - vrshr.s16 q5, q5, #3 - vrshr.s16 q6, q6, #3 ; hi - vrshr.s16 q7, q7, #3 - - vtrn.32 q4, q6 - vtrn.32 q5, q7 - vtrn.16 q4, q5 - vtrn.16 q6, q7 - - ; adding pre - ; input is still packed. pre was read interleaved - vaddw.u8 q4, q4, d28 - vaddw.u8 q5, q5, d29 - vaddw.u8 q6, q6, d30 - vaddw.u8 q7, q7, d31 - - vmov.i16 q14, #0 - vmov q15, q14 - vst1.16 {q14, q15}, [r0] ; write over high input - sub r0, r0, #32 - vst1.16 {q14, q15}, [r0] ; write over low input - - ;saturate and narrow - vqmovun.s16 d0, q4 ; lo - vqmovun.s16 d1, q5 - vqmovun.s16 d2, q6 ; hi - vqmovun.s16 d3, q7 - - ldr r1, [sp, #4] ; stride - add r2, r3, #4 ; hi - vst1.32 {d0[0]}, [r3], r1 ; lo - vst1.32 {d0[1]}, [r2], r1 ; hi - vst1.32 {d1[0]}, [r3], r1 - vst1.32 {d1[1]}, [r2], r1 - vst1.32 {d2[0]}, [r3], r1 - vst1.32 {d2[1]}, [r2], r1 - vst1.32 {d3[0]}, [r3] - vst1.32 {d3[1]}, [r2] - - bx lr - - ENDP ; |idct_dequant_full_2x_neon| - -; Constant Pool -cospi8sqrt2minus1 DCD 0x4e7b -; because the lowest bit in 0x8a8c is 0, we can pre-shift this -sinpi8sqrt2 DCD 0x4546 - - END diff --git a/vp9/decoder/arm/vp9_dequantize_arm.c b/vp9/decoder/arm/vp9_dequantize_arm.c deleted file mode 100644 index ff1eec640..000000000 --- a/vp9/decoder/arm/vp9_dequantize_arm.c +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vpx_ports/config.h" -#include "vp9/decoder/vp9_dequantize.h" -#include "vp9/common/vp9_blockd.h" -#include "vpx_mem/vpx_mem.h" - -#if HAVE_ARMV7 -extern void vp9_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ); -#endif - -#if HAVE_ARMV6 -extern void vp9_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ); -#endif - -#if HAVE_ARMV7 - -void vp9_dequantize_b_neon(BLOCKD *d) { - short *DQ = d->dqcoeff; - short *Q = d->qcoeff; - short *DQC = d->dequant; - - vp9_dequantize_b_loop_neon(Q, DQC, DQ); -} -#endif - -#if HAVE_ARMV6 -void vp9_dequantize_b_v6(BLOCKD *d) { - short *DQ = d->dqcoeff; - short *Q = d->qcoeff; - short *DQC = d->dequant; - - vp9_dequantize_b_loop_v6(Q, DQC, DQ); -} -#endif diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c index e92712bd6..e01910d53 100644 --- a/vp9/decoder/vp9_onyxd_if.c +++ b/vp9/decoder/vp9_onyxd_if.c @@ -28,9 +28,6 @@ #include "vpx_ports/vpx_timer.h" #include "vp9/decoder/vp9_decodframe.h" #include "vp9/decoder/vp9_detokenize.h" -#if ARCH_ARM -#include "vpx_ports/arm.h" -#endif static int get_free_fb(VP9_COMMON *cm); static void ref_cnt_fb(int *buf, int *idx, int new_idx); @@ -235,11 +232,6 @@ vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR ptr, VP9_REFFRAME ref_frame_flag, return pbi->common.error.error_code; } -/*For ARM NEON, d8-d15 are callee-saved registers, and need to be saved by us.*/ -#if HAVE_ARMV7 -extern void vp9_push_neon(int64_t *store); -extern void vp9_pop_neon(int64_t *store); -#endif static int get_free_fb(VP9_COMMON *cm) { int i; @@ -317,9 +309,6 @@ static int swap_frame_buffers(VP9_COMMON *cm) { int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size, const unsigned char **psource, int64_t time_stamp) { -#if HAVE_ARMV7 - int64_t dx_store_reg[8]; -#endif VP9D_COMP *pbi = (VP9D_COMP *) ptr; VP9_COMMON *cm = &pbi->common; const unsigned char *source = *psource; @@ -346,26 +335,9 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size, cm->yv12_fb[cm->lst_fb_idx].corrupted = 1; } -#if HAVE_ARMV7 -#if CONFIG_RUNTIME_CPU_DETECT - if (cm->rtcd.flags & HAS_NEON) -#endif - { - vp9_push_neon(dx_store_reg); - } -#endif - cm->new_fb_idx = get_free_fb(cm); if (setjmp(pbi->common.error.jmp)) { -#if HAVE_ARMV7 -#if CONFIG_RUNTIME_CPU_DETECT - if (cm->rtcd.flags & HAS_NEON) -#endif - { - vp9_pop_neon(dx_store_reg); - } -#endif pbi->common.error.setjmp = 0; /* We do not know if the missing frame(s) was supposed to update @@ -384,14 +356,6 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size, retcode = vp9_decode_frame(pbi, psource); if (retcode < 0) { -#if HAVE_ARMV7 -#if CONFIG_RUNTIME_CPU_DETECT - if (cm->rtcd.flags & HAS_NEON) -#endif - { - vp9_pop_neon(dx_store_reg); - } -#endif pbi->common.error.error_code = VPX_CODEC_ERROR; pbi->common.error.setjmp = 0; if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0) @@ -401,14 +365,6 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size, { if (swap_frame_buffers(cm)) { -#if HAVE_ARMV7 -#if CONFIG_RUNTIME_CPU_DETECT - if (cm->rtcd.flags & HAS_NEON) -#endif - { - vp9_pop_neon(dx_store_reg); - } -#endif pbi->common.error.error_code = VPX_CODEC_ERROR; pbi->common.error.setjmp = 0; return -1; @@ -455,14 +411,6 @@ int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size, pbi->last_time_stamp = time_stamp; pbi->source_sz = 0; -#if HAVE_ARMV7 -#if CONFIG_RUNTIME_CPU_DETECT - if (cm->rtcd.flags & HAS_NEON) -#endif - { - vp9_pop_neon(dx_store_reg); - } -#endif pbi->common.error.setjmp = 0; return retcode; } diff --git a/vp9/encoder/arm/armv5te/vp9_boolhuff_armv5te.asm b/vp9/encoder/arm/armv5te/vp9_boolhuff_armv5te.asm deleted file mode 100644 index 94e65ef8d..000000000 --- a/vp9/encoder/arm/armv5te/vp9_boolhuff_armv5te.asm +++ /dev/null @@ -1,286 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_start_encode| - EXPORT |vp9_encode_bool| - EXPORT |vp8_stop_encode| - EXPORT |vp8_encode_value| - - INCLUDE vp9_asm_enc_offsets.asm - - ARM - REQUIRE8 - PRESERVE8 - - AREA |.text|, CODE, READONLY - -; r0 BOOL_CODER *br -; r1 unsigned char *source - -|vp8_start_encode| PROC - mov r12, #0 - mov r3, #255 - mvn r2, #23 - str r12, [r0, #vp9_writer_lowvalue] - str r3, [r0, #vp9_writer_range] - str r12, [r0, #vp9_writer_value] - str r2, [r0, #vp9_writer_count] - str r12, [r0, #vp9_writer_pos] - str r1, [r0, #vp9_writer_buffer] - bx lr - ENDP - -; r0 BOOL_CODER *br -; r1 int bit -; r2 int probability -|vp9_encode_bool| PROC - push {r4-r9, lr} - - mov r4, r2 - - ldr r2, [r0, #vp9_writer_lowvalue] - ldr r5, [r0, #vp9_writer_range] - ldr r3, [r0, #vp9_writer_count] - - sub r7, r5, #1 ; range-1 - - cmp r1, #0 - mul r6, r4, r7 ; ((range-1) * probability) - - mov r7, #1 - add r4, r7, r6, lsr #8 ; 1 + (((range-1) * probability) >> 8) - - addne r2, r2, r4 ; if (bit) lowvalue += split - subne r4, r5, r4 ; if (bit) range = range-split - - ; Counting the leading zeros is used to normalize range. - clz r6, r4 - sub r6, r6, #24 ; shift - - ; Flag is set on the sum of count. This flag is used later - ; to determine if count >= 0 - adds r3, r3, r6 ; count += shift - lsl r5, r4, r6 ; range <<= shift - bmi token_count_lt_zero ; if(count >= 0) - - sub r6, r6, r3 ; offset = shift - count - sub r4, r6, #1 ; offset-1 - lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) - bpl token_high_bit_not_set - - ldr r4, [r0, #vp9_writer_pos] ; x - sub r4, r4, #1 ; x = w->pos-1 - b token_zero_while_start -token_zero_while_loop - mov r9, #0 - strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0 - sub r4, r4, #1 ; x-- -token_zero_while_start - cmp r4, #0 - ldrge r7, [r0, #vp9_writer_buffer] - ldrb r1, [r7, r4] - cmpge r1, #0xff - beq token_zero_while_loop - - ldr r7, [r0, #vp9_writer_buffer] - ldrb r9, [r7, r4] ; w->buffer[x] - add r9, r9, #1 - strb r9, [r7, r4] ; w->buffer[x] + 1 -token_high_bit_not_set - rsb r4, r6, #24 ; 24-offset - ldr r9, [r0, #vp9_writer_buffer] - lsr r7, r2, r4 ; lowvalue >> (24-offset) - ldr r4, [r0, #vp9_writer_pos] ; w->pos - lsl r2, r2, r6 ; lowvalue <<= offset - mov r6, r3 ; shift = count - add r1, r4, #1 ; w->pos++ - bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff - str r1, [r0, #vp9_writer_pos] - sub r3, r3, #8 ; count -= 8 - strb r7, [r9, r4] ; w->buffer[w->pos++] - -token_count_lt_zero - lsl r2, r2, r6 ; lowvalue <<= shift - - str r2, [r0, #vp9_writer_lowvalue] - str r5, [r0, #vp9_writer_range] - str r3, [r0, #vp9_writer_count] - pop {r4-r9, pc} - ENDP - -; r0 BOOL_CODER *br -|vp8_stop_encode| PROC - push {r4-r10, lr} - - ldr r2, [r0, #vp9_writer_lowvalue] - ldr r5, [r0, #vp9_writer_range] - ldr r3, [r0, #vp9_writer_count] - - mov r10, #32 - -stop_encode_loop - sub r7, r5, #1 ; range-1 - - mov r4, r7, lsl #7 ; ((range-1) * 128) - - mov r7, #1 - add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8) - - ; Counting the leading zeros is used to normalize range. - clz r6, r4 - sub r6, r6, #24 ; shift - - ; Flag is set on the sum of count. This flag is used later - ; to determine if count >= 0 - adds r3, r3, r6 ; count += shift - lsl r5, r4, r6 ; range <<= shift - bmi token_count_lt_zero_se ; if(count >= 0) - - sub r6, r6, r3 ; offset = shift - count - sub r4, r6, #1 ; offset-1 - lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) - bpl token_high_bit_not_set_se - - ldr r4, [r0, #vp9_writer_pos] ; x - sub r4, r4, #1 ; x = w->pos-1 - b token_zero_while_start_se -token_zero_while_loop_se - mov r9, #0 - strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0 - sub r4, r4, #1 ; x-- -token_zero_while_start_se - cmp r4, #0 - ldrge r7, [r0, #vp9_writer_buffer] - ldrb r1, [r7, r4] - cmpge r1, #0xff - beq token_zero_while_loop_se - - ldr r7, [r0, #vp9_writer_buffer] - ldrb r9, [r7, r4] ; w->buffer[x] - add r9, r9, #1 - strb r9, [r7, r4] ; w->buffer[x] + 1 -token_high_bit_not_set_se - rsb r4, r6, #24 ; 24-offset - ldr r9, [r0, #vp9_writer_buffer] - lsr r7, r2, r4 ; lowvalue >> (24-offset) - ldr r4, [r0, #vp9_writer_pos] ; w->pos - lsl r2, r2, r6 ; lowvalue <<= offset - mov r6, r3 ; shift = count - add r1, r4, #1 ; w->pos++ - bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff - str r1, [r0, #vp9_writer_pos] - sub r3, r3, #8 ; count -= 8 - strb r7, [r9, r4] ; w->buffer[w->pos++] - -token_count_lt_zero_se - lsl r2, r2, r6 ; lowvalue <<= shift - - subs r10, r10, #1 - bne stop_encode_loop - - str r2, [r0, #vp9_writer_lowvalue] - str r5, [r0, #vp9_writer_range] - str r3, [r0, #vp9_writer_count] - pop {r4-r10, pc} - - ENDP - -; r0 BOOL_CODER *br -; r1 int data -; r2 int bits -|vp8_encode_value| PROC - push {r4-r11, lr} - - mov r10, r2 - - ldr r2, [r0, #vp9_writer_lowvalue] - ldr r5, [r0, #vp9_writer_range] - ldr r3, [r0, #vp9_writer_count] - - rsb r4, r10, #32 ; 32-n - - ; v is kept in r1 during the token pack loop - lsl r1, r1, r4 ; r1 = v << 32 - n - -encode_value_loop - sub r7, r5, #1 ; range-1 - - ; Decisions are made based on the bit value shifted - ; off of v, so set a flag here based on this. - ; This value is refered to as "bb" - lsls r1, r1, #1 ; bit = v >> n - mov r4, r7, lsl #7 ; ((range-1) * 128) - - mov r7, #1 - add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8) - - addcs r2, r2, r4 ; if (bit) lowvalue += split - subcs r4, r5, r4 ; if (bit) range = range-split - - ; Counting the leading zeros is used to normalize range. - clz r6, r4 - sub r6, r6, #24 ; shift - - ; Flag is set on the sum of count. This flag is used later - ; to determine if count >= 0 - adds r3, r3, r6 ; count += shift - lsl r5, r4, r6 ; range <<= shift - bmi token_count_lt_zero_ev ; if(count >= 0) - - sub r6, r6, r3 ; offset = shift - count - sub r4, r6, #1 ; offset-1 - lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) - bpl token_high_bit_not_set_ev - - ldr r4, [r0, #vp9_writer_pos] ; x - sub r4, r4, #1 ; x = w->pos-1 - b token_zero_while_start_ev -token_zero_while_loop_ev - mov r9, #0 - strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0 - sub r4, r4, #1 ; x-- -token_zero_while_start_ev - cmp r4, #0 - ldrge r7, [r0, #vp9_writer_buffer] - ldrb r11, [r7, r4] - cmpge r11, #0xff - beq token_zero_while_loop_ev - - ldr r7, [r0, #vp9_writer_buffer] - ldrb r9, [r7, r4] ; w->buffer[x] - add r9, r9, #1 - strb r9, [r7, r4] ; w->buffer[x] + 1 -token_high_bit_not_set_ev - rsb r4, r6, #24 ; 24-offset - ldr r9, [r0, #vp9_writer_buffer] - lsr r7, r2, r4 ; lowvalue >> (24-offset) - ldr r4, [r0, #vp9_writer_pos] ; w->pos - lsl r2, r2, r6 ; lowvalue <<= offset - mov r6, r3 ; shift = count - add r11, r4, #1 ; w->pos++ - bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff - str r11, [r0, #vp9_writer_pos] - sub r3, r3, #8 ; count -= 8 - strb r7, [r9, r4] ; w->buffer[w->pos++] - -token_count_lt_zero_ev - lsl r2, r2, r6 ; lowvalue <<= shift - - subs r10, r10, #1 - bne encode_value_loop - - str r2, [r0, #vp9_writer_lowvalue] - str r5, [r0, #vp9_writer_range] - str r3, [r0, #vp9_writer_count] - pop {r4-r11, pc} - ENDP - - END diff --git a/vp9/encoder/arm/armv5te/vp9_packtokens_armv5.asm b/vp9/encoder/arm/armv5te/vp9_packtokens_armv5.asm deleted file mode 100644 index 9ccbaa6c1..000000000 --- a/vp9/encoder/arm/armv5te/vp9_packtokens_armv5.asm +++ /dev/null @@ -1,291 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8cx_pack_tokens_armv5| - - INCLUDE vp9_asm_enc_offsets.asm - - ARM - REQUIRE8 - PRESERVE8 - - AREA |.text|, CODE, READONLY - -; r0 vp9_writer *w -; r1 const TOKENEXTRA *p -; r2 int xcount -; r3 vp8_coef_encodings -; s0 vp8_extra_bits -; s1 vp8_coef_tree -|vp8cx_pack_tokens_armv5| PROC - push {r4-r11, lr} - - ; Add size of xcount * sizeof (TOKENEXTRA) to get stop - ; sizeof (TOKENEXTRA) is 8 - sub sp, sp, #12 - add r2, r1, r2, lsl #3 ; stop = p + xcount*sizeof(TOKENEXTRA) - str r2, [sp, #0] - str r3, [sp, #8] ; save vp8_coef_encodings - ldr r2, [r0, #vp9_writer_lowvalue] - ldr r5, [r0, #vp9_writer_range] - ldr r3, [r0, #vp9_writer_count] - b check_p_lt_stop - -while_p_lt_stop - ldrb r6, [r1, #tokenextra_token] ; t - ldr r4, [sp, #8] ; vp8_coef_encodings - mov lr, #0 - add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t - ldr r9, [r1, #tokenextra_context_tree] ; pp - - ldrb r7, [r1, #tokenextra_skip_eob_node] - - ldr r6, [r4, #vp9_token_value] ; v - ldr r8, [r4, #vp9_token_len] ; n - - ; vp8 specific skip_eob_node - cmp r7, #0 - movne lr, #2 ; i = 2 - subne r8, r8, #1 ; --n - - rsb r4, r8, #32 ; 32-n - ldr r10, [sp, #52] ; vp8_coef_tree - - ; v is kept in r12 during the token pack loop - lsl r12, r6, r4 ; r12 = v << 32 - n - -; loop start -token_loop - ldrb r4, [r9, lr, asr #1] ; pp [i>>1] - sub r7, r5, #1 ; range-1 - - ; Decisions are made based on the bit value shifted - ; off of v, so set a flag here based on this. - ; This value is refered to as "bb" - lsls r12, r12, #1 ; bb = v >> n - mul r6, r4, r7 ; ((range-1) * pp[i>>1])) - - ; bb can only be 0 or 1. So only execute this statement - ; if bb == 1, otherwise it will act like i + 0 - addcs lr, lr, #1 ; i + bb - - mov r7, #1 - ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb] - add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8) - - addcs r2, r2, r4 ; if (bb) lowvalue += split - subcs r4, r5, r4 ; if (bb) range = range-split - - ; Counting the leading zeros is used to normalize range. - clz r6, r4 - sub r6, r6, #24 ; shift - - ; Flag is set on the sum of count. This flag is used later - ; to determine if count >= 0 - adds r3, r3, r6 ; count += shift - lsl r5, r4, r6 ; range <<= shift - bmi token_count_lt_zero ; if(count >= 0) - - sub r6, r6, r3 ; offset = shift - count - sub r4, r6, #1 ; offset-1 - lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) - bpl token_high_bit_not_set - - ldr r4, [r0, #vp9_writer_pos] ; x - sub r4, r4, #1 ; x = w->pos-1 - b token_zero_while_start -token_zero_while_loop - mov r10, #0 - strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 - sub r4, r4, #1 ; x-- -token_zero_while_start - cmp r4, #0 - ldrge r7, [r0, #vp9_writer_buffer] - ldrb r11, [r7, r4] - cmpge r11, #0xff - beq token_zero_while_loop - - ldr r7, [r0, #vp9_writer_buffer] - ldrb r10, [r7, r4] ; w->buffer[x] - add r10, r10, #1 - strb r10, [r7, r4] ; w->buffer[x] + 1 -token_high_bit_not_set - rsb r4, r6, #24 ; 24-offset - ldr r10, [r0, #vp9_writer_buffer] - lsr r7, r2, r4 ; lowvalue >> (24-offset) - ldr r4, [r0, #vp9_writer_pos] ; w->pos - lsl r2, r2, r6 ; lowvalue <<= offset - mov r6, r3 ; shift = count - add r11, r4, #1 ; w->pos++ - bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff - str r11, [r0, #vp9_writer_pos] - sub r3, r3, #8 ; count -= 8 - strb r7, [r10, r4] ; w->buffer[w->pos++] - - ; r10 is used earlier in the loop, but r10 is used as - ; temp variable here. So after r10 is used, reload - ; vp8_coef_tree_dcd into r10 - ldr r10, [sp, #52] ; vp8_coef_tree - -token_count_lt_zero - lsl r2, r2, r6 ; lowvalue <<= shift - - subs r8, r8, #1 ; --n - bne token_loop - - ldrb r6, [r1, #tokenextra_token] ; t - ldr r7, [sp, #48] ; vp8_extra_bits - ; Add t * sizeof (vp9_extra_bit_struct) to get the desired - ; element. Here vp9_extra_bit_struct == 16 - add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t - - ldr r4, [r12, #vp9_extra_bit_struct_base_val] - cmp r4, #0 - beq skip_extra_bits - -; if( b->base_val) - ldr r8, [r12, #vp9_extra_bit_struct_len] ; L - ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra - cmp r8, #0 ; if( L) - beq no_extra_bits - - ldr r9, [r12, #vp9_extra_bit_struct_prob] - asr r7, lr, #1 ; v=e>>1 - - ldr r10, [r12, #vp9_extra_bit_struct_tree] - str r10, [sp, #4] ; b->tree - - rsb r4, r8, #32 - lsl r12, r7, r4 - - mov lr, #0 ; i = 0 - -extra_bits_loop - ldrb r4, [r9, lr, asr #1] ; pp[i>>1] - sub r7, r5, #1 ; range-1 - lsls r12, r12, #1 ; v >> n - mul r6, r4, r7 ; (range-1) * pp[i>>1] - addcs lr, lr, #1 ; i + bb - - mov r7, #1 - ldrsb lr, [r10, lr] ; i = b->tree[i+bb] - add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8) - - addcs r2, r2, r4 ; if (bb) lowvalue += split - subcs r4, r5, r4 ; if (bb) range = range-split - - clz r6, r4 - sub r6, r6, #24 - - adds r3, r3, r6 ; count += shift - lsl r5, r4, r6 ; range <<= shift - bmi extra_count_lt_zero ; if(count >= 0) - - sub r6, r6, r3 ; offset= shift - count - sub r4, r6, #1 ; offset-1 - lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) - bpl extra_high_bit_not_set - - ldr r4, [r0, #vp9_writer_pos] ; x - sub r4, r4, #1 ; x = w->pos - 1 - b extra_zero_while_start -extra_zero_while_loop - mov r10, #0 - strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 - sub r4, r4, #1 ; x-- -extra_zero_while_start - cmp r4, #0 - ldrge r7, [r0, #vp9_writer_buffer] - ldrb r11, [r7, r4] - cmpge r11, #0xff - beq extra_zero_while_loop - - ldr r7, [r0, #vp9_writer_buffer] - ldrb r10, [r7, r4] - add r10, r10, #1 - strb r10, [r7, r4] -extra_high_bit_not_set - rsb r4, r6, #24 ; 24-offset - ldr r10, [r0, #vp9_writer_buffer] - lsr r7, r2, r4 ; lowvalue >> (24-offset) - ldr r4, [r0, #vp9_writer_pos] - lsl r2, r2, r6 ; lowvalue <<= offset - mov r6, r3 ; shift = count - add r11, r4, #1 ; w->pos++ - bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff - str r11, [r0, #vp9_writer_pos] - sub r3, r3, #8 ; count -= 8 - strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset)) - ldr r10, [sp, #4] ; b->tree -extra_count_lt_zero - lsl r2, r2, r6 - - subs r8, r8, #1 ; --n - bne extra_bits_loop ; while (n) - -no_extra_bits - ldr lr, [r1, #4] ; e = p->Extra - add r4, r5, #1 ; range + 1 - tst lr, #1 - lsr r4, r4, #1 ; split = (range + 1) >> 1 - addne r2, r2, r4 ; lowvalue += split - subne r4, r5, r4 ; range = range-split - tst r2, #0x80000000 ; lowvalue & 0x80000000 - lsl r5, r4, #1 ; range <<= 1 - beq end_high_bit_not_set - - ldr r4, [r0, #vp9_writer_pos] - mov r7, #0 - sub r4, r4, #1 - b end_zero_while_start -end_zero_while_loop - strb r7, [r6, r4] - sub r4, r4, #1 ; x-- -end_zero_while_start - cmp r4, #0 - ldrge r6, [r0, #vp9_writer_buffer] - ldrb r12, [r6, r4] - cmpge r12, #0xff - beq end_zero_while_loop - - ldr r6, [r0, #vp9_writer_buffer] - ldrb r7, [r6, r4] - add r7, r7, #1 - strb r7, [r6, r4] -end_high_bit_not_set - adds r3, r3, #1 ; ++count - lsl r2, r2, #1 ; lowvalue <<= 1 - bne end_count_zero - - ldr r4, [r0, #vp9_writer_pos] - mvn r3, #7 - ldr r7, [r0, #vp9_writer_buffer] - lsr r6, r2, #24 ; lowvalue >> 24 - add r12, r4, #1 ; w->pos++ - bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff - str r12, [r0, #0x10] - strb r6, [r7, r4] -end_count_zero -skip_extra_bits - add r1, r1, #TOKENEXTRA_SZ ; ++p -check_p_lt_stop - ldr r4, [sp, #0] ; stop - cmp r1, r4 ; while( p < stop) - bcc while_p_lt_stop - - str r2, [r0, #vp9_writer_lowvalue] - str r5, [r0, #vp9_writer_range] - str r3, [r0, #vp9_writer_count] - add sp, sp, #12 - pop {r4-r11, pc} - ENDP - - END diff --git a/vp9/encoder/arm/armv5te/vp9_packtokens_mbrow_armv5.asm b/vp9/encoder/arm/armv5te/vp9_packtokens_mbrow_armv5.asm deleted file mode 100644 index 0938ce1a3..000000000 --- a/vp9/encoder/arm/armv5te/vp9_packtokens_mbrow_armv5.asm +++ /dev/null @@ -1,327 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8cx_pack_mb_row_tokens_armv5| - - INCLUDE vp9_asm_enc_offsets.asm - - ARM - REQUIRE8 - PRESERVE8 - - AREA |.text|, CODE, READONLY - -; r0 VP8_COMP *cpi -; r1 vp9_writer *w -; r2 vp8_coef_encodings -; r3 vp8_extra_bits -; s0 vp8_coef_tree - -|vp8cx_pack_mb_row_tokens_armv5| PROC - push {r4-r11, lr} - sub sp, sp, #24 - - ; Compute address of cpi->common.mb_rows - ldr r4, _VP8_COMP_common_ - ldr r6, _VP8_COMMON_MBrows_ - add r4, r0, r4 - - ldr r5, [r4, r6] ; load up mb_rows - - str r2, [sp, #20] ; save vp8_coef_encodings - str r5, [sp, #12] ; save mb_rows - str r3, [sp, #8] ; save vp8_extra_bits - - ldr r4, _VP8_COMP_tplist_ - add r4, r0, r4 - ldr r7, [r4, #0] ; dereference cpi->tp_list - - mov r0, r1 ; keep same as other loops - - ldr r2, [r0, #vp9_writer_lowvalue] - ldr r5, [r0, #vp9_writer_range] - ldr r3, [r0, #vp9_writer_count] - -mb_row_loop - - ldr r1, [r7, #tokenlist_start] - ldr r9, [r7, #tokenlist_stop] - str r9, [sp, #0] ; save stop for later comparison - str r7, [sp, #16] ; tokenlist address for next time - - b check_p_lt_stop - - ; actuall work gets done here! - -while_p_lt_stop - ldrb r6, [r1, #tokenextra_token] ; t - ldr r4, [sp, #20] ; vp8_coef_encodings - mov lr, #0 - add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t - ldr r9, [r1, #tokenextra_context_tree] ; pp - - ldrb r7, [r1, #tokenextra_skip_eob_node] - - ldr r6, [r4, #vp9_token_value] ; v - ldr r8, [r4, #vp9_token_len] ; n - - ; vp8 specific skip_eob_node - cmp r7, #0 - movne lr, #2 ; i = 2 - subne r8, r8, #1 ; --n - - rsb r4, r8, #32 ; 32-n - ldr r10, [sp, #60] ; vp8_coef_tree - - ; v is kept in r12 during the token pack loop - lsl r12, r6, r4 ; r12 = v << 32 - n - -; loop start -token_loop - ldrb r4, [r9, lr, asr #1] ; pp [i>>1] - sub r7, r5, #1 ; range-1 - - ; Decisions are made based on the bit value shifted - ; off of v, so set a flag here based on this. - ; This value is refered to as "bb" - lsls r12, r12, #1 ; bb = v >> n - mul r6, r4, r7 ; ((range-1) * pp[i>>1])) - - ; bb can only be 0 or 1. So only execute this statement - ; if bb == 1, otherwise it will act like i + 0 - addcs lr, lr, #1 ; i + bb - - mov r7, #1 - ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb] - add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8) - - addcs r2, r2, r4 ; if (bb) lowvalue += split - subcs r4, r5, r4 ; if (bb) range = range-split - - ; Counting the leading zeros is used to normalize range. - clz r6, r4 - sub r6, r6, #24 ; shift - - ; Flag is set on the sum of count. This flag is used later - ; to determine if count >= 0 - adds r3, r3, r6 ; count += shift - lsl r5, r4, r6 ; range <<= shift - bmi token_count_lt_zero ; if(count >= 0) - - sub r6, r6, r3 ; offset = shift - count - sub r4, r6, #1 ; offset-1 - lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) - bpl token_high_bit_not_set - - ldr r4, [r0, #vp9_writer_pos] ; x - sub r4, r4, #1 ; x = w->pos-1 - b token_zero_while_start -token_zero_while_loop - mov r10, #0 - strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 - sub r4, r4, #1 ; x-- -token_zero_while_start - cmp r4, #0 - ldrge r7, [r0, #vp9_writer_buffer] - ldrb r11, [r7, r4] - cmpge r11, #0xff - beq token_zero_while_loop - - ldr r7, [r0, #vp9_writer_buffer] - ldrb r10, [r7, r4] ; w->buffer[x] - add r10, r10, #1 - strb r10, [r7, r4] ; w->buffer[x] + 1 -token_high_bit_not_set - rsb r4, r6, #24 ; 24-offset - ldr r10, [r0, #vp9_writer_buffer] - lsr r7, r2, r4 ; lowvalue >> (24-offset) - ldr r4, [r0, #vp9_writer_pos] ; w->pos - lsl r2, r2, r6 ; lowvalue <<= offset - mov r6, r3 ; shift = count - add r11, r4, #1 ; w->pos++ - bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff - str r11, [r0, #vp9_writer_pos] - sub r3, r3, #8 ; count -= 8 - strb r7, [r10, r4] ; w->buffer[w->pos++] - - ; r10 is used earlier in the loop, but r10 is used as - ; temp variable here. So after r10 is used, reload - ; vp8_coef_tree_dcd into r10 - ldr r10, [sp, #60] ; vp8_coef_tree - -token_count_lt_zero - lsl r2, r2, r6 ; lowvalue <<= shift - - subs r8, r8, #1 ; --n - bne token_loop - - ldrb r6, [r1, #tokenextra_token] ; t - ldr r7, [sp, #8] ; vp8_extra_bits - ; Add t * sizeof (vp9_extra_bit_struct) to get the desired - ; element. Here vp9_extra_bit_struct == 16 - add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t - - ldr r4, [r12, #vp9_extra_bit_struct_base_val] - cmp r4, #0 - beq skip_extra_bits - -; if( b->base_val) - ldr r8, [r12, #vp9_extra_bit_struct_len] ; L - ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra - cmp r8, #0 ; if( L) - beq no_extra_bits - - ldr r9, [r12, #vp9_extra_bit_struct_prob] - asr r7, lr, #1 ; v=e>>1 - - ldr r10, [r12, #vp9_extra_bit_struct_tree] - str r10, [sp, #4] ; b->tree - - rsb r4, r8, #32 - lsl r12, r7, r4 - - mov lr, #0 ; i = 0 - -extra_bits_loop - ldrb r4, [r9, lr, asr #1] ; pp[i>>1] - sub r7, r5, #1 ; range-1 - lsls r12, r12, #1 ; v >> n - mul r6, r4, r7 ; (range-1) * pp[i>>1] - addcs lr, lr, #1 ; i + bb - - mov r7, #1 - ldrsb lr, [r10, lr] ; i = b->tree[i+bb] - add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8) - - addcs r2, r2, r4 ; if (bb) lowvalue += split - subcs r4, r5, r4 ; if (bb) range = range-split - - clz r6, r4 - sub r6, r6, #24 - - adds r3, r3, r6 ; count += shift - lsl r5, r4, r6 ; range <<= shift - bmi extra_count_lt_zero ; if(count >= 0) - - sub r6, r6, r3 ; offset= shift - count - sub r4, r6, #1 ; offset-1 - lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) - bpl extra_high_bit_not_set - - ldr r4, [r0, #vp9_writer_pos] ; x - sub r4, r4, #1 ; x = w->pos - 1 - b extra_zero_while_start -extra_zero_while_loop - mov r10, #0 - strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 - sub r4, r4, #1 ; x-- -extra_zero_while_start - cmp r4, #0 - ldrge r7, [r0, #vp9_writer_buffer] - ldrb r11, [r7, r4] - cmpge r11, #0xff - beq extra_zero_while_loop - - ldr r7, [r0, #vp9_writer_buffer] - ldrb r10, [r7, r4] - add r10, r10, #1 - strb r10, [r7, r4] -extra_high_bit_not_set - rsb r4, r6, #24 ; 24-offset - ldr r10, [r0, #vp9_writer_buffer] - lsr r7, r2, r4 ; lowvalue >> (24-offset) - ldr r4, [r0, #vp9_writer_pos] - lsl r2, r2, r6 ; lowvalue <<= offset - mov r6, r3 ; shift = count - add r11, r4, #1 ; w->pos++ - bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff - str r11, [r0, #vp9_writer_pos] - sub r3, r3, #8 ; count -= 8 - strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset)) - ldr r10, [sp, #4] ; b->tree -extra_count_lt_zero - lsl r2, r2, r6 - - subs r8, r8, #1 ; --n - bne extra_bits_loop ; while (n) - -no_extra_bits - ldr lr, [r1, #4] ; e = p->Extra - add r4, r5, #1 ; range + 1 - tst lr, #1 - lsr r4, r4, #1 ; split = (range + 1) >> 1 - addne r2, r2, r4 ; lowvalue += split - subne r4, r5, r4 ; range = range-split - tst r2, #0x80000000 ; lowvalue & 0x80000000 - lsl r5, r4, #1 ; range <<= 1 - beq end_high_bit_not_set - - ldr r4, [r0, #vp9_writer_pos] - mov r7, #0 - sub r4, r4, #1 - b end_zero_while_start -end_zero_while_loop - strb r7, [r6, r4] - sub r4, r4, #1 ; x-- -end_zero_while_start - cmp r4, #0 - ldrge r6, [r0, #vp9_writer_buffer] - ldrb r12, [r6, r4] - cmpge r12, #0xff - beq end_zero_while_loop - - ldr r6, [r0, #vp9_writer_buffer] - ldrb r7, [r6, r4] - add r7, r7, #1 - strb r7, [r6, r4] -end_high_bit_not_set - adds r3, r3, #1 ; ++count - lsl r2, r2, #1 ; lowvalue <<= 1 - bne end_count_zero - - ldr r4, [r0, #vp9_writer_pos] - mvn r3, #7 - ldr r7, [r0, #vp9_writer_buffer] - lsr r6, r2, #24 ; lowvalue >> 24 - add r12, r4, #1 ; w->pos++ - bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff - str r12, [r0, #0x10] - strb r6, [r7, r4] -end_count_zero -skip_extra_bits - add r1, r1, #TOKENEXTRA_SZ ; ++p -check_p_lt_stop - ldr r4, [sp, #0] ; stop - cmp r1, r4 ; while( p < stop) - bcc while_p_lt_stop - - ldr r6, [sp, #12] ; mb_rows - ldr r7, [sp, #16] ; tokenlist address - subs r6, r6, #1 - add r7, r7, #TOKENLIST_SZ ; next element in the array - str r6, [sp, #12] - bne mb_row_loop - - str r2, [r0, #vp9_writer_lowvalue] - str r5, [r0, #vp9_writer_range] - str r3, [r0, #vp9_writer_count] - add sp, sp, #24 - pop {r4-r11, pc} - ENDP - -_VP8_COMP_common_ - DCD vp8_comp_common -_VP8_COMMON_MBrows_ - DCD vp8_common_mb_rows -_VP8_COMP_tplist_ - DCD vp8_comp_tplist - - END diff --git a/vp9/encoder/arm/armv5te/vp9_packtokens_partitions_armv5.asm b/vp9/encoder/arm/armv5te/vp9_packtokens_partitions_armv5.asm deleted file mode 100644 index 4611b407d..000000000 --- a/vp9/encoder/arm/armv5te/vp9_packtokens_partitions_armv5.asm +++ /dev/null @@ -1,465 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8cx_pack_tokens_into_partitions_armv5| - - INCLUDE vp9_asm_enc_offsets.asm - - ARM - REQUIRE8 - PRESERVE8 - - AREA |.text|, CODE, READONLY - -; r0 VP8_COMP *cpi -; r1 unsigned char *cx_data -; r2 int num_part -; r3 *size -; s0 vp8_coef_encodings -; s1 vp8_extra_bits, -; s2 const vp9_tree_index *, - -|vp8cx_pack_tokens_into_partitions_armv5| PROC - push {r4-r11, lr} - sub sp, sp, #44 - - ; Compute address of cpi->common.mb_rows - ldr r4, _VP8_COMP_common_ - ldr r6, _VP8_COMMON_MBrows_ - add r4, r0, r4 - - ldr r5, [r4, r6] ; load up mb_rows - - str r5, [sp, #36] ; save mb_rows - str r1, [sp, #24] ; save cx_data - str r2, [sp, #20] ; save num_part - str r3, [sp, #8] ; save *size - - ; *size = 3*(num_part -1 ); - sub r2, r2, #1 ; num_part - 1 - add r2, r2, r2, lsl #1 ; 3*(num_part - 1) - str r2, [r3] - - add r2, r2, r1 ; cx_data + *size - str r2, [sp, #40] ; ptr - - ldr r4, _VP8_COMP_tplist_ - add r4, r0, r4 - ldr r7, [r4, #0] ; dereference cpi->tp_list - str r7, [sp, #32] ; store start of cpi->tp_list - - ldr r11, _VP8_COMP_bc2_ ; load up vp9_writer out of cpi - add r0, r0, r11 - - mov r11, #0 - str r11, [sp, #28] ; i - -numparts_loop - ldr r10, [sp, #40] ; ptr - ldr r5, [sp, #36] ; move mb_rows to the counting section - sub r5, r5, r11 ; move start point with each partition - ; mb_rows starts at i - str r5, [sp, #12] - - ; Reset all of the VP8 Writer data for each partition that - ; is processed. - ; start_encode - mov r2, #0 ; vp9_writer_lowvalue - mov r5, #255 ; vp9_writer_range - mvn r3, #23 ; vp9_writer_count - - str r2, [r0, #vp9_writer_value] - str r2, [r0, #vp9_writer_pos] - str r10, [r0, #vp9_writer_buffer] - -mb_row_loop - - ldr r1, [r7, #tokenlist_start] - ldr r9, [r7, #tokenlist_stop] - str r9, [sp, #0] ; save stop for later comparison - str r7, [sp, #16] ; tokenlist address for next time - - b check_p_lt_stop - - ; actual work gets done here! - -while_p_lt_stop - ldrb r6, [r1, #tokenextra_token] ; t - ldr r4, [sp, #80] ; vp8_coef_encodings - mov lr, #0 - add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t - ldr r9, [r1, #tokenextra_context_tree] ; pp - - ldrb r7, [r1, #tokenextra_skip_eob_node] - - ldr r6, [r4, #vp9_token_value] ; v - ldr r8, [r4, #vp9_token_len] ; n - - ; vp8 specific skip_eob_node - cmp r7, #0 - movne lr, #2 ; i = 2 - subne r8, r8, #1 ; --n - - rsb r4, r8, #32 ; 32-n - ldr r10, [sp, #88] ; vp8_coef_tree - - ; v is kept in r12 during the token pack loop - lsl r12, r6, r4 ; r12 = v << 32 - n - -; loop start -token_loop - ldrb r4, [r9, lr, asr #1] ; pp [i>>1] - sub r7, r5, #1 ; range-1 - - ; Decisions are made based on the bit value shifted - ; off of v, so set a flag here based on this. - ; This value is refered to as "bb" - lsls r12, r12, #1 ; bb = v >> n - mul r6, r4, r7 ; ((range-1) * pp[i>>1])) - - ; bb can only be 0 or 1. So only execute this statement - ; if bb == 1, otherwise it will act like i + 0 - addcs lr, lr, #1 ; i + bb - - mov r7, #1 - ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb] - add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8) - - addcs r2, r2, r4 ; if (bb) lowvalue += split - subcs r4, r5, r4 ; if (bb) range = range-split - - ; Counting the leading zeros is used to normalize range. - clz r6, r4 - sub r6, r6, #24 ; shift - - ; Flag is set on the sum of count. This flag is used later - ; to determine if count >= 0 - adds r3, r3, r6 ; count += shift - lsl r5, r4, r6 ; range <<= shift - bmi token_count_lt_zero ; if(count >= 0) - - sub r6, r6, r3 ; offset = shift - count - sub r4, r6, #1 ; offset-1 - lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) - bpl token_high_bit_not_set - - ldr r4, [r0, #vp9_writer_pos] ; x - sub r4, r4, #1 ; x = w->pos-1 - b token_zero_while_start -token_zero_while_loop - mov r10, #0 - strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 - sub r4, r4, #1 ; x-- -token_zero_while_start - cmp r4, #0 - ldrge r7, [r0, #vp9_writer_buffer] - ldrb r11, [r7, r4] - cmpge r11, #0xff - beq token_zero_while_loop - - ldr r7, [r0, #vp9_writer_buffer] - ldrb r10, [r7, r4] ; w->buffer[x] - add r10, r10, #1 - strb r10, [r7, r4] ; w->buffer[x] + 1 -token_high_bit_not_set - rsb r4, r6, #24 ; 24-offset - ldr r10, [r0, #vp9_writer_buffer] - lsr r7, r2, r4 ; lowvalue >> (24-offset) - ldr r4, [r0, #vp9_writer_pos] ; w->pos - lsl r2, r2, r6 ; lowvalue <<= offset - mov r6, r3 ; shift = count - add r11, r4, #1 ; w->pos++ - bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff - str r11, [r0, #vp9_writer_pos] - sub r3, r3, #8 ; count -= 8 - strb r7, [r10, r4] ; w->buffer[w->pos++] - - ; r10 is used earlier in the loop, but r10 is used as - ; temp variable here. So after r10 is used, reload - ; vp8_coef_tree_dcd into r10 - ldr r10, [sp, #88] ; vp8_coef_tree - -token_count_lt_zero - lsl r2, r2, r6 ; lowvalue <<= shift - - subs r8, r8, #1 ; --n - bne token_loop - - ldrb r6, [r1, #tokenextra_token] ; t - ldr r7, [sp, #84] ; vp8_extra_bits - ; Add t * sizeof (vp9_extra_bit_struct) to get the desired - ; element. Here vp9_extra_bit_struct == 16 - add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t - - ldr r4, [r12, #vp9_extra_bit_struct_base_val] - cmp r4, #0 - beq skip_extra_bits - -; if( b->base_val) - ldr r8, [r12, #vp9_extra_bit_struct_len] ; L - ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra - cmp r8, #0 ; if( L) - beq no_extra_bits - - ldr r9, [r12, #vp9_extra_bit_struct_prob] - asr r7, lr, #1 ; v=e>>1 - - ldr r10, [r12, #vp9_extra_bit_struct_tree] - str r10, [sp, #4] ; b->tree - - rsb r4, r8, #32 - lsl r12, r7, r4 - - mov lr, #0 ; i = 0 - -extra_bits_loop - ldrb r4, [r9, lr, asr #1] ; pp[i>>1] - sub r7, r5, #1 ; range-1 - lsls r12, r12, #1 ; v >> n - mul r6, r4, r7 ; (range-1) * pp[i>>1] - addcs lr, lr, #1 ; i + bb - - mov r7, #1 - ldrsb lr, [r10, lr] ; i = b->tree[i+bb] - add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8) - - addcs r2, r2, r4 ; if (bb) lowvalue += split - subcs r4, r5, r4 ; if (bb) range = range-split - - clz r6, r4 - sub r6, r6, #24 - - adds r3, r3, r6 ; count += shift - lsl r5, r4, r6 ; range <<= shift - bmi extra_count_lt_zero ; if(count >= 0) - - sub r6, r6, r3 ; offset= shift - count - sub r4, r6, #1 ; offset-1 - lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) - bpl extra_high_bit_not_set - - ldr r4, [r0, #vp9_writer_pos] ; x - sub r4, r4, #1 ; x = w->pos - 1 - b extra_zero_while_start -extra_zero_while_loop - mov r10, #0 - strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 - sub r4, r4, #1 ; x-- -extra_zero_while_start - cmp r4, #0 - ldrge r7, [r0, #vp9_writer_buffer] - ldrb r11, [r7, r4] - cmpge r11, #0xff - beq extra_zero_while_loop - - ldr r7, [r0, #vp9_writer_buffer] - ldrb r10, [r7, r4] - add r10, r10, #1 - strb r10, [r7, r4] -extra_high_bit_not_set - rsb r4, r6, #24 ; 24-offset - ldr r10, [r0, #vp9_writer_buffer] - lsr r7, r2, r4 ; lowvalue >> (24-offset) - ldr r4, [r0, #vp9_writer_pos] - lsl r2, r2, r6 ; lowvalue <<= offset - mov r6, r3 ; shift = count - add r11, r4, #1 ; w->pos++ - bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff - str r11, [r0, #vp9_writer_pos] - sub r3, r3, #8 ; count -= 8 - strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset)) - ldr r10, [sp, #4] ; b->tree -extra_count_lt_zero - lsl r2, r2, r6 - - subs r8, r8, #1 ; --n - bne extra_bits_loop ; while (n) - -no_extra_bits - ldr lr, [r1, #4] ; e = p->Extra - add r4, r5, #1 ; range + 1 - tst lr, #1 - lsr r4, r4, #1 ; split = (range + 1) >> 1 - addne r2, r2, r4 ; lowvalue += split - subne r4, r5, r4 ; range = range-split - tst r2, #0x80000000 ; lowvalue & 0x80000000 - lsl r5, r4, #1 ; range <<= 1 - beq end_high_bit_not_set - - ldr r4, [r0, #vp9_writer_pos] - mov r7, #0 - sub r4, r4, #1 - b end_zero_while_start -end_zero_while_loop - strb r7, [r6, r4] - sub r4, r4, #1 ; x-- -end_zero_while_start - cmp r4, #0 - ldrge r6, [r0, #vp9_writer_buffer] - ldrb r12, [r6, r4] - cmpge r12, #0xff - beq end_zero_while_loop - - ldr r6, [r0, #vp9_writer_buffer] - ldrb r7, [r6, r4] - add r7, r7, #1 - strb r7, [r6, r4] -end_high_bit_not_set - adds r3, r3, #1 ; ++count - lsl r2, r2, #1 ; lowvalue <<= 1 - bne end_count_zero - - ldr r4, [r0, #vp9_writer_pos] - mvn r3, #7 - ldr r7, [r0, #vp9_writer_buffer] - lsr r6, r2, #24 ; lowvalue >> 24 - add r12, r4, #1 ; w->pos++ - bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff - str r12, [r0, #0x10] - strb r6, [r7, r4] -end_count_zero -skip_extra_bits - add r1, r1, #TOKENEXTRA_SZ ; ++p -check_p_lt_stop - ldr r4, [sp, #0] ; stop - cmp r1, r4 ; while( p < stop) - bcc while_p_lt_stop - - ldr r10, [sp, #20] ; num_parts - mov r1, #TOKENLIST_SZ - mul r1, r10, r1 - - ldr r6, [sp, #12] ; mb_rows - ldr r7, [sp, #16] ; tokenlist address - subs r6, r6, r10 - add r7, r7, r1 ; next element in the array - str r6, [sp, #12] - bgt mb_row_loop - - mov r12, #32 - -stop_encode_loop - sub r7, r5, #1 ; range-1 - - mov r4, r7, lsl #7 ; ((range-1) * 128) - - mov r7, #1 - add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8) - - ; Counting the leading zeros is used to normalize range. - clz r6, r4 - sub r6, r6, #24 ; shift - - ; Flag is set on the sum of count. This flag is used later - ; to determine if count >= 0 - adds r3, r3, r6 ; count += shift - lsl r5, r4, r6 ; range <<= shift - bmi token_count_lt_zero_se ; if(count >= 0) - - sub r6, r6, r3 ; offset = shift - count - sub r4, r6, #1 ; offset-1 - lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 ) - bpl token_high_bit_not_set_se - - ldr r4, [r0, #vp9_writer_pos] ; x - sub r4, r4, #1 ; x = w->pos-1 - b token_zero_while_start_se -token_zero_while_loop_se - mov r10, #0 - strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0 - sub r4, r4, #1 ; x-- -token_zero_while_start_se - cmp r4, #0 - ldrge r7, [r0, #vp9_writer_buffer] - ldrb r11, [r7, r4] - cmpge r11, #0xff - beq token_zero_while_loop_se - - ldr r7, [r0, #vp9_writer_buffer] - ldrb r10, [r7, r4] ; w->buffer[x] - add r10, r10, #1 - strb r10, [r7, r4] ; w->buffer[x] + 1 -token_high_bit_not_set_se - rsb r4, r6, #24 ; 24-offset - ldr r10, [r0, #vp9_writer_buffer] - lsr r7, r2, r4 ; lowvalue >> (24-offset) - ldr r4, [r0, #vp9_writer_pos] ; w->pos - lsl r2, r2, r6 ; lowvalue <<= offset - mov r6, r3 ; shift = count - add r11, r4, #1 ; w->pos++ - bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff - str r11, [r0, #vp9_writer_pos] - sub r3, r3, #8 ; count -= 8 - strb r7, [r10, r4] ; w->buffer[w->pos++] - -token_count_lt_zero_se - lsl r2, r2, r6 ; lowvalue <<= shift - - subs r12, r12, #1 - bne stop_encode_loop - - ldr r10, [sp, #8] ; *size - ldr r11, [r10] - ldr r4, [r0, #vp9_writer_pos] ; w->pos - add r11, r11, r4 ; *size += w->pos - str r11, [r10] - - ldr r9, [sp, #20] ; num_parts - sub r9, r9, #1 - ldr r10, [sp, #28] ; i - cmp r10, r9 ; if(i<(num_part - 1)) - bge skip_write_partition - - ldr r12, [sp, #40] ; ptr - add r12, r12, r4 ; ptr += w->pos - str r12, [sp, #40] - - ldr r9, [sp, #24] ; cx_data - mov r8, r4, asr #8 - strb r4, [r9, #0] - strb r8, [r9, #1] - mov r4, r4, asr #16 - strb r4, [r9, #2] - - add r9, r9, #3 ; cx_data += 3 - str r9, [sp, #24] - -skip_write_partition - - ldr r11, [sp, #28] ; i - ldr r10, [sp, #20] ; num_parts - - add r11, r11, #1 ; i++ - str r11, [sp, #28] - - ldr r7, [sp, #32] ; cpi->tp_list[i] - mov r1, #TOKENLIST_SZ - add r7, r7, r1 ; next element in cpi->tp_list - str r7, [sp, #32] ; cpi->tp_list[i+1] - - cmp r10, r11 - bgt numparts_loop - - - add sp, sp, #44 - pop {r4-r11, pc} - ENDP - -_VP8_COMP_common_ - DCD vp8_comp_common -_VP8_COMMON_MBrows_ - DCD vp8_common_mb_rows -_VP8_COMP_tplist_ - DCD vp8_comp_tplist -_VP8_COMP_bc2_ - DCD vp8_comp_bc2 - - END diff --git a/vp9/encoder/arm/armv6/vp9_fast_quantize_b_armv6.asm b/vp9/encoder/arm/armv6/vp9_fast_quantize_b_armv6.asm deleted file mode 100644 index 4f75ef5e7..000000000 --- a/vp9/encoder/arm/armv6/vp9_fast_quantize_b_armv6.asm +++ /dev/null @@ -1,223 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_fast_quantize_b_armv6| - - INCLUDE vp9_asm_enc_offsets.asm - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 BLOCK *b -; r1 BLOCKD *d -|vp8_fast_quantize_b_armv6| PROC - stmfd sp!, {r1, r4-r11, lr} - - ldr r3, [r0, #vp8_block_coeff] ; coeff - ldr r4, [r0, #vp8_block_quant_fast] ; quant_fast - ldr r5, [r0, #vp8_block_round] ; round - ldr r6, [r1, #vp8_blockd_qcoeff] ; qcoeff - ldr r7, [r1, #vp8_blockd_dqcoeff] ; dqcoeff - ldr r8, [r1, #vp8_blockd_dequant] ; dequant - - ldr r2, loop_count ; loop_count=0x1000000. 'lsls' instruction - ; is used to update the counter so that - ; it can be used to mark nonzero - ; quantized coefficient pairs. - - mov r1, #0 ; flags for quantized coeffs - - ; PART 1: quantization and dequantization loop -loop - ldr r9, [r3], #4 ; [z1 | z0] - ldr r10, [r5], #4 ; [r1 | r0] - ldr r11, [r4], #4 ; [q1 | q0] - - ssat16 lr, #1, r9 ; [sz1 | sz0] - eor r9, r9, lr ; [z1 ^ sz1 | z0 ^ sz0] - ssub16 r9, r9, lr ; x = (z ^ sz) - sz - sadd16 r9, r9, r10 ; [x1+r1 | x0+r0] - - ldr r12, [r3], #4 ; [z3 | z2] - - smulbb r0, r9, r11 ; [(x0+r0)*q0] - smultt r9, r9, r11 ; [(x1+r1)*q1] - - ldr r10, [r5], #4 ; [r3 | r2] - - ssat16 r11, #1, r12 ; [sz3 | sz2] - eor r12, r12, r11 ; [z3 ^ sz3 | z2 ^ sz2] - pkhtb r0, r9, r0, asr #16 ; [y1 | y0] - ldr r9, [r4], #4 ; [q3 | q2] - ssub16 r12, r12, r11 ; x = (z ^ sz) - sz - - sadd16 r12, r12, r10 ; [x3+r3 | x2+r2] - - eor r0, r0, lr ; [(y1 ^ sz1) | (y0 ^ sz0)] - - smulbb r10, r12, r9 ; [(x2+r2)*q2] - smultt r12, r12, r9 ; [(x3+r3)*q3] - - ssub16 r0, r0, lr ; x = (y ^ sz) - sz - - cmp r0, #0 ; check if zero - orrne r1, r1, r2, lsr #24 ; add flag for nonzero coeffs - - str r0, [r6], #4 ; *qcoeff++ = x - ldr r9, [r8], #4 ; [dq1 | dq0] - - pkhtb r10, r12, r10, asr #16 ; [y3 | y2] - eor r10, r10, r11 ; [(y3 ^ sz3) | (y2 ^ sz2)] - ssub16 r10, r10, r11 ; x = (y ^ sz) - sz - - cmp r10, #0 ; check if zero - orrne r1, r1, r2, lsr #23 ; add flag for nonzero coeffs - - str r10, [r6], #4 ; *qcoeff++ = x - ldr r11, [r8], #4 ; [dq3 | dq2] - - smulbb r12, r0, r9 ; [x0*dq0] - smultt r0, r0, r9 ; [x1*dq1] - - smulbb r9, r10, r11 ; [x2*dq2] - smultt r10, r10, r11 ; [x3*dq3] - - lsls r2, r2, #2 ; update loop counter - strh r12, [r7, #0] ; dqcoeff[0] = [x0*dq0] - strh r0, [r7, #2] ; dqcoeff[1] = [x1*dq1] - strh r9, [r7, #4] ; dqcoeff[2] = [x2*dq2] - strh r10, [r7, #6] ; dqcoeff[3] = [x3*dq3] - add r7, r7, #8 ; dqcoeff += 8 - bne loop - - ; PART 2: check position for eob... - mov lr, #0 ; init eob - cmp r1, #0 ; coeffs after quantization? - ldr r11, [sp, #0] ; restore BLOCKD pointer - beq end ; skip eob calculations if all zero - - ldr r0, [r11, #vp8_blockd_qcoeff] - - ; check shortcut for nonzero qcoeffs - tst r1, #0x80 - bne quant_coeff_15_14 - tst r1, #0x20 - bne quant_coeff_13_11 - tst r1, #0x8 - bne quant_coeff_12_7 - tst r1, #0x40 - bne quant_coeff_10_9 - tst r1, #0x10 - bne quant_coeff_8_3 - tst r1, #0x2 - bne quant_coeff_6_5 - tst r1, #0x4 - bne quant_coeff_4_2 - b quant_coeff_1_0 - -quant_coeff_15_14 - ldrh r2, [r0, #30] ; rc=15, i=15 - mov lr, #16 - cmp r2, #0 - bne end - - ldrh r3, [r0, #28] ; rc=14, i=14 - mov lr, #15 - cmp r3, #0 - bne end - -quant_coeff_13_11 - ldrh r2, [r0, #22] ; rc=11, i=13 - mov lr, #14 - cmp r2, #0 - bne end - -quant_coeff_12_7 - ldrh r3, [r0, #14] ; rc=7, i=12 - mov lr, #13 - cmp r3, #0 - bne end - - ldrh r2, [r0, #20] ; rc=10, i=11 - mov lr, #12 - cmp r2, #0 - bne end - -quant_coeff_10_9 - ldrh r3, [r0, #26] ; rc=13, i=10 - mov lr, #11 - cmp r3, #0 - bne end - - ldrh r2, [r0, #24] ; rc=12, i=9 - mov lr, #10 - cmp r2, #0 - bne end - -quant_coeff_8_3 - ldrh r3, [r0, #18] ; rc=9, i=8 - mov lr, #9 - cmp r3, #0 - bne end - - ldrh r2, [r0, #12] ; rc=6, i=7 - mov lr, #8 - cmp r2, #0 - bne end - -quant_coeff_6_5 - ldrh r3, [r0, #6] ; rc=3, i=6 - mov lr, #7 - cmp r3, #0 - bne end - - ldrh r2, [r0, #4] ; rc=2, i=5 - mov lr, #6 - cmp r2, #0 - bne end - -quant_coeff_4_2 - ldrh r3, [r0, #10] ; rc=5, i=4 - mov lr, #5 - cmp r3, #0 - bne end - - ldrh r2, [r0, #16] ; rc=8, i=3 - mov lr, #4 - cmp r2, #0 - bne end - - ldrh r3, [r0, #8] ; rc=4, i=2 - mov lr, #3 - cmp r3, #0 - bne end - -quant_coeff_1_0 - ldrh r2, [r0, #2] ; rc=1, i=1 - mov lr, #2 - cmp r2, #0 - bne end - - mov lr, #1 ; rc=0, i=0 - -end - str lr, [r11, #vp8_blockd_eob] - ldmfd sp!, {r1, r4-r11, pc} - - ENDP - -loop_count - DCD 0x1000000 - - END diff --git a/vp9/encoder/arm/armv6/vp9_mse16x16_armv6.asm b/vp9/encoder/arm/armv6/vp9_mse16x16_armv6.asm deleted file mode 100644 index 8e7283667..000000000 --- a/vp9/encoder/arm/armv6/vp9_mse16x16_armv6.asm +++ /dev/null @@ -1,138 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_mse16x16_armv6| - - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -; -;note: Based on vp9_variance16x16_armv6. In this function, sum is never used. -; So, we can remove this part of calculation. - -|vp8_mse16x16_armv6| PROC - - push {r4-r9, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r12, #16 ; set loop counter to 16 (=block height) - mov r4, #0 ; initialize sse = 0 - -loop - ; 1st 4 pixels - ldr r5, [r0, #0x0] ; load 4 src pixels - ldr r6, [r2, #0x0] ; load 4 ref pixels - - mov lr, #0 ; constant zero - - usub8 r8, r5, r6 ; calculate difference - pld [r0, r1, lsl #1] - sel r7, r8, lr ; select bytes with positive difference - usub8 r9, r6, r5 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r5, r7, lr ; calculate sum of positive differences - usad8 r6, r8, lr ; calculate sum of negative differences - orr r8, r8, r7 ; differences of all 4 pixels - - ldr r5, [r0, #0x4] ; load 4 src pixels - - ; calculate sse - uxtb16 r6, r8 ; byte (two pixels) to halfwords - uxtb16 r7, r8, ror #8 ; another two pixels to halfwords - smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r6, [r2, #0x4] ; load 4 ref pixels - smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) - - usub8 r8, r5, r6 ; calculate difference - sel r7, r8, lr ; select bytes with positive difference - usub8 r9, r6, r5 ; calculate difference with reversed operands - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r5, r7, lr ; calculate sum of positive differences - usad8 r6, r8, lr ; calculate sum of negative differences - orr r8, r8, r7 ; differences of all 4 pixels - ldr r5, [r0, #0x8] ; load 4 src pixels - ; calculate sse - uxtb16 r6, r8 ; byte (two pixels) to halfwords - uxtb16 r7, r8, ror #8 ; another two pixels to halfwords - smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) - - ; 3rd 4 pixels - ldr r6, [r2, #0x8] ; load 4 ref pixels - smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) - - usub8 r8, r5, r6 ; calculate difference - sel r7, r8, lr ; select bytes with positive difference - usub8 r9, r6, r5 ; calculate difference with reversed operands - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r5, r7, lr ; calculate sum of positive differences - usad8 r6, r8, lr ; calculate sum of negative differences - orr r8, r8, r7 ; differences of all 4 pixels - - ldr r5, [r0, #0xc] ; load 4 src pixels - - ; calculate sse - uxtb16 r6, r8 ; byte (two pixels) to halfwords - uxtb16 r7, r8, ror #8 ; another two pixels to halfwords - smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) - - ; 4th 4 pixels - ldr r6, [r2, #0xc] ; load 4 ref pixels - smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) - - usub8 r8, r5, r6 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r7, r8, lr ; select bytes with positive difference - usub8 r9, r6, r5 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r5, r7, lr ; calculate sum of positive differences - usad8 r6, r8, lr ; calculate sum of negative differences - orr r8, r8, r7 ; differences of all 4 pixels - - subs r12, r12, #1 ; next row - - ; calculate sse - uxtb16 r6, r8 ; byte (two pixels) to halfwords - uxtb16 r7, r8, ror #8 ; another two pixels to halfwords - smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) - smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) - - bne loop - - ; return stuff - ldr r1, [sp, #28] ; get address of sse - mov r0, r4 ; return sse - str r4, [r1] ; store sse - - pop {r4-r9, pc} - - ENDP - - END diff --git a/vp9/encoder/arm/armv6/vp9_sad16x16_armv6.asm b/vp9/encoder/arm/armv6/vp9_sad16x16_armv6.asm deleted file mode 100644 index 4dcceb2bf..000000000 --- a/vp9/encoder/arm/armv6/vp9_sad16x16_armv6.asm +++ /dev/null @@ -1,95 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_sad16x16_armv6| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 const unsigned char *src_ptr -; r1 int src_stride -; r2 const unsigned char *ref_ptr -; r3 int ref_stride -; stack max_sad (not used) -|vp8_sad16x16_armv6| PROC - stmfd sp!, {r4-r12, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - pld [r0, r1, lsl #1] - pld [r2, r3, lsl #1] - - mov r4, #0 ; sad = 0; - mov r5, #8 ; loop count - -loop - ; 1st row - ldr r6, [r0, #0x0] ; load 4 src pixels (1A) - ldr r8, [r2, #0x0] ; load 4 ref pixels (1A) - ldr r7, [r0, #0x4] ; load 4 src pixels (1A) - ldr r9, [r2, #0x4] ; load 4 ref pixels (1A) - ldr r10, [r0, #0x8] ; load 4 src pixels (1B) - ldr r11, [r0, #0xC] ; load 4 src pixels (1B) - - usada8 r4, r8, r6, r4 ; calculate sad for 4 pixels - usad8 r8, r7, r9 ; calculate sad for 4 pixels - - ldr r12, [r2, #0x8] ; load 4 ref pixels (1B) - ldr lr, [r2, #0xC] ; load 4 ref pixels (1B) - - add r0, r0, r1 ; set src pointer to next row - add r2, r2, r3 ; set dst pointer to next row - - pld [r0, r1, lsl #1] - pld [r2, r3, lsl #1] - - usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels - usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels - - ldr r6, [r0, #0x0] ; load 4 src pixels (2A) - ldr r7, [r0, #0x4] ; load 4 src pixels (2A) - add r4, r4, r8 ; add partial sad values - - ; 2nd row - ldr r8, [r2, #0x0] ; load 4 ref pixels (2A) - ldr r9, [r2, #0x4] ; load 4 ref pixels (2A) - ldr r10, [r0, #0x8] ; load 4 src pixels (2B) - ldr r11, [r0, #0xC] ; load 4 src pixels (2B) - - usada8 r4, r6, r8, r4 ; calculate sad for 4 pixels - usad8 r8, r7, r9 ; calculate sad for 4 pixels - - ldr r12, [r2, #0x8] ; load 4 ref pixels (2B) - ldr lr, [r2, #0xC] ; load 4 ref pixels (2B) - - add r0, r0, r1 ; set src pointer to next row - add r2, r2, r3 ; set dst pointer to next row - - usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels - usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels - - pld [r0, r1, lsl #1] - pld [r2, r3, lsl #1] - - subs r5, r5, #1 ; decrement loop counter - add r4, r4, r8 ; add partial sad values - - bne loop - - mov r0, r4 ; return sad - ldmfd sp!, {r4-r12, pc} - - ENDP - - END diff --git a/vp9/encoder/arm/armv6/vp9_short_fdct4x4_armv6.asm b/vp9/encoder/arm/armv6/vp9_short_fdct4x4_armv6.asm deleted file mode 100644 index 8034c1db9..000000000 --- a/vp9/encoder/arm/armv6/vp9_short_fdct4x4_armv6.asm +++ /dev/null @@ -1,262 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - EXPORT |vp8_short_fdct4x4_armv6| - - ARM - REQUIRE8 - PRESERVE8 - - AREA |.text|, CODE, READONLY -; void vp8_short_fdct4x4_c(short *input, short *output, int pitch) -|vp8_short_fdct4x4_armv6| PROC - - stmfd sp!, {r4 - r12, lr} - - ; PART 1 - - ; coeffs 0-3 - ldrd r4, r5, [r0] ; [i1 | i0] [i3 | i2] - - ldr r10, c7500 - ldr r11, c14500 - ldr r12, c0x22a453a0 ; [2217*4 | 5352*4] - ldr lr, c0x00080008 - ror r5, r5, #16 ; [i2 | i3] - - qadd16 r6, r4, r5 ; [i1+i2 | i0+i3] = [b1 | a1] without shift - qsub16 r7, r4, r5 ; [i1-i2 | i0-i3] = [c1 | d1] without shift - - add r0, r0, r2 ; update input pointer - - qadd16 r7, r7, r7 ; 2*[c1|d1] --> we can use smlad and smlsd - ; with 2217*4 and 5352*4 without losing the - ; sign bit (overflow) - - smuad r4, r6, lr ; o0 = (i1+i2)*8 + (i0+i3)*8 - smusd r5, r6, lr ; o2 = (i1+i2)*8 - (i0+i3)*8 - - smlad r6, r7, r12, r11 ; o1 = (c1 * 2217 + d1 * 5352 + 14500) - smlsdx r7, r7, r12, r10 ; o3 = (d1 * 2217 - c1 * 5352 + 7500) - - ldrd r8, r9, [r0] ; [i5 | i4] [i7 | i6] - - pkhbt r3, r4, r6, lsl #4 ; [o1 | o0], keep in register for PART 2 - pkhbt r6, r5, r7, lsl #4 ; [o3 | o2] - - str r6, [r1, #4] - - ; coeffs 4-7 - ror r9, r9, #16 ; [i6 | i7] - - qadd16 r6, r8, r9 ; [i5+i6 | i4+i7] = [b1 | a1] without shift - qsub16 r7, r8, r9 ; [i5-i6 | i4-i7] = [c1 | d1] without shift - - add r0, r0, r2 ; update input pointer - - qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd - ; with 2217*4 and 5352*4 without losing the - ; sign bit (overflow) - - smuad r9, r6, lr ; o4 = (i5+i6)*8 + (i4+i7)*8 - smusd r8, r6, lr ; o6 = (i5+i6)*8 - (i4+i7)*8 - - smlad r6, r7, r12, r11 ; o5 = (c1 * 2217 + d1 * 5352 + 14500) - smlsdx r7, r7, r12, r10 ; o7 = (d1 * 2217 - c1 * 5352 + 7500) - - ldrd r4, r5, [r0] ; [i9 | i8] [i11 | i10] - - pkhbt r9, r9, r6, lsl #4 ; [o5 | o4], keep in register for PART 2 - pkhbt r6, r8, r7, lsl #4 ; [o7 | o6] - - str r6, [r1, #12] - - ; coeffs 8-11 - ror r5, r5, #16 ; [i10 | i11] - - qadd16 r6, r4, r5 ; [i9+i10 | i8+i11]=[b1 | a1] without shift - qsub16 r7, r4, r5 ; [i9-i10 | i8-i11]=[c1 | d1] without shift - - add r0, r0, r2 ; update input pointer - - qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd - ; with 2217*4 and 5352*4 without losing the - ; sign bit (overflow) - - smuad r2, r6, lr ; o8 = (i9+i10)*8 + (i8+i11)*8 - smusd r8, r6, lr ; o10 = (i9+i10)*8 - (i8+i11)*8 - - smlad r6, r7, r12, r11 ; o9 = (c1 * 2217 + d1 * 5352 + 14500) - smlsdx r7, r7, r12, r10 ; o11 = (d1 * 2217 - c1 * 5352 + 7500) - - ldrd r4, r5, [r0] ; [i13 | i12] [i15 | i14] - - pkhbt r2, r2, r6, lsl #4 ; [o9 | o8], keep in register for PART 2 - pkhbt r6, r8, r7, lsl #4 ; [o11 | o10] - - str r6, [r1, #20] - - ; coeffs 12-15 - ror r5, r5, #16 ; [i14 | i15] - - qadd16 r6, r4, r5 ; [i13+i14 | i12+i15]=[b1|a1] without shift - qsub16 r7, r4, r5 ; [i13-i14 | i12-i15]=[c1|d1] without shift - - qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd - ; with 2217*4 and 5352*4 without losing the - ; sign bit (overflow) - - smuad r4, r6, lr ; o12 = (i13+i14)*8 + (i12+i15)*8 - smusd r5, r6, lr ; o14 = (i13+i14)*8 - (i12+i15)*8 - - smlad r6, r7, r12, r11 ; o13 = (c1 * 2217 + d1 * 5352 + 14500) - smlsdx r7, r7, r12, r10 ; o15 = (d1 * 2217 - c1 * 5352 + 7500) - - pkhbt r0, r4, r6, lsl #4 ; [o13 | o12], keep in register for PART 2 - pkhbt r6, r5, r7, lsl #4 ; [o15 | o14] - - str r6, [r1, #28] - - - ; PART 2 ------------------------------------------------- - ldr r11, c12000 - ldr r10, c51000 - ldr lr, c0x00070007 - - qadd16 r4, r3, r0 ; a1 = [i1+i13 | i0+i12] - qadd16 r5, r9, r2 ; b1 = [i5+i9 | i4+i8] - qsub16 r6, r9, r2 ; c1 = [i5-i9 | i4-i8] - qsub16 r7, r3, r0 ; d1 = [i1-i13 | i0-i12] - - qadd16 r4, r4, lr ; a1 + 7 - - add r0, r11, #0x10000 ; add (d!=0) - - qadd16 r2, r4, r5 ; a1 + b1 + 7 - qsub16 r3, r4, r5 ; a1 - b1 + 7 - - ldr r12, c0x08a914e8 ; [2217 | 5352] - - lsl r8, r2, #16 ; prepare bottom halfword for scaling - asr r2, r2, #4 ; scale top halfword - lsl r9, r3, #16 ; prepare bottom halfword for scaling - asr r3, r3, #4 ; scale top halfword - pkhtb r4, r2, r8, asr #20 ; pack and scale bottom halfword - pkhtb r5, r3, r9, asr #20 ; pack and scale bottom halfword - - smulbt r2, r6, r12 ; [ ------ | c1*2217] - str r4, [r1, #0] ; [ o1 | o0] - smultt r3, r6, r12 ; [c1*2217 | ------ ] - str r5, [r1, #16] ; [ o9 | o8] - - smlabb r8, r7, r12, r2 ; [ ------ | d1*5352] - smlatb r9, r7, r12, r3 ; [d1*5352 | ------ ] - - smulbb r2, r6, r12 ; [ ------ | c1*5352] - smultb r3, r6, r12 ; [c1*5352 | ------ ] - - lsls r6, r7, #16 ; d1 != 0 ? - addeq r8, r8, r11 ; c1_b*2217+d1_b*5352+12000 + (d==0) - addne r8, r8, r0 ; c1_b*2217+d1_b*5352+12000 + (d!=0) - asrs r6, r7, #16 - addeq r9, r9, r11 ; c1_t*2217+d1_t*5352+12000 + (d==0) - addne r9, r9, r0 ; c1_t*2217+d1_t*5352+12000 + (d!=0) - - smlabt r4, r7, r12, r10 ; [ ------ | d1*2217] + 51000 - smlatt r5, r7, r12, r10 ; [d1*2217 | ------ ] + 51000 - - pkhtb r9, r9, r8, asr #16 - - sub r4, r4, r2 - sub r5, r5, r3 - - ldr r3, [r1, #4] ; [i3 | i2] - - pkhtb r5, r5, r4, asr #16 ; [o13|o12] - - str r9, [r1, #8] ; [o5 | 04] - - ldr r9, [r1, #12] ; [i7 | i6] - ldr r8, [r1, #28] ; [i15|i14] - ldr r2, [r1, #20] ; [i11|i10] - str r5, [r1, #24] ; [o13|o12] - - qadd16 r4, r3, r8 ; a1 = [i3+i15 | i2+i14] - qadd16 r5, r9, r2 ; b1 = [i7+i11 | i6+i10] - - qadd16 r4, r4, lr ; a1 + 7 - - qsub16 r6, r9, r2 ; c1 = [i7-i11 | i6-i10] - qadd16 r2, r4, r5 ; a1 + b1 + 7 - qsub16 r7, r3, r8 ; d1 = [i3-i15 | i2-i14] - qsub16 r3, r4, r5 ; a1 - b1 + 7 - - lsl r8, r2, #16 ; prepare bottom halfword for scaling - asr r2, r2, #4 ; scale top halfword - lsl r9, r3, #16 ; prepare bottom halfword for scaling - asr r3, r3, #4 ; scale top halfword - pkhtb r4, r2, r8, asr #20 ; pack and scale bottom halfword - pkhtb r5, r3, r9, asr #20 ; pack and scale bottom halfword - - smulbt r2, r6, r12 ; [ ------ | c1*2217] - str r4, [r1, #4] ; [ o3 | o2] - smultt r3, r6, r12 ; [c1*2217 | ------ ] - str r5, [r1, #20] ; [ o11 | o10] - - smlabb r8, r7, r12, r2 ; [ ------ | d1*5352] - smlatb r9, r7, r12, r3 ; [d1*5352 | ------ ] - - smulbb r2, r6, r12 ; [ ------ | c1*5352] - smultb r3, r6, r12 ; [c1*5352 | ------ ] - - lsls r6, r7, #16 ; d1 != 0 ? - addeq r8, r8, r11 ; c1_b*2217+d1_b*5352+12000 + (d==0) - addne r8, r8, r0 ; c1_b*2217+d1_b*5352+12000 + (d!=0) - - asrs r6, r7, #16 - addeq r9, r9, r11 ; c1_t*2217+d1_t*5352+12000 + (d==0) - addne r9, r9, r0 ; c1_t*2217+d1_t*5352+12000 + (d!=0) - - smlabt r4, r7, r12, r10 ; [ ------ | d1*2217] + 51000 - smlatt r5, r7, r12, r10 ; [d1*2217 | ------ ] + 51000 - - pkhtb r9, r9, r8, asr #16 - - sub r4, r4, r2 - sub r5, r5, r3 - - str r9, [r1, #12] ; [o7 | o6] - pkhtb r5, r5, r4, asr #16 ; [o15|o14] - - str r5, [r1, #28] ; [o15|o14] - - ldmfd sp!, {r4 - r12, pc} - - ENDP - -; Used constants -c7500 - DCD 7500 -c14500 - DCD 14500 -c0x22a453a0 - DCD 0x22a453a0 -c0x00080008 - DCD 0x00080008 -c12000 - DCD 12000 -c51000 - DCD 51000 -c0x00070007 - DCD 0x00070007 -c0x08a914e8 - DCD 0x08a914e8 - - END diff --git a/vp9/encoder/arm/armv6/vp9_subtract_armv6.asm b/vp9/encoder/arm/armv6/vp9_subtract_armv6.asm deleted file mode 100644 index e53c1ed5b..000000000 --- a/vp9/encoder/arm/armv6/vp9_subtract_armv6.asm +++ /dev/null @@ -1,264 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_subtract_mby_armv6| - EXPORT |vp8_subtract_mbuv_armv6| - EXPORT |vp8_subtract_b_armv6| - - INCLUDE vp9_asm_enc_offsets.asm - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 BLOCK *be -; r1 BLOCKD *bd -; r2 int pitch -|vp8_subtract_b_armv6| PROC - - stmfd sp!, {r4-r9} - - ldr r4, [r0, #vp8_block_base_src] - ldr r5, [r0, #vp8_block_src] - ldr r6, [r0, #vp8_block_src_diff] - - ldr r3, [r4] - ldr r7, [r0, #vp8_block_src_stride] - add r3, r3, r5 ; src = *base_src + src - ldr r8, [r1, #vp8_blockd_predictor] - - mov r9, #4 ; loop count - -loop_block - - ldr r0, [r3], r7 ; src - ldr r1, [r8], r2 ; pred - - uxtb16 r4, r0 ; [s2 | s0] - uxtb16 r5, r1 ; [p2 | p0] - uxtb16 r0, r0, ror #8 ; [s3 | s1] - uxtb16 r1, r1, ror #8 ; [p3 | p1] - - usub16 r4, r4, r5 ; [d2 | d0] - usub16 r5, r0, r1 ; [d3 | d1] - - subs r9, r9, #1 ; decrement loop counter - - pkhbt r0, r4, r5, lsl #16 ; [d1 | d0] - pkhtb r1, r5, r4, asr #16 ; [d3 | d2] - - str r0, [r6, #0] ; diff - str r1, [r6, #4] ; diff - - add r6, r6, r2, lsl #1 ; update diff pointer - bne loop_block - - ldmfd sp!, {r4-r9} - mov pc, lr - - ENDP - - -; r0 short *diff -; r1 unsigned char *usrc -; r2 unsigned char *vsrc -; r3 unsigned char *pred -; stack int stride -|vp8_subtract_mbuv_armv6| PROC - - stmfd sp!, {r4-r12, lr} - - add r0, r0, #512 ; set *diff point to Cb - add r3, r3, #256 ; set *pred point to Cb - - mov r4, #8 ; loop count - ldr r5, [sp, #40] ; stride - - ; Subtract U block -loop_u - ldr r6, [r1] ; src (A) - ldr r7, [r3], #4 ; pred (A) - - uxtb16 r8, r6 ; [s2 | s0] (A) - uxtb16 r9, r7 ; [p2 | p0] (A) - uxtb16 r10, r6, ror #8 ; [s3 | s1] (A) - uxtb16 r11, r7, ror #8 ; [p3 | p1] (A) - - usub16 r6, r8, r9 ; [d2 | d0] (A) - usub16 r7, r10, r11 ; [d3 | d1] (A) - - ldr r10, [r1, #4] ; src (B) - ldr r11, [r3], #4 ; pred (B) - - pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A) - pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A) - - str r8, [r0], #4 ; diff (A) - uxtb16 r8, r10 ; [s2 | s0] (B) - str r9, [r0], #4 ; diff (A) - - uxtb16 r9, r11 ; [p2 | p0] (B) - uxtb16 r10, r10, ror #8 ; [s3 | s1] (B) - uxtb16 r11, r11, ror #8 ; [p3 | p1] (B) - - usub16 r6, r8, r9 ; [d2 | d0] (B) - usub16 r7, r10, r11 ; [d3 | d1] (B) - - add r1, r1, r5 ; update usrc pointer - - pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B) - pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B) - - str r8, [r0], #4 ; diff (B) - subs r4, r4, #1 ; update loop counter - str r9, [r0], #4 ; diff (B) - - bne loop_u - - mov r4, #8 ; loop count - - ; Subtract V block -loop_v - ldr r6, [r2] ; src (A) - ldr r7, [r3], #4 ; pred (A) - - uxtb16 r8, r6 ; [s2 | s0] (A) - uxtb16 r9, r7 ; [p2 | p0] (A) - uxtb16 r10, r6, ror #8 ; [s3 | s1] (A) - uxtb16 r11, r7, ror #8 ; [p3 | p1] (A) - - usub16 r6, r8, r9 ; [d2 | d0] (A) - usub16 r7, r10, r11 ; [d3 | d1] (A) - - ldr r10, [r2, #4] ; src (B) - ldr r11, [r3], #4 ; pred (B) - - pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A) - pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A) - - str r8, [r0], #4 ; diff (A) - uxtb16 r8, r10 ; [s2 | s0] (B) - str r9, [r0], #4 ; diff (A) - - uxtb16 r9, r11 ; [p2 | p0] (B) - uxtb16 r10, r10, ror #8 ; [s3 | s1] (B) - uxtb16 r11, r11, ror #8 ; [p3 | p1] (B) - - usub16 r6, r8, r9 ; [d2 | d0] (B) - usub16 r7, r10, r11 ; [d3 | d1] (B) - - add r2, r2, r5 ; update vsrc pointer - - pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B) - pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B) - - str r8, [r0], #4 ; diff (B) - subs r4, r4, #1 ; update loop counter - str r9, [r0], #4 ; diff (B) - - bne loop_v - - ldmfd sp!, {r4-r12, pc} - - ENDP - - -; r0 short *diff -; r1 unsigned char *src -; r2 unsigned char *pred -; r3 int stride -|vp8_subtract_mby_armv6| PROC - - stmfd sp!, {r4-r11} - - mov r4, #16 -loop - ldr r6, [r1] ; src (A) - ldr r7, [r2], #4 ; pred (A) - - uxtb16 r8, r6 ; [s2 | s0] (A) - uxtb16 r9, r7 ; [p2 | p0] (A) - uxtb16 r10, r6, ror #8 ; [s3 | s1] (A) - uxtb16 r11, r7, ror #8 ; [p3 | p1] (A) - - usub16 r6, r8, r9 ; [d2 | d0] (A) - usub16 r7, r10, r11 ; [d3 | d1] (A) - - ldr r10, [r1, #4] ; src (B) - ldr r11, [r2], #4 ; pred (B) - - pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A) - pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A) - - str r8, [r0], #4 ; diff (A) - uxtb16 r8, r10 ; [s2 | s0] (B) - str r9, [r0], #4 ; diff (A) - - uxtb16 r9, r11 ; [p2 | p0] (B) - uxtb16 r10, r10, ror #8 ; [s3 | s1] (B) - uxtb16 r11, r11, ror #8 ; [p3 | p1] (B) - - usub16 r6, r8, r9 ; [d2 | d0] (B) - usub16 r7, r10, r11 ; [d3 | d1] (B) - - ldr r10, [r1, #8] ; src (C) - ldr r11, [r2], #4 ; pred (C) - - pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B) - pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B) - - str r8, [r0], #4 ; diff (B) - uxtb16 r8, r10 ; [s2 | s0] (C) - str r9, [r0], #4 ; diff (B) - - uxtb16 r9, r11 ; [p2 | p0] (C) - uxtb16 r10, r10, ror #8 ; [s3 | s1] (C) - uxtb16 r11, r11, ror #8 ; [p3 | p1] (C) - - usub16 r6, r8, r9 ; [d2 | d0] (C) - usub16 r7, r10, r11 ; [d3 | d1] (C) - - ldr r10, [r1, #12] ; src (D) - ldr r11, [r2], #4 ; pred (D) - - pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (C) - pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (C) - - str r8, [r0], #4 ; diff (C) - uxtb16 r8, r10 ; [s2 | s0] (D) - str r9, [r0], #4 ; diff (C) - - uxtb16 r9, r11 ; [p2 | p0] (D) - uxtb16 r10, r10, ror #8 ; [s3 | s1] (D) - uxtb16 r11, r11, ror #8 ; [p3 | p1] (D) - - usub16 r6, r8, r9 ; [d2 | d0] (D) - usub16 r7, r10, r11 ; [d3 | d1] (D) - - add r1, r1, r3 ; update src pointer - - pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (D) - pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (D) - - str r8, [r0], #4 ; diff (D) - subs r4, r4, #1 ; update loop counter - str r9, [r0], #4 ; diff (D) - - bne loop - - ldmfd sp!, {r4-r11} - mov pc, lr - - ENDP - - END diff --git a/vp9/encoder/arm/armv6/vp9_variance16x16_armv6.asm b/vp9/encoder/arm/armv6/vp9_variance16x16_armv6.asm deleted file mode 100644 index aa4727e66..000000000 --- a/vp9/encoder/arm/armv6/vp9_variance16x16_armv6.asm +++ /dev/null @@ -1,153 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp9_variance16x16_armv6| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|vp9_variance16x16_armv6| PROC - - stmfd sp!, {r4-r12, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r8, #0 ; initialize sum = 0 - mov r11, #0 ; initialize sse = 0 - mov r12, #16 ; set loop counter to 16 (=block height) - -loop - ; 1st 4 pixels - ldr r4, [r0, #0] ; load 4 src pixels - ldr r5, [r2, #0] ; load 4 ref pixels - - mov lr, #0 ; constant zero - - usub8 r6, r4, r5 ; calculate difference - pld [r0, r1, lsl #1] - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - ; calculate total sum - adds r8, r8, r4 ; add positive differences to sum - subs r8, r8, r5 ; substract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r4, [r0, #4] ; load 4 src pixels - ldr r5, [r2, #4] ; load 4 ref pixels - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 3rd 4 pixels - ldr r4, [r0, #8] ; load 4 src pixels - ldr r5, [r2, #8] ; load 4 ref pixels - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 4th 4 pixels - ldr r4, [r0, #12] ; load 4 src pixels - ldr r5, [r2, #12] ; load 4 ref pixels - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - - subs r12, r12, #1 - - bne loop - - ; return stuff - ldr r6, [sp, #40] ; get address of sse - mul r0, r8, r8 ; sum * sum - str r11, [r6] ; store sse - sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8)) - - ldmfd sp!, {r4-r12, pc} - - ENDP - - END diff --git a/vp9/encoder/arm/armv6/vp9_variance8x8_armv6.asm b/vp9/encoder/arm/armv6/vp9_variance8x8_armv6.asm deleted file mode 100644 index 101f6838d..000000000 --- a/vp9/encoder/arm/armv6/vp9_variance8x8_armv6.asm +++ /dev/null @@ -1,101 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp9_variance8x8_armv6| - - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|vp9_variance8x8_armv6| PROC - - push {r4-r10, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r12, #8 ; set loop counter to 8 (=block height) - mov r4, #0 ; initialize sum = 0 - mov r5, #0 ; initialize sse = 0 - -loop - ; 1st 4 pixels - ldr r6, [r0, #0x0] ; load 4 src pixels - ldr r7, [r2, #0x0] ; load 4 ref pixels - - mov lr, #0 ; constant zero - - usub8 r8, r6, r7 ; calculate difference - pld [r0, r1, lsl #1] - sel r10, r8, lr ; select bytes with positive difference - usub8 r9, r7, r6 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r6, r10, lr ; calculate sum of positive differences - usad8 r7, r8, lr ; calculate sum of negative differences - orr r8, r8, r10 ; differences of all 4 pixels - ; calculate total sum - add r4, r4, r6 ; add positive differences to sum - sub r4, r4, r7 ; substract negative differences from sum - - ; calculate sse - uxtb16 r7, r8 ; byte (two pixels) to halfwords - uxtb16 r10, r8, ror #8 ; another two pixels to halfwords - smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r6, [r0, #0x4] ; load 4 src pixels - ldr r7, [r2, #0x4] ; load 4 ref pixels - smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) - - usub8 r8, r6, r7 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r10, r8, lr ; select bytes with positive difference - usub8 r9, r7, r6 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r6, r10, lr ; calculate sum of positive differences - usad8 r7, r8, lr ; calculate sum of negative differences - orr r8, r8, r10 ; differences of all 4 pixels - - ; calculate total sum - add r4, r4, r6 ; add positive differences to sum - sub r4, r4, r7 ; substract negative differences from sum - - ; calculate sse - uxtb16 r7, r8 ; byte (two pixels) to halfwords - uxtb16 r10, r8, ror #8 ; another two pixels to halfwords - smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) - subs r12, r12, #1 ; next row - smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) - - bne loop - - ; return stuff - ldr r8, [sp, #32] ; get address of sse - mul r1, r4, r4 ; sum * sum - str r5, [r8] ; store sse - sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6)) - - pop {r4-r10, pc} - - ENDP - - END diff --git a/vp9/encoder/arm/armv6/vp9_variance_halfpixvar16x16_h_armv6.asm b/vp9/encoder/arm/armv6/vp9_variance_halfpixvar16x16_h_armv6.asm deleted file mode 100644 index e25436c22..000000000 --- a/vp9/encoder/arm/armv6/vp9_variance_halfpixvar16x16_h_armv6.asm +++ /dev/null @@ -1,181 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp9_variance_halfpixvar16x16_h_armv6| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|vp9_variance_halfpixvar16x16_h_armv6| PROC - - stmfd sp!, {r4-r12, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r8, #0 ; initialize sum = 0 - ldr r10, c80808080 - mov r11, #0 ; initialize sse = 0 - mov r12, #16 ; set loop counter to 16 (=block height) - mov lr, #0 ; constant zero -loop - ; 1st 4 pixels - ldr r4, [r0, #0] ; load 4 src pixels - ldr r6, [r0, #1] ; load 4 src pixels with 1 byte offset - ldr r5, [r2, #0] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - usub8 r6, r4, r5 ; calculate difference - pld [r0, r1, lsl #1] - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - ; calculate total sum - adds r8, r8, r4 ; add positive differences to sum - subs r8, r8, r5 ; substract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r4, [r0, #4] ; load 4 src pixels - ldr r6, [r0, #5] ; load 4 src pixels with 1 byte offset - ldr r5, [r2, #4] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 3rd 4 pixels - ldr r4, [r0, #8] ; load 4 src pixels - ldr r6, [r0, #9] ; load 4 src pixels with 1 byte offset - ldr r5, [r2, #8] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 4th 4 pixels - ldr r4, [r0, #12] ; load 4 src pixels - ldr r6, [r0, #13] ; load 4 src pixels with 1 byte offset - ldr r5, [r2, #12] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - subs r12, r12, #1 - - bne loop - - ; return stuff - ldr r6, [sp, #40] ; get address of sse - mul r0, r8, r8 ; sum * sum - str r11, [r6] ; store sse - sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8)) - - ldmfd sp!, {r4-r12, pc} - - ENDP - -c80808080 - DCD 0x80808080 - - END diff --git a/vp9/encoder/arm/armv6/vp9_variance_halfpixvar16x16_hv_armv6.asm b/vp9/encoder/arm/armv6/vp9_variance_halfpixvar16x16_hv_armv6.asm deleted file mode 100644 index 6ad5e90bb..000000000 --- a/vp9/encoder/arm/armv6/vp9_variance_halfpixvar16x16_hv_armv6.asm +++ /dev/null @@ -1,222 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp9_variance_halfpixvar16x16_hv_armv6| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|vp9_variance_halfpixvar16x16_hv_armv6| PROC - - stmfd sp!, {r4-r12, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r8, #0 ; initialize sum = 0 - ldr r10, c80808080 - mov r11, #0 ; initialize sse = 0 - mov r12, #16 ; set loop counter to 16 (=block height) - mov lr, #0 ; constant zero -loop - add r9, r0, r1 ; pointer to pixels on the next row - ; 1st 4 pixels - ldr r4, [r0, #0] ; load source pixels a, row N - ldr r6, [r0, #1] ; load source pixels b, row N - ldr r5, [r9, #0] ; load source pixels c, row N+1 - ldr r7, [r9, #1] ; load source pixels d, row N+1 - - ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 - mvn r7, r7 - uhsub8 r5, r5, r7 - eor r5, r5, r10 - ; z = (x + y + 1) >> 1, interpolate half pixel values vertically - mvn r5, r5 - uhsub8 r4, r4, r5 - ldr r5, [r2, #0] ; load 4 ref pixels - eor r4, r4, r10 - - usub8 r6, r4, r5 ; calculate difference - pld [r0, r1, lsl #1] - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - ; calculate total sum - adds r8, r8, r4 ; add positive differences to sum - subs r8, r8, r5 ; substract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r4, [r0, #4] ; load source pixels a, row N - ldr r6, [r0, #5] ; load source pixels b, row N - ldr r5, [r9, #4] ; load source pixels c, row N+1 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - ldr r7, [r9, #5] ; load source pixels d, row N+1 - - ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 - mvn r7, r7 - uhsub8 r5, r5, r7 - eor r5, r5, r10 - ; z = (x + y + 1) >> 1, interpolate half pixel values vertically - mvn r5, r5 - uhsub8 r4, r4, r5 - ldr r5, [r2, #4] ; load 4 ref pixels - eor r4, r4, r10 - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 3rd 4 pixels - ldr r4, [r0, #8] ; load source pixels a, row N - ldr r6, [r0, #9] ; load source pixels b, row N - ldr r5, [r9, #8] ; load source pixels c, row N+1 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - ldr r7, [r9, #9] ; load source pixels d, row N+1 - - ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 - mvn r7, r7 - uhsub8 r5, r5, r7 - eor r5, r5, r10 - ; z = (x + y + 1) >> 1, interpolate half pixel values vertically - mvn r5, r5 - uhsub8 r4, r4, r5 - ldr r5, [r2, #8] ; load 4 ref pixels - eor r4, r4, r10 - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 4th 4 pixels - ldr r4, [r0, #12] ; load source pixels a, row N - ldr r6, [r0, #13] ; load source pixels b, row N - ldr r5, [r9, #12] ; load source pixels c, row N+1 - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - ldr r7, [r9, #13] ; load source pixels d, row N+1 - - ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 - mvn r7, r7 - uhsub8 r5, r5, r7 - eor r5, r5, r10 - ; z = (x + y + 1) >> 1, interpolate half pixel values vertically - mvn r5, r5 - uhsub8 r4, r4, r5 - ldr r5, [r2, #12] ; load 4 ref pixels - eor r4, r4, r10 - - usub8 r6, r4, r5 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - subs r12, r12, #1 - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - bne loop - - ; return stuff - ldr r6, [sp, #40] ; get address of sse - mul r0, r8, r8 ; sum * sum - str r11, [r6] ; store sse - sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8)) - - ldmfd sp!, {r4-r12, pc} - - ENDP - -c80808080 - DCD 0x80808080 - - END diff --git a/vp9/encoder/arm/armv6/vp9_variance_halfpixvar16x16_v_armv6.asm b/vp9/encoder/arm/armv6/vp9_variance_halfpixvar16x16_v_armv6.asm deleted file mode 100644 index c1ac5a1cb..000000000 --- a/vp9/encoder/arm/armv6/vp9_variance_halfpixvar16x16_v_armv6.asm +++ /dev/null @@ -1,183 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp9_variance_halfpixvar16x16_v_armv6| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|vp9_variance_halfpixvar16x16_v_armv6| PROC - - stmfd sp!, {r4-r12, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r8, #0 ; initialize sum = 0 - ldr r10, c80808080 - mov r11, #0 ; initialize sse = 0 - mov r12, #16 ; set loop counter to 16 (=block height) - mov lr, #0 ; constant zero -loop - add r9, r0, r1 ; set src pointer to next row - ; 1st 4 pixels - ldr r4, [r0, #0] ; load 4 src pixels - ldr r6, [r9, #0] ; load 4 src pixels from next row - ldr r5, [r2, #0] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - usub8 r6, r4, r5 ; calculate difference - pld [r0, r1, lsl #1] - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - ; calculate total sum - adds r8, r8, r4 ; add positive differences to sum - subs r8, r8, r5 ; substract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r4, [r0, #4] ; load 4 src pixels - ldr r6, [r9, #4] ; load 4 src pixels from next row - ldr r5, [r2, #4] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 3rd 4 pixels - ldr r4, [r0, #8] ; load 4 src pixels - ldr r6, [r9, #8] ; load 4 src pixels from next row - ldr r5, [r2, #8] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 4th 4 pixels - ldr r4, [r0, #12] ; load 4 src pixels - ldr r6, [r9, #12] ; load 4 src pixels from next row - ldr r5, [r2, #12] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; substract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - - subs r12, r12, #1 - - bne loop - - ; return stuff - ldr r6, [sp, #40] ; get address of sse - mul r0, r8, r8 ; sum * sum - str r11, [r6] ; store sse - sub r0, r11, r0, asr #8 ; return (sse - ((sum * sum) >> 8)) - - ldmfd sp!, {r4-r12, pc} - - ENDP - -c80808080 - DCD 0x80808080 - - END diff --git a/vp9/encoder/arm/armv6/vp9_walsh_v6.asm b/vp9/encoder/arm/armv6/vp9_walsh_v6.asm deleted file mode 100644 index 5eaf3f25a..000000000 --- a/vp9/encoder/arm/armv6/vp9_walsh_v6.asm +++ /dev/null @@ -1,212 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - EXPORT |vp8_short_walsh4x4_armv6| - - ARM - REQUIRE8 - PRESERVE8 - - AREA |.text|, CODE, READONLY ; name this block of code - -;short vp8_short_walsh4x4_armv6(short *input, short *output, int pitch) -; r0 short *input, -; r1 short *output, -; r2 int pitch -|vp8_short_walsh4x4_armv6| PROC - - stmdb sp!, {r4 - r11, lr} - - ldrd r4, r5, [r0], r2 - ldr lr, c00040004 - ldrd r6, r7, [r0], r2 - - ; 0-3 - qadd16 r3, r4, r5 ; [d1|a1] [1+3 | 0+2] - qsub16 r4, r4, r5 ; [c1|b1] [1-3 | 0-2] - - ldrd r8, r9, [r0], r2 - ; 4-7 - qadd16 r5, r6, r7 ; [d1|a1] [5+7 | 4+6] - qsub16 r6, r6, r7 ; [c1|b1] [5-7 | 4-6] - - ldrd r10, r11, [r0] - ; 8-11 - qadd16 r7, r8, r9 ; [d1|a1] [9+11 | 8+10] - qsub16 r8, r8, r9 ; [c1|b1] [9-11 | 8-10] - - ; 12-15 - qadd16 r9, r10, r11 ; [d1|a1] [13+15 | 12+14] - qsub16 r10, r10, r11 ; [c1|b1] [13-15 | 12-14] - - - lsls r2, r3, #16 - smuad r11, r3, lr ; A0 = a1<<2 + d1<<2 - addne r11, r11, #1 ; A0 += (a1!=0) - - lsls r2, r7, #16 - smuad r12, r7, lr ; C0 = a1<<2 + d1<<2 - addne r12, r12, #1 ; C0 += (a1!=0) - - add r0, r11, r12 ; a1_0 = A0 + C0 - sub r11, r11, r12 ; b1_0 = A0 - C0 - - lsls r2, r5, #16 - smuad r12, r5, lr ; B0 = a1<<2 + d1<<2 - addne r12, r12, #1 ; B0 += (a1!=0) - - lsls r2, r9, #16 - smuad r2, r9, lr ; D0 = a1<<2 + d1<<2 - addne r2, r2, #1 ; D0 += (a1!=0) - - add lr, r12, r2 ; d1_0 = B0 + D0 - sub r12, r12, r2 ; c1_0 = B0 - D0 - - ; op[0,4,8,12] - adds r2, r0, lr ; a2 = a1_0 + d1_0 - addmi r2, r2, #1 ; += a2 < 0 - add r2, r2, #3 ; += 3 - subs r0, r0, lr ; d2 = a1_0 - d1_0 - mov r2, r2, asr #3 ; >> 3 - strh r2, [r1] ; op[0] - - addmi r0, r0, #1 ; += a2 < 0 - add r0, r0, #3 ; += 3 - ldr lr, c00040004 - mov r0, r0, asr #3 ; >> 3 - strh r0, [r1, #24] ; op[12] - - adds r2, r11, r12 ; b2 = b1_0 + c1_0 - addmi r2, r2, #1 ; += a2 < 0 - add r2, r2, #3 ; += 3 - subs r0, r11, r12 ; c2 = b1_0 - c1_0 - mov r2, r2, asr #3 ; >> 3 - strh r2, [r1, #8] ; op[4] - - addmi r0, r0, #1 ; += a2 < 0 - add r0, r0, #3 ; += 3 - smusd r3, r3, lr ; A3 = a1<<2 - d1<<2 - smusd r7, r7, lr ; C3 = a1<<2 - d1<<2 - mov r0, r0, asr #3 ; >> 3 - strh r0, [r1, #16] ; op[8] - - - ; op[3,7,11,15] - add r0, r3, r7 ; a1_3 = A3 + C3 - sub r3, r3, r7 ; b1_3 = A3 - C3 - - smusd r5, r5, lr ; B3 = a1<<2 - d1<<2 - smusd r9, r9, lr ; D3 = a1<<2 - d1<<2 - add r7, r5, r9 ; d1_3 = B3 + D3 - sub r5, r5, r9 ; c1_3 = B3 - D3 - - adds r2, r0, r7 ; a2 = a1_3 + d1_3 - addmi r2, r2, #1 ; += a2 < 0 - add r2, r2, #3 ; += 3 - adds r9, r3, r5 ; b2 = b1_3 + c1_3 - mov r2, r2, asr #3 ; >> 3 - strh r2, [r1, #6] ; op[3] - - addmi r9, r9, #1 ; += a2 < 0 - add r9, r9, #3 ; += 3 - subs r2, r3, r5 ; c2 = b1_3 - c1_3 - mov r9, r9, asr #3 ; >> 3 - strh r9, [r1, #14] ; op[7] - - addmi r2, r2, #1 ; += a2 < 0 - add r2, r2, #3 ; += 3 - subs r9, r0, r7 ; d2 = a1_3 - d1_3 - mov r2, r2, asr #3 ; >> 3 - strh r2, [r1, #22] ; op[11] - - addmi r9, r9, #1 ; += a2 < 0 - add r9, r9, #3 ; += 3 - smuad r3, r4, lr ; A1 = b1<<2 + c1<<2 - smuad r5, r8, lr ; C1 = b1<<2 + c1<<2 - mov r9, r9, asr #3 ; >> 3 - strh r9, [r1, #30] ; op[15] - - ; op[1,5,9,13] - add r0, r3, r5 ; a1_1 = A1 + C1 - sub r3, r3, r5 ; b1_1 = A1 - C1 - - smuad r7, r6, lr ; B1 = b1<<2 + c1<<2 - smuad r9, r10, lr ; D1 = b1<<2 + c1<<2 - add r5, r7, r9 ; d1_1 = B1 + D1 - sub r7, r7, r9 ; c1_1 = B1 - D1 - - adds r2, r0, r5 ; a2 = a1_1 + d1_1 - addmi r2, r2, #1 ; += a2 < 0 - add r2, r2, #3 ; += 3 - adds r9, r3, r7 ; b2 = b1_1 + c1_1 - mov r2, r2, asr #3 ; >> 3 - strh r2, [r1, #2] ; op[1] - - addmi r9, r9, #1 ; += a2 < 0 - add r9, r9, #3 ; += 3 - subs r2, r3, r7 ; c2 = b1_1 - c1_1 - mov r9, r9, asr #3 ; >> 3 - strh r9, [r1, #10] ; op[5] - - addmi r2, r2, #1 ; += a2 < 0 - add r2, r2, #3 ; += 3 - subs r9, r0, r5 ; d2 = a1_1 - d1_1 - mov r2, r2, asr #3 ; >> 3 - strh r2, [r1, #18] ; op[9] - - addmi r9, r9, #1 ; += a2 < 0 - add r9, r9, #3 ; += 3 - smusd r4, r4, lr ; A2 = b1<<2 - c1<<2 - smusd r8, r8, lr ; C2 = b1<<2 - c1<<2 - mov r9, r9, asr #3 ; >> 3 - strh r9, [r1, #26] ; op[13] - - - ; op[2,6,10,14] - add r11, r4, r8 ; a1_2 = A2 + C2 - sub r12, r4, r8 ; b1_2 = A2 - C2 - - smusd r6, r6, lr ; B2 = b1<<2 - c1<<2 - smusd r10, r10, lr ; D2 = b1<<2 - c1<<2 - add r4, r6, r10 ; d1_2 = B2 + D2 - sub r8, r6, r10 ; c1_2 = B2 - D2 - - adds r2, r11, r4 ; a2 = a1_2 + d1_2 - addmi r2, r2, #1 ; += a2 < 0 - add r2, r2, #3 ; += 3 - adds r9, r12, r8 ; b2 = b1_2 + c1_2 - mov r2, r2, asr #3 ; >> 3 - strh r2, [r1, #4] ; op[2] - - addmi r9, r9, #1 ; += a2 < 0 - add r9, r9, #3 ; += 3 - subs r2, r12, r8 ; c2 = b1_2 - c1_2 - mov r9, r9, asr #3 ; >> 3 - strh r9, [r1, #12] ; op[6] - - addmi r2, r2, #1 ; += a2 < 0 - add r2, r2, #3 ; += 3 - subs r9, r11, r4 ; d2 = a1_2 - d1_2 - mov r2, r2, asr #3 ; >> 3 - strh r2, [r1, #20] ; op[10] - - addmi r9, r9, #1 ; += a2 < 0 - add r9, r9, #3 ; += 3 - mov r9, r9, asr #3 ; >> 3 - strh r9, [r1, #28] ; op[14] - - - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp8_short_walsh4x4_armv6| - -c00040004 - DCD 0x00040004 - - END diff --git a/vp9/encoder/arm/neon/vp9_fastquantizeb_neon.asm b/vp9/encoder/arm/neon/vp9_fastquantizeb_neon.asm deleted file mode 100644 index c68233617..000000000 --- a/vp9/encoder/arm/neon/vp9_fastquantizeb_neon.asm +++ /dev/null @@ -1,261 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_fast_quantize_b_neon| - EXPORT |vp8_fast_quantize_b_pair_neon| - - INCLUDE vp9_asm_enc_offsets.asm - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=4 - -;vp8_fast_quantize_b_pair_neon(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2); -|vp8_fast_quantize_b_pair_neon| PROC - - stmfd sp!, {r4-r9} - vstmdb sp!, {q4-q7} - - ldr r4, [r0, #vp8_block_coeff] - ldr r5, [r0, #vp8_block_quant_fast] - ldr r6, [r0, #vp8_block_round] - - vld1.16 {q0, q1}, [r4@128] ; load z - - ldr r7, [r2, #vp8_blockd_qcoeff] - - vabs.s16 q4, q0 ; calculate x = abs(z) - vabs.s16 q5, q1 - - ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative - vshr.s16 q2, q0, #15 ; sz - vshr.s16 q3, q1, #15 - - vld1.s16 {q6, q7}, [r6@128] ; load round_ptr [0-15] - vld1.s16 {q8, q9}, [r5@128] ; load quant_ptr [0-15] - - ldr r4, [r1, #vp8_block_coeff] - - vadd.s16 q4, q6 ; x + Round - vadd.s16 q5, q7 - - vld1.16 {q0, q1}, [r4@128] ; load z2 - - vqdmulh.s16 q4, q8 ; y = ((Round+abs(z)) * Quant) >> 16 - vqdmulh.s16 q5, q9 - - vabs.s16 q10, q0 ; calculate x2 = abs(z_2) - vabs.s16 q11, q1 - vshr.s16 q12, q0, #15 ; sz2 - vshr.s16 q13, q1, #15 - - ;modify data to have its original sign - veor.s16 q4, q2 ; y^sz - veor.s16 q5, q3 - - vadd.s16 q10, q6 ; x2 + Round - vadd.s16 q11, q7 - - ldr r8, [r2, #vp8_blockd_dequant] - - vqdmulh.s16 q10, q8 ; y2 = ((Round+abs(z)) * Quant) >> 16 - vqdmulh.s16 q11, q9 - - vshr.s16 q4, #1 ; right shift 1 after vqdmulh - vshr.s16 q5, #1 - - vld1.s16 {q6, q7}, [r8@128] ;load dequant_ptr[i] - - vsub.s16 q4, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement) - vsub.s16 q5, q3 - - vshr.s16 q10, #1 ; right shift 1 after vqdmulh - vshr.s16 q11, #1 - - ldr r9, [r2, #vp8_blockd_dqcoeff] - - veor.s16 q10, q12 ; y2^sz2 - veor.s16 q11, q13 - - vst1.s16 {q4, q5}, [r7] ; store: qcoeff = x1 - - - vsub.s16 q10, q12 ; x2=(y^sz)-sz = (y^sz)-(-1) (2's complement) - vsub.s16 q11, q13 - - ldr r6, [r3, #vp8_blockd_qcoeff] - - vmul.s16 q2, q6, q4 ; x * Dequant - vmul.s16 q3, q7, q5 - - ldr r0, _inv_zig_zag_ ; load ptr of inverse zigzag table - - vceq.s16 q8, q8 ; set q8 to all 1 - - vst1.s16 {q10, q11}, [r6] ; store: qcoeff = x2 - - vmul.s16 q12, q6, q10 ; x2 * Dequant - vmul.s16 q13, q7, q11 - - vld1.16 {q6, q7}, [r0@128] ; load inverse scan order - - vtst.16 q14, q4, q8 ; now find eob - vtst.16 q15, q5, q8 ; non-zero element is set to all 1 - - vst1.s16 {q2, q3}, [r9] ; store dqcoeff = x * Dequant - - ldr r7, [r3, #vp8_blockd_dqcoeff] - - vand q0, q6, q14 ; get all valid numbers from scan array - vand q1, q7, q15 - - vst1.s16 {q12, q13}, [r7] ; store dqcoeff = x * Dequant - - vtst.16 q2, q10, q8 ; now find eob - vtst.16 q3, q11, q8 ; non-zero element is set to all 1 - - vmax.u16 q0, q0, q1 ; find maximum value in q0, q1 - - vand q10, q6, q2 ; get all valid numbers from scan array - vand q11, q7, q3 - vmax.u16 q10, q10, q11 ; find maximum value in q10, q11 - - vmax.u16 d0, d0, d1 - vmax.u16 d20, d20, d21 - vmovl.u16 q0, d0 - vmovl.u16 q10, d20 - - - vmax.u32 d0, d0, d1 - vmax.u32 d20, d20, d21 - vpmax.u32 d0, d0, d0 - vpmax.u32 d20, d20, d20 - - add r4, r2, #vp8_blockd_eob - add r5, r3, #vp8_blockd_eob - - vst1.32 {d0[0]}, [r4@32] - vst1.32 {d20[0]}, [r5@32] - - vldmia sp!, {q4-q7} - ldmfd sp!, {r4-r9} - bx lr - - ENDP - -;void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d) -|vp8_fast_quantize_b_neon| PROC - - stmfd sp!, {r4-r7} - - ldr r3, [r0, #vp8_block_coeff] - ldr r4, [r0, #vp8_block_quant_fast] - ldr r5, [r0, #vp8_block_round] - - vld1.16 {q0, q1}, [r3@128] ; load z - vorr.s16 q14, q0, q1 ; check if all zero (step 1) - ldr r6, [r1, #vp8_blockd_qcoeff] - ldr r7, [r1, #vp8_blockd_dqcoeff] - vorr.s16 d28, d28, d29 ; check if all zero (step 2) - - vabs.s16 q12, q0 ; calculate x = abs(z) - vabs.s16 q13, q1 - - ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative - vshr.s16 q2, q0, #15 ; sz - vmov r2, r3, d28 ; check if all zero (step 3) - vshr.s16 q3, q1, #15 - - vld1.s16 {q14, q15}, [r5@128]; load round_ptr [0-15] - vld1.s16 {q8, q9}, [r4@128] ; load quant_ptr [0-15] - - vadd.s16 q12, q14 ; x + Round - vadd.s16 q13, q15 - - ldr r0, _inv_zig_zag_ ; load ptr of inverse zigzag table - - vqdmulh.s16 q12, q8 ; y = ((Round+abs(z)) * Quant) >> 16 - vqdmulh.s16 q13, q9 - - vld1.16 {q10, q11}, [r0@128]; load inverse scan order - - vceq.s16 q8, q8 ; set q8 to all 1 - - ldr r4, [r1, #vp8_blockd_dequant] - - vshr.s16 q12, #1 ; right shift 1 after vqdmulh - vshr.s16 q13, #1 - - orr r2, r2, r3 ; check if all zero (step 4) - cmp r2, #0 ; check if all zero (step 5) - beq zero_output ; check if all zero (step 6) - - ;modify data to have its original sign - veor.s16 q12, q2 ; y^sz - veor.s16 q13, q3 - - vsub.s16 q12, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement) - vsub.s16 q13, q3 - - vld1.s16 {q2, q3}, [r4@128] ; load dequant_ptr[i] - - vtst.16 q14, q12, q8 ; now find eob - vtst.16 q15, q13, q8 ; non-zero element is set to all 1 - - vst1.s16 {q12, q13}, [r6@128]; store: qcoeff = x1 - - vand q10, q10, q14 ; get all valid numbers from scan array - vand q11, q11, q15 - - - vmax.u16 q0, q10, q11 ; find maximum value in q0, q1 - vmax.u16 d0, d0, d1 - vmovl.u16 q0, d0 - - vmul.s16 q2, q12 ; x * Dequant - vmul.s16 q3, q13 - - vmax.u32 d0, d0, d1 - vpmax.u32 d0, d0, d0 - - vst1.s16 {q2, q3}, [r7@128] ; store dqcoeff = x * Dequant - - add r4, r1, #vp8_blockd_eob - vst1.32 {d0[0]}, [r4@32] - - ldmfd sp!, {r4-r7} - bx lr - -zero_output - str r2, [r1, #vp8_blockd_eob] - vst1.s16 {q0, q1}, [r6@128] ; qcoeff = 0 - vst1.s16 {q0, q1}, [r7@128] ; dqcoeff = 0 - - ldmfd sp!, {r4-r7} - bx lr - - ENDP - -; default inverse zigzag table is defined in vp9/common/vp9_entropy.c -_inv_zig_zag_ - DCD inv_zig_zag - - ALIGN 16 ; enable use of @128 bit aligned loads -inv_zig_zag - DCW 0x0001, 0x0002, 0x0006, 0x0007 - DCW 0x0003, 0x0005, 0x0008, 0x000d - DCW 0x0004, 0x0009, 0x000c, 0x000e - DCW 0x000a, 0x000b, 0x000f, 0x0010 - - END - diff --git a/vp9/encoder/arm/neon/vp9_memcpy_neon.asm b/vp9/encoder/arm/neon/vp9_memcpy_neon.asm deleted file mode 100644 index b0450e523..000000000 --- a/vp9/encoder/arm/neon/vp9_memcpy_neon.asm +++ /dev/null @@ -1,68 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_memcpy_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;========================================= -;void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz); -|vp8_memcpy_neon| PROC - ;pld [r1] ;preload pred data - ;pld [r1, #128] - ;pld [r1, #256] - ;pld [r1, #384] - - mov r12, r2, lsr #8 ;copy 256 bytes data at one time - -memcpy_neon_loop - vld1.8 {q0, q1}, [r1]! ;load src data - subs r12, r12, #1 - vld1.8 {q2, q3}, [r1]! - vst1.8 {q0, q1}, [r0]! ;copy to dst_ptr - vld1.8 {q4, q5}, [r1]! - vst1.8 {q2, q3}, [r0]! - vld1.8 {q6, q7}, [r1]! - vst1.8 {q4, q5}, [r0]! - vld1.8 {q8, q9}, [r1]! - vst1.8 {q6, q7}, [r0]! - vld1.8 {q10, q11}, [r1]! - vst1.8 {q8, q9}, [r0]! - vld1.8 {q12, q13}, [r1]! - vst1.8 {q10, q11}, [r0]! - vld1.8 {q14, q15}, [r1]! - vst1.8 {q12, q13}, [r0]! - vst1.8 {q14, q15}, [r0]! - - ;pld [r1] ;preload pred data -- need to adjust for real device - ;pld [r1, #128] - ;pld [r1, #256] - ;pld [r1, #384] - - bne memcpy_neon_loop - - ands r3, r2, #0xff ;extra copy - beq done_copy_neon_loop - -extra_copy_neon_loop - vld1.8 {q0}, [r1]! ;load src data - subs r3, r3, #16 - vst1.8 {q0}, [r0]! - bne extra_copy_neon_loop - -done_copy_neon_loop - bx lr - ENDP - - END diff --git a/vp9/encoder/arm/neon/vp9_mse16x16_neon.asm b/vp9/encoder/arm/neon/vp9_mse16x16_neon.asm deleted file mode 100644 index 4d1512d40..000000000 --- a/vp9/encoder/arm/neon/vp9_mse16x16_neon.asm +++ /dev/null @@ -1,116 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_mse16x16_neon| - EXPORT |vp8_get4x4sse_cs_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;============================ -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -;note: in this function, sum is never used. So, we can remove this part of calculation -;from vp9_variance(). - -|vp8_mse16x16_neon| PROC - vmov.i8 q7, #0 ;q7, q8, q9, q10 - sse - vmov.i8 q8, #0 - vmov.i8 q9, #0 - vmov.i8 q10, #0 - - mov r12, #8 - -mse16x16_neon_loop - vld1.8 {q0}, [r0], r1 ;Load up source and reference - vld1.8 {q2}, [r2], r3 - vld1.8 {q1}, [r0], r1 - vld1.8 {q3}, [r2], r3 - - vsubl.u8 q11, d0, d4 - vsubl.u8 q12, d1, d5 - vsubl.u8 q13, d2, d6 - vsubl.u8 q14, d3, d7 - - vmlal.s16 q7, d22, d22 - vmlal.s16 q8, d23, d23 - - subs r12, r12, #1 - - vmlal.s16 q9, d24, d24 - vmlal.s16 q10, d25, d25 - vmlal.s16 q7, d26, d26 - vmlal.s16 q8, d27, d27 - vmlal.s16 q9, d28, d28 - vmlal.s16 q10, d29, d29 - - bne mse16x16_neon_loop - - vadd.u32 q7, q7, q8 - vadd.u32 q9, q9, q10 - - ldr r12, [sp] ;load *sse from stack - - vadd.u32 q10, q7, q9 - vpaddl.u32 q1, q10 - vadd.u64 d0, d2, d3 - - vst1.32 {d0[0]}, [r12] - vmov.32 r0, d0[0] - - bx lr - - ENDP - - -;============================= -; r0 unsigned char *src_ptr, -; r1 int source_stride, -; r2 unsigned char *ref_ptr, -; r3 int recon_stride -|vp8_get4x4sse_cs_neon| PROC - vld1.8 {d0}, [r0], r1 ;Load up source and reference - vld1.8 {d4}, [r2], r3 - vld1.8 {d1}, [r0], r1 - vld1.8 {d5}, [r2], r3 - vld1.8 {d2}, [r0], r1 - vld1.8 {d6}, [r2], r3 - vld1.8 {d3}, [r0], r1 - vld1.8 {d7}, [r2], r3 - - vsubl.u8 q11, d0, d4 - vsubl.u8 q12, d1, d5 - vsubl.u8 q13, d2, d6 - vsubl.u8 q14, d3, d7 - - vmull.s16 q7, d22, d22 - vmull.s16 q8, d24, d24 - vmull.s16 q9, d26, d26 - vmull.s16 q10, d28, d28 - - vadd.u32 q7, q7, q8 - vadd.u32 q9, q9, q10 - vadd.u32 q9, q7, q9 - - vpaddl.u32 q1, q9 - vadd.u64 d0, d2, d3 - - vmov.32 r0, d0[0] - bx lr - - ENDP - - END diff --git a/vp9/encoder/arm/neon/vp9_picklpf_arm.c b/vp9/encoder/arm/neon/vp9_picklpf_arm.c deleted file mode 100644 index b427e5ef7..000000000 --- a/vp9/encoder/arm/neon/vp9_picklpf_arm.c +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vp9/common/vp9_onyxc_int.h" -#include "vp9/encoder/vp9_onyx_int.h" -#include "vp9/encoder/vp9_quantize.h" -#include "vpx_mem/vpx_mem.h" -#include "vpx_scale/vpxscale.h" -#include "vp9/common/vp9_alloccommon.h" - -extern void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz); - - -void -vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction) { - unsigned char *src_y, *dst_y; - int yheight; - int ystride; - int border; - int yoffset; - int linestocopy; - - border = src_ybc->border; - yheight = src_ybc->y_height; - ystride = src_ybc->y_stride; - - linestocopy = (yheight >> (Fraction + 4)); - - if (linestocopy < 1) - linestocopy = 1; - - linestocopy <<= 4; - - yoffset = ystride * ((yheight >> 5) * 16 - 8); - src_y = src_ybc->y_buffer + yoffset; - dst_y = dst_ybc->y_buffer + yoffset; - - // vpx_memcpy (dst_y, src_y, ystride * (linestocopy +16)); - vp8_memcpy_neon((unsigned char *)dst_y, (unsigned char *)src_y, (int)(ystride * (linestocopy + 16))); -} diff --git a/vp9/encoder/arm/neon/vp9_sad16_neon.asm b/vp9/encoder/arm/neon/vp9_sad16_neon.asm deleted file mode 100644 index d7c590e15..000000000 --- a/vp9/encoder/arm/neon/vp9_sad16_neon.asm +++ /dev/null @@ -1,207 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_sad16x16_neon| - EXPORT |vp8_sad16x8_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int src_stride -; r2 unsigned char *ref_ptr -; r3 int ref_stride -|vp8_sad16x16_neon| PROC -;; - vld1.8 {q0}, [r0], r1 - vld1.8 {q4}, [r2], r3 - - vld1.8 {q1}, [r0], r1 - vld1.8 {q5}, [r2], r3 - - vabdl.u8 q12, d0, d8 - vabdl.u8 q13, d1, d9 - - vld1.8 {q2}, [r0], r1 - vld1.8 {q6}, [r2], r3 - - vabal.u8 q12, d2, d10 - vabal.u8 q13, d3, d11 - - vld1.8 {q3}, [r0], r1 - vld1.8 {q7}, [r2], r3 - - vabal.u8 q12, d4, d12 - vabal.u8 q13, d5, d13 - -;; - vld1.8 {q0}, [r0], r1 - vld1.8 {q4}, [r2], r3 - - vabal.u8 q12, d6, d14 - vabal.u8 q13, d7, d15 - - vld1.8 {q1}, [r0], r1 - vld1.8 {q5}, [r2], r3 - - vabal.u8 q12, d0, d8 - vabal.u8 q13, d1, d9 - - vld1.8 {q2}, [r0], r1 - vld1.8 {q6}, [r2], r3 - - vabal.u8 q12, d2, d10 - vabal.u8 q13, d3, d11 - - vld1.8 {q3}, [r0], r1 - vld1.8 {q7}, [r2], r3 - - vabal.u8 q12, d4, d12 - vabal.u8 q13, d5, d13 - -;; - vld1.8 {q0}, [r0], r1 - vld1.8 {q4}, [r2], r3 - - vabal.u8 q12, d6, d14 - vabal.u8 q13, d7, d15 - - vld1.8 {q1}, [r0], r1 - vld1.8 {q5}, [r2], r3 - - vabal.u8 q12, d0, d8 - vabal.u8 q13, d1, d9 - - vld1.8 {q2}, [r0], r1 - vld1.8 {q6}, [r2], r3 - - vabal.u8 q12, d2, d10 - vabal.u8 q13, d3, d11 - - vld1.8 {q3}, [r0], r1 - vld1.8 {q7}, [r2], r3 - - vabal.u8 q12, d4, d12 - vabal.u8 q13, d5, d13 - -;; - vld1.8 {q0}, [r0], r1 - vld1.8 {q4}, [r2], r3 - - vabal.u8 q12, d6, d14 - vabal.u8 q13, d7, d15 - - vld1.8 {q1}, [r0], r1 - vld1.8 {q5}, [r2], r3 - - vabal.u8 q12, d0, d8 - vabal.u8 q13, d1, d9 - - vld1.8 {q2}, [r0], r1 - vld1.8 {q6}, [r2], r3 - - vabal.u8 q12, d2, d10 - vabal.u8 q13, d3, d11 - - vld1.8 {q3}, [r0] - vld1.8 {q7}, [r2] - - vabal.u8 q12, d4, d12 - vabal.u8 q13, d5, d13 - - vabal.u8 q12, d6, d14 - vabal.u8 q13, d7, d15 - - vadd.u16 q0, q12, q13 - - vpaddl.u16 q1, q0 - vpaddl.u32 q0, q1 - - vadd.u32 d0, d0, d1 - - vmov.32 r0, d0[0] - - bx lr - - ENDP - -;============================== -;unsigned int vp8_sad16x8_c( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) -|vp8_sad16x8_neon| PROC - vld1.8 {q0}, [r0], r1 - vld1.8 {q4}, [r2], r3 - - vld1.8 {q1}, [r0], r1 - vld1.8 {q5}, [r2], r3 - - vabdl.u8 q12, d0, d8 - vabdl.u8 q13, d1, d9 - - vld1.8 {q2}, [r0], r1 - vld1.8 {q6}, [r2], r3 - - vabal.u8 q12, d2, d10 - vabal.u8 q13, d3, d11 - - vld1.8 {q3}, [r0], r1 - vld1.8 {q7}, [r2], r3 - - vabal.u8 q12, d4, d12 - vabal.u8 q13, d5, d13 - - vld1.8 {q0}, [r0], r1 - vld1.8 {q4}, [r2], r3 - - vabal.u8 q12, d6, d14 - vabal.u8 q13, d7, d15 - - vld1.8 {q1}, [r0], r1 - vld1.8 {q5}, [r2], r3 - - vabal.u8 q12, d0, d8 - vabal.u8 q13, d1, d9 - - vld1.8 {q2}, [r0], r1 - vld1.8 {q6}, [r2], r3 - - vabal.u8 q12, d2, d10 - vabal.u8 q13, d3, d11 - - vld1.8 {q3}, [r0], r1 - vld1.8 {q7}, [r2], r3 - - vabal.u8 q12, d4, d12 - vabal.u8 q13, d5, d13 - - vabal.u8 q12, d6, d14 - vabal.u8 q13, d7, d15 - - vadd.u16 q0, q12, q13 - - vpaddl.u16 q1, q0 - vpaddl.u32 q0, q1 - - vadd.u32 d0, d0, d1 - - vmov.32 r0, d0[0] - - bx lr - - ENDP - - END diff --git a/vp9/encoder/arm/neon/vp9_sad8_neon.asm b/vp9/encoder/arm/neon/vp9_sad8_neon.asm deleted file mode 100644 index 23ba6df93..000000000 --- a/vp9/encoder/arm/neon/vp9_sad8_neon.asm +++ /dev/null @@ -1,209 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_sad8x8_neon| - EXPORT |vp8_sad8x16_neon| - EXPORT |vp8_sad4x4_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; unsigned int vp8_sad8x8_c( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) - -|vp8_sad8x8_neon| PROC - vld1.8 {d0}, [r0], r1 - vld1.8 {d8}, [r2], r3 - - vld1.8 {d2}, [r0], r1 - vld1.8 {d10}, [r2], r3 - - vabdl.u8 q12, d0, d8 - - vld1.8 {d4}, [r0], r1 - vld1.8 {d12}, [r2], r3 - - vabal.u8 q12, d2, d10 - - vld1.8 {d6}, [r0], r1 - vld1.8 {d14}, [r2], r3 - - vabal.u8 q12, d4, d12 - - vld1.8 {d0}, [r0], r1 - vld1.8 {d8}, [r2], r3 - - vabal.u8 q12, d6, d14 - - vld1.8 {d2}, [r0], r1 - vld1.8 {d10}, [r2], r3 - - vabal.u8 q12, d0, d8 - - vld1.8 {d4}, [r0], r1 - vld1.8 {d12}, [r2], r3 - - vabal.u8 q12, d2, d10 - - vld1.8 {d6}, [r0], r1 - vld1.8 {d14}, [r2], r3 - - vabal.u8 q12, d4, d12 - vabal.u8 q12, d6, d14 - - vpaddl.u16 q1, q12 - vpaddl.u32 q0, q1 - vadd.u32 d0, d0, d1 - - vmov.32 r0, d0[0] - - bx lr - - ENDP - -;============================ -;unsigned int vp8_sad8x16_c( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) - -|vp8_sad8x16_neon| PROC - vld1.8 {d0}, [r0], r1 - vld1.8 {d8}, [r2], r3 - - vld1.8 {d2}, [r0], r1 - vld1.8 {d10}, [r2], r3 - - vabdl.u8 q12, d0, d8 - - vld1.8 {d4}, [r0], r1 - vld1.8 {d12}, [r2], r3 - - vabal.u8 q12, d2, d10 - - vld1.8 {d6}, [r0], r1 - vld1.8 {d14}, [r2], r3 - - vabal.u8 q12, d4, d12 - - vld1.8 {d0}, [r0], r1 - vld1.8 {d8}, [r2], r3 - - vabal.u8 q12, d6, d14 - - vld1.8 {d2}, [r0], r1 - vld1.8 {d10}, [r2], r3 - - vabal.u8 q12, d0, d8 - - vld1.8 {d4}, [r0], r1 - vld1.8 {d12}, [r2], r3 - - vabal.u8 q12, d2, d10 - - vld1.8 {d6}, [r0], r1 - vld1.8 {d14}, [r2], r3 - - vabal.u8 q12, d4, d12 - - vld1.8 {d0}, [r0], r1 - vld1.8 {d8}, [r2], r3 - - vabal.u8 q12, d6, d14 - - vld1.8 {d2}, [r0], r1 - vld1.8 {d10}, [r2], r3 - - vabal.u8 q12, d0, d8 - - vld1.8 {d4}, [r0], r1 - vld1.8 {d12}, [r2], r3 - - vabal.u8 q12, d2, d10 - - vld1.8 {d6}, [r0], r1 - vld1.8 {d14}, [r2], r3 - - vabal.u8 q12, d4, d12 - - vld1.8 {d0}, [r0], r1 - vld1.8 {d8}, [r2], r3 - - vabal.u8 q12, d6, d14 - - vld1.8 {d2}, [r0], r1 - vld1.8 {d10}, [r2], r3 - - vabal.u8 q12, d0, d8 - - vld1.8 {d4}, [r0], r1 - vld1.8 {d12}, [r2], r3 - - vabal.u8 q12, d2, d10 - - vld1.8 {d6}, [r0], r1 - vld1.8 {d14}, [r2], r3 - - vabal.u8 q12, d4, d12 - vabal.u8 q12, d6, d14 - - vpaddl.u16 q1, q12 - vpaddl.u32 q0, q1 - vadd.u32 d0, d0, d1 - - vmov.32 r0, d0[0] - - bx lr - - ENDP - -;=========================== -;unsigned int vp8_sad4x4_c( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride) - -|vp8_sad4x4_neon| PROC - vld1.8 {d0}, [r0], r1 - vld1.8 {d8}, [r2], r3 - - vld1.8 {d2}, [r0], r1 - vld1.8 {d10}, [r2], r3 - - vabdl.u8 q12, d0, d8 - - vld1.8 {d4}, [r0], r1 - vld1.8 {d12}, [r2], r3 - - vabal.u8 q12, d2, d10 - - vld1.8 {d6}, [r0], r1 - vld1.8 {d14}, [r2], r3 - - vabal.u8 q12, d4, d12 - vabal.u8 q12, d6, d14 - - vpaddl.u16 d1, d24 - vpaddl.u32 d0, d1 - vmov.32 r0, d0[0] - - bx lr - - ENDP - - END diff --git a/vp9/encoder/arm/neon/vp9_shortfdct_neon.asm b/vp9/encoder/arm/neon/vp9_shortfdct_neon.asm deleted file mode 100644 index 09dd011ec..000000000 --- a/vp9/encoder/arm/neon/vp9_shortfdct_neon.asm +++ /dev/null @@ -1,221 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_short_fdct4x4_neon| - EXPORT |vp8_short_fdct8x4_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=4 - - - ALIGN 16 ; enable use of @128 bit aligned loads -coeff - DCW 5352, 5352, 5352, 5352 - DCW 2217, 2217, 2217, 2217 - DCD 14500, 14500, 14500, 14500 - DCD 7500, 7500, 7500, 7500 - DCD 12000, 12000, 12000, 12000 - DCD 51000, 51000, 51000, 51000 - -;void vp8_short_fdct4x4_c(short *input, short *output, int pitch) -|vp8_short_fdct4x4_neon| PROC - - ; Part one - vld1.16 {d0}, [r0@64], r2 - adr r12, coeff - vld1.16 {d1}, [r0@64], r2 - vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217 - vld1.16 {d2}, [r0@64], r2 - vld1.32 {q9, q10}, [r12@128]! ; q9=14500, q10=7500 - vld1.16 {d3}, [r0@64], r2 - - ; transpose d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3] - vtrn.32 d0, d2 - vtrn.32 d1, d3 - vld1.32 {q11,q12}, [r12@128] ; q11=12000, q12=51000 - vtrn.16 d0, d1 - vtrn.16 d2, d3 - - vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[3] - vadd.s16 d5, d1, d2 ; b1 = ip[1] + ip[2] - vsub.s16 d6, d1, d2 ; c1 = ip[1] - ip[2] - vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[3] - - vshl.s16 q2, q2, #3 ; (a1, b1) << 3 - vshl.s16 q3, q3, #3 ; (c1, d1) << 3 - - vadd.s16 d0, d4, d5 ; op[0] = a1 + b1 - vsub.s16 d2, d4, d5 ; op[2] = a1 - b1 - - vmlal.s16 q9, d7, d16 ; d1*5352 + 14500 - vmlal.s16 q10, d7, d17 ; d1*2217 + 7500 - vmlal.s16 q9, d6, d17 ; c1*2217 + d1*5352 + 14500 - vmlsl.s16 q10, d6, d16 ; d1*2217 - c1*5352 + 7500 - - vshrn.s32 d1, q9, #12 ; op[1] = (c1*2217 + d1*5352 + 14500)>>12 - vshrn.s32 d3, q10, #12 ; op[3] = (d1*2217 - c1*5352 + 7500)>>12 - - - ; Part two - - ; transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12] - vtrn.32 d0, d2 - vtrn.32 d1, d3 - vtrn.16 d0, d1 - vtrn.16 d2, d3 - - vmov.s16 d26, #7 - - vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[12] - vadd.s16 d5, d1, d2 ; b1 = ip[4] + ip[8] - vsub.s16 d6, d1, d2 ; c1 = ip[4] - ip[8] - vadd.s16 d4, d4, d26 ; a1 + 7 - vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[12] - - vadd.s16 d0, d4, d5 ; op[0] = a1 + b1 + 7 - vsub.s16 d2, d4, d5 ; op[8] = a1 - b1 + 7 - - vmlal.s16 q11, d7, d16 ; d1*5352 + 12000 - vmlal.s16 q12, d7, d17 ; d1*2217 + 51000 - - vceq.s16 d4, d7, #0 - - vshr.s16 d0, d0, #4 - vshr.s16 d2, d2, #4 - - vmlal.s16 q11, d6, d17 ; c1*2217 + d1*5352 + 12000 - vmlsl.s16 q12, d6, d16 ; d1*2217 - c1*5352 + 51000 - - vmvn.s16 d4, d4 - vshrn.s32 d1, q11, #16 ; op[4] = (c1*2217 + d1*5352 + 12000)>>16 - vsub.s16 d1, d1, d4 ; op[4] += (d1!=0) - vshrn.s32 d3, q12, #16 ; op[12]= (d1*2217 - c1*5352 + 51000)>>16 - - vst1.16 {q0, q1}, [r1@128] - - bx lr - - ENDP - -;void vp8_short_fdct8x4_c(short *input, short *output, int pitch) -|vp8_short_fdct8x4_neon| PROC - - ; Part one - - vld1.16 {q0}, [r0@128], r2 - adr r12, coeff - vld1.16 {q1}, [r0@128], r2 - vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217 - vld1.16 {q2}, [r0@128], r2 - vld1.32 {q9, q10}, [r12@128]! ; q9=14500, q10=7500 - vld1.16 {q3}, [r0@128], r2 - - ; transpose q0=ip[0], q1=ip[1], q2=ip[2], q3=ip[3] - vtrn.32 q0, q2 ; [A0|B0] - vtrn.32 q1, q3 ; [A1|B1] - vtrn.16 q0, q1 ; [A2|B2] - vtrn.16 q2, q3 ; [A3|B3] - - vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[3] - vadd.s16 q12, q1, q2 ; b1 = ip[1] + ip[2] - vsub.s16 q13, q1, q2 ; c1 = ip[1] - ip[2] - vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[3] - - vshl.s16 q11, q11, #3 ; a1 << 3 - vshl.s16 q12, q12, #3 ; b1 << 3 - vshl.s16 q13, q13, #3 ; c1 << 3 - vshl.s16 q14, q14, #3 ; d1 << 3 - - vadd.s16 q0, q11, q12 ; [A0 | B0] = a1 + b1 - vsub.s16 q2, q11, q12 ; [A2 | B2] = a1 - b1 - - vmov.s16 q11, q9 ; 14500 - vmov.s16 q12, q10 ; 7500 - - vmlal.s16 q9, d28, d16 ; A[1] = d1*5352 + 14500 - vmlal.s16 q10, d28, d17 ; A[3] = d1*2217 + 7500 - vmlal.s16 q11, d29, d16 ; B[1] = d1*5352 + 14500 - vmlal.s16 q12, d29, d17 ; B[3] = d1*2217 + 7500 - - vmlal.s16 q9, d26, d17 ; A[1] = c1*2217 + d1*5352 + 14500 - vmlsl.s16 q10, d26, d16 ; A[3] = d1*2217 - c1*5352 + 7500 - vmlal.s16 q11, d27, d17 ; B[1] = c1*2217 + d1*5352 + 14500 - vmlsl.s16 q12, d27, d16 ; B[3] = d1*2217 - c1*5352 + 7500 - - vshrn.s32 d2, q9, #12 ; A[1] = (c1*2217 + d1*5352 + 14500)>>12 - vshrn.s32 d6, q10, #12 ; A[3] = (d1*2217 - c1*5352 + 7500)>>12 - vshrn.s32 d3, q11, #12 ; B[1] = (c1*2217 + d1*5352 + 14500)>>12 - vshrn.s32 d7, q12, #12 ; B[3] = (d1*2217 - c1*5352 + 7500)>>12 - - - ; Part two - vld1.32 {q9,q10}, [r12@128] ; q9=12000, q10=51000 - - ; transpose q0=ip[0], q1=ip[4], q2=ip[8], q3=ip[12] - vtrn.32 q0, q2 ; q0=[A0 | B0] - vtrn.32 q1, q3 ; q1=[A4 | B4] - vtrn.16 q0, q1 ; q2=[A8 | B8] - vtrn.16 q2, q3 ; q3=[A12|B12] - - vmov.s16 q15, #7 - - vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[12] - vadd.s16 q12, q1, q2 ; b1 = ip[4] + ip[8] - vadd.s16 q11, q11, q15 ; a1 + 7 - vsub.s16 q13, q1, q2 ; c1 = ip[4] - ip[8] - vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[12] - - vadd.s16 q0, q11, q12 ; a1 + b1 + 7 - vsub.s16 q1, q11, q12 ; a1 - b1 + 7 - - vmov.s16 q11, q9 ; 12000 - vmov.s16 q12, q10 ; 51000 - - vshr.s16 d0, d0, #4 ; A[0] = (a1 + b1 + 7)>>4 - vshr.s16 d4, d1, #4 ; B[0] = (a1 + b1 + 7)>>4 - vshr.s16 d2, d2, #4 ; A[8] = (a1 + b1 + 7)>>4 - vshr.s16 d6, d3, #4 ; B[8] = (a1 + b1 + 7)>>4 - - - vmlal.s16 q9, d28, d16 ; A[4] = d1*5352 + 12000 - vmlal.s16 q10, d28, d17 ; A[12] = d1*2217 + 51000 - vmlal.s16 q11, d29, d16 ; B[4] = d1*5352 + 12000 - vmlal.s16 q12, d29, d17 ; B[12] = d1*2217 + 51000 - - vceq.s16 q14, q14, #0 - - vmlal.s16 q9, d26, d17 ; A[4] = c1*2217 + d1*5352 + 12000 - vmlsl.s16 q10, d26, d16 ; A[12] = d1*2217 - c1*5352 + 51000 - vmlal.s16 q11, d27, d17 ; B[4] = c1*2217 + d1*5352 + 12000 - vmlsl.s16 q12, d27, d16 ; B[12] = d1*2217 - c1*5352 + 51000 - - vmvn.s16 q14, q14 - - vshrn.s32 d1, q9, #16 ; A[4] = (c1*2217 + d1*5352 + 12000)>>16 - vshrn.s32 d3, q10, #16 ; A[12]= (d1*2217 - c1*5352 + 51000)>>16 - vsub.s16 d1, d1, d28 ; A[4] += (d1!=0) - - vshrn.s32 d5, q11, #16 ; B[4] = (c1*2217 + d1*5352 + 12000)>>16 - vshrn.s32 d7, q12, #16 ; B[12]= (d1*2217 - c1*5352 + 51000)>>16 - vsub.s16 d5, d5, d29 ; B[4] += (d1!=0) - - vst1.16 {q0, q1}, [r1@128]! ; block A - vst1.16 {q2, q3}, [r1@128]! ; block B - - bx lr - - ENDP - - END - diff --git a/vp9/encoder/arm/neon/vp9_shortwalsh4x4_neon.asm b/vp9/encoder/arm/neon/vp9_shortwalsh4x4_neon.asm deleted file mode 100644 index 22266297a..000000000 --- a/vp9/encoder/arm/neon/vp9_shortwalsh4x4_neon.asm +++ /dev/null @@ -1,103 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_short_walsh4x4_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vp8_short_walsh4x4_neon(short *input, short *output, int pitch) -; r0 short *input, -; r1 short *output, -; r2 int pitch -|vp8_short_walsh4x4_neon| PROC - - vld1.16 {d0}, [r0@64], r2 ; load input - vld1.16 {d1}, [r0@64], r2 - vld1.16 {d2}, [r0@64], r2 - vld1.16 {d3}, [r0@64] - - ;First for-loop - ;transpose d0, d1, d2, d3. Then, d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3] - vtrn.32 d0, d2 - vtrn.32 d1, d3 - - vmov.s32 q15, #3 ; add 3 to all values - - vtrn.16 d0, d1 - vtrn.16 d2, d3 - - vadd.s16 d4, d0, d2 ; ip[0] + ip[2] - vadd.s16 d5, d1, d3 ; ip[1] + ip[3] - vsub.s16 d6, d1, d3 ; ip[1] - ip[3] - vsub.s16 d7, d0, d2 ; ip[0] - ip[2] - - vshl.s16 d4, d4, #2 ; a1 = (ip[0] + ip[2]) << 2 - vshl.s16 d5, d5, #2 ; d1 = (ip[1] + ip[3]) << 2 - vshl.s16 d6, d6, #2 ; c1 = (ip[1] - ip[3]) << 2 - vceq.s16 d16, d4, #0 ; a1 == 0 - vshl.s16 d7, d7, #2 ; b1 = (ip[0] - ip[2]) << 2 - - vadd.s16 d0, d4, d5 ; a1 + d1 - vmvn d16, d16 ; a1 != 0 - vsub.s16 d3, d4, d5 ; op[3] = a1 - d1 - vadd.s16 d1, d7, d6 ; op[1] = b1 + c1 - vsub.s16 d2, d7, d6 ; op[2] = b1 - c1 - vsub.s16 d0, d0, d16 ; op[0] = a1 + d1 + (a1 != 0) - - ;Second for-loop - ;transpose d0, d1, d2, d3, Then, d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12] - vtrn.32 d1, d3 - vtrn.32 d0, d2 - vtrn.16 d2, d3 - vtrn.16 d0, d1 - - vaddl.s16 q8, d0, d2 ; a1 = ip[0]+ip[8] - vaddl.s16 q9, d1, d3 ; d1 = ip[4]+ip[12] - vsubl.s16 q10, d1, d3 ; c1 = ip[4]-ip[12] - vsubl.s16 q11, d0, d2 ; b1 = ip[0]-ip[8] - - vadd.s32 q0, q8, q9 ; a2 = a1 + d1 - vadd.s32 q1, q11, q10 ; b2 = b1 + c1 - vsub.s32 q2, q11, q10 ; c2 = b1 - c1 - vsub.s32 q3, q8, q9 ; d2 = a1 - d1 - - vclt.s32 q8, q0, #0 - vclt.s32 q9, q1, #0 - vclt.s32 q10, q2, #0 - vclt.s32 q11, q3, #0 - - ; subtract -1 (or 0) - vsub.s32 q0, q0, q8 ; a2 += a2 < 0 - vsub.s32 q1, q1, q9 ; b2 += b2 < 0 - vsub.s32 q2, q2, q10 ; c2 += c2 < 0 - vsub.s32 q3, q3, q11 ; d2 += d2 < 0 - - vadd.s32 q8, q0, q15 ; a2 + 3 - vadd.s32 q9, q1, q15 ; b2 + 3 - vadd.s32 q10, q2, q15 ; c2 + 3 - vadd.s32 q11, q3, q15 ; d2 + 3 - - ; vrshrn? would add 1 << 3-1 = 2 - vshrn.s32 d0, q8, #3 - vshrn.s32 d1, q9, #3 - vshrn.s32 d2, q10, #3 - vshrn.s32 d3, q11, #3 - - vst1.16 {q0, q1}, [r1@128] - - bx lr - - ENDP - - END diff --git a/vp9/encoder/arm/neon/vp9_subpixelvariance16x16_neon.asm b/vp9/encoder/arm/neon/vp9_subpixelvariance16x16_neon.asm deleted file mode 100644 index 8bb0734d1..000000000 --- a/vp9/encoder/arm/neon/vp9_subpixelvariance16x16_neon.asm +++ /dev/null @@ -1,425 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp9_sub_pixel_variance16x16_neon_func| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; stack(r4) unsigned char *dst_ptr, -; stack(r5) int dst_pixels_per_line, -; stack(r6) unsigned int *sse -;note: most of the code is copied from bilinear_predict16x16_neon and vp9_variance16x16_neon. - -|vp9_sub_pixel_variance16x16_neon_func| PROC - push {r4-r6, lr} - - ldr r12, _BilinearTaps_coeff_ - ldr r4, [sp, #16] ;load *dst_ptr from stack - ldr r5, [sp, #20] ;load dst_pixels_per_line from stack - ldr r6, [sp, #24] ;load *sse from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq secondpass_bfilter16x16_only - - add r2, r12, r2, lsl #3 ;calculate filter location - - cmp r3, #0 ;skip second_pass filter if yoffset=0 - - vld1.s32 {d31}, [r2] ;load first_pass filter - - beq firstpass_bfilter16x16_only - - sub sp, sp, #272 ;reserve space on stack for temporary storage - vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data - mov lr, sp - vld1.u8 {d5, d6, d7}, [r0], r1 - - mov r2, #3 ;loop counter - vld1.u8 {d8, d9, d10}, [r0], r1 - - vdup.8 d0, d31[0] ;first_pass filter (d0 d1) - vld1.u8 {d11, d12, d13}, [r0], r1 - - vdup.8 d1, d31[4] - -;First Pass: output_height lines x output_width columns (17x16) -vp8e_filt_blk2d_fp16x16_loop_neon - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0]) - vmull.u8 q8, d3, d0 - vmull.u8 q9, d5, d0 - vmull.u8 q10, d6, d0 - vmull.u8 q11, d8, d0 - vmull.u8 q12, d9, d0 - vmull.u8 q13, d11, d0 - vmull.u8 q14, d12, d0 - - vext.8 d2, d2, d3, #1 ;construct src_ptr[1] - vext.8 d5, d5, d6, #1 - vext.8 d8, d8, d9, #1 - vext.8 d11, d11, d12, #1 - - vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1]) - vmlal.u8 q9, d5, d1 - vmlal.u8 q11, d8, d1 - vmlal.u8 q13, d11, d1 - - vext.8 d3, d3, d4, #1 - vext.8 d6, d6, d7, #1 - vext.8 d9, d9, d10, #1 - vext.8 d12, d12, d13, #1 - - vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1]) - vmlal.u8 q10, d6, d1 - vmlal.u8 q12, d9, d1 - vmlal.u8 q14, d12, d1 - - subs r2, r2, #1 - - vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d15, q8, #7 - vqrshrn.u16 d16, q9, #7 - vqrshrn.u16 d17, q10, #7 - vqrshrn.u16 d18, q11, #7 - vqrshrn.u16 d19, q12, #7 - vqrshrn.u16 d20, q13, #7 - - vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data - vqrshrn.u16 d21, q14, #7 - vld1.u8 {d5, d6, d7}, [r0], r1 - - vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result - vld1.u8 {d8, d9, d10}, [r0], r1 - vst1.u8 {d18, d19, d20, d21}, [lr]! - vld1.u8 {d11, d12, d13}, [r0], r1 - - bne vp8e_filt_blk2d_fp16x16_loop_neon - -;First-pass filtering for rest 5 lines - vld1.u8 {d14, d15, d16}, [r0], r1 - - vmull.u8 q9, d2, d0 ;(src_ptr[0] * Filter[0]) - vmull.u8 q10, d3, d0 - vmull.u8 q11, d5, d0 - vmull.u8 q12, d6, d0 - vmull.u8 q13, d8, d0 - vmull.u8 q14, d9, d0 - - vext.8 d2, d2, d3, #1 ;construct src_ptr[1] - vext.8 d5, d5, d6, #1 - vext.8 d8, d8, d9, #1 - - vmlal.u8 q9, d2, d1 ;(src_ptr[0] * Filter[1]) - vmlal.u8 q11, d5, d1 - vmlal.u8 q13, d8, d1 - - vext.8 d3, d3, d4, #1 - vext.8 d6, d6, d7, #1 - vext.8 d9, d9, d10, #1 - - vmlal.u8 q10, d3, d1 ;(src_ptr[0] * Filter[1]) - vmlal.u8 q12, d6, d1 - vmlal.u8 q14, d9, d1 - - vmull.u8 q1, d11, d0 - vmull.u8 q2, d12, d0 - vmull.u8 q3, d14, d0 - vmull.u8 q4, d15, d0 - - vext.8 d11, d11, d12, #1 ;construct src_ptr[1] - vext.8 d14, d14, d15, #1 - - vmlal.u8 q1, d11, d1 ;(src_ptr[0] * Filter[1]) - vmlal.u8 q3, d14, d1 - - vext.8 d12, d12, d13, #1 - vext.8 d15, d15, d16, #1 - - vmlal.u8 q2, d12, d1 ;(src_ptr[0] * Filter[1]) - vmlal.u8 q4, d15, d1 - - vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d11, q10, #7 - vqrshrn.u16 d12, q11, #7 - vqrshrn.u16 d13, q12, #7 - vqrshrn.u16 d14, q13, #7 - vqrshrn.u16 d15, q14, #7 - vqrshrn.u16 d16, q1, #7 - vqrshrn.u16 d17, q2, #7 - vqrshrn.u16 d18, q3, #7 - vqrshrn.u16 d19, q4, #7 - - vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result - vst1.u8 {d14, d15, d16, d17}, [lr]! - vst1.u8 {d18, d19}, [lr]! - -;Second pass: 16x16 -;secondpass_filter - add r3, r12, r3, lsl #3 - sub lr, lr, #272 - - vld1.u32 {d31}, [r3] ;load second_pass filter - - sub sp, sp, #256 - mov r3, sp - - vld1.u8 {d22, d23}, [lr]! ;load src data - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) - vdup.8 d1, d31[4] - mov r12, #4 ;loop counter - -vp8e_filt_blk2d_sp16x16_loop_neon - vld1.u8 {d24, d25}, [lr]! - vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0]) - vld1.u8 {d26, d27}, [lr]! - vmull.u8 q2, d23, d0 - vld1.u8 {d28, d29}, [lr]! - vmull.u8 q3, d24, d0 - vld1.u8 {d30, d31}, [lr]! - - vmull.u8 q4, d25, d0 - vmull.u8 q5, d26, d0 - vmull.u8 q6, d27, d0 - vmull.u8 q7, d28, d0 - vmull.u8 q8, d29, d0 - - vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1]) - vmlal.u8 q2, d25, d1 - vmlal.u8 q3, d26, d1 - vmlal.u8 q4, d27, d1 - vmlal.u8 q5, d28, d1 - vmlal.u8 q6, d29, d1 - vmlal.u8 q7, d30, d1 - vmlal.u8 q8, d31, d1 - - subs r12, r12, #1 - - vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d3, q2, #7 - vqrshrn.u16 d4, q3, #7 - vqrshrn.u16 d5, q4, #7 - vqrshrn.u16 d6, q5, #7 - vqrshrn.u16 d7, q6, #7 - vqrshrn.u16 d8, q7, #7 - vqrshrn.u16 d9, q8, #7 - - vst1.u8 {d2, d3}, [r3]! ;store result - vst1.u8 {d4, d5}, [r3]! - vst1.u8 {d6, d7}, [r3]! - vmov q11, q15 - vst1.u8 {d8, d9}, [r3]! - - bne vp8e_filt_blk2d_sp16x16_loop_neon - - b sub_pixel_variance16x16_neon - -;-------------------- -firstpass_bfilter16x16_only - mov r2, #4 ;loop counter - sub sp, sp, #528 ;reserve space on stack for temporary storage - vdup.8 d0, d31[0] ;first_pass filter (d0 d1) - vdup.8 d1, d31[4] - mov r3, sp - -;First Pass: output_height lines x output_width columns (16x16) -vp8e_filt_blk2d_fpo16x16_loop_neon - vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data - vld1.u8 {d5, d6, d7}, [r0], r1 - vld1.u8 {d8, d9, d10}, [r0], r1 - vld1.u8 {d11, d12, d13}, [r0], r1 - - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0]) - vmull.u8 q8, d3, d0 - vmull.u8 q9, d5, d0 - vmull.u8 q10, d6, d0 - vmull.u8 q11, d8, d0 - vmull.u8 q12, d9, d0 - vmull.u8 q13, d11, d0 - vmull.u8 q14, d12, d0 - - vext.8 d2, d2, d3, #1 ;construct src_ptr[1] - vext.8 d5, d5, d6, #1 - vext.8 d8, d8, d9, #1 - vext.8 d11, d11, d12, #1 - - vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1]) - vmlal.u8 q9, d5, d1 - vmlal.u8 q11, d8, d1 - vmlal.u8 q13, d11, d1 - - vext.8 d3, d3, d4, #1 - vext.8 d6, d6, d7, #1 - vext.8 d9, d9, d10, #1 - vext.8 d12, d12, d13, #1 - - vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1]) - vmlal.u8 q10, d6, d1 - vmlal.u8 q12, d9, d1 - vmlal.u8 q14, d12, d1 - - subs r2, r2, #1 - - vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d15, q8, #7 - vqrshrn.u16 d16, q9, #7 - vqrshrn.u16 d17, q10, #7 - vqrshrn.u16 d18, q11, #7 - vqrshrn.u16 d19, q12, #7 - vqrshrn.u16 d20, q13, #7 - vst1.u8 {d14, d15}, [r3]! ;store result - vqrshrn.u16 d21, q14, #7 - - vst1.u8 {d16, d17}, [r3]! - vst1.u8 {d18, d19}, [r3]! - vst1.u8 {d20, d21}, [r3]! - - bne vp8e_filt_blk2d_fpo16x16_loop_neon - - b sub_pixel_variance16x16_neon - -;--------------------- -secondpass_bfilter16x16_only -;Second pass: 16x16 -;secondpass_filter - sub sp, sp, #528 ;reserve space on stack for temporary storage - add r3, r12, r3, lsl #3 - mov r12, #4 ;loop counter - vld1.u32 {d31}, [r3] ;load second_pass filter - vld1.u8 {d22, d23}, [r0], r1 ;load src data - mov r3, sp - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) - vdup.8 d1, d31[4] - -vp8e_filt_blk2d_spo16x16_loop_neon - vld1.u8 {d24, d25}, [r0], r1 - vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0]) - vld1.u8 {d26, d27}, [r0], r1 - vmull.u8 q2, d23, d0 - vld1.u8 {d28, d29}, [r0], r1 - vmull.u8 q3, d24, d0 - vld1.u8 {d30, d31}, [r0], r1 - - vmull.u8 q4, d25, d0 - vmull.u8 q5, d26, d0 - vmull.u8 q6, d27, d0 - vmull.u8 q7, d28, d0 - vmull.u8 q8, d29, d0 - - vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1]) - vmlal.u8 q2, d25, d1 - vmlal.u8 q3, d26, d1 - vmlal.u8 q4, d27, d1 - vmlal.u8 q5, d28, d1 - vmlal.u8 q6, d29, d1 - vmlal.u8 q7, d30, d1 - vmlal.u8 q8, d31, d1 - - vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d3, q2, #7 - vqrshrn.u16 d4, q3, #7 - vqrshrn.u16 d5, q4, #7 - vqrshrn.u16 d6, q5, #7 - vqrshrn.u16 d7, q6, #7 - vqrshrn.u16 d8, q7, #7 - vqrshrn.u16 d9, q8, #7 - - vst1.u8 {d2, d3}, [r3]! ;store result - subs r12, r12, #1 - vst1.u8 {d4, d5}, [r3]! - vmov q11, q15 - vst1.u8 {d6, d7}, [r3]! - vst1.u8 {d8, d9}, [r3]! - - bne vp8e_filt_blk2d_spo16x16_loop_neon - - b sub_pixel_variance16x16_neon - -;---------------------------- -;variance16x16 -sub_pixel_variance16x16_neon - vmov.i8 q8, #0 ;q8 - sum - vmov.i8 q9, #0 ;q9, q10 - sse - vmov.i8 q10, #0 - - sub r3, r3, #256 - mov r12, #8 - -sub_pixel_variance16x16_neon_loop - vld1.8 {q0}, [r3]! ;Load up source and reference - vld1.8 {q2}, [r4], r5 - vld1.8 {q1}, [r3]! - vld1.8 {q3}, [r4], r5 - - vsubl.u8 q11, d0, d4 ;diff - vsubl.u8 q12, d1, d5 - vsubl.u8 q13, d2, d6 - vsubl.u8 q14, d3, d7 - - vpadal.s16 q8, q11 ;sum - vmlal.s16 q9, d22, d22 ;sse - vmlal.s16 q10, d23, d23 - - subs r12, r12, #1 - - vpadal.s16 q8, q12 - vmlal.s16 q9, d24, d24 - vmlal.s16 q10, d25, d25 - vpadal.s16 q8, q13 - vmlal.s16 q9, d26, d26 - vmlal.s16 q10, d27, d27 - vpadal.s16 q8, q14 - vmlal.s16 q9, d28, d28 - vmlal.s16 q10, d29, d29 - - bne sub_pixel_variance16x16_neon_loop - - vadd.u32 q10, q9, q10 ;accumulate sse - vpaddl.s32 q0, q8 ;accumulate sum - - vpaddl.u32 q1, q10 - vadd.s64 d0, d0, d1 - vadd.u64 d1, d2, d3 - - vmull.s32 q5, d0, d0 - vst1.32 {d1[0]}, [r6] ;store sse - vshr.s32 d10, d10, #8 - vsub.s32 d0, d1, d10 - - add sp, sp, #528 - vmov.32 r0, d0[0] ;return - - pop {r4-r6,pc} - - ENDP - -;----------------- - -_BilinearTaps_coeff_ - DCD bilinear_taps_coeff -bilinear_taps_coeff - DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 - - END diff --git a/vp9/encoder/arm/neon/vp9_subpixelvariance16x16s_neon.asm b/vp9/encoder/arm/neon/vp9_subpixelvariance16x16s_neon.asm deleted file mode 100644 index a3faf9a77..000000000 --- a/vp9/encoder/arm/neon/vp9_subpixelvariance16x16s_neon.asm +++ /dev/null @@ -1,572 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp9_variance_halfpixvar16x16_h_neon| - EXPORT |vp9_variance_halfpixvar16x16_v_neon| - EXPORT |vp9_variance_halfpixvar16x16_hv_neon| - EXPORT |vp9_sub_pixel_variance16x16s_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;================================================ -;unsigned int vp9_variance_halfpixvar16x16_h_neon -;( -; unsigned char *src_ptr, r0 -; int src_pixels_per_line, r1 -; unsigned char *dst_ptr, r2 -; int dst_pixels_per_line, r3 -; unsigned int *sse -;); -;================================================ -|vp9_variance_halfpixvar16x16_h_neon| PROC - push {lr} - - mov r12, #4 ;loop counter - ldr lr, [sp, #4] ;load *sse from stack - vmov.i8 q8, #0 ;q8 - sum - vmov.i8 q9, #0 ;q9, q10 - sse - vmov.i8 q10, #0 - -;First Pass: output_height lines x output_width columns (16x16) -vp8_filt_fpo16x16s_4_0_loop_neon - vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data - vld1.8 {q11}, [r2], r3 - vld1.u8 {d4, d5, d6, d7}, [r0], r1 - vld1.8 {q12}, [r2], r3 - vld1.u8 {d8, d9, d10, d11}, [r0], r1 - vld1.8 {q13}, [r2], r3 - vld1.u8 {d12, d13, d14, d15}, [r0], r1 - - ;pld [r0] - ;pld [r0, r1] - ;pld [r0, r1, lsl #1] - - vext.8 q1, q0, q1, #1 ;construct src_ptr[1] - vext.8 q3, q2, q3, #1 - vext.8 q5, q4, q5, #1 - vext.8 q7, q6, q7, #1 - - vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 - vld1.8 {q14}, [r2], r3 - vrhadd.u8 q1, q2, q3 - vrhadd.u8 q2, q4, q5 - vrhadd.u8 q3, q6, q7 - - vsubl.u8 q4, d0, d22 ;diff - vsubl.u8 q5, d1, d23 - vsubl.u8 q6, d2, d24 - vsubl.u8 q7, d3, d25 - vsubl.u8 q0, d4, d26 - vsubl.u8 q1, d5, d27 - vsubl.u8 q2, d6, d28 - vsubl.u8 q3, d7, d29 - - vpadal.s16 q8, q4 ;sum - vmlal.s16 q9, d8, d8 ;sse - vmlal.s16 q10, d9, d9 - - subs r12, r12, #1 - - vpadal.s16 q8, q5 - vmlal.s16 q9, d10, d10 - vmlal.s16 q10, d11, d11 - vpadal.s16 q8, q6 - vmlal.s16 q9, d12, d12 - vmlal.s16 q10, d13, d13 - vpadal.s16 q8, q7 - vmlal.s16 q9, d14, d14 - vmlal.s16 q10, d15, d15 - - vpadal.s16 q8, q0 ;sum - vmlal.s16 q9, d0, d0 ;sse - vmlal.s16 q10, d1, d1 - vpadal.s16 q8, q1 - vmlal.s16 q9, d2, d2 - vmlal.s16 q10, d3, d3 - vpadal.s16 q8, q2 - vmlal.s16 q9, d4, d4 - vmlal.s16 q10, d5, d5 - vpadal.s16 q8, q3 - vmlal.s16 q9, d6, d6 - vmlal.s16 q10, d7, d7 - - bne vp8_filt_fpo16x16s_4_0_loop_neon - - vadd.u32 q10, q9, q10 ;accumulate sse - vpaddl.s32 q0, q8 ;accumulate sum - - vpaddl.u32 q1, q10 - vadd.s64 d0, d0, d1 - vadd.u64 d1, d2, d3 - - vmull.s32 q5, d0, d0 - vst1.32 {d1[0]}, [lr] ;store sse - vshr.s32 d10, d10, #8 - vsub.s32 d0, d1, d10 - - vmov.32 r0, d0[0] ;return - pop {pc} - ENDP - -;================================================ -;unsigned int vp9_variance_halfpixvar16x16_v_neon -;( -; unsigned char *src_ptr, r0 -; int src_pixels_per_line, r1 -; unsigned char *dst_ptr, r2 -; int dst_pixels_per_line, r3 -; unsigned int *sse -;); -;================================================ -|vp9_variance_halfpixvar16x16_v_neon| PROC - push {lr} - - mov r12, #4 ;loop counter - - vld1.u8 {q0}, [r0], r1 ;load src data - ldr lr, [sp, #4] ;load *sse from stack - - vmov.i8 q8, #0 ;q8 - sum - vmov.i8 q9, #0 ;q9, q10 - sse - vmov.i8 q10, #0 - -vp8_filt_spo16x16s_0_4_loop_neon - vld1.u8 {q2}, [r0], r1 - vld1.8 {q1}, [r2], r3 - vld1.u8 {q4}, [r0], r1 - vld1.8 {q3}, [r2], r3 - vld1.u8 {q6}, [r0], r1 - vld1.8 {q5}, [r2], r3 - vld1.u8 {q15}, [r0], r1 - - vrhadd.u8 q0, q0, q2 - vld1.8 {q7}, [r2], r3 - vrhadd.u8 q2, q2, q4 - vrhadd.u8 q4, q4, q6 - vrhadd.u8 q6, q6, q15 - - vsubl.u8 q11, d0, d2 ;diff - vsubl.u8 q12, d1, d3 - vsubl.u8 q13, d4, d6 - vsubl.u8 q14, d5, d7 - vsubl.u8 q0, d8, d10 - vsubl.u8 q1, d9, d11 - vsubl.u8 q2, d12, d14 - vsubl.u8 q3, d13, d15 - - vpadal.s16 q8, q11 ;sum - vmlal.s16 q9, d22, d22 ;sse - vmlal.s16 q10, d23, d23 - - subs r12, r12, #1 - - vpadal.s16 q8, q12 - vmlal.s16 q9, d24, d24 - vmlal.s16 q10, d25, d25 - vpadal.s16 q8, q13 - vmlal.s16 q9, d26, d26 - vmlal.s16 q10, d27, d27 - vpadal.s16 q8, q14 - vmlal.s16 q9, d28, d28 - vmlal.s16 q10, d29, d29 - - vpadal.s16 q8, q0 ;sum - vmlal.s16 q9, d0, d0 ;sse - vmlal.s16 q10, d1, d1 - vpadal.s16 q8, q1 - vmlal.s16 q9, d2, d2 - vmlal.s16 q10, d3, d3 - vpadal.s16 q8, q2 - vmlal.s16 q9, d4, d4 - vmlal.s16 q10, d5, d5 - - vmov q0, q15 - - vpadal.s16 q8, q3 - vmlal.s16 q9, d6, d6 - vmlal.s16 q10, d7, d7 - - bne vp8_filt_spo16x16s_0_4_loop_neon - - vadd.u32 q10, q9, q10 ;accumulate sse - vpaddl.s32 q0, q8 ;accumulate sum - - vpaddl.u32 q1, q10 - vadd.s64 d0, d0, d1 - vadd.u64 d1, d2, d3 - - vmull.s32 q5, d0, d0 - vst1.32 {d1[0]}, [lr] ;store sse - vshr.s32 d10, d10, #8 - vsub.s32 d0, d1, d10 - - vmov.32 r0, d0[0] ;return - pop {pc} - ENDP - -;================================================ -;unsigned int vp9_variance_halfpixvar16x16_hv_neon -;( -; unsigned char *src_ptr, r0 -; int src_pixels_per_line, r1 -; unsigned char *dst_ptr, r2 -; int dst_pixels_per_line, r3 -; unsigned int *sse -;); -;================================================ -|vp9_variance_halfpixvar16x16_hv_neon| PROC - push {lr} - - vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data - - ldr lr, [sp, #4] ;load *sse from stack - vmov.i8 q13, #0 ;q8 - sum - vext.8 q1, q0, q1, #1 ;construct src_ptr[1] - - vmov.i8 q14, #0 ;q9, q10 - sse - vmov.i8 q15, #0 - - mov r12, #4 ;loop counter - vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 - -;First Pass: output_height lines x output_width columns (17x16) -vp8_filt16x16s_4_4_loop_neon - vld1.u8 {d4, d5, d6, d7}, [r0], r1 - vld1.u8 {d8, d9, d10, d11}, [r0], r1 - vld1.u8 {d12, d13, d14, d15}, [r0], r1 - vld1.u8 {d16, d17, d18, d19}, [r0], r1 - - ;pld [r0] - ;pld [r0, r1] - ;pld [r0, r1, lsl #1] - - vext.8 q3, q2, q3, #1 ;construct src_ptr[1] - vext.8 q5, q4, q5, #1 - vext.8 q7, q6, q7, #1 - vext.8 q9, q8, q9, #1 - - vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1 - vrhadd.u8 q2, q4, q5 - vrhadd.u8 q3, q6, q7 - vrhadd.u8 q4, q8, q9 - - vld1.8 {q5}, [r2], r3 - vrhadd.u8 q0, q0, q1 - vld1.8 {q6}, [r2], r3 - vrhadd.u8 q1, q1, q2 - vld1.8 {q7}, [r2], r3 - vrhadd.u8 q2, q2, q3 - vld1.8 {q8}, [r2], r3 - vrhadd.u8 q3, q3, q4 - - vsubl.u8 q9, d0, d10 ;diff - vsubl.u8 q10, d1, d11 - vsubl.u8 q11, d2, d12 - vsubl.u8 q12, d3, d13 - - vsubl.u8 q0, d4, d14 ;diff - vsubl.u8 q1, d5, d15 - vsubl.u8 q5, d6, d16 - vsubl.u8 q6, d7, d17 - - vpadal.s16 q13, q9 ;sum - vmlal.s16 q14, d18, d18 ;sse - vmlal.s16 q15, d19, d19 - - vpadal.s16 q13, q10 ;sum - vmlal.s16 q14, d20, d20 ;sse - vmlal.s16 q15, d21, d21 - - vpadal.s16 q13, q11 ;sum - vmlal.s16 q14, d22, d22 ;sse - vmlal.s16 q15, d23, d23 - - vpadal.s16 q13, q12 ;sum - vmlal.s16 q14, d24, d24 ;sse - vmlal.s16 q15, d25, d25 - - subs r12, r12, #1 - - vpadal.s16 q13, q0 ;sum - vmlal.s16 q14, d0, d0 ;sse - vmlal.s16 q15, d1, d1 - - vpadal.s16 q13, q1 ;sum - vmlal.s16 q14, d2, d2 ;sse - vmlal.s16 q15, d3, d3 - - vpadal.s16 q13, q5 ;sum - vmlal.s16 q14, d10, d10 ;sse - vmlal.s16 q15, d11, d11 - - vmov q0, q4 - - vpadal.s16 q13, q6 ;sum - vmlal.s16 q14, d12, d12 ;sse - vmlal.s16 q15, d13, d13 - - bne vp8_filt16x16s_4_4_loop_neon - - vadd.u32 q15, q14, q15 ;accumulate sse - vpaddl.s32 q0, q13 ;accumulate sum - - vpaddl.u32 q1, q15 - vadd.s64 d0, d0, d1 - vadd.u64 d1, d2, d3 - - vmull.s32 q5, d0, d0 - vst1.32 {d1[0]}, [lr] ;store sse - vshr.s32 d10, d10, #8 - vsub.s32 d0, d1, d10 - - vmov.32 r0, d0[0] ;return - pop {pc} - ENDP - -;============================== -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; stack unsigned char *dst_ptr, -; stack int dst_pixels_per_line, -; stack unsigned int *sse -;note: in vp8_find_best_half_pixel_step()(called when 8common.rtcd.flags; - -#if HAVE_ARMV5TE - if (flags & HAS_EDSP) { - } -#endif - -#if HAVE_ARMV6 - if (flags & HAS_MEDIA) { - cpi->rtcd.variance.sad16x16 = vp9_sad16x16_armv6; - /*cpi->rtcd.variance.sad16x8 = vp9_sad16x8_c; - cpi->rtcd.variance.sad8x16 = vp9_sad8x16_c; - cpi->rtcd.variance.sad8x8 = vp9_sad8x8_c; - cpi->rtcd.variance.sad4x4 = vp9_sad4x4_c;*/ - - /*cpi->rtcd.variance.var4x4 = vp9_variance4x4_c;*/ - cpi->rtcd.variance.var8x8 = vp9_variance8x8_armv6; - /*cpi->rtcd.variance.var8x16 = vp9_variance8x16_c; - cpi->rtcd.variance.var16x8 = vp9_variance16x8_c;*/ - cpi->rtcd.variance.var16x16 = vp9_variance16x16_armv6; - - /*cpi->rtcd.variance.subpixvar4x4 = vp9_sub_pixel_variance4x4_c;*/ - cpi->rtcd.variance.subpixvar8x8 = vp9_sub_pixel_variance8x8_armv6; - /*cpi->rtcd.variance.subpixvar8x16 = vp9_sub_pixel_variance8x16_c; - cpi->rtcd.variance.subpixvar16x8 = vp9_sub_pixel_variance16x8_c;*/ - cpi->rtcd.variance.subpixvar16x16 = vp9_sub_pixel_variance16x16_armv6; - cpi->rtcd.variance.halfpixvar16x16_h = vp9_variance_halfpixvar16x16_h_armv6; - cpi->rtcd.variance.halfpixvar16x16_v = vp9_variance_halfpixvar16x16_v_armv6; - cpi->rtcd.variance.halfpixvar16x16_hv = vp9_variance_halfpixvar16x16_hv_armv6; - - cpi->rtcd.variance.mse16x16 = vp9_mse16x16_armv6; - /*cpi->rtcd.variance.getmbss = vp9_get_mb_ss_c;*/ - - cpi->rtcd.fdct.short4x4 = vp9_short_fdct4x4_armv6; - cpi->rtcd.fdct.short8x4 = vp9_short_fdct8x4_armv6; - cpi->rtcd.fdct.fast4x4 = vp9_short_fdct4x4_armv6; - cpi->rtcd.fdct.fast8x4 = vp9_short_fdct8x4_armv6; - cpi->rtcd.fdct.walsh_short4x4 = vp9_short_walsh4x4_armv6; - - /*cpi->rtcd.encodemb.berr = vp9_block_error_c; - cpi->rtcd.encodemb.mberr = vp9_mbblock_error_c; - cpi->rtcd.encodemb.mbuverr = vp9_mbuverror_c;*/ - cpi->rtcd.encodemb.subb = vp9_subtract_b_armv6; - cpi->rtcd.encodemb.submby = vp9_subtract_mby_armv6; - cpi->rtcd.encodemb.submbuv = vp9_subtract_mbuv_armv6; - - /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b;*/ - cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_armv6; - } -#endif - -#if HAVE_ARMV7 - if (flags & HAS_NEON) { - cpi->rtcd.variance.sad16x16 = vp9_sad16x16_neon; - cpi->rtcd.variance.sad16x8 = vp9_sad16x8_neon; - cpi->rtcd.variance.sad8x16 = vp9_sad8x16_neon; - cpi->rtcd.variance.sad8x8 = vp9_sad8x8_neon; - cpi->rtcd.variance.sad4x4 = vp9_sad4x4_neon; - - /*cpi->rtcd.variance.var4x4 = vp9_variance4x4_c;*/ - cpi->rtcd.variance.var8x8 = vp9_variance8x8_neon; - cpi->rtcd.variance.var8x16 = vp9_variance8x16_neon; - cpi->rtcd.variance.var16x8 = vp9_variance16x8_neon; - cpi->rtcd.variance.var16x16 = vp9_variance16x16_neon; - - /*cpi->rtcd.variance.subpixvar4x4 = vp9_sub_pixel_variance4x4_c;*/ - cpi->rtcd.variance.subpixvar8x8 = vp9_sub_pixel_variance8x8_neon; - /*cpi->rtcd.variance.subpixvar8x16 = vp9_sub_pixel_variance8x16_c; - cpi->rtcd.variance.subpixvar16x8 = vp9_sub_pixel_variance16x8_c;*/ - cpi->rtcd.variance.subpixvar16x16 = vp9_sub_pixel_variance16x16_neon; - cpi->rtcd.variance.halfpixvar16x16_h = vp9_variance_halfpixvar16x16_h_neon; - cpi->rtcd.variance.halfpixvar16x16_v = vp9_variance_halfpixvar16x16_v_neon; - cpi->rtcd.variance.halfpixvar16x16_hv = vp9_variance_halfpixvar16x16_hv_neon; - - cpi->rtcd.variance.mse16x16 = vp9_mse16x16_neon; - /*cpi->rtcd.variance.getmbss = vp9_get_mb_ss_c;*/ - - cpi->rtcd.fdct.short4x4 = vp9_short_fdct4x4_neon; - cpi->rtcd.fdct.short8x4 = vp9_short_fdct8x4_neon; - cpi->rtcd.fdct.fast4x4 = vp9_short_fdct4x4_neon; - cpi->rtcd.fdct.fast8x4 = vp9_short_fdct8x4_neon; - cpi->rtcd.fdct.walsh_short4x4 = vp9_short_walsh4x4_neon; - - /*cpi->rtcd.encodemb.berr = vp9_block_error_c; - cpi->rtcd.encodemb.mberr = vp9_mbblock_error_c; - cpi->rtcd.encodemb.mbuverr = vp9_mbuverror_c;*/ - cpi->rtcd.encodemb.subb = vp9_subtract_b_neon; - cpi->rtcd.encodemb.submby = vp9_subtract_mby_neon; - cpi->rtcd.encodemb.submbuv = vp9_subtract_mbuv_neon; - - /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b; - cpi->rtcd.quantize.quantb_pair = vp8_regular_quantize_b_pair;*/ - cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_neon; - cpi->rtcd.quantize.fastquantb_pair = vp8_fast_quantize_b_pair_neon; - } -#endif - -#if HAVE_ARMV7 -#if CONFIG_RUNTIME_CPU_DETECT - if (flags & HAS_NEON) -#endif - { - vp9_yv12_copy_partial_frame_ptr = vpxyv12_copy_partial_frame_neon; - } -#endif -#endif -} diff --git a/vp9/encoder/arm/vp9_boolhuff_arm.c b/vp9/encoder/arm/vp9_boolhuff_arm.c deleted file mode 100644 index 9ff8e5f56..000000000 --- a/vp9/encoder/arm/vp9_boolhuff_arm.c +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vp9/encoder/vp9_boolhuff.h" -#include "vp9/common/vp9_blockd.h" - -const unsigned int vp9_prob_cost[256] = { - 2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046, - 1023, 1000, 979, 959, 940, 922, 905, 889, 873, 858, 843, 829, 816, 803, 790, 778, - 767, 755, 744, 733, 723, 713, 703, 693, 684, 675, 666, 657, 649, 641, 633, 625, - 617, 609, 602, 594, 587, 580, 573, 567, 560, 553, 547, 541, 534, 528, 522, 516, - 511, 505, 499, 494, 488, 483, 477, 472, 467, 462, 457, 452, 447, 442, 437, 433, - 428, 424, 419, 415, 410, 406, 401, 397, 393, 389, 385, 381, 377, 373, 369, 365, - 361, 357, 353, 349, 346, 342, 338, 335, 331, 328, 324, 321, 317, 314, 311, 307, - 304, 301, 297, 294, 291, 288, 285, 281, 278, 275, 272, 269, 266, 263, 260, 257, - 255, 252, 249, 246, 243, 240, 238, 235, 232, 229, 227, 224, 221, 219, 216, 214, - 211, 208, 206, 203, 201, 198, 196, 194, 191, 189, 186, 184, 181, 179, 177, 174, - 172, 170, 168, 165, 163, 161, 159, 156, 154, 152, 150, 148, 145, 143, 141, 139, - 137, 135, 133, 131, 129, 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, - 105, 103, 101, 99, 97, 95, 93, 92, 90, 88, 86, 84, 82, 81, 79, 77, - 75, 73, 72, 70, 68, 66, 65, 63, 61, 60, 58, 56, 55, 53, 51, 50, - 48, 46, 45, 43, 41, 40, 38, 37, 35, 33, 32, 30, 29, 27, 25, 24, - 22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6, 4, 3, 1, 1 -}; - diff --git a/vp9/encoder/arm/vp9_dct_arm.c b/vp9/encoder/arm/vp9_dct_arm.c deleted file mode 100644 index 5e20a4723..000000000 --- a/vp9/encoder/arm/vp9_dct_arm.c +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (c) 2011 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vpx_config.h" -#include "./vp9_rtcd.h" - -#if HAVE_ARMV6 - -void vp9_short_fdct8x4_armv6(short *input, short *output, int pitch) { - vp9_short_fdct4x4_armv6(input, output, pitch); - vp9_short_fdct4x4_armv6(input + 4, output + 16, pitch); -} - -#endif /* HAVE_ARMV6 */ diff --git a/vp9/encoder/arm/vp9_dct_arm.h b/vp9/encoder/arm/vp9_dct_arm.h deleted file mode 100644 index 8eed31e60..000000000 --- a/vp9/encoder/arm/vp9_dct_arm.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef VP9_ENCODER_ARM_VP9_DCT_ARM_H_ -#define VP9_ENCODER_ARM_VP9_DCT_ARM_H_ - -#if HAVE_ARMV6 -extern prototype_fdct(vp9_short_walsh4x4_armv6); -extern prototype_fdct(vp9_short_fdct4x4_armv6); -extern prototype_fdct(vp9_short_fdct8x4_armv6); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp8_fdct_walsh_short4x4 -#define vp8_fdct_walsh_short4x4 vp9_short_walsh4x4_armv6 - -#undef vp8_fdct_short4x4 -#define vp8_fdct_short4x4 vp9_short_fdct4x4_armv6 - -#undef vp8_fdct_short8x4 -#define vp8_fdct_short8x4 vp9_short_fdct8x4_armv6 - -#undef vp8_fdct_fast4x4 -#define vp8_fdct_fast4x4 vp9_short_fdct4x4_armv6 - -#undef vp8_fdct_fast8x4 -#define vp8_fdct_fast8x4 vp9_short_fdct8x4_armv6 -#endif - -#endif /* HAVE_ARMV6 */ - -#if HAVE_ARMV7 -extern prototype_fdct(vp9_short_fdct4x4_neon); -extern prototype_fdct(vp9_short_fdct8x4_neon); -extern prototype_fdct(vp8_fast_fdct4x4_neon); -extern prototype_fdct(vp8_fast_fdct8x4_neon); -extern prototype_fdct(vp9_short_walsh4x4_neon); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp8_fdct_short4x4 -#define vp8_fdct_short4x4 vp9_short_fdct4x4_neon - -#undef vp8_fdct_short8x4 -#define vp8_fdct_short8x4 vp9_short_fdct8x4_neon - -#undef vp8_fdct_fast4x4 -#define vp8_fdct_fast4x4 vp9_short_fdct4x4_neon - -#undef vp8_fdct_fast8x4 -#define vp8_fdct_fast8x4 vp9_short_fdct8x4_neon - -#undef vp8_fdct_walsh_short4x4 -#define vp8_fdct_walsh_short4x4 vp9_short_walsh4x4_neon -#endif - -#endif - -#endif diff --git a/vp9/encoder/arm/vp9_encodemb_arm.h b/vp9/encoder/arm/vp9_encodemb_arm.h deleted file mode 100644 index 2f21d2cba..000000000 --- a/vp9/encoder/arm/vp9_encodemb_arm.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef VP9_ENCODER_ARM_VP9_ENCODEMB_ARM_H_ -#define VP9_ENCODER_ARM_VP9_ENCODEMB_ARM_H_ - -#if HAVE_ARMV6 -extern prototype_subb(vp9_subtract_b_armv6); -extern prototype_submby(vp9_subtract_mby_armv6); -extern prototype_submbuv(vp9_subtract_mbuv_armv6); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp8_encodemb_subb -#define vp8_encodemb_subb vp9_subtract_b_armv6 - -#undef vp8_encodemb_submby -#define vp8_encodemb_submby vp9_subtract_mby_armv6 - -#undef vp8_encodemb_submbuv -#define vp8_encodemb_submbuv vp9_subtract_mbuv_armv6 -#endif - -#endif /* HAVE_ARMV6 */ - -#if HAVE_ARMV7 -// extern prototype_berr(vp9_block_error_c); -// extern prototype_mberr(vp9_mbblock_error_c); -// extern prototype_mbuverr(vp9_mbuverror_c); - -extern prototype_subb(vp9_subtract_b_neon); -extern prototype_submby(vp9_subtract_mby_neon); -extern prototype_submbuv(vp9_subtract_mbuv_neon); - -// #undef vp8_encodemb_berr -// #define vp8_encodemb_berr vp9_block_error_c - -// #undef vp8_encodemb_mberr -// #define vp8_encodemb_mberr vp9_mbblock_error_c - -// #undef vp8_encodemb_mbuverr -// #define vp8_encodemb_mbuverr vp9_mbuverror_c - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp8_encodemb_subb -#define vp8_encodemb_subb vp9_subtract_b_neon - -#undef vp8_encodemb_submby -#define vp8_encodemb_submby vp9_subtract_mby_neon - -#undef vp8_encodemb_submbuv -#define vp8_encodemb_submbuv vp9_subtract_mbuv_neon -#endif - -#endif - -#endif diff --git a/vp9/encoder/arm/vp9_quantize_arm.c b/vp9/encoder/arm/vp9_quantize_arm.c deleted file mode 100644 index aacaa529c..000000000 --- a/vp9/encoder/arm/vp9_quantize_arm.c +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include -#include "vpx_mem/vpx_mem.h" - -#include "vp9/encoder/vp9_quantize.h" -#include "vp9/common/vp9_entropy.h" - - -#if HAVE_ARMV7 - -/* vp8_quantize_mbX functions here differs from corresponding ones in - * vp9_quantize.c only by using quantize_b_pair function pointer instead of - * the regular quantize_b function pointer */ -void vp8_quantize_mby_neon(MACROBLOCK *x) { - int i; - int has_2nd_order = get_2nd_order_usage(xd); - - for (i = 0; i < 16; i += 2) - x->quantize_b_pair(&x->block[i], &x->block[i + 1], - &x->e_mbd.block[i], &x->e_mbd.block[i + 1]); - - if (has_2nd_order) - x->quantize_b(&x->block[24], &x->e_mbd.block[24]); -} - -void vp8_quantize_mb_neon(MACROBLOCK *x) { - int i; - int has_2nd_order = get_2nd_order_usage(xd); - - for (i = 0; i < 24; i += 2) - x->quantize_b_pair(&x->block[i], &x->block[i + 1], - &x->e_mbd.block[i], &x->e_mbd.block[i + 1]); - - if (has_2nd_order) - x->quantize_b(&x->block[i], &x->e_mbd.block[i]); -} - - -void vp8_quantize_mbuv_neon(MACROBLOCK *x) { - int i; - - for (i = 16; i < 24; i += 2) - x->quantize_b_pair(&x->block[i], &x->block[i + 1], - &x->e_mbd.block[i], &x->e_mbd.block[i + 1]); -} - -#endif /* HAVE_ARMV7 */ diff --git a/vp9/encoder/arm/vp9_quantize_arm.h b/vp9/encoder/arm/vp9_quantize_arm.h deleted file mode 100644 index 41a83d7f9..000000000 --- a/vp9/encoder/arm/vp9_quantize_arm.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef VP9_ENCODER_ARM_VP9_QUANTIZE_ARM_H_ -#define VP9_ENCODER_ARM_VP9_QUANTIZE_ARM_H_ - -#if HAVE_ARMV6 - -extern prototype_quantize_block(vp8_fast_quantize_b_armv6); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp8_quantize_fastquantb -#define vp8_quantize_fastquantb vp8_fast_quantize_b_armv6 -#endif - -#endif /* HAVE_ARMV6 */ - - -#if HAVE_ARMV7 - -extern prototype_quantize_block(vp8_fast_quantize_b_neon); -extern prototype_quantize_block_pair(vp8_fast_quantize_b_pair_neon); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp8_quantize_fastquantb -#define vp8_quantize_fastquantb vp8_fast_quantize_b_neon - -#undef vp8_quantize_fastquantb_pair -#define vp8_quantize_fastquantb_pair vp8_fast_quantize_b_pair_neon - -#undef vp8_quantize_mb -#define vp8_quantize_mb vp8_quantize_mb_neon - -#undef vp8_quantize_mbuv -#define vp8_quantize_mbuv vp8_quantize_mbuv_neon - -#undef vp8_quantize_mby -#define vp8_quantize_mby vp8_quantize_mby_neon -#endif - -#endif /* HAVE_ARMV7 */ - -#endif - diff --git a/vp9/encoder/arm/vp9_variance_arm.c b/vp9/encoder/arm/vp9_variance_arm.c deleted file mode 100644 index 91c0236e3..000000000 --- a/vp9/encoder/arm/vp9_variance_arm.c +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vpx_config.h" -#include "vp9/encoder/vp9_variance.h" -#include "vp9/common/vp9_filter.h" -#include "vp9/common/arm/vp9_bilinearfilter_arm.h" - -#define HALFNDX 8 - -#if HAVE_ARMV6 - -unsigned int vp9_sub_pixel_variance8x8_armv6 -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - unsigned short first_pass[10 * 8]; - unsigned char second_pass[8 * 8]; - const short *HFilter, *VFilter; - - HFilter = vp8_bilinear_filters[xoffset]; - VFilter = vp8_bilinear_filters[yoffset]; - - vp9_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass, - src_pixels_per_line, - 9, 8, HFilter); - vp9_filter_block2d_bil_second_pass_armv6(first_pass, second_pass, - 8, 8, 8, VFilter); - - return vp9_variance8x8_armv6(second_pass, 8, dst_ptr, - dst_pixels_per_line, sse); -} - -unsigned int vp9_sub_pixel_variance16x16_armv6 -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - unsigned short first_pass[36 * 16]; - unsigned char second_pass[20 * 16]; - const short *HFilter, *VFilter; - unsigned int var; - - if (xoffset == HALFNDX && yoffset == 0) { - var = vp9_variance_halfpixvar16x16_h_armv6(src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, sse); - } else if (xoffset == 0 && yoffset == HALFNDX) { - var = vp9_variance_halfpixvar16x16_v_armv6(src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, sse); - } else if (xoffset == HALFNDX && yoffset == HALFNDX) { - var = vp9_variance_halfpixvar16x16_hv_armv6(src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, sse); - } else { - HFilter = vp8_bilinear_filters[xoffset]; - VFilter = vp8_bilinear_filters[yoffset]; - - vp9_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass, - src_pixels_per_line, - 17, 16, HFilter); - vp9_filter_block2d_bil_second_pass_armv6(first_pass, second_pass, - 16, 16, 16, VFilter); - - var = vp9_variance16x16_armv6(second_pass, 16, dst_ptr, - dst_pixels_per_line, sse); - } - return var; -} - -#endif /* HAVE_ARMV6 */ - - -#if HAVE_ARMV7 - -unsigned int vp9_sub_pixel_variance16x16_neon -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - if (xoffset == HALFNDX && yoffset == 0) - return vp9_variance_halfpixvar16x16_h_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); - else if (xoffset == 0 && yoffset == HALFNDX) - return vp9_variance_halfpixvar16x16_v_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); - else if (xoffset == HALFNDX && yoffset == HALFNDX) - return vp9_variance_halfpixvar16x16_hv_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); - else - return vp9_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); -} - -#endif diff --git a/vp9/encoder/arm/vp9_variance_arm.h b/vp9/encoder/arm/vp9_variance_arm.h deleted file mode 100644 index 144feea3d..000000000 --- a/vp9/encoder/arm/vp9_variance_arm.h +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef VP9_ENCODER_ARM_VP9_VARIANCE_ARM_H_ -#define VP9_ENCODER_ARM_VP9_VARIANCE_ARM_H_ - -#if HAVE_ARMV6 - -extern prototype_sad(vp9_sad16x16_armv6); -extern prototype_variance(vp9_variance16x16_armv6); -extern prototype_variance(vp9_variance8x8_armv6); -extern prototype_subpixvariance(vp9_sub_pixel_variance16x16_armv6); -extern prototype_subpixvariance(vp9_sub_pixel_variance8x8_armv6); -extern prototype_variance(vp9_variance_halfpixvar16x16_h_armv6); -extern prototype_variance(vp9_variance_halfpixvar16x16_v_armv6); -extern prototype_variance(vp9_variance_halfpixvar16x16_hv_armv6); -extern prototype_variance(vp9_mse16x16_armv6); - -#if !CONFIG_RUNTIME_CPU_DETECT - -#undef vp9_variance_sad16x16 -#define vp9_variance_sad16x16 vp9_sad16x16_armv6 - -#undef vp9_variance_subpixvar16x16 -#define vp9_variance_subpixvar16x16 vp9_sub_pixel_variance16x16_armv6 - -#undef vp9_variance_subpixvar8x8 -#define vp9_variance_subpixvar8x8 vp9_sub_pixel_variance8x8_armv6 - -#undef vp9_variance_var16x16 -#define vp9_variance_var16x16 vp9_variance16x16_armv6 - -#undef vp9_variance_mse16x16 -#define vp9_variance_mse16x16 vp9_mse16x16_armv6 - -#undef vp9_variance_var8x8 -#define vp9_variance_var8x8 vp9_variance8x8_armv6 - -#undef vp9_variance_halfpixvar16x16_h -#define vp9_variance_halfpixvar16x16_h vp9_variance_halfpixvar16x16_h_armv6 - -#undef vp9_variance_halfpixvar16x16_v -#define vp9_variance_halfpixvar16x16_v vp9_variance_halfpixvar16x16_v_armv6 - -#undef vp9_variance_halfpixvar16x16_hv -#define vp9_variance_halfpixvar16x16_hv vp9_variance_halfpixvar16x16_hv_armv6 - -#endif /* !CONFIG_RUNTIME_CPU_DETECT */ - -#endif /* HAVE_ARMV6 */ - - -#if HAVE_ARMV7 -extern prototype_sad(vp9_sad4x4_neon); -extern prototype_sad(vp9_sad8x8_neon); -extern prototype_sad(vp9_sad8x16_neon); -extern prototype_sad(vp9_sad16x8_neon); -extern prototype_sad(vp9_sad16x16_neon); - -extern prototype_variance(vp9_variance8x8_neon); -extern prototype_variance(vp9_variance8x16_neon); -extern prototype_variance(vp9_variance16x8_neon); -extern prototype_variance(vp9_variance16x16_neon); - -extern prototype_subpixvariance(vp9_sub_pixel_variance8x8_neon); -extern prototype_subpixvariance(vp9_sub_pixel_variance16x16_neon); -extern prototype_subpixvariance(vp9_sub_pixel_variance16x16_neon_func); -extern prototype_variance(vp9_variance_halfpixvar16x16_h_neon); -extern prototype_variance(vp9_variance_halfpixvar16x16_v_neon); -extern prototype_variance(vp9_variance_halfpixvar16x16_hv_neon); - -extern prototype_variance(vp9_mse16x16_neon); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp9_variance_sad4x4 -#define vp9_variance_sad4x4 vp9_sad4x4_neon - -#undef vp9_variance_sad8x8 -#define vp9_variance_sad8x8 vp9_sad8x8_neon - -#undef vp9_variance_sad8x16 -#define vp9_variance_sad8x16 vp9_sad8x16_neon - -#undef vp9_variance_sad16x8 -#define vp9_variance_sad16x8 vp9_sad16x8_neon - -#undef vp9_variance_sad16x16 -#define vp9_variance_sad16x16 vp9_sad16x16_neon - -#undef vp9_variance_var8x8 -#define vp9_variance_var8x8 vp9_variance8x8_neon - -#undef vp9_variance_var8x16 -#define vp9_variance_var8x16 vp9_variance8x16_neon - -#undef vp9_variance_var16x8 -#define vp9_variance_var16x8 vp9_variance16x8_neon - -#undef vp9_variance_var16x16 -#define vp9_variance_var16x16 vp9_variance16x16_neon - -#undef vp9_variance_subpixvar8x8 -#define vp9_variance_subpixvar8x8 vp9_sub_pixel_variance8x8_neon - -#undef vp9_variance_subpixvar16x16 -#define vp9_variance_subpixvar16x16 vp9_sub_pixel_variance16x16_neon - -#undef vp9_variance_halfpixvar16x16_h -#define vp9_variance_halfpixvar16x16_h vp9_variance_halfpixvar16x16_h_neon - -#undef vp9_variance_halfpixvar16x16_v -#define vp9_variance_halfpixvar16x16_v vp9_variance_halfpixvar16x16_v_neon - -#undef vp9_variance_halfpixvar16x16_hv -#define vp9_variance_halfpixvar16x16_hv vp9_variance_halfpixvar16x16_hv_neon - -#undef vp9_variance_mse16x16 -#define vp9_variance_mse16x16 vp9_mse16x16_neon - -#endif - -#endif - -#endif diff --git a/vp9/encoder/vp9_asm_enc_offsets.c b/vp9/encoder/vp9_asm_enc_offsets.c index 3fe9c8fb7..30431ff8c 100644 --- a/vp9/encoder/vp9_asm_enc_offsets.c +++ b/vp9/encoder/vp9_asm_enc_offsets.c @@ -79,12 +79,4 @@ END /* add asserts for any offset that is not supported by assembly code * add asserts for any size that is not supported by assembly code - - * These are used in vp8cx_pack_tokens. They are hard coded so if their sizes - * change they will have to be adjusted. */ - -#if HAVE_ARMV5TE -ct_assert(TOKENEXTRA_SZ, sizeof(TOKENEXTRA) == 8) -ct_assert(vp9_extra_bit_struct_sz, sizeof(vp9_extra_bit_struct) == 16) -#endif diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c index f94e00c1e..4270a1d35 100644 --- a/vp9/encoder/vp9_onyx_if.c +++ b/vp9/encoder/vp9_onyx_if.c @@ -40,24 +40,12 @@ #include "vp9/common/vp9_mvref_common.h" #include "vp9/encoder/vp9_temporal_filter.h" -#if ARCH_ARM -#include "vpx_ports/arm.h" -#endif - #include #include #include extern void print_tree_update_probs(); -#if HAVE_ARMV7 -extern void vp8_yv12_copy_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, - YV12_BUFFER_CONFIG *dst_ybc); - -extern void vp8_yv12_copy_src_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, - YV12_BUFFER_CONFIG *dst_ybc); -#endif - static void set_default_lf_deltas(VP9_COMP *cpi); #define DEFAULT_INTERP_FILTER EIGHTTAP /* SWITCHABLE for better performance */ @@ -4055,33 +4043,15 @@ static void Pass2Encode(VP9_COMP *cpi, unsigned long *size, } } -// For ARM NEON, d8-d15 are callee-saved registers, and need to be saved by us. -#if HAVE_ARMV7 -extern void vp9_push_neon(int64_t *store); -extern void vp9_pop_neon(int64_t *store); -#endif - int vp9_receive_raw_frame(VP9_PTR ptr, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time) { -#if HAVE_ARMV7 - int64_t store_reg[8]; -#endif VP9_COMP *cpi = (VP9_COMP *) ptr; VP9_COMMON *cm = &cpi->common; struct vpx_usec_timer timer; int res = 0; -#if HAVE_ARMV7 -#if CONFIG_RUNTIME_CPU_DETECT - if (cm->rtcd.flags & HAS_NEON) -#endif - { - vp9_push_neon(store_reg); - } -#endif - vpx_usec_timer_start(&timer); if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags, cpi->active_map_enabled ? cpi->active_map : NULL)) @@ -4090,15 +4060,6 @@ int vp9_receive_raw_frame(VP9_PTR ptr, unsigned int frame_flags, vpx_usec_timer_mark(&timer); cpi->time_receive_data += vpx_usec_timer_elapsed(&timer); -#if HAVE_ARMV7 -#if CONFIG_RUNTIME_CPU_DETECT - if (cm->rtcd.flags & HAS_NEON) -#endif - { - vp9_pop_neon(store_reg); - } -#endif - return res; } @@ -4119,9 +4080,6 @@ static int frame_is_reference(const VP9_COMP *cpi) { int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, int64_t *time_stamp, int64_t *time_end, int flush) { -#if HAVE_ARMV7 - int64_t store_reg[8]; -#endif VP9_COMP *cpi = (VP9_COMP *) ptr; VP9_COMMON *cm = &cpi->common; struct vpx_usec_timer cmptimer; @@ -4130,15 +4088,6 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, if (!cpi) return -1; -#if HAVE_ARMV7 -#if CONFIG_RUNTIME_CPU_DETECT - if (cm->rtcd.flags & HAS_NEON) -#endif - { - vp9_push_neon(store_reg); - } -#endif - vpx_usec_timer_start(&cmptimer); cpi->source = NULL; @@ -4191,14 +4140,6 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, cpi->twopass.first_pass_done = 1; } -#if HAVE_ARMV7 -#if CONFIG_RUNTIME_CPU_DETECT - if (cm->rtcd.flags & HAS_NEON) -#endif - { - vp9_pop_neon(store_reg); - } -#endif return -1; } @@ -4425,15 +4366,6 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, #endif -#if HAVE_ARMV7 -#if CONFIG_RUNTIME_CPU_DETECT - if (cm->rtcd.flags & HAS_NEON) -#endif - { - vp9_pop_neon(store_reg); - } -#endif - return 0; } diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c index 824951afa..f10fb3a1d 100644 --- a/vp9/encoder/vp9_picklpf.c +++ b/vp9/encoder/vp9_picklpf.c @@ -17,13 +17,6 @@ #include "vpx_scale/vpxscale.h" #include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_loopfilter.h" -#if ARCH_ARM -#include "vpx_ports/arm.h" -#endif - -#if HAVE_ARMV7 -extern void vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); -#endif void vp9_yv12_copy_partial_frame_c(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction) { @@ -254,22 +247,7 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { int Bias = 0; // Bias against raising loop filter and in favour of lowering it // Make a copy of the unfiltered / processed recon buffer -#if HAVE_ARMV7 -#if CONFIG_RUNTIME_CPU_DETECT - if (cm->rtcd.flags & HAS_NEON) -#endif - { - vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(cm->frame_to_show, &cpi->last_frame_uf); - } -#if CONFIG_RUNTIME_CPU_DETECT - else -#endif -#endif -#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT - { - vp8_yv12_copy_frame(cm->frame_to_show, &cpi->last_frame_uf); - } -#endif + vp8_yv12_copy_frame(cm->frame_to_show, &cpi->last_frame_uf); if (cm->frame_type == KEY_FRAME) cm->sharpness_level = 0; @@ -295,22 +273,7 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { filt_best = filt_mid; // Re-instate the unfiltered frame -#if HAVE_ARMV7 -#if CONFIG_RUNTIME_CPU_DETECT - if (cm->rtcd.flags & HAS_NEON) -#endif - { - vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show); - } -#if CONFIG_RUNTIME_CPU_DETECT - else -#endif -#endif -#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT - { - vp8_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); - } -#endif + vp8_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); while (filter_step > 0) { Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; // PGW change 12/12/06 for small images @@ -334,22 +297,7 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { filt_err = vp9_calc_ss_err(sd, cm->frame_to_show); // Re-instate the unfiltered frame -#if HAVE_ARMV7 -#if CONFIG_RUNTIME_CPU_DETECT - if (cm->rtcd.flags & HAS_NEON) -#endif - { - vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show); - } -#if CONFIG_RUNTIME_CPU_DETECT - else -#endif -#endif -#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT - { - vp8_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); - } -#endif + vp8_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); // If value is close to the best so far then bias towards a lower loop filter value. if ((filt_err - Bias) < best_err) { @@ -369,22 +317,7 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { filt_err = vp9_calc_ss_err(sd, cm->frame_to_show); // Re-instate the unfiltered frame -#if HAVE_ARMV7 -#if CONFIG_RUNTIME_CPU_DETECT - if (cm->rtcd.flags & HAS_NEON) -#endif - { - vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show); - } -#if CONFIG_RUNTIME_CPU_DETECT - else -#endif -#endif -#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT - { - vp8_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); - } -#endif + vp8_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); // Was it better than the previous best? if (filt_err < (best_err - Bias)) { @@ -405,4 +338,3 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi) { cm->filter_level = filt_best; } - diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h index d801ca74b..dd11e75ba 100644 --- a/vp9/encoder/vp9_quantize.h +++ b/vp9/encoder/vp9_quantize.h @@ -26,10 +26,6 @@ #include "x86/vp9_quantize_x86.h" #endif -#if ARCH_ARM -#include "arm/vp9_quantize_arm.h" -#endif - #define prototype_quantize_block_type(sym) \ void (sym)(BLOCK *b, BLOCKD *d, TX_TYPE type) extern prototype_quantize_block_type(vp9_ht_quantize_b_4x4); diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index 83ce9edf9..5d2fe6ff9 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -128,60 +128,6 @@ vp9/common/x86/vp9_loopfilter_x86.c.d: CFLAGS += -msse2 vp9/common/x86/vp9_sadmxn_x86.c.d: CFLAGS += -msse2 endif -VP9_COMMON_SRCS-$(ARCH_ARM) += common/arm/vp9_arm_systemdependent.c -VP9_COMMON_SRCS-$(ARCH_ARM) += common/arm/vp9_bilinearfilter_arm.c -VP9_COMMON_SRCS-$(ARCH_ARM) += common/arm/vp9_bilinearfilter_arm.h -VP9_COMMON_SRCS-$(ARCH_ARM) += common/arm/vp9_filter_arm.c -VP9_COMMON_SRCS-$(ARCH_ARM) += common/arm/vp9_idct_arm.h -VP9_COMMON_SRCS-$(ARCH_ARM) += common/arm/vp9_loopfilter_arm.c -VP9_COMMON_SRCS-$(ARCH_ARM) += common/arm/vp9_loopfilter_arm.h -VP9_COMMON_SRCS-$(ARCH_ARM) += common/arm/vp9_recon_arm.h -VP9_COMMON_SRCS-$(ARCH_ARM) += common/arm/vp9_reconintra_arm.c -VP9_COMMON_SRCS-$(ARCH_ARM) += common/arm/vp9_subpixel_arm.h - -# common (armv6) -VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/bilinearfilter_v6$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem8x4_v6$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem8x8_v6$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/copymem16x16_v6$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/dc_only_idct_add_v6$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/iwalsh_v6$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/filter_v6$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/idct_v6$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/loopfilter_v6$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/recon_v6$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/simpleloopfilter_v6$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/sixtappredict8x4_v6$(ASM) - -# common (neon) -VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/bilinearpredict4x4_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/bilinearpredict8x4_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/bilinearpredict8x8_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/bilinearpredict16x16_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/copymem8x4_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/copymem8x8_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/copymem16x16_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/dc_only_idct_add_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/iwalsh_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfilter_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfiltersimplehorizontaledge_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/mbloopfilter_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon2b_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon4b_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/reconb_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/shortidct4x4llm_1_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/shortidct4x4llm_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict4x4_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict8x4_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict8x8_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict16x16_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon16x16mb_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/buildintrapredictorsmby_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/save_neon_reg$(ASM) -VP9_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/vp9_recon_neon.c - - $(eval $(call asm_offsets_template,\ vp9_asm_com_offsets.asm, $(VP9_PREFIX)common/vp9_asm_com_offsets.c)) diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index d20e79a6c..4ab0a9696 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -15,10 +15,6 @@ VP9_CX_SRCS-no += $(VP9_COMMON_SRCS-no) VP9_CX_SRCS_REMOVE-yes += $(VP9_COMMON_SRCS_REMOVE-yes) VP9_CX_SRCS_REMOVE-no += $(VP9_COMMON_SRCS_REMOVE-no) -ifeq ($(ARCH_ARM),yes) - include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9cx_arm.mk -endif - VP9_CX_SRCS-yes += vp9_cx_iface.c # encoder diff --git a/vp9/vp9cx_arm.mk b/vp9/vp9cx_arm.mk deleted file mode 100644 index d0108a84e..000000000 --- a/vp9/vp9cx_arm.mk +++ /dev/null @@ -1,63 +0,0 @@ -## -## Copyright (c) 2010 The WebM project authors. All Rights Reserved. -## -## Use of this source code is governed by a BSD-style license -## that can be found in the LICENSE file in the root of the source -## tree. An additional intellectual property rights grant can be found -## in the file PATENTS. All contributing project authors may -## be found in the AUTHORS file in the root of the source tree. -## - - -#VP9_CX_SRCS list is modified according to different platforms. - -#File list for arm -# encoder -VP9_CX_SRCS-$(ARCH_ARM) += encoder/arm/vp9_arm_csystemdependent.c - -VP9_CX_SRCS-$(ARCH_ARM) += encoder/arm/vp9_dct_arm.c -VP9_CX_SRCS-$(ARCH_ARM) += encoder/arm/vp9_dct_arm.h -VP9_CX_SRCS-$(ARCH_ARM) += encoder/arm/vp9_encodemb_arm.h -VP9_CX_SRCS-$(ARCH_ARM) += encoder/arm/vp9_quantize_arm.c -VP9_CX_SRCS-$(ARCH_ARM) += encoder/arm/vp9_quantize_arm.h -VP9_CX_SRCS-$(ARCH_ARM) += encoder/arm/vp9_variance_arm.c -VP9_CX_SRCS-$(ARCH_ARM) += encoder/arm/vp9_variance_arm.h - -#File list for armv5te -# encoder -VP9_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/vp9_boolhuff_arm.c -VP9_CX_SRCS_REMOVE-$(HAVE_ARMV5TE) += encoder/vp9_boolhuff.c -VP9_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/armv5te/boolhuff_armv5te$(ASM) -VP9_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/armv5te/vp8_packtokens_armv5$(ASM) -VP9_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/armv5te/vp8_packtokens_mbrow_armv5$(ASM) -VP9_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/armv5te/vp8_packtokens_partitions_armv5$(ASM) - -#File list for armv6 -# encoder -VP9_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_subtract_armv6$(ASM) -VP9_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM) -VP9_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_fast_quantize_b_armv6$(ASM) -VP9_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_sad16x16_armv6$(ASM) -VP9_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM) -VP9_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM) -VP9_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM) -VP9_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM) -VP9_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM) -VP9_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance8x8_armv6$(ASM) -VP9_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/walsh_v6$(ASM) - -#File list for neon -# encoder -VP9_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/fastquantizeb_neon$(ASM) -VP9_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/vp9_picklpf_arm.c -VP9_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/sad8_neon$(ASM) -VP9_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/sad16_neon$(ASM) -VP9_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/shortfdct_neon$(ASM) -VP9_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/subtract_neon$(ASM) -VP9_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/variance_neon$(ASM) -VP9_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/vp8_mse16x16_neon$(ASM) -VP9_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/vp8_subpixelvariance8x8_neon$(ASM) -VP9_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/vp8_subpixelvariance16x16_neon$(ASM) -VP9_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/vp8_subpixelvariance16x16s_neon$(ASM) -VP9_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/vp8_memcpy_neon$(ASM) -VP9_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/neon/vp8_shortwalsh4x4_neon$(ASM) diff --git a/vp9/vp9dx.mk b/vp9/vp9dx.mk index 54af58d7f..004039016 100644 --- a/vp9/vp9dx.mk +++ b/vp9/vp9dx.mk @@ -15,36 +15,8 @@ VP9_DX_SRCS-no += $(VP9_COMMON_SRCS-no) VP9_DX_SRCS_REMOVE-yes += $(VP9_COMMON_SRCS_REMOVE-yes) VP9_DX_SRCS_REMOVE-no += $(VP9_COMMON_SRCS_REMOVE-no) -ifeq ($(ARCH_ARM),yes) - include $(SRC_PATH_BARE)/$(VP9_PREFIX)vp9dx_arm.mk -endif - VP9_DX_SRCS-yes += vp9_dx_iface.c -# common -#define ARM -#define DISABLE_THREAD - -#INCLUDES += algo/vpx_common/vpx_mem/include -#INCLUDES += common -#INCLUDES += common -#INCLUDES += common -#INCLUDES += common -#INCLUDES += decoder - - - -# decoder -#define ARM -#define DISABLE_THREAD - -#INCLUDES += algo/vpx_common/vpx_mem/include -#INCLUDES += common -#INCLUDES += common -#INCLUDES += common -#INCLUDES += common -#INCLUDES += decoder - VP9_DX_SRCS-yes += decoder/vp9_asm_dec_offsets.c VP9_DX_SRCS-yes += decoder/vp9_dboolhuff.c VP9_DX_SRCS-yes += decoder/vp9_decodemv.c diff --git a/vp9/vp9dx_arm.mk b/vp9/vp9dx_arm.mk deleted file mode 100644 index 32ec26afa..000000000 --- a/vp9/vp9dx_arm.mk +++ /dev/null @@ -1,29 +0,0 @@ -## -## Copyright (c) 2010 The WebM project authors. All Rights Reserved. -## -## Use of this source code is governed by a BSD-style license -## that can be found in the LICENSE file in the root of the source -## tree. An additional intellectual property rights grant can be found -## in the file PATENTS. All contributing project authors may -## be found in the AUTHORS file in the root of the source tree. -## - - -#VP8_DX_SRCS list is modified according to different platforms. - -VP8_DX_SRCS-$(ARCH_ARM) += decoder/arm/vp9_dequantize_arm.c - -#File list for armv6 -VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_dc_idct_v6$(ASM) -VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequant_idct_v6$(ASM) -VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/dequantize_v6$(ASM) -VP8_DX_SRCS-$(HAVE_ARMV6) += decoder/arm/armv6/vp9_idct_blk_v6.c - -#File list for neon -VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_dc_full_2x_neon$(ASM) -VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_dc_0_2x_neon$(ASM) -VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequant_idct_neon$(ASM) -VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_full_2x_neon$(ASM) -VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/idct_dequant_0_2x_neon$(ASM) -VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/dequantizeb_neon$(ASM) -VP8_DX_SRCS-$(HAVE_ARMV7) += decoder/arm/neon/vp9_idct_blk_neon.c