Bug 1148639 - Update libvpx. r=kinetik

From b40321d9379faaa62fa16d6d412098fca115042d Mon Sep 17 00:00:00 2001
Result of running ./update.py --ndk ~/android/android-ndk-r9 --commit afad1a84c15b9af8298a37c0fa449e0af40931fd
Ralph Giles 2015-03-27 14:56:04 -07:00
Parent c536fc79d4
Commit 177b274c46
406 changed files: 53972 additions and 59908 deletions

View File

@ -1,23 +1,22 @@
Additional IP Rights Grant (Patents)
------------------------------------
"These implementations" means the copyrightable works that implement the WebM
codecs distributed by Google as part of the WebM Project.
"This implementation" means the copyrightable works distributed by
Google as part of the WebM Project.
Google hereby grants to you a perpetual, worldwide, non-exclusive, no-charge,
royalty-free, irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and otherwise
run, modify and propagate the contents of these implementations of WebM, where
such license applies only to those patent claims, both currently owned by
Google and acquired in the future, licensable by Google that are necessarily
infringed by these implementations of WebM. This grant does not include claims
that would be infringed only as a consequence of further modification of these
implementations. If you or your agent or exclusive licensee institute or order
or agree to the institution of patent litigation or any other patent
enforcement activity against any entity (including a cross-claim or
counterclaim in a lawsuit) alleging that any of these implementations of WebM
or any code incorporated within any of these implementations of WebM
constitutes direct or contributory patent infringement, or inducement of
patent infringement, then any patent rights granted to you under this License
for these implementations of WebM shall terminate as of the date such
litigation is filed.
Google hereby grants to you a perpetual, worldwide, non-exclusive,
no-charge, royalty-free, irrevocable (except as stated in this section)
patent license to make, have made, use, offer to sell, sell, import,
transfer, and otherwise run, modify and propagate the contents of this
implementation of VP8, where such license applies only to those patent
claims, both currently owned by Google and acquired in the future,
licensable by Google that are necessarily infringed by this
implementation of VP8. This grant does not include claims that would be
infringed only as a consequence of further modification of this
implementation. If you or your agent or exclusive licensee institute or
order or agree to the institution of patent litigation against any
entity (including a cross-claim or counterclaim in a lawsuit) alleging
that this implementation of VP8 or any code incorporated within this
implementation of VP8 constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any patent
rights granted to you under this License for this implementation of VP8
shall terminate as of the date such litigation is filed.

View File

@ -8,4 +8,4 @@ The libvpx git repository is:
https://gerrit.chromium.org/gerrit/webm/libvpx
The git commit ID used was 587ff646f
The git commit ID used was afad1a84c15b9af8298a37c0fa449e0af40931fd

View File

@ -34,18 +34,6 @@ int log_msg(const char *fmt, ...) {
}
#if defined(__GNUC__) && __GNUC__
#if defined(FORCE_PARSE_ELF)
#if defined(__MACH__)
#undef __MACH__
#endif
#if !defined(__ELF__)
#define __ELF__
#endif
#endif
#if defined(__MACH__)
#include <mach-o/loader.h>

View File

@ -1,4 +1,4 @@
#!/usr/bin/env perl
#!/usr/bin/perl
##
## Copyright (c) 2013 The WebM project authors. All Rights Reserved.
##
@ -51,7 +51,7 @@ sub FixThumbInstructions($$)
# Convert register post indexing to a separate add instruction.
# This converts "ldrneb r9, [r0], r2" into "ldrneb r9, [r0]",
# "addne r0, r0, r2".
# "add r0, r2".
s/^(\s*)((ldr|str)(ne)?[bhd]?)(\s+)(\w+),(\s*\w+,)?\s*\[(\w+)\],\s*(\w+)/$1$2$5$6,$7 [$8]\n$1add$4$5$8, $8, $9/g;
# Convert a conditional addition to the pc register into a series of

View File

@ -25,24 +25,34 @@ files = {
'vp8/common/arm/filter_arm.c',
'vp8/common/arm/loopfilter_arm.c',
'vp8/common/arm/neon/bilinearpredict_neon.c',
'vp8/common/arm/neon/copymem_neon.c',
'vp8/common/arm/neon/dc_only_idct_add_neon.c',
'vp8/common/arm/neon/dequant_idct_neon.c',
'vp8/common/arm/neon/dequantizeb_neon.c',
'vp8/common/arm/neon/buildintrapredictorsmby_neon.asm',
'vp8/common/arm/neon/copymem16x16_neon.asm',
'vp8/common/arm/neon/copymem8x4_neon.asm',
'vp8/common/arm/neon/copymem8x8_neon.asm',
'vp8/common/arm/neon/dc_only_idct_add_neon.asm',
'vp8/common/arm/neon/dequant_idct_neon.asm',
'vp8/common/arm/neon/dequantizeb_neon.asm',
'vp8/common/arm/neon/idct_blk_neon.c',
'vp8/common/arm/neon/idct_dequant_0_2x_neon.c',
'vp8/common/arm/neon/idct_dequant_full_2x_neon.c',
'vp8/common/arm/neon/iwalsh_neon.c',
'vp8/common/arm/neon/loopfilter_neon.c',
'vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c',
'vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c',
'vp8/common/arm/neon/mbloopfilter_neon.c',
'vp8/common/arm/neon/reconintra_neon.c',
'vp8/common/arm/neon/sad_neon.c',
'vp8/common/arm/neon/shortidct4x4llm_neon.c',
'vp8/common/arm/neon/sixtappredict_neon.c',
'vp8/common/arm/neon/variance_neon.c',
'vp8/common/arm/neon/vp8_subpixelvariance_neon.c',
'vp8/common/arm/neon/idct_dequant_0_2x_neon.asm',
'vp8/common/arm/neon/idct_dequant_full_2x_neon.asm',
'vp8/common/arm/neon/iwalsh_neon.asm',
'vp8/common/arm/neon/loopfilter_neon.asm',
'vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm',
'vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm',
'vp8/common/arm/neon/mbloopfilter_neon.asm',
'vp8/common/arm/neon/sad16_neon.asm',
'vp8/common/arm/neon/sad8_neon.asm',
'vp8/common/arm/neon/save_reg_neon.asm',
'vp8/common/arm/neon/shortidct4x4llm_neon.asm',
'vp8/common/arm/neon/sixtappredict16x16_neon.asm',
'vp8/common/arm/neon/sixtappredict4x4_neon.asm',
'vp8/common/arm/neon/sixtappredict8x4_neon.asm',
'vp8/common/arm/neon/sixtappredict8x8_neon.asm',
'vp8/common/arm/neon/variance_neon.asm',
'vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm',
'vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm',
'vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm',
'vp8/common/arm/reconintra_arm.c',
'vp8/common/arm/variance_arm.c',
'vp8/encoder/arm/armv5te/boolhuff_armv5te.asm',
'vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm',
@ -57,10 +67,12 @@ files = {
'vp8/encoder/arm/dct_arm.c',
'vp8/encoder/arm/neon/denoising_neon.c',
'vp8/encoder/arm/neon/fastquantizeb_neon.asm',
'vp8/encoder/arm/neon/shortfdct_neon.c',
'vp8/encoder/arm/neon/subtract_neon.c',
'vp8/encoder/arm/neon/vp8_mse16x16_neon.c',
'vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c',
'vp8/encoder/arm/neon/picklpf_arm.c',
'vp8/encoder/arm/neon/shortfdct_neon.asm',
'vp8/encoder/arm/neon/subtract_neon.asm',
'vp8/encoder/arm/neon/vp8_memcpy_neon.asm',
'vp8/encoder/arm/neon/vp8_mse16x16_neon.asm',
'vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm',
'vp8/encoder/arm/quantize_arm.c',
'vp9/common/arm/neon/vp9_avg_neon.asm',
'vp9/common/arm/neon/vp9_convolve8_avg_neon.asm',
@ -85,19 +97,16 @@ files = {
'vp9/common/arm/neon/vp9_mb_lpf_neon.asm',
'vp9/common/arm/neon/vp9_reconintra_neon.asm',
'vp9/common/arm/neon/vp9_save_reg_neon.asm',
'vp9/encoder/arm/neon/vp9_dct_neon.c',
'vp9/encoder/arm/neon/vp9_quantize_neon.c',
'vp9/encoder/arm/neon/vp9_sad_neon.c',
'vp9/encoder/arm/neon/vp9_subtract_neon.c',
'vp9/encoder/arm/neon/vp9_variance_neon.c',
'vpx_ports/arm_cpudetect.c'],
'vpx_ports/arm_cpudetect.c',
'vpx_scale/arm/neon/vp8_vpxyv12_copy_y_neon.asm',
'vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm',
'vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm',
'vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm',
'vpx_scale/arm/neon/yv12extend_arm.c'],
'AVX2': ['vp9/common/x86/vp9_loopfilter_intrin_avx2.c',
'vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c',
'vp9/encoder/x86/vp9_dct32x32_avx2.c',
'vp9/encoder/x86/vp9_dct_avx2.c',
'vp9/encoder/x86/vp9_error_intrin_avx2.c',
'vp9/encoder/x86/vp9_sad4d_intrin_avx2.c',
'vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c',
'vp9/encoder/x86/vp9_variance_avx2.c',
'vp9/encoder/x86/vp9_variance_impl_intrin_avx2.c'],
'ERROR_CONCEALMENT': ['vp8/decoder/error_concealment.c'],
@ -123,6 +132,7 @@ files = {
'vp8/encoder/bitstream.c',
'vp8/encoder/onyx_if.c',
'vp8/vp8_dx_iface.c',
'vp9/common/generic/vp9_systemdependent.c',
'vp9/common/vp9_alloccommon.c',
'vp9/common/vp9_blockd.c',
'vp9/common/vp9_common_data.c',
@ -146,48 +156,39 @@ files = {
'vp9/common/vp9_scale.c',
'vp9/common/vp9_scan.c',
'vp9/common/vp9_seg_common.c',
'vp9/common/vp9_thread.c',
'vp9/common/vp9_tile_common.c',
'vp9/decoder/vp9_decodeframe.c',
'vp9/decoder/vp9_decodemv.c',
'vp9/decoder/vp9_decoder.c',
'vp9/decoder/vp9_detokenize.c',
'vp9/decoder/vp9_dsubexp.c',
'vp9/decoder/vp9_dthread.c',
'vp9/decoder/vp9_onyxd_if.c',
'vp9/decoder/vp9_reader.c',
'vp9/encoder/vp9_aq_complexity.c',
'vp9/encoder/vp9_aq_cyclicrefresh.c',
'vp9/encoder/vp9_aq_variance.c',
'vp9/encoder/vp9_bitstream.c',
'vp9/encoder/vp9_context_tree.c',
'vp9/encoder/vp9_cost.c',
'vp9/encoder/vp9_dct.c',
'vp9/encoder/vp9_encodeframe.c',
'vp9/encoder/vp9_encodemb.c',
'vp9/encoder/vp9_encodemv.c',
'vp9/encoder/vp9_encoder.c',
'vp9/encoder/vp9_extend.c',
'vp9/encoder/vp9_firstpass.c',
'vp9/encoder/vp9_lookahead.c',
'vp9/encoder/vp9_mbgraph.c',
'vp9/encoder/vp9_mcomp.c',
'vp9/encoder/vp9_onyx_if.c',
'vp9/encoder/vp9_picklpf.c',
'vp9/encoder/vp9_pickmode.c',
'vp9/encoder/vp9_quantize.c',
'vp9/encoder/vp9_ratectrl.c',
'vp9/encoder/vp9_rd.c',
'vp9/encoder/vp9_rdopt.c',
'vp9/encoder/vp9_resize.c',
'vp9/encoder/vp9_sad.c',
'vp9/encoder/vp9_segmentation.c',
'vp9/encoder/vp9_speed_features.c',
'vp9/encoder/vp9_subexp.c',
'vp9/encoder/vp9_svc_layercontext.c',
'vp9/encoder/vp9_temporal_filter.c',
'vp9/encoder/vp9_tokenize.c',
'vp9/encoder/vp9_treewriter.c',
'vp9/encoder/vp9_vaq.c',
'vp9/encoder/vp9_variance.c',
'vp9/encoder/vp9_write_bit_buffer.c',
'vp9/encoder/vp9_writer.c',
'vp9/vp9_cx_iface.c',
'vp9/vp9_dx_iface.c',
@ -242,6 +243,7 @@ files = {
'vp8/encoder/mr_dissim.c',
'vp8/encoder/pickinter.c',
'vp8/encoder/picklpf.c',
'vp8/encoder/psnr.c',
'vp8/encoder/quantize.c',
'vp8/encoder/ratectrl.c',
'vp8/encoder/rdopt.c',
@ -250,20 +252,18 @@ files = {
'vp8/encoder/tokenize.c',
'vp8/encoder/treewriter.c',
'vp8/vp8_cx_iface.c',
'vp9/decoder/vp9_read_bit_buffer.c',
'vp9/decoder/vp9_thread.c',
'vp9/encoder/vp9_psnr.c',
'vpx/src/vpx_codec.c',
'vpx/src/vpx_decoder.c',
'vpx/src/vpx_image.c',
'vpx/src/vpx_psnr.c',
'vpx_scale/generic/gen_scalers.c',
'vpx_scale/generic/vpx_scale.c'],
'VP8_POSTPROC': ['vp8/common/mfqe.c', 'vp8/common/postproc.c'],
'VP9_POSTPROC': ['vp9/common/vp9_postproc.c'],
'X86-64_ASM': ['third_party/x86inc/x86inc.asm',
'vp8/common/x86/loopfilter_block_sse2_x86_64.asm',
'vp8/encoder/x86/ssim_opt_x86_64.asm',
'vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm',
'vp9/encoder/x86/vp9_ssim_opt_x86_64.asm'],
'vp8/common/x86/loopfilter_block_sse2.asm',
'vp9/encoder/x86/vp9_quantize_ssse3.asm'],
'X86_ASM': ['vp8/common/x86/dequantize_mmx.asm',
'vp8/common/x86/filter_x86.c',
'vp8/common/x86/idct_blk_mmx.c',
@ -278,6 +278,7 @@ files = {
'vp8/common/x86/mfqe_sse2.asm',
'vp8/common/x86/postproc_mmx.asm',
'vp8/common/x86/postproc_sse2.asm',
'vp8/common/x86/postproc_x86.c',
'vp8/common/x86/recon_mmx.asm',
'vp8/common/x86/recon_sse2.asm',
'vp8/common/x86/recon_wrapper_sse2.c',
@ -303,8 +304,8 @@ files = {
'vp8/encoder/x86/fwalsh_sse2.asm',
'vp8/encoder/x86/quantize_mmx.asm',
'vp8/encoder/x86/quantize_sse2.c',
'vp8/encoder/x86/quantize_sse4.c',
'vp8/encoder/x86/quantize_ssse3.c',
'vp8/encoder/x86/quantize_sse4.asm',
'vp8/encoder/x86/quantize_ssse3.asm',
'vp8/encoder/x86/subtract_mmx.asm',
'vp8/encoder/x86/subtract_sse2.asm',
'vp8/encoder/x86/temporal_filter_apply_sse2.asm',
@ -312,35 +313,31 @@ files = {
'vp8/encoder/x86/vp8_enc_stubs_sse2.c',
'vp9/common/x86/vp9_asm_stubs.c',
'vp9/common/x86/vp9_copy_sse2.asm',
'vp9/common/x86/vp9_high_intrapred_sse2.asm',
'vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c',
'vp9/common/x86/vp9_high_subpixel_8t_sse2.asm',
'vp9/common/x86/vp9_high_subpixel_bilinear_sse2.asm',
'vp9/common/x86/vp9_idct_intrin_sse2.c',
'vp9/common/x86/vp9_idct_intrin_ssse3.c',
'vp9/common/x86/vp9_idct_ssse3_x86_64.asm',
'vp9/common/x86/vp9_intrapred_sse2.asm',
'vp9/common/x86/vp9_intrapred_ssse3.asm',
'vp9/common/x86/vp9_loopfilter_intrin_sse2.c',
'vp9/common/x86/vp9_loopfilter_mmx.asm',
'vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c',
'vp9/common/x86/vp9_subpixel_8t_sse2.asm',
'vp9/common/x86/vp9_subpixel_8t_ssse3.asm',
'vp9/common/x86/vp9_subpixel_bilinear_sse2.asm',
'vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm',
'vp9/encoder/x86/vp9_dct32x32_sse2.c',
'vp9/encoder/x86/vp9_dct_mmx.asm',
'vp9/encoder/x86/vp9_dct_sse2.c',
'vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm',
'vp9/encoder/x86/vp9_error_sse2.asm',
'vp9/encoder/x86/vp9_sad4d_sse2.asm',
'vp9/encoder/x86/vp9_sad_mmx.asm',
'vp9/encoder/x86/vp9_sad_sse2.asm',
'vp9/encoder/x86/vp9_sad_sse3.asm',
'vp9/encoder/x86/vp9_sad_sse4.asm',
'vp9/encoder/x86/vp9_sad_ssse3.asm',
'vp9/encoder/x86/vp9_subpel_variance.asm',
'vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm',
'vp9/encoder/x86/vp9_subtract_sse2.asm',
'vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm',
'vp9/encoder/x86/vp9_variance_impl_mmx.asm',
'vp9/encoder/x86/vp9_variance_impl_sse2.asm',
'vp9/encoder/x86/vp9_variance_mmx.c',
'vp9/encoder/x86/vp9_variance_sse2.c',
'vpx_ports/emms.asm']
}

media/libvpx/third_party/x86inc/x86inc.asm (vendored)
View File

@ -234,10 +234,10 @@ ALIGNMODE k7
%define r%1mp %2
%elif ARCH_X86_64 ; memory
%define r%1m [rsp + stack_offset + %6]
%define r%1mp qword r %+ %1 %+ m
%define r%1mp qword r %+ %1m
%else
%define r%1m [esp + stack_offset + %6]
%define r%1mp dword r %+ %1 %+ m
%define r%1mp dword r %+ %1m
%endif
%define r%1 %2
%endmacro
@ -395,23 +395,6 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%assign n_arg_names %0
%endmacro
%if ARCH_X86_64
%macro ALLOC_STACK 2 ; stack_size, num_regs
%assign %%stack_aligment ((mmsize + 15) & ~15)
%assign stack_size_padded %1
%assign %%reg_num (%2 - 1)
%xdefine rsp_tmp r %+ %%reg_num
mov rsp_tmp, rsp
sub rsp, stack_size_padded
and rsp, ~(%%stack_aligment - 1)
%endmacro
%macro RESTORE_STACK 0 ; reset rsp register
mov rsp, rsp_tmp
%endmacro
%endif
%if WIN64 ; Windows x64 ;=================================================
DECLARE_REG 0, rcx, ecx, cx, cl
@ -609,20 +592,16 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
CAT_XDEFINE cglobaled_, %1, 1
%endif
%xdefine current_function %1
%ifdef CHROMIUM
%ifidn __OUTPUT_FORMAT__,elf
global %1:function hidden
%elifidn __OUTPUT_FORMAT__,elf32
global %1:function hidden
%elifidn __OUTPUT_FORMAT__,elf64
global %1:function hidden
%elifidn __OUTPUT_FORMAT__,macho32
global %1:private_extern
%elifidn __OUTPUT_FORMAT__,macho64
global %1:private_extern
%else
global %1
%endif
%ifidn __OUTPUT_FORMAT__,elf
global %1:function hidden
%elifidn __OUTPUT_FORMAT__,elf32
global %1:function hidden
%elifidn __OUTPUT_FORMAT__,elf64
global %1:function hidden
%elifidn __OUTPUT_FORMAT__,macho32
global %1:private_extern
%elifidn __OUTPUT_FORMAT__,macho64
global %1:private_extern
%else
global %1
%endif

View File

@ -53,7 +53,7 @@ loop
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
subs r8, r8, r5 ; subtract negative differences from sum
subs r8, r8, r5 ; substract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@ -77,7 +77,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
sub r8, r8, r5 ; substract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@ -101,7 +101,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
sub r8, r8, r5 ; substract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@ -127,7 +127,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
sub r8, r8, r5 ; substract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords

View File

@ -51,7 +51,7 @@ loop
orr r8, r8, r10 ; differences of all 4 pixels
; calculate total sum
add r4, r4, r6 ; add positive differences to sum
sub r4, r4, r7 ; subtract negative differences from sum
sub r4, r4, r7 ; substract negative differences from sum
; calculate sse
uxtb16 r7, r8 ; byte (two pixels) to halfwords
@ -77,7 +77,7 @@ loop
; calculate total sum
add r4, r4, r6 ; add positive differences to sum
sub r4, r4, r7 ; subtract negative differences from sum
sub r4, r4, r7 ; substract negative differences from sum
; calculate sse
uxtb16 r7, r8 ; byte (two pixels) to halfwords

View File

@ -58,7 +58,7 @@ loop
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
subs r8, r8, r5 ; subtract negative differences from sum
subs r8, r8, r5 ; substract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@ -89,7 +89,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
sub r8, r8, r5 ; substract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@ -120,7 +120,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
sub r8, r8, r5 ; substract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@ -153,7 +153,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
sub r8, r8, r5 ; substract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords

View File

@ -69,7 +69,7 @@ loop
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
subs r8, r8, r5 ; subtract negative differences from sum
subs r8, r8, r5 ; substract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@ -111,7 +111,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
sub r8, r8, r5 ; substract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@ -153,7 +153,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
sub r8, r8, r5 ; substract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@ -195,7 +195,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
sub r8, r8, r5 ; substract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords

View File

@ -59,7 +59,7 @@ loop
orr r6, r6, r7 ; differences of all 4 pixels
; calculate total sum
adds r8, r8, r4 ; add positive differences to sum
subs r8, r8, r5 ; subtract negative differences from sum
subs r8, r8, r5 ; substract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@ -90,7 +90,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
sub r8, r8, r5 ; substract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@ -121,7 +121,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
sub r8, r8, r5 ; substract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords
@ -154,7 +154,7 @@ loop
; calculate total sum
add r8, r8, r4 ; add positive differences to sum
sub r8, r8, r5 ; subtract negative differences from sum
sub r8, r8, r5 ; substract negative differences from sum
; calculate sse
uxtb16 r5, r6 ; byte (two pixels) to halfwords

View File

@ -12,9 +12,26 @@
#include "vpx_config.h"
#include "vp8/common/blockd.h"
#if HAVE_NEON
extern void vp8_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ);
#endif
#if HAVE_MEDIA
extern void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
#endif
#if HAVE_NEON
void vp8_dequantize_b_neon(BLOCKD *d, short *DQC)
{
short *DQ = d->dqcoeff;
short *Q = d->qcoeff;
vp8_dequantize_b_loop_neon(Q, DQC, DQ);
}
#endif
#if HAVE_MEDIA
void vp8_dequantize_b_v6(BLOCKD *d, short *DQC)
{
short *DQ = d->dqcoeff;
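
The wrappers above feed a 4x4 block's 16 coefficients to a NEON or ARMv6 loop helper that performs elementwise dequantization. As a minimal scalar sketch of what those helpers compute (hypothetical C mirroring libvpx's vp8_dequantize_b_c; not part of the patch):

/* Scalar model of the vp8_dequantize_b_loop_* helpers (hypothetical):
 * dequantize the 16 coefficients of a 4x4 block, DQ[i] = Q[i] * DQC[i]. */
void vp8_dequantize_b_loop_c(short *Q, short *DQC, short *DQ)
{
    int i;
    for (i = 0; i < 16; i++)
        DQ[i] = Q[i] * DQC[i];
}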

View File

@ -34,11 +34,11 @@ typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon;
extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon;
extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;
extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;
extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon;
#endif

View File

@ -10,7 +10,7 @@
#include <arm_neon.h>
static const uint8_t bifilter4_coeff[8][2] = {
static const uint16_t bifilter4_coeff[8][2] = {
{128, 0},
{112, 16},
{ 96, 32},
@ -30,6 +30,7 @@ void vp8_bilinear_predict4x4_neon(
int dst_pitch) {
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8;
uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8;
uint32x2_t d28u32, d29u32, d30u32;
uint8x16_t q1u8, q2u8;
uint16x8_t q1u16, q2u16;
uint16x8_t q7u16, q8u16, q9u16;
@ -38,10 +39,6 @@ void vp8_bilinear_predict4x4_neon(
uint32x2x2_t d0u32x2, d1u32x2, d2u32x2, d3u32x2;
if (xoffset == 0) { // skip_1stpass_filter
uint32x2_t d28u32 = vdup_n_u32(0);
uint32x2_t d29u32 = vdup_n_u32(0);
uint32x2_t d30u32 = vdup_n_u32(0);
d28u32 = vld1_lane_u32((const uint32_t *)src_ptr, d28u32, 0);
src_ptr += src_pixels_per_line;
d28u32 = vld1_lane_u32((const uint32_t *)src_ptr, d28u32, 1);
@ -64,8 +61,8 @@ void vp8_bilinear_predict4x4_neon(
q1u8 = vcombine_u8(d2u8, d3u8);
q2u8 = vcombine_u8(d4u8, d5u8);
d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]);
d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]);
q4u64 = vshrq_n_u64(vreinterpretq_u64_u8(q1u8), 8);
q5u64 = vshrq_n_u64(vreinterpretq_u64_u8(q2u8), 8);
@ -155,8 +152,8 @@ void vp8_bilinear_predict8x4_neon(
q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q5u8 = vld1q_u8(src_ptr);
d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]);
d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]);
q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
@ -245,8 +242,8 @@ void vp8_bilinear_predict8x8_neon(
q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]);
d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]);
q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);

View File

@ -0,0 +1,584 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_build_intra_predictors_mby_neon_func|
EXPORT |vp8_build_intra_predictors_mby_s_neon_func|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *y_buffer
; r1 unsigned char *ypred_ptr
; r2 int y_stride
; r3 int mode
; stack int Up
; stack int Left
|vp8_build_intra_predictors_mby_neon_func| PROC
push {r4-r8, lr}
cmp r3, #0
beq case_dc_pred
cmp r3, #1
beq case_v_pred
cmp r3, #2
beq case_h_pred
cmp r3, #3
beq case_tm_pred
case_dc_pred
ldr r4, [sp, #24] ; Up
ldr r5, [sp, #28] ; Left
; Default the DC average to 128
mov r12, #128
vdup.u8 q0, r12
; Zero out running sum
mov r12, #0
; compute shift and jump
adds r7, r4, r5
beq skip_dc_pred_up_left
; Load above row, if it exists
cmp r4, #0
beq skip_dc_pred_up
sub r6, r0, r2
vld1.8 {q1}, [r6]
vpaddl.u8 q2, q1
vpaddl.u16 q3, q2
vpaddl.u32 q4, q3
vmov.32 r4, d8[0]
vmov.32 r6, d9[0]
add r12, r4, r6
; Move back to integer registers
skip_dc_pred_up
cmp r5, #0
beq skip_dc_pred_left
sub r0, r0, #1
; Load left row, if it exists
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
add r12, r12, r3
add r12, r12, r4
add r12, r12, r5
add r12, r12, r6
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
add r12, r12, r3
add r12, r12, r4
add r12, r12, r5
add r12, r12, r6
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
add r12, r12, r3
add r12, r12, r4
add r12, r12, r5
add r12, r12, r6
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0]
add r12, r12, r3
add r12, r12, r4
add r12, r12, r5
add r12, r12, r6
skip_dc_pred_left
add r7, r7, #3 ; Shift
sub r4, r7, #1
mov r5, #1
add r12, r12, r5, lsl r4
mov r5, r12, lsr r7 ; expected_dc
vdup.u8 q0, r5
skip_dc_pred_up_left
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
pop {r4-r8,pc}
case_v_pred
; Copy down above row
sub r6, r0, r2
vld1.8 {q0}, [r6]
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
vst1.u8 {q0}, [r1]!
pop {r4-r8,pc}
case_h_pred
; Load 4x yleft_col
sub r0, r0, #1
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u8 q0, r3
vdup.u8 q1, r4
vdup.u8 q2, r5
vdup.u8 q3, r6
vst1.u8 {q0}, [r1]!
vst1.u8 {q1}, [r1]!
vst1.u8 {q2}, [r1]!
vst1.u8 {q3}, [r1]!
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u8 q0, r3
vdup.u8 q1, r4
vdup.u8 q2, r5
vdup.u8 q3, r6
vst1.u8 {q0}, [r1]!
vst1.u8 {q1}, [r1]!
vst1.u8 {q2}, [r1]!
vst1.u8 {q3}, [r1]!
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u8 q0, r3
vdup.u8 q1, r4
vdup.u8 q2, r5
vdup.u8 q3, r6
vst1.u8 {q0}, [r1]!
vst1.u8 {q1}, [r1]!
vst1.u8 {q2}, [r1]!
vst1.u8 {q3}, [r1]!
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u8 q0, r3
vdup.u8 q1, r4
vdup.u8 q2, r5
vdup.u8 q3, r6
vst1.u8 {q0}, [r1]!
vst1.u8 {q1}, [r1]!
vst1.u8 {q2}, [r1]!
vst1.u8 {q3}, [r1]!
pop {r4-r8,pc}
case_tm_pred
; Load yabove_row
sub r3, r0, r2
vld1.8 {q8}, [r3]
; Load ytop_left
sub r3, r3, #1
ldrb r7, [r3]
vdup.u16 q7, r7
; Compute yabove_row - ytop_left
mov r3, #1
vdup.u8 q0, r3
vmull.u8 q4, d16, d0
vmull.u8 q5, d17, d0
vsub.s16 q4, q4, q7
vsub.s16 q5, q5, q7
; Load 4x yleft_col
sub r0, r0, #1
mov r12, #4
case_tm_pred_loop
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u16 q0, r3
vdup.u16 q1, r4
vdup.u16 q2, r5
vdup.u16 q3, r6
vqadd.s16 q8, q0, q4
vqadd.s16 q9, q0, q5
vqadd.s16 q10, q1, q4
vqadd.s16 q11, q1, q5
vqadd.s16 q12, q2, q4
vqadd.s16 q13, q2, q5
vqadd.s16 q14, q3, q4
vqadd.s16 q15, q3, q5
vqshrun.s16 d0, q8, #0
vqshrun.s16 d1, q9, #0
vqshrun.s16 d2, q10, #0
vqshrun.s16 d3, q11, #0
vqshrun.s16 d4, q12, #0
vqshrun.s16 d5, q13, #0
vqshrun.s16 d6, q14, #0
vqshrun.s16 d7, q15, #0
vst1.u8 {q0}, [r1]!
vst1.u8 {q1}, [r1]!
vst1.u8 {q2}, [r1]!
vst1.u8 {q3}, [r1]!
subs r12, r12, #1
bne case_tm_pred_loop
pop {r4-r8,pc}
ENDP
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; r0 unsigned char *y_buffer
; r1 unsigned char *ypred_ptr
; r2 int y_stride
; r3 int mode
; stack int Up
; stack int Left
|vp8_build_intra_predictors_mby_s_neon_func| PROC
push {r4-r8, lr}
mov r1, r0 ; unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor;
cmp r3, #0
beq case_dc_pred_s
cmp r3, #1
beq case_v_pred_s
cmp r3, #2
beq case_h_pred_s
cmp r3, #3
beq case_tm_pred_s
case_dc_pred_s
ldr r4, [sp, #24] ; Up
ldr r5, [sp, #28] ; Left
; Default the DC average to 128
mov r12, #128
vdup.u8 q0, r12
; Zero out running sum
mov r12, #0
; compute shift and jump
adds r7, r4, r5
beq skip_dc_pred_up_left_s
; Load above row, if it exists
cmp r4, #0
beq skip_dc_pred_up_s
sub r6, r0, r2
vld1.8 {q1}, [r6]
vpaddl.u8 q2, q1
vpaddl.u16 q3, q2
vpaddl.u32 q4, q3
vmov.32 r4, d8[0]
vmov.32 r6, d9[0]
add r12, r4, r6
; Move back to integer registers
skip_dc_pred_up_s
cmp r5, #0
beq skip_dc_pred_left_s
sub r0, r0, #1
; Load left row, if it exists
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
add r12, r12, r3
add r12, r12, r4
add r12, r12, r5
add r12, r12, r6
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
add r12, r12, r3
add r12, r12, r4
add r12, r12, r5
add r12, r12, r6
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
add r12, r12, r3
add r12, r12, r4
add r12, r12, r5
add r12, r12, r6
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0]
add r12, r12, r3
add r12, r12, r4
add r12, r12, r5
add r12, r12, r6
skip_dc_pred_left_s
add r7, r7, #3 ; Shift
sub r4, r7, #1
mov r5, #1
add r12, r12, r5, lsl r4
mov r5, r12, lsr r7 ; expected_dc
vdup.u8 q0, r5
skip_dc_pred_up_left_s
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
pop {r4-r8,pc}
case_v_pred_s
; Copy down above row
sub r6, r0, r2
vld1.8 {q0}, [r6]
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
vst1.u8 {q0}, [r1], r2
pop {r4-r8,pc}
case_h_pred_s
; Load 4x yleft_col
sub r0, r0, #1
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u8 q0, r3
vdup.u8 q1, r4
vdup.u8 q2, r5
vdup.u8 q3, r6
vst1.u8 {q0}, [r1], r2
vst1.u8 {q1}, [r1], r2
vst1.u8 {q2}, [r1], r2
vst1.u8 {q3}, [r1], r2
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u8 q0, r3
vdup.u8 q1, r4
vdup.u8 q2, r5
vdup.u8 q3, r6
vst1.u8 {q0}, [r1], r2
vst1.u8 {q1}, [r1], r2
vst1.u8 {q2}, [r1], r2
vst1.u8 {q3}, [r1], r2
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u8 q0, r3
vdup.u8 q1, r4
vdup.u8 q2, r5
vdup.u8 q3, r6
vst1.u8 {q0}, [r1], r2
vst1.u8 {q1}, [r1], r2
vst1.u8 {q2}, [r1], r2
vst1.u8 {q3}, [r1], r2
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u8 q0, r3
vdup.u8 q1, r4
vdup.u8 q2, r5
vdup.u8 q3, r6
vst1.u8 {q0}, [r1], r2
vst1.u8 {q1}, [r1], r2
vst1.u8 {q2}, [r1], r2
vst1.u8 {q3}, [r1], r2
pop {r4-r8,pc}
case_tm_pred_s
; Load yabove_row
sub r3, r0, r2
vld1.8 {q8}, [r3]
; Load ytop_left
sub r3, r3, #1
ldrb r7, [r3]
vdup.u16 q7, r7
; Compute yabove_row - ytop_left
mov r3, #1
vdup.u8 q0, r3
vmull.u8 q4, d16, d0
vmull.u8 q5, d17, d0
vsub.s16 q4, q4, q7
vsub.s16 q5, q5, q7
; Load 4x yleft_col
sub r0, r0, #1
mov r12, #4
case_tm_pred_loop_s
ldrb r3, [r0], r2
ldrb r4, [r0], r2
ldrb r5, [r0], r2
ldrb r6, [r0], r2
vdup.u16 q0, r3
vdup.u16 q1, r4
vdup.u16 q2, r5
vdup.u16 q3, r6
vqadd.s16 q8, q0, q4
vqadd.s16 q9, q0, q5
vqadd.s16 q10, q1, q4
vqadd.s16 q11, q1, q5
vqadd.s16 q12, q2, q4
vqadd.s16 q13, q2, q5
vqadd.s16 q14, q3, q4
vqadd.s16 q15, q3, q5
vqshrun.s16 d0, q8, #0
vqshrun.s16 d1, q9, #0
vqshrun.s16 d2, q10, #0
vqshrun.s16 d3, q11, #0
vqshrun.s16 d4, q12, #0
vqshrun.s16 d5, q13, #0
vqshrun.s16 d6, q14, #0
vqshrun.s16 d7, q15, #0
vst1.u8 {q0}, [r1], r2
vst1.u8 {q1}, [r1], r2
vst1.u8 {q2}, [r1], r2
vst1.u8 {q3}, [r1], r2
subs r12, r12, #1
bne case_tm_pred_loop_s
pop {r4-r8,pc}
ENDP
END
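
A scalar model of the case_dc_pred path above may help: when neither the above row nor the left column is available, the predictor is filled with 128; otherwise the available 16-pixel edges are summed and averaged with round-to-nearest. A hedged C sketch (hypothetical helper name; assumes Up and Left are 0/1 flags, as passed by the caller; not part of the patch):

#include <string.h>

/* Scalar model of the case_dc_pred path in
 * vp8_build_intra_predictors_mby_neon_func (sketch, not part of the patch).
 * y points at the macroblock's top-left pixel in the reconstruction buffer. */
static void dc_predict_mby(unsigned char *pred, const unsigned char *y,
                           int stride, int up, int left)
{
    int i, sum = 0, shift, expected_dc = 128;
    if (up || left) {
        if (up)                           /* 16 pixels of the row above */
            for (i = 0; i < 16; i++) sum += y[-stride + i];
        if (left)                         /* 16 pixels of the column to the left */
            for (i = 0; i < 16; i++) sum += y[i * stride - 1];
        shift = 3 + up + left;            /* divide by 16 or 32 */
        expected_dc = (sum + (1 << (shift - 1))) >> shift;
    }
    memset(pred, expected_dc, 16 * 16);   /* fill the 16x16 prediction block */
}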

View File

@ -0,0 +1,59 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_copy_mem16x16_neon|
; ARM
; REQUIRE8
; PRESERVE8
AREA Block, CODE, READONLY ; name this block of code
;void copy_mem16x16_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|vp8_copy_mem16x16_neon| PROC
vld1.u8 {q0}, [r0], r1
vld1.u8 {q1}, [r0], r1
vld1.u8 {q2}, [r0], r1
vst1.u8 {q0}, [r2], r3
vld1.u8 {q3}, [r0], r1
vst1.u8 {q1}, [r2], r3
vld1.u8 {q4}, [r0], r1
vst1.u8 {q2}, [r2], r3
vld1.u8 {q5}, [r0], r1
vst1.u8 {q3}, [r2], r3
vld1.u8 {q6}, [r0], r1
vst1.u8 {q4}, [r2], r3
vld1.u8 {q7}, [r0], r1
vst1.u8 {q5}, [r2], r3
vld1.u8 {q8}, [r0], r1
vst1.u8 {q6}, [r2], r3
vld1.u8 {q9}, [r0], r1
vst1.u8 {q7}, [r2], r3
vld1.u8 {q10}, [r0], r1
vst1.u8 {q8}, [r2], r3
vld1.u8 {q11}, [r0], r1
vst1.u8 {q9}, [r2], r3
vld1.u8 {q12}, [r0], r1
vst1.u8 {q10}, [r2], r3
vld1.u8 {q13}, [r0], r1
vst1.u8 {q11}, [r2], r3
vld1.u8 {q14}, [r0], r1
vst1.u8 {q12}, [r2], r3
vld1.u8 {q15}, [r0], r1
vst1.u8 {q13}, [r2], r3
vst1.u8 {q14}, [r2], r3
vst1.u8 {q15}, [r2], r3
mov pc, lr
ENDP ; |vp8_copy_mem16x16_neon|
END

View File

@ -0,0 +1,34 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_copy_mem8x4_neon|
; ARM
; REQUIRE8
; PRESERVE8
AREA Block, CODE, READONLY ; name this block of code
;void copy_mem8x4_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|vp8_copy_mem8x4_neon| PROC
vld1.u8 {d0}, [r0], r1
vld1.u8 {d1}, [r0], r1
vst1.u8 {d0}, [r2], r3
vld1.u8 {d2}, [r0], r1
vst1.u8 {d1}, [r2], r3
vld1.u8 {d3}, [r0], r1
vst1.u8 {d2}, [r2], r3
vst1.u8 {d3}, [r2], r3
mov pc, lr
ENDP ; |vp8_copy_mem8x4_neon|
END

View File

@ -0,0 +1,43 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_copy_mem8x8_neon|
; ARM
; REQUIRE8
; PRESERVE8
AREA Block, CODE, READONLY ; name this block of code
;void copy_mem8x8_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|vp8_copy_mem8x8_neon| PROC
vld1.u8 {d0}, [r0], r1
vld1.u8 {d1}, [r0], r1
vst1.u8 {d0}, [r2], r3
vld1.u8 {d2}, [r0], r1
vst1.u8 {d1}, [r2], r3
vld1.u8 {d3}, [r0], r1
vst1.u8 {d2}, [r2], r3
vld1.u8 {d4}, [r0], r1
vst1.u8 {d3}, [r2], r3
vld1.u8 {d5}, [r0], r1
vst1.u8 {d4}, [r2], r3
vld1.u8 {d6}, [r0], r1
vst1.u8 {d5}, [r2], r3
vld1.u8 {d7}, [r0], r1
vst1.u8 {d6}, [r2], r3
vst1.u8 {d7}, [r2], r3
mov pc, lr
ENDP ; |vp8_copy_mem8x8_neon|
END

View File

@ -1,59 +0,0 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
void vp8_copy_mem8x4_neon(
unsigned char *src,
int src_stride,
unsigned char *dst,
int dst_stride) {
uint8x8_t vtmp;
int r;
for (r = 0; r < 4; r++) {
vtmp = vld1_u8(src);
vst1_u8(dst, vtmp);
src += src_stride;
dst += dst_stride;
}
}
void vp8_copy_mem8x8_neon(
unsigned char *src,
int src_stride,
unsigned char *dst,
int dst_stride) {
uint8x8_t vtmp;
int r;
for (r = 0; r < 8; r++) {
vtmp = vld1_u8(src);
vst1_u8(dst, vtmp);
src += src_stride;
dst += dst_stride;
}
}
void vp8_copy_mem16x16_neon(
unsigned char *src,
int src_stride,
unsigned char *dst,
int dst_stride) {
int r;
uint8x16_t qtmp;
for (r = 0; r < 16; r++) {
qtmp = vld1q_u8(src);
vst1q_u8(dst, qtmp);
src += src_stride;
dst += dst_stride;
}
}

View File

@ -0,0 +1,54 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
EXPORT |vp8_dc_only_idct_add_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
; int pred_stride, unsigned char *dst_ptr,
; int dst_stride)
; r0 input_dc
; r1 pred_ptr
; r2 pred_stride
; r3 dst_ptr
; sp dst_stride
|vp8_dc_only_idct_add_neon| PROC
add r0, r0, #4
asr r0, r0, #3
ldr r12, [sp]
vdup.16 q0, r0
vld1.32 {d2[0]}, [r1], r2
vld1.32 {d2[1]}, [r1], r2
vld1.32 {d4[0]}, [r1], r2
vld1.32 {d4[1]}, [r1]
vaddw.u8 q1, q0, d2
vaddw.u8 q2, q0, d4
vqmovun.s16 d2, q1
vqmovun.s16 d4, q2
vst1.32 {d2[0]}, [r3], r12
vst1.32 {d2[1]}, [r3], r12
vst1.32 {d4[0]}, [r3], r12
vst1.32 {d4[1]}, [r3]
bx lr
ENDP
END

View File

@ -1,42 +0,0 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
void vp8_dc_only_idct_add_neon(
int16_t input_dc,
unsigned char *pred_ptr,
int pred_stride,
unsigned char *dst_ptr,
int dst_stride) {
int i;
uint16_t a1 = ((input_dc + 4) >> 3);
uint32x2_t d2u32 = vdup_n_u32(0);
uint8x8_t d2u8;
uint16x8_t q1u16;
uint16x8_t qAdd;
qAdd = vdupq_n_u16(a1);
for (i = 0; i < 2; i++) {
d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 0);
pred_ptr += pred_stride;
d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 1);
pred_ptr += pred_stride;
q1u16 = vaddw_u8(qAdd, vreinterpret_u8_u32(d2u32));
d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0);
dst_ptr += dst_stride;
vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1);
dst_ptr += dst_stride;
}
}

View File

@ -0,0 +1,131 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_dequant_idct_add_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp8_dequant_idct_add_neon(short *input, short *dq,
; unsigned char *dest, int stride)
; r0 short *input,
; r1 short *dq,
; r2 unsigned char *dest
; r3 int stride
|vp8_dequant_idct_add_neon| PROC
vld1.16 {q3, q4}, [r0]
vld1.16 {q5, q6}, [r1]
add r1, r2, r3 ; r1 = dest + stride
lsl r3, #1 ; 2x stride
vld1.32 {d14[0]}, [r2], r3
vld1.32 {d14[1]}, [r1], r3
vld1.32 {d15[0]}, [r2]
vld1.32 {d15[1]}, [r1]
adr r12, cospi8sqrt2minus1 ; pointer to the first constant
vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon
vmul.i16 q2, q4, q6
;|short_idct4x4llm_neon| PROC
vld1.16 {d0}, [r12]
vswp d3, d4 ;q2(vp[4] vp[12])
vqdmulh.s16 q3, q2, d0[2]
vqdmulh.s16 q4, q2, d0[0]
vqadd.s16 d12, d2, d3 ;a1
vqsub.s16 d13, d2, d3 ;b1
vshr.s16 q3, q3, #1
vshr.s16 q4, q4, #1
vqadd.s16 q3, q3, q2
vqadd.s16 q4, q4, q2
vqsub.s16 d10, d6, d9 ;c1
vqadd.s16 d11, d7, d8 ;d1
vqadd.s16 d2, d12, d11
vqadd.s16 d3, d13, d10
vqsub.s16 d4, d13, d10
vqsub.s16 d5, d12, d11
vtrn.32 d2, d4
vtrn.32 d3, d5
vtrn.16 d2, d3
vtrn.16 d4, d5
; memset(input, 0, 32) -- 32bytes
vmov.i16 q14, #0
vswp d3, d4
vqdmulh.s16 q3, q2, d0[2]
vqdmulh.s16 q4, q2, d0[0]
vqadd.s16 d12, d2, d3 ;a1
vqsub.s16 d13, d2, d3 ;b1
vmov q15, q14
vshr.s16 q3, q3, #1
vshr.s16 q4, q4, #1
vqadd.s16 q3, q3, q2
vqadd.s16 q4, q4, q2
vqsub.s16 d10, d6, d9 ;c1
vqadd.s16 d11, d7, d8 ;d1
vqadd.s16 d2, d12, d11
vqadd.s16 d3, d13, d10
vqsub.s16 d4, d13, d10
vqsub.s16 d5, d12, d11
vst1.16 {q14, q15}, [r0]
vrshr.s16 d2, d2, #3
vrshr.s16 d3, d3, #3
vrshr.s16 d4, d4, #3
vrshr.s16 d5, d5, #3
vtrn.32 d2, d4
vtrn.32 d3, d5
vtrn.16 d2, d3
vtrn.16 d4, d5
vaddw.u8 q1, q1, d14
vaddw.u8 q2, q2, d15
sub r2, r2, r3
sub r1, r1, r3
vqmovun.s16 d0, q1
vqmovun.s16 d1, q2
vst1.32 {d0[0]}, [r2], r3
vst1.32 {d0[1]}, [r1], r3
vst1.32 {d1[0]}, [r2]
vst1.32 {d1[1]}, [r1]
bx lr
ENDP ; |vp8_dequant_idct_add_neon|
; Constant Pool
cospi8sqrt2minus1 DCD 0x4e7b4e7b
sinpi8sqrt2 DCD 0x8a8c8a8c
END

View File

@ -1,142 +0,0 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
static const int16_t cospi8sqrt2minus1 = 20091;
static const int16_t sinpi8sqrt2 = 35468;
void vp8_dequant_idct_add_neon(
int16_t *input,
int16_t *dq,
unsigned char *dst,
int stride) {
unsigned char *dst0;
int32x2_t d14, d15;
int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
int16x8_t q1, q2, q3, q4, q5, q6;
int16x8_t qEmpty = vdupq_n_s16(0);
int32x2x2_t d2tmp0, d2tmp1;
int16x4x2_t d2tmp2, d2tmp3;
d14 = d15 = vdup_n_s32(0);
// load input
q3 = vld1q_s16(input);
vst1q_s16(input, qEmpty);
input += 8;
q4 = vld1q_s16(input);
vst1q_s16(input, qEmpty);
// load dq
q5 = vld1q_s16(dq);
dq += 8;
q6 = vld1q_s16(dq);
// load src from dst
dst0 = dst;
d14 = vld1_lane_s32((const int32_t *)dst0, d14, 0);
dst0 += stride;
d14 = vld1_lane_s32((const int32_t *)dst0, d14, 1);
dst0 += stride;
d15 = vld1_lane_s32((const int32_t *)dst0, d15, 0);
dst0 += stride;
d15 = vld1_lane_s32((const int32_t *)dst0, d15, 1);
q1 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q3),
vreinterpretq_u16_s16(q5)));
q2 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q4),
vreinterpretq_u16_s16(q6)));
d12 = vqadd_s16(vget_low_s16(q1), vget_low_s16(q2));
d13 = vqsub_s16(vget_low_s16(q1), vget_low_s16(q2));
q2 = vcombine_s16(vget_high_s16(q1), vget_high_s16(q2));
q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
q3 = vshrq_n_s16(q3, 1);
q4 = vshrq_n_s16(q4, 1);
q3 = vqaddq_s16(q3, q2);
q4 = vqaddq_s16(q4, q2);
d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));
d2 = vqadd_s16(d12, d11);
d3 = vqadd_s16(d13, d10);
d4 = vqsub_s16(d13, d10);
d5 = vqsub_s16(d12, d11);
d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
vreinterpret_s16_s32(d2tmp1.val[0]));
d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
vreinterpret_s16_s32(d2tmp1.val[1]));
// loop 2
q2 = vcombine_s16(d2tmp2.val[1], d2tmp3.val[1]);
q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]);
d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]);
q3 = vshrq_n_s16(q3, 1);
q4 = vshrq_n_s16(q4, 1);
q3 = vqaddq_s16(q3, q2);
q4 = vqaddq_s16(q4, q2);
d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));
d2 = vqadd_s16(d12, d11);
d3 = vqadd_s16(d13, d10);
d4 = vqsub_s16(d13, d10);
d5 = vqsub_s16(d12, d11);
d2 = vrshr_n_s16(d2, 3);
d3 = vrshr_n_s16(d3, 3);
d4 = vrshr_n_s16(d4, 3);
d5 = vrshr_n_s16(d5, 3);
d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
vreinterpret_s16_s32(d2tmp1.val[0]));
d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
vreinterpret_s16_s32(d2tmp1.val[1]));
q1 = vcombine_s16(d2tmp2.val[0], d2tmp2.val[1]);
q2 = vcombine_s16(d2tmp3.val[0], d2tmp3.val[1]);
q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1),
vreinterpret_u8_s32(d14)));
q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2),
vreinterpret_u8_s32(d15)));
d14 = vreinterpret_s32_u8(vqmovun_s16(q1));
d15 = vreinterpret_s32_u8(vqmovun_s16(q2));
dst0 = dst;
vst1_lane_s32((int32_t *)dst0, d14, 0);
dst0 += stride;
vst1_lane_s32((int32_t *)dst0, d14, 1);
dst0 += stride;
vst1_lane_s32((int32_t *)dst0, d15, 0);
dst0 += stride;
vst1_lane_s32((int32_t *)dst0, d15, 1);
return;
}

View File

@ -0,0 +1,34 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_dequantize_b_loop_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 short *Q,
; r1 short *DQC
; r2 short *DQ
|vp8_dequantize_b_loop_neon| PROC
vld1.16 {q0, q1}, [r0]
vld1.16 {q2, q3}, [r1]
vmul.i16 q4, q0, q2
vmul.i16 q5, q1, q3
vst1.16 {q4, q5}, [r2]
bx lr
ENDP
END

View File

@ -1,25 +0,0 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "vp8/common/blockd.h"
void vp8_dequantize_b_neon(BLOCKD *d, short *DQC) {
int16x8x2_t qQ, qDQC, qDQ;
qQ = vld2q_s16(d->qcoeff);
qDQC = vld2q_s16(DQC);
qDQ.val[0] = vmulq_s16(qQ.val[0], qDQC.val[0]);
qDQ.val[1] = vmulq_s16(qQ.val[1], qDQC.val[1]);
vst2q_s16(d->dqcoeff, qDQ);
}

View File

@ -0,0 +1,79 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;
EXPORT |idct_dequant_0_2x_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void idct_dequant_0_2x_neon(short *q, short dq,
; unsigned char *dst, int stride);
; r0 *q
; r1 dq
; r2 *dst
; r3 stride
|idct_dequant_0_2x_neon| PROC
push {r4, r5}
add r12, r2, #4
vld1.32 {d2[0]}, [r2], r3
vld1.32 {d8[0]}, [r12], r3
vld1.32 {d2[1]}, [r2], r3
vld1.32 {d8[1]}, [r12], r3
vld1.32 {d4[0]}, [r2], r3
vld1.32 {d10[0]}, [r12], r3
vld1.32 {d4[1]}, [r2], r3
vld1.32 {d10[1]}, [r12], r3
ldrh r12, [r0] ; lo q
ldrh r4, [r0, #32] ; hi q
mov r5, #0
strh r5, [r0]
strh r5, [r0, #32]
sxth r12, r12 ; lo
mul r0, r12, r1
add r0, r0, #4
asr r0, r0, #3
vdup.16 q0, r0
sxth r4, r4 ; hi
mul r0, r4, r1
add r0, r0, #4
asr r0, r0, #3
vdup.16 q3, r0
vaddw.u8 q1, q0, d2 ; lo
vaddw.u8 q2, q0, d4
vaddw.u8 q4, q3, d8 ; hi
vaddw.u8 q5, q3, d10
sub r2, r2, r3, lsl #2 ; dst - 4*stride
add r0, r2, #4
vqmovun.s16 d2, q1 ; lo
vqmovun.s16 d4, q2
vqmovun.s16 d8, q4 ; hi
vqmovun.s16 d10, q5
vst1.32 {d2[0]}, [r2], r3 ; lo
vst1.32 {d8[0]}, [r0], r3 ; hi
vst1.32 {d2[1]}, [r2], r3
vst1.32 {d8[1]}, [r0], r3
vst1.32 {d4[0]}, [r2], r3
vst1.32 {d10[0]}, [r0], r3
vst1.32 {d4[1]}, [r2]
vst1.32 {d10[1]}, [r0]
pop {r4, r5}
bx lr
ENDP ; |idct_dequant_0_2x_neon|
END

View File

@ -1,62 +0,0 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
void idct_dequant_0_2x_neon(
int16_t *q,
int16_t dq,
unsigned char *dst,
int stride) {
unsigned char *dst0;
int i, a0, a1;
int16x8x2_t q2Add;
int32x2_t d2s32, d4s32;
uint8x8_t d2u8, d4u8;
uint16x8_t q1u16, q2u16;
a0 = ((q[0] * dq) + 4) >> 3;
a1 = ((q[16] * dq) + 4) >> 3;
q[0] = q[16] = 0;
q2Add.val[0] = vdupq_n_s16((int16_t)a0);
q2Add.val[1] = vdupq_n_s16((int16_t)a1);
for (i = 0; i < 2; i++, dst += 4) {
dst0 = dst;
d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0);
dst0 += stride;
d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1);
dst0 += stride;
d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0);
dst0 += stride;
d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1);
q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
vreinterpret_u8_s32(d2s32));
q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
vreinterpret_u8_s32(d4s32));
d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
d2s32 = vreinterpret_s32_u8(d2u8);
d4s32 = vreinterpret_s32_u8(d4u8);
dst0 = dst;
vst1_lane_s32((int32_t *)dst0, d2s32, 0);
dst0 += stride;
vst1_lane_s32((int32_t *)dst0, d2s32, 1);
dst0 += stride;
vst1_lane_s32((int32_t *)dst0, d4s32, 0);
dst0 += stride;
vst1_lane_s32((int32_t *)dst0, d4s32, 1);
}
return;
}

View File

@ -0,0 +1,196 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |idct_dequant_full_2x_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void idct_dequant_full_2x_neon(short *q, short *dq,
; unsigned char *dst, int stride);
; r0 *q,
; r1 *dq,
; r2 *dst
; r3 stride
|idct_dequant_full_2x_neon| PROC
vld1.16 {q0, q1}, [r1] ; dq (same l/r)
vld1.16 {q2, q3}, [r0] ; l q
add r0, r0, #32
vld1.16 {q4, q5}, [r0] ; r q
add r12, r2, #4
; interleave the predictors
vld1.32 {d28[0]}, [r2], r3 ; l pre
vld1.32 {d28[1]}, [r12], r3 ; r pre
vld1.32 {d29[0]}, [r2], r3
vld1.32 {d29[1]}, [r12], r3
vld1.32 {d30[0]}, [r2], r3
vld1.32 {d30[1]}, [r12], r3
vld1.32 {d31[0]}, [r2], r3
vld1.32 {d31[1]}, [r12]
adr r1, cospi8sqrt2minus1 ; pointer to the first constant
; dequant: q[i] = q[i] * dq[i]
vmul.i16 q2, q2, q0
vmul.i16 q3, q3, q1
vmul.i16 q4, q4, q0
vmul.i16 q5, q5, q1
vld1.16 {d0}, [r1]
; q2: l0r0 q3: l8r8
; q4: l4r4 q5: l12r12
vswp d5, d8
vswp d7, d10
; _CONSTANTS_ * 4,12 >> 16
; q6: 4 * sinpi : c1/temp1
; q7: 12 * sinpi : d1/temp2
; q8: 4 * cospi
; q9: 12 * cospi
vqdmulh.s16 q6, q4, d0[2] ; sinpi8sqrt2
vqdmulh.s16 q7, q5, d0[2]
vqdmulh.s16 q8, q4, d0[0] ; cospi8sqrt2minus1
vqdmulh.s16 q9, q5, d0[0]
vqadd.s16 q10, q2, q3 ; a1 = 0 + 8
vqsub.s16 q11, q2, q3 ; b1 = 0 - 8
; vqdmulh only accepts signed values. this was a problem because
; our constant had the high bit set, and was treated as a negative value.
; vqdmulh also doubles the value before it shifts by 16. we need to
; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
; so we can shift the constant without losing precision. this avoids
; shifting again afterward, but also avoids the sign issue. win win!
; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
; pre-shift it
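; e.g. for sinpi8sqrt2: vqdmulh computes (a * b * 2) >> 16, so
; (a * 0x4546 * 2) >> 16 == (a * 0x8a8c) >> 16, the intended product,
; while 0x4546 stays a positive s16 (a sketch of the arithmetic)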
vshr.s16 q8, q8, #1
vshr.s16 q9, q9, #1
; q4: 4 + 4 * cospi : d1/temp1
; q5: 12 + 12 * cospi : c1/temp2
vqadd.s16 q4, q4, q8
vqadd.s16 q5, q5, q9
; c1 = temp1 - temp2
; d1 = temp1 + temp2
vqsub.s16 q2, q6, q5
vqadd.s16 q3, q4, q7
; [0]: a1+d1
; [1]: b1+c1
; [2]: b1-c1
; [3]: a1-d1
vqadd.s16 q4, q10, q3
vqadd.s16 q5, q11, q2
vqsub.s16 q6, q11, q2
vqsub.s16 q7, q10, q3
; rotate
vtrn.32 q4, q6
vtrn.32 q5, q7
vtrn.16 q4, q5
vtrn.16 q6, q7
; idct loop 2
; q4: l 0, 4, 8,12 r 0, 4, 8,12
; q5: l 1, 5, 9,13 r 1, 5, 9,13
; q6: l 2, 6,10,14 r 2, 6,10,14
; q7: l 3, 7,11,15 r 3, 7,11,15
; q8: 1 * sinpi : c1/temp1
; q9: 3 * sinpi : d1/temp2
; q10: 1 * cospi
; q11: 3 * cospi
vqdmulh.s16 q8, q5, d0[2] ; sinpi8sqrt2
vqdmulh.s16 q9, q7, d0[2]
vqdmulh.s16 q10, q5, d0[0] ; cospi8sqrt2minus1
vqdmulh.s16 q11, q7, d0[0]
vqadd.s16 q2, q4, q6 ; a1 = 0 + 2
vqsub.s16 q3, q4, q6 ; b1 = 0 - 2
; see note on shifting above
vshr.s16 q10, q10, #1
vshr.s16 q11, q11, #1
; q10: 1 + 1 * cospi : d1/temp1
; q11: 3 + 3 * cospi : c1/temp2
vqadd.s16 q10, q5, q10
vqadd.s16 q11, q7, q11
; q8: c1 = temp1 - temp2
; q9: d1 = temp1 + temp2
vqsub.s16 q8, q8, q11
vqadd.s16 q9, q10, q9
; a1+d1
; b1+c1
; b1-c1
; a1-d1
vqadd.s16 q4, q2, q9
vqadd.s16 q5, q3, q8
vqsub.s16 q6, q3, q8
vqsub.s16 q7, q2, q9
; +4 >> 3 (rounding)
vrshr.s16 q4, q4, #3 ; lo
vrshr.s16 q5, q5, #3
vrshr.s16 q6, q6, #3 ; hi
vrshr.s16 q7, q7, #3
vtrn.32 q4, q6
vtrn.32 q5, q7
vtrn.16 q4, q5
vtrn.16 q6, q7
; adding pre
; input is still packed. pre was read interleaved
vaddw.u8 q4, q4, d28
vaddw.u8 q5, q5, d29
vaddw.u8 q6, q6, d30
vaddw.u8 q7, q7, d31
vmov.i16 q14, #0
vmov q15, q14
vst1.16 {q14, q15}, [r0] ; write over high input
sub r0, r0, #32
vst1.16 {q14, q15}, [r0] ; write over low input
sub r2, r2, r3, lsl #2 ; dst - 4*stride
add r1, r2, #4 ; hi
;saturate and narrow
vqmovun.s16 d0, q4 ; lo
vqmovun.s16 d1, q5
vqmovun.s16 d2, q6 ; hi
vqmovun.s16 d3, q7
vst1.32 {d0[0]}, [r2], r3 ; lo
vst1.32 {d0[1]}, [r1], r3 ; hi
vst1.32 {d1[0]}, [r2], r3
vst1.32 {d1[1]}, [r1], r3
vst1.32 {d2[0]}, [r2], r3
vst1.32 {d2[1]}, [r1], r3
vst1.32 {d3[0]}, [r2]
vst1.32 {d3[1]}, [r1]
bx lr
ENDP ; |idct_dequant_full_2x_neon|
; Constant Pool
cospi8sqrt2minus1 DCD 0x4e7b
; because the lowest bit in 0x8a8c is 0, we can pre-shift this
sinpi8sqrt2 DCD 0x4546
END

View File

@ -1,185 +0,0 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
static const int16_t cospi8sqrt2minus1 = 20091;
static const int16_t sinpi8sqrt2 = 17734;
// because the lowest bit in 0x8a8c is 0, we can pre-shift this
void idct_dequant_full_2x_neon(
int16_t *q,
int16_t *dq,
unsigned char *dst,
int stride) {
unsigned char *dst0, *dst1;
int32x2_t d28, d29, d30, d31;
int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
int16x8_t qEmpty = vdupq_n_s16(0);
int32x4x2_t q2tmp0, q2tmp1;
int16x8x2_t q2tmp2, q2tmp3;
int16x4_t dLow0, dLow1, dHigh0, dHigh1;
d28 = d29 = d30 = d31 = vdup_n_s32(0);
// load dq
q0 = vld1q_s16(dq);
dq += 8;
q1 = vld1q_s16(dq);
// load q
q2 = vld1q_s16(q);
vst1q_s16(q, qEmpty);
q += 8;
q3 = vld1q_s16(q);
vst1q_s16(q, qEmpty);
q += 8;
q4 = vld1q_s16(q);
vst1q_s16(q, qEmpty);
q += 8;
q5 = vld1q_s16(q);
vst1q_s16(q, qEmpty);
// load src from dst
dst0 = dst;
dst1 = dst + 4;
d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0);
dst0 += stride;
d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1);
dst1 += stride;
d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0);
dst0 += stride;
d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1);
dst1 += stride;
d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0);
dst0 += stride;
d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1);
dst1 += stride;
d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0);
d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1);
q2 = vmulq_s16(q2, q0);
q3 = vmulq_s16(q3, q1);
q4 = vmulq_s16(q4, q0);
q5 = vmulq_s16(q5, q1);
// vswp
dLow0 = vget_low_s16(q2);
dHigh0 = vget_high_s16(q2);
dLow1 = vget_low_s16(q4);
dHigh1 = vget_high_s16(q4);
q2 = vcombine_s16(dLow0, dLow1);
q4 = vcombine_s16(dHigh0, dHigh1);
dLow0 = vget_low_s16(q3);
dHigh0 = vget_high_s16(q3);
dLow1 = vget_low_s16(q5);
dHigh1 = vget_high_s16(q5);
q3 = vcombine_s16(dLow0, dLow1);
q5 = vcombine_s16(dHigh0, dHigh1);
q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2);
q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2);
q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1);
q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1);
q10 = vqaddq_s16(q2, q3);
q11 = vqsubq_s16(q2, q3);
q8 = vshrq_n_s16(q8, 1);
q9 = vshrq_n_s16(q9, 1);
q4 = vqaddq_s16(q4, q8);
q5 = vqaddq_s16(q5, q9);
q2 = vqsubq_s16(q6, q5);
q3 = vqaddq_s16(q7, q4);
q4 = vqaddq_s16(q10, q3);
q5 = vqaddq_s16(q11, q2);
q6 = vqsubq_s16(q11, q2);
q7 = vqsubq_s16(q10, q3);
q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
vreinterpretq_s16_s32(q2tmp1.val[0]));
q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
vreinterpretq_s16_s32(q2tmp1.val[1]));
// loop 2
q8 = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2);
q9 = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2);
q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1);
q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1);
q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]);
q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]);
q10 = vshrq_n_s16(q10, 1);
q11 = vshrq_n_s16(q11, 1);
q10 = vqaddq_s16(q2tmp2.val[1], q10);
q11 = vqaddq_s16(q2tmp3.val[1], q11);
q8 = vqsubq_s16(q8, q11);
q9 = vqaddq_s16(q9, q10);
q4 = vqaddq_s16(q2, q9);
q5 = vqaddq_s16(q3, q8);
q6 = vqsubq_s16(q3, q8);
q7 = vqsubq_s16(q2, q9);
q4 = vrshrq_n_s16(q4, 3);
q5 = vrshrq_n_s16(q5, 3);
q6 = vrshrq_n_s16(q6, 3);
q7 = vrshrq_n_s16(q7, 3);
q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
vreinterpretq_s16_s32(q2tmp1.val[0]));
q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
vreinterpretq_s16_s32(q2tmp1.val[1]));
q4 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]),
vreinterpret_u8_s32(d28)));
q5 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]),
vreinterpret_u8_s32(d29)));
q6 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]),
vreinterpret_u8_s32(d30)));
q7 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]),
vreinterpret_u8_s32(d31)));
d28 = vreinterpret_s32_u8(vqmovun_s16(q4));
d29 = vreinterpret_s32_u8(vqmovun_s16(q5));
d30 = vreinterpret_s32_u8(vqmovun_s16(q6));
d31 = vreinterpret_s32_u8(vqmovun_s16(q7));
dst0 = dst;
dst1 = dst + 4;
vst1_lane_s32((int32_t *)dst0, d28, 0);
dst0 += stride;
vst1_lane_s32((int32_t *)dst1, d28, 1);
dst1 += stride;
vst1_lane_s32((int32_t *)dst0, d29, 0);
dst0 += stride;
vst1_lane_s32((int32_t *)dst1, d29, 1);
dst1 += stride;
vst1_lane_s32((int32_t *)dst0, d30, 0);
dst0 += stride;
vst1_lane_s32((int32_t *)dst1, d30, 1);
dst1 += stride;
vst1_lane_s32((int32_t *)dst0, d31, 0);
vst1_lane_s32((int32_t *)dst1, d31, 1);
return;
}
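For reference, this is a scalar rendering of the 4-point pass that the code above applies once per dimension. It is a sketch assuming the standard VP8 inverse-DCT formulation rather than code taken from this diff; the helper name is invented, and the sketch omits the dequantization, final rounding and prediction add.

#include <stdint.h>

static void idct4_pass_sketch(const int16_t in[4], int16_t out[4]) {
    const int a1 = in[0] + in[2];
    const int b1 = in[0] - in[2];
    /* temp1/temp2 use the same Q16 constants as the NEON code:
     * (x * 35468) >> 16       ~= x * sin(pi/8) * sqrt(2)
     * x + ((x * 20091) >> 16) ~= x * cos(pi/8) * sqrt(2) */
    const int c1 = ((in[1] * 35468) >> 16) -
                   (in[3] + ((in[3] * 20091) >> 16));
    const int d1 = (in[1] + ((in[1] * 20091) >> 16)) +
                   ((in[3] * 35468) >> 16);
    out[0] = (int16_t)(a1 + d1);
    out[1] = (int16_t)(b1 + c1);
    out[2] = (int16_t)(b1 - c1);
    out[3] = (int16_t)(a1 - d1);
}

After the second pass the vectors are rounded with (x + 4) >> 3 (the vrshr step) before the prediction bytes are added back and the result is saturated to 8 bits.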

@@ -0,0 +1,87 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_short_inv_walsh4x4_neon|
ARM
REQUIRE8
PRESERVE8
AREA |.text|, CODE, READONLY ; name this block of code
;void vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff)
|vp8_short_inv_walsh4x4_neon| PROC
; read in all four lines of values: d0->d3
vld1.i16 {q0-q1}, [r0@128]
; first for loop
vadd.s16 d4, d0, d3 ;a = [0] + [12]
vadd.s16 d6, d1, d2 ;b = [4] + [8]
vsub.s16 d5, d0, d3 ;d = [0] - [12]
vsub.s16 d7, d1, d2 ;c = [4] - [8]
vadd.s16 q0, q2, q3 ; a+b d+c
vsub.s16 q1, q2, q3 ; a-b d-c
vtrn.32 d0, d2 ;d0: 0 1 8 9
;d2: 2 3 10 11
vtrn.32 d1, d3 ;d1: 4 5 12 13
;d3: 6 7 14 15
vtrn.16 d0, d1 ;d0: 0 4 8 12
;d1: 1 5 9 13
vtrn.16 d2, d3 ;d2: 2 6 10 14
;d3: 3 7 11 15
; second for loop
vadd.s16 d4, d0, d3 ;a = [0] + [3]
vadd.s16 d6, d1, d2 ;b = [1] + [2]
vsub.s16 d5, d0, d3 ;d = [0] - [3]
vsub.s16 d7, d1, d2 ;c = [1] - [2]
vmov.i16 q8, #3
vadd.s16 q0, q2, q3 ; a+b d+c
vsub.s16 q1, q2, q3 ; a-b d-c
vadd.i16 q0, q0, q8 ;e/f += 3
vadd.i16 q1, q1, q8 ;g/h += 3
vshr.s16 q0, q0, #3 ;e/f >> 3
vshr.s16 q1, q1, #3 ;g/h >> 3
mov r2, #64
add r3, r1, #32
vst1.i16 d0[0], [r1],r2
vst1.i16 d1[0], [r3],r2
vst1.i16 d2[0], [r1],r2
vst1.i16 d3[0], [r3],r2
vst1.i16 d0[1], [r1],r2
vst1.i16 d1[1], [r3],r2
vst1.i16 d2[1], [r1],r2
vst1.i16 d3[1], [r3],r2
vst1.i16 d0[2], [r1],r2
vst1.i16 d1[2], [r3],r2
vst1.i16 d2[2], [r1],r2
vst1.i16 d3[2], [r3],r2
vst1.i16 d0[3], [r1],r2
vst1.i16 d1[3], [r3],r2
vst1.i16 d2[3], [r1]
vst1.i16 d3[3], [r3]
bx lr
ENDP ; |vp8_short_inv_walsh4x4_neon|
END

@@ -1,102 +0,0 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
void vp8_short_inv_walsh4x4_neon(
int16_t *input,
int16_t *mb_dqcoeff) {
int16x8_t q0s16, q1s16, q2s16, q3s16;
int16x4_t d4s16, d5s16, d6s16, d7s16;
int16x4x2_t v2tmp0, v2tmp1;
int32x2x2_t v2tmp2, v2tmp3;
int16x8_t qAdd3;
q0s16 = vld1q_s16(input);
q1s16 = vld1q_s16(input + 8);
// 1st for loop
d4s16 = vadd_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));
d6s16 = vadd_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));
d5s16 = vsub_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));
d7s16 = vsub_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));
q2s16 = vcombine_s16(d4s16, d5s16);
q3s16 = vcombine_s16(d6s16, d7s16);
q0s16 = vaddq_s16(q2s16, q3s16);
q1s16 = vsubq_s16(q2s16, q3s16);
v2tmp2 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(q0s16)),
vreinterpret_s32_s16(vget_low_s16(q1s16)));
v2tmp3 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(q0s16)),
vreinterpret_s32_s16(vget_high_s16(q1s16)));
v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),
vreinterpret_s16_s32(v2tmp3.val[0]));
v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),
vreinterpret_s16_s32(v2tmp3.val[1]));
// 2nd for loop
d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
d6s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
d5s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
d7s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
q2s16 = vcombine_s16(d4s16, d5s16);
q3s16 = vcombine_s16(d6s16, d7s16);
qAdd3 = vdupq_n_s16(3);
q0s16 = vaddq_s16(q2s16, q3s16);
q1s16 = vsubq_s16(q2s16, q3s16);
q0s16 = vaddq_s16(q0s16, qAdd3);
q1s16 = vaddq_s16(q1s16, qAdd3);
q0s16 = vshrq_n_s16(q0s16, 3);
q1s16 = vshrq_n_s16(q1s16, 3);
// store
vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 0);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 0);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 0);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 0);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 1);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 1);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 1);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 1);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 2);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 2);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 2);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 2);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 3);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 3);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 3);
mb_dqcoeff += 16;
vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 3);
mb_dqcoeff += 16;
return;
}
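A scalar sketch may make the lane shuffling above easier to follow. This is assumed to mirror the standard VP8 scalar reference rather than copied from it, and the helper name is invented. Note how the final scatter to every 16th coefficient matches the 64-byte store stride used by both NEON versions.

#include <stdint.h>

static void inv_walsh4x4_sketch(const int16_t *input, int16_t *mb_dqcoeff) {
    int16_t out[16];
    int i;
    for (i = 0; i < 4; i++) {          /* pass 1: columns */
        const int a = input[i]     + input[12 + i];
        const int b = input[4 + i] + input[8 + i];
        const int d = input[i]     - input[12 + i];
        const int c = input[4 + i] - input[8 + i];
        out[i]      = (int16_t)(a + b);
        out[4 + i]  = (int16_t)(d + c);
        out[8 + i]  = (int16_t)(a - b);
        out[12 + i] = (int16_t)(d - c);
    }
    for (i = 0; i < 4; i++) {          /* pass 2: rows, with +3 >> 3 rounding */
        const int a = out[4 * i + 0] + out[4 * i + 3];
        const int b = out[4 * i + 1] + out[4 * i + 2];
        const int d = out[4 * i + 0] - out[4 * i + 3];
        const int c = out[4 * i + 1] - out[4 * i + 2];
        out[4 * i + 0] = (int16_t)((a + b + 3) >> 3);
        out[4 * i + 1] = (int16_t)((d + c + 3) >> 3);
        out[4 * i + 2] = (int16_t)((a - b + 3) >> 3);
        out[4 * i + 3] = (int16_t)((d - c + 3) >> 3);
    }
    for (i = 0; i < 16; i++)           /* scatter: one coefficient per block */
        mb_dqcoeff[i * 16] = out[i];
}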

@@ -0,0 +1,397 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_loop_filter_horizontal_edge_y_neon|
EXPORT |vp8_loop_filter_horizontal_edge_uv_neon|
EXPORT |vp8_loop_filter_vertical_edge_y_neon|
EXPORT |vp8_loop_filter_vertical_edge_uv_neon|
ARM
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src
; r1 int pitch
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
|vp8_loop_filter_horizontal_edge_y_neon| PROC
push {lr}
vdup.u8 q0, r2 ; duplicate blimit
vdup.u8 q1, r3 ; duplicate limit
sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines
ldr r3, [sp, #4] ; load thresh
add r12, r2, r1
add r1, r1, r1
vdup.u8 q2, r3 ; duplicate thresh
vld1.u8 {q3}, [r2@128], r1 ; p3
vld1.u8 {q4}, [r12@128], r1 ; p2
vld1.u8 {q5}, [r2@128], r1 ; p1
vld1.u8 {q6}, [r12@128], r1 ; p0
vld1.u8 {q7}, [r2@128], r1 ; q0
vld1.u8 {q8}, [r12@128], r1 ; q1
vld1.u8 {q9}, [r2@128] ; q2
vld1.u8 {q10}, [r12@128] ; q3
sub r2, r2, r1, lsl #1
sub r12, r12, r1, lsl #1
bl vp8_loop_filter_neon
vst1.u8 {q5}, [r2@128], r1 ; store op1
vst1.u8 {q6}, [r12@128], r1 ; store op0
vst1.u8 {q7}, [r2@128], r1 ; store oq0
vst1.u8 {q8}, [r12@128], r1 ; store oq1
pop {pc}
ENDP ; |vp8_loop_filter_horizontal_edge_y_neon|
; r0 unsigned char *u,
; r1 int pitch,
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
; sp+4 unsigned char *v
|vp8_loop_filter_horizontal_edge_uv_neon| PROC
push {lr}
vdup.u8 q0, r2 ; duplicate blimit
vdup.u8 q1, r3 ; duplicate limit
ldr r12, [sp, #4] ; load thresh
ldr r2, [sp, #8] ; load v ptr
vdup.u8 q2, r12 ; duplicate thresh
sub r3, r0, r1, lsl #2 ; move u pointer down by 4 lines
sub r12, r2, r1, lsl #2 ; move v pointer down by 4 lines
vld1.u8 {d6}, [r3@64], r1 ; p3
vld1.u8 {d7}, [r12@64], r1 ; p3
vld1.u8 {d8}, [r3@64], r1 ; p2
vld1.u8 {d9}, [r12@64], r1 ; p2
vld1.u8 {d10}, [r3@64], r1 ; p1
vld1.u8 {d11}, [r12@64], r1 ; p1
vld1.u8 {d12}, [r3@64], r1 ; p0
vld1.u8 {d13}, [r12@64], r1 ; p0
vld1.u8 {d14}, [r3@64], r1 ; q0
vld1.u8 {d15}, [r12@64], r1 ; q0
vld1.u8 {d16}, [r3@64], r1 ; q1
vld1.u8 {d17}, [r12@64], r1 ; q1
vld1.u8 {d18}, [r3@64], r1 ; q2
vld1.u8 {d19}, [r12@64], r1 ; q2
vld1.u8 {d20}, [r3@64] ; q3
vld1.u8 {d21}, [r12@64] ; q3
bl vp8_loop_filter_neon
sub r0, r0, r1, lsl #1
sub r2, r2, r1, lsl #1
vst1.u8 {d10}, [r0@64], r1 ; store u op1
vst1.u8 {d11}, [r2@64], r1 ; store v op1
vst1.u8 {d12}, [r0@64], r1 ; store u op0
vst1.u8 {d13}, [r2@64], r1 ; store v op0
vst1.u8 {d14}, [r0@64], r1 ; store u oq0
vst1.u8 {d15}, [r2@64], r1 ; store v oq0
vst1.u8 {d16}, [r0@64] ; store u oq1
vst1.u8 {d17}, [r2@64] ; store v oq1
pop {pc}
ENDP ; |vp8_loop_filter_horizontal_edge_uv_neon|
; void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
; unsigned char blimit,
; unsigned char limit,
; unsigned char thresh)
; r0 unsigned char *src
; r1 int pitch
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
|vp8_loop_filter_vertical_edge_y_neon| PROC
push {lr}
vdup.u8 q0, r2 ; duplicate blimit
vdup.u8 q1, r3 ; duplicate limit
sub r2, r0, #4 ; src ptr down by 4 columns
add r1, r1, r1
ldr r3, [sp, #4] ; load thresh
add r12, r2, r1, asr #1
vld1.u8 {d6}, [r2], r1
vld1.u8 {d8}, [r12], r1
vld1.u8 {d10}, [r2], r1
vld1.u8 {d12}, [r12], r1
vld1.u8 {d14}, [r2], r1
vld1.u8 {d16}, [r12], r1
vld1.u8 {d18}, [r2], r1
vld1.u8 {d20}, [r12], r1
vld1.u8 {d7}, [r2], r1 ; load second 8-line src data
vld1.u8 {d9}, [r12], r1
vld1.u8 {d11}, [r2], r1
vld1.u8 {d13}, [r12], r1
vld1.u8 {d15}, [r2], r1
vld1.u8 {d17}, [r12], r1
vld1.u8 {d19}, [r2]
vld1.u8 {d21}, [r12]
;transpose to 8x16 matrix
vtrn.32 q3, q7
vtrn.32 q4, q8
vtrn.32 q5, q9
vtrn.32 q6, q10
vdup.u8 q2, r3 ; duplicate thresh
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
vtrn.16 q8, q10
vtrn.8 q3, q4
vtrn.8 q5, q6
vtrn.8 q7, q8
vtrn.8 q9, q10
bl vp8_loop_filter_neon
vswp d12, d11
vswp d16, d13
sub r0, r0, #2 ; dst ptr
vswp d14, d12
vswp d16, d15
add r12, r0, r1, asr #1
;store op1, op0, oq0, oq1
vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r12], r1
vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r12], r1
vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r12], r1
vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r12], r1
vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r0], r1
vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r12], r1
vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r0], r1
vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r12], r1
vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r0], r1
vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r12], r1
vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0]
vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r12]
pop {pc}
ENDP ; |vp8_loop_filter_vertical_edge_y_neon|
; void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
; unsigned char blimit,
; unsigned char limit,
; unsigned char thresh,
; unsigned char *v)
; r0 unsigned char *u,
; r1 int pitch,
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
; sp+4 unsigned char *v
|vp8_loop_filter_vertical_edge_uv_neon| PROC
push {lr}
vdup.u8 q0, r2 ; duplicate blimit
sub r12, r0, #4 ; move u pointer down by 4 columns
ldr r2, [sp, #8] ; load v ptr
vdup.u8 q1, r3 ; duplicate limit
sub r3, r2, #4 ; move v pointer down by 4 columns
vld1.u8 {d6}, [r12], r1 ;load u data
vld1.u8 {d7}, [r3], r1 ;load v data
vld1.u8 {d8}, [r12], r1
vld1.u8 {d9}, [r3], r1
vld1.u8 {d10}, [r12], r1
vld1.u8 {d11}, [r3], r1
vld1.u8 {d12}, [r12], r1
vld1.u8 {d13}, [r3], r1
vld1.u8 {d14}, [r12], r1
vld1.u8 {d15}, [r3], r1
vld1.u8 {d16}, [r12], r1
vld1.u8 {d17}, [r3], r1
vld1.u8 {d18}, [r12], r1
vld1.u8 {d19}, [r3], r1
vld1.u8 {d20}, [r12]
vld1.u8 {d21}, [r3]
ldr r12, [sp, #4] ; load thresh
;transpose to 8x16 matrix
vtrn.32 q3, q7
vtrn.32 q4, q8
vtrn.32 q5, q9
vtrn.32 q6, q10
vdup.u8 q2, r12 ; duplicate thresh
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
vtrn.16 q8, q10
vtrn.8 q3, q4
vtrn.8 q5, q6
vtrn.8 q7, q8
vtrn.8 q9, q10
bl vp8_loop_filter_neon
vswp d12, d11
vswp d16, d13
vswp d14, d12
vswp d16, d15
sub r0, r0, #2
sub r2, r2, #2
;store op1, op0, oq0, oq1
vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2], r1
vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r2], r1
vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r2], r1
vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r2], r1
vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r2], r1
vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r2], r1
vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r2], r1
vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0]
vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2]
pop {pc}
ENDP ; |vp8_loop_filter_vertical_edge_uv_neon|
; void vp8_loop_filter_neon();
; This is a helper function for the loopfilters. The individual functions do
; the necessary load, transpose (if necessary) and store.
; (a scalar sketch of the filter arithmetic follows after this file.)
; r0-r3 PRESERVE
; q0 flimit
; q1 limit
; q2 thresh
; q3 p3
; q4 p2
; q5 p1
; q6 p0
; q7 q0
; q8 q1
; q9 q2
; q10 q3
|vp8_loop_filter_neon| PROC
; vp8_filter_mask
vabd.u8 q11, q3, q4 ; abs(p3 - p2)
vabd.u8 q12, q4, q5 ; abs(p2 - p1)
vabd.u8 q13, q5, q6 ; abs(p1 - p0)
vabd.u8 q14, q8, q7 ; abs(q1 - q0)
vabd.u8 q3, q9, q8 ; abs(q2 - q1)
vabd.u8 q4, q10, q9 ; abs(q3 - q2)
vmax.u8 q11, q11, q12
vmax.u8 q12, q13, q14
vmax.u8 q3, q3, q4
vmax.u8 q15, q11, q12
vabd.u8 q9, q6, q7 ; abs(p0 - q0)
; vp8_hevmask
vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
vmax.u8 q15, q15, q3
vmov.u8 q10, #0x80 ; 0x80
vabd.u8 q2, q5, q8 ; a = abs(p1 - q1)
vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2
vcge.u8 q15, q1, q15
; vp8_filter() function
; convert to signed
veor q7, q7, q10 ; qs0
vshr.u8 q2, q2, #1 ; a = a / 2
veor q6, q6, q10 ; ps0
veor q5, q5, q10 ; ps1
vqadd.u8 q9, q9, q2 ; a = b + a
veor q8, q8, q10 ; qs1
vmov.u8 q10, #3 ; #3
vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
vsubl.s8 q11, d15, d13
vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1
vmovl.u8 q4, d20
vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1)
vorr q14, q13, q14 ; vp8_hevmask
vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0)
vmul.i16 q11, q11, q4
vand q1, q1, q14 ; vp8_filter &= hev
vand q15, q15, q9 ; vp8_filter_mask
vaddw.s8 q2, q2, d2
vaddw.s8 q11, q11, d3
vmov.u8 q9, #4 ; #4
; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
vqmovn.s16 d2, q2
vqmovn.s16 d3, q11
vand q1, q1, q15 ; vp8_filter &= mask
vqadd.s8 q2, q1, q10 ; Filter2 = clamp(vp8_filter+3)
vqadd.s8 q1, q1, q9 ; Filter1 = clamp(vp8_filter+4)
vshr.s8 q2, q2, #3 ; Filter2 >>= 3
vshr.s8 q1, q1, #3 ; Filter1 >>= 3
vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + Filter2)
vqsub.s8 q10, q7, q1 ; u = clamp(qs0 - Filter1)
; outer tap adjustments: ++vp8_filter >> 1
vrshr.s8 q1, q1, #1
vbic q1, q1, q14 ; vp8_filter &= ~hev
vmov.u8 q0, #0x80 ; 0x80
vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + vp8_filter)
vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - vp8_filter)
veor q6, q11, q0 ; *op0 = u^0x80
veor q7, q10, q0 ; *oq0 = u^0x80
veor q5, q13, q0 ; *op1 = u^0x80
veor q8, q12, q0 ; *oq1 = u^0x80
bx lr
ENDP ; |vp8_loop_filter_neon|
;-----------------
END
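To cross-check the annotations, here is the same per-pixel filter as a scalar sketch. It is an illustration reconstructed from the comments above, not the upstream C reference; hev and mask are 0/-1 byte masks exactly as in the vector code, and >> is assumed to behave as an arithmetic shift.

#include <stdint.h>

static int8_t clamp_s8(int v) {
    return (int8_t)(v > 127 ? 127 : (v < -128 ? -128 : v));
}

/* ps1/ps0/qs0/qs1 are the pixels already offset by 0x80 (signed domain). */
static void loop_filter_pixel_sketch(int8_t mask, int8_t hev,
                                     int8_t *ps1, int8_t *ps0,
                                     int8_t *qs0, int8_t *qs1) {
    int8_t filter, Filter1, Filter2;
    filter = clamp_s8(*ps1 - *qs1);                 /* clamp(ps1 - qs1) */
    filter &= hev;                                  /* inner taps only  */
    filter = clamp_s8(filter + 3 * (*qs0 - *ps0));  /* + 3 * (qs0-ps0)  */
    filter &= mask;
    Filter2 = (int8_t)(clamp_s8(filter + 3) >> 3);
    Filter1 = (int8_t)(clamp_s8(filter + 4) >> 3);
    *ps0 = clamp_s8(*ps0 + Filter2);
    *qs0 = clamp_s8(*qs0 - Filter1);
    filter = (int8_t)((Filter1 + 1) >> 1);          /* rounded outer tap */
    filter &= (int8_t)~hev;                         /* only where !hev   */
    *ps1 = clamp_s8(*ps1 + filter);
    *qs1 = clamp_s8(*qs1 - filter);
}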

@@ -1,550 +0,0 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "./vpx_config.h"
#include "vpx_ports/arm.h"
static INLINE void vp8_loop_filter_neon(
uint8x16_t qblimit, // flimit
uint8x16_t qlimit, // limit
uint8x16_t qthresh, // thresh
uint8x16_t q3, // p3
uint8x16_t q4, // p2
uint8x16_t q5, // p1
uint8x16_t q6, // p0
uint8x16_t q7, // q0
uint8x16_t q8, // q1
uint8x16_t q9, // q2
uint8x16_t q10, // q3
uint8x16_t *q5r, // p1
uint8x16_t *q6r, // p0
uint8x16_t *q7r, // q0
uint8x16_t *q8r) { // q1
uint8x16_t q0u8, q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
int16x8_t q2s16, q11s16;
uint16x8_t q4u16;
int8x16_t q1s8, q2s8, q10s8, q11s8, q12s8, q13s8;
int8x8_t d2s8, d3s8;
q11u8 = vabdq_u8(q3, q4);
q12u8 = vabdq_u8(q4, q5);
q13u8 = vabdq_u8(q5, q6);
q14u8 = vabdq_u8(q8, q7);
q3 = vabdq_u8(q9, q8);
q4 = vabdq_u8(q10, q9);
q11u8 = vmaxq_u8(q11u8, q12u8);
q12u8 = vmaxq_u8(q13u8, q14u8);
q3 = vmaxq_u8(q3, q4);
q15u8 = vmaxq_u8(q11u8, q12u8);
q9 = vabdq_u8(q6, q7);
// vp8_hevmask
q13u8 = vcgtq_u8(q13u8, qthresh);
q14u8 = vcgtq_u8(q14u8, qthresh);
q15u8 = vmaxq_u8(q15u8, q3);
q2u8 = vabdq_u8(q5, q8);
q9 = vqaddq_u8(q9, q9);
q15u8 = vcgeq_u8(qlimit, q15u8);
// vp8_filter() function
// convert to signed
q10 = vdupq_n_u8(0x80);
q8 = veorq_u8(q8, q10);
q7 = veorq_u8(q7, q10);
q6 = veorq_u8(q6, q10);
q5 = veorq_u8(q5, q10);
q2u8 = vshrq_n_u8(q2u8, 1);
q9 = vqaddq_u8(q9, q2u8);
q10 = vdupq_n_u8(3);
q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
vget_low_s8(vreinterpretq_s8_u8(q6)));
q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
vget_high_s8(vreinterpretq_s8_u8(q6)));
q9 = vcgeq_u8(qblimit, q9);
q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
vreinterpretq_s8_u8(q8));
q14u8 = vorrq_u8(q13u8, q14u8);
q4u16 = vmovl_u8(vget_low_u8(q10));
q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
q15u8 = vandq_u8(q15u8, q9);
q1s8 = vreinterpretq_s8_u8(q1u8);
q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
q9 = vdupq_n_u8(4);
// vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
d2s8 = vqmovn_s16(q2s16);
d3s8 = vqmovn_s16(q11s16);
q1s8 = vcombine_s8(d2s8, d3s8);
q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
q1s8 = vreinterpretq_s8_u8(q1u8);
q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q10));
q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
q2s8 = vshrq_n_s8(q2s8, 3);
q1s8 = vshrq_n_s8(q1s8, 3);
q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);
q1s8 = vrshrq_n_s8(q1s8, 1);
q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);
q0u8 = vdupq_n_u8(0x80);
*q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q0u8);
*q7r = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
*q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
*q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q0u8);
return;
}
void vp8_loop_filter_horizontal_edge_y_neon(
unsigned char *src,
int pitch,
unsigned char blimit,
unsigned char limit,
unsigned char thresh) {
uint8x16_t qblimit, qlimit, qthresh, q3, q4;
uint8x16_t q5, q6, q7, q8, q9, q10;
qblimit = vdupq_n_u8(blimit);
qlimit = vdupq_n_u8(limit);
qthresh = vdupq_n_u8(thresh);
src -= (pitch << 2);
q3 = vld1q_u8(src);
src += pitch;
q4 = vld1q_u8(src);
src += pitch;
q5 = vld1q_u8(src);
src += pitch;
q6 = vld1q_u8(src);
src += pitch;
q7 = vld1q_u8(src);
src += pitch;
q8 = vld1q_u8(src);
src += pitch;
q9 = vld1q_u8(src);
src += pitch;
q10 = vld1q_u8(src);
vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
q5, q6, q7, q8, q9, q10,
&q5, &q6, &q7, &q8);
src -= (pitch * 5);
vst1q_u8(src, q5);
src += pitch;
vst1q_u8(src, q6);
src += pitch;
vst1q_u8(src, q7);
src += pitch;
vst1q_u8(src, q8);
return;
}
void vp8_loop_filter_horizontal_edge_uv_neon(
unsigned char *u,
int pitch,
unsigned char blimit,
unsigned char limit,
unsigned char thresh,
unsigned char *v) {
uint8x16_t qblimit, qlimit, qthresh, q3, q4;
uint8x16_t q5, q6, q7, q8, q9, q10;
uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
uint8x8_t d15, d16, d17, d18, d19, d20, d21;
qblimit = vdupq_n_u8(blimit);
qlimit = vdupq_n_u8(limit);
qthresh = vdupq_n_u8(thresh);
u -= (pitch << 2);
v -= (pitch << 2);
d6 = vld1_u8(u);
u += pitch;
d7 = vld1_u8(v);
v += pitch;
d8 = vld1_u8(u);
u += pitch;
d9 = vld1_u8(v);
v += pitch;
d10 = vld1_u8(u);
u += pitch;
d11 = vld1_u8(v);
v += pitch;
d12 = vld1_u8(u);
u += pitch;
d13 = vld1_u8(v);
v += pitch;
d14 = vld1_u8(u);
u += pitch;
d15 = vld1_u8(v);
v += pitch;
d16 = vld1_u8(u);
u += pitch;
d17 = vld1_u8(v);
v += pitch;
d18 = vld1_u8(u);
u += pitch;
d19 = vld1_u8(v);
v += pitch;
d20 = vld1_u8(u);
d21 = vld1_u8(v);
q3 = vcombine_u8(d6, d7);
q4 = vcombine_u8(d8, d9);
q5 = vcombine_u8(d10, d11);
q6 = vcombine_u8(d12, d13);
q7 = vcombine_u8(d14, d15);
q8 = vcombine_u8(d16, d17);
q9 = vcombine_u8(d18, d19);
q10 = vcombine_u8(d20, d21);
vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
q5, q6, q7, q8, q9, q10,
&q5, &q6, &q7, &q8);
u -= (pitch * 5);
vst1_u8(u, vget_low_u8(q5));
u += pitch;
vst1_u8(u, vget_low_u8(q6));
u += pitch;
vst1_u8(u, vget_low_u8(q7));
u += pitch;
vst1_u8(u, vget_low_u8(q8));
v -= (pitch * 5);
vst1_u8(v, vget_high_u8(q5));
v += pitch;
vst1_u8(v, vget_high_u8(q6));
v += pitch;
vst1_u8(v, vget_high_u8(q7));
v += pitch;
vst1_u8(v, vget_high_u8(q8));
return;
}
static INLINE void write_4x8(unsigned char *dst, int pitch,
const uint8x8x4_t result) {
#ifdef VPX_INCOMPATIBLE_GCC
/*
* uint8x8x4_t result
00 01 02 03 | 04 05 06 07
10 11 12 13 | 14 15 16 17
20 21 22 23 | 24 25 26 27
30 31 32 33 | 34 35 36 37
---
* after vtrn_u16
00 01 20 21 | 04 05 24 25
02 03 22 23 | 06 07 26 27
10 11 30 31 | 14 15 34 35
12 13 32 33 | 16 17 36 37
---
* after vtrn_u8
00 10 20 30 | 04 14 24 34
01 11 21 31 | 05 15 25 35
02 12 22 32 | 06 16 26 36
03 13 23 33 | 07 17 27 37
*/
const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]),
vreinterpret_u16_u8(result.val[2]));
const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]),
vreinterpret_u16_u8(result.val[3]));
const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
vreinterpret_u8_u16(r13_u16.val[0]));
const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
vreinterpret_u8_u16(r13_u16.val[1]));
const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]);
const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]);
const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]);
const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]);
vst1_lane_u32((uint32_t *)dst, x_0_4, 0);
dst += pitch;
vst1_lane_u32((uint32_t *)dst, x_1_5, 0);
dst += pitch;
vst1_lane_u32((uint32_t *)dst, x_2_6, 0);
dst += pitch;
vst1_lane_u32((uint32_t *)dst, x_3_7, 0);
dst += pitch;
vst1_lane_u32((uint32_t *)dst, x_0_4, 1);
dst += pitch;
vst1_lane_u32((uint32_t *)dst, x_1_5, 1);
dst += pitch;
vst1_lane_u32((uint32_t *)dst, x_2_6, 1);
dst += pitch;
vst1_lane_u32((uint32_t *)dst, x_3_7, 1);
#else
vst4_lane_u8(dst, result, 0);
dst += pitch;
vst4_lane_u8(dst, result, 1);
dst += pitch;
vst4_lane_u8(dst, result, 2);
dst += pitch;
vst4_lane_u8(dst, result, 3);
dst += pitch;
vst4_lane_u8(dst, result, 4);
dst += pitch;
vst4_lane_u8(dst, result, 5);
dst += pitch;
vst4_lane_u8(dst, result, 6);
dst += pitch;
vst4_lane_u8(dst, result, 7);
#endif // VPX_INCOMPATIBLE_GCC
}
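/* Illustrative scalar model, added for exposition only (not upstream
 * code): write_4x8() stores lane r of each of the four vectors as byte c
 * of output row r, i.e. both paths above perform a 4x8 transpose. */
static INLINE void write_4x8_scalar_model(unsigned char *dst, int pitch,
                                          const unsigned char val[4][8]) {
    int r, c;
    for (r = 0; r < 8; r++)
        for (c = 0; c < 4; c++)
            dst[r * pitch + c] = val[c][r];
}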
void vp8_loop_filter_vertical_edge_y_neon(
unsigned char *src,
int pitch,
unsigned char blimit,
unsigned char limit,
unsigned char thresh) {
unsigned char *s, *d;
uint8x16_t qblimit, qlimit, qthresh, q3, q4;
uint8x16_t q5, q6, q7, q8, q9, q10;
uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
uint8x8_t d15, d16, d17, d18, d19, d20, d21;
uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
uint8x8x4_t q4ResultH, q4ResultL;
qblimit = vdupq_n_u8(blimit);
qlimit = vdupq_n_u8(limit);
qthresh = vdupq_n_u8(thresh);
s = src - 4;
d6 = vld1_u8(s);
s += pitch;
d8 = vld1_u8(s);
s += pitch;
d10 = vld1_u8(s);
s += pitch;
d12 = vld1_u8(s);
s += pitch;
d14 = vld1_u8(s);
s += pitch;
d16 = vld1_u8(s);
s += pitch;
d18 = vld1_u8(s);
s += pitch;
d20 = vld1_u8(s);
s += pitch;
d7 = vld1_u8(s);
s += pitch;
d9 = vld1_u8(s);
s += pitch;
d11 = vld1_u8(s);
s += pitch;
d13 = vld1_u8(s);
s += pitch;
d15 = vld1_u8(s);
s += pitch;
d17 = vld1_u8(s);
s += pitch;
d19 = vld1_u8(s);
s += pitch;
d21 = vld1_u8(s);
q3 = vcombine_u8(d6, d7);
q4 = vcombine_u8(d8, d9);
q5 = vcombine_u8(d10, d11);
q6 = vcombine_u8(d12, d13);
q7 = vcombine_u8(d14, d15);
q8 = vcombine_u8(d16, d17);
q9 = vcombine_u8(d18, d19);
q10 = vcombine_u8(d20, d21);
q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
vreinterpretq_u16_u32(q2tmp2.val[0]));
q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
vreinterpretq_u16_u32(q2tmp3.val[0]));
q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
vreinterpretq_u16_u32(q2tmp2.val[1]));
q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
vreinterpretq_u16_u32(q2tmp3.val[1]));
q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
vreinterpretq_u8_u16(q2tmp5.val[0]));
q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
vreinterpretq_u8_u16(q2tmp5.val[1]));
q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
vreinterpretq_u8_u16(q2tmp7.val[0]));
q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
vreinterpretq_u8_u16(q2tmp7.val[1]));
q3 = q2tmp8.val[0];
q4 = q2tmp8.val[1];
q5 = q2tmp9.val[0];
q6 = q2tmp9.val[1];
q7 = q2tmp10.val[0];
q8 = q2tmp10.val[1];
q9 = q2tmp11.val[0];
q10 = q2tmp11.val[1];
vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
q5, q6, q7, q8, q9, q10,
&q5, &q6, &q7, &q8);
q4ResultL.val[0] = vget_low_u8(q5); // d10
q4ResultL.val[1] = vget_low_u8(q6); // d12
q4ResultL.val[2] = vget_low_u8(q7); // d14
q4ResultL.val[3] = vget_low_u8(q8); // d16
q4ResultH.val[0] = vget_high_u8(q5); // d11
q4ResultH.val[1] = vget_high_u8(q6); // d13
q4ResultH.val[2] = vget_high_u8(q7); // d15
q4ResultH.val[3] = vget_high_u8(q8); // d17
d = src - 2;
write_4x8(d, pitch, q4ResultL);
d += pitch * 8;
write_4x8(d, pitch, q4ResultH);
}
void vp8_loop_filter_vertical_edge_uv_neon(
unsigned char *u,
int pitch,
unsigned char blimit,
unsigned char limit,
unsigned char thresh,
unsigned char *v) {
unsigned char *us, *ud;
unsigned char *vs, *vd;
uint8x16_t qblimit, qlimit, qthresh, q3, q4;
uint8x16_t q5, q6, q7, q8, q9, q10;
uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
uint8x8_t d15, d16, d17, d18, d19, d20, d21;
uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
uint8x8x4_t q4ResultH, q4ResultL;
qblimit = vdupq_n_u8(blimit);
qlimit = vdupq_n_u8(limit);
qthresh = vdupq_n_u8(thresh);
us = u - 4;
d6 = vld1_u8(us);
us += pitch;
d8 = vld1_u8(us);
us += pitch;
d10 = vld1_u8(us);
us += pitch;
d12 = vld1_u8(us);
us += pitch;
d14 = vld1_u8(us);
us += pitch;
d16 = vld1_u8(us);
us += pitch;
d18 = vld1_u8(us);
us += pitch;
d20 = vld1_u8(us);
vs = v - 4;
d7 = vld1_u8(vs);
vs += pitch;
d9 = vld1_u8(vs);
vs += pitch;
d11 = vld1_u8(vs);
vs += pitch;
d13 = vld1_u8(vs);
vs += pitch;
d15 = vld1_u8(vs);
vs += pitch;
d17 = vld1_u8(vs);
vs += pitch;
d19 = vld1_u8(vs);
vs += pitch;
d21 = vld1_u8(vs);
q3 = vcombine_u8(d6, d7);
q4 = vcombine_u8(d8, d9);
q5 = vcombine_u8(d10, d11);
q6 = vcombine_u8(d12, d13);
q7 = vcombine_u8(d14, d15);
q8 = vcombine_u8(d16, d17);
q9 = vcombine_u8(d18, d19);
q10 = vcombine_u8(d20, d21);
q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
vreinterpretq_u16_u32(q2tmp2.val[0]));
q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
vreinterpretq_u16_u32(q2tmp3.val[0]));
q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
vreinterpretq_u16_u32(q2tmp2.val[1]));
q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
vreinterpretq_u16_u32(q2tmp3.val[1]));
q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
vreinterpretq_u8_u16(q2tmp5.val[0]));
q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
vreinterpretq_u8_u16(q2tmp5.val[1]));
q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
vreinterpretq_u8_u16(q2tmp7.val[0]));
q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
vreinterpretq_u8_u16(q2tmp7.val[1]));
q3 = q2tmp8.val[0];
q4 = q2tmp8.val[1];
q5 = q2tmp9.val[0];
q6 = q2tmp9.val[1];
q7 = q2tmp10.val[0];
q8 = q2tmp10.val[1];
q9 = q2tmp11.val[0];
q10 = q2tmp11.val[1];
vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
q5, q6, q7, q8, q9, q10,
&q5, &q6, &q7, &q8);
q4ResultL.val[0] = vget_low_u8(q5); // d10
q4ResultL.val[1] = vget_low_u8(q6); // d12
q4ResultL.val[2] = vget_low_u8(q7); // d14
q4ResultL.val[3] = vget_low_u8(q8); // d16
ud = u - 2;
write_4x8(ud, pitch, q4ResultL);
q4ResultH.val[0] = vget_high_u8(q5); // d11
q4ResultH.val[1] = vget_high_u8(q6); // d13
q4ResultH.val[2] = vget_high_u8(q7); // d15
q4ResultH.val[3] = vget_high_u8(q8); // d17
vd = v - 2;
write_4x8(vd, pitch, q4ResultH);
}

@@ -0,0 +1,117 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
;EXPORT |vp8_loop_filter_simple_horizontal_edge_neon|
EXPORT |vp8_loop_filter_bhs_neon|
EXPORT |vp8_loop_filter_mbhs_neon|
ARM
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *s, PRESERVE
; r1 int p, PRESERVE
; q1 limit, PRESERVE
|vp8_loop_filter_simple_horizontal_edge_neon| PROC
sub r3, r0, r1, lsl #1 ; move src pointer down by 2 lines
vld1.u8 {q7}, [r0@128], r1 ; q0
vld1.u8 {q5}, [r3@128], r1 ; p0
vld1.u8 {q8}, [r0@128] ; q1
vld1.u8 {q6}, [r3@128] ; p1
vabd.u8 q15, q6, q7 ; abs(p0 - q0)
vabd.u8 q14, q5, q8 ; abs(p1 - q1)
vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2
vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2
vmov.u8 q0, #0x80 ; 0x80
vmov.s16 q13, #3
vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value
veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value
veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value
veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value
vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > limit)*-1
vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
vsubl.s8 q3, d15, d13
vqsub.s8 q4, q5, q8 ; q4: vp8_filter = vp8_signed_char_clamp(ps1-qs1)
vmul.s16 q2, q2, q13 ; 3 * ( qs0 - ps0)
vmul.s16 q3, q3, q13
vmov.u8 q10, #0x03 ; 0x03
vmov.u8 q9, #0x04 ; 0x04
vaddw.s8 q2, q2, d8 ; vp8_filter + 3 * ( qs0 - ps0)
vaddw.s8 q3, q3, d9
vqmovn.s16 d8, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
vqmovn.s16 d9, q3
vand q14, q4, q15 ; vp8_filter &= mask
vqadd.s8 q2, q14, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
vqadd.s8 q3, q14, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
vshr.s8 q2, q2, #3 ; Filter2 >>= 3
vshr.s8 q4, q3, #3 ; Filter1 >>= 3
sub r0, r0, r1
;calculate output
vqadd.s8 q11, q6, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2)
vqsub.s8 q10, q7, q4 ; u = vp8_signed_char_clamp(qs0 - Filter1)
veor q6, q11, q0 ; *op0 = u^0x80
veor q7, q10, q0 ; *oq0 = u^0x80
vst1.u8 {q6}, [r3@128] ; store op0
vst1.u8 {q7}, [r0@128] ; store oq0
bx lr
ENDP ; |vp8_loop_filter_simple_horizontal_edge_neon|
; r0 unsigned char *y
; r1 int ystride
; r2 const unsigned char *blimit
|vp8_loop_filter_bhs_neon| PROC
push {r4, lr}
ldrb r3, [r2] ; load blim from mem
vdup.s8 q1, r3 ; duplicate blim
add r0, r0, r1, lsl #2 ; src = y_ptr + 4 * y_stride
bl vp8_loop_filter_simple_horizontal_edge_neon
; vp8_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1
add r0, r0, r1, lsl #2 ; src = y_ptr + 8* y_stride
bl vp8_loop_filter_simple_horizontal_edge_neon
add r0, r0, r1, lsl #2 ; src = y_ptr + 12 * y_stride
pop {r4, lr}
b vp8_loop_filter_simple_horizontal_edge_neon
ENDP ;|vp8_loop_filter_bhs_neon|
; r0 unsigned char *y
; r1 int ystride
; r2 const unsigned char *blimit
|vp8_loop_filter_mbhs_neon| PROC
ldrb r3, [r2] ; load mblim from mem
vdup.s8 q1, r3 ; duplicate mblim
b vp8_loop_filter_simple_horizontal_edge_neon
ENDP ;|vp8_loop_filter_mbhs_neon|
END

@@ -1,111 +0,0 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "./vpx_config.h"
static INLINE void vp8_loop_filter_simple_horizontal_edge_neon(
unsigned char *s,
int p,
const unsigned char *blimit) {
uint8_t *sp;
uint8x16_t qblimit, q0u8;
uint8x16_t q5u8, q6u8, q7u8, q8u8, q9u8, q10u8, q14u8, q15u8;
int16x8_t q2s16, q3s16, q13s16;
int8x8_t d8s8, d9s8;
int8x16_t q2s8, q3s8, q4s8, q10s8, q11s8, q14s8;
qblimit = vdupq_n_u8(*blimit);
sp = s - (p << 1);
q5u8 = vld1q_u8(sp);
sp += p;
q6u8 = vld1q_u8(sp);
sp += p;
q7u8 = vld1q_u8(sp);
sp += p;
q8u8 = vld1q_u8(sp);
q15u8 = vabdq_u8(q6u8, q7u8);
q14u8 = vabdq_u8(q5u8, q8u8);
q15u8 = vqaddq_u8(q15u8, q15u8);
q14u8 = vshrq_n_u8(q14u8, 1);
q0u8 = vdupq_n_u8(0x80);
q13s16 = vdupq_n_s16(3);
q15u8 = vqaddq_u8(q15u8, q14u8);
q5u8 = veorq_u8(q5u8, q0u8);
q6u8 = veorq_u8(q6u8, q0u8);
q7u8 = veorq_u8(q7u8, q0u8);
q8u8 = veorq_u8(q8u8, q0u8);
q15u8 = vcgeq_u8(qblimit, q15u8);
q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7u8)),
vget_low_s8(vreinterpretq_s8_u8(q6u8)));
q3s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7u8)),
vget_high_s8(vreinterpretq_s8_u8(q6u8)));
q4s8 = vqsubq_s8(vreinterpretq_s8_u8(q5u8),
vreinterpretq_s8_u8(q8u8));
q2s16 = vmulq_s16(q2s16, q13s16);
q3s16 = vmulq_s16(q3s16, q13s16);
q10u8 = vdupq_n_u8(3);
q9u8 = vdupq_n_u8(4);
q2s16 = vaddw_s8(q2s16, vget_low_s8(q4s8));
q3s16 = vaddw_s8(q3s16, vget_high_s8(q4s8));
d8s8 = vqmovn_s16(q2s16);
d9s8 = vqmovn_s16(q3s16);
q4s8 = vcombine_s8(d8s8, d9s8);
q14s8 = vandq_s8(q4s8, vreinterpretq_s8_u8(q15u8));
q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q10u8));
q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q9u8));
q2s8 = vshrq_n_s8(q2s8, 3);
q3s8 = vshrq_n_s8(q3s8, 3);
q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6u8), q2s8);
q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7u8), q3s8);
q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
vst1q_u8(s, q7u8);
s -= p;
vst1q_u8(s, q6u8);
return;
}
void vp8_loop_filter_bhs_neon(
unsigned char *y_ptr,
int y_stride,
const unsigned char *blimit) {
y_ptr += y_stride * 4;
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
y_ptr += y_stride * 4;
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
y_ptr += y_stride * 4;
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
return;
}
void vp8_loop_filter_mbhs_neon(
unsigned char *y_ptr,
int y_stride,
const unsigned char *blimit) {
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
return;
}
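The simple filter reduces to the following scalar per-pixel sketch. It is reconstructed from the annotated steps above, not taken from the upstream sources; only p0 and q0 are written back, and a single blimit test replaces the full filter's mask and hev logic.

#include <stdint.h>
#include <stdlib.h>

static int8_t clamp_s8(int v) {
    return (int8_t)(v > 127 ? 127 : (v < -128 ? -128 : v));
}

/* p1, p0, q0, q1 are the unsigned pixels straddling the edge. */
static void simple_filter_pixel_sketch(unsigned char blimit,
                                       unsigned char p1, unsigned char *p0,
                                       unsigned char *q0, unsigned char q1) {
    const int mask =
        (abs(*p0 - *q0) * 2 + abs(p1 - q1) / 2 <= blimit) ? -1 : 0;
    /* offset by 0x80 into the signed domain, as the vector code does */
    const int8_t ps1 = (int8_t)(p1 ^ 0x80), qs1 = (int8_t)(q1 ^ 0x80);
    int8_t ps0 = (int8_t)(*p0 ^ 0x80), qs0 = (int8_t)(*q0 ^ 0x80);
    int8_t filter = clamp_s8(clamp_s8(ps1 - qs1) + 3 * (qs0 - ps0));
    filter &= (int8_t)mask;
    ps0 = clamp_s8(ps0 + (clamp_s8(filter + 3) >> 3));  /* Filter2 */
    qs0 = clamp_s8(qs0 - (clamp_s8(filter + 4) >> 3));  /* Filter1 */
    *p0 = (unsigned char)(ps0 ^ 0x80);
    *q0 = (unsigned char)(qs0 ^ 0x80);
}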

@@ -0,0 +1,154 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
;EXPORT |vp8_loop_filter_simple_vertical_edge_neon|
EXPORT |vp8_loop_filter_bvs_neon|
EXPORT |vp8_loop_filter_mbvs_neon|
ARM
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *s, PRESERVE
; r1 int p, PRESERVE
; q1 limit, PRESERVE
|vp8_loop_filter_simple_vertical_edge_neon| PROC
sub r0, r0, #2 ; move src pointer down by 2 columns
add r12, r1, r1
add r3, r0, r1
vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r12
vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r3], r12
vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r12
vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r3], r12
vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r12
vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r3], r12
vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r12
vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r3], r12
vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r12
vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r3], r12
vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r12
vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r3], r12
vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r12
vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r3], r12
vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r12
vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r3]
vswp d7, d10
vswp d12, d9
;vp8_filter_mask() function
;vp8_hevmask() function
sub r0, r0, r1, lsl #4
vabd.u8 q15, q5, q4 ; abs(p0 - q0)
vabd.u8 q14, q3, q6 ; abs(p1 - q1)
vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2
vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2
vmov.u8 q0, #0x80 ; 0x80
vmov.s16 q11, #3
vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
veor q4, q4, q0 ; qs0: q0 offset to convert to a signed value
veor q5, q5, q0 ; ps0: p0 offset to convert to a signed value
veor q3, q3, q0 ; ps1: p1 offset to convert to a signed value
veor q6, q6, q0 ; qs1: q1 offset to convert to a signed value
vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > limit)*-1
vsubl.s8 q2, d8, d10 ; ( qs0 - ps0)
vsubl.s8 q13, d9, d11
vqsub.s8 q14, q3, q6 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
vmul.s16 q2, q2, q11 ; 3 * ( qs0 - ps0)
vmul.s16 q13, q13, q11
vmov.u8 q11, #0x03 ; 0x03
vmov.u8 q12, #0x04 ; 0x04
vaddw.s8 q2, q2, d28 ; vp8_filter + 3 * ( qs0 - ps0)
vaddw.s8 q13, q13, d29
vqmovn.s16 d28, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
vqmovn.s16 d29, q13
add r0, r0, #1
add r3, r0, r1
vand q14, q14, q15 ; vp8_filter &= mask
vqadd.s8 q2, q14, q11 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
vqadd.s8 q3, q14, q12 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
vshr.s8 q2, q2, #3 ; Filter2 >>= 3
vshr.s8 q14, q3, #3 ; Filter1 >>= 3
;calculate output
vqadd.s8 q11, q5, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2)
vqsub.s8 q10, q4, q14 ; u = vp8_signed_char_clamp(qs0 - Filter1)
veor q6, q11, q0 ; *op0 = u^0x80
veor q7, q10, q0 ; *oq0 = u^0x80
add r12, r1, r1
vswp d13, d14
;store op1, op0, oq0, oq1
vst2.8 {d12[0], d13[0]}, [r0], r12
vst2.8 {d12[1], d13[1]}, [r3], r12
vst2.8 {d12[2], d13[2]}, [r0], r12
vst2.8 {d12[3], d13[3]}, [r3], r12
vst2.8 {d12[4], d13[4]}, [r0], r12
vst2.8 {d12[5], d13[5]}, [r3], r12
vst2.8 {d12[6], d13[6]}, [r0], r12
vst2.8 {d12[7], d13[7]}, [r3], r12
vst2.8 {d14[0], d15[0]}, [r0], r12
vst2.8 {d14[1], d15[1]}, [r3], r12
vst2.8 {d14[2], d15[2]}, [r0], r12
vst2.8 {d14[3], d15[3]}, [r3], r12
vst2.8 {d14[4], d15[4]}, [r0], r12
vst2.8 {d14[5], d15[5]}, [r3], r12
vst2.8 {d14[6], d15[6]}, [r0], r12
vst2.8 {d14[7], d15[7]}, [r3]
bx lr
ENDP ; |vp8_loop_filter_simple_vertical_edge_neon|
; r0 unsigned char *y
; r1 int ystride
; r2 const unsigned char *blimit
|vp8_loop_filter_bvs_neon| PROC
push {r4, lr}
ldrb r3, [r2] ; load blim from mem
mov r4, r0
add r0, r0, #4
vdup.s8 q1, r3 ; duplicate blim
bl vp8_loop_filter_simple_vertical_edge_neon
; vp8_loop_filter_simple_vertical_edge_neon preserves r1 and q1
add r0, r4, #8
bl vp8_loop_filter_simple_vertical_edge_neon
add r0, r4, #12
pop {r4, lr}
b vp8_loop_filter_simple_vertical_edge_neon
ENDP ;|vp8_loop_filter_bvs_neon|
; r0 unsigned char *y
; r1 int ystride
; r2 const unsigned char *blimit
|vp8_loop_filter_mbvs_neon| PROC
ldrb r3, [r2] ; load mblim from mem
vdup.s8 q1, r3 ; duplicate mblim
b vp8_loop_filter_simple_vertical_edge_neon
ENDP ;|vp8_loop_filter_mbvs_neon|
END

@@ -1,280 +0,0 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "./vpx_config.h"
#include "vpx_ports/arm.h"
#ifdef VPX_INCOMPATIBLE_GCC
static INLINE void write_2x4(unsigned char *dst, int pitch,
const uint8x8x2_t result) {
/*
* uint8x8x2_t result
00 01 02 03 | 04 05 06 07
10 11 12 13 | 14 15 16 17
---
* after vtrn_u8
00 10 02 12 | 04 14 06 16
01 11 03 13 | 05 15 07 17
*/
const uint8x8x2_t r01_u8 = vtrn_u8(result.val[0],
result.val[1]);
const uint16x4_t x_0_4 = vreinterpret_u16_u8(r01_u8.val[0]);
const uint16x4_t x_1_5 = vreinterpret_u16_u8(r01_u8.val[1]);
vst1_lane_u16((uint16_t *)dst, x_0_4, 0);
dst += pitch;
vst1_lane_u16((uint16_t *)dst, x_1_5, 0);
dst += pitch;
vst1_lane_u16((uint16_t *)dst, x_0_4, 1);
dst += pitch;
vst1_lane_u16((uint16_t *)dst, x_1_5, 1);
dst += pitch;
vst1_lane_u16((uint16_t *)dst, x_0_4, 2);
dst += pitch;
vst1_lane_u16((uint16_t *)dst, x_1_5, 2);
dst += pitch;
vst1_lane_u16((uint16_t *)dst, x_0_4, 3);
dst += pitch;
vst1_lane_u16((uint16_t *)dst, x_1_5, 3);
}
static INLINE void write_2x8(unsigned char *dst, int pitch,
const uint8x8x2_t result,
const uint8x8x2_t result2) {
write_2x4(dst, pitch, result);
dst += pitch * 8;
write_2x4(dst, pitch, result2);
}
#else
static INLINE void write_2x8(unsigned char *dst, int pitch,
const uint8x8x2_t result,
const uint8x8x2_t result2) {
vst2_lane_u8(dst, result, 0);
dst += pitch;
vst2_lane_u8(dst, result, 1);
dst += pitch;
vst2_lane_u8(dst, result, 2);
dst += pitch;
vst2_lane_u8(dst, result, 3);
dst += pitch;
vst2_lane_u8(dst, result, 4);
dst += pitch;
vst2_lane_u8(dst, result, 5);
dst += pitch;
vst2_lane_u8(dst, result, 6);
dst += pitch;
vst2_lane_u8(dst, result, 7);
dst += pitch;
vst2_lane_u8(dst, result2, 0);
dst += pitch;
vst2_lane_u8(dst, result2, 1);
dst += pitch;
vst2_lane_u8(dst, result2, 2);
dst += pitch;
vst2_lane_u8(dst, result2, 3);
dst += pitch;
vst2_lane_u8(dst, result2, 4);
dst += pitch;
vst2_lane_u8(dst, result2, 5);
dst += pitch;
vst2_lane_u8(dst, result2, 6);
dst += pitch;
vst2_lane_u8(dst, result2, 7);
}
#endif // VPX_INCOMPATIBLE_GCC
#ifdef VPX_INCOMPATIBLE_GCC
static INLINE
uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) {
const uint8x8_t a = vld1_u8(src);
const uint8x8_t b = vld1_u8(src + pitch * 1);
const uint8x8_t c = vld1_u8(src + pitch * 2);
const uint8x8_t d = vld1_u8(src + pitch * 3);
const uint8x8_t e = vld1_u8(src + pitch * 4);
const uint8x8_t f = vld1_u8(src + pitch * 5);
const uint8x8_t g = vld1_u8(src + pitch * 6);
const uint8x8_t h = vld1_u8(src + pitch * 7);
const uint32x2x2_t r04_u32 = vtrn_u32(vreinterpret_u32_u8(a),
vreinterpret_u32_u8(e));
const uint32x2x2_t r15_u32 = vtrn_u32(vreinterpret_u32_u8(b),
vreinterpret_u32_u8(f));
const uint32x2x2_t r26_u32 = vtrn_u32(vreinterpret_u32_u8(c),
vreinterpret_u32_u8(g));
const uint32x2x2_t r37_u32 = vtrn_u32(vreinterpret_u32_u8(d),
vreinterpret_u32_u8(h));
const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u32(r04_u32.val[0]),
vreinterpret_u16_u32(r26_u32.val[0]));
const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u32(r15_u32.val[0]),
vreinterpret_u16_u32(r37_u32.val[0]));
const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
vreinterpret_u8_u16(r13_u16.val[0]));
const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
vreinterpret_u8_u16(r13_u16.val[1]));
/*
* after vtrn_u32
00 01 02 03 | 40 41 42 43
10 11 12 13 | 50 51 52 53
20 21 22 23 | 60 61 62 63
30 31 32 33 | 70 71 72 73
---
* after vtrn_u16
00 01 20 21 | 40 41 60 61
02 03 22 23 | 42 43 62 63
10 11 30 31 | 50 51 70 71
12 13 32 33 | 52 53 72 73
00 01 20 21 | 40 41 60 61
10 11 30 31 | 50 51 70 71
02 03 22 23 | 42 43 62 63
12 13 32 33 | 52 53 72 73
---
* after vtrn_u8
00 10 20 30 | 40 50 60 70
01 11 21 31 | 41 51 61 71
02 12 22 32 | 42 52 62 72
03 13 23 33 | 43 53 63 73
*/
x.val[0] = r01_u8.val[0];
x.val[1] = r01_u8.val[1];
x.val[2] = r23_u8.val[0];
x.val[3] = r23_u8.val[1];
return x;
}
#else
static INLINE
uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) {
x = vld4_lane_u8(src, x, 0);
src += pitch;
x = vld4_lane_u8(src, x, 1);
src += pitch;
x = vld4_lane_u8(src, x, 2);
src += pitch;
x = vld4_lane_u8(src, x, 3);
src += pitch;
x = vld4_lane_u8(src, x, 4);
src += pitch;
x = vld4_lane_u8(src, x, 5);
src += pitch;
x = vld4_lane_u8(src, x, 6);
src += pitch;
x = vld4_lane_u8(src, x, 7);
return x;
}
#endif // VPX_INCOMPATIBLE_GCC
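/* Illustrative scalar model, added for exposition only (not upstream
 * code): read_4x8() gathers byte c of input row r into lane r of vector
 * c -- the transpose that the vld4 lane loads (or the vtrn fallback
 * above) perform. */
static INLINE void read_4x8_scalar_model(const unsigned char *src, int pitch,
                                         unsigned char x[4][8]) {
    int r, c;
    for (r = 0; r < 8; r++)
        for (c = 0; c < 4; c++)
            x[c][r] = src[r * pitch + c];
}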
static INLINE void vp8_loop_filter_simple_vertical_edge_neon(
unsigned char *s,
int p,
const unsigned char *blimit) {
unsigned char *src1;
uint8x16_t qblimit, q0u8;
uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q11u8, q12u8, q14u8, q15u8;
int16x8_t q2s16, q13s16, q11s16;
int8x8_t d28s8, d29s8;
int8x16_t q2s8, q3s8, q10s8, q11s8, q14s8;
uint8x8x4_t d0u8x4; // d6, d7, d8, d9
uint8x8x4_t d1u8x4; // d10, d11, d12, d13
uint8x8x2_t d2u8x2; // d12, d13
uint8x8x2_t d3u8x2; // d14, d15
qblimit = vdupq_n_u8(*blimit);
src1 = s - 2;
d0u8x4 = read_4x8(src1, p, d0u8x4);
src1 += p * 8;
d1u8x4 = read_4x8(src1, p, d1u8x4);
q3u8 = vcombine_u8(d0u8x4.val[0], d1u8x4.val[0]); // d6 d10
q4u8 = vcombine_u8(d0u8x4.val[2], d1u8x4.val[2]); // d8 d12
q5u8 = vcombine_u8(d0u8x4.val[1], d1u8x4.val[1]); // d7 d11
q6u8 = vcombine_u8(d0u8x4.val[3], d1u8x4.val[3]); // d9 d13
q15u8 = vabdq_u8(q5u8, q4u8);
q14u8 = vabdq_u8(q3u8, q6u8);
q15u8 = vqaddq_u8(q15u8, q15u8);
q14u8 = vshrq_n_u8(q14u8, 1);
q0u8 = vdupq_n_u8(0x80);
q11s16 = vdupq_n_s16(3);
q15u8 = vqaddq_u8(q15u8, q14u8);
q3u8 = veorq_u8(q3u8, q0u8);
q4u8 = veorq_u8(q4u8, q0u8);
q5u8 = veorq_u8(q5u8, q0u8);
q6u8 = veorq_u8(q6u8, q0u8);
q15u8 = vcgeq_u8(qblimit, q15u8);
q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q4u8)),
vget_low_s8(vreinterpretq_s8_u8(q5u8)));
q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q4u8)),
vget_high_s8(vreinterpretq_s8_u8(q5u8)));
q14s8 = vqsubq_s8(vreinterpretq_s8_u8(q3u8),
vreinterpretq_s8_u8(q6u8));
q2s16 = vmulq_s16(q2s16, q11s16);
q13s16 = vmulq_s16(q13s16, q11s16);
q11u8 = vdupq_n_u8(3);
q12u8 = vdupq_n_u8(4);
q2s16 = vaddw_s8(q2s16, vget_low_s8(q14s8));
q13s16 = vaddw_s8(q13s16, vget_high_s8(q14s8));
d28s8 = vqmovn_s16(q2s16);
d29s8 = vqmovn_s16(q13s16);
q14s8 = vcombine_s8(d28s8, d29s8);
q14s8 = vandq_s8(q14s8, vreinterpretq_s8_u8(q15u8));
q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q11u8));
q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q12u8));
q2s8 = vshrq_n_s8(q2s8, 3);
q14s8 = vshrq_n_s8(q3s8, 3);
q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q5u8), q2s8);
q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q4u8), q14s8);
q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
d2u8x2.val[0] = vget_low_u8(q6u8); // d12
d2u8x2.val[1] = vget_low_u8(q7u8); // d14
d3u8x2.val[0] = vget_high_u8(q6u8); // d13
d3u8x2.val[1] = vget_high_u8(q7u8); // d15
src1 = s - 1;
write_2x8(src1, p, d2u8x2, d3u8x2);
}
void vp8_loop_filter_bvs_neon(
unsigned char *y_ptr,
int y_stride,
const unsigned char *blimit) {
y_ptr += 4;
vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
y_ptr += 4;
vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
y_ptr += 4;
vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
return;
}
void vp8_loop_filter_mbvs_neon(
unsigned char *y_ptr,
int y_stride,
const unsigned char *blimit) {
vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
return;
}

@@ -0,0 +1,469 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_mbloop_filter_horizontal_edge_y_neon|
EXPORT |vp8_mbloop_filter_horizontal_edge_uv_neon|
EXPORT |vp8_mbloop_filter_vertical_edge_y_neon|
EXPORT |vp8_mbloop_filter_vertical_edge_uv_neon|
ARM
AREA ||.text||, CODE, READONLY, ALIGN=2
; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
; unsigned char blimit,
; unsigned char limit,
; unsigned char thresh)
; r0 unsigned char *src,
; r1 int pitch,
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
|vp8_mbloop_filter_horizontal_edge_y_neon| PROC
push {lr}
add r1, r1, r1 ; double stride
ldr r12, [sp, #4] ; load thresh
sub r0, r0, r1, lsl #1 ; move src pointer down by 4 lines
vdup.u8 q2, r12 ; thresh
add r12, r0, r1, lsr #1 ; move src pointer up by 1 line
vld1.u8 {q3}, [r0@128], r1 ; p3
vld1.u8 {q4}, [r12@128], r1 ; p2
vld1.u8 {q5}, [r0@128], r1 ; p1
vld1.u8 {q6}, [r12@128], r1 ; p0
vld1.u8 {q7}, [r0@128], r1 ; q0
vld1.u8 {q8}, [r12@128], r1 ; q1
vld1.u8 {q9}, [r0@128], r1 ; q2
vld1.u8 {q10}, [r12@128], r1 ; q3
bl vp8_mbloop_filter_neon
sub r12, r12, r1, lsl #2
add r0, r12, r1, lsr #1
vst1.u8 {q4}, [r12@128],r1 ; store op2
vst1.u8 {q5}, [r0@128],r1 ; store op1
vst1.u8 {q6}, [r12@128], r1 ; store op0
vst1.u8 {q7}, [r0@128],r1 ; store oq0
vst1.u8 {q8}, [r12@128] ; store oq1
vst1.u8 {q9}, [r0@128] ; store oq2
pop {pc}
ENDP ; |vp8_mbloop_filter_horizontal_edge_y_neon|
; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
; unsigned char blimit,
; unsigned char limit,
; unsigned char thresh,
; unsigned char *v)
; r0 unsigned char *u,
; r1 int pitch,
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
; sp+4 unsigned char *v
|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
push {lr}
ldr r12, [sp, #4] ; load thresh
sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines
vdup.u8 q2, r12 ; thresh
ldr r12, [sp, #8] ; load v ptr
sub r12, r12, r1, lsl #2 ; move v pointer down by 4 lines
vld1.u8 {d6}, [r0@64], r1 ; p3
vld1.u8 {d7}, [r12@64], r1 ; p3
vld1.u8 {d8}, [r0@64], r1 ; p2
vld1.u8 {d9}, [r12@64], r1 ; p2
vld1.u8 {d10}, [r0@64], r1 ; p1
vld1.u8 {d11}, [r12@64], r1 ; p1
vld1.u8 {d12}, [r0@64], r1 ; p0
vld1.u8 {d13}, [r12@64], r1 ; p0
vld1.u8 {d14}, [r0@64], r1 ; q0
vld1.u8 {d15}, [r12@64], r1 ; q0
vld1.u8 {d16}, [r0@64], r1 ; q1
vld1.u8 {d17}, [r12@64], r1 ; q1
vld1.u8 {d18}, [r0@64], r1 ; q2
vld1.u8 {d19}, [r12@64], r1 ; q2
vld1.u8 {d20}, [r0@64], r1 ; q3
vld1.u8 {d21}, [r12@64], r1 ; q3
bl vp8_mbloop_filter_neon
sub r0, r0, r1, lsl #3
sub r12, r12, r1, lsl #3
add r0, r0, r1
add r12, r12, r1
vst1.u8 {d8}, [r0@64], r1 ; store u op2
vst1.u8 {d9}, [r12@64], r1 ; store v op2
vst1.u8 {d10}, [r0@64], r1 ; store u op1
vst1.u8 {d11}, [r12@64], r1 ; store v op1
vst1.u8 {d12}, [r0@64], r1 ; store u op0
vst1.u8 {d13}, [r12@64], r1 ; store v op0
vst1.u8 {d14}, [r0@64], r1 ; store u oq0
vst1.u8 {d15}, [r12@64], r1 ; store v oq0
vst1.u8 {d16}, [r0@64], r1 ; store u oq1
vst1.u8 {d17}, [r12@64], r1 ; store v oq1
vst1.u8 {d18}, [r0@64], r1 ; store u oq2
vst1.u8 {d19}, [r12@64], r1 ; store v oq2
pop {pc}
ENDP ; |vp8_mbloop_filter_horizontal_edge_uv_neon|
; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
; unsigned char blimit,
; unsigned char limit,
; unsigned char thresh)
; r0 unsigned char *src,
; r1 int pitch,
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
|vp8_mbloop_filter_vertical_edge_y_neon| PROC
push {lr}
ldr r12, [sp, #4] ; load thresh
sub r0, r0, #4 ; move src pointer down by 4 columns
vdup.s8 q2, r12 ; thresh
add r12, r0, r1, lsl #3 ; move src pointer down by 8 lines
vld1.u8 {d6}, [r0], r1 ; load first 8-line src data
vld1.u8 {d7}, [r12], r1 ; load second 8-line src data
vld1.u8 {d8}, [r0], r1
vld1.u8 {d9}, [r12], r1
vld1.u8 {d10}, [r0], r1
vld1.u8 {d11}, [r12], r1
vld1.u8 {d12}, [r0], r1
vld1.u8 {d13}, [r12], r1
vld1.u8 {d14}, [r0], r1
vld1.u8 {d15}, [r12], r1
vld1.u8 {d16}, [r0], r1
vld1.u8 {d17}, [r12], r1
vld1.u8 {d18}, [r0], r1
vld1.u8 {d19}, [r12], r1
vld1.u8 {d20}, [r0], r1
vld1.u8 {d21}, [r12], r1
;transpose to 8x16 matrix
vtrn.32 q3, q7
vtrn.32 q4, q8
vtrn.32 q5, q9
vtrn.32 q6, q10
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
vtrn.16 q8, q10
vtrn.8 q3, q4
vtrn.8 q5, q6
vtrn.8 q7, q8
vtrn.8 q9, q10
sub r0, r0, r1, lsl #3
bl vp8_mbloop_filter_neon
sub r12, r12, r1, lsl #3
;transpose to 16x8 matrix
vtrn.32 q3, q7
vtrn.32 q4, q8
vtrn.32 q5, q9
vtrn.32 q6, q10
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
vtrn.16 q8, q10
vtrn.8 q3, q4
vtrn.8 q5, q6
vtrn.8 q7, q8
vtrn.8 q9, q10
;store op2, op1, op0, oq0, oq1, oq2
vst1.8 {d6}, [r0], r1
vst1.8 {d7}, [r12], r1
vst1.8 {d8}, [r0], r1
vst1.8 {d9}, [r12], r1
vst1.8 {d10}, [r0], r1
vst1.8 {d11}, [r12], r1
vst1.8 {d12}, [r0], r1
vst1.8 {d13}, [r12], r1
vst1.8 {d14}, [r0], r1
vst1.8 {d15}, [r12], r1
vst1.8 {d16}, [r0], r1
vst1.8 {d17}, [r12], r1
vst1.8 {d18}, [r0], r1
vst1.8 {d19}, [r12], r1
vst1.8 {d20}, [r0]
vst1.8 {d21}, [r12]
pop {pc}
ENDP ; |vp8_mbloop_filter_vertical_edge_y_neon|
; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
; unsigned char blimit,
; unsigned char limit,
; unsigned char thresh,
; unsigned char *v)
; r0 unsigned char *u,
; r1 int pitch,
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
; sp+4 unsigned char *v
|vp8_mbloop_filter_vertical_edge_uv_neon| PROC
push {lr}
ldr r12, [sp, #4] ; load thresh
sub r0, r0, #4 ; move u pointer down by 4 columns
vdup.u8 q2, r12 ; thresh
ldr r12, [sp, #8] ; load v ptr
sub r12, r12, #4 ; move v pointer down by 4 columns
vld1.u8 {d6}, [r0], r1 ;load u data
vld1.u8 {d7}, [r12], r1 ;load v data
vld1.u8 {d8}, [r0], r1
vld1.u8 {d9}, [r12], r1
vld1.u8 {d10}, [r0], r1
vld1.u8 {d11}, [r12], r1
vld1.u8 {d12}, [r0], r1
vld1.u8 {d13}, [r12], r1
vld1.u8 {d14}, [r0], r1
vld1.u8 {d15}, [r12], r1
vld1.u8 {d16}, [r0], r1
vld1.u8 {d17}, [r12], r1
vld1.u8 {d18}, [r0], r1
vld1.u8 {d19}, [r12], r1
vld1.u8 {d20}, [r0], r1
vld1.u8 {d21}, [r12], r1
;transpose to 8x16 matrix
vtrn.32 q3, q7
vtrn.32 q4, q8
vtrn.32 q5, q9
vtrn.32 q6, q10
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
vtrn.16 q8, q10
vtrn.8 q3, q4
vtrn.8 q5, q6
vtrn.8 q7, q8
vtrn.8 q9, q10
sub r0, r0, r1, lsl #3
bl vp8_mbloop_filter_neon
sub r12, r12, r1, lsl #3
;transpose to 16x8 matrix
vtrn.32 q3, q7
vtrn.32 q4, q8
vtrn.32 q5, q9
vtrn.32 q6, q10
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
vtrn.16 q8, q10
vtrn.8 q3, q4
vtrn.8 q5, q6
vtrn.8 q7, q8
vtrn.8 q9, q10
;store op2, op1, op0, oq0, oq1, oq2
vst1.8 {d6}, [r0], r1
vst1.8 {d7}, [r12], r1
vst1.8 {d8}, [r0], r1
vst1.8 {d9}, [r12], r1
vst1.8 {d10}, [r0], r1
vst1.8 {d11}, [r12], r1
vst1.8 {d12}, [r0], r1
vst1.8 {d13}, [r12], r1
vst1.8 {d14}, [r0], r1
vst1.8 {d15}, [r12], r1
vst1.8 {d16}, [r0], r1
vst1.8 {d17}, [r12], r1
vst1.8 {d18}, [r0], r1
vst1.8 {d19}, [r12], r1
vst1.8 {d20}, [r0]
vst1.8 {d21}, [r12]
pop {pc}
ENDP ; |vp8_mbloop_filter_vertical_edge_uv_neon|
; void vp8_mbloop_filter_neon()
; This is a helper function for the macroblock loopfilters. The individual
; functions do the necessary load, transpose (if necessary), preserve (if
; necessary) and store.
; r0,r1 PRESERVE
; r2 mblimit
; r3 limit
; q2 thresh
; q3 p3 PRESERVE
; q4 p2
; q5 p1
; q6 p0
; q7 q0
; q8 q1
; q9 q2
; q10 q3 PRESERVE
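; The filter mask is set where every neighbouring-pixel difference is
; within limit and abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= mblimit.
; vp8_hevmask flags high-edge-variance pixels (abs(p1 - p0) or
; abs(q1 - q0) above thresh); those receive only the Filter1/Filter2
; adjustment, while the rest get the 9/18/27 taper across all six pixels.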
|vp8_mbloop_filter_neon| PROC
; vp8_filter_mask
vabd.u8 q11, q3, q4 ; abs(p3 - p2)
vabd.u8 q12, q4, q5 ; abs(p2 - p1)
vabd.u8 q13, q5, q6 ; abs(p1 - p0)
vabd.u8 q14, q8, q7 ; abs(q1 - q0)
vabd.u8 q1, q9, q8 ; abs(q2 - q1)
vabd.u8 q0, q10, q9 ; abs(q3 - q2)
vmax.u8 q11, q11, q12
vmax.u8 q12, q13, q14
vmax.u8 q1, q1, q0
vmax.u8 q15, q11, q12
vabd.u8 q12, q6, q7 ; abs(p0 - q0)
; vp8_hevmask
vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh) * -1
vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh) * -1
vmax.u8 q15, q15, q1
vdup.u8 q1, r3 ; limit
vdup.u8 q2, r2 ; mblimit
vmov.u8 q0, #0x80 ; 0x80
vcge.u8 q15, q1, q15
vabd.u8 q1, q5, q8 ; a = abs(p1 - q1)
vqadd.u8 q12, q12, q12 ; b = abs(p0 - q0) * 2
vmov.u16 q11, #3 ; #3
; vp8_filter
; convert to signed
veor q7, q7, q0 ; qs0
vshr.u8 q1, q1, #1 ; a = a / 2
veor q6, q6, q0 ; ps0
veor q5, q5, q0 ; ps1
vqadd.u8 q12, q12, q1 ; a = b + a
veor q8, q8, q0 ; qs1
veor q4, q4, q0 ; ps2
veor q9, q9, q0 ; qs2
vorr q14, q13, q14 ; vp8_hevmask
vcge.u8 q12, q2, q12 ; (a > flimit * 2 + limit) * -1
vsubl.s8 q2, d14, d12 ; qs0 - ps0
vsubl.s8 q13, d15, d13
vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1)
vmul.i16 q2, q2, q11 ; 3 * ( qs0 - ps0)
vand q15, q15, q12 ; vp8_filter_mask
vmul.i16 q13, q13, q11
vmov.u8 q12, #3 ; #3
vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0)
vaddw.s8 q13, q13, d3
vmov.u8 q11, #4 ; #4
; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
vqmovn.s16 d2, q2
vqmovn.s16 d3, q13
vand q1, q1, q15 ; vp8_filter &= mask
vmov.u16 q15, #63 ; #63
vand q13, q1, q14 ; Filter2 &= hev
vqadd.s8 q2, q13, q11 ; Filter1 = clamp(Filter2+4)
vqadd.s8 q13, q13, q12 ; Filter2 = clamp(Filter2+3)
vmov q0, q15
vshr.s8 q2, q2, #3 ; Filter1 >>= 3
vshr.s8 q13, q13, #3 ; Filter2 >>= 3
vmov q11, q15
vmov q12, q15
vqsub.s8 q7, q7, q2 ; qs0 = clamp(qs0 - Filter1)
vqadd.s8 q6, q6, q13 ; ps0 = clamp(ps0 + Filter2)
vbic q1, q1, q14 ; vp8_filter &= ~hev
; roughly 1/7th difference across boundary
; roughly 2/7th difference across boundary
; roughly 3/7th difference across boundary
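; u1 = clamp((63 + Filter2 *  9) >> 7) adjusts the p2/q2 pair,
; u2 = clamp((63 + Filter2 * 18) >> 7) adjusts p1/q1 and
; u3 = clamp((63 + Filter2 * 27) >> 7) adjusts p0/q0, tapering the
; correction away from the edge.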
vmov.u8 d5, #9 ; #9
vmov.u8 d4, #18 ; #18
vmov q13, q15
vmov q14, q15
vmlal.s8 q0, d2, d5 ; 63 + Filter2 * 9
vmlal.s8 q11, d3, d5
vmov.u8 d5, #27 ; #27
vmlal.s8 q12, d2, d4 ; 63 + Filter2 * 18
vmlal.s8 q13, d3, d4
vmlal.s8 q14, d2, d5 ; 63 + Filter2 * 27
vmlal.s8 q15, d3, d5
vqshrn.s16 d0, q0, #7 ; u = clamp((63 + Filter2 * 9)>>7)
vqshrn.s16 d1, q11, #7
vqshrn.s16 d24, q12, #7 ; u = clamp((63 + Filter2 * 18)>>7)
vqshrn.s16 d25, q13, #7
vqshrn.s16 d28, q14, #7 ; u = clamp((63 + Filter2 * 27)>>7)
vqshrn.s16 d29, q15, #7
vmov.u8 q1, #0x80 ; 0x80
vqsub.s8 q11, q9, q0 ; s = clamp(qs2 - u)
vqadd.s8 q0, q4, q0 ; s = clamp(ps2 + u)
vqsub.s8 q13, q8, q12 ; s = clamp(qs1 - u)
vqadd.s8 q12, q5, q12 ; s = clamp(ps1 + u)
vqsub.s8 q15, q7, q14 ; s = clamp(qs0 - u)
vqadd.s8 q14, q6, q14 ; s = clamp(ps0 + u)
veor q9, q11, q1 ; *oq2 = s^0x80
veor q4, q0, q1 ; *op2 = s^0x80
veor q8, q13, q1 ; *oq1 = s^0x80
veor q5, q12, q1 ; *op1 = s^0x80
veor q7, q15, q1 ; *oq0 = s^0x80
veor q6, q14, q1 ; *op0 = s^0x80
bx lr
ENDP ; |vp8_mbloop_filter_neon|
;-----------------
END

View file

@ -1,625 +0,0 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "./vpx_config.h"
static INLINE void vp8_mbloop_filter_neon(
uint8x16_t qblimit, // mblimit
uint8x16_t qlimit, // limit
uint8x16_t qthresh, // thresh
uint8x16_t q3, // p3
uint8x16_t q4, // p2
uint8x16_t q5, // p1
uint8x16_t q6, // p0
uint8x16_t q7, // q0
uint8x16_t q8, // q1
uint8x16_t q9, // q2
uint8x16_t q10, // q3
uint8x16_t *q4r, // p2
uint8x16_t *q5r, // p1
uint8x16_t *q6r, // p0
uint8x16_t *q7r, // q0
uint8x16_t *q8r, // q1
uint8x16_t *q9r) { // q2
uint8x16_t q0u8, q1u8, q11u8, q12u8, q13u8, q14u8, q15u8;
int16x8_t q0s16, q2s16, q11s16, q12s16, q13s16, q14s16, q15s16;
int8x16_t q1s8, q6s8, q7s8, q2s8, q11s8, q13s8;
uint16x8_t q0u16, q11u16, q12u16, q13u16, q14u16, q15u16;
int8x16_t q0s8, q12s8, q14s8, q15s8;
int8x8_t d0, d1, d2, d3, d4, d5, d24, d25, d28, d29;
q11u8 = vabdq_u8(q3, q4);
q12u8 = vabdq_u8(q4, q5);
q13u8 = vabdq_u8(q5, q6);
q14u8 = vabdq_u8(q8, q7);
q1u8 = vabdq_u8(q9, q8);
q0u8 = vabdq_u8(q10, q9);
q11u8 = vmaxq_u8(q11u8, q12u8);
q12u8 = vmaxq_u8(q13u8, q14u8);
q1u8 = vmaxq_u8(q1u8, q0u8);
q15u8 = vmaxq_u8(q11u8, q12u8);
q12u8 = vabdq_u8(q6, q7);
// vp8_hevmask
q13u8 = vcgtq_u8(q13u8, qthresh);
q14u8 = vcgtq_u8(q14u8, qthresh);
q15u8 = vmaxq_u8(q15u8, q1u8);
q15u8 = vcgeq_u8(qlimit, q15u8);
q1u8 = vabdq_u8(q5, q8);
q12u8 = vqaddq_u8(q12u8, q12u8);
// vp8_filter() function
// convert to signed
q0u8 = vdupq_n_u8(0x80);
q9 = veorq_u8(q9, q0u8);
q8 = veorq_u8(q8, q0u8);
q7 = veorq_u8(q7, q0u8);
q6 = veorq_u8(q6, q0u8);
q5 = veorq_u8(q5, q0u8);
q4 = veorq_u8(q4, q0u8);
q1u8 = vshrq_n_u8(q1u8, 1);
q12u8 = vqaddq_u8(q12u8, q1u8);
q14u8 = vorrq_u8(q13u8, q14u8);
q12u8 = vcgeq_u8(qblimit, q12u8);
q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
vget_low_s8(vreinterpretq_s8_u8(q6)));
q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
vget_high_s8(vreinterpretq_s8_u8(q6)));
q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
vreinterpretq_s8_u8(q8));
q11s16 = vdupq_n_s16(3);
q2s16 = vmulq_s16(q2s16, q11s16);
q13s16 = vmulq_s16(q13s16, q11s16);
q15u8 = vandq_u8(q15u8, q12u8);
q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
q13s16 = vaddw_s8(q13s16, vget_high_s8(q1s8));
q12u8 = vdupq_n_u8(3);
q11u8 = vdupq_n_u8(4);
// vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
d2 = vqmovn_s16(q2s16);
d3 = vqmovn_s16(q13s16);
q1s8 = vcombine_s8(d2, d3);
q1s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q15u8));
q13s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
q2s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q11u8));
q13s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q12u8));
q2s8 = vshrq_n_s8(q2s8, 3);
q13s8 = vshrq_n_s8(q13s8, 3);
q7s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q2s8);
q6s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q13s8);
q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
q0u16 = q11u16 = q12u16 = q13u16 = q14u16 = q15u16 = vdupq_n_u16(63);
d5 = vdup_n_s8(9);
d4 = vdup_n_s8(18);
q0s16 = vmlal_s8(vreinterpretq_s16_u16(q0u16), vget_low_s8(q1s8), d5);
q11s16 = vmlal_s8(vreinterpretq_s16_u16(q11u16), vget_high_s8(q1s8), d5);
d5 = vdup_n_s8(27);
q12s16 = vmlal_s8(vreinterpretq_s16_u16(q12u16), vget_low_s8(q1s8), d4);
q13s16 = vmlal_s8(vreinterpretq_s16_u16(q13u16), vget_high_s8(q1s8), d4);
q14s16 = vmlal_s8(vreinterpretq_s16_u16(q14u16), vget_low_s8(q1s8), d5);
q15s16 = vmlal_s8(vreinterpretq_s16_u16(q15u16), vget_high_s8(q1s8), d5);
d0 = vqshrn_n_s16(q0s16 , 7);
d1 = vqshrn_n_s16(q11s16, 7);
d24 = vqshrn_n_s16(q12s16, 7);
d25 = vqshrn_n_s16(q13s16, 7);
d28 = vqshrn_n_s16(q14s16, 7);
d29 = vqshrn_n_s16(q15s16, 7);
q0s8 = vcombine_s8(d0, d1);
q12s8 = vcombine_s8(d24, d25);
q14s8 = vcombine_s8(d28, d29);
q11s8 = vqsubq_s8(vreinterpretq_s8_u8(q9), q0s8);
q0s8 = vqaddq_s8(vreinterpretq_s8_u8(q4), q0s8);
q13s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q12s8);
q12s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q12s8);
q15s8 = vqsubq_s8((q7s8), q14s8);
q14s8 = vqaddq_s8((q6s8), q14s8);
q1u8 = vdupq_n_u8(0x80);
*q9r = veorq_u8(vreinterpretq_u8_s8(q11s8), q1u8);
*q8r = veorq_u8(vreinterpretq_u8_s8(q13s8), q1u8);
*q7r = veorq_u8(vreinterpretq_u8_s8(q15s8), q1u8);
*q6r = veorq_u8(vreinterpretq_u8_s8(q14s8), q1u8);
*q5r = veorq_u8(vreinterpretq_u8_s8(q12s8), q1u8);
*q4r = veorq_u8(vreinterpretq_u8_s8(q0s8), q1u8);
return;
}
void vp8_mbloop_filter_horizontal_edge_y_neon(
unsigned char *src,
int pitch,
unsigned char blimit,
unsigned char limit,
unsigned char thresh) {
uint8x16_t qblimit, qlimit, qthresh, q3, q4;
uint8x16_t q5, q6, q7, q8, q9, q10;
qblimit = vdupq_n_u8(blimit);
qlimit = vdupq_n_u8(limit);
qthresh = vdupq_n_u8(thresh);
src -= (pitch << 2);
q3 = vld1q_u8(src);
src += pitch;
q4 = vld1q_u8(src);
src += pitch;
q5 = vld1q_u8(src);
src += pitch;
q6 = vld1q_u8(src);
src += pitch;
q7 = vld1q_u8(src);
src += pitch;
q8 = vld1q_u8(src);
src += pitch;
q9 = vld1q_u8(src);
src += pitch;
q10 = vld1q_u8(src);
vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
q5, q6, q7, q8, q9, q10,
&q4, &q5, &q6, &q7, &q8, &q9);
src -= (pitch * 6);
vst1q_u8(src, q4);
src += pitch;
vst1q_u8(src, q5);
src += pitch;
vst1q_u8(src, q6);
src += pitch;
vst1q_u8(src, q7);
src += pitch;
vst1q_u8(src, q8);
src += pitch;
vst1q_u8(src, q9);
return;
}
void vp8_mbloop_filter_horizontal_edge_uv_neon(
unsigned char *u,
int pitch,
unsigned char blimit,
unsigned char limit,
unsigned char thresh,
unsigned char *v) {
uint8x16_t qblimit, qlimit, qthresh, q3, q4;
uint8x16_t q5, q6, q7, q8, q9, q10;
uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
uint8x8_t d15, d16, d17, d18, d19, d20, d21;
qblimit = vdupq_n_u8(blimit);
qlimit = vdupq_n_u8(limit);
qthresh = vdupq_n_u8(thresh);
u -= (pitch << 2);
v -= (pitch << 2);
d6 = vld1_u8(u);
u += pitch;
d7 = vld1_u8(v);
v += pitch;
d8 = vld1_u8(u);
u += pitch;
d9 = vld1_u8(v);
v += pitch;
d10 = vld1_u8(u);
u += pitch;
d11 = vld1_u8(v);
v += pitch;
d12 = vld1_u8(u);
u += pitch;
d13 = vld1_u8(v);
v += pitch;
d14 = vld1_u8(u);
u += pitch;
d15 = vld1_u8(v);
v += pitch;
d16 = vld1_u8(u);
u += pitch;
d17 = vld1_u8(v);
v += pitch;
d18 = vld1_u8(u);
u += pitch;
d19 = vld1_u8(v);
v += pitch;
d20 = vld1_u8(u);
d21 = vld1_u8(v);
q3 = vcombine_u8(d6, d7);
q4 = vcombine_u8(d8, d9);
q5 = vcombine_u8(d10, d11);
q6 = vcombine_u8(d12, d13);
q7 = vcombine_u8(d14, d15);
q8 = vcombine_u8(d16, d17);
q9 = vcombine_u8(d18, d19);
q10 = vcombine_u8(d20, d21);
vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
q5, q6, q7, q8, q9, q10,
&q4, &q5, &q6, &q7, &q8, &q9);
u -= (pitch * 6);
v -= (pitch * 6);
vst1_u8(u, vget_low_u8(q4));
u += pitch;
vst1_u8(v, vget_high_u8(q4));
v += pitch;
vst1_u8(u, vget_low_u8(q5));
u += pitch;
vst1_u8(v, vget_high_u8(q5));
v += pitch;
vst1_u8(u, vget_low_u8(q6));
u += pitch;
vst1_u8(v, vget_high_u8(q6));
v += pitch;
vst1_u8(u, vget_low_u8(q7));
u += pitch;
vst1_u8(v, vget_high_u8(q7));
v += pitch;
vst1_u8(u, vget_low_u8(q8));
u += pitch;
vst1_u8(v, vget_high_u8(q8));
v += pitch;
vst1_u8(u, vget_low_u8(q9));
vst1_u8(v, vget_high_u8(q9));
return;
}
void vp8_mbloop_filter_vertical_edge_y_neon(
unsigned char *src,
int pitch,
unsigned char blimit,
unsigned char limit,
unsigned char thresh) {
unsigned char *s1, *s2;
uint8x16_t qblimit, qlimit, qthresh, q3, q4;
uint8x16_t q5, q6, q7, q8, q9, q10;
uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
uint8x8_t d15, d16, d17, d18, d19, d20, d21;
uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
qblimit = vdupq_n_u8(blimit);
qlimit = vdupq_n_u8(limit);
qthresh = vdupq_n_u8(thresh);
s1 = src - 4;
s2 = s1 + 8 * pitch;
d6 = vld1_u8(s1);
s1 += pitch;
d7 = vld1_u8(s2);
s2 += pitch;
d8 = vld1_u8(s1);
s1 += pitch;
d9 = vld1_u8(s2);
s2 += pitch;
d10 = vld1_u8(s1);
s1 += pitch;
d11 = vld1_u8(s2);
s2 += pitch;
d12 = vld1_u8(s1);
s1 += pitch;
d13 = vld1_u8(s2);
s2 += pitch;
d14 = vld1_u8(s1);
s1 += pitch;
d15 = vld1_u8(s2);
s2 += pitch;
d16 = vld1_u8(s1);
s1 += pitch;
d17 = vld1_u8(s2);
s2 += pitch;
d18 = vld1_u8(s1);
s1 += pitch;
d19 = vld1_u8(s2);
s2 += pitch;
d20 = vld1_u8(s1);
d21 = vld1_u8(s2);
q3 = vcombine_u8(d6, d7);
q4 = vcombine_u8(d8, d9);
q5 = vcombine_u8(d10, d11);
q6 = vcombine_u8(d12, d13);
q7 = vcombine_u8(d14, d15);
q8 = vcombine_u8(d16, d17);
q9 = vcombine_u8(d18, d19);
q10 = vcombine_u8(d20, d21);
q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
vreinterpretq_u16_u32(q2tmp2.val[0]));
q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
vreinterpretq_u16_u32(q2tmp3.val[0]));
q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
vreinterpretq_u16_u32(q2tmp2.val[1]));
q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
vreinterpretq_u16_u32(q2tmp3.val[1]));
q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
vreinterpretq_u8_u16(q2tmp5.val[0]));
q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
vreinterpretq_u8_u16(q2tmp5.val[1]));
q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
vreinterpretq_u8_u16(q2tmp7.val[0]));
q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
vreinterpretq_u8_u16(q2tmp7.val[1]));
q3 = q2tmp8.val[0];
q4 = q2tmp8.val[1];
q5 = q2tmp9.val[0];
q6 = q2tmp9.val[1];
q7 = q2tmp10.val[0];
q8 = q2tmp10.val[1];
q9 = q2tmp11.val[0];
q10 = q2tmp11.val[1];
vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
q5, q6, q7, q8, q9, q10,
&q4, &q5, &q6, &q7, &q8, &q9);
q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
vreinterpretq_u16_u32(q2tmp2.val[0]));
q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
vreinterpretq_u16_u32(q2tmp3.val[0]));
q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
vreinterpretq_u16_u32(q2tmp2.val[1]));
q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
vreinterpretq_u16_u32(q2tmp3.val[1]));
q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
vreinterpretq_u8_u16(q2tmp5.val[0]));
q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
vreinterpretq_u8_u16(q2tmp5.val[1]));
q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
vreinterpretq_u8_u16(q2tmp7.val[0]));
q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
vreinterpretq_u8_u16(q2tmp7.val[1]));
q3 = q2tmp8.val[0];
q4 = q2tmp8.val[1];
q5 = q2tmp9.val[0];
q6 = q2tmp9.val[1];
q7 = q2tmp10.val[0];
q8 = q2tmp10.val[1];
q9 = q2tmp11.val[0];
q10 = q2tmp11.val[1];
s1 -= 7 * pitch;
s2 -= 7 * pitch;
vst1_u8(s1, vget_low_u8(q3));
s1 += pitch;
vst1_u8(s2, vget_high_u8(q3));
s2 += pitch;
vst1_u8(s1, vget_low_u8(q4));
s1 += pitch;
vst1_u8(s2, vget_high_u8(q4));
s2 += pitch;
vst1_u8(s1, vget_low_u8(q5));
s1 += pitch;
vst1_u8(s2, vget_high_u8(q5));
s2 += pitch;
vst1_u8(s1, vget_low_u8(q6));
s1 += pitch;
vst1_u8(s2, vget_high_u8(q6));
s2 += pitch;
vst1_u8(s1, vget_low_u8(q7));
s1 += pitch;
vst1_u8(s2, vget_high_u8(q7));
s2 += pitch;
vst1_u8(s1, vget_low_u8(q8));
s1 += pitch;
vst1_u8(s2, vget_high_u8(q8));
s2 += pitch;
vst1_u8(s1, vget_low_u8(q9));
s1 += pitch;
vst1_u8(s2, vget_high_u8(q9));
s2 += pitch;
vst1_u8(s1, vget_low_u8(q10));
vst1_u8(s2, vget_high_u8(q10));
return;
}
void vp8_mbloop_filter_vertical_edge_uv_neon(
unsigned char *u,
int pitch,
unsigned char blimit,
unsigned char limit,
unsigned char thresh,
unsigned char *v) {
unsigned char *us, *ud;
unsigned char *vs, *vd;
uint8x16_t qblimit, qlimit, qthresh, q3, q4;
uint8x16_t q5, q6, q7, q8, q9, q10;
uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
uint8x8_t d15, d16, d17, d18, d19, d20, d21;
uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
qblimit = vdupq_n_u8(blimit);
qlimit = vdupq_n_u8(limit);
qthresh = vdupq_n_u8(thresh);
us = u - 4;
vs = v - 4;
d6 = vld1_u8(us);
us += pitch;
d7 = vld1_u8(vs);
vs += pitch;
d8 = vld1_u8(us);
us += pitch;
d9 = vld1_u8(vs);
vs += pitch;
d10 = vld1_u8(us);
us += pitch;
d11 = vld1_u8(vs);
vs += pitch;
d12 = vld1_u8(us);
us += pitch;
d13 = vld1_u8(vs);
vs += pitch;
d14 = vld1_u8(us);
us += pitch;
d15 = vld1_u8(vs);
vs += pitch;
d16 = vld1_u8(us);
us += pitch;
d17 = vld1_u8(vs);
vs += pitch;
d18 = vld1_u8(us);
us += pitch;
d19 = vld1_u8(vs);
vs += pitch;
d20 = vld1_u8(us);
d21 = vld1_u8(vs);
q3 = vcombine_u8(d6, d7);
q4 = vcombine_u8(d8, d9);
q5 = vcombine_u8(d10, d11);
q6 = vcombine_u8(d12, d13);
q7 = vcombine_u8(d14, d15);
q8 = vcombine_u8(d16, d17);
q9 = vcombine_u8(d18, d19);
q10 = vcombine_u8(d20, d21);
q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
vreinterpretq_u16_u32(q2tmp2.val[0]));
q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
vreinterpretq_u16_u32(q2tmp3.val[0]));
q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
vreinterpretq_u16_u32(q2tmp2.val[1]));
q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
vreinterpretq_u16_u32(q2tmp3.val[1]));
q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
vreinterpretq_u8_u16(q2tmp5.val[0]));
q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
vreinterpretq_u8_u16(q2tmp5.val[1]));
q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
vreinterpretq_u8_u16(q2tmp7.val[0]));
q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
vreinterpretq_u8_u16(q2tmp7.val[1]));
q3 = q2tmp8.val[0];
q4 = q2tmp8.val[1];
q5 = q2tmp9.val[0];
q6 = q2tmp9.val[1];
q7 = q2tmp10.val[0];
q8 = q2tmp10.val[1];
q9 = q2tmp11.val[0];
q10 = q2tmp11.val[1];
vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
q5, q6, q7, q8, q9, q10,
&q4, &q5, &q6, &q7, &q8, &q9);
q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
vreinterpretq_u16_u32(q2tmp2.val[0]));
q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
vreinterpretq_u16_u32(q2tmp3.val[0]));
q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
vreinterpretq_u16_u32(q2tmp2.val[1]));
q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
vreinterpretq_u16_u32(q2tmp3.val[1]));
q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
vreinterpretq_u8_u16(q2tmp5.val[0]));
q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
vreinterpretq_u8_u16(q2tmp5.val[1]));
q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
vreinterpretq_u8_u16(q2tmp7.val[0]));
q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
vreinterpretq_u8_u16(q2tmp7.val[1]));
q3 = q2tmp8.val[0];
q4 = q2tmp8.val[1];
q5 = q2tmp9.val[0];
q6 = q2tmp9.val[1];
q7 = q2tmp10.val[0];
q8 = q2tmp10.val[1];
q9 = q2tmp11.val[0];
q10 = q2tmp11.val[1];
ud = u - 4;
vst1_u8(ud, vget_low_u8(q3));
ud += pitch;
vst1_u8(ud, vget_low_u8(q4));
ud += pitch;
vst1_u8(ud, vget_low_u8(q5));
ud += pitch;
vst1_u8(ud, vget_low_u8(q6));
ud += pitch;
vst1_u8(ud, vget_low_u8(q7));
ud += pitch;
vst1_u8(ud, vget_low_u8(q8));
ud += pitch;
vst1_u8(ud, vget_low_u8(q9));
ud += pitch;
vst1_u8(ud, vget_low_u8(q10));
vd = v - 4;
vst1_u8(vd, vget_high_u8(q3));
vd += pitch;
vst1_u8(vd, vget_high_u8(q4));
vd += pitch;
vst1_u8(vd, vget_high_u8(q5));
vd += pitch;
vst1_u8(vd, vget_high_u8(q6));
vd += pitch;
vst1_u8(vd, vget_high_u8(q7));
vd += pitch;
vst1_u8(vd, vget_high_u8(q8));
vd += pitch;
vst1_u8(vd, vget_high_u8(q9));
vd += pitch;
vst1_u8(vd, vget_high_u8(q10));
return;
}

View file

@ -1,210 +0,0 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "vp8/common/blockd.h"
void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x,
unsigned char * yabove_row,
unsigned char * yleft,
int left_stride,
unsigned char * ypred_ptr,
int y_stride) {
const int mode = x->mode_info_context->mbmi.mode;
int i;
switch (mode) {
case DC_PRED:
{
int shift = x->up_available + x->left_available;
uint8x16_t v_expected_dc = vdupq_n_u8(128);
if (shift) {
unsigned int average = 0;
int expected_dc;
if (x->up_available) {
const uint8x16_t v_above = vld1q_u8(yabove_row);
const uint16x8_t a = vpaddlq_u8(v_above);
const uint32x4_t b = vpaddlq_u16(a);
const uint64x2_t c = vpaddlq_u32(b);
const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
vreinterpret_u32_u64(vget_high_u64(c)));
average = vget_lane_u32(d, 0);
}
if (x->left_available) {
for (i = 0; i < 16; ++i) {
average += yleft[0];
yleft += left_stride;
}
}
shift += 3;
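// shift is now log2 of the number of pixels summed (16 or 32),
// so expected_dc is a rounded average.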
expected_dc = (average + (1 << (shift - 1))) >> shift;
v_expected_dc = vmovq_n_u8((uint8_t)expected_dc);
}
for (i = 0; i < 16; ++i) {
vst1q_u8(ypred_ptr, v_expected_dc);
ypred_ptr += y_stride;
}
}
break;
case V_PRED:
{
const uint8x16_t v_above = vld1q_u8(yabove_row);
for (i = 0; i < 16; ++i) {
vst1q_u8(ypred_ptr, v_above);
ypred_ptr += y_stride;
}
}
break;
case H_PRED:
{
for (i = 0; i < 16; ++i) {
const uint8x16_t v_yleft = vmovq_n_u8((uint8_t)yleft[0]);
yleft += left_stride;
vst1q_u8(ypred_ptr, v_yleft);
ypred_ptr += y_stride;
}
}
break;
case TM_PRED:
{
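// TM prediction: pred(x, y) = clamp(above[x] + left[y] - top_left)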
const uint16x8_t v_ytop_left = vmovq_n_u16((int16_t)yabove_row[-1]);
const uint8x16_t v_above = vld1q_u8(yabove_row);
for (i = 0; i < 16; ++i) {
const uint8x8_t v_yleft = vmov_n_u8((int8_t)yleft[0]);
const uint16x8_t a_lo = vaddl_u8(vget_low_u8(v_above), v_yleft);
const uint16x8_t a_hi = vaddl_u8(vget_high_u8(v_above), v_yleft);
const int16x8_t b_lo = vsubq_s16(vreinterpretq_s16_u16(a_lo),
vreinterpretq_s16_u16(v_ytop_left));
const int16x8_t b_hi = vsubq_s16(vreinterpretq_s16_u16(a_hi),
vreinterpretq_s16_u16(v_ytop_left));
const uint8x8_t pred_lo = vqmovun_s16(b_lo);
const uint8x8_t pred_hi = vqmovun_s16(b_hi);
vst1q_u8(ypred_ptr, vcombine_u8(pred_lo, pred_hi));
ypred_ptr += y_stride;
yleft += left_stride;
}
}
break;
}
}
void vp8_build_intra_predictors_mbuv_s_neon(MACROBLOCKD *x,
unsigned char * uabove_row,
unsigned char * vabove_row,
unsigned char * uleft,
unsigned char * vleft,
int left_stride,
unsigned char * upred_ptr,
unsigned char * vpred_ptr,
int pred_stride) {
const int mode = x->mode_info_context->mbmi.uv_mode;
int i;
switch (mode) {
case DC_PRED:
{
int shift = x->up_available + x->left_available;
uint8x8_t v_expected_udc = vdup_n_u8(128);
uint8x8_t v_expected_vdc = vdup_n_u8(128);
if (shift) {
unsigned int average_u = 0;
unsigned int average_v = 0;
int expected_udc;
int expected_vdc;
if (x->up_available) {
const uint8x8_t v_uabove = vld1_u8(uabove_row);
const uint8x8_t v_vabove = vld1_u8(vabove_row);
const uint16x8_t a = vpaddlq_u8(vcombine_u8(v_uabove, v_vabove));
const uint32x4_t b = vpaddlq_u16(a);
const uint64x2_t c = vpaddlq_u32(b);
average_u = vgetq_lane_u32(vreinterpretq_u32_u64((c)), 0);
average_v = vgetq_lane_u32(vreinterpretq_u32_u64((c)), 2);
}
if (x->left_available) {
for (i = 0; i < 8; ++i) {
average_u += uleft[0];
uleft += left_stride;
average_v += vleft[0];
vleft += left_stride;
}
}
shift += 2;
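// shift is now log2 of the number of pixels summed per plane (8 or 16).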
expected_udc = (average_u + (1 << (shift - 1))) >> shift;
expected_vdc = (average_v + (1 << (shift - 1))) >> shift;
v_expected_udc = vmov_n_u8((uint8_t)expected_udc);
v_expected_vdc = vmov_n_u8((uint8_t)expected_vdc);
}
for (i = 0; i < 8; ++i) {
vst1_u8(upred_ptr, v_expected_udc);
upred_ptr += pred_stride;
vst1_u8(vpred_ptr, v_expected_vdc);
vpred_ptr += pred_stride;
}
}
break;
case V_PRED:
{
const uint8x8_t v_uabove = vld1_u8(uabove_row);
const uint8x8_t v_vabove = vld1_u8(vabove_row);
for (i = 0; i < 8; ++i) {
vst1_u8(upred_ptr, v_uabove);
upred_ptr += pred_stride;
vst1_u8(vpred_ptr, v_vabove);
vpred_ptr += pred_stride;
}
}
break;
case H_PRED:
{
for (i = 0; i < 8; ++i) {
const uint8x8_t v_uleft = vmov_n_u8((uint8_t)uleft[0]);
const uint8x8_t v_vleft = vmov_n_u8((uint8_t)vleft[0]);
uleft += left_stride;
vleft += left_stride;
vst1_u8(upred_ptr, v_uleft);
upred_ptr += pred_stride;
vst1_u8(vpred_ptr, v_vleft);
vpred_ptr += pred_stride;
}
}
break;
case TM_PRED:
{
const uint16x8_t v_utop_left = vmovq_n_u16((int16_t)uabove_row[-1]);
const uint16x8_t v_vtop_left = vmovq_n_u16((int16_t)vabove_row[-1]);
const uint8x8_t v_uabove = vld1_u8(uabove_row);
const uint8x8_t v_vabove = vld1_u8(vabove_row);
for (i = 0; i < 8; ++i) {
const uint8x8_t v_uleft = vmov_n_u8((int8_t)uleft[0]);
const uint8x8_t v_vleft = vmov_n_u8((int8_t)vleft[0]);
const uint16x8_t a_u = vaddl_u8(v_uabove, v_uleft);
const uint16x8_t a_v = vaddl_u8(v_vabove, v_vleft);
const int16x8_t b_u = vsubq_s16(vreinterpretq_s16_u16(a_u),
vreinterpretq_s16_u16(v_utop_left));
const int16x8_t b_v = vsubq_s16(vreinterpretq_s16_u16(a_v),
vreinterpretq_s16_u16(v_vtop_left));
const uint8x8_t pred_u = vqmovun_s16(b_u);
const uint8x8_t pred_v = vqmovun_s16(b_v);
vst1_u8(upred_ptr, pred_u);
vst1_u8(vpred_ptr, pred_v);
upred_ptr += pred_stride;
vpred_ptr += pred_stride;
uleft += left_stride;
vleft += left_stride;
}
}
break;
}
}

View file

@ -0,0 +1,207 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_sad16x16_neon|
EXPORT |vp8_sad16x8_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr
; r1 int src_stride
; r2 unsigned char *ref_ptr
; r3 int ref_stride
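; Accumulates per-row absolute differences into q12/q13 (u16), then
; reduces them with vpaddl/vadd to a single u32 SAD returned in r0.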
|vp8_sad16x16_neon| PROC
;;
vld1.8 {q0}, [r0], r1
vld1.8 {q4}, [r2], r3
vld1.8 {q1}, [r0], r1
vld1.8 {q5}, [r2], r3
vabdl.u8 q12, d0, d8
vabdl.u8 q13, d1, d9
vld1.8 {q2}, [r0], r1
vld1.8 {q6}, [r2], r3
vabal.u8 q12, d2, d10
vabal.u8 q13, d3, d11
vld1.8 {q3}, [r0], r1
vld1.8 {q7}, [r2], r3
vabal.u8 q12, d4, d12
vabal.u8 q13, d5, d13
;;
vld1.8 {q0}, [r0], r1
vld1.8 {q4}, [r2], r3
vabal.u8 q12, d6, d14
vabal.u8 q13, d7, d15
vld1.8 {q1}, [r0], r1
vld1.8 {q5}, [r2], r3
vabal.u8 q12, d0, d8
vabal.u8 q13, d1, d9
vld1.8 {q2}, [r0], r1
vld1.8 {q6}, [r2], r3
vabal.u8 q12, d2, d10
vabal.u8 q13, d3, d11
vld1.8 {q3}, [r0], r1
vld1.8 {q7}, [r2], r3
vabal.u8 q12, d4, d12
vabal.u8 q13, d5, d13
;;
vld1.8 {q0}, [r0], r1
vld1.8 {q4}, [r2], r3
vabal.u8 q12, d6, d14
vabal.u8 q13, d7, d15
vld1.8 {q1}, [r0], r1
vld1.8 {q5}, [r2], r3
vabal.u8 q12, d0, d8
vabal.u8 q13, d1, d9
vld1.8 {q2}, [r0], r1
vld1.8 {q6}, [r2], r3
vabal.u8 q12, d2, d10
vabal.u8 q13, d3, d11
vld1.8 {q3}, [r0], r1
vld1.8 {q7}, [r2], r3
vabal.u8 q12, d4, d12
vabal.u8 q13, d5, d13
;;
vld1.8 {q0}, [r0], r1
vld1.8 {q4}, [r2], r3
vabal.u8 q12, d6, d14
vabal.u8 q13, d7, d15
vld1.8 {q1}, [r0], r1
vld1.8 {q5}, [r2], r3
vabal.u8 q12, d0, d8
vabal.u8 q13, d1, d9
vld1.8 {q2}, [r0], r1
vld1.8 {q6}, [r2], r3
vabal.u8 q12, d2, d10
vabal.u8 q13, d3, d11
vld1.8 {q3}, [r0]
vld1.8 {q7}, [r2]
vabal.u8 q12, d4, d12
vabal.u8 q13, d5, d13
vabal.u8 q12, d6, d14
vabal.u8 q13, d7, d15
vadd.u16 q0, q12, q13
vpaddl.u16 q1, q0
vpaddl.u32 q0, q1
vadd.u32 d0, d0, d1
vmov.32 r0, d0[0]
bx lr
ENDP
;==============================
;unsigned int vp8_sad16x8_c(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
|vp8_sad16x8_neon| PROC
vld1.8 {q0}, [r0], r1
vld1.8 {q4}, [r2], r3
vld1.8 {q1}, [r0], r1
vld1.8 {q5}, [r2], r3
vabdl.u8 q12, d0, d8
vabdl.u8 q13, d1, d9
vld1.8 {q2}, [r0], r1
vld1.8 {q6}, [r2], r3
vabal.u8 q12, d2, d10
vabal.u8 q13, d3, d11
vld1.8 {q3}, [r0], r1
vld1.8 {q7}, [r2], r3
vabal.u8 q12, d4, d12
vabal.u8 q13, d5, d13
vld1.8 {q0}, [r0], r1
vld1.8 {q4}, [r2], r3
vabal.u8 q12, d6, d14
vabal.u8 q13, d7, d15
vld1.8 {q1}, [r0], r1
vld1.8 {q5}, [r2], r3
vabal.u8 q12, d0, d8
vabal.u8 q13, d1, d9
vld1.8 {q2}, [r0], r1
vld1.8 {q6}, [r2], r3
vabal.u8 q12, d2, d10
vabal.u8 q13, d3, d11
vld1.8 {q3}, [r0], r1
vld1.8 {q7}, [r2], r3
vabal.u8 q12, d4, d12
vabal.u8 q13, d5, d13
vabal.u8 q12, d6, d14
vabal.u8 q13, d7, d15
vadd.u16 q0, q12, q13
vpaddl.u16 q1, q0
vpaddl.u32 q0, q1
vadd.u32 d0, d0, d1
vmov.32 r0, d0[0]
bx lr
ENDP
END

View file

@ -0,0 +1,209 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_sad8x8_neon|
EXPORT |vp8_sad8x16_neon|
EXPORT |vp8_sad4x4_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; unsigned int vp8_sad8x8_c(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
|vp8_sad8x8_neon| PROC
vld1.8 {d0}, [r0], r1
vld1.8 {d8}, [r2], r3
vld1.8 {d2}, [r0], r1
vld1.8 {d10}, [r2], r3
vabdl.u8 q12, d0, d8
vld1.8 {d4}, [r0], r1
vld1.8 {d12}, [r2], r3
vabal.u8 q12, d2, d10
vld1.8 {d6}, [r0], r1
vld1.8 {d14}, [r2], r3
vabal.u8 q12, d4, d12
vld1.8 {d0}, [r0], r1
vld1.8 {d8}, [r2], r3
vabal.u8 q12, d6, d14
vld1.8 {d2}, [r0], r1
vld1.8 {d10}, [r2], r3
vabal.u8 q12, d0, d8
vld1.8 {d4}, [r0], r1
vld1.8 {d12}, [r2], r3
vabal.u8 q12, d2, d10
vld1.8 {d6}, [r0], r1
vld1.8 {d14}, [r2], r3
vabal.u8 q12, d4, d12
vabal.u8 q12, d6, d14
vpaddl.u16 q1, q12
vpaddl.u32 q0, q1
vadd.u32 d0, d0, d1
vmov.32 r0, d0[0]
bx lr
ENDP
;============================
;unsigned int vp8_sad8x16_c(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
|vp8_sad8x16_neon| PROC
vld1.8 {d0}, [r0], r1
vld1.8 {d8}, [r2], r3
vld1.8 {d2}, [r0], r1
vld1.8 {d10}, [r2], r3
vabdl.u8 q12, d0, d8
vld1.8 {d4}, [r0], r1
vld1.8 {d12}, [r2], r3
vabal.u8 q12, d2, d10
vld1.8 {d6}, [r0], r1
vld1.8 {d14}, [r2], r3
vabal.u8 q12, d4, d12
vld1.8 {d0}, [r0], r1
vld1.8 {d8}, [r2], r3
vabal.u8 q12, d6, d14
vld1.8 {d2}, [r0], r1
vld1.8 {d10}, [r2], r3
vabal.u8 q12, d0, d8
vld1.8 {d4}, [r0], r1
vld1.8 {d12}, [r2], r3
vabal.u8 q12, d2, d10
vld1.8 {d6}, [r0], r1
vld1.8 {d14}, [r2], r3
vabal.u8 q12, d4, d12
vld1.8 {d0}, [r0], r1
vld1.8 {d8}, [r2], r3
vabal.u8 q12, d6, d14
vld1.8 {d2}, [r0], r1
vld1.8 {d10}, [r2], r3
vabal.u8 q12, d0, d8
vld1.8 {d4}, [r0], r1
vld1.8 {d12}, [r2], r3
vabal.u8 q12, d2, d10
vld1.8 {d6}, [r0], r1
vld1.8 {d14}, [r2], r3
vabal.u8 q12, d4, d12
vld1.8 {d0}, [r0], r1
vld1.8 {d8}, [r2], r3
vabal.u8 q12, d6, d14
vld1.8 {d2}, [r0], r1
vld1.8 {d10}, [r2], r3
vabal.u8 q12, d0, d8
vld1.8 {d4}, [r0], r1
vld1.8 {d12}, [r2], r3
vabal.u8 q12, d2, d10
vld1.8 {d6}, [r0], r1
vld1.8 {d14}, [r2], r3
vabal.u8 q12, d4, d12
vabal.u8 q12, d6, d14
vpaddl.u16 q1, q12
vpaddl.u32 q0, q1
vadd.u32 d0, d0, d1
vmov.32 r0, d0[0]
bx lr
ENDP
;===========================
;unsigned int vp8_sad4x4_c(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
|vp8_sad4x4_neon| PROC
vld1.8 {d0}, [r0], r1
vld1.8 {d8}, [r2], r3
vld1.8 {d2}, [r0], r1
vld1.8 {d10}, [r2], r3
vabdl.u8 q12, d0, d8
vld1.8 {d4}, [r0], r1
vld1.8 {d12}, [r2], r3
vabal.u8 q12, d2, d10
vld1.8 {d6}, [r0], r1
vld1.8 {d14}, [r2], r3
vabal.u8 q12, d4, d12
vabal.u8 q12, d6, d14
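; only the first 4 columns of each 8-byte load are valid for 4x4,
; so reduce just the low half of the accumulator (d24)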
vpaddl.u16 d1, d24
vpaddl.u32 d0, d1
vmov.32 r0, d0[0]
bx lr
ENDP
END

View file

@ -1,184 +0,0 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
unsigned int vp8_sad8x8_neon(
unsigned char *src_ptr,
int src_stride,
unsigned char *ref_ptr,
int ref_stride) {
uint8x8_t d0, d8;
uint16x8_t q12;
uint32x4_t q1;
uint64x2_t q3;
uint32x2_t d5;
int i;
d0 = vld1_u8(src_ptr);
src_ptr += src_stride;
d8 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabdl_u8(d0, d8);
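// the first row seeds the accumulator with vabdl; the remaining rows
// accumulate with vabal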
for (i = 0; i < 7; i++) {
d0 = vld1_u8(src_ptr);
src_ptr += src_stride;
d8 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabal_u8(q12, d0, d8);
}
q1 = vpaddlq_u16(q12);
q3 = vpaddlq_u32(q1);
d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
vreinterpret_u32_u64(vget_high_u64(q3)));
return vget_lane_u32(d5, 0);
}
unsigned int vp8_sad8x16_neon(
unsigned char *src_ptr,
int src_stride,
unsigned char *ref_ptr,
int ref_stride) {
uint8x8_t d0, d8;
uint16x8_t q12;
uint32x4_t q1;
uint64x2_t q3;
uint32x2_t d5;
int i;
d0 = vld1_u8(src_ptr);
src_ptr += src_stride;
d8 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabdl_u8(d0, d8);
for (i = 0; i < 15; i++) {
d0 = vld1_u8(src_ptr);
src_ptr += src_stride;
d8 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabal_u8(q12, d0, d8);
}
q1 = vpaddlq_u16(q12);
q3 = vpaddlq_u32(q1);
d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
vreinterpret_u32_u64(vget_high_u64(q3)));
return vget_lane_u32(d5, 0);
}
unsigned int vp8_sad4x4_neon(
unsigned char *src_ptr,
int src_stride,
unsigned char *ref_ptr,
int ref_stride) {
uint8x8_t d0, d8;
uint16x8_t q12;
uint32x2_t d1;
uint64x1_t d3;
int i;
d0 = vld1_u8(src_ptr);
src_ptr += src_stride;
d8 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabdl_u8(d0, d8);
for (i = 0; i < 3; i++) {
d0 = vld1_u8(src_ptr);
src_ptr += src_stride;
d8 = vld1_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabal_u8(q12, d0, d8);
}
d1 = vpaddl_u16(vget_low_u16(q12));
d3 = vpaddl_u32(d1);
return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
}
unsigned int vp8_sad16x16_neon(
unsigned char *src_ptr,
int src_stride,
unsigned char *ref_ptr,
int ref_stride) {
uint8x16_t q0, q4;
uint16x8_t q12, q13;
uint32x4_t q1;
uint64x2_t q3;
uint32x2_t d5;
int i;
q0 = vld1q_u8(src_ptr);
src_ptr += src_stride;
q4 = vld1q_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
for (i = 0; i < 15; i++) {
q0 = vld1q_u8(src_ptr);
src_ptr += src_stride;
q4 = vld1q_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
}
q12 = vaddq_u16(q12, q13);
q1 = vpaddlq_u16(q12);
q3 = vpaddlq_u32(q1);
d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
vreinterpret_u32_u64(vget_high_u64(q3)));
return vget_lane_u32(d5, 0);
}
unsigned int vp8_sad16x8_neon(
unsigned char *src_ptr,
int src_stride,
unsigned char *ref_ptr,
int ref_stride) {
uint8x16_t q0, q4;
uint16x8_t q12, q13;
uint32x4_t q1;
uint64x2_t q3;
uint32x2_t d5;
int i;
q0 = vld1q_u8(src_ptr);
src_ptr += src_stride;
q4 = vld1q_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
for (i = 0; i < 7; i++) {
q0 = vld1q_u8(src_ptr);
src_ptr += src_stride;
q4 = vld1q_u8(ref_ptr);
ref_ptr += ref_stride;
q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
}
q12 = vaddq_u16(q12, q13);
q1 = vpaddlq_u16(q12);
q3 = vpaddlq_u32(q1);
d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
vreinterpret_u32_u64(vget_high_u64(q3)));
return vget_lane_u32(d5, 0);
}

View file

@ -0,0 +1,36 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_push_neon|
EXPORT |vp8_pop_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
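; Spills and restores the callee-saved NEON registers d8-d15 through the
; buffer passed in r0.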
|vp8_push_neon| PROC
vst1.i64 {d8, d9, d10, d11}, [r0]!
vst1.i64 {d12, d13, d14, d15}, [r0]!
bx lr
ENDP
|vp8_pop_neon| PROC
vld1.i64 {d8, d9, d10, d11}, [r0]!
vld1.i64 {d12, d13, d14, d15}, [r0]!
bx lr
ENDP
END

View file

@ -0,0 +1,139 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_short_idct4x4llm_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;*************************************************************
;void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch,
; unsigned char *dst, int stride)
;r0 short *input
;r1 unsigned char *pred
;r2 int pitch
;r3 unsigned char *dst
;sp int stride
;*************************************************************
; static const int cospi8sqrt2minus1=20091;
; static const int sinpi8sqrt2 =35468;
; static const int rounding = 0;
; Optimization note: The data resulting from dequantization are signed
; 13-bit values in the range [-4096, 4095]. This allows use of the NEON
; "vqdmulh" instruction, since the result won't go out of range
; (13+16+1=30 bits < 32 bits). The instruction returns the high half
; of the multiplication, which is what the IDCT needs.
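; vqdmulh returns the high half of 2*a*b, i.e. (a*b) >> 15; together with
; the vshr #1 below this yields (a*b) >> 16, matching the 65536 scaling of
; the constants (20091 = (cos(pi/8)*sqrt(2) - 1) * 65536 and 35468 =
; sin(pi/8)*sqrt(2) * 65536). The vqadd of the original input afterwards
; supplies the "+1" for the cosine term and corrects the negative wrap of
; 35468, which exceeds the s16 range.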
|vp8_short_idct4x4llm_neon| PROC
adr r12, idct_coeff
vld1.16 {q1, q2}, [r0]
vld1.16 {d0}, [r12]
vswp d3, d4 ;q2(vp[4] vp[12])
ldr r0, [sp] ; stride
vqdmulh.s16 q3, q2, d0[2]
vqdmulh.s16 q4, q2, d0[0]
vqadd.s16 d12, d2, d3 ;a1
vqsub.s16 d13, d2, d3 ;b1
vshr.s16 q3, q3, #1
vshr.s16 q4, q4, #1
vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negative number)
vqadd.s16 q4, q4, q2
;d6 - c1:temp1
;d7 - d1:temp2
;d8 - d1:temp1
;d9 - c1:temp2
vqsub.s16 d10, d6, d9 ;c1
vqadd.s16 d11, d7, d8 ;d1
vqadd.s16 d2, d12, d11
vqadd.s16 d3, d13, d10
vqsub.s16 d4, d13, d10
vqsub.s16 d5, d12, d11
vtrn.32 d2, d4
vtrn.32 d3, d5
vtrn.16 d2, d3
vtrn.16 d4, d5
vswp d3, d4
vqdmulh.s16 q3, q2, d0[2]
vqdmulh.s16 q4, q2, d0[0]
vqadd.s16 d12, d2, d3 ;a1
vqsub.s16 d13, d2, d3 ;b1
vshr.s16 q3, q3, #1
vshr.s16 q4, q4, #1
vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negative number)
vqadd.s16 q4, q4, q2
vqsub.s16 d10, d6, d9 ;c1
vqadd.s16 d11, d7, d8 ;d1
vqadd.s16 d2, d12, d11
vqadd.s16 d3, d13, d10
vqsub.s16 d4, d13, d10
vqsub.s16 d5, d12, d11
vrshr.s16 d2, d2, #3
vrshr.s16 d3, d3, #3
vrshr.s16 d4, d4, #3
vrshr.s16 d5, d5, #3
vtrn.32 d2, d4
vtrn.32 d3, d5
vtrn.16 d2, d3
vtrn.16 d4, d5
; load prediction data
vld1.32 d6[0], [r1], r2
vld1.32 d6[1], [r1], r2
vld1.32 d7[0], [r1], r2
vld1.32 d7[1], [r1], r2
; add prediction and residual
vaddw.u8 q1, q1, d6
vaddw.u8 q2, q2, d7
vqmovun.s16 d1, q1
vqmovun.s16 d2, q2
; store to destination
vst1.32 d1[0], [r3], r0
vst1.32 d1[1], [r3], r0
vst1.32 d2[0], [r3], r0
vst1.32 d2[1], [r3], r0
bx lr
ENDP
;-----------------
idct_coeff
DCD 0x4e7b4e7b, 0x8a8c8a8c
;20091, 20091, 35468, 35468
END

View file

@ -1,123 +0,0 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
static const int16_t cospi8sqrt2minus1 = 20091;
static const int16_t sinpi8sqrt2 = 35468;
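// 20091/65536 = cos(pi/8)*sqrt(2) - 1 and 35468/65536 = sin(pi/8)*sqrt(2);
// 35468 wraps to a negative int16_t, which the shift/add sequence below
// compensates for.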
void vp8_short_idct4x4llm_neon(
int16_t *input,
unsigned char *pred_ptr,
int pred_stride,
unsigned char *dst_ptr,
int dst_stride) {
int i;
uint32x2_t d6u32 = vdup_n_u32(0);
uint8x8_t d1u8;
int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
uint16x8_t q1u16;
int16x8_t q1s16, q2s16, q3s16, q4s16;
int32x2x2_t v2tmp0, v2tmp1;
int16x4x2_t v2tmp2, v2tmp3;
d2 = vld1_s16(input);
d3 = vld1_s16(input + 4);
d4 = vld1_s16(input + 8);
d5 = vld1_s16(input + 12);
// 1st for loop
q1s16 = vcombine_s16(d2, d4); // Swap d3 d4 here
q2s16 = vcombine_s16(d3, d5);
q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);
d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1
d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1
q3s16 = vshrq_n_s16(q3s16, 1);
q4s16 = vshrq_n_s16(q4s16, 1);
q3s16 = vqaddq_s16(q3s16, q2s16);
q4s16 = vqaddq_s16(q4s16, q2s16);
d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1
d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1
d2 = vqadd_s16(d12, d11);
d3 = vqadd_s16(d13, d10);
d4 = vqsub_s16(d13, d10);
d5 = vqsub_s16(d12, d11);
v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
vreinterpret_s16_s32(v2tmp1.val[0]));
v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
vreinterpret_s16_s32(v2tmp1.val[1]));
// 2nd for loop
q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp3.val[0]);
q2s16 = vcombine_s16(v2tmp2.val[1], v2tmp3.val[1]);
q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);
d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1
d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1
q3s16 = vshrq_n_s16(q3s16, 1);
q4s16 = vshrq_n_s16(q4s16, 1);
q3s16 = vqaddq_s16(q3s16, q2s16);
q4s16 = vqaddq_s16(q4s16, q2s16);
d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1
d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1
d2 = vqadd_s16(d12, d11);
d3 = vqadd_s16(d13, d10);
d4 = vqsub_s16(d13, d10);
d5 = vqsub_s16(d12, d11);
d2 = vrshr_n_s16(d2, 3);
d3 = vrshr_n_s16(d3, 3);
d4 = vrshr_n_s16(d4, 3);
d5 = vrshr_n_s16(d5, 3);
v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
vreinterpret_s16_s32(v2tmp1.val[0]));
v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
vreinterpret_s16_s32(v2tmp1.val[1]));
q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp2.val[1]);
q2s16 = vcombine_s16(v2tmp3.val[0], v2tmp3.val[1]);
// dc_only_idct_add
for (i = 0; i < 2; i++, q1s16 = q2s16) {
d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 0);
pred_ptr += pred_stride;
d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 1);
pred_ptr += pred_stride;
q1u16 = vaddw_u8(vreinterpretq_u16_s16(q1s16),
vreinterpret_u8_u32(d6u32));
d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 0);
dst_ptr += dst_stride;
vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 1);
dst_ptr += dst_stride;
}
return;
}

View file

@ -0,0 +1,490 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_sixtap_predict16x16_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
filter16_coeff
DCD 0, 0, 128, 0, 0, 0, 0, 0
DCD 0, -6, 123, 12, -1, 0, 0, 0
DCD 2, -11, 108, 36, -8, 1, 0, 0
DCD 0, -9, 93, 50, -6, 0, 0, 0
DCD 3, -16, 77, 77, -16, 3, 0, 0
DCD 0, -6, 50, 93, -9, 0, 0, 0
DCD 1, -8, 36, 108, -11, 2, 0, 0
DCD 0, -1, 12, 123, -6, 0, 0, 0
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; r4 unsigned char *dst_ptr,
; stack(r5) int dst_pitch
;Note: To take advantage of the 8-bit multiplication instruction in NEON, first apply
; abs() to the filter coeffs to make them u8, then use vmlsl for the negative coeffs.
; After multiplication the result can be negative, so it is treated as s16. Since it
; is also possible for the result to be a large positive number (> 2^15-1), which
; could be mistaken for a negative number, the filter coeffs are applied in the
; order 0, 1, 4, 5, 2, which keeps the running sum in s16 range. Finally, the
; product of the 3rd filter coeff is added with a saturating add. The same applies
; to the other filter functions.
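; e.g. with the half-pel filter {3, -16, 77, 77, -16, 3}, the running sum
; 3*255 + 3*255 + 77*255 = 21165 still fits in s16, but adding the last
; tap's 77*255 = 19635 can reach 40800, hence the saturating vqadd.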
|vp8_sixtap_predict16x16_neon| PROC
push {r4-r5, lr}
adr r12, filter16_coeff
ldr r4, [sp, #12] ;load parameters from stack
ldr r5, [sp, #16] ;load parameters from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_filter16x16_only
add r2, r12, r2, lsl #5 ;calculate filter location
cmp r3, #0 ;skip second_pass filter if yoffset=0
vld1.s32 {q14, q15}, [r2] ;load first_pass filter
beq firstpass_filter16x16_only
sub sp, sp, #336 ;reserve space on stack for temporary storage
mov lr, sp
vabs.s32 q12, q14
vabs.s32 q13, q15
mov r2, #7 ;loop counter
sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
sub r0, r0, r1, lsl #1
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vdup.8 d1, d24[4]
vdup.8 d2, d25[0]
vdup.8 d3, d25[4]
vdup.8 d4, d26[0]
vdup.8 d5, d26[4]
;First Pass: output_height lines x output_width columns (21x16)
filt_blk2d_fp16x16_loop_neon
vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data
vld1.u8 {d9, d10, d11}, [r0], r1
vld1.u8 {d12, d13, d14}, [r0], r1
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q9, d7, d0
vmull.u8 q10, d9, d0
vmull.u8 q11, d10, d0
vmull.u8 q12, d12, d0
vmull.u8 q13, d13, d0
vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d29, d9, d10, #1
vext.8 d30, d12, d13, #1
vmlsl.u8 q8, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q10, d29, d1
vmlsl.u8 q12, d30, d1
vext.8 d28, d7, d8, #1
vext.8 d29, d10, d11, #1
vext.8 d30, d13, d14, #1
vmlsl.u8 q9, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q11, d29, d1
vmlsl.u8 q13, d30, d1
vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
vext.8 d29, d9, d10, #4
vext.8 d30, d12, d13, #4
vmlsl.u8 q8, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q10, d29, d4
vmlsl.u8 q12, d30, d4
vext.8 d28, d7, d8, #4
vext.8 d29, d10, d11, #4
vext.8 d30, d13, d14, #4
vmlsl.u8 q9, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q11, d29, d4
vmlsl.u8 q13, d30, d4
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
vext.8 d29, d9, d10, #5
vext.8 d30, d12, d13, #5
vmlal.u8 q8, d28, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q10, d29, d5
vmlal.u8 q12, d30, d5
vext.8 d28, d7, d8, #5
vext.8 d29, d10, d11, #5
vext.8 d30, d13, d14, #5
vmlal.u8 q9, d28, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q11, d29, d5
vmlal.u8 q13, d30, d5
vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
vext.8 d29, d9, d10, #2
vext.8 d30, d12, d13, #2
vmlal.u8 q8, d28, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q10, d29, d2
vmlal.u8 q12, d30, d2
vext.8 d28, d7, d8, #2
vext.8 d29, d10, d11, #2
vext.8 d30, d13, d14, #2
vmlal.u8 q9, d28, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q11, d29, d2
vmlal.u8 q13, d30, d2
vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
vext.8 d29, d9, d10, #3
vext.8 d30, d12, d13, #3
vext.8 d15, d7, d8, #3
vext.8 d31, d10, d11, #3
vext.8 d6, d13, d14, #3
vmull.u8 q4, d28, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q5, d29, d3
vmull.u8 q6, d30, d3
vqadd.s16 q8, q4 ;sum of all (src_data*filter_parameters)
vqadd.s16 q10, q5
vqadd.s16 q12, q6
vmull.u8 q6, d15, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q7, d31, d3
vmull.u8 q3, d6, d3
subs r2, r2, #1
vqadd.s16 q9, q6
vqadd.s16 q11, q7
vqadd.s16 q13, q3
vqrshrun.s16 d6, q8, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q9, #7
vqrshrun.s16 d8, q10, #7
vqrshrun.s16 d9, q11, #7
vqrshrun.s16 d10, q12, #7
vqrshrun.s16 d11, q13, #7
vst1.u8 {d6, d7, d8}, [lr]! ;store result
vst1.u8 {d9, d10, d11}, [lr]!
bne filt_blk2d_fp16x16_loop_neon
;Second pass: 16x16
;secondpass_filter - do the first 8 columns, then the second 8 columns
add r3, r12, r3, lsl #5
sub lr, lr, #336
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
mov r3, #2 ;loop counter
vabs.s32 q7, q5
vabs.s32 q8, q6
mov r2, #16
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vdup.8 d1, d14[4]
vdup.8 d2, d15[0]
vdup.8 d3, d15[4]
vdup.8 d4, d16[0]
vdup.8 d5, d16[4]
filt_blk2d_sp16x16_outloop_neon
vld1.u8 {d18}, [lr], r2 ;load src data
vld1.u8 {d19}, [lr], r2
vld1.u8 {d20}, [lr], r2
vld1.u8 {d21}, [lr], r2
mov r12, #4 ;loop counter
vld1.u8 {d22}, [lr], r2
secondpass_inner_loop_neon
vld1.u8 {d23}, [lr], r2 ;load src data
vld1.u8 {d24}, [lr], r2
vld1.u8 {d25}, [lr], r2
vld1.u8 {d26}, [lr], r2
vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q4, d19, d0
vmull.u8 q5, d20, d0
vmull.u8 q6, d21, d0
vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q4, d20, d1
vmlsl.u8 q5, d21, d1
vmlsl.u8 q6, d22, d1
vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q4, d23, d4
vmlsl.u8 q5, d24, d4
vmlsl.u8 q6, d25, d4
vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q4, d21, d2
vmlal.u8 q5, d22, d2
vmlal.u8 q6, d23, d2
vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q4, d24, d5
vmlal.u8 q5, d25, d5
vmlal.u8 q6, d26, d5
vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q8, d22, d3
vmull.u8 q9, d23, d3
vmull.u8 q10, d24, d3
subs r12, r12, #1
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q8, #7
vqrshrun.s16 d8, q9, #7
vqrshrun.s16 d9, q10, #7
vst1.u8 {d6}, [r4], r5 ;store result
vmov q9, q11
vst1.u8 {d7}, [r4], r5
vmov q10, q12
vst1.u8 {d8}, [r4], r5
vmov d22, d26
vst1.u8 {d9}, [r4], r5
bne secondpass_inner_loop_neon
subs r3, r3, #1
sub lr, lr, #336
add lr, lr, #8
sub r4, r4, r5, lsl #4
add r4, r4, #8
bne filt_blk2d_sp16x16_outloop_neon
add sp, sp, #336
pop {r4-r5,pc}
;--------------------
firstpass_filter16x16_only
vabs.s32 q12, q14
vabs.s32 q13, q15
mov r2, #8 ;loop counter
sub r0, r0, #2 ;move srcptr back to (column-2)
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vdup.8 d1, d24[4]
vdup.8 d2, d25[0]
vdup.8 d3, d25[4]
vdup.8 d4, d26[0]
vdup.8 d5, d26[4]
;First Pass: output_height lines x output_width columns (16x16)
filt_blk2d_fpo16x16_loop_neon
vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data
vld1.u8 {d9, d10, d11}, [r0], r1
pld [r0]
pld [r0, r1]
vmull.u8 q6, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q7, d7, d0
vmull.u8 q8, d9, d0
vmull.u8 q9, d10, d0
vext.8 d20, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d21, d9, d10, #1
vext.8 d22, d7, d8, #1
vext.8 d23, d10, d11, #1
vext.8 d24, d6, d7, #4 ;construct src_ptr[2]
vext.8 d25, d9, d10, #4
vext.8 d26, d7, d8, #4
vext.8 d27, d10, d11, #4
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
vext.8 d29, d9, d10, #5
vmlsl.u8 q6, d20, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q8, d21, d1
vmlsl.u8 q7, d22, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q9, d23, d1
vmlsl.u8 q6, d24, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q8, d25, d4
vmlsl.u8 q7, d26, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q9, d27, d4
vmlal.u8 q6, d28, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q8, d29, d5
vext.8 d20, d7, d8, #5
vext.8 d21, d10, d11, #5
vext.8 d22, d6, d7, #2 ;construct src_ptr[0]
vext.8 d23, d9, d10, #2
vext.8 d24, d7, d8, #2
vext.8 d25, d10, d11, #2
vext.8 d26, d6, d7, #3 ;construct src_ptr[1]
vext.8 d27, d9, d10, #3
vext.8 d28, d7, d8, #3
vext.8 d29, d10, d11, #3
vmlal.u8 q7, d20, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q9, d21, d5
vmlal.u8 q6, d22, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q8, d23, d2
vmlal.u8 q7, d24, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q9, d25, d2
vmull.u8 q10, d26, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q11, d27, d3
vmull.u8 q12, d28, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q15, d29, d3
vqadd.s16 q6, q10 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q11
vqadd.s16 q7, q12
vqadd.s16 q9, q15
subs r2, r2, #1
vqrshrun.s16 d6, q6, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q7, #7
vqrshrun.s16 d8, q8, #7
vqrshrun.s16 d9, q9, #7
vst1.u8 {q3}, [r4], r5 ;store result
vst1.u8 {q4}, [r4], r5
bne filt_blk2d_fpo16x16_loop_neon
pop {r4-r5,pc}
;--------------------
secondpass_filter16x16_only
;Second pass: 16x16
add r3, r12, r3, lsl #5
sub r0, r0, r1, lsl #1
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
mov r3, #2 ;loop counter
vabs.s32 q7, q5
vabs.s32 q8, q6
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vdup.8 d1, d14[4]
vdup.8 d2, d15[0]
vdup.8 d3, d15[4]
vdup.8 d4, d16[0]
vdup.8 d5, d16[4]
filt_blk2d_spo16x16_outloop_neon
vld1.u8 {d18}, [r0], r1 ;load src data
vld1.u8 {d19}, [r0], r1
vld1.u8 {d20}, [r0], r1
vld1.u8 {d21}, [r0], r1
mov r12, #4 ;loop counter
vld1.u8 {d22}, [r0], r1
secondpass_only_inner_loop_neon
vld1.u8 {d23}, [r0], r1 ;load src data
vld1.u8 {d24}, [r0], r1
vld1.u8 {d25}, [r0], r1
vld1.u8 {d26}, [r0], r1
vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q4, d19, d0
vmull.u8 q5, d20, d0
vmull.u8 q6, d21, d0
vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q4, d20, d1
vmlsl.u8 q5, d21, d1
vmlsl.u8 q6, d22, d1
vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q4, d23, d4
vmlsl.u8 q5, d24, d4
vmlsl.u8 q6, d25, d4
vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q4, d21, d2
vmlal.u8 q5, d22, d2
vmlal.u8 q6, d23, d2
vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q4, d24, d5
vmlal.u8 q5, d25, d5
vmlal.u8 q6, d26, d5
vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q8, d22, d3
vmull.u8 q9, d23, d3
vmull.u8 q10, d24, d3
subs r12, r12, #1
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q8, #7
vqrshrun.s16 d8, q9, #7
vqrshrun.s16 d9, q10, #7
vst1.u8 {d6}, [r4], r5 ;store result
vmov q9, q11
vst1.u8 {d7}, [r4], r5
vmov q10, q12
vst1.u8 {d8}, [r4], r5
vmov d22, d26
vst1.u8 {d9}, [r4], r5
bne secondpass_only_inner_loop_neon
subs r3, r3, #1
sub r0, r0, r1, lsl #4
sub r0, r0, r1, lsl #2
sub r0, r0, r1
add r0, r0, #8
sub r4, r4, r5, lsl #4
add r4, r4, #8
bne filt_blk2d_spo16x16_outloop_neon
pop {r4-r5,pc}
ENDP
;-----------------
END
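
For reference, every vmull/vmlal/vmlsl cluster in the first-pass loops above computes the same 8-wide six-tap step: five shifted copies of the source row are built with vext, multiplied against the absolute values of the filter taps, and the dominant centre product is folded in with a saturating add before vqrshrun rounds back to u8. A minimal arm_neon.h sketch of that step; the helper name and argument layout are illustrative, not part of this file:

#include <arm_neon.h>

/* One 8-wide horizontal six-tap step, mirroring the vext/vmlal/vmlsl
   pattern above.  'filter' holds abs() of the VP8 taps; the signs are
   applied by choosing vmlal vs. vmlsl, exactly as the asm does. */
static uint8x8_t sixtap_8px(uint8x16_t row, const uint8_t filter[6])
{
    uint8x8_t lo = vget_low_u8(row);
    uint8x8_t hi = vget_high_u8(row);
    uint16x8_t a = vmull_u8(lo, vdup_n_u8(filter[0]));          /* +src[-2]*f0 */
    a = vmlsl_u8(a, vext_u8(lo, hi, 1), vdup_n_u8(filter[1]));  /* -src[-1]*f1 */
    a = vmlal_u8(a, vext_u8(lo, hi, 2), vdup_n_u8(filter[2]));  /* +src[ 0]*f2 */
    a = vmlsl_u8(a, vext_u8(lo, hi, 4), vdup_n_u8(filter[4]));  /* -src[ 2]*f4 */
    a = vmlal_u8(a, vext_u8(lo, hi, 5), vdup_n_u8(filter[5]));  /* +src[ 3]*f5 */
    /* The large src[1]*f3 product is accumulated separately and combined
       with a saturating s16 add, as the asm does with vmull + vqadd.s16. */
    uint16x8_t b = vmull_u8(vext_u8(lo, hi, 3), vdup_n_u8(filter[3]));
    int16x8_t sum = vqaddq_s16(vreinterpretq_s16_u16(a),
                               vreinterpretq_s16_u16(b));
    return vqrshrun_n_s16(sum, 7);   /* (sum + 64) >> 7, clamped to u8 */
}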


@ -0,0 +1,422 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_sixtap_predict4x4_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
filter4_coeff
DCD 0, 0, 128, 0, 0, 0, 0, 0
DCD 0, -6, 123, 12, -1, 0, 0, 0
DCD 2, -11, 108, 36, -8, 1, 0, 0
DCD 0, -9, 93, 50, -6, 0, 0, 0
DCD 3, -16, 77, 77, -16, 3, 0, 0
DCD 0, -6, 50, 93, -9, 0, 0, 0
DCD 1, -8, 36, 108, -11, 2, 0, 0
DCD 0, -1, 12, 123, -6, 0, 0, 0
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; stack(r4) unsigned char *dst_ptr,
; stack(lr) int dst_pitch
|vp8_sixtap_predict4x4_neon| PROC
push {r4, lr}
adr r12, filter4_coeff
ldr r4, [sp, #8] ;load parameters from stack
ldr lr, [sp, #12] ;load parameters from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_filter4x4_only
add r2, r12, r2, lsl #5 ;calculate filter location
cmp r3, #0 ;skip second_pass filter if yoffset=0
vld1.s32 {q14, q15}, [r2] ;load first_pass filter
beq firstpass_filter4x4_only
vabs.s32 q12, q14 ;get abs(filter_parameters)
vabs.s32 q13, q15
sub r0, r0, #2 ;go back 2 columns of src data
sub r0, r0, r1, lsl #1 ;go back 2 lines of src data
;First pass: output_height lines x output_width columns (9x4)
vld1.u8 {q3}, [r0], r1 ;load first 4-line src data
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vld1.u8 {q4}, [r0], r1
vdup.8 d1, d24[4]
vld1.u8 {q5}, [r0], r1
vdup.8 d2, d25[0]
vld1.u8 {q6}, [r0], r1
vdup.8 d3, d25[4]
vdup.8 d4, d26[0]
vdup.8 d5, d26[4]
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
vext.8 d19, d8, d9, #5
vext.8 d20, d10, d11, #5
vext.8 d21, d12, d13, #5
vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
vswp d11, d12
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
vzip.32 d20, d21
vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5])
vmull.u8 q8, d20, d5
vmov q4, q3 ;keep original src data in q4 q6
vmov q6, q5
vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
vzip.32 d10, d11
vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
vshr.u64 q10, q6, #8
vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0])
vmlal.u8 q8, d10, d0
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
vzip.32 d20, d21
vshr.u64 q3, q4, #32 ;construct src_ptr[2]
vshr.u64 q5, q6, #32
vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q8, d20, d1
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
vzip.32 d10, d11
vshr.u64 q9, q4, #16 ;construct src_ptr[0]
vshr.u64 q10, q6, #16
vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q8, d10, d4
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
vzip.32 d20, d21
vshr.u64 q3, q4, #24 ;construct src_ptr[1]
vshr.u64 q5, q6, #24
vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q8, d20, d2
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
vzip.32 d10, d11
vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q10, d10, d3
vld1.u8 {q3}, [r0], r1 ;load the remaining 5 lines of src data
vld1.u8 {q4}, [r0], r1
vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q10
vld1.u8 {q5}, [r0], r1
vld1.u8 {q6}, [r0], r1
vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d28, q8, #7
;First pass on the remaining 5 lines of data
vld1.u8 {q11}, [r0], r1
vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
vext.8 d19, d8, d9, #5
vext.8 d20, d10, d11, #5
vext.8 d21, d12, d13, #5
vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
vswp d11, d12
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
vzip.32 d20, d21
vext.8 d31, d22, d23, #5 ;construct src_ptr[3]
vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5])
vmull.u8 q8, d20, d5
vmull.u8 q12, d31, d5 ;(src_ptr[3] * vp8_filter[5])
vmov q4, q3 ;keep original src data in q4 q6
vmov q6, q5
vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
vzip.32 d10, d11
vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
vshr.u64 q10, q6, #8
vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0])
vmlal.u8 q8, d10, d0
vmlal.u8 q12, d22, d0 ;(src_ptr[-2] * vp8_filter[0])
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
vzip.32 d20, d21
vshr.u64 q3, q4, #32 ;construct src_ptr[2]
vshr.u64 q5, q6, #32
vext.8 d31, d22, d23, #1 ;construct src_ptr[-1]
vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q8, d20, d1
vmlsl.u8 q12, d31, d1 ;-(src_ptr[-1] * vp8_filter[1])
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
vzip.32 d10, d11
vshr.u64 q9, q4, #16 ;construct src_ptr[0]
vshr.u64 q10, q6, #16
vext.8 d31, d22, d23, #4 ;construct src_ptr[2]
vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q8, d10, d4
vmlsl.u8 q12, d31, d4 ;-(src_ptr[2] * vp8_filter[4])
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
vzip.32 d20, d21
vshr.u64 q3, q4, #24 ;construct src_ptr[1]
vshr.u64 q5, q6, #24
vext.8 d31, d22, d23, #2 ;construct src_ptr[0]
vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q8, d20, d2
vmlal.u8 q12, d31, d2 ;(src_ptr[0] * vp8_filter[2])
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
vzip.32 d10, d11
vext.8 d31, d22, d23, #3 ;construct src_ptr[1]
vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q10, d10, d3
vmull.u8 q11, d31, d3 ;(src_ptr[1] * vp8_filter[3])
add r3, r12, r3, lsl #5
vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q10
vqadd.s16 q12, q11
vext.8 d23, d27, d28, #4
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
vqrshrun.s16 d29, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d30, q8, #7
vqrshrun.s16 d31, q12, #7
;Second pass: 4x4
vabs.s32 q7, q5
vabs.s32 q8, q6
vext.8 d24, d28, d29, #4
vext.8 d25, d29, d30, #4
vext.8 d26, d30, d31, #4
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vdup.8 d1, d14[4]
vdup.8 d2, d15[0]
vdup.8 d3, d15[4]
vdup.8 d4, d16[0]
vdup.8 d5, d16[4]
vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q4, d28, d0
vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp8_filter[5])
vmull.u8 q6, d26, d5
vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q4, d30, d4
vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q6, d24, d1
vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q4, d29, d2
vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp8_filter[3])
vmlal.u8 q6, d25, d3
add r0, r4, lr
add r1, r0, lr
add r2, r1, lr
vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q6, q4
vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8
vqrshrun.s16 d4, q6, #7
vst1.32 {d3[0]}, [r4] ;store result
vst1.32 {d3[1]}, [r0]
vst1.32 {d4[0]}, [r1]
vst1.32 {d4[1]}, [r2]
pop {r4, pc}
;---------------------
firstpass_filter4x4_only
vabs.s32 q12, q14 ;get abs(filter_parameters)
vabs.s32 q13, q15
sub r0, r0, #2 ;go back 2 columns of src data
;First pass: output_height lines x output_width columns (4x4)
vld1.u8 {q3}, [r0], r1 ;load first 4-line src data
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vld1.u8 {q4}, [r0], r1
vdup.8 d1, d24[4]
vld1.u8 {q5}, [r0], r1
vdup.8 d2, d25[0]
vld1.u8 {q6}, [r0], r1
vdup.8 d3, d25[4]
vdup.8 d4, d26[0]
vdup.8 d5, d26[4]
vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
vext.8 d19, d8, d9, #5
vext.8 d20, d10, d11, #5
vext.8 d21, d12, d13, #5
vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
vswp d11, d12
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
vzip.32 d20, d21
vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5])
vmull.u8 q8, d20, d5
vmov q4, q3 ;keep original src data in q4 q6
vmov q6, q5
vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
vzip.32 d10, d11
vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
vshr.u64 q10, q6, #8
vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0])
vmlal.u8 q8, d10, d0
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
vzip.32 d20, d21
vshr.u64 q3, q4, #32 ;construct src_ptr[2]
vshr.u64 q5, q6, #32
vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q8, d20, d1
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
vzip.32 d10, d11
vshr.u64 q9, q4, #16 ;construct src_ptr[0]
vshr.u64 q10, q6, #16
vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q8, d10, d4
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
vzip.32 d20, d21
vshr.u64 q3, q4, #24 ;construct src_ptr[1]
vshr.u64 q5, q6, #24
vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q8, d20, d2
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
vzip.32 d10, d11
vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q10, d10, d3
add r0, r4, lr
add r1, r0, lr
add r2, r1, lr
vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q10
vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d28, q8, #7
vst1.32 {d27[0]}, [r4] ;store result
vst1.32 {d27[1]}, [r0]
vst1.32 {d28[0]}, [r1]
vst1.32 {d28[1]}, [r2]
pop {r4, pc}
;---------------------
secondpass_filter4x4_only
sub r0, r0, r1, lsl #1
add r3, r12, r3, lsl #5
vld1.32 {d27[0]}, [r0], r1 ;load src data
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
vld1.32 {d27[1]}, [r0], r1
vabs.s32 q7, q5
vld1.32 {d28[0]}, [r0], r1
vabs.s32 q8, q6
vld1.32 {d28[1]}, [r0], r1
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vld1.32 {d29[0]}, [r0], r1
vdup.8 d1, d14[4]
vld1.32 {d29[1]}, [r0], r1
vdup.8 d2, d15[0]
vld1.32 {d30[0]}, [r0], r1
vdup.8 d3, d15[4]
vld1.32 {d30[1]}, [r0], r1
vdup.8 d4, d16[0]
vld1.32 {d31[0]}, [r0], r1
vdup.8 d5, d16[4]
vext.8 d23, d27, d28, #4
vext.8 d24, d28, d29, #4
vext.8 d25, d29, d30, #4
vext.8 d26, d30, d31, #4
vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q4, d28, d0
vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp8_filter[5])
vmull.u8 q6, d26, d5
vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q4, d30, d4
vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q6, d24, d1
vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q4, d29, d2
vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp8_filter[3])
vmlal.u8 q6, d25, d3
add r0, r4, lr
add r1, r0, lr
add r2, r1, lr
vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q6, q4
vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8
vqrshrun.s16 d4, q6, #7
vst1.32 {d3[0]}, [r4] ;store result
vst1.32 {d3[1]}, [r0]
vst1.32 {d4[0]}, [r1]
vst1.32 {d4[1]}, [r2]
pop {r4, pc}
ENDP
;-----------------
END
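
The overall structure is easier to see in scalar form: when both offsets are non-zero, a horizontal pass filters output_height + 5 source rows (the extra rows feed the vertical filter's footprint) into a temporary block, and a vertical pass filters that block into dst. Below is a back-of-envelope C model of the 4x4 case, using the same coefficients as filter4_coeff above; it is a sketch, not the project's own C fallback, and it skips the xoffset/yoffset == 0 fast paths (with a zero offset the {0, 0, 128, 0, 0, 0} row reduces a pass to a plain copy, so the model still matches):

static const short six_tap[8][6] = {
    { 0,   0, 128,   0,   0, 0 }, { 0,  -6, 123,  12,  -1, 0 },
    { 2, -11, 108,  36,  -8, 1 }, { 0,  -9,  93,  50,  -6, 0 },
    { 3, -16,  77,  77, -16, 3 }, { 0,  -6,  50,  93,  -9, 0 },
    { 1,  -8,  36, 108, -11, 2 }, { 0,  -1,  12, 123,  -6, 0 },
};

static unsigned char filt6(const unsigned char *s, int step, const short *f)
{
    int sum = f[0] * s[-2 * step] + f[1] * s[-step]    + f[2] * s[0] +
              f[3] * s[step]      + f[4] * s[2 * step] + f[5] * s[3 * step];
    sum = (sum + 64) >> 7;                 /* vqrshrun.s16 #7 */
    return (unsigned char)(sum < 0 ? 0 : sum > 255 ? 255 : sum);
}

/* Assumes the caller provides the 2-pixel/2-row margin the asm steps
   back over (sub r0, r0, #2 and sub r0, r0, r1, lsl #1). */
static void sixtap_predict4x4_model(const unsigned char *src, int src_stride,
                                    int xoffset, int yoffset,
                                    unsigned char *dst, int dst_stride)
{
    unsigned char tmp[9 * 4];              /* 4 output rows + 5 footprint rows */
    int r, c;
    for (r = 0; r < 9; r++)                /* first pass starts at src(-2, -2) */
        for (c = 0; c < 4; c++)
            tmp[r * 4 + c] = filt6(src + (r - 2) * src_stride + c, 1,
                                   six_tap[xoffset]);
    for (r = 0; r < 4; r++)                /* second pass: vertical, step = 4 */
        for (c = 0; c < 4; c++)
            dst[r * dst_stride + c] = filt6(tmp + (r + 2) * 4 + c, 4,
                                            six_tap[yoffset]);
}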


@ -0,0 +1,473 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_sixtap_predict8x4_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
filter8_coeff
DCD 0, 0, 128, 0, 0, 0, 0, 0
DCD 0, -6, 123, 12, -1, 0, 0, 0
DCD 2, -11, 108, 36, -8, 1, 0, 0
DCD 0, -9, 93, 50, -6, 0, 0, 0
DCD 3, -16, 77, 77, -16, 3, 0, 0
DCD 0, -6, 50, 93, -9, 0, 0, 0
DCD 1, -8, 36, 108, -11, 2, 0, 0
DCD 0, -1, 12, 123, -6, 0, 0, 0
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; r4 unsigned char *dst_ptr,
; stack(r5) int dst_pitch
|vp8_sixtap_predict8x4_neon| PROC
push {r4-r5, lr}
adr r12, filter8_coeff
ldr r4, [sp, #12] ;load parameters from stack
ldr r5, [sp, #16] ;load parameters from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_filter8x4_only
add r2, r12, r2, lsl #5 ;calculate filter location
cmp r3, #0 ;skip second_pass filter if yoffset=0
vld1.s32 {q14, q15}, [r2] ;load first_pass filter
beq firstpass_filter8x4_only
sub sp, sp, #32 ;reserve space on stack for temporary storage
vabs.s32 q12, q14
vabs.s32 q13, q15
sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
mov lr, sp
sub r0, r0, r1, lsl #1
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vdup.8 d1, d24[4]
vdup.8 d2, d25[0]
;First pass: output_height lines x output_width columns (9x8)
vld1.u8 {q3}, [r0], r1 ;load src data
vdup.8 d3, d25[4]
vld1.u8 {q4}, [r0], r1
vdup.8 d4, d26[0]
vld1.u8 {q5}, [r0], r1
vdup.8 d5, d26[4]
vld1.u8 {q6}, [r0], r1
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q8, d8, d0
vmull.u8 q9, d10, d0
vmull.u8 q10, d12, d0
vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d29, d8, d9, #1
vext.8 d30, d10, d11, #1
vext.8 d31, d12, d13, #1
vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q8, d29, d1
vmlsl.u8 q9, d30, d1
vmlsl.u8 q10, d31, d1
vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
vext.8 d29, d8, d9, #4
vext.8 d30, d10, d11, #4
vext.8 d31, d12, d13, #4
vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q8, d29, d4
vmlsl.u8 q9, d30, d4
vmlsl.u8 q10, d31, d4
vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
vext.8 d29, d8, d9, #2
vext.8 d30, d10, d11, #2
vext.8 d31, d12, d13, #2
vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q8, d29, d2
vmlal.u8 q9, d30, d2
vmlal.u8 q10, d31, d2
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
vext.8 d29, d8, d9, #5
vext.8 d30, d10, d11, #5
vext.8 d31, d12, d13, #5
vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q8, d29, d5
vmlal.u8 q9, d30, d5
vmlal.u8 q10, d31, d5
vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
vext.8 d29, d8, d9, #3
vext.8 d30, d10, d11, #3
vext.8 d31, d12, d13, #3
vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q4, d29, d3
vmull.u8 q5, d30, d3
vmull.u8 q6, d31, d3
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vld1.u8 {q3}, [r0], r1 ;load src data
vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d23, q8, #7
vqrshrun.s16 d24, q9, #7
vqrshrun.s16 d25, q10, #7
vld1.u8 {q4}, [r0], r1
vst1.u8 {d22}, [lr]! ;store result
vld1.u8 {q5}, [r0], r1
vst1.u8 {d23}, [lr]!
vld1.u8 {q6}, [r0], r1
vst1.u8 {d24}, [lr]!
vld1.u8 {q7}, [r0], r1
vst1.u8 {d25}, [lr]!
;first_pass filtering on the remaining 5 lines of data
vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q9, d8, d0
vmull.u8 q10, d10, d0
vmull.u8 q11, d12, d0
vmull.u8 q12, d14, d0
vext.8 d27, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d28, d8, d9, #1
vext.8 d29, d10, d11, #1
vext.8 d30, d12, d13, #1
vext.8 d31, d14, d15, #1
vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q9, d28, d1
vmlsl.u8 q10, d29, d1
vmlsl.u8 q11, d30, d1
vmlsl.u8 q12, d31, d1
vext.8 d27, d6, d7, #4 ;construct src_ptr[2]
vext.8 d28, d8, d9, #4
vext.8 d29, d10, d11, #4
vext.8 d30, d12, d13, #4
vext.8 d31, d14, d15, #4
vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q9, d28, d4
vmlsl.u8 q10, d29, d4
vmlsl.u8 q11, d30, d4
vmlsl.u8 q12, d31, d4
vext.8 d27, d6, d7, #2 ;construct src_ptr[0]
vext.8 d28, d8, d9, #2
vext.8 d29, d10, d11, #2
vext.8 d30, d12, d13, #2
vext.8 d31, d14, d15, #2
vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q9, d28, d2
vmlal.u8 q10, d29, d2
vmlal.u8 q11, d30, d2
vmlal.u8 q12, d31, d2
vext.8 d27, d6, d7, #5 ;construct src_ptr[3]
vext.8 d28, d8, d9, #5
vext.8 d29, d10, d11, #5
vext.8 d30, d12, d13, #5
vext.8 d31, d14, d15, #5
vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q9, d28, d5
vmlal.u8 q10, d29, d5
vmlal.u8 q11, d30, d5
vmlal.u8 q12, d31, d5
vext.8 d27, d6, d7, #3 ;construct src_ptr[1]
vext.8 d28, d8, d9, #3
vext.8 d29, d10, d11, #3
vext.8 d30, d12, d13, #3
vext.8 d31, d14, d15, #3
vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q4, d28, d3
vmull.u8 q5, d29, d3
vmull.u8 q6, d30, d3
vmull.u8 q7, d31, d3
vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q9, q4
vqadd.s16 q10, q5
vqadd.s16 q11, q6
vqadd.s16 q12, q7
vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8
vqrshrun.s16 d27, q9, #7
vqrshrun.s16 d28, q10, #7
vqrshrun.s16 d29, q11, #7
vqrshrun.s16 d30, q12, #7
;Second pass: 8x4
;secondpass_filter
add r3, r12, r3, lsl #5
sub lr, lr, #32
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
vld1.u8 {q11}, [lr]! ;load intermediate data from stack
vabs.s32 q7, q5
vabs.s32 q8, q6
vld1.u8 {q12}, [lr]!
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vdup.8 d1, d14[4]
vdup.8 d2, d15[0]
vdup.8 d3, d15[4]
vdup.8 d4, d16[0]
vdup.8 d5, d16[4]
vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q4, d23, d0
vmull.u8 q5, d24, d0
vmull.u8 q6, d25, d0
vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q4, d24, d1
vmlsl.u8 q5, d25, d1
vmlsl.u8 q6, d26, d1
vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q4, d27, d4
vmlsl.u8 q5, d28, d4
vmlsl.u8 q6, d29, d4
vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q4, d25, d2
vmlal.u8 q5, d26, d2
vmlal.u8 q6, d27, d2
vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q4, d28, d5
vmlal.u8 q5, d29, d5
vmlal.u8 q6, d30, d5
vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q8, d26, d3
vmull.u8 q9, d27, d3
vmull.u8 q10, d28, d3
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q8, #7
vqrshrun.s16 d8, q9, #7
vqrshrun.s16 d9, q10, #7
vst1.u8 {d6}, [r4], r5 ;store result
vst1.u8 {d7}, [r4], r5
vst1.u8 {d8}, [r4], r5
vst1.u8 {d9}, [r4], r5
add sp, sp, #32
pop {r4-r5,pc}
;--------------------
firstpass_filter8x4_only
vabs.s32 q12, q14
vabs.s32 q13, q15
sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
vld1.u8 {q3}, [r0], r1 ;load src data
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vld1.u8 {q4}, [r0], r1
vdup.8 d1, d24[4]
vld1.u8 {q5}, [r0], r1
vdup.8 d2, d25[0]
vld1.u8 {q6}, [r0], r1
vdup.8 d3, d25[4]
vdup.8 d4, d26[0]
vdup.8 d5, d26[4]
;First pass: output_height lines x output_width columns (4x8)
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q8, d8, d0
vmull.u8 q9, d10, d0
vmull.u8 q10, d12, d0
vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d29, d8, d9, #1
vext.8 d30, d10, d11, #1
vext.8 d31, d12, d13, #1
vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q8, d29, d1
vmlsl.u8 q9, d30, d1
vmlsl.u8 q10, d31, d1
vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
vext.8 d29, d8, d9, #4
vext.8 d30, d10, d11, #4
vext.8 d31, d12, d13, #4
vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q8, d29, d4
vmlsl.u8 q9, d30, d4
vmlsl.u8 q10, d31, d4
vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
vext.8 d29, d8, d9, #2
vext.8 d30, d10, d11, #2
vext.8 d31, d12, d13, #2
vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q8, d29, d2
vmlal.u8 q9, d30, d2
vmlal.u8 q10, d31, d2
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
vext.8 d29, d8, d9, #5
vext.8 d30, d10, d11, #5
vext.8 d31, d12, d13, #5
vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q8, d29, d5
vmlal.u8 q9, d30, d5
vmlal.u8 q10, d31, d5
vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
vext.8 d29, d8, d9, #3
vext.8 d30, d10, d11, #3
vext.8 d31, d12, d13, #3
vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q4, d29, d3
vmull.u8 q5, d30, d3
vmull.u8 q6, d31, d3
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d23, q8, #7
vqrshrun.s16 d24, q9, #7
vqrshrun.s16 d25, q10, #7
vst1.u8 {d22}, [r4], r5 ;store result
vst1.u8 {d23}, [r4], r5
vst1.u8 {d24}, [r4], r5
vst1.u8 {d25}, [r4], r5
pop {r4-r5,pc}
;---------------------
secondpass_filter8x4_only
;Second pass: 8x4
add r3, r12, r3, lsl #5
sub r0, r0, r1, lsl #1
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
vabs.s32 q7, q5
vabs.s32 q8, q6
vld1.u8 {d22}, [r0], r1
vld1.u8 {d23}, [r0], r1
vld1.u8 {d24}, [r0], r1
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vld1.u8 {d25}, [r0], r1
vdup.8 d1, d14[4]
vld1.u8 {d26}, [r0], r1
vdup.8 d2, d15[0]
vld1.u8 {d27}, [r0], r1
vdup.8 d3, d15[4]
vld1.u8 {d28}, [r0], r1
vdup.8 d4, d16[0]
vld1.u8 {d29}, [r0], r1
vdup.8 d5, d16[4]
vld1.u8 {d30}, [r0], r1
vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q4, d23, d0
vmull.u8 q5, d24, d0
vmull.u8 q6, d25, d0
vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q4, d24, d1
vmlsl.u8 q5, d25, d1
vmlsl.u8 q6, d26, d1
vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q4, d27, d4
vmlsl.u8 q5, d28, d4
vmlsl.u8 q6, d29, d4
vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q4, d25, d2
vmlal.u8 q5, d26, d2
vmlal.u8 q6, d27, d2
vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q4, d28, d5
vmlal.u8 q5, d29, d5
vmlal.u8 q6, d30, d5
vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q8, d26, d3
vmull.u8 q9, d27, d3
vmull.u8 q10, d28, d3
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q8, #7
vqrshrun.s16 d8, q9, #7
vqrshrun.s16 d9, q10, #7
vst1.u8 {d6}, [r4], r5 ;store result
vst1.u8 {d7}, [r4], r5
vst1.u8 {d8}, [r4], r5
vst1.u8 {d9}, [r4], r5
pop {r4-r5,pc}
ENDP
;-----------------
END


@ -0,0 +1,524 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_sixtap_predict8x8_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
filter8_coeff
DCD 0, 0, 128, 0, 0, 0, 0, 0
DCD 0, -6, 123, 12, -1, 0, 0, 0
DCD 2, -11, 108, 36, -8, 1, 0, 0
DCD 0, -9, 93, 50, -6, 0, 0, 0
DCD 3, -16, 77, 77, -16, 3, 0, 0
DCD 0, -6, 50, 93, -9, 0, 0, 0
DCD 1, -8, 36, 108, -11, 2, 0, 0
DCD 0, -1, 12, 123, -6, 0, 0, 0
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; stack(r4) unsigned char *dst_ptr,
; stack(r5) int dst_pitch
|vp8_sixtap_predict8x8_neon| PROC
push {r4-r5, lr}
adr r12, filter8_coeff
ldr r4, [sp, #12] ;load parameters from stack
ldr r5, [sp, #16] ;load parameters from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_filter8x8_only
add r2, r12, r2, lsl #5 ;calculate filter location
cmp r3, #0 ;skip second_pass filter if yoffset=0
vld1.s32 {q14, q15}, [r2] ;load first_pass filter
beq firstpass_filter8x8_only
sub sp, sp, #64 ;reserve space on stack for temporary storage
mov lr, sp
vabs.s32 q12, q14
vabs.s32 q13, q15
mov r2, #2 ;loop counter
sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
sub r0, r0, r1, lsl #1
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vdup.8 d1, d24[4]
vdup.8 d2, d25[0]
;First pass: output_height lines x output_width columns (13x8)
vld1.u8 {q3}, [r0], r1 ;load src data
vdup.8 d3, d25[4]
vld1.u8 {q4}, [r0], r1
vdup.8 d4, d26[0]
vld1.u8 {q5}, [r0], r1
vdup.8 d5, d26[4]
vld1.u8 {q6}, [r0], r1
filt_blk2d_fp8x8_loop_neon
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q8, d8, d0
vmull.u8 q9, d10, d0
vmull.u8 q10, d12, d0
vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d29, d8, d9, #1
vext.8 d30, d10, d11, #1
vext.8 d31, d12, d13, #1
vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q8, d29, d1
vmlsl.u8 q9, d30, d1
vmlsl.u8 q10, d31, d1
vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
vext.8 d29, d8, d9, #4
vext.8 d30, d10, d11, #4
vext.8 d31, d12, d13, #4
vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q8, d29, d4
vmlsl.u8 q9, d30, d4
vmlsl.u8 q10, d31, d4
vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
vext.8 d29, d8, d9, #2
vext.8 d30, d10, d11, #2
vext.8 d31, d12, d13, #2
vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q8, d29, d2
vmlal.u8 q9, d30, d2
vmlal.u8 q10, d31, d2
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
vext.8 d29, d8, d9, #5
vext.8 d30, d10, d11, #5
vext.8 d31, d12, d13, #5
vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q8, d29, d5
vmlal.u8 q9, d30, d5
vmlal.u8 q10, d31, d5
vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
vext.8 d29, d8, d9, #3
vext.8 d30, d10, d11, #3
vext.8 d31, d12, d13, #3
vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q4, d29, d3
vmull.u8 q5, d30, d3
vmull.u8 q6, d31, d3
subs r2, r2, #1
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vld1.u8 {q3}, [r0], r1 ;load src data
vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d23, q8, #7
vqrshrun.s16 d24, q9, #7
vqrshrun.s16 d25, q10, #7
vst1.u8 {d22}, [lr]! ;store result
vld1.u8 {q4}, [r0], r1
vst1.u8 {d23}, [lr]!
vld1.u8 {q5}, [r0], r1
vst1.u8 {d24}, [lr]!
vld1.u8 {q6}, [r0], r1
vst1.u8 {d25}, [lr]!
bne filt_blk2d_fp8x8_loop_neon
;first_pass filtering on the remaining 5 lines of data
;vld1.u8 {q3}, [r0], r1 ;load src data
;vld1.u8 {q4}, [r0], r1
;vld1.u8 {q5}, [r0], r1
;vld1.u8 {q6}, [r0], r1
vld1.u8 {q7}, [r0], r1
vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q9, d8, d0
vmull.u8 q10, d10, d0
vmull.u8 q11, d12, d0
vmull.u8 q12, d14, d0
vext.8 d27, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d28, d8, d9, #1
vext.8 d29, d10, d11, #1
vext.8 d30, d12, d13, #1
vext.8 d31, d14, d15, #1
vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q9, d28, d1
vmlsl.u8 q10, d29, d1
vmlsl.u8 q11, d30, d1
vmlsl.u8 q12, d31, d1
vext.8 d27, d6, d7, #4 ;construct src_ptr[2]
vext.8 d28, d8, d9, #4
vext.8 d29, d10, d11, #4
vext.8 d30, d12, d13, #4
vext.8 d31, d14, d15, #4
vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q9, d28, d4
vmlsl.u8 q10, d29, d4
vmlsl.u8 q11, d30, d4
vmlsl.u8 q12, d31, d4
vext.8 d27, d6, d7, #2 ;construct src_ptr[0]
vext.8 d28, d8, d9, #2
vext.8 d29, d10, d11, #2
vext.8 d30, d12, d13, #2
vext.8 d31, d14, d15, #2
vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q9, d28, d2
vmlal.u8 q10, d29, d2
vmlal.u8 q11, d30, d2
vmlal.u8 q12, d31, d2
vext.8 d27, d6, d7, #5 ;construct src_ptr[3]
vext.8 d28, d8, d9, #5
vext.8 d29, d10, d11, #5
vext.8 d30, d12, d13, #5
vext.8 d31, d14, d15, #5
vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q9, d28, d5
vmlal.u8 q10, d29, d5
vmlal.u8 q11, d30, d5
vmlal.u8 q12, d31, d5
vext.8 d27, d6, d7, #3 ;construct src_ptr[1]
vext.8 d28, d8, d9, #3
vext.8 d29, d10, d11, #3
vext.8 d30, d12, d13, #3
vext.8 d31, d14, d15, #3
vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q4, d28, d3
vmull.u8 q5, d29, d3
vmull.u8 q6, d30, d3
vmull.u8 q7, d31, d3
vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q9, q4
vqadd.s16 q10, q5
vqadd.s16 q11, q6
vqadd.s16 q12, q7
add r3, r12, r3, lsl #5
vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8
sub lr, lr, #64
vqrshrun.s16 d27, q9, #7
vld1.u8 {q9}, [lr]! ;load intermediate data from stack
vqrshrun.s16 d28, q10, #7
vld1.u8 {q10}, [lr]!
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
vqrshrun.s16 d29, q11, #7
vld1.u8 {q11}, [lr]!
vabs.s32 q7, q5
vabs.s32 q8, q6
vqrshrun.s16 d30, q12, #7
vld1.u8 {q12}, [lr]!
;Second pass: 8x8
mov r3, #2 ;loop counter
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vdup.8 d1, d14[4]
vdup.8 d2, d15[0]
vdup.8 d3, d15[4]
vdup.8 d4, d16[0]
vdup.8 d5, d16[4]
filt_blk2d_sp8x8_loop_neon
vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q4, d19, d0
vmull.u8 q5, d20, d0
vmull.u8 q6, d21, d0
vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q4, d20, d1
vmlsl.u8 q5, d21, d1
vmlsl.u8 q6, d22, d1
vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q4, d23, d4
vmlsl.u8 q5, d24, d4
vmlsl.u8 q6, d25, d4
vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q4, d21, d2
vmlal.u8 q5, d22, d2
vmlal.u8 q6, d23, d2
vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q4, d24, d5
vmlal.u8 q5, d25, d5
vmlal.u8 q6, d26, d5
vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q8, d22, d3
vmull.u8 q9, d23, d3
vmull.u8 q10, d24, d3
subs r3, r3, #1
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q8, #7
vqrshrun.s16 d8, q9, #7
vqrshrun.s16 d9, q10, #7
vmov q9, q11
vst1.u8 {d6}, [r4], r5 ;store result
vmov q10, q12
vst1.u8 {d7}, [r4], r5
vmov q11, q13
vst1.u8 {d8}, [r4], r5
vmov q12, q14
vst1.u8 {d9}, [r4], r5
vmov d26, d30
bne filt_blk2d_sp8x8_loop_neon
add sp, sp, #64
pop {r4-r5,pc}
;---------------------
firstpass_filter8x8_only
;add r2, r12, r2, lsl #5 ;calculate filter location
;vld1.s32 {q14, q15}, [r2] ;load first_pass filter
vabs.s32 q12, q14
vabs.s32 q13, q15
mov r2, #2 ;loop counter
sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
vdup.8 d1, d24[4]
vdup.8 d2, d25[0]
vdup.8 d3, d25[4]
vdup.8 d4, d26[0]
vdup.8 d5, d26[4]
;First pass: output_height lines x output_width columns (8x8)
filt_blk2d_fpo8x8_loop_neon
vld1.u8 {q3}, [r0], r1 ;load src data
vld1.u8 {q4}, [r0], r1
vld1.u8 {q5}, [r0], r1
vld1.u8 {q6}, [r0], r1
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q8, d8, d0
vmull.u8 q9, d10, d0
vmull.u8 q10, d12, d0
vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
vext.8 d29, d8, d9, #1
vext.8 d30, d10, d11, #1
vext.8 d31, d12, d13, #1
vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q8, d29, d1
vmlsl.u8 q9, d30, d1
vmlsl.u8 q10, d31, d1
vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
vext.8 d29, d8, d9, #4
vext.8 d30, d10, d11, #4
vext.8 d31, d12, d13, #4
vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q8, d29, d4
vmlsl.u8 q9, d30, d4
vmlsl.u8 q10, d31, d4
vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
vext.8 d29, d8, d9, #2
vext.8 d30, d10, d11, #2
vext.8 d31, d12, d13, #2
vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q8, d29, d2
vmlal.u8 q9, d30, d2
vmlal.u8 q10, d31, d2
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
vext.8 d29, d8, d9, #5
vext.8 d30, d10, d11, #5
vext.8 d31, d12, d13, #5
vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q8, d29, d5
vmlal.u8 q9, d30, d5
vmlal.u8 q10, d31, d5
vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
vext.8 d29, d8, d9, #3
vext.8 d30, d10, d11, #3
vext.8 d31, d12, d13, #3
vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q4, d29, d3
vmull.u8 q5, d30, d3
vmull.u8 q6, d31, d3
;
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
subs r2, r2, #1
vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d23, q8, #7
vqrshrun.s16 d24, q9, #7
vqrshrun.s16 d25, q10, #7
vst1.u8 {d22}, [r4], r5 ;store result
vst1.u8 {d23}, [r4], r5
vst1.u8 {d24}, [r4], r5
vst1.u8 {d25}, [r4], r5
bne filt_blk2d_fpo8x8_loop_neon
pop {r4-r5,pc}
;---------------------
secondpass_filter8x8_only
sub r0, r0, r1, lsl #1
add r3, r12, r3, lsl #5
vld1.u8 {d18}, [r0], r1 ;load src data
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
vld1.u8 {d19}, [r0], r1
vabs.s32 q7, q5
vld1.u8 {d20}, [r0], r1
vabs.s32 q8, q6
vld1.u8 {d21}, [r0], r1
mov r3, #2 ;loop counter
vld1.u8 {d22}, [r0], r1
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
vld1.u8 {d23}, [r0], r1
vdup.8 d1, d14[4]
vld1.u8 {d24}, [r0], r1
vdup.8 d2, d15[0]
vld1.u8 {d25}, [r0], r1
vdup.8 d3, d15[4]
vld1.u8 {d26}, [r0], r1
vdup.8 d4, d16[0]
vld1.u8 {d27}, [r0], r1
vdup.8 d5, d16[4]
vld1.u8 {d28}, [r0], r1
vld1.u8 {d29}, [r0], r1
vld1.u8 {d30}, [r0], r1
;Second pass: 8x8
filt_blk2d_spo8x8_loop_neon
vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0])
vmull.u8 q4, d19, d0
vmull.u8 q5, d20, d0
vmull.u8 q6, d21, d0
vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1])
vmlsl.u8 q4, d20, d1
vmlsl.u8 q5, d21, d1
vmlsl.u8 q6, d22, d1
vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4])
vmlsl.u8 q4, d23, d4
vmlsl.u8 q5, d24, d4
vmlsl.u8 q6, d25, d4
vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2])
vmlal.u8 q4, d21, d2
vmlal.u8 q5, d22, d2
vmlal.u8 q6, d23, d2
vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5])
vmlal.u8 q4, d24, d5
vmlal.u8 q5, d25, d5
vmlal.u8 q6, d26, d5
vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3])
vmull.u8 q8, d22, d3
vmull.u8 q9, d23, d3
vmull.u8 q10, d24, d3
subs r3, r3, #1
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
vqadd.s16 q8, q4
vqadd.s16 q9, q5
vqadd.s16 q10, q6
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
vqrshrun.s16 d7, q8, #7
vqrshrun.s16 d8, q9, #7
vqrshrun.s16 d9, q10, #7
vmov q9, q11
vst1.u8 {d6}, [r4], r5 ;store result
vmov q10, q12
vst1.u8 {d7}, [r4], r5
vmov q11, q13
vst1.u8 {d8}, [r4], r5
vmov q12, q14
vst1.u8 {d9}, [r4], r5
vmov d26, d30
bne filt_blk2d_spo8x8_loop_neon
pop {r4-r5,pc}
ENDP
;-----------------
END
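
One detail worth noting in the second-pass loops above: the six-row vertical window lives entirely in registers and is slid downward with vmov (q9 <- q11, q10 <- q12, ..., d26 <- d30) after each group of four output rows, so no source row is loaded twice. The same idea in scalar form, with a small window of row pointers; the helper name and signed-tap layout are illustrative, not from the source:

static void filter_vert_window(const unsigned char *src, int stride,
                               const short f[6],
                               unsigned char *dst, int dst_stride,
                               int w, int h)
{
    const unsigned char *row[6];
    int i, r, c;
    for (i = 0; i < 6; i++)                /* prime the six-row window */
        row[i] = src + i * stride;
    for (r = 0; r < h; r++) {
        for (c = 0; c < w; c++) {
            int sum = 0;
            for (i = 0; i < 6; i++)
                sum += f[i] * row[i][c];
            sum = (sum + 64) >> 7;
            dst[r * dst_stride + c] =
                (unsigned char)(sum < 0 ? 0 : sum > 255 ? 255 : sum);
        }
        for (i = 0; i < 5; i++)            /* slide the window one row down */
            row[i] = row[i + 1];
        row[5] += stride;
    }
}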

File diff not shown because of its large size.


@ -0,0 +1,276 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_variance16x16_neon|
EXPORT |vp8_variance16x8_neon|
EXPORT |vp8_variance8x16_neon|
EXPORT |vp8_variance8x8_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|vp8_variance16x16_neon| PROC
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - sse
vmov.i8 q10, #0
mov r12, #8
variance16x16_neon_loop
vld1.8 {q0}, [r0], r1 ;Load up source and reference
vld1.8 {q2}, [r2], r3
vld1.8 {q1}, [r0], r1
vld1.8 {q3}, [r2], r3
vsubl.u8 q11, d0, d4 ;calculate diff
vsubl.u8 q12, d1, d5
vsubl.u8 q13, d2, d6
vsubl.u8 q14, d3, d7
;VPADAL adds adjacent pairs of elements of a vector and accumulates
;the results into the elements of the destination vector. The
;explanation in the ARM guide is wrong.
vpadal.s16 q8, q11 ;calculate sum
vmlal.s16 q9, d22, d22 ;calculate sse
vmlal.s16 q10, d23, d23
subs r12, r12, #1
vpadal.s16 q8, q12
vmlal.s16 q9, d24, d24
vmlal.s16 q10, d25, d25
vpadal.s16 q8, q13
vmlal.s16 q9, d26, d26
vmlal.s16 q10, d27, d27
vpadal.s16 q8, q14
vmlal.s16 q9, d28, d28
vmlal.s16 q10, d29, d29
bne variance16x16_neon_loop
vadd.u32 q10, q9, q10 ;accumulate sse
vpaddl.s32 q0, q8 ;accumulate sum
ldr r12, [sp] ;load *sse from stack
vpaddl.u32 q1, q10
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3
;vmov.32 r0, d0[0] ;this instruction costs a lot
;vmov.32 r1, d1[0]
;mul r0, r0, r0
;str r1, [r12]
;sub r0, r1, r0, lsr #8
; While sum is signed, sum * sum is always positive and must be treated as
; unsigned to avoid propagating the sign bit.
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [r12] ;store sse
vshr.u32 d10, d10, #8
vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
bx lr
ENDP
;================================
;unsigned int vp8_variance16x8_c(
; unsigned char *src_ptr,
; int source_stride,
; unsigned char *ref_ptr,
; int recon_stride,
; unsigned int *sse)
|vp8_variance16x8_neon| PROC
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - sse
vmov.i8 q10, #0
mov r12, #4
variance16x8_neon_loop
vld1.8 {q0}, [r0], r1 ;Load up source and reference
vld1.8 {q2}, [r2], r3
vld1.8 {q1}, [r0], r1
vld1.8 {q3}, [r2], r3
vsubl.u8 q11, d0, d4 ;calculate diff
vsubl.u8 q12, d1, d5
vsubl.u8 q13, d2, d6
vsubl.u8 q14, d3, d7
vpadal.s16 q8, q11 ;calculate sum
vmlal.s16 q9, d22, d22 ;calculate sse
vmlal.s16 q10, d23, d23
subs r12, r12, #1
vpadal.s16 q8, q12
vmlal.s16 q9, d24, d24
vmlal.s16 q10, d25, d25
vpadal.s16 q8, q13
vmlal.s16 q9, d26, d26
vmlal.s16 q10, d27, d27
vpadal.s16 q8, q14
vmlal.s16 q9, d28, d28
vmlal.s16 q10, d29, d29
bne variance16x8_neon_loop
vadd.u32 q10, q9, q10 ;accumulate sse
vpaddl.s32 q0, q8 ;accumulate sum
ldr r12, [sp] ;load *sse from stack
vpaddl.u32 q1, q10
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [r12] ;store sse
vshr.u32 d10, d10, #7
vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
bx lr
ENDP
;=================================
;unsigned int vp8_variance8x16_c(
; unsigned char *src_ptr,
; int source_stride,
; unsigned char *ref_ptr,
; int recon_stride,
; unsigned int *sse)
|vp8_variance8x16_neon| PROC
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - sse
vmov.i8 q10, #0
mov r12, #8
variance8x16_neon_loop
vld1.8 {d0}, [r0], r1 ;Load up source and reference
vld1.8 {d4}, [r2], r3
vld1.8 {d2}, [r0], r1
vld1.8 {d6}, [r2], r3
vsubl.u8 q11, d0, d4 ;calculate diff
vsubl.u8 q12, d2, d6
vpadal.s16 q8, q11 ;calculate sum
vmlal.s16 q9, d22, d22 ;calculate sse
vmlal.s16 q10, d23, d23
subs r12, r12, #1
vpadal.s16 q8, q12
vmlal.s16 q9, d24, d24
vmlal.s16 q10, d25, d25
bne variance8x16_neon_loop
vadd.u32 q10, q9, q10 ;accumulate sse
vpaddl.s32 q0, q8 ;accumulate sum
ldr r12, [sp] ;load *sse from stack
vpaddl.u32 q1, q10
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [r12] ;store sse
vshr.u32 d10, d10, #7
vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
bx lr
ENDP
;==================================
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
|vp8_variance8x8_neon| PROC
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - sse
vmov.i8 q10, #0
mov r12, #2
variance8x8_neon_loop
vld1.8 {d0}, [r0], r1 ;Load up source and reference
vld1.8 {d4}, [r2], r3
vld1.8 {d1}, [r0], r1
vld1.8 {d5}, [r2], r3
vld1.8 {d2}, [r0], r1
vld1.8 {d6}, [r2], r3
vld1.8 {d3}, [r0], r1
vld1.8 {d7}, [r2], r3
vsubl.u8 q11, d0, d4 ;calculate diff
vsubl.u8 q12, d1, d5
vsubl.u8 q13, d2, d6
vsubl.u8 q14, d3, d7
vpadal.s16 q8, q11 ;calculate sum
vmlal.s16 q9, d22, d22 ;calculate sse
vmlal.s16 q10, d23, d23
subs r12, r12, #1
vpadal.s16 q8, q12
vmlal.s16 q9, d24, d24
vmlal.s16 q10, d25, d25
vpadal.s16 q8, q13
vmlal.s16 q9, d26, d26
vmlal.s16 q10, d27, d27
vpadal.s16 q8, q14
vmlal.s16 q9, d28, d28
vmlal.s16 q10, d29, d29
bne variance8x8_neon_loop
vadd.u32 q10, q9, q10 ;accumulate sse
vpaddl.s32 q0, q8 ;accumulate sum
ldr r12, [sp] ;load *sse from stack
vpaddl.u32 q1, q10
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [r12] ;store sse
vshr.u32 d10, d10, #6
vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
bx lr
ENDP
END
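
All four routines above reduce to the textbook shortcut var = sse - sum^2 / N: the signed pixel-difference sum and the sum of squares are accumulated together, and because N is a power of two the division becomes the #8, #7 or #6 right shift (256-, 128- and 64-pixel blocks respectively), with sum^2 treated as unsigned so the shift cannot smear a sign bit. A scalar model of the whole reduction; the function name is illustrative:

#include <stdint.h>

/* var = sse - sum*sum / (w*h); sum*sum is kept in 64 bits, matching
   the asm's vmull.s32, and divided as unsigned. */
static unsigned int variance_model(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   int w, int h, unsigned int *sse)
{
    int64_t sum = 0;
    uint32_t sq = 0;
    int r, c;
    for (r = 0; r < h; r++)
        for (c = 0; c < w; c++) {
            int d = src[r * src_stride + c] - ref[r * ref_stride + c];
            sum += d;                      /* signed sum of differences */
            sq += (uint32_t)(d * d);       /* sum of squared differences */
        }
    *sse = sq;
    return sq - (unsigned int)((uint64_t)(sum * sum) / (uint32_t)(w * h));
}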


@ -1,320 +0,0 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "vpx_ports/mem.h"
unsigned int vp8_variance16x16_neon(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
int i;
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
uint32x2_t d0u32, d10u32;
int64x1_t d0s64, d1s64;
uint8x16_t q0u8, q1u8, q2u8, q3u8;
uint16x8_t q11u16, q12u16, q13u16, q14u16;
int32x4_t q8s32, q9s32, q10s32;
int64x2_t q0s64, q1s64, q5s64;
q8s32 = vdupq_n_s32(0);
q9s32 = vdupq_n_s32(0);
q10s32 = vdupq_n_s32(0);
for (i = 0; i < 8; i++) {
q0u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
q1u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
__builtin_prefetch(src_ptr);
q2u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
q3u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
__builtin_prefetch(ref_ptr);
q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
}
q10s32 = vaddq_s32(q10s32, q9s32);
q0s64 = vpaddlq_s32(q8s32);
q1s64 = vpaddlq_s32(q10s32);
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
vreinterpret_s32_s64(d0s64));
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
return vget_lane_u32(d0u32, 0);
}
unsigned int vp8_variance16x8_neon(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
int i;
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
uint32x2_t d0u32, d10u32;
int64x1_t d0s64, d1s64;
uint8x16_t q0u8, q1u8, q2u8, q3u8;
uint16x8_t q11u16, q12u16, q13u16, q14u16;
int32x4_t q8s32, q9s32, q10s32;
int64x2_t q0s64, q1s64, q5s64;
q8s32 = vdupq_n_s32(0);
q9s32 = vdupq_n_s32(0);
q10s32 = vdupq_n_s32(0);
for (i = 0; i < 4; i++) { // variance16x8_neon_loop
q0u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
q1u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
__builtin_prefetch(src_ptr);
q2u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
q3u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
__builtin_prefetch(ref_ptr);
q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
}
q10s32 = vaddq_s32(q10s32, q9s32);
q0s64 = vpaddlq_s32(q8s32);
q1s64 = vpaddlq_s32(q10s32);
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
vreinterpret_s32_s64(d0s64));
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
return vget_lane_u32(d0u32, 0);
}
unsigned int vp8_variance8x16_neon(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
int i;
uint8x8_t d0u8, d2u8, d4u8, d6u8;
int16x4_t d22s16, d23s16, d24s16, d25s16;
uint32x2_t d0u32, d10u32;
int64x1_t d0s64, d1s64;
uint16x8_t q11u16, q12u16;
int32x4_t q8s32, q9s32, q10s32;
int64x2_t q0s64, q1s64, q5s64;
q8s32 = vdupq_n_s32(0);
q9s32 = vdupq_n_s32(0);
q10s32 = vdupq_n_s32(0);
for (i = 0; i < 8; i++) { // variance8x16_neon_loop
d0u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d2u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
__builtin_prefetch(src_ptr);
d4u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d6u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
__builtin_prefetch(ref_ptr);
q11u16 = vsubl_u8(d0u8, d4u8);
q12u16 = vsubl_u8(d2u8, d6u8);
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
}
q10s32 = vaddq_s32(q10s32, q9s32);
q0s64 = vpaddlq_s32(q8s32);
q1s64 = vpaddlq_s32(q10s32);
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
vreinterpret_s32_s64(d0s64));
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
return vget_lane_u32(d0u32, 0);
}
unsigned int vp8_variance8x8_neon(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
int i;
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
uint32x2_t d0u32, d10u32;
int64x1_t d0s64, d1s64;
uint16x8_t q11u16, q12u16, q13u16, q14u16;
int32x4_t q8s32, q9s32, q10s32;
int64x2_t q0s64, q1s64, q5s64;
q8s32 = vdupq_n_s32(0);
q9s32 = vdupq_n_s32(0);
q10s32 = vdupq_n_s32(0);
for (i = 0; i < 2; i++) { // variance8x8_neon_loop
d0u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d1u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d2u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d3u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d4u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d5u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d6u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d7u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
q11u16 = vsubl_u8(d0u8, d4u8);
q12u16 = vsubl_u8(d1u8, d5u8);
q13u16 = vsubl_u8(d2u8, d6u8);
q14u16 = vsubl_u8(d3u8, d7u8);
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
}
q10s32 = vaddq_s32(q10s32, q9s32);
q0s64 = vpaddlq_s32(q8s32);
q1s64 = vpaddlq_s32(q10s32);
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
vreinterpret_s32_s64(d0s64));
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 6);
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
return vget_lane_u32(d0u32, 0);
}


@ -0,0 +1,423 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
;-----------------
EXPORT |vp8_sub_pixel_variance16x16_neon_func|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; stack(r4) unsigned char *dst_ptr,
; stack(r5) int dst_pixels_per_line,
; stack(r6) unsigned int *sse
;Note: most of this code is copied from bilinear_predict16x16_neon and vp8_variance16x16_neon.
bilinear_taps_coeff
DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
|vp8_sub_pixel_variance16x16_neon_func| PROC
push {r4-r6, lr}
adr r12, bilinear_taps_coeff
ldr r4, [sp, #16] ;load *dst_ptr from stack
ldr r5, [sp, #20] ;load dst_pixels_per_line from stack
ldr r6, [sp, #24] ;load *sse from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_bfilter16x16_only
add r2, r12, r2, lsl #3 ;calculate filter location
cmp r3, #0 ;skip second_pass filter if yoffset=0
vld1.s32 {d31}, [r2] ;load first_pass filter
beq firstpass_bfilter16x16_only
sub sp, sp, #272 ;reserve space on stack for temporary storage
vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
mov lr, sp
vld1.u8 {d5, d6, d7}, [r0], r1
mov r2, #3 ;loop counter
vld1.u8 {d8, d9, d10}, [r0], r1
vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
vld1.u8 {d11, d12, d13}, [r0], r1
vdup.8 d1, d31[4]
;First Pass: output_height lines x output_width columns (17x16)
vp8e_filt_blk2d_fp16x16_loop_neon
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0])
vmull.u8 q8, d3, d0
vmull.u8 q9, d5, d0
vmull.u8 q10, d6, d0
vmull.u8 q11, d8, d0
vmull.u8 q12, d9, d0
vmull.u8 q13, d11, d0
vmull.u8 q14, d12, d0
vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
vext.8 d5, d5, d6, #1
vext.8 d8, d8, d9, #1
vext.8 d11, d11, d12, #1
vmlal.u8 q7, d2, d1 ;(src_ptr[1] * Filter[1])
vmlal.u8 q9, d5, d1
vmlal.u8 q11, d8, d1
vmlal.u8 q13, d11, d1
vext.8 d3, d3, d4, #1
vext.8 d6, d6, d7, #1
vext.8 d9, d9, d10, #1
vext.8 d12, d12, d13, #1
vmlal.u8 q8, d3, d1 ;(src_ptr[1] * Filter[1])
vmlal.u8 q10, d6, d1
vmlal.u8 q12, d9, d1
vmlal.u8 q14, d12, d1
subs r2, r2, #1
vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
vqrshrn.u16 d15, q8, #7
vqrshrn.u16 d16, q9, #7
vqrshrn.u16 d17, q10, #7
vqrshrn.u16 d18, q11, #7
vqrshrn.u16 d19, q12, #7
vqrshrn.u16 d20, q13, #7
vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
vqrshrn.u16 d21, q14, #7
vld1.u8 {d5, d6, d7}, [r0], r1
vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result
vld1.u8 {d8, d9, d10}, [r0], r1
vst1.u8 {d18, d19, d20, d21}, [lr]!
vld1.u8 {d11, d12, d13}, [r0], r1
bne vp8e_filt_blk2d_fp16x16_loop_neon
;First-pass filtering for the remaining 5 lines
vld1.u8 {d14, d15, d16}, [r0], r1
vmull.u8 q9, d2, d0 ;(src_ptr[0] * Filter[0])
vmull.u8 q10, d3, d0
vmull.u8 q11, d5, d0
vmull.u8 q12, d6, d0
vmull.u8 q13, d8, d0
vmull.u8 q14, d9, d0
vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
vext.8 d5, d5, d6, #1
vext.8 d8, d8, d9, #1
vmlal.u8 q9, d2, d1 ;(src_ptr[1] * Filter[1])
vmlal.u8 q11, d5, d1
vmlal.u8 q13, d8, d1
vext.8 d3, d3, d4, #1
vext.8 d6, d6, d7, #1
vext.8 d9, d9, d10, #1
vmlal.u8 q10, d3, d1 ;(src_ptr[1] * Filter[1])
vmlal.u8 q12, d6, d1
vmlal.u8 q14, d9, d1
vmull.u8 q1, d11, d0
vmull.u8 q2, d12, d0
vmull.u8 q3, d14, d0
vmull.u8 q4, d15, d0
vext.8 d11, d11, d12, #1 ;construct src_ptr[1]
vext.8 d14, d14, d15, #1
vmlal.u8 q1, d11, d1 ;(src_ptr[1] * Filter[1])
vmlal.u8 q3, d14, d1
vext.8 d12, d12, d13, #1
vext.8 d15, d15, d16, #1
vmlal.u8 q2, d12, d1 ;(src_ptr[1] * Filter[1])
vmlal.u8 q4, d15, d1
vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8
vqrshrn.u16 d11, q10, #7
vqrshrn.u16 d12, q11, #7
vqrshrn.u16 d13, q12, #7
vqrshrn.u16 d14, q13, #7
vqrshrn.u16 d15, q14, #7
vqrshrn.u16 d16, q1, #7
vqrshrn.u16 d17, q2, #7
vqrshrn.u16 d18, q3, #7
vqrshrn.u16 d19, q4, #7
vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result
vst1.u8 {d14, d15, d16, d17}, [lr]!
vst1.u8 {d18, d19}, [lr]!
;Second pass: 16x16
;secondpass_filter
add r3, r12, r3, lsl #3
sub lr, lr, #272
vld1.u32 {d31}, [r3] ;load second_pass filter
sub sp, sp, #256
mov r3, sp
vld1.u8 {d22, d23}, [lr]! ;load src data
vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
vdup.8 d1, d31[4]
mov r12, #4 ;loop counter
vp8e_filt_blk2d_sp16x16_loop_neon
vld1.u8 {d24, d25}, [lr]!
vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
vld1.u8 {d26, d27}, [lr]!
vmull.u8 q2, d23, d0
vld1.u8 {d28, d29}, [lr]!
vmull.u8 q3, d24, d0
vld1.u8 {d30, d31}, [lr]!
vmull.u8 q4, d25, d0
vmull.u8 q5, d26, d0
vmull.u8 q6, d27, d0
vmull.u8 q7, d28, d0
vmull.u8 q8, d29, d0
vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1])
vmlal.u8 q2, d25, d1
vmlal.u8 q3, d26, d1
vmlal.u8 q4, d27, d1
vmlal.u8 q5, d28, d1
vmlal.u8 q6, d29, d1
vmlal.u8 q7, d30, d1
vmlal.u8 q8, d31, d1
subs r12, r12, #1
vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
vqrshrn.u16 d3, q2, #7
vqrshrn.u16 d4, q3, #7
vqrshrn.u16 d5, q4, #7
vqrshrn.u16 d6, q5, #7
vqrshrn.u16 d7, q6, #7
vqrshrn.u16 d8, q7, #7
vqrshrn.u16 d9, q8, #7
vst1.u8 {d2, d3}, [r3]! ;store result
vst1.u8 {d4, d5}, [r3]!
vst1.u8 {d6, d7}, [r3]!
vmov q11, q15
vst1.u8 {d8, d9}, [r3]!
bne vp8e_filt_blk2d_sp16x16_loop_neon
b sub_pixel_variance16x16_neon
;--------------------
firstpass_bfilter16x16_only
mov r2, #4 ;loop counter
sub sp, sp, #528 ;reserve space on stack for temporary storage
vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
vdup.8 d1, d31[4]
mov r3, sp
;First Pass: output_height lines x output_width columns (16x16)
vp8e_filt_blk2d_fpo16x16_loop_neon
vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
vld1.u8 {d5, d6, d7}, [r0], r1
vld1.u8 {d8, d9, d10}, [r0], r1
vld1.u8 {d11, d12, d13}, [r0], r1
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0])
vmull.u8 q8, d3, d0
vmull.u8 q9, d5, d0
vmull.u8 q10, d6, d0
vmull.u8 q11, d8, d0
vmull.u8 q12, d9, d0
vmull.u8 q13, d11, d0
vmull.u8 q14, d12, d0
vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
vext.8 d5, d5, d6, #1
vext.8 d8, d8, d9, #1
vext.8 d11, d11, d12, #1
vmlal.u8 q7, d2, d1 ;(src_ptr[1] * Filter[1])
vmlal.u8 q9, d5, d1
vmlal.u8 q11, d8, d1
vmlal.u8 q13, d11, d1
vext.8 d3, d3, d4, #1
vext.8 d6, d6, d7, #1
vext.8 d9, d9, d10, #1
vext.8 d12, d12, d13, #1
vmlal.u8 q8, d3, d1 ;(src_ptr[1] * Filter[1])
vmlal.u8 q10, d6, d1
vmlal.u8 q12, d9, d1
vmlal.u8 q14, d12, d1
subs r2, r2, #1
vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
vqrshrn.u16 d15, q8, #7
vqrshrn.u16 d16, q9, #7
vqrshrn.u16 d17, q10, #7
vqrshrn.u16 d18, q11, #7
vqrshrn.u16 d19, q12, #7
vqrshrn.u16 d20, q13, #7
vst1.u8 {d14, d15}, [r3]! ;store result
vqrshrn.u16 d21, q14, #7
vst1.u8 {d16, d17}, [r3]!
vst1.u8 {d18, d19}, [r3]!
vst1.u8 {d20, d21}, [r3]!
bne vp8e_filt_blk2d_fpo16x16_loop_neon
b sub_pixel_variance16x16_neon
;---------------------
secondpass_bfilter16x16_only
;Second pass: 16x16
;secondpass_filter
sub sp, sp, #528 ;reserve space on stack for temporary storage
add r3, r12, r3, lsl #3
mov r12, #4 ;loop counter
vld1.u32 {d31}, [r3] ;load second_pass filter
vld1.u8 {d22, d23}, [r0], r1 ;load src data
mov r3, sp
vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
vdup.8 d1, d31[4]
vp8e_filt_blk2d_spo16x16_loop_neon
vld1.u8 {d24, d25}, [r0], r1
vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
vld1.u8 {d26, d27}, [r0], r1
vmull.u8 q2, d23, d0
vld1.u8 {d28, d29}, [r0], r1
vmull.u8 q3, d24, d0
vld1.u8 {d30, d31}, [r0], r1
vmull.u8 q4, d25, d0
vmull.u8 q5, d26, d0
vmull.u8 q6, d27, d0
vmull.u8 q7, d28, d0
vmull.u8 q8, d29, d0
vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1])
vmlal.u8 q2, d25, d1
vmlal.u8 q3, d26, d1
vmlal.u8 q4, d27, d1
vmlal.u8 q5, d28, d1
vmlal.u8 q6, d29, d1
vmlal.u8 q7, d30, d1
vmlal.u8 q8, d31, d1
vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
vqrshrn.u16 d3, q2, #7
vqrshrn.u16 d4, q3, #7
vqrshrn.u16 d5, q4, #7
vqrshrn.u16 d6, q5, #7
vqrshrn.u16 d7, q6, #7
vqrshrn.u16 d8, q7, #7
vqrshrn.u16 d9, q8, #7
vst1.u8 {d2, d3}, [r3]! ;store result
subs r12, r12, #1
vst1.u8 {d4, d5}, [r3]!
vmov q11, q15
vst1.u8 {d6, d7}, [r3]!
vst1.u8 {d8, d9}, [r3]!
bne vp8e_filt_blk2d_spo16x16_loop_neon
b sub_pixel_variance16x16_neon
;----------------------------
;variance16x16
sub_pixel_variance16x16_neon
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - sse
vmov.i8 q10, #0
sub r3, r3, #256
mov r12, #8
sub_pixel_variance16x16_neon_loop
vld1.8 {q0}, [r3]! ;Load up source and reference
vld1.8 {q2}, [r4], r5
vld1.8 {q1}, [r3]!
vld1.8 {q3}, [r4], r5
vsubl.u8 q11, d0, d4 ;diff
vsubl.u8 q12, d1, d5
vsubl.u8 q13, d2, d6
vsubl.u8 q14, d3, d7
vpadal.s16 q8, q11 ;sum
vmlal.s16 q9, d22, d22 ;sse
vmlal.s16 q10, d23, d23
subs r12, r12, #1
vpadal.s16 q8, q12
vmlal.s16 q9, d24, d24
vmlal.s16 q10, d25, d25
vpadal.s16 q8, q13
vmlal.s16 q9, d26, d26
vmlal.s16 q10, d27, d27
vpadal.s16 q8, q14
vmlal.s16 q9, d28, d28
vmlal.s16 q10, d29, d29
bne sub_pixel_variance16x16_neon_loop
vadd.u32 q10, q9, q10 ;accumulate sse
vpaddl.s32 q0, q8 ;accumulate sum
vpaddl.u32 q1, q10
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [r6] ;store sse
vshr.u32 d10, d10, #8
vsub.u32 d0, d1, d10
add sp, sp, #528
vmov.32 r0, d0[0] ;return
pop {r4-r6,pc}
ENDP
END
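The epilogue above (vpaddl/vadd, then vmull.s32, vshr.u32 #8, vsub.u32) reduces the lane accumulators and computes variance = sse - sum*sum/256 over the 256 pixels of a 16x16 block. A minimal scalar sketch in C of that final step (hypothetical helper, not part of the diff):

#include <stdint.h>

/* Scalar model of the NEON epilogue: store sse, return sse - sum^2/256. */
static uint32_t variance16x16_epilogue(int32_t sum, uint32_t sse_acc,
                                       uint32_t *sse)
{
    *sse = sse_acc;                                  /* vst1.32 {d1[0]}, [r6] */
    return sse_acc - (uint32_t)(((int64_t)sum * sum) >> 8); /* vshr.u32 #8 */
}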

View file

@@ -0,0 +1,572 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_variance_halfpixvar16x16_h_neon|
EXPORT |vp8_variance_halfpixvar16x16_v_neon|
EXPORT |vp8_variance_halfpixvar16x16_hv_neon|
EXPORT |vp8_sub_pixel_variance16x16s_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;================================================
;unsigned int vp8_variance_halfpixvar16x16_h_neon
;(
; unsigned char *src_ptr, r0
; int src_pixels_per_line, r1
; unsigned char *dst_ptr, r2
; int dst_pixels_per_line, r3
; unsigned int *sse
;);
;================================================
|vp8_variance_halfpixvar16x16_h_neon| PROC
push {lr}
mov r12, #4 ;loop counter
ldr lr, [sp, #4] ;load *sse from stack
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - sse
vmov.i8 q10, #0
;First Pass: output_height lines x output_width columns (16x16)
vp8_filt_fpo16x16s_4_0_loop_neon
vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
vld1.8 {q11}, [r2], r3
vld1.u8 {d4, d5, d6, d7}, [r0], r1
vld1.8 {q12}, [r2], r3
vld1.u8 {d8, d9, d10, d11}, [r0], r1
vld1.8 {q13}, [r2], r3
vld1.u8 {d12, d13, d14, d15}, [r0], r1
;pld [r0]
;pld [r0, r1]
;pld [r0, r1, lsl #1]
vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
vext.8 q3, q2, q3, #1
vext.8 q5, q4, q5, #1
vext.8 q7, q6, q7, #1
vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
vld1.8 {q14}, [r2], r3
vrhadd.u8 q1, q2, q3
vrhadd.u8 q2, q4, q5
vrhadd.u8 q3, q6, q7
vsubl.u8 q4, d0, d22 ;diff
vsubl.u8 q5, d1, d23
vsubl.u8 q6, d2, d24
vsubl.u8 q7, d3, d25
vsubl.u8 q0, d4, d26
vsubl.u8 q1, d5, d27
vsubl.u8 q2, d6, d28
vsubl.u8 q3, d7, d29
vpadal.s16 q8, q4 ;sum
vmlal.s16 q9, d8, d8 ;sse
vmlal.s16 q10, d9, d9
subs r12, r12, #1
vpadal.s16 q8, q5
vmlal.s16 q9, d10, d10
vmlal.s16 q10, d11, d11
vpadal.s16 q8, q6
vmlal.s16 q9, d12, d12
vmlal.s16 q10, d13, d13
vpadal.s16 q8, q7
vmlal.s16 q9, d14, d14
vmlal.s16 q10, d15, d15
vpadal.s16 q8, q0 ;sum
vmlal.s16 q9, d0, d0 ;sse
vmlal.s16 q10, d1, d1
vpadal.s16 q8, q1
vmlal.s16 q9, d2, d2
vmlal.s16 q10, d3, d3
vpadal.s16 q8, q2
vmlal.s16 q9, d4, d4
vmlal.s16 q10, d5, d5
vpadal.s16 q8, q3
vmlal.s16 q9, d6, d6
vmlal.s16 q10, d7, d7
bne vp8_filt_fpo16x16s_4_0_loop_neon
vadd.u32 q10, q9, q10 ;accumulate sse
vpaddl.s32 q0, q8 ;accumulate sum
vpaddl.u32 q1, q10
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [lr] ;store sse
vshr.u32 d10, d10, #8
vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
pop {pc}
ENDP
;================================================
;unsigned int vp8_variance_halfpixvar16x16_v_neon
;(
; unsigned char *src_ptr, r0
; int src_pixels_per_line, r1
; unsigned char *dst_ptr, r2
; int dst_pixels_per_line, r3
; unsigned int *sse
;);
;================================================
|vp8_variance_halfpixvar16x16_v_neon| PROC
push {lr}
mov r12, #4 ;loop counter
vld1.u8 {q0}, [r0], r1 ;load src data
ldr lr, [sp, #4] ;load *sse from stack
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - sse
vmov.i8 q10, #0
vp8_filt_spo16x16s_0_4_loop_neon
vld1.u8 {q2}, [r0], r1
vld1.8 {q1}, [r2], r3
vld1.u8 {q4}, [r0], r1
vld1.8 {q3}, [r2], r3
vld1.u8 {q6}, [r0], r1
vld1.8 {q5}, [r2], r3
vld1.u8 {q15}, [r0], r1
vrhadd.u8 q0, q0, q2
vld1.8 {q7}, [r2], r3
vrhadd.u8 q2, q2, q4
vrhadd.u8 q4, q4, q6
vrhadd.u8 q6, q6, q15
vsubl.u8 q11, d0, d2 ;diff
vsubl.u8 q12, d1, d3
vsubl.u8 q13, d4, d6
vsubl.u8 q14, d5, d7
vsubl.u8 q0, d8, d10
vsubl.u8 q1, d9, d11
vsubl.u8 q2, d12, d14
vsubl.u8 q3, d13, d15
vpadal.s16 q8, q11 ;sum
vmlal.s16 q9, d22, d22 ;sse
vmlal.s16 q10, d23, d23
subs r12, r12, #1
vpadal.s16 q8, q12
vmlal.s16 q9, d24, d24
vmlal.s16 q10, d25, d25
vpadal.s16 q8, q13
vmlal.s16 q9, d26, d26
vmlal.s16 q10, d27, d27
vpadal.s16 q8, q14
vmlal.s16 q9, d28, d28
vmlal.s16 q10, d29, d29
vpadal.s16 q8, q0 ;sum
vmlal.s16 q9, d0, d0 ;sse
vmlal.s16 q10, d1, d1
vpadal.s16 q8, q1
vmlal.s16 q9, d2, d2
vmlal.s16 q10, d3, d3
vpadal.s16 q8, q2
vmlal.s16 q9, d4, d4
vmlal.s16 q10, d5, d5
vmov q0, q15
vpadal.s16 q8, q3
vmlal.s16 q9, d6, d6
vmlal.s16 q10, d7, d7
bne vp8_filt_spo16x16s_0_4_loop_neon
vadd.u32 q10, q9, q10 ;accumulate sse
vpaddl.s32 q0, q8 ;accumulate sum
vpaddl.u32 q1, q10
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [lr] ;store sse
vshr.u32 d10, d10, #8
vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
pop {pc}
ENDP
;================================================
;unsigned int vp8_variance_halfpixvar16x16_hv_neon
;(
; unsigned char *src_ptr, r0
; int src_pixels_per_line, r1
; unsigned char *dst_ptr, r2
; int dst_pixels_per_line, r3
; unsigned int *sse
;);
;================================================
|vp8_variance_halfpixvar16x16_hv_neon| PROC
push {lr}
vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
ldr lr, [sp, #4] ;load *sse from stack
vmov.i8 q13, #0 ;q13 - sum
vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
vmov.i8 q14, #0 ;q14, q15 - sse
vmov.i8 q15, #0
mov r12, #4 ;loop counter
vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
;First Pass: output_height lines x output_width columns (17x16)
vp8_filt16x16s_4_4_loop_neon
vld1.u8 {d4, d5, d6, d7}, [r0], r1
vld1.u8 {d8, d9, d10, d11}, [r0], r1
vld1.u8 {d12, d13, d14, d15}, [r0], r1
vld1.u8 {d16, d17, d18, d19}, [r0], r1
;pld [r0]
;pld [r0, r1]
;pld [r0, r1, lsl #1]
vext.8 q3, q2, q3, #1 ;construct src_ptr[1]
vext.8 q5, q4, q5, #1
vext.8 q7, q6, q7, #1
vext.8 q9, q8, q9, #1
vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
vrhadd.u8 q2, q4, q5
vrhadd.u8 q3, q6, q7
vrhadd.u8 q4, q8, q9
vld1.8 {q5}, [r2], r3
vrhadd.u8 q0, q0, q1
vld1.8 {q6}, [r2], r3
vrhadd.u8 q1, q1, q2
vld1.8 {q7}, [r2], r3
vrhadd.u8 q2, q2, q3
vld1.8 {q8}, [r2], r3
vrhadd.u8 q3, q3, q4
vsubl.u8 q9, d0, d10 ;diff
vsubl.u8 q10, d1, d11
vsubl.u8 q11, d2, d12
vsubl.u8 q12, d3, d13
vsubl.u8 q0, d4, d14 ;diff
vsubl.u8 q1, d5, d15
vsubl.u8 q5, d6, d16
vsubl.u8 q6, d7, d17
vpadal.s16 q13, q9 ;sum
vmlal.s16 q14, d18, d18 ;sse
vmlal.s16 q15, d19, d19
vpadal.s16 q13, q10 ;sum
vmlal.s16 q14, d20, d20 ;sse
vmlal.s16 q15, d21, d21
vpadal.s16 q13, q11 ;sum
vmlal.s16 q14, d22, d22 ;sse
vmlal.s16 q15, d23, d23
vpadal.s16 q13, q12 ;sum
vmlal.s16 q14, d24, d24 ;sse
vmlal.s16 q15, d25, d25
subs r12, r12, #1
vpadal.s16 q13, q0 ;sum
vmlal.s16 q14, d0, d0 ;sse
vmlal.s16 q15, d1, d1
vpadal.s16 q13, q1 ;sum
vmlal.s16 q14, d2, d2 ;sse
vmlal.s16 q15, d3, d3
vpadal.s16 q13, q5 ;sum
vmlal.s16 q14, d10, d10 ;sse
vmlal.s16 q15, d11, d11
vmov q0, q4
vpadal.s16 q13, q6 ;sum
vmlal.s16 q14, d12, d12 ;sse
vmlal.s16 q15, d13, d13
bne vp8_filt16x16s_4_4_loop_neon
vadd.u32 q15, q14, q15 ;accumulate sse
vpaddl.s32 q0, q13 ;accumulate sum
vpaddl.u32 q1, q15
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [lr] ;store sse
vshr.u32 d10, d10, #8
vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
pop {pc}
ENDP
;==============================
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; stack unsigned char *dst_ptr,
; stack int dst_pixels_per_line,
; stack unsigned int *sse
;note: in vp8_find_best_half_pixel_step() (called when 8<Speed<15) and in the first call of
;vp8_find_best_sub_pixel_step() (called when speed<=8), xoffset/yoffset can only be 4 or 0,
;which means the filter is either bypassed or its coeffs are {64, 64}. This simplified
;routine only works in that situation.
;note: it can happen that both xoffset and yoffset are zero; that case can be handled in C code later.
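With the taps restricted this way, a {64, 64} pair collapses to a rounded average, which is exactly what the vrhadd.u8 instructions below compute; the zero-offset branches skip a pass entirely. A minimal scalar sketch in C (hypothetical helper, not part of the diff):

/* (a*64 + b*64 + 64) >> 7 == (a + b + 1) >> 1, i.e. NEON vrhadd.u8 */
static unsigned char half_pel_avg(unsigned char a, unsigned char b)
{
    return (unsigned char)((a + b + 1) >> 1);
}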
|vp8_sub_pixel_variance16x16s_neon| PROC
push {r4, lr}
ldr r4, [sp, #8] ;load *dst_ptr from stack
ldr r12, [sp, #12] ;load dst_pixels_per_line from stack
ldr lr, [sp, #16] ;load *sse from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq secondpass_bfilter16x16s_only
cmp r3, #0 ;skip second_pass filter if yoffset=0
beq firstpass_bfilter16x16s_only
vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
sub sp, sp, #256 ;reserve space on stack for temporary storage
vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
mov r3, sp
mov r2, #4 ;loop counter
vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
;First Pass: output_height lines x output_width columns (17x16)
vp8e_filt_blk2d_fp16x16s_loop_neon
vld1.u8 {d4, d5, d6, d7}, [r0], r1
vld1.u8 {d8, d9, d10, d11}, [r0], r1
vld1.u8 {d12, d13, d14, d15}, [r0], r1
vld1.u8 {d16, d17, d18, d19}, [r0], r1
;pld [r0]
;pld [r0, r1]
;pld [r0, r1, lsl #1]
vext.8 q3, q2, q3, #1 ;construct src_ptr[1]
vext.8 q5, q4, q5, #1
vext.8 q7, q6, q7, #1
vext.8 q9, q8, q9, #1
vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
vrhadd.u8 q2, q4, q5
vrhadd.u8 q3, q6, q7
vrhadd.u8 q4, q8, q9
vrhadd.u8 q0, q0, q1
vrhadd.u8 q1, q1, q2
vrhadd.u8 q2, q2, q3
vrhadd.u8 q3, q3, q4
subs r2, r2, #1
vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result
vmov q0, q4
vst1.u8 {d4, d5, d6, d7}, [r3]!
bne vp8e_filt_blk2d_fp16x16s_loop_neon
b sub_pixel_variance16x16s_neon
;--------------------
firstpass_bfilter16x16s_only
mov r2, #2 ;loop counter
sub sp, sp, #256 ;reserve space on stack for temporary storage
mov r3, sp
;First Pass: output_height lines x output_width columns (16x16)
vp8e_filt_blk2d_fpo16x16s_loop_neon
vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
vld1.u8 {d4, d5, d6, d7}, [r0], r1
vld1.u8 {d8, d9, d10, d11}, [r0], r1
vld1.u8 {d12, d13, d14, d15}, [r0], r1
;pld [r0]
;pld [r0, r1]
;pld [r0, r1, lsl #1]
vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
vld1.u8 {d16, d17, d18, d19}, [r0], r1
vext.8 q3, q2, q3, #1
vld1.u8 {d20, d21, d22, d23}, [r0], r1
vext.8 q5, q4, q5, #1
vld1.u8 {d24, d25, d26, d27}, [r0], r1
vext.8 q7, q6, q7, #1
vld1.u8 {d28, d29, d30, d31}, [r0], r1
vext.8 q9, q8, q9, #1
vext.8 q11, q10, q11, #1
vext.8 q13, q12, q13, #1
vext.8 q15, q14, q15, #1
vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
vrhadd.u8 q1, q2, q3
vrhadd.u8 q2, q4, q5
vrhadd.u8 q3, q6, q7
vrhadd.u8 q4, q8, q9
vrhadd.u8 q5, q10, q11
vrhadd.u8 q6, q12, q13
vrhadd.u8 q7, q14, q15
subs r2, r2, #1
vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result
vst1.u8 {d4, d5, d6, d7}, [r3]!
vst1.u8 {d8, d9, d10, d11}, [r3]!
vst1.u8 {d12, d13, d14, d15}, [r3]!
bne vp8e_filt_blk2d_fpo16x16s_loop_neon
b sub_pixel_variance16x16s_neon
;---------------------
secondpass_bfilter16x16s_only
sub sp, sp, #256 ;reserve space on stack for temporary storage
mov r2, #2 ;loop counter
vld1.u8 {d0, d1}, [r0], r1 ;load src data
mov r3, sp
vp8e_filt_blk2d_spo16x16s_loop_neon
vld1.u8 {d2, d3}, [r0], r1
vld1.u8 {d4, d5}, [r0], r1
vld1.u8 {d6, d7}, [r0], r1
vld1.u8 {d8, d9}, [r0], r1
vrhadd.u8 q0, q0, q1
vld1.u8 {d10, d11}, [r0], r1
vrhadd.u8 q1, q1, q2
vld1.u8 {d12, d13}, [r0], r1
vrhadd.u8 q2, q2, q3
vld1.u8 {d14, d15}, [r0], r1
vrhadd.u8 q3, q3, q4
vld1.u8 {d16, d17}, [r0], r1
vrhadd.u8 q4, q4, q5
vrhadd.u8 q5, q5, q6
vrhadd.u8 q6, q6, q7
vrhadd.u8 q7, q7, q8
subs r2, r2, #1
vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result
vmov q0, q8
vst1.u8 {d4, d5, d6, d7}, [r3]!
vst1.u8 {d8, d9, d10, d11}, [r3]! ;store result
vst1.u8 {d12, d13, d14, d15}, [r3]!
bne vp8e_filt_blk2d_spo16x16s_loop_neon
b sub_pixel_variance16x16s_neon
;----------------------------
;variance16x16
sub_pixel_variance16x16s_neon
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - sse
vmov.i8 q10, #0
sub r3, r3, #256
mov r2, #4
sub_pixel_variance16x16s_neon_loop
vld1.8 {q0}, [r3]! ;Load up source and reference
vld1.8 {q1}, [r4], r12
vld1.8 {q2}, [r3]!
vld1.8 {q3}, [r4], r12
vld1.8 {q4}, [r3]!
vld1.8 {q5}, [r4], r12
vld1.8 {q6}, [r3]!
vld1.8 {q7}, [r4], r12
vsubl.u8 q11, d0, d2 ;diff
vsubl.u8 q12, d1, d3
vsubl.u8 q13, d4, d6
vsubl.u8 q14, d5, d7
vsubl.u8 q0, d8, d10
vsubl.u8 q1, d9, d11
vsubl.u8 q2, d12, d14
vsubl.u8 q3, d13, d15
vpadal.s16 q8, q11 ;sum
vmlal.s16 q9, d22, d22 ;sse
vmlal.s16 q10, d23, d23
subs r2, r2, #1
vpadal.s16 q8, q12
vmlal.s16 q9, d24, d24
vmlal.s16 q10, d25, d25
vpadal.s16 q8, q13
vmlal.s16 q9, d26, d26
vmlal.s16 q10, d27, d27
vpadal.s16 q8, q14
vmlal.s16 q9, d28, d28
vmlal.s16 q10, d29, d29
vpadal.s16 q8, q0 ;sum
vmlal.s16 q9, d0, d0 ;sse
vmlal.s16 q10, d1, d1
vpadal.s16 q8, q1
vmlal.s16 q9, d2, d2
vmlal.s16 q10, d3, d3
vpadal.s16 q8, q2
vmlal.s16 q9, d4, d4
vmlal.s16 q10, d5, d5
vpadal.s16 q8, q3
vmlal.s16 q9, d6, d6
vmlal.s16 q10, d7, d7
bne sub_pixel_variance16x16s_neon_loop
vadd.u32 q10, q9, q10 ;accumulate sse
vpaddl.s32 q0, q8 ;accumulate sum
vpaddl.u32 q1, q10
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [lr] ;store sse
vshr.u32 d10, d10, #8
vsub.u32 d0, d1, d10
add sp, sp, #256
vmov.32 r0, d0[0] ;return
pop {r4, pc}
ENDP
END
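When both offsets are 4, the path above applies that rounded average twice: a horizontal pass over 17 source rows into the stack buffer, then a vertical pass between adjacent filtered rows. A scalar model of the two passes, assuming a 16x16 destination (hypothetical helper):

/* Half-pel horizontal then half-pel vertical, with vrhadd-style rounding. */
static void half_pel_hv_16x16(const unsigned char *src, int stride,
                              unsigned char *dst)
{
    unsigned char first[17][16];
    int r, c;
    for (r = 0; r < 17; ++r)          /* 17 filtered rows feed 16 output rows */
        for (c = 0; c < 16; ++c)
            first[r][c] = (src[r * stride + c] + src[r * stride + c + 1] + 1) >> 1;
    for (r = 0; r < 16; ++r)
        for (c = 0; c < 16; ++c)
            dst[r * 16 + c] = (first[r][c] + first[r + 1][c] + 1) >> 1;
}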

View file

@@ -0,0 +1,222 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_sub_pixel_variance8x8_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
; r3 int yoffset,
; stack(r4) unsigned char *dst_ptr,
; stack(r5) int dst_pixels_per_line,
; stack(r6) unsigned int *sse
;note: most of the code is copied from bilinear_predict8x8_neon and vp8_variance8x8_neon.
|vp8_sub_pixel_variance8x8_neon| PROC
push {r4-r5, lr}
adr r12, bilinear_taps_coeff
ldr r4, [sp, #12] ;load *dst_ptr from stack
ldr r5, [sp, #16] ;load dst_pixels_per_line from stack
ldr lr, [sp, #20] ;load *sse from stack
cmp r2, #0 ;skip first_pass filter if xoffset=0
beq skip_firstpass_filter
;First pass: output_height lines x output_width columns (9x8)
add r2, r12, r2, lsl #3 ;calculate filter location
vld1.u8 {q1}, [r0], r1 ;load src data
vld1.u32 {d31}, [r2] ;load first_pass filter
vld1.u8 {q2}, [r0], r1
vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
vld1.u8 {q3}, [r0], r1
vdup.8 d1, d31[4]
vld1.u8 {q4}, [r0], r1
vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
vmull.u8 q7, d4, d0
vmull.u8 q8, d6, d0
vmull.u8 q9, d8, d0
vext.8 d3, d2, d3, #1 ;construct src_ptr[1]
vext.8 d5, d4, d5, #1
vext.8 d7, d6, d7, #1
vext.8 d9, d8, d9, #1
vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
vmlal.u8 q7, d5, d1
vmlal.u8 q8, d7, d1
vmlal.u8 q9, d9, d1
vld1.u8 {q1}, [r0], r1 ;load src data
vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
vld1.u8 {q2}, [r0], r1
vqrshrn.u16 d23, q7, #7
vld1.u8 {q3}, [r0], r1
vqrshrn.u16 d24, q8, #7
vld1.u8 {q4}, [r0], r1
vqrshrn.u16 d25, q9, #7
;first_pass filtering on the remaining 5 lines of data
vld1.u8 {q5}, [r0], r1
vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
vmull.u8 q7, d4, d0
vmull.u8 q8, d6, d0
vmull.u8 q9, d8, d0
vmull.u8 q10, d10, d0
vext.8 d3, d2, d3, #1 ;construct src_ptr[1]
vext.8 d5, d4, d5, #1
vext.8 d7, d6, d7, #1
vext.8 d9, d8, d9, #1
vext.8 d11, d10, d11, #1
vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
vmlal.u8 q7, d5, d1
vmlal.u8 q8, d7, d1
vmlal.u8 q9, d9, d1
vmlal.u8 q10, d11, d1
vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8
vqrshrn.u16 d27, q7, #7
vqrshrn.u16 d28, q8, #7
vqrshrn.u16 d29, q9, #7
vqrshrn.u16 d30, q10, #7
;Second pass: 8x8
secondpass_filter
cmp r3, #0 ;skip second_pass filter if yoffset=0
;skip_secondpass_filter
beq sub_pixel_variance8x8_neon
add r3, r12, r3, lsl #3
vld1.u32 {d31}, [r3] ;load second_pass filter
vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
vdup.8 d1, d31[4]
vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
vmull.u8 q2, d23, d0
vmull.u8 q3, d24, d0
vmull.u8 q4, d25, d0
vmull.u8 q5, d26, d0
vmull.u8 q6, d27, d0
vmull.u8 q7, d28, d0
vmull.u8 q8, d29, d0
vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * Filter[1])
vmlal.u8 q2, d24, d1
vmlal.u8 q3, d25, d1
vmlal.u8 q4, d26, d1
vmlal.u8 q5, d27, d1
vmlal.u8 q6, d28, d1
vmlal.u8 q7, d29, d1
vmlal.u8 q8, d30, d1
vqrshrn.u16 d22, q1, #7 ;shift/round/saturate to u8
vqrshrn.u16 d23, q2, #7
vqrshrn.u16 d24, q3, #7
vqrshrn.u16 d25, q4, #7
vqrshrn.u16 d26, q5, #7
vqrshrn.u16 d27, q6, #7
vqrshrn.u16 d28, q7, #7
vqrshrn.u16 d29, q8, #7
b sub_pixel_variance8x8_neon
;--------------------
skip_firstpass_filter
vld1.u8 {d22}, [r0], r1 ;load src data
vld1.u8 {d23}, [r0], r1
vld1.u8 {d24}, [r0], r1
vld1.u8 {d25}, [r0], r1
vld1.u8 {d26}, [r0], r1
vld1.u8 {d27}, [r0], r1
vld1.u8 {d28}, [r0], r1
vld1.u8 {d29}, [r0], r1
vld1.u8 {d30}, [r0], r1
b secondpass_filter
;----------------------
;vp8_variance8x8_neon
sub_pixel_variance8x8_neon
vmov.i8 q8, #0 ;q8 - sum
vmov.i8 q9, #0 ;q9, q10 - sse
vmov.i8 q10, #0
mov r12, #2
sub_pixel_variance8x8_neon_loop
vld1.8 {d0}, [r4], r5 ;load dst data
subs r12, r12, #1
vld1.8 {d1}, [r4], r5
vld1.8 {d2}, [r4], r5
vsubl.u8 q4, d22, d0 ;calculate diff
vld1.8 {d3}, [r4], r5
vsubl.u8 q5, d23, d1
vsubl.u8 q6, d24, d2
vpadal.s16 q8, q4 ;sum
vmlal.s16 q9, d8, d8 ;sse
vmlal.s16 q10, d9, d9
vsubl.u8 q7, d25, d3
vpadal.s16 q8, q5
vmlal.s16 q9, d10, d10
vmlal.s16 q10, d11, d11
vmov q11, q13
vpadal.s16 q8, q6
vmlal.s16 q9, d12, d12
vmlal.s16 q10, d13, d13
vmov q12, q14
vpadal.s16 q8, q7
vmlal.s16 q9, d14, d14
vmlal.s16 q10, d15, d15
bne sub_pixel_variance8x8_neon_loop
vadd.u32 q10, q9, q10 ;accumulate sse
vpaddl.s32 q0, q8 ;accumulate sum
vpaddl.u32 q1, q10
vadd.s64 d0, d0, d1
vadd.u64 d1, d2, d3
vmull.s32 q5, d0, d0
vst1.32 {d1[0]}, [lr] ;store sse
vshr.u32 d10, d10, #6
vsub.u32 d0, d1, d10
vmov.32 r0, d0[0] ;return
pop {r4-r5, pc}
ENDP
;-----------------
bilinear_taps_coeff
DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
END
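The bilinear_taps_coeff table above packs eight {first, second} tap pairs, one per quarter-pel offset: {128 - 16*offset, 16*offset}. A scalar sketch of how one pair is applied, where the +64 and >>7 match the vqrshrn.u16 #7 rounding (hypothetical helper):

static const int bilinear_taps[8][2] = {
    {128, 0}, {112, 16}, {96, 32}, {80, 48},
    {64, 64}, {48, 80}, {32, 96}, {16, 112}
};

/* out = (a*first + b*second + 64) >> 7 for the pair selected by offset */
static unsigned char apply_bilinear_tap(unsigned char a, unsigned char b,
                                        int offset)
{
    return (unsigned char)((a * bilinear_taps[offset][0] +
                            b * bilinear_taps[offset][1] + 64) >> 7);
}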

Diff not shown because of its large size.

View file

@@ -0,0 +1,58 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "vp8/common/blockd.h"
#include "vpx_mem/vpx_mem.h"
#if HAVE_NEON
extern void vp8_build_intra_predictors_mby_neon_func(
unsigned char *y_buffer,
unsigned char *ypred_ptr,
int y_stride,
int mode,
int Up,
int Left);
void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x)
{
unsigned char *y_buffer = x->dst.y_buffer;
unsigned char *ypred_ptr = x->predictor;
int y_stride = x->dst.y_stride;
int mode = x->mode_info_context->mbmi.mode;
int Up = x->up_available;
int Left = x->left_available;
vp8_build_intra_predictors_mby_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left);
}
extern void vp8_build_intra_predictors_mby_s_neon_func(
unsigned char *y_buffer,
unsigned char *ypred_ptr,
int y_stride,
int mode,
int Up,
int Left);
void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x)
{
unsigned char *y_buffer = x->dst.y_buffer;
unsigned char *ypred_ptr = x->predictor;
int y_stride = x->dst.y_stride;
int mode = x->mode_info_context->mbmi.mode;
int Up = x->up_available;
int Left = x->left_available;
vp8_build_intra_predictors_mby_s_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left);
}
#endif

View file

@@ -22,9 +22,6 @@
extern "C" {
#endif
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
/* Only need this for fixed-size arrays, for structs just assign. */
#define vp8_copy( Dest, Src) { \

View file

@@ -15,6 +15,7 @@
#include "onyxc_int.h"
#include "vpx_mem/vpx_mem.h"
typedef unsigned char uc;
static void lf_init_lut(loop_filter_info_n *lfi)
{

View file

@@ -104,18 +104,7 @@ extern "C"
struct vpx_rational timebase;
unsigned int target_bandwidth; /* kilobits per second */
/* Parameter used for applying denoiser.
* For temporal denoiser: noise_sensitivity = 0 means off,
* noise_sensitivity = 1 means temporal denoiser on for Y channel only,
* noise_sensitivity = 2 means temporal denoiser on for all channels.
* noise_sensitivity = 3 means aggressive denoising mode.
* noise_sensitivity >= 4 means adaptive denoising mode.
* Temporal denoiser is enabled via the configuration option:
* CONFIG_TEMPORAL_DENOISING.
* For spatial denoiser: noise_sensitivity controls the amount of
* pre-processing blur: noise_sensitivity = 0 means off.
* Spatial denoiser invoked under !CONFIG_TEMPORAL_DENOISING.
*/
/* parameter used for applying pre processing blur: recommendation 0 */
int noise_sensitivity;
/* parameter used for sharpening output: recommendation 0: */
@@ -224,7 +213,7 @@ extern "C"
int arnr_strength;
int arnr_type;
vpx_fixed_buf_t two_pass_stats_in;
struct vpx_fixed_buf two_pass_stats_in;
struct vpx_codec_pkt_list *output_pkt_list;
vp8e_tuning tuning;
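The comment block deleted above documented the noise_sensitivity semantics (0 = off; with CONFIG_TEMPORAL_DENOISING, 1 = temporal denoising on Y only, 2 = all channels; without it, the value controls pre-processing blur). Applications normally set this knob through the control interface; a minimal sketch, assuming an initialized encoder context ctx:

#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

/* 0 disables denoising; see the removed comment for the other values. */
vpx_codec_control(&ctx, VP8E_SET_NOISE_SENSITIVITY, 1);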

View file

@@ -214,7 +214,6 @@ static int q2mbl(int x)
x = 50 + (x - 50) * 10 / 8;
return x * x / 3;
}
void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit)
{
int r, c, i;
@@ -227,14 +226,14 @@ void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int co
int sumsq = 0;
int sum = 0;
for (i = -8; i < 0; i++)
for (i = -8; i<0; i++)
s[i]=s[0];
/* 17 avoids valgrind warning - we buffer values in c in d
* and only write them when we've read 8 ahead...
*/
for (i = 0; i < 17; i++)
s[i+cols]=s[cols-1];
for (i = cols; i<cols+17; i++)
s[i]=s[cols-1];
for (i = -8; i <= 6; i++)
{
@@ -265,6 +264,7 @@ void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int co
}
}
void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit)
{
int r, c, i;
@@ -284,8 +284,8 @@ void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, i
/* 17 avoids valgrind warning - we buffer values in c in d
* and only write them when we've read 8 ahead...
*/
for (i = 0; i < 17; i++)
s[(i+rows)*pitch]=s[(rows-1)*pitch];
for (i = rows; i < rows+17; i++)
s[i*pitch]=s[(rows-1)*pitch];
for (i = -8; i <= 6; i++)
{
@@ -303,14 +303,13 @@ void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, i
{
d[r&15] = (rv2[r&127] + sum + s[0]) >> 4;
}
if (r >= 8)
s[-8*pitch] = d[(r-8)&15];
s[-8*pitch] = d[(r-8)&15];
s += pitch;
}
}
}
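Both mbpost_proc routines keep sum and sumsq over a sliding 15-tap window ([-8, 6] around the current sample), so each step costs O(1): only the entering and leaving samples are touched. A sketch of the update, assuming the padded s[] layout established above (hypothetical helper):

/* O(1) update when the 15-tap window [c-8, c+6] advances by one sample. */
static void advance_window(const unsigned char *s, int c, int *sum, int *sumsq)
{
    int x = s[c + 7] - s[c - 8];          /* entering minus leaving sample */
    *sum   += x;
    *sumsq += x * (s[c + 7] + s[c - 8]);  /* a*a - b*b == (a - b)*(a + b) */
}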
#if CONFIG_POSTPROC
static void vp8_de_mblock(YV12_BUFFER_CONFIG *post,
int q)
{
@@ -383,22 +382,21 @@ void vp8_deblock(VP8_COMMON *cm,
vp8_yv12_copy_frame(source, post);
}
}
#endif
#if !(CONFIG_TEMPORAL_DENOISING)
void vp8_de_noise(VP8_COMMON *cm,
YV12_BUFFER_CONFIG *source,
YV12_BUFFER_CONFIG *post,
int q,
int low_var_thresh,
int flag,
int uvfilter)
int flag)
{
int mbr;
double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
int ppl = (int)(level + .5);
int mb_rows = cm->mb_rows;
int mb_cols = cm->mb_cols;
int mb_rows = source->y_width >> 4;
int mb_cols = source->y_height >> 4;
unsigned char *limits = cm->pp_limits_buffer;
int mbr, mbc;
(void) post;
(void) low_var_thresh;
(void) flag;
@@ -412,20 +410,18 @@ void vp8_de_noise(VP8_COMMON *cm,
source->y_buffer + 16 * mbr * source->y_stride,
source->y_buffer + 16 * mbr * source->y_stride,
source->y_stride, source->y_stride, source->y_width, limits, 16);
if (uvfilter == 1) {
vp8_post_proc_down_and_across_mb_row(
source->u_buffer + 8 * mbr * source->uv_stride,
source->u_buffer + 8 * mbr * source->uv_stride,
source->uv_stride, source->uv_stride, source->uv_width, limits,
8);
vp8_post_proc_down_and_across_mb_row(
source->v_buffer + 8 * mbr * source->uv_stride,
source->v_buffer + 8 * mbr * source->uv_stride,
source->uv_stride, source->uv_stride, source->uv_width, limits,
8);
}
vp8_post_proc_down_and_across_mb_row(
source->u_buffer + 8 * mbr * source->uv_stride,
source->u_buffer + 8 * mbr * source->uv_stride,
source->uv_stride, source->uv_stride, source->uv_width, limits, 8);
vp8_post_proc_down_and_across_mb_row(
source->v_buffer + 8 * mbr * source->uv_stride,
source->v_buffer + 8 * mbr * source->uv_stride,
source->uv_stride, source->uv_stride, source->uv_width, limits, 8);
}
}
#endif
double vp8_gaussian(double sigma, double mu, double x)
{

View file

@@ -39,8 +39,7 @@ void vp8_de_noise(struct VP8Common *oci,
YV12_BUFFER_CONFIG *post,
int q,
int low_var_thresh,
int flag,
int uvfilter);
int flag);
void vp8_deblock(struct VP8Common *oci,
YV12_BUFFER_CONFIG *source,

View file

@@ -0,0 +1,29 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_COMMON_PRAGMAS_H_
#define VP8_COMMON_PRAGMAS_H_
#ifdef __cplusplus
extern "C" {
#endif
#ifdef __INTEL_COMPILER
#pragma warning(disable:997 1011 170)
#endif
#ifdef _MSC_VER
#pragma warning(disable:4799)
#endif
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP8_COMMON_PRAGMAS_H_

View file

@@ -527,7 +527,7 @@ sym(vp8_loop_filter_vertical_edge_mmx):
pxor mm7, [GLOBAL(t80)] ; unoffset
; mm7 = q1
; transpose and write back
; tranpose and write back
; mm1 = 72 62 52 42 32 22 12 02
; mm6 = 73 63 53 43 33 23 13 03
; mm3 = 74 64 54 44 34 24 14 04
@@ -1289,7 +1289,7 @@ sym(vp8_mbloop_filter_vertical_edge_mmx):
pxor mm6, [GLOBAL(t80)] ; mm6 = 71 61 51 41 31 21 11 01
pxor mm3, [GLOBAL(t80)] ; mm3 = 76 66 56 46 36 26 15 06
; transpose and write back
; tranpose and write back
movq mm0, [rdx] ; mm0 = 70 60 50 40 30 20 10 00
movq mm1, mm0 ; mm0 = 70 60 50 40 30 20 10 00

View file

@@ -958,7 +958,7 @@ sym(vp8_loop_filter_vertical_edge_sse2):
; start work on filters
B_FILTER 2
; transpose and write back - only work on q1, q0, p0, p1
; tranpose and write back - only work on q1, q0, p0, p1
BV_TRANSPOSE
; store 16-line result
@@ -1023,7 +1023,7 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2):
; start work on filters
B_FILTER 2
; transpose and write back - only work on q1, q0, p0, p1
; tranpose and write back - only work on q1, q0, p0, p1
BV_TRANSPOSE
lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing

View file

@@ -204,16 +204,13 @@ sym(vp8_mbpost_proc_down_mmx):
and rcx, 15
movd DWORD PTR [rsp+rcx*4], mm1 ;d[rcx*4]
cmp edx, 8
jl .skip_assignment
mov rcx, rdx
sub rcx, 8
and rcx, 15
movd mm1, DWORD PTR [rsp+rcx*4] ;d[rcx*4]
movd [rsi], mm1
.skip_assignment
movd [rsi], mm1
lea rsi, [rsi+rax]
lea rdi, [rdi+rax]
@@ -246,6 +243,7 @@ sym(vp8_mbpost_proc_down_mmx):
; unsigned char whiteclamp[16],
; unsigned char bothclamp[16],
; unsigned int Width, unsigned int Height, int Pitch)
extern sym(rand)
global sym(vp8_plane_add_noise_mmx) PRIVATE
sym(vp8_plane_add_noise_mmx):
push rbp
@@ -257,7 +255,7 @@ sym(vp8_plane_add_noise_mmx):
; end prolog
.addnoise_loop:
call sym(LIBVPX_RAND) WRT_PLT
call sym(rand) WRT_PLT
mov rcx, arg(1) ;noise
and rax, 0xff
add rcx, rax

View file

@@ -425,16 +425,13 @@ sym(vp8_mbpost_proc_down_xmm):
and rcx, 15
movq QWORD PTR [rsp + rcx*8], xmm1 ;d[rcx*8]
cmp edx, 8
jl .skip_assignment
mov rcx, rdx
sub rcx, 8
and rcx, 15
movq mm0, [rsp + rcx*8] ;d[rcx*8]
movq [rsi], mm0
.skip_assignment
movq [rsi], mm0
lea rsi, [rsi+rax]
lea rdi, [rdi+rax]
@@ -660,6 +657,7 @@ sym(vp8_mbpost_proc_across_ip_xmm):
; unsigned char whiteclamp[16],
; unsigned char bothclamp[16],
; unsigned int Width, unsigned int Height, int Pitch)
extern sym(rand)
global sym(vp8_plane_add_noise_wmt) PRIVATE
sym(vp8_plane_add_noise_wmt):
push rbp
@@ -671,7 +669,7 @@ sym(vp8_plane_add_noise_wmt):
; end prolog
.addnoise_loop:
call sym(LIBVPX_RAND) WRT_PLT
call sym(rand) WRT_PLT
mov rcx, arg(1) ;noise
and rax, 0xff
add rcx, rax

View file

@@ -0,0 +1,24 @@
/*
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/* On Android NDK, rand is an inline function, but postproc needs the rand symbol */
#if defined(__ANDROID__)
#define rand __rand
#include <stdlib.h>
#undef rand
extern int rand(void)
{
return __rand();
}
#else
/* ISO C forbids an empty translation unit. */
int vp8_unused;
#endif

View file

@@ -365,7 +365,6 @@ sym(vp8_intra_pred_uv_tm_%1):
GET_GOT rbx
push rsi
push rdi
push rbx
; end prolog
; read top row
@@ -396,11 +395,8 @@ sym(vp8_intra_pred_uv_tm_%1):
movsxd rcx, dword ptr arg(1) ;dst_stride
.vp8_intra_pred_uv_tm_%1_loop:
mov bl, [rsi]
movd xmm3, ebx
mov bl, [rsi+rax]
movd xmm5, ebx
movd xmm3, [rsi]
movd xmm5, [rsi+rax]
%ifidn %1, sse2
punpcklbw xmm3, xmm0
punpcklbw xmm5, xmm0
@@ -423,7 +419,6 @@ sym(vp8_intra_pred_uv_tm_%1):
jnz .vp8_intra_pred_uv_tm_%1_loop
; begin epilog
pop rbx
pop rdi
pop rsi
RESTORE_GOT
@@ -491,8 +486,10 @@ sym(vp8_intra_pred_uv_ho_%1):
SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
push rbx
%ifidn %1, ssse3
%ifndef GET_GOT_SAVE_ARG
push rbx
%endif
GET_GOT rbx
%endif
; end prolog
@@ -510,16 +507,13 @@ sym(vp8_intra_pred_uv_ho_%1):
%ifidn %1, ssse3
lea rdx, [rcx*3]
movdqa xmm2, [GLOBAL(dc_00001111)]
lea rbx, [rax*3]
%endif
%ifidn %1, mmx2
.vp8_intra_pred_uv_ho_%1_loop:
mov bl, [rsi]
movd mm0, ebx
mov bl, [rsi+rax]
movd mm1, ebx
movd mm0, [rsi]
movd mm1, [rsi+rax]
punpcklbw mm0, mm0
punpcklbw mm1, mm1
pshufw mm0, mm0, 0x0
@@ -531,19 +525,10 @@ sym(vp8_intra_pred_uv_ho_%1):
dec edx
jnz .vp8_intra_pred_uv_ho_%1_loop
%else
mov bl, [rsi]
movd xmm0, ebx
mov bl, [rsi+rax]
movd xmm3, ebx
mov bl, [rsi+rax*2]
movd xmm1, ebx
lea rbx, [rax*3]
mov bl, [rsi+rbx]
movd xmm4, ebx
movd xmm0, [rsi]
movd xmm3, [rsi+rax]
movd xmm1, [rsi+rax*2]
movd xmm4, [rsi+rbx]
punpcklbw xmm0, xmm3
punpcklbw xmm1, xmm4
pshufb xmm0, xmm2
@@ -554,20 +539,10 @@ sym(vp8_intra_pred_uv_ho_%1):
movhps [rdi+rdx], xmm1
lea rsi, [rsi+rax*4]
lea rdi, [rdi+rcx*4]
mov bl, [rsi]
movd xmm0, ebx
mov bl, [rsi+rax]
movd xmm3, ebx
mov bl, [rsi+rax*2]
movd xmm1, ebx
lea rbx, [rax*3]
mov bl, [rsi+rbx]
movd xmm4, ebx
movd xmm0, [rsi]
movd xmm3, [rsi+rax]
movd xmm1, [rsi+rax*2]
movd xmm4, [rsi+rbx]
punpcklbw xmm0, xmm3
punpcklbw xmm1, xmm4
pshufb xmm0, xmm2
@@ -581,8 +556,10 @@ sym(vp8_intra_pred_uv_ho_%1):
; begin epilog
%ifidn %1, ssse3
RESTORE_GOT
%endif
%ifndef GET_GOT_SAVE_ARG
pop rbx
%endif
%endif
pop rdi
pop rsi
UNSHADOW_ARGS
@@ -916,7 +893,6 @@ sym(vp8_intra_pred_y_tm_%1):
SAVE_XMM 7
push rsi
push rdi
push rbx
GET_GOT rbx
; end prolog
@@ -950,11 +926,8 @@ sym(vp8_intra_pred_y_tm_%1):
mov rdi, arg(0) ;dst;
movsxd rcx, dword ptr arg(1) ;dst_stride
vp8_intra_pred_y_tm_%1_loop:
mov bl, [rsi]
movd xmm4, ebx
mov bl, [rsi+rax]
movd xmm5, ebx
movd xmm4, [rsi]
movd xmm5, [rsi+rax]
%ifidn %1, sse2
punpcklbw xmm4, xmm0
punpcklbw xmm5, xmm0
@@ -983,7 +956,6 @@ vp8_intra_pred_y_tm_%1_loop:
; begin epilog
RESTORE_GOT
pop rbx
pop rdi
pop rsi
RESTORE_XMM
@@ -1057,7 +1029,6 @@ sym(vp8_intra_pred_y_ho_sse2):
SHADOW_ARGS_TO_STACK 5
push rsi
push rdi
push rbx
; end prolog
;arg(2) not used
@@ -1070,11 +1041,8 @@ sym(vp8_intra_pred_y_ho_sse2):
movsxd rcx, dword ptr arg(1) ;dst_stride
vp8_intra_pred_y_ho_sse2_loop:
mov bl, [rsi]
movd xmm0, ebx
mov bl, [rsi+rax]
movd xmm1, ebx
movd xmm0, [rsi]
movd xmm1, [rsi+rax]
; FIXME use pshufb for ssse3 version
punpcklbw xmm0, xmm0
punpcklbw xmm1, xmm1
@@ -1090,7 +1058,6 @@ vp8_intra_pred_y_ho_sse2_loop:
jnz vp8_intra_pred_y_ho_sse2_loop
; begin epilog
pop rbx
pop rdi
pop rsi
UNSHADOW_ARGS

View file

@@ -342,8 +342,8 @@ sym(vp8_get4x4var_mmx):
movsxd rdx, dword ptr arg(3) ;[recon_stride]
; Row 1
movd mm0, [rax] ; Copy four bytes to mm0
movd mm1, [rbx] ; Copy four bytes to mm1
movq mm0, [rax] ; Copy eight bytes to mm0
movq mm1, [rbx] ; Copy eight bytes to mm1
punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
@@ -351,12 +351,12 @@ sym(vp8_get4x4var_mmx):
pmaddwd mm0, mm0 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movd mm1, [rbx] ; Copy four bytes to mm1
movq mm1, [rbx] ; Copy eight bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
; Row 2
movd mm0, [rax] ; Copy four bytes to mm0
movq mm0, [rax] ; Copy eight bytes to mm0
punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
@@ -365,12 +365,12 @@ sym(vp8_get4x4var_mmx):
pmaddwd mm0, mm0 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movd mm1, [rbx] ; Copy four bytes to mm1
movq mm1, [rbx] ; Copy eight bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
; Row 3
movd mm0, [rax] ; Copy four bytes to mm0
punpcklbw mm0, mm6 ; unpack to higher precision
movq mm0, [rax] ; Copy eight bytes to mm0
punpcklbw mm0, mm6 ; unpack to higher prrcision
punpcklbw mm1, mm6
psubsw mm0, mm1 ; A-B (low order) to MM0
paddw mm5, mm0 ; accumulate differences in mm5
@@ -378,11 +378,11 @@ sym(vp8_get4x4var_mmx):
pmaddwd mm0, mm0 ; square and accumulate
add rbx,rdx ; Inc pointer into ref data
add rax,rcx ; Inc pointer into the new data
movd mm1, [rbx] ; Copy four bytes to mm1
movq mm1, [rbx] ; Copy eight bytes to mm1
paddd mm7, mm0 ; accumulate in mm7
; Row 4
movd mm0, [rax] ; Copy four bytes to mm0
movq mm0, [rax] ; Copy eight bytes to mm0
punpcklbw mm0, mm6 ; unpack to higher precision
punpcklbw mm1, mm6

View file

@@ -10,6 +10,7 @@
#include "vpx_config.h"
#include "vp8/common/variance.h"
#include "vp8/common/pragmas.h"
#include "vpx_ports/mem.h"
#include "vp8/common/x86/filter_x86.h"

View file

@@ -10,6 +10,7 @@
#include "vpx_config.h"
#include "vp8/common/variance.h"
#include "vp8/common/pragmas.h"
#include "vpx_ports/mem.h"
#include "vp8/common/x86/filter_x86.h"

View file

@@ -10,6 +10,7 @@
#include "vpx_config.h"
#include "vp8/common/variance.h"
#include "vp8/common/pragmas.h"
#include "vpx_ports/mem.h"
extern unsigned int vp8_get16x16var_sse2

View file

@@ -10,12 +10,11 @@
#include "dboolhuff.h"
#include "vp8/common/common.h"
int vp8dx_start_decode(BOOL_DECODER *br,
const unsigned char *source,
unsigned int source_sz,
vpx_decrypt_cb decrypt_cb,
vp8_decrypt_cb *decrypt_cb,
void *decrypt_state)
{
br->user_buffer_end = source+source_sz;
@@ -40,7 +39,7 @@ void vp8dx_bool_decoder_fill(BOOL_DECODER *br)
const unsigned char *bufptr = br->user_buffer;
VP8_BD_VALUE value = br->value;
int count = br->count;
int shift = VP8_BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT);
int shift = VP8_BD_VALUE_SIZE - 8 - (count + 8);
size_t bytes_left = br->user_buffer_end - bufptr;
size_t bits_left = bytes_left * CHAR_BIT;
int x = (int)(shift + CHAR_BIT - bits_left);
@@ -48,7 +47,7 @@ void vp8dx_bool_decoder_fill(BOOL_DECODER *br)
unsigned char decrypted[sizeof(VP8_BD_VALUE) + 1];
if (br->decrypt_cb) {
size_t n = MIN(sizeof(decrypted), bytes_left);
size_t n = bytes_left > sizeof(decrypted) ? sizeof(decrypted) : bytes_left;
br->decrypt_cb(br->decrypt_state, bufptr, decrypted, (int)n);
bufptr = decrypted;
}
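For context on the shift arithmetic in this hunk: vp8dx_bool_decoder_fill splices whole source bytes into the MSB-aligned value accumulator until less than one byte of room remains. A simplified sketch of the refill loop that follows this fragment, reusing the variables declared above (not the verbatim dboolhuff.c body):

/* simplified refill: splice whole bytes into the big-endian accumulator */
while (shift >= 0 && bits_left >= CHAR_BIT) {
    value |= (VP8_BD_VALUE)*bufptr++ << shift;
    shift -= CHAR_BIT;
    count += CHAR_BIT;
    bits_left -= CHAR_BIT;
}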

View file

@@ -17,7 +17,6 @@
#include "vpx_config.h"
#include "vpx_ports/mem.h"
#include "vpx/vp8dx.h"
#include "vpx/vpx_integer.h"
#ifdef __cplusplus
@@ -33,6 +32,12 @@ typedef size_t VP8_BD_VALUE;
Even relatively modest values like 100 would work fine.*/
#define VP8_LOTS_OF_BITS (0x40000000)
/*Decrypt n bytes of data from input -> output, using the decrypt_state
passed in VP8D_SET_DECRYPTOR.
*/
typedef void (vp8_decrypt_cb)(void *decrypt_state, const unsigned char *input,
unsigned char *output, int count);
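A usage sketch for the callback type above (my_decrypt is hypothetical; in this revision the callback and its opaque state are handed to the decoder through the VP8D_SET_DECRYPTOR control declared in vpx/vp8dx.h):

#include <string.h>

/* Hypothetical pass-through "decryptor": simply copies count bytes. */
static void my_decrypt(void *state, const unsigned char *input,
                       unsigned char *output, int count)
{
    (void)state;
    memcpy(output, input, count);
}

Registration would then look like vp8_decrypt_init init = { my_decrypt, NULL }; followed by vpx_codec_control(&decoder, VP8D_SET_DECRYPTOR, &init);, assuming the vp8_decrypt_init struct of this header revision.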
typedef struct
{
const unsigned char *user_buffer_end;
@@ -40,7 +45,7 @@ typedef struct
VP8_BD_VALUE value;
int count;
unsigned int range;
vpx_decrypt_cb decrypt_cb;
vp8_decrypt_cb *decrypt_cb;
void *decrypt_state;
} BOOL_DECODER;
@@ -49,7 +54,7 @@ DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]);
int vp8dx_start_decode(BOOL_DECODER *br,
const unsigned char *source,
unsigned int source_sz,
vpx_decrypt_cb decrypt_cb,
vp8_decrypt_cb *decrypt_cb,
void *decrypt_state);
void vp8dx_bool_decoder_fill(BOOL_DECODER *br);

View file

@@ -17,7 +17,6 @@
#include "vp8/common/reconintra4x4.h"
#include "vp8/common/reconinter.h"
#include "detokenize.h"
#include "vp8/common/common.h"
#include "vp8/common/invtrans.h"
#include "vp8/common/alloccommon.h"
#include "vp8/common/entropymode.h"
@@ -632,17 +631,9 @@ static void decode_mb_rows(VP8D_COMP *pbi)
xd->dst.u_buffer = dst_buffer[1] + recon_uvoffset;
xd->dst.v_buffer = dst_buffer[2] + recon_uvoffset;
if (xd->mode_info_context->mbmi.ref_frame >= LAST_FRAME) {
MV_REFERENCE_FRAME ref = xd->mode_info_context->mbmi.ref_frame;
xd->pre.y_buffer = ref_buffer[ref][0] + recon_yoffset;
xd->pre.u_buffer = ref_buffer[ref][1] + recon_uvoffset;
xd->pre.v_buffer = ref_buffer[ref][2] + recon_uvoffset;
} else {
// ref_frame is INTRA_FRAME, pre buffer should not be used.
xd->pre.y_buffer = 0;
xd->pre.u_buffer = 0;
xd->pre.v_buffer = 0;
}
xd->pre.y_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][0] + recon_yoffset;
xd->pre.u_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][1] + recon_uvoffset;
xd->pre.v_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][2] + recon_uvoffset;
/* propagate errors from reference frames */
xd->corrupted |= ref_fb_corrupted[xd->mode_info_context->mbmi.ref_frame];
@@ -1019,7 +1010,8 @@ int vp8_decode_frame(VP8D_COMP *pbi)
const unsigned char *clear = data;
if (pbi->decrypt_cb)
{
int n = (int)MIN(sizeof(clear_buffer), data_end - data);
int n = (int)(data_end - data);
if (n > 10) n = 10;
pbi->decrypt_cb(pbi->decrypt_state, data, clear_buffer, n);
clear = clear_buffer;
}

View file

@@ -15,7 +15,9 @@
#include "decodemv.h"
#include "vpx_mem/vpx_mem.h"
#include "vp8/common/findnearmv.h"
#include "vp8/common/common.h"
#define MIN(x,y) (((x)<(y))?(x):(y))
#define MAX(x,y) (((x)>(y))?(x):(y))
#define FLOOR(x,q) ((x) & -(1 << (q)))

View file

@@ -178,6 +178,12 @@ vpx_codec_err_t vp8dx_set_reference(VP8D_COMP *pbi, enum vpx_ref_frame_type ref_
return pbi->common.error.error_code;
}
/*For ARM NEON, d8-d15 are callee-saved registers, and need to be saved by us.*/
#if HAVE_NEON
extern void vp8_push_neon(int64_t *store);
extern void vp8_pop_neon(int64_t *store);
#endif
static int get_free_fb (VP8_COMMON *cm)
{
int i;
@@ -301,6 +307,9 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size,
const uint8_t *source,
int64_t time_stamp)
{
#if HAVE_NEON
int64_t dx_store_reg[8];
#endif
VP8_COMMON *cm = &pbi->common;
int retcode = -1;
@@ -310,6 +319,15 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size,
if(retcode <= 0)
return retcode;
#if HAVE_NEON
#if CONFIG_RUNTIME_CPU_DETECT
if (cm->cpu_caps & HAS_NEON)
#endif
{
vp8_push_neon(dx_store_reg);
}
#endif
cm->new_fb_idx = get_free_fb (cm);
/* setup reference frames for vp8_decode_frame */
@@ -385,8 +403,16 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size,
pbi->last_time_stamp = time_stamp;
decode_exit:
#if HAVE_NEON
#if CONFIG_RUNTIME_CPU_DETECT
if (cm->cpu_caps & HAS_NEON)
#endif
{
vp8_pop_neon(dx_store_reg);
}
#endif
pbi->common.error.setjmp = 0;
vp8_clear_system_state();
return retcode;
}
int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags)

View file

@@ -126,7 +126,7 @@ typedef struct VP8D_COMP
int independent_partitions;
int frame_corrupt_residual;
vpx_decrypt_cb decrypt_cb;
vp8_decrypt_cb *decrypt_cb;
void *decrypt_state;
} VP8D_COMP;

View file

@@ -45,31 +45,37 @@
* [16, 255] 3 6 7
*/
int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y,
int mc_running_avg_y_stride,
unsigned char *running_avg_y,
int running_avg_y_stride,
unsigned char *sig, int sig_stride,
unsigned int motion_magnitude,
int increase_denoising) {
int vp8_denoiser_filter_neon(YV12_BUFFER_CONFIG *mc_running_avg,
YV12_BUFFER_CONFIG *running_avg,
MACROBLOCK *signal, unsigned int motion_magnitude,
int y_offset, int uv_offset) {
/* If motion_magnitude is small, make the denoiser more aggressive by
* increasing the adjustment for each level: the level1 adjustment is
* increased, while the deltas stay the same.
*/
int shift_inc = (increase_denoising &&
motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0;
const uint8x16_t v_level1_adjustment = vmovq_n_u8(
(motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3);
const uint8x16_t v_level1_adjustment = vdupq_n_u8(
(motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 : 3);
const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc);
const uint8x16_t v_level1_threshold = vdupq_n_u8(4);
const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
const uint8x16_t v_level3_threshold = vdupq_n_u8(16);
int64x2_t v_sum_diff_total = vdupq_n_s64(0);
/* Local variables for array pointers and strides. */
unsigned char *sig = signal->thismb;
int sig_stride = 16;
unsigned char *mc_running_avg_y = mc_running_avg->y_buffer + y_offset;
int mc_running_avg_y_stride = mc_running_avg->y_stride;
unsigned char *running_avg_y = running_avg->y_buffer + y_offset;
int running_avg_y_stride = running_avg->y_stride;
/* Go over lines. */
int r;
for (r = 0; r < 16; ++r) {
int i;
int sum_diff = 0;
for (i = 0; i < 16; ++i) {
int8x16_t v_sum_diff = vdupq_n_s8(0);
uint8x16_t v_running_avg_y;
/* Load inputs. */
const uint8x16_t v_sig = vld1q_u8(sig);
const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);
@@ -111,9 +117,12 @@ int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y,
v_abs_adjustment);
const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask,
v_abs_adjustment);
uint8x16_t v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment);
v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment);
v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment);
v_sum_diff = vqaddq_s8(v_sum_diff,
vreinterpretq_s8_u8(v_pos_adjustment));
v_sum_diff = vqsubq_s8(v_sum_diff,
vreinterpretq_s8_u8(v_neg_adjustment));
/* Store results. */
vst1q_u8(running_avg_y, v_running_avg_y);
@@ -122,19 +131,23 @@ int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y,
* for this macroblock.
*/
{
const int8x16_t v_sum_diff =
vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment),
vreinterpretq_s8_u8(v_neg_adjustment));
const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff);
const int32x4_t fedc_ba98_7654_3210 =
vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
const int64x2_t fedcba98_76543210 =
vpaddlq_s32(fedc_ba98_7654_3210);
v_sum_diff_total = vqaddq_s64(v_sum_diff_total, fedcba98_76543210);
int s0 = vgetq_lane_s8(v_sum_diff, 0) +
vgetq_lane_s8(v_sum_diff, 1) +
vgetq_lane_s8(v_sum_diff, 2) +
vgetq_lane_s8(v_sum_diff, 3);
int s1 = vgetq_lane_s8(v_sum_diff, 4) +
vgetq_lane_s8(v_sum_diff, 5) +
vgetq_lane_s8(v_sum_diff, 6) +
vgetq_lane_s8(v_sum_diff, 7);
int s2 = vgetq_lane_s8(v_sum_diff, 8) +
vgetq_lane_s8(v_sum_diff, 9) +
vgetq_lane_s8(v_sum_diff, 10) +
vgetq_lane_s8(v_sum_diff, 11);
int s3 = vgetq_lane_s8(v_sum_diff, 12) +
vgetq_lane_s8(v_sum_diff, 13) +
vgetq_lane_s8(v_sum_diff, 14) +
vgetq_lane_s8(v_sum_diff, 15);
sum_diff += s0 + s1+ s2 + s3;
}
/* Update pointers for next iteration. */
@@ -144,335 +157,11 @@ int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y,
}
/* Too much adjustments => copy block. */
{
int64x1_t x = vqadd_s64(vget_high_s64(v_sum_diff_total),
vget_low_s64(v_sum_diff_total));
int sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0);
int sum_diff_thresh = SUM_DIFF_THRESHOLD;
if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
if (sum_diff > sum_diff_thresh) {
// Before returning to copy the block (i.e., apply no denoising),
// check if we can still apply some (weaker) temporal filtering to
// this block, that would otherwise not be denoised at all. Simplest
// is to apply an additional adjustment to running_avg_y to bring it
// closer to sig. The adjustment is capped by a maximum delta, and
// chosen such that in most cases the resulting sum_diff will be
// within the acceptable range given by sum_diff_thresh.
// The delta is set by the excess of absolute pixel diff over the
// threshold.
int delta = ((sum_diff - sum_diff_thresh) >> 8) + 1;
// Only apply the adjustment for max delta up to 3.
if (delta < 4) {
const uint8x16_t k_delta = vmovq_n_u8(delta);
sig -= sig_stride * 16;
mc_running_avg_y -= mc_running_avg_y_stride * 16;
running_avg_y -= running_avg_y_stride * 16;
for (r = 0; r < 16; ++r) {
uint8x16_t v_running_avg_y = vld1q_u8(running_avg_y);
const uint8x16_t v_sig = vld1q_u8(sig);
const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);
/* Calculate absolute difference and sign masks. */
const uint8x16_t v_abs_diff = vabdq_u8(v_sig,
v_mc_running_avg_y);
const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig,
v_mc_running_avg_y);
const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig,
v_mc_running_avg_y);
// Clamp absolute difference to delta to get the adjustment.
const uint8x16_t v_abs_adjustment =
vminq_u8(v_abs_diff, (k_delta));
const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask,
v_abs_adjustment);
const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask,
v_abs_adjustment);
v_running_avg_y = vqsubq_u8(v_running_avg_y, v_pos_adjustment);
v_running_avg_y = vqaddq_u8(v_running_avg_y, v_neg_adjustment);
/* Store results. */
vst1q_u8(running_avg_y, v_running_avg_y);
{
const int8x16_t v_sum_diff =
vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment),
vreinterpretq_s8_u8(v_pos_adjustment));
const int16x8_t fe_dc_ba_98_76_54_32_10 =
vpaddlq_s8(v_sum_diff);
const int32x4_t fedc_ba98_7654_3210 =
vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
const int64x2_t fedcba98_76543210 =
vpaddlq_s32(fedc_ba98_7654_3210);
v_sum_diff_total = vqaddq_s64(v_sum_diff_total,
fedcba98_76543210);
}
/* Update pointers for next iteration. */
sig += sig_stride;
mc_running_avg_y += mc_running_avg_y_stride;
running_avg_y += running_avg_y_stride;
}
{
// Update the sum of all pixel differences of this MB.
x = vqadd_s64(vget_high_s64(v_sum_diff_total),
vget_low_s64(v_sum_diff_total));
sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0);
if (sum_diff > sum_diff_thresh) {
return COPY_BLOCK;
}
}
} else {
return COPY_BLOCK;
}
}
}
if (abs(sum_diff) > SUM_DIFF_THRESHOLD)
return COPY_BLOCK;
/* Tell above level that block was filtered. */
running_avg_y -= running_avg_y_stride * 16;
sig -= sig_stride * 16;
vp8_copy_mem16x16(running_avg_y, running_avg_y_stride, sig, sig_stride);
return FILTER_BLOCK;
}
int vp8_denoiser_filter_uv_neon(unsigned char *mc_running_avg,
int mc_running_avg_stride,
unsigned char *running_avg,
int running_avg_stride,
unsigned char *sig, int sig_stride,
unsigned int motion_magnitude,
int increase_denoising) {
/* If motion_magnitude is small, make the denoiser more aggressive by
* increasing the adjustment for each level: the level1 adjustment is
* increased, while the deltas stay the same.
*/
int shift_inc = (increase_denoising &&
motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ? 1 : 0;
const uint8x16_t v_level1_adjustment = vmovq_n_u8(
(motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ? 4 + shift_inc : 3);
const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc);
const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
const uint8x16_t v_level3_threshold = vdupq_n_u8(16);
int64x2_t v_sum_diff_total = vdupq_n_s64(0);
int r;
{
uint16x4_t v_sum_block = vdup_n_u16(0);
// Avoid denoising the color signal if it's close to the average level.
for (r = 0; r < 8; ++r) {
const uint8x8_t v_sig = vld1_u8(sig);
const uint16x4_t _76_54_32_10 = vpaddl_u8(v_sig);
v_sum_block = vqadd_u16(v_sum_block, _76_54_32_10);
sig += sig_stride;
}
sig -= sig_stride * 8;
{
const uint32x2_t _7654_3210 = vpaddl_u16(v_sum_block);
const uint64x1_t _76543210 = vpaddl_u32(_7654_3210);
const int sum_block =
vget_lane_s32(vreinterpret_s32_u64(_76543210), 0);
if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) {
return COPY_BLOCK;
}
}
}
/* Go over lines. */
for (r = 0; r < 4; ++r) {
/* Load inputs. */
const uint8x8_t v_sig_lo = vld1_u8(sig);
const uint8x8_t v_sig_hi = vld1_u8(&sig[sig_stride]);
const uint8x16_t v_sig = vcombine_u8(v_sig_lo, v_sig_hi);
const uint8x8_t v_mc_running_avg_lo = vld1_u8(mc_running_avg);
const uint8x8_t v_mc_running_avg_hi =
vld1_u8(&mc_running_avg[mc_running_avg_stride]);
const uint8x16_t v_mc_running_avg =
vcombine_u8(v_mc_running_avg_lo, v_mc_running_avg_hi);
/* Calculate absolute difference and sign masks. */
const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg);
const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg);
const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg);
/* Figure out which level that put us in. */
const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold,
v_abs_diff);
const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold,
v_abs_diff);
const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold,
v_abs_diff);
/* Calculate absolute adjustments for level 1, 2 and 3. */
const uint8x16_t v_level2_adjustment = vandq_u8(v_level2_mask,
v_delta_level_1_and_2);
const uint8x16_t v_level3_adjustment = vandq_u8(v_level3_mask,
v_delta_level_2_and_3);
const uint8x16_t v_level1and2_adjustment = vaddq_u8(v_level1_adjustment,
v_level2_adjustment);
const uint8x16_t v_level1and2and3_adjustment = vaddq_u8(
v_level1and2_adjustment, v_level3_adjustment);
/* Figure adjustment absolute value by selecting between the absolute
* difference if in level0 or the value for level 1, 2 and 3.
*/
const uint8x16_t v_abs_adjustment = vbslq_u8(v_level1_mask,
v_level1and2and3_adjustment, v_abs_diff);
/* Calculate positive and negative adjustments. Apply them to the signal
* and accumulate them. Adjustments are less than eight and the maximum
* sum of them (7 * 16) can fit in a signed char.
*/
const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask,
v_abs_adjustment);
const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask,
v_abs_adjustment);
uint8x16_t v_running_avg = vqaddq_u8(v_sig, v_pos_adjustment);
v_running_avg = vqsubq_u8(v_running_avg, v_neg_adjustment);
/* Store results. */
vst1_u8(running_avg, vget_low_u8(v_running_avg));
vst1_u8(&running_avg[running_avg_stride], vget_high_u8(v_running_avg));
/* Sum all the accumulators to have the sum of all pixel differences
* for this macroblock.
*/
{
const int8x16_t v_sum_diff =
vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment),
vreinterpretq_s8_u8(v_neg_adjustment));
const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff);
const int32x4_t fedc_ba98_7654_3210 =
vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
const int64x2_t fedcba98_76543210 =
vpaddlq_s32(fedc_ba98_7654_3210);
v_sum_diff_total = vqaddq_s64(v_sum_diff_total, fedcba98_76543210);
}
/* Update pointers for next iteration. */
sig += sig_stride * 2;
mc_running_avg += mc_running_avg_stride * 2;
running_avg += running_avg_stride * 2;
}
/* Too much adjustments => copy block. */
{
int64x1_t x = vqadd_s64(vget_high_s64(v_sum_diff_total),
vget_low_s64(v_sum_diff_total));
int sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0);
int sum_diff_thresh = SUM_DIFF_THRESHOLD_UV;
if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;
if (sum_diff > sum_diff_thresh) {
// Before returning to copy the block (i.e., apply no denoising),
// check if we can still apply some (weaker) temporal filtering to
// this block, that would otherwise not be denoised at all. Simplest
// is to apply an additional adjustment to running_avg_y to bring it
// closer to sig. The adjustment is capped by a maximum delta, and
// chosen such that in most cases the resulting sum_diff will be
// within the acceptable range given by sum_diff_thresh.
// The delta is set by the excess of absolute pixel diff over the
// threshold.
int delta = ((sum_diff - sum_diff_thresh) >> 8) + 1;
// Only apply the adjustment for max delta up to 3.
if (delta < 4) {
const uint8x16_t k_delta = vmovq_n_u8(delta);
sig -= sig_stride * 8;
mc_running_avg -= mc_running_avg_stride * 8;
running_avg -= running_avg_stride * 8;
for (r = 0; r < 4; ++r) {
const uint8x8_t v_sig_lo = vld1_u8(sig);
const uint8x8_t v_sig_hi = vld1_u8(&sig[sig_stride]);
const uint8x16_t v_sig = vcombine_u8(v_sig_lo, v_sig_hi);
const uint8x8_t v_mc_running_avg_lo = vld1_u8(mc_running_avg);
const uint8x8_t v_mc_running_avg_hi =
vld1_u8(&mc_running_avg[mc_running_avg_stride]);
const uint8x16_t v_mc_running_avg =
vcombine_u8(v_mc_running_avg_lo, v_mc_running_avg_hi);
/* Calculate absolute difference and sign masks. */
const uint8x16_t v_abs_diff = vabdq_u8(v_sig,
v_mc_running_avg);
const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig,
v_mc_running_avg);
const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig,
v_mc_running_avg);
// Clamp absolute difference to delta to get the adjustment.
const uint8x16_t v_abs_adjustment =
vminq_u8(v_abs_diff, (k_delta));
const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask,
v_abs_adjustment);
const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask,
v_abs_adjustment);
const uint8x8_t v_running_avg_lo = vld1_u8(running_avg);
const uint8x8_t v_running_avg_hi =
vld1_u8(&running_avg[running_avg_stride]);
uint8x16_t v_running_avg =
vcombine_u8(v_running_avg_lo, v_running_avg_hi);
v_running_avg = vqsubq_u8(v_running_avg, v_pos_adjustment);
v_running_avg = vqaddq_u8(v_running_avg, v_neg_adjustment);
/* Store results. */
vst1_u8(running_avg, vget_low_u8(v_running_avg));
vst1_u8(&running_avg[running_avg_stride],
vget_high_u8(v_running_avg));
{
const int8x16_t v_sum_diff =
vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment),
vreinterpretq_s8_u8(v_pos_adjustment));
const int16x8_t fe_dc_ba_98_76_54_32_10 =
vpaddlq_s8(v_sum_diff);
const int32x4_t fedc_ba98_7654_3210 =
vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
const int64x2_t fedcba98_76543210 =
vpaddlq_s32(fedc_ba98_7654_3210);
v_sum_diff_total = vqaddq_s64(v_sum_diff_total,
fedcba98_76543210);
}
/* Update pointers for next iteration. */
sig += sig_stride * 2;
mc_running_avg += mc_running_avg_stride * 2;
running_avg += running_avg_stride * 2;
}
{
// Update the sum of all pixel differences of this MB.
x = vqadd_s64(vget_high_s64(v_sum_diff_total),
vget_low_s64(v_sum_diff_total));
sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0);
if (sum_diff > sum_diff_thresh) {
return COPY_BLOCK;
}
}
} else {
return COPY_BLOCK;
}
}
}
/* Tell above level that block was filtered. */
running_avg -= running_avg_stride * 8;
sig -= sig_stride * 8;
vp8_copy_mem8x8(running_avg, running_avg_stride, sig, sig_stride);
vp8_copy_mem16x16(running_avg->y_buffer + y_offset, running_avg_y_stride,
signal->thismb, sig_stride);
return FILTER_BLOCK;
}
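A scalar model of the per-pixel adjustment the NEON code above vectorizes, using the 4/8/16 thresholds visible in the level masks (illustrative sketch only; clamp8 and denoise_pixel are hypothetical):

static unsigned char clamp8(int v)
{
    return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* |diff| < 4 moves all the way to the mc pixel; 4/8/16 pick 3..7 steps. */
static unsigned char denoise_pixel(unsigned char sig, unsigned char mc,
                                   unsigned int motion_magnitude)
{
    int diff = mc - sig;
    int a = diff < 0 ? -diff : diff;
    int adj;
    if (a < 4) {
        adj = a;                       /* level 0: copy the mc value */
    } else {
        adj = motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD ? 4 : 3;
        if (a >= 8)  adj += 1;         /* level 2 */
        if (a >= 16) adj += 2;         /* level 3 */
    }
    return diff > 0 ? clamp8(sig + adj) : clamp8(sig - adj);
}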

View file

@@ -0,0 +1,46 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vp8/common/loopfilter.h"
#include "vpx_scale/yv12config.h"
extern void vp8_memcpy_partial_neon(unsigned char *dst_ptr,
unsigned char *src_ptr,
int sz);
void vp8_yv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc,
YV12_BUFFER_CONFIG *dst_ybc)
{
unsigned char *src_y, *dst_y;
int yheight;
int ystride;
int yoffset;
int linestocopy;
yheight = src_ybc->y_height;
ystride = src_ybc->y_stride;
/* number of MB rows to use in partial filtering */
linestocopy = (yheight >> 4) / PARTIAL_FRAME_FRACTION;
linestocopy = linestocopy ? linestocopy << 4 : 16; /* 16 lines per MB */
/* Copy an extra 4 lines so that full filter context is available if
 * filtering is done on the copied partial frame and not the original.
 * The partial filter does MB filtering for the top row as well, which
 * can modify 3 pixels above.
 */
linestocopy += 4;
/* partial image starts at ~middle of frame (macroblock border) */
yoffset = ystride * (((yheight >> 5) * 16) - 4);
src_y = src_ybc->y_buffer + yoffset;
dst_y = dst_ybc->y_buffer + yoffset;
vp8_memcpy_partial_neon(dst_y, src_y, ystride * linestocopy);
}
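A worked example of the sizing arithmetic above (not part of the diff; PARTIAL_FRAME_FRACTION is assumed to be 8, its value in vp8/common/loopfilter.h):

/* yheight = 720:
 *   (720 >> 4) / 8 = 5 MB rows   ->  5 << 4 = 80 lines
 *   + 4 lines of filter context  ->  84 lines copied
 *   yoffset = ystride * ((720 >> 5) * 16 - 4) = ystride * 348,
 *   i.e. the copy starts 4 lines above a macroblock boundary near the
 *   middle of the frame. */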

View file

@ -0,0 +1,221 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_short_fdct4x4_neon|
EXPORT |vp8_short_fdct8x4_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=4
ALIGN 16 ; enable use of @128 bit aligned loads
coeff
DCW 5352, 5352, 5352, 5352
DCW 2217, 2217, 2217, 2217
DCD 14500, 14500, 14500, 14500
DCD 7500, 7500, 7500, 7500
DCD 12000, 12000, 12000, 12000
DCD 51000, 51000, 51000, 51000
;void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
|vp8_short_fdct4x4_neon| PROC
; Part one
vld1.16 {d0}, [r0@64], r2
adr r12, coeff
vld1.16 {d1}, [r0@64], r2
vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217
vld1.16 {d2}, [r0@64], r2
vld1.32 {q9, q10}, [r12@128]! ; q9=14500, q10=7500
vld1.16 {d3}, [r0@64], r2
; transpose d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
vtrn.32 d0, d2
vtrn.32 d1, d3
vld1.32 {q11,q12}, [r12@128] ; q11=12000, q12=51000
vtrn.16 d0, d1
vtrn.16 d2, d3
vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[3]
vadd.s16 d5, d1, d2 ; b1 = ip[1] + ip[2]
vsub.s16 d6, d1, d2 ; c1 = ip[1] - ip[2]
vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[3]
vshl.s16 q2, q2, #3 ; (a1, b1) << 3
vshl.s16 q3, q3, #3 ; (c1, d1) << 3
vadd.s16 d0, d4, d5 ; op[0] = a1 + b1
vsub.s16 d2, d4, d5 ; op[2] = a1 - b1
vmlal.s16 q9, d7, d16 ; d1*5352 + 14500
vmlal.s16 q10, d7, d17 ; d1*2217 + 7500
vmlal.s16 q9, d6, d17 ; c1*2217 + d1*5352 + 14500
vmlsl.s16 q10, d6, d16 ; d1*2217 - c1*5352 + 7500
vshrn.s32 d1, q9, #12 ; op[1] = (c1*2217 + d1*5352 + 14500)>>12
vshrn.s32 d3, q10, #12 ; op[3] = (d1*2217 - c1*5352 + 7500)>>12
; Part two
; transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
vtrn.32 d0, d2
vtrn.32 d1, d3
vtrn.16 d0, d1
vtrn.16 d2, d3
vmov.s16 d26, #7
vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[12]
vadd.s16 d5, d1, d2 ; b1 = ip[4] + ip[8]
vsub.s16 d6, d1, d2 ; c1 = ip[4] - ip[8]
vadd.s16 d4, d4, d26 ; a1 + 7
vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[12]
vadd.s16 d0, d4, d5 ; op[0] = a1 + b1 + 7
vsub.s16 d2, d4, d5 ; op[8] = a1 - b1 + 7
vmlal.s16 q11, d7, d16 ; d1*5352 + 12000
vmlal.s16 q12, d7, d17 ; d1*2217 + 51000
vceq.s16 d4, d7, #0
vshr.s16 d0, d0, #4
vshr.s16 d2, d2, #4
vmlal.s16 q11, d6, d17 ; c1*2217 + d1*5352 + 12000
vmlsl.s16 q12, d6, d16 ; d1*2217 - c1*5352 + 51000
vmvn d4, d4
vshrn.s32 d1, q11, #16 ; op[4] = (c1*2217 + d1*5352 + 12000)>>16
vsub.s16 d1, d1, d4 ; op[4] += (d1!=0)
vshrn.s32 d3, q12, #16 ; op[12]= (d1*2217 - c1*5352 + 51000)>>16
vst1.16 {q0, q1}, [r1@128]
bx lr
ENDP
;void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
|vp8_short_fdct8x4_neon| PROC
; Part one
vld1.16 {q0}, [r0@128], r2
adr r12, coeff
vld1.16 {q1}, [r0@128], r2
vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217
vld1.16 {q2}, [r0@128], r2
vld1.32 {q9, q10}, [r12@128]! ; q9=14500, q10=7500
vld1.16 {q3}, [r0@128], r2
; transpose q0=ip[0], q1=ip[1], q2=ip[2], q3=ip[3]
vtrn.32 q0, q2 ; [A0|B0]
vtrn.32 q1, q3 ; [A1|B1]
vtrn.16 q0, q1 ; [A2|B2]
vtrn.16 q2, q3 ; [A3|B3]
vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[3]
vadd.s16 q12, q1, q2 ; b1 = ip[1] + ip[2]
vsub.s16 q13, q1, q2 ; c1 = ip[1] - ip[2]
vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[3]
vshl.s16 q11, q11, #3 ; a1 << 3
vshl.s16 q12, q12, #3 ; b1 << 3
vshl.s16 q13, q13, #3 ; c1 << 3
vshl.s16 q14, q14, #3 ; d1 << 3
vadd.s16 q0, q11, q12 ; [A0 | B0] = a1 + b1
vsub.s16 q2, q11, q12 ; [A2 | B2] = a1 - b1
vmov.s16 q11, q9 ; 14500
vmov.s16 q12, q10 ; 7500
vmlal.s16 q9, d28, d16 ; A[1] = d1*5352 + 14500
vmlal.s16 q10, d28, d17 ; A[3] = d1*2217 + 7500
vmlal.s16 q11, d29, d16 ; B[1] = d1*5352 + 14500
vmlal.s16 q12, d29, d17 ; B[3] = d1*2217 + 7500
vmlal.s16 q9, d26, d17 ; A[1] = c1*2217 + d1*5352 + 14500
vmlsl.s16 q10, d26, d16 ; A[3] = d1*2217 - c1*5352 + 7500
vmlal.s16 q11, d27, d17 ; B[1] = c1*2217 + d1*5352 + 14500
vmlsl.s16 q12, d27, d16 ; B[3] = d1*2217 - c1*5352 + 7500
vshrn.s32 d2, q9, #12 ; A[1] = (c1*2217 + d1*5352 + 14500)>>12
vshrn.s32 d6, q10, #12 ; A[3] = (d1*2217 - c1*5352 + 7500)>>12
vshrn.s32 d3, q11, #12 ; B[1] = (c1*2217 + d1*5352 + 14500)>>12
vshrn.s32 d7, q12, #12 ; B[3] = (d1*2217 - c1*5352 + 7500)>>12
; Part two
vld1.32 {q9,q10}, [r12@128] ; q9=12000, q10=51000
; transpose q0=ip[0], q1=ip[4], q2=ip[8], q3=ip[12]
vtrn.32 q0, q2 ; q0=[A0 | B0]
vtrn.32 q1, q3 ; q1=[A4 | B4]
vtrn.16 q0, q1 ; q2=[A8 | B8]
vtrn.16 q2, q3 ; q3=[A12|B12]
vmov.s16 q15, #7
vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[12]
vadd.s16 q12, q1, q2 ; b1 = ip[4] + ip[8]
vadd.s16 q11, q11, q15 ; a1 + 7
vsub.s16 q13, q1, q2 ; c1 = ip[4] - ip[8]
vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[12]
vadd.s16 q0, q11, q12 ; a1 + b1 + 7
vsub.s16 q1, q11, q12 ; a1 - b1 + 7
vmov.s16 q11, q9 ; 12000
vmov.s16 q12, q10 ; 51000
vshr.s16 d0, d0, #4 ; A[0] = (a1 + b1 + 7)>>4
vshr.s16 d4, d1, #4 ; B[0] = (a1 + b1 + 7)>>4
vshr.s16 d2, d2, #4 ; A[8] = (a1 + b1 + 7)>>4
vshr.s16 d6, d3, #4 ; B[8] = (a1 + b1 + 7)>>4
vmlal.s16 q9, d28, d16 ; A[4] = d1*5352 + 12000
vmlal.s16 q10, d28, d17 ; A[12] = d1*2217 + 51000
vmlal.s16 q11, d29, d16 ; B[4] = d1*5352 + 12000
vmlal.s16 q12, d29, d17 ; B[12] = d1*2217 + 51000
vceq.s16 q14, q14, #0
vmlal.s16 q9, d26, d17 ; A[4] = c1*2217 + d1*5352 + 12000
vmlsl.s16 q10, d26, d16 ; A[12] = d1*2217 - c1*5352 + 51000
vmlal.s16 q11, d27, d17 ; B[4] = c1*2217 + d1*5352 + 12000
vmlsl.s16 q12, d27, d16 ; B[12] = d1*2217 - c1*5352 + 51000
vmvn q14, q14
vshrn.s32 d1, q9, #16 ; A[4] = (c1*2217 + d1*5352 + 12000)>>16
vshrn.s32 d3, q10, #16 ; A[12]= (d1*2217 - c1*5352 + 51000)>>16
vsub.s16 d1, d1, d28 ; A[4] += (d1!=0)
vshrn.s32 d5, q11, #16 ; B[4] = (c1*2217 + d1*5352 + 12000)>>16
vshrn.s32 d7, q12, #16 ; B[12]= (d1*2217 - c1*5352 + 51000)>>16
vsub.s16 d5, d5, d29 ; B[4] += (d1!=0)
vst1.16 {q0, q1}, [r1@128]! ; block A
vst1.16 {q2, q3}, [r1@128]! ; block B
bx lr
ENDP
END
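For reference (not part of the diff), the coefficient table above (5352 and 2217, with rounding terms 14500/7500 in part one and 12000/51000 in part two) matches the scalar transform spelled out in the instruction comments. A scalar sketch reconstructed from those comments; treat it as illustrative rather than as the canonical vp8_short_fdct4x4_c:

/* Scalar sketch of the 4x4 forward DCT computed by the routine above.
 * pitch is in bytes; input samples are 16-bit, hence pitch / 2. */
static void short_fdct4x4_sketch(short *input, short *output, int pitch)
{
    int i;
    short *ip = input;
    short *op = output;

    for (i = 0; i < 4; i++) {   /* Part one: rows */
        int a1 = (ip[0] + ip[3]) << 3;
        int b1 = (ip[1] + ip[2]) << 3;
        int c1 = (ip[1] - ip[2]) << 3;
        int d1 = (ip[0] - ip[3]) << 3;

        op[0] = (short)(a1 + b1);
        op[2] = (short)(a1 - b1);
        op[1] = (short)((c1 * 2217 + d1 * 5352 + 14500) >> 12);
        op[3] = (short)((d1 * 2217 - c1 * 5352 + 7500) >> 12);

        ip += pitch / 2;
        op += 4;
    }

    ip = output;
    op = output;
    for (i = 0; i < 4; i++) {   /* Part two: columns */
        int a1 = ip[0] + ip[12];
        int b1 = ip[4] + ip[8];
        int c1 = ip[4] - ip[8];
        int d1 = ip[0] - ip[12];

        op[0]  = (short)((a1 + b1 + 7) >> 4);
        op[8]  = (short)((a1 - b1 + 7) >> 4);
        op[4]  = (short)(((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0));
        op[12] = (short)((d1 * 2217 - c1 * 5352 + 51000) >> 16);

        ip++;
        op++;
    }
}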

View file

@ -1,269 +0,0 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
void vp8_short_fdct4x4_neon(
int16_t *input,
int16_t *output,
int pitch) {
int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
int16x4_t d16s16, d17s16, d26s16, dEmptys16;
uint16x4_t d4u16;
int16x8_t q0s16, q1s16;
int32x4_t q9s32, q10s32, q11s32, q12s32;
int16x4x2_t v2tmp0, v2tmp1;
int32x2x2_t v2tmp2, v2tmp3;
d16s16 = vdup_n_s16(5352);
d17s16 = vdup_n_s16(2217);
q9s32 = vdupq_n_s32(14500);
q10s32 = vdupq_n_s32(7500);
q11s32 = vdupq_n_s32(12000);
q12s32 = vdupq_n_s32(51000);
// Part one
pitch >>= 1;
d0s16 = vld1_s16(input);
input += pitch;
d1s16 = vld1_s16(input);
input += pitch;
d2s16 = vld1_s16(input);
input += pitch;
d3s16 = vld1_s16(input);
v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
vreinterpret_s32_s16(d2s16));
v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
vreinterpret_s32_s16(d3s16));
v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]), // d0
vreinterpret_s16_s32(v2tmp3.val[0])); // d1
v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]), // d2
vreinterpret_s16_s32(v2tmp3.val[1])); // d3
d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
d4s16 = vshl_n_s16(d4s16, 3);
d5s16 = vshl_n_s16(d5s16, 3);
d6s16 = vshl_n_s16(d6s16, 3);
d7s16 = vshl_n_s16(d7s16, 3);
d0s16 = vadd_s16(d4s16, d5s16);
d2s16 = vsub_s16(d4s16, d5s16);
q9s32 = vmlal_s16(q9s32, d7s16, d16s16);
q10s32 = vmlal_s16(q10s32, d7s16, d17s16);
q9s32 = vmlal_s16(q9s32, d6s16, d17s16);
q10s32 = vmlsl_s16(q10s32, d6s16, d16s16);
d1s16 = vshrn_n_s32(q9s32, 12);
d3s16 = vshrn_n_s32(q10s32, 12);
// Part two
v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
vreinterpret_s32_s16(d2s16));
v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
vreinterpret_s32_s16(d3s16));
v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]), // d0
vreinterpret_s16_s32(v2tmp3.val[0])); // d1
v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]), // d2
vreinterpret_s16_s32(v2tmp3.val[1])); // d3
d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
d26s16 = vdup_n_s16(7);
d4s16 = vadd_s16(d4s16, d26s16);
d0s16 = vadd_s16(d4s16, d5s16);
d2s16 = vsub_s16(d4s16, d5s16);
q11s32 = vmlal_s16(q11s32, d7s16, d16s16);
q12s32 = vmlal_s16(q12s32, d7s16, d17s16);
dEmptys16 = vdup_n_s16(0);
d4u16 = vceq_s16(d7s16, dEmptys16);
d0s16 = vshr_n_s16(d0s16, 4);
d2s16 = vshr_n_s16(d2s16, 4);
q11s32 = vmlal_s16(q11s32, d6s16, d17s16);
q12s32 = vmlsl_s16(q12s32, d6s16, d16s16);
d4u16 = vmvn_u16(d4u16);
d1s16 = vshrn_n_s32(q11s32, 16);
d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d4u16));
d3s16 = vshrn_n_s32(q12s32, 16);
q0s16 = vcombine_s16(d0s16, d1s16);
q1s16 = vcombine_s16(d2s16, d3s16);
vst1q_s16(output, q0s16);
vst1q_s16(output + 8, q1s16);
return;
}
void vp8_short_fdct8x4_neon(
int16_t *input,
int16_t *output,
int pitch) {
int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
int16x4_t d16s16, d17s16, d26s16, d27s16, d28s16, d29s16;
uint16x4_t d28u16, d29u16;
uint16x8_t q14u16;
int16x8_t q0s16, q1s16, q2s16, q3s16;
int16x8_t q11s16, q12s16, q13s16, q14s16, q15s16, qEmptys16;
int32x4_t q9s32, q10s32, q11s32, q12s32;
int16x8x2_t v2tmp0, v2tmp1;
int32x4x2_t v2tmp2, v2tmp3;
d16s16 = vdup_n_s16(5352);
d17s16 = vdup_n_s16(2217);
q9s32 = vdupq_n_s32(14500);
q10s32 = vdupq_n_s32(7500);
// Part one
pitch >>= 1;
q0s16 = vld1q_s16(input);
input += pitch;
q1s16 = vld1q_s16(input);
input += pitch;
q2s16 = vld1q_s16(input);
input += pitch;
q3s16 = vld1q_s16(input);
v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16),
vreinterpretq_s32_s16(q2s16));
v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16),
vreinterpretq_s32_s16(q3s16));
v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]), // q0
vreinterpretq_s16_s32(v2tmp3.val[0])); // q1
v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]), // q2
vreinterpretq_s16_s32(v2tmp3.val[1])); // q3
q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]);
q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]);
q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]);
q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]);
q11s16 = vshlq_n_s16(q11s16, 3);
q12s16 = vshlq_n_s16(q12s16, 3);
q13s16 = vshlq_n_s16(q13s16, 3);
q14s16 = vshlq_n_s16(q14s16, 3);
q0s16 = vaddq_s16(q11s16, q12s16);
q2s16 = vsubq_s16(q11s16, q12s16);
q11s32 = q9s32;
q12s32 = q10s32;
d26s16 = vget_low_s16(q13s16);
d27s16 = vget_high_s16(q13s16);
d28s16 = vget_low_s16(q14s16);
d29s16 = vget_high_s16(q14s16);
q9s32 = vmlal_s16(q9s32, d28s16, d16s16);
q10s32 = vmlal_s16(q10s32, d28s16, d17s16);
q11s32 = vmlal_s16(q11s32, d29s16, d16s16);
q12s32 = vmlal_s16(q12s32, d29s16, d17s16);
q9s32 = vmlal_s16(q9s32, d26s16, d17s16);
q10s32 = vmlsl_s16(q10s32, d26s16, d16s16);
q11s32 = vmlal_s16(q11s32, d27s16, d17s16);
q12s32 = vmlsl_s16(q12s32, d27s16, d16s16);
d2s16 = vshrn_n_s32(q9s32, 12);
d6s16 = vshrn_n_s32(q10s32, 12);
d3s16 = vshrn_n_s32(q11s32, 12);
d7s16 = vshrn_n_s32(q12s32, 12);
q1s16 = vcombine_s16(d2s16, d3s16);
q3s16 = vcombine_s16(d6s16, d7s16);
// Part two
q9s32 = vdupq_n_s32(12000);
q10s32 = vdupq_n_s32(51000);
v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16),
vreinterpretq_s32_s16(q2s16));
v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16),
vreinterpretq_s32_s16(q3s16));
v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]), // q0
vreinterpretq_s16_s32(v2tmp3.val[0])); // q1
v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]), // q2
vreinterpretq_s16_s32(v2tmp3.val[1])); // q3
q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]);
q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]);
q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]);
q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]);
q15s16 = vdupq_n_s16(7);
q11s16 = vaddq_s16(q11s16, q15s16);
q0s16 = vaddq_s16(q11s16, q12s16);
q1s16 = vsubq_s16(q11s16, q12s16);
q11s32 = q9s32;
q12s32 = q10s32;
d0s16 = vget_low_s16(q0s16);
d1s16 = vget_high_s16(q0s16);
d2s16 = vget_low_s16(q1s16);
d3s16 = vget_high_s16(q1s16);
d0s16 = vshr_n_s16(d0s16, 4);
d4s16 = vshr_n_s16(d1s16, 4);
d2s16 = vshr_n_s16(d2s16, 4);
d6s16 = vshr_n_s16(d3s16, 4);
d26s16 = vget_low_s16(q13s16);
d27s16 = vget_high_s16(q13s16);
d28s16 = vget_low_s16(q14s16);
d29s16 = vget_high_s16(q14s16);
q9s32 = vmlal_s16(q9s32, d28s16, d16s16);
q10s32 = vmlal_s16(q10s32, d28s16, d17s16);
q11s32 = vmlal_s16(q11s32, d29s16, d16s16);
q12s32 = vmlal_s16(q12s32, d29s16, d17s16);
q9s32 = vmlal_s16(q9s32, d26s16, d17s16);
q10s32 = vmlsl_s16(q10s32, d26s16, d16s16);
q11s32 = vmlal_s16(q11s32, d27s16, d17s16);
q12s32 = vmlsl_s16(q12s32, d27s16, d16s16);
d1s16 = vshrn_n_s32(q9s32, 16);
d3s16 = vshrn_n_s32(q10s32, 16);
d5s16 = vshrn_n_s32(q11s32, 16);
d7s16 = vshrn_n_s32(q12s32, 16);
qEmptys16 = vdupq_n_s16(0);
q14u16 = vceqq_s16(q14s16, qEmptys16);
q14u16 = vmvnq_u16(q14u16);
d28u16 = vget_low_u16(q14u16);
d29u16 = vget_high_u16(q14u16);
d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d28u16));
d5s16 = vsub_s16(d5s16, vreinterpret_s16_u16(d29u16));
q0s16 = vcombine_s16(d0s16, d1s16);
q1s16 = vcombine_s16(d2s16, d3s16);
q2s16 = vcombine_s16(d4s16, d5s16);
q3s16 = vcombine_s16(d6s16, d7s16);
vst1q_s16(output, q0s16);
vst1q_s16(output + 8, q1s16);
vst1q_s16(output + 16, q2s16);
vst1q_s16(output + 24, q3s16);
return;
}

View file

@ -0,0 +1,199 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_subtract_b_neon|
EXPORT |vp8_subtract_mby_neon|
EXPORT |vp8_subtract_mbuv_neon|
INCLUDE vp8_asm_enc_offsets.asm
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch)
|vp8_subtract_b_neon| PROC
stmfd sp!, {r4-r7}
ldr r3, [r0, #vp8_block_base_src]
ldr r4, [r0, #vp8_block_src]
ldr r5, [r0, #vp8_block_src_diff]
ldr r3, [r3]
ldr r6, [r0, #vp8_block_src_stride]
add r3, r3, r4 ; src = *base_src + src
ldr r7, [r1, #vp8_blockd_predictor]
vld1.8 {d0}, [r3], r6 ;load src
vld1.8 {d1}, [r7], r2 ;load pred
vld1.8 {d2}, [r3], r6
vld1.8 {d3}, [r7], r2
vld1.8 {d4}, [r3], r6
vld1.8 {d5}, [r7], r2
vld1.8 {d6}, [r3], r6
vld1.8 {d7}, [r7], r2
vsubl.u8 q10, d0, d1
vsubl.u8 q11, d2, d3
vsubl.u8 q12, d4, d5
vsubl.u8 q13, d6, d7
mov r2, r2, lsl #1
vst1.16 {d20}, [r5], r2 ;store diff
vst1.16 {d22}, [r5], r2
vst1.16 {d24}, [r5], r2
vst1.16 {d26}, [r5], r2
ldmfd sp!, {r4-r7}
bx lr
ENDP
;==========================================
;void vp8_subtract_mby_neon(short *diff, unsigned char *src, int src_stride,
; unsigned char *pred, int pred_stride)
|vp8_subtract_mby_neon| PROC
push {r4-r7}
mov r12, #4
ldr r4, [sp, #16] ; pred_stride
mov r6, #32 ; "diff" stride x2
add r5, r0, #16 ; second diff pointer
subtract_mby_loop
vld1.8 {q0}, [r1], r2 ;load src
vld1.8 {q1}, [r3], r4 ;load pred
vld1.8 {q2}, [r1], r2
vld1.8 {q3}, [r3], r4
vld1.8 {q4}, [r1], r2
vld1.8 {q5}, [r3], r4
vld1.8 {q6}, [r1], r2
vld1.8 {q7}, [r3], r4
vsubl.u8 q8, d0, d2
vsubl.u8 q9, d1, d3
vsubl.u8 q10, d4, d6
vsubl.u8 q11, d5, d7
vsubl.u8 q12, d8, d10
vsubl.u8 q13, d9, d11
vsubl.u8 q14, d12, d14
vsubl.u8 q15, d13, d15
vst1.16 {q8}, [r0], r6 ;store diff
vst1.16 {q9}, [r5], r6
vst1.16 {q10}, [r0], r6
vst1.16 {q11}, [r5], r6
vst1.16 {q12}, [r0], r6
vst1.16 {q13}, [r5], r6
vst1.16 {q14}, [r0], r6
vst1.16 {q15}, [r5], r6
subs r12, r12, #1
bne subtract_mby_loop
pop {r4-r7}
bx lr
ENDP
;=================================
;void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc,
; int src_stride, unsigned char *upred,
; unsigned char *vpred, int pred_stride)
|vp8_subtract_mbuv_neon| PROC
push {r4-r7}
ldr r4, [sp, #16] ; upred
ldr r5, [sp, #20] ; vpred
ldr r6, [sp, #24] ; pred_stride
add r0, r0, #512 ; short *udiff = diff + 256;
mov r12, #32 ; "diff" stride x2
add r7, r0, #16 ; second diff pointer
;u
vld1.8 {d0}, [r1], r3 ;load usrc
vld1.8 {d1}, [r4], r6 ;load upred
vld1.8 {d2}, [r1], r3
vld1.8 {d3}, [r4], r6
vld1.8 {d4}, [r1], r3
vld1.8 {d5}, [r4], r6
vld1.8 {d6}, [r1], r3
vld1.8 {d7}, [r4], r6
vld1.8 {d8}, [r1], r3
vld1.8 {d9}, [r4], r6
vld1.8 {d10}, [r1], r3
vld1.8 {d11}, [r4], r6
vld1.8 {d12}, [r1], r3
vld1.8 {d13}, [r4], r6
vld1.8 {d14}, [r1], r3
vld1.8 {d15}, [r4], r6
vsubl.u8 q8, d0, d1
vsubl.u8 q9, d2, d3
vsubl.u8 q10, d4, d5
vsubl.u8 q11, d6, d7
vsubl.u8 q12, d8, d9
vsubl.u8 q13, d10, d11
vsubl.u8 q14, d12, d13
vsubl.u8 q15, d14, d15
vst1.16 {q8}, [r0], r12 ;store diff
vst1.16 {q9}, [r7], r12
vst1.16 {q10}, [r0], r12
vst1.16 {q11}, [r7], r12
vst1.16 {q12}, [r0], r12
vst1.16 {q13}, [r7], r12
vst1.16 {q14}, [r0], r12
vst1.16 {q15}, [r7], r12
;v
vld1.8 {d0}, [r2], r3 ;load vsrc
vld1.8 {d1}, [r5], r6 ;load vpred
vld1.8 {d2}, [r2], r3
vld1.8 {d3}, [r5], r6
vld1.8 {d4}, [r2], r3
vld1.8 {d5}, [r5], r6
vld1.8 {d6}, [r2], r3
vld1.8 {d7}, [r5], r6
vld1.8 {d8}, [r2], r3
vld1.8 {d9}, [r5], r6
vld1.8 {d10}, [r2], r3
vld1.8 {d11}, [r5], r6
vld1.8 {d12}, [r2], r3
vld1.8 {d13}, [r5], r6
vld1.8 {d14}, [r2], r3
vld1.8 {d15}, [r5], r6
vsubl.u8 q8, d0, d1
vsubl.u8 q9, d2, d3
vsubl.u8 q10, d4, d5
vsubl.u8 q11, d6, d7
vsubl.u8 q12, d8, d9
vsubl.u8 q13, d10, d11
vsubl.u8 q14, d12, d13
vsubl.u8 q15, d14, d15
vst1.16 {q8}, [r0], r12 ;store diff
vst1.16 {q9}, [r7], r12
vst1.16 {q10}, [r0], r12
vst1.16 {q11}, [r7], r12
vst1.16 {q12}, [r0], r12
vst1.16 {q13}, [r7], r12
vst1.16 {q14}, [r0], r12
vst1.16 {q15}, [r7], r12
pop {r4-r7}
bx lr
ENDP
END
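For reference (not part of the diff), all three routines above compute plain source-minus-prediction residuals, widened to 16 bits. A scalar sketch of the 16x16 luma case, with a hypothetical helper name:

/* Scalar sketch of the macroblock luma subtraction vectorized above.
 * Illustrative only. */
static void subtract_mby_sketch(short *diff, const unsigned char *src,
                                int src_stride, const unsigned char *pred,
                                int pred_stride)
{
    int r, c;
    for (r = 0; r < 16; r++) {
        for (c = 0; c < 16; c++)
            diff[c] = (short)(src[c] - pred[c]);
        diff += 16;
        src += src_stride;
        pred += pred_stride;
    }
}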

View file

@ -1,154 +0,0 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "vp8/encoder/block.h"
void vp8_subtract_b_neon(
BLOCK *be,
BLOCKD *bd,
int pitch) {
unsigned char *src_ptr, *predictor;
int src_stride;
int16_t *src_diff;
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
uint16x8_t q10u16, q11u16, q12u16, q13u16;
src_ptr = *be->base_src + be->src;
src_stride = be->src_stride;
predictor = bd->predictor;
d0u8 = vld1_u8(src_ptr);
src_ptr += src_stride;
d2u8 = vld1_u8(src_ptr);
src_ptr += src_stride;
d4u8 = vld1_u8(src_ptr);
src_ptr += src_stride;
d6u8 = vld1_u8(src_ptr);
d1u8 = vld1_u8(predictor);
predictor += pitch;
d3u8 = vld1_u8(predictor);
predictor += pitch;
d5u8 = vld1_u8(predictor);
predictor += pitch;
d7u8 = vld1_u8(predictor);
q10u16 = vsubl_u8(d0u8, d1u8);
q11u16 = vsubl_u8(d2u8, d3u8);
q12u16 = vsubl_u8(d4u8, d5u8);
q13u16 = vsubl_u8(d6u8, d7u8);
src_diff = be->src_diff;
vst1_u16((uint16_t *)src_diff, vget_low_u16(q10u16));
src_diff += pitch;
vst1_u16((uint16_t *)src_diff, vget_low_u16(q11u16));
src_diff += pitch;
vst1_u16((uint16_t *)src_diff, vget_low_u16(q12u16));
src_diff += pitch;
vst1_u16((uint16_t *)src_diff, vget_low_u16(q13u16));
return;
}
void vp8_subtract_mby_neon(
int16_t *diff,
unsigned char *src,
int src_stride,
unsigned char *pred,
int pred_stride) {
int i;
uint8x16_t q0u8, q1u8, q2u8, q3u8;
uint16x8_t q8u16, q9u16, q10u16, q11u16;
for (i = 0; i < 8; i++) { // subtract_mby_loop
q0u8 = vld1q_u8(src);
src += src_stride;
q2u8 = vld1q_u8(src);
src += src_stride;
q1u8 = vld1q_u8(pred);
pred += pred_stride;
q3u8 = vld1q_u8(pred);
pred += pred_stride;
q8u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q1u8));
q9u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q1u8));
q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q3u8));
q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q3u8));
vst1q_u16((uint16_t *)diff, q8u16);
diff += 8;
vst1q_u16((uint16_t *)diff, q9u16);
diff += 8;
vst1q_u16((uint16_t *)diff, q10u16);
diff += 8;
vst1q_u16((uint16_t *)diff, q11u16);
diff += 8;
}
return;
}
void vp8_subtract_mbuv_neon(
int16_t *diff,
unsigned char *usrc,
unsigned char *vsrc,
int src_stride,
unsigned char *upred,
unsigned char *vpred,
int pred_stride) {
int i, j;
unsigned char *src_ptr, *pred_ptr;
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
uint16x8_t q8u16, q9u16, q10u16, q11u16;
diff += 256;
for (i = 0; i < 2; i++) {
if (i == 0) {
src_ptr = usrc;
pred_ptr = upred;
} else if (i == 1) {
src_ptr = vsrc;
pred_ptr = vpred;
}
for (j = 0; j < 2; j++) {
d0u8 = vld1_u8(src_ptr);
src_ptr += src_stride;
d1u8 = vld1_u8(pred_ptr);
pred_ptr += pred_stride;
d2u8 = vld1_u8(src_ptr);
src_ptr += src_stride;
d3u8 = vld1_u8(pred_ptr);
pred_ptr += pred_stride;
d4u8 = vld1_u8(src_ptr);
src_ptr += src_stride;
d5u8 = vld1_u8(pred_ptr);
pred_ptr += pred_stride;
d6u8 = vld1_u8(src_ptr);
src_ptr += src_stride;
d7u8 = vld1_u8(pred_ptr);
pred_ptr += pred_stride;
q8u16 = vsubl_u8(d0u8, d1u8);
q9u16 = vsubl_u8(d2u8, d3u8);
q10u16 = vsubl_u8(d4u8, d5u8);
q11u16 = vsubl_u8(d6u8, d7u8);
vst1q_u16((uint16_t *)diff, q8u16);
diff += 8;
vst1q_u16((uint16_t *)diff, q9u16);
diff += 8;
vst1q_u16((uint16_t *)diff, q10u16);
diff += 8;
vst1q_u16((uint16_t *)diff, q11u16);
diff += 8;
}
}
return;
}

View file

@ -0,0 +1,70 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_memcpy_partial_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;=========================================
;this is not a full memcpy function!!!
;void vp8_memcpy_partial_neon(unsigned char *dst_ptr, unsigned char *src_ptr,
; int sz);
|vp8_memcpy_partial_neon| PROC
;pld [r1] ;preload pred data
;pld [r1, #128]
;pld [r1, #256]
;pld [r1, #384]
mov r12, r2, lsr #8 ;copy 256 bytes data at one time
memcpy_neon_loop
vld1.8 {q0, q1}, [r1]! ;load src data
subs r12, r12, #1
vld1.8 {q2, q3}, [r1]!
vst1.8 {q0, q1}, [r0]! ;copy to dst_ptr
vld1.8 {q4, q5}, [r1]!
vst1.8 {q2, q3}, [r0]!
vld1.8 {q6, q7}, [r1]!
vst1.8 {q4, q5}, [r0]!
vld1.8 {q8, q9}, [r1]!
vst1.8 {q6, q7}, [r0]!
vld1.8 {q10, q11}, [r1]!
vst1.8 {q8, q9}, [r0]!
vld1.8 {q12, q13}, [r1]!
vst1.8 {q10, q11}, [r0]!
vld1.8 {q14, q15}, [r1]!
vst1.8 {q12, q13}, [r0]!
vst1.8 {q14, q15}, [r0]!
;pld [r1] ;preload pred data -- need to adjust for real device
;pld [r1, #128]
;pld [r1, #256]
;pld [r1, #384]
bne memcpy_neon_loop
ands r3, r2, #0xff ;extra copy
beq done_copy_neon_loop
extra_copy_neon_loop
vld1.8 {q0}, [r1]! ;load src data
subs r3, r3, #16
vst1.8 {q0}, [r0]!
bne extra_copy_neon_loop
done_copy_neon_loop
bx lr
ENDP
END
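A scalar model of the copy above (not part of the diff): the main loop moves 256 bytes per pass and the extra-copy loop 16 bytes per pass, so sz is assumed to be a multiple of 16 (here it is ystride * linestocopy).

#include <string.h>

/* Illustrative scalar equivalent of vp8_memcpy_partial_neon. */
static void memcpy_partial_sketch(unsigned char *dst, const unsigned char *src,
                                  int sz)
{
    int i = 0;
    for (; i + 256 <= sz; i += 256)   /* main loop: sz >> 8 iterations */
        memcpy(dst + i, src + i, 256);
    for (; i < sz; i += 16)           /* extra copy: sz & 0xff remainder */
        memcpy(dst + i, src + i, 16);
}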

View file

@ -0,0 +1,116 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_mse16x16_neon|
EXPORT |vp8_get4x4sse_cs_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;============================
; r0 unsigned char *src_ptr
; r1 int source_stride
; r2 unsigned char *ref_ptr
; r3 int recon_stride
; stack unsigned int *sse
;note: in this function, sum is never used, so we can remove this part of the
;calculation from vp8_variance().
|vp8_mse16x16_neon| PROC
vmov.i8 q7, #0 ;q7, q8, q9, q10 - sse
vmov.i8 q8, #0
vmov.i8 q9, #0
vmov.i8 q10, #0
mov r12, #8
mse16x16_neon_loop
vld1.8 {q0}, [r0], r1 ;Load up source and reference
vld1.8 {q2}, [r2], r3
vld1.8 {q1}, [r0], r1
vld1.8 {q3}, [r2], r3
vsubl.u8 q11, d0, d4
vsubl.u8 q12, d1, d5
vsubl.u8 q13, d2, d6
vsubl.u8 q14, d3, d7
vmlal.s16 q7, d22, d22
vmlal.s16 q8, d23, d23
subs r12, r12, #1
vmlal.s16 q9, d24, d24
vmlal.s16 q10, d25, d25
vmlal.s16 q7, d26, d26
vmlal.s16 q8, d27, d27
vmlal.s16 q9, d28, d28
vmlal.s16 q10, d29, d29
bne mse16x16_neon_loop
vadd.u32 q7, q7, q8
vadd.u32 q9, q9, q10
ldr r12, [sp] ;load *sse from stack
vadd.u32 q10, q7, q9
vpaddl.u32 q1, q10
vadd.u64 d0, d2, d3
vst1.32 {d0[0]}, [r12]
vmov.32 r0, d0[0]
bx lr
ENDP
;=============================
; r0 unsigned char *src_ptr,
; r1 int source_stride,
; r2 unsigned char *ref_ptr,
; r3 int recon_stride
|vp8_get4x4sse_cs_neon| PROC
vld1.8 {d0}, [r0], r1 ;Load up source and reference
vld1.8 {d4}, [r2], r3
vld1.8 {d1}, [r0], r1
vld1.8 {d5}, [r2], r3
vld1.8 {d2}, [r0], r1
vld1.8 {d6}, [r2], r3
vld1.8 {d3}, [r0], r1
vld1.8 {d7}, [r2], r3
vsubl.u8 q11, d0, d4
vsubl.u8 q12, d1, d5
vsubl.u8 q13, d2, d6
vsubl.u8 q14, d3, d7
vmull.s16 q7, d22, d22
vmull.s16 q8, d24, d24
vmull.s16 q9, d26, d26
vmull.s16 q10, d28, d28
vadd.u32 q7, q7, q8
vadd.u32 q9, q9, q10
vadd.u32 q9, q7, q9
vpaddl.u32 q1, q9
vadd.u64 d0, d2, d3
vmov.32 r0, d0[0]
bx lr
ENDP
END

View file

@ -1,131 +0,0 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
unsigned int vp8_mse16x16_neon(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride,
unsigned int *sse) {
int i;
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
int64x1_t d0s64;
uint8x16_t q0u8, q1u8, q2u8, q3u8;
int32x4_t q7s32, q8s32, q9s32, q10s32;
uint16x8_t q11u16, q12u16, q13u16, q14u16;
int64x2_t q1s64;
q7s32 = vdupq_n_s32(0);
q8s32 = vdupq_n_s32(0);
q9s32 = vdupq_n_s32(0);
q10s32 = vdupq_n_s32(0);
for (i = 0; i < 8; i++) { // mse16x16_neon_loop
q0u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
q1u8 = vld1q_u8(src_ptr);
src_ptr += source_stride;
q2u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
q3u8 = vld1q_u8(ref_ptr);
ref_ptr += recon_stride;
q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
q8s32 = vmlal_s16(q8s32, d23s16, d23s16);
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
q8s32 = vmlal_s16(q8s32, d27s16, d27s16);
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
}
q7s32 = vaddq_s32(q7s32, q8s32);
q9s32 = vaddq_s32(q9s32, q10s32);
q10s32 = vaddq_s32(q7s32, q9s32);
q1s64 = vpaddlq_s32(q10s32);
d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
}
unsigned int vp8_get4x4sse_cs_neon(
const unsigned char *src_ptr,
int source_stride,
const unsigned char *ref_ptr,
int recon_stride) {
int16x4_t d22s16, d24s16, d26s16, d28s16;
int64x1_t d0s64;
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
int32x4_t q7s32, q8s32, q9s32, q10s32;
uint16x8_t q11u16, q12u16, q13u16, q14u16;
int64x2_t q1s64;
d0u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d4u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d1u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d5u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d2u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d6u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
d3u8 = vld1_u8(src_ptr);
src_ptr += source_stride;
d7u8 = vld1_u8(ref_ptr);
ref_ptr += recon_stride;
q11u16 = vsubl_u8(d0u8, d4u8);
q12u16 = vsubl_u8(d1u8, d5u8);
q13u16 = vsubl_u8(d2u8, d6u8);
q14u16 = vsubl_u8(d3u8, d7u8);
d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));
q7s32 = vmull_s16(d22s16, d22s16);
q8s32 = vmull_s16(d24s16, d24s16);
q9s32 = vmull_s16(d26s16, d26s16);
q10s32 = vmull_s16(d28s16, d28s16);
q7s32 = vaddq_s32(q7s32, q8s32);
q9s32 = vaddq_s32(q9s32, q10s32);
q9s32 = vaddq_s32(q7s32, q9s32);
q1s64 = vpaddlq_s32(q9s32);
d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
}
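Both versions above compute a plain sum of squared differences (the sum term of the variance is dropped, as the note at the top of the assembly version says). A scalar sketch of the 16x16 case, not part of the diff:

/* Scalar sketch of vp8_mse16x16: sum of squared pixel differences over a
 * 16x16 block, returned and also stored through *sse. */
static unsigned int mse16x16_sketch(const unsigned char *src, int src_stride,
                                    const unsigned char *ref, int ref_stride,
                                    unsigned int *sse)
{
    unsigned int sum = 0;
    int r, c;
    for (r = 0; r < 16; r++) {
        for (c = 0; c < 16; c++) {
            int d = src[c] - ref[c];
            sum += (unsigned int)(d * d);
        }
        src += src_stride;
        ref += ref_stride;
    }
    *sse = sum;
    return sum;
}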

View file

@ -0,0 +1,103 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
EXPORT |vp8_short_walsh4x4_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;void vp8_short_walsh4x4_neon(short *input, short *output, int pitch)
; r0 short *input,
; r1 short *output,
; r2 int pitch
|vp8_short_walsh4x4_neon| PROC
vld1.16 {d0}, [r0@64], r2 ; load input
vld1.16 {d1}, [r0@64], r2
vld1.16 {d2}, [r0@64], r2
vld1.16 {d3}, [r0@64]
;First for-loop
;transpose d0, d1, d2, d3. Then, d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
vtrn.32 d0, d2
vtrn.32 d1, d3
vmov.s32 q15, #3 ; add 3 to all values
vtrn.16 d0, d1
vtrn.16 d2, d3
vadd.s16 d4, d0, d2 ; ip[0] + ip[2]
vadd.s16 d5, d1, d3 ; ip[1] + ip[3]
vsub.s16 d6, d1, d3 ; ip[1] - ip[3]
vsub.s16 d7, d0, d2 ; ip[0] - ip[2]
vshl.s16 d4, d4, #2 ; a1 = (ip[0] + ip[2]) << 2
vshl.s16 d5, d5, #2 ; d1 = (ip[1] + ip[3]) << 2
vshl.s16 d6, d6, #2 ; c1 = (ip[1] - ip[3]) << 2
vceq.s16 d16, d4, #0 ; a1 == 0
vshl.s16 d7, d7, #2 ; b1 = (ip[0] - ip[2]) << 2
vadd.s16 d0, d4, d5 ; a1 + d1
vmvn d16, d16 ; a1 != 0
vsub.s16 d3, d4, d5 ; op[3] = a1 - d1
vadd.s16 d1, d7, d6 ; op[1] = b1 + c1
vsub.s16 d2, d7, d6 ; op[2] = b1 - c1
vsub.s16 d0, d0, d16 ; op[0] = a1 + d1 + (a1 != 0)
;Second for-loop
;transpose d0, d1, d2, d3, Then, d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
vtrn.32 d1, d3
vtrn.32 d0, d2
vtrn.16 d2, d3
vtrn.16 d0, d1
vaddl.s16 q8, d0, d2 ; a1 = ip[0]+ip[8]
vaddl.s16 q9, d1, d3 ; d1 = ip[4]+ip[12]
vsubl.s16 q10, d1, d3 ; c1 = ip[4]-ip[12]
vsubl.s16 q11, d0, d2 ; b1 = ip[0]-ip[8]
vadd.s32 q0, q8, q9 ; a2 = a1 + d1
vadd.s32 q1, q11, q10 ; b2 = b1 + c1
vsub.s32 q2, q11, q10 ; c2 = b1 - c1
vsub.s32 q3, q8, q9 ; d2 = a1 - d1
vclt.s32 q8, q0, #0
vclt.s32 q9, q1, #0
vclt.s32 q10, q2, #0
vclt.s32 q11, q3, #0
; subtract -1 (or 0)
vsub.s32 q0, q0, q8 ; a2 += a2 < 0
vsub.s32 q1, q1, q9 ; b2 += b2 < 0
vsub.s32 q2, q2, q10 ; c2 += c2 < 0
vsub.s32 q3, q3, q11 ; d2 += d2 < 0
vadd.s32 q8, q0, q15 ; a2 + 3
vadd.s32 q9, q1, q15 ; b2 + 3
vadd.s32 q10, q2, q15 ; c2 + 3
vadd.s32 q11, q3, q15 ; d2 + 3
; vrshrn? would round by adding 1 << (3-1) = 4, not the 3 used here
vshrn.s32 d0, q8, #3
vshrn.s32 d1, q9, #3
vshrn.s32 d2, q10, #3
vshrn.s32 d3, q11, #3
vst1.16 {q0, q1}, [r1@128]
bx lr
ENDP
END
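For reference (not part of the diff), the instruction comments above spell out the scalar Walsh-Hadamard transform being vectorized. A sketch reconstructed from those comments; treat it as illustrative rather than as the canonical vp8_short_walsh4x4_c:

/* Scalar sketch of the 4x4 Walsh-Hadamard transform above.
 * pitch is in bytes; input samples are 16-bit, hence pitch / 2. */
static void short_walsh4x4_sketch(short *input, short *output, int pitch)
{
    int i;
    short *ip = input;
    short *op = output;

    for (i = 0; i < 4; i++) {        /* first pass: rows */
        int a1 = (ip[0] + ip[2]) << 2;
        int d1 = (ip[1] + ip[3]) << 2;
        int c1 = (ip[1] - ip[3]) << 2;
        int b1 = (ip[0] - ip[2]) << 2;

        op[0] = (short)(a1 + d1 + (a1 != 0));
        op[1] = (short)(b1 + c1);
        op[2] = (short)(b1 - c1);
        op[3] = (short)(a1 - d1);

        ip += pitch / 2;
        op += 4;
    }

    ip = output;
    op = output;
    for (i = 0; i < 4; i++) {        /* second pass: columns */
        int a1 = ip[0] + ip[8];
        int d1 = ip[4] + ip[12];
        int c1 = ip[4] - ip[12];
        int b1 = ip[0] - ip[8];

        int a2 = a1 + d1;
        int b2 = b1 + c1;
        int c2 = b1 - c1;
        int d2 = a1 - d1;

        a2 += a2 < 0;                /* the "subtract -1 (or 0)" step above */
        b2 += b2 < 0;
        c2 += c2 < 0;
        d2 += d2 < 0;

        op[0]  = (short)((a2 + 3) >> 3);
        op[4]  = (short)((b2 + 3) >> 3);
        op[8]  = (short)((c2 + 3) >> 3);
        op[12] = (short)((d2 + 3) >> 3);

        ip++;
        op++;
    }
}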

View file

@ -1,129 +0,0 @@
/*
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "vpx_ports/arm.h"
#ifdef VPX_INCOMPATIBLE_GCC
#include "./vp8_rtcd.h"
void vp8_short_walsh4x4_neon(
int16_t *input,
int16_t *output,
int pitch) {
vp8_short_walsh4x4_c(input, output, pitch);
}
#else
void vp8_short_walsh4x4_neon(
int16_t *input,
int16_t *output,
int pitch) {
uint16x4_t d16u16;
int16x8_t q0s16, q1s16;
int16x4_t dEmptys16, d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
int32x4_t qEmptys32, q0s32, q1s32, q2s32, q3s32, q8s32;
int32x4_t q9s32, q10s32, q11s32, q15s32;
uint32x4_t q8u32, q9u32, q10u32, q11u32;
int16x4x2_t v2tmp0, v2tmp1;
int32x2x2_t v2tmp2, v2tmp3;
dEmptys16 = vdup_n_s16(0);
qEmptys32 = vdupq_n_s32(0);
q15s32 = vdupq_n_s32(3);
d0s16 = vld1_s16(input);
input += pitch/2;
d1s16 = vld1_s16(input);
input += pitch/2;
d2s16 = vld1_s16(input);
input += pitch/2;
d3s16 = vld1_s16(input);
v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
vreinterpret_s32_s16(d2s16));
v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
vreinterpret_s32_s16(d3s16));
v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]), // d0
vreinterpret_s16_s32(v2tmp3.val[0])); // d1
v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]), // d2
vreinterpret_s16_s32(v2tmp3.val[1])); // d3
d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[0]);
d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[1]);
d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[1]);
d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[0]);
d4s16 = vshl_n_s16(d4s16, 2);
d5s16 = vshl_n_s16(d5s16, 2);
d6s16 = vshl_n_s16(d6s16, 2);
d7s16 = vshl_n_s16(d7s16, 2);
d16u16 = vceq_s16(d4s16, dEmptys16);
d16u16 = vmvn_u16(d16u16);
d0s16 = vadd_s16(d4s16, d5s16);
d3s16 = vsub_s16(d4s16, d5s16);
d1s16 = vadd_s16(d7s16, d6s16);
d2s16 = vsub_s16(d7s16, d6s16);
d0s16 = vsub_s16(d0s16, vreinterpret_s16_u16(d16u16));
// Second for-loop
v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d1s16),
vreinterpret_s32_s16(d3s16));
v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d0s16),
vreinterpret_s32_s16(d2s16));
v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp3.val[1]), // d2
vreinterpret_s16_s32(v2tmp2.val[1])); // d3
v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp3.val[0]), // d0
vreinterpret_s16_s32(v2tmp2.val[0])); // d1
q8s32 = vaddl_s16(v2tmp1.val[0], v2tmp0.val[0]);
q9s32 = vaddl_s16(v2tmp1.val[1], v2tmp0.val[1]);
q10s32 = vsubl_s16(v2tmp1.val[1], v2tmp0.val[1]);
q11s32 = vsubl_s16(v2tmp1.val[0], v2tmp0.val[0]);
q0s32 = vaddq_s32(q8s32, q9s32);
q1s32 = vaddq_s32(q11s32, q10s32);
q2s32 = vsubq_s32(q11s32, q10s32);
q3s32 = vsubq_s32(q8s32, q9s32);
q8u32 = vcltq_s32(q0s32, qEmptys32);
q9u32 = vcltq_s32(q1s32, qEmptys32);
q10u32 = vcltq_s32(q2s32, qEmptys32);
q11u32 = vcltq_s32(q3s32, qEmptys32);
q8s32 = vreinterpretq_s32_u32(q8u32);
q9s32 = vreinterpretq_s32_u32(q9u32);
q10s32 = vreinterpretq_s32_u32(q10u32);
q11s32 = vreinterpretq_s32_u32(q11u32);
q0s32 = vsubq_s32(q0s32, q8s32);
q1s32 = vsubq_s32(q1s32, q9s32);
q2s32 = vsubq_s32(q2s32, q10s32);
q3s32 = vsubq_s32(q3s32, q11s32);
q8s32 = vaddq_s32(q0s32, q15s32);
q9s32 = vaddq_s32(q1s32, q15s32);
q10s32 = vaddq_s32(q2s32, q15s32);
q11s32 = vaddq_s32(q3s32, q15s32);
d0s16 = vshrn_n_s32(q8s32, 3);
d1s16 = vshrn_n_s32(q9s32, 3);
d2s16 = vshrn_n_s32(q10s32, 3);
d3s16 = vshrn_n_s32(q11s32, 3);
q0s16 = vcombine_s16(d0s16, d1s16);
q1s16 = vcombine_s16(d2s16, d3s16);
vst1q_s16(output, q0s16);
vst1q_s16(output + 8, q1s16);
return;
}
#endif // VPX_INCOMPATIBLE_GCC

View file

@ -18,6 +18,7 @@
#include <assert.h>
#include <stdio.h>
#include <limits.h>
#include "vp8/common/pragmas.h"
#include "vpx/vpx_encoder.h"
#include "vpx_mem/vpx_mem.h"
#include "bitstream.h"

View file

@ -18,18 +18,18 @@ extern "C" {
#if HAVE_EDSP
void vp8cx_pack_tokens_armv5(vp8_writer *w, const TOKENEXTRA *p, int xcount,
vp8_token *,
const vp8_token *,
const vp8_extra_bit_struct *,
const vp8_tree_index *);
void vp8cx_pack_tokens_into_partitions_armv5(VP8_COMP *,
unsigned char * cx_data,
const unsigned char *cx_data_end,
int num_parts,
vp8_token *,
const vp8_token *,
const vp8_extra_bit_struct *,
const vp8_tree_index *);
void vp8cx_pack_mb_row_tokens_armv5(VP8_COMP *cpi, vp8_writer *w,
vp8_token *,
const vp8_token *,
const vp8_extra_bit_struct *,
const vp8_tree_index *);
# define pack_tokens(a,b,c) \

View file

@ -128,7 +128,6 @@ typedef struct macroblock
int q_index;
#if CONFIG_TEMPORAL_DENOISING
int increase_denoising;
MB_PREDICTION_MODE best_sse_inter_mode;
int_mv best_sse_mv;
MV_REFERENCE_FRAME best_reference_frame;

View file

@ -35,6 +35,10 @@ typedef struct
unsigned char *buffer;
unsigned char *buffer_end;
struct vpx_internal_error_info *error;
/* Variables used to track bit costs without outputting to the bitstream */
unsigned int measure_cost;
unsigned long bit_counter;
} BOOL_CODER;
extern void vp8_start_encode(BOOL_CODER *bc, unsigned char *buffer, unsigned char *buffer_end);

View file

@ -8,8 +8,6 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include <limits.h>
#include "denoising.h"
#include "vp8/common/reconinter.h"
@ -23,7 +21,6 @@ static const unsigned int NOISE_MOTION_THRESHOLD = 25 * 25;
*/
static const unsigned int SSE_DIFF_THRESHOLD = 16 * 16 * 20;
static const unsigned int SSE_THRESHOLD = 16 * 16 * 40;
static const unsigned int SSE_THRESHOLD_HIGH = 16 * 16 * 60;
/*
* The filter function was modified to reduce the computational complexity.
@ -54,36 +51,27 @@ static const unsigned int SSE_THRESHOLD_HIGH = 16 * 16 * 60;
* [16, 255] 6 7
*/
int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride,
unsigned char *running_avg_y, int avg_y_stride,
unsigned char *sig, int sig_stride,
unsigned int motion_magnitude,
int increase_denoising)
int vp8_denoiser_filter_c(YV12_BUFFER_CONFIG *mc_running_avg,
YV12_BUFFER_CONFIG *running_avg, MACROBLOCK *signal,
unsigned int motion_magnitude, int y_offset,
int uv_offset)
{
unsigned char *running_avg_y_start = running_avg_y;
unsigned char *sig_start = sig;
int sum_diff_thresh;
int r, c;
unsigned char *sig = signal->thismb;
int sig_stride = 16;
unsigned char *mc_running_avg_y = mc_running_avg->y_buffer + y_offset;
int mc_avg_y_stride = mc_running_avg->y_stride;
unsigned char *running_avg_y = running_avg->y_buffer + y_offset;
int avg_y_stride = running_avg->y_stride;
int r, c, i;
int sum_diff = 0;
int adj_val[3] = {3, 4, 6};
int shift_inc1 = 0;
int shift_inc2 = 1;
int col_sum[16] = {0, 0, 0, 0,
0, 0, 0, 0,
0, 0, 0, 0,
0, 0, 0, 0};
/* If motion_magnitude is small, make the denoiser more aggressive by
 * increasing the adjustment for each level. Add another increment for
 * blocks that are labeled for increased denoising. */
* increasing the adjustment for each level. */
if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
{
if (increase_denoising) {
shift_inc1 = 1;
shift_inc2 = 2;
}
adj_val[0] += shift_inc2;
adj_val[1] += shift_inc2;
adj_val[2] += shift_inc2;
for (i = 0; i < 3; i++)
adj_val[i] += 1;
}
for (r = 0; r < 16; ++r)
@ -97,16 +85,15 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride,
diff = mc_running_avg_y[c] - sig[c];
absdiff = abs(diff);
// When |diff| <= |3 + shift_inc1|, use pixel value from
// last denoised raw.
if (absdiff <= 3 + shift_inc1)
/* When |diff| < 4, use pixel value from last denoised raw. */
if (absdiff <= 3)
{
running_avg_y[c] = mc_running_avg_y[c];
col_sum[c] += diff;
sum_diff += diff;
}
else
{
if (absdiff >= 4 + shift_inc1 && absdiff <= 7)
if (absdiff >= 4 && absdiff <= 7)
adjustment = adj_val[0];
else if (absdiff >= 8 && absdiff <= 15)
adjustment = adj_val[1];
@ -120,7 +107,7 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride,
else
running_avg_y[c] = sig[c] + adjustment;
col_sum[c] += adjustment;
sum_diff += adjustment;
}
else
{
@ -129,7 +116,7 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride,
else
running_avg_y[c] = sig[c] - adjustment;
col_sum[c] -= adjustment;
sum_diff -= adjustment;
}
}
}
@ -140,269 +127,18 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride,
running_avg_y += avg_y_stride;
}
for (c = 0; c < 16; ++c) {
// Below we clip the value in the same way the SSE code does.
// When the aggressive denoiser is adopted, the adj_val for each pixel
// can be at most 8 (the current maximum adjustment of the map).
// In the SSE code we calculate the sum of adj_val for the columns, so
// the sum can be up to 128 (16 rows). However, the value range is
// -128 to 127 in the SSE code, which is why we make this change in
// the C code. We don't do this for the UV denoiser, since there are
// only 8 rows, and max adjustment <= 8, so the column sums cannot
// exceed 64.
if (col_sum[c] >= 128) {
col_sum[c] = 127;
}
sum_diff += col_sum[c];
}
sum_diff_thresh = SUM_DIFF_THRESHOLD;
if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
if (abs(sum_diff) > sum_diff_thresh) {
// Before returning to copy the block (i.e., apply no denoising), check
// if we can still apply some (weaker) temporal filtering to this block,
// that would otherwise not be denoised at all. Simplest is to apply
// an additional adjustment to running_avg_y to bring it closer to sig.
// The adjustment is capped by a maximum delta, and chosen such that
// in most cases the resulting sum_diff will be within the
// acceptable range given by sum_diff_thresh.
// The delta is set by the excess of absolute pixel diff over threshold.
int delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1;
// Only apply the adjustment for max delta up to 3.
if (delta < 4) {
sig -= sig_stride * 16;
mc_running_avg_y -= mc_avg_y_stride * 16;
running_avg_y -= avg_y_stride * 16;
for (r = 0; r < 16; ++r) {
for (c = 0; c < 16; ++c) {
int diff = mc_running_avg_y[c] - sig[c];
int adjustment = abs(diff);
if (adjustment > delta)
adjustment = delta;
if (diff > 0) {
// Bring denoised signal down.
if (running_avg_y[c] - adjustment < 0)
running_avg_y[c] = 0;
else
running_avg_y[c] = running_avg_y[c] - adjustment;
col_sum[c] -= adjustment;
} else if (diff < 0) {
// Bring denoised signal up.
if (running_avg_y[c] + adjustment > 255)
running_avg_y[c] = 255;
else
running_avg_y[c] = running_avg_y[c] + adjustment;
col_sum[c] += adjustment;
}
}
// TODO(marpan): Check here if abs(sum_diff) has gone below the
// threshold sum_diff_thresh, and if so, we can exit the row loop.
sig += sig_stride;
mc_running_avg_y += mc_avg_y_stride;
running_avg_y += avg_y_stride;
}
sum_diff = 0;
for (c = 0; c < 16; ++c) {
if (col_sum[c] >= 128) {
col_sum[c] = 127;
}
sum_diff += col_sum[c];
}
if (abs(sum_diff) > sum_diff_thresh)
return COPY_BLOCK;
} else {
if (abs(sum_diff) > SUM_DIFF_THRESHOLD)
return COPY_BLOCK;
}
}
vp8_copy_mem16x16(running_avg_y_start, avg_y_stride, sig_start, sig_stride);
vp8_copy_mem16x16(running_avg->y_buffer + y_offset, avg_y_stride,
signal->thismb, sig_stride);
return FILTER_BLOCK;
}
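A worked example of the per-pixel rule above (not part of the diff), for the low-motion case where the += 1 bump makes adj_val = {4, 5, 7}:

/* mc_running_avg_y[c] = 130, sig[c] = 120  ->  diff = 10, absdiff = 10
 * absdiff in [8, 15]                       ->  adjustment = adj_val[1] = 5
 * diff > 0                                 ->  running_avg_y[c] = 120 + 5 = 125
 * sum_diff += 5
 * With absdiff <= 3 the pixel would instead copy mc_running_avg_y[c]
 * verbatim. */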
int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv,
int mc_avg_uv_stride,
unsigned char *running_avg_uv,
int avg_uv_stride,
unsigned char *sig,
int sig_stride,
unsigned int motion_magnitude,
int increase_denoising) {
unsigned char *running_avg_uv_start = running_avg_uv;
unsigned char *sig_start = sig;
int sum_diff_thresh;
int r, c;
int sum_diff = 0;
int sum_block = 0;
int adj_val[3] = {3, 4, 6};
int shift_inc1 = 0;
int shift_inc2 = 1;
/* If motion_magnitude is small, make the denoiser more aggressive by
 * increasing the adjustment for each level. Add another increment for
 * blocks that are labeled for increased denoising. */
if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) {
if (increase_denoising) {
shift_inc1 = 1;
shift_inc2 = 2;
}
adj_val[0] += shift_inc2;
adj_val[1] += shift_inc2;
adj_val[2] += shift_inc2;
}
// Avoid denoising the color signal if it's close to the average level.
for (r = 0; r < 8; ++r) {
for (c = 0; c < 8; ++c) {
sum_block += sig[c];
}
sig += sig_stride;
}
if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) {
return COPY_BLOCK;
}
sig -= sig_stride * 8;
for (r = 0; r < 8; ++r) {
for (c = 0; c < 8; ++c) {
int diff = 0;
int adjustment = 0;
int absdiff = 0;
diff = mc_running_avg_uv[c] - sig[c];
absdiff = abs(diff);
// When |diff| <= |3 + shift_inc1|, use pixel value from
// last denoised raw.
if (absdiff <= 3 + shift_inc1) {
running_avg_uv[c] = mc_running_avg_uv[c];
sum_diff += diff;
} else {
if (absdiff >= 4 && absdiff <= 7)
adjustment = adj_val[0];
else if (absdiff >= 8 && absdiff <= 15)
adjustment = adj_val[1];
else
adjustment = adj_val[2];
if (diff > 0) {
if ((sig[c] + adjustment) > 255)
running_avg_uv[c] = 255;
else
running_avg_uv[c] = sig[c] + adjustment;
sum_diff += adjustment;
} else {
if ((sig[c] - adjustment) < 0)
running_avg_uv[c] = 0;
else
running_avg_uv[c] = sig[c] - adjustment;
sum_diff -= adjustment;
}
}
}
/* Update pointers for next iteration. */
sig += sig_stride;
mc_running_avg_uv += mc_avg_uv_stride;
running_avg_uv += avg_uv_stride;
}
sum_diff_thresh = SUM_DIFF_THRESHOLD_UV;
if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;
if (abs(sum_diff) > sum_diff_thresh) {
// Before returning to copy the block (i.e., apply no denoising), check
// if we can still apply some (weaker) temporal filtering to this block,
// that would otherwise not be denoised at all. Simplest is to apply
// an additional adjustment to running_avg_uv to bring it closer to sig.
// The adjustment is capped by a maximum delta, and chosen such that
// in most cases the resulting sum_diff will be within the
// acceptable range given by sum_diff_thresh.
// The delta is set by the excess of absolute pixel diff over threshold.
int delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1;
// Only apply the adjustment for max delta up to 3.
if (delta < 4) {
sig -= sig_stride * 8;
mc_running_avg_uv -= mc_avg_uv_stride * 8;
running_avg_uv -= avg_uv_stride * 8;
for (r = 0; r < 8; ++r) {
for (c = 0; c < 8; ++c) {
int diff = mc_running_avg_uv[c] - sig[c];
int adjustment = abs(diff);
if (adjustment > delta)
adjustment = delta;
if (diff > 0) {
// Bring denoised signal down.
if (running_avg_uv[c] - adjustment < 0)
running_avg_uv[c] = 0;
else
running_avg_uv[c] = running_avg_uv[c] - adjustment;
sum_diff -= adjustment;
} else if (diff < 0) {
// Bring denoised signal up.
if (running_avg_uv[c] + adjustment > 255)
running_avg_uv[c] = 255;
else
running_avg_uv[c] = running_avg_uv[c] + adjustment;
sum_diff += adjustment;
}
}
// TODO(marpan): Check here if abs(sum_diff) has gone below the
// threshold sum_diff_thresh, and if so, we can exit the row loop.
sig += sig_stride;
mc_running_avg_uv += mc_avg_uv_stride;
running_avg_uv += avg_uv_stride;
}
if (abs(sum_diff) > sum_diff_thresh)
return COPY_BLOCK;
} else {
return COPY_BLOCK;
}
}
vp8_copy_mem8x8(running_avg_uv_start, avg_uv_stride, sig_start,
sig_stride);
return FILTER_BLOCK;
}
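The early-out above skips chroma blocks whose mean sits close to mid-level 128. Worked numbers, not part of the diff:

/* An 8x8 chroma block averaging 128 sums to 128 * 64 = 8192. Any block
 * whose sum is within SUM_DIFF_FROM_AVG_THRESH_UV = 8 * 8 * 8 = 512 of
 * that (mean within 128 +/- 8) returns COPY_BLOCK before any filtering. */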
void vp8_denoiser_set_parameters(VP8_DENOISER *denoiser, int mode) {
assert(mode > 0); // Denoiser is allocated only if mode > 0.
if (mode == 1) {
denoiser->denoiser_mode = kDenoiserOnYOnly;
} else if (mode == 2) {
denoiser->denoiser_mode = kDenoiserOnYUV;
} else if (mode == 3) {
denoiser->denoiser_mode = kDenoiserOnYUVAggressive;
} else {
denoiser->denoiser_mode = kDenoiserOnAdaptive;
}
if (denoiser->denoiser_mode != kDenoiserOnYUVAggressive) {
denoiser->denoise_pars.scale_sse_thresh = 1;
denoiser->denoise_pars.scale_motion_thresh = 8;
denoiser->denoise_pars.scale_increase_filter = 0;
denoiser->denoise_pars.denoise_mv_bias = 95;
denoiser->denoise_pars.pickmode_mv_bias = 100;
denoiser->denoise_pars.qp_thresh = 0;
denoiser->denoise_pars.consec_zerolast = UINT_MAX;
denoiser->denoise_pars.spatial_blur = 0;
} else {
denoiser->denoise_pars.scale_sse_thresh = 2;
denoiser->denoise_pars.scale_motion_thresh = 16;
denoiser->denoise_pars.scale_increase_filter = 1;
denoiser->denoise_pars.denoise_mv_bias = 60;
denoiser->denoise_pars.pickmode_mv_bias = 60;
denoiser->denoise_pars.qp_thresh = 100;
denoiser->denoise_pars.consec_zerolast = 10;
denoiser->denoise_pars.spatial_blur = 20;
}
}
int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
int num_mb_rows, int num_mb_cols, int mode)
int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height)
{
int i;
assert(denoiser);
denoiser->num_mb_cols = num_mb_cols;
for (i = 0; i < MAX_REF_FRAMES; i++)
{
@ -430,45 +166,9 @@ int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
vpx_memset(denoiser->yv12_mc_running_avg.buffer_alloc, 0,
denoiser->yv12_mc_running_avg.frame_size);
if (vp8_yv12_alloc_frame_buffer(&denoiser->yv12_last_source, width,
height, VP8BORDERINPIXELS) < 0) {
vp8_denoiser_free(denoiser);
return 1;
}
vpx_memset(denoiser->yv12_last_source.buffer_alloc, 0,
denoiser->yv12_last_source.frame_size);
denoiser->denoise_state = vpx_calloc((num_mb_rows * num_mb_cols), 1);
vpx_memset(denoiser->denoise_state, 0, (num_mb_rows * num_mb_cols));
vp8_denoiser_set_parameters(denoiser, mode);
denoiser->nmse_source_diff = 0;
denoiser->nmse_source_diff_count = 0;
denoiser->qp_avg = 0;
// QP threshold below which we can go up to aggressive mode.
denoiser->qp_threshold_up = 80;
// QP threshold above which we can go back down to normal mode.
// For now keep this second threshold high, so not used currently.
denoiser->qp_threshold_down = 128;
// Bitrate thresholds and noise metric (nmse) thresholds for switching to
// aggressive mode.
// TODO(marpan): Adjust thresholds, including effect on resolution.
denoiser->bitrate_threshold = 200000; // (bits/sec).
denoiser->threshold_aggressive_mode = 35;
if (width * height > 640 * 480) {
denoiser->bitrate_threshold = 500000;
denoiser->threshold_aggressive_mode = 100;
} else if (width * height > 960 * 540) {
denoiser->bitrate_threshold = 800000;
denoiser->threshold_aggressive_mode = 150;
} else if (width * height > 1280 * 720) {
denoiser->bitrate_threshold = 2000000;
denoiser->threshold_aggressive_mode = 1400;
}
return 0;
}
void vp8_denoiser_free(VP8_DENOISER *denoiser)
{
int i;
@ -479,8 +179,6 @@ void vp8_denoiser_free(VP8_DENOISER *denoiser)
vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_running_avg[i]);
}
vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_mc_running_avg);
vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_last_source);
vpx_free(denoiser->denoise_state);
}
@ -489,28 +187,16 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
unsigned int best_sse,
unsigned int zero_mv_sse,
int recon_yoffset,
int recon_uvoffset,
loop_filter_info_n *lfi_n,
int mb_row,
int mb_col,
int block_index)
int recon_uvoffset)
{
int mv_row;
int mv_col;
unsigned int motion_threshold;
unsigned int motion_magnitude2;
unsigned int sse_thresh;
int sse_diff_thresh = 0;
// Spatial loop filter: only applied selectively based on
// temporal filter state of block relative to top/left neighbors.
int apply_spatial_loop_filter = 1;
MV_REFERENCE_FRAME frame = x->best_reference_frame;
MV_REFERENCE_FRAME zero_frame = x->best_zeromv_reference_frame;
enum vp8_denoiser_decision decision = FILTER_BLOCK;
enum vp8_denoiser_decision decision_u = COPY_BLOCK;
enum vp8_denoiser_decision decision_v = COPY_BLOCK;
if (zero_frame)
{
@ -520,11 +206,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
MB_MODE_INFO saved_mbmi;
MACROBLOCKD *filter_xd = &x->e_mbd;
MB_MODE_INFO *mbmi = &filter_xd->mode_info_context->mbmi;
int sse_diff = 0;
// Bias on zero motion vector sse.
const int zero_bias = denoiser->denoise_pars.denoise_mv_bias;
zero_mv_sse = (unsigned int)((int64_t)zero_mv_sse * zero_bias / 100);
sse_diff = zero_mv_sse - best_sse;
int sse_diff = zero_mv_sse - best_sse;
saved_mbmi = *mbmi;
@ -535,16 +217,11 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
mbmi->need_to_clamp_mvs = x->need_to_clamp_best_mvs;
mv_col = x->best_sse_mv.as_mv.col;
mv_row = x->best_sse_mv.as_mv.row;
// Bias to zero_mv if small amount of motion.
// Note sse_diff_thresh is initialized to zero, so this ensures
// we will always choose zero_mv for denoising if
// zero_mv_sse <= best_sse (i.e., sse_diff <= 0).
if ((unsigned int)(mv_row * mv_row + mv_col * mv_col)
<= NOISE_MOTION_THRESHOLD)
sse_diff_thresh = (int)SSE_DIFF_THRESHOLD;
if (frame == INTRA_FRAME ||
sse_diff <= sse_diff_thresh)
((unsigned int)(mv_row * mv_row + mv_col * mv_col)
<= NOISE_MOTION_THRESHOLD &&
sse_diff < (int)SSE_DIFF_THRESHOLD))
{
/*
* Handle intra blocks as referring to last frame with zero motion
@ -600,62 +277,20 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
mv_row = x->best_sse_mv.as_mv.row;
mv_col = x->best_sse_mv.as_mv.col;
motion_magnitude2 = mv_row * mv_row + mv_col * mv_col;
motion_threshold = denoiser->denoise_pars.scale_motion_thresh *
NOISE_MOTION_THRESHOLD;
if (motion_magnitude2 <
denoiser->denoise_pars.scale_increase_filter * NOISE_MOTION_THRESHOLD)
x->increase_denoising = 1;
sse_thresh = denoiser->denoise_pars.scale_sse_thresh * SSE_THRESHOLD;
if (x->increase_denoising)
sse_thresh = denoiser->denoise_pars.scale_sse_thresh * SSE_THRESHOLD_HIGH;
if (best_sse > sse_thresh || motion_magnitude2 > motion_threshold)
decision = COPY_BLOCK;
if (best_sse > SSE_THRESHOLD || motion_magnitude2
> 8 * NOISE_MOTION_THRESHOLD)
{
decision = COPY_BLOCK;
}
if (decision == FILTER_BLOCK)
{
unsigned char *mc_running_avg_y =
denoiser->yv12_mc_running_avg.y_buffer + recon_yoffset;
int mc_avg_y_stride = denoiser->yv12_mc_running_avg.y_stride;
unsigned char *running_avg_y =
denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset;
int avg_y_stride = denoiser->yv12_running_avg[INTRA_FRAME].y_stride;
/* Filter. */
decision = vp8_denoiser_filter(mc_running_avg_y, mc_avg_y_stride,
running_avg_y, avg_y_stride,
x->thismb, 16, motion_magnitude2,
x->increase_denoising);
denoiser->denoise_state[block_index] = motion_magnitude2 > 0 ?
kFilterNonZeroMV : kFilterZeroMV;
// Only denoise UV for zero motion, and if y channel was denoised.
if (denoiser->denoiser_mode != kDenoiserOnYOnly &&
motion_magnitude2 == 0 &&
decision == FILTER_BLOCK) {
unsigned char *mc_running_avg_u =
denoiser->yv12_mc_running_avg.u_buffer + recon_uvoffset;
unsigned char *running_avg_u =
denoiser->yv12_running_avg[INTRA_FRAME].u_buffer + recon_uvoffset;
unsigned char *mc_running_avg_v =
denoiser->yv12_mc_running_avg.v_buffer + recon_uvoffset;
unsigned char *running_avg_v =
denoiser->yv12_running_avg[INTRA_FRAME].v_buffer + recon_uvoffset;
int mc_avg_uv_stride = denoiser->yv12_mc_running_avg.uv_stride;
int avg_uv_stride = denoiser->yv12_running_avg[INTRA_FRAME].uv_stride;
int signal_stride = x->block[16].src_stride;
decision_u =
vp8_denoiser_filter_uv(mc_running_avg_u, mc_avg_uv_stride,
running_avg_u, avg_uv_stride,
x->block[16].src + *x->block[16].base_src,
signal_stride, motion_magnitude2, 0);
decision_v =
vp8_denoiser_filter_uv(mc_running_avg_v, mc_avg_uv_stride,
running_avg_v, avg_uv_stride,
x->block[20].src + *x->block[20].base_src,
signal_stride, motion_magnitude2, 0);
}
decision = vp8_denoiser_filter(&denoiser->yv12_mc_running_avg,
&denoiser->yv12_running_avg[INTRA_FRAME],
x,
motion_magnitude2,
recon_yoffset, recon_uvoffset);
}
if (decision == COPY_BLOCK)
{
@ -666,73 +301,5 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
x->thismb, 16,
denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset,
denoiser->yv12_running_avg[INTRA_FRAME].y_stride);
denoiser->denoise_state[block_index] = kNoFilter;
}
if (denoiser->denoiser_mode != kDenoiserOnYOnly) {
if (decision_u == COPY_BLOCK) {
vp8_copy_mem8x8(
x->block[16].src + *x->block[16].base_src, x->block[16].src_stride,
denoiser->yv12_running_avg[INTRA_FRAME].u_buffer + recon_uvoffset,
denoiser->yv12_running_avg[INTRA_FRAME].uv_stride);
}
if (decision_v == COPY_BLOCK) {
vp8_copy_mem8x8(
x->block[20].src + *x->block[20].base_src, x->block[16].src_stride,
denoiser->yv12_running_avg[INTRA_FRAME].v_buffer + recon_uvoffset,
denoiser->yv12_running_avg[INTRA_FRAME].uv_stride);
}
}
// Option to selectively deblock the denoised signal, for y channel only.
if (apply_spatial_loop_filter) {
loop_filter_info lfi;
int apply_filter_col = 0;
int apply_filter_row = 0;
int apply_filter = 0;
int y_stride = denoiser->yv12_running_avg[INTRA_FRAME].y_stride;
int uv_stride = denoiser->yv12_running_avg[INTRA_FRAME].uv_stride;
// Fix filter level to some nominal value for now.
int filter_level = 32;
int hev_index = lfi_n->hev_thr_lut[INTER_FRAME][filter_level];
lfi.mblim = lfi_n->mblim[filter_level];
lfi.blim = lfi_n->blim[filter_level];
lfi.lim = lfi_n->lim[filter_level];
lfi.hev_thr = lfi_n->hev_thr[hev_index];
// Apply filter if there is a difference in the denoiser filter state
// between the current and the left/top block, or if a non-zero motion
// vector is used for the motion-compensated filtering.
if (mb_col > 0) {
apply_filter_col = !((denoiser->denoise_state[block_index] ==
denoiser->denoise_state[block_index - 1]) &&
denoiser->denoise_state[block_index] != kFilterNonZeroMV);
if (apply_filter_col) {
// Filter left vertical edge.
apply_filter = 1;
vp8_loop_filter_mbv(
denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset,
NULL, NULL, y_stride, uv_stride, &lfi);
}
}
if (mb_row > 0) {
apply_filter_row = !((denoiser->denoise_state[block_index] ==
denoiser->denoise_state[block_index - denoiser->num_mb_cols]) &&
denoiser->denoise_state[block_index] != kFilterNonZeroMV);
if (apply_filter_row) {
// Filter top horizontal edge.
apply_filter = 1;
vp8_loop_filter_mbh(
denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset,
NULL, NULL, y_stride, uv_stride, &lfi);
}
}
if (apply_filter) {
// Update the signal block |x|. Pixel changes are only to top and/or
// left boundary pixels: can we avoid the full block copy here?
vp8_copy_mem16x16(
denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset,
y_stride, x->thismb, 16);
}
}
}
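The selective deblocking above boils down to a per-edge predicate on the saved denoise states; a sketch, using the vp8_denoiser_filter_state enum declared in denoising.h (shown below):

    /* Filter a left/top macroblock edge of the denoised signal when the
     * denoise state differs from the neighbor's, or when this block was
     * temporally filtered with a non-zero motion vector. */
    static int apply_edge_filter(unsigned char cur_state,
                                 unsigned char neighbor_state) {
      return !(cur_state == neighbor_state && cur_state != kFilterNonZeroMV);
    }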


@@ -12,101 +12,36 @@
#define VP8_ENCODER_DENOISING_H_
#include "block.h"
#include "vp8/common/loopfilter.h"
#ifdef __cplusplus
extern "C" {
#endif
#define SUM_DIFF_THRESHOLD (16 * 16 * 2)
#define SUM_DIFF_THRESHOLD_HIGH (600)
#define MOTION_MAGNITUDE_THRESHOLD (8*3)
#define SUM_DIFF_THRESHOLD_UV (96) // (8 * 8 * 1.5)
#define SUM_DIFF_THRESHOLD_HIGH_UV (8 * 8 * 2)
#define SUM_DIFF_FROM_AVG_THRESH_UV (8 * 8 * 8)
#define MOTION_MAGNITUDE_THRESHOLD_UV (8*3)
enum vp8_denoiser_decision
{
COPY_BLOCK,
FILTER_BLOCK
};
enum vp8_denoiser_filter_state {
kNoFilter,
kFilterZeroMV,
kFilterNonZeroMV
};
enum vp8_denoiser_mode {
kDenoiserOff,
kDenoiserOnYOnly,
kDenoiserOnYUV,
kDenoiserOnYUVAggressive,
kDenoiserOnAdaptive
};
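    /*
     * Aside: a plausible mapping from the encoder's noise-sensitivity
     * setting to these modes. The mapping is hypothetical -- the real
     * selection happens in the encoder, not in this header.
     */
    static enum vp8_denoiser_mode mode_from_noise_sensitivity(int ns) {
      switch (ns) {
        case 1: return kDenoiserOnYOnly;
        case 2: return kDenoiserOnYUV;
        case 3: return kDenoiserOnYUVAggressive;
        case 4: return kDenoiserOnAdaptive;
        default: return kDenoiserOff;
      }
    }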
typedef struct {
// Scale factor on sse threshold above which no denoising is done.
unsigned int scale_sse_thresh;
// Scale factor on motion magnitude threshold above which no
// denoising is done.
unsigned int scale_motion_thresh;
// Scale factor on motion magnitude below which we increase the strength of
// the temporal filter (in function vp8_denoiser_filter).
unsigned int scale_increase_filter;
// Scale factor to bias to ZEROMV for denoising.
unsigned int denoise_mv_bias;
// Scale factor to bias to ZEROMV for coding mode selection.
unsigned int pickmode_mv_bias;
// Quantizer threshold below which we use the segmentation map to switch off
// the loop filter for blocks that have been coded as ZEROMV-LAST for a
// certain number (consec_zerolast) of consecutive frames. Note that the
// delta-QP is set to 0 when the segmentation map is used to switch off the
// loop filter.
unsigned int qp_thresh;
// Threshold for number of consecutive frames for blocks coded as ZEROMV-LAST.
unsigned int consec_zerolast;
// Threshold for amount of spatial blur on Y channel. 0 means no spatial blur.
unsigned int spatial_blur;
} denoise_params;
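    /*
     * Aside: an example denoise_params instance for a non-aggressive mode.
     * The values are assumptions for illustration; the real ones are
     * installed by vp8_denoiser_set_parameters(), declared further down.
     */
    static const denoise_params example_pars = {
      1,   /* scale_sse_thresh */
      8,   /* scale_motion_thresh */
      0,   /* scale_increase_filter */
      95,  /* denoise_mv_bias */
      100, /* pickmode_mv_bias */
      0,   /* qp_thresh */
      0,   /* consec_zerolast */
      0    /* spatial_blur */
    };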
typedef struct vp8_denoiser
{
YV12_BUFFER_CONFIG yv12_running_avg[MAX_REF_FRAMES];
YV12_BUFFER_CONFIG yv12_mc_running_avg;
// TODO(marpan): Should remove yv12_last_source and use vp8_lookahead_peek.
YV12_BUFFER_CONFIG yv12_last_source;
unsigned char* denoise_state;
int num_mb_cols;
int denoiser_mode;
int threshold_aggressive_mode;
int nmse_source_diff;
int nmse_source_diff_count;
int qp_avg;
int qp_threshold_up;
int qp_threshold_down;
int bitrate_threshold;
denoise_params denoise_pars;
} VP8_DENOISER;
int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
int num_mb_rows, int num_mb_cols, int mode);
int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height);
void vp8_denoiser_free(VP8_DENOISER *denoiser);
void vp8_denoiser_set_parameters(VP8_DENOISER *denoiser, int mode);
void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
MACROBLOCK *x,
unsigned int best_sse,
unsigned int zero_mv_sse,
int recon_yoffset,
int recon_uvoffset,
loop_filter_info_n *lfi_n,
int mb_row,
int mb_col,
int block_index);
int recon_uvoffset);
#ifdef __cplusplus
} // extern "C"
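Taken together, the post-update API reduces to allocate / per-macroblock denoise / free. A minimal usage sketch with the two-argument vp8_denoiser_allocate shown above, assuming its non-zero return signals failure:

    /* Hypothetical setup/teardown around the per-frame encode loop. */
    static int with_denoiser(int width, int height) {
      VP8_DENOISER denoiser;
      if (vp8_denoiser_allocate(&denoiser, width, height))
        return -1;                 /* assumed: non-zero means failure */
      /* ... call vp8_denoiser_denoise_mb() for each macroblock ... */
      vp8_denoiser_free(&denoiser);
      return 0;
    }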


@@ -522,19 +522,6 @@ void encode_mb_row(VP8_COMP *cpi,
}
#endif
// Keep track of how many (consecutive) times a block is coded
// as ZEROMV_LASTREF, for base layer frames.
// Reset to 0 if it's coded as anything else.
if (cpi->current_layer == 0) {
if (xd->mode_info_context->mbmi.mode == ZEROMV &&
xd->mode_info_context->mbmi.ref_frame == LAST_FRAME) {
// Increment, check for wrap-around.
if (cpi->consec_zero_last[map_index+mb_col] < 255)
cpi->consec_zero_last[map_index+mb_col] += 1;
} else {
cpi->consec_zero_last[map_index+mb_col] = 0;
}
}
/* Special case code for cyclic refresh
* If cyclic update enabled then copy xd->mbmi.segment_id; (which
@@ -1259,7 +1246,7 @@ int vp8cx_encode_inter_macroblock
x->zbin_mode_boost_enabled = 0;
}
vp8_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate,
&distortion, &intra_error, mb_row, mb_col);
&distortion, &intra_error);
/* switch back to the regular quantizer for the encode */
if (cpi->sf.improved_quant)


@@ -258,6 +258,13 @@ static void optimize_b(MACROBLOCK *mb, int ib, int type,
b = &mb->block[ib];
d = &mb->e_mbd.block[ib];
/* Enable this to test the effect of RDO as a replacement for the dynamic
* zero bin instead of an augmentation of it.
*/
#if 0
vp8_strict_quantize_b(b, d);
#endif
dequant_ptr = d->dequant;
coeff_ptr = b->coeff;
qcoeff_ptr = d->qcoeff;
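For context on the comment above: in scalar terms, strict quantization rounds every coefficient to the nearest level, while the dynamic zero bin forces small coefficients to zero first; the RDO pass in optimize_b can replace or augment that bias. An illustrative sketch (not the vp8_strict_quantize_b implementation; q and zbin are per-coefficient values in the real code):

    #include <stdlib.h> /* abs */

    /* "Strict" quantization: round to the nearest reconstruction level. */
    static int quantize_strict(int coeff, int q) {
      int sign = coeff < 0 ? -1 : 1;
      return sign * ((abs(coeff) + q / 2) / q);
    }

    /* Zero-bin variant: a dead zone around zero kills small coefficients. */
    static int quantize_zero_bin(int coeff, int q, int zbin) {
      if (abs(coeff) < zbin) return 0;
      return quantize_strict(coeff, q);
    }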

Some files were not shown because too many files changed in this diff.