зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1148639 - Update libvpx. r=kinetik
From b40321d9379faaa62fa16d6d412098fca115042d Mon Sep 17 00:00:00 2001 Result of running ./update.py --ndk ~/android/android-ndk-r9 --commit afad1a84c15b9af8298a37c0fa449e0af40931fd
This commit is contained in:
Родитель
c536fc79d4
Коммит
177b274c46
|
@ -1,23 +1,22 @@
|
|||
Additional IP Rights Grant (Patents)
|
||||
------------------------------------
|
||||
|
||||
"These implementations" means the copyrightable works that implement the WebM
|
||||
codecs distributed by Google as part of the WebM Project.
|
||||
"This implementation" means the copyrightable works distributed by
|
||||
Google as part of the WebM Project.
|
||||
|
||||
Google hereby grants to you a perpetual, worldwide, non-exclusive, no-charge,
|
||||
royalty-free, irrevocable (except as stated in this section) patent license to
|
||||
make, have made, use, offer to sell, sell, import, transfer, and otherwise
|
||||
run, modify and propagate the contents of these implementations of WebM, where
|
||||
such license applies only to those patent claims, both currently owned by
|
||||
Google and acquired in the future, licensable by Google that are necessarily
|
||||
infringed by these implementations of WebM. This grant does not include claims
|
||||
that would be infringed only as a consequence of further modification of these
|
||||
implementations. If you or your agent or exclusive licensee institute or order
|
||||
or agree to the institution of patent litigation or any other patent
|
||||
enforcement activity against any entity (including a cross-claim or
|
||||
counterclaim in a lawsuit) alleging that any of these implementations of WebM
|
||||
or any code incorporated within any of these implementations of WebM
|
||||
constitutes direct or contributory patent infringement, or inducement of
|
||||
patent infringement, then any patent rights granted to you under this License
|
||||
for these implementations of WebM shall terminate as of the date such
|
||||
litigation is filed.
|
||||
Google hereby grants to you a perpetual, worldwide, non-exclusive,
|
||||
no-charge, royalty-free, irrevocable (except as stated in this section)
|
||||
patent license to make, have made, use, offer to sell, sell, import,
|
||||
transfer, and otherwise run, modify and propagate the contents of this
|
||||
implementation of VP8, where such license applies only to those patent
|
||||
claims, both currently owned by Google and acquired in the future,
|
||||
licensable by Google that are necessarily infringed by this
|
||||
implementation of VP8. This grant does not include claims that would be
|
||||
infringed only as a consequence of further modification of this
|
||||
implementation. If you or your agent or exclusive licensee institute or
|
||||
order or agree to the institution of patent litigation against any
|
||||
entity (including a cross-claim or counterclaim in a lawsuit) alleging
|
||||
that this implementation of VP8 or any code incorporated within this
|
||||
implementation of VP8 constitutes direct or contributory patent
|
||||
infringement, or inducement of patent infringement, then any patent
|
||||
rights granted to you under this License for this implementation of VP8
|
||||
shall terminate as of the date such litigation is filed.
|
||||
|
|
|
@ -8,4 +8,4 @@ The libvpx git repository is:
|
|||
|
||||
https://gerrit.chromium.org/gerrit/webm/libvpx
|
||||
|
||||
The git commit ID used was 587ff646f
|
||||
The git commit ID used was afad1a84c15b9af8298a37c0fa449e0af40931fd
|
||||
|
|
|
@ -34,18 +34,6 @@ int log_msg(const char *fmt, ...) {
|
|||
}
|
||||
|
||||
#if defined(__GNUC__) && __GNUC__
|
||||
|
||||
#if defined(FORCE_PARSE_ELF)
|
||||
|
||||
#if defined(__MACH__)
|
||||
#undef __MACH__
|
||||
#endif
|
||||
|
||||
#if !defined(__ELF__)
|
||||
#define __ELF__
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__MACH__)
|
||||
|
||||
#include <mach-o/loader.h>
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#!/usr/bin/env perl
|
||||
#!/usr/bin/perl
|
||||
##
|
||||
## Copyright (c) 2013 The WebM project authors. All Rights Reserved.
|
||||
##
|
||||
|
@ -51,7 +51,7 @@ sub FixThumbInstructions($$)
|
|||
|
||||
# Convert register post indexing to a separate add instruction.
|
||||
# This converts "ldrneb r9, [r0], r2" into "ldrneb r9, [r0]",
|
||||
# "addne r0, r0, r2".
|
||||
# "add r0, r2".
|
||||
s/^(\s*)((ldr|str)(ne)?[bhd]?)(\s+)(\w+),(\s*\w+,)?\s*\[(\w+)\],\s*(\w+)/$1$2$5$6,$7 [$8]\n$1add$4$5$8, $8, $9/g;
|
||||
|
||||
# Convert a conditional addition to the pc register into a series of
|
||||
|
|
|
@ -25,24 +25,34 @@ files = {
|
|||
'vp8/common/arm/filter_arm.c',
|
||||
'vp8/common/arm/loopfilter_arm.c',
|
||||
'vp8/common/arm/neon/bilinearpredict_neon.c',
|
||||
'vp8/common/arm/neon/copymem_neon.c',
|
||||
'vp8/common/arm/neon/dc_only_idct_add_neon.c',
|
||||
'vp8/common/arm/neon/dequant_idct_neon.c',
|
||||
'vp8/common/arm/neon/dequantizeb_neon.c',
|
||||
'vp8/common/arm/neon/buildintrapredictorsmby_neon.asm',
|
||||
'vp8/common/arm/neon/copymem16x16_neon.asm',
|
||||
'vp8/common/arm/neon/copymem8x4_neon.asm',
|
||||
'vp8/common/arm/neon/copymem8x8_neon.asm',
|
||||
'vp8/common/arm/neon/dc_only_idct_add_neon.asm',
|
||||
'vp8/common/arm/neon/dequant_idct_neon.asm',
|
||||
'vp8/common/arm/neon/dequantizeb_neon.asm',
|
||||
'vp8/common/arm/neon/idct_blk_neon.c',
|
||||
'vp8/common/arm/neon/idct_dequant_0_2x_neon.c',
|
||||
'vp8/common/arm/neon/idct_dequant_full_2x_neon.c',
|
||||
'vp8/common/arm/neon/iwalsh_neon.c',
|
||||
'vp8/common/arm/neon/loopfilter_neon.c',
|
||||
'vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c',
|
||||
'vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c',
|
||||
'vp8/common/arm/neon/mbloopfilter_neon.c',
|
||||
'vp8/common/arm/neon/reconintra_neon.c',
|
||||
'vp8/common/arm/neon/sad_neon.c',
|
||||
'vp8/common/arm/neon/shortidct4x4llm_neon.c',
|
||||
'vp8/common/arm/neon/sixtappredict_neon.c',
|
||||
'vp8/common/arm/neon/variance_neon.c',
|
||||
'vp8/common/arm/neon/vp8_subpixelvariance_neon.c',
|
||||
'vp8/common/arm/neon/idct_dequant_0_2x_neon.asm',
|
||||
'vp8/common/arm/neon/idct_dequant_full_2x_neon.asm',
|
||||
'vp8/common/arm/neon/iwalsh_neon.asm',
|
||||
'vp8/common/arm/neon/loopfilter_neon.asm',
|
||||
'vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm',
|
||||
'vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm',
|
||||
'vp8/common/arm/neon/mbloopfilter_neon.asm',
|
||||
'vp8/common/arm/neon/sad16_neon.asm',
|
||||
'vp8/common/arm/neon/sad8_neon.asm',
|
||||
'vp8/common/arm/neon/save_reg_neon.asm',
|
||||
'vp8/common/arm/neon/shortidct4x4llm_neon.asm',
|
||||
'vp8/common/arm/neon/sixtappredict16x16_neon.asm',
|
||||
'vp8/common/arm/neon/sixtappredict4x4_neon.asm',
|
||||
'vp8/common/arm/neon/sixtappredict8x4_neon.asm',
|
||||
'vp8/common/arm/neon/sixtappredict8x8_neon.asm',
|
||||
'vp8/common/arm/neon/variance_neon.asm',
|
||||
'vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm',
|
||||
'vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm',
|
||||
'vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm',
|
||||
'vp8/common/arm/reconintra_arm.c',
|
||||
'vp8/common/arm/variance_arm.c',
|
||||
'vp8/encoder/arm/armv5te/boolhuff_armv5te.asm',
|
||||
'vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm',
|
||||
|
@ -57,10 +67,12 @@ files = {
|
|||
'vp8/encoder/arm/dct_arm.c',
|
||||
'vp8/encoder/arm/neon/denoising_neon.c',
|
||||
'vp8/encoder/arm/neon/fastquantizeb_neon.asm',
|
||||
'vp8/encoder/arm/neon/shortfdct_neon.c',
|
||||
'vp8/encoder/arm/neon/subtract_neon.c',
|
||||
'vp8/encoder/arm/neon/vp8_mse16x16_neon.c',
|
||||
'vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c',
|
||||
'vp8/encoder/arm/neon/picklpf_arm.c',
|
||||
'vp8/encoder/arm/neon/shortfdct_neon.asm',
|
||||
'vp8/encoder/arm/neon/subtract_neon.asm',
|
||||
'vp8/encoder/arm/neon/vp8_memcpy_neon.asm',
|
||||
'vp8/encoder/arm/neon/vp8_mse16x16_neon.asm',
|
||||
'vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm',
|
||||
'vp8/encoder/arm/quantize_arm.c',
|
||||
'vp9/common/arm/neon/vp9_avg_neon.asm',
|
||||
'vp9/common/arm/neon/vp9_convolve8_avg_neon.asm',
|
||||
|
@ -85,19 +97,16 @@ files = {
|
|||
'vp9/common/arm/neon/vp9_mb_lpf_neon.asm',
|
||||
'vp9/common/arm/neon/vp9_reconintra_neon.asm',
|
||||
'vp9/common/arm/neon/vp9_save_reg_neon.asm',
|
||||
'vp9/encoder/arm/neon/vp9_dct_neon.c',
|
||||
'vp9/encoder/arm/neon/vp9_quantize_neon.c',
|
||||
'vp9/encoder/arm/neon/vp9_sad_neon.c',
|
||||
'vp9/encoder/arm/neon/vp9_subtract_neon.c',
|
||||
'vp9/encoder/arm/neon/vp9_variance_neon.c',
|
||||
'vpx_ports/arm_cpudetect.c'],
|
||||
'vpx_ports/arm_cpudetect.c',
|
||||
'vpx_scale/arm/neon/vp8_vpxyv12_copy_y_neon.asm',
|
||||
'vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm',
|
||||
'vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm',
|
||||
'vpx_scale/arm/neon/vp8_vpxyv12_extendframeborders_neon.asm',
|
||||
'vpx_scale/arm/neon/yv12extend_arm.c'],
|
||||
'AVX2': ['vp9/common/x86/vp9_loopfilter_intrin_avx2.c',
|
||||
'vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c',
|
||||
'vp9/encoder/x86/vp9_dct32x32_avx2.c',
|
||||
'vp9/encoder/x86/vp9_dct_avx2.c',
|
||||
'vp9/encoder/x86/vp9_error_intrin_avx2.c',
|
||||
'vp9/encoder/x86/vp9_sad4d_intrin_avx2.c',
|
||||
'vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c',
|
||||
'vp9/encoder/x86/vp9_variance_avx2.c',
|
||||
'vp9/encoder/x86/vp9_variance_impl_intrin_avx2.c'],
|
||||
'ERROR_CONCEALMENT': ['vp8/decoder/error_concealment.c'],
|
||||
|
@ -123,6 +132,7 @@ files = {
|
|||
'vp8/encoder/bitstream.c',
|
||||
'vp8/encoder/onyx_if.c',
|
||||
'vp8/vp8_dx_iface.c',
|
||||
'vp9/common/generic/vp9_systemdependent.c',
|
||||
'vp9/common/vp9_alloccommon.c',
|
||||
'vp9/common/vp9_blockd.c',
|
||||
'vp9/common/vp9_common_data.c',
|
||||
|
@ -146,48 +156,39 @@ files = {
|
|||
'vp9/common/vp9_scale.c',
|
||||
'vp9/common/vp9_scan.c',
|
||||
'vp9/common/vp9_seg_common.c',
|
||||
'vp9/common/vp9_thread.c',
|
||||
'vp9/common/vp9_tile_common.c',
|
||||
'vp9/decoder/vp9_decodeframe.c',
|
||||
'vp9/decoder/vp9_decodemv.c',
|
||||
'vp9/decoder/vp9_decoder.c',
|
||||
'vp9/decoder/vp9_detokenize.c',
|
||||
'vp9/decoder/vp9_dsubexp.c',
|
||||
'vp9/decoder/vp9_dthread.c',
|
||||
'vp9/decoder/vp9_onyxd_if.c',
|
||||
'vp9/decoder/vp9_reader.c',
|
||||
'vp9/encoder/vp9_aq_complexity.c',
|
||||
'vp9/encoder/vp9_aq_cyclicrefresh.c',
|
||||
'vp9/encoder/vp9_aq_variance.c',
|
||||
'vp9/encoder/vp9_bitstream.c',
|
||||
'vp9/encoder/vp9_context_tree.c',
|
||||
'vp9/encoder/vp9_cost.c',
|
||||
'vp9/encoder/vp9_dct.c',
|
||||
'vp9/encoder/vp9_encodeframe.c',
|
||||
'vp9/encoder/vp9_encodemb.c',
|
||||
'vp9/encoder/vp9_encodemv.c',
|
||||
'vp9/encoder/vp9_encoder.c',
|
||||
'vp9/encoder/vp9_extend.c',
|
||||
'vp9/encoder/vp9_firstpass.c',
|
||||
'vp9/encoder/vp9_lookahead.c',
|
||||
'vp9/encoder/vp9_mbgraph.c',
|
||||
'vp9/encoder/vp9_mcomp.c',
|
||||
'vp9/encoder/vp9_onyx_if.c',
|
||||
'vp9/encoder/vp9_picklpf.c',
|
||||
'vp9/encoder/vp9_pickmode.c',
|
||||
'vp9/encoder/vp9_quantize.c',
|
||||
'vp9/encoder/vp9_ratectrl.c',
|
||||
'vp9/encoder/vp9_rd.c',
|
||||
'vp9/encoder/vp9_rdopt.c',
|
||||
'vp9/encoder/vp9_resize.c',
|
||||
'vp9/encoder/vp9_sad.c',
|
||||
'vp9/encoder/vp9_segmentation.c',
|
||||
'vp9/encoder/vp9_speed_features.c',
|
||||
'vp9/encoder/vp9_subexp.c',
|
||||
'vp9/encoder/vp9_svc_layercontext.c',
|
||||
'vp9/encoder/vp9_temporal_filter.c',
|
||||
'vp9/encoder/vp9_tokenize.c',
|
||||
'vp9/encoder/vp9_treewriter.c',
|
||||
'vp9/encoder/vp9_vaq.c',
|
||||
'vp9/encoder/vp9_variance.c',
|
||||
'vp9/encoder/vp9_write_bit_buffer.c',
|
||||
'vp9/encoder/vp9_writer.c',
|
||||
'vp9/vp9_cx_iface.c',
|
||||
'vp9/vp9_dx_iface.c',
|
||||
|
@ -242,6 +243,7 @@ files = {
|
|||
'vp8/encoder/mr_dissim.c',
|
||||
'vp8/encoder/pickinter.c',
|
||||
'vp8/encoder/picklpf.c',
|
||||
'vp8/encoder/psnr.c',
|
||||
'vp8/encoder/quantize.c',
|
||||
'vp8/encoder/ratectrl.c',
|
||||
'vp8/encoder/rdopt.c',
|
||||
|
@ -250,20 +252,18 @@ files = {
|
|||
'vp8/encoder/tokenize.c',
|
||||
'vp8/encoder/treewriter.c',
|
||||
'vp8/vp8_cx_iface.c',
|
||||
'vp9/decoder/vp9_read_bit_buffer.c',
|
||||
'vp9/decoder/vp9_thread.c',
|
||||
'vp9/encoder/vp9_psnr.c',
|
||||
'vpx/src/vpx_codec.c',
|
||||
'vpx/src/vpx_decoder.c',
|
||||
'vpx/src/vpx_image.c',
|
||||
'vpx/src/vpx_psnr.c',
|
||||
'vpx_scale/generic/gen_scalers.c',
|
||||
'vpx_scale/generic/vpx_scale.c'],
|
||||
'VP8_POSTPROC': ['vp8/common/mfqe.c', 'vp8/common/postproc.c'],
|
||||
'VP9_POSTPROC': ['vp9/common/vp9_postproc.c'],
|
||||
'X86-64_ASM': ['third_party/x86inc/x86inc.asm',
|
||||
'vp8/common/x86/loopfilter_block_sse2_x86_64.asm',
|
||||
'vp8/encoder/x86/ssim_opt_x86_64.asm',
|
||||
'vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm',
|
||||
'vp9/encoder/x86/vp9_ssim_opt_x86_64.asm'],
|
||||
'vp8/common/x86/loopfilter_block_sse2.asm',
|
||||
'vp9/encoder/x86/vp9_quantize_ssse3.asm'],
|
||||
'X86_ASM': ['vp8/common/x86/dequantize_mmx.asm',
|
||||
'vp8/common/x86/filter_x86.c',
|
||||
'vp8/common/x86/idct_blk_mmx.c',
|
||||
|
@ -278,6 +278,7 @@ files = {
|
|||
'vp8/common/x86/mfqe_sse2.asm',
|
||||
'vp8/common/x86/postproc_mmx.asm',
|
||||
'vp8/common/x86/postproc_sse2.asm',
|
||||
'vp8/common/x86/postproc_x86.c',
|
||||
'vp8/common/x86/recon_mmx.asm',
|
||||
'vp8/common/x86/recon_sse2.asm',
|
||||
'vp8/common/x86/recon_wrapper_sse2.c',
|
||||
|
@ -303,8 +304,8 @@ files = {
|
|||
'vp8/encoder/x86/fwalsh_sse2.asm',
|
||||
'vp8/encoder/x86/quantize_mmx.asm',
|
||||
'vp8/encoder/x86/quantize_sse2.c',
|
||||
'vp8/encoder/x86/quantize_sse4.c',
|
||||
'vp8/encoder/x86/quantize_ssse3.c',
|
||||
'vp8/encoder/x86/quantize_sse4.asm',
|
||||
'vp8/encoder/x86/quantize_ssse3.asm',
|
||||
'vp8/encoder/x86/subtract_mmx.asm',
|
||||
'vp8/encoder/x86/subtract_sse2.asm',
|
||||
'vp8/encoder/x86/temporal_filter_apply_sse2.asm',
|
||||
|
@ -312,35 +313,31 @@ files = {
|
|||
'vp8/encoder/x86/vp8_enc_stubs_sse2.c',
|
||||
'vp9/common/x86/vp9_asm_stubs.c',
|
||||
'vp9/common/x86/vp9_copy_sse2.asm',
|
||||
'vp9/common/x86/vp9_high_intrapred_sse2.asm',
|
||||
'vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c',
|
||||
'vp9/common/x86/vp9_high_subpixel_8t_sse2.asm',
|
||||
'vp9/common/x86/vp9_high_subpixel_bilinear_sse2.asm',
|
||||
'vp9/common/x86/vp9_idct_intrin_sse2.c',
|
||||
'vp9/common/x86/vp9_idct_intrin_ssse3.c',
|
||||
'vp9/common/x86/vp9_idct_ssse3_x86_64.asm',
|
||||
'vp9/common/x86/vp9_intrapred_sse2.asm',
|
||||
'vp9/common/x86/vp9_intrapred_ssse3.asm',
|
||||
'vp9/common/x86/vp9_loopfilter_intrin_sse2.c',
|
||||
'vp9/common/x86/vp9_loopfilter_mmx.asm',
|
||||
'vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c',
|
||||
'vp9/common/x86/vp9_subpixel_8t_sse2.asm',
|
||||
'vp9/common/x86/vp9_subpixel_8t_ssse3.asm',
|
||||
'vp9/common/x86/vp9_subpixel_bilinear_sse2.asm',
|
||||
'vp9/common/x86/vp9_subpixel_bilinear_ssse3.asm',
|
||||
'vp9/encoder/x86/vp9_dct32x32_sse2.c',
|
||||
'vp9/encoder/x86/vp9_dct_mmx.asm',
|
||||
'vp9/encoder/x86/vp9_dct_sse2.c',
|
||||
'vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm',
|
||||
'vp9/encoder/x86/vp9_error_sse2.asm',
|
||||
'vp9/encoder/x86/vp9_sad4d_sse2.asm',
|
||||
'vp9/encoder/x86/vp9_sad_mmx.asm',
|
||||
'vp9/encoder/x86/vp9_sad_sse2.asm',
|
||||
'vp9/encoder/x86/vp9_sad_sse3.asm',
|
||||
'vp9/encoder/x86/vp9_sad_sse4.asm',
|
||||
'vp9/encoder/x86/vp9_sad_ssse3.asm',
|
||||
'vp9/encoder/x86/vp9_subpel_variance.asm',
|
||||
'vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm',
|
||||
'vp9/encoder/x86/vp9_subtract_sse2.asm',
|
||||
'vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm',
|
||||
'vp9/encoder/x86/vp9_variance_impl_mmx.asm',
|
||||
'vp9/encoder/x86/vp9_variance_impl_sse2.asm',
|
||||
'vp9/encoder/x86/vp9_variance_mmx.c',
|
||||
'vp9/encoder/x86/vp9_variance_sse2.c',
|
||||
'vpx_ports/emms.asm']
|
||||
}
|
||||
|
|
|
@ -234,10 +234,10 @@ ALIGNMODE k7
|
|||
%define r%1mp %2
|
||||
%elif ARCH_X86_64 ; memory
|
||||
%define r%1m [rsp + stack_offset + %6]
|
||||
%define r%1mp qword r %+ %1 %+ m
|
||||
%define r%1mp qword r %+ %1m
|
||||
%else
|
||||
%define r%1m [esp + stack_offset + %6]
|
||||
%define r%1mp dword r %+ %1 %+ m
|
||||
%define r%1mp dword r %+ %1m
|
||||
%endif
|
||||
%define r%1 %2
|
||||
%endmacro
|
||||
|
@ -395,23 +395,6 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
|
|||
%assign n_arg_names %0
|
||||
%endmacro
|
||||
|
||||
%if ARCH_X86_64
|
||||
%macro ALLOC_STACK 2 ; stack_size, num_regs
|
||||
%assign %%stack_aligment ((mmsize + 15) & ~15)
|
||||
%assign stack_size_padded %1
|
||||
|
||||
%assign %%reg_num (%2 - 1)
|
||||
%xdefine rsp_tmp r %+ %%reg_num
|
||||
mov rsp_tmp, rsp
|
||||
sub rsp, stack_size_padded
|
||||
and rsp, ~(%%stack_aligment - 1)
|
||||
%endmacro
|
||||
|
||||
%macro RESTORE_STACK 0 ; reset rsp register
|
||||
mov rsp, rsp_tmp
|
||||
%endmacro
|
||||
%endif
|
||||
|
||||
%if WIN64 ; Windows x64 ;=================================================
|
||||
|
||||
DECLARE_REG 0, rcx, ecx, cx, cl
|
||||
|
@ -609,20 +592,16 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
|
|||
CAT_XDEFINE cglobaled_, %1, 1
|
||||
%endif
|
||||
%xdefine current_function %1
|
||||
%ifdef CHROMIUM
|
||||
%ifidn __OUTPUT_FORMAT__,elf
|
||||
global %1:function hidden
|
||||
%elifidn __OUTPUT_FORMAT__,elf32
|
||||
global %1:function hidden
|
||||
%elifidn __OUTPUT_FORMAT__,elf64
|
||||
global %1:function hidden
|
||||
%elifidn __OUTPUT_FORMAT__,macho32
|
||||
global %1:private_extern
|
||||
%elifidn __OUTPUT_FORMAT__,macho64
|
||||
global %1:private_extern
|
||||
%else
|
||||
global %1
|
||||
%endif
|
||||
%ifidn __OUTPUT_FORMAT__,elf
|
||||
global %1:function hidden
|
||||
%elifidn __OUTPUT_FORMAT__,elf32
|
||||
global %1:function hidden
|
||||
%elifidn __OUTPUT_FORMAT__,elf64
|
||||
global %1:function hidden
|
||||
%elifidn __OUTPUT_FORMAT__,macho32
|
||||
global %1:private_extern
|
||||
%elifidn __OUTPUT_FORMAT__,macho64
|
||||
global %1:private_extern
|
||||
%else
|
||||
global %1
|
||||
%endif
|
||||
|
|
|
@ -53,7 +53,7 @@ loop
|
|||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
; calculate total sum
|
||||
adds r8, r8, r4 ; add positive differences to sum
|
||||
subs r8, r8, r5 ; subtract negative differences from sum
|
||||
subs r8, r8, r5 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
|
@ -77,7 +77,7 @@ loop
|
|||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; subtract negative differences from sum
|
||||
sub r8, r8, r5 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
|
@ -101,7 +101,7 @@ loop
|
|||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; subtract negative differences from sum
|
||||
sub r8, r8, r5 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
|
@ -127,7 +127,7 @@ loop
|
|||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; subtract negative differences from sum
|
||||
sub r8, r8, r5 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
|
|
|
@ -51,7 +51,7 @@ loop
|
|||
orr r8, r8, r10 ; differences of all 4 pixels
|
||||
; calculate total sum
|
||||
add r4, r4, r6 ; add positive differences to sum
|
||||
sub r4, r4, r7 ; subtract negative differences from sum
|
||||
sub r4, r4, r7 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r7, r8 ; byte (two pixels) to halfwords
|
||||
|
@ -77,7 +77,7 @@ loop
|
|||
|
||||
; calculate total sum
|
||||
add r4, r4, r6 ; add positive differences to sum
|
||||
sub r4, r4, r7 ; subtract negative differences from sum
|
||||
sub r4, r4, r7 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r7, r8 ; byte (two pixels) to halfwords
|
||||
|
|
|
@ -58,7 +58,7 @@ loop
|
|||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
; calculate total sum
|
||||
adds r8, r8, r4 ; add positive differences to sum
|
||||
subs r8, r8, r5 ; subtract negative differences from sum
|
||||
subs r8, r8, r5 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
|
@ -89,7 +89,7 @@ loop
|
|||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; subtract negative differences from sum
|
||||
sub r8, r8, r5 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
|
@ -120,7 +120,7 @@ loop
|
|||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; subtract negative differences from sum
|
||||
sub r8, r8, r5 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
|
@ -153,7 +153,7 @@ loop
|
|||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; subtract negative differences from sum
|
||||
sub r8, r8, r5 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
|
|
|
@ -69,7 +69,7 @@ loop
|
|||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
; calculate total sum
|
||||
adds r8, r8, r4 ; add positive differences to sum
|
||||
subs r8, r8, r5 ; subtract negative differences from sum
|
||||
subs r8, r8, r5 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
|
@ -111,7 +111,7 @@ loop
|
|||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; subtract negative differences from sum
|
||||
sub r8, r8, r5 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
|
@ -153,7 +153,7 @@ loop
|
|||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; subtract negative differences from sum
|
||||
sub r8, r8, r5 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
|
@ -195,7 +195,7 @@ loop
|
|||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; subtract negative differences from sum
|
||||
sub r8, r8, r5 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
|
|
|
@ -59,7 +59,7 @@ loop
|
|||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
; calculate total sum
|
||||
adds r8, r8, r4 ; add positive differences to sum
|
||||
subs r8, r8, r5 ; subtract negative differences from sum
|
||||
subs r8, r8, r5 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
|
@ -90,7 +90,7 @@ loop
|
|||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; subtract negative differences from sum
|
||||
sub r8, r8, r5 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
|
@ -121,7 +121,7 @@ loop
|
|||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; subtract negative differences from sum
|
||||
sub r8, r8, r5 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
|
@ -154,7 +154,7 @@ loop
|
|||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; subtract negative differences from sum
|
||||
sub r8, r8, r5 ; substract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
|
|
|
@ -12,9 +12,26 @@
|
|||
#include "vpx_config.h"
|
||||
#include "vp8/common/blockd.h"
|
||||
|
||||
#if HAVE_NEON
|
||||
extern void vp8_dequantize_b_loop_neon(short *Q, short *DQC, short *DQ);
|
||||
#endif
|
||||
|
||||
#if HAVE_MEDIA
|
||||
extern void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
|
||||
#endif
|
||||
|
||||
#if HAVE_NEON
|
||||
|
||||
void vp8_dequantize_b_neon(BLOCKD *d, short *DQC)
|
||||
{
|
||||
short *DQ = d->dqcoeff;
|
||||
short *Q = d->qcoeff;
|
||||
|
||||
vp8_dequantize_b_loop_neon(Q, DQC, DQ);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if HAVE_MEDIA
|
||||
void vp8_dequantize_b_v6(BLOCKD *d, short *DQC)
|
||||
{
|
||||
short *DQ = d->dqcoeff;
|
||||
|
|
|
@ -34,11 +34,11 @@ typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
|
|||
|
||||
extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon;
|
||||
extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon;
|
||||
extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
|
||||
extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;
|
||||
|
||||
extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
|
||||
extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
|
||||
|
||||
extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
|
||||
extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;
|
||||
extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
|
||||
extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon;
|
||||
#endif
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
|
||||
#include <arm_neon.h>
|
||||
|
||||
static const uint8_t bifilter4_coeff[8][2] = {
|
||||
static const uint16_t bifilter4_coeff[8][2] = {
|
||||
{128, 0},
|
||||
{112, 16},
|
||||
{ 96, 32},
|
||||
|
@ -30,6 +30,7 @@ void vp8_bilinear_predict4x4_neon(
|
|||
int dst_pitch) {
|
||||
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8;
|
||||
uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8;
|
||||
uint32x2_t d28u32, d29u32, d30u32;
|
||||
uint8x16_t q1u8, q2u8;
|
||||
uint16x8_t q1u16, q2u16;
|
||||
uint16x8_t q7u16, q8u16, q9u16;
|
||||
|
@ -38,10 +39,6 @@ void vp8_bilinear_predict4x4_neon(
|
|||
uint32x2x2_t d0u32x2, d1u32x2, d2u32x2, d3u32x2;
|
||||
|
||||
if (xoffset == 0) { // skip_1stpass_filter
|
||||
uint32x2_t d28u32 = vdup_n_u32(0);
|
||||
uint32x2_t d29u32 = vdup_n_u32(0);
|
||||
uint32x2_t d30u32 = vdup_n_u32(0);
|
||||
|
||||
d28u32 = vld1_lane_u32((const uint32_t *)src_ptr, d28u32, 0);
|
||||
src_ptr += src_pixels_per_line;
|
||||
d28u32 = vld1_lane_u32((const uint32_t *)src_ptr, d28u32, 1);
|
||||
|
@ -64,8 +61,8 @@ void vp8_bilinear_predict4x4_neon(
|
|||
q1u8 = vcombine_u8(d2u8, d3u8);
|
||||
q2u8 = vcombine_u8(d4u8, d5u8);
|
||||
|
||||
d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
|
||||
d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
|
||||
d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]);
|
||||
d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]);
|
||||
|
||||
q4u64 = vshrq_n_u64(vreinterpretq_u64_u8(q1u8), 8);
|
||||
q5u64 = vshrq_n_u64(vreinterpretq_u64_u8(q2u8), 8);
|
||||
|
@ -155,8 +152,8 @@ void vp8_bilinear_predict8x4_neon(
|
|||
q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
|
||||
q5u8 = vld1q_u8(src_ptr);
|
||||
|
||||
d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
|
||||
d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
|
||||
d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]);
|
||||
d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]);
|
||||
|
||||
q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
|
||||
q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
|
||||
|
@ -245,8 +242,8 @@ void vp8_bilinear_predict8x8_neon(
|
|||
q3u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
|
||||
q4u8 = vld1q_u8(src_ptr); src_ptr += src_pixels_per_line;
|
||||
|
||||
d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
|
||||
d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
|
||||
d0u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][0]);
|
||||
d1u8 = vdup_n_u8((uint8_t)bifilter4_coeff[xoffset][1]);
|
||||
|
||||
q6u16 = vmull_u8(vget_low_u8(q1u8), d0u8);
|
||||
q7u16 = vmull_u8(vget_low_u8(q2u8), d0u8);
|
||||
|
|
|
@ -0,0 +1,584 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_build_intra_predictors_mby_neon_func|
|
||||
EXPORT |vp8_build_intra_predictors_mby_s_neon_func|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
; r0 unsigned char *y_buffer
|
||||
; r1 unsigned char *ypred_ptr
|
||||
; r2 int y_stride
|
||||
; r3 int mode
|
||||
; stack int Up
|
||||
; stack int Left
|
||||
|
||||
|vp8_build_intra_predictors_mby_neon_func| PROC
|
||||
push {r4-r8, lr}
|
||||
|
||||
cmp r3, #0
|
||||
beq case_dc_pred
|
||||
cmp r3, #1
|
||||
beq case_v_pred
|
||||
cmp r3, #2
|
||||
beq case_h_pred
|
||||
cmp r3, #3
|
||||
beq case_tm_pred
|
||||
|
||||
case_dc_pred
|
||||
ldr r4, [sp, #24] ; Up
|
||||
ldr r5, [sp, #28] ; Left
|
||||
|
||||
; Default the DC average to 128
|
||||
mov r12, #128
|
||||
vdup.u8 q0, r12
|
||||
|
||||
; Zero out running sum
|
||||
mov r12, #0
|
||||
|
||||
; compute shift and jump
|
||||
adds r7, r4, r5
|
||||
beq skip_dc_pred_up_left
|
||||
|
||||
; Load above row, if it exists
|
||||
cmp r4, #0
|
||||
beq skip_dc_pred_up
|
||||
|
||||
sub r6, r0, r2
|
||||
vld1.8 {q1}, [r6]
|
||||
vpaddl.u8 q2, q1
|
||||
vpaddl.u16 q3, q2
|
||||
vpaddl.u32 q4, q3
|
||||
|
||||
vmov.32 r4, d8[0]
|
||||
vmov.32 r6, d9[0]
|
||||
|
||||
add r12, r4, r6
|
||||
|
||||
; Move back to interger registers
|
||||
|
||||
skip_dc_pred_up
|
||||
|
||||
cmp r5, #0
|
||||
beq skip_dc_pred_left
|
||||
|
||||
sub r0, r0, #1
|
||||
|
||||
; Load left row, if it exists
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0], r2
|
||||
|
||||
add r12, r12, r3
|
||||
add r12, r12, r4
|
||||
add r12, r12, r5
|
||||
add r12, r12, r6
|
||||
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0], r2
|
||||
|
||||
add r12, r12, r3
|
||||
add r12, r12, r4
|
||||
add r12, r12, r5
|
||||
add r12, r12, r6
|
||||
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0], r2
|
||||
|
||||
add r12, r12, r3
|
||||
add r12, r12, r4
|
||||
add r12, r12, r5
|
||||
add r12, r12, r6
|
||||
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0]
|
||||
|
||||
add r12, r12, r3
|
||||
add r12, r12, r4
|
||||
add r12, r12, r5
|
||||
add r12, r12, r6
|
||||
|
||||
skip_dc_pred_left
|
||||
add r7, r7, #3 ; Shift
|
||||
sub r4, r7, #1
|
||||
mov r5, #1
|
||||
add r12, r12, r5, lsl r4
|
||||
mov r5, r12, lsr r7 ; expected_dc
|
||||
|
||||
vdup.u8 q0, r5
|
||||
|
||||
skip_dc_pred_up_left
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
|
||||
pop {r4-r8,pc}
|
||||
case_v_pred
|
||||
; Copy down above row
|
||||
sub r6, r0, r2
|
||||
vld1.8 {q0}, [r6]
|
||||
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q0}, [r1]!
|
||||
pop {r4-r8,pc}
|
||||
|
||||
case_h_pred
|
||||
; Load 4x yleft_col
|
||||
sub r0, r0, #1
|
||||
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0], r2
|
||||
vdup.u8 q0, r3
|
||||
vdup.u8 q1, r4
|
||||
vdup.u8 q2, r5
|
||||
vdup.u8 q3, r6
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q1}, [r1]!
|
||||
vst1.u8 {q2}, [r1]!
|
||||
vst1.u8 {q3}, [r1]!
|
||||
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0], r2
|
||||
vdup.u8 q0, r3
|
||||
vdup.u8 q1, r4
|
||||
vdup.u8 q2, r5
|
||||
vdup.u8 q3, r6
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q1}, [r1]!
|
||||
vst1.u8 {q2}, [r1]!
|
||||
vst1.u8 {q3}, [r1]!
|
||||
|
||||
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0], r2
|
||||
vdup.u8 q0, r3
|
||||
vdup.u8 q1, r4
|
||||
vdup.u8 q2, r5
|
||||
vdup.u8 q3, r6
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q1}, [r1]!
|
||||
vst1.u8 {q2}, [r1]!
|
||||
vst1.u8 {q3}, [r1]!
|
||||
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0], r2
|
||||
vdup.u8 q0, r3
|
||||
vdup.u8 q1, r4
|
||||
vdup.u8 q2, r5
|
||||
vdup.u8 q3, r6
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q1}, [r1]!
|
||||
vst1.u8 {q2}, [r1]!
|
||||
vst1.u8 {q3}, [r1]!
|
||||
|
||||
pop {r4-r8,pc}
|
||||
|
||||
case_tm_pred
|
||||
; Load yabove_row
|
||||
sub r3, r0, r2
|
||||
vld1.8 {q8}, [r3]
|
||||
|
||||
; Load ytop_left
|
||||
sub r3, r3, #1
|
||||
ldrb r7, [r3]
|
||||
|
||||
vdup.u16 q7, r7
|
||||
|
||||
; Compute yabove_row - ytop_left
|
||||
mov r3, #1
|
||||
vdup.u8 q0, r3
|
||||
|
||||
vmull.u8 q4, d16, d0
|
||||
vmull.u8 q5, d17, d0
|
||||
|
||||
vsub.s16 q4, q4, q7
|
||||
vsub.s16 q5, q5, q7
|
||||
|
||||
; Load 4x yleft_col
|
||||
sub r0, r0, #1
|
||||
mov r12, #4
|
||||
|
||||
case_tm_pred_loop
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0], r2
|
||||
vdup.u16 q0, r3
|
||||
vdup.u16 q1, r4
|
||||
vdup.u16 q2, r5
|
||||
vdup.u16 q3, r6
|
||||
|
||||
vqadd.s16 q8, q0, q4
|
||||
vqadd.s16 q9, q0, q5
|
||||
|
||||
vqadd.s16 q10, q1, q4
|
||||
vqadd.s16 q11, q1, q5
|
||||
|
||||
vqadd.s16 q12, q2, q4
|
||||
vqadd.s16 q13, q2, q5
|
||||
|
||||
vqadd.s16 q14, q3, q4
|
||||
vqadd.s16 q15, q3, q5
|
||||
|
||||
vqshrun.s16 d0, q8, #0
|
||||
vqshrun.s16 d1, q9, #0
|
||||
|
||||
vqshrun.s16 d2, q10, #0
|
||||
vqshrun.s16 d3, q11, #0
|
||||
|
||||
vqshrun.s16 d4, q12, #0
|
||||
vqshrun.s16 d5, q13, #0
|
||||
|
||||
vqshrun.s16 d6, q14, #0
|
||||
vqshrun.s16 d7, q15, #0
|
||||
|
||||
vst1.u8 {q0}, [r1]!
|
||||
vst1.u8 {q1}, [r1]!
|
||||
vst1.u8 {q2}, [r1]!
|
||||
vst1.u8 {q3}, [r1]!
|
||||
|
||||
subs r12, r12, #1
|
||||
bne case_tm_pred_loop
|
||||
|
||||
pop {r4-r8,pc}
|
||||
|
||||
ENDP
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; r0 unsigned char *y_buffer
|
||||
; r1 unsigned char *ypred_ptr
|
||||
; r2 int y_stride
|
||||
; r3 int mode
|
||||
; stack int Up
|
||||
; stack int Left
|
||||
|
||||
|vp8_build_intra_predictors_mby_s_neon_func| PROC
|
||||
push {r4-r8, lr}
|
||||
|
||||
mov r1, r0 ; unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor;
|
||||
|
||||
cmp r3, #0
|
||||
beq case_dc_pred_s
|
||||
cmp r3, #1
|
||||
beq case_v_pred_s
|
||||
cmp r3, #2
|
||||
beq case_h_pred_s
|
||||
cmp r3, #3
|
||||
beq case_tm_pred_s
|
||||
|
||||
case_dc_pred_s
|
||||
ldr r4, [sp, #24] ; Up
|
||||
ldr r5, [sp, #28] ; Left
|
||||
|
||||
; Default the DC average to 128
|
||||
mov r12, #128
|
||||
vdup.u8 q0, r12
|
||||
|
||||
; Zero out running sum
|
||||
mov r12, #0
|
||||
|
||||
; compute shift and jump
|
||||
adds r7, r4, r5
|
||||
beq skip_dc_pred_up_left_s
|
||||
|
||||
; Load above row, if it exists
|
||||
cmp r4, #0
|
||||
beq skip_dc_pred_up_s
|
||||
|
||||
sub r6, r0, r2
|
||||
vld1.8 {q1}, [r6]
|
||||
vpaddl.u8 q2, q1
|
||||
vpaddl.u16 q3, q2
|
||||
vpaddl.u32 q4, q3
|
||||
|
||||
vmov.32 r4, d8[0]
|
||||
vmov.32 r6, d9[0]
|
||||
|
||||
add r12, r4, r6
|
||||
|
||||
; Move back to interger registers
|
||||
|
||||
skip_dc_pred_up_s
|
||||
|
||||
cmp r5, #0
|
||||
beq skip_dc_pred_left_s
|
||||
|
||||
sub r0, r0, #1
|
||||
|
||||
; Load left row, if it exists
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0], r2
|
||||
|
||||
add r12, r12, r3
|
||||
add r12, r12, r4
|
||||
add r12, r12, r5
|
||||
add r12, r12, r6
|
||||
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0], r2
|
||||
|
||||
add r12, r12, r3
|
||||
add r12, r12, r4
|
||||
add r12, r12, r5
|
||||
add r12, r12, r6
|
||||
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0], r2
|
||||
|
||||
add r12, r12, r3
|
||||
add r12, r12, r4
|
||||
add r12, r12, r5
|
||||
add r12, r12, r6
|
||||
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0]
|
||||
|
||||
add r12, r12, r3
|
||||
add r12, r12, r4
|
||||
add r12, r12, r5
|
||||
add r12, r12, r6
|
||||
|
||||
skip_dc_pred_left_s
|
||||
add r7, r7, #3 ; Shift
|
||||
sub r4, r7, #1
|
||||
mov r5, #1
|
||||
add r12, r12, r5, lsl r4
|
||||
mov r5, r12, lsr r7 ; expected_dc
|
||||
|
||||
vdup.u8 q0, r5
|
||||
|
||||
skip_dc_pred_up_left_s
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
|
||||
pop {r4-r8,pc}
|
||||
case_v_pred_s
|
||||
; Copy down above row
|
||||
sub r6, r0, r2
|
||||
vld1.8 {q0}, [r6]
|
||||
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
pop {r4-r8,pc}
|
||||
|
||||
case_h_pred_s
|
||||
; Load 4x yleft_col
|
||||
sub r0, r0, #1
|
||||
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0], r2
|
||||
vdup.u8 q0, r3
|
||||
vdup.u8 q1, r4
|
||||
vdup.u8 q2, r5
|
||||
vdup.u8 q3, r6
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q1}, [r1], r2
|
||||
vst1.u8 {q2}, [r1], r2
|
||||
vst1.u8 {q3}, [r1], r2
|
||||
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0], r2
|
||||
vdup.u8 q0, r3
|
||||
vdup.u8 q1, r4
|
||||
vdup.u8 q2, r5
|
||||
vdup.u8 q3, r6
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q1}, [r1], r2
|
||||
vst1.u8 {q2}, [r1], r2
|
||||
vst1.u8 {q3}, [r1], r2
|
||||
|
||||
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0], r2
|
||||
vdup.u8 q0, r3
|
||||
vdup.u8 q1, r4
|
||||
vdup.u8 q2, r5
|
||||
vdup.u8 q3, r6
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q1}, [r1], r2
|
||||
vst1.u8 {q2}, [r1], r2
|
||||
vst1.u8 {q3}, [r1], r2
|
||||
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0], r2
|
||||
vdup.u8 q0, r3
|
||||
vdup.u8 q1, r4
|
||||
vdup.u8 q2, r5
|
||||
vdup.u8 q3, r6
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q1}, [r1], r2
|
||||
vst1.u8 {q2}, [r1], r2
|
||||
vst1.u8 {q3}, [r1], r2
|
||||
|
||||
pop {r4-r8,pc}
|
||||
|
||||
case_tm_pred_s
|
||||
; Load yabove_row
|
||||
sub r3, r0, r2
|
||||
vld1.8 {q8}, [r3]
|
||||
|
||||
; Load ytop_left
|
||||
sub r3, r3, #1
|
||||
ldrb r7, [r3]
|
||||
|
||||
vdup.u16 q7, r7
|
||||
|
||||
; Compute yabove_row - ytop_left
|
||||
mov r3, #1
|
||||
vdup.u8 q0, r3
|
||||
|
||||
vmull.u8 q4, d16, d0
|
||||
vmull.u8 q5, d17, d0
|
||||
|
||||
vsub.s16 q4, q4, q7
|
||||
vsub.s16 q5, q5, q7
|
||||
|
||||
; Load 4x yleft_col
|
||||
sub r0, r0, #1
|
||||
mov r12, #4
|
||||
|
||||
case_tm_pred_loop_s
|
||||
ldrb r3, [r0], r2
|
||||
ldrb r4, [r0], r2
|
||||
ldrb r5, [r0], r2
|
||||
ldrb r6, [r0], r2
|
||||
vdup.u16 q0, r3
|
||||
vdup.u16 q1, r4
|
||||
vdup.u16 q2, r5
|
||||
vdup.u16 q3, r6
|
||||
|
||||
vqadd.s16 q8, q0, q4
|
||||
vqadd.s16 q9, q0, q5
|
||||
|
||||
vqadd.s16 q10, q1, q4
|
||||
vqadd.s16 q11, q1, q5
|
||||
|
||||
vqadd.s16 q12, q2, q4
|
||||
vqadd.s16 q13, q2, q5
|
||||
|
||||
vqadd.s16 q14, q3, q4
|
||||
vqadd.s16 q15, q3, q5
|
||||
|
||||
vqshrun.s16 d0, q8, #0
|
||||
vqshrun.s16 d1, q9, #0
|
||||
|
||||
vqshrun.s16 d2, q10, #0
|
||||
vqshrun.s16 d3, q11, #0
|
||||
|
||||
vqshrun.s16 d4, q12, #0
|
||||
vqshrun.s16 d5, q13, #0
|
||||
|
||||
vqshrun.s16 d6, q14, #0
|
||||
vqshrun.s16 d7, q15, #0
|
||||
|
||||
vst1.u8 {q0}, [r1], r2
|
||||
vst1.u8 {q1}, [r1], r2
|
||||
vst1.u8 {q2}, [r1], r2
|
||||
vst1.u8 {q3}, [r1], r2
|
||||
|
||||
subs r12, r12, #1
|
||||
bne case_tm_pred_loop_s
|
||||
|
||||
pop {r4-r8,pc}
|
||||
|
||||
ENDP
|
||||
|
||||
|
||||
END
|
|
@ -0,0 +1,59 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_copy_mem16x16_neon|
|
||||
; ARM
|
||||
; REQUIRE8
|
||||
; PRESERVE8
|
||||
|
||||
AREA Block, CODE, READONLY ; name this block of code
|
||||
;void copy_mem16x16_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
|vp8_copy_mem16x16_neon| PROC
|
||||
|
||||
vld1.u8 {q0}, [r0], r1
|
||||
vld1.u8 {q1}, [r0], r1
|
||||
vld1.u8 {q2}, [r0], r1
|
||||
vst1.u8 {q0}, [r2], r3
|
||||
vld1.u8 {q3}, [r0], r1
|
||||
vst1.u8 {q1}, [r2], r3
|
||||
vld1.u8 {q4}, [r0], r1
|
||||
vst1.u8 {q2}, [r2], r3
|
||||
vld1.u8 {q5}, [r0], r1
|
||||
vst1.u8 {q3}, [r2], r3
|
||||
vld1.u8 {q6}, [r0], r1
|
||||
vst1.u8 {q4}, [r2], r3
|
||||
vld1.u8 {q7}, [r0], r1
|
||||
vst1.u8 {q5}, [r2], r3
|
||||
vld1.u8 {q8}, [r0], r1
|
||||
vst1.u8 {q6}, [r2], r3
|
||||
vld1.u8 {q9}, [r0], r1
|
||||
vst1.u8 {q7}, [r2], r3
|
||||
vld1.u8 {q10}, [r0], r1
|
||||
vst1.u8 {q8}, [r2], r3
|
||||
vld1.u8 {q11}, [r0], r1
|
||||
vst1.u8 {q9}, [r2], r3
|
||||
vld1.u8 {q12}, [r0], r1
|
||||
vst1.u8 {q10}, [r2], r3
|
||||
vld1.u8 {q13}, [r0], r1
|
||||
vst1.u8 {q11}, [r2], r3
|
||||
vld1.u8 {q14}, [r0], r1
|
||||
vst1.u8 {q12}, [r2], r3
|
||||
vld1.u8 {q15}, [r0], r1
|
||||
vst1.u8 {q13}, [r2], r3
|
||||
vst1.u8 {q14}, [r2], r3
|
||||
vst1.u8 {q15}, [r2], r3
|
||||
|
||||
mov pc, lr
|
||||
|
||||
ENDP ; |vp8_copy_mem16x16_neon|
|
||||
|
||||
END
|
|
@ -0,0 +1,34 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_copy_mem8x4_neon|
|
||||
; ARM
|
||||
; REQUIRE8
|
||||
; PRESERVE8
|
||||
|
||||
AREA Block, CODE, READONLY ; name this block of code
|
||||
;void copy_mem8x4_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
|vp8_copy_mem8x4_neon| PROC
|
||||
vld1.u8 {d0}, [r0], r1
|
||||
vld1.u8 {d1}, [r0], r1
|
||||
vst1.u8 {d0}, [r2], r3
|
||||
vld1.u8 {d2}, [r0], r1
|
||||
vst1.u8 {d1}, [r2], r3
|
||||
vld1.u8 {d3}, [r0], r1
|
||||
vst1.u8 {d2}, [r2], r3
|
||||
vst1.u8 {d3}, [r2], r3
|
||||
|
||||
mov pc, lr
|
||||
|
||||
ENDP ; |vp8_copy_mem8x4_neon|
|
||||
|
||||
END
|
|
@ -0,0 +1,43 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_copy_mem8x8_neon|
|
||||
; ARM
|
||||
; REQUIRE8
|
||||
; PRESERVE8
|
||||
|
||||
AREA Block, CODE, READONLY ; name this block of code
|
||||
;void copy_mem8x8_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
|vp8_copy_mem8x8_neon| PROC
|
||||
|
||||
vld1.u8 {d0}, [r0], r1
|
||||
vld1.u8 {d1}, [r0], r1
|
||||
vst1.u8 {d0}, [r2], r3
|
||||
vld1.u8 {d2}, [r0], r1
|
||||
vst1.u8 {d1}, [r2], r3
|
||||
vld1.u8 {d3}, [r0], r1
|
||||
vst1.u8 {d2}, [r2], r3
|
||||
vld1.u8 {d4}, [r0], r1
|
||||
vst1.u8 {d3}, [r2], r3
|
||||
vld1.u8 {d5}, [r0], r1
|
||||
vst1.u8 {d4}, [r2], r3
|
||||
vld1.u8 {d6}, [r0], r1
|
||||
vst1.u8 {d5}, [r2], r3
|
||||
vld1.u8 {d7}, [r0], r1
|
||||
vst1.u8 {d6}, [r2], r3
|
||||
vst1.u8 {d7}, [r2], r3
|
||||
|
||||
mov pc, lr
|
||||
|
||||
ENDP ; |vp8_copy_mem8x8_neon|
|
||||
|
||||
END
|
|
@ -1,59 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
void vp8_copy_mem8x4_neon(
|
||||
unsigned char *src,
|
||||
int src_stride,
|
||||
unsigned char *dst,
|
||||
int dst_stride) {
|
||||
uint8x8_t vtmp;
|
||||
int r;
|
||||
|
||||
for (r = 0; r < 4; r++) {
|
||||
vtmp = vld1_u8(src);
|
||||
vst1_u8(dst, vtmp);
|
||||
src += src_stride;
|
||||
dst += dst_stride;
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_copy_mem8x8_neon(
|
||||
unsigned char *src,
|
||||
int src_stride,
|
||||
unsigned char *dst,
|
||||
int dst_stride) {
|
||||
uint8x8_t vtmp;
|
||||
int r;
|
||||
|
||||
for (r = 0; r < 8; r++) {
|
||||
vtmp = vld1_u8(src);
|
||||
vst1_u8(dst, vtmp);
|
||||
src += src_stride;
|
||||
dst += dst_stride;
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_copy_mem16x16_neon(
|
||||
unsigned char *src,
|
||||
int src_stride,
|
||||
unsigned char *dst,
|
||||
int dst_stride) {
|
||||
int r;
|
||||
uint8x16_t qtmp;
|
||||
|
||||
for (r = 0; r < 16; r++) {
|
||||
qtmp = vld1q_u8(src);
|
||||
vst1q_u8(dst, qtmp);
|
||||
src += src_stride;
|
||||
dst += dst_stride;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,54 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license and patent
|
||||
; grant that can be found in the LICENSE file in the root of the source
|
||||
; tree. All contributing project authors may be found in the AUTHORS
|
||||
; file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_dc_only_idct_add_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
;void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
|
||||
; int pred_stride, unsigned char *dst_ptr,
|
||||
; int dst_stride)
|
||||
|
||||
; r0 input_dc
|
||||
; r1 pred_ptr
|
||||
; r2 pred_stride
|
||||
; r3 dst_ptr
|
||||
; sp dst_stride
|
||||
|
||||
|vp8_dc_only_idct_add_neon| PROC
|
||||
add r0, r0, #4
|
||||
asr r0, r0, #3
|
||||
ldr r12, [sp]
|
||||
vdup.16 q0, r0
|
||||
|
||||
vld1.32 {d2[0]}, [r1], r2
|
||||
vld1.32 {d2[1]}, [r1], r2
|
||||
vld1.32 {d4[0]}, [r1], r2
|
||||
vld1.32 {d4[1]}, [r1]
|
||||
|
||||
vaddw.u8 q1, q0, d2
|
||||
vaddw.u8 q2, q0, d4
|
||||
|
||||
vqmovun.s16 d2, q1
|
||||
vqmovun.s16 d4, q2
|
||||
|
||||
vst1.32 {d2[0]}, [r3], r12
|
||||
vst1.32 {d2[1]}, [r3], r12
|
||||
vst1.32 {d4[0]}, [r3], r12
|
||||
vst1.32 {d4[1]}, [r3]
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
END
|
|
@ -1,42 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
void vp8_dc_only_idct_add_neon(
|
||||
int16_t input_dc,
|
||||
unsigned char *pred_ptr,
|
||||
int pred_stride,
|
||||
unsigned char *dst_ptr,
|
||||
int dst_stride) {
|
||||
int i;
|
||||
uint16_t a1 = ((input_dc + 4) >> 3);
|
||||
uint32x2_t d2u32 = vdup_n_u32(0);
|
||||
uint8x8_t d2u8;
|
||||
uint16x8_t q1u16;
|
||||
uint16x8_t qAdd;
|
||||
|
||||
qAdd = vdupq_n_u16(a1);
|
||||
|
||||
for (i = 0; i < 2; i++) {
|
||||
d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 0);
|
||||
pred_ptr += pred_stride;
|
||||
d2u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d2u32, 1);
|
||||
pred_ptr += pred_stride;
|
||||
|
||||
q1u16 = vaddw_u8(qAdd, vreinterpret_u8_u32(d2u32));
|
||||
d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
|
||||
|
||||
vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0);
|
||||
dst_ptr += dst_stride;
|
||||
vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1);
|
||||
dst_ptr += dst_stride;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,131 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_dequant_idct_add_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
;void vp8_dequant_idct_add_neon(short *input, short *dq,
|
||||
; unsigned char *dest, int stride)
|
||||
; r0 short *input,
|
||||
; r1 short *dq,
|
||||
; r2 unsigned char *dest
|
||||
; r3 int stride
|
||||
|
||||
|vp8_dequant_idct_add_neon| PROC
|
||||
vld1.16 {q3, q4}, [r0]
|
||||
vld1.16 {q5, q6}, [r1]
|
||||
|
||||
add r1, r2, r3 ; r1 = dest + stride
|
||||
lsl r3, #1 ; 2x stride
|
||||
|
||||
vld1.32 {d14[0]}, [r2], r3
|
||||
vld1.32 {d14[1]}, [r1], r3
|
||||
vld1.32 {d15[0]}, [r2]
|
||||
vld1.32 {d15[1]}, [r1]
|
||||
|
||||
adr r12, cospi8sqrt2minus1 ; pointer to the first constant
|
||||
|
||||
vmul.i16 q1, q3, q5 ;input for short_idct4x4llm_neon
|
||||
vmul.i16 q2, q4, q6
|
||||
|
||||
;|short_idct4x4llm_neon| PROC
|
||||
vld1.16 {d0}, [r12]
|
||||
vswp d3, d4 ;q2(vp[4] vp[12])
|
||||
|
||||
vqdmulh.s16 q3, q2, d0[2]
|
||||
vqdmulh.s16 q4, q2, d0[0]
|
||||
|
||||
vqadd.s16 d12, d2, d3 ;a1
|
||||
vqsub.s16 d13, d2, d3 ;b1
|
||||
|
||||
vshr.s16 q3, q3, #1
|
||||
vshr.s16 q4, q4, #1
|
||||
|
||||
vqadd.s16 q3, q3, q2
|
||||
vqadd.s16 q4, q4, q2
|
||||
|
||||
vqsub.s16 d10, d6, d9 ;c1
|
||||
vqadd.s16 d11, d7, d8 ;d1
|
||||
|
||||
vqadd.s16 d2, d12, d11
|
||||
vqadd.s16 d3, d13, d10
|
||||
vqsub.s16 d4, d13, d10
|
||||
vqsub.s16 d5, d12, d11
|
||||
|
||||
vtrn.32 d2, d4
|
||||
vtrn.32 d3, d5
|
||||
vtrn.16 d2, d3
|
||||
vtrn.16 d4, d5
|
||||
|
||||
; memset(input, 0, 32) -- 32bytes
|
||||
vmov.i16 q14, #0
|
||||
|
||||
vswp d3, d4
|
||||
vqdmulh.s16 q3, q2, d0[2]
|
||||
vqdmulh.s16 q4, q2, d0[0]
|
||||
|
||||
vqadd.s16 d12, d2, d3 ;a1
|
||||
vqsub.s16 d13, d2, d3 ;b1
|
||||
|
||||
vmov q15, q14
|
||||
|
||||
vshr.s16 q3, q3, #1
|
||||
vshr.s16 q4, q4, #1
|
||||
|
||||
vqadd.s16 q3, q3, q2
|
||||
vqadd.s16 q4, q4, q2
|
||||
|
||||
vqsub.s16 d10, d6, d9 ;c1
|
||||
vqadd.s16 d11, d7, d8 ;d1
|
||||
|
||||
vqadd.s16 d2, d12, d11
|
||||
vqadd.s16 d3, d13, d10
|
||||
vqsub.s16 d4, d13, d10
|
||||
vqsub.s16 d5, d12, d11
|
||||
|
||||
vst1.16 {q14, q15}, [r0]
|
||||
|
||||
vrshr.s16 d2, d2, #3
|
||||
vrshr.s16 d3, d3, #3
|
||||
vrshr.s16 d4, d4, #3
|
||||
vrshr.s16 d5, d5, #3
|
||||
|
||||
vtrn.32 d2, d4
|
||||
vtrn.32 d3, d5
|
||||
vtrn.16 d2, d3
|
||||
vtrn.16 d4, d5
|
||||
|
||||
vaddw.u8 q1, q1, d14
|
||||
vaddw.u8 q2, q2, d15
|
||||
|
||||
sub r2, r2, r3
|
||||
sub r1, r1, r3
|
||||
|
||||
vqmovun.s16 d0, q1
|
||||
vqmovun.s16 d1, q2
|
||||
|
||||
vst1.32 {d0[0]}, [r2], r3
|
||||
vst1.32 {d0[1]}, [r1], r3
|
||||
vst1.32 {d1[0]}, [r2]
|
||||
vst1.32 {d1[1]}, [r1]
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP ; |vp8_dequant_idct_add_neon|
|
||||
|
||||
; Constant Pool
|
||||
cospi8sqrt2minus1 DCD 0x4e7b4e7b
|
||||
sinpi8sqrt2 DCD 0x8a8c8a8c
|
||||
|
||||
END
|
|
@ -1,142 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
static const int16_t cospi8sqrt2minus1 = 20091;
|
||||
static const int16_t sinpi8sqrt2 = 35468;
|
||||
|
||||
void vp8_dequant_idct_add_neon(
|
||||
int16_t *input,
|
||||
int16_t *dq,
|
||||
unsigned char *dst,
|
||||
int stride) {
|
||||
unsigned char *dst0;
|
||||
int32x2_t d14, d15;
|
||||
int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
|
||||
int16x8_t q1, q2, q3, q4, q5, q6;
|
||||
int16x8_t qEmpty = vdupq_n_s16(0);
|
||||
int32x2x2_t d2tmp0, d2tmp1;
|
||||
int16x4x2_t d2tmp2, d2tmp3;
|
||||
|
||||
d14 = d15 = vdup_n_s32(0);
|
||||
|
||||
// load input
|
||||
q3 = vld1q_s16(input);
|
||||
vst1q_s16(input, qEmpty);
|
||||
input += 8;
|
||||
q4 = vld1q_s16(input);
|
||||
vst1q_s16(input, qEmpty);
|
||||
|
||||
// load dq
|
||||
q5 = vld1q_s16(dq);
|
||||
dq += 8;
|
||||
q6 = vld1q_s16(dq);
|
||||
|
||||
// load src from dst
|
||||
dst0 = dst;
|
||||
d14 = vld1_lane_s32((const int32_t *)dst0, d14, 0);
|
||||
dst0 += stride;
|
||||
d14 = vld1_lane_s32((const int32_t *)dst0, d14, 1);
|
||||
dst0 += stride;
|
||||
d15 = vld1_lane_s32((const int32_t *)dst0, d15, 0);
|
||||
dst0 += stride;
|
||||
d15 = vld1_lane_s32((const int32_t *)dst0, d15, 1);
|
||||
|
||||
q1 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q3),
|
||||
vreinterpretq_u16_s16(q5)));
|
||||
q2 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q4),
|
||||
vreinterpretq_u16_s16(q6)));
|
||||
|
||||
d12 = vqadd_s16(vget_low_s16(q1), vget_low_s16(q2));
|
||||
d13 = vqsub_s16(vget_low_s16(q1), vget_low_s16(q2));
|
||||
|
||||
q2 = vcombine_s16(vget_high_s16(q1), vget_high_s16(q2));
|
||||
|
||||
q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
|
||||
q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
|
||||
|
||||
q3 = vshrq_n_s16(q3, 1);
|
||||
q4 = vshrq_n_s16(q4, 1);
|
||||
|
||||
q3 = vqaddq_s16(q3, q2);
|
||||
q4 = vqaddq_s16(q4, q2);
|
||||
|
||||
d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
|
||||
d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));
|
||||
|
||||
d2 = vqadd_s16(d12, d11);
|
||||
d3 = vqadd_s16(d13, d10);
|
||||
d4 = vqsub_s16(d13, d10);
|
||||
d5 = vqsub_s16(d12, d11);
|
||||
|
||||
d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
|
||||
d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
|
||||
d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
|
||||
vreinterpret_s16_s32(d2tmp1.val[0]));
|
||||
d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
|
||||
vreinterpret_s16_s32(d2tmp1.val[1]));
|
||||
|
||||
// loop 2
|
||||
q2 = vcombine_s16(d2tmp2.val[1], d2tmp3.val[1]);
|
||||
|
||||
q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
|
||||
q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
|
||||
|
||||
d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]);
|
||||
d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]);
|
||||
|
||||
q3 = vshrq_n_s16(q3, 1);
|
||||
q4 = vshrq_n_s16(q4, 1);
|
||||
|
||||
q3 = vqaddq_s16(q3, q2);
|
||||
q4 = vqaddq_s16(q4, q2);
|
||||
|
||||
d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
|
||||
d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));
|
||||
|
||||
d2 = vqadd_s16(d12, d11);
|
||||
d3 = vqadd_s16(d13, d10);
|
||||
d4 = vqsub_s16(d13, d10);
|
||||
d5 = vqsub_s16(d12, d11);
|
||||
|
||||
d2 = vrshr_n_s16(d2, 3);
|
||||
d3 = vrshr_n_s16(d3, 3);
|
||||
d4 = vrshr_n_s16(d4, 3);
|
||||
d5 = vrshr_n_s16(d5, 3);
|
||||
|
||||
d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
|
||||
d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
|
||||
d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
|
||||
vreinterpret_s16_s32(d2tmp1.val[0]));
|
||||
d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
|
||||
vreinterpret_s16_s32(d2tmp1.val[1]));
|
||||
|
||||
q1 = vcombine_s16(d2tmp2.val[0], d2tmp2.val[1]);
|
||||
q2 = vcombine_s16(d2tmp3.val[0], d2tmp3.val[1]);
|
||||
|
||||
q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1),
|
||||
vreinterpret_u8_s32(d14)));
|
||||
q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2),
|
||||
vreinterpret_u8_s32(d15)));
|
||||
|
||||
d14 = vreinterpret_s32_u8(vqmovun_s16(q1));
|
||||
d15 = vreinterpret_s32_u8(vqmovun_s16(q2));
|
||||
|
||||
dst0 = dst;
|
||||
vst1_lane_s32((int32_t *)dst0, d14, 0);
|
||||
dst0 += stride;
|
||||
vst1_lane_s32((int32_t *)dst0, d14, 1);
|
||||
dst0 += stride;
|
||||
vst1_lane_s32((int32_t *)dst0, d15, 0);
|
||||
dst0 += stride;
|
||||
vst1_lane_s32((int32_t *)dst0, d15, 1);
|
||||
return;
|
||||
}
|
|
@ -0,0 +1,34 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_dequantize_b_loop_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
; r0 short *Q,
|
||||
; r1 short *DQC
|
||||
; r2 short *DQ
|
||||
|vp8_dequantize_b_loop_neon| PROC
|
||||
vld1.16 {q0, q1}, [r0]
|
||||
vld1.16 {q2, q3}, [r1]
|
||||
|
||||
vmul.i16 q4, q0, q2
|
||||
vmul.i16 q5, q1, q3
|
||||
|
||||
vst1.16 {q4, q5}, [r2]
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
END
|
|
@ -1,25 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
#include "vp8/common/blockd.h"
|
||||
|
||||
void vp8_dequantize_b_neon(BLOCKD *d, short *DQC) {
|
||||
int16x8x2_t qQ, qDQC, qDQ;
|
||||
|
||||
qQ = vld2q_s16(d->qcoeff);
|
||||
qDQC = vld2q_s16(DQC);
|
||||
|
||||
qDQ.val[0] = vmulq_s16(qQ.val[0], qDQC.val[0]);
|
||||
qDQ.val[1] = vmulq_s16(qQ.val[1], qDQC.val[1]);
|
||||
|
||||
vst2q_s16(d->dqcoeff, qDQ);
|
||||
}
|
|
@ -0,0 +1,79 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license and patent
|
||||
; grant that can be found in the LICENSE file in the root of the source
|
||||
; tree. All contributing project authors may be found in the AUTHORS
|
||||
; file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |idct_dequant_0_2x_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
;void idct_dequant_0_2x_neon(short *q, short dq,
|
||||
; unsigned char *dst, int stride);
|
||||
; r0 *q
|
||||
; r1 dq
|
||||
; r2 *dst
|
||||
; r3 stride
|
||||
|idct_dequant_0_2x_neon| PROC
|
||||
push {r4, r5}
|
||||
|
||||
add r12, r2, #4
|
||||
vld1.32 {d2[0]}, [r2], r3
|
||||
vld1.32 {d8[0]}, [r12], r3
|
||||
vld1.32 {d2[1]}, [r2], r3
|
||||
vld1.32 {d8[1]}, [r12], r3
|
||||
vld1.32 {d4[0]}, [r2], r3
|
||||
vld1.32 {d10[0]}, [r12], r3
|
||||
vld1.32 {d4[1]}, [r2], r3
|
||||
vld1.32 {d10[1]}, [r12], r3
|
||||
|
||||
ldrh r12, [r0] ; lo q
|
||||
ldrh r4, [r0, #32] ; hi q
|
||||
mov r5, #0
|
||||
strh r5, [r0]
|
||||
strh r5, [r0, #32]
|
||||
|
||||
sxth r12, r12 ; lo
|
||||
mul r0, r12, r1
|
||||
add r0, r0, #4
|
||||
asr r0, r0, #3
|
||||
vdup.16 q0, r0
|
||||
sxth r4, r4 ; hi
|
||||
mul r0, r4, r1
|
||||
add r0, r0, #4
|
||||
asr r0, r0, #3
|
||||
vdup.16 q3, r0
|
||||
|
||||
vaddw.u8 q1, q0, d2 ; lo
|
||||
vaddw.u8 q2, q0, d4
|
||||
vaddw.u8 q4, q3, d8 ; hi
|
||||
vaddw.u8 q5, q3, d10
|
||||
|
||||
sub r2, r2, r3, lsl #2 ; dst - 4*stride
|
||||
add r0, r2, #4
|
||||
|
||||
vqmovun.s16 d2, q1 ; lo
|
||||
vqmovun.s16 d4, q2
|
||||
vqmovun.s16 d8, q4 ; hi
|
||||
vqmovun.s16 d10, q5
|
||||
|
||||
vst1.32 {d2[0]}, [r2], r3 ; lo
|
||||
vst1.32 {d8[0]}, [r0], r3 ; hi
|
||||
vst1.32 {d2[1]}, [r2], r3
|
||||
vst1.32 {d8[1]}, [r0], r3
|
||||
vst1.32 {d4[0]}, [r2], r3
|
||||
vst1.32 {d10[0]}, [r0], r3
|
||||
vst1.32 {d4[1]}, [r2]
|
||||
vst1.32 {d10[1]}, [r0]
|
||||
|
||||
pop {r4, r5}
|
||||
bx lr
|
||||
|
||||
ENDP ; |idct_dequant_0_2x_neon|
|
||||
END
|
|
@ -1,62 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
void idct_dequant_0_2x_neon(
|
||||
int16_t *q,
|
||||
int16_t dq,
|
||||
unsigned char *dst,
|
||||
int stride) {
|
||||
unsigned char *dst0;
|
||||
int i, a0, a1;
|
||||
int16x8x2_t q2Add;
|
||||
int32x2_t d2s32, d4s32;
|
||||
uint8x8_t d2u8, d4u8;
|
||||
uint16x8_t q1u16, q2u16;
|
||||
|
||||
a0 = ((q[0] * dq) + 4) >> 3;
|
||||
a1 = ((q[16] * dq) + 4) >> 3;
|
||||
q[0] = q[16] = 0;
|
||||
q2Add.val[0] = vdupq_n_s16((int16_t)a0);
|
||||
q2Add.val[1] = vdupq_n_s16((int16_t)a1);
|
||||
|
||||
for (i = 0; i < 2; i++, dst += 4) {
|
||||
dst0 = dst;
|
||||
d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0);
|
||||
dst0 += stride;
|
||||
d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1);
|
||||
dst0 += stride;
|
||||
d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0);
|
||||
dst0 += stride;
|
||||
d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1);
|
||||
|
||||
q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
|
||||
vreinterpret_u8_s32(d2s32));
|
||||
q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]),
|
||||
vreinterpret_u8_s32(d4s32));
|
||||
|
||||
d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
|
||||
d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
|
||||
|
||||
d2s32 = vreinterpret_s32_u8(d2u8);
|
||||
d4s32 = vreinterpret_s32_u8(d4u8);
|
||||
|
||||
dst0 = dst;
|
||||
vst1_lane_s32((int32_t *)dst0, d2s32, 0);
|
||||
dst0 += stride;
|
||||
vst1_lane_s32((int32_t *)dst0, d2s32, 1);
|
||||
dst0 += stride;
|
||||
vst1_lane_s32((int32_t *)dst0, d4s32, 0);
|
||||
dst0 += stride;
|
||||
vst1_lane_s32((int32_t *)dst0, d4s32, 1);
|
||||
}
|
||||
return;
|
||||
}
|
|
@ -0,0 +1,196 @@
|
|||
;
|
||||
; Copyright (c) 2010 The Webm project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |idct_dequant_full_2x_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
;void idct_dequant_full_2x_neon(short *q, short *dq,
|
||||
; unsigned char *dst, int stride);
|
||||
; r0 *q,
|
||||
; r1 *dq,
|
||||
; r2 *dst
|
||||
; r3 stride
|
||||
|idct_dequant_full_2x_neon| PROC
|
||||
vld1.16 {q0, q1}, [r1] ; dq (same l/r)
|
||||
vld1.16 {q2, q3}, [r0] ; l q
|
||||
add r0, r0, #32
|
||||
vld1.16 {q4, q5}, [r0] ; r q
|
||||
add r12, r2, #4
|
||||
|
||||
; interleave the predictors
|
||||
vld1.32 {d28[0]}, [r2], r3 ; l pre
|
||||
vld1.32 {d28[1]}, [r12], r3 ; r pre
|
||||
vld1.32 {d29[0]}, [r2], r3
|
||||
vld1.32 {d29[1]}, [r12], r3
|
||||
vld1.32 {d30[0]}, [r2], r3
|
||||
vld1.32 {d30[1]}, [r12], r3
|
||||
vld1.32 {d31[0]}, [r2], r3
|
||||
vld1.32 {d31[1]}, [r12]
|
||||
|
||||
adr r1, cospi8sqrt2minus1 ; pointer to the first constant
|
||||
|
||||
; dequant: q[i] = q[i] * dq[i]
|
||||
vmul.i16 q2, q2, q0
|
||||
vmul.i16 q3, q3, q1
|
||||
vmul.i16 q4, q4, q0
|
||||
vmul.i16 q5, q5, q1
|
||||
|
||||
vld1.16 {d0}, [r1]
|
||||
|
||||
; q2: l0r0 q3: l8r8
|
||||
; q4: l4r4 q5: l12r12
|
||||
vswp d5, d8
|
||||
vswp d7, d10
|
||||
|
||||
; _CONSTANTS_ * 4,12 >> 16
|
||||
; q6: 4 * sinpi : c1/temp1
|
||||
; q7: 12 * sinpi : d1/temp2
|
||||
; q8: 4 * cospi
|
||||
; q9: 12 * cospi
|
||||
vqdmulh.s16 q6, q4, d0[2] ; sinpi8sqrt2
|
||||
vqdmulh.s16 q7, q5, d0[2]
|
||||
vqdmulh.s16 q8, q4, d0[0] ; cospi8sqrt2minus1
|
||||
vqdmulh.s16 q9, q5, d0[0]
|
||||
|
||||
vqadd.s16 q10, q2, q3 ; a1 = 0 + 8
|
||||
vqsub.s16 q11, q2, q3 ; b1 = 0 - 8
|
||||
|
||||
; vqdmulh only accepts signed values. this was a problem because
|
||||
; our constant had the high bit set, and was treated as a negative value.
|
||||
; vqdmulh also doubles the value before it shifts by 16. we need to
|
||||
; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
|
||||
; so we can shift the constant without losing precision. this avoids
|
||||
; shift again afterward, but also avoids the sign issue. win win!
|
||||
; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
|
||||
; pre-shift it
|
||||
vshr.s16 q8, q8, #1
|
||||
vshr.s16 q9, q9, #1
|
||||
|
||||
; q4: 4 + 4 * cospi : d1/temp1
|
||||
; q5: 12 + 12 * cospi : c1/temp2
|
||||
vqadd.s16 q4, q4, q8
|
||||
vqadd.s16 q5, q5, q9
|
||||
|
||||
; c1 = temp1 - temp2
|
||||
; d1 = temp1 + temp2
|
||||
vqsub.s16 q2, q6, q5
|
||||
vqadd.s16 q3, q4, q7
|
||||
|
||||
; [0]: a1+d1
|
||||
; [1]: b1+c1
|
||||
; [2]: b1-c1
|
||||
; [3]: a1-d1
|
||||
vqadd.s16 q4, q10, q3
|
||||
vqadd.s16 q5, q11, q2
|
||||
vqsub.s16 q6, q11, q2
|
||||
vqsub.s16 q7, q10, q3
|
||||
|
||||
; rotate
|
||||
vtrn.32 q4, q6
|
||||
vtrn.32 q5, q7
|
||||
vtrn.16 q4, q5
|
||||
vtrn.16 q6, q7
|
||||
; idct loop 2
|
||||
; q4: l 0, 4, 8,12 r 0, 4, 8,12
|
||||
; q5: l 1, 5, 9,13 r 1, 5, 9,13
|
||||
; q6: l 2, 6,10,14 r 2, 6,10,14
|
||||
; q7: l 3, 7,11,15 r 3, 7,11,15
|
||||
|
||||
; q8: 1 * sinpi : c1/temp1
|
||||
; q9: 3 * sinpi : d1/temp2
|
||||
; q10: 1 * cospi
|
||||
; q11: 3 * cospi
|
||||
vqdmulh.s16 q8, q5, d0[2] ; sinpi8sqrt2
|
||||
vqdmulh.s16 q9, q7, d0[2]
|
||||
vqdmulh.s16 q10, q5, d0[0] ; cospi8sqrt2minus1
|
||||
vqdmulh.s16 q11, q7, d0[0]
|
||||
|
||||
vqadd.s16 q2, q4, q6 ; a1 = 0 + 2
|
||||
vqsub.s16 q3, q4, q6 ; b1 = 0 - 2
|
||||
|
||||
; see note on shifting above
|
||||
vshr.s16 q10, q10, #1
|
||||
vshr.s16 q11, q11, #1
|
||||
|
||||
; q10: 1 + 1 * cospi : d1/temp1
|
||||
; q11: 3 + 3 * cospi : c1/temp2
|
||||
vqadd.s16 q10, q5, q10
|
||||
vqadd.s16 q11, q7, q11
|
||||
|
||||
; q8: c1 = temp1 - temp2
|
||||
; q9: d1 = temp1 + temp2
|
||||
vqsub.s16 q8, q8, q11
|
||||
vqadd.s16 q9, q10, q9
|
||||
|
||||
; a1+d1
|
||||
; b1+c1
|
||||
; b1-c1
|
||||
; a1-d1
|
||||
vqadd.s16 q4, q2, q9
|
||||
vqadd.s16 q5, q3, q8
|
||||
vqsub.s16 q6, q3, q8
|
||||
vqsub.s16 q7, q2, q9
|
||||
|
||||
; +4 >> 3 (rounding)
|
||||
vrshr.s16 q4, q4, #3 ; lo
|
||||
vrshr.s16 q5, q5, #3
|
||||
vrshr.s16 q6, q6, #3 ; hi
|
||||
vrshr.s16 q7, q7, #3
|
||||
|
||||
vtrn.32 q4, q6
|
||||
vtrn.32 q5, q7
|
||||
vtrn.16 q4, q5
|
||||
vtrn.16 q6, q7
|
||||
|
||||
; adding pre
|
||||
; input is still packed. pre was read interleaved
|
||||
vaddw.u8 q4, q4, d28
|
||||
vaddw.u8 q5, q5, d29
|
||||
vaddw.u8 q6, q6, d30
|
||||
vaddw.u8 q7, q7, d31
|
||||
|
||||
vmov.i16 q14, #0
|
||||
vmov q15, q14
|
||||
vst1.16 {q14, q15}, [r0] ; write over high input
|
||||
sub r0, r0, #32
|
||||
vst1.16 {q14, q15}, [r0] ; write over low input
|
||||
|
||||
sub r2, r2, r3, lsl #2 ; dst - 4*stride
|
||||
add r1, r2, #4 ; hi
|
||||
|
||||
;saturate and narrow
|
||||
vqmovun.s16 d0, q4 ; lo
|
||||
vqmovun.s16 d1, q5
|
||||
vqmovun.s16 d2, q6 ; hi
|
||||
vqmovun.s16 d3, q7
|
||||
|
||||
vst1.32 {d0[0]}, [r2], r3 ; lo
|
||||
vst1.32 {d0[1]}, [r1], r3 ; hi
|
||||
vst1.32 {d1[0]}, [r2], r3
|
||||
vst1.32 {d1[1]}, [r1], r3
|
||||
vst1.32 {d2[0]}, [r2], r3
|
||||
vst1.32 {d2[1]}, [r1], r3
|
||||
vst1.32 {d3[0]}, [r2]
|
||||
vst1.32 {d3[1]}, [r1]
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP ; |idct_dequant_full_2x_neon|
|
||||
|
||||
; Constant Pool
|
||||
cospi8sqrt2minus1 DCD 0x4e7b
|
||||
; because the lowest bit in 0x8a8c is 0, we can pre-shift this
|
||||
sinpi8sqrt2 DCD 0x4546
|
||||
|
||||
END
|
|
@ -1,185 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
static const int16_t cospi8sqrt2minus1 = 20091;
|
||||
static const int16_t sinpi8sqrt2 = 17734;
|
||||
// because the lowest bit in 0x8a8c is 0, we can pre-shift this
|
||||
|
||||
void idct_dequant_full_2x_neon(
|
||||
int16_t *q,
|
||||
int16_t *dq,
|
||||
unsigned char *dst,
|
||||
int stride) {
|
||||
unsigned char *dst0, *dst1;
|
||||
int32x2_t d28, d29, d30, d31;
|
||||
int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
|
||||
int16x8_t qEmpty = vdupq_n_s16(0);
|
||||
int32x4x2_t q2tmp0, q2tmp1;
|
||||
int16x8x2_t q2tmp2, q2tmp3;
|
||||
int16x4_t dLow0, dLow1, dHigh0, dHigh1;
|
||||
|
||||
d28 = d29 = d30 = d31 = vdup_n_s32(0);
|
||||
|
||||
// load dq
|
||||
q0 = vld1q_s16(dq);
|
||||
dq += 8;
|
||||
q1 = vld1q_s16(dq);
|
||||
|
||||
// load q
|
||||
q2 = vld1q_s16(q);
|
||||
vst1q_s16(q, qEmpty);
|
||||
q += 8;
|
||||
q3 = vld1q_s16(q);
|
||||
vst1q_s16(q, qEmpty);
|
||||
q += 8;
|
||||
q4 = vld1q_s16(q);
|
||||
vst1q_s16(q, qEmpty);
|
||||
q += 8;
|
||||
q5 = vld1q_s16(q);
|
||||
vst1q_s16(q, qEmpty);
|
||||
|
||||
// load src from dst
|
||||
dst0 = dst;
|
||||
dst1 = dst + 4;
|
||||
d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0);
|
||||
dst0 += stride;
|
||||
d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1);
|
||||
dst1 += stride;
|
||||
d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0);
|
||||
dst0 += stride;
|
||||
d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1);
|
||||
dst1 += stride;
|
||||
|
||||
d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0);
|
||||
dst0 += stride;
|
||||
d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1);
|
||||
dst1 += stride;
|
||||
d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0);
|
||||
d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1);
|
||||
|
||||
q2 = vmulq_s16(q2, q0);
|
||||
q3 = vmulq_s16(q3, q1);
|
||||
q4 = vmulq_s16(q4, q0);
|
||||
q5 = vmulq_s16(q5, q1);
|
||||
|
||||
// vswp
|
||||
dLow0 = vget_low_s16(q2);
|
||||
dHigh0 = vget_high_s16(q2);
|
||||
dLow1 = vget_low_s16(q4);
|
||||
dHigh1 = vget_high_s16(q4);
|
||||
q2 = vcombine_s16(dLow0, dLow1);
|
||||
q4 = vcombine_s16(dHigh0, dHigh1);
|
||||
|
||||
dLow0 = vget_low_s16(q3);
|
||||
dHigh0 = vget_high_s16(q3);
|
||||
dLow1 = vget_low_s16(q5);
|
||||
dHigh1 = vget_high_s16(q5);
|
||||
q3 = vcombine_s16(dLow0, dLow1);
|
||||
q5 = vcombine_s16(dHigh0, dHigh1);
|
||||
|
||||
q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2);
|
||||
q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2);
|
||||
q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1);
|
||||
q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1);
|
||||
|
||||
q10 = vqaddq_s16(q2, q3);
|
||||
q11 = vqsubq_s16(q2, q3);
|
||||
|
||||
q8 = vshrq_n_s16(q8, 1);
|
||||
q9 = vshrq_n_s16(q9, 1);
|
||||
|
||||
q4 = vqaddq_s16(q4, q8);
|
||||
q5 = vqaddq_s16(q5, q9);
|
||||
|
||||
q2 = vqsubq_s16(q6, q5);
|
||||
q3 = vqaddq_s16(q7, q4);
|
||||
|
||||
q4 = vqaddq_s16(q10, q3);
|
||||
q5 = vqaddq_s16(q11, q2);
|
||||
q6 = vqsubq_s16(q11, q2);
|
||||
q7 = vqsubq_s16(q10, q3);
|
||||
|
||||
q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
|
||||
q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
|
||||
q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
|
||||
vreinterpretq_s16_s32(q2tmp1.val[0]));
|
||||
q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
|
||||
vreinterpretq_s16_s32(q2tmp1.val[1]));
|
||||
|
||||
// loop 2
|
||||
q8 = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2);
|
||||
q9 = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2);
|
||||
q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1);
|
||||
q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1);
|
||||
|
||||
q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]);
|
||||
q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]);
|
||||
|
||||
q10 = vshrq_n_s16(q10, 1);
|
||||
q11 = vshrq_n_s16(q11, 1);
|
||||
|
||||
q10 = vqaddq_s16(q2tmp2.val[1], q10);
|
||||
q11 = vqaddq_s16(q2tmp3.val[1], q11);
|
||||
|
||||
q8 = vqsubq_s16(q8, q11);
|
||||
q9 = vqaddq_s16(q9, q10);
|
||||
|
||||
q4 = vqaddq_s16(q2, q9);
|
||||
q5 = vqaddq_s16(q3, q8);
|
||||
q6 = vqsubq_s16(q3, q8);
|
||||
q7 = vqsubq_s16(q2, q9);
|
||||
|
||||
q4 = vrshrq_n_s16(q4, 3);
|
||||
q5 = vrshrq_n_s16(q5, 3);
|
||||
q6 = vrshrq_n_s16(q6, 3);
|
||||
q7 = vrshrq_n_s16(q7, 3);
|
||||
|
||||
q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6));
|
||||
q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7));
|
||||
q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]),
|
||||
vreinterpretq_s16_s32(q2tmp1.val[0]));
|
||||
q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]),
|
||||
vreinterpretq_s16_s32(q2tmp1.val[1]));
|
||||
|
||||
q4 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]),
|
||||
vreinterpret_u8_s32(d28)));
|
||||
q5 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]),
|
||||
vreinterpret_u8_s32(d29)));
|
||||
q6 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]),
|
||||
vreinterpret_u8_s32(d30)));
|
||||
q7 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]),
|
||||
vreinterpret_u8_s32(d31)));
|
||||
|
||||
d28 = vreinterpret_s32_u8(vqmovun_s16(q4));
|
||||
d29 = vreinterpret_s32_u8(vqmovun_s16(q5));
|
||||
d30 = vreinterpret_s32_u8(vqmovun_s16(q6));
|
||||
d31 = vreinterpret_s32_u8(vqmovun_s16(q7));
|
||||
|
||||
dst0 = dst;
|
||||
dst1 = dst + 4;
|
||||
vst1_lane_s32((int32_t *)dst0, d28, 0);
|
||||
dst0 += stride;
|
||||
vst1_lane_s32((int32_t *)dst1, d28, 1);
|
||||
dst1 += stride;
|
||||
vst1_lane_s32((int32_t *)dst0, d29, 0);
|
||||
dst0 += stride;
|
||||
vst1_lane_s32((int32_t *)dst1, d29, 1);
|
||||
dst1 += stride;
|
||||
|
||||
vst1_lane_s32((int32_t *)dst0, d30, 0);
|
||||
dst0 += stride;
|
||||
vst1_lane_s32((int32_t *)dst1, d30, 1);
|
||||
dst1 += stride;
|
||||
vst1_lane_s32((int32_t *)dst0, d31, 0);
|
||||
vst1_lane_s32((int32_t *)dst1, d31, 1);
|
||||
return;
|
||||
}
|
|
@ -0,0 +1,87 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
EXPORT |vp8_short_inv_walsh4x4_neon|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA |.text|, CODE, READONLY ; name this block of code
|
||||
|
||||
;short vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff)
|
||||
|vp8_short_inv_walsh4x4_neon| PROC
|
||||
|
||||
; read in all four lines of values: d0->d3
|
||||
vld1.i16 {q0-q1}, [r0@128]
|
||||
|
||||
; first for loop
|
||||
vadd.s16 d4, d0, d3 ;a = [0] + [12]
|
||||
vadd.s16 d6, d1, d2 ;b = [4] + [8]
|
||||
vsub.s16 d5, d0, d3 ;d = [0] - [12]
|
||||
vsub.s16 d7, d1, d2 ;c = [4] - [8]
|
||||
|
||||
vadd.s16 q0, q2, q3 ; a+b d+c
|
||||
vsub.s16 q1, q2, q3 ; a-b d-c
|
||||
|
||||
vtrn.32 d0, d2 ;d0: 0 1 8 9
|
||||
;d2: 2 3 10 11
|
||||
vtrn.32 d1, d3 ;d1: 4 5 12 13
|
||||
;d3: 6 7 14 15
|
||||
|
||||
vtrn.16 d0, d1 ;d0: 0 4 8 12
|
||||
;d1: 1 5 9 13
|
||||
vtrn.16 d2, d3 ;d2: 2 6 10 14
|
||||
;d3: 3 7 11 15
|
||||
|
||||
; second for loop
|
||||
|
||||
vadd.s16 d4, d0, d3 ;a = [0] + [3]
|
||||
vadd.s16 d6, d1, d2 ;b = [1] + [2]
|
||||
vsub.s16 d5, d0, d3 ;d = [0] - [3]
|
||||
vsub.s16 d7, d1, d2 ;c = [1] - [2]
|
||||
|
||||
vmov.i16 q8, #3
|
||||
|
||||
vadd.s16 q0, q2, q3 ; a+b d+c
|
||||
vsub.s16 q1, q2, q3 ; a-b d-c
|
||||
|
||||
vadd.i16 q0, q0, q8 ;e/f += 3
|
||||
vadd.i16 q1, q1, q8 ;g/h += 3
|
||||
|
||||
vshr.s16 q0, q0, #3 ;e/f >> 3
|
||||
vshr.s16 q1, q1, #3 ;g/h >> 3
|
||||
|
||||
mov r2, #64
|
||||
add r3, r1, #32
|
||||
|
||||
vst1.i16 d0[0], [r1],r2
|
||||
vst1.i16 d1[0], [r3],r2
|
||||
vst1.i16 d2[0], [r1],r2
|
||||
vst1.i16 d3[0], [r3],r2
|
||||
|
||||
vst1.i16 d0[1], [r1],r2
|
||||
vst1.i16 d1[1], [r3],r2
|
||||
vst1.i16 d2[1], [r1],r2
|
||||
vst1.i16 d3[1], [r3],r2
|
||||
|
||||
vst1.i16 d0[2], [r1],r2
|
||||
vst1.i16 d1[2], [r3],r2
|
||||
vst1.i16 d2[2], [r1],r2
|
||||
vst1.i16 d3[2], [r3],r2
|
||||
|
||||
vst1.i16 d0[3], [r1],r2
|
||||
vst1.i16 d1[3], [r3],r2
|
||||
vst1.i16 d2[3], [r1]
|
||||
vst1.i16 d3[3], [r3]
|
||||
|
||||
bx lr
|
||||
ENDP ; |vp8_short_inv_walsh4x4_neon|
|
||||
|
||||
END
|
|
@ -1,102 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
void vp8_short_inv_walsh4x4_neon(
|
||||
int16_t *input,
|
||||
int16_t *mb_dqcoeff) {
|
||||
int16x8_t q0s16, q1s16, q2s16, q3s16;
|
||||
int16x4_t d4s16, d5s16, d6s16, d7s16;
|
||||
int16x4x2_t v2tmp0, v2tmp1;
|
||||
int32x2x2_t v2tmp2, v2tmp3;
|
||||
int16x8_t qAdd3;
|
||||
|
||||
q0s16 = vld1q_s16(input);
|
||||
q1s16 = vld1q_s16(input + 8);
|
||||
|
||||
// 1st for loop
|
||||
d4s16 = vadd_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));
|
||||
d6s16 = vadd_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));
|
||||
d5s16 = vsub_s16(vget_low_s16(q0s16), vget_high_s16(q1s16));
|
||||
d7s16 = vsub_s16(vget_high_s16(q0s16), vget_low_s16(q1s16));
|
||||
|
||||
q2s16 = vcombine_s16(d4s16, d5s16);
|
||||
q3s16 = vcombine_s16(d6s16, d7s16);
|
||||
|
||||
q0s16 = vaddq_s16(q2s16, q3s16);
|
||||
q1s16 = vsubq_s16(q2s16, q3s16);
|
||||
|
||||
v2tmp2 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(q0s16)),
|
||||
vreinterpret_s32_s16(vget_low_s16(q1s16)));
|
||||
v2tmp3 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(q0s16)),
|
||||
vreinterpret_s32_s16(vget_high_s16(q1s16)));
|
||||
v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),
|
||||
vreinterpret_s16_s32(v2tmp3.val[0]));
|
||||
v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),
|
||||
vreinterpret_s16_s32(v2tmp3.val[1]));
|
||||
|
||||
// 2nd for loop
|
||||
d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
|
||||
d6s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
|
||||
d5s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
|
||||
d7s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
|
||||
q2s16 = vcombine_s16(d4s16, d5s16);
|
||||
q3s16 = vcombine_s16(d6s16, d7s16);
|
||||
|
||||
qAdd3 = vdupq_n_s16(3);
|
||||
|
||||
q0s16 = vaddq_s16(q2s16, q3s16);
|
||||
q1s16 = vsubq_s16(q2s16, q3s16);
|
||||
|
||||
q0s16 = vaddq_s16(q0s16, qAdd3);
|
||||
q1s16 = vaddq_s16(q1s16, qAdd3);
|
||||
|
||||
q0s16 = vshrq_n_s16(q0s16, 3);
|
||||
q1s16 = vshrq_n_s16(q1s16, 3);
|
||||
|
||||
// store
|
||||
vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 0);
|
||||
mb_dqcoeff += 16;
|
||||
vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 0);
|
||||
mb_dqcoeff += 16;
|
||||
vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 0);
|
||||
mb_dqcoeff += 16;
|
||||
vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 0);
|
||||
mb_dqcoeff += 16;
|
||||
|
||||
vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 1);
|
||||
mb_dqcoeff += 16;
|
||||
vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 1);
|
||||
mb_dqcoeff += 16;
|
||||
vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 1);
|
||||
mb_dqcoeff += 16;
|
||||
vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 1);
|
||||
mb_dqcoeff += 16;
|
||||
|
||||
vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 2);
|
||||
mb_dqcoeff += 16;
|
||||
vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 2);
|
||||
mb_dqcoeff += 16;
|
||||
vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 2);
|
||||
mb_dqcoeff += 16;
|
||||
vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 2);
|
||||
mb_dqcoeff += 16;
|
||||
|
||||
vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 3);
|
||||
mb_dqcoeff += 16;
|
||||
vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 3);
|
||||
mb_dqcoeff += 16;
|
||||
vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 3);
|
||||
mb_dqcoeff += 16;
|
||||
vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 3);
|
||||
mb_dqcoeff += 16;
|
||||
return;
|
||||
}
|
|
@ -0,0 +1,397 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_loop_filter_horizontal_edge_y_neon|
|
||||
EXPORT |vp8_loop_filter_horizontal_edge_uv_neon|
|
||||
EXPORT |vp8_loop_filter_vertical_edge_y_neon|
|
||||
EXPORT |vp8_loop_filter_vertical_edge_uv_neon|
|
||||
ARM
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; r0 unsigned char *src
|
||||
; r1 int pitch
|
||||
; r2 unsigned char blimit
|
||||
; r3 unsigned char limit
|
||||
; sp unsigned char thresh,
|
||||
|vp8_loop_filter_horizontal_edge_y_neon| PROC
|
||||
push {lr}
|
||||
vdup.u8 q0, r2 ; duplicate blimit
|
||||
vdup.u8 q1, r3 ; duplicate limit
|
||||
sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines
|
||||
ldr r3, [sp, #4] ; load thresh
|
||||
add r12, r2, r1
|
||||
add r1, r1, r1
|
||||
|
||||
vdup.u8 q2, r3 ; duplicate thresh
|
||||
|
||||
vld1.u8 {q3}, [r2@128], r1 ; p3
|
||||
vld1.u8 {q4}, [r12@128], r1 ; p2
|
||||
vld1.u8 {q5}, [r2@128], r1 ; p1
|
||||
vld1.u8 {q6}, [r12@128], r1 ; p0
|
||||
vld1.u8 {q7}, [r2@128], r1 ; q0
|
||||
vld1.u8 {q8}, [r12@128], r1 ; q1
|
||||
vld1.u8 {q9}, [r2@128] ; q2
|
||||
vld1.u8 {q10}, [r12@128] ; q3
|
||||
|
||||
sub r2, r2, r1, lsl #1
|
||||
sub r12, r12, r1, lsl #1
|
||||
|
||||
bl vp8_loop_filter_neon
|
||||
|
||||
vst1.u8 {q5}, [r2@128], r1 ; store op1
|
||||
vst1.u8 {q6}, [r12@128], r1 ; store op0
|
||||
vst1.u8 {q7}, [r2@128], r1 ; store oq0
|
||||
vst1.u8 {q8}, [r12@128], r1 ; store oq1
|
||||
|
||||
pop {pc}
|
||||
ENDP ; |vp8_loop_filter_horizontal_edge_y_neon|
|
||||
|
||||
|
||||
; r0 unsigned char *u,
|
||||
; r1 int pitch,
|
||||
; r2 unsigned char blimit
|
||||
; r3 unsigned char limit
|
||||
; sp unsigned char thresh,
|
||||
; sp+4 unsigned char *v
|
||||
|vp8_loop_filter_horizontal_edge_uv_neon| PROC
|
||||
push {lr}
|
||||
vdup.u8 q0, r2 ; duplicate blimit
|
||||
vdup.u8 q1, r3 ; duplicate limit
|
||||
ldr r12, [sp, #4] ; load thresh
|
||||
ldr r2, [sp, #8] ; load v ptr
|
||||
vdup.u8 q2, r12 ; duplicate thresh
|
||||
|
||||
sub r3, r0, r1, lsl #2 ; move u pointer down by 4 lines
|
||||
sub r12, r2, r1, lsl #2 ; move v pointer down by 4 lines
|
||||
|
||||
vld1.u8 {d6}, [r3@64], r1 ; p3
|
||||
vld1.u8 {d7}, [r12@64], r1 ; p3
|
||||
vld1.u8 {d8}, [r3@64], r1 ; p2
|
||||
vld1.u8 {d9}, [r12@64], r1 ; p2
|
||||
vld1.u8 {d10}, [r3@64], r1 ; p1
|
||||
vld1.u8 {d11}, [r12@64], r1 ; p1
|
||||
vld1.u8 {d12}, [r3@64], r1 ; p0
|
||||
vld1.u8 {d13}, [r12@64], r1 ; p0
|
||||
vld1.u8 {d14}, [r3@64], r1 ; q0
|
||||
vld1.u8 {d15}, [r12@64], r1 ; q0
|
||||
vld1.u8 {d16}, [r3@64], r1 ; q1
|
||||
vld1.u8 {d17}, [r12@64], r1 ; q1
|
||||
vld1.u8 {d18}, [r3@64], r1 ; q2
|
||||
vld1.u8 {d19}, [r12@64], r1 ; q2
|
||||
vld1.u8 {d20}, [r3@64] ; q3
|
||||
vld1.u8 {d21}, [r12@64] ; q3
|
||||
|
||||
bl vp8_loop_filter_neon
|
||||
|
||||
sub r0, r0, r1, lsl #1
|
||||
sub r2, r2, r1, lsl #1
|
||||
|
||||
vst1.u8 {d10}, [r0@64], r1 ; store u op1
|
||||
vst1.u8 {d11}, [r2@64], r1 ; store v op1
|
||||
vst1.u8 {d12}, [r0@64], r1 ; store u op0
|
||||
vst1.u8 {d13}, [r2@64], r1 ; store v op0
|
||||
vst1.u8 {d14}, [r0@64], r1 ; store u oq0
|
||||
vst1.u8 {d15}, [r2@64], r1 ; store v oq0
|
||||
vst1.u8 {d16}, [r0@64] ; store u oq1
|
||||
vst1.u8 {d17}, [r2@64] ; store v oq1
|
||||
|
||||
pop {pc}
|
||||
ENDP ; |vp8_loop_filter_horizontal_edge_uv_neon|
|
||||
|
||||
; void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
|
||||
; const signed char *flimit,
|
||||
; const signed char *limit,
|
||||
; const signed char *thresh,
|
||||
; int count)
|
||||
; r0 unsigned char *src
|
||||
; r1 int pitch
|
||||
; r2 unsigned char blimit
|
||||
; r3 unsigned char limit
|
||||
; sp unsigned char thresh,
|
||||
|
||||
|vp8_loop_filter_vertical_edge_y_neon| PROC
|
||||
push {lr}
|
||||
vdup.u8 q0, r2 ; duplicate blimit
|
||||
vdup.u8 q1, r3 ; duplicate limit
|
||||
sub r2, r0, #4 ; src ptr down by 4 columns
|
||||
add r1, r1, r1
|
||||
ldr r3, [sp, #4] ; load thresh
|
||||
add r12, r2, r1, asr #1
|
||||
|
||||
vld1.u8 {d6}, [r2], r1
|
||||
vld1.u8 {d8}, [r12], r1
|
||||
vld1.u8 {d10}, [r2], r1
|
||||
vld1.u8 {d12}, [r12], r1
|
||||
vld1.u8 {d14}, [r2], r1
|
||||
vld1.u8 {d16}, [r12], r1
|
||||
vld1.u8 {d18}, [r2], r1
|
||||
vld1.u8 {d20}, [r12], r1
|
||||
|
||||
vld1.u8 {d7}, [r2], r1 ; load second 8-line src data
|
||||
vld1.u8 {d9}, [r12], r1
|
||||
vld1.u8 {d11}, [r2], r1
|
||||
vld1.u8 {d13}, [r12], r1
|
||||
vld1.u8 {d15}, [r2], r1
|
||||
vld1.u8 {d17}, [r12], r1
|
||||
vld1.u8 {d19}, [r2]
|
||||
vld1.u8 {d21}, [r12]
|
||||
|
||||
;transpose to 8x16 matrix
|
||||
vtrn.32 q3, q7
|
||||
vtrn.32 q4, q8
|
||||
vtrn.32 q5, q9
|
||||
vtrn.32 q6, q10
|
||||
|
||||
vdup.u8 q2, r3 ; duplicate thresh
|
||||
|
||||
vtrn.16 q3, q5
|
||||
vtrn.16 q4, q6
|
||||
vtrn.16 q7, q9
|
||||
vtrn.16 q8, q10
|
||||
|
||||
vtrn.8 q3, q4
|
||||
vtrn.8 q5, q6
|
||||
vtrn.8 q7, q8
|
||||
vtrn.8 q9, q10
|
||||
|
||||
bl vp8_loop_filter_neon
|
||||
|
||||
vswp d12, d11
|
||||
vswp d16, d13
|
||||
|
||||
sub r0, r0, #2 ; dst ptr
|
||||
|
||||
vswp d14, d12
|
||||
vswp d16, d15
|
||||
|
||||
add r12, r0, r1, asr #1
|
||||
|
||||
;store op1, op0, oq0, oq1
|
||||
vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
|
||||
vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r12], r1
|
||||
vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
|
||||
vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r12], r1
|
||||
vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
|
||||
vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r12], r1
|
||||
vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
|
||||
vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r12], r1
|
||||
|
||||
vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r0], r1
|
||||
vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r12], r1
|
||||
vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r0], r1
|
||||
vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r12], r1
|
||||
vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r0], r1
|
||||
vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r12], r1
|
||||
vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0]
|
||||
vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r12]
|
||||
|
||||
pop {pc}
|
||||
ENDP ; |vp8_loop_filter_vertical_edge_y_neon|
|
||||
|
||||
; void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch
|
||||
; const signed char *flimit,
|
||||
; const signed char *limit,
|
||||
; const signed char *thresh,
|
||||
; unsigned char *v)
|
||||
; r0 unsigned char *u,
|
||||
; r1 int pitch,
|
||||
; r2 unsigned char blimit
|
||||
; r3 unsigned char limit
|
||||
; sp unsigned char thresh,
|
||||
; sp+4 unsigned char *v
|
||||
|vp8_loop_filter_vertical_edge_uv_neon| PROC
|
||||
push {lr}
|
||||
vdup.u8 q0, r2 ; duplicate blimit
|
||||
sub r12, r0, #4 ; move u pointer down by 4 columns
|
||||
ldr r2, [sp, #8] ; load v ptr
|
||||
vdup.u8 q1, r3 ; duplicate limit
|
||||
sub r3, r2, #4 ; move v pointer down by 4 columns
|
||||
|
||||
vld1.u8 {d6}, [r12], r1 ;load u data
|
||||
vld1.u8 {d7}, [r3], r1 ;load v data
|
||||
vld1.u8 {d8}, [r12], r1
|
||||
vld1.u8 {d9}, [r3], r1
|
||||
vld1.u8 {d10}, [r12], r1
|
||||
vld1.u8 {d11}, [r3], r1
|
||||
vld1.u8 {d12}, [r12], r1
|
||||
vld1.u8 {d13}, [r3], r1
|
||||
vld1.u8 {d14}, [r12], r1
|
||||
vld1.u8 {d15}, [r3], r1
|
||||
vld1.u8 {d16}, [r12], r1
|
||||
vld1.u8 {d17}, [r3], r1
|
||||
vld1.u8 {d18}, [r12], r1
|
||||
vld1.u8 {d19}, [r3], r1
|
||||
vld1.u8 {d20}, [r12]
|
||||
vld1.u8 {d21}, [r3]
|
||||
|
||||
ldr r12, [sp, #4] ; load thresh
|
||||
|
||||
;transpose to 8x16 matrix
|
||||
vtrn.32 q3, q7
|
||||
vtrn.32 q4, q8
|
||||
vtrn.32 q5, q9
|
||||
vtrn.32 q6, q10
|
||||
|
||||
vdup.u8 q2, r12 ; duplicate thresh
|
||||
|
||||
vtrn.16 q3, q5
|
||||
vtrn.16 q4, q6
|
||||
vtrn.16 q7, q9
|
||||
vtrn.16 q8, q10
|
||||
|
||||
vtrn.8 q3, q4
|
||||
vtrn.8 q5, q6
|
||||
vtrn.8 q7, q8
|
||||
vtrn.8 q9, q10
|
||||
|
||||
bl vp8_loop_filter_neon
|
||||
|
||||
vswp d12, d11
|
||||
vswp d16, d13
|
||||
vswp d14, d12
|
||||
vswp d16, d15
|
||||
|
||||
sub r0, r0, #2
|
||||
sub r2, r2, #2
|
||||
|
||||
;store op1, op0, oq0, oq1
|
||||
vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
|
||||
vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2], r1
|
||||
vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
|
||||
vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r2], r1
|
||||
vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
|
||||
vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r2], r1
|
||||
vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
|
||||
vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r2], r1
|
||||
vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
|
||||
vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r2], r1
|
||||
vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
|
||||
vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r2], r1
|
||||
vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
|
||||
vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r2], r1
|
||||
vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0]
|
||||
vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2]
|
||||
|
||||
pop {pc}
|
||||
ENDP ; |vp8_loop_filter_vertical_edge_uv_neon|
|
||||
|
||||
; void vp8_loop_filter_neon();
|
||||
; This is a helper function for the loopfilters. The invidual functions do the
|
||||
; necessary load, transpose (if necessary) and store.
|
||||
|
||||
; r0-r3 PRESERVE
|
||||
; q0 flimit
|
||||
; q1 limit
|
||||
; q2 thresh
|
||||
; q3 p3
|
||||
; q4 p2
|
||||
; q5 p1
|
||||
; q6 p0
|
||||
; q7 q0
|
||||
; q8 q1
|
||||
; q9 q2
|
||||
; q10 q3
|
||||
|vp8_loop_filter_neon| PROC
|
||||
|
||||
; vp8_filter_mask
|
||||
vabd.u8 q11, q3, q4 ; abs(p3 - p2)
|
||||
vabd.u8 q12, q4, q5 ; abs(p2 - p1)
|
||||
vabd.u8 q13, q5, q6 ; abs(p1 - p0)
|
||||
vabd.u8 q14, q8, q7 ; abs(q1 - q0)
|
||||
vabd.u8 q3, q9, q8 ; abs(q2 - q1)
|
||||
vabd.u8 q4, q10, q9 ; abs(q3 - q2)
|
||||
|
||||
vmax.u8 q11, q11, q12
|
||||
vmax.u8 q12, q13, q14
|
||||
vmax.u8 q3, q3, q4
|
||||
vmax.u8 q15, q11, q12
|
||||
|
||||
vabd.u8 q9, q6, q7 ; abs(p0 - q0)
|
||||
|
||||
; vp8_hevmask
|
||||
vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
|
||||
vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
|
||||
vmax.u8 q15, q15, q3
|
||||
|
||||
vmov.u8 q10, #0x80 ; 0x80
|
||||
|
||||
vabd.u8 q2, q5, q8 ; a = abs(p1 - q1)
|
||||
vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2
|
||||
|
||||
vcge.u8 q15, q1, q15
|
||||
|
||||
; vp8_filter() function
|
||||
; convert to signed
|
||||
veor q7, q7, q10 ; qs0
|
||||
vshr.u8 q2, q2, #1 ; a = a / 2
|
||||
veor q6, q6, q10 ; ps0
|
||||
|
||||
veor q5, q5, q10 ; ps1
|
||||
vqadd.u8 q9, q9, q2 ; a = b + a
|
||||
|
||||
veor q8, q8, q10 ; qs1
|
||||
|
||||
vmov.u8 q10, #3 ; #3
|
||||
|
||||
vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
|
||||
vsubl.s8 q11, d15, d13
|
||||
|
||||
vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1
|
||||
|
||||
vmovl.u8 q4, d20
|
||||
|
||||
vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1)
|
||||
vorr q14, q13, q14 ; vp8_hevmask
|
||||
|
||||
vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0)
|
||||
vmul.i16 q11, q11, q4
|
||||
|
||||
vand q1, q1, q14 ; vp8_filter &= hev
|
||||
vand q15, q15, q9 ; vp8_filter_mask
|
||||
|
||||
vaddw.s8 q2, q2, d2
|
||||
vaddw.s8 q11, q11, d3
|
||||
|
||||
vmov.u8 q9, #4 ; #4
|
||||
|
||||
; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
|
||||
vqmovn.s16 d2, q2
|
||||
vqmovn.s16 d3, q11
|
||||
vand q1, q1, q15 ; vp8_filter &= mask
|
||||
|
||||
vqadd.s8 q2, q1, q10 ; Filter2 = clamp(vp8_filter+3)
|
||||
vqadd.s8 q1, q1, q9 ; Filter1 = clamp(vp8_filter+4)
|
||||
vshr.s8 q2, q2, #3 ; Filter2 >>= 3
|
||||
vshr.s8 q1, q1, #3 ; Filter1 >>= 3
|
||||
|
||||
|
||||
vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + Filter2)
|
||||
vqsub.s8 q10, q7, q1 ; u = clamp(qs0 - Filter1)
|
||||
|
||||
; outer tap adjustments: ++vp8_filter >> 1
|
||||
vrshr.s8 q1, q1, #1
|
||||
vbic q1, q1, q14 ; vp8_filter &= ~hev
|
||||
vmov.u8 q0, #0x80 ; 0x80
|
||||
vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + vp8_filter)
|
||||
vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - vp8_filter)
|
||||
|
||||
veor q6, q11, q0 ; *op0 = u^0x80
|
||||
veor q7, q10, q0 ; *oq0 = u^0x80
|
||||
veor q5, q13, q0 ; *op1 = u^0x80
|
||||
veor q8, q12, q0 ; *oq1 = u^0x80
|
||||
|
||||
bx lr
|
||||
ENDP ; |vp8_loop_filter_horizontal_edge_y_neon|
|
||||
|
||||
;-----------------
|
||||
|
||||
END
|
|
@ -1,550 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include "./vpx_config.h"
|
||||
#include "vpx_ports/arm.h"
|
||||
|
||||
static INLINE void vp8_loop_filter_neon(
|
||||
uint8x16_t qblimit, // flimit
|
||||
uint8x16_t qlimit, // limit
|
||||
uint8x16_t qthresh, // thresh
|
||||
uint8x16_t q3, // p3
|
||||
uint8x16_t q4, // p2
|
||||
uint8x16_t q5, // p1
|
||||
uint8x16_t q6, // p0
|
||||
uint8x16_t q7, // q0
|
||||
uint8x16_t q8, // q1
|
||||
uint8x16_t q9, // q2
|
||||
uint8x16_t q10, // q3
|
||||
uint8x16_t *q5r, // p1
|
||||
uint8x16_t *q6r, // p0
|
||||
uint8x16_t *q7r, // q0
|
||||
uint8x16_t *q8r) { // q1
|
||||
uint8x16_t q0u8, q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
|
||||
int16x8_t q2s16, q11s16;
|
||||
uint16x8_t q4u16;
|
||||
int8x16_t q1s8, q2s8, q10s8, q11s8, q12s8, q13s8;
|
||||
int8x8_t d2s8, d3s8;
|
||||
|
||||
q11u8 = vabdq_u8(q3, q4);
|
||||
q12u8 = vabdq_u8(q4, q5);
|
||||
q13u8 = vabdq_u8(q5, q6);
|
||||
q14u8 = vabdq_u8(q8, q7);
|
||||
q3 = vabdq_u8(q9, q8);
|
||||
q4 = vabdq_u8(q10, q9);
|
||||
|
||||
q11u8 = vmaxq_u8(q11u8, q12u8);
|
||||
q12u8 = vmaxq_u8(q13u8, q14u8);
|
||||
q3 = vmaxq_u8(q3, q4);
|
||||
q15u8 = vmaxq_u8(q11u8, q12u8);
|
||||
|
||||
q9 = vabdq_u8(q6, q7);
|
||||
|
||||
// vp8_hevmask
|
||||
q13u8 = vcgtq_u8(q13u8, qthresh);
|
||||
q14u8 = vcgtq_u8(q14u8, qthresh);
|
||||
q15u8 = vmaxq_u8(q15u8, q3);
|
||||
|
||||
q2u8 = vabdq_u8(q5, q8);
|
||||
q9 = vqaddq_u8(q9, q9);
|
||||
|
||||
q15u8 = vcgeq_u8(qlimit, q15u8);
|
||||
|
||||
// vp8_filter() function
|
||||
// convert to signed
|
||||
q10 = vdupq_n_u8(0x80);
|
||||
q8 = veorq_u8(q8, q10);
|
||||
q7 = veorq_u8(q7, q10);
|
||||
q6 = veorq_u8(q6, q10);
|
||||
q5 = veorq_u8(q5, q10);
|
||||
|
||||
q2u8 = vshrq_n_u8(q2u8, 1);
|
||||
q9 = vqaddq_u8(q9, q2u8);
|
||||
|
||||
q10 = vdupq_n_u8(3);
|
||||
|
||||
q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
|
||||
vget_low_s8(vreinterpretq_s8_u8(q6)));
|
||||
q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
|
||||
vget_high_s8(vreinterpretq_s8_u8(q6)));
|
||||
|
||||
q9 = vcgeq_u8(qblimit, q9);
|
||||
|
||||
q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
|
||||
vreinterpretq_s8_u8(q8));
|
||||
|
||||
q14u8 = vorrq_u8(q13u8, q14u8);
|
||||
|
||||
q4u16 = vmovl_u8(vget_low_u8(q10));
|
||||
q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
|
||||
q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
|
||||
|
||||
q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
|
||||
q15u8 = vandq_u8(q15u8, q9);
|
||||
|
||||
q1s8 = vreinterpretq_s8_u8(q1u8);
|
||||
q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
|
||||
q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
|
||||
|
||||
q9 = vdupq_n_u8(4);
|
||||
// vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
|
||||
d2s8 = vqmovn_s16(q2s16);
|
||||
d3s8 = vqmovn_s16(q11s16);
|
||||
q1s8 = vcombine_s8(d2s8, d3s8);
|
||||
q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
|
||||
q1s8 = vreinterpretq_s8_u8(q1u8);
|
||||
|
||||
q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q10));
|
||||
q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
|
||||
q2s8 = vshrq_n_s8(q2s8, 3);
|
||||
q1s8 = vshrq_n_s8(q1s8, 3);
|
||||
|
||||
q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
|
||||
q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);
|
||||
|
||||
q1s8 = vrshrq_n_s8(q1s8, 1);
|
||||
q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
|
||||
|
||||
q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
|
||||
q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);
|
||||
|
||||
q0u8 = vdupq_n_u8(0x80);
|
||||
*q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q0u8);
|
||||
*q7r = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
|
||||
*q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
|
||||
*q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q0u8);
|
||||
return;
|
||||
}
|
||||
|
||||
void vp8_loop_filter_horizontal_edge_y_neon(
|
||||
unsigned char *src,
|
||||
int pitch,
|
||||
unsigned char blimit,
|
||||
unsigned char limit,
|
||||
unsigned char thresh) {
|
||||
uint8x16_t qblimit, qlimit, qthresh, q3, q4;
|
||||
uint8x16_t q5, q6, q7, q8, q9, q10;
|
||||
|
||||
qblimit = vdupq_n_u8(blimit);
|
||||
qlimit = vdupq_n_u8(limit);
|
||||
qthresh = vdupq_n_u8(thresh);
|
||||
src -= (pitch << 2);
|
||||
|
||||
q3 = vld1q_u8(src);
|
||||
src += pitch;
|
||||
q4 = vld1q_u8(src);
|
||||
src += pitch;
|
||||
q5 = vld1q_u8(src);
|
||||
src += pitch;
|
||||
q6 = vld1q_u8(src);
|
||||
src += pitch;
|
||||
q7 = vld1q_u8(src);
|
||||
src += pitch;
|
||||
q8 = vld1q_u8(src);
|
||||
src += pitch;
|
||||
q9 = vld1q_u8(src);
|
||||
src += pitch;
|
||||
q10 = vld1q_u8(src);
|
||||
|
||||
vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
|
||||
q5, q6, q7, q8, q9, q10,
|
||||
&q5, &q6, &q7, &q8);
|
||||
|
||||
src -= (pitch * 5);
|
||||
vst1q_u8(src, q5);
|
||||
src += pitch;
|
||||
vst1q_u8(src, q6);
|
||||
src += pitch;
|
||||
vst1q_u8(src, q7);
|
||||
src += pitch;
|
||||
vst1q_u8(src, q8);
|
||||
return;
|
||||
}
|
||||
|
||||
void vp8_loop_filter_horizontal_edge_uv_neon(
|
||||
unsigned char *u,
|
||||
int pitch,
|
||||
unsigned char blimit,
|
||||
unsigned char limit,
|
||||
unsigned char thresh,
|
||||
unsigned char *v) {
|
||||
uint8x16_t qblimit, qlimit, qthresh, q3, q4;
|
||||
uint8x16_t q5, q6, q7, q8, q9, q10;
|
||||
uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
|
||||
uint8x8_t d15, d16, d17, d18, d19, d20, d21;
|
||||
|
||||
qblimit = vdupq_n_u8(blimit);
|
||||
qlimit = vdupq_n_u8(limit);
|
||||
qthresh = vdupq_n_u8(thresh);
|
||||
|
||||
u -= (pitch << 2);
|
||||
v -= (pitch << 2);
|
||||
|
||||
d6 = vld1_u8(u);
|
||||
u += pitch;
|
||||
d7 = vld1_u8(v);
|
||||
v += pitch;
|
||||
d8 = vld1_u8(u);
|
||||
u += pitch;
|
||||
d9 = vld1_u8(v);
|
||||
v += pitch;
|
||||
d10 = vld1_u8(u);
|
||||
u += pitch;
|
||||
d11 = vld1_u8(v);
|
||||
v += pitch;
|
||||
d12 = vld1_u8(u);
|
||||
u += pitch;
|
||||
d13 = vld1_u8(v);
|
||||
v += pitch;
|
||||
d14 = vld1_u8(u);
|
||||
u += pitch;
|
||||
d15 = vld1_u8(v);
|
||||
v += pitch;
|
||||
d16 = vld1_u8(u);
|
||||
u += pitch;
|
||||
d17 = vld1_u8(v);
|
||||
v += pitch;
|
||||
d18 = vld1_u8(u);
|
||||
u += pitch;
|
||||
d19 = vld1_u8(v);
|
||||
v += pitch;
|
||||
d20 = vld1_u8(u);
|
||||
d21 = vld1_u8(v);
|
||||
|
||||
q3 = vcombine_u8(d6, d7);
|
||||
q4 = vcombine_u8(d8, d9);
|
||||
q5 = vcombine_u8(d10, d11);
|
||||
q6 = vcombine_u8(d12, d13);
|
||||
q7 = vcombine_u8(d14, d15);
|
||||
q8 = vcombine_u8(d16, d17);
|
||||
q9 = vcombine_u8(d18, d19);
|
||||
q10 = vcombine_u8(d20, d21);
|
||||
|
||||
vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
|
||||
q5, q6, q7, q8, q9, q10,
|
||||
&q5, &q6, &q7, &q8);
|
||||
|
||||
u -= (pitch * 5);
|
||||
vst1_u8(u, vget_low_u8(q5));
|
||||
u += pitch;
|
||||
vst1_u8(u, vget_low_u8(q6));
|
||||
u += pitch;
|
||||
vst1_u8(u, vget_low_u8(q7));
|
||||
u += pitch;
|
||||
vst1_u8(u, vget_low_u8(q8));
|
||||
|
||||
v -= (pitch * 5);
|
||||
vst1_u8(v, vget_high_u8(q5));
|
||||
v += pitch;
|
||||
vst1_u8(v, vget_high_u8(q6));
|
||||
v += pitch;
|
||||
vst1_u8(v, vget_high_u8(q7));
|
||||
v += pitch;
|
||||
vst1_u8(v, vget_high_u8(q8));
|
||||
return;
|
||||
}
|
||||
|
||||
static INLINE void write_4x8(unsigned char *dst, int pitch,
|
||||
const uint8x8x4_t result) {
|
||||
#ifdef VPX_INCOMPATIBLE_GCC
|
||||
/*
|
||||
* uint8x8x4_t result
|
||||
00 01 02 03 | 04 05 06 07
|
||||
10 11 12 13 | 14 15 16 17
|
||||
20 21 22 23 | 24 25 26 27
|
||||
30 31 32 33 | 34 35 36 37
|
||||
---
|
||||
* after vtrn_u16
|
||||
00 01 20 21 | 04 05 24 25
|
||||
02 03 22 23 | 06 07 26 27
|
||||
10 11 30 31 | 14 15 34 35
|
||||
12 13 32 33 | 16 17 36 37
|
||||
---
|
||||
* after vtrn_u8
|
||||
00 10 20 30 | 04 14 24 34
|
||||
01 11 21 31 | 05 15 25 35
|
||||
02 12 22 32 | 06 16 26 36
|
||||
03 13 23 33 | 07 17 27 37
|
||||
*/
|
||||
const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]),
|
||||
vreinterpret_u16_u8(result.val[2]));
|
||||
const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]),
|
||||
vreinterpret_u16_u8(result.val[3]));
|
||||
const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
|
||||
vreinterpret_u8_u16(r13_u16.val[0]));
|
||||
const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
|
||||
vreinterpret_u8_u16(r13_u16.val[1]));
|
||||
const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]);
|
||||
const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]);
|
||||
const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]);
|
||||
const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]);
|
||||
vst1_lane_u32((uint32_t *)dst, x_0_4, 0);
|
||||
dst += pitch;
|
||||
vst1_lane_u32((uint32_t *)dst, x_1_5, 0);
|
||||
dst += pitch;
|
||||
vst1_lane_u32((uint32_t *)dst, x_2_6, 0);
|
||||
dst += pitch;
|
||||
vst1_lane_u32((uint32_t *)dst, x_3_7, 0);
|
||||
dst += pitch;
|
||||
vst1_lane_u32((uint32_t *)dst, x_0_4, 1);
|
||||
dst += pitch;
|
||||
vst1_lane_u32((uint32_t *)dst, x_1_5, 1);
|
||||
dst += pitch;
|
||||
vst1_lane_u32((uint32_t *)dst, x_2_6, 1);
|
||||
dst += pitch;
|
||||
vst1_lane_u32((uint32_t *)dst, x_3_7, 1);
|
||||
#else
|
||||
vst4_lane_u8(dst, result, 0);
|
||||
dst += pitch;
|
||||
vst4_lane_u8(dst, result, 1);
|
||||
dst += pitch;
|
||||
vst4_lane_u8(dst, result, 2);
|
||||
dst += pitch;
|
||||
vst4_lane_u8(dst, result, 3);
|
||||
dst += pitch;
|
||||
vst4_lane_u8(dst, result, 4);
|
||||
dst += pitch;
|
||||
vst4_lane_u8(dst, result, 5);
|
||||
dst += pitch;
|
||||
vst4_lane_u8(dst, result, 6);
|
||||
dst += pitch;
|
||||
vst4_lane_u8(dst, result, 7);
|
||||
#endif // VPX_INCOMPATIBLE_GCC
|
||||
}
|
||||
|
||||
void vp8_loop_filter_vertical_edge_y_neon(
|
||||
unsigned char *src,
|
||||
int pitch,
|
||||
unsigned char blimit,
|
||||
unsigned char limit,
|
||||
unsigned char thresh) {
|
||||
unsigned char *s, *d;
|
||||
uint8x16_t qblimit, qlimit, qthresh, q3, q4;
|
||||
uint8x16_t q5, q6, q7, q8, q9, q10;
|
||||
uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
|
||||
uint8x8_t d15, d16, d17, d18, d19, d20, d21;
|
||||
uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
|
||||
uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
|
||||
uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
|
||||
uint8x8x4_t q4ResultH, q4ResultL;
|
||||
|
||||
qblimit = vdupq_n_u8(blimit);
|
||||
qlimit = vdupq_n_u8(limit);
|
||||
qthresh = vdupq_n_u8(thresh);
|
||||
|
||||
s = src - 4;
|
||||
d6 = vld1_u8(s);
|
||||
s += pitch;
|
||||
d8 = vld1_u8(s);
|
||||
s += pitch;
|
||||
d10 = vld1_u8(s);
|
||||
s += pitch;
|
||||
d12 = vld1_u8(s);
|
||||
s += pitch;
|
||||
d14 = vld1_u8(s);
|
||||
s += pitch;
|
||||
d16 = vld1_u8(s);
|
||||
s += pitch;
|
||||
d18 = vld1_u8(s);
|
||||
s += pitch;
|
||||
d20 = vld1_u8(s);
|
||||
s += pitch;
|
||||
d7 = vld1_u8(s);
|
||||
s += pitch;
|
||||
d9 = vld1_u8(s);
|
||||
s += pitch;
|
||||
d11 = vld1_u8(s);
|
||||
s += pitch;
|
||||
d13 = vld1_u8(s);
|
||||
s += pitch;
|
||||
d15 = vld1_u8(s);
|
||||
s += pitch;
|
||||
d17 = vld1_u8(s);
|
||||
s += pitch;
|
||||
d19 = vld1_u8(s);
|
||||
s += pitch;
|
||||
d21 = vld1_u8(s);
|
||||
|
||||
q3 = vcombine_u8(d6, d7);
|
||||
q4 = vcombine_u8(d8, d9);
|
||||
q5 = vcombine_u8(d10, d11);
|
||||
q6 = vcombine_u8(d12, d13);
|
||||
q7 = vcombine_u8(d14, d15);
|
||||
q8 = vcombine_u8(d16, d17);
|
||||
q9 = vcombine_u8(d18, d19);
|
||||
q10 = vcombine_u8(d20, d21);
|
||||
|
||||
q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
|
||||
q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
|
||||
q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
|
||||
q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
|
||||
|
||||
q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
|
||||
vreinterpretq_u16_u32(q2tmp2.val[0]));
|
||||
q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
|
||||
vreinterpretq_u16_u32(q2tmp3.val[0]));
|
||||
q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
|
||||
vreinterpretq_u16_u32(q2tmp2.val[1]));
|
||||
q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
|
||||
vreinterpretq_u16_u32(q2tmp3.val[1]));
|
||||
|
||||
q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
|
||||
vreinterpretq_u8_u16(q2tmp5.val[0]));
|
||||
q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
|
||||
vreinterpretq_u8_u16(q2tmp5.val[1]));
|
||||
q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
|
||||
vreinterpretq_u8_u16(q2tmp7.val[0]));
|
||||
q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
|
||||
vreinterpretq_u8_u16(q2tmp7.val[1]));
|
||||
|
||||
q3 = q2tmp8.val[0];
|
||||
q4 = q2tmp8.val[1];
|
||||
q5 = q2tmp9.val[0];
|
||||
q6 = q2tmp9.val[1];
|
||||
q7 = q2tmp10.val[0];
|
||||
q8 = q2tmp10.val[1];
|
||||
q9 = q2tmp11.val[0];
|
||||
q10 = q2tmp11.val[1];
|
||||
|
||||
vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
|
||||
q5, q6, q7, q8, q9, q10,
|
||||
&q5, &q6, &q7, &q8);
|
||||
|
||||
q4ResultL.val[0] = vget_low_u8(q5); // d10
|
||||
q4ResultL.val[1] = vget_low_u8(q6); // d12
|
||||
q4ResultL.val[2] = vget_low_u8(q7); // d14
|
||||
q4ResultL.val[3] = vget_low_u8(q8); // d16
|
||||
q4ResultH.val[0] = vget_high_u8(q5); // d11
|
||||
q4ResultH.val[1] = vget_high_u8(q6); // d13
|
||||
q4ResultH.val[2] = vget_high_u8(q7); // d15
|
||||
q4ResultH.val[3] = vget_high_u8(q8); // d17
|
||||
|
||||
d = src - 2;
|
||||
write_4x8(d, pitch, q4ResultL);
|
||||
d += pitch * 8;
|
||||
write_4x8(d, pitch, q4ResultH);
|
||||
}
|
||||
|
||||
void vp8_loop_filter_vertical_edge_uv_neon(
|
||||
unsigned char *u,
|
||||
int pitch,
|
||||
unsigned char blimit,
|
||||
unsigned char limit,
|
||||
unsigned char thresh,
|
||||
unsigned char *v) {
|
||||
unsigned char *us, *ud;
|
||||
unsigned char *vs, *vd;
|
||||
uint8x16_t qblimit, qlimit, qthresh, q3, q4;
|
||||
uint8x16_t q5, q6, q7, q8, q9, q10;
|
||||
uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
|
||||
uint8x8_t d15, d16, d17, d18, d19, d20, d21;
|
||||
uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
|
||||
uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
|
||||
uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
|
||||
uint8x8x4_t q4ResultH, q4ResultL;
|
||||
|
||||
qblimit = vdupq_n_u8(blimit);
|
||||
qlimit = vdupq_n_u8(limit);
|
||||
qthresh = vdupq_n_u8(thresh);
|
||||
|
||||
us = u - 4;
|
||||
d6 = vld1_u8(us);
|
||||
us += pitch;
|
||||
d8 = vld1_u8(us);
|
||||
us += pitch;
|
||||
d10 = vld1_u8(us);
|
||||
us += pitch;
|
||||
d12 = vld1_u8(us);
|
||||
us += pitch;
|
||||
d14 = vld1_u8(us);
|
||||
us += pitch;
|
||||
d16 = vld1_u8(us);
|
||||
us += pitch;
|
||||
d18 = vld1_u8(us);
|
||||
us += pitch;
|
||||
d20 = vld1_u8(us);
|
||||
|
||||
vs = v - 4;
|
||||
d7 = vld1_u8(vs);
|
||||
vs += pitch;
|
||||
d9 = vld1_u8(vs);
|
||||
vs += pitch;
|
||||
d11 = vld1_u8(vs);
|
||||
vs += pitch;
|
||||
d13 = vld1_u8(vs);
|
||||
vs += pitch;
|
||||
d15 = vld1_u8(vs);
|
||||
vs += pitch;
|
||||
d17 = vld1_u8(vs);
|
||||
vs += pitch;
|
||||
d19 = vld1_u8(vs);
|
||||
vs += pitch;
|
||||
d21 = vld1_u8(vs);
|
||||
|
||||
q3 = vcombine_u8(d6, d7);
|
||||
q4 = vcombine_u8(d8, d9);
|
||||
q5 = vcombine_u8(d10, d11);
|
||||
q6 = vcombine_u8(d12, d13);
|
||||
q7 = vcombine_u8(d14, d15);
|
||||
q8 = vcombine_u8(d16, d17);
|
||||
q9 = vcombine_u8(d18, d19);
|
||||
q10 = vcombine_u8(d20, d21);
|
||||
|
||||
q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
|
||||
q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
|
||||
q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
|
||||
q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
|
||||
|
||||
q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
|
||||
vreinterpretq_u16_u32(q2tmp2.val[0]));
|
||||
q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
|
||||
vreinterpretq_u16_u32(q2tmp3.val[0]));
|
||||
q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
|
||||
vreinterpretq_u16_u32(q2tmp2.val[1]));
|
||||
q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
|
||||
vreinterpretq_u16_u32(q2tmp3.val[1]));
|
||||
|
||||
q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
|
||||
vreinterpretq_u8_u16(q2tmp5.val[0]));
|
||||
q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
|
||||
vreinterpretq_u8_u16(q2tmp5.val[1]));
|
||||
q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
|
||||
vreinterpretq_u8_u16(q2tmp7.val[0]));
|
||||
q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
|
||||
vreinterpretq_u8_u16(q2tmp7.val[1]));
|
||||
|
||||
q3 = q2tmp8.val[0];
|
||||
q4 = q2tmp8.val[1];
|
||||
q5 = q2tmp9.val[0];
|
||||
q6 = q2tmp9.val[1];
|
||||
q7 = q2tmp10.val[0];
|
||||
q8 = q2tmp10.val[1];
|
||||
q9 = q2tmp11.val[0];
|
||||
q10 = q2tmp11.val[1];
|
||||
|
||||
vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
|
||||
q5, q6, q7, q8, q9, q10,
|
||||
&q5, &q6, &q7, &q8);
|
||||
|
||||
q4ResultL.val[0] = vget_low_u8(q5); // d10
|
||||
q4ResultL.val[1] = vget_low_u8(q6); // d12
|
||||
q4ResultL.val[2] = vget_low_u8(q7); // d14
|
||||
q4ResultL.val[3] = vget_low_u8(q8); // d16
|
||||
ud = u - 2;
|
||||
write_4x8(ud, pitch, q4ResultL);
|
||||
|
||||
q4ResultH.val[0] = vget_high_u8(q5); // d11
|
||||
q4ResultH.val[1] = vget_high_u8(q6); // d13
|
||||
q4ResultH.val[2] = vget_high_u8(q7); // d15
|
||||
q4ResultH.val[3] = vget_high_u8(q8); // d17
|
||||
vd = v - 2;
|
||||
write_4x8(vd, pitch, q4ResultH);
|
||||
}
|
|
@ -0,0 +1,117 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
;EXPORT |vp8_loop_filter_simple_horizontal_edge_neon|
|
||||
EXPORT |vp8_loop_filter_bhs_neon|
|
||||
EXPORT |vp8_loop_filter_mbhs_neon|
|
||||
ARM
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; r0 unsigned char *s, PRESERVE
|
||||
; r1 int p, PRESERVE
|
||||
; q1 limit, PRESERVE
|
||||
|
||||
|vp8_loop_filter_simple_horizontal_edge_neon| PROC
|
||||
|
||||
sub r3, r0, r1, lsl #1 ; move src pointer down by 2 lines
|
||||
|
||||
vld1.u8 {q7}, [r0@128], r1 ; q0
|
||||
vld1.u8 {q5}, [r3@128], r1 ; p0
|
||||
vld1.u8 {q8}, [r0@128] ; q1
|
||||
vld1.u8 {q6}, [r3@128] ; p1
|
||||
|
||||
vabd.u8 q15, q6, q7 ; abs(p0 - q0)
|
||||
vabd.u8 q14, q5, q8 ; abs(p1 - q1)
|
||||
|
||||
vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2
|
||||
vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2
|
||||
vmov.u8 q0, #0x80 ; 0x80
|
||||
vmov.s16 q13, #3
|
||||
vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
|
||||
|
||||
veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value
|
||||
veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value
|
||||
veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value
|
||||
veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value
|
||||
|
||||
vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > limit)*-1
|
||||
|
||||
vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
|
||||
vsubl.s8 q3, d15, d13
|
||||
|
||||
vqsub.s8 q4, q5, q8 ; q4: vp8_filter = vp8_signed_char_clamp(ps1-qs1)
|
||||
|
||||
vmul.s16 q2, q2, q13 ; 3 * ( qs0 - ps0)
|
||||
vmul.s16 q3, q3, q13
|
||||
|
||||
vmov.u8 q10, #0x03 ; 0x03
|
||||
vmov.u8 q9, #0x04 ; 0x04
|
||||
|
||||
vaddw.s8 q2, q2, d8 ; vp8_filter + 3 * ( qs0 - ps0)
|
||||
vaddw.s8 q3, q3, d9
|
||||
|
||||
vqmovn.s16 d8, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
|
||||
vqmovn.s16 d9, q3
|
||||
|
||||
vand q14, q4, q15 ; vp8_filter &= mask
|
||||
|
||||
vqadd.s8 q2, q14, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
|
||||
vqadd.s8 q3, q14, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
|
||||
vshr.s8 q2, q2, #3 ; Filter2 >>= 3
|
||||
vshr.s8 q4, q3, #3 ; Filter1 >>= 3
|
||||
|
||||
sub r0, r0, r1
|
||||
|
||||
;calculate output
|
||||
vqadd.s8 q11, q6, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2)
|
||||
vqsub.s8 q10, q7, q4 ; u = vp8_signed_char_clamp(qs0 - Filter1)
|
||||
|
||||
veor q6, q11, q0 ; *op0 = u^0x80
|
||||
veor q7, q10, q0 ; *oq0 = u^0x80
|
||||
|
||||
vst1.u8 {q6}, [r3@128] ; store op0
|
||||
vst1.u8 {q7}, [r0@128] ; store oq0
|
||||
|
||||
bx lr
|
||||
ENDP ; |vp8_loop_filter_simple_horizontal_edge_neon|
|
||||
|
||||
; r0 unsigned char *y
|
||||
; r1 int ystride
|
||||
; r2 const unsigned char *blimit
|
||||
|
||||
|vp8_loop_filter_bhs_neon| PROC
|
||||
push {r4, lr}
|
||||
ldrb r3, [r2] ; load blim from mem
|
||||
vdup.s8 q1, r3 ; duplicate blim
|
||||
|
||||
add r0, r0, r1, lsl #2 ; src = y_ptr + 4 * y_stride
|
||||
bl vp8_loop_filter_simple_horizontal_edge_neon
|
||||
; vp8_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1
|
||||
add r0, r0, r1, lsl #2 ; src = y_ptr + 8* y_stride
|
||||
bl vp8_loop_filter_simple_horizontal_edge_neon
|
||||
add r0, r0, r1, lsl #2 ; src = y_ptr + 12 * y_stride
|
||||
pop {r4, lr}
|
||||
b vp8_loop_filter_simple_horizontal_edge_neon
|
||||
ENDP ;|vp8_loop_filter_bhs_neon|
|
||||
|
||||
; r0 unsigned char *y
|
||||
; r1 int ystride
|
||||
; r2 const unsigned char *blimit
|
||||
|
||||
|vp8_loop_filter_mbhs_neon| PROC
|
||||
ldrb r3, [r2] ; load blim from mem
|
||||
vdup.s8 q1, r3 ; duplicate mblim
|
||||
b vp8_loop_filter_simple_horizontal_edge_neon
|
||||
ENDP ;|vp8_loop_filter_bhs_neon|
|
||||
|
||||
END
|
|
@ -1,111 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include "./vpx_config.h"
|
||||
|
||||
static INLINE void vp8_loop_filter_simple_horizontal_edge_neon(
|
||||
unsigned char *s,
|
||||
int p,
|
||||
const unsigned char *blimit) {
|
||||
uint8_t *sp;
|
||||
uint8x16_t qblimit, q0u8;
|
||||
uint8x16_t q5u8, q6u8, q7u8, q8u8, q9u8, q10u8, q14u8, q15u8;
|
||||
int16x8_t q2s16, q3s16, q13s16;
|
||||
int8x8_t d8s8, d9s8;
|
||||
int8x16_t q2s8, q3s8, q4s8, q10s8, q11s8, q14s8;
|
||||
|
||||
qblimit = vdupq_n_u8(*blimit);
|
||||
|
||||
sp = s - (p << 1);
|
||||
q5u8 = vld1q_u8(sp);
|
||||
sp += p;
|
||||
q6u8 = vld1q_u8(sp);
|
||||
sp += p;
|
||||
q7u8 = vld1q_u8(sp);
|
||||
sp += p;
|
||||
q8u8 = vld1q_u8(sp);
|
||||
|
||||
q15u8 = vabdq_u8(q6u8, q7u8);
|
||||
q14u8 = vabdq_u8(q5u8, q8u8);
|
||||
|
||||
q15u8 = vqaddq_u8(q15u8, q15u8);
|
||||
q14u8 = vshrq_n_u8(q14u8, 1);
|
||||
q0u8 = vdupq_n_u8(0x80);
|
||||
q13s16 = vdupq_n_s16(3);
|
||||
q15u8 = vqaddq_u8(q15u8, q14u8);
|
||||
|
||||
q5u8 = veorq_u8(q5u8, q0u8);
|
||||
q6u8 = veorq_u8(q6u8, q0u8);
|
||||
q7u8 = veorq_u8(q7u8, q0u8);
|
||||
q8u8 = veorq_u8(q8u8, q0u8);
|
||||
|
||||
q15u8 = vcgeq_u8(qblimit, q15u8);
|
||||
|
||||
q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7u8)),
|
||||
vget_low_s8(vreinterpretq_s8_u8(q6u8)));
|
||||
q3s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7u8)),
|
||||
vget_high_s8(vreinterpretq_s8_u8(q6u8)));
|
||||
|
||||
q4s8 = vqsubq_s8(vreinterpretq_s8_u8(q5u8),
|
||||
vreinterpretq_s8_u8(q8u8));
|
||||
|
||||
q2s16 = vmulq_s16(q2s16, q13s16);
|
||||
q3s16 = vmulq_s16(q3s16, q13s16);
|
||||
|
||||
q10u8 = vdupq_n_u8(3);
|
||||
q9u8 = vdupq_n_u8(4);
|
||||
|
||||
q2s16 = vaddw_s8(q2s16, vget_low_s8(q4s8));
|
||||
q3s16 = vaddw_s8(q3s16, vget_high_s8(q4s8));
|
||||
|
||||
d8s8 = vqmovn_s16(q2s16);
|
||||
d9s8 = vqmovn_s16(q3s16);
|
||||
q4s8 = vcombine_s8(d8s8, d9s8);
|
||||
|
||||
q14s8 = vandq_s8(q4s8, vreinterpretq_s8_u8(q15u8));
|
||||
|
||||
q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q10u8));
|
||||
q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q9u8));
|
||||
q2s8 = vshrq_n_s8(q2s8, 3);
|
||||
q3s8 = vshrq_n_s8(q3s8, 3);
|
||||
|
||||
q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6u8), q2s8);
|
||||
q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7u8), q3s8);
|
||||
|
||||
q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
|
||||
q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
|
||||
|
||||
vst1q_u8(s, q7u8);
|
||||
s -= p;
|
||||
vst1q_u8(s, q6u8);
|
||||
return;
|
||||
}
|
||||
|
||||
void vp8_loop_filter_bhs_neon(
|
||||
unsigned char *y_ptr,
|
||||
int y_stride,
|
||||
const unsigned char *blimit) {
|
||||
y_ptr += y_stride * 4;
|
||||
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
|
||||
y_ptr += y_stride * 4;
|
||||
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
|
||||
y_ptr += y_stride * 4;
|
||||
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
|
||||
return;
|
||||
}
|
||||
|
||||
void vp8_loop_filter_mbhs_neon(
|
||||
unsigned char *y_ptr,
|
||||
int y_stride,
|
||||
const unsigned char *blimit) {
|
||||
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
|
||||
return;
|
||||
}
|
|
@ -0,0 +1,154 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
;EXPORT |vp8_loop_filter_simple_vertical_edge_neon|
|
||||
EXPORT |vp8_loop_filter_bvs_neon|
|
||||
EXPORT |vp8_loop_filter_mbvs_neon|
|
||||
ARM
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; r0 unsigned char *s, PRESERVE
|
||||
; r1 int p, PRESERVE
|
||||
; q1 limit, PRESERVE
|
||||
|
||||
|vp8_loop_filter_simple_vertical_edge_neon| PROC
|
||||
sub r0, r0, #2 ; move src pointer down by 2 columns
|
||||
add r12, r1, r1
|
||||
add r3, r0, r1
|
||||
|
||||
vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r12
|
||||
vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r3], r12
|
||||
vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r12
|
||||
vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r3], r12
|
||||
vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r12
|
||||
vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r3], r12
|
||||
vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r12
|
||||
vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r3], r12
|
||||
|
||||
vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r12
|
||||
vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r3], r12
|
||||
vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r12
|
||||
vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r3], r12
|
||||
vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r12
|
||||
vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r3], r12
|
||||
vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r12
|
||||
vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r3]
|
||||
|
||||
vswp d7, d10
|
||||
vswp d12, d9
|
||||
|
||||
;vp8_filter_mask() function
|
||||
;vp8_hevmask() function
|
||||
sub r0, r0, r1, lsl #4
|
||||
vabd.u8 q15, q5, q4 ; abs(p0 - q0)
|
||||
vabd.u8 q14, q3, q6 ; abs(p1 - q1)
|
||||
|
||||
vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2
|
||||
vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2
|
||||
vmov.u8 q0, #0x80 ; 0x80
|
||||
vmov.s16 q11, #3
|
||||
vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
|
||||
|
||||
veor q4, q4, q0 ; qs0: q0 offset to convert to a signed value
|
||||
veor q5, q5, q0 ; ps0: p0 offset to convert to a signed value
|
||||
veor q3, q3, q0 ; ps1: p1 offset to convert to a signed value
|
||||
veor q6, q6, q0 ; qs1: q1 offset to convert to a signed value
|
||||
|
||||
vcge.u8 q15, q1, q15 ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
|
||||
|
||||
vsubl.s8 q2, d8, d10 ; ( qs0 - ps0)
|
||||
vsubl.s8 q13, d9, d11
|
||||
|
||||
vqsub.s8 q14, q3, q6 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
|
||||
|
||||
vmul.s16 q2, q2, q11 ; 3 * ( qs0 - ps0)
|
||||
vmul.s16 q13, q13, q11
|
||||
|
||||
vmov.u8 q11, #0x03 ; 0x03
|
||||
vmov.u8 q12, #0x04 ; 0x04
|
||||
|
||||
vaddw.s8 q2, q2, d28 ; vp8_filter + 3 * ( qs0 - ps0)
|
||||
vaddw.s8 q13, q13, d29
|
||||
|
||||
vqmovn.s16 d28, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
|
||||
vqmovn.s16 d29, q13
|
||||
|
||||
add r0, r0, #1
|
||||
add r3, r0, r1
|
||||
|
||||
vand q14, q14, q15 ; vp8_filter &= mask
|
||||
|
||||
vqadd.s8 q2, q14, q11 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
|
||||
vqadd.s8 q3, q14, q12 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
|
||||
vshr.s8 q2, q2, #3 ; Filter2 >>= 3
|
||||
vshr.s8 q14, q3, #3 ; Filter1 >>= 3
|
||||
|
||||
;calculate output
|
||||
vqadd.s8 q11, q5, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2)
|
||||
vqsub.s8 q10, q4, q14 ; u = vp8_signed_char_clamp(qs0 - Filter1)
|
||||
|
||||
veor q6, q11, q0 ; *op0 = u^0x80
|
||||
veor q7, q10, q0 ; *oq0 = u^0x80
|
||||
add r12, r1, r1
|
||||
vswp d13, d14
|
||||
|
||||
;store op1, op0, oq0, oq1
|
||||
vst2.8 {d12[0], d13[0]}, [r0], r12
|
||||
vst2.8 {d12[1], d13[1]}, [r3], r12
|
||||
vst2.8 {d12[2], d13[2]}, [r0], r12
|
||||
vst2.8 {d12[3], d13[3]}, [r3], r12
|
||||
vst2.8 {d12[4], d13[4]}, [r0], r12
|
||||
vst2.8 {d12[5], d13[5]}, [r3], r12
|
||||
vst2.8 {d12[6], d13[6]}, [r0], r12
|
||||
vst2.8 {d12[7], d13[7]}, [r3], r12
|
||||
vst2.8 {d14[0], d15[0]}, [r0], r12
|
||||
vst2.8 {d14[1], d15[1]}, [r3], r12
|
||||
vst2.8 {d14[2], d15[2]}, [r0], r12
|
||||
vst2.8 {d14[3], d15[3]}, [r3], r12
|
||||
vst2.8 {d14[4], d15[4]}, [r0], r12
|
||||
vst2.8 {d14[5], d15[5]}, [r3], r12
|
||||
vst2.8 {d14[6], d15[6]}, [r0], r12
|
||||
vst2.8 {d14[7], d15[7]}, [r3]
|
||||
|
||||
bx lr
|
||||
ENDP ; |vp8_loop_filter_simple_vertical_edge_neon|
|
||||
|
||||
; r0 unsigned char *y
|
||||
; r1 int ystride
|
||||
; r2 const unsigned char *blimit
|
||||
|
||||
|vp8_loop_filter_bvs_neon| PROC
|
||||
push {r4, lr}
|
||||
ldrb r3, [r2] ; load blim from mem
|
||||
mov r4, r0
|
||||
add r0, r0, #4
|
||||
vdup.s8 q1, r3 ; duplicate blim
|
||||
bl vp8_loop_filter_simple_vertical_edge_neon
|
||||
; vp8_loop_filter_simple_vertical_edge_neon preserves r1 and q1
|
||||
add r0, r4, #8
|
||||
bl vp8_loop_filter_simple_vertical_edge_neon
|
||||
add r0, r4, #12
|
||||
pop {r4, lr}
|
||||
b vp8_loop_filter_simple_vertical_edge_neon
|
||||
ENDP ;|vp8_loop_filter_bvs_neon|
|
||||
|
||||
; r0 unsigned char *y
|
||||
; r1 int ystride
|
||||
; r2 const unsigned char *blimit
|
||||
|
||||
|vp8_loop_filter_mbvs_neon| PROC
|
||||
ldrb r3, [r2] ; load mblim from mem
|
||||
vdup.s8 q1, r3 ; duplicate mblim
|
||||
b vp8_loop_filter_simple_vertical_edge_neon
|
||||
ENDP ;|vp8_loop_filter_bvs_neon|
|
||||
END
|
|
@ -1,280 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include "./vpx_config.h"
|
||||
#include "vpx_ports/arm.h"
|
||||
|
||||
#ifdef VPX_INCOMPATIBLE_GCC
|
||||
static INLINE void write_2x4(unsigned char *dst, int pitch,
|
||||
const uint8x8x2_t result) {
|
||||
/*
|
||||
* uint8x8x2_t result
|
||||
00 01 02 03 | 04 05 06 07
|
||||
10 11 12 13 | 14 15 16 17
|
||||
---
|
||||
* after vtrn_u8
|
||||
00 10 02 12 | 04 14 06 16
|
||||
01 11 03 13 | 05 15 07 17
|
||||
*/
|
||||
const uint8x8x2_t r01_u8 = vtrn_u8(result.val[0],
|
||||
result.val[1]);
|
||||
const uint16x4_t x_0_4 = vreinterpret_u16_u8(r01_u8.val[0]);
|
||||
const uint16x4_t x_1_5 = vreinterpret_u16_u8(r01_u8.val[1]);
|
||||
vst1_lane_u16((uint16_t *)dst, x_0_4, 0);
|
||||
dst += pitch;
|
||||
vst1_lane_u16((uint16_t *)dst, x_1_5, 0);
|
||||
dst += pitch;
|
||||
vst1_lane_u16((uint16_t *)dst, x_0_4, 1);
|
||||
dst += pitch;
|
||||
vst1_lane_u16((uint16_t *)dst, x_1_5, 1);
|
||||
dst += pitch;
|
||||
vst1_lane_u16((uint16_t *)dst, x_0_4, 2);
|
||||
dst += pitch;
|
||||
vst1_lane_u16((uint16_t *)dst, x_1_5, 2);
|
||||
dst += pitch;
|
||||
vst1_lane_u16((uint16_t *)dst, x_0_4, 3);
|
||||
dst += pitch;
|
||||
vst1_lane_u16((uint16_t *)dst, x_1_5, 3);
|
||||
}
|
||||
|
||||
static INLINE void write_2x8(unsigned char *dst, int pitch,
|
||||
const uint8x8x2_t result,
|
||||
const uint8x8x2_t result2) {
|
||||
write_2x4(dst, pitch, result);
|
||||
dst += pitch * 8;
|
||||
write_2x4(dst, pitch, result2);
|
||||
}
|
||||
#else
|
||||
static INLINE void write_2x8(unsigned char *dst, int pitch,
|
||||
const uint8x8x2_t result,
|
||||
const uint8x8x2_t result2) {
|
||||
vst2_lane_u8(dst, result, 0);
|
||||
dst += pitch;
|
||||
vst2_lane_u8(dst, result, 1);
|
||||
dst += pitch;
|
||||
vst2_lane_u8(dst, result, 2);
|
||||
dst += pitch;
|
||||
vst2_lane_u8(dst, result, 3);
|
||||
dst += pitch;
|
||||
vst2_lane_u8(dst, result, 4);
|
||||
dst += pitch;
|
||||
vst2_lane_u8(dst, result, 5);
|
||||
dst += pitch;
|
||||
vst2_lane_u8(dst, result, 6);
|
||||
dst += pitch;
|
||||
vst2_lane_u8(dst, result, 7);
|
||||
dst += pitch;
|
||||
|
||||
vst2_lane_u8(dst, result2, 0);
|
||||
dst += pitch;
|
||||
vst2_lane_u8(dst, result2, 1);
|
||||
dst += pitch;
|
||||
vst2_lane_u8(dst, result2, 2);
|
||||
dst += pitch;
|
||||
vst2_lane_u8(dst, result2, 3);
|
||||
dst += pitch;
|
||||
vst2_lane_u8(dst, result2, 4);
|
||||
dst += pitch;
|
||||
vst2_lane_u8(dst, result2, 5);
|
||||
dst += pitch;
|
||||
vst2_lane_u8(dst, result2, 6);
|
||||
dst += pitch;
|
||||
vst2_lane_u8(dst, result2, 7);
|
||||
}
|
||||
#endif // VPX_INCOMPATIBLE_GCC
|
||||
|
||||
|
||||
#ifdef VPX_INCOMPATIBLE_GCC
|
||||
static INLINE
|
||||
uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) {
|
||||
const uint8x8_t a = vld1_u8(src);
|
||||
const uint8x8_t b = vld1_u8(src + pitch * 1);
|
||||
const uint8x8_t c = vld1_u8(src + pitch * 2);
|
||||
const uint8x8_t d = vld1_u8(src + pitch * 3);
|
||||
const uint8x8_t e = vld1_u8(src + pitch * 4);
|
||||
const uint8x8_t f = vld1_u8(src + pitch * 5);
|
||||
const uint8x8_t g = vld1_u8(src + pitch * 6);
|
||||
const uint8x8_t h = vld1_u8(src + pitch * 7);
|
||||
const uint32x2x2_t r04_u32 = vtrn_u32(vreinterpret_u32_u8(a),
|
||||
vreinterpret_u32_u8(e));
|
||||
const uint32x2x2_t r15_u32 = vtrn_u32(vreinterpret_u32_u8(b),
|
||||
vreinterpret_u32_u8(f));
|
||||
const uint32x2x2_t r26_u32 = vtrn_u32(vreinterpret_u32_u8(c),
|
||||
vreinterpret_u32_u8(g));
|
||||
const uint32x2x2_t r37_u32 = vtrn_u32(vreinterpret_u32_u8(d),
|
||||
vreinterpret_u32_u8(h));
|
||||
const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u32(r04_u32.val[0]),
|
||||
vreinterpret_u16_u32(r26_u32.val[0]));
|
||||
const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u32(r15_u32.val[0]),
|
||||
vreinterpret_u16_u32(r37_u32.val[0]));
|
||||
const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
|
||||
vreinterpret_u8_u16(r13_u16.val[0]));
|
||||
const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
|
||||
vreinterpret_u8_u16(r13_u16.val[1]));
|
||||
/*
|
||||
* after vtrn_u32
|
||||
00 01 02 03 | 40 41 42 43
|
||||
10 11 12 13 | 50 51 52 53
|
||||
20 21 22 23 | 60 61 62 63
|
||||
30 31 32 33 | 70 71 72 73
|
||||
---
|
||||
* after vtrn_u16
|
||||
00 01 20 21 | 40 41 60 61
|
||||
02 03 22 23 | 42 43 62 63
|
||||
10 11 30 31 | 50 51 70 71
|
||||
12 13 32 33 | 52 52 72 73
|
||||
|
||||
00 01 20 21 | 40 41 60 61
|
||||
10 11 30 31 | 50 51 70 71
|
||||
02 03 22 23 | 42 43 62 63
|
||||
12 13 32 33 | 52 52 72 73
|
||||
---
|
||||
* after vtrn_u8
|
||||
00 10 20 30 | 40 50 60 70
|
||||
01 11 21 31 | 41 51 61 71
|
||||
02 12 22 32 | 42 52 62 72
|
||||
03 13 23 33 | 43 53 63 73
|
||||
*/
|
||||
x.val[0] = r01_u8.val[0];
|
||||
x.val[1] = r01_u8.val[1];
|
||||
x.val[2] = r23_u8.val[0];
|
||||
x.val[3] = r23_u8.val[1];
|
||||
|
||||
return x;
|
||||
}
|
||||
#else
|
||||
static INLINE
|
||||
uint8x8x4_t read_4x8(unsigned char *src, int pitch, uint8x8x4_t x) {
|
||||
x = vld4_lane_u8(src, x, 0);
|
||||
src += pitch;
|
||||
x = vld4_lane_u8(src, x, 1);
|
||||
src += pitch;
|
||||
x = vld4_lane_u8(src, x, 2);
|
||||
src += pitch;
|
||||
x = vld4_lane_u8(src, x, 3);
|
||||
src += pitch;
|
||||
x = vld4_lane_u8(src, x, 4);
|
||||
src += pitch;
|
||||
x = vld4_lane_u8(src, x, 5);
|
||||
src += pitch;
|
||||
x = vld4_lane_u8(src, x, 6);
|
||||
src += pitch;
|
||||
x = vld4_lane_u8(src, x, 7);
|
||||
return x;
|
||||
}
|
||||
#endif // VPX_INCOMPATIBLE_GCC
|
||||
|
||||
static INLINE void vp8_loop_filter_simple_vertical_edge_neon(
|
||||
unsigned char *s,
|
||||
int p,
|
||||
const unsigned char *blimit) {
|
||||
unsigned char *src1;
|
||||
uint8x16_t qblimit, q0u8;
|
||||
uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q11u8, q12u8, q14u8, q15u8;
|
||||
int16x8_t q2s16, q13s16, q11s16;
|
||||
int8x8_t d28s8, d29s8;
|
||||
int8x16_t q2s8, q3s8, q10s8, q11s8, q14s8;
|
||||
uint8x8x4_t d0u8x4; // d6, d7, d8, d9
|
||||
uint8x8x4_t d1u8x4; // d10, d11, d12, d13
|
||||
uint8x8x2_t d2u8x2; // d12, d13
|
||||
uint8x8x2_t d3u8x2; // d14, d15
|
||||
|
||||
qblimit = vdupq_n_u8(*blimit);
|
||||
|
||||
src1 = s - 2;
|
||||
d0u8x4 = read_4x8(src1, p, d0u8x4);
|
||||
src1 += p * 8;
|
||||
d1u8x4 = read_4x8(src1, p, d1u8x4);
|
||||
|
||||
q3u8 = vcombine_u8(d0u8x4.val[0], d1u8x4.val[0]); // d6 d10
|
||||
q4u8 = vcombine_u8(d0u8x4.val[2], d1u8x4.val[2]); // d8 d12
|
||||
q5u8 = vcombine_u8(d0u8x4.val[1], d1u8x4.val[1]); // d7 d11
|
||||
q6u8 = vcombine_u8(d0u8x4.val[3], d1u8x4.val[3]); // d9 d13
|
||||
|
||||
q15u8 = vabdq_u8(q5u8, q4u8);
|
||||
q14u8 = vabdq_u8(q3u8, q6u8);
|
||||
|
||||
q15u8 = vqaddq_u8(q15u8, q15u8);
|
||||
q14u8 = vshrq_n_u8(q14u8, 1);
|
||||
q0u8 = vdupq_n_u8(0x80);
|
||||
q11s16 = vdupq_n_s16(3);
|
||||
q15u8 = vqaddq_u8(q15u8, q14u8);
|
||||
|
||||
q3u8 = veorq_u8(q3u8, q0u8);
|
||||
q4u8 = veorq_u8(q4u8, q0u8);
|
||||
q5u8 = veorq_u8(q5u8, q0u8);
|
||||
q6u8 = veorq_u8(q6u8, q0u8);
|
||||
|
||||
q15u8 = vcgeq_u8(qblimit, q15u8);
|
||||
|
||||
q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q4u8)),
|
||||
vget_low_s8(vreinterpretq_s8_u8(q5u8)));
|
||||
q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q4u8)),
|
||||
vget_high_s8(vreinterpretq_s8_u8(q5u8)));
|
||||
|
||||
q14s8 = vqsubq_s8(vreinterpretq_s8_u8(q3u8),
|
||||
vreinterpretq_s8_u8(q6u8));
|
||||
|
||||
q2s16 = vmulq_s16(q2s16, q11s16);
|
||||
q13s16 = vmulq_s16(q13s16, q11s16);
|
||||
|
||||
q11u8 = vdupq_n_u8(3);
|
||||
q12u8 = vdupq_n_u8(4);
|
||||
|
||||
q2s16 = vaddw_s8(q2s16, vget_low_s8(q14s8));
|
||||
q13s16 = vaddw_s8(q13s16, vget_high_s8(q14s8));
|
||||
|
||||
d28s8 = vqmovn_s16(q2s16);
|
||||
d29s8 = vqmovn_s16(q13s16);
|
||||
q14s8 = vcombine_s8(d28s8, d29s8);
|
||||
|
||||
q14s8 = vandq_s8(q14s8, vreinterpretq_s8_u8(q15u8));
|
||||
|
||||
q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q11u8));
|
||||
q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q12u8));
|
||||
q2s8 = vshrq_n_s8(q2s8, 3);
|
||||
q14s8 = vshrq_n_s8(q3s8, 3);
|
||||
|
||||
q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q5u8), q2s8);
|
||||
q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q4u8), q14s8);
|
||||
|
||||
q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
|
||||
q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
|
||||
|
||||
d2u8x2.val[0] = vget_low_u8(q6u8); // d12
|
||||
d2u8x2.val[1] = vget_low_u8(q7u8); // d14
|
||||
d3u8x2.val[0] = vget_high_u8(q6u8); // d13
|
||||
d3u8x2.val[1] = vget_high_u8(q7u8); // d15
|
||||
|
||||
src1 = s - 1;
|
||||
write_2x8(src1, p, d2u8x2, d3u8x2);
|
||||
}
|
||||
|
||||
void vp8_loop_filter_bvs_neon(
|
||||
unsigned char *y_ptr,
|
||||
int y_stride,
|
||||
const unsigned char *blimit) {
|
||||
y_ptr += 4;
|
||||
vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
|
||||
y_ptr += 4;
|
||||
vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
|
||||
y_ptr += 4;
|
||||
vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
|
||||
return;
|
||||
}
|
||||
|
||||
void vp8_loop_filter_mbvs_neon(
|
||||
unsigned char *y_ptr,
|
||||
int y_stride,
|
||||
const unsigned char *blimit) {
|
||||
vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit);
|
||||
return;
|
||||
}
|
|
@ -0,0 +1,469 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_mbloop_filter_horizontal_edge_y_neon|
|
||||
EXPORT |vp8_mbloop_filter_horizontal_edge_uv_neon|
|
||||
EXPORT |vp8_mbloop_filter_vertical_edge_y_neon|
|
||||
EXPORT |vp8_mbloop_filter_vertical_edge_uv_neon|
|
||||
ARM
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
|
||||
; const unsigned char *blimit,
|
||||
; const unsigned char *limit,
|
||||
; const unsigned char *thresh)
|
||||
; r0 unsigned char *src,
|
||||
; r1 int pitch,
|
||||
; r2 unsigned char blimit
|
||||
; r3 unsigned char limit
|
||||
; sp unsigned char thresh,
|
||||
|vp8_mbloop_filter_horizontal_edge_y_neon| PROC
|
||||
push {lr}
|
||||
add r1, r1, r1 ; double stride
|
||||
ldr r12, [sp, #4] ; load thresh
|
||||
sub r0, r0, r1, lsl #1 ; move src pointer down by 4 lines
|
||||
vdup.u8 q2, r12 ; thresh
|
||||
add r12, r0, r1, lsr #1 ; move src pointer up by 1 line
|
||||
|
||||
vld1.u8 {q3}, [r0@128], r1 ; p3
|
||||
vld1.u8 {q4}, [r12@128], r1 ; p2
|
||||
vld1.u8 {q5}, [r0@128], r1 ; p1
|
||||
vld1.u8 {q6}, [r12@128], r1 ; p0
|
||||
vld1.u8 {q7}, [r0@128], r1 ; q0
|
||||
vld1.u8 {q8}, [r12@128], r1 ; q1
|
||||
vld1.u8 {q9}, [r0@128], r1 ; q2
|
||||
vld1.u8 {q10}, [r12@128], r1 ; q3
|
||||
|
||||
bl vp8_mbloop_filter_neon
|
||||
|
||||
sub r12, r12, r1, lsl #2
|
||||
add r0, r12, r1, lsr #1
|
||||
|
||||
vst1.u8 {q4}, [r12@128],r1 ; store op2
|
||||
vst1.u8 {q5}, [r0@128],r1 ; store op1
|
||||
vst1.u8 {q6}, [r12@128], r1 ; store op0
|
||||
vst1.u8 {q7}, [r0@128],r1 ; store oq0
|
||||
vst1.u8 {q8}, [r12@128] ; store oq1
|
||||
vst1.u8 {q9}, [r0@128] ; store oq2
|
||||
|
||||
pop {pc}
|
||||
ENDP ; |vp8_mbloop_filter_horizontal_edge_y_neon|
|
||||
|
||||
; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
|
||||
; const unsigned char *blimit,
|
||||
; const unsigned char *limit,
|
||||
; const unsigned char *thresh,
|
||||
; unsigned char *v)
|
||||
; r0 unsigned char *u,
|
||||
; r1 int pitch,
|
||||
; r2 unsigned char blimit
|
||||
; r3 unsigned char limit
|
||||
; sp unsigned char thresh,
|
||||
; sp+4 unsigned char *v
|
||||
|
||||
|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
|
||||
push {lr}
|
||||
ldr r12, [sp, #4] ; load thresh
|
||||
sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines
|
||||
vdup.u8 q2, r12 ; thresh
|
||||
ldr r12, [sp, #8] ; load v ptr
|
||||
sub r12, r12, r1, lsl #2 ; move v pointer down by 4 lines
|
||||
|
||||
vld1.u8 {d6}, [r0@64], r1 ; p3
|
||||
vld1.u8 {d7}, [r12@64], r1 ; p3
|
||||
vld1.u8 {d8}, [r0@64], r1 ; p2
|
||||
vld1.u8 {d9}, [r12@64], r1 ; p2
|
||||
vld1.u8 {d10}, [r0@64], r1 ; p1
|
||||
vld1.u8 {d11}, [r12@64], r1 ; p1
|
||||
vld1.u8 {d12}, [r0@64], r1 ; p0
|
||||
vld1.u8 {d13}, [r12@64], r1 ; p0
|
||||
vld1.u8 {d14}, [r0@64], r1 ; q0
|
||||
vld1.u8 {d15}, [r12@64], r1 ; q0
|
||||
vld1.u8 {d16}, [r0@64], r1 ; q1
|
||||
vld1.u8 {d17}, [r12@64], r1 ; q1
|
||||
vld1.u8 {d18}, [r0@64], r1 ; q2
|
||||
vld1.u8 {d19}, [r12@64], r1 ; q2
|
||||
vld1.u8 {d20}, [r0@64], r1 ; q3
|
||||
vld1.u8 {d21}, [r12@64], r1 ; q3
|
||||
|
||||
bl vp8_mbloop_filter_neon
|
||||
|
||||
sub r0, r0, r1, lsl #3
|
||||
sub r12, r12, r1, lsl #3
|
||||
|
||||
add r0, r0, r1
|
||||
add r12, r12, r1
|
||||
|
||||
vst1.u8 {d8}, [r0@64], r1 ; store u op2
|
||||
vst1.u8 {d9}, [r12@64], r1 ; store v op2
|
||||
vst1.u8 {d10}, [r0@64], r1 ; store u op1
|
||||
vst1.u8 {d11}, [r12@64], r1 ; store v op1
|
||||
vst1.u8 {d12}, [r0@64], r1 ; store u op0
|
||||
vst1.u8 {d13}, [r12@64], r1 ; store v op0
|
||||
vst1.u8 {d14}, [r0@64], r1 ; store u oq0
|
||||
vst1.u8 {d15}, [r12@64], r1 ; store v oq0
|
||||
vst1.u8 {d16}, [r0@64], r1 ; store u oq1
|
||||
vst1.u8 {d17}, [r12@64], r1 ; store v oq1
|
||||
vst1.u8 {d18}, [r0@64], r1 ; store u oq2
|
||||
vst1.u8 {d19}, [r12@64], r1 ; store v oq2
|
||||
|
||||
pop {pc}
|
||||
ENDP ; |vp8_mbloop_filter_horizontal_edge_uv_neon|
|
||||
|
||||
; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
|
||||
; const unsigned char *blimit,
|
||||
; const unsigned char *limit,
|
||||
; const unsigned char *thresh)
|
||||
; r0 unsigned char *src,
|
||||
; r1 int pitch,
|
||||
; r2 unsigned char blimit
|
||||
; r3 unsigned char limit
|
||||
; sp unsigned char thresh,
|
||||
|vp8_mbloop_filter_vertical_edge_y_neon| PROC
|
||||
push {lr}
|
||||
ldr r12, [sp, #4] ; load thresh
|
||||
sub r0, r0, #4 ; move src pointer down by 4 columns
|
||||
vdup.s8 q2, r12 ; thresh
|
||||
add r12, r0, r1, lsl #3 ; move src pointer down by 8 lines
|
||||
|
||||
vld1.u8 {d6}, [r0], r1 ; load first 8-line src data
|
||||
vld1.u8 {d7}, [r12], r1 ; load second 8-line src data
|
||||
vld1.u8 {d8}, [r0], r1
|
||||
vld1.u8 {d9}, [r12], r1
|
||||
vld1.u8 {d10}, [r0], r1
|
||||
vld1.u8 {d11}, [r12], r1
|
||||
vld1.u8 {d12}, [r0], r1
|
||||
vld1.u8 {d13}, [r12], r1
|
||||
vld1.u8 {d14}, [r0], r1
|
||||
vld1.u8 {d15}, [r12], r1
|
||||
vld1.u8 {d16}, [r0], r1
|
||||
vld1.u8 {d17}, [r12], r1
|
||||
vld1.u8 {d18}, [r0], r1
|
||||
vld1.u8 {d19}, [r12], r1
|
||||
vld1.u8 {d20}, [r0], r1
|
||||
vld1.u8 {d21}, [r12], r1
|
||||
|
||||
;transpose to 8x16 matrix
|
||||
vtrn.32 q3, q7
|
||||
vtrn.32 q4, q8
|
||||
vtrn.32 q5, q9
|
||||
vtrn.32 q6, q10
|
||||
|
||||
vtrn.16 q3, q5
|
||||
vtrn.16 q4, q6
|
||||
vtrn.16 q7, q9
|
||||
vtrn.16 q8, q10
|
||||
|
||||
vtrn.8 q3, q4
|
||||
vtrn.8 q5, q6
|
||||
vtrn.8 q7, q8
|
||||
vtrn.8 q9, q10
|
||||
|
||||
sub r0, r0, r1, lsl #3
|
||||
|
||||
bl vp8_mbloop_filter_neon
|
||||
|
||||
sub r12, r12, r1, lsl #3
|
||||
|
||||
;transpose to 16x8 matrix
|
||||
vtrn.32 q3, q7
|
||||
vtrn.32 q4, q8
|
||||
vtrn.32 q5, q9
|
||||
vtrn.32 q6, q10
|
||||
|
||||
vtrn.16 q3, q5
|
||||
vtrn.16 q4, q6
|
||||
vtrn.16 q7, q9
|
||||
vtrn.16 q8, q10
|
||||
|
||||
vtrn.8 q3, q4
|
||||
vtrn.8 q5, q6
|
||||
vtrn.8 q7, q8
|
||||
vtrn.8 q9, q10
|
||||
|
||||
;store op2, op1, op0, oq0, oq1, oq2
|
||||
vst1.8 {d6}, [r0], r1
|
||||
vst1.8 {d7}, [r12], r1
|
||||
vst1.8 {d8}, [r0], r1
|
||||
vst1.8 {d9}, [r12], r1
|
||||
vst1.8 {d10}, [r0], r1
|
||||
vst1.8 {d11}, [r12], r1
|
||||
vst1.8 {d12}, [r0], r1
|
||||
vst1.8 {d13}, [r12], r1
|
||||
vst1.8 {d14}, [r0], r1
|
||||
vst1.8 {d15}, [r12], r1
|
||||
vst1.8 {d16}, [r0], r1
|
||||
vst1.8 {d17}, [r12], r1
|
||||
vst1.8 {d18}, [r0], r1
|
||||
vst1.8 {d19}, [r12], r1
|
||||
vst1.8 {d20}, [r0]
|
||||
vst1.8 {d21}, [r12]
|
||||
|
||||
pop {pc}
|
||||
ENDP ; |vp8_mbloop_filter_vertical_edge_y_neon|
|
||||
|
||||
; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
|
||||
; const unsigned char *blimit,
|
||||
; const unsigned char *limit,
|
||||
; const unsigned char *thresh,
|
||||
; unsigned char *v)
|
||||
; r0 unsigned char *u,
|
||||
; r1 int pitch,
|
||||
; r2 const signed char *flimit,
|
||||
; r3 const signed char *limit,
|
||||
; sp const signed char *thresh,
|
||||
; sp+4 unsigned char *v
|
||||
|vp8_mbloop_filter_vertical_edge_uv_neon| PROC
|
||||
push {lr}
|
||||
ldr r12, [sp, #4] ; load thresh
|
||||
sub r0, r0, #4 ; move u pointer down by 4 columns
|
||||
vdup.u8 q2, r12 ; thresh
|
||||
ldr r12, [sp, #8] ; load v ptr
|
||||
sub r12, r12, #4 ; move v pointer down by 4 columns
|
||||
|
||||
vld1.u8 {d6}, [r0], r1 ;load u data
|
||||
vld1.u8 {d7}, [r12], r1 ;load v data
|
||||
vld1.u8 {d8}, [r0], r1
|
||||
vld1.u8 {d9}, [r12], r1
|
||||
vld1.u8 {d10}, [r0], r1
|
||||
vld1.u8 {d11}, [r12], r1
|
||||
vld1.u8 {d12}, [r0], r1
|
||||
vld1.u8 {d13}, [r12], r1
|
||||
vld1.u8 {d14}, [r0], r1
|
||||
vld1.u8 {d15}, [r12], r1
|
||||
vld1.u8 {d16}, [r0], r1
|
||||
vld1.u8 {d17}, [r12], r1
|
||||
vld1.u8 {d18}, [r0], r1
|
||||
vld1.u8 {d19}, [r12], r1
|
||||
vld1.u8 {d20}, [r0], r1
|
||||
vld1.u8 {d21}, [r12], r1
|
||||
|
||||
;transpose to 8x16 matrix
|
||||
vtrn.32 q3, q7
|
||||
vtrn.32 q4, q8
|
||||
vtrn.32 q5, q9
|
||||
vtrn.32 q6, q10
|
||||
|
||||
vtrn.16 q3, q5
|
||||
vtrn.16 q4, q6
|
||||
vtrn.16 q7, q9
|
||||
vtrn.16 q8, q10
|
||||
|
||||
vtrn.8 q3, q4
|
||||
vtrn.8 q5, q6
|
||||
vtrn.8 q7, q8
|
||||
vtrn.8 q9, q10
|
||||
|
||||
sub r0, r0, r1, lsl #3
|
||||
|
||||
bl vp8_mbloop_filter_neon
|
||||
|
||||
sub r12, r12, r1, lsl #3
|
||||
|
||||
;transpose to 16x8 matrix
|
||||
vtrn.32 q3, q7
|
||||
vtrn.32 q4, q8
|
||||
vtrn.32 q5, q9
|
||||
vtrn.32 q6, q10
|
||||
|
||||
vtrn.16 q3, q5
|
||||
vtrn.16 q4, q6
|
||||
vtrn.16 q7, q9
|
||||
vtrn.16 q8, q10
|
||||
|
||||
vtrn.8 q3, q4
|
||||
vtrn.8 q5, q6
|
||||
vtrn.8 q7, q8
|
||||
vtrn.8 q9, q10
|
||||
|
||||
;store op2, op1, op0, oq0, oq1, oq2
|
||||
vst1.8 {d6}, [r0], r1
|
||||
vst1.8 {d7}, [r12], r1
|
||||
vst1.8 {d8}, [r0], r1
|
||||
vst1.8 {d9}, [r12], r1
|
||||
vst1.8 {d10}, [r0], r1
|
||||
vst1.8 {d11}, [r12], r1
|
||||
vst1.8 {d12}, [r0], r1
|
||||
vst1.8 {d13}, [r12], r1
|
||||
vst1.8 {d14}, [r0], r1
|
||||
vst1.8 {d15}, [r12], r1
|
||||
vst1.8 {d16}, [r0], r1
|
||||
vst1.8 {d17}, [r12], r1
|
||||
vst1.8 {d18}, [r0], r1
|
||||
vst1.8 {d19}, [r12], r1
|
||||
vst1.8 {d20}, [r0]
|
||||
vst1.8 {d21}, [r12]
|
||||
|
||||
pop {pc}
|
||||
ENDP ; |vp8_mbloop_filter_vertical_edge_uv_neon|
|
||||
|
||||
; void vp8_mbloop_filter_neon()
|
||||
; This is a helper function for the macroblock loopfilters. The individual
|
||||
; functions do the necessary load, transpose (if necessary), preserve (if
|
||||
; necessary) and store.
|
||||
|
||||
; r0,r1 PRESERVE
|
||||
; r2 mblimit
|
||||
; r3 limit
|
||||
|
||||
; q2 thresh
|
||||
; q3 p3 PRESERVE
|
||||
; q4 p2
|
||||
; q5 p1
|
||||
; q6 p0
|
||||
; q7 q0
|
||||
; q8 q1
|
||||
; q9 q2
|
||||
; q10 q3 PRESERVE
|
||||
|
||||
|vp8_mbloop_filter_neon| PROC
|
||||
|
||||
; vp8_filter_mask
|
||||
vabd.u8 q11, q3, q4 ; abs(p3 - p2)
|
||||
vabd.u8 q12, q4, q5 ; abs(p2 - p1)
|
||||
vabd.u8 q13, q5, q6 ; abs(p1 - p0)
|
||||
vabd.u8 q14, q8, q7 ; abs(q1 - q0)
|
||||
vabd.u8 q1, q9, q8 ; abs(q2 - q1)
|
||||
vabd.u8 q0, q10, q9 ; abs(q3 - q2)
|
||||
|
||||
vmax.u8 q11, q11, q12
|
||||
vmax.u8 q12, q13, q14
|
||||
vmax.u8 q1, q1, q0
|
||||
vmax.u8 q15, q11, q12
|
||||
|
||||
vabd.u8 q12, q6, q7 ; abs(p0 - q0)
|
||||
|
||||
; vp8_hevmask
|
||||
vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh) * -1
|
||||
vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh) * -1
|
||||
vmax.u8 q15, q15, q1
|
||||
|
||||
vdup.u8 q1, r3 ; limit
|
||||
vdup.u8 q2, r2 ; mblimit
|
||||
|
||||
vmov.u8 q0, #0x80 ; 0x80
|
||||
|
||||
vcge.u8 q15, q1, q15
|
||||
|
||||
vabd.u8 q1, q5, q8 ; a = abs(p1 - q1)
|
||||
vqadd.u8 q12, q12, q12 ; b = abs(p0 - q0) * 2
|
||||
vmov.u16 q11, #3 ; #3
|
||||
|
||||
; vp8_filter
|
||||
; convert to signed
|
||||
veor q7, q7, q0 ; qs0
|
||||
vshr.u8 q1, q1, #1 ; a = a / 2
|
||||
veor q6, q6, q0 ; ps0
|
||||
veor q5, q5, q0 ; ps1
|
||||
|
||||
vqadd.u8 q12, q12, q1 ; a = b + a
|
||||
|
||||
veor q8, q8, q0 ; qs1
|
||||
veor q4, q4, q0 ; ps2
|
||||
veor q9, q9, q0 ; qs2
|
||||
|
||||
vorr q14, q13, q14 ; vp8_hevmask
|
||||
|
||||
vcge.u8 q12, q2, q12 ; (a > flimit * 2 + limit) * -1
|
||||
|
||||
vsubl.s8 q2, d14, d12 ; qs0 - ps0
|
||||
vsubl.s8 q13, d15, d13
|
||||
|
||||
vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1)
|
||||
|
||||
vmul.i16 q2, q2, q11 ; 3 * ( qs0 - ps0)
|
||||
|
||||
vand q15, q15, q12 ; vp8_filter_mask
|
||||
|
||||
vmul.i16 q13, q13, q11
|
||||
|
||||
vmov.u8 q12, #3 ; #3
|
||||
|
||||
vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0)
|
||||
vaddw.s8 q13, q13, d3
|
||||
|
||||
vmov.u8 q11, #4 ; #4
|
||||
|
||||
; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
|
||||
vqmovn.s16 d2, q2
|
||||
vqmovn.s16 d3, q13
|
||||
|
||||
vand q1, q1, q15 ; vp8_filter &= mask
|
||||
|
||||
vmov.u16 q15, #63 ; #63
|
||||
|
||||
vand q13, q1, q14 ; Filter2 &= hev
|
||||
|
||||
vqadd.s8 q2, q13, q11 ; Filter1 = clamp(Filter2+4)
|
||||
vqadd.s8 q13, q13, q12 ; Filter2 = clamp(Filter2+3)
|
||||
|
||||
vmov q0, q15
|
||||
|
||||
vshr.s8 q2, q2, #3 ; Filter1 >>= 3
|
||||
vshr.s8 q13, q13, #3 ; Filter2 >>= 3
|
||||
|
||||
vmov q11, q15
|
||||
vmov q12, q15
|
||||
|
||||
vqsub.s8 q7, q7, q2 ; qs0 = clamp(qs0 - Filter1)
|
||||
|
||||
vqadd.s8 q6, q6, q13 ; ps0 = clamp(ps0 + Filter2)
|
||||
|
||||
vbic q1, q1, q14 ; vp8_filter &= ~hev
|
||||
|
||||
; roughly 1/7th difference across boundary
|
||||
; roughly 2/7th difference across boundary
|
||||
; roughly 3/7th difference across boundary
|
||||
|
||||
vmov.u8 d5, #9 ; #9
|
||||
vmov.u8 d4, #18 ; #18
|
||||
|
||||
vmov q13, q15
|
||||
vmov q14, q15
|
||||
|
||||
vmlal.s8 q0, d2, d5 ; 63 + Filter2 * 9
|
||||
vmlal.s8 q11, d3, d5
|
||||
vmov.u8 d5, #27 ; #27
|
||||
vmlal.s8 q12, d2, d4 ; 63 + Filter2 * 18
|
||||
vmlal.s8 q13, d3, d4
|
||||
vmlal.s8 q14, d2, d5 ; 63 + Filter2 * 27
|
||||
vmlal.s8 q15, d3, d5
|
||||
|
||||
vqshrn.s16 d0, q0, #7 ; u = clamp((63 + Filter2 * 9)>>7)
|
||||
vqshrn.s16 d1, q11, #7
|
||||
vqshrn.s16 d24, q12, #7 ; u = clamp((63 + Filter2 * 18)>>7)
|
||||
vqshrn.s16 d25, q13, #7
|
||||
vqshrn.s16 d28, q14, #7 ; u = clamp((63 + Filter2 * 27)>>7)
|
||||
vqshrn.s16 d29, q15, #7
|
||||
|
||||
vmov.u8 q1, #0x80 ; 0x80
|
||||
|
||||
vqsub.s8 q11, q9, q0 ; s = clamp(qs2 - u)
|
||||
vqadd.s8 q0, q4, q0 ; s = clamp(ps2 + u)
|
||||
vqsub.s8 q13, q8, q12 ; s = clamp(qs1 - u)
|
||||
vqadd.s8 q12, q5, q12 ; s = clamp(ps1 + u)
|
||||
vqsub.s8 q15, q7, q14 ; s = clamp(qs0 - u)
|
||||
vqadd.s8 q14, q6, q14 ; s = clamp(ps0 + u)
|
||||
|
||||
veor q9, q11, q1 ; *oq2 = s^0x80
|
||||
veor q4, q0, q1 ; *op2 = s^0x80
|
||||
veor q8, q13, q1 ; *oq1 = s^0x80
|
||||
veor q5, q12, q1 ; *op2 = s^0x80
|
||||
veor q7, q15, q1 ; *oq0 = s^0x80
|
||||
veor q6, q14, q1 ; *op0 = s^0x80
|
||||
|
||||
bx lr
|
||||
ENDP ; |vp8_mbloop_filter_neon|
|
||||
|
||||
;-----------------
|
||||
|
||||
END
|
|
@ -1,625 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include "./vpx_config.h"
|
||||
|
||||
static INLINE void vp8_mbloop_filter_neon(
|
||||
uint8x16_t qblimit, // mblimit
|
||||
uint8x16_t qlimit, // limit
|
||||
uint8x16_t qthresh, // thresh
|
||||
uint8x16_t q3, // p2
|
||||
uint8x16_t q4, // p2
|
||||
uint8x16_t q5, // p1
|
||||
uint8x16_t q6, // p0
|
||||
uint8x16_t q7, // q0
|
||||
uint8x16_t q8, // q1
|
||||
uint8x16_t q9, // q2
|
||||
uint8x16_t q10, // q3
|
||||
uint8x16_t *q4r, // p1
|
||||
uint8x16_t *q5r, // p1
|
||||
uint8x16_t *q6r, // p0
|
||||
uint8x16_t *q7r, // q0
|
||||
uint8x16_t *q8r, // q1
|
||||
uint8x16_t *q9r) { // q1
|
||||
uint8x16_t q0u8, q1u8, q11u8, q12u8, q13u8, q14u8, q15u8;
|
||||
int16x8_t q0s16, q2s16, q11s16, q12s16, q13s16, q14s16, q15s16;
|
||||
int8x16_t q1s8, q6s8, q7s8, q2s8, q11s8, q13s8;
|
||||
uint16x8_t q0u16, q11u16, q12u16, q13u16, q14u16, q15u16;
|
||||
int8x16_t q0s8, q12s8, q14s8, q15s8;
|
||||
int8x8_t d0, d1, d2, d3, d4, d5, d24, d25, d28, d29;
|
||||
|
||||
q11u8 = vabdq_u8(q3, q4);
|
||||
q12u8 = vabdq_u8(q4, q5);
|
||||
q13u8 = vabdq_u8(q5, q6);
|
||||
q14u8 = vabdq_u8(q8, q7);
|
||||
q1u8 = vabdq_u8(q9, q8);
|
||||
q0u8 = vabdq_u8(q10, q9);
|
||||
|
||||
q11u8 = vmaxq_u8(q11u8, q12u8);
|
||||
q12u8 = vmaxq_u8(q13u8, q14u8);
|
||||
q1u8 = vmaxq_u8(q1u8, q0u8);
|
||||
q15u8 = vmaxq_u8(q11u8, q12u8);
|
||||
|
||||
q12u8 = vabdq_u8(q6, q7);
|
||||
|
||||
// vp8_hevmask
|
||||
q13u8 = vcgtq_u8(q13u8, qthresh);
|
||||
q14u8 = vcgtq_u8(q14u8, qthresh);
|
||||
q15u8 = vmaxq_u8(q15u8, q1u8);
|
||||
|
||||
q15u8 = vcgeq_u8(qlimit, q15u8);
|
||||
|
||||
q1u8 = vabdq_u8(q5, q8);
|
||||
q12u8 = vqaddq_u8(q12u8, q12u8);
|
||||
|
||||
// vp8_filter() function
|
||||
// convert to signed
|
||||
q0u8 = vdupq_n_u8(0x80);
|
||||
q9 = veorq_u8(q9, q0u8);
|
||||
q8 = veorq_u8(q8, q0u8);
|
||||
q7 = veorq_u8(q7, q0u8);
|
||||
q6 = veorq_u8(q6, q0u8);
|
||||
q5 = veorq_u8(q5, q0u8);
|
||||
q4 = veorq_u8(q4, q0u8);
|
||||
|
||||
q1u8 = vshrq_n_u8(q1u8, 1);
|
||||
q12u8 = vqaddq_u8(q12u8, q1u8);
|
||||
|
||||
q14u8 = vorrq_u8(q13u8, q14u8);
|
||||
q12u8 = vcgeq_u8(qblimit, q12u8);
|
||||
|
||||
q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
|
||||
vget_low_s8(vreinterpretq_s8_u8(q6)));
|
||||
q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
|
||||
vget_high_s8(vreinterpretq_s8_u8(q6)));
|
||||
|
||||
q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
|
||||
vreinterpretq_s8_u8(q8));
|
||||
|
||||
q11s16 = vdupq_n_s16(3);
|
||||
q2s16 = vmulq_s16(q2s16, q11s16);
|
||||
q13s16 = vmulq_s16(q13s16, q11s16);
|
||||
|
||||
q15u8 = vandq_u8(q15u8, q12u8);
|
||||
|
||||
q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
|
||||
q13s16 = vaddw_s8(q13s16, vget_high_s8(q1s8));
|
||||
|
||||
q12u8 = vdupq_n_u8(3);
|
||||
q11u8 = vdupq_n_u8(4);
|
||||
// vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
|
||||
d2 = vqmovn_s16(q2s16);
|
||||
d3 = vqmovn_s16(q13s16);
|
||||
q1s8 = vcombine_s8(d2, d3);
|
||||
q1s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q15u8));
|
||||
q13s8 = vandq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
|
||||
|
||||
q2s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q11u8));
|
||||
q13s8 = vqaddq_s8(q13s8, vreinterpretq_s8_u8(q12u8));
|
||||
q2s8 = vshrq_n_s8(q2s8, 3);
|
||||
q13s8 = vshrq_n_s8(q13s8, 3);
|
||||
|
||||
q7s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q2s8);
|
||||
q6s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q13s8);
|
||||
|
||||
q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
|
||||
|
||||
q0u16 = q11u16 = q12u16 = q13u16 = q14u16 = q15u16 = vdupq_n_u16(63);
|
||||
d5 = vdup_n_s8(9);
|
||||
d4 = vdup_n_s8(18);
|
||||
|
||||
q0s16 = vmlal_s8(vreinterpretq_s16_u16(q0u16), vget_low_s8(q1s8), d5);
|
||||
q11s16 = vmlal_s8(vreinterpretq_s16_u16(q11u16), vget_high_s8(q1s8), d5);
|
||||
d5 = vdup_n_s8(27);
|
||||
q12s16 = vmlal_s8(vreinterpretq_s16_u16(q12u16), vget_low_s8(q1s8), d4);
|
||||
q13s16 = vmlal_s8(vreinterpretq_s16_u16(q13u16), vget_high_s8(q1s8), d4);
|
||||
q14s16 = vmlal_s8(vreinterpretq_s16_u16(q14u16), vget_low_s8(q1s8), d5);
|
||||
q15s16 = vmlal_s8(vreinterpretq_s16_u16(q15u16), vget_high_s8(q1s8), d5);
|
||||
|
||||
d0 = vqshrn_n_s16(q0s16 , 7);
|
||||
d1 = vqshrn_n_s16(q11s16, 7);
|
||||
d24 = vqshrn_n_s16(q12s16, 7);
|
||||
d25 = vqshrn_n_s16(q13s16, 7);
|
||||
d28 = vqshrn_n_s16(q14s16, 7);
|
||||
d29 = vqshrn_n_s16(q15s16, 7);
|
||||
|
||||
q0s8 = vcombine_s8(d0, d1);
|
||||
q12s8 = vcombine_s8(d24, d25);
|
||||
q14s8 = vcombine_s8(d28, d29);
|
||||
|
||||
q11s8 = vqsubq_s8(vreinterpretq_s8_u8(q9), q0s8);
|
||||
q0s8 = vqaddq_s8(vreinterpretq_s8_u8(q4), q0s8);
|
||||
q13s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q12s8);
|
||||
q12s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q12s8);
|
||||
q15s8 = vqsubq_s8((q7s8), q14s8);
|
||||
q14s8 = vqaddq_s8((q6s8), q14s8);
|
||||
|
||||
q1u8 = vdupq_n_u8(0x80);
|
||||
*q9r = veorq_u8(vreinterpretq_u8_s8(q11s8), q1u8);
|
||||
*q8r = veorq_u8(vreinterpretq_u8_s8(q13s8), q1u8);
|
||||
*q7r = veorq_u8(vreinterpretq_u8_s8(q15s8), q1u8);
|
||||
*q6r = veorq_u8(vreinterpretq_u8_s8(q14s8), q1u8);
|
||||
*q5r = veorq_u8(vreinterpretq_u8_s8(q12s8), q1u8);
|
||||
*q4r = veorq_u8(vreinterpretq_u8_s8(q0s8), q1u8);
|
||||
return;
|
||||
}
|
||||
|
||||
void vp8_mbloop_filter_horizontal_edge_y_neon(
|
||||
unsigned char *src,
|
||||
int pitch,
|
||||
unsigned char blimit,
|
||||
unsigned char limit,
|
||||
unsigned char thresh) {
|
||||
uint8x16_t qblimit, qlimit, qthresh, q3, q4;
|
||||
uint8x16_t q5, q6, q7, q8, q9, q10;
|
||||
|
||||
qblimit = vdupq_n_u8(blimit);
|
||||
qlimit = vdupq_n_u8(limit);
|
||||
qthresh = vdupq_n_u8(thresh);
|
||||
|
||||
src -= (pitch << 2);
|
||||
|
||||
q3 = vld1q_u8(src);
|
||||
src += pitch;
|
||||
q4 = vld1q_u8(src);
|
||||
src += pitch;
|
||||
q5 = vld1q_u8(src);
|
||||
src += pitch;
|
||||
q6 = vld1q_u8(src);
|
||||
src += pitch;
|
||||
q7 = vld1q_u8(src);
|
||||
src += pitch;
|
||||
q8 = vld1q_u8(src);
|
||||
src += pitch;
|
||||
q9 = vld1q_u8(src);
|
||||
src += pitch;
|
||||
q10 = vld1q_u8(src);
|
||||
|
||||
vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
|
||||
q5, q6, q7, q8, q9, q10,
|
||||
&q4, &q5, &q6, &q7, &q8, &q9);
|
||||
|
||||
src -= (pitch * 6);
|
||||
vst1q_u8(src, q4);
|
||||
src += pitch;
|
||||
vst1q_u8(src, q5);
|
||||
src += pitch;
|
||||
vst1q_u8(src, q6);
|
||||
src += pitch;
|
||||
vst1q_u8(src, q7);
|
||||
src += pitch;
|
||||
vst1q_u8(src, q8);
|
||||
src += pitch;
|
||||
vst1q_u8(src, q9);
|
||||
return;
|
||||
}
|
||||
|
||||
void vp8_mbloop_filter_horizontal_edge_uv_neon(
|
||||
unsigned char *u,
|
||||
int pitch,
|
||||
unsigned char blimit,
|
||||
unsigned char limit,
|
||||
unsigned char thresh,
|
||||
unsigned char *v) {
|
||||
uint8x16_t qblimit, qlimit, qthresh, q3, q4;
|
||||
uint8x16_t q5, q6, q7, q8, q9, q10;
|
||||
uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
|
||||
uint8x8_t d15, d16, d17, d18, d19, d20, d21;
|
||||
|
||||
qblimit = vdupq_n_u8(blimit);
|
||||
qlimit = vdupq_n_u8(limit);
|
||||
qthresh = vdupq_n_u8(thresh);
|
||||
|
||||
u -= (pitch << 2);
|
||||
v -= (pitch << 2);
|
||||
|
||||
d6 = vld1_u8(u);
|
||||
u += pitch;
|
||||
d7 = vld1_u8(v);
|
||||
v += pitch;
|
||||
d8 = vld1_u8(u);
|
||||
u += pitch;
|
||||
d9 = vld1_u8(v);
|
||||
v += pitch;
|
||||
d10 = vld1_u8(u);
|
||||
u += pitch;
|
||||
d11 = vld1_u8(v);
|
||||
v += pitch;
|
||||
d12 = vld1_u8(u);
|
||||
u += pitch;
|
||||
d13 = vld1_u8(v);
|
||||
v += pitch;
|
||||
d14 = vld1_u8(u);
|
||||
u += pitch;
|
||||
d15 = vld1_u8(v);
|
||||
v += pitch;
|
||||
d16 = vld1_u8(u);
|
||||
u += pitch;
|
||||
d17 = vld1_u8(v);
|
||||
v += pitch;
|
||||
d18 = vld1_u8(u);
|
||||
u += pitch;
|
||||
d19 = vld1_u8(v);
|
||||
v += pitch;
|
||||
d20 = vld1_u8(u);
|
||||
d21 = vld1_u8(v);
|
||||
|
||||
q3 = vcombine_u8(d6, d7);
|
||||
q4 = vcombine_u8(d8, d9);
|
||||
q5 = vcombine_u8(d10, d11);
|
||||
q6 = vcombine_u8(d12, d13);
|
||||
q7 = vcombine_u8(d14, d15);
|
||||
q8 = vcombine_u8(d16, d17);
|
||||
q9 = vcombine_u8(d18, d19);
|
||||
q10 = vcombine_u8(d20, d21);
|
||||
|
||||
vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
|
||||
q5, q6, q7, q8, q9, q10,
|
||||
&q4, &q5, &q6, &q7, &q8, &q9);
|
||||
|
||||
u -= (pitch * 6);
|
||||
v -= (pitch * 6);
|
||||
vst1_u8(u, vget_low_u8(q4));
|
||||
u += pitch;
|
||||
vst1_u8(v, vget_high_u8(q4));
|
||||
v += pitch;
|
||||
vst1_u8(u, vget_low_u8(q5));
|
||||
u += pitch;
|
||||
vst1_u8(v, vget_high_u8(q5));
|
||||
v += pitch;
|
||||
vst1_u8(u, vget_low_u8(q6));
|
||||
u += pitch;
|
||||
vst1_u8(v, vget_high_u8(q6));
|
||||
v += pitch;
|
||||
vst1_u8(u, vget_low_u8(q7));
|
||||
u += pitch;
|
||||
vst1_u8(v, vget_high_u8(q7));
|
||||
v += pitch;
|
||||
vst1_u8(u, vget_low_u8(q8));
|
||||
u += pitch;
|
||||
vst1_u8(v, vget_high_u8(q8));
|
||||
v += pitch;
|
||||
vst1_u8(u, vget_low_u8(q9));
|
||||
vst1_u8(v, vget_high_u8(q9));
|
||||
return;
|
||||
}
|
||||
|
||||
void vp8_mbloop_filter_vertical_edge_y_neon(
|
||||
unsigned char *src,
|
||||
int pitch,
|
||||
unsigned char blimit,
|
||||
unsigned char limit,
|
||||
unsigned char thresh) {
|
||||
unsigned char *s1, *s2;
|
||||
uint8x16_t qblimit, qlimit, qthresh, q3, q4;
|
||||
uint8x16_t q5, q6, q7, q8, q9, q10;
|
||||
uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
|
||||
uint8x8_t d15, d16, d17, d18, d19, d20, d21;
|
||||
uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
|
||||
uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
|
||||
uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
|
||||
|
||||
qblimit = vdupq_n_u8(blimit);
|
||||
qlimit = vdupq_n_u8(limit);
|
||||
qthresh = vdupq_n_u8(thresh);
|
||||
|
||||
s1 = src - 4;
|
||||
s2 = s1 + 8 * pitch;
|
||||
d6 = vld1_u8(s1);
|
||||
s1 += pitch;
|
||||
d7 = vld1_u8(s2);
|
||||
s2 += pitch;
|
||||
d8 = vld1_u8(s1);
|
||||
s1 += pitch;
|
||||
d9 = vld1_u8(s2);
|
||||
s2 += pitch;
|
||||
d10 = vld1_u8(s1);
|
||||
s1 += pitch;
|
||||
d11 = vld1_u8(s2);
|
||||
s2 += pitch;
|
||||
d12 = vld1_u8(s1);
|
||||
s1 += pitch;
|
||||
d13 = vld1_u8(s2);
|
||||
s2 += pitch;
|
||||
d14 = vld1_u8(s1);
|
||||
s1 += pitch;
|
||||
d15 = vld1_u8(s2);
|
||||
s2 += pitch;
|
||||
d16 = vld1_u8(s1);
|
||||
s1 += pitch;
|
||||
d17 = vld1_u8(s2);
|
||||
s2 += pitch;
|
||||
d18 = vld1_u8(s1);
|
||||
s1 += pitch;
|
||||
d19 = vld1_u8(s2);
|
||||
s2 += pitch;
|
||||
d20 = vld1_u8(s1);
|
||||
d21 = vld1_u8(s2);
|
||||
|
||||
q3 = vcombine_u8(d6, d7);
|
||||
q4 = vcombine_u8(d8, d9);
|
||||
q5 = vcombine_u8(d10, d11);
|
||||
q6 = vcombine_u8(d12, d13);
|
||||
q7 = vcombine_u8(d14, d15);
|
||||
q8 = vcombine_u8(d16, d17);
|
||||
q9 = vcombine_u8(d18, d19);
|
||||
q10 = vcombine_u8(d20, d21);
|
||||
|
||||
q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
|
||||
q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
|
||||
q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
|
||||
q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
|
||||
|
||||
q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
|
||||
vreinterpretq_u16_u32(q2tmp2.val[0]));
|
||||
q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
|
||||
vreinterpretq_u16_u32(q2tmp3.val[0]));
|
||||
q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
|
||||
vreinterpretq_u16_u32(q2tmp2.val[1]));
|
||||
q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
|
||||
vreinterpretq_u16_u32(q2tmp3.val[1]));
|
||||
|
||||
q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
|
||||
vreinterpretq_u8_u16(q2tmp5.val[0]));
|
||||
q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
|
||||
vreinterpretq_u8_u16(q2tmp5.val[1]));
|
||||
q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
|
||||
vreinterpretq_u8_u16(q2tmp7.val[0]));
|
||||
q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
|
||||
vreinterpretq_u8_u16(q2tmp7.val[1]));
|
||||
|
||||
q3 = q2tmp8.val[0];
|
||||
q4 = q2tmp8.val[1];
|
||||
q5 = q2tmp9.val[0];
|
||||
q6 = q2tmp9.val[1];
|
||||
q7 = q2tmp10.val[0];
|
||||
q8 = q2tmp10.val[1];
|
||||
q9 = q2tmp11.val[0];
|
||||
q10 = q2tmp11.val[1];
|
||||
|
||||
vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
|
||||
q5, q6, q7, q8, q9, q10,
|
||||
&q4, &q5, &q6, &q7, &q8, &q9);
|
||||
|
||||
q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
|
||||
q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
|
||||
q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
|
||||
q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
|
||||
|
||||
q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
|
||||
vreinterpretq_u16_u32(q2tmp2.val[0]));
|
||||
q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
|
||||
vreinterpretq_u16_u32(q2tmp3.val[0]));
|
||||
q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
|
||||
vreinterpretq_u16_u32(q2tmp2.val[1]));
|
||||
q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
|
||||
vreinterpretq_u16_u32(q2tmp3.val[1]));
|
||||
|
||||
q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
|
||||
vreinterpretq_u8_u16(q2tmp5.val[0]));
|
||||
q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
|
||||
vreinterpretq_u8_u16(q2tmp5.val[1]));
|
||||
q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
|
||||
vreinterpretq_u8_u16(q2tmp7.val[0]));
|
||||
q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
|
||||
vreinterpretq_u8_u16(q2tmp7.val[1]));
|
||||
|
||||
q3 = q2tmp8.val[0];
|
||||
q4 = q2tmp8.val[1];
|
||||
q5 = q2tmp9.val[0];
|
||||
q6 = q2tmp9.val[1];
|
||||
q7 = q2tmp10.val[0];
|
||||
q8 = q2tmp10.val[1];
|
||||
q9 = q2tmp11.val[0];
|
||||
q10 = q2tmp11.val[1];
|
||||
|
||||
s1 -= 7 * pitch;
|
||||
s2 -= 7 * pitch;
|
||||
|
||||
vst1_u8(s1, vget_low_u8(q3));
|
||||
s1 += pitch;
|
||||
vst1_u8(s2, vget_high_u8(q3));
|
||||
s2 += pitch;
|
||||
vst1_u8(s1, vget_low_u8(q4));
|
||||
s1 += pitch;
|
||||
vst1_u8(s2, vget_high_u8(q4));
|
||||
s2 += pitch;
|
||||
vst1_u8(s1, vget_low_u8(q5));
|
||||
s1 += pitch;
|
||||
vst1_u8(s2, vget_high_u8(q5));
|
||||
s2 += pitch;
|
||||
vst1_u8(s1, vget_low_u8(q6));
|
||||
s1 += pitch;
|
||||
vst1_u8(s2, vget_high_u8(q6));
|
||||
s2 += pitch;
|
||||
vst1_u8(s1, vget_low_u8(q7));
|
||||
s1 += pitch;
|
||||
vst1_u8(s2, vget_high_u8(q7));
|
||||
s2 += pitch;
|
||||
vst1_u8(s1, vget_low_u8(q8));
|
||||
s1 += pitch;
|
||||
vst1_u8(s2, vget_high_u8(q8));
|
||||
s2 += pitch;
|
||||
vst1_u8(s1, vget_low_u8(q9));
|
||||
s1 += pitch;
|
||||
vst1_u8(s2, vget_high_u8(q9));
|
||||
s2 += pitch;
|
||||
vst1_u8(s1, vget_low_u8(q10));
|
||||
vst1_u8(s2, vget_high_u8(q10));
|
||||
return;
|
||||
}
|
||||
|
||||
void vp8_mbloop_filter_vertical_edge_uv_neon(
|
||||
unsigned char *u,
|
||||
int pitch,
|
||||
unsigned char blimit,
|
||||
unsigned char limit,
|
||||
unsigned char thresh,
|
||||
unsigned char *v) {
|
||||
unsigned char *us, *ud;
|
||||
unsigned char *vs, *vd;
|
||||
uint8x16_t qblimit, qlimit, qthresh, q3, q4;
|
||||
uint8x16_t q5, q6, q7, q8, q9, q10;
|
||||
uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
|
||||
uint8x8_t d15, d16, d17, d18, d19, d20, d21;
|
||||
uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
|
||||
uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
|
||||
uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
|
||||
|
||||
qblimit = vdupq_n_u8(blimit);
|
||||
qlimit = vdupq_n_u8(limit);
|
||||
qthresh = vdupq_n_u8(thresh);
|
||||
|
||||
us = u - 4;
|
||||
vs = v - 4;
|
||||
d6 = vld1_u8(us);
|
||||
us += pitch;
|
||||
d7 = vld1_u8(vs);
|
||||
vs += pitch;
|
||||
d8 = vld1_u8(us);
|
||||
us += pitch;
|
||||
d9 = vld1_u8(vs);
|
||||
vs += pitch;
|
||||
d10 = vld1_u8(us);
|
||||
us += pitch;
|
||||
d11 = vld1_u8(vs);
|
||||
vs += pitch;
|
||||
d12 = vld1_u8(us);
|
||||
us += pitch;
|
||||
d13 = vld1_u8(vs);
|
||||
vs += pitch;
|
||||
d14 = vld1_u8(us);
|
||||
us += pitch;
|
||||
d15 = vld1_u8(vs);
|
||||
vs += pitch;
|
||||
d16 = vld1_u8(us);
|
||||
us += pitch;
|
||||
d17 = vld1_u8(vs);
|
||||
vs += pitch;
|
||||
d18 = vld1_u8(us);
|
||||
us += pitch;
|
||||
d19 = vld1_u8(vs);
|
||||
vs += pitch;
|
||||
d20 = vld1_u8(us);
|
||||
d21 = vld1_u8(vs);
|
||||
|
||||
q3 = vcombine_u8(d6, d7);
|
||||
q4 = vcombine_u8(d8, d9);
|
||||
q5 = vcombine_u8(d10, d11);
|
||||
q6 = vcombine_u8(d12, d13);
|
||||
q7 = vcombine_u8(d14, d15);
|
||||
q8 = vcombine_u8(d16, d17);
|
||||
q9 = vcombine_u8(d18, d19);
|
||||
q10 = vcombine_u8(d20, d21);
|
||||
|
||||
q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
|
||||
q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
|
||||
q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
|
||||
q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
|
||||
|
||||
q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
|
||||
vreinterpretq_u16_u32(q2tmp2.val[0]));
|
||||
q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
|
||||
vreinterpretq_u16_u32(q2tmp3.val[0]));
|
||||
q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
|
||||
vreinterpretq_u16_u32(q2tmp2.val[1]));
|
||||
q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
|
||||
vreinterpretq_u16_u32(q2tmp3.val[1]));
|
||||
|
||||
q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
|
||||
vreinterpretq_u8_u16(q2tmp5.val[0]));
|
||||
q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
|
||||
vreinterpretq_u8_u16(q2tmp5.val[1]));
|
||||
q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
|
||||
vreinterpretq_u8_u16(q2tmp7.val[0]));
|
||||
q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
|
||||
vreinterpretq_u8_u16(q2tmp7.val[1]));
|
||||
|
||||
q3 = q2tmp8.val[0];
|
||||
q4 = q2tmp8.val[1];
|
||||
q5 = q2tmp9.val[0];
|
||||
q6 = q2tmp9.val[1];
|
||||
q7 = q2tmp10.val[0];
|
||||
q8 = q2tmp10.val[1];
|
||||
q9 = q2tmp11.val[0];
|
||||
q10 = q2tmp11.val[1];
|
||||
|
||||
vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
|
||||
q5, q6, q7, q8, q9, q10,
|
||||
&q4, &q5, &q6, &q7, &q8, &q9);
|
||||
|
||||
q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
|
||||
q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
|
||||
q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
|
||||
q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));
|
||||
|
||||
q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
|
||||
vreinterpretq_u16_u32(q2tmp2.val[0]));
|
||||
q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
|
||||
vreinterpretq_u16_u32(q2tmp3.val[0]));
|
||||
q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
|
||||
vreinterpretq_u16_u32(q2tmp2.val[1]));
|
||||
q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
|
||||
vreinterpretq_u16_u32(q2tmp3.val[1]));
|
||||
|
||||
q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
|
||||
vreinterpretq_u8_u16(q2tmp5.val[0]));
|
||||
q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
|
||||
vreinterpretq_u8_u16(q2tmp5.val[1]));
|
||||
q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
|
||||
vreinterpretq_u8_u16(q2tmp7.val[0]));
|
||||
q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
|
||||
vreinterpretq_u8_u16(q2tmp7.val[1]));
|
||||
|
||||
q3 = q2tmp8.val[0];
|
||||
q4 = q2tmp8.val[1];
|
||||
q5 = q2tmp9.val[0];
|
||||
q6 = q2tmp9.val[1];
|
||||
q7 = q2tmp10.val[0];
|
||||
q8 = q2tmp10.val[1];
|
||||
q9 = q2tmp11.val[0];
|
||||
q10 = q2tmp11.val[1];
|
||||
|
||||
ud = u - 4;
|
||||
vst1_u8(ud, vget_low_u8(q3));
|
||||
ud += pitch;
|
||||
vst1_u8(ud, vget_low_u8(q4));
|
||||
ud += pitch;
|
||||
vst1_u8(ud, vget_low_u8(q5));
|
||||
ud += pitch;
|
||||
vst1_u8(ud, vget_low_u8(q6));
|
||||
ud += pitch;
|
||||
vst1_u8(ud, vget_low_u8(q7));
|
||||
ud += pitch;
|
||||
vst1_u8(ud, vget_low_u8(q8));
|
||||
ud += pitch;
|
||||
vst1_u8(ud, vget_low_u8(q9));
|
||||
ud += pitch;
|
||||
vst1_u8(ud, vget_low_u8(q10));
|
||||
|
||||
vd = v - 4;
|
||||
vst1_u8(vd, vget_high_u8(q3));
|
||||
vd += pitch;
|
||||
vst1_u8(vd, vget_high_u8(q4));
|
||||
vd += pitch;
|
||||
vst1_u8(vd, vget_high_u8(q5));
|
||||
vd += pitch;
|
||||
vst1_u8(vd, vget_high_u8(q6));
|
||||
vd += pitch;
|
||||
vst1_u8(vd, vget_high_u8(q7));
|
||||
vd += pitch;
|
||||
vst1_u8(vd, vget_high_u8(q8));
|
||||
vd += pitch;
|
||||
vst1_u8(vd, vget_high_u8(q9));
|
||||
vd += pitch;
|
||||
vst1_u8(vd, vget_high_u8(q10));
|
||||
return;
|
||||
}
|
|
@ -1,210 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
#include "vp8/common/blockd.h"
|
||||
|
||||
void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x,
|
||||
unsigned char * yabove_row,
|
||||
unsigned char * yleft,
|
||||
int left_stride,
|
||||
unsigned char * ypred_ptr,
|
||||
int y_stride) {
|
||||
const int mode = x->mode_info_context->mbmi.mode;
|
||||
int i;
|
||||
|
||||
switch (mode) {
|
||||
case DC_PRED:
|
||||
{
|
||||
int shift = x->up_available + x->left_available;
|
||||
uint8x16_t v_expected_dc = vdupq_n_u8(128);
|
||||
|
||||
if (shift) {
|
||||
unsigned int average = 0;
|
||||
int expected_dc;
|
||||
if (x->up_available) {
|
||||
const uint8x16_t v_above = vld1q_u8(yabove_row);
|
||||
const uint16x8_t a = vpaddlq_u8(v_above);
|
||||
const uint32x4_t b = vpaddlq_u16(a);
|
||||
const uint64x2_t c = vpaddlq_u32(b);
|
||||
const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
|
||||
vreinterpret_u32_u64(vget_high_u64(c)));
|
||||
average = vget_lane_u32(d, 0);
|
||||
}
|
||||
if (x->left_available) {
|
||||
for (i = 0; i < 16; ++i) {
|
||||
average += yleft[0];
|
||||
yleft += left_stride;
|
||||
}
|
||||
}
|
||||
shift += 3;
|
||||
expected_dc = (average + (1 << (shift - 1))) >> shift;
|
||||
v_expected_dc = vmovq_n_u8((uint8_t)expected_dc);
|
||||
}
|
||||
for (i = 0; i < 16; ++i) {
|
||||
vst1q_u8(ypred_ptr, v_expected_dc);
|
||||
ypred_ptr += y_stride;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case V_PRED:
|
||||
{
|
||||
const uint8x16_t v_above = vld1q_u8(yabove_row);
|
||||
for (i = 0; i < 16; ++i) {
|
||||
vst1q_u8(ypred_ptr, v_above);
|
||||
ypred_ptr += y_stride;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case H_PRED:
|
||||
{
|
||||
for (i = 0; i < 16; ++i) {
|
||||
const uint8x16_t v_yleft = vmovq_n_u8((uint8_t)yleft[0]);
|
||||
yleft += left_stride;
|
||||
vst1q_u8(ypred_ptr, v_yleft);
|
||||
ypred_ptr += y_stride;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case TM_PRED:
|
||||
{
|
||||
const uint16x8_t v_ytop_left = vmovq_n_u16((int16_t)yabove_row[-1]);
|
||||
const uint8x16_t v_above = vld1q_u8(yabove_row);
|
||||
for (i = 0; i < 16; ++i) {
|
||||
const uint8x8_t v_yleft = vmov_n_u8((int8_t)yleft[0]);
|
||||
const uint16x8_t a_lo = vaddl_u8(vget_low_u8(v_above), v_yleft);
|
||||
const uint16x8_t a_hi = vaddl_u8(vget_high_u8(v_above), v_yleft);
|
||||
const int16x8_t b_lo = vsubq_s16(vreinterpretq_s16_u16(a_lo),
|
||||
vreinterpretq_s16_u16(v_ytop_left));
|
||||
const int16x8_t b_hi = vsubq_s16(vreinterpretq_s16_u16(a_hi),
|
||||
vreinterpretq_s16_u16(v_ytop_left));
|
||||
const uint8x8_t pred_lo = vqmovun_s16(b_lo);
|
||||
const uint8x8_t pred_hi = vqmovun_s16(b_hi);
|
||||
|
||||
vst1q_u8(ypred_ptr, vcombine_u8(pred_lo, pred_hi));
|
||||
ypred_ptr += y_stride;
|
||||
yleft += left_stride;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_build_intra_predictors_mbuv_s_neon(MACROBLOCKD *x,
|
||||
unsigned char * uabove_row,
|
||||
unsigned char * vabove_row,
|
||||
unsigned char * uleft,
|
||||
unsigned char * vleft,
|
||||
int left_stride,
|
||||
unsigned char * upred_ptr,
|
||||
unsigned char * vpred_ptr,
|
||||
int pred_stride) {
|
||||
const int mode = x->mode_info_context->mbmi.uv_mode;
|
||||
int i;
|
||||
|
||||
switch (mode) {
|
||||
case DC_PRED:
|
||||
{
|
||||
int shift = x->up_available + x->left_available;
|
||||
uint8x8_t v_expected_udc = vdup_n_u8(128);
|
||||
uint8x8_t v_expected_vdc = vdup_n_u8(128);
|
||||
|
||||
if (shift) {
|
||||
unsigned int average_u = 0;
|
||||
unsigned int average_v = 0;
|
||||
int expected_udc;
|
||||
int expected_vdc;
|
||||
if (x->up_available) {
|
||||
const uint8x8_t v_uabove = vld1_u8(uabove_row);
|
||||
const uint8x8_t v_vabove = vld1_u8(vabove_row);
|
||||
const uint16x8_t a = vpaddlq_u8(vcombine_u8(v_uabove, v_vabove));
|
||||
const uint32x4_t b = vpaddlq_u16(a);
|
||||
const uint64x2_t c = vpaddlq_u32(b);
|
||||
average_u = vgetq_lane_u32(vreinterpretq_u32_u64((c)), 0);
|
||||
average_v = vgetq_lane_u32(vreinterpretq_u32_u64((c)), 2);
|
||||
}
|
||||
if (x->left_available) {
|
||||
for (i = 0; i < 8; ++i) {
|
||||
average_u += uleft[0];
|
||||
uleft += left_stride;
|
||||
average_v += vleft[0];
|
||||
vleft += left_stride;
|
||||
}
|
||||
}
|
||||
shift += 2;
|
||||
expected_udc = (average_u + (1 << (shift - 1))) >> shift;
|
||||
expected_vdc = (average_v + (1 << (shift - 1))) >> shift;
|
||||
v_expected_udc = vmov_n_u8((uint8_t)expected_udc);
|
||||
v_expected_vdc = vmov_n_u8((uint8_t)expected_vdc);
|
||||
}
|
||||
for (i = 0; i < 8; ++i) {
|
||||
vst1_u8(upred_ptr, v_expected_udc);
|
||||
upred_ptr += pred_stride;
|
||||
vst1_u8(vpred_ptr, v_expected_vdc);
|
||||
vpred_ptr += pred_stride;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case V_PRED:
|
||||
{
|
||||
const uint8x8_t v_uabove = vld1_u8(uabove_row);
|
||||
const uint8x8_t v_vabove = vld1_u8(vabove_row);
|
||||
for (i = 0; i < 8; ++i) {
|
||||
vst1_u8(upred_ptr, v_uabove);
|
||||
upred_ptr += pred_stride;
|
||||
vst1_u8(vpred_ptr, v_vabove);
|
||||
vpred_ptr += pred_stride;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case H_PRED:
|
||||
{
|
||||
for (i = 0; i < 8; ++i) {
|
||||
const uint8x8_t v_uleft = vmov_n_u8((uint8_t)uleft[0]);
|
||||
const uint8x8_t v_vleft = vmov_n_u8((uint8_t)vleft[0]);
|
||||
uleft += left_stride;
|
||||
vleft += left_stride;
|
||||
vst1_u8(upred_ptr, v_uleft);
|
||||
upred_ptr += pred_stride;
|
||||
vst1_u8(vpred_ptr, v_vleft);
|
||||
vpred_ptr += pred_stride;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case TM_PRED:
|
||||
{
|
||||
const uint16x8_t v_utop_left = vmovq_n_u16((int16_t)uabove_row[-1]);
|
||||
const uint16x8_t v_vtop_left = vmovq_n_u16((int16_t)vabove_row[-1]);
|
||||
const uint8x8_t v_uabove = vld1_u8(uabove_row);
|
||||
const uint8x8_t v_vabove = vld1_u8(vabove_row);
|
||||
for (i = 0; i < 8; ++i) {
|
||||
const uint8x8_t v_uleft = vmov_n_u8((int8_t)uleft[0]);
|
||||
const uint8x8_t v_vleft = vmov_n_u8((int8_t)vleft[0]);
|
||||
const uint16x8_t a_u = vaddl_u8(v_uabove, v_uleft);
|
||||
const uint16x8_t a_v = vaddl_u8(v_vabove, v_vleft);
|
||||
const int16x8_t b_u = vsubq_s16(vreinterpretq_s16_u16(a_u),
|
||||
vreinterpretq_s16_u16(v_utop_left));
|
||||
const int16x8_t b_v = vsubq_s16(vreinterpretq_s16_u16(a_v),
|
||||
vreinterpretq_s16_u16(v_vtop_left));
|
||||
const uint8x8_t pred_u = vqmovun_s16(b_u);
|
||||
const uint8x8_t pred_v = vqmovun_s16(b_v);
|
||||
|
||||
vst1_u8(upred_ptr, pred_u);
|
||||
vst1_u8(vpred_ptr, pred_v);
|
||||
upred_ptr += pred_stride;
|
||||
vpred_ptr += pred_stride;
|
||||
uleft += left_stride;
|
||||
vleft += left_stride;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,207 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_sad16x16_neon|
|
||||
EXPORT |vp8_sad16x8_neon|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; r0 unsigned char *src_ptr
|
||||
; r1 int src_stride
|
||||
; r2 unsigned char *ref_ptr
|
||||
; r3 int ref_stride
|
||||
|vp8_sad16x16_neon| PROC
|
||||
;;
|
||||
vld1.8 {q0}, [r0], r1
|
||||
vld1.8 {q4}, [r2], r3
|
||||
|
||||
vld1.8 {q1}, [r0], r1
|
||||
vld1.8 {q5}, [r2], r3
|
||||
|
||||
vabdl.u8 q12, d0, d8
|
||||
vabdl.u8 q13, d1, d9
|
||||
|
||||
vld1.8 {q2}, [r0], r1
|
||||
vld1.8 {q6}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d2, d10
|
||||
vabal.u8 q13, d3, d11
|
||||
|
||||
vld1.8 {q3}, [r0], r1
|
||||
vld1.8 {q7}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d4, d12
|
||||
vabal.u8 q13, d5, d13
|
||||
|
||||
;;
|
||||
vld1.8 {q0}, [r0], r1
|
||||
vld1.8 {q4}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d6, d14
|
||||
vabal.u8 q13, d7, d15
|
||||
|
||||
vld1.8 {q1}, [r0], r1
|
||||
vld1.8 {q5}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d0, d8
|
||||
vabal.u8 q13, d1, d9
|
||||
|
||||
vld1.8 {q2}, [r0], r1
|
||||
vld1.8 {q6}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d2, d10
|
||||
vabal.u8 q13, d3, d11
|
||||
|
||||
vld1.8 {q3}, [r0], r1
|
||||
vld1.8 {q7}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d4, d12
|
||||
vabal.u8 q13, d5, d13
|
||||
|
||||
;;
|
||||
vld1.8 {q0}, [r0], r1
|
||||
vld1.8 {q4}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d6, d14
|
||||
vabal.u8 q13, d7, d15
|
||||
|
||||
vld1.8 {q1}, [r0], r1
|
||||
vld1.8 {q5}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d0, d8
|
||||
vabal.u8 q13, d1, d9
|
||||
|
||||
vld1.8 {q2}, [r0], r1
|
||||
vld1.8 {q6}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d2, d10
|
||||
vabal.u8 q13, d3, d11
|
||||
|
||||
vld1.8 {q3}, [r0], r1
|
||||
vld1.8 {q7}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d4, d12
|
||||
vabal.u8 q13, d5, d13
|
||||
|
||||
;;
|
||||
vld1.8 {q0}, [r0], r1
|
||||
vld1.8 {q4}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d6, d14
|
||||
vabal.u8 q13, d7, d15
|
||||
|
||||
vld1.8 {q1}, [r0], r1
|
||||
vld1.8 {q5}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d0, d8
|
||||
vabal.u8 q13, d1, d9
|
||||
|
||||
vld1.8 {q2}, [r0], r1
|
||||
vld1.8 {q6}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d2, d10
|
||||
vabal.u8 q13, d3, d11
|
||||
|
||||
vld1.8 {q3}, [r0]
|
||||
vld1.8 {q7}, [r2]
|
||||
|
||||
vabal.u8 q12, d4, d12
|
||||
vabal.u8 q13, d5, d13
|
||||
|
||||
vabal.u8 q12, d6, d14
|
||||
vabal.u8 q13, d7, d15
|
||||
|
||||
vadd.u16 q0, q12, q13
|
||||
|
||||
vpaddl.u16 q1, q0
|
||||
vpaddl.u32 q0, q1
|
||||
|
||||
vadd.u32 d0, d0, d1
|
||||
|
||||
vmov.32 r0, d0[0]
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
;==============================
|
||||
;unsigned int vp8_sad16x8_c(
|
||||
; unsigned char *src_ptr,
|
||||
; int src_stride,
|
||||
; unsigned char *ref_ptr,
|
||||
; int ref_stride)
|
||||
|vp8_sad16x8_neon| PROC
|
||||
vld1.8 {q0}, [r0], r1
|
||||
vld1.8 {q4}, [r2], r3
|
||||
|
||||
vld1.8 {q1}, [r0], r1
|
||||
vld1.8 {q5}, [r2], r3
|
||||
|
||||
vabdl.u8 q12, d0, d8
|
||||
vabdl.u8 q13, d1, d9
|
||||
|
||||
vld1.8 {q2}, [r0], r1
|
||||
vld1.8 {q6}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d2, d10
|
||||
vabal.u8 q13, d3, d11
|
||||
|
||||
vld1.8 {q3}, [r0], r1
|
||||
vld1.8 {q7}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d4, d12
|
||||
vabal.u8 q13, d5, d13
|
||||
|
||||
vld1.8 {q0}, [r0], r1
|
||||
vld1.8 {q4}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d6, d14
|
||||
vabal.u8 q13, d7, d15
|
||||
|
||||
vld1.8 {q1}, [r0], r1
|
||||
vld1.8 {q5}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d0, d8
|
||||
vabal.u8 q13, d1, d9
|
||||
|
||||
vld1.8 {q2}, [r0], r1
|
||||
vld1.8 {q6}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d2, d10
|
||||
vabal.u8 q13, d3, d11
|
||||
|
||||
vld1.8 {q3}, [r0], r1
|
||||
vld1.8 {q7}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d4, d12
|
||||
vabal.u8 q13, d5, d13
|
||||
|
||||
vabal.u8 q12, d6, d14
|
||||
vabal.u8 q13, d7, d15
|
||||
|
||||
vadd.u16 q0, q12, q13
|
||||
|
||||
vpaddl.u16 q1, q0
|
||||
vpaddl.u32 q0, q1
|
||||
|
||||
vadd.u32 d0, d0, d1
|
||||
|
||||
vmov.32 r0, d0[0]
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
END
|
|
@ -0,0 +1,209 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_sad8x8_neon|
|
||||
EXPORT |vp8_sad8x16_neon|
|
||||
EXPORT |vp8_sad4x4_neon|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
; unsigned int vp8_sad8x8_c(
|
||||
; unsigned char *src_ptr,
|
||||
; int src_stride,
|
||||
; unsigned char *ref_ptr,
|
||||
; int ref_stride)
|
||||
|
||||
|vp8_sad8x8_neon| PROC
|
||||
vld1.8 {d0}, [r0], r1
|
||||
vld1.8 {d8}, [r2], r3
|
||||
|
||||
vld1.8 {d2}, [r0], r1
|
||||
vld1.8 {d10}, [r2], r3
|
||||
|
||||
vabdl.u8 q12, d0, d8
|
||||
|
||||
vld1.8 {d4}, [r0], r1
|
||||
vld1.8 {d12}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d2, d10
|
||||
|
||||
vld1.8 {d6}, [r0], r1
|
||||
vld1.8 {d14}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d4, d12
|
||||
|
||||
vld1.8 {d0}, [r0], r1
|
||||
vld1.8 {d8}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d6, d14
|
||||
|
||||
vld1.8 {d2}, [r0], r1
|
||||
vld1.8 {d10}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d0, d8
|
||||
|
||||
vld1.8 {d4}, [r0], r1
|
||||
vld1.8 {d12}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d2, d10
|
||||
|
||||
vld1.8 {d6}, [r0], r1
|
||||
vld1.8 {d14}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d4, d12
|
||||
vabal.u8 q12, d6, d14
|
||||
|
||||
vpaddl.u16 q1, q12
|
||||
vpaddl.u32 q0, q1
|
||||
vadd.u32 d0, d0, d1
|
||||
|
||||
vmov.32 r0, d0[0]
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
;============================
|
||||
;unsigned int vp8_sad8x16_c(
|
||||
; unsigned char *src_ptr,
|
||||
; int src_stride,
|
||||
; unsigned char *ref_ptr,
|
||||
; int ref_stride)
|
||||
|
||||
|vp8_sad8x16_neon| PROC
|
||||
vld1.8 {d0}, [r0], r1
|
||||
vld1.8 {d8}, [r2], r3
|
||||
|
||||
vld1.8 {d2}, [r0], r1
|
||||
vld1.8 {d10}, [r2], r3
|
||||
|
||||
vabdl.u8 q12, d0, d8
|
||||
|
||||
vld1.8 {d4}, [r0], r1
|
||||
vld1.8 {d12}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d2, d10
|
||||
|
||||
vld1.8 {d6}, [r0], r1
|
||||
vld1.8 {d14}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d4, d12
|
||||
|
||||
vld1.8 {d0}, [r0], r1
|
||||
vld1.8 {d8}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d6, d14
|
||||
|
||||
vld1.8 {d2}, [r0], r1
|
||||
vld1.8 {d10}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d0, d8
|
||||
|
||||
vld1.8 {d4}, [r0], r1
|
||||
vld1.8 {d12}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d2, d10
|
||||
|
||||
vld1.8 {d6}, [r0], r1
|
||||
vld1.8 {d14}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d4, d12
|
||||
|
||||
vld1.8 {d0}, [r0], r1
|
||||
vld1.8 {d8}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d6, d14
|
||||
|
||||
vld1.8 {d2}, [r0], r1
|
||||
vld1.8 {d10}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d0, d8
|
||||
|
||||
vld1.8 {d4}, [r0], r1
|
||||
vld1.8 {d12}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d2, d10
|
||||
|
||||
vld1.8 {d6}, [r0], r1
|
||||
vld1.8 {d14}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d4, d12
|
||||
|
||||
vld1.8 {d0}, [r0], r1
|
||||
vld1.8 {d8}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d6, d14
|
||||
|
||||
vld1.8 {d2}, [r0], r1
|
||||
vld1.8 {d10}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d0, d8
|
||||
|
||||
vld1.8 {d4}, [r0], r1
|
||||
vld1.8 {d12}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d2, d10
|
||||
|
||||
vld1.8 {d6}, [r0], r1
|
||||
vld1.8 {d14}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d4, d12
|
||||
vabal.u8 q12, d6, d14
|
||||
|
||||
vpaddl.u16 q1, q12
|
||||
vpaddl.u32 q0, q1
|
||||
vadd.u32 d0, d0, d1
|
||||
|
||||
vmov.32 r0, d0[0]
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
;===========================
|
||||
;unsigned int vp8_sad4x4_c(
|
||||
; unsigned char *src_ptr,
|
||||
; int src_stride,
|
||||
; unsigned char *ref_ptr,
|
||||
; int ref_stride)
|
||||
|
||||
|vp8_sad4x4_neon| PROC
|
||||
vld1.8 {d0}, [r0], r1
|
||||
vld1.8 {d8}, [r2], r3
|
||||
|
||||
vld1.8 {d2}, [r0], r1
|
||||
vld1.8 {d10}, [r2], r3
|
||||
|
||||
vabdl.u8 q12, d0, d8
|
||||
|
||||
vld1.8 {d4}, [r0], r1
|
||||
vld1.8 {d12}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d2, d10
|
||||
|
||||
vld1.8 {d6}, [r0], r1
|
||||
vld1.8 {d14}, [r2], r3
|
||||
|
||||
vabal.u8 q12, d4, d12
|
||||
vabal.u8 q12, d6, d14
|
||||
|
||||
vpaddl.u16 d1, d24
|
||||
vpaddl.u32 d0, d1
|
||||
vmov.32 r0, d0[0]
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
END
|
|
@ -1,184 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
unsigned int vp8_sad8x8_neon(
|
||||
unsigned char *src_ptr,
|
||||
int src_stride,
|
||||
unsigned char *ref_ptr,
|
||||
int ref_stride) {
|
||||
uint8x8_t d0, d8;
|
||||
uint16x8_t q12;
|
||||
uint32x4_t q1;
|
||||
uint64x2_t q3;
|
||||
uint32x2_t d5;
|
||||
int i;
|
||||
|
||||
d0 = vld1_u8(src_ptr);
|
||||
src_ptr += src_stride;
|
||||
d8 = vld1_u8(ref_ptr);
|
||||
ref_ptr += ref_stride;
|
||||
q12 = vabdl_u8(d0, d8);
|
||||
|
||||
for (i = 0; i < 7; i++) {
|
||||
d0 = vld1_u8(src_ptr);
|
||||
src_ptr += src_stride;
|
||||
d8 = vld1_u8(ref_ptr);
|
||||
ref_ptr += ref_stride;
|
||||
q12 = vabal_u8(q12, d0, d8);
|
||||
}
|
||||
|
||||
q1 = vpaddlq_u16(q12);
|
||||
q3 = vpaddlq_u32(q1);
|
||||
d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
|
||||
vreinterpret_u32_u64(vget_high_u64(q3)));
|
||||
|
||||
return vget_lane_u32(d5, 0);
|
||||
}
|
||||
|
||||
unsigned int vp8_sad8x16_neon(
|
||||
unsigned char *src_ptr,
|
||||
int src_stride,
|
||||
unsigned char *ref_ptr,
|
||||
int ref_stride) {
|
||||
uint8x8_t d0, d8;
|
||||
uint16x8_t q12;
|
||||
uint32x4_t q1;
|
||||
uint64x2_t q3;
|
||||
uint32x2_t d5;
|
||||
int i;
|
||||
|
||||
d0 = vld1_u8(src_ptr);
|
||||
src_ptr += src_stride;
|
||||
d8 = vld1_u8(ref_ptr);
|
||||
ref_ptr += ref_stride;
|
||||
q12 = vabdl_u8(d0, d8);
|
||||
|
||||
for (i = 0; i < 15; i++) {
|
||||
d0 = vld1_u8(src_ptr);
|
||||
src_ptr += src_stride;
|
||||
d8 = vld1_u8(ref_ptr);
|
||||
ref_ptr += ref_stride;
|
||||
q12 = vabal_u8(q12, d0, d8);
|
||||
}
|
||||
|
||||
q1 = vpaddlq_u16(q12);
|
||||
q3 = vpaddlq_u32(q1);
|
||||
d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
|
||||
vreinterpret_u32_u64(vget_high_u64(q3)));
|
||||
|
||||
return vget_lane_u32(d5, 0);
|
||||
}
|
||||
|
||||
unsigned int vp8_sad4x4_neon(
|
||||
unsigned char *src_ptr,
|
||||
int src_stride,
|
||||
unsigned char *ref_ptr,
|
||||
int ref_stride) {
|
||||
uint8x8_t d0, d8;
|
||||
uint16x8_t q12;
|
||||
uint32x2_t d1;
|
||||
uint64x1_t d3;
|
||||
int i;
|
||||
|
||||
d0 = vld1_u8(src_ptr);
|
||||
src_ptr += src_stride;
|
||||
d8 = vld1_u8(ref_ptr);
|
||||
ref_ptr += ref_stride;
|
||||
q12 = vabdl_u8(d0, d8);
|
||||
|
||||
for (i = 0; i < 3; i++) {
|
||||
d0 = vld1_u8(src_ptr);
|
||||
src_ptr += src_stride;
|
||||
d8 = vld1_u8(ref_ptr);
|
||||
ref_ptr += ref_stride;
|
||||
q12 = vabal_u8(q12, d0, d8);
|
||||
}
|
||||
|
||||
d1 = vpaddl_u16(vget_low_u16(q12));
|
||||
d3 = vpaddl_u32(d1);
|
||||
|
||||
return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
|
||||
}
|
||||
|
||||
unsigned int vp8_sad16x16_neon(
|
||||
unsigned char *src_ptr,
|
||||
int src_stride,
|
||||
unsigned char *ref_ptr,
|
||||
int ref_stride) {
|
||||
uint8x16_t q0, q4;
|
||||
uint16x8_t q12, q13;
|
||||
uint32x4_t q1;
|
||||
uint64x2_t q3;
|
||||
uint32x2_t d5;
|
||||
int i;
|
||||
|
||||
q0 = vld1q_u8(src_ptr);
|
||||
src_ptr += src_stride;
|
||||
q4 = vld1q_u8(ref_ptr);
|
||||
ref_ptr += ref_stride;
|
||||
q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
|
||||
q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
|
||||
|
||||
for (i = 0; i < 15; i++) {
|
||||
q0 = vld1q_u8(src_ptr);
|
||||
src_ptr += src_stride;
|
||||
q4 = vld1q_u8(ref_ptr);
|
||||
ref_ptr += ref_stride;
|
||||
q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
|
||||
q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
|
||||
}
|
||||
|
||||
q12 = vaddq_u16(q12, q13);
|
||||
q1 = vpaddlq_u16(q12);
|
||||
q3 = vpaddlq_u32(q1);
|
||||
d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
|
||||
vreinterpret_u32_u64(vget_high_u64(q3)));
|
||||
|
||||
return vget_lane_u32(d5, 0);
|
||||
}
|
||||
|
||||
unsigned int vp8_sad16x8_neon(
|
||||
unsigned char *src_ptr,
|
||||
int src_stride,
|
||||
unsigned char *ref_ptr,
|
||||
int ref_stride) {
|
||||
uint8x16_t q0, q4;
|
||||
uint16x8_t q12, q13;
|
||||
uint32x4_t q1;
|
||||
uint64x2_t q3;
|
||||
uint32x2_t d5;
|
||||
int i;
|
||||
|
||||
q0 = vld1q_u8(src_ptr);
|
||||
src_ptr += src_stride;
|
||||
q4 = vld1q_u8(ref_ptr);
|
||||
ref_ptr += ref_stride;
|
||||
q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
|
||||
q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
|
||||
|
||||
for (i = 0; i < 7; i++) {
|
||||
q0 = vld1q_u8(src_ptr);
|
||||
src_ptr += src_stride;
|
||||
q4 = vld1q_u8(ref_ptr);
|
||||
ref_ptr += ref_stride;
|
||||
q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
|
||||
q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
|
||||
}
|
||||
|
||||
q12 = vaddq_u16(q12, q13);
|
||||
q1 = vpaddlq_u16(q12);
|
||||
q3 = vpaddlq_u32(q1);
|
||||
d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
|
||||
vreinterpret_u32_u64(vget_high_u64(q3)));
|
||||
|
||||
return vget_lane_u32(d5, 0);
|
||||
}
|
|
@ -0,0 +1,36 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_push_neon|
|
||||
EXPORT |vp8_pop_neon|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
|vp8_push_neon| PROC
|
||||
vst1.i64 {d8, d9, d10, d11}, [r0]!
|
||||
vst1.i64 {d12, d13, d14, d15}, [r0]!
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
|vp8_pop_neon| PROC
|
||||
vld1.i64 {d8, d9, d10, d11}, [r0]!
|
||||
vld1.i64 {d12, d13, d14, d15}, [r0]!
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
END
|
||||
|
|
@ -0,0 +1,139 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_short_idct4x4llm_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
;*************************************************************
|
||||
;void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch,
|
||||
; unsigned char *dst, int stride)
|
||||
;r0 short * input
|
||||
;r1 short * pred
|
||||
;r2 int pitch
|
||||
;r3 unsigned char dst
|
||||
;sp int stride
|
||||
;*************************************************************
|
||||
|
||||
; static const int cospi8sqrt2minus1=20091;
|
||||
; static const int sinpi8sqrt2 =35468;
|
||||
; static const int rounding = 0;
|
||||
|
||||
; Optimization note: The resulted data from dequantization are signed
|
||||
; 13-bit data that is in the range of [-4096, 4095]. This allows to
|
||||
; use "vqdmulh"(neon) instruction since it won't go out of range
|
||||
; (13+16+1=30bits<32bits). This instruction gives the high half
|
||||
; result of the multiplication that is needed in IDCT.
|
||||
|
||||
|vp8_short_idct4x4llm_neon| PROC
|
||||
adr r12, idct_coeff
|
||||
vld1.16 {q1, q2}, [r0]
|
||||
vld1.16 {d0}, [r12]
|
||||
|
||||
vswp d3, d4 ;q2(vp[4] vp[12])
|
||||
ldr r0, [sp] ; stride
|
||||
|
||||
vqdmulh.s16 q3, q2, d0[2]
|
||||
vqdmulh.s16 q4, q2, d0[0]
|
||||
|
||||
vqadd.s16 d12, d2, d3 ;a1
|
||||
vqsub.s16 d13, d2, d3 ;b1
|
||||
|
||||
vshr.s16 q3, q3, #1
|
||||
vshr.s16 q4, q4, #1
|
||||
|
||||
vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negtive number)
|
||||
vqadd.s16 q4, q4, q2
|
||||
|
||||
;d6 - c1:temp1
|
||||
;d7 - d1:temp2
|
||||
;d8 - d1:temp1
|
||||
;d9 - c1:temp2
|
||||
|
||||
vqsub.s16 d10, d6, d9 ;c1
|
||||
vqadd.s16 d11, d7, d8 ;d1
|
||||
|
||||
vqadd.s16 d2, d12, d11
|
||||
vqadd.s16 d3, d13, d10
|
||||
vqsub.s16 d4, d13, d10
|
||||
vqsub.s16 d5, d12, d11
|
||||
|
||||
vtrn.32 d2, d4
|
||||
vtrn.32 d3, d5
|
||||
vtrn.16 d2, d3
|
||||
vtrn.16 d4, d5
|
||||
|
||||
vswp d3, d4
|
||||
|
||||
vqdmulh.s16 q3, q2, d0[2]
|
||||
vqdmulh.s16 q4, q2, d0[0]
|
||||
|
||||
vqadd.s16 d12, d2, d3 ;a1
|
||||
vqsub.s16 d13, d2, d3 ;b1
|
||||
|
||||
vshr.s16 q3, q3, #1
|
||||
vshr.s16 q4, q4, #1
|
||||
|
||||
vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negtive number)
|
||||
vqadd.s16 q4, q4, q2
|
||||
|
||||
vqsub.s16 d10, d6, d9 ;c1
|
||||
vqadd.s16 d11, d7, d8 ;d1
|
||||
|
||||
vqadd.s16 d2, d12, d11
|
||||
vqadd.s16 d3, d13, d10
|
||||
vqsub.s16 d4, d13, d10
|
||||
vqsub.s16 d5, d12, d11
|
||||
|
||||
vrshr.s16 d2, d2, #3
|
||||
vrshr.s16 d3, d3, #3
|
||||
vrshr.s16 d4, d4, #3
|
||||
vrshr.s16 d5, d5, #3
|
||||
|
||||
vtrn.32 d2, d4
|
||||
vtrn.32 d3, d5
|
||||
vtrn.16 d2, d3
|
||||
vtrn.16 d4, d5
|
||||
|
||||
; load prediction data
|
||||
vld1.32 d6[0], [r1], r2
|
||||
vld1.32 d6[1], [r1], r2
|
||||
vld1.32 d7[0], [r1], r2
|
||||
vld1.32 d7[1], [r1], r2
|
||||
|
||||
; add prediction and residual
|
||||
vaddw.u8 q1, q1, d6
|
||||
vaddw.u8 q2, q2, d7
|
||||
|
||||
vqmovun.s16 d1, q1
|
||||
vqmovun.s16 d2, q2
|
||||
|
||||
; store to destination
|
||||
vst1.32 d1[0], [r3], r0
|
||||
vst1.32 d1[1], [r3], r0
|
||||
vst1.32 d2[0], [r3], r0
|
||||
vst1.32 d2[1], [r3], r0
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
;-----------------
|
||||
|
||||
idct_coeff
|
||||
DCD 0x4e7b4e7b, 0x8a8c8a8c
|
||||
|
||||
;20091, 20091, 35468, 35468
|
||||
|
||||
END
|
|
@ -1,123 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
static const int16_t cospi8sqrt2minus1 = 20091;
|
||||
static const int16_t sinpi8sqrt2 = 35468;
|
||||
|
||||
void vp8_short_idct4x4llm_neon(
|
||||
int16_t *input,
|
||||
unsigned char *pred_ptr,
|
||||
int pred_stride,
|
||||
unsigned char *dst_ptr,
|
||||
int dst_stride) {
|
||||
int i;
|
||||
uint32x2_t d6u32 = vdup_n_u32(0);
|
||||
uint8x8_t d1u8;
|
||||
int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
|
||||
uint16x8_t q1u16;
|
||||
int16x8_t q1s16, q2s16, q3s16, q4s16;
|
||||
int32x2x2_t v2tmp0, v2tmp1;
|
||||
int16x4x2_t v2tmp2, v2tmp3;
|
||||
|
||||
d2 = vld1_s16(input);
|
||||
d3 = vld1_s16(input + 4);
|
||||
d4 = vld1_s16(input + 8);
|
||||
d5 = vld1_s16(input + 12);
|
||||
|
||||
// 1st for loop
|
||||
q1s16 = vcombine_s16(d2, d4); // Swap d3 d4 here
|
||||
q2s16 = vcombine_s16(d3, d5);
|
||||
|
||||
q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
|
||||
q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);
|
||||
|
||||
d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1
|
||||
d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1
|
||||
|
||||
q3s16 = vshrq_n_s16(q3s16, 1);
|
||||
q4s16 = vshrq_n_s16(q4s16, 1);
|
||||
|
||||
q3s16 = vqaddq_s16(q3s16, q2s16);
|
||||
q4s16 = vqaddq_s16(q4s16, q2s16);
|
||||
|
||||
d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1
|
||||
d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1
|
||||
|
||||
d2 = vqadd_s16(d12, d11);
|
||||
d3 = vqadd_s16(d13, d10);
|
||||
d4 = vqsub_s16(d13, d10);
|
||||
d5 = vqsub_s16(d12, d11);
|
||||
|
||||
v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
|
||||
v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
|
||||
v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
|
||||
vreinterpret_s16_s32(v2tmp1.val[0]));
|
||||
v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
|
||||
vreinterpret_s16_s32(v2tmp1.val[1]));
|
||||
|
||||
// 2nd for loop
|
||||
q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp3.val[0]);
|
||||
q2s16 = vcombine_s16(v2tmp2.val[1], v2tmp3.val[1]);
|
||||
|
||||
q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
|
||||
q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);
|
||||
|
||||
d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1
|
||||
d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1
|
||||
|
||||
q3s16 = vshrq_n_s16(q3s16, 1);
|
||||
q4s16 = vshrq_n_s16(q4s16, 1);
|
||||
|
||||
q3s16 = vqaddq_s16(q3s16, q2s16);
|
||||
q4s16 = vqaddq_s16(q4s16, q2s16);
|
||||
|
||||
d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1
|
||||
d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1
|
||||
|
||||
d2 = vqadd_s16(d12, d11);
|
||||
d3 = vqadd_s16(d13, d10);
|
||||
d4 = vqsub_s16(d13, d10);
|
||||
d5 = vqsub_s16(d12, d11);
|
||||
|
||||
d2 = vrshr_n_s16(d2, 3);
|
||||
d3 = vrshr_n_s16(d3, 3);
|
||||
d4 = vrshr_n_s16(d4, 3);
|
||||
d5 = vrshr_n_s16(d5, 3);
|
||||
|
||||
v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
|
||||
v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
|
||||
v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
|
||||
vreinterpret_s16_s32(v2tmp1.val[0]));
|
||||
v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
|
||||
vreinterpret_s16_s32(v2tmp1.val[1]));
|
||||
|
||||
q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp2.val[1]);
|
||||
q2s16 = vcombine_s16(v2tmp3.val[0], v2tmp3.val[1]);
|
||||
|
||||
// dc_only_idct_add
|
||||
for (i = 0; i < 2; i++, q1s16 = q2s16) {
|
||||
d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 0);
|
||||
pred_ptr += pred_stride;
|
||||
d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 1);
|
||||
pred_ptr += pred_stride;
|
||||
|
||||
q1u16 = vaddw_u8(vreinterpretq_u16_s16(q1s16),
|
||||
vreinterpret_u8_u32(d6u32));
|
||||
d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
|
||||
|
||||
vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 0);
|
||||
dst_ptr += dst_stride;
|
||||
vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 1);
|
||||
dst_ptr += dst_stride;
|
||||
}
|
||||
return;
|
||||
}
|
|
@ -0,0 +1,490 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_sixtap_predict16x16_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
filter16_coeff
|
||||
DCD 0, 0, 128, 0, 0, 0, 0, 0
|
||||
DCD 0, -6, 123, 12, -1, 0, 0, 0
|
||||
DCD 2, -11, 108, 36, -8, 1, 0, 0
|
||||
DCD 0, -9, 93, 50, -6, 0, 0, 0
|
||||
DCD 3, -16, 77, 77, -16, 3, 0, 0
|
||||
DCD 0, -6, 50, 93, -9, 0, 0, 0
|
||||
DCD 1, -8, 36, 108, -11, 2, 0, 0
|
||||
DCD 0, -1, 12, 123, -6, 0, 0, 0
|
||||
|
||||
; r0 unsigned char *src_ptr,
|
||||
; r1 int src_pixels_per_line,
|
||||
; r2 int xoffset,
|
||||
; r3 int yoffset,
|
||||
; r4 unsigned char *dst_ptr,
|
||||
; stack(r5) int dst_pitch
|
||||
|
||||
;Note: To take advantage of 8-bit mulplication instruction in NEON. First apply abs() to
|
||||
; filter coeffs to make them u8. Then, use vmlsl for negtive coeffs. After multiplication,
|
||||
; the result can be negtive. So, I treat the result as s16. But, since it is also possible
|
||||
; that the result can be a large positive number (> 2^15-1), which could be confused as a
|
||||
; negtive number. To avoid that error, apply filter coeffs in the order of 0, 1, 4 ,5 ,2,
|
||||
; which ensures that the result stays in s16 range. Finally, saturated add the result by
|
||||
; applying 3rd filter coeff. Same applys to other filter functions.
|
||||
|
||||
|vp8_sixtap_predict16x16_neon| PROC
|
||||
push {r4-r5, lr}
|
||||
|
||||
adr r12, filter16_coeff
|
||||
ldr r4, [sp, #12] ;load parameters from stack
|
||||
ldr r5, [sp, #16] ;load parameters from stack
|
||||
|
||||
cmp r2, #0 ;skip first_pass filter if xoffset=0
|
||||
beq secondpass_filter16x16_only
|
||||
|
||||
add r2, r12, r2, lsl #5 ;calculate filter location
|
||||
|
||||
cmp r3, #0 ;skip second_pass filter if yoffset=0
|
||||
|
||||
vld1.s32 {q14, q15}, [r2] ;load first_pass filter
|
||||
|
||||
beq firstpass_filter16x16_only
|
||||
|
||||
sub sp, sp, #336 ;reserve space on stack for temporary storage
|
||||
mov lr, sp
|
||||
|
||||
vabs.s32 q12, q14
|
||||
vabs.s32 q13, q15
|
||||
|
||||
mov r2, #7 ;loop counter
|
||||
sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
|
||||
sub r0, r0, r1, lsl #1
|
||||
|
||||
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
|
||||
vdup.8 d1, d24[4]
|
||||
vdup.8 d2, d25[0]
|
||||
vdup.8 d3, d25[4]
|
||||
vdup.8 d4, d26[0]
|
||||
vdup.8 d5, d26[4]
|
||||
|
||||
;First Pass: output_height lines x output_width columns (21x16)
|
||||
filt_blk2d_fp16x16_loop_neon
|
||||
vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data
|
||||
vld1.u8 {d9, d10, d11}, [r0], r1
|
||||
vld1.u8 {d12, d13, d14}, [r0], r1
|
||||
|
||||
pld [r0]
|
||||
pld [r0, r1]
|
||||
pld [r0, r1, lsl #1]
|
||||
|
||||
vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
|
||||
vmull.u8 q9, d7, d0
|
||||
vmull.u8 q10, d9, d0
|
||||
vmull.u8 q11, d10, d0
|
||||
vmull.u8 q12, d12, d0
|
||||
vmull.u8 q13, d13, d0
|
||||
|
||||
vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
|
||||
vext.8 d29, d9, d10, #1
|
||||
vext.8 d30, d12, d13, #1
|
||||
|
||||
vmlsl.u8 q8, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
|
||||
vmlsl.u8 q10, d29, d1
|
||||
vmlsl.u8 q12, d30, d1
|
||||
|
||||
vext.8 d28, d7, d8, #1
|
||||
vext.8 d29, d10, d11, #1
|
||||
vext.8 d30, d13, d14, #1
|
||||
|
||||
vmlsl.u8 q9, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
|
||||
vmlsl.u8 q11, d29, d1
|
||||
vmlsl.u8 q13, d30, d1
|
||||
|
||||
vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
|
||||
vext.8 d29, d9, d10, #4
|
||||
vext.8 d30, d12, d13, #4
|
||||
|
||||
vmlsl.u8 q8, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
|
||||
vmlsl.u8 q10, d29, d4
|
||||
vmlsl.u8 q12, d30, d4
|
||||
|
||||
vext.8 d28, d7, d8, #4
|
||||
vext.8 d29, d10, d11, #4
|
||||
vext.8 d30, d13, d14, #4
|
||||
|
||||
vmlsl.u8 q9, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
|
||||
vmlsl.u8 q11, d29, d4
|
||||
vmlsl.u8 q13, d30, d4
|
||||
|
||||
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
|
||||
vext.8 d29, d9, d10, #5
|
||||
vext.8 d30, d12, d13, #5
|
||||
|
||||
vmlal.u8 q8, d28, d5 ;(src_ptr[3] * vp8_filter[5])
|
||||
vmlal.u8 q10, d29, d5
|
||||
vmlal.u8 q12, d30, d5
|
||||
|
||||
vext.8 d28, d7, d8, #5
|
||||
vext.8 d29, d10, d11, #5
|
||||
vext.8 d30, d13, d14, #5
|
||||
|
||||
vmlal.u8 q9, d28, d5 ;(src_ptr[3] * vp8_filter[5])
|
||||
vmlal.u8 q11, d29, d5
|
||||
vmlal.u8 q13, d30, d5
|
||||
|
||||
vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
|
||||
vext.8 d29, d9, d10, #2
|
||||
vext.8 d30, d12, d13, #2
|
||||
|
||||
vmlal.u8 q8, d28, d2 ;(src_ptr[0] * vp8_filter[2])
|
||||
vmlal.u8 q10, d29, d2
|
||||
vmlal.u8 q12, d30, d2
|
||||
|
||||
vext.8 d28, d7, d8, #2
|
||||
vext.8 d29, d10, d11, #2
|
||||
vext.8 d30, d13, d14, #2
|
||||
|
||||
vmlal.u8 q9, d28, d2 ;(src_ptr[0] * vp8_filter[2])
|
||||
vmlal.u8 q11, d29, d2
|
||||
vmlal.u8 q13, d30, d2
|
||||
|
||||
vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
|
||||
vext.8 d29, d9, d10, #3
|
||||
vext.8 d30, d12, d13, #3
|
||||
|
||||
vext.8 d15, d7, d8, #3
|
||||
vext.8 d31, d10, d11, #3
|
||||
vext.8 d6, d13, d14, #3
|
||||
|
||||
vmull.u8 q4, d28, d3 ;(src_ptr[1] * vp8_filter[3])
|
||||
vmull.u8 q5, d29, d3
|
||||
vmull.u8 q6, d30, d3
|
||||
|
||||
vqadd.s16 q8, q4 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q10, q5
|
||||
vqadd.s16 q12, q6
|
||||
|
||||
vmull.u8 q6, d15, d3 ;(src_ptr[1] * vp8_filter[3])
|
||||
vmull.u8 q7, d31, d3
|
||||
vmull.u8 q3, d6, d3
|
||||
|
||||
subs r2, r2, #1
|
||||
|
||||
vqadd.s16 q9, q6
|
||||
vqadd.s16 q11, q7
|
||||
vqadd.s16 q13, q3
|
||||
|
||||
vqrshrun.s16 d6, q8, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d7, q9, #7
|
||||
vqrshrun.s16 d8, q10, #7
|
||||
vqrshrun.s16 d9, q11, #7
|
||||
vqrshrun.s16 d10, q12, #7
|
||||
vqrshrun.s16 d11, q13, #7
|
||||
|
||||
vst1.u8 {d6, d7, d8}, [lr]! ;store result
|
||||
vst1.u8 {d9, d10, d11}, [lr]!
|
||||
|
||||
bne filt_blk2d_fp16x16_loop_neon
|
||||
|
||||
;Second pass: 16x16
|
||||
;secondpass_filter - do first 8-columns and then second 8-columns
|
||||
add r3, r12, r3, lsl #5
|
||||
sub lr, lr, #336
|
||||
|
||||
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
|
||||
mov r3, #2 ;loop counter
|
||||
|
||||
vabs.s32 q7, q5
|
||||
vabs.s32 q8, q6
|
||||
|
||||
mov r2, #16
|
||||
|
||||
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
|
||||
vdup.8 d1, d14[4]
|
||||
vdup.8 d2, d15[0]
|
||||
vdup.8 d3, d15[4]
|
||||
vdup.8 d4, d16[0]
|
||||
vdup.8 d5, d16[4]
|
||||
|
||||
filt_blk2d_sp16x16_outloop_neon
|
||||
vld1.u8 {d18}, [lr], r2 ;load src data
|
||||
vld1.u8 {d19}, [lr], r2
|
||||
vld1.u8 {d20}, [lr], r2
|
||||
vld1.u8 {d21}, [lr], r2
|
||||
mov r12, #4 ;loop counter
|
||||
vld1.u8 {d22}, [lr], r2
|
||||
|
||||
secondpass_inner_loop_neon
|
||||
vld1.u8 {d23}, [lr], r2 ;load src data
|
||||
vld1.u8 {d24}, [lr], r2
|
||||
vld1.u8 {d25}, [lr], r2
|
||||
vld1.u8 {d26}, [lr], r2
|
||||
|
||||
vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0])
|
||||
vmull.u8 q4, d19, d0
|
||||
vmull.u8 q5, d20, d0
|
||||
vmull.u8 q6, d21, d0
|
||||
|
||||
vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1])
|
||||
vmlsl.u8 q4, d20, d1
|
||||
vmlsl.u8 q5, d21, d1
|
||||
vmlsl.u8 q6, d22, d1
|
||||
|
||||
vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4])
|
||||
vmlsl.u8 q4, d23, d4
|
||||
vmlsl.u8 q5, d24, d4
|
||||
vmlsl.u8 q6, d25, d4
|
||||
|
||||
vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2])
|
||||
vmlal.u8 q4, d21, d2
|
||||
vmlal.u8 q5, d22, d2
|
||||
vmlal.u8 q6, d23, d2
|
||||
|
||||
vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5])
|
||||
vmlal.u8 q4, d24, d5
|
||||
vmlal.u8 q5, d25, d5
|
||||
vmlal.u8 q6, d26, d5
|
||||
|
||||
vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3])
|
||||
vmull.u8 q8, d22, d3
|
||||
vmull.u8 q9, d23, d3
|
||||
vmull.u8 q10, d24, d3
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q8, q4
|
||||
vqadd.s16 q9, q5
|
||||
vqadd.s16 q10, q6
|
||||
|
||||
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d7, q8, #7
|
||||
vqrshrun.s16 d8, q9, #7
|
||||
vqrshrun.s16 d9, q10, #7
|
||||
|
||||
vst1.u8 {d6}, [r4], r5 ;store result
|
||||
vmov q9, q11
|
||||
vst1.u8 {d7}, [r4], r5
|
||||
vmov q10, q12
|
||||
vst1.u8 {d8}, [r4], r5
|
||||
vmov d22, d26
|
||||
vst1.u8 {d9}, [r4], r5
|
||||
|
||||
bne secondpass_inner_loop_neon
|
||||
|
||||
subs r3, r3, #1
|
||||
sub lr, lr, #336
|
||||
add lr, lr, #8
|
||||
|
||||
sub r4, r4, r5, lsl #4
|
||||
add r4, r4, #8
|
||||
|
||||
bne filt_blk2d_sp16x16_outloop_neon
|
||||
|
||||
add sp, sp, #336
|
||||
pop {r4-r5,pc}
|
||||
|
||||
;--------------------
|
||||
firstpass_filter16x16_only
|
||||
vabs.s32 q12, q14
|
||||
vabs.s32 q13, q15
|
||||
|
||||
mov r2, #8 ;loop counter
|
||||
sub r0, r0, #2 ;move srcptr back to (column-2)
|
||||
|
||||
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
|
||||
vdup.8 d1, d24[4]
|
||||
vdup.8 d2, d25[0]
|
||||
vdup.8 d3, d25[4]
|
||||
vdup.8 d4, d26[0]
|
||||
vdup.8 d5, d26[4]
|
||||
|
||||
;First Pass: output_height lines x output_width columns (16x16)
|
||||
filt_blk2d_fpo16x16_loop_neon
|
||||
vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data
|
||||
vld1.u8 {d9, d10, d11}, [r0], r1
|
||||
|
||||
pld [r0]
|
||||
pld [r0, r1]
|
||||
|
||||
vmull.u8 q6, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
|
||||
vmull.u8 q7, d7, d0
|
||||
vmull.u8 q8, d9, d0
|
||||
vmull.u8 q9, d10, d0
|
||||
|
||||
vext.8 d20, d6, d7, #1 ;construct src_ptr[-1]
|
||||
vext.8 d21, d9, d10, #1
|
||||
vext.8 d22, d7, d8, #1
|
||||
vext.8 d23, d10, d11, #1
|
||||
vext.8 d24, d6, d7, #4 ;construct src_ptr[2]
|
||||
vext.8 d25, d9, d10, #4
|
||||
vext.8 d26, d7, d8, #4
|
||||
vext.8 d27, d10, d11, #4
|
||||
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
|
||||
vext.8 d29, d9, d10, #5
|
||||
|
||||
vmlsl.u8 q6, d20, d1 ;-(src_ptr[-1] * vp8_filter[1])
|
||||
vmlsl.u8 q8, d21, d1
|
||||
vmlsl.u8 q7, d22, d1 ;-(src_ptr[-1] * vp8_filter[1])
|
||||
vmlsl.u8 q9, d23, d1
|
||||
vmlsl.u8 q6, d24, d4 ;-(src_ptr[2] * vp8_filter[4])
|
||||
vmlsl.u8 q8, d25, d4
|
||||
vmlsl.u8 q7, d26, d4 ;-(src_ptr[2] * vp8_filter[4])
|
||||
vmlsl.u8 q9, d27, d4
|
||||
vmlal.u8 q6, d28, d5 ;(src_ptr[3] * vp8_filter[5])
|
||||
vmlal.u8 q8, d29, d5
|
||||
|
||||
vext.8 d20, d7, d8, #5
|
||||
vext.8 d21, d10, d11, #5
|
||||
vext.8 d22, d6, d7, #2 ;construct src_ptr[0]
|
||||
vext.8 d23, d9, d10, #2
|
||||
vext.8 d24, d7, d8, #2
|
||||
vext.8 d25, d10, d11, #2
|
||||
|
||||
vext.8 d26, d6, d7, #3 ;construct src_ptr[1]
|
||||
vext.8 d27, d9, d10, #3
|
||||
vext.8 d28, d7, d8, #3
|
||||
vext.8 d29, d10, d11, #3
|
||||
|
||||
vmlal.u8 q7, d20, d5 ;(src_ptr[3] * vp8_filter[5])
|
||||
vmlal.u8 q9, d21, d5
|
||||
vmlal.u8 q6, d22, d2 ;(src_ptr[0] * vp8_filter[2])
|
||||
vmlal.u8 q8, d23, d2
|
||||
vmlal.u8 q7, d24, d2 ;(src_ptr[0] * vp8_filter[2])
|
||||
vmlal.u8 q9, d25, d2
|
||||
|
||||
vmull.u8 q10, d26, d3 ;(src_ptr[1] * vp8_filter[3])
|
||||
vmull.u8 q11, d27, d3
|
||||
vmull.u8 q12, d28, d3 ;(src_ptr[1] * vp8_filter[3])
|
||||
vmull.u8 q15, d29, d3
|
||||
|
||||
vqadd.s16 q6, q10 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q8, q11
|
||||
vqadd.s16 q7, q12
|
||||
vqadd.s16 q9, q15
|
||||
|
||||
subs r2, r2, #1
|
||||
|
||||
vqrshrun.s16 d6, q6, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d7, q7, #7
|
||||
vqrshrun.s16 d8, q8, #7
|
||||
vqrshrun.s16 d9, q9, #7
|
||||
|
||||
vst1.u8 {q3}, [r4], r5 ;store result
|
||||
vst1.u8 {q4}, [r4], r5
|
||||
|
||||
bne filt_blk2d_fpo16x16_loop_neon
|
||||
|
||||
pop {r4-r5,pc}
|
||||
|
||||
;--------------------
|
||||
secondpass_filter16x16_only
|
||||
;Second pass: 16x16
|
||||
add r3, r12, r3, lsl #5
|
||||
sub r0, r0, r1, lsl #1
|
||||
|
||||
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
|
||||
mov r3, #2 ;loop counter
|
||||
|
||||
vabs.s32 q7, q5
|
||||
vabs.s32 q8, q6
|
||||
|
||||
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
|
||||
vdup.8 d1, d14[4]
|
||||
vdup.8 d2, d15[0]
|
||||
vdup.8 d3, d15[4]
|
||||
vdup.8 d4, d16[0]
|
||||
vdup.8 d5, d16[4]
|
||||
|
||||
filt_blk2d_spo16x16_outloop_neon
|
||||
vld1.u8 {d18}, [r0], r1 ;load src data
|
||||
vld1.u8 {d19}, [r0], r1
|
||||
vld1.u8 {d20}, [r0], r1
|
||||
vld1.u8 {d21}, [r0], r1
|
||||
mov r12, #4 ;loop counter
|
||||
vld1.u8 {d22}, [r0], r1
|
||||
|
||||
secondpass_only_inner_loop_neon
|
||||
vld1.u8 {d23}, [r0], r1 ;load src data
|
||||
vld1.u8 {d24}, [r0], r1
|
||||
vld1.u8 {d25}, [r0], r1
|
||||
vld1.u8 {d26}, [r0], r1
|
||||
|
||||
vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0])
|
||||
vmull.u8 q4, d19, d0
|
||||
vmull.u8 q5, d20, d0
|
||||
vmull.u8 q6, d21, d0
|
||||
|
||||
vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1])
|
||||
vmlsl.u8 q4, d20, d1
|
||||
vmlsl.u8 q5, d21, d1
|
||||
vmlsl.u8 q6, d22, d1
|
||||
|
||||
vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4])
|
||||
vmlsl.u8 q4, d23, d4
|
||||
vmlsl.u8 q5, d24, d4
|
||||
vmlsl.u8 q6, d25, d4
|
||||
|
||||
vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2])
|
||||
vmlal.u8 q4, d21, d2
|
||||
vmlal.u8 q5, d22, d2
|
||||
vmlal.u8 q6, d23, d2
|
||||
|
||||
vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5])
|
||||
vmlal.u8 q4, d24, d5
|
||||
vmlal.u8 q5, d25, d5
|
||||
vmlal.u8 q6, d26, d5
|
||||
|
||||
vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3])
|
||||
vmull.u8 q8, d22, d3
|
||||
vmull.u8 q9, d23, d3
|
||||
vmull.u8 q10, d24, d3
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q8, q4
|
||||
vqadd.s16 q9, q5
|
||||
vqadd.s16 q10, q6
|
||||
|
||||
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d7, q8, #7
|
||||
vqrshrun.s16 d8, q9, #7
|
||||
vqrshrun.s16 d9, q10, #7
|
||||
|
||||
vst1.u8 {d6}, [r4], r5 ;store result
|
||||
vmov q9, q11
|
||||
vst1.u8 {d7}, [r4], r5
|
||||
vmov q10, q12
|
||||
vst1.u8 {d8}, [r4], r5
|
||||
vmov d22, d26
|
||||
vst1.u8 {d9}, [r4], r5
|
||||
|
||||
bne secondpass_only_inner_loop_neon
|
||||
|
||||
subs r3, r3, #1
|
||||
sub r0, r0, r1, lsl #4
|
||||
sub r0, r0, r1, lsl #2
|
||||
sub r0, r0, r1
|
||||
add r0, r0, #8
|
||||
|
||||
sub r4, r4, r5, lsl #4
|
||||
add r4, r4, #8
|
||||
|
||||
bne filt_blk2d_spo16x16_outloop_neon
|
||||
|
||||
pop {r4-r5,pc}
|
||||
|
||||
ENDP
|
||||
|
||||
;-----------------
|
||||
END
|
|
@ -0,0 +1,422 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_sixtap_predict4x4_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
filter4_coeff
|
||||
DCD 0, 0, 128, 0, 0, 0, 0, 0
|
||||
DCD 0, -6, 123, 12, -1, 0, 0, 0
|
||||
DCD 2, -11, 108, 36, -8, 1, 0, 0
|
||||
DCD 0, -9, 93, 50, -6, 0, 0, 0
|
||||
DCD 3, -16, 77, 77, -16, 3, 0, 0
|
||||
DCD 0, -6, 50, 93, -9, 0, 0, 0
|
||||
DCD 1, -8, 36, 108, -11, 2, 0, 0
|
||||
DCD 0, -1, 12, 123, -6, 0, 0, 0
|
||||
|
||||
; r0 unsigned char *src_ptr,
|
||||
; r1 int src_pixels_per_line,
|
||||
; r2 int xoffset,
|
||||
; r3 int yoffset,
|
||||
; stack(r4) unsigned char *dst_ptr,
|
||||
; stack(lr) int dst_pitch
|
||||
|
||||
|vp8_sixtap_predict4x4_neon| PROC
|
||||
push {r4, lr}
|
||||
|
||||
adr r12, filter4_coeff
|
||||
ldr r4, [sp, #8] ;load parameters from stack
|
||||
ldr lr, [sp, #12] ;load parameters from stack
|
||||
|
||||
cmp r2, #0 ;skip first_pass filter if xoffset=0
|
||||
beq secondpass_filter4x4_only
|
||||
|
||||
add r2, r12, r2, lsl #5 ;calculate filter location
|
||||
|
||||
cmp r3, #0 ;skip second_pass filter if yoffset=0
|
||||
vld1.s32 {q14, q15}, [r2] ;load first_pass filter
|
||||
|
||||
beq firstpass_filter4x4_only
|
||||
|
||||
vabs.s32 q12, q14 ;get abs(filer_parameters)
|
||||
vabs.s32 q13, q15
|
||||
|
||||
sub r0, r0, #2 ;go back 2 columns of src data
|
||||
sub r0, r0, r1, lsl #1 ;go back 2 lines of src data
|
||||
|
||||
;First pass: output_height lines x output_width columns (9x4)
|
||||
vld1.u8 {q3}, [r0], r1 ;load first 4-line src data
|
||||
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
|
||||
vld1.u8 {q4}, [r0], r1
|
||||
vdup.8 d1, d24[4]
|
||||
vld1.u8 {q5}, [r0], r1
|
||||
vdup.8 d2, d25[0]
|
||||
vld1.u8 {q6}, [r0], r1
|
||||
vdup.8 d3, d25[4]
|
||||
vdup.8 d4, d26[0]
|
||||
vdup.8 d5, d26[4]
|
||||
|
||||
pld [r0]
|
||||
pld [r0, r1]
|
||||
pld [r0, r1, lsl #1]
|
||||
|
||||
vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
|
||||
vext.8 d19, d8, d9, #5
|
||||
vext.8 d20, d10, d11, #5
|
||||
vext.8 d21, d12, d13, #5
|
||||
|
||||
vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
|
||||
vswp d11, d12
|
||||
|
||||
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
|
||||
vzip.32 d20, d21
|
||||
vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5])
|
||||
vmull.u8 q8, d20, d5
|
||||
|
||||
vmov q4, q3 ;keep original src data in q4 q6
|
||||
vmov q6, q5
|
||||
|
||||
vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
|
||||
vzip.32 d10, d11
|
||||
vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
|
||||
vshr.u64 q10, q6, #8
|
||||
vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0])
|
||||
vmlal.u8 q8, d10, d0
|
||||
|
||||
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
|
||||
vzip.32 d20, d21
|
||||
vshr.u64 q3, q4, #32 ;construct src_ptr[2]
|
||||
vshr.u64 q5, q6, #32
|
||||
vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1])
|
||||
vmlsl.u8 q8, d20, d1
|
||||
|
||||
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
|
||||
vzip.32 d10, d11
|
||||
vshr.u64 q9, q4, #16 ;construct src_ptr[0]
|
||||
vshr.u64 q10, q6, #16
|
||||
vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4])
|
||||
vmlsl.u8 q8, d10, d4
|
||||
|
||||
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
|
||||
vzip.32 d20, d21
|
||||
vshr.u64 q3, q4, #24 ;construct src_ptr[1]
|
||||
vshr.u64 q5, q6, #24
|
||||
vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2])
|
||||
vmlal.u8 q8, d20, d2
|
||||
|
||||
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
|
||||
vzip.32 d10, d11
|
||||
vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3])
|
||||
vmull.u8 q10, d10, d3
|
||||
|
||||
vld1.u8 {q3}, [r0], r1 ;load rest 5-line src data
|
||||
vld1.u8 {q4}, [r0], r1
|
||||
|
||||
vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q8, q10
|
||||
|
||||
vld1.u8 {q5}, [r0], r1
|
||||
vld1.u8 {q6}, [r0], r1
|
||||
|
||||
vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d28, q8, #7
|
||||
|
||||
;First Pass on rest 5-line data
|
||||
vld1.u8 {q11}, [r0], r1
|
||||
|
||||
vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
|
||||
vext.8 d19, d8, d9, #5
|
||||
vext.8 d20, d10, d11, #5
|
||||
vext.8 d21, d12, d13, #5
|
||||
|
||||
vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
|
||||
vswp d11, d12
|
||||
|
||||
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
|
||||
vzip.32 d20, d21
|
||||
vext.8 d31, d22, d23, #5 ;construct src_ptr[3]
|
||||
vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5])
|
||||
vmull.u8 q8, d20, d5
|
||||
vmull.u8 q12, d31, d5 ;(src_ptr[3] * vp8_filter[5])
|
||||
|
||||
vmov q4, q3 ;keep original src data in q4 q6
|
||||
vmov q6, q5
|
||||
|
||||
vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
|
||||
vzip.32 d10, d11
|
||||
vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
|
||||
vshr.u64 q10, q6, #8
|
||||
|
||||
vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0])
|
||||
vmlal.u8 q8, d10, d0
|
||||
vmlal.u8 q12, d22, d0 ;(src_ptr[-2] * vp8_filter[0])
|
||||
|
||||
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
|
||||
vzip.32 d20, d21
|
||||
vshr.u64 q3, q4, #32 ;construct src_ptr[2]
|
||||
vshr.u64 q5, q6, #32
|
||||
vext.8 d31, d22, d23, #1 ;construct src_ptr[-1]
|
||||
|
||||
vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1])
|
||||
vmlsl.u8 q8, d20, d1
|
||||
vmlsl.u8 q12, d31, d1 ;-(src_ptr[-1] * vp8_filter[1])
|
||||
|
||||
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
|
||||
vzip.32 d10, d11
|
||||
vshr.u64 q9, q4, #16 ;construct src_ptr[0]
|
||||
vshr.u64 q10, q6, #16
|
||||
vext.8 d31, d22, d23, #4 ;construct src_ptr[2]
|
||||
|
||||
vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4])
|
||||
vmlsl.u8 q8, d10, d4
|
||||
vmlsl.u8 q12, d31, d4 ;-(src_ptr[2] * vp8_filter[4])
|
||||
|
||||
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
|
||||
vzip.32 d20, d21
|
||||
vshr.u64 q3, q4, #24 ;construct src_ptr[1]
|
||||
vshr.u64 q5, q6, #24
|
||||
vext.8 d31, d22, d23, #2 ;construct src_ptr[0]
|
||||
|
||||
vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2])
|
||||
vmlal.u8 q8, d20, d2
|
||||
vmlal.u8 q12, d31, d2 ;(src_ptr[0] * vp8_filter[2])
|
||||
|
||||
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
|
||||
vzip.32 d10, d11
|
||||
vext.8 d31, d22, d23, #3 ;construct src_ptr[1]
|
||||
vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3])
|
||||
vmull.u8 q10, d10, d3
|
||||
vmull.u8 q11, d31, d3 ;(src_ptr[1] * vp8_filter[3])
|
||||
|
||||
add r3, r12, r3, lsl #5
|
||||
|
||||
vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q8, q10
|
||||
vqadd.s16 q12, q11
|
||||
|
||||
vext.8 d23, d27, d28, #4
|
||||
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
|
||||
|
||||
vqrshrun.s16 d29, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d30, q8, #7
|
||||
vqrshrun.s16 d31, q12, #7
|
||||
|
||||
;Second pass: 4x4
|
||||
vabs.s32 q7, q5
|
||||
vabs.s32 q8, q6
|
||||
|
||||
vext.8 d24, d28, d29, #4
|
||||
vext.8 d25, d29, d30, #4
|
||||
vext.8 d26, d30, d31, #4
|
||||
|
||||
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
|
||||
vdup.8 d1, d14[4]
|
||||
vdup.8 d2, d15[0]
|
||||
vdup.8 d3, d15[4]
|
||||
vdup.8 d4, d16[0]
|
||||
vdup.8 d5, d16[4]
|
||||
|
||||
vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp8_filter[0])
|
||||
vmull.u8 q4, d28, d0
|
||||
|
||||
vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp8_filter[5])
|
||||
vmull.u8 q6, d26, d5
|
||||
|
||||
vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp8_filter[4])
|
||||
vmlsl.u8 q4, d30, d4
|
||||
|
||||
vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
|
||||
vmlsl.u8 q6, d24, d1
|
||||
|
||||
vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp8_filter[2])
|
||||
vmlal.u8 q4, d29, d2
|
||||
|
||||
vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp8_filter[3])
|
||||
vmlal.u8 q6, d25, d3
|
||||
|
||||
add r0, r4, lr
|
||||
add r1, r0, lr
|
||||
add r2, r1, lr
|
||||
|
||||
vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q6, q4
|
||||
|
||||
vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d4, q6, #7
|
||||
|
||||
vst1.32 {d3[0]}, [r4] ;store result
|
||||
vst1.32 {d3[1]}, [r0]
|
||||
vst1.32 {d4[0]}, [r1]
|
||||
vst1.32 {d4[1]}, [r2]
|
||||
|
||||
pop {r4, pc}
|
||||
|
||||
|
||||
;---------------------
|
||||
firstpass_filter4x4_only
|
||||
vabs.s32 q12, q14 ;get abs(filer_parameters)
|
||||
vabs.s32 q13, q15
|
||||
|
||||
sub r0, r0, #2 ;go back 2 columns of src data
|
||||
|
||||
;First pass: output_height lines x output_width columns (4x4)
|
||||
vld1.u8 {q3}, [r0], r1 ;load first 4-line src data
|
||||
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
|
||||
vld1.u8 {q4}, [r0], r1
|
||||
vdup.8 d1, d24[4]
|
||||
vld1.u8 {q5}, [r0], r1
|
||||
vdup.8 d2, d25[0]
|
||||
vld1.u8 {q6}, [r0], r1
|
||||
|
||||
vdup.8 d3, d25[4]
|
||||
vdup.8 d4, d26[0]
|
||||
vdup.8 d5, d26[4]
|
||||
|
||||
vext.8 d18, d6, d7, #5 ;construct src_ptr[3]
|
||||
vext.8 d19, d8, d9, #5
|
||||
vext.8 d20, d10, d11, #5
|
||||
vext.8 d21, d12, d13, #5
|
||||
|
||||
vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done
|
||||
vswp d11, d12
|
||||
|
||||
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3])
|
||||
vzip.32 d20, d21
|
||||
vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp8_filter[5])
|
||||
vmull.u8 q8, d20, d5
|
||||
|
||||
vmov q4, q3 ;keep original src data in q4 q6
|
||||
vmov q6, q5
|
||||
|
||||
vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together
|
||||
vzip.32 d10, d11
|
||||
vshr.u64 q9, q4, #8 ;construct src_ptr[-1]
|
||||
vshr.u64 q10, q6, #8
|
||||
vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp8_filter[0])
|
||||
vmlal.u8 q8, d10, d0
|
||||
|
||||
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1])
|
||||
vzip.32 d20, d21
|
||||
vshr.u64 q3, q4, #32 ;construct src_ptr[2]
|
||||
vshr.u64 q5, q6, #32
|
||||
vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp8_filter[1])
|
||||
vmlsl.u8 q8, d20, d1
|
||||
|
||||
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2])
|
||||
vzip.32 d10, d11
|
||||
vshr.u64 q9, q4, #16 ;construct src_ptr[0]
|
||||
vshr.u64 q10, q6, #16
|
||||
vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp8_filter[4])
|
||||
vmlsl.u8 q8, d10, d4
|
||||
|
||||
vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0])
|
||||
vzip.32 d20, d21
|
||||
vshr.u64 q3, q4, #24 ;construct src_ptr[1]
|
||||
vshr.u64 q5, q6, #24
|
||||
vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp8_filter[2])
|
||||
vmlal.u8 q8, d20, d2
|
||||
|
||||
vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1])
|
||||
vzip.32 d10, d11
|
||||
vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp8_filter[3])
|
||||
vmull.u8 q10, d10, d3
|
||||
|
||||
add r0, r4, lr
|
||||
add r1, r0, lr
|
||||
add r2, r1, lr
|
||||
|
||||
vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q8, q10
|
||||
|
||||
vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d28, q8, #7
|
||||
|
||||
vst1.32 {d27[0]}, [r4] ;store result
|
||||
vst1.32 {d27[1]}, [r0]
|
||||
vst1.32 {d28[0]}, [r1]
|
||||
vst1.32 {d28[1]}, [r2]
|
||||
|
||||
pop {r4, pc}
|
||||
|
||||
|
||||
;---------------------
|
||||
secondpass_filter4x4_only
|
||||
sub r0, r0, r1, lsl #1
|
||||
add r3, r12, r3, lsl #5
|
||||
|
||||
vld1.32 {d27[0]}, [r0], r1 ;load src data
|
||||
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
|
||||
vld1.32 {d27[1]}, [r0], r1
|
||||
vabs.s32 q7, q5
|
||||
vld1.32 {d28[0]}, [r0], r1
|
||||
vabs.s32 q8, q6
|
||||
vld1.32 {d28[1]}, [r0], r1
|
||||
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
|
||||
vld1.32 {d29[0]}, [r0], r1
|
||||
vdup.8 d1, d14[4]
|
||||
vld1.32 {d29[1]}, [r0], r1
|
||||
vdup.8 d2, d15[0]
|
||||
vld1.32 {d30[0]}, [r0], r1
|
||||
vdup.8 d3, d15[4]
|
||||
vld1.32 {d30[1]}, [r0], r1
|
||||
vdup.8 d4, d16[0]
|
||||
vld1.32 {d31[0]}, [r0], r1
|
||||
vdup.8 d5, d16[4]
|
||||
|
||||
vext.8 d23, d27, d28, #4
|
||||
vext.8 d24, d28, d29, #4
|
||||
vext.8 d25, d29, d30, #4
|
||||
vext.8 d26, d30, d31, #4
|
||||
|
||||
vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp8_filter[0])
|
||||
vmull.u8 q4, d28, d0
|
||||
|
||||
vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp8_filter[5])
|
||||
vmull.u8 q6, d26, d5
|
||||
|
||||
vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp8_filter[4])
|
||||
vmlsl.u8 q4, d30, d4
|
||||
|
||||
vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
|
||||
vmlsl.u8 q6, d24, d1
|
||||
|
||||
vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp8_filter[2])
|
||||
vmlal.u8 q4, d29, d2
|
||||
|
||||
vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp8_filter[3])
|
||||
vmlal.u8 q6, d25, d3
|
||||
|
||||
add r0, r4, lr
|
||||
add r1, r0, lr
|
||||
add r2, r1, lr
|
||||
|
||||
vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q6, q4
|
||||
|
||||
vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d4, q6, #7
|
||||
|
||||
vst1.32 {d3[0]}, [r4] ;store result
|
||||
vst1.32 {d3[1]}, [r0]
|
||||
vst1.32 {d4[0]}, [r1]
|
||||
vst1.32 {d4[1]}, [r2]
|
||||
|
||||
pop {r4, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
;-----------------
|
||||
|
||||
END
|
|
@ -0,0 +1,473 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_sixtap_predict8x4_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
filter8_coeff
|
||||
DCD 0, 0, 128, 0, 0, 0, 0, 0
|
||||
DCD 0, -6, 123, 12, -1, 0, 0, 0
|
||||
DCD 2, -11, 108, 36, -8, 1, 0, 0
|
||||
DCD 0, -9, 93, 50, -6, 0, 0, 0
|
||||
DCD 3, -16, 77, 77, -16, 3, 0, 0
|
||||
DCD 0, -6, 50, 93, -9, 0, 0, 0
|
||||
DCD 1, -8, 36, 108, -11, 2, 0, 0
|
||||
DCD 0, -1, 12, 123, -6, 0, 0, 0
|
||||
|
||||
; r0 unsigned char *src_ptr,
|
||||
; r1 int src_pixels_per_line,
|
||||
; r2 int xoffset,
|
||||
; r3 int yoffset,
|
||||
; r4 unsigned char *dst_ptr,
|
||||
; stack(r5) int dst_pitch
|
||||
|
||||
|vp8_sixtap_predict8x4_neon| PROC
|
||||
push {r4-r5, lr}
|
||||
|
||||
adr r12, filter8_coeff
|
||||
ldr r4, [sp, #12] ;load parameters from stack
|
||||
ldr r5, [sp, #16] ;load parameters from stack
|
||||
|
||||
cmp r2, #0 ;skip first_pass filter if xoffset=0
|
||||
beq secondpass_filter8x4_only
|
||||
|
||||
add r2, r12, r2, lsl #5 ;calculate filter location
|
||||
|
||||
cmp r3, #0 ;skip second_pass filter if yoffset=0
|
||||
|
||||
vld1.s32 {q14, q15}, [r2] ;load first_pass filter
|
||||
|
||||
beq firstpass_filter8x4_only
|
||||
|
||||
sub sp, sp, #32 ;reserve space on stack for temporary storage
|
||||
vabs.s32 q12, q14
|
||||
vabs.s32 q13, q15
|
||||
|
||||
sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
|
||||
mov lr, sp
|
||||
sub r0, r0, r1, lsl #1
|
||||
|
||||
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
|
||||
vdup.8 d1, d24[4]
|
||||
vdup.8 d2, d25[0]
|
||||
|
||||
;First pass: output_height lines x output_width columns (9x8)
|
||||
vld1.u8 {q3}, [r0], r1 ;load src data
|
||||
vdup.8 d3, d25[4]
|
||||
vld1.u8 {q4}, [r0], r1
|
||||
vdup.8 d4, d26[0]
|
||||
vld1.u8 {q5}, [r0], r1
|
||||
vdup.8 d5, d26[4]
|
||||
vld1.u8 {q6}, [r0], r1
|
||||
|
||||
pld [r0]
|
||||
pld [r0, r1]
|
||||
pld [r0, r1, lsl #1]
|
||||
|
||||
vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
|
||||
vmull.u8 q8, d8, d0
|
||||
vmull.u8 q9, d10, d0
|
||||
vmull.u8 q10, d12, d0
|
||||
|
||||
vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
|
||||
vext.8 d29, d8, d9, #1
|
||||
vext.8 d30, d10, d11, #1
|
||||
vext.8 d31, d12, d13, #1
|
||||
|
||||
vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
|
||||
vmlsl.u8 q8, d29, d1
|
||||
vmlsl.u8 q9, d30, d1
|
||||
vmlsl.u8 q10, d31, d1
|
||||
|
||||
vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
|
||||
vext.8 d29, d8, d9, #4
|
||||
vext.8 d30, d10, d11, #4
|
||||
vext.8 d31, d12, d13, #4
|
||||
|
||||
vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
|
||||
vmlsl.u8 q8, d29, d4
|
||||
vmlsl.u8 q9, d30, d4
|
||||
vmlsl.u8 q10, d31, d4
|
||||
|
||||
vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
|
||||
vext.8 d29, d8, d9, #2
|
||||
vext.8 d30, d10, d11, #2
|
||||
vext.8 d31, d12, d13, #2
|
||||
|
||||
vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
|
||||
vmlal.u8 q8, d29, d2
|
||||
vmlal.u8 q9, d30, d2
|
||||
vmlal.u8 q10, d31, d2
|
||||
|
||||
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
|
||||
vext.8 d29, d8, d9, #5
|
||||
vext.8 d30, d10, d11, #5
|
||||
vext.8 d31, d12, d13, #5
|
||||
|
||||
vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
|
||||
vmlal.u8 q8, d29, d5
|
||||
vmlal.u8 q9, d30, d5
|
||||
vmlal.u8 q10, d31, d5
|
||||
|
||||
vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
|
||||
vext.8 d29, d8, d9, #3
|
||||
vext.8 d30, d10, d11, #3
|
||||
vext.8 d31, d12, d13, #3
|
||||
|
||||
vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
|
||||
vmull.u8 q4, d29, d3
|
||||
vmull.u8 q5, d30, d3
|
||||
vmull.u8 q6, d31, d3
|
||||
|
||||
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q8, q4
|
||||
vqadd.s16 q9, q5
|
||||
vqadd.s16 q10, q6
|
||||
|
||||
vld1.u8 {q3}, [r0], r1 ;load src data
|
||||
|
||||
vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d23, q8, #7
|
||||
vqrshrun.s16 d24, q9, #7
|
||||
vqrshrun.s16 d25, q10, #7
|
||||
|
||||
vld1.u8 {q4}, [r0], r1
|
||||
vst1.u8 {d22}, [lr]! ;store result
|
||||
vld1.u8 {q5}, [r0], r1
|
||||
vst1.u8 {d23}, [lr]!
|
||||
vld1.u8 {q6}, [r0], r1
|
||||
vst1.u8 {d24}, [lr]!
|
||||
vld1.u8 {q7}, [r0], r1
|
||||
vst1.u8 {d25}, [lr]!
|
||||
|
||||
;first_pass filtering on the rest 5-line data
|
||||
vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
|
||||
vmull.u8 q9, d8, d0
|
||||
vmull.u8 q10, d10, d0
|
||||
vmull.u8 q11, d12, d0
|
||||
vmull.u8 q12, d14, d0
|
||||
|
||||
vext.8 d27, d6, d7, #1 ;construct src_ptr[-1]
|
||||
vext.8 d28, d8, d9, #1
|
||||
vext.8 d29, d10, d11, #1
|
||||
vext.8 d30, d12, d13, #1
|
||||
vext.8 d31, d14, d15, #1
|
||||
|
||||
vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp8_filter[1])
|
||||
vmlsl.u8 q9, d28, d1
|
||||
vmlsl.u8 q10, d29, d1
|
||||
vmlsl.u8 q11, d30, d1
|
||||
vmlsl.u8 q12, d31, d1
|
||||
|
||||
vext.8 d27, d6, d7, #4 ;construct src_ptr[2]
|
||||
vext.8 d28, d8, d9, #4
|
||||
vext.8 d29, d10, d11, #4
|
||||
vext.8 d30, d12, d13, #4
|
||||
vext.8 d31, d14, d15, #4
|
||||
|
||||
vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp8_filter[4])
|
||||
vmlsl.u8 q9, d28, d4
|
||||
vmlsl.u8 q10, d29, d4
|
||||
vmlsl.u8 q11, d30, d4
|
||||
vmlsl.u8 q12, d31, d4
|
||||
|
||||
vext.8 d27, d6, d7, #2 ;construct src_ptr[0]
|
||||
vext.8 d28, d8, d9, #2
|
||||
vext.8 d29, d10, d11, #2
|
||||
vext.8 d30, d12, d13, #2
|
||||
vext.8 d31, d14, d15, #2
|
||||
|
||||
vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp8_filter[2])
|
||||
vmlal.u8 q9, d28, d2
|
||||
vmlal.u8 q10, d29, d2
|
||||
vmlal.u8 q11, d30, d2
|
||||
vmlal.u8 q12, d31, d2
|
||||
|
||||
vext.8 d27, d6, d7, #5 ;construct src_ptr[3]
|
||||
vext.8 d28, d8, d9, #5
|
||||
vext.8 d29, d10, d11, #5
|
||||
vext.8 d30, d12, d13, #5
|
||||
vext.8 d31, d14, d15, #5
|
||||
|
||||
vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp8_filter[5])
|
||||
vmlal.u8 q9, d28, d5
|
||||
vmlal.u8 q10, d29, d5
|
||||
vmlal.u8 q11, d30, d5
|
||||
vmlal.u8 q12, d31, d5
|
||||
|
||||
vext.8 d27, d6, d7, #3 ;construct src_ptr[1]
|
||||
vext.8 d28, d8, d9, #3
|
||||
vext.8 d29, d10, d11, #3
|
||||
vext.8 d30, d12, d13, #3
|
||||
vext.8 d31, d14, d15, #3
|
||||
|
||||
vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp8_filter[3])
|
||||
vmull.u8 q4, d28, d3
|
||||
vmull.u8 q5, d29, d3
|
||||
vmull.u8 q6, d30, d3
|
||||
vmull.u8 q7, d31, d3
|
||||
|
||||
vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q9, q4
|
||||
vqadd.s16 q10, q5
|
||||
vqadd.s16 q11, q6
|
||||
vqadd.s16 q12, q7
|
||||
|
||||
vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d27, q9, #7
|
||||
vqrshrun.s16 d28, q10, #7
|
||||
vqrshrun.s16 d29, q11, #7 ;load intermediate data from stack
|
||||
vqrshrun.s16 d30, q12, #7
|
||||
|
||||
;Second pass: 8x4
|
||||
;secondpass_filter
|
||||
add r3, r12, r3, lsl #5
|
||||
sub lr, lr, #32
|
||||
|
||||
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
|
||||
vld1.u8 {q11}, [lr]!
|
||||
|
||||
vabs.s32 q7, q5
|
||||
vabs.s32 q8, q6
|
||||
|
||||
vld1.u8 {q12}, [lr]!
|
||||
|
||||
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
|
||||
vdup.8 d1, d14[4]
|
||||
vdup.8 d2, d15[0]
|
||||
vdup.8 d3, d15[4]
|
||||
vdup.8 d4, d16[0]
|
||||
vdup.8 d5, d16[4]
|
||||
|
||||
vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp8_filter[0])
|
||||
vmull.u8 q4, d23, d0
|
||||
vmull.u8 q5, d24, d0
|
||||
vmull.u8 q6, d25, d0
|
||||
|
||||
vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
|
||||
vmlsl.u8 q4, d24, d1
|
||||
vmlsl.u8 q5, d25, d1
|
||||
vmlsl.u8 q6, d26, d1
|
||||
|
||||
vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp8_filter[4])
|
||||
vmlsl.u8 q4, d27, d4
|
||||
vmlsl.u8 q5, d28, d4
|
||||
vmlsl.u8 q6, d29, d4
|
||||
|
||||
vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp8_filter[2])
|
||||
vmlal.u8 q4, d25, d2
|
||||
vmlal.u8 q5, d26, d2
|
||||
vmlal.u8 q6, d27, d2
|
||||
|
||||
vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp8_filter[5])
|
||||
vmlal.u8 q4, d28, d5
|
||||
vmlal.u8 q5, d29, d5
|
||||
vmlal.u8 q6, d30, d5
|
||||
|
||||
vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp8_filter[3])
|
||||
vmull.u8 q8, d26, d3
|
||||
vmull.u8 q9, d27, d3
|
||||
vmull.u8 q10, d28, d3
|
||||
|
||||
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q8, q4
|
||||
vqadd.s16 q9, q5
|
||||
vqadd.s16 q10, q6
|
||||
|
||||
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d7, q8, #7
|
||||
vqrshrun.s16 d8, q9, #7
|
||||
vqrshrun.s16 d9, q10, #7
|
||||
|
||||
vst1.u8 {d6}, [r4], r5 ;store result
|
||||
vst1.u8 {d7}, [r4], r5
|
||||
vst1.u8 {d8}, [r4], r5
|
||||
vst1.u8 {d9}, [r4], r5
|
||||
|
||||
add sp, sp, #32
|
||||
pop {r4-r5,pc}
|
||||
|
||||
;--------------------
|
||||
firstpass_filter8x4_only
|
||||
vabs.s32 q12, q14
|
||||
vabs.s32 q13, q15
|
||||
|
||||
sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
|
||||
vld1.u8 {q3}, [r0], r1 ;load src data
|
||||
|
||||
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
|
||||
vld1.u8 {q4}, [r0], r1
|
||||
vdup.8 d1, d24[4]
|
||||
vld1.u8 {q5}, [r0], r1
|
||||
vdup.8 d2, d25[0]
|
||||
vld1.u8 {q6}, [r0], r1
|
||||
vdup.8 d3, d25[4]
|
||||
vdup.8 d4, d26[0]
|
||||
vdup.8 d5, d26[4]
|
||||
|
||||
;First pass: output_height lines x output_width columns (4x8)
|
||||
pld [r0]
|
||||
pld [r0, r1]
|
||||
pld [r0, r1, lsl #1]
|
||||
|
||||
vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
|
||||
vmull.u8 q8, d8, d0
|
||||
vmull.u8 q9, d10, d0
|
||||
vmull.u8 q10, d12, d0
|
||||
|
||||
vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
|
||||
vext.8 d29, d8, d9, #1
|
||||
vext.8 d30, d10, d11, #1
|
||||
vext.8 d31, d12, d13, #1
|
||||
|
||||
vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
|
||||
vmlsl.u8 q8, d29, d1
|
||||
vmlsl.u8 q9, d30, d1
|
||||
vmlsl.u8 q10, d31, d1
|
||||
|
||||
vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
|
||||
vext.8 d29, d8, d9, #4
|
||||
vext.8 d30, d10, d11, #4
|
||||
vext.8 d31, d12, d13, #4
|
||||
|
||||
vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
|
||||
vmlsl.u8 q8, d29, d4
|
||||
vmlsl.u8 q9, d30, d4
|
||||
vmlsl.u8 q10, d31, d4
|
||||
|
||||
vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
|
||||
vext.8 d29, d8, d9, #2
|
||||
vext.8 d30, d10, d11, #2
|
||||
vext.8 d31, d12, d13, #2
|
||||
|
||||
vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
|
||||
vmlal.u8 q8, d29, d2
|
||||
vmlal.u8 q9, d30, d2
|
||||
vmlal.u8 q10, d31, d2
|
||||
|
||||
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
|
||||
vext.8 d29, d8, d9, #5
|
||||
vext.8 d30, d10, d11, #5
|
||||
vext.8 d31, d12, d13, #5
|
||||
|
||||
vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
|
||||
vmlal.u8 q8, d29, d5
|
||||
vmlal.u8 q9, d30, d5
|
||||
vmlal.u8 q10, d31, d5
|
||||
|
||||
vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
|
||||
vext.8 d29, d8, d9, #3
|
||||
vext.8 d30, d10, d11, #3
|
||||
vext.8 d31, d12, d13, #3
|
||||
|
||||
vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
|
||||
vmull.u8 q4, d29, d3
|
||||
vmull.u8 q5, d30, d3
|
||||
vmull.u8 q6, d31, d3
|
||||
|
||||
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q8, q4
|
||||
vqadd.s16 q9, q5
|
||||
vqadd.s16 q10, q6
|
||||
|
||||
vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d23, q8, #7
|
||||
vqrshrun.s16 d24, q9, #7
|
||||
vqrshrun.s16 d25, q10, #7
|
||||
|
||||
vst1.u8 {d22}, [r4], r5 ;store result
|
||||
vst1.u8 {d23}, [r4], r5
|
||||
vst1.u8 {d24}, [r4], r5
|
||||
vst1.u8 {d25}, [r4], r5
|
||||
|
||||
pop {r4-r5,pc}
|
||||
|
||||
;---------------------
|
||||
secondpass_filter8x4_only
|
||||
;Second pass: 8x4
|
||||
add r3, r12, r3, lsl #5
|
||||
sub r0, r0, r1, lsl #1
|
||||
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
|
||||
vabs.s32 q7, q5
|
||||
vabs.s32 q8, q6
|
||||
|
||||
vld1.u8 {d22}, [r0], r1
|
||||
vld1.u8 {d23}, [r0], r1
|
||||
vld1.u8 {d24}, [r0], r1
|
||||
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
|
||||
vld1.u8 {d25}, [r0], r1
|
||||
vdup.8 d1, d14[4]
|
||||
vld1.u8 {d26}, [r0], r1
|
||||
vdup.8 d2, d15[0]
|
||||
vld1.u8 {d27}, [r0], r1
|
||||
vdup.8 d3, d15[4]
|
||||
vld1.u8 {d28}, [r0], r1
|
||||
vdup.8 d4, d16[0]
|
||||
vld1.u8 {d29}, [r0], r1
|
||||
vdup.8 d5, d16[4]
|
||||
vld1.u8 {d30}, [r0], r1
|
||||
|
||||
vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp8_filter[0])
|
||||
vmull.u8 q4, d23, d0
|
||||
vmull.u8 q5, d24, d0
|
||||
vmull.u8 q6, d25, d0
|
||||
|
||||
vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp8_filter[1])
|
||||
vmlsl.u8 q4, d24, d1
|
||||
vmlsl.u8 q5, d25, d1
|
||||
vmlsl.u8 q6, d26, d1
|
||||
|
||||
vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp8_filter[4])
|
||||
vmlsl.u8 q4, d27, d4
|
||||
vmlsl.u8 q5, d28, d4
|
||||
vmlsl.u8 q6, d29, d4
|
||||
|
||||
vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp8_filter[2])
|
||||
vmlal.u8 q4, d25, d2
|
||||
vmlal.u8 q5, d26, d2
|
||||
vmlal.u8 q6, d27, d2
|
||||
|
||||
vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp8_filter[5])
|
||||
vmlal.u8 q4, d28, d5
|
||||
vmlal.u8 q5, d29, d5
|
||||
vmlal.u8 q6, d30, d5
|
||||
|
||||
vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp8_filter[3])
|
||||
vmull.u8 q8, d26, d3
|
||||
vmull.u8 q9, d27, d3
|
||||
vmull.u8 q10, d28, d3
|
||||
|
||||
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q8, q4
|
||||
vqadd.s16 q9, q5
|
||||
vqadd.s16 q10, q6
|
||||
|
||||
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d7, q8, #7
|
||||
vqrshrun.s16 d8, q9, #7
|
||||
vqrshrun.s16 d9, q10, #7
|
||||
|
||||
vst1.u8 {d6}, [r4], r5 ;store result
|
||||
vst1.u8 {d7}, [r4], r5
|
||||
vst1.u8 {d8}, [r4], r5
|
||||
vst1.u8 {d9}, [r4], r5
|
||||
|
||||
pop {r4-r5,pc}
|
||||
|
||||
ENDP
|
||||
|
||||
;-----------------
|
||||
|
||||
END
|
|
@ -0,0 +1,524 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_sixtap_predict8x8_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
filter8_coeff
|
||||
DCD 0, 0, 128, 0, 0, 0, 0, 0
|
||||
DCD 0, -6, 123, 12, -1, 0, 0, 0
|
||||
DCD 2, -11, 108, 36, -8, 1, 0, 0
|
||||
DCD 0, -9, 93, 50, -6, 0, 0, 0
|
||||
DCD 3, -16, 77, 77, -16, 3, 0, 0
|
||||
DCD 0, -6, 50, 93, -9, 0, 0, 0
|
||||
DCD 1, -8, 36, 108, -11, 2, 0, 0
|
||||
DCD 0, -1, 12, 123, -6, 0, 0, 0
|
||||
|
||||
; r0 unsigned char *src_ptr,
|
||||
; r1 int src_pixels_per_line,
|
||||
; r2 int xoffset,
|
||||
; r3 int yoffset,
|
||||
; stack(r4) unsigned char *dst_ptr,
|
||||
; stack(r5) int dst_pitch
|
||||
|
||||
|vp8_sixtap_predict8x8_neon| PROC
|
||||
push {r4-r5, lr}
|
||||
|
||||
adr r12, filter8_coeff
|
||||
|
||||
ldr r4, [sp, #12] ;load parameters from stack
|
||||
ldr r5, [sp, #16] ;load parameters from stack
|
||||
|
||||
cmp r2, #0 ;skip first_pass filter if xoffset=0
|
||||
beq secondpass_filter8x8_only
|
||||
|
||||
add r2, r12, r2, lsl #5 ;calculate filter location
|
||||
|
||||
cmp r3, #0 ;skip second_pass filter if yoffset=0
|
||||
|
||||
vld1.s32 {q14, q15}, [r2] ;load first_pass filter
|
||||
|
||||
beq firstpass_filter8x8_only
|
||||
|
||||
sub sp, sp, #64 ;reserve space on stack for temporary storage
|
||||
mov lr, sp
|
||||
|
||||
vabs.s32 q12, q14
|
||||
vabs.s32 q13, q15
|
||||
|
||||
mov r2, #2 ;loop counter
|
||||
sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
|
||||
sub r0, r0, r1, lsl #1
|
||||
|
||||
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
|
||||
vdup.8 d1, d24[4]
|
||||
vdup.8 d2, d25[0]
|
||||
|
||||
;First pass: output_height lines x output_width columns (13x8)
|
||||
vld1.u8 {q3}, [r0], r1 ;load src data
|
||||
vdup.8 d3, d25[4]
|
||||
vld1.u8 {q4}, [r0], r1
|
||||
vdup.8 d4, d26[0]
|
||||
vld1.u8 {q5}, [r0], r1
|
||||
vdup.8 d5, d26[4]
|
||||
vld1.u8 {q6}, [r0], r1
|
||||
|
||||
filt_blk2d_fp8x8_loop_neon
|
||||
pld [r0]
|
||||
pld [r0, r1]
|
||||
pld [r0, r1, lsl #1]
|
||||
|
||||
vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
|
||||
vmull.u8 q8, d8, d0
|
||||
vmull.u8 q9, d10, d0
|
||||
vmull.u8 q10, d12, d0
|
||||
|
||||
vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
|
||||
vext.8 d29, d8, d9, #1
|
||||
vext.8 d30, d10, d11, #1
|
||||
vext.8 d31, d12, d13, #1
|
||||
|
||||
vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
|
||||
vmlsl.u8 q8, d29, d1
|
||||
vmlsl.u8 q9, d30, d1
|
||||
vmlsl.u8 q10, d31, d1
|
||||
|
||||
vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
|
||||
vext.8 d29, d8, d9, #4
|
||||
vext.8 d30, d10, d11, #4
|
||||
vext.8 d31, d12, d13, #4
|
||||
|
||||
vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
|
||||
vmlsl.u8 q8, d29, d4
|
||||
vmlsl.u8 q9, d30, d4
|
||||
vmlsl.u8 q10, d31, d4
|
||||
|
||||
vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
|
||||
vext.8 d29, d8, d9, #2
|
||||
vext.8 d30, d10, d11, #2
|
||||
vext.8 d31, d12, d13, #2
|
||||
|
||||
vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
|
||||
vmlal.u8 q8, d29, d2
|
||||
vmlal.u8 q9, d30, d2
|
||||
vmlal.u8 q10, d31, d2
|
||||
|
||||
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
|
||||
vext.8 d29, d8, d9, #5
|
||||
vext.8 d30, d10, d11, #5
|
||||
vext.8 d31, d12, d13, #5
|
||||
|
||||
vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
|
||||
vmlal.u8 q8, d29, d5
|
||||
vmlal.u8 q9, d30, d5
|
||||
vmlal.u8 q10, d31, d5
|
||||
|
||||
vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
|
||||
vext.8 d29, d8, d9, #3
|
||||
vext.8 d30, d10, d11, #3
|
||||
vext.8 d31, d12, d13, #3
|
||||
|
||||
vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
|
||||
vmull.u8 q4, d29, d3
|
||||
vmull.u8 q5, d30, d3
|
||||
vmull.u8 q6, d31, d3
|
||||
|
||||
subs r2, r2, #1
|
||||
|
||||
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q8, q4
|
||||
vqadd.s16 q9, q5
|
||||
vqadd.s16 q10, q6
|
||||
|
||||
vld1.u8 {q3}, [r0], r1 ;load src data
|
||||
|
||||
vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d23, q8, #7
|
||||
vqrshrun.s16 d24, q9, #7
|
||||
vqrshrun.s16 d25, q10, #7
|
||||
|
||||
vst1.u8 {d22}, [lr]! ;store result
|
||||
vld1.u8 {q4}, [r0], r1
|
||||
vst1.u8 {d23}, [lr]!
|
||||
vld1.u8 {q5}, [r0], r1
|
||||
vst1.u8 {d24}, [lr]!
|
||||
vld1.u8 {q6}, [r0], r1
|
||||
vst1.u8 {d25}, [lr]!
|
||||
|
||||
bne filt_blk2d_fp8x8_loop_neon
|
||||
|
||||
;first_pass filtering on the rest 5-line data
|
||||
;vld1.u8 {q3}, [r0], r1 ;load src data
|
||||
;vld1.u8 {q4}, [r0], r1
|
||||
;vld1.u8 {q5}, [r0], r1
|
||||
;vld1.u8 {q6}, [r0], r1
|
||||
vld1.u8 {q7}, [r0], r1
|
||||
|
||||
vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
|
||||
vmull.u8 q9, d8, d0
|
||||
vmull.u8 q10, d10, d0
|
||||
vmull.u8 q11, d12, d0
|
||||
vmull.u8 q12, d14, d0
|
||||
|
||||
vext.8 d27, d6, d7, #1 ;construct src_ptr[-1]
|
||||
vext.8 d28, d8, d9, #1
|
||||
vext.8 d29, d10, d11, #1
|
||||
vext.8 d30, d12, d13, #1
|
||||
vext.8 d31, d14, d15, #1
|
||||
|
||||
vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp8_filter[1])
|
||||
vmlsl.u8 q9, d28, d1
|
||||
vmlsl.u8 q10, d29, d1
|
||||
vmlsl.u8 q11, d30, d1
|
||||
vmlsl.u8 q12, d31, d1
|
||||
|
||||
vext.8 d27, d6, d7, #4 ;construct src_ptr[2]
|
||||
vext.8 d28, d8, d9, #4
|
||||
vext.8 d29, d10, d11, #4
|
||||
vext.8 d30, d12, d13, #4
|
||||
vext.8 d31, d14, d15, #4
|
||||
|
||||
vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp8_filter[4])
|
||||
vmlsl.u8 q9, d28, d4
|
||||
vmlsl.u8 q10, d29, d4
|
||||
vmlsl.u8 q11, d30, d4
|
||||
vmlsl.u8 q12, d31, d4
|
||||
|
||||
vext.8 d27, d6, d7, #2 ;construct src_ptr[0]
|
||||
vext.8 d28, d8, d9, #2
|
||||
vext.8 d29, d10, d11, #2
|
||||
vext.8 d30, d12, d13, #2
|
||||
vext.8 d31, d14, d15, #2
|
||||
|
||||
vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp8_filter[2])
|
||||
vmlal.u8 q9, d28, d2
|
||||
vmlal.u8 q10, d29, d2
|
||||
vmlal.u8 q11, d30, d2
|
||||
vmlal.u8 q12, d31, d2
|
||||
|
||||
vext.8 d27, d6, d7, #5 ;construct src_ptr[3]
|
||||
vext.8 d28, d8, d9, #5
|
||||
vext.8 d29, d10, d11, #5
|
||||
vext.8 d30, d12, d13, #5
|
||||
vext.8 d31, d14, d15, #5
|
||||
|
||||
vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp8_filter[5])
|
||||
vmlal.u8 q9, d28, d5
|
||||
vmlal.u8 q10, d29, d5
|
||||
vmlal.u8 q11, d30, d5
|
||||
vmlal.u8 q12, d31, d5
|
||||
|
||||
vext.8 d27, d6, d7, #3 ;construct src_ptr[1]
|
||||
vext.8 d28, d8, d9, #3
|
||||
vext.8 d29, d10, d11, #3
|
||||
vext.8 d30, d12, d13, #3
|
||||
vext.8 d31, d14, d15, #3
|
||||
|
||||
vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp8_filter[3])
|
||||
vmull.u8 q4, d28, d3
|
||||
vmull.u8 q5, d29, d3
|
||||
vmull.u8 q6, d30, d3
|
||||
vmull.u8 q7, d31, d3
|
||||
|
||||
vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q9, q4
|
||||
vqadd.s16 q10, q5
|
||||
vqadd.s16 q11, q6
|
||||
vqadd.s16 q12, q7
|
||||
|
||||
add r3, r12, r3, lsl #5
|
||||
|
||||
vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8
|
||||
sub lr, lr, #64
|
||||
vqrshrun.s16 d27, q9, #7
|
||||
vld1.u8 {q9}, [lr]! ;load intermediate data from stack
|
||||
vqrshrun.s16 d28, q10, #7
|
||||
vld1.u8 {q10}, [lr]!
|
||||
|
||||
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
|
||||
|
||||
vqrshrun.s16 d29, q11, #7
|
||||
vld1.u8 {q11}, [lr]!
|
||||
|
||||
vabs.s32 q7, q5
|
||||
vabs.s32 q8, q6
|
||||
|
||||
vqrshrun.s16 d30, q12, #7
|
||||
vld1.u8 {q12}, [lr]!
|
||||
|
||||
;Second pass: 8x8
|
||||
mov r3, #2 ;loop counter
|
||||
|
||||
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
|
||||
vdup.8 d1, d14[4]
|
||||
vdup.8 d2, d15[0]
|
||||
vdup.8 d3, d15[4]
|
||||
vdup.8 d4, d16[0]
|
||||
vdup.8 d5, d16[4]
|
||||
|
||||
filt_blk2d_sp8x8_loop_neon
|
||||
vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0])
|
||||
vmull.u8 q4, d19, d0
|
||||
vmull.u8 q5, d20, d0
|
||||
vmull.u8 q6, d21, d0
|
||||
|
||||
vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1])
|
||||
vmlsl.u8 q4, d20, d1
|
||||
vmlsl.u8 q5, d21, d1
|
||||
vmlsl.u8 q6, d22, d1
|
||||
|
||||
vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4])
|
||||
vmlsl.u8 q4, d23, d4
|
||||
vmlsl.u8 q5, d24, d4
|
||||
vmlsl.u8 q6, d25, d4
|
||||
|
||||
vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2])
|
||||
vmlal.u8 q4, d21, d2
|
||||
vmlal.u8 q5, d22, d2
|
||||
vmlal.u8 q6, d23, d2
|
||||
|
||||
vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5])
|
||||
vmlal.u8 q4, d24, d5
|
||||
vmlal.u8 q5, d25, d5
|
||||
vmlal.u8 q6, d26, d5
|
||||
|
||||
vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3])
|
||||
vmull.u8 q8, d22, d3
|
||||
vmull.u8 q9, d23, d3
|
||||
vmull.u8 q10, d24, d3
|
||||
|
||||
subs r3, r3, #1
|
||||
|
||||
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q8, q4
|
||||
vqadd.s16 q9, q5
|
||||
vqadd.s16 q10, q6
|
||||
|
||||
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d7, q8, #7
|
||||
vqrshrun.s16 d8, q9, #7
|
||||
vqrshrun.s16 d9, q10, #7
|
||||
|
||||
vmov q9, q11
|
||||
vst1.u8 {d6}, [r4], r5 ;store result
|
||||
vmov q10, q12
|
||||
vst1.u8 {d7}, [r4], r5
|
||||
vmov q11, q13
|
||||
vst1.u8 {d8}, [r4], r5
|
||||
vmov q12, q14
|
||||
vst1.u8 {d9}, [r4], r5
|
||||
vmov d26, d30
|
||||
|
||||
bne filt_blk2d_sp8x8_loop_neon
|
||||
|
||||
add sp, sp, #64
|
||||
pop {r4-r5,pc}
|
||||
|
||||
;---------------------
|
||||
firstpass_filter8x8_only
|
||||
;add r2, r12, r2, lsl #5 ;calculate filter location
|
||||
;vld1.s32 {q14, q15}, [r2] ;load first_pass filter
|
||||
vabs.s32 q12, q14
|
||||
vabs.s32 q13, q15
|
||||
|
||||
mov r2, #2 ;loop counter
|
||||
sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2)
|
||||
|
||||
vdup.8 d0, d24[0] ;first_pass filter (d0-d5)
|
||||
vdup.8 d1, d24[4]
|
||||
vdup.8 d2, d25[0]
|
||||
vdup.8 d3, d25[4]
|
||||
vdup.8 d4, d26[0]
|
||||
vdup.8 d5, d26[4]
|
||||
|
||||
;First pass: output_height lines x output_width columns (8x8)
|
||||
filt_blk2d_fpo8x8_loop_neon
|
||||
vld1.u8 {q3}, [r0], r1 ;load src data
|
||||
vld1.u8 {q4}, [r0], r1
|
||||
vld1.u8 {q5}, [r0], r1
|
||||
vld1.u8 {q6}, [r0], r1
|
||||
|
||||
pld [r0]
|
||||
pld [r0, r1]
|
||||
pld [r0, r1, lsl #1]
|
||||
|
||||
vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp8_filter[0])
|
||||
vmull.u8 q8, d8, d0
|
||||
vmull.u8 q9, d10, d0
|
||||
vmull.u8 q10, d12, d0
|
||||
|
||||
vext.8 d28, d6, d7, #1 ;construct src_ptr[-1]
|
||||
vext.8 d29, d8, d9, #1
|
||||
vext.8 d30, d10, d11, #1
|
||||
vext.8 d31, d12, d13, #1
|
||||
|
||||
vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp8_filter[1])
|
||||
vmlsl.u8 q8, d29, d1
|
||||
vmlsl.u8 q9, d30, d1
|
||||
vmlsl.u8 q10, d31, d1
|
||||
|
||||
vext.8 d28, d6, d7, #4 ;construct src_ptr[2]
|
||||
vext.8 d29, d8, d9, #4
|
||||
vext.8 d30, d10, d11, #4
|
||||
vext.8 d31, d12, d13, #4
|
||||
|
||||
vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp8_filter[4])
|
||||
vmlsl.u8 q8, d29, d4
|
||||
vmlsl.u8 q9, d30, d4
|
||||
vmlsl.u8 q10, d31, d4
|
||||
|
||||
vext.8 d28, d6, d7, #2 ;construct src_ptr[0]
|
||||
vext.8 d29, d8, d9, #2
|
||||
vext.8 d30, d10, d11, #2
|
||||
vext.8 d31, d12, d13, #2
|
||||
|
||||
vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp8_filter[2])
|
||||
vmlal.u8 q8, d29, d2
|
||||
vmlal.u8 q9, d30, d2
|
||||
vmlal.u8 q10, d31, d2
|
||||
|
||||
vext.8 d28, d6, d7, #5 ;construct src_ptr[3]
|
||||
vext.8 d29, d8, d9, #5
|
||||
vext.8 d30, d10, d11, #5
|
||||
vext.8 d31, d12, d13, #5
|
||||
|
||||
vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp8_filter[5])
|
||||
vmlal.u8 q8, d29, d5
|
||||
vmlal.u8 q9, d30, d5
|
||||
vmlal.u8 q10, d31, d5
|
||||
|
||||
vext.8 d28, d6, d7, #3 ;construct src_ptr[1]
|
||||
vext.8 d29, d8, d9, #3
|
||||
vext.8 d30, d10, d11, #3
|
||||
vext.8 d31, d12, d13, #3
|
||||
|
||||
vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp8_filter[3])
|
||||
vmull.u8 q4, d29, d3
|
||||
vmull.u8 q5, d30, d3
|
||||
vmull.u8 q6, d31, d3
|
||||
;
|
||||
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q8, q4
|
||||
vqadd.s16 q9, q5
|
||||
vqadd.s16 q10, q6
|
||||
|
||||
subs r2, r2, #1
|
||||
|
||||
vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d23, q8, #7
|
||||
vqrshrun.s16 d24, q9, #7
|
||||
vqrshrun.s16 d25, q10, #7
|
||||
|
||||
vst1.u8 {d22}, [r4], r5 ;store result
|
||||
vst1.u8 {d23}, [r4], r5
|
||||
vst1.u8 {d24}, [r4], r5
|
||||
vst1.u8 {d25}, [r4], r5
|
||||
|
||||
bne filt_blk2d_fpo8x8_loop_neon
|
||||
|
||||
pop {r4-r5,pc}
|
||||
|
||||
;---------------------
|
||||
secondpass_filter8x8_only
|
||||
sub r0, r0, r1, lsl #1
|
||||
add r3, r12, r3, lsl #5
|
||||
|
||||
vld1.u8 {d18}, [r0], r1 ;load src data
|
||||
vld1.s32 {q5, q6}, [r3] ;load second_pass filter
|
||||
vld1.u8 {d19}, [r0], r1
|
||||
vabs.s32 q7, q5
|
||||
vld1.u8 {d20}, [r0], r1
|
||||
vabs.s32 q8, q6
|
||||
vld1.u8 {d21}, [r0], r1
|
||||
mov r3, #2 ;loop counter
|
||||
vld1.u8 {d22}, [r0], r1
|
||||
vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5)
|
||||
vld1.u8 {d23}, [r0], r1
|
||||
vdup.8 d1, d14[4]
|
||||
vld1.u8 {d24}, [r0], r1
|
||||
vdup.8 d2, d15[0]
|
||||
vld1.u8 {d25}, [r0], r1
|
||||
vdup.8 d3, d15[4]
|
||||
vld1.u8 {d26}, [r0], r1
|
||||
vdup.8 d4, d16[0]
|
||||
vld1.u8 {d27}, [r0], r1
|
||||
vdup.8 d5, d16[4]
|
||||
vld1.u8 {d28}, [r0], r1
|
||||
vld1.u8 {d29}, [r0], r1
|
||||
vld1.u8 {d30}, [r0], r1
|
||||
|
||||
;Second pass: 8x8
|
||||
filt_blk2d_spo8x8_loop_neon
|
||||
vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp8_filter[0])
|
||||
vmull.u8 q4, d19, d0
|
||||
vmull.u8 q5, d20, d0
|
||||
vmull.u8 q6, d21, d0
|
||||
|
||||
vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp8_filter[1])
|
||||
vmlsl.u8 q4, d20, d1
|
||||
vmlsl.u8 q5, d21, d1
|
||||
vmlsl.u8 q6, d22, d1
|
||||
|
||||
vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp8_filter[4])
|
||||
vmlsl.u8 q4, d23, d4
|
||||
vmlsl.u8 q5, d24, d4
|
||||
vmlsl.u8 q6, d25, d4
|
||||
|
||||
vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp8_filter[2])
|
||||
vmlal.u8 q4, d21, d2
|
||||
vmlal.u8 q5, d22, d2
|
||||
vmlal.u8 q6, d23, d2
|
||||
|
||||
vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp8_filter[5])
|
||||
vmlal.u8 q4, d24, d5
|
||||
vmlal.u8 q5, d25, d5
|
||||
vmlal.u8 q6, d26, d5
|
||||
|
||||
vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp8_filter[3])
|
||||
vmull.u8 q8, d22, d3
|
||||
vmull.u8 q9, d23, d3
|
||||
vmull.u8 q10, d24, d3
|
||||
|
||||
subs r3, r3, #1
|
||||
|
||||
vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters)
|
||||
vqadd.s16 q8, q4
|
||||
vqadd.s16 q9, q5
|
||||
vqadd.s16 q10, q6
|
||||
|
||||
vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrun.s16 d7, q8, #7
|
||||
vqrshrun.s16 d8, q9, #7
|
||||
vqrshrun.s16 d9, q10, #7
|
||||
|
||||
vmov q9, q11
|
||||
vst1.u8 {d6}, [r4], r5 ;store result
|
||||
vmov q10, q12
|
||||
vst1.u8 {d7}, [r4], r5
|
||||
vmov q11, q13
|
||||
vst1.u8 {d8}, [r4], r5
|
||||
vmov q12, q14
|
||||
vst1.u8 {d9}, [r4], r5
|
||||
vmov d26, d30
|
||||
|
||||
bne filt_blk2d_spo8x8_loop_neon
|
||||
|
||||
pop {r4-r5,pc}
|
||||
|
||||
ENDP
|
||||
|
||||
;-----------------
|
||||
|
||||
END
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -0,0 +1,276 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_variance16x16_neon|
|
||||
EXPORT |vp8_variance16x8_neon|
|
||||
EXPORT |vp8_variance8x16_neon|
|
||||
EXPORT |vp8_variance8x8_neon|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; r0 unsigned char *src_ptr
|
||||
; r1 int source_stride
|
||||
; r2 unsigned char *ref_ptr
|
||||
; r3 int recon_stride
|
||||
; stack unsigned int *sse
|
||||
|vp8_variance16x16_neon| PROC
|
||||
vmov.i8 q8, #0 ;q8 - sum
|
||||
vmov.i8 q9, #0 ;q9, q10 - sse
|
||||
vmov.i8 q10, #0
|
||||
|
||||
mov r12, #8
|
||||
|
||||
variance16x16_neon_loop
|
||||
vld1.8 {q0}, [r0], r1 ;Load up source and reference
|
||||
vld1.8 {q2}, [r2], r3
|
||||
vld1.8 {q1}, [r0], r1
|
||||
vld1.8 {q3}, [r2], r3
|
||||
|
||||
vsubl.u8 q11, d0, d4 ;calculate diff
|
||||
vsubl.u8 q12, d1, d5
|
||||
vsubl.u8 q13, d2, d6
|
||||
vsubl.u8 q14, d3, d7
|
||||
|
||||
;VPADAL adds adjacent pairs of elements of a vector, and accumulates
|
||||
;the results into the elements of the destination vector. The explanation
|
||||
;in ARM guide is wrong.
|
||||
vpadal.s16 q8, q11 ;calculate sum
|
||||
vmlal.s16 q9, d22, d22 ;calculate sse
|
||||
vmlal.s16 q10, d23, d23
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
vpadal.s16 q8, q12
|
||||
vmlal.s16 q9, d24, d24
|
||||
vmlal.s16 q10, d25, d25
|
||||
vpadal.s16 q8, q13
|
||||
vmlal.s16 q9, d26, d26
|
||||
vmlal.s16 q10, d27, d27
|
||||
vpadal.s16 q8, q14
|
||||
vmlal.s16 q9, d28, d28
|
||||
vmlal.s16 q10, d29, d29
|
||||
|
||||
bne variance16x16_neon_loop
|
||||
|
||||
vadd.u32 q10, q9, q10 ;accumulate sse
|
||||
vpaddl.s32 q0, q8 ;accumulate sum
|
||||
|
||||
ldr r12, [sp] ;load *sse from stack
|
||||
|
||||
vpaddl.u32 q1, q10
|
||||
vadd.s64 d0, d0, d1
|
||||
vadd.u64 d1, d2, d3
|
||||
|
||||
;vmov.32 r0, d0[0] ;this instruction costs a lot
|
||||
;vmov.32 r1, d1[0]
|
||||
;mul r0, r0, r0
|
||||
;str r1, [r12]
|
||||
;sub r0, r1, r0, lsr #8
|
||||
|
||||
; while sum is signed, sum * sum is always positive and must be treated as
|
||||
; unsigned to avoid propagating the sign bit.
|
||||
vmull.s32 q5, d0, d0
|
||||
vst1.32 {d1[0]}, [r12] ;store sse
|
||||
vshr.u32 d10, d10, #8
|
||||
vsub.u32 d0, d1, d10
|
||||
|
||||
vmov.32 r0, d0[0] ;return
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
;================================
|
||||
;unsigned int vp8_variance16x8_c(
|
||||
; unsigned char *src_ptr,
|
||||
; int source_stride,
|
||||
; unsigned char *ref_ptr,
|
||||
; int recon_stride,
|
||||
; unsigned int *sse)
|
||||
|vp8_variance16x8_neon| PROC
|
||||
vmov.i8 q8, #0 ;q8 - sum
|
||||
vmov.i8 q9, #0 ;q9, q10 - sse
|
||||
vmov.i8 q10, #0
|
||||
|
||||
mov r12, #4
|
||||
|
||||
variance16x8_neon_loop
|
||||
vld1.8 {q0}, [r0], r1 ;Load up source and reference
|
||||
vld1.8 {q2}, [r2], r3
|
||||
vld1.8 {q1}, [r0], r1
|
||||
vld1.8 {q3}, [r2], r3
|
||||
|
||||
vsubl.u8 q11, d0, d4 ;calculate diff
|
||||
vsubl.u8 q12, d1, d5
|
||||
vsubl.u8 q13, d2, d6
|
||||
vsubl.u8 q14, d3, d7
|
||||
|
||||
vpadal.s16 q8, q11 ;calculate sum
|
||||
vmlal.s16 q9, d22, d22 ;calculate sse
|
||||
vmlal.s16 q10, d23, d23
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
vpadal.s16 q8, q12
|
||||
vmlal.s16 q9, d24, d24
|
||||
vmlal.s16 q10, d25, d25
|
||||
vpadal.s16 q8, q13
|
||||
vmlal.s16 q9, d26, d26
|
||||
vmlal.s16 q10, d27, d27
|
||||
vpadal.s16 q8, q14
|
||||
vmlal.s16 q9, d28, d28
|
||||
vmlal.s16 q10, d29, d29
|
||||
|
||||
bne variance16x8_neon_loop
|
||||
|
||||
vadd.u32 q10, q9, q10 ;accumulate sse
|
||||
vpaddl.s32 q0, q8 ;accumulate sum
|
||||
|
||||
ldr r12, [sp] ;load *sse from stack
|
||||
|
||||
vpaddl.u32 q1, q10
|
||||
vadd.s64 d0, d0, d1
|
||||
vadd.u64 d1, d2, d3
|
||||
|
||||
vmull.s32 q5, d0, d0
|
||||
vst1.32 {d1[0]}, [r12] ;store sse
|
||||
vshr.u32 d10, d10, #7
|
||||
vsub.u32 d0, d1, d10
|
||||
|
||||
vmov.32 r0, d0[0] ;return
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
;=================================
|
||||
;unsigned int vp8_variance8x16_c(
|
||||
; unsigned char *src_ptr,
|
||||
; int source_stride,
|
||||
; unsigned char *ref_ptr,
|
||||
; int recon_stride,
|
||||
; unsigned int *sse)
|
||||
|
||||
|vp8_variance8x16_neon| PROC
|
||||
vmov.i8 q8, #0 ;q8 - sum
|
||||
vmov.i8 q9, #0 ;q9, q10 - sse
|
||||
vmov.i8 q10, #0
|
||||
|
||||
mov r12, #8
|
||||
|
||||
variance8x16_neon_loop
|
||||
vld1.8 {d0}, [r0], r1 ;Load up source and reference
|
||||
vld1.8 {d4}, [r2], r3
|
||||
vld1.8 {d2}, [r0], r1
|
||||
vld1.8 {d6}, [r2], r3
|
||||
|
||||
vsubl.u8 q11, d0, d4 ;calculate diff
|
||||
vsubl.u8 q12, d2, d6
|
||||
|
||||
vpadal.s16 q8, q11 ;calculate sum
|
||||
vmlal.s16 q9, d22, d22 ;calculate sse
|
||||
vmlal.s16 q10, d23, d23
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
vpadal.s16 q8, q12
|
||||
vmlal.s16 q9, d24, d24
|
||||
vmlal.s16 q10, d25, d25
|
||||
|
||||
bne variance8x16_neon_loop
|
||||
|
||||
vadd.u32 q10, q9, q10 ;accumulate sse
|
||||
vpaddl.s32 q0, q8 ;accumulate sum
|
||||
|
||||
ldr r12, [sp] ;load *sse from stack
|
||||
|
||||
vpaddl.u32 q1, q10
|
||||
vadd.s64 d0, d0, d1
|
||||
vadd.u64 d1, d2, d3
|
||||
|
||||
vmull.s32 q5, d0, d0
|
||||
vst1.32 {d1[0]}, [r12] ;store sse
|
||||
vshr.u32 d10, d10, #7
|
||||
vsub.u32 d0, d1, d10
|
||||
|
||||
vmov.32 r0, d0[0] ;return
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
;==================================
|
||||
; r0 unsigned char *src_ptr
|
||||
; r1 int source_stride
|
||||
; r2 unsigned char *ref_ptr
|
||||
; r3 int recon_stride
|
||||
; stack unsigned int *sse
|
||||
|vp8_variance8x8_neon| PROC
|
||||
vmov.i8 q8, #0 ;q8 - sum
|
||||
vmov.i8 q9, #0 ;q9, q10 - sse
|
||||
vmov.i8 q10, #0
|
||||
|
||||
mov r12, #2
|
||||
|
||||
variance8x8_neon_loop
|
||||
vld1.8 {d0}, [r0], r1 ;Load up source and reference
|
||||
vld1.8 {d4}, [r2], r3
|
||||
vld1.8 {d1}, [r0], r1
|
||||
vld1.8 {d5}, [r2], r3
|
||||
vld1.8 {d2}, [r0], r1
|
||||
vld1.8 {d6}, [r2], r3
|
||||
vld1.8 {d3}, [r0], r1
|
||||
vld1.8 {d7}, [r2], r3
|
||||
|
||||
vsubl.u8 q11, d0, d4 ;calculate diff
|
||||
vsubl.u8 q12, d1, d5
|
||||
vsubl.u8 q13, d2, d6
|
||||
vsubl.u8 q14, d3, d7
|
||||
|
||||
vpadal.s16 q8, q11 ;calculate sum
|
||||
vmlal.s16 q9, d22, d22 ;calculate sse
|
||||
vmlal.s16 q10, d23, d23
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
vpadal.s16 q8, q12
|
||||
vmlal.s16 q9, d24, d24
|
||||
vmlal.s16 q10, d25, d25
|
||||
vpadal.s16 q8, q13
|
||||
vmlal.s16 q9, d26, d26
|
||||
vmlal.s16 q10, d27, d27
|
||||
vpadal.s16 q8, q14
|
||||
vmlal.s16 q9, d28, d28
|
||||
vmlal.s16 q10, d29, d29
|
||||
|
||||
bne variance8x8_neon_loop
|
||||
|
||||
vadd.u32 q10, q9, q10 ;accumulate sse
|
||||
vpaddl.s32 q0, q8 ;accumulate sum
|
||||
|
||||
ldr r12, [sp] ;load *sse from stack
|
||||
|
||||
vpaddl.u32 q1, q10
|
||||
vadd.s64 d0, d0, d1
|
||||
vadd.u64 d1, d2, d3
|
||||
|
||||
vmull.s32 q5, d0, d0
|
||||
vst1.32 {d1[0]}, [r12] ;store sse
|
||||
vshr.u32 d10, d10, #6
|
||||
vsub.u32 d0, d1, d10
|
||||
|
||||
vmov.32 r0, d0[0] ;return
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
END
|
|
@ -1,320 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include "vpx_ports/mem.h"
|
||||
|
||||
unsigned int vp8_variance16x16_neon(
|
||||
const unsigned char *src_ptr,
|
||||
int source_stride,
|
||||
const unsigned char *ref_ptr,
|
||||
int recon_stride,
|
||||
unsigned int *sse) {
|
||||
int i;
|
||||
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
|
||||
uint32x2_t d0u32, d10u32;
|
||||
int64x1_t d0s64, d1s64;
|
||||
uint8x16_t q0u8, q1u8, q2u8, q3u8;
|
||||
uint16x8_t q11u16, q12u16, q13u16, q14u16;
|
||||
int32x4_t q8s32, q9s32, q10s32;
|
||||
int64x2_t q0s64, q1s64, q5s64;
|
||||
|
||||
q8s32 = vdupq_n_s32(0);
|
||||
q9s32 = vdupq_n_s32(0);
|
||||
q10s32 = vdupq_n_s32(0);
|
||||
|
||||
for (i = 0; i < 8; i++) {
|
||||
q0u8 = vld1q_u8(src_ptr);
|
||||
src_ptr += source_stride;
|
||||
q1u8 = vld1q_u8(src_ptr);
|
||||
src_ptr += source_stride;
|
||||
__builtin_prefetch(src_ptr);
|
||||
|
||||
q2u8 = vld1q_u8(ref_ptr);
|
||||
ref_ptr += recon_stride;
|
||||
q3u8 = vld1q_u8(ref_ptr);
|
||||
ref_ptr += recon_stride;
|
||||
__builtin_prefetch(ref_ptr);
|
||||
|
||||
q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
|
||||
q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
|
||||
q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
|
||||
q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
|
||||
|
||||
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
|
||||
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
|
||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
|
||||
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
|
||||
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
|
||||
|
||||
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
|
||||
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
|
||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
|
||||
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
|
||||
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
|
||||
|
||||
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
|
||||
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
|
||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
|
||||
q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
|
||||
q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
|
||||
|
||||
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
|
||||
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
|
||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
|
||||
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
|
||||
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
|
||||
}
|
||||
|
||||
q10s32 = vaddq_s32(q10s32, q9s32);
|
||||
q0s64 = vpaddlq_s32(q8s32);
|
||||
q1s64 = vpaddlq_s32(q10s32);
|
||||
|
||||
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
|
||||
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
|
||||
|
||||
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
|
||||
vreinterpret_s32_s64(d0s64));
|
||||
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
|
||||
|
||||
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
|
||||
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
|
||||
|
||||
return vget_lane_u32(d0u32, 0);
|
||||
}
|
||||
|
||||
unsigned int vp8_variance16x8_neon(
|
||||
const unsigned char *src_ptr,
|
||||
int source_stride,
|
||||
const unsigned char *ref_ptr,
|
||||
int recon_stride,
|
||||
unsigned int *sse) {
|
||||
int i;
|
||||
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
|
||||
uint32x2_t d0u32, d10u32;
|
||||
int64x1_t d0s64, d1s64;
|
||||
uint8x16_t q0u8, q1u8, q2u8, q3u8;
|
||||
uint16x8_t q11u16, q12u16, q13u16, q14u16;
|
||||
int32x4_t q8s32, q9s32, q10s32;
|
||||
int64x2_t q0s64, q1s64, q5s64;
|
||||
|
||||
q8s32 = vdupq_n_s32(0);
|
||||
q9s32 = vdupq_n_s32(0);
|
||||
q10s32 = vdupq_n_s32(0);
|
||||
|
||||
for (i = 0; i < 4; i++) { // variance16x8_neon_loop
|
||||
q0u8 = vld1q_u8(src_ptr);
|
||||
src_ptr += source_stride;
|
||||
q1u8 = vld1q_u8(src_ptr);
|
||||
src_ptr += source_stride;
|
||||
__builtin_prefetch(src_ptr);
|
||||
|
||||
q2u8 = vld1q_u8(ref_ptr);
|
||||
ref_ptr += recon_stride;
|
||||
q3u8 = vld1q_u8(ref_ptr);
|
||||
ref_ptr += recon_stride;
|
||||
__builtin_prefetch(ref_ptr);
|
||||
|
||||
q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
|
||||
q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
|
||||
q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
|
||||
q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
|
||||
|
||||
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
|
||||
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
|
||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
|
||||
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
|
||||
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
|
||||
|
||||
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
|
||||
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
|
||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
|
||||
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
|
||||
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
|
||||
|
||||
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
|
||||
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
|
||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
|
||||
q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
|
||||
q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
|
||||
|
||||
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
|
||||
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
|
||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
|
||||
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
|
||||
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
|
||||
}
|
||||
|
||||
q10s32 = vaddq_s32(q10s32, q9s32);
|
||||
q0s64 = vpaddlq_s32(q8s32);
|
||||
q1s64 = vpaddlq_s32(q10s32);
|
||||
|
||||
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
|
||||
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
|
||||
|
||||
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
|
||||
vreinterpret_s32_s64(d0s64));
|
||||
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
|
||||
|
||||
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
|
||||
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
|
||||
|
||||
return vget_lane_u32(d0u32, 0);
|
||||
}
|
||||
|
||||
unsigned int vp8_variance8x16_neon(
|
||||
const unsigned char *src_ptr,
|
||||
int source_stride,
|
||||
const unsigned char *ref_ptr,
|
||||
int recon_stride,
|
||||
unsigned int *sse) {
|
||||
int i;
|
||||
uint8x8_t d0u8, d2u8, d4u8, d6u8;
|
||||
int16x4_t d22s16, d23s16, d24s16, d25s16;
|
||||
uint32x2_t d0u32, d10u32;
|
||||
int64x1_t d0s64, d1s64;
|
||||
uint16x8_t q11u16, q12u16;
|
||||
int32x4_t q8s32, q9s32, q10s32;
|
||||
int64x2_t q0s64, q1s64, q5s64;
|
||||
|
||||
q8s32 = vdupq_n_s32(0);
|
||||
q9s32 = vdupq_n_s32(0);
|
||||
q10s32 = vdupq_n_s32(0);
|
||||
|
||||
for (i = 0; i < 8; i++) { // variance8x16_neon_loop
|
||||
d0u8 = vld1_u8(src_ptr);
|
||||
src_ptr += source_stride;
|
||||
d2u8 = vld1_u8(src_ptr);
|
||||
src_ptr += source_stride;
|
||||
__builtin_prefetch(src_ptr);
|
||||
|
||||
d4u8 = vld1_u8(ref_ptr);
|
||||
ref_ptr += recon_stride;
|
||||
d6u8 = vld1_u8(ref_ptr);
|
||||
ref_ptr += recon_stride;
|
||||
__builtin_prefetch(ref_ptr);
|
||||
|
||||
q11u16 = vsubl_u8(d0u8, d4u8);
|
||||
q12u16 = vsubl_u8(d2u8, d6u8);
|
||||
|
||||
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
|
||||
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
|
||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
|
||||
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
|
||||
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
|
||||
|
||||
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
|
||||
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
|
||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
|
||||
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
|
||||
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
|
||||
}
|
||||
|
||||
q10s32 = vaddq_s32(q10s32, q9s32);
|
||||
q0s64 = vpaddlq_s32(q8s32);
|
||||
q1s64 = vpaddlq_s32(q10s32);
|
||||
|
||||
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
|
||||
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
|
||||
|
||||
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
|
||||
vreinterpret_s32_s64(d0s64));
|
||||
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
|
||||
|
||||
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
|
||||
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
|
||||
|
||||
return vget_lane_u32(d0u32, 0);
|
||||
}
|
||||
|
||||
unsigned int vp8_variance8x8_neon(
|
||||
const unsigned char *src_ptr,
|
||||
int source_stride,
|
||||
const unsigned char *ref_ptr,
|
||||
int recon_stride,
|
||||
unsigned int *sse) {
|
||||
int i;
|
||||
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
|
||||
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
|
||||
uint32x2_t d0u32, d10u32;
|
||||
int64x1_t d0s64, d1s64;
|
||||
uint16x8_t q11u16, q12u16, q13u16, q14u16;
|
||||
int32x4_t q8s32, q9s32, q10s32;
|
||||
int64x2_t q0s64, q1s64, q5s64;
|
||||
|
||||
q8s32 = vdupq_n_s32(0);
|
||||
q9s32 = vdupq_n_s32(0);
|
||||
q10s32 = vdupq_n_s32(0);
|
||||
|
||||
for (i = 0; i < 2; i++) { // variance8x8_neon_loop
|
||||
d0u8 = vld1_u8(src_ptr);
|
||||
src_ptr += source_stride;
|
||||
d1u8 = vld1_u8(src_ptr);
|
||||
src_ptr += source_stride;
|
||||
d2u8 = vld1_u8(src_ptr);
|
||||
src_ptr += source_stride;
|
||||
d3u8 = vld1_u8(src_ptr);
|
||||
src_ptr += source_stride;
|
||||
|
||||
d4u8 = vld1_u8(ref_ptr);
|
||||
ref_ptr += recon_stride;
|
||||
d5u8 = vld1_u8(ref_ptr);
|
||||
ref_ptr += recon_stride;
|
||||
d6u8 = vld1_u8(ref_ptr);
|
||||
ref_ptr += recon_stride;
|
||||
d7u8 = vld1_u8(ref_ptr);
|
||||
ref_ptr += recon_stride;
|
||||
|
||||
q11u16 = vsubl_u8(d0u8, d4u8);
|
||||
q12u16 = vsubl_u8(d1u8, d5u8);
|
||||
q13u16 = vsubl_u8(d2u8, d6u8);
|
||||
q14u16 = vsubl_u8(d3u8, d7u8);
|
||||
|
||||
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
|
||||
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
|
||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
|
||||
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
|
||||
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
|
||||
|
||||
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
|
||||
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
|
||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
|
||||
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
|
||||
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
|
||||
|
||||
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
|
||||
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
|
||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
|
||||
q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
|
||||
q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
|
||||
|
||||
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
|
||||
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
|
||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
|
||||
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
|
||||
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
|
||||
}
|
||||
|
||||
q10s32 = vaddq_s32(q10s32, q9s32);
|
||||
q0s64 = vpaddlq_s32(q8s32);
|
||||
q1s64 = vpaddlq_s32(q10s32);
|
||||
|
||||
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
|
||||
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
|
||||
|
||||
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
|
||||
vreinterpret_s32_s64(d0s64));
|
||||
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
|
||||
|
||||
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 6);
|
||||
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
|
||||
|
||||
return vget_lane_u32(d0u32, 0);
|
||||
}
|
|
@ -0,0 +1,423 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
;-----------------
|
||||
|
||||
EXPORT |vp8_sub_pixel_variance16x16_neon_func|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
; r0 unsigned char *src_ptr,
|
||||
; r1 int src_pixels_per_line,
|
||||
; r2 int xoffset,
|
||||
; r3 int yoffset,
|
||||
; stack(r4) unsigned char *dst_ptr,
|
||||
; stack(r5) int dst_pixels_per_line,
|
||||
; stack(r6) unsigned int *sse
|
||||
;note: most of the code is copied from bilinear_predict16x16_neon and vp8_variance16x16_neon.
|
||||
|
||||
bilinear_taps_coeff
|
||||
DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
|
||||
|
||||
|vp8_sub_pixel_variance16x16_neon_func| PROC
|
||||
push {r4-r6, lr}
|
||||
|
||||
adr r12, bilinear_taps_coeff
|
||||
ldr r4, [sp, #16] ;load *dst_ptr from stack
|
||||
ldr r5, [sp, #20] ;load dst_pixels_per_line from stack
|
||||
ldr r6, [sp, #24] ;load *sse from stack
|
||||
|
||||
cmp r2, #0 ;skip first_pass filter if xoffset=0
|
||||
beq secondpass_bfilter16x16_only
|
||||
|
||||
add r2, r12, r2, lsl #3 ;calculate filter location
|
||||
|
||||
cmp r3, #0 ;skip second_pass filter if yoffset=0
|
||||
|
||||
vld1.s32 {d31}, [r2] ;load first_pass filter
|
||||
|
||||
beq firstpass_bfilter16x16_only
|
||||
|
||||
sub sp, sp, #272 ;reserve space on stack for temporary storage
|
||||
vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
|
||||
mov lr, sp
|
||||
vld1.u8 {d5, d6, d7}, [r0], r1
|
||||
|
||||
mov r2, #3 ;loop counter
|
||||
vld1.u8 {d8, d9, d10}, [r0], r1
|
||||
|
||||
vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
|
||||
vld1.u8 {d11, d12, d13}, [r0], r1
|
||||
|
||||
vdup.8 d1, d31[4]
|
||||
|
||||
;First Pass: output_height lines x output_width columns (17x16)
|
||||
vp8e_filt_blk2d_fp16x16_loop_neon
|
||||
pld [r0]
|
||||
pld [r0, r1]
|
||||
pld [r0, r1, lsl #1]
|
||||
|
||||
vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0])
|
||||
vmull.u8 q8, d3, d0
|
||||
vmull.u8 q9, d5, d0
|
||||
vmull.u8 q10, d6, d0
|
||||
vmull.u8 q11, d8, d0
|
||||
vmull.u8 q12, d9, d0
|
||||
vmull.u8 q13, d11, d0
|
||||
vmull.u8 q14, d12, d0
|
||||
|
||||
vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
|
||||
vext.8 d5, d5, d6, #1
|
||||
vext.8 d8, d8, d9, #1
|
||||
vext.8 d11, d11, d12, #1
|
||||
|
||||
vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1])
|
||||
vmlal.u8 q9, d5, d1
|
||||
vmlal.u8 q11, d8, d1
|
||||
vmlal.u8 q13, d11, d1
|
||||
|
||||
vext.8 d3, d3, d4, #1
|
||||
vext.8 d6, d6, d7, #1
|
||||
vext.8 d9, d9, d10, #1
|
||||
vext.8 d12, d12, d13, #1
|
||||
|
||||
vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1])
|
||||
vmlal.u8 q10, d6, d1
|
||||
vmlal.u8 q12, d9, d1
|
||||
vmlal.u8 q14, d12, d1
|
||||
|
||||
subs r2, r2, #1
|
||||
|
||||
vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrn.u16 d15, q8, #7
|
||||
vqrshrn.u16 d16, q9, #7
|
||||
vqrshrn.u16 d17, q10, #7
|
||||
vqrshrn.u16 d18, q11, #7
|
||||
vqrshrn.u16 d19, q12, #7
|
||||
vqrshrn.u16 d20, q13, #7
|
||||
|
||||
vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
|
||||
vqrshrn.u16 d21, q14, #7
|
||||
vld1.u8 {d5, d6, d7}, [r0], r1
|
||||
|
||||
vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result
|
||||
vld1.u8 {d8, d9, d10}, [r0], r1
|
||||
vst1.u8 {d18, d19, d20, d21}, [lr]!
|
||||
vld1.u8 {d11, d12, d13}, [r0], r1
|
||||
|
||||
bne vp8e_filt_blk2d_fp16x16_loop_neon
|
||||
|
||||
;First-pass filtering for rest 5 lines
|
||||
vld1.u8 {d14, d15, d16}, [r0], r1
|
||||
|
||||
vmull.u8 q9, d2, d0 ;(src_ptr[0] * Filter[0])
|
||||
vmull.u8 q10, d3, d0
|
||||
vmull.u8 q11, d5, d0
|
||||
vmull.u8 q12, d6, d0
|
||||
vmull.u8 q13, d8, d0
|
||||
vmull.u8 q14, d9, d0
|
||||
|
||||
vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
|
||||
vext.8 d5, d5, d6, #1
|
||||
vext.8 d8, d8, d9, #1
|
||||
|
||||
vmlal.u8 q9, d2, d1 ;(src_ptr[0] * Filter[1])
|
||||
vmlal.u8 q11, d5, d1
|
||||
vmlal.u8 q13, d8, d1
|
||||
|
||||
vext.8 d3, d3, d4, #1
|
||||
vext.8 d6, d6, d7, #1
|
||||
vext.8 d9, d9, d10, #1
|
||||
|
||||
vmlal.u8 q10, d3, d1 ;(src_ptr[0] * Filter[1])
|
||||
vmlal.u8 q12, d6, d1
|
||||
vmlal.u8 q14, d9, d1
|
||||
|
||||
vmull.u8 q1, d11, d0
|
||||
vmull.u8 q2, d12, d0
|
||||
vmull.u8 q3, d14, d0
|
||||
vmull.u8 q4, d15, d0
|
||||
|
||||
vext.8 d11, d11, d12, #1 ;construct src_ptr[1]
|
||||
vext.8 d14, d14, d15, #1
|
||||
|
||||
vmlal.u8 q1, d11, d1 ;(src_ptr[0] * Filter[1])
|
||||
vmlal.u8 q3, d14, d1
|
||||
|
||||
vext.8 d12, d12, d13, #1
|
||||
vext.8 d15, d15, d16, #1
|
||||
|
||||
vmlal.u8 q2, d12, d1 ;(src_ptr[0] * Filter[1])
|
||||
vmlal.u8 q4, d15, d1
|
||||
|
||||
vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8
|
||||
vqrshrn.u16 d11, q10, #7
|
||||
vqrshrn.u16 d12, q11, #7
|
||||
vqrshrn.u16 d13, q12, #7
|
||||
vqrshrn.u16 d14, q13, #7
|
||||
vqrshrn.u16 d15, q14, #7
|
||||
vqrshrn.u16 d16, q1, #7
|
||||
vqrshrn.u16 d17, q2, #7
|
||||
vqrshrn.u16 d18, q3, #7
|
||||
vqrshrn.u16 d19, q4, #7
|
||||
|
||||
vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result
|
||||
vst1.u8 {d14, d15, d16, d17}, [lr]!
|
||||
vst1.u8 {d18, d19}, [lr]!
|
||||
|
||||
;Second pass: 16x16
|
||||
;secondpass_filter
|
||||
add r3, r12, r3, lsl #3
|
||||
sub lr, lr, #272
|
||||
|
||||
vld1.u32 {d31}, [r3] ;load second_pass filter
|
||||
|
||||
sub sp, sp, #256
|
||||
mov r3, sp
|
||||
|
||||
vld1.u8 {d22, d23}, [lr]! ;load src data
|
||||
|
||||
vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
|
||||
vdup.8 d1, d31[4]
|
||||
mov r12, #4 ;loop counter
|
||||
|
||||
vp8e_filt_blk2d_sp16x16_loop_neon
|
||||
vld1.u8 {d24, d25}, [lr]!
|
||||
vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
|
||||
vld1.u8 {d26, d27}, [lr]!
|
||||
vmull.u8 q2, d23, d0
|
||||
vld1.u8 {d28, d29}, [lr]!
|
||||
vmull.u8 q3, d24, d0
|
||||
vld1.u8 {d30, d31}, [lr]!
|
||||
|
||||
vmull.u8 q4, d25, d0
|
||||
vmull.u8 q5, d26, d0
|
||||
vmull.u8 q6, d27, d0
|
||||
vmull.u8 q7, d28, d0
|
||||
vmull.u8 q8, d29, d0
|
||||
|
||||
vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1])
|
||||
vmlal.u8 q2, d25, d1
|
||||
vmlal.u8 q3, d26, d1
|
||||
vmlal.u8 q4, d27, d1
|
||||
vmlal.u8 q5, d28, d1
|
||||
vmlal.u8 q6, d29, d1
|
||||
vmlal.u8 q7, d30, d1
|
||||
vmlal.u8 q8, d31, d1
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
|
||||
vqrshrn.u16 d3, q2, #7
|
||||
vqrshrn.u16 d4, q3, #7
|
||||
vqrshrn.u16 d5, q4, #7
|
||||
vqrshrn.u16 d6, q5, #7
|
||||
vqrshrn.u16 d7, q6, #7
|
||||
vqrshrn.u16 d8, q7, #7
|
||||
vqrshrn.u16 d9, q8, #7
|
||||
|
||||
vst1.u8 {d2, d3}, [r3]! ;store result
|
||||
vst1.u8 {d4, d5}, [r3]!
|
||||
vst1.u8 {d6, d7}, [r3]!
|
||||
vmov q11, q15
|
||||
vst1.u8 {d8, d9}, [r3]!
|
||||
|
||||
bne vp8e_filt_blk2d_sp16x16_loop_neon
|
||||
|
||||
b sub_pixel_variance16x16_neon
|
||||
|
||||
;--------------------
|
||||
firstpass_bfilter16x16_only
|
||||
mov r2, #4 ;loop counter
|
||||
sub sp, sp, #528 ;reserve space on stack for temporary storage
|
||||
vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
|
||||
vdup.8 d1, d31[4]
|
||||
mov r3, sp
|
||||
|
||||
;First Pass: output_height lines x output_width columns (16x16)
|
||||
vp8e_filt_blk2d_fpo16x16_loop_neon
|
||||
vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
|
||||
vld1.u8 {d5, d6, d7}, [r0], r1
|
||||
vld1.u8 {d8, d9, d10}, [r0], r1
|
||||
vld1.u8 {d11, d12, d13}, [r0], r1
|
||||
|
||||
pld [r0]
|
||||
pld [r0, r1]
|
||||
pld [r0, r1, lsl #1]
|
||||
|
||||
vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0])
|
||||
vmull.u8 q8, d3, d0
|
||||
vmull.u8 q9, d5, d0
|
||||
vmull.u8 q10, d6, d0
|
||||
vmull.u8 q11, d8, d0
|
||||
vmull.u8 q12, d9, d0
|
||||
vmull.u8 q13, d11, d0
|
||||
vmull.u8 q14, d12, d0
|
||||
|
||||
vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
|
||||
vext.8 d5, d5, d6, #1
|
||||
vext.8 d8, d8, d9, #1
|
||||
vext.8 d11, d11, d12, #1
|
||||
|
||||
vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1])
|
||||
vmlal.u8 q9, d5, d1
|
||||
vmlal.u8 q11, d8, d1
|
||||
vmlal.u8 q13, d11, d1
|
||||
|
||||
vext.8 d3, d3, d4, #1
|
||||
vext.8 d6, d6, d7, #1
|
||||
vext.8 d9, d9, d10, #1
|
||||
vext.8 d12, d12, d13, #1
|
||||
|
||||
vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1])
|
||||
vmlal.u8 q10, d6, d1
|
||||
vmlal.u8 q12, d9, d1
|
||||
vmlal.u8 q14, d12, d1
|
||||
|
||||
subs r2, r2, #1
|
||||
|
||||
vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
|
||||
vqrshrn.u16 d15, q8, #7
|
||||
vqrshrn.u16 d16, q9, #7
|
||||
vqrshrn.u16 d17, q10, #7
|
||||
vqrshrn.u16 d18, q11, #7
|
||||
vqrshrn.u16 d19, q12, #7
|
||||
vqrshrn.u16 d20, q13, #7
|
||||
vst1.u8 {d14, d15}, [r3]! ;store result
|
||||
vqrshrn.u16 d21, q14, #7
|
||||
|
||||
vst1.u8 {d16, d17}, [r3]!
|
||||
vst1.u8 {d18, d19}, [r3]!
|
||||
vst1.u8 {d20, d21}, [r3]!
|
||||
|
||||
bne vp8e_filt_blk2d_fpo16x16_loop_neon
|
||||
|
||||
b sub_pixel_variance16x16_neon
|
||||
|
||||
;---------------------
|
||||
secondpass_bfilter16x16_only
|
||||
;Second pass: 16x16
|
||||
;secondpass_filter
|
||||
sub sp, sp, #528 ;reserve space on stack for temporary storage
|
||||
add r3, r12, r3, lsl #3
|
||||
mov r12, #4 ;loop counter
|
||||
vld1.u32 {d31}, [r3] ;load second_pass filter
|
||||
vld1.u8 {d22, d23}, [r0], r1 ;load src data
|
||||
mov r3, sp
|
||||
|
||||
vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
|
||||
vdup.8 d1, d31[4]
|
||||
|
||||
vp8e_filt_blk2d_spo16x16_loop_neon
|
||||
vld1.u8 {d24, d25}, [r0], r1
|
||||
vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
|
||||
vld1.u8 {d26, d27}, [r0], r1
|
||||
vmull.u8 q2, d23, d0
|
||||
vld1.u8 {d28, d29}, [r0], r1
|
||||
vmull.u8 q3, d24, d0
|
||||
vld1.u8 {d30, d31}, [r0], r1
|
||||
|
||||
vmull.u8 q4, d25, d0
|
||||
vmull.u8 q5, d26, d0
|
||||
vmull.u8 q6, d27, d0
|
||||
vmull.u8 q7, d28, d0
|
||||
vmull.u8 q8, d29, d0
|
||||
|
||||
vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1])
|
||||
vmlal.u8 q2, d25, d1
|
||||
vmlal.u8 q3, d26, d1
|
||||
vmlal.u8 q4, d27, d1
|
||||
vmlal.u8 q5, d28, d1
|
||||
vmlal.u8 q6, d29, d1
|
||||
vmlal.u8 q7, d30, d1
|
||||
vmlal.u8 q8, d31, d1
|
||||
|
||||
vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
|
||||
vqrshrn.u16 d3, q2, #7
|
||||
vqrshrn.u16 d4, q3, #7
|
||||
vqrshrn.u16 d5, q4, #7
|
||||
vqrshrn.u16 d6, q5, #7
|
||||
vqrshrn.u16 d7, q6, #7
|
||||
vqrshrn.u16 d8, q7, #7
|
||||
vqrshrn.u16 d9, q8, #7
|
||||
|
||||
vst1.u8 {d2, d3}, [r3]! ;store result
|
||||
subs r12, r12, #1
|
||||
vst1.u8 {d4, d5}, [r3]!
|
||||
vmov q11, q15
|
||||
vst1.u8 {d6, d7}, [r3]!
|
||||
vst1.u8 {d8, d9}, [r3]!
|
||||
|
||||
bne vp8e_filt_blk2d_spo16x16_loop_neon
|
||||
|
||||
b sub_pixel_variance16x16_neon
|
||||
|
||||
;----------------------------
|
||||
;variance16x16
|
||||
sub_pixel_variance16x16_neon
|
||||
vmov.i8 q8, #0 ;q8 - sum
|
||||
vmov.i8 q9, #0 ;q9, q10 - sse
|
||||
vmov.i8 q10, #0
|
||||
|
||||
sub r3, r3, #256
|
||||
mov r12, #8
|
||||
|
||||
sub_pixel_variance16x16_neon_loop
|
||||
vld1.8 {q0}, [r3]! ;Load up source and reference
|
||||
vld1.8 {q2}, [r4], r5
|
||||
vld1.8 {q1}, [r3]!
|
||||
vld1.8 {q3}, [r4], r5
|
||||
|
||||
vsubl.u8 q11, d0, d4 ;diff
|
||||
vsubl.u8 q12, d1, d5
|
||||
vsubl.u8 q13, d2, d6
|
||||
vsubl.u8 q14, d3, d7
|
||||
|
||||
vpadal.s16 q8, q11 ;sum
|
||||
vmlal.s16 q9, d22, d22 ;sse
|
||||
vmlal.s16 q10, d23, d23
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
vpadal.s16 q8, q12
|
||||
vmlal.s16 q9, d24, d24
|
||||
vmlal.s16 q10, d25, d25
|
||||
vpadal.s16 q8, q13
|
||||
vmlal.s16 q9, d26, d26
|
||||
vmlal.s16 q10, d27, d27
|
||||
vpadal.s16 q8, q14
|
||||
vmlal.s16 q9, d28, d28
|
||||
vmlal.s16 q10, d29, d29
|
||||
|
||||
bne sub_pixel_variance16x16_neon_loop
|
||||
|
||||
vadd.u32 q10, q9, q10 ;accumulate sse
|
||||
vpaddl.s32 q0, q8 ;accumulate sum
|
||||
|
||||
vpaddl.u32 q1, q10
|
||||
vadd.s64 d0, d0, d1
|
||||
vadd.u64 d1, d2, d3
|
||||
|
||||
vmull.s32 q5, d0, d0
|
||||
vst1.32 {d1[0]}, [r6] ;store sse
|
||||
vshr.u32 d10, d10, #8
|
||||
vsub.u32 d0, d1, d10
|
||||
|
||||
add sp, sp, #528
|
||||
vmov.32 r0, d0[0] ;return
|
||||
|
||||
pop {r4-r6,pc}
|
||||
|
||||
ENDP
|
||||
|
||||
END
|
|
@ -0,0 +1,572 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_variance_halfpixvar16x16_h_neon|
|
||||
EXPORT |vp8_variance_halfpixvar16x16_v_neon|
|
||||
EXPORT |vp8_variance_halfpixvar16x16_hv_neon|
|
||||
EXPORT |vp8_sub_pixel_variance16x16s_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
;================================================
|
||||
;unsigned int vp8_variance_halfpixvar16x16_h_neon
|
||||
;(
|
||||
; unsigned char *src_ptr, r0
|
||||
; int src_pixels_per_line, r1
|
||||
; unsigned char *dst_ptr, r2
|
||||
; int dst_pixels_per_line, r3
|
||||
; unsigned int *sse
|
||||
;);
|
||||
;================================================
|
||||
|vp8_variance_halfpixvar16x16_h_neon| PROC
|
||||
push {lr}
|
||||
|
||||
mov r12, #4 ;loop counter
|
||||
ldr lr, [sp, #4] ;load *sse from stack
|
||||
vmov.i8 q8, #0 ;q8 - sum
|
||||
vmov.i8 q9, #0 ;q9, q10 - sse
|
||||
vmov.i8 q10, #0
|
||||
|
||||
;First Pass: output_height lines x output_width columns (16x16)
|
||||
vp8_filt_fpo16x16s_4_0_loop_neon
|
||||
vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
|
||||
vld1.8 {q11}, [r2], r3
|
||||
vld1.u8 {d4, d5, d6, d7}, [r0], r1
|
||||
vld1.8 {q12}, [r2], r3
|
||||
vld1.u8 {d8, d9, d10, d11}, [r0], r1
|
||||
vld1.8 {q13}, [r2], r3
|
||||
vld1.u8 {d12, d13, d14, d15}, [r0], r1
|
||||
|
||||
;pld [r0]
|
||||
;pld [r0, r1]
|
||||
;pld [r0, r1, lsl #1]
|
||||
|
||||
vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
|
||||
vext.8 q3, q2, q3, #1
|
||||
vext.8 q5, q4, q5, #1
|
||||
vext.8 q7, q6, q7, #1
|
||||
|
||||
vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
|
||||
vld1.8 {q14}, [r2], r3
|
||||
vrhadd.u8 q1, q2, q3
|
||||
vrhadd.u8 q2, q4, q5
|
||||
vrhadd.u8 q3, q6, q7
|
||||
|
||||
vsubl.u8 q4, d0, d22 ;diff
|
||||
vsubl.u8 q5, d1, d23
|
||||
vsubl.u8 q6, d2, d24
|
||||
vsubl.u8 q7, d3, d25
|
||||
vsubl.u8 q0, d4, d26
|
||||
vsubl.u8 q1, d5, d27
|
||||
vsubl.u8 q2, d6, d28
|
||||
vsubl.u8 q3, d7, d29
|
||||
|
||||
vpadal.s16 q8, q4 ;sum
|
||||
vmlal.s16 q9, d8, d8 ;sse
|
||||
vmlal.s16 q10, d9, d9
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
vpadal.s16 q8, q5
|
||||
vmlal.s16 q9, d10, d10
|
||||
vmlal.s16 q10, d11, d11
|
||||
vpadal.s16 q8, q6
|
||||
vmlal.s16 q9, d12, d12
|
||||
vmlal.s16 q10, d13, d13
|
||||
vpadal.s16 q8, q7
|
||||
vmlal.s16 q9, d14, d14
|
||||
vmlal.s16 q10, d15, d15
|
||||
|
||||
vpadal.s16 q8, q0 ;sum
|
||||
vmlal.s16 q9, d0, d0 ;sse
|
||||
vmlal.s16 q10, d1, d1
|
||||
vpadal.s16 q8, q1
|
||||
vmlal.s16 q9, d2, d2
|
||||
vmlal.s16 q10, d3, d3
|
||||
vpadal.s16 q8, q2
|
||||
vmlal.s16 q9, d4, d4
|
||||
vmlal.s16 q10, d5, d5
|
||||
vpadal.s16 q8, q3
|
||||
vmlal.s16 q9, d6, d6
|
||||
vmlal.s16 q10, d7, d7
|
||||
|
||||
bne vp8_filt_fpo16x16s_4_0_loop_neon
|
||||
|
||||
vadd.u32 q10, q9, q10 ;accumulate sse
|
||||
vpaddl.s32 q0, q8 ;accumulate sum
|
||||
|
||||
vpaddl.u32 q1, q10
|
||||
vadd.s64 d0, d0, d1
|
||||
vadd.u64 d1, d2, d3
|
||||
|
||||
vmull.s32 q5, d0, d0
|
||||
vst1.32 {d1[0]}, [lr] ;store sse
|
||||
vshr.u32 d10, d10, #8
|
||||
vsub.u32 d0, d1, d10
|
||||
|
||||
vmov.32 r0, d0[0] ;return
|
||||
pop {pc}
|
||||
ENDP
|
||||
|
||||
;================================================
|
||||
;unsigned int vp8_variance_halfpixvar16x16_v_neon
|
||||
;(
|
||||
; unsigned char *src_ptr, r0
|
||||
; int src_pixels_per_line, r1
|
||||
; unsigned char *dst_ptr, r2
|
||||
; int dst_pixels_per_line, r3
|
||||
; unsigned int *sse
|
||||
;);
|
||||
;================================================
|
||||
|vp8_variance_halfpixvar16x16_v_neon| PROC
|
||||
push {lr}
|
||||
|
||||
mov r12, #4 ;loop counter
|
||||
|
||||
vld1.u8 {q0}, [r0], r1 ;load src data
|
||||
ldr lr, [sp, #4] ;load *sse from stack
|
||||
|
||||
vmov.i8 q8, #0 ;q8 - sum
|
||||
vmov.i8 q9, #0 ;q9, q10 - sse
|
||||
vmov.i8 q10, #0
|
||||
|
||||
vp8_filt_spo16x16s_0_4_loop_neon
|
||||
vld1.u8 {q2}, [r0], r1
|
||||
vld1.8 {q1}, [r2], r3
|
||||
vld1.u8 {q4}, [r0], r1
|
||||
vld1.8 {q3}, [r2], r3
|
||||
vld1.u8 {q6}, [r0], r1
|
||||
vld1.8 {q5}, [r2], r3
|
||||
vld1.u8 {q15}, [r0], r1
|
||||
|
||||
vrhadd.u8 q0, q0, q2
|
||||
vld1.8 {q7}, [r2], r3
|
||||
vrhadd.u8 q2, q2, q4
|
||||
vrhadd.u8 q4, q4, q6
|
||||
vrhadd.u8 q6, q6, q15
|
||||
|
||||
vsubl.u8 q11, d0, d2 ;diff
|
||||
vsubl.u8 q12, d1, d3
|
||||
vsubl.u8 q13, d4, d6
|
||||
vsubl.u8 q14, d5, d7
|
||||
vsubl.u8 q0, d8, d10
|
||||
vsubl.u8 q1, d9, d11
|
||||
vsubl.u8 q2, d12, d14
|
||||
vsubl.u8 q3, d13, d15
|
||||
|
||||
vpadal.s16 q8, q11 ;sum
|
||||
vmlal.s16 q9, d22, d22 ;sse
|
||||
vmlal.s16 q10, d23, d23
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
vpadal.s16 q8, q12
|
||||
vmlal.s16 q9, d24, d24
|
||||
vmlal.s16 q10, d25, d25
|
||||
vpadal.s16 q8, q13
|
||||
vmlal.s16 q9, d26, d26
|
||||
vmlal.s16 q10, d27, d27
|
||||
vpadal.s16 q8, q14
|
||||
vmlal.s16 q9, d28, d28
|
||||
vmlal.s16 q10, d29, d29
|
||||
|
||||
vpadal.s16 q8, q0 ;sum
|
||||
vmlal.s16 q9, d0, d0 ;sse
|
||||
vmlal.s16 q10, d1, d1
|
||||
vpadal.s16 q8, q1
|
||||
vmlal.s16 q9, d2, d2
|
||||
vmlal.s16 q10, d3, d3
|
||||
vpadal.s16 q8, q2
|
||||
vmlal.s16 q9, d4, d4
|
||||
vmlal.s16 q10, d5, d5
|
||||
|
||||
vmov q0, q15
|
||||
|
||||
vpadal.s16 q8, q3
|
||||
vmlal.s16 q9, d6, d6
|
||||
vmlal.s16 q10, d7, d7
|
||||
|
||||
bne vp8_filt_spo16x16s_0_4_loop_neon
|
||||
|
||||
vadd.u32 q10, q9, q10 ;accumulate sse
|
||||
vpaddl.s32 q0, q8 ;accumulate sum
|
||||
|
||||
vpaddl.u32 q1, q10
|
||||
vadd.s64 d0, d0, d1
|
||||
vadd.u64 d1, d2, d3
|
||||
|
||||
vmull.s32 q5, d0, d0
|
||||
vst1.32 {d1[0]}, [lr] ;store sse
|
||||
vshr.u32 d10, d10, #8
|
||||
vsub.u32 d0, d1, d10
|
||||
|
||||
vmov.32 r0, d0[0] ;return
|
||||
pop {pc}
|
||||
ENDP
|
||||
|
||||
;================================================
|
||||
;unsigned int vp8_variance_halfpixvar16x16_hv_neon
|
||||
;(
|
||||
; unsigned char *src_ptr, r0
|
||||
; int src_pixels_per_line, r1
|
||||
; unsigned char *dst_ptr, r2
|
||||
; int dst_pixels_per_line, r3
|
||||
; unsigned int *sse
|
||||
;);
|
||||
;================================================
|
||||
|vp8_variance_halfpixvar16x16_hv_neon| PROC
|
||||
push {lr}
|
||||
|
||||
vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
|
||||
|
||||
ldr lr, [sp, #4] ;load *sse from stack
|
||||
vmov.i8 q13, #0 ;q8 - sum
|
||||
vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
|
||||
|
||||
vmov.i8 q14, #0 ;q9, q10 - sse
|
||||
vmov.i8 q15, #0
|
||||
|
||||
mov r12, #4 ;loop counter
|
||||
vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
|
||||
|
||||
;First Pass: output_height lines x output_width columns (17x16)
|
||||
vp8_filt16x16s_4_4_loop_neon
|
||||
vld1.u8 {d4, d5, d6, d7}, [r0], r1
|
||||
vld1.u8 {d8, d9, d10, d11}, [r0], r1
|
||||
vld1.u8 {d12, d13, d14, d15}, [r0], r1
|
||||
vld1.u8 {d16, d17, d18, d19}, [r0], r1
|
||||
|
||||
;pld [r0]
|
||||
;pld [r0, r1]
|
||||
;pld [r0, r1, lsl #1]
|
||||
|
||||
vext.8 q3, q2, q3, #1 ;construct src_ptr[1]
|
||||
vext.8 q5, q4, q5, #1
|
||||
vext.8 q7, q6, q7, #1
|
||||
vext.8 q9, q8, q9, #1
|
||||
|
||||
vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
|
||||
vrhadd.u8 q2, q4, q5
|
||||
vrhadd.u8 q3, q6, q7
|
||||
vrhadd.u8 q4, q8, q9
|
||||
|
||||
vld1.8 {q5}, [r2], r3
|
||||
vrhadd.u8 q0, q0, q1
|
||||
vld1.8 {q6}, [r2], r3
|
||||
vrhadd.u8 q1, q1, q2
|
||||
vld1.8 {q7}, [r2], r3
|
||||
vrhadd.u8 q2, q2, q3
|
||||
vld1.8 {q8}, [r2], r3
|
||||
vrhadd.u8 q3, q3, q4
|
||||
|
||||
vsubl.u8 q9, d0, d10 ;diff
|
||||
vsubl.u8 q10, d1, d11
|
||||
vsubl.u8 q11, d2, d12
|
||||
vsubl.u8 q12, d3, d13
|
||||
|
||||
vsubl.u8 q0, d4, d14 ;diff
|
||||
vsubl.u8 q1, d5, d15
|
||||
vsubl.u8 q5, d6, d16
|
||||
vsubl.u8 q6, d7, d17
|
||||
|
||||
vpadal.s16 q13, q9 ;sum
|
||||
vmlal.s16 q14, d18, d18 ;sse
|
||||
vmlal.s16 q15, d19, d19
|
||||
|
||||
vpadal.s16 q13, q10 ;sum
|
||||
vmlal.s16 q14, d20, d20 ;sse
|
||||
vmlal.s16 q15, d21, d21
|
||||
|
||||
vpadal.s16 q13, q11 ;sum
|
||||
vmlal.s16 q14, d22, d22 ;sse
|
||||
vmlal.s16 q15, d23, d23
|
||||
|
||||
vpadal.s16 q13, q12 ;sum
|
||||
vmlal.s16 q14, d24, d24 ;sse
|
||||
vmlal.s16 q15, d25, d25
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
vpadal.s16 q13, q0 ;sum
|
||||
vmlal.s16 q14, d0, d0 ;sse
|
||||
vmlal.s16 q15, d1, d1
|
||||
|
||||
vpadal.s16 q13, q1 ;sum
|
||||
vmlal.s16 q14, d2, d2 ;sse
|
||||
vmlal.s16 q15, d3, d3
|
||||
|
||||
vpadal.s16 q13, q5 ;sum
|
||||
vmlal.s16 q14, d10, d10 ;sse
|
||||
vmlal.s16 q15, d11, d11
|
||||
|
||||
vmov q0, q4
|
||||
|
||||
vpadal.s16 q13, q6 ;sum
|
||||
vmlal.s16 q14, d12, d12 ;sse
|
||||
vmlal.s16 q15, d13, d13
|
||||
|
||||
bne vp8_filt16x16s_4_4_loop_neon
|
||||
|
||||
vadd.u32 q15, q14, q15 ;accumulate sse
|
||||
vpaddl.s32 q0, q13 ;accumulate sum
|
||||
|
||||
vpaddl.u32 q1, q15
|
||||
vadd.s64 d0, d0, d1
|
||||
vadd.u64 d1, d2, d3
|
||||
|
||||
vmull.s32 q5, d0, d0
|
||||
vst1.32 {d1[0]}, [lr] ;store sse
|
||||
vshr.u32 d10, d10, #8
|
||||
vsub.u32 d0, d1, d10
|
||||
|
||||
vmov.32 r0, d0[0] ;return
|
||||
pop {pc}
|
||||
ENDP
|
||||
|
||||
;==============================
|
||||
; r0 unsigned char *src_ptr,
|
||||
; r1 int src_pixels_per_line,
|
||||
; r2 int xoffset,
|
||||
; r3 int yoffset,
|
||||
; stack unsigned char *dst_ptr,
|
||||
; stack int dst_pixels_per_line,
|
||||
; stack unsigned int *sse
|
||||
;note: in vp8_find_best_half_pixel_step()(called when 8<Speed<15), and first call of vp8_find_best_sub_pixel_step()
|
||||
;(called when speed<=8). xoffset/yoffset can only be 4 or 0, which means either by pass the filter,
|
||||
;or filter coeff is {64, 64}. This simplified program only works in this situation.
|
||||
;note: It happens that both xoffset and yoffset are zero. This can be handled in c code later.
|
||||
|
||||
|vp8_sub_pixel_variance16x16s_neon| PROC
|
||||
push {r4, lr}
|
||||
|
||||
ldr r4, [sp, #8] ;load *dst_ptr from stack
|
||||
ldr r12, [sp, #12] ;load dst_pixels_per_line from stack
|
||||
ldr lr, [sp, #16] ;load *sse from stack
|
||||
|
||||
cmp r2, #0 ;skip first_pass filter if xoffset=0
|
||||
beq secondpass_bfilter16x16s_only
|
||||
|
||||
cmp r3, #0 ;skip second_pass filter if yoffset=0
|
||||
beq firstpass_bfilter16x16s_only
|
||||
|
||||
vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
|
||||
sub sp, sp, #256 ;reserve space on stack for temporary storage
|
||||
vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
|
||||
mov r3, sp
|
||||
mov r2, #4 ;loop counter
|
||||
vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
|
||||
|
||||
;First Pass: output_height lines x output_width columns (17x16)
|
||||
vp8e_filt_blk2d_fp16x16s_loop_neon
|
||||
vld1.u8 {d4, d5, d6, d7}, [r0], r1
|
||||
vld1.u8 {d8, d9, d10, d11}, [r0], r1
|
||||
vld1.u8 {d12, d13, d14, d15}, [r0], r1
|
||||
vld1.u8 {d16, d17, d18, d19}, [r0], r1
|
||||
|
||||
;pld [r0]
|
||||
;pld [r0, r1]
|
||||
;pld [r0, r1, lsl #1]
|
||||
|
||||
vext.8 q3, q2, q3, #1 ;construct src_ptr[1]
|
||||
vext.8 q5, q4, q5, #1
|
||||
vext.8 q7, q6, q7, #1
|
||||
vext.8 q9, q8, q9, #1
|
||||
|
||||
vrhadd.u8 q1, q2, q3 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
|
||||
vrhadd.u8 q2, q4, q5
|
||||
vrhadd.u8 q3, q6, q7
|
||||
vrhadd.u8 q4, q8, q9
|
||||
|
||||
vrhadd.u8 q0, q0, q1
|
||||
vrhadd.u8 q1, q1, q2
|
||||
vrhadd.u8 q2, q2, q3
|
||||
vrhadd.u8 q3, q3, q4
|
||||
|
||||
subs r2, r2, #1
|
||||
vst1.u8 {d0, d1 ,d2, d3}, [r3]! ;store result
|
||||
vmov q0, q4
|
||||
vst1.u8 {d4, d5, d6, d7}, [r3]!
|
||||
|
||||
bne vp8e_filt_blk2d_fp16x16s_loop_neon
|
||||
|
||||
b sub_pixel_variance16x16s_neon
|
||||
|
||||
;--------------------
|
||||
firstpass_bfilter16x16s_only
|
||||
mov r2, #2 ;loop counter
|
||||
sub sp, sp, #256 ;reserve space on stack for temporary storage
|
||||
mov r3, sp
|
||||
|
||||
;First Pass: output_height lines x output_width columns (16x16)
|
||||
vp8e_filt_blk2d_fpo16x16s_loop_neon
|
||||
vld1.u8 {d0, d1, d2, d3}, [r0], r1 ;load src data
|
||||
vld1.u8 {d4, d5, d6, d7}, [r0], r1
|
||||
vld1.u8 {d8, d9, d10, d11}, [r0], r1
|
||||
vld1.u8 {d12, d13, d14, d15}, [r0], r1
|
||||
|
||||
;pld [r0]
|
||||
;pld [r0, r1]
|
||||
;pld [r0, r1, lsl #1]
|
||||
|
||||
vext.8 q1, q0, q1, #1 ;construct src_ptr[1]
|
||||
vld1.u8 {d16, d17, d18, d19}, [r0], r1
|
||||
vext.8 q3, q2, q3, #1
|
||||
vld1.u8 {d20, d21, d22, d23}, [r0], r1
|
||||
vext.8 q5, q4, q5, #1
|
||||
vld1.u8 {d24, d25, d26, d27}, [r0], r1
|
||||
vext.8 q7, q6, q7, #1
|
||||
vld1.u8 {d28, d29, d30, d31}, [r0], r1
|
||||
vext.8 q9, q8, q9, #1
|
||||
vext.8 q11, q10, q11, #1
|
||||
vext.8 q13, q12, q13, #1
|
||||
vext.8 q15, q14, q15, #1
|
||||
|
||||
vrhadd.u8 q0, q0, q1 ;(src_ptr[0]+src_ptr[1])/round/shift right 1
|
||||
vrhadd.u8 q1, q2, q3
|
||||
vrhadd.u8 q2, q4, q5
|
||||
vrhadd.u8 q3, q6, q7
|
||||
vrhadd.u8 q4, q8, q9
|
||||
vrhadd.u8 q5, q10, q11
|
||||
vrhadd.u8 q6, q12, q13
|
||||
vrhadd.u8 q7, q14, q15
|
||||
|
||||
subs r2, r2, #1
|
||||
|
||||
vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result
|
||||
vst1.u8 {d4, d5, d6, d7}, [r3]!
|
||||
vst1.u8 {d8, d9, d10, d11}, [r3]!
|
||||
vst1.u8 {d12, d13, d14, d15}, [r3]!
|
||||
|
||||
bne vp8e_filt_blk2d_fpo16x16s_loop_neon
|
||||
|
||||
b sub_pixel_variance16x16s_neon
|
||||
|
||||
;---------------------
|
||||
secondpass_bfilter16x16s_only
|
||||
sub sp, sp, #256 ;reserve space on stack for temporary storage
|
||||
|
||||
mov r2, #2 ;loop counter
|
||||
vld1.u8 {d0, d1}, [r0], r1 ;load src data
|
||||
mov r3, sp
|
||||
|
||||
vp8e_filt_blk2d_spo16x16s_loop_neon
|
||||
vld1.u8 {d2, d3}, [r0], r1
|
||||
vld1.u8 {d4, d5}, [r0], r1
|
||||
vld1.u8 {d6, d7}, [r0], r1
|
||||
vld1.u8 {d8, d9}, [r0], r1
|
||||
|
||||
vrhadd.u8 q0, q0, q1
|
||||
vld1.u8 {d10, d11}, [r0], r1
|
||||
vrhadd.u8 q1, q1, q2
|
||||
vld1.u8 {d12, d13}, [r0], r1
|
||||
vrhadd.u8 q2, q2, q3
|
||||
vld1.u8 {d14, d15}, [r0], r1
|
||||
vrhadd.u8 q3, q3, q4
|
||||
vld1.u8 {d16, d17}, [r0], r1
|
||||
vrhadd.u8 q4, q4, q5
|
||||
vrhadd.u8 q5, q5, q6
|
||||
vrhadd.u8 q6, q6, q7
|
||||
vrhadd.u8 q7, q7, q8
|
||||
|
||||
subs r2, r2, #1
|
||||
|
||||
vst1.u8 {d0, d1, d2, d3}, [r3]! ;store result
|
||||
vmov q0, q8
|
||||
vst1.u8 {d4, d5, d6, d7}, [r3]!
|
||||
vst1.u8 {d8, d9, d10, d11}, [r3]! ;store result
|
||||
vst1.u8 {d12, d13, d14, d15}, [r3]!
|
||||
|
||||
bne vp8e_filt_blk2d_spo16x16s_loop_neon
|
||||
|
||||
b sub_pixel_variance16x16s_neon
|
||||
|
||||
;----------------------------
|
||||
;variance16x16
|
||||
sub_pixel_variance16x16s_neon
|
||||
vmov.i8 q8, #0 ;q8 - sum
|
||||
vmov.i8 q9, #0 ;q9, q10 - sse
|
||||
vmov.i8 q10, #0
|
||||
|
||||
sub r3, r3, #256
|
||||
mov r2, #4
|
||||
|
||||
sub_pixel_variance16x16s_neon_loop
|
||||
vld1.8 {q0}, [r3]! ;Load up source and reference
|
||||
vld1.8 {q1}, [r4], r12
|
||||
vld1.8 {q2}, [r3]!
|
||||
vld1.8 {q3}, [r4], r12
|
||||
vld1.8 {q4}, [r3]!
|
||||
vld1.8 {q5}, [r4], r12
|
||||
vld1.8 {q6}, [r3]!
|
||||
vld1.8 {q7}, [r4], r12
|
||||
|
||||
vsubl.u8 q11, d0, d2 ;diff
|
||||
vsubl.u8 q12, d1, d3
|
||||
vsubl.u8 q13, d4, d6
|
||||
vsubl.u8 q14, d5, d7
|
||||
vsubl.u8 q0, d8, d10
|
||||
vsubl.u8 q1, d9, d11
|
||||
vsubl.u8 q2, d12, d14
|
||||
vsubl.u8 q3, d13, d15
|
||||
|
||||
vpadal.s16 q8, q11 ;sum
|
||||
vmlal.s16 q9, d22, d22 ;sse
|
||||
vmlal.s16 q10, d23, d23
|
||||
|
||||
subs r2, r2, #1
|
||||
|
||||
vpadal.s16 q8, q12
|
||||
vmlal.s16 q9, d24, d24
|
||||
vmlal.s16 q10, d25, d25
|
||||
vpadal.s16 q8, q13
|
||||
vmlal.s16 q9, d26, d26
|
||||
vmlal.s16 q10, d27, d27
|
||||
vpadal.s16 q8, q14
|
||||
vmlal.s16 q9, d28, d28
|
||||
vmlal.s16 q10, d29, d29
|
||||
|
||||
vpadal.s16 q8, q0 ;sum
|
||||
vmlal.s16 q9, d0, d0 ;sse
|
||||
vmlal.s16 q10, d1, d1
|
||||
vpadal.s16 q8, q1
|
||||
vmlal.s16 q9, d2, d2
|
||||
vmlal.s16 q10, d3, d3
|
||||
vpadal.s16 q8, q2
|
||||
vmlal.s16 q9, d4, d4
|
||||
vmlal.s16 q10, d5, d5
|
||||
vpadal.s16 q8, q3
|
||||
vmlal.s16 q9, d6, d6
|
||||
vmlal.s16 q10, d7, d7
|
||||
|
||||
bne sub_pixel_variance16x16s_neon_loop
|
||||
|
||||
vadd.u32 q10, q9, q10 ;accumulate sse
|
||||
vpaddl.s32 q0, q8 ;accumulate sum
|
||||
|
||||
vpaddl.u32 q1, q10
|
||||
vadd.s64 d0, d0, d1
|
||||
vadd.u64 d1, d2, d3
|
||||
|
||||
vmull.s32 q5, d0, d0
|
||||
vst1.32 {d1[0]}, [lr] ;store sse
|
||||
vshr.u32 d10, d10, #8
|
||||
vsub.u32 d0, d1, d10
|
||||
|
||||
add sp, sp, #256
|
||||
vmov.32 r0, d0[0] ;return
|
||||
|
||||
pop {r4, pc}
|
||||
ENDP
|
||||
|
||||
END
|
|
@ -0,0 +1,222 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_sub_pixel_variance8x8_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
; r0 unsigned char *src_ptr,
|
||||
; r1 int src_pixels_per_line,
|
||||
; r2 int xoffset,
|
||||
; r3 int yoffset,
|
||||
; stack(r4) unsigned char *dst_ptr,
|
||||
; stack(r5) int dst_pixels_per_line,
|
||||
; stack(r6) unsigned int *sse
|
||||
;note: most of the code is copied from bilinear_predict8x8_neon and vp8_variance8x8_neon.
|
||||
|
||||
|vp8_sub_pixel_variance8x8_neon| PROC
|
||||
push {r4-r5, lr}
|
||||
|
||||
adr r12, bilinear_taps_coeff
|
||||
ldr r4, [sp, #12] ;load *dst_ptr from stack
|
||||
ldr r5, [sp, #16] ;load dst_pixels_per_line from stack
|
||||
ldr lr, [sp, #20] ;load *sse from stack
|
||||
|
||||
cmp r2, #0 ;skip first_pass filter if xoffset=0
|
||||
beq skip_firstpass_filter
|
||||
|
||||
;First pass: output_height lines x output_width columns (9x8)
|
||||
add r2, r12, r2, lsl #3 ;calculate filter location
|
||||
|
||||
vld1.u8 {q1}, [r0], r1 ;load src data
|
||||
vld1.u32 {d31}, [r2] ;load first_pass filter
|
||||
vld1.u8 {q2}, [r0], r1
|
||||
vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
|
||||
vld1.u8 {q3}, [r0], r1
|
||||
vdup.8 d1, d31[4]
|
||||
vld1.u8 {q4}, [r0], r1
|
||||
|
||||
vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
|
||||
vmull.u8 q7, d4, d0
|
||||
vmull.u8 q8, d6, d0
|
||||
vmull.u8 q9, d8, d0
|
||||
|
||||
vext.8 d3, d2, d3, #1 ;construct src_ptr[-1]
|
||||
vext.8 d5, d4, d5, #1
|
||||
vext.8 d7, d6, d7, #1
|
||||
vext.8 d9, d8, d9, #1
|
||||
|
||||
vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
|
||||
vmlal.u8 q7, d5, d1
|
||||
vmlal.u8 q8, d7, d1
|
||||
vmlal.u8 q9, d9, d1
|
||||
|
||||
vld1.u8 {q1}, [r0], r1 ;load src data
|
||||
vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
|
||||
vld1.u8 {q2}, [r0], r1
|
||||
vqrshrn.u16 d23, q7, #7
|
||||
vld1.u8 {q3}, [r0], r1
|
||||
vqrshrn.u16 d24, q8, #7
|
||||
vld1.u8 {q4}, [r0], r1
|
||||
vqrshrn.u16 d25, q9, #7
|
||||
|
||||
;first_pass filtering on the rest 5-line data
|
||||
vld1.u8 {q5}, [r0], r1
|
||||
|
||||
vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
|
||||
vmull.u8 q7, d4, d0
|
||||
vmull.u8 q8, d6, d0
|
||||
vmull.u8 q9, d8, d0
|
||||
vmull.u8 q10, d10, d0
|
||||
|
||||
vext.8 d3, d2, d3, #1 ;construct src_ptr[-1]
|
||||
vext.8 d5, d4, d5, #1
|
||||
vext.8 d7, d6, d7, #1
|
||||
vext.8 d9, d8, d9, #1
|
||||
vext.8 d11, d10, d11, #1
|
||||
|
||||
vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
|
||||
vmlal.u8 q7, d5, d1
|
||||
vmlal.u8 q8, d7, d1
|
||||
vmlal.u8 q9, d9, d1
|
||||
vmlal.u8 q10, d11, d1
|
||||
|
||||
vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8
|
||||
vqrshrn.u16 d27, q7, #7
|
||||
vqrshrn.u16 d28, q8, #7
|
||||
vqrshrn.u16 d29, q9, #7
|
||||
vqrshrn.u16 d30, q10, #7
|
||||
|
||||
;Second pass: 8x8
|
||||
secondpass_filter
|
||||
cmp r3, #0 ;skip second_pass filter if yoffset=0
|
||||
;skip_secondpass_filter
|
||||
beq sub_pixel_variance8x8_neon
|
||||
|
||||
add r3, r12, r3, lsl #3
|
||||
|
||||
vld1.u32 {d31}, [r3] ;load second_pass filter
|
||||
|
||||
vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
|
||||
vdup.8 d1, d31[4]
|
||||
|
||||
vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
|
||||
vmull.u8 q2, d23, d0
|
||||
vmull.u8 q3, d24, d0
|
||||
vmull.u8 q4, d25, d0
|
||||
vmull.u8 q5, d26, d0
|
||||
vmull.u8 q6, d27, d0
|
||||
vmull.u8 q7, d28, d0
|
||||
vmull.u8 q8, d29, d0
|
||||
|
||||
vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * Filter[1])
|
||||
vmlal.u8 q2, d24, d1
|
||||
vmlal.u8 q3, d25, d1
|
||||
vmlal.u8 q4, d26, d1
|
||||
vmlal.u8 q5, d27, d1
|
||||
vmlal.u8 q6, d28, d1
|
||||
vmlal.u8 q7, d29, d1
|
||||
vmlal.u8 q8, d30, d1
|
||||
|
||||
vqrshrn.u16 d22, q1, #7 ;shift/round/saturate to u8
|
||||
vqrshrn.u16 d23, q2, #7
|
||||
vqrshrn.u16 d24, q3, #7
|
||||
vqrshrn.u16 d25, q4, #7
|
||||
vqrshrn.u16 d26, q5, #7
|
||||
vqrshrn.u16 d27, q6, #7
|
||||
vqrshrn.u16 d28, q7, #7
|
||||
vqrshrn.u16 d29, q8, #7
|
||||
|
||||
b sub_pixel_variance8x8_neon
|
||||
|
||||
;--------------------
|
||||
skip_firstpass_filter
|
||||
vld1.u8 {d22}, [r0], r1 ;load src data
|
||||
vld1.u8 {d23}, [r0], r1
|
||||
vld1.u8 {d24}, [r0], r1
|
||||
vld1.u8 {d25}, [r0], r1
|
||||
vld1.u8 {d26}, [r0], r1
|
||||
vld1.u8 {d27}, [r0], r1
|
||||
vld1.u8 {d28}, [r0], r1
|
||||
vld1.u8 {d29}, [r0], r1
|
||||
vld1.u8 {d30}, [r0], r1
|
||||
|
||||
b secondpass_filter
|
||||
|
||||
;----------------------
|
||||
;vp8_variance8x8_neon
|
||||
sub_pixel_variance8x8_neon
|
||||
vmov.i8 q8, #0 ;q8 - sum
|
||||
vmov.i8 q9, #0 ;q9, q10 - sse
|
||||
vmov.i8 q10, #0
|
||||
|
||||
mov r12, #2
|
||||
|
||||
sub_pixel_variance8x8_neon_loop
|
||||
vld1.8 {d0}, [r4], r5 ;load dst data
|
||||
subs r12, r12, #1
|
||||
vld1.8 {d1}, [r4], r5
|
||||
vld1.8 {d2}, [r4], r5
|
||||
vsubl.u8 q4, d22, d0 ;calculate diff
|
||||
vld1.8 {d3}, [r4], r5
|
||||
|
||||
vsubl.u8 q5, d23, d1
|
||||
vsubl.u8 q6, d24, d2
|
||||
|
||||
vpadal.s16 q8, q4 ;sum
|
||||
vmlal.s16 q9, d8, d8 ;sse
|
||||
vmlal.s16 q10, d9, d9
|
||||
|
||||
vsubl.u8 q7, d25, d3
|
||||
|
||||
vpadal.s16 q8, q5
|
||||
vmlal.s16 q9, d10, d10
|
||||
vmlal.s16 q10, d11, d11
|
||||
|
||||
vmov q11, q13
|
||||
|
||||
vpadal.s16 q8, q6
|
||||
vmlal.s16 q9, d12, d12
|
||||
vmlal.s16 q10, d13, d13
|
||||
|
||||
vmov q12, q14
|
||||
|
||||
vpadal.s16 q8, q7
|
||||
vmlal.s16 q9, d14, d14
|
||||
vmlal.s16 q10, d15, d15
|
||||
|
||||
bne sub_pixel_variance8x8_neon_loop
|
||||
|
||||
vadd.u32 q10, q9, q10 ;accumulate sse
|
||||
vpaddl.s32 q0, q8 ;accumulate sum
|
||||
|
||||
vpaddl.u32 q1, q10
|
||||
vadd.s64 d0, d0, d1
|
||||
vadd.u64 d1, d2, d3
|
||||
|
||||
vmull.s32 q5, d0, d0
|
||||
vst1.32 {d1[0]}, [lr] ;store sse
|
||||
vshr.u32 d10, d10, #6
|
||||
vsub.u32 d0, d1, d10
|
||||
|
||||
vmov.32 r0, d0[0] ;return
|
||||
pop {r4-r5, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
;-----------------
|
||||
|
||||
bilinear_taps_coeff
|
||||
DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
|
||||
|
||||
END
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -0,0 +1,58 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
|
||||
#include "vpx_config.h"
|
||||
#include "vp8_rtcd.h"
|
||||
#include "vp8/common/blockd.h"
|
||||
#include "vpx_mem/vpx_mem.h"
|
||||
|
||||
#if HAVE_NEON
|
||||
extern void vp8_build_intra_predictors_mby_neon_func(
|
||||
unsigned char *y_buffer,
|
||||
unsigned char *ypred_ptr,
|
||||
int y_stride,
|
||||
int mode,
|
||||
int Up,
|
||||
int Left);
|
||||
|
||||
void vp8_build_intra_predictors_mby_neon(MACROBLOCKD *x)
|
||||
{
|
||||
unsigned char *y_buffer = x->dst.y_buffer;
|
||||
unsigned char *ypred_ptr = x->predictor;
|
||||
int y_stride = x->dst.y_stride;
|
||||
int mode = x->mode_info_context->mbmi.mode;
|
||||
int Up = x->up_available;
|
||||
int Left = x->left_available;
|
||||
|
||||
vp8_build_intra_predictors_mby_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left);
|
||||
}
|
||||
|
||||
extern void vp8_build_intra_predictors_mby_s_neon_func(
|
||||
unsigned char *y_buffer,
|
||||
unsigned char *ypred_ptr,
|
||||
int y_stride,
|
||||
int mode,
|
||||
int Up,
|
||||
int Left);
|
||||
|
||||
void vp8_build_intra_predictors_mby_s_neon(MACROBLOCKD *x)
|
||||
{
|
||||
unsigned char *y_buffer = x->dst.y_buffer;
|
||||
unsigned char *ypred_ptr = x->predictor;
|
||||
int y_stride = x->dst.y_stride;
|
||||
int mode = x->mode_info_context->mbmi.mode;
|
||||
int Up = x->up_available;
|
||||
int Left = x->left_available;
|
||||
|
||||
vp8_build_intra_predictors_mby_s_neon_func(y_buffer, ypred_ptr, y_stride, mode, Up, Left);
|
||||
}
|
||||
|
||||
#endif
|
|
@ -22,9 +22,6 @@
|
|||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
|
||||
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
|
||||
|
||||
/* Only need this for fixed-size arrays, for structs just assign. */
|
||||
|
||||
#define vp8_copy( Dest, Src) { \
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
#include "onyxc_int.h"
|
||||
#include "vpx_mem/vpx_mem.h"
|
||||
|
||||
typedef unsigned char uc;
|
||||
|
||||
static void lf_init_lut(loop_filter_info_n *lfi)
|
||||
{
|
||||
|
|
|
@ -104,18 +104,7 @@ extern "C"
|
|||
struct vpx_rational timebase;
|
||||
unsigned int target_bandwidth; /* kilobits per second */
|
||||
|
||||
/* Parameter used for applying denoiser.
|
||||
* For temporal denoiser: noise_sensitivity = 0 means off,
|
||||
* noise_sensitivity = 1 means temporal denoiser on for Y channel only,
|
||||
* noise_sensitivity = 2 means temporal denoiser on for all channels.
|
||||
* noise_sensitivity = 3 means aggressive denoising mode.
|
||||
* noise_sensitivity >= 4 means adaptive denoising mode.
|
||||
* Temporal denoiser is enabled via the configuration option:
|
||||
* CONFIG_TEMPORAL_DENOISING.
|
||||
* For spatial denoiser: noise_sensitivity controls the amount of
|
||||
* pre-processing blur: noise_sensitivity = 0 means off.
|
||||
* Spatial denoiser invoked under !CONFIG_TEMPORAL_DENOISING.
|
||||
*/
|
||||
/* parameter used for applying pre processing blur: recommendation 0 */
|
||||
int noise_sensitivity;
|
||||
|
||||
/* parameter used for sharpening output: recommendation 0: */
|
||||
|
@ -224,7 +213,7 @@ extern "C"
|
|||
int arnr_strength;
|
||||
int arnr_type;
|
||||
|
||||
vpx_fixed_buf_t two_pass_stats_in;
|
||||
struct vpx_fixed_buf two_pass_stats_in;
|
||||
struct vpx_codec_pkt_list *output_pkt_list;
|
||||
|
||||
vp8e_tuning tuning;
|
||||
|
|
|
@ -214,7 +214,6 @@ static int q2mbl(int x)
|
|||
x = 50 + (x - 50) * 10 / 8;
|
||||
return x * x / 3;
|
||||
}
|
||||
|
||||
void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit)
|
||||
{
|
||||
int r, c, i;
|
||||
|
@ -227,14 +226,14 @@ void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int co
|
|||
int sumsq = 0;
|
||||
int sum = 0;
|
||||
|
||||
for (i = -8; i < 0; i++)
|
||||
for (i = -8; i<0; i++)
|
||||
s[i]=s[0];
|
||||
|
||||
/* 17 avoids valgrind warning - we buffer values in c in d
|
||||
* and only write them when we've read 8 ahead...
|
||||
*/
|
||||
for (i = 0; i < 17; i++)
|
||||
s[i+cols]=s[cols-1];
|
||||
for (i = cols; i<cols+17; i++)
|
||||
s[i]=s[cols-1];
|
||||
|
||||
for (i = -8; i <= 6; i++)
|
||||
{
|
||||
|
@ -265,6 +264,7 @@ void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int co
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit)
|
||||
{
|
||||
int r, c, i;
|
||||
|
@ -284,8 +284,8 @@ void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, i
|
|||
/* 17 avoids valgrind warning - we buffer values in c in d
|
||||
* and only write them when we've read 8 ahead...
|
||||
*/
|
||||
for (i = 0; i < 17; i++)
|
||||
s[(i+rows)*pitch]=s[(rows-1)*pitch];
|
||||
for (i = rows; i < rows+17; i++)
|
||||
s[i*pitch]=s[(rows-1)*pitch];
|
||||
|
||||
for (i = -8; i <= 6; i++)
|
||||
{
|
||||
|
@ -303,14 +303,13 @@ void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, i
|
|||
{
|
||||
d[r&15] = (rv2[r&127] + sum + s[0]) >> 4;
|
||||
}
|
||||
if (r >= 8)
|
||||
s[-8*pitch] = d[(r-8)&15];
|
||||
|
||||
s[-8*pitch] = d[(r-8)&15];
|
||||
s += pitch;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#if CONFIG_POSTPROC
|
||||
static void vp8_de_mblock(YV12_BUFFER_CONFIG *post,
|
||||
int q)
|
||||
{
|
||||
|
@ -383,22 +382,21 @@ void vp8_deblock(VP8_COMMON *cm,
|
|||
vp8_yv12_copy_frame(source, post);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if !(CONFIG_TEMPORAL_DENOISING)
|
||||
void vp8_de_noise(VP8_COMMON *cm,
|
||||
YV12_BUFFER_CONFIG *source,
|
||||
YV12_BUFFER_CONFIG *post,
|
||||
int q,
|
||||
int low_var_thresh,
|
||||
int flag,
|
||||
int uvfilter)
|
||||
int flag)
|
||||
{
|
||||
int mbr;
|
||||
double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
|
||||
int ppl = (int)(level + .5);
|
||||
int mb_rows = cm->mb_rows;
|
||||
int mb_cols = cm->mb_cols;
|
||||
int mb_rows = source->y_width >> 4;
|
||||
int mb_cols = source->y_height >> 4;
|
||||
unsigned char *limits = cm->pp_limits_buffer;;
|
||||
int mbr, mbc;
|
||||
(void) post;
|
||||
(void) low_var_thresh;
|
||||
(void) flag;
|
||||
|
@ -412,20 +410,18 @@ void vp8_de_noise(VP8_COMMON *cm,
|
|||
source->y_buffer + 16 * mbr * source->y_stride,
|
||||
source->y_buffer + 16 * mbr * source->y_stride,
|
||||
source->y_stride, source->y_stride, source->y_width, limits, 16);
|
||||
if (uvfilter == 1) {
|
||||
vp8_post_proc_down_and_across_mb_row(
|
||||
source->u_buffer + 8 * mbr * source->uv_stride,
|
||||
source->u_buffer + 8 * mbr * source->uv_stride,
|
||||
source->uv_stride, source->uv_stride, source->uv_width, limits,
|
||||
8);
|
||||
vp8_post_proc_down_and_across_mb_row(
|
||||
source->v_buffer + 8 * mbr * source->uv_stride,
|
||||
source->v_buffer + 8 * mbr * source->uv_stride,
|
||||
source->uv_stride, source->uv_stride, source->uv_width, limits,
|
||||
8);
|
||||
}
|
||||
|
||||
vp8_post_proc_down_and_across_mb_row(
|
||||
source->u_buffer + 8 * mbr * source->uv_stride,
|
||||
source->u_buffer + 8 * mbr * source->uv_stride,
|
||||
source->uv_stride, source->uv_stride, source->uv_width, limits, 8);
|
||||
vp8_post_proc_down_and_across_mb_row(
|
||||
source->v_buffer + 8 * mbr * source->uv_stride,
|
||||
source->v_buffer + 8 * mbr * source->uv_stride,
|
||||
source->uv_stride, source->uv_stride, source->uv_width, limits, 8);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
double vp8_gaussian(double sigma, double mu, double x)
|
||||
{
|
||||
|
|
|
@ -39,8 +39,7 @@ void vp8_de_noise(struct VP8Common *oci,
|
|||
YV12_BUFFER_CONFIG *post,
|
||||
int q,
|
||||
int low_var_thresh,
|
||||
int flag,
|
||||
int uvfilter);
|
||||
int flag);
|
||||
|
||||
void vp8_deblock(struct VP8Common *oci,
|
||||
YV12_BUFFER_CONFIG *source,
|
||||
|
|
|
@ -0,0 +1,29 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#ifndef VP8_COMMON_PRAGMAS_H_
|
||||
#define VP8_COMMON_PRAGMAS_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifdef __INTEL_COMPILER
|
||||
#pragma warning(disable:997 1011 170)
|
||||
#endif
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning(disable:4799)
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#endif // VP8_COMMON_PRAGMAS_H_
|
|
@ -527,7 +527,7 @@ sym(vp8_loop_filter_vertical_edge_mmx):
|
|||
pxor mm7, [GLOBAL(t80)] ; unoffset
|
||||
; mm7 = q1
|
||||
|
||||
; transpose and write back
|
||||
; tranpose and write back
|
||||
; mm1 = 72 62 52 42 32 22 12 02
|
||||
; mm6 = 73 63 53 43 33 23 13 03
|
||||
; mm3 = 74 64 54 44 34 24 14 04
|
||||
|
@ -1289,7 +1289,7 @@ sym(vp8_mbloop_filter_vertical_edge_mmx):
|
|||
pxor mm6, [GLOBAL(t80)] ; mm6 = 71 61 51 41 31 21 11 01
|
||||
pxor mm3, [GLOBAL(t80)] ; mm3 = 76 66 56 46 36 26 15 06
|
||||
|
||||
; transpose and write back
|
||||
; tranpose and write back
|
||||
movq mm0, [rdx] ; mm0 = 70 60 50 40 30 20 10 00
|
||||
movq mm1, mm0 ; mm0 = 70 60 50 40 30 20 10 00
|
||||
|
||||
|
|
|
@ -958,7 +958,7 @@ sym(vp8_loop_filter_vertical_edge_sse2):
|
|||
; start work on filters
|
||||
B_FILTER 2
|
||||
|
||||
; transpose and write back - only work on q1, q0, p0, p1
|
||||
; tranpose and write back - only work on q1, q0, p0, p1
|
||||
BV_TRANSPOSE
|
||||
; store 16-line result
|
||||
|
||||
|
@ -1023,7 +1023,7 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2):
|
|||
; start work on filters
|
||||
B_FILTER 2
|
||||
|
||||
; transpose and write back - only work on q1, q0, p0, p1
|
||||
; tranpose and write back - only work on q1, q0, p0, p1
|
||||
BV_TRANSPOSE
|
||||
|
||||
lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
|
||||
|
|
|
@ -204,16 +204,13 @@ sym(vp8_mbpost_proc_down_mmx):
|
|||
and rcx, 15
|
||||
movd DWORD PTR [rsp+rcx*4], mm1 ;d[rcx*4]
|
||||
|
||||
cmp edx, 8
|
||||
jl .skip_assignment
|
||||
|
||||
mov rcx, rdx
|
||||
sub rcx, 8
|
||||
|
||||
and rcx, 15
|
||||
movd mm1, DWORD PTR [rsp+rcx*4] ;d[rcx*4]
|
||||
movd [rsi], mm1
|
||||
|
||||
.skip_assignment
|
||||
movd [rsi], mm1
|
||||
lea rsi, [rsi+rax]
|
||||
|
||||
lea rdi, [rdi+rax]
|
||||
|
@ -246,6 +243,7 @@ sym(vp8_mbpost_proc_down_mmx):
|
|||
; unsigned char whiteclamp[16],
|
||||
; unsigned char bothclamp[16],
|
||||
; unsigned int Width, unsigned int Height, int Pitch)
|
||||
extern sym(rand)
|
||||
global sym(vp8_plane_add_noise_mmx) PRIVATE
|
||||
sym(vp8_plane_add_noise_mmx):
|
||||
push rbp
|
||||
|
@ -257,7 +255,7 @@ sym(vp8_plane_add_noise_mmx):
|
|||
; end prolog
|
||||
|
||||
.addnoise_loop:
|
||||
call sym(LIBVPX_RAND) WRT_PLT
|
||||
call sym(rand) WRT_PLT
|
||||
mov rcx, arg(1) ;noise
|
||||
and rax, 0xff
|
||||
add rcx, rax
|
||||
|
|
|
@ -425,16 +425,13 @@ sym(vp8_mbpost_proc_down_xmm):
|
|||
and rcx, 15
|
||||
movq QWORD PTR [rsp + rcx*8], xmm1 ;d[rcx*8]
|
||||
|
||||
cmp edx, 8
|
||||
jl .skip_assignment
|
||||
|
||||
mov rcx, rdx
|
||||
sub rcx, 8
|
||||
|
||||
and rcx, 15
|
||||
movq mm0, [rsp + rcx*8] ;d[rcx*8]
|
||||
movq [rsi], mm0
|
||||
|
||||
.skip_assignment
|
||||
movq [rsi], mm0
|
||||
lea rsi, [rsi+rax]
|
||||
|
||||
lea rdi, [rdi+rax]
|
||||
|
@ -660,6 +657,7 @@ sym(vp8_mbpost_proc_across_ip_xmm):
|
|||
; unsigned char whiteclamp[16],
|
||||
; unsigned char bothclamp[16],
|
||||
; unsigned int Width, unsigned int Height, int Pitch)
|
||||
extern sym(rand)
|
||||
global sym(vp8_plane_add_noise_wmt) PRIVATE
|
||||
sym(vp8_plane_add_noise_wmt):
|
||||
push rbp
|
||||
|
@ -671,7 +669,7 @@ sym(vp8_plane_add_noise_wmt):
|
|||
; end prolog
|
||||
|
||||
.addnoise_loop:
|
||||
call sym(LIBVPX_RAND) WRT_PLT
|
||||
call sym(rand) WRT_PLT
|
||||
mov rcx, arg(1) ;noise
|
||||
and rax, 0xff
|
||||
add rcx, rax
|
||||
|
|
|
@ -0,0 +1,24 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
/* On Android NDK, rand is inlined function, but postproc needs rand symbol */
|
||||
#if defined(__ANDROID__)
|
||||
#define rand __rand
|
||||
#include <stdlib.h>
|
||||
#undef rand
|
||||
|
||||
extern int rand(void)
|
||||
{
|
||||
return __rand();
|
||||
}
|
||||
#else
|
||||
/* ISO C forbids an empty translation unit. */
|
||||
int vp8_unused;
|
||||
#endif
|
|
@ -365,7 +365,6 @@ sym(vp8_intra_pred_uv_tm_%1):
|
|||
GET_GOT rbx
|
||||
push rsi
|
||||
push rdi
|
||||
push rbx
|
||||
; end prolog
|
||||
|
||||
; read top row
|
||||
|
@ -396,11 +395,8 @@ sym(vp8_intra_pred_uv_tm_%1):
|
|||
movsxd rcx, dword ptr arg(1) ;dst_stride
|
||||
|
||||
.vp8_intra_pred_uv_tm_%1_loop:
|
||||
mov bl, [rsi]
|
||||
movd xmm3, ebx
|
||||
|
||||
mov bl, [rsi+rax]
|
||||
movd xmm5, ebx
|
||||
movd xmm3, [rsi]
|
||||
movd xmm5, [rsi+rax]
|
||||
%ifidn %1, sse2
|
||||
punpcklbw xmm3, xmm0
|
||||
punpcklbw xmm5, xmm0
|
||||
|
@ -423,7 +419,6 @@ sym(vp8_intra_pred_uv_tm_%1):
|
|||
jnz .vp8_intra_pred_uv_tm_%1_loop
|
||||
|
||||
; begin epilog
|
||||
pop rbx
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_GOT
|
||||
|
@ -491,8 +486,10 @@ sym(vp8_intra_pred_uv_ho_%1):
|
|||
SHADOW_ARGS_TO_STACK 5
|
||||
push rsi
|
||||
push rdi
|
||||
push rbx
|
||||
%ifidn %1, ssse3
|
||||
%ifndef GET_GOT_SAVE_ARG
|
||||
push rbx
|
||||
%endif
|
||||
GET_GOT rbx
|
||||
%endif
|
||||
; end prolog
|
||||
|
@ -510,16 +507,13 @@ sym(vp8_intra_pred_uv_ho_%1):
|
|||
%ifidn %1, ssse3
|
||||
lea rdx, [rcx*3]
|
||||
movdqa xmm2, [GLOBAL(dc_00001111)]
|
||||
lea rbx, [rax*3]
|
||||
%endif
|
||||
|
||||
%ifidn %1, mmx2
|
||||
.vp8_intra_pred_uv_ho_%1_loop:
|
||||
mov bl, [rsi]
|
||||
movd mm0, ebx
|
||||
|
||||
mov bl, [rsi+rax]
|
||||
movd mm1, ebx
|
||||
|
||||
movd mm0, [rsi]
|
||||
movd mm1, [rsi+rax]
|
||||
punpcklbw mm0, mm0
|
||||
punpcklbw mm1, mm1
|
||||
pshufw mm0, mm0, 0x0
|
||||
|
@ -531,19 +525,10 @@ sym(vp8_intra_pred_uv_ho_%1):
|
|||
dec edx
|
||||
jnz .vp8_intra_pred_uv_ho_%1_loop
|
||||
%else
|
||||
mov bl, [rsi]
|
||||
movd xmm0, ebx
|
||||
|
||||
mov bl, [rsi+rax]
|
||||
movd xmm3, ebx
|
||||
|
||||
mov bl, [rsi+rax*2]
|
||||
movd xmm1, ebx
|
||||
|
||||
lea rbx, [rax*3]
|
||||
mov bl, [rsi+rbx]
|
||||
movd xmm4, ebx
|
||||
|
||||
movd xmm0, [rsi]
|
||||
movd xmm3, [rsi+rax]
|
||||
movd xmm1, [rsi+rax*2]
|
||||
movd xmm4, [rsi+rbx]
|
||||
punpcklbw xmm0, xmm3
|
||||
punpcklbw xmm1, xmm4
|
||||
pshufb xmm0, xmm2
|
||||
|
@ -554,20 +539,10 @@ sym(vp8_intra_pred_uv_ho_%1):
|
|||
movhps [rdi+rdx], xmm1
|
||||
lea rsi, [rsi+rax*4]
|
||||
lea rdi, [rdi+rcx*4]
|
||||
|
||||
mov bl, [rsi]
|
||||
movd xmm0, ebx
|
||||
|
||||
mov bl, [rsi+rax]
|
||||
movd xmm3, ebx
|
||||
|
||||
mov bl, [rsi+rax*2]
|
||||
movd xmm1, ebx
|
||||
|
||||
lea rbx, [rax*3]
|
||||
mov bl, [rsi+rbx]
|
||||
movd xmm4, ebx
|
||||
|
||||
movd xmm0, [rsi]
|
||||
movd xmm3, [rsi+rax]
|
||||
movd xmm1, [rsi+rax*2]
|
||||
movd xmm4, [rsi+rbx]
|
||||
punpcklbw xmm0, xmm3
|
||||
punpcklbw xmm1, xmm4
|
||||
pshufb xmm0, xmm2
|
||||
|
@ -581,8 +556,10 @@ sym(vp8_intra_pred_uv_ho_%1):
|
|||
; begin epilog
|
||||
%ifidn %1, ssse3
|
||||
RESTORE_GOT
|
||||
%endif
|
||||
%ifndef GET_GOT_SAVE_ARG
|
||||
pop rbx
|
||||
%endif
|
||||
%endif
|
||||
pop rdi
|
||||
pop rsi
|
||||
UNSHADOW_ARGS
|
||||
|
@ -916,7 +893,6 @@ sym(vp8_intra_pred_y_tm_%1):
|
|||
SAVE_XMM 7
|
||||
push rsi
|
||||
push rdi
|
||||
push rbx
|
||||
GET_GOT rbx
|
||||
; end prolog
|
||||
|
||||
|
@ -950,11 +926,8 @@ sym(vp8_intra_pred_y_tm_%1):
|
|||
mov rdi, arg(0) ;dst;
|
||||
movsxd rcx, dword ptr arg(1) ;dst_stride
|
||||
vp8_intra_pred_y_tm_%1_loop:
|
||||
mov bl, [rsi]
|
||||
movd xmm4, ebx
|
||||
|
||||
mov bl, [rsi+rax]
|
||||
movd xmm5, ebx
|
||||
movd xmm4, [rsi]
|
||||
movd xmm5, [rsi+rax]
|
||||
%ifidn %1, sse2
|
||||
punpcklbw xmm4, xmm0
|
||||
punpcklbw xmm5, xmm0
|
||||
|
@ -983,7 +956,6 @@ vp8_intra_pred_y_tm_%1_loop:
|
|||
|
||||
; begin epilog
|
||||
RESTORE_GOT
|
||||
pop rbx
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_XMM
|
||||
|
@ -1057,7 +1029,6 @@ sym(vp8_intra_pred_y_ho_sse2):
|
|||
SHADOW_ARGS_TO_STACK 5
|
||||
push rsi
|
||||
push rdi
|
||||
push rbx
|
||||
; end prolog
|
||||
|
||||
;arg(2) not used
|
||||
|
@ -1070,11 +1041,8 @@ sym(vp8_intra_pred_y_ho_sse2):
|
|||
movsxd rcx, dword ptr arg(1) ;dst_stride
|
||||
|
||||
vp8_intra_pred_y_ho_sse2_loop:
|
||||
mov bl, [rsi]
|
||||
movd xmm0, ebx
|
||||
mov bl, [rsi+rax]
|
||||
movd xmm1, ebx
|
||||
|
||||
movd xmm0, [rsi]
|
||||
movd xmm1, [rsi+rax]
|
||||
; FIXME use pshufb for ssse3 version
|
||||
punpcklbw xmm0, xmm0
|
||||
punpcklbw xmm1, xmm1
|
||||
|
@ -1090,7 +1058,6 @@ vp8_intra_pred_y_ho_sse2_loop:
|
|||
jnz vp8_intra_pred_y_ho_sse2_loop
|
||||
|
||||
; begin epilog
|
||||
pop rbx
|
||||
pop rdi
|
||||
pop rsi
|
||||
UNSHADOW_ARGS
|
||||
|
|
|
@ -342,8 +342,8 @@ sym(vp8_get4x4var_mmx):
|
|||
movsxd rdx, dword ptr arg(3) ;[recon_stride]
|
||||
|
||||
; Row 1
|
||||
movd mm0, [rax] ; Copy four bytes to mm0
|
||||
movd mm1, [rbx] ; Copy four bytes to mm1
|
||||
movq mm0, [rax] ; Copy eight bytes to mm0
|
||||
movq mm1, [rbx] ; Copy eight bytes to mm1
|
||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
||||
punpcklbw mm1, mm6
|
||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
||||
|
@ -351,12 +351,12 @@ sym(vp8_get4x4var_mmx):
|
|||
pmaddwd mm0, mm0 ; square and accumulate
|
||||
add rbx,rdx ; Inc pointer into ref data
|
||||
add rax,rcx ; Inc pointer into the new data
|
||||
movd mm1, [rbx] ; Copy four bytes to mm1
|
||||
movq mm1, [rbx] ; Copy eight bytes to mm1
|
||||
paddd mm7, mm0 ; accumulate in mm7
|
||||
|
||||
|
||||
; Row 2
|
||||
movd mm0, [rax] ; Copy four bytes to mm0
|
||||
movq mm0, [rax] ; Copy eight bytes to mm0
|
||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
||||
punpcklbw mm1, mm6
|
||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
||||
|
@ -365,12 +365,12 @@ sym(vp8_get4x4var_mmx):
|
|||
pmaddwd mm0, mm0 ; square and accumulate
|
||||
add rbx,rdx ; Inc pointer into ref data
|
||||
add rax,rcx ; Inc pointer into the new data
|
||||
movd mm1, [rbx] ; Copy four bytes to mm1
|
||||
movq mm1, [rbx] ; Copy eight bytes to mm1
|
||||
paddd mm7, mm0 ; accumulate in mm7
|
||||
|
||||
; Row 3
|
||||
movd mm0, [rax] ; Copy four bytes to mm0
|
||||
punpcklbw mm0, mm6 ; unpack to higher precision
|
||||
movq mm0, [rax] ; Copy eight bytes to mm0
|
||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
||||
punpcklbw mm1, mm6
|
||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
||||
paddw mm5, mm0 ; accumulate differences in mm5
|
||||
|
@ -378,11 +378,11 @@ sym(vp8_get4x4var_mmx):
|
|||
pmaddwd mm0, mm0 ; square and accumulate
|
||||
add rbx,rdx ; Inc pointer into ref data
|
||||
add rax,rcx ; Inc pointer into the new data
|
||||
movd mm1, [rbx] ; Copy four bytes to mm1
|
||||
movq mm1, [rbx] ; Copy eight bytes to mm1
|
||||
paddd mm7, mm0 ; accumulate in mm7
|
||||
|
||||
; Row 4
|
||||
movd mm0, [rax] ; Copy four bytes to mm0
|
||||
movq mm0, [rax] ; Copy eight bytes to mm0
|
||||
|
||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
||||
punpcklbw mm1, mm6
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
|
||||
#include "vpx_config.h"
|
||||
#include "vp8/common/variance.h"
|
||||
#include "vp8/common/pragmas.h"
|
||||
#include "vpx_ports/mem.h"
|
||||
#include "vp8/common/x86/filter_x86.h"
|
||||
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
|
||||
#include "vpx_config.h"
|
||||
#include "vp8/common/variance.h"
|
||||
#include "vp8/common/pragmas.h"
|
||||
#include "vpx_ports/mem.h"
|
||||
#include "vp8/common/x86/filter_x86.h"
|
||||
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
|
||||
#include "vpx_config.h"
|
||||
#include "vp8/common/variance.h"
|
||||
#include "vp8/common/pragmas.h"
|
||||
#include "vpx_ports/mem.h"
|
||||
|
||||
extern unsigned int vp8_get16x16var_sse2
|
||||
|
|
|
@ -10,12 +10,11 @@
|
|||
|
||||
|
||||
#include "dboolhuff.h"
|
||||
#include "vp8/common/common.h"
|
||||
|
||||
int vp8dx_start_decode(BOOL_DECODER *br,
|
||||
const unsigned char *source,
|
||||
unsigned int source_sz,
|
||||
vpx_decrypt_cb decrypt_cb,
|
||||
vp8_decrypt_cb *decrypt_cb,
|
||||
void *decrypt_state)
|
||||
{
|
||||
br->user_buffer_end = source+source_sz;
|
||||
|
@ -40,7 +39,7 @@ void vp8dx_bool_decoder_fill(BOOL_DECODER *br)
|
|||
const unsigned char *bufptr = br->user_buffer;
|
||||
VP8_BD_VALUE value = br->value;
|
||||
int count = br->count;
|
||||
int shift = VP8_BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT);
|
||||
int shift = VP8_BD_VALUE_SIZE - 8 - (count + 8);
|
||||
size_t bytes_left = br->user_buffer_end - bufptr;
|
||||
size_t bits_left = bytes_left * CHAR_BIT;
|
||||
int x = (int)(shift + CHAR_BIT - bits_left);
|
||||
|
@ -48,7 +47,7 @@ void vp8dx_bool_decoder_fill(BOOL_DECODER *br)
|
|||
unsigned char decrypted[sizeof(VP8_BD_VALUE) + 1];
|
||||
|
||||
if (br->decrypt_cb) {
|
||||
size_t n = MIN(sizeof(decrypted), bytes_left);
|
||||
size_t n = bytes_left > sizeof(decrypted) ? sizeof(decrypted) : bytes_left;
|
||||
br->decrypt_cb(br->decrypt_state, bufptr, decrypted, (int)n);
|
||||
bufptr = decrypted;
|
||||
}
|
||||
|
|
|
@ -17,7 +17,6 @@
|
|||
|
||||
#include "vpx_config.h"
|
||||
#include "vpx_ports/mem.h"
|
||||
#include "vpx/vp8dx.h"
|
||||
#include "vpx/vpx_integer.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
@ -33,6 +32,12 @@ typedef size_t VP8_BD_VALUE;
|
|||
Even relatively modest values like 100 would work fine.*/
|
||||
#define VP8_LOTS_OF_BITS (0x40000000)
|
||||
|
||||
/*Decrypt n bytes of data from input -> output, using the decrypt_state
|
||||
passed in VP8D_SET_DECRYPTOR.
|
||||
*/
|
||||
typedef void (vp8_decrypt_cb)(void *decrypt_state, const unsigned char *input,
|
||||
unsigned char *output, int count);
|
||||
|
||||
typedef struct
|
||||
{
|
||||
const unsigned char *user_buffer_end;
|
||||
|
@ -40,7 +45,7 @@ typedef struct
|
|||
VP8_BD_VALUE value;
|
||||
int count;
|
||||
unsigned int range;
|
||||
vpx_decrypt_cb decrypt_cb;
|
||||
vp8_decrypt_cb *decrypt_cb;
|
||||
void *decrypt_state;
|
||||
} BOOL_DECODER;
|
||||
|
||||
|
@ -49,7 +54,7 @@ DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]);
|
|||
int vp8dx_start_decode(BOOL_DECODER *br,
|
||||
const unsigned char *source,
|
||||
unsigned int source_sz,
|
||||
vpx_decrypt_cb decrypt_cb,
|
||||
vp8_decrypt_cb *decrypt_cb,
|
||||
void *decrypt_state);
|
||||
|
||||
void vp8dx_bool_decoder_fill(BOOL_DECODER *br);
|
||||
|
|
|
@ -17,7 +17,6 @@
|
|||
#include "vp8/common/reconintra4x4.h"
|
||||
#include "vp8/common/reconinter.h"
|
||||
#include "detokenize.h"
|
||||
#include "vp8/common/common.h"
|
||||
#include "vp8/common/invtrans.h"
|
||||
#include "vp8/common/alloccommon.h"
|
||||
#include "vp8/common/entropymode.h"
|
||||
|
@ -632,17 +631,9 @@ static void decode_mb_rows(VP8D_COMP *pbi)
|
|||
xd->dst.u_buffer = dst_buffer[1] + recon_uvoffset;
|
||||
xd->dst.v_buffer = dst_buffer[2] + recon_uvoffset;
|
||||
|
||||
if (xd->mode_info_context->mbmi.ref_frame >= LAST_FRAME) {
|
||||
MV_REFERENCE_FRAME ref = xd->mode_info_context->mbmi.ref_frame;
|
||||
xd->pre.y_buffer = ref_buffer[ref][0] + recon_yoffset;
|
||||
xd->pre.u_buffer = ref_buffer[ref][1] + recon_uvoffset;
|
||||
xd->pre.v_buffer = ref_buffer[ref][2] + recon_uvoffset;
|
||||
} else {
|
||||
// ref_frame is INTRA_FRAME, pre buffer should not be used.
|
||||
xd->pre.y_buffer = 0;
|
||||
xd->pre.u_buffer = 0;
|
||||
xd->pre.v_buffer = 0;
|
||||
}
|
||||
xd->pre.y_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][0] + recon_yoffset;
|
||||
xd->pre.u_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][1] + recon_uvoffset;
|
||||
xd->pre.v_buffer = ref_buffer[xd->mode_info_context->mbmi.ref_frame][2] + recon_uvoffset;
|
||||
|
||||
/* propagate errors from reference frames */
|
||||
xd->corrupted |= ref_fb_corrupted[xd->mode_info_context->mbmi.ref_frame];
|
||||
|
@ -1019,7 +1010,8 @@ int vp8_decode_frame(VP8D_COMP *pbi)
|
|||
const unsigned char *clear = data;
|
||||
if (pbi->decrypt_cb)
|
||||
{
|
||||
int n = (int)MIN(sizeof(clear_buffer), data_end - data);
|
||||
int n = (int)(data_end - data);
|
||||
if (n > 10) n = 10;
|
||||
pbi->decrypt_cb(pbi->decrypt_state, data, clear_buffer, n);
|
||||
clear = clear_buffer;
|
||||
}
|
||||
|
|
|
@ -15,7 +15,9 @@
|
|||
#include "decodemv.h"
|
||||
#include "vpx_mem/vpx_mem.h"
|
||||
#include "vp8/common/findnearmv.h"
|
||||
#include "vp8/common/common.h"
|
||||
|
||||
#define MIN(x,y) (((x)<(y))?(x):(y))
|
||||
#define MAX(x,y) (((x)>(y))?(x):(y))
|
||||
|
||||
#define FLOOR(x,q) ((x) & -(1 << (q)))
|
||||
|
||||
|
|
|
@ -178,6 +178,12 @@ vpx_codec_err_t vp8dx_set_reference(VP8D_COMP *pbi, enum vpx_ref_frame_type ref_
|
|||
return pbi->common.error.error_code;
|
||||
}
|
||||
|
||||
/*For ARM NEON, d8-d15 are callee-saved registers, and need to be saved by us.*/
|
||||
#if HAVE_NEON
|
||||
extern void vp8_push_neon(int64_t *store);
|
||||
extern void vp8_pop_neon(int64_t *store);
|
||||
#endif
|
||||
|
||||
static int get_free_fb (VP8_COMMON *cm)
|
||||
{
|
||||
int i;
|
||||
|
@ -301,6 +307,9 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size,
|
|||
const uint8_t *source,
|
||||
int64_t time_stamp)
|
||||
{
|
||||
#if HAVE_NEON
|
||||
int64_t dx_store_reg[8];
|
||||
#endif
|
||||
VP8_COMMON *cm = &pbi->common;
|
||||
int retcode = -1;
|
||||
|
||||
|
@ -310,6 +319,15 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size,
|
|||
if(retcode <= 0)
|
||||
return retcode;
|
||||
|
||||
#if HAVE_NEON
|
||||
#if CONFIG_RUNTIME_CPU_DETECT
|
||||
if (cm->cpu_caps & HAS_NEON)
|
||||
#endif
|
||||
{
|
||||
vp8_push_neon(dx_store_reg);
|
||||
}
|
||||
#endif
|
||||
|
||||
cm->new_fb_idx = get_free_fb (cm);
|
||||
|
||||
/* setup reference frames for vp8_decode_frame */
|
||||
|
@ -385,8 +403,16 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size,
|
|||
pbi->last_time_stamp = time_stamp;
|
||||
|
||||
decode_exit:
|
||||
#if HAVE_NEON
|
||||
#if CONFIG_RUNTIME_CPU_DETECT
|
||||
if (cm->cpu_caps & HAS_NEON)
|
||||
#endif
|
||||
{
|
||||
vp8_pop_neon(dx_store_reg);
|
||||
}
|
||||
#endif
|
||||
|
||||
pbi->common.error.setjmp = 0;
|
||||
vp8_clear_system_state();
|
||||
return retcode;
|
||||
}
|
||||
int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags)
|
||||
|
|
|
@ -126,7 +126,7 @@ typedef struct VP8D_COMP
|
|||
int independent_partitions;
|
||||
int frame_corrupt_residual;
|
||||
|
||||
vpx_decrypt_cb decrypt_cb;
|
||||
vp8_decrypt_cb *decrypt_cb;
|
||||
void *decrypt_state;
|
||||
} VP8D_COMP;
|
||||
|
||||
|
|
|
@ -45,31 +45,37 @@
|
|||
* [16, 255] 3 6 7
|
||||
*/
|
||||
|
||||
int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y,
|
||||
int mc_running_avg_y_stride,
|
||||
unsigned char *running_avg_y,
|
||||
int running_avg_y_stride,
|
||||
unsigned char *sig, int sig_stride,
|
||||
unsigned int motion_magnitude,
|
||||
int increase_denoising) {
|
||||
int vp8_denoiser_filter_neon(YV12_BUFFER_CONFIG *mc_running_avg,
|
||||
YV12_BUFFER_CONFIG *running_avg,
|
||||
MACROBLOCK *signal, unsigned int motion_magnitude,
|
||||
int y_offset, int uv_offset) {
|
||||
/* If motion_magnitude is small, making the denoiser more aggressive by
|
||||
* increasing the adjustment for each level, level1 adjustment is
|
||||
* increased, the deltas stay the same.
|
||||
*/
|
||||
int shift_inc = (increase_denoising &&
|
||||
motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0;
|
||||
const uint8x16_t v_level1_adjustment = vmovq_n_u8(
|
||||
(motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3);
|
||||
const uint8x16_t v_level1_adjustment = vdupq_n_u8(
|
||||
(motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 : 3);
|
||||
const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
|
||||
const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
|
||||
const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc);
|
||||
const uint8x16_t v_level1_threshold = vdupq_n_u8(4);
|
||||
const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
|
||||
const uint8x16_t v_level3_threshold = vdupq_n_u8(16);
|
||||
int64x2_t v_sum_diff_total = vdupq_n_s64(0);
|
||||
|
||||
/* Local variables for array pointers and strides. */
|
||||
unsigned char *sig = signal->thismb;
|
||||
int sig_stride = 16;
|
||||
unsigned char *mc_running_avg_y = mc_running_avg->y_buffer + y_offset;
|
||||
int mc_running_avg_y_stride = mc_running_avg->y_stride;
|
||||
unsigned char *running_avg_y = running_avg->y_buffer + y_offset;
|
||||
int running_avg_y_stride = running_avg->y_stride;
|
||||
|
||||
/* Go over lines. */
|
||||
int r;
|
||||
for (r = 0; r < 16; ++r) {
|
||||
int i;
|
||||
int sum_diff = 0;
|
||||
for (i = 0; i < 16; ++i) {
|
||||
int8x16_t v_sum_diff = vdupq_n_s8(0);
|
||||
uint8x16_t v_running_avg_y;
|
||||
|
||||
/* Load inputs. */
|
||||
const uint8x16_t v_sig = vld1q_u8(sig);
|
||||
const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);
|
||||
|
@ -111,9 +117,12 @@ int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y,
|
|||
v_abs_adjustment);
|
||||
const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask,
|
||||
v_abs_adjustment);
|
||||
|
||||
uint8x16_t v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment);
|
||||
v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment);
|
||||
v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment);
|
||||
v_sum_diff = vqaddq_s8(v_sum_diff,
|
||||
vreinterpretq_s8_u8(v_pos_adjustment));
|
||||
v_sum_diff = vqsubq_s8(v_sum_diff,
|
||||
vreinterpretq_s8_u8(v_neg_adjustment));
|
||||
|
||||
/* Store results. */
|
||||
vst1q_u8(running_avg_y, v_running_avg_y);
|
||||
|
@ -122,19 +131,23 @@ int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y,
|
|||
* for this macroblock.
|
||||
*/
|
||||
{
|
||||
const int8x16_t v_sum_diff =
|
||||
vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment),
|
||||
vreinterpretq_s8_u8(v_neg_adjustment));
|
||||
|
||||
const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff);
|
||||
|
||||
const int32x4_t fedc_ba98_7654_3210 =
|
||||
vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
|
||||
|
||||
const int64x2_t fedcba98_76543210 =
|
||||
vpaddlq_s32(fedc_ba98_7654_3210);
|
||||
|
||||
v_sum_diff_total = vqaddq_s64(v_sum_diff_total, fedcba98_76543210);
|
||||
int s0 = vgetq_lane_s8(v_sum_diff, 0) +
|
||||
vgetq_lane_s8(v_sum_diff, 1) +
|
||||
vgetq_lane_s8(v_sum_diff, 2) +
|
||||
vgetq_lane_s8(v_sum_diff, 3);
|
||||
int s1 = vgetq_lane_s8(v_sum_diff, 4) +
|
||||
vgetq_lane_s8(v_sum_diff, 5) +
|
||||
vgetq_lane_s8(v_sum_diff, 6) +
|
||||
vgetq_lane_s8(v_sum_diff, 7);
|
||||
int s2 = vgetq_lane_s8(v_sum_diff, 8) +
|
||||
vgetq_lane_s8(v_sum_diff, 9) +
|
||||
vgetq_lane_s8(v_sum_diff, 10) +
|
||||
vgetq_lane_s8(v_sum_diff, 11);
|
||||
int s3 = vgetq_lane_s8(v_sum_diff, 12) +
|
||||
vgetq_lane_s8(v_sum_diff, 13) +
|
||||
vgetq_lane_s8(v_sum_diff, 14) +
|
||||
vgetq_lane_s8(v_sum_diff, 15);
|
||||
sum_diff += s0 + s1+ s2 + s3;
|
||||
}
|
||||
|
||||
/* Update pointers for next iteration. */
|
||||
|
@ -144,335 +157,11 @@ int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y,
|
|||
}
|
||||
|
||||
/* Too much adjustments => copy block. */
|
||||
{
|
||||
int64x1_t x = vqadd_s64(vget_high_s64(v_sum_diff_total),
|
||||
vget_low_s64(v_sum_diff_total));
|
||||
int sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0);
|
||||
int sum_diff_thresh = SUM_DIFF_THRESHOLD;
|
||||
|
||||
if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
|
||||
if (sum_diff > sum_diff_thresh) {
|
||||
// Before returning to copy the block (i.e., apply no denoising),
|
||||
// checK if we can still apply some (weaker) temporal filtering to
|
||||
// this block, that would otherwise not be denoised at all. Simplest
|
||||
// is to apply an additional adjustment to running_avg_y to bring it
|
||||
// closer to sig. The adjustment is capped by a maximum delta, and
|
||||
// chosen such that in most cases the resulting sum_diff will be
|
||||
// within the accceptable range given by sum_diff_thresh.
|
||||
|
||||
// The delta is set by the excess of absolute pixel diff over the
|
||||
// threshold.
|
||||
int delta = ((sum_diff - sum_diff_thresh) >> 8) + 1;
|
||||
// Only apply the adjustment for max delta up to 3.
|
||||
if (delta < 4) {
|
||||
const uint8x16_t k_delta = vmovq_n_u8(delta);
|
||||
sig -= sig_stride * 16;
|
||||
mc_running_avg_y -= mc_running_avg_y_stride * 16;
|
||||
running_avg_y -= running_avg_y_stride * 16;
|
||||
for (r = 0; r < 16; ++r) {
|
||||
uint8x16_t v_running_avg_y = vld1q_u8(running_avg_y);
|
||||
const uint8x16_t v_sig = vld1q_u8(sig);
|
||||
const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);
|
||||
|
||||
/* Calculate absolute difference and sign masks. */
|
||||
const uint8x16_t v_abs_diff = vabdq_u8(v_sig,
|
||||
v_mc_running_avg_y);
|
||||
const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig,
|
||||
v_mc_running_avg_y);
|
||||
const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig,
|
||||
v_mc_running_avg_y);
|
||||
// Clamp absolute difference to delta to get the adjustment.
|
||||
const uint8x16_t v_abs_adjustment =
|
||||
vminq_u8(v_abs_diff, (k_delta));
|
||||
|
||||
const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask,
|
||||
v_abs_adjustment);
|
||||
const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask,
|
||||
v_abs_adjustment);
|
||||
|
||||
v_running_avg_y = vqsubq_u8(v_running_avg_y, v_pos_adjustment);
|
||||
v_running_avg_y = vqaddq_u8(v_running_avg_y, v_neg_adjustment);
|
||||
|
||||
/* Store results. */
|
||||
vst1q_u8(running_avg_y, v_running_avg_y);
|
||||
|
||||
{
|
||||
const int8x16_t v_sum_diff =
|
||||
vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment),
|
||||
vreinterpretq_s8_u8(v_pos_adjustment));
|
||||
|
||||
const int16x8_t fe_dc_ba_98_76_54_32_10 =
|
||||
vpaddlq_s8(v_sum_diff);
|
||||
const int32x4_t fedc_ba98_7654_3210 =
|
||||
vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
|
||||
const int64x2_t fedcba98_76543210 =
|
||||
vpaddlq_s32(fedc_ba98_7654_3210);
|
||||
|
||||
v_sum_diff_total = vqaddq_s64(v_sum_diff_total,
|
||||
fedcba98_76543210);
|
||||
}
|
||||
/* Update pointers for next iteration. */
|
||||
sig += sig_stride;
|
||||
mc_running_avg_y += mc_running_avg_y_stride;
|
||||
running_avg_y += running_avg_y_stride;
|
||||
}
|
||||
{
|
||||
// Update the sum of all pixel differences of this MB.
|
||||
x = vqadd_s64(vget_high_s64(v_sum_diff_total),
|
||||
vget_low_s64(v_sum_diff_total));
|
||||
sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0);
|
||||
|
||||
if (sum_diff > sum_diff_thresh) {
|
||||
return COPY_BLOCK;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
return COPY_BLOCK;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (abs(sum_diff) > SUM_DIFF_THRESHOLD)
|
||||
return COPY_BLOCK;
|
||||
|
||||
/* Tell above level that block was filtered. */
|
||||
running_avg_y -= running_avg_y_stride * 16;
|
||||
sig -= sig_stride * 16;
|
||||
|
||||
vp8_copy_mem16x16(running_avg_y, running_avg_y_stride, sig, sig_stride);
|
||||
|
||||
return FILTER_BLOCK;
|
||||
}
|
||||
|
||||
int vp8_denoiser_filter_uv_neon(unsigned char *mc_running_avg,
|
||||
int mc_running_avg_stride,
|
||||
unsigned char *running_avg,
|
||||
int running_avg_stride,
|
||||
unsigned char *sig, int sig_stride,
|
||||
unsigned int motion_magnitude,
|
||||
int increase_denoising) {
|
||||
/* If motion_magnitude is small, making the denoiser more aggressive by
|
||||
* increasing the adjustment for each level, level1 adjustment is
|
||||
* increased, the deltas stay the same.
|
||||
*/
|
||||
int shift_inc = (increase_denoising &&
|
||||
motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ? 1 : 0;
|
||||
const uint8x16_t v_level1_adjustment = vmovq_n_u8(
|
||||
(motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ? 4 + shift_inc : 3);
|
||||
|
||||
const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
|
||||
const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
|
||||
const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc);
|
||||
const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
|
||||
const uint8x16_t v_level3_threshold = vdupq_n_u8(16);
|
||||
int64x2_t v_sum_diff_total = vdupq_n_s64(0);
|
||||
int r;
|
||||
|
||||
{
|
||||
uint16x4_t v_sum_block = vdup_n_u16(0);
|
||||
|
||||
// Avoid denoising color signal if its close to average level.
|
||||
for (r = 0; r < 8; ++r) {
|
||||
const uint8x8_t v_sig = vld1_u8(sig);
|
||||
const uint16x4_t _76_54_32_10 = vpaddl_u8(v_sig);
|
||||
v_sum_block = vqadd_u16(v_sum_block, _76_54_32_10);
|
||||
sig += sig_stride;
|
||||
}
|
||||
sig -= sig_stride * 8;
|
||||
{
|
||||
const uint32x2_t _7654_3210 = vpaddl_u16(v_sum_block);
|
||||
const uint64x1_t _76543210 = vpaddl_u32(_7654_3210);
|
||||
const int sum_block =
|
||||
vget_lane_s32(vreinterpret_s32_u64(_76543210), 0);
|
||||
if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) {
|
||||
return COPY_BLOCK;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Go over lines. */
|
||||
for (r = 0; r < 4; ++r) {
|
||||
/* Load inputs. */
|
||||
const uint8x8_t v_sig_lo = vld1_u8(sig);
|
||||
const uint8x8_t v_sig_hi = vld1_u8(&sig[sig_stride]);
|
||||
const uint8x16_t v_sig = vcombine_u8(v_sig_lo, v_sig_hi);
|
||||
const uint8x8_t v_mc_running_avg_lo = vld1_u8(mc_running_avg);
|
||||
const uint8x8_t v_mc_running_avg_hi =
|
||||
vld1_u8(&mc_running_avg[mc_running_avg_stride]);
|
||||
const uint8x16_t v_mc_running_avg =
|
||||
vcombine_u8(v_mc_running_avg_lo, v_mc_running_avg_hi);
|
||||
/* Calculate absolute difference and sign masks. */
|
||||
const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg);
|
||||
const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg);
|
||||
const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg);
|
||||
|
||||
/* Figure out which level that put us in. */
|
||||
const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold,
|
||||
v_abs_diff);
|
||||
const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold,
|
||||
v_abs_diff);
|
||||
const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold,
|
||||
v_abs_diff);
|
||||
|
||||
/* Calculate absolute adjustments for level 1, 2 and 3. */
|
||||
const uint8x16_t v_level2_adjustment = vandq_u8(v_level2_mask,
|
||||
v_delta_level_1_and_2);
|
||||
const uint8x16_t v_level3_adjustment = vandq_u8(v_level3_mask,
|
||||
v_delta_level_2_and_3);
|
||||
const uint8x16_t v_level1and2_adjustment = vaddq_u8(v_level1_adjustment,
|
||||
v_level2_adjustment);
|
||||
const uint8x16_t v_level1and2and3_adjustment = vaddq_u8(
|
||||
v_level1and2_adjustment, v_level3_adjustment);
|
||||
|
||||
/* Figure adjustment absolute value by selecting between the absolute
|
||||
* difference if in level0 or the value for level 1, 2 and 3.
|
||||
*/
|
||||
const uint8x16_t v_abs_adjustment = vbslq_u8(v_level1_mask,
|
||||
v_level1and2and3_adjustment, v_abs_diff);
|
||||
|
||||
/* Calculate positive and negative adjustments. Apply them to the signal
|
||||
* and accumulate them. Adjustments are less than eight and the maximum
|
||||
* sum of them (7 * 16) can fit in a signed char.
|
||||
*/
|
||||
const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask,
|
||||
v_abs_adjustment);
|
||||
const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask,
|
||||
v_abs_adjustment);
|
||||
|
||||
uint8x16_t v_running_avg = vqaddq_u8(v_sig, v_pos_adjustment);
|
||||
v_running_avg = vqsubq_u8(v_running_avg, v_neg_adjustment);
|
||||
|
||||
/* Store results. */
|
||||
vst1_u8(running_avg, vget_low_u8(v_running_avg));
|
||||
vst1_u8(&running_avg[running_avg_stride], vget_high_u8(v_running_avg));
|
||||
|
||||
/* Sum all the accumulators to have the sum of all pixel differences
|
||||
* for this macroblock.
|
||||
*/
|
||||
{
|
||||
const int8x16_t v_sum_diff =
|
||||
vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment),
|
||||
vreinterpretq_s8_u8(v_neg_adjustment));
|
||||
|
||||
const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff);
|
||||
|
||||
const int32x4_t fedc_ba98_7654_3210 =
|
||||
vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
|
||||
|
||||
const int64x2_t fedcba98_76543210 =
|
||||
vpaddlq_s32(fedc_ba98_7654_3210);
|
||||
|
||||
v_sum_diff_total = vqaddq_s64(v_sum_diff_total, fedcba98_76543210);
|
||||
}
|
||||
|
||||
/* Update pointers for next iteration. */
|
||||
sig += sig_stride * 2;
|
||||
mc_running_avg += mc_running_avg_stride * 2;
|
||||
running_avg += running_avg_stride * 2;
|
||||
}
|
||||
|
||||
|
||||
/* Too much adjustments => copy block. */
|
||||
{
|
||||
int64x1_t x = vqadd_s64(vget_high_s64(v_sum_diff_total),
|
||||
vget_low_s64(v_sum_diff_total));
|
||||
int sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0);
|
||||
int sum_diff_thresh = SUM_DIFF_THRESHOLD_UV;
|
||||
if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;
|
||||
if (sum_diff > sum_diff_thresh) {
|
||||
// Before returning to copy the block (i.e., apply no denoising),
|
||||
// checK if we can still apply some (weaker) temporal filtering to
|
||||
// this block, that would otherwise not be denoised at all. Simplest
|
||||
// is to apply an additional adjustment to running_avg_y to bring it
|
||||
// closer to sig. The adjustment is capped by a maximum delta, and
|
||||
// chosen such that in most cases the resulting sum_diff will be
|
||||
// within the accceptable range given by sum_diff_thresh.
|
||||
|
||||
// The delta is set by the excess of absolute pixel diff over the
|
||||
// threshold.
|
||||
int delta = ((sum_diff - sum_diff_thresh) >> 8) + 1;
|
||||
// Only apply the adjustment for max delta up to 3.
|
||||
if (delta < 4) {
|
||||
const uint8x16_t k_delta = vmovq_n_u8(delta);
|
||||
sig -= sig_stride * 8;
|
||||
mc_running_avg -= mc_running_avg_stride * 8;
|
||||
running_avg -= running_avg_stride * 8;
|
||||
for (r = 0; r < 4; ++r) {
|
||||
const uint8x8_t v_sig_lo = vld1_u8(sig);
|
||||
const uint8x8_t v_sig_hi = vld1_u8(&sig[sig_stride]);
|
||||
const uint8x16_t v_sig = vcombine_u8(v_sig_lo, v_sig_hi);
|
||||
const uint8x8_t v_mc_running_avg_lo = vld1_u8(mc_running_avg);
|
||||
const uint8x8_t v_mc_running_avg_hi =
|
||||
vld1_u8(&mc_running_avg[mc_running_avg_stride]);
|
||||
const uint8x16_t v_mc_running_avg =
|
||||
vcombine_u8(v_mc_running_avg_lo, v_mc_running_avg_hi);
|
||||
/* Calculate absolute difference and sign masks. */
|
||||
const uint8x16_t v_abs_diff = vabdq_u8(v_sig,
|
||||
v_mc_running_avg);
|
||||
const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig,
|
||||
v_mc_running_avg);
|
||||
const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig,
|
||||
v_mc_running_avg);
|
||||
// Clamp absolute difference to delta to get the adjustment.
|
||||
const uint8x16_t v_abs_adjustment =
|
||||
vminq_u8(v_abs_diff, (k_delta));
|
||||
|
||||
const uint8x16_t v_pos_adjustment = vandq_u8(v_diff_pos_mask,
|
||||
v_abs_adjustment);
|
||||
const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask,
|
||||
v_abs_adjustment);
|
||||
const uint8x8_t v_running_avg_lo = vld1_u8(running_avg);
|
||||
const uint8x8_t v_running_avg_hi =
|
||||
vld1_u8(&running_avg[running_avg_stride]);
|
||||
uint8x16_t v_running_avg =
|
||||
vcombine_u8(v_running_avg_lo, v_running_avg_hi);
|
||||
|
||||
v_running_avg = vqsubq_u8(v_running_avg, v_pos_adjustment);
|
||||
v_running_avg = vqaddq_u8(v_running_avg, v_neg_adjustment);
|
||||
|
||||
/* Store results. */
|
||||
vst1_u8(running_avg, vget_low_u8(v_running_avg));
|
||||
vst1_u8(&running_avg[running_avg_stride],
|
||||
vget_high_u8(v_running_avg));
|
||||
|
||||
{
|
||||
const int8x16_t v_sum_diff =
|
||||
vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment),
|
||||
vreinterpretq_s8_u8(v_pos_adjustment));
|
||||
|
||||
const int16x8_t fe_dc_ba_98_76_54_32_10 =
|
||||
vpaddlq_s8(v_sum_diff);
|
||||
const int32x4_t fedc_ba98_7654_3210 =
|
||||
vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
|
||||
const int64x2_t fedcba98_76543210 =
|
||||
vpaddlq_s32(fedc_ba98_7654_3210);
|
||||
|
||||
v_sum_diff_total = vqaddq_s64(v_sum_diff_total,
|
||||
fedcba98_76543210);
|
||||
}
|
||||
/* Update pointers for next iteration. */
|
||||
sig += sig_stride * 2;
|
||||
mc_running_avg += mc_running_avg_stride * 2;
|
||||
running_avg += running_avg_stride * 2;
|
||||
}
|
||||
{
|
||||
// Update the sum of all pixel differences of this MB.
|
||||
x = vqadd_s64(vget_high_s64(v_sum_diff_total),
|
||||
vget_low_s64(v_sum_diff_total));
|
||||
sum_diff = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0);
|
||||
|
||||
if (sum_diff > sum_diff_thresh) {
|
||||
return COPY_BLOCK;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
return COPY_BLOCK;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Tell above level that block was filtered. */
|
||||
running_avg -= running_avg_stride * 8;
|
||||
sig -= sig_stride * 8;
|
||||
|
||||
vp8_copy_mem8x8(running_avg, running_avg_stride, sig, sig_stride);
|
||||
|
||||
vp8_copy_mem16x16(running_avg->y_buffer + y_offset, running_avg_y_stride,
|
||||
signal->thismb, sig_stride);
|
||||
return FILTER_BLOCK;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,46 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "vp8/common/loopfilter.h"
|
||||
#include "vpx_scale/yv12config.h"
|
||||
|
||||
extern void vp8_memcpy_partial_neon(unsigned char *dst_ptr,
|
||||
unsigned char *src_ptr,
|
||||
int sz);
|
||||
|
||||
|
||||
void vp8_yv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc,
|
||||
YV12_BUFFER_CONFIG *dst_ybc)
|
||||
{
|
||||
unsigned char *src_y, *dst_y;
|
||||
int yheight;
|
||||
int ystride;
|
||||
int yoffset;
|
||||
int linestocopy;
|
||||
|
||||
yheight = src_ybc->y_height;
|
||||
ystride = src_ybc->y_stride;
|
||||
|
||||
/* number of MB rows to use in partial filtering */
|
||||
linestocopy = (yheight >> 4) / PARTIAL_FRAME_FRACTION;
|
||||
linestocopy = linestocopy ? linestocopy << 4 : 16; /* 16 lines per MB */
|
||||
|
||||
/* Copy extra 4 so that full filter context is available if filtering done
|
||||
* on the copied partial frame and not original. Partial filter does mb
|
||||
* filtering for top row also, which can modify3 pixels above.
|
||||
*/
|
||||
linestocopy += 4;
|
||||
/* partial image starts at ~middle of frame (macroblock border) */
|
||||
yoffset = ystride * (((yheight >> 5) * 16) - 4);
|
||||
src_y = src_ybc->y_buffer + yoffset;
|
||||
dst_y = dst_ybc->y_buffer + yoffset;
|
||||
|
||||
vp8_memcpy_partial_neon(dst_y, src_y, ystride * linestocopy);
|
||||
}
|
|
@ -0,0 +1,221 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_short_fdct4x4_neon|
|
||||
EXPORT |vp8_short_fdct8x4_neon|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=4
|
||||
|
||||
|
||||
ALIGN 16 ; enable use of @128 bit aligned loads
|
||||
coeff
|
||||
DCW 5352, 5352, 5352, 5352
|
||||
DCW 2217, 2217, 2217, 2217
|
||||
DCD 14500, 14500, 14500, 14500
|
||||
DCD 7500, 7500, 7500, 7500
|
||||
DCD 12000, 12000, 12000, 12000
|
||||
DCD 51000, 51000, 51000, 51000
|
||||
|
||||
;void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
|
||||
|vp8_short_fdct4x4_neon| PROC
|
||||
|
||||
; Part one
|
||||
vld1.16 {d0}, [r0@64], r2
|
||||
adr r12, coeff
|
||||
vld1.16 {d1}, [r0@64], r2
|
||||
vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217
|
||||
vld1.16 {d2}, [r0@64], r2
|
||||
vld1.32 {q9, q10}, [r12@128]! ; q9=14500, q10=7500
|
||||
vld1.16 {d3}, [r0@64], r2
|
||||
|
||||
; transpose d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
|
||||
vtrn.32 d0, d2
|
||||
vtrn.32 d1, d3
|
||||
vld1.32 {q11,q12}, [r12@128] ; q11=12000, q12=51000
|
||||
vtrn.16 d0, d1
|
||||
vtrn.16 d2, d3
|
||||
|
||||
vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[3]
|
||||
vadd.s16 d5, d1, d2 ; b1 = ip[1] + ip[2]
|
||||
vsub.s16 d6, d1, d2 ; c1 = ip[1] - ip[2]
|
||||
vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[3]
|
||||
|
||||
vshl.s16 q2, q2, #3 ; (a1, b1) << 3
|
||||
vshl.s16 q3, q3, #3 ; (c1, d1) << 3
|
||||
|
||||
vadd.s16 d0, d4, d5 ; op[0] = a1 + b1
|
||||
vsub.s16 d2, d4, d5 ; op[2] = a1 - b1
|
||||
|
||||
vmlal.s16 q9, d7, d16 ; d1*5352 + 14500
|
||||
vmlal.s16 q10, d7, d17 ; d1*2217 + 7500
|
||||
vmlal.s16 q9, d6, d17 ; c1*2217 + d1*5352 + 14500
|
||||
vmlsl.s16 q10, d6, d16 ; d1*2217 - c1*5352 + 7500
|
||||
|
||||
vshrn.s32 d1, q9, #12 ; op[1] = (c1*2217 + d1*5352 + 14500)>>12
|
||||
vshrn.s32 d3, q10, #12 ; op[3] = (d1*2217 - c1*5352 + 7500)>>12
|
||||
|
||||
|
||||
; Part two
|
||||
|
||||
; transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
|
||||
vtrn.32 d0, d2
|
||||
vtrn.32 d1, d3
|
||||
vtrn.16 d0, d1
|
||||
vtrn.16 d2, d3
|
||||
|
||||
vmov.s16 d26, #7
|
||||
|
||||
vadd.s16 d4, d0, d3 ; a1 = ip[0] + ip[12]
|
||||
vadd.s16 d5, d1, d2 ; b1 = ip[4] + ip[8]
|
||||
vsub.s16 d6, d1, d2 ; c1 = ip[4] - ip[8]
|
||||
vadd.s16 d4, d4, d26 ; a1 + 7
|
||||
vsub.s16 d7, d0, d3 ; d1 = ip[0] - ip[12]
|
||||
|
||||
vadd.s16 d0, d4, d5 ; op[0] = a1 + b1 + 7
|
||||
vsub.s16 d2, d4, d5 ; op[8] = a1 - b1 + 7
|
||||
|
||||
vmlal.s16 q11, d7, d16 ; d1*5352 + 12000
|
||||
vmlal.s16 q12, d7, d17 ; d1*2217 + 51000
|
||||
|
||||
vceq.s16 d4, d7, #0
|
||||
|
||||
vshr.s16 d0, d0, #4
|
||||
vshr.s16 d2, d2, #4
|
||||
|
||||
vmlal.s16 q11, d6, d17 ; c1*2217 + d1*5352 + 12000
|
||||
vmlsl.s16 q12, d6, d16 ; d1*2217 - c1*5352 + 51000
|
||||
|
||||
vmvn d4, d4
|
||||
vshrn.s32 d1, q11, #16 ; op[4] = (c1*2217 + d1*5352 + 12000)>>16
|
||||
vsub.s16 d1, d1, d4 ; op[4] += (d1!=0)
|
||||
vshrn.s32 d3, q12, #16 ; op[12]= (d1*2217 - c1*5352 + 51000)>>16
|
||||
|
||||
vst1.16 {q0, q1}, [r1@128]
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
;void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
|
||||
|vp8_short_fdct8x4_neon| PROC
|
||||
|
||||
; Part one
|
||||
|
||||
vld1.16 {q0}, [r0@128], r2
|
||||
adr r12, coeff
|
||||
vld1.16 {q1}, [r0@128], r2
|
||||
vld1.16 {q8}, [r12@128]! ; d16=5352, d17=2217
|
||||
vld1.16 {q2}, [r0@128], r2
|
||||
vld1.32 {q9, q10}, [r12@128]! ; q9=14500, q10=7500
|
||||
vld1.16 {q3}, [r0@128], r2
|
||||
|
||||
; transpose q0=ip[0], q1=ip[1], q2=ip[2], q3=ip[3]
|
||||
vtrn.32 q0, q2 ; [A0|B0]
|
||||
vtrn.32 q1, q3 ; [A1|B1]
|
||||
vtrn.16 q0, q1 ; [A2|B2]
|
||||
vtrn.16 q2, q3 ; [A3|B3]
|
||||
|
||||
vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[3]
|
||||
vadd.s16 q12, q1, q2 ; b1 = ip[1] + ip[2]
|
||||
vsub.s16 q13, q1, q2 ; c1 = ip[1] - ip[2]
|
||||
vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[3]
|
||||
|
||||
vshl.s16 q11, q11, #3 ; a1 << 3
|
||||
vshl.s16 q12, q12, #3 ; b1 << 3
|
||||
vshl.s16 q13, q13, #3 ; c1 << 3
|
||||
vshl.s16 q14, q14, #3 ; d1 << 3
|
||||
|
||||
vadd.s16 q0, q11, q12 ; [A0 | B0] = a1 + b1
|
||||
vsub.s16 q2, q11, q12 ; [A2 | B2] = a1 - b1
|
||||
|
||||
vmov.s16 q11, q9 ; 14500
|
||||
vmov.s16 q12, q10 ; 7500
|
||||
|
||||
vmlal.s16 q9, d28, d16 ; A[1] = d1*5352 + 14500
|
||||
vmlal.s16 q10, d28, d17 ; A[3] = d1*2217 + 7500
|
||||
vmlal.s16 q11, d29, d16 ; B[1] = d1*5352 + 14500
|
||||
vmlal.s16 q12, d29, d17 ; B[3] = d1*2217 + 7500
|
||||
|
||||
vmlal.s16 q9, d26, d17 ; A[1] = c1*2217 + d1*5352 + 14500
|
||||
vmlsl.s16 q10, d26, d16 ; A[3] = d1*2217 - c1*5352 + 7500
|
||||
vmlal.s16 q11, d27, d17 ; B[1] = c1*2217 + d1*5352 + 14500
|
||||
vmlsl.s16 q12, d27, d16 ; B[3] = d1*2217 - c1*5352 + 7500
|
||||
|
||||
vshrn.s32 d2, q9, #12 ; A[1] = (c1*2217 + d1*5352 + 14500)>>12
|
||||
vshrn.s32 d6, q10, #12 ; A[3] = (d1*2217 - c1*5352 + 7500)>>12
|
||||
vshrn.s32 d3, q11, #12 ; B[1] = (c1*2217 + d1*5352 + 14500)>>12
|
||||
vshrn.s32 d7, q12, #12 ; B[3] = (d1*2217 - c1*5352 + 7500)>>12
|
||||
|
||||
|
||||
; Part two
|
||||
vld1.32 {q9,q10}, [r12@128] ; q9=12000, q10=51000
|
||||
|
||||
; transpose q0=ip[0], q1=ip[4], q2=ip[8], q3=ip[12]
|
||||
vtrn.32 q0, q2 ; q0=[A0 | B0]
|
||||
vtrn.32 q1, q3 ; q1=[A4 | B4]
|
||||
vtrn.16 q0, q1 ; q2=[A8 | B8]
|
||||
vtrn.16 q2, q3 ; q3=[A12|B12]
|
||||
|
||||
vmov.s16 q15, #7
|
||||
|
||||
vadd.s16 q11, q0, q3 ; a1 = ip[0] + ip[12]
|
||||
vadd.s16 q12, q1, q2 ; b1 = ip[4] + ip[8]
|
||||
vadd.s16 q11, q11, q15 ; a1 + 7
|
||||
vsub.s16 q13, q1, q2 ; c1 = ip[4] - ip[8]
|
||||
vsub.s16 q14, q0, q3 ; d1 = ip[0] - ip[12]
|
||||
|
||||
vadd.s16 q0, q11, q12 ; a1 + b1 + 7
|
||||
vsub.s16 q1, q11, q12 ; a1 - b1 + 7
|
||||
|
||||
vmov.s16 q11, q9 ; 12000
|
||||
vmov.s16 q12, q10 ; 51000
|
||||
|
||||
vshr.s16 d0, d0, #4 ; A[0] = (a1 + b1 + 7)>>4
|
||||
vshr.s16 d4, d1, #4 ; B[0] = (a1 + b1 + 7)>>4
|
||||
vshr.s16 d2, d2, #4 ; A[8] = (a1 + b1 + 7)>>4
|
||||
vshr.s16 d6, d3, #4 ; B[8] = (a1 + b1 + 7)>>4
|
||||
|
||||
|
||||
vmlal.s16 q9, d28, d16 ; A[4] = d1*5352 + 12000
|
||||
vmlal.s16 q10, d28, d17 ; A[12] = d1*2217 + 51000
|
||||
vmlal.s16 q11, d29, d16 ; B[4] = d1*5352 + 12000
|
||||
vmlal.s16 q12, d29, d17 ; B[12] = d1*2217 + 51000
|
||||
|
||||
vceq.s16 q14, q14, #0
|
||||
|
||||
vmlal.s16 q9, d26, d17 ; A[4] = c1*2217 + d1*5352 + 12000
|
||||
vmlsl.s16 q10, d26, d16 ; A[12] = d1*2217 - c1*5352 + 51000
|
||||
vmlal.s16 q11, d27, d17 ; B[4] = c1*2217 + d1*5352 + 12000
|
||||
vmlsl.s16 q12, d27, d16 ; B[12] = d1*2217 - c1*5352 + 51000
|
||||
|
||||
vmvn q14, q14
|
||||
|
||||
vshrn.s32 d1, q9, #16 ; A[4] = (c1*2217 + d1*5352 + 12000)>>16
|
||||
vshrn.s32 d3, q10, #16 ; A[12]= (d1*2217 - c1*5352 + 51000)>>16
|
||||
vsub.s16 d1, d1, d28 ; A[4] += (d1!=0)
|
||||
|
||||
vshrn.s32 d5, q11, #16 ; B[4] = (c1*2217 + d1*5352 + 12000)>>16
|
||||
vshrn.s32 d7, q12, #16 ; B[12]= (d1*2217 - c1*5352 + 51000)>>16
|
||||
vsub.s16 d5, d5, d29 ; B[4] += (d1!=0)
|
||||
|
||||
vst1.16 {q0, q1}, [r1@128]! ; block A
|
||||
vst1.16 {q2, q3}, [r1@128]! ; block B
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
END
|
||||
|
|
@ -1,269 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
void vp8_short_fdct4x4_neon(
|
||||
int16_t *input,
|
||||
int16_t *output,
|
||||
int pitch) {
|
||||
int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
|
||||
int16x4_t d16s16, d17s16, d26s16, dEmptys16;
|
||||
uint16x4_t d4u16;
|
||||
int16x8_t q0s16, q1s16;
|
||||
int32x4_t q9s32, q10s32, q11s32, q12s32;
|
||||
int16x4x2_t v2tmp0, v2tmp1;
|
||||
int32x2x2_t v2tmp2, v2tmp3;
|
||||
|
||||
d16s16 = vdup_n_s16(5352);
|
||||
d17s16 = vdup_n_s16(2217);
|
||||
q9s32 = vdupq_n_s32(14500);
|
||||
q10s32 = vdupq_n_s32(7500);
|
||||
q11s32 = vdupq_n_s32(12000);
|
||||
q12s32 = vdupq_n_s32(51000);
|
||||
|
||||
// Part one
|
||||
pitch >>= 1;
|
||||
d0s16 = vld1_s16(input);
|
||||
input += pitch;
|
||||
d1s16 = vld1_s16(input);
|
||||
input += pitch;
|
||||
d2s16 = vld1_s16(input);
|
||||
input += pitch;
|
||||
d3s16 = vld1_s16(input);
|
||||
|
||||
v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
|
||||
vreinterpret_s32_s16(d2s16));
|
||||
v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
|
||||
vreinterpret_s32_s16(d3s16));
|
||||
v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]), // d0
|
||||
vreinterpret_s16_s32(v2tmp3.val[0])); // d1
|
||||
v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]), // d2
|
||||
vreinterpret_s16_s32(v2tmp3.val[1])); // d3
|
||||
|
||||
d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
|
||||
d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
|
||||
d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
|
||||
d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
|
||||
|
||||
d4s16 = vshl_n_s16(d4s16, 3);
|
||||
d5s16 = vshl_n_s16(d5s16, 3);
|
||||
d6s16 = vshl_n_s16(d6s16, 3);
|
||||
d7s16 = vshl_n_s16(d7s16, 3);
|
||||
|
||||
d0s16 = vadd_s16(d4s16, d5s16);
|
||||
d2s16 = vsub_s16(d4s16, d5s16);
|
||||
|
||||
q9s32 = vmlal_s16(q9s32, d7s16, d16s16);
|
||||
q10s32 = vmlal_s16(q10s32, d7s16, d17s16);
|
||||
q9s32 = vmlal_s16(q9s32, d6s16, d17s16);
|
||||
q10s32 = vmlsl_s16(q10s32, d6s16, d16s16);
|
||||
|
||||
d1s16 = vshrn_n_s32(q9s32, 12);
|
||||
d3s16 = vshrn_n_s32(q10s32, 12);
|
||||
|
||||
// Part two
|
||||
v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
|
||||
vreinterpret_s32_s16(d2s16));
|
||||
v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
|
||||
vreinterpret_s32_s16(d3s16));
|
||||
v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]), // d0
|
||||
vreinterpret_s16_s32(v2tmp3.val[0])); // d1
|
||||
v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]), // d2
|
||||
vreinterpret_s16_s32(v2tmp3.val[1])); // d3
|
||||
|
||||
d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
|
||||
d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
|
||||
d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
|
||||
d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);
|
||||
|
||||
d26s16 = vdup_n_s16(7);
|
||||
d4s16 = vadd_s16(d4s16, d26s16);
|
||||
|
||||
d0s16 = vadd_s16(d4s16, d5s16);
|
||||
d2s16 = vsub_s16(d4s16, d5s16);
|
||||
|
||||
q11s32 = vmlal_s16(q11s32, d7s16, d16s16);
|
||||
q12s32 = vmlal_s16(q12s32, d7s16, d17s16);
|
||||
|
||||
dEmptys16 = vdup_n_s16(0);
|
||||
d4u16 = vceq_s16(d7s16, dEmptys16);
|
||||
|
||||
d0s16 = vshr_n_s16(d0s16, 4);
|
||||
d2s16 = vshr_n_s16(d2s16, 4);
|
||||
|
||||
q11s32 = vmlal_s16(q11s32, d6s16, d17s16);
|
||||
q12s32 = vmlsl_s16(q12s32, d6s16, d16s16);
|
||||
|
||||
d4u16 = vmvn_u16(d4u16);
|
||||
d1s16 = vshrn_n_s32(q11s32, 16);
|
||||
d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d4u16));
|
||||
d3s16 = vshrn_n_s32(q12s32, 16);
|
||||
|
||||
q0s16 = vcombine_s16(d0s16, d1s16);
|
||||
q1s16 = vcombine_s16(d2s16, d3s16);
|
||||
|
||||
vst1q_s16(output, q0s16);
|
||||
vst1q_s16(output + 8, q1s16);
|
||||
return;
|
||||
}
|
||||
|
||||
void vp8_short_fdct8x4_neon(
|
||||
int16_t *input,
|
||||
int16_t *output,
|
||||
int pitch) {
|
||||
int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
|
||||
int16x4_t d16s16, d17s16, d26s16, d27s16, d28s16, d29s16;
|
||||
uint16x4_t d28u16, d29u16;
|
||||
uint16x8_t q14u16;
|
||||
int16x8_t q0s16, q1s16, q2s16, q3s16;
|
||||
int16x8_t q11s16, q12s16, q13s16, q14s16, q15s16, qEmptys16;
|
||||
int32x4_t q9s32, q10s32, q11s32, q12s32;
|
||||
int16x8x2_t v2tmp0, v2tmp1;
|
||||
int32x4x2_t v2tmp2, v2tmp3;
|
||||
|
||||
d16s16 = vdup_n_s16(5352);
|
||||
d17s16 = vdup_n_s16(2217);
|
||||
q9s32 = vdupq_n_s32(14500);
|
||||
q10s32 = vdupq_n_s32(7500);
|
||||
|
||||
// Part one
|
||||
pitch >>= 1;
|
||||
q0s16 = vld1q_s16(input);
|
||||
input += pitch;
|
||||
q1s16 = vld1q_s16(input);
|
||||
input += pitch;
|
||||
q2s16 = vld1q_s16(input);
|
||||
input += pitch;
|
||||
q3s16 = vld1q_s16(input);
|
||||
|
||||
v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16),
|
||||
vreinterpretq_s32_s16(q2s16));
|
||||
v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16),
|
||||
vreinterpretq_s32_s16(q3s16));
|
||||
v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]), // q0
|
||||
vreinterpretq_s16_s32(v2tmp3.val[0])); // q1
|
||||
v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]), // q2
|
||||
vreinterpretq_s16_s32(v2tmp3.val[1])); // q3
|
||||
|
||||
q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]);
|
||||
q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]);
|
||||
q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]);
|
||||
q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]);
|
||||
|
||||
q11s16 = vshlq_n_s16(q11s16, 3);
|
||||
q12s16 = vshlq_n_s16(q12s16, 3);
|
||||
q13s16 = vshlq_n_s16(q13s16, 3);
|
||||
q14s16 = vshlq_n_s16(q14s16, 3);
|
||||
|
||||
q0s16 = vaddq_s16(q11s16, q12s16);
|
||||
q2s16 = vsubq_s16(q11s16, q12s16);
|
||||
|
||||
q11s32 = q9s32;
|
||||
q12s32 = q10s32;
|
||||
|
||||
d26s16 = vget_low_s16(q13s16);
|
||||
d27s16 = vget_high_s16(q13s16);
|
||||
d28s16 = vget_low_s16(q14s16);
|
||||
d29s16 = vget_high_s16(q14s16);
|
||||
|
||||
q9s32 = vmlal_s16(q9s32, d28s16, d16s16);
|
||||
q10s32 = vmlal_s16(q10s32, d28s16, d17s16);
|
||||
q11s32 = vmlal_s16(q11s32, d29s16, d16s16);
|
||||
q12s32 = vmlal_s16(q12s32, d29s16, d17s16);
|
||||
|
||||
q9s32 = vmlal_s16(q9s32, d26s16, d17s16);
|
||||
q10s32 = vmlsl_s16(q10s32, d26s16, d16s16);
|
||||
q11s32 = vmlal_s16(q11s32, d27s16, d17s16);
|
||||
q12s32 = vmlsl_s16(q12s32, d27s16, d16s16);
|
||||
|
||||
d2s16 = vshrn_n_s32(q9s32, 12);
|
||||
d6s16 = vshrn_n_s32(q10s32, 12);
|
||||
d3s16 = vshrn_n_s32(q11s32, 12);
|
||||
d7s16 = vshrn_n_s32(q12s32, 12);
|
||||
q1s16 = vcombine_s16(d2s16, d3s16);
|
||||
q3s16 = vcombine_s16(d6s16, d7s16);
|
||||
|
||||
// Part two
|
||||
q9s32 = vdupq_n_s32(12000);
|
||||
q10s32 = vdupq_n_s32(51000);
|
||||
|
||||
v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16),
|
||||
vreinterpretq_s32_s16(q2s16));
|
||||
v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16),
|
||||
vreinterpretq_s32_s16(q3s16));
|
||||
v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]), // q0
|
||||
vreinterpretq_s16_s32(v2tmp3.val[0])); // q1
|
||||
v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]), // q2
|
||||
vreinterpretq_s16_s32(v2tmp3.val[1])); // q3
|
||||
|
||||
q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]);
|
||||
q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]);
|
||||
q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]);
|
||||
q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]);
|
||||
|
||||
q15s16 = vdupq_n_s16(7);
|
||||
q11s16 = vaddq_s16(q11s16, q15s16);
|
||||
q0s16 = vaddq_s16(q11s16, q12s16);
|
||||
q1s16 = vsubq_s16(q11s16, q12s16);
|
||||
|
||||
q11s32 = q9s32;
|
||||
q12s32 = q10s32;
|
||||
|
||||
d0s16 = vget_low_s16(q0s16);
|
||||
d1s16 = vget_high_s16(q0s16);
|
||||
d2s16 = vget_low_s16(q1s16);
|
||||
d3s16 = vget_high_s16(q1s16);
|
||||
|
||||
d0s16 = vshr_n_s16(d0s16, 4);
|
||||
d4s16 = vshr_n_s16(d1s16, 4);
|
||||
d2s16 = vshr_n_s16(d2s16, 4);
|
||||
d6s16 = vshr_n_s16(d3s16, 4);
|
||||
|
||||
d26s16 = vget_low_s16(q13s16);
|
||||
d27s16 = vget_high_s16(q13s16);
|
||||
d28s16 = vget_low_s16(q14s16);
|
||||
d29s16 = vget_high_s16(q14s16);
|
||||
|
||||
q9s32 = vmlal_s16(q9s32, d28s16, d16s16);
|
||||
q10s32 = vmlal_s16(q10s32, d28s16, d17s16);
|
||||
q11s32 = vmlal_s16(q11s32, d29s16, d16s16);
|
||||
q12s32 = vmlal_s16(q12s32, d29s16, d17s16);
|
||||
|
||||
q9s32 = vmlal_s16(q9s32, d26s16, d17s16);
|
||||
q10s32 = vmlsl_s16(q10s32, d26s16, d16s16);
|
||||
q11s32 = vmlal_s16(q11s32, d27s16, d17s16);
|
||||
q12s32 = vmlsl_s16(q12s32, d27s16, d16s16);
|
||||
|
||||
d1s16 = vshrn_n_s32(q9s32, 16);
|
||||
d3s16 = vshrn_n_s32(q10s32, 16);
|
||||
d5s16 = vshrn_n_s32(q11s32, 16);
|
||||
d7s16 = vshrn_n_s32(q12s32, 16);
|
||||
|
||||
qEmptys16 = vdupq_n_s16(0);
|
||||
q14u16 = vceqq_s16(q14s16, qEmptys16);
|
||||
q14u16 = vmvnq_u16(q14u16);
|
||||
|
||||
d28u16 = vget_low_u16(q14u16);
|
||||
d29u16 = vget_high_u16(q14u16);
|
||||
d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d28u16));
|
||||
d5s16 = vsub_s16(d5s16, vreinterpret_s16_u16(d29u16));
|
||||
|
||||
q0s16 = vcombine_s16(d0s16, d1s16);
|
||||
q1s16 = vcombine_s16(d2s16, d3s16);
|
||||
q2s16 = vcombine_s16(d4s16, d5s16);
|
||||
q3s16 = vcombine_s16(d6s16, d7s16);
|
||||
|
||||
vst1q_s16(output, q0s16);
|
||||
vst1q_s16(output + 8, q1s16);
|
||||
vst1q_s16(output + 16, q2s16);
|
||||
vst1q_s16(output + 24, q3s16);
|
||||
return;
|
||||
}
|
|
@ -0,0 +1,199 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
EXPORT |vp8_subtract_b_neon|
|
||||
EXPORT |vp8_subtract_mby_neon|
|
||||
EXPORT |vp8_subtract_mbuv_neon|
|
||||
|
||||
INCLUDE vp8_asm_enc_offsets.asm
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
;void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch)
|
||||
|vp8_subtract_b_neon| PROC
|
||||
|
||||
stmfd sp!, {r4-r7}
|
||||
|
||||
ldr r3, [r0, #vp8_block_base_src]
|
||||
ldr r4, [r0, #vp8_block_src]
|
||||
ldr r5, [r0, #vp8_block_src_diff]
|
||||
ldr r3, [r3]
|
||||
ldr r6, [r0, #vp8_block_src_stride]
|
||||
add r3, r3, r4 ; src = *base_src + src
|
||||
ldr r7, [r1, #vp8_blockd_predictor]
|
||||
|
||||
vld1.8 {d0}, [r3], r6 ;load src
|
||||
vld1.8 {d1}, [r7], r2 ;load pred
|
||||
vld1.8 {d2}, [r3], r6
|
||||
vld1.8 {d3}, [r7], r2
|
||||
vld1.8 {d4}, [r3], r6
|
||||
vld1.8 {d5}, [r7], r2
|
||||
vld1.8 {d6}, [r3], r6
|
||||
vld1.8 {d7}, [r7], r2
|
||||
|
||||
vsubl.u8 q10, d0, d1
|
||||
vsubl.u8 q11, d2, d3
|
||||
vsubl.u8 q12, d4, d5
|
||||
vsubl.u8 q13, d6, d7
|
||||
|
||||
mov r2, r2, lsl #1
|
||||
|
||||
vst1.16 {d20}, [r5], r2 ;store diff
|
||||
vst1.16 {d22}, [r5], r2
|
||||
vst1.16 {d24}, [r5], r2
|
||||
vst1.16 {d26}, [r5], r2
|
||||
|
||||
ldmfd sp!, {r4-r7}
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
|
||||
;==========================================
|
||||
;void vp8_subtract_mby_neon(short *diff, unsigned char *src, int src_stride
|
||||
; unsigned char *pred, int pred_stride)
|
||||
|vp8_subtract_mby_neon| PROC
|
||||
push {r4-r7}
|
||||
mov r12, #4
|
||||
ldr r4, [sp, #16] ; pred_stride
|
||||
mov r6, #32 ; "diff" stride x2
|
||||
add r5, r0, #16 ; second diff pointer
|
||||
|
||||
subtract_mby_loop
|
||||
vld1.8 {q0}, [r1], r2 ;load src
|
||||
vld1.8 {q1}, [r3], r4 ;load pred
|
||||
vld1.8 {q2}, [r1], r2
|
||||
vld1.8 {q3}, [r3], r4
|
||||
vld1.8 {q4}, [r1], r2
|
||||
vld1.8 {q5}, [r3], r4
|
||||
vld1.8 {q6}, [r1], r2
|
||||
vld1.8 {q7}, [r3], r4
|
||||
|
||||
vsubl.u8 q8, d0, d2
|
||||
vsubl.u8 q9, d1, d3
|
||||
vsubl.u8 q10, d4, d6
|
||||
vsubl.u8 q11, d5, d7
|
||||
vsubl.u8 q12, d8, d10
|
||||
vsubl.u8 q13, d9, d11
|
||||
vsubl.u8 q14, d12, d14
|
||||
vsubl.u8 q15, d13, d15
|
||||
|
||||
vst1.16 {q8}, [r0], r6 ;store diff
|
||||
vst1.16 {q9}, [r5], r6
|
||||
vst1.16 {q10}, [r0], r6
|
||||
vst1.16 {q11}, [r5], r6
|
||||
vst1.16 {q12}, [r0], r6
|
||||
vst1.16 {q13}, [r5], r6
|
||||
vst1.16 {q14}, [r0], r6
|
||||
vst1.16 {q15}, [r5], r6
|
||||
|
||||
subs r12, r12, #1
|
||||
bne subtract_mby_loop
|
||||
|
||||
pop {r4-r7}
|
||||
bx lr
|
||||
ENDP
|
||||
|
||||
;=================================
|
||||
;void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc,
|
||||
; int src_stride, unsigned char *upred,
|
||||
; unsigned char *vpred, int pred_stride)
|
||||
|
||||
|vp8_subtract_mbuv_neon| PROC
|
||||
push {r4-r7}
|
||||
ldr r4, [sp, #16] ; upred
|
||||
ldr r5, [sp, #20] ; vpred
|
||||
ldr r6, [sp, #24] ; pred_stride
|
||||
add r0, r0, #512 ; short *udiff = diff + 256;
|
||||
mov r12, #32 ; "diff" stride x2
|
||||
add r7, r0, #16 ; second diff pointer
|
||||
|
||||
;u
|
||||
vld1.8 {d0}, [r1], r3 ;load usrc
|
||||
vld1.8 {d1}, [r4], r6 ;load upred
|
||||
vld1.8 {d2}, [r1], r3
|
||||
vld1.8 {d3}, [r4], r6
|
||||
vld1.8 {d4}, [r1], r3
|
||||
vld1.8 {d5}, [r4], r6
|
||||
vld1.8 {d6}, [r1], r3
|
||||
vld1.8 {d7}, [r4], r6
|
||||
vld1.8 {d8}, [r1], r3
|
||||
vld1.8 {d9}, [r4], r6
|
||||
vld1.8 {d10}, [r1], r3
|
||||
vld1.8 {d11}, [r4], r6
|
||||
vld1.8 {d12}, [r1], r3
|
||||
vld1.8 {d13}, [r4], r6
|
||||
vld1.8 {d14}, [r1], r3
|
||||
vld1.8 {d15}, [r4], r6
|
||||
|
||||
vsubl.u8 q8, d0, d1
|
||||
vsubl.u8 q9, d2, d3
|
||||
vsubl.u8 q10, d4, d5
|
||||
vsubl.u8 q11, d6, d7
|
||||
vsubl.u8 q12, d8, d9
|
||||
vsubl.u8 q13, d10, d11
|
||||
vsubl.u8 q14, d12, d13
|
||||
vsubl.u8 q15, d14, d15
|
||||
|
||||
vst1.16 {q8}, [r0], r12 ;store diff
|
||||
vst1.16 {q9}, [r7], r12
|
||||
vst1.16 {q10}, [r0], r12
|
||||
vst1.16 {q11}, [r7], r12
|
||||
vst1.16 {q12}, [r0], r12
|
||||
vst1.16 {q13}, [r7], r12
|
||||
vst1.16 {q14}, [r0], r12
|
||||
vst1.16 {q15}, [r7], r12
|
||||
|
||||
;v
|
||||
vld1.8 {d0}, [r2], r3 ;load vsrc
|
||||
vld1.8 {d1}, [r5], r6 ;load vpred
|
||||
vld1.8 {d2}, [r2], r3
|
||||
vld1.8 {d3}, [r5], r6
|
||||
vld1.8 {d4}, [r2], r3
|
||||
vld1.8 {d5}, [r5], r6
|
||||
vld1.8 {d6}, [r2], r3
|
||||
vld1.8 {d7}, [r5], r6
|
||||
vld1.8 {d8}, [r2], r3
|
||||
vld1.8 {d9}, [r5], r6
|
||||
vld1.8 {d10}, [r2], r3
|
||||
vld1.8 {d11}, [r5], r6
|
||||
vld1.8 {d12}, [r2], r3
|
||||
vld1.8 {d13}, [r5], r6
|
||||
vld1.8 {d14}, [r2], r3
|
||||
vld1.8 {d15}, [r5], r6
|
||||
|
||||
vsubl.u8 q8, d0, d1
|
||||
vsubl.u8 q9, d2, d3
|
||||
vsubl.u8 q10, d4, d5
|
||||
vsubl.u8 q11, d6, d7
|
||||
vsubl.u8 q12, d8, d9
|
||||
vsubl.u8 q13, d10, d11
|
||||
vsubl.u8 q14, d12, d13
|
||||
vsubl.u8 q15, d14, d15
|
||||
|
||||
vst1.16 {q8}, [r0], r12 ;store diff
|
||||
vst1.16 {q9}, [r7], r12
|
||||
vst1.16 {q10}, [r0], r12
|
||||
vst1.16 {q11}, [r7], r12
|
||||
vst1.16 {q12}, [r0], r12
|
||||
vst1.16 {q13}, [r7], r12
|
||||
vst1.16 {q14}, [r0], r12
|
||||
vst1.16 {q15}, [r7], r12
|
||||
|
||||
pop {r4-r7}
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
END
|
|
@ -1,154 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include "vp8/encoder/block.h"
|
||||
|
||||
void vp8_subtract_b_neon(
|
||||
BLOCK *be,
|
||||
BLOCKD *bd,
|
||||
int pitch) {
|
||||
unsigned char *src_ptr, *predictor;
|
||||
int src_stride;
|
||||
int16_t *src_diff;
|
||||
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
|
||||
uint16x8_t q10u16, q11u16, q12u16, q13u16;
|
||||
|
||||
src_ptr = *be->base_src + be->src;
|
||||
src_stride = be->src_stride;
|
||||
predictor = bd->predictor;
|
||||
|
||||
d0u8 = vld1_u8(src_ptr);
|
||||
src_ptr += src_stride;
|
||||
d2u8 = vld1_u8(src_ptr);
|
||||
src_ptr += src_stride;
|
||||
d4u8 = vld1_u8(src_ptr);
|
||||
src_ptr += src_stride;
|
||||
d6u8 = vld1_u8(src_ptr);
|
||||
|
||||
d1u8 = vld1_u8(predictor);
|
||||
predictor += pitch;
|
||||
d3u8 = vld1_u8(predictor);
|
||||
predictor += pitch;
|
||||
d5u8 = vld1_u8(predictor);
|
||||
predictor += pitch;
|
||||
d7u8 = vld1_u8(predictor);
|
||||
|
||||
q10u16 = vsubl_u8(d0u8, d1u8);
|
||||
q11u16 = vsubl_u8(d2u8, d3u8);
|
||||
q12u16 = vsubl_u8(d4u8, d5u8);
|
||||
q13u16 = vsubl_u8(d6u8, d7u8);
|
||||
|
||||
src_diff = be->src_diff;
|
||||
vst1_u16((uint16_t *)src_diff, vget_low_u16(q10u16));
|
||||
src_diff += pitch;
|
||||
vst1_u16((uint16_t *)src_diff, vget_low_u16(q11u16));
|
||||
src_diff += pitch;
|
||||
vst1_u16((uint16_t *)src_diff, vget_low_u16(q12u16));
|
||||
src_diff += pitch;
|
||||
vst1_u16((uint16_t *)src_diff, vget_low_u16(q13u16));
|
||||
return;
|
||||
}
|
||||
|
||||
void vp8_subtract_mby_neon(
|
||||
int16_t *diff,
|
||||
unsigned char *src,
|
||||
int src_stride,
|
||||
unsigned char *pred,
|
||||
int pred_stride) {
|
||||
int i;
|
||||
uint8x16_t q0u8, q1u8, q2u8, q3u8;
|
||||
uint16x8_t q8u16, q9u16, q10u16, q11u16;
|
||||
|
||||
for (i = 0; i < 8; i++) { // subtract_mby_loop
|
||||
q0u8 = vld1q_u8(src);
|
||||
src += src_stride;
|
||||
q2u8 = vld1q_u8(src);
|
||||
src += src_stride;
|
||||
q1u8 = vld1q_u8(pred);
|
||||
pred += pred_stride;
|
||||
q3u8 = vld1q_u8(pred);
|
||||
pred += pred_stride;
|
||||
|
||||
q8u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q1u8));
|
||||
q9u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q1u8));
|
||||
q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q3u8));
|
||||
q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q3u8));
|
||||
|
||||
vst1q_u16((uint16_t *)diff, q8u16);
|
||||
diff += 8;
|
||||
vst1q_u16((uint16_t *)diff, q9u16);
|
||||
diff += 8;
|
||||
vst1q_u16((uint16_t *)diff, q10u16);
|
||||
diff += 8;
|
||||
vst1q_u16((uint16_t *)diff, q11u16);
|
||||
diff += 8;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
void vp8_subtract_mbuv_neon(
|
||||
int16_t *diff,
|
||||
unsigned char *usrc,
|
||||
unsigned char *vsrc,
|
||||
int src_stride,
|
||||
unsigned char *upred,
|
||||
unsigned char *vpred,
|
||||
int pred_stride) {
|
||||
int i, j;
|
||||
unsigned char *src_ptr, *pred_ptr;
|
||||
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
|
||||
uint16x8_t q8u16, q9u16, q10u16, q11u16;
|
||||
|
||||
diff += 256;
|
||||
for (i = 0; i < 2; i++) {
|
||||
if (i == 0) {
|
||||
src_ptr = usrc;
|
||||
pred_ptr = upred;
|
||||
} else if (i == 1) {
|
||||
src_ptr = vsrc;
|
||||
pred_ptr = vpred;
|
||||
}
|
||||
|
||||
for (j = 0; j < 2; j++) {
|
||||
d0u8 = vld1_u8(src_ptr);
|
||||
src_ptr += src_stride;
|
||||
d1u8 = vld1_u8(pred_ptr);
|
||||
pred_ptr += pred_stride;
|
||||
d2u8 = vld1_u8(src_ptr);
|
||||
src_ptr += src_stride;
|
||||
d3u8 = vld1_u8(pred_ptr);
|
||||
pred_ptr += pred_stride;
|
||||
d4u8 = vld1_u8(src_ptr);
|
||||
src_ptr += src_stride;
|
||||
d5u8 = vld1_u8(pred_ptr);
|
||||
pred_ptr += pred_stride;
|
||||
d6u8 = vld1_u8(src_ptr);
|
||||
src_ptr += src_stride;
|
||||
d7u8 = vld1_u8(pred_ptr);
|
||||
pred_ptr += pred_stride;
|
||||
|
||||
q8u16 = vsubl_u8(d0u8, d1u8);
|
||||
q9u16 = vsubl_u8(d2u8, d3u8);
|
||||
q10u16 = vsubl_u8(d4u8, d5u8);
|
||||
q11u16 = vsubl_u8(d6u8, d7u8);
|
||||
|
||||
vst1q_u16((uint16_t *)diff, q8u16);
|
||||
diff += 8;
|
||||
vst1q_u16((uint16_t *)diff, q9u16);
|
||||
diff += 8;
|
||||
vst1q_u16((uint16_t *)diff, q10u16);
|
||||
diff += 8;
|
||||
vst1q_u16((uint16_t *)diff, q11u16);
|
||||
diff += 8;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
|
@ -0,0 +1,70 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_memcpy_partial_neon|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
;=========================================
|
||||
;this is not a full memcpy function!!!
|
||||
;void vp8_memcpy_partial_neon(unsigned char *dst_ptr, unsigned char *src_ptr,
|
||||
; int sz);
|
||||
|vp8_memcpy_partial_neon| PROC
|
||||
;pld [r1] ;preload pred data
|
||||
;pld [r1, #128]
|
||||
;pld [r1, #256]
|
||||
;pld [r1, #384]
|
||||
|
||||
mov r12, r2, lsr #8 ;copy 256 bytes data at one time
|
||||
|
||||
memcpy_neon_loop
|
||||
vld1.8 {q0, q1}, [r1]! ;load src data
|
||||
subs r12, r12, #1
|
||||
vld1.8 {q2, q3}, [r1]!
|
||||
vst1.8 {q0, q1}, [r0]! ;copy to dst_ptr
|
||||
vld1.8 {q4, q5}, [r1]!
|
||||
vst1.8 {q2, q3}, [r0]!
|
||||
vld1.8 {q6, q7}, [r1]!
|
||||
vst1.8 {q4, q5}, [r0]!
|
||||
vld1.8 {q8, q9}, [r1]!
|
||||
vst1.8 {q6, q7}, [r0]!
|
||||
vld1.8 {q10, q11}, [r1]!
|
||||
vst1.8 {q8, q9}, [r0]!
|
||||
vld1.8 {q12, q13}, [r1]!
|
||||
vst1.8 {q10, q11}, [r0]!
|
||||
vld1.8 {q14, q15}, [r1]!
|
||||
vst1.8 {q12, q13}, [r0]!
|
||||
vst1.8 {q14, q15}, [r0]!
|
||||
|
||||
;pld [r1] ;preload pred data -- need to adjust for real device
|
||||
;pld [r1, #128]
|
||||
;pld [r1, #256]
|
||||
;pld [r1, #384]
|
||||
|
||||
bne memcpy_neon_loop
|
||||
|
||||
ands r3, r2, #0xff ;extra copy
|
||||
beq done_copy_neon_loop
|
||||
|
||||
extra_copy_neon_loop
|
||||
vld1.8 {q0}, [r1]! ;load src data
|
||||
subs r3, r3, #16
|
||||
vst1.8 {q0}, [r0]!
|
||||
bne extra_copy_neon_loop
|
||||
|
||||
done_copy_neon_loop
|
||||
bx lr
|
||||
ENDP
|
||||
|
||||
END
|
|
@ -0,0 +1,116 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_mse16x16_neon|
|
||||
EXPORT |vp8_get4x4sse_cs_neon|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
;============================
|
||||
; r0 unsigned char *src_ptr
|
||||
; r1 int source_stride
|
||||
; r2 unsigned char *ref_ptr
|
||||
; r3 int recon_stride
|
||||
; stack unsigned int *sse
|
||||
;note: in this function, sum is never used. So, we can remove this part of calculation
|
||||
;from vp8_variance().
|
||||
|
||||
|vp8_mse16x16_neon| PROC
|
||||
vmov.i8 q7, #0 ;q7, q8, q9, q10 - sse
|
||||
vmov.i8 q8, #0
|
||||
vmov.i8 q9, #0
|
||||
vmov.i8 q10, #0
|
||||
|
||||
mov r12, #8
|
||||
|
||||
mse16x16_neon_loop
|
||||
vld1.8 {q0}, [r0], r1 ;Load up source and reference
|
||||
vld1.8 {q2}, [r2], r3
|
||||
vld1.8 {q1}, [r0], r1
|
||||
vld1.8 {q3}, [r2], r3
|
||||
|
||||
vsubl.u8 q11, d0, d4
|
||||
vsubl.u8 q12, d1, d5
|
||||
vsubl.u8 q13, d2, d6
|
||||
vsubl.u8 q14, d3, d7
|
||||
|
||||
vmlal.s16 q7, d22, d22
|
||||
vmlal.s16 q8, d23, d23
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
vmlal.s16 q9, d24, d24
|
||||
vmlal.s16 q10, d25, d25
|
||||
vmlal.s16 q7, d26, d26
|
||||
vmlal.s16 q8, d27, d27
|
||||
vmlal.s16 q9, d28, d28
|
||||
vmlal.s16 q10, d29, d29
|
||||
|
||||
bne mse16x16_neon_loop
|
||||
|
||||
vadd.u32 q7, q7, q8
|
||||
vadd.u32 q9, q9, q10
|
||||
|
||||
ldr r12, [sp] ;load *sse from stack
|
||||
|
||||
vadd.u32 q10, q7, q9
|
||||
vpaddl.u32 q1, q10
|
||||
vadd.u64 d0, d2, d3
|
||||
|
||||
vst1.32 {d0[0]}, [r12]
|
||||
vmov.32 r0, d0[0]
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
|
||||
;=============================
|
||||
; r0 unsigned char *src_ptr,
|
||||
; r1 int source_stride,
|
||||
; r2 unsigned char *ref_ptr,
|
||||
; r3 int recon_stride
|
||||
|vp8_get4x4sse_cs_neon| PROC
|
||||
vld1.8 {d0}, [r0], r1 ;Load up source and reference
|
||||
vld1.8 {d4}, [r2], r3
|
||||
vld1.8 {d1}, [r0], r1
|
||||
vld1.8 {d5}, [r2], r3
|
||||
vld1.8 {d2}, [r0], r1
|
||||
vld1.8 {d6}, [r2], r3
|
||||
vld1.8 {d3}, [r0], r1
|
||||
vld1.8 {d7}, [r2], r3
|
||||
|
||||
vsubl.u8 q11, d0, d4
|
||||
vsubl.u8 q12, d1, d5
|
||||
vsubl.u8 q13, d2, d6
|
||||
vsubl.u8 q14, d3, d7
|
||||
|
||||
vmull.s16 q7, d22, d22
|
||||
vmull.s16 q8, d24, d24
|
||||
vmull.s16 q9, d26, d26
|
||||
vmull.s16 q10, d28, d28
|
||||
|
||||
vadd.u32 q7, q7, q8
|
||||
vadd.u32 q9, q9, q10
|
||||
vadd.u32 q9, q7, q9
|
||||
|
||||
vpaddl.u32 q1, q9
|
||||
vadd.u64 d0, d2, d3
|
||||
|
||||
vmov.32 r0, d0[0]
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
END
|
|
@ -1,131 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
unsigned int vp8_mse16x16_neon(
|
||||
const unsigned char *src_ptr,
|
||||
int source_stride,
|
||||
const unsigned char *ref_ptr,
|
||||
int recon_stride,
|
||||
unsigned int *sse) {
|
||||
int i;
|
||||
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
|
||||
int64x1_t d0s64;
|
||||
uint8x16_t q0u8, q1u8, q2u8, q3u8;
|
||||
int32x4_t q7s32, q8s32, q9s32, q10s32;
|
||||
uint16x8_t q11u16, q12u16, q13u16, q14u16;
|
||||
int64x2_t q1s64;
|
||||
|
||||
q7s32 = vdupq_n_s32(0);
|
||||
q8s32 = vdupq_n_s32(0);
|
||||
q9s32 = vdupq_n_s32(0);
|
||||
q10s32 = vdupq_n_s32(0);
|
||||
|
||||
for (i = 0; i < 8; i++) { // mse16x16_neon_loop
|
||||
q0u8 = vld1q_u8(src_ptr);
|
||||
src_ptr += source_stride;
|
||||
q1u8 = vld1q_u8(src_ptr);
|
||||
src_ptr += source_stride;
|
||||
q2u8 = vld1q_u8(ref_ptr);
|
||||
ref_ptr += recon_stride;
|
||||
q3u8 = vld1q_u8(ref_ptr);
|
||||
ref_ptr += recon_stride;
|
||||
|
||||
q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
|
||||
q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
|
||||
q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
|
||||
q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
|
||||
|
||||
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
|
||||
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
|
||||
q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
|
||||
q8s32 = vmlal_s16(q8s32, d23s16, d23s16);
|
||||
|
||||
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
|
||||
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
|
||||
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
|
||||
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
|
||||
|
||||
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
|
||||
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
|
||||
q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
|
||||
q8s32 = vmlal_s16(q8s32, d27s16, d27s16);
|
||||
|
||||
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
|
||||
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
|
||||
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
|
||||
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
|
||||
}
|
||||
|
||||
q7s32 = vaddq_s32(q7s32, q8s32);
|
||||
q9s32 = vaddq_s32(q9s32, q10s32);
|
||||
q10s32 = vaddq_s32(q7s32, q9s32);
|
||||
|
||||
q1s64 = vpaddlq_s32(q10s32);
|
||||
d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
|
||||
|
||||
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
|
||||
return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
|
||||
}
|
||||
|
||||
unsigned int vp8_get4x4sse_cs_neon(
|
||||
const unsigned char *src_ptr,
|
||||
int source_stride,
|
||||
const unsigned char *ref_ptr,
|
||||
int recon_stride) {
|
||||
int16x4_t d22s16, d24s16, d26s16, d28s16;
|
||||
int64x1_t d0s64;
|
||||
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
|
||||
int32x4_t q7s32, q8s32, q9s32, q10s32;
|
||||
uint16x8_t q11u16, q12u16, q13u16, q14u16;
|
||||
int64x2_t q1s64;
|
||||
|
||||
d0u8 = vld1_u8(src_ptr);
|
||||
src_ptr += source_stride;
|
||||
d4u8 = vld1_u8(ref_ptr);
|
||||
ref_ptr += recon_stride;
|
||||
d1u8 = vld1_u8(src_ptr);
|
||||
src_ptr += source_stride;
|
||||
d5u8 = vld1_u8(ref_ptr);
|
||||
ref_ptr += recon_stride;
|
||||
d2u8 = vld1_u8(src_ptr);
|
||||
src_ptr += source_stride;
|
||||
d6u8 = vld1_u8(ref_ptr);
|
||||
ref_ptr += recon_stride;
|
||||
d3u8 = vld1_u8(src_ptr);
|
||||
src_ptr += source_stride;
|
||||
d7u8 = vld1_u8(ref_ptr);
|
||||
ref_ptr += recon_stride;
|
||||
|
||||
q11u16 = vsubl_u8(d0u8, d4u8);
|
||||
q12u16 = vsubl_u8(d1u8, d5u8);
|
||||
q13u16 = vsubl_u8(d2u8, d6u8);
|
||||
q14u16 = vsubl_u8(d3u8, d7u8);
|
||||
|
||||
d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
|
||||
d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
|
||||
d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
|
||||
d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));
|
||||
|
||||
q7s32 = vmull_s16(d22s16, d22s16);
|
||||
q8s32 = vmull_s16(d24s16, d24s16);
|
||||
q9s32 = vmull_s16(d26s16, d26s16);
|
||||
q10s32 = vmull_s16(d28s16, d28s16);
|
||||
|
||||
q7s32 = vaddq_s32(q7s32, q8s32);
|
||||
q9s32 = vaddq_s32(q9s32, q10s32);
|
||||
q9s32 = vaddq_s32(q7s32, q9s32);
|
||||
|
||||
q1s64 = vpaddlq_s32(q9s32);
|
||||
d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
|
||||
|
||||
return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
|
||||
}
|
|
@ -0,0 +1,103 @@
|
|||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_short_walsh4x4_neon|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
;void vp8_short_walsh4x4_neon(short *input, short *output, int pitch)
|
||||
; r0 short *input,
|
||||
; r1 short *output,
|
||||
; r2 int pitch
|
||||
|vp8_short_walsh4x4_neon| PROC
|
||||
|
||||
vld1.16 {d0}, [r0@64], r2 ; load input
|
||||
vld1.16 {d1}, [r0@64], r2
|
||||
vld1.16 {d2}, [r0@64], r2
|
||||
vld1.16 {d3}, [r0@64]
|
||||
|
||||
;First for-loop
|
||||
;transpose d0, d1, d2, d3. Then, d0=ip[0], d1=ip[1], d2=ip[2], d3=ip[3]
|
||||
vtrn.32 d0, d2
|
||||
vtrn.32 d1, d3
|
||||
|
||||
vmov.s32 q15, #3 ; add 3 to all values
|
||||
|
||||
vtrn.16 d0, d1
|
||||
vtrn.16 d2, d3
|
||||
|
||||
vadd.s16 d4, d0, d2 ; ip[0] + ip[2]
|
||||
vadd.s16 d5, d1, d3 ; ip[1] + ip[3]
|
||||
vsub.s16 d6, d1, d3 ; ip[1] - ip[3]
|
||||
vsub.s16 d7, d0, d2 ; ip[0] - ip[2]
|
||||
|
||||
vshl.s16 d4, d4, #2 ; a1 = (ip[0] + ip[2]) << 2
|
||||
vshl.s16 d5, d5, #2 ; d1 = (ip[1] + ip[3]) << 2
|
||||
vshl.s16 d6, d6, #2 ; c1 = (ip[1] - ip[3]) << 2
|
||||
vceq.s16 d16, d4, #0 ; a1 == 0
|
||||
vshl.s16 d7, d7, #2 ; b1 = (ip[0] - ip[2]) << 2
|
||||
|
||||
vadd.s16 d0, d4, d5 ; a1 + d1
|
||||
vmvn d16, d16 ; a1 != 0
|
||||
vsub.s16 d3, d4, d5 ; op[3] = a1 - d1
|
||||
vadd.s16 d1, d7, d6 ; op[1] = b1 + c1
|
||||
vsub.s16 d2, d7, d6 ; op[2] = b1 - c1
|
||||
vsub.s16 d0, d0, d16 ; op[0] = a1 + d1 + (a1 != 0)
|
||||
|
||||
;Second for-loop
|
||||
;transpose d0, d1, d2, d3, Then, d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
|
||||
vtrn.32 d1, d3
|
||||
vtrn.32 d0, d2
|
||||
vtrn.16 d2, d3
|
||||
vtrn.16 d0, d1
|
||||
|
||||
vaddl.s16 q8, d0, d2 ; a1 = ip[0]+ip[8]
|
||||
vaddl.s16 q9, d1, d3 ; d1 = ip[4]+ip[12]
|
||||
vsubl.s16 q10, d1, d3 ; c1 = ip[4]-ip[12]
|
||||
vsubl.s16 q11, d0, d2 ; b1 = ip[0]-ip[8]
|
||||
|
||||
vadd.s32 q0, q8, q9 ; a2 = a1 + d1
|
||||
vadd.s32 q1, q11, q10 ; b2 = b1 + c1
|
||||
vsub.s32 q2, q11, q10 ; c2 = b1 - c1
|
||||
vsub.s32 q3, q8, q9 ; d2 = a1 - d1
|
||||
|
||||
vclt.s32 q8, q0, #0
|
||||
vclt.s32 q9, q1, #0
|
||||
vclt.s32 q10, q2, #0
|
||||
vclt.s32 q11, q3, #0
|
||||
|
||||
; subtract -1 (or 0)
|
||||
vsub.s32 q0, q0, q8 ; a2 += a2 < 0
|
||||
vsub.s32 q1, q1, q9 ; b2 += b2 < 0
|
||||
vsub.s32 q2, q2, q10 ; c2 += c2 < 0
|
||||
vsub.s32 q3, q3, q11 ; d2 += d2 < 0
|
||||
|
||||
vadd.s32 q8, q0, q15 ; a2 + 3
|
||||
vadd.s32 q9, q1, q15 ; b2 + 3
|
||||
vadd.s32 q10, q2, q15 ; c2 + 3
|
||||
vadd.s32 q11, q3, q15 ; d2 + 3
|
||||
|
||||
; vrshrn? would add 1 << 3-1 = 2
|
||||
vshrn.s32 d0, q8, #3
|
||||
vshrn.s32 d1, q9, #3
|
||||
vshrn.s32 d2, q10, #3
|
||||
vshrn.s32 d3, q11, #3
|
||||
|
||||
vst1.16 {q0, q1}, [r1@128]
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
END
|
|
@ -1,129 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include "vpx_ports/arm.h"
|
||||
|
||||
#ifdef VPX_INCOMPATIBLE_GCC
|
||||
#include "./vp8_rtcd.h"
|
||||
void vp8_short_walsh4x4_neon(
|
||||
int16_t *input,
|
||||
int16_t *output,
|
||||
int pitch) {
|
||||
vp8_short_walsh4x4_c(input, output, pitch);
|
||||
}
|
||||
#else
|
||||
void vp8_short_walsh4x4_neon(
|
||||
int16_t *input,
|
||||
int16_t *output,
|
||||
int pitch) {
|
||||
uint16x4_t d16u16;
|
||||
int16x8_t q0s16, q1s16;
|
||||
int16x4_t dEmptys16, d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
|
||||
int32x4_t qEmptys32, q0s32, q1s32, q2s32, q3s32, q8s32;
|
||||
int32x4_t q9s32, q10s32, q11s32, q15s32;
|
||||
uint32x4_t q8u32, q9u32, q10u32, q11u32;
|
||||
int16x4x2_t v2tmp0, v2tmp1;
|
||||
int32x2x2_t v2tmp2, v2tmp3;
|
||||
|
||||
dEmptys16 = vdup_n_s16(0);
|
||||
qEmptys32 = vdupq_n_s32(0);
|
||||
q15s32 = vdupq_n_s32(3);
|
||||
|
||||
d0s16 = vld1_s16(input);
|
||||
input += pitch/2;
|
||||
d1s16 = vld1_s16(input);
|
||||
input += pitch/2;
|
||||
d2s16 = vld1_s16(input);
|
||||
input += pitch/2;
|
||||
d3s16 = vld1_s16(input);
|
||||
|
||||
v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
|
||||
vreinterpret_s32_s16(d2s16));
|
||||
v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
|
||||
vreinterpret_s32_s16(d3s16));
|
||||
v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]), // d0
|
||||
vreinterpret_s16_s32(v2tmp3.val[0])); // d1
|
||||
v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]), // d2
|
||||
vreinterpret_s16_s32(v2tmp3.val[1])); // d3
|
||||
|
||||
d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[0]);
|
||||
d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[1]);
|
||||
d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[1]);
|
||||
d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[0]);
|
||||
|
||||
d4s16 = vshl_n_s16(d4s16, 2);
|
||||
d5s16 = vshl_n_s16(d5s16, 2);
|
||||
d6s16 = vshl_n_s16(d6s16, 2);
|
||||
d7s16 = vshl_n_s16(d7s16, 2);
|
||||
|
||||
d16u16 = vceq_s16(d4s16, dEmptys16);
|
||||
d16u16 = vmvn_u16(d16u16);
|
||||
|
||||
d0s16 = vadd_s16(d4s16, d5s16);
|
||||
d3s16 = vsub_s16(d4s16, d5s16);
|
||||
d1s16 = vadd_s16(d7s16, d6s16);
|
||||
d2s16 = vsub_s16(d7s16, d6s16);
|
||||
|
||||
d0s16 = vsub_s16(d0s16, vreinterpret_s16_u16(d16u16));
|
||||
|
||||
// Second for-loop
|
||||
v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d1s16),
|
||||
vreinterpret_s32_s16(d3s16));
|
||||
v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d0s16),
|
||||
vreinterpret_s32_s16(d2s16));
|
||||
v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp3.val[1]), // d2
|
||||
vreinterpret_s16_s32(v2tmp2.val[1])); // d3
|
||||
v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp3.val[0]), // d0
|
||||
vreinterpret_s16_s32(v2tmp2.val[0])); // d1
|
||||
|
||||
q8s32 = vaddl_s16(v2tmp1.val[0], v2tmp0.val[0]);
|
||||
q9s32 = vaddl_s16(v2tmp1.val[1], v2tmp0.val[1]);
|
||||
q10s32 = vsubl_s16(v2tmp1.val[1], v2tmp0.val[1]);
|
||||
q11s32 = vsubl_s16(v2tmp1.val[0], v2tmp0.val[0]);
|
||||
|
||||
q0s32 = vaddq_s32(q8s32, q9s32);
|
||||
q1s32 = vaddq_s32(q11s32, q10s32);
|
||||
q2s32 = vsubq_s32(q11s32, q10s32);
|
||||
q3s32 = vsubq_s32(q8s32, q9s32);
|
||||
|
||||
q8u32 = vcltq_s32(q0s32, qEmptys32);
|
||||
q9u32 = vcltq_s32(q1s32, qEmptys32);
|
||||
q10u32 = vcltq_s32(q2s32, qEmptys32);
|
||||
q11u32 = vcltq_s32(q3s32, qEmptys32);
|
||||
|
||||
q8s32 = vreinterpretq_s32_u32(q8u32);
|
||||
q9s32 = vreinterpretq_s32_u32(q9u32);
|
||||
q10s32 = vreinterpretq_s32_u32(q10u32);
|
||||
q11s32 = vreinterpretq_s32_u32(q11u32);
|
||||
|
||||
q0s32 = vsubq_s32(q0s32, q8s32);
|
||||
q1s32 = vsubq_s32(q1s32, q9s32);
|
||||
q2s32 = vsubq_s32(q2s32, q10s32);
|
||||
q3s32 = vsubq_s32(q3s32, q11s32);
|
||||
|
||||
q8s32 = vaddq_s32(q0s32, q15s32);
|
||||
q9s32 = vaddq_s32(q1s32, q15s32);
|
||||
q10s32 = vaddq_s32(q2s32, q15s32);
|
||||
q11s32 = vaddq_s32(q3s32, q15s32);
|
||||
|
||||
d0s16 = vshrn_n_s32(q8s32, 3);
|
||||
d1s16 = vshrn_n_s32(q9s32, 3);
|
||||
d2s16 = vshrn_n_s32(q10s32, 3);
|
||||
d3s16 = vshrn_n_s32(q11s32, 3);
|
||||
|
||||
q0s16 = vcombine_s16(d0s16, d1s16);
|
||||
q1s16 = vcombine_s16(d2s16, d3s16);
|
||||
|
||||
vst1q_s16(output, q0s16);
|
||||
vst1q_s16(output + 8, q1s16);
|
||||
return;
|
||||
}
|
||||
#endif // VPX_INCOMPATIBLE_GCC
|
|
@ -18,6 +18,7 @@
|
|||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <limits.h>
|
||||
#include "vp8/common/pragmas.h"
|
||||
#include "vpx/vpx_encoder.h"
|
||||
#include "vpx_mem/vpx_mem.h"
|
||||
#include "bitstream.h"
|
||||
|
|
|
@ -18,18 +18,18 @@ extern "C" {
|
|||
|
||||
#if HAVE_EDSP
|
||||
void vp8cx_pack_tokens_armv5(vp8_writer *w, const TOKENEXTRA *p, int xcount,
|
||||
vp8_token *,
|
||||
const vp8_token *,
|
||||
const vp8_extra_bit_struct *,
|
||||
const vp8_tree_index *);
|
||||
void vp8cx_pack_tokens_into_partitions_armv5(VP8_COMP *,
|
||||
unsigned char * cx_data,
|
||||
const unsigned char *cx_data_end,
|
||||
int num_parts,
|
||||
vp8_token *,
|
||||
const vp8_token *,
|
||||
const vp8_extra_bit_struct *,
|
||||
const vp8_tree_index *);
|
||||
void vp8cx_pack_mb_row_tokens_armv5(VP8_COMP *cpi, vp8_writer *w,
|
||||
vp8_token *,
|
||||
const vp8_token *,
|
||||
const vp8_extra_bit_struct *,
|
||||
const vp8_tree_index *);
|
||||
# define pack_tokens(a,b,c) \
|
||||
|
|
|
@ -128,7 +128,6 @@ typedef struct macroblock
|
|||
int q_index;
|
||||
|
||||
#if CONFIG_TEMPORAL_DENOISING
|
||||
int increase_denoising;
|
||||
MB_PREDICTION_MODE best_sse_inter_mode;
|
||||
int_mv best_sse_mv;
|
||||
MV_REFERENCE_FRAME best_reference_frame;
|
||||
|
|
|
@ -35,6 +35,10 @@ typedef struct
|
|||
unsigned char *buffer;
|
||||
unsigned char *buffer_end;
|
||||
struct vpx_internal_error_info *error;
|
||||
|
||||
/* Variables used to track bit costs without outputing to the bitstream */
|
||||
unsigned int measure_cost;
|
||||
unsigned long bit_counter;
|
||||
} BOOL_CODER;
|
||||
|
||||
extern void vp8_start_encode(BOOL_CODER *bc, unsigned char *buffer, unsigned char *buffer_end);
|
||||
|
|
|
@ -8,8 +8,6 @@
|
|||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <limits.h>
|
||||
|
||||
#include "denoising.h"
|
||||
|
||||
#include "vp8/common/reconinter.h"
|
||||
|
@ -23,7 +21,6 @@ static const unsigned int NOISE_MOTION_THRESHOLD = 25 * 25;
|
|||
*/
|
||||
static const unsigned int SSE_DIFF_THRESHOLD = 16 * 16 * 20;
|
||||
static const unsigned int SSE_THRESHOLD = 16 * 16 * 40;
|
||||
static const unsigned int SSE_THRESHOLD_HIGH = 16 * 16 * 60;
|
||||
|
||||
/*
|
||||
* The filter function was modified to reduce the computational complexity.
|
||||
|
@ -54,36 +51,27 @@ static const unsigned int SSE_THRESHOLD_HIGH = 16 * 16 * 60;
|
|||
* [16, 255] 6 7
|
||||
*/
|
||||
|
||||
int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride,
|
||||
unsigned char *running_avg_y, int avg_y_stride,
|
||||
unsigned char *sig, int sig_stride,
|
||||
unsigned int motion_magnitude,
|
||||
int increase_denoising)
|
||||
int vp8_denoiser_filter_c(YV12_BUFFER_CONFIG *mc_running_avg,
|
||||
YV12_BUFFER_CONFIG *running_avg, MACROBLOCK *signal,
|
||||
unsigned int motion_magnitude, int y_offset,
|
||||
int uv_offset)
|
||||
{
|
||||
unsigned char *running_avg_y_start = running_avg_y;
|
||||
unsigned char *sig_start = sig;
|
||||
int sum_diff_thresh;
|
||||
int r, c;
|
||||
unsigned char *sig = signal->thismb;
|
||||
int sig_stride = 16;
|
||||
unsigned char *mc_running_avg_y = mc_running_avg->y_buffer + y_offset;
|
||||
int mc_avg_y_stride = mc_running_avg->y_stride;
|
||||
unsigned char *running_avg_y = running_avg->y_buffer + y_offset;
|
||||
int avg_y_stride = running_avg->y_stride;
|
||||
int r, c, i;
|
||||
int sum_diff = 0;
|
||||
int adj_val[3] = {3, 4, 6};
|
||||
int shift_inc1 = 0;
|
||||
int shift_inc2 = 1;
|
||||
int col_sum[16] = {0, 0, 0, 0,
|
||||
0, 0, 0, 0,
|
||||
0, 0, 0, 0,
|
||||
0, 0, 0, 0};
|
||||
|
||||
/* If motion_magnitude is small, making the denoiser more aggressive by
|
||||
* increasing the adjustment for each level. Add another increment for
|
||||
* blocks that are labeled for increase denoising. */
|
||||
* increasing the adjustment for each level. */
|
||||
if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
|
||||
{
|
||||
if (increase_denoising) {
|
||||
shift_inc1 = 1;
|
||||
shift_inc2 = 2;
|
||||
}
|
||||
adj_val[0] += shift_inc2;
|
||||
adj_val[1] += shift_inc2;
|
||||
adj_val[2] += shift_inc2;
|
||||
for (i = 0; i < 3; i++)
|
||||
adj_val[i] += 1;
|
||||
}
|
||||
|
||||
for (r = 0; r < 16; ++r)
|
||||
|
@ -97,16 +85,15 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride,
|
|||
diff = mc_running_avg_y[c] - sig[c];
|
||||
absdiff = abs(diff);
|
||||
|
||||
// When |diff| <= |3 + shift_inc1|, use pixel value from
|
||||
// last denoised raw.
|
||||
if (absdiff <= 3 + shift_inc1)
|
||||
/* When |diff| < 4, use pixel value from last denoised raw. */
|
||||
if (absdiff <= 3)
|
||||
{
|
||||
running_avg_y[c] = mc_running_avg_y[c];
|
||||
col_sum[c] += diff;
|
||||
sum_diff += diff;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (absdiff >= 4 + shift_inc1 && absdiff <= 7)
|
||||
if (absdiff >= 4 && absdiff <= 7)
|
||||
adjustment = adj_val[0];
|
||||
else if (absdiff >= 8 && absdiff <= 15)
|
||||
adjustment = adj_val[1];
|
||||
|
@ -120,7 +107,7 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride,
|
|||
else
|
||||
running_avg_y[c] = sig[c] + adjustment;
|
||||
|
||||
col_sum[c] += adjustment;
|
||||
sum_diff += adjustment;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -129,7 +116,7 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride,
|
|||
else
|
||||
running_avg_y[c] = sig[c] - adjustment;
|
||||
|
||||
col_sum[c] -= adjustment;
|
||||
sum_diff -= adjustment;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -140,269 +127,18 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride,
|
|||
running_avg_y += avg_y_stride;
|
||||
}
|
||||
|
||||
for (c = 0; c < 16; ++c) {
|
||||
// Below we clip the value in the same way which SSE code use.
|
||||
// When adopting aggressive denoiser, the adj_val for each pixel
|
||||
// could be at most 8 (this is current max adjustment of the map).
|
||||
// In SSE code, we calculate the sum of adj_val for
|
||||
// the columns, so the sum could be upto 128(16 rows). However,
|
||||
// the range of the value is -128 ~ 127 in SSE code, that's why
|
||||
// we do this change in C code.
|
||||
// We don't do this for UV denoiser, since there are only 8 rows,
|
||||
// and max adjustments <= 8, so the sum of the columns will not
|
||||
// exceed 64.
|
||||
if (col_sum[c] >= 128) {
|
||||
col_sum[c] = 127;
|
||||
}
|
||||
sum_diff += col_sum[c];
|
||||
}
|
||||
|
||||
sum_diff_thresh= SUM_DIFF_THRESHOLD;
|
||||
if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
|
||||
if (abs(sum_diff) > sum_diff_thresh) {
|
||||
// Before returning to copy the block (i.e., apply no denoising), check
|
||||
// if we can still apply some (weaker) temporal filtering to this block,
|
||||
// that would otherwise not be denoised at all. Simplest is to apply
|
||||
// an additional adjustment to running_avg_y to bring it closer to sig.
|
||||
// The adjustment is capped by a maximum delta, and chosen such that
|
||||
// in most cases the resulting sum_diff will be within the
|
||||
// accceptable range given by sum_diff_thresh.
|
||||
|
||||
// The delta is set by the excess of absolute pixel diff over threshold.
|
||||
int delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1;
|
||||
// Only apply the adjustment for max delta up to 3.
|
||||
if (delta < 4) {
|
||||
sig -= sig_stride * 16;
|
||||
mc_running_avg_y -= mc_avg_y_stride * 16;
|
||||
running_avg_y -= avg_y_stride * 16;
|
||||
for (r = 0; r < 16; ++r) {
|
||||
for (c = 0; c < 16; ++c) {
|
||||
int diff = mc_running_avg_y[c] - sig[c];
|
||||
int adjustment = abs(diff);
|
||||
if (adjustment > delta)
|
||||
adjustment = delta;
|
||||
if (diff > 0) {
|
||||
// Bring denoised signal down.
|
||||
if (running_avg_y[c] - adjustment < 0)
|
||||
running_avg_y[c] = 0;
|
||||
else
|
||||
running_avg_y[c] = running_avg_y[c] - adjustment;
|
||||
col_sum[c] -= adjustment;
|
||||
} else if (diff < 0) {
|
||||
// Bring denoised signal up.
|
||||
if (running_avg_y[c] + adjustment > 255)
|
||||
running_avg_y[c] = 255;
|
||||
else
|
||||
running_avg_y[c] = running_avg_y[c] + adjustment;
|
||||
col_sum[c] += adjustment;
|
||||
}
|
||||
}
|
||||
// TODO(marpan): Check here if abs(sum_diff) has gone below the
|
||||
// threshold sum_diff_thresh, and if so, we can exit the row loop.
|
||||
sig += sig_stride;
|
||||
mc_running_avg_y += mc_avg_y_stride;
|
||||
running_avg_y += avg_y_stride;
|
||||
}
|
||||
|
||||
sum_diff = 0;
|
||||
for (c = 0; c < 16; ++c) {
|
||||
if (col_sum[c] >= 128) {
|
||||
col_sum[c] = 127;
|
||||
}
|
||||
sum_diff += col_sum[c];
|
||||
}
|
||||
|
||||
if (abs(sum_diff) > sum_diff_thresh)
|
||||
return COPY_BLOCK;
|
||||
} else {
|
||||
if (abs(sum_diff) > SUM_DIFF_THRESHOLD)
|
||||
return COPY_BLOCK;
|
||||
}
|
||||
}
|
||||
|
||||
vp8_copy_mem16x16(running_avg_y_start, avg_y_stride, sig_start, sig_stride);
|
||||
vp8_copy_mem16x16(running_avg->y_buffer + y_offset, avg_y_stride,
|
||||
signal->thismb, sig_stride);
|
||||
return FILTER_BLOCK;
|
||||
}
|
||||
|
||||
int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv,
|
||||
int mc_avg_uv_stride,
|
||||
unsigned char *running_avg_uv,
|
||||
int avg_uv_stride,
|
||||
unsigned char *sig,
|
||||
int sig_stride,
|
||||
unsigned int motion_magnitude,
|
||||
int increase_denoising) {
|
||||
unsigned char *running_avg_uv_start = running_avg_uv;
|
||||
unsigned char *sig_start = sig;
|
||||
int sum_diff_thresh;
|
||||
int r, c;
|
||||
int sum_diff = 0;
|
||||
int sum_block = 0;
|
||||
int adj_val[3] = {3, 4, 6};
|
||||
int shift_inc1 = 0;
|
||||
int shift_inc2 = 1;
|
||||
/* If motion_magnitude is small, making the denoiser more aggressive by
|
||||
* increasing the adjustment for each level. Add another increment for
|
||||
* blocks that are labeled for increase denoising. */
|
||||
if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) {
|
||||
if (increase_denoising) {
|
||||
shift_inc1 = 1;
|
||||
shift_inc2 = 2;
|
||||
}
|
||||
adj_val[0] += shift_inc2;
|
||||
adj_val[1] += shift_inc2;
|
||||
adj_val[2] += shift_inc2;
|
||||
}
|
||||
|
||||
// Avoid denoising color signal if its close to average level.
|
||||
for (r = 0; r < 8; ++r) {
|
||||
for (c = 0; c < 8; ++c) {
|
||||
sum_block += sig[c];
|
||||
}
|
||||
sig += sig_stride;
|
||||
}
|
||||
if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) {
|
||||
return COPY_BLOCK;
|
||||
}
|
||||
|
||||
sig -= sig_stride * 8;
|
||||
for (r = 0; r < 8; ++r) {
|
||||
for (c = 0; c < 8; ++c) {
|
||||
int diff = 0;
|
||||
int adjustment = 0;
|
||||
int absdiff = 0;
|
||||
|
||||
diff = mc_running_avg_uv[c] - sig[c];
|
||||
absdiff = abs(diff);
|
||||
|
||||
// When |diff| <= |3 + shift_inc1|, use pixel value from
|
||||
// last denoised raw.
|
||||
if (absdiff <= 3 + shift_inc1) {
|
||||
running_avg_uv[c] = mc_running_avg_uv[c];
|
||||
sum_diff += diff;
|
||||
} else {
|
||||
if (absdiff >= 4 && absdiff <= 7)
|
||||
adjustment = adj_val[0];
|
||||
else if (absdiff >= 8 && absdiff <= 15)
|
||||
adjustment = adj_val[1];
|
||||
else
|
||||
adjustment = adj_val[2];
|
||||
if (diff > 0) {
|
||||
if ((sig[c] + adjustment) > 255)
|
||||
running_avg_uv[c] = 255;
|
||||
else
|
||||
running_avg_uv[c] = sig[c] + adjustment;
|
||||
sum_diff += adjustment;
|
||||
} else {
|
||||
if ((sig[c] - adjustment) < 0)
|
||||
running_avg_uv[c] = 0;
|
||||
else
|
||||
running_avg_uv[c] = sig[c] - adjustment;
|
||||
sum_diff -= adjustment;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Update pointers for next iteration. */
|
||||
sig += sig_stride;
|
||||
mc_running_avg_uv += mc_avg_uv_stride;
|
||||
running_avg_uv += avg_uv_stride;
|
||||
}
|
||||
|
||||
sum_diff_thresh= SUM_DIFF_THRESHOLD_UV;
|
||||
if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;
|
||||
if (abs(sum_diff) > sum_diff_thresh) {
|
||||
// Before returning to copy the block (i.e., apply no denoising), check
|
||||
// if we can still apply some (weaker) temporal filtering to this block,
|
||||
// that would otherwise not be denoised at all. Simplest is to apply
|
||||
// an additional adjustment to running_avg_y to bring it closer to sig.
|
||||
// The adjustment is capped by a maximum delta, and chosen such that
|
||||
// in most cases the resulting sum_diff will be within the
|
||||
// accceptable range given by sum_diff_thresh.
|
||||
|
||||
// The delta is set by the excess of absolute pixel diff over threshold.
|
||||
int delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1;
|
||||
// Only apply the adjustment for max delta up to 3.
|
||||
if (delta < 4) {
|
||||
sig -= sig_stride * 8;
|
||||
mc_running_avg_uv -= mc_avg_uv_stride * 8;
|
||||
running_avg_uv -= avg_uv_stride * 8;
|
||||
for (r = 0; r < 8; ++r) {
|
||||
for (c = 0; c < 8; ++c) {
|
||||
int diff = mc_running_avg_uv[c] - sig[c];
|
||||
int adjustment = abs(diff);
|
||||
if (adjustment > delta)
|
||||
adjustment = delta;
|
||||
if (diff > 0) {
|
||||
// Bring denoised signal down.
|
||||
if (running_avg_uv[c] - adjustment < 0)
|
||||
running_avg_uv[c] = 0;
|
||||
else
|
||||
running_avg_uv[c] = running_avg_uv[c] - adjustment;
|
||||
sum_diff -= adjustment;
|
||||
} else if (diff < 0) {
|
||||
// Bring denoised signal up.
|
||||
if (running_avg_uv[c] + adjustment > 255)
|
||||
running_avg_uv[c] = 255;
|
||||
else
|
||||
running_avg_uv[c] = running_avg_uv[c] + adjustment;
|
||||
sum_diff += adjustment;
|
||||
}
|
||||
}
|
||||
// TODO(marpan): Check here if abs(sum_diff) has gone below the
|
||||
// threshold sum_diff_thresh, and if so, we can exit the row loop.
|
||||
sig += sig_stride;
|
||||
mc_running_avg_uv += mc_avg_uv_stride;
|
||||
running_avg_uv += avg_uv_stride;
|
||||
}
|
||||
if (abs(sum_diff) > sum_diff_thresh)
|
||||
return COPY_BLOCK;
|
||||
} else {
|
||||
return COPY_BLOCK;
|
||||
}
|
||||
}
|
||||
|
||||
vp8_copy_mem8x8(running_avg_uv_start, avg_uv_stride, sig_start,
|
||||
sig_stride);
|
||||
return FILTER_BLOCK;
|
||||
}
|
||||
|
||||
void vp8_denoiser_set_parameters(VP8_DENOISER *denoiser, int mode) {
|
||||
assert(mode > 0); // Denoiser is allocated only if mode > 0.
|
||||
if (mode == 1) {
|
||||
denoiser->denoiser_mode = kDenoiserOnYOnly;
|
||||
} else if (mode == 2) {
|
||||
denoiser->denoiser_mode = kDenoiserOnYUV;
|
||||
} else if (mode == 3) {
|
||||
denoiser->denoiser_mode = kDenoiserOnYUVAggressive;
|
||||
} else {
|
||||
denoiser->denoiser_mode = kDenoiserOnAdaptive;
|
||||
}
|
||||
if (denoiser->denoiser_mode != kDenoiserOnYUVAggressive) {
|
||||
denoiser->denoise_pars.scale_sse_thresh = 1;
|
||||
denoiser->denoise_pars.scale_motion_thresh = 8;
|
||||
denoiser->denoise_pars.scale_increase_filter = 0;
|
||||
denoiser->denoise_pars.denoise_mv_bias = 95;
|
||||
denoiser->denoise_pars.pickmode_mv_bias = 100;
|
||||
denoiser->denoise_pars.qp_thresh = 0;
|
||||
denoiser->denoise_pars.consec_zerolast = UINT_MAX;
|
||||
denoiser->denoise_pars.spatial_blur = 0;
|
||||
} else {
|
||||
denoiser->denoise_pars.scale_sse_thresh = 2;
|
||||
denoiser->denoise_pars.scale_motion_thresh = 16;
|
||||
denoiser->denoise_pars.scale_increase_filter = 1;
|
||||
denoiser->denoise_pars.denoise_mv_bias = 60;
|
||||
denoiser->denoise_pars.pickmode_mv_bias = 60;
|
||||
denoiser->denoise_pars.qp_thresh = 100;
|
||||
denoiser->denoise_pars.consec_zerolast = 10;
|
||||
denoiser->denoise_pars.spatial_blur = 20;
|
||||
}
|
||||
}
|
||||
|
||||
int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
|
||||
int num_mb_rows, int num_mb_cols, int mode)
|
||||
int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height)
|
||||
{
|
||||
int i;
|
||||
assert(denoiser);
|
||||
denoiser->num_mb_cols = num_mb_cols;
|
||||
|
||||
for (i = 0; i < MAX_REF_FRAMES; i++)
|
||||
{
|
||||
|
@ -430,45 +166,9 @@ int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
|
|||
|
||||
vpx_memset(denoiser->yv12_mc_running_avg.buffer_alloc, 0,
|
||||
denoiser->yv12_mc_running_avg.frame_size);
|
||||
|
||||
if (vp8_yv12_alloc_frame_buffer(&denoiser->yv12_last_source, width,
|
||||
height, VP8BORDERINPIXELS) < 0) {
|
||||
vp8_denoiser_free(denoiser);
|
||||
return 1;
|
||||
}
|
||||
vpx_memset(denoiser->yv12_last_source.buffer_alloc, 0,
|
||||
denoiser->yv12_last_source.frame_size);
|
||||
|
||||
denoiser->denoise_state = vpx_calloc((num_mb_rows * num_mb_cols), 1);
|
||||
vpx_memset(denoiser->denoise_state, 0, (num_mb_rows * num_mb_cols));
|
||||
vp8_denoiser_set_parameters(denoiser, mode);
|
||||
denoiser->nmse_source_diff = 0;
|
||||
denoiser->nmse_source_diff_count = 0;
|
||||
denoiser->qp_avg = 0;
|
||||
// QP threshold below which we can go up to aggressive mode.
|
||||
denoiser->qp_threshold_up = 80;
|
||||
// QP threshold above which we can go back down to normal mode.
|
||||
// For now keep this second threshold high, so not used currently.
|
||||
denoiser->qp_threshold_down = 128;
|
||||
// Bitrate thresholds and noise metric (nmse) thresholds for switching to
|
||||
// aggressive mode.
|
||||
// TODO(marpan): Adjust thresholds, including effect on resolution.
|
||||
denoiser->bitrate_threshold = 200000; // (bits/sec).
|
||||
denoiser->threshold_aggressive_mode = 35;
|
||||
if (width * height > 640 * 480) {
|
||||
denoiser->bitrate_threshold = 500000;
|
||||
denoiser->threshold_aggressive_mode = 100;
|
||||
} else if (width * height > 960 * 540) {
|
||||
denoiser->bitrate_threshold = 800000;
|
||||
denoiser->threshold_aggressive_mode = 150;
|
||||
} else if (width * height > 1280 * 720) {
|
||||
denoiser->bitrate_threshold = 2000000;
|
||||
denoiser->threshold_aggressive_mode = 1400;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
void vp8_denoiser_free(VP8_DENOISER *denoiser)
|
||||
{
|
||||
int i;
|
||||
|
@ -479,8 +179,6 @@ void vp8_denoiser_free(VP8_DENOISER *denoiser)
|
|||
vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_running_avg[i]);
|
||||
}
|
||||
vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_mc_running_avg);
|
||||
vp8_yv12_de_alloc_frame_buffer(&denoiser->yv12_last_source);
|
||||
vpx_free(denoiser->denoise_state);
|
||||
}
|
||||
|
||||
|
||||
|
@ -489,28 +187,16 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
|
|||
unsigned int best_sse,
|
||||
unsigned int zero_mv_sse,
|
||||
int recon_yoffset,
|
||||
int recon_uvoffset,
|
||||
loop_filter_info_n *lfi_n,
|
||||
int mb_row,
|
||||
int mb_col,
|
||||
int block_index)
|
||||
|
||||
int recon_uvoffset)
|
||||
{
|
||||
int mv_row;
|
||||
int mv_col;
|
||||
unsigned int motion_threshold;
|
||||
unsigned int motion_magnitude2;
|
||||
unsigned int sse_thresh;
|
||||
int sse_diff_thresh = 0;
|
||||
// Spatial loop filter: only applied selectively based on
|
||||
// temporal filter state of block relative to top/left neighbors.
|
||||
int apply_spatial_loop_filter = 1;
|
||||
|
||||
MV_REFERENCE_FRAME frame = x->best_reference_frame;
|
||||
MV_REFERENCE_FRAME zero_frame = x->best_zeromv_reference_frame;
|
||||
|
||||
enum vp8_denoiser_decision decision = FILTER_BLOCK;
|
||||
enum vp8_denoiser_decision decision_u = COPY_BLOCK;
|
||||
enum vp8_denoiser_decision decision_v = COPY_BLOCK;
|
||||
|
||||
if (zero_frame)
|
||||
{
|
||||
|
@ -520,11 +206,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
|
|||
MB_MODE_INFO saved_mbmi;
|
||||
MACROBLOCKD *filter_xd = &x->e_mbd;
|
||||
MB_MODE_INFO *mbmi = &filter_xd->mode_info_context->mbmi;
|
||||
int sse_diff = 0;
|
||||
// Bias on zero motion vector sse.
|
||||
const int zero_bias = denoiser->denoise_pars.denoise_mv_bias;
|
||||
zero_mv_sse = (unsigned int)((int64_t)zero_mv_sse * zero_bias / 100);
|
||||
sse_diff = zero_mv_sse - best_sse;
|
||||
int sse_diff = zero_mv_sse - best_sse;
|
||||
|
||||
saved_mbmi = *mbmi;
|
||||
|
||||
|
@ -535,16 +217,11 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
|
|||
mbmi->need_to_clamp_mvs = x->need_to_clamp_best_mvs;
|
||||
mv_col = x->best_sse_mv.as_mv.col;
|
||||
mv_row = x->best_sse_mv.as_mv.row;
|
||||
// Bias to zero_mv if small amount of motion.
|
||||
// Note sse_diff_thresh is intialized to zero, so this ensures
|
||||
// we will always choose zero_mv for denoising if
|
||||
// zero_mv_see <= best_sse (i.e., sse_diff <= 0).
|
||||
if ((unsigned int)(mv_row * mv_row + mv_col * mv_col)
|
||||
<= NOISE_MOTION_THRESHOLD)
|
||||
sse_diff_thresh = (int)SSE_DIFF_THRESHOLD;
|
||||
|
||||
if (frame == INTRA_FRAME ||
|
||||
sse_diff <= sse_diff_thresh)
|
||||
((unsigned int)(mv_row *mv_row + mv_col *mv_col)
|
||||
<= NOISE_MOTION_THRESHOLD &&
|
||||
sse_diff < (int)SSE_DIFF_THRESHOLD))
|
||||
{
|
||||
/*
|
||||
* Handle intra blocks as referring to last frame with zero motion
|
||||
|
@ -600,62 +277,20 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
|
|||
mv_row = x->best_sse_mv.as_mv.row;
|
||||
mv_col = x->best_sse_mv.as_mv.col;
|
||||
motion_magnitude2 = mv_row * mv_row + mv_col * mv_col;
|
||||
motion_threshold = denoiser->denoise_pars.scale_motion_thresh *
|
||||
NOISE_MOTION_THRESHOLD;
|
||||
|
||||
if (motion_magnitude2 <
|
||||
denoiser->denoise_pars.scale_increase_filter * NOISE_MOTION_THRESHOLD)
|
||||
x->increase_denoising = 1;
|
||||
|
||||
sse_thresh = denoiser->denoise_pars.scale_sse_thresh * SSE_THRESHOLD;
|
||||
if (x->increase_denoising)
|
||||
sse_thresh = denoiser->denoise_pars.scale_sse_thresh * SSE_THRESHOLD_HIGH;
|
||||
|
||||
if (best_sse > sse_thresh || motion_magnitude2 > motion_threshold)
|
||||
decision = COPY_BLOCK;
|
||||
if (best_sse > SSE_THRESHOLD || motion_magnitude2
|
||||
> 8 * NOISE_MOTION_THRESHOLD)
|
||||
{
|
||||
decision = COPY_BLOCK;
|
||||
}
|
||||
|
||||
if (decision == FILTER_BLOCK)
|
||||
{
|
||||
unsigned char *mc_running_avg_y =
|
||||
denoiser->yv12_mc_running_avg.y_buffer + recon_yoffset;
|
||||
int mc_avg_y_stride = denoiser->yv12_mc_running_avg.y_stride;
|
||||
unsigned char *running_avg_y =
|
||||
denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset;
|
||||
int avg_y_stride = denoiser->yv12_running_avg[INTRA_FRAME].y_stride;
|
||||
|
||||
/* Filter. */
|
||||
decision = vp8_denoiser_filter(mc_running_avg_y, mc_avg_y_stride,
|
||||
running_avg_y, avg_y_stride,
|
||||
x->thismb, 16, motion_magnitude2,
|
||||
x->increase_denoising);
|
||||
denoiser->denoise_state[block_index] = motion_magnitude2 > 0 ?
|
||||
kFilterNonZeroMV : kFilterZeroMV;
|
||||
// Only denoise UV for zero motion, and if y channel was denoised.
|
||||
if (denoiser->denoiser_mode != kDenoiserOnYOnly &&
|
||||
motion_magnitude2 == 0 &&
|
||||
decision == FILTER_BLOCK) {
|
||||
unsigned char *mc_running_avg_u =
|
||||
denoiser->yv12_mc_running_avg.u_buffer + recon_uvoffset;
|
||||
unsigned char *running_avg_u =
|
||||
denoiser->yv12_running_avg[INTRA_FRAME].u_buffer + recon_uvoffset;
|
||||
unsigned char *mc_running_avg_v =
|
||||
denoiser->yv12_mc_running_avg.v_buffer + recon_uvoffset;
|
||||
unsigned char *running_avg_v =
|
||||
denoiser->yv12_running_avg[INTRA_FRAME].v_buffer + recon_uvoffset;
|
||||
int mc_avg_uv_stride = denoiser->yv12_mc_running_avg.uv_stride;
|
||||
int avg_uv_stride = denoiser->yv12_running_avg[INTRA_FRAME].uv_stride;
|
||||
int signal_stride = x->block[16].src_stride;
|
||||
decision_u =
|
||||
vp8_denoiser_filter_uv(mc_running_avg_u, mc_avg_uv_stride,
|
||||
running_avg_u, avg_uv_stride,
|
||||
x->block[16].src + *x->block[16].base_src,
|
||||
signal_stride, motion_magnitude2, 0);
|
||||
decision_v =
|
||||
vp8_denoiser_filter_uv(mc_running_avg_v, mc_avg_uv_stride,
|
||||
running_avg_v, avg_uv_stride,
|
||||
x->block[20].src + *x->block[20].base_src,
|
||||
signal_stride, motion_magnitude2, 0);
|
||||
}
|
||||
decision = vp8_denoiser_filter(&denoiser->yv12_mc_running_avg,
|
||||
&denoiser->yv12_running_avg[INTRA_FRAME],
|
||||
x,
|
||||
motion_magnitude2,
|
||||
recon_yoffset, recon_uvoffset);
|
||||
}
|
||||
if (decision == COPY_BLOCK)
|
||||
{
|
||||
|
@ -666,73 +301,5 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
|
|||
x->thismb, 16,
|
||||
denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset,
|
||||
denoiser->yv12_running_avg[INTRA_FRAME].y_stride);
|
||||
denoiser->denoise_state[block_index] = kNoFilter;
|
||||
}
|
||||
if (denoiser->denoiser_mode != kDenoiserOnYOnly) {
|
||||
if (decision_u == COPY_BLOCK) {
|
||||
vp8_copy_mem8x8(
|
||||
x->block[16].src + *x->block[16].base_src, x->block[16].src_stride,
|
||||
denoiser->yv12_running_avg[INTRA_FRAME].u_buffer + recon_uvoffset,
|
||||
denoiser->yv12_running_avg[INTRA_FRAME].uv_stride);
|
||||
}
|
||||
if (decision_v == COPY_BLOCK) {
|
||||
vp8_copy_mem8x8(
|
||||
x->block[20].src + *x->block[20].base_src, x->block[16].src_stride,
|
||||
denoiser->yv12_running_avg[INTRA_FRAME].v_buffer + recon_uvoffset,
|
||||
denoiser->yv12_running_avg[INTRA_FRAME].uv_stride);
|
||||
}
|
||||
}
|
||||
// Option to selectively deblock the denoised signal, for y channel only.
|
||||
if (apply_spatial_loop_filter) {
|
||||
loop_filter_info lfi;
|
||||
int apply_filter_col = 0;
|
||||
int apply_filter_row = 0;
|
||||
int apply_filter = 0;
|
||||
int y_stride = denoiser->yv12_running_avg[INTRA_FRAME].y_stride;
|
||||
int uv_stride =denoiser->yv12_running_avg[INTRA_FRAME].uv_stride;
|
||||
|
||||
// Fix filter level to some nominal value for now.
|
||||
int filter_level = 32;
|
||||
|
||||
int hev_index = lfi_n->hev_thr_lut[INTER_FRAME][filter_level];
|
||||
lfi.mblim = lfi_n->mblim[filter_level];
|
||||
lfi.blim = lfi_n->blim[filter_level];
|
||||
lfi.lim = lfi_n->lim[filter_level];
|
||||
lfi.hev_thr = lfi_n->hev_thr[hev_index];
|
||||
|
||||
// Apply filter if there is a difference in the denoiser filter state
|
||||
// between the current and left/top block, or if non-zero motion vector
|
||||
// is used for the motion-compensated filtering.
|
||||
if (mb_col > 0) {
|
||||
apply_filter_col = !((denoiser->denoise_state[block_index] ==
|
||||
denoiser->denoise_state[block_index - 1]) &&
|
||||
denoiser->denoise_state[block_index] != kFilterNonZeroMV);
|
||||
if (apply_filter_col) {
|
||||
// Filter left vertical edge.
|
||||
apply_filter = 1;
|
||||
vp8_loop_filter_mbv(
|
||||
denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset,
|
||||
NULL, NULL, y_stride, uv_stride, &lfi);
|
||||
}
|
||||
}
|
||||
if (mb_row > 0) {
|
||||
apply_filter_row = !((denoiser->denoise_state[block_index] ==
|
||||
denoiser->denoise_state[block_index - denoiser->num_mb_cols]) &&
|
||||
denoiser->denoise_state[block_index] != kFilterNonZeroMV);
|
||||
if (apply_filter_row) {
|
||||
// Filter top horizontal edge.
|
||||
apply_filter = 1;
|
||||
vp8_loop_filter_mbh(
|
||||
denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset,
|
||||
NULL, NULL, y_stride, uv_stride, &lfi);
|
||||
}
|
||||
}
|
||||
if (apply_filter) {
|
||||
// Update the signal block |x|. Pixel changes are only to top and/or
|
||||
// left boundary pixels: can we avoid full block copy here.
|
||||
vp8_copy_mem16x16(
|
||||
denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset,
|
||||
y_stride, x->thismb, 16);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -12,101 +12,36 @@
|
|||
#define VP8_ENCODER_DENOISING_H_
|
||||
|
||||
#include "block.h"
|
||||
#include "vp8/common/loopfilter.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define SUM_DIFF_THRESHOLD (16 * 16 * 2)
|
||||
#define SUM_DIFF_THRESHOLD_HIGH (600)
|
||||
#define MOTION_MAGNITUDE_THRESHOLD (8*3)
|
||||
|
||||
#define SUM_DIFF_THRESHOLD_UV (96) // (8 * 8 * 1.5)
|
||||
#define SUM_DIFF_THRESHOLD_HIGH_UV (8 * 8 * 2)
|
||||
#define SUM_DIFF_FROM_AVG_THRESH_UV (8 * 8 * 8)
|
||||
#define MOTION_MAGNITUDE_THRESHOLD_UV (8*3)
|
||||
|
||||
enum vp8_denoiser_decision
|
||||
{
|
||||
COPY_BLOCK,
|
||||
FILTER_BLOCK
|
||||
};
|
||||
|
||||
enum vp8_denoiser_filter_state {
|
||||
kNoFilter,
|
||||
kFilterZeroMV,
|
||||
kFilterNonZeroMV
|
||||
};
|
||||
|
||||
enum vp8_denoiser_mode {
|
||||
kDenoiserOff,
|
||||
kDenoiserOnYOnly,
|
||||
kDenoiserOnYUV,
|
||||
kDenoiserOnYUVAggressive,
|
||||
kDenoiserOnAdaptive
|
||||
};
|
||||
|
||||
typedef struct {
|
||||
// Scale factor on sse threshold above which no denoising is done.
|
||||
unsigned int scale_sse_thresh;
|
||||
// Scale factor on motion magnitude threshold above which no
|
||||
// denoising is done.
|
||||
unsigned int scale_motion_thresh;
|
||||
// Scale factor on motion magnitude below which we increase the strength of
|
||||
// the temporal filter (in function vp8_denoiser_filter).
|
||||
unsigned int scale_increase_filter;
|
||||
// Scale factor to bias to ZEROMV for denoising.
|
||||
unsigned int denoise_mv_bias;
|
||||
// Scale factor to bias to ZEROMV for coding mode selection.
|
||||
unsigned int pickmode_mv_bias;
|
||||
// Quantizer threshold below which we use the segmentation map to switch off
|
||||
// loop filter for blocks that have been coded as ZEROMV-LAST a certain number
|
||||
// (consec_zerolast) of consecutive frames. Note that the delta-QP is set to
|
||||
// 0 when segmentation map is used for shutting off loop filter.
|
||||
unsigned int qp_thresh;
|
||||
// Threshold for number of consecutive frames for blocks coded as ZEROMV-LAST.
|
||||
unsigned int consec_zerolast;
|
||||
// Threshold for amount of spatial blur on Y channel. 0 means no spatial blur.
|
||||
unsigned int spatial_blur;
|
||||
} denoise_params;
|
||||
|
||||
typedef struct vp8_denoiser
|
||||
{
|
||||
YV12_BUFFER_CONFIG yv12_running_avg[MAX_REF_FRAMES];
|
||||
YV12_BUFFER_CONFIG yv12_mc_running_avg;
|
||||
// TODO(marpan): Should remove yv12_last_source and use vp8_lookahead_peak.
|
||||
YV12_BUFFER_CONFIG yv12_last_source;
|
||||
unsigned char* denoise_state;
|
||||
int num_mb_cols;
|
||||
int denoiser_mode;
|
||||
int threshold_aggressive_mode;
|
||||
int nmse_source_diff;
|
||||
int nmse_source_diff_count;
|
||||
int qp_avg;
|
||||
int qp_threshold_up;
|
||||
int qp_threshold_down;
|
||||
int bitrate_threshold;
|
||||
denoise_params denoise_pars;
|
||||
} VP8_DENOISER;
|
||||
|
||||
int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
|
||||
int num_mb_rows, int num_mb_cols, int mode);
|
||||
int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height);
|
||||
|
||||
void vp8_denoiser_free(VP8_DENOISER *denoiser);
|
||||
|
||||
void vp8_denoiser_set_parameters(VP8_DENOISER *denoiser, int mode);
|
||||
|
||||
void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
|
||||
MACROBLOCK *x,
|
||||
unsigned int best_sse,
|
||||
unsigned int zero_mv_sse,
|
||||
int recon_yoffset,
|
||||
int recon_uvoffset,
|
||||
loop_filter_info_n *lfi_n,
|
||||
int mb_row,
|
||||
int mb_col,
|
||||
int block_index);
|
||||
int recon_uvoffset);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
|
|
|
@ -522,19 +522,6 @@ void encode_mb_row(VP8_COMP *cpi,
|
|||
}
|
||||
|
||||
#endif
|
||||
// Keep track of how many (consecutive) times a block is coded
|
||||
// as ZEROMV_LASTREF, for base layer frames.
|
||||
// Reset to 0 if its coded as anything else.
|
||||
if (cpi->current_layer == 0) {
|
||||
if (xd->mode_info_context->mbmi.mode == ZEROMV &&
|
||||
xd->mode_info_context->mbmi.ref_frame == LAST_FRAME) {
|
||||
// Increment, check for wrap-around.
|
||||
if (cpi->consec_zero_last[map_index+mb_col] < 255)
|
||||
cpi->consec_zero_last[map_index+mb_col] += 1;
|
||||
} else {
|
||||
cpi->consec_zero_last[map_index+mb_col] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* Special case code for cyclic refresh
|
||||
* If cyclic update enabled then copy xd->mbmi.segment_id; (which
|
||||
|
@ -1259,7 +1246,7 @@ int vp8cx_encode_inter_macroblock
|
|||
x->zbin_mode_boost_enabled = 0;
|
||||
}
|
||||
vp8_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate,
|
||||
&distortion, &intra_error, mb_row, mb_col);
|
||||
&distortion, &intra_error);
|
||||
|
||||
/* switch back to the regular quantizer for the encode */
|
||||
if (cpi->sf.improved_quant)
|
||||
|
|
|
@ -258,6 +258,13 @@ static void optimize_b(MACROBLOCK *mb, int ib, int type,
|
|||
b = &mb->block[ib];
|
||||
d = &mb->e_mbd.block[ib];
|
||||
|
||||
/* Enable this to test the effect of RDO as a replacement for the dynamic
|
||||
* zero bin instead of an augmentation of it.
|
||||
*/
|
||||
#if 0
|
||||
vp8_strict_quantize_b(b, d);
|
||||
#endif
|
||||
|
||||
dequant_ptr = d->dequant;
|
||||
coeff_ptr = b->coeff;
|
||||
qcoeff_ptr = d->qcoeff;
|
||||
|
|
Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше
Загрузка…
Ссылка в новой задаче