aom/aom_dsp/arm/aom_convolve_neon.c

/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>

#include "./aom_dsp_rtcd.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_ports/mem.h"

/* Two-pass convolution built from the separately optimized NEON kernels:
 * aom_convolve8_horiz_neon() filters into a stack scratch buffer, then
 * aom_convolve8_vert_neon() filters from that buffer into dst.
 *
 * src, src_stride:  source pointer and row pitch. The horizontal pass starts
 *                   3 rows above src and is asked for h + 7 rows, so those
 *                   rows must be readable.
 * dst, dst_stride:  destination pointer and row pitch, written by the
 *                   vertical pass.
 * filter_x/filter_y: filter kernels; both are forwarded to both passes.
 * x_step_q4/y_step_q4: must both be 16 (asserted).
 * w, h:             block size; each must be <= 64 so the intermediate rows
 *                   fit in the fixed-size scratch buffer (asserted).
 */
void aom_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const int16_t *filter_x,
                        int x_step_q4, const int16_t *filter_y, int y_step_q4,
                        int w, int h) {
  /* Scratch buffer with a fixed row pitch of 64 bytes. Given our constraints
   * (w <= 64, h <= 64, taps == 8) the horizontal pass is asked for at most
   * 64 + 7 rows; 72 rows is that count rounded up to a multiple of 4.
   */
  DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);

  // Account for the vertical phase needing 3 lines prior and 4 lines post
  const int intermediate_height = h + 7;

  assert(y_step_q4 == 16);
  assert(x_step_q4 == 16);
  /* temp is sized for the constraints above; larger blocks would overrun it. */
  assert(w <= 64);
  assert(h <= 64);

  /* Filter starting 3 lines back. The neon implementation will ignore the
   * given height and filter a multiple of 4 lines. Since this goes in to
   * the temp buffer which has lots of extra room and is subsequently discarded
   * this is safe if somewhat less than ideal.
   */
  aom_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x,
                           x_step_q4, filter_y, y_step_q4, w,
                           intermediate_height);

  /* Step into the temp buffer 3 lines to get the actual frame data */
  aom_convolve8_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x,
                          x_step_q4, filter_y, y_step_q4, w, h);
}

/* Two-pass convolution whose final pass is the averaging vertical kernel:
 * aom_convolve8_horiz_neon() filters into a stack scratch buffer, then
 * aom_convolve8_avg_vert_neon() filters from that buffer into dst.
 * NOTE(review): the avg kernel presumably blends its result with the existing
 * contents of dst -- confirm against aom_convolve8_avg_vert_neon().
 *
 * src, src_stride:  source pointer and row pitch. The horizontal pass starts
 *                   3 rows above src and is asked for h + 7 rows, so those
 *                   rows must be readable.
 * dst, dst_stride:  destination pointer and row pitch, passed to the
 *                   averaging vertical pass.
 * filter_x/filter_y: filter kernels; both are forwarded to both passes.
 * x_step_q4/y_step_q4: must both be 16 (asserted).
 * w, h:             block size; each must be <= 64 so the intermediate rows
 *                   fit in the fixed-size scratch buffer (asserted).
 */
void aom_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4, int w,
                            int h) {
  /* Scratch buffer with a fixed row pitch of 64 bytes and room for 72 rows:
   * up to 64 + 7 intermediate rows, rounded up to a multiple of 4.
   */
  DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);

  /* The vertical pass needs 3 rows before and 4 rows after the block. */
  const int intermediate_height = h + 7;

  assert(y_step_q4 == 16);
  assert(x_step_q4 == 16);
  /* temp is sized for w <= 64 and h <= 64; larger blocks would overrun it. */
  assert(w <= 64);
  assert(h <= 64);

  /* The horizontal pass is the plain (non-averaging) kernel because we only
   * want to average the values after both passes. It starts 3 rows above src
   * and writes into temp.
   */
  aom_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x,
                           x_step_q4, filter_y, y_step_q4, w,
                           intermediate_height);

  /* Skip the 3 leading context rows in temp so the vertical pass is centred
   * on the actual block rows.
   */
  aom_convolve8_avg_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x,
                              x_step_q4, filter_y, y_step_q4, w, h);
}
vp9_convolve8_neon placeholder Call the individually optimized horizontal and vertical functions. This implementation abuses the temp buffer. This will be replaced with a custom optimized function. Over 2x speedup. Change-Id: I5b908d2a73d264e9810d6022bbff73207a3055dd 2013-07-16 21:13:06 +04:00			`/*`
Change to use aom copyright notice This minimize code differences between AOM master and nextgenv2 Change-Id: If144865bdf3ef0818e7aac11018b9e786444c550 2016-09-02 00:32:49 +03:00			`* Copyright (c) 2016, Alliance for Open Media. All rights reserved`
vp9_convolve8_neon placeholder Call the individually optimized horizontal and vertical functions. This implementation abuses the temp buffer. This will be replaced with a custom optimized function. Over 2x speedup. Change-Id: I5b908d2a73d264e9810d6022bbff73207a3055dd 2013-07-16 21:13:06 +04:00			`*`
Change to use aom copyright notice This minimize code differences between AOM master and nextgenv2 Change-Id: If144865bdf3ef0818e7aac11018b9e786444c550 2016-09-02 00:32:49 +03:00			`* This source code is subject to the terms of the BSD 2 Clause License and`
			`* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License`
			`* was not distributed with this source code in the LICENSE file, you can`
			`* obtain it at www.aomedia.org/license/software. If the Alliance for Open`
			`* Media Patent License 1.0 was not distributed with this source code in the`
			`* PATENTS file, you can obtain it at www.aomedia.org/license/patent.`
vp9_convolve8_neon placeholder Call the individually optimized horizontal and vertical functions. This implementation abuses the temp buffer. This will be replaced with a custom optimized function. Over 2x speedup. Change-Id: I5b908d2a73d264e9810d6022bbff73207a3055dd 2013-07-16 21:13:06 +04:00			`*/`

VPX: removed step checks from neon convolve code The check is handled by the predictor table. Change-Id: I42479f843e77a2d40cdcdfc9e2e6c48a05a36561 2015-08-13 02:14:53 +03:00			`#include <assert.h>`

Port renaming changes from AOMedia Cherry-Picked the following commits: 0defd8f Changed "WebM" to "AOMedia" & "webm" to "aomedia" 54e6676 Replace "VPx" by "AVx" 5082a36 Change "Vpx" to "Avx" 7df44f1 Replace "Vp9" w/ "Av1" 967f722 Remove kVp9CodecId 828f30c Change "Vp8" to "AOM" 030b5ff AUTHORS regenerated 2524cae Add ref-mv experimental flag 016762b Change copyright notice to AOMedia form 81e5526 Replace vp9 w/ av1 9b94565 Add missing files fa8ca9f Change "vp9" to "av1" ec838b7 Convert "vp8" to "aom" 80edfa0 Change "VP9" to "AV1" d1a11fb Change "vp8" to "aom" 7b58251 Point to WebM test data dd1a5c8 Replace "VP8" with "AOM" ff00fc0 Change "VPX" to "AOM" 01dee0b Change "vp10" to "av1" in source code cebe6f0 Convert "vpx" to "aom" 17b0567 rename vp10.mk to av1_.mk fe5f8a8 rename files vp10_* to av1_* Change-Id: I6fc3d18eb11fc171e46140c836ad5339cf6c9419 2016-08-31 00:01:10 +03:00			`#include "./aom_dsp_rtcd.h"`
			`#include "aom_dsp/aom_dsp_common.h"`
Port folder renaming changes from AOM Manually cherry-picked commits: ceef058 libvpx->libaom part2 3d26d91 libvpx -> libaom cfea7dd vp10/ -> av1/ 3a8eff7 Fix a build issue for a test bf4202e Rename vpx to aom Change-Id: I1b0eb5a40796e3aaf41c58984b4229a439a597dc 2016-08-23 02:08:15 +03:00			`#include "aom_ports/mem.h"`
vp9_convolve8_neon placeholder Call the individually optimized horizontal and vertical functions. This implementation abuses the temp buffer. This will be replaced with a custom optimized function. Over 2x speedup. Change-Id: I5b908d2a73d264e9810d6022bbff73207a3055dd 2013-07-16 21:13:06 +04:00
Port renaming changes from AOMedia Cherry-Picked the following commits: 0defd8f Changed "WebM" to "AOMedia" & "webm" to "aomedia" 54e6676 Replace "VPx" by "AVx" 5082a36 Change "Vpx" to "Avx" 7df44f1 Replace "Vp9" w/ "Av1" 967f722 Remove kVp9CodecId 828f30c Change "Vp8" to "AOM" 030b5ff AUTHORS regenerated 2524cae Add ref-mv experimental flag 016762b Change copyright notice to AOMedia form 81e5526 Replace vp9 w/ av1 9b94565 Add missing files fa8ca9f Change "vp9" to "av1" ec838b7 Convert "vp8" to "aom" 80edfa0 Change "VP9" to "AV1" d1a11fb Change "vp8" to "aom" 7b58251 Point to WebM test data dd1a5c8 Replace "VP8" with "AOM" ff00fc0 Change "VPX" to "AOM" 01dee0b Change "vp10" to "av1" in source code cebe6f0 Convert "vpx" to "aom" 17b0567 rename vp10.mk to av1_.mk fe5f8a8 rename files vp10_* to av1_* Change-Id: I6fc3d18eb11fc171e46140c836ad5339cf6c9419 2016-08-31 00:01:10 +03:00			`void aom_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,`
vpx_dsp/: apply clang-format Change-Id: Ia3f96910409be4ae8a4907a2f0dee73b1af8f93d 2016-08-09 08:59:08 +03:00			`ptrdiff_t dst_stride, const int16_t *filter_x,`
			`int x_step_q4, const int16_t *filter_y, int y_step_q4,`
vp9_convolve8_neon placeholder Call the individually optimized horizontal and vertical functions. This implementation abuses the temp buffer. This will be replaced with a custom optimized function. Over 2x speedup. Change-Id: I5b908d2a73d264e9810d6022bbff73207a3055dd 2013-07-16 21:13:06 +04:00			`int w, int h) {`
			`/* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the`
			`* maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).`
			`*/`
replace DECLARE_ALIGNED_ARRAY w/DECLARE_ALIGNED this macro was used inconsistently and only differs in behavior from DECLARE_ALIGNED when an alignment attribute is unavailable. this macro is used with calls to assembly, while generic c-code doesn't rely on it, so in a c-only build without an alignment attribute the code will function as expected. Change-Id: Ie9d06d4028c0de17c63b3a27e6c1b0491cc4ea79 2015-05-02 23:24:16 +03:00			`DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);`
vp9_convolve8_neon placeholder Call the individually optimized horizontal and vertical functions. This implementation abuses the temp buffer. This will be replaced with a custom optimized function. Over 2x speedup. Change-Id: I5b908d2a73d264e9810d6022bbff73207a3055dd 2013-07-16 21:13:06 +04:00
			`// Account for the vertical phase needing 3 lines prior and 4 lines post`
			`int intermediate_height = h + 7;`

VPX: removed step checks from neon convolve code The check is handled by the predictor table. Change-Id: I42479f843e77a2d40cdcdfc9e2e6c48a05a36561 2015-08-13 02:14:53 +03:00			`assert(y_step_q4 == 16);`
			`assert(x_step_q4 == 16);`
vp9_convolve8_neon placeholder Call the individually optimized horizontal and vertical functions. This implementation abuses the temp buffer. This will be replaced with a custom optimized function. Over 2x speedup. Change-Id: I5b908d2a73d264e9810d6022bbff73207a3055dd 2013-07-16 21:13:06 +04:00
			`/* Filter starting 3 lines back. The neon implementation will ignore the`
			`* given height and filter a multiple of 4 lines. Since this goes in to`
			`* the temp buffer which has lots of extra room and is subsequently discarded`
			`* this is safe if somewhat less than ideal.`
			`*/`
Port renaming changes from AOMedia Cherry-Picked the following commits: 0defd8f Changed "WebM" to "AOMedia" & "webm" to "aomedia" 54e6676 Replace "VPx" by "AVx" 5082a36 Change "Vpx" to "Avx" 7df44f1 Replace "Vp9" w/ "Av1" 967f722 Remove kVp9CodecId 828f30c Change "Vp8" to "AOM" 030b5ff AUTHORS regenerated 2524cae Add ref-mv experimental flag 016762b Change copyright notice to AOMedia form 81e5526 Replace vp9 w/ av1 9b94565 Add missing files fa8ca9f Change "vp9" to "av1" ec838b7 Convert "vp8" to "aom" 80edfa0 Change "VP9" to "AV1" d1a11fb Change "vp8" to "aom" 7b58251 Point to WebM test data dd1a5c8 Replace "VP8" with "AOM" ff00fc0 Change "VPX" to "AOM" 01dee0b Change "vp10" to "av1" in source code cebe6f0 Convert "vpx" to "aom" 17b0567 rename vp10.mk to av1_.mk fe5f8a8 rename files vp10_* to av1_* Change-Id: I6fc3d18eb11fc171e46140c836ad5339cf6c9419 2016-08-31 00:01:10 +03:00			`aom_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x,`
vpx_dsp/: apply clang-format Change-Id: Ia3f96910409be4ae8a4907a2f0dee73b1af8f93d 2016-08-09 08:59:08 +03:00			`x_step_q4, filter_y, y_step_q4, w,`
			`intermediate_height);`
vp9_convolve8_neon placeholder Call the individually optimized horizontal and vertical functions. This implementation abuses the temp buffer. This will be replaced with a custom optimized function. Over 2x speedup. Change-Id: I5b908d2a73d264e9810d6022bbff73207a3055dd 2013-07-16 21:13:06 +04:00
			`/* Step into the temp buffer 3 lines to get the actual frame data */`
Port renaming changes from AOMedia Cherry-Picked the following commits: 0defd8f Changed "WebM" to "AOMedia" & "webm" to "aomedia" 54e6676 Replace "VPx" by "AVx" 5082a36 Change "Vpx" to "Avx" 7df44f1 Replace "Vp9" w/ "Av1" 967f722 Remove kVp9CodecId 828f30c Change "Vp8" to "AOM" 030b5ff AUTHORS regenerated 2524cae Add ref-mv experimental flag 016762b Change copyright notice to AOMedia form 81e5526 Replace vp9 w/ av1 9b94565 Add missing files fa8ca9f Change "vp9" to "av1" ec838b7 Convert "vp8" to "aom" 80edfa0 Change "VP9" to "AV1" d1a11fb Change "vp8" to "aom" 7b58251 Point to WebM test data dd1a5c8 Replace "VP8" with "AOM" ff00fc0 Change "VPX" to "AOM" 01dee0b Change "vp10" to "av1" in source code cebe6f0 Convert "vpx" to "aom" 17b0567 rename vp10.mk to av1_.mk fe5f8a8 rename files vp10_* to av1_* Change-Id: I6fc3d18eb11fc171e46140c836ad5339cf6c9419 2016-08-31 00:01:10 +03:00			`aom_convolve8_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x,`
vpx_dsp/: apply clang-format Change-Id: Ia3f96910409be4ae8a4907a2f0dee73b1af8f93d 2016-08-09 08:59:08 +03:00			`x_step_q4, filter_y, y_step_q4, w, h);`
vp9_convolve8_neon placeholder Call the individually optimized horizontal and vertical functions. This implementation abuses the temp buffer. This will be replaced with a custom optimized function. Over 2x speedup. Change-Id: I5b908d2a73d264e9810d6022bbff73207a3055dd 2013-07-16 21:13:06 +04:00			`}`

Port renaming changes from AOMedia Cherry-Picked the following commits: 0defd8f Changed "WebM" to "AOMedia" & "webm" to "aomedia" 54e6676 Replace "VPx" by "AVx" 5082a36 Change "Vpx" to "Avx" 7df44f1 Replace "Vp9" w/ "Av1" 967f722 Remove kVp9CodecId 828f30c Change "Vp8" to "AOM" 030b5ff AUTHORS regenerated 2524cae Add ref-mv experimental flag 016762b Change copyright notice to AOMedia form 81e5526 Replace vp9 w/ av1 9b94565 Add missing files fa8ca9f Change "vp9" to "av1" ec838b7 Convert "vp8" to "aom" 80edfa0 Change "VP9" to "AV1" d1a11fb Change "vp8" to "aom" 7b58251 Point to WebM test data dd1a5c8 Replace "VP8" with "AOM" ff00fc0 Change "VPX" to "AOM" 01dee0b Change "vp10" to "av1" in source code cebe6f0 Convert "vpx" to "aom" 17b0567 rename vp10.mk to av1_.mk fe5f8a8 rename files vp10_* to av1_* Change-Id: I6fc3d18eb11fc171e46140c836ad5339cf6c9419 2016-08-31 00:01:10 +03:00			`void aom_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,`
vp9_convolve8_neon placeholder Call the individually optimized horizontal and vertical functions. This implementation abuses the temp buffer. This will be replaced with a custom optimized function. Over 2x speedup. Change-Id: I5b908d2a73d264e9810d6022bbff73207a3055dd 2013-07-16 21:13:06 +04:00			`uint8_t *dst, ptrdiff_t dst_stride,`
			`const int16_t *filter_x, int x_step_q4,`
vpx_dsp/: apply clang-format Change-Id: Ia3f96910409be4ae8a4907a2f0dee73b1af8f93d 2016-08-09 08:59:08 +03:00			`const int16_t *filter_y, int y_step_q4, int w,`
			`int h) {`
replace DECLARE_ALIGNED_ARRAY w/DECLARE_ALIGNED this macro was used inconsistently and only differs in behavior from DECLARE_ALIGNED when an alignment attribute is unavailable. this macro is used with calls to assembly, while generic c-code doesn't rely on it, so in a c-only build without an alignment attribute the code will function as expected. Change-Id: Ie9d06d4028c0de17c63b3a27e6c1b0491cc4ea79 2015-05-02 23:24:16 +03:00			`DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);`
vp9_convolve8_neon placeholder Call the individually optimized horizontal and vertical functions. This implementation abuses the temp buffer. This will be replaced with a custom optimized function. Over 2x speedup. Change-Id: I5b908d2a73d264e9810d6022bbff73207a3055dd 2013-07-16 21:13:06 +04:00			`int intermediate_height = h + 7;`

VPX: removed step checks from neon convolve code The check is handled by the predictor table. Change-Id: I42479f843e77a2d40cdcdfc9e2e6c48a05a36561 2015-08-13 02:14:53 +03:00			`assert(y_step_q4 == 16);`
			`assert(x_step_q4 == 16);`
vp9_convolve8_neon placeholder Call the individually optimized horizontal and vertical functions. This implementation abuses the temp buffer. This will be replaced with a custom optimized function. Over 2x speedup. Change-Id: I5b908d2a73d264e9810d6022bbff73207a3055dd 2013-07-16 21:13:06 +04:00
			`/* This implementation has the same issues as above. In addition, we only want`
			`* to average the values after both passes.`
			`*/`
Port renaming changes from AOMedia Cherry-Picked the following commits: 0defd8f Changed "WebM" to "AOMedia" & "webm" to "aomedia" 54e6676 Replace "VPx" by "AVx" 5082a36 Change "Vpx" to "Avx" 7df44f1 Replace "Vp9" w/ "Av1" 967f722 Remove kVp9CodecId 828f30c Change "Vp8" to "AOM" 030b5ff AUTHORS regenerated 2524cae Add ref-mv experimental flag 016762b Change copyright notice to AOMedia form 81e5526 Replace vp9 w/ av1 9b94565 Add missing files fa8ca9f Change "vp9" to "av1" ec838b7 Convert "vp8" to "aom" 80edfa0 Change "VP9" to "AV1" d1a11fb Change "vp8" to "aom" 7b58251 Point to WebM test data dd1a5c8 Replace "VP8" with "AOM" ff00fc0 Change "VPX" to "AOM" 01dee0b Change "vp10" to "av1" in source code cebe6f0 Convert "vpx" to "aom" 17b0567 rename vp10.mk to av1_.mk fe5f8a8 rename files vp10_* to av1_* Change-Id: I6fc3d18eb11fc171e46140c836ad5339cf6c9419 2016-08-31 00:01:10 +03:00			`aom_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x,`
vpx_dsp/: apply clang-format Change-Id: Ia3f96910409be4ae8a4907a2f0dee73b1af8f93d 2016-08-09 08:59:08 +03:00			`x_step_q4, filter_y, y_step_q4, w,`
			`intermediate_height);`
Port renaming changes from AOMedia Cherry-Picked the following commits: 0defd8f Changed "WebM" to "AOMedia" & "webm" to "aomedia" 54e6676 Replace "VPx" by "AVx" 5082a36 Change "Vpx" to "Avx" 7df44f1 Replace "Vp9" w/ "Av1" 967f722 Remove kVp9CodecId 828f30c Change "Vp8" to "AOM" 030b5ff AUTHORS regenerated 2524cae Add ref-mv experimental flag 016762b Change copyright notice to AOMedia form 81e5526 Replace vp9 w/ av1 9b94565 Add missing files fa8ca9f Change "vp9" to "av1" ec838b7 Convert "vp8" to "aom" 80edfa0 Change "VP9" to "AV1" d1a11fb Change "vp8" to "aom" 7b58251 Point to WebM test data dd1a5c8 Replace "VP8" with "AOM" ff00fc0 Change "VPX" to "AOM" 01dee0b Change "vp10" to "av1" in source code cebe6f0 Convert "vpx" to "aom" 17b0567 rename vp10.mk to av1_.mk fe5f8a8 rename files vp10_* to av1_* Change-Id: I6fc3d18eb11fc171e46140c836ad5339cf6c9419 2016-08-31 00:01:10 +03:00			`aom_convolve8_avg_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x,`
vpx_dsp/: apply clang-format Change-Id: Ia3f96910409be4ae8a4907a2f0dee73b1af8f93d 2016-08-09 08:59:08 +03:00			`x_step_q4, filter_y, y_step_q4, w, h);`
vp9_convolve8_neon placeholder Call the individually optimized horizontal and vertical functions. This implementation abuses the temp buffer. This will be replaced with a custom optimized function. Over 2x speedup. Change-Id: I5b908d2a73d264e9810d6022bbff73207a3055dd 2013-07-16 21:13:06 +04:00			`}`