aom/av1/common/arm/neon/iht4x4_add_neon.c

/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>

#include "./aom_config.h"
#include "./av1_rtcd.h"
#include "aom_dsp/txfm_common.h"
#include "av1/common/common.h"

// Transposes, in place, a 4x4 block of 16-bit values held in two registers.
// On entry *q8s16 holds rows 0-1 and *q9s16 holds rows 2-3 (four values per
// row); on exit the same layout holds the transposed block.
static INLINE void TRANSPOSE4X4(int16x8_t *q8s16, int16x8_t *q9s16) {
  // Step 1: interleave 16-bit lanes between the two halves of each register.
  const int16x4x2_t top =
      vtrn_s16(vget_low_s16(*q8s16), vget_high_s16(*q8s16));
  const int16x4x2_t bottom =
      vtrn_s16(vget_low_s16(*q9s16), vget_high_s16(*q9s16));

  // Step 2: view each result as 32-bit pairs and interleave across registers.
  const int32x4_t top_pairs =
      vreinterpretq_s32_s16(vcombine_s16(top.val[0], top.val[1]));
  const int32x4_t bottom_pairs =
      vreinterpretq_s32_s16(vcombine_s16(bottom.val[0], bottom.val[1]));
  const int32x4x2_t mixed = vtrnq_s32(top_pairs, bottom_pairs);

  *q8s16 = vreinterpretq_s16_s32(mixed.val[0]);
  *q9s16 = vreinterpretq_s16_s32(mixed.val[1]);
}

// Fills the three output vectors with the cosine constants used by
// IDCT4x4_1D, each constant replicated across all four 16-bit lanes:
//   *d0s16 <- cospi_8_64, *d1s16 <- cospi_16_64, *d2s16 <- cospi_24_64.
static INLINE void GENERATE_COSINE_CONSTANTS(int16x4_t *d0s16, int16x4_t *d1s16,
                                             int16x4_t *d2s16) {
  const int16_t c8 = (int16_t)cospi_8_64;
  const int16_t c16 = (int16_t)cospi_16_64;
  const int16_t c24 = (int16_t)cospi_24_64;

  *d0s16 = vdup_n_s16(c8);
  *d1s16 = vdup_n_s16(c16);
  *d2s16 = vdup_n_s16(c24);
}

// Fills the output vectors with the sine constants used by IADST4x4_1D, each
// constant replicated across every 16-bit lane:
//   *d3s16 <- sinpi_1_9, *d4s16 <- sinpi_2_9, *d5s16 <- sinpi_4_9 (4 lanes),
//   *q3s16 <- sinpi_3_9 (8 lanes).
static INLINE void GENERATE_SINE_CONSTANTS(int16x4_t *d3s16, int16x4_t *d4s16,
                                           int16x4_t *d5s16, int16x8_t *q3s16) {
  const int16_t s1 = (int16_t)sinpi_1_9;
  const int16_t s2 = (int16_t)sinpi_2_9;
  const int16_t s3 = (int16_t)sinpi_3_9;
  const int16_t s4 = (int16_t)sinpi_4_9;

  *d3s16 = vdup_n_s16(s1);
  *d4s16 = vdup_n_s16(s2);
  *q3s16 = vdupq_n_s16(s3);  // the only 128-bit constant
  *d5s16 = vdup_n_s16(s4);
}

// One 1-D pass of the 4-point inverse DCT over the two registers, in place.
//
// d0s16/d1s16/d2s16 are the constant vectors set up by
// GENERATE_COSINE_CONSTANTS (cospi_8_64, cospi_16_64, cospi_24_64).
// Treating the four 64-bit halves of *q8s16 / *q9s16 as in0..in3:
//   e0 = (in0 + in2) * d1        e1 = (in0 - in2) * d1
//   o0 = in1 * d0 + in3 * d2     o1 = in1 * d2 - in3 * d0
// Every product is narrowed with a saturating rounding shift right by 14.
// On exit *q8s16 = {e0 + o0, e1 + o1} and *q9s16 = {e1 - o1, e0 - o0}.
static INLINE void IDCT4x4_1D(int16x4_t *d0s16, int16x4_t *d1s16,
                              int16x4_t *d2s16, int16x8_t *q8s16,
                              int16x8_t *q9s16) {
  const int16x4_t in0 = vget_low_s16(*q8s16);
  const int16x4_t in1 = vget_high_s16(*q8s16);
  const int16x4_t in2 = vget_low_s16(*q9s16);
  const int16x4_t in3 = vget_high_s16(*q9s16);

  // Even part: sum and difference of in0/in2, both scaled by d1.
  const int32x4_t even_sum = vmull_s16(vadd_s16(in0, in2), *d1s16);
  const int32x4_t even_diff = vmull_s16(vsub_s16(in0, in2), *d1s16);

  // Odd part: rotation of (in1, in3) by the d0/d2 pair.
  const int32x4_t odd_sub = vmlsl_s16(vmull_s16(in1, *d2s16), in3, *d0s16);
  const int32x4_t odd_add = vmlal_s16(vmull_s16(in1, *d0s16), in3, *d2s16);

  // Round, shift by 14 and saturate back down to 16 bits.
  const int16x8_t even =
      vcombine_s16(vqrshrn_n_s32(even_sum, 14), vqrshrn_n_s32(even_diff, 14));
  const int16x8_t odd =
      vcombine_s16(vqrshrn_n_s32(odd_add, 14), vqrshrn_n_s32(odd_sub, 14));

  // Final butterfly; the difference register has its halves swapped (vswp).
  const int16x8_t diff = vsubq_s16(even, odd);
  *q8s16 = vaddq_s16(even, odd);
  *q9s16 = vcombine_s16(vget_high_s16(diff), vget_low_s16(diff));
}

// One 1-D pass of the 4-point inverse ADST over the two registers, in place.
//
// d3s16/d4s16/d5s16/q3s16 are the constant vectors set up by
// GENERATE_SINE_CONSTANTS (sinpi_1_9, sinpi_2_9, sinpi_4_9, sinpi_3_9).
// Treating the four 64-bit halves of *q8s16 / *q9s16 as x0..x3:
//   a   = d3 * x0 + d5 * x2 + d4 * x3
//   b   = d4 * x0 - d3 * x2 - d5 * x3
//   m   = sinpi_3_9 * x1
//   out0 = a + m
//   out1 = b + m
//   out2 = (x0 + x3 - x2) * sinpi_3_9
//   out3 = a + b - m
// All sums are formed in 32 bits and each output is narrowed with a
// saturating rounding shift right by 14. On exit *q8s16 = {out0, out1} and
// *q9s16 = {out2, out3}.
static INLINE void IADST4x4_1D(int16x4_t *d3s16, int16x4_t *d4s16,
                               int16x4_t *d5s16, int16x8_t *q3s16,
                               int16x8_t *q8s16, int16x8_t *q9s16) {
  const int16x4_t sin3 = vget_low_s16(*q3s16);

  const int16x4_t x0 = vget_low_s16(*q8s16);
  const int16x4_t x1 = vget_high_s16(*q8s16);
  const int16x4_t x2 = vget_low_s16(*q9s16);
  const int16x4_t x3 = vget_high_s16(*q9s16);

  // a = d3 * x0 + d5 * x2 + d4 * x3
  int32x4_t acc_a = vmull_s16(*d3s16, x0);
  acc_a = vaddq_s32(acc_a, vmull_s16(*d5s16, x2));
  acc_a = vaddq_s32(acc_a, vmull_s16(*d4s16, x3));

  // b = d4 * x0 - d3 * x2 - d5 * x3
  int32x4_t acc_b = vmull_s16(*d4s16, x0);
  acc_b = vsubq_s32(acc_b, vmull_s16(*d3s16, x2));
  acc_b = vsubq_s32(acc_b, vmull_s16(*d5s16, x3));

  // m = sinpi_3_9 * x1
  const int32x4_t mid = vmull_s16(sin3, x1);

  // out2 = (x0 + x3 - x2) * sinpi_3_9, widened to 32 bits before multiplying.
  int32x4_t out2 = vmovl_s16(x0);
  out2 = vaddw_s16(out2, x3);
  out2 = vsubw_s16(out2, x2);
  out2 = vmulq_s32(out2, vdupq_n_s32((int32_t)sinpi_3_9));

  const int32x4_t out0 = vaddq_s32(acc_a, mid);
  const int32x4_t out1 = vaddq_s32(acc_b, mid);
  const int32x4_t out3 = vsubq_s32(vaddq_s32(acc_a, acc_b), mid);

  // Round, shift by 14 and saturate back down to 16 bits.
  *q8s16 = vcombine_s16(vqrshrn_n_s32(out0, 14), vqrshrn_n_s32(out1, 14));
  *q9s16 = vcombine_s16(vqrshrn_n_s32(out2, 14), vqrshrn_n_s32(out3, 14));
}

// NEON inverse hybrid 4x4 transform: inverse-transforms the 16 coefficients
// at `input` and adds the result to the 4x4 pixel block at `dest`.
//
//   input       16 coefficients, read as two 8-lane 16-bit loads.
//               NOTE(review): assumes tran_low_t is a 16-bit type in this
//               build, since it is passed straight to vld1q_s16 - confirm.
//   dest        top-left pixel of the 4x4 destination block; updated in place.
//   dest_stride distance in bytes between consecutive rows of dest.
//   tx_type     0: not implemented here, forwarded to av1_iht4x4_16_add_c.
//               1: IDCT pass, transpose, IADST pass.
//               2: IADST pass, transpose, IDCT pass.
//               3: IADST pass, transpose, IADST pass.
//               Any other value asserts in debug builds and leaves dest
//               untouched otherwise.
void av1_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
                            int dest_stride, int tx_type) {
  uint8x8_t d26u8, d27u8;
  int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16;
  uint32x2_t d26u32, d27u32;
  int16x8_t q3s16, q8s16, q9s16;
  uint16x8_t q8u16, q9u16;

  // idct_idct is not supported. Fall back to C before doing any vector work
  // whose result would be thrown away.
  if (tx_type == 0) {
    av1_iht4x4_16_add_c(input, dest, dest_stride, tx_type);
    return;
  }

  d26u32 = d27u32 = vdup_n_u32(0);

  q8s16 = vld1q_s16(input);
  q9s16 = vld1q_s16(input + 8);

  TRANSPOSE4X4(&q8s16, &q9s16);

  switch (tx_type) {
    case 1:  // iadst_idct
      // generate constants
      GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
      GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);

      // first transform rows
      IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);

      // transpose the matrix
      TRANSPOSE4X4(&q8s16, &q9s16);

      // then transform columns
      IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
      break;
    case 2:  // idct_iadst
      // generate constants
      GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
      GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);

      // first transform rows
      IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);

      // transpose the matrix
      TRANSPOSE4X4(&q8s16, &q9s16);

      // then transform columns
      IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
      break;
    case 3:  // iadst_iadst
      // generate constants
      GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);

      // first transform rows
      IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);

      // transpose the matrix
      TRANSPOSE4X4(&q8s16, &q9s16);

      // then transform columns
      IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
      break;
    default:  // unsupported tx_type
      assert(0);
      // With NDEBUG the assert compiles away; return so that untransformed
      // coefficients are never added into the destination pixels.
      return;
  }

  // Final rounding shift of the residual.
  q8s16 = vrshrq_n_s16(q8s16, 4);
  q9s16 = vrshrq_n_s16(q9s16, 4);

  // Gather the four destination rows, 4 bytes each: rows 0-1 into d26u32 and
  // rows 2-3 into d27u32. dest is left pointing at the last row.
  // NOTE(review): dest is accessed through a uint32_t pointer; this assumes
  // the lane load/store tolerates whatever alignment dest has - confirm.
  d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0);
  dest += dest_stride;
  d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1);
  dest += dest_stride;
  d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0);
  dest += dest_stride;
  d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1);

  // Widen the pixels to 16 bits and add them to the residual.
  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));

  // Saturate the signed sums back to unsigned 8-bit pixels.
  d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
  d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));

  // Store the rows back bottom-up, walking dest back to the first row.
  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1);
  dest -= dest_stride;
  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0);
  dest -= dest_stride;
  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1);
  dest -= dest_stride;
  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0);
}
Fork VP9 and VP10 codebase This commit folks the VP9 and VP10 codebase and makes libvpx support VP8, VP9, and VP10. Change-Id: I81782e0b809acb3c9844bee8c8ec8f4d5e8fa356 2015-08-06 05:00:31 +03:00			`/*`
Change to use aom copyright notice This minimize code differences between AOM master and nextgenv2 Change-Id: If144865bdf3ef0818e7aac11018b9e786444c550 2016-09-02 00:32:49 +03:00			`* Copyright (c) 2016, Alliance for Open Media. All rights reserved`
Fork VP9 and VP10 codebase This commit folks the VP9 and VP10 codebase and makes libvpx support VP8, VP9, and VP10. Change-Id: I81782e0b809acb3c9844bee8c8ec8f4d5e8fa356 2015-08-06 05:00:31 +03:00			`*`
Change to use aom copyright notice This minimize code differences between AOM master and nextgenv2 Change-Id: If144865bdf3ef0818e7aac11018b9e786444c550 2016-09-02 00:32:49 +03:00			`* This source code is subject to the terms of the BSD 2 Clause License and`
			`* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License`
			`* was not distributed with this source code in the LICENSE file, you can`
			`* obtain it at www.aomedia.org/license/software. If the Alliance for Open`
			`* Media Patent License 1.0 was not distributed with this source code in the`
			`* PATENTS file, you can obtain it at www.aomedia.org/license/patent.`
Fork VP9 and VP10 codebase This commit folks the VP9 and VP10 codebase and makes libvpx support VP8, VP9, and VP10. Change-Id: I81782e0b809acb3c9844bee8c8ec8f4d5e8fa356 2015-08-06 05:00:31 +03:00			`*/`

			`#include <arm_neon.h>`
			`#include <assert.h>`

Port renaming changes from AOMedia Cherry-Picked the following commits: 0defd8f Changed "WebM" to "AOMedia" & "webm" to "aomedia" 54e6676 Replace "VPx" by "AVx" 5082a36 Change "Vpx" to "Avx" 7df44f1 Replace "Vp9" w/ "Av1" 967f722 Remove kVp9CodecId 828f30c Change "Vp8" to "AOM" 030b5ff AUTHORS regenerated 2524cae Add ref-mv experimental flag 016762b Change copyright notice to AOMedia form 81e5526 Replace vp9 w/ av1 9b94565 Add missing files fa8ca9f Change "vp9" to "av1" ec838b7 Convert "vp8" to "aom" 80edfa0 Change "VP9" to "AV1" d1a11fb Change "vp8" to "aom" 7b58251 Point to WebM test data dd1a5c8 Replace "VP8" with "AOM" ff00fc0 Change "VPX" to "AOM" 01dee0b Change "vp10" to "av1" in source code cebe6f0 Convert "vpx" to "aom" 17b0567 rename vp10.mk to av1_.mk fe5f8a8 rename files vp10_* to av1_* Change-Id: I6fc3d18eb11fc171e46140c836ad5339cf6c9419 2016-08-31 00:01:10 +03:00			`#include "./aom_config.h"`
Fix warnings reported by -Wshadow: Part2: av1 directory While we are at it: - Rename some variables to more meaningful names - Reuse some common consts from a header instead of redefining them. Cherry-picked from aomedia/master: 863b0499 Change-Id: Ida5de713156dc0126a27f90fdd36d29a398a3c88 2016-10-15 02:51:44 +03:00			`#include "./av1_rtcd.h"`
			`#include "aom_dsp/txfm_common.h"`
Port folder renaming changes from AOM Manually cherry-picked commits: ceef058 libvpx->libaom part2 3d26d91 libvpx -> libaom cfea7dd vp10/ -> av1/ 3a8eff7 Fix a build issue for a test bf4202e Rename vpx to aom Change-Id: I1b0eb5a40796e3aaf41c58984b4229a439a597dc 2016-08-23 02:08:15 +03:00			`#include "av1/common/common.h"`
Fork VP9 and VP10 codebase This commit folks the VP9 and VP10 codebase and makes libvpx support VP8, VP9, and VP10. Change-Id: I81782e0b809acb3c9844bee8c8ec8f4d5e8fa356 2015-08-06 05:00:31 +03:00
vp10/common: apply clang-format Change-Id: I01d8241eba3ccaf4d06c00a51df2d17c126f6f9d 2016-08-12 04:55:00 +03:00			`static INLINE void TRANSPOSE4X4(int16x8_t q8s16, int16x8_t q9s16) {`
			`int32x4_t q8s32, q9s32;`
			`int16x4x2_t d0x2s16, d1x2s16;`
			`int32x4x2_t q0x2s32;`

			`d0x2s16 = vtrn_s16(vget_low_s16(q8s16), vget_high_s16(q8s16));`
			`d1x2s16 = vtrn_s16(vget_low_s16(q9s16), vget_high_s16(q9s16));`

			`q8s32 = vreinterpretq_s32_s16(vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]));`
			`q9s32 = vreinterpretq_s32_s16(vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]));`
			`q0x2s32 = vtrnq_s32(q8s32, q9s32);`

			`*q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]);`
			`*q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]);`
			`return;`
Fork VP9 and VP10 codebase This commit folks the VP9 and VP10 codebase and makes libvpx support VP8, VP9, and VP10. Change-Id: I81782e0b809acb3c9844bee8c8ec8f4d5e8fa356 2015-08-06 05:00:31 +03:00			`}`

vp10/common: apply clang-format Change-Id: I01d8241eba3ccaf4d06c00a51df2d17c126f6f9d 2016-08-12 04:55:00 +03:00			`static INLINE void GENERATE_COSINE_CONSTANTS(int16x4_t d0s16, int16x4_t d1s16,`
			`int16x4_t *d2s16) {`
Fix warnings reported by -Wshadow: Part2: av1 directory While we are at it: - Rename some variables to more meaningful names - Reuse some common consts from a header instead of redefining them. Cherry-picked from aomedia/master: 863b0499 Change-Id: Ida5de713156dc0126a27f90fdd36d29a398a3c88 2016-10-15 02:51:44 +03:00			`*d0s16 = vdup_n_s16((int16_t)cospi_8_64);`
			`*d1s16 = vdup_n_s16((int16_t)cospi_16_64);`
			`*d2s16 = vdup_n_s16((int16_t)cospi_24_64);`
vp10/common: apply clang-format Change-Id: I01d8241eba3ccaf4d06c00a51df2d17c126f6f9d 2016-08-12 04:55:00 +03:00			`return;`
Fork VP9 and VP10 codebase This commit folks the VP9 and VP10 codebase and makes libvpx support VP8, VP9, and VP10. Change-Id: I81782e0b809acb3c9844bee8c8ec8f4d5e8fa356 2015-08-06 05:00:31 +03:00			`}`

vp10/common: apply clang-format Change-Id: I01d8241eba3ccaf4d06c00a51df2d17c126f6f9d 2016-08-12 04:55:00 +03:00			`static INLINE void GENERATE_SINE_CONSTANTS(int16x4_t d3s16, int16x4_t d4s16,`
			`int16x4_t d5s16, int16x8_t q3s16) {`
Fix warnings reported by -Wshadow: Part2: av1 directory While we are at it: - Rename some variables to more meaningful names - Reuse some common consts from a header instead of redefining them. Cherry-picked from aomedia/master: 863b0499 Change-Id: Ida5de713156dc0126a27f90fdd36d29a398a3c88 2016-10-15 02:51:44 +03:00			`*d3s16 = vdup_n_s16((int16_t)sinpi_1_9);`
			`*d4s16 = vdup_n_s16((int16_t)sinpi_2_9);`
			`*q3s16 = vdupq_n_s16((int16_t)sinpi_3_9);`
			`*d5s16 = vdup_n_s16((int16_t)sinpi_4_9);`
vp10/common: apply clang-format Change-Id: I01d8241eba3ccaf4d06c00a51df2d17c126f6f9d 2016-08-12 04:55:00 +03:00			`return;`
Fork VP9 and VP10 codebase This commit folks the VP9 and VP10 codebase and makes libvpx support VP8, VP9, and VP10. Change-Id: I81782e0b809acb3c9844bee8c8ec8f4d5e8fa356 2015-08-06 05:00:31 +03:00			`}`

vp10/common: apply clang-format Change-Id: I01d8241eba3ccaf4d06c00a51df2d17c126f6f9d 2016-08-12 04:55:00 +03:00			`static INLINE void IDCT4x4_1D(int16x4_t d0s16, int16x4_t d1s16,`
			`int16x4_t d2s16, int16x8_t q8s16,`
			`int16x8_t *q9s16) {`
			`int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16;`
			`int16x4_t d26s16, d27s16, d28s16, d29s16;`
			`int32x4_t q10s32, q13s32, q14s32, q15s32;`
			`int16x8_t q13s16, q14s16;`

			`d16s16 = vget_low_s16(*q8s16);`
			`d17s16 = vget_high_s16(*q8s16);`
			`d18s16 = vget_low_s16(*q9s16);`
			`d19s16 = vget_high_s16(*q9s16);`

			`d23s16 = vadd_s16(d16s16, d18s16);`
			`d24s16 = vsub_s16(d16s16, d18s16);`

			`q15s32 = vmull_s16(d17s16, *d2s16);`
			`q10s32 = vmull_s16(d17s16, *d0s16);`
			`q13s32 = vmull_s16(d23s16, *d1s16);`
			`q14s32 = vmull_s16(d24s16, *d1s16);`
			`q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16);`
			`q10s32 = vmlal_s16(q10s32, d19s16, *d2s16);`

			`d26s16 = vqrshrn_n_s32(q13s32, 14);`
			`d27s16 = vqrshrn_n_s32(q14s32, 14);`
			`d29s16 = vqrshrn_n_s32(q15s32, 14);`
			`d28s16 = vqrshrn_n_s32(q10s32, 14);`

			`q13s16 = vcombine_s16(d26s16, d27s16);`
			`q14s16 = vcombine_s16(d28s16, d29s16);`
			`*q8s16 = vaddq_s16(q13s16, q14s16);`
			`*q9s16 = vsubq_s16(q13s16, q14s16);`
			`q9s16 = vcombine_s16(vget_high_s16(q9s16), vget_low_s16(*q9s16)); // vswp`
			`return;`
Fork VP9 and VP10 codebase This commit folks the VP9 and VP10 codebase and makes libvpx support VP8, VP9, and VP10. Change-Id: I81782e0b809acb3c9844bee8c8ec8f4d5e8fa356 2015-08-06 05:00:31 +03:00			`}`

vp10/common: apply clang-format Change-Id: I01d8241eba3ccaf4d06c00a51df2d17c126f6f9d 2016-08-12 04:55:00 +03:00			`static INLINE void IADST4x4_1D(int16x4_t d3s16, int16x4_t d4s16,`
			`int16x4_t d5s16, int16x8_t q3s16,`
			`int16x8_t q8s16, int16x8_t q9s16) {`
			`int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16;`
			`int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;`

			`d6s16 = vget_low_s16(*q3s16);`

			`d16s16 = vget_low_s16(*q8s16);`
			`d17s16 = vget_high_s16(*q8s16);`
			`d18s16 = vget_low_s16(*q9s16);`
			`d19s16 = vget_high_s16(*q9s16);`

			`q10s32 = vmull_s16(*d3s16, d16s16);`
			`q11s32 = vmull_s16(*d4s16, d16s16);`
			`q12s32 = vmull_s16(d6s16, d17s16);`
			`q13s32 = vmull_s16(*d5s16, d18s16);`
			`q14s32 = vmull_s16(*d3s16, d18s16);`
			`q15s32 = vmovl_s16(d16s16);`
			`q15s32 = vaddw_s16(q15s32, d19s16);`
			`q8s32 = vmull_s16(*d4s16, d19s16);`
			`q15s32 = vsubw_s16(q15s32, d18s16);`
			`q9s32 = vmull_s16(*d5s16, d19s16);`

			`q10s32 = vaddq_s32(q10s32, q13s32);`
			`q10s32 = vaddq_s32(q10s32, q8s32);`
			`q11s32 = vsubq_s32(q11s32, q14s32);`
Fix warnings reported by -Wshadow: Part2: av1 directory While we are at it: - Rename some variables to more meaningful names - Reuse some common consts from a header instead of redefining them. Cherry-picked from aomedia/master: 863b0499 Change-Id: Ida5de713156dc0126a27f90fdd36d29a398a3c88 2016-10-15 02:51:44 +03:00			`q8s32 = vdupq_n_s32((int32_t)sinpi_3_9);`
vp10/common: apply clang-format Change-Id: I01d8241eba3ccaf4d06c00a51df2d17c126f6f9d 2016-08-12 04:55:00 +03:00			`q11s32 = vsubq_s32(q11s32, q9s32);`
			`q15s32 = vmulq_s32(q15s32, q8s32);`

			`q13s32 = vaddq_s32(q10s32, q12s32);`
			`q10s32 = vaddq_s32(q10s32, q11s32);`
			`q14s32 = vaddq_s32(q11s32, q12s32);`
			`q10s32 = vsubq_s32(q10s32, q12s32);`

			`d16s16 = vqrshrn_n_s32(q13s32, 14);`
			`d17s16 = vqrshrn_n_s32(q14s32, 14);`
			`d18s16 = vqrshrn_n_s32(q15s32, 14);`
			`d19s16 = vqrshrn_n_s32(q10s32, 14);`

			`*q8s16 = vcombine_s16(d16s16, d17s16);`
			`*q9s16 = vcombine_s16(d18s16, d19s16);`
			`return;`
Fork VP9 and VP10 codebase This commit folks the VP9 and VP10 codebase and makes libvpx support VP8, VP9, and VP10. Change-Id: I81782e0b809acb3c9844bee8c8ec8f4d5e8fa356 2015-08-06 05:00:31 +03:00			`}`

Port renaming changes from AOMedia Cherry-Picked the following commits: 0defd8f Changed "WebM" to "AOMedia" & "webm" to "aomedia" 54e6676 Replace "VPx" by "AVx" 5082a36 Change "Vpx" to "Avx" 7df44f1 Replace "Vp9" w/ "Av1" 967f722 Remove kVp9CodecId 828f30c Change "Vp8" to "AOM" 030b5ff AUTHORS regenerated 2524cae Add ref-mv experimental flag 016762b Change copyright notice to AOMedia form 81e5526 Replace vp9 w/ av1 9b94565 Add missing files fa8ca9f Change "vp9" to "av1" ec838b7 Convert "vp8" to "aom" 80edfa0 Change "VP9" to "AV1" d1a11fb Change "vp8" to "aom" 7b58251 Point to WebM test data dd1a5c8 Replace "VP8" with "AOM" ff00fc0 Change "VPX" to "AOM" 01dee0b Change "vp10" to "av1" in source code cebe6f0 Convert "vpx" to "aom" 17b0567 rename vp10.mk to av1_.mk fe5f8a8 rename files vp10_* to av1_* Change-Id: I6fc3d18eb11fc171e46140c836ad5339cf6c9419 2016-08-31 00:01:10 +03:00			`void av1_iht4x4_16_add_neon(const tran_low_t input, uint8_t dest,`
			`int dest_stride, int tx_type) {`
vp10/common: apply clang-format Change-Id: I01d8241eba3ccaf4d06c00a51df2d17c126f6f9d 2016-08-12 04:55:00 +03:00			`uint8x8_t d26u8, d27u8;`
			`int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16;`
			`uint32x2_t d26u32, d27u32;`
			`int16x8_t q3s16, q8s16, q9s16;`
			`uint16x8_t q8u16, q9u16;`

			`d26u32 = d27u32 = vdup_n_u32(0);`

			`q8s16 = vld1q_s16(input);`
			`q9s16 = vld1q_s16(input + 8);`

			`TRANSPOSE4X4(&q8s16, &q9s16);`

			`switch (tx_type) {`
			`case 0: // idct_idct is not supported. Fall back to C`
Port renaming changes from AOMedia Cherry-Picked the following commits: 0defd8f Changed "WebM" to "AOMedia" & "webm" to "aomedia" 54e6676 Replace "VPx" by "AVx" 5082a36 Change "Vpx" to "Avx" 7df44f1 Replace "Vp9" w/ "Av1" 967f722 Remove kVp9CodecId 828f30c Change "Vp8" to "AOM" 030b5ff AUTHORS regenerated 2524cae Add ref-mv experimental flag 016762b Change copyright notice to AOMedia form 81e5526 Replace vp9 w/ av1 9b94565 Add missing files fa8ca9f Change "vp9" to "av1" ec838b7 Convert "vp8" to "aom" 80edfa0 Change "VP9" to "AV1" d1a11fb Change "vp8" to "aom" 7b58251 Point to WebM test data dd1a5c8 Replace "VP8" with "AOM" ff00fc0 Change "VPX" to "AOM" 01dee0b Change "vp10" to "av1" in source code cebe6f0 Convert "vpx" to "aom" 17b0567 rename vp10.mk to av1_.mk fe5f8a8 rename files vp10_* to av1_* Change-Id: I6fc3d18eb11fc171e46140c836ad5339cf6c9419 2016-08-31 00:01:10 +03:00			`av1_iht4x4_16_add_c(input, dest, dest_stride, tx_type);`
vp10/common: apply clang-format Change-Id: I01d8241eba3ccaf4d06c00a51df2d17c126f6f9d 2016-08-12 04:55:00 +03:00			`return;`
			`break;`
			`case 1: // iadst_idct`
			`// generate constants`
			`GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);`
			`GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);`

			`// first transform rows`
			`IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);`

			`// transpose the matrix`
			`TRANSPOSE4X4(&q8s16, &q9s16);`

			`// then transform columns`
			`IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);`
			`break;`
			`case 2: // idct_iadst`
			`// generate constantsyy`
			`GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);`
			`GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);`

			`// first transform rows`
			`IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);`

			`// transpose the matrix`
			`TRANSPOSE4X4(&q8s16, &q9s16);`

			`// then transform columns`
			`IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);`
			`break;`
			`case 3: // iadst_iadst`
			`// generate constants`
			`GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);`

			`// first transform rows`
			`IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);`

			`// transpose the matrix`
			`TRANSPOSE4X4(&q8s16, &q9s16);`

			`// then transform columns`
			`IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);`
			`break;`
			`default: // iadst_idct`
			`assert(0);`
			`break;`
			`}`

			`q8s16 = vrshrq_n_s16(q8s16, 4);`
			`q9s16 = vrshrq_n_s16(q9s16, 4);`

			`d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0);`
			`dest += dest_stride;`
			`d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1);`
			`dest += dest_stride;`
			`d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0);`
			`dest += dest_stride;`
			`d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1);`

			`q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));`
			`q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));`

			`d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));`
			`d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));`

			`vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1);`
			`dest -= dest_stride;`
			`vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0);`
			`dest -= dest_stride;`
			`vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1);`
			`dest -= dest_stride;`
			`vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0);`
			`return;`
Fork VP9 and VP10 codebase This commit folks the VP9 and VP10 codebase and makes libvpx support VP8, VP9, and VP10. Change-Id: I81782e0b809acb3c9844bee8c8ec8f4d5e8fa356 2015-08-06 05:00:31 +03:00			`}`