aom/aom_dsp/mips/convolve2_avg_dspr2.c

/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <stdio.h>
#include <string.h>

#include "./aom_dsp_rtcd.h"
#include "aom_dsp/mips/convolve_common_dspr2.h"
#include "aom_dsp/aom_convolve.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_ports/mem.h"

#if HAVE_DSPR2
/*
 * Two-tap vertical convolution whose result is averaged into dst.
 *
 * For each of the h rows, the w columns are processed four pixels at a time.
 * Every output pixel combines the source pixel on the current row with the
 * pixel one row below (src + src_stride), weighted by the tap pair stored at
 * filter_y[3] and filter_y[4]. The filtered value is looked up in the
 * aom_ff_cropTbl table and then combined with the byte already present in
 * dst by addqh_r.w before being stored back.
 *
 * src        - top-left source pixel; rows y and y + 1 are both read.
 * src_stride - distance in bytes between source rows.
 * dst        - destination; read (for averaging) and written.
 * dst_stride - distance in bytes between destination rows.
 * filter_y   - filter kernel; only elements [3] and [4] are used.
 * w          - width in pixels; consumed in steps of 4.
 * h          - number of rows.
 *
 * The extp instructions depend on the DSPControl "pos" field, which this
 * function does not set; the public entry point in this file writes it with
 * wrdsp before calling here.
 */
static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride,
                                         const int16_t *filter_y, int32_t w,
                                         int32_t h) {
  int32_t x, y;
  const uint8_t *src_ptr;
  uint8_t *dst_ptr;
  uint8_t *cm = aom_ff_cropTbl;
  uint32_t vector4a = 64; /* preloaded into every accumulator (rounding term) */
  uint32_t load1, load2;
  uint32_t p1, p2;
  uint32_t scratch1, scratch2;
  uint32_t store1, store2;
  int32_t Temp1, Temp2;
  uint32_t filter45;

  /* Pack taps [3] and [4] into one 32-bit word for dpa.w.ph. memcpy copies
   * the same bytes the former int32_t pointer cast read, but without
   * accessing int16_t storage through an int32_t lvalue and without assuming
   * that &filter_y[3] is 4-byte aligned. */
  memcpy(&filter45, &filter_y[3], sizeof(filter45));

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_store(dst + dst_stride);

    for (x = 0; x < w; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;

      /* load1 = four pixels of the current row, load2 = four pixels of the
       * row below. Each accumulator handles one output pixel. */
      __asm__ __volatile__(
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"

          "mtlo             %[vector4a],  $ac0                            \n\t"
          "mtlo             %[vector4a],  $ac1                            \n\t"
          "mtlo             %[vector4a],  $ac2                            \n\t"
          "mtlo             %[vector4a],  $ac3                            \n\t"
          "mthi             $zero,        $ac0                            \n\t"
          "mthi             $zero,        $ac1                            \n\t"
          "mthi             $zero,        $ac2                            \n\t"
          "mthi             $zero,        $ac3                            \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[filter45]     \n\t"
          "dpa.w.ph         $ac1,         %[p2],          %[filter45]     \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac2,         %[p1],          %[filter45]     \n\t"
          "dpa.w.ph         $ac3,         %[p2],          %[filter45]     \n\t"

          "extp             %[Temp1],     $ac0,           31              \n\t"
          "extp             %[Temp2],     $ac1,           31              \n\t"

          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
          "extp             %[Temp1],     $ac2,           31              \n\t"

          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
          "extp             %[Temp2],     $ac3,           31              \n\t"
          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"

          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */

          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
          "sb               %[store2],    3(%[dst_ptr])                   \n\t"

          : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
            [p2] "=&r"(p2), [scratch1] "=&r"(scratch1),
            [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
            [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
            [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
          : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
            [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)
          /* The block loads from src/dst and stores to dst through pointers
           * that are passed only as register operands, so the compiler must
           * be told that memory is accessed.
           * NOTE(review): accumulators $ac0-$ac3 are also modified but are
           * not listed as clobbers - confirm against the toolchain. */
          : "memory");
    }

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

/*
 * Two-tap vertical convolution, averaged into dst, for blocks that are
 * exactly 64 pixels wide.
 *
 * For each of the h rows, 64 columns are processed four pixels at a time.
 * Every output pixel combines the source pixel on the current row with the
 * pixel one row below (src + src_stride), weighted by the tap pair stored at
 * filter_y[3] and filter_y[4]. The filtered value is looked up in the
 * aom_ff_cropTbl table and then combined with the byte already present in
 * dst by addqh_r.w before being stored back.
 *
 * src        - top-left source pixel; rows y and y + 1 are both read.
 * src_stride - distance in bytes between source rows.
 * dst        - destination; read (for averaging) and written.
 * dst_stride - distance in bytes between destination rows.
 * filter_y   - filter kernel; only elements [3] and [4] are used.
 * h          - number of rows.
 *
 * The extp instructions depend on the DSPControl "pos" field, which this
 * function does not set; the public entry point in this file writes it with
 * wrdsp before calling here.
 */
static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src,
                                          int32_t src_stride, uint8_t *dst,
                                          int32_t dst_stride,
                                          const int16_t *filter_y, int32_t h) {
  int32_t x, y;
  const uint8_t *src_ptr;
  uint8_t *dst_ptr;
  uint8_t *cm = aom_ff_cropTbl;
  uint32_t vector4a = 64; /* preloaded into every accumulator (rounding term) */
  uint32_t load1, load2;
  uint32_t p1, p2;
  uint32_t scratch1, scratch2;
  uint32_t store1, store2;
  int32_t Temp1, Temp2;
  uint32_t filter45;

  /* Pack taps [3] and [4] into one 32-bit word for dpa.w.ph. memcpy copies
   * the same bytes the former int32_t pointer cast read, but without
   * accessing int16_t storage through an int32_t lvalue and without assuming
   * that &filter_y[3] is 4-byte aligned. */
  memcpy(&filter45, &filter_y[3], sizeof(filter45));

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_store(dst + dst_stride);
    prefetch_store(dst + dst_stride + 32);

    for (x = 0; x < 64; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;

      /* load1 = four pixels of the current row, load2 = four pixels of the
       * row below. Each accumulator handles one output pixel. */
      __asm__ __volatile__(
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"

          "mtlo             %[vector4a],  $ac0                            \n\t"
          "mtlo             %[vector4a],  $ac1                            \n\t"
          "mtlo             %[vector4a],  $ac2                            \n\t"
          "mtlo             %[vector4a],  $ac3                            \n\t"
          "mthi             $zero,        $ac0                            \n\t"
          "mthi             $zero,        $ac1                            \n\t"
          "mthi             $zero,        $ac2                            \n\t"
          "mthi             $zero,        $ac3                            \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[filter45]     \n\t"
          "dpa.w.ph         $ac1,         %[p2],          %[filter45]     \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac2,         %[p1],          %[filter45]     \n\t"
          "dpa.w.ph         $ac3,         %[p2],          %[filter45]     \n\t"

          "extp             %[Temp1],     $ac0,           31              \n\t"
          "extp             %[Temp2],     $ac1,           31              \n\t"

          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
          "extp             %[Temp1],     $ac2,           31              \n\t"

          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
          "extp             %[Temp2],     $ac3,           31              \n\t"
          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"

          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */

          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
          "sb               %[store2],    3(%[dst_ptr])                   \n\t"

          : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
            [p2] "=&r"(p2), [scratch1] "=&r"(scratch1),
            [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
            [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
            [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
          : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
            [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)
          /* The block loads from src/dst and stores to dst through pointers
           * that are passed only as register operands, so the compiler must
           * be told that memory is accessed.
           * NOTE(review): accumulators $ac0-$ac3 are also modified but are
           * not listed as clobbers - confirm against the toolchain. */
          : "memory");
    }

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

/*
 * Public entry point: two-tap vertical convolution averaged into dst.
 *
 * Writes 38 to the DSPControl "pos" field (used by the extp instructions in
 * the DSPr2 kernels), then dispatches on the block width:
 *   - 4, 8, 16, 32 : generic 4-pixels-per-step kernel
 *   - 64           : dedicated 64-wide kernel
 *   - anything else: aom_convolve8_avg_vert_c
 *
 * y_step_q4 must be 16; this is enforced by assert only. filter_x and
 * x_step_q4 are merely forwarded to the C fallback.
 */
void aom_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4, int w,
                                  int h) {
  uint32_t extract_pos = 38;

  assert(y_step_q4 == 16);

  /* bit position for extract from acc */
  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
                       :
                       : [pos] "r"(extract_pos));

  prefetch_store(dst);

  if (w == 64) {
    /* The 64-wide kernel touches a second 32-byte chunk per row. */
    prefetch_store(dst + 32);
    convolve_bi_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y,
                                  h);
  } else if (w == 4 || w == 8 || w == 16 || w == 32) {
    convolve_bi_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w,
                                 h);
  } else {
    /* Widths without a DSPr2 kernel use the C implementation. */
    aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
                             x_step_q4, filter_y, y_step_q4, w, h);
  }
}
#endif
mips dsp-ase r2 vp9 decoder bilinear convolve optimizations Change-Id: Ic31b4ef85e65070b4f8b9f26e068ccfaae00c4f0 2013-10-09 16:35:27 +04:00			`/*`
Change to use aom copyright notice This minimize code differences between AOM master and nextgenv2 Change-Id: If144865bdf3ef0818e7aac11018b9e786444c550 2016-09-02 00:32:49 +03:00			`* Copyright (c) 2016, Alliance for Open Media. All rights reserved`
mips dsp-ase r2 vp9 decoder bilinear convolve optimizations Change-Id: Ic31b4ef85e65070b4f8b9f26e068ccfaae00c4f0 2013-10-09 16:35:27 +04:00			`*`
Change to use aom copyright notice This minimize code differences between AOM master and nextgenv2 Change-Id: If144865bdf3ef0818e7aac11018b9e786444c550 2016-09-02 00:32:49 +03:00			`* This source code is subject to the terms of the BSD 2 Clause License and`
			`* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License`
			`* was not distributed with this source code in the LICENSE file, you can`
			`* obtain it at www.aomedia.org/license/software. If the Alliance for Open`
			`* Media Patent License 1.0 was not distributed with this source code in the`
			`* PATENTS file, you can obtain it at www.aomedia.org/license/patent.`
mips dsp-ase r2 vp9 decoder bilinear convolve optimizations Change-Id: Ic31b4ef85e65070b4f8b9f26e068ccfaae00c4f0 2013-10-09 16:35:27 +04:00			`*/`

			`#include <assert.h>`
			`#include <stdio.h>`

Port renaming changes from AOMedia Cherry-Picked the following commits: 0defd8f Changed "WebM" to "AOMedia" & "webm" to "aomedia" 54e6676 Replace "VPx" by "AVx" 5082a36 Change "Vpx" to "Avx" 7df44f1 Replace "Vp9" w/ "Av1" 967f722 Remove kVp9CodecId 828f30c Change "Vp8" to "AOM" 030b5ff AUTHORS regenerated 2524cae Add ref-mv experimental flag 016762b Change copyright notice to AOMedia form 81e5526 Replace vp9 w/ av1 9b94565 Add missing files fa8ca9f Change "vp9" to "av1" ec838b7 Convert "vp8" to "aom" 80edfa0 Change "VP9" to "AV1" d1a11fb Change "vp8" to "aom" 7b58251 Point to WebM test data dd1a5c8 Replace "VP8" with "AOM" ff00fc0 Change "VPX" to "AOM" 01dee0b Change "vp10" to "av1" in source code cebe6f0 Convert "vpx" to "aom" 17b0567 rename vp10.mk to av1_.mk fe5f8a8 rename files vp10_* to av1_* Change-Id: I6fc3d18eb11fc171e46140c836ad5339cf6c9419 2016-08-31 00:01:10 +03:00			`#include "./aom_dsp_rtcd.h"`
Port folder renaming changes from AOM Manually cherry-picked commits: ceef058 libvpx->libaom part2 3d26d91 libvpx -> libaom cfea7dd vp10/ -> av1/ 3a8eff7 Fix a build issue for a test bf4202e Rename vpx to aom Change-Id: I1b0eb5a40796e3aaf41c58984b4229a439a597dc 2016-08-23 02:08:15 +03:00			`#include "aom_dsp/mips/convolve_common_dspr2.h"`
Port renaming changes from AOMedia Cherry-Picked the following commits: 0defd8f Changed "WebM" to "AOMedia" & "webm" to "aomedia" 54e6676 Replace "VPx" by "AVx" 5082a36 Change "Vpx" to "Avx" 7df44f1 Replace "Vp9" w/ "Av1" 967f722 Remove kVp9CodecId 828f30c Change "Vp8" to "AOM" 030b5ff AUTHORS regenerated 2524cae Add ref-mv experimental flag 016762b Change copyright notice to AOMedia form 81e5526 Replace vp9 w/ av1 9b94565 Add missing files fa8ca9f Change "vp9" to "av1" ec838b7 Convert "vp8" to "aom" 80edfa0 Change "VP9" to "AV1" d1a11fb Change "vp8" to "aom" 7b58251 Point to WebM test data dd1a5c8 Replace "VP8" with "AOM" ff00fc0 Change "VPX" to "AOM" 01dee0b Change "vp10" to "av1" in source code cebe6f0 Convert "vpx" to "aom" 17b0567 rename vp10.mk to av1_.mk fe5f8a8 rename files vp10_* to av1_* Change-Id: I6fc3d18eb11fc171e46140c836ad5339cf6c9419 2016-08-31 00:01:10 +03:00			`#include "aom_dsp/aom_convolve.h"`
			`#include "aom_dsp/aom_dsp_common.h"`
Port folder renaming changes from AOM Manually cherry-picked commits: ceef058 libvpx->libaom part2 3d26d91 libvpx -> libaom cfea7dd vp10/ -> av1/ 3a8eff7 Fix a build issue for a test bf4202e Rename vpx to aom Change-Id: I1b0eb5a40796e3aaf41c58984b4229a439a597dc 2016-08-23 02:08:15 +03:00			`#include "aom_ports/mem.h"`
mips dsp-ase r2 vp9 decoder bilinear convolve optimizations Change-Id: Ic31b4ef85e65070b4f8b9f26e068ccfaae00c4f0 2013-10-09 16:35:27 +04:00
			`#if HAVE_DSPR2`
vpx_dsp/: apply clang-format Change-Id: Ia3f96910409be4ae8a4907a2f0dee73b1af8f93d 2016-08-09 08:59:08 +03:00			`static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride,`
			`uint8_t *dst, int32_t dst_stride,`
			`const int16_t *filter_y, int32_t w,`
mips dsp-ase r2 vp9 decoder bilinear convolve optimizations Change-Id: Ic31b4ef85e65070b4f8b9f26e068ccfaae00c4f0 2013-10-09 16:35:27 +04:00			`int32_t h) {`
vpx_dsp/: apply clang-format Change-Id: Ia3f96910409be4ae8a4907a2f0dee73b1af8f93d 2016-08-09 08:59:08 +03:00			`int32_t x, y;`
mips dsp-ase r2 vp9 decoder bilinear convolve optimizations Change-Id: Ic31b4ef85e65070b4f8b9f26e068ccfaae00c4f0 2013-10-09 16:35:27 +04:00			`const uint8_t *src_ptr;`
vpx_dsp/: apply clang-format Change-Id: Ia3f96910409be4ae8a4907a2f0dee73b1af8f93d 2016-08-09 08:59:08 +03:00			`uint8_t *dst_ptr;`
Port renaming changes from AOMedia Cherry-Picked the following commits: 0defd8f Changed "WebM" to "AOMedia" & "webm" to "aomedia" 54e6676 Replace "VPx" by "AVx" 5082a36 Change "Vpx" to "Avx" 7df44f1 Replace "Vp9" w/ "Av1" 967f722 Remove kVp9CodecId 828f30c Change "Vp8" to "AOM" 030b5ff AUTHORS regenerated 2524cae Add ref-mv experimental flag 016762b Change copyright notice to AOMedia form 81e5526 Replace vp9 w/ av1 9b94565 Add missing files fa8ca9f Change "vp9" to "av1" ec838b7 Convert "vp8" to "aom" 80edfa0 Change "VP9" to "AV1" d1a11fb Change "vp8" to "aom" 7b58251 Point to WebM test data dd1a5c8 Replace "VP8" with "AOM" ff00fc0 Change "VPX" to "AOM" 01dee0b Change "vp10" to "av1" in source code cebe6f0 Convert "vpx" to "aom" 17b0567 rename vp10.mk to av1_.mk fe5f8a8 rename files vp10_* to av1_* Change-Id: I6fc3d18eb11fc171e46140c836ad5339cf6c9419 2016-08-31 00:01:10 +03:00			`uint8_t *cm = aom_ff_cropTbl;`
vpx_dsp/: apply clang-format Change-Id: Ia3f96910409be4ae8a4907a2f0dee73b1af8f93d 2016-08-09 08:59:08 +03:00			`uint32_t vector4a = 64;`
			`uint32_t load1, load2;`
			`uint32_t p1, p2;`
			`uint32_t scratch1, scratch2;`
			`uint32_t store1, store2;`
			`int32_t Temp1, Temp2;`
mips dsp-ase r2 vp9 decoder bilinear convolve optimizations Change-Id: Ic31b4ef85e65070b4f8b9f26e068ccfaae00c4f0 2013-10-09 16:35:27 +04:00			`const int16_t *filter = &filter_y[3];`
vpx_dsp/: apply clang-format Change-Id: Ia3f96910409be4ae8a4907a2f0dee73b1af8f93d 2016-08-09 08:59:08 +03:00			`uint32_t filter45;`
mips dsp-ase r2 vp9 decoder bilinear convolve optimizations Change-Id: Ic31b4ef85e65070b4f8b9f26e068ccfaae00c4f0 2013-10-09 16:35:27 +04:00
			`filter45 = ((const int32_t *)filter)[0];`

			`for (y = h; y--;) {`
			`/* prefetch data to cache memory */`
Rename loop filter function from vp9_ to vpx_ Change-Id: I6f424bb8daec26bf8482b5d75dd9b0e45c11a665 2015-07-17 22:31:53 +03:00			`prefetch_store(dst + dst_stride);`
mips dsp-ase r2 vp9 decoder bilinear convolve optimizations Change-Id: Ic31b4ef85e65070b4f8b9f26e068ccfaae00c4f0 2013-10-09 16:35:27 +04:00
			`for (x = 0; x < w; x += 4) {`
			`src_ptr = src + x;`
			`dst_ptr = dst + x;`

vpx_dsp/: apply clang-format Change-Id: Ia3f96910409be4ae8a4907a2f0dee73b1af8f93d 2016-08-09 08:59:08 +03:00			`__asm__ __volatile__(`
mips dsp-ase r2 vp9 decoder bilinear convolve optimizations Change-Id: Ic31b4ef85e65070b4f8b9f26e068ccfaae00c4f0 2013-10-09 16:35:27 +04:00			`"ulw %[load1], 0(%[src_ptr]) \n\t"`
			`"add %[src_ptr], %[src_ptr], %[src_stride] \n\t"`
			`"ulw %[load2], 0(%[src_ptr]) \n\t"`

			`"mtlo %[vector4a], $ac0 \n\t"`
			`"mtlo %[vector4a], $ac1 \n\t"`
			`"mtlo %[vector4a], $ac2 \n\t"`
			`"mtlo %[vector4a], $ac3 \n\t"`
			`"mthi $zero, $ac0 \n\t"`
			`"mthi $zero, $ac1 \n\t"`
			`"mthi $zero, $ac2 \n\t"`
			`"mthi $zero, $ac3 \n\t"`

			`"preceu.ph.qbr %[scratch1], %[load1] \n\t"`
			`"preceu.ph.qbr %[p1], %[load2] \n\t"`
			`"precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */`
			`"append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */`

			`"dpa.w.ph $ac0, %[p1], %[filter45] \n\t"`
			`"dpa.w.ph $ac1, %[p2], %[filter45] \n\t"`

			`"preceu.ph.qbl %[scratch1], %[load1] \n\t"`
			`"preceu.ph.qbl %[p1], %[load2] \n\t"`
			`"precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */`
			`"append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */`

			`"dpa.w.ph $ac2, %[p1], %[filter45] \n\t"`
			`"dpa.w.ph $ac3, %[p2], %[filter45] \n\t"`

			`"extp %[Temp1], $ac0, 31 \n\t"`
			`"extp %[Temp2], $ac1, 31 \n\t"`

			`"lbu %[scratch1], 0(%[dst_ptr]) \n\t"`
			`"lbu %[scratch2], 1(%[dst_ptr]) \n\t"`

			`"lbux %[store1], %[Temp1](%[cm]) \n\t"`
			`"addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */`
			`"extp %[Temp1], $ac2, 31 \n\t"`

			`"lbux %[store2], %[Temp2](%[cm]) \n\t"`
			`"addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */`
			`"extp %[Temp2], $ac3, 31 \n\t"`
			`"lbu %[scratch1], 2(%[dst_ptr]) \n\t"`

			`"sb %[store1], 0(%[dst_ptr]) \n\t"`
			`"sb %[store2], 1(%[dst_ptr]) \n\t"`
			`"lbu %[scratch2], 3(%[dst_ptr]) \n\t"`

			`"lbux %[store1], %[Temp1](%[cm]) \n\t"`
			`"lbux %[store2], %[Temp2](%[cm]) \n\t"`
			`"addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */`
			`"addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */`

			`"sb %[store1], 2(%[dst_ptr]) \n\t"`
			`"sb %[store2], 3(%[dst_ptr]) \n\t"`

vpx_dsp/: apply clang-format Change-Id: Ia3f96910409be4ae8a4907a2f0dee73b1af8f93d 2016-08-09 08:59:08 +03:00			`: [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),`
			`[p2] "=&r"(p2), [scratch1] "=&r"(scratch1),`
			`[scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),`
			`[Temp2] "=&r"(Temp2), [store1] "=&r"(store1),`
			`[store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)`
			`: [filter45] "r"(filter45), [vector4a] "r"(vector4a),`
			`[src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));`
mips dsp-ase r2 vp9 decoder bilinear convolve optimizations Change-Id: Ic31b4ef85e65070b4f8b9f26e068ccfaae00c4f0 2013-10-09 16:35:27 +04:00			`}`

			`/* Next row... */`
			`src += src_stride;`
			`dst += dst_stride;`
			`}`
			`}`

			`static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src,`
vpx_dsp/: apply clang-format Change-Id: Ia3f96910409be4ae8a4907a2f0dee73b1af8f93d 2016-08-09 08:59:08 +03:00			`int32_t src_stride, uint8_t *dst,`
mips dsp-ase r2 vp9 decoder bilinear convolve optimizations Change-Id: Ic31b4ef85e65070b4f8b9f26e068ccfaae00c4f0 2013-10-09 16:35:27 +04:00			`int32_t dst_stride,`
vpx_dsp/: apply clang-format Change-Id: Ia3f96910409be4ae8a4907a2f0dee73b1af8f93d 2016-08-09 08:59:08 +03:00			`const int16_t *filter_y, int32_t h) {`
			`int32_t x, y;`
mips dsp-ase r2 vp9 decoder bilinear convolve optimizations Change-Id: Ic31b4ef85e65070b4f8b9f26e068ccfaae00c4f0 2013-10-09 16:35:27 +04:00			`const uint8_t *src_ptr;`
vpx_dsp/: apply clang-format Change-Id: Ia3f96910409be4ae8a4907a2f0dee73b1af8f93d 2016-08-09 08:59:08 +03:00			`uint8_t *dst_ptr;`
Port renaming changes from AOMedia Cherry-Picked the following commits: 0defd8f Changed "WebM" to "AOMedia" & "webm" to "aomedia" 54e6676 Replace "VPx" by "AVx" 5082a36 Change "Vpx" to "Avx" 7df44f1 Replace "Vp9" w/ "Av1" 967f722 Remove kVp9CodecId 828f30c Change "Vp8" to "AOM" 030b5ff AUTHORS regenerated 2524cae Add ref-mv experimental flag 016762b Change copyright notice to AOMedia form 81e5526 Replace vp9 w/ av1 9b94565 Add missing files fa8ca9f Change "vp9" to "av1" ec838b7 Convert "vp8" to "aom" 80edfa0 Change "VP9" to "AV1" d1a11fb Change "vp8" to "aom" 7b58251 Point to WebM test data dd1a5c8 Replace "VP8" with "AOM" ff00fc0 Change "VPX" to "AOM" 01dee0b Change "vp10" to "av1" in source code cebe6f0 Convert "vpx" to "aom" 17b0567 rename vp10.mk to av1_.mk fe5f8a8 rename files vp10_* to av1_* Change-Id: I6fc3d18eb11fc171e46140c836ad5339cf6c9419 2016-08-31 00:01:10 +03:00			`uint8_t *cm = aom_ff_cropTbl;`
vpx_dsp/: apply clang-format Change-Id: Ia3f96910409be4ae8a4907a2f0dee73b1af8f93d 2016-08-09 08:59:08 +03:00			`uint32_t vector4a = 64;`
			`uint32_t load1, load2;`
			`uint32_t p1, p2;`
			`uint32_t scratch1, scratch2;`
			`uint32_t store1, store2;`
			`int32_t Temp1, Temp2;`
mips dsp-ase r2 vp9 decoder bilinear convolve optimizations Change-Id: Ic31b4ef85e65070b4f8b9f26e068ccfaae00c4f0 2013-10-09 16:35:27 +04:00			`const int16_t *filter = &filter_y[3];`
vpx_dsp/: apply clang-format Change-Id: Ia3f96910409be4ae8a4907a2f0dee73b1af8f93d 2016-08-09 08:59:08 +03:00			`uint32_t filter45;`
mips dsp-ase r2 vp9 decoder bilinear convolve optimizations Change-Id: Ic31b4ef85e65070b4f8b9f26e068ccfaae00c4f0 2013-10-09 16:35:27 +04:00
			`filter45 = ((const int32_t *)filter)[0];`

			`for (y = h; y--;) {`
			`/* prefetch data to cache memory */`
Rename loop filter function from vp9_ to vpx_ Change-Id: I6f424bb8daec26bf8482b5d75dd9b0e45c11a665 2015-07-17 22:31:53 +03:00			`prefetch_store(dst + dst_stride);`
			`prefetch_store(dst + dst_stride + 32);`
mips dsp-ase r2 vp9 decoder bilinear convolve optimizations Change-Id: Ic31b4ef85e65070b4f8b9f26e068ccfaae00c4f0 2013-10-09 16:35:27 +04:00
			`for (x = 0; x < 64; x += 4) {`
			`src_ptr = src + x;`
			`dst_ptr = dst + x;`

vpx_dsp/: apply clang-format Change-Id: Ia3f96910409be4ae8a4907a2f0dee73b1af8f93d 2016-08-09 08:59:08 +03:00			`__asm__ __volatile__(`
mips dsp-ase r2 vp9 decoder bilinear convolve optimizations Change-Id: Ic31b4ef85e65070b4f8b9f26e068ccfaae00c4f0 2013-10-09 16:35:27 +04:00			`"ulw %[load1], 0(%[src_ptr]) \n\t"`
			`"add %[src_ptr], %[src_ptr], %[src_stride] \n\t"`
			`"ulw %[load2], 0(%[src_ptr]) \n\t"`

			`"mtlo %[vector4a], $ac0 \n\t"`
			`"mtlo %[vector4a], $ac1 \n\t"`
			`"mtlo %[vector4a], $ac2 \n\t"`
			`"mtlo %[vector4a], $ac3 \n\t"`
			`"mthi $zero, $ac0 \n\t"`
			`"mthi $zero, $ac1 \n\t"`
			`"mthi $zero, $ac2 \n\t"`
			`"mthi $zero, $ac3 \n\t"`

			`"preceu.ph.qbr %[scratch1], %[load1] \n\t"`
			`"preceu.ph.qbr %[p1], %[load2] \n\t"`
			`"precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */`
			`"append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */`

			`"dpa.w.ph $ac0, %[p1], %[filter45] \n\t"`
			`"dpa.w.ph $ac1, %[p2], %[filter45] \n\t"`

			`"preceu.ph.qbl %[scratch1], %[load1] \n\t"`
			`"preceu.ph.qbl %[p1], %[load2] \n\t"`
			`"precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */`
			`"append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */`

			`"dpa.w.ph $ac2, %[p1], %[filter45] \n\t"`
			`"dpa.w.ph $ac3, %[p2], %[filter45] \n\t"`

			`"extp %[Temp1], $ac0, 31 \n\t"`
			`"extp %[Temp2], $ac1, 31 \n\t"`

			`"lbu %[scratch1], 0(%[dst_ptr]) \n\t"`
			`"lbu %[scratch2], 1(%[dst_ptr]) \n\t"`

			`"lbux %[store1], %[Temp1](%[cm]) \n\t"`
			`"addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */`
			`"extp %[Temp1], $ac2, 31 \n\t"`

			`"lbux %[store2], %[Temp2](%[cm]) \n\t"`
			`"addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */`
			`"extp %[Temp2], $ac3, 31 \n\t"`
			`"lbu %[scratch1], 2(%[dst_ptr]) \n\t"`

			`"sb %[store1], 0(%[dst_ptr]) \n\t"`
			`"sb %[store2], 1(%[dst_ptr]) \n\t"`
			`"lbu %[scratch2], 3(%[dst_ptr]) \n\t"`

			`"lbux %[store1], %[Temp1](%[cm]) \n\t"`
			`"lbux %[store2], %[Temp2](%[cm]) \n\t"`
			`"addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */`
			`"addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */`

			`"sb %[store1], 2(%[dst_ptr]) \n\t"`
			`"sb %[store2], 3(%[dst_ptr]) \n\t"`

vpx_dsp/: apply clang-format Change-Id: Ia3f96910409be4ae8a4907a2f0dee73b1af8f93d 2016-08-09 08:59:08 +03:00			`: [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),`
			`[p2] "=&r"(p2), [scratch1] "=&r"(scratch1),`
			`[scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),`
			`[Temp2] "=&r"(Temp2), [store1] "=&r"(store1),`
			`[store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)`
			`: [filter45] "r"(filter45), [vector4a] "r"(vector4a),`
			`[src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));`
mips dsp-ase r2 vp9 decoder bilinear convolve optimizations Change-Id: Ic31b4ef85e65070b4f8b9f26e068ccfaae00c4f0 2013-10-09 16:35:27 +04:00			`}`

			`/* Next row... */`
			`src += src_stride;`
			`dst += dst_stride;`
			`}`
			`}`

Port renaming changes from AOMedia Cherry-Picked the following commits: 0defd8f Changed "WebM" to "AOMedia" & "webm" to "aomedia" 54e6676 Replace "VPx" by "AVx" 5082a36 Change "Vpx" to "Avx" 7df44f1 Replace "Vp9" w/ "Av1" 967f722 Remove kVp9CodecId 828f30c Change "Vp8" to "AOM" 030b5ff AUTHORS regenerated 2524cae Add ref-mv experimental flag 016762b Change copyright notice to AOMedia form 81e5526 Replace vp9 w/ av1 9b94565 Add missing files fa8ca9f Change "vp9" to "av1" ec838b7 Convert "vp8" to "aom" 80edfa0 Change "VP9" to "AV1" d1a11fb Change "vp8" to "aom" 7b58251 Point to WebM test data dd1a5c8 Replace "VP8" with "AOM" ff00fc0 Change "VPX" to "AOM" 01dee0b Change "vp10" to "av1" in source code cebe6f0 Convert "vpx" to "aom" 17b0567 rename vp10.mk to av1_.mk fe5f8a8 rename files vp10_* to av1_* Change-Id: I6fc3d18eb11fc171e46140c836ad5339cf6c9419 2016-08-31 00:01:10 +03:00			`void aom_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,`
mips dsp-ase r2 vp9 decoder bilinear convolve optimizations Change-Id: Ic31b4ef85e65070b4f8b9f26e068ccfaae00c4f0 2013-10-09 16:35:27 +04:00			`uint8_t *dst, ptrdiff_t dst_stride,`
			`const int16_t *filter_x, int x_step_q4,`
vpx_dsp/: apply clang-format Change-Id: Ia3f96910409be4ae8a4907a2f0dee73b1af8f93d 2016-08-09 08:59:08 +03:00			`const int16_t *filter_y, int y_step_q4, int w,`
			`int h) {`
VPX: removed step checks from mips convolve code The check is handled by the predictor table. Change-Id: I5e5084ebb46be8087c8c9d80b5f76e919a1cd05b 2015-08-13 21:27:04 +03:00			`uint32_t pos = 38;`

			`assert(y_step_q4 == 16);`

			`/* bit positon for extract from acc */`
vpx_dsp/: apply clang-format Change-Id: Ia3f96910409be4ae8a4907a2f0dee73b1af8f93d 2016-08-09 08:59:08 +03:00			`__asm__ __volatile__("wrdsp %[pos], 1 \n\t"`
			`:`
			`: [pos] "r"(pos));`
VPX: removed step checks from mips convolve code The check is handled by the predictor table. Change-Id: I5e5084ebb46be8087c8c9d80b5f76e919a1cd05b 2015-08-13 21:27:04 +03:00
			`prefetch_store(dst);`

			`switch (w) {`
			`case 4:`
			`case 8:`
			`case 16:`
			`case 32:`
vpx_dsp/: apply clang-format Change-Id: Ia3f96910409be4ae8a4907a2f0dee73b1af8f93d 2016-08-09 08:59:08 +03:00			`convolve_bi_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y,`
			`w, h);`
VPX: removed step checks from mips convolve code The check is handled by the predictor table. Change-Id: I5e5084ebb46be8087c8c9d80b5f76e919a1cd05b 2015-08-13 21:27:04 +03:00			`break;`
			`case 64:`
			`prefetch_store(dst + 32);`
vpx_dsp/: apply clang-format Change-Id: Ia3f96910409be4ae8a4907a2f0dee73b1af8f93d 2016-08-09 08:59:08 +03:00			`convolve_bi_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y,`
			`h);`
VPX: removed step checks from mips convolve code The check is handled by the predictor table. Change-Id: I5e5084ebb46be8087c8c9d80b5f76e919a1cd05b 2015-08-13 21:27:04 +03:00			`break;`
			`default:`
Port renaming changes from AOMedia Cherry-Picked the following commits: 0defd8f Changed "WebM" to "AOMedia" & "webm" to "aomedia" 54e6676 Replace "VPx" by "AVx" 5082a36 Change "Vpx" to "Avx" 7df44f1 Replace "Vp9" w/ "Av1" 967f722 Remove kVp9CodecId 828f30c Change "Vp8" to "AOM" 030b5ff AUTHORS regenerated 2524cae Add ref-mv experimental flag 016762b Change copyright notice to AOMedia form 81e5526 Replace vp9 w/ av1 9b94565 Add missing files fa8ca9f Change "vp9" to "av1" ec838b7 Convert "vp8" to "aom" 80edfa0 Change "VP9" to "AV1" d1a11fb Change "vp8" to "aom" 7b58251 Point to WebM test data dd1a5c8 Replace "VP8" with "AOM" ff00fc0 Change "VPX" to "AOM" 01dee0b Change "vp10" to "av1" in source code cebe6f0 Convert "vpx" to "aom" 17b0567 rename vp10.mk to av1_.mk fe5f8a8 rename files vp10_* to av1_* Change-Id: I6fc3d18eb11fc171e46140c836ad5339cf6c9419 2016-08-31 00:01:10 +03:00			`aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,`
vpx_dsp/: apply clang-format Change-Id: Ia3f96910409be4ae8a4907a2f0dee73b1af8f93d 2016-08-09 08:59:08 +03:00			`x_step_q4, filter_y, y_step_q4, w, h);`
VPX: removed step checks from mips convolve code The check is handled by the predictor table. Change-Id: I5e5084ebb46be8087c8c9d80b5f76e919a1cd05b 2015-08-13 21:27:04 +03:00			`break;`
mips dsp-ase r2 vp9 decoder bilinear convolve optimizations Change-Id: Ic31b4ef85e65070b4f8b9f26e068ccfaae00c4f0 2013-10-09 16:35:27 +04:00			`}`
			`}`
			`#endif`