Remove unused assembly sources and associated tests.

Change-Id: Ic8386743b1852ca1074528d04e2adc1d191b091b
2017-01-31 18:39:45 -08:00 · 2017-01-31 18:39:45 -08:00 · 0d3aeda300
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -374,8 +374,6 @@ set(AOM_UNIT_TEST_SOURCES
    #"${AOM_ROOT}/test/accounting_test.cc"
    "${AOM_ROOT}/test/acm_random.h"
    "${AOM_ROOT}/test/active_map_test.cc"
-    # not in test.mk
-    #"${AOM_ROOT}/test/add_noise_test.cc"
    "${AOM_ROOT}/test/altref_test.cc"
    "${AOM_ROOT}/test/android"
    # requires CONFIG_ANS
--- a/aom_dsp/deblock.c
+++ b/aom_dsp/deblock.c
@ -1,195 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- *
- */
-
-#include <stdlib.h>
-#include "aom/aom_integer.h"
-
-const int16_t aom_rv[] = {
-  8,  5,  2,  2,  8,  12, 4,  9,  8,  3,  0,  3,  9,  0,  0,  0,  8,  3,  14,
-  4,  10, 1,  11, 14, 1,  14, 9,  6,  12, 11, 8,  6,  10, 0,  0,  8,  9,  0,
-  3,  14, 8,  11, 13, 4,  2,  9,  0,  3,  9,  6,  1,  2,  3,  14, 13, 1,  8,
-  2,  9,  7,  3,  3,  1,  13, 13, 6,  6,  5,  2,  7,  11, 9,  11, 8,  7,  3,
-  2,  0,  13, 13, 14, 4,  12, 5,  12, 10, 8,  10, 13, 10, 4,  14, 4,  10, 0,
-  8,  11, 1,  13, 7,  7,  14, 6,  14, 13, 2,  13, 5,  4,  4,  0,  10, 0,  5,
-  13, 2,  12, 7,  11, 13, 8,  0,  4,  10, 7,  2,  7,  2,  2,  5,  3,  4,  7,
-  3,  3,  14, 14, 5,  9,  13, 3,  14, 3,  6,  3,  0,  11, 8,  13, 1,  13, 1,
-  12, 0,  10, 9,  7,  6,  2,  8,  5,  2,  13, 7,  1,  13, 14, 7,  6,  7,  9,
-  6,  10, 11, 7,  8,  7,  5,  14, 8,  4,  4,  0,  8,  7,  10, 0,  8,  14, 11,
-  3,  12, 5,  7,  14, 3,  14, 5,  2,  6,  11, 12, 12, 8,  0,  11, 13, 1,  2,
-  0,  5,  10, 14, 7,  8,  0,  4,  11, 0,  8,  0,  3,  10, 5,  8,  0,  11, 6,
-  7,  8,  10, 7,  13, 9,  2,  5,  1,  5,  10, 2,  4,  3,  5,  6,  10, 8,  9,
-  4,  11, 14, 0,  10, 0,  5,  13, 2,  12, 7,  11, 13, 8,  0,  4,  10, 7,  2,
-  7,  2,  2,  5,  3,  4,  7,  3,  3,  14, 14, 5,  9,  13, 3,  14, 3,  6,  3,
-  0,  11, 8,  13, 1,  13, 1,  12, 0,  10, 9,  7,  6,  2,  8,  5,  2,  13, 7,
-  1,  13, 14, 7,  6,  7,  9,  6,  10, 11, 7,  8,  7,  5,  14, 8,  4,  4,  0,
-  8,  7,  10, 0,  8,  14, 11, 3,  12, 5,  7,  14, 3,  14, 5,  2,  6,  11, 12,
-  12, 8,  0,  11, 13, 1,  2,  0,  5,  10, 14, 7,  8,  0,  4,  11, 0,  8,  0,
-  3,  10, 5,  8,  0,  11, 6,  7,  8,  10, 7,  13, 9,  2,  5,  1,  5,  10, 2,
-  4,  3,  5,  6,  10, 8,  9,  4,  11, 14, 3,  8,  3,  7,  8,  5,  11, 4,  12,
-  3,  11, 9,  14, 8,  14, 13, 4,  3,  1,  2,  14, 6,  5,  4,  4,  11, 4,  6,
-  2,  1,  5,  8,  8,  12, 13, 5,  14, 10, 12, 13, 0,  9,  5,  5,  11, 10, 13,
-  9,  10, 13,
-};
-
-void aom_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
-                                            unsigned char *dst_ptr,
-                                            int src_pixels_per_line,
-                                            int dst_pixels_per_line, int cols,
-                                            unsigned char *f, int size) {
-  unsigned char *p_src, *p_dst;
-  int row;
-  int col;
-  unsigned char v;
-  unsigned char d[4];
-
-  for (row = 0; row < size; row++) {
-    /* post_proc_down for one row */
-    p_src = src_ptr;
-    p_dst = dst_ptr;
-
-    for (col = 0; col < cols; col++) {
-      unsigned char p_above2 = p_src[col - 2 * src_pixels_per_line];
-      unsigned char p_above1 = p_src[col - src_pixels_per_line];
-      unsigned char p_below1 = p_src[col + src_pixels_per_line];
-      unsigned char p_below2 = p_src[col + 2 * src_pixels_per_line];
-
-      v = p_src[col];
-
-      if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col]) &&
-          (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col])) {
-        unsigned char k1, k2, k3;
-        k1 = (p_above2 + p_above1 + 1) >> 1;
-        k2 = (p_below2 + p_below1 + 1) >> 1;
-        k3 = (k1 + k2 + 1) >> 1;
-        v = (k3 + v + 1) >> 1;
-      }
-
-      p_dst[col] = v;
-    }
-
-    /* now post_proc_across */
-    p_src = dst_ptr;
-    p_dst = dst_ptr;
-
-    p_src[-2] = p_src[-1] = p_src[0];
-    p_src[cols] = p_src[cols + 1] = p_src[cols - 1];
-
-    for (col = 0; col < cols; col++) {
-      v = p_src[col];
-
-      if ((abs(v - p_src[col - 2]) < f[col]) &&
-          (abs(v - p_src[col - 1]) < f[col]) &&
-          (abs(v - p_src[col + 1]) < f[col]) &&
-          (abs(v - p_src[col + 2]) < f[col])) {
-        unsigned char k1, k2, k3;
-        k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1;
-        k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1;
-        k3 = (k1 + k2 + 1) >> 1;
-        v = (k3 + v + 1) >> 1;
-      }
-
-      d[col & 3] = v;
-
-      if (col >= 2) p_dst[col - 2] = d[(col - 2) & 3];
-    }
-
-    /* handle the last two pixels */
-    p_dst[col - 2] = d[(col - 2) & 3];
-    p_dst[col - 1] = d[(col - 1) & 3];
-
-    /* next row */
-    src_ptr += src_pixels_per_line;
-    dst_ptr += dst_pixels_per_line;
-  }
-}
-
-void aom_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows,
-                                 int cols, int flimit) {
-  int r, c, i;
-
-  unsigned char *s = src;
-  unsigned char d[16];
-
-  for (r = 0; r < rows; r++) {
-    int sumsq = 0;
-    int sum = 0;
-
-    for (i = -8; i < 0; i++) s[i] = s[0];
-
-    /* 17 avoids valgrind warning - we buffer values in c in d
-     * and only write them when we've read 8 ahead...
-     */
-    for (i = 0; i < 17; i++) s[i + cols] = s[cols - 1];
-
-    for (i = -8; i <= 6; i++) {
-      sumsq += s[i] * s[i];
-      sum += s[i];
-      d[i + 8] = 0;
-    }
-
-    for (c = 0; c < cols + 8; c++) {
-      int x = s[c + 7] - s[c - 8];
-      int y = s[c + 7] + s[c - 8];
-
-      sum += x;
-      sumsq += x * y;
-
-      d[c & 15] = s[c];
-
-      if (sumsq * 15 - sum * sum < flimit) {
-        d[c & 15] = (8 + sum + s[c]) >> 4;
-      }
-
-      s[c - 8] = d[(c - 8) & 15];
-    }
-
-    s += pitch;
-  }
-}
-
-void aom_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,
-                            int flimit) {
-  int r, c, i;
-  const int16_t *rv3 = &aom_rv[63 & rand()];
-
-  for (c = 0; c < cols; c++) {
-    unsigned char *s = &dst[c];
-    int sumsq = 0;
-    int sum = 0;
-    unsigned char d[16];
-    const int16_t *rv2 = rv3 + ((c * 17) & 127);
-
-    for (i = -8; i < 0; i++) s[i * pitch] = s[0];
-
-    /* 17 avoids valgrind warning - we buffer values in c in d
-     * and only write them when we've read 8 ahead...
-     */
-    for (i = 0; i < 17; i++) s[(i + rows) * pitch] = s[(rows - 1) * pitch];
-
-    for (i = -8; i <= 6; i++) {
-      sumsq += s[i * pitch] * s[i * pitch];
-      sum += s[i * pitch];
-    }
-
-    for (r = 0; r < rows + 8; r++) {
-      sumsq += s[7 * pitch] * s[7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
-      sum += s[7 * pitch] - s[-8 * pitch];
-      d[r & 15] = s[0];
-
-      if (sumsq * 15 - sum * sum < flimit) {
-        d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
-      }
-      if (r >= 8) s[-8 * pitch] = d[(r - 8) & 15];
-      s += pitch;
-    }
-  }
-}
--- a/aom_dsp/mips/deblock_msa.c
+++ b/aom_dsp/mips/deblock_msa.c
@ -1,682 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-#include "./macros_msa.h"
-
-extern const int16_t aom_rv[];
-
-#define AOM_TRANSPOSE8x16_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, out0,  \
-                                out1, out2, out3, out4, out5, out6, out7,      \
-                                out8, out9, out10, out11, out12, out13, out14, \
-                                out15)                                         \
-  {                                                                            \
-    v8i16 temp0, temp1, temp2, temp3, temp4;                                   \
-    v8i16 temp5, temp6, temp7, temp8, temp9;                                   \
-                                                                               \
-    ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2,    \
-               temp3);                                                         \
-    ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                      \
-    ILVRL_W2_SH(temp5, temp4, temp6, temp7);                                   \
-    ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                      \
-    ILVRL_W2_SH(temp5, temp4, temp8, temp9);                                   \
-    ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2,    \
-               temp3);                                                         \
-    ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                      \
-    ILVRL_W2_UB(temp5, temp4, out8, out10);                                    \
-    ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5);                      \
-    ILVRL_W2_UB(temp5, temp4, out12, out14);                                   \
-    out0 = (v16u8)temp6;                                                       \
-    out2 = (v16u8)temp7;                                                       \
-    out4 = (v16u8)temp8;                                                       \
-    out6 = (v16u8)temp9;                                                       \
-    out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8);                      \
-    out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10);                   \
-    out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12);                   \
-    out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14);                   \
-    out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0);                      \
-    out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2);                      \
-    out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4);                      \
-    out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6);                      \
-  }
-
-#define AOM_AVER_IF_RETAIN(above2_in, above1_in, src_in, below1_in, below2_in, \
-                           ref, out)                                           \
-  {                                                                            \
-    v16u8 temp0, temp1;                                                        \
-                                                                               \
-    temp1 = __msa_aver_u_b(above2_in, above1_in);                              \
-    temp0 = __msa_aver_u_b(below2_in, below1_in);                              \
-    temp1 = __msa_aver_u_b(temp1, temp0);                                      \
-    out = __msa_aver_u_b(src_in, temp1);                                       \
-    temp0 = __msa_asub_u_b(src_in, above2_in);                                 \
-    temp1 = __msa_asub_u_b(src_in, above1_in);                                 \
-    temp0 = (temp0 < ref);                                                     \
-    temp1 = (temp1 < ref);                                                     \
-    temp0 = temp0 & temp1;                                                     \
-    temp1 = __msa_asub_u_b(src_in, below1_in);                                 \
-    temp1 = (temp1 < ref);                                                     \
-    temp0 = temp0 & temp1;                                                     \
-    temp1 = __msa_asub_u_b(src_in, below2_in);                                 \
-    temp1 = (temp1 < ref);                                                     \
-    temp0 = temp0 & temp1;                                                     \
-    out = __msa_bmz_v(out, src_in, temp0);                                     \
-  }
-
-#define TRANSPOSE12x16_B(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9,    \
-                         in10, in11, in12, in13, in14, in15)                  \
-  {                                                                           \
-    v8i16 temp0, temp1, temp2, temp3, temp4;                                  \
-    v8i16 temp5, temp6, temp7, temp8, temp9;                                  \
-                                                                              \
-    ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1);                             \
-    ILVRL_H2_SH(temp1, temp0, temp2, temp3);                                  \
-    ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1);                             \
-    ILVRL_H2_SH(temp1, temp0, temp4, temp5);                                  \
-    ILVRL_W2_SH(temp4, temp2, temp0, temp1);                                  \
-    ILVRL_W2_SH(temp5, temp3, temp2, temp3);                                  \
-    ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5);                           \
-    ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5);                           \
-    ILVRL_H2_SH(temp5, temp4, temp6, temp7);                                  \
-    ILVR_B2_SH(in13, in12, in15, in14, temp4, temp5);                         \
-    ILVRL_H2_SH(temp5, temp4, temp8, temp9);                                  \
-    ILVRL_W2_SH(temp8, temp6, temp4, temp5);                                  \
-    ILVRL_W2_SH(temp9, temp7, temp6, temp7);                                  \
-    ILVL_B2_SH(in1, in0, in3, in2, temp8, temp9);                             \
-    ILVR_D2_UB(temp4, temp0, temp5, temp1, in0, in2);                         \
-    in1 = (v16u8)__msa_ilvl_d((v2i64)temp4, (v2i64)temp0);                    \
-    in3 = (v16u8)__msa_ilvl_d((v2i64)temp5, (v2i64)temp1);                    \
-    ILVL_B2_SH(in5, in4, in7, in6, temp0, temp1);                             \
-    ILVR_D2_UB(temp6, temp2, temp7, temp3, in4, in6);                         \
-    in5 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp2);                    \
-    in7 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp3);                    \
-    ILVL_B4_SH(in9, in8, in11, in10, in13, in12, in15, in14, temp2, temp3,    \
-               temp4, temp5);                                                 \
-    ILVR_H4_SH(temp9, temp8, temp1, temp0, temp3, temp2, temp5, temp4, temp6, \
-               temp7, temp8, temp9);                                          \
-    ILVR_W2_SH(temp7, temp6, temp9, temp8, temp0, temp1);                     \
-    in8 = (v16u8)__msa_ilvr_d((v2i64)temp1, (v2i64)temp0);                    \
-    in9 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp0);                    \
-    ILVL_W2_SH(temp7, temp6, temp9, temp8, temp2, temp3);                     \
-    in10 = (v16u8)__msa_ilvr_d((v2i64)temp3, (v2i64)temp2);                   \
-    in11 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp2);                   \
-  }
-
-#define AOM_TRANSPOSE12x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, \
-                                in9, in10, in11)                             \
-  {                                                                          \
-    v8i16 temp0, temp1, temp2, temp3;                                        \
-    v8i16 temp4, temp5, temp6, temp7;                                        \
-                                                                             \
-    ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1);                            \
-    ILVRL_H2_SH(temp1, temp0, temp2, temp3);                                 \
-    ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1);                            \
-    ILVRL_H2_SH(temp1, temp0, temp4, temp5);                                 \
-    ILVRL_W2_SH(temp4, temp2, temp0, temp1);                                 \
-    ILVRL_W2_SH(temp5, temp3, temp2, temp3);                                 \
-    ILVL_B2_SH(in1, in0, in3, in2, temp4, temp5);                            \
-    temp4 = __msa_ilvr_h(temp5, temp4);                                      \
-    ILVL_B2_SH(in5, in4, in7, in6, temp6, temp7);                            \
-    temp5 = __msa_ilvr_h(temp7, temp6);                                      \
-    ILVRL_W2_SH(temp5, temp4, temp6, temp7);                                 \
-    in0 = (v16u8)temp0;                                                      \
-    in2 = (v16u8)temp1;                                                      \
-    in4 = (v16u8)temp2;                                                      \
-    in6 = (v16u8)temp3;                                                      \
-    in8 = (v16u8)temp6;                                                      \
-    in10 = (v16u8)temp7;                                                     \
-    in1 = (v16u8)__msa_ilvl_d((v2i64)temp0, (v2i64)temp0);                   \
-    in3 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp1);                   \
-    in5 = (v16u8)__msa_ilvl_d((v2i64)temp2, (v2i64)temp2);                   \
-    in7 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp3);                   \
-    in9 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp6);                   \
-    in11 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp7);                  \
-  }
-
-static void postproc_down_across_chroma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
-                                            int32_t src_stride,
-                                            int32_t dst_stride, int32_t cols,
-                                            uint8_t *f) {
-  uint8_t *p_src = src_ptr;
-  uint8_t *p_dst = dst_ptr;
-  uint8_t *f_orig = f;
-  uint8_t *p_dst_st = dst_ptr;
-  uint16_t col;
-  uint64_t out0, out1, out2, out3;
-  v16u8 above2, above1, below2, below1, src, ref, ref_temp;
-  v16u8 inter0, inter1, inter2, inter3, inter4, inter5;
-  v16u8 inter6, inter7, inter8, inter9, inter10, inter11;
-
-  for (col = (cols / 16); col--;) {
-    ref = LD_UB(f);
-    LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
-    src = LD_UB(p_src);
-    LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
-    AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
-    above2 = LD_UB(p_src + 3 * src_stride);
-    AOM_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
-    above1 = LD_UB(p_src + 4 * src_stride);
-    AOM_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
-    src = LD_UB(p_src + 5 * src_stride);
-    AOM_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
-    below1 = LD_UB(p_src + 6 * src_stride);
-    AOM_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
-    below2 = LD_UB(p_src + 7 * src_stride);
-    AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
-    above2 = LD_UB(p_src + 8 * src_stride);
-    AOM_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
-    above1 = LD_UB(p_src + 9 * src_stride);
-    AOM_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
-    ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
-           p_dst, dst_stride);
-
-    p_dst += 16;
-    p_src += 16;
-    f += 16;
-  }
-
-  if (0 != (cols / 16)) {
-    ref = LD_UB(f);
-    LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
-    src = LD_UB(p_src);
-    LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
-    AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
-    above2 = LD_UB(p_src + 3 * src_stride);
-    AOM_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
-    above1 = LD_UB(p_src + 4 * src_stride);
-    AOM_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
-    src = LD_UB(p_src + 5 * src_stride);
-    AOM_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
-    below1 = LD_UB(p_src + 6 * src_stride);
-    AOM_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
-    below2 = LD_UB(p_src + 7 * src_stride);
-    AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
-    above2 = LD_UB(p_src + 8 * src_stride);
-    AOM_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
-    above1 = LD_UB(p_src + 9 * src_stride);
-    AOM_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
-    out0 = __msa_copy_u_d((v2i64)inter0, 0);
-    out1 = __msa_copy_u_d((v2i64)inter1, 0);
-    out2 = __msa_copy_u_d((v2i64)inter2, 0);
-    out3 = __msa_copy_u_d((v2i64)inter3, 0);
-    SD4(out0, out1, out2, out3, p_dst, dst_stride);
-
-    out0 = __msa_copy_u_d((v2i64)inter4, 0);
-    out1 = __msa_copy_u_d((v2i64)inter5, 0);
-    out2 = __msa_copy_u_d((v2i64)inter6, 0);
-    out3 = __msa_copy_u_d((v2i64)inter7, 0);
-    SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride);
-  }
-
-  f = f_orig;
-  p_dst = dst_ptr - 2;
-  LD_UB8(p_dst, dst_stride, inter0, inter1, inter2, inter3, inter4, inter5,
-         inter6, inter7);
-
-  for (col = 0; col < (cols / 8); ++col) {
-    ref = LD_UB(f);
-    f += 8;
-    AOM_TRANSPOSE12x8_UB_UB(inter0, inter1, inter2, inter3, inter4, inter5,
-                            inter6, inter7, inter8, inter9, inter10, inter11);
-    if (0 == col) {
-      above2 = inter2;
-      above1 = inter2;
-    } else {
-      above2 = inter0;
-      above1 = inter1;
-    }
-    src = inter2;
-    below1 = inter3;
-    below2 = inter4;
-    ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0);
-    AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2);
-    above2 = inter5;
-    ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1);
-    AOM_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3);
-    above1 = inter6;
-    ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2);
-    AOM_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4);
-    src = inter7;
-    ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3);
-    AOM_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5);
-    below1 = inter8;
-    ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4);
-    AOM_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6);
-    below2 = inter9;
-    ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5);
-    AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7);
-    if (col == (cols / 8 - 1)) {
-      above2 = inter9;
-    } else {
-      above2 = inter10;
-    }
-    ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6);
-    AOM_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8);
-    if (col == (cols / 8 - 1)) {
-      above1 = inter9;
-    } else {
-      above1 = inter11;
-    }
-    ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7);
-    AOM_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9);
-    TRANSPOSE8x8_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7, inter8,
-                       inter9, inter2, inter3, inter4, inter5, inter6, inter7,
-                       inter8, inter9);
-    p_dst += 8;
-    LD_UB2(p_dst, dst_stride, inter0, inter1);
-    ST8x1_UB(inter2, p_dst_st);
-    ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
-    LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
-    ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
-    ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
-    LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
-    ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
-    ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
-    LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
-    ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
-    ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
-    p_dst_st += 8;
-  }
-}
-
-static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
-                                          int32_t src_stride,
-                                          int32_t dst_stride, int32_t cols,
-                                          uint8_t *f) {
-  uint8_t *p_src = src_ptr;
-  uint8_t *p_dst = dst_ptr;
-  uint8_t *p_dst_st = dst_ptr;
-  uint8_t *f_orig = f;
-  uint16_t col;
-  v16u8 above2, above1, below2, below1;
-  v16u8 src, ref, ref_temp;
-  v16u8 inter0, inter1, inter2, inter3, inter4, inter5, inter6;
-  v16u8 inter7, inter8, inter9, inter10, inter11;
-  v16u8 inter12, inter13, inter14, inter15;
-
-  for (col = (cols / 16); col--;) {
-    ref = LD_UB(f);
-    LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
-    src = LD_UB(p_src);
-    LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
-    AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
-    above2 = LD_UB(p_src + 3 * src_stride);
-    AOM_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
-    above1 = LD_UB(p_src + 4 * src_stride);
-    AOM_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
-    src = LD_UB(p_src + 5 * src_stride);
-    AOM_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
-    below1 = LD_UB(p_src + 6 * src_stride);
-    AOM_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
-    below2 = LD_UB(p_src + 7 * src_stride);
-    AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
-    above2 = LD_UB(p_src + 8 * src_stride);
-    AOM_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
-    above1 = LD_UB(p_src + 9 * src_stride);
-    AOM_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
-    src = LD_UB(p_src + 10 * src_stride);
-    AOM_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter8);
-    below1 = LD_UB(p_src + 11 * src_stride);
-    AOM_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter9);
-    below2 = LD_UB(p_src + 12 * src_stride);
-    AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter10);
-    above2 = LD_UB(p_src + 13 * src_stride);
-    AOM_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter11);
-    above1 = LD_UB(p_src + 14 * src_stride);
-    AOM_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter12);
-    src = LD_UB(p_src + 15 * src_stride);
-    AOM_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter13);
-    below1 = LD_UB(p_src + 16 * src_stride);
-    AOM_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter14);
-    below2 = LD_UB(p_src + 17 * src_stride);
-    AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter15);
-    ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
-           p_dst, dst_stride);
-    ST_UB8(inter8, inter9, inter10, inter11, inter12, inter13, inter14, inter15,
-           p_dst + 8 * dst_stride, dst_stride);
-    p_src += 16;
-    p_dst += 16;
-    f += 16;
-  }
-
-  f = f_orig;
-  p_dst = dst_ptr - 2;
-  LD_UB8(p_dst, dst_stride, inter0, inter1, inter2, inter3, inter4, inter5,
-         inter6, inter7);
-  LD_UB8(p_dst + 8 * dst_stride, dst_stride, inter8, inter9, inter10, inter11,
-         inter12, inter13, inter14, inter15);
-
-  for (col = 0; col < cols / 8; ++col) {
-    ref = LD_UB(f);
-    f += 8;
-    TRANSPOSE12x16_B(inter0, inter1, inter2, inter3, inter4, inter5, inter6,
-                     inter7, inter8, inter9, inter10, inter11, inter12, inter13,
-                     inter14, inter15);
-    if (0 == col) {
-      above2 = inter2;
-      above1 = inter2;
-    } else {
-      above2 = inter0;
-      above1 = inter1;
-    }
-
-    src = inter2;
-    below1 = inter3;
-    below2 = inter4;
-    ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0);
-    AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2);
-    above2 = inter5;
-    ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1);
-    AOM_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3);
-    above1 = inter6;
-    ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2);
-    AOM_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4);
-    src = inter7;
-    ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3);
-    AOM_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5);
-    below1 = inter8;
-    ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4);
-    AOM_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6);
-    below2 = inter9;
-    ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5);
-    AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7);
-    if (col == (cols / 8 - 1)) {
-      above2 = inter9;
-    } else {
-      above2 = inter10;
-    }
-    ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6);
-    AOM_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8);
-    if (col == (cols / 8 - 1)) {
-      above1 = inter9;
-    } else {
-      above1 = inter11;
-    }
-    ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7);
-    AOM_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9);
-    AOM_TRANSPOSE8x16_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7,
-                            inter8, inter9, inter2, inter3, inter4, inter5,
-                            inter6, inter7, inter8, inter9, inter10, inter11,
-                            inter12, inter13, inter14, inter15, above2, above1);
-
-    p_dst += 8;
-    LD_UB2(p_dst, dst_stride, inter0, inter1);
-    ST8x1_UB(inter2, p_dst_st);
-    ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
-    LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
-    ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
-    ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
-    LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
-    ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
-    ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
-    LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
-    ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
-    ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
-    LD_UB2(p_dst + 8 * dst_stride, dst_stride, inter8, inter9);
-    ST8x1_UB(inter10, (p_dst_st + 8 * dst_stride));
-    ST8x1_UB(inter11, (p_dst_st + 9 * dst_stride));
-    LD_UB2(p_dst + 10 * dst_stride, dst_stride, inter10, inter11);
-    ST8x1_UB(inter12, (p_dst_st + 10 * dst_stride));
-    ST8x1_UB(inter13, (p_dst_st + 11 * dst_stride));
-    LD_UB2(p_dst + 12 * dst_stride, dst_stride, inter12, inter13);
-    ST8x1_UB(inter14, (p_dst_st + 12 * dst_stride));
-    ST8x1_UB(inter15, (p_dst_st + 13 * dst_stride));
-    LD_UB2(p_dst + 14 * dst_stride, dst_stride, inter14, inter15);
-    ST8x1_UB(above2, (p_dst_st + 14 * dst_stride));
-    ST8x1_UB(above1, (p_dst_st + 15 * dst_stride));
-    p_dst_st += 8;
-  }
-}
-
-void aom_post_proc_down_and_across_mb_row_msa(uint8_t *src, uint8_t *dst,
-                                              int32_t src_stride,
-                                              int32_t dst_stride, int32_t cols,
-                                              uint8_t *f, int32_t size) {
-  if (8 == size) {
-    postproc_down_across_chroma_msa(src, dst, src_stride, dst_stride, cols, f);
-  } else if (16 == size) {
-    postproc_down_across_luma_msa(src, dst, src_stride, dst_stride, cols, f);
-  }
-}
-
-void aom_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
-                                   int32_t rows, int32_t cols, int32_t flimit) {
-  int32_t row, col, cnt;
-  uint8_t *src_dup = src_ptr;
-  v16u8 src0, src, tmp_orig;
-  v16u8 tmp = { 0 };
-  v16i8 zero = { 0 };
-  v8u16 sum_h, src_r_h, src_l_h;
-  v4u32 src_r_w, src_l_w;
-  v4i32 flimit_vec;
-
-  flimit_vec = __msa_fill_w(flimit);
-  for (row = rows; row--;) {
-    int32_t sum_sq = 0;
-    int32_t sum = 0;
-    src0 = (v16u8)__msa_fill_b(src_dup[0]);
-    ST8x1_UB(src0, (src_dup - 8));
-
-    src0 = (v16u8)__msa_fill_b(src_dup[cols - 1]);
-    ST_UB(src0, src_dup + cols);
-    src_dup[cols + 16] = src_dup[cols - 1];
-    tmp_orig = (v16u8)__msa_ldi_b(0);
-    tmp_orig[15] = tmp[15];
-    src = LD_UB(src_dup - 8);
-    src[15] = 0;
-    ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
-    src_r_w = __msa_dotp_u_w(src_r_h, src_r_h);
-    src_l_w = __msa_dotp_u_w(src_l_h, src_l_h);
-    sum_sq = HADD_SW_S32(src_r_w);
-    sum_sq += HADD_SW_S32(src_l_w);
-    sum_h = __msa_hadd_u_h(src, src);
-    sum = HADD_UH_U32(sum_h);
-    {
-      v16u8 src7, src8, src_r, src_l;
-      v16i8 mask;
-      v8u16 add_r, add_l;
-      v8i16 sub_r, sub_l, sum_r, sum_l, mask0, mask1;
-      v4i32 sum_sq0, sum_sq1, sum_sq2, sum_sq3;
-      v4i32 sub0, sub1, sub2, sub3;
-      v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
-      v4i32 mul0, mul1, mul2, mul3;
-      v4i32 total0, total1, total2, total3;
-      v8i16 const8 = __msa_fill_h(8);
-
-      src7 = LD_UB(src_dup + 7);
-      src8 = LD_UB(src_dup - 8);
-      for (col = 0; col < (cols >> 4); ++col) {
-        ILVRL_B2_UB(src7, src8, src_r, src_l);
-        HSUB_UB2_SH(src_r, src_l, sub_r, sub_l);
-
-        sum_r[0] = sum + sub_r[0];
-        for (cnt = 0; cnt < 7; ++cnt) {
-          sum_r[cnt + 1] = sum_r[cnt] + sub_r[cnt + 1];
-        }
-        sum_l[0] = sum_r[7] + sub_l[0];
-        for (cnt = 0; cnt < 7; ++cnt) {
-          sum_l[cnt + 1] = sum_l[cnt] + sub_l[cnt + 1];
-        }
-        sum = sum_l[7];
-        src = LD_UB(src_dup + 16 * col);
-        ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
-        src7 = (v16u8)((const8 + sum_r + (v8i16)src_r_h) >> 4);
-        src8 = (v16u8)((const8 + sum_l + (v8i16)src_l_h) >> 4);
-        tmp = (v16u8)__msa_pckev_b((v16i8)src8, (v16i8)src7);
-
-        HADD_UB2_UH(src_r, src_l, add_r, add_l);
-        UNPCK_SH_SW(sub_r, sub0, sub1);
-        UNPCK_SH_SW(sub_l, sub2, sub3);
-        ILVR_H2_SW(zero, add_r, zero, add_l, sum0_w, sum2_w);
-        ILVL_H2_SW(zero, add_r, zero, add_l, sum1_w, sum3_w);
-        MUL4(sum0_w, sub0, sum1_w, sub1, sum2_w, sub2, sum3_w, sub3, mul0, mul1,
-             mul2, mul3);
-        sum_sq0[0] = sum_sq + mul0[0];
-        for (cnt = 0; cnt < 3; ++cnt) {
-          sum_sq0[cnt + 1] = sum_sq0[cnt] + mul0[cnt + 1];
-        }
-        sum_sq1[0] = sum_sq0[3] + mul1[0];
-        for (cnt = 0; cnt < 3; ++cnt) {
-          sum_sq1[cnt + 1] = sum_sq1[cnt] + mul1[cnt + 1];
-        }
-        sum_sq2[0] = sum_sq1[3] + mul2[0];
-        for (cnt = 0; cnt < 3; ++cnt) {
-          sum_sq2[cnt + 1] = sum_sq2[cnt] + mul2[cnt + 1];
-        }
-        sum_sq3[0] = sum_sq2[3] + mul3[0];
-        for (cnt = 0; cnt < 3; ++cnt) {
-          sum_sq3[cnt + 1] = sum_sq3[cnt] + mul3[cnt + 1];
-        }
-        sum_sq = sum_sq3[3];
-
-        UNPCK_SH_SW(sum_r, sum0_w, sum1_w);
-        UNPCK_SH_SW(sum_l, sum2_w, sum3_w);
-        total0 = sum_sq0 * __msa_ldi_w(15);
-        total0 -= sum0_w * sum0_w;
-        total1 = sum_sq1 * __msa_ldi_w(15);
-        total1 -= sum1_w * sum1_w;
-        total2 = sum_sq2 * __msa_ldi_w(15);
-        total2 -= sum2_w * sum2_w;
-        total3 = sum_sq3 * __msa_ldi_w(15);
-        total3 -= sum3_w * sum3_w;
-        total0 = (total0 < flimit_vec);
-        total1 = (total1 < flimit_vec);
-        total2 = (total2 < flimit_vec);
-        total3 = (total3 < flimit_vec);
-        PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
-        mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0);
-        tmp = __msa_bmz_v(tmp, src, (v16u8)mask);
-
-        if (col == 0) {
-          uint64_t src_d;
-
-          src_d = __msa_copy_u_d((v2i64)tmp_orig, 1);
-          SD(src_d, (src_dup - 8));
-        }
-
-        src7 = LD_UB(src_dup + 16 * (col + 1) + 7);
-        src8 = LD_UB(src_dup + 16 * (col + 1) - 8);
-        ST_UB(tmp, (src_dup + (16 * col)));
-      }
-
-      src_dup += pitch;
-    }
-  }
-}
-
-void aom_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows,
-                              int32_t cols, int32_t flimit) {
-  int32_t row, col, cnt, i;
-  const int16_t *rv3 = &aom_rv[63 & rand()];
-  v4i32 flimit_vec;
-  v16u8 dst7, dst8, dst_r_b, dst_l_b;
-  v16i8 mask;
-  v8u16 add_r, add_l;
-  v8i16 dst_r_h, dst_l_h, sub_r, sub_l, mask0, mask1;
-  v4i32 sub0, sub1, sub2, sub3, total0, total1, total2, total3;
-
-  flimit_vec = __msa_fill_w(flimit);
-
-  for (col = 0; col < (cols >> 4); ++col) {
-    uint8_t *dst_tmp = &dst_ptr[col << 4];
-    v16u8 dst;
-    v16i8 zero = { 0 };
-    v16u8 tmp[16];
-    v8i16 mult0, mult1, rv2_0, rv2_1;
-    v8i16 sum0_h = { 0 };
-    v8i16 sum1_h = { 0 };
-    v4i32 mul0 = { 0 };
-    v4i32 mul1 = { 0 };
-    v4i32 mul2 = { 0 };
-    v4i32 mul3 = { 0 };
-    v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
-    v4i32 add0, add1, add2, add3;
-    const int16_t *rv2[16];
-
-    dst = LD_UB(dst_tmp);
-    for (cnt = (col << 4), i = 0; i < 16; ++cnt) {
-      rv2[i] = rv3 + ((cnt * 17) & 127);
-      ++i;
-    }
-    for (cnt = -8; cnt < 0; ++cnt) {
-      ST_UB(dst, dst_tmp + cnt * pitch);
-    }
-
-    dst = LD_UB((dst_tmp + (rows - 1) * pitch));
-    for (cnt = rows; cnt < rows + 17; ++cnt) {
-      ST_UB(dst, dst_tmp + cnt * pitch);
-    }
-    for (cnt = -8; cnt <= 6; ++cnt) {
-      dst = LD_UB(dst_tmp + (cnt * pitch));
-      UNPCK_UB_SH(dst, dst_r_h, dst_l_h);
-      MUL2(dst_r_h, dst_r_h, dst_l_h, dst_l_h, mult0, mult1);
-      mul0 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult0);
-      mul1 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult0);
-      mul2 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult1);
-      mul3 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult1);
-      ADD2(sum0_h, dst_r_h, sum1_h, dst_l_h, sum0_h, sum1_h);
-    }
-
-    for (row = 0; row < (rows + 8); ++row) {
-      for (i = 0; i < 8; ++i) {
-        rv2_0[i] = *(rv2[i] + (row & 127));
-        rv2_1[i] = *(rv2[i + 8] + (row & 127));
-      }
-      dst7 = LD_UB(dst_tmp + (7 * pitch));
-      dst8 = LD_UB(dst_tmp - (8 * pitch));
-      ILVRL_B2_UB(dst7, dst8, dst_r_b, dst_l_b);
-
-      HSUB_UB2_SH(dst_r_b, dst_l_b, sub_r, sub_l);
-      UNPCK_SH_SW(sub_r, sub0, sub1);
-      UNPCK_SH_SW(sub_l, sub2, sub3);
-      sum0_h += sub_r;
-      sum1_h += sub_l;
-
-      HADD_UB2_UH(dst_r_b, dst_l_b, add_r, add_l);
-
-      ILVRL_H2_SW(zero, add_r, add0, add1);
-      ILVRL_H2_SW(zero, add_l, add2, add3);
-      mul0 += add0 * sub0;
-      mul1 += add1 * sub1;
-      mul2 += add2 * sub2;
-      mul3 += add3 * sub3;
-      dst = LD_UB(dst_tmp);
-      ILVRL_B2_SH(zero, dst, dst_r_h, dst_l_h);
-      dst7 = (v16u8)((rv2_0 + sum0_h + dst_r_h) >> 4);
-      dst8 = (v16u8)((rv2_1 + sum1_h + dst_l_h) >> 4);
-      tmp[row & 15] = (v16u8)__msa_pckev_b((v16i8)dst8, (v16i8)dst7);
-
-      UNPCK_SH_SW(sum0_h, sum0_w, sum1_w);
-      UNPCK_SH_SW(sum1_h, sum2_w, sum3_w);
-      total0 = mul0 * __msa_ldi_w(15);
-      total0 -= sum0_w * sum0_w;
-      total1 = mul1 * __msa_ldi_w(15);
-      total1 -= sum1_w * sum1_w;
-      total2 = mul2 * __msa_ldi_w(15);
-      total2 -= sum2_w * sum2_w;
-      total3 = mul3 * __msa_ldi_w(15);
-      total3 -= sum3_w * sum3_w;
-      total0 = (total0 < flimit_vec);
-      total1 = (total1 < flimit_vec);
-      total2 = (total2 < flimit_vec);
-      total3 = (total3 < flimit_vec);
-      PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
-      mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0);
-      tmp[row & 15] = __msa_bmz_v(tmp[row & 15], dst, (v16u8)mask);
-
-      if (row >= 8) {
-        ST_UB(tmp[(row - 8) & 15], (dst_tmp - 8 * pitch));
-      }
-
-      dst_tmp += pitch;
-    }
-  }
-}
--- a/aom_dsp/x86/add_noise_sse2.asm
+++ b/aom_dsp/x86/add_noise_sse2.asm
@ -1,83 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-%include "aom_ports/x86_abi_support.asm"
-
-;void aom_plane_add_noise_sse2(unsigned char *start, unsigned char *noise,
-;                              unsigned char blackclamp[16],
-;                              unsigned char whiteclamp[16],
-;                              unsigned char bothclamp[16],
-;                              unsigned int width, unsigned int height,
-;                              int pitch)
-global sym(aom_plane_add_noise_sse2) PRIVATE
-sym(aom_plane_add_noise_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ; get the clamps in registers
-    mov     rdx, arg(2) ; blackclamp
-    movdqu  xmm3, [rdx]
-    mov     rdx, arg(3) ; whiteclamp
-    movdqu  xmm4, [rdx]
-    mov     rdx, arg(4) ; bothclamp
-    movdqu  xmm5, [rdx]
-
-.addnoise_loop:
-    call sym(LIBAOM_RAND) WRT_PLT
-    mov     rcx, arg(1) ;noise
-    and     rax, 0xff
-    add     rcx, rax
-
-    mov     rdi, rcx
-    movsxd  rcx, dword arg(5) ;[Width]
-    mov     rsi, arg(0) ;Pos
-    xor         rax,rax
-
-.addnoise_nextset:
-      movdqu      xmm1,[rsi+rax]         ; get the source
-
-      psubusb     xmm1, xmm3 ; subtract black clamp
-      paddusb     xmm1, xmm5 ; add both clamp
-      psubusb     xmm1, xmm4 ; subtract whiteclamp
-
-      movdqu      xmm2,[rdi+rax]         ; get the noise for this line
-      paddb       xmm1,xmm2              ; add it in
-      movdqu      [rsi+rax],xmm1         ; store the result
-
-      add         rax,16                 ; move to the next line
-
-      cmp         rax, rcx
-      jl          .addnoise_nextset
-
-    movsxd  rax, dword arg(7) ; Pitch
-    add     arg(0), rax ; Start += Pitch
-    sub     dword arg(6), 1   ; Height -= 1
-    jg      .addnoise_loop
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-rd42:
-    times 8 dw 0x04
-four8s:
-    times 4 dd 8
--- a/aom_dsp/x86/deblock_sse2.asm
+++ b/aom_dsp/x86/deblock_sse2.asm
@ -1,661 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-%include "aom_ports/x86_abi_support.asm"
-
-;macro in deblock functions
-%macro FIRST_2_ROWS 0
-        movdqa      xmm4,       xmm0
-        movdqa      xmm6,       xmm0
-        movdqa      xmm5,       xmm1
-        pavgb       xmm5,       xmm3
-
-        ;calculate absolute value
-        psubusb     xmm4,       xmm1
-        psubusb     xmm1,       xmm0
-        psubusb     xmm6,       xmm3
-        psubusb     xmm3,       xmm0
-        paddusb     xmm4,       xmm1
-        paddusb     xmm6,       xmm3
-
-        ;get threshold
-        movdqa      xmm2,       flimit
-        pxor        xmm1,       xmm1
-        movdqa      xmm7,       xmm2
-
-        ;get mask
-        psubusb     xmm2,       xmm4
-        psubusb     xmm7,       xmm6
-        pcmpeqb     xmm2,       xmm1
-        pcmpeqb     xmm7,       xmm1
-        por         xmm7,       xmm2
-%endmacro
-
-%macro SECOND_2_ROWS 0
-        movdqa      xmm6,       xmm0
-        movdqa      xmm4,       xmm0
-        movdqa      xmm2,       xmm1
-        pavgb       xmm1,       xmm3
-
-        ;calculate absolute value
-        psubusb     xmm6,       xmm2
-        psubusb     xmm2,       xmm0
-        psubusb     xmm4,       xmm3
-        psubusb     xmm3,       xmm0
-        paddusb     xmm6,       xmm2
-        paddusb     xmm4,       xmm3
-
-        pavgb       xmm5,       xmm1
-
-        ;get threshold
-        movdqa      xmm2,       flimit
-        pxor        xmm1,       xmm1
-        movdqa      xmm3,       xmm2
-
-        ;get mask
-        psubusb     xmm2,       xmm6
-        psubusb     xmm3,       xmm4
-        pcmpeqb     xmm2,       xmm1
-        pcmpeqb     xmm3,       xmm1
-
-        por         xmm7,       xmm2
-        por         xmm7,       xmm3
-
-        pavgb       xmm5,       xmm0
-
-        ;decide if or not to use filtered value
-        pand        xmm0,       xmm7
-        pandn       xmm7,       xmm5
-        paddusb     xmm0,       xmm7
-%endmacro
-
-%macro UPDATE_FLIMIT 0
-        movdqa      xmm2,       XMMWORD PTR [rbx]
-        movdqa      [rsp],      xmm2
-        add         rbx,        16
-%endmacro
-
-;void aom_post_proc_down_and_across_mb_row_sse2
-;(
-;    unsigned char *src_ptr,
-;    unsigned char *dst_ptr,
-;    int src_pixels_per_line,
-;    int dst_pixels_per_line,
-;    int cols,
-;    int *flimits,
-;    int size
-;)
-global sym(aom_post_proc_down_and_across_mb_row_sse2) PRIVATE
-sym(aom_post_proc_down_and_across_mb_row_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    push        rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-    ALIGN_STACK 16, rax
-    sub         rsp, 16
-
-        ; put flimit on stack
-        mov         rbx,        arg(5)           ;flimits ptr
-        UPDATE_FLIMIT
-
-%define flimit [rsp]
-
-        mov         rsi,        arg(0)           ;src_ptr
-        mov         rdi,        arg(1)           ;dst_ptr
-
-        movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line
-        movsxd      rcx,        DWORD PTR arg(6) ;rows in a macroblock
-.nextrow:
-        xor         rdx,        rdx              ;col
-.nextcol:
-        ;load current and next 2 rows
-        movdqu      xmm0,       XMMWORD PTR [rsi]
-        movdqu      xmm1,       XMMWORD PTR [rsi + rax]
-        movdqu      xmm3,       XMMWORD PTR [rsi + 2*rax]
-
-        FIRST_2_ROWS
-
-        ;load above 2 rows
-        neg         rax
-        movdqu      xmm1,       XMMWORD PTR [rsi + 2*rax]
-        movdqu      xmm3,       XMMWORD PTR [rsi + rax]
-
-        SECOND_2_ROWS
-
-        movdqu      XMMWORD PTR [rdi], xmm0
-
-        neg         rax                          ; positive stride
-        add         rsi,        16
-        add         rdi,        16
-
-        add         rdx,        16
-        cmp         edx,        dword arg(4)     ;cols
-        jge         .downdone
-        UPDATE_FLIMIT
-        jmp         .nextcol
-
-.downdone:
-        ; done with the all cols, start the across filtering in place
-        sub         rsi,        rdx
-        sub         rdi,        rdx
-
-        mov         rbx,        arg(5) ; flimits
-        UPDATE_FLIMIT
-
-        ; dup the first byte into the left border 8 times
-        movq        mm1,   [rdi]
-        punpcklbw   mm1,   mm1
-        punpcklwd   mm1,   mm1
-        punpckldq   mm1,   mm1
-        mov         rdx,    -8
-        movq        [rdi+rdx], mm1
-
-        ; dup the last byte into the right border
-        movsxd      rdx,    dword arg(4)
-        movq        mm1,   [rdi + rdx + -1]
-        punpcklbw   mm1,   mm1
-        punpcklwd   mm1,   mm1
-        punpckldq   mm1,   mm1
-        movq        [rdi+rdx], mm1
-
-        xor         rdx,        rdx
-        movq        mm0,        QWORD PTR [rdi-16];
-        movq        mm1,        QWORD PTR [rdi-8];
-
-.acrossnextcol:
-        movdqu      xmm0,       XMMWORD PTR [rdi + rdx]
-        movdqu      xmm1,       XMMWORD PTR [rdi + rdx -2]
-        movdqu      xmm3,       XMMWORD PTR [rdi + rdx -1]
-
-        FIRST_2_ROWS
-
-        movdqu      xmm1,       XMMWORD PTR [rdi + rdx +1]
-        movdqu      xmm3,       XMMWORD PTR [rdi + rdx +2]
-
-        SECOND_2_ROWS
-
-        movq        QWORD PTR [rdi+rdx-16], mm0  ; store previous 8 bytes
-        movq        QWORD PTR [rdi+rdx-8], mm1   ; store previous 8 bytes
-        movdq2q     mm0,        xmm0
-        psrldq      xmm0,       8
-        movdq2q     mm1,        xmm0
-
-        add         rdx,        16
-        cmp         edx,        dword arg(4)     ;cols
-        jge         .acrossdone
-        UPDATE_FLIMIT
-        jmp         .acrossnextcol
-
-.acrossdone:
-        ; last 16 pixels
-        movq        QWORD PTR [rdi+rdx-16], mm0
-
-        cmp         edx,        dword arg(4)
-        jne         .throw_last_8
-        movq        QWORD PTR [rdi+rdx-8], mm1
-.throw_last_8:
-        ; done with this rwo
-        add         rsi,rax                      ;next src line
-        mov         eax, dword arg(3)            ;dst_pixels_per_line
-        add         rdi,rax                      ;next destination
-        mov         eax, dword arg(2)            ;src_pixels_per_line
-
-        mov         rbx,        arg(5)           ;flimits
-        UPDATE_FLIMIT
-
-        dec         rcx                          ;decrement count
-        jnz         .nextrow                     ;next row
-
-    add rsp, 16
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    pop rbx
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%undef flimit
-
-;void aom_mbpost_proc_down_xmm(unsigned char *dst,
-;                            int pitch, int rows, int cols,int flimit)
-extern sym(aom_rv)
-global sym(aom_mbpost_proc_down_xmm) PRIVATE
-sym(aom_mbpost_proc_down_xmm):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 128+16
-
-    ; unsigned char d[16][8] at [rsp]
-    ; create flimit2 at [rsp+128]
-    mov         eax, dword ptr arg(4) ;flimit
-    mov         [rsp+128], eax
-    mov         [rsp+128+4], eax
-    mov         [rsp+128+8], eax
-    mov         [rsp+128+12], eax
-%define flimit4 [rsp+128]
-
-%if ABI_IS_32BIT=0
-    lea         r8,       [GLOBAL(sym(aom_rv))]
-%endif
-
-    ;rows +=8;
-    add         dword arg(2), 8
-
-    ;for(c=0; c<cols; c+=8)
-.loop_col:
-            mov         rsi,        arg(0) ; s
-            pxor        xmm0,       xmm0        ;
-
-            movsxd      rax,        dword ptr arg(1) ;pitch       ;
-
-            ; this copies the last row down into the border 8 rows
-            mov         rdi,        rsi
-            mov         rdx,        arg(2)
-            sub         rdx,        9
-            imul        rdx,        rax
-            lea         rdi,        [rdi+rdx]
-            movq        xmm1,       QWORD ptr[rdi]              ; first row
-            mov         rcx,        8
-.init_borderd:                                                  ; initialize borders
-            lea         rdi,        [rdi + rax]
-            movq        [rdi],      xmm1
-
-            dec         rcx
-            jne         .init_borderd
-
-            neg         rax                                     ; rax = -pitch
-
-            ; this copies the first row up into the border 8 rows
-            mov         rdi,        rsi
-            movq        xmm1,       QWORD ptr[rdi]              ; first row
-            mov         rcx,        8
-.init_border:                                                   ; initialize borders
-            lea         rdi,        [rdi + rax]
-            movq        [rdi],      xmm1
-
-            dec         rcx
-            jne         .init_border
-
-
-
-            lea         rsi,        [rsi + rax*8];              ; rdi = s[-pitch*8]
-            neg         rax
-
-            pxor        xmm5,       xmm5
-            pxor        xmm6,       xmm6        ;
-
-            pxor        xmm7,       xmm7        ;
-            mov         rdi,        rsi
-
-            mov         rcx,        15          ;
-
-.loop_initvar:
-            movq        xmm1,       QWORD PTR [rdi];
-            punpcklbw   xmm1,       xmm0        ;
-
-            paddw       xmm5,       xmm1        ;
-            pmullw      xmm1,       xmm1        ;
-
-            movdqa      xmm2,       xmm1        ;
-            punpcklwd   xmm1,       xmm0        ;
-
-            punpckhwd   xmm2,       xmm0        ;
-            paddd       xmm6,       xmm1        ;
-
-            paddd       xmm7,       xmm2        ;
-            lea         rdi,        [rdi+rax]   ;
-
-            dec         rcx
-            jne         .loop_initvar
-            ;save the var and sum
-            xor         rdx,        rdx
-.loop_row:
-            movq        xmm1,       QWORD PTR [rsi]     ; [s-pitch*8]
-            movq        xmm2,       QWORD PTR [rdi]     ; [s+pitch*7]
-
-            punpcklbw   xmm1,       xmm0
-            punpcklbw   xmm2,       xmm0
-
-            paddw       xmm5,       xmm2
-            psubw       xmm5,       xmm1
-
-            pmullw      xmm2,       xmm2
-            movdqa      xmm4,       xmm2
-
-            punpcklwd   xmm2,       xmm0
-            punpckhwd   xmm4,       xmm0
-
-            paddd       xmm6,       xmm2
-            paddd       xmm7,       xmm4
-
-            pmullw      xmm1,       xmm1
-            movdqa      xmm2,       xmm1
-
-            punpcklwd   xmm1,       xmm0
-            psubd       xmm6,       xmm1
-
-            punpckhwd   xmm2,       xmm0
-            psubd       xmm7,       xmm2
-
-
-            movdqa      xmm3,       xmm6
-            pslld       xmm3,       4
-
-            psubd       xmm3,       xmm6
-            movdqa      xmm1,       xmm5
-
-            movdqa      xmm4,       xmm5
-            pmullw      xmm1,       xmm1
-
-            pmulhw      xmm4,       xmm4
-            movdqa      xmm2,       xmm1
-
-            punpcklwd   xmm1,       xmm4
-            punpckhwd   xmm2,       xmm4
-
-            movdqa      xmm4,       xmm7
-            pslld       xmm4,       4
-
-            psubd       xmm4,       xmm7
-
-            psubd       xmm3,       xmm1
-            psubd       xmm4,       xmm2
-
-            psubd       xmm3,       flimit4
-            psubd       xmm4,       flimit4
-
-            psrad       xmm3,       31
-            psrad       xmm4,       31
-
-            packssdw    xmm3,       xmm4
-            packsswb    xmm3,       xmm0
-
-            movq        xmm1,       QWORD PTR [rsi+rax*8]
-
-            movq        xmm2,       xmm1
-            punpcklbw   xmm1,       xmm0
-
-            paddw       xmm1,       xmm5
-            mov         rcx,        rdx
-
-            and         rcx,        127
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
-            push        rax
-            lea         rax,        [GLOBAL(sym(aom_rv))]
-            movdqu      xmm4,       [rax + rcx*2] ;aom_rv[rcx*2]
-            pop         rax
-%elif ABI_IS_32BIT=0
-            movdqu      xmm4,       [r8 + rcx*2] ;aom_rv[rcx*2]
-%else
-            movdqu      xmm4,       [sym(aom_rv) + rcx*2]
-%endif
-
-            paddw       xmm1,       xmm4
-            ;paddw     xmm1,       eight8s
-            psraw       xmm1,       4
-
-            packuswb    xmm1,       xmm0
-            pand        xmm1,       xmm3
-
-            pandn       xmm3,       xmm2
-            por         xmm1,       xmm3
-
-            and         rcx,        15
-            movq        QWORD PTR   [rsp + rcx*8], xmm1 ;d[rcx*8]
-
-            cmp         edx,        8
-            jl          .skip_assignment
-
-            mov         rcx,        rdx
-            sub         rcx,        8
-            and         rcx,        15
-            movq        mm0,        [rsp + rcx*8] ;d[rcx*8]
-            movq        [rsi],      mm0
-
-.skip_assignment:
-            lea         rsi,        [rsi+rax]
-
-            lea         rdi,        [rdi+rax]
-            add         rdx,        1
-
-            cmp         edx,        dword arg(2) ;rows
-            jl          .loop_row
-
-        add         dword arg(0), 8 ; s += 8
-        sub         dword arg(3), 8 ; cols -= 8
-        cmp         dword arg(3), 0
-        jg          .loop_col
-
-    add         rsp, 128+16
-    pop         rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%undef flimit4
-
-
-;void aom_mbpost_proc_across_ip_xmm(unsigned char *src,
-;                                int pitch, int rows, int cols,int flimit)
-global sym(aom_mbpost_proc_across_ip_xmm) PRIVATE
-sym(aom_mbpost_proc_across_ip_xmm):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16
-
-    ; create flimit4 at [rsp]
-    mov         eax, dword ptr arg(4) ;flimit
-    mov         [rsp], eax
-    mov         [rsp+4], eax
-    mov         [rsp+8], eax
-    mov         [rsp+12], eax
-%define flimit4 [rsp]
-
-
-    ;for(r=0;r<rows;r++)
-.ip_row_loop:
-
-        xor         rdx,    rdx ;sumsq=0;
-        xor         rcx,    rcx ;sum=0;
-        mov         rsi,    arg(0); s
-
-
-        ; dup the first byte into the left border 8 times
-        movq        mm1,   [rsi]
-        punpcklbw   mm1,   mm1
-        punpcklwd   mm1,   mm1
-        punpckldq   mm1,   mm1
-
-        mov         rdi,    -8
-        movq        [rsi+rdi], mm1
-
-        ; dup the last byte into the right border
-        movsxd      rdx,    dword arg(3)
-        movq        mm1,   [rsi + rdx + -1]
-        punpcklbw   mm1,   mm1
-        punpcklwd   mm1,   mm1
-        punpckldq   mm1,   mm1
-        movq        [rsi+rdx], mm1
-
-.ip_var_loop:
-        ;for(i=-8;i<=6;i++)
-        ;{
-        ;    sumsq += s[i]*s[i];
-        ;    sum   += s[i];
-        ;}
-        movzx       eax, byte [rsi+rdi]
-        add         ecx, eax
-        mul         al
-        add         edx, eax
-        add         rdi, 1
-        cmp         rdi, 6
-        jle         .ip_var_loop
-
-
-            ;mov         rax,    sumsq
-            ;movd        xmm7,   rax
-            movd        xmm7,   edx
-
-            ;mov         rax,    sum
-            ;movd        xmm6,   rax
-            movd        xmm6,   ecx
-
-            mov         rsi,    arg(0) ;s
-            xor         rcx,    rcx
-
-            movsxd      rdx,    dword arg(3) ;cols
-            add         rdx,    8
-            pxor        mm0,    mm0
-            pxor        mm1,    mm1
-
-            pxor        xmm0,   xmm0
-.nextcol4:
-
-            movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5
-            movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10
-
-            punpcklbw   xmm1,   xmm0                    ; expanding
-            punpcklbw   xmm2,   xmm0                    ; expanding
-
-            punpcklwd   xmm1,   xmm0                    ; expanding to dwords
-            punpcklwd   xmm2,   xmm0                    ; expanding to dwords
-
-            psubd       xmm2,   xmm1                    ; 7--8   8--7   9--6 10--5
-            paddd       xmm1,   xmm1                    ; -8*2   -7*2   -6*2 -5*2
-
-            paddd       xmm1,   xmm2                    ; 7+-8   8+-7   9+-6 10+-5
-            pmaddwd     xmm1,   xmm2                    ; squared of 7+-8   8+-7   9+-6 10+-5
-
-            paddd       xmm6,   xmm2
-            paddd       xmm7,   xmm1
-
-            pshufd      xmm6,   xmm6,   0               ; duplicate the last ones
-            pshufd      xmm7,   xmm7,   0               ; duplicate the last ones
-
-            psrldq      xmm1,       4                   ; 8--7   9--6 10--5  0000
-            psrldq      xmm2,       4                   ; 8--7   9--6 10--5  0000
-
-            pshufd      xmm3,   xmm1,   3               ; 0000  8--7   8--7   8--7 squared
-            pshufd      xmm4,   xmm2,   3               ; 0000  8--7   8--7   8--7 squared
-
-            paddd       xmm6,   xmm4
-            paddd       xmm7,   xmm3
-
-            pshufd      xmm3,   xmm1,   01011111b       ; 0000  0000   9--6   9--6 squared
-            pshufd      xmm4,   xmm2,   01011111b       ; 0000  0000   9--6   9--6 squared
-
-            paddd       xmm7,   xmm3
-            paddd       xmm6,   xmm4
-
-            pshufd      xmm3,   xmm1,   10111111b       ; 0000  0000   8--7   8--7 squared
-            pshufd      xmm4,   xmm2,   10111111b       ; 0000  0000   8--7   8--7 squared
-
-            paddd       xmm7,   xmm3
-            paddd       xmm6,   xmm4
-
-            movdqa      xmm3,   xmm6
-            pmaddwd     xmm3,   xmm3
-
-            movdqa      xmm5,   xmm7
-            pslld       xmm5,   4
-
-            psubd       xmm5,   xmm7
-            psubd       xmm5,   xmm3
-
-            psubd       xmm5,   flimit4
-            psrad       xmm5,   31
-
-            packssdw    xmm5,   xmm0
-            packsswb    xmm5,   xmm0
-
-            movd        xmm1,   DWORD PTR [rsi+rcx]
-            movq        xmm2,   xmm1
-
-            punpcklbw   xmm1,   xmm0
-            punpcklwd   xmm1,   xmm0
-
-            paddd       xmm1,   xmm6
-            paddd       xmm1,   [GLOBAL(four8s)]
-
-            psrad       xmm1,   4
-            packssdw    xmm1,   xmm0
-
-            packuswb    xmm1,   xmm0
-            pand        xmm1,   xmm5
-
-            pandn       xmm5,   xmm2
-            por         xmm5,   xmm1
-
-            movd        [rsi+rcx-8],  mm0
-            movq        mm0,    mm1
-
-            movdq2q     mm1,    xmm5
-            psrldq      xmm7,   12
-
-            psrldq      xmm6,   12
-            add         rcx,    4
-
-            cmp         rcx,    rdx
-            jl          .nextcol4
-
-        ;s+=pitch;
-        movsxd rax, dword arg(1)
-        add    arg(0), rax
-
-        sub dword arg(2), 1 ;rows-=1
-        cmp dword arg(2), 0
-        jg .ip_row_loop
-
-    add         rsp, 16
-    pop         rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%undef flimit4
-
-
-SECTION_RODATA
-align 16
-four8s:
-    times 4 dd 8
--- a/test/add_noise_test.cc
+++ b/test/add_noise_test.cc
@ -1,153 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <math.h>
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "third_party/googletest/src/include/gtest/gtest.h"
-#include "./aom_dsp_rtcd.h"
-#include "aom/aom_integer.h"
-#include "aom_dsp/postproc.h"
-#include "aom_mem/aom_mem.h"
-
-namespace {
-
-/* Pointer type for the noise-adding routines under test. The tests in this
- * file call it with (image buffer, noise table, three 16-entry clamp arrays,
- * width, height, pitch) in that order, and the INSTANTIATE_TEST_CASE_P lines
- * at the end of the file bind it to the C, SSE2 and MSA implementations.
- * TODO(jimbankoski): make width and height integers not unsigned. */
-typedef void (*AddNoiseFunc)(unsigned char *start, char *noise,
-                             char blackclamp[16], char whiteclamp[16],
-                             char bothclamp[16], unsigned int width,
-                             unsigned int height, int pitch);
-
-class AddNoiseTest : public ::testing::TestWithParam<AddNoiseFunc> {
- public:  // Value-parameterized fixture: GetParam() yields the AddNoiseFunc under test.
-  virtual void TearDown() { libaom_test::ClearSystemState(); }  // gtest runs this after each test body
-  virtual ~AddNoiseTest() {}
-};
-
-double stddev6(char a, char b, char c, char d, char e, char f) {  // std-dev of six samples; call sites pass int differences that narrow to char
-  const double n = (a + b + c + d + e + f) / 6.0;  // arithmetic mean of the six samples
-  const double v = ((a - n) * (a - n) + (b - n) * (b - n) + (c - n) * (c - n) +
-                    (d - n) * (d - n) + (e - n) * (e - n) + (f - n) * (f - n)) /
-                   6.0;  // mean squared deviation: divides by N (6), not N - 1
-  return sqrt(v);  // 0 exactly when all six samples are equal
-}
-
-TEST_P(AddNoiseTest, CheckNoiseAdded) {  // noise must vary locally and must not wrap at 255 or 0
-  DECLARE_ALIGNED(16, char, blackclamp[16]);
-  DECLARE_ALIGNED(16, char, whiteclamp[16]);
-  DECLARE_ALIGNED(16, char, bothclamp[16]);
-  const int width = 64;
-  const int height = 64;
-  const int image_size = width * height;  // width is also passed as the pitch, so rows are contiguous
-  char noise[3072];
-  const int clamp = aom_setup_noise(4.4, sizeof(noise), noise);  // presumably fills noise[]; the return value is used as the clamp
-
-  for (int i = 0; i < 16; i++) {  // replicate the clamp across all 16 entries of each array
-    blackclamp[i] = clamp;
-    whiteclamp[i] = clamp;
-    bothclamp[i] = 2 * clamp;
-  }
-
-  uint8_t *const s = reinterpret_cast<uint8_t *>(aom_calloc(image_size, 1));  // NOTE(review): result is not checked for NULL
-  memset(s, 99, image_size);  // flat image: every pixel starts at 99
-
-  ASM_REGISTER_STATE_CHECK(GetParam()(s, noise, blackclamp, whiteclamp,
-                                      bothclamp, width, height, width));
-
-  // Slide a 6-sample window along rows (hd) and columns (vd); a zero standard
-  // deviation would mean identical or missing noise across that window.
-  for (int i = 0; i < image_size - 6 * width - 6; ++i) {  // bound keeps i + 5 and i + 5 * width inside the buffer
-    const double hd = stddev6(s[i] - 99, s[i + 1] - 99, s[i + 2] - 99,
-                              s[i + 3] - 99, s[i + 4] - 99, s[i + 5] - 99);  // subtracting 99 leaves the per-pixel change
-    const double vd = stddev6(s[i] - 99, s[i + width] - 99,
-                              s[i + 2 * width] - 99, s[i + 3 * width] - 99,
-                              s[i + 4 * width] - 99, s[i + 5 * width] - 99);
-
-    EXPECT_NE(hd, 0);
-    EXPECT_NE(vd, 0);
-  }
-
-  // Initialize pixels in the image to 255 and check for roll over.
-  memset(s, 255, image_size);
-
-  ASM_REGISTER_STATE_CHECK(GetParam()(s, noise, blackclamp, whiteclamp,
-                                      bothclamp, width, height, width));
-
-  // Every pixel must still exceed clamp, i.e. must not have rolled over.
-  for (int i = 0; i < image_size; ++i) {
-    EXPECT_GT(static_cast<int>(s[i]), clamp) << "i = " << i;
-  }
-
-  // Initialize pixels in the image to 0 and check for roll under.
-  memset(s, 0, image_size);
-
-  ASM_REGISTER_STATE_CHECK(GetParam()(s, noise, blackclamp, whiteclamp,
-                                      bothclamp, width, height, width));
-
-  // Every pixel must stay below 255 - clamp, i.e. must not have rolled under.
-  for (int i = 0; i < image_size; ++i) {
-    EXPECT_LT(static_cast<int>(s[i]), 255 - clamp) << "i = " << i;
-  }
-
-  aom_free(s);
-}
-
-TEST_P(AddNoiseTest, CheckCvsAssembly) {  // output of GetParam() must match aom_plane_add_noise_c pixel for pixel
-  DECLARE_ALIGNED(16, char, blackclamp[16]);
-  DECLARE_ALIGNED(16, char, whiteclamp[16]);
-  DECLARE_ALIGNED(16, char, bothclamp[16]);
-  const int width = 64;
-  const int height = 64;
-  const int image_size = width * height;  // width is also passed as the pitch, so rows are contiguous
-  char noise[3072];
-
-  const int clamp = aom_setup_noise(4.4, sizeof(noise), noise);  // presumably fills noise[]; the return value is used as the clamp
-
-  for (int i = 0; i < 16; i++) {  // replicate the clamp across all 16 entries of each array
-    blackclamp[i] = clamp;
-    whiteclamp[i] = clamp;
-    bothclamp[i] = 2 * clamp;
-  }
-
-  uint8_t *const s = reinterpret_cast<uint8_t *>(aom_calloc(image_size, 1));  // written by the implementation under test
-  uint8_t *const d = reinterpret_cast<uint8_t *>(aom_calloc(image_size, 1));  // written by the C reference; NOTE(review): neither buffer is NULL-checked
-
-  memset(s, 99, image_size);  // both images start identical
-  memset(d, 99, image_size);
-
-  srand(0);  // reseed so any rand() use inside the call sees the same sequence as the reference run
-  ASM_REGISTER_STATE_CHECK(GetParam()(s, noise, blackclamp, whiteclamp,
-                                      bothclamp, width, height, width));
-  srand(0);  // same seed again for the reference run
-  ASM_REGISTER_STATE_CHECK(aom_plane_add_noise_c(
-      d, noise, blackclamp, whiteclamp, bothclamp, width, height, width));
-
-  for (int i = 0; i < image_size; ++i) {  // report the index of every mismatching pixel
-    EXPECT_EQ(static_cast<int>(s[i]), static_cast<int>(d[i])) << "i = " << i;
-  }
-
-  aom_free(d);
-  aom_free(s);
-}
-
-INSTANTIATE_TEST_CASE_P(C, AddNoiseTest,  // always built; in CheckCvsAssembly this compares the C routine with itself
-                        ::testing::Values(aom_plane_add_noise_c));
-
-#if HAVE_SSE2  // instantiated only when the build configuration sets HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, AddNoiseTest,
-                        ::testing::Values(aom_plane_add_noise_sse2));
-#endif  // HAVE_SSE2
-
-#if HAVE_MSA  // instantiated only when the build configuration sets HAVE_MSA
-INSTANTIATE_TEST_CASE_P(MSA, AddNoiseTest,
-                        ::testing::Values(aom_plane_add_noise_msa));
-#endif  // HAVE_MSA
-}  // namespace