Add SIMD support for CDEF dering for sse2/ssse3 and neon
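The kernels are written once against the portable v128 SIMD abstraction
(av1/common/od_dering_simd.h) and compiled once per target by defining
SIMD_FUNC before including the kernel; runtime dispatch is then wired up
through the rtcd "specialize" lines. As a rough sketch of what that
dispatch amounts to (simplified; the real declarations and CPU-capability
checks live in the generated av1_rtcd.h, and setup_rtcd_internal() below
is an illustrative assumption, not the generated code):

int od_dir_find8_c(const od_dering_in *img, int stride, int32_t *var,
                   int coeff_shift);
int od_dir_find8_sse4_1(const od_dering_in *img, int stride, int32_t *var,
                        int coeff_shift);

int (*od_dir_find8)(const od_dering_in *img, int stride, int32_t *var,
                    int coeff_shift) = od_dir_find8_c;

static void setup_rtcd_internal(int flags) {
  if (flags & HAS_SSE4_1) od_dir_find8 = od_dir_find8_sse4_1;
}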
Change-Id: Ibaaed850ddceba9c3db542eaf4a1c623ce6b412b
This commit is contained in:
Parent: 7faea43653
Commit: b8ff6aaf5d
@@ -851,22 +851,6 @@ specialize qw/aom_lpf_horizontal_4 sse2 neon dspr2 msa/;
add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/;

if (aom_config("CONFIG_CDEF") eq "yes") {
  add_proto qw/void aom_clpf_block_hbd/, "uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
  add_proto qw/void aom_clpf_hblock_hbd/, "uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
  add_proto qw/void aom_clpf_block/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
  add_proto qw/void aom_clpf_hblock/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
  # VS compiling for 32 bit targets does not support vector types in
  # structs as arguments, which makes the v256 type of the intrinsics
  # hard to support, so optimizations for this target are disabled.
  if ($opts{config} !~ /libs-x86-win32-vs.*/) {
    specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/;
    specialize qw/aom_clpf_hblock_hbd sse2 ssse3 sse4_1 neon/;
    specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/;
    specialize qw/aom_clpf_hblock sse2 ssse3 sse4_1 neon/;
  }
}

if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
  add_proto qw/void aom_highbd_lpf_vertical_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
  specialize qw/aom_highbd_lpf_vertical_16 sse2/;
@@ -231,47 +231,32 @@ if (CONFIG_CDEF)
    "${AOM_ROOT}/av1/common/cdef.c"
    "${AOM_ROOT}/av1/common/cdef.h"
    "${AOM_ROOT}/av1/common/od_dering.c"
    "${AOM_ROOT}/av1/common/od_dering.h")
    "${AOM_ROOT}/av1/common/od_dering.h"
    "${AOM_ROOT}/av1/common/od_dering_simd.h")

set(AOM_AV1_ENCODER_SOURCES
    ${AOM_AV1_ENCODER_SOURCES}
    "${AOM_ROOT}/av1/encoder/clpf_rdo.c"
    "${AOM_ROOT}/av1/encoder/clpf_rdo.h"
    "${AOM_ROOT}/av1/encoder/pickcdef.c")

set(AOM_AV1_COMMON_SSE2_INTRIN
    ${AOM_AV1_COMMON_SSE2_INTRIN}
    "${AOM_ROOT}/av1/common/clpf_sse2.c")
    "${AOM_ROOT}/av1/common/clpf_sse2.c"
    "${AOM_ROOT}/av1/common/od_dering_sse2.c")

set(AOM_AV1_COMMON_SSSE3_INTRIN
    ${AOM_AV1_COMMON_SSSE3_INTRIN}
    "${AOM_ROOT}/av1/common/clpf_ssse3.c")
    "${AOM_ROOT}/av1/common/clpf_ssse3.c"
    "${AOM_ROOT}/av1/common/od_dering_ssse3.c")

set(AOM_AV1_COMMON_SSE4_1_INTRIN
    ${AOM_AV1_COMMON_SSE4_1_INTRIN}
    "${AOM_ROOT}/av1/common/clpf_sse4.c")
    "${AOM_ROOT}/av1/common/clpf_sse4.c"
    "${AOM_ROOT}/av1/common/od_dering_sse4.c")

set(AOM_AV1_COMMON_NEON_INTRIN
    ${AOM_AV1_COMMON_NEON_INTRIN}
    "${AOM_ROOT}/av1/common/clpf_neon.c")

set(AOM_AV1_ENCODER_SSE2_INTRIN
    ${AOM_AV1_ENCODER_SSE2_INTRIN}
    "${AOM_ROOT}/av1/encoder/clpf_rdo_sse2.c")

set(AOM_AV1_ENCODER_SSSE3_INTRIN
    ${AOM_AV1_ENCODER_SSSE3_INTRIN}
    "${AOM_ROOT}/av1/encoder/clpf_rdo_ssse3.c")

set(AOM_AV1_ENCODER_SSE4_1_INTRIN
    ${AOM_AV1_ENCODER_SSE4_1_INTRIN}
    "${AOM_ROOT}/av1/encoder/clpf_rdo_sse4.c"
    "${AOM_ROOT}/av1/common/x86/od_dering_sse4.c"
    "${AOM_ROOT}/av1/common/x86/od_dering_sse4.h")

set(AOM_AV1_ENCODER_NEON_INTRIN
    ${AOM_AV1_ENCODER_NEON_INTRIN}
    "${AOM_ROOT}/av1/encoder/clpf_rdo_neon.c")
    "${AOM_ROOT}/av1/common/clpf_neon.c"
    "${AOM_ROOT}/av1/common/od_dering_neon.c")
endif ()

if (CONFIG_EXT_INTER)
@@ -94,10 +94,13 @@ AV1_COMMON_SRCS-$(HAVE_SSE2) += common/clpf_sse2.c
AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/clpf_ssse3.c
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/clpf_sse4.c
AV1_COMMON_SRCS-$(HAVE_NEON) += common/clpf_neon.c
AV1_COMMON_SRCS-$(HAVE_SSE2) += common/od_dering_sse2.c
AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/od_dering_ssse3.c
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/od_dering_sse4.c
AV1_COMMON_SRCS-$(HAVE_NEON) += common/od_dering_neon.c
AV1_COMMON_SRCS-yes += common/od_dering.c
AV1_COMMON_SRCS-yes += common/od_dering.h
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/od_dering_sse4.c
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/od_dering_sse4.h
AV1_COMMON_SRCS-yes += common/od_dering_simd.h
AV1_COMMON_SRCS-yes += common/cdef.c
AV1_COMMON_SRCS-yes += common/cdef.h
endif
@@ -753,14 +753,25 @@ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
# Deringing Functions

if (aom_config("CONFIG_CDEF") eq "yes") {
  add_proto qw/void aom_clpf_block_hbd/, "uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
  add_proto qw/void aom_clpf_hblock_hbd/, "uint16_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
  add_proto qw/void aom_clpf_block/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
  add_proto qw/void aom_clpf_hblock/, "uint8_t *dst, const uint16_t *src, int dstride, int sstride, int sizex, int sizey, unsigned int strength, unsigned int bd";
  add_proto qw/int od_dir_find8/, "const od_dering_in *img, int stride, int32_t *var, int coeff_shift";
  specialize qw/od_dir_find8 sse4_1/;

  add_proto qw/int od_filter_dering_direction_4x4/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir";
  specialize qw/od_filter_dering_direction_4x4 sse4_1/;

  add_proto qw/int od_filter_dering_direction_8x8/, "uint16_t *y, int ystride, const uint16_t *in, int threshold, int dir";
  specialize qw/od_filter_dering_direction_8x8 sse4_1/;
  # VS compiling for 32 bit targets does not support vector types in
  # structs as arguments, which makes the v256 type of the intrinsics
  # hard to support, so optimizations for this target are disabled.
  if ($opts{config} !~ /libs-x86-win32-vs.*/) {
    specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/;
    specialize qw/aom_clpf_hblock_hbd sse2 ssse3 sse4_1 neon/;
    specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/;
    specialize qw/aom_clpf_hblock sse2 ssse3 sse4_1 neon/;
    specialize qw/od_dir_find8 sse2 ssse3 sse4_1 neon/;
    specialize qw/od_filter_dering_direction_4x4 sse2 ssse3 sse4_1 neon/;
    specialize qw/od_filter_dering_direction_8x8 sse2 ssse3 sse4_1 neon/;
  }
}

# PVQ Functions
@@ -10,7 +10,7 @@
 */

#include "av1/common/clpf.h"
#include "./aom_dsp_rtcd.h"
#include "./av1_rtcd.h"
#include "aom/aom_image.h"
#include "aom_dsp/aom_dsp_common.h"
@@ -9,7 +9,7 @@
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "./aom_dsp_rtcd.h"
#include "./av1_rtcd.h"
#include "aom_ports/mem.h"
#include "aom_ports/bitops.h"
#include "av1/common/clpf_simd_kernel.h"
@@ -56,10 +56,4 @@ void od_dering(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in, int xdec,
               dering_list *dlist, int dering_count, int level,
               int clpf_strength, int clpf_damping, int coeff_shift,
               int skip_dering, int hbd);
int od_filter_dering_direction_4x4_c(uint16_t *y, int ystride,
                                     const uint16_t *in, int threshold,
                                     int dir);
int od_filter_dering_direction_8x8_c(uint16_t *y, int ystride,
                                     const uint16_t *in, int threshold,
                                     int dir);
#endif
@@ -8,7 +8,7 @@
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include "av1/common/od_dering.h"
#ifndef AOM_COMMON_OD_DERING_X86_SSE4_H_
#define AOM_COMMON_OD_DERING_X86_SSE4_H_
#endif  // AOM_COMMON_OD_DERING_X86_SSE4_H_

#include "aom_dsp/aom_simd.h"
#define SIMD_FUNC(name) name##_neon
#include "./od_dering_simd.h"
@@ -0,0 +1,341 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "./av1_rtcd.h"
#include "./od_dering.h"

/* partial A is a 16-bit vector of the form:
   [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
   [0  y1 y2 y3 y4 y5 y6 y7].
   This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
   (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
   and const2. */
static INLINE v128 fold_mul_and_sum(v128 partiala, v128 partialb, v128 const1,
                                    v128 const2) {
  v128 tmp;
  /* Reverse partial B. */
  partialb = v128_shuffle_8(
      partialb, v128_from_32(0x0f0e0100, 0x03020504, 0x07060908, 0x0b0a0d0c));
  /* Interleave the x and y values of identical indices and pair x8 with 0. */
  tmp = partiala;
  partiala = v128_ziplo_16(partialb, partiala);
  partialb = v128_ziphi_16(partialb, tmp);
  /* Square and add the corresponding x and y values. */
  partiala = v128_madd_s16(partiala, partiala);
  partialb = v128_madd_s16(partialb, partialb);
  /* Multiply by constant. */
  partiala = v128_mullo_s32(partiala, const1);
  partialb = v128_mullo_s32(partialb, const2);
  /* Sum all results. */
  partiala = v128_add_32(partiala, partialb);
  return partiala;
}
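
/* A note on the weights fed to fold_mul_and_sum() by compute_directions()
   further down: each diagonal partial sum covers between 1 and 8 pixels,
   so before the costs can be compared every squared sum is normalised by
   its pixel count N. Multiplying by 840/N keeps the arithmetic in
   integers, 840 being divisible by every N in 1..8: 840, 420, 280, 210
   are 840/1..840/4 and 168, 140, 120, 105 are 840/5..840/8. (This is a
   reading of the constants, added here as an aid; the change itself
   carries no such comment.) */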

static INLINE v128 hsum4(v128 x0, v128 x1, v128 x2, v128 x3) {
  v128 t0, t1, t2, t3;
  t0 = v128_ziplo_32(x1, x0);
  t1 = v128_ziplo_32(x3, x2);
  t2 = v128_ziphi_32(x1, x0);
  t3 = v128_ziphi_32(x3, x2);
  x0 = v128_ziplo_64(t1, t0);
  x1 = v128_ziphi_64(t1, t0);
  x2 = v128_ziplo_64(t3, t2);
  x3 = v128_ziphi_64(t3, t2);
  return v128_add_32(v128_add_32(x0, x1), v128_add_32(x2, x3));
}
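
/* hsum4() above is in effect a 4x4 transpose of 32-bit lanes followed by
   adds: after the zip steps, lane j of each of x0..x3 holds element 0..3
   of input vector j, so the final additions leave lane j containing the
   horizontal sum of input j -- four horizontal sums in one vector. */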

/* Computes cost for directions 0, 5, 6 and 7. We can call this function again
   to compute the remaining directions. */
static INLINE void compute_directions(v128 lines[8], int32_t tmp_cost1[4]) {
  v128 partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
  v128 partial6;
  v128 tmp;
  /* Partial sums for lines 0 and 1. */
  partial4a = v128_shl_n_byte(lines[0], 14);
  partial4b = v128_shr_n_byte(lines[0], 2);
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[1], 12));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[1], 4));
  tmp = v128_add_16(lines[0], lines[1]);
  partial5a = v128_shl_n_byte(tmp, 10);
  partial5b = v128_shr_n_byte(tmp, 6);
  partial7a = v128_shl_n_byte(tmp, 4);
  partial7b = v128_shr_n_byte(tmp, 12);
  partial6 = tmp;

  /* Partial sums for lines 2 and 3. */
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[2], 10));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[2], 6));
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[3], 8));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[3], 8));
  tmp = v128_add_16(lines[2], lines[3]);
  partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 8));
  partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 8));
  partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 6));
  partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 10));
  partial6 = v128_add_16(partial6, tmp);

  /* Partial sums for lines 4 and 5. */
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[4], 6));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[4], 10));
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[5], 4));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[5], 12));
  tmp = v128_add_16(lines[4], lines[5]);
  partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 6));
  partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 10));
  partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 8));
  partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 8));
  partial6 = v128_add_16(partial6, tmp);

  /* Partial sums for lines 6 and 7. */
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[6], 2));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[6], 14));
  partial4a = v128_add_16(partial4a, lines[7]);
  tmp = v128_add_16(lines[6], lines[7]);
  partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 4));
  partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 12));
  partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 10));
  partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 6));
  partial6 = v128_add_16(partial6, tmp);

  /* Compute costs in terms of partial sums. */
  partial4a =
      fold_mul_and_sum(partial4a, partial4b, v128_from_32(210, 280, 420, 840),
                       v128_from_32(105, 120, 140, 168));
  partial7a =
      fold_mul_and_sum(partial7a, partial7b, v128_from_32(210, 420, 0, 0),
                       v128_from_32(105, 105, 105, 140));
  partial5a =
      fold_mul_and_sum(partial5a, partial5b, v128_from_32(210, 420, 0, 0),
                       v128_from_32(105, 105, 105, 140));
  partial6 = v128_madd_s16(partial6, partial6);
  partial6 = v128_mullo_s32(partial6, v128_dup_32(105));

  partial4a = hsum4(partial4a, partial5a, partial6, partial7a);
  v128_store_unaligned(tmp_cost1, partial4a);
}
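
/* How the byte shifts above accumulate diagonals: v128_shl_n_byte /
   v128_shr_n_byte move whole 16-bit pixels (2 bytes per lane), so
   shifting line k one lane further than line k+1 lines the rows up along
   a 45-degree diagonal before the adds (partial4); shifting once per
   *pair* of lines gives the two half-slope directions (partial5 and
   partial7, mirrored); and no shift at all gives plain column sums
   (partial6). The a/b halves collect the lanes that fall off either end
   of the 128-bit register. (Interpretive note added in editing.) */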

/* transpose and reverse the order of the lines -- equivalent to a 90-degree
   counter-clockwise rotation of the pixels. */
static INLINE void array_reverse_transpose_8x8(v128 *in, v128 *res) {
  const v128 tr0_0 = v128_ziplo_16(in[1], in[0]);
  const v128 tr0_1 = v128_ziplo_16(in[3], in[2]);
  const v128 tr0_2 = v128_ziphi_16(in[1], in[0]);
  const v128 tr0_3 = v128_ziphi_16(in[3], in[2]);
  const v128 tr0_4 = v128_ziplo_16(in[5], in[4]);
  const v128 tr0_5 = v128_ziplo_16(in[7], in[6]);
  const v128 tr0_6 = v128_ziphi_16(in[5], in[4]);
  const v128 tr0_7 = v128_ziphi_16(in[7], in[6]);

  const v128 tr1_0 = v128_ziplo_32(tr0_1, tr0_0);
  const v128 tr1_1 = v128_ziplo_32(tr0_5, tr0_4);
  const v128 tr1_2 = v128_ziphi_32(tr0_1, tr0_0);
  const v128 tr1_3 = v128_ziphi_32(tr0_5, tr0_4);
  const v128 tr1_4 = v128_ziplo_32(tr0_3, tr0_2);
  const v128 tr1_5 = v128_ziplo_32(tr0_7, tr0_6);
  const v128 tr1_6 = v128_ziphi_32(tr0_3, tr0_2);
  const v128 tr1_7 = v128_ziphi_32(tr0_7, tr0_6);

  res[7] = v128_ziplo_64(tr1_1, tr1_0);
  res[6] = v128_ziphi_64(tr1_1, tr1_0);
  res[5] = v128_ziplo_64(tr1_3, tr1_2);
  res[4] = v128_ziphi_64(tr1_3, tr1_2);
  res[3] = v128_ziplo_64(tr1_5, tr1_4);
  res[2] = v128_ziphi_64(tr1_5, tr1_4);
  res[1] = v128_ziplo_64(tr1_7, tr1_6);
  res[0] = v128_ziphi_64(tr1_7, tr1_6);
}

int SIMD_FUNC(od_dir_find8)(const od_dering_in *img, int stride, int32_t *var,
                            int coeff_shift) {
  int i;
  int32_t cost[8];
  int32_t best_cost = 0;
  int best_dir = 0;
  v128 lines[8];
  for (i = 0; i < 8; i++) {
    lines[i] = v128_load_unaligned(&img[i * stride]);
    lines[i] =
        v128_sub_16(v128_shr_s16(lines[i], coeff_shift), v128_dup_16(128));
  }

  /* Compute "mostly vertical" directions. */
  compute_directions(lines, cost + 4);

  array_reverse_transpose_8x8(lines, lines);

  /* Compute "mostly horizontal" directions. */
  compute_directions(lines, cost);

  for (i = 0; i < 8; i++) {
    if (cost[i] > best_cost) {
      best_cost = cost[i];
      best_dir = i;
    }
  }

  /* Difference between the optimal variance and the variance along the
     orthogonal direction. Again, the sum(x^2) terms cancel out. */
  *var = best_cost - cost[(best_dir + 4) & 7];
  /* We'd normally divide by 840, but dividing by 1024 is close enough
     for what we're going to do with this. */
  *var >>= 10;
  return best_dir;
}

static INLINE v128 od_cmplt_abs_epi16(v128 in, v128 threshold) {
  return v128_cmplt_s16(v128_abs_s16(in), threshold);
}
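
/* In the two filter kernels below the threshold test is branchless:
   od_cmplt_abs_epi16() returns an all-ones mask where |p| < threshold,
   and the following v128_and() keeps or zeroes each tap's contribution.
   The 4x4 kernel applies taps {4, 1} at distances off1/off2 along the
   chosen direction (the shift left by 2 is the multiply by 4); the 8x8
   kernel applies taps {3, 2, 1} at off1/off2/off3 (p plus 2p, then 2p,
   then p). */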

int SIMD_FUNC(od_filter_dering_direction_4x4)(uint16_t *y, int ystride,
                                              const uint16_t *in, int threshold,
                                              int dir) {
  int i;
  v128 sum;
  v128 p;
  v128 cmp;
  v128 row;
  v128 res;
  v128 tmp;
  v128 thresh;
  v128 total_abs;
  int off1, off2;
  off1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
  off2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
  total_abs = v128_zero();
  thresh = v128_dup_16(threshold);
  for (i = 0; i < 4; i += 2) {
    sum = v128_zero();
    row = v128_from_v64(v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE]),
                        v64_load_aligned(&in[i * OD_FILT_BSTRIDE]));

    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
    tmp = v128_from_v64(v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE + off1]),
                        v64_load_aligned(&in[i * OD_FILT_BSTRIDE + off1]));
    p = v128_sub_16(tmp, row);
    /*if (abs(p) < thresh) sum += taps[k]*p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = v128_shl_n_16(p, 2);
    p = v128_and(p, cmp);
    sum = v128_add_16(sum, p);
    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
    tmp = v128_from_v64(v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE - off1]),
                        v64_load_aligned(&in[i * OD_FILT_BSTRIDE - off1]));
    p = v128_sub_16(tmp, row);
    /*if (abs(p) < thresh) sum += taps[k]*p1*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = v128_shl_n_16(p, 2);
    p = v128_and(p, cmp);
    sum = v128_add_16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
    tmp = v128_from_v64(v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE + off2]),
                        v64_load_aligned(&in[i * OD_FILT_BSTRIDE + off2]));
    p = v128_sub_16(tmp, row);
    /*if (abs(p) < thresh) sum += taps[k]*p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = v128_and(p, cmp);
    sum = v128_add_16(sum, p);
    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
    tmp = v128_from_v64(v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE - off2]),
                        v64_load_aligned(&in[i * OD_FILT_BSTRIDE - off2]));
    p = v128_sub_16(tmp, row);
    /*if (abs(p) < thresh) sum += taps[k]*p1*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = v128_and(p, cmp);
    sum = v128_add_16(sum, p);

    /*res = row + ((sum + 8) >> 4)*/
    res = v128_add_16(sum, v128_dup_16(8));
    res = v128_shr_n_s16(res, 4);
    total_abs = v128_add_16(total_abs, v128_abs_s16(res));
    res = v128_add_16(row, res);
    v64_store_aligned(&y[i * ystride], v128_low_v64(res));
    v64_store_aligned(&y[(i + 1) * ystride], v128_high_v64(res));
  }
  return (v128_dotp_s16(total_abs, v128_dup_16(1)) + 2) >> 2;
}

int SIMD_FUNC(od_filter_dering_direction_8x8)(uint16_t *y, int ystride,
                                              const uint16_t *in, int threshold,
                                              int dir) {
  int i;
  v128 sum;
  v128 p;
  v128 cmp;
  v128 row;
  v128 res;
  v128 thresh;
  v128 total_abs;
  int off1, off2, off3;
  off1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
  off2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
  off3 = OD_DIRECTION_OFFSETS_TABLE[dir][2];
  total_abs = v128_zero();
  thresh = v128_dup_16(threshold);
  for (i = 0; i < 8; i++) {
    sum = v128_zero();
    row = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE]);

    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
    p = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + off1]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = v128_add_16(p, v128_shl_n_16(p, 1));
    p = v128_and(p, cmp);
    sum = v128_add_16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
    p = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - off1]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p1*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = v128_add_16(p, v128_shl_n_16(p, 1));
    p = v128_and(p, cmp);
    sum = v128_add_16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
    p = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + off2]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = v128_shl_n_16(p, 1);
    p = v128_and(p, cmp);
    sum = v128_add_16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
    p = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - off2]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p1*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = v128_shl_n_16(p, 1);
    p = v128_and(p, cmp);
    sum = v128_add_16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
    p = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + off3]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = v128_and(p, cmp);
    sum = v128_add_16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
    p = v128_sub_16(v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - off3]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p1*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = v128_and(p, cmp);
    sum = v128_add_16(sum, p);

    /*res = row + ((sum + 8) >> 4)*/
    res = v128_add_16(sum, v128_dup_16(8));
    res = v128_shr_n_s16(res, 4);
    total_abs = v128_add_16(total_abs, v128_abs_s16(res));
    res = v128_add_16(row, res);
    v128_store_unaligned(&y[i * ystride], res);
  }
  return (v128_dotp_s16(total_abs, v128_dup_16(1)) + 8) >> 4;
}
@@ -0,0 +1,14 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "aom_dsp/aom_simd.h"
#define SIMD_FUNC(name) name##_sse2
#include "./od_dering_simd.h"
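
Each of these thin wrappers instantiates the shared kernel under an
ISA-specific name via token pasting; with the define above, the kernel's

  int SIMD_FUNC(od_dir_find8)(...)

preprocesses to

  int od_dir_find8_sse2(...)

which is exactly the symbol the specialize lines in av1_rtcd.pl expect.
The sse4_1 and ssse3 wrappers below, and the neon one earlier, differ
only in the suffix.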
@@ -0,0 +1,14 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "aom_dsp/aom_simd.h"
#define SIMD_FUNC(name) name##_sse4_1
#include "./od_dering_simd.h"
@@ -0,0 +1,14 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "aom_dsp/aom_simd.h"
#define SIMD_FUNC(name) name##_ssse3
#include "./od_dering_simd.h"
@@ -1,387 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <smmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>

#include "./av1_rtcd.h"
#include "av1/common/x86/od_dering_sse4.h"

/* partial A is a 16-bit vector of the form:
   [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
   [0  y1 y2 y3 y4 y5 y6 y7].
   This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
   (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
   and const2. */
static INLINE __m128i fold_mul_and_sum(__m128i partiala, __m128i partialb,
                                       __m128i const1, __m128i const2) {
  __m128i tmp;
  /* Reverse partial B. */
  partialb = _mm_shuffle_epi8(
      partialb,
      _mm_set_epi8(15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12));
  /* Interleave the x and y values of identical indices and pair x8 with 0. */
  tmp = partiala;
  partiala = _mm_unpacklo_epi16(partiala, partialb);
  partialb = _mm_unpackhi_epi16(tmp, partialb);
  /* Square and add the corresponding x and y values. */
  partiala = _mm_madd_epi16(partiala, partiala);
  partialb = _mm_madd_epi16(partialb, partialb);
  /* Multiply by constant. */
  partiala = _mm_mullo_epi32(partiala, const1);
  partialb = _mm_mullo_epi32(partialb, const2);
  /* Sum all results. */
  partiala = _mm_add_epi32(partiala, partialb);
  return partiala;
}

static INLINE __m128i hsum4(__m128i x0, __m128i x1, __m128i x2, __m128i x3) {
  __m128i t0, t1, t2, t3;
  t0 = _mm_unpacklo_epi32(x0, x1);
  t1 = _mm_unpacklo_epi32(x2, x3);
  t2 = _mm_unpackhi_epi32(x0, x1);
  t3 = _mm_unpackhi_epi32(x2, x3);
  x0 = _mm_unpacklo_epi64(t0, t1);
  x1 = _mm_unpackhi_epi64(t0, t1);
  x2 = _mm_unpacklo_epi64(t2, t3);
  x3 = _mm_unpackhi_epi64(t2, t3);
  return _mm_add_epi32(_mm_add_epi32(x0, x1), _mm_add_epi32(x2, x3));
}

/* Horizontal sum of 8x16-bit unsigned values. */
static INLINE int32_t hsum_epi16(__m128i a) {
  a = _mm_madd_epi16(a, _mm_set1_epi16(1));
  a = _mm_hadd_epi32(a, a);
  a = _mm_hadd_epi32(a, a);
  return _mm_cvtsi128_si32(a);
}

/* Computes cost for directions 0, 5, 6 and 7. We can call this function again
   to compute the remaining directions. */
static INLINE __m128i compute_directions(__m128i lines[8],
                                         int32_t tmp_cost1[4]) {
  __m128i partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
  __m128i partial6;
  __m128i tmp;
  /* Partial sums for lines 0 and 1. */
  partial4a = _mm_slli_si128(lines[0], 14);
  partial4b = _mm_srli_si128(lines[0], 2);
  partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[1], 12));
  partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[1], 4));
  tmp = _mm_add_epi16(lines[0], lines[1]);
  partial5a = _mm_slli_si128(tmp, 10);
  partial5b = _mm_srli_si128(tmp, 6);
  partial7a = _mm_slli_si128(tmp, 4);
  partial7b = _mm_srli_si128(tmp, 12);
  partial6 = tmp;

  /* Partial sums for lines 2 and 3. */
  partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[2], 10));
  partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[2], 6));
  partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[3], 8));
  partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[3], 8));
  tmp = _mm_add_epi16(lines[2], lines[3]);
  partial5a = _mm_add_epi16(partial5a, _mm_slli_si128(tmp, 8));
  partial5b = _mm_add_epi16(partial5b, _mm_srli_si128(tmp, 8));
  partial7a = _mm_add_epi16(partial7a, _mm_slli_si128(tmp, 6));
  partial7b = _mm_add_epi16(partial7b, _mm_srli_si128(tmp, 10));
  partial6 = _mm_add_epi16(partial6, tmp);

  /* Partial sums for lines 4 and 5. */
  partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[4], 6));
  partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[4], 10));
  partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[5], 4));
  partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[5], 12));
  tmp = _mm_add_epi16(lines[4], lines[5]);
  partial5a = _mm_add_epi16(partial5a, _mm_slli_si128(tmp, 6));
  partial5b = _mm_add_epi16(partial5b, _mm_srli_si128(tmp, 10));
  partial7a = _mm_add_epi16(partial7a, _mm_slli_si128(tmp, 8));
  partial7b = _mm_add_epi16(partial7b, _mm_srli_si128(tmp, 8));
  partial6 = _mm_add_epi16(partial6, tmp);

  /* Partial sums for lines 6 and 7. */
  partial4a = _mm_add_epi16(partial4a, _mm_slli_si128(lines[6], 2));
  partial4b = _mm_add_epi16(partial4b, _mm_srli_si128(lines[6], 14));
  partial4a = _mm_add_epi16(partial4a, lines[7]);
  tmp = _mm_add_epi16(lines[6], lines[7]);
  partial5a = _mm_add_epi16(partial5a, _mm_slli_si128(tmp, 4));
  partial5b = _mm_add_epi16(partial5b, _mm_srli_si128(tmp, 12));
  partial7a = _mm_add_epi16(partial7a, _mm_slli_si128(tmp, 10));
  partial7b = _mm_add_epi16(partial7b, _mm_srli_si128(tmp, 6));
  partial6 = _mm_add_epi16(partial6, tmp);

  /* Compute costs in terms of partial sums. */
  partial4a =
      fold_mul_and_sum(partial4a, partial4b, _mm_set_epi32(210, 280, 420, 840),
                       _mm_set_epi32(105, 120, 140, 168));
  partial7a =
      fold_mul_and_sum(partial7a, partial7b, _mm_set_epi32(210, 420, 0, 0),
                       _mm_set_epi32(105, 105, 105, 140));
  partial5a =
      fold_mul_and_sum(partial5a, partial5b, _mm_set_epi32(210, 420, 0, 0),
                       _mm_set_epi32(105, 105, 105, 140));
  partial6 = _mm_madd_epi16(partial6, partial6);
  partial6 = _mm_mullo_epi32(partial6, _mm_set1_epi32(105));

  partial4a = hsum4(partial4a, partial5a, partial6, partial7a);
  _mm_storeu_si128((__m128i *)tmp_cost1, partial4a);
  return partial4a;
}

/* transpose and reverse the order of the lines -- equivalent to a 90-degree
   counter-clockwise rotation of the pixels. */
static INLINE void array_reverse_transpose_8x8(__m128i *in, __m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);

  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);

  res[7] = _mm_unpacklo_epi64(tr1_0, tr1_1);
  res[6] = _mm_unpackhi_epi64(tr1_0, tr1_1);
  res[5] = _mm_unpacklo_epi64(tr1_2, tr1_3);
  res[4] = _mm_unpackhi_epi64(tr1_2, tr1_3);
  res[3] = _mm_unpacklo_epi64(tr1_4, tr1_5);
  res[2] = _mm_unpackhi_epi64(tr1_4, tr1_5);
  res[1] = _mm_unpacklo_epi64(tr1_6, tr1_7);
  res[0] = _mm_unpackhi_epi64(tr1_6, tr1_7);
}

int od_dir_find8_sse4_1(const od_dering_in *img, int stride, int32_t *var,
                        int coeff_shift) {
  int i;
  int32_t cost[8];
  int32_t best_cost = 0;
  int best_dir = 0;
  __m128i lines[8];
  __m128i dir03, dir47;
  __m128i max;
  for (i = 0; i < 8; i++) {
    lines[i] = _mm_loadu_si128((__m128i *)&img[i * stride]);
    lines[i] = _mm_sub_epi16(_mm_srai_epi16(lines[i], coeff_shift),
                             _mm_set1_epi16(128));
  }

  /* Compute "mostly vertical" directions. */
  dir47 = compute_directions(lines, cost + 4);

  array_reverse_transpose_8x8(lines, lines);

  /* Compute "mostly horizontal" directions. */
  dir03 = compute_directions(lines, cost);

#if 1
  max = _mm_max_epi32(dir03, dir47);
  max = _mm_max_epi32(max, _mm_shuffle_epi32(max, _MM_SHUFFLE(1, 0, 3, 2)));
  max = _mm_max_epi32(max, _mm_shuffle_epi32(max, _MM_SHUFFLE(2, 3, 0, 1)));
  dir03 = _mm_and_si128(_mm_cmpeq_epi32(max, dir03),
                        _mm_setr_epi32(-1, -2, -3, -4));
  dir47 = _mm_and_si128(_mm_cmpeq_epi32(max, dir47),
                        _mm_setr_epi32(-5, -6, -7, -8));
  dir03 = _mm_max_epu32(dir03, dir47);
  dir03 = _mm_max_epu32(dir03, _mm_unpackhi_epi64(dir03, dir03));
  dir03 =
      _mm_max_epu32(dir03, _mm_shufflelo_epi16(dir03, _MM_SHUFFLE(1, 0, 3, 2)));
  dir03 = _mm_xor_si128(dir03, _mm_set1_epi32(0xFFFFFFFF));

  best_dir = _mm_cvtsi128_si32(dir03);
  best_cost = _mm_cvtsi128_si32(max);
#else
  for (i = 0; i < 8; i++) {
    if (cost[i] > best_cost) {
      best_cost = cost[i];
      best_dir = i;
    }
  }
#endif
  /* Difference between the optimal variance and the variance along the
     orthogonal direction. Again, the sum(x^2) terms cancel out. */
  *var = best_cost - cost[(best_dir + 4) & 7];
  /* We'd normally divide by 840, but dividing by 1024 is close enough
     for what we're going to do with this. */
  *var >>= 10;
  return best_dir;
}

static INLINE __m128i od_cmplt_abs_epi16(__m128i in, __m128i threshold) {
  return _mm_cmplt_epi16(_mm_abs_epi16(in), threshold);
}

int od_filter_dering_direction_4x4_sse4_1(uint16_t *y, int ystride,
                                          const uint16_t *in, int threshold,
                                          int dir) {
  int i;
  __m128i sum;
  __m128i p;
  __m128i cmp;
  __m128i row;
  __m128i res;
  __m128i tmp;
  __m128i thresh;
  __m128i total_abs;
  int off1, off2;
  off1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
  off2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
  total_abs = _mm_setzero_si128();
  thresh = _mm_set1_epi16(threshold);
  for (i = 0; i < 4; i += 2) {
    sum = _mm_set1_epi16(0);
    row = _mm_unpacklo_epi64(
        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE]),
        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE]));

    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
    tmp = _mm_unpacklo_epi64(
        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE + off1]),
        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE + off1]));
    p = _mm_sub_epi16(tmp, row);
    /*if (abs(p) < thresh) sum += taps[k]*p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_slli_epi16(p, 2);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);
    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
    tmp = _mm_unpacklo_epi64(
        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE - off1]),
        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE - off1]));
    p = _mm_sub_epi16(tmp, row);
    /*if (abs(p) < thresh) sum += taps[k]*p1*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_slli_epi16(p, 2);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
    tmp = _mm_unpacklo_epi64(
        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE + off2]),
        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE + off2]));
    p = _mm_sub_epi16(tmp, row);
    /*if (abs(p) < thresh) sum += taps[k]*p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);
    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
    tmp = _mm_unpacklo_epi64(
        _mm_loadl_epi64((__m128i *)&in[i * OD_FILT_BSTRIDE - off2]),
        _mm_loadl_epi64((__m128i *)&in[(i + 1) * OD_FILT_BSTRIDE - off2]));
    p = _mm_sub_epi16(tmp, row);
    /*if (abs(p) < thresh) sum += taps[k]*p1*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*res = row + ((sum + 8) >> 4)*/
    res = _mm_add_epi16(sum, _mm_set1_epi16(8));
    res = _mm_srai_epi16(res, 4);
    total_abs = _mm_add_epi16(total_abs, _mm_abs_epi16(res));
    res = _mm_add_epi16(row, res);
    _mm_storel_epi64((__m128i *)&y[i * ystride], res);
    _mm_storel_epi64((__m128i *)&y[(i + 1) * ystride],
                     _mm_unpackhi_epi64(res, res));
  }
  return (hsum_epi16(total_abs) + 2) >> 2;
}

int od_filter_dering_direction_8x8_sse4_1(uint16_t *y, int ystride,
                                          const uint16_t *in, int threshold,
                                          int dir) {
  int i;
  __m128i sum;
  __m128i p;
  __m128i cmp;
  __m128i row;
  __m128i res;
  __m128i thresh;
  __m128i total_abs;
  int off1, off2, off3;
  off1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
  off2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
  off3 = OD_DIRECTION_OFFSETS_TABLE[dir][2];
  total_abs = _mm_setzero_si128();
  thresh = _mm_set1_epi16(threshold);
  for (i = 0; i < 8; i++) {
    sum = _mm_set1_epi16(0);
    row = _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE]);

    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
    p = _mm_sub_epi16(
        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE + off1]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_add_epi16(p, _mm_slli_epi16(p, 1));
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
    p = _mm_sub_epi16(
        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE - off1]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p1*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_add_epi16(p, _mm_slli_epi16(p, 1));
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
    p = _mm_sub_epi16(
        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE + off2]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_slli_epi16(p, 1);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
    p = _mm_sub_epi16(
        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE - off2]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p1*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_slli_epi16(p, 1);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE + offset] - row*/
    p = _mm_sub_epi16(
        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE + off3]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*p = in[i*OD_FILT_BSTRIDE - offset] - row*/
    p = _mm_sub_epi16(
        _mm_loadu_si128((__m128i *)&in[i * OD_FILT_BSTRIDE - off3]), row);
    /*if (abs(p) < thresh) sum += taps[k]*p1*/
    cmp = od_cmplt_abs_epi16(p, thresh);
    p = _mm_and_si128(p, cmp);
    sum = _mm_add_epi16(sum, p);

    /*res = row + ((sum + 8) >> 4)*/
    res = _mm_add_epi16(sum, _mm_set1_epi16(8));
    res = _mm_srai_epi16(res, 4);
    total_abs = _mm_add_epi16(total_abs, _mm_abs_epi16(res));
    res = _mm_add_epi16(row, res);
    _mm_storeu_si128((__m128i *)&y[i * ystride], res);
  }
  return (hsum_epi16(total_abs) + 8) >> 4;
}
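
A side-by-side reading of the deleted file against the new
od_dering_simd.h shows why the hand-written version could be retired: the
intrinsics map nearly one-to-one onto the portable v128 layer
(_mm_madd_epi16 to v128_madd_s16, _mm_slli_si128 to v128_shl_n_byte, and
_mm_unpacklo_epi16(a, b) to v128_ziplo_16(b, a) with the operand order
swapped). The one piece with no counterpart is the #if 1 vectorised
argmax above; the generic kernel keeps the scalar best-cost loop instead.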
@@ -1,263 +0,0 @@
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "./aom_dsp_rtcd.h"
#include "aom_dsp/aom_simd.h"
#include "aom_ports/mem.h"
#include "aom_ports/bitops.h"
#include "av1/common/clpf_simd_kernel.h"

SIMD_INLINE void clip_sides(v128 *c, v128 *d, v128 *e, v128 *f, int left,
                            int right) {
  DECLARE_ALIGNED(16, static const uint64_t,
                  c_shuff[]) = { 0x0504030201000000LL, 0x0d0c0b0a09080808LL };
  DECLARE_ALIGNED(16, static const uint64_t,
                  d_shuff[]) = { 0x0605040302010000LL, 0x0e0d0c0b0a090808LL };
  DECLARE_ALIGNED(16, static const uint64_t,
                  e_shuff[]) = { 0x0707060504030201LL, 0x0f0f0e0d0c0b0a09LL };
  DECLARE_ALIGNED(16, static const uint64_t,
                  f_shuff[]) = { 0x0707070605040302LL, 0x0f0f0f0e0d0c0b0aLL };

  if (!left) {  // Left clipping
    *c = v128_shuffle_8(*c, v128_load_aligned(c_shuff));
    *d = v128_shuffle_8(*d, v128_load_aligned(d_shuff));
  }
  if (!right) {  // Right clipping
    *e = v128_shuffle_8(*e, v128_load_aligned(e_shuff));
    *f = v128_shuffle_8(*f, v128_load_aligned(f_shuff));
  }
}
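
/* The shuffle constants above do border clamping by index duplication:
   when there is no block to the left, c_shuff/d_shuff repeat each row's
   first byte index in place of the missing neighbours, and e_shuff/f_shuff
   do the same with the last index on the right edge, so the filter taps
   read the edge pixel instead of out-of-block data. (Interpretive note
   added in editing.) */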

SIMD_INLINE void read_two_lines(const uint8_t *rec, const uint8_t *org,
                                int rstride, int ostride, int x0, int y0,
                                int bottom, int right, int y, v128 *o, v128 *r,
                                v128 *a, v128 *b, v128 *c, v128 *d, v128 *e,
                                v128 *f, v128 *g, v128 *h) {
  const v64 k1 = v64_load_aligned(org);
  const v64 k2 = v64_load_aligned(org + ostride);
  const v64 l1 = v64_load_aligned(rec);
  const v64 l2 = v64_load_aligned(rec + rstride);
  const v64 l3 = v64_load_aligned(rec - (y != -y0) * rstride);
  const v64 l4 = v64_load_aligned(rec + ((y != bottom) + 1) * rstride);
  *o = v128_from_v64(k1, k2);
  *r = v128_from_v64(l1, l2);
  *a = v128_from_v64(v64_load_aligned(rec - 2 * (y != -y0) * rstride), l3);
  *b = v128_from_v64(l3, l1);
  *g = v128_from_v64(l2, l4);
  *h = v128_from_v64(l4,
                     v64_load_aligned(rec + (2 * (y != bottom) + 1) * rstride));
  *c = v128_from_v64(v64_load_unaligned(rec - 2 * !!x0),
                     v64_load_unaligned(rec - 2 * !!x0 + rstride));
  *d = v128_from_v64(v64_load_unaligned(rec - !!x0),
                     v64_load_unaligned(rec - !!x0 + rstride));
  *e = v128_from_v64(v64_load_unaligned(rec + !!right),
                     v64_load_unaligned(rec + !!right + rstride));
  *f = v128_from_v64(v64_load_unaligned(rec + 2 * !!right),
                     v64_load_unaligned(rec + 2 * !!right + rstride));
  clip_sides(c, d, e, f, x0, right);
}

void SIMD_FUNC(aom_clpf_detect)(const uint8_t *rec, const uint8_t *org,
                                int rstride, int ostride, int x0, int y0,
                                int width, int height, int *sum0, int *sum1,
                                unsigned int strength, int size,
                                unsigned int dmp) {
  const int bottom = height - 2 - y0;
  const int right = width - 8 - x0;
  ssd128_internal ssd0 = v128_ssd_u8_init();
  ssd128_internal ssd1 = v128_ssd_u8_init();
  int y;

  if (size != 8) {  // Fallback to plain C
    aom_clpf_detect_c(rec, org, rstride, ostride, x0, y0, width, height, sum0,
                      sum1, strength, size, dmp);
    return;
  }

  rec += x0 + y0 * rstride;
  org += x0 + y0 * ostride;

  for (y = 0; y < 8; y += 2) {
    v128 a, b, c, d, e, f, g, h, o, r;
    read_two_lines(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o, &r,
                   &a, &b, &c, &d, &e, &f, &g, &h);
    ssd0 = v128_ssd_u8(ssd0, o, r);
    ssd1 = v128_ssd_u8(ssd1, o,
                       calc_delta(r, a, b, c, d, e, f, g, h, strength, dmp));
    rec += rstride * 2;
    org += ostride * 2;
  }
  *sum0 += v128_ssd_u8_sum(ssd0);
  *sum1 += v128_ssd_u8_sum(ssd1);
}

SIMD_INLINE void calc_delta_multi(v128 r, v128 o, v128 a, v128 b, v128 c,
                                  v128 d, v128 e, v128 f, v128 g, v128 h,
                                  ssd128_internal *ssd1, ssd128_internal *ssd2,
                                  ssd128_internal *ssd3, unsigned int dmp) {
  *ssd1 = v128_ssd_u8(*ssd1, o, calc_delta(r, a, b, c, d, e, f, g, h, 1, dmp));
  *ssd2 = v128_ssd_u8(*ssd2, o, calc_delta(r, a, b, c, d, e, f, g, h, 2, dmp));
  *ssd3 = v128_ssd_u8(*ssd3, o, calc_delta(r, a, b, c, d, e, f, g, h, 4, dmp));
}

// Test multiple filter strengths at once.
void SIMD_FUNC(aom_clpf_detect_multi)(const uint8_t *rec, const uint8_t *org,
                                      int rstride, int ostride, int x0, int y0,
                                      int width, int height, int *sum, int size,
                                      unsigned int dmp) {
  const int bottom = height - 2 - y0;
  const int right = width - 8 - x0;
  ssd128_internal ssd0 = v128_ssd_u8_init();
  ssd128_internal ssd1 = v128_ssd_u8_init();
  ssd128_internal ssd2 = v128_ssd_u8_init();
  ssd128_internal ssd3 = v128_ssd_u8_init();
  int y;

  if (size != 8) {  // Fallback to plain C
    aom_clpf_detect_multi_c(rec, org, rstride, ostride, x0, y0, width, height,
                            sum, size, dmp);
    return;
  }

  rec += x0 + y0 * rstride;
  org += x0 + y0 * ostride;

  for (y = 0; y < 8; y += 2) {
    v128 a, b, c, d, e, f, g, h, o, r;
    read_two_lines(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o, &r,
                   &a, &b, &c, &d, &e, &f, &g, &h);
    ssd0 = v128_ssd_u8(ssd0, o, r);
    calc_delta_multi(r, o, a, b, c, d, e, f, g, h, &ssd1, &ssd2, &ssd3, dmp);
    rec += 2 * rstride;
    org += 2 * ostride;
  }
  sum[0] += v128_ssd_u8_sum(ssd0);
  sum[1] += v128_ssd_u8_sum(ssd1);
  sum[2] += v128_ssd_u8_sum(ssd2);
  sum[3] += v128_ssd_u8_sum(ssd3);
}

#if CONFIG_AOM_HIGHBITDEPTH
SIMD_INLINE void read_two_lines_hbd(const uint16_t *rec, const uint16_t *org,
                                    int rstride, int ostride, int x0, int y0,
                                    int bottom, int right, int y, v128 *o,
                                    v128 *r, v128 *a, v128 *b, v128 *c, v128 *d,
                                    v128 *e, v128 *f, v128 *g, v128 *h,
                                    int shift) {
  const v128 k1 = v128_shr_u16(v128_load_aligned(org), shift);
  const v128 k2 = v128_shr_u16(v128_load_aligned(org + ostride), shift);
  const v128 l1 = v128_shr_u16(v128_load_aligned(rec), shift);
  const v128 l2 = v128_shr_u16(v128_load_aligned(rec + rstride), shift);
  const v128 l3 =
      v128_shr_u16(v128_load_aligned(rec - (y != -y0) * rstride), shift);
  const v128 l4 = v128_shr_u16(
      v128_load_aligned(rec + ((y != bottom) + 1) * rstride), shift);
  *o = v128_unziplo_8(k1, k2);
  *r = v128_unziplo_8(l1, l2);
  *a = v128_unziplo_8(
      v128_shr_u16(v128_load_aligned(rec - 2 * (y != -y0) * rstride), shift),
      l3);
  *b = v128_unziplo_8(l3, l1);
  *g = v128_unziplo_8(l2, l4);
  *h = v128_unziplo_8(
      l4,
      v128_shr_u16(v128_load_unaligned(rec + (2 * (y != bottom) + 1) * rstride),
                   shift));
  *c = v128_unziplo_8(
      v128_shr_u16(v128_load_unaligned(rec - 2 * !!x0), shift),
      v128_shr_u16(v128_load_unaligned(rec - 2 * !!x0 + rstride), shift));
  *d = v128_unziplo_8(
      v128_shr_u16(v128_load_unaligned(rec - !!x0), shift),
      v128_shr_u16(v128_load_unaligned(rec - !!x0 + rstride), shift));
  *e = v128_unziplo_8(
      v128_shr_u16(v128_load_unaligned(rec + !!right), shift),
      v128_shr_u16(v128_load_unaligned(rec + !!right + rstride), shift));
  *f = v128_unziplo_8(
      v128_shr_u16(v128_load_unaligned(rec + 2 * !!right), shift),
      v128_shr_u16(v128_load_unaligned(rec + 2 * !!right + rstride), shift));
  clip_sides(c, d, e, f, x0, right);
}

void SIMD_FUNC(aom_clpf_detect_hbd)(const uint16_t *rec, const uint16_t *org,
                                    int rstride, int ostride, int x0, int y0,
                                    int width, int height, int *sum0, int *sum1,
                                    unsigned int strength, int size,
                                    unsigned int bitdepth,
                                    unsigned int damping) {
  const int shift = bitdepth - 8;
  const int bottom = height - 2 - y0;
  const int right = width - 8 - x0;
  ssd128_internal ssd0 = v128_ssd_u8_init();
  ssd128_internal ssd1 = v128_ssd_u8_init();
  int y;

  if (size != 8) {  // Fallback to plain C
    aom_clpf_detect_hbd_c(rec, org, rstride, ostride, x0, y0, width, height,
                          sum0, sum1, strength, size, bitdepth, damping);
    return;
  }

  rec += x0 + y0 * rstride;
  org += x0 + y0 * ostride;

  for (y = 0; y < 8; y += 2) {
    v128 a, b, c, d, e, f, g, h, o, r;
    read_two_lines_hbd(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o,
                       &r, &a, &b, &c, &d, &e, &f, &g, &h, shift);
    ssd0 = v128_ssd_u8(ssd0, o, r);
    ssd1 = v128_ssd_u8(ssd1, o, calc_delta(r, a, b, c, d, e, f, g, h,
                                           strength >> shift, damping));
    rec += rstride * 2;
    org += ostride * 2;
  }
  *sum0 += v128_ssd_u8_sum(ssd0);
  *sum1 += v128_ssd_u8_sum(ssd1);
}

void SIMD_FUNC(aom_clpf_detect_multi_hbd)(const uint16_t *rec,
                                          const uint16_t *org, int rstride,
                                          int ostride, int x0, int y0,
                                          int width, int height, int *sum,
                                          int size, unsigned int bitdepth,
                                          unsigned int damping) {
  const int bottom = height - 2 - y0;
  const int right = width - 8 - x0;
  ssd128_internal ssd0 = v128_ssd_u8_init();
  ssd128_internal ssd1 = v128_ssd_u8_init();
  ssd128_internal ssd2 = v128_ssd_u8_init();
  ssd128_internal ssd3 = v128_ssd_u8_init();
  int y;

  if (size != 8) {  // Fallback to plain C
    aom_clpf_detect_multi_hbd_c(rec, org, rstride, ostride, x0, y0, width,
                                height, sum, size, bitdepth, damping);
    return;
  }

  rec += x0 + y0 * rstride;
  org += x0 + y0 * ostride;

  for (y = 0; y < 8; y += 2) {
    v128 a, b, c, d, e, f, g, h, o, r;
    read_two_lines_hbd(rec, org, rstride, ostride, x0, y0, bottom, right, y, &o,
                       &r, &a, &b, &c, &d, &e, &f, &g, &h, bitdepth - 8);
    ssd0 = v128_ssd_u8(ssd0, o, r);
    calc_delta_multi(r, o, a, b, c, d, e, f, g, h, &ssd1, &ssd2, &ssd3,
                     damping);
    rec += rstride * 2;
    org += ostride * 2;
  }
  sum[0] += v128_ssd_u8_sum(ssd0);
  sum[1] += v128_ssd_u8_sum(ssd1);
  sum[2] += v128_ssd_u8_sum(ssd2);
  sum[3] += v128_ssd_u8_sum(ssd3);
}
#endif
@@ -15,7 +15,7 @@
#include "third_party/googletest/src/googletest/include/gtest/gtest.h"

#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
#include "./av1_rtcd.h"
#include "aom_ports/aom_timer.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"