Merge "Do horizontal loopfiltering in parallel"

2013-11-18 10:03:41 -08:00 · 2013-11-18 10:03:41 -08:00 · e3168b0c54
--- a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
+++ b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
@ -0,0 +1,33 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+
+void vp9_loop_filter_horizontal_edge_16_neon(uint8_t *s, int p /* pitch */,
+                                             const uint8_t *blimit0,  /* blimit0/limit0/thresh0: passed to the call at s */
+                                             const uint8_t *limit0,
+                                             const uint8_t *thresh0,
+                                             const uint8_t *blimit1,  /* blimit1/limit1/thresh1: passed to the call at s + 8 */
+                                             const uint8_t *limit1,
+                                             const uint8_t *thresh1) {  /* No fused 16-wide code here: forwards to the existing filter twice. */
+  vp9_loop_filter_horizontal_edge(s, p, blimit0, limit0, thresh0, 1);  /* count = 1, starting at s */
+  vp9_loop_filter_horizontal_edge(s + 8, p, blimit1, limit1, thresh1, 1);  /* count = 1, starting 8 bytes further along */
+}
+
+void vp9_mbloop_filter_horizontal_edge_16_neon(uint8_t *s, int p /* pitch */,
+                                               const uint8_t *blimit0,  /* blimit0/limit0/thresh0: passed to the call at s */
+                                               const uint8_t *limit0,
+                                               const uint8_t *thresh0,
+                                               const uint8_t *blimit1,  /* blimit1/limit1/thresh1: passed to the call at s + 8 */
+                                               const uint8_t *limit1,
+                                               const uint8_t *thresh1) {  /* No fused 16-wide code here: forwards to the existing mbloop filter twice. */
+  vp9_mbloop_filter_horizontal_edge(s, p, blimit0, limit0, thresh0, 1);  /* count = 1, starting at s */
+  vp9_mbloop_filter_horizontal_edge(s + 8, p, blimit1, limit1, thresh1, 1);  /* count = 1, starting 8 bytes further along */
+}
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@ -390,19 +390,16 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
          // Next block's thresholds
          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);

-          // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering.
-          vp9_mbloop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim,
-                                            lfi->hev_thr, 1);
-          vp9_mbloop_filter_horizontal_edge(s + 8, pitch, lfin->mblim,
-                                            lfin->lim, lfin->hev_thr, 1);
+          vp9_mbloop_filter_horizontal_edge_16(s, pitch, lfi->mblim,
+                                               lfi->lim, lfi->hev_thr,
+                                               lfin->mblim, lfin->lim,
+                                               lfin->hev_thr);

          if ((mask_4x4_int & 3) == 3) {
-            // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering.
-            vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
-                                            lfi->lim, lfi->hev_thr, 1);
-            vp9_loop_filter_horizontal_edge(s + 8 + 4 * pitch, pitch,
-                                            lfin->mblim, lfin->lim,
-                                            lfin->hev_thr, 1);
+            vp9_loop_filter_horizontal_edge_16(s + 4 * pitch, pitch, lfi->mblim,
+                                               lfi->lim, lfi->hev_thr,
+                                               lfin->mblim, lfin->lim,
+                                               lfin->hev_thr);
          } else {
            if (mask_4x4_int & 1)
              vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
@ -426,19 +423,15 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
          // Next block's thresholds
          const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);

-          // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering.
-          vp9_loop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim,
-                                            lfi->hev_thr, 1);
-          vp9_loop_filter_horizontal_edge(s + 8, pitch, lfin->mblim, lfin->lim,
-                                            lfin->hev_thr, 1);
-
+          vp9_loop_filter_horizontal_edge_16(s, pitch, lfi->mblim,
+                                             lfi->lim, lfi->hev_thr,
+                                             lfin->mblim, lfin->lim,
+                                             lfin->hev_thr);
          if ((mask_4x4_int & 3) == 3) {
-            // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering.
-            vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
-                                            lfi->lim, lfi->hev_thr, 1);
-            vp9_loop_filter_horizontal_edge(s + 8 + 4 * pitch, pitch,
-                                            lfin->mblim, lfin->lim,
-                                            lfin->hev_thr, 1);
+            vp9_loop_filter_horizontal_edge_16(s + 4 * pitch, pitch, lfi->mblim,
+                                               lfi->lim, lfi->hev_thr,
+                                               lfin->mblim, lfin->lim,
+                                               lfin->hev_thr);
          } else {
            if (mask_4x4_int & 1)
              vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
--- a/vp9/common/vp9_loopfilter_filters.c
+++ b/vp9/common/vp9_loopfilter_filters.c
@ -121,6 +121,34 @@ void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int p /* pitch */,
  }
 }

+void vp9_loop_filter_horizontal_edge_16_c(uint8_t *s, int p /* pitch */,
+                                          const uint8_t *blimit0,  /* blimit0/limit0/thresh0: used for the first 8 positions */
+                                          const uint8_t *limit0,
+                                          const uint8_t *thresh0,
+                                          const uint8_t *blimit1,  /* blimit1/limit1/thresh1: used for the next 8 positions */
+                                          const uint8_t *limit1,
+                                          const uint8_t *thresh1) {  /* Runs the filter4 path over 16 consecutive byte positions starting at s. */
+  int i, j;  /* i selects the threshold set (two passes); j walks the 8 positions of one pass */
+  const uint8_t *blimit = blimit0;  /* the first pass starts with set 0 */
+  const uint8_t *limit = limit0;
+  const uint8_t *thresh = thresh0;
+
+  for (i = 0; i < 2; ++i) {  /* two passes of 8 positions each */
+    for (j = 0; j < 8; ++j) {  /* one byte position per iteration; s is advanced at the bottom */
+      const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];  /* four samples at negative pitch offsets */
+      const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];  /* four samples at s and positive pitch offsets */
+      const int8_t mask = filter_mask(*limit, *blimit,  /* helper defined outside this hunk */
+                                      p3, p2, p1, p0, q0, q1, q2, q3);
+      const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1);  /* helper defined outside this hunk */
+      filter4(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p);  /* receives pointers to p1, p0, q0, q1; presumably updates them in place - confirm in filter4 */
+      ++s;  /* s is not reset between passes, so the second pass continues at s + 8 */
+    }
+    blimit = blimit1;  /* switch to set 1 for the second pass (a harmless reassignment after it) */
+    limit = limit1;
+    thresh = thresh1;
+  }
+}
+
 void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch,
                                     const uint8_t *blimit,
                                     const uint8_t *limit,
@ -185,6 +213,37 @@ void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int p,
  }
 }

+void vp9_mbloop_filter_horizontal_edge_16_c(uint8_t *s, int p /* pitch */,
+                                            const uint8_t *blimit0,  /* blimit0/limit0/thresh0: used for the first 8 positions */
+                                            const uint8_t *limit0,
+                                            const uint8_t *thresh0,
+                                            const uint8_t *blimit1,  /* blimit1/limit1/thresh1: used for the next 8 positions */
+                                            const uint8_t *limit1,
+                                            const uint8_t *thresh1) {  /* Runs the filter8 path over 16 consecutive byte positions starting at s. */
+  int i, j;  /* i selects the threshold set (two passes); j walks the 8 positions of one pass */
+  const uint8_t *blimit = blimit0;  /* the first pass starts with set 0 */
+  const uint8_t *limit = limit0;
+  const uint8_t *thresh = thresh0;
+
+  for (i = 0; i < 2; ++i) {  /* two passes of 8 positions each */
+    for (j = 0; j < 8; ++j) {  /* one byte position per iteration; s is advanced at the bottom */
+      const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];  /* four samples at negative pitch offsets */
+      const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];  /* four samples at s and positive pitch offsets */
+
+      const int8_t mask = filter_mask(*limit, *blimit,  /* helper defined outside this hunk */
+                                      p3, p2, p1, p0, q0, q1, q2, q3);
+      const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1);  /* helper defined outside this hunk */
+      const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);  /* leading 1 is presumably the flatness threshold - confirm in flat_mask4 */
+      filter8(mask, hev, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,  /* receives pointers to all eight samples p3..q3 */
+                               s,         s + 1 * p, s + 2 * p, s + 3 * p);
+      ++s;  /* s is not reset between passes, so the second pass continues at s + 8 */
+    }
+    blimit = blimit1;  /* switch to set 1 for the second pass (a harmless reassignment after it) */
+    limit = limit1;
+    thresh = thresh1;
+  }
+}
+
 void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch,
                                       const uint8_t *blimit,
                                       const uint8_t *limit,
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@ -205,9 +205,15 @@ specialize vp9_mb_lpf_horizontal_edge_w sse2 avx2 neon dspr2
 prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
 specialize vp9_mbloop_filter_horizontal_edge sse2 neon dspr2

+prototype void vp9_mbloop_filter_horizontal_edge_16 "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"
+specialize vp9_mbloop_filter_horizontal_edge_16 sse2 neon
+
 prototype void vp9_loop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
 specialize vp9_loop_filter_horizontal_edge mmx neon dspr2

+prototype void vp9_loop_filter_horizontal_edge_16 "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"
+specialize vp9_loop_filter_horizontal_edge_16 sse2 neon
+
 #
 # post proc
 #
--- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
+++ b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@ -123,6 +123,7 @@ VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c

 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve_neon.c
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct16x16_neon.c
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_16_neon.c
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_avg_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM)