Replace copy_nxm_* with copy_rect8_* copies that require a width that is a multiple of 8

Change-Id: I8fb710b767a986c898fbef9e329f30bfb0a22dad
2017-04-01 03:28:13 -07:00 · 2017-04-01 03:28:13 -07:00 · 131a0d5519
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@ -633,8 +633,8 @@ if (aom_config("CONFIG_CDEF") eq "yes") {
  add_proto qw/void copy_4x4_16bit_to_8bit/, "uint8_t *dst, int dstride, const uint16_t *src, int sstride";
  add_proto qw/void copy_8x8_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride";
  add_proto qw/void copy_4x4_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride";
-  add_proto qw/void copy_nxm_8bit_to_16bit/, "uint16_t *dst, int dstride, const uint8_t *src, int sstride, int n, int m";
-  add_proto qw/void copy_nxm_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride, int n, int m";
+  add_proto qw/void copy_rect8_8bit_to_16bit/, "uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h";
+  add_proto qw/void copy_rect8_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h";

 # VS compiling for 32 bit targets does not support vector types in
  # structs as arguments, which makes the v256 type of the intrinsics
@ -652,8 +652,8 @@ if (aom_config("CONFIG_CDEF") eq "yes") {
    specialize qw/copy_4x4_16bit_to_8bit sse2 ssse3 sse4_1 neon/;
    specialize qw/copy_8x8_16bit_to_16bit sse2 ssse3 sse4_1 neon/;
    specialize qw/copy_4x4_16bit_to_16bit sse2 ssse3 sse4_1 neon/;
-    specialize qw/copy_nxm_8bit_to_16bit sse2 ssse3 sse4_1 neon/;
-    specialize qw/copy_nxm_16bit_to_16bit sse2 ssse3 sse4_1 neon/;
+    specialize qw/copy_rect8_8bit_to_16bit sse2 ssse3 sse4_1 neon/;
+    specialize qw/copy_rect8_16bit_to_16bit sse2 ssse3 sse4_1 neon/;
  }
 }

--- a/av1/common/cdef.c
+++ b/av1/common/cdef.c
@ -91,21 +91,24 @@ int sb_compute_dering_list(const AV1_COMMON *const cm, int mi_row, int mi_col,
  return count;
 }

-void copy_nxm_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src,
-                              int sstride, int n, int m) {
+void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src,
+                                int sstride, int v, int h) {
  int i, j;
-  for (i = 0; i < m; i++) {
-    for (j = 0; j < n; j++) {
+  OD_ASSERT((h & 0x7) == 0);
+  for (i = 0; i < v; i++) {
+    for (j = 0; j < h; j++) {
      dst[i * dstride + j] = src[i * sstride + j];
    }
  }
 }

-void copy_nxm_16bit_to_16bit_c(uint16_t *dst, int dstride, const uint16_t *src,
-                               int sstride, int n, int m) {
+void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride,
+                                 const uint16_t *src, int sstride, int v,
+                                 int h) {
  int i, j;
-  for (i = 0; i < m; i++) {
-    for (j = 0; j < n; j++) {
+  OD_ASSERT((h & 0x7) == 0);
+  for (i = 0; i < v; i++) {
+    for (j = 0; j < h; j++) {
      dst[i * dstride + j] = src[i * sstride + j];
    }
  }
@ -118,11 +121,11 @@ void copy_sb8_16(UNUSED AV1_COMMON *cm, uint16_t *dst, int dstride,
  if (cm->use_highbitdepth) {
    const uint16_t *base =
        &CONVERT_TO_SHORTPTR(src)[src_voffset * sstride + src_hoffset];
-    copy_nxm_16bit_to_16bit(dst, dstride, base, sstride, hsize, vsize);
+    copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, vsize, hsize);
  } else {
 #endif
    const uint8_t *base = &src[src_voffset * sstride + src_hoffset];
-    copy_nxm_8bit_to_16bit(dst, dstride, base, sstride, hsize, vsize);
+    copy_rect8_8bit_to_16bit(dst, dstride, base, sstride, vsize, hsize);
 #if CONFIG_AOM_HIGHBITDEPTH
  }
 #endif
--- a/av1/common/od_dering_simd.h
+++ b/av1/common/od_dering_simd.h
@ -405,32 +405,28 @@ void SIMD_FUNC(copy_4x4_16bit_to_16bit)(uint16_t *dst, int dstride,
  }
 }

-void SIMD_FUNC(copy_nxm_8bit_to_16bit)(uint16_t *dst, int dstride,
-                                       const uint8_t *src, int sstride, int n,
-                                       int m) {
+void SIMD_FUNC(copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride,
+                                         const uint8_t *src, int sstride, int v,
+                                         int h) {
  int i, j;
-  for (i = 0; i < m; i++) {
-    for (j = 0; j < (n & ~0x7); j += 8) {
+  OD_ASSERT((h & 0x7) == 0);
+  for (i = 0; i < v; i++) {
+    for (j = 0; j < h; j += 8) {
      v64 row = v64_load_unaligned(&src[i * sstride + j]);
      v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row));
    }
-    for (; j < n; j++) {
-      dst[i * dstride + j] = src[i * sstride + j];
-    }
  }
 }

-void SIMD_FUNC(copy_nxm_16bit_to_16bit)(uint16_t *dst, int dstride,
-                                        const uint16_t *src, int sstride, int n,
-                                        int m) {
+void SIMD_FUNC(copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride,
+                                          const uint16_t *src, int sstride,
+                                          int v, int h) {
  int i, j;
-  for (i = 0; i < m; i++) {
-    for (j = 0; j < (n & ~0x7); j += 8) {
+  OD_ASSERT((h & 0x7) == 0);
+  for (i = 0; i < v; i++) {
+    for (j = 0; j < h; j += 8) {
      v128 row = v128_load_unaligned(&src[i * sstride + j]);
      v128_store_unaligned(&dst[i * dstride + j], row);
    }
-    for (; j < n; j++) {
-      dst[i * dstride + j] = src[i * sstride + j];
-    }
  }
 }