From 8404253f81a91a3fe077f17a1b6ef9452a6edd48 Mon Sep 17 00:00:00 2001
From: Yi Luo <luoyi@google.com>
Date: Fri, 24 Jun 2016 17:29:21 -0700
Subject: [PATCH] Fix bugs in convolution filter optimization

- Fix the overwrite bug in horizontal filtering when width = 2.
- Fix 10-tap vertical filtering so that it no longer reads one row of
  pixels above the block.
- Fix 10-tap filter zero padding.
- Encoder speed slows down by ~4.0% compared to commit
  81ad953 (Convolution vertical filter SSSE3 optimization).

Change-Id: I9bb294a4529300081c29bf284e6bc6eb081cc536
---
 test/vp10_convolve_optimz_test.cc             |   7 +-
 vp10/common/vp10_convolve.c                   |   6 +-
 vp10/common/x86/vp10_convolve_filters_ssse3.c |  60 ++---
 vp10/common/x86/vp10_convolve_ssse3.c         | 245 +++++++++---------
 4 files changed, 157 insertions(+), 161 deletions(-)

diff --git a/test/vp10_convolve_optimz_test.cc b/test/vp10_convolve_optimz_test.cc
index 04c4321d3..0d4beb042 100644
--- a/test/vp10_convolve_optimz_test.cc
+++ b/test/vp10_convolve_optimz_test.cc
@@ -39,6 +39,7 @@ const size_t maxHeight = 256;
 const size_t maxBlockSize = maxWidth * maxHeight;
 const int horizOffset = 32;
 const int vertiOffset = 32;
+const size_t testMaxBlk = 128;
 const int stride = 128;
 const int x_step_q4 = 16;
 
@@ -142,7 +143,7 @@ void VP10ConvolveOptimzTest::DiffFilterBuffer(const uint8_t *buf,
 }
 
 void VP10ConvolveOptimzTest::RunHorizFilterBitExactCheck() {
-  PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, width_, height_);
+  PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, testMaxBlk, testMaxBlk);
 
   InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_);
 
@@ -159,7 +160,7 @@ void VP10ConvolveOptimzTest::RunHorizFilterBitExactCheck() {
   // and test again.
   int intermediate_height =
       (((height_ - 1) * 16 + subpel_) >> SUBPEL_BITS) + filter_params.taps;
-  PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, width_, height_);
+  PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, testMaxBlk, testMaxBlk);
 
   vp10_convolve_horiz_c(src_ref_, stride, dst_ref_, stride, width_,
                         intermediate_height, filter_params, subpel_, x_step_q4,
@@ -174,7 +175,7 @@ void VP10ConvolveOptimzTest::RunHorizFilterBitExactCheck() {
 }
 
 void VP10ConvolveOptimzTest::RunVertFilterBitExactCheck() {
-  PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, width_, height_);
+  PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, testMaxBlk, testMaxBlk);
 
   InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_);
 
diff --git a/vp10/common/vp10_convolve.c b/vp10/common/vp10_convolve.c
index 1abd15925..2026df198 100644
--- a/vp10/common/vp10_convolve.c
+++ b/vp10/common/vp10_convolve.c
@@ -139,7 +139,7 @@ void vp10_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
     // temp's size is set to (maximum possible intermediate_height) *
     // MAX_BLOCK_WIDTH
     uint8_t temp[((((MAX_BLOCK_HEIGHT - 1) * MAX_STEP + 15) >> SUBPEL_BITS) +
-                  MAX_FILTER_TAP + 1) *
+                  MAX_FILTER_TAP) *
                  MAX_BLOCK_WIDTH];
     int temp_stride = MAX_BLOCK_WIDTH;
 #if CONFIG_DUAL_FILTER
@@ -164,7 +164,7 @@ void vp10_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
     assert(filter_params.taps <= MAX_FILTER_TAP);
 
     vp10_convolve_horiz(src - src_stride * (filter_size / 2 - 1), src_stride,
-                        temp + temp_stride, temp_stride, w, intermediate_height,
+                        temp, temp_stride, w, intermediate_height,
                         filter_params, subpel_x_q4, x_step_q4, 0);
 
 #if CONFIG_DUAL_FILTER
@@ -175,7 +175,7 @@ void vp10_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
     filter_size = filter_params.taps;
     assert(filter_params.taps <= MAX_FILTER_TAP);
 
-    vp10_convolve_vert(temp + temp_stride * (filter_size / 2), temp_stride,
+    vp10_convolve_vert(temp + temp_stride * (filter_size / 2 - 1), temp_stride,
                        dst, dst_stride, w, h, filter_params,
                        subpel_y_q4, y_step_q4, ref_idx);
   }
diff --git a/vp10/common/x86/vp10_convolve_filters_ssse3.c b/vp10/common/x86/vp10_convolve_filters_ssse3.c
index 3be79ff16..410da8914 100644
--- a/vp10/common/x86/vp10_convolve_filters_ssse3.c
+++ b/vp10/common/x86/vp10_convolve_filters_ssse3.c
@@ -159,64 +159,64 @@ DECLARE_ALIGNED(16, const int8_t,
 DECLARE_ALIGNED(16, const int8_t,
                 sub_pel_filters_10sharp_signal_dir[15][2][16]) = {
   {
-    {0, 0,  -1,   3,  -6, 127,   8,  -4,   2,  -1, 0, 0, 0, 0, 0, 0},
-    {0, 0, 0,  -1,   3,  -6, 127,   8,  -4,   2,  -1, 0, 0, 0, 0},
+    {0, 0, -1, 3,  -6, 127,  8,  -4,  2,  -1, 0,   0, 0, 0, 0, 0},
+    {0, 0,  0, 0,  -1,   3, -6, 127,  8,  -4, 2,  -1, 0, 0, 0, 0},
   },
   {
-    {0, 1,  -2,   5, -12, 124,  18,  -7,   3,  -2, 0, 0, 0, 0, 0, 0},
-    {0, 0, 1,  -2,   5, -12, 124,  18,  -7,   3,  -2, 0, 0, 0, 0},
+    {0, 1, -2, 5, -12, 124,  18,  -7,   3, -2, 0,  0, 0, 0, 0, 0},
+    {0, 0,  0, 1,  -2,   5, -12, 124,  18, -7, 3, -2, 0, 0, 0, 0},
   },
   {
-    {0, 1,  -3,   7, -17, 119,  28, -11,   5,  -2, 1, 0, 0, 0, 0, 0},
-    {0, 0, 1,  -3,   7, -17, 119,  28, -11,   5,  -2, 1, 0, 0, 0},
+    {0, 1, -3, 7, -17, 119,  28, -11,  5,  -2, 1,  0, 0, 0, 0, 0},
+    {0, 0,  0, 1,  -3,   7, -17, 119, 28, -11, 5, -2, 1, 0, 0, 0},
   },
   {
-    {0, 1,  -4,   8, -20, 114,  38, -14,   7,  -3, 1, 0, 0, 0, 0, 0},
-    {0, 0, 1,  -4,   8, -20, 114,  38, -14,   7,  -3, 1, 0, 0, 0},
+    {0, 1, -4, 8, -20, 114,  38, -14,  7,  -3, 1,  0, 0, 0, 0, 0},
+    {0, 0,  0, 1,  -4,   8, -20, 114, 38, -14, 7, -3, 1, 0, 0, 0},
   },
   {
-    {0, 1,  -4,   9, -22, 107,  49, -17,   8,  -4, 1, 0, 0, 0, 0, 0},
-    {0, 0, 1,  -4,   9, -22, 107,  49, -17,   8,  -4, 1, 0, 0, 0},
+    {0, 1, -4, 9, -22, 107,  49, -17,  8,  -4, 1,  0, 0, 0, 0, 0},
+    {0, 0,  0, 1,  -4,   9, -22, 107, 49, -17, 8, -4, 1, 0, 0, 0},
   },
   {
-    {0, 2,  -5,  10, -24,  99,  59, -20,   9,  -4, 2, 0, 0, 0, 0, 0},
-    {0, 0, 2,  -5,  10, -24,  99,  59, -20,   9,  -4, 2, 0, 0, 0},
+    {0, 2, -5, 10, -24, 99,  59, -20,  9,  -4, 2,  0, 0, 0, 0, 0},
+    {0, 0,  0,  2,  -5, 10, -24,  99, 59, -20, 9, -4, 2, 0, 0, 0},
   },
   {
-    {0, 2,  -5,  10, -24,  90,  70, -22,  10,  -5, 2, 0, 0, 0, 0, 0},
-    {0, 0, 2,  -5,  10, -24,  90,  70, -22,  10,  -5, 2, 0, 0, 0},
+    {0, 2, -5, 10, -24, 90,  70, -22, 10,  -5,  2,  0, 0, 0, 0, 0},
+    {0, 0,  0,  2,  -5, 10, -24,  90, 70, -22, 10, -5, 2, 0, 0, 0},
   },
   {
-    {0, 2,  -5,  10, -23,  80,  80, -23,  10,  -5, 2, 0, 0, 0, 0, 0},
-    {0, 0, 2,  -5,  10, -23,  80,  80, -23,  10,  -5, 2, 0, 0, 0},
+    {0, 2, -5, 10, -23, 80,  80, -23, 10,  -5,  2,  0, 0, 0, 0, 0},
+    {0, 0,  0,  2,  -5, 10, -23,  80, 80, -23, 10, -5, 2, 0, 0, 0},
   },
   {
-    {0, 2,  -5,  10, -22,  70,  90, -24,  10,  -5, 2, 0, 0, 0, 0, 0},
-    {0, 0, 2,  -5,  10, -22,  70,  90, -24,  10,  -5, 2, 0, 0, 0},
+    {0, 2, -5, 10, -22, 70,  90, -24, 10,  -5,  2,  0, 0, 0, 0, 0},
+    {0, 0,  0,  2,  -5, 10, -22,  70, 90, -24, 10, -5, 2, 0, 0, 0},
   },
   {
-    {0, 2,  -4,   9, -20,  59,  99, -24,  10,  -5, 2, 0, 0, 0, 0, 0},
-    {0, 0, 2,  -4,   9, -20,  59,  99, -24,  10,  -5, 2, 0, 0, 0},
+    {0, 2, -4, 9, -20, 59,  99, -24,  10,  -5,  2,  0, 0, 0, 0, 0},
+    {0, 0,  0, 2,  -4,  9, -20,  59,  99, -24, 10, -5, 2, 0, 0, 0},
   },
   {
-    {0, 1,  -4,   8, -17,  49, 107, -22,   9,  -4, 1, 0, 0, 0, 0, 0},
-    {0, 0, 1,  -4,   8, -17,  49, 107, -22,   9,  -4, 1, 0, 0, 0},
+    {0, 1, -4, 8, -17,  49, 107, -22,   9,  -4, 1,  0, 0, 0, 0, 0},
+    {0, 0,  0, 1,  -4,   8, -17,  49, 107, -22, 9, -4, 1, 0, 0, 0},
   },
   {
-    {0, 1,  -3,   7, -14,  38, 114, -20,   8,  -4, 1, 0, 0, 0, 0, 0},
-    {0, 0, 1,  -3,   7, -14,  38, 114, -20,   8,  -4, 1, 0, 0, 0},
+    {0, 1, -3, 7, -14, 38, 114, -20,   8,  -4, 1,  0, 0, 0, 0, 0},
+    {0, 0,  0, 1,  -3,  7, -14,  38, 114, -20, 8, -4, 1, 0, 0, 0},
   },
   {
-    {0, 1,  -2,   5, -11,  28, 119, -17,   7,  -3, 1, 0, 0, 0, 0, 0},
-    {0, 0, 1,  -2,   5, -11,  28, 119, -17,   7,  -3, 1, 0, 0, 0},
+    {0, 1, -2, 5, -11, 28, 119, -17,   7,  -3, 1,  0, 0, 0, 0, 0},
+    {0, 0,  0, 1,  -2,  5, -11,  28, 119, -17, 7, -3, 1, 0, 0, 0},
   },
   {
-    {0, 0,  -2,   3,  -7,  18, 124, -12,   5,  -2, 1, 0, 0, 0, 0, 0},
-    {0, 0, 0,  -2,   3,  -7,  18, 124, -12,   5,  -2, 1, 0, 0, 0},
+    {0, 0, -2, 3,  -7, 18, 124, -12,   5,  -2, 1,  0, 0, 0, 0, 0},
+    {0, 0,  0, 0,  -2,  3,  -7,  18, 124, -12, 5, -2, 1, 0, 0, 0},
   },
   {
-    {0, 0,  -1,   2,  -4,   8, 127,  -6,   3,  -1, 0, 0, 0, 0, 0, 0},
-    {0, 0, 0,  -1,   2,  -4,   8, 127,  -6,   3,  -1, 0, 0, 0, 0},
+    {0, 0, -1, 2,  -4,  8, 127, -6,   3,   -1, 0,  0, 0, 0, 0, 0},
+    {0, 0,  0, 0,  -1,  2,  -4,  8, 127,   -6, 3, -1, 0, 0, 0, 0},
   },
 };
 #endif  // CONFIG_EXT_INTERP
diff --git a/vp10/common/x86/vp10_convolve_ssse3.c b/vp10/common/x86/vp10_convolve_ssse3.c
index 472990e82..75520c9ac 100644
--- a/vp10/common/x86/vp10_convolve_ssse3.c
+++ b/vp10/common/x86/vp10_convolve_ssse3.c
@@ -37,15 +37,7 @@ static INLINE void transpose_4x8(const __m128i *in, __m128i *out) {
   // they're zero vectors.
 }
 
-typedef void (*store_pixel_t)(const __m128i *x, uint8_t *src, uint8_t *dst);
-
-static INLINE void store_4_pixel_only(const __m128i *x, uint8_t *src,
-                                      uint8_t *dst) {
-  __m128i u;
-  (void)src;
-  u = _mm_packus_epi16(*x, *x);
-  *(int *)dst = _mm_cvtsi128_si32(u);
-}
+typedef void (*store_pixel_t)(const __m128i *x, uint8_t *dst);
 
 static INLINE __m128i accumulate_store(const __m128i *x, uint8_t *src) {
   const __m128i zero = _mm_setzero_si128();
@@ -59,9 +51,30 @@ static INLINE __m128i accumulate_store(const __m128i *x, uint8_t *src) {
   return y;
 }
 
-static INLINE void accumulate_store_4_pixel(const __m128i *x, uint8_t *src,
-                                            uint8_t *dst) {
-  __m128i y = accumulate_store(x, src);
+static INLINE void store_2_pixel_only(const __m128i *x, uint8_t *dst) {
+  uint32_t temp;
+  __m128i u = _mm_packus_epi16(*x, *x);
+  temp = _mm_cvtsi128_si32(u);
+  *(uint16_t *)dst = (uint16_t)temp;
+}
+
+static INLINE void accumulate_store_2_pixel(const __m128i *x, uint8_t *dst) {
+  uint32_t temp;
+  __m128i y = accumulate_store(x, dst);
+  temp = _mm_cvtsi128_si32(y);
+  *(uint16_t *)dst = (uint16_t)temp;
+}
+
+static store_pixel_t store2pixelTab[2] = {
+  store_2_pixel_only, accumulate_store_2_pixel};
+
+static INLINE void store_4_pixel_only(const __m128i *x, uint8_t *dst) {
+  __m128i u = _mm_packus_epi16(*x, *x);
+  *(int *)dst = _mm_cvtsi128_si32(u);
+}
+
+static INLINE void accumulate_store_4_pixel(const __m128i *x, uint8_t *dst) {
+  __m128i y = accumulate_store(x, dst);
   *(int *)dst = _mm_cvtsi128_si32(y);
 }
 
@@ -69,12 +82,12 @@ static store_pixel_t store4pixelTab[2] = {
   store_4_pixel_only, accumulate_store_4_pixel};
 
 void horiz_w4_ssse3(const uint8_t *src, const __m128i *f,
-                    int tapsNum, store_pixel_t store_func, uint8_t *dst,
-                    uint8_t *buf) {
+                    int tapsNum, store_pixel_t store_func, uint8_t *dst) {
   __m128i sumPairRow[4];
   __m128i sumPairCol[8];
   __m128i pixel;
   const __m128i k_256 = _mm_set1_epi16(1 << 8);
+  const __m128i zero = _mm_setzero_si128();
 
   if (10 == tapsNum) {
     src -= 1;
@@ -103,52 +116,54 @@ void horiz_w4_ssse3(const uint8_t *src, const __m128i *f,
   sumPairRow[0] = _mm_adds_epi16(sumPairRow[0], sumPairRow[3]);
 
   sumPairRow[1] = _mm_mulhrs_epi16(sumPairRow[0], k_256);
+  sumPairRow[1] = _mm_packus_epi16(sumPairRow[1], sumPairRow[1]);
+  sumPairRow[1] = _mm_unpacklo_epi8(sumPairRow[1], zero);
 
-  store_func(&sumPairRow[1], dst, buf);
+  store_func(&sumPairRow[1], dst);
 }
 
 void horiz_w8_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
-                    store_pixel_t store, uint8_t *dst, uint8_t *buf) {
-  horiz_w4_ssse3(src, f, tapsNum, store, dst, buf);
+                    store_pixel_t store, uint8_t *buf) {
+  horiz_w4_ssse3(src, f, tapsNum, store, buf);
   src += 4;
   buf += 4;
-  horiz_w4_ssse3(src, f, tapsNum, store, dst, buf);
+  horiz_w4_ssse3(src, f, tapsNum, store, buf);
 }
 
 void horiz_w16_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
-                     store_pixel_t store, uint8_t *dst, uint8_t *buf) {
-  horiz_w8_ssse3(src, f, tapsNum, store, dst, buf);
+                     store_pixel_t store, uint8_t *buf) {
+  horiz_w8_ssse3(src, f, tapsNum, store, buf);
   src += 8;
   buf += 8;
-  horiz_w8_ssse3(src, f, tapsNum, store, dst, buf);
+  horiz_w8_ssse3(src, f, tapsNum, store, buf);
 }
 
 void horiz_w32_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
-                     store_pixel_t store, uint8_t *dst, uint8_t *buf) {
-  horiz_w16_ssse3(src, f, tapsNum, store, dst, buf);
+                     store_pixel_t store, uint8_t *buf) {
+  horiz_w16_ssse3(src, f, tapsNum, store, buf);
   src += 16;
   buf += 16;
-  horiz_w16_ssse3(src, f, tapsNum, store, dst, buf);
+  horiz_w16_ssse3(src, f, tapsNum, store, buf);
 }
 
 void horiz_w64_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
-                     store_pixel_t store, uint8_t *dst, uint8_t *buf) {
-  horiz_w32_ssse3(src, f, tapsNum, store, dst, buf);
+                     store_pixel_t store, uint8_t *buf) {
+  horiz_w32_ssse3(src, f, tapsNum, store, buf);
   src += 32;
   buf += 32;
-  horiz_w32_ssse3(src, f, tapsNum, store, dst, buf);
+  horiz_w32_ssse3(src, f, tapsNum, store, buf);
 }
 
 void horiz_w128_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
-                      store_pixel_t store, uint8_t *dst, uint8_t *buf) {
-  horiz_w64_ssse3(src, f, tapsNum, store, dst, buf);
+                      store_pixel_t store, uint8_t *buf) {
+  horiz_w64_ssse3(src, f, tapsNum, store, buf);
   src += 64;
   buf += 64;
-  horiz_w64_ssse3(src, f, tapsNum, store, dst, buf);
+  horiz_w64_ssse3(src, f, tapsNum, store, buf);
 }
 
 static void (*horizTab[6])(const uint8_t *, const __m128i *, int,
-                           store_pixel_t, uint8_t *, uint8_t *) = {
+                           store_pixel_t, uint8_t *) = {
   horiz_w4_ssse3,
   horiz_w8_ssse3,
   horiz_w16_ssse3,
@@ -158,26 +173,28 @@ static void (*horizTab[6])(const uint8_t *, const __m128i *, int,
 };
 
 void filter_horiz_ssse3(const uint8_t *src, __m128i *f, int tapsNum, int width,
-                        store_pixel_t store, uint8_t *dst, uint8_t *buffer) {
+                        store_pixel_t store, uint8_t *dst) {
   switch (width) {
+    // Note: widths 2 and 4 share the same 4-pixel kernel, so the caller
+    // must pass a width-specific store function (2-pixel vs. 4-pixel).
     case 2:
     case 4:
-      horizTab[0](src, f, tapsNum, store, dst, buffer);
+      horizTab[0](src, f, tapsNum, store, dst);
       break;
     case 8:
-      horizTab[1](src, f, tapsNum, store, dst, buffer);
+      horizTab[1](src, f, tapsNum, store, dst);
       break;
     case 16:
-      horizTab[2](src, f, tapsNum, store, dst, buffer);
+      horizTab[2](src, f, tapsNum, store, dst);
       break;
     case 32:
-      horizTab[3](src, f, tapsNum, store, dst, buffer);
+      horizTab[3](src, f, tapsNum, store, dst);
       break;
     case 64:
-      horizTab[4](src, f, tapsNum, store, dst, buffer);
+      horizTab[4](src, f, tapsNum, store, dst);
       break;
     case 128:
-      horizTab[5](src, f, tapsNum, store, dst, buffer);
+      horizTab[5](src, f, tapsNum, store, dst);
       break;
     default:
       assert(0);
@@ -625,6 +642,7 @@ void vp10_convolve_horiz_ssse3(const uint8_t *src, int src_stride, uint8_t *dst,
   __m128i horf[2];
   SubpelFilterCoeffs hCoeffs, vCoeffs;
   const uint8_t *src_ptr;
+  store_pixel_t store2p = store2pixelTab[avg];
   store_pixel_t store4p = store4pixelTab[avg];
   transpose_to_dst_t transpose_4x4 = trans4x4Tab[avg];
   transpose_to_dst_t transpose_8x8 = trans8x8Tab[avg];
@@ -686,114 +704,97 @@ void vp10_convolve_horiz_ssse3(const uint8_t *src, int src_stride, uint8_t *dst,
     } while (count < block_height);
 
     for (i = 0; i < block_residu; ++i) {
-      filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst, dst);
+      filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst);
       src_ptr += src_stride;
       dst += dst_stride;
     }
   } else {
-    // 4-pixels parallel
-    block_height = h >> 2;
-    block_residu = h & 3;
+    if (w > 2) {
+      // 4-pixels parallel
+      block_height = h >> 2;
+      block_residu = h & 3;
 
-    do {
-      for (col = 0; col < w; col += 4) {
-        for (i = 0; i < 4; ++i) {
-          filter_horiz_v4p_ssse3(src_ptr, src_stride, verf, tapsNum,
-                                 temp + (i * 4));
-          src_ptr += 1;
+      do {
+        for (col = 0; col < w; col += 4) {
+          for (i = 0; i < 4; ++i) {
+            filter_horiz_v4p_ssse3(src_ptr, src_stride, verf, tapsNum,
+                                   temp + (i * 4));
+            src_ptr += 1;
+          }
+          transpose_4x4(temp, 4, dst + col, dst_stride);
         }
-        transpose_4x4(temp, 4, dst + col, dst_stride);
-      }
-      count++;
-      src_ptr = src + count * src_stride * 4;
-      dst += dst_stride * 4;
-    } while (count < block_height);
+        count++;
+        src_ptr = src + count * src_stride * 4;
+        dst += dst_stride * 4;
+      } while (count < block_height);
 
-    for (i = 0; i < block_residu; ++i) {
-      filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst, dst);
-      src_ptr += src_stride;
-      dst += dst_stride;
+      for (i = 0; i < block_residu; ++i) {
+        filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst);
+        src_ptr += src_stride;
+        dst += dst_stride;
+      }
+    } else {
+      for (i = 0; i < h; i++) {
+        filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store2p, dst);
+        src_ptr += src_stride;
+        dst += dst_stride;
+      }
     }
   }
 }
 
 // Vertical convolution filtering
-static INLINE void store_2_pixel_only(const __m128i *x, uint8_t *src,
-                                      uint8_t *dst) {
-  __m128i u;
-  uint32_t temp;
-  (void)src;
-  u = _mm_packus_epi16(*x, *x);
-  temp = _mm_cvtsi128_si32(u);
-  *(uint16_t *)dst = (uint16_t)temp;
-}
-
-static INLINE void accumulate_store_2_pixel(const __m128i *x, uint8_t *src,
-                                            uint8_t *dst) {
-  uint32_t temp;
-  __m128i y = accumulate_store(x, src);
-  temp = _mm_cvtsi128_si32(y);
-  *(uint16_t *)dst = (uint16_t)temp;
-}
-
-static INLINE void store_8_pixel_only(const __m128i *x, uint8_t *src,
-                                      uint8_t *dst) {
-  __m128i u;
-  (void)src;
-  u = _mm_packus_epi16(*x, *x);
+static INLINE void store_8_pixel_only(const __m128i *x, uint8_t *dst) {
+  __m128i u = _mm_packus_epi16(*x, *x);
   _mm_storel_epi64((__m128i *)dst, u);
 }
 
-static INLINE void accumulate_store_8_pixel(const __m128i *x, uint8_t *src,
-                                            uint8_t *dst) {
-  __m128i y = accumulate_store(x, src);
+static INLINE void accumulate_store_8_pixel(const __m128i *x, uint8_t *dst) {
+  __m128i y = accumulate_store(x, dst);
   _mm_storel_epi64((__m128i *)dst, y);
 }
 
 static store_pixel_t store8pixelTab[2] = {
   store_8_pixel_only, accumulate_store_8_pixel};
 
-static store_pixel_t store2pixelTab[2] = {
-  store_2_pixel_only, accumulate_store_2_pixel};
-
 static __m128i filter_vert_ssse3(const uint8_t *src, int src_stride,
-                                 __m128i *f) {
+                                 int tapsNum, __m128i *f) {
+  __m128i s[12];
   const __m128i k_256 = _mm_set1_epi16(1 << 8);
   const __m128i zero = _mm_setzero_si128();
   __m128i min_x2x3, max_x2x3, sum;
+  int i = 0;
+  int r = 0;
 
-  __m128i s0 = _mm_loadu_si128((__m128i const *)(src));
-  __m128i s1 = _mm_loadu_si128((__m128i const *)(src + src_stride));
-  __m128i s2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
-  __m128i s3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
-  __m128i s4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
-  __m128i s5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
-  __m128i s6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
-  __m128i s7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
-  __m128i s8 = _mm_loadu_si128((__m128i const *)(src + 8 * src_stride));
-  __m128i s9 = _mm_loadu_si128((__m128i const *)(src + 9 * src_stride));
-  __m128i s10 = _mm_loadu_si128((__m128i const *)(src + 10 * src_stride));
-  __m128i s11 = _mm_loadu_si128((__m128i const *)(src + 11 * src_stride));
+  if (10 == tapsNum) {
+    i += 1;
+    s[0] = zero;
+  }
+  while (i < 12) {
+    s[i] = _mm_loadu_si128((__m128i const *)(src + r * src_stride));
+    i += 1;
+    r += 1;
+  }
 
-  s0 = _mm_unpacklo_epi8(s0, s1);
-  s2 = _mm_unpacklo_epi8(s2, s3);
-  s4 = _mm_unpacklo_epi8(s4, s5);
-  s6 = _mm_unpacklo_epi8(s6, s7);
-  s8 = _mm_unpacklo_epi8(s8, s9);
-  s10 = _mm_unpacklo_epi8(s10, s11);
+  s[0] = _mm_unpacklo_epi8(s[0], s[1]);
+  s[2] = _mm_unpacklo_epi8(s[2], s[3]);
+  s[4] = _mm_unpacklo_epi8(s[4], s[5]);
+  s[6] = _mm_unpacklo_epi8(s[6], s[7]);
+  s[8] = _mm_unpacklo_epi8(s[8], s[9]);
+  s[10] = _mm_unpacklo_epi8(s[10], s[11]);
 
-  s0 = _mm_maddubs_epi16(s0, f[0]);
-  s2 = _mm_maddubs_epi16(s2, f[1]);
-  s4 = _mm_maddubs_epi16(s4, f[2]);
-  s6 = _mm_maddubs_epi16(s6, f[3]);
-  s8 = _mm_maddubs_epi16(s8, f[4]);
-  s10 = _mm_maddubs_epi16(s10, f[5]);
+  s[0] = _mm_maddubs_epi16(s[0], f[0]);
+  s[2] = _mm_maddubs_epi16(s[2], f[1]);
+  s[4] = _mm_maddubs_epi16(s[4], f[2]);
+  s[6] = _mm_maddubs_epi16(s[6], f[3]);
+  s[8] = _mm_maddubs_epi16(s[8], f[4]);
+  s[10] = _mm_maddubs_epi16(s[10], f[5]);
 
-  min_x2x3 = _mm_min_epi16(s4, s6);
-  max_x2x3 = _mm_max_epi16(s4, s6);
-  sum = _mm_adds_epi16(s0, s2);
-  sum = _mm_adds_epi16(sum, s10);
-  sum = _mm_adds_epi16(sum, s8);
+  min_x2x3 = _mm_min_epi16(s[4], s[6]);
+  max_x2x3 = _mm_max_epi16(s[4], s[6]);
+  sum = _mm_adds_epi16(s[0], s[2]);
+  sum = _mm_adds_epi16(sum, s[10]);
+  sum = _mm_adds_epi16(sum, s[8]);
 
   sum = _mm_adds_epi16(sum, min_x2x3);
   sum = _mm_adds_epi16(sum, max_x2x3);
@@ -808,14 +809,8 @@ static void filter_vert_horiz_parallel_ssse3(const uint8_t *src, int src_stride,
                                              __m128i *f, int tapsNum,
                                              store_pixel_t store_func,
                                              uint8_t *dst) {
-  __m128i sum;
-
-  if (10 == tapsNum) {
-    src -= src_stride;
-  }
-
-  sum = filter_vert_ssse3(src, src_stride, f);
-  store_func(&sum, dst, dst);
+  __m128i sum = filter_vert_ssse3(src, src_stride, tapsNum, f);
+  store_func(&sum, dst);
 }
 
 void filter_vert_compute_small(const uint8_t *src, int src_stride, __m128i *f,