Merge changes I40454d26,I892e76d5,I865ab3f9,I4a4bec17,I61c4351e,I37eb3559,I1031c556,I8c8f1f42

* changes: delete vp9_loopfilter_sse2.asm vp9_loopfilter_intrin_sse2: cosmetics: fix indent delete x86/vp9_loopfilter_x86.h vp9_loopfilter_intrin_sse2: make some funcs static vp9_loopfilter_intrin_sse2: remove unused uv funcs vp9_loopfilter: remove uv function typedef filter_block_plane: reuse some constants vp9_loopfilter.c: make some functions static
2013-07-16 14:25:32 -07:00 · 2013-07-16 14:25:32 -07:00 · 98e132bde0
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@ -33,8 +33,7 @@ static void lf_init_lut(loop_filter_info_n *lfi) {
  lfi->mode_lf_lut[NEWMV] = 1;
 }

-void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi,
-                                      int sharpness_lvl) {
+static void update_sharpness(loop_filter_info_n *const lfi, int sharpness_lvl) {
  int lvl;

  // For each possible value for the loop filter fill out limits
@ -62,7 +61,7 @@ void vp9_loop_filter_init(VP9_COMMON *cm) {
  int i;

  // init limits for given sharpness
-  vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level);
+  update_sharpness(lfi, cm->sharpness_level);
  cm->last_sharpness_level = cm->sharpness_level;

  // init LUT for lvl  and hev thr picking
@ -73,8 +72,8 @@ void vp9_loop_filter_init(VP9_COMMON *cm) {
    vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
 }

-void vp9_loop_filter_frame_init(VP9_COMMON *cm, MACROBLOCKD *xd,
-                                int default_filt_lvl) {
+static void loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+                                   int default_filt_lvl) {
  int seg;
  // n_shift is the a multiplier for lf_deltas
  // the multiplier is 1 for when filter_lvl is between 0 and 31;
@ -84,7 +83,7 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, MACROBLOCKD *xd,

  // update limits if sharpness has changed
  if (cm->last_sharpness_level != cm->sharpness_level) {
-    vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level);
+    update_sharpness(lfi, cm->sharpness_level);
    cm->last_sharpness_level = cm->sharpness_level;
  }

@ -118,9 +117,9 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, MACROBLOCKD *xd,
  }
 }

-static int build_lfi(const VP9_COMMON *cm, const MB_MODE_INFO *mbmi,
-                      struct loop_filter_info *lfi) {
-  const loop_filter_info_n *const lfi_n = &cm->lf_info;
+static int build_lfi(const loop_filter_info_n *const lfi_n,
+                     const MB_MODE_INFO *const mbmi,
+                     struct loop_filter_info *const lfi) {
  const int seg = mbmi->segment_id;
  const int ref = mbmi->ref_frame[0];
  const int mode = lfi_n->mode_lf_lut[mbmi->mode];
@ -231,13 +230,13 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
  }
 }

-static void filter_block_plane(VP9_COMMON *cm, MACROBLOCKD *xd,
+static void filter_block_plane(VP9_COMMON *const cm, MACROBLOCKD *const xd,
                               int plane, int mi_row, int mi_col) {
  const int ss_x = xd->plane[plane].subsampling_x;
  const int ss_y = xd->plane[plane].subsampling_y;
-  const int row_step = 1 << xd->plane[plane].subsampling_y;
-  const int col_step = 1 << xd->plane[plane].subsampling_x;
-  struct buf_2d * const dst = &xd->plane[plane].dst;
+  const int row_step = 1 << ss_x;
+  const int col_step = 1 << ss_y;
+  struct buf_2d *const dst = &xd->plane[plane].dst;
  uint8_t* const dst0 = dst->buf;
  unsigned int mask_16x16[MI_BLOCK_SIZE] = {0};
  unsigned int mask_8x8[MI_BLOCK_SIZE] = {0};
@ -245,8 +244,8 @@ static void filter_block_plane(VP9_COMMON *cm, MACROBLOCKD *xd,
  unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0};
  struct loop_filter_info lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
  int r, c;
-  MODE_INFO *mi = xd->mode_info_context;
-  int row_step_stride = cm->mode_info_stride * row_step;
+  const MODE_INFO *mi = xd->mode_info_context;
+  const int row_step_stride = cm->mode_info_stride * row_step;

  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
    unsigned int mask_16x16_c = 0;
@ -272,8 +271,7 @@ static void filter_block_plane(VP9_COMMON *cm, MACROBLOCKD *xd,
      const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;

      // Filter level can vary per MI
-      if (!build_lfi(cm, &mi[c].mbmi,
-                     lfi[r] + (c >> xd->plane[plane].subsampling_x)))
+      if (!build_lfi(&cm->lf_info, &mi[c].mbmi, lfi[r] + (c >> ss_x)))
        continue;

      // Build masks based on the transform size of each block
@ -355,7 +353,7 @@ void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd,
  int mi_row, mi_col;

  // Initialize the loop filter for this frame.
-  vp9_loop_filter_frame_init(cm, xd, frame_filter_level);
+  loop_filter_frame_init(cm, xd, frame_filter_level);

  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MI_BLOCK_SIZE) {
    MODE_INFO* const mi = cm->mi + mi_row * cm->mode_info_stride;
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h
@ -44,44 +44,14 @@ struct loop_filter_info {
  const uint8_t *hev_thr;
 };

-#define prototype_loopfilter(sym) \
-  void sym(uint8_t *src, int pitch, const uint8_t *blimit, \
-           const uint8_t *limit, const uint8_t *thresh, int count)
-
-#define prototype_loopfilter_block(sym) \
-  void sym(uint8_t *y, uint8_t *u, uint8_t *v, \
-           int ystride, int uv_stride, struct loop_filter_info *lfi)
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/vp9_loopfilter_x86.h"
-#endif
-
-typedef void loop_filter_uvfunction(uint8_t *src, int pitch,
-                                    const uint8_t *blimit,
-                                    const uint8_t *limit,
-                                    const uint8_t *thresh,
-                                    uint8_t *v);
-
 /* assorted loopfilter functions which get used elsewhere */
 struct VP9Common;
 struct macroblockd;

 void vp9_loop_filter_init(struct VP9Common *cm);

-void vp9_loop_filter_frame_init(struct VP9Common *cm,
-                                struct macroblockd *mbd,
-                                int default_filt_lvl);
-
 void vp9_loop_filter_frame(struct VP9Common *cm,
                           struct macroblockd *mbd,
                           int filter_level,
                           int y_only);
-
-void vp9_loop_filter_partial_frame(struct VP9Common *cm,
-                                   struct macroblockd *mbd,
-                                   int default_filt_lvl);
-
-void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi,
-                                      int sharpness_lvl);
-
 #endif  // VP9_COMMON_VP9_LOOPFILTER_H_
--- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
+++ b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
@ -12,17 +12,11 @@
 #include "vp9/common/vp9_loopfilter.h"
 #include "vpx_ports/emmintrin_compat.h"

-prototype_loopfilter(vp9_loop_filter_vertical_edge_sse2);
-prototype_loopfilter(vp9_loop_filter_horizontal_edge_sse2);
-
-extern loop_filter_uvfunction vp9_loop_filter_horizontal_edge_uv_sse2;
-extern loop_filter_uvfunction vp9_loop_filter_vertical_edge_uv_sse2;
-
-void vp9_mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
-                                         int p,
-                                         const unsigned char *_blimit,
-                                         const unsigned char *_limit,
-                                         const unsigned char *_thresh) {
+static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
+                                            int p,
+                                            const unsigned char *_blimit,
+                                            const unsigned char *_limit,
+                                            const unsigned char *_thresh) {
  DECLARE_ALIGNED(16, unsigned char, flat2_op[7][8]);
  DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][8]);

@ -483,11 +477,11 @@ void vp9_mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
  }
 }

-void vp9_mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
-                                          int p,
-                                          const unsigned char *_blimit,
-                                          const unsigned char *_limit,
-                                          const unsigned char *_thresh) {
+static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
+                                             int p,
+                                             const unsigned char *_blimit,
+                                             const unsigned char *_limit,
+                                             const unsigned char *_thresh) {
  DECLARE_ALIGNED(16, unsigned char, flat2_op[7][16]);
  DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][16]);

@ -962,9 +956,9 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
                                       const unsigned char *_thresh,
                                       int count) {
  if (count == 1)
-    vp9_mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh);
+    mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh);
  else
-    vp9_mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh);
+    mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh);
 }

 void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
@ -1206,79 +1200,6 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
  }
 }

-void vp9_mbloop_filter_horizontal_edge_uv_sse2(unsigned char *u,
-                                               int p,
-                                               const unsigned char *_blimit,
-                                               const unsigned char *_limit,
-                                               const unsigned char *_thresh,
-                                               unsigned char *v) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, src, 160);
-
-  /* Read source */
-  const __m128i p4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 5 * p)),
-      _mm_loadl_epi64((__m128i *)(v - 5 * p)));
-  const __m128i p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 4 * p)),
-      _mm_loadl_epi64((__m128i *)(v - 4 * p)));
-  const __m128i p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 3 * p)),
-      _mm_loadl_epi64((__m128i *)(v - 3 * p)));
-  const __m128i p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 2 * p)),
-      _mm_loadl_epi64((__m128i *)(v - 2 * p)));
-  const __m128i p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 1 * p)),
-      _mm_loadl_epi64((__m128i *)(v - 1 * p)));
-  const __m128i q0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u)),
-      _mm_loadl_epi64((__m128i *)(v)));
-  const __m128i q1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 1 * p)),
-      _mm_loadl_epi64((__m128i *)(v + 1 * p)));
-  const __m128i q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 2 * p)),
-      _mm_loadl_epi64((__m128i *)(v + 2 * p)));
-  const __m128i q3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 3 * p)),
-      _mm_loadl_epi64((__m128i *)(v + 3 * p)));
-  const __m128i q4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 4 * p)),
-      _mm_loadl_epi64((__m128i *)(v + 4 * p)));
-
-  _mm_store_si128((__m128i *)(src), p4);
-  _mm_store_si128((__m128i *)(src + 16), p3);
-  _mm_store_si128((__m128i *)(src + 32), p2);
-  _mm_store_si128((__m128i *)(src + 48), p1);
-  _mm_store_si128((__m128i *)(src + 64), p0);
-  _mm_store_si128((__m128i *)(src + 80), q0);
-  _mm_store_si128((__m128i *)(src + 96), q1);
-  _mm_store_si128((__m128i *)(src + 112), q2);
-  _mm_store_si128((__m128i *)(src + 128), q3);
-  _mm_store_si128((__m128i *)(src + 144), q4);
-
-  /* Loop filtering */
-  vp9_mbloop_filter_horizontal_edge_sse2(src + 80, 16, _blimit, _limit,
-                                         _thresh, 1);
-
-  /* Store result */
-  _mm_storel_epi64((__m128i *)(u - 3 * p),
-                   _mm_loadl_epi64((__m128i *)(src + 32)));
-  _mm_storel_epi64((__m128i *)(u - 2 * p),
-                   _mm_loadl_epi64((__m128i *)(src + 48)));
-  _mm_storel_epi64((__m128i *)(u - p),
-                   _mm_loadl_epi64((__m128i *)(src + 64)));
-  _mm_storel_epi64((__m128i *)u,
-                   _mm_loadl_epi64((__m128i *)(src + 80)));
-  _mm_storel_epi64((__m128i *)(u + p),
-                   _mm_loadl_epi64((__m128i *)(src + 96)));
-  _mm_storel_epi64((__m128i *)(u + 2 * p),
-                   _mm_loadl_epi64((__m128i *)(src + 112)));
-
-  _mm_storel_epi64((__m128i *)(v - 3 * p),
-                   _mm_loadl_epi64((__m128i *)(src + 40)));
-  _mm_storel_epi64((__m128i *)(v - 2 * p),
-                   _mm_loadl_epi64((__m128i *)(src + 56)));
-  _mm_storel_epi64((__m128i *)(v - p),
-                   _mm_loadl_epi64((__m128i *)(src + 72)));
-  _mm_storel_epi64((__m128i *)v,
-                   _mm_loadl_epi64((__m128i *)(src + 88)));
-  _mm_storel_epi64((__m128i *)(v + p),
-                   _mm_loadl_epi64((__m128i *)(src + 104)));
-  _mm_storel_epi64((__m128i *)(v + 2 * p),
-                   _mm_loadl_epi64((__m128i *)(src + 120)));
-}
-
 static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
                                 int in_p, unsigned char *out, int out_p) {
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
@ -1425,7 +1346,7 @@ void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s,

  /* Loop filtering */
  vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit,
-                                           thresh, 1);
+                                         thresh, 1);
  src[0] = t_dst + 3 * 16;
  src[1] = t_dst + 3 * 16 + 8;

@ -1437,10 +1358,10 @@ void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s,
 }

 void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s,
-                                          int p,
-                                          const unsigned char *blimit,
-                                          const unsigned char *limit,
-                                          const unsigned char *thresh) {
+                                     int p,
+                                     const unsigned char *blimit,
+                                     const unsigned char *limit,
+                                     const unsigned char *thresh) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
  unsigned char *src[4];
  unsigned char *dst[4];
@ -1466,32 +1387,3 @@ void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s,

  transpose(src, 16, dst, p, 2);
 }
-
-
-void vp9_mbloop_filter_vertical_edge_uv_sse2(unsigned char *u,
-                                             int p,
-                                             const unsigned char *blimit,
-                                             const unsigned char *limit,
-                                             const unsigned char *thresh,
-                                             unsigned char *v) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
-  unsigned char *src[2];
-  unsigned char *dst[2];
-
-  /* Transpose 16x16 */
-  transpose8x16(u - 8, v - 8, p, t_dst, 16);
-  transpose8x16(u, v, p, t_dst + 16 * 8, 16);
-
-  /* Loop filtering */
-  vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit,
-                                           thresh, 1);
-
-  src[0] = t_dst + 3 * 16;
-  src[1] = t_dst + 3 * 16 + 8;
-
-  dst[0] = u - 5;
-  dst[1] = v - 5;
-
-  /* Transpose 16x8 */
-  transpose(src, 16, dst, p, 2);
-}
--- a/vp9/common/x86/vp9_loopfilter_sse2.asm
+++ b/vp9/common/x86/vp9_loopfilter_sse2.asm
@ -1,872 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-; Use of pmaxub instead of psubusb to compute filter mask was seen
-; in ffvp8
-
-%macro LFH_FILTER_AND_HEV_MASK 1
-%if %1
-        movdqa      xmm2,                   [rdi+2*rax]       ; q3
-        movdqa      xmm1,                   [rsi+2*rax]       ; q2
-        movdqa      xmm4,                   [rsi+rax]         ; q1
-        movdqa      xmm5,                   [rsi]             ; q0
-        neg         rax                     ; negate pitch to deal with above border
-%else
-        movlps      xmm2,                   [rsi + rcx*2]     ; q3
-        movlps      xmm1,                   [rsi + rcx]       ; q2
-        movlps      xmm4,                   [rsi]             ; q1
-        movlps      xmm5,                   [rsi + rax]       ; q0
-
-        movhps      xmm2,                   [rdi + rcx*2]
-        movhps      xmm1,                   [rdi + rcx]
-        movhps      xmm4,                   [rdi]
-        movhps      xmm5,                   [rdi + rax]
-
-        lea         rsi,                    [rsi + rax*4]
-        lea         rdi,                    [rdi + rax*4]
-
-        movdqa      XMMWORD PTR [rsp],      xmm1              ; store q2
-        movdqa      XMMWORD PTR [rsp + 16], xmm4              ; store q1
-%endif
-
-        movdqa      xmm6,                   xmm1              ; q2
-        movdqa      xmm3,                   xmm4              ; q1
-
-        psubusb     xmm1,                   xmm2              ; q2-=q3
-        psubusb     xmm2,                   xmm6              ; q3-=q2
-
-        psubusb     xmm4,                   xmm6              ; q1-=q2
-        psubusb     xmm6,                   xmm3              ; q2-=q1
-
-        por         xmm4,                   xmm6              ; abs(q2-q1)
-        por         xmm1,                   xmm2              ; abs(q3-q2)
-
-        movdqa      xmm0,                   xmm5              ; q0
-        pmaxub      xmm1,                   xmm4
-
-        psubusb     xmm5,                   xmm3              ; q0-=q1
-        psubusb     xmm3,                   xmm0              ; q1-=q0
-
-        por         xmm5,                   xmm3              ; abs(q0-q1)
-        movdqa      t0,                     xmm5              ; save to t0
-
-        pmaxub      xmm1,                   xmm5
-
-%if %1
-        movdqa      xmm2,                   [rsi+4*rax]       ; p3
-        movdqa      xmm4,                   [rdi+4*rax]       ; p2
-        movdqa      xmm6,                   [rsi+2*rax]       ; p1
-%else
-        movlps      xmm2,                   [rsi + rax]       ; p3
-        movlps      xmm4,                   [rsi]             ; p2
-        movlps      xmm6,                   [rsi + rcx]       ; p1
-
-        movhps      xmm2,                   [rdi + rax]
-        movhps      xmm4,                   [rdi]
-        movhps      xmm6,                   [rdi + rcx]
-
-        movdqa      XMMWORD PTR [rsp + 32], xmm4              ; store p2
-        movdqa      XMMWORD PTR [rsp + 48], xmm6              ; store p1
-%endif
-
-        movdqa      xmm5,                   xmm4              ; p2
-        movdqa      xmm3,                   xmm6              ; p1
-
-        psubusb     xmm4,                   xmm2              ; p2-=p3
-        psubusb     xmm2,                   xmm5              ; p3-=p2
-
-        psubusb     xmm3,                   xmm5              ; p1-=p2
-        pmaxub      xmm1,                   xmm4              ; abs(p3 - p2)
-
-        psubusb     xmm5,                   xmm6              ; p2-=p1
-        pmaxub      xmm1,                   xmm2              ; abs(p3 - p2)
-
-        pmaxub      xmm1,                   xmm5              ; abs(p2 - p1)
-        movdqa      xmm2,                   xmm6              ; p1
-
-        pmaxub      xmm1,                   xmm3              ; abs(p2 - p1)
-%if %1
-        movdqa      xmm4,                   [rsi+rax]         ; p0
-        movdqa      xmm3,                   [rdi]             ; q1
-%else
-        movlps      xmm4,                   [rsi + rcx*2]     ; p0
-        movhps      xmm4,                   [rdi + rcx*2]
-        movdqa      xmm3,                   q1                ; q1
-%endif
-
-        movdqa      xmm5,                   xmm4              ; p0
-        psubusb     xmm4,                   xmm6              ; p0-=p1
-
-        psubusb     xmm6,                   xmm5              ; p1-=p0
-
-        por         xmm6,                   xmm4              ; abs(p1 - p0)
-        mov         rdx,                    arg(2)            ; get blimit
-
-        movdqa        t1,                   xmm6              ; save to t1
-
-        movdqa      xmm4,                   xmm3              ; q1
-        pmaxub      xmm1,                   xmm6
-
-        psubusb     xmm3,                   xmm2              ; q1-=p1
-        psubusb     xmm2,                   xmm4              ; p1-=q1
-
-        psubusb     xmm1,                   xmm7
-        por         xmm2,                   xmm3              ; abs(p1-q1)
-
-        movdqa      xmm7,                   XMMWORD PTR [rdx] ; blimit
-
-        movdqa      xmm3,                   xmm0              ; q0
-        pand        xmm2,                   [GLOBAL(tfe)]     ; set lsb of each byte to zero
-
-        mov         rdx,                    arg(4)            ; hev get thresh
-
-        movdqa      xmm6,                   xmm5              ; p0
-        psrlw       xmm2,                   1                 ; abs(p1-q1)/2
-
-        psubusb     xmm5,                   xmm3              ; p0-=q0
-
-        psubusb     xmm3,                   xmm6              ; q0-=p0
-        por         xmm5,                   xmm3              ; abs(p0 - q0)
-
-        paddusb     xmm5,                   xmm5              ; abs(p0-q0)*2
-
-        movdqa      xmm4,                   t0                ; hev get abs (q1 - q0)
-
-        movdqa      xmm3,                   t1                ; get abs (p1 - p0)
-
-        paddusb     xmm5,                   xmm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
-        movdqa      xmm2,                   XMMWORD PTR [rdx] ; hev
-
-        psubusb     xmm5,                   xmm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        psubusb     xmm4,                   xmm2              ; hev
-
-        psubusb     xmm3,                   xmm2              ; hev
-        por         xmm1,                   xmm5
-
-        pxor        xmm7,                   xmm7
-        paddb       xmm4,                   xmm3              ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
-
-        pcmpeqb     xmm4,                   xmm5              ; hev
-        pcmpeqb     xmm3,                   xmm3              ; hev
-
-        pcmpeqb     xmm1,                   xmm7              ; mask xmm1
-        pxor        xmm4,                   xmm3              ; hev
-%endmacro
-
-%macro B_FILTER 1
-%if %1 == 0
-        movdqa      xmm2,                   p1                ; p1
-        movdqa      xmm7,                   q1                ; q1
-%elif %1 == 1
-        movdqa      xmm2,                   [rsi+2*rax]       ; p1
-        movdqa      xmm7,                   [rdi]             ; q1
-%elif %1 == 2
-        lea         rdx,                    srct
-
-        movdqa      xmm2,                   [rdx]             ; p1
-        movdqa      xmm7,                   [rdx+48]          ; q1
-        movdqa      xmm6,                   [rdx+16]          ; p0
-        movdqa      xmm0,                   [rdx+32]          ; q0
-%endif
-
-        pxor        xmm2,                   [GLOBAL(t80)]     ; p1 offset to convert to signed values
-        pxor        xmm7,                   [GLOBAL(t80)]     ; q1 offset to convert to signed values
-
-        psubsb      xmm2,                   xmm7              ; p1 - q1
-        pxor        xmm6,                   [GLOBAL(t80)]     ; offset to convert to signed values
-
-        pand        xmm2,                   xmm4              ; high var mask (hvm)(p1 - q1)
-        pxor        xmm0,                   [GLOBAL(t80)]     ; offset to convert to signed values
-
-        movdqa      xmm3,                   xmm0              ; q0
-        psubsb      xmm0,                   xmm6              ; q0 - p0
-
-        paddsb      xmm2,                   xmm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
-
-        paddsb      xmm2,                   xmm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
-
-        paddsb      xmm2,                   xmm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
-
-        pand        xmm1,                   xmm2              ; mask filter values we don't care about
-
-        movdqa      xmm2,                   xmm1
-
-        paddsb      xmm1,                   [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
-        paddsb      xmm2,                   [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3
-
-        punpckhbw   xmm5,                   xmm2              ; axbxcxdx
-        punpcklbw   xmm2,                   xmm2              ; exfxgxhx
-
-        punpcklbw   xmm0,                   xmm1              ; exfxgxhx
-        psraw       xmm5,                   11                ; sign extended shift right by 3
-
-        punpckhbw   xmm1,                   xmm1              ; axbxcxdx
-        psraw       xmm2,                   11                ; sign extended shift right by 3
-
-        packsswb    xmm2,                   xmm5              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
-        psraw       xmm0,                   11                ; sign extended shift right by 3
-
-        psraw       xmm1,                   11                ; sign extended shift right by 3
-        movdqa      xmm5,                   xmm0              ; save results
-
-        packsswb    xmm0,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
-        paddsw      xmm5,                   [GLOBAL(ones)]
-
-        paddsw      xmm1,                   [GLOBAL(ones)]
-        psraw       xmm5,                   1                 ; partial shifted one more time for 2nd tap
-
-        psraw       xmm1,                   1                 ; partial shifted one more time for 2nd tap
-
-        paddsb      xmm6,                   xmm2              ; p0+= p0 add
-        packsswb    xmm5,                   xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
-
-%if %1 == 0
-        movdqa      xmm1,                   p1                ; p1
-%elif %1 == 1
-        movdqa      xmm1,                   [rsi+2*rax]       ; p1
-%elif %1 == 2
-        movdqa      xmm1,                   [rdx]             ; p1
-%endif
-        pandn       xmm4,                   xmm5              ; high edge variance additive
-        pxor        xmm6,                   [GLOBAL(t80)]     ; unoffset
-
-        pxor        xmm1,                   [GLOBAL(t80)]     ; reoffset
-        psubsb      xmm3,                   xmm0              ; q0-= q0 add
-
-        paddsb      xmm1,                   xmm4              ; p1+= p1 add
-        pxor        xmm3,                   [GLOBAL(t80)]     ; unoffset
-
-        pxor        xmm1,                   [GLOBAL(t80)]     ; unoffset
-        psubsb      xmm7,                   xmm4              ; q1-= q1 add
-
-        pxor        xmm7,                   [GLOBAL(t80)]     ; unoffset
-%if %1 == 0
-        lea         rsi,                    [rsi + rcx*2]
-        lea         rdi,                    [rdi + rcx*2]
-        movq        MMWORD PTR [rsi],       xmm6              ; p0
-        movhps      MMWORD PTR [rdi],       xmm6
-        movq        MMWORD PTR [rsi + rax], xmm1              ; p1
-        movhps      MMWORD PTR [rdi + rax], xmm1
-        movq        MMWORD PTR [rsi + rcx], xmm3              ; q0
-        movhps      MMWORD PTR [rdi + rcx], xmm3
-        movq        MMWORD PTR [rsi + rcx*2],xmm7             ; q1
-        movhps      MMWORD PTR [rdi + rcx*2],xmm7
-%elif %1 == 1
-        movdqa      [rsi+rax],              xmm6              ; write back
-        movdqa      [rsi+2*rax],            xmm1              ; write back
-        movdqa      [rsi],                  xmm3              ; write back
-        movdqa      [rdi],                  xmm7              ; write back
-%endif
-
-%endmacro
-
-
-;void vp9_loop_filter_horizontal_edge_sse2
-;(
-;    unsigned char *src_ptr,
-;    int            src_pixel_step,
-;    const char    *blimit,
-;    const char    *limit,
-;    const char    *thresh,
-;    int            count
-;)
-global sym(vp9_loop_filter_horizontal_edge_sse2) PRIVATE
-sym(vp9_loop_filter_horizontal_edge_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 32     ; reserve 32 bytes
-    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
-    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];
-
-        mov         rsi,                    arg(0)           ;src_ptr
-        movsxd      rax,                    dword ptr arg(1) ;src_pixel_step
-
-        mov         rdx,                    arg(3)           ;limit
-        movdqa      xmm7,                   XMMWORD PTR [rdx]
-
-        lea         rdi,                    [rsi+rax]        ; rdi points to row +1 for indirect addressing
-
-        ; calculate breakout conditions and high edge variance
-        LFH_FILTER_AND_HEV_MASK 1
-        ; filter and write back the result
-        B_FILTER 1
-
-    add rsp, 32
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_loop_filter_horizontal_edge_uv_sse2
-;(
-;    unsigned char *src_ptr,
-;    int            src_pixel_step,
-;    const char    *blimit,
-;    const char    *limit,
-;    const char    *thresh,
-;    int            count
-;)
-global sym(vp9_loop_filter_horizontal_edge_uv_sse2) PRIVATE
-sym(vp9_loop_filter_horizontal_edge_uv_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 96       ; reserve 96 bytes
-    %define q2  [rsp + 0]     ;__declspec(align(16)) char q2[16];
-    %define q1  [rsp + 16]    ;__declspec(align(16)) char q1[16];
-    %define p2  [rsp + 32]    ;__declspec(align(16)) char p2[16];
-    %define p1  [rsp + 48]    ;__declspec(align(16)) char p1[16];
-    %define t0  [rsp + 64]    ;__declspec(align(16)) char t0[16];
-    %define t1  [rsp + 80]    ;__declspec(align(16)) char t1[16];
-
-        mov         rsi,                    arg(0)             ; u
-        mov         rdi,                    arg(5)             ; v
-        movsxd      rax,                    dword ptr arg(1)   ; src_pixel_step
-        mov         rcx,                    rax
-        neg         rax                     ; negate pitch to deal with above border
-
-        mov         rdx,                    arg(3)             ;limit
-        movdqa      xmm7,                   XMMWORD PTR [rdx]
-
-        lea         rsi,                    [rsi + rcx]
-        lea         rdi,                    [rdi + rcx]
-
-        ; calculate breakout conditions and high edge variance
-        LFH_FILTER_AND_HEV_MASK 0
-        ; filter and write back the result
-        B_FILTER 0
-
-    add rsp, 96
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-%macro TRANSPOSE_16X8 2
-        movq        xmm4,               QWORD PTR [rsi]        ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
-        movq        xmm1,               QWORD PTR [rdi]        ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
-        movq        xmm0,               QWORD PTR [rsi+2*rax]  ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
-        movq        xmm7,               QWORD PTR [rdi+2*rax]  ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
-        movq        xmm5,               QWORD PTR [rsi+4*rax]  ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
-        movq        xmm2,               QWORD PTR [rdi+4*rax]  ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50
-
-        punpcklbw   xmm4,               xmm1            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
-
-        movq        xmm1,               QWORD PTR [rdi+2*rcx]  ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
-
-        movdqa      xmm3,               xmm4            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
-        punpcklbw   xmm0,               xmm7            ; 37 27 36 36 35 25 34 24 33 23 32 22 31 21 30 20
-
-        movq        xmm7,               QWORD PTR [rsi+2*rcx]  ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60
-
-        punpcklbw   xmm5,               xmm2            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
-%if %1
-        lea         rsi,                [rsi+rax*8]
-%else
-        mov         rsi,                arg(5)          ; v_ptr
-%endif
-
-        movdqa      xmm6,               xmm5            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
-        punpcklbw   xmm7,               xmm1            ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
-
-        punpcklwd   xmm5,               xmm7            ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
-
-        punpckhwd   xmm6,               xmm7            ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
-%if %1
-        lea         rdi,                [rdi+rax*8]
-%else
-        lea         rsi,                [rsi - 4]
-%endif
-
-        punpcklwd   xmm3,               xmm0            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
-%if %1
-        lea         rdx,                srct
-%else
-        lea         rdi,                [rsi + rax]     ; rdi points to row +1 for indirect addressing
-%endif
-
-        movdqa      xmm2,               xmm3            ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
-        punpckhwd   xmm4,               xmm0            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
-
-        movdqa      xmm7,               xmm4            ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
-        punpckhdq   xmm3,               xmm5            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-
-        punpckhdq   xmm7,               xmm6            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
-
-        punpckldq   xmm4,               xmm6            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
-
-        punpckldq   xmm2,               xmm5            ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
-
-        movdqa      t0,                 xmm2            ; save to free XMM2
-        movq        xmm2,               QWORD PTR [rsi]       ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
-        movq        xmm6,               QWORD PTR [rdi]       ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
-        movq        xmm0,               QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
-        movq        xmm5,               QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
-        movq        xmm1,               QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
-
-        punpcklbw   xmm2,               xmm6            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
-
-        movq        xmm6,               QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0
-
-        punpcklbw   xmm0,               xmm5                  ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
-
-        movq        xmm5,               QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
-
-        punpcklbw   xmm1,               xmm6            ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 e1 d0 c0
-
-        movq        xmm6,               QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
-
-        punpcklbw   xmm5,               xmm6            ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
-
-        movdqa      xmm6,               xmm1            ;
-        punpckhwd   xmm6,               xmm5            ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4
-
-        punpcklwd   xmm1,               xmm5            ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
-        movdqa      xmm5,               xmm2            ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
-
-        punpcklwd   xmm5,               xmm0            ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
-
-        punpckhwd   xmm2,               xmm0            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
-
-        movdqa      xmm0,               xmm5
-        punpckldq   xmm0,               xmm1            ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
-
-        punpckhdq   xmm5,               xmm1            ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
-        movdqa      xmm1,               xmm2            ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
-
-        punpckldq   xmm1,               xmm6            ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84
-
-        punpckhdq   xmm2,               xmm6            ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
-        movdqa      xmm6,               xmm7            ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
-
-        punpcklqdq  xmm6,               xmm2            ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
-
-        punpckhqdq  xmm7,               xmm2            ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
-%if %2
-        movdqa      xmm2,               xmm3            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-        punpcklqdq  xmm2,               xmm5            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-
-        punpckhqdq  xmm3,               xmm5            ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
-
-        movdqa      [rdx],              xmm2            ; save 2
-
-        movdqa      xmm5,               xmm4            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
-        punpcklqdq  xmm4,               xmm1            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
-
-        movdqa      [rdx+16],           xmm3            ; save 3
-
-        punpckhqdq  xmm5,               xmm1            ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
-
-        movdqa      [rdx+32],           xmm4            ; save 4
-        movdqa      [rdx+48],           xmm5            ; save 5
-        movdqa      xmm1,               t0              ; get
-
-        movdqa      xmm2,               xmm1            ;
-        punpckhqdq  xmm1,               xmm0            ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
-
-        punpcklqdq  xmm2,               xmm0            ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-%else
-        movdqa      [rdx+112],          xmm7            ; save 7
-
-        movdqa      [rdx+96],           xmm6            ; save 6
-
-        movdqa      xmm2,               xmm3            ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-        punpckhqdq  xmm3,               xmm5            ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
-
-        punpcklqdq  xmm2,               xmm5            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-
-        movdqa      [rdx+32],           xmm2            ; save 2
-
-        movdqa      xmm5,               xmm4            ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
-        punpcklqdq  xmm4,               xmm1            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
-
-        movdqa      [rdx+48],           xmm3            ; save 3
-
-        punpckhqdq  xmm5,               xmm1            ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
-
-        movdqa      [rdx+64],           xmm4            ; save 4
-        movdqa      [rdx+80],           xmm5            ; save 5
-        movdqa      xmm1,               t0              ; get
-
-        movdqa      xmm2,               xmm1
-        punpckhqdq  xmm1,               xmm0            ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
-
-        punpcklqdq  xmm2,               xmm0            ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-
-        movdqa      [rdx+16],           xmm1
-
-        movdqa      [rdx],              xmm2
-%endif
-%endmacro
-
-%macro LFV_FILTER_MASK_HEV_MASK 1
-        movdqa      xmm0,               xmm6            ; q2
-        psubusb     xmm0,               xmm7            ; q2-q3
-
-        psubusb     xmm7,               xmm6            ; q3-q2
-        movdqa      xmm4,               xmm5            ; q1
-
-        por         xmm7,               xmm0            ; abs (q3-q2)
-        psubusb     xmm4,               xmm6            ; q1-q2
-
-        movdqa      xmm0,               xmm1
-        psubusb     xmm6,               xmm5            ; q2-q1
-
-        por         xmm6,               xmm4            ; abs (q2-q1)
-        psubusb     xmm0,               xmm2            ; p2 - p3;
-
-        psubusb     xmm2,               xmm1            ; p3 - p2;
-        por         xmm0,               xmm2            ; abs(p2-p3)
-%if %1
-        movdqa      xmm2,               [rdx]           ; p1
-%else
-        movdqa      xmm2,               [rdx+32]        ; p1
-%endif
-        movdqa      xmm5,               xmm2            ; p1
-        pmaxub      xmm0,               xmm7
-
-        psubusb     xmm5,               xmm1            ; p1-p2
-        psubusb     xmm1,               xmm2            ; p2-p1
-
-        movdqa      xmm7,               xmm3            ; p0
-        psubusb     xmm7,               xmm2            ; p0-p1
-
-        por         xmm1,               xmm5            ; abs(p2-p1)
-        pmaxub      xmm0,               xmm6
-
-        pmaxub      xmm0,               xmm1
-        movdqa      xmm1,               xmm2            ; p1
-
-        psubusb     xmm2,               xmm3            ; p1-p0
-        lea         rdx,                srct
-
-        por         xmm2,               xmm7            ; abs(p1-p0)
-
-        movdqa      t0,                 xmm2            ; save abs(p1-p0)
-
-        pmaxub      xmm0,               xmm2
-
-%if %1
-        movdqa      xmm5,               [rdx+32]        ; q0
-        movdqa      xmm7,               [rdx+48]        ; q1
-%else
-        movdqa      xmm5,               [rdx+64]        ; q0
-        movdqa      xmm7,               [rdx+80]        ; q1
-%endif
-        mov         rdx,                arg(3)          ; limit
-
-        movdqa      xmm6,               xmm5            ; q0
-        movdqa      xmm2,               xmm7            ; q1
-
-        psubusb     xmm5,               xmm7            ; q0-q1
-        psubusb     xmm7,               xmm6            ; q1-q0
-
-        por         xmm7,               xmm5            ; abs(q1-q0)
-
-        movdqa      t1,                 xmm7            ; save abs(q1-q0)
-
-        movdqa      xmm4,               XMMWORD PTR [rdx]; limit
-
-        pmaxub      xmm0,               xmm7
-        mov         rdx,                arg(2)          ; blimit
-
-        psubusb     xmm0,               xmm4
-        movdqa      xmm5,               xmm2            ; q1
-
-        psubusb     xmm5,               xmm1            ; q1-=p1
-        psubusb     xmm1,               xmm2            ; p1-=q1
-
-        por         xmm5,               xmm1            ; abs(p1-q1)
-        movdqa      xmm1,               xmm3            ; p0
-
-        pand        xmm5,               [GLOBAL(tfe)]   ; set lsb of each byte to zero
-        psubusb     xmm1,               xmm6            ; p0-q0
-
-        psrlw       xmm5,               1               ; abs(p1-q1)/2
-        psubusb     xmm6,               xmm3            ; q0-p0
-
-        movdqa      xmm4,               XMMWORD PTR [rdx]; blimit
-
-        mov         rdx,                arg(4)          ; get thresh
-
-        por         xmm1,               xmm6            ; abs(q0-p0)
-
-        movdqa      xmm6,               t0              ; get abs (q1 - q0)
-
-        paddusb     xmm1,               xmm1            ; abs(q0-p0)*2
-
-        movdqa      xmm3,               t1              ; get abs (p1 - p0)
-
-        movdqa      xmm7,               XMMWORD PTR [rdx]
-
-        paddusb     xmm1,               xmm5            ; abs (p0 - q0) *2 + abs(p1-q1)/2
-        psubusb     xmm6,               xmm7            ; abs(q1 - q0) > thresh
-
-        psubusb     xmm3,               xmm7            ; abs(p1 - p0)> thresh
-
-        psubusb     xmm1,               xmm4            ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
-        por         xmm6,               xmm3            ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
-
-        por         xmm1,               xmm0            ; mask
-        pcmpeqb     xmm6,               xmm0
-
-        pxor        xmm0,               xmm0
-        pcmpeqb     xmm4,               xmm4
-
-        pcmpeqb     xmm1,               xmm0
-        pxor        xmm4,               xmm6
-%endmacro
-
-%macro BV_TRANSPOSE 0
-        ; xmm1 =    f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-        ; xmm6 =    f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
-        ; xmm3 =    f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
-        ; xmm7 =    f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
-        movdqa      xmm2,               xmm1            ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-        punpcklbw   xmm2,               xmm6            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
-
-        movdqa      xmm4,               xmm3            ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
-        punpckhbw   xmm1,               xmm6            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
-
-        punpcklbw   xmm4,               xmm7            ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
-
-        punpckhbw   xmm3,               xmm7            ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
-
-        movdqa      xmm6,               xmm2            ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
-        punpcklwd   xmm2,               xmm4            ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
-
-        punpckhwd   xmm6,               xmm4            ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
-        movdqa      xmm5,               xmm1            ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
-
-        punpcklwd   xmm1,               xmm3            ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
-
-        punpckhwd   xmm5,               xmm3            ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
-        ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
-        ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
-        ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
-        ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
-%endmacro
-
-%macro BV_WRITEBACK 2
-        movd        [rsi+2],            %1
-        psrldq      %1,                 4
-
-        movd        [rdi+2],            %1
-        psrldq      %1,                 4
-
-        movd        [rsi+2*rax+2],      %1
-        psrldq      %1,                 4
-
-        movd        [rdi+2*rax+2],      %1
-
-        movd        [rsi+4*rax+2],      %2
-        psrldq      %2,                 4
-
-        movd        [rdi+4*rax+2],      %2
-        psrldq      %2,                 4
-
-        movd        [rsi+2*rcx+2],      %2
-        psrldq      %2,                 4
-
-        movd        [rdi+2*rcx+2],      %2
-%endmacro
-
-
-;void vp9_loop_filter_vertical_edge_sse2
-;(
-;    unsigned char *src_ptr,
-;    int            src_pixel_step,
-;    const char    *blimit,
-;    const char    *limit,
-;    const char    *thresh,
-;    int            count
-;)
-global sym(vp9_loop_filter_vertical_edge_sse2) PRIVATE
-sym(vp9_loop_filter_vertical_edge_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub             rsp, 96      ; reserve 96 bytes
-    %define t0      [rsp + 0]    ;__declspec(align(16)) char t0[16];
-    %define t1      [rsp + 16]   ;__declspec(align(16)) char t1[16];
-    %define srct    [rsp + 32]   ;__declspec(align(16)) char srct[64];
-
-        mov         rsi,        arg(0)                  ; src_ptr
-        movsxd      rax,        dword ptr arg(1)        ; src_pixel_step
-
-        lea         rsi,        [rsi - 4]
-        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
-        lea         rcx,        [rax*2+rax]
-
-        ;transpose 16x8 to 8x16, and store the 8-line result on stack.
-        TRANSPOSE_16X8 1, 1
-
-        ; calculate filter mask and high edge variance
-        LFV_FILTER_MASK_HEV_MASK 1
-
-        ; start work on filters
-        B_FILTER 2
-
-        ; tranpose and write back - only work on q1, q0, p0, p1
-        BV_TRANSPOSE
-        ; store 16-line result
-
-        lea         rdx,        [rax]
-        neg         rdx
-
-        BV_WRITEBACK xmm1, xmm5
-
-        lea         rsi,        [rsi+rdx*8]
-        lea         rdi,        [rdi+rdx*8]
-        BV_WRITEBACK xmm2, xmm6
-
-    add rsp, 96
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_loop_filter_vertical_edge_uv_sse2
-;(
-;    unsigned char *u,
-;    int            src_pixel_step,
-;    const char    *blimit,
-;    const char    *limit,
-;    const char    *thresh,
-;    unsigned char *v
-;)
-global sym(vp9_loop_filter_vertical_edge_uv_sse2) PRIVATE
-sym(vp9_loop_filter_vertical_edge_uv_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub             rsp, 96      ; reserve 96 bytes
-    %define t0      [rsp + 0]    ;__declspec(align(16)) char t0[16];
-    %define t1      [rsp + 16]   ;__declspec(align(16)) char t1[16];
-    %define srct    [rsp + 32]   ;__declspec(align(16)) char srct[64];
-
-        mov         rsi,        arg(0)                  ; u_ptr
-        movsxd      rax,        dword ptr arg(1)        ; src_pixel_step
-
-        lea         rsi,        [rsi - 4]
-        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
-        lea         rcx,        [rax+2*rax]
-
-        lea         rdx,        srct
-
-        ;transpose 16x8 to 8x16, and store the 8-line result on stack.
-        TRANSPOSE_16X8 0, 1
-
-        ; calculate filter mask and high edge variance
-        LFV_FILTER_MASK_HEV_MASK 1
-
-        ; start work on filters
-        B_FILTER 2
-
-        ; tranpose and write back - only work on q1, q0, p0, p1
-        BV_TRANSPOSE
-
-        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
-
-        ; store 16-line result
-        BV_WRITEBACK xmm1, xmm5
-
-        mov         rsi,        arg(0)                  ; u_ptr
-        lea         rsi,        [rsi - 4]
-        lea         rdi,        [rsi + rax]             ; rdi points to row +1 for indirect addressing
-        BV_WRITEBACK xmm2, xmm6
-
-    add rsp, 96
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-tfe:
-    times 16 db 0xfe
-align 16
-t80:
-    times 16 db 0x80
-align 16
-t1s:
-    times 16 db 0x01
-align 16
-t3:
-    times 16 db 0x03
-align 16
-t4:
-    times 16 db 0x04
-align 16
-ones:
-    times 8 dw 0x0001
-align 16
-s9:
-    times 8 dw 0x0900
-align 16
-s63:
-    times 8 dw 0x003f
--- a/vp9/common/x86/vp9_loopfilter_x86.h
+++ b/vp9/common/x86/vp9_loopfilter_x86.h
@ -1,35 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_COMMON_X86_VP9_LOOPFILTER_X86_H_
-#define VP9_COMMON_X86_VP9_LOOPFILTER_X86_H_
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code
- */
-
-#if HAVE_MMX
-extern prototype_loopfilter_block(vp9_loop_filter_mbv_mmx);
-extern prototype_loopfilter_block(vp9_loop_filter_bv_mmx);
-extern prototype_loopfilter_block(vp9_loop_filter_mbh_mmx);
-extern prototype_loopfilter_block(vp9_loop_filter_bh_mmx);
-#endif
-
-#if HAVE_SSE2
-extern prototype_loopfilter_block(vp9_loop_filter_mbv_sse2);
-extern prototype_loopfilter_block(vp9_loop_filter_bv_sse2);
-extern prototype_loopfilter_block(vp9_loop_filter_mbh_sse2);
-extern prototype_loopfilter_block(vp9_loop_filter_bh_sse2);
-#endif
-
-#endif  // LOOPFILTER_X86_H
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@ -68,7 +68,6 @@ VP9_COMMON_SRCS-yes += common/vp9_treecoder.c
 VP9_COMMON_SRCS-yes += common/vp9_common_data.c
 VP9_COMMON_SRCS-yes += common/vp9_common_data.h

-VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_x86.h
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c
@ -76,7 +75,6 @@ VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.h
 VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.c
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_copy_sse2.asm
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_loopfilter_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_intrapred_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_intrapred_ssse3.asm
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm