Merge "Make coefficient skip condition an explicit RD choice."

Ronald S. Bultje 2013-06-28 11:54:28 -07:00, committed by Gerrit Code Review
Parents: 0345fc3ad9 af660715c0
Commit: ec5d09b950
6 changed files with 115 additions and 91 deletions
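
In short: the quantizer's zero-run zbin boost (zrun_zbin_boost), which raised the dead-zone threshold after runs of zero coefficients, is removed, leaving a flat zbin. In its place, vp9_block_error() gains an int64_t *ssz output that returns the sum of squared source coefficients, i.e. the distortion a block would incur if all of its coefficients were zeroed out. The inter-mode RD loop in vp9_rd_pick_inter_mode_sb() uses that value to compare coding the coefficients against skipping the block outright, and sets the skip flag whenever skipping has the lower rate-distortion cost.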

View file

@@ -558,7 +558,7 @@ prototype unsigned int vp9_get_mb_ss "const int16_t *"
 specialize vp9_get_mb_ss mmx sse2
 
 # ENCODEMB INVOKE
-prototype int64_t vp9_block_error "int16_t *coeff, int16_t *dqcoeff, intptr_t block_size"
+prototype int64_t vp9_block_error "int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz"
 specialize vp9_block_error sse2
 
 prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"

View file

@@ -68,7 +68,6 @@ struct macroblock_plane {
   int16_t *quant;
   uint8_t *quant_shift;
   int16_t *zbin;
-  int16_t *zrun_zbin_boost;
   int16_t *round;
 
   // Zbin Over Quant value

View file

@@ -268,11 +268,7 @@ typedef struct VP9_COMP {
   DECLARE_ALIGNED(16, unsigned char, a_quant_shift[QINDEX_RANGE][16]);
   DECLARE_ALIGNED(16, short, a_zbin[QINDEX_RANGE][16]);
   DECLARE_ALIGNED(16, short, a_round[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_a[QINDEX_RANGE][16]);
 #endif
-
-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]);
 
   MACROBLOCK mb;
   VP9_COMMON common;

View file

@@ -21,8 +21,7 @@
 extern int enc_debug;
 #endif
 
-static void quantize(int16_t *zbin_boost_orig_ptr,
-                     int16_t *coeff_ptr, int n_coeffs, int skip_block,
+static void quantize(int16_t *coeff_ptr, int n_coeffs, int skip_block,
                      int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr,
                      uint8_t *quant_shift_ptr,
                      int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
@@ -31,8 +30,6 @@ static void quantize(int16_t *zbin_boost_orig_ptr,
   int i, rc, eob;
   int zbins[2], nzbins[2], zbin;
   int x, y, z, sz;
-  int zero_run = 0;
-  int16_t *zbin_boost_ptr = zbin_boost_orig_ptr;
   int zero_flag = n_coeffs;
 
   vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
@@ -65,8 +62,7 @@ static void quantize(int16_t *zbin_boost_orig_ptr,
       rc = scan[i];
       z = coeff_ptr[rc];
 
-      zbin = (zbins[rc != 0] + zbin_boost_ptr[zero_run]);
-      zero_run += (zero_run < 15);
+      zbin = (zbins[rc != 0]);
 
       sz = (z >> 31);  // sign of z
       x = (z ^ sz) - sz;
@@ -81,7 +77,6 @@ static void quantize(int16_t *zbin_boost_orig_ptr,
 
         if (y) {
           eob = i;  // last nonzero coeffs
-          zero_run = 0;  // set zero_run
         }
       }
     }
@@ -90,8 +85,7 @@ static void quantize(int16_t *zbin_boost_orig_ptr,
 }
 
 // This function works well for large transform size.
-static void quantize_sparse(int16_t *zbin_boost_orig_ptr,
-                            int16_t *coeff_ptr, int n_coeffs, int skip_block,
+static void quantize_sparse(int16_t *coeff_ptr, int n_coeffs, int skip_block,
                             int16_t *zbin_ptr, int16_t *round_ptr,
                             int16_t *quant_ptr, uint8_t *quant_shift_ptr,
                             int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
@@ -101,10 +95,7 @@ static void quantize_sparse(int16_t *zbin_boost_orig_ptr,
   int i, rc, eob;
   int zbins[2], nzbins[2], zbin;
   int x, y, z, sz;
-  int zero_run = 0;
-  int16_t *zbin_boost_ptr = zbin_boost_orig_ptr;
   int idx = 0;
-  int pre_idx = 0;
 
   vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
   vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
@@ -135,11 +126,8 @@ static void quantize_sparse(int16_t *zbin_boost_orig_ptr,
       rc = scan[idx_arr[i]];
 
       // Calculate ZBIN
-      zero_run += idx_arr[i] - pre_idx;
-      if(zero_run > 15) zero_run = 15;
-      zbin = (zbins[rc != 0] + zbin_boost_ptr[zero_run]);
-      pre_idx = idx_arr[i];
+      zbin = (zbins[rc != 0]);
 
       z = coeff_ptr[rc] * 2;
       sz = (z >> 31);  // sign of z
       x = (z ^ sz) - sz;  // x = abs(z)
@@ -155,7 +143,6 @@ static void quantize_sparse(int16_t *zbin_boost_orig_ptr,
 
       if (y) {
         eob = idx_arr[i];  // last nonzero coeffs
-        zero_run = -1;  // set zero_run
       }
     }
   }
@@ -189,8 +176,7 @@ void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs,
     // Save index of picked coefficient in pre-scan pass.
     int idx_arr[1024];
 
-    quantize_sparse(mb->plane[plane].zrun_zbin_boost,
-                    BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
+    quantize_sparse(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
                     n_coeffs, mb->skip_block,
                     mb->plane[plane].zbin,
                     mb->plane[plane].round,
@@ -204,8 +190,7 @@ void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs,
                     scan, idx_arr);
   }
   else {
-    quantize(mb->plane[plane].zrun_zbin_boost,
-             BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
+    quantize(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
              n_coeffs, mb->skip_block,
              mb->plane[plane].zbin,
             mb->plane[plane].round,
@@ -226,8 +211,7 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
   const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
   const int *pt_scan = get_scan_4x4(tx_type);
 
-  quantize(mb->plane[pb_idx.plane].zrun_zbin_boost,
-           BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16),
+  quantize(BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block, 16),
            16, mb->skip_block,
            mb->plane[pb_idx.plane].zbin,
            mb->plane[pb_idx.plane].round,
@@ -261,9 +245,6 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
 #endif
   int q;
 
-  static const int zbin_boost[16] = { 0, 0, 0, 8, 8, 8, 10, 12,
-                                      14, 16, 20, 24, 28, 32, 36, 40 };
-
   for (q = 0; q < QINDEX_RANGE; q++) {
     int qzbin_factor = (vp9_dc_quant(q, 0) < 148) ? 84 : 80;
     int qrounding_factor = 48;
@@ -277,14 +258,12 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
     cpi->y_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
     cpi->y_round[q][0] = (qrounding_factor * quant_val) >> 7;
     cpi->common.y_dequant[q][0] = quant_val;
-    cpi->zrun_zbin_boost_y[q][0] = (quant_val * zbin_boost[0]) >> 7;
 
     quant_val = vp9_dc_quant(q, cpi->common.uv_dc_delta_q);
     invert_quant(cpi->uv_quant[q] + 0, cpi->uv_quant_shift[q] + 0, quant_val);
     cpi->uv_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
     cpi->uv_round[q][0] = (qrounding_factor * quant_val) >> 7;
     cpi->common.uv_dequant[q][0] = quant_val;
-    cpi->zrun_zbin_boost_uv[q][0] = (quant_val * zbin_boost[0]) >> 7;
 
 #if CONFIG_ALPHA
     quant_val = vp9_dc_quant(q, cpi->common.a_dc_delta_q);
@@ -292,7 +271,6 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
     cpi->a_zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
     cpi->a_round[q][0] = (qrounding_factor * quant_val) >> 7;
     cpi->common.a_dequant[q][0] = quant_val;
-    cpi->zrun_zbin_boost_a[q][0] = (quant_val * zbin_boost[0]) >> 7;
 #endif
 
     quant_val = vp9_ac_quant(q, 0);
@@ -310,15 +288,11 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
       invert_quant(cpi->y_quant[q] + rc, cpi->y_quant_shift[q] + rc, quant_val);
       cpi->y_zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);
       cpi->y_round[q][rc] = (qrounding_factor * quant_val) >> 7;
-      cpi->zrun_zbin_boost_y[q][i] =
-          ROUND_POWER_OF_TWO(quant_val * zbin_boost[i], 7);
 
       invert_quant(cpi->uv_quant[q] + rc, cpi->uv_quant_shift[q] + rc,
                    quant_uv_val);
       cpi->uv_zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_uv_val, 7);
       cpi->uv_round[q][rc] = (qrounding_factor * quant_uv_val) >> 7;
-      cpi->zrun_zbin_boost_uv[q][i] =
-          ROUND_POWER_OF_TWO(quant_uv_val * zbin_boost[i], 7);
 
 #if CONFIG_ALPHA
       invert_quant(cpi->a_quant[q] + rc, cpi->a_quant_shift[q] + rc,
@@ -326,8 +300,6 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
       cpi->a_zbin[q][rc] =
           ROUND_POWER_OF_TWO(qzbin_factor * quant_alpha_val, 7);
       cpi->a_round[q][rc] = (qrounding_factor * quant_alpha_val) >> 7;
-      cpi->zrun_zbin_boost_a[q][i] =
-          ROUND_POWER_OF_TWO(quant_alpha_val * zbin_boost[i], 7);
 #endif
     }
   }
@@ -348,7 +320,6 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
   x->plane[0].quant_shift = cpi->y_quant_shift[qindex];
   x->plane[0].zbin = cpi->y_zbin[qindex];
   x->plane[0].round = cpi->y_round[qindex];
-  x->plane[0].zrun_zbin_boost = cpi->zrun_zbin_boost_y[qindex];
   x->plane[0].zbin_extra = (int16_t)zbin_extra;
   x->e_mbd.plane[0].dequant = cpi->common.y_dequant[qindex];
 
@@ -361,7 +332,6 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
     x->plane[i].quant_shift = cpi->uv_quant_shift[qindex];
     x->plane[i].zbin = cpi->uv_zbin[qindex];
     x->plane[i].round = cpi->uv_round[qindex];
-    x->plane[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[qindex];
     x->plane[i].zbin_extra = (int16_t)zbin_extra;
     x->e_mbd.plane[i].dequant = cpi->common.uv_dequant[qindex];
   }
@@ -371,7 +341,6 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
   x->plane[3].quant_shift = cpi->a_quant_shift[qindex];
   x->plane[3].zbin = cpi->a_zbin[qindex];
   x->plane[3].round = cpi->a_round[qindex];
-  x->plane[3].zrun_zbin_boost = cpi->zrun_zbin_boost_a[qindex];
   x->plane[3].zbin_extra = (int16_t)zbin_extra;
   x->e_mbd.plane[3].dequant = cpi->common.a_dequant[qindex];
 #endif

View file

@@ -283,15 +283,17 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
 }
 
 int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff,
-                          intptr_t block_size) {
+                          intptr_t block_size, int64_t *ssz) {
   int i;
-  int64_t error = 0;
+  int64_t error = 0, sqcoeff = 0;
 
   for (i = 0; i < block_size; i++) {
     int this_diff = coeff[i] - dqcoeff[i];
     error += (unsigned)this_diff * this_diff;
+    sqcoeff += (unsigned) coeff[i] * coeff[i];
   }
 
+  *ssz = sqcoeff;
   return error;
 }
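
For reference, the new C implementation from the hunk above, restated as a clean listing. The only behavioral addition is the *ssz output, the sum of squared source coefficients, which is exactly the distortion the block would incur if it were coded as all zeros:

#include <stdint.h>

int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff,
                          intptr_t block_size, int64_t *ssz) {
  int i;
  int64_t error = 0, sqcoeff = 0;

  for (i = 0; i < block_size; i++) {
    int this_diff = coeff[i] - dqcoeff[i];
    // The unsigned multiply is exact: a squared 17-bit difference fits in
    // 32 bits, and the result zero-extends into the 64-bit accumulator.
    error += (unsigned)this_diff * this_diff;
    sqcoeff += (unsigned)coeff[i] * coeff[i];
  }

  *ssz = sqcoeff;  // distortion of the all-zero (skip) alternative
  return error;
}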
@@ -501,27 +503,31 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
 }
 
 static int64_t block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
-                               int shift) {
+                               int shift, int64_t *sse) {
   struct macroblockd_plane *p = &x->e_mbd.plane[0];
   const int bw = plane_block_width(bsize, p);
   const int bh = plane_block_height(bsize, p);
-  return vp9_block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,
-                         bw * bh) >> shift;
+  int64_t e = vp9_block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,
+                              bw * bh, sse) >> shift;
+  *sse >>= shift;
+  return e;
 }
 
 static int64_t block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
-                                int shift) {
-  int64_t sum = 0;
+                                int shift, int64_t *sse) {
+  int64_t sum = 0, this_sse;
   int plane;
 
+  *sse = 0;
   for (plane = 1; plane < MAX_MB_PLANE; plane++) {
     struct macroblockd_plane *p = &x->e_mbd.plane[plane];
     const int bw = plane_block_width(bsize, p);
     const int bh = plane_block_height(bsize, p);
     sum += vp9_block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff,
-                           bw * bh);
+                           bw * bh, &this_sse);
+    *sse += this_sse;
   }
+  *sse >>= shift;
   return sum >> shift;
 }
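
Note that both helpers scale the new *sse output by the same shift as the returned distortion, so the skip term stays in the same units as the coded distortion when the two are later compared with RDCOST().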
@@ -581,7 +587,7 @@ static int rdcost_uv(VP9_COMMON *const cm, MACROBLOCK *x,
 
 static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
                                      int *rate, int64_t *distortion,
-                                     int *skippable,
+                                     int *skippable, int64_t *sse,
                                      BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
   xd->mode_info_context->mbmi.txfm_size = tx_size;
@@ -591,18 +597,18 @@ static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
   else
     vp9_xform_quant_sby(cm, x, bsize);
 
-  *distortion = block_error_sby(x, bsize, tx_size == TX_32X32 ? 0 : 2);
+  *distortion = block_error_sby(x, bsize, tx_size == TX_32X32 ? 0 : 2, sse);
   *rate = rdcost_plane(cm, x, 0, bsize, tx_size);
   *skippable = vp9_sby_is_skippable(xd, bsize);
 }
 
 static void super_block_yrd(VP9_COMP *cpi,
                             MACROBLOCK *x, int *rate, int64_t *distortion,
-                            int *skip, BLOCK_SIZE_TYPE bs,
+                            int *skip, int64_t *psse, BLOCK_SIZE_TYPE bs,
                             int64_t txfm_cache[NB_TXFM_MODES]) {
   VP9_COMMON *const cm = &cpi->common;
   int r[TX_SIZE_MAX_SB][2], s[TX_SIZE_MAX_SB];
-  int64_t d[TX_SIZE_MAX_SB];
+  int64_t d[TX_SIZE_MAX_SB], sse[TX_SIZE_MAX_SB];
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
 
@@ -621,25 +627,27 @@ static void super_block_yrd(VP9_COMP *cpi,
       mbmi->txfm_size = TX_4X4;
     }
     vpx_memset(txfm_cache, 0, NB_TXFM_MODES * sizeof(int64_t));
-    super_block_yrd_for_txfm(cm, x, rate, distortion, skip, bs,
+    super_block_yrd_for_txfm(cm, x, rate, distortion, skip, &sse[0], bs,
                              mbmi->txfm_size);
     return;
   }
 
   if (bs >= BLOCK_SIZE_SB32X32)
     super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32],
-                             bs, TX_32X32);
+                             &sse[TX_32X32], bs, TX_32X32);
   if (bs >= BLOCK_SIZE_MB16X16)
     super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16],
-                             bs, TX_16X16);
+                             &sse[TX_16X16], bs, TX_16X16);
-  super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], bs,
-                           TX_8X8);
-  super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], bs,
-                           TX_4X4);
+  super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],
+                           &sse[TX_8X8], bs, TX_8X8);
+  super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],
+                           &sse[TX_4X4], bs, TX_4X4);
 
   choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
                            skip, txfm_cache,
                            TX_32X32 - (bs < BLOCK_SIZE_SB32X32)
                            - (bs < BLOCK_SIZE_MB16X16));
+
+  if (psse)
+    *psse = sse[mbmi->txfm_size];
 }
 
 static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
@@ -688,6 +696,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
 
     for (idy = 0; idy < bh; ++idy) {
       for (idx = 0; idx < bw; ++idx) {
+        int64_t ssz;
+
         block = ib + idy * 2 + idx;
         xd->mode_info_context->bmi[block].as_mode.first = mode;
         src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
@@ -718,7 +728,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
         ratey += cost_coeffs(cm, x, 0, block, PLANE_TYPE_Y_WITH_DC,
                              tempa + idx, templ + idy, TX_4X4, 16);
         distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff,
-                                                          block, 16), 16) >> 2;
+                                                          block, 16),
+                                      16, &ssz) >> 2;
 
         if (best_tx_type != DCT_DCT)
           vp9_short_iht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block, 16),
@@ -881,7 +892,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
     }
 
     x->e_mbd.mode_info_context->mbmi.mode = mode;
-    super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s,
+    super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL,
                     bsize, local_txfm_cache);
 
     this_rate = this_rate_tokenonly + bmode_costs[mode];
@@ -914,22 +925,25 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
 static void super_block_uvrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
                                       int *rate, int64_t *distortion,
-                                      int *skippable, BLOCK_SIZE_TYPE bsize,
+                                      int *skippable, int64_t *sse,
+                                      BLOCK_SIZE_TYPE bsize,
                                       TX_SIZE uv_tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
+  int64_t dummy;
   if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME)
     vp9_encode_intra_block_uv(cm, x, bsize);
   else
     vp9_xform_quant_sbuv(cm, x, bsize);
 
-  *distortion = block_error_sbuv(x, bsize, uv_tx_size == TX_32X32 ? 0 : 2);
+  *distortion = block_error_sbuv(x, bsize, uv_tx_size == TX_32X32 ? 0 : 2,
+                                 sse ? sse : &dummy);
   *rate = rdcost_uv(cm, x, bsize, uv_tx_size);
   *skippable = vp9_sbuv_is_skippable(xd, bsize);
 }
 
 static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
                              int *rate, int64_t *distortion, int *skippable,
-                             BLOCK_SIZE_TYPE bsize) {
+                             int64_t *sse, BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
   TX_SIZE uv_txfm_size = get_uv_tx_size(mbmi);
@@ -937,7 +951,7 @@ static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
   if (mbmi->ref_frame[0] > INTRA_FRAME)
     vp9_subtract_sbuv(x, bsize);
 
-  super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, bsize,
+  super_block_uvrd_for_txfm(cm, x, rate, distortion, skippable, sse, bsize,
                             uv_txfm_size);
 }
@@ -954,7 +968,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
     x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
     super_block_uvrd(&cpi->common, x, &this_rate_tokenonly,
-                     &this_distortion, &s, bsize);
+                     &this_distortion, &s, NULL, bsize);
     this_rate = this_rate_tokenonly +
                 x->intra_uv_mode_cost[x->e_mbd.frame_type][mode];
     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
@@ -1151,6 +1165,8 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
   k = i;
   for (idy = 0; idy < bh / 4; ++idy) {
     for (idx = 0; idx < bw / 4; ++idx) {
+      int64_t ssz;
+
       k += (idy * 2 + idx);
       src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, k,
                                            x->plane[0].src_diff);
@@ -1159,7 +1175,7 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
       x->quantize_b_4x4(x, k, DCT_DCT, 16);
       thisdistortion += vp9_block_error(coeff,
                                         BLOCK_OFFSET(xd->plane[0].dqcoeff,
-                                                     k, 16), 16);
+                                                     k, 16), 16, &ssz);
       thisrate += cost_coeffs(cm, x, 0, k, PLANE_TYPE_Y_WITH_DC,
                               ta + (k & 1),
                               tl + (k >> 1), TX_4X4, 16);
@@ -2238,7 +2254,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                  INTERPOLATIONFILTERTYPE *best_filter,
                                  int_mv *frame_mv,
                                  int mi_row, int mi_col,
-                                 int_mv single_newmv[MAX_REF_FRAMES]) {
+                                 int_mv single_newmv[MAX_REF_FRAMES],
+                                 int64_t *psse) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
@@ -2467,17 +2484,19 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
   if (!x->skip) {
     int skippable_y, skippable_uv;
+    int64_t sseuv = INT_MAX;
 
     // Y cost and distortion
-    super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y,
+    super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse,
                     bsize, txfm_cache);
 
     *rate2 += *rate_y;
     *distortion += *distortion_y;
 
     super_block_uvrd(cm, x, rate_uv, distortion_uv,
-                     &skippable_uv, bsize);
+                     &skippable_uv, &sseuv, bsize);
+
+    *psse += sseuv;
     *rate2 += *rate_uv;
     *distortion += *distortion_uv;
     *skippable = skippable_y && skippable_uv;
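
With this change, *psse carries the full skip distortion for the block: super_block_yrd() writes the luma sum of squared coefficients for the chosen transform size, and the chroma part is accumulated on top via sseuv.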
@@ -2611,6 +2630,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   int bws = (1 << bwsl) / 4;  // mode_info step for subsize
   int bhsl = b_height_log2(bsize);
   int bhs = (1 << bhsl) / 4;  // mode_info step for subsize
+  int best_skip2 = 0;
 
   for (i = 0; i < 4; i++) {
     int j;
@@ -2702,6 +2722,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
     int skippable;
     int64_t txfm_cache[NB_TXFM_MODES];
     int i;
+    int this_skip2 = 0;
+    int64_t total_sse = INT_MAX;
 
     for (i = 0; i < NB_TXFM_MODES; ++i)
       txfm_cache[i] = INT64_MAX;
@@ -2863,7 +2885,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
         txfm_cache[i] = txfm_cache[ONLY_4X4];
     } else if (ref_frame == INTRA_FRAME) {
       TX_SIZE uv_tx;
-      super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
+      super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL,
                       bsize, txfm_cache);
 
       uv_tx = mbmi->txfm_size;
@@ -2989,7 +3011,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                               BLOCK_SIZE_SB8X8);
         vp9_subtract_sbuv(x, BLOCK_SIZE_SB8X8);
         super_block_uvrd_for_txfm(cm, x, &rate_uv, &distortion_uv,
-                                  &uv_skippable, BLOCK_SIZE_SB8X8, TX_4X4);
+                                  &uv_skippable, NULL, BLOCK_SIZE_SB8X8, TX_4X4);
         rate2 += rate_uv;
         distortion2 += distortion_uv;
         skippable = skippable && uv_skippable;
@@ -3017,7 +3039,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
                                   &mode_excluded, &disable_skip,
                                   &tmp_best_filter, frame_mv[this_mode],
                                   mi_row, mi_col,
-                                  single_newmv);
+                                  single_newmv, &total_sse);
       if (this_rd == INT64_MAX)
         continue;
     }
@@ -3062,10 +3084,29 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
           rate2 += prob_skip_cost;
         }
       }
+    } else if (mb_skip_allowed && ref_frame != INTRA_FRAME &&
+               this_mode != SPLITMV) {
+      if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
+          RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
+        // Add in the cost of the no skip flag.
+        int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd,
+                                                            PRED_MBSKIP), 0);
+        rate2 += prob_skip_cost;
+      } else {
+        int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd,
+                                                            PRED_MBSKIP), 1);
+        rate2 += prob_skip_cost;
+        distortion2 = total_sse;
+        assert(total_sse >= 0);
+        rate2 -= (rate_y + rate_uv);
+        rate_y = 0;
+        rate_uv = 0;
+        this_skip2 = 1;
+      }
     } else if (mb_skip_allowed) {
       // Add in the cost of the no skip flag.
       int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob(cm, xd,
                                                           PRED_MBSKIP), 0);
      rate2 += prob_skip_cost;
    }
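
This hunk is the explicit RD choice named in the commit title: instead of biasing coefficients toward zero inside the quantizer, the encoder now prices both alternatives and picks the cheaper one. A condensed sketch of the rule follows; RDCOST is paraphrased here from its usual lambda-weighted form in the encoder's RD header, so treat the macro body as an assumption:

#include <stdint.h>

/* Paraphrased RD cost: lambda-weighted rate plus shifted distortion.
 * The real macro lives in the encoder headers; this is a sketch. */
#define RDCOST(rm, dm, r, d) \
  ((((int64_t)(r) * (rm) + 128) >> 8) + ((int64_t)(d) << (dm)))

/* The new choice: code the coefficients (pay rate_y + rate_uv, get
 * distortion2) or skip them (pay no coefficient bits, get total_sse). */
static int skip_wins(int rdmult, int rddiv, int rate_y, int rate_uv,
                     int64_t distortion2, int64_t total_sse) {
  return RDCOST(rdmult, rddiv, 0, total_sse) <=
         RDCOST(rdmult, rddiv, rate_y + rate_uv, distortion2);
}

When skipping wins, the hunk refunds rate_y and rate_uv from rate2, charges the skip flag instead, and replaces distortion2 with total_sse; best_skip2 then propagates the decision out of the mode loop into x->skip.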
@@ -3119,6 +3160,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
         *returndistortion = distortion2;
         best_rd = this_rd;
         best_mbmode = *mbmi;
+        best_skip2 = this_skip2;
         best_partition = *x->partition_info;
 
         if (this_mode == I4X4_PRED || this_mode == SPLITMV)
@@ -3301,6 +3343,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   // macroblock modes
   *mbmi = best_mbmode;
+  x->skip |= best_skip2;
 
   if (best_mbmode.ref_frame[0] == INTRA_FRAME &&
       best_mbmode.sb_type < BLOCK_SIZE_SB8X8) {
     for (i = 0; i < 4; i++)

View file

@@ -12,45 +12,62 @@
 SECTION .text
 
-; void vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size)
+; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size,
+;                         int64_t *ssz)
 
 INIT_XMM sse2
-cglobal block_error, 3, 3, 6, uqc, dqc, size
-  pxor      m4, m4                 ; accumulator
+cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
+  pxor      m4, m4                 ; sse accumulator
+  pxor      m6, m6                 ; ssz accumulator
   pxor      m5, m5                 ; dedicated zero register
   lea     uqcq, [uqcq+sizeq*2]
   lea     dqcq, [dqcq+sizeq*2]
   neg    sizeq
 .loop:
-  mova      m0, [uqcq+sizeq*2]
-  mova      m2, [dqcq+sizeq*2]
-  mova      m1, [uqcq+sizeq*2+mmsize]
-  mova      m3, [dqcq+sizeq*2+mmsize]
+  mova      m2, [uqcq+sizeq*2]
+  mova      m0, [dqcq+sizeq*2]
+  mova      m3, [uqcq+sizeq*2+mmsize]
+  mova      m1, [dqcq+sizeq*2+mmsize]
   psubw     m0, m2
   psubw     m1, m3
   ; individual errors are max. 15bit+sign, so squares are 30bit, and
   ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
   pmaddwd   m0, m0
   pmaddwd   m1, m1
+  pmaddwd   m2, m2
+  pmaddwd   m3, m3
   ; accumulate in 64bit
-  punpckldq m2, m0, m5
-  punpckhdq m0, m5
-  punpckldq m3, m1, m5
-  punpckhdq m1, m5
-  paddq     m4, m2
-  paddq     m4, m0
-  paddq     m4, m3
-  paddq     m4, m1
+  punpckldq m7, m0, m5
+  punpckhdq m0, m5
+  paddq     m4, m7
+  punpckldq m7, m1, m5
+  paddq     m4, m0
+  punpckhdq m1, m5
+  paddq     m4, m7
+  punpckldq m7, m2, m5
+  paddq     m4, m1
+  punpckhdq m2, m5
+  paddq     m6, m7
+  punpckldq m7, m3, m5
+  paddq     m6, m2
+  punpckhdq m3, m5
+  paddq     m6, m7
+  paddq     m6, m3
   add    sizeq, mmsize
   jl .loop
 
   ; accumulate horizontally and store in return value
   movhlps   m5, m4
+  movhlps   m7, m6
   paddq     m4, m5
+  paddq     m6, m7
 %if ARCH_X86_64
   movq    rax, m4
+  movq [sszq], m6
 %else
+  mov     eax, sszm
   pshufd   m5, m4, 0x1
+  movq  [eax], m6
   movd    eax, m4
   movd    edx, m5
 %endif
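
The reworked kernel now needs eight xmm registers (m6 accumulates ssz, m7 is scratch) and must stay bit-exact with the C reference on both outputs. A minimal cross-check, assuming the RTCD-generated symbol names vp9_block_error_c and vp9_block_error_sse2 (inferred from the specialize line in the first file, so treat the names as an assumption):

#include <assert.h>
#include <stdint.h>

/* Assumed RTCD-generated symbols for the two implementations. */
int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff,
                          intptr_t block_size, int64_t *ssz);
int64_t vp9_block_error_sse2(int16_t *coeff, int16_t *dqcoeff,
                             intptr_t block_size, int64_t *ssz);

/* Both the returned error and the *ssz side output must agree. */
static void check_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t n) {
  int64_t ssz_c, ssz_simd;
  int64_t err_c = vp9_block_error_c(coeff, dqcoeff, n, &ssz_c);
  int64_t err_simd = vp9_block_error_sse2(coeff, dqcoeff, n, &ssz_simd);
  assert(err_c == err_simd);
  assert(ssz_c == ssz_simd);
}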