vp9_ethread: modify VP9_COMP structure

This patch modifies struct VP9_COMP and creates a struct ThreadData
to hold the data that must be copied for each thread. In the
multi-thread case, one thread processes one tile. All threads
share one copy of VP9_COMP
(refer to VP9_COMP *cpi in the code),
but each thread has its own copy of ThreadData
(refer to ThreadData *td in the code).
Therefore, within the scope of encode_tiles(), both cpi and td
must be passed as function parameters.

In the single-thread case, the FRAME_COUNTS pointer in ThreadData
points to "counts" in VP9_COMMON.
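For reference, a condensed sketch of the new layout, taken from the
vp9_encoder.h changes below (field lists abbreviated). Per-tile encoding
state moves out of VP9_COMP and into ThreadData, shared state stays in
VP9_COMP, and init_config() wires the single-thread counts pointer back
to VP9_COMMON:

typedef struct ThreadData {
  MACROBLOCK mb;         // per-thread macroblock state (was VP9_COMP::mb)
  RD_COUNTS rd_counts;   // per-thread RD stats (was cpi->frame_counts)
  FRAME_COUNTS *counts;  // single thread: points at VP9_COMMON::counts

  PICK_MODE_CONTEXT *leaf_tree;  // partition search trees, now per thread
  PC_TREE *pc_tree;
  PC_TREE *pc_root;
} ThreadData;

typedef struct VP9_COMP {
  QUANTS quants;
  ThreadData td;         // the encoder's own ThreadData copy
  VP9_COMMON common;     // shared across all threads
  /* ... */
} VP9_COMP;

// Single thread case: use counts in common (from init_config()).
cpi->td.counts = &cm->counts;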

Change-Id: Ib37908b2d8e2c0f4f9c18f38017df5ce60e8b13e
Yunqing Wang 2014-11-21 11:11:06 -08:00
Parent 60ef6c0735
Commit edbd61e136
23 changed files with 477 additions and 437 deletions


@@ -96,7 +96,8 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
 // An "aq_strength" value determines how many segments are supported,
 // the set of transition points to use and the extent of the quantizer
 // adjustment for each segment (configured in vp9_setup_in_frame_q_adj()).
-void vp9_select_in_frame_q_segment(VP9_COMP *cpi, BLOCK_SIZE bs,
+void vp9_select_in_frame_q_segment(VP9_COMP *cpi, MACROBLOCK *mb,
+                                   BLOCK_SIZE bs,
                                    int mi_row, int mi_col,
                                    int output_enabled, int projected_rate) {
   VP9_COMMON *const cm = &cpi->common;
@@ -122,8 +123,8 @@ void vp9_select_in_frame_q_segment(VP9_COMP *cpi, BLOCK_SIZE bs,
   const int active_segments = aq_c_active_segments[aq_strength];
   double logvar;
 
-  vp9_setup_src_planes(&cpi->mb, cpi->Source, mi_row, mi_col);
-  logvar = vp9_log_block_var(cpi, &cpi->mb, bs);
+  vp9_setup_src_planes(mb, cpi->Source, mi_row, mi_col);
+  logvar = vp9_log_block_var(cpi, mb, bs);
 
   // The number of segments considered and the transition points used to
   // select them is determined by the "aq_strength" value.


@@ -17,9 +17,11 @@ extern "C" {
 #endif
 
 struct VP9_COMP;
+struct macroblock;
 
 // Select a segment for the current SB64.
-void vp9_select_in_frame_q_segment(struct VP9_COMP *cpi, BLOCK_SIZE bs,
+void vp9_select_in_frame_q_segment(struct VP9_COMP *cpi, struct macroblock *x,
+                                   BLOCK_SIZE bs,
                                    int mi_row, int mi_col,
                                    int output_enabled, int projected_rate);


@@ -104,19 +104,21 @@ static int write_skip(const VP9_COMMON *cm, const MACROBLOCKD *xd,
   }
 }
 
-static void update_skip_probs(VP9_COMMON *cm, vp9_writer *w) {
+static void update_skip_probs(VP9_COMMON *cm, vp9_writer *w,
+                              FRAME_COUNTS *counts) {
   int k;
 
   for (k = 0; k < SKIP_CONTEXTS; ++k)
-    vp9_cond_prob_diff_update(w, &cm->fc->skip_probs[k], cm->counts.skip[k]);
+    vp9_cond_prob_diff_update(w, &cm->fc->skip_probs[k], counts->skip[k]);
 }
 
-static void update_switchable_interp_probs(VP9_COMMON *cm, vp9_writer *w) {
+static void update_switchable_interp_probs(VP9_COMMON *cm, vp9_writer *w,
+                                           FRAME_COUNTS *counts) {
   int j;
   for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
     prob_diff_update(vp9_switchable_interp_tree,
                      cm->fc->switchable_interp_prob[j],
-                     cm->counts.switchable_interp[j], SWITCHABLE_FILTERS, w);
+                     counts->switchable_interp[j], SWITCHABLE_FILTERS, w);
 }
 
 static void pack_mb_tokens(vp9_writer *w,
@@ -238,7 +240,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
                                 vp9_writer *w) {
   VP9_COMMON *const cm = &cpi->common;
   const nmv_context *nmvc = &cm->fc->nmvc;
-  const MACROBLOCK *const x = &cpi->mb;
+  const MACROBLOCK *const x = &cpi->td.mb;
   const MACROBLOCKD *const xd = &x->e_mbd;
   const struct segmentation *const seg = &cm->seg;
   const MB_MODE_INFO *const mbmi = &mi->mbmi;
@@ -297,7 +299,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
     if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
       if (bsize >= BLOCK_8X8) {
         write_inter_mode(w, mode, inter_probs);
-        ++cm->counts.inter_mode[mode_ctx][INTER_OFFSET(mode)];
+        ++cpi->td.counts->inter_mode[mode_ctx][INTER_OFFSET(mode)];
       }
     }
@@ -320,7 +322,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
           const int j = idy * 2 + idx;
           const PREDICTION_MODE b_mode = mi->bmi[j].as_mode;
           write_inter_mode(w, b_mode, inter_probs);
-          ++cm->counts.inter_mode[mode_ctx][INTER_OFFSET(b_mode)];
+          ++cpi->td.counts->inter_mode[mode_ctx][INTER_OFFSET(b_mode)];
           if (b_mode == NEWMV) {
             for (ref = 0; ref < 1 + is_compound; ++ref)
               vp9_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv,
@@ -382,7 +384,7 @@ static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile,
                           const TOKENEXTRA *const tok_end,
                           int mi_row, int mi_col) {
   const VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   MODE_INFO *m;
 
   xd->mi = cm->mi + (mi_row * cm->mi_stride + mi_col);
@@ -429,7 +431,7 @@ static void write_modes_sb(VP9_COMP *cpi,
                            TOKENEXTRA **tok, const TOKENEXTRA *const tok_end,
                            int mi_row, int mi_col, BLOCK_SIZE bsize) {
   const VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   const int bsl = b_width_log2_lookup[bsize];
   const int bs = (1 << bsl) / 4;
@@ -485,11 +487,12 @@ static void write_modes_sb(VP9_COMP *cpi,
 static void write_modes(VP9_COMP *cpi,
                         const TileInfo *const tile, vp9_writer *w,
                         TOKENEXTRA **tok, const TOKENEXTRA *const tok_end) {
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   int mi_row, mi_col;
 
   for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
        mi_row += MI_BLOCK_SIZE) {
-    vp9_zero(cpi->mb.e_mbd.left_seg_context);
+    vp9_zero(xd->left_seg_context);
     for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
          mi_col += MI_BLOCK_SIZE)
       write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col,
@@ -500,7 +503,7 @@ static void write_modes(VP9_COMP *cpi,
 static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size,
                                     vp9_coeff_stats *coef_branch_ct,
                                     vp9_coeff_probs_model *coef_probs) {
-  vp9_coeff_count *coef_counts = cpi->frame_counts->coef_counts[tx_size];
+  vp9_coeff_count *coef_counts = cpi->td.rd_counts.coef_counts[tx_size];
   unsigned int (*eob_branch_ct)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] =
       cpi->common.counts.eob_branch[tx_size];
   int i, j, k, l, m;
@@ -813,7 +816,8 @@ static void encode_segmentation(VP9_COMMON *cm, MACROBLOCKD *xd,
   }
 }
 
-static void encode_txfm_probs(VP9_COMMON *cm, vp9_writer *w) {
+static void encode_txfm_probs(VP9_COMMON *cm, vp9_writer *w,
+                              FRAME_COUNTS *counts) {
   // Mode
   vp9_write_literal(w, MIN(cm->tx_mode, ALLOW_32X32), 2);
   if (cm->tx_mode >= ALLOW_32X32)
@@ -828,20 +832,20 @@ static void encode_txfm_probs(VP9_COMMON *cm, vp9_writer *w,
 
     for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
-      tx_counts_to_branch_counts_8x8(cm->counts.tx.p8x8[i], ct_8x8p);
+      tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], ct_8x8p);
       for (j = 0; j < TX_SIZES - 3; j++)
         vp9_cond_prob_diff_update(w, &cm->fc->tx_probs.p8x8[i][j], ct_8x8p[j]);
     }
 
     for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
-      tx_counts_to_branch_counts_16x16(cm->counts.tx.p16x16[i], ct_16x16p);
+      tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], ct_16x16p);
       for (j = 0; j < TX_SIZES - 2; j++)
         vp9_cond_prob_diff_update(w, &cm->fc->tx_probs.p16x16[i][j],
                                   ct_16x16p[j]);
     }
 
     for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
-      tx_counts_to_branch_counts_32x32(cm->counts.tx.p32x32[i], ct_32x32p);
+      tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], ct_32x32p);
       for (j = 0; j < TX_SIZES - 1; j++)
         vp9_cond_prob_diff_update(w, &cm->fc->tx_probs.p32x32[i][j],
                                   ct_32x32p[j]);
@@ -858,7 +862,7 @@ static void write_interp_filter(INTERP_FILTER filter,
   vp9_wb_write_literal(wb, filter_to_literal[filter], 2);
 }
 
-static void fix_interp_filter(VP9_COMMON *cm) {
+static void fix_interp_filter(VP9_COMMON *cm, FRAME_COUNTS *counts) {
   if (cm->interp_filter == SWITCHABLE) {
     // Check to see if only one of the filters is actually used
     int count[SWITCHABLE_FILTERS];
@@ -866,7 +870,7 @@ static void fix_interp_filter(VP9_COMMON *cm, FRAME_COUNTS *counts) {
     for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
       count[i] = 0;
       for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
-        count[i] += cm->counts.switchable_interp[j][i];
+        count[i] += counts->switchable_interp[j][i];
       c += (count[i] > 0);
     }
 
     if (c == 1) {
@@ -1084,6 +1088,7 @@ static void write_bitdepth_colorspace_sampling(
 static void write_uncompressed_header(VP9_COMP *cpi,
                                       struct vp9_write_bit_buffer *wb) {
   VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
 
   vp9_wb_write_literal(wb, VP9_FRAME_MARKER, 2);
@@ -1136,7 +1141,7 @@ static void write_uncompressed_header(VP9_COMP *cpi,
       vp9_wb_write_bit(wb, cm->allow_high_precision_mv);
 
-      fix_interp_filter(cm);
+      fix_interp_filter(cm, cpi->td.counts);
       write_interp_filter(cm->interp_filter, wb);
     }
   }
@@ -1150,15 +1155,16 @@ static void write_uncompressed_header(VP9_COMP *cpi,
 
   encode_loopfilter(&cm->lf, wb);
   encode_quantization(cm, wb);
-  encode_segmentation(cm, &cpi->mb.e_mbd, wb);
+  encode_segmentation(cm, xd, wb);
 
   write_tile_info(cm, wb);
 }
 
 static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
   VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   FRAME_CONTEXT *const fc = cm->fc;
+  FRAME_COUNTS *counts = cpi->td.counts;
   vp9_writer header_bc;
 
   vp9_start_encode(&header_bc, data);
@@ -1166,26 +1172,26 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
   if (xd->lossless)
     cm->tx_mode = ONLY_4X4;
   else
-    encode_txfm_probs(cm, &header_bc);
+    encode_txfm_probs(cm, &header_bc, counts);
 
   update_coef_probs(cpi, &header_bc);
-  update_skip_probs(cm, &header_bc);
+  update_skip_probs(cm, &header_bc, counts);
 
   if (!frame_is_intra_only(cm)) {
     int i;
 
     for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
       prob_diff_update(vp9_inter_mode_tree, cm->fc->inter_mode_probs[i],
-                       cm->counts.inter_mode[i], INTER_MODES, &header_bc);
+                       counts->inter_mode[i], INTER_MODES, &header_bc);
 
-    vp9_zero(cm->counts.inter_mode);
+    vp9_zero(counts->inter_mode);
 
     if (cm->interp_filter == SWITCHABLE)
-      update_switchable_interp_probs(cm, &header_bc);
+      update_switchable_interp_probs(cm, &header_bc, counts);
 
     for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
       vp9_cond_prob_diff_update(&header_bc, &fc->intra_inter_prob[i],
-                                cm->counts.intra_inter[i]);
+                                counts->intra_inter[i]);
 
     if (cm->allow_comp_inter_inter) {
       const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE;
@@ -1197,33 +1203,34 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
       if (use_hybrid_pred)
         for (i = 0; i < COMP_INTER_CONTEXTS; i++)
           vp9_cond_prob_diff_update(&header_bc, &fc->comp_inter_prob[i],
-                                    cm->counts.comp_inter[i]);
+                                    counts->comp_inter[i]);
     }
   }
 
    if (cm->reference_mode != COMPOUND_REFERENCE) {
     for (i = 0; i < REF_CONTEXTS; i++) {
       vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][0],
-                                cm->counts.single_ref[i][0]);
+                                counts->single_ref[i][0]);
       vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][1],
-                                cm->counts.single_ref[i][1]);
+                                counts->single_ref[i][1]);
     }
   }
 
   if (cm->reference_mode != SINGLE_REFERENCE)
     for (i = 0; i < REF_CONTEXTS; i++)
       vp9_cond_prob_diff_update(&header_bc, &fc->comp_ref_prob[i],
-                                cm->counts.comp_ref[i]);
+                                counts->comp_ref[i]);
 
   for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
     prob_diff_update(vp9_intra_mode_tree, cm->fc->y_mode_prob[i],
-                     cm->counts.y_mode[i], INTRA_MODES, &header_bc);
+                     counts->y_mode[i], INTRA_MODES, &header_bc);
 
   for (i = 0; i < PARTITION_CONTEXTS; ++i)
    prob_diff_update(vp9_partition_tree, fc->partition_prob[i],
-                    cm->counts.partition[i], PARTITION_TYPES, &header_bc);
+                    counts->partition[i], PARTITION_TYPES, &header_bc);
 
-  vp9_write_nmv_probs(cm, cm->allow_high_precision_mv, &header_bc);
+  vp9_write_nmv_probs(cm, cm->allow_high_precision_mv, &header_bc,
+                      &counts->mv);
 }
 
 vp9_stop_encode(&header_bc);


@@ -87,7 +87,7 @@ static void free_tree_contexts(PC_TREE *tree) {
 // partition level. There are contexts for none, horizontal, vertical, and
 // split. Along with a block_size value and a selected block_size which
 // represents the state of our search.
-void vp9_setup_pc_tree(VP9_COMMON *cm, VP9_COMP *cpi) {
+void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) {
   int i, j;
   const int leaf_nodes = 64;
   const int tree_nodes = 64 + 16 + 4 + 1;
@@ -97,24 +97,24 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) {
   int square_index = 1;
   int nodes;
 
-  vpx_free(cpi->leaf_tree);
-  CHECK_MEM_ERROR(cm, cpi->leaf_tree, vpx_calloc(leaf_nodes,
-                                                 sizeof(*cpi->leaf_tree)));
-  vpx_free(cpi->pc_tree);
-  CHECK_MEM_ERROR(cm, cpi->pc_tree, vpx_calloc(tree_nodes,
-                                               sizeof(*cpi->pc_tree)));
+  vpx_free(td->leaf_tree);
+  CHECK_MEM_ERROR(cm, td->leaf_tree, vpx_calloc(leaf_nodes,
+                                                sizeof(*td->leaf_tree)));
+  vpx_free(td->pc_tree);
+  CHECK_MEM_ERROR(cm, td->pc_tree, vpx_calloc(tree_nodes,
+                                              sizeof(*td->pc_tree)));
 
-  this_pc = &cpi->pc_tree[0];
-  this_leaf = &cpi->leaf_tree[0];
+  this_pc = &td->pc_tree[0];
+  this_leaf = &td->leaf_tree[0];
 
   // 4x4 blocks smaller than 8x8 but in the same 8x8 block share the same
   // context so we only need to allocate 1 for each 8x8 block.
   for (i = 0; i < leaf_nodes; ++i)
-    alloc_mode_context(cm, 1, &cpi->leaf_tree[i]);
+    alloc_mode_context(cm, 1, &td->leaf_tree[i]);
 
   // Sets up all the leaf nodes in the tree.
   for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) {
-    PC_TREE *const tree = &cpi->pc_tree[pc_tree_index];
+    PC_TREE *const tree = &td->pc_tree[pc_tree_index];
     tree->block_size = square[0];
     alloc_tree_contexts(cm, tree, 4);
     tree->leaf_split[0] = this_leaf++;
@@ -126,7 +126,7 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) {
   // from leafs to the root.
   for (nodes = 16; nodes > 0; nodes >>= 2) {
     for (i = 0; i < nodes; ++i) {
-      PC_TREE *const tree = &cpi->pc_tree[pc_tree_index];
+      PC_TREE *const tree = &td->pc_tree[pc_tree_index];
       alloc_tree_contexts(cm, tree, 4 << (2 * square_index));
       tree->block_size = square[square_index];
       for (j = 0; j < 4; j++)
@@ -135,24 +135,24 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) {
     }
     ++square_index;
   }
-  cpi->pc_root = &cpi->pc_tree[tree_nodes - 1];
-  cpi->pc_root[0].none.best_mode_index = 2;
+  td->pc_root = &td->pc_tree[tree_nodes - 1];
+  td->pc_root[0].none.best_mode_index = 2;
 }
 
-void vp9_free_pc_tree(VP9_COMP *cpi) {
+void vp9_free_pc_tree(ThreadData *td) {
   const int tree_nodes = 64 + 16 + 4 + 1;
   int i;
 
   // Set up all 4x4 mode contexts
   for (i = 0; i < 64; ++i)
-    free_mode_context(&cpi->leaf_tree[i]);
+    free_mode_context(&td->leaf_tree[i]);
 
   // Sets up all the leaf nodes in the tree.
   for (i = 0; i < tree_nodes; ++i)
-    free_tree_contexts(&cpi->pc_tree[i]);
+    free_tree_contexts(&td->pc_tree[i]);
 
-  vpx_free(cpi->pc_tree);
-  cpi->pc_tree = NULL;
-  vpx_free(cpi->leaf_tree);
-  cpi->leaf_tree = NULL;
+  vpx_free(td->pc_tree);
+  td->pc_tree = NULL;
+  vpx_free(td->leaf_tree);
+  td->leaf_tree = NULL;
 }


@@ -15,6 +15,7 @@
 
 struct VP9_COMP;
 struct VP9Common;
+struct ThreadData;
 
 // Structure to hold snapshot of coding context during the mode picking process
 typedef struct {
@@ -79,7 +80,7 @@ typedef struct PC_TREE {
   };
 } PC_TREE;
 
-void vp9_setup_pc_tree(struct VP9Common *cm, struct VP9_COMP *cpi);
-void vp9_free_pc_tree(struct VP9_COMP *cpi);
+void vp9_setup_pc_tree(struct VP9Common *cm, struct ThreadData *td);
+void vp9_free_pc_tree(struct ThreadData *td);
 
 #endif /* VP9_ENCODER_VP9_CONTEXT_TREE_H_ */

(Diff for one file is not shown because of its large size.)


@@ -161,10 +161,10 @@ static void write_mv_update(const vp9_tree_index *tree,
     update_mv(w, branch_ct[i], &probs[i], MV_UPDATE_PROB);
 }
 
-void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vp9_writer *w) {
+void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vp9_writer *w,
+                         nmv_context_counts *const counts) {
   int i, j;
   nmv_context *const mvc = &cm->fc->nmvc;
-  nmv_context_counts *const counts = &cm->counts.mv;
 
   write_mv_update(vp9_mv_joint_tree, mvc->joints, counts->joints, MV_JOINTS, w);
@@ -241,7 +241,8 @@ static void inc_mvs(const MB_MODE_INFO *mbmi, const int_mv mvs[2],
   }
 }
 
-void vp9_update_mv_count(VP9_COMMON *cm, const MACROBLOCKD *xd) {
+void vp9_update_mv_count(ThreadData *td) {
+  const MACROBLOCKD *xd = &td->mb.e_mbd;
   const MODE_INFO *mi = xd->mi[0].src_mi;
   const MB_MODE_INFO *const mbmi = &mi->mbmi;
@@ -254,12 +255,12 @@ void vp9_update_mv_count(ThreadData *td) {
       for (idx = 0; idx < 2; idx += num_4x4_w) {
         const int i = idy * 2 + idx;
         if (mi->bmi[i].as_mode == NEWMV)
-          inc_mvs(mbmi, mi->bmi[i].as_mv, &cm->counts.mv);
+          inc_mvs(mbmi, mi->bmi[i].as_mv, &td->counts->mv);
       }
     }
   } else {
     if (mbmi->mode == NEWMV)
-      inc_mvs(mbmi, mbmi->mv, &cm->counts.mv);
+      inc_mvs(mbmi, mbmi->mv, &td->counts->mv);
   }
 }


@@ -20,7 +20,8 @@ extern "C" {
 
 void vp9_entropy_mv_init();
 
-void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vp9_writer *w);
+void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vp9_writer *w,
+                         nmv_context_counts *const counts);
 
 void vp9_encode_mv(VP9_COMP *cpi, vp9_writer* w, const MV* mv, const MV* ref,
                    const nmv_context* mvctx, int usehp);
@@ -28,7 +29,7 @@ void vp9_encode_mv(VP9_COMP *cpi, vp9_writer* w, const MV* mv, const MV* ref,
 void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
                               const nmv_context* mvctx, int usehp);
 
-void vp9_update_mv_count(VP9_COMMON *cm, const MACROBLOCKD *xd);
+void vp9_update_mv_count(ThreadData *td);
 
 #ifdef __cplusplus
 }  // extern "C"


@@ -103,7 +103,7 @@ static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) {
 }
 
 void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv) {
-  MACROBLOCK *const mb = &cpi->mb;
+  MACROBLOCK *const mb = &cpi->td.mb;
   cpi->common.allow_high_precision_mv = allow_high_precision_mv;
   if (cpi->common.allow_high_precision_mv) {
     mb->mvcost = mb->nmvcost_hp;
@@ -235,9 +235,6 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
   cpi->nmvsadcosts_hp[0] = NULL;
   cpi->nmvsadcosts_hp[1] = NULL;
 
-  vpx_free(cpi->frame_counts);
-  cpi->frame_counts = NULL;
-
   vp9_cyclic_refresh_free(cpi->cyclic_refresh);
   cpi->cyclic_refresh = NULL;
@@ -253,7 +250,7 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
   vpx_free(cpi->tok);
   cpi->tok = 0;
 
-  vp9_free_pc_tree(cpi);
+  vp9_free_pc_tree(&cpi->td);
 
   for (i = 0; i < cpi->svc.number_spatial_layers; ++i) {
     LAYER_CONTEXT *const lc = &cpi->svc.layer_context[i];
@@ -285,7 +282,7 @@ static void save_coding_context(VP9_COMP *cpi) {
   // restored with a call to vp9_restore_coding_context. These functions are
   // intended for use in a re-code loop in vp9_compress_frame where the
   // quantizer value is adjusted between loop iterations.
-  vp9_copy(cc->nmvjointcost, cpi->mb.nmvjointcost);
+  vp9_copy(cc->nmvjointcost, cpi->td.mb.nmvjointcost);
 
   vpx_memcpy(cc->nmvcosts[0], cpi->nmvcosts[0],
              MV_VALS * sizeof(*cpi->nmvcosts[0]));
@@ -313,7 +310,7 @@ static void restore_coding_context(VP9_COMP *cpi) {
   // Restore key state variables to the snapshot state stored in the
   // previous call to vp9_save_coding_context.
-  vp9_copy(cpi->mb.nmvjointcost, cc->nmvjointcost);
+  vp9_copy(cpi->td.mb.nmvjointcost, cc->nmvjointcost);
 
   vpx_memcpy(cpi->nmvcosts[0], cc->nmvcosts[0],
              MV_VALS * sizeof(*cc->nmvcosts[0]));
@@ -553,12 +550,12 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) {
     CHECK_MEM_ERROR(cm, cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
   }
 
-  vp9_setup_pc_tree(&cpi->common, cpi);
+  vp9_setup_pc_tree(&cpi->common, &cpi->td);
 }
 
 static void update_frame_size(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
 
   vp9_set_mb_mi(cm, cm->width, cm->height);
   vp9_init_context_buffers(cm);
@@ -616,6 +613,9 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) {
   cm->height = oxcf->height;
   vp9_alloc_compressor_data(cpi);
 
+  // Single thread case: use counts in common.
+  cpi->td.counts = &cm->counts;
+
   // Spatial scalability.
   cpi->svc.number_spatial_layers = oxcf->ss_number_layers;
   // Temporal scalability.
@@ -1272,7 +1272,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
   cpi->oxcf = *oxcf;
 #if CONFIG_VP9_HIGHBITDEPTH
-  cpi->mb.e_mbd.bd = (int)cm->bit_depth;
+  cpi->td.mb.e_mbd.bd = (int)cm->bit_depth;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
   rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
@@ -1473,9 +1473,6 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
   CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[1],
                   vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[1])));
 
-  CHECK_MEM_ERROR(cm, cpi->frame_counts, vpx_calloc(1,
-                                                    sizeof(*cpi->frame_counts)));
-
   for (i = 0; i < (sizeof(cpi->mbgraph_stats) /
                    sizeof(cpi->mbgraph_stats[0])); i++) {
     CHECK_MEM_ERROR(cm, cpi->mbgraph_stats[i].mb_stats,
@@ -1537,18 +1534,18 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
   cpi->first_time_stamp_ever = INT64_MAX;
 
-  cal_nmvjointsadcost(cpi->mb.nmvjointsadcost);
-  cpi->mb.nmvcost[0] = &cpi->nmvcosts[0][MV_MAX];
-  cpi->mb.nmvcost[1] = &cpi->nmvcosts[1][MV_MAX];
-  cpi->mb.nmvsadcost[0] = &cpi->nmvsadcosts[0][MV_MAX];
-  cpi->mb.nmvsadcost[1] = &cpi->nmvsadcosts[1][MV_MAX];
-  cal_nmvsadcosts(cpi->mb.nmvsadcost);
+  cal_nmvjointsadcost(cpi->td.mb.nmvjointsadcost);
+  cpi->td.mb.nmvcost[0] = &cpi->nmvcosts[0][MV_MAX];
+  cpi->td.mb.nmvcost[1] = &cpi->nmvcosts[1][MV_MAX];
+  cpi->td.mb.nmvsadcost[0] = &cpi->nmvsadcosts[0][MV_MAX];
+  cpi->td.mb.nmvsadcost[1] = &cpi->nmvsadcosts[1][MV_MAX];
+  cal_nmvsadcosts(cpi->td.mb.nmvsadcost);
 
-  cpi->mb.nmvcost_hp[0] = &cpi->nmvcosts_hp[0][MV_MAX];
-  cpi->mb.nmvcost_hp[1] = &cpi->nmvcosts_hp[1][MV_MAX];
-  cpi->mb.nmvsadcost_hp[0] = &cpi->nmvsadcosts_hp[0][MV_MAX];
-  cpi->mb.nmvsadcost_hp[1] = &cpi->nmvsadcosts_hp[1][MV_MAX];
-  cal_nmvsadcosts_hp(cpi->mb.nmvsadcost_hp);
+  cpi->td.mb.nmvcost_hp[0] = &cpi->nmvcosts_hp[0][MV_MAX];
+  cpi->td.mb.nmvcost_hp[1] = &cpi->nmvcosts_hp[1][MV_MAX];
+  cpi->td.mb.nmvsadcost_hp[0] = &cpi->nmvsadcosts_hp[0][MV_MAX];
+  cpi->td.mb.nmvsadcost_hp[1] = &cpi->nmvsadcosts_hp[1][MV_MAX];
+  cal_nmvsadcosts_hp(cpi->td.mb.nmvsadcost_hp);
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
 #ifdef OUTPUT_YUV_DENOISED
@@ -2039,7 +2036,7 @@ static void generate_psnr_packet(VP9_COMP *cpi) {
   PSNR_STATS psnr;
 #if CONFIG_VP9_HIGHBITDEPTH
   calc_highbd_psnr(cpi->Source, cpi->common.frame_to_show, &psnr,
-                   cpi->mb.e_mbd.bd, cpi->oxcf.input_bit_depth);
+                   cpi->td.mb.e_mbd.bd, cpi->oxcf.input_bit_depth);
 #else
   calc_psnr(cpi->Source, cpi->common.frame_to_show, &psnr);
 #endif
@@ -2420,7 +2417,7 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
 }
 
 static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+  MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
   struct loopfilter *lf = &cm->lf;
   if (xd->lossless) {
     lf->filter_level = 0;
@@ -2685,7 +2682,7 @@ void set_frame_size(VP9_COMP *cpi) {
   int ref_frame;
   VP9_COMMON *const cm = &cpi->common;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
 
   if (oxcf->pass == 2 &&
       cm->current_video_frame == 0 &&
@@ -3281,7 +3278,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
   vp9_update_reference_frames(cpi);
 
   for (t = TX_4X4; t <= TX_32X32; t++)
-    full_to_model_counts(cm->counts.coef[t], cpi->frame_counts->coef_counts[t]);
+    full_to_model_counts(cpi->td.counts->coef[t],
+                         cpi->td.rd_counts.coef_counts[t]);
 
   if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode)
     vp9_adapt_coef_probs(cm);
@@ -3728,15 +3726,16 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
     const int lossless = is_lossless_requested(oxcf);
 #if CONFIG_VP9_HIGHBITDEPTH
     if (cpi->oxcf.use_highbitdepth)
-      cpi->mb.fwd_txm4x4 = lossless ? vp9_highbd_fwht4x4 : vp9_highbd_fdct4x4;
+      cpi->td.mb.fwd_txm4x4 = lossless ?
+          vp9_highbd_fwht4x4 : vp9_highbd_fdct4x4;
    else
-      cpi->mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4;
-    cpi->mb.highbd_itxm_add = lossless ? vp9_highbd_iwht4x4_add :
-                                         vp9_highbd_idct4x4_add;
+      cpi->td.mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4;
+    cpi->td.mb.highbd_itxm_add = lossless ? vp9_highbd_iwht4x4_add :
+                                            vp9_highbd_idct4x4_add;
 #else
-    cpi->mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4;
+    cpi->td.mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-    cpi->mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
+    cpi->td.mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
     vp9_first_pass(cpi, source);
   } else if (oxcf->pass == 2 &&
       (!cpi->use_svc || is_two_pass_svc(cpi))) {
@@ -3789,7 +3788,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
         YV12_BUFFER_CONFIG *pp = &cm->post_proc_buffer;
         PSNR_STATS psnr;
 #if CONFIG_VP9_HIGHBITDEPTH
-        calc_highbd_psnr(orig, recon, &psnr, cpi->mb.e_mbd.bd,
+        calc_highbd_psnr(orig, recon, &psnr, cpi->td.mb.e_mbd.bd,
                          cpi->oxcf.input_bit_depth);
 #else
         calc_psnr(orig, recon, &psnr);
@@ -3814,7 +3813,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
           vp9_clear_system_state();
 #if CONFIG_VP9_HIGHBITDEPTH
-          calc_highbd_psnr(orig, pp, &psnr, cpi->mb.e_mbd.bd,
+          calc_highbd_psnr(orig, pp, &psnr, cpi->td.mb.e_mbd.bd,
                            cpi->oxcf.input_bit_depth);
 #else
           calc_psnr(orig, pp, &psnr2);


@@ -241,16 +241,26 @@ typedef struct TileDataEnc {
   int mode_map[BLOCK_SIZES][MAX_MODES];
 } TileDataEnc;
 
-typedef struct {
+typedef struct RD_COUNTS {
   vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
   int64_t comp_pred_diff[REFERENCE_MODES];
   int64_t tx_select_diff[TX_MODES];
   int64_t filter_diff[SWITCHABLE_FILTER_CONTEXTS];
-} COUNTS;
+} RD_COUNTS;
+
+typedef struct ThreadData {
+  MACROBLOCK mb;
+  RD_COUNTS rd_counts;
+  FRAME_COUNTS *counts;
+
+  PICK_MODE_CONTEXT *leaf_tree;
+  PC_TREE *pc_tree;
+  PC_TREE *pc_root;
+} ThreadData;
 
 typedef struct VP9_COMP {
   QUANTS quants;
-  MACROBLOCK mb;
+  ThreadData td;
   VP9_COMMON common;
   VP9EncoderConfig oxcf;
   struct lookahead_ctx *lookahead;
@@ -294,7 +304,7 @@ typedef struct VP9_COMP {
   int ambient_err;
 
   RD_OPT rd;
 
-  COUNTS *frame_counts;
-
   CODING_CONTEXT coding_context;
@@ -424,10 +433,6 @@ typedef struct VP9_COMP {
   int intra_uv_mode_cost[FRAME_TYPES][INTRA_MODES];
   int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
   int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
-
-  PICK_MODE_CONTEXT *leaf_tree;
-  PC_TREE *pc_tree;
-  PC_TREE *pc_root;
   int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES];
 
   int multi_arf_allowed;


@@ -450,13 +450,13 @@ static void set_first_pass_params(VP9_COMP *cpi) {
 void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
   int mb_row, mb_col;
-  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCK *const x = &cpi->td.mb;
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   TileInfo tile;
   struct macroblock_plane *const p = x->plane;
   struct macroblockd_plane *const pd = xd->plane;
-  const PICK_MODE_CONTEXT *ctx = &cpi->pc_root->none;
+  const PICK_MODE_CONTEXT *ctx = &cpi->td.pc_root->none;
   int i;
 
   int recon_yoffset, recon_uvoffset;


@@ -24,7 +24,7 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
                                               MV *dst_mv,
                                               int mb_row,
                                               int mb_col) {
-  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
   const vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
@@ -80,7 +80,7 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
 
 static int do_16x16_motion_search(VP9_COMP *cpi, const MV *ref_mv,
                                   int_mv *dst_mv, int mb_row, int mb_col) {
-  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   unsigned int err, tmp_err;
   MV tmp_mv;
@@ -117,7 +117,7 @@ static int do_16x16_motion_search(VP9_COMP *cpi, const MV *ref_mv,
 }
 
 static int do_16x16_zerozero_search(VP9_COMP *cpi, int_mv *dst_mv) {
-  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   unsigned int err;
@@ -131,7 +131,7 @@ static int do_16x16_zerozero_search(VP9_COMP *cpi, int_mv *dst_mv) {
   return err;
 }
 static int find_best_16x16_intra(VP9_COMP *cpi, PREDICTION_MODE *pbest_mode) {
-  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   PREDICTION_MODE best_mode = -1, mode;
   unsigned int best_err = INT_MAX;
@@ -174,7 +174,7 @@ static void update_mbgraph_mb_stats
   int mb_row,
   int mb_col
 ) {
-  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   int intra_error;
   VP9_COMMON *cm = &cpi->common;
@@ -229,7 +229,7 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi,
                                        YV12_BUFFER_CONFIG *buf,
                                        YV12_BUFFER_CONFIG *golden_ref,
                                        YV12_BUFFER_CONFIG *alt_ref) {
-  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   VP9_COMMON *const cm = &cpi->common;


@@ -38,7 +38,7 @@ static int try_filter_frame(const YV12_BUFFER_CONFIG *sd, VP9_COMP *const cpi,
   VP9_COMMON *const cm = &cpi->common;
   int filt_err;
 
-  vp9_loop_filter_frame(cm->frame_to_show, cm, &cpi->mb.e_mbd, filt_level, 1,
+  vp9_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level, 1,
                         partial_frame);
 #if CONFIG_VP9_HIGHBITDEPTH
   if (cm->use_highbitdepth) {


@@ -744,7 +744,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
         model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter],
                           &pf_dist[filter], &pf_var[filter], &pf_sse[filter]);
         cost = RDCOST(x->rdmult, x->rddiv,
-                      vp9_get_switchable_rate(cpi) + pf_rate[filter],
+                      vp9_get_switchable_rate(cpi, xd) + pf_rate[filter],
                       pf_dist[filter]);
         pf_tx_size[filter] = mbmi->tx_size;
         if (cost < best_cost) {


@@ -701,7 +701,7 @@ void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) {
 void vp9_frame_init_quantizer(VP9_COMP *cpi) {
   cpi->zbin_mode_boost = 0;
-  vp9_init_plane_quantizers(cpi, &cpi->mb);
+  vp9_init_plane_quantizers(cpi, &cpi->td.mb);
 }
 
 void vp9_set_quantizer(VP9_COMMON *cm, int q) {


@@ -208,23 +208,23 @@ void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) {
 #if CONFIG_VP9_HIGHBITDEPTH
   switch (cpi->common.bit_depth) {
     case VPX_BITS_8:
-      cpi->mb.sadperbit16 = sad_per_bit16lut_8[qindex];
-      cpi->mb.sadperbit4 = sad_per_bit4lut_8[qindex];
+      cpi->td.mb.sadperbit16 = sad_per_bit16lut_8[qindex];
+      cpi->td.mb.sadperbit4 = sad_per_bit4lut_8[qindex];
       break;
     case VPX_BITS_10:
-      cpi->mb.sadperbit16 = sad_per_bit16lut_10[qindex];
-      cpi->mb.sadperbit4 = sad_per_bit4lut_10[qindex];
+      cpi->td.mb.sadperbit16 = sad_per_bit16lut_10[qindex];
+      cpi->td.mb.sadperbit4 = sad_per_bit4lut_10[qindex];
       break;
     case VPX_BITS_12:
-      cpi->mb.sadperbit16 = sad_per_bit16lut_12[qindex];
-      cpi->mb.sadperbit4 = sad_per_bit4lut_12[qindex];
+      cpi->td.mb.sadperbit16 = sad_per_bit16lut_12[qindex];
+      cpi->td.mb.sadperbit4 = sad_per_bit4lut_12[qindex];
       break;
     default:
      assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
  }
 #else
-  cpi->mb.sadperbit16 = sad_per_bit16lut_8[qindex];
-  cpi->mb.sadperbit4 = sad_per_bit4lut_8[qindex];
+  cpi->td.mb.sadperbit16 = sad_per_bit16lut_8[qindex];
+  cpi->td.mb.sadperbit4 = sad_per_bit4lut_8[qindex];
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }
@@ -262,7 +262,7 @@ static void set_block_thresholds(const VP9_COMMON *cm, RD_OPT *rd) {
 void vp9_initialize_rd_consts(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCK *const x = &cpi->td.mb;
   RD_OPT *const rd = &cpi->rd;
   int i;
@@ -524,8 +524,7 @@ const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi,
   return (scaled_idx != ref_idx) ? &cm->frame_bufs[scaled_idx].buf : NULL;
 }
 
-int vp9_get_switchable_rate(const VP9_COMP *cpi) {
-  const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+int vp9_get_switchable_rate(const VP9_COMP *cpi, const MACROBLOCKD *const xd) {
   const MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
   const int ctx = vp9_get_pred_context_switchable_interp(xd);
   return SWITCHABLE_INTERP_RATE_FACTOR *


@@ -138,7 +138,8 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n,
                                   unsigned int qstep, int *rate,
                                   int64_t *dist);
 
-int vp9_get_switchable_rate(const struct VP9_COMP *cpi);
+int vp9_get_switchable_rate(const struct VP9_COMP *cpi,
+                            const MACROBLOCKD *const xd);
 
 const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const struct VP9_COMP *cpi,
                                                    int ref_frame);


@@ -1220,13 +1220,12 @@ static int64_t rd_sbuv_dcpred(const VP9_COMP *cpi, MACROBLOCK *x,
   return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
 }
 
-static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
+static void choose_intra_uv_mode(VP9_COMP *cpi, MACROBLOCK *const x,
+                                 PICK_MODE_CONTEXT *ctx,
                                  BLOCK_SIZE bsize, TX_SIZE max_tx_size,
                                  int *rate_uv, int *rate_uv_tokenonly,
                                  int64_t *dist_uv, int *skip_uv,
                                  PREDICTION_MODE *mode_uv) {
-  MACROBLOCK *const x = &cpi->mb;
-
   // Use an estimated rd for uv_intra based on DC_PRED if the
   // appropriate speed flag is set.
   if (cpi->sf.use_uv_intra_rd_estimate) {
@@ -2519,7 +2518,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       int64_t tmp_skip_sse = INT64_MAX;
 
       mbmi->interp_filter = i;
-      rs = vp9_get_switchable_rate(cpi);
+      rs = vp9_get_switchable_rate(cpi, xd);
       rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
 
       if (i > 0 && intpel_mv) {
@@ -2603,7 +2602,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   // Set the appropriate filter
   mbmi->interp_filter = cm->interp_filter != SWITCHABLE ?
       cm->interp_filter : best_filter;
-  rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(cpi) : 0;
+  rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(cpi, xd) : 0;
 
   if (pred_exists) {
     if (best_needs_copy) {
@@ -3146,7 +3145,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
       uv_tx = get_uv_tx_size_impl(mbmi->tx_size, bsize, pd->subsampling_x,
                                   pd->subsampling_y);
       if (rate_uv_intra[uv_tx] == INT_MAX) {
-        choose_intra_uv_mode(cpi, ctx, bsize, uv_tx,
+        choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx,
                              &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx],
                              &dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]);
       }
@@ -3517,7 +3516,7 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi,
     int best_rs = INT_MAX;
     for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
       mbmi->interp_filter = i;
-      rs = vp9_get_switchable_rate(cpi);
+      rs = vp9_get_switchable_rate(cpi, xd);
       if (rs < best_rs) {
         best_rs = rs;
         best_filter = mbmi->interp_filter;
@@ -3528,7 +3527,7 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi,
   // Set the appropriate filter
   if (cm->interp_filter == SWITCHABLE) {
     mbmi->interp_filter = best_filter;
-    rate2 += vp9_get_switchable_rate(cpi);
+    rate2 += vp9_get_switchable_rate(cpi, xd);
   } else {
     mbmi->interp_filter = cm->interp_filter;
   }
@@ -3780,7 +3779,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
       distortion2 += distortion_y;
 
       if (rate_uv_intra == INT_MAX) {
-        choose_intra_uv_mode(cpi, ctx, bsize, TX_4X4,
+        choose_intra_uv_mode(cpi, x, ctx, bsize, TX_4X4,
                              &rate_uv_intra,
                              &rate_uv_tokenonly,
                              &dist_uv, &skip_uv,
@@ -3844,7 +3843,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
         if (tmp_rd == INT64_MAX)
           continue;
-        rs = vp9_get_switchable_rate(cpi);
+        rs = vp9_get_switchable_rate(cpi, xd);
         rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
         filter_cache[switchable_filter_index] = tmp_rd;
         filter_cache[SWITCHABLE_FILTERS] =
@@ -3922,7 +3921,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
       distortion2 += distortion;
 
       if (cm->interp_filter == SWITCHABLE)
-        rate2 += vp9_get_switchable_rate(cpi);
+        rate2 += vp9_get_switchable_rate(cpi, xd);
 
       if (!mode_excluded)
         mode_excluded = comp_pred ? cm->reference_mode == SINGLE_REFERENCE


@@ -37,10 +37,6 @@ void vp9_set_segment_data(struct segmentation *seg,
   seg->abs_delta = abs_delta;
 
   vpx_memcpy(seg->feature_data, feature_data, sizeof(seg->feature_data));
-
-  // TBD ?? Set the feature mask
-  // vpx_memcpy(cpi->mb.e_mbd.segment_feature_mask, 0,
-  //            sizeof(cpi->mb.e_mbd.segment_feature_mask));
 }
 
 void vp9_disable_segfeature(struct segmentation *seg, int segment_id,
                             SEG_LVL_FEATURES feature_id) {


@@ -382,7 +382,7 @@ void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) {
 void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
   SPEED_FEATURES *const sf = &cpi->sf;
   VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCK *const x = &cpi->td.mb;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   int i;


@@ -213,7 +213,7 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
                                               uint8_t *arf_frame_buf,
                                               uint8_t *frame_ptr_buf,
                                               int stride) {
-  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
   int step_param;
@@ -282,7 +282,7 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
   int mb_uv_offset = 0;
   DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16 * 16 * 3);
   DECLARE_ALIGNED_ARRAY(16, uint16_t, count, 16 * 16 * 3);
-  MACROBLOCKD *mbd = &cpi->mb.e_mbd;
+  MACROBLOCKD *mbd = &cpi->td.mb.e_mbd;
   YV12_BUFFER_CONFIG *f = frames[alt_ref_index];
   uint8_t *dst1, *dst2;
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -321,8 +321,8 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
     // 8 - VP9_INTERP_EXTEND.
     // To keep the mv in play for both Y and UV planes the max that it
     // can be on a border is therefore 16 - (2*VP9_INTERP_EXTEND+1).
-    cpi->mb.mv_row_min = -((mb_row * 16) + (17 - 2 * VP9_INTERP_EXTEND));
-    cpi->mb.mv_row_max = ((mb_rows - 1 - mb_row) * 16)
+    cpi->td.mb.mv_row_min = -((mb_row * 16) + (17 - 2 * VP9_INTERP_EXTEND));
+    cpi->td.mb.mv_row_max = ((mb_rows - 1 - mb_row) * 16)
                          + (17 - 2 * VP9_INTERP_EXTEND);
 
     for (mb_col = 0; mb_col < mb_cols; mb_col++) {
@@ -332,8 +332,8 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
       vpx_memset(accumulator, 0, 16 * 16 * 3 * sizeof(accumulator[0]));
       vpx_memset(count, 0, 16 * 16 * 3 * sizeof(count[0]));
 
-      cpi->mb.mv_col_min = -((mb_col * 16) + (17 - 2 * VP9_INTERP_EXTEND));
-      cpi->mb.mv_col_max = ((mb_cols - 1 - mb_col) * 16)
+      cpi->td.mb.mv_col_min = -((mb_col * 16) + (17 - 2 * VP9_INTERP_EXTEND));
+      cpi->td.mb.mv_col_max = ((mb_cols - 1 - mb_col) * 16)
                            + (17 - 2 * VP9_INTERP_EXTEND);
 
       for (frame = 0; frame < frame_count; frame++) {
@@ -653,6 +653,7 @@ static void adjust_arnr_filter(VP9_COMP *cpi,
 void vp9_temporal_filter(VP9_COMP *cpi, int distance) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   int frame;
   int frames_to_blur;
   int start_frame;
@@ -720,8 +721,8 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) {
     }
   }
   cm->mi = cm->mip + cm->mi_stride + 1;
-  cpi->mb.e_mbd.mi = cm->mi;
-  cpi->mb.e_mbd.mi[0].src_mi = &cpi->mb.e_mbd.mi[0];
+  xd->mi = cm->mi;
+  xd->mi[0].src_mi = &xd->mi[0];
  } else {
   // ARF is produced at the native frame size and resized when coded.
 #if CONFIG_VP9_HIGHBITDEPTH


@@ -244,15 +244,17 @@ void vp9_tokenize_initialize() {
 struct tokenize_b_args {
   VP9_COMP *cpi;
-  MACROBLOCKD *xd;
+  ThreadData *td;
   TOKENEXTRA **tp;
 };
 
 static void set_entropy_context_b(int plane, int block, BLOCK_SIZE plane_bsize,
                                   TX_SIZE tx_size, void *arg) {
   struct tokenize_b_args* const args = arg;
-  MACROBLOCKD *const xd = args->xd;
-  struct macroblock_plane *p = &args->cpi->mb.plane[plane];
+  ThreadData *const td = args->td;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *p = &x->plane[plane];
   struct macroblockd_plane *pd = &xd->plane[plane];
   int aoff, loff;
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
@@ -294,10 +296,12 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
                        TX_SIZE tx_size, void *arg) {
   struct tokenize_b_args* const args = arg;
   VP9_COMP *cpi = args->cpi;
-  MACROBLOCKD *xd = args->xd;
+  ThreadData *const td = args->td;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
   TOKENEXTRA **tp = args->tp;
   uint8_t token_cache[32 * 32];
-  struct macroblock_plane *p = &cpi->mb.plane[plane];
+  struct macroblock_plane *p = &x->plane[plane];
   struct macroblockd_plane *pd = &xd->plane[plane];
   MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
   int pt; /* near block/prev token context index */
@@ -311,11 +315,11 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
   const scan_order *so;
   const int ref = is_inter_block(mbmi);
   unsigned int (*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
-      cpi->frame_counts->coef_counts[tx_size][type][ref];
+      td->rd_counts.coef_counts[tx_size][type][ref];
   vp9_prob (*const coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
       cpi->common.fc->coef_probs[tx_size][type][ref];
   unsigned int (*const eob_branch)[COEFF_CONTEXTS] =
-      cpi->common.counts.eob_branch[tx_size][type][ref];
+      td->counts->eob_branch[tx_size][type][ref];
   const uint8_t *const band = get_band_translate(tx_size);
   const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
   const TOKENVALUE *dct_value_tokens;
@@ -421,19 +425,20 @@ int vp9_has_high_freq_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
   return result;
 }
 
-void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
-                     BLOCK_SIZE bsize) {
+void vp9_tokenize_sb(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
+                     int dry_run, BLOCK_SIZE bsize) {
   VP9_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
   TOKENEXTRA *t_backup = *t;
   const int ctx = vp9_get_skip_context(xd);
   const int skip_inc = !vp9_segfeature_active(&cm->seg, mbmi->segment_id,
                                               SEG_LVL_SKIP);
-  struct tokenize_b_args arg = {cpi, xd, t};
+  struct tokenize_b_args arg = {cpi, td, t};
   if (mbmi->skip) {
     if (!dry_run)
-      cm->counts.skip[ctx][1] += skip_inc;
+      td->counts->skip[ctx][1] += skip_inc;
     reset_skip_context(xd, bsize);
     if (dry_run)
       *t = t_backup;
@@ -441,7 +446,7 @@ void vp9_tokenize_sb(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
   }
 
   if (!dry_run) {
-    cm->counts.skip[ctx][0] += skip_inc;
+    td->counts->skip[ctx][0] += skip_inc;
     vp9_foreach_transformed_block(xd, bsize, tokenize_b, &arg);
   } else {
     vp9_foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg);


@@ -52,9 +52,10 @@ int vp9_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
 int vp9_has_high_freq_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
 
 struct VP9_COMP;
+struct ThreadData;
 
-void vp9_tokenize_sb(struct VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
-                     BLOCK_SIZE bsize);
+void vp9_tokenize_sb(struct VP9_COMP *cpi, struct ThreadData *td,
+                     TOKENEXTRA **t, int dry_run, BLOCK_SIZE bsize);
 
 extern const int16_t *vp9_dct_value_cost_ptr;
 /* TODO: The Token field should be broken out into a separate char array to