Add single motion search for OBMC predictor

A weighted single motion search is implemented for the OBMC predictor.
When NEWMV mode is used, to determine the MV for the current block we
run a weighted motion search: each candidate prediction is compared
against (source - weighted prediction using the neighbors' MVs), so the
distortion measured is the actual prediction error of the final OBMC
prediction.
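
For intuition, here is how the weighted SAD reduces to the OBMC prediction
error (a sketch, assuming the two 1-D masks returned by setup_obmc_mask()
are complementary neighbor/self weights with w + m == 64, ignoring rounding):

/* OBMC blends the single prediction p with the above/left predictions:
 *   p1 = (w_a * p_above + m_a * p) / 64           <- vertical pass
 *   p2 = (w_l * p_left  + m_l * p1) / 64          <- horizontal pass
 * calc_target_weighted_pred() precomputes, per pixel,
 *   wsrc = 4096 * src - (m_l * w_a * p_above + 64 * w_l * p_left)
 *   mask = m_a * m_l
 * so that wsrc - mask * p == 4096 * (src - p2), and the weighted SAD
 *   (|wsrc - mask * p| + 2048) >> 12
 * is the per-pixel OBMC prediction error at the original pixel scale. */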

Coding gain: 0.404/0.425/0.366 for lowres/midres/hdres
Speed impact: +14% encoding time
              (obmc w/o mv search 13% -> obmc w/ mv search 27%)

Change-Id: Id7ad3fc6ba295b23d9c53c8a16a4ac1677ad835c
Yue Chen 2016-04-22 15:09:12 -07:00
Parent 1d2d1e752e
Commit 370f203a40
10 changed files with 1753 additions and 11 deletions

View file

@@ -462,6 +462,7 @@ static INLINE int vp10_is_interp_needed(const MACROBLOCKD *const xd) {
#endif // CONFIG_EXT_INTERP
#if CONFIG_OBMC
void setup_obmc_mask(int length, const uint8_t *mask[2]);
void vp10_build_obmc_inter_prediction(VP10_COMMON *cm,
MACROBLOCKD *xd, int mi_row, int mi_col,
int use_tmp_dst_buf,

View file

@@ -1219,6 +1219,49 @@ MAKE_MBFP_SAD_WRAPPER(vpx_highbd_masked_sad4x8)
MAKE_MBFP_SAD_WRAPPER(vpx_highbd_masked_sad4x4)
#endif // CONFIG_EXT_INTER
#if CONFIG_OBMC
#define HIGHBD_OBFP(BT, OSDF, OVF, OSVF) \
cpi->fn_ptr[BT].osdf = OSDF; \
cpi->fn_ptr[BT].ovf = OVF; \
cpi->fn_ptr[BT].osvf = OSVF;
#define MAKE_OBFP_SAD_WRAPPER(fnname) \
static unsigned int fnname##_bits8(const uint8_t *ref, int ref_stride, \
const int *wsrc, int wsrc_stride, \
const int *msk, int msk_stride) { \
return fnname(ref, ref_stride, wsrc, wsrc_stride, msk, msk_stride); \
} \
static unsigned int fnname##_bits10(const uint8_t *ref, int ref_stride, \
const int *wsrc, int wsrc_stride, \
const int *msk, int msk_stride) { \
return fnname(ref, ref_stride, wsrc, wsrc_stride, msk, msk_stride) >> 2; \
} \
static unsigned int fnname##_bits12(const uint8_t *ref, int ref_stride, \
const int *wsrc, int wsrc_stride, \
const int *msk, int msk_stride) { \
return fnname(ref, ref_stride, wsrc, wsrc_stride, msk, msk_stride) >> 4; \
}
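/* Like the plain highbd SAD wrappers, these normalize the weighted SAD back
 * to the 8-bit scale so RD costs stay comparable across bit depths.  Sanity
 * check with illustrative values:
 *   8-bit:  |diff| = 255            -> sad 255
 *   10-bit: |diff| = 1023, sad >> 2 -> 255
 *   12-bit: |diff| = 4095, sad >> 4 -> 255  */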
#if CONFIG_EXT_PARTITION
MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad128x128)
MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad128x64)
MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad64x128)
#endif // CONFIG_EXT_PARTITION
MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad64x64)
MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad64x32)
MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad32x64)
MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad32x32)
MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad32x16)
MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad16x32)
MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad16x16)
MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad16x8)
MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad8x16)
MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad8x8)
MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad8x4)
MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad4x8)
MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad4x4)
#endif // CONFIG_OBMC
static void highbd_set_var_fns(VP10_COMP *const cpi) {
VP10_COMMON *const cm = &cpi->common;
if (cm->use_highbitdepth) {
@@ -1454,6 +1497,74 @@ static void highbd_set_var_fns(VP10_COMP *const cpi) {
vpx_highbd_masked_variance4x4,
vpx_highbd_masked_sub_pixel_variance4x4)
#endif // CONFIG_EXT_INTER
#if CONFIG_OBMC
#if CONFIG_EXT_PARTITION
HIGHBD_OBFP(BLOCK_128X128,
vpx_highbd_obmc_sad128x128_bits8,
vpx_highbd_obmc_variance128x128,
vpx_highbd_obmc_sub_pixel_variance128x128)
HIGHBD_OBFP(BLOCK_128X64,
vpx_highbd_obmc_sad128x64_bits8,
vpx_highbd_obmc_variance128x64,
vpx_highbd_obmc_sub_pixel_variance128x64)
HIGHBD_OBFP(BLOCK_64X128,
vpx_highbd_obmc_sad64x128_bits8,
vpx_highbd_obmc_variance64x128,
vpx_highbd_obmc_sub_pixel_variance64x128)
#endif // CONFIG_EXT_PARTITION
HIGHBD_OBFP(BLOCK_64X64,
vpx_highbd_obmc_sad64x64_bits8,
vpx_highbd_obmc_variance64x64,
vpx_highbd_obmc_sub_pixel_variance64x64)
HIGHBD_OBFP(BLOCK_64X32,
vpx_highbd_obmc_sad64x32_bits8,
vpx_highbd_obmc_variance64x32,
vpx_highbd_obmc_sub_pixel_variance64x32)
HIGHBD_OBFP(BLOCK_32X64,
vpx_highbd_obmc_sad32x64_bits8,
vpx_highbd_obmc_variance32x64,
vpx_highbd_obmc_sub_pixel_variance32x64)
HIGHBD_OBFP(BLOCK_32X32,
vpx_highbd_obmc_sad32x32_bits8,
vpx_highbd_obmc_variance32x32,
vpx_highbd_obmc_sub_pixel_variance32x32)
HIGHBD_OBFP(BLOCK_32X16,
vpx_highbd_obmc_sad32x16_bits8,
vpx_highbd_obmc_variance32x16,
vpx_highbd_obmc_sub_pixel_variance32x16)
HIGHBD_OBFP(BLOCK_16X32,
vpx_highbd_obmc_sad16x32_bits8,
vpx_highbd_obmc_variance16x32,
vpx_highbd_obmc_sub_pixel_variance16x32)
HIGHBD_OBFP(BLOCK_16X16,
vpx_highbd_obmc_sad16x16_bits8,
vpx_highbd_obmc_variance16x16,
vpx_highbd_obmc_sub_pixel_variance16x16)
HIGHBD_OBFP(BLOCK_8X16,
vpx_highbd_obmc_sad8x16_bits8,
vpx_highbd_obmc_variance8x16,
vpx_highbd_obmc_sub_pixel_variance8x16)
HIGHBD_OBFP(BLOCK_16X8,
vpx_highbd_obmc_sad16x8_bits8,
vpx_highbd_obmc_variance16x8,
vpx_highbd_obmc_sub_pixel_variance16x8)
HIGHBD_OBFP(BLOCK_8X8,
vpx_highbd_obmc_sad8x8_bits8,
vpx_highbd_obmc_variance8x8,
vpx_highbd_obmc_sub_pixel_variance8x8)
HIGHBD_OBFP(BLOCK_4X8,
vpx_highbd_obmc_sad4x8_bits8,
vpx_highbd_obmc_variance4x8,
vpx_highbd_obmc_sub_pixel_variance4x8)
HIGHBD_OBFP(BLOCK_8X4,
vpx_highbd_obmc_sad8x4_bits8,
vpx_highbd_obmc_variance8x4,
vpx_highbd_obmc_sub_pixel_variance8x4)
HIGHBD_OBFP(BLOCK_4X4,
vpx_highbd_obmc_sad4x4_bits8,
vpx_highbd_obmc_variance4x4,
vpx_highbd_obmc_sub_pixel_variance4x4)
#endif // CONFIG_OBMC
break;
case VPX_BITS_10:
@@ -1687,6 +1798,74 @@ static void highbd_set_var_fns(VP10_COMP *const cpi) {
vpx_highbd_10_masked_variance4x4,
vpx_highbd_10_masked_sub_pixel_variance4x4)
#endif // CONFIG_EXT_INTER
#if CONFIG_OBMC
#if CONFIG_EXT_PARTITION
HIGHBD_OBFP(BLOCK_128X128,
vpx_highbd_obmc_sad128x128_bits10,
vpx_highbd_10_obmc_variance128x128,
vpx_highbd_10_obmc_sub_pixel_variance128x128)
HIGHBD_OBFP(BLOCK_128X64,
vpx_highbd_obmc_sad128x64_bits10,
vpx_highbd_10_obmc_variance128x64,
vpx_highbd_10_obmc_sub_pixel_variance128x64)
HIGHBD_OBFP(BLOCK_64X128,
vpx_highbd_obmc_sad64x128_bits10,
vpx_highbd_10_obmc_variance64x128,
vpx_highbd_10_obmc_sub_pixel_variance64x128)
#endif // CONFIG_EXT_PARTITION
HIGHBD_OBFP(BLOCK_64X64,
vpx_highbd_obmc_sad64x64_bits10,
vpx_highbd_10_obmc_variance64x64,
vpx_highbd_10_obmc_sub_pixel_variance64x64)
HIGHBD_OBFP(BLOCK_64X32,
vpx_highbd_obmc_sad64x32_bits10,
vpx_highbd_10_obmc_variance64x32,
vpx_highbd_10_obmc_sub_pixel_variance64x32)
HIGHBD_OBFP(BLOCK_32X64,
vpx_highbd_obmc_sad32x64_bits10,
vpx_highbd_10_obmc_variance32x64,
vpx_highbd_10_obmc_sub_pixel_variance32x64)
HIGHBD_OBFP(BLOCK_32X32,
vpx_highbd_obmc_sad32x32_bits10,
vpx_highbd_10_obmc_variance32x32,
vpx_highbd_10_obmc_sub_pixel_variance32x32)
HIGHBD_OBFP(BLOCK_32X16,
vpx_highbd_obmc_sad32x16_bits10,
vpx_highbd_10_obmc_variance32x16,
vpx_highbd_10_obmc_sub_pixel_variance32x16)
HIGHBD_OBFP(BLOCK_16X32,
vpx_highbd_obmc_sad16x32_bits10,
vpx_highbd_10_obmc_variance16x32,
vpx_highbd_10_obmc_sub_pixel_variance16x32)
HIGHBD_OBFP(BLOCK_16X16,
vpx_highbd_obmc_sad16x16_bits10,
vpx_highbd_10_obmc_variance16x16,
vpx_highbd_10_obmc_sub_pixel_variance16x16)
HIGHBD_OBFP(BLOCK_8X16,
vpx_highbd_obmc_sad8x16_bits10,
vpx_highbd_10_obmc_variance8x16,
vpx_highbd_10_obmc_sub_pixel_variance8x16)
HIGHBD_OBFP(BLOCK_16X8,
vpx_highbd_obmc_sad16x8_bits10,
vpx_highbd_10_obmc_variance16x8,
vpx_highbd_10_obmc_sub_pixel_variance16x8)
HIGHBD_OBFP(BLOCK_8X8,
vpx_highbd_obmc_sad8x8_bits10,
vpx_highbd_10_obmc_variance8x8,
vpx_highbd_10_obmc_sub_pixel_variance8x8)
HIGHBD_OBFP(BLOCK_4X8,
vpx_highbd_obmc_sad4x8_bits10,
vpx_highbd_10_obmc_variance4x8,
vpx_highbd_10_obmc_sub_pixel_variance4x8)
HIGHBD_OBFP(BLOCK_8X4,
vpx_highbd_obmc_sad8x4_bits10,
vpx_highbd_10_obmc_variance8x4,
vpx_highbd_10_obmc_sub_pixel_variance8x4)
HIGHBD_OBFP(BLOCK_4X4,
vpx_highbd_obmc_sad4x4_bits10,
vpx_highbd_10_obmc_variance4x4,
vpx_highbd_10_obmc_sub_pixel_variance4x4)
#endif // CONFIG_OBMC
break;
case VPX_BITS_12:
@@ -1920,6 +2099,75 @@ static void highbd_set_var_fns(VP10_COMP *const cpi) {
vpx_highbd_12_masked_variance4x4,
vpx_highbd_12_masked_sub_pixel_variance4x4)
#endif // CONFIG_EXT_INTER
#if CONFIG_OBMC
#if CONFIG_EXT_PARTITION
HIGHBD_OBFP(BLOCK_128X128,
vpx_highbd_obmc_sad128x128_bits12,
vpx_highbd_12_obmc_variance128x128,
vpx_highbd_12_obmc_sub_pixel_variance128x128)
HIGHBD_OBFP(BLOCK_128X64,
vpx_highbd_obmc_sad128x64_bits12,
vpx_highbd_12_obmc_variance128x64,
vpx_highbd_12_obmc_sub_pixel_variance128x64)
HIGHBD_OBFP(BLOCK_64X128,
vpx_highbd_obmc_sad64x128_bits12,
vpx_highbd_12_obmc_variance64x128,
vpx_highbd_12_obmc_sub_pixel_variance64x128)
#endif // CONFIG_EXT_PARTITION
HIGHBD_OBFP(BLOCK_64X64,
vpx_highbd_obmc_sad64x64_bits12,
vpx_highbd_12_obmc_variance64x64,
vpx_highbd_12_obmc_sub_pixel_variance64x64)
HIGHBD_OBFP(BLOCK_64X32,
vpx_highbd_obmc_sad64x32_bits12,
vpx_highbd_12_obmc_variance64x32,
vpx_highbd_12_obmc_sub_pixel_variance64x32)
HIGHBD_OBFP(BLOCK_32X64,
vpx_highbd_obmc_sad32x64_bits12,
vpx_highbd_12_obmc_variance32x64,
vpx_highbd_12_obmc_sub_pixel_variance32x64)
HIGHBD_OBFP(BLOCK_32X32,
vpx_highbd_obmc_sad32x32_bits12,
vpx_highbd_12_obmc_variance32x32,
vpx_highbd_12_obmc_sub_pixel_variance32x32)
HIGHBD_OBFP(BLOCK_32X16,
vpx_highbd_obmc_sad32x16_bits12,
vpx_highbd_12_obmc_variance32x16,
vpx_highbd_12_obmc_sub_pixel_variance32x16)
HIGHBD_OBFP(BLOCK_16X32,
vpx_highbd_obmc_sad16x32_bits12,
vpx_highbd_12_obmc_variance16x32,
vpx_highbd_12_obmc_sub_pixel_variance16x32)
HIGHBD_OBFP(BLOCK_16X16,
vpx_highbd_obmc_sad16x16_bits12,
vpx_highbd_12_obmc_variance16x16,
vpx_highbd_12_obmc_sub_pixel_variance16x16)
HIGHBD_OBFP(BLOCK_8X16,
vpx_highbd_obmc_sad8x16_bits12,
vpx_highbd_12_obmc_variance8x16,
vpx_highbd_12_obmc_sub_pixel_variance8x16)
HIGHBD_OBFP(BLOCK_16X8,
vpx_highbd_obmc_sad16x8_bits12,
vpx_highbd_12_obmc_variance16x8,
vpx_highbd_12_obmc_sub_pixel_variance16x8)
HIGHBD_OBFP(BLOCK_8X8,
vpx_highbd_obmc_sad8x8_bits12,
vpx_highbd_12_obmc_variance8x8,
vpx_highbd_12_obmc_sub_pixel_variance8x8)
HIGHBD_OBFP(BLOCK_4X8,
vpx_highbd_obmc_sad4x8_bits12,
vpx_highbd_12_obmc_variance4x8,
vpx_highbd_12_obmc_sub_pixel_variance4x8)
HIGHBD_OBFP(BLOCK_8X4,
vpx_highbd_obmc_sad8x4_bits12,
vpx_highbd_12_obmc_variance8x4,
vpx_highbd_12_obmc_sub_pixel_variance8x4)
HIGHBD_OBFP(BLOCK_4X4,
vpx_highbd_obmc_sad4x4_bits12,
vpx_highbd_12_obmc_variance4x4,
vpx_highbd_12_obmc_sub_pixel_variance4x4)
#endif // CONFIG_OBMC
break;
default:
@@ -2415,6 +2663,48 @@ VP10_COMP *vp10_create_compressor(VP10EncoderConfig *oxcf,
vpx_sub_pixel_avg_variance4x4,
vpx_sad4x4x3, vpx_sad4x4x8, vpx_sad4x4x4d)
#if CONFIG_OBMC
#define OBFP(BT, OSDF, OVF, OSVF) \
cpi->fn_ptr[BT].osdf = OSDF; \
cpi->fn_ptr[BT].ovf = OVF; \
cpi->fn_ptr[BT].osvf = OSVF;
#if CONFIG_EXT_PARTITION
OBFP(BLOCK_128X128, vpx_obmc_sad128x128, vpx_obmc_variance128x128,
vpx_obmc_sub_pixel_variance128x128)
OBFP(BLOCK_128X64, vpx_obmc_sad128x64, vpx_obmc_variance128x64,
vpx_obmc_sub_pixel_variance128x64)
OBFP(BLOCK_64X128, vpx_obmc_sad64x128, vpx_obmc_variance64x128,
vpx_obmc_sub_pixel_variance64x128)
#endif // CONFIG_EXT_PARTITION
OBFP(BLOCK_64X64, vpx_obmc_sad64x64, vpx_obmc_variance64x64,
vpx_obmc_sub_pixel_variance64x64)
OBFP(BLOCK_64X32, vpx_obmc_sad64x32, vpx_obmc_variance64x32,
vpx_obmc_sub_pixel_variance64x32)
OBFP(BLOCK_32X64, vpx_obmc_sad32x64, vpx_obmc_variance32x64,
vpx_obmc_sub_pixel_variance32x64)
OBFP(BLOCK_32X32, vpx_obmc_sad32x32, vpx_obmc_variance32x32,
vpx_obmc_sub_pixel_variance32x32)
OBFP(BLOCK_32X16, vpx_obmc_sad32x16, vpx_obmc_variance32x16,
vpx_obmc_sub_pixel_variance32x16)
OBFP(BLOCK_16X32, vpx_obmc_sad16x32, vpx_obmc_variance16x32,
vpx_obmc_sub_pixel_variance16x32)
OBFP(BLOCK_16X16, vpx_obmc_sad16x16, vpx_obmc_variance16x16,
vpx_obmc_sub_pixel_variance16x16)
OBFP(BLOCK_16X8, vpx_obmc_sad16x8, vpx_obmc_variance16x8,
vpx_obmc_sub_pixel_variance16x8)
OBFP(BLOCK_8X16, vpx_obmc_sad8x16, vpx_obmc_variance8x16,
vpx_obmc_sub_pixel_variance8x16)
OBFP(BLOCK_8X8, vpx_obmc_sad8x8, vpx_obmc_variance8x8,
vpx_obmc_sub_pixel_variance8x8)
OBFP(BLOCK_4X8, vpx_obmc_sad4x8, vpx_obmc_variance4x8,
vpx_obmc_sub_pixel_variance4x8)
OBFP(BLOCK_8X4, vpx_obmc_sad8x4, vpx_obmc_variance8x4,
vpx_obmc_sub_pixel_variance8x4)
OBFP(BLOCK_4X4, vpx_obmc_sad4x4, vpx_obmc_variance4x4,
vpx_obmc_sub_pixel_variance4x4)
#endif // CONFIG_OBMC
#if CONFIG_EXT_INTER
#define MBFP(BT, MSDF, MVF, MSVF) \
cpi->fn_ptr[BT].msdf = MSDF; \

View file

@@ -3253,3 +3253,544 @@ int vp10_masked_full_pixel_diamond(const VP10_COMP *cpi, MACROBLOCK *x,
return bestsme;
}
#endif // CONFIG_EXT_INTER
#if CONFIG_OBMC
/* returns subpixel variance error function */
#define DIST(r, c) \
vfp->osvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
src_stride, mask, mask_stride, &sse)
/* checks if (r, c) has better score than previous best */
#define MVC(r, c) \
(mvcost ? \
((mvjcost[((r) != rr) * 2 + ((c) != rc)] + \
mvcost[0][((r) - rr)] + mvcost[1][((c) - rc)]) * \
error_per_bit + 4096) >> 13 : 0)
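/* The "(... * error_per_bit + 4096) >> 13" in MVC is a round-to-nearest
 * fixed-point division by 2^13, matching mv_err_cost()'s
 * ROUND_POWER_OF_TWO(..., 13); e.g. for a scaled cost x = 12288
 * (i.e. 1.5 * 8192), (x + 4096) >> 13 == 2. */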
#define CHECK_BETTER(v, r, c) \
if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
thismse = (DIST(r, c)); \
if ((v = MVC(r, c) + thismse) < besterr) { \
besterr = v; \
br = r; \
bc = c; \
*distortion = thismse; \
*sse1 = sse; \
} \
} else { \
v = INT_MAX; \
}
#undef CHECK_BETTER0
#define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c)
#undef CHECK_BETTER1
#define CHECK_BETTER1(v, r, c) \
if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
thismse = upsampled_obmc_pref_error(xd, \
mask, mask_stride, \
vfp, z, src_stride, \
upre(y, y_stride, r, c), \
y_stride, \
w, h, &sse); \
if ((v = MVC(r, c) + thismse) < besterr) { \
besterr = v; \
br = r; \
bc = c; \
*distortion = thismse; \
*sse1 = sse; \
} \
} else { \
v = INT_MAX; \
}
static unsigned int setup_obmc_center_error(const int *mask,
int mask_stride,
const MV *bestmv,
const MV *ref_mv,
int error_per_bit,
const vp10_variance_fn_ptr_t *vfp,
const int *const wsrc,
const int wsrc_stride,
const uint8_t *const y,
int y_stride,
int offset,
int *mvjcost, int *mvcost[2],
unsigned int *sse1,
int *distortion) {
unsigned int besterr;
besterr = vfp->ovf(y + offset, y_stride, wsrc, wsrc_stride,
mask, mask_stride, sse1);
*distortion = besterr;
besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
return besterr;
}
static int upsampled_obmc_pref_error(const MACROBLOCKD *xd,
const int *mask, int mask_stride,
const vp10_variance_fn_ptr_t *vfp,
const int *const wsrc,
const int wsrc_stride,
const uint8_t *const y, int y_stride,
int w, int h, unsigned int *sse) {
unsigned int besterr;
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
vpx_highbd_upsampled_pred(pred16, w, h, y, y_stride);
besterr = vfp->ovf(CONVERT_TO_BYTEPTR(pred16), w, wsrc, wsrc_stride,
mask, mask_stride, sse);
} else {
DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
#else
DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
(void) xd;
#endif // CONFIG_VP9_HIGHBITDEPTH
vpx_upsampled_pred(pred, w, h, y, y_stride);
besterr = vfp->ovf(pred, w, wsrc, wsrc_stride, mask, mask_stride, sse);
#if CONFIG_VP9_HIGHBITDEPTH
}
#endif
return besterr;
}
static unsigned int upsampled_setup_obmc_center_error(
const MACROBLOCKD *xd,
const int *mask, int mask_stride,
const MV *bestmv, const MV *ref_mv,
int error_per_bit, const vp10_variance_fn_ptr_t *vfp,
const int *const wsrc, const int wsrc_stride,
const uint8_t *const y, int y_stride,
int w, int h, int offset, int *mvjcost, int *mvcost[2],
unsigned int *sse1, int *distortion) {
unsigned int besterr = upsampled_obmc_pref_error(xd, mask, mask_stride, vfp,
wsrc, wsrc_stride,
y + offset, y_stride,
w, h, sse1);
*distortion = besterr;
besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
return besterr;
}
int vp10_find_best_obmc_sub_pixel_tree_up(VP10_COMP *cpi, MACROBLOCK *x,
const int *wsrc, int wsrc_stride,
const int *mask, int mask_stride,
int mi_row, int mi_col,
MV *bestmv, const MV *ref_mv,
int allow_hp, int error_per_bit,
const vp10_variance_fn_ptr_t *vfp,
int forced_stop, int iters_per_step,
int *mvjcost, int *mvcost[2],
int *distortion, unsigned int *sse1,
int is_second,
int use_upsampled_ref) {
const int *const z = wsrc;
const int *const src_address = z;
const int src_stride = wsrc_stride;
MACROBLOCKD *xd = &x->e_mbd;
struct macroblockd_plane *const pd = &xd->plane[0];
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
unsigned int besterr = INT_MAX;
unsigned int sse;
unsigned int thismse;
int rr = ref_mv->row;
int rc = ref_mv->col;
int br = bestmv->row * 8;
int bc = bestmv->col * 8;
int hstep = 4;
int iter;
int round = 3 - forced_stop;
const int minc = VPXMAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
const int maxc = VPXMIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
const int minr = VPXMAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
const int maxr = VPXMIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
int tr = br;
int tc = bc;
const MV *search_step = search_step_table;
int idx, best_idx = -1;
unsigned int cost_array[5];
int kr, kc;
const int w = 4 * num_4x4_blocks_wide_lookup[mbmi->sb_type];
const int h = 4 * num_4x4_blocks_high_lookup[mbmi->sb_type];
int offset;
int y_stride;
const uint8_t *y;
const struct buf_2d backup_pred = pd->pre[is_second];
if (use_upsampled_ref) {
int ref = xd->mi[0]->mbmi.ref_frame[is_second];
const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref);
setup_pred_plane(&pd->pre[is_second], upsampled_ref->y_buffer,
upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3),
NULL, pd->subsampling_x, pd->subsampling_y);
}
y = pd->pre[is_second].buf;
y_stride = pd->pre[is_second].stride;
offset = bestmv->row * y_stride + bestmv->col;
if (!(allow_hp && vp10_use_mv_hp(ref_mv)))
if (round == 3)
round = 2;
bestmv->row *= 8;
bestmv->col *= 8;
// use_upsampled_ref can be 0 or 1
if (use_upsampled_ref)
besterr = upsampled_setup_obmc_center_error(
xd, mask, mask_stride, bestmv, ref_mv, error_per_bit,
vfp, z, src_stride, y, y_stride,
w, h, (offset << 3),
mvjcost, mvcost, sse1, distortion);
else
besterr = setup_obmc_center_error(
mask, mask_stride, bestmv, ref_mv, error_per_bit,
vfp, z, src_stride, y, y_stride,
offset, mvjcost, mvcost, sse1, distortion);
for (iter = 0; iter < round; ++iter) {
// Check vertical and horizontal sub-pixel positions.
for (idx = 0; idx < 4; ++idx) {
tr = br + search_step[idx].row;
tc = bc + search_step[idx].col;
if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
MV this_mv = {tr, tc};
if (use_upsampled_ref) {
const uint8_t *const pre_address = y + tr * y_stride + tc;
thismse = upsampled_obmc_pref_error(xd, mask, mask_stride,
vfp, src_address, src_stride,
pre_address, y_stride,
w, h, &sse);
} else {
const uint8_t *const pre_address = y + (tr >> 3) * y_stride +
(tc >> 3);
thismse = vfp->osvf(pre_address, y_stride, sp(tc), sp(tr),
src_address, src_stride,
mask, mask_stride, &sse);
}
cost_array[idx] = thismse +
mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
if (cost_array[idx] < besterr) {
best_idx = idx;
besterr = cost_array[idx];
*distortion = thismse;
*sse1 = sse;
}
} else {
cost_array[idx] = INT_MAX;
}
}
// Check diagonal sub-pixel position
kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep);
kr = (cost_array[2] <= cost_array[3] ? -hstep : hstep);
tc = bc + kc;
tr = br + kr;
if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
MV this_mv = {tr, tc};
if (use_upsampled_ref) {
const uint8_t *const pre_address = y + tr * y_stride + tc;
thismse = upsampled_obmc_pref_error(xd, mask, mask_stride,
vfp, src_address, src_stride,
pre_address, y_stride,
w, h, &sse);
} else {
const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
thismse = vfp->osvf(pre_address, y_stride, sp(tc), sp(tr),
src_address, src_stride, mask, mask_stride, &sse);
}
cost_array[4] = thismse +
mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
if (cost_array[4] < besterr) {
best_idx = 4;
besterr = cost_array[4];
*distortion = thismse;
*sse1 = sse;
}
} else {
cost_array[idx] = INT_MAX;
}
if (best_idx < 4 && best_idx >= 0) {
br += search_step[best_idx].row;
bc += search_step[best_idx].col;
} else if (best_idx == 4) {
br = tr;
bc = tc;
}
if (iters_per_step > 1 && best_idx != -1) {
if (use_upsampled_ref) {
SECOND_LEVEL_CHECKS_BEST(1);
} else {
SECOND_LEVEL_CHECKS_BEST(0);
}
}
tr = br;
tc = bc;
search_step += 4;
hstep >>= 1;
best_idx = -1;
}
// These lines ensure static analysis doesn't warn that
// tr and tc aren't used after the above point.
(void) tr;
(void) tc;
bestmv->row = br;
bestmv->col = bc;
if (use_upsampled_ref) {
pd->pre[is_second] = backup_pred;
}
if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
(abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
return INT_MAX;
return besterr;
}
#undef DIST
#undef MVC
#undef CHECK_BETTER
static int get_obmc_mvpred_var(const MACROBLOCK *x,
const int *wsrc, int wsrc_stride,
const int *mask, int mask_stride,
const MV *best_mv, const MV *center_mv,
const vp10_variance_fn_ptr_t *vfp,
int use_mvcost, int is_second) {
const MACROBLOCKD *const xd = &x->e_mbd;
const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
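// best_mv is in full-pel units; scale to 1/8-pel for the MV rate cost.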
const MV mv = {best_mv->row * 8, best_mv->col * 8};
unsigned int unused;
return vfp->ovf(get_buf_from_mv(in_what, best_mv), in_what->stride,
wsrc, wsrc_stride, mask, mask_stride, &unused) +
(use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost,
x->mvcost, x->errorperbit) : 0);
}
int obmc_refining_search_sad(const MACROBLOCK *x,
const int *wsrc, int wsrc_stride,
const int *mask, int mask_stride,
MV *ref_mv, int error_per_bit,
int search_range,
const vp10_variance_fn_ptr_t *fn_ptr,
const MV *center_mv, int is_second) {
const MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
const MACROBLOCKD *const xd = &x->e_mbd;
const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
unsigned int best_sad = fn_ptr->osdf(get_buf_from_mv(in_what, ref_mv),
in_what->stride,
wsrc, wsrc_stride, mask, mask_stride) +
mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
int i, j;
for (i = 0; i < search_range; i++) {
int best_site = -1;
for (j = 0; j < 4; j++) {
const MV mv = {ref_mv->row + neighbors[j].row,
ref_mv->col + neighbors[j].col};
if (is_mv_in(x, &mv)) {
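// Compute the raw weighted SAD first and fold in the MV rate cost only
// when the candidate already beats the current best (same early-out as
// the regular refining search).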
unsigned int sad = fn_ptr->osdf(get_buf_from_mv(in_what, &mv),
in_what->stride, wsrc, wsrc_stride,
mask, mask_stride);
if (sad < best_sad) {
sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
if (sad < best_sad) {
best_sad = sad;
best_site = j;
}
}
}
}
if (best_site == -1) {
break;
} else {
ref_mv->row += neighbors[best_site].row;
ref_mv->col += neighbors[best_site].col;
}
}
return best_sad;
}
int obmc_diamond_search_sad(const MACROBLOCK *x,
const search_site_config *cfg,
const int *wsrc, int wsrc_stride,
const int *mask, int mask_stride,
MV *ref_mv, MV *best_mv,
int search_param,
int sad_per_bit, int *num00,
const vp10_variance_fn_ptr_t *fn_ptr,
const MV *center_mv, int is_second) {
const MACROBLOCKD *const xd = &x->e_mbd;
const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
// search_param determines the length of the initial step and hence the number
// of iterations
// 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 =
// (MAX_FIRST_STEP/4) pel... etc.
const search_site *const ss = &cfg->ss[search_param * cfg->searches_per_step];
const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param;
const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
const uint8_t *best_address, *in_what_ref;
int best_sad = INT_MAX;
int best_site = 0;
int last_site = 0;
int i, j, step;
clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
in_what_ref = in_what->buf + ref_mv->row * in_what->stride + ref_mv->col;
best_address = in_what_ref;
*num00 = 0;
*best_mv = *ref_mv;
// Check the starting position
best_sad = fn_ptr->osdf(best_address, in_what->stride,
wsrc, wsrc_stride, mask, mask_stride) +
mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
i = 1;
for (step = 0; step < tot_steps; step++) {
for (j = 0; j < cfg->searches_per_step; j++) {
const MV mv = {best_mv->row + ss[i].mv.row,
best_mv->col + ss[i].mv.col};
if (is_mv_in(x, &mv)) {
int sad = fn_ptr->osdf(best_address + ss[i].offset, in_what->stride,
wsrc, wsrc_stride, mask, mask_stride);
if (sad < best_sad) {
sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
if (sad < best_sad) {
best_sad = sad;
best_site = i;
}
}
}
i++;
}
if (best_site != last_site) {
best_mv->row += ss[best_site].mv.row;
best_mv->col += ss[best_site].mv.col;
best_address += ss[best_site].offset;
last_site = best_site;
#if defined(NEW_DIAMOND_SEARCH)
while (1) {
const MV this_mv = {best_mv->row + ss[best_site].mv.row,
best_mv->col + ss[best_site].mv.col};
if (is_mv_in(x, &this_mv)) {
int sad = fn_ptr->osdf(best_address + ss[best_site].offset,
in_what->stride, wsrc, wsrc_stride,
mask, mask_stride);
if (sad < best_sad) {
sad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
if (sad < best_sad) {
best_sad = sad;
best_mv->row += ss[best_site].mv.row;
best_mv->col += ss[best_site].mv.col;
best_address += ss[best_site].offset;
continue;
}
}
}
break;
}
#endif
} else if (best_address == in_what_ref) {
(*num00)++;
}
}
return best_sad;
}
int vp10_obmc_full_pixel_diamond(const VP10_COMP *cpi, MACROBLOCK *x,
const int *wsrc, int wsrc_stride,
const int *mask, int mask_stride,
MV *mvp_full, int step_param,
int sadpb, int further_steps, int do_refine,
const vp10_variance_fn_ptr_t *fn_ptr,
const MV *ref_mv, MV *dst_mv,
int is_second) {
MV temp_mv;
int thissme, n, num00 = 0;
int bestsme = obmc_diamond_search_sad(x, &cpi->ss_cfg,
wsrc, wsrc_stride,
mask, mask_stride,
mvp_full, &temp_mv,
step_param, sadpb, &n,
fn_ptr, ref_mv, is_second);
if (bestsme < INT_MAX)
bestsme = get_obmc_mvpred_var(x, wsrc, wsrc_stride, mask, mask_stride,
&temp_mv, ref_mv, fn_ptr, 1, is_second);
*dst_mv = temp_mv;
// If there won't be more n-step search, check to see if refining search is
// needed.
if (n > further_steps)
do_refine = 0;
while (n < further_steps) {
++n;
if (num00) {
num00--;
} else {
thissme = obmc_diamond_search_sad(x, &cpi->ss_cfg,
wsrc, wsrc_stride,
mask, mask_stride,
mvp_full, &temp_mv,
step_param + n, sadpb, &num00,
fn_ptr, ref_mv, is_second);
if (thissme < INT_MAX)
thissme = get_obmc_mvpred_var(x, wsrc, wsrc_stride, mask, mask_stride,
&temp_mv, ref_mv, fn_ptr, 1, is_second);
// check to see if refining search is needed.
if (num00 > further_steps - n)
do_refine = 0;
if (thissme < bestsme) {
bestsme = thissme;
*dst_mv = temp_mv;
}
}
}
// final 1-away diamond refining search
if (do_refine) {
const int search_range = 8;
MV best_mv = *dst_mv;
thissme = obmc_refining_search_sad(x, wsrc, wsrc_stride, mask, mask_stride,
&best_mv, sadpb, search_range,
fn_ptr, ref_mv, is_second);
if (thissme < INT_MAX)
thissme = get_obmc_mvpred_var(x, wsrc, wsrc_stride, mask, mask_stride,
&best_mv, ref_mv, fn_ptr, 1, is_second);
if (thissme < bestsme) {
bestsme = thissme;
*dst_mv = best_mv;
}
}
return bestsme;
}
#endif // CONFIG_OBMC

View file

@@ -195,6 +195,29 @@ int vp10_masked_full_pixel_diamond(const struct VP10_COMP *cpi, MACROBLOCK *x,
const MV *ref_mv, MV *dst_mv,
int is_second);
#endif // CONFIG_EXT_INTER
#if CONFIG_OBMC
int vp10_obmc_full_pixel_diamond(const struct VP10_COMP *cpi, MACROBLOCK *x,
const int *wsrc, int wsrc_stride,
const int *mask, int mask_stride,
MV *mvp_full, int step_param,
int sadpb, int further_steps, int do_refine,
const vp10_variance_fn_ptr_t *fn_ptr,
const MV *ref_mv, MV *dst_mv,
int is_second);
int vp10_find_best_obmc_sub_pixel_tree_up(struct VP10_COMP *cpi, MACROBLOCK *x,
const int *wsrc, int wsrc_stride,
const int *mask, int mask_stride,
int mi_row, int mi_col,
MV *bestmv, const MV *ref_mv,
int allow_hp, int error_per_bit,
const vp10_variance_fn_ptr_t *vfp,
int forced_stop, int iters_per_step,
int *mvjcost, int *mvcost[2],
int *distortion, unsigned int *sse1,
int is_second,
int use_upsampled_ref);
#endif // CONFIG_OBMC
#ifdef __cplusplus
} // extern "C"
#endif

View file

@@ -5980,6 +5980,149 @@ static INLINE void restore_dst_buf(MACROBLOCKD *xd,
}
}
#if CONFIG_OBMC
static void single_motion_search_obmc(VP10_COMP *cpi, MACROBLOCK *x,
BLOCK_SIZE bsize, int mi_row, int mi_col,
const int* wsrc, int wsrc_stride,
const int* mask, int mask_stride,
#if CONFIG_EXT_INTER
int ref_idx,
int mv_idx,
#endif // CONFIG_EXT_INTER
int_mv *tmp_mv, int_mv pred_mv,
int *rate_mv) {
MACROBLOCKD *xd = &x->e_mbd;
const VP10_COMMON *cm = &cpi->common;
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
int bestsme = INT_MAX;
int step_param;
int sadpb = x->sadperbit16;
MV mvp_full;
#if CONFIG_EXT_INTER
int ref = mbmi->ref_frame[ref_idx];
MV ref_mv = x->mbmi_ext->ref_mvs[ref][mv_idx].as_mv;
#else
int ref = mbmi->ref_frame[0];
MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv;
int ref_idx = 0;
#endif // CONFIG_EXT_INTER
int tmp_col_min = x->mv_col_min;
int tmp_col_max = x->mv_col_max;
int tmp_row_min = x->mv_row_min;
int tmp_row_max = x->mv_row_max;
const YV12_BUFFER_CONFIG *scaled_ref_frame = vp10_get_scaled_ref_frame(cpi,
ref);
#if CONFIG_REF_MV
vp10_set_mvcost(x, ref);
#endif
if (scaled_ref_frame) {
int i;
// Swap out the reference frame for a version that's been scaled to
// match the resolution of the current frame, allowing the existing
// motion search code to be used without additional modifications.
for (i = 0; i < MAX_MB_PLANE; i++)
backup_yv12[i] = xd->plane[i].pre[ref_idx];
vp10_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL);
}
vp10_set_mv_search_range(x, &ref_mv);
// Work out the size of the first step in the mv step search.
// 0 here is maximum length first step. 1 is VPXMAX >> 1 etc.
if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
// Take wtd average of the step_params based on the last frame's
// max mv magnitude and that based on the best ref mvs of the current
// block for the given reference.
step_param = (vp10_init_search_range(x->max_mv_context[ref]) +
cpi->mv_step_param) / 2;
} else {
step_param = cpi->mv_step_param;
}
if (cpi->sf.adaptive_motion_search && bsize < cm->sb_size) {
int boffset = 2 * (b_width_log2_lookup[cm->sb_size] -
VPXMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize]));
step_param = VPXMAX(step_param, boffset);
}
if (cpi->sf.adaptive_motion_search) {
int bwl = b_width_log2_lookup[bsize];
int bhl = b_height_log2_lookup[bsize];
int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);
if (tlevel < 5)
step_param += 2;
// prev_mv_sad is not set up for dynamically scaled frames.
if (cpi->oxcf.resize_mode != RESIZE_DYNAMIC) {
int i;
for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
x->pred_mv[ref].row = 0;
x->pred_mv[ref].col = 0;
tmp_mv->as_int = INVALID_MV;
if (scaled_ref_frame) {
int i;
for (i = 0; i < MAX_MB_PLANE; ++i)
xd->plane[i].pre[ref_idx] = backup_yv12[i];
}
return;
}
}
}
}
mvp_full = pred_mv.as_mv;
mvp_full.col >>= 3;
mvp_full.row >>= 3;
bestsme = vp10_obmc_full_pixel_diamond(cpi, x, wsrc, wsrc_stride,
mask, mask_stride,
&mvp_full, step_param, sadpb,
MAX_MVSEARCH_STEPS - 1 - step_param,
1, &cpi->fn_ptr[bsize],
&ref_mv, &tmp_mv->as_mv, ref_idx);
x->mv_col_min = tmp_col_min;
x->mv_col_max = tmp_col_max;
x->mv_row_min = tmp_row_min;
x->mv_row_max = tmp_row_max;
if (bestsme < INT_MAX) {
int dis;
vp10_find_best_obmc_sub_pixel_tree_up(cpi, x,
wsrc, wsrc_stride,
mask, mask_stride,
mi_row, mi_col,
&tmp_mv->as_mv, &ref_mv,
cm->allow_high_precision_mv,
x->errorperbit,
&cpi->fn_ptr[bsize],
cpi->sf.mv.subpel_force_stop,
cpi->sf.mv.subpel_iters_per_step,
x->nmvjointcost, x->mvcost,
&dis, &x->pred_sse[ref],
ref_idx,
cpi->sf.use_upsampled_references);
}
*rate_mv = vp10_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
if (scaled_ref_frame) {
int i;
for (i = 0; i < MAX_MB_PLANE; i++)
xd->plane[i].pre[ref_idx] = backup_yv12[i];
}
}
#endif // CONFIG_OBMC
#if CONFIG_EXT_INTER
static void do_masked_motion_search(VP10_COMP *cpi, MACROBLOCK *x,
const uint8_t *mask, int mask_stride,
@@ -6314,10 +6457,10 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x,
int_mv (*mode_mv)[MAX_REF_FRAMES],
int mi_row, int mi_col,
#if CONFIG_OBMC
uint8_t *dst_buf1[3],
int dst_stride1[3],
uint8_t *dst_buf2[3],
int dst_stride2[3],
uint8_t *dst_buf1[3], int dst_stride1[3],
uint8_t *dst_buf2[3], int dst_stride2[3],
int *wsrc, int wsrc_strides,
int *mask2d, int mask2d_strides,
#endif // CONFIG_OBMC
#if CONFIG_EXT_INTER
int_mv single_newmvs[2][MAX_REF_FRAMES],
@@ -6379,6 +6522,7 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x,
MB_MODE_INFO best_mbmi;
#if CONFIG_EXT_INTER
int rate2_bmc_nocoeff;
int rate_mv_bmc;
MB_MODE_INFO best_bmc_mbmi;
#endif // CONFIG_EXT_INTER
#endif // CONFIG_OBMC
@@ -6817,6 +6961,7 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x,
#if CONFIG_EXT_INTER
#if CONFIG_OBMC
best_bmc_mbmi = *mbmi;
rate_mv_bmc = rate_mv;
rate2_bmc_nocoeff = *rate2;
if (cm->interp_filter == SWITCHABLE)
rate2_bmc_nocoeff += rs;
@@ -7294,14 +7439,45 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x,
for (mbmi->obmc = 0; mbmi->obmc <= allow_obmc; mbmi->obmc++) {
int64_t tmp_rd, tmp_dist;
int tmp_rate;
#if CONFIG_EXT_INTER
int tmp_rate2 = mbmi->obmc ? rate2_bmc_nocoeff : rate2_nocoeff;
#else
int tmp_rate2 = rate2_nocoeff;
#endif // CONFIG_EXT_INTER
if (mbmi->obmc) {
#if CONFIG_EXT_INTER
*mbmi = best_bmc_mbmi;
assert(!mbmi->use_wedge_interinter);
vp10_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
mbmi->obmc = 1;
#endif // CONFIG_EXT_INTER
if (!is_comp_pred && have_newmv_in_inter_mode(this_mode)) {
int_mv tmp_mv;
int_mv pred_mv;
int tmp_rate_mv = 0;
pred_mv.as_int = mbmi->mv[0].as_int;
single_motion_search_obmc(cpi, x, bsize, mi_row, mi_col,
wsrc, wsrc_strides,
mask2d, mask2d_strides,
#if CONFIG_EXT_INTER
0, mv_idx,
#endif // CONFIG_EXT_INTER
&tmp_mv, pred_mv, &tmp_rate_mv);
mbmi->mv[0].as_int = tmp_mv.as_int;
if (discount_newmv_test(cpi, this_mode, tmp_mv, mode_mv, refs[0])) {
tmp_rate_mv = VPXMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
}
#if CONFIG_EXT_INTER
tmp_rate2 = rate2_bmc_nocoeff - rate_mv_bmc + tmp_rate_mv;
#else
tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv;
#endif // CONFIG_EXT_INTER
vp10_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
#if CONFIG_EXT_INTER
} else {
vp10_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
#endif // CONFIG_EXT_INTER
}
vp10_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, 0,
NULL, NULL,
dst_buf1, dst_stride1,
@@ -7323,11 +7499,7 @@ static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x,
#endif // CONFIG_VP9_HIGHBITDEPTH
x->skip = 0;
#if CONFIG_EXT_INTER
*rate2 = mbmi->obmc ? rate2_bmc_nocoeff : rate2_nocoeff;
#else
*rate2 = rate2_nocoeff;
#endif // CONFIG_EXT_INTER
*rate2 = tmp_rate2;
if (allow_obmc)
*rate2 += cpi->obmc_cost[bsize][mbmi->obmc];
*distortion = 0;
@@ -7835,9 +8007,13 @@ void vp10_rd_pick_inter_mode_sb(VP10_COMP *cpi,
DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_SB_SQUARE]);
DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_SB_SQUARE]);
#endif // CONFIG_VP9_HIGHBITDEPTH
DECLARE_ALIGNED(16, int, weighted_src_buf[MAX_SB_SQUARE]);
DECLARE_ALIGNED(16, int, mask2d_buf[MAX_SB_SQUARE]);
uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
int dst_stride1[MAX_MB_PLANE] = {MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE};
int dst_stride2[MAX_MB_PLANE] = {MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE};
int weighted_src_stride = MAX_SB_SIZE;
int mask2d_stride = MAX_SB_SIZE;
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -7939,6 +8115,11 @@ void vp10_rd_pick_inter_mode_sb(VP10_COMP *cpi,
vp10_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, dst_buf2,
dst_stride2);
vp10_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
calc_target_weighted_pred(cm, x, xd, mi_row, mi_col,
dst_buf1[0], dst_stride1[0],
dst_buf2[0], dst_stride2[0],
mask2d_buf, mask2d_stride,
weighted_src_buf, weighted_src_stride);
#endif // CONFIG_OBMC
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
@@ -8485,6 +8666,8 @@ void vp10_rd_pick_inter_mode_sb(VP10_COMP *cpi,
#if CONFIG_OBMC
dst_buf1, dst_stride1,
dst_buf2, dst_stride2,
weighted_src_buf, weighted_src_stride,
mask2d_buf, mask2d_stride,
#endif // CONFIG_OBMC
#if CONFIG_EXT_INTER
single_newmvs,
@@ -8596,6 +8779,9 @@ void vp10_rd_pick_inter_mode_sb(VP10_COMP *cpi,
#if CONFIG_OBMC
dst_buf1, dst_stride1,
dst_buf2, dst_stride2,
weighted_src_buf,
weighted_src_stride,
mask2d_buf, mask2d_stride,
#endif // CONFIG_OBMC
#if CONFIG_EXT_INTER
dummy_single_newmvs,
@@ -10153,3 +10339,194 @@ void vp10_rd_pick_inter_mode_sub8x8(struct VP10_COMP *cpi,
store_coding_context(x, ctx, best_ref_index,
best_pred_diff, 0);
}
#if CONFIG_OBMC
void calc_target_weighted_pred(VP10_COMMON *cm,
MACROBLOCK *x,
MACROBLOCKD *xd,
int mi_row, int mi_col,
uint8_t *above_buf, int above_stride,
uint8_t *left_buf, int left_stride,
int *mask_buf, int mask_stride,
int *weighted_src_buf, int weighted_src_stride) {
const TileInfo *const tile = &xd->tile;
BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
int row, col, i, mi_step;
int bw = 8 * xd->n8_w;
int bh = 8 * xd->n8_h;
int *dst = weighted_src_buf;
int *mask2d = mask_buf;
uint8_t *src;
#if CONFIG_VP9_HIGHBITDEPTH
int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
#endif // CONFIG_VP9_HIGHBITDEPTH
for (row = 0; row < bh; ++row) {
for (col = 0; col < bw; ++col) {
dst[col] = 0;
mask2d[col] = 64;
}
dst += weighted_src_stride;
mask2d += mask_stride;
}
// handle above row
#if CONFIG_EXT_TILE
if (mi_row > 0 && (mi_row - 1 >= tile->mi_row_start)) {
#else
if (mi_row > 0) {
#endif // CONFIG_EXT_TILE
for (i = 0; i < VPXMIN(xd->n8_w, cm->mi_cols - mi_col); i += mi_step) {
int mi_row_offset = -1;
int mi_col_offset = i;
MODE_INFO *above_mi = xd->mi[mi_col_offset +
mi_row_offset * xd->mi_stride];
MB_MODE_INFO *above_mbmi = &above_mi->mbmi;
int overlap = num_4x4_blocks_high_lookup[bsize] << 1;
mi_step = VPXMIN(xd->n8_w,
num_8x8_blocks_wide_lookup[above_mbmi->sb_type]);
if (is_neighbor_overlappable(above_mbmi)) {
const struct macroblockd_plane *pd = &xd->plane[0];
int bw = (mi_step * MI_SIZE) >> pd->subsampling_x;
int bh = overlap >> pd->subsampling_y;
int dst_stride = weighted_src_stride;
int *dst = weighted_src_buf + (i * MI_SIZE >> pd->subsampling_x);
int tmp_stride = above_stride;
uint8_t *tmp = above_buf + (i * MI_SIZE >> pd->subsampling_x);
int mask2d_stride = mask_stride;
int *mask2d = mask_buf + (i * MI_SIZE >> pd->subsampling_x);
const uint8_t *mask1d[2];
setup_obmc_mask(bh, mask1d);
#if CONFIG_VP9_HIGHBITDEPTH
if (is_hbd) {
uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);
for (row = 0; row < bh; ++row) {
for (col = 0; col < bw; ++col) {
dst[col] = mask1d[1][row] * tmp16[col];
mask2d[col] = mask1d[0][row];
}
dst += dst_stride;
tmp16 += tmp_stride;
mask2d += mask2d_stride;
}
} else {
#endif // CONFIG_VP9_HIGHBITDEPTH
for (row = 0; row < bh; ++row) {
for (col = 0; col < bw; ++col) {
dst[col] = mask1d[1][row] * tmp[col];
mask2d[col] = mask1d[0][row];
}
dst += dst_stride;
tmp += tmp_stride;
mask2d += mask2d_stride;
}
#if CONFIG_VP9_HIGHBITDEPTH
}
#endif // CONFIG_VP9_HIGHBITDEPTH
}
} // each mi in the above row
}
// handle left column
dst = weighted_src_buf;
mask2d = mask_buf;
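// Promote the above-row pass from 6-bit to 12-bit weight precision (<< 6);
// the left-column pass below divides by 64 (>> 6) again as it blends.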
for (row = 0; row < bh; ++row) {
for (col = 0; col < bw; ++col) {
dst[col] = dst[col] << 6;
mask2d[col] = mask2d[col] << 6;
}
dst += weighted_src_stride;
mask2d += mask_stride;
}
if (mi_col > 0 && (mi_col - 1 >= tile->mi_col_start)) {
for (i = 0; i < VPXMIN(xd->n8_h, cm->mi_rows - mi_row); i += mi_step) {
int mi_row_offset = i;
int mi_col_offset = -1;
int overlap = num_4x4_blocks_wide_lookup[bsize] << 1;
MODE_INFO *left_mi = xd->mi[mi_col_offset +
mi_row_offset * xd->mi_stride];
MB_MODE_INFO *left_mbmi = &left_mi->mbmi;
mi_step = VPXMIN(xd->n8_h,
num_8x8_blocks_high_lookup[left_mbmi->sb_type]);
if (is_neighbor_overlappable(left_mbmi)) {
const struct macroblockd_plane *pd = &xd->plane[0];
int bw = overlap >> pd->subsampling_x;
int bh = (mi_step * MI_SIZE) >> pd->subsampling_y;
int dst_stride = weighted_src_stride;
int *dst = weighted_src_buf +
(i * MI_SIZE * dst_stride >> pd->subsampling_y);
int tmp_stride = left_stride;
uint8_t *tmp = left_buf +
(i * MI_SIZE * tmp_stride >> pd->subsampling_y);
int mask2d_stride = mask_stride;
int *mask2d = mask_buf +
(i * MI_SIZE * mask2d_stride >> pd->subsampling_y);
const uint8_t *mask1d[2];
setup_obmc_mask(bw, mask1d);
#if CONFIG_VP9_HIGHBITDEPTH
if (is_hbd) {
uint16_t *tmp16 = CONVERT_TO_SHORTPTR(tmp);
for (row = 0; row < bh; ++row) {
for (col = 0; col < bw; ++col) {
dst[col] = (dst[col] >> 6) * mask1d[0][col] +
(tmp16[col] << 6) * mask1d[1][col];
mask2d[col] = (mask2d[col] >> 6) * mask1d[0][col];
}
dst += dst_stride;
tmp16 += tmp_stride;
mask2d += mask2d_stride;
}
} else {
#endif // CONFIG_VP9_HIGHBITDEPTH
for (row = 0; row < bh; ++row) {
for (col = 0; col < bw; ++col) {
dst[col] = (dst[col] >> 6) * mask1d[0][col] +
(tmp[col] << 6) * mask1d[1][col];
mask2d[col] = (mask2d[col] >> 6) * mask1d[0][col];
}
dst += dst_stride;
tmp += tmp_stride;
mask2d += mask2d_stride;
}
#if CONFIG_VP9_HIGHBITDEPTH
}
#endif // CONFIG_VP9_HIGHBITDEPTH
}
} // each mi in the left column
}
dst = weighted_src_buf;
src = x->plane[0].src.buf;
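// Finally form wsrc = (src << 12) - weighted neighbor prediction, so that
// (wsrc - mask2d * pred) is 4096 times the OBMC prediction residual.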
#if CONFIG_VP9_HIGHBITDEPTH
if (is_hbd) {
uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
for (row = 0; row < bh; ++row) {
for (col = 0; col < bw; ++col)
dst[col] = (src16[col] << 12) - dst[col];
dst += weighted_src_stride;
src16 += x->plane[0].src.stride;
}
} else {
#endif // CONFIG_VP9_HIGHBITDEPTH
for (row = 0; row < bh; ++row) {
for (col = 0; col < bw; ++col)
dst[col] = (src[col] << 12) - dst[col];
dst += weighted_src_stride;
src += x->plane[0].src.stride;
}
#if CONFIG_VP9_HIGHBITDEPTH
}
#endif // CONFIG_VP9_HIGHBITDEPTH
}
#endif // CONFIG_OBMC

View file

@@ -108,6 +108,17 @@ static INLINE const YV12_BUFFER_CONFIG *get_upsampled_ref(VP10_COMP *cpi,
return &cpi->upsampled_ref_bufs[cpi->upsampled_ref_idx[ref_idx]].buf;
}
#if CONFIG_OBMC
void calc_target_weighted_pred(VP10_COMMON *cm,
MACROBLOCK *x,
MACROBLOCKD *xd,
int mi_row, int mi_col,
uint8_t *above_buf, int above_stride,
uint8_t *left_buf, int left_stride,
int *mask_buf, int mask_stride,
int *weighted_src_buf, int weighted_src_stride);
#endif // CONFIG_OBMC
#ifdef __cplusplus
} // extern "C"
#endif

View file

@@ -450,3 +450,109 @@ HIGHBD_MASKSADMXN(4, 8)
HIGHBD_MASKSADMXN(4, 4)
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // CONFIG_VP10 && CONFIG_EXT_INTER
#if CONFIG_VP10 && CONFIG_OBMC
// a: pred
// b: target weighted prediction (has been *4096 to keep precision)
// m: 2d weights (scaled by 4096)
static INLINE unsigned int obmc_sad(const uint8_t *a, int a_stride,
const int *b, int b_stride,
const int *m, int m_stride,
int width, int height) {
int y, x;
unsigned int sad = 0;
for (y = 0; y < height; y++) {
for (x = 0; x < width; x++) {
int abs_diff = abs(b[x] - a[x] * m[x]);
sad += (abs_diff + 2048) >> 12;
}
a += a_stride;
b += b_stride;
m += m_stride;
}
return sad;
}
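/* A quick sanity check of the rounding, with illustrative values: for a
 * pixel with no overlap, m = 4096 and b = src << 12.  With src = 100 and a
 * candidate prediction a = 98:
 *   abs_diff = |409600 - 98 * 4096| = 8192
 *   (8192 + 2048) >> 12 == 2 == |src - a|  */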
#define OBMCSADMxN(m, n) \
unsigned int vpx_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride, \
const int *wsrc, int wsrc_stride, \
const int *msk, int msk_stride) { \
return obmc_sad(ref, ref_stride, wsrc, wsrc_stride, msk, msk_stride, m, n); \
}
#if CONFIG_EXT_PARTITION
OBMCSADMxN(128, 128)
OBMCSADMxN(128, 64)
OBMCSADMxN(64, 128)
#endif // CONFIG_EXT_PARTITION
OBMCSADMxN(64, 64)
OBMCSADMxN(64, 32)
OBMCSADMxN(32, 64)
OBMCSADMxN(32, 32)
OBMCSADMxN(32, 16)
OBMCSADMxN(16, 32)
OBMCSADMxN(16, 16)
OBMCSADMxN(16, 8)
OBMCSADMxN(8, 16)
OBMCSADMxN(8, 8)
OBMCSADMxN(8, 4)
OBMCSADMxN(4, 8)
OBMCSADMxN(4, 4)
#if CONFIG_VP9_HIGHBITDEPTH
static INLINE unsigned int highbd_obmc_sad(const uint8_t *a8, int a_stride,
const int *b, int b_stride,
const int *m, int m_stride,
int width, int height) {
int y, x;
unsigned int sad = 0;
const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
for (y = 0; y < height; y++) {
for (x = 0; x < width; x++) {
int abs_diff = abs(b[x] - a[x] * m[x]);
sad += (abs_diff + 2048) >> 12;
}
a += a_stride;
b += b_stride;
m += m_stride;
}
return sad;
}
#define HIGHBD_OBMCSADMXN(m, n) \
unsigned int vpx_highbd_obmc_sad##m##x##n##_c(const uint8_t *ref, \
int ref_stride, \
const int *wsrc, \
int wsrc_stride, \
const int *msk, \
int msk_stride) { \
return highbd_obmc_sad(ref, ref_stride, wsrc, wsrc_stride, \
msk, msk_stride, m, n); \
}
#if CONFIG_EXT_PARTITION
HIGHBD_OBMCSADMXN(128, 128)
HIGHBD_OBMCSADMXN(128, 64)
HIGHBD_OBMCSADMXN(64, 128)
#endif // CONFIG_EXT_PARTITION
HIGHBD_OBMCSADMXN(64, 64)
HIGHBD_OBMCSADMXN(64, 32)
HIGHBD_OBMCSADMXN(32, 64)
HIGHBD_OBMCSADMXN(32, 32)
HIGHBD_OBMCSADMXN(32, 16)
HIGHBD_OBMCSADMXN(16, 32)
HIGHBD_OBMCSADMXN(16, 16)
HIGHBD_OBMCSADMXN(16, 8)
HIGHBD_OBMCSADMXN(8, 16)
HIGHBD_OBMCSADMXN(8, 8)
HIGHBD_OBMCSADMXN(8, 4)
HIGHBD_OBMCSADMXN(4, 8)
HIGHBD_OBMCSADMXN(4, 4)
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // CONFIG_VP10 && CONFIG_OBMC

View file

@@ -7,6 +7,7 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdlib.h>
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
@@ -1022,3 +1023,322 @@ HIGHBD_MASK_SUBPIX_VAR(128, 128)
#endif // CONFIG_EXT_PARTITION
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // CONFIG_VP10 && CONFIG_EXT_INTER
#if CONFIG_VP10 && CONFIG_OBMC
void obmc_variance(const uint8_t *a, int a_stride,
const int *b, int b_stride,
const int *m, int m_stride,
int w, int h, unsigned int *sse, int *sum) {
int i, j;
*sse = 0;
*sum = 0;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
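// Round the 12-bit-scaled residual back to pixel scale, keeping its sign,
// so sum and sse stay on the same scale as the regular variance functions.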
int scaled_diff = b[j] - a[j] * m[j];
int abs_diff = (abs(scaled_diff) + 2048) >> 12;
int diff = (scaled_diff >= 0) ? abs_diff : -abs_diff;
*sum += diff;
*sse += diff * diff;
}
a += a_stride;
b += b_stride;
m += m_stride;
}
}
#define OBMC_VAR(W, H) \
unsigned int vpx_obmc_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
const int *b, int b_stride, \
const int *m, int m_stride, \
unsigned int *sse) { \
int sum; \
obmc_variance(a, a_stride, b, b_stride, m, m_stride, W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
}
#define OBMC_SUBPIX_VAR(W, H) \
unsigned int vpx_obmc_sub_pixel_variance##W##x##H##_c( \
const uint8_t *pre, int pre_stride, \
int xoffset, int yoffset, \
const int *wsrc, int wsrc_stride, \
const int *msk, int msk_stride, \
unsigned int *sse) { \
uint16_t fdata3[(H + 1) * W]; \
uint8_t temp2[H * W]; \
\
var_filter_block2d_bil_first_pass(pre, fdata3, pre_stride, 1, H + 1, W, \
bilinear_filters_2t[xoffset]); \
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters_2t[yoffset]); \
\
return vpx_obmc_variance##W##x##H##_c(temp2, W, wsrc, wsrc_stride, \
msk, msk_stride, sse); \
}
OBMC_VAR(4, 4)
OBMC_SUBPIX_VAR(4, 4)
OBMC_VAR(4, 8)
OBMC_SUBPIX_VAR(4, 8)
OBMC_VAR(8, 4)
OBMC_SUBPIX_VAR(8, 4)
OBMC_VAR(8, 8)
OBMC_SUBPIX_VAR(8, 8)
OBMC_VAR(8, 16)
OBMC_SUBPIX_VAR(8, 16)
OBMC_VAR(16, 8)
OBMC_SUBPIX_VAR(16, 8)
OBMC_VAR(16, 16)
OBMC_SUBPIX_VAR(16, 16)
OBMC_VAR(16, 32)
OBMC_SUBPIX_VAR(16, 32)
OBMC_VAR(32, 16)
OBMC_SUBPIX_VAR(32, 16)
OBMC_VAR(32, 32)
OBMC_SUBPIX_VAR(32, 32)
OBMC_VAR(32, 64)
OBMC_SUBPIX_VAR(32, 64)
OBMC_VAR(64, 32)
OBMC_SUBPIX_VAR(64, 32)
OBMC_VAR(64, 64)
OBMC_SUBPIX_VAR(64, 64)
#if CONFIG_EXT_PARTITION
OBMC_VAR(64, 128)
OBMC_SUBPIX_VAR(64, 128)
OBMC_VAR(128, 64)
OBMC_SUBPIX_VAR(128, 64)
OBMC_VAR(128, 128)
OBMC_SUBPIX_VAR(128, 128)
#endif // CONFIG_EXT_PARTITION
#if CONFIG_VP9_HIGHBITDEPTH
void highbd_obmc_variance64(const uint8_t *a8, int a_stride,
const int *b, int b_stride,
const int *m, int m_stride,
int w, int h, uint64_t *sse, int64_t *sum) {
int i, j;
uint16_t *a = CONVERT_TO_SHORTPTR(a8);
*sse = 0;
*sum = 0;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
int scaled_diff = b[j] - a[j] * m[j];
int abs_diff = (abs(scaled_diff) + 2048) >> 12;
int diff = (scaled_diff >= 0) ? abs_diff : -abs_diff;
*sum += diff;
*sse += diff * diff;
}
a += a_stride;
b += b_stride;
m += m_stride;
}
}
void highbd_obmc_variance(const uint8_t *a8, int a_stride,
const int *b, int b_stride,
const int *m, int m_stride,
int w, int h, unsigned int *sse, int *sum) {
int64_t sum64;
uint64_t sse64;
highbd_obmc_variance64(a8, a_stride, b, b_stride, m, m_stride,
w, h, &sse64, &sum64);
*sum = (int)sum64;
*sse = (unsigned int)sse64;
}
void highbd_10_obmc_variance(const uint8_t *a8, int a_stride,
const int *b, int b_stride,
const int *m, int m_stride,
int w, int h, unsigned int *sse, int *sum) {
int64_t sum64;
uint64_t sse64;
highbd_obmc_variance64(a8, a_stride, b, b_stride, m, m_stride,
w, h, &sse64, &sum64);
*sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
*sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
}
void highbd_12_obmc_variance(const uint8_t *a8, int a_stride,
const int *b, int b_stride,
const int *m, int m_stride,
int w, int h, unsigned int *sse, int *sum) {
int64_t sum64;
uint64_t sse64;
highbd_obmc_variance64(a8, a_stride, b, b_stride, m, m_stride,
w, h, &sse64, &sum64);
*sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
*sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
}
#define HIGHBD_OBMC_VAR(W, H) \
unsigned int vpx_highbd_obmc_variance##W##x##H##_c(const uint8_t *a, \
int a_stride, \
const int *b, \
int b_stride, \
const int *m, \
int m_stride, \
unsigned int *sse) { \
int sum; \
highbd_obmc_variance(a, a_stride, b, b_stride, m, m_stride, \
W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
} \
\
unsigned int vpx_highbd_10_obmc_variance##W##x##H##_c(const uint8_t *a, \
int a_stride, \
const int *b, \
int b_stride, \
const int *m, \
int m_stride, \
unsigned int *sse) { \
int sum; \
highbd_10_obmc_variance(a, a_stride, b, b_stride, m, m_stride, \
W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
} \
\
unsigned int vpx_highbd_12_obmc_variance##W##x##H##_c(const uint8_t *a, \
int a_stride, \
const int *b, \
int b_stride, \
const int *m, \
int m_stride, \
unsigned int *sse) { \
int sum; \
highbd_12_obmc_variance(a, a_stride, b, b_stride, m, m_stride, \
W, H, sse, &sum); \
return *sse - (((int64_t)sum * sum) / (W * H)); \
}
#define HIGHBD_OBMC_SUBPIX_VAR(W, H) \
unsigned int vpx_highbd_obmc_sub_pixel_variance##W##x##H##_c( \
const uint8_t *pre, int pre_stride, \
int xoffset, int yoffset, \
const int *wsrc, int wsrc_stride, \
const int *msk, int msk_stride, \
unsigned int *sse) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
\
vpx_highbd_var_filter_block2d_bil_first_pass(pre, fdata3, pre_stride, 1, \
H + 1, W, \
bilinear_filters_2t[xoffset]); \
vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters_2t[yoffset]); \
\
return vpx_highbd_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
W, wsrc, wsrc_stride, \
msk, msk_stride, sse); \
} \
\
unsigned int vpx_highbd_10_obmc_sub_pixel_variance##W##x##H##_c( \
const uint8_t *pre, int pre_stride, \
int xoffset, int yoffset, \
const int *wsrc, int wsrc_stride, \
const int *msk, int msk_stride, \
unsigned int *sse) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
\
vpx_highbd_var_filter_block2d_bil_first_pass(pre, fdata3, pre_stride, 1, \
H + 1, W, \
bilinear_filters_2t[xoffset]); \
vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters_2t[yoffset]); \
\
return vpx_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
W, wsrc, wsrc_stride, \
msk, msk_stride, sse); \
} \
\
unsigned int vpx_highbd_12_obmc_sub_pixel_variance##W##x##H##_c( \
const uint8_t *pre, int pre_stride, \
int xoffset, int yoffset, \
const int *wsrc, int wsrc_stride, \
const int *msk, int msk_stride, \
unsigned int *sse) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
\
vpx_highbd_var_filter_block2d_bil_first_pass(pre, fdata3, pre_stride, 1, \
H + 1, W, \
bilinear_filters_2t[xoffset]); \
vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters_2t[yoffset]); \
\
return vpx_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
W, wsrc, wsrc_stride, \
msk, msk_stride, sse); \
}
HIGHBD_OBMC_VAR(4, 4)
HIGHBD_OBMC_SUBPIX_VAR(4, 4)
HIGHBD_OBMC_VAR(4, 8)
HIGHBD_OBMC_SUBPIX_VAR(4, 8)
HIGHBD_OBMC_VAR(8, 4)
HIGHBD_OBMC_SUBPIX_VAR(8, 4)
HIGHBD_OBMC_VAR(8, 8)
HIGHBD_OBMC_SUBPIX_VAR(8, 8)
HIGHBD_OBMC_VAR(8, 16)
HIGHBD_OBMC_SUBPIX_VAR(8, 16)
HIGHBD_OBMC_VAR(16, 8)
HIGHBD_OBMC_SUBPIX_VAR(16, 8)
HIGHBD_OBMC_VAR(16, 16)
HIGHBD_OBMC_SUBPIX_VAR(16, 16)
HIGHBD_OBMC_VAR(16, 32)
HIGHBD_OBMC_SUBPIX_VAR(16, 32)
HIGHBD_OBMC_VAR(32, 16)
HIGHBD_OBMC_SUBPIX_VAR(32, 16)
HIGHBD_OBMC_VAR(32, 32)
HIGHBD_OBMC_SUBPIX_VAR(32, 32)
HIGHBD_OBMC_VAR(32, 64)
HIGHBD_OBMC_SUBPIX_VAR(32, 64)
HIGHBD_OBMC_VAR(64, 32)
HIGHBD_OBMC_SUBPIX_VAR(64, 32)
HIGHBD_OBMC_VAR(64, 64)
HIGHBD_OBMC_SUBPIX_VAR(64, 64)
#if CONFIG_EXT_PARTITION
HIGHBD_OBMC_VAR(64, 128)
HIGHBD_OBMC_SUBPIX_VAR(64, 128)
HIGHBD_OBMC_VAR(128, 64)
HIGHBD_OBMC_SUBPIX_VAR(128, 64)
HIGHBD_OBMC_VAR(128, 128)
HIGHBD_OBMC_SUBPIX_VAR(128, 128)
#endif // CONFIG_EXT_PARTITION
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // CONFIG_VP10 && CONFIG_OBMC

View file

@@ -98,6 +98,30 @@ typedef unsigned int (*vpx_masked_subpixvariance_fn_t)(const uint8_t *src,
unsigned int *sse);
#endif // CONFIG_VP10 && CONFIG_EXT_INTER
#if CONFIG_VP10 && CONFIG_OBMC
typedef unsigned int(*vpx_obmc_sad_fn_t)(const uint8_t *pred,
int pred_stride,
const int *wsrc,
int wsrc_stride,
const int *msk,
int msk_stride);
typedef unsigned int (*vpx_obmc_variance_fn_t)(const uint8_t *pred,
int pred_stride,
const int *wsrc,
int wsrc_stride,
const int *msk,
int msk_stride,
unsigned int *sse);
typedef unsigned int (*vpx_obmc_subpixvariance_fn_t)(const uint8_t *pred,
int pred_stride,
int xoffset, int yoffset,
const int *wsrc,
int wsrc_stride,
const int *msk,
int msk_stride,
unsigned int *sse);
#endif // CONFIG_VP10 && CONFIG_OBMC
#if CONFIG_VP9
typedef struct vp9_variance_vtable {
vpx_sad_fn_t sdf;
@@ -126,6 +150,11 @@ typedef struct vp10_variance_vtable {
vpx_masked_variance_fn_t mvf;
vpx_masked_subpixvariance_fn_t msvf;
#endif // CONFIG_EXT_INTER
#if CONFIG_OBMC
vpx_obmc_sad_fn_t osdf;
vpx_obmc_variance_fn_t ovf;
vpx_obmc_subpixvariance_fn_t osvf;
#endif // CONFIG_OBMC
} vp10_variance_fn_ptr_t;
#endif // CONFIG_VP10

View file

@@ -1094,6 +1094,25 @@ if (vpx_config("CONFIG_EXT_INTER") eq "yes") {
}
}
#
# OBMC SAD
#
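# Note: specialize() without an architecture list registers no SIMD
# variants, so these RTCD entries resolve to the C implementations for now.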
if (vpx_config("CONFIG_OBMC") eq "yes") {
foreach (@block_sizes) {
($w, $h) = @$_;
add_proto qw/unsigned int/, "vpx_obmc_sad${w}x${h}", "const uint8_t *ref_ptr, int ref_stride, const int *wsrc_ptr, int wsrc_stride, const int *mask, int mask_stride";
specialize "vpx_obmc_sad${w}x${h}";
}
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
foreach (@block_sizes) {
($w, $h) = @$_;
add_proto qw/unsigned int/, "vpx_highbd_obmc_sad${w}x${h}", "const uint8_t *ref_ptr, int ref_stride, const int *wsrc_ptr, int wsrc_stride, const int *mask, int mask_stride";
specialize "vpx_highbd_obmc_sad${w}x${h}";
}
}
}
#
# Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
#
@@ -1364,6 +1383,31 @@ if (vpx_config("CONFIG_EXT_INTER") eq "yes") {
}
}
#
# OBMC Variance / OBMC Subpixel Variance
#
if (vpx_config("CONFIG_OBMC") eq "yes") {
foreach (@block_sizes) {
($w, $h) = @$_;
add_proto qw/unsigned int/, "vpx_obmc_variance${w}x${h}", "const uint8_t *pre_ptr, int pre_stride, const int *wsrc_ptr, int wsrc_stride, const int *mask, int mask_stride, unsigned int *sse";
add_proto qw/unsigned int/, "vpx_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre_ptr, int pre_stride, int xoffset, int yoffset, const int *wsrc_ptr, int wsrc_stride, const int *mask, int mask_stride, unsigned int *sse";
specialize "vpx_obmc_variance${w}x${h}";
specialize "vpx_obmc_sub_pixel_variance${w}x${h}";
}
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
foreach $bd ("_", "_10_", "_12_") {
foreach (@block_sizes) {
($w, $h) = @$_;
add_proto qw/unsigned int/, "vpx_highbd${bd}obmc_variance${w}x${h}", "const uint8_t *pre_ptr, int pre_stride, const int *wsrc_ptr, int wsrc_stride, const int *mask, int mask_stride, unsigned int *sse";
add_proto qw/unsigned int/, "vpx_highbd${bd}obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre_ptr, int pre_stride, int xoffset, int yoffset, const int *wsrc_ptr, int wsrc_stride, const int *mask, int mask_stride, unsigned int *sse";
specialize "vpx_highbd${bd}obmc_variance${w}x${h}";
specialize "vpx_highbd${bd}obmc_sub_pixel_variance${w}x${h}";
}
}
}
}
#
# Specialty Subpixel
#