Adds a new subpel motion search function

Adds a new subpel motion estimation function that uses a 2-level
tree-structured decision process to eliminate redundant computations.
It searches fewer points than the iterative search (which can evaluate
the same point multiple times) but gives roughly the same quality.

This is made the default at speeds 0 and 1, while at speeds 2 and
above only a 1-level search is used.
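
As a rough illustration of that decision structure, here is a minimal
sketch of one precision step (half, quarter or eighth pel) of the tree
search. It is not the committed code: the cost() callback, the check()
helper, tree_step() and the simplified second level are assumptions
standing in for the CHECK_BETTER / FIRST_LEVEL_CHECKS /
SECOND_LEVEL_CHECKS machinery shown in the diff below.

/* Hypothetical cost callback: variance plus motion-vector cost of a
 * candidate position, analogous to what CHECK_BETTER evaluates. */
typedef unsigned int (*cost_fn)(int row, int col);

/* Evaluate one candidate and keep it if it beats the current best. */
static unsigned int check(cost_fn cost, int r, int c,
                          int *best_r, int *best_c, unsigned int *best_err) {
  const unsigned int e = cost(r, c);
  if (e < *best_err) {
    *best_err = e;
    *best_r = r;
    *best_c = c;
  }
  return e;
}

/* One precision step of the tree search.
 * Level 1: the four cardinal neighbours of the current best, then the
 * single diagonal lying between the better horizontal and the better
 * vertical candidate (5 points total).
 * Level 2 (only when iters_per_step > 1, i.e. speeds 0 and 1): a few
 * extra points biased toward wherever level 1 moved the best position,
 * instead of re-running the whole pattern as the iterative search does. */
static void tree_step(cost_fn cost, int *best_r, int *best_c,
                      unsigned int *best_err, int hstep, int levels) {
  const int tr = *best_r, tc = *best_c;
  unsigned int left, right, up, down;
  int whichdir;

  left  = check(cost, tr, tc - hstep, best_r, best_c, best_err);
  right = check(cost, tr, tc + hstep, best_r, best_c, best_err);
  up    = check(cost, tr - hstep, tc, best_r, best_c, best_err);
  down  = check(cost, tr + hstep, tc, best_r, best_c, best_err);
  whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
  check(cost, tr + ((whichdir & 2) ? hstep : -hstep),
        tc + ((whichdir & 1) ? hstep : -hstep),
        best_r, best_c, best_err);

  if (levels > 1 && (*best_r != tr || *best_c != tc)) {
    const int kr = *best_r - tr, kc = *best_c - tc;
    /* Simplified second level; the real SECOND_LEVEL_CHECKS macro
     * distinguishes the pure-horizontal, pure-vertical and diagonal
     * cases and reuses whichdir from level 1. */
    check(cost, tr + kr, tc + 2 * kc, best_r, best_c, best_err);
    check(cost, tr + 2 * kr, tc + kc, best_r, best_c, best_err);
  }
}

In the diff below, FIRST_LEVEL_CHECKS evaluates 5 candidate points and
SECOND_LEVEL_CHECKS at most 3 more, so each precision level costs at
most 8 evaluations; the iterative loop repeats the 5-point pattern up
to iters_per_step times and may revisit points across iterations.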

Also includes various cleanups for consistency and redundancy removal.

Results:
derf: +0.012% PSNR
stdhd: +0.09% PSNR
Speedup of about 2-3%

Change-Id: Iedde4866f5475586dea0f0ba4cb7428fba24eee9
Deb Mukherjee 2013-08-07 17:01:43 -07:00
Parent 296931c817
Commit 1ba91a84ad
5 changed files: 323 additions and 178 deletions

View file

@@ -245,6 +245,71 @@ void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) {
}, \
v = INT_MAX;)
#define FIRST_LEVEL_CHECKS \
{ \
unsigned int left, right, up, down, diag; \
CHECK_BETTER(left, tr, tc - hstep); \
CHECK_BETTER(right, tr, tc + hstep); \
CHECK_BETTER(up, tr - hstep, tc); \
CHECK_BETTER(down, tr + hstep, tc); \
whichdir = (left < right ? 0 : 1) + \
(up < down ? 0 : 2); \
switch (whichdir) { \
case 0: \
CHECK_BETTER(diag, tr - hstep, tc - hstep); \
break; \
case 1: \
CHECK_BETTER(diag, tr - hstep, tc + hstep); \
break; \
case 2: \
CHECK_BETTER(diag, tr + hstep, tc - hstep); \
break; \
case 3: \
CHECK_BETTER(diag, tr + hstep, tc + hstep); \
break; \
} \
}
#define SECOND_LEVEL_CHECKS \
{ \
int kr, kc; \
unsigned int second; \
if (tr != br && tc != bc) { \
kr = br - tr; \
kc = bc - tc; \
CHECK_BETTER(second, tr + kr, tc + 2 * kc); \
CHECK_BETTER(second, tr + 2 * kr, tc + kc); \
} else if (tr == br && tc != bc) { \
kc = bc - tc; \
CHECK_BETTER(second, tr + hstep, tc + 2 * kc); \
CHECK_BETTER(second, tr - hstep, tc + 2 * kc); \
switch (whichdir) { \
case 0: \
case 1: \
CHECK_BETTER(second, tr + hstep, tc + kc); \
break; \
case 2: \
case 3: \
CHECK_BETTER(second, tr - hstep, tc + kc); \
break; \
} \
} else if (tr != br && tc == bc) { \
kr = br - tr; \
CHECK_BETTER(second, tr + 2 * kr, tc + hstep); \
CHECK_BETTER(second, tr + 2 * kr, tc - hstep); \
switch (whichdir) { \
case 0: \
case 2: \
CHECK_BETTER(second, tr + kr, tc + hstep); \
break; \
case 1: \
case 3: \
CHECK_BETTER(second, tr + kr, tc - hstep); \
break; \
} \
} \
}
int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x,
int_mv *bestmv, int_mv *ref_mv,
int error_per_bit,
@@ -261,7 +326,6 @@ int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x,
int rr, rc, br, bc, hstep;
int tr, tc;
unsigned int besterr = INT_MAX;
unsigned int left, right, up, down, diag;
unsigned int sse;
unsigned int whichdir;
unsigned int halfiters = iters_per_step;
@@ -306,32 +370,10 @@ int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x,
// common with the last iteration could be 2 ( if diag selected)
while (halfiters--) {
// 1/2 pel
CHECK_BETTER(left, tr, tc - hstep);
CHECK_BETTER(right, tr, tc + hstep);
CHECK_BETTER(up, tr - hstep, tc);
CHECK_BETTER(down, tr + hstep, tc);
whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
switch (whichdir) {
case 0:
CHECK_BETTER(diag, tr - hstep, tc - hstep);
break;
case 1:
CHECK_BETTER(diag, tr - hstep, tc + hstep);
break;
case 2:
CHECK_BETTER(diag, tr + hstep, tc - hstep);
break;
case 3:
CHECK_BETTER(diag, tr + hstep, tc + hstep);
break;
}
FIRST_LEVEL_CHECKS;
// no reason to check the same one again.
if (tr == br && tc == bc)
break;
tr = br;
tc = bc;
}
@@ -343,32 +385,10 @@ int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x,
if (forced_stop != 2) {
hstep >>= 1;
while (quarteriters--) {
CHECK_BETTER(left, tr, tc - hstep);
CHECK_BETTER(right, tr, tc + hstep);
CHECK_BETTER(up, tr - hstep, tc);
CHECK_BETTER(down, tr + hstep, tc);
whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
switch (whichdir) {
case 0:
CHECK_BETTER(diag, tr - hstep, tc - hstep);
break;
case 1:
CHECK_BETTER(diag, tr - hstep, tc + hstep);
break;
case 2:
CHECK_BETTER(diag, tr + hstep, tc - hstep);
break;
case 3:
CHECK_BETTER(diag, tr + hstep, tc + hstep);
break;
}
FIRST_LEVEL_CHECKS;
// no reason to check the same one again.
if (tr == br && tc == bc)
break;
tr = br;
tc = bc;
}
@@ -378,32 +398,10 @@ int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x,
forced_stop == 0) {
hstep >>= 1;
while (eighthiters--) {
CHECK_BETTER(left, tr, tc - hstep);
CHECK_BETTER(right, tr, tc + hstep);
CHECK_BETTER(up, tr - hstep, tc);
CHECK_BETTER(down, tr + hstep, tc);
whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
switch (whichdir) {
case 0:
CHECK_BETTER(diag, tr - hstep, tc - hstep);
break;
case 1:
CHECK_BETTER(diag, tr - hstep, tc + hstep);
break;
case 2:
CHECK_BETTER(diag, tr + hstep, tc - hstep);
break;
case 3:
CHECK_BETTER(diag, tr + hstep, tc + hstep);
break;
}
FIRST_LEVEL_CHECKS;
// no reason to check the same one again.
if (tr == br && tc == bc)
break;
tr = br;
tc = bc;
}
@@ -419,6 +417,105 @@ int vp9_find_best_sub_pixel_iterative(MACROBLOCK *x,
return besterr;
}
int vp9_find_best_sub_pixel_tree(MACROBLOCK *x,
int_mv *bestmv, int_mv *ref_mv,
int error_per_bit,
const vp9_variance_fn_ptr_t *vfp,
int forced_stop,
int iters_per_step,
int *mvjcost, int *mvcost[2],
int *distortion,
unsigned int *sse1) {
uint8_t *z = x->plane[0].src.buf;
int src_stride = x->plane[0].src.stride;
MACROBLOCKD *xd = &x->e_mbd;
int rr, rc, br, bc, hstep;
int tr, tc;
unsigned int besterr = INT_MAX;
unsigned int sse;
unsigned int whichdir;
int thismse;
int maxc, minc, maxr, minr;
int y_stride;
int offset;
unsigned int halfiters = iters_per_step;
unsigned int quarteriters = iters_per_step;
unsigned int eighthiters = iters_per_step;
uint8_t *y = xd->plane[0].pre[0].buf +
(bestmv->as_mv.row) * xd->plane[0].pre[0].stride +
bestmv->as_mv.col;
y_stride = xd->plane[0].pre[0].stride;
rr = ref_mv->as_mv.row;
rc = ref_mv->as_mv.col;
br = bestmv->as_mv.row << 3;
bc = bestmv->as_mv.col << 3;
hstep = 4;
minc = MAX(x->mv_col_min << 3,
(ref_mv->as_mv.col) - ((1 << MV_MAX_BITS) - 1));
maxc = MIN(x->mv_col_max << 3,
(ref_mv->as_mv.col) + ((1 << MV_MAX_BITS) - 1));
minr = MAX(x->mv_row_min << 3,
(ref_mv->as_mv.row) - ((1 << MV_MAX_BITS) - 1));
maxr = MIN(x->mv_row_max << 3,
(ref_mv->as_mv.row) + ((1 << MV_MAX_BITS) - 1));
tr = br;
tc = bc;
offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
// central mv
bestmv->as_mv.row <<= 3;
bestmv->as_mv.col <<= 3;
// calculate central point error
besterr = vfp->vf(y, y_stride, z, src_stride, sse1);
*distortion = besterr;
besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
// 1/2 pel
FIRST_LEVEL_CHECKS;
if (halfiters > 1) {
SECOND_LEVEL_CHECKS;
}
tr = br;
tc = bc;
// Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
if (forced_stop != 2) {
hstep >>= 1;
FIRST_LEVEL_CHECKS;
if (quarteriters > 1) {
SECOND_LEVEL_CHECKS;
}
tr = br;
tc = bc;
}
if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) &&
forced_stop == 0) {
hstep >>= 1;
FIRST_LEVEL_CHECKS;
if (eighthiters > 1) {
SECOND_LEVEL_CHECKS;
}
tr = br;
tc = bc;
}
bestmv->as_mv.row = br;
bestmv->as_mv.col = bc;
if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) ||
(abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3)))
return INT_MAX;
return besterr;
}
#undef DIST
/* returns subpixel variance error function */
#define DIST(r, c) \
@@ -443,7 +540,6 @@ int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x,
int rr, rc, br, bc, hstep;
int tr, tc;
unsigned int besterr = INT_MAX;
unsigned int left, right, up, down, diag;
unsigned int sse;
unsigned int whichdir;
unsigned int halfiters = iters_per_step;
@@ -478,6 +574,121 @@ int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x,
tr = br;
tc = bc;
offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
// central mv
bestmv->as_mv.row <<= 3;
bestmv->as_mv.col <<= 3;
// calculate central point error
// TODO(yunqingwang): central pointer error was already calculated in full-
// pixel search, and can be passed in this function.
comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);
*distortion = besterr;
besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
// Each subsequent iteration checks at least one point in
// common with the last iteration could be 2 ( if diag selected)
while (halfiters--) {
// 1/2 pel
FIRST_LEVEL_CHECKS;
// no reason to check the same one again.
if (tr == br && tc == bc)
break;
tr = br;
tc = bc;
}
// Each subsequent iteration checks at least one point in common with
// the last iteration could be 2 ( if diag selected) 1/4 pel
// Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
if (forced_stop != 2) {
hstep >>= 1;
while (quarteriters--) {
FIRST_LEVEL_CHECKS;
// no reason to check the same one again.
if (tr == br && tc == bc)
break;
tr = br;
tc = bc;
}
}
if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) &&
forced_stop == 0) {
hstep >>= 1;
while (eighthiters--) {
FIRST_LEVEL_CHECKS;
// no reason to check the same one again.
if (tr == br && tc == bc)
break;
tr = br;
tc = bc;
}
}
bestmv->as_mv.row = br;
bestmv->as_mv.col = bc;
if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) ||
(abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3)))
return INT_MAX;
return besterr;
}
int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x,
int_mv *bestmv, int_mv *ref_mv,
int error_per_bit,
const vp9_variance_fn_ptr_t *vfp,
int forced_stop,
int iters_per_step,
int *mvjcost, int *mvcost[2],
int *distortion,
unsigned int *sse1,
const uint8_t *second_pred,
int w, int h) {
uint8_t *z = x->plane[0].src.buf;
int src_stride = x->plane[0].src.stride;
MACROBLOCKD *xd = &x->e_mbd;
int rr, rc, br, bc, hstep;
int tr, tc;
unsigned int besterr = INT_MAX;
unsigned int sse;
unsigned int whichdir;
int thismse;
int maxc, minc, maxr, minr;
int y_stride;
int offset;
unsigned int halfiters = iters_per_step;
unsigned int quarteriters = iters_per_step;
unsigned int eighthiters = iters_per_step;
DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64);
uint8_t *y = xd->plane[0].pre[0].buf +
(bestmv->as_mv.row) * xd->plane[0].pre[0].stride +
bestmv->as_mv.col;
y_stride = xd->plane[0].pre[0].stride;
rr = ref_mv->as_mv.row;
rc = ref_mv->as_mv.col;
br = bestmv->as_mv.row << 3;
bc = bestmv->as_mv.col << 3;
hstep = 4;
minc = MAX(x->mv_col_min << 3, (ref_mv->as_mv.col) -
((1 << MV_MAX_BITS) - 1));
maxc = MIN(x->mv_col_max << 3, (ref_mv->as_mv.col) +
((1 << MV_MAX_BITS) - 1));
minr = MAX(x->mv_row_min << 3, (ref_mv->as_mv.row) -
((1 << MV_MAX_BITS) - 1));
maxr = MIN(x->mv_row_max << 3, (ref_mv->as_mv.row) +
((1 << MV_MAX_BITS) - 1));
tr = br;
tc = bc;
offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
@@ -495,37 +706,13 @@ int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x,
// Each subsequent iteration checks at least one point in
// common with the last iteration could be 2 ( if diag selected)
while (halfiters--) {
// 1/2 pel
CHECK_BETTER(left, tr, tc - hstep);
CHECK_BETTER(right, tr, tc + hstep);
CHECK_BETTER(up, tr - hstep, tc);
CHECK_BETTER(down, tr + hstep, tc);
whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
switch (whichdir) {
case 0:
CHECK_BETTER(diag, tr - hstep, tc - hstep);
break;
case 1:
CHECK_BETTER(diag, tr - hstep, tc + hstep);
break;
case 2:
CHECK_BETTER(diag, tr + hstep, tc - hstep);
break;
case 3:
CHECK_BETTER(diag, tr + hstep, tc + hstep);
break;
}
// no reason to check the same one again.
if (tr == br && tc == bc)
break;
tr = br;
tc = bc;
// 1/2 pel
FIRST_LEVEL_CHECKS;
if (halfiters > 1) {
SECOND_LEVEL_CHECKS;
}
tr = br;
tc = bc;
// Each subsequent iteration checks at least one point in common with
// the last iteration could be 2 ( if diag selected) 1/4 pel
@@ -533,71 +720,23 @@ int vp9_find_best_sub_pixel_comp_iterative(MACROBLOCK *x,
// Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
if (forced_stop != 2) {
hstep >>= 1;
while (quarteriters--) {
CHECK_BETTER(left, tr, tc - hstep);
CHECK_BETTER(right, tr, tc + hstep);
CHECK_BETTER(up, tr - hstep, tc);
CHECK_BETTER(down, tr + hstep, tc);
whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
switch (whichdir) {
case 0:
CHECK_BETTER(diag, tr - hstep, tc - hstep);
break;
case 1:
CHECK_BETTER(diag, tr - hstep, tc + hstep);
break;
case 2:
CHECK_BETTER(diag, tr + hstep, tc - hstep);
break;
case 3:
CHECK_BETTER(diag, tr + hstep, tc + hstep);
break;
}
// no reason to check the same one again.
if (tr == br && tc == bc)
break;
tr = br;
tc = bc;
FIRST_LEVEL_CHECKS;
if (quarteriters > 1) {
SECOND_LEVEL_CHECKS;
}
tr = br;
tc = bc;
}
if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv) &&
forced_stop == 0) {
hstep >>= 1;
while (eighthiters--) {
CHECK_BETTER(left, tr, tc - hstep);
CHECK_BETTER(right, tr, tc + hstep);
CHECK_BETTER(up, tr - hstep, tc);
CHECK_BETTER(down, tr + hstep, tc);
whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
switch (whichdir) {
case 0:
CHECK_BETTER(diag, tr - hstep, tc - hstep);
break;
case 1:
CHECK_BETTER(diag, tr - hstep, tc + hstep);
break;
case 2:
CHECK_BETTER(diag, tr + hstep, tc - hstep);
break;
case 3:
CHECK_BETTER(diag, tr + hstep, tc + hstep);
break;
}
// no reason to check the same one again.
if (tr == br && tc == bc)
break;
tr = br;
tc = bc;
FIRST_LEVEL_CHECKS;
if (eighthiters > 1) {
SECOND_LEVEL_CHECKS;
}
tr = br;
tc = bc;
}
bestmv->as_mv.row = br;
bestmv->as_mv.col = bc;

View file

@@ -80,6 +80,21 @@ typedef int (fractional_mv_step_fp) (
int *distortion,
unsigned int *sse);
extern fractional_mv_step_fp vp9_find_best_sub_pixel_iterative;
extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree;
typedef int (fractional_mv_step_comp_fp) (
MACROBLOCK *x,
int_mv *bestmv, int_mv *ref_mv,
int error_per_bit,
const vp9_variance_fn_ptr_t *vfp,
int forced_stop, // 0 - full, 1 - qtr only, 2 - half only
int iters_per_step,
int *mvjcost, int *mvcost[2],
int *distortion, unsigned int *sse1,
const uint8_t *second_pred,
int w, int h);
extern fractional_mv_step_comp_fp vp9_find_best_sub_pixel_comp_iterative;
extern fractional_mv_step_comp_fp vp9_find_best_sub_pixel_comp_tree;
typedef int (*vp9_full_search_fn_t)(MACROBLOCK *x,
int_mv *ref_mv, int sad_per_bit,
@@ -102,18 +117,6 @@ typedef int (*vp9_diamond_search_fn_t)(MACROBLOCK *x,
int *mvjcost, int *mvcost[2],
int_mv *center_mv);
int vp9_find_best_sub_pixel_comp_iterative(
MACROBLOCK *x,
int_mv *bestmv, int_mv *ref_mv,
int error_per_bit,
const vp9_variance_fn_ptr_t *vfp,
int forced_stop, // 0 - full, 1 - qtr only, 2 - half only
int iters_per_step,
int *mvjcost, int *mvcost[2],
int *distortion, unsigned int *sse1,
const uint8_t *second_pred,
int w, int h);
int vp9_refining_search_8p_c(MACROBLOCK *x,
int_mv *ref_mv, int error_per_bit,
int search_range, vp9_variance_fn_ptr_t *fn_ptr,

View file

@@ -713,8 +713,8 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->search_method = NSTEP;
sf->auto_filter = 1;
sf->recode_loop = 1;
sf->subpel_search_method = SUBPEL_ITERATIVE;
sf->subpel_iters_per_step = 3;
sf->subpel_search_method = SUBPEL_TREE;
sf->subpel_iters_per_step = 2;
sf->optimize_coefficients = !cpi->oxcf.lossless;
sf->reduce_first_step_size = 0;
sf->auto_mv_step_size = 0;
@@ -830,7 +830,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
(MIN(cpi->common.width, cpi->common.height) >= 720)? 1 : 0;
sf->auto_mv_step_size = 1;
sf->search_method = SQUARE;
sf->subpel_iters_per_step = 2;
sf->subpel_iters_per_step = 1;
}
if (speed == 3) {
sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
@@ -922,9 +922,10 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
if (cpi->sf.subpel_search_method == SUBPEL_ITERATIVE) {
cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_iterative;
} else {
// TODO(debargha): Other methods to come
assert(0);
cpi->find_fractional_mv_step_comp = vp9_find_best_sub_pixel_comp_iterative;
} else if (cpi->sf.subpel_search_method == SUBPEL_TREE) {
cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree;
cpi->find_fractional_mv_step_comp = vp9_find_best_sub_pixel_comp_tree;
}
cpi->mb.optimize = cpi->sf.optimize_coefficients == 1 && cpi->pass != 1;

View file

@@ -234,6 +234,7 @@ typedef enum {
typedef enum {
SUBPEL_ITERATIVE = 0,
SUBPEL_TREE = 1,
// Other methods to come
} SUBPEL_SEARCH_METHODS;
@@ -533,6 +534,7 @@ typedef struct VP9_COMP {
unsigned int active_map_enabled;
fractional_mv_step_fp *find_fractional_mv_step;
fractional_mv_step_comp_fp *find_fractional_mv_step_comp;
vp9_full_search_fn_t full_search_sad;
vp9_refining_search_fn_t refining_search_sad;
vp9_diamond_search_fn_t diamond_search_sad;

View file

@@ -2675,7 +2675,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
int dis; /* TODO: use dis in distortion calculation later. */
unsigned int sse;
bestsme = vp9_find_best_sub_pixel_comp_iterative(
bestsme = cpi->find_fractional_mv_step_comp(
x, &tmp_mv,
&ref_mv[id],
x->errorperbit,