Full search SAD function optimization in SSE4.1
Use mpsadbw to calculate 8 SADs at once. Function list: vp8_sad16x16x8_sse4 vp8_sad16x8x8_sse4 vp8_sad8x16x8_sse4 vp8_sad8x8x8_sse4 vp8_sad4x4x8_sse4 (test clip: tulip) For best quality mode, this gave the encoder a 5% performance boost. For good quality mode with speed=1, this gave the encoder a 3% performance boost. Change-Id: I083b5a39d39144f88dcbccbef95da6498e490134
This commit is contained in:
Родитель
a0ae3682aa
Коммит
71ecb5d7d9
|
@ -824,6 +824,7 @@ process_common_toolchain() {
|
||||||
soft_enable sse2
|
soft_enable sse2
|
||||||
soft_enable sse3
|
soft_enable sse3
|
||||||
soft_enable ssse3
|
soft_enable ssse3
|
||||||
|
soft_enable sse4_1
|
||||||
|
|
||||||
case ${tgt_os} in
|
case ${tgt_os} in
|
||||||
win*)
|
win*)
|
||||||
|
|
|
@ -199,6 +199,7 @@ ARCH_EXT_LIST="
|
||||||
sse2
|
sse2
|
||||||
sse3
|
sse3
|
||||||
ssse3
|
ssse3
|
||||||
|
sse4_1
|
||||||
|
|
||||||
altivec
|
altivec
|
||||||
"
|
"
|
||||||
|
|
|
@ -40,6 +40,12 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)
|
||||||
cpi->rtcd.variance.sad8x8x3 = vp8_sad8x8x3_c;
|
cpi->rtcd.variance.sad8x8x3 = vp8_sad8x8x3_c;
|
||||||
cpi->rtcd.variance.sad4x4x3 = vp8_sad4x4x3_c;
|
cpi->rtcd.variance.sad4x4x3 = vp8_sad4x4x3_c;
|
||||||
|
|
||||||
|
cpi->rtcd.variance.sad16x16x8 = vp8_sad16x16x8_c;
|
||||||
|
cpi->rtcd.variance.sad16x8x8 = vp8_sad16x8x8_c;
|
||||||
|
cpi->rtcd.variance.sad8x16x8 = vp8_sad8x16x8_c;
|
||||||
|
cpi->rtcd.variance.sad8x8x8 = vp8_sad8x8x8_c;
|
||||||
|
cpi->rtcd.variance.sad4x4x8 = vp8_sad4x4x8_c;
|
||||||
|
|
||||||
cpi->rtcd.variance.sad16x16x4d = vp8_sad16x16x4d_c;
|
cpi->rtcd.variance.sad16x16x4d = vp8_sad16x16x4d_c;
|
||||||
cpi->rtcd.variance.sad16x8x4d = vp8_sad16x8x4d_c;
|
cpi->rtcd.variance.sad16x8x4d = vp8_sad16x8x4d_c;
|
||||||
cpi->rtcd.variance.sad8x16x4d = vp8_sad8x16x4d_c;
|
cpi->rtcd.variance.sad8x16x4d = vp8_sad8x16x4d_c;
|
||||||
|
|
|
@ -1323,7 +1323,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
|
||||||
check_here = r * mv_stride + in_what + col_min;
|
check_here = r * mv_stride + in_what + col_min;
|
||||||
c = col_min;
|
c = col_min;
|
||||||
|
|
||||||
while ((c + 3) < col_max)
|
while ((c + 2) < col_max)
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
|
@ -1388,6 +1388,158 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
/* Exhaustive full-pixel motion search that batches its SAD evaluations.
 *
 * The search centre is (ref_mv->row >> 3, ref_mv->col >> 3) and the window
 * reaches `distance` positions either side of it, clamped to
 * x->mv_{row,col}_{min,max}.  Rows are scanned over [row_min, row_max) and
 * columns over [col_min, col_max).  Within a row, candidates are evaluated
 * eight at a time with fn_ptr->sdx8f, then three at a time with
 * fn_ptr->sdx3f, then singly with fn_ptr->sdf.
 *
 * A candidate replaces the current best only if its SAD plus the
 * mvsadcost-based vector cost is lower than the best so far.  The winning
 * position is written to d->bmi.mv.as_mv in search-grid units.
 *
 * Returns fn_ptr->vf() at the best position plus the mvcost-based vector
 * cost, or INT_MAX when no candidate was ever accepted.
 */
int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2])
{
    unsigned char *what = (*(b->base_src) + b->src);
    int what_stride = b->src_stride;
    unsigned char *in_what;
    int in_what_stride = d->pre_stride;
    int mv_stride = d->pre_stride;
    unsigned char *bestaddress;
    MV *best_mv = &d->bmi.mv.as_mv;
    MV this_mv;
    int bestsad = INT_MAX;
    int r, c;

    unsigned char *check_here;
    unsigned int thissad;

    int ref_row = ref_mv->row >> 3;
    int ref_col = ref_mv->col >> 3;

    int row_min = ref_row - distance;
    int row_max = ref_row + distance;
    int col_min = ref_col - distance;
    int col_max = ref_col + distance;

    // The SIMD sdx8f kernels may write all eight 16-bit results with one
    // aligned 16-byte store, which faults on a misaligned address.  A plain
    // stack array carries no such guarantee, so hand them a 16-byte aligned
    // window inside a buffer padded by 16 bytes (the window always fits:
    // at most 15 bytes are skipped and 16 are used out of 32).
    unsigned short sad_storage[8 + 8];
    unsigned short *sad_array8 = (unsigned short *)(((size_t)sad_storage + 15) & ~(size_t)15);
    unsigned int sad_array[3];

    // Work out the mid point for the search
    in_what = *(d->base_pre) + d->pre;
    bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;

    best_mv->row = ref_row;
    best_mv->col = ref_col;

    // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits
    if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) &&
        (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
    {
        // Baseline value at the centre
        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
    }

    // Apply further limits to prevent us from using vectors that stretch beyond the UMV border
    if (col_min < x->mv_col_min)
        col_min = x->mv_col_min;

    if (col_max > x->mv_col_max)
        col_max = x->mv_col_max;

    if (row_min < x->mv_row_min)
        row_min = x->mv_row_min;

    if (row_max > x->mv_row_max)
        row_max = x->mv_row_max;

    for (r = row_min; r < row_max ; r++)
    {
        this_mv.row = r << 3;
        check_here = r * mv_stride + in_what + col_min;
        c = col_min;

        // Eight candidates per call while at least eight columns remain
        while ((c + 7) < col_max)
        {
            int i;

            fn_ptr->sdx8f(what, what_stride, check_here , in_what_stride, sad_array8);

            for (i = 0; i < 8; i++)
            {
                thissad = (unsigned int)sad_array8[i];

                // Only pay for the vector cost when the raw SAD could win
                if (thissad < bestsad)
                {
                    this_mv.col = c << 3;
                    thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);

                    if (thissad < bestsad)
                    {
                        bestsad = thissad;
                        best_mv->row = r;
                        best_mv->col = c;
                        bestaddress = check_here;
                    }
                }

                check_here++;
                c++;
            }
        }

        // Three candidates per call while at least three columns remain
        while ((c + 2) < col_max)
        {
            int i;

            fn_ptr->sdx3f(what, what_stride, check_here , in_what_stride, sad_array);

            for (i = 0; i < 3; i++)
            {
                thissad = sad_array[i];

                if (thissad < bestsad)
                {
                    this_mv.col = c << 3;
                    thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);

                    if (thissad < bestsad)
                    {
                        bestsad = thissad;
                        best_mv->row = r;
                        best_mv->col = c;
                        bestaddress = check_here;
                    }
                }

                check_here++;
                c++;
            }
        }

        // Remaining columns of the row, one candidate at a time
        while (c < col_max)
        {
            thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);

            if (thissad < bestsad)
            {
                this_mv.col = c << 3;
                thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);

                if (thissad < bestsad)
                {
                    bestsad = thissad;
                    best_mv->row = r;
                    best_mv->col = c;
                    bestaddress = check_here;
                }
            }

            check_here ++;
            c ++;
        }
    }

    this_mv.row = best_mv->row << 3;
    this_mv.col = best_mv->col << 3;

    if (bestsad < INT_MAX)
        return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
               + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
    else
        return INT_MAX;
}
|
||||||
|
|
||||||
#ifdef ENTROPY_STATS
|
#ifdef ENTROPY_STATS
|
||||||
void print_mode_context(void)
|
void print_mode_context(void)
|
||||||
{
|
{
|
||||||
|
|
|
@ -93,6 +93,7 @@ extern fractional_mv_step_fp vp8_skip_fractional_mv_step;
|
||||||
typedef prototype_full_search_sad(*vp8_full_search_fn_t);
|
typedef prototype_full_search_sad(*vp8_full_search_fn_t);
|
||||||
extern prototype_full_search_sad(vp8_full_search_sad);
|
extern prototype_full_search_sad(vp8_full_search_sad);
|
||||||
extern prototype_full_search_sad(vp8_full_search_sadx3);
|
extern prototype_full_search_sad(vp8_full_search_sadx3);
|
||||||
|
extern prototype_full_search_sad(vp8_full_search_sadx8);
|
||||||
|
|
||||||
typedef prototype_diamond_search_sad(*vp8_diamond_search_fn_t);
|
typedef prototype_diamond_search_sad(*vp8_diamond_search_fn_t);
|
||||||
extern prototype_diamond_search_sad(vp8_diamond_search_sad);
|
extern prototype_diamond_search_sad(vp8_diamond_search_sad);
|
||||||
|
|
|
@ -2341,6 +2341,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
|
||||||
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar16x16_v);
|
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar16x16_v);
|
||||||
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar16x16_hv);
|
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar16x16_hv);
|
||||||
cpi->fn_ptr[BLOCK_16X16].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x3);
|
cpi->fn_ptr[BLOCK_16X16].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x3);
|
||||||
|
cpi->fn_ptr[BLOCK_16X16].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x8);
|
||||||
cpi->fn_ptr[BLOCK_16X16].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x4d);
|
cpi->fn_ptr[BLOCK_16X16].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x4d);
|
||||||
|
|
||||||
cpi->fn_ptr[BLOCK_16X8].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8);
|
cpi->fn_ptr[BLOCK_16X8].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8);
|
||||||
|
@ -2350,6 +2351,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
|
||||||
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v = NULL;
|
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v = NULL;
|
||||||
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_hv = NULL;
|
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_hv = NULL;
|
||||||
cpi->fn_ptr[BLOCK_16X8].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x3);
|
cpi->fn_ptr[BLOCK_16X8].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x3);
|
||||||
|
cpi->fn_ptr[BLOCK_16X8].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x8);
|
||||||
cpi->fn_ptr[BLOCK_16X8].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x4d);
|
cpi->fn_ptr[BLOCK_16X8].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x4d);
|
||||||
|
|
||||||
cpi->fn_ptr[BLOCK_8X16].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16);
|
cpi->fn_ptr[BLOCK_8X16].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16);
|
||||||
|
@ -2359,6 +2361,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
|
||||||
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v = NULL;
|
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v = NULL;
|
||||||
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_hv = NULL;
|
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_hv = NULL;
|
||||||
cpi->fn_ptr[BLOCK_8X16].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x3);
|
cpi->fn_ptr[BLOCK_8X16].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x3);
|
||||||
|
cpi->fn_ptr[BLOCK_8X16].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x8);
|
||||||
cpi->fn_ptr[BLOCK_8X16].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x4d);
|
cpi->fn_ptr[BLOCK_8X16].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x4d);
|
||||||
|
|
||||||
cpi->fn_ptr[BLOCK_8X8].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8);
|
cpi->fn_ptr[BLOCK_8X8].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8);
|
||||||
|
@ -2368,6 +2371,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
|
||||||
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v = NULL;
|
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v = NULL;
|
||||||
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_hv = NULL;
|
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_hv = NULL;
|
||||||
cpi->fn_ptr[BLOCK_8X8].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x3);
|
cpi->fn_ptr[BLOCK_8X8].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x3);
|
||||||
|
cpi->fn_ptr[BLOCK_8X8].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x8);
|
||||||
cpi->fn_ptr[BLOCK_8X8].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x4d);
|
cpi->fn_ptr[BLOCK_8X8].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x4d);
|
||||||
|
|
||||||
cpi->fn_ptr[BLOCK_4X4].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4);
|
cpi->fn_ptr[BLOCK_4X4].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4);
|
||||||
|
@ -2377,6 +2381,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
|
||||||
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v = NULL;
|
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v = NULL;
|
||||||
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_hv = NULL;
|
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_hv = NULL;
|
||||||
cpi->fn_ptr[BLOCK_4X4].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x3);
|
cpi->fn_ptr[BLOCK_4X4].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x3);
|
||||||
|
cpi->fn_ptr[BLOCK_4X4].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x8);
|
||||||
cpi->fn_ptr[BLOCK_4X4].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x4d);
|
cpi->fn_ptr[BLOCK_4X4].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x4d);
|
||||||
|
|
||||||
#if !(CONFIG_REALTIME_ONLY)
|
#if !(CONFIG_REALTIME_ONLY)
|
||||||
|
|
|
@ -126,6 +126,24 @@ void vp8_sad16x16x3_c(
|
||||||
sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
|
sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Fills sad_array[0..7] with the results of vp8_sad16x16_c() for the eight
 * reference positions ref_ptr + 0 .. ref_ptr + 7.  Each result is narrowed
 * to unsigned short before it is stored.
 */
void vp8_sad16x16x8_c(
    const unsigned char *src_ptr,
    int  src_stride,
    const unsigned char *ref_ptr,
    int  ref_stride,
    unsigned short *sad_array
)
{
    int offset;

    for (offset = 0; offset < 8; offset++)
    {
        sad_array[offset] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + offset, ref_stride, 0x7fffffff);
    }
}
|
||||||
|
|
||||||
void vp8_sad16x8x3_c(
|
void vp8_sad16x8x3_c(
|
||||||
const unsigned char *src_ptr,
|
const unsigned char *src_ptr,
|
||||||
int src_stride,
|
int src_stride,
|
||||||
|
@ -139,6 +157,24 @@ void vp8_sad16x8x3_c(
|
||||||
sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
|
sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Fills sad_array[0..7] with the results of vp8_sad16x8_c() for the eight
 * reference positions ref_ptr + 0 .. ref_ptr + 7.  Each result is narrowed
 * to unsigned short before it is stored.
 */
void vp8_sad16x8x8_c(
    const unsigned char *src_ptr,
    int  src_stride,
    const unsigned char *ref_ptr,
    int  ref_stride,
    unsigned short *sad_array
)
{
    int offset;

    for (offset = 0; offset < 8; offset++)
    {
        sad_array[offset] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + offset, ref_stride, 0x7fffffff);
    }
}
|
||||||
|
|
||||||
void vp8_sad8x8x3_c(
|
void vp8_sad8x8x3_c(
|
||||||
const unsigned char *src_ptr,
|
const unsigned char *src_ptr,
|
||||||
int src_stride,
|
int src_stride,
|
||||||
|
@ -152,6 +188,24 @@ void vp8_sad8x8x3_c(
|
||||||
sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
|
sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Fills sad_array[0..7] with the results of vp8_sad8x8_c() for the eight
 * reference positions ref_ptr + 0 .. ref_ptr + 7.  Each result is narrowed
 * to unsigned short before it is stored.
 */
void vp8_sad8x8x8_c(
    const unsigned char *src_ptr,
    int  src_stride,
    const unsigned char *ref_ptr,
    int  ref_stride,
    unsigned short *sad_array
)
{
    int offset;

    for (offset = 0; offset < 8; offset++)
    {
        sad_array[offset] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + offset, ref_stride, 0x7fffffff);
    }
}
|
||||||
|
|
||||||
void vp8_sad8x16x3_c(
|
void vp8_sad8x16x3_c(
|
||||||
const unsigned char *src_ptr,
|
const unsigned char *src_ptr,
|
||||||
int src_stride,
|
int src_stride,
|
||||||
|
@ -165,6 +219,24 @@ void vp8_sad8x16x3_c(
|
||||||
sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
|
sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Fills sad_array[0..7] with the results of vp8_sad8x16_c() for the eight
 * reference positions ref_ptr + 0 .. ref_ptr + 7.  Each result is narrowed
 * to unsigned short before it is stored.
 */
void vp8_sad8x16x8_c(
    const unsigned char *src_ptr,
    int  src_stride,
    const unsigned char *ref_ptr,
    int  ref_stride,
    unsigned short *sad_array
)
{
    int offset;

    for (offset = 0; offset < 8; offset++)
    {
        sad_array[offset] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + offset, ref_stride, 0x7fffffff);
    }
}
|
||||||
|
|
||||||
void vp8_sad4x4x3_c(
|
void vp8_sad4x4x3_c(
|
||||||
const unsigned char *src_ptr,
|
const unsigned char *src_ptr,
|
||||||
int src_stride,
|
int src_stride,
|
||||||
|
@ -178,6 +250,24 @@ void vp8_sad4x4x3_c(
|
||||||
sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
|
sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Fills sad_array[0..7] with the results of vp8_sad4x4_c() for the eight
 * reference positions ref_ptr + 0 .. ref_ptr + 7.  Each result is narrowed
 * to unsigned short before it is stored.
 */
void vp8_sad4x4x8_c(
    const unsigned char *src_ptr,
    int  src_stride,
    const unsigned char *ref_ptr,
    int  ref_stride,
    unsigned short *sad_array
)
{
    int offset;

    for (offset = 0; offset < 8; offset++)
    {
        sad_array[offset] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + offset, ref_stride, 0x7fffffff);
    }
}
|
||||||
|
|
||||||
void vp8_sad16x16x4d_c(
|
void vp8_sad16x16x4d_c(
|
||||||
const unsigned char *src_ptr,
|
const unsigned char *src_ptr,
|
||||||
int src_stride,
|
int src_stride,
|
||||||
|
|
|
@ -32,6 +32,16 @@
|
||||||
unsigned int *sad_array\
|
unsigned int *sad_array\
|
||||||
)
|
)
|
||||||
|
|
||||||
|
/* Declares (sym) as a void function taking one source pointer/stride, one
 * reference pointer/stride and an output array of unsigned short.  It differs
 * from prototype_sad_multi_same_address only in the element type of
 * sad_array (unsigned short here, unsigned int there).
 */
#define prototype_sad_multi_same_address_1(sym)\
|
||||||
|
void (sym)\
|
||||||
|
(\
|
||||||
|
const unsigned char *src_ptr, \
|
||||||
|
int source_stride, \
|
||||||
|
const unsigned char *ref_ptr, \
|
||||||
|
int ref_stride, \
|
||||||
|
unsigned short *sad_array\
|
||||||
|
)
|
||||||
|
|
||||||
#define prototype_sad_multi_dif_address(sym)\
|
#define prototype_sad_multi_dif_address(sym)\
|
||||||
void (sym)\
|
void (sym)\
|
||||||
(\
|
(\
|
||||||
|
@ -138,6 +148,31 @@ extern prototype_sad_multi_same_address(vp8_variance_sad8x16x3);
|
||||||
#endif
|
#endif
|
||||||
extern prototype_sad_multi_same_address(vp8_variance_sad4x4x3);
|
extern prototype_sad_multi_same_address(vp8_variance_sad4x4x3);
|
||||||
|
|
||||||
|
#ifndef vp8_variance_sad16x16x8
|
||||||
|
#define vp8_variance_sad16x16x8 vp8_sad16x16x8_c
|
||||||
|
#endif
|
||||||
|
extern prototype_sad_multi_same_address_1(vp8_variance_sad16x16x8);
|
||||||
|
|
||||||
|
#ifndef vp8_variance_sad16x8x8
|
||||||
|
#define vp8_variance_sad16x8x8 vp8_sad16x8x8_c
|
||||||
|
#endif
|
||||||
|
extern prototype_sad_multi_same_address_1(vp8_variance_sad16x8x8);
|
||||||
|
|
||||||
|
#ifndef vp8_variance_sad8x8x8
|
||||||
|
#define vp8_variance_sad8x8x8 vp8_sad8x8x8_c
|
||||||
|
#endif
|
||||||
|
extern prototype_sad_multi_same_address_1(vp8_variance_sad8x8x8);
|
||||||
|
|
||||||
|
#ifndef vp8_variance_sad8x16x8
|
||||||
|
#define vp8_variance_sad8x16x8 vp8_sad8x16x8_c
|
||||||
|
#endif
|
||||||
|
extern prototype_sad_multi_same_address_1(vp8_variance_sad8x16x8);
|
||||||
|
|
||||||
|
#ifndef vp8_variance_sad4x4x8
|
||||||
|
#define vp8_variance_sad4x4x8 vp8_sad4x4x8_c
|
||||||
|
#endif
|
||||||
|
extern prototype_sad_multi_same_address_1(vp8_variance_sad4x4x8);
|
||||||
|
|
||||||
//-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|
//-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|
||||||
|
|
||||||
#ifndef vp8_variance_sad16x16x4d
|
#ifndef vp8_variance_sad16x16x4d
|
||||||
|
@ -274,6 +309,7 @@ extern prototype_sad(vp8_variance_get4x4sse_cs);
|
||||||
|
|
||||||
typedef prototype_sad(*vp8_sad_fn_t);
|
typedef prototype_sad(*vp8_sad_fn_t);
|
||||||
typedef prototype_sad_multi_same_address(*vp8_sad_multi_fn_t);
|
typedef prototype_sad_multi_same_address(*vp8_sad_multi_fn_t);
|
||||||
|
typedef prototype_sad_multi_same_address_1(*vp8_sad_multi1_fn_t);
|
||||||
typedef prototype_sad_multi_dif_address(*vp8_sad_multi_d_fn_t);
|
typedef prototype_sad_multi_dif_address(*vp8_sad_multi_d_fn_t);
|
||||||
typedef prototype_variance(*vp8_variance_fn_t);
|
typedef prototype_variance(*vp8_variance_fn_t);
|
||||||
typedef prototype_variance2(*vp8_variance2_fn_t);
|
typedef prototype_variance2(*vp8_variance2_fn_t);
|
||||||
|
@ -317,6 +353,12 @@ typedef struct
|
||||||
vp8_sad_multi_fn_t sad8x8x3;
|
vp8_sad_multi_fn_t sad8x8x3;
|
||||||
vp8_sad_multi_fn_t sad4x4x3;
|
vp8_sad_multi_fn_t sad4x4x3;
|
||||||
|
|
||||||
|
vp8_sad_multi1_fn_t sad16x16x8;
|
||||||
|
vp8_sad_multi1_fn_t sad16x8x8;
|
||||||
|
vp8_sad_multi1_fn_t sad8x16x8;
|
||||||
|
vp8_sad_multi1_fn_t sad8x8x8;
|
||||||
|
vp8_sad_multi1_fn_t sad4x4x8;
|
||||||
|
|
||||||
vp8_sad_multi_d_fn_t sad16x16x4d;
|
vp8_sad_multi_d_fn_t sad16x16x4d;
|
||||||
vp8_sad_multi_d_fn_t sad16x8x4d;
|
vp8_sad_multi_d_fn_t sad16x8x4d;
|
||||||
vp8_sad_multi_d_fn_t sad8x16x4d;
|
vp8_sad_multi_d_fn_t sad8x16x4d;
|
||||||
|
@ -334,6 +376,7 @@ typedef struct
|
||||||
vp8_variance_fn_t svf_halfpix_v;
|
vp8_variance_fn_t svf_halfpix_v;
|
||||||
vp8_variance_fn_t svf_halfpix_hv;
|
vp8_variance_fn_t svf_halfpix_hv;
|
||||||
vp8_sad_multi_fn_t sdx3f;
|
vp8_sad_multi_fn_t sdx3f;
|
||||||
|
vp8_sad_multi1_fn_t sdx8f;
|
||||||
vp8_sad_multi_d_fn_t sdx4df;
|
vp8_sad_multi_d_fn_t sdx4df;
|
||||||
} vp8_variance_fn_ptr_t;
|
} vp8_variance_fn_ptr_t;
|
||||||
|
|
||||||
|
|
|
@ -24,5 +24,14 @@
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if HAVE_SSE4_1
|
||||||
|
#if !CONFIG_RUNTIME_CPU_DETECT
|
||||||
|
|
||||||
|
#undef vp8_search_full_search
|
||||||
|
#define vp8_search_full_search vp8_full_search_sadx8
|
||||||
|
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,353 @@
|
||||||
|
;
|
||||||
|
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||||
|
;
|
||||||
|
; Use of this source code is governed by a BSD-style license
|
||||||
|
; that can be found in the LICENSE file in the root of the source
|
||||||
|
; tree. An additional intellectual property rights grant can be found
|
||||||
|
; in the file PATENTS. All contributing project authors may
|
||||||
|
; be found in the AUTHORS file in the root of the source tree.
|
||||||
|
;
|
||||||
|
|
||||||
|
|
||||||
|
%include "vpx_ports/x86_abi_support.asm"
|
||||||
|
|
||||||
|
; PROCESS_16X2X8 first
;   Handles two rows of a 16-byte-wide block: the source rows at [rsi] and
;   [rsi+rax] against the reference rows at [rdi] and [rdi+rdx].  For every
;   row, 24 reference bytes are loaded and eight SADs (reference offsets
;   0..7) are produced as eight 16-bit lanes.
;   %1 != 0 : the first row's result is built directly in xmm1 (xmm1 is
;             overwritten, starting a new accumulation).
;   %1 == 0 : the first row's result is built in xmm5 and added to xmm1.
;   The second row is always built in xmm5 and added to xmm1.
;   mpsadbw imm 0x0 covers source bytes 0-3 and imm 0x5 covers source bytes
;   4-7 (with the reference window moved on by 4); psrldq then brings source
;   bytes 8-15 down so the same pair of instructions covers the upper half.
;   On exit rsi and rdi have advanced by two rows.
;   Clobbers xmm0, xmm2, xmm3, xmm4, xmm5.  Source rows are read with movdqa.
%macro PROCESS_16X2X8 1
|
||||||
|
%if %1
|
||||||
|
movdqa xmm0, XMMWORD PTR [rsi]
|
||||||
|
movq xmm1, MMWORD PTR [rdi]
|
||||||
|
movq xmm3, MMWORD PTR [rdi+8]
|
||||||
|
movq xmm2, MMWORD PTR [rdi+16]
|
||||||
|
punpcklqdq xmm1, xmm3
|
||||||
|
punpcklqdq xmm3, xmm2
|
||||||
|
|
||||||
|
movdqa xmm2, xmm1
|
||||||
|
mpsadbw xmm1, xmm0, 0x0
|
||||||
|
mpsadbw xmm2, xmm0, 0x5
|
||||||
|
|
||||||
|
psrldq xmm0, 8
|
||||||
|
|
||||||
|
movdqa xmm4, xmm3
|
||||||
|
mpsadbw xmm3, xmm0, 0x0
|
||||||
|
mpsadbw xmm4, xmm0, 0x5
|
||||||
|
|
||||||
|
paddw xmm1, xmm2
|
||||||
|
paddw xmm1, xmm3
|
||||||
|
paddw xmm1, xmm4
|
||||||
|
%else
|
||||||
|
movdqa xmm0, XMMWORD PTR [rsi]
|
||||||
|
movq xmm5, MMWORD PTR [rdi]
|
||||||
|
movq xmm3, MMWORD PTR [rdi+8]
|
||||||
|
movq xmm2, MMWORD PTR [rdi+16]
|
||||||
|
punpcklqdq xmm5, xmm3
|
||||||
|
punpcklqdq xmm3, xmm2
|
||||||
|
|
||||||
|
movdqa xmm2, xmm5
|
||||||
|
mpsadbw xmm5, xmm0, 0x0
|
||||||
|
mpsadbw xmm2, xmm0, 0x5
|
||||||
|
|
||||||
|
psrldq xmm0, 8
|
||||||
|
|
||||||
|
movdqa xmm4, xmm3
|
||||||
|
mpsadbw xmm3, xmm0, 0x0
|
||||||
|
mpsadbw xmm4, xmm0, 0x5
|
||||||
|
|
||||||
|
paddw xmm5, xmm2
|
||||||
|
paddw xmm5, xmm3
|
||||||
|
paddw xmm5, xmm4
|
||||||
|
|
||||||
|
paddw xmm1, xmm5
|
||||||
|
%endif
|
||||||
|
movdqa xmm0, XMMWORD PTR [rsi + rax]
|
||||||
|
movq xmm5, MMWORD PTR [rdi+ rdx]
|
||||||
|
movq xmm3, MMWORD PTR [rdi+ rdx+8]
|
||||||
|
movq xmm2, MMWORD PTR [rdi+ rdx+16]
|
||||||
|
punpcklqdq xmm5, xmm3
|
||||||
|
punpcklqdq xmm3, xmm2
|
||||||
|
|
||||||
|
lea rsi, [rsi+rax*2]
|
||||||
|
lea rdi, [rdi+rdx*2]
|
||||||
|
|
||||||
|
movdqa xmm2, xmm5
|
||||||
|
mpsadbw xmm5, xmm0, 0x0
|
||||||
|
mpsadbw xmm2, xmm0, 0x5
|
||||||
|
|
||||||
|
psrldq xmm0, 8
|
||||||
|
movdqa xmm4, xmm3
|
||||||
|
mpsadbw xmm3, xmm0, 0x0
|
||||||
|
mpsadbw xmm4, xmm0, 0x5
|
||||||
|
|
||||||
|
paddw xmm5, xmm2
|
||||||
|
paddw xmm5, xmm3
|
||||||
|
paddw xmm5, xmm4
|
||||||
|
|
||||||
|
paddw xmm1, xmm5
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
; PROCESS_8X2X8 first
;   Handles two rows of an 8-byte-wide block: the source rows at [rsi] and
;   [rsi+rax] against the reference rows at [rdi] and [rdi+rdx].  For every
;   row, 16 reference bytes are loaded and eight SADs (reference offsets
;   0..7) are produced as eight 16-bit lanes.
;   %1 != 0 : the first row's result is built directly in xmm1 (xmm1 is
;             overwritten, starting a new accumulation).
;   %1 == 0 : the first row's result is built in xmm5 and added to xmm1.
;   The second row is always built in xmm5 and added to xmm1.
;   mpsadbw imm 0x0 covers source bytes 0-3; imm 0x5 covers source bytes 4-7
;   with the reference window moved on by 4.
;   On exit rsi and rdi have advanced by two rows.
;   Clobbers xmm0, xmm2, xmm3, xmm5.
%macro PROCESS_8X2X8 1
|
||||||
|
%if %1
|
||||||
|
movq xmm0, MMWORD PTR [rsi]
|
||||||
|
movq xmm1, MMWORD PTR [rdi]
|
||||||
|
movq xmm3, MMWORD PTR [rdi+8]
|
||||||
|
punpcklqdq xmm1, xmm3
|
||||||
|
|
||||||
|
movdqa xmm2, xmm1
|
||||||
|
mpsadbw xmm1, xmm0, 0x0
|
||||||
|
mpsadbw xmm2, xmm0, 0x5
|
||||||
|
paddw xmm1, xmm2
|
||||||
|
%else
|
||||||
|
movq xmm0, MMWORD PTR [rsi]
|
||||||
|
movq xmm5, MMWORD PTR [rdi]
|
||||||
|
movq xmm3, MMWORD PTR [rdi+8]
|
||||||
|
punpcklqdq xmm5, xmm3
|
||||||
|
|
||||||
|
movdqa xmm2, xmm5
|
||||||
|
mpsadbw xmm5, xmm0, 0x0
|
||||||
|
mpsadbw xmm2, xmm0, 0x5
|
||||||
|
paddw xmm5, xmm2
|
||||||
|
|
||||||
|
paddw xmm1, xmm5
|
||||||
|
%endif
|
||||||
|
movq xmm0, MMWORD PTR [rsi + rax]
|
||||||
|
movq xmm5, MMWORD PTR [rdi+ rdx]
|
||||||
|
movq xmm3, MMWORD PTR [rdi+ rdx+8]
|
||||||
|
punpcklqdq xmm5, xmm3
|
||||||
|
|
||||||
|
lea rsi, [rsi+rax*2]
|
||||||
|
lea rdi, [rdi+rdx*2]
|
||||||
|
|
||||||
|
movdqa xmm2, xmm5
|
||||||
|
mpsadbw xmm5, xmm0, 0x0
|
||||||
|
mpsadbw xmm2, xmm0, 0x5
|
||||||
|
paddw xmm5, xmm2
|
||||||
|
|
||||||
|
paddw xmm1, xmm5
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
; PROCESS_4X2X8 first
;   Handles two rows of a 4-byte-wide block: the source rows at [rsi] and
;   [rsi+rax] against the reference rows at [rdi] and [rdi+rdx].  For every
;   row, 16 reference bytes are loaded and a single mpsadbw (imm 0x0)
;   produces eight SADs (reference offsets 0..7) as eight 16-bit lanes.
;   %1 != 0 : the first row's result is built directly in xmm1 (xmm1 is
;             overwritten, starting a new accumulation).
;   %1 == 0 : the first row's result is built in xmm5 and added to xmm1.
;   The second row is always built in xmm5 and added to xmm1.
;   On exit rsi and rdi have advanced by two rows.
;   Clobbers xmm0, xmm3, xmm5.
%macro PROCESS_4X2X8 1
|
||||||
|
%if %1
|
||||||
|
movd xmm0, [rsi]
|
||||||
|
movq xmm1, MMWORD PTR [rdi]
|
||||||
|
movq xmm3, MMWORD PTR [rdi+8]
|
||||||
|
punpcklqdq xmm1, xmm3
|
||||||
|
|
||||||
|
mpsadbw xmm1, xmm0, 0x0
|
||||||
|
%else
|
||||||
|
movd xmm0, [rsi]
|
||||||
|
movq xmm5, MMWORD PTR [rdi]
|
||||||
|
movq xmm3, MMWORD PTR [rdi+8]
|
||||||
|
punpcklqdq xmm5, xmm3
|
||||||
|
|
||||||
|
mpsadbw xmm5, xmm0, 0x0
|
||||||
|
|
||||||
|
paddw xmm1, xmm5
|
||||||
|
%endif
|
||||||
|
movd xmm0, [rsi + rax]
|
||||||
|
movq xmm5, MMWORD PTR [rdi+ rdx]
|
||||||
|
movq xmm3, MMWORD PTR [rdi+ rdx+8]
|
||||||
|
punpcklqdq xmm5, xmm3
|
||||||
|
|
||||||
|
lea rsi, [rsi+rax*2]
|
||||||
|
lea rdi, [rdi+rdx*2]
|
||||||
|
|
||||||
|
mpsadbw xmm5, xmm0, 0x0
|
||||||
|
|
||||||
|
paddw xmm1, xmm5
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
|
||||||
|
;void vp8_sad16x16x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array);
;
; Runs PROCESS_16X2X8 eight times (two rows per invocation, 16 rows in
; total) and writes the eight 16-bit sums accumulated in xmm1 to sad_array.
; The results are stored with movdqu so sad_array needs no particular
; alignment; the prototype makes no alignment promise to callers.
global sym(vp8_sad16x16x8_sse4)
sym(vp8_sad16x16x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)           ;src_ptr
    mov         rdi,        arg(2)           ;ref_ptr

    movsxd      rax,        dword ptr arg(1) ;src_stride
    movsxd      rdx,        dword ptr arg(3) ;ref_stride

    PROCESS_16X2X8 1                         ;first call initialises xmm1
    PROCESS_16X2X8 0
    PROCESS_16X2X8 0
    PROCESS_16X2X8 0
    PROCESS_16X2X8 0
    PROCESS_16X2X8 0
    PROCESS_16X2X8 0
    PROCESS_16X2X8 0

    mov         rdi,        arg(4)           ;Results
    movdqu      XMMWORD PTR [rdi],    xmm1   ;unaligned-safe store

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
|
||||||
|
|
||||||
|
|
||||||
|
;void vp8_sad16x8x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array
;);
;
; Runs PROCESS_16X2X8 four times (two rows per invocation, 8 rows in total)
; and writes the eight 16-bit sums accumulated in xmm1 to sad_array.
; The results are stored with movdqu so sad_array needs no particular
; alignment; the prototype makes no alignment promise to callers.
global sym(vp8_sad16x8x8_sse4)
sym(vp8_sad16x8x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)           ;src_ptr
    mov         rdi,        arg(2)           ;ref_ptr

    movsxd      rax,        dword ptr arg(1) ;src_stride
    movsxd      rdx,        dword ptr arg(3) ;ref_stride

    PROCESS_16X2X8 1                         ;first call initialises xmm1
    PROCESS_16X2X8 0
    PROCESS_16X2X8 0
    PROCESS_16X2X8 0

    mov         rdi,        arg(4)           ;Results
    movdqu      XMMWORD PTR [rdi],    xmm1   ;unaligned-safe store

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
|
||||||
|
|
||||||
|
|
||||||
|
;void vp8_sad8x8x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array
;);
;
; Runs PROCESS_8X2X8 four times (two rows per invocation, 8 rows in total)
; and writes the eight 16-bit sums accumulated in xmm1 to sad_array.
; The results are stored with movdqu so sad_array needs no particular
; alignment; the prototype makes no alignment promise to callers.
global sym(vp8_sad8x8x8_sse4)
sym(vp8_sad8x8x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)           ;src_ptr
    mov         rdi,        arg(2)           ;ref_ptr

    movsxd      rax,        dword ptr arg(1) ;src_stride
    movsxd      rdx,        dword ptr arg(3) ;ref_stride

    PROCESS_8X2X8 1                          ;first call initialises xmm1
    PROCESS_8X2X8 0
    PROCESS_8X2X8 0
    PROCESS_8X2X8 0

    mov         rdi,        arg(4)           ;Results
    movdqu      XMMWORD PTR [rdi],    xmm1   ;unaligned-safe store

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
|
||||||
|
|
||||||
|
|
||||||
|
;void vp8_sad8x16x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array
;);
;
; Runs PROCESS_8X2X8 eight times (two rows per invocation, 16 rows in total)
; and writes the eight 16-bit sums accumulated in xmm1 to sad_array.
; The results are stored with movdqu so sad_array needs no particular
; alignment; the prototype makes no alignment promise to callers.
global sym(vp8_sad8x16x8_sse4)
sym(vp8_sad8x16x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)           ;src_ptr
    mov         rdi,        arg(2)           ;ref_ptr

    movsxd      rax,        dword ptr arg(1) ;src_stride
    movsxd      rdx,        dword ptr arg(3) ;ref_stride

    PROCESS_8X2X8 1                          ;first call initialises xmm1
    PROCESS_8X2X8 0
    PROCESS_8X2X8 0
    PROCESS_8X2X8 0
    PROCESS_8X2X8 0
    PROCESS_8X2X8 0
    PROCESS_8X2X8 0
    PROCESS_8X2X8 0

    mov         rdi,        arg(4)           ;Results
    movdqu      XMMWORD PTR [rdi],    xmm1   ;unaligned-safe store

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
|
||||||
|
|
||||||
|
|
||||||
|
;void vp8_sad4x4x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array
;);
;
; Runs PROCESS_4X2X8 twice (two rows per invocation, 4 rows in total) and
; writes the eight 16-bit sums accumulated in xmm1 to sad_array.
; The results are stored with movdqu so sad_array needs no particular
; alignment; the prototype makes no alignment promise to callers.
global sym(vp8_sad4x4x8_sse4)
sym(vp8_sad4x4x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)           ;src_ptr
    mov         rdi,        arg(2)           ;ref_ptr

    movsxd      rax,        dword ptr arg(1) ;src_stride
    movsxd      rdx,        dword ptr arg(3) ;ref_stride

    PROCESS_4X2X8 1                          ;first call initialises xmm1
    PROCESS_4X2X8 0

    mov         rdi,        arg(4)           ;Results
    movdqu      XMMWORD PTR [rdi],    xmm1   ;unaligned-safe store

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -297,4 +297,31 @@ extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3);
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
/* SSE4.1 SAD routines: one declaration per block size (16x16, 16x8, 8x16,
 * 8x8, 4x4). The prototype_sad_multi_same_address_1 macro that expands each
 * declaration is defined outside this excerpt. */
#if HAVE_SSE4_1
|
||||||
|
extern prototype_sad_multi_same_address_1(vp8_sad16x16x8_sse4);
|
||||||
|
extern prototype_sad_multi_same_address_1(vp8_sad16x8x8_sse4);
|
||||||
|
extern prototype_sad_multi_same_address_1(vp8_sad8x16x8_sse4);
|
||||||
|
extern prototype_sad_multi_same_address_1(vp8_sad8x8x8_sse4);
|
||||||
|
extern prototype_sad_multi_same_address_1(vp8_sad4x4x8_sse4);
|
||||||
|
|
||||||
|
/* Without runtime CPU detection, rebind each vp8_variance_sad*x8 name
 * directly to its SSE4 implementation at compile time. */
#if !CONFIG_RUNTIME_CPU_DETECT
|
||||||
|
#undef vp8_variance_sad16x16x8
|
||||||
|
#define vp8_variance_sad16x16x8 vp8_sad16x16x8_sse4
|
||||||
|
|
||||||
|
#undef vp8_variance_sad16x8x8
|
||||||
|
#define vp8_variance_sad16x8x8 vp8_sad16x8x8_sse4
|
||||||
|
|
||||||
|
#undef vp8_variance_sad8x16x8
|
||||||
|
#define vp8_variance_sad8x16x8 vp8_sad8x16x8_sse4
|
||||||
|
|
||||||
|
#undef vp8_variance_sad8x8x8
|
||||||
|
#define vp8_variance_sad8x8x8 vp8_sad8x8x8_sse4
|
||||||
|
|
||||||
|
#undef vp8_variance_sad4x4x8
|
||||||
|
#define vp8_variance_sad4x4x8 vp8_sad4x4x8_sse4
|
||||||
|
|
||||||
|
#endif /* !CONFIG_RUNTIME_CPU_DETECT */
|
||||||
|
#endif /* HAVE_SSE4_1 */
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -188,6 +188,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
|
||||||
int wmt_enabled = flags & HAS_SSE2;
|
int wmt_enabled = flags & HAS_SSE2;
|
||||||
int SSE3Enabled = flags & HAS_SSE3;
|
int SSE3Enabled = flags & HAS_SSE3;
|
||||||
int SSSE3Enabled = flags & HAS_SSSE3;
|
int SSSE3Enabled = flags & HAS_SSSE3;
|
||||||
|
int SSE4_1Enabled = flags & HAS_SSE4_1;
|
||||||
|
|
||||||
/* Note:
|
/* Note:
|
||||||
*
|
*
|
||||||
|
@ -198,7 +199,6 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
|
||||||
|
|
||||||
/* Override default functions with fastest ones for this CPU. */
|
/* Override default functions with fastest ones for this CPU. */
|
||||||
#if HAVE_MMX
|
#if HAVE_MMX
|
||||||
|
|
||||||
if (mmx_enabled)
|
if (mmx_enabled)
|
||||||
{
|
{
|
||||||
cpi->rtcd.variance.sad16x16 = vp8_sad16x16_mmx;
|
cpi->rtcd.variance.sad16x16 = vp8_sad16x16_mmx;
|
||||||
|
@ -254,10 +254,9 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
|
||||||
|
|
||||||
/*cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_mmx;*/
|
/*cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_mmx;*/
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
#if HAVE_SSE2
|
|
||||||
|
|
||||||
|
#if HAVE_SSE2
|
||||||
if (wmt_enabled)
|
if (wmt_enabled)
|
||||||
{
|
{
|
||||||
cpi->rtcd.variance.sad16x16 = vp8_sad16x16_wmt;
|
cpi->rtcd.variance.sad16x16 = vp8_sad16x16_wmt;
|
||||||
|
@ -307,10 +306,9 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
|
||||||
/*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2;*/
|
/*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2;*/
|
||||||
cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse2;
|
cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse2;
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
#if HAVE_SSE3
|
|
||||||
|
|
||||||
|
#if HAVE_SSE3
|
||||||
if (SSE3Enabled)
|
if (SSE3Enabled)
|
||||||
{
|
{
|
||||||
cpi->rtcd.variance.sad16x16 = vp8_sad16x16_sse3;
|
cpi->rtcd.variance.sad16x16 = vp8_sad16x16_sse3;
|
||||||
|
@ -328,16 +326,27 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
|
||||||
cpi->rtcd.variance.sad4x4x4d = vp8_sad4x4x4d_sse3;
|
cpi->rtcd.variance.sad4x4x4d = vp8_sad4x4x4d_sse3;
|
||||||
cpi->rtcd.search.diamond_search = vp8_diamond_search_sadx4;
|
cpi->rtcd.search.diamond_search = vp8_diamond_search_sadx4;
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
#if HAVE_SSSE3
|
|
||||||
|
|
||||||
|
#if HAVE_SSSE3
|
||||||
if (SSSE3Enabled)
|
if (SSSE3Enabled)
|
||||||
{
|
{
|
||||||
cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_ssse3;
|
cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_ssse3;
|
||||||
cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_ssse3;
|
cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_ssse3;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if HAVE_SSE4_1
|
||||||
|
if (SSE4_1Enabled)
|
||||||
|
{
|
||||||
|
cpi->rtcd.variance.sad16x16x8 = vp8_sad16x16x8_sse4;
|
||||||
|
cpi->rtcd.variance.sad16x8x8 = vp8_sad16x8x8_sse4;
|
||||||
|
cpi->rtcd.variance.sad8x16x8 = vp8_sad8x16x8_sse4;
|
||||||
|
cpi->rtcd.variance.sad8x8x8 = vp8_sad8x8x8_sse4;
|
||||||
|
cpi->rtcd.variance.sad4x4x8 = vp8_sad4x4x8_sse4;
|
||||||
|
cpi->rtcd.search.full_search = vp8_full_search_sadx8;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#endif
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
|
@ -109,6 +109,7 @@ VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm
|
||||||
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
|
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
|
||||||
VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm
|
VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm
|
||||||
VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm
|
VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm
|
||||||
|
VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm
|
||||||
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
|
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
|
||||||
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
|
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
|
||||||
|
|
||||||
|
|
|
@ -74,6 +74,7 @@ void __cpuid(int CPUInfo[4], int info_type);
|
||||||
#define HAS_SSE2 0x04
|
#define HAS_SSE2 0x04
|
||||||
#define HAS_SSE3 0x08
|
#define HAS_SSE3 0x08
|
||||||
#define HAS_SSSE3 0x10
|
#define HAS_SSSE3 0x10
|
||||||
|
#define HAS_SSE4_1 0x20
|
||||||
#ifndef BIT
|
#ifndef BIT
|
||||||
#define BIT(n) (1<<n)
|
#define BIT(n) (1<<n)
|
||||||
#endif
|
#endif
|
||||||
|
@ -117,6 +118,8 @@ x86_simd_caps(void)
|
||||||
|
|
||||||
if (reg_ecx & BIT(9)) flags |= HAS_SSSE3;
|
if (reg_ecx & BIT(9)) flags |= HAS_SSSE3;
|
||||||
|
|
||||||
|
if (reg_ecx & BIT(19)) flags |= HAS_SSE4_1;
|
||||||
|
|
||||||
return flags & mask;
|
return flags & mask;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Загрузка…
Ссылка в новой задаче